Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Kconfig.hz               |    2
-rw-r--r--  kernel/Kconfig.instrumentation  |    6
-rw-r--r--  kernel/Kconfig.preempt          |   13
-rw-r--r--  kernel/Makefile                 |    9
-rw-r--r--  kernel/acct.c                   |    4
-rw-r--r--  kernel/audit.c                  |  444
-rw-r--r--  kernel/auditfilter.c            |   54
-rw-r--r--  kernel/auditsc.c                |  349
-rw-r--r--  kernel/backtracetest.c          |   48
-rw-r--r--  kernel/cgroup.c                 |    9
-rw-r--r--  kernel/cpu.c                    |  164
-rw-r--r--  kernel/cpu_acct.c               |  186
-rw-r--r--  kernel/cpuset.c                 |   14
-rw-r--r--  kernel/exit.c                   |  102
-rw-r--r--  kernel/extable.c                |    3
-rw-r--r--  kernel/fork.c                   |   78
-rw-r--r--  kernel/futex.c                  |  123
-rw-r--r--  kernel/futex_compat.c           |   29
-rw-r--r--  kernel/hrtimer.c                |  270
-rw-r--r--  kernel/irq/chip.c               |    9
-rw-r--r--  kernel/irq/handle.c             |    8
-rw-r--r--  kernel/irq/manage.c             |    3
-rw-r--r--  kernel/irq/proc.c               |   21
-rw-r--r--  kernel/irq/spurious.c           |    5
-rw-r--r--  kernel/kallsyms.c               |   18
-rw-r--r--  kernel/kexec.c                  |    1
-rw-r--r--  kernel/kmod.c                   |   13
-rw-r--r--  kernel/kprobes.c                |    2
-rw-r--r--  kernel/ksysfs.c                 |   82
-rw-r--r--  kernel/kthread.c                |   12
-rw-r--r--  kernel/latencytop.c             |  239
-rw-r--r--  kernel/lockdep.c                |   75
-rw-r--r--  kernel/marker.c                 |   41
-rw-r--r--  kernel/module.c                 |  294
-rw-r--r--  kernel/mutex.c                  |   36
-rw-r--r--  kernel/panic.c                  |   41
-rw-r--r--  kernel/params.c                 |   55
-rw-r--r--  kernel/pid.c                    |    2
-rw-r--r--  kernel/posix-cpu-timers.c       |   30
-rw-r--r--  kernel/power/Kconfig            |   65
-rw-r--r--  kernel/power/disk.c             |  236
-rw-r--r--  kernel/power/main.c             |  197
-rw-r--r--  kernel/power/pm.c               |    4
-rw-r--r--  kernel/power/power.h            |   94
-rw-r--r--  kernel/power/process.c          |    6
-rw-r--r--  kernel/power/snapshot.c         |   31
-rw-r--r--  kernel/power/swap.c             |   33
-rw-r--r--  kernel/power/swsusp.c           |   48
-rw-r--r--  kernel/power/user.c             |  109
-rw-r--r--  kernel/printk.c                 |   68
-rw-r--r--  kernel/profile.c                |   99
-rw-r--r--  kernel/ptrace.c                 |  181
-rw-r--r--  kernel/rcuclassic.c             |  575
-rw-r--r--  kernel/rcupdate.c               |  576
-rw-r--r--  kernel/rcupreempt.c             |  953
-rw-r--r--  kernel/rcupreempt_trace.c       |  330
-rw-r--r--  kernel/rcutorture.c             |    6
-rw-r--r--  kernel/resource.c               |    2
-rw-r--r--  kernel/rtmutex-tester.c         |    2
-rw-r--r--  kernel/rwsem.c                  |    5
-rw-r--r--  kernel/sched.c                  | 1876
-rw-r--r--  kernel/sched_debug.c            |   30
-rw-r--r--  kernel/sched_fair.c             |  509
-rw-r--r--  kernel/sched_idletask.c         |   42
-rw-r--r--  kernel/sched_rt.c               | 1115
-rw-r--r--  kernel/sched_stats.h            |   14
-rw-r--r--  kernel/signal.c                 |   30
-rw-r--r--  kernel/softirq.c                |   11
-rw-r--r--  kernel/softlockup.c             |  114
-rw-r--r--  kernel/spinlock.c               |    3
-rw-r--r--  kernel/stop_machine.c           |    4
-rw-r--r--  kernel/sys.c                    |   20
-rw-r--r--  kernel/sys_ni.c                 |    5
-rw-r--r--  kernel/sysctl.c                 |  319
-rw-r--r--  kernel/sysctl_check.c           |   83
-rw-r--r--  kernel/taskstats.c              |   36
-rw-r--r--  kernel/test_kprobes.c           |  216
-rw-r--r--  kernel/time.c                   |    1
-rw-r--r--  kernel/time/clockevents.c       |   18
-rw-r--r--  kernel/time/clocksource.c       |   33
-rw-r--r--  kernel/time/ntp.c               |   11
-rw-r--r--  kernel/time/tick-broadcast.c    |   65
-rw-r--r--  kernel/time/tick-internal.h     |    2
-rw-r--r--  kernel/time/tick-sched.c        |   97
-rw-r--r--  kernel/time/timekeeping.c       |   36
-rw-r--r--  kernel/time/timer_list.c        |    8
-rw-r--r--  kernel/time/timer_stats.c       |    2
-rw-r--r--  kernel/timer.c                  |  123
-rw-r--r--  kernel/user.c                   |  147
-rw-r--r--  kernel/utsname_sysctl.c         |    4
-rw-r--r--  kernel/wait.c                   |    2
-rw-r--r--  kernel/workqueue.c              |   40
92 files changed, 8290 insertions, 3240 deletions
diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz
index 4af15802ccd4..526128a2e622 100644
--- a/kernel/Kconfig.hz
+++ b/kernel/Kconfig.hz
@@ -54,3 +54,5 @@ config HZ
54 default 300 if HZ_300 54 default 300 if HZ_300
55 default 1000 if HZ_1000 55 default 1000 if HZ_1000
56 56
57config SCHED_HRTICK
58 def_bool HIGH_RES_TIMERS && X86
diff --git a/kernel/Kconfig.instrumentation b/kernel/Kconfig.instrumentation
index f5f2c769d95e..468f47ad7503 100644
--- a/kernel/Kconfig.instrumentation
+++ b/kernel/Kconfig.instrumentation
@@ -20,8 +20,8 @@ config PROFILING
20 20
21config OPROFILE 21config OPROFILE
22 tristate "OProfile system profiling (EXPERIMENTAL)" 22 tristate "OProfile system profiling (EXPERIMENTAL)"
23 depends on PROFILING 23 depends on PROFILING && !UML
24 depends on ALPHA || ARM || BLACKFIN || X86_32 || IA64 || M32R || MIPS || PARISC || PPC || S390 || SUPERH || SPARC || X86_64 24 depends on ARCH_SUPPORTS_OPROFILE || ALPHA || ARM || BLACKFIN || IA64 || M32R || PARISC || PPC || S390 || SUPERH || SPARC
25 help 25 help
26 OProfile is a profiling system capable of profiling the 26 OProfile is a profiling system capable of profiling the
27 whole system, include the kernel, kernel modules, libraries, 27 whole system, include the kernel, kernel modules, libraries,
@@ -31,7 +31,7 @@ config OPROFILE
31 31
32config KPROBES 32config KPROBES
33 bool "Kprobes" 33 bool "Kprobes"
34 depends on KALLSYMS && MODULES 34 depends on KALLSYMS && MODULES && !UML
35 depends on X86_32 || IA64 || PPC || S390 || SPARC64 || X86_64 || AVR32 35 depends on X86_32 || IA64 || PPC || S390 || SPARC64 || X86_64 || AVR32
36 help 36 help
37 Kprobes allows you to trap at almost any kernel address and 37 Kprobes allows you to trap at almost any kernel address and
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index c64ce9c14207..0669b70fa6a3 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -52,14 +52,13 @@ config PREEMPT
52 52
53endchoice 53endchoice
54 54
55config PREEMPT_BKL 55config RCU_TRACE
56 bool "Preempt The Big Kernel Lock" 56 bool "Enable tracing for RCU - currently stats in debugfs"
57 depends on SMP || PREEMPT 57 select DEBUG_FS
58 default y 58 default y
59 help 59 help
60 This option reduces the latency of the kernel by making the 60 This option provides tracing in RCU which presents stats
61 big kernel lock preemptible. 61 in debugfs for debugging RCU implementation.
62 62
63 Say Y here if you are building a kernel for a desktop system. 63 Say Y here if you want to enable RCU tracing
64 Say N if you are unsure. 64 Say N if you are unsure.
65
diff --git a/kernel/Makefile b/kernel/Makefile
index f60afe742599..8885627ea021 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -36,14 +36,15 @@ obj-$(CONFIG_KALLSYMS) += kallsyms.o
36obj-$(CONFIG_PM) += power/ 36obj-$(CONFIG_PM) += power/
37obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o 37obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
38obj-$(CONFIG_KEXEC) += kexec.o 38obj-$(CONFIG_KEXEC) += kexec.o
39obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o
39obj-$(CONFIG_COMPAT) += compat.o 40obj-$(CONFIG_COMPAT) += compat.o
40obj-$(CONFIG_CGROUPS) += cgroup.o 41obj-$(CONFIG_CGROUPS) += cgroup.o
41obj-$(CONFIG_CGROUP_DEBUG) += cgroup_debug.o 42obj-$(CONFIG_CGROUP_DEBUG) += cgroup_debug.o
42obj-$(CONFIG_CPUSETS) += cpuset.o 43obj-$(CONFIG_CPUSETS) += cpuset.o
43obj-$(CONFIG_CGROUP_CPUACCT) += cpu_acct.o
44obj-$(CONFIG_CGROUP_NS) += ns_cgroup.o 44obj-$(CONFIG_CGROUP_NS) += ns_cgroup.o
45obj-$(CONFIG_IKCONFIG) += configs.o 45obj-$(CONFIG_IKCONFIG) += configs.o
46obj-$(CONFIG_STOP_MACHINE) += stop_machine.o 46obj-$(CONFIG_STOP_MACHINE) += stop_machine.o
47obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o
47obj-$(CONFIG_AUDIT) += audit.o auditfilter.o 48obj-$(CONFIG_AUDIT) += audit.o auditfilter.o
48obj-$(CONFIG_AUDITSYSCALL) += auditsc.o 49obj-$(CONFIG_AUDITSYSCALL) += auditsc.o
49obj-$(CONFIG_AUDIT_TREE) += audit_tree.o 50obj-$(CONFIG_AUDIT_TREE) += audit_tree.o
@@ -53,11 +54,17 @@ obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o
53obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ 54obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
54obj-$(CONFIG_SECCOMP) += seccomp.o 55obj-$(CONFIG_SECCOMP) += seccomp.o
55obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o 56obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
57obj-$(CONFIG_CLASSIC_RCU) += rcuclassic.o
58obj-$(CONFIG_PREEMPT_RCU) += rcupreempt.o
59ifeq ($(CONFIG_PREEMPT_RCU),y)
60obj-$(CONFIG_RCU_TRACE) += rcupreempt_trace.o
61endif
56obj-$(CONFIG_RELAY) += relay.o 62obj-$(CONFIG_RELAY) += relay.o
57obj-$(CONFIG_SYSCTL) += utsname_sysctl.o 63obj-$(CONFIG_SYSCTL) += utsname_sysctl.o
58obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o 64obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
59obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o 65obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o
60obj-$(CONFIG_MARKERS) += marker.o 66obj-$(CONFIG_MARKERS) += marker.o
67obj-$(CONFIG_LATENCYTOP) += latencytop.o
61 68
62ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y) 69ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
63# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is 70# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/acct.c b/kernel/acct.c
index fce53d8df8a7..521dfa53cb99 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -413,7 +413,7 @@ static u32 encode_float(u64 value)
413 * The acct_process() call is the workhorse of the process 413 * The acct_process() call is the workhorse of the process
414 * accounting system. The struct acct is built here and then written 414 * accounting system. The struct acct is built here and then written
415 * into the accounting file. This function should only be called from 415 * into the accounting file. This function should only be called from
416 * do_exit(). 416 * do_exit() or when switching to a different output file.
417 */ 417 */
418 418
419/* 419/*
@@ -482,7 +482,7 @@ static void do_acct_process(struct file *file)
482#endif 482#endif
483#if ACCT_VERSION==3 483#if ACCT_VERSION==3
484 ac.ac_pid = current->tgid; 484 ac.ac_pid = current->tgid;
485 ac.ac_ppid = current->parent->tgid; 485 ac.ac_ppid = current->real_parent->tgid;
486#endif 486#endif
487 487
488 spin_lock_irq(&current->sighand->siglock); 488 spin_lock_irq(&current->sighand->siglock);
diff --git a/kernel/audit.c b/kernel/audit.c
index f93c2713017d..c8555b180213 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -66,10 +66,11 @@
66 * (Initialization happens after skb_init is called.) */ 66 * (Initialization happens after skb_init is called.) */
67static int audit_initialized; 67static int audit_initialized;
68 68
69/* 0 - no auditing 69#define AUDIT_OFF 0
70 * 1 - auditing enabled 70#define AUDIT_ON 1
71 * 2 - auditing enabled and configuration is locked/unchangeable. */ 71#define AUDIT_LOCKED 2
72int audit_enabled; 72int audit_enabled;
73int audit_ever_enabled;
73 74
74/* Default state when kernel boots without any parameters. */ 75/* Default state when kernel boots without any parameters. */
75static int audit_default; 76static int audit_default;
@@ -152,8 +153,10 @@ struct audit_buffer {
152 153
153static void audit_set_pid(struct audit_buffer *ab, pid_t pid) 154static void audit_set_pid(struct audit_buffer *ab, pid_t pid)
154{ 155{
155 struct nlmsghdr *nlh = nlmsg_hdr(ab->skb); 156 if (ab) {
156 nlh->nlmsg_pid = pid; 157 struct nlmsghdr *nlh = nlmsg_hdr(ab->skb);
158 nlh->nlmsg_pid = pid;
159 }
157} 160}
158 161
159void audit_panic(const char *message) 162void audit_panic(const char *message)
@@ -163,7 +166,8 @@ void audit_panic(const char *message)
163 case AUDIT_FAIL_SILENT: 166 case AUDIT_FAIL_SILENT:
164 break; 167 break;
165 case AUDIT_FAIL_PRINTK: 168 case AUDIT_FAIL_PRINTK:
166 printk(KERN_ERR "audit: %s\n", message); 169 if (printk_ratelimit())
170 printk(KERN_ERR "audit: %s\n", message);
167 break; 171 break;
168 case AUDIT_FAIL_PANIC: 172 case AUDIT_FAIL_PANIC:
169 panic("audit: %s\n", message); 173 panic("audit: %s\n", message);
@@ -231,161 +235,107 @@ void audit_log_lost(const char *message)
231 } 235 }
232 236
233 if (print) { 237 if (print) {
234 printk(KERN_WARNING 238 if (printk_ratelimit())
235 "audit: audit_lost=%d audit_rate_limit=%d audit_backlog_limit=%d\n", 239 printk(KERN_WARNING
236 atomic_read(&audit_lost), 240 "audit: audit_lost=%d audit_rate_limit=%d "
237 audit_rate_limit, 241 "audit_backlog_limit=%d\n",
238 audit_backlog_limit); 242 atomic_read(&audit_lost),
243 audit_rate_limit,
244 audit_backlog_limit);
239 audit_panic(message); 245 audit_panic(message);
240 } 246 }
241} 247}
242 248
243static int audit_set_rate_limit(int limit, uid_t loginuid, u32 sid) 249static int audit_log_config_change(char *function_name, int new, int old,
250 uid_t loginuid, u32 sid, int allow_changes)
244{ 251{
245 int res, rc = 0, old = audit_rate_limit; 252 struct audit_buffer *ab;
246 253 int rc = 0;
247 /* check if we are locked */
248 if (audit_enabled == 2)
249 res = 0;
250 else
251 res = 1;
252 254
255 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
256 audit_log_format(ab, "%s=%d old=%d by auid=%u", function_name, new,
257 old, loginuid);
253 if (sid) { 258 if (sid) {
254 char *ctx = NULL; 259 char *ctx = NULL;
255 u32 len; 260 u32 len;
256 if ((rc = selinux_sid_to_string(sid, &ctx, &len)) == 0) { 261
257 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, 262 rc = selinux_sid_to_string(sid, &ctx, &len);
258 "audit_rate_limit=%d old=%d by auid=%u" 263 if (rc) {
259 " subj=%s res=%d", 264 audit_log_format(ab, " sid=%u", sid);
260 limit, old, loginuid, ctx, res); 265 allow_changes = 0; /* Something weird, deny request */
266 } else {
267 audit_log_format(ab, " subj=%s", ctx);
261 kfree(ctx); 268 kfree(ctx);
262 } else 269 }
263 res = 0; /* Something weird, deny request */
264 } 270 }
265 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, 271 audit_log_format(ab, " res=%d", allow_changes);
266 "audit_rate_limit=%d old=%d by auid=%u res=%d", 272 audit_log_end(ab);
267 limit, old, loginuid, res);
268
269 /* If we are allowed, make the change */
270 if (res == 1)
271 audit_rate_limit = limit;
272 /* Not allowed, update reason */
273 else if (rc == 0)
274 rc = -EPERM;
275 return rc; 273 return rc;
276} 274}
277 275
278static int audit_set_backlog_limit(int limit, uid_t loginuid, u32 sid) 276static int audit_do_config_change(char *function_name, int *to_change,
277 int new, uid_t loginuid, u32 sid)
279{ 278{
280 int res, rc = 0, old = audit_backlog_limit; 279 int allow_changes, rc = 0, old = *to_change;
281 280
282 /* check if we are locked */ 281 /* check if we are locked */
283 if (audit_enabled == 2) 282 if (audit_enabled == AUDIT_LOCKED)
284 res = 0; 283 allow_changes = 0;
285 else 284 else
286 res = 1; 285 allow_changes = 1;
287 286
288 if (sid) { 287 if (audit_enabled != AUDIT_OFF) {
289 char *ctx = NULL; 288 rc = audit_log_config_change(function_name, new, old,
290 u32 len; 289 loginuid, sid, allow_changes);
291 if ((rc = selinux_sid_to_string(sid, &ctx, &len)) == 0) { 290 if (rc)
292 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, 291 allow_changes = 0;
293 "audit_backlog_limit=%d old=%d by auid=%u"
294 " subj=%s res=%d",
295 limit, old, loginuid, ctx, res);
296 kfree(ctx);
297 } else
298 res = 0; /* Something weird, deny request */
299 } 292 }
300 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
301 "audit_backlog_limit=%d old=%d by auid=%u res=%d",
302 limit, old, loginuid, res);
303 293
304 /* If we are allowed, make the change */ 294 /* If we are allowed, make the change */
305 if (res == 1) 295 if (allow_changes == 1)
306 audit_backlog_limit = limit; 296 *to_change = new;
307 /* Not allowed, update reason */ 297 /* Not allowed, update reason */
308 else if (rc == 0) 298 else if (rc == 0)
309 rc = -EPERM; 299 rc = -EPERM;
310 return rc; 300 return rc;
311} 301}
312 302
313static int audit_set_enabled(int state, uid_t loginuid, u32 sid) 303static int audit_set_rate_limit(int limit, uid_t loginuid, u32 sid)
314{ 304{
315 int res, rc = 0, old = audit_enabled; 305 return audit_do_config_change("audit_rate_limit", &audit_rate_limit,
306 limit, loginuid, sid);
307}
308
309static int audit_set_backlog_limit(int limit, uid_t loginuid, u32 sid)
310{
311 return audit_do_config_change("audit_backlog_limit", &audit_backlog_limit,
312 limit, loginuid, sid);
313}
316 314
317 if (state < 0 || state > 2) 315static int audit_set_enabled(int state, uid_t loginuid, u32 sid)
316{
317 int rc;
318 if (state < AUDIT_OFF || state > AUDIT_LOCKED)
318 return -EINVAL; 319 return -EINVAL;
319 320
320 /* check if we are locked */ 321 rc = audit_do_config_change("audit_enabled", &audit_enabled, state,
321 if (audit_enabled == 2) 322 loginuid, sid);
322 res = 0;
323 else
324 res = 1;
325 323
326 if (sid) { 324 if (!rc)
327 char *ctx = NULL; 325 audit_ever_enabled |= !!state;
328 u32 len;
329 if ((rc = selinux_sid_to_string(sid, &ctx, &len)) == 0) {
330 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
331 "audit_enabled=%d old=%d by auid=%u"
332 " subj=%s res=%d",
333 state, old, loginuid, ctx, res);
334 kfree(ctx);
335 } else
336 res = 0; /* Something weird, deny request */
337 }
338 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
339 "audit_enabled=%d old=%d by auid=%u res=%d",
340 state, old, loginuid, res);
341 326
342 /* If we are allowed, make the change */
343 if (res == 1)
344 audit_enabled = state;
345 /* Not allowed, update reason */
346 else if (rc == 0)
347 rc = -EPERM;
348 return rc; 327 return rc;
349} 328}
350 329
351static int audit_set_failure(int state, uid_t loginuid, u32 sid) 330static int audit_set_failure(int state, uid_t loginuid, u32 sid)
352{ 331{
353 int res, rc = 0, old = audit_failure;
354
355 if (state != AUDIT_FAIL_SILENT 332 if (state != AUDIT_FAIL_SILENT
356 && state != AUDIT_FAIL_PRINTK 333 && state != AUDIT_FAIL_PRINTK
357 && state != AUDIT_FAIL_PANIC) 334 && state != AUDIT_FAIL_PANIC)
358 return -EINVAL; 335 return -EINVAL;
359 336
360 /* check if we are locked */ 337 return audit_do_config_change("audit_failure", &audit_failure, state,
361 if (audit_enabled == 2) 338 loginuid, sid);
362 res = 0;
363 else
364 res = 1;
365
366 if (sid) {
367 char *ctx = NULL;
368 u32 len;
369 if ((rc = selinux_sid_to_string(sid, &ctx, &len)) == 0) {
370 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
371 "audit_failure=%d old=%d by auid=%u"
372 " subj=%s res=%d",
373 state, old, loginuid, ctx, res);
374 kfree(ctx);
375 } else
376 res = 0; /* Something weird, deny request */
377 }
378 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
379 "audit_failure=%d old=%d by auid=%u res=%d",
380 state, old, loginuid, res);
381
382 /* If we are allowed, make the change */
383 if (res == 1)
384 audit_failure = state;
385 /* Not allowed, update reason */
386 else if (rc == 0)
387 rc = -EPERM;
388 return rc;
389} 339}
390 340
391static int kauditd_thread(void *dummy) 341static int kauditd_thread(void *dummy)
@@ -405,7 +355,11 @@ static int kauditd_thread(void *dummy)
405 audit_pid = 0; 355 audit_pid = 0;
406 } 356 }
407 } else { 357 } else {
408 printk(KERN_NOTICE "%s\n", skb->data + NLMSG_SPACE(0)); 358 if (printk_ratelimit())
359 printk(KERN_NOTICE "%s\n", skb->data +
360 NLMSG_SPACE(0));
361 else
362 audit_log_lost("printk limit exceeded\n");
409 kfree_skb(skb); 363 kfree_skb(skb);
410 } 364 }
411 } else { 365 } else {
@@ -573,6 +527,33 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type)
573 return err; 527 return err;
574} 528}
575 529
530static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type,
531 u32 pid, u32 uid, uid_t auid, u32 sid)
532{
533 int rc = 0;
534 char *ctx = NULL;
535 u32 len;
536
537 if (!audit_enabled) {
538 *ab = NULL;
539 return rc;
540 }
541
542 *ab = audit_log_start(NULL, GFP_KERNEL, msg_type);
543 audit_log_format(*ab, "user pid=%d uid=%u auid=%u",
544 pid, uid, auid);
545 if (sid) {
546 rc = selinux_sid_to_string(sid, &ctx, &len);
547 if (rc)
548 audit_log_format(*ab, " ssid=%u", sid);
549 else
550 audit_log_format(*ab, " subj=%s", ctx);
551 kfree(ctx);
552 }
553
554 return rc;
555}
556
576static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) 557static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
577{ 558{
578 u32 uid, pid, seq, sid; 559 u32 uid, pid, seq, sid;
@@ -583,7 +564,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
583 u16 msg_type = nlh->nlmsg_type; 564 u16 msg_type = nlh->nlmsg_type;
584 uid_t loginuid; /* loginuid of sender */ 565 uid_t loginuid; /* loginuid of sender */
585 struct audit_sig_info *sig_data; 566 struct audit_sig_info *sig_data;
586 char *ctx; 567 char *ctx = NULL;
587 u32 len; 568 u32 len;
588 569
589 err = audit_netlink_ok(skb, msg_type); 570 err = audit_netlink_ok(skb, msg_type);
@@ -634,23 +615,14 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
634 if (err < 0) return err; 615 if (err < 0) return err;
635 } 616 }
636 if (status_get->mask & AUDIT_STATUS_PID) { 617 if (status_get->mask & AUDIT_STATUS_PID) {
637 int old = audit_pid; 618 int new_pid = status_get->pid;
638 if (sid) { 619
639 if ((err = selinux_sid_to_string( 620 if (audit_enabled != AUDIT_OFF)
640 sid, &ctx, &len))) 621 audit_log_config_change("audit_pid", new_pid,
641 return err; 622 audit_pid, loginuid,
642 else 623 sid, 1);
643 audit_log(NULL, GFP_KERNEL, 624
644 AUDIT_CONFIG_CHANGE, 625 audit_pid = new_pid;
645 "audit_pid=%d old=%d by auid=%u subj=%s",
646 status_get->pid, old,
647 loginuid, ctx);
648 kfree(ctx);
649 } else
650 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
651 "audit_pid=%d old=%d by auid=%u",
652 status_get->pid, old, loginuid);
653 audit_pid = status_get->pid;
654 } 626 }
655 if (status_get->mask & AUDIT_STATUS_RATE_LIMIT) 627 if (status_get->mask & AUDIT_STATUS_RATE_LIMIT)
656 err = audit_set_rate_limit(status_get->rate_limit, 628 err = audit_set_rate_limit(status_get->rate_limit,
@@ -673,64 +645,35 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
673 if (err) 645 if (err)
674 break; 646 break;
675 } 647 }
676 ab = audit_log_start(NULL, GFP_KERNEL, msg_type); 648 audit_log_common_recv_msg(&ab, msg_type, pid, uid,
677 if (ab) { 649 loginuid, sid);
678 audit_log_format(ab, 650
679 "user pid=%d uid=%u auid=%u", 651 if (msg_type != AUDIT_USER_TTY)
680 pid, uid, loginuid); 652 audit_log_format(ab, " msg='%.1024s'",
681 if (sid) { 653 (char *)data);
682 if (selinux_sid_to_string( 654 else {
683 sid, &ctx, &len)) { 655 int size;
684 audit_log_format(ab, 656
685 " ssid=%u", sid); 657 audit_log_format(ab, " msg=");
686 /* Maybe call audit_panic? */ 658 size = nlmsg_len(nlh);
687 } else 659 audit_log_n_untrustedstring(ab, size,
688 audit_log_format(ab, 660 data);
689 " subj=%s", ctx);
690 kfree(ctx);
691 }
692 if (msg_type != AUDIT_USER_TTY)
693 audit_log_format(ab, " msg='%.1024s'",
694 (char *)data);
695 else {
696 int size;
697
698 audit_log_format(ab, " msg=");
699 size = nlmsg_len(nlh);
700 audit_log_n_untrustedstring(ab, size,
701 data);
702 }
703 audit_set_pid(ab, pid);
704 audit_log_end(ab);
705 } 661 }
662 audit_set_pid(ab, pid);
663 audit_log_end(ab);
706 } 664 }
707 break; 665 break;
708 case AUDIT_ADD: 666 case AUDIT_ADD:
709 case AUDIT_DEL: 667 case AUDIT_DEL:
710 if (nlmsg_len(nlh) < sizeof(struct audit_rule)) 668 if (nlmsg_len(nlh) < sizeof(struct audit_rule))
711 return -EINVAL; 669 return -EINVAL;
712 if (audit_enabled == 2) { 670 if (audit_enabled == AUDIT_LOCKED) {
713 ab = audit_log_start(NULL, GFP_KERNEL, 671 audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, pid,
714 AUDIT_CONFIG_CHANGE); 672 uid, loginuid, sid);
715 if (ab) { 673
716 audit_log_format(ab, 674 audit_log_format(ab, " audit_enabled=%d res=0",
717 "pid=%d uid=%u auid=%u", 675 audit_enabled);
718 pid, uid, loginuid); 676 audit_log_end(ab);
719 if (sid) {
720 if (selinux_sid_to_string(
721 sid, &ctx, &len)) {
722 audit_log_format(ab,
723 " ssid=%u", sid);
724 /* Maybe call audit_panic? */
725 } else
726 audit_log_format(ab,
727 " subj=%s", ctx);
728 kfree(ctx);
729 }
730 audit_log_format(ab, " audit_enabled=%d res=0",
731 audit_enabled);
732 audit_log_end(ab);
733 }
734 return -EPERM; 677 return -EPERM;
735 } 678 }
736 /* fallthrough */ 679 /* fallthrough */
@@ -743,28 +686,13 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
743 case AUDIT_DEL_RULE: 686 case AUDIT_DEL_RULE:
744 if (nlmsg_len(nlh) < sizeof(struct audit_rule_data)) 687 if (nlmsg_len(nlh) < sizeof(struct audit_rule_data))
745 return -EINVAL; 688 return -EINVAL;
746 if (audit_enabled == 2) { 689 if (audit_enabled == AUDIT_LOCKED) {
747 ab = audit_log_start(NULL, GFP_KERNEL, 690 audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, pid,
748 AUDIT_CONFIG_CHANGE); 691 uid, loginuid, sid);
749 if (ab) { 692
750 audit_log_format(ab, 693 audit_log_format(ab, " audit_enabled=%d res=0",
751 "pid=%d uid=%u auid=%u", 694 audit_enabled);
752 pid, uid, loginuid); 695 audit_log_end(ab);
753 if (sid) {
754 if (selinux_sid_to_string(
755 sid, &ctx, &len)) {
756 audit_log_format(ab,
757 " ssid=%u", sid);
758 /* Maybe call audit_panic? */
759 } else
760 audit_log_format(ab,
761 " subj=%s", ctx);
762 kfree(ctx);
763 }
764 audit_log_format(ab, " audit_enabled=%d res=0",
765 audit_enabled);
766 audit_log_end(ab);
767 }
768 return -EPERM; 696 return -EPERM;
769 } 697 }
770 /* fallthrough */ 698 /* fallthrough */
@@ -775,19 +703,10 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
775 break; 703 break;
776 case AUDIT_TRIM: 704 case AUDIT_TRIM:
777 audit_trim_trees(); 705 audit_trim_trees();
778 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); 706
779 if (!ab) 707 audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, pid,
780 break; 708 uid, loginuid, sid);
781 audit_log_format(ab, "auid=%u", loginuid); 709
782 if (sid) {
783 u32 len;
784 ctx = NULL;
785 if (selinux_sid_to_string(sid, &ctx, &len))
786 audit_log_format(ab, " ssid=%u", sid);
787 else
788 audit_log_format(ab, " subj=%s", ctx);
789 kfree(ctx);
790 }
791 audit_log_format(ab, " op=trim res=1"); 710 audit_log_format(ab, " op=trim res=1");
792 audit_log_end(ab); 711 audit_log_end(ab);
793 break; 712 break;
@@ -817,22 +736,9 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
817 /* OK, here comes... */ 736 /* OK, here comes... */
818 err = audit_tag_tree(old, new); 737 err = audit_tag_tree(old, new);
819 738
820 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); 739 audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, pid,
821 if (!ab) { 740 uid, loginuid, sid);
822 kfree(old); 741
823 kfree(new);
824 break;
825 }
826 audit_log_format(ab, "auid=%u", loginuid);
827 if (sid) {
828 u32 len;
829 ctx = NULL;
830 if (selinux_sid_to_string(sid, &ctx, &len))
831 audit_log_format(ab, " ssid=%u", sid);
832 else
833 audit_log_format(ab, " subj=%s", ctx);
834 kfree(ctx);
835 }
836 audit_log_format(ab, " op=make_equiv old="); 742 audit_log_format(ab, " op=make_equiv old=");
837 audit_log_untrustedstring(ab, old); 743 audit_log_untrustedstring(ab, old);
838 audit_log_format(ab, " new="); 744 audit_log_format(ab, " new=");
@@ -965,6 +871,7 @@ static int __init audit_init(void)
965 skb_queue_head_init(&audit_skb_queue); 871 skb_queue_head_init(&audit_skb_queue);
966 audit_initialized = 1; 872 audit_initialized = 1;
967 audit_enabled = audit_default; 873 audit_enabled = audit_default;
874 audit_ever_enabled |= !!audit_default;
968 875
969 /* Register the callback with selinux. This callback will be invoked 876 /* Register the callback with selinux. This callback will be invoked
970 * when a new policy is loaded. */ 877 * when a new policy is loaded. */
@@ -992,8 +899,10 @@ static int __init audit_enable(char *str)
992 printk(KERN_INFO "audit: %s%s\n", 899 printk(KERN_INFO "audit: %s%s\n",
993 audit_default ? "enabled" : "disabled", 900 audit_default ? "enabled" : "disabled",
994 audit_initialized ? "" : " (after initialization)"); 901 audit_initialized ? "" : " (after initialization)");
995 if (audit_initialized) 902 if (audit_initialized) {
996 audit_enabled = audit_default; 903 audit_enabled = audit_default;
904 audit_ever_enabled |= !!audit_default;
905 }
997 return 1; 906 return 1;
998} 907}
999 908
@@ -1130,7 +1039,7 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
1130{ 1039{
1131 struct audit_buffer *ab = NULL; 1040 struct audit_buffer *ab = NULL;
1132 struct timespec t; 1041 struct timespec t;
1133 unsigned int serial; 1042 unsigned int uninitialized_var(serial);
1134 int reserve; 1043 int reserve;
1135 unsigned long timeout_start = jiffies; 1044 unsigned long timeout_start = jiffies;
1136 1045
@@ -1164,7 +1073,7 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
1164 remove_wait_queue(&audit_backlog_wait, &wait); 1073 remove_wait_queue(&audit_backlog_wait, &wait);
1165 continue; 1074 continue;
1166 } 1075 }
1167 if (audit_rate_check()) 1076 if (audit_rate_check() && printk_ratelimit())
1168 printk(KERN_WARNING 1077 printk(KERN_WARNING
1169 "audit: audit_backlog=%d > " 1078 "audit: audit_backlog=%d > "
1170 "audit_backlog_limit=%d\n", 1079 "audit_backlog_limit=%d\n",
@@ -1200,13 +1109,17 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
1200static inline int audit_expand(struct audit_buffer *ab, int extra) 1109static inline int audit_expand(struct audit_buffer *ab, int extra)
1201{ 1110{
1202 struct sk_buff *skb = ab->skb; 1111 struct sk_buff *skb = ab->skb;
1203 int ret = pskb_expand_head(skb, skb_headroom(skb), extra, 1112 int oldtail = skb_tailroom(skb);
1204 ab->gfp_mask); 1113 int ret = pskb_expand_head(skb, 0, extra, ab->gfp_mask);
1114 int newtail = skb_tailroom(skb);
1115
1205 if (ret < 0) { 1116 if (ret < 0) {
1206 audit_log_lost("out of memory in audit_expand"); 1117 audit_log_lost("out of memory in audit_expand");
1207 return 0; 1118 return 0;
1208 } 1119 }
1209 return skb_tailroom(skb); 1120
1121 skb->truesize += newtail - oldtail;
1122 return newtail;
1210} 1123}
1211 1124
1212/* 1125/*
@@ -1245,6 +1158,7 @@ static void audit_log_vformat(struct audit_buffer *ab, const char *fmt,
1245 goto out; 1158 goto out;
1246 len = vsnprintf(skb_tail_pointer(skb), avail, fmt, args2); 1159 len = vsnprintf(skb_tail_pointer(skb), avail, fmt, args2);
1247 } 1160 }
1161 va_end(args2);
1248 if (len > 0) 1162 if (len > 0)
1249 skb_put(skb, len); 1163 skb_put(skb, len);
1250out: 1164out:
@@ -1346,6 +1260,21 @@ static void audit_log_n_string(struct audit_buffer *ab, size_t slen,
1346} 1260}
1347 1261
1348/** 1262/**
1263 * audit_string_contains_control - does a string need to be logged in hex
1264 * @string - string to be checked
1265 * @len - max length of the string to check
1266 */
1267int audit_string_contains_control(const char *string, size_t len)
1268{
1269 const unsigned char *p;
1270 for (p = string; p < (const unsigned char *)string + len && *p; p++) {
1271 if (*p == '"' || *p < 0x21 || *p > 0x7f)
1272 return 1;
1273 }
1274 return 0;
1275}
1276
1277/**
1349 * audit_log_n_untrustedstring - log a string that may contain random characters 1278 * audit_log_n_untrustedstring - log a string that may contain random characters
1350 * @ab: audit_buffer 1279 * @ab: audit_buffer
1351 * @len: lenth of string (not including trailing null) 1280 * @len: lenth of string (not including trailing null)
@@ -1359,19 +1288,13 @@ static void audit_log_n_string(struct audit_buffer *ab, size_t slen,
1359 * The caller specifies the number of characters in the string to log, which may 1288 * The caller specifies the number of characters in the string to log, which may
1360 * or may not be the entire string. 1289 * or may not be the entire string.
1361 */ 1290 */
1362const char *audit_log_n_untrustedstring(struct audit_buffer *ab, size_t len, 1291void audit_log_n_untrustedstring(struct audit_buffer *ab, size_t len,
1363 const char *string) 1292 const char *string)
1364{ 1293{
1365 const unsigned char *p; 1294 if (audit_string_contains_control(string, len))
1366 1295 audit_log_hex(ab, string, len);
1367 for (p = string; p < (const unsigned char *)string + len && *p; p++) { 1296 else
1368 if (*p == '"' || *p < 0x21 || *p > 0x7f) { 1297 audit_log_n_string(ab, len, string);
1369 audit_log_hex(ab, string, len);
1370 return string + len + 1;
1371 }
1372 }
1373 audit_log_n_string(ab, len, string);
1374 return p + 1;
1375} 1298}
1376 1299
1377/** 1300/**
@@ -1382,9 +1305,9 @@ const char *audit_log_n_untrustedstring(struct audit_buffer *ab, size_t len,
1382 * Same as audit_log_n_untrustedstring(), except that strlen is used to 1305 * Same as audit_log_n_untrustedstring(), except that strlen is used to
1383 * determine string length. 1306 * determine string length.
1384 */ 1307 */
1385const char *audit_log_untrustedstring(struct audit_buffer *ab, const char *string) 1308void audit_log_untrustedstring(struct audit_buffer *ab, const char *string)
1386{ 1309{
1387 return audit_log_n_untrustedstring(ab, strlen(string), string); 1310 audit_log_n_untrustedstring(ab, strlen(string), string);
1388} 1311}
1389 1312
1390/* This is a helper-function to print the escaped d_path */ 1313/* This is a helper-function to print the escaped d_path */
@@ -1433,8 +1356,11 @@ void audit_log_end(struct audit_buffer *ab)
1433 skb_queue_tail(&audit_skb_queue, ab->skb); 1356 skb_queue_tail(&audit_skb_queue, ab->skb);
1434 ab->skb = NULL; 1357 ab->skb = NULL;
1435 wake_up_interruptible(&kauditd_wait); 1358 wake_up_interruptible(&kauditd_wait);
1359 } else if (printk_ratelimit()) {
1360 struct nlmsghdr *nlh = nlmsg_hdr(ab->skb);
1361 printk(KERN_NOTICE "type=%d %s\n", nlh->nlmsg_type, ab->skb->data + NLMSG_SPACE(0));
1436 } else { 1362 } else {
1437 printk(KERN_NOTICE "%s\n", ab->skb->data + NLMSG_SPACE(0)); 1363 audit_log_lost("printk limit exceeded\n");
1438 } 1364 }
1439 } 1365 }
1440 audit_buffer_free(ab); 1366 audit_buffer_free(ab);
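
The audit.c hunks above fold the old open-coded check in audit_log_n_untrustedstring() into the new audit_string_contains_control() helper: a string is logged verbatim in quotes only if every byte is in the 0x21-0x7f range and is not a double quote; otherwise the whole string is hex-encoded. A minimal user-space sketch of that classification rule, for illustration only (the standalone function name and the sample strings below are not part of the patch):

#include <stdio.h>
#include <string.h>

/* Mirrors the predicate added as audit_string_contains_control() in the
 * patch above: a '"', any byte below 0x21 (space and control characters)
 * or any byte above 0x7f forces hex encoding of the whole string. */
static int string_contains_control(const char *string, size_t len)
{
	const unsigned char *p = (const unsigned char *)string;

	for (; p < (const unsigned char *)string + len && *p; p++) {
		if (*p == '"' || *p < 0x21 || *p > 0x7f)
			return 1;
	}
	return 0;
}

int main(void)
{
	const char *samples[] = { "ls", "file name", "a\"b", "abc\001def" };
	size_t i;

	for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
		printf("%-10s -> %s\n", samples[i],
		       string_contains_control(samples[i], strlen(samples[i]))
		       ? "hex-encode" : "quote as-is");
	return 0;
}

The same rule is why the execve logging added to auditsc.c further below halves MAX_EXECVE_AUDIT_LEN when has_cntl is set: every byte of a hex-encoded argument costs two output characters.
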
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 5d96f2cc7be8..6f19fd477aac 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -95,6 +95,8 @@ extern struct inotify_handle *audit_ih;
95/* Inotify events we care about. */ 95/* Inotify events we care about. */
96#define AUDIT_IN_WATCH IN_MOVE|IN_CREATE|IN_DELETE|IN_DELETE_SELF|IN_MOVE_SELF 96#define AUDIT_IN_WATCH IN_MOVE|IN_CREATE|IN_DELETE|IN_DELETE_SELF|IN_MOVE_SELF
97 97
98extern int audit_enabled;
99
98void audit_free_parent(struct inotify_watch *i_watch) 100void audit_free_parent(struct inotify_watch *i_watch)
99{ 101{
100 struct audit_parent *parent; 102 struct audit_parent *parent;
@@ -974,7 +976,6 @@ static void audit_update_watch(struct audit_parent *parent,
974 struct audit_watch *owatch, *nwatch, *nextw; 976 struct audit_watch *owatch, *nwatch, *nextw;
975 struct audit_krule *r, *nextr; 977 struct audit_krule *r, *nextr;
976 struct audit_entry *oentry, *nentry; 978 struct audit_entry *oentry, *nentry;
977 struct audit_buffer *ab;
978 979
979 mutex_lock(&audit_filter_mutex); 980 mutex_lock(&audit_filter_mutex);
980 list_for_each_entry_safe(owatch, nextw, &parent->watches, wlist) { 981 list_for_each_entry_safe(owatch, nextw, &parent->watches, wlist) {
@@ -1014,13 +1015,18 @@ static void audit_update_watch(struct audit_parent *parent,
1014 call_rcu(&oentry->rcu, audit_free_rule_rcu); 1015 call_rcu(&oentry->rcu, audit_free_rule_rcu);
1015 } 1016 }
1016 1017
1017 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); 1018 if (audit_enabled) {
1018 audit_log_format(ab, "op=updated rules specifying path="); 1019 struct audit_buffer *ab;
1019 audit_log_untrustedstring(ab, owatch->path); 1020 ab = audit_log_start(NULL, GFP_KERNEL,
1020 audit_log_format(ab, " with dev=%u ino=%lu\n", dev, ino); 1021 AUDIT_CONFIG_CHANGE);
1021 audit_log_format(ab, " list=%d res=1", r->listnr); 1022 audit_log_format(ab,
1022 audit_log_end(ab); 1023 "op=updated rules specifying path=");
1023 1024 audit_log_untrustedstring(ab, owatch->path);
1025 audit_log_format(ab, " with dev=%u ino=%lu\n",
1026 dev, ino);
1027 audit_log_format(ab, " list=%d res=1", r->listnr);
1028 audit_log_end(ab);
1029 }
1024 audit_remove_watch(owatch); 1030 audit_remove_watch(owatch);
1025 goto add_watch_to_parent; /* event applies to a single watch */ 1031 goto add_watch_to_parent; /* event applies to a single watch */
1026 } 1032 }
@@ -1039,25 +1045,28 @@ static void audit_remove_parent_watches(struct audit_parent *parent)
1039 struct audit_watch *w, *nextw; 1045 struct audit_watch *w, *nextw;
1040 struct audit_krule *r, *nextr; 1046 struct audit_krule *r, *nextr;
1041 struct audit_entry *e; 1047 struct audit_entry *e;
1042 struct audit_buffer *ab;
1043 1048
1044 mutex_lock(&audit_filter_mutex); 1049 mutex_lock(&audit_filter_mutex);
1045 parent->flags |= AUDIT_PARENT_INVALID; 1050 parent->flags |= AUDIT_PARENT_INVALID;
1046 list_for_each_entry_safe(w, nextw, &parent->watches, wlist) { 1051 list_for_each_entry_safe(w, nextw, &parent->watches, wlist) {
1047 list_for_each_entry_safe(r, nextr, &w->rules, rlist) { 1052 list_for_each_entry_safe(r, nextr, &w->rules, rlist) {
1048 e = container_of(r, struct audit_entry, rule); 1053 e = container_of(r, struct audit_entry, rule);
1049 1054 if (audit_enabled) {
1050 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); 1055 struct audit_buffer *ab;
1051 audit_log_format(ab, "op=remove rule path="); 1056 ab = audit_log_start(NULL, GFP_KERNEL,
1052 audit_log_untrustedstring(ab, w->path); 1057 AUDIT_CONFIG_CHANGE);
1053 if (r->filterkey) { 1058 audit_log_format(ab, "op=remove rule path=");
1054 audit_log_format(ab, " key="); 1059 audit_log_untrustedstring(ab, w->path);
1055 audit_log_untrustedstring(ab, r->filterkey); 1060 if (r->filterkey) {
1056 } else 1061 audit_log_format(ab, " key=");
1057 audit_log_format(ab, " key=(null)"); 1062 audit_log_untrustedstring(ab,
1058 audit_log_format(ab, " list=%d res=1", r->listnr); 1063 r->filterkey);
1059 audit_log_end(ab); 1064 } else
1060 1065 audit_log_format(ab, " key=(null)");
1066 audit_log_format(ab, " list=%d res=1",
1067 r->listnr);
1068 audit_log_end(ab);
1069 }
1061 list_del(&r->rlist); 1070 list_del(&r->rlist);
1062 list_del_rcu(&e->list); 1071 list_del_rcu(&e->list);
1063 call_rcu(&e->rcu, audit_free_rule_rcu); 1072 call_rcu(&e->rcu, audit_free_rule_rcu);
@@ -1495,6 +1504,9 @@ static void audit_log_rule_change(uid_t loginuid, u32 sid, char *action,
1495{ 1504{
1496 struct audit_buffer *ab; 1505 struct audit_buffer *ab;
1497 1506
1507 if (!audit_enabled)
1508 return;
1509
1498 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); 1510 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
1499 if (!ab) 1511 if (!ab)
1500 return; 1512 return;
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index bce9ecdb7712..1c06ecf38d7b 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -70,6 +70,7 @@
70#include "audit.h" 70#include "audit.h"
71 71
72extern struct list_head audit_filter_list[]; 72extern struct list_head audit_filter_list[];
73extern int audit_ever_enabled;
73 74
74/* AUDIT_NAMES is the number of slots we reserve in the audit_context 75/* AUDIT_NAMES is the number of slots we reserve in the audit_context
75 * for saving names from getname(). */ 76 * for saving names from getname(). */
@@ -78,6 +79,9 @@ extern struct list_head audit_filter_list[];
78/* Indicates that audit should log the full pathname. */ 79/* Indicates that audit should log the full pathname. */
79#define AUDIT_NAME_FULL -1 80#define AUDIT_NAME_FULL -1
80 81
82/* no execve audit message should be longer than this (userspace limits) */
83#define MAX_EXECVE_AUDIT_LEN 7500
84
81/* number of audit rules */ 85/* number of audit rules */
82int audit_n_rules; 86int audit_n_rules;
83 87
@@ -176,7 +180,11 @@ struct audit_aux_data_fd_pair {
176struct audit_aux_data_pids { 180struct audit_aux_data_pids {
177 struct audit_aux_data d; 181 struct audit_aux_data d;
178 pid_t target_pid[AUDIT_AUX_PIDS]; 182 pid_t target_pid[AUDIT_AUX_PIDS];
183 uid_t target_auid[AUDIT_AUX_PIDS];
184 uid_t target_uid[AUDIT_AUX_PIDS];
185 unsigned int target_sessionid[AUDIT_AUX_PIDS];
179 u32 target_sid[AUDIT_AUX_PIDS]; 186 u32 target_sid[AUDIT_AUX_PIDS];
187 char target_comm[AUDIT_AUX_PIDS][TASK_COMM_LEN];
180 int pid_count; 188 int pid_count;
181}; 189};
182 190
@@ -192,7 +200,6 @@ struct audit_context {
192 enum audit_state state; 200 enum audit_state state;
193 unsigned int serial; /* serial number for record */ 201 unsigned int serial; /* serial number for record */
194 struct timespec ctime; /* time of syscall entry */ 202 struct timespec ctime; /* time of syscall entry */
195 uid_t loginuid; /* login uid (identity) */
196 int major; /* syscall number */ 203 int major; /* syscall number */
197 unsigned long argv[4]; /* syscall arguments */ 204 unsigned long argv[4]; /* syscall arguments */
198 int return_valid; /* return code is valid */ 205 int return_valid; /* return code is valid */
@@ -215,7 +222,11 @@ struct audit_context {
215 int arch; 222 int arch;
216 223
217 pid_t target_pid; 224 pid_t target_pid;
225 uid_t target_auid;
226 uid_t target_uid;
227 unsigned int target_sessionid;
218 u32 target_sid; 228 u32 target_sid;
229 char target_comm[TASK_COMM_LEN];
219 230
220 struct audit_tree_refs *trees, *first_trees; 231 struct audit_tree_refs *trees, *first_trees;
221 int tree_count; 232 int tree_count;
@@ -506,7 +517,7 @@ static int audit_filter_rules(struct task_struct *tsk,
506 case AUDIT_LOGINUID: 517 case AUDIT_LOGINUID:
507 result = 0; 518 result = 0;
508 if (ctx) 519 if (ctx)
509 result = audit_comparator(ctx->loginuid, f->op, f->val); 520 result = audit_comparator(tsk->loginuid, f->op, f->val);
510 break; 521 break;
511 case AUDIT_SUBJ_USER: 522 case AUDIT_SUBJ_USER:
512 case AUDIT_SUBJ_ROLE: 523 case AUDIT_SUBJ_ROLE:
@@ -702,7 +713,24 @@ static inline struct audit_context *audit_get_context(struct task_struct *tsk,
702 if (likely(!context)) 713 if (likely(!context))
703 return NULL; 714 return NULL;
704 context->return_valid = return_valid; 715 context->return_valid = return_valid;
705 context->return_code = return_code; 716
717 /*
718 * we need to fix up the return code in the audit logs if the actual
719 * return codes are later going to be fixed up by the arch specific
720 * signal handlers
721 *
722 * This is actually a test for:
723 * (rc == ERESTARTSYS ) || (rc == ERESTARTNOINTR) ||
724 * (rc == ERESTARTNOHAND) || (rc == ERESTART_RESTARTBLOCK)
725 *
726 * but is faster than a bunch of ||
727 */
728 if (unlikely(return_code <= -ERESTARTSYS) &&
729 (return_code >= -ERESTART_RESTARTBLOCK) &&
730 (return_code != -ENOIOCTLCMD))
731 context->return_code = -EINTR;
732 else
733 context->return_code = return_code;
706 734
707 if (context->in_syscall && !context->dummy && !context->auditable) { 735 if (context->in_syscall && !context->dummy && !context->auditable) {
708 enum audit_state state; 736 enum audit_state state;
@@ -783,11 +811,8 @@ static inline void audit_free_aux(struct audit_context *context)
783static inline void audit_zero_context(struct audit_context *context, 811static inline void audit_zero_context(struct audit_context *context,
784 enum audit_state state) 812 enum audit_state state)
785{ 813{
786 uid_t loginuid = context->loginuid;
787
788 memset(context, 0, sizeof(*context)); 814 memset(context, 0, sizeof(*context));
789 context->state = state; 815 context->state = state;
790 context->loginuid = loginuid;
791} 816}
792 817
793static inline struct audit_context *audit_alloc_context(enum audit_state state) 818static inline struct audit_context *audit_alloc_context(enum audit_state state)
@@ -814,7 +839,7 @@ int audit_alloc(struct task_struct *tsk)
814 struct audit_context *context; 839 struct audit_context *context;
815 enum audit_state state; 840 enum audit_state state;
816 841
817 if (likely(!audit_enabled)) 842 if (likely(!audit_ever_enabled))
818 return 0; /* Return if not auditing. */ 843 return 0; /* Return if not auditing. */
819 844
820 state = audit_filter_task(tsk); 845 state = audit_filter_task(tsk);
@@ -826,11 +851,6 @@ int audit_alloc(struct task_struct *tsk)
826 return -ENOMEM; 851 return -ENOMEM;
827 } 852 }
828 853
829 /* Preserve login uid */
830 context->loginuid = -1;
831 if (current->audit_context)
832 context->loginuid = current->audit_context->loginuid;
833
834 tsk->audit_context = context; 854 tsk->audit_context = context;
835 set_tsk_thread_flag(tsk, TIF_SYSCALL_AUDIT); 855 set_tsk_thread_flag(tsk, TIF_SYSCALL_AUDIT);
836 return 0; 856 return 0;
@@ -922,7 +942,8 @@ static void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk
922} 942}
923 943
924static int audit_log_pid_context(struct audit_context *context, pid_t pid, 944static int audit_log_pid_context(struct audit_context *context, pid_t pid,
925 u32 sid) 945 uid_t auid, uid_t uid, unsigned int sessionid,
946 u32 sid, char *comm)
926{ 947{
927 struct audit_buffer *ab; 948 struct audit_buffer *ab;
928 char *s = NULL; 949 char *s = NULL;
@@ -931,68 +952,204 @@ static int audit_log_pid_context(struct audit_context *context, pid_t pid,
931 952
932 ab = audit_log_start(context, GFP_KERNEL, AUDIT_OBJ_PID); 953 ab = audit_log_start(context, GFP_KERNEL, AUDIT_OBJ_PID);
933 if (!ab) 954 if (!ab)
934 return 1; 955 return rc;
935 956
957 audit_log_format(ab, "opid=%d oauid=%d ouid=%d oses=%d", pid, auid,
958 uid, sessionid);
936 if (selinux_sid_to_string(sid, &s, &len)) { 959 if (selinux_sid_to_string(sid, &s, &len)) {
937 audit_log_format(ab, "opid=%d obj=(none)", pid); 960 audit_log_format(ab, " obj=(none)");
938 rc = 1; 961 rc = 1;
939 } else 962 } else
940 audit_log_format(ab, "opid=%d obj=%s", pid, s); 963 audit_log_format(ab, " obj=%s", s);
964 audit_log_format(ab, " ocomm=");
965 audit_log_untrustedstring(ab, comm);
941 audit_log_end(ab); 966 audit_log_end(ab);
942 kfree(s); 967 kfree(s);
943 968
944 return rc; 969 return rc;
945} 970}
946 971
947static void audit_log_execve_info(struct audit_buffer *ab, 972/*
948 struct audit_aux_data_execve *axi) 973 * to_send and len_sent accounting are very loose estimates. We aren't
974 * really worried about a hard cap to MAX_EXECVE_AUDIT_LEN so much as being
975 * within about 500 bytes (next page boundry)
976 *
977 * why snprintf? an int is up to 12 digits long. if we just assumed when
978 * logging that a[%d]= was going to be 16 characters long we would be wasting
979 * space in every audit message. In one 7500 byte message we can log up to
980 * about 1000 min size arguments. That comes down to about 50% waste of space
981 * if we didn't do the snprintf to find out how long arg_num_len was.
982 */
983static int audit_log_single_execve_arg(struct audit_context *context,
984 struct audit_buffer **ab,
985 int arg_num,
986 size_t *len_sent,
987 const char __user *p,
988 char *buf)
949{ 989{
950 int i; 990 char arg_num_len_buf[12];
951 long len, ret; 991 const char __user *tmp_p = p;
952 const char __user *p; 992 /* how many digits are in arg_num? 3 is the length of a=\n */
953 char *buf; 993 size_t arg_num_len = snprintf(arg_num_len_buf, 12, "%d", arg_num) + 3;
994 size_t len, len_left, to_send;
995 size_t max_execve_audit_len = MAX_EXECVE_AUDIT_LEN;
996 unsigned int i, has_cntl = 0, too_long = 0;
997 int ret;
998
999 /* strnlen_user includes the null we don't want to send */
1000 len_left = len = strnlen_user(p, MAX_ARG_STRLEN) - 1;
954 1001
955 if (axi->mm != current->mm) 1002 /*
956 return; /* execve failed, no additional info */ 1003 * We just created this mm, if we can't find the strings
957 1004 * we just copied into it something is _very_ wrong. Similar
958 p = (const char __user *)axi->mm->arg_start; 1005 * for strings that are too long, we should not have created
1006 * any.
1007 */
1008 if (unlikely((len = -1) || len > MAX_ARG_STRLEN - 1)) {
1009 WARN_ON(1);
1010 send_sig(SIGKILL, current, 0);
1011 }
959 1012
960 for (i = 0; i < axi->argc; i++, p += len) { 1013 /* walk the whole argument looking for non-ascii chars */
961 len = strnlen_user(p, MAX_ARG_STRLEN); 1014 do {
1015 if (len_left > MAX_EXECVE_AUDIT_LEN)
1016 to_send = MAX_EXECVE_AUDIT_LEN;
1017 else
1018 to_send = len_left;
1019 ret = copy_from_user(buf, tmp_p, to_send);
962 /* 1020 /*
963 * We just created this mm, if we can't find the strings 1021 * There is no reason for this copy to be short. We just
964 * we just copied into it something is _very_ wrong. Similar 1022 * copied them here, and the mm hasn't been exposed to user-
965 * for strings that are too long, we should not have created 1023 * space yet.
966 * any.
967 */ 1024 */
968 if (!len || len > MAX_ARG_STRLEN) { 1025 if (ret) {
969 WARN_ON(1); 1026 WARN_ON(1);
970 send_sig(SIGKILL, current, 0); 1027 send_sig(SIGKILL, current, 0);
971 } 1028 }
972 1029 buf[to_send] = '\0';
973 buf = kmalloc(len, GFP_KERNEL); 1030 has_cntl = audit_string_contains_control(buf, to_send);
974 if (!buf) { 1031 if (has_cntl) {
975 audit_panic("out of memory for argv string\n"); 1032 /*
1033 * hex messages get logged as 2 bytes, so we can only
1034 * send half as much in each message
1035 */
1036 max_execve_audit_len = MAX_EXECVE_AUDIT_LEN / 2;
976 break; 1037 break;
977 } 1038 }
1039 len_left -= to_send;
1040 tmp_p += to_send;
1041 } while (len_left > 0);
1042
1043 len_left = len;
1044
1045 if (len > max_execve_audit_len)
1046 too_long = 1;
1047
1048 /* rewalk the argument actually logging the message */
1049 for (i = 0; len_left > 0; i++) {
1050 int room_left;
1051
1052 if (len_left > max_execve_audit_len)
1053 to_send = max_execve_audit_len;
1054 else
1055 to_send = len_left;
1056
1057 /* do we have space left to send this argument in this ab? */
1058 room_left = MAX_EXECVE_AUDIT_LEN - arg_num_len - *len_sent;
1059 if (has_cntl)
1060 room_left -= (to_send * 2);
1061 else
1062 room_left -= to_send;
1063 if (room_left < 0) {
1064 *len_sent = 0;
1065 audit_log_end(*ab);
1066 *ab = audit_log_start(context, GFP_KERNEL, AUDIT_EXECVE);
1067 if (!*ab)
1068 return 0;
1069 }
978 1070
979 ret = copy_from_user(buf, p, len);
980 /* 1071 /*
981 * There is no reason for this copy to be short. We just 1072 * first record needs to say how long the original string was
982 * copied them here, and the mm hasn't been exposed to user- 1073 * so we can be sure nothing was lost.
983 * space yet. 1074 */
1075 if ((i == 0) && (too_long))
1076 audit_log_format(*ab, "a%d_len=%ld ", arg_num,
1077 has_cntl ? 2*len : len);
1078
1079 /*
1080 * normally arguments are small enough to fit and we already
1081 * filled buf above when we checked for control characters
1082 * so don't bother with another copy_from_user
984 */ 1083 */
1084 if (len >= max_execve_audit_len)
1085 ret = copy_from_user(buf, p, to_send);
1086 else
1087 ret = 0;
985 if (ret) { 1088 if (ret) {
986 WARN_ON(1); 1089 WARN_ON(1);
987 send_sig(SIGKILL, current, 0); 1090 send_sig(SIGKILL, current, 0);
988 } 1091 }
1092 buf[to_send] = '\0';
1093
1094 /* actually log it */
1095 audit_log_format(*ab, "a%d", arg_num);
1096 if (too_long)
1097 audit_log_format(*ab, "[%d]", i);
1098 audit_log_format(*ab, "=");
1099 if (has_cntl)
1100 audit_log_hex(*ab, buf, to_send);
1101 else
1102 audit_log_format(*ab, "\"%s\"", buf);
1103 audit_log_format(*ab, "\n");
1104
1105 p += to_send;
1106 len_left -= to_send;
1107 *len_sent += arg_num_len;
1108 if (has_cntl)
1109 *len_sent += to_send * 2;
1110 else
1111 *len_sent += to_send;
1112 }
1113 /* include the null we didn't log */
1114 return len + 1;
1115}
989 1116
990 audit_log_format(ab, "a%d=", i); 1117static void audit_log_execve_info(struct audit_context *context,
991 audit_log_untrustedstring(ab, buf); 1118 struct audit_buffer **ab,
992 audit_log_format(ab, "\n"); 1119 struct audit_aux_data_execve *axi)
1120{
1121 int i;
1122 size_t len, len_sent = 0;
1123 const char __user *p;
1124 char *buf;
1125
1126 if (axi->mm != current->mm)
1127 return; /* execve failed, no additional info */
1128
1129 p = (const char __user *)axi->mm->arg_start;
1130
1131 audit_log_format(*ab, "argc=%d ", axi->argc);
1132
1133 /*
1134 * we need some kernel buffer to hold the userspace args. Just
1135 * allocate one big one rather than allocating one of the right size
1136 * for every single argument inside audit_log_single_execve_arg()
1137 * should be <8k allocation so should be pretty safe.
1138 */
1139 buf = kmalloc(MAX_EXECVE_AUDIT_LEN + 1, GFP_KERNEL);
1140 if (!buf) {
1141 audit_panic("out of memory for argv string\n");
1142 return;
1143 }
993 1144
994 kfree(buf); 1145 for (i = 0; i < axi->argc; i++) {
1146 len = audit_log_single_execve_arg(context, ab, i,
1147 &len_sent, p, buf);
1148 if (len <= 0)
1149 break;
1150 p += len;
995 } 1151 }
1152 kfree(buf);
996} 1153}
997 1154
998static void audit_log_exit(struct audit_context *context, struct task_struct *tsk) 1155static void audit_log_exit(struct audit_context *context, struct task_struct *tsk)
@@ -1039,7 +1196,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
1039 " a0=%lx a1=%lx a2=%lx a3=%lx items=%d" 1196 " a0=%lx a1=%lx a2=%lx a3=%lx items=%d"
1040 " ppid=%d pid=%d auid=%u uid=%u gid=%u" 1197 " ppid=%d pid=%d auid=%u uid=%u gid=%u"
1041 " euid=%u suid=%u fsuid=%u" 1198 " euid=%u suid=%u fsuid=%u"
1042 " egid=%u sgid=%u fsgid=%u tty=%s", 1199 " egid=%u sgid=%u fsgid=%u tty=%s ses=%u",
1043 context->argv[0], 1200 context->argv[0],
1044 context->argv[1], 1201 context->argv[1],
1045 context->argv[2], 1202 context->argv[2],
@@ -1047,11 +1204,12 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
1047 context->name_count, 1204 context->name_count,
1048 context->ppid, 1205 context->ppid,
1049 context->pid, 1206 context->pid,
1050 context->loginuid, 1207 tsk->loginuid,
1051 context->uid, 1208 context->uid,
1052 context->gid, 1209 context->gid,
1053 context->euid, context->suid, context->fsuid, 1210 context->euid, context->suid, context->fsuid,
1054 context->egid, context->sgid, context->fsgid, tty); 1211 context->egid, context->sgid, context->fsgid, tty,
1212 tsk->sessionid);
1055 1213
1056 mutex_unlock(&tty_mutex); 1214 mutex_unlock(&tty_mutex);
1057 1215
@@ -1135,7 +1293,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
1135 1293
1136 case AUDIT_EXECVE: { 1294 case AUDIT_EXECVE: {
1137 struct audit_aux_data_execve *axi = (void *)aux; 1295 struct audit_aux_data_execve *axi = (void *)aux;
1138 audit_log_execve_info(ab, axi); 1296 audit_log_execve_info(context, &ab, axi);
1139 break; } 1297 break; }
1140 1298
1141 case AUDIT_SOCKETCALL: { 1299 case AUDIT_SOCKETCALL: {
@@ -1168,13 +1326,19 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
1168 1326
1169 for (i = 0; i < axs->pid_count; i++) 1327 for (i = 0; i < axs->pid_count; i++)
1170 if (audit_log_pid_context(context, axs->target_pid[i], 1328 if (audit_log_pid_context(context, axs->target_pid[i],
1171 axs->target_sid[i])) 1329 axs->target_auid[i],
1330 axs->target_uid[i],
1331 axs->target_sessionid[i],
1332 axs->target_sid[i],
1333 axs->target_comm[i]))
1172 call_panic = 1; 1334 call_panic = 1;
1173 } 1335 }
1174 1336
1175 if (context->target_pid && 1337 if (context->target_pid &&
1176 audit_log_pid_context(context, context->target_pid, 1338 audit_log_pid_context(context, context->target_pid,
1177 context->target_sid)) 1339 context->target_auid, context->target_uid,
1340 context->target_sessionid,
1341 context->target_sid, context->target_comm))
1178 call_panic = 1; 1342 call_panic = 1;
1179 1343
1180 if (context->pwd && context->pwdmnt) { 1344 if (context->pwd && context->pwdmnt) {
@@ -1242,6 +1406,11 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
1242 1406
1243 audit_log_end(ab); 1407 audit_log_end(ab);
1244 } 1408 }
1409
1410 /* Send end of event record to help user space know we are finished */
1411 ab = audit_log_start(context, GFP_KERNEL, AUDIT_EOE);
1412 if (ab)
1413 audit_log_end(ab);
1245 if (call_panic) 1414 if (call_panic)
1246 audit_panic("error converting sid to string"); 1415 audit_panic("error converting sid to string");
1247} 1416}
@@ -1766,6 +1935,9 @@ void auditsc_get_stamp(struct audit_context *ctx,
1766 ctx->auditable = 1; 1935 ctx->auditable = 1;
1767} 1936}
1768 1937
1938/* global counter which is incremented every time something logs in */
1939static atomic_t session_id = ATOMIC_INIT(0);
1940
1769/** 1941/**
1770 * audit_set_loginuid - set a task's audit_context loginuid 1942 * audit_set_loginuid - set a task's audit_context loginuid
1771 * @task: task whose audit context is being modified 1943 * @task: task whose audit context is being modified
@@ -1777,41 +1949,29 @@ void auditsc_get_stamp(struct audit_context *ctx,
1777 */ 1949 */
1778int audit_set_loginuid(struct task_struct *task, uid_t loginuid) 1950int audit_set_loginuid(struct task_struct *task, uid_t loginuid)
1779{ 1951{
1952 unsigned int sessionid = atomic_inc_return(&session_id);
1780 struct audit_context *context = task->audit_context; 1953 struct audit_context *context = task->audit_context;
1781 1954
1782 if (context) { 1955 if (context && context->in_syscall) {
1783 /* Only log if audit is enabled */ 1956 struct audit_buffer *ab;
1784 if (context->in_syscall) { 1957
1785 struct audit_buffer *ab; 1958 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN);
1786 1959 if (ab) {
1787 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN); 1960 audit_log_format(ab, "login pid=%d uid=%u "
1788 if (ab) { 1961 "old auid=%u new auid=%u"
1789 audit_log_format(ab, "login pid=%d uid=%u " 1962 " old ses=%u new ses=%u",
1790 "old auid=%u new auid=%u", 1963 task->pid, task->uid,
1791 task->pid, task->uid, 1964 task->loginuid, loginuid,
1792 context->loginuid, loginuid); 1965 task->sessionid, sessionid);
1793 audit_log_end(ab); 1966 audit_log_end(ab);
1794 }
1795 } 1967 }
1796 context->loginuid = loginuid;
1797 } 1968 }
1969 task->sessionid = sessionid;
1970 task->loginuid = loginuid;
1798 return 0; 1971 return 0;
1799} 1972}
1800 1973
1801/** 1974/**
1802 * audit_get_loginuid - get the loginuid for an audit_context
1803 * @ctx: the audit_context
1804 *
1805 * Returns the context's loginuid or -1 if @ctx is NULL.
1806 */
1807uid_t audit_get_loginuid(struct audit_context *ctx)
1808{
1809 return ctx ? ctx->loginuid : -1;
1810}
1811
1812EXPORT_SYMBOL(audit_get_loginuid);
1813
1814/**
1815 * __audit_mq_open - record audit data for a POSIX MQ open 1975 * __audit_mq_open - record audit data for a POSIX MQ open
1816 * @oflag: open flag 1976 * @oflag: open flag
1817 * @mode: mode bits 1977 * @mode: mode bits
@@ -2070,8 +2230,6 @@ int __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode
2070 return 0; 2230 return 0;
2071} 2231}
2072 2232
2073int audit_argv_kb = 32;
2074
2075int audit_bprm(struct linux_binprm *bprm) 2233int audit_bprm(struct linux_binprm *bprm)
2076{ 2234{
2077 struct audit_aux_data_execve *ax; 2235 struct audit_aux_data_execve *ax;
@@ -2080,14 +2238,6 @@ int audit_bprm(struct linux_binprm *bprm)
2080 if (likely(!audit_enabled || !context || context->dummy)) 2238 if (likely(!audit_enabled || !context || context->dummy))
2081 return 0; 2239 return 0;
2082 2240
2083 /*
2084 * Even though the stack code doesn't limit the arg+env size any more,
2085 * the audit code requires that _all_ arguments be logged in a single
2086 * netlink skb. Hence cap it :-(
2087 */
2088 if (bprm->argv_len > (audit_argv_kb << 10))
2089 return -E2BIG;
2090
2091 ax = kmalloc(sizeof(*ax), GFP_KERNEL); 2241 ax = kmalloc(sizeof(*ax), GFP_KERNEL);
2092 if (!ax) 2242 if (!ax)
2093 return -ENOMEM; 2243 return -ENOMEM;
@@ -2193,7 +2343,11 @@ void __audit_ptrace(struct task_struct *t)
2193 struct audit_context *context = current->audit_context; 2343 struct audit_context *context = current->audit_context;
2194 2344
2195 context->target_pid = t->pid; 2345 context->target_pid = t->pid;
2346 context->target_auid = audit_get_loginuid(t);
2347 context->target_uid = t->uid;
2348 context->target_sessionid = audit_get_sessionid(t);
2196 selinux_get_task_sid(t, &context->target_sid); 2349 selinux_get_task_sid(t, &context->target_sid);
2350 memcpy(context->target_comm, t->comm, TASK_COMM_LEN);
2197} 2351}
2198 2352
2199/** 2353/**
@@ -2216,8 +2370,8 @@ int __audit_signal_info(int sig, struct task_struct *t)
2216 if (audit_pid && t->tgid == audit_pid) { 2370 if (audit_pid && t->tgid == audit_pid) {
2217 if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1) { 2371 if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1) {
2218 audit_sig_pid = tsk->pid; 2372 audit_sig_pid = tsk->pid;
2219 if (ctx) 2373 if (tsk->loginuid != -1)
2220 audit_sig_uid = ctx->loginuid; 2374 audit_sig_uid = tsk->loginuid;
2221 else 2375 else
2222 audit_sig_uid = tsk->uid; 2376 audit_sig_uid = tsk->uid;
2223 selinux_get_task_sid(tsk, &audit_sig_sid); 2377 selinux_get_task_sid(tsk, &audit_sig_sid);
@@ -2230,7 +2384,11 @@ int __audit_signal_info(int sig, struct task_struct *t)
2230 * in audit_context */ 2384 * in audit_context */
2231 if (!ctx->target_pid) { 2385 if (!ctx->target_pid) {
2232 ctx->target_pid = t->tgid; 2386 ctx->target_pid = t->tgid;
2387 ctx->target_auid = audit_get_loginuid(t);
2388 ctx->target_uid = t->uid;
2389 ctx->target_sessionid = audit_get_sessionid(t);
2233 selinux_get_task_sid(t, &ctx->target_sid); 2390 selinux_get_task_sid(t, &ctx->target_sid);
2391 memcpy(ctx->target_comm, t->comm, TASK_COMM_LEN);
2234 return 0; 2392 return 0;
2235 } 2393 }
2236 2394
@@ -2247,7 +2405,11 @@ int __audit_signal_info(int sig, struct task_struct *t)
2247 BUG_ON(axp->pid_count >= AUDIT_AUX_PIDS); 2405 BUG_ON(axp->pid_count >= AUDIT_AUX_PIDS);
2248 2406
2249 axp->target_pid[axp->pid_count] = t->tgid; 2407 axp->target_pid[axp->pid_count] = t->tgid;
2408 axp->target_auid[axp->pid_count] = audit_get_loginuid(t);
2409 axp->target_uid[axp->pid_count] = t->uid;
2410 axp->target_sessionid[axp->pid_count] = audit_get_sessionid(t);
2250 selinux_get_task_sid(t, &axp->target_sid[axp->pid_count]); 2411 selinux_get_task_sid(t, &axp->target_sid[axp->pid_count]);
2412 memcpy(axp->target_comm[axp->pid_count], t->comm, TASK_COMM_LEN);
2251 axp->pid_count++; 2413 axp->pid_count++;
2252 2414
2253 return 0; 2415 return 0;
@@ -2264,6 +2426,8 @@ void audit_core_dumps(long signr)
2264{ 2426{
2265 struct audit_buffer *ab; 2427 struct audit_buffer *ab;
2266 u32 sid; 2428 u32 sid;
2429 uid_t auid = audit_get_loginuid(current);
2430 unsigned int sessionid = audit_get_sessionid(current);
2267 2431
2268 if (!audit_enabled) 2432 if (!audit_enabled)
2269 return; 2433 return;
@@ -2272,9 +2436,8 @@ void audit_core_dumps(long signr)
2272 return; 2436 return;
2273 2437
2274 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND); 2438 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND);
2275 audit_log_format(ab, "auid=%u uid=%u gid=%u", 2439 audit_log_format(ab, "auid=%u uid=%u gid=%u ses=%u",
2276 audit_get_loginuid(current->audit_context), 2440 auid, current->uid, current->gid, sessionid);
2277 current->uid, current->gid);
2278 selinux_get_task_sid(current, &sid); 2441 selinux_get_task_sid(current, &sid);
2279 if (sid) { 2442 if (sid) {
2280 char *ctx = NULL; 2443 char *ctx = NULL;
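The auditsc.c hunks above move loginuid from the audit_context to the task, add a per-login session id, and record auid/uid/sessionid/comm for every target task. A minimal sketch of a call site using those same helpers — the function name is illustrative only; audit_get_loginuid() and audit_get_sessionid() are assumed to take a task_struct, as in the hunks above:

static void audit_copy_target(struct audit_context *ctx, struct task_struct *t)
{
	ctx->target_pid       = t->pid;
	ctx->target_auid      = audit_get_loginuid(t);   /* loginuid set at login time */
	ctx->target_uid       = t->uid;
	ctx->target_sessionid = audit_get_sessionid(t);  /* per-login session counter */
	selinux_get_task_sid(t, &ctx->target_sid);
	memcpy(ctx->target_comm, t->comm, TASK_COMM_LEN);
}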
diff --git a/kernel/backtracetest.c b/kernel/backtracetest.c
new file mode 100644
index 000000000000..d1a7605c5b8f
--- /dev/null
+++ b/kernel/backtracetest.c
@@ -0,0 +1,48 @@
1/*
2 * Simple stack backtrace regression test module
3 *
4 * (C) Copyright 2008 Intel Corporation
5 * Author: Arjan van de Ven <arjan@linux.intel.com>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; version 2
10 * of the License.
11 */
12
13#include <linux/module.h>
14#include <linux/sched.h>
15#include <linux/delay.h>
16
17static struct timer_list backtrace_timer;
18
19static void backtrace_test_timer(unsigned long data)
20{
21 printk("Testing a backtrace from irq context.\n");
22 printk("The following trace is a kernel self test and not a bug!\n");
23 dump_stack();
24}
25static int backtrace_regression_test(void)
26{
27 printk("====[ backtrace testing ]===========\n");
28 printk("Testing a backtrace from process context.\n");
29 printk("The following trace is a kernel self test and not a bug!\n");
30 dump_stack();
31
32 init_timer(&backtrace_timer);
33 backtrace_timer.function = backtrace_test_timer;
34 mod_timer(&backtrace_timer, jiffies + 10);
35
36 msleep(10);
37 printk("====[ end of backtrace testing ]====\n");
38 return 0;
39}
40
41static void exitf(void)
42{
43}
44
45module_init(backtrace_regression_test);
46module_exit(exitf);
47MODULE_LICENSE("GPL");
48MODULE_AUTHOR("Arjan van de Ven <arjan@linux.intel.com>");
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 3fe21e19c96e..1a3c23936d43 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1,6 +1,4 @@
1/* 1/*
2 * kernel/cgroup.c
3 *
4 * Generic process-grouping system. 2 * Generic process-grouping system.
5 * 3 *
6 * Based originally on the cpuset system, extracted by Paul Menage 4 * Based originally on the cpuset system, extracted by Paul Menage
@@ -2200,7 +2198,8 @@ static void cgroup_init_subsys(struct cgroup_subsys *ss)
2200{ 2198{
2201 struct cgroup_subsys_state *css; 2199 struct cgroup_subsys_state *css;
2202 struct list_head *l; 2200 struct list_head *l;
2203 printk(KERN_ERR "Initializing cgroup subsys %s\n", ss->name); 2201
2202 printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name);
2204 2203
2205 /* Create the top cgroup state for this subsystem */ 2204 /* Create the top cgroup state for this subsystem */
2206 ss->root = &rootnode; 2205 ss->root = &rootnode;
@@ -2273,7 +2272,7 @@ int __init cgroup_init_early(void)
2273 BUG_ON(!ss->create); 2272 BUG_ON(!ss->create);
2274 BUG_ON(!ss->destroy); 2273 BUG_ON(!ss->destroy);
2275 if (ss->subsys_id != i) { 2274 if (ss->subsys_id != i) {
2276 printk(KERN_ERR "Subsys %s id == %d\n", 2275 printk(KERN_ERR "cgroup: Subsys %s id == %d\n",
2277 ss->name, ss->subsys_id); 2276 ss->name, ss->subsys_id);
2278 BUG(); 2277 BUG();
2279 } 2278 }
@@ -2605,7 +2604,7 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys)
2605 dentry = lookup_one_len(nodename, parent->dentry, strlen(nodename)); 2604 dentry = lookup_one_len(nodename, parent->dentry, strlen(nodename));
2606 if (IS_ERR(dentry)) { 2605 if (IS_ERR(dentry)) {
2607 printk(KERN_INFO 2606 printk(KERN_INFO
2608 "Couldn't allocate dentry for %s: %ld\n", nodename, 2607 "cgroup: Couldn't allocate dentry for %s: %ld\n", nodename,
2609 PTR_ERR(dentry)); 2608 PTR_ERR(dentry));
2610 ret = PTR_ERR(dentry); 2609 ret = PTR_ERR(dentry);
2611 goto out_release; 2610 goto out_release;
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 6b3a0c15144f..e0d3a4f56ecb 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -15,9 +15,8 @@
15#include <linux/stop_machine.h> 15#include <linux/stop_machine.h>
16#include <linux/mutex.h> 16#include <linux/mutex.h>
17 17
18/* This protects CPUs going up and down... */ 18/* Serializes the updates to cpu_online_map, cpu_present_map */
19static DEFINE_MUTEX(cpu_add_remove_lock); 19static DEFINE_MUTEX(cpu_add_remove_lock);
20static DEFINE_MUTEX(cpu_bitmask_lock);
21 20
22static __cpuinitdata RAW_NOTIFIER_HEAD(cpu_chain); 21static __cpuinitdata RAW_NOTIFIER_HEAD(cpu_chain);
23 22
@@ -26,52 +25,123 @@ static __cpuinitdata RAW_NOTIFIER_HEAD(cpu_chain);
26 */ 25 */
27static int cpu_hotplug_disabled; 26static int cpu_hotplug_disabled;
28 27
29#ifdef CONFIG_HOTPLUG_CPU 28static struct {
29 struct task_struct *active_writer;
30 struct mutex lock; /* Synchronizes accesses to refcount, */
31 /*
32 * Also blocks the new readers during
33 * an ongoing cpu hotplug operation.
34 */
35 int refcount;
36 wait_queue_head_t writer_queue;
37} cpu_hotplug;
30 38
31/* Crappy recursive lock-takers in cpufreq! Complain loudly about idiots */ 39#define writer_exists() (cpu_hotplug.active_writer != NULL)
32static struct task_struct *recursive;
33static int recursive_depth;
34 40
35void lock_cpu_hotplug(void) 41void __init cpu_hotplug_init(void)
36{ 42{
37 struct task_struct *tsk = current; 43 cpu_hotplug.active_writer = NULL;
38 44 mutex_init(&cpu_hotplug.lock);
39 if (tsk == recursive) { 45 cpu_hotplug.refcount = 0;
40 static int warnings = 10; 46 init_waitqueue_head(&cpu_hotplug.writer_queue);
41 if (warnings) { 47}
42 printk(KERN_ERR "Lukewarm IQ detected in hotplug locking\n"); 48
43 WARN_ON(1); 49#ifdef CONFIG_HOTPLUG_CPU
44 warnings--; 50
45 } 51void get_online_cpus(void)
46 recursive_depth++; 52{
53 might_sleep();
54 if (cpu_hotplug.active_writer == current)
47 return; 55 return;
48 } 56 mutex_lock(&cpu_hotplug.lock);
49 mutex_lock(&cpu_bitmask_lock); 57 cpu_hotplug.refcount++;
50 recursive = tsk; 58 mutex_unlock(&cpu_hotplug.lock);
59
51} 60}
52EXPORT_SYMBOL_GPL(lock_cpu_hotplug); 61EXPORT_SYMBOL_GPL(get_online_cpus);
53 62
54void unlock_cpu_hotplug(void) 63void put_online_cpus(void)
55{ 64{
56 WARN_ON(recursive != current); 65 if (cpu_hotplug.active_writer == current)
57 if (recursive_depth) {
58 recursive_depth--;
59 return; 66 return;
60 } 67 mutex_lock(&cpu_hotplug.lock);
61 recursive = NULL; 68 cpu_hotplug.refcount--;
62 mutex_unlock(&cpu_bitmask_lock); 69
70 if (unlikely(writer_exists()) && !cpu_hotplug.refcount)
71 wake_up(&cpu_hotplug.writer_queue);
72
73 mutex_unlock(&cpu_hotplug.lock);
74
63} 75}
64EXPORT_SYMBOL_GPL(unlock_cpu_hotplug); 76EXPORT_SYMBOL_GPL(put_online_cpus);
65 77
66#endif /* CONFIG_HOTPLUG_CPU */ 78#endif /* CONFIG_HOTPLUG_CPU */
67 79
80/*
81 * The following two APIs must be used when attempting
82 * to serialize the updates to cpu_online_map, cpu_present_map.
83 */
84void cpu_maps_update_begin(void)
85{
86 mutex_lock(&cpu_add_remove_lock);
87}
88
89void cpu_maps_update_done(void)
90{
91 mutex_unlock(&cpu_add_remove_lock);
92}
93
94/*
95 * This ensures that the hotplug operation can begin only when the
96 * refcount goes to zero.
97 *
98 * Note that during a cpu-hotplug operation, the new readers, if any,
99 * will be blocked by the cpu_hotplug.lock.
100 *
101 * Since cpu_hotplug_begin() is always called after invoking
102 * cpu_maps_update_begin(), we can be sure that only one writer is active.
103 *
104 * Note that theoretically, there is a possibility of a livelock:
105 * - Refcount goes to zero, last reader wakes up the sleeping
106 * writer.
107 * - Last reader unlocks the cpu_hotplug.lock.
108 * - A new reader arrives at this moment, bumps up the refcount.
109 * - The writer acquires the cpu_hotplug.lock finds the refcount
110 * non zero and goes to sleep again.
111 *
112 * However, this is very difficult to achieve in practice since
113 * get_online_cpus() is not an API that is called all that often.
114 *
115 */
116static void cpu_hotplug_begin(void)
117{
118 DECLARE_WAITQUEUE(wait, current);
119
120 mutex_lock(&cpu_hotplug.lock);
121
122 cpu_hotplug.active_writer = current;
123 add_wait_queue_exclusive(&cpu_hotplug.writer_queue, &wait);
124 while (cpu_hotplug.refcount) {
125 set_current_state(TASK_UNINTERRUPTIBLE);
126 mutex_unlock(&cpu_hotplug.lock);
127 schedule();
128 mutex_lock(&cpu_hotplug.lock);
129 }
130 remove_wait_queue_locked(&cpu_hotplug.writer_queue, &wait);
131}
132
133static void cpu_hotplug_done(void)
134{
135 cpu_hotplug.active_writer = NULL;
136 mutex_unlock(&cpu_hotplug.lock);
137}
68/* Need to know about CPUs going up/down? */ 138/* Need to know about CPUs going up/down? */
69int __cpuinit register_cpu_notifier(struct notifier_block *nb) 139int __cpuinit register_cpu_notifier(struct notifier_block *nb)
70{ 140{
71 int ret; 141 int ret;
72 mutex_lock(&cpu_add_remove_lock); 142 cpu_maps_update_begin();
73 ret = raw_notifier_chain_register(&cpu_chain, nb); 143 ret = raw_notifier_chain_register(&cpu_chain, nb);
74 mutex_unlock(&cpu_add_remove_lock); 144 cpu_maps_update_done();
75 return ret; 145 return ret;
76} 146}
77 147
@@ -81,9 +151,9 @@ EXPORT_SYMBOL(register_cpu_notifier);
81 151
82void unregister_cpu_notifier(struct notifier_block *nb) 152void unregister_cpu_notifier(struct notifier_block *nb)
83{ 153{
84 mutex_lock(&cpu_add_remove_lock); 154 cpu_maps_update_begin();
85 raw_notifier_chain_unregister(&cpu_chain, nb); 155 raw_notifier_chain_unregister(&cpu_chain, nb);
86 mutex_unlock(&cpu_add_remove_lock); 156 cpu_maps_update_done();
87} 157}
88EXPORT_SYMBOL(unregister_cpu_notifier); 158EXPORT_SYMBOL(unregister_cpu_notifier);
89 159
@@ -147,7 +217,7 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen)
147 if (!cpu_online(cpu)) 217 if (!cpu_online(cpu))
148 return -EINVAL; 218 return -EINVAL;
149 219
150 raw_notifier_call_chain(&cpu_chain, CPU_LOCK_ACQUIRE, hcpu); 220 cpu_hotplug_begin();
151 err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE | mod, 221 err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE | mod,
152 hcpu, -1, &nr_calls); 222 hcpu, -1, &nr_calls);
153 if (err == NOTIFY_BAD) { 223 if (err == NOTIFY_BAD) {
@@ -166,9 +236,7 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen)
166 cpu_clear(cpu, tmp); 236 cpu_clear(cpu, tmp);
167 set_cpus_allowed(current, tmp); 237 set_cpus_allowed(current, tmp);
168 238
169 mutex_lock(&cpu_bitmask_lock);
170 p = __stop_machine_run(take_cpu_down, &tcd_param, cpu); 239 p = __stop_machine_run(take_cpu_down, &tcd_param, cpu);
171 mutex_unlock(&cpu_bitmask_lock);
172 240
173 if (IS_ERR(p) || cpu_online(cpu)) { 241 if (IS_ERR(p) || cpu_online(cpu)) {
174 /* CPU didn't die: tell everyone. Can't complain. */ 242 /* CPU didn't die: tell everyone. Can't complain. */
@@ -202,7 +270,7 @@ out_thread:
202out_allowed: 270out_allowed:
203 set_cpus_allowed(current, old_allowed); 271 set_cpus_allowed(current, old_allowed);
204out_release: 272out_release:
205 raw_notifier_call_chain(&cpu_chain, CPU_LOCK_RELEASE, hcpu); 273 cpu_hotplug_done();
206 return err; 274 return err;
207} 275}
208 276
@@ -210,13 +278,13 @@ int cpu_down(unsigned int cpu)
210{ 278{
211 int err = 0; 279 int err = 0;
212 280
213 mutex_lock(&cpu_add_remove_lock); 281 cpu_maps_update_begin();
214 if (cpu_hotplug_disabled) 282 if (cpu_hotplug_disabled)
215 err = -EBUSY; 283 err = -EBUSY;
216 else 284 else
217 err = _cpu_down(cpu, 0); 285 err = _cpu_down(cpu, 0);
218 286
219 mutex_unlock(&cpu_add_remove_lock); 287 cpu_maps_update_done();
220 return err; 288 return err;
221} 289}
222#endif /*CONFIG_HOTPLUG_CPU*/ 290#endif /*CONFIG_HOTPLUG_CPU*/
@@ -231,7 +299,7 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
231 if (cpu_online(cpu) || !cpu_present(cpu)) 299 if (cpu_online(cpu) || !cpu_present(cpu))
232 return -EINVAL; 300 return -EINVAL;
233 301
234 raw_notifier_call_chain(&cpu_chain, CPU_LOCK_ACQUIRE, hcpu); 302 cpu_hotplug_begin();
235 ret = __raw_notifier_call_chain(&cpu_chain, CPU_UP_PREPARE | mod, hcpu, 303 ret = __raw_notifier_call_chain(&cpu_chain, CPU_UP_PREPARE | mod, hcpu,
236 -1, &nr_calls); 304 -1, &nr_calls);
237 if (ret == NOTIFY_BAD) { 305 if (ret == NOTIFY_BAD) {
@@ -243,9 +311,7 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
243 } 311 }
244 312
245 /* Arch-specific enabling code. */ 313 /* Arch-specific enabling code. */
246 mutex_lock(&cpu_bitmask_lock);
247 ret = __cpu_up(cpu); 314 ret = __cpu_up(cpu);
248 mutex_unlock(&cpu_bitmask_lock);
249 if (ret != 0) 315 if (ret != 0)
250 goto out_notify; 316 goto out_notify;
251 BUG_ON(!cpu_online(cpu)); 317 BUG_ON(!cpu_online(cpu));
@@ -257,7 +323,7 @@ out_notify:
257 if (ret != 0) 323 if (ret != 0)
258 __raw_notifier_call_chain(&cpu_chain, 324 __raw_notifier_call_chain(&cpu_chain,
259 CPU_UP_CANCELED | mod, hcpu, nr_calls, NULL); 325 CPU_UP_CANCELED | mod, hcpu, nr_calls, NULL);
260 raw_notifier_call_chain(&cpu_chain, CPU_LOCK_RELEASE, hcpu); 326 cpu_hotplug_done();
261 327
262 return ret; 328 return ret;
263} 329}
@@ -275,13 +341,13 @@ int __cpuinit cpu_up(unsigned int cpu)
275 return -EINVAL; 341 return -EINVAL;
276 } 342 }
277 343
278 mutex_lock(&cpu_add_remove_lock); 344 cpu_maps_update_begin();
279 if (cpu_hotplug_disabled) 345 if (cpu_hotplug_disabled)
280 err = -EBUSY; 346 err = -EBUSY;
281 else 347 else
282 err = _cpu_up(cpu, 0); 348 err = _cpu_up(cpu, 0);
283 349
284 mutex_unlock(&cpu_add_remove_lock); 350 cpu_maps_update_done();
285 return err; 351 return err;
286} 352}
287 353
@@ -292,7 +358,7 @@ int disable_nonboot_cpus(void)
292{ 358{
293 int cpu, first_cpu, error = 0; 359 int cpu, first_cpu, error = 0;
294 360
295 mutex_lock(&cpu_add_remove_lock); 361 cpu_maps_update_begin();
296 first_cpu = first_cpu(cpu_online_map); 362 first_cpu = first_cpu(cpu_online_map);
297 /* We take down all of the non-boot CPUs in one shot to avoid races 363 /* We take down all of the non-boot CPUs in one shot to avoid races
298 * with the userspace trying to use the CPU hotplug at the same time 364 * with the userspace trying to use the CPU hotplug at the same time
@@ -319,7 +385,7 @@ int disable_nonboot_cpus(void)
319 } else { 385 } else {
320 printk(KERN_ERR "Non-boot CPUs are not disabled\n"); 386 printk(KERN_ERR "Non-boot CPUs are not disabled\n");
321 } 387 }
322 mutex_unlock(&cpu_add_remove_lock); 388 cpu_maps_update_done();
323 return error; 389 return error;
324} 390}
325 391
@@ -328,7 +394,7 @@ void enable_nonboot_cpus(void)
328 int cpu, error; 394 int cpu, error;
329 395
330 /* Allow everyone to use the CPU hotplug again */ 396 /* Allow everyone to use the CPU hotplug again */
331 mutex_lock(&cpu_add_remove_lock); 397 cpu_maps_update_begin();
332 cpu_hotplug_disabled = 0; 398 cpu_hotplug_disabled = 0;
333 if (cpus_empty(frozen_cpus)) 399 if (cpus_empty(frozen_cpus))
334 goto out; 400 goto out;
@@ -344,6 +410,6 @@ void enable_nonboot_cpus(void)
344 } 410 }
345 cpus_clear(frozen_cpus); 411 cpus_clear(frozen_cpus);
346out: 412out:
347 mutex_unlock(&cpu_add_remove_lock); 413 cpu_maps_update_done();
348} 414}
349#endif /* CONFIG_PM_SLEEP_SMP */ 415#endif /* CONFIG_PM_SLEEP_SMP */
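The cpu.c changes replace lock_cpu_hotplug()/unlock_cpu_hotplug() with a refcounted reader side (get_online_cpus()/put_online_cpus()) and a writer side used internally by _cpu_up()/_cpu_down(). A minimal sketch of the reader-side usage, assuming only that the caller needs cpu_online_map to stay stable across the walk; the loop body is illustrative:

static void walk_online_cpus(void)
{
	int cpu;

	get_online_cpus();	/* bumps cpu_hotplug.refcount; holds off a hotplug writer */
	for_each_online_cpu(cpu)
		printk(KERN_DEBUG "cpu %d is online\n", cpu);
	put_online_cpus();	/* wakes the writer once the refcount drops to zero */
}

Unlike the old cpu_bitmask_lock, any number of readers may hold this at once; only an actual hotplug operation (cpu_hotplug_begin()) waits for the refcount to drain.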
diff --git a/kernel/cpu_acct.c b/kernel/cpu_acct.c
deleted file mode 100644
index 731e47e7f164..000000000000
--- a/kernel/cpu_acct.c
+++ /dev/null
@@ -1,186 +0,0 @@
1/*
2 * kernel/cpu_acct.c - CPU accounting cgroup subsystem
3 *
4 * Copyright (C) Google Inc, 2006
5 *
6 * Developed by Paul Menage (menage@google.com) and Balbir Singh
7 * (balbir@in.ibm.com)
8 *
9 */
10
11/*
12 * Example cgroup subsystem for reporting total CPU usage of tasks in a
13 * cgroup, along with percentage load over a time interval
14 */
15
16#include <linux/module.h>
17#include <linux/cgroup.h>
18#include <linux/fs.h>
19#include <linux/rcupdate.h>
20
21#include <asm/div64.h>
22
23struct cpuacct {
24 struct cgroup_subsys_state css;
25 spinlock_t lock;
26 /* total time used by this class */
27 cputime64_t time;
28
29 /* time when next load calculation occurs */
30 u64 next_interval_check;
31
32 /* time used in current period */
33 cputime64_t current_interval_time;
34
35 /* time used in last period */
36 cputime64_t last_interval_time;
37};
38
39struct cgroup_subsys cpuacct_subsys;
40
41static inline struct cpuacct *cgroup_ca(struct cgroup *cont)
42{
43 return container_of(cgroup_subsys_state(cont, cpuacct_subsys_id),
44 struct cpuacct, css);
45}
46
47static inline struct cpuacct *task_ca(struct task_struct *task)
48{
49 return container_of(task_subsys_state(task, cpuacct_subsys_id),
50 struct cpuacct, css);
51}
52
53#define INTERVAL (HZ * 10)
54
55static inline u64 next_interval_boundary(u64 now)
56{
57 /* calculate the next interval boundary beyond the
58 * current time */
59 do_div(now, INTERVAL);
60 return (now + 1) * INTERVAL;
61}
62
63static struct cgroup_subsys_state *cpuacct_create(
64 struct cgroup_subsys *ss, struct cgroup *cont)
65{
66 struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL);
67
68 if (!ca)
69 return ERR_PTR(-ENOMEM);
70 spin_lock_init(&ca->lock);
71 ca->next_interval_check = next_interval_boundary(get_jiffies_64());
72 return &ca->css;
73}
74
75static void cpuacct_destroy(struct cgroup_subsys *ss,
76 struct cgroup *cont)
77{
78 kfree(cgroup_ca(cont));
79}
80
81/* Lazily update the load calculation if necessary. Called with ca locked */
82static void cpuusage_update(struct cpuacct *ca)
83{
84 u64 now = get_jiffies_64();
85
86 /* If we're not due for an update, return */
87 if (ca->next_interval_check > now)
88 return;
89
90 if (ca->next_interval_check <= (now - INTERVAL)) {
91 /* If it's been more than an interval since the last
92 * check, then catch up - the last interval must have
93 * been zero load */
94 ca->last_interval_time = 0;
95 ca->next_interval_check = next_interval_boundary(now);
96 } else {
97 /* If a steal takes the last interval time negative,
98 * then we just ignore it */
99 if ((s64)ca->current_interval_time > 0)
100 ca->last_interval_time = ca->current_interval_time;
101 else
102 ca->last_interval_time = 0;
103 ca->next_interval_check += INTERVAL;
104 }
105 ca->current_interval_time = 0;
106}
107
108static u64 cpuusage_read(struct cgroup *cont, struct cftype *cft)
109{
110 struct cpuacct *ca = cgroup_ca(cont);
111 u64 time;
112
113 spin_lock_irq(&ca->lock);
114 cpuusage_update(ca);
115 time = cputime64_to_jiffies64(ca->time);
116 spin_unlock_irq(&ca->lock);
117
118 /* Convert 64-bit jiffies to seconds */
119 time *= 1000;
120 do_div(time, HZ);
121 return time;
122}
123
124static u64 load_read(struct cgroup *cont, struct cftype *cft)
125{
126 struct cpuacct *ca = cgroup_ca(cont);
127 u64 time;
128
129 /* Find the time used in the previous interval */
130 spin_lock_irq(&ca->lock);
131 cpuusage_update(ca);
132 time = cputime64_to_jiffies64(ca->last_interval_time);
133 spin_unlock_irq(&ca->lock);
134
135 /* Convert time to a percentage, to give the load in the
136 * previous period */
137 time *= 100;
138 do_div(time, INTERVAL);
139
140 return time;
141}
142
143static struct cftype files[] = {
144 {
145 .name = "usage",
146 .read_uint = cpuusage_read,
147 },
148 {
149 .name = "load",
150 .read_uint = load_read,
151 }
152};
153
154static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cont)
155{
156 return cgroup_add_files(cont, ss, files, ARRAY_SIZE(files));
157}
158
159void cpuacct_charge(struct task_struct *task, cputime_t cputime)
160{
161
162 struct cpuacct *ca;
163 unsigned long flags;
164
165 if (!cpuacct_subsys.active)
166 return;
167 rcu_read_lock();
168 ca = task_ca(task);
169 if (ca) {
170 spin_lock_irqsave(&ca->lock, flags);
171 cpuusage_update(ca);
172 ca->time = cputime64_add(ca->time, cputime);
173 ca->current_interval_time =
174 cputime64_add(ca->current_interval_time, cputime);
175 spin_unlock_irqrestore(&ca->lock, flags);
176 }
177 rcu_read_unlock();
178}
179
180struct cgroup_subsys cpuacct_subsys = {
181 .name = "cpuacct",
182 .create = cpuacct_create,
183 .destroy = cpuacct_destroy,
184 .populate = cpuacct_populate,
185 .subsys_id = cpuacct_subsys_id,
186};
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 50f5dc463688..cfaf6419d817 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -537,10 +537,10 @@ static int cpusets_overlap(struct cpuset *a, struct cpuset *b)
537 * 537 *
538 * Call with cgroup_mutex held. May take callback_mutex during 538 * Call with cgroup_mutex held. May take callback_mutex during
539 * call due to the kfifo_alloc() and kmalloc() calls. May nest 539 * call due to the kfifo_alloc() and kmalloc() calls. May nest
540 * a call to the lock_cpu_hotplug()/unlock_cpu_hotplug() pair. 540 * a call to the get_online_cpus()/put_online_cpus() pair.
541 * Must not be called holding callback_mutex, because we must not 541 * Must not be called holding callback_mutex, because we must not
542 * call lock_cpu_hotplug() while holding callback_mutex. Elsewhere 542 * call get_online_cpus() while holding callback_mutex. Elsewhere
543 * the kernel nests callback_mutex inside lock_cpu_hotplug() calls. 543 * the kernel nests callback_mutex inside get_online_cpus() calls.
544 * So the reverse nesting would risk an ABBA deadlock. 544 * So the reverse nesting would risk an ABBA deadlock.
545 * 545 *
546 * The three key local variables below are: 546 * The three key local variables below are:
@@ -691,9 +691,9 @@ restart:
691 691
692rebuild: 692rebuild:
693 /* Have scheduler rebuild sched domains */ 693 /* Have scheduler rebuild sched domains */
694 lock_cpu_hotplug(); 694 get_online_cpus();
695 partition_sched_domains(ndoms, doms); 695 partition_sched_domains(ndoms, doms);
696 unlock_cpu_hotplug(); 696 put_online_cpus();
697 697
698done: 698done:
699 if (q && !IS_ERR(q)) 699 if (q && !IS_ERR(q))
@@ -1617,10 +1617,10 @@ static struct cgroup_subsys_state *cpuset_create(
1617 * 1617 *
1618 * If the cpuset being removed has its flag 'sched_load_balance' 1618 * If the cpuset being removed has its flag 'sched_load_balance'
1619 * enabled, then simulate turning sched_load_balance off, which 1619 * enabled, then simulate turning sched_load_balance off, which
1620 * will call rebuild_sched_domains(). The lock_cpu_hotplug() 1620 * will call rebuild_sched_domains(). The get_online_cpus()
1621 * call in rebuild_sched_domains() must not be made while holding 1621 * call in rebuild_sched_domains() must not be made while holding
1622 * callback_mutex. Elsewhere the kernel nests callback_mutex inside 1622 * callback_mutex. Elsewhere the kernel nests callback_mutex inside
1623 * lock_cpu_hotplug() calls. So the reverse nesting would risk an 1623 * get_online_cpus() calls. So the reverse nesting would risk an
1624 * ABBA deadlock. 1624 * ABBA deadlock.
1625 */ 1625 */
1626 1626
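The cpuset.c comment updates keep the documented lock ordering: callback_mutex must nest inside get_online_cpus(), never the reverse. An illustrative sketch of that ordering, using cpuset.c's callback_mutex; the function name and body are placeholders:

static void update_domains_safely(void)
{
	get_online_cpus();		/* outer: CPU hotplug read side */
	mutex_lock(&callback_mutex);	/* inner: cpuset callback lock */
	/* ... recompute state that depends on cpu_online_map ... */
	mutex_unlock(&callback_mutex);
	put_online_cpus();
}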
diff --git a/kernel/exit.c b/kernel/exit.c
index f1aec27f1df0..bfb1c0e940e8 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -249,7 +249,7 @@ static int has_stopped_jobs(struct pid *pgrp)
249 struct task_struct *p; 249 struct task_struct *p;
250 250
251 do_each_pid_task(pgrp, PIDTYPE_PGID, p) { 251 do_each_pid_task(pgrp, PIDTYPE_PGID, p) {
252 if (p->state != TASK_STOPPED) 252 if (!task_is_stopped(p))
253 continue; 253 continue;
254 retval = 1; 254 retval = 1;
255 break; 255 break;
@@ -614,7 +614,7 @@ reparent_thread(struct task_struct *p, struct task_struct *father, int traced)
614 p->parent = p->real_parent; 614 p->parent = p->real_parent;
615 add_parent(p); 615 add_parent(p);
616 616
617 if (p->state == TASK_TRACED) { 617 if (task_is_traced(p)) {
618 /* 618 /*
619 * If it was at a trace stop, turn it into 619 * If it was at a trace stop, turn it into
620 * a normal stop since it's no longer being 620 * a normal stop since it's no longer being
@@ -1357,7 +1357,7 @@ static int wait_task_stopped(struct task_struct *p, int delayed_group_leader,
1357 int __user *stat_addr, struct rusage __user *ru) 1357 int __user *stat_addr, struct rusage __user *ru)
1358{ 1358{
1359 int retval, exit_code; 1359 int retval, exit_code;
1360 struct pid_namespace *ns; 1360 pid_t pid;
1361 1361
1362 if (!p->exit_code) 1362 if (!p->exit_code)
1363 return 0; 1363 return 0;
@@ -1376,21 +1376,19 @@ static int wait_task_stopped(struct task_struct *p, int delayed_group_leader,
1376 * keep holding onto the tasklist_lock while we call getrusage and 1376 * keep holding onto the tasklist_lock while we call getrusage and
1377 * possibly take page faults for user memory. 1377 * possibly take page faults for user memory.
1378 */ 1378 */
1379 ns = current->nsproxy->pid_ns; 1379 pid = task_pid_nr_ns(p, current->nsproxy->pid_ns);
1380 get_task_struct(p); 1380 get_task_struct(p);
1381 read_unlock(&tasklist_lock); 1381 read_unlock(&tasklist_lock);
1382 1382
1383 if (unlikely(noreap)) { 1383 if (unlikely(noreap)) {
1384 pid_t pid = task_pid_nr_ns(p, ns);
1385 uid_t uid = p->uid; 1384 uid_t uid = p->uid;
1386 int why = (p->ptrace & PT_PTRACED) ? CLD_TRAPPED : CLD_STOPPED; 1385 int why = (p->ptrace & PT_PTRACED) ? CLD_TRAPPED : CLD_STOPPED;
1387 1386
1388 exit_code = p->exit_code; 1387 exit_code = p->exit_code;
1389 if (unlikely(!exit_code) || 1388 if (unlikely(!exit_code) || unlikely(p->exit_state))
1390 unlikely(p->state & TASK_TRACED))
1391 goto bail_ref; 1389 goto bail_ref;
1392 return wait_noreap_copyout(p, pid, uid, 1390 return wait_noreap_copyout(p, pid, uid,
1393 why, (exit_code << 8) | 0x7f, 1391 why, exit_code,
1394 infop, ru); 1392 infop, ru);
1395 } 1393 }
1396 1394
@@ -1452,11 +1450,11 @@ bail_ref:
1452 if (!retval && infop) 1450 if (!retval && infop)
1453 retval = put_user(exit_code, &infop->si_status); 1451 retval = put_user(exit_code, &infop->si_status);
1454 if (!retval && infop) 1452 if (!retval && infop)
1455 retval = put_user(task_pid_nr_ns(p, ns), &infop->si_pid); 1453 retval = put_user(pid, &infop->si_pid);
1456 if (!retval && infop) 1454 if (!retval && infop)
1457 retval = put_user(p->uid, &infop->si_uid); 1455 retval = put_user(p->uid, &infop->si_uid);
1458 if (!retval) 1456 if (!retval)
1459 retval = task_pid_nr_ns(p, ns); 1457 retval = pid;
1460 put_task_struct(p); 1458 put_task_struct(p);
1461 1459
1462 BUG_ON(!retval); 1460 BUG_ON(!retval);
@@ -1565,60 +1563,51 @@ repeat:
1565 } 1563 }
1566 allowed = 1; 1564 allowed = 1;
1567 1565
1568 switch (p->state) { 1566 if (task_is_stopped_or_traced(p)) {
1569 case TASK_TRACED:
1570 /*
1571 * When we hit the race with PTRACE_ATTACH,
1572 * we will not report this child. But the
1573 * race means it has not yet been moved to
1574 * our ptrace_children list, so we need to
1575 * set the flag here to avoid a spurious ECHILD
1576 * when the race happens with the only child.
1577 */
1578 flag = 1;
1579 if (!my_ptrace_child(p))
1580 continue;
1581 /*FALLTHROUGH*/
1582 case TASK_STOPPED:
1583 /* 1567 /*
1584 * It's stopped now, so it might later 1568 * It's stopped now, so it might later
1585 * continue, exit, or stop again. 1569 * continue, exit, or stop again.
1570 *
1571 * When we hit the race with PTRACE_ATTACH, we
1572 * will not report this child. But the race
1573 * means it has not yet been moved to our
1574 * ptrace_children list, so we need to set the
1575 * flag here to avoid a spurious ECHILD when
1576 * the race happens with the only child.
1586 */ 1577 */
1587 flag = 1; 1578 flag = 1;
1588 if (!(options & WUNTRACED) && 1579
1589 !my_ptrace_child(p)) 1580 if (!my_ptrace_child(p)) {
1590 continue; 1581 if (task_is_traced(p))
1582 continue;
1583 if (!(options & WUNTRACED))
1584 continue;
1585 }
1586
1591 retval = wait_task_stopped(p, ret == 2, 1587 retval = wait_task_stopped(p, ret == 2,
1592 (options & WNOWAIT), 1588 (options & WNOWAIT), infop,
1593 infop, 1589 stat_addr, ru);
1594 stat_addr, ru);
1595 if (retval == -EAGAIN) 1590 if (retval == -EAGAIN)
1596 goto repeat; 1591 goto repeat;
1597 if (retval != 0) /* He released the lock. */ 1592 if (retval != 0) /* He released the lock. */
1598 goto end; 1593 goto end;
1599 break; 1594 } else if (p->exit_state == EXIT_DEAD) {
1600 default: 1595 continue;
1601 // case EXIT_DEAD: 1596 } else if (p->exit_state == EXIT_ZOMBIE) {
1602 if (p->exit_state == EXIT_DEAD) 1597 /*
1598 * Eligible but we cannot release it yet:
1599 */
1600 if (ret == 2)
1601 goto check_continued;
1602 if (!likely(options & WEXITED))
1603 continue; 1603 continue;
1604 // case EXIT_ZOMBIE: 1604 retval = wait_task_zombie(p,
1605 if (p->exit_state == EXIT_ZOMBIE) { 1605 (options & WNOWAIT), infop,
1606 /* 1606 stat_addr, ru);
1607 * Eligible but we cannot release 1607 /* He released the lock. */
1608 * it yet: 1608 if (retval != 0)
1609 */ 1609 goto end;
1610 if (ret == 2) 1610 } else {
1611 goto check_continued;
1612 if (!likely(options & WEXITED))
1613 continue;
1614 retval = wait_task_zombie(
1615 p, (options & WNOWAIT),
1616 infop, stat_addr, ru);
1617 /* He released the lock. */
1618 if (retval != 0)
1619 goto end;
1620 break;
1621 }
1622check_continued: 1611check_continued:
1623 /* 1612 /*
1624 * It's running now, so it might later 1613 * It's running now, so it might later
@@ -1627,12 +1616,11 @@ check_continued:
1627 flag = 1; 1616 flag = 1;
1628 if (!unlikely(options & WCONTINUED)) 1617 if (!unlikely(options & WCONTINUED))
1629 continue; 1618 continue;
1630 retval = wait_task_continued( 1619 retval = wait_task_continued(p,
1631 p, (options & WNOWAIT), 1620 (options & WNOWAIT), infop,
1632 infop, stat_addr, ru); 1621 stat_addr, ru);
1633 if (retval != 0) /* He released the lock. */ 1622 if (retval != 0) /* He released the lock. */
1634 goto end; 1623 goto end;
1635 break;
1636 } 1624 }
1637 } 1625 }
1638 if (!flag) { 1626 if (!flag) {
diff --git a/kernel/extable.c b/kernel/extable.c
index 7fe262855317..a26cb2e17023 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -46,7 +46,8 @@ int core_kernel_text(unsigned long addr)
46 addr <= (unsigned long)_etext) 46 addr <= (unsigned long)_etext)
47 return 1; 47 return 1;
48 48
49 if (addr >= (unsigned long)_sinittext && 49 if (system_state == SYSTEM_BOOTING &&
50 addr >= (unsigned long)_sinittext &&
50 addr <= (unsigned long)_einittext) 51 addr <= (unsigned long)_einittext)
51 return 1; 52 return 1;
52 return 0; 53 return 0;
diff --git a/kernel/fork.c b/kernel/fork.c
index ddafdfac9456..05e0b6f4365b 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -51,6 +51,7 @@
51#include <linux/random.h> 51#include <linux/random.h>
52#include <linux/tty.h> 52#include <linux/tty.h>
53#include <linux/proc_fs.h> 53#include <linux/proc_fs.h>
54#include <linux/blkdev.h>
54 55
55#include <asm/pgtable.h> 56#include <asm/pgtable.h>
56#include <asm/pgalloc.h> 57#include <asm/pgalloc.h>
@@ -392,6 +393,7 @@ void fastcall __mmdrop(struct mm_struct *mm)
392 destroy_context(mm); 393 destroy_context(mm);
393 free_mm(mm); 394 free_mm(mm);
394} 395}
396EXPORT_SYMBOL_GPL(__mmdrop);
395 397
396/* 398/*
397 * Decrement the use count and release all resources for an mm. 399 * Decrement the use count and release all resources for an mm.
@@ -791,6 +793,31 @@ out:
791 return error; 793 return error;
792} 794}
793 795
796static int copy_io(unsigned long clone_flags, struct task_struct *tsk)
797{
798#ifdef CONFIG_BLOCK
799 struct io_context *ioc = current->io_context;
800
801 if (!ioc)
802 return 0;
803 /*
804 * Share io context with parent, if CLONE_IO is set
805 */
806 if (clone_flags & CLONE_IO) {
807 tsk->io_context = ioc_task_link(ioc);
808 if (unlikely(!tsk->io_context))
809 return -ENOMEM;
810 } else if (ioprio_valid(ioc->ioprio)) {
811 tsk->io_context = alloc_io_context(GFP_KERNEL, -1);
812 if (unlikely(!tsk->io_context))
813 return -ENOMEM;
814
815 tsk->io_context->ioprio = ioc->ioprio;
816 }
817#endif
818 return 0;
819}
820
794/* 821/*
795 * Helper to unshare the files of the current task. 822 * Helper to unshare the files of the current task.
796 * We don't want to expose copy_files internals to 823 * We don't want to expose copy_files internals to
@@ -1045,6 +1072,10 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1045 copy_flags(clone_flags, p); 1072 copy_flags(clone_flags, p);
1046 INIT_LIST_HEAD(&p->children); 1073 INIT_LIST_HEAD(&p->children);
1047 INIT_LIST_HEAD(&p->sibling); 1074 INIT_LIST_HEAD(&p->sibling);
1075#ifdef CONFIG_PREEMPT_RCU
1076 p->rcu_read_lock_nesting = 0;
1077 p->rcu_flipctr_idx = 0;
1078#endif /* #ifdef CONFIG_PREEMPT_RCU */
1048 p->vfork_done = NULL; 1079 p->vfork_done = NULL;
1049 spin_lock_init(&p->alloc_lock); 1080 spin_lock_init(&p->alloc_lock);
1050 1081
@@ -1056,6 +1087,13 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1056 p->gtime = cputime_zero; 1087 p->gtime = cputime_zero;
1057 p->utimescaled = cputime_zero; 1088 p->utimescaled = cputime_zero;
1058 p->stimescaled = cputime_zero; 1089 p->stimescaled = cputime_zero;
1090 p->prev_utime = cputime_zero;
1091 p->prev_stime = cputime_zero;
1092
1093#ifdef CONFIG_DETECT_SOFTLOCKUP
1094 p->last_switch_count = 0;
1095 p->last_switch_timestamp = 0;
1096#endif
1059 1097
1060#ifdef CONFIG_TASK_XACCT 1098#ifdef CONFIG_TASK_XACCT
1061 p->rchar = 0; /* I/O counter: bytes read */ 1099 p->rchar = 0; /* I/O counter: bytes read */
@@ -1121,6 +1159,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1121 p->blocked_on = NULL; /* not blocked yet */ 1159 p->blocked_on = NULL; /* not blocked yet */
1122#endif 1160#endif
1123 1161
1162 /* Perform scheduler related setup. Assign this task to a CPU. */
1163 sched_fork(p, clone_flags);
1164
1124 if ((retval = security_task_alloc(p))) 1165 if ((retval = security_task_alloc(p)))
1125 goto bad_fork_cleanup_policy; 1166 goto bad_fork_cleanup_policy;
1126 if ((retval = audit_alloc(p))) 1167 if ((retval = audit_alloc(p)))
@@ -1142,15 +1183,17 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1142 goto bad_fork_cleanup_mm; 1183 goto bad_fork_cleanup_mm;
1143 if ((retval = copy_namespaces(clone_flags, p))) 1184 if ((retval = copy_namespaces(clone_flags, p)))
1144 goto bad_fork_cleanup_keys; 1185 goto bad_fork_cleanup_keys;
1186 if ((retval = copy_io(clone_flags, p)))
1187 goto bad_fork_cleanup_namespaces;
1145 retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs); 1188 retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs);
1146 if (retval) 1189 if (retval)
1147 goto bad_fork_cleanup_namespaces; 1190 goto bad_fork_cleanup_io;
1148 1191
1149 if (pid != &init_struct_pid) { 1192 if (pid != &init_struct_pid) {
1150 retval = -ENOMEM; 1193 retval = -ENOMEM;
1151 pid = alloc_pid(task_active_pid_ns(p)); 1194 pid = alloc_pid(task_active_pid_ns(p));
1152 if (!pid) 1195 if (!pid)
1153 goto bad_fork_cleanup_namespaces; 1196 goto bad_fork_cleanup_io;
1154 1197
1155 if (clone_flags & CLONE_NEWPID) { 1198 if (clone_flags & CLONE_NEWPID) {
1156 retval = pid_ns_prepare_proc(task_active_pid_ns(p)); 1199 retval = pid_ns_prepare_proc(task_active_pid_ns(p));
@@ -1191,6 +1234,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1191#ifdef TIF_SYSCALL_EMU 1234#ifdef TIF_SYSCALL_EMU
1192 clear_tsk_thread_flag(p, TIF_SYSCALL_EMU); 1235 clear_tsk_thread_flag(p, TIF_SYSCALL_EMU);
1193#endif 1236#endif
1237 clear_all_latency_tracing(p);
1194 1238
1195 /* Our parent execution domain becomes current domain 1239 /* Our parent execution domain becomes current domain
1196 These must match for thread signalling to apply */ 1240 These must match for thread signalling to apply */
@@ -1210,9 +1254,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1210 INIT_LIST_HEAD(&p->ptrace_children); 1254 INIT_LIST_HEAD(&p->ptrace_children);
1211 INIT_LIST_HEAD(&p->ptrace_list); 1255 INIT_LIST_HEAD(&p->ptrace_list);
1212 1256
1213 /* Perform scheduler related setup. Assign this task to a CPU. */
1214 sched_fork(p, clone_flags);
1215
1216 /* Now that the task is set up, run cgroup callbacks if 1257 /* Now that the task is set up, run cgroup callbacks if
1217 * necessary. We need to run them before the task is visible 1258 * necessary. We need to run them before the task is visible
1218 * on the tasklist. */ 1259 * on the tasklist. */
@@ -1222,9 +1263,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1222 /* Need tasklist lock for parent etc handling! */ 1263 /* Need tasklist lock for parent etc handling! */
1223 write_lock_irq(&tasklist_lock); 1264 write_lock_irq(&tasklist_lock);
1224 1265
1225 /* for sys_ioprio_set(IOPRIO_WHO_PGRP) */
1226 p->ioprio = current->ioprio;
1227
1228 /* 1266 /*
1229 * The task hasn't been attached yet, so its cpus_allowed mask will 1267 * The task hasn't been attached yet, so its cpus_allowed mask will
1230 * not be changed, nor will its assigned CPU. 1268 * not be changed, nor will its assigned CPU.
@@ -1235,6 +1273,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1235 * parent's CPU). This avoids alot of nasty races. 1273 * parent's CPU). This avoids alot of nasty races.
1236 */ 1274 */
1237 p->cpus_allowed = current->cpus_allowed; 1275 p->cpus_allowed = current->cpus_allowed;
1276 p->rt.nr_cpus_allowed = current->rt.nr_cpus_allowed;
1238 if (unlikely(!cpu_isset(task_cpu(p), p->cpus_allowed) || 1277 if (unlikely(!cpu_isset(task_cpu(p), p->cpus_allowed) ||
1239 !cpu_online(task_cpu(p)))) 1278 !cpu_online(task_cpu(p))))
1240 set_task_cpu(p, smp_processor_id()); 1279 set_task_cpu(p, smp_processor_id());
@@ -1290,23 +1329,14 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1290 __ptrace_link(p, current->parent); 1329 __ptrace_link(p, current->parent);
1291 1330
1292 if (thread_group_leader(p)) { 1331 if (thread_group_leader(p)) {
1293 if (clone_flags & CLONE_NEWPID) { 1332 if (clone_flags & CLONE_NEWPID)
1294 p->nsproxy->pid_ns->child_reaper = p; 1333 p->nsproxy->pid_ns->child_reaper = p;
1295 p->signal->tty = NULL;
1296 set_task_pgrp(p, p->pid);
1297 set_task_session(p, p->pid);
1298 attach_pid(p, PIDTYPE_PGID, pid);
1299 attach_pid(p, PIDTYPE_SID, pid);
1300 } else {
1301 p->signal->tty = current->signal->tty;
1302 set_task_pgrp(p, task_pgrp_nr(current));
1303 set_task_session(p, task_session_nr(current));
1304 attach_pid(p, PIDTYPE_PGID,
1305 task_pgrp(current));
1306 attach_pid(p, PIDTYPE_SID,
1307 task_session(current));
1308 }
1309 1334
1335 p->signal->tty = current->signal->tty;
1336 set_task_pgrp(p, task_pgrp_nr(current));
1337 set_task_session(p, task_session_nr(current));
1338 attach_pid(p, PIDTYPE_PGID, task_pgrp(current));
1339 attach_pid(p, PIDTYPE_SID, task_session(current));
1310 list_add_tail_rcu(&p->tasks, &init_task.tasks); 1340 list_add_tail_rcu(&p->tasks, &init_task.tasks);
1311 __get_cpu_var(process_counts)++; 1341 __get_cpu_var(process_counts)++;
1312 } 1342 }
@@ -1324,6 +1354,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1324bad_fork_free_pid: 1354bad_fork_free_pid:
1325 if (pid != &init_struct_pid) 1355 if (pid != &init_struct_pid)
1326 free_pid(pid); 1356 free_pid(pid);
1357bad_fork_cleanup_io:
1358 put_io_context(p->io_context);
1327bad_fork_cleanup_namespaces: 1359bad_fork_cleanup_namespaces:
1328 exit_task_namespaces(p); 1360 exit_task_namespaces(p);
1329bad_fork_cleanup_keys: 1361bad_fork_cleanup_keys:
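The fork.c hunks add copy_io(), which either shares the parent's io_context when CLONE_IO is set or allocates a fresh one that inherits the parent's ioprio. A user-space sketch of requesting the shared context via clone(); CLONE_IO may be missing from older libc headers, so its kernel value is defined locally, and error handling and child reaping are omitted:

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <stdlib.h>
#include <signal.h>

#ifndef CLONE_IO
#define CLONE_IO 0x80000000	/* share io_context, per the patch above */
#endif

static int worker(void *arg)
{
	/* I/O issued here is charged to the shared io_context/ioprio. */
	return 0;
}

int main(void)
{
	char *stack = malloc(64 * 1024);

	if (!stack)
		return 1;
	if (clone(worker, stack + 64 * 1024,
		  CLONE_VM | CLONE_FS | CLONE_FILES | CLONE_IO | SIGCHLD,
		  NULL) == -1) {
		perror("clone");
		return 1;
	}
	return 0;
}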
diff --git a/kernel/futex.c b/kernel/futex.c
index 32710451dc20..a6baaec44b8f 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -109,6 +109,9 @@ struct futex_q {
109 /* Optional priority inheritance state: */ 109 /* Optional priority inheritance state: */
110 struct futex_pi_state *pi_state; 110 struct futex_pi_state *pi_state;
111 struct task_struct *task; 111 struct task_struct *task;
112
113 /* Bitset for the optional bitmasked wakeup */
114 u32 bitset;
112}; 115};
113 116
114/* 117/*
@@ -181,8 +184,8 @@ static inline int match_futex(union futex_key *key1, union futex_key *key2)
181 * For other futexes, it points to &current->mm->mmap_sem and 184 * For other futexes, it points to &current->mm->mmap_sem and
182 * caller must have taken the reader lock. but NOT any spinlocks. 185 * caller must have taken the reader lock. but NOT any spinlocks.
183 */ 186 */
184int get_futex_key(u32 __user *uaddr, struct rw_semaphore *fshared, 187static int get_futex_key(u32 __user *uaddr, struct rw_semaphore *fshared,
185 union futex_key *key) 188 union futex_key *key)
186{ 189{
187 unsigned long address = (unsigned long)uaddr; 190 unsigned long address = (unsigned long)uaddr;
188 struct mm_struct *mm = current->mm; 191 struct mm_struct *mm = current->mm;
@@ -268,14 +271,13 @@ int get_futex_key(u32 __user *uaddr, struct rw_semaphore *fshared,
268 } 271 }
269 return err; 272 return err;
270} 273}
271EXPORT_SYMBOL_GPL(get_futex_key);
272 274
273/* 275/*
274 * Take a reference to the resource addressed by a key. 276 * Take a reference to the resource addressed by a key.
275 * Can be called while holding spinlocks. 277 * Can be called while holding spinlocks.
276 * 278 *
277 */ 279 */
278inline void get_futex_key_refs(union futex_key *key) 280static void get_futex_key_refs(union futex_key *key)
279{ 281{
280 if (key->both.ptr == 0) 282 if (key->both.ptr == 0)
281 return; 283 return;
@@ -288,13 +290,12 @@ inline void get_futex_key_refs(union futex_key *key)
288 break; 290 break;
289 } 291 }
290} 292}
291EXPORT_SYMBOL_GPL(get_futex_key_refs);
292 293
293/* 294/*
294 * Drop a reference to the resource addressed by a key. 295 * Drop a reference to the resource addressed by a key.
295 * The hash bucket spinlock must not be held. 296 * The hash bucket spinlock must not be held.
296 */ 297 */
297void drop_futex_key_refs(union futex_key *key) 298static void drop_futex_key_refs(union futex_key *key)
298{ 299{
299 if (!key->both.ptr) 300 if (!key->both.ptr)
300 return; 301 return;
@@ -307,7 +308,6 @@ void drop_futex_key_refs(union futex_key *key)
307 break; 308 break;
308 } 309 }
309} 310}
310EXPORT_SYMBOL_GPL(drop_futex_key_refs);
311 311
312static u32 cmpxchg_futex_value_locked(u32 __user *uaddr, u32 uval, u32 newval) 312static u32 cmpxchg_futex_value_locked(u32 __user *uaddr, u32 uval, u32 newval)
313{ 313{
@@ -661,7 +661,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
661 661
662 if (curval == -EFAULT) 662 if (curval == -EFAULT)
663 ret = -EFAULT; 663 ret = -EFAULT;
664 if (curval != uval) 664 else if (curval != uval)
665 ret = -EINVAL; 665 ret = -EINVAL;
666 if (ret) { 666 if (ret) {
667 spin_unlock(&pi_state->pi_mutex.wait_lock); 667 spin_unlock(&pi_state->pi_mutex.wait_lock);
@@ -725,7 +725,7 @@ double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
725 * to this virtual address: 725 * to this virtual address:
726 */ 726 */
727static int futex_wake(u32 __user *uaddr, struct rw_semaphore *fshared, 727static int futex_wake(u32 __user *uaddr, struct rw_semaphore *fshared,
728 int nr_wake) 728 int nr_wake, u32 bitset)
729{ 729{
730 struct futex_hash_bucket *hb; 730 struct futex_hash_bucket *hb;
731 struct futex_q *this, *next; 731 struct futex_q *this, *next;
@@ -733,6 +733,9 @@ static int futex_wake(u32 __user *uaddr, struct rw_semaphore *fshared,
733 union futex_key key; 733 union futex_key key;
734 int ret; 734 int ret;
735 735
736 if (!bitset)
737 return -EINVAL;
738
736 futex_lock_mm(fshared); 739 futex_lock_mm(fshared);
737 740
738 ret = get_futex_key(uaddr, fshared, &key); 741 ret = get_futex_key(uaddr, fshared, &key);
@@ -749,6 +752,11 @@ static int futex_wake(u32 __user *uaddr, struct rw_semaphore *fshared,
749 ret = -EINVAL; 752 ret = -EINVAL;
750 break; 753 break;
751 } 754 }
755
756 /* Check if one of the bits is set in both bitsets */
757 if (!(this->bitset & bitset))
758 continue;
759
752 wake_futex(this); 760 wake_futex(this);
753 if (++ret >= nr_wake) 761 if (++ret >= nr_wake)
754 break; 762 break;
@@ -1100,15 +1108,15 @@ static void unqueue_me_pi(struct futex_q *q)
1100} 1108}
1101 1109
1102/* 1110/*
1103 * Fixup the pi_state owner with current. 1111 * Fixup the pi_state owner with the new owner.
1104 * 1112 *
1105 * Must be called with hash bucket lock held and mm->sem held for non 1113 * Must be called with hash bucket lock held and mm->sem held for non
1106 * private futexes. 1114 * private futexes.
1107 */ 1115 */
1108static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, 1116static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
1109 struct task_struct *curr) 1117 struct task_struct *newowner)
1110{ 1118{
1111 u32 newtid = task_pid_vnr(curr) | FUTEX_WAITERS; 1119 u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
1112 struct futex_pi_state *pi_state = q->pi_state; 1120 struct futex_pi_state *pi_state = q->pi_state;
1113 u32 uval, curval, newval; 1121 u32 uval, curval, newval;
1114 int ret; 1122 int ret;
@@ -1122,12 +1130,12 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
1122 } else 1130 } else
1123 newtid |= FUTEX_OWNER_DIED; 1131 newtid |= FUTEX_OWNER_DIED;
1124 1132
1125 pi_state->owner = curr; 1133 pi_state->owner = newowner;
1126 1134
1127 spin_lock_irq(&curr->pi_lock); 1135 spin_lock_irq(&newowner->pi_lock);
1128 WARN_ON(!list_empty(&pi_state->list)); 1136 WARN_ON(!list_empty(&pi_state->list));
1129 list_add(&pi_state->list, &curr->pi_state_list); 1137 list_add(&pi_state->list, &newowner->pi_state_list);
1130 spin_unlock_irq(&curr->pi_lock); 1138 spin_unlock_irq(&newowner->pi_lock);
1131 1139
1132 /* 1140 /*
1133 * We own it, so we have to replace the pending owner 1141 * We own it, so we have to replace the pending owner
@@ -1152,14 +1160,14 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
1152 1160
1153/* 1161/*
1154 * In case we must use restart_block to restart a futex_wait, 1162 * In case we must use restart_block to restart a futex_wait,
1155 * we encode in the 'arg3' shared capability 1163 * we encode in the 'flags' shared capability
1156 */ 1164 */
1157#define ARG3_SHARED 1 1165#define FLAGS_SHARED 1
1158 1166
1159static long futex_wait_restart(struct restart_block *restart); 1167static long futex_wait_restart(struct restart_block *restart);
1160 1168
1161static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared, 1169static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
1162 u32 val, ktime_t *abs_time) 1170 u32 val, ktime_t *abs_time, u32 bitset)
1163{ 1171{
1164 struct task_struct *curr = current; 1172 struct task_struct *curr = current;
1165 DECLARE_WAITQUEUE(wait, curr); 1173 DECLARE_WAITQUEUE(wait, curr);
@@ -1170,7 +1178,11 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
1170 struct hrtimer_sleeper t; 1178 struct hrtimer_sleeper t;
1171 int rem = 0; 1179 int rem = 0;
1172 1180
1181 if (!bitset)
1182 return -EINVAL;
1183
1173 q.pi_state = NULL; 1184 q.pi_state = NULL;
1185 q.bitset = bitset;
1174 retry: 1186 retry:
1175 futex_lock_mm(fshared); 1187 futex_lock_mm(fshared);
1176 1188
@@ -1255,6 +1267,8 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
1255 t.timer.expires = *abs_time; 1267 t.timer.expires = *abs_time;
1256 1268
1257 hrtimer_start(&t.timer, t.timer.expires, HRTIMER_MODE_ABS); 1269 hrtimer_start(&t.timer, t.timer.expires, HRTIMER_MODE_ABS);
1270 if (!hrtimer_active(&t.timer))
1271 t.task = NULL;
1258 1272
1259 /* 1273 /*
1260 * the timer could have already expired, in which 1274 * the timer could have already expired, in which
@@ -1293,12 +1307,14 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
1293 struct restart_block *restart; 1307 struct restart_block *restart;
1294 restart = &current_thread_info()->restart_block; 1308 restart = &current_thread_info()->restart_block;
1295 restart->fn = futex_wait_restart; 1309 restart->fn = futex_wait_restart;
1296 restart->arg0 = (unsigned long)uaddr; 1310 restart->futex.uaddr = (u32 *)uaddr;
1297 restart->arg1 = (unsigned long)val; 1311 restart->futex.val = val;
1298 restart->arg2 = (unsigned long)abs_time; 1312 restart->futex.time = abs_time->tv64;
1299 restart->arg3 = 0; 1313 restart->futex.bitset = bitset;
1314 restart->futex.flags = 0;
1315
1300 if (fshared) 1316 if (fshared)
1301 restart->arg3 |= ARG3_SHARED; 1317 restart->futex.flags |= FLAGS_SHARED;
1302 return -ERESTART_RESTARTBLOCK; 1318 return -ERESTART_RESTARTBLOCK;
1303 } 1319 }
1304 1320
@@ -1313,15 +1329,16 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
1313 1329
1314static long futex_wait_restart(struct restart_block *restart) 1330static long futex_wait_restart(struct restart_block *restart)
1315{ 1331{
1316 u32 __user *uaddr = (u32 __user *)restart->arg0; 1332 u32 __user *uaddr = (u32 __user *)restart->futex.uaddr;
1317 u32 val = (u32)restart->arg1;
1318 ktime_t *abs_time = (ktime_t *)restart->arg2;
1319 struct rw_semaphore *fshared = NULL; 1333 struct rw_semaphore *fshared = NULL;
1334 ktime_t t;
1320 1335
1336 t.tv64 = restart->futex.time;
1321 restart->fn = do_no_restart_syscall; 1337 restart->fn = do_no_restart_syscall;
1322 if (restart->arg3 & ARG3_SHARED) 1338 if (restart->futex.flags & FLAGS_SHARED)
1323 fshared = &current->mm->mmap_sem; 1339 fshared = &current->mm->mmap_sem;
1324 return (long)futex_wait(uaddr, fshared, val, abs_time); 1340 return (long)futex_wait(uaddr, fshared, restart->futex.val, &t,
1341 restart->futex.bitset);
1325} 1342}
1326 1343
1327 1344
@@ -1510,9 +1527,37 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1510 * when we were on the way back before we locked the 1527 * when we were on the way back before we locked the
1511 * hash bucket. 1528 * hash bucket.
1512 */ 1529 */
1513 if (q.pi_state->owner == curr && 1530 if (q.pi_state->owner == curr) {
1514 rt_mutex_trylock(&q.pi_state->pi_mutex)) { 1531 /*
1515 ret = 0; 1532 * Try to get the rt_mutex now. This might
1533 * fail as some other task acquired the
1534 * rt_mutex after we removed ourself from the
1535 * rt_mutex waiters list.
1536 */
1537 if (rt_mutex_trylock(&q.pi_state->pi_mutex))
1538 ret = 0;
1539 else {
1540 /*
1541 * pi_state is incorrect, some other
1542 * task did a lock steal and we
1543 * returned due to timeout or signal
1544 * without taking the rt_mutex. Too
1545 * late. We can access the
1546 * rt_mutex_owner without locking, as
1547 * the other task is now blocked on
1548 * the hash bucket lock. Fix the state
1549 * up.
1550 */
1551 struct task_struct *owner;
1552 int res;
1553
1554 owner = rt_mutex_owner(&q.pi_state->pi_mutex);
1555 res = fixup_pi_state_owner(uaddr, &q, owner);
1556
1557 /* propagate -EFAULT, if the fixup failed */
1558 if (res)
1559 ret = res;
1560 }
1516 } else { 1561 } else {
1517 /* 1562 /*
1518 * Paranoia check. If we did not take the lock 1563 * Paranoia check. If we did not take the lock
@@ -1914,7 +1959,8 @@ retry:
1914 * PI futexes happens in exit_pi_state(): 1959 * PI futexes happens in exit_pi_state():
1915 */ 1960 */
1916 if (!pi && (uval & FUTEX_WAITERS)) 1961 if (!pi && (uval & FUTEX_WAITERS))
1917 futex_wake(uaddr, &curr->mm->mmap_sem, 1); 1962 futex_wake(uaddr, &curr->mm->mmap_sem, 1,
1963 FUTEX_BITSET_MATCH_ANY);
1918 } 1964 }
1919 return 0; 1965 return 0;
1920} 1966}
@@ -2014,10 +2060,14 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
2014 2060
2015 switch (cmd) { 2061 switch (cmd) {
2016 case FUTEX_WAIT: 2062 case FUTEX_WAIT:
2017 ret = futex_wait(uaddr, fshared, val, timeout); 2063 val3 = FUTEX_BITSET_MATCH_ANY;
2064 case FUTEX_WAIT_BITSET:
2065 ret = futex_wait(uaddr, fshared, val, timeout, val3);
2018 break; 2066 break;
2019 case FUTEX_WAKE: 2067 case FUTEX_WAKE:
2020 ret = futex_wake(uaddr, fshared, val); 2068 val3 = FUTEX_BITSET_MATCH_ANY;
2069 case FUTEX_WAKE_BITSET:
2070 ret = futex_wake(uaddr, fshared, val, val3);
2021 break; 2071 break;
2022 case FUTEX_FD: 2072 case FUTEX_FD:
2023 /* non-zero val means F_SETOWN(getpid()) & F_SETSIG(val) */ 2073 /* non-zero val means F_SETOWN(getpid()) & F_SETSIG(val) */
@@ -2057,7 +2107,8 @@ asmlinkage long sys_futex(u32 __user *uaddr, int op, u32 val,
2057 u32 val2 = 0; 2107 u32 val2 = 0;
2058 int cmd = op & FUTEX_CMD_MASK; 2108 int cmd = op & FUTEX_CMD_MASK;
2059 2109
2060 if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI)) { 2110 if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI ||
2111 cmd == FUTEX_WAIT_BITSET)) {
2061 if (copy_from_user(&ts, utime, sizeof(ts)) != 0) 2112 if (copy_from_user(&ts, utime, sizeof(ts)) != 0)
2062 return -EFAULT; 2113 return -EFAULT;
2063 if (!timespec_valid(&ts)) 2114 if (!timespec_valid(&ts))
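In do_futex() above, plain FUTEX_WAIT and FUTEX_WAKE simply preload val3 with FUTEX_BITSET_MATCH_ANY and fall through to the new *_BITSET cases, so the old ops become the "match every waiter" special case of the new ones. A minimal userspace sketch of the two call shapes (not part of the patch; it assumes a <linux/futex.h> that defines FUTEX_WAIT_BITSET and FUTEX_BITSET_MATCH_ANY, i.e. a kernel with this change):

    #include <errno.h>
    #include <linux/futex.h>
    #include <stdint.h>
    #include <stdio.h>
    #include <sys/syscall.h>
    #include <time.h>
    #include <unistd.h>

    static long futex(uint32_t *uaddr, int op, uint32_t val,
                      const struct timespec *timeout, uint32_t val3)
    {
        /* uaddr2 (second-to-last argument) is unused by the wait ops. */
        return syscall(SYS_futex, uaddr, op, val, timeout, NULL, val3);
    }

    int main(void)
    {
        uint32_t word = 1;

        /* Expected value 0 != 1, so both calls return -1/EWOULDBLOCK
         * immediately; the point is only the argument shape. */
        long a = futex(&word, FUTEX_WAIT, 0, NULL, 0);
        printf("FUTEX_WAIT              -> %ld, errno %d\n", a, errno);

        long b = futex(&word, FUTEX_WAIT_BITSET, 0, NULL,
                       FUTEX_BITSET_MATCH_ANY);
        printf("FUTEX_WAIT_BITSET (ANY) -> %ld, errno %d\n", b, errno);
        return 0;
    }
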
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index 00b572666cc7..133d558db452 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -30,6 +30,15 @@ fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry,
30 return 0; 30 return 0;
31} 31}
32 32
33static void __user *futex_uaddr(struct robust_list *entry,
34 compat_long_t futex_offset)
35{
36 compat_uptr_t base = ptr_to_compat(entry);
37 void __user *uaddr = compat_ptr(base + futex_offset);
38
39 return uaddr;
40}
41
33/* 42/*
34 * Walk curr->robust_list (very carefully, it's a userspace list!) 43 * Walk curr->robust_list (very carefully, it's a userspace list!)
35 * and mark any locks found there dead, and notify any waiters. 44 * and mark any locks found there dead, and notify any waiters.
@@ -76,11 +85,12 @@ void compat_exit_robust_list(struct task_struct *curr)
76 * A pending lock might already be on the list, so 85 * A pending lock might already be on the list, so
77 * dont process it twice: 86 * dont process it twice:
78 */ 87 */
79 if (entry != pending) 88 if (entry != pending) {
80 if (handle_futex_death((void __user *)entry + futex_offset, 89 void __user *uaddr = futex_uaddr(entry, futex_offset);
81 curr, pi))
82 return;
83 90
91 if (handle_futex_death(uaddr, curr, pi))
92 return;
93 }
84 if (rc) 94 if (rc)
85 return; 95 return;
86 uentry = next_uentry; 96 uentry = next_uentry;
@@ -94,9 +104,11 @@ void compat_exit_robust_list(struct task_struct *curr)
94 104
95 cond_resched(); 105 cond_resched();
96 } 106 }
97 if (pending) 107 if (pending) {
98 handle_futex_death((void __user *)pending + futex_offset, 108 void __user *uaddr = futex_uaddr(pending, futex_offset);
99 curr, pip); 109
110 handle_futex_death(uaddr, curr, pip);
111 }
100} 112}
101 113
102asmlinkage long 114asmlinkage long
@@ -155,7 +167,8 @@ asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val,
155 int val2 = 0; 167 int val2 = 0;
156 int cmd = op & FUTEX_CMD_MASK; 168 int cmd = op & FUTEX_CMD_MASK;
157 169
158 if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI)) { 170 if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI ||
171 cmd == FUTEX_WAIT_BITSET)) {
159 if (get_compat_timespec(&ts, utime)) 172 if (get_compat_timespec(&ts, utime))
160 return -EFAULT; 173 return -EFAULT;
161 if (!timespec_valid(&ts)) 174 if (!timespec_valid(&ts))
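futex_uaddr() above does the offset arithmetic on the 32-bit compat value and only then widens the result with compat_ptr(), rather than adding a compat_long_t to an already-widened pointer. A small userspace sketch of why the width of that addition can matter (illustrative values only; whether a real robust-list offset ever hits the difference depends on the application):

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint32_t base   = 0x10;     /* a compat (32-bit) user pointer  */
        int32_t  offset = -0x20;    /* a negative compat_long_t offset */

        /* Add in 32 bits, then widen: what futex_uaddr() does. */
        uint64_t narrow_then_widen = (uint32_t)(base + (uint32_t)offset);

        /* Widen (sign-extend) first, then add in 64 bits. */
        uint64_t widen_then_add = (uint64_t)base + (uint64_t)(int64_t)offset;

        printf("32-bit add, then widen : 0x%llx\n",
               (unsigned long long)narrow_then_widen);
        printf("widen, then 64-bit add : 0x%llx\n",
               (unsigned long long)widen_then_add);
        return 0;
    }
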
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index b6d2ff7e37ee..1069998fe25f 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -325,6 +325,22 @@ unsigned long ktime_divns(const ktime_t kt, s64 div)
325} 325}
326#endif /* BITS_PER_LONG >= 64 */ 326#endif /* BITS_PER_LONG >= 64 */
327 327
328/*
329 * Check, whether the timer is on the callback pending list
330 */
331static inline int hrtimer_cb_pending(const struct hrtimer *timer)
332{
333 return timer->state & HRTIMER_STATE_PENDING;
334}
335
336/*
337 * Remove a timer from the callback pending list
338 */
339static inline void hrtimer_remove_cb_pending(struct hrtimer *timer)
340{
341 list_del_init(&timer->cb_entry);
342}
343
328/* High resolution timer related functions */ 344/* High resolution timer related functions */
329#ifdef CONFIG_HIGH_RES_TIMERS 345#ifdef CONFIG_HIGH_RES_TIMERS
330 346
@@ -494,29 +510,12 @@ void hres_timers_resume(void)
494} 510}
495 511
496/* 512/*
497 * Check, whether the timer is on the callback pending list
498 */
499static inline int hrtimer_cb_pending(const struct hrtimer *timer)
500{
501 return timer->state & HRTIMER_STATE_PENDING;
502}
503
504/*
505 * Remove a timer from the callback pending list
506 */
507static inline void hrtimer_remove_cb_pending(struct hrtimer *timer)
508{
509 list_del_init(&timer->cb_entry);
510}
511
512/*
513 * Initialize the high resolution related parts of cpu_base 513 * Initialize the high resolution related parts of cpu_base
514 */ 514 */
515static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) 515static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base)
516{ 516{
517 base->expires_next.tv64 = KTIME_MAX; 517 base->expires_next.tv64 = KTIME_MAX;
518 base->hres_active = 0; 518 base->hres_active = 0;
519 INIT_LIST_HEAD(&base->cb_pending);
520} 519}
521 520
522/* 521/*
@@ -524,7 +523,6 @@ static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base)
524 */ 523 */
525static inline void hrtimer_init_timer_hres(struct hrtimer *timer) 524static inline void hrtimer_init_timer_hres(struct hrtimer *timer)
526{ 525{
527 INIT_LIST_HEAD(&timer->cb_entry);
528} 526}
529 527
530/* 528/*
@@ -602,7 +600,7 @@ static int hrtimer_switch_to_hres(void)
602 /* "Retrigger" the interrupt to get things going */ 600 /* "Retrigger" the interrupt to get things going */
603 retrigger_next_event(NULL); 601 retrigger_next_event(NULL);
604 local_irq_restore(flags); 602 local_irq_restore(flags);
605 printk(KERN_INFO "Switched to high resolution mode on CPU %d\n", 603 printk(KERN_DEBUG "Switched to high resolution mode on CPU %d\n",
606 smp_processor_id()); 604 smp_processor_id());
607 return 1; 605 return 1;
608} 606}
@@ -618,10 +616,13 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
618{ 616{
619 return 0; 617 return 0;
620} 618}
621static inline int hrtimer_cb_pending(struct hrtimer *timer) { return 0; }
622static inline void hrtimer_remove_cb_pending(struct hrtimer *timer) { }
623static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { } 619static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { }
624static inline void hrtimer_init_timer_hres(struct hrtimer *timer) { } 620static inline void hrtimer_init_timer_hres(struct hrtimer *timer) { }
621static inline int hrtimer_reprogram(struct hrtimer *timer,
622 struct hrtimer_clock_base *base)
623{
624 return 0;
625}
625 626
626#endif /* CONFIG_HIGH_RES_TIMERS */ 627#endif /* CONFIG_HIGH_RES_TIMERS */
627 628
@@ -850,6 +851,14 @@ hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
850#ifdef CONFIG_TIME_LOW_RES 851#ifdef CONFIG_TIME_LOW_RES
851 tim = ktime_add(tim, base->resolution); 852 tim = ktime_add(tim, base->resolution);
852#endif 853#endif
854 /*
855 * Careful here: User space might have asked for a
856 * very long sleep, so the add above might result in a
857 * negative number, which enqueues the timer in front
858 * of the queue.
859 */
860 if (tim.tv64 < 0)
861 tim.tv64 = KTIME_MAX;
853 } 862 }
854 timer->expires = tim; 863 timer->expires = tim;
855 864
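The new guard saturates the expiry instead of letting a huge user-supplied relative sleep wrap negative once the low-resolution slack is added; a negative tv64 would sort the timer to the front of the queue and fire it immediately. An equivalent, overflow-safe way to write the same clamp in plain C (a sketch of the arithmetic only, not kernel code):

    #include <stdint.h>
    #include <stdio.h>

    /* Clamp expiry + slack to INT64_MAX (KTIME_MAX). Checking before the
     * add is the portable equivalent of the "tim.tv64 < 0 after the add"
     * test above; slack_ns is assumed non-negative. */
    static int64_t clamped_add(int64_t expiry_ns, int64_t slack_ns)
    {
        if (expiry_ns > INT64_MAX - slack_ns)
            return INT64_MAX;
        return expiry_ns + slack_ns;
    }

    int main(void)
    {
        printf("%lld\n", (long long)clamped_add(INT64_MAX - 5, 1000000));
        printf("%lld\n", (long long)clamped_add(1000000000, 1000000));
        return 0;
    }
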
@@ -993,6 +1002,7 @@ void hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
993 clock_id = CLOCK_MONOTONIC; 1002 clock_id = CLOCK_MONOTONIC;
994 1003
995 timer->base = &cpu_base->clock_base[clock_id]; 1004 timer->base = &cpu_base->clock_base[clock_id];
1005 INIT_LIST_HEAD(&timer->cb_entry);
996 hrtimer_init_timer_hres(timer); 1006 hrtimer_init_timer_hres(timer);
997 1007
998#ifdef CONFIG_TIMER_STATS 1008#ifdef CONFIG_TIMER_STATS
@@ -1022,6 +1032,85 @@ int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp)
1022} 1032}
1023EXPORT_SYMBOL_GPL(hrtimer_get_res); 1033EXPORT_SYMBOL_GPL(hrtimer_get_res);
1024 1034
1035static void run_hrtimer_pending(struct hrtimer_cpu_base *cpu_base)
1036{
1037 spin_lock_irq(&cpu_base->lock);
1038
1039 while (!list_empty(&cpu_base->cb_pending)) {
1040 enum hrtimer_restart (*fn)(struct hrtimer *);
1041 struct hrtimer *timer;
1042 int restart;
1043
1044 timer = list_entry(cpu_base->cb_pending.next,
1045 struct hrtimer, cb_entry);
1046
1047 timer_stats_account_hrtimer(timer);
1048
1049 fn = timer->function;
1050 __remove_hrtimer(timer, timer->base, HRTIMER_STATE_CALLBACK, 0);
1051 spin_unlock_irq(&cpu_base->lock);
1052
1053 restart = fn(timer);
1054
1055 spin_lock_irq(&cpu_base->lock);
1056
1057 timer->state &= ~HRTIMER_STATE_CALLBACK;
1058 if (restart == HRTIMER_RESTART) {
1059 BUG_ON(hrtimer_active(timer));
1060 /*
1061 * Enqueue the timer, allow reprogramming of the event
1062 * device
1063 */
1064 enqueue_hrtimer(timer, timer->base, 1);
1065 } else if (hrtimer_active(timer)) {
1066 /*
1067 * If the timer was rearmed on another CPU, reprogram
1068 * the event device.
1069 */
1070 if (timer->base->first == &timer->node)
1071 hrtimer_reprogram(timer, timer->base);
1072 }
1073 }
1074 spin_unlock_irq(&cpu_base->lock);
1075}
1076
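run_hrtimer_pending() above uses the classic drain pattern: pop one entry while holding cpu_base->lock, drop the lock for the callback (which may re-arm the timer), then retake it before touching the list again. A self-contained userspace sketch of that pattern with a pthread mutex (the names are illustrative, not kernel API):

    #include <pthread.h>
    #include <stdio.h>

    struct pending {
        struct pending *next;
        void (*fn)(struct pending *);
    };

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static struct pending *head;

    /* Pop under the lock, run the callback unlocked, relock before the
     * next list access: the same shape as run_hrtimer_pending(). */
    static void drain(void)
    {
        pthread_mutex_lock(&lock);
        while (head) {
            struct pending *p = head;

            head = p->next;
            pthread_mutex_unlock(&lock);

            p->fn(p);               /* may requeue entries via the lock */

            pthread_mutex_lock(&lock);
        }
        pthread_mutex_unlock(&lock);
    }

    static void hello(struct pending *p)
    {
        printf("callback %p ran\n", (void *)p);
    }

    int main(void)
    {
        struct pending a = { NULL, hello };
        struct pending b = { &a, hello };

        head = &b;
        drain();
        return 0;
    }
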
1077static void __run_hrtimer(struct hrtimer *timer)
1078{
1079 struct hrtimer_clock_base *base = timer->base;
1080 struct hrtimer_cpu_base *cpu_base = base->cpu_base;
1081 enum hrtimer_restart (*fn)(struct hrtimer *);
1082 int restart;
1083
1084 __remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0);
1085 timer_stats_account_hrtimer(timer);
1086
1087 fn = timer->function;
1088 if (timer->cb_mode == HRTIMER_CB_IRQSAFE_NO_SOFTIRQ) {
1089 /*
1090 * Used for scheduler timers, avoid lock inversion with
1091 * rq->lock and tasklist_lock.
1092 *
1093 * These timers are required to deal with enqueue expiry
1094 * themselves and are not allowed to migrate.
1095 */
1096 spin_unlock(&cpu_base->lock);
1097 restart = fn(timer);
1098 spin_lock(&cpu_base->lock);
1099 } else
1100 restart = fn(timer);
1101
1102 /*
1103 * Note: We clear the CALLBACK bit after enqueue_hrtimer to avoid
1104 * reprogramming of the event hardware. This happens at the end of this
1105 * function anyway.
1106 */
1107 if (restart != HRTIMER_NORESTART) {
1108 BUG_ON(timer->state != HRTIMER_STATE_CALLBACK);
1109 enqueue_hrtimer(timer, base, 0);
1110 }
1111 timer->state &= ~HRTIMER_STATE_CALLBACK;
1112}
1113
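__run_hrtimer() treats HRTIMER_CB_IRQSAFE_NO_SOFTIRQ callbacks (the scheduler's timers) specially: the base lock is dropped around the callback so it can take rq->lock or tasklist_lock without inverting lock order, while every other callback runs with the lock held. A minimal sketch of the two call shapes using a pthread mutex (illustrative only):

    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t base_lock = PTHREAD_MUTEX_INITIALIZER;

    /* drop_lock mirrors the HRTIMER_CB_IRQSAFE_NO_SOFTIRQ case: the
     * callback may take other locks, so the base lock is released first. */
    static int run_callback(int (*fn)(void *), void *arg, int drop_lock)
    {
        int restart;

        if (drop_lock) {
            pthread_mutex_unlock(&base_lock);
            restart = fn(arg);
            pthread_mutex_lock(&base_lock);
        } else {
            restart = fn(arg);      /* must not take conflicting locks */
        }
        return restart;
    }

    static int cb(void *arg)
    {
        printf("callback ran (%s the base lock)\n", (const char *)arg);
        return 0;
    }

    int main(void)
    {
        pthread_mutex_lock(&base_lock);
        run_callback(cb, "under", 0);
        run_callback(cb, "without", 1);
        pthread_mutex_unlock(&base_lock);
        return 0;
    }
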
1025#ifdef CONFIG_HIGH_RES_TIMERS 1114#ifdef CONFIG_HIGH_RES_TIMERS
1026 1115
1027/* 1116/*
@@ -1079,21 +1168,7 @@ void hrtimer_interrupt(struct clock_event_device *dev)
1079 continue; 1168 continue;
1080 } 1169 }
1081 1170
1082 __remove_hrtimer(timer, base, 1171 __run_hrtimer(timer);
1083 HRTIMER_STATE_CALLBACK, 0);
1084 timer_stats_account_hrtimer(timer);
1085
1086 /*
1087 * Note: We clear the CALLBACK bit after
1088 * enqueue_hrtimer to avoid reprogramming of
1089 * the event hardware. This happens at the end
1090 * of this function anyway.
1091 */
1092 if (timer->function(timer) != HRTIMER_NORESTART) {
1093 BUG_ON(timer->state != HRTIMER_STATE_CALLBACK);
1094 enqueue_hrtimer(timer, base, 0);
1095 }
1096 timer->state &= ~HRTIMER_STATE_CALLBACK;
1097 } 1172 }
1098 spin_unlock(&cpu_base->lock); 1173 spin_unlock(&cpu_base->lock);
1099 base++; 1174 base++;
@@ -1114,52 +1189,41 @@ void hrtimer_interrupt(struct clock_event_device *dev)
1114 1189
1115static void run_hrtimer_softirq(struct softirq_action *h) 1190static void run_hrtimer_softirq(struct softirq_action *h)
1116{ 1191{
1117 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); 1192 run_hrtimer_pending(&__get_cpu_var(hrtimer_bases));
1118 1193}
1119 spin_lock_irq(&cpu_base->lock);
1120
1121 while (!list_empty(&cpu_base->cb_pending)) {
1122 enum hrtimer_restart (*fn)(struct hrtimer *);
1123 struct hrtimer *timer;
1124 int restart;
1125
1126 timer = list_entry(cpu_base->cb_pending.next,
1127 struct hrtimer, cb_entry);
1128 1194
1129 timer_stats_account_hrtimer(timer); 1195#endif /* CONFIG_HIGH_RES_TIMERS */
1130 1196
1131 fn = timer->function; 1197/*
1132 __remove_hrtimer(timer, timer->base, HRTIMER_STATE_CALLBACK, 0); 1198 * Called from timer softirq every jiffy, expire hrtimers:
1133 spin_unlock_irq(&cpu_base->lock); 1199 *
1200 * For HRT its the fall back code to run the softirq in the timer
1201 * softirq context in case the hrtimer initialization failed or has
1202 * not been done yet.
1203 */
1204void hrtimer_run_pending(void)
1205{
1206 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
1134 1207
1135 restart = fn(timer); 1208 if (hrtimer_hres_active())
1209 return;
1136 1210
1137 spin_lock_irq(&cpu_base->lock); 1211 /*
1212 * This _is_ ugly: We have to check in the softirq context,
1213 * whether we can switch to highres and / or nohz mode. The
1214 * clocksource switch happens in the timer interrupt with
1215 * xtime_lock held. Notification from there only sets the
1216 * check bit in the tick_oneshot code, otherwise we might
1217 * deadlock vs. xtime_lock.
1218 */
1219 if (tick_check_oneshot_change(!hrtimer_is_hres_enabled()))
1220 hrtimer_switch_to_hres();
1138 1221
1139 timer->state &= ~HRTIMER_STATE_CALLBACK; 1222 run_hrtimer_pending(cpu_base);
1140 if (restart == HRTIMER_RESTART) {
1141 BUG_ON(hrtimer_active(timer));
1142 /*
1143 * Enqueue the timer, allow reprogramming of the event
1144 * device
1145 */
1146 enqueue_hrtimer(timer, timer->base, 1);
1147 } else if (hrtimer_active(timer)) {
1148 /*
1149 * If the timer was rearmed on another CPU, reprogram
1150 * the event device.
1151 */
1152 if (timer->base->first == &timer->node)
1153 hrtimer_reprogram(timer, timer->base);
1154 }
1155 }
1156 spin_unlock_irq(&cpu_base->lock);
1157} 1223}
1158 1224
1159#endif /* CONFIG_HIGH_RES_TIMERS */
1160
1161/* 1225/*
1162 * Expire the per base hrtimer-queue: 1226 * Called from hardirq context every jiffy
1163 */ 1227 */
1164static inline void run_hrtimer_queue(struct hrtimer_cpu_base *cpu_base, 1228static inline void run_hrtimer_queue(struct hrtimer_cpu_base *cpu_base,
1165 int index) 1229 int index)
@@ -1173,46 +1237,27 @@ static inline void run_hrtimer_queue(struct hrtimer_cpu_base *cpu_base,
1173 if (base->get_softirq_time) 1237 if (base->get_softirq_time)
1174 base->softirq_time = base->get_softirq_time(); 1238 base->softirq_time = base->get_softirq_time();
1175 1239
1176 spin_lock_irq(&cpu_base->lock); 1240 spin_lock(&cpu_base->lock);
1177 1241
1178 while ((node = base->first)) { 1242 while ((node = base->first)) {
1179 struct hrtimer *timer; 1243 struct hrtimer *timer;
1180 enum hrtimer_restart (*fn)(struct hrtimer *);
1181 int restart;
1182 1244
1183 timer = rb_entry(node, struct hrtimer, node); 1245 timer = rb_entry(node, struct hrtimer, node);
1184 if (base->softirq_time.tv64 <= timer->expires.tv64) 1246 if (base->softirq_time.tv64 <= timer->expires.tv64)
1185 break; 1247 break;
1186 1248
1187#ifdef CONFIG_HIGH_RES_TIMERS 1249 if (timer->cb_mode == HRTIMER_CB_SOFTIRQ) {
1188 WARN_ON_ONCE(timer->cb_mode == HRTIMER_CB_IRQSAFE_NO_SOFTIRQ); 1250 __remove_hrtimer(timer, base, HRTIMER_STATE_PENDING, 0);
1189#endif 1251 list_add_tail(&timer->cb_entry,
1190 timer_stats_account_hrtimer(timer); 1252 &base->cpu_base->cb_pending);
1191 1253 continue;
1192 fn = timer->function;
1193 __remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0);
1194 spin_unlock_irq(&cpu_base->lock);
1195
1196 restart = fn(timer);
1197
1198 spin_lock_irq(&cpu_base->lock);
1199
1200 timer->state &= ~HRTIMER_STATE_CALLBACK;
1201 if (restart != HRTIMER_NORESTART) {
1202 BUG_ON(hrtimer_active(timer));
1203 enqueue_hrtimer(timer, base, 0);
1204 } 1254 }
1255
1256 __run_hrtimer(timer);
1205 } 1257 }
1206 spin_unlock_irq(&cpu_base->lock); 1258 spin_unlock(&cpu_base->lock);
1207} 1259}
1208 1260
1209/*
1210 * Called from timer softirq every jiffy, expire hrtimers:
1211 *
1212 * For HRT its the fall back code to run the softirq in the timer
1213 * softirq context in case the hrtimer initialization failed or has
1214 * not been done yet.
1215 */
1216void hrtimer_run_queues(void) 1261void hrtimer_run_queues(void)
1217{ 1262{
1218 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); 1263 struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
@@ -1221,18 +1266,6 @@ void hrtimer_run_queues(void)
1221 if (hrtimer_hres_active()) 1266 if (hrtimer_hres_active())
1222 return; 1267 return;
1223 1268
1224 /*
1225 * This _is_ ugly: We have to check in the softirq context,
1226 * whether we can switch to highres and / or nohz mode. The
1227 * clocksource switch happens in the timer interrupt with
1228 * xtime_lock held. Notification from there only sets the
1229 * check bit in the tick_oneshot code, otherwise we might
1230 * deadlock vs. xtime_lock.
1231 */
1232 if (tick_check_oneshot_change(!hrtimer_is_hres_enabled()))
1233 if (hrtimer_switch_to_hres())
1234 return;
1235
1236 hrtimer_get_softirq_time(cpu_base); 1269 hrtimer_get_softirq_time(cpu_base);
1237 1270
1238 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) 1271 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++)
@@ -1260,7 +1293,7 @@ void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task)
1260 sl->timer.function = hrtimer_wakeup; 1293 sl->timer.function = hrtimer_wakeup;
1261 sl->task = task; 1294 sl->task = task;
1262#ifdef CONFIG_HIGH_RES_TIMERS 1295#ifdef CONFIG_HIGH_RES_TIMERS
1263 sl->timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_RESTART; 1296 sl->timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
1264#endif 1297#endif
1265} 1298}
1266 1299
@@ -1271,6 +1304,8 @@ static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mod
1271 do { 1304 do {
1272 set_current_state(TASK_INTERRUPTIBLE); 1305 set_current_state(TASK_INTERRUPTIBLE);
1273 hrtimer_start(&t->timer, t->timer.expires, mode); 1306 hrtimer_start(&t->timer, t->timer.expires, mode);
1307 if (!hrtimer_active(&t->timer))
1308 t->task = NULL;
1274 1309
1275 if (likely(t->task)) 1310 if (likely(t->task))
1276 schedule(); 1311 schedule();
@@ -1280,6 +1315,8 @@ static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mod
1280 1315
1281 } while (t->task && !signal_pending(current)); 1316 } while (t->task && !signal_pending(current));
1282 1317
1318 __set_current_state(TASK_RUNNING);
1319
1283 return t->task == NULL; 1320 return t->task == NULL;
1284} 1321}
1285 1322
@@ -1370,7 +1407,7 @@ sys_nanosleep(struct timespec __user *rqtp, struct timespec __user *rmtp)
1370/* 1407/*
1371 * Functions related to boot-time initialization: 1408 * Functions related to boot-time initialization:
1372 */ 1409 */
1373static void __devinit init_hrtimers_cpu(int cpu) 1410static void __cpuinit init_hrtimers_cpu(int cpu)
1374{ 1411{
1375 struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu); 1412 struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu);
1376 int i; 1413 int i;
@@ -1381,6 +1418,7 @@ static void __devinit init_hrtimers_cpu(int cpu)
1381 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) 1418 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++)
1382 cpu_base->clock_base[i].cpu_base = cpu_base; 1419 cpu_base->clock_base[i].cpu_base = cpu_base;
1383 1420
1421 INIT_LIST_HEAD(&cpu_base->cb_pending);
1384 hrtimer_init_hres(cpu_base); 1422 hrtimer_init_hres(cpu_base);
1385} 1423}
1386 1424
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 9b5dff6b3f6a..44019ce30a14 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -297,18 +297,13 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc)
297 297
298 if (unlikely(desc->status & IRQ_INPROGRESS)) 298 if (unlikely(desc->status & IRQ_INPROGRESS))
299 goto out_unlock; 299 goto out_unlock;
300 desc->status &= ~(IRQ_REPLAY | IRQ_WAITING);
300 kstat_cpu(cpu).irqs[irq]++; 301 kstat_cpu(cpu).irqs[irq]++;
301 302
302 action = desc->action; 303 action = desc->action;
303 if (unlikely(!action || (desc->status & IRQ_DISABLED))) { 304 if (unlikely(!action || (desc->status & IRQ_DISABLED)))
304 if (desc->chip->mask)
305 desc->chip->mask(irq);
306 desc->status &= ~(IRQ_REPLAY | IRQ_WAITING);
307 desc->status |= IRQ_PENDING;
308 goto out_unlock; 305 goto out_unlock;
309 }
310 306
311 desc->status &= ~(IRQ_REPLAY | IRQ_WAITING | IRQ_PENDING);
312 desc->status |= IRQ_INPROGRESS; 307 desc->status |= IRQ_INPROGRESS;
313 spin_unlock(&desc->lock); 308 spin_unlock(&desc->lock);
314 309
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index e391cbb1f566..dc335ad27525 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -178,9 +178,11 @@ fastcall unsigned int __do_IRQ(unsigned int irq)
178 */ 178 */
179 if (desc->chip->ack) 179 if (desc->chip->ack)
180 desc->chip->ack(irq); 180 desc->chip->ack(irq);
181 action_ret = handle_IRQ_event(irq, desc->action); 181 if (likely(!(desc->status & IRQ_DISABLED))) {
182 if (!noirqdebug) 182 action_ret = handle_IRQ_event(irq, desc->action);
183 note_interrupt(irq, desc, action_ret); 183 if (!noirqdebug)
184 note_interrupt(irq, desc, action_ret);
185 }
184 desc->chip->end(irq); 186 desc->chip->end(irq);
185 return 1; 187 return 1;
186 } 188 }
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 1f314221d534..438a01464287 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -479,6 +479,9 @@ void free_irq(unsigned int irq, void *dev_id)
479 return; 479 return;
480 } 480 }
481 printk(KERN_ERR "Trying to free already-free IRQ %d\n", irq); 481 printk(KERN_ERR "Trying to free already-free IRQ %d\n", irq);
482#ifdef CONFIG_DEBUG_SHIRQ
483 dump_stack();
484#endif
482 spin_unlock_irqrestore(&desc->lock, flags); 485 spin_unlock_irqrestore(&desc->lock, flags);
483 return; 486 return;
484 } 487 }
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 50b81b98046a..c2f2ccb0549a 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -75,6 +75,18 @@ static int irq_affinity_write_proc(struct file *file, const char __user *buffer,
75 75
76#endif 76#endif
77 77
78static int irq_spurious_read(char *page, char **start, off_t off,
79 int count, int *eof, void *data)
80{
81 struct irq_desc *d = &irq_desc[(long) data];
82 return sprintf(page, "count %u\n"
83 "unhandled %u\n"
84 "last_unhandled %u ms\n",
85 d->irq_count,
86 d->irqs_unhandled,
87 jiffies_to_msecs(d->last_unhandled));
88}
89
78#define MAX_NAMELEN 128 90#define MAX_NAMELEN 128
79 91
80static int name_unique(unsigned int irq, struct irqaction *new_action) 92static int name_unique(unsigned int irq, struct irqaction *new_action)
@@ -118,6 +130,7 @@ void register_handler_proc(unsigned int irq, struct irqaction *action)
118void register_irq_proc(unsigned int irq) 130void register_irq_proc(unsigned int irq)
119{ 131{
120 char name [MAX_NAMELEN]; 132 char name [MAX_NAMELEN];
133 struct proc_dir_entry *entry;
121 134
122 if (!root_irq_dir || 135 if (!root_irq_dir ||
123 (irq_desc[irq].chip == &no_irq_chip) || 136 (irq_desc[irq].chip == &no_irq_chip) ||
@@ -132,8 +145,6 @@ void register_irq_proc(unsigned int irq)
132 145
133#ifdef CONFIG_SMP 146#ifdef CONFIG_SMP
134 { 147 {
135 struct proc_dir_entry *entry;
136
137 /* create /proc/irq/<irq>/smp_affinity */ 148 /* create /proc/irq/<irq>/smp_affinity */
138 entry = create_proc_entry("smp_affinity", 0600, irq_desc[irq].dir); 149 entry = create_proc_entry("smp_affinity", 0600, irq_desc[irq].dir);
139 150
@@ -144,6 +155,12 @@ void register_irq_proc(unsigned int irq)
144 } 155 }
145 } 156 }
146#endif 157#endif
158
159 entry = create_proc_entry("spurious", 0444, irq_desc[irq].dir);
160 if (entry) {
161 entry->data = (void *)(long)irq;
162 entry->read_proc = irq_spurious_read;
163 }
147} 164}
148 165
149#undef MAX_NAMELEN 166#undef MAX_NAMELEN
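The new proc entry makes the per-IRQ spurious counters visible at /proc/irq/<n>/spurious. A trivial userspace reader (assumes a kernel with this change and that the chosen IRQ exists):

    #include <stdio.h>

    int main(void)
    {
        /* Prints the "count", "unhandled" and "last_unhandled ... ms"
         * lines produced by irq_spurious_read() above; IRQ 0 is just
         * an example. */
        FILE *f = fopen("/proc/irq/0/spurious", "r");
        char line[128];

        if (!f) {
            perror("/proc/irq/0/spurious");
            return 1;
        }
        while (fgets(line, sizeof(line), f))
            fputs(line, stdout);
        fclose(f);
        return 0;
    }
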
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 32b161972fad..a6b2bc831dd0 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -10,6 +10,7 @@
10#include <linux/module.h> 10#include <linux/module.h>
11#include <linux/kallsyms.h> 11#include <linux/kallsyms.h>
12#include <linux/interrupt.h> 12#include <linux/interrupt.h>
13#include <linux/moduleparam.h>
13 14
14static int irqfixup __read_mostly; 15static int irqfixup __read_mostly;
15 16
@@ -225,6 +226,8 @@ int noirqdebug_setup(char *str)
225} 226}
226 227
227__setup("noirqdebug", noirqdebug_setup); 228__setup("noirqdebug", noirqdebug_setup);
229module_param(noirqdebug, bool, 0644);
230MODULE_PARM_DESC(noirqdebug, "Disable irq lockup detection when true");
228 231
229static int __init irqfixup_setup(char *str) 232static int __init irqfixup_setup(char *str)
230{ 233{
@@ -236,6 +239,8 @@ static int __init irqfixup_setup(char *str)
236} 239}
237 240
238__setup("irqfixup", irqfixup_setup); 241__setup("irqfixup", irqfixup_setup);
242module_param(irqfixup, int, 0644);
243MODULE_PARM_DESC("irqfixup", "0: No fixup, 1: irqfixup mode 2: irqpoll mode");
239 244
240static int __init irqpoll_setup(char *str) 245static int __init irqpoll_setup(char *str)
241{ 246{
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 474219a41929..7dadc71ce516 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -32,9 +32,14 @@
32 32
33/* These will be re-linked against their real values during the second link stage */ 33/* These will be re-linked against their real values during the second link stage */
34extern const unsigned long kallsyms_addresses[] __attribute__((weak)); 34extern const unsigned long kallsyms_addresses[] __attribute__((weak));
35extern const unsigned long kallsyms_num_syms __attribute__((weak));
36extern const u8 kallsyms_names[] __attribute__((weak)); 35extern const u8 kallsyms_names[] __attribute__((weak));
37 36
37/* tell the compiler that the count isn't in the small data section if the arch
38 * has one (eg: FRV)
39 */
40extern const unsigned long kallsyms_num_syms
41__attribute__((weak, section(".rodata")));
42
38extern const u8 kallsyms_token_table[] __attribute__((weak)); 43extern const u8 kallsyms_token_table[] __attribute__((weak));
39extern const u16 kallsyms_token_index[] __attribute__((weak)); 44extern const u16 kallsyms_token_index[] __attribute__((weak));
40 45
@@ -228,10 +233,11 @@ static unsigned long get_symbol_pos(unsigned long addr,
228int kallsyms_lookup_size_offset(unsigned long addr, unsigned long *symbolsize, 233int kallsyms_lookup_size_offset(unsigned long addr, unsigned long *symbolsize,
229 unsigned long *offset) 234 unsigned long *offset)
230{ 235{
236 char namebuf[KSYM_NAME_LEN];
231 if (is_ksym_addr(addr)) 237 if (is_ksym_addr(addr))
232 return !!get_symbol_pos(addr, symbolsize, offset); 238 return !!get_symbol_pos(addr, symbolsize, offset);
233 239
234 return !!module_address_lookup(addr, symbolsize, offset, NULL); 240 return !!module_address_lookup(addr, symbolsize, offset, NULL, namebuf);
235} 241}
236 242
237/* 243/*
@@ -246,8 +252,6 @@ const char *kallsyms_lookup(unsigned long addr,
246 unsigned long *offset, 252 unsigned long *offset,
247 char **modname, char *namebuf) 253 char **modname, char *namebuf)
248{ 254{
249 const char *msym;
250
251 namebuf[KSYM_NAME_LEN - 1] = 0; 255 namebuf[KSYM_NAME_LEN - 1] = 0;
252 namebuf[0] = 0; 256 namebuf[0] = 0;
253 257
@@ -263,10 +267,8 @@ const char *kallsyms_lookup(unsigned long addr,
263 } 267 }
264 268
265 /* see if it's in a module */ 269 /* see if it's in a module */
266 msym = module_address_lookup(addr, symbolsize, offset, modname); 270 return module_address_lookup(addr, symbolsize, offset, modname,
267 if (msym) 271 namebuf);
268 return strncpy(namebuf, msym, KSYM_NAME_LEN - 1);
269
270 return NULL; 272 return NULL;
271} 273}
272 274
diff --git a/kernel/kexec.c b/kernel/kexec.c
index aa74a1ef2da8..9a26eec9eb04 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -1404,6 +1404,7 @@ static int __init crash_save_vmcoreinfo_init(void)
1404 VMCOREINFO_OFFSET(list_head, next); 1404 VMCOREINFO_OFFSET(list_head, next);
1405 VMCOREINFO_OFFSET(list_head, prev); 1405 VMCOREINFO_OFFSET(list_head, prev);
1406 VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER); 1406 VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER);
1407 VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES);
1407 VMCOREINFO_NUMBER(NR_FREE_PAGES); 1408 VMCOREINFO_NUMBER(NR_FREE_PAGES);
1408 1409
1409 arch_crash_save_vmcoreinfo(); 1410 arch_crash_save_vmcoreinfo();
diff --git a/kernel/kmod.c b/kernel/kmod.c
index c6a4f8aebeba..bb7df2a28bd7 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -451,13 +451,11 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info,
451 enum umh_wait wait) 451 enum umh_wait wait)
452{ 452{
453 DECLARE_COMPLETION_ONSTACK(done); 453 DECLARE_COMPLETION_ONSTACK(done);
454 int retval; 454 int retval = 0;
455 455
456 helper_lock(); 456 helper_lock();
457 if (sub_info->path[0] == '\0') { 457 if (sub_info->path[0] == '\0')
458 retval = 0;
459 goto out; 458 goto out;
460 }
461 459
462 if (!khelper_wq || usermodehelper_disabled) { 460 if (!khelper_wq || usermodehelper_disabled) {
463 retval = -EBUSY; 461 retval = -EBUSY;
@@ -468,13 +466,14 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info,
468 sub_info->wait = wait; 466 sub_info->wait = wait;
469 467
470 queue_work(khelper_wq, &sub_info->work); 468 queue_work(khelper_wq, &sub_info->work);
471 if (wait == UMH_NO_WAIT) /* task has freed sub_info */ 469 if (wait == UMH_NO_WAIT) /* task has freed sub_info */
472 return 0; 470 goto unlock;
473 wait_for_completion(&done); 471 wait_for_completion(&done);
474 retval = sub_info->retval; 472 retval = sub_info->retval;
475 473
476 out: 474out:
477 call_usermodehelper_freeinfo(sub_info); 475 call_usermodehelper_freeinfo(sub_info);
476unlock:
478 helper_unlock(); 477 helper_unlock();
479 return retval; 478 return retval;
480} 479}
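The rework above adds a second exit label so the UMH_NO_WAIT path, which hands sub_info off to the worker, can skip the free but still drop helper_lock. The underlying pattern, layered cleanup labels where each label undoes one more step, in a standalone sketch (all names are illustrative):

    #include <stdio.h>
    #include <stdlib.h>

    /* "out" frees the request and unlocks, "unlock" only unlocks; the
     * hand-off path must skip the free because ownership moved on. */
    static int do_request(int hand_off, int fail)
    {
        int retval = 0;
        void *info = malloc(16);

        if (!info)
            return -1;

        /* lock(); */
        if (fail) {
            retval = -1;
            goto out;       /* error: free and unlock */
        }
        if (hand_off)
            goto unlock;    /* info deliberately not freed here: it
                             * mirrors the worker now owning it */

        /* ... queue the work and wait for completion ... */
    out:
        free(info);
    unlock:
        /* unlock(); */
        return retval;
    }

    int main(void)
    {
        printf("%d %d %d\n", do_request(0, 0), do_request(1, 0),
               do_request(0, 1));
        return 0;
    }
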
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index e3a5d817ac9b..d0493eafea3e 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -824,6 +824,8 @@ static int __init init_kprobes(void)
824 if (!err) 824 if (!err)
825 err = register_die_notifier(&kprobe_exceptions_nb); 825 err = register_die_notifier(&kprobe_exceptions_nb);
826 826
827 if (!err)
828 init_test_probes();
827 return err; 829 return err;
828} 830}
829 831
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 65daa5373ca6..e53bc30e9ba5 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -17,30 +17,34 @@
17#include <linux/sched.h> 17#include <linux/sched.h>
18 18
19#define KERNEL_ATTR_RO(_name) \ 19#define KERNEL_ATTR_RO(_name) \
20static struct subsys_attribute _name##_attr = __ATTR_RO(_name) 20static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
21 21
22#define KERNEL_ATTR_RW(_name) \ 22#define KERNEL_ATTR_RW(_name) \
23static struct subsys_attribute _name##_attr = \ 23static struct kobj_attribute _name##_attr = \
24 __ATTR(_name, 0644, _name##_show, _name##_store) 24 __ATTR(_name, 0644, _name##_show, _name##_store)
25 25
26#if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET) 26#if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET)
27/* current uevent sequence number */ 27/* current uevent sequence number */
28static ssize_t uevent_seqnum_show(struct kset *kset, char *page) 28static ssize_t uevent_seqnum_show(struct kobject *kobj,
29 struct kobj_attribute *attr, char *buf)
29{ 30{
30 return sprintf(page, "%llu\n", (unsigned long long)uevent_seqnum); 31 return sprintf(buf, "%llu\n", (unsigned long long)uevent_seqnum);
31} 32}
32KERNEL_ATTR_RO(uevent_seqnum); 33KERNEL_ATTR_RO(uevent_seqnum);
33 34
34/* uevent helper program, used during early boo */ 35/* uevent helper program, used during early boo */
35static ssize_t uevent_helper_show(struct kset *kset, char *page) 36static ssize_t uevent_helper_show(struct kobject *kobj,
37 struct kobj_attribute *attr, char *buf)
36{ 38{
37 return sprintf(page, "%s\n", uevent_helper); 39 return sprintf(buf, "%s\n", uevent_helper);
38} 40}
39static ssize_t uevent_helper_store(struct kset *kset, const char *page, size_t count) 41static ssize_t uevent_helper_store(struct kobject *kobj,
42 struct kobj_attribute *attr,
43 const char *buf, size_t count)
40{ 44{
41 if (count+1 > UEVENT_HELPER_PATH_LEN) 45 if (count+1 > UEVENT_HELPER_PATH_LEN)
42 return -ENOENT; 46 return -ENOENT;
43 memcpy(uevent_helper, page, count); 47 memcpy(uevent_helper, buf, count);
44 uevent_helper[count] = '\0'; 48 uevent_helper[count] = '\0';
45 if (count && uevent_helper[count-1] == '\n') 49 if (count && uevent_helper[count-1] == '\n')
46 uevent_helper[count-1] = '\0'; 50 uevent_helper[count-1] = '\0';
@@ -50,21 +54,24 @@ KERNEL_ATTR_RW(uevent_helper);
50#endif 54#endif
51 55
52#ifdef CONFIG_KEXEC 56#ifdef CONFIG_KEXEC
53static ssize_t kexec_loaded_show(struct kset *kset, char *page) 57static ssize_t kexec_loaded_show(struct kobject *kobj,
58 struct kobj_attribute *attr, char *buf)
54{ 59{
55 return sprintf(page, "%d\n", !!kexec_image); 60 return sprintf(buf, "%d\n", !!kexec_image);
56} 61}
57KERNEL_ATTR_RO(kexec_loaded); 62KERNEL_ATTR_RO(kexec_loaded);
58 63
59static ssize_t kexec_crash_loaded_show(struct kset *kset, char *page) 64static ssize_t kexec_crash_loaded_show(struct kobject *kobj,
65 struct kobj_attribute *attr, char *buf)
60{ 66{
61 return sprintf(page, "%d\n", !!kexec_crash_image); 67 return sprintf(buf, "%d\n", !!kexec_crash_image);
62} 68}
63KERNEL_ATTR_RO(kexec_crash_loaded); 69KERNEL_ATTR_RO(kexec_crash_loaded);
64 70
65static ssize_t vmcoreinfo_show(struct kset *kset, char *page) 71static ssize_t vmcoreinfo_show(struct kobject *kobj,
72 struct kobj_attribute *attr, char *buf)
66{ 73{
67 return sprintf(page, "%lx %x\n", 74 return sprintf(buf, "%lx %x\n",
68 paddr_vmcoreinfo_note(), 75 paddr_vmcoreinfo_note(),
69 (unsigned int)vmcoreinfo_max_size); 76 (unsigned int)vmcoreinfo_max_size);
70} 77}
@@ -94,8 +101,8 @@ static struct bin_attribute notes_attr = {
94 .read = &notes_read, 101 .read = &notes_read,
95}; 102};
96 103
97decl_subsys(kernel, NULL, NULL); 104struct kobject *kernel_kobj;
98EXPORT_SYMBOL_GPL(kernel_subsys); 105EXPORT_SYMBOL_GPL(kernel_kobj);
99 106
100static struct attribute * kernel_attrs[] = { 107static struct attribute * kernel_attrs[] = {
101#if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET) 108#if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET)
@@ -116,24 +123,39 @@ static struct attribute_group kernel_attr_group = {
116 123
117static int __init ksysfs_init(void) 124static int __init ksysfs_init(void)
118{ 125{
119 int error = subsystem_register(&kernel_subsys); 126 int error;
120 if (!error)
121 error = sysfs_create_group(&kernel_subsys.kobj,
122 &kernel_attr_group);
123 127
124 if (!error && notes_size > 0) { 128 kernel_kobj = kobject_create_and_add("kernel", NULL);
125 notes_attr.size = notes_size; 129 if (!kernel_kobj) {
126 error = sysfs_create_bin_file(&kernel_subsys.kobj, 130 error = -ENOMEM;
127 &notes_attr); 131 goto exit;
128 } 132 }
133 error = sysfs_create_group(kernel_kobj, &kernel_attr_group);
134 if (error)
135 goto kset_exit;
129 136
130 /* 137 if (notes_size > 0) {
131 * Create "/sys/kernel/uids" directory and corresponding root user's 138 notes_attr.size = notes_size;
132 * directory under it. 139 error = sysfs_create_bin_file(kernel_kobj, &notes_attr);
133 */ 140 if (error)
134 if (!error) 141 goto group_exit;
135 error = uids_kobject_init(); 142 }
136 143
144 /* create the /sys/kernel/uids/ directory */
145 error = uids_sysfs_init();
146 if (error)
147 goto notes_exit;
148
149 return 0;
150
151notes_exit:
152 if (notes_size > 0)
153 sysfs_remove_bin_file(kernel_kobj, &notes_attr);
154group_exit:
155 sysfs_remove_group(kernel_kobj, &kernel_attr_group);
156kset_exit:
157 kobject_put(kernel_kobj);
158exit:
137 return error; 159 return error;
138} 160}
139 161
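With kernel_subsys gone, code that hangs attributes off /sys/kernel now works in terms of kernel_kobj and struct kobj_attribute. A minimal module-style sketch of the converted interface (not from this patch; the "example" names are made up for illustration):

    #include <linux/errno.h>
    #include <linux/kernel.h>
    #include <linux/kobject.h>
    #include <linux/module.h>
    #include <linux/sysfs.h>

    /* Hypothetical read-only attribute using the kobj_attribute
     * signature that ksysfs.c is converted to above. */
    static ssize_t example_show(struct kobject *kobj,
                                struct kobj_attribute *attr, char *buf)
    {
        return sprintf(buf, "42\n");
    }
    static struct kobj_attribute example_attr = __ATTR_RO(example);

    static struct kobject *example_kobj;

    static int __init example_init(void)
    {
        int error;

        /* Creates /sys/kernel/example/example */
        example_kobj = kobject_create_and_add("example", kernel_kobj);
        if (!example_kobj)
            return -ENOMEM;

        error = sysfs_create_file(example_kobj, &example_attr.attr);
        if (error)
            kobject_put(example_kobj);
        return error;
    }

    static void __exit example_exit(void)
    {
        kobject_put(example_kobj);
    }

    module_init(example_init);
    module_exit(example_exit);
    MODULE_LICENSE("GPL");
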
diff --git a/kernel/kthread.c b/kernel/kthread.c
index dcfe724300eb..0ac887882f90 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -15,6 +15,8 @@
15#include <linux/mutex.h> 15#include <linux/mutex.h>
16#include <asm/semaphore.h> 16#include <asm/semaphore.h>
17 17
18#define KTHREAD_NICE_LEVEL (-5)
19
18static DEFINE_SPINLOCK(kthread_create_lock); 20static DEFINE_SPINLOCK(kthread_create_lock);
19static LIST_HEAD(kthread_create_list); 21static LIST_HEAD(kthread_create_list);
20struct task_struct *kthreadd_task; 22struct task_struct *kthreadd_task;
@@ -94,10 +96,18 @@ static void create_kthread(struct kthread_create_info *create)
94 if (pid < 0) { 96 if (pid < 0) {
95 create->result = ERR_PTR(pid); 97 create->result = ERR_PTR(pid);
96 } else { 98 } else {
99 struct sched_param param = { .sched_priority = 0 };
97 wait_for_completion(&create->started); 100 wait_for_completion(&create->started);
98 read_lock(&tasklist_lock); 101 read_lock(&tasklist_lock);
99 create->result = find_task_by_pid(pid); 102 create->result = find_task_by_pid(pid);
100 read_unlock(&tasklist_lock); 103 read_unlock(&tasklist_lock);
104 /*
105 * root may have changed our (kthreadd's) priority or CPU mask.
106 * The kernel thread should not inherit these properties.
107 */
108 sched_setscheduler(create->result, SCHED_NORMAL, &param);
109 set_user_nice(create->result, KTHREAD_NICE_LEVEL);
110 set_cpus_allowed(create->result, CPU_MASK_ALL);
101 } 111 }
102 complete(&create->done); 112 complete(&create->done);
103} 113}
@@ -221,7 +231,7 @@ int kthreadd(void *unused)
221 /* Setup a clean context for our children to inherit. */ 231 /* Setup a clean context for our children to inherit. */
222 set_task_comm(tsk, "kthreadd"); 232 set_task_comm(tsk, "kthreadd");
223 ignore_signals(tsk); 233 ignore_signals(tsk);
224 set_user_nice(tsk, -5); 234 set_user_nice(tsk, KTHREAD_NICE_LEVEL);
225 set_cpus_allowed(tsk, CPU_MASK_ALL); 235 set_cpus_allowed(tsk, CPU_MASK_ALL);
226 236
227 current->flags |= PF_NOFREEZE; 237 current->flags |= PF_NOFREEZE;
diff --git a/kernel/latencytop.c b/kernel/latencytop.c
new file mode 100644
index 000000000000..b4e3c85abe74
--- /dev/null
+++ b/kernel/latencytop.c
@@ -0,0 +1,239 @@
1/*
2 * latencytop.c: Latency display infrastructure
3 *
4 * (C) Copyright 2008 Intel Corporation
5 * Author: Arjan van de Ven <arjan@linux.intel.com>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; version 2
10 * of the License.
11 */
12#include <linux/latencytop.h>
13#include <linux/kallsyms.h>
14#include <linux/seq_file.h>
15#include <linux/notifier.h>
16#include <linux/spinlock.h>
17#include <linux/proc_fs.h>
18#include <linux/module.h>
19#include <linux/sched.h>
20#include <linux/list.h>
21#include <linux/slab.h>
22#include <linux/stacktrace.h>
23
24static DEFINE_SPINLOCK(latency_lock);
25
26#define MAXLR 128
27static struct latency_record latency_record[MAXLR];
28
29int latencytop_enabled;
30
31void clear_all_latency_tracing(struct task_struct *p)
32{
33 unsigned long flags;
34
35 if (!latencytop_enabled)
36 return;
37
38 spin_lock_irqsave(&latency_lock, flags);
39 memset(&p->latency_record, 0, sizeof(p->latency_record));
40 p->latency_record_count = 0;
41 spin_unlock_irqrestore(&latency_lock, flags);
42}
43
44static void clear_global_latency_tracing(void)
45{
46 unsigned long flags;
47
48 spin_lock_irqsave(&latency_lock, flags);
49 memset(&latency_record, 0, sizeof(latency_record));
50 spin_unlock_irqrestore(&latency_lock, flags);
51}
52
53static void __sched
54account_global_scheduler_latency(struct task_struct *tsk, struct latency_record *lat)
55{
56 int firstnonnull = MAXLR + 1;
57 int i;
58
59 if (!latencytop_enabled)
60 return;
61
62 /* skip kernel threads for now */
63 if (!tsk->mm)
64 return;
65
66 for (i = 0; i < MAXLR; i++) {
67 int q;
68 int same = 1;
69 /* Nothing stored: */
70 if (!latency_record[i].backtrace[0]) {
71 if (firstnonnull > i)
72 firstnonnull = i;
73 continue;
74 }
75 for (q = 0 ; q < LT_BACKTRACEDEPTH ; q++) {
76 if (latency_record[i].backtrace[q] !=
77 lat->backtrace[q])
78 same = 0;
79 if (same && lat->backtrace[q] == 0)
80 break;
81 if (same && lat->backtrace[q] == ULONG_MAX)
82 break;
83 }
84 if (same) {
85 latency_record[i].count++;
86 latency_record[i].time += lat->time;
87 if (lat->time > latency_record[i].max)
88 latency_record[i].max = lat->time;
89 return;
90 }
91 }
92
93 i = firstnonnull;
94 if (i >= MAXLR - 1)
95 return;
96
97 /* Allocted a new one: */
98 memcpy(&latency_record[i], lat, sizeof(struct latency_record));
99}
100
101static inline void store_stacktrace(struct task_struct *tsk, struct latency_record *lat)
102{
103 struct stack_trace trace;
104
105 memset(&trace, 0, sizeof(trace));
106 trace.max_entries = LT_BACKTRACEDEPTH;
107 trace.entries = &lat->backtrace[0];
108 trace.skip = 0;
109 save_stack_trace_tsk(tsk, &trace);
110}
111
112void __sched
113account_scheduler_latency(struct task_struct *tsk, int usecs, int inter)
114{
115 unsigned long flags;
116 int i, q;
117 struct latency_record lat;
118
119 if (!latencytop_enabled)
120 return;
121
122 /* Long interruptible waits are generally user requested... */
123 if (inter && usecs > 5000)
124 return;
125
126 memset(&lat, 0, sizeof(lat));
127 lat.count = 1;
128 lat.time = usecs;
129 lat.max = usecs;
130 store_stacktrace(tsk, &lat);
131
132 spin_lock_irqsave(&latency_lock, flags);
133
134 account_global_scheduler_latency(tsk, &lat);
135
136 /*
137 * short term hack; if we're > 32 we stop; future we recycle:
138 */
139 tsk->latency_record_count++;
140 if (tsk->latency_record_count >= LT_SAVECOUNT)
141 goto out_unlock;
142
143 for (i = 0; i < LT_SAVECOUNT ; i++) {
144 struct latency_record *mylat;
145 int same = 1;
146 mylat = &tsk->latency_record[i];
147 for (q = 0 ; q < LT_BACKTRACEDEPTH ; q++) {
148 if (mylat->backtrace[q] !=
149 lat.backtrace[q])
150 same = 0;
151 if (same && lat.backtrace[q] == 0)
152 break;
153 if (same && lat.backtrace[q] == ULONG_MAX)
154 break;
155 }
156 if (same) {
157 mylat->count++;
158 mylat->time += lat.time;
159 if (lat.time > mylat->max)
160 mylat->max = lat.time;
161 goto out_unlock;
162 }
163 }
164
165 /* Allocated a new one: */
166 i = tsk->latency_record_count;
167 memcpy(&tsk->latency_record[i], &lat, sizeof(struct latency_record));
168
169out_unlock:
170 spin_unlock_irqrestore(&latency_lock, flags);
171}
172
173static int lstats_show(struct seq_file *m, void *v)
174{
175 int i;
176
177 seq_puts(m, "Latency Top version : v0.1\n");
178
179 for (i = 0; i < MAXLR; i++) {
180 if (latency_record[i].backtrace[0]) {
181 int q;
182 seq_printf(m, "%i %li %li ",
183 latency_record[i].count,
184 latency_record[i].time,
185 latency_record[i].max);
186 for (q = 0; q < LT_BACKTRACEDEPTH; q++) {
187 char sym[KSYM_NAME_LEN];
188 char *c;
189 if (!latency_record[i].backtrace[q])
190 break;
191 if (latency_record[i].backtrace[q] == ULONG_MAX)
192 break;
193 sprint_symbol(sym, latency_record[i].backtrace[q]);
194 c = strchr(sym, '+');
195 if (c)
196 *c = 0;
197 seq_printf(m, "%s ", sym);
198 }
199 seq_printf(m, "\n");
200 }
201 }
202 return 0;
203}
204
205static ssize_t
206lstats_write(struct file *file, const char __user *buf, size_t count,
207 loff_t *offs)
208{
209 clear_global_latency_tracing();
210
211 return count;
212}
213
214static int lstats_open(struct inode *inode, struct file *filp)
215{
216 return single_open(filp, lstats_show, NULL);
217}
218
219static struct file_operations lstats_fops = {
220 .open = lstats_open,
221 .read = seq_read,
222 .write = lstats_write,
223 .llseek = seq_lseek,
224 .release = single_release,
225};
226
227static int __init init_lstats_procfs(void)
228{
229 struct proc_dir_entry *pe;
230
231 pe = create_proc_entry("latency_stats", 0644, NULL);
232 if (!pe)
233 return -ENOMEM;
234
235 pe->proc_fops = &lstats_fops;
236
237 return 0;
238}
239__initcall(init_lstats_procfs);
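lstats_show() prints one record per line as "count time max sym sym ..." (times in microseconds) after a version header, and any write to the file clears the global records. A small userspace reader for /proc/latency_stats (assumes a kernel built with this code and with latency collection enabled):

    #include <stdio.h>

    int main(void)
    {
        FILE *f = fopen("/proc/latency_stats", "r");
        char line[512];

        if (!f) {
            perror("/proc/latency_stats");
            return 1;
        }
        while (fgets(line, sizeof(line), f)) {
            long count, time_us, max_us;

            /* The "Latency Top version" header fails this parse and is
             * skipped; record lines start with three numbers. */
            if (sscanf(line, "%ld %ld %ld", &count, &time_us, &max_us) == 3)
                printf("hits=%ld total_us=%ld max_us=%ld\n",
                       count, time_us, max_us);
        }
        fclose(f);
        return 0;
    }
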
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 55fe0c7cd95f..3574379f4d62 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -2424,7 +2424,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
2424 return 0; 2424 return 0;
2425 2425
2426 /* 2426 /*
2427 * Calculate the chain hash: it's the combined has of all the 2427 * Calculate the chain hash: it's the combined hash of all the
2428 * lock keys along the dependency chain. We save the hash value 2428 * lock keys along the dependency chain. We save the hash value
2429 * at every step so that we can get the current hash easily 2429 * at every step so that we can get the current hash easily
2430 * after unlock. The chain hash is then used to cache dependency 2430 * after unlock. The chain hash is then used to cache dependency
@@ -2654,10 +2654,15 @@ static void check_flags(unsigned long flags)
2654 if (!debug_locks) 2654 if (!debug_locks)
2655 return; 2655 return;
2656 2656
2657 if (irqs_disabled_flags(flags)) 2657 if (irqs_disabled_flags(flags)) {
2658 DEBUG_LOCKS_WARN_ON(current->hardirqs_enabled); 2658 if (DEBUG_LOCKS_WARN_ON(current->hardirqs_enabled)) {
2659 else 2659 printk("possible reason: unannotated irqs-off.\n");
2660 DEBUG_LOCKS_WARN_ON(!current->hardirqs_enabled); 2660 }
2661 } else {
2662 if (DEBUG_LOCKS_WARN_ON(!current->hardirqs_enabled)) {
2663 printk("possible reason: unannotated irqs-on.\n");
2664 }
2665 }
2661 2666
2662 /* 2667 /*
2663 * We dont accurately track softirq state in e.g. 2668 * We dont accurately track softirq state in e.g.
@@ -2927,7 +2932,7 @@ static void zap_class(struct lock_class *class)
2927 2932
2928} 2933}
2929 2934
2930static inline int within(void *addr, void *start, unsigned long size) 2935static inline int within(const void *addr, void *start, unsigned long size)
2931{ 2936{
2932 return addr >= start && addr < start + size; 2937 return addr >= start && addr < start + size;
2933} 2938}
@@ -2938,9 +2943,10 @@ void lockdep_free_key_range(void *start, unsigned long size)
2938 struct list_head *head; 2943 struct list_head *head;
2939 unsigned long flags; 2944 unsigned long flags;
2940 int i; 2945 int i;
2946 int locked;
2941 2947
2942 raw_local_irq_save(flags); 2948 raw_local_irq_save(flags);
2943 graph_lock(); 2949 locked = graph_lock();
2944 2950
2945 /* 2951 /*
2946 * Unhash all classes that were created by this module: 2952 * Unhash all classes that were created by this module:
@@ -2949,12 +2955,16 @@ void lockdep_free_key_range(void *start, unsigned long size)
2949 head = classhash_table + i; 2955 head = classhash_table + i;
2950 if (list_empty(head)) 2956 if (list_empty(head))
2951 continue; 2957 continue;
2952 list_for_each_entry_safe(class, next, head, hash_entry) 2958 list_for_each_entry_safe(class, next, head, hash_entry) {
2953 if (within(class->key, start, size)) 2959 if (within(class->key, start, size))
2954 zap_class(class); 2960 zap_class(class);
2961 else if (within(class->name, start, size))
2962 zap_class(class);
2963 }
2955 } 2964 }
2956 2965
2957 graph_unlock(); 2966 if (locked)
2967 graph_unlock();
2958 raw_local_irq_restore(flags); 2968 raw_local_irq_restore(flags);
2959} 2969}
2960 2970
@@ -2964,6 +2974,7 @@ void lockdep_reset_lock(struct lockdep_map *lock)
2964 struct list_head *head; 2974 struct list_head *head;
2965 unsigned long flags; 2975 unsigned long flags;
2966 int i, j; 2976 int i, j;
2977 int locked;
2967 2978
2968 raw_local_irq_save(flags); 2979 raw_local_irq_save(flags);
2969 2980
@@ -2982,7 +2993,7 @@ void lockdep_reset_lock(struct lockdep_map *lock)
2982 * Debug check: in the end all mapped classes should 2993 * Debug check: in the end all mapped classes should
2983 * be gone. 2994 * be gone.
2984 */ 2995 */
2985 graph_lock(); 2996 locked = graph_lock();
2986 for (i = 0; i < CLASSHASH_SIZE; i++) { 2997 for (i = 0; i < CLASSHASH_SIZE; i++) {
2987 head = classhash_table + i; 2998 head = classhash_table + i;
2988 if (list_empty(head)) 2999 if (list_empty(head))
@@ -2995,7 +3006,8 @@ void lockdep_reset_lock(struct lockdep_map *lock)
2995 } 3006 }
2996 } 3007 }
2997 } 3008 }
2998 graph_unlock(); 3009 if (locked)
3010 graph_unlock();
2999 3011
3000out_restore: 3012out_restore:
3001 raw_local_irq_restore(flags); 3013 raw_local_irq_restore(flags);
@@ -3054,11 +3066,6 @@ void __init lockdep_info(void)
3054#endif 3066#endif
3055} 3067}
3056 3068
3057static inline int in_range(const void *start, const void *addr, const void *end)
3058{
3059 return addr >= start && addr <= end;
3060}
3061
3062static void 3069static void
3063print_freed_lock_bug(struct task_struct *curr, const void *mem_from, 3070print_freed_lock_bug(struct task_struct *curr, const void *mem_from,
3064 const void *mem_to, struct held_lock *hlock) 3071 const void *mem_to, struct held_lock *hlock)
@@ -3080,6 +3087,13 @@ print_freed_lock_bug(struct task_struct *curr, const void *mem_from,
3080 dump_stack(); 3087 dump_stack();
3081} 3088}
3082 3089
3090static inline int not_in_range(const void* mem_from, unsigned long mem_len,
3091 const void* lock_from, unsigned long lock_len)
3092{
3093 return lock_from + lock_len <= mem_from ||
3094 mem_from + mem_len <= lock_from;
3095}
3096
3083/* 3097/*
3084 * Called when kernel memory is freed (or unmapped), or if a lock 3098 * Called when kernel memory is freed (or unmapped), or if a lock
3085 * is destroyed or reinitialized - this code checks whether there is 3099 * is destroyed or reinitialized - this code checks whether there is
@@ -3087,7 +3101,6 @@ print_freed_lock_bug(struct task_struct *curr, const void *mem_from,
3087 */ 3101 */
3088void debug_check_no_locks_freed(const void *mem_from, unsigned long mem_len) 3102void debug_check_no_locks_freed(const void *mem_from, unsigned long mem_len)
3089{ 3103{
3090 const void *mem_to = mem_from + mem_len, *lock_from, *lock_to;
3091 struct task_struct *curr = current; 3104 struct task_struct *curr = current;
3092 struct held_lock *hlock; 3105 struct held_lock *hlock;
3093 unsigned long flags; 3106 unsigned long flags;
@@ -3100,14 +3113,11 @@ void debug_check_no_locks_freed(const void *mem_from, unsigned long mem_len)
3100 for (i = 0; i < curr->lockdep_depth; i++) { 3113 for (i = 0; i < curr->lockdep_depth; i++) {
3101 hlock = curr->held_locks + i; 3114 hlock = curr->held_locks + i;
3102 3115
3103 lock_from = (void *)hlock->instance; 3116 if (not_in_range(mem_from, mem_len, hlock->instance,
3104 lock_to = (void *)(hlock->instance + 1); 3117 sizeof(*hlock->instance)))
3105
3106 if (!in_range(mem_from, lock_from, mem_to) &&
3107 !in_range(mem_from, lock_to, mem_to))
3108 continue; 3118 continue;
3109 3119
3110 print_freed_lock_bug(curr, mem_from, mem_to, hlock); 3120 print_freed_lock_bug(curr, mem_from, mem_from + mem_len, hlock);
3111 break; 3121 break;
3112 } 3122 }
3113 local_irq_restore(flags); 3123 local_irq_restore(flags);
@@ -3173,6 +3183,13 @@ retry:
3173 printk(" locked it.\n"); 3183 printk(" locked it.\n");
3174 3184
3175 do_each_thread(g, p) { 3185 do_each_thread(g, p) {
3186 /*
3187 * It's not reliable to print a task's held locks
3188 * if it's not sleeping (or if it's not the current
3189 * task):
3190 */
3191 if (p->state == TASK_RUNNING && p != current)
3192 continue;
3176 if (p->lockdep_depth) 3193 if (p->lockdep_depth)
3177 lockdep_print_held_locks(p); 3194 lockdep_print_held_locks(p);
3178 if (!unlock) 3195 if (!unlock)
@@ -3189,7 +3206,11 @@ retry:
3189 3206
3190EXPORT_SYMBOL_GPL(debug_show_all_locks); 3207EXPORT_SYMBOL_GPL(debug_show_all_locks);
3191 3208
3192void debug_show_held_locks(struct task_struct *task) 3209/*
3210 * Careful: only use this function if you are sure that
3211 * the task cannot run in parallel!
3212 */
3213void __debug_show_held_locks(struct task_struct *task)
3193{ 3214{
3194 if (unlikely(!debug_locks)) { 3215 if (unlikely(!debug_locks)) {
3195 printk("INFO: lockdep is turned off.\n"); 3216 printk("INFO: lockdep is turned off.\n");
@@ -3197,6 +3218,12 @@ void debug_show_held_locks(struct task_struct *task)
3197 } 3218 }
3198 lockdep_print_held_locks(task); 3219 lockdep_print_held_locks(task);
3199} 3220}
3221EXPORT_SYMBOL_GPL(__debug_show_held_locks);
3222
3223void debug_show_held_locks(struct task_struct *task)
3224{
3225 __debug_show_held_locks(task);
3226}
3200 3227
3201EXPORT_SYMBOL_GPL(debug_show_held_locks); 3228EXPORT_SYMBOL_GPL(debug_show_held_locks);
3202 3229
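not_in_range() replaces the old endpoint test with a proper interval check: two half-open ranges [mem_from, mem_from+mem_len) and [lock_from, lock_from+lock_len) are disjoint iff one ends at or before the other starts, which also catches a held lock that straddles the freed region (neither endpoint inside it). The same predicate, exercised standalone:

    #include <assert.h>
    #include <stddef.h>

    static int not_in_range(const char *mem_from, size_t mem_len,
                            const char *lock_from, size_t lock_len)
    {
        return lock_from + lock_len <= mem_from ||
               mem_from + mem_len <= lock_from;
    }

    int main(void)
    {
        char buf[64];

        assert(!not_in_range(buf, 64, buf + 16, 8)); /* lock inside freed mem */
        assert(!not_in_range(buf + 8, 16, buf, 16)); /* partial overlap       */
        assert(not_in_range(buf, 16, buf + 16, 8));  /* adjacent, disjoint    */
        return 0;
    }
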
diff --git a/kernel/marker.c b/kernel/marker.c
index ccb48d9a3657..5323cfaedbce 100644
--- a/kernel/marker.c
+++ b/kernel/marker.c
@@ -28,7 +28,7 @@ extern struct marker __start___markers[];
28extern struct marker __stop___markers[]; 28extern struct marker __stop___markers[];
29 29
30/* 30/*
31 * module_mutex nests inside markers_mutex. Markers mutex protects the builtin 31 * markers_mutex nests inside module_mutex. Markers mutex protects the builtin
32 * and module markers, the hash table and deferred_sync. 32 * and module markers, the hash table and deferred_sync.
33 */ 33 */
34static DEFINE_MUTEX(markers_mutex); 34static DEFINE_MUTEX(markers_mutex);
@@ -257,7 +257,6 @@ static void disable_marker(struct marker *elem)
257 * @refcount: number of references left to the given probe_module (out) 257 * @refcount: number of references left to the given probe_module (out)
258 * 258 *
259 * Updates the probe callback corresponding to a range of markers. 259 * Updates the probe callback corresponding to a range of markers.
260 * Must be called with markers_mutex held.
261 */ 260 */
262void marker_update_probe_range(struct marker *begin, 261void marker_update_probe_range(struct marker *begin,
263 struct marker *end, struct module *probe_module, 262 struct marker *end, struct module *probe_module,
@@ -266,6 +265,7 @@ void marker_update_probe_range(struct marker *begin,
266 struct marker *iter; 265 struct marker *iter;
267 struct marker_entry *mark_entry; 266 struct marker_entry *mark_entry;
268 267
268 mutex_lock(&markers_mutex);
269 for (iter = begin; iter < end; iter++) { 269 for (iter = begin; iter < end; iter++) {
270 mark_entry = get_marker(iter->name); 270 mark_entry = get_marker(iter->name);
271 if (mark_entry && mark_entry->refcount) { 271 if (mark_entry && mark_entry->refcount) {
@@ -281,6 +281,7 @@ void marker_update_probe_range(struct marker *begin,
281 disable_marker(iter); 281 disable_marker(iter);
282 } 282 }
283 } 283 }
284 mutex_unlock(&markers_mutex);
284} 285}
285 286
286/* 287/*
@@ -293,7 +294,6 @@ static void marker_update_probes(struct module *probe_module)
293{ 294{
294 int refcount = 0; 295 int refcount = 0;
295 296
296 mutex_lock(&markers_mutex);
297 /* Core kernel markers */ 297 /* Core kernel markers */
298 marker_update_probe_range(__start___markers, 298 marker_update_probe_range(__start___markers,
299 __stop___markers, probe_module, &refcount); 299 __stop___markers, probe_module, &refcount);
@@ -303,7 +303,6 @@ static void marker_update_probes(struct module *probe_module)
303 synchronize_sched(); 303 synchronize_sched();
304 deferred_sync = 0; 304 deferred_sync = 0;
305 } 305 }
306 mutex_unlock(&markers_mutex);
307} 306}
308 307
309/** 308/**
@@ -320,7 +319,7 @@ int marker_probe_register(const char *name, const char *format,
320 marker_probe_func *probe, void *private) 319 marker_probe_func *probe, void *private)
321{ 320{
322 struct marker_entry *entry; 321 struct marker_entry *entry;
323 int ret = 0, need_update = 0; 322 int ret = 0;
324 323
325 mutex_lock(&markers_mutex); 324 mutex_lock(&markers_mutex);
326 entry = get_marker(name); 325 entry = get_marker(name);
@@ -335,11 +334,11 @@ int marker_probe_register(const char *name, const char *format,
335 ret = add_marker(name, format, probe, private); 334 ret = add_marker(name, format, probe, private);
336 if (ret) 335 if (ret)
337 goto end; 336 goto end;
338 need_update = 1; 337 mutex_unlock(&markers_mutex);
338 marker_update_probes(NULL);
339 return ret;
339end: 340end:
340 mutex_unlock(&markers_mutex); 341 mutex_unlock(&markers_mutex);
341 if (need_update)
342 marker_update_probes(NULL);
343 return ret; 342 return ret;
344} 343}
345EXPORT_SYMBOL_GPL(marker_probe_register); 344EXPORT_SYMBOL_GPL(marker_probe_register);
@@ -355,7 +354,6 @@ void *marker_probe_unregister(const char *name)
355 struct module *probe_module; 354 struct module *probe_module;
356 struct marker_entry *entry; 355 struct marker_entry *entry;
357 void *private; 356 void *private;
358 int need_update = 0;
359 357
360 mutex_lock(&markers_mutex); 358 mutex_lock(&markers_mutex);
361 entry = get_marker(name); 359 entry = get_marker(name);
@@ -368,11 +366,11 @@ void *marker_probe_unregister(const char *name)
368 probe_module = __module_text_address((unsigned long)entry->probe); 366 probe_module = __module_text_address((unsigned long)entry->probe);
369 private = remove_marker(name); 367 private = remove_marker(name);
370 deferred_sync = 1; 368 deferred_sync = 1;
371 need_update = 1; 369 mutex_unlock(&markers_mutex);
370 marker_update_probes(probe_module);
371 return private;
372end: 372end:
373 mutex_unlock(&markers_mutex); 373 mutex_unlock(&markers_mutex);
374 if (need_update)
375 marker_update_probes(probe_module);
376 return private; 374 return private;
377} 375}
378EXPORT_SYMBOL_GPL(marker_probe_unregister); 376EXPORT_SYMBOL_GPL(marker_probe_unregister);
@@ -392,7 +390,6 @@ void *marker_probe_unregister_private_data(void *private)
392 struct marker_entry *entry; 390 struct marker_entry *entry;
393 int found = 0; 391 int found = 0;
394 unsigned int i; 392 unsigned int i;
395 int need_update = 0;
396 393
397 mutex_lock(&markers_mutex); 394 mutex_lock(&markers_mutex);
398 for (i = 0; i < MARKER_TABLE_SIZE; i++) { 395 for (i = 0; i < MARKER_TABLE_SIZE; i++) {
@@ -414,11 +411,11 @@ iter_end:
414 probe_module = __module_text_address((unsigned long)entry->probe); 411 probe_module = __module_text_address((unsigned long)entry->probe);
415 private = remove_marker(entry->name); 412 private = remove_marker(entry->name);
416 deferred_sync = 1; 413 deferred_sync = 1;
417 need_update = 1; 414 mutex_unlock(&markers_mutex);
415 marker_update_probes(probe_module);
416 return private;
418end: 417end:
419 mutex_unlock(&markers_mutex); 418 mutex_unlock(&markers_mutex);
420 if (need_update)
421 marker_update_probes(probe_module);
422 return private; 419 return private;
423} 420}
424EXPORT_SYMBOL_GPL(marker_probe_unregister_private_data); 421EXPORT_SYMBOL_GPL(marker_probe_unregister_private_data);
@@ -434,7 +431,7 @@ EXPORT_SYMBOL_GPL(marker_probe_unregister_private_data);
434int marker_arm(const char *name) 431int marker_arm(const char *name)
435{ 432{
436 struct marker_entry *entry; 433 struct marker_entry *entry;
437 int ret = 0, need_update = 0; 434 int ret = 0;
438 435
439 mutex_lock(&markers_mutex); 436 mutex_lock(&markers_mutex);
440 entry = get_marker(name); 437 entry = get_marker(name);
@@ -447,11 +444,9 @@ int marker_arm(const char *name)
447 */ 444 */
448 if (entry->refcount++) 445 if (entry->refcount++)
449 goto end; 446 goto end;
450 need_update = 1;
451end: 447end:
452 mutex_unlock(&markers_mutex); 448 mutex_unlock(&markers_mutex);
453 if (need_update) 449 marker_update_probes(NULL);
454 marker_update_probes(NULL);
455 return ret; 450 return ret;
456} 451}
457EXPORT_SYMBOL_GPL(marker_arm); 452EXPORT_SYMBOL_GPL(marker_arm);
@@ -467,7 +462,7 @@ EXPORT_SYMBOL_GPL(marker_arm);
467int marker_disarm(const char *name) 462int marker_disarm(const char *name)
468{ 463{
469 struct marker_entry *entry; 464 struct marker_entry *entry;
470 int ret = 0, need_update = 0; 465 int ret = 0;
471 466
472 mutex_lock(&markers_mutex); 467 mutex_lock(&markers_mutex);
473 entry = get_marker(name); 468 entry = get_marker(name);
@@ -486,11 +481,9 @@ int marker_disarm(const char *name)
486 ret = -EPERM; 481 ret = -EPERM;
487 goto end; 482 goto end;
488 } 483 }
489 need_update = 1;
490end: 484end:
491 mutex_unlock(&markers_mutex); 485 mutex_unlock(&markers_mutex);
492 if (need_update) 486 marker_update_probes(NULL);
493 marker_update_probes(NULL);
494 return ret; 487 return ret;
495} 488}
496EXPORT_SYMBOL_GPL(marker_disarm); 489EXPORT_SYMBOL_GPL(marker_disarm);
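Note on the marker.c hunks above: every caller drops the need_update flag and instead releases markers_mutex before calling marker_update_probes() directly, and marker_update_probes() itself no longer unlocks the mutex at its end. A minimal sketch of that control-flow pattern, using hypothetical names (registry_mutex, add_entry(), update_entries()) rather than the real marker API:

#include <linux/mutex.h>
#include <linux/errno.h>

static DEFINE_MUTEX(registry_mutex);

/* Stand-ins for add_marker() and marker_update_probes(); hypothetical. */
static int add_entry(const char *name)
{
        return name ? 0 : -EINVAL;
}

static void update_entries(void)
{
        /* May sleep (e.g. in synchronize_sched()); called without the mutex. */
}

static int register_entry(const char *name)
{
        int ret;

        mutex_lock(&registry_mutex);
        ret = add_entry(name);
        if (ret) {
                mutex_unlock(&registry_mutex);
                return ret;
        }
        /* Success path mirrors the new marker_probe_register(): release
         * the mutex first, then run the update directly instead of
         * deferring it behind a need_update flag. */
        mutex_unlock(&registry_mutex);
        update_entries();
        return 0;
}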
diff --git a/kernel/module.c b/kernel/module.c
index 3202c9950073..bd60278ee703 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -47,8 +47,6 @@
47#include <asm/cacheflush.h> 47#include <asm/cacheflush.h>
48#include <linux/license.h> 48#include <linux/license.h>
49 49
50extern int module_sysfs_initialized;
51
52#if 0 50#if 0
53#define DEBUGP printk 51#define DEBUGP printk
54#else 52#else
@@ -67,6 +65,9 @@ extern int module_sysfs_initialized;
67static DEFINE_MUTEX(module_mutex); 65static DEFINE_MUTEX(module_mutex);
68static LIST_HEAD(modules); 66static LIST_HEAD(modules);
69 67
68/* Waiting for a module to finish initializing? */
69static DECLARE_WAIT_QUEUE_HEAD(module_wq);
70
70static BLOCKING_NOTIFIER_HEAD(module_notify_list); 71static BLOCKING_NOTIFIER_HEAD(module_notify_list);
71 72
72int register_module_notifier(struct notifier_block * nb) 73int register_module_notifier(struct notifier_block * nb)
@@ -81,12 +82,16 @@ int unregister_module_notifier(struct notifier_block * nb)
81} 82}
82EXPORT_SYMBOL(unregister_module_notifier); 83EXPORT_SYMBOL(unregister_module_notifier);
83 84
84/* We require a truly strong try_module_get() */ 85/* We require a truly strong try_module_get(): 0 means failure due to
86 ongoing or failed initialization etc. */
85static inline int strong_try_module_get(struct module *mod) 87static inline int strong_try_module_get(struct module *mod)
86{ 88{
87 if (mod && mod->state == MODULE_STATE_COMING) 89 if (mod && mod->state == MODULE_STATE_COMING)
90 return -EBUSY;
91 if (try_module_get(mod))
88 return 0; 92 return 0;
89 return try_module_get(mod); 93 else
94 return -ENOENT;
90} 95}
91 96
92static inline void add_taint_module(struct module *mod, unsigned flag) 97static inline void add_taint_module(struct module *mod, unsigned flag)
@@ -425,6 +430,14 @@ static unsigned int find_pcpusec(Elf_Ehdr *hdr,
425 return find_sec(hdr, sechdrs, secstrings, ".data.percpu"); 430 return find_sec(hdr, sechdrs, secstrings, ".data.percpu");
426} 431}
427 432
433static void percpu_modcopy(void *pcpudest, const void *from, unsigned long size)
434{
435 int cpu;
436
437 for_each_possible_cpu(cpu)
438 memcpy(pcpudest + per_cpu_offset(cpu), from, size);
439}
440
428static int percpu_modinit(void) 441static int percpu_modinit(void)
429{ 442{
430 pcpu_num_used = 2; 443 pcpu_num_used = 2;
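The new percpu_modcopy() replicates one template block into every possible CPU's instance of a per-cpu region by offsetting the destination with per_cpu_offset(). A hedged sketch of the same loop in isolation (the function name here is a placeholder; the module loader uses it to populate a module's .data.percpu section):

#include <linux/percpu.h>
#include <linux/cpumask.h>
#include <linux/string.h>

/* Copy one template into each CPU's copy of a reserved per-cpu block.
 * "pcpudest" is assumed to point into the per-cpu area. */
static void copy_to_all_cpus(void *pcpudest, const void *from,
                             unsigned long size)
{
        int cpu;

        for_each_possible_cpu(cpu)
                memcpy(pcpudest + per_cpu_offset(cpu), from, size);
}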
@@ -497,6 +510,8 @@ static struct module_attribute modinfo_##field = { \
497MODINFO_ATTR(version); 510MODINFO_ATTR(version);
498MODINFO_ATTR(srcversion); 511MODINFO_ATTR(srcversion);
499 512
513static char last_unloaded_module[MODULE_NAME_LEN+1];
514
500#ifdef CONFIG_MODULE_UNLOAD 515#ifdef CONFIG_MODULE_UNLOAD
501/* Init the unload section of the module. */ 516/* Init the unload section of the module. */
502static void module_unload_init(struct module *mod) 517static void module_unload_init(struct module *mod)
@@ -538,11 +553,21 @@ static int already_uses(struct module *a, struct module *b)
538static int use_module(struct module *a, struct module *b) 553static int use_module(struct module *a, struct module *b)
539{ 554{
540 struct module_use *use; 555 struct module_use *use;
541 int no_warn; 556 int no_warn, err;
542 557
543 if (b == NULL || already_uses(a, b)) return 1; 558 if (b == NULL || already_uses(a, b)) return 1;
544 559
545 if (!strong_try_module_get(b)) 560 /* If we're interrupted or time out, we fail. */
561 if (wait_event_interruptible_timeout(
562 module_wq, (err = strong_try_module_get(b)) != -EBUSY,
563 30 * HZ) <= 0) {
564 printk("%s: gave up waiting for init of module %s.\n",
565 a->name, b->name);
566 return 0;
567 }
568
569 /* If strong_try_module_get() returned a different error, we fail. */
570 if (err)
546 return 0; 571 return 0;
547 572
548 DEBUGP("Allocating new usage for %s.\n", a->name); 573 DEBUGP("Allocating new usage for %s.\n", a->name);
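With strong_try_module_get() now distinguishing "still initializing" (-EBUSY) from "gone or failed" (-ENOENT), use_module() waits on module_wq with wait_event_interruptible_timeout() until the target module leaves the COMING state; sys_init_module() wakes the queue once initialization succeeds or fails (see the wake_up() calls further down). A minimal sketch of that wait/wake pairing with generic names (my_wq, resource_state and the helpers are hypothetical; the real code re-checks module state, not a plain int):

#include <linux/wait.h>
#include <linux/sched.h>
#include <linux/jiffies.h>
#include <linux/errno.h>

static DECLARE_WAIT_QUEUE_HEAD(my_wq);
static int resource_state = -EBUSY;     /* -EBUSY while initializing */

/* Consumer: wait up to 30s for the resource to leave the "busy" state. */
static int wait_for_resource(void)
{
        int err;

        if (wait_event_interruptible_timeout(my_wq,
                        (err = resource_state) != -EBUSY,
                        30 * HZ) <= 0)
                return -ETIMEDOUT;      /* interrupted or timed out */

        return err;                     /* 0 on success, other errno on failure */
}

/* Producer: publish the final state and wake all waiters. */
static void resource_ready(int final_state)
{
        resource_state = final_state;
        wake_up(&my_wq);
}

As in the patch, a return value of zero or less from the wait means the caller was interrupted or the timeout expired, and it gives up rather than blocking module load forever.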
@@ -720,6 +745,8 @@ sys_delete_module(const char __user *name_user, unsigned int flags)
720 mod->exit(); 745 mod->exit();
721 mutex_lock(&module_mutex); 746 mutex_lock(&module_mutex);
722 } 747 }
748 /* Store the name of the last unloaded module for diagnostic purposes */
749 strlcpy(last_unloaded_module, mod->name, sizeof(last_unloaded_module));
723 free_module(mod); 750 free_module(mod);
724 751
725 out: 752 out:
@@ -813,7 +840,7 @@ static inline void module_unload_free(struct module *mod)
813 840
814static inline int use_module(struct module *a, struct module *b) 841static inline int use_module(struct module *a, struct module *b)
815{ 842{
816 return strong_try_module_get(b); 843 return strong_try_module_get(b) == 0;
817} 844}
818 845
819static inline void module_unload_init(struct module *mod) 846static inline void module_unload_init(struct module *mod)
@@ -952,7 +979,8 @@ static unsigned long resolve_symbol(Elf_Shdr *sechdrs,
952 ret = __find_symbol(name, &owner, &crc, 979 ret = __find_symbol(name, &owner, &crc,
953 !(mod->taints & TAINT_PROPRIETARY_MODULE)); 980 !(mod->taints & TAINT_PROPRIETARY_MODULE));
954 if (ret) { 981 if (ret) {
955 /* use_module can fail due to OOM, or module unloading */ 982 /* use_module can fail due to OOM,
983 or module initialization or unloading */
956 if (!check_version(sechdrs, versindex, name, mod, crc) || 984 if (!check_version(sechdrs, versindex, name, mod, crc) ||
957 !use_module(mod, owner)) 985 !use_module(mod, owner))
958 ret = 0; 986 ret = 0;
@@ -1120,7 +1148,7 @@ static void add_notes_attrs(struct module *mod, unsigned int nsect,
1120 ++loaded; 1148 ++loaded;
1121 } 1149 }
1122 1150
1123 notes_attrs->dir = kobject_add_dir(&mod->mkobj.kobj, "notes"); 1151 notes_attrs->dir = kobject_create_and_add("notes", &mod->mkobj.kobj);
1124 if (!notes_attrs->dir) 1152 if (!notes_attrs->dir)
1125 goto out; 1153 goto out;
1126 1154
@@ -1210,6 +1238,7 @@ void module_remove_modinfo_attrs(struct module *mod)
1210int mod_sysfs_init(struct module *mod) 1238int mod_sysfs_init(struct module *mod)
1211{ 1239{
1212 int err; 1240 int err;
1241 struct kobject *kobj;
1213 1242
1214 if (!module_sysfs_initialized) { 1243 if (!module_sysfs_initialized) {
1215 printk(KERN_ERR "%s: module sysfs not initialized\n", 1244 printk(KERN_ERR "%s: module sysfs not initialized\n",
@@ -1217,15 +1246,25 @@ int mod_sysfs_init(struct module *mod)
1217 err = -EINVAL; 1246 err = -EINVAL;
1218 goto out; 1247 goto out;
1219 } 1248 }
1220 memset(&mod->mkobj.kobj, 0, sizeof(mod->mkobj.kobj)); 1249
1221 err = kobject_set_name(&mod->mkobj.kobj, "%s", mod->name); 1250 kobj = kset_find_obj(module_kset, mod->name);
1222 if (err) 1251 if (kobj) {
1252 printk(KERN_ERR "%s: module is already loaded\n", mod->name);
1253 kobject_put(kobj);
1254 err = -EINVAL;
1223 goto out; 1255 goto out;
1224 kobj_set_kset_s(&mod->mkobj, module_subsys); 1256 }
1257
1225 mod->mkobj.mod = mod; 1258 mod->mkobj.mod = mod;
1226 1259
1227 kobject_init(&mod->mkobj.kobj); 1260 memset(&mod->mkobj.kobj, 0, sizeof(mod->mkobj.kobj));
1261 mod->mkobj.kobj.kset = module_kset;
1262 err = kobject_init_and_add(&mod->mkobj.kobj, &module_ktype, NULL,
1263 "%s", mod->name);
1264 if (err)
1265 kobject_put(&mod->mkobj.kobj);
1228 1266
1267 /* delay uevent until full sysfs population */
1229out: 1268out:
1230 return err; 1269 return err;
1231} 1270}
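The rewritten mod_sysfs_init() uses the consolidated kobject API: probe for an existing entry with kset_find_obj() (dropping the reference it returns), then initialize and register in one step with kobject_init_and_add(), calling kobject_put() if that fails so the embedded kobject's reference is released. A hedged sketch of the same sequence for a generic object (struct my_obj, my_kset and my_ktype are assumptions):

#include <linux/kobject.h>
#include <linux/string.h>
#include <linux/errno.h>

/* Assumed to exist elsewhere: the kset and ktype this object belongs to. */
extern struct kset *my_kset;
extern struct kobj_type my_ktype;

struct my_obj {
        struct kobject kobj;
        /* ... payload ... */
};

static int my_obj_register(struct my_obj *obj, const char *name)
{
        struct kobject *dup;
        int err;

        /* Refuse duplicates: kset_find_obj() returns a counted reference. */
        dup = kset_find_obj(my_kset, name);
        if (dup) {
                kobject_put(dup);
                return -EEXIST;
        }

        memset(&obj->kobj, 0, sizeof(obj->kobj));
        obj->kobj.kset = my_kset;
        err = kobject_init_and_add(&obj->kobj, &my_ktype, NULL, "%s", name);
        if (err)
                kobject_put(&obj->kobj);        /* drop the ref taken by init */
        return err;
}

Putting the kobject on failure is what replaces the old kobject_del()/kobject_unregister() pairs removed elsewhere in this patch.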
@@ -1236,12 +1275,7 @@ int mod_sysfs_setup(struct module *mod,
1236{ 1275{
1237 int err; 1276 int err;
1238 1277
1239 /* delay uevent until full sysfs population */ 1278 mod->holders_dir = kobject_create_and_add("holders", &mod->mkobj.kobj);
1240 err = kobject_add(&mod->mkobj.kobj);
1241 if (err)
1242 goto out;
1243
1244 mod->holders_dir = kobject_add_dir(&mod->mkobj.kobj, "holders");
1245 if (!mod->holders_dir) { 1279 if (!mod->holders_dir) {
1246 err = -ENOMEM; 1280 err = -ENOMEM;
1247 goto out_unreg; 1281 goto out_unreg;
@@ -1261,11 +1295,9 @@ int mod_sysfs_setup(struct module *mod,
1261out_unreg_param: 1295out_unreg_param:
1262 module_param_sysfs_remove(mod); 1296 module_param_sysfs_remove(mod);
1263out_unreg_holders: 1297out_unreg_holders:
1264 kobject_unregister(mod->holders_dir); 1298 kobject_put(mod->holders_dir);
1265out_unreg: 1299out_unreg:
1266 kobject_del(&mod->mkobj.kobj);
1267 kobject_put(&mod->mkobj.kobj); 1300 kobject_put(&mod->mkobj.kobj);
1268out:
1269 return err; 1301 return err;
1270} 1302}
1271#endif 1303#endif
@@ -1274,9 +1306,20 @@ static void mod_kobject_remove(struct module *mod)
1274{ 1306{
1275 module_remove_modinfo_attrs(mod); 1307 module_remove_modinfo_attrs(mod);
1276 module_param_sysfs_remove(mod); 1308 module_param_sysfs_remove(mod);
1277 kobject_unregister(mod->mkobj.drivers_dir); 1309 kobject_put(mod->mkobj.drivers_dir);
1278 kobject_unregister(mod->holders_dir); 1310 kobject_put(mod->holders_dir);
1279 kobject_unregister(&mod->mkobj.kobj); 1311 kobject_put(&mod->mkobj.kobj);
1312}
1313
1314/*
1315 * link the module with the whole machine is stopped with interrupts off
1316 * - this defends against kallsyms not taking locks
1317 */
1318static int __link_module(void *_mod)
1319{
1320 struct module *mod = _mod;
1321 list_add(&mod->list, &modules);
1322 return 0;
1280} 1323}
1281 1324
1282/* 1325/*
@@ -1328,7 +1371,7 @@ void *__symbol_get(const char *symbol)
1328 1371
1329 preempt_disable(); 1372 preempt_disable();
1330 value = __find_symbol(symbol, &owner, &crc, 1); 1373 value = __find_symbol(symbol, &owner, &crc, 1);
1331 if (value && !strong_try_module_get(owner)) 1374 if (value && strong_try_module_get(owner) != 0)
1332 value = 0; 1375 value = 0;
1333 preempt_enable(); 1376 preempt_enable();
1334 1377
@@ -1369,7 +1412,7 @@ dup:
1369 return ret; 1412 return ret;
1370} 1413}
1371 1414
1372/* Change all symbols so that sh_value encodes the pointer directly. */ 1415/* Change all symbols so that st_value encodes the pointer directly. */
1373static int simplify_symbols(Elf_Shdr *sechdrs, 1416static int simplify_symbols(Elf_Shdr *sechdrs,
1374 unsigned int symindex, 1417 unsigned int symindex,
1375 const char *strtab, 1418 const char *strtab,
@@ -1882,16 +1925,16 @@ static struct module *load_module(void __user *umod,
1882 /* Now we've moved module, initialize linked lists, etc. */ 1925 /* Now we've moved module, initialize linked lists, etc. */
1883 module_unload_init(mod); 1926 module_unload_init(mod);
1884 1927
1885 /* Initialize kobject, so we can reference it. */ 1928 /* add kobject, so we can reference it. */
1886 err = mod_sysfs_init(mod); 1929 err = mod_sysfs_init(mod);
1887 if (err) 1930 if (err)
1888 goto cleanup; 1931 goto free_unload;
1889 1932
1890 /* Set up license info based on the info section */ 1933 /* Set up license info based on the info section */
1891 set_license(mod, get_modinfo(sechdrs, infoindex, "license")); 1934 set_license(mod, get_modinfo(sechdrs, infoindex, "license"));
1892 1935
1893 if (strcmp(mod->name, "ndiswrapper") == 0) 1936 if (strcmp(mod->name, "ndiswrapper") == 0)
1894 add_taint(TAINT_PROPRIETARY_MODULE); 1937 add_taint_module(mod, TAINT_PROPRIETARY_MODULE);
1895 if (strcmp(mod->name, "driverloader") == 0) 1938 if (strcmp(mod->name, "driverloader") == 0)
1896 add_taint_module(mod, TAINT_PROPRIETARY_MODULE); 1939 add_taint_module(mod, TAINT_PROPRIETARY_MODULE);
1897 1940
@@ -2021,6 +2064,11 @@ static struct module *load_module(void __user *umod,
2021 printk(KERN_WARNING "%s: Ignoring obsolete parameters\n", 2064 printk(KERN_WARNING "%s: Ignoring obsolete parameters\n",
2022 mod->name); 2065 mod->name);
2023 2066
2067 /* Now sew it into the lists so we can get lockdep and oops
2068 * info during argument parsing. Noone should access us, since
2069 * strong_try_module_get() will fail. */
2070 stop_machine_run(__link_module, mod, NR_CPUS);
2071
2024 /* Size of section 0 is 0, so this works well if no params */ 2072 /* Size of section 0 is 0, so this works well if no params */
2025 err = parse_args(mod->name, mod->args, 2073 err = parse_args(mod->name, mod->args,
2026 (struct kernel_param *) 2074 (struct kernel_param *)
@@ -2029,7 +2077,7 @@ static struct module *load_module(void __user *umod,
2029 / sizeof(struct kernel_param), 2077 / sizeof(struct kernel_param),
2030 NULL); 2078 NULL);
2031 if (err < 0) 2079 if (err < 0)
2032 goto arch_cleanup; 2080 goto unlink;
2033 2081
2034 err = mod_sysfs_setup(mod, 2082 err = mod_sysfs_setup(mod,
2035 (struct kernel_param *) 2083 (struct kernel_param *)
@@ -2037,7 +2085,7 @@ static struct module *load_module(void __user *umod,
2037 sechdrs[setupindex].sh_size 2085 sechdrs[setupindex].sh_size
2038 / sizeof(struct kernel_param)); 2086 / sizeof(struct kernel_param));
2039 if (err < 0) 2087 if (err < 0)
2040 goto arch_cleanup; 2088 goto unlink;
2041 add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs); 2089 add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs);
2042 add_notes_attrs(mod, hdr->e_shnum, secstrings, sechdrs); 2090 add_notes_attrs(mod, hdr->e_shnum, secstrings, sechdrs);
2043 2091
@@ -2052,9 +2100,13 @@ static struct module *load_module(void __user *umod,
2052 /* Done! */ 2100 /* Done! */
2053 return mod; 2101 return mod;
2054 2102
2055 arch_cleanup: 2103 unlink:
2104 stop_machine_run(__unlink_module, mod, NR_CPUS);
2056 module_arch_cleanup(mod); 2105 module_arch_cleanup(mod);
2057 cleanup: 2106 cleanup:
2107 kobject_del(&mod->mkobj.kobj);
2108 kobject_put(&mod->mkobj.kobj);
2109 free_unload:
2058 module_unload_free(mod); 2110 module_unload_free(mod);
2059 module_free(mod, mod->module_init); 2111 module_free(mod, mod->module_init);
2060 free_core: 2112 free_core:
@@ -2074,17 +2126,6 @@ static struct module *load_module(void __user *umod,
2074 goto free_hdr; 2126 goto free_hdr;
2075} 2127}
2076 2128
2077/*
2078 * link the module with the whole machine is stopped with interrupts off
2079 * - this defends against kallsyms not taking locks
2080 */
2081static int __link_module(void *_mod)
2082{
2083 struct module *mod = _mod;
2084 list_add(&mod->list, &modules);
2085 return 0;
2086}
2087
2088/* This is where the real work happens */ 2129/* This is where the real work happens */
2089asmlinkage long 2130asmlinkage long
2090sys_init_module(void __user *umod, 2131sys_init_module(void __user *umod,
@@ -2109,10 +2150,6 @@ sys_init_module(void __user *umod,
2109 return PTR_ERR(mod); 2150 return PTR_ERR(mod);
2110 } 2151 }
2111 2152
2112 /* Now sew it into the lists. They won't access us, since
2113 strong_try_module_get() will fail. */
2114 stop_machine_run(__link_module, mod, NR_CPUS);
2115
2116 /* Drop lock so they can recurse */ 2153 /* Drop lock so they can recurse */
2117 mutex_unlock(&module_mutex); 2154 mutex_unlock(&module_mutex);
2118 2155
@@ -2131,6 +2168,7 @@ sys_init_module(void __user *umod,
2131 mutex_lock(&module_mutex); 2168 mutex_lock(&module_mutex);
2132 free_module(mod); 2169 free_module(mod);
2133 mutex_unlock(&module_mutex); 2170 mutex_unlock(&module_mutex);
2171 wake_up(&module_wq);
2134 return ret; 2172 return ret;
2135 } 2173 }
2136 2174
@@ -2145,6 +2183,7 @@ sys_init_module(void __user *umod,
2145 mod->init_size = 0; 2183 mod->init_size = 0;
2146 mod->init_text_size = 0; 2184 mod->init_text_size = 0;
2147 mutex_unlock(&module_mutex); 2185 mutex_unlock(&module_mutex);
2186 wake_up(&module_wq);
2148 2187
2149 return 0; 2188 return 0;
2150} 2189}
@@ -2209,32 +2248,41 @@ static const char *get_ksymbol(struct module *mod,
2209 return mod->strtab + mod->symtab[best].st_name; 2248 return mod->strtab + mod->symtab[best].st_name;
2210} 2249}
2211 2250
2212/* For kallsyms to ask for address resolution. NULL means not found. 2251/* For kallsyms to ask for address resolution. NULL means not found. Careful
2213 We don't lock, as this is used for oops resolution and races are a 2252 * not to lock to avoid deadlock on oopses, simply disable preemption. */
2214 lesser concern. */ 2253char *module_address_lookup(unsigned long addr,
2215const char *module_address_lookup(unsigned long addr, 2254 unsigned long *size,
2216 unsigned long *size, 2255 unsigned long *offset,
2217 unsigned long *offset, 2256 char **modname,
2218 char **modname) 2257 char *namebuf)
2219{ 2258{
2220 struct module *mod; 2259 struct module *mod;
2260 const char *ret = NULL;
2221 2261
2262 preempt_disable();
2222 list_for_each_entry(mod, &modules, list) { 2263 list_for_each_entry(mod, &modules, list) {
2223 if (within(addr, mod->module_init, mod->init_size) 2264 if (within(addr, mod->module_init, mod->init_size)
2224 || within(addr, mod->module_core, mod->core_size)) { 2265 || within(addr, mod->module_core, mod->core_size)) {
2225 if (modname) 2266 if (modname)
2226 *modname = mod->name; 2267 *modname = mod->name;
2227 return get_ksymbol(mod, addr, size, offset); 2268 ret = get_ksymbol(mod, addr, size, offset);
2269 break;
2228 } 2270 }
2229 } 2271 }
2230 return NULL; 2272 /* Make a copy in here where it's safe */
2273 if (ret) {
2274 strncpy(namebuf, ret, KSYM_NAME_LEN - 1);
2275 ret = namebuf;
2276 }
2277 preempt_enable();
2278 return (char *)ret;
2231} 2279}
2232 2280
2233int lookup_module_symbol_name(unsigned long addr, char *symname) 2281int lookup_module_symbol_name(unsigned long addr, char *symname)
2234{ 2282{
2235 struct module *mod; 2283 struct module *mod;
2236 2284
2237 mutex_lock(&module_mutex); 2285 preempt_disable();
2238 list_for_each_entry(mod, &modules, list) { 2286 list_for_each_entry(mod, &modules, list) {
2239 if (within(addr, mod->module_init, mod->init_size) || 2287 if (within(addr, mod->module_init, mod->init_size) ||
2240 within(addr, mod->module_core, mod->core_size)) { 2288 within(addr, mod->module_core, mod->core_size)) {
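module_address_lookup() changes signature here: the caller supplies a namebuf of at least KSYM_NAME_LEN bytes, the lookup runs with preemption disabled, and the symbol name is copied into the caller's buffer so the returned pointer remains usable after the module list may have changed. A hedged usage sketch of the new prototype (the printing helper around it is illustrative only):

#include <linux/kallsyms.h>
#include <linux/module.h>
#include <linux/kernel.h>

/* Resolve an address that may live in a module and print it. */
static void show_module_symbol(unsigned long addr)
{
        char namebuf[KSYM_NAME_LEN];
        unsigned long size, offset;
        char *modname = NULL;
        char *name;

        name = module_address_lookup(addr, &size, &offset, &modname, namebuf);
        if (name)
                printk(KERN_DEBUG "%lx is %s+%#lx [%s]\n",
                       addr, name, offset, modname);
        else
                printk(KERN_DEBUG "%lx not found in any module\n", addr);
}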
@@ -2244,12 +2292,12 @@ int lookup_module_symbol_name(unsigned long addr, char *symname)
2244 if (!sym) 2292 if (!sym)
2245 goto out; 2293 goto out;
2246 strlcpy(symname, sym, KSYM_NAME_LEN); 2294 strlcpy(symname, sym, KSYM_NAME_LEN);
2247 mutex_unlock(&module_mutex); 2295 preempt_enable();
2248 return 0; 2296 return 0;
2249 } 2297 }
2250 } 2298 }
2251out: 2299out:
2252 mutex_unlock(&module_mutex); 2300 preempt_enable();
2253 return -ERANGE; 2301 return -ERANGE;
2254} 2302}
2255 2303
@@ -2258,7 +2306,7 @@ int lookup_module_symbol_attrs(unsigned long addr, unsigned long *size,
2258{ 2306{
2259 struct module *mod; 2307 struct module *mod;
2260 2308
2261 mutex_lock(&module_mutex); 2309 preempt_disable();
2262 list_for_each_entry(mod, &modules, list) { 2310 list_for_each_entry(mod, &modules, list) {
2263 if (within(addr, mod->module_init, mod->init_size) || 2311 if (within(addr, mod->module_init, mod->init_size) ||
2264 within(addr, mod->module_core, mod->core_size)) { 2312 within(addr, mod->module_core, mod->core_size)) {
@@ -2271,12 +2319,12 @@ int lookup_module_symbol_attrs(unsigned long addr, unsigned long *size,
2271 strlcpy(modname, mod->name, MODULE_NAME_LEN); 2319 strlcpy(modname, mod->name, MODULE_NAME_LEN);
2272 if (name) 2320 if (name)
2273 strlcpy(name, sym, KSYM_NAME_LEN); 2321 strlcpy(name, sym, KSYM_NAME_LEN);
2274 mutex_unlock(&module_mutex); 2322 preempt_enable();
2275 return 0; 2323 return 0;
2276 } 2324 }
2277 } 2325 }
2278out: 2326out:
2279 mutex_unlock(&module_mutex); 2327 preempt_enable();
2280 return -ERANGE; 2328 return -ERANGE;
2281} 2329}
2282 2330
@@ -2285,7 +2333,7 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
2285{ 2333{
2286 struct module *mod; 2334 struct module *mod;
2287 2335
2288 mutex_lock(&module_mutex); 2336 preempt_disable();
2289 list_for_each_entry(mod, &modules, list) { 2337 list_for_each_entry(mod, &modules, list) {
2290 if (symnum < mod->num_symtab) { 2338 if (symnum < mod->num_symtab) {
2291 *value = mod->symtab[symnum].st_value; 2339 *value = mod->symtab[symnum].st_value;
@@ -2294,12 +2342,12 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
2294 KSYM_NAME_LEN); 2342 KSYM_NAME_LEN);
2295 strlcpy(module_name, mod->name, MODULE_NAME_LEN); 2343 strlcpy(module_name, mod->name, MODULE_NAME_LEN);
2296 *exported = is_exported(name, mod); 2344 *exported = is_exported(name, mod);
2297 mutex_unlock(&module_mutex); 2345 preempt_enable();
2298 return 0; 2346 return 0;
2299 } 2347 }
2300 symnum -= mod->num_symtab; 2348 symnum -= mod->num_symtab;
2301 } 2349 }
2302 mutex_unlock(&module_mutex); 2350 preempt_enable();
2303 return -ERANGE; 2351 return -ERANGE;
2304} 2352}
2305 2353
@@ -2322,6 +2370,7 @@ unsigned long module_kallsyms_lookup_name(const char *name)
2322 unsigned long ret = 0; 2370 unsigned long ret = 0;
2323 2371
2324 /* Don't lock: we're in enough trouble already. */ 2372 /* Don't lock: we're in enough trouble already. */
2373 preempt_disable();
2325 if ((colon = strchr(name, ':')) != NULL) { 2374 if ((colon = strchr(name, ':')) != NULL) {
2326 *colon = '\0'; 2375 *colon = '\0';
2327 if ((mod = find_module(name)) != NULL) 2376 if ((mod = find_module(name)) != NULL)
@@ -2332,6 +2381,7 @@ unsigned long module_kallsyms_lookup_name(const char *name)
2332 if ((ret = mod_find_symname(mod, name)) != 0) 2381 if ((ret = mod_find_symname(mod, name)) != 0)
2333 break; 2382 break;
2334 } 2383 }
2384 preempt_enable();
2335 return ret; 2385 return ret;
2336} 2386}
2337#endif /* CONFIG_KALLSYMS */ 2387#endif /* CONFIG_KALLSYMS */
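The kallsyms helpers above trade mutex_lock(&module_mutex) for preempt_disable()/preempt_enable(): the modules list is only modified while every CPU is stopped (see __link_module() and the __unlink_module() call in the error path), so a reader merely needs to keep its CPU from being preempted, which is safe even in oops context where taking a mutex is not. A generic sketch of that read-side pattern (struct item and item_list are hypothetical):

#include <linux/preempt.h>
#include <linux/list.h>

struct item {
        struct list_head list;
        unsigned long key;
};

/* Writers only mutate item_list under stop_machine, so readers just
 * disable preemption for the duration of the walk. */
static LIST_HEAD(item_list);

static struct item *find_item(unsigned long key)
{
        struct item *it, *found = NULL;

        preempt_disable();
        list_for_each_entry(it, &item_list, list) {
                if (it->key == key) {
                        found = it;
                        break;
                }
        }
        preempt_enable();
        return found;   /* only safe to dereference under the same rules */
}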
@@ -2353,21 +2403,30 @@ static void m_stop(struct seq_file *m, void *p)
2353 mutex_unlock(&module_mutex); 2403 mutex_unlock(&module_mutex);
2354} 2404}
2355 2405
2356static char *taint_flags(unsigned int taints, char *buf) 2406static char *module_flags(struct module *mod, char *buf)
2357{ 2407{
2358 int bx = 0; 2408 int bx = 0;
2359 2409
2360 if (taints) { 2410 if (mod->taints ||
2411 mod->state == MODULE_STATE_GOING ||
2412 mod->state == MODULE_STATE_COMING) {
2361 buf[bx++] = '('; 2413 buf[bx++] = '(';
2362 if (taints & TAINT_PROPRIETARY_MODULE) 2414 if (mod->taints & TAINT_PROPRIETARY_MODULE)
2363 buf[bx++] = 'P'; 2415 buf[bx++] = 'P';
2364 if (taints & TAINT_FORCED_MODULE) 2416 if (mod->taints & TAINT_FORCED_MODULE)
2365 buf[bx++] = 'F'; 2417 buf[bx++] = 'F';
2366 /* 2418 /*
2367 * TAINT_FORCED_RMMOD: could be added. 2419 * TAINT_FORCED_RMMOD: could be added.
2368 * TAINT_UNSAFE_SMP, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't 2420 * TAINT_UNSAFE_SMP, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't
2369 * apply to modules. 2421 * apply to modules.
2370 */ 2422 */
2423
2424 /* Show a - for module-is-being-unloaded */
2425 if (mod->state == MODULE_STATE_GOING)
2426 buf[bx++] = '-';
2427 /* Show a + for module-is-being-loaded */
2428 if (mod->state == MODULE_STATE_COMING)
2429 buf[bx++] = '+';
2371 buf[bx++] = ')'; 2430 buf[bx++] = ')';
2372 } 2431 }
2373 buf[bx] = '\0'; 2432 buf[bx] = '\0';
@@ -2394,7 +2453,7 @@ static int m_show(struct seq_file *m, void *p)
2394 2453
2395 /* Taints info */ 2454 /* Taints info */
2396 if (mod->taints) 2455 if (mod->taints)
2397 seq_printf(m, " %s", taint_flags(mod->taints, buf)); 2456 seq_printf(m, " %s", module_flags(mod, buf));
2398 2457
2399 seq_printf(m, "\n"); 2458 seq_printf(m, "\n");
2400 return 0; 2459 return 0;
@@ -2489,97 +2548,12 @@ void print_modules(void)
2489 2548
2490 printk("Modules linked in:"); 2549 printk("Modules linked in:");
2491 list_for_each_entry(mod, &modules, list) 2550 list_for_each_entry(mod, &modules, list)
2492 printk(" %s%s", mod->name, taint_flags(mod->taints, buf)); 2551 printk(" %s%s", mod->name, module_flags(mod, buf));
2552 if (last_unloaded_module[0])
2553 printk(" [last unloaded: %s]", last_unloaded_module);
2493 printk("\n"); 2554 printk("\n");
2494} 2555}
2495 2556
2496#ifdef CONFIG_SYSFS
2497static char *make_driver_name(struct device_driver *drv)
2498{
2499 char *driver_name;
2500
2501 driver_name = kmalloc(strlen(drv->name) + strlen(drv->bus->name) + 2,
2502 GFP_KERNEL);
2503 if (!driver_name)
2504 return NULL;
2505
2506 sprintf(driver_name, "%s:%s", drv->bus->name, drv->name);
2507 return driver_name;
2508}
2509
2510static void module_create_drivers_dir(struct module_kobject *mk)
2511{
2512 if (!mk || mk->drivers_dir)
2513 return;
2514
2515 mk->drivers_dir = kobject_add_dir(&mk->kobj, "drivers");
2516}
2517
2518void module_add_driver(struct module *mod, struct device_driver *drv)
2519{
2520 char *driver_name;
2521 int no_warn;
2522 struct module_kobject *mk = NULL;
2523
2524 if (!drv)
2525 return;
2526
2527 if (mod)
2528 mk = &mod->mkobj;
2529 else if (drv->mod_name) {
2530 struct kobject *mkobj;
2531
2532 /* Lookup built-in module entry in /sys/modules */
2533 mkobj = kset_find_obj(&module_subsys, drv->mod_name);
2534 if (mkobj) {
2535 mk = container_of(mkobj, struct module_kobject, kobj);
2536 /* remember our module structure */
2537 drv->mkobj = mk;
2538 /* kset_find_obj took a reference */
2539 kobject_put(mkobj);
2540 }
2541 }
2542
2543 if (!mk)
2544 return;
2545
2546 /* Don't check return codes; these calls are idempotent */
2547 no_warn = sysfs_create_link(&drv->kobj, &mk->kobj, "module");
2548 driver_name = make_driver_name(drv);
2549 if (driver_name) {
2550 module_create_drivers_dir(mk);
2551 no_warn = sysfs_create_link(mk->drivers_dir, &drv->kobj,
2552 driver_name);
2553 kfree(driver_name);
2554 }
2555}
2556EXPORT_SYMBOL(module_add_driver);
2557
2558void module_remove_driver(struct device_driver *drv)
2559{
2560 struct module_kobject *mk = NULL;
2561 char *driver_name;
2562
2563 if (!drv)
2564 return;
2565
2566 sysfs_remove_link(&drv->kobj, "module");
2567
2568 if (drv->owner)
2569 mk = &drv->owner->mkobj;
2570 else if (drv->mkobj)
2571 mk = drv->mkobj;
2572 if (mk && mk->drivers_dir) {
2573 driver_name = make_driver_name(drv);
2574 if (driver_name) {
2575 sysfs_remove_link(mk->drivers_dir, driver_name);
2576 kfree(driver_name);
2577 }
2578 }
2579}
2580EXPORT_SYMBOL(module_remove_driver);
2581#endif
2582
2583#ifdef CONFIG_MODVERSIONS 2557#ifdef CONFIG_MODVERSIONS
2584/* Generate the signature for struct module here, too, for modversions. */ 2558/* Generate the signature for struct module here, too, for modversions. */
2585void struct_module(struct module *mod) { return; } 2559void struct_module(struct module *mod) { return; }
diff --git a/kernel/mutex.c b/kernel/mutex.c
index d7fe50cc556f..d9ec9b666250 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -166,9 +166,12 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
166 * got a signal? (This code gets eliminated in the 166 * got a signal? (This code gets eliminated in the
167 * TASK_UNINTERRUPTIBLE case.) 167 * TASK_UNINTERRUPTIBLE case.)
168 */ 168 */
169 if (unlikely(state == TASK_INTERRUPTIBLE && 169 if (unlikely((state == TASK_INTERRUPTIBLE &&
170 signal_pending(task))) { 170 signal_pending(task)) ||
171 mutex_remove_waiter(lock, &waiter, task_thread_info(task)); 171 (state == TASK_KILLABLE &&
172 fatal_signal_pending(task)))) {
173 mutex_remove_waiter(lock, &waiter,
174 task_thread_info(task));
172 mutex_release(&lock->dep_map, 1, ip); 175 mutex_release(&lock->dep_map, 1, ip);
173 spin_unlock_mutex(&lock->wait_lock, flags); 176 spin_unlock_mutex(&lock->wait_lock, flags);
174 177
@@ -211,6 +214,14 @@ mutex_lock_nested(struct mutex *lock, unsigned int subclass)
211EXPORT_SYMBOL_GPL(mutex_lock_nested); 214EXPORT_SYMBOL_GPL(mutex_lock_nested);
212 215
213int __sched 216int __sched
217mutex_lock_killable_nested(struct mutex *lock, unsigned int subclass)
218{
219 might_sleep();
220 return __mutex_lock_common(lock, TASK_KILLABLE, subclass, _RET_IP_);
221}
222EXPORT_SYMBOL_GPL(mutex_lock_killable_nested);
223
224int __sched
214mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass) 225mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass)
215{ 226{
216 might_sleep(); 227 might_sleep();
@@ -272,6 +283,9 @@ __mutex_unlock_slowpath(atomic_t *lock_count)
272 * mutex_lock_interruptible() and mutex_trylock(). 283 * mutex_lock_interruptible() and mutex_trylock().
273 */ 284 */
274static int fastcall noinline __sched 285static int fastcall noinline __sched
286__mutex_lock_killable_slowpath(atomic_t *lock_count);
287
288static noinline int fastcall __sched
275__mutex_lock_interruptible_slowpath(atomic_t *lock_count); 289__mutex_lock_interruptible_slowpath(atomic_t *lock_count);
276 290
277/*** 291/***
@@ -294,6 +308,14 @@ int fastcall __sched mutex_lock_interruptible(struct mutex *lock)
294 308
295EXPORT_SYMBOL(mutex_lock_interruptible); 309EXPORT_SYMBOL(mutex_lock_interruptible);
296 310
311int fastcall __sched mutex_lock_killable(struct mutex *lock)
312{
313 might_sleep();
314 return __mutex_fastpath_lock_retval
315 (&lock->count, __mutex_lock_killable_slowpath);
316}
317EXPORT_SYMBOL(mutex_lock_killable);
318
297static void fastcall noinline __sched 319static void fastcall noinline __sched
298__mutex_lock_slowpath(atomic_t *lock_count) 320__mutex_lock_slowpath(atomic_t *lock_count)
299{ 321{
@@ -303,6 +325,14 @@ __mutex_lock_slowpath(atomic_t *lock_count)
303} 325}
304 326
305static int fastcall noinline __sched 327static int fastcall noinline __sched
328__mutex_lock_killable_slowpath(atomic_t *lock_count)
329{
330 struct mutex *lock = container_of(lock_count, struct mutex, count);
331
332 return __mutex_lock_common(lock, TASK_KILLABLE, 0, _RET_IP_);
333}
334
335static noinline int fastcall __sched
306__mutex_lock_interruptible_slowpath(atomic_t *lock_count) 336__mutex_lock_interruptible_slowpath(atomic_t *lock_count)
307{ 337{
308 struct mutex *lock = container_of(lock_count, struct mutex, count); 338 struct mutex *lock = container_of(lock_count, struct mutex, count);
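mutex.c gains a TASK_KILLABLE variant: mutex_lock_killable() behaves like mutex_lock_interruptible(), except that only a fatal signal (checked via fatal_signal_pending() in the common slowpath above) aborts the wait. A hedged usage sketch (my_mutex and the surrounding function are placeholders):

#include <linux/mutex.h>
#include <linux/errno.h>

static DEFINE_MUTEX(my_mutex);

static int do_locked_work(void)
{
        int ret;

        /*
         * Returns 0 with the mutex held, or -EINTR if the task received
         * a fatal signal while sleeping; ordinary signals do not abort
         * the wait, unlike mutex_lock_interruptible().
         */
        ret = mutex_lock_killable(&my_mutex);
        if (ret)
                return ret;

        /* ... critical section ... */

        mutex_unlock(&my_mutex);
        return 0;
}

The point of the killable variant is to let an unkillable-looking task still die on SIGKILL without forcing every caller to cope with restarts caused by harmless signals.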
diff --git a/kernel/panic.c b/kernel/panic.c
index 6f6e03e91595..d9e90cfe3298 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -19,6 +19,8 @@
19#include <linux/nmi.h> 19#include <linux/nmi.h>
20#include <linux/kexec.h> 20#include <linux/kexec.h>
21#include <linux/debug_locks.h> 21#include <linux/debug_locks.h>
22#include <linux/random.h>
23#include <linux/kallsyms.h>
22 24
23int panic_on_oops; 25int panic_on_oops;
24int tainted; 26int tainted;
@@ -266,13 +268,52 @@ void oops_enter(void)
266} 268}
267 269
268/* 270/*
271 * 64-bit random ID for oopses:
272 */
273static u64 oops_id;
274
275static int init_oops_id(void)
276{
277 if (!oops_id)
278 get_random_bytes(&oops_id, sizeof(oops_id));
279
280 return 0;
281}
282late_initcall(init_oops_id);
283
284static void print_oops_end_marker(void)
285{
286 init_oops_id();
287 printk(KERN_WARNING "---[ end trace %016llx ]---\n",
288 (unsigned long long)oops_id);
289}
290
291/*
269 * Called when the architecture exits its oops handler, after printing 292 * Called when the architecture exits its oops handler, after printing
270 * everything. 293 * everything.
271 */ 294 */
272void oops_exit(void) 295void oops_exit(void)
273{ 296{
274 do_oops_enter_exit(); 297 do_oops_enter_exit();
298 print_oops_end_marker();
299}
300
301#ifdef WANT_WARN_ON_SLOWPATH
302void warn_on_slowpath(const char *file, int line)
303{
304 char function[KSYM_SYMBOL_LEN];
305 unsigned long caller = (unsigned long) __builtin_return_address(0);
306 sprint_symbol(function, caller);
307
308 printk(KERN_WARNING "------------[ cut here ]------------\n");
309 printk(KERN_WARNING "WARNING: at %s:%d %s()\n", file,
310 line, function);
311 print_modules();
312 dump_stack();
313 print_oops_end_marker();
275} 314}
315EXPORT_SYMBOL(warn_on_slowpath);
316#endif
276 317
277#ifdef CONFIG_CC_STACKPROTECTOR 318#ifdef CONFIG_CC_STACKPROTECTOR
278/* 319/*
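panic.c now ends every oops and WARN_ON with a per-boot random "end trace" marker, generated lazily with get_random_bytes(), and warn_on_slowpath() resolves its caller with __builtin_return_address() plus sprint_symbol(). A small sketch of the caller-resolution part in isolation (report_caller() is a hypothetical name):

#include <linux/kallsyms.h>
#include <linux/kernel.h>

/* Print "file:line caller()" the way warn_on_slowpath() does above. */
static void report_caller(const char *file, int line)
{
        char function[KSYM_SYMBOL_LEN];
        unsigned long caller = (unsigned long)__builtin_return_address(0);

        sprint_symbol(function, caller);
        printk(KERN_WARNING "WARNING: at %s:%d %s()\n", file, line, function);
}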
diff --git a/kernel/params.c b/kernel/params.c
index 16f269e9ddc9..42fe5e6126c0 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -376,8 +376,6 @@ int param_get_string(char *buffer, struct kernel_param *kp)
376 376
377extern struct kernel_param __start___param[], __stop___param[]; 377extern struct kernel_param __start___param[], __stop___param[];
378 378
379#define MAX_KBUILD_MODNAME KOBJ_NAME_LEN
380
381struct param_attribute 379struct param_attribute
382{ 380{
383 struct module_attribute mattr; 381 struct module_attribute mattr;
@@ -472,7 +470,7 @@ param_sysfs_setup(struct module_kobject *mk,
472 sizeof(mp->grp.attrs[0])); 470 sizeof(mp->grp.attrs[0]));
473 size[1] = (valid_attrs + 1) * sizeof(mp->grp.attrs[0]); 471 size[1] = (valid_attrs + 1) * sizeof(mp->grp.attrs[0]);
474 472
475 mp = kmalloc(size[0] + size[1], GFP_KERNEL); 473 mp = kzalloc(size[0] + size[1], GFP_KERNEL);
476 if (!mp) 474 if (!mp)
477 return ERR_PTR(-ENOMEM); 475 return ERR_PTR(-ENOMEM);
478 476
@@ -560,11 +558,10 @@ static void __init kernel_param_sysfs_setup(const char *name,
560 BUG_ON(!mk); 558 BUG_ON(!mk);
561 559
562 mk->mod = THIS_MODULE; 560 mk->mod = THIS_MODULE;
563 kobj_set_kset_s(mk, module_subsys); 561 mk->kobj.kset = module_kset;
564 kobject_set_name(&mk->kobj, name); 562 ret = kobject_init_and_add(&mk->kobj, &module_ktype, NULL, "%s", name);
565 kobject_init(&mk->kobj);
566 ret = kobject_add(&mk->kobj);
567 if (ret) { 563 if (ret) {
564 kobject_put(&mk->kobj);
568 printk(KERN_ERR "Module '%s' failed to be added to sysfs, " 565 printk(KERN_ERR "Module '%s' failed to be added to sysfs, "
569 "error number %d\n", name, ret); 566 "error number %d\n", name, ret);
570 printk(KERN_ERR "The system will be unstable now.\n"); 567 printk(KERN_ERR "The system will be unstable now.\n");
@@ -588,23 +585,20 @@ static void __init param_sysfs_builtin(void)
588{ 585{
589 struct kernel_param *kp, *kp_begin = NULL; 586 struct kernel_param *kp, *kp_begin = NULL;
590 unsigned int i, name_len, count = 0; 587 unsigned int i, name_len, count = 0;
591 char modname[MAX_KBUILD_MODNAME + 1] = ""; 588 char modname[MODULE_NAME_LEN + 1] = "";
592 589
593 for (i=0; i < __stop___param - __start___param; i++) { 590 for (i=0; i < __stop___param - __start___param; i++) {
594 char *dot; 591 char *dot;
595 size_t kplen; 592 size_t max_name_len;
596 593
597 kp = &__start___param[i]; 594 kp = &__start___param[i];
598 kplen = strlen(kp->name); 595 max_name_len =
596 min_t(size_t, MODULE_NAME_LEN, strlen(kp->name));
599 597
600 /* We do not handle args without periods. */ 598 dot = memchr(kp->name, '.', max_name_len);
601 if (kplen > MAX_KBUILD_MODNAME) {
602 DEBUGP("kernel parameter name is too long: %s\n", kp->name);
603 continue;
604 }
605 dot = memchr(kp->name, '.', kplen);
606 if (!dot) { 599 if (!dot) {
607 DEBUGP("couldn't find period in %s\n", kp->name); 600 DEBUGP("couldn't find period in first %d characters "
601 "of %s\n", MODULE_NAME_LEN, kp->name);
608 continue; 602 continue;
609 } 603 }
610 name_len = dot - kp->name; 604 name_len = dot - kp->name;
@@ -682,8 +676,6 @@ static struct sysfs_ops module_sysfs_ops = {
682 .store = module_attr_store, 676 .store = module_attr_store,
683}; 677};
684 678
685static struct kobj_type module_ktype;
686
687static int uevent_filter(struct kset *kset, struct kobject *kobj) 679static int uevent_filter(struct kset *kset, struct kobject *kobj)
688{ 680{
689 struct kobj_type *ktype = get_ktype(kobj); 681 struct kobj_type *ktype = get_ktype(kobj);
@@ -697,10 +689,10 @@ static struct kset_uevent_ops module_uevent_ops = {
697 .filter = uevent_filter, 689 .filter = uevent_filter,
698}; 690};
699 691
700decl_subsys(module, &module_ktype, &module_uevent_ops); 692struct kset *module_kset;
701int module_sysfs_initialized; 693int module_sysfs_initialized;
702 694
703static struct kobj_type module_ktype = { 695struct kobj_type module_ktype = {
704 .sysfs_ops = &module_sysfs_ops, 696 .sysfs_ops = &module_sysfs_ops,
705}; 697};
706 698
@@ -709,13 +701,11 @@ static struct kobj_type module_ktype = {
709 */ 701 */
710static int __init param_sysfs_init(void) 702static int __init param_sysfs_init(void)
711{ 703{
712 int ret; 704 module_kset = kset_create_and_add("module", &module_uevent_ops, NULL);
713 705 if (!module_kset) {
714 ret = subsystem_register(&module_subsys); 706 printk(KERN_WARNING "%s (%d): error creating kset\n",
715 if (ret < 0) { 707 __FILE__, __LINE__);
716 printk(KERN_WARNING "%s (%d): subsystem_register error: %d\n", 708 return -ENOMEM;
717 __FILE__, __LINE__, ret);
718 return ret;
719 } 709 }
720 module_sysfs_initialized = 1; 710 module_sysfs_initialized = 1;
721 711
@@ -725,14 +715,7 @@ static int __init param_sysfs_init(void)
725} 715}
726subsys_initcall(param_sysfs_init); 716subsys_initcall(param_sysfs_init);
727 717
728#else 718#endif /* CONFIG_SYSFS */
729#if 0
730static struct sysfs_ops module_sysfs_ops = {
731 .show = NULL,
732 .store = NULL,
733};
734#endif
735#endif
736 719
737EXPORT_SYMBOL(param_set_byte); 720EXPORT_SYMBOL(param_set_byte);
738EXPORT_SYMBOL(param_get_byte); 721EXPORT_SYMBOL(param_get_byte);
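params.c replaces the decl_subsys()/subsystem_register() scheme with an explicit module_kset created at boot by kset_create_and_add(); built-in "module" kobjects are then added to it with kobject_init_and_add(), as in kernel_param_sysfs_setup() above. A hedged sketch of creating a top-level kset and one member object (all names here are assumptions, not the real module_kset code):

#include <linux/kobject.h>
#include <linux/init.h>
#include <linux/errno.h>

static struct kset *example_kset;
static struct kobject example_kobj;

static void example_release(struct kobject *kobj)
{
        /* static object: nothing to free */
}

static struct kobj_type example_ktype = {
        .release = example_release,
        /* .sysfs_ops and default attributes would go here in real code */
};

static int __init example_sysfs_init(void)
{
        int err;

        /* NULL parent and no uevent ops: creates /sys/example. */
        example_kset = kset_create_and_add("example", NULL, NULL);
        if (!example_kset)
                return -ENOMEM;

        example_kobj.kset = example_kset;
        err = kobject_init_and_add(&example_kobj, &example_ktype, NULL,
                                   "%s", "member");
        if (err)
                kobject_put(&example_kobj);
        return err;
}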
diff --git a/kernel/pid.c b/kernel/pid.c
index d1db36b94674..f815455431bf 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -537,6 +537,7 @@ err_alloc:
537 return NULL; 537 return NULL;
538} 538}
539 539
540#ifdef CONFIG_PID_NS
540static struct pid_namespace *create_pid_namespace(int level) 541static struct pid_namespace *create_pid_namespace(int level)
541{ 542{
542 struct pid_namespace *ns; 543 struct pid_namespace *ns;
@@ -621,6 +622,7 @@ void free_pid_ns(struct kref *kref)
621 if (parent != NULL) 622 if (parent != NULL)
622 put_pid_ns(parent); 623 put_pid_ns(parent);
623} 624}
625#endif /* CONFIG_PID_NS */
624 626
625void zap_pid_ns_processes(struct pid_namespace *pid_ns) 627void zap_pid_ns_processes(struct pid_namespace *pid_ns)
626{ 628{
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 68c96376e84a..0b7c82ac467e 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -967,6 +967,7 @@ static void check_thread_timers(struct task_struct *tsk,
967{ 967{
968 int maxfire; 968 int maxfire;
969 struct list_head *timers = tsk->cpu_timers; 969 struct list_head *timers = tsk->cpu_timers;
970 struct signal_struct *const sig = tsk->signal;
970 971
971 maxfire = 20; 972 maxfire = 20;
972 tsk->it_prof_expires = cputime_zero; 973 tsk->it_prof_expires = cputime_zero;
@@ -1011,6 +1012,35 @@ static void check_thread_timers(struct task_struct *tsk,
1011 t->firing = 1; 1012 t->firing = 1;
1012 list_move_tail(&t->entry, firing); 1013 list_move_tail(&t->entry, firing);
1013 } 1014 }
1015
1016 /*
1017 * Check for the special case thread timers.
1018 */
1019 if (sig->rlim[RLIMIT_RTTIME].rlim_cur != RLIM_INFINITY) {
1020 unsigned long hard = sig->rlim[RLIMIT_RTTIME].rlim_max;
1021 unsigned long *soft = &sig->rlim[RLIMIT_RTTIME].rlim_cur;
1022
1023 if (hard != RLIM_INFINITY &&
1024 tsk->rt.timeout > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) {
1025 /*
1026 * At the hard limit, we just die.
1027 * No need to calculate anything else now.
1028 */
1029 __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk);
1030 return;
1031 }
1032 if (tsk->rt.timeout > DIV_ROUND_UP(*soft, USEC_PER_SEC/HZ)) {
1033 /*
1034 * At the soft limit, send a SIGXCPU every second.
1035 */
1036 if (sig->rlim[RLIMIT_RTTIME].rlim_cur
1037 < sig->rlim[RLIMIT_RTTIME].rlim_max) {
1038 sig->rlim[RLIMIT_RTTIME].rlim_cur +=
1039 USEC_PER_SEC;
1040 }
1041 __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk);
1042 }
1043 }
1014} 1044}
1015 1045
1016/* 1046/*
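The new check in check_thread_timers() enforces RLIMIT_RTTIME, which is expressed in microseconds of realtime runtime, against tsk->rt.timeout, which counts scheduler ticks; DIV_ROUND_UP(limit, USEC_PER_SEC/HZ) converts the limit to ticks (rounding up) before the comparison. A small sketch of that conversion (rt_limit_exceeded() is a hypothetical helper, not kernel API):

#include <linux/kernel.h>       /* DIV_ROUND_UP */
#include <linux/time.h>         /* USEC_PER_SEC */
#include <linux/jiffies.h>      /* HZ */

/*
 * "limit_us" is an RLIMIT_RTTIME value in microseconds; "timeout_ticks"
 * is how many ticks the task has run as RT without sleeping
 * (tsk->rt.timeout in the hunk above).
 */
static int rt_limit_exceeded(unsigned long limit_us,
                             unsigned long timeout_ticks)
{
        unsigned long limit_ticks = DIV_ROUND_UP(limit_us, USEC_PER_SEC / HZ);

        return timeout_ticks > limit_ticks;
}

For example, with HZ=1000 each tick is 1000 microseconds, so a 5,000,000 us limit becomes 5000 ticks; crossing the soft limit sends SIGXCPU once per second, crossing the hard limit sends SIGKILL, as in the code above.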
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 8e186c678149..ef9b802738a5 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -44,9 +44,30 @@ config PM_VERBOSE
44 ---help--- 44 ---help---
45 This option enables verbose messages from the Power Management code. 45 This option enables verbose messages from the Power Management code.
46 46
47config CAN_PM_TRACE
48 def_bool y
49 depends on PM_DEBUG && PM_SLEEP && EXPERIMENTAL
50
47config PM_TRACE 51config PM_TRACE
52 bool
53 help
54 This enables code to save the last PM event point across
55 reboot. The architecture needs to support this, x86 for
56 example does by saving things in the RTC, see below.
57
58 The architecture specific code must provide the extern
59 functions from <linux/resume-trace.h> as well as the
60 <asm/resume-trace.h> header with a TRACE_RESUME() macro.
61
62 The way the information is presented is architecture-
63 dependent, x86 will print the information during a
64 late_initcall.
65
66config PM_TRACE_RTC
48 bool "Suspend/resume event tracing" 67 bool "Suspend/resume event tracing"
49 depends on PM_DEBUG && X86 && PM_SLEEP && EXPERIMENTAL 68 depends on CAN_PM_TRACE
69 depends on X86
70 select PM_TRACE
50 default n 71 default n
51 ---help--- 72 ---help---
52 This enables some cheesy code to save the last PM event point in the 73 This enables some cheesy code to save the last PM event point in the
@@ -63,7 +84,8 @@ config PM_TRACE
63 84
64config PM_SLEEP_SMP 85config PM_SLEEP_SMP
65 bool 86 bool
66 depends on SUSPEND_SMP_POSSIBLE || HIBERNATION_SMP_POSSIBLE 87 depends on SMP
88 depends on ARCH_SUSPEND_POSSIBLE || ARCH_HIBERNATION_POSSIBLE
67 depends on PM_SLEEP 89 depends on PM_SLEEP
68 select HOTPLUG_CPU 90 select HOTPLUG_CPU
69 default y 91 default y
@@ -73,46 +95,29 @@ config PM_SLEEP
73 depends on SUSPEND || HIBERNATION 95 depends on SUSPEND || HIBERNATION
74 default y 96 default y
75 97
76config SUSPEND_UP_POSSIBLE
77 bool
78 depends on (X86 && !X86_VOYAGER) || PPC || ARM || BLACKFIN || MIPS \
79 || SUPERH || FRV
80 depends on !SMP
81 default y
82
83config SUSPEND_SMP_POSSIBLE
84 bool
85 depends on (X86 && !X86_VOYAGER) \
86 || (PPC && (PPC_PSERIES || PPC_PMAC)) || ARM
87 depends on SMP
88 default y
89
90config SUSPEND 98config SUSPEND
91 bool "Suspend to RAM and standby" 99 bool "Suspend to RAM and standby"
92 depends on PM 100 depends on PM && ARCH_SUSPEND_POSSIBLE
93 depends on SUSPEND_UP_POSSIBLE || SUSPEND_SMP_POSSIBLE
94 default y 101 default y
95 ---help--- 102 ---help---
96 Allow the system to enter sleep states in which main memory is 103 Allow the system to enter sleep states in which main memory is
97 powered and thus its contents are preserved, such as the 104 powered and thus its contents are preserved, such as the
98 suspend-to-RAM state (i.e. the ACPI S3 state). 105 suspend-to-RAM state (e.g. the ACPI S3 state).
99 106
100config HIBERNATION_UP_POSSIBLE 107config SUSPEND_FREEZER
101 bool 108 bool "Enable freezer for suspend to RAM/standby" \
102 depends on X86 || PPC64_SWSUSP || PPC32 109 if ARCH_WANTS_FREEZER_CONTROL || BROKEN
103 depends on !SMP 110 depends on SUSPEND
104 default y 111 default y
112 help
113 This allows you to turn off the freezer for suspend. If this is
114 done, no tasks are frozen for suspend to RAM/standby.
105 115
106config HIBERNATION_SMP_POSSIBLE 116 Turning OFF this setting is NOT recommended! If in doubt, say Y.
107 bool
108 depends on (X86 && !X86_VOYAGER) || PPC64_SWSUSP
109 depends on SMP
110 default y
111 117
112config HIBERNATION 118config HIBERNATION
113 bool "Hibernation (aka 'suspend to disk')" 119 bool "Hibernation (aka 'suspend to disk')"
114 depends on PM && SWAP 120 depends on PM && SWAP && ARCH_HIBERNATION_POSSIBLE
115 depends on HIBERNATION_UP_POSSIBLE || HIBERNATION_SMP_POSSIBLE
116 ---help--- 121 ---help---
117 Enable the suspend to disk (STD) functionality, which is usually 122 Enable the suspend to disk (STD) functionality, which is usually
118 called "hibernation" in user interfaces. STD checkpoints the 123 called "hibernation" in user interfaces. STD checkpoints the
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index 8b15f777010a..d09da0895174 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -54,8 +54,8 @@ static struct platform_hibernation_ops *hibernation_ops;
54 54
55void hibernation_set_ops(struct platform_hibernation_ops *ops) 55void hibernation_set_ops(struct platform_hibernation_ops *ops)
56{ 56{
57 if (ops && !(ops->start && ops->pre_snapshot && ops->finish 57 if (ops && !(ops->begin && ops->end && ops->pre_snapshot
58 && ops->prepare && ops->enter && ops->pre_restore 58 && ops->prepare && ops->finish && ops->enter && ops->pre_restore
59 && ops->restore_cleanup)) { 59 && ops->restore_cleanup)) {
60 WARN_ON(1); 60 WARN_ON(1);
61 return; 61 return;
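hibernation_set_ops() now validates the renamed callback set: begin and end replace start, and all of pre_snapshot, prepare, finish, enter, pre_restore and restore_cleanup must be provided before the ops table is installed. A generic sketch of that "validate every hook, then install under the lock" idiom, using a hypothetical ops structure rather than the real struct platform_hibernation_ops:

#include <linux/kernel.h>
#include <linux/mutex.h>

/* Hypothetical ops table mirroring the shape of the check above. */
struct my_sleep_ops {
        int  (*begin)(void);
        void (*end)(void);
        int  (*prepare)(void);
        int  (*enter)(void);
        void (*finish)(void);
};

static struct my_sleep_ops *installed_ops;
static DEFINE_MUTEX(ops_mutex);

/* Accept the table only if every mandatory hook is present; a partial
 * table is a driver bug, so warn and keep the previous one. */
void my_set_sleep_ops(struct my_sleep_ops *ops)
{
        if (ops && !(ops->begin && ops->end && ops->prepare
            && ops->enter && ops->finish)) {
                WARN_ON(1);
                return;
        }
        mutex_lock(&ops_mutex);
        installed_ops = ops;
        mutex_unlock(&ops_mutex);
}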
@@ -70,15 +70,55 @@ void hibernation_set_ops(struct platform_hibernation_ops *ops)
70 mutex_unlock(&pm_mutex); 70 mutex_unlock(&pm_mutex);
71} 71}
72 72
73#ifdef CONFIG_PM_DEBUG
74static void hibernation_debug_sleep(void)
75{
76 printk(KERN_INFO "hibernation debug: Waiting for 5 seconds.\n");
77 mdelay(5000);
78}
79
80static int hibernation_testmode(int mode)
81{
82 if (hibernation_mode == mode) {
83 hibernation_debug_sleep();
84 return 1;
85 }
86 return 0;
87}
88
89static int hibernation_test(int level)
90{
91 if (pm_test_level == level) {
92 hibernation_debug_sleep();
93 return 1;
94 }
95 return 0;
96}
97#else /* !CONFIG_PM_DEBUG */
98static int hibernation_testmode(int mode) { return 0; }
99static int hibernation_test(int level) { return 0; }
100#endif /* !CONFIG_PM_DEBUG */
101
73/** 102/**
74 * platform_start - tell the platform driver that we're starting 103 * platform_begin - tell the platform driver that we're starting
75 * hibernation 104 * hibernation
76 */ 105 */
77 106
78static int platform_start(int platform_mode) 107static int platform_begin(int platform_mode)
79{ 108{
80 return (platform_mode && hibernation_ops) ? 109 return (platform_mode && hibernation_ops) ?
81 hibernation_ops->start() : 0; 110 hibernation_ops->begin() : 0;
111}
112
113/**
114 * platform_end - tell the platform driver that we've entered the
115 * working state
116 */
117
118static void platform_end(int platform_mode)
119{
120 if (platform_mode && hibernation_ops)
121 hibernation_ops->end();
82} 122}
83 123
84/** 124/**
@@ -162,19 +202,25 @@ int create_image(int platform_mode)
162 */ 202 */
163 error = device_power_down(PMSG_FREEZE); 203 error = device_power_down(PMSG_FREEZE);
164 if (error) { 204 if (error) {
165 printk(KERN_ERR "Some devices failed to power down, " 205 printk(KERN_ERR "PM: Some devices failed to power down, "
166 KERN_ERR "aborting suspend\n"); 206 "aborting hibernation\n");
167 goto Enable_irqs; 207 goto Enable_irqs;
168 } 208 }
169 209
210 if (hibernation_test(TEST_CORE))
211 goto Power_up;
212
213 in_suspend = 1;
170 save_processor_state(); 214 save_processor_state();
171 error = swsusp_arch_suspend(); 215 error = swsusp_arch_suspend();
172 if (error) 216 if (error)
173 printk(KERN_ERR "Error %d while creating the image\n", error); 217 printk(KERN_ERR "PM: Error %d creating hibernation image\n",
218 error);
174 /* Restore control flow magically appears here */ 219 /* Restore control flow magically appears here */
175 restore_processor_state(); 220 restore_processor_state();
176 if (!in_suspend) 221 if (!in_suspend)
177 platform_leave(platform_mode); 222 platform_leave(platform_mode);
223 Power_up:
178 /* NOTE: device_power_up() is just a resume() for devices 224 /* NOTE: device_power_up() is just a resume() for devices
179 * that suspended with irqs off ... no overall powerup. 225 * that suspended with irqs off ... no overall powerup.
180 */ 226 */
@@ -202,36 +248,90 @@ int hibernation_snapshot(int platform_mode)
202 if (error) 248 if (error)
203 return error; 249 return error;
204 250
205 error = platform_start(platform_mode); 251 error = platform_begin(platform_mode);
206 if (error) 252 if (error)
207 return error; 253 goto Close;
208 254
209 suspend_console(); 255 suspend_console();
210 error = device_suspend(PMSG_FREEZE); 256 error = device_suspend(PMSG_FREEZE);
211 if (error) 257 if (error)
212 goto Resume_console; 258 goto Resume_console;
213 259
214 error = platform_pre_snapshot(platform_mode); 260 if (hibernation_test(TEST_DEVICES))
215 if (error)
216 goto Resume_devices; 261 goto Resume_devices;
217 262
263 error = platform_pre_snapshot(platform_mode);
264 if (error || hibernation_test(TEST_PLATFORM))
265 goto Finish;
266
218 error = disable_nonboot_cpus(); 267 error = disable_nonboot_cpus();
219 if (!error) { 268 if (!error) {
220 if (hibernation_mode != HIBERNATION_TEST) { 269 if (hibernation_test(TEST_CPUS))
221 in_suspend = 1; 270 goto Enable_cpus;
222 error = create_image(platform_mode); 271
223 /* Control returns here after successful restore */ 272 if (hibernation_testmode(HIBERNATION_TEST))
224 } else { 273 goto Enable_cpus;
225 printk("swsusp debug: Waiting for 5 seconds.\n"); 274
226 mdelay(5000); 275 error = create_image(platform_mode);
227 } 276 /* Control returns here after successful restore */
228 } 277 }
278 Enable_cpus:
229 enable_nonboot_cpus(); 279 enable_nonboot_cpus();
230 Resume_devices: 280 Finish:
231 platform_finish(platform_mode); 281 platform_finish(platform_mode);
282 Resume_devices:
232 device_resume(); 283 device_resume();
233 Resume_console: 284 Resume_console:
234 resume_console(); 285 resume_console();
286 Close:
287 platform_end(platform_mode);
288 return error;
289}
290
291/**
292 * resume_target_kernel - prepare devices that need to be suspended with
293 * interrupts off, restore the contents of highmem that have not been
294 * restored yet from the image and run the low level code that will restore
295 * the remaining contents of memory and switch to the just restored target
296 * kernel.
297 */
298
299static int resume_target_kernel(void)
300{
301 int error;
302
303 local_irq_disable();
304 error = device_power_down(PMSG_PRETHAW);
305 if (error) {
306 printk(KERN_ERR "PM: Some devices failed to power down, "
307 "aborting resume\n");
308 goto Enable_irqs;
309 }
310 /* We'll ignore saved state, but this gets preempt count (etc) right */
311 save_processor_state();
312 error = restore_highmem();
313 if (!error) {
314 error = swsusp_arch_resume();
315 /*
316 * The code below is only ever reached in case of a failure.
317 * Otherwise execution continues at place where
318 * swsusp_arch_suspend() was called
319 */
320 BUG_ON(!error);
321 /* This call to restore_highmem() undos the previous one */
322 restore_highmem();
323 }
324 /*
325 * The only reason why swsusp_arch_resume() can fail is memory being
326 * very tight, so we have to free it as soon as we can to avoid
327 * subsequent failures
328 */
329 swsusp_free();
330 restore_processor_state();
331 touch_softlockup_watchdog();
332 device_power_up();
333 Enable_irqs:
334 local_irq_enable();
235 return error; 335 return error;
236} 336}
237 337
@@ -258,7 +358,7 @@ int hibernation_restore(int platform_mode)
258 if (!error) { 358 if (!error) {
259 error = disable_nonboot_cpus(); 359 error = disable_nonboot_cpus();
260 if (!error) 360 if (!error)
261 error = swsusp_resume(); 361 error = resume_target_kernel();
262 enable_nonboot_cpus(); 362 enable_nonboot_cpus();
263 } 363 }
264 platform_restore_cleanup(platform_mode); 364 platform_restore_cleanup(platform_mode);
@@ -286,9 +386,9 @@ int hibernation_platform_enter(void)
286 * hibernation_ops->finish() before saving the image, so we should let 386 * hibernation_ops->finish() before saving the image, so we should let
287 * the firmware know that we're going to enter the sleep state after all 387 * the firmware know that we're going to enter the sleep state after all
288 */ 388 */
289 error = hibernation_ops->start(); 389 error = hibernation_ops->begin();
290 if (error) 390 if (error)
291 return error; 391 goto Close;
292 392
293 suspend_console(); 393 suspend_console();
294 error = device_suspend(PMSG_SUSPEND); 394 error = device_suspend(PMSG_SUSPEND);
@@ -322,6 +422,8 @@ int hibernation_platform_enter(void)
322 device_resume(); 422 device_resume();
323 Resume_console: 423 Resume_console:
324 resume_console(); 424 resume_console();
425 Close:
426 hibernation_ops->end();
325 return error; 427 return error;
326} 428}
327 429
@@ -352,24 +454,17 @@ static void power_down(void)
352 * Valid image is on the disk, if we continue we risk serious data 454 * Valid image is on the disk, if we continue we risk serious data
353 * corruption after resume. 455 * corruption after resume.
354 */ 456 */
355 printk(KERN_CRIT "Please power me down manually\n"); 457 printk(KERN_CRIT "PM: Please power down manually\n");
356 while(1); 458 while(1);
357} 459}
358 460
359static void unprepare_processes(void)
360{
361 thaw_processes();
362 pm_restore_console();
363}
364
365static int prepare_processes(void) 461static int prepare_processes(void)
366{ 462{
367 int error = 0; 463 int error = 0;
368 464
369 pm_prepare_console();
370 if (freeze_processes()) { 465 if (freeze_processes()) {
371 error = -EBUSY; 466 error = -EBUSY;
372 unprepare_processes(); 467 thaw_processes();
373 } 468 }
374 return error; 469 return error;
375} 470}
@@ -389,6 +484,7 @@ int hibernate(void)
389 goto Unlock; 484 goto Unlock;
390 } 485 }
391 486
487 pm_prepare_console();
392 error = pm_notifier_call_chain(PM_HIBERNATION_PREPARE); 488 error = pm_notifier_call_chain(PM_HIBERNATION_PREPARE);
393 if (error) 489 if (error)
394 goto Exit; 490 goto Exit;
@@ -398,7 +494,7 @@ int hibernate(void)
398 if (error) 494 if (error)
399 goto Exit; 495 goto Exit;
400 496
401 printk("Syncing filesystems ... "); 497 printk(KERN_INFO "PM: Syncing filesystems ... ");
402 sys_sync(); 498 sys_sync();
403 printk("done.\n"); 499 printk("done.\n");
404 500
@@ -406,11 +502,12 @@ int hibernate(void)
406 if (error) 502 if (error)
407 goto Finish; 503 goto Finish;
408 504
409 if (hibernation_mode == HIBERNATION_TESTPROC) { 505 if (hibernation_test(TEST_FREEZER))
410 printk("swsusp debug: Waiting for 5 seconds.\n");
411 mdelay(5000);
412 goto Thaw; 506 goto Thaw;
413 } 507
508 if (hibernation_testmode(HIBERNATION_TESTPROC))
509 goto Thaw;
510
414 error = hibernation_snapshot(hibernation_mode == HIBERNATION_PLATFORM); 511 error = hibernation_snapshot(hibernation_mode == HIBERNATION_PLATFORM);
415 if (in_suspend && !error) { 512 if (in_suspend && !error) {
416 unsigned int flags = 0; 513 unsigned int flags = 0;
@@ -427,11 +524,12 @@ int hibernate(void)
427 swsusp_free(); 524 swsusp_free();
428 } 525 }
429 Thaw: 526 Thaw:
430 unprepare_processes(); 527 thaw_processes();
431 Finish: 528 Finish:
432 free_basic_memory_bitmaps(); 529 free_basic_memory_bitmaps();
433 Exit: 530 Exit:
434 pm_notifier_call_chain(PM_POST_HIBERNATION); 531 pm_notifier_call_chain(PM_POST_HIBERNATION);
532 pm_restore_console();
435 atomic_inc(&snapshot_device_available); 533 atomic_inc(&snapshot_device_available);
436 Unlock: 534 Unlock:
437 mutex_unlock(&pm_mutex); 535 mutex_unlock(&pm_mutex);
@@ -456,29 +554,40 @@ static int software_resume(void)
456 int error; 554 int error;
457 unsigned int flags; 555 unsigned int flags;
458 556
459 mutex_lock(&pm_mutex); 557 /*
558 * name_to_dev_t() below takes a sysfs buffer mutex when sysfs
559 * is configured into the kernel. Since the regular hibernate
560 * trigger path is via sysfs which takes a buffer mutex before
561 * calling hibernate functions (which take pm_mutex) this can
562 * cause lockdep to complain about a possible ABBA deadlock
563 * which cannot happen since we're in the boot code here and
564 * sysfs can't be invoked yet. Therefore, we use a subclass
565 * here to avoid lockdep complaining.
566 */
567 mutex_lock_nested(&pm_mutex, SINGLE_DEPTH_NESTING);
460 if (!swsusp_resume_device) { 568 if (!swsusp_resume_device) {
461 if (!strlen(resume_file)) { 569 if (!strlen(resume_file)) {
462 mutex_unlock(&pm_mutex); 570 mutex_unlock(&pm_mutex);
463 return -ENOENT; 571 return -ENOENT;
464 } 572 }
465 swsusp_resume_device = name_to_dev_t(resume_file); 573 swsusp_resume_device = name_to_dev_t(resume_file);
466 pr_debug("swsusp: Resume From Partition %s\n", resume_file); 574 pr_debug("PM: Resume from partition %s\n", resume_file);
467 } else { 575 } else {
468 pr_debug("swsusp: Resume From Partition %d:%d\n", 576 pr_debug("PM: Resume from partition %d:%d\n",
469 MAJOR(swsusp_resume_device), MINOR(swsusp_resume_device)); 577 MAJOR(swsusp_resume_device),
578 MINOR(swsusp_resume_device));
470 } 579 }
471 580
472 if (noresume) { 581 if (noresume) {
473 /** 582 /**
474 * FIXME: If noresume is specified, we need to find the partition 583 * FIXME: If noresume is specified, we need to find the
475 * and reset it back to normal swap space. 584 * partition and reset it back to normal swap space.
476 */ 585 */
477 mutex_unlock(&pm_mutex); 586 mutex_unlock(&pm_mutex);
478 return 0; 587 return 0;
479 } 588 }
480 589
481 pr_debug("PM: Checking swsusp image.\n"); 590 pr_debug("PM: Checking hibernation image.\n");
482 error = swsusp_check(); 591 error = swsusp_check();
483 if (error) 592 if (error)
484 goto Unlock; 593 goto Unlock;
@@ -489,6 +598,11 @@ static int software_resume(void)
489 goto Unlock; 598 goto Unlock;
490 } 599 }
491 600
601 pm_prepare_console();
602 error = pm_notifier_call_chain(PM_RESTORE_PREPARE);
603 if (error)
604 goto Finish;
605
492 error = create_basic_memory_bitmaps(); 606 error = create_basic_memory_bitmaps();
493 if (error) 607 if (error)
494 goto Finish; 608 goto Finish;
@@ -500,7 +614,7 @@ static int software_resume(void)
500 goto Done; 614 goto Done;
501 } 615 }
502 616
503 pr_debug("PM: Reading swsusp image.\n"); 617 pr_debug("PM: Reading hibernation image.\n");
504 618
505 error = swsusp_read(&flags); 619 error = swsusp_read(&flags);
506 if (!error) 620 if (!error)
@@ -508,10 +622,12 @@ static int software_resume(void)
508 622
509 printk(KERN_ERR "PM: Restore failed, recovering.\n"); 623 printk(KERN_ERR "PM: Restore failed, recovering.\n");
510 swsusp_free(); 624 swsusp_free();
511 unprepare_processes(); 625 thaw_processes();
512 Done: 626 Done:
513 free_basic_memory_bitmaps(); 627 free_basic_memory_bitmaps();
514 Finish: 628 Finish:
629 pm_notifier_call_chain(PM_POST_RESTORE);
630 pm_restore_console();
515 atomic_inc(&snapshot_device_available); 631 atomic_inc(&snapshot_device_available);
516 /* For success case, the suspend path will release the lock */ 632 /* For success case, the suspend path will release the lock */
517 Unlock: 633 Unlock:
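
Note on the hunk above: the mutex_lock_nested() call is a lockdep annotation only and does not change locking behaviour. A minimal sketch of the idiom, with made-up lock and function names standing in for pm_mutex, the sysfs buffer mutex and the two call paths described in the comment:

	#include <linux/mutex.h>
	#include <linux/lockdep.h>

	static DEFINE_MUTEX(pm_lock);   /* stands in for pm_mutex */
	static DEFINE_MUTEX(buf_lock);  /* stands in for the sysfs buffer mutex */

	/* Runtime path: buf_lock is always taken before pm_lock. */
	static void sysfs_trigger(void)
	{
	        mutex_lock(&buf_lock);
	        mutex_lock(&pm_lock);
	        /* ... hibernate ... */
	        mutex_unlock(&pm_lock);
	        mutex_unlock(&buf_lock);
	}

	/* Boot-time path: pm_lock first, then buf_lock.  Lockdep would report
	 * a possible ABBA inversion, so the acquisition goes into its own
	 * subclass to record that the two paths can never run concurrently. */
	static void boot_resume(void)
	{
	        mutex_lock_nested(&pm_lock, SINGLE_DEPTH_NESTING);
	        mutex_lock(&buf_lock);
	        /* ... device lookup ... */
	        mutex_unlock(&buf_lock);
	        mutex_unlock(&pm_lock);
	}
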
@@ -557,7 +673,8 @@ static const char * const hibernation_modes[] = {
557 * supports it (as determined by having hibernation_ops). 673 * supports it (as determined by having hibernation_ops).
558 */ 674 */
559 675
560static ssize_t disk_show(struct kset *kset, char *buf) 676static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr,
677 char *buf)
561{ 678{
562 int i; 679 int i;
563 char *start = buf; 680 char *start = buf;
@@ -587,7 +704,8 @@ static ssize_t disk_show(struct kset *kset, char *buf)
587} 704}
588 705
589 706
590static ssize_t disk_store(struct kset *kset, const char *buf, size_t n) 707static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr,
708 const char *buf, size_t n)
591{ 709{
592 int error = 0; 710 int error = 0;
593 int i; 711 int i;
@@ -624,7 +742,7 @@ static ssize_t disk_store(struct kset *kset, const char *buf, size_t n)
624 error = -EINVAL; 742 error = -EINVAL;
625 743
626 if (!error) 744 if (!error)
627 pr_debug("PM: suspend-to-disk mode set to '%s'\n", 745 pr_debug("PM: Hibernation mode set to '%s'\n",
628 hibernation_modes[mode]); 746 hibernation_modes[mode]);
629 mutex_unlock(&pm_mutex); 747 mutex_unlock(&pm_mutex);
630 return error ? error : n; 748 return error ? error : n;
@@ -632,13 +750,15 @@ static ssize_t disk_store(struct kset *kset, const char *buf, size_t n)
632 750
633power_attr(disk); 751power_attr(disk);
634 752
635static ssize_t resume_show(struct kset *kset, char *buf) 753static ssize_t resume_show(struct kobject *kobj, struct kobj_attribute *attr,
754 char *buf)
636{ 755{
637 return sprintf(buf,"%d:%d\n", MAJOR(swsusp_resume_device), 756 return sprintf(buf,"%d:%d\n", MAJOR(swsusp_resume_device),
638 MINOR(swsusp_resume_device)); 757 MINOR(swsusp_resume_device));
639} 758}
640 759
641static ssize_t resume_store(struct kset *kset, const char *buf, size_t n) 760static ssize_t resume_store(struct kobject *kobj, struct kobj_attribute *attr,
761 const char *buf, size_t n)
642{ 762{
643 unsigned int maj, min; 763 unsigned int maj, min;
644 dev_t res; 764 dev_t res;
@@ -654,7 +774,7 @@ static ssize_t resume_store(struct kset *kset, const char *buf, size_t n)
654 mutex_lock(&pm_mutex); 774 mutex_lock(&pm_mutex);
655 swsusp_resume_device = res; 775 swsusp_resume_device = res;
656 mutex_unlock(&pm_mutex); 776 mutex_unlock(&pm_mutex);
657 printk("Attempting manual resume\n"); 777 printk(KERN_INFO "PM: Starting manual resume from disk\n");
658 noresume = 0; 778 noresume = 0;
659 software_resume(); 779 software_resume();
660 ret = n; 780 ret = n;
@@ -664,12 +784,14 @@ static ssize_t resume_store(struct kset *kset, const char *buf, size_t n)
664 784
665power_attr(resume); 785power_attr(resume);
666 786
667static ssize_t image_size_show(struct kset *kset, char *buf) 787static ssize_t image_size_show(struct kobject *kobj, struct kobj_attribute *attr,
788 char *buf)
668{ 789{
669 return sprintf(buf, "%lu\n", image_size); 790 return sprintf(buf, "%lu\n", image_size);
670} 791}
671 792
672static ssize_t image_size_store(struct kset *kset, const char *buf, size_t n) 793static ssize_t image_size_store(struct kobject *kobj, struct kobj_attribute *attr,
794 const char *buf, size_t n)
673{ 795{
674 unsigned long size; 796 unsigned long size;
675 797
@@ -698,7 +820,7 @@ static struct attribute_group attr_group = {
698 820
699static int __init pm_disk_init(void) 821static int __init pm_disk_init(void)
700{ 822{
701 return sysfs_create_group(&power_subsys.kobj, &attr_group); 823 return sysfs_create_group(power_kobj, &attr_group);
702} 824}
703 825
704core_initcall(pm_disk_init); 826core_initcall(pm_disk_init);
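
Note on the disk.c changes above: the sysfs show/store callbacks move from the old struct kset * based subsys_attribute signature to struct kobject * plus struct kobj_attribute *, and the attribute group is now created on power_kobj instead of power_subsys.kobj. A sketch of an attribute written in the new style; the name "example" and its body are invented for illustration, and power_attr() is the helper from kernel/power/power.h as changed later in this patch:

	#include <linux/kernel.h>
	#include <linux/kobject.h>
	#include <linux/sysfs.h>
	/* assumes kernel/power/power.h for power_attr() */

	static ssize_t example_show(struct kobject *kobj, struct kobj_attribute *attr,
	                            char *buf)
	{
	        return sprintf(buf, "%d\n", 42);
	}

	static ssize_t example_store(struct kobject *kobj, struct kobj_attribute *attr,
	                             const char *buf, size_t n)
	{
	        /* parse buf here; return the number of bytes consumed */
	        return n;
	}

	power_attr(example);    /* defines struct kobj_attribute example_attr */
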
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 3cdf95b1dc92..6a6d5eb3524e 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -24,10 +24,112 @@
24 24
25#include "power.h" 25#include "power.h"
26 26
27BLOCKING_NOTIFIER_HEAD(pm_chain_head);
28
29DEFINE_MUTEX(pm_mutex); 27DEFINE_MUTEX(pm_mutex);
30 28
29unsigned int pm_flags;
30EXPORT_SYMBOL(pm_flags);
31
32#ifdef CONFIG_PM_SLEEP
33
34/* Routines for PM-transition notifications */
35
36static BLOCKING_NOTIFIER_HEAD(pm_chain_head);
37
38int register_pm_notifier(struct notifier_block *nb)
39{
40 return blocking_notifier_chain_register(&pm_chain_head, nb);
41}
42EXPORT_SYMBOL_GPL(register_pm_notifier);
43
44int unregister_pm_notifier(struct notifier_block *nb)
45{
46 return blocking_notifier_chain_unregister(&pm_chain_head, nb);
47}
48EXPORT_SYMBOL_GPL(unregister_pm_notifier);
49
50int pm_notifier_call_chain(unsigned long val)
51{
52 return (blocking_notifier_call_chain(&pm_chain_head, val, NULL)
53 == NOTIFY_BAD) ? -EINVAL : 0;
54}
55
56#ifdef CONFIG_PM_DEBUG
57int pm_test_level = TEST_NONE;
58
59static int suspend_test(int level)
60{
61 if (pm_test_level == level) {
62 printk(KERN_INFO "suspend debug: Waiting for 5 seconds.\n");
63 mdelay(5000);
64 return 1;
65 }
66 return 0;
67}
68
69static const char * const pm_tests[__TEST_AFTER_LAST] = {
70 [TEST_NONE] = "none",
71 [TEST_CORE] = "core",
72 [TEST_CPUS] = "processors",
73 [TEST_PLATFORM] = "platform",
74 [TEST_DEVICES] = "devices",
75 [TEST_FREEZER] = "freezer",
76};
77
78static ssize_t pm_test_show(struct kobject *kobj, struct kobj_attribute *attr,
79 char *buf)
80{
81 char *s = buf;
82 int level;
83
84 for (level = TEST_FIRST; level <= TEST_MAX; level++)
85 if (pm_tests[level]) {
86 if (level == pm_test_level)
87 s += sprintf(s, "[%s] ", pm_tests[level]);
88 else
89 s += sprintf(s, "%s ", pm_tests[level]);
90 }
91
92 if (s != buf)
93 /* convert the last space to a newline */
94 *(s-1) = '\n';
95
96 return (s - buf);
97}
98
99static ssize_t pm_test_store(struct kobject *kobj, struct kobj_attribute *attr,
100 const char *buf, size_t n)
101{
102 const char * const *s;
103 int level;
104 char *p;
105 int len;
106 int error = -EINVAL;
107
108 p = memchr(buf, '\n', n);
109 len = p ? p - buf : n;
110
111 mutex_lock(&pm_mutex);
112
113 level = TEST_FIRST;
114 for (s = &pm_tests[level]; level <= TEST_MAX; s++, level++)
115 if (*s && len == strlen(*s) && !strncmp(buf, *s, len)) {
116 pm_test_level = level;
117 error = 0;
118 break;
119 }
120
121 mutex_unlock(&pm_mutex);
122
123 return error ? error : n;
124}
125
126power_attr(pm_test);
127#else /* !CONFIG_PM_DEBUG */
128static inline int suspend_test(int level) { return 0; }
129#endif /* !CONFIG_PM_DEBUG */
130
131#endif /* CONFIG_PM_SLEEP */
132
31#ifdef CONFIG_SUSPEND 133#ifdef CONFIG_SUSPEND
32 134
33/* This is just an arbitrary number */ 135/* This is just an arbitrary number */
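
Note on the hunk above: the PM notifier chain becomes a proper API (register_pm_notifier(), unregister_pm_notifier(), pm_notifier_call_chain()) and the chain head itself turns static. A sketch of how a driver might hook into it; the callback and notifier_block names are invented, the event constants are the ones used elsewhere in this patch:

	#include <linux/notifier.h>
	#include <linux/suspend.h>

	static int my_pm_callback(struct notifier_block *nb,
	                          unsigned long event, void *unused)
	{
	        switch (event) {
	        case PM_HIBERNATION_PREPARE:
	        case PM_SUSPEND_PREPARE:
	                /* quiesce driver state before tasks are frozen */
	                break;
	        case PM_POST_HIBERNATION:
	        case PM_POST_SUSPEND:
	                /* undo the above once tasks are thawed */
	                break;
	        }
	        return NOTIFY_DONE;
	}

	static struct notifier_block my_pm_nb = {
	        .notifier_call = my_pm_callback,
	};

	/* register_pm_notifier(&my_pm_nb) at init time,
	 * unregister_pm_notifier(&my_pm_nb) on removal. */
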
@@ -73,13 +175,13 @@ static int suspend_prepare(void)
73 if (!suspend_ops || !suspend_ops->enter) 175 if (!suspend_ops || !suspend_ops->enter)
74 return -EPERM; 176 return -EPERM;
75 177
178 pm_prepare_console();
179
76 error = pm_notifier_call_chain(PM_SUSPEND_PREPARE); 180 error = pm_notifier_call_chain(PM_SUSPEND_PREPARE);
77 if (error) 181 if (error)
78 goto Finish; 182 goto Finish;
79 183
80 pm_prepare_console(); 184 if (suspend_freeze_processes()) {
81
82 if (freeze_processes()) {
83 error = -EAGAIN; 185 error = -EAGAIN;
84 goto Thaw; 186 goto Thaw;
85 } 187 }
@@ -97,10 +199,10 @@ static int suspend_prepare(void)
97 return 0; 199 return 0;
98 200
99 Thaw: 201 Thaw:
100 thaw_processes(); 202 suspend_thaw_processes();
101 pm_restore_console();
102 Finish: 203 Finish:
103 pm_notifier_call_chain(PM_POST_SUSPEND); 204 pm_notifier_call_chain(PM_POST_SUSPEND);
205 pm_restore_console();
104 return error; 206 return error;
105} 207}
106 208
@@ -130,10 +232,13 @@ static int suspend_enter(suspend_state_t state)
130 BUG_ON(!irqs_disabled()); 232 BUG_ON(!irqs_disabled());
131 233
132 if ((error = device_power_down(PMSG_SUSPEND))) { 234 if ((error = device_power_down(PMSG_SUSPEND))) {
133 printk(KERN_ERR "Some devices failed to power down\n"); 235 printk(KERN_ERR "PM: Some devices failed to power down\n");
134 goto Done; 236 goto Done;
135 } 237 }
136 error = suspend_ops->enter(state); 238
239 if (!suspend_test(TEST_CORE))
240 error = suspend_ops->enter(state);
241
137 device_power_up(); 242 device_power_up();
138 Done: 243 Done:
139 arch_suspend_enable_irqs(); 244 arch_suspend_enable_irqs();
@@ -142,8 +247,8 @@ static int suspend_enter(suspend_state_t state)
142} 247}
143 248
144/** 249/**
145 * suspend_devices_and_enter - suspend devices and enter the desired system sleep 250 * suspend_devices_and_enter - suspend devices and enter the desired system
146 * state. 251 * sleep state.
147 * @state: state to enter 252 * @state: state to enter
148 */ 253 */
149int suspend_devices_and_enter(suspend_state_t state) 254int suspend_devices_and_enter(suspend_state_t state)
@@ -153,33 +258,45 @@ int suspend_devices_and_enter(suspend_state_t state)
153 if (!suspend_ops) 258 if (!suspend_ops)
154 return -ENOSYS; 259 return -ENOSYS;
155 260
156 if (suspend_ops->set_target) { 261 if (suspend_ops->begin) {
157 error = suspend_ops->set_target(state); 262 error = suspend_ops->begin(state);
158 if (error) 263 if (error)
159 return error; 264 goto Close;
160 } 265 }
161 suspend_console(); 266 suspend_console();
162 error = device_suspend(PMSG_SUSPEND); 267 error = device_suspend(PMSG_SUSPEND);
163 if (error) { 268 if (error) {
164 printk(KERN_ERR "Some devices failed to suspend\n"); 269 printk(KERN_ERR "PM: Some devices failed to suspend\n");
165 goto Resume_console; 270 goto Resume_console;
166 } 271 }
272
273 if (suspend_test(TEST_DEVICES))
274 goto Resume_devices;
275
167 if (suspend_ops->prepare) { 276 if (suspend_ops->prepare) {
168 error = suspend_ops->prepare(); 277 error = suspend_ops->prepare();
169 if (error) 278 if (error)
170 goto Resume_devices; 279 goto Resume_devices;
171 } 280 }
281
282 if (suspend_test(TEST_PLATFORM))
283 goto Finish;
284
172 error = disable_nonboot_cpus(); 285 error = disable_nonboot_cpus();
173 if (!error) 286 if (!error && !suspend_test(TEST_CPUS))
174 suspend_enter(state); 287 suspend_enter(state);
175 288
176 enable_nonboot_cpus(); 289 enable_nonboot_cpus();
290 Finish:
177 if (suspend_ops->finish) 291 if (suspend_ops->finish)
178 suspend_ops->finish(); 292 suspend_ops->finish();
179 Resume_devices: 293 Resume_devices:
180 device_resume(); 294 device_resume();
181 Resume_console: 295 Resume_console:
182 resume_console(); 296 resume_console();
297 Close:
298 if (suspend_ops->end)
299 suspend_ops->end();
183 return error; 300 return error;
184} 301}
185 302
@@ -191,9 +308,9 @@ int suspend_devices_and_enter(suspend_state_t state)
191 */ 308 */
192static void suspend_finish(void) 309static void suspend_finish(void)
193{ 310{
194 thaw_processes(); 311 suspend_thaw_processes();
195 pm_restore_console();
196 pm_notifier_call_chain(PM_POST_SUSPEND); 312 pm_notifier_call_chain(PM_POST_SUSPEND);
313 pm_restore_console();
197} 314}
198 315
199 316
@@ -235,17 +352,22 @@ static int enter_state(suspend_state_t state)
235 if (!mutex_trylock(&pm_mutex)) 352 if (!mutex_trylock(&pm_mutex))
236 return -EBUSY; 353 return -EBUSY;
237 354
238 printk("Syncing filesystems ... "); 355 printk(KERN_INFO "PM: Syncing filesystems ... ");
239 sys_sync(); 356 sys_sync();
240 printk("done.\n"); 357 printk("done.\n");
241 358
242 pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]); 359 pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]);
243 if ((error = suspend_prepare())) 360 error = suspend_prepare();
361 if (error)
244 goto Unlock; 362 goto Unlock;
245 363
364 if (suspend_test(TEST_FREEZER))
365 goto Finish;
366
246 pr_debug("PM: Entering %s sleep\n", pm_states[state]); 367 pr_debug("PM: Entering %s sleep\n", pm_states[state]);
247 error = suspend_devices_and_enter(state); 368 error = suspend_devices_and_enter(state);
248 369
370 Finish:
249 pr_debug("PM: Finishing wakeup.\n"); 371 pr_debug("PM: Finishing wakeup.\n");
250 suspend_finish(); 372 suspend_finish();
251 Unlock: 373 Unlock:
@@ -273,8 +395,7 @@ EXPORT_SYMBOL(pm_suspend);
273 395
274#endif /* CONFIG_SUSPEND */ 396#endif /* CONFIG_SUSPEND */
275 397
276decl_subsys(power,NULL,NULL); 398struct kobject *power_kobj;
277
278 399
279/** 400/**
280 * state - control system power state. 401 * state - control system power state.
@@ -287,7 +408,8 @@ decl_subsys(power,NULL,NULL);
287 * proper enumerated value, and initiates a suspend transition. 408 * proper enumerated value, and initiates a suspend transition.
288 */ 409 */
289 410
290static ssize_t state_show(struct kset *kset, char *buf) 411static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr,
412 char *buf)
291{ 413{
292 char *s = buf; 414 char *s = buf;
293#ifdef CONFIG_SUSPEND 415#ifdef CONFIG_SUSPEND
@@ -308,7 +430,8 @@ static ssize_t state_show(struct kset *kset, char *buf)
308 return (s - buf); 430 return (s - buf);
309} 431}
310 432
311static ssize_t state_store(struct kset *kset, const char *buf, size_t n) 433static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr,
434 const char *buf, size_t n)
312{ 435{
313#ifdef CONFIG_SUSPEND 436#ifdef CONFIG_SUSPEND
314 suspend_state_t state = PM_SUSPEND_STANDBY; 437 suspend_state_t state = PM_SUSPEND_STANDBY;
@@ -345,13 +468,15 @@ power_attr(state);
345#ifdef CONFIG_PM_TRACE 468#ifdef CONFIG_PM_TRACE
346int pm_trace_enabled; 469int pm_trace_enabled;
347 470
348static ssize_t pm_trace_show(struct kset *kset, char *buf) 471static ssize_t pm_trace_show(struct kobject *kobj, struct kobj_attribute *attr,
472 char *buf)
349{ 473{
350 return sprintf(buf, "%d\n", pm_trace_enabled); 474 return sprintf(buf, "%d\n", pm_trace_enabled);
351} 475}
352 476
353static ssize_t 477static ssize_t
354pm_trace_store(struct kset *kset, const char *buf, size_t n) 478pm_trace_store(struct kobject *kobj, struct kobj_attribute *attr,
479 const char *buf, size_t n)
355{ 480{
356 int val; 481 int val;
357 482
@@ -363,18 +488,18 @@ pm_trace_store(struct kset *kset, const char *buf, size_t n)
363} 488}
364 489
365power_attr(pm_trace); 490power_attr(pm_trace);
491#endif /* CONFIG_PM_TRACE */
366 492
367static struct attribute * g[] = { 493static struct attribute * g[] = {
368 &state_attr.attr, 494 &state_attr.attr,
495#ifdef CONFIG_PM_TRACE
369 &pm_trace_attr.attr, 496 &pm_trace_attr.attr,
497#endif
498#if defined(CONFIG_PM_SLEEP) && defined(CONFIG_PM_DEBUG)
499 &pm_test_attr.attr,
500#endif
370 NULL, 501 NULL,
371}; 502};
372#else
373static struct attribute * g[] = {
374 &state_attr.attr,
375 NULL,
376};
377#endif /* CONFIG_PM_TRACE */
378 503
379static struct attribute_group attr_group = { 504static struct attribute_group attr_group = {
380 .attrs = g, 505 .attrs = g,
@@ -383,10 +508,10 @@ static struct attribute_group attr_group = {
383 508
384static int __init pm_init(void) 509static int __init pm_init(void)
385{ 510{
386 int error = subsystem_register(&power_subsys); 511 power_kobj = kobject_create_and_add("power", NULL);
387 if (!error) 512 if (!power_kobj)
388 error = sysfs_create_group(&power_subsys.kobj,&attr_group); 513 return -ENOMEM;
389 return error; 514 return sysfs_create_group(power_kobj, &attr_group);
390} 515}
391 516
392core_initcall(pm_init); 517core_initcall(pm_init);
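
Note on the main.c changes above: the power files now hang off a plain kobject ("power" at the sysfs root, created with kobject_create_and_add()), and with CONFIG_PM_DEBUG a pm_test file selects how far the next suspend should go before waiting five seconds and backing out. A small userspace sketch of exercising it; it assumes CONFIG_PM_DEBUG and CONFIG_SUSPEND are set and keeps error handling minimal:

	#include <stdio.h>

	int main(void)
	{
	        FILE *f;

	        /* Have the next suspend stop right after suspending devices,
	         * wait 5 seconds, then resume them. */
	        f = fopen("/sys/power/pm_test", "w");
	        if (f) {
	                fputs("devices\n", f);
	                fclose(f);
	        }

	        /* Trigger the (test-level limited) suspend-to-RAM attempt. */
	        f = fopen("/sys/power/state", "w");
	        if (!f) {
	                perror("/sys/power/state");
	                return 1;
	        }
	        fputs("mem\n", f);
	        fclose(f);
	        return 0;
	}
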
diff --git a/kernel/power/pm.c b/kernel/power/pm.c
index c50d15266c10..60c73fa670d5 100644
--- a/kernel/power/pm.c
+++ b/kernel/power/pm.c
@@ -27,8 +27,6 @@
27#include <linux/interrupt.h> 27#include <linux/interrupt.h>
28#include <linux/mutex.h> 28#include <linux/mutex.h>
29 29
30int pm_active;
31
32/* 30/*
33 * Locking notes: 31 * Locking notes:
34 * pm_devs_lock can be a semaphore providing pm ops are not called 32 * pm_devs_lock can be a semaphore providing pm ops are not called
@@ -204,6 +202,4 @@ int pm_send_all(pm_request_t rqst, void *data)
204 202
205EXPORT_SYMBOL(pm_register); 203EXPORT_SYMBOL(pm_register);
206EXPORT_SYMBOL(pm_send_all); 204EXPORT_SYMBOL(pm_send_all);
207EXPORT_SYMBOL(pm_active);
208
209 205
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 195dc4611764..700f44ec8406 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -1,5 +1,7 @@
1#include <linux/suspend.h> 1#include <linux/suspend.h>
2#include <linux/suspend_ioctls.h>
2#include <linux/utsname.h> 3#include <linux/utsname.h>
4#include <linux/freezer.h>
3 5
4struct swsusp_info { 6struct swsusp_info {
5 struct new_utsname uts; 7 struct new_utsname uts;
@@ -54,7 +56,7 @@ extern int pfn_is_nosave(unsigned long);
54extern struct mutex pm_mutex; 56extern struct mutex pm_mutex;
55 57
56#define power_attr(_name) \ 58#define power_attr(_name) \
57static struct subsys_attribute _name##_attr = { \ 59static struct kobj_attribute _name##_attr = { \
58 .attr = { \ 60 .attr = { \
59 .name = __stringify(_name), \ 61 .name = __stringify(_name), \
60 .mode = 0644, \ 62 .mode = 0644, \
@@ -63,8 +65,6 @@ static struct subsys_attribute _name##_attr = { \
63 .store = _name##_store, \ 65 .store = _name##_store, \
64} 66}
65 67
66extern struct kset power_subsys;
67
68/* Preferred image size in bytes (default 500 MB) */ 68/* Preferred image size in bytes (default 500 MB) */
69extern unsigned long image_size; 69extern unsigned long image_size;
70extern int in_suspend; 70extern int in_suspend;
@@ -130,42 +130,12 @@ struct snapshot_handle {
130#define data_of(handle) ((handle).buffer + (handle).buf_offset) 130#define data_of(handle) ((handle).buffer + (handle).buf_offset)
131 131
132extern unsigned int snapshot_additional_pages(struct zone *zone); 132extern unsigned int snapshot_additional_pages(struct zone *zone);
133extern unsigned long snapshot_get_image_size(void);
133extern int snapshot_read_next(struct snapshot_handle *handle, size_t count); 134extern int snapshot_read_next(struct snapshot_handle *handle, size_t count);
134extern int snapshot_write_next(struct snapshot_handle *handle, size_t count); 135extern int snapshot_write_next(struct snapshot_handle *handle, size_t count);
135extern void snapshot_write_finalize(struct snapshot_handle *handle); 136extern void snapshot_write_finalize(struct snapshot_handle *handle);
136extern int snapshot_image_loaded(struct snapshot_handle *handle); 137extern int snapshot_image_loaded(struct snapshot_handle *handle);
137 138
138/*
139 * This structure is used to pass the values needed for the identification
140 * of the resume swap area from a user space to the kernel via the
141 * SNAPSHOT_SET_SWAP_AREA ioctl
142 */
143struct resume_swap_area {
144 loff_t offset;
145 u_int32_t dev;
146} __attribute__((packed));
147
148#define SNAPSHOT_IOC_MAGIC '3'
149#define SNAPSHOT_FREEZE _IO(SNAPSHOT_IOC_MAGIC, 1)
150#define SNAPSHOT_UNFREEZE _IO(SNAPSHOT_IOC_MAGIC, 2)
151#define SNAPSHOT_ATOMIC_SNAPSHOT _IOW(SNAPSHOT_IOC_MAGIC, 3, void *)
152#define SNAPSHOT_ATOMIC_RESTORE _IO(SNAPSHOT_IOC_MAGIC, 4)
153#define SNAPSHOT_FREE _IO(SNAPSHOT_IOC_MAGIC, 5)
154#define SNAPSHOT_SET_IMAGE_SIZE _IOW(SNAPSHOT_IOC_MAGIC, 6, unsigned long)
155#define SNAPSHOT_AVAIL_SWAP _IOR(SNAPSHOT_IOC_MAGIC, 7, void *)
156#define SNAPSHOT_GET_SWAP_PAGE _IOR(SNAPSHOT_IOC_MAGIC, 8, void *)
157#define SNAPSHOT_FREE_SWAP_PAGES _IO(SNAPSHOT_IOC_MAGIC, 9)
158#define SNAPSHOT_SET_SWAP_FILE _IOW(SNAPSHOT_IOC_MAGIC, 10, unsigned int)
159#define SNAPSHOT_S2RAM _IO(SNAPSHOT_IOC_MAGIC, 11)
160#define SNAPSHOT_PMOPS _IOW(SNAPSHOT_IOC_MAGIC, 12, unsigned int)
161#define SNAPSHOT_SET_SWAP_AREA _IOW(SNAPSHOT_IOC_MAGIC, 13, \
162 struct resume_swap_area)
163#define SNAPSHOT_IOC_MAXNR 13
164
165#define PMOPS_PREPARE 1
166#define PMOPS_ENTER 2
167#define PMOPS_FINISH 3
168
169/* If unset, the snapshot device cannot be open. */ 139/* If unset, the snapshot device cannot be open. */
170extern atomic_t snapshot_device_available; 140extern atomic_t snapshot_device_available;
171 141
@@ -183,7 +153,6 @@ extern int swsusp_swap_in_use(void);
183extern int swsusp_check(void); 153extern int swsusp_check(void);
184extern int swsusp_shrink_memory(void); 154extern int swsusp_shrink_memory(void);
185extern void swsusp_free(void); 155extern void swsusp_free(void);
186extern int swsusp_resume(void);
187extern int swsusp_read(unsigned int *flags_p); 156extern int swsusp_read(unsigned int *flags_p);
188extern int swsusp_write(unsigned int flags); 157extern int swsusp_write(unsigned int flags);
189extern void swsusp_close(void); 158extern void swsusp_close(void);
@@ -203,11 +172,56 @@ static inline int suspend_devices_and_enter(suspend_state_t state)
203} 172}
204#endif /* !CONFIG_SUSPEND */ 173#endif /* !CONFIG_SUSPEND */
205 174
206/* kernel/power/common.c */ 175#ifdef CONFIG_PM_SLEEP
207extern struct blocking_notifier_head pm_chain_head; 176/* kernel/power/main.c */
177extern int pm_notifier_call_chain(unsigned long val);
178#endif
179
180#ifdef CONFIG_HIGHMEM
181unsigned int count_highmem_pages(void);
182int restore_highmem(void);
183#else
184static inline unsigned int count_highmem_pages(void) { return 0; }
185static inline int restore_highmem(void) { return 0; }
186#endif
187
188/*
189 * Suspend test levels
190 */
191enum {
192 /* keep first */
193 TEST_NONE,
194 TEST_CORE,
195 TEST_CPUS,
196 TEST_PLATFORM,
197 TEST_DEVICES,
198 TEST_FREEZER,
199 /* keep last */
200 __TEST_AFTER_LAST
201};
208 202
209static inline int pm_notifier_call_chain(unsigned long val) 203#define TEST_FIRST TEST_NONE
204#define TEST_MAX (__TEST_AFTER_LAST - 1)
205
206extern int pm_test_level;
207
208#ifdef CONFIG_SUSPEND_FREEZER
209static inline int suspend_freeze_processes(void)
210{
211 return freeze_processes();
212}
213
214static inline void suspend_thaw_processes(void)
210{ 215{
211 return (blocking_notifier_call_chain(&pm_chain_head, val, NULL) 216 thaw_processes();
212 == NOTIFY_BAD) ? -EINVAL : 0;
213} 217}
218#else
219static inline int suspend_freeze_processes(void)
220{
221 return 0;
222}
223
224static inline void suspend_thaw_processes(void)
225{
226}
227#endif
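
Note on the power.h changes above: power_attr() now emits a struct kobj_attribute rather than a subsys_attribute, matching the new show/store signatures used throughout the patch. Expanded by hand, power_attr(pm_test) comes out roughly as:

	static struct kobj_attribute pm_test_attr = {
	        .attr = {
	                .name = "pm_test",
	                .mode = 0644,
	        },
	        .show  = pm_test_show,
	        .store = pm_test_store,
	};

The TEST_* enum is deliberately bracketed by the "keep first"/"keep last" markers so that TEST_FIRST, TEST_MAX and the pm_tests[] string table in main.c stay in sync if levels are added later.
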
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 6533923e711b..7c2118f9597f 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -86,9 +86,9 @@ static void fake_signal_wake_up(struct task_struct *p, int resume)
86 86
87static void send_fake_signal(struct task_struct *p) 87static void send_fake_signal(struct task_struct *p)
88{ 88{
89 if (p->state == TASK_STOPPED) 89 if (task_is_stopped(p))
90 force_sig_specific(SIGSTOP, p); 90 force_sig_specific(SIGSTOP, p);
91 fake_signal_wake_up(p, p->state == TASK_STOPPED); 91 fake_signal_wake_up(p, task_is_stopped(p));
92} 92}
93 93
94static int has_mm(struct task_struct *p) 94static int has_mm(struct task_struct *p)
@@ -182,7 +182,7 @@ static int try_to_freeze_tasks(int freeze_user_space)
182 if (frozen(p) || !freezeable(p)) 182 if (frozen(p) || !freezeable(p))
183 continue; 183 continue;
184 184
185 if (p->state == TASK_TRACED && frozen(p->parent)) { 185 if (task_is_traced(p) && frozen(p->parent)) {
186 cancel_freezing(p); 186 cancel_freezing(p);
187 continue; 187 continue;
188 } 188 }
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 78039b477d2b..f6a5df934f8d 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -635,7 +635,7 @@ __register_nosave_region(unsigned long start_pfn, unsigned long end_pfn,
635 region->end_pfn = end_pfn; 635 region->end_pfn = end_pfn;
636 list_add_tail(&region->list, &nosave_regions); 636 list_add_tail(&region->list, &nosave_regions);
637 Report: 637 Report:
638 printk("swsusp: Registered nosave memory region: %016lx - %016lx\n", 638 printk(KERN_INFO "PM: Registered nosave memory: %016lx - %016lx\n",
639 start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT); 639 start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT);
640} 640}
641 641
@@ -704,7 +704,7 @@ static void mark_nosave_pages(struct memory_bitmap *bm)
704 list_for_each_entry(region, &nosave_regions, list) { 704 list_for_each_entry(region, &nosave_regions, list) {
705 unsigned long pfn; 705 unsigned long pfn;
706 706
707 printk("swsusp: Marking nosave pages: %016lx - %016lx\n", 707 pr_debug("PM: Marking nosave pages: %016lx - %016lx\n",
708 region->start_pfn << PAGE_SHIFT, 708 region->start_pfn << PAGE_SHIFT,
709 region->end_pfn << PAGE_SHIFT); 709 region->end_pfn << PAGE_SHIFT);
710 710
@@ -749,7 +749,7 @@ int create_basic_memory_bitmaps(void)
749 free_pages_map = bm2; 749 free_pages_map = bm2;
750 mark_nosave_pages(forbidden_pages_map); 750 mark_nosave_pages(forbidden_pages_map);
751 751
752 printk("swsusp: Basic memory bitmaps created\n"); 752 pr_debug("PM: Basic memory bitmaps created\n");
753 753
754 return 0; 754 return 0;
755 755
@@ -784,7 +784,7 @@ void free_basic_memory_bitmaps(void)
784 memory_bm_free(bm2, PG_UNSAFE_CLEAR); 784 memory_bm_free(bm2, PG_UNSAFE_CLEAR);
785 kfree(bm2); 785 kfree(bm2);
786 786
787 printk("swsusp: Basic memory bitmaps freed\n"); 787 pr_debug("PM: Basic memory bitmaps freed\n");
788} 788}
789 789
790/** 790/**
@@ -872,7 +872,6 @@ unsigned int count_highmem_pages(void)
872} 872}
873#else 873#else
874static inline void *saveable_highmem_page(unsigned long pfn) { return NULL; } 874static inline void *saveable_highmem_page(unsigned long pfn) { return NULL; }
875static inline unsigned int count_highmem_pages(void) { return 0; }
876#endif /* CONFIG_HIGHMEM */ 875#endif /* CONFIG_HIGHMEM */
877 876
878/** 877/**
@@ -1089,7 +1088,7 @@ static int enough_free_mem(unsigned int nr_pages, unsigned int nr_highmem)
1089 } 1088 }
1090 1089
1091 nr_pages += count_pages_for_highmem(nr_highmem); 1090 nr_pages += count_pages_for_highmem(nr_highmem);
1092 pr_debug("swsusp: Normal pages needed: %u + %u + %u, available pages: %u\n", 1091 pr_debug("PM: Normal pages needed: %u + %u + %u, available pages: %u\n",
1093 nr_pages, PAGES_FOR_IO, meta, free); 1092 nr_pages, PAGES_FOR_IO, meta, free);
1094 1093
1095 return free > nr_pages + PAGES_FOR_IO + meta; 1094 return free > nr_pages + PAGES_FOR_IO + meta;
@@ -1202,20 +1201,20 @@ asmlinkage int swsusp_save(void)
1202{ 1201{
1203 unsigned int nr_pages, nr_highmem; 1202 unsigned int nr_pages, nr_highmem;
1204 1203
1205 printk("swsusp: critical section: \n"); 1204 printk(KERN_INFO "PM: Creating hibernation image: \n");
1206 1205
1207 drain_local_pages(); 1206 drain_local_pages();
1208 nr_pages = count_data_pages(); 1207 nr_pages = count_data_pages();
1209 nr_highmem = count_highmem_pages(); 1208 nr_highmem = count_highmem_pages();
1210 printk("swsusp: Need to copy %u pages\n", nr_pages + nr_highmem); 1209 printk(KERN_INFO "PM: Need to copy %u pages\n", nr_pages + nr_highmem);
1211 1210
1212 if (!enough_free_mem(nr_pages, nr_highmem)) { 1211 if (!enough_free_mem(nr_pages, nr_highmem)) {
1213 printk(KERN_ERR "swsusp: Not enough free memory\n"); 1212 printk(KERN_ERR "PM: Not enough free memory\n");
1214 return -ENOMEM; 1213 return -ENOMEM;
1215 } 1214 }
1216 1215
1217 if (swsusp_alloc(&orig_bm, &copy_bm, nr_pages, nr_highmem)) { 1216 if (swsusp_alloc(&orig_bm, &copy_bm, nr_pages, nr_highmem)) {
1218 printk(KERN_ERR "swsusp: Memory allocation failed\n"); 1217 printk(KERN_ERR "PM: Memory allocation failed\n");
1219 return -ENOMEM; 1218 return -ENOMEM;
1220 } 1219 }
1221 1220
@@ -1235,7 +1234,8 @@ asmlinkage int swsusp_save(void)
1235 nr_copy_pages = nr_pages; 1234 nr_copy_pages = nr_pages;
1236 nr_meta_pages = DIV_ROUND_UP(nr_pages * sizeof(long), PAGE_SIZE); 1235 nr_meta_pages = DIV_ROUND_UP(nr_pages * sizeof(long), PAGE_SIZE);
1237 1236
1238 printk("swsusp: critical section: done (%d pages copied)\n", nr_pages); 1237 printk(KERN_INFO "PM: Hibernation image created (%d pages copied)\n",
1238 nr_pages);
1239 1239
1240 return 0; 1240 return 0;
1241} 1241}
@@ -1264,12 +1264,17 @@ static char *check_image_kernel(struct swsusp_info *info)
1264} 1264}
1265#endif /* CONFIG_ARCH_HIBERNATION_HEADER */ 1265#endif /* CONFIG_ARCH_HIBERNATION_HEADER */
1266 1266
1267unsigned long snapshot_get_image_size(void)
1268{
1269 return nr_copy_pages + nr_meta_pages + 1;
1270}
1271
1267static int init_header(struct swsusp_info *info) 1272static int init_header(struct swsusp_info *info)
1268{ 1273{
1269 memset(info, 0, sizeof(struct swsusp_info)); 1274 memset(info, 0, sizeof(struct swsusp_info));
1270 info->num_physpages = num_physpages; 1275 info->num_physpages = num_physpages;
1271 info->image_pages = nr_copy_pages; 1276 info->image_pages = nr_copy_pages;
1272 info->pages = nr_copy_pages + nr_meta_pages + 1; 1277 info->pages = snapshot_get_image_size();
1273 info->size = info->pages; 1278 info->size = info->pages;
1274 info->size <<= PAGE_SHIFT; 1279 info->size <<= PAGE_SHIFT;
1275 return init_header_complete(info); 1280 return init_header_complete(info);
@@ -1429,7 +1434,7 @@ static int check_header(struct swsusp_info *info)
1429 if (!reason && info->num_physpages != num_physpages) 1434 if (!reason && info->num_physpages != num_physpages)
1430 reason = "memory size"; 1435 reason = "memory size";
1431 if (reason) { 1436 if (reason) {
1432 printk(KERN_ERR "swsusp: Resume mismatch: %s\n", reason); 1437 printk(KERN_ERR "PM: Image mismatch: %s\n", reason);
1433 return -EPERM; 1438 return -EPERM;
1434 } 1439 }
1435 return 0; 1440 return 0;
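
Note on the snapshot.c changes above: the new snapshot_get_image_size() reports the image size in pages (data pages plus metadata pages plus the header page); callers convert that to bytes. A fragment-level sketch of the conversion as the SNAPSHOT_GET_IMAGE_SIZE ioctl in kernel/power/user.c (further down) performs it; image_bytes() is a made-up wrapper name:

	static loff_t image_bytes(void)
	{
	        loff_t size = snapshot_get_image_size(); /* pages */

	        return size << PAGE_SHIFT;               /* bytes */
	}
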
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 917aba100575..a0abf9a463f9 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -28,8 +28,6 @@
28 28
29#include "power.h" 29#include "power.h"
30 30
31extern char resume_file[];
32
33#define SWSUSP_SIG "S1SUSPEND" 31#define SWSUSP_SIG "S1SUSPEND"
34 32
35struct swsusp_header { 33struct swsusp_header {
@@ -73,7 +71,8 @@ static int submit(int rw, pgoff_t page_off, struct page *page,
73 bio->bi_end_io = end_swap_bio_read; 71 bio->bi_end_io = end_swap_bio_read;
74 72
75 if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { 73 if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
76 printk("swsusp: ERROR: adding page to bio at %ld\n", page_off); 74 printk(KERN_ERR "PM: Adding page to bio failed at %ld\n",
75 page_off);
77 bio_put(bio); 76 bio_put(bio);
78 return -EFAULT; 77 return -EFAULT;
79 } 78 }
@@ -153,7 +152,7 @@ static int mark_swapfiles(sector_t start, unsigned int flags)
153 error = bio_write_page(swsusp_resume_block, 152 error = bio_write_page(swsusp_resume_block,
154 swsusp_header, NULL); 153 swsusp_header, NULL);
155 } else { 154 } else {
156 printk(KERN_ERR "swsusp: Swap header not found!\n"); 155 printk(KERN_ERR "PM: Swap header not found!\n");
157 error = -ENODEV; 156 error = -ENODEV;
158 } 157 }
159 return error; 158 return error;
@@ -325,7 +324,8 @@ static int save_image(struct swap_map_handle *handle,
325 struct timeval start; 324 struct timeval start;
326 struct timeval stop; 325 struct timeval stop;
327 326
328 printk("Saving image data pages (%u pages) ... ", nr_to_write); 327 printk(KERN_INFO "PM: Saving image data pages (%u pages) ... ",
328 nr_to_write);
329 m = nr_to_write / 100; 329 m = nr_to_write / 100;
330 if (!m) 330 if (!m)
331 m = 1; 331 m = 1;
@@ -365,7 +365,7 @@ static int enough_swap(unsigned int nr_pages)
365{ 365{
366 unsigned int free_swap = count_swap_pages(root_swap, 1); 366 unsigned int free_swap = count_swap_pages(root_swap, 1);
367 367
368 pr_debug("swsusp: free swap pages: %u\n", free_swap); 368 pr_debug("PM: Free swap pages: %u\n", free_swap);
369 return free_swap > nr_pages + PAGES_FOR_IO; 369 return free_swap > nr_pages + PAGES_FOR_IO;
370} 370}
371 371
@@ -388,7 +388,7 @@ int swsusp_write(unsigned int flags)
388 388
389 error = swsusp_swap_check(); 389 error = swsusp_swap_check();
390 if (error) { 390 if (error) {
391 printk(KERN_ERR "swsusp: Cannot find swap device, try " 391 printk(KERN_ERR "PM: Cannot find swap device, try "
392 "swapon -a.\n"); 392 "swapon -a.\n");
393 return error; 393 return error;
394 } 394 }
@@ -402,7 +402,7 @@ int swsusp_write(unsigned int flags)
402 } 402 }
403 header = (struct swsusp_info *)data_of(snapshot); 403 header = (struct swsusp_info *)data_of(snapshot);
404 if (!enough_swap(header->pages)) { 404 if (!enough_swap(header->pages)) {
405 printk(KERN_ERR "swsusp: Not enough free swap\n"); 405 printk(KERN_ERR "PM: Not enough free swap\n");
406 error = -ENOSPC; 406 error = -ENOSPC;
407 goto out; 407 goto out;
408 } 408 }
@@ -417,7 +417,7 @@ int swsusp_write(unsigned int flags)
417 417
418 if (!error) { 418 if (!error) {
419 flush_swap_writer(&handle); 419 flush_swap_writer(&handle);
420 printk("S"); 420 printk(KERN_INFO "PM: S");
421 error = mark_swapfiles(start, flags); 421 error = mark_swapfiles(start, flags);
422 printk("|\n"); 422 printk("|\n");
423 } 423 }
@@ -507,7 +507,8 @@ static int load_image(struct swap_map_handle *handle,
507 int err2; 507 int err2;
508 unsigned nr_pages; 508 unsigned nr_pages;
509 509
510 printk("Loading image data pages (%u pages) ... ", nr_to_read); 510 printk(KERN_INFO "PM: Loading image data pages (%u pages) ... ",
511 nr_to_read);
511 m = nr_to_read / 100; 512 m = nr_to_read / 100;
512 if (!m) 513 if (!m)
513 m = 1; 514 m = 1;
@@ -558,7 +559,7 @@ int swsusp_read(unsigned int *flags_p)
558 559
559 *flags_p = swsusp_header->flags; 560 *flags_p = swsusp_header->flags;
560 if (IS_ERR(resume_bdev)) { 561 if (IS_ERR(resume_bdev)) {
561 pr_debug("swsusp: block device not initialised\n"); 562 pr_debug("PM: Image device not initialised\n");
562 return PTR_ERR(resume_bdev); 563 return PTR_ERR(resume_bdev);
563 } 564 }
564 565
@@ -577,9 +578,9 @@ int swsusp_read(unsigned int *flags_p)
577 blkdev_put(resume_bdev); 578 blkdev_put(resume_bdev);
578 579
579 if (!error) 580 if (!error)
580 pr_debug("swsusp: Reading resume file was successful\n"); 581 pr_debug("PM: Image successfully loaded\n");
581 else 582 else
582 pr_debug("swsusp: Error %d resuming\n", error); 583 pr_debug("PM: Error %d resuming\n", error);
583 return error; 584 return error;
584} 585}
585 586
@@ -611,13 +612,13 @@ int swsusp_check(void)
611 if (error) 612 if (error)
612 blkdev_put(resume_bdev); 613 blkdev_put(resume_bdev);
613 else 614 else
614 pr_debug("swsusp: Signature found, resuming\n"); 615 pr_debug("PM: Signature found, resuming\n");
615 } else { 616 } else {
616 error = PTR_ERR(resume_bdev); 617 error = PTR_ERR(resume_bdev);
617 } 618 }
618 619
619 if (error) 620 if (error)
620 pr_debug("swsusp: Error %d check for resume file\n", error); 621 pr_debug("PM: Error %d checking image file\n", error);
621 622
622 return error; 623 return error;
623} 624}
@@ -629,7 +630,7 @@ int swsusp_check(void)
629void swsusp_close(void) 630void swsusp_close(void)
630{ 631{
631 if (IS_ERR(resume_bdev)) { 632 if (IS_ERR(resume_bdev)) {
632 pr_debug("swsusp: block device not initialised\n"); 633 pr_debug("PM: Image device not initialised\n");
633 return; 634 return;
634 } 635 }
635 636
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index e1722d3155f1..023ff2a31d89 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -64,14 +64,6 @@ unsigned long image_size = 500 * 1024 * 1024;
64 64
65int in_suspend __nosavedata = 0; 65int in_suspend __nosavedata = 0;
66 66
67#ifdef CONFIG_HIGHMEM
68unsigned int count_highmem_pages(void);
69int restore_highmem(void);
70#else
71static inline int restore_highmem(void) { return 0; }
72static inline unsigned int count_highmem_pages(void) { return 0; }
73#endif
74
75/** 67/**
76 * The following functions are used for tracing the allocated 68 * The following functions are used for tracing the allocated
77 * swap pages, so that they can be freed in case of an error. 69 * swap pages, so that they can be freed in case of an error.
@@ -196,7 +188,8 @@ void swsusp_show_speed(struct timeval *start, struct timeval *stop,
196 centisecs = 1; /* avoid div-by-zero */ 188 centisecs = 1; /* avoid div-by-zero */
197 k = nr_pages * (PAGE_SIZE / 1024); 189 k = nr_pages * (PAGE_SIZE / 1024);
198 kps = (k * 100) / centisecs; 190 kps = (k * 100) / centisecs;
199 printk("%s %d kbytes in %d.%02d seconds (%d.%02d MB/s)\n", msg, k, 191 printk(KERN_INFO "PM: %s %d kbytes in %d.%02d seconds (%d.%02d MB/s)\n",
192 msg, k,
200 centisecs / 100, centisecs % 100, 193 centisecs / 100, centisecs % 100,
201 kps / 1000, (kps % 1000) / 10); 194 kps / 1000, (kps % 1000) / 10);
202} 195}
@@ -227,7 +220,7 @@ int swsusp_shrink_memory(void)
227 char *p = "-\\|/"; 220 char *p = "-\\|/";
228 struct timeval start, stop; 221 struct timeval start, stop;
229 222
230 printk("Shrinking memory... "); 223 printk(KERN_INFO "PM: Shrinking memory... ");
231 do_gettimeofday(&start); 224 do_gettimeofday(&start);
232 do { 225 do {
233 long size, highmem_size; 226 long size, highmem_size;
@@ -269,38 +262,3 @@ int swsusp_shrink_memory(void)
269 262
270 return 0; 263 return 0;
271} 264}
272
273int swsusp_resume(void)
274{
275 int error;
276
277 local_irq_disable();
278 /* NOTE: device_power_down() is just a suspend() with irqs off;
279 * it has no special "power things down" semantics
280 */
281 if (device_power_down(PMSG_PRETHAW))
282 printk(KERN_ERR "Some devices failed to power down, very bad\n");
283 /* We'll ignore saved state, but this gets preempt count (etc) right */
284 save_processor_state();
285 error = restore_highmem();
286 if (!error) {
287 error = swsusp_arch_resume();
288 /* The code below is only ever reached in case of a failure.
289 * Otherwise execution continues at place where
290 * swsusp_arch_suspend() was called
291 */
292 BUG_ON(!error);
293 /* This call to restore_highmem() undos the previous one */
294 restore_highmem();
295 }
296 /* The only reason why swsusp_arch_resume() can fail is memory being
297 * very tight, so we have to free it as soon as we can to avoid
298 * subsequent failures
299 */
300 swsusp_free();
301 restore_processor_state();
302 touch_softlockup_watchdog();
303 device_power_up();
304 local_irq_enable();
305 return error;
306}
diff --git a/kernel/power/user.c b/kernel/power/user.c
index 5bd321bcbb75..f5512cb3aa86 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -28,6 +28,29 @@
28 28
29#include "power.h" 29#include "power.h"
30 30
31/*
32 * NOTE: The SNAPSHOT_SET_SWAP_FILE and SNAPSHOT_PMOPS ioctls are obsolete and
33 * will be removed in the future. They are only preserved here for
34 * compatibility with existing userland utilities.
35 */
36#define SNAPSHOT_SET_SWAP_FILE _IOW(SNAPSHOT_IOC_MAGIC, 10, unsigned int)
37#define SNAPSHOT_PMOPS _IOW(SNAPSHOT_IOC_MAGIC, 12, unsigned int)
38
39#define PMOPS_PREPARE 1
40#define PMOPS_ENTER 2
41#define PMOPS_FINISH 3
42
43/*
44 * NOTE: The following ioctl definitions are wrong and have been replaced with
45 * correct ones. They are only preserved here for compatibility with existing
46 * userland utilities and will be removed in the future.
47 */
48#define SNAPSHOT_ATOMIC_SNAPSHOT _IOW(SNAPSHOT_IOC_MAGIC, 3, void *)
49#define SNAPSHOT_SET_IMAGE_SIZE _IOW(SNAPSHOT_IOC_MAGIC, 6, unsigned long)
50#define SNAPSHOT_AVAIL_SWAP _IOR(SNAPSHOT_IOC_MAGIC, 7, void *)
51#define SNAPSHOT_GET_SWAP_PAGE _IOR(SNAPSHOT_IOC_MAGIC, 8, void *)
52
53
31#define SNAPSHOT_MINOR 231 54#define SNAPSHOT_MINOR 231
32 55
33static struct snapshot_data { 56static struct snapshot_data {
@@ -36,7 +59,7 @@ static struct snapshot_data {
36 int mode; 59 int mode;
37 char frozen; 60 char frozen;
38 char ready; 61 char ready;
39 char platform_suspend; 62 char platform_support;
40} snapshot_state; 63} snapshot_state;
41 64
42atomic_t snapshot_device_available = ATOMIC_INIT(1); 65atomic_t snapshot_device_available = ATOMIC_INIT(1);
@@ -44,6 +67,7 @@ atomic_t snapshot_device_available = ATOMIC_INIT(1);
44static int snapshot_open(struct inode *inode, struct file *filp) 67static int snapshot_open(struct inode *inode, struct file *filp)
45{ 68{
46 struct snapshot_data *data; 69 struct snapshot_data *data;
70 int error;
47 71
48 if (!atomic_add_unless(&snapshot_device_available, -1, 0)) 72 if (!atomic_add_unless(&snapshot_device_available, -1, 0))
49 return -EBUSY; 73 return -EBUSY;
@@ -64,13 +88,23 @@ static int snapshot_open(struct inode *inode, struct file *filp)
64 data->swap = swsusp_resume_device ? 88 data->swap = swsusp_resume_device ?
65 swap_type_of(swsusp_resume_device, 0, NULL) : -1; 89 swap_type_of(swsusp_resume_device, 0, NULL) : -1;
66 data->mode = O_RDONLY; 90 data->mode = O_RDONLY;
91 error = pm_notifier_call_chain(PM_RESTORE_PREPARE);
92 if (error)
93 pm_notifier_call_chain(PM_POST_RESTORE);
67 } else { 94 } else {
68 data->swap = -1; 95 data->swap = -1;
69 data->mode = O_WRONLY; 96 data->mode = O_WRONLY;
97 error = pm_notifier_call_chain(PM_HIBERNATION_PREPARE);
98 if (error)
99 pm_notifier_call_chain(PM_POST_HIBERNATION);
100 }
101 if (error) {
102 atomic_inc(&snapshot_device_available);
103 return error;
70 } 104 }
71 data->frozen = 0; 105 data->frozen = 0;
72 data->ready = 0; 106 data->ready = 0;
73 data->platform_suspend = 0; 107 data->platform_support = 0;
74 108
75 return 0; 109 return 0;
76} 110}
@@ -88,6 +122,8 @@ static int snapshot_release(struct inode *inode, struct file *filp)
88 thaw_processes(); 122 thaw_processes();
89 mutex_unlock(&pm_mutex); 123 mutex_unlock(&pm_mutex);
90 } 124 }
125 pm_notifier_call_chain(data->mode == O_WRONLY ?
126 PM_POST_HIBERNATION : PM_POST_RESTORE);
91 atomic_inc(&snapshot_device_available); 127 atomic_inc(&snapshot_device_available);
92 return 0; 128 return 0;
93} 129}
@@ -133,7 +169,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
133{ 169{
134 int error = 0; 170 int error = 0;
135 struct snapshot_data *data; 171 struct snapshot_data *data;
136 loff_t avail; 172 loff_t size;
137 sector_t offset; 173 sector_t offset;
138 174
139 if (_IOC_TYPE(cmd) != SNAPSHOT_IOC_MAGIC) 175 if (_IOC_TYPE(cmd) != SNAPSHOT_IOC_MAGIC)
@@ -151,18 +187,13 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
151 if (data->frozen) 187 if (data->frozen)
152 break; 188 break;
153 mutex_lock(&pm_mutex); 189 mutex_lock(&pm_mutex);
154 error = pm_notifier_call_chain(PM_HIBERNATION_PREPARE); 190 printk("Syncing filesystems ... ");
155 if (!error) { 191 sys_sync();
156 printk("Syncing filesystems ... "); 192 printk("done.\n");
157 sys_sync(); 193
158 printk("done.\n"); 194 error = freeze_processes();
159
160 error = freeze_processes();
161 if (error)
162 thaw_processes();
163 }
164 if (error) 195 if (error)
165 pm_notifier_call_chain(PM_POST_HIBERNATION); 196 thaw_processes();
166 mutex_unlock(&pm_mutex); 197 mutex_unlock(&pm_mutex);
167 if (!error) 198 if (!error)
168 data->frozen = 1; 199 data->frozen = 1;
@@ -173,19 +204,19 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
173 break; 204 break;
174 mutex_lock(&pm_mutex); 205 mutex_lock(&pm_mutex);
175 thaw_processes(); 206 thaw_processes();
176 pm_notifier_call_chain(PM_POST_HIBERNATION);
177 mutex_unlock(&pm_mutex); 207 mutex_unlock(&pm_mutex);
178 data->frozen = 0; 208 data->frozen = 0;
179 break; 209 break;
180 210
211 case SNAPSHOT_CREATE_IMAGE:
181 case SNAPSHOT_ATOMIC_SNAPSHOT: 212 case SNAPSHOT_ATOMIC_SNAPSHOT:
182 if (data->mode != O_RDONLY || !data->frozen || data->ready) { 213 if (data->mode != O_RDONLY || !data->frozen || data->ready) {
183 error = -EPERM; 214 error = -EPERM;
184 break; 215 break;
185 } 216 }
186 error = hibernation_snapshot(data->platform_suspend); 217 error = hibernation_snapshot(data->platform_support);
187 if (!error) 218 if (!error)
188 error = put_user(in_suspend, (unsigned int __user *)arg); 219 error = put_user(in_suspend, (int __user *)arg);
189 if (!error) 220 if (!error)
190 data->ready = 1; 221 data->ready = 1;
191 break; 222 break;
@@ -197,7 +228,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
197 error = -EPERM; 228 error = -EPERM;
198 break; 229 break;
199 } 230 }
200 error = hibernation_restore(data->platform_suspend); 231 error = hibernation_restore(data->platform_support);
201 break; 232 break;
202 233
203 case SNAPSHOT_FREE: 234 case SNAPSHOT_FREE:
@@ -206,16 +237,29 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
206 data->ready = 0; 237 data->ready = 0;
207 break; 238 break;
208 239
240 case SNAPSHOT_PREF_IMAGE_SIZE:
209 case SNAPSHOT_SET_IMAGE_SIZE: 241 case SNAPSHOT_SET_IMAGE_SIZE:
210 image_size = arg; 242 image_size = arg;
211 break; 243 break;
212 244
245 case SNAPSHOT_GET_IMAGE_SIZE:
246 if (!data->ready) {
247 error = -ENODATA;
248 break;
249 }
250 size = snapshot_get_image_size();
251 size <<= PAGE_SHIFT;
252 error = put_user(size, (loff_t __user *)arg);
253 break;
254
255 case SNAPSHOT_AVAIL_SWAP_SIZE:
213 case SNAPSHOT_AVAIL_SWAP: 256 case SNAPSHOT_AVAIL_SWAP:
214 avail = count_swap_pages(data->swap, 1); 257 size = count_swap_pages(data->swap, 1);
215 avail <<= PAGE_SHIFT; 258 size <<= PAGE_SHIFT;
216 error = put_user(avail, (loff_t __user *)arg); 259 error = put_user(size, (loff_t __user *)arg);
217 break; 260 break;
218 261
262 case SNAPSHOT_ALLOC_SWAP_PAGE:
219 case SNAPSHOT_GET_SWAP_PAGE: 263 case SNAPSHOT_GET_SWAP_PAGE:
220 if (data->swap < 0 || data->swap >= MAX_SWAPFILES) { 264 if (data->swap < 0 || data->swap >= MAX_SWAPFILES) {
221 error = -ENODEV; 265 error = -ENODEV;
@@ -224,7 +268,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
224 offset = alloc_swapdev_block(data->swap); 268 offset = alloc_swapdev_block(data->swap);
225 if (offset) { 269 if (offset) {
226 offset <<= PAGE_SHIFT; 270 offset <<= PAGE_SHIFT;
227 error = put_user(offset, (sector_t __user *)arg); 271 error = put_user(offset, (loff_t __user *)arg);
228 } else { 272 } else {
229 error = -ENOSPC; 273 error = -ENOSPC;
230 } 274 }
@@ -238,7 +282,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
238 free_all_swap_pages(data->swap); 282 free_all_swap_pages(data->swap);
239 break; 283 break;
240 284
241 case SNAPSHOT_SET_SWAP_FILE: 285 case SNAPSHOT_SET_SWAP_FILE: /* This ioctl is deprecated */
242 if (!swsusp_swap_in_use()) { 286 if (!swsusp_swap_in_use()) {
243 /* 287 /*
244 * User space encodes device types as two-byte values, 288 * User space encodes device types as two-byte values,
@@ -275,26 +319,33 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
275 mutex_unlock(&pm_mutex); 319 mutex_unlock(&pm_mutex);
276 break; 320 break;
277 321
278 case SNAPSHOT_PMOPS: 322 case SNAPSHOT_PLATFORM_SUPPORT:
323 data->platform_support = !!arg;
324 break;
325
326 case SNAPSHOT_POWER_OFF:
327 if (data->platform_support)
328 error = hibernation_platform_enter();
329 break;
330
331 case SNAPSHOT_PMOPS: /* This ioctl is deprecated */
279 error = -EINVAL; 332 error = -EINVAL;
280 333
281 switch (arg) { 334 switch (arg) {
282 335
283 case PMOPS_PREPARE: 336 case PMOPS_PREPARE:
284 data->platform_suspend = 1; 337 data->platform_support = 1;
285 error = 0; 338 error = 0;
286 break; 339 break;
287 340
288 case PMOPS_ENTER: 341 case PMOPS_ENTER:
289 if (data->platform_suspend) 342 if (data->platform_support)
290 error = hibernation_platform_enter(); 343 error = hibernation_platform_enter();
291
292 break; 344 break;
293 345
294 case PMOPS_FINISH: 346 case PMOPS_FINISH:
295 if (data->platform_suspend) 347 if (data->platform_support)
296 error = 0; 348 error = 0;
297
298 break; 349 break;
299 350
300 default: 351 default:
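
Note on the user.c changes above: several snapshot ioctls gain new names (SNAPSHOT_CREATE_IMAGE, SNAPSHOT_GET_IMAGE_SIZE, SNAPSHOT_PLATFORM_SUPPORT, SNAPSHOT_POWER_OFF, ...) while the old numbers stay around for compatibility, and the PM notifier calls move into open/release. A hedged userspace sketch of querying the image size; it assumes the conventional /dev/snapshot node for this misc device and the ioctl definitions now coming from <linux/suspend_ioctls.h>, which power.h pulls in:

	#include <stdio.h>
	#include <fcntl.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <linux/suspend_ioctls.h>

	int main(void)
	{
	        long long size;         /* the kernel writes a 64-bit loff_t */
	        int fd = open("/dev/snapshot", O_RDONLY);

	        if (fd < 0) {
	                perror("open");
	                return 1;
	        }
	        /* Only valid once an image exists (-ENODATA otherwise). */
	        if (ioctl(fd, SNAPSHOT_GET_IMAGE_SIZE, &size) == 0)
	                printf("image size: %lld bytes\n", size);
	        close(fd);
	        return 0;
	}
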
diff --git a/kernel/printk.c b/kernel/printk.c
index a30fe33de395..29ae1e99cde0 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -36,6 +36,13 @@
36 36
37#include <asm/uaccess.h> 37#include <asm/uaccess.h>
38 38
39/*
40 * Architectures can override it:
41 */
42void __attribute__((weak)) early_printk(const char *fmt, ...)
43{
44}
45
39#define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT) 46#define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT)
40 47
41/* printk's without a loglevel use this.. */ 48/* printk's without a loglevel use this.. */
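
Note on the hunk above: the new early_printk() stub is a weak symbol, so an architecture that provides a real implementation overrides it at link time. The same pattern in miniature, as ordinary userspace C split across two hypothetical files:

	/* weak_default.c - fallback, used only if no strong definition is linked */
	#include <stdio.h>

	void __attribute__((weak)) early_printk(const char *fmt, ...)
	{
	        /* no early console available: silently do nothing */
	}

	/* arch_override.c - strong definition; wins over the weak one at link time */
	#include <stdarg.h>
	#include <stdio.h>

	void early_printk(const char *fmt, ...)
	{
	        va_list ap;

	        va_start(ap, fmt);
	        vfprintf(stderr, fmt, ap);
	        va_end(ap);
	}

	int main(void)
	{
	        early_printk("early: %d\n", 42);  /* prints via the override */
	        return 0;
	}

Linking both objects, the strong definition is the one that gets called.
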
@@ -448,10 +455,10 @@ static int __init ignore_loglevel_setup(char *str)
448 ignore_loglevel = 1; 455 ignore_loglevel = 1;
449 printk(KERN_INFO "debug: ignoring loglevel setting.\n"); 456 printk(KERN_INFO "debug: ignoring loglevel setting.\n");
450 457
451 return 1; 458 return 0;
452} 459}
453 460
454__setup("ignore_loglevel", ignore_loglevel_setup); 461early_param("ignore_loglevel", ignore_loglevel_setup);
455 462
456/* 463/*
457 * Write out chars from start to end - 1 inclusive 464 * Write out chars from start to end - 1 inclusive
@@ -573,11 +580,6 @@ static int __init printk_time_setup(char *str)
573 580
574__setup("time", printk_time_setup); 581__setup("time", printk_time_setup);
575 582
576__attribute__((weak)) unsigned long long printk_clock(void)
577{
578 return sched_clock();
579}
580
581/* Check if we have any console registered that can be called early in boot. */ 583/* Check if we have any console registered that can be called early in boot. */
582static int have_callable_console(void) 584static int have_callable_console(void)
583{ 585{
@@ -628,30 +630,57 @@ asmlinkage int printk(const char *fmt, ...)
628/* cpu currently holding logbuf_lock */ 630/* cpu currently holding logbuf_lock */
629static volatile unsigned int printk_cpu = UINT_MAX; 631static volatile unsigned int printk_cpu = UINT_MAX;
630 632
633const char printk_recursion_bug_msg [] =
634 KERN_CRIT "BUG: recent printk recursion!\n";
635static int printk_recursion_bug;
636
631asmlinkage int vprintk(const char *fmt, va_list args) 637asmlinkage int vprintk(const char *fmt, va_list args)
632{ 638{
639 static int log_level_unknown = 1;
640 static char printk_buf[1024];
641
633 unsigned long flags; 642 unsigned long flags;
634 int printed_len; 643 int printed_len = 0;
644 int this_cpu;
635 char *p; 645 char *p;
636 static char printk_buf[1024];
637 static int log_level_unknown = 1;
638 646
639 boot_delay_msec(); 647 boot_delay_msec();
640 648
641 preempt_disable(); 649 preempt_disable();
642 if (unlikely(oops_in_progress) && printk_cpu == smp_processor_id())
643 /* If a crash is occurring during printk() on this CPU,
644 * make sure we can't deadlock */
645 zap_locks();
646
647 /* This stops the holder of console_sem just where we want him */ 650 /* This stops the holder of console_sem just where we want him */
648 raw_local_irq_save(flags); 651 raw_local_irq_save(flags);
652 this_cpu = smp_processor_id();
653
654 /*
655 * Ouch, printk recursed into itself!
656 */
657 if (unlikely(printk_cpu == this_cpu)) {
658 /*
659 * If a crash is occurring during printk() on this CPU,
660 * then try to get the crash message out but make sure
661 * we can't deadlock. Otherwise just return to avoid the
662 * recursion and return - but flag the recursion so that
663 * it can be printed at the next appropriate moment:
664 */
665 if (!oops_in_progress) {
666 printk_recursion_bug = 1;
667 goto out_restore_irqs;
668 }
669 zap_locks();
670 }
671
649 lockdep_off(); 672 lockdep_off();
650 spin_lock(&logbuf_lock); 673 spin_lock(&logbuf_lock);
651 printk_cpu = smp_processor_id(); 674 printk_cpu = this_cpu;
652 675
676 if (printk_recursion_bug) {
677 printk_recursion_bug = 0;
678 strcpy(printk_buf, printk_recursion_bug_msg);
679 printed_len = sizeof(printk_recursion_bug_msg);
680 }
653 /* Emit the output into the temporary buffer */ 681 /* Emit the output into the temporary buffer */
654 printed_len = vscnprintf(printk_buf, sizeof(printk_buf), fmt, args); 682 printed_len += vscnprintf(printk_buf + printed_len,
683 sizeof(printk_buf), fmt, args);
655 684
656 /* 685 /*
657 * Copy the output into log_buf. If the caller didn't provide 686 * Copy the output into log_buf. If the caller didn't provide
@@ -680,7 +709,7 @@ asmlinkage int vprintk(const char *fmt, va_list args)
680 loglev_char = default_message_loglevel 709 loglev_char = default_message_loglevel
681 + '0'; 710 + '0';
682 } 711 }
683 t = printk_clock(); 712 t = cpu_clock(printk_cpu);
684 nanosec_rem = do_div(t, 1000000000); 713 nanosec_rem = do_div(t, 1000000000);
685 tlen = sprintf(tbuf, 714 tlen = sprintf(tbuf,
686 "<%c>[%5lu.%06lu] ", 715 "<%c>[%5lu.%06lu] ",
@@ -744,6 +773,7 @@ asmlinkage int vprintk(const char *fmt, va_list args)
744 printk_cpu = UINT_MAX; 773 printk_cpu = UINT_MAX;
745 spin_unlock(&logbuf_lock); 774 spin_unlock(&logbuf_lock);
746 lockdep_on(); 775 lockdep_on();
776out_restore_irqs:
747 raw_local_irq_restore(flags); 777 raw_local_irq_restore(flags);
748 } 778 }
749 779
@@ -817,7 +847,7 @@ __setup("console=", console_setup);
817 * commonly to provide a default console (ie from PROM variables) when 847 * commonly to provide a default console (ie from PROM variables) when
818 * the user has not supplied one. 848 * the user has not supplied one.
819 */ 849 */
820int __init add_preferred_console(char *name, int idx, char *options) 850int add_preferred_console(char *name, int idx, char *options)
821{ 851{
822 struct console_cmdline *c; 852 struct console_cmdline *c;
823 int i; 853 int i;
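
Note on the printk.c changes above: vprintk() now detects re-entry on the CPU that already holds logbuf_lock; unless an oops is in progress it drops the recursive call, records the fact, and prepends a "recent printk recursion" marker to the next normal printk(). A userspace analogue of that pattern, using per-thread flags in place of the per-CPU owner check:

	#include <stdio.h>

	static __thread int in_log;
	static __thread int recursion_flagged;

	static void log_msg(const char *msg)
	{
	        if (in_log) {                   /* re-entered: do not recurse */
	                recursion_flagged = 1;
	                return;
	        }
	        in_log = 1;
	        if (recursion_flagged) {        /* report the earlier recursion now */
	                recursion_flagged = 0;
	                fputs("BUG: recent log recursion!\n", stderr);
	        }
	        fputs(msg, stderr);
	        in_log = 0;
	}

	int main(void)
	{
	        log_msg("hello\n");
	        return 0;
	}
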
diff --git a/kernel/profile.c b/kernel/profile.c
index 5e95330e5120..e64c2da11c0f 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -52,7 +52,7 @@ static DEFINE_PER_CPU(int, cpu_profile_flip);
52static DEFINE_MUTEX(profile_flip_mutex); 52static DEFINE_MUTEX(profile_flip_mutex);
53#endif /* CONFIG_SMP */ 53#endif /* CONFIG_SMP */
54 54
55static int __init profile_setup(char * str) 55static int __init profile_setup(char *str)
56{ 56{
57 static char __initdata schedstr[] = "schedule"; 57 static char __initdata schedstr[] = "schedule";
58 static char __initdata sleepstr[] = "sleep"; 58 static char __initdata sleepstr[] = "sleep";
@@ -104,28 +104,28 @@ __setup("profile=", profile_setup);
104 104
105void __init profile_init(void) 105void __init profile_init(void)
106{ 106{
107 if (!prof_on) 107 if (!prof_on)
108 return; 108 return;
109 109
110 /* only text is profiled */ 110 /* only text is profiled */
111 prof_len = (_etext - _stext) >> prof_shift; 111 prof_len = (_etext - _stext) >> prof_shift;
112 prof_buffer = alloc_bootmem(prof_len*sizeof(atomic_t)); 112 prof_buffer = alloc_bootmem(prof_len*sizeof(atomic_t));
113} 113}
114 114
115/* Profile event notifications */ 115/* Profile event notifications */
116 116
117#ifdef CONFIG_PROFILING 117#ifdef CONFIG_PROFILING
118 118
119static BLOCKING_NOTIFIER_HEAD(task_exit_notifier); 119static BLOCKING_NOTIFIER_HEAD(task_exit_notifier);
120static ATOMIC_NOTIFIER_HEAD(task_free_notifier); 120static ATOMIC_NOTIFIER_HEAD(task_free_notifier);
121static BLOCKING_NOTIFIER_HEAD(munmap_notifier); 121static BLOCKING_NOTIFIER_HEAD(munmap_notifier);
122 122
123void profile_task_exit(struct task_struct * task) 123void profile_task_exit(struct task_struct *task)
124{ 124{
125 blocking_notifier_call_chain(&task_exit_notifier, 0, task); 125 blocking_notifier_call_chain(&task_exit_notifier, 0, task);
126} 126}
127 127
128int profile_handoff_task(struct task_struct * task) 128int profile_handoff_task(struct task_struct *task)
129{ 129{
130 int ret; 130 int ret;
131 ret = atomic_notifier_call_chain(&task_free_notifier, 0, task); 131 ret = atomic_notifier_call_chain(&task_free_notifier, 0, task);
@@ -137,52 +137,55 @@ void profile_munmap(unsigned long addr)
137 blocking_notifier_call_chain(&munmap_notifier, 0, (void *)addr); 137 blocking_notifier_call_chain(&munmap_notifier, 0, (void *)addr);
138} 138}
139 139
140int task_handoff_register(struct notifier_block * n) 140int task_handoff_register(struct notifier_block *n)
141{ 141{
142 return atomic_notifier_chain_register(&task_free_notifier, n); 142 return atomic_notifier_chain_register(&task_free_notifier, n);
143} 143}
144EXPORT_SYMBOL_GPL(task_handoff_register);
144 145
145int task_handoff_unregister(struct notifier_block * n) 146int task_handoff_unregister(struct notifier_block *n)
146{ 147{
147 return atomic_notifier_chain_unregister(&task_free_notifier, n); 148 return atomic_notifier_chain_unregister(&task_free_notifier, n);
148} 149}
150EXPORT_SYMBOL_GPL(task_handoff_unregister);
149 151
150int profile_event_register(enum profile_type type, struct notifier_block * n) 152int profile_event_register(enum profile_type type, struct notifier_block *n)
151{ 153{
152 int err = -EINVAL; 154 int err = -EINVAL;
153 155
154 switch (type) { 156 switch (type) {
155 case PROFILE_TASK_EXIT: 157 case PROFILE_TASK_EXIT:
156 err = blocking_notifier_chain_register( 158 err = blocking_notifier_chain_register(
157 &task_exit_notifier, n); 159 &task_exit_notifier, n);
158 break; 160 break;
159 case PROFILE_MUNMAP: 161 case PROFILE_MUNMAP:
160 err = blocking_notifier_chain_register( 162 err = blocking_notifier_chain_register(
161 &munmap_notifier, n); 163 &munmap_notifier, n);
162 break; 164 break;
163 } 165 }
164 166
165 return err; 167 return err;
166} 168}
169EXPORT_SYMBOL_GPL(profile_event_register);
167 170
168 171int profile_event_unregister(enum profile_type type, struct notifier_block *n)
169int profile_event_unregister(enum profile_type type, struct notifier_block * n)
170{ 172{
171 int err = -EINVAL; 173 int err = -EINVAL;
172 174
173 switch (type) { 175 switch (type) {
174 case PROFILE_TASK_EXIT: 176 case PROFILE_TASK_EXIT:
175 err = blocking_notifier_chain_unregister( 177 err = blocking_notifier_chain_unregister(
176 &task_exit_notifier, n); 178 &task_exit_notifier, n);
177 break; 179 break;
178 case PROFILE_MUNMAP: 180 case PROFILE_MUNMAP:
179 err = blocking_notifier_chain_unregister( 181 err = blocking_notifier_chain_unregister(
180 &munmap_notifier, n); 182 &munmap_notifier, n);
181 break; 183 break;
182 } 184 }
183 185
184 return err; 186 return err;
185} 187}
188EXPORT_SYMBOL_GPL(profile_event_unregister);
186 189
187int register_timer_hook(int (*hook)(struct pt_regs *)) 190int register_timer_hook(int (*hook)(struct pt_regs *))
188{ 191{
@@ -191,6 +194,7 @@ int register_timer_hook(int (*hook)(struct pt_regs *))
191 timer_hook = hook; 194 timer_hook = hook;
192 return 0; 195 return 0;
193} 196}
197EXPORT_SYMBOL_GPL(register_timer_hook);
194 198
195void unregister_timer_hook(int (*hook)(struct pt_regs *)) 199void unregister_timer_hook(int (*hook)(struct pt_regs *))
196{ 200{
@@ -199,13 +203,7 @@ void unregister_timer_hook(int (*hook)(struct pt_regs *))
199 /* make sure all CPUs see the NULL hook */ 203 /* make sure all CPUs see the NULL hook */
200 synchronize_sched(); /* Allow ongoing interrupts to complete. */ 204 synchronize_sched(); /* Allow ongoing interrupts to complete. */
201} 205}
202
203EXPORT_SYMBOL_GPL(register_timer_hook);
204EXPORT_SYMBOL_GPL(unregister_timer_hook); 206EXPORT_SYMBOL_GPL(unregister_timer_hook);
205EXPORT_SYMBOL_GPL(task_handoff_register);
206EXPORT_SYMBOL_GPL(task_handoff_unregister);
207EXPORT_SYMBOL_GPL(profile_event_register);
208EXPORT_SYMBOL_GPL(profile_event_unregister);
209 207
210#endif /* CONFIG_PROFILING */ 208#endif /* CONFIG_PROFILING */
211 209
@@ -366,7 +364,7 @@ static int __devinit profile_cpu_callback(struct notifier_block *info,
366 per_cpu(cpu_profile_hits, cpu)[0] = page_address(page); 364 per_cpu(cpu_profile_hits, cpu)[0] = page_address(page);
367 } 365 }
368 break; 366 break;
369 out_free: 367out_free:
370 page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[1]); 368 page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[1]);
371 per_cpu(cpu_profile_hits, cpu)[1] = NULL; 369 per_cpu(cpu_profile_hits, cpu)[1] = NULL;
372 __free_page(page); 370 __free_page(page);
@@ -409,7 +407,6 @@ void profile_hits(int type, void *__pc, unsigned int nr_hits)
409 atomic_add(nr_hits, &prof_buffer[min(pc, prof_len - 1)]); 407 atomic_add(nr_hits, &prof_buffer[min(pc, prof_len - 1)]);
410} 408}
411#endif /* !CONFIG_SMP */ 409#endif /* !CONFIG_SMP */
412
413EXPORT_SYMBOL_GPL(profile_hits); 410EXPORT_SYMBOL_GPL(profile_hits);
414 411
415void profile_tick(int type) 412void profile_tick(int type)
@@ -427,7 +424,7 @@ void profile_tick(int type)
427#include <asm/uaccess.h> 424#include <asm/uaccess.h>
428#include <asm/ptrace.h> 425#include <asm/ptrace.h>
429 426
430static int prof_cpu_mask_read_proc (char *page, char **start, off_t off, 427static int prof_cpu_mask_read_proc(char *page, char **start, off_t off,
431 int count, int *eof, void *data) 428 int count, int *eof, void *data)
432{ 429{
433 int len = cpumask_scnprintf(page, count, *(cpumask_t *)data); 430 int len = cpumask_scnprintf(page, count, *(cpumask_t *)data);
@@ -437,8 +434,8 @@ static int prof_cpu_mask_read_proc (char *page, char **start, off_t off,
437 return len; 434 return len;
438} 435}
439 436
440static int prof_cpu_mask_write_proc (struct file *file, const char __user *buffer, 437static int prof_cpu_mask_write_proc(struct file *file,
441 unsigned long count, void *data) 438 const char __user *buffer, unsigned long count, void *data)
442{ 439{
443 cpumask_t *mask = (cpumask_t *)data; 440 cpumask_t *mask = (cpumask_t *)data;
444 unsigned long full_count = count, err; 441 unsigned long full_count = count, err;
@@ -457,7 +454,8 @@ void create_prof_cpu_mask(struct proc_dir_entry *root_irq_dir)
457 struct proc_dir_entry *entry; 454 struct proc_dir_entry *entry;
458 455
459 /* create /proc/irq/prof_cpu_mask */ 456 /* create /proc/irq/prof_cpu_mask */
460 if (!(entry = create_proc_entry("prof_cpu_mask", 0600, root_irq_dir))) 457 entry = create_proc_entry("prof_cpu_mask", 0600, root_irq_dir);
458 if (!entry)
461 return; 459 return;
462 entry->data = (void *)&prof_cpu_mask; 460 entry->data = (void *)&prof_cpu_mask;
463 entry->read_proc = prof_cpu_mask_read_proc; 461 entry->read_proc = prof_cpu_mask_read_proc;
@@ -475,7 +473,7 @@ read_profile(struct file *file, char __user *buf, size_t count, loff_t *ppos)
475{ 473{
476 unsigned long p = *ppos; 474 unsigned long p = *ppos;
477 ssize_t read; 475 ssize_t read;
478 char * pnt; 476 char *pnt;
479 unsigned int sample_step = 1 << prof_shift; 477 unsigned int sample_step = 1 << prof_shift;
480 478
481 profile_flip_buffers(); 479 profile_flip_buffers();
@@ -486,12 +484,12 @@ read_profile(struct file *file, char __user *buf, size_t count, loff_t *ppos)
486 read = 0; 484 read = 0;
487 485
488 while (p < sizeof(unsigned int) && count > 0) { 486 while (p < sizeof(unsigned int) && count > 0) {
489 if (put_user(*((char *)(&sample_step)+p),buf)) 487 if (put_user(*((char *)(&sample_step)+p), buf))
490 return -EFAULT; 488 return -EFAULT;
491 buf++; p++; count--; read++; 489 buf++; p++; count--; read++;
492 } 490 }
493 pnt = (char *)prof_buffer + p - sizeof(atomic_t); 491 pnt = (char *)prof_buffer + p - sizeof(atomic_t);
494 if (copy_to_user(buf,(void *)pnt,count)) 492 if (copy_to_user(buf, (void *)pnt, count))
495 return -EFAULT; 493 return -EFAULT;
496 read += count; 494 read += count;
497 *ppos += read; 495 *ppos += read;
@@ -508,7 +506,7 @@ static ssize_t write_profile(struct file *file, const char __user *buf,
508 size_t count, loff_t *ppos) 506 size_t count, loff_t *ppos)
509{ 507{
510#ifdef CONFIG_SMP 508#ifdef CONFIG_SMP
511 extern int setup_profiling_timer (unsigned int multiplier); 509 extern int setup_profiling_timer(unsigned int multiplier);
512 510
513 if (count == sizeof(int)) { 511 if (count == sizeof(int)) {
514 unsigned int multiplier; 512 unsigned int multiplier;
@@ -591,7 +589,8 @@ static int __init create_proc_profile(void)
591 return 0; 589 return 0;
592 if (create_hash_tables()) 590 if (create_hash_tables())
593 return -1; 591 return -1;
594 if (!(entry = create_proc_entry("profile", S_IWUSR | S_IRUGO, NULL))) 592 entry = create_proc_entry("profile", S_IWUSR | S_IRUGO, NULL);
593 if (!entry)
595 return 0; 594 return 0;
596 entry->proc_fops = &proc_profile_operations; 595 entry->proc_fops = &proc_profile_operations;
597 entry->size = (1+prof_len) * sizeof(atomic_t); 596 entry->size = (1+prof_len) * sizeof(atomic_t);
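
The profile.c changes above are cleanups: EXPORT_SYMBOL_GPL() moved next to each definition, assignments split out of the create_proc_entry() if-conditions, and coding-style fixes; the exported notifier API itself is unchanged. A minimal sketch of a module using that API through PROFILE_TASK_EXIT; the module and function names are hypothetical:

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/notifier.h>
#include <linux/profile.h>
#include <linux/sched.h>

/* Log a line whenever a task exits, via the exported blocking notifier. */
static int example_task_exit(struct notifier_block *nb,
                             unsigned long val, void *data)
{
        struct task_struct *task = data;        /* chain passes the task */

        printk(KERN_DEBUG "task %d (%s) exiting\n", task->pid, task->comm);
        return NOTIFY_OK;
}

static struct notifier_block example_exit_nb = {
        .notifier_call = example_task_exit,
};

static int __init example_init(void)
{
        return profile_event_register(PROFILE_TASK_EXIT, &example_exit_nb);
}

static void __exit example_cleanup(void)
{
        profile_event_unregister(PROFILE_TASK_EXIT, &example_exit_nb);
}

module_init(example_init);
module_exit(example_cleanup);
MODULE_LICENSE("GPL");
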
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 7c76f2ffaeaa..b0d4ab4dfd3d 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -51,7 +51,7 @@ void __ptrace_link(struct task_struct *child, struct task_struct *new_parent)
51void ptrace_untrace(struct task_struct *child) 51void ptrace_untrace(struct task_struct *child)
52{ 52{
53 spin_lock(&child->sighand->siglock); 53 spin_lock(&child->sighand->siglock);
54 if (child->state == TASK_TRACED) { 54 if (task_is_traced(child)) {
55 if (child->signal->flags & SIGNAL_STOP_STOPPED) { 55 if (child->signal->flags & SIGNAL_STOP_STOPPED) {
56 child->state = TASK_STOPPED; 56 child->state = TASK_STOPPED;
57 } else { 57 } else {
@@ -79,7 +79,7 @@ void __ptrace_unlink(struct task_struct *child)
79 add_parent(child); 79 add_parent(child);
80 } 80 }
81 81
82 if (child->state == TASK_TRACED) 82 if (task_is_traced(child))
83 ptrace_untrace(child); 83 ptrace_untrace(child);
84} 84}
85 85
@@ -103,9 +103,9 @@ int ptrace_check_attach(struct task_struct *child, int kill)
103 && child->signal != NULL) { 103 && child->signal != NULL) {
104 ret = 0; 104 ret = 0;
105 spin_lock_irq(&child->sighand->siglock); 105 spin_lock_irq(&child->sighand->siglock);
106 if (child->state == TASK_STOPPED) { 106 if (task_is_stopped(child)) {
107 child->state = TASK_TRACED; 107 child->state = TASK_TRACED;
108 } else if (child->state != TASK_TRACED && !kill) { 108 } else if (!task_is_traced(child) && !kill) {
109 ret = -ESRCH; 109 ret = -ESRCH;
110 } 110 }
111 spin_unlock_irq(&child->sighand->siglock); 111 spin_unlock_irq(&child->sighand->siglock);
@@ -120,7 +120,7 @@ int ptrace_check_attach(struct task_struct *child, int kill)
120 return ret; 120 return ret;
121} 121}
122 122
123static int may_attach(struct task_struct *task) 123int __ptrace_may_attach(struct task_struct *task)
124{ 124{
125 /* May we inspect the given task? 125 /* May we inspect the given task?
126 * This check is used both for attaching with ptrace 126 * This check is used both for attaching with ptrace
@@ -154,7 +154,7 @@ int ptrace_may_attach(struct task_struct *task)
154{ 154{
155 int err; 155 int err;
156 task_lock(task); 156 task_lock(task);
157 err = may_attach(task); 157 err = __ptrace_may_attach(task);
158 task_unlock(task); 158 task_unlock(task);
159 return !err; 159 return !err;
160} 160}
@@ -196,7 +196,7 @@ repeat:
196 /* the same process cannot be attached many times */ 196 /* the same process cannot be attached many times */
197 if (task->ptrace & PT_PTRACED) 197 if (task->ptrace & PT_PTRACED)
198 goto bad; 198 goto bad;
199 retval = may_attach(task); 199 retval = __ptrace_may_attach(task);
200 if (retval) 200 if (retval)
201 goto bad; 201 goto bad;
202 202
@@ -366,12 +366,73 @@ static int ptrace_setsiginfo(struct task_struct *child, siginfo_t __user * data)
366 return error; 366 return error;
367} 367}
368 368
369
370#ifdef PTRACE_SINGLESTEP
371#define is_singlestep(request) ((request) == PTRACE_SINGLESTEP)
372#else
373#define is_singlestep(request) 0
374#endif
375
376#ifdef PTRACE_SINGLEBLOCK
377#define is_singleblock(request) ((request) == PTRACE_SINGLEBLOCK)
378#else
379#define is_singleblock(request) 0
380#endif
381
382#ifdef PTRACE_SYSEMU
383#define is_sysemu_singlestep(request) ((request) == PTRACE_SYSEMU_SINGLESTEP)
384#else
385#define is_sysemu_singlestep(request) 0
386#endif
387
388static int ptrace_resume(struct task_struct *child, long request, long data)
389{
390 if (!valid_signal(data))
391 return -EIO;
392
393 if (request == PTRACE_SYSCALL)
394 set_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
395 else
396 clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
397
398#ifdef TIF_SYSCALL_EMU
399 if (request == PTRACE_SYSEMU || request == PTRACE_SYSEMU_SINGLESTEP)
400 set_tsk_thread_flag(child, TIF_SYSCALL_EMU);
401 else
402 clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
403#endif
404
405 if (is_singleblock(request)) {
406 if (unlikely(!arch_has_block_step()))
407 return -EIO;
408 user_enable_block_step(child);
409 } else if (is_singlestep(request) || is_sysemu_singlestep(request)) {
410 if (unlikely(!arch_has_single_step()))
411 return -EIO;
412 user_enable_single_step(child);
413 }
414 else
415 user_disable_single_step(child);
416
417 child->exit_code = data;
418 wake_up_process(child);
419
420 return 0;
421}
422
369int ptrace_request(struct task_struct *child, long request, 423int ptrace_request(struct task_struct *child, long request,
370 long addr, long data) 424 long addr, long data)
371{ 425{
372 int ret = -EIO; 426 int ret = -EIO;
373 427
374 switch (request) { 428 switch (request) {
429 case PTRACE_PEEKTEXT:
430 case PTRACE_PEEKDATA:
431 return generic_ptrace_peekdata(child, addr, data);
432 case PTRACE_POKETEXT:
433 case PTRACE_POKEDATA:
434 return generic_ptrace_pokedata(child, addr, data);
435
375#ifdef PTRACE_OLDSETOPTIONS 436#ifdef PTRACE_OLDSETOPTIONS
376 case PTRACE_OLDSETOPTIONS: 437 case PTRACE_OLDSETOPTIONS:
377#endif 438#endif
@@ -390,6 +451,26 @@ int ptrace_request(struct task_struct *child, long request,
390 case PTRACE_DETACH: /* detach a process that was attached. */ 451 case PTRACE_DETACH: /* detach a process that was attached. */
391 ret = ptrace_detach(child, data); 452 ret = ptrace_detach(child, data);
392 break; 453 break;
454
455#ifdef PTRACE_SINGLESTEP
456 case PTRACE_SINGLESTEP:
457#endif
458#ifdef PTRACE_SINGLEBLOCK
459 case PTRACE_SINGLEBLOCK:
460#endif
461#ifdef PTRACE_SYSEMU
462 case PTRACE_SYSEMU:
463 case PTRACE_SYSEMU_SINGLESTEP:
464#endif
465 case PTRACE_SYSCALL:
466 case PTRACE_CONT:
467 return ptrace_resume(child, request, data);
468
469 case PTRACE_KILL:
470 if (child->exit_state) /* already dead */
471 return 0;
472 return ptrace_resume(child, request, SIGKILL);
473
393 default: 474 default:
394 break; 475 break;
395 } 476 }
@@ -470,6 +551,8 @@ asmlinkage long sys_ptrace(long request, long pid, long addr, long data)
470 lock_kernel(); 551 lock_kernel();
471 if (request == PTRACE_TRACEME) { 552 if (request == PTRACE_TRACEME) {
472 ret = ptrace_traceme(); 553 ret = ptrace_traceme();
554 if (!ret)
555 arch_ptrace_attach(current);
473 goto out; 556 goto out;
474 } 557 }
475 558
@@ -524,3 +607,87 @@ int generic_ptrace_pokedata(struct task_struct *tsk, long addr, long data)
524 copied = access_process_vm(tsk, addr, &data, sizeof(data), 1); 607 copied = access_process_vm(tsk, addr, &data, sizeof(data), 1);
525 return (copied == sizeof(data)) ? 0 : -EIO; 608 return (copied == sizeof(data)) ? 0 : -EIO;
526} 609}
610
611#ifdef CONFIG_COMPAT
612#include <linux/compat.h>
613
614int compat_ptrace_request(struct task_struct *child, compat_long_t request,
615 compat_ulong_t addr, compat_ulong_t data)
616{
617 compat_ulong_t __user *datap = compat_ptr(data);
618 compat_ulong_t word;
619 int ret;
620
621 switch (request) {
622 case PTRACE_PEEKTEXT:
623 case PTRACE_PEEKDATA:
624 ret = access_process_vm(child, addr, &word, sizeof(word), 0);
625 if (ret != sizeof(word))
626 ret = -EIO;
627 else
628 ret = put_user(word, datap);
629 break;
630
631 case PTRACE_POKETEXT:
632 case PTRACE_POKEDATA:
633 ret = access_process_vm(child, addr, &data, sizeof(data), 1);
634 ret = (ret != sizeof(data) ? -EIO : 0);
635 break;
636
637 case PTRACE_GETEVENTMSG:
638 ret = put_user((compat_ulong_t) child->ptrace_message, datap);
639 break;
640
641 default:
642 ret = ptrace_request(child, request, addr, data);
643 }
644
645 return ret;
646}
647
648#ifdef __ARCH_WANT_COMPAT_SYS_PTRACE
649asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid,
650 compat_long_t addr, compat_long_t data)
651{
652 struct task_struct *child;
653 long ret;
654
655 /*
656 * This lock_kernel fixes a subtle race with suid exec
657 */
658 lock_kernel();
659 if (request == PTRACE_TRACEME) {
660 ret = ptrace_traceme();
661 goto out;
662 }
663
664 child = ptrace_get_task_struct(pid);
665 if (IS_ERR(child)) {
666 ret = PTR_ERR(child);
667 goto out;
668 }
669
670 if (request == PTRACE_ATTACH) {
671 ret = ptrace_attach(child);
672 /*
673 * Some architectures need to do book-keeping after
674 * a ptrace attach.
675 */
676 if (!ret)
677 arch_ptrace_attach(child);
678 goto out_put_task_struct;
679 }
680
681 ret = ptrace_check_attach(child, request == PTRACE_KILL);
682 if (!ret)
683 ret = compat_arch_ptrace(child, request, addr, data);
684
685 out_put_task_struct:
686 put_task_struct(child);
687 out:
688 unlock_kernel();
689 return ret;
690}
691#endif /* __ARCH_WANT_COMPAT_SYS_PTRACE */
692
693#endif /* CONFIG_COMPAT */
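
The ptrace.c changes fold PTRACE_PEEKTEXT/PEEKDATA, PTRACE_POKETEXT/POKEDATA, PTRACE_KILL and the resume requests (PTRACE_CONT, PTRACE_SYSCALL, single-step and block-step variants) into the generic ptrace_request()/ptrace_resume(), and add a shared compat_ptrace_request()/compat_sys_ptrace() path. The userspace ABI is unchanged; below is a minimal, hypothetical sketch of a tracer exercising the resume path (architecture code may still intercept these requests before the generic fallback):

/* Single-step a child once, then let it run to completion. */
#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>
#include <stdlib.h>

int main(void)
{
        pid_t child = fork();

        if (child == 0) {
                ptrace(PTRACE_TRACEME, 0, NULL, NULL);  /* stop at exec */
                execl("/bin/true", "true", (char *)NULL);
                _exit(1);
        }

        waitpid(child, NULL, 0);                /* child stopped after exec */

        /* one instruction: handled via user_enable_single_step() */
        ptrace(PTRACE_SINGLESTEP, child, NULL, NULL);
        waitpid(child, NULL, 0);

        /* resume normally: stepping state is cleared again */
        ptrace(PTRACE_CONT, child, NULL, NULL);
        waitpid(child, NULL, 0);                /* reap the exited child */
        return 0;
}
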
diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c
new file mode 100644
index 000000000000..f4ffbd0f306f
--- /dev/null
+++ b/kernel/rcuclassic.c
@@ -0,0 +1,575 @@
1/*
2 * Read-Copy Update mechanism for mutual exclusion
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 *
18 * Copyright IBM Corporation, 2001
19 *
20 * Authors: Dipankar Sarma <dipankar@in.ibm.com>
21 * Manfred Spraul <manfred@colorfullife.com>
22 *
23 * Based on the original work by Paul McKenney <paulmck@us.ibm.com>
24 * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
25 * Papers:
26 * http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf
27 * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001)
28 *
29 * For detailed explanation of Read-Copy Update mechanism see -
30 * Documentation/RCU
31 *
32 */
33#include <linux/types.h>
34#include <linux/kernel.h>
35#include <linux/init.h>
36#include <linux/spinlock.h>
37#include <linux/smp.h>
38#include <linux/rcupdate.h>
39#include <linux/interrupt.h>
40#include <linux/sched.h>
41#include <asm/atomic.h>
42#include <linux/bitops.h>
43#include <linux/module.h>
44#include <linux/completion.h>
45#include <linux/moduleparam.h>
46#include <linux/percpu.h>
47#include <linux/notifier.h>
48#include <linux/cpu.h>
49#include <linux/mutex.h>
50
51#ifdef CONFIG_DEBUG_LOCK_ALLOC
52static struct lock_class_key rcu_lock_key;
53struct lockdep_map rcu_lock_map =
54 STATIC_LOCKDEP_MAP_INIT("rcu_read_lock", &rcu_lock_key);
55EXPORT_SYMBOL_GPL(rcu_lock_map);
56#endif
57
58
59/* Definition for rcupdate control block. */
60static struct rcu_ctrlblk rcu_ctrlblk = {
61 .cur = -300,
62 .completed = -300,
63 .lock = __SPIN_LOCK_UNLOCKED(&rcu_ctrlblk.lock),
64 .cpumask = CPU_MASK_NONE,
65};
66static struct rcu_ctrlblk rcu_bh_ctrlblk = {
67 .cur = -300,
68 .completed = -300,
69 .lock = __SPIN_LOCK_UNLOCKED(&rcu_bh_ctrlblk.lock),
70 .cpumask = CPU_MASK_NONE,
71};
72
73DEFINE_PER_CPU(struct rcu_data, rcu_data) = { 0L };
74DEFINE_PER_CPU(struct rcu_data, rcu_bh_data) = { 0L };
75
76static int blimit = 10;
77static int qhimark = 10000;
78static int qlowmark = 100;
79
80#ifdef CONFIG_SMP
81static void force_quiescent_state(struct rcu_data *rdp,
82 struct rcu_ctrlblk *rcp)
83{
84 int cpu;
85 cpumask_t cpumask;
86 set_need_resched();
87 if (unlikely(!rcp->signaled)) {
88 rcp->signaled = 1;
89 /*
90 * Don't send IPI to itself. With irqs disabled,
91 * rdp->cpu is the current cpu.
92 */
93 cpumask = rcp->cpumask;
94 cpu_clear(rdp->cpu, cpumask);
95 for_each_cpu_mask(cpu, cpumask)
96 smp_send_reschedule(cpu);
97 }
98}
99#else
100static inline void force_quiescent_state(struct rcu_data *rdp,
101 struct rcu_ctrlblk *rcp)
102{
103 set_need_resched();
104}
105#endif
106
107/**
108 * call_rcu - Queue an RCU callback for invocation after a grace period.
109 * @head: structure to be used for queueing the RCU updates.
110 * @func: actual update function to be invoked after the grace period
111 *
112 * The update function will be invoked some time after a full grace
113 * period elapses, in other words after all currently executing RCU
114 * read-side critical sections have completed. RCU read-side critical
115 * sections are delimited by rcu_read_lock() and rcu_read_unlock(),
116 * and may be nested.
117 */
118void call_rcu(struct rcu_head *head,
119 void (*func)(struct rcu_head *rcu))
120{
121 unsigned long flags;
122 struct rcu_data *rdp;
123
124 head->func = func;
125 head->next = NULL;
126 local_irq_save(flags);
127 rdp = &__get_cpu_var(rcu_data);
128 *rdp->nxttail = head;
129 rdp->nxttail = &head->next;
130 if (unlikely(++rdp->qlen > qhimark)) {
131 rdp->blimit = INT_MAX;
132 force_quiescent_state(rdp, &rcu_ctrlblk);
133 }
134 local_irq_restore(flags);
135}
136EXPORT_SYMBOL_GPL(call_rcu);
137
138/**
139 * call_rcu_bh - Queue an RCU for invocation after a quicker grace period.
140 * @head: structure to be used for queueing the RCU updates.
141 * @func: actual update function to be invoked after the grace period
142 *
143 * The update function will be invoked some time after a full grace
144 * period elapses, in other words after all currently executing RCU
145 * read-side critical sections have completed. call_rcu_bh() assumes
146 * that the read-side critical sections end on completion of a softirq
147 * handler. This means that read-side critical sections in process
148 * context must not be interrupted by softirqs. This interface is to be
149 * used when most of the read-side critical sections are in softirq context.
150 * RCU read-side critical sections are delimited by rcu_read_lock() and
151 * rcu_read_unlock(), if in interrupt context, or rcu_read_lock_bh()
152 * and rcu_read_unlock_bh(), if in process context. These may be nested.
153 */
154void call_rcu_bh(struct rcu_head *head,
155 void (*func)(struct rcu_head *rcu))
156{
157 unsigned long flags;
158 struct rcu_data *rdp;
159
160 head->func = func;
161 head->next = NULL;
162 local_irq_save(flags);
163 rdp = &__get_cpu_var(rcu_bh_data);
164 *rdp->nxttail = head;
165 rdp->nxttail = &head->next;
166
167 if (unlikely(++rdp->qlen > qhimark)) {
168 rdp->blimit = INT_MAX;
169 force_quiescent_state(rdp, &rcu_bh_ctrlblk);
170 }
171
172 local_irq_restore(flags);
173}
174EXPORT_SYMBOL_GPL(call_rcu_bh);
175
176/*
177 * Return the number of RCU batches processed thus far. Useful
178 * for debug and statistics.
179 */
180long rcu_batches_completed(void)
181{
182 return rcu_ctrlblk.completed;
183}
184EXPORT_SYMBOL_GPL(rcu_batches_completed);
185
186/*
187 * Return the number of RCU batches processed thus far. Useful
188 * for debug and statistics.
189 */
190long rcu_batches_completed_bh(void)
191{
192 return rcu_bh_ctrlblk.completed;
193}
194EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
195
196/* Raises the softirq for processing rcu_callbacks. */
197static inline void raise_rcu_softirq(void)
198{
199 raise_softirq(RCU_SOFTIRQ);
200 /*
201 * The smp_mb() here is required to ensure that this cpu's
202 * __rcu_process_callbacks() reads the most recently updated
203 * value of rcu->cur.
204 */
205 smp_mb();
206}
207
208/*
209 * Invoke the completed RCU callbacks. They are expected to be in
210 * a per-cpu list.
211 */
212static void rcu_do_batch(struct rcu_data *rdp)
213{
214 struct rcu_head *next, *list;
215 int count = 0;
216
217 list = rdp->donelist;
218 while (list) {
219 next = list->next;
220 prefetch(next);
221 list->func(list);
222 list = next;
223 if (++count >= rdp->blimit)
224 break;
225 }
226 rdp->donelist = list;
227
228 local_irq_disable();
229 rdp->qlen -= count;
230 local_irq_enable();
231 if (rdp->blimit == INT_MAX && rdp->qlen <= qlowmark)
232 rdp->blimit = blimit;
233
234 if (!rdp->donelist)
235 rdp->donetail = &rdp->donelist;
236 else
237 raise_rcu_softirq();
238}
239
240/*
241 * Grace period handling:
242 * The grace period handling consists of two steps:
243 * - A new grace period is started.
244 * This is done by rcu_start_batch. The start is not broadcasted to
245 * all cpus, they must pick this up by comparing rcp->cur with
246 * rdp->quiescbatch. All cpus are recorded in the
247 * rcu_ctrlblk.cpumask bitmap.
248 * - All cpus must go through a quiescent state.
249 * Since the start of the grace period is not broadcasted, at least two
250 * calls to rcu_check_quiescent_state are required:
251 * The first call just notices that a new grace period is running. The
252 * following calls check if there was a quiescent state since the beginning
253 * of the grace period. If so, it updates rcu_ctrlblk.cpumask. If
254 * the bitmap is empty, then the grace period is completed.
255 * rcu_check_quiescent_state calls rcu_start_batch(0) to start the next grace
256 * period (if necessary).
257 */
258/*
259 * Register a new batch of callbacks, and start it up if there is currently no
260 * active batch and the batch to be registered has not already occurred.
261 * Caller must hold rcu_ctrlblk.lock.
262 */
263static void rcu_start_batch(struct rcu_ctrlblk *rcp)
264{
265 if (rcp->next_pending &&
266 rcp->completed == rcp->cur) {
267 rcp->next_pending = 0;
268 /*
269 * next_pending == 0 must be visible in
270 * __rcu_process_callbacks() before it can see new value of cur.
271 */
272 smp_wmb();
273 rcp->cur++;
274
275 /*
276 * Accessing nohz_cpu_mask before incrementing rcp->cur needs a
277 * barrier. Otherwise it can cause tickless idle CPUs to be
278 * included in rcp->cpumask, which will extend grace periods
279 * unnecessarily.
280 */
281 smp_mb();
282 cpus_andnot(rcp->cpumask, cpu_online_map, nohz_cpu_mask);
283
284 rcp->signaled = 0;
285 }
286}
287
288/*
289 * cpu went through a quiescent state since the beginning of the grace period.
290 * Clear it from the cpu mask and complete the grace period if it was the last
291 * cpu. Start another grace period if someone has further entries pending
292 */
293static void cpu_quiet(int cpu, struct rcu_ctrlblk *rcp)
294{
295 cpu_clear(cpu, rcp->cpumask);
296 if (cpus_empty(rcp->cpumask)) {
297 /* batch completed ! */
298 rcp->completed = rcp->cur;
299 rcu_start_batch(rcp);
300 }
301}
302
303/*
304 * Check if the cpu has gone through a quiescent state (say context
305 * switch). If so and if it already hasn't done so in this RCU
306 * quiescent cycle, then indicate that it has done so.
307 */
308static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp,
309 struct rcu_data *rdp)
310{
311 if (rdp->quiescbatch != rcp->cur) {
312 /* start new grace period: */
313 rdp->qs_pending = 1;
314 rdp->passed_quiesc = 0;
315 rdp->quiescbatch = rcp->cur;
316 return;
317 }
318
319 /* Grace period already completed for this cpu?
320 * qs_pending is checked instead of the actual bitmap to avoid
321 * cacheline thrashing.
322 */
323 if (!rdp->qs_pending)
324 return;
325
326 /*
327 * Was there a quiescent state since the beginning of the grace
328 * period? If no, then exit and wait for the next call.
329 */
330 if (!rdp->passed_quiesc)
331 return;
332 rdp->qs_pending = 0;
333
334 spin_lock(&rcp->lock);
335 /*
336 * rdp->quiescbatch/rcp->cur and the cpu bitmap can come out of sync
337 * during cpu startup. Ignore the quiescent state.
338 */
339 if (likely(rdp->quiescbatch == rcp->cur))
340 cpu_quiet(rdp->cpu, rcp);
341
342 spin_unlock(&rcp->lock);
343}
344
345
346#ifdef CONFIG_HOTPLUG_CPU
347
348/* warning! helper for rcu_offline_cpu. do not use elsewhere without reviewing
349 * locking requirements, the list it's pulling from has to belong to a cpu
350 * which is dead and hence not processing interrupts.
351 */
352static void rcu_move_batch(struct rcu_data *this_rdp, struct rcu_head *list,
353 struct rcu_head **tail)
354{
355 local_irq_disable();
356 *this_rdp->nxttail = list;
357 if (list)
358 this_rdp->nxttail = tail;
359 local_irq_enable();
360}
361
362static void __rcu_offline_cpu(struct rcu_data *this_rdp,
363 struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
364{
365 /* if the cpu going offline owns the grace period
366 * we can block indefinitely waiting for it, so flush
367 * it here
368 */
369 spin_lock_bh(&rcp->lock);
370 if (rcp->cur != rcp->completed)
371 cpu_quiet(rdp->cpu, rcp);
372 spin_unlock_bh(&rcp->lock);
373 rcu_move_batch(this_rdp, rdp->donelist, rdp->donetail);
374 rcu_move_batch(this_rdp, rdp->curlist, rdp->curtail);
375 rcu_move_batch(this_rdp, rdp->nxtlist, rdp->nxttail);
376}
377
378static void rcu_offline_cpu(int cpu)
379{
380 struct rcu_data *this_rdp = &get_cpu_var(rcu_data);
381 struct rcu_data *this_bh_rdp = &get_cpu_var(rcu_bh_data);
382
383 __rcu_offline_cpu(this_rdp, &rcu_ctrlblk,
384 &per_cpu(rcu_data, cpu));
385 __rcu_offline_cpu(this_bh_rdp, &rcu_bh_ctrlblk,
386 &per_cpu(rcu_bh_data, cpu));
387 put_cpu_var(rcu_data);
388 put_cpu_var(rcu_bh_data);
389}
390
391#else
392
393static void rcu_offline_cpu(int cpu)
394{
395}
396
397#endif
398
399/*
400 * This does the RCU processing work from softirq context.
401 */
402static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp,
403 struct rcu_data *rdp)
404{
405 if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch)) {
406 *rdp->donetail = rdp->curlist;
407 rdp->donetail = rdp->curtail;
408 rdp->curlist = NULL;
409 rdp->curtail = &rdp->curlist;
410 }
411
412 if (rdp->nxtlist && !rdp->curlist) {
413 local_irq_disable();
414 rdp->curlist = rdp->nxtlist;
415 rdp->curtail = rdp->nxttail;
416 rdp->nxtlist = NULL;
417 rdp->nxttail = &rdp->nxtlist;
418 local_irq_enable();
419
420 /*
421 * start the next batch of callbacks
422 */
423
424 /* determine batch number */
425 rdp->batch = rcp->cur + 1;
426 /* see the comment and corresponding wmb() in
427 * the rcu_start_batch()
428 */
429 smp_rmb();
430
431 if (!rcp->next_pending) {
432 /* and start it/schedule start if it's a new batch */
433 spin_lock(&rcp->lock);
434 rcp->next_pending = 1;
435 rcu_start_batch(rcp);
436 spin_unlock(&rcp->lock);
437 }
438 }
439
440 rcu_check_quiescent_state(rcp, rdp);
441 if (rdp->donelist)
442 rcu_do_batch(rdp);
443}
444
445static void rcu_process_callbacks(struct softirq_action *unused)
446{
447 __rcu_process_callbacks(&rcu_ctrlblk, &__get_cpu_var(rcu_data));
448 __rcu_process_callbacks(&rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data));
449}
450
451static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
452{
453 /* This cpu has pending rcu entries and the grace period
454 * for them has completed.
455 */
456 if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch))
457 return 1;
458
459 /* This cpu has no pending entries, but there are new entries */
460 if (!rdp->curlist && rdp->nxtlist)
461 return 1;
462
463 /* This cpu has finished callbacks to invoke */
464 if (rdp->donelist)
465 return 1;
466
467 /* The rcu core waits for a quiescent state from the cpu */
468 if (rdp->quiescbatch != rcp->cur || rdp->qs_pending)
469 return 1;
470
471 /* nothing to do */
472 return 0;
473}
474
475/*
476 * Check to see if there is any immediate RCU-related work to be done
477 * by the current CPU, returning 1 if so. This function is part of the
478 * RCU implementation; it is -not- an exported member of the RCU API.
479 */
480int rcu_pending(int cpu)
481{
482 return __rcu_pending(&rcu_ctrlblk, &per_cpu(rcu_data, cpu)) ||
483 __rcu_pending(&rcu_bh_ctrlblk, &per_cpu(rcu_bh_data, cpu));
484}
485
486/*
487 * Check to see if any future RCU-related work will need to be done
488 * by the current CPU, even if none need be done immediately, returning
489 * 1 if so. This function is part of the RCU implementation; it is -not-
490 * an exported member of the RCU API.
491 */
492int rcu_needs_cpu(int cpu)
493{
494 struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
495 struct rcu_data *rdp_bh = &per_cpu(rcu_bh_data, cpu);
496
497 return (!!rdp->curlist || !!rdp_bh->curlist || rcu_pending(cpu));
498}
499
500void rcu_check_callbacks(int cpu, int user)
501{
502 if (user ||
503 (idle_cpu(cpu) && !in_softirq() &&
504 hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
505 rcu_qsctr_inc(cpu);
506 rcu_bh_qsctr_inc(cpu);
507 } else if (!in_softirq())
508 rcu_bh_qsctr_inc(cpu);
509 raise_rcu_softirq();
510}
511
512static void rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp,
513 struct rcu_data *rdp)
514{
515 memset(rdp, 0, sizeof(*rdp));
516 rdp->curtail = &rdp->curlist;
517 rdp->nxttail = &rdp->nxtlist;
518 rdp->donetail = &rdp->donelist;
519 rdp->quiescbatch = rcp->completed;
520 rdp->qs_pending = 0;
521 rdp->cpu = cpu;
522 rdp->blimit = blimit;
523}
524
525static void __cpuinit rcu_online_cpu(int cpu)
526{
527 struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
528 struct rcu_data *bh_rdp = &per_cpu(rcu_bh_data, cpu);
529
530 rcu_init_percpu_data(cpu, &rcu_ctrlblk, rdp);
531 rcu_init_percpu_data(cpu, &rcu_bh_ctrlblk, bh_rdp);
532 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks, NULL);
533}
534
535static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
536 unsigned long action, void *hcpu)
537{
538 long cpu = (long)hcpu;
539
540 switch (action) {
541 case CPU_UP_PREPARE:
542 case CPU_UP_PREPARE_FROZEN:
543 rcu_online_cpu(cpu);
544 break;
545 case CPU_DEAD:
546 case CPU_DEAD_FROZEN:
547 rcu_offline_cpu(cpu);
548 break;
549 default:
550 break;
551 }
552 return NOTIFY_OK;
553}
554
555static struct notifier_block __cpuinitdata rcu_nb = {
556 .notifier_call = rcu_cpu_notify,
557};
558
559/*
560 * Initializes rcu mechanism. Assumed to be called early.
561 * That is, before the local timer (SMP) or jiffy timer (uniproc) is set up.
562 * Note that rcu_qsctr and friends are implicitly
563 * initialized due to the choice of ``0'' for RCU_CTR_INVALID.
564 */
565void __init __rcu_init(void)
566{
567 rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE,
568 (void *)(long)smp_processor_id());
569 /* Register notifier for non-boot CPUs */
570 register_cpu_notifier(&rcu_nb);
571}
572
573module_param(blimit, int, 0);
574module_param(qhimark, int, 0);
575module_param(qlowmark, int, 0);
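
The new rcuclassic.c above carries the classic RCU implementation (split out of rcupdate.c in the following hunks): call_rcu()/call_rcu_bh() queue callbacks on per-CPU lists, and the RCU_SOFTIRQ handler invokes them once a grace period has elapsed. A minimal sketch of the usual update-side pattern that feeds this path; the struct and function names are illustrative, not part of the patch:

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

/* RCU-protected object with an embedded rcu_head for deferred freeing. */
struct example_obj {
        int value;
        struct rcu_head rcu;
};

static struct example_obj *example_ptr;
static DEFINE_SPINLOCK(example_lock);

static void example_free_rcu(struct rcu_head *head)
{
        kfree(container_of(head, struct example_obj, rcu));
}

/* Reader: no locks or atomics, just an RCU read-side critical section. */
static int example_read(void)
{
        struct example_obj *obj;
        int val = -1;

        rcu_read_lock();
        obj = rcu_dereference(example_ptr);
        if (obj)
                val = obj->value;
        rcu_read_unlock();
        return val;
}

/* Updater: publish the new object, defer freeing the old one. */
static int example_update(int value)
{
        struct example_obj *new, *old;

        new = kmalloc(sizeof(*new), GFP_KERNEL);
        if (!new)
                return -ENOMEM;
        new->value = value;

        spin_lock(&example_lock);
        old = example_ptr;
        rcu_assign_pointer(example_ptr, new);
        spin_unlock(&example_lock);

        if (old)
                call_rcu(&old->rcu, example_free_rcu);
        return 0;
}
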
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index a66d4d1615f7..760dfc233a00 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -15,7 +15,7 @@
15 * along with this program; if not, write to the Free Software 15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 * 17 *
18 * Copyright (C) IBM Corporation, 2001 18 * Copyright IBM Corporation, 2001
19 * 19 *
20 * Authors: Dipankar Sarma <dipankar@in.ibm.com> 20 * Authors: Dipankar Sarma <dipankar@in.ibm.com>
21 * Manfred Spraul <manfred@colorfullife.com> 21 * Manfred Spraul <manfred@colorfullife.com>
@@ -35,165 +35,57 @@
35#include <linux/init.h> 35#include <linux/init.h>
36#include <linux/spinlock.h> 36#include <linux/spinlock.h>
37#include <linux/smp.h> 37#include <linux/smp.h>
38#include <linux/rcupdate.h>
39#include <linux/interrupt.h> 38#include <linux/interrupt.h>
40#include <linux/sched.h> 39#include <linux/sched.h>
41#include <asm/atomic.h> 40#include <asm/atomic.h>
42#include <linux/bitops.h> 41#include <linux/bitops.h>
43#include <linux/module.h>
44#include <linux/completion.h> 42#include <linux/completion.h>
45#include <linux/moduleparam.h>
46#include <linux/percpu.h> 43#include <linux/percpu.h>
47#include <linux/notifier.h> 44#include <linux/notifier.h>
48#include <linux/cpu.h> 45#include <linux/cpu.h>
49#include <linux/mutex.h> 46#include <linux/mutex.h>
47#include <linux/module.h>
50 48
51#ifdef CONFIG_DEBUG_LOCK_ALLOC 49struct rcu_synchronize {
52static struct lock_class_key rcu_lock_key; 50 struct rcu_head head;
53struct lockdep_map rcu_lock_map = 51 struct completion completion;
54 STATIC_LOCKDEP_MAP_INIT("rcu_read_lock", &rcu_lock_key);
55
56EXPORT_SYMBOL_GPL(rcu_lock_map);
57#endif
58
59/* Definition for rcupdate control block. */
60static struct rcu_ctrlblk rcu_ctrlblk = {
61 .cur = -300,
62 .completed = -300,
63 .lock = __SPIN_LOCK_UNLOCKED(&rcu_ctrlblk.lock),
64 .cpumask = CPU_MASK_NONE,
65};
66static struct rcu_ctrlblk rcu_bh_ctrlblk = {
67 .cur = -300,
68 .completed = -300,
69 .lock = __SPIN_LOCK_UNLOCKED(&rcu_bh_ctrlblk.lock),
70 .cpumask = CPU_MASK_NONE,
71}; 52};
72 53
73DEFINE_PER_CPU(struct rcu_data, rcu_data) = { 0L }; 54static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL};
74DEFINE_PER_CPU(struct rcu_data, rcu_bh_data) = { 0L };
75
76/* Fake initialization required by compiler */
77static DEFINE_PER_CPU(struct tasklet_struct, rcu_tasklet) = {NULL};
78static int blimit = 10;
79static int qhimark = 10000;
80static int qlowmark = 100;
81
82static atomic_t rcu_barrier_cpu_count; 55static atomic_t rcu_barrier_cpu_count;
83static DEFINE_MUTEX(rcu_barrier_mutex); 56static DEFINE_MUTEX(rcu_barrier_mutex);
84static struct completion rcu_barrier_completion; 57static struct completion rcu_barrier_completion;
85 58
86#ifdef CONFIG_SMP 59/* Because of FASTCALL declaration of complete, we use this wrapper */
87static void force_quiescent_state(struct rcu_data *rdp, 60static void wakeme_after_rcu(struct rcu_head *head)
88 struct rcu_ctrlblk *rcp)
89{
90 int cpu;
91 cpumask_t cpumask;
92 set_need_resched();
93 if (unlikely(!rcp->signaled)) {
94 rcp->signaled = 1;
95 /*
96 * Don't send IPI to itself. With irqs disabled,
97 * rdp->cpu is the current cpu.
98 */
99 cpumask = rcp->cpumask;
100 cpu_clear(rdp->cpu, cpumask);
101 for_each_cpu_mask(cpu, cpumask)
102 smp_send_reschedule(cpu);
103 }
104}
105#else
106static inline void force_quiescent_state(struct rcu_data *rdp,
107 struct rcu_ctrlblk *rcp)
108{ 61{
109 set_need_resched(); 62 struct rcu_synchronize *rcu;
63
64 rcu = container_of(head, struct rcu_synchronize, head);
65 complete(&rcu->completion);
110} 66}
111#endif
112 67
113/** 68/**
114 * call_rcu - Queue an RCU callback for invocation after a grace period. 69 * synchronize_rcu - wait until a grace period has elapsed.
115 * @head: structure to be used for queueing the RCU updates.
116 * @func: actual update function to be invoked after the grace period
117 * 70 *
118 * The update function will be invoked some time after a full grace 71 * Control will return to the caller some time after a full grace
119 * period elapses, in other words after all currently executing RCU 72 * period has elapsed, in other words after all currently executing RCU
120 * read-side critical sections have completed. RCU read-side critical 73 * read-side critical sections have completed. RCU read-side critical
121 * sections are delimited by rcu_read_lock() and rcu_read_unlock(), 74 * sections are delimited by rcu_read_lock() and rcu_read_unlock(),
122 * and may be nested. 75 * and may be nested.
123 */ 76 */
124void fastcall call_rcu(struct rcu_head *head, 77void synchronize_rcu(void)
125 void (*func)(struct rcu_head *rcu))
126{
127 unsigned long flags;
128 struct rcu_data *rdp;
129
130 head->func = func;
131 head->next = NULL;
132 local_irq_save(flags);
133 rdp = &__get_cpu_var(rcu_data);
134 *rdp->nxttail = head;
135 rdp->nxttail = &head->next;
136 if (unlikely(++rdp->qlen > qhimark)) {
137 rdp->blimit = INT_MAX;
138 force_quiescent_state(rdp, &rcu_ctrlblk);
139 }
140 local_irq_restore(flags);
141}
142
143/**
144 * call_rcu_bh - Queue an RCU for invocation after a quicker grace period.
145 * @head: structure to be used for queueing the RCU updates.
146 * @func: actual update function to be invoked after the grace period
147 *
148 * The update function will be invoked some time after a full grace
149 * period elapses, in other words after all currently executing RCU
150 * read-side critical sections have completed. call_rcu_bh() assumes
151 * that the read-side critical sections end on completion of a softirq
152 * handler. This means that read-side critical sections in process
153 * context must not be interrupted by softirqs. This interface is to be
154 * used when most of the read-side critical sections are in softirq context.
155 * RCU read-side critical sections are delimited by rcu_read_lock() and
156 * rcu_read_unlock(), * if in interrupt context or rcu_read_lock_bh()
157 * and rcu_read_unlock_bh(), if in process context. These may be nested.
158 */
159void fastcall call_rcu_bh(struct rcu_head *head,
160 void (*func)(struct rcu_head *rcu))
161{ 78{
162 unsigned long flags; 79 struct rcu_synchronize rcu;
163 struct rcu_data *rdp;
164
165 head->func = func;
166 head->next = NULL;
167 local_irq_save(flags);
168 rdp = &__get_cpu_var(rcu_bh_data);
169 *rdp->nxttail = head;
170 rdp->nxttail = &head->next;
171
172 if (unlikely(++rdp->qlen > qhimark)) {
173 rdp->blimit = INT_MAX;
174 force_quiescent_state(rdp, &rcu_bh_ctrlblk);
175 }
176
177 local_irq_restore(flags);
178}
179 80
180/* 81 init_completion(&rcu.completion);
181 * Return the number of RCU batches processed thus far. Useful 82 /* Will wake me after RCU finished */
182 * for debug and statistics. 83 call_rcu(&rcu.head, wakeme_after_rcu);
183 */
184long rcu_batches_completed(void)
185{
186 return rcu_ctrlblk.completed;
187}
188 84
189/* 85 /* Wait for it */
190 * Return the number of RCU batches processed thus far. Useful 86 wait_for_completion(&rcu.completion);
191 * for debug and statistics.
192 */
193long rcu_batches_completed_bh(void)
194{
195 return rcu_bh_ctrlblk.completed;
196} 87}
88EXPORT_SYMBOL_GPL(synchronize_rcu);
197 89
198static void rcu_barrier_callback(struct rcu_head *notused) 90static void rcu_barrier_callback(struct rcu_head *notused)
199{ 91{
@@ -207,10 +99,8 @@ static void rcu_barrier_callback(struct rcu_head *notused)
207static void rcu_barrier_func(void *notused) 99static void rcu_barrier_func(void *notused)
208{ 100{
209 int cpu = smp_processor_id(); 101 int cpu = smp_processor_id();
210 struct rcu_data *rdp = &per_cpu(rcu_data, cpu); 102 struct rcu_head *head = &per_cpu(rcu_barrier_head, cpu);
211 struct rcu_head *head;
212 103
213 head = &rdp->barrier;
214 atomic_inc(&rcu_barrier_cpu_count); 104 atomic_inc(&rcu_barrier_cpu_count);
215 call_rcu(head, rcu_barrier_callback); 105 call_rcu(head, rcu_barrier_callback);
216} 106}
@@ -225,420 +115,24 @@ void rcu_barrier(void)
225 mutex_lock(&rcu_barrier_mutex); 115 mutex_lock(&rcu_barrier_mutex);
226 init_completion(&rcu_barrier_completion); 116 init_completion(&rcu_barrier_completion);
227 atomic_set(&rcu_barrier_cpu_count, 0); 117 atomic_set(&rcu_barrier_cpu_count, 0);
118 /*
119 * The queueing of callbacks in all CPUs must be atomic with
120 * respect to RCU, otherwise one CPU may queue a callback,
121 * wait for a grace period, decrement barrier count and call
122 * complete(), while other CPUs have not yet queued anything.
123 * So, we need to make sure that grace periods cannot complete
124 * until all the callbacks are queued.
125 */
126 rcu_read_lock();
228 on_each_cpu(rcu_barrier_func, NULL, 0, 1); 127 on_each_cpu(rcu_barrier_func, NULL, 0, 1);
128 rcu_read_unlock();
229 wait_for_completion(&rcu_barrier_completion); 129 wait_for_completion(&rcu_barrier_completion);
230 mutex_unlock(&rcu_barrier_mutex); 130 mutex_unlock(&rcu_barrier_mutex);
231} 131}
232EXPORT_SYMBOL_GPL(rcu_barrier); 132EXPORT_SYMBOL_GPL(rcu_barrier);
233 133
234/*
235 * Invoke the completed RCU callbacks. They are expected to be in
236 * a per-cpu list.
237 */
238static void rcu_do_batch(struct rcu_data *rdp)
239{
240 struct rcu_head *next, *list;
241 int count = 0;
242
243 list = rdp->donelist;
244 while (list) {
245 next = list->next;
246 prefetch(next);
247 list->func(list);
248 list = next;
249 if (++count >= rdp->blimit)
250 break;
251 }
252 rdp->donelist = list;
253
254 local_irq_disable();
255 rdp->qlen -= count;
256 local_irq_enable();
257 if (rdp->blimit == INT_MAX && rdp->qlen <= qlowmark)
258 rdp->blimit = blimit;
259
260 if (!rdp->donelist)
261 rdp->donetail = &rdp->donelist;
262 else
263 tasklet_schedule(&per_cpu(rcu_tasklet, rdp->cpu));
264}
265
266/*
267 * Grace period handling:
268 * The grace period handling consists out of two steps:
269 * - A new grace period is started.
270 * This is done by rcu_start_batch. The start is not broadcasted to
271 * all cpus, they must pick this up by comparing rcp->cur with
272 * rdp->quiescbatch. All cpus are recorded in the
273 * rcu_ctrlblk.cpumask bitmap.
274 * - All cpus must go through a quiescent state.
275 * Since the start of the grace period is not broadcasted, at least two
276 * calls to rcu_check_quiescent_state are required:
277 * The first call just notices that a new grace period is running. The
278 * following calls check if there was a quiescent state since the beginning
279 * of the grace period. If so, it updates rcu_ctrlblk.cpumask. If
280 * the bitmap is empty, then the grace period is completed.
281 * rcu_check_quiescent_state calls rcu_start_batch(0) to start the next grace
282 * period (if necessary).
283 */
284/*
285 * Register a new batch of callbacks, and start it up if there is currently no
286 * active batch and the batch to be registered has not already occurred.
287 * Caller must hold rcu_ctrlblk.lock.
288 */
289static void rcu_start_batch(struct rcu_ctrlblk *rcp)
290{
291 if (rcp->next_pending &&
292 rcp->completed == rcp->cur) {
293 rcp->next_pending = 0;
294 /*
295 * next_pending == 0 must be visible in
296 * __rcu_process_callbacks() before it can see new value of cur.
297 */
298 smp_wmb();
299 rcp->cur++;
300
301 /*
302 * Accessing nohz_cpu_mask before incrementing rcp->cur needs a
303 * Barrier Otherwise it can cause tickless idle CPUs to be
304 * included in rcp->cpumask, which will extend graceperiods
305 * unnecessarily.
306 */
307 smp_mb();
308 cpus_andnot(rcp->cpumask, cpu_online_map, nohz_cpu_mask);
309
310 rcp->signaled = 0;
311 }
312}
313
314/*
315 * cpu went through a quiescent state since the beginning of the grace period.
316 * Clear it from the cpu mask and complete the grace period if it was the last
317 * cpu. Start another grace period if someone has further entries pending
318 */
319static void cpu_quiet(int cpu, struct rcu_ctrlblk *rcp)
320{
321 cpu_clear(cpu, rcp->cpumask);
322 if (cpus_empty(rcp->cpumask)) {
323 /* batch completed ! */
324 rcp->completed = rcp->cur;
325 rcu_start_batch(rcp);
326 }
327}
328
329/*
330 * Check if the cpu has gone through a quiescent state (say context
331 * switch). If so and if it already hasn't done so in this RCU
332 * quiescent cycle, then indicate that it has done so.
333 */
334static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp,
335 struct rcu_data *rdp)
336{
337 if (rdp->quiescbatch != rcp->cur) {
338 /* start new grace period: */
339 rdp->qs_pending = 1;
340 rdp->passed_quiesc = 0;
341 rdp->quiescbatch = rcp->cur;
342 return;
343 }
344
345 /* Grace period already completed for this cpu?
346 * qs_pending is checked instead of the actual bitmap to avoid
347 * cacheline trashing.
348 */
349 if (!rdp->qs_pending)
350 return;
351
352 /*
353 * Was there a quiescent state since the beginning of the grace
354 * period? If no, then exit and wait for the next call.
355 */
356 if (!rdp->passed_quiesc)
357 return;
358 rdp->qs_pending = 0;
359
360 spin_lock(&rcp->lock);
361 /*
362 * rdp->quiescbatch/rcp->cur and the cpu bitmap can come out of sync
363 * during cpu startup. Ignore the quiescent state.
364 */
365 if (likely(rdp->quiescbatch == rcp->cur))
366 cpu_quiet(rdp->cpu, rcp);
367
368 spin_unlock(&rcp->lock);
369}
370
371
372#ifdef CONFIG_HOTPLUG_CPU
373
374/* warning! helper for rcu_offline_cpu. do not use elsewhere without reviewing
375 * locking requirements, the list it's pulling from has to belong to a cpu
376 * which is dead and hence not processing interrupts.
377 */
378static void rcu_move_batch(struct rcu_data *this_rdp, struct rcu_head *list,
379 struct rcu_head **tail)
380{
381 local_irq_disable();
382 *this_rdp->nxttail = list;
383 if (list)
384 this_rdp->nxttail = tail;
385 local_irq_enable();
386}
387
388static void __rcu_offline_cpu(struct rcu_data *this_rdp,
389 struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
390{
391 /* if the cpu going offline owns the grace period
392 * we can block indefinitely waiting for it, so flush
393 * it here
394 */
395 spin_lock_bh(&rcp->lock);
396 if (rcp->cur != rcp->completed)
397 cpu_quiet(rdp->cpu, rcp);
398 spin_unlock_bh(&rcp->lock);
399 rcu_move_batch(this_rdp, rdp->curlist, rdp->curtail);
400 rcu_move_batch(this_rdp, rdp->nxtlist, rdp->nxttail);
401 rcu_move_batch(this_rdp, rdp->donelist, rdp->donetail);
402}
403
404static void rcu_offline_cpu(int cpu)
405{
406 struct rcu_data *this_rdp = &get_cpu_var(rcu_data);
407 struct rcu_data *this_bh_rdp = &get_cpu_var(rcu_bh_data);
408
409 __rcu_offline_cpu(this_rdp, &rcu_ctrlblk,
410 &per_cpu(rcu_data, cpu));
411 __rcu_offline_cpu(this_bh_rdp, &rcu_bh_ctrlblk,
412 &per_cpu(rcu_bh_data, cpu));
413 put_cpu_var(rcu_data);
414 put_cpu_var(rcu_bh_data);
415 tasklet_kill_immediate(&per_cpu(rcu_tasklet, cpu), cpu);
416}
417
418#else
419
420static void rcu_offline_cpu(int cpu)
421{
422}
423
424#endif
425
426/*
427 * This does the RCU processing work from tasklet context.
428 */
429static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp,
430 struct rcu_data *rdp)
431{
432 if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch)) {
433 *rdp->donetail = rdp->curlist;
434 rdp->donetail = rdp->curtail;
435 rdp->curlist = NULL;
436 rdp->curtail = &rdp->curlist;
437 }
438
439 if (rdp->nxtlist && !rdp->curlist) {
440 local_irq_disable();
441 rdp->curlist = rdp->nxtlist;
442 rdp->curtail = rdp->nxttail;
443 rdp->nxtlist = NULL;
444 rdp->nxttail = &rdp->nxtlist;
445 local_irq_enable();
446
447 /*
448 * start the next batch of callbacks
449 */
450
451 /* determine batch number */
452 rdp->batch = rcp->cur + 1;
453 /* see the comment and corresponding wmb() in
454 * the rcu_start_batch()
455 */
456 smp_rmb();
457
458 if (!rcp->next_pending) {
459 /* and start it/schedule start if it's a new batch */
460 spin_lock(&rcp->lock);
461 rcp->next_pending = 1;
462 rcu_start_batch(rcp);
463 spin_unlock(&rcp->lock);
464 }
465 }
466
467 rcu_check_quiescent_state(rcp, rdp);
468 if (rdp->donelist)
469 rcu_do_batch(rdp);
470}
471
472static void rcu_process_callbacks(unsigned long unused)
473{
474 __rcu_process_callbacks(&rcu_ctrlblk, &__get_cpu_var(rcu_data));
475 __rcu_process_callbacks(&rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data));
476}
477
478static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
479{
480 /* This cpu has pending rcu entries and the grace period
481 * for them has completed.
482 */
483 if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch))
484 return 1;
485
486 /* This cpu has no pending entries, but there are new entries */
487 if (!rdp->curlist && rdp->nxtlist)
488 return 1;
489
490 /* This cpu has finished callbacks to invoke */
491 if (rdp->donelist)
492 return 1;
493
494 /* The rcu core waits for a quiescent state from the cpu */
495 if (rdp->quiescbatch != rcp->cur || rdp->qs_pending)
496 return 1;
497
498 /* nothing to do */
499 return 0;
500}
501
502/*
503 * Check to see if there is any immediate RCU-related work to be done
504 * by the current CPU, returning 1 if so. This function is part of the
505 * RCU implementation; it is -not- an exported member of the RCU API.
506 */
507int rcu_pending(int cpu)
508{
509 return __rcu_pending(&rcu_ctrlblk, &per_cpu(rcu_data, cpu)) ||
510 __rcu_pending(&rcu_bh_ctrlblk, &per_cpu(rcu_bh_data, cpu));
511}
512
513/*
514 * Check to see if any future RCU-related work will need to be done
515 * by the current CPU, even if none need be done immediately, returning
516 * 1 if so. This function is part of the RCU implementation; it is -not-
517 * an exported member of the RCU API.
518 */
519int rcu_needs_cpu(int cpu)
520{
521 struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
522 struct rcu_data *rdp_bh = &per_cpu(rcu_bh_data, cpu);
523
524 return (!!rdp->curlist || !!rdp_bh->curlist || rcu_pending(cpu));
525}
526
527void rcu_check_callbacks(int cpu, int user)
528{
529 if (user ||
530 (idle_cpu(cpu) && !in_softirq() &&
531 hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
532 rcu_qsctr_inc(cpu);
533 rcu_bh_qsctr_inc(cpu);
534 } else if (!in_softirq())
535 rcu_bh_qsctr_inc(cpu);
536 tasklet_schedule(&per_cpu(rcu_tasklet, cpu));
537}
538
539static void rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp,
540 struct rcu_data *rdp)
541{
542 memset(rdp, 0, sizeof(*rdp));
543 rdp->curtail = &rdp->curlist;
544 rdp->nxttail = &rdp->nxtlist;
545 rdp->donetail = &rdp->donelist;
546 rdp->quiescbatch = rcp->completed;
547 rdp->qs_pending = 0;
548 rdp->cpu = cpu;
549 rdp->blimit = blimit;
550}
551
552static void __devinit rcu_online_cpu(int cpu)
553{
554 struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
555 struct rcu_data *bh_rdp = &per_cpu(rcu_bh_data, cpu);
556
557 rcu_init_percpu_data(cpu, &rcu_ctrlblk, rdp);
558 rcu_init_percpu_data(cpu, &rcu_bh_ctrlblk, bh_rdp);
559 tasklet_init(&per_cpu(rcu_tasklet, cpu), rcu_process_callbacks, 0UL);
560}
561
562static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
563 unsigned long action, void *hcpu)
564{
565 long cpu = (long)hcpu;
566 switch (action) {
567 case CPU_UP_PREPARE:
568 case CPU_UP_PREPARE_FROZEN:
569 rcu_online_cpu(cpu);
570 break;
571 case CPU_DEAD:
572 case CPU_DEAD_FROZEN:
573 rcu_offline_cpu(cpu);
574 break;
575 default:
576 break;
577 }
578 return NOTIFY_OK;
579}
580
581static struct notifier_block __cpuinitdata rcu_nb = {
582 .notifier_call = rcu_cpu_notify,
583};
584
585/*
586 * Initializes rcu mechanism. Assumed to be called early.
587 * That is before local timer(SMP) or jiffie timer (uniproc) is setup.
588 * Note that rcu_qsctr and friends are implicitly
589 * initialized due to the choice of ``0'' for RCU_CTR_INVALID.
590 */
591void __init rcu_init(void) 134void __init rcu_init(void)
592{ 135{
593 rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, 136 __rcu_init();
594 (void *)(long)smp_processor_id());
595 /* Register notifier for non-boot CPUs */
596 register_cpu_notifier(&rcu_nb);
597}
598
599struct rcu_synchronize {
600 struct rcu_head head;
601 struct completion completion;
602};
603
604/* Because of FASTCALL declaration of complete, we use this wrapper */
605static void wakeme_after_rcu(struct rcu_head *head)
606{
607 struct rcu_synchronize *rcu;
608
609 rcu = container_of(head, struct rcu_synchronize, head);
610 complete(&rcu->completion);
611} 137}
612 138
613/**
614 * synchronize_rcu - wait until a grace period has elapsed.
615 *
616 * Control will return to the caller some time after a full grace
617 * period has elapsed, in other words after all currently executing RCU
618 * read-side critical sections have completed. RCU read-side critical
619 * sections are delimited by rcu_read_lock() and rcu_read_unlock(),
620 * and may be nested.
621 *
622 * If your read-side code is not protected by rcu_read_lock(), do -not-
623 * use synchronize_rcu().
624 */
625void synchronize_rcu(void)
626{
627 struct rcu_synchronize rcu;
628
629 init_completion(&rcu.completion);
630 /* Will wake me after RCU finished */
631 call_rcu(&rcu.head, wakeme_after_rcu);
632
633 /* Wait for it */
634 wait_for_completion(&rcu.completion);
635}
636
637module_param(blimit, int, 0);
638module_param(qhimark, int, 0);
639module_param(qlowmark, int, 0);
640EXPORT_SYMBOL_GPL(rcu_batches_completed);
641EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
642EXPORT_SYMBOL_GPL(call_rcu);
643EXPORT_SYMBOL_GPL(call_rcu_bh);
644EXPORT_SYMBOL_GPL(synchronize_rcu);
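
After the split, rcupdate.c keeps only the implementation-independent pieces: synchronize_rcu(), built from call_rcu() plus a completion via wakeme_after_rcu(), and rcu_barrier(). A minimal sketch of how a caller typically combines the two during teardown; the names are hypothetical:

#include <linux/rcupdate.h>
#include <linux/slab.h>

struct example_obj {
        int value;
        struct rcu_head rcu;
};

static struct example_obj *example_ptr;

/* Teardown: unpublish, wait out readers, then drain queued callbacks. */
static void example_teardown(void)
{
        struct example_obj *old = example_ptr;

        rcu_assign_pointer(example_ptr, NULL);

        synchronize_rcu();      /* all pre-existing readers have finished */
        kfree(old);

        rcu_barrier();          /* all pending call_rcu() callbacks have run,
                                 * e.g. before module text goes away */
}
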
diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c
new file mode 100644
index 000000000000..987cfb7ade89
--- /dev/null
+++ b/kernel/rcupreempt.c
@@ -0,0 +1,953 @@
1/*
2 * Read-Copy Update mechanism for mutual exclusion, realtime implementation
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 *
18 * Copyright IBM Corporation, 2006
19 *
20 * Authors: Paul E. McKenney <paulmck@us.ibm.com>
21 * With thanks to Esben Nielsen, Bill Huey, and Ingo Molnar
22 * for pushing me away from locks and towards counters, and
23 * to Suparna Bhattacharya for pushing me completely away
24 * from atomic instructions on the read side.
25 *
26 * Papers: http://www.rdrop.com/users/paulmck/RCU
27 *
28 * Design Document: http://lwn.net/Articles/253651/
29 *
30 * For detailed explanation of Read-Copy Update mechanism see -
31 * Documentation/RCU/ *.txt
32 *
33 */
34#include <linux/types.h>
35#include <linux/kernel.h>
36#include <linux/init.h>
37#include <linux/spinlock.h>
38#include <linux/smp.h>
39#include <linux/rcupdate.h>
40#include <linux/interrupt.h>
41#include <linux/sched.h>
42#include <asm/atomic.h>
43#include <linux/bitops.h>
44#include <linux/module.h>
45#include <linux/completion.h>
46#include <linux/moduleparam.h>
47#include <linux/percpu.h>
48#include <linux/notifier.h>
49#include <linux/rcupdate.h>
50#include <linux/cpu.h>
51#include <linux/random.h>
52#include <linux/delay.h>
53#include <linux/byteorder/swabb.h>
54#include <linux/cpumask.h>
55#include <linux/rcupreempt_trace.h>
56
57/*
58 * Macro that prevents the compiler from reordering accesses, but does
59 * absolutely -nothing- to prevent CPUs from reordering. This is used
60 * only to mediate communication between mainline code and hardware
61 * interrupt and NMI handlers.
62 */
63#define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x))
64
65/*
66 * PREEMPT_RCU data structures.
67 */
68
69/*
70 * GP_STAGES specifies the number of times the state machine has
 71 * to go through all the rcu_try_flip_states (see below)
72 * in a single Grace Period.
73 *
74 * GP in GP_STAGES stands for Grace Period ;)
75 */
76#define GP_STAGES 2
77struct rcu_data {
78 spinlock_t lock; /* Protect rcu_data fields. */
79 long completed; /* Number of last completed batch. */
80 int waitlistcount;
81 struct tasklet_struct rcu_tasklet;
82 struct rcu_head *nextlist;
83 struct rcu_head **nexttail;
84 struct rcu_head *waitlist[GP_STAGES];
85 struct rcu_head **waittail[GP_STAGES];
86 struct rcu_head *donelist;
87 struct rcu_head **donetail;
88 long rcu_flipctr[2];
89#ifdef CONFIG_RCU_TRACE
90 struct rcupreempt_trace trace;
91#endif /* #ifdef CONFIG_RCU_TRACE */
92};
93
94/*
95 * States for rcu_try_flip() and friends.
96 */
97
98enum rcu_try_flip_states {
99
100 /*
 101 * Stay here if nothing is happening. Flip the counter if something
 102 * starts happening. Denoted by "I".
103 */
104 rcu_try_flip_idle_state,
105
106 /*
107 * Wait here for all CPUs to notice that the counter has flipped. This
108 * prevents the old set of counters from ever being incremented once
109 * we leave this state, which in turn is necessary because we cannot
110 * test any individual counter for zero -- we can only check the sum.
111 * Denoted by "A".
112 */
113 rcu_try_flip_waitack_state,
114
115 /*
116 * Wait here for the sum of the old per-CPU counters to reach zero.
117 * Denoted by "Z".
118 */
119 rcu_try_flip_waitzero_state,
120
121 /*
122 * Wait here for each of the other CPUs to execute a memory barrier.
123 * This is necessary to ensure that these other CPUs really have
124 * completed executing their RCU read-side critical sections, despite
125 * their CPUs wildly reordering memory. Denoted by "M".
126 */
127 rcu_try_flip_waitmb_state,
128};
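
For quick reference, the four states above are visited in a fixed order; the sketch below (illustrative only, not part of this patch) spells out that cycle. One full pass is a single counter flip, and GP_STAGES consecutive flips bound a grace period.

/* Illustrative only: the order in which rcu_try_flip() advances. */
static const enum rcu_try_flip_states rcu_try_flip_cycle[] = {
	rcu_try_flip_idle_state,	/* "I": flip rcu_ctrlblk.completed */
	rcu_try_flip_waitack_state,	/* "A": every CPU has seen the flip */
	rcu_try_flip_waitzero_state,	/* "Z": old counters summed to zero */
	rcu_try_flip_waitmb_state,	/* "M": every CPU executed smp_mb() */
};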
129
130struct rcu_ctrlblk {
131 spinlock_t fliplock; /* Protect state-machine transitions. */
132 long completed; /* Number of last completed batch. */
133 enum rcu_try_flip_states rcu_try_flip_state; /* The current state of
134 the rcu state machine */
135};
136
137static DEFINE_PER_CPU(struct rcu_data, rcu_data);
138static struct rcu_ctrlblk rcu_ctrlblk = {
139 .fliplock = __SPIN_LOCK_UNLOCKED(rcu_ctrlblk.fliplock),
140 .completed = 0,
141 .rcu_try_flip_state = rcu_try_flip_idle_state,
142};
143
144
145#ifdef CONFIG_RCU_TRACE
146static char *rcu_try_flip_state_names[] =
147 { "idle", "waitack", "waitzero", "waitmb" };
148#endif /* #ifdef CONFIG_RCU_TRACE */
149
150static cpumask_t rcu_cpu_online_map __read_mostly = CPU_MASK_NONE;
151
152/*
153 * Enum and per-CPU flag to determine when each CPU has seen
154 * the most recent counter flip.
155 */
156
157enum rcu_flip_flag_values {
158 rcu_flip_seen, /* Steady/initial state, last flip seen. */
159 /* Only GP detector can update. */
160 rcu_flipped /* Flip just completed, need confirmation. */
161 /* Only corresponding CPU can update. */
162};
163static DEFINE_PER_CPU_SHARED_ALIGNED(enum rcu_flip_flag_values, rcu_flip_flag)
164 = rcu_flip_seen;
165
166/*
167 * Enum and per-CPU flag to determine when each CPU has executed the
168 * needed memory barrier to fence in memory references from its last RCU
169 * read-side critical section in the just-completed grace period.
170 */
171
172enum rcu_mb_flag_values {
173 rcu_mb_done, /* Steady/initial state, no mb()s required. */
174 /* Only GP detector can update. */
175 rcu_mb_needed /* Flip just completed, need an mb(). */
176 /* Only corresponding CPU can update. */
177};
178static DEFINE_PER_CPU_SHARED_ALIGNED(enum rcu_mb_flag_values, rcu_mb_flag)
179 = rcu_mb_done;
180
181/*
182 * RCU_DATA_ME: find the current CPU's rcu_data structure.
183 * RCU_DATA_CPU: find the specified CPU's rcu_data structure.
184 */
185#define RCU_DATA_ME() (&__get_cpu_var(rcu_data))
186#define RCU_DATA_CPU(cpu) (&per_cpu(rcu_data, cpu))
187
188/*
189 * Helper macro for tracing when the appropriate rcu_data is not
190 * cached in a local variable, but where the CPU number is so cached.
191 */
192#define RCU_TRACE_CPU(f, cpu) RCU_TRACE(f, &(RCU_DATA_CPU(cpu)->trace));
193
194/*
195 * Helper macro for tracing when the appropriate rcu_data is not
196 * cached in a local variable.
197 */
198#define RCU_TRACE_ME(f) RCU_TRACE(f, &(RCU_DATA_ME()->trace));
199
200/*
201 * Helper macro for tracing when the appropriate rcu_data is pointed
202 * to by a local variable.
203 */
204#define RCU_TRACE_RDP(f, rdp) RCU_TRACE(f, &((rdp)->trace));
205
206/*
207 * Return the number of RCU batches processed thus far. Useful
208 * for debug and statistics.
209 */
210long rcu_batches_completed(void)
211{
212 return rcu_ctrlblk.completed;
213}
214EXPORT_SYMBOL_GPL(rcu_batches_completed);
215
216EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
217
218void __rcu_read_lock(void)
219{
220 int idx;
221 struct task_struct *t = current;
222 int nesting;
223
224 nesting = ACCESS_ONCE(t->rcu_read_lock_nesting);
225 if (nesting != 0) {
226
227 /* An earlier rcu_read_lock() covers us, just count it. */
228
229 t->rcu_read_lock_nesting = nesting + 1;
230
231 } else {
232 unsigned long flags;
233
234 /*
235 * We disable interrupts for the following reasons:
236 * - If we get scheduling clock interrupt here, and we
237 * end up acking the counter flip, it's like a promise
238 * that we will never increment the old counter again.
239 * Thus we will break that promise if that
240 * scheduling clock interrupt happens between the time
241 * we pick the .completed field and the time that we
242 * increment our counter.
243 *
244 * - We don't want to be preempted out here.
245 *
246 * NMIs can still occur, of course, and might themselves
247 * contain rcu_read_lock().
248 */
249
250 local_irq_save(flags);
251
252 /*
253 * Outermost nesting of rcu_read_lock(), so increment
254 * the current counter for the current CPU. Use volatile
255 * casts to prevent the compiler from reordering.
256 */
257
258 idx = ACCESS_ONCE(rcu_ctrlblk.completed) & 0x1;
259 ACCESS_ONCE(RCU_DATA_ME()->rcu_flipctr[idx])++;
260
261 /*
262 * Now that the per-CPU counter has been incremented, we
263 * are protected from races with rcu_read_lock() invoked
264 * from NMI handlers on this CPU. We can therefore safely
265 * increment the nesting counter, relieving further NMIs
266 * of the need to increment the per-CPU counter.
267 */
268
269 ACCESS_ONCE(t->rcu_read_lock_nesting) = nesting + 1;
270
271 /*
 272 * Now that we have prevented any NMIs from storing
273 * to the ->rcu_flipctr_idx, we can safely use it to
274 * remember which counter to decrement in the matching
275 * rcu_read_unlock().
276 */
277
278 ACCESS_ONCE(t->rcu_flipctr_idx) = idx;
279 local_irq_restore(flags);
280 }
281}
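
The index arithmetic above is the heart of the scheme, so here is a small illustrative sketch (not part of this patch) of how the grace-period number selects between the two per-CPU counters; rcu_try_flip_waitzero() later waits for the sum of the other, "last" counters to drain to zero across all online CPUs.

/* Illustrative helpers only: counter selection for a ->completed value. */
static inline int rcu_cur_idx(long completed)
{
	return completed & 0x1;		/* incremented by new readers */
}

static inline int rcu_last_idx(long completed)
{
	return !(completed & 0x1);	/* must drain to zero after a flip */
}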
282EXPORT_SYMBOL_GPL(__rcu_read_lock);
283
284void __rcu_read_unlock(void)
285{
286 int idx;
287 struct task_struct *t = current;
288 int nesting;
289
290 nesting = ACCESS_ONCE(t->rcu_read_lock_nesting);
291 if (nesting > 1) {
292
293 /*
294 * We are still protected by the enclosing rcu_read_lock(),
295 * so simply decrement the counter.
296 */
297
298 t->rcu_read_lock_nesting = nesting - 1;
299
300 } else {
301 unsigned long flags;
302
303 /*
304 * Disable local interrupts to prevent the grace-period
305 * detection state machine from seeing us half-done.
306 * NMIs can still occur, of course, and might themselves
307 * contain rcu_read_lock() and rcu_read_unlock().
308 */
309
310 local_irq_save(flags);
311
312 /*
313 * Outermost nesting of rcu_read_unlock(), so we must
314 * decrement the current counter for the current CPU.
315 * This must be done carefully, because NMIs can
316 * occur at any point in this code, and any rcu_read_lock()
317 * and rcu_read_unlock() pairs in the NMI handlers
318 * must interact non-destructively with this code.
319 * Lots of volatile casts, and -very- careful ordering.
320 *
321 * Changes to this code, including this one, must be
322 * inspected, validated, and tested extremely carefully!!!
323 */
324
325 /*
326 * First, pick up the index.
327 */
328
329 idx = ACCESS_ONCE(t->rcu_flipctr_idx);
330
331 /*
332 * Now that we have fetched the counter index, it is
333 * safe to decrement the per-task RCU nesting counter.
334 * After this, any interrupts or NMIs will increment and
335 * decrement the per-CPU counters.
336 */
337 ACCESS_ONCE(t->rcu_read_lock_nesting) = nesting - 1;
338
339 /*
340 * It is now safe to decrement this task's nesting count.
341 * NMIs that occur after this statement will route their
342 * rcu_read_lock() calls through this "else" clause, and
343 * will thus start incrementing the per-CPU counter on
344 * their own. They will also clobber ->rcu_flipctr_idx,
345 * but that is OK, since we have already fetched it.
346 */
347
348 ACCESS_ONCE(RCU_DATA_ME()->rcu_flipctr[idx])--;
349 local_irq_restore(flags);
350 }
351}
352EXPORT_SYMBOL_GPL(__rcu_read_unlock);
353
354/*
355 * If a global counter flip has occurred since the last time that we
356 * advanced callbacks, advance them. Hardware interrupts must be
357 * disabled when calling this function.
358 */
359static void __rcu_advance_callbacks(struct rcu_data *rdp)
360{
361 int cpu;
362 int i;
363 int wlc = 0;
364
365 if (rdp->completed != rcu_ctrlblk.completed) {
366 if (rdp->waitlist[GP_STAGES - 1] != NULL) {
367 *rdp->donetail = rdp->waitlist[GP_STAGES - 1];
368 rdp->donetail = rdp->waittail[GP_STAGES - 1];
369 RCU_TRACE_RDP(rcupreempt_trace_move2done, rdp);
370 }
371 for (i = GP_STAGES - 2; i >= 0; i--) {
372 if (rdp->waitlist[i] != NULL) {
373 rdp->waitlist[i + 1] = rdp->waitlist[i];
374 rdp->waittail[i + 1] = rdp->waittail[i];
375 wlc++;
376 } else {
377 rdp->waitlist[i + 1] = NULL;
378 rdp->waittail[i + 1] =
379 &rdp->waitlist[i + 1];
380 }
381 }
382 if (rdp->nextlist != NULL) {
383 rdp->waitlist[0] = rdp->nextlist;
384 rdp->waittail[0] = rdp->nexttail;
385 wlc++;
386 rdp->nextlist = NULL;
387 rdp->nexttail = &rdp->nextlist;
388 RCU_TRACE_RDP(rcupreempt_trace_move2wait, rdp);
389 } else {
390 rdp->waitlist[0] = NULL;
391 rdp->waittail[0] = &rdp->waitlist[0];
392 }
393 rdp->waitlistcount = wlc;
394 rdp->completed = rcu_ctrlblk.completed;
395 }
396
397 /*
398 * Check to see if this CPU needs to report that it has seen
399 * the most recent counter flip, thereby declaring that all
400 * subsequent rcu_read_lock() invocations will respect this flip.
401 */
402
403 cpu = raw_smp_processor_id();
404 if (per_cpu(rcu_flip_flag, cpu) == rcu_flipped) {
405 smp_mb(); /* Subsequent counter accesses must see new value */
406 per_cpu(rcu_flip_flag, cpu) = rcu_flip_seen;
407 smp_mb(); /* Subsequent RCU read-side critical sections */
408 /* seen -after- acknowledgement. */
409 }
410}
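
For orientation, the callback pipeline this function advances runs nextlist -> waitlist[0..GP_STAGES-1] -> donelist, one stage per counter flip observed. The helper below is an illustrative sketch only, not part of this patch.

/* Illustrative only: count how many pipeline stages hold callbacks. */
static int rcu_pipeline_stages_in_use(struct rcu_data *rdp)
{
	int i, in_use = 0;

	if (rdp->nextlist != NULL)		/* queued, not yet waiting */
		in_use++;
	for (i = 0; i < GP_STAGES; i++)
		if (rdp->waitlist[i] != NULL)	/* waiting for more flips */
			in_use++;
	if (rdp->donelist != NULL)		/* ready for invocation */
		in_use++;
	return in_use;
}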
411
412/*
413 * Get here when RCU is idle. Decide whether we need to
414 * move out of idle state, and return non-zero if so.
415 * "Straightforward" approach for the moment, might later
416 * use callback-list lengths, grace-period duration, or
417 * some such to determine when to exit idle state.
418 * Might also need a pre-idle test that does not acquire
419 * the lock, but let's get the simple case working first...
420 */
421
422static int
423rcu_try_flip_idle(void)
424{
425 int cpu;
426
427 RCU_TRACE_ME(rcupreempt_trace_try_flip_i1);
428 if (!rcu_pending(smp_processor_id())) {
429 RCU_TRACE_ME(rcupreempt_trace_try_flip_ie1);
430 return 0;
431 }
432
433 /*
434 * Do the flip.
435 */
436
437 RCU_TRACE_ME(rcupreempt_trace_try_flip_g1);
438 rcu_ctrlblk.completed++; /* stands in for rcu_try_flip_g2 */
439
440 /*
441 * Need a memory barrier so that other CPUs see the new
442 * counter value before they see the subsequent change of all
443 * the rcu_flip_flag instances to rcu_flipped.
444 */
445
446 smp_mb(); /* see above block comment. */
447
448 /* Now ask each CPU for acknowledgement of the flip. */
449
450 for_each_cpu_mask(cpu, rcu_cpu_online_map)
451 per_cpu(rcu_flip_flag, cpu) = rcu_flipped;
452
453 return 1;
454}
455
456/*
457 * Wait for CPUs to acknowledge the flip.
458 */
459
460static int
461rcu_try_flip_waitack(void)
462{
463 int cpu;
464
465 RCU_TRACE_ME(rcupreempt_trace_try_flip_a1);
466 for_each_cpu_mask(cpu, rcu_cpu_online_map)
467 if (per_cpu(rcu_flip_flag, cpu) != rcu_flip_seen) {
468 RCU_TRACE_ME(rcupreempt_trace_try_flip_ae1);
469 return 0;
470 }
471
472 /*
473 * Make sure our checks above don't bleed into subsequent
474 * waiting for the sum of the counters to reach zero.
475 */
476
477 smp_mb(); /* see above block comment. */
478 RCU_TRACE_ME(rcupreempt_trace_try_flip_a2);
479 return 1;
480}
481
482/*
483 * Wait for collective ``last'' counter to reach zero,
484 * then tell all CPUs to do an end-of-grace-period memory barrier.
485 */
486
487static int
488rcu_try_flip_waitzero(void)
489{
490 int cpu;
491 int lastidx = !(rcu_ctrlblk.completed & 0x1);
492 int sum = 0;
493
494 /* Check to see if the sum of the "last" counters is zero. */
495
496 RCU_TRACE_ME(rcupreempt_trace_try_flip_z1);
497 for_each_cpu_mask(cpu, rcu_cpu_online_map)
498 sum += RCU_DATA_CPU(cpu)->rcu_flipctr[lastidx];
499 if (sum != 0) {
500 RCU_TRACE_ME(rcupreempt_trace_try_flip_ze1);
501 return 0;
502 }
503
504 /*
505 * This ensures that the other CPUs see the call for
506 * memory barriers -after- the sum to zero has been
507 * detected here
508 */
509 smp_mb(); /* ^^^^^^^^^^^^ */
510
511 /* Call for a memory barrier from each CPU. */
512 for_each_cpu_mask(cpu, rcu_cpu_online_map)
513 per_cpu(rcu_mb_flag, cpu) = rcu_mb_needed;
514
515 RCU_TRACE_ME(rcupreempt_trace_try_flip_z2);
516 return 1;
517}
518
519/*
520 * Wait for all CPUs to do their end-of-grace-period memory barrier.
 521 * Return 1 once all CPUs have done so, and 0 until then.
522 */
523
524static int
525rcu_try_flip_waitmb(void)
526{
527 int cpu;
528
529 RCU_TRACE_ME(rcupreempt_trace_try_flip_m1);
530 for_each_cpu_mask(cpu, rcu_cpu_online_map)
531 if (per_cpu(rcu_mb_flag, cpu) != rcu_mb_done) {
532 RCU_TRACE_ME(rcupreempt_trace_try_flip_me1);
533 return 0;
534 }
535
536 smp_mb(); /* Ensure that the above checks precede any following flip. */
537 RCU_TRACE_ME(rcupreempt_trace_try_flip_m2);
538 return 1;
539}
540
541/*
542 * Attempt a single flip of the counters. Remember, a single flip does
543 * -not- constitute a grace period. Instead, the interval between
544 * at least GP_STAGES consecutive flips is a grace period.
545 *
546 * If anyone is nuts enough to run this CONFIG_PREEMPT_RCU implementation
547 * on a large SMP, they might want to use a hierarchical organization of
548 * the per-CPU-counter pairs.
549 */
550static void rcu_try_flip(void)
551{
552 unsigned long flags;
553
554 RCU_TRACE_ME(rcupreempt_trace_try_flip_1);
555 if (unlikely(!spin_trylock_irqsave(&rcu_ctrlblk.fliplock, flags))) {
556 RCU_TRACE_ME(rcupreempt_trace_try_flip_e1);
557 return;
558 }
559
560 /*
561 * Take the next transition(s) through the RCU grace-period
562 * flip-counter state machine.
563 */
564
565 switch (rcu_ctrlblk.rcu_try_flip_state) {
566 case rcu_try_flip_idle_state:
567 if (rcu_try_flip_idle())
568 rcu_ctrlblk.rcu_try_flip_state =
569 rcu_try_flip_waitack_state;
570 break;
571 case rcu_try_flip_waitack_state:
572 if (rcu_try_flip_waitack())
573 rcu_ctrlblk.rcu_try_flip_state =
574 rcu_try_flip_waitzero_state;
575 break;
576 case rcu_try_flip_waitzero_state:
577 if (rcu_try_flip_waitzero())
578 rcu_ctrlblk.rcu_try_flip_state =
579 rcu_try_flip_waitmb_state;
580 break;
581 case rcu_try_flip_waitmb_state:
582 if (rcu_try_flip_waitmb())
583 rcu_ctrlblk.rcu_try_flip_state =
584 rcu_try_flip_idle_state;
585 }
586 spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags);
587}
588
589/*
590 * Check to see if this CPU needs to do a memory barrier in order to
591 * ensure that any prior RCU read-side critical sections have committed
592 * their counter manipulations and critical-section memory references
593 * before declaring the grace period to be completed.
594 */
595static void rcu_check_mb(int cpu)
596{
597 if (per_cpu(rcu_mb_flag, cpu) == rcu_mb_needed) {
598 smp_mb(); /* Ensure RCU read-side accesses are visible. */
599 per_cpu(rcu_mb_flag, cpu) = rcu_mb_done;
600 }
601}
602
603void rcu_check_callbacks(int cpu, int user)
604{
605 unsigned long flags;
606 struct rcu_data *rdp = RCU_DATA_CPU(cpu);
607
608 rcu_check_mb(cpu);
609 if (rcu_ctrlblk.completed == rdp->completed)
610 rcu_try_flip();
611 spin_lock_irqsave(&rdp->lock, flags);
612 RCU_TRACE_RDP(rcupreempt_trace_check_callbacks, rdp);
613 __rcu_advance_callbacks(rdp);
614 if (rdp->donelist == NULL) {
615 spin_unlock_irqrestore(&rdp->lock, flags);
616 } else {
617 spin_unlock_irqrestore(&rdp->lock, flags);
618 raise_softirq(RCU_SOFTIRQ);
619 }
620}
621
622/*
623 * Needed by dynticks, to make sure all RCU processing has finished
624 * when we go idle:
625 */
626void rcu_advance_callbacks(int cpu, int user)
627{
628 unsigned long flags;
629 struct rcu_data *rdp = RCU_DATA_CPU(cpu);
630
631 if (rcu_ctrlblk.completed == rdp->completed) {
632 rcu_try_flip();
633 if (rcu_ctrlblk.completed == rdp->completed)
634 return;
635 }
636 spin_lock_irqsave(&rdp->lock, flags);
637 RCU_TRACE_RDP(rcupreempt_trace_check_callbacks, rdp);
638 __rcu_advance_callbacks(rdp);
639 spin_unlock_irqrestore(&rdp->lock, flags);
640}
641
642#ifdef CONFIG_HOTPLUG_CPU
643#define rcu_offline_cpu_enqueue(srclist, srctail, dstlist, dsttail) do { \
644 *dsttail = srclist; \
645 if (srclist != NULL) { \
646 dsttail = srctail; \
647 srclist = NULL; \
648 srctail = &srclist;\
649 } \
650 } while (0)
651
652void rcu_offline_cpu(int cpu)
653{
654 int i;
655 struct rcu_head *list = NULL;
656 unsigned long flags;
657 struct rcu_data *rdp = RCU_DATA_CPU(cpu);
658 struct rcu_head **tail = &list;
659
660 /*
661 * Remove all callbacks from the newly dead CPU, retaining order.
662 * Otherwise rcu_barrier() will fail
663 */
664
665 spin_lock_irqsave(&rdp->lock, flags);
666 rcu_offline_cpu_enqueue(rdp->donelist, rdp->donetail, list, tail);
667 for (i = GP_STAGES - 1; i >= 0; i--)
668 rcu_offline_cpu_enqueue(rdp->waitlist[i], rdp->waittail[i],
669 list, tail);
670 rcu_offline_cpu_enqueue(rdp->nextlist, rdp->nexttail, list, tail);
671 spin_unlock_irqrestore(&rdp->lock, flags);
672 rdp->waitlistcount = 0;
673
674 /* Disengage the newly dead CPU from the grace-period computation. */
675
676 spin_lock_irqsave(&rcu_ctrlblk.fliplock, flags);
677 rcu_check_mb(cpu);
678 if (per_cpu(rcu_flip_flag, cpu) == rcu_flipped) {
679 smp_mb(); /* Subsequent counter accesses must see new value */
680 per_cpu(rcu_flip_flag, cpu) = rcu_flip_seen;
681 smp_mb(); /* Subsequent RCU read-side critical sections */
682 /* seen -after- acknowledgement. */
683 }
684
685 RCU_DATA_ME()->rcu_flipctr[0] += RCU_DATA_CPU(cpu)->rcu_flipctr[0];
686 RCU_DATA_ME()->rcu_flipctr[1] += RCU_DATA_CPU(cpu)->rcu_flipctr[1];
687
688 RCU_DATA_CPU(cpu)->rcu_flipctr[0] = 0;
689 RCU_DATA_CPU(cpu)->rcu_flipctr[1] = 0;
690
691 cpu_clear(cpu, rcu_cpu_online_map);
692
693 spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags);
694
695 /*
696 * Place the removed callbacks on the current CPU's queue.
697 * Make them all start a new grace period: simple approach,
698 * in theory could starve a given set of callbacks, but
699 * you would need to be doing some serious CPU hotplugging
700 * to make this happen. If this becomes a problem, adding
701 * a synchronize_rcu() to the hotplug path would be a simple
702 * fix.
703 */
704
705 rdp = RCU_DATA_ME();
706 spin_lock_irqsave(&rdp->lock, flags);
707 *rdp->nexttail = list;
708 if (list)
709 rdp->nexttail = tail;
710 spin_unlock_irqrestore(&rdp->lock, flags);
711}
712
713void __devinit rcu_online_cpu(int cpu)
714{
715 unsigned long flags;
716
717 spin_lock_irqsave(&rcu_ctrlblk.fliplock, flags);
718 cpu_set(cpu, rcu_cpu_online_map);
719 spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags);
720}
721
722#else /* #ifdef CONFIG_HOTPLUG_CPU */
723
724void rcu_offline_cpu(int cpu)
725{
726}
727
728void __devinit rcu_online_cpu(int cpu)
729{
730}
731
732#endif /* #else #ifdef CONFIG_HOTPLUG_CPU */
733
734static void rcu_process_callbacks(struct softirq_action *unused)
735{
736 unsigned long flags;
737 struct rcu_head *next, *list;
738 struct rcu_data *rdp = RCU_DATA_ME();
739
740 spin_lock_irqsave(&rdp->lock, flags);
741 list = rdp->donelist;
742 if (list == NULL) {
743 spin_unlock_irqrestore(&rdp->lock, flags);
744 return;
745 }
746 rdp->donelist = NULL;
747 rdp->donetail = &rdp->donelist;
748 RCU_TRACE_RDP(rcupreempt_trace_done_remove, rdp);
749 spin_unlock_irqrestore(&rdp->lock, flags);
750 while (list) {
751 next = list->next;
752 list->func(list);
753 list = next;
754 RCU_TRACE_ME(rcupreempt_trace_invoke);
755 }
756}
757
758void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
759{
760 unsigned long flags;
761 struct rcu_data *rdp;
762
763 head->func = func;
764 head->next = NULL;
765 local_irq_save(flags);
766 rdp = RCU_DATA_ME();
767 spin_lock(&rdp->lock);
768 __rcu_advance_callbacks(rdp);
769 *rdp->nexttail = head;
770 rdp->nexttail = &head->next;
771 RCU_TRACE_RDP(rcupreempt_trace_next_add, rdp);
772 spin_unlock(&rdp->lock);
773 local_irq_restore(flags);
774}
775EXPORT_SYMBOL_GPL(call_rcu);
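
The usual caller-side pattern for call_rcu() embeds the rcu_head in the object being protected. The sketch below is illustrative only, not part of this patch; struct foo and the helpers are hypothetical.

struct foo {
	int a;
	struct rcu_head rcu;		/* embedded callback header */
};

static void foo_reclaim(struct rcu_head *rp)
{
	/* container_of() recovers the enclosing object from its rcu_head. */
	kfree(container_of(rp, struct foo, rcu));
}

static void foo_release(struct foo *fp)
{
	/* First unlink fp so no new readers can find it, then: */
	call_rcu(&fp->rcu, foo_reclaim);
}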
776
777/*
778 * Wait until all currently running preempt_disable() code segments
779 * (including hardware-irq-disable segments) complete. Note that
780 * in -rt this does -not- necessarily result in all currently executing
781 * interrupt -handlers- having completed.
782 */
783void __synchronize_sched(void)
784{
785 cpumask_t oldmask;
786 int cpu;
787
788 if (sched_getaffinity(0, &oldmask) < 0)
789 oldmask = cpu_possible_map;
790 for_each_online_cpu(cpu) {
791 sched_setaffinity(0, cpumask_of_cpu(cpu));
792 schedule();
793 }
794 sched_setaffinity(0, oldmask);
795}
796EXPORT_SYMBOL_GPL(__synchronize_sched);
797
798/*
799 * Check to see if any future RCU-related work will need to be done
800 * by the current CPU, even if none need be done immediately, returning
801 * 1 if so. Assumes that notifiers would take care of handling any
802 * outstanding requests from the RCU core.
803 *
804 * This function is part of the RCU implementation; it is -not-
805 * an exported member of the RCU API.
806 */
807int rcu_needs_cpu(int cpu)
808{
809 struct rcu_data *rdp = RCU_DATA_CPU(cpu);
810
811 return (rdp->donelist != NULL ||
812 !!rdp->waitlistcount ||
813 rdp->nextlist != NULL);
814}
815
816int rcu_pending(int cpu)
817{
818 struct rcu_data *rdp = RCU_DATA_CPU(cpu);
819
820 /* The CPU has at least one callback queued somewhere. */
821
822 if (rdp->donelist != NULL ||
823 !!rdp->waitlistcount ||
824 rdp->nextlist != NULL)
825 return 1;
826
827 /* The RCU core needs an acknowledgement from this CPU. */
828
829 if ((per_cpu(rcu_flip_flag, cpu) == rcu_flipped) ||
830 (per_cpu(rcu_mb_flag, cpu) == rcu_mb_needed))
831 return 1;
832
833 /* This CPU has fallen behind the global grace-period number. */
834
835 if (rdp->completed != rcu_ctrlblk.completed)
836 return 1;
837
838 /* Nothing needed from this CPU. */
839
840 return 0;
841}
842
843static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
844 unsigned long action, void *hcpu)
845{
846 long cpu = (long)hcpu;
847
848 switch (action) {
849 case CPU_UP_PREPARE:
850 case CPU_UP_PREPARE_FROZEN:
851 rcu_online_cpu(cpu);
852 break;
853 case CPU_UP_CANCELED:
854 case CPU_UP_CANCELED_FROZEN:
855 case CPU_DEAD:
856 case CPU_DEAD_FROZEN:
857 rcu_offline_cpu(cpu);
858 break;
859 default:
860 break;
861 }
862 return NOTIFY_OK;
863}
864
865static struct notifier_block __cpuinitdata rcu_nb = {
866 .notifier_call = rcu_cpu_notify,
867};
868
869void __init __rcu_init(void)
870{
871 int cpu;
872 int i;
873 struct rcu_data *rdp;
874
875 printk(KERN_NOTICE "Preemptible RCU implementation.\n");
876 for_each_possible_cpu(cpu) {
877 rdp = RCU_DATA_CPU(cpu);
878 spin_lock_init(&rdp->lock);
879 rdp->completed = 0;
880 rdp->waitlistcount = 0;
881 rdp->nextlist = NULL;
882 rdp->nexttail = &rdp->nextlist;
883 for (i = 0; i < GP_STAGES; i++) {
884 rdp->waitlist[i] = NULL;
885 rdp->waittail[i] = &rdp->waitlist[i];
886 }
887 rdp->donelist = NULL;
888 rdp->donetail = &rdp->donelist;
889 rdp->rcu_flipctr[0] = 0;
890 rdp->rcu_flipctr[1] = 0;
891 }
892 register_cpu_notifier(&rcu_nb);
893
894 /*
895 * We don't need protection against CPU-Hotplug here
896 * since
897 * a) If a CPU comes online while we are iterating over the
898 * cpu_online_map below, we would only end up making a
899 * duplicate call to rcu_online_cpu() which sets the corresponding
900 * CPU's mask in the rcu_cpu_online_map.
901 *
902 * b) A CPU cannot go offline at this point in time since the user
903 * does not have access to the sysfs interface, nor do we
904 * suspend the system.
905 */
906 for_each_online_cpu(cpu)
907 rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, (void *)(long) cpu);
908
909 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks, NULL);
910}
911
912/*
913 * Deprecated, use synchronize_rcu() or synchronize_sched() instead.
914 */
915void synchronize_kernel(void)
916{
917 synchronize_rcu();
918}
919
920#ifdef CONFIG_RCU_TRACE
921long *rcupreempt_flipctr(int cpu)
922{
923 return &RCU_DATA_CPU(cpu)->rcu_flipctr[0];
924}
925EXPORT_SYMBOL_GPL(rcupreempt_flipctr);
926
927int rcupreempt_flip_flag(int cpu)
928{
929 return per_cpu(rcu_flip_flag, cpu);
930}
931EXPORT_SYMBOL_GPL(rcupreempt_flip_flag);
932
933int rcupreempt_mb_flag(int cpu)
934{
935 return per_cpu(rcu_mb_flag, cpu);
936}
937EXPORT_SYMBOL_GPL(rcupreempt_mb_flag);
938
939char *rcupreempt_try_flip_state_name(void)
940{
941 return rcu_try_flip_state_names[rcu_ctrlblk.rcu_try_flip_state];
942}
943EXPORT_SYMBOL_GPL(rcupreempt_try_flip_state_name);
944
945struct rcupreempt_trace *rcupreempt_trace_cpu(int cpu)
946{
947 struct rcu_data *rdp = RCU_DATA_CPU(cpu);
948
949 return &rdp->trace;
950}
951EXPORT_SYMBOL_GPL(rcupreempt_trace_cpu);
952
953#endif /* #ifdef CONFIG_RCU_TRACE */
diff --git a/kernel/rcupreempt_trace.c b/kernel/rcupreempt_trace.c
new file mode 100644
index 000000000000..49ac4947af24
--- /dev/null
+++ b/kernel/rcupreempt_trace.c
@@ -0,0 +1,330 @@
1/*
2 * Read-Copy Update tracing for realtime implementation
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 *
18 * Copyright IBM Corporation, 2006
19 *
20 * Papers: http://www.rdrop.com/users/paulmck/RCU
21 *
22 * For detailed explanation of Read-Copy Update mechanism see -
23 * Documentation/RCU/ *.txt
24 *
25 */
26#include <linux/types.h>
27#include <linux/kernel.h>
28#include <linux/init.h>
29#include <linux/spinlock.h>
30#include <linux/smp.h>
31#include <linux/rcupdate.h>
32#include <linux/interrupt.h>
33#include <linux/sched.h>
34#include <asm/atomic.h>
35#include <linux/bitops.h>
36#include <linux/module.h>
37#include <linux/completion.h>
38#include <linux/moduleparam.h>
39#include <linux/percpu.h>
40#include <linux/notifier.h>
41#include <linux/rcupdate.h>
42#include <linux/cpu.h>
43#include <linux/mutex.h>
44#include <linux/rcupreempt_trace.h>
45#include <linux/debugfs.h>
46
47static struct mutex rcupreempt_trace_mutex;
48static char *rcupreempt_trace_buf;
49#define RCUPREEMPT_TRACE_BUF_SIZE 4096
50
51void rcupreempt_trace_move2done(struct rcupreempt_trace *trace)
52{
53 trace->done_length += trace->wait_length;
54 trace->done_add += trace->wait_length;
55 trace->wait_length = 0;
56}
57void rcupreempt_trace_move2wait(struct rcupreempt_trace *trace)
58{
59 trace->wait_length += trace->next_length;
60 trace->wait_add += trace->next_length;
61 trace->next_length = 0;
62}
63void rcupreempt_trace_try_flip_1(struct rcupreempt_trace *trace)
64{
65 atomic_inc(&trace->rcu_try_flip_1);
66}
67void rcupreempt_trace_try_flip_e1(struct rcupreempt_trace *trace)
68{
69 atomic_inc(&trace->rcu_try_flip_e1);
70}
71void rcupreempt_trace_try_flip_i1(struct rcupreempt_trace *trace)
72{
73 trace->rcu_try_flip_i1++;
74}
75void rcupreempt_trace_try_flip_ie1(struct rcupreempt_trace *trace)
76{
77 trace->rcu_try_flip_ie1++;
78}
79void rcupreempt_trace_try_flip_g1(struct rcupreempt_trace *trace)
80{
81 trace->rcu_try_flip_g1++;
82}
83void rcupreempt_trace_try_flip_a1(struct rcupreempt_trace *trace)
84{
85 trace->rcu_try_flip_a1++;
86}
87void rcupreempt_trace_try_flip_ae1(struct rcupreempt_trace *trace)
88{
89 trace->rcu_try_flip_ae1++;
90}
91void rcupreempt_trace_try_flip_a2(struct rcupreempt_trace *trace)
92{
93 trace->rcu_try_flip_a2++;
94}
95void rcupreempt_trace_try_flip_z1(struct rcupreempt_trace *trace)
96{
97 trace->rcu_try_flip_z1++;
98}
99void rcupreempt_trace_try_flip_ze1(struct rcupreempt_trace *trace)
100{
101 trace->rcu_try_flip_ze1++;
102}
103void rcupreempt_trace_try_flip_z2(struct rcupreempt_trace *trace)
104{
105 trace->rcu_try_flip_z2++;
106}
107void rcupreempt_trace_try_flip_m1(struct rcupreempt_trace *trace)
108{
109 trace->rcu_try_flip_m1++;
110}
111void rcupreempt_trace_try_flip_me1(struct rcupreempt_trace *trace)
112{
113 trace->rcu_try_flip_me1++;
114}
115void rcupreempt_trace_try_flip_m2(struct rcupreempt_trace *trace)
116{
117 trace->rcu_try_flip_m2++;
118}
119void rcupreempt_trace_check_callbacks(struct rcupreempt_trace *trace)
120{
121 trace->rcu_check_callbacks++;
122}
123void rcupreempt_trace_done_remove(struct rcupreempt_trace *trace)
124{
125 trace->done_remove += trace->done_length;
126 trace->done_length = 0;
127}
128void rcupreempt_trace_invoke(struct rcupreempt_trace *trace)
129{
130 atomic_inc(&trace->done_invoked);
131}
132void rcupreempt_trace_next_add(struct rcupreempt_trace *trace)
133{
134 trace->next_add++;
135 trace->next_length++;
136}
137
138static void rcupreempt_trace_sum(struct rcupreempt_trace *sp)
139{
140 struct rcupreempt_trace *cp;
141 int cpu;
142
143 memset(sp, 0, sizeof(*sp));
144 for_each_possible_cpu(cpu) {
145 cp = rcupreempt_trace_cpu(cpu);
146 sp->next_length += cp->next_length;
147 sp->next_add += cp->next_add;
148 sp->wait_length += cp->wait_length;
149 sp->wait_add += cp->wait_add;
150 sp->done_length += cp->done_length;
151 sp->done_add += cp->done_add;
152 sp->done_remove += cp->done_remove;
153 atomic_set(&sp->done_invoked, atomic_read(&cp->done_invoked));
154 sp->rcu_check_callbacks += cp->rcu_check_callbacks;
155 atomic_set(&sp->rcu_try_flip_1,
156 atomic_read(&cp->rcu_try_flip_1));
157 atomic_set(&sp->rcu_try_flip_e1,
158 atomic_read(&cp->rcu_try_flip_e1));
159 sp->rcu_try_flip_i1 += cp->rcu_try_flip_i1;
160 sp->rcu_try_flip_ie1 += cp->rcu_try_flip_ie1;
161 sp->rcu_try_flip_g1 += cp->rcu_try_flip_g1;
162 sp->rcu_try_flip_a1 += cp->rcu_try_flip_a1;
163 sp->rcu_try_flip_ae1 += cp->rcu_try_flip_ae1;
164 sp->rcu_try_flip_a2 += cp->rcu_try_flip_a2;
165 sp->rcu_try_flip_z1 += cp->rcu_try_flip_z1;
166 sp->rcu_try_flip_ze1 += cp->rcu_try_flip_ze1;
167 sp->rcu_try_flip_z2 += cp->rcu_try_flip_z2;
168 sp->rcu_try_flip_m1 += cp->rcu_try_flip_m1;
169 sp->rcu_try_flip_me1 += cp->rcu_try_flip_me1;
170 sp->rcu_try_flip_m2 += cp->rcu_try_flip_m2;
171 }
172}
173
174static ssize_t rcustats_read(struct file *filp, char __user *buffer,
175 size_t count, loff_t *ppos)
176{
177 struct rcupreempt_trace trace;
178 ssize_t bcount;
179 int cnt = 0;
180
181 rcupreempt_trace_sum(&trace);
182 mutex_lock(&rcupreempt_trace_mutex);
183 snprintf(&rcupreempt_trace_buf[cnt], RCUPREEMPT_TRACE_BUF_SIZE - cnt,
184 "ggp=%ld rcc=%ld\n",
185 rcu_batches_completed(),
186 trace.rcu_check_callbacks);
187 snprintf(&rcupreempt_trace_buf[cnt], RCUPREEMPT_TRACE_BUF_SIZE - cnt,
188 "na=%ld nl=%ld wa=%ld wl=%ld da=%ld dl=%ld dr=%ld di=%d\n"
189 "1=%d e1=%d i1=%ld ie1=%ld g1=%ld a1=%ld ae1=%ld a2=%ld\n"
190 "z1=%ld ze1=%ld z2=%ld m1=%ld me1=%ld m2=%ld\n",
191
192 trace.next_add, trace.next_length,
193 trace.wait_add, trace.wait_length,
194 trace.done_add, trace.done_length,
195 trace.done_remove, atomic_read(&trace.done_invoked),
196 atomic_read(&trace.rcu_try_flip_1),
197 atomic_read(&trace.rcu_try_flip_e1),
198 trace.rcu_try_flip_i1, trace.rcu_try_flip_ie1,
199 trace.rcu_try_flip_g1,
200 trace.rcu_try_flip_a1, trace.rcu_try_flip_ae1,
201 trace.rcu_try_flip_a2,
202 trace.rcu_try_flip_z1, trace.rcu_try_flip_ze1,
203 trace.rcu_try_flip_z2,
204 trace.rcu_try_flip_m1, trace.rcu_try_flip_me1,
205 trace.rcu_try_flip_m2);
206 bcount = simple_read_from_buffer(buffer, count, ppos,
207 rcupreempt_trace_buf, strlen(rcupreempt_trace_buf));
208 mutex_unlock(&rcupreempt_trace_mutex);
209 return bcount;
210}
211
212static ssize_t rcugp_read(struct file *filp, char __user *buffer,
213 size_t count, loff_t *ppos)
214{
215 long oldgp = rcu_batches_completed();
216 ssize_t bcount;
217
218 mutex_lock(&rcupreempt_trace_mutex);
219 synchronize_rcu();
220 snprintf(rcupreempt_trace_buf, RCUPREEMPT_TRACE_BUF_SIZE,
221 "oldggp=%ld newggp=%ld\n", oldgp, rcu_batches_completed());
222 bcount = simple_read_from_buffer(buffer, count, ppos,
223 rcupreempt_trace_buf, strlen(rcupreempt_trace_buf));
224 mutex_unlock(&rcupreempt_trace_mutex);
225 return bcount;
226}
227
228static ssize_t rcuctrs_read(struct file *filp, char __user *buffer,
229 size_t count, loff_t *ppos)
230{
231 int cnt = 0;
232 int cpu;
233 int f = rcu_batches_completed() & 0x1;
234 ssize_t bcount;
235
236 mutex_lock(&rcupreempt_trace_mutex);
237
238 cnt += snprintf(&rcupreempt_trace_buf[cnt], RCUPREEMPT_TRACE_BUF_SIZE,
239 "CPU last cur F M\n");
240 for_each_online_cpu(cpu) {
241 long *flipctr = rcupreempt_flipctr(cpu);
242 cnt += snprintf(&rcupreempt_trace_buf[cnt],
243 RCUPREEMPT_TRACE_BUF_SIZE - cnt,
244 "%3d %4ld %3ld %d %d\n",
245 cpu,
246 flipctr[!f],
247 flipctr[f],
248 rcupreempt_flip_flag(cpu),
249 rcupreempt_mb_flag(cpu));
250 }
251 cnt += snprintf(&rcupreempt_trace_buf[cnt],
252 RCUPREEMPT_TRACE_BUF_SIZE - cnt,
253 "ggp = %ld, state = %s\n",
254 rcu_batches_completed(),
255 rcupreempt_try_flip_state_name());
256 cnt += snprintf(&rcupreempt_trace_buf[cnt],
257 RCUPREEMPT_TRACE_BUF_SIZE - cnt,
258 "\n");
259 bcount = simple_read_from_buffer(buffer, count, ppos,
260 rcupreempt_trace_buf, strlen(rcupreempt_trace_buf));
261 mutex_unlock(&rcupreempt_trace_mutex);
262 return bcount;
263}
264
265static struct file_operations rcustats_fops = {
266 .owner = THIS_MODULE,
267 .read = rcustats_read,
268};
269
270static struct file_operations rcugp_fops = {
271 .owner = THIS_MODULE,
272 .read = rcugp_read,
273};
274
275static struct file_operations rcuctrs_fops = {
276 .owner = THIS_MODULE,
277 .read = rcuctrs_read,
278};
279
280static struct dentry *rcudir, *statdir, *ctrsdir, *gpdir;
281static int rcupreempt_debugfs_init(void)
282{
283 rcudir = debugfs_create_dir("rcu", NULL);
284 if (!rcudir)
285 goto out;
286 statdir = debugfs_create_file("rcustats", 0444, rcudir,
287 NULL, &rcustats_fops);
288 if (!statdir)
289 goto free_out;
290
291 gpdir = debugfs_create_file("rcugp", 0444, rcudir, NULL, &rcugp_fops);
292 if (!gpdir)
293 goto free_out;
294
295 ctrsdir = debugfs_create_file("rcuctrs", 0444, rcudir,
296 NULL, &rcuctrs_fops);
297 if (!ctrsdir)
298 goto free_out;
299 return 0;
300free_out:
301 if (statdir)
302 debugfs_remove(statdir);
303 if (gpdir)
304 debugfs_remove(gpdir);
305 debugfs_remove(rcudir);
306out:
307 return 1;
308}
309
310static int __init rcupreempt_trace_init(void)
311{
312 mutex_init(&rcupreempt_trace_mutex);
313 rcupreempt_trace_buf = kmalloc(RCUPREEMPT_TRACE_BUF_SIZE, GFP_KERNEL);
314 if (!rcupreempt_trace_buf)
315 return 1;
316 return rcupreempt_debugfs_init();
317}
318
319static void __exit rcupreempt_trace_cleanup(void)
320{
321 debugfs_remove(statdir);
322 debugfs_remove(gpdir);
323 debugfs_remove(ctrsdir);
324 debugfs_remove(rcudir);
325 kfree(rcupreempt_trace_buf);
326}
327
328
329module_init(rcupreempt_trace_init);
330module_exit(rcupreempt_trace_cleanup);
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index c3e165c2318f..fd599829e72a 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -726,11 +726,11 @@ static void rcu_torture_shuffle_tasks(void)
726 cpumask_t tmp_mask = CPU_MASK_ALL; 726 cpumask_t tmp_mask = CPU_MASK_ALL;
727 int i; 727 int i;
728 728
729 lock_cpu_hotplug(); 729 get_online_cpus();
730 730
731 /* No point in shuffling if there is only one online CPU (ex: UP) */ 731 /* No point in shuffling if there is only one online CPU (ex: UP) */
732 if (num_online_cpus() == 1) { 732 if (num_online_cpus() == 1) {
733 unlock_cpu_hotplug(); 733 put_online_cpus();
734 return; 734 return;
735 } 735 }
736 736
@@ -762,7 +762,7 @@ static void rcu_torture_shuffle_tasks(void)
762 else 762 else
763 rcu_idle_cpu--; 763 rcu_idle_cpu--;
764 764
765 unlock_cpu_hotplug(); 765 put_online_cpus();
766} 766}
767 767
768/* Shuffle tasks across CPUs, with the intent of allowing each CPU in the 768/* Shuffle tasks across CPUs, with the intent of allowing each CPU in the
diff --git a/kernel/resource.c b/kernel/resource.c
index a358142ff48f..2eb553d9b517 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -277,7 +277,7 @@ walk_memory_resource(unsigned long start_pfn, unsigned long nr_pages, void *arg,
277 int ret = -1; 277 int ret = -1;
278 res.start = (u64) start_pfn << PAGE_SHIFT; 278 res.start = (u64) start_pfn << PAGE_SHIFT;
279 res.end = ((u64)(start_pfn + nr_pages) << PAGE_SHIFT) - 1; 279 res.end = ((u64)(start_pfn + nr_pages) << PAGE_SHIFT) - 1;
280 res.flags = IORESOURCE_MEM; 280 res.flags = IORESOURCE_MEM | IORESOURCE_BUSY;
281 orig_end = res.end; 281 orig_end = res.end;
282 while ((res.start < res.end) && (find_next_system_ram(&res) >= 0)) { 282 while ((res.start < res.end) && (find_next_system_ram(&res) >= 0)) {
283 pfn = (unsigned long)(res.start >> PAGE_SHIFT); 283 pfn = (unsigned long)(res.start >> PAGE_SHIFT);
diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c
index e3055ba69159..092e4c620af9 100644
--- a/kernel/rtmutex-tester.c
+++ b/kernel/rtmutex-tester.c
@@ -394,7 +394,7 @@ static SYSDEV_ATTR(status, 0600, sysfs_test_status, NULL);
394static SYSDEV_ATTR(command, 0600, NULL, sysfs_test_command); 394static SYSDEV_ATTR(command, 0600, NULL, sysfs_test_command);
395 395
396static struct sysdev_class rttest_sysclass = { 396static struct sysdev_class rttest_sysclass = {
397 set_kset_name("rttest"), 397 .name = "rttest",
398}; 398};
399 399
400static int init_test_thread(int id) 400static int init_test_thread(int id)
diff --git a/kernel/rwsem.c b/kernel/rwsem.c
index 1ec620c03064..cae050b05f5e 100644
--- a/kernel/rwsem.c
+++ b/kernel/rwsem.c
@@ -6,6 +6,7 @@
6 6
7#include <linux/types.h> 7#include <linux/types.h>
8#include <linux/kernel.h> 8#include <linux/kernel.h>
9#include <linux/sched.h>
9#include <linux/module.h> 10#include <linux/module.h>
10#include <linux/rwsem.h> 11#include <linux/rwsem.h>
11 12
@@ -15,7 +16,7 @@
15/* 16/*
16 * lock for reading 17 * lock for reading
17 */ 18 */
18void down_read(struct rw_semaphore *sem) 19void __sched down_read(struct rw_semaphore *sem)
19{ 20{
20 might_sleep(); 21 might_sleep();
21 rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_); 22 rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_);
@@ -42,7 +43,7 @@ EXPORT_SYMBOL(down_read_trylock);
42/* 43/*
43 * lock for writing 44 * lock for writing
44 */ 45 */
45void down_write(struct rw_semaphore *sem) 46void __sched down_write(struct rw_semaphore *sem)
46{ 47{
47 might_sleep(); 48 might_sleep();
48 rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_); 49 rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_);
diff --git a/kernel/sched.c b/kernel/sched.c
index b4fbbc440453..9474b23c28bf 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -22,6 +22,8 @@
22 * by Peter Williams 22 * by Peter Williams
23 * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith 23 * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith
24 * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri 24 * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri
25 * 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins,
26 * Thomas Gleixner, Mike Kravetz
25 */ 27 */
26 28
27#include <linux/mm.h> 29#include <linux/mm.h>
@@ -52,7 +54,6 @@
52#include <linux/cpu.h> 54#include <linux/cpu.h>
53#include <linux/cpuset.h> 55#include <linux/cpuset.h>
54#include <linux/percpu.h> 56#include <linux/percpu.h>
55#include <linux/cpu_acct.h>
56#include <linux/kthread.h> 57#include <linux/kthread.h>
57#include <linux/seq_file.h> 58#include <linux/seq_file.h>
58#include <linux/sysctl.h> 59#include <linux/sysctl.h>
@@ -64,6 +65,7 @@
64#include <linux/reciprocal_div.h> 65#include <linux/reciprocal_div.h>
65#include <linux/unistd.h> 66#include <linux/unistd.h>
66#include <linux/pagemap.h> 67#include <linux/pagemap.h>
68#include <linux/hrtimer.h>
67 69
68#include <asm/tlb.h> 70#include <asm/tlb.h>
69#include <asm/irq_regs.h> 71#include <asm/irq_regs.h>
@@ -75,7 +77,7 @@
75 */ 77 */
76unsigned long long __attribute__((weak)) sched_clock(void) 78unsigned long long __attribute__((weak)) sched_clock(void)
77{ 79{
78 return (unsigned long long)jiffies * (1000000000 / HZ); 80 return (unsigned long long)jiffies * (NSEC_PER_SEC / HZ);
79} 81}
80 82
81/* 83/*
@@ -97,10 +99,9 @@ unsigned long long __attribute__((weak)) sched_clock(void)
97#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) 99#define MAX_USER_PRIO (USER_PRIO(MAX_PRIO))
98 100
99/* 101/*
100 * Some helpers for converting nanosecond timing to jiffy resolution 102 * Helpers for converting nanosecond timing to jiffy resolution
101 */ 103 */
102#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (1000000000 / HZ)) 104#define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ))
103#define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ))
104 105
105#define NICE_0_LOAD SCHED_LOAD_SCALE 106#define NICE_0_LOAD SCHED_LOAD_SCALE
106#define NICE_0_SHIFT SCHED_LOAD_SHIFT 107#define NICE_0_SHIFT SCHED_LOAD_SHIFT
@@ -160,6 +161,8 @@ struct rt_prio_array {
160 161
161struct cfs_rq; 162struct cfs_rq;
162 163
164static LIST_HEAD(task_groups);
165
163/* task group related information */ 166/* task group related information */
164struct task_group { 167struct task_group {
165#ifdef CONFIG_FAIR_CGROUP_SCHED 168#ifdef CONFIG_FAIR_CGROUP_SCHED
@@ -169,9 +172,50 @@ struct task_group {
169 struct sched_entity **se; 172 struct sched_entity **se;
170 /* runqueue "owned" by this group on each cpu */ 173 /* runqueue "owned" by this group on each cpu */
171 struct cfs_rq **cfs_rq; 174 struct cfs_rq **cfs_rq;
175
176 struct sched_rt_entity **rt_se;
177 struct rt_rq **rt_rq;
178
179 unsigned int rt_ratio;
180
181 /*
 182 * The shares assigned to a task group govern how much of the cpu
 183 * bandwidth is allocated to the group. The more shares a group has,
 184 * the more cpu bandwidth is allocated to it.
185 *
 186 * For example, let's say that there are three task groups, A, B and C, which
187 * have been assigned shares 1000, 2000 and 3000 respectively. Then,
188 * cpu bandwidth allocated by the scheduler to task groups A, B and C
189 * should be:
190 *
191 * Bw(A) = 1000/(1000+2000+3000) * 100 = 16.66%
192 * Bw(B) = 2000/(1000+2000+3000) * 100 = 33.33%
193 * Bw(C) = 3000/(1000+2000+3000) * 100 = 50%
194 *
195 * The weight assigned to a task group's schedulable entities on every
196 * cpu (task_group.se[a_cpu]->load.weight) is derived from the task
 197 * group's shares. For example, let's say that task group A has been
198 * assigned shares of 1000 and there are two CPUs in a system. Then,
199 *
200 * tg_A->se[0]->load.weight = tg_A->se[1]->load.weight = 1000;
201 *
 202 * Note: It's not necessary that each of a task group's schedulable
 203 * entities has the same weight on all CPUs. If the group
204 * has 2 of its tasks on CPU0 and 1 task on CPU1, then a
205 * better distribution of weight could be:
206 *
207 * tg_A->se[0]->load.weight = 2/3 * 2000 = 1333
 208 * tg_A->se[1]->load.weight = 1/3 * 2000 = 667
209 *
210 * rebalance_shares() is responsible for distributing the shares of a
 211 * task group like this among the group's schedulable entities across
212 * cpus.
213 *
214 */
172 unsigned long shares; 215 unsigned long shares;
173 /* spinlock to serialize modification to shares */ 216
174 spinlock_t lock; 217 struct rcu_head rcu;
218 struct list_head list;
175}; 219};
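
The bandwidth rule in the comment above reduces to Bw(group) = shares(group) / sum of all shares. A tiny illustrative sketch, not part of this patch:

/* Illustrative only: integer percentage of cpu bandwidth for a group. */
static unsigned long tg_bandwidth_pct(unsigned long shares,
				      unsigned long total_shares)
{
	return shares * 100 / total_shares;
}

/* With shares 1000, 2000 and 3000 (total 6000) this yields 16, 33 and 50. */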
176 220
177/* Default task group's sched entity on each cpu */ 221/* Default task group's sched entity on each cpu */
@@ -179,24 +223,51 @@ static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
179/* Default task group's cfs_rq on each cpu */ 223/* Default task group's cfs_rq on each cpu */
180static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; 224static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp;
181 225
226static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
227static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
228
182static struct sched_entity *init_sched_entity_p[NR_CPUS]; 229static struct sched_entity *init_sched_entity_p[NR_CPUS];
183static struct cfs_rq *init_cfs_rq_p[NR_CPUS]; 230static struct cfs_rq *init_cfs_rq_p[NR_CPUS];
184 231
232static struct sched_rt_entity *init_sched_rt_entity_p[NR_CPUS];
233static struct rt_rq *init_rt_rq_p[NR_CPUS];
234
235/* task_group_mutex serializes add/remove of task groups and also changes to
236 * a task group's cpu shares.
237 */
238static DEFINE_MUTEX(task_group_mutex);
239
240/* doms_cur_mutex serializes access to doms_cur[] array */
241static DEFINE_MUTEX(doms_cur_mutex);
242
243#ifdef CONFIG_SMP
244/* kernel thread that runs rebalance_shares() periodically */
245static struct task_struct *lb_monitor_task;
246static int load_balance_monitor(void *unused);
247#endif
248
249static void set_se_shares(struct sched_entity *se, unsigned long shares);
250
185/* Default task group. 251/* Default task group.
 186 * Every task in the system belongs to this group at bootup. 252 * Every task in the system belongs to this group at bootup.
187 */ 253 */
188struct task_group init_task_group = { 254struct task_group init_task_group = {
189 .se = init_sched_entity_p, 255 .se = init_sched_entity_p,
190 .cfs_rq = init_cfs_rq_p, 256 .cfs_rq = init_cfs_rq_p,
257
258 .rt_se = init_sched_rt_entity_p,
259 .rt_rq = init_rt_rq_p,
191}; 260};
192 261
193#ifdef CONFIG_FAIR_USER_SCHED 262#ifdef CONFIG_FAIR_USER_SCHED
194# define INIT_TASK_GRP_LOAD 2*NICE_0_LOAD 263# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD)
195#else 264#else
196# define INIT_TASK_GRP_LOAD NICE_0_LOAD 265# define INIT_TASK_GROUP_LOAD NICE_0_LOAD
197#endif 266#endif
198 267
199static int init_task_group_load = INIT_TASK_GRP_LOAD; 268#define MIN_GROUP_SHARES 2
269
270static int init_task_group_load = INIT_TASK_GROUP_LOAD;
200 271
201/* return group to which a task belongs */ 272/* return group to which a task belongs */
202static inline struct task_group *task_group(struct task_struct *p) 273static inline struct task_group *task_group(struct task_struct *p)
@@ -209,22 +280,48 @@ static inline struct task_group *task_group(struct task_struct *p)
209 tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id), 280 tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id),
210 struct task_group, css); 281 struct task_group, css);
211#else 282#else
212 tg = &init_task_group; 283 tg = &init_task_group;
213#endif 284#endif
214
215 return tg; 285 return tg;
216} 286}
217 287
218/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ 288/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
219static inline void set_task_cfs_rq(struct task_struct *p) 289static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
290{
291 p->se.cfs_rq = task_group(p)->cfs_rq[cpu];
292 p->se.parent = task_group(p)->se[cpu];
293
294 p->rt.rt_rq = task_group(p)->rt_rq[cpu];
295 p->rt.parent = task_group(p)->rt_se[cpu];
296}
297
298static inline void lock_task_group_list(void)
220{ 299{
221 p->se.cfs_rq = task_group(p)->cfs_rq[task_cpu(p)]; 300 mutex_lock(&task_group_mutex);
222 p->se.parent = task_group(p)->se[task_cpu(p)]; 301}
302
303static inline void unlock_task_group_list(void)
304{
305 mutex_unlock(&task_group_mutex);
306}
307
308static inline void lock_doms_cur(void)
309{
310 mutex_lock(&doms_cur_mutex);
311}
312
313static inline void unlock_doms_cur(void)
314{
315 mutex_unlock(&doms_cur_mutex);
223} 316}
224 317
225#else 318#else
226 319
227static inline void set_task_cfs_rq(struct task_struct *p) { } 320static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
321static inline void lock_task_group_list(void) { }
322static inline void unlock_task_group_list(void) { }
323static inline void lock_doms_cur(void) { }
324static inline void unlock_doms_cur(void) { }
228 325
229#endif /* CONFIG_FAIR_GROUP_SCHED */ 326#endif /* CONFIG_FAIR_GROUP_SCHED */
230 327
@@ -249,26 +346,72 @@ struct cfs_rq {
249#ifdef CONFIG_FAIR_GROUP_SCHED 346#ifdef CONFIG_FAIR_GROUP_SCHED
250 struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ 347 struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */
251 348
252 /* leaf cfs_rqs are those that hold tasks (lowest schedulable entity in 349 /*
350 * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in
253 * a hierarchy). Non-leaf lrqs hold other higher schedulable entities 351 * a hierarchy). Non-leaf lrqs hold other higher schedulable entities
254 * (like users, containers etc.) 352 * (like users, containers etc.)
255 * 353 *
256 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This 354 * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This
257 * list is used during load balance. 355 * list is used during load balance.
258 */ 356 */
259 struct list_head leaf_cfs_rq_list; /* Better name : task_cfs_rq_list? */ 357 struct list_head leaf_cfs_rq_list;
260 struct task_group *tg; /* group that "owns" this runqueue */ 358 struct task_group *tg; /* group that "owns" this runqueue */
261 struct rcu_head rcu;
262#endif 359#endif
263}; 360};
264 361
265/* Real-Time classes' related field in a runqueue: */ 362/* Real-Time classes' related field in a runqueue: */
266struct rt_rq { 363struct rt_rq {
267 struct rt_prio_array active; 364 struct rt_prio_array active;
268 int rt_load_balance_idx; 365 unsigned long rt_nr_running;
269 struct list_head *rt_load_balance_head, *rt_load_balance_curr; 366#if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED
367 int highest_prio; /* highest queued rt task prio */
368#endif
369#ifdef CONFIG_SMP
370 unsigned long rt_nr_migratory;
371 int overloaded;
372#endif
373 int rt_throttled;
374 u64 rt_time;
375
376#ifdef CONFIG_FAIR_GROUP_SCHED
377 struct rq *rq;
378 struct list_head leaf_rt_rq_list;
379 struct task_group *tg;
380 struct sched_rt_entity *rt_se;
381#endif
270}; 382};
271 383
384#ifdef CONFIG_SMP
385
386/*
387 * We add the notion of a root-domain which will be used to define per-domain
388 * variables. Each exclusive cpuset essentially defines an island domain by
389 * fully partitioning the member cpus from any other cpuset. Whenever a new
390 * exclusive cpuset is created, we also create and attach a new root-domain
391 * object.
392 *
393 */
394struct root_domain {
395 atomic_t refcount;
396 cpumask_t span;
397 cpumask_t online;
398
399 /*
400 * The "RT overload" flag: it gets set if a CPU has more than
401 * one runnable RT task.
402 */
403 cpumask_t rto_mask;
404 atomic_t rto_count;
405};
406
407/*
408 * By default the system creates a single root-domain with all cpus as
409 * members (mimicking the global state we have today).
410 */
411static struct root_domain def_root_domain;
412
413#endif
414
272/* 415/*
273 * This is the main, per-CPU runqueue data structure. 416 * This is the main, per-CPU runqueue data structure.
274 * 417 *
@@ -297,11 +440,15 @@ struct rq {
297 u64 nr_switches; 440 u64 nr_switches;
298 441
299 struct cfs_rq cfs; 442 struct cfs_rq cfs;
443 struct rt_rq rt;
444 u64 rt_period_expire;
445 int rt_throttled;
446
300#ifdef CONFIG_FAIR_GROUP_SCHED 447#ifdef CONFIG_FAIR_GROUP_SCHED
301 /* list of leaf cfs_rq on this cpu: */ 448 /* list of leaf cfs_rq on this cpu: */
302 struct list_head leaf_cfs_rq_list; 449 struct list_head leaf_cfs_rq_list;
450 struct list_head leaf_rt_rq_list;
303#endif 451#endif
304 struct rt_rq rt;
305 452
306 /* 453 /*
307 * This is part of a global counter where only the total sum 454 * This is part of a global counter where only the total sum
@@ -318,7 +465,7 @@ struct rq {
318 u64 clock, prev_clock_raw; 465 u64 clock, prev_clock_raw;
319 s64 clock_max_delta; 466 s64 clock_max_delta;
320 467
321 unsigned int clock_warps, clock_overflows; 468 unsigned int clock_warps, clock_overflows, clock_underflows;
322 u64 idle_clock; 469 u64 idle_clock;
323 unsigned int clock_deep_idle_events; 470 unsigned int clock_deep_idle_events;
324 u64 tick_timestamp; 471 u64 tick_timestamp;
@@ -326,6 +473,7 @@ struct rq {
326 atomic_t nr_iowait; 473 atomic_t nr_iowait;
327 474
328#ifdef CONFIG_SMP 475#ifdef CONFIG_SMP
476 struct root_domain *rd;
329 struct sched_domain *sd; 477 struct sched_domain *sd;
330 478
331 /* For active balancing */ 479 /* For active balancing */
@@ -338,6 +486,12 @@ struct rq {
338 struct list_head migration_queue; 486 struct list_head migration_queue;
339#endif 487#endif
340 488
489#ifdef CONFIG_SCHED_HRTICK
490 unsigned long hrtick_flags;
491 ktime_t hrtick_expire;
492 struct hrtimer hrtick_timer;
493#endif
494
341#ifdef CONFIG_SCHEDSTATS 495#ifdef CONFIG_SCHEDSTATS
342 /* latency stats */ 496 /* latency stats */
343 struct sched_info rq_sched_info; 497 struct sched_info rq_sched_info;
@@ -364,7 +518,6 @@ struct rq {
364}; 518};
365 519
366static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); 520static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
367static DEFINE_MUTEX(sched_hotcpu_mutex);
368 521
369static inline void check_preempt_curr(struct rq *rq, struct task_struct *p) 522static inline void check_preempt_curr(struct rq *rq, struct task_struct *p)
370{ 523{
@@ -442,6 +595,23 @@ static void update_rq_clock(struct rq *rq)
442#define task_rq(p) cpu_rq(task_cpu(p)) 595#define task_rq(p) cpu_rq(task_cpu(p))
443#define cpu_curr(cpu) (cpu_rq(cpu)->curr) 596#define cpu_curr(cpu) (cpu_rq(cpu)->curr)
444 597
598unsigned long rt_needs_cpu(int cpu)
599{
600 struct rq *rq = cpu_rq(cpu);
601 u64 delta;
602
603 if (!rq->rt_throttled)
604 return 0;
605
606 if (rq->clock > rq->rt_period_expire)
607 return 1;
608
609 delta = rq->rt_period_expire - rq->clock;
610 do_div(delta, NSEC_PER_SEC / HZ);
611
612 return (unsigned long)delta;
613}
614
445/* 615/*
446 * Tunables that become constants when CONFIG_SCHED_DEBUG is off: 616 * Tunables that become constants when CONFIG_SCHED_DEBUG is off:
447 */ 617 */
@@ -456,24 +626,47 @@ static void update_rq_clock(struct rq *rq)
456 */ 626 */
457enum { 627enum {
458 SCHED_FEAT_NEW_FAIR_SLEEPERS = 1, 628 SCHED_FEAT_NEW_FAIR_SLEEPERS = 1,
459 SCHED_FEAT_START_DEBIT = 2, 629 SCHED_FEAT_WAKEUP_PREEMPT = 2,
460 SCHED_FEAT_TREE_AVG = 4, 630 SCHED_FEAT_START_DEBIT = 4,
461 SCHED_FEAT_APPROX_AVG = 8, 631 SCHED_FEAT_TREE_AVG = 8,
462 SCHED_FEAT_WAKEUP_PREEMPT = 16, 632 SCHED_FEAT_APPROX_AVG = 16,
463 SCHED_FEAT_PREEMPT_RESTRICT = 32, 633 SCHED_FEAT_HRTICK = 32,
634 SCHED_FEAT_DOUBLE_TICK = 64,
464}; 635};
465 636
466const_debug unsigned int sysctl_sched_features = 637const_debug unsigned int sysctl_sched_features =
467 SCHED_FEAT_NEW_FAIR_SLEEPERS * 1 | 638 SCHED_FEAT_NEW_FAIR_SLEEPERS * 1 |
639 SCHED_FEAT_WAKEUP_PREEMPT * 1 |
468 SCHED_FEAT_START_DEBIT * 1 | 640 SCHED_FEAT_START_DEBIT * 1 |
469 SCHED_FEAT_TREE_AVG * 0 | 641 SCHED_FEAT_TREE_AVG * 0 |
470 SCHED_FEAT_APPROX_AVG * 0 | 642 SCHED_FEAT_APPROX_AVG * 0 |
471 SCHED_FEAT_WAKEUP_PREEMPT * 1 | 643 SCHED_FEAT_HRTICK * 1 |
472 SCHED_FEAT_PREEMPT_RESTRICT * 1; 644 SCHED_FEAT_DOUBLE_TICK * 0;
473 645
474#define sched_feat(x) (sysctl_sched_features & SCHED_FEAT_##x) 646#define sched_feat(x) (sysctl_sched_features & SCHED_FEAT_##x)
475 647
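The feature flags above are plain power-of-two bits OR-ed into a single word, with the "* 1" / "* 0" multipliers acting as per-feature switches and sched_feat() testing one bit by name. A small standalone C sketch of the same scheme; the values mirror the new enum, but the program itself is illustrative only:

#include <stdio.h>

enum {
        SCHED_FEAT_NEW_FAIR_SLEEPERS = 1,
        SCHED_FEAT_WAKEUP_PREEMPT    = 2,
        SCHED_FEAT_START_DEBIT       = 4,
        SCHED_FEAT_TREE_AVG          = 8,
        SCHED_FEAT_APPROX_AVG        = 16,
        SCHED_FEAT_HRTICK            = 32,
        SCHED_FEAT_DOUBLE_TICK       = 64,
};

static unsigned int sysctl_sched_features =
        SCHED_FEAT_NEW_FAIR_SLEEPERS * 1 |
        SCHED_FEAT_WAKEUP_PREEMPT    * 1 |
        SCHED_FEAT_START_DEBIT       * 1 |
        SCHED_FEAT_TREE_AVG          * 0 |
        SCHED_FEAT_APPROX_AVG        * 0 |
        SCHED_FEAT_HRTICK            * 1 |
        SCHED_FEAT_DOUBLE_TICK       * 0;

#define sched_feat(x) (sysctl_sched_features & SCHED_FEAT_##x)

int main(void)
{
        /* the "* 1" / "* 0" multipliers simply switch each bit on or off */
        printf("HRTICK enabled: %d\n", sched_feat(HRTICK) != 0);
        printf("DOUBLE_TICK enabled: %d\n", sched_feat(DOUBLE_TICK) != 0);
        return 0;
}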
476/* 648/*
649 * Number of tasks to iterate in a single balance run.
650 * Limited because this is done with IRQs disabled.
651 */
652const_debug unsigned int sysctl_sched_nr_migrate = 32;
653
654/*
655 * period over which we measure -rt task cpu usage in ms.
656 * default: 1s
657 */
658const_debug unsigned int sysctl_sched_rt_period = 1000;
659
660#define SCHED_RT_FRAC_SHIFT 16
661#define SCHED_RT_FRAC (1UL << SCHED_RT_FRAC_SHIFT)
662
663/*
664 * ratio of time -rt tasks may consume.
665 * default: 95%
666 */
667const_debug unsigned int sysctl_sched_rt_ratio = 62259;
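sysctl_sched_rt_ratio is a 16.16-style fixed-point fraction: 62259 / 65536 is roughly 0.95, matching the documented 95%. A quick userspace check of that arithmetic follows; the per-period budget line is an assumption for illustration, since the throttling code that consumes this value lives in sched_rt.c and is not part of this hunk:

#include <stdio.h>

#define SCHED_RT_FRAC_SHIFT 16
#define SCHED_RT_FRAC (1UL << SCHED_RT_FRAC_SHIFT)

int main(void)
{
        unsigned long ratio = 62259;            /* sysctl_sched_rt_ratio */
        unsigned long period_ms = 1000;         /* sysctl_sched_rt_period */

        /* 62259 / 65536 ~= 0.95, i.e. the documented 95% */
        printf("ratio = %.4f\n", (double)ratio / SCHED_RT_FRAC);

        /* approximate -rt budget per period, in ms */
        printf("rt budget = %lu ms per %lu ms\n",
               (period_ms * ratio) >> SCHED_RT_FRAC_SHIFT, period_ms);
        return 0;
}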
668
669/*
477 * For kernel-internal use: high-speed (but slightly incorrect) per-cpu 670 * For kernel-internal use: high-speed (but slightly incorrect) per-cpu
478 * clock constructed from sched_clock(): 671 * clock constructed from sched_clock():
479 */ 672 */
@@ -485,7 +678,12 @@ unsigned long long cpu_clock(int cpu)
485 678
486 local_irq_save(flags); 679 local_irq_save(flags);
487 rq = cpu_rq(cpu); 680 rq = cpu_rq(cpu);
488 update_rq_clock(rq); 681 /*
682 * Only call sched_clock() if the scheduler has already been
683 * initialized (some code might call cpu_clock() very early):
684 */
685 if (rq->idle)
686 update_rq_clock(rq);
489 now = rq->clock; 687 now = rq->clock;
490 local_irq_restore(flags); 688 local_irq_restore(flags);
491 689
@@ -500,10 +698,15 @@ EXPORT_SYMBOL_GPL(cpu_clock);
500# define finish_arch_switch(prev) do { } while (0) 698# define finish_arch_switch(prev) do { } while (0)
501#endif 699#endif
502 700
701static inline int task_current(struct rq *rq, struct task_struct *p)
702{
703 return rq->curr == p;
704}
705
503#ifndef __ARCH_WANT_UNLOCKED_CTXSW 706#ifndef __ARCH_WANT_UNLOCKED_CTXSW
504static inline int task_running(struct rq *rq, struct task_struct *p) 707static inline int task_running(struct rq *rq, struct task_struct *p)
505{ 708{
506 return rq->curr == p; 709 return task_current(rq, p);
507} 710}
508 711
509static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) 712static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
@@ -532,7 +735,7 @@ static inline int task_running(struct rq *rq, struct task_struct *p)
532#ifdef CONFIG_SMP 735#ifdef CONFIG_SMP
533 return p->oncpu; 736 return p->oncpu;
534#else 737#else
535 return rq->curr == p; 738 return task_current(rq, p);
536#endif 739#endif
537} 740}
538 741
@@ -588,7 +791,7 @@ static inline struct rq *__task_rq_lock(struct task_struct *p)
588 791
589/* 792/*
590 * task_rq_lock - lock the runqueue a given task resides on and disable 793 * task_rq_lock - lock the runqueue a given task resides on and disable
591 * interrupts. Note the ordering: we can safely lookup the task_rq without 794 * interrupts. Note the ordering: we can safely lookup the task_rq without
592 * explicitly disabling preemption. 795 * explicitly disabling preemption.
593 */ 796 */
594static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) 797static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
@@ -666,9 +869,177 @@ void sched_clock_idle_wakeup_event(u64 delta_ns)
666 rq->prev_clock_raw = now; 869 rq->prev_clock_raw = now;
667 rq->clock += delta_ns; 870 rq->clock += delta_ns;
668 spin_unlock(&rq->lock); 871 spin_unlock(&rq->lock);
872 touch_softlockup_watchdog();
669} 873}
670EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); 874EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
671 875
876static void __resched_task(struct task_struct *p, int tif_bit);
877
878static inline void resched_task(struct task_struct *p)
879{
880 __resched_task(p, TIF_NEED_RESCHED);
881}
882
883#ifdef CONFIG_SCHED_HRTICK
884/*
885 * Use HR-timers to deliver accurate preemption points.
886 *
887 * It's all a bit involved since we cannot program an hrtimer while holding the
888 * rq->lock. So what we do is store a state in rq->hrtick_* and ask for a
889 * reschedule event.
890 *
891 * When we get rescheduled we reprogram the hrtick_timer outside of the
892 * rq->lock.
893 */
894static inline void resched_hrt(struct task_struct *p)
895{
896 __resched_task(p, TIF_HRTICK_RESCHED);
897}
898
899static inline void resched_rq(struct rq *rq)
900{
901 unsigned long flags;
902
903 spin_lock_irqsave(&rq->lock, flags);
904 resched_task(rq->curr);
905 spin_unlock_irqrestore(&rq->lock, flags);
906}
907
908enum {
909	HRTICK_SET,		/* re-program hrtick_timer */
910 HRTICK_RESET, /* not a new slice */
911};
912
913/*
914 * Use hrtick when:
915 * - enabled by features
916 * - hrtimer is actually high res
917 */
918static inline int hrtick_enabled(struct rq *rq)
919{
920 if (!sched_feat(HRTICK))
921 return 0;
922 return hrtimer_is_hres_active(&rq->hrtick_timer);
923}
924
925/*
926 * Called to set the hrtick timer state.
927 *
928 * called with rq->lock held and irqs disabled
929 */
930static void hrtick_start(struct rq *rq, u64 delay, int reset)
931{
932 assert_spin_locked(&rq->lock);
933
934 /*
935 * preempt at: now + delay
936 */
937 rq->hrtick_expire =
938 ktime_add_ns(rq->hrtick_timer.base->get_time(), delay);
939 /*
940 * indicate we need to program the timer
941 */
942 __set_bit(HRTICK_SET, &rq->hrtick_flags);
943 if (reset)
944 __set_bit(HRTICK_RESET, &rq->hrtick_flags);
945
946 /*
947 * New slices are called from the schedule path and don't need a
948 * forced reschedule.
949 */
950 if (reset)
951 resched_hrt(rq->curr);
952}
953
954static void hrtick_clear(struct rq *rq)
955{
956 if (hrtimer_active(&rq->hrtick_timer))
957 hrtimer_cancel(&rq->hrtick_timer);
958}
959
960/*
961 * Update the timer from the possible pending state.
962 */
963static void hrtick_set(struct rq *rq)
964{
965 ktime_t time;
966 int set, reset;
967 unsigned long flags;
968
969 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
970
971 spin_lock_irqsave(&rq->lock, flags);
972 set = __test_and_clear_bit(HRTICK_SET, &rq->hrtick_flags);
973 reset = __test_and_clear_bit(HRTICK_RESET, &rq->hrtick_flags);
974 time = rq->hrtick_expire;
975 clear_thread_flag(TIF_HRTICK_RESCHED);
976 spin_unlock_irqrestore(&rq->lock, flags);
977
978 if (set) {
979 hrtimer_start(&rq->hrtick_timer, time, HRTIMER_MODE_ABS);
980 if (reset && !hrtimer_active(&rq->hrtick_timer))
981 resched_rq(rq);
982 } else
983 hrtick_clear(rq);
984}
985
986/*
987 * High-resolution timer tick.
988 * Runs from hardirq context with interrupts disabled.
989 */
990static enum hrtimer_restart hrtick(struct hrtimer *timer)
991{
992 struct rq *rq = container_of(timer, struct rq, hrtick_timer);
993
994 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
995
996 spin_lock(&rq->lock);
997 __update_rq_clock(rq);
998 rq->curr->sched_class->task_tick(rq, rq->curr, 1);
999 spin_unlock(&rq->lock);
1000
1001 return HRTIMER_NORESTART;
1002}
1003
1004static inline void init_rq_hrtick(struct rq *rq)
1005{
1006 rq->hrtick_flags = 0;
1007 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
1008 rq->hrtick_timer.function = hrtick;
1009 rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
1010}
1011
1012void hrtick_resched(void)
1013{
1014 struct rq *rq;
1015 unsigned long flags;
1016
1017 if (!test_thread_flag(TIF_HRTICK_RESCHED))
1018 return;
1019
1020 local_irq_save(flags);
1021 rq = cpu_rq(smp_processor_id());
1022 hrtick_set(rq);
1023 local_irq_restore(flags);
1024}
1025#else
1026static inline void hrtick_clear(struct rq *rq)
1027{
1028}
1029
1030static inline void hrtick_set(struct rq *rq)
1031{
1032}
1033
1034static inline void init_rq_hrtick(struct rq *rq)
1035{
1036}
1037
1038void hrtick_resched(void)
1039{
1040}
1041#endif
1042
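Because the hrtimer cannot be programmed while rq->lock is held, hrtick_start() above only records the desired expiry and sets HRTICK_SET; hrtick_set() later does the real programming once the lock has been dropped. A toy userspace model of that two-phase pattern; the names and the printf standing in for hrtimer_start() are illustrative only:

#include <stdio.h>
#include <stdint.h>

enum { HRTICK_SET, HRTICK_RESET };

struct toy_rq {
        unsigned long   flags;          /* mirrors rq->hrtick_flags */
        uint64_t        expire_ns;      /* mirrors rq->hrtick_expire */
};

/* runs "with rq->lock held": only record what should happen */
static void toy_hrtick_start(struct toy_rq *rq, uint64_t now, uint64_t delay)
{
        rq->expire_ns = now + delay;
        rq->flags |= 1UL << HRTICK_SET;
}

/* runs after the lock is dropped: actually program the timer */
static void toy_hrtick_set(struct toy_rq *rq)
{
        if (rq->flags & (1UL << HRTICK_SET)) {
                rq->flags &= ~(1UL << HRTICK_SET);
                printf("arm hrtimer for t=%llu ns\n",
                       (unsigned long long)rq->expire_ns);
        } else {
                printf("nothing pending, cancel timer\n");
        }
}

int main(void)
{
        struct toy_rq rq = { 0, 0 };

        toy_hrtick_start(&rq, 1000000, 250000); /* preempt at now + 250us */
        toy_hrtick_set(&rq);
        return 0;
}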
672/* 1043/*
673 * resched_task - mark a task 'to be rescheduled now'. 1044 * resched_task - mark a task 'to be rescheduled now'.
674 * 1045 *
@@ -682,16 +1053,16 @@ EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
682#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) 1053#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
683#endif 1054#endif
684 1055
685static void resched_task(struct task_struct *p) 1056static void __resched_task(struct task_struct *p, int tif_bit)
686{ 1057{
687 int cpu; 1058 int cpu;
688 1059
689 assert_spin_locked(&task_rq(p)->lock); 1060 assert_spin_locked(&task_rq(p)->lock);
690 1061
691 if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED))) 1062 if (unlikely(test_tsk_thread_flag(p, tif_bit)))
692 return; 1063 return;
693 1064
694 set_tsk_thread_flag(p, TIF_NEED_RESCHED); 1065 set_tsk_thread_flag(p, tif_bit);
695 1066
696 cpu = task_cpu(p); 1067 cpu = task_cpu(p);
697 if (cpu == smp_processor_id()) 1068 if (cpu == smp_processor_id())
@@ -714,10 +1085,10 @@ static void resched_cpu(int cpu)
714 spin_unlock_irqrestore(&rq->lock, flags); 1085 spin_unlock_irqrestore(&rq->lock, flags);
715} 1086}
716#else 1087#else
717static inline void resched_task(struct task_struct *p) 1088static void __resched_task(struct task_struct *p, int tif_bit)
718{ 1089{
719 assert_spin_locked(&task_rq(p)->lock); 1090 assert_spin_locked(&task_rq(p)->lock);
720 set_tsk_need_resched(p); 1091 set_tsk_thread_flag(p, tif_bit);
721} 1092}
722#endif 1093#endif
723 1094
@@ -776,7 +1147,7 @@ static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
776 * To aid in avoiding the subversion of "niceness" due to uneven distribution 1147 * To aid in avoiding the subversion of "niceness" due to uneven distribution
777 * of tasks with abnormal "nice" values across CPUs the contribution that 1148 * of tasks with abnormal "nice" values across CPUs the contribution that
778 * each task makes to its run queue's load is weighted according to its 1149 * each task makes to its run queue's load is weighted according to its
779 * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a 1150 * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a
780 * scaled version of the new time slice allocation that they receive on time 1151 * scaled version of the new time slice allocation that they receive on time
781 * slice expiry etc. 1152 * slice expiry etc.
782 */ 1153 */
@@ -851,6 +1222,29 @@ iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
851 struct rq_iterator *iterator); 1222 struct rq_iterator *iterator);
852#endif 1223#endif
853 1224
1225#ifdef CONFIG_CGROUP_CPUACCT
1226static void cpuacct_charge(struct task_struct *tsk, u64 cputime);
1227#else
1228static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
1229#endif
1230
1231static inline void inc_cpu_load(struct rq *rq, unsigned long load)
1232{
1233 update_load_add(&rq->load, load);
1234}
1235
1236static inline void dec_cpu_load(struct rq *rq, unsigned long load)
1237{
1238 update_load_sub(&rq->load, load);
1239}
1240
1241#ifdef CONFIG_SMP
1242static unsigned long source_load(int cpu, int type);
1243static unsigned long target_load(int cpu, int type);
1244static unsigned long cpu_avg_load_per_task(int cpu);
1245static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
1246#endif /* CONFIG_SMP */
1247
854#include "sched_stats.h" 1248#include "sched_stats.h"
855#include "sched_idletask.c" 1249#include "sched_idletask.c"
856#include "sched_fair.c" 1250#include "sched_fair.c"
@@ -861,41 +1255,14 @@ iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
861 1255
862#define sched_class_highest (&rt_sched_class) 1256#define sched_class_highest (&rt_sched_class)
863 1257
864/* 1258static void inc_nr_running(struct rq *rq)
865 * Update delta_exec, delta_fair fields for rq.
866 *
867 * delta_fair clock advances at a rate inversely proportional to
868 * total load (rq->load.weight) on the runqueue, while
869 * delta_exec advances at the same rate as wall-clock (provided
870 * cpu is not idle).
871 *
872 * delta_exec / delta_fair is a measure of the (smoothened) load on this
873 * runqueue over any given interval. This (smoothened) load is used
874 * during load balance.
875 *
876 * This function is called /before/ updating rq->load
877 * and when switching tasks.
878 */
879static inline void inc_load(struct rq *rq, const struct task_struct *p)
880{
881 update_load_add(&rq->load, p->se.load.weight);
882}
883
884static inline void dec_load(struct rq *rq, const struct task_struct *p)
885{
886 update_load_sub(&rq->load, p->se.load.weight);
887}
888
889static void inc_nr_running(struct task_struct *p, struct rq *rq)
890{ 1259{
891 rq->nr_running++; 1260 rq->nr_running++;
892 inc_load(rq, p);
893} 1261}
894 1262
895static void dec_nr_running(struct task_struct *p, struct rq *rq) 1263static void dec_nr_running(struct rq *rq)
896{ 1264{
897 rq->nr_running--; 1265 rq->nr_running--;
898 dec_load(rq, p);
899} 1266}
900 1267
901static void set_load_weight(struct task_struct *p) 1268static void set_load_weight(struct task_struct *p)
@@ -983,11 +1350,11 @@ static int effective_prio(struct task_struct *p)
983 */ 1350 */
984static void activate_task(struct rq *rq, struct task_struct *p, int wakeup) 1351static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
985{ 1352{
986 if (p->state == TASK_UNINTERRUPTIBLE) 1353 if (task_contributes_to_load(p))
987 rq->nr_uninterruptible--; 1354 rq->nr_uninterruptible--;
988 1355
989 enqueue_task(rq, p, wakeup); 1356 enqueue_task(rq, p, wakeup);
990 inc_nr_running(p, rq); 1357 inc_nr_running(rq);
991} 1358}
992 1359
993/* 1360/*
@@ -995,11 +1362,11 @@ static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
995 */ 1362 */
996static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep) 1363static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
997{ 1364{
998 if (p->state == TASK_UNINTERRUPTIBLE) 1365 if (task_contributes_to_load(p))
999 rq->nr_uninterruptible++; 1366 rq->nr_uninterruptible++;
1000 1367
1001 dequeue_task(rq, p, sleep); 1368 dequeue_task(rq, p, sleep);
1002 dec_nr_running(p, rq); 1369 dec_nr_running(rq);
1003} 1370}
1004 1371
1005/** 1372/**
@@ -1019,10 +1386,28 @@ unsigned long weighted_cpuload(const int cpu)
1019 1386
1020static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) 1387static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1021{ 1388{
1389 set_task_rq(p, cpu);
1022#ifdef CONFIG_SMP 1390#ifdef CONFIG_SMP
1391 /*
1392 * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be
1393	 * successfully executed on another CPU. We must ensure that updates of
1394 * per-task data have been completed by this moment.
1395 */
1396 smp_wmb();
1023 task_thread_info(p)->cpu = cpu; 1397 task_thread_info(p)->cpu = cpu;
1024#endif 1398#endif
1025 set_task_cfs_rq(p); 1399}
1400
1401static inline void check_class_changed(struct rq *rq, struct task_struct *p,
1402 const struct sched_class *prev_class,
1403 int oldprio, int running)
1404{
1405 if (prev_class != p->sched_class) {
1406 if (prev_class->switched_from)
1407 prev_class->switched_from(rq, p, running);
1408 p->sched_class->switched_to(rq, p, running);
1409 } else
1410 p->sched_class->prio_changed(rq, p, oldprio, running);
1026} 1411}
1027 1412
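check_class_changed() above replaces the old open-coded "did the priority or the class change" logic with a small dispatch over per-class hooks: switched_from() is optional, switched_to() is called on a class change, and prio_changed() handles the same-class case. A self-contained sketch of that dispatch with two fake classes; none of these names are kernel API:

#include <stdio.h>
#include <stddef.h>

struct toy_class {
        const char *name;
        void (*switched_from)(const char *task);
        void (*switched_to)(const char *task);
        void (*prio_changed)(const char *task, int oldprio);
};

static void fair_switched_to(const char *t)        { printf("%s joined fair class\n", t); }
static void fair_prio_changed(const char *t, int o) { printf("%s reprioritized from %d\n", t, o); }
static void rt_switched_from(const char *t)         { printf("%s left rt class\n", t); }

static const struct toy_class rt_class   = { "rt",   rt_switched_from, NULL, NULL };
static const struct toy_class fair_class = { "fair", NULL, fair_switched_to, fair_prio_changed };

static void check_class_changed(const char *task, const struct toy_class *prev,
                                const struct toy_class *cur, int oldprio)
{
        if (prev != cur) {
                if (prev->switched_from)        /* optional hook */
                        prev->switched_from(task);
                cur->switched_to(task);         /* always called on a switch */
        } else {
                cur->prio_changed(task, oldprio);
        }
}

int main(void)
{
        check_class_changed("T1", &rt_class, &fair_class, 0);     /* class change */
        check_class_changed("T2", &fair_class, &fair_class, 10);  /* prio change only */
        return 0;
}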
1028#ifdef CONFIG_SMP 1413#ifdef CONFIG_SMP
@@ -1030,7 +1415,7 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1030/* 1415/*
1031 * Is this task likely cache-hot: 1416 * Is this task likely cache-hot:
1032 */ 1417 */
1033static inline int 1418static int
1034task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) 1419task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
1035{ 1420{
1036 s64 delta; 1421 s64 delta;
@@ -1255,7 +1640,7 @@ static unsigned long target_load(int cpu, int type)
1255/* 1640/*
1256 * Return the average load per task on the cpu's run queue 1641 * Return the average load per task on the cpu's run queue
1257 */ 1642 */
1258static inline unsigned long cpu_avg_load_per_task(int cpu) 1643static unsigned long cpu_avg_load_per_task(int cpu)
1259{ 1644{
1260 struct rq *rq = cpu_rq(cpu); 1645 struct rq *rq = cpu_rq(cpu);
1261 unsigned long total = weighted_cpuload(cpu); 1646 unsigned long total = weighted_cpuload(cpu);
@@ -1412,58 +1797,6 @@ static int sched_balance_self(int cpu, int flag)
1412 1797
1413#endif /* CONFIG_SMP */ 1798#endif /* CONFIG_SMP */
1414 1799
1415/*
1416 * wake_idle() will wake a task on an idle cpu if task->cpu is
1417 * not idle and an idle cpu is available. The span of cpus to
1418 * search starts with cpus closest then further out as needed,
1419 * so we always favor a closer, idle cpu.
1420 *
1421 * Returns the CPU we should wake onto.
1422 */
1423#if defined(ARCH_HAS_SCHED_WAKE_IDLE)
1424static int wake_idle(int cpu, struct task_struct *p)
1425{
1426 cpumask_t tmp;
1427 struct sched_domain *sd;
1428 int i;
1429
1430 /*
1431 * If it is idle, then it is the best cpu to run this task.
1432 *
1433 * This cpu is also the best, if it has more than one task already.
1434 * Siblings must be also busy(in most cases) as they didn't already
1435 * pickup the extra load from this cpu and hence we need not check
1436 * sibling runqueue info. This will avoid the checks and cache miss
1437 * penalities associated with that.
1438 */
1439 if (idle_cpu(cpu) || cpu_rq(cpu)->nr_running > 1)
1440 return cpu;
1441
1442 for_each_domain(cpu, sd) {
1443 if (sd->flags & SD_WAKE_IDLE) {
1444 cpus_and(tmp, sd->span, p->cpus_allowed);
1445 for_each_cpu_mask(i, tmp) {
1446 if (idle_cpu(i)) {
1447 if (i != task_cpu(p)) {
1448 schedstat_inc(p,
1449 se.nr_wakeups_idle);
1450 }
1451 return i;
1452 }
1453 }
1454 } else {
1455 break;
1456 }
1457 }
1458 return cpu;
1459}
1460#else
1461static inline int wake_idle(int cpu, struct task_struct *p)
1462{
1463 return cpu;
1464}
1465#endif
1466
1467/*** 1800/***
1468 * try_to_wake_up - wake up a thread 1801 * try_to_wake_up - wake up a thread
1469 * @p: the to-be-woken-up thread 1802 * @p: the to-be-woken-up thread
@@ -1484,11 +1817,6 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
1484 unsigned long flags; 1817 unsigned long flags;
1485 long old_state; 1818 long old_state;
1486 struct rq *rq; 1819 struct rq *rq;
1487#ifdef CONFIG_SMP
1488 struct sched_domain *sd, *this_sd = NULL;
1489 unsigned long load, this_load;
1490 int new_cpu;
1491#endif
1492 1820
1493 rq = task_rq_lock(p, &flags); 1821 rq = task_rq_lock(p, &flags);
1494 old_state = p->state; 1822 old_state = p->state;
@@ -1506,92 +1834,9 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
1506 if (unlikely(task_running(rq, p))) 1834 if (unlikely(task_running(rq, p)))
1507 goto out_activate; 1835 goto out_activate;
1508 1836
1509 new_cpu = cpu; 1837 cpu = p->sched_class->select_task_rq(p, sync);
1510 1838 if (cpu != orig_cpu) {
1511 schedstat_inc(rq, ttwu_count); 1839 set_task_cpu(p, cpu);
1512 if (cpu == this_cpu) {
1513 schedstat_inc(rq, ttwu_local);
1514 goto out_set_cpu;
1515 }
1516
1517 for_each_domain(this_cpu, sd) {
1518 if (cpu_isset(cpu, sd->span)) {
1519 schedstat_inc(sd, ttwu_wake_remote);
1520 this_sd = sd;
1521 break;
1522 }
1523 }
1524
1525 if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
1526 goto out_set_cpu;
1527
1528 /*
1529 * Check for affine wakeup and passive balancing possibilities.
1530 */
1531 if (this_sd) {
1532 int idx = this_sd->wake_idx;
1533 unsigned int imbalance;
1534
1535 imbalance = 100 + (this_sd->imbalance_pct - 100) / 2;
1536
1537 load = source_load(cpu, idx);
1538 this_load = target_load(this_cpu, idx);
1539
1540 new_cpu = this_cpu; /* Wake to this CPU if we can */
1541
1542 if (this_sd->flags & SD_WAKE_AFFINE) {
1543 unsigned long tl = this_load;
1544 unsigned long tl_per_task;
1545
1546 /*
1547 * Attract cache-cold tasks on sync wakeups:
1548 */
1549 if (sync && !task_hot(p, rq->clock, this_sd))
1550 goto out_set_cpu;
1551
1552 schedstat_inc(p, se.nr_wakeups_affine_attempts);
1553 tl_per_task = cpu_avg_load_per_task(this_cpu);
1554
1555 /*
1556 * If sync wakeup then subtract the (maximum possible)
1557 * effect of the currently running task from the load
1558 * of the current CPU:
1559 */
1560 if (sync)
1561 tl -= current->se.load.weight;
1562
1563 if ((tl <= load &&
1564 tl + target_load(cpu, idx) <= tl_per_task) ||
1565 100*(tl + p->se.load.weight) <= imbalance*load) {
1566 /*
1567 * This domain has SD_WAKE_AFFINE and
1568 * p is cache cold in this domain, and
1569 * there is no bad imbalance.
1570 */
1571 schedstat_inc(this_sd, ttwu_move_affine);
1572 schedstat_inc(p, se.nr_wakeups_affine);
1573 goto out_set_cpu;
1574 }
1575 }
1576
1577 /*
1578 * Start passive balancing when half the imbalance_pct
1579 * limit is reached.
1580 */
1581 if (this_sd->flags & SD_WAKE_BALANCE) {
1582 if (imbalance*this_load <= 100*load) {
1583 schedstat_inc(this_sd, ttwu_move_balance);
1584 schedstat_inc(p, se.nr_wakeups_passive);
1585 goto out_set_cpu;
1586 }
1587 }
1588 }
1589
1590 new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */
1591out_set_cpu:
1592 new_cpu = wake_idle(new_cpu, p);
1593 if (new_cpu != cpu) {
1594 set_task_cpu(p, new_cpu);
1595 task_rq_unlock(rq, &flags); 1840 task_rq_unlock(rq, &flags);
1596 /* might preempt at this point */ 1841 /* might preempt at this point */
1597 rq = task_rq_lock(p, &flags); 1842 rq = task_rq_lock(p, &flags);
@@ -1605,6 +1850,21 @@ out_set_cpu:
1605 cpu = task_cpu(p); 1850 cpu = task_cpu(p);
1606 } 1851 }
1607 1852
1853#ifdef CONFIG_SCHEDSTATS
1854 schedstat_inc(rq, ttwu_count);
1855 if (cpu == this_cpu)
1856 schedstat_inc(rq, ttwu_local);
1857 else {
1858 struct sched_domain *sd;
1859 for_each_domain(this_cpu, sd) {
1860 if (cpu_isset(cpu, sd->span)) {
1861 schedstat_inc(sd, ttwu_wake_remote);
1862 break;
1863 }
1864 }
1865 }
1866#endif
1867
1608out_activate: 1868out_activate:
1609#endif /* CONFIG_SMP */ 1869#endif /* CONFIG_SMP */
1610 schedstat_inc(p, se.nr_wakeups); 1870 schedstat_inc(p, se.nr_wakeups);
@@ -1623,6 +1883,10 @@ out_activate:
1623 1883
1624out_running: 1884out_running:
1625 p->state = TASK_RUNNING; 1885 p->state = TASK_RUNNING;
1886#ifdef CONFIG_SMP
1887 if (p->sched_class->task_wake_up)
1888 p->sched_class->task_wake_up(rq, p);
1889#endif
1626out: 1890out:
1627 task_rq_unlock(rq, &flags); 1891 task_rq_unlock(rq, &flags);
1628 1892
@@ -1631,8 +1895,7 @@ out:
1631 1895
1632int fastcall wake_up_process(struct task_struct *p) 1896int fastcall wake_up_process(struct task_struct *p)
1633{ 1897{
1634 return try_to_wake_up(p, TASK_STOPPED | TASK_TRACED | 1898 return try_to_wake_up(p, TASK_ALL, 0);
1635 TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0);
1636} 1899}
1637EXPORT_SYMBOL(wake_up_process); 1900EXPORT_SYMBOL(wake_up_process);
1638 1901
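wake_up_process() and, further below, complete()/complete_all() now pass a single composite mask instead of OR-ing task states at every call site. A sketch of how such masks compose and how a try_to_wake_up()-style state test uses them; the bit values here are assumptions modelled on the era's include/linux/sched.h, not copied from it:

#include <stdio.h>

#define TASK_RUNNING            0
#define TASK_INTERRUPTIBLE      1
#define TASK_UNINTERRUPTIBLE    2
#define __TASK_STOPPED          4
#define __TASK_TRACED           8

#define TASK_NORMAL             (TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE)
#define TASK_ALL                (TASK_NORMAL | __TASK_STOPPED | __TASK_TRACED)

static int would_wake(long task_state, long wake_mask)
{
        /* mirrors the "does the wake mask cover the task's state" test
         * performed at the top of try_to_wake_up() */
        return (task_state & wake_mask) != 0;
}

int main(void)
{
        printf("stopped task, TASK_NORMAL: %d\n",
               would_wake(__TASK_STOPPED, TASK_NORMAL));  /* 0: not woken */
        printf("stopped task, TASK_ALL:    %d\n",
               would_wake(__TASK_STOPPED, TASK_ALL));     /* 1: woken */
        return 0;
}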
@@ -1665,7 +1928,7 @@ static void __sched_fork(struct task_struct *p)
1665 p->se.wait_max = 0; 1928 p->se.wait_max = 0;
1666#endif 1929#endif
1667 1930
1668 INIT_LIST_HEAD(&p->run_list); 1931 INIT_LIST_HEAD(&p->rt.run_list);
1669 p->se.on_rq = 0; 1932 p->se.on_rq = 0;
1670 1933
1671#ifdef CONFIG_PREEMPT_NOTIFIERS 1934#ifdef CONFIG_PREEMPT_NOTIFIERS
@@ -1742,9 +2005,13 @@ void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
1742 * management (if any): 2005 * management (if any):
1743 */ 2006 */
1744 p->sched_class->task_new(rq, p); 2007 p->sched_class->task_new(rq, p);
1745 inc_nr_running(p, rq); 2008 inc_nr_running(rq);
1746 } 2009 }
1747 check_preempt_curr(rq, p); 2010 check_preempt_curr(rq, p);
2011#ifdef CONFIG_SMP
2012 if (p->sched_class->task_wake_up)
2013 p->sched_class->task_wake_up(rq, p);
2014#endif
1748 task_rq_unlock(rq, &flags); 2015 task_rq_unlock(rq, &flags);
1749} 2016}
1750 2017
@@ -1839,7 +2106,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev,
1839 * and do any other architecture-specific cleanup actions. 2106 * and do any other architecture-specific cleanup actions.
1840 * 2107 *
1841 * Note that we may have delayed dropping an mm in context_switch(). If 2108 * Note that we may have delayed dropping an mm in context_switch(). If
1842 * so, we finish that here outside of the runqueue lock. (Doing it 2109 * so, we finish that here outside of the runqueue lock. (Doing it
1843 * with the lock held can cause deadlocks; see schedule() for 2110 * with the lock held can cause deadlocks; see schedule() for
1844 * details.) 2111 * details.)
1845 */ 2112 */
@@ -1865,6 +2132,11 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
1865 prev_state = prev->state; 2132 prev_state = prev->state;
1866 finish_arch_switch(prev); 2133 finish_arch_switch(prev);
1867 finish_lock_switch(rq, prev); 2134 finish_lock_switch(rq, prev);
2135#ifdef CONFIG_SMP
2136 if (current->sched_class->post_schedule)
2137 current->sched_class->post_schedule(rq);
2138#endif
2139
1868 fire_sched_in_preempt_notifiers(current); 2140 fire_sched_in_preempt_notifiers(current);
1869 if (mm) 2141 if (mm)
1870 mmdrop(mm); 2142 mmdrop(mm);
@@ -2098,11 +2370,13 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
2098/* 2370/*
2099 * double_lock_balance - lock the busiest runqueue, this_rq is locked already. 2371 * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
2100 */ 2372 */
2101static void double_lock_balance(struct rq *this_rq, struct rq *busiest) 2373static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
2102 __releases(this_rq->lock) 2374 __releases(this_rq->lock)
2103 __acquires(busiest->lock) 2375 __acquires(busiest->lock)
2104 __acquires(this_rq->lock) 2376 __acquires(this_rq->lock)
2105{ 2377{
2378 int ret = 0;
2379
2106 if (unlikely(!irqs_disabled())) { 2380 if (unlikely(!irqs_disabled())) {
2107 		/* printk() doesn't work well under rq->lock */ 2381 		/* printk() doesn't work well under rq->lock */
2108 spin_unlock(&this_rq->lock); 2382 spin_unlock(&this_rq->lock);
@@ -2113,15 +2387,17 @@ static void double_lock_balance(struct rq *this_rq, struct rq *busiest)
2113 spin_unlock(&this_rq->lock); 2387 spin_unlock(&this_rq->lock);
2114 spin_lock(&busiest->lock); 2388 spin_lock(&busiest->lock);
2115 spin_lock(&this_rq->lock); 2389 spin_lock(&this_rq->lock);
2390 ret = 1;
2116 } else 2391 } else
2117 spin_lock(&busiest->lock); 2392 spin_lock(&busiest->lock);
2118 } 2393 }
2394 return ret;
2119} 2395}
2120 2396
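double_lock_balance() now reports, via its return value, whether this_rq->lock had to be dropped and retaken so that callers can revalidate their state afterwards; the test that decides which lock to take first is only partly visible in this hunk. A toy model of the pattern with simulated locks, using an id comparison to stand in for the ordering test the kernel applies to the two runqueues:

#include <stdio.h>

struct toy_rq { int id; int locked; };

static void lock(struct toy_rq *rq)    { rq->locked = 1; printf("lock rq%d\n", rq->id); }
static void unlock(struct toy_rq *rq)  { rq->locked = 0; printf("unlock rq%d\n", rq->id); }
static int  trylock(struct toy_rq *rq) { (void)rq; return 0; /* pretend it is contended */ }

static int toy_double_lock_balance(struct toy_rq *this_rq, struct toy_rq *busiest)
{
        int ret = 0;

        if (!trylock(busiest)) {
                if (busiest->id < this_rq->id) {
                        /* taking busiest while holding this_rq would invert
                         * the global ordering: drop ours, take both in order */
                        unlock(this_rq);
                        lock(busiest);
                        lock(this_rq);
                        ret = 1;        /* caller must revalidate */
                } else {
                        lock(busiest);
                }
        }
        return ret;
}

int main(void)
{
        struct toy_rq a = { 0, 0 }, b = { 1, 0 };

        lock(&b);
        printf("dropped and retaken: %d\n", toy_double_lock_balance(&b, &a));
        return 0;
}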
2121/* 2397/*
2122 * If dest_cpu is allowed for this process, migrate the task to it. 2398 * If dest_cpu is allowed for this process, migrate the task to it.
2123 * This is accomplished by forcing the cpu_allowed mask to only 2399 * This is accomplished by forcing the cpu_allowed mask to only
2124 * allow dest_cpu, which will force the cpu onto dest_cpu. Then 2400 * allow dest_cpu, which will force the cpu onto dest_cpu. Then
2125 * the cpu_allowed mask is restored. 2401 * the cpu_allowed mask is restored.
2126 */ 2402 */
2127static void sched_migrate_task(struct task_struct *p, int dest_cpu) 2403static void sched_migrate_task(struct task_struct *p, int dest_cpu)
@@ -2237,7 +2513,7 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2237 enum cpu_idle_type idle, int *all_pinned, 2513 enum cpu_idle_type idle, int *all_pinned,
2238 int *this_best_prio, struct rq_iterator *iterator) 2514 int *this_best_prio, struct rq_iterator *iterator)
2239{ 2515{
2240 int pulled = 0, pinned = 0, skip_for_load; 2516 int loops = 0, pulled = 0, pinned = 0, skip_for_load;
2241 struct task_struct *p; 2517 struct task_struct *p;
2242 long rem_load_move = max_load_move; 2518 long rem_load_move = max_load_move;
2243 2519
@@ -2251,10 +2527,10 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2251 */ 2527 */
2252 p = iterator->start(iterator->arg); 2528 p = iterator->start(iterator->arg);
2253next: 2529next:
2254 if (!p) 2530 if (!p || loops++ > sysctl_sched_nr_migrate)
2255 goto out; 2531 goto out;
2256 /* 2532 /*
2257 * To help distribute high priority tasks accross CPUs we don't 2533 * To help distribute high priority tasks across CPUs we don't
2258 * skip a task if it will be the highest priority task (i.e. smallest 2534 * skip a task if it will be the highest priority task (i.e. smallest
2259 * prio value) on its new queue regardless of its load weight 2535 * prio value) on its new queue regardless of its load weight
2260 */ 2536 */
@@ -2271,8 +2547,7 @@ next:
2271 rem_load_move -= p->se.load.weight; 2547 rem_load_move -= p->se.load.weight;
2272 2548
2273 /* 2549 /*
2274 * We only want to steal up to the prescribed number of tasks 2550 * We only want to steal up to the prescribed amount of weighted load.
2275 * and the prescribed amount of weighted load.
2276 */ 2551 */
2277 if (rem_load_move > 0) { 2552 if (rem_load_move > 0) {
2278 if (p->prio < *this_best_prio) 2553 if (p->prio < *this_best_prio)
@@ -2567,7 +2842,7 @@ group_next:
2567 * tasks around. Thus we look for the minimum possible imbalance. 2842 * tasks around. Thus we look for the minimum possible imbalance.
2568 * Negative imbalances (*we* are more loaded than anyone else) will 2843 * Negative imbalances (*we* are more loaded than anyone else) will
2569 * be counted as no imbalance for these purposes -- we can't fix that 2844 * be counted as no imbalance for these purposes -- we can't fix that
2570 * by pulling tasks to us. Be careful of negative numbers as they'll 2845 * by pulling tasks to us. Be careful of negative numbers as they'll
2571 * appear as very large values with unsigned longs. 2846 * appear as very large values with unsigned longs.
2572 */ 2847 */
2573 if (max_load <= busiest_load_per_task) 2848 if (max_load <= busiest_load_per_task)
@@ -3002,7 +3277,7 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
3002 3277
3003 /* 3278 /*
3004 * This condition is "impossible", if it occurs 3279 * This condition is "impossible", if it occurs
3005 * we need to fix it. Originally reported by 3280 * we need to fix it. Originally reported by
3006 * Bjorn Helgaas on a 128-cpu setup. 3281 * Bjorn Helgaas on a 128-cpu setup.
3007 */ 3282 */
3008 BUG_ON(busiest_rq == target_rq); 3283 BUG_ON(busiest_rq == target_rq);
@@ -3034,7 +3309,7 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
3034#ifdef CONFIG_NO_HZ 3309#ifdef CONFIG_NO_HZ
3035static struct { 3310static struct {
3036 atomic_t load_balancer; 3311 atomic_t load_balancer;
3037 cpumask_t cpu_mask; 3312 cpumask_t cpu_mask;
3038} nohz ____cacheline_aligned = { 3313} nohz ____cacheline_aligned = {
3039 .load_balancer = ATOMIC_INIT(-1), 3314 .load_balancer = ATOMIC_INIT(-1),
3040 .cpu_mask = CPU_MASK_NONE, 3315 .cpu_mask = CPU_MASK_NONE,
@@ -3315,7 +3590,7 @@ unsigned long long task_sched_runtime(struct task_struct *p)
3315 3590
3316 rq = task_rq_lock(p, &flags); 3591 rq = task_rq_lock(p, &flags);
3317 ns = p->se.sum_exec_runtime; 3592 ns = p->se.sum_exec_runtime;
3318 if (rq->curr == p) { 3593 if (task_current(rq, p)) {
3319 update_rq_clock(rq); 3594 update_rq_clock(rq);
3320 delta_exec = rq->clock - p->se.exec_start; 3595 delta_exec = rq->clock - p->se.exec_start;
3321 if ((s64)delta_exec > 0) 3596 if ((s64)delta_exec > 0)
@@ -3335,13 +3610,9 @@ void account_user_time(struct task_struct *p, cputime_t cputime)
3335{ 3610{
3336 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 3611 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
3337 cputime64_t tmp; 3612 cputime64_t tmp;
3338 struct rq *rq = this_rq();
3339 3613
3340 p->utime = cputime_add(p->utime, cputime); 3614 p->utime = cputime_add(p->utime, cputime);
3341 3615
3342 if (p != rq->idle)
3343 cpuacct_charge(p, cputime);
3344
3345 /* Add user time to cpustat. */ 3616 /* Add user time to cpustat. */
3346 tmp = cputime_to_cputime64(cputime); 3617 tmp = cputime_to_cputime64(cputime);
3347 if (TASK_NICE(p) > 0) 3618 if (TASK_NICE(p) > 0)
@@ -3355,7 +3626,7 @@ void account_user_time(struct task_struct *p, cputime_t cputime)
3355 * @p: the process that the cpu time gets accounted to 3626 * @p: the process that the cpu time gets accounted to
3356 * @cputime: the cpu time spent in virtual machine since the last update 3627 * @cputime: the cpu time spent in virtual machine since the last update
3357 */ 3628 */
3358void account_guest_time(struct task_struct *p, cputime_t cputime) 3629static void account_guest_time(struct task_struct *p, cputime_t cputime)
3359{ 3630{
3360 cputime64_t tmp; 3631 cputime64_t tmp;
3361 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 3632 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
@@ -3392,10 +3663,8 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
3392 struct rq *rq = this_rq(); 3663 struct rq *rq = this_rq();
3393 cputime64_t tmp; 3664 cputime64_t tmp;
3394 3665
3395 if (p->flags & PF_VCPU) { 3666 if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0))
3396 account_guest_time(p, cputime); 3667 return account_guest_time(p, cputime);
3397 return;
3398 }
3399 3668
3400 p->stime = cputime_add(p->stime, cputime); 3669 p->stime = cputime_add(p->stime, cputime);
3401 3670
@@ -3405,10 +3674,9 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
3405 cpustat->irq = cputime64_add(cpustat->irq, tmp); 3674 cpustat->irq = cputime64_add(cpustat->irq, tmp);
3406 else if (softirq_count()) 3675 else if (softirq_count())
3407 cpustat->softirq = cputime64_add(cpustat->softirq, tmp); 3676 cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
3408 else if (p != rq->idle) { 3677 else if (p != rq->idle)
3409 cpustat->system = cputime64_add(cpustat->system, tmp); 3678 cpustat->system = cputime64_add(cpustat->system, tmp);
3410 cpuacct_charge(p, cputime); 3679 else if (atomic_read(&rq->nr_iowait) > 0)
3411 } else if (atomic_read(&rq->nr_iowait) > 0)
3412 cpustat->iowait = cputime64_add(cpustat->iowait, tmp); 3680 cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
3413 else 3681 else
3414 cpustat->idle = cputime64_add(cpustat->idle, tmp); 3682 cpustat->idle = cputime64_add(cpustat->idle, tmp);
@@ -3444,10 +3712,8 @@ void account_steal_time(struct task_struct *p, cputime_t steal)
3444 cpustat->iowait = cputime64_add(cpustat->iowait, tmp); 3712 cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
3445 else 3713 else
3446 cpustat->idle = cputime64_add(cpustat->idle, tmp); 3714 cpustat->idle = cputime64_add(cpustat->idle, tmp);
3447 } else { 3715 } else
3448 cpustat->steal = cputime64_add(cpustat->steal, tmp); 3716 cpustat->steal = cputime64_add(cpustat->steal, tmp);
3449 cpuacct_charge(p, -tmp);
3450 }
3451} 3717}
3452 3718
3453/* 3719/*
@@ -3469,12 +3735,14 @@ void scheduler_tick(void)
3469 /* 3735 /*
3470 * Let rq->clock advance by at least TICK_NSEC: 3736 * Let rq->clock advance by at least TICK_NSEC:
3471 */ 3737 */
3472 if (unlikely(rq->clock < next_tick)) 3738 if (unlikely(rq->clock < next_tick)) {
3473 rq->clock = next_tick; 3739 rq->clock = next_tick;
3740 rq->clock_underflows++;
3741 }
3474 rq->tick_timestamp = rq->clock; 3742 rq->tick_timestamp = rq->clock;
3475 update_cpu_load(rq); 3743 update_cpu_load(rq);
3476 if (curr != rq->idle) /* FIXME: needed? */ 3744 curr->sched_class->task_tick(rq, curr, 0);
3477 curr->sched_class->task_tick(rq, curr); 3745 update_sched_rt_period(rq);
3478 spin_unlock(&rq->lock); 3746 spin_unlock(&rq->lock);
3479 3747
3480#ifdef CONFIG_SMP 3748#ifdef CONFIG_SMP
@@ -3547,7 +3815,7 @@ static noinline void __schedule_bug(struct task_struct *prev)
3547static inline void schedule_debug(struct task_struct *prev) 3815static inline void schedule_debug(struct task_struct *prev)
3548{ 3816{
3549 /* 3817 /*
3550 * Test if we are atomic. Since do_exit() needs to call into 3818 * Test if we are atomic. Since do_exit() needs to call into
3551 * schedule() atomically, we ignore that path for now. 3819 * schedule() atomically, we ignore that path for now.
3552 * Otherwise, whine if we are scheduling when we should not be. 3820 * Otherwise, whine if we are scheduling when we should not be.
3553 */ 3821 */
@@ -3620,6 +3888,8 @@ need_resched_nonpreemptible:
3620 3888
3621 schedule_debug(prev); 3889 schedule_debug(prev);
3622 3890
3891 hrtick_clear(rq);
3892
3623 /* 3893 /*
3624 * Do the rq-clock update outside the rq lock: 3894 * Do the rq-clock update outside the rq lock:
3625 */ 3895 */
@@ -3638,6 +3908,11 @@ need_resched_nonpreemptible:
3638 switch_count = &prev->nvcsw; 3908 switch_count = &prev->nvcsw;
3639 } 3909 }
3640 3910
3911#ifdef CONFIG_SMP
3912 if (prev->sched_class->pre_schedule)
3913 prev->sched_class->pre_schedule(rq, prev);
3914#endif
3915
3641 if (unlikely(!rq->nr_running)) 3916 if (unlikely(!rq->nr_running))
3642 idle_balance(cpu, rq); 3917 idle_balance(cpu, rq);
3643 3918
@@ -3652,14 +3927,20 @@ need_resched_nonpreemptible:
3652 ++*switch_count; 3927 ++*switch_count;
3653 3928
3654 context_switch(rq, prev, next); /* unlocks the rq */ 3929 context_switch(rq, prev, next); /* unlocks the rq */
3930 /*
3931 * the context switch might have flipped the stack from under
3932 * us, hence refresh the local variables.
3933 */
3934 cpu = smp_processor_id();
3935 rq = cpu_rq(cpu);
3655 } else 3936 } else
3656 spin_unlock_irq(&rq->lock); 3937 spin_unlock_irq(&rq->lock);
3657 3938
3658 if (unlikely(reacquire_kernel_lock(current) < 0)) { 3939 hrtick_set(rq);
3659 cpu = smp_processor_id(); 3940
3660 rq = cpu_rq(cpu); 3941 if (unlikely(reacquire_kernel_lock(current) < 0))
3661 goto need_resched_nonpreemptible; 3942 goto need_resched_nonpreemptible;
3662 } 3943
3663 preempt_enable_no_resched(); 3944 preempt_enable_no_resched();
3664 if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) 3945 if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
3665 goto need_resched; 3946 goto need_resched;
@@ -3669,19 +3950,18 @@ EXPORT_SYMBOL(schedule);
3669#ifdef CONFIG_PREEMPT 3950#ifdef CONFIG_PREEMPT
3670/* 3951/*
3671 * this is the entry point to schedule() from in-kernel preemption 3952 * this is the entry point to schedule() from in-kernel preemption
3672 * off of preempt_enable. Kernel preemptions off return from interrupt 3953 * off of preempt_enable. Kernel preemptions off return from interrupt
3673 * occur there and call schedule directly. 3954 * occur there and call schedule directly.
3674 */ 3955 */
3675asmlinkage void __sched preempt_schedule(void) 3956asmlinkage void __sched preempt_schedule(void)
3676{ 3957{
3677 struct thread_info *ti = current_thread_info(); 3958 struct thread_info *ti = current_thread_info();
3678#ifdef CONFIG_PREEMPT_BKL
3679 struct task_struct *task = current; 3959 struct task_struct *task = current;
3680 int saved_lock_depth; 3960 int saved_lock_depth;
3681#endif 3961
3682 /* 3962 /*
3683 * If there is a non-zero preempt_count or interrupts are disabled, 3963 * If there is a non-zero preempt_count or interrupts are disabled,
3684 * we do not want to preempt the current task. Just return.. 3964 * we do not want to preempt the current task. Just return..
3685 */ 3965 */
3686 if (likely(ti->preempt_count || irqs_disabled())) 3966 if (likely(ti->preempt_count || irqs_disabled()))
3687 return; 3967 return;
@@ -3694,14 +3974,10 @@ asmlinkage void __sched preempt_schedule(void)
3694 * clear ->lock_depth so that schedule() doesn't 3974 * clear ->lock_depth so that schedule() doesn't
3695 * auto-release the semaphore: 3975 * auto-release the semaphore:
3696 */ 3976 */
3697#ifdef CONFIG_PREEMPT_BKL
3698 saved_lock_depth = task->lock_depth; 3977 saved_lock_depth = task->lock_depth;
3699 task->lock_depth = -1; 3978 task->lock_depth = -1;
3700#endif
3701 schedule(); 3979 schedule();
3702#ifdef CONFIG_PREEMPT_BKL
3703 task->lock_depth = saved_lock_depth; 3980 task->lock_depth = saved_lock_depth;
3704#endif
3705 sub_preempt_count(PREEMPT_ACTIVE); 3981 sub_preempt_count(PREEMPT_ACTIVE);
3706 3982
3707 /* 3983 /*
@@ -3722,10 +3998,9 @@ EXPORT_SYMBOL(preempt_schedule);
3722asmlinkage void __sched preempt_schedule_irq(void) 3998asmlinkage void __sched preempt_schedule_irq(void)
3723{ 3999{
3724 struct thread_info *ti = current_thread_info(); 4000 struct thread_info *ti = current_thread_info();
3725#ifdef CONFIG_PREEMPT_BKL
3726 struct task_struct *task = current; 4001 struct task_struct *task = current;
3727 int saved_lock_depth; 4002 int saved_lock_depth;
3728#endif 4003
3729 /* Catch callers which need to be fixed */ 4004 /* Catch callers which need to be fixed */
3730 BUG_ON(ti->preempt_count || !irqs_disabled()); 4005 BUG_ON(ti->preempt_count || !irqs_disabled());
3731 4006
@@ -3737,16 +4012,12 @@ asmlinkage void __sched preempt_schedule_irq(void)
3737 * clear ->lock_depth so that schedule() doesn't 4012 * clear ->lock_depth so that schedule() doesn't
3738 * auto-release the semaphore: 4013 * auto-release the semaphore:
3739 */ 4014 */
3740#ifdef CONFIG_PREEMPT_BKL
3741 saved_lock_depth = task->lock_depth; 4015 saved_lock_depth = task->lock_depth;
3742 task->lock_depth = -1; 4016 task->lock_depth = -1;
3743#endif
3744 local_irq_enable(); 4017 local_irq_enable();
3745 schedule(); 4018 schedule();
3746 local_irq_disable(); 4019 local_irq_disable();
3747#ifdef CONFIG_PREEMPT_BKL
3748 task->lock_depth = saved_lock_depth; 4020 task->lock_depth = saved_lock_depth;
3749#endif
3750 sub_preempt_count(PREEMPT_ACTIVE); 4021 sub_preempt_count(PREEMPT_ACTIVE);
3751 4022
3752 /* 4023 /*
@@ -3767,12 +4038,12 @@ int default_wake_function(wait_queue_t *curr, unsigned mode, int sync,
3767EXPORT_SYMBOL(default_wake_function); 4038EXPORT_SYMBOL(default_wake_function);
3768 4039
3769/* 4040/*
3770 * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just 4041 * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
3771 * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve 4042 * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve
3772 * number) then we wake all the non-exclusive tasks and one exclusive task. 4043 * number) then we wake all the non-exclusive tasks and one exclusive task.
3773 * 4044 *
3774 * There are circumstances in which we can try to wake a task which has already 4045 * There are circumstances in which we can try to wake a task which has already
3775 * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns 4046 * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
3776 * zero in this (rare) case, and we handle it by continuing to scan the queue. 4047 * zero in this (rare) case, and we handle it by continuing to scan the queue.
3777 */ 4048 */
3778static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, 4049static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
@@ -3852,8 +4123,7 @@ void complete(struct completion *x)
3852 4123
3853 spin_lock_irqsave(&x->wait.lock, flags); 4124 spin_lock_irqsave(&x->wait.lock, flags);
3854 x->done++; 4125 x->done++;
3855 __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, 4126 __wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL);
3856 1, 0, NULL);
3857 spin_unlock_irqrestore(&x->wait.lock, flags); 4127 spin_unlock_irqrestore(&x->wait.lock, flags);
3858} 4128}
3859EXPORT_SYMBOL(complete); 4129EXPORT_SYMBOL(complete);
@@ -3864,8 +4134,7 @@ void complete_all(struct completion *x)
3864 4134
3865 spin_lock_irqsave(&x->wait.lock, flags); 4135 spin_lock_irqsave(&x->wait.lock, flags);
3866 x->done += UINT_MAX/2; 4136 x->done += UINT_MAX/2;
3867 __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, 4137 __wake_up_common(&x->wait, TASK_NORMAL, 0, 0, NULL);
3868 0, 0, NULL);
3869 spin_unlock_irqrestore(&x->wait.lock, flags); 4138 spin_unlock_irqrestore(&x->wait.lock, flags);
3870} 4139}
3871EXPORT_SYMBOL(complete_all); 4140EXPORT_SYMBOL(complete_all);
@@ -3879,8 +4148,10 @@ do_wait_for_common(struct completion *x, long timeout, int state)
3879 wait.flags |= WQ_FLAG_EXCLUSIVE; 4148 wait.flags |= WQ_FLAG_EXCLUSIVE;
3880 __add_wait_queue_tail(&x->wait, &wait); 4149 __add_wait_queue_tail(&x->wait, &wait);
3881 do { 4150 do {
3882 if (state == TASK_INTERRUPTIBLE && 4151 if ((state == TASK_INTERRUPTIBLE &&
3883 signal_pending(current)) { 4152 signal_pending(current)) ||
4153 (state == TASK_KILLABLE &&
4154 fatal_signal_pending(current))) {
3884 __remove_wait_queue(&x->wait, &wait); 4155 __remove_wait_queue(&x->wait, &wait);
3885 return -ERESTARTSYS; 4156 return -ERESTARTSYS;
3886 } 4157 }
@@ -3940,6 +4211,15 @@ wait_for_completion_interruptible_timeout(struct completion *x,
3940} 4211}
3941EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); 4212EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
3942 4213
4214int __sched wait_for_completion_killable(struct completion *x)
4215{
4216 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE);
4217 if (t == -ERESTARTSYS)
4218 return t;
4219 return 0;
4220}
4221EXPORT_SYMBOL(wait_for_completion_killable);
4222
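do_wait_for_common() now distinguishes TASK_INTERRUPTIBLE waits, which abort on any pending signal, from the new TASK_KILLABLE waits, which abort only on a fatal one, with wait_for_completion_killable() as the public wrapper. A userspace model of just that abort condition; the predicates are stubs, not kernel API:

#include <stdio.h>

enum wait_state { WAIT_INTERRUPTIBLE, WAIT_KILLABLE };

static int should_abort(enum wait_state state,
                        int signal_pending, int fatal_signal_pending)
{
        return (state == WAIT_INTERRUPTIBLE && signal_pending) ||
               (state == WAIT_KILLABLE && fatal_signal_pending);
}

int main(void)
{
        /* non-fatal signal: only the interruptible waiter gives up */
        printf("%d %d\n",
               should_abort(WAIT_INTERRUPTIBLE, 1, 0),
               should_abort(WAIT_KILLABLE, 1, 0));
        /* fatal signal (SIGKILL): both give up */
        printf("%d %d\n",
               should_abort(WAIT_INTERRUPTIBLE, 1, 1),
               should_abort(WAIT_KILLABLE, 1, 1));
        return 0;
}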
3943static long __sched 4223static long __sched
3944sleep_on_common(wait_queue_head_t *q, int state, long timeout) 4224sleep_on_common(wait_queue_head_t *q, int state, long timeout)
3945{ 4225{
@@ -4003,6 +4283,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
4003 unsigned long flags; 4283 unsigned long flags;
4004 int oldprio, on_rq, running; 4284 int oldprio, on_rq, running;
4005 struct rq *rq; 4285 struct rq *rq;
4286 const struct sched_class *prev_class = p->sched_class;
4006 4287
4007 BUG_ON(prio < 0 || prio > MAX_PRIO); 4288 BUG_ON(prio < 0 || prio > MAX_PRIO);
4008 4289
@@ -4011,7 +4292,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
4011 4292
4012 oldprio = p->prio; 4293 oldprio = p->prio;
4013 on_rq = p->se.on_rq; 4294 on_rq = p->se.on_rq;
4014 running = task_running(rq, p); 4295 running = task_current(rq, p);
4015 if (on_rq) { 4296 if (on_rq) {
4016 dequeue_task(rq, p, 0); 4297 dequeue_task(rq, p, 0);
4017 if (running) 4298 if (running)
@@ -4028,18 +4309,10 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
4028 if (on_rq) { 4309 if (on_rq) {
4029 if (running) 4310 if (running)
4030 p->sched_class->set_curr_task(rq); 4311 p->sched_class->set_curr_task(rq);
4312
4031 enqueue_task(rq, p, 0); 4313 enqueue_task(rq, p, 0);
4032 /* 4314
4033 * Reschedule if we are currently running on this runqueue and 4315 check_class_changed(rq, p, prev_class, oldprio, running);
4034 * our priority decreased, or if we are not currently running on
4035 * this runqueue and our priority is higher than the current's
4036 */
4037 if (running) {
4038 if (p->prio > oldprio)
4039 resched_task(rq->curr);
4040 } else {
4041 check_preempt_curr(rq, p);
4042 }
4043 } 4316 }
4044 task_rq_unlock(rq, &flags); 4317 task_rq_unlock(rq, &flags);
4045} 4318}
@@ -4071,10 +4344,8 @@ void set_user_nice(struct task_struct *p, long nice)
4071 goto out_unlock; 4344 goto out_unlock;
4072 } 4345 }
4073 on_rq = p->se.on_rq; 4346 on_rq = p->se.on_rq;
4074 if (on_rq) { 4347 if (on_rq)
4075 dequeue_task(rq, p, 0); 4348 dequeue_task(rq, p, 0);
4076 dec_load(rq, p);
4077 }
4078 4349
4079 p->static_prio = NICE_TO_PRIO(nice); 4350 p->static_prio = NICE_TO_PRIO(nice);
4080 set_load_weight(p); 4351 set_load_weight(p);
@@ -4084,7 +4355,6 @@ void set_user_nice(struct task_struct *p, long nice)
4084 4355
4085 if (on_rq) { 4356 if (on_rq) {
4086 enqueue_task(rq, p, 0); 4357 enqueue_task(rq, p, 0);
4087 inc_load(rq, p);
4088 /* 4358 /*
4089 * If the task increased its priority or is running and 4359 * If the task increased its priority or is running and
4090 * lowered its priority, then reschedule its CPU: 4360 * lowered its priority, then reschedule its CPU:
@@ -4242,6 +4512,7 @@ int sched_setscheduler(struct task_struct *p, int policy,
4242{ 4512{
4243 int retval, oldprio, oldpolicy = -1, on_rq, running; 4513 int retval, oldprio, oldpolicy = -1, on_rq, running;
4244 unsigned long flags; 4514 unsigned long flags;
4515 const struct sched_class *prev_class = p->sched_class;
4245 struct rq *rq; 4516 struct rq *rq;
4246 4517
4247 /* may grab non-irq protected spin_locks */ 4518 /* may grab non-irq protected spin_locks */
@@ -4322,7 +4593,7 @@ recheck:
4322 } 4593 }
4323 update_rq_clock(rq); 4594 update_rq_clock(rq);
4324 on_rq = p->se.on_rq; 4595 on_rq = p->se.on_rq;
4325 running = task_running(rq, p); 4596 running = task_current(rq, p);
4326 if (on_rq) { 4597 if (on_rq) {
4327 deactivate_task(rq, p, 0); 4598 deactivate_task(rq, p, 0);
4328 if (running) 4599 if (running)
@@ -4335,18 +4606,10 @@ recheck:
4335 if (on_rq) { 4606 if (on_rq) {
4336 if (running) 4607 if (running)
4337 p->sched_class->set_curr_task(rq); 4608 p->sched_class->set_curr_task(rq);
4609
4338 activate_task(rq, p, 0); 4610 activate_task(rq, p, 0);
4339 /* 4611
4340 * Reschedule if we are currently running on this runqueue and 4612 check_class_changed(rq, p, prev_class, oldprio, running);
4341 * our priority decreased, or if we are not currently running on
4342 * this runqueue and our priority is higher than the current's
4343 */
4344 if (running) {
4345 if (p->prio > oldprio)
4346 resched_task(rq->curr);
4347 } else {
4348 check_preempt_curr(rq, p);
4349 }
4350 } 4613 }
4351 __task_rq_unlock(rq); 4614 __task_rq_unlock(rq);
4352 spin_unlock_irqrestore(&p->pi_lock, flags); 4615 spin_unlock_irqrestore(&p->pi_lock, flags);
@@ -4385,8 +4648,8 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
4385 * @policy: new policy. 4648 * @policy: new policy.
4386 * @param: structure containing the new RT priority. 4649 * @param: structure containing the new RT priority.
4387 */ 4650 */
4388asmlinkage long sys_sched_setscheduler(pid_t pid, int policy, 4651asmlinkage long
4389 struct sched_param __user *param) 4652sys_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
4390{ 4653{
4391 /* negative values for policy are not valid */ 4654 /* negative values for policy are not valid */
4392 if (policy < 0) 4655 if (policy < 0)
@@ -4474,19 +4737,19 @@ long sched_setaffinity(pid_t pid, cpumask_t new_mask)
4474 struct task_struct *p; 4737 struct task_struct *p;
4475 int retval; 4738 int retval;
4476 4739
4477 mutex_lock(&sched_hotcpu_mutex); 4740 get_online_cpus();
4478 read_lock(&tasklist_lock); 4741 read_lock(&tasklist_lock);
4479 4742
4480 p = find_process_by_pid(pid); 4743 p = find_process_by_pid(pid);
4481 if (!p) { 4744 if (!p) {
4482 read_unlock(&tasklist_lock); 4745 read_unlock(&tasklist_lock);
4483 mutex_unlock(&sched_hotcpu_mutex); 4746 put_online_cpus();
4484 return -ESRCH; 4747 return -ESRCH;
4485 } 4748 }
4486 4749
4487 /* 4750 /*
4488 * It is not safe to call set_cpus_allowed with the 4751 * It is not safe to call set_cpus_allowed with the
4489 * tasklist_lock held. We will bump the task_struct's 4752 * tasklist_lock held. We will bump the task_struct's
4490 * usage count and then drop tasklist_lock. 4753 * usage count and then drop tasklist_lock.
4491 */ 4754 */
4492 get_task_struct(p); 4755 get_task_struct(p);
@@ -4520,7 +4783,7 @@ long sched_setaffinity(pid_t pid, cpumask_t new_mask)
4520 } 4783 }
4521out_unlock: 4784out_unlock:
4522 put_task_struct(p); 4785 put_task_struct(p);
4523 mutex_unlock(&sched_hotcpu_mutex); 4786 put_online_cpus();
4524 return retval; 4787 return retval;
4525} 4788}
4526 4789
@@ -4577,7 +4840,7 @@ long sched_getaffinity(pid_t pid, cpumask_t *mask)
4577 struct task_struct *p; 4840 struct task_struct *p;
4578 int retval; 4841 int retval;
4579 4842
4580 mutex_lock(&sched_hotcpu_mutex); 4843 get_online_cpus();
4581 read_lock(&tasklist_lock); 4844 read_lock(&tasklist_lock);
4582 4845
4583 retval = -ESRCH; 4846 retval = -ESRCH;
@@ -4593,7 +4856,7 @@ long sched_getaffinity(pid_t pid, cpumask_t *mask)
4593 4856
4594out_unlock: 4857out_unlock:
4595 read_unlock(&tasklist_lock); 4858 read_unlock(&tasklist_lock);
4596 mutex_unlock(&sched_hotcpu_mutex); 4859 put_online_cpus();
4597 4860
4598 return retval; 4861 return retval;
4599} 4862}
@@ -4667,7 +4930,8 @@ static void __cond_resched(void)
4667 } while (need_resched()); 4930 } while (need_resched());
4668} 4931}
4669 4932
4670int __sched cond_resched(void) 4933#if !defined(CONFIG_PREEMPT) || defined(CONFIG_PREEMPT_VOLUNTARY)
4934int __sched _cond_resched(void)
4671{ 4935{
4672 if (need_resched() && !(preempt_count() & PREEMPT_ACTIVE) && 4936 if (need_resched() && !(preempt_count() & PREEMPT_ACTIVE) &&
4673 system_state == SYSTEM_RUNNING) { 4937 system_state == SYSTEM_RUNNING) {
@@ -4676,31 +4940,28 @@ int __sched cond_resched(void)
4676 } 4940 }
4677 return 0; 4941 return 0;
4678} 4942}
4679EXPORT_SYMBOL(cond_resched); 4943EXPORT_SYMBOL(_cond_resched);
4944#endif
4680 4945
4681/* 4946/*
4682 * cond_resched_lock() - if a reschedule is pending, drop the given lock, 4947 * cond_resched_lock() - if a reschedule is pending, drop the given lock,
4683 * call schedule, and on return reacquire the lock. 4948 * call schedule, and on return reacquire the lock.
4684 * 4949 *
4685 * This works OK both with and without CONFIG_PREEMPT. We do strange low-level 4950 * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
4686 * operations here to prevent schedule() from being called twice (once via 4951 * operations here to prevent schedule() from being called twice (once via
4687 * spin_unlock(), once by hand). 4952 * spin_unlock(), once by hand).
4688 */ 4953 */
4689int cond_resched_lock(spinlock_t *lock) 4954int cond_resched_lock(spinlock_t *lock)
4690{ 4955{
4956 int resched = need_resched() && system_state == SYSTEM_RUNNING;
4691 int ret = 0; 4957 int ret = 0;
4692 4958
4693 if (need_lockbreak(lock)) { 4959 if (spin_needbreak(lock) || resched) {
4694 spin_unlock(lock); 4960 spin_unlock(lock);
4695 cpu_relax(); 4961 if (resched && need_resched())
4696 ret = 1; 4962 __cond_resched();
4697 spin_lock(lock); 4963 else
4698 } 4964 cpu_relax();
4699 if (need_resched() && system_state == SYSTEM_RUNNING) {
4700 spin_release(&lock->dep_map, 1, _THIS_IP_);
4701 _raw_spin_unlock(lock);
4702 preempt_enable_no_resched();
4703 __cond_resched();
4704 ret = 1; 4965 ret = 1;
4705 spin_lock(lock); 4966 spin_lock(lock);
4706 } 4967 }
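The reworked cond_resched_lock() above drops the lock when it is contended (spin_needbreak()) or when a reschedule is due, and only calls into the scheduler in the latter case. The resulting decision table, as a small standalone sketch with stubbed inputs rather than kernel code:

#include <stdio.h>

static const char *action(int lock_contended, int resched_due)
{
        if (!(lock_contended || resched_due))
                return "keep lock, keep going";
        return resched_due ? "drop lock, schedule(), relock"
                           : "drop lock, cpu_relax(), relock";
}

int main(void)
{
        printf("idle lock, no resched:   %s\n", action(0, 0));
        printf("contended lock:          %s\n", action(1, 0));
        printf("reschedule pending:      %s\n", action(0, 1));
        return 0;
}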
@@ -4736,7 +4997,7 @@ void __sched yield(void)
4736EXPORT_SYMBOL(yield); 4997EXPORT_SYMBOL(yield);
4737 4998
4738/* 4999/*
4739 * This task is about to go to sleep on IO. Increment rq->nr_iowait so 5000 * This task is about to go to sleep on IO. Increment rq->nr_iowait so
4740 * that process accounting knows that this is a task in IO wait state. 5001 * that process accounting knows that this is a task in IO wait state.
4741 * 5002 *
4742 * But don't do that if it is a deliberate, throttling IO wait (this task 5003 * But don't do that if it is a deliberate, throttling IO wait (this task
@@ -4845,17 +5106,21 @@ long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval)
4845 if (retval) 5106 if (retval)
4846 goto out_unlock; 5107 goto out_unlock;
4847 5108
4848 if (p->policy == SCHED_FIFO) 5109 /*
4849 time_slice = 0; 5110 * Time slice is 0 for SCHED_FIFO tasks and for SCHED_OTHER
4850 else if (p->policy == SCHED_RR) 5111 * tasks that are on an otherwise idle runqueue:
5112 */
5113 time_slice = 0;
5114 if (p->policy == SCHED_RR) {
4851 time_slice = DEF_TIMESLICE; 5115 time_slice = DEF_TIMESLICE;
4852 else { 5116 } else {
4853 struct sched_entity *se = &p->se; 5117 struct sched_entity *se = &p->se;
4854 unsigned long flags; 5118 unsigned long flags;
4855 struct rq *rq; 5119 struct rq *rq;
4856 5120
4857 rq = task_rq_lock(p, &flags); 5121 rq = task_rq_lock(p, &flags);
4858 time_slice = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se)); 5122 if (rq->cfs.load.weight)
5123 time_slice = NS_TO_JIFFIES(sched_slice(&rq->cfs, se));
4859 task_rq_unlock(rq, &flags); 5124 task_rq_unlock(rq, &flags);
4860 } 5125 }
4861 read_unlock(&tasklist_lock); 5126 read_unlock(&tasklist_lock);
@@ -4870,7 +5135,7 @@ out_unlock:
4870 5135
4871static const char stat_nam[] = "RSDTtZX"; 5136static const char stat_nam[] = "RSDTtZX";
4872 5137
4873static void show_task(struct task_struct *p) 5138void sched_show_task(struct task_struct *p)
4874{ 5139{
4875 unsigned long free = 0; 5140 unsigned long free = 0;
4876 unsigned state; 5141 unsigned state;
@@ -4898,10 +5163,9 @@ static void show_task(struct task_struct *p)
4898 } 5163 }
4899#endif 5164#endif
4900 printk(KERN_CONT "%5lu %5d %6d\n", free, 5165 printk(KERN_CONT "%5lu %5d %6d\n", free,
4901 task_pid_nr(p), task_pid_nr(p->parent)); 5166 task_pid_nr(p), task_pid_nr(p->real_parent));
4902 5167
4903 if (state != TASK_RUNNING) 5168 show_stack(p, NULL);
4904 show_stack(p, NULL);
4905} 5169}
4906 5170
4907void show_state_filter(unsigned long state_filter) 5171void show_state_filter(unsigned long state_filter)
@@ -4923,7 +5187,7 @@ void show_state_filter(unsigned long state_filter)
4923 */ 5187 */
4924 touch_nmi_watchdog(); 5188 touch_nmi_watchdog();
4925 if (!state_filter || (p->state & state_filter)) 5189 if (!state_filter || (p->state & state_filter))
4926 show_task(p); 5190 sched_show_task(p);
4927 } while_each_thread(g, p); 5191 } while_each_thread(g, p);
4928 5192
4929 touch_all_softlockup_watchdogs(); 5193 touch_all_softlockup_watchdogs();
@@ -4972,11 +5236,8 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
4972 spin_unlock_irqrestore(&rq->lock, flags); 5236 spin_unlock_irqrestore(&rq->lock, flags);
4973 5237
4974 /* Set the preempt count _outside_ the spinlocks! */ 5238 /* Set the preempt count _outside_ the spinlocks! */
4975#if defined(CONFIG_PREEMPT) && !defined(CONFIG_PREEMPT_BKL)
4976 task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0);
4977#else
4978 task_thread_info(idle)->preempt_count = 0; 5239 task_thread_info(idle)->preempt_count = 0;
4979#endif 5240
4980 /* 5241 /*
4981 * The idle tasks have their own, simple scheduling class: 5242 * The idle tasks have their own, simple scheduling class:
4982 */ 5243 */
@@ -4992,6 +5253,32 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
4992 */ 5253 */
4993cpumask_t nohz_cpu_mask = CPU_MASK_NONE; 5254cpumask_t nohz_cpu_mask = CPU_MASK_NONE;
4994 5255
5256/*
5257 * Increase the granularity value when there are more CPUs,
5258 * because with more CPUs the 'effective latency' as visible
5259 * to users decreases. But the relationship is not linear,
5260 * so pick a second-best guess by going with the log2 of the
5261 * number of CPUs.
5262 *
5263 * This idea comes from the SD scheduler of Con Kolivas:
5264 */
5265static inline void sched_init_granularity(void)
5266{
5267 unsigned int factor = 1 + ilog2(num_online_cpus());
5268 const unsigned long limit = 200000000;
5269
5270 sysctl_sched_min_granularity *= factor;
5271 if (sysctl_sched_min_granularity > limit)
5272 sysctl_sched_min_granularity = limit;
5273
5274 sysctl_sched_latency *= factor;
5275 if (sysctl_sched_latency > limit)
5276 sysctl_sched_latency = limit;
5277
5278 sysctl_sched_wakeup_granularity *= factor;
5279 sysctl_sched_batch_wakeup_granularity *= factor;
5280}
5281
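To make the log2 scaling concrete, a standalone calculation is shown below; the 4 ms minimum granularity and 20 ms latency baselines are assumed purely for illustration, only the 200 ms cap comes from the code above:

#include <stdio.h>

/* illustration only: mirrors factor = 1 + ilog2(num_online_cpus()) */
static unsigned int ilog2_u(unsigned int v)
{
	unsigned int r = 0;

	while (v >>= 1)
		r++;
	return r;
}

int main(void)
{
	const unsigned long limit = 200000000UL;	/* 200 ms, in ns */
	const unsigned long min_gran = 4000000UL;	/* assumed default */
	const unsigned long latency = 20000000UL;	/* assumed default */
	unsigned int ncpus;

	for (ncpus = 1; ncpus <= 64; ncpus *= 4) {
		unsigned int factor = 1 + ilog2_u(ncpus);
		unsigned long g = min_gran * factor;
		unsigned long l = latency * factor;

		if (g > limit)
			g = limit;
		if (l > limit)
			l = limit;
		printf("%2u cpus: factor=%u min_gran=%lu ms latency=%lu ms\n",
		       ncpus, factor, g / 1000000, l / 1000000);
	}
	return 0;
}

With 64 CPUs the factor is 7, so the latency target grows from 20 ms to 140 ms, still below the 200 ms clamp.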
4995#ifdef CONFIG_SMP 5282#ifdef CONFIG_SMP
4996/* 5283/*
4997 * This is how migration works: 5284 * This is how migration works:
@@ -5015,7 +5302,7 @@ cpumask_t nohz_cpu_mask = CPU_MASK_NONE;
5015 * is removed from the allowed bitmask. 5302 * is removed from the allowed bitmask.
5016 * 5303 *
5017 * NOTE: the caller must have a valid reference to the task, the 5304 * NOTE: the caller must have a valid reference to the task, the
5018 * task must not exit() & deallocate itself prematurely. The 5305 * task must not exit() & deallocate itself prematurely. The
5019 * call is not atomic; no spinlocks may be held. 5306 * call is not atomic; no spinlocks may be held.
5020 */ 5307 */
5021int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask) 5308int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask)
@@ -5031,7 +5318,13 @@ int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask)
5031 goto out; 5318 goto out;
5032 } 5319 }
5033 5320
5034 p->cpus_allowed = new_mask; 5321 if (p->sched_class->set_cpus_allowed)
5322 p->sched_class->set_cpus_allowed(p, &new_mask);
5323 else {
5324 p->cpus_allowed = new_mask;
5325 p->rt.nr_cpus_allowed = cpus_weight(new_mask);
5326 }
5327
5035 /* Can the task run on the task's current CPU? If so, we're done */ 5328 /* Can the task run on the task's current CPU? If so, we're done */
5036 if (cpu_isset(task_cpu(p), new_mask)) 5329 if (cpu_isset(task_cpu(p), new_mask))
5037 goto out; 5330 goto out;
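set_cpus_allowed() as reshaped above is also the interface kernel threads use to pin themselves; a minimal sketch, assuming kernel context of this era:

/* Sketch: restrict the current (kernel) thread to a single CPU. */
static int pin_to_cpu(int cpu)
{
	cpumask_t mask = cpumask_of_cpu(cpu);

	/* fails if 'cpu' is not in the intersection with the online map */
	return set_cpus_allowed(current, mask);
}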
@@ -5052,7 +5345,7 @@ out:
5052EXPORT_SYMBOL_GPL(set_cpus_allowed); 5345EXPORT_SYMBOL_GPL(set_cpus_allowed);
5053 5346
5054/* 5347/*
5055 * Move (not current) task off this cpu, onto dest cpu. We're doing 5348 * Move (not current) task off this cpu, onto dest cpu. We're doing
5056 * this because either it can't run here any more (set_cpus_allowed() 5349 * this because either it can't run here any more (set_cpus_allowed()
5057 * away from this CPU, or CPU going down), or because we're 5350 * away from this CPU, or CPU going down), or because we're
5058 * attempting to rebalance this task on exec (sched_exec). 5351 * attempting to rebalance this task on exec (sched_exec).
@@ -5197,7 +5490,7 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
5197 * Try to stay on the same cpuset, where the 5490 * Try to stay on the same cpuset, where the
5198 * current cpuset may be a subset of all cpus. 5491 * current cpuset may be a subset of all cpus.
5199 * The cpuset_cpus_allowed_locked() variant of 5492 * The cpuset_cpus_allowed_locked() variant of
5200 * cpuset_cpus_allowed() will not block. It must be 5493 * cpuset_cpus_allowed() will not block. It must be
5201 * called within calls to cpuset_lock/cpuset_unlock. 5494 * called within calls to cpuset_lock/cpuset_unlock.
5202 */ 5495 */
5203 rq = task_rq_lock(p, &flags); 5496 rq = task_rq_lock(p, &flags);
@@ -5210,10 +5503,11 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
5210 * kernel threads (both mm NULL), since they never 5503 * kernel threads (both mm NULL), since they never
5211 * leave kernel. 5504 * leave kernel.
5212 */ 5505 */
5213 if (p->mm && printk_ratelimit()) 5506 if (p->mm && printk_ratelimit()) {
5214 printk(KERN_INFO "process %d (%s) no " 5507 printk(KERN_INFO "process %d (%s) no "
5215 "longer affine to cpu%d\n", 5508 "longer affine to cpu%d\n",
5216 task_pid_nr(p), p->comm, dead_cpu); 5509 task_pid_nr(p), p->comm, dead_cpu);
5510 }
5217 } 5511 }
5218 } while (!__migrate_task_irq(p, dead_cpu, dest_cpu)); 5512 } while (!__migrate_task_irq(p, dead_cpu, dest_cpu));
5219} 5513}
@@ -5257,23 +5551,9 @@ static void migrate_live_tasks(int src_cpu)
5257} 5551}
5258 5552
5259/* 5553/*
5260 * activate_idle_task - move idle task to the _front_ of runqueue.
5261 */
5262static void activate_idle_task(struct task_struct *p, struct rq *rq)
5263{
5264 update_rq_clock(rq);
5265
5266 if (p->state == TASK_UNINTERRUPTIBLE)
5267 rq->nr_uninterruptible--;
5268
5269 enqueue_task(rq, p, 0);
5270 inc_nr_running(p, rq);
5271}
5272
5273/*
5274 * Schedules idle task to be the next runnable task on current CPU. 5554 * Schedules idle task to be the next runnable task on current CPU.
5275 * It does so by boosting its priority to highest possible and adding it to 5555 * It does so by boosting its priority to highest possible.
5276 * the _front_ of the runqueue. Used by CPU offline code. 5556 * Used by CPU offline code.
5277 */ 5557 */
5278void sched_idle_next(void) 5558void sched_idle_next(void)
5279{ 5559{
@@ -5293,8 +5573,8 @@ void sched_idle_next(void)
5293 5573
5294 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1); 5574 __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1);
5295 5575
5296 /* Add idle task to the _front_ of its priority queue: */ 5576 update_rq_clock(rq);
5297 activate_idle_task(p, rq); 5577 activate_task(rq, p, 0);
5298 5578
5299 spin_unlock_irqrestore(&rq->lock, flags); 5579 spin_unlock_irqrestore(&rq->lock, flags);
5300} 5580}
@@ -5329,7 +5609,7 @@ static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
5329 5609
5330 /* 5610 /*
5331 * Drop lock around migration; if someone else moves it, 5611 * Drop lock around migration; if someone else moves it,
5332 * that's OK. No task can be added to this CPU, so iteration is 5612 * that's OK. No task can be added to this CPU, so iteration is
5333 * fine. 5613 * fine.
5334 */ 5614 */
5335 spin_unlock_irq(&rq->lock); 5615 spin_unlock_irq(&rq->lock);
@@ -5365,7 +5645,7 @@ static struct ctl_table sd_ctl_dir[] = {
5365 .procname = "sched_domain", 5645 .procname = "sched_domain",
5366 .mode = 0555, 5646 .mode = 0555,
5367 }, 5647 },
5368 {0,}, 5648 {0, },
5369}; 5649};
5370 5650
5371static struct ctl_table sd_ctl_root[] = { 5651static struct ctl_table sd_ctl_root[] = {
@@ -5375,7 +5655,7 @@ static struct ctl_table sd_ctl_root[] = {
5375 .mode = 0555, 5655 .mode = 0555,
5376 .child = sd_ctl_dir, 5656 .child = sd_ctl_dir,
5377 }, 5657 },
5378 {0,}, 5658 {0, },
5379}; 5659};
5380 5660
5381static struct ctl_table *sd_alloc_ctl_entry(int n) 5661static struct ctl_table *sd_alloc_ctl_entry(int n)
@@ -5393,7 +5673,7 @@ static void sd_free_ctl_entry(struct ctl_table **tablep)
5393 /* 5673 /*
5394 * In the intermediate directories, both the child directory and 5674 * In the intermediate directories, both the child directory and
5395 * procname are dynamically allocated and could fail but the mode 5675 * procname are dynamically allocated and could fail but the mode
5396 * will always be set. In the lowest directory the names are 5676 * will always be set. In the lowest directory the names are
5397 * static strings and all have proc handlers. 5677 * static strings and all have proc handlers.
5398 */ 5678 */
5399 for (entry = *tablep; entry->mode; entry++) { 5679 for (entry = *tablep; entry->mode; entry++) {
@@ -5455,7 +5735,7 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd)
5455 return table; 5735 return table;
5456} 5736}
5457 5737
5458static ctl_table * sd_alloc_ctl_cpu_table(int cpu) 5738static ctl_table *sd_alloc_ctl_cpu_table(int cpu)
5459{ 5739{
5460 struct ctl_table *entry, *table; 5740 struct ctl_table *entry, *table;
5461 struct sched_domain *sd; 5741 struct sched_domain *sd;
@@ -5536,9 +5816,6 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5536 struct rq *rq; 5816 struct rq *rq;
5537 5817
5538 switch (action) { 5818 switch (action) {
5539 case CPU_LOCK_ACQUIRE:
5540 mutex_lock(&sched_hotcpu_mutex);
5541 break;
5542 5819
5543 case CPU_UP_PREPARE: 5820 case CPU_UP_PREPARE:
5544 case CPU_UP_PREPARE_FROZEN: 5821 case CPU_UP_PREPARE_FROZEN:
@@ -5557,6 +5834,15 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5557 case CPU_ONLINE_FROZEN: 5834 case CPU_ONLINE_FROZEN:
5558 /* Strictly unnecessary, as first user will wake it. */ 5835 /* Strictly unnecessary, as first user will wake it. */
5559 wake_up_process(cpu_rq(cpu)->migration_thread); 5836 wake_up_process(cpu_rq(cpu)->migration_thread);
5837
5838 /* Update our root-domain */
5839 rq = cpu_rq(cpu);
5840 spin_lock_irqsave(&rq->lock, flags);
5841 if (rq->rd) {
5842 BUG_ON(!cpu_isset(cpu, rq->rd->span));
5843 cpu_set(cpu, rq->rd->online);
5844 }
5845 spin_unlock_irqrestore(&rq->lock, flags);
5560 break; 5846 break;
5561 5847
5562#ifdef CONFIG_HOTPLUG_CPU 5848#ifdef CONFIG_HOTPLUG_CPU
@@ -5564,7 +5850,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5564 case CPU_UP_CANCELED_FROZEN: 5850 case CPU_UP_CANCELED_FROZEN:
5565 if (!cpu_rq(cpu)->migration_thread) 5851 if (!cpu_rq(cpu)->migration_thread)
5566 break; 5852 break;
5567 /* Unbind it from offline cpu so it can run. Fall thru. */ 5853 /* Unbind it from offline cpu so it can run. Fall thru. */
5568 kthread_bind(cpu_rq(cpu)->migration_thread, 5854 kthread_bind(cpu_rq(cpu)->migration_thread,
5569 any_online_cpu(cpu_online_map)); 5855 any_online_cpu(cpu_online_map));
5570 kthread_stop(cpu_rq(cpu)->migration_thread); 5856 kthread_stop(cpu_rq(cpu)->migration_thread);
@@ -5591,9 +5877,11 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5591 migrate_nr_uninterruptible(rq); 5877 migrate_nr_uninterruptible(rq);
5592 BUG_ON(rq->nr_running != 0); 5878 BUG_ON(rq->nr_running != 0);
5593 5879
5594 /* No need to migrate the tasks: it was best-effort if 5880 /*
5595 * they didn't take sched_hotcpu_mutex. Just wake up 5881 * No need to migrate the tasks: it was best-effort if
5596 * the requestors. */ 5882 * they didn't take sched_hotcpu_mutex. Just wake up
5883 * the requestors.
5884 */
5597 spin_lock_irq(&rq->lock); 5885 spin_lock_irq(&rq->lock);
5598 while (!list_empty(&rq->migration_queue)) { 5886 while (!list_empty(&rq->migration_queue)) {
5599 struct migration_req *req; 5887 struct migration_req *req;
@@ -5605,10 +5893,18 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
5605 } 5893 }
5606 spin_unlock_irq(&rq->lock); 5894 spin_unlock_irq(&rq->lock);
5607 break; 5895 break;
5608#endif 5896
5609 case CPU_LOCK_RELEASE: 5897 case CPU_DOWN_PREPARE:
5610 mutex_unlock(&sched_hotcpu_mutex); 5898 /* Update our root-domain */
5899 rq = cpu_rq(cpu);
5900 spin_lock_irqsave(&rq->lock, flags);
5901 if (rq->rd) {
5902 BUG_ON(!cpu_isset(cpu, rq->rd->span));
5903 cpu_clear(cpu, rq->rd->online);
5904 }
5905 spin_unlock_irqrestore(&rq->lock, flags);
5611 break; 5906 break;
5907#endif
5612 } 5908 }
5613 return NOTIFY_OK; 5909 return NOTIFY_OK;
5614} 5910}
@@ -5621,7 +5917,7 @@ static struct notifier_block __cpuinitdata migration_notifier = {
5621 .priority = 10 5917 .priority = 10
5622}; 5918};
5623 5919
5624int __init migration_init(void) 5920void __init migration_init(void)
5625{ 5921{
5626 void *cpu = (void *)(long)smp_processor_id(); 5922 void *cpu = (void *)(long)smp_processor_id();
5627 int err; 5923 int err;
@@ -5631,8 +5927,6 @@ int __init migration_init(void)
5631 BUG_ON(err == NOTIFY_BAD); 5927 BUG_ON(err == NOTIFY_BAD);
5632 migration_call(&migration_notifier, CPU_ONLINE, cpu); 5928 migration_call(&migration_notifier, CPU_ONLINE, cpu);
5633 register_cpu_notifier(&migration_notifier); 5929 register_cpu_notifier(&migration_notifier);
5634
5635 return 0;
5636} 5930}
5637#endif 5931#endif
5638 5932
@@ -5798,11 +6092,76 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
5798 return 1; 6092 return 1;
5799} 6093}
5800 6094
6095static void rq_attach_root(struct rq *rq, struct root_domain *rd)
6096{
6097 unsigned long flags;
6098 const struct sched_class *class;
6099
6100 spin_lock_irqsave(&rq->lock, flags);
6101
6102 if (rq->rd) {
6103 struct root_domain *old_rd = rq->rd;
6104
6105 for (class = sched_class_highest; class; class = class->next) {
6106 if (class->leave_domain)
6107 class->leave_domain(rq);
6108 }
6109
6110 cpu_clear(rq->cpu, old_rd->span);
6111 cpu_clear(rq->cpu, old_rd->online);
6112
6113 if (atomic_dec_and_test(&old_rd->refcount))
6114 kfree(old_rd);
6115 }
6116
6117 atomic_inc(&rd->refcount);
6118 rq->rd = rd;
6119
6120 cpu_set(rq->cpu, rd->span);
6121 if (cpu_isset(rq->cpu, cpu_online_map))
6122 cpu_set(rq->cpu, rd->online);
6123
6124 for (class = sched_class_highest; class; class = class->next) {
6125 if (class->join_domain)
6126 class->join_domain(rq);
6127 }
6128
6129 spin_unlock_irqrestore(&rq->lock, flags);
6130}
6131
6132static void init_rootdomain(struct root_domain *rd)
6133{
6134 memset(rd, 0, sizeof(*rd));
6135
6136 cpus_clear(rd->span);
6137 cpus_clear(rd->online);
6138}
6139
6140static void init_defrootdomain(void)
6141{
6142 init_rootdomain(&def_root_domain);
6143 atomic_set(&def_root_domain.refcount, 1);
6144}
6145
6146static struct root_domain *alloc_rootdomain(void)
6147{
6148 struct root_domain *rd;
6149
6150 rd = kmalloc(sizeof(*rd), GFP_KERNEL);
6151 if (!rd)
6152 return NULL;
6153
6154 init_rootdomain(rd);
6155
6156 return rd;
6157}
6158
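rq_attach_root() above is an instance of a common refcounted-swap idiom: pin the new object before publishing it, then drop the old reference and free on the last put. A generic sketch of the same pattern, using the kernel's atomic_t helpers and hypothetical names:

struct shared {
	atomic_t refcount;
	/* payload omitted */
};

/* Caller is assumed to hold whatever lock protects *slot. */
static void attach_shared(struct shared **slot, struct shared *new)
{
	struct shared *old = *slot;

	atomic_inc(&new->refcount);	/* pin the new object first */
	*slot = new;			/* publish */

	if (old && atomic_dec_and_test(&old->refcount))
		kfree(old);		/* last reference is gone */
}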
5801/* 6159/*
5802 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must 6160 * Attach the domain 'sd' to 'cpu' as its base domain. Callers must
5803 * hold the hotplug lock. 6161 * hold the hotplug lock.
5804 */ 6162 */
5805static void cpu_attach_domain(struct sched_domain *sd, int cpu) 6163static void
6164cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
5806{ 6165{
5807 struct rq *rq = cpu_rq(cpu); 6166 struct rq *rq = cpu_rq(cpu);
5808 struct sched_domain *tmp; 6167 struct sched_domain *tmp;
@@ -5827,6 +6186,7 @@ static void cpu_attach_domain(struct sched_domain *sd, int cpu)
5827 6186
5828 sched_domain_debug(sd, cpu); 6187 sched_domain_debug(sd, cpu);
5829 6188
6189 rq_attach_root(rq, rd);
5830 rcu_assign_pointer(rq->sd, sd); 6190 rcu_assign_pointer(rq->sd, sd);
5831} 6191}
5832 6192
@@ -5903,7 +6263,7 @@ init_sched_build_groups(cpumask_t span, const cpumask_t *cpu_map,
5903 * @node: node whose sched_domain we're building 6263 * @node: node whose sched_domain we're building
5904 * @used_nodes: nodes already in the sched_domain 6264 * @used_nodes: nodes already in the sched_domain
5905 * 6265 *
5906 * Find the next node to include in a given scheduling domain. Simply 6266 * Find the next node to include in a given scheduling domain. Simply
5907 * finds the closest node not already in the @used_nodes map. 6267 * finds the closest node not already in the @used_nodes map.
5908 * 6268 *
5909 * Should use nodemask_t. 6269 * Should use nodemask_t.
@@ -5943,7 +6303,7 @@ static int find_next_best_node(int node, unsigned long *used_nodes)
5943 * @node: node whose cpumask we're constructing 6303 * @node: node whose cpumask we're constructing
5944 * @size: number of nodes to include in this span 6304 * @size: number of nodes to include in this span
5945 * 6305 *
5946 * Given a node, construct a good cpumask for its sched_domain to span. It 6306 * Given a node, construct a good cpumask for its sched_domain to span. It
5947 * should be one that prevents unnecessary balancing, but also spreads tasks 6307 * should be one that prevents unnecessary balancing, but also spreads tasks
5948 * out optimally. 6308 * out optimally.
5949 */ 6309 */
@@ -5980,8 +6340,8 @@ int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
5980static DEFINE_PER_CPU(struct sched_domain, cpu_domains); 6340static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
5981static DEFINE_PER_CPU(struct sched_group, sched_group_cpus); 6341static DEFINE_PER_CPU(struct sched_group, sched_group_cpus);
5982 6342
5983static int cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, 6343static int
5984 struct sched_group **sg) 6344cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg)
5985{ 6345{
5986 if (sg) 6346 if (sg)
5987 *sg = &per_cpu(sched_group_cpus, cpu); 6347 *sg = &per_cpu(sched_group_cpus, cpu);
@@ -5998,8 +6358,8 @@ static DEFINE_PER_CPU(struct sched_group, sched_group_core);
5998#endif 6358#endif
5999 6359
6000#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) 6360#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
6001static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map, 6361static int
6002 struct sched_group **sg) 6362cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg)
6003{ 6363{
6004 int group; 6364 int group;
6005 cpumask_t mask = per_cpu(cpu_sibling_map, cpu); 6365 cpumask_t mask = per_cpu(cpu_sibling_map, cpu);
@@ -6010,8 +6370,8 @@ static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map,
6010 return group; 6370 return group;
6011} 6371}
6012#elif defined(CONFIG_SCHED_MC) 6372#elif defined(CONFIG_SCHED_MC)
6013static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map, 6373static int
6014 struct sched_group **sg) 6374cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg)
6015{ 6375{
6016 if (sg) 6376 if (sg)
6017 *sg = &per_cpu(sched_group_core, cpu); 6377 *sg = &per_cpu(sched_group_core, cpu);
@@ -6022,8 +6382,8 @@ static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map,
6022static DEFINE_PER_CPU(struct sched_domain, phys_domains); 6382static DEFINE_PER_CPU(struct sched_domain, phys_domains);
6023static DEFINE_PER_CPU(struct sched_group, sched_group_phys); 6383static DEFINE_PER_CPU(struct sched_group, sched_group_phys);
6024 6384
6025static int cpu_to_phys_group(int cpu, const cpumask_t *cpu_map, 6385static int
6026 struct sched_group **sg) 6386cpu_to_phys_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg)
6027{ 6387{
6028 int group; 6388 int group;
6029#ifdef CONFIG_SCHED_MC 6389#ifdef CONFIG_SCHED_MC
@@ -6195,6 +6555,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
6195static int build_sched_domains(const cpumask_t *cpu_map) 6555static int build_sched_domains(const cpumask_t *cpu_map)
6196{ 6556{
6197 int i; 6557 int i;
6558 struct root_domain *rd;
6198#ifdef CONFIG_NUMA 6559#ifdef CONFIG_NUMA
6199 struct sched_group **sched_group_nodes = NULL; 6560 struct sched_group **sched_group_nodes = NULL;
6200 int sd_allnodes = 0; 6561 int sd_allnodes = 0;
@@ -6203,7 +6564,7 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6203 * Allocate the per-node list of sched groups 6564 * Allocate the per-node list of sched groups
6204 */ 6565 */
6205 sched_group_nodes = kcalloc(MAX_NUMNODES, sizeof(struct sched_group *), 6566 sched_group_nodes = kcalloc(MAX_NUMNODES, sizeof(struct sched_group *),
6206 GFP_KERNEL); 6567 GFP_KERNEL);
6207 if (!sched_group_nodes) { 6568 if (!sched_group_nodes) {
6208 printk(KERN_WARNING "Can not alloc sched group node list\n"); 6569 printk(KERN_WARNING "Can not alloc sched group node list\n");
6209 return -ENOMEM; 6570 return -ENOMEM;
@@ -6211,6 +6572,12 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6211 sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes; 6572 sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes;
6212#endif 6573#endif
6213 6574
6575 rd = alloc_rootdomain();
6576 if (!rd) {
6577 printk(KERN_WARNING "Cannot alloc root domain\n");
6578 return -ENOMEM;
6579 }
6580
6214 /* 6581 /*
6215 * Set up domains for cpus specified by the cpu_map. 6582 * Set up domains for cpus specified by the cpu_map.
6216 */ 6583 */
@@ -6427,7 +6794,7 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6427#else 6794#else
6428 sd = &per_cpu(phys_domains, i); 6795 sd = &per_cpu(phys_domains, i);
6429#endif 6796#endif
6430 cpu_attach_domain(sd, i); 6797 cpu_attach_domain(sd, rd, i);
6431 } 6798 }
6432 6799
6433 return 0; 6800 return 0;
@@ -6450,7 +6817,7 @@ static int ndoms_cur; /* number of sched domains in 'doms_cur' */
6450static cpumask_t fallback_doms; 6817static cpumask_t fallback_doms;
6451 6818
6452/* 6819/*
6453 * Set up scheduler domains and groups. Callers must hold the hotplug lock. 6820 * Set up scheduler domains and groups. Callers must hold the hotplug lock.
6454 * For now this just excludes isolated cpus, but could be used to 6821 * For now this just excludes isolated cpus, but could be used to
6455 * exclude other special cases in the future. 6822 * exclude other special cases in the future.
6456 */ 6823 */
@@ -6485,26 +6852,26 @@ static void detach_destroy_domains(const cpumask_t *cpu_map)
6485 unregister_sched_domain_sysctl(); 6852 unregister_sched_domain_sysctl();
6486 6853
6487 for_each_cpu_mask(i, *cpu_map) 6854 for_each_cpu_mask(i, *cpu_map)
6488 cpu_attach_domain(NULL, i); 6855 cpu_attach_domain(NULL, &def_root_domain, i);
6489 synchronize_sched(); 6856 synchronize_sched();
6490 arch_destroy_sched_domains(cpu_map); 6857 arch_destroy_sched_domains(cpu_map);
6491} 6858}
6492 6859
6493/* 6860/*
6494 * Partition sched domains as specified by the 'ndoms_new' 6861 * Partition sched domains as specified by the 'ndoms_new'
6495 * cpumasks in the array doms_new[] of cpumasks. This compares 6862 * cpumasks in the array doms_new[] of cpumasks. This compares
6496 * doms_new[] to the current sched domain partitioning, doms_cur[]. 6863 * doms_new[] to the current sched domain partitioning, doms_cur[].
6497 * It destroys each deleted domain and builds each new domain. 6864 * It destroys each deleted domain and builds each new domain.
6498 * 6865 *
6499 * 'doms_new' is an array of cpumask_t's of length 'ndoms_new'. 6866 * 'doms_new' is an array of cpumask_t's of length 'ndoms_new'.
6500 * The masks don't intersect (don't overlap.) We should setup one 6867 * The masks don't intersect (don't overlap.) We should setup one
6501 * sched domain for each mask. CPUs not in any of the cpumasks will 6868 * sched domain for each mask. CPUs not in any of the cpumasks will
6502 * not be load balanced. If the same cpumask appears both in the 6869 * not be load balanced. If the same cpumask appears both in the
6503 * current 'doms_cur' domains and in the new 'doms_new', we can leave 6870 * current 'doms_cur' domains and in the new 'doms_new', we can leave
6504 * it as it is. 6871 * it as it is.
6505 * 6872 *
6506 * The passed in 'doms_new' should be kmalloc'd. This routine takes 6873 * The passed in 'doms_new' should be kmalloc'd. This routine takes
6507 * ownership of it and will kfree it when done with it. If the caller 6874 * ownership of it and will kfree it when done with it. If the caller
6508 * failed the kmalloc call, then it can pass in doms_new == NULL, 6875 * failed the kmalloc call, then it can pass in doms_new == NULL,
6509 * and partition_sched_domains() will fallback to the single partition 6876 * and partition_sched_domains() will fallback to the single partition
6510 * 'fallback_doms'. 6877 * 'fallback_doms'.
@@ -6515,6 +6882,8 @@ void partition_sched_domains(int ndoms_new, cpumask_t *doms_new)
6515{ 6882{
6516 int i, j; 6883 int i, j;
6517 6884
6885 lock_doms_cur();
6886
6518 /* always unregister in case we don't destroy any domains */ 6887 /* always unregister in case we don't destroy any domains */
6519 unregister_sched_domain_sysctl(); 6888 unregister_sched_domain_sysctl();
6520 6889
@@ -6555,6 +6924,8 @@ match2:
6555 ndoms_cur = ndoms_new; 6924 ndoms_cur = ndoms_new;
6556 6925
6557 register_sched_domain_sysctl(); 6926 register_sched_domain_sysctl();
6927
6928 unlock_doms_cur();
6558} 6929}
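The ownership rules spelled out in the comment above boil down to: hand partition_sched_domains() a kmalloc'd array of non-overlapping masks and forget about it. A hedged kernel-context sketch of a caller (the locking done by the real caller, the cpuset code, is omitted):

/* Sketch: split load balancing into two independent partitions. */
static int make_two_partitions(cpumask_t a, cpumask_t b)
{
	cpumask_t *doms;

	/* partition_sched_domains() takes ownership and kfree()s this */
	doms = kmalloc(2 * sizeof(cpumask_t), GFP_KERNEL);
	if (!doms)
		return -ENOMEM;

	doms[0] = a;
	doms[1] = b;
	partition_sched_domains(2, doms);
	return 0;
}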
6559 6930
6560#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 6931#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
@@ -6562,10 +6933,10 @@ static int arch_reinit_sched_domains(void)
6562{ 6933{
6563 int err; 6934 int err;
6564 6935
6565 mutex_lock(&sched_hotcpu_mutex); 6936 get_online_cpus();
6566 detach_destroy_domains(&cpu_online_map); 6937 detach_destroy_domains(&cpu_online_map);
6567 err = arch_init_sched_domains(&cpu_online_map); 6938 err = arch_init_sched_domains(&cpu_online_map);
6568 mutex_unlock(&sched_hotcpu_mutex); 6939 put_online_cpus();
6569 6940
6570 return err; 6941 return err;
6571} 6942}
@@ -6634,7 +7005,7 @@ int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
6634#endif 7005#endif
6635 7006
6636/* 7007/*
6637 * Force a reinitialization of the sched domains hierarchy. The domains 7008 * Force a reinitialization of the sched domains hierarchy. The domains
6638 * and groups cannot be updated in place without racing with the balancing 7009 * and groups cannot be updated in place without racing with the balancing
6639 * code, so we temporarily attach all running cpus to the NULL domain 7010 * code, so we temporarily attach all running cpus to the NULL domain
6640 * which will prevent rebalancing while the sched domains are recalculated. 7011 * which will prevent rebalancing while the sched domains are recalculated.
@@ -6676,30 +7047,44 @@ void __init sched_init_smp(void)
6676{ 7047{
6677 cpumask_t non_isolated_cpus; 7048 cpumask_t non_isolated_cpus;
6678 7049
6679 mutex_lock(&sched_hotcpu_mutex); 7050 get_online_cpus();
6680 arch_init_sched_domains(&cpu_online_map); 7051 arch_init_sched_domains(&cpu_online_map);
6681 cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map); 7052 cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map);
6682 if (cpus_empty(non_isolated_cpus)) 7053 if (cpus_empty(non_isolated_cpus))
6683 cpu_set(smp_processor_id(), non_isolated_cpus); 7054 cpu_set(smp_processor_id(), non_isolated_cpus);
6684 mutex_unlock(&sched_hotcpu_mutex); 7055 put_online_cpus();
6685 /* XXX: Theoretical race here - CPU may be hotplugged now */ 7056 /* XXX: Theoretical race here - CPU may be hotplugged now */
6686 hotcpu_notifier(update_sched_domains, 0); 7057 hotcpu_notifier(update_sched_domains, 0);
6687 7058
6688 /* Move init over to a non-isolated CPU */ 7059 /* Move init over to a non-isolated CPU */
6689 if (set_cpus_allowed(current, non_isolated_cpus) < 0) 7060 if (set_cpus_allowed(current, non_isolated_cpus) < 0)
6690 BUG(); 7061 BUG();
7062 sched_init_granularity();
7063
7064#ifdef CONFIG_FAIR_GROUP_SCHED
7065 if (nr_cpu_ids == 1)
7066 return;
7067
7068 lb_monitor_task = kthread_create(load_balance_monitor, NULL,
7069 "group_balance");
7070 if (!IS_ERR(lb_monitor_task)) {
7071 lb_monitor_task->flags |= PF_NOFREEZE;
7072 wake_up_process(lb_monitor_task);
7073 } else {
7074 printk(KERN_ERR "Could not create load balance monitor thread"
7075 "(error = %ld) \n", PTR_ERR(lb_monitor_task));
7076 }
7077#endif
6691} 7078}
6692#else 7079#else
6693void __init sched_init_smp(void) 7080void __init sched_init_smp(void)
6694{ 7081{
7082 sched_init_granularity();
6695} 7083}
6696#endif /* CONFIG_SMP */ 7084#endif /* CONFIG_SMP */
6697 7085
6698int in_sched_functions(unsigned long addr) 7086int in_sched_functions(unsigned long addr)
6699{ 7087{
6700 /* Linker adds these: start and end of __sched functions */
6701 extern char __sched_text_start[], __sched_text_end[];
6702
6703 return in_lock_functions(addr) || 7088 return in_lock_functions(addr) ||
6704 (addr >= (unsigned long)__sched_text_start 7089 (addr >= (unsigned long)__sched_text_start
6705 && addr < (unsigned long)__sched_text_end); 7090 && addr < (unsigned long)__sched_text_end);
@@ -6714,13 +7099,87 @@ static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq)
6714 cfs_rq->min_vruntime = (u64)(-(1LL << 20)); 7099 cfs_rq->min_vruntime = (u64)(-(1LL << 20));
6715} 7100}
6716 7101
7102static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
7103{
7104 struct rt_prio_array *array;
7105 int i;
7106
7107 array = &rt_rq->active;
7108 for (i = 0; i < MAX_RT_PRIO; i++) {
7109 INIT_LIST_HEAD(array->queue + i);
7110 __clear_bit(i, array->bitmap);
7111 }
7112 /* delimiter for bitsearch: */
7113 __set_bit(MAX_RT_PRIO, array->bitmap);
7114
7115#if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED
7116 rt_rq->highest_prio = MAX_RT_PRIO;
7117#endif
7118#ifdef CONFIG_SMP
7119 rt_rq->rt_nr_migratory = 0;
7120 rt_rq->overloaded = 0;
7121#endif
7122
7123 rt_rq->rt_time = 0;
7124 rt_rq->rt_throttled = 0;
7125
7126#ifdef CONFIG_FAIR_GROUP_SCHED
7127 rt_rq->rq = rq;
7128#endif
7129}
7130
7131#ifdef CONFIG_FAIR_GROUP_SCHED
7132static void init_tg_cfs_entry(struct rq *rq, struct task_group *tg,
7133 struct cfs_rq *cfs_rq, struct sched_entity *se,
7134 int cpu, int add)
7135{
7136 tg->cfs_rq[cpu] = cfs_rq;
7137 init_cfs_rq(cfs_rq, rq);
7138 cfs_rq->tg = tg;
7139 if (add)
7140 list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
7141
7142 tg->se[cpu] = se;
7143 se->cfs_rq = &rq->cfs;
7144 se->my_q = cfs_rq;
7145 se->load.weight = tg->shares;
7146 se->load.inv_weight = div64_64(1ULL<<32, se->load.weight);
7147 se->parent = NULL;
7148}
7149
7150static void init_tg_rt_entry(struct rq *rq, struct task_group *tg,
7151 struct rt_rq *rt_rq, struct sched_rt_entity *rt_se,
7152 int cpu, int add)
7153{
7154 tg->rt_rq[cpu] = rt_rq;
7155 init_rt_rq(rt_rq, rq);
7156 rt_rq->tg = tg;
7157 rt_rq->rt_se = rt_se;
7158 if (add)
7159 list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list);
7160
7161 tg->rt_se[cpu] = rt_se;
7162 rt_se->rt_rq = &rq->rt;
7163 rt_se->my_q = rt_rq;
7164 rt_se->parent = NULL;
7165 INIT_LIST_HEAD(&rt_se->run_list);
7166}
7167#endif
7168
6717void __init sched_init(void) 7169void __init sched_init(void)
6718{ 7170{
6719 int highest_cpu = 0; 7171 int highest_cpu = 0;
6720 int i, j; 7172 int i, j;
6721 7173
7174#ifdef CONFIG_SMP
7175 init_defrootdomain();
7176#endif
7177
7178#ifdef CONFIG_FAIR_GROUP_SCHED
7179 list_add(&init_task_group.list, &task_groups);
7180#endif
7181
6722 for_each_possible_cpu(i) { 7182 for_each_possible_cpu(i) {
6723 struct rt_prio_array *array;
6724 struct rq *rq; 7183 struct rq *rq;
6725 7184
6726 rq = cpu_rq(i); 7185 rq = cpu_rq(i);
@@ -6729,52 +7188,39 @@ void __init sched_init(void)
6729 rq->nr_running = 0; 7188 rq->nr_running = 0;
6730 rq->clock = 1; 7189 rq->clock = 1;
6731 init_cfs_rq(&rq->cfs, rq); 7190 init_cfs_rq(&rq->cfs, rq);
7191 init_rt_rq(&rq->rt, rq);
6732#ifdef CONFIG_FAIR_GROUP_SCHED 7192#ifdef CONFIG_FAIR_GROUP_SCHED
6733 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
6734 {
6735 struct cfs_rq *cfs_rq = &per_cpu(init_cfs_rq, i);
6736 struct sched_entity *se =
6737 &per_cpu(init_sched_entity, i);
6738
6739 init_cfs_rq_p[i] = cfs_rq;
6740 init_cfs_rq(cfs_rq, rq);
6741 cfs_rq->tg = &init_task_group;
6742 list_add(&cfs_rq->leaf_cfs_rq_list,
6743 &rq->leaf_cfs_rq_list);
6744
6745 init_sched_entity_p[i] = se;
6746 se->cfs_rq = &rq->cfs;
6747 se->my_q = cfs_rq;
6748 se->load.weight = init_task_group_load;
6749 se->load.inv_weight =
6750 div64_64(1ULL<<32, init_task_group_load);
6751 se->parent = NULL;
6752 }
6753 init_task_group.shares = init_task_group_load; 7193 init_task_group.shares = init_task_group_load;
6754 spin_lock_init(&init_task_group.lock); 7194 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
7195 init_tg_cfs_entry(rq, &init_task_group,
7196 &per_cpu(init_cfs_rq, i),
7197 &per_cpu(init_sched_entity, i), i, 1);
7198
7199 init_task_group.rt_ratio = sysctl_sched_rt_ratio; /* XXX */
7200 INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
7201 init_tg_rt_entry(rq, &init_task_group,
7202 &per_cpu(init_rt_rq, i),
7203 &per_cpu(init_sched_rt_entity, i), i, 1);
6755#endif 7204#endif
7205 rq->rt_period_expire = 0;
7206 rq->rt_throttled = 0;
6756 7207
6757 for (j = 0; j < CPU_LOAD_IDX_MAX; j++) 7208 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
6758 rq->cpu_load[j] = 0; 7209 rq->cpu_load[j] = 0;
6759#ifdef CONFIG_SMP 7210#ifdef CONFIG_SMP
6760 rq->sd = NULL; 7211 rq->sd = NULL;
7212 rq->rd = NULL;
6761 rq->active_balance = 0; 7213 rq->active_balance = 0;
6762 rq->next_balance = jiffies; 7214 rq->next_balance = jiffies;
6763 rq->push_cpu = 0; 7215 rq->push_cpu = 0;
6764 rq->cpu = i; 7216 rq->cpu = i;
6765 rq->migration_thread = NULL; 7217 rq->migration_thread = NULL;
6766 INIT_LIST_HEAD(&rq->migration_queue); 7218 INIT_LIST_HEAD(&rq->migration_queue);
7219 rq_attach_root(rq, &def_root_domain);
6767#endif 7220#endif
7221 init_rq_hrtick(rq);
6768 atomic_set(&rq->nr_iowait, 0); 7222 atomic_set(&rq->nr_iowait, 0);
6769
6770 array = &rq->rt.active;
6771 for (j = 0; j < MAX_RT_PRIO; j++) {
6772 INIT_LIST_HEAD(array->queue + j);
6773 __clear_bit(j, array->bitmap);
6774 }
6775 highest_cpu = i; 7223 highest_cpu = i;
6776 /* delimiter for bitsearch: */
6777 __set_bit(MAX_RT_PRIO, array->bitmap);
6778 } 7224 }
6779 7225
6780 set_load_weight(&init_task); 7226 set_load_weight(&init_task);
@@ -6925,8 +7371,8 @@ struct task_struct *curr_task(int cpu)
6925 * @p: the task pointer to set. 7371 * @p: the task pointer to set.
6926 * 7372 *
6927 * Description: This function must only be used when non-maskable interrupts 7373 * Description: This function must only be used when non-maskable interrupts
6928 * are serviced on a separate stack. It allows the architecture to switch the 7374 * are serviced on a separate stack. It allows the architecture to switch the
6929 * notion of the current task on a cpu in a non-blocking manner. This function 7375 * notion of the current task on a cpu in a non-blocking manner. This function
6930 * must be called with all CPU's synchronized, and interrupts disabled, the 7376 * must be called with all CPU's synchronized, and interrupts disabled, the
6931 * and caller must save the original value of the current task (see 7377 * and caller must save the original value of the current task (see
6932 * curr_task() above) and restore that value before reenabling interrupts and 7378 * curr_task() above) and restore that value before reenabling interrupts and
@@ -6943,12 +7389,187 @@ void set_curr_task(int cpu, struct task_struct *p)
6943 7389
6944#ifdef CONFIG_FAIR_GROUP_SCHED 7390#ifdef CONFIG_FAIR_GROUP_SCHED
6945 7391
7392#ifdef CONFIG_SMP
7393/*
7394 * distribute shares of all task groups among their schedulable entities,
7395 * to reflect load distribution across cpus.
7396 */
7397static int rebalance_shares(struct sched_domain *sd, int this_cpu)
7398{
7399 struct cfs_rq *cfs_rq;
7400 struct rq *rq = cpu_rq(this_cpu);
7401 cpumask_t sdspan = sd->span;
7402 int balanced = 1;
7403
 7404	/* Walk through all the task groups that we have */
7405 for_each_leaf_cfs_rq(rq, cfs_rq) {
7406 int i;
7407 unsigned long total_load = 0, total_shares;
7408 struct task_group *tg = cfs_rq->tg;
7409
7410 /* Gather total task load of this group across cpus */
7411 for_each_cpu_mask(i, sdspan)
7412 total_load += tg->cfs_rq[i]->load.weight;
7413
7414 /* Nothing to do if this group has no load */
7415 if (!total_load)
7416 continue;
7417
7418 /*
7419 * tg->shares represents the number of cpu shares the task group
7420 * is eligible to hold on a single cpu. On N cpus, it is
7421 * eligible to hold (N * tg->shares) number of cpu shares.
7422 */
7423 total_shares = tg->shares * cpus_weight(sdspan);
7424
7425 /*
7426 * redistribute total_shares across cpus as per the task load
7427 * distribution.
7428 */
7429 for_each_cpu_mask(i, sdspan) {
7430 unsigned long local_load, local_shares;
7431
7432 local_load = tg->cfs_rq[i]->load.weight;
7433 local_shares = (local_load * total_shares) / total_load;
7434 if (!local_shares)
7435 local_shares = MIN_GROUP_SHARES;
7436 if (local_shares == tg->se[i]->load.weight)
7437 continue;
7438
7439 spin_lock_irq(&cpu_rq(i)->lock);
7440 set_se_shares(tg->se[i], local_shares);
7441 spin_unlock_irq(&cpu_rq(i)->lock);
7442 balanced = 0;
7443 }
7444 }
7445
7446 return balanced;
7447}
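The proportional redistribution above is easiest to see with numbers. A standalone illustration (the figures and the MIN_GROUP_SHARES floor value are assumptions) for a group with shares = 1024 spread over a 4-CPU domain whose per-CPU group load is 600/300/100/0:

#include <stdio.h>

#define MIN_GROUP_SHARES 2	/* assumed floor, standing in for the kernel constant */

int main(void)
{
	const unsigned long shares = 1024;		/* tg->shares */
	const unsigned long load[4] = { 600, 300, 100, 0 };
	unsigned long total_load = 0, total_shares, i;

	for (i = 0; i < 4; i++)
		total_load += load[i];

	/* eligible shares across the whole domain: N * tg->shares */
	total_shares = shares * 4;

	for (i = 0; i < 4; i++) {
		unsigned long local = load[i] * total_shares / total_load;

		if (!local)
			local = MIN_GROUP_SHARES;
		printf("cpu%lu: load=%lu -> shares=%lu\n", i, load[i], local);
	}
	return 0;	/* prints roughly 2457 / 1228 / 409 / 2 */
}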
7448
7449/*
7450 * How frequently should we rebalance_shares() across cpus?
7451 *
7452 * The more frequently we rebalance shares, the more accurate is the fairness
7453 * of cpu bandwidth distribution between task groups. However higher frequency
7454 * also implies increased scheduling overhead.
7455 *
7456 * sysctl_sched_min_bal_int_shares represents the minimum interval between
7457 * consecutive calls to rebalance_shares() in the same sched domain.
7458 *
7459 * sysctl_sched_max_bal_int_shares represents the maximum interval between
7460 * consecutive calls to rebalance_shares() in the same sched domain.
7461 *
 7462 * These settings allow for the appropriate trade-off between accuracy of
7463 * fairness and the associated overhead.
7464 *
7465 */
7466
7467/* default: 8ms, units: milliseconds */
7468const_debug unsigned int sysctl_sched_min_bal_int_shares = 8;
7469
7470/* default: 128ms, units: milliseconds */
7471const_debug unsigned int sysctl_sched_max_bal_int_shares = 128;
7472
7473/* kernel thread that runs rebalance_shares() periodically */
7474static int load_balance_monitor(void *unused)
7475{
7476 unsigned int timeout = sysctl_sched_min_bal_int_shares;
7477 struct sched_param schedparm;
7478 int ret;
7479
7480 /*
7481 * We don't want this thread's execution to be limited by the shares
7482 * assigned to default group (init_task_group). Hence make it run
7483 * as a SCHED_RR RT task at the lowest priority.
7484 */
7485 schedparm.sched_priority = 1;
7486 ret = sched_setscheduler(current, SCHED_RR, &schedparm);
7487 if (ret)
7488 printk(KERN_ERR "Couldn't set SCHED_RR policy for load balance"
7489 " monitor thread (error = %d) \n", ret);
7490
7491 while (!kthread_should_stop()) {
7492 int i, cpu, balanced = 1;
7493
7494 /* Prevent cpus going down or coming up */
7495 get_online_cpus();
7496 /* lockout changes to doms_cur[] array */
7497 lock_doms_cur();
7498 /*
7499 * Enter a rcu read-side critical section to safely walk rq->sd
7500 * chain on various cpus and to walk task group list
7501 * (rq->leaf_cfs_rq_list) in rebalance_shares().
7502 */
7503 rcu_read_lock();
7504
7505 for (i = 0; i < ndoms_cur; i++) {
7506 cpumask_t cpumap = doms_cur[i];
7507 struct sched_domain *sd = NULL, *sd_prev = NULL;
7508
7509 cpu = first_cpu(cpumap);
7510
7511 /* Find the highest domain at which to balance shares */
7512 for_each_domain(cpu, sd) {
7513 if (!(sd->flags & SD_LOAD_BALANCE))
7514 continue;
7515 sd_prev = sd;
7516 }
7517
7518 sd = sd_prev;
7519 /* sd == NULL? No load balance reqd in this domain */
7520 if (!sd)
7521 continue;
7522
7523 balanced &= rebalance_shares(sd, cpu);
7524 }
7525
7526 rcu_read_unlock();
7527
7528 unlock_doms_cur();
7529 put_online_cpus();
7530
7531 if (!balanced)
7532 timeout = sysctl_sched_min_bal_int_shares;
7533 else if (timeout < sysctl_sched_max_bal_int_shares)
7534 timeout *= 2;
7535
7536 msleep_interruptible(timeout);
7537 }
7538
7539 return 0;
7540}
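With the defaults above, a system whose groups stay balanced sees the monitor's sleep double from 8 ms up to the 128 ms ceiling and then hold there; any imbalance snaps it back to 8 ms. The doubling, in isolation:

#include <stdio.h>

int main(void)
{
	unsigned int timeout = 8;	/* sysctl_sched_min_bal_int_shares */
	const unsigned int max = 128;	/* sysctl_sched_max_bal_int_shares */
	int pass;

	/* every balanced pass doubles the sleep, clamped at the maximum */
	for (pass = 0; pass < 8; pass++) {
		printf("pass %d: sleep %u ms\n", pass, timeout);
		if (timeout < max)
			timeout *= 2;
	}
	return 0;	/* prints 8, 16, 32, 64, 128, 128, ... */
}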
7541#endif /* CONFIG_SMP */
7542
7543static void free_sched_group(struct task_group *tg)
7544{
7545 int i;
7546
7547 for_each_possible_cpu(i) {
7548 if (tg->cfs_rq)
7549 kfree(tg->cfs_rq[i]);
7550 if (tg->se)
7551 kfree(tg->se[i]);
7552 if (tg->rt_rq)
7553 kfree(tg->rt_rq[i]);
7554 if (tg->rt_se)
7555 kfree(tg->rt_se[i]);
7556 }
7557
7558 kfree(tg->cfs_rq);
7559 kfree(tg->se);
7560 kfree(tg->rt_rq);
7561 kfree(tg->rt_se);
7562 kfree(tg);
7563}
7564
6946/* allocate runqueue etc for a new task group */ 7565/* allocate runqueue etc for a new task group */
6947struct task_group *sched_create_group(void) 7566struct task_group *sched_create_group(void)
6948{ 7567{
6949 struct task_group *tg; 7568 struct task_group *tg;
6950 struct cfs_rq *cfs_rq; 7569 struct cfs_rq *cfs_rq;
6951 struct sched_entity *se; 7570 struct sched_entity *se;
7571 struct rt_rq *rt_rq;
7572 struct sched_rt_entity *rt_se;
6952 struct rq *rq; 7573 struct rq *rq;
6953 int i; 7574 int i;
6954 7575
@@ -6962,97 +7583,89 @@ struct task_group *sched_create_group(void)
6962 tg->se = kzalloc(sizeof(se) * NR_CPUS, GFP_KERNEL); 7583 tg->se = kzalloc(sizeof(se) * NR_CPUS, GFP_KERNEL);
6963 if (!tg->se) 7584 if (!tg->se)
6964 goto err; 7585 goto err;
7586 tg->rt_rq = kzalloc(sizeof(rt_rq) * NR_CPUS, GFP_KERNEL);
7587 if (!tg->rt_rq)
7588 goto err;
7589 tg->rt_se = kzalloc(sizeof(rt_se) * NR_CPUS, GFP_KERNEL);
7590 if (!tg->rt_se)
7591 goto err;
7592
7593 tg->shares = NICE_0_LOAD;
7594 tg->rt_ratio = 0; /* XXX */
6965 7595
6966 for_each_possible_cpu(i) { 7596 for_each_possible_cpu(i) {
6967 rq = cpu_rq(i); 7597 rq = cpu_rq(i);
6968 7598
6969 cfs_rq = kmalloc_node(sizeof(struct cfs_rq), GFP_KERNEL, 7599 cfs_rq = kmalloc_node(sizeof(struct cfs_rq),
6970 cpu_to_node(i)); 7600 GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
6971 if (!cfs_rq) 7601 if (!cfs_rq)
6972 goto err; 7602 goto err;
6973 7603
6974 se = kmalloc_node(sizeof(struct sched_entity), GFP_KERNEL, 7604 se = kmalloc_node(sizeof(struct sched_entity),
6975 cpu_to_node(i)); 7605 GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
6976 if (!se) 7606 if (!se)
6977 goto err; 7607 goto err;
6978 7608
6979 memset(cfs_rq, 0, sizeof(struct cfs_rq)); 7609 rt_rq = kmalloc_node(sizeof(struct rt_rq),
6980 memset(se, 0, sizeof(struct sched_entity)); 7610 GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
7611 if (!rt_rq)
7612 goto err;
6981 7613
6982 tg->cfs_rq[i] = cfs_rq; 7614 rt_se = kmalloc_node(sizeof(struct sched_rt_entity),
6983 init_cfs_rq(cfs_rq, rq); 7615 GFP_KERNEL|__GFP_ZERO, cpu_to_node(i));
6984 cfs_rq->tg = tg; 7616 if (!rt_se)
7617 goto err;
6985 7618
6986 tg->se[i] = se; 7619 init_tg_cfs_entry(rq, tg, cfs_rq, se, i, 0);
6987 se->cfs_rq = &rq->cfs; 7620 init_tg_rt_entry(rq, tg, rt_rq, rt_se, i, 0);
6988 se->my_q = cfs_rq;
6989 se->load.weight = NICE_0_LOAD;
6990 se->load.inv_weight = div64_64(1ULL<<32, NICE_0_LOAD);
6991 se->parent = NULL;
6992 } 7621 }
6993 7622
7623 lock_task_group_list();
6994 for_each_possible_cpu(i) { 7624 for_each_possible_cpu(i) {
6995 rq = cpu_rq(i); 7625 rq = cpu_rq(i);
6996 cfs_rq = tg->cfs_rq[i]; 7626 cfs_rq = tg->cfs_rq[i];
6997 list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); 7627 list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
7628 rt_rq = tg->rt_rq[i];
7629 list_add_rcu(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list);
6998 } 7630 }
6999 7631 list_add_rcu(&tg->list, &task_groups);
7000 tg->shares = NICE_0_LOAD; 7632 unlock_task_group_list();
7001 spin_lock_init(&tg->lock);
7002 7633
7003 return tg; 7634 return tg;
7004 7635
7005err: 7636err:
7006 for_each_possible_cpu(i) { 7637 free_sched_group(tg);
7007 if (tg->cfs_rq)
7008 kfree(tg->cfs_rq[i]);
7009 if (tg->se)
7010 kfree(tg->se[i]);
7011 }
7012 kfree(tg->cfs_rq);
7013 kfree(tg->se);
7014 kfree(tg);
7015
7016 return ERR_PTR(-ENOMEM); 7638 return ERR_PTR(-ENOMEM);
7017} 7639}
7018 7640
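Taken together, the lifecycle this file exposes is sched_create_group() -> sched_group_set_shares() -> sched_destroy_group(). A hedged kernel-context sketch of a caller (the real callers are the cgroup cpu controller and, in other configurations, the per-uid grouping code):

/* Sketch: create a task group with double the default weight. */
static struct task_group *make_heavy_group(void)
{
	struct task_group *tg = sched_create_group();

	if (IS_ERR(tg))
		return tg;

	/* NICE_0_LOAD is the default weight; give this group twice that */
	sched_group_set_shares(tg, 2 * NICE_0_LOAD);
	return tg;
}

/* Later, once no task references the group any more: */
static void drop_group(struct task_group *tg)
{
	sched_destroy_group(tg);	/* frees via RCU, as shown above */
}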
7019/* rcu callback to free various structures associated with a task group */ 7641/* rcu callback to free various structures associated with a task group */
7020static void free_sched_group(struct rcu_head *rhp) 7642static void free_sched_group_rcu(struct rcu_head *rhp)
7021{ 7643{
7022 struct cfs_rq *cfs_rq = container_of(rhp, struct cfs_rq, rcu);
7023 struct task_group *tg = cfs_rq->tg;
7024 struct sched_entity *se;
7025 int i;
7026
7027 /* now it should be safe to free those cfs_rqs */ 7644 /* now it should be safe to free those cfs_rqs */
7028 for_each_possible_cpu(i) { 7645 free_sched_group(container_of(rhp, struct task_group, rcu));
7029 cfs_rq = tg->cfs_rq[i];
7030 kfree(cfs_rq);
7031
7032 se = tg->se[i];
7033 kfree(se);
7034 }
7035
7036 kfree(tg->cfs_rq);
7037 kfree(tg->se);
7038 kfree(tg);
7039} 7646}
7040 7647
7041/* Destroy runqueue etc associated with a task group */ 7648/* Destroy runqueue etc associated with a task group */
7042void sched_destroy_group(struct task_group *tg) 7649void sched_destroy_group(struct task_group *tg)
7043{ 7650{
7044 struct cfs_rq *cfs_rq; 7651 struct cfs_rq *cfs_rq = NULL;
7652 struct rt_rq *rt_rq = NULL;
7045 int i; 7653 int i;
7046 7654
7655 lock_task_group_list();
7047 for_each_possible_cpu(i) { 7656 for_each_possible_cpu(i) {
7048 cfs_rq = tg->cfs_rq[i]; 7657 cfs_rq = tg->cfs_rq[i];
7049 list_del_rcu(&cfs_rq->leaf_cfs_rq_list); 7658 list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
7659 rt_rq = tg->rt_rq[i];
7660 list_del_rcu(&rt_rq->leaf_rt_rq_list);
7050 } 7661 }
7662 list_del_rcu(&tg->list);
7663 unlock_task_group_list();
7051 7664
7052 cfs_rq = tg->cfs_rq[0]; 7665 BUG_ON(!cfs_rq);
7053 7666
7054 /* wait for possible concurrent references to cfs_rqs complete */ 7667 /* wait for possible concurrent references to cfs_rqs complete */
7055 call_rcu(&cfs_rq->rcu, free_sched_group); 7668 call_rcu(&tg->rcu, free_sched_group_rcu);
7056} 7669}
7057 7670
7058/* change task's runqueue when it moves between groups. 7671/* change task's runqueue when it moves between groups.
@@ -7068,12 +7681,9 @@ void sched_move_task(struct task_struct *tsk)
7068 7681
7069 rq = task_rq_lock(tsk, &flags); 7682 rq = task_rq_lock(tsk, &flags);
7070 7683
7071 if (tsk->sched_class != &fair_sched_class)
7072 goto done;
7073
7074 update_rq_clock(rq); 7684 update_rq_clock(rq);
7075 7685
7076 running = task_running(rq, tsk); 7686 running = task_current(rq, tsk);
7077 on_rq = tsk->se.on_rq; 7687 on_rq = tsk->se.on_rq;
7078 7688
7079 if (on_rq) { 7689 if (on_rq) {
@@ -7082,7 +7692,7 @@ void sched_move_task(struct task_struct *tsk)
7082 tsk->sched_class->put_prev_task(rq, tsk); 7692 tsk->sched_class->put_prev_task(rq, tsk);
7083 } 7693 }
7084 7694
7085 set_task_cfs_rq(tsk); 7695 set_task_rq(tsk, task_cpu(tsk));
7086 7696
7087 if (on_rq) { 7697 if (on_rq) {
7088 if (unlikely(running)) 7698 if (unlikely(running))
@@ -7090,45 +7700,82 @@ void sched_move_task(struct task_struct *tsk)
7090 enqueue_task(rq, tsk, 0); 7700 enqueue_task(rq, tsk, 0);
7091 } 7701 }
7092 7702
7093done:
7094 task_rq_unlock(rq, &flags); 7703 task_rq_unlock(rq, &flags);
7095} 7704}
7096 7705
7706/* rq->lock to be locked by caller */
7097static void set_se_shares(struct sched_entity *se, unsigned long shares) 7707static void set_se_shares(struct sched_entity *se, unsigned long shares)
7098{ 7708{
7099 struct cfs_rq *cfs_rq = se->cfs_rq; 7709 struct cfs_rq *cfs_rq = se->cfs_rq;
7100 struct rq *rq = cfs_rq->rq; 7710 struct rq *rq = cfs_rq->rq;
7101 int on_rq; 7711 int on_rq;
7102 7712
7103 spin_lock_irq(&rq->lock); 7713 if (!shares)
7714 shares = MIN_GROUP_SHARES;
7104 7715
7105 on_rq = se->on_rq; 7716 on_rq = se->on_rq;
7106 if (on_rq) 7717 if (on_rq) {
7107 dequeue_entity(cfs_rq, se, 0); 7718 dequeue_entity(cfs_rq, se, 0);
7719 dec_cpu_load(rq, se->load.weight);
7720 }
7108 7721
7109 se->load.weight = shares; 7722 se->load.weight = shares;
7110 se->load.inv_weight = div64_64((1ULL<<32), shares); 7723 se->load.inv_weight = div64_64((1ULL<<32), shares);
7111 7724
7112 if (on_rq) 7725 if (on_rq) {
7113 enqueue_entity(cfs_rq, se, 0); 7726 enqueue_entity(cfs_rq, se, 0);
7114 7727 inc_cpu_load(rq, se->load.weight);
7115 spin_unlock_irq(&rq->lock); 7728 }
7116} 7729}
7117 7730
7118int sched_group_set_shares(struct task_group *tg, unsigned long shares) 7731int sched_group_set_shares(struct task_group *tg, unsigned long shares)
7119{ 7732{
7120 int i; 7733 int i;
7734 struct cfs_rq *cfs_rq;
7735 struct rq *rq;
7121 7736
7122 spin_lock(&tg->lock); 7737 lock_task_group_list();
7123 if (tg->shares == shares) 7738 if (tg->shares == shares)
7124 goto done; 7739 goto done;
7125 7740
7741 if (shares < MIN_GROUP_SHARES)
7742 shares = MIN_GROUP_SHARES;
7743
7744 /*
7745 * Prevent any load balance activity (rebalance_shares,
7746 * load_balance_fair) from referring to this group first,
7747 * by taking it off the rq->leaf_cfs_rq_list on each cpu.
7748 */
7749 for_each_possible_cpu(i) {
7750 cfs_rq = tg->cfs_rq[i];
7751 list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
7752 }
7753
7754 /* wait for any ongoing reference to this group to finish */
7755 synchronize_sched();
7756
7757 /*
7758 * Now we are free to modify the group's share on each cpu
7759 * w/o tripping rebalance_share or load_balance_fair.
7760 */
7126 tg->shares = shares; 7761 tg->shares = shares;
7127 for_each_possible_cpu(i) 7762 for_each_possible_cpu(i) {
7763 spin_lock_irq(&cpu_rq(i)->lock);
7128 set_se_shares(tg->se[i], shares); 7764 set_se_shares(tg->se[i], shares);
7765 spin_unlock_irq(&cpu_rq(i)->lock);
7766 }
7129 7767
7768 /*
7769 * Enable load balance activity on this group, by inserting it back on
7770 * each cpu's rq->leaf_cfs_rq_list.
7771 */
7772 for_each_possible_cpu(i) {
7773 rq = cpu_rq(i);
7774 cfs_rq = tg->cfs_rq[i];
7775 list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
7776 }
7130done: 7777done:
7131 spin_unlock(&tg->lock); 7778 unlock_task_group_list();
7132 return 0; 7779 return 0;
7133} 7780}
7134 7781
@@ -7137,6 +7784,31 @@ unsigned long sched_group_shares(struct task_group *tg)
7137 return tg->shares; 7784 return tg->shares;
7138} 7785}
7139 7786
7787/*
7788 * Ensure the total rt_ratio <= sysctl_sched_rt_ratio
7789 */
7790int sched_group_set_rt_ratio(struct task_group *tg, unsigned long rt_ratio)
7791{
7792 struct task_group *tgi;
7793 unsigned long total = 0;
7794
7795 rcu_read_lock();
7796 list_for_each_entry_rcu(tgi, &task_groups, list)
7797 total += tgi->rt_ratio;
7798 rcu_read_unlock();
7799
7800 if (total + rt_ratio - tg->rt_ratio > sysctl_sched_rt_ratio)
7801 return -EINVAL;
7802
7803 tg->rt_ratio = rt_ratio;
7804 return 0;
7805}
7806
7807unsigned long sched_group_rt_ratio(struct task_group *tg)
7808{
7809 return tg->rt_ratio;
7810}
7811
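sched_group_set_rt_ratio() is a plain admission check: the sum of all groups' ratios, with the caller's old value swapped for the requested one, must stay within sysctl_sched_rt_ratio. A toy illustration of that arithmetic (all values invented, including the cap):

#include <stdio.h>

int main(void)
{
	const unsigned long cap = 62259;		/* assumed sysctl_sched_rt_ratio */
	unsigned long ratio[2] = { 10000, 20000 };	/* current per-group ratios */
	unsigned long total = ratio[0] + ratio[1];
	unsigned long wanted = 45000;			/* group 0 requests an increase */

	/* same test as the kernel: total + new - old must not exceed the cap */
	if (total + wanted - ratio[0] > cap)
		printf("rejected: %lu > %lu\n", total + wanted - ratio[0], cap);
	else
		printf("accepted\n");
	return 0;					/* 65000 > 62259: rejected */
}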
7140#endif /* CONFIG_FAIR_GROUP_SCHED */ 7812#endif /* CONFIG_FAIR_GROUP_SCHED */
7141 7813
7142#ifdef CONFIG_FAIR_CGROUP_SCHED 7814#ifdef CONFIG_FAIR_CGROUP_SCHED
@@ -7173,16 +7845,17 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
7173 return &tg->css; 7845 return &tg->css;
7174} 7846}
7175 7847
7176static void cpu_cgroup_destroy(struct cgroup_subsys *ss, 7848static void
7177 struct cgroup *cgrp) 7849cpu_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
7178{ 7850{
7179 struct task_group *tg = cgroup_tg(cgrp); 7851 struct task_group *tg = cgroup_tg(cgrp);
7180 7852
7181 sched_destroy_group(tg); 7853 sched_destroy_group(tg);
7182} 7854}
7183 7855
7184static int cpu_cgroup_can_attach(struct cgroup_subsys *ss, 7856static int
7185 struct cgroup *cgrp, struct task_struct *tsk) 7857cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
7858 struct task_struct *tsk)
7186{ 7859{
7187 /* We don't support RT-tasks being in separate groups */ 7860 /* We don't support RT-tasks being in separate groups */
7188 if (tsk->sched_class != &fair_sched_class) 7861 if (tsk->sched_class != &fair_sched_class)
@@ -7211,26 +7884,169 @@ static u64 cpu_shares_read_uint(struct cgroup *cgrp, struct cftype *cft)
7211 return (u64) tg->shares; 7884 return (u64) tg->shares;
7212} 7885}
7213 7886
7214static struct cftype cpu_shares = { 7887static int cpu_rt_ratio_write_uint(struct cgroup *cgrp, struct cftype *cftype,
7215 .name = "shares", 7888 u64 rt_ratio_val)
7216 .read_uint = cpu_shares_read_uint, 7889{
7217 .write_uint = cpu_shares_write_uint, 7890 return sched_group_set_rt_ratio(cgroup_tg(cgrp), rt_ratio_val);
7891}
7892
7893static u64 cpu_rt_ratio_read_uint(struct cgroup *cgrp, struct cftype *cft)
7894{
7895 struct task_group *tg = cgroup_tg(cgrp);
7896
7897 return (u64) tg->rt_ratio;
7898}
7899
7900static struct cftype cpu_files[] = {
7901 {
7902 .name = "shares",
7903 .read_uint = cpu_shares_read_uint,
7904 .write_uint = cpu_shares_write_uint,
7905 },
7906 {
7907 .name = "rt_ratio",
7908 .read_uint = cpu_rt_ratio_read_uint,
7909 .write_uint = cpu_rt_ratio_write_uint,
7910 },
7218}; 7911};
7219 7912
7220static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont) 7913static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont)
7221{ 7914{
7222 return cgroup_add_file(cont, ss, &cpu_shares); 7915 return cgroup_add_files(cont, ss, cpu_files, ARRAY_SIZE(cpu_files));
7223} 7916}
7224 7917
7225struct cgroup_subsys cpu_cgroup_subsys = { 7918struct cgroup_subsys cpu_cgroup_subsys = {
7226 .name = "cpu", 7919 .name = "cpu",
7227 .create = cpu_cgroup_create, 7920 .create = cpu_cgroup_create,
7228 .destroy = cpu_cgroup_destroy, 7921 .destroy = cpu_cgroup_destroy,
7229 .can_attach = cpu_cgroup_can_attach, 7922 .can_attach = cpu_cgroup_can_attach,
7230 .attach = cpu_cgroup_attach, 7923 .attach = cpu_cgroup_attach,
7231 .populate = cpu_cgroup_populate, 7924 .populate = cpu_cgroup_populate,
7232 .subsys_id = cpu_cgroup_subsys_id, 7925 .subsys_id = cpu_cgroup_subsys_id,
7233 .early_init = 1, 7926 .early_init = 1,
7234}; 7927};
7235 7928
7236#endif /* CONFIG_FAIR_CGROUP_SCHED */ 7929#endif /* CONFIG_FAIR_CGROUP_SCHED */
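With CONFIG_FAIR_CGROUP_SCHED the two control files above show up in every cpu-controller cgroup as cpu.shares and cpu.rt_ratio. A small userspace writer; the /cgroup mount point and the "batch" group name are assumptions, not something this patch defines:

#include <stdio.h>

/* Write one value into a cgroup control file; returns 0 on success. */
static int cgroup_write(const char *path, unsigned long long val)
{
	FILE *f = fopen(path, "w");

	if (!f)
		return -1;
	fprintf(f, "%llu\n", val);
	return fclose(f);
}

int main(void)
{
	/* hypothetical group created earlier with: mkdir /cgroup/batch */
	if (cgroup_write("/cgroup/batch/cpu.shares", 512))
		perror("cpu.shares");
	if (cgroup_write("/cgroup/batch/cpu.rt_ratio", 0))
		perror("cpu.rt_ratio");
	return 0;
}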
7930
7931#ifdef CONFIG_CGROUP_CPUACCT
7932
7933/*
7934 * CPU accounting code for task groups.
7935 *
7936 * Based on the work by Paul Menage (menage@google.com) and Balbir Singh
7937 * (balbir@in.ibm.com).
7938 */
7939
7940/* track cpu usage of a group of tasks */
7941struct cpuacct {
7942 struct cgroup_subsys_state css;
7943 /* cpuusage holds pointer to a u64-type object on every cpu */
7944 u64 *cpuusage;
7945};
7946
7947struct cgroup_subsys cpuacct_subsys;
7948
7949/* return cpu accounting group corresponding to this container */
7950static inline struct cpuacct *cgroup_ca(struct cgroup *cont)
7951{
7952 return container_of(cgroup_subsys_state(cont, cpuacct_subsys_id),
7953 struct cpuacct, css);
7954}
7955
7956/* return cpu accounting group to which this task belongs */
7957static inline struct cpuacct *task_ca(struct task_struct *tsk)
7958{
7959 return container_of(task_subsys_state(tsk, cpuacct_subsys_id),
7960 struct cpuacct, css);
7961}
7962
7963/* create a new cpu accounting group */
7964static struct cgroup_subsys_state *cpuacct_create(
7965 struct cgroup_subsys *ss, struct cgroup *cont)
7966{
7967 struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL);
7968
7969 if (!ca)
7970 return ERR_PTR(-ENOMEM);
7971
7972 ca->cpuusage = alloc_percpu(u64);
7973 if (!ca->cpuusage) {
7974 kfree(ca);
7975 return ERR_PTR(-ENOMEM);
7976 }
7977
7978 return &ca->css;
7979}
7980
7981/* destroy an existing cpu accounting group */
7982static void
7983cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
7984{
7985 struct cpuacct *ca = cgroup_ca(cont);
7986
7987 free_percpu(ca->cpuusage);
7988 kfree(ca);
7989}
7990
7991/* return total cpu usage (in nanoseconds) of a group */
7992static u64 cpuusage_read(struct cgroup *cont, struct cftype *cft)
7993{
7994 struct cpuacct *ca = cgroup_ca(cont);
7995 u64 totalcpuusage = 0;
7996 int i;
7997
7998 for_each_possible_cpu(i) {
7999 u64 *cpuusage = percpu_ptr(ca->cpuusage, i);
8000
8001 /*
8002 * Take rq->lock to make 64-bit addition safe on 32-bit
8003 * platforms.
8004 */
8005 spin_lock_irq(&cpu_rq(i)->lock);
8006 totalcpuusage += *cpuusage;
8007 spin_unlock_irq(&cpu_rq(i)->lock);
8008 }
8009
8010 return totalcpuusage;
8011}
8012
8013static struct cftype files[] = {
8014 {
8015 .name = "usage",
8016 .read_uint = cpuusage_read,
8017 },
8018};
8019
8020static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cont)
8021{
8022 return cgroup_add_files(cont, ss, files, ARRAY_SIZE(files));
8023}
8024
8025/*
8026 * charge this task's execution time to its accounting group.
8027 *
8028 * called with rq->lock held.
8029 */
8030static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
8031{
8032 struct cpuacct *ca;
8033
8034 if (!cpuacct_subsys.active)
8035 return;
8036
8037 ca = task_ca(tsk);
8038 if (ca) {
8039 u64 *cpuusage = percpu_ptr(ca->cpuusage, task_cpu(tsk));
8040
8041 *cpuusage += cputime;
8042 }
8043}
8044
8045struct cgroup_subsys cpuacct_subsys = {
8046 .name = "cpuacct",
8047 .create = cpuacct_create,
8048 .destroy = cpuacct_destroy,
8049 .populate = cpuacct_populate,
8050 .subsys_id = cpuacct_subsys_id,
8051};
8052#endif /* CONFIG_CGROUP_CPUACCT */
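cpuacct.usage, defined above, reports the group's cumulative CPU time in nanoseconds. A matching userspace reader, again assuming the hierarchy is mounted at /cgroup with the cpuacct subsystem:

#include <stdio.h>

int main(void)
{
	unsigned long long ns = 0;
	FILE *f = fopen("/cgroup/cpuacct.usage", "r");	/* root group */

	if (!f) {
		perror("cpuacct.usage");
		return 1;
	}
	if (fscanf(f, "%llu", &ns) == 1)
		printf("group cpu time: %llu ns (%.3f s)\n", ns, ns / 1e9);
	fclose(f);
	return 0;
}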
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index e6fb392e5164..4b5e24cf2f4a 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -31,9 +31,9 @@
31/* 31/*
32 * Ease the printing of nsec fields: 32 * Ease the printing of nsec fields:
33 */ 33 */
34static long long nsec_high(long long nsec) 34static long long nsec_high(unsigned long long nsec)
35{ 35{
36 if (nsec < 0) { 36 if ((long long)nsec < 0) {
37 nsec = -nsec; 37 nsec = -nsec;
38 do_div(nsec, 1000000); 38 do_div(nsec, 1000000);
39 return -nsec; 39 return -nsec;
@@ -43,9 +43,9 @@ static long long nsec_high(long long nsec)
43 return nsec; 43 return nsec;
44} 44}
45 45
46static unsigned long nsec_low(long long nsec) 46static unsigned long nsec_low(unsigned long long nsec)
47{ 47{
48 if (nsec < 0) 48 if ((long long)nsec < 0)
49 nsec = -nsec; 49 nsec = -nsec;
50 50
51 return do_div(nsec, 1000000); 51 return do_div(nsec, 1000000);
@@ -80,6 +80,7 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p)
80static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) 80static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
81{ 81{
82 struct task_struct *g, *p; 82 struct task_struct *g, *p;
83 unsigned long flags;
83 84
84 SEQ_printf(m, 85 SEQ_printf(m,
85 "\nrunnable tasks:\n" 86 "\nrunnable tasks:\n"
@@ -88,7 +89,7 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
88 "------------------------------------------------------" 89 "------------------------------------------------------"
89 "----------------------------------------------------\n"); 90 "----------------------------------------------------\n");
90 91
91 read_lock_irq(&tasklist_lock); 92 read_lock_irqsave(&tasklist_lock, flags);
92 93
93 do_each_thread(g, p) { 94 do_each_thread(g, p) {
94 if (!p->se.on_rq || task_cpu(p) != rq_cpu) 95 if (!p->se.on_rq || task_cpu(p) != rq_cpu)
@@ -97,7 +98,7 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
97 print_task(m, rq, p); 98 print_task(m, rq, p);
98 } while_each_thread(g, p); 99 } while_each_thread(g, p);
99 100
100 read_unlock_irq(&tasklist_lock); 101 read_unlock_irqrestore(&tasklist_lock, flags);
101} 102}
102 103
103void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) 104void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
@@ -178,6 +179,7 @@ static void print_cpu(struct seq_file *m, int cpu)
178 PN(prev_clock_raw); 179 PN(prev_clock_raw);
179 P(clock_warps); 180 P(clock_warps);
180 P(clock_overflows); 181 P(clock_overflows);
182 P(clock_underflows);
181 P(clock_deep_idle_events); 183 P(clock_deep_idle_events);
182 PN(clock_max_delta); 184 PN(clock_max_delta);
183 P(cpu_load[0]); 185 P(cpu_load[0]);
@@ -198,7 +200,7 @@ static int sched_debug_show(struct seq_file *m, void *v)
198 u64 now = ktime_to_ns(ktime_get()); 200 u64 now = ktime_to_ns(ktime_get());
199 int cpu; 201 int cpu;
200 202
201 SEQ_printf(m, "Sched Debug Version: v0.06-v22, %s %.*s\n", 203 SEQ_printf(m, "Sched Debug Version: v0.07, %s %.*s\n",
202 init_utsname()->release, 204 init_utsname()->release,
203 (int)strcspn(init_utsname()->version, " "), 205 (int)strcspn(init_utsname()->version, " "),
204 init_utsname()->version); 206 init_utsname()->version);
@@ -210,7 +212,7 @@ static int sched_debug_show(struct seq_file *m, void *v)
210#define PN(x) \ 212#define PN(x) \
211 SEQ_printf(m, " .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x)) 213 SEQ_printf(m, " .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x))
212 PN(sysctl_sched_latency); 214 PN(sysctl_sched_latency);
213 PN(sysctl_sched_nr_latency); 215 PN(sysctl_sched_min_granularity);
214 PN(sysctl_sched_wakeup_granularity); 216 PN(sysctl_sched_wakeup_granularity);
215 PN(sysctl_sched_batch_wakeup_granularity); 217 PN(sysctl_sched_batch_wakeup_granularity);
216 PN(sysctl_sched_child_runs_first); 218 PN(sysctl_sched_child_runs_first);
@@ -298,6 +300,8 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
298 PN(se.exec_max); 300 PN(se.exec_max);
299 PN(se.slice_max); 301 PN(se.slice_max);
300 PN(se.wait_max); 302 PN(se.wait_max);
303 PN(se.wait_sum);
304 P(se.wait_count);
301 P(sched_info.bkl_count); 305 P(sched_info.bkl_count);
302 P(se.nr_migrations); 306 P(se.nr_migrations);
303 P(se.nr_migrations_cold); 307 P(se.nr_migrations_cold);
@@ -326,10 +330,12 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
326 avg_atom = -1LL; 330 avg_atom = -1LL;
327 331
328 avg_per_cpu = p->se.sum_exec_runtime; 332 avg_per_cpu = p->se.sum_exec_runtime;
329 if (p->se.nr_migrations) 333 if (p->se.nr_migrations) {
330 avg_per_cpu = div64_64(avg_per_cpu, p->se.nr_migrations); 334 avg_per_cpu = div64_64(avg_per_cpu,
331 else 335 p->se.nr_migrations);
336 } else {
332 avg_per_cpu = -1LL; 337 avg_per_cpu = -1LL;
338 }
333 339
334 __PN(avg_atom); 340 __PN(avg_atom);
335 __PN(avg_per_cpu); 341 __PN(avg_per_cpu);
@@ -363,6 +369,8 @@ void proc_sched_set_task(struct task_struct *p)
363{ 369{
364#ifdef CONFIG_SCHEDSTATS 370#ifdef CONFIG_SCHEDSTATS
365 p->se.wait_max = 0; 371 p->se.wait_max = 0;
372 p->se.wait_sum = 0;
373 p->se.wait_count = 0;
366 p->se.sleep_max = 0; 374 p->se.sleep_max = 0;
367 p->se.sum_sleep_runtime = 0; 375 p->se.sum_sleep_runtime = 0;
368 p->se.block_max = 0; 376 p->se.block_max = 0;
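
For reference, the nsec_high()/nsec_low() pair changed above only splits a signed nanosecond count into whole milliseconds plus a six-digit remainder for printing. A minimal userspace sketch of the same arithmetic, with plain division standing in for do_div():

#include <stdio.h>

static long long nsec_high(long long nsec)
{
        if (nsec < 0)
                return -(-nsec / 1000000);
        return nsec / 1000000;
}

static unsigned long nsec_low(long long nsec)
{
        if (nsec < 0)
                nsec = -nsec;
        return (unsigned long)(nsec % 1000000);
}

int main(void)
{
        long long v = 20000000LL;       /* 20 ms expressed in nanoseconds */

        printf("%lld.%06lu\n", nsec_high(v), nsec_low(v)); /* prints 20.000000 */
        return 0;
}
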
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 9971831b560e..6c091d6e159d 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -20,9 +20,11 @@
20 * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> 20 * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
21 */ 21 */
22 22
23#include <linux/latencytop.h>
24
23/* 25/*
24 * Targeted preemption latency for CPU-bound tasks: 26 * Targeted preemption latency for CPU-bound tasks:
25 * (default: 20ms, units: nanoseconds) 27 * (default: 20ms * (1 + ilog(ncpus)), units: nanoseconds)
26 * 28 *
27 * NOTE: this latency value is not the same as the concept of 29 * NOTE: this latency value is not the same as the concept of
28 * 'timeslice length' - timeslices in CFS are of variable length 30 * 'timeslice length' - timeslices in CFS are of variable length
@@ -32,19 +34,24 @@
32 * (to see the precise effective timeslice length of your workload, 34 * (to see the precise effective timeslice length of your workload,
33 * run vmstat and monitor the context-switches (cs) field) 35 * run vmstat and monitor the context-switches (cs) field)
34 */ 36 */
35const_debug unsigned int sysctl_sched_latency = 20000000ULL; 37unsigned int sysctl_sched_latency = 20000000ULL;
36 38
37/* 39/*
38 * After fork, child runs first. (default) If set to 0 then 40 * Minimal preemption granularity for CPU-bound tasks:
39 * parent will (try to) run first. 41 * (default: 4 msec * (1 + ilog(ncpus)), units: nanoseconds)
40 */ 42 */
41const_debug unsigned int sysctl_sched_child_runs_first = 1; 43unsigned int sysctl_sched_min_granularity = 4000000ULL;
42 44
43/* 45/*
44 * Minimal preemption granularity for CPU-bound tasks: 46 * is kept at sysctl_sched_latency / sysctl_sched_min_granularity
45 * (default: 2 msec, units: nanoseconds)
46 */ 47 */
47const_debug unsigned int sysctl_sched_nr_latency = 20; 48static unsigned int sched_nr_latency = 5;
49
50/*
51 * After fork, child runs first. (default) If set to 0 then
52 * parent will (try to) run first.
53 */
54const_debug unsigned int sysctl_sched_child_runs_first = 1;
48 55
49/* 56/*
50 * sys_sched_yield() compat mode 57 * sys_sched_yield() compat mode
@@ -56,23 +63,23 @@ unsigned int __read_mostly sysctl_sched_compat_yield;
56 63
57/* 64/*
58 * SCHED_BATCH wake-up granularity. 65 * SCHED_BATCH wake-up granularity.
59 * (default: 10 msec, units: nanoseconds) 66 * (default: 10 msec * (1 + ilog(ncpus)), units: nanoseconds)
60 * 67 *
61 * This option delays the preemption effects of decoupled workloads 68 * This option delays the preemption effects of decoupled workloads
62 * and reduces their over-scheduling. Synchronous workloads will still 69 * and reduces their over-scheduling. Synchronous workloads will still
63 * have immediate wakeup/sleep latencies. 70 * have immediate wakeup/sleep latencies.
64 */ 71 */
65const_debug unsigned int sysctl_sched_batch_wakeup_granularity = 10000000UL; 72unsigned int sysctl_sched_batch_wakeup_granularity = 10000000UL;
66 73
67/* 74/*
68 * SCHED_OTHER wake-up granularity. 75 * SCHED_OTHER wake-up granularity.
69 * (default: 10 msec, units: nanoseconds) 76 * (default: 10 msec * (1 + ilog(ncpus)), units: nanoseconds)
70 * 77 *
71 * This option delays the preemption effects of decoupled workloads 78 * This option delays the preemption effects of decoupled workloads
72 * and reduces their over-scheduling. Synchronous workloads will still 79 * and reduces their over-scheduling. Synchronous workloads will still
73 * have immediate wakeup/sleep latencies. 80 * have immediate wakeup/sleep latencies.
74 */ 81 */
75const_debug unsigned int sysctl_sched_wakeup_granularity = 10000000UL; 82unsigned int sysctl_sched_wakeup_granularity = 10000000UL;
76 83
77const_debug unsigned int sysctl_sched_migration_cost = 500000UL; 84const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
78 85
@@ -212,6 +219,22 @@ static inline struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
212 * Scheduling class statistics methods: 219 * Scheduling class statistics methods:
213 */ 220 */
214 221
222#ifdef CONFIG_SCHED_DEBUG
223int sched_nr_latency_handler(struct ctl_table *table, int write,
224 struct file *filp, void __user *buffer, size_t *lenp,
225 loff_t *ppos)
226{
227 int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos);
228
229 if (ret || !write)
230 return ret;
231
232 sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
233 sysctl_sched_min_granularity);
234
235 return 0;
236}
237#endif
215 238
216/* 239/*
217 * The idea is to set a period in which each task runs once. 240 * The idea is to set a period in which each task runs once.
@@ -224,11 +247,11 @@ static inline struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
224static u64 __sched_period(unsigned long nr_running) 247static u64 __sched_period(unsigned long nr_running)
225{ 248{
226 u64 period = sysctl_sched_latency; 249 u64 period = sysctl_sched_latency;
227 unsigned long nr_latency = sysctl_sched_nr_latency; 250 unsigned long nr_latency = sched_nr_latency;
228 251
229 if (unlikely(nr_running > nr_latency)) { 252 if (unlikely(nr_running > nr_latency)) {
253 period = sysctl_sched_min_granularity;
230 period *= nr_running; 254 period *= nr_running;
231 do_div(period, nr_latency);
232 } 255 }
233 256
234 return period; 257 return period;
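
With the defaults introduced by this patch (20 ms latency, 4 ms minimum granularity, so sched_nr_latency becomes DIV_ROUND_UP(20 ms, 4 ms) = 5), the period stays at 20 ms for up to five runnable tasks and then grows linearly at 4 ms per task. A small sketch of that calculation, using the patch's default values as assumptions:

#include <stdio.h>

#define SCHED_LATENCY_NS        20000000ULL
#define MIN_GRANULARITY_NS       4000000ULL
#define NR_LATENCY              5       /* DIV_ROUND_UP(20 ms, 4 ms) */

static unsigned long long sched_period(unsigned long nr_running)
{
        if (nr_running > NR_LATENCY)
                return MIN_GRANULARITY_NS * nr_running;
        return SCHED_LATENCY_NS;
}

int main(void)
{
        unsigned long nr;

        for (nr = 1; nr <= 8; nr++)
                printf("nr_running=%lu period=%llu ns\n", nr, sched_period(nr));
        /* 1..5 tasks -> 20000000 ns; 6 -> 24000000; 7 -> 28000000; 8 -> 32000000 */
        return 0;
}
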
@@ -259,6 +282,7 @@ static u64 __sched_vslice(unsigned long rq_weight, unsigned long nr_running)
259{ 282{
260 u64 vslice = __sched_period(nr_running); 283 u64 vslice = __sched_period(nr_running);
261 284
285 vslice *= NICE_0_LOAD;
262 do_div(vslice, rq_weight); 286 do_div(vslice, rq_weight);
263 287
264 return vslice; 288 return vslice;
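
The added NICE_0_LOAD factor makes the virtual slice a weighted share of the period: vslice = period * NICE_0_LOAD / rq_weight. A sketch assuming NICE_0_LOAD is 1024 (the customary nice-0 weight, which is not spelled out in this hunk):

#include <stdio.h>

#define NICE_0_LOAD     1024ULL         /* assumption for illustration */

static unsigned long long sched_vslice(unsigned long long period,
                                       unsigned long rq_weight)
{
        return period * NICE_0_LOAD / rq_weight;
}

int main(void)
{
        unsigned long long period = 20000000ULL;        /* 20 ms */

        printf("%llu\n", sched_vslice(period, 1024));   /* one nice-0 task: 20 ms */
        printf("%llu\n", sched_vslice(period, 2048));   /* two nice-0 tasks: 10 ms */
        return 0;
}
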
@@ -329,6 +353,12 @@ static void update_curr(struct cfs_rq *cfs_rq)
329 353
330 __update_curr(cfs_rq, curr, delta_exec); 354 __update_curr(cfs_rq, curr, delta_exec);
331 curr->exec_start = now; 355 curr->exec_start = now;
356
357 if (entity_is_task(curr)) {
358 struct task_struct *curtask = task_of(curr);
359
360 cpuacct_charge(curtask, delta_exec);
361 }
332} 362}
333 363
334static inline void 364static inline void
@@ -355,6 +385,9 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
355{ 385{
356 schedstat_set(se->wait_max, max(se->wait_max, 386 schedstat_set(se->wait_max, max(se->wait_max,
357 rq_of(cfs_rq)->clock - se->wait_start)); 387 rq_of(cfs_rq)->clock - se->wait_start));
388 schedstat_set(se->wait_count, se->wait_count + 1);
389 schedstat_set(se->wait_sum, se->wait_sum +
390 rq_of(cfs_rq)->clock - se->wait_start);
358 schedstat_set(se->wait_start, 0); 391 schedstat_set(se->wait_start, 0);
359} 392}
360 393
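
The new wait_sum and wait_count fields are enough to derive an average wait latency (wait_sum / wait_count); a trivial sketch of that division with made-up values:

#include <stdio.h>

int main(void)
{
        unsigned long long wait_sum = 1500000;  /* ns, illustrative value */
        unsigned long long wait_count = 3;      /* illustrative value */

        if (wait_count)
                printf("avg wait: %llu ns\n", wait_sum / wait_count); /* 500000 ns */
        return 0;
}
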
@@ -406,6 +439,7 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
406#ifdef CONFIG_SCHEDSTATS 439#ifdef CONFIG_SCHEDSTATS
407 if (se->sleep_start) { 440 if (se->sleep_start) {
408 u64 delta = rq_of(cfs_rq)->clock - se->sleep_start; 441 u64 delta = rq_of(cfs_rq)->clock - se->sleep_start;
442 struct task_struct *tsk = task_of(se);
409 443
410 if ((s64)delta < 0) 444 if ((s64)delta < 0)
411 delta = 0; 445 delta = 0;
@@ -415,9 +449,12 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
415 449
416 se->sleep_start = 0; 450 se->sleep_start = 0;
417 se->sum_sleep_runtime += delta; 451 se->sum_sleep_runtime += delta;
452
453 account_scheduler_latency(tsk, delta >> 10, 1);
418 } 454 }
419 if (se->block_start) { 455 if (se->block_start) {
420 u64 delta = rq_of(cfs_rq)->clock - se->block_start; 456 u64 delta = rq_of(cfs_rq)->clock - se->block_start;
457 struct task_struct *tsk = task_of(se);
421 458
422 if ((s64)delta < 0) 459 if ((s64)delta < 0)
423 delta = 0; 460 delta = 0;
@@ -434,11 +471,11 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
434 * time that the task spent sleeping: 471 * time that the task spent sleeping:
435 */ 472 */
436 if (unlikely(prof_on == SLEEP_PROFILING)) { 473 if (unlikely(prof_on == SLEEP_PROFILING)) {
437 struct task_struct *tsk = task_of(se);
438 474
439 profile_hits(SLEEP_PROFILING, (void *)get_wchan(tsk), 475 profile_hits(SLEEP_PROFILING, (void *)get_wchan(tsk),
440 delta >> 20); 476 delta >> 20);
441 } 477 }
478 account_scheduler_latency(tsk, delta >> 10, 0);
442 } 479 }
443#endif 480#endif
444} 481}
@@ -472,19 +509,25 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
472 } else if (sched_feat(APPROX_AVG) && cfs_rq->nr_running) 509 } else if (sched_feat(APPROX_AVG) && cfs_rq->nr_running)
473 vruntime += sched_vslice(cfs_rq)/2; 510 vruntime += sched_vslice(cfs_rq)/2;
474 511
512 /*
513 * The 'current' period is already promised to the current tasks,
514 * however the extra weight of the new task will slow them down a
515 * little, place the new task so that it fits in the slot that
516 * stays open at the end.
517 */
475 if (initial && sched_feat(START_DEBIT)) 518 if (initial && sched_feat(START_DEBIT))
476 vruntime += sched_vslice_add(cfs_rq, se); 519 vruntime += sched_vslice_add(cfs_rq, se);
477 520
478 if (!initial) { 521 if (!initial) {
 479 if (sched_feat(NEW_FAIR_SLEEPERS) && entity_is_task(se) && 522 /* sleeps up to a single latency don't count. */
480 task_of(se)->policy != SCHED_BATCH) 523 if (sched_feat(NEW_FAIR_SLEEPERS))
481 vruntime -= sysctl_sched_latency; 524 vruntime -= sysctl_sched_latency;
482 525
483 vruntime = max_t(s64, vruntime, se->vruntime); 526 /* ensure we never gain time by being placed backwards. */
527 vruntime = max_vruntime(se->vruntime, vruntime);
484 } 528 }
485 529
486 se->vruntime = vruntime; 530 se->vruntime = vruntime;
487
488} 531}
489 532
490static void 533static void
@@ -517,7 +560,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
517 560
518 update_stats_dequeue(cfs_rq, se); 561 update_stats_dequeue(cfs_rq, se);
519 if (sleep) { 562 if (sleep) {
520 se->peer_preempt = 0;
521#ifdef CONFIG_SCHEDSTATS 563#ifdef CONFIG_SCHEDSTATS
522 if (entity_is_task(se)) { 564 if (entity_is_task(se)) {
523 struct task_struct *tsk = task_of(se); 565 struct task_struct *tsk = task_of(se);
@@ -545,10 +587,8 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
545 587
546 ideal_runtime = sched_slice(cfs_rq, curr); 588 ideal_runtime = sched_slice(cfs_rq, curr);
547 delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; 589 delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
548 if (delta_exec > ideal_runtime || 590 if (delta_exec > ideal_runtime)
549 (sched_feat(PREEMPT_RESTRICT) && curr->peer_preempt))
550 resched_task(rq_of(cfs_rq)->curr); 591 resched_task(rq_of(cfs_rq)->curr);
551 curr->peer_preempt = 0;
552} 592}
553 593
554static void 594static void
@@ -611,13 +651,29 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
611 cfs_rq->curr = NULL; 651 cfs_rq->curr = NULL;
612} 652}
613 653
614static void entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) 654static void
655entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
615{ 656{
616 /* 657 /*
617 * Update run-time statistics of the 'current'. 658 * Update run-time statistics of the 'current'.
618 */ 659 */
619 update_curr(cfs_rq); 660 update_curr(cfs_rq);
620 661
662#ifdef CONFIG_SCHED_HRTICK
663 /*
664 * queued ticks are scheduled to match the slice, so don't bother
665 * validating it and just reschedule.
666 */
667 if (queued)
668 return resched_task(rq_of(cfs_rq)->curr);
669 /*
670 * don't let the period tick interfere with the hrtick preemption
671 */
672 if (!sched_feat(DOUBLE_TICK) &&
673 hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
674 return;
675#endif
676
621 if (cfs_rq->nr_running > 1 || !sched_feat(WAKEUP_PREEMPT)) 677 if (cfs_rq->nr_running > 1 || !sched_feat(WAKEUP_PREEMPT))
622 check_preempt_tick(cfs_rq, curr); 678 check_preempt_tick(cfs_rq, curr);
623} 679}
@@ -659,7 +715,7 @@ static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu)
659 715
660/* Iterate thr' all leaf cfs_rq's on a runqueue */ 716/* Iterate thr' all leaf cfs_rq's on a runqueue */
661#define for_each_leaf_cfs_rq(rq, cfs_rq) \ 717#define for_each_leaf_cfs_rq(rq, cfs_rq) \
662 list_for_each_entry(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list) 718 list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
663 719
664/* Do the two (enqueued) entities belong to the same group ? */ 720/* Do the two (enqueued) entities belong to the same group ? */
665static inline int 721static inline int
@@ -676,6 +732,8 @@ static inline struct sched_entity *parent_entity(struct sched_entity *se)
676 return se->parent; 732 return se->parent;
677} 733}
678 734
735#define GROUP_IMBALANCE_PCT 20
736
679#else /* CONFIG_FAIR_GROUP_SCHED */ 737#else /* CONFIG_FAIR_GROUP_SCHED */
680 738
681#define for_each_sched_entity(se) \ 739#define for_each_sched_entity(se) \
@@ -721,6 +779,43 @@ static inline struct sched_entity *parent_entity(struct sched_entity *se)
721 779
722#endif /* CONFIG_FAIR_GROUP_SCHED */ 780#endif /* CONFIG_FAIR_GROUP_SCHED */
723 781
782#ifdef CONFIG_SCHED_HRTICK
783static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
784{
785 int requeue = rq->curr == p;
786 struct sched_entity *se = &p->se;
787 struct cfs_rq *cfs_rq = cfs_rq_of(se);
788
789 WARN_ON(task_rq(p) != rq);
790
791 if (hrtick_enabled(rq) && cfs_rq->nr_running > 1) {
792 u64 slice = sched_slice(cfs_rq, se);
793 u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
794 s64 delta = slice - ran;
795
796 if (delta < 0) {
797 if (rq->curr == p)
798 resched_task(p);
799 return;
800 }
801
802 /*
803 * Don't schedule slices shorter than 10000ns, that just
804 * doesn't make sense. Rely on vruntime for fairness.
805 */
806 if (!requeue)
807 delta = max(10000LL, delta);
808
809 hrtick_start(rq, delta, requeue);
810 }
811}
812#else
813static inline void
814hrtick_start_fair(struct rq *rq, struct task_struct *p)
815{
816}
817#endif
818
724/* 819/*
725 * The enqueue_task method is called before nr_running is 820 * The enqueue_task method is called before nr_running is
726 * increased. Here we update the fair scheduling stats and 821 * increased. Here we update the fair scheduling stats and
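
hrtick_start_fair(), added above, arms a high-resolution timer for whatever is left of the current slice, clamping the delay to 10000 ns when the task is not being requeued. A standalone sketch of just that delta computation:

#include <stdio.h>

static long long hrtick_delta(unsigned long long slice, unsigned long long ran,
                              int requeue)
{
        long long delta = (long long)slice - (long long)ran;

        if (delta < 0)
                return -1;              /* slice already used up: resched now */
        if (!requeue && delta < 10000)
                delta = 10000;          /* never arm a sub-10000ns timer */
        return delta;
}

int main(void)
{
        printf("%lld\n", hrtick_delta(20000000ULL, 5000000ULL, 1));  /* 15000000 */
        printf("%lld\n", hrtick_delta(20000000ULL, 19995000ULL, 0)); /* 10000 */
        printf("%lld\n", hrtick_delta(20000000ULL, 21000000ULL, 1)); /* -1 */
        return 0;
}
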
@@ -729,15 +824,28 @@ static inline struct sched_entity *parent_entity(struct sched_entity *se)
729static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup) 824static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup)
730{ 825{
731 struct cfs_rq *cfs_rq; 826 struct cfs_rq *cfs_rq;
732 struct sched_entity *se = &p->se; 827 struct sched_entity *se = &p->se,
828 *topse = NULL; /* Highest schedulable entity */
829 int incload = 1;
733 830
734 for_each_sched_entity(se) { 831 for_each_sched_entity(se) {
735 if (se->on_rq) 832 topse = se;
833 if (se->on_rq) {
834 incload = 0;
736 break; 835 break;
836 }
737 cfs_rq = cfs_rq_of(se); 837 cfs_rq = cfs_rq_of(se);
738 enqueue_entity(cfs_rq, se, wakeup); 838 enqueue_entity(cfs_rq, se, wakeup);
739 wakeup = 1; 839 wakeup = 1;
740 } 840 }
841 /* Increment cpu load if we just enqueued the first task of a group on
842 * 'rq->cpu'. 'topse' represents the group to which task 'p' belongs
843 * at the highest grouping level.
844 */
845 if (incload)
846 inc_cpu_load(rq, topse->load.weight);
847
848 hrtick_start_fair(rq, rq->curr);
741} 849}
742 850
743/* 851/*
@@ -748,16 +856,30 @@ static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup)
748static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep) 856static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep)
749{ 857{
750 struct cfs_rq *cfs_rq; 858 struct cfs_rq *cfs_rq;
751 struct sched_entity *se = &p->se; 859 struct sched_entity *se = &p->se,
860 *topse = NULL; /* Highest schedulable entity */
861 int decload = 1;
752 862
753 for_each_sched_entity(se) { 863 for_each_sched_entity(se) {
864 topse = se;
754 cfs_rq = cfs_rq_of(se); 865 cfs_rq = cfs_rq_of(se);
755 dequeue_entity(cfs_rq, se, sleep); 866 dequeue_entity(cfs_rq, se, sleep);
756 /* Don't dequeue parent if it has other entities besides us */ 867 /* Don't dequeue parent if it has other entities besides us */
757 if (cfs_rq->load.weight) 868 if (cfs_rq->load.weight) {
869 if (parent_entity(se))
870 decload = 0;
758 break; 871 break;
872 }
759 sleep = 1; 873 sleep = 1;
760 } 874 }
875 /* Decrement cpu load if we just dequeued the last task of a group on
876 * 'rq->cpu'. 'topse' represents the group to which task 'p' belongs
877 * at the highest grouping level.
878 */
879 if (decload)
880 dec_cpu_load(rq, topse->load.weight);
881
882 hrtick_start_fair(rq, rq->curr);
761} 883}
762 884
763/* 885/*
@@ -767,8 +889,9 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep)
767 */ 889 */
768static void yield_task_fair(struct rq *rq) 890static void yield_task_fair(struct rq *rq)
769{ 891{
770 struct cfs_rq *cfs_rq = task_cfs_rq(rq->curr); 892 struct task_struct *curr = rq->curr;
771 struct sched_entity *rightmost, *se = &rq->curr->se; 893 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
894 struct sched_entity *rightmost, *se = &curr->se;
772 895
773 /* 896 /*
774 * Are we the only task in the tree? 897 * Are we the only task in the tree?
@@ -776,7 +899,7 @@ static void yield_task_fair(struct rq *rq)
776 if (unlikely(cfs_rq->nr_running == 1)) 899 if (unlikely(cfs_rq->nr_running == 1))
777 return; 900 return;
778 901
779 if (likely(!sysctl_sched_compat_yield)) { 902 if (likely(!sysctl_sched_compat_yield) && curr->policy != SCHED_BATCH) {
780 __update_rq_clock(rq); 903 __update_rq_clock(rq);
781 /* 904 /*
782 * Update run-time statistics of the 'current'. 905 * Update run-time statistics of the 'current'.
@@ -804,6 +927,154 @@ static void yield_task_fair(struct rq *rq)
804} 927}
805 928
806/* 929/*
930 * wake_idle() will wake a task on an idle cpu if task->cpu is
931 * not idle and an idle cpu is available. The span of cpus to
932 * search starts with cpus closest then further out as needed,
933 * so we always favor a closer, idle cpu.
934 *
935 * Returns the CPU we should wake onto.
936 */
937#if defined(ARCH_HAS_SCHED_WAKE_IDLE)
938static int wake_idle(int cpu, struct task_struct *p)
939{
940 cpumask_t tmp;
941 struct sched_domain *sd;
942 int i;
943
944 /*
945 * If it is idle, then it is the best cpu to run this task.
946 *
947 * This cpu is also the best, if it has more than one task already.
 948 * Siblings must also be busy (in most cases) as they didn't already
 949 * pick up the extra load from this cpu and hence we need not check
 950 * sibling runqueue info. This will avoid the checks and cache miss
 951 * penalties associated with that.
952 */
953 if (idle_cpu(cpu) || cpu_rq(cpu)->nr_running > 1)
954 return cpu;
955
956 for_each_domain(cpu, sd) {
957 if (sd->flags & SD_WAKE_IDLE) {
958 cpus_and(tmp, sd->span, p->cpus_allowed);
959 for_each_cpu_mask(i, tmp) {
960 if (idle_cpu(i)) {
961 if (i != task_cpu(p)) {
962 schedstat_inc(p,
963 se.nr_wakeups_idle);
964 }
965 return i;
966 }
967 }
968 } else {
969 break;
970 }
971 }
972 return cpu;
973}
974#else
975static inline int wake_idle(int cpu, struct task_struct *p)
976{
977 return cpu;
978}
979#endif
980
981#ifdef CONFIG_SMP
982static int select_task_rq_fair(struct task_struct *p, int sync)
983{
984 int cpu, this_cpu;
985 struct rq *rq;
986 struct sched_domain *sd, *this_sd = NULL;
987 int new_cpu;
988
989 cpu = task_cpu(p);
990 rq = task_rq(p);
991 this_cpu = smp_processor_id();
992 new_cpu = cpu;
993
994 if (cpu == this_cpu)
995 goto out_set_cpu;
996
997 for_each_domain(this_cpu, sd) {
998 if (cpu_isset(cpu, sd->span)) {
999 this_sd = sd;
1000 break;
1001 }
1002 }
1003
1004 if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
1005 goto out_set_cpu;
1006
1007 /*
1008 * Check for affine wakeup and passive balancing possibilities.
1009 */
1010 if (this_sd) {
1011 int idx = this_sd->wake_idx;
1012 unsigned int imbalance;
1013 unsigned long load, this_load;
1014
1015 imbalance = 100 + (this_sd->imbalance_pct - 100) / 2;
1016
1017 load = source_load(cpu, idx);
1018 this_load = target_load(this_cpu, idx);
1019
1020 new_cpu = this_cpu; /* Wake to this CPU if we can */
1021
1022 if (this_sd->flags & SD_WAKE_AFFINE) {
1023 unsigned long tl = this_load;
1024 unsigned long tl_per_task;
1025
1026 /*
1027 * Attract cache-cold tasks on sync wakeups:
1028 */
1029 if (sync && !task_hot(p, rq->clock, this_sd))
1030 goto out_set_cpu;
1031
1032 schedstat_inc(p, se.nr_wakeups_affine_attempts);
1033 tl_per_task = cpu_avg_load_per_task(this_cpu);
1034
1035 /*
1036 * If sync wakeup then subtract the (maximum possible)
1037 * effect of the currently running task from the load
1038 * of the current CPU:
1039 */
1040 if (sync)
1041 tl -= current->se.load.weight;
1042
1043 if ((tl <= load &&
1044 tl + target_load(cpu, idx) <= tl_per_task) ||
1045 100*(tl + p->se.load.weight) <= imbalance*load) {
1046 /*
1047 * This domain has SD_WAKE_AFFINE and
1048 * p is cache cold in this domain, and
1049 * there is no bad imbalance.
1050 */
1051 schedstat_inc(this_sd, ttwu_move_affine);
1052 schedstat_inc(p, se.nr_wakeups_affine);
1053 goto out_set_cpu;
1054 }
1055 }
1056
1057 /*
1058 * Start passive balancing when half the imbalance_pct
1059 * limit is reached.
1060 */
1061 if (this_sd->flags & SD_WAKE_BALANCE) {
1062 if (imbalance*this_load <= 100*load) {
1063 schedstat_inc(this_sd, ttwu_move_balance);
1064 schedstat_inc(p, se.nr_wakeups_passive);
1065 goto out_set_cpu;
1066 }
1067 }
1068 }
1069
1070 new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */
1071out_set_cpu:
1072 return wake_idle(new_cpu, p);
1073}
1074#endif /* CONFIG_SMP */
1075
1076
1077/*
807 * Preempt the current task with a newly woken task if needed: 1078 * Preempt the current task with a newly woken task if needed:
808 */ 1079 */
809static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) 1080static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
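
The affine-wakeup test above halves the domain's imbalance margin and then compares 100 * (this_load + task_weight) against imbalance * prev_load (the sync and tl_per_task special cases are left out here). A worked sketch, assuming an imbalance_pct of 125, which is only a typical sched-domain default and not part of this patch:

#include <stdio.h>

static int wake_affine_ok(unsigned long this_load, unsigned long prev_load,
                          unsigned long p_weight, unsigned int imbalance_pct)
{
        unsigned int imbalance = 100 + (imbalance_pct - 100) / 2;

        /* Pull the task if the waking CPU would not end up noticeably more
         * loaded than the CPU the task last ran on. */
        return 100 * (this_load + p_weight) <= imbalance * prev_load;
}

int main(void)
{
        /* this_cpu load 1024, previous cpu load 2048, nice-0 task weight 1024 */
        printf("%d\n", wake_affine_ok(1024, 2048, 1024, 125)); /* 1: pull to this_cpu */
        /* this_cpu load 3072: 100*4096 > 112*2048, leave the task where it was */
        printf("%d\n", wake_affine_ok(3072, 2048, 1024, 125)); /* 0 */
        return 0;
}
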
@@ -811,7 +1082,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
811 struct task_struct *curr = rq->curr; 1082 struct task_struct *curr = rq->curr;
812 struct cfs_rq *cfs_rq = task_cfs_rq(curr); 1083 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
813 struct sched_entity *se = &curr->se, *pse = &p->se; 1084 struct sched_entity *se = &curr->se, *pse = &p->se;
814 s64 delta, gran; 1085 unsigned long gran;
815 1086
816 if (unlikely(rt_prio(p->prio))) { 1087 if (unlikely(rt_prio(p->prio))) {
817 update_rq_clock(rq); 1088 update_rq_clock(rq);
@@ -826,28 +1097,29 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
826 if (unlikely(p->policy == SCHED_BATCH)) 1097 if (unlikely(p->policy == SCHED_BATCH))
827 return; 1098 return;
828 1099
829 if (sched_feat(WAKEUP_PREEMPT)) { 1100 if (!sched_feat(WAKEUP_PREEMPT))
830 while (!is_same_group(se, pse)) { 1101 return;
831 se = parent_entity(se);
832 pse = parent_entity(pse);
833 }
834 1102
835 delta = se->vruntime - pse->vruntime; 1103 while (!is_same_group(se, pse)) {
836 gran = sysctl_sched_wakeup_granularity; 1104 se = parent_entity(se);
837 if (unlikely(se->load.weight != NICE_0_LOAD)) 1105 pse = parent_entity(pse);
838 gran = calc_delta_fair(gran, &se->load); 1106 }
839 1107
840 if (delta > gran) { 1108 gran = sysctl_sched_wakeup_granularity;
841 int now = !sched_feat(PREEMPT_RESTRICT); 1109 /*
1110 * More easily preempt - nice tasks, while not making
1111 * it harder for + nice tasks.
1112 */
1113 if (unlikely(se->load.weight > NICE_0_LOAD))
1114 gran = calc_delta_fair(gran, &se->load);
842 1115
843 if (now || p->prio < curr->prio || !se->peer_preempt++) 1116 if (pse->vruntime + gran < se->vruntime)
844 resched_task(curr); 1117 resched_task(curr);
845 }
846 }
847} 1118}
848 1119
849static struct task_struct *pick_next_task_fair(struct rq *rq) 1120static struct task_struct *pick_next_task_fair(struct rq *rq)
850{ 1121{
1122 struct task_struct *p;
851 struct cfs_rq *cfs_rq = &rq->cfs; 1123 struct cfs_rq *cfs_rq = &rq->cfs;
852 struct sched_entity *se; 1124 struct sched_entity *se;
853 1125
@@ -859,7 +1131,10 @@ static struct task_struct *pick_next_task_fair(struct rq *rq)
859 cfs_rq = group_cfs_rq(se); 1131 cfs_rq = group_cfs_rq(se);
860 } while (cfs_rq); 1132 } while (cfs_rq);
861 1133
862 return task_of(se); 1134 p = task_of(se);
1135 hrtick_start_fair(rq, p);
1136
1137 return p;
863} 1138}
864 1139
865/* 1140/*
@@ -916,25 +1191,6 @@ static struct task_struct *load_balance_next_fair(void *arg)
916 return __load_balance_iterator(cfs_rq, cfs_rq->rb_load_balance_curr); 1191 return __load_balance_iterator(cfs_rq, cfs_rq->rb_load_balance_curr);
917} 1192}
918 1193
919#ifdef CONFIG_FAIR_GROUP_SCHED
920static int cfs_rq_best_prio(struct cfs_rq *cfs_rq)
921{
922 struct sched_entity *curr;
923 struct task_struct *p;
924
925 if (!cfs_rq->nr_running)
926 return MAX_PRIO;
927
928 curr = cfs_rq->curr;
929 if (!curr)
930 curr = __pick_next_entity(cfs_rq);
931
932 p = task_of(curr);
933
934 return p->prio;
935}
936#endif
937
938static unsigned long 1194static unsigned long
939load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, 1195load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
940 unsigned long max_load_move, 1196 unsigned long max_load_move,
@@ -944,28 +1200,45 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
944 struct cfs_rq *busy_cfs_rq; 1200 struct cfs_rq *busy_cfs_rq;
945 long rem_load_move = max_load_move; 1201 long rem_load_move = max_load_move;
946 struct rq_iterator cfs_rq_iterator; 1202 struct rq_iterator cfs_rq_iterator;
1203 unsigned long load_moved;
947 1204
948 cfs_rq_iterator.start = load_balance_start_fair; 1205 cfs_rq_iterator.start = load_balance_start_fair;
949 cfs_rq_iterator.next = load_balance_next_fair; 1206 cfs_rq_iterator.next = load_balance_next_fair;
950 1207
951 for_each_leaf_cfs_rq(busiest, busy_cfs_rq) { 1208 for_each_leaf_cfs_rq(busiest, busy_cfs_rq) {
952#ifdef CONFIG_FAIR_GROUP_SCHED 1209#ifdef CONFIG_FAIR_GROUP_SCHED
953 struct cfs_rq *this_cfs_rq; 1210 struct cfs_rq *this_cfs_rq = busy_cfs_rq->tg->cfs_rq[this_cpu];
954 long imbalance; 1211 unsigned long maxload, task_load, group_weight;
955 unsigned long maxload; 1212 unsigned long thisload, per_task_load;
1213 struct sched_entity *se = busy_cfs_rq->tg->se[busiest->cpu];
956 1214
957 this_cfs_rq = cpu_cfs_rq(busy_cfs_rq, this_cpu); 1215 task_load = busy_cfs_rq->load.weight;
1216 group_weight = se->load.weight;
958 1217
959 imbalance = busy_cfs_rq->load.weight - this_cfs_rq->load.weight; 1218 /*
960 /* Don't pull if this_cfs_rq has more load than busy_cfs_rq */ 1219 * 'group_weight' is contributed by tasks of total weight
961 if (imbalance <= 0) 1220 * 'task_load'. To move 'rem_load_move' worth of weight only,
1221 * we need to move a maximum task load of:
1222 *
1223 * maxload = (remload / group_weight) * task_load;
1224 */
1225 maxload = (rem_load_move * task_load) / group_weight;
1226
1227 if (!maxload || !task_load)
962 continue; 1228 continue;
963 1229
964 /* Don't pull more than imbalance/2 */ 1230 per_task_load = task_load / busy_cfs_rq->nr_running;
965 imbalance /= 2; 1231 /*
 966 maxload = min(rem_load_move, imbalance); 1232 * balance_tasks will try to forcibly move at least one task if
1233 * possible (because of SCHED_LOAD_SCALE_FUZZ). Avoid that if
 1234 * maxload is less than GROUP_IMBALANCE_PCT% of the per_task_load.
1235 */
1236 if (100 * maxload < GROUP_IMBALANCE_PCT * per_task_load)
1237 continue;
967 1238
968 *this_best_prio = cfs_rq_best_prio(this_cfs_rq); 1239 /* Disable priority-based load balance */
1240 *this_best_prio = 0;
1241 thisload = this_cfs_rq->load.weight;
969#else 1242#else
970# define maxload rem_load_move 1243# define maxload rem_load_move
971#endif 1244#endif
@@ -974,11 +1247,33 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
974 * load_balance_[start|next]_fair iterators 1247 * load_balance_[start|next]_fair iterators
975 */ 1248 */
976 cfs_rq_iterator.arg = busy_cfs_rq; 1249 cfs_rq_iterator.arg = busy_cfs_rq;
977 rem_load_move -= balance_tasks(this_rq, this_cpu, busiest, 1250 load_moved = balance_tasks(this_rq, this_cpu, busiest,
978 maxload, sd, idle, all_pinned, 1251 maxload, sd, idle, all_pinned,
979 this_best_prio, 1252 this_best_prio,
980 &cfs_rq_iterator); 1253 &cfs_rq_iterator);
981 1254
1255#ifdef CONFIG_FAIR_GROUP_SCHED
1256 /*
1257 * load_moved holds the task load that was moved. The
1258 * effective (group) weight moved would be:
1259 * load_moved_eff = load_moved/task_load * group_weight;
1260 */
1261 load_moved = (group_weight * load_moved) / task_load;
1262
1263 /* Adjust shares on both cpus to reflect load_moved */
1264 group_weight -= load_moved;
1265 set_se_shares(se, group_weight);
1266
1267 se = busy_cfs_rq->tg->se[this_cpu];
1268 if (!thisload)
1269 group_weight = load_moved;
1270 else
1271 group_weight = se->load.weight + load_moved;
1272 set_se_shares(se, group_weight);
1273#endif
1274
1275 rem_load_move -= load_moved;
1276
982 if (rem_load_move <= 0) 1277 if (rem_load_move <= 0)
983 break; 1278 break;
984 } 1279 }
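
The two conversions commented above, maxload = (remload / group_weight) * task_load on the way in and load_moved_eff = (load_moved / task_load) * group_weight on the way out, are easiest to see with numbers. A sketch with illustrative weights only:

#include <stdio.h>

int main(void)
{
        unsigned long rem_load_move = 512;  /* rq-level weight still to move */
        unsigned long task_load = 3072;     /* sum of task weights in the group's cfs_rq */
        unsigned long group_weight = 1024;  /* the group entity's own weight (shares) */

        /* Task load inside the group that corresponds to rem_load_move: */
        unsigned long maxload = rem_load_move * task_load / group_weight;
        printf("maxload = %lu\n", maxload);                     /* 1536 */

        /* Suppose balance_tasks() then moved one nice-0 task (weight 1024): */
        unsigned long load_moved = 1024;
        unsigned long load_moved_eff = group_weight * load_moved / task_load;
        printf("effective group weight moved = %lu\n", load_moved_eff); /* 341 */
        return 0;
}
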
@@ -1014,18 +1309,18 @@ move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
1014/* 1309/*
1015 * scheduler tick hitting a task of our scheduling class: 1310 * scheduler tick hitting a task of our scheduling class:
1016 */ 1311 */
1017static void task_tick_fair(struct rq *rq, struct task_struct *curr) 1312static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
1018{ 1313{
1019 struct cfs_rq *cfs_rq; 1314 struct cfs_rq *cfs_rq;
1020 struct sched_entity *se = &curr->se; 1315 struct sched_entity *se = &curr->se;
1021 1316
1022 for_each_sched_entity(se) { 1317 for_each_sched_entity(se) {
1023 cfs_rq = cfs_rq_of(se); 1318 cfs_rq = cfs_rq_of(se);
1024 entity_tick(cfs_rq, se); 1319 entity_tick(cfs_rq, se, queued);
1025 } 1320 }
1026} 1321}
1027 1322
1028#define swap(a,b) do { typeof(a) tmp = (a); (a) = (b); (b) = tmp; } while (0) 1323#define swap(a, b) do { typeof(a) tmp = (a); (a) = (b); (b) = tmp; } while (0)
1029 1324
1030/* 1325/*
1031 * Share the fairness runtime between parent and child, thus the 1326 * Share the fairness runtime between parent and child, thus the
@@ -1045,8 +1340,9 @@ static void task_new_fair(struct rq *rq, struct task_struct *p)
1045 update_curr(cfs_rq); 1340 update_curr(cfs_rq);
1046 place_entity(cfs_rq, se, 1); 1341 place_entity(cfs_rq, se, 1);
1047 1342
1343 /* 'curr' will be NULL if the child belongs to a different group */
1048 if (sysctl_sched_child_runs_first && this_cpu == task_cpu(p) && 1344 if (sysctl_sched_child_runs_first && this_cpu == task_cpu(p) &&
1049 curr->vruntime < se->vruntime) { 1345 curr && curr->vruntime < se->vruntime) {
1050 /* 1346 /*
1051 * Upon rescheduling, sched_class::put_prev_task() will place 1347 * Upon rescheduling, sched_class::put_prev_task() will place
1052 * 'current' within the tree based on its new key value. 1348 * 'current' within the tree based on its new key value.
@@ -1054,11 +1350,46 @@ static void task_new_fair(struct rq *rq, struct task_struct *p)
1054 swap(curr->vruntime, se->vruntime); 1350 swap(curr->vruntime, se->vruntime);
1055 } 1351 }
1056 1352
1057 se->peer_preempt = 0;
1058 enqueue_task_fair(rq, p, 0); 1353 enqueue_task_fair(rq, p, 0);
1059 resched_task(rq->curr); 1354 resched_task(rq->curr);
1060} 1355}
1061 1356
1357/*
1358 * Priority of the task has changed. Check to see if we preempt
1359 * the current task.
1360 */
1361static void prio_changed_fair(struct rq *rq, struct task_struct *p,
1362 int oldprio, int running)
1363{
1364 /*
1365 * Reschedule if we are currently running on this runqueue and
1366 * our priority decreased, or if we are not currently running on
1367 * this runqueue and our priority is higher than the current's
1368 */
1369 if (running) {
1370 if (p->prio > oldprio)
1371 resched_task(rq->curr);
1372 } else
1373 check_preempt_curr(rq, p);
1374}
1375
1376/*
1377 * We switched to the sched_fair class.
1378 */
1379static void switched_to_fair(struct rq *rq, struct task_struct *p,
1380 int running)
1381{
1382 /*
1383 * We were most likely switched from sched_rt, so
1384 * kick off the schedule if running, otherwise just see
1385 * if we can still preempt the current task.
1386 */
1387 if (running)
1388 resched_task(rq->curr);
1389 else
1390 check_preempt_curr(rq, p);
1391}
1392
1062/* Account for a task changing its policy or group. 1393/* Account for a task changing its policy or group.
1063 * 1394 *
1064 * This routine is mostly called to set cfs_rq->curr field when a task 1395 * This routine is mostly called to set cfs_rq->curr field when a task
@@ -1080,6 +1411,9 @@ static const struct sched_class fair_sched_class = {
1080 .enqueue_task = enqueue_task_fair, 1411 .enqueue_task = enqueue_task_fair,
1081 .dequeue_task = dequeue_task_fair, 1412 .dequeue_task = dequeue_task_fair,
1082 .yield_task = yield_task_fair, 1413 .yield_task = yield_task_fair,
1414#ifdef CONFIG_SMP
1415 .select_task_rq = select_task_rq_fair,
1416#endif /* CONFIG_SMP */
1083 1417
1084 .check_preempt_curr = check_preempt_wakeup, 1418 .check_preempt_curr = check_preempt_wakeup,
1085 1419
@@ -1094,6 +1428,9 @@ static const struct sched_class fair_sched_class = {
1094 .set_curr_task = set_curr_task_fair, 1428 .set_curr_task = set_curr_task_fair,
1095 .task_tick = task_tick_fair, 1429 .task_tick = task_tick_fair,
1096 .task_new = task_new_fair, 1430 .task_new = task_new_fair,
1431
1432 .prio_changed = prio_changed_fair,
1433 .switched_to = switched_to_fair,
1097}; 1434};
1098 1435
1099#ifdef CONFIG_SCHED_DEBUG 1436#ifdef CONFIG_SCHED_DEBUG
@@ -1104,7 +1441,9 @@ static void print_cfs_stats(struct seq_file *m, int cpu)
1104#ifdef CONFIG_FAIR_GROUP_SCHED 1441#ifdef CONFIG_FAIR_GROUP_SCHED
1105 print_cfs_rq(m, cpu, &cpu_rq(cpu)->cfs); 1442 print_cfs_rq(m, cpu, &cpu_rq(cpu)->cfs);
1106#endif 1443#endif
1444 rcu_read_lock();
1107 for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq) 1445 for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
1108 print_cfs_rq(m, cpu, cfs_rq); 1446 print_cfs_rq(m, cpu, cfs_rq);
1447 rcu_read_unlock();
1109} 1448}
1110#endif 1449#endif
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index bf9c25c15b8b..2bcafa375633 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -5,6 +5,12 @@
5 * handled in sched_fair.c) 5 * handled in sched_fair.c)
6 */ 6 */
7 7
8#ifdef CONFIG_SMP
9static int select_task_rq_idle(struct task_struct *p, int sync)
10{
 11 return task_cpu(p); /* IDLE tasks are never migrated */
12}
13#endif /* CONFIG_SMP */
8/* 14/*
9 * Idle tasks are unconditionally rescheduled: 15 * Idle tasks are unconditionally rescheduled:
10 */ 16 */
@@ -55,7 +61,7 @@ move_one_task_idle(struct rq *this_rq, int this_cpu, struct rq *busiest,
55} 61}
56#endif 62#endif
57 63
58static void task_tick_idle(struct rq *rq, struct task_struct *curr) 64static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued)
59{ 65{
60} 66}
61 67
@@ -63,6 +69,33 @@ static void set_curr_task_idle(struct rq *rq)
63{ 69{
64} 70}
65 71
72static void switched_to_idle(struct rq *rq, struct task_struct *p,
73 int running)
74{
75 /* Can this actually happen?? */
76 if (running)
77 resched_task(rq->curr);
78 else
79 check_preempt_curr(rq, p);
80}
81
82static void prio_changed_idle(struct rq *rq, struct task_struct *p,
83 int oldprio, int running)
84{
85 /* This can happen for hot plug CPUS */
86
87 /*
88 * Reschedule if we are currently running on this runqueue and
89 * our priority decreased, or if we are not currently running on
90 * this runqueue and our priority is higher than the current's
91 */
92 if (running) {
93 if (p->prio > oldprio)
94 resched_task(rq->curr);
95 } else
96 check_preempt_curr(rq, p);
97}
98
66/* 99/*
67 * Simple, special scheduling class for the per-CPU idle tasks: 100 * Simple, special scheduling class for the per-CPU idle tasks:
68 */ 101 */
@@ -72,6 +105,9 @@ const struct sched_class idle_sched_class = {
72 105
73 /* dequeue is not valid, we print a debug message there: */ 106 /* dequeue is not valid, we print a debug message there: */
74 .dequeue_task = dequeue_task_idle, 107 .dequeue_task = dequeue_task_idle,
108#ifdef CONFIG_SMP
109 .select_task_rq = select_task_rq_idle,
110#endif /* CONFIG_SMP */
75 111
76 .check_preempt_curr = check_preempt_curr_idle, 112 .check_preempt_curr = check_preempt_curr_idle,
77 113
@@ -85,5 +121,9 @@ const struct sched_class idle_sched_class = {
85 121
86 .set_curr_task = set_curr_task_idle, 122 .set_curr_task = set_curr_task_idle,
87 .task_tick = task_tick_idle, 123 .task_tick = task_tick_idle,
124
125 .prio_changed = prio_changed_idle,
126 .switched_to = switched_to_idle,
127
88 /* no .task_new for idle tasks */ 128 /* no .task_new for idle tasks */
89}; 129};
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 8abd752a0ebd..274b40d7bef2 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -3,6 +3,217 @@
3 * policies) 3 * policies)
4 */ 4 */
5 5
6#ifdef CONFIG_SMP
7
8static inline int rt_overloaded(struct rq *rq)
9{
10 return atomic_read(&rq->rd->rto_count);
11}
12
13static inline void rt_set_overload(struct rq *rq)
14{
15 cpu_set(rq->cpu, rq->rd->rto_mask);
16 /*
17 * Make sure the mask is visible before we set
18 * the overload count. That is checked to determine
19 * if we should look at the mask. It would be a shame
20 * if we looked at the mask, but the mask was not
21 * updated yet.
22 */
23 wmb();
24 atomic_inc(&rq->rd->rto_count);
25}
26
27static inline void rt_clear_overload(struct rq *rq)
28{
29 /* the order here really doesn't matter */
30 atomic_dec(&rq->rd->rto_count);
31 cpu_clear(rq->cpu, rq->rd->rto_mask);
32}
33
34static void update_rt_migration(struct rq *rq)
35{
36 if (rq->rt.rt_nr_migratory && (rq->rt.rt_nr_running > 1)) {
37 if (!rq->rt.overloaded) {
38 rt_set_overload(rq);
39 rq->rt.overloaded = 1;
40 }
41 } else if (rq->rt.overloaded) {
42 rt_clear_overload(rq);
43 rq->rt.overloaded = 0;
44 }
45}
46#endif /* CONFIG_SMP */
47
48static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
49{
50 return container_of(rt_se, struct task_struct, rt);
51}
52
53static inline int on_rt_rq(struct sched_rt_entity *rt_se)
54{
55 return !list_empty(&rt_se->run_list);
56}
57
58#ifdef CONFIG_FAIR_GROUP_SCHED
59
60static inline unsigned int sched_rt_ratio(struct rt_rq *rt_rq)
61{
62 if (!rt_rq->tg)
63 return SCHED_RT_FRAC;
64
65 return rt_rq->tg->rt_ratio;
66}
67
68#define for_each_leaf_rt_rq(rt_rq, rq) \
69 list_for_each_entry(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list)
70
71static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
72{
73 return rt_rq->rq;
74}
75
76static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
77{
78 return rt_se->rt_rq;
79}
80
81#define for_each_sched_rt_entity(rt_se) \
82 for (; rt_se; rt_se = rt_se->parent)
83
84static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
85{
86 return rt_se->my_q;
87}
88
89static void enqueue_rt_entity(struct sched_rt_entity *rt_se);
90static void dequeue_rt_entity(struct sched_rt_entity *rt_se);
91
92static void sched_rt_ratio_enqueue(struct rt_rq *rt_rq)
93{
94 struct sched_rt_entity *rt_se = rt_rq->rt_se;
95
96 if (rt_se && !on_rt_rq(rt_se) && rt_rq->rt_nr_running) {
97 struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr;
98
99 enqueue_rt_entity(rt_se);
100 if (rt_rq->highest_prio < curr->prio)
101 resched_task(curr);
102 }
103}
104
105static void sched_rt_ratio_dequeue(struct rt_rq *rt_rq)
106{
107 struct sched_rt_entity *rt_se = rt_rq->rt_se;
108
109 if (rt_se && on_rt_rq(rt_se))
110 dequeue_rt_entity(rt_se);
111}
112
113#else
114
115static inline unsigned int sched_rt_ratio(struct rt_rq *rt_rq)
116{
117 return sysctl_sched_rt_ratio;
118}
119
120#define for_each_leaf_rt_rq(rt_rq, rq) \
121 for (rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
122
123static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
124{
125 return container_of(rt_rq, struct rq, rt);
126}
127
128static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
129{
130 struct task_struct *p = rt_task_of(rt_se);
131 struct rq *rq = task_rq(p);
132
133 return &rq->rt;
134}
135
136#define for_each_sched_rt_entity(rt_se) \
137 for (; rt_se; rt_se = NULL)
138
139static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
140{
141 return NULL;
142}
143
144static inline void sched_rt_ratio_enqueue(struct rt_rq *rt_rq)
145{
146}
147
148static inline void sched_rt_ratio_dequeue(struct rt_rq *rt_rq)
149{
150}
151
152#endif
153
154static inline int rt_se_prio(struct sched_rt_entity *rt_se)
155{
156#ifdef CONFIG_FAIR_GROUP_SCHED
157 struct rt_rq *rt_rq = group_rt_rq(rt_se);
158
159 if (rt_rq)
160 return rt_rq->highest_prio;
161#endif
162
163 return rt_task_of(rt_se)->prio;
164}
165
166static int sched_rt_ratio_exceeded(struct rt_rq *rt_rq)
167{
168 unsigned int rt_ratio = sched_rt_ratio(rt_rq);
169 u64 period, ratio;
170
171 if (rt_ratio == SCHED_RT_FRAC)
172 return 0;
173
174 if (rt_rq->rt_throttled)
175 return 1;
176
177 period = (u64)sysctl_sched_rt_period * NSEC_PER_MSEC;
178 ratio = (period * rt_ratio) >> SCHED_RT_FRAC_SHIFT;
179
180 if (rt_rq->rt_time > ratio) {
181 struct rq *rq = rq_of_rt_rq(rt_rq);
182
183 rq->rt_throttled = 1;
184 rt_rq->rt_throttled = 1;
185
186 sched_rt_ratio_dequeue(rt_rq);
187 return 1;
188 }
189
190 return 0;
191}
192
193static void update_sched_rt_period(struct rq *rq)
194{
195 struct rt_rq *rt_rq;
196 u64 period;
197
198 while (rq->clock > rq->rt_period_expire) {
199 period = (u64)sysctl_sched_rt_period * NSEC_PER_MSEC;
200 rq->rt_period_expire += period;
201
202 for_each_leaf_rt_rq(rt_rq, rq) {
203 unsigned long rt_ratio = sched_rt_ratio(rt_rq);
204 u64 ratio = (period * rt_ratio) >> SCHED_RT_FRAC_SHIFT;
205
206 rt_rq->rt_time -= min(rt_rq->rt_time, ratio);
207 if (rt_rq->rt_throttled) {
208 rt_rq->rt_throttled = 0;
209 sched_rt_ratio_enqueue(rt_rq);
210 }
211 }
212
213 rq->rt_throttled = 0;
214 }
215}
216
6/* 217/*
7 * Update the current task's runtime statistics. Skip current tasks that 218 * Update the current task's runtime statistics. Skip current tasks that
8 * are not in our scheduling class. 219 * are not in our scheduling class.
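
sched_rt_ratio_exceeded() above throttles an rt_rq once its accumulated rt_time exceeds (period * rt_ratio) >> SCHED_RT_FRAC_SHIFT within one period. SCHED_RT_FRAC_SHIFT is not defined in this hunk, so the sketch below assumes a Q16 fixed-point ratio purely for illustration:

#include <stdio.h>

#define SCHED_RT_FRAC_SHIFT     16              /* assumption */
#define NSEC_PER_MSEC           1000000ULL

int main(void)
{
        unsigned long long rt_period_ms = 1000; /* sysctl_sched_rt_period, example */
        unsigned long long rt_ratio = 62259;    /* ~0.95 in Q16, example value */

        unsigned long long period = rt_period_ms * NSEC_PER_MSEC;
        unsigned long long budget = (period * rt_ratio) >> SCHED_RT_FRAC_SHIFT;

        /* rt_rq->rt_time above this budget within one period => throttled */
        printf("RT budget per %llu ms period: %llu ns (~%.1f%%)\n",
               rt_period_ms, budget, 100.0 * budget / period);
        return 0;
}
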
@@ -10,6 +221,8 @@
10static void update_curr_rt(struct rq *rq) 221static void update_curr_rt(struct rq *rq)
11{ 222{
12 struct task_struct *curr = rq->curr; 223 struct task_struct *curr = rq->curr;
224 struct sched_rt_entity *rt_se = &curr->rt;
225 struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
13 u64 delta_exec; 226 u64 delta_exec;
14 227
15 if (!task_has_rt_policy(curr)) 228 if (!task_has_rt_policy(curr))
@@ -23,47 +236,229 @@ static void update_curr_rt(struct rq *rq)
23 236
24 curr->se.sum_exec_runtime += delta_exec; 237 curr->se.sum_exec_runtime += delta_exec;
25 curr->se.exec_start = rq->clock; 238 curr->se.exec_start = rq->clock;
239 cpuacct_charge(curr, delta_exec);
240
241 rt_rq->rt_time += delta_exec;
242 /*
243 * might make it a tad more accurate:
244 *
245 * update_sched_rt_period(rq);
246 */
247 if (sched_rt_ratio_exceeded(rt_rq))
248 resched_task(curr);
26} 249}
27 250
28static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup) 251static inline
252void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
253{
254 WARN_ON(!rt_prio(rt_se_prio(rt_se)));
255 rt_rq->rt_nr_running++;
256#if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED
257 if (rt_se_prio(rt_se) < rt_rq->highest_prio)
258 rt_rq->highest_prio = rt_se_prio(rt_se);
259#endif
260#ifdef CONFIG_SMP
261 if (rt_se->nr_cpus_allowed > 1) {
262 struct rq *rq = rq_of_rt_rq(rt_rq);
263 rq->rt.rt_nr_migratory++;
264 }
265
266 update_rt_migration(rq_of_rt_rq(rt_rq));
267#endif
268}
269
270static inline
271void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
272{
273 WARN_ON(!rt_prio(rt_se_prio(rt_se)));
274 WARN_ON(!rt_rq->rt_nr_running);
275 rt_rq->rt_nr_running--;
276#if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED
277 if (rt_rq->rt_nr_running) {
278 struct rt_prio_array *array;
279
280 WARN_ON(rt_se_prio(rt_se) < rt_rq->highest_prio);
281 if (rt_se_prio(rt_se) == rt_rq->highest_prio) {
282 /* recalculate */
283 array = &rt_rq->active;
284 rt_rq->highest_prio =
285 sched_find_first_bit(array->bitmap);
286 } /* otherwise leave rq->highest prio alone */
287 } else
288 rt_rq->highest_prio = MAX_RT_PRIO;
289#endif
290#ifdef CONFIG_SMP
291 if (rt_se->nr_cpus_allowed > 1) {
292 struct rq *rq = rq_of_rt_rq(rt_rq);
293 rq->rt.rt_nr_migratory--;
294 }
295
296 update_rt_migration(rq_of_rt_rq(rt_rq));
297#endif /* CONFIG_SMP */
298}
299
300static void enqueue_rt_entity(struct sched_rt_entity *rt_se)
301{
302 struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
303 struct rt_prio_array *array = &rt_rq->active;
304 struct rt_rq *group_rq = group_rt_rq(rt_se);
305
306 if (group_rq && group_rq->rt_throttled)
307 return;
308
309 list_add_tail(&rt_se->run_list, array->queue + rt_se_prio(rt_se));
310 __set_bit(rt_se_prio(rt_se), array->bitmap);
311
312 inc_rt_tasks(rt_se, rt_rq);
313}
314
315static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
316{
317 struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
318 struct rt_prio_array *array = &rt_rq->active;
319
320 list_del_init(&rt_se->run_list);
321 if (list_empty(array->queue + rt_se_prio(rt_se)))
322 __clear_bit(rt_se_prio(rt_se), array->bitmap);
323
324 dec_rt_tasks(rt_se, rt_rq);
325}
326
327/*
328 * Because the prio of an upper entry depends on the lower
329 * entries, we must remove entries top - down.
330 *
331 * XXX: O(1/2 h^2) because we can only walk up, not down the chain.
332 * doesn't matter much for now, as h=2 for GROUP_SCHED.
333 */
334static void dequeue_rt_stack(struct task_struct *p)
29{ 335{
30 struct rt_prio_array *array = &rq->rt.active; 336 struct sched_rt_entity *rt_se, *top_se;
31 337
32 list_add_tail(&p->run_list, array->queue + p->prio); 338 /*
33 __set_bit(p->prio, array->bitmap); 339 * dequeue all, top - down.
340 */
341 do {
342 rt_se = &p->rt;
343 top_se = NULL;
344 for_each_sched_rt_entity(rt_se) {
345 if (on_rt_rq(rt_se))
346 top_se = rt_se;
347 }
348 if (top_se)
349 dequeue_rt_entity(top_se);
350 } while (top_se);
34} 351}
35 352
36/* 353/*
37 * Adding/removing a task to/from a priority array: 354 * Adding/removing a task to/from a priority array:
38 */ 355 */
356static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup)
357{
358 struct sched_rt_entity *rt_se = &p->rt;
359
360 if (wakeup)
361 rt_se->timeout = 0;
362
363 dequeue_rt_stack(p);
364
365 /*
366 * enqueue everybody, bottom - up.
367 */
368 for_each_sched_rt_entity(rt_se)
369 enqueue_rt_entity(rt_se);
370
371 inc_cpu_load(rq, p->se.load.weight);
372}
373
39static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) 374static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
40{ 375{
41 struct rt_prio_array *array = &rq->rt.active; 376 struct sched_rt_entity *rt_se = &p->rt;
377 struct rt_rq *rt_rq;
42 378
43 update_curr_rt(rq); 379 update_curr_rt(rq);
44 380
45 list_del(&p->run_list); 381 dequeue_rt_stack(p);
46 if (list_empty(array->queue + p->prio)) 382
47 __clear_bit(p->prio, array->bitmap); 383 /*
384 * re-enqueue all non-empty rt_rq entities.
385 */
386 for_each_sched_rt_entity(rt_se) {
387 rt_rq = group_rt_rq(rt_se);
388 if (rt_rq && rt_rq->rt_nr_running)
389 enqueue_rt_entity(rt_se);
390 }
391
392 dec_cpu_load(rq, p->se.load.weight);
48} 393}
49 394
50/* 395/*
51 * Put task to the end of the run list without the overhead of dequeue 396 * Put task to the end of the run list without the overhead of dequeue
52 * followed by enqueue. 397 * followed by enqueue.
53 */ 398 */
399static
400void requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se)
401{
402 struct rt_prio_array *array = &rt_rq->active;
403
404 list_move_tail(&rt_se->run_list, array->queue + rt_se_prio(rt_se));
405}
406
54static void requeue_task_rt(struct rq *rq, struct task_struct *p) 407static void requeue_task_rt(struct rq *rq, struct task_struct *p)
55{ 408{
56 struct rt_prio_array *array = &rq->rt.active; 409 struct sched_rt_entity *rt_se = &p->rt;
410 struct rt_rq *rt_rq;
57 411
58 list_move_tail(&p->run_list, array->queue + p->prio); 412 for_each_sched_rt_entity(rt_se) {
413 rt_rq = rt_rq_of_se(rt_se);
414 requeue_rt_entity(rt_rq, rt_se);
415 }
59} 416}
60 417
61static void 418static void yield_task_rt(struct rq *rq)
62yield_task_rt(struct rq *rq)
63{ 419{
64 requeue_task_rt(rq, rq->curr); 420 requeue_task_rt(rq, rq->curr);
65} 421}
66 422
423#ifdef CONFIG_SMP
424static int find_lowest_rq(struct task_struct *task);
425
426static int select_task_rq_rt(struct task_struct *p, int sync)
427{
428 struct rq *rq = task_rq(p);
429
430 /*
431 * If the current task is an RT task, then
432 * try to see if we can wake this RT task up on another
433 * runqueue. Otherwise simply start this RT task
434 * on its current runqueue.
435 *
 436 * We want to avoid overloading runqueues, even if
 437 * the RT task is of higher priority than the current RT task.
438 * RT tasks behave differently than other tasks. If
439 * one gets preempted, we try to push it off to another queue.
440 * So trying to keep a preempting RT task on the same
441 * cache hot CPU will force the running RT task to
442 * a cold CPU. So we waste all the cache for the lower
443 * RT task in hopes of saving some of a RT task
444 * that is just being woken and probably will have
445 * cold cache anyway.
446 */
447 if (unlikely(rt_task(rq->curr)) &&
448 (p->rt.nr_cpus_allowed > 1)) {
449 int cpu = find_lowest_rq(p);
450
451 return (cpu == -1) ? task_cpu(p) : cpu;
452 }
453
454 /*
455 * Otherwise, just let it ride on the affined RQ and the
456 * post-schedule router will push the preempted task away
457 */
458 return task_cpu(p);
459}
460#endif /* CONFIG_SMP */
461
67/* 462/*
68 * Preempt the current task with a newly woken task if needed: 463 * Preempt the current task with a newly woken task if needed:
69 */ 464 */
@@ -73,25 +468,48 @@ static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p)
73 resched_task(rq->curr); 468 resched_task(rq->curr);
74} 469}
75 470
76static struct task_struct *pick_next_task_rt(struct rq *rq) 471static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
472 struct rt_rq *rt_rq)
77{ 473{
78 struct rt_prio_array *array = &rq->rt.active; 474 struct rt_prio_array *array = &rt_rq->active;
79 struct task_struct *next; 475 struct sched_rt_entity *next = NULL;
80 struct list_head *queue; 476 struct list_head *queue;
81 int idx; 477 int idx;
82 478
83 idx = sched_find_first_bit(array->bitmap); 479 idx = sched_find_first_bit(array->bitmap);
84 if (idx >= MAX_RT_PRIO) 480 BUG_ON(idx >= MAX_RT_PRIO);
85 return NULL;
86 481
87 queue = array->queue + idx; 482 queue = array->queue + idx;
88 next = list_entry(queue->next, struct task_struct, run_list); 483 next = list_entry(queue->next, struct sched_rt_entity, run_list);
89
90 next->se.exec_start = rq->clock;
91 484
92 return next; 485 return next;
93} 486}
94 487
488static struct task_struct *pick_next_task_rt(struct rq *rq)
489{
490 struct sched_rt_entity *rt_se;
491 struct task_struct *p;
492 struct rt_rq *rt_rq;
493
494 rt_rq = &rq->rt;
495
496 if (unlikely(!rt_rq->rt_nr_running))
497 return NULL;
498
499 if (sched_rt_ratio_exceeded(rt_rq))
500 return NULL;
501
502 do {
503 rt_se = pick_next_rt_entity(rq, rt_rq);
504 BUG_ON(!rt_se);
505 rt_rq = group_rt_rq(rt_se);
506 } while (rt_rq);
507
508 p = rt_task_of(rt_se);
509 p->se.exec_start = rq->clock;
510 return p;
511}
512
95static void put_prev_task_rt(struct rq *rq, struct task_struct *p) 513static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
96{ 514{
97 update_curr_rt(rq); 515 update_curr_rt(rq);
@@ -99,76 +517,448 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
99} 517}
100 518
101#ifdef CONFIG_SMP 519#ifdef CONFIG_SMP
102/* 520
103 * Load-balancing iterator. Note: while the runqueue stays locked 521/* Only try algorithms three times */
104 * during the whole iteration, the current task might be 522#define RT_MAX_TRIES 3
105 * dequeued so the iterator has to be dequeue-safe. Here we 523
106 * achieve that by always pre-iterating before returning 524static int double_lock_balance(struct rq *this_rq, struct rq *busiest);
107 * the current task: 525static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep);
108 */ 526
109static struct task_struct *load_balance_start_rt(void *arg) 527static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
110{ 528{
111 struct rq *rq = arg; 529 if (!task_running(rq, p) &&
112 struct rt_prio_array *array = &rq->rt.active; 530 (cpu < 0 || cpu_isset(cpu, p->cpus_allowed)) &&
113 struct list_head *head, *curr; 531 (p->rt.nr_cpus_allowed > 1))
114 struct task_struct *p; 532 return 1;
533 return 0;
534}
535
536/* Return the second highest RT task, NULL otherwise */
537static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu)
538{
539 struct task_struct *next = NULL;
540 struct sched_rt_entity *rt_se;
541 struct rt_prio_array *array;
542 struct rt_rq *rt_rq;
115 int idx; 543 int idx;
116 544
117 idx = sched_find_first_bit(array->bitmap); 545 for_each_leaf_rt_rq(rt_rq, rq) {
118 if (idx >= MAX_RT_PRIO) 546 array = &rt_rq->active;
119 return NULL; 547 idx = sched_find_first_bit(array->bitmap);
548 next_idx:
549 if (idx >= MAX_RT_PRIO)
550 continue;
551 if (next && next->prio < idx)
552 continue;
553 list_for_each_entry(rt_se, array->queue + idx, run_list) {
554 struct task_struct *p = rt_task_of(rt_se);
555 if (pick_rt_task(rq, p, cpu)) {
556 next = p;
557 break;
558 }
559 }
560 if (!next) {
561 idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx+1);
562 goto next_idx;
563 }
564 }
565
566 return next;
567}
120 568
121 head = array->queue + idx; 569static DEFINE_PER_CPU(cpumask_t, local_cpu_mask);
122 curr = head->prev;
123 570
124 p = list_entry(curr, struct task_struct, run_list); 571static int find_lowest_cpus(struct task_struct *task, cpumask_t *lowest_mask)
572{
573 int lowest_prio = -1;
574 int lowest_cpu = -1;
575 int count = 0;
576 int cpu;
125 577
126 curr = curr->prev; 578 cpus_and(*lowest_mask, task_rq(task)->rd->online, task->cpus_allowed);
127 579
128 rq->rt.rt_load_balance_idx = idx; 580 /*
129 rq->rt.rt_load_balance_head = head; 581 * Scan each rq for the lowest prio.
130 rq->rt.rt_load_balance_curr = curr; 582 */
583 for_each_cpu_mask(cpu, *lowest_mask) {
584 struct rq *rq = cpu_rq(cpu);
131 585
132 return p; 586 /* We look for lowest RT prio or non-rt CPU */
587 if (rq->rt.highest_prio >= MAX_RT_PRIO) {
588 /*
589 * if we already found a low RT queue
590 * and now we found this non-rt queue
591 * clear the mask and set our bit.
592 * Otherwise just return the queue as is
593 * and the count==1 will cause the algorithm
594 * to use the first bit found.
595 */
596 if (lowest_cpu != -1) {
597 cpus_clear(*lowest_mask);
598 cpu_set(rq->cpu, *lowest_mask);
599 }
600 return 1;
601 }
602
603 /* no locking for now */
604 if ((rq->rt.highest_prio > task->prio)
605 && (rq->rt.highest_prio >= lowest_prio)) {
606 if (rq->rt.highest_prio > lowest_prio) {
607 /* new low - clear old data */
608 lowest_prio = rq->rt.highest_prio;
609 lowest_cpu = cpu;
610 count = 0;
611 }
612 count++;
613 } else
614 cpu_clear(cpu, *lowest_mask);
615 }
616
617 /*
618 * Clear out all the set bits that represent
619 * runqueues that were of higher prio than
620 * the lowest_prio.
621 */
622 if (lowest_cpu > 0) {
623 /*
624 * Perhaps we could add another cpumask op to
625 * zero out bits. Like cpu_zero_bits(cpumask, nrbits);
626 * Then that could be optimized to use memset and such.
627 */
628 for_each_cpu_mask(cpu, *lowest_mask) {
629 if (cpu >= lowest_cpu)
630 break;
631 cpu_clear(cpu, *lowest_mask);
632 }
633 }
634
635 return count;
133} 636}
134 637
135static struct task_struct *load_balance_next_rt(void *arg) 638static inline int pick_optimal_cpu(int this_cpu, cpumask_t *mask)
136{ 639{
137 struct rq *rq = arg; 640 int first;
138 struct rt_prio_array *array = &rq->rt.active; 641
139 struct list_head *head, *curr; 642 /* "this_cpu" is cheaper to preempt than a remote processor */
140 struct task_struct *p; 643 if ((this_cpu != -1) && cpu_isset(this_cpu, *mask))
141 int idx; 644 return this_cpu;
645
646 first = first_cpu(*mask);
647 if (first != NR_CPUS)
648 return first;
649
650 return -1;
651}
652
653static int find_lowest_rq(struct task_struct *task)
654{
655 struct sched_domain *sd;
656 cpumask_t *lowest_mask = &__get_cpu_var(local_cpu_mask);
657 int this_cpu = smp_processor_id();
658 int cpu = task_cpu(task);
659 int count = find_lowest_cpus(task, lowest_mask);
660
661 if (!count)
662 return -1; /* No targets found */
142 663
143 idx = rq->rt.rt_load_balance_idx; 664 /*
144 head = rq->rt.rt_load_balance_head; 665 * There is no sense in performing an optimal search if only one
145 curr = rq->rt.rt_load_balance_curr; 666 * target is found.
667 */
668 if (count == 1)
669 return first_cpu(*lowest_mask);
670
671 /*
672 * At this point we have built a mask of cpus representing the
673 * lowest priority tasks in the system. Now we want to elect
674 * the best one based on our affinity and topology.
675 *
676 * We prioritize the last cpu that the task executed on since
677 * it is most likely cache-hot in that location.
678 */
679 if (cpu_isset(cpu, *lowest_mask))
680 return cpu;
146 681
147 /* 682 /*
148 * If we arrived back to the head again then 683 * Otherwise, we consult the sched_domains span maps to figure
149 * iterate to the next queue (if any): 684 * out which cpu is logically closest to our hot cache data.
150 */ 685 */
151 if (unlikely(head == curr)) { 686 if (this_cpu == cpu)
152 int next_idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx+1); 687 this_cpu = -1; /* Skip this_cpu opt if the same */
153 688
154 if (next_idx >= MAX_RT_PRIO) 689 for_each_domain(cpu, sd) {
155 return NULL; 690 if (sd->flags & SD_WAKE_AFFINE) {
691 cpumask_t domain_mask;
692 int best_cpu;
156 693
157 idx = next_idx; 694 cpus_and(domain_mask, sd->span, *lowest_mask);
158 head = array->queue + idx;
159 curr = head->prev;
160 695
161 rq->rt.rt_load_balance_idx = idx; 696 best_cpu = pick_optimal_cpu(this_cpu,
162 rq->rt.rt_load_balance_head = head; 697 &domain_mask);
698 if (best_cpu != -1)
699 return best_cpu;
700 }
163 } 701 }
164 702
165 p = list_entry(curr, struct task_struct, run_list); 703 /*
704 * And finally, if there were no matches within the domains
705 * just give the caller *something* to work with from the compatible
706 * locations.
707 */
708 return pick_optimal_cpu(this_cpu, lowest_mask);
709}
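
Once find_lowest_cpus() has produced a mask of candidate runqueues, find_lowest_rq() applies a preference order: (1) the CPU the task last ran on, for cache warmth; (2) the waking CPU, if a cache-affine sched domain covers it and a candidate; (3) any remaining candidate. The sketch below models that ordering over a plain 32-bit CPU mask, with the sched-domain walk reduced to a single "shares a domain" flag; all values are illustrative, not the kernel's types.

    #include <stdio.h>

    /* pick_optimal_cpu(): prefer this_cpu, otherwise the first set bit */
    static int pick_optimal(int this_cpu, unsigned int mask)
    {
        int cpu;

        if (this_cpu != -1 && (mask & (1u << this_cpu)))
            return this_cpu;
        for (cpu = 0; cpu < 32; cpu++)
            if (mask & (1u << cpu))
                return cpu;
        return -1;
    }

    static int choose_target(unsigned int lowest_mask, int task_cpu,
                             int this_cpu, int share_domain)
    {
        /* 1) cache-hot: the CPU the task last ran on */
        if (lowest_mask & (1u << task_cpu))
            return task_cpu;
        /* 2) cheap to preempt: the waking CPU, if topologically close */
        if (share_domain) {
            int cpu = pick_optimal(this_cpu, lowest_mask);
            if (cpu != -1)
                return cpu;
        }
        /* 3) anything else that survived in the mask */
        return pick_optimal(-1, lowest_mask);
    }

    int main(void)
    {
        /* candidates are CPUs 2 and 3; task last ran on CPU 1, woken from CPU 2 */
        printf("target: %d\n", choose_target(0x0c, 1, 2, 1)); /* -> 2 */
        printf("target: %d\n", choose_target(0x08, 1, 2, 1)); /* -> 3 */
        return 0;
    }

With candidates {2,3} the first call prefers the waking CPU 2; once CPU 2 drops out of the mask, the fallback picks CPU 3.
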
166 710
167 curr = curr->prev; 711/* Will lock the rq it finds */
712static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
713{
714 struct rq *lowest_rq = NULL;
715 int tries;
716 int cpu;
168 717
169 rq->rt.rt_load_balance_curr = curr; 718 for (tries = 0; tries < RT_MAX_TRIES; tries++) {
719 cpu = find_lowest_rq(task);
170 720
171 return p; 721 if ((cpu == -1) || (cpu == rq->cpu))
722 break;
723
724 lowest_rq = cpu_rq(cpu);
725
726 /* if the prio of this runqueue changed, try again */
727 if (double_lock_balance(rq, lowest_rq)) {
728 /*
729 * We had to unlock the run queue. In
 730 * the meantime, the task could have
731 * migrated already or had its affinity changed.
732 * Also make sure that it wasn't scheduled on its rq.
733 */
734 if (unlikely(task_rq(task) != rq ||
735 !cpu_isset(lowest_rq->cpu,
736 task->cpus_allowed) ||
737 task_running(rq, task) ||
738 !task->se.on_rq)) {
739
740 spin_unlock(&lowest_rq->lock);
741 lowest_rq = NULL;
742 break;
743 }
744 }
745
746 /* If this rq is still suitable use it. */
747 if (lowest_rq->rt.highest_prio > task->prio)
748 break;
749
750 /* try again */
751 spin_unlock(&lowest_rq->lock);
752 lowest_rq = NULL;
753 }
754
755 return lowest_rq;
756}
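
find_lock_lowest_rq() is an instance of a general locking pattern: when two locks must always be taken in a fixed order, you may have to drop the lock you already hold, and every decision made before dropping it must be revalidated afterwards. A small pthread sketch of that "lock in a fixed order, then re-check" idea (compile with -pthread; the priority fields are placeholders, not the scheduler's):

    #include <pthread.h>
    #include <stdio.h>

    struct rq_sketch {
        pthread_mutex_t lock;
        int highest_prio;
    };

    /* like double_lock_balance(): always take the lower-address lock first
     * to avoid ABBA deadlocks; returns 1 when this_rq->lock was dropped */
    static int double_lock(struct rq_sketch *this_rq, struct rq_sketch *busiest)
    {
        if (busiest < this_rq) {
            pthread_mutex_unlock(&this_rq->lock);
            pthread_mutex_lock(&busiest->lock);
            pthread_mutex_lock(&this_rq->lock);
            return 1;              /* caller must revalidate its state */
        }
        pthread_mutex_lock(&busiest->lock);
        return 0;
    }

    int main(void)
    {
        struct rq_sketch a = { PTHREAD_MUTEX_INITIALIZER, 50 };
        struct rq_sketch b = { PTHREAD_MUTEX_INITIALIZER, 80 };
        int task_prio = 60, dropped;

        pthread_mutex_lock(&a.lock);
        dropped = double_lock(&a, &b);
        /* while a.lock was dropped, another thread may have changed b, so
         * the "is b still a lower-priority target?" test is repeated here */
        if (dropped && b.highest_prio <= task_prio)
            printf("target changed underneath us, retry\n");
        else
            printf("push task (prio %d) to the rq whose best prio is %d\n",
                   task_prio, b.highest_prio);
        pthread_mutex_unlock(&b.lock);
        pthread_mutex_unlock(&a.lock);
        return 0;
    }

The kernel version re-checks the task's runqueue, affinity, running state and on_rq flag for the same reason: any of them may have changed while rq->lock was not held.
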
757
758/*
 759 * If the current CPU has more than one RT task, see if a non-running
 760 * task can migrate over to a CPU that is running a task
761 * of lesser priority.
762 */
763static int push_rt_task(struct rq *rq)
764{
765 struct task_struct *next_task;
766 struct rq *lowest_rq;
767 int ret = 0;
768 int paranoid = RT_MAX_TRIES;
769
770 if (!rq->rt.overloaded)
771 return 0;
772
773 next_task = pick_next_highest_task_rt(rq, -1);
774 if (!next_task)
775 return 0;
776
777 retry:
778 if (unlikely(next_task == rq->curr)) {
779 WARN_ON(1);
780 return 0;
781 }
782
783 /*
 784 * It's possible that the next_task slipped in with a
785 * higher priority than current. If that's the case
786 * just reschedule current.
787 */
788 if (unlikely(next_task->prio < rq->curr->prio)) {
789 resched_task(rq->curr);
790 return 0;
791 }
792
793 /* We might release rq lock */
794 get_task_struct(next_task);
795
796 /* find_lock_lowest_rq locks the rq if found */
797 lowest_rq = find_lock_lowest_rq(next_task, rq);
798 if (!lowest_rq) {
799 struct task_struct *task;
800 /*
 801 * find_lock_lowest_rq releases rq->lock
802 * so it is possible that next_task has changed.
803 * If it has, then try again.
804 */
805 task = pick_next_highest_task_rt(rq, -1);
806 if (unlikely(task != next_task) && task && paranoid--) {
807 put_task_struct(next_task);
808 next_task = task;
809 goto retry;
810 }
811 goto out;
812 }
813
814 deactivate_task(rq, next_task, 0);
815 set_task_cpu(next_task, lowest_rq->cpu);
816 activate_task(lowest_rq, next_task, 0);
817
818 resched_task(lowest_rq->curr);
819
820 spin_unlock(&lowest_rq->lock);
821
822 ret = 1;
823out:
824 put_task_struct(next_task);
825
826 return ret;
827}
828
829/*
830 * TODO: Currently we just use the second highest prio task on
831 * the queue, and stop when it can't migrate (or there's
832 * no more RT tasks). There may be a case where a lower
833 * priority RT task has a different affinity than the
834 * higher RT task. In this case the lower RT task could
 835 * possibly be able to migrate whereas the higher priority
836 * RT task could not. We currently ignore this issue.
837 * Enhancements are welcome!
838 */
839static void push_rt_tasks(struct rq *rq)
840{
841 /* push_rt_task will return true if it moved an RT */
842 while (push_rt_task(rq))
843 ;
844}
845
846static int pull_rt_task(struct rq *this_rq)
847{
848 int this_cpu = this_rq->cpu, ret = 0, cpu;
849 struct task_struct *p, *next;
850 struct rq *src_rq;
851
852 if (likely(!rt_overloaded(this_rq)))
853 return 0;
854
855 next = pick_next_task_rt(this_rq);
856
857 for_each_cpu_mask(cpu, this_rq->rd->rto_mask) {
858 if (this_cpu == cpu)
859 continue;
860
861 src_rq = cpu_rq(cpu);
862 /*
863 * We can potentially drop this_rq's lock in
864 * double_lock_balance, and another CPU could
865 * steal our next task - hence we must cause
866 * the caller to recalculate the next task
867 * in that case:
868 */
869 if (double_lock_balance(this_rq, src_rq)) {
870 struct task_struct *old_next = next;
871
872 next = pick_next_task_rt(this_rq);
873 if (next != old_next)
874 ret = 1;
875 }
876
877 /*
878 * Are there still pullable RT tasks?
879 */
880 if (src_rq->rt.rt_nr_running <= 1)
881 goto skip;
882
883 p = pick_next_highest_task_rt(src_rq, this_cpu);
884
885 /*
886 * Do we have an RT task that preempts
887 * the to-be-scheduled task?
888 */
889 if (p && (!next || (p->prio < next->prio))) {
890 WARN_ON(p == src_rq->curr);
891 WARN_ON(!p->se.on_rq);
892
893 /*
894 * There's a chance that p is higher in priority
895 * than what's currently running on its cpu.
 896 * This can happen when p is just waking up and hasn't
897 * had a chance to schedule. We only pull
898 * p if it is lower in priority than the
899 * current task on the run queue or
 900 * this_rq's next task is lower in prio than
901 * the current task on that rq.
902 */
903 if (p->prio < src_rq->curr->prio ||
904 (next && next->prio < src_rq->curr->prio))
905 goto skip;
906
907 ret = 1;
908
909 deactivate_task(src_rq, p, 0);
910 set_task_cpu(p, this_cpu);
911 activate_task(this_rq, p, 0);
912 /*
913 * We continue with the search, just in
914 * case there's an even higher prio task
 915 * in another runqueue. (low likelihood
916 * but possible)
917 *
918 * Update next so that we won't pick a task
919 * on another cpu with a priority lower (or equal)
920 * than the one we just picked.
921 */
922 next = p;
923
924 }
925 skip:
926 spin_unlock(&src_rq->lock);
927 }
928
929 return ret;
930}
931
932static void pre_schedule_rt(struct rq *rq, struct task_struct *prev)
933{
934 /* Try to pull RT tasks here if we lower this rq's prio */
935 if (unlikely(rt_task(prev)) && rq->rt.highest_prio > prev->prio)
936 pull_rt_task(rq);
937}
938
939static void post_schedule_rt(struct rq *rq)
940{
941 /*
942 * If we have more than one rt_task queued, then
943 * see if we can push the other rt_tasks off to other CPUS.
944 * Note we may release the rq lock, and since
945 * the lock was owned by prev, we need to release it
 946 * first via finish_lock_switch and then reacquire it here.
947 */
948 if (unlikely(rq->rt.overloaded)) {
949 spin_lock_irq(&rq->lock);
950 push_rt_tasks(rq);
951 spin_unlock_irq(&rq->lock);
952 }
953}
954
955
956static void task_wake_up_rt(struct rq *rq, struct task_struct *p)
957{
958 if (!task_running(rq, p) &&
959 (p->prio >= rq->rt.highest_prio) &&
960 rq->rt.overloaded)
961 push_rt_tasks(rq);
172} 962}
173 963
174static unsigned long 964static unsigned long
@@ -177,36 +967,170 @@ load_balance_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
177 struct sched_domain *sd, enum cpu_idle_type idle, 967 struct sched_domain *sd, enum cpu_idle_type idle,
178 int *all_pinned, int *this_best_prio) 968 int *all_pinned, int *this_best_prio)
179{ 969{
180 struct rq_iterator rt_rq_iterator; 970 /* don't touch RT tasks */
181 971 return 0;
182 rt_rq_iterator.start = load_balance_start_rt;
183 rt_rq_iterator.next = load_balance_next_rt;
184 /* pass 'busiest' rq argument into
185 * load_balance_[start|next]_rt iterators
186 */
187 rt_rq_iterator.arg = busiest;
188
189 return balance_tasks(this_rq, this_cpu, busiest, max_load_move, sd,
190 idle, all_pinned, this_best_prio, &rt_rq_iterator);
191} 972}
192 973
193static int 974static int
194move_one_task_rt(struct rq *this_rq, int this_cpu, struct rq *busiest, 975move_one_task_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
195 struct sched_domain *sd, enum cpu_idle_type idle) 976 struct sched_domain *sd, enum cpu_idle_type idle)
196{ 977{
197 struct rq_iterator rt_rq_iterator; 978 /* don't touch RT tasks */
979 return 0;
980}
981
982static void set_cpus_allowed_rt(struct task_struct *p, cpumask_t *new_mask)
983{
984 int weight = cpus_weight(*new_mask);
198 985
199 rt_rq_iterator.start = load_balance_start_rt; 986 BUG_ON(!rt_task(p));
200 rt_rq_iterator.next = load_balance_next_rt; 987
201 rt_rq_iterator.arg = busiest; 988 /*
989 * Update the migration status of the RQ if we have an RT task
990 * which is running AND changing its weight value.
991 */
992 if (p->se.on_rq && (weight != p->rt.nr_cpus_allowed)) {
993 struct rq *rq = task_rq(p);
202 994
203 return iter_move_one_task(this_rq, this_cpu, busiest, sd, idle, 995 if ((p->rt.nr_cpus_allowed <= 1) && (weight > 1)) {
204 &rt_rq_iterator); 996 rq->rt.rt_nr_migratory++;
997 } else if ((p->rt.nr_cpus_allowed > 1) && (weight <= 1)) {
998 BUG_ON(!rq->rt.rt_nr_migratory);
999 rq->rt.rt_nr_migratory--;
1000 }
1001
1002 update_rt_migration(rq);
1003 }
1004
1005 p->cpus_allowed = *new_mask;
1006 p->rt.nr_cpus_allowed = weight;
205} 1007}
206#endif
207 1008
208static void task_tick_rt(struct rq *rq, struct task_struct *p) 1009/* Assumes rq->lock is held */
1010static void join_domain_rt(struct rq *rq)
209{ 1011{
1012 if (rq->rt.overloaded)
1013 rt_set_overload(rq);
1014}
1015
1016/* Assumes rq->lock is held */
1017static void leave_domain_rt(struct rq *rq)
1018{
1019 if (rq->rt.overloaded)
1020 rt_clear_overload(rq);
1021}
1022
1023/*
 1024 * When switching away from the rt queue, we bring ourselves to a position
 1025 * where we might want to pull RT tasks from other runqueues.
1026 */
1027static void switched_from_rt(struct rq *rq, struct task_struct *p,
1028 int running)
1029{
1030 /*
1031 * If there are other RT tasks then we will reschedule
1032 * and the scheduling of the other RT tasks will handle
1033 * the balancing. But if we are the last RT task
1034 * we may need to handle the pulling of RT tasks
1035 * now.
1036 */
1037 if (!rq->rt.rt_nr_running)
1038 pull_rt_task(rq);
1039}
1040#endif /* CONFIG_SMP */
1041
1042/*
1043 * When switching a task to RT, we may overload the runqueue
1044 * with RT tasks. In this case we try to push them off to
1045 * other runqueues.
1046 */
1047static void switched_to_rt(struct rq *rq, struct task_struct *p,
1048 int running)
1049{
1050 int check_resched = 1;
1051
1052 /*
1053 * If we are already running, then there's nothing
1054 * that needs to be done. But if we are not running
1055 * we may need to preempt the current running task.
1056 * If that current running task is also an RT task
1057 * then see if we can move to another run queue.
1058 */
1059 if (!running) {
1060#ifdef CONFIG_SMP
1061 if (rq->rt.overloaded && push_rt_task(rq) &&
1062 /* Don't resched if we changed runqueues */
1063 rq != task_rq(p))
1064 check_resched = 0;
1065#endif /* CONFIG_SMP */
1066 if (check_resched && p->prio < rq->curr->prio)
1067 resched_task(rq->curr);
1068 }
1069}
1070
1071/*
1072 * Priority of the task has changed. This may cause
1073 * us to initiate a push or pull.
1074 */
1075static void prio_changed_rt(struct rq *rq, struct task_struct *p,
1076 int oldprio, int running)
1077{
1078 if (running) {
1079#ifdef CONFIG_SMP
1080 /*
1081 * If our priority decreases while running, we
1082 * may need to pull tasks to this runqueue.
1083 */
1084 if (oldprio < p->prio)
1085 pull_rt_task(rq);
1086 /*
1087 * If there's a higher priority task waiting to run
1088 * then reschedule.
1089 */
1090 if (p->prio > rq->rt.highest_prio)
1091 resched_task(p);
1092#else
1093 /* For UP simply resched on drop of prio */
1094 if (oldprio < p->prio)
1095 resched_task(p);
1096#endif /* CONFIG_SMP */
1097 } else {
1098 /*
 1099 * This task is not running, but if its priority is
 1100 * higher than that of the current running task,
1101 * then reschedule.
1102 */
1103 if (p->prio < rq->curr->prio)
1104 resched_task(rq->curr);
1105 }
1106}
1107
1108static void watchdog(struct rq *rq, struct task_struct *p)
1109{
1110 unsigned long soft, hard;
1111
1112 if (!p->signal)
1113 return;
1114
1115 soft = p->signal->rlim[RLIMIT_RTTIME].rlim_cur;
1116 hard = p->signal->rlim[RLIMIT_RTTIME].rlim_max;
1117
1118 if (soft != RLIM_INFINITY) {
1119 unsigned long next;
1120
1121 p->rt.timeout++;
1122 next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ);
1123 if (p->rt.timeout > next)
1124 p->it_sched_expires = p->se.sum_exec_runtime;
1125 }
1126}
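
The watchdog above converts the RLIMIT_RTTIME budget, which is expressed in microseconds, into scheduler ticks: at HZ=1000 one tick is 1000 usecs, so a 950000 usec soft limit allows 950 ticks of p->rt.timeout before it_sched_expires is armed. A small worked example of that DIV_ROUND_UP conversion (HZ and the limit values below are made up):

    #include <stdio.h>

    #define HZ           1000
    #define USEC_PER_SEC 1000000UL
    #define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

    int main(void)
    {
        unsigned long soft = 950000;   /* RLIMIT_RTTIME soft limit, in usecs */
        unsigned long hard = 1000000;  /* hard limit, in usecs */
        unsigned long usec_per_tick = USEC_PER_SEC / HZ;
        unsigned long next = DIV_ROUND_UP(soft < hard ? soft : hard, usec_per_tick);

        /* once p->rt.timeout has counted past this many ticks of RT runtime,
         * the watchdog arms it_sched_expires so the limit is enforced on the
         * next posix-cpu-timer check */
        printf("%lu usecs at HZ=%d -> limit of %lu ticks\n", soft, HZ, next);
        return 0;
    }
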
1127
1128static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
1129{
1130 update_curr_rt(rq);
1131
1132 watchdog(rq, p);
1133
210 /* 1134 /*
211 * RR tasks need a special form of timeslice management. 1135 * RR tasks need a special form of timeslice management.
212 * FIFO tasks have no timeslices. 1136 * FIFO tasks have no timeslices.
@@ -214,16 +1138,16 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p)
214 if (p->policy != SCHED_RR) 1138 if (p->policy != SCHED_RR)
215 return; 1139 return;
216 1140
217 if (--p->time_slice) 1141 if (--p->rt.time_slice)
218 return; 1142 return;
219 1143
220 p->time_slice = DEF_TIMESLICE; 1144 p->rt.time_slice = DEF_TIMESLICE;
221 1145
222 /* 1146 /*
223 * Requeue to the end of queue if we are not the only element 1147 * Requeue to the end of queue if we are not the only element
224 * on the queue: 1148 * on the queue:
225 */ 1149 */
226 if (p->run_list.prev != p->run_list.next) { 1150 if (p->rt.run_list.prev != p->rt.run_list.next) {
227 requeue_task_rt(rq, p); 1151 requeue_task_rt(rq, p);
228 set_tsk_need_resched(p); 1152 set_tsk_need_resched(p);
229 } 1153 }
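
For SCHED_RR the tick handler decrements the per-task slice; when it reaches zero the slice is refilled with DEF_TIMESLICE and the task is requeued to the tail of its priority list, provided it is not alone on that list. A compact simulation of two equal-priority round-robin tasks sharing a CPU (DEF_TIMESLICE and the number of ticks are arbitrary):

    #include <stdio.h>

    #define DEF_TIMESLICE 4            /* ticks per round-robin slice (made up) */

    struct task_sketch { const char *name; int time_slice; };

    int main(void)
    {
        struct task_sketch tasks[2] = { { "A", DEF_TIMESLICE }, { "B", DEF_TIMESLICE } };
        int curr = 0, tick;

        for (tick = 1; tick <= 12; tick++) {
            struct task_sketch *p = &tasks[curr];

            if (--p->time_slice)
                continue;              /* slice not used up yet: keep running */

            /* slice expired: refill it and requeue to the tail, so the other
             * equal-priority task runs next (the "not alone on the queue" case) */
            p->time_slice = DEF_TIMESLICE;
            printf("tick %2d: %s expired, switching to %s\n",
                   tick, p->name, tasks[!curr].name);
            curr = !curr;
        }
        return 0;
    }
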
@@ -241,6 +1165,9 @@ const struct sched_class rt_sched_class = {
241 .enqueue_task = enqueue_task_rt, 1165 .enqueue_task = enqueue_task_rt,
242 .dequeue_task = dequeue_task_rt, 1166 .dequeue_task = dequeue_task_rt,
243 .yield_task = yield_task_rt, 1167 .yield_task = yield_task_rt,
1168#ifdef CONFIG_SMP
1169 .select_task_rq = select_task_rq_rt,
1170#endif /* CONFIG_SMP */
244 1171
245 .check_preempt_curr = check_preempt_curr_rt, 1172 .check_preempt_curr = check_preempt_curr_rt,
246 1173
@@ -250,8 +1177,18 @@ const struct sched_class rt_sched_class = {
250#ifdef CONFIG_SMP 1177#ifdef CONFIG_SMP
251 .load_balance = load_balance_rt, 1178 .load_balance = load_balance_rt,
252 .move_one_task = move_one_task_rt, 1179 .move_one_task = move_one_task_rt,
1180 .set_cpus_allowed = set_cpus_allowed_rt,
1181 .join_domain = join_domain_rt,
1182 .leave_domain = leave_domain_rt,
1183 .pre_schedule = pre_schedule_rt,
1184 .post_schedule = post_schedule_rt,
1185 .task_wake_up = task_wake_up_rt,
1186 .switched_from = switched_from_rt,
253#endif 1187#endif
254 1188
255 .set_curr_task = set_curr_task_rt, 1189 .set_curr_task = set_curr_task_rt,
256 .task_tick = task_tick_rt, 1190 .task_tick = task_tick_rt,
1191
1192 .prio_changed = prio_changed_rt,
1193 .switched_to = switched_to_rt,
257}; 1194};
diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h
index ef1a7df80ea2..5b32433e7ee5 100644
--- a/kernel/sched_stats.h
+++ b/kernel/sched_stats.h
@@ -52,7 +52,8 @@ static int show_schedstat(struct seq_file *seq, void *v)
52 sd->lb_nobusyq[itype], 52 sd->lb_nobusyq[itype],
53 sd->lb_nobusyg[itype]); 53 sd->lb_nobusyg[itype]);
54 } 54 }
55 seq_printf(seq, " %u %u %u %u %u %u %u %u %u %u %u %u\n", 55 seq_printf(seq,
56 " %u %u %u %u %u %u %u %u %u %u %u %u\n",
56 sd->alb_count, sd->alb_failed, sd->alb_pushed, 57 sd->alb_count, sd->alb_failed, sd->alb_pushed,
57 sd->sbe_count, sd->sbe_balanced, sd->sbe_pushed, 58 sd->sbe_count, sd->sbe_balanced, sd->sbe_pushed,
58 sd->sbf_count, sd->sbf_balanced, sd->sbf_pushed, 59 sd->sbf_count, sd->sbf_balanced, sd->sbf_pushed,
@@ -127,7 +128,7 @@ rq_sched_info_depart(struct rq *rq, unsigned long long delta)
127# define schedstat_set(var, val) do { } while (0) 128# define schedstat_set(var, val) do { } while (0)
128#endif 129#endif
129 130
130#ifdef CONFIG_SCHEDSTATS 131#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
131/* 132/*
132 * Called when a process is dequeued from the active array and given 133 * Called when a process is dequeued from the active array and given
133 * the cpu. We should note that with the exception of interactive 134 * the cpu. We should note that with the exception of interactive
@@ -155,7 +156,7 @@ static inline void sched_info_dequeued(struct task_struct *t)
155 */ 156 */
156static void sched_info_arrive(struct task_struct *t) 157static void sched_info_arrive(struct task_struct *t)
157{ 158{
158 unsigned long long now = sched_clock(), delta = 0; 159 unsigned long long now = task_rq(t)->clock, delta = 0;
159 160
160 if (t->sched_info.last_queued) 161 if (t->sched_info.last_queued)
161 delta = now - t->sched_info.last_queued; 162 delta = now - t->sched_info.last_queued;
@@ -186,7 +187,7 @@ static inline void sched_info_queued(struct task_struct *t)
186{ 187{
187 if (unlikely(sched_info_on())) 188 if (unlikely(sched_info_on()))
188 if (!t->sched_info.last_queued) 189 if (!t->sched_info.last_queued)
189 t->sched_info.last_queued = sched_clock(); 190 t->sched_info.last_queued = task_rq(t)->clock;
190} 191}
191 192
192/* 193/*
@@ -195,7 +196,8 @@ static inline void sched_info_queued(struct task_struct *t)
195 */ 196 */
196static inline void sched_info_depart(struct task_struct *t) 197static inline void sched_info_depart(struct task_struct *t)
197{ 198{
198 unsigned long long delta = sched_clock() - t->sched_info.last_arrival; 199 unsigned long long delta = task_rq(t)->clock -
200 t->sched_info.last_arrival;
199 201
200 t->sched_info.cpu_time += delta; 202 t->sched_info.cpu_time += delta;
201 rq_sched_info_depart(task_rq(t), delta); 203 rq_sched_info_depart(task_rq(t), delta);
@@ -231,5 +233,5 @@ sched_info_switch(struct task_struct *prev, struct task_struct *next)
231#else 233#else
232#define sched_info_queued(t) do { } while (0) 234#define sched_info_queued(t) do { } while (0)
233#define sched_info_switch(t, next) do { } while (0) 235#define sched_info_switch(t, next) do { } while (0)
234#endif /* CONFIG_SCHEDSTATS */ 236#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */
235 237
diff --git a/kernel/signal.c b/kernel/signal.c
index 12006308c7eb..4333b6dbb424 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -55,7 +55,7 @@ static int sig_ignored(struct task_struct *t, int sig)
55 * signal handler may change by the time it is 55 * signal handler may change by the time it is
56 * unblocked. 56 * unblocked.
57 */ 57 */
58 if (sigismember(&t->blocked, sig)) 58 if (sigismember(&t->blocked, sig) || sigismember(&t->real_blocked, sig))
59 return 0; 59 return 0;
60 60
61 /* Is it explicitly or implicitly ignored? */ 61 /* Is it explicitly or implicitly ignored? */
@@ -124,7 +124,7 @@ void recalc_sigpending_and_wake(struct task_struct *t)
124 124
125void recalc_sigpending(void) 125void recalc_sigpending(void)
126{ 126{
127 if (!recalc_sigpending_tsk(current)) 127 if (!recalc_sigpending_tsk(current) && !freezing(current))
128 clear_thread_flag(TIF_SIGPENDING); 128 clear_thread_flag(TIF_SIGPENDING);
129 129
130} 130}
@@ -456,15 +456,15 @@ void signal_wake_up(struct task_struct *t, int resume)
456 set_tsk_thread_flag(t, TIF_SIGPENDING); 456 set_tsk_thread_flag(t, TIF_SIGPENDING);
457 457
458 /* 458 /*
459 * For SIGKILL, we want to wake it up in the stopped/traced case. 459 * For SIGKILL, we want to wake it up in the stopped/traced/killable
460 * We don't check t->state here because there is a race with it 460 * case. We don't check t->state here because there is a race with it
461 * executing another processor and just now entering stopped state. 461 * executing another processor and just now entering stopped state.
462 * By using wake_up_state, we ensure the process will wake up and 462 * By using wake_up_state, we ensure the process will wake up and
463 * handle its death signal. 463 * handle its death signal.
464 */ 464 */
465 mask = TASK_INTERRUPTIBLE; 465 mask = TASK_INTERRUPTIBLE;
466 if (resume) 466 if (resume)
467 mask |= TASK_STOPPED | TASK_TRACED; 467 mask |= TASK_WAKEKILL;
468 if (!wake_up_state(t, mask)) 468 if (!wake_up_state(t, mask))
469 kick_process(t); 469 kick_process(t);
470} 470}
@@ -620,7 +620,7 @@ static void handle_stop_signal(int sig, struct task_struct *p)
620 * Wake up the stopped thread _after_ setting 620 * Wake up the stopped thread _after_ setting
621 * TIF_SIGPENDING 621 * TIF_SIGPENDING
622 */ 622 */
623 state = TASK_STOPPED; 623 state = __TASK_STOPPED;
624 if (sig_user_defined(t, SIGCONT) && !sigismember(&t->blocked, SIGCONT)) { 624 if (sig_user_defined(t, SIGCONT) && !sigismember(&t->blocked, SIGCONT)) {
625 set_tsk_thread_flag(t, TIF_SIGPENDING); 625 set_tsk_thread_flag(t, TIF_SIGPENDING);
626 state |= TASK_INTERRUPTIBLE; 626 state |= TASK_INTERRUPTIBLE;
@@ -732,14 +732,14 @@ static void print_fatal_signal(struct pt_regs *regs, int signr)
732 printk("%s/%d: potentially unexpected fatal signal %d.\n", 732 printk("%s/%d: potentially unexpected fatal signal %d.\n",
733 current->comm, task_pid_nr(current), signr); 733 current->comm, task_pid_nr(current), signr);
734 734
735#ifdef __i386__ 735#if defined(__i386__) && !defined(__arch_um__)
736 printk("code at %08lx: ", regs->eip); 736 printk("code at %08lx: ", regs->ip);
737 { 737 {
738 int i; 738 int i;
739 for (i = 0; i < 16; i++) { 739 for (i = 0; i < 16; i++) {
740 unsigned char insn; 740 unsigned char insn;
741 741
742 __get_user(insn, (unsigned char *)(regs->eip + i)); 742 __get_user(insn, (unsigned char *)(regs->ip + i));
743 printk("%02x ", insn); 743 printk("%02x ", insn);
744 } 744 }
745 } 745 }
@@ -838,7 +838,7 @@ static inline int wants_signal(int sig, struct task_struct *p)
838 return 0; 838 return 0;
839 if (sig == SIGKILL) 839 if (sig == SIGKILL)
840 return 1; 840 return 1;
841 if (p->state & (TASK_STOPPED | TASK_TRACED)) 841 if (task_is_stopped_or_traced(p))
842 return 0; 842 return 0;
843 return task_curr(p) || !signal_pending(p); 843 return task_curr(p) || !signal_pending(p);
844} 844}
@@ -994,6 +994,12 @@ void zap_other_threads(struct task_struct *p)
994 } 994 }
995} 995}
996 996
997int fastcall __fatal_signal_pending(struct task_struct *tsk)
998{
999 return sigismember(&tsk->pending.signal, SIGKILL);
1000}
1001EXPORT_SYMBOL(__fatal_signal_pending);
1002
997/* 1003/*
998 * Must be called under rcu_read_lock() or with tasklist_lock read-held. 1004 * Must be called under rcu_read_lock() or with tasklist_lock read-held.
999 */ 1005 */
@@ -1441,7 +1447,7 @@ void do_notify_parent(struct task_struct *tsk, int sig)
1441 BUG_ON(sig == -1); 1447 BUG_ON(sig == -1);
1442 1448
1443 /* do_notify_parent_cldstop should have been called instead. */ 1449 /* do_notify_parent_cldstop should have been called instead. */
1444 BUG_ON(tsk->state & (TASK_STOPPED|TASK_TRACED)); 1450 BUG_ON(task_is_stopped_or_traced(tsk));
1445 1451
1446 BUG_ON(!tsk->ptrace && 1452 BUG_ON(!tsk->ptrace &&
1447 (tsk->group_leader != tsk || !thread_group_empty(tsk))); 1453 (tsk->group_leader != tsk || !thread_group_empty(tsk)));
@@ -1729,7 +1735,7 @@ static int do_signal_stop(int signr)
1729 * so this check has no races. 1735 * so this check has no races.
1730 */ 1736 */
1731 if (!t->exit_state && 1737 if (!t->exit_state &&
1732 !(t->state & (TASK_STOPPED|TASK_TRACED))) { 1738 !task_is_stopped_or_traced(t)) {
1733 stop_count++; 1739 stop_count++;
1734 signal_wake_up(t, 0); 1740 signal_wake_up(t, 0);
1735 } 1741 }
diff --git a/kernel/softirq.c b/kernel/softirq.c
index bd89bc4eb0b9..d7837d45419e 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -3,7 +3,9 @@
3 * 3 *
4 * Copyright (C) 1992 Linus Torvalds 4 * Copyright (C) 1992 Linus Torvalds
5 * 5 *
6 * Rewritten. Old one was good in 2.2, but in 2.3 it was immoral. --ANK (990903) 6 * Distribute under GPLv2.
7 *
8 * Rewritten. Old one was good in 2.2, but in 2.3 it was immoral. --ANK (990903)
7 */ 9 */
8 10
9#include <linux/module.h> 11#include <linux/module.h>
@@ -278,9 +280,14 @@ asmlinkage void do_softirq(void)
278 */ 280 */
279void irq_enter(void) 281void irq_enter(void)
280{ 282{
283#ifdef CONFIG_NO_HZ
284 int cpu = smp_processor_id();
285 if (idle_cpu(cpu) && !in_interrupt())
286 tick_nohz_stop_idle(cpu);
287#endif
281 __irq_enter(); 288 __irq_enter();
282#ifdef CONFIG_NO_HZ 289#ifdef CONFIG_NO_HZ
283 if (idle_cpu(smp_processor_id())) 290 if (idle_cpu(cpu))
284 tick_nohz_update_jiffies(); 291 tick_nohz_update_jiffies();
285#endif 292#endif
286} 293}
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index 11df812263c8..7c2da88db4ed 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -8,6 +8,7 @@
8 */ 8 */
9#include <linux/mm.h> 9#include <linux/mm.h>
10#include <linux/cpu.h> 10#include <linux/cpu.h>
11#include <linux/nmi.h>
11#include <linux/init.h> 12#include <linux/init.h>
12#include <linux/delay.h> 13#include <linux/delay.h>
13#include <linux/freezer.h> 14#include <linux/freezer.h>
@@ -23,8 +24,8 @@ static DEFINE_PER_CPU(unsigned long, touch_timestamp);
23static DEFINE_PER_CPU(unsigned long, print_timestamp); 24static DEFINE_PER_CPU(unsigned long, print_timestamp);
24static DEFINE_PER_CPU(struct task_struct *, watchdog_task); 25static DEFINE_PER_CPU(struct task_struct *, watchdog_task);
25 26
26static int did_panic; 27static int __read_mostly did_panic;
27int softlockup_thresh = 10; 28unsigned long __read_mostly softlockup_thresh = 60;
28 29
29static int 30static int
30softlock_panic(struct notifier_block *this, unsigned long event, void *ptr) 31softlock_panic(struct notifier_block *this, unsigned long event, void *ptr)
@@ -45,7 +46,7 @@ static struct notifier_block panic_block = {
45 */ 46 */
46static unsigned long get_timestamp(int this_cpu) 47static unsigned long get_timestamp(int this_cpu)
47{ 48{
48 return cpu_clock(this_cpu) >> 30; /* 2^30 ~= 10^9 */ 49 return cpu_clock(this_cpu) >> 30LL; /* 2^30 ~= 10^9 */
49} 50}
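
get_timestamp() turns the nanosecond cpu_clock() value into rough seconds with a shift rather than a division: 2^30 = 1073741824 is close to 10^9, so ns >> 30 comes out about 7% low, which is plenty accurate for a lockup threshold. A one-line check of the approximation:

    #include <stdio.h>

    int main(void)
    {
        unsigned long long ns = 65000000000ULL;      /* 65 seconds in nanoseconds */

        /* exact seconds vs. the cheap shift used by get_timestamp() */
        printf("ns/1e9 = %llu, ns >> 30 = %llu\n", ns / 1000000000ULL, ns >> 30);
        return 0;
    }
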
50 51
51void touch_softlockup_watchdog(void) 52void touch_softlockup_watchdog(void)
@@ -104,7 +105,7 @@ void softlockup_tick(void)
104 if (now > (touch_timestamp + 1)) 105 if (now > (touch_timestamp + 1))
105 wake_up_process(per_cpu(watchdog_task, this_cpu)); 106 wake_up_process(per_cpu(watchdog_task, this_cpu));
106 107
107 /* Warn about unreasonable 10+ seconds delays: */ 108 /* Warn about unreasonable delays: */
108 if (now <= (touch_timestamp + softlockup_thresh)) 109 if (now <= (touch_timestamp + softlockup_thresh))
109 return; 110 return;
110 111
@@ -122,11 +123,93 @@ void softlockup_tick(void)
122} 123}
123 124
124/* 125/*
126 * Have a reasonable limit on the number of tasks checked:
127 */
128unsigned long __read_mostly sysctl_hung_task_check_count = 1024;
129
130/*
131 * Zero means infinite timeout - no checking done:
132 */
133unsigned long __read_mostly sysctl_hung_task_timeout_secs = 120;
134
135unsigned long __read_mostly sysctl_hung_task_warnings = 10;
136
137/*
138 * Only do the hung-tasks check on one CPU:
139 */
140static int check_cpu __read_mostly = -1;
141
142static void check_hung_task(struct task_struct *t, unsigned long now)
143{
144 unsigned long switch_count = t->nvcsw + t->nivcsw;
145
146 if (t->flags & PF_FROZEN)
147 return;
148
149 if (switch_count != t->last_switch_count || !t->last_switch_timestamp) {
150 t->last_switch_count = switch_count;
151 t->last_switch_timestamp = now;
152 return;
153 }
154 if ((long)(now - t->last_switch_timestamp) <
155 sysctl_hung_task_timeout_secs)
156 return;
157 if (sysctl_hung_task_warnings < 0)
158 return;
159 sysctl_hung_task_warnings--;
160
161 /*
162 * Ok, the task did not get scheduled for more than 2 minutes,
163 * complain:
164 */
165 printk(KERN_ERR "INFO: task %s:%d blocked for more than "
166 "%ld seconds.\n", t->comm, t->pid,
167 sysctl_hung_task_timeout_secs);
168 printk(KERN_ERR "\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\""
169 " disables this message.\n");
170 sched_show_task(t);
171 __debug_show_held_locks(t);
172
173 t->last_switch_timestamp = now;
174 touch_nmi_watchdog();
175}
176
177/*
 178 * Check whether a TASK_UNINTERRUPTIBLE task does not get woken up for
179 * a really long time (120 seconds). If that happens, print out
180 * a warning.
181 */
182static void check_hung_uninterruptible_tasks(int this_cpu)
183{
184 int max_count = sysctl_hung_task_check_count;
185 unsigned long now = get_timestamp(this_cpu);
186 struct task_struct *g, *t;
187
188 /*
189 * If the system crashed already then all bets are off,
190 * do not report extra hung tasks:
191 */
192 if ((tainted & TAINT_DIE) || did_panic)
193 return;
194
195 read_lock(&tasklist_lock);
196 do_each_thread(g, t) {
197 if (!--max_count)
198 goto unlock;
199 if (t->state & TASK_UNINTERRUPTIBLE)
200 check_hung_task(t, now);
201 } while_each_thread(g, t);
202 unlock:
203 read_unlock(&tasklist_lock);
204}
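
The hung-task detector keys off the task's combined voluntary and involuntary context-switch count: if that count has not moved for sysctl_hung_task_timeout_secs while the task sits in TASK_UNINTERRUPTIBLE, a warning is emitted. A stripped-down model of the check_hung_task() bookkeeping, with plain integers standing in for the task_struct fields:

    #include <stdio.h>

    #define HUNG_TIMEOUT_SECS 120      /* mirrors sysctl_hung_task_timeout_secs */

    struct hung_state {
        unsigned long last_switch_count;
        unsigned long last_switch_timestamp;
    };

    /* returns 1 when the task should be reported as hung */
    static int check_hung(struct hung_state *s, unsigned long switch_count,
                          unsigned long now)
    {
        if (switch_count != s->last_switch_count || !s->last_switch_timestamp) {
            /* the task did schedule (or was never seen): restart the clock */
            s->last_switch_count = switch_count;
            s->last_switch_timestamp = now;
            return 0;
        }
        return (long)(now - s->last_switch_timestamp) >= HUNG_TIMEOUT_SECS;
    }

    int main(void)
    {
        struct hung_state s = { 0, 0 };

        printf("%d\n", check_hung(&s, 42, 1000));   /* first sighting: 0 */
        printf("%d\n", check_hung(&s, 42, 1100));   /* stuck 100s:     0 */
        printf("%d\n", check_hung(&s, 42, 1130));   /* stuck 130s:     1 */
        printf("%d\n", check_hung(&s, 43, 1140));   /* it scheduled:   0 */
        return 0;
    }

The third call trips the report because the switch count stayed at 42 for 130 seconds; the fourth resets the clock because the task scheduled again.
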
205
206/*
125 * The watchdog thread - runs every second and touches the timestamp. 207 * The watchdog thread - runs every second and touches the timestamp.
126 */ 208 */
127static int watchdog(void *__bind_cpu) 209static int watchdog(void *__bind_cpu)
128{ 210{
129 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; 211 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
212 int this_cpu = (long)__bind_cpu;
130 213
131 sched_setscheduler(current, SCHED_FIFO, &param); 214 sched_setscheduler(current, SCHED_FIFO, &param);
132 215
@@ -135,13 +218,23 @@ static int watchdog(void *__bind_cpu)
135 218
136 /* 219 /*
137 * Run briefly once per second to reset the softlockup timestamp. 220 * Run briefly once per second to reset the softlockup timestamp.
138 * If this gets delayed for more than 10 seconds then the 221 * If this gets delayed for more than 60 seconds then the
139 * debug-printout triggers in softlockup_tick(). 222 * debug-printout triggers in softlockup_tick().
140 */ 223 */
141 while (!kthread_should_stop()) { 224 while (!kthread_should_stop()) {
142 set_current_state(TASK_INTERRUPTIBLE); 225 set_current_state(TASK_INTERRUPTIBLE);
143 touch_softlockup_watchdog(); 226 touch_softlockup_watchdog();
144 schedule(); 227 schedule();
228
229 if (kthread_should_stop())
230 break;
231
232 if (this_cpu != check_cpu)
233 continue;
234
235 if (sysctl_hung_task_timeout_secs)
236 check_hung_uninterruptible_tasks(this_cpu);
237
145 } 238 }
146 239
147 return 0; 240 return 0;
@@ -171,9 +264,20 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
171 break; 264 break;
172 case CPU_ONLINE: 265 case CPU_ONLINE:
173 case CPU_ONLINE_FROZEN: 266 case CPU_ONLINE_FROZEN:
267 check_cpu = any_online_cpu(cpu_online_map);
174 wake_up_process(per_cpu(watchdog_task, hotcpu)); 268 wake_up_process(per_cpu(watchdog_task, hotcpu));
175 break; 269 break;
176#ifdef CONFIG_HOTPLUG_CPU 270#ifdef CONFIG_HOTPLUG_CPU
271 case CPU_DOWN_PREPARE:
272 case CPU_DOWN_PREPARE_FROZEN:
273 if (hotcpu == check_cpu) {
274 cpumask_t temp_cpu_online_map = cpu_online_map;
275
276 cpu_clear(hotcpu, temp_cpu_online_map);
277 check_cpu = any_online_cpu(temp_cpu_online_map);
278 }
279 break;
280
177 case CPU_UP_CANCELED: 281 case CPU_UP_CANCELED:
178 case CPU_UP_CANCELED_FROZEN: 282 case CPU_UP_CANCELED_FROZEN:
179 if (!per_cpu(watchdog_task, hotcpu)) 283 if (!per_cpu(watchdog_task, hotcpu))
diff --git a/kernel/spinlock.c b/kernel/spinlock.c
index cd72424c2662..ae28c8245123 100644
--- a/kernel/spinlock.c
+++ b/kernel/spinlock.c
@@ -65,8 +65,7 @@ EXPORT_SYMBOL(_write_trylock);
65 * even on CONFIG_PREEMPT, because lockdep assumes that interrupts are 65 * even on CONFIG_PREEMPT, because lockdep assumes that interrupts are
66 * not re-enabled during lock-acquire (which the preempt-spin-ops do): 66 * not re-enabled during lock-acquire (which the preempt-spin-ops do):
67 */ 67 */
68#if !defined(CONFIG_PREEMPT) || !defined(CONFIG_SMP) || \ 68#if !defined(CONFIG_GENERIC_LOCKBREAK) || defined(CONFIG_DEBUG_LOCK_ALLOC)
69 defined(CONFIG_DEBUG_LOCK_ALLOC)
70 69
71void __lockfunc _read_lock(rwlock_t *lock) 70void __lockfunc _read_lock(rwlock_t *lock)
72{ 71{
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 319821ef78af..51b5ee53571a 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -203,13 +203,13 @@ int stop_machine_run(int (*fn)(void *), void *data, unsigned int cpu)
203 int ret; 203 int ret;
204 204
205 /* No CPUs can come up or down during this. */ 205 /* No CPUs can come up or down during this. */
206 lock_cpu_hotplug(); 206 get_online_cpus();
207 p = __stop_machine_run(fn, data, cpu); 207 p = __stop_machine_run(fn, data, cpu);
208 if (!IS_ERR(p)) 208 if (!IS_ERR(p))
209 ret = kthread_stop(p); 209 ret = kthread_stop(p);
210 else 210 else
211 ret = PTR_ERR(p); 211 ret = PTR_ERR(p);
212 unlock_cpu_hotplug(); 212 put_online_cpus();
213 213
214 return ret; 214 return ret;
215} 215}
diff --git a/kernel/sys.c b/kernel/sys.c
index 304b5410d746..d1fe71eb4546 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1750,7 +1750,7 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
1750} 1750}
1751 1751
1752asmlinkage long sys_getcpu(unsigned __user *cpup, unsigned __user *nodep, 1752asmlinkage long sys_getcpu(unsigned __user *cpup, unsigned __user *nodep,
1753 struct getcpu_cache __user *cache) 1753 struct getcpu_cache __user *unused)
1754{ 1754{
1755 int err = 0; 1755 int err = 0;
1756 int cpu = raw_smp_processor_id(); 1756 int cpu = raw_smp_processor_id();
@@ -1758,24 +1758,6 @@ asmlinkage long sys_getcpu(unsigned __user *cpup, unsigned __user *nodep,
1758 err |= put_user(cpu, cpup); 1758 err |= put_user(cpu, cpup);
1759 if (nodep) 1759 if (nodep)
1760 err |= put_user(cpu_to_node(cpu), nodep); 1760 err |= put_user(cpu_to_node(cpu), nodep);
1761 if (cache) {
1762 /*
1763 * The cache is not needed for this implementation,
1764 * but make sure user programs pass something
1765 * valid. vsyscall implementations can instead make
1766 * good use of the cache. Only use t0 and t1 because
1767 * these are available in both 32bit and 64bit ABI (no
1768 * need for a compat_getcpu). 32bit has enough
1769 * padding
1770 */
1771 unsigned long t0, t1;
1772 get_user(t0, &cache->blob[0]);
1773 get_user(t1, &cache->blob[1]);
1774 t0++;
1775 t1++;
1776 put_user(t0, &cache->blob[0]);
1777 put_user(t1, &cache->blob[1]);
1778 }
1779 return err ? -EFAULT : 0; 1761 return err ? -EFAULT : 0;
1780} 1762}
1781 1763
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 52c7a151e298..beee5b3b68a2 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -40,10 +40,14 @@ cond_syscall(sys_recvfrom);
40cond_syscall(sys_recv); 40cond_syscall(sys_recv);
41cond_syscall(sys_socket); 41cond_syscall(sys_socket);
42cond_syscall(sys_setsockopt); 42cond_syscall(sys_setsockopt);
43cond_syscall(compat_sys_setsockopt);
43cond_syscall(sys_getsockopt); 44cond_syscall(sys_getsockopt);
45cond_syscall(compat_sys_getsockopt);
44cond_syscall(sys_shutdown); 46cond_syscall(sys_shutdown);
45cond_syscall(sys_sendmsg); 47cond_syscall(sys_sendmsg);
48cond_syscall(compat_sys_sendmsg);
46cond_syscall(sys_recvmsg); 49cond_syscall(sys_recvmsg);
50cond_syscall(compat_sys_recvmsg);
47cond_syscall(sys_socketcall); 51cond_syscall(sys_socketcall);
48cond_syscall(sys_futex); 52cond_syscall(sys_futex);
49cond_syscall(compat_sys_futex); 53cond_syscall(compat_sys_futex);
@@ -127,6 +131,7 @@ cond_syscall(sys32_sysctl);
127cond_syscall(ppc_rtas); 131cond_syscall(ppc_rtas);
128cond_syscall(sys_spu_run); 132cond_syscall(sys_spu_run);
129cond_syscall(sys_spu_create); 133cond_syscall(sys_spu_create);
134cond_syscall(sys_subpage_prot);
130 135
131/* mmu depending weak syscall entries */ 136/* mmu depending weak syscall entries */
132cond_syscall(sys_mprotect); 137cond_syscall(sys_mprotect);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 3b4efbe26445..7cb1ac3e6fff 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -53,6 +53,7 @@
53#ifdef CONFIG_X86 53#ifdef CONFIG_X86
54#include <asm/nmi.h> 54#include <asm/nmi.h>
55#include <asm/stacktrace.h> 55#include <asm/stacktrace.h>
56#include <asm/io.h>
56#endif 57#endif
57 58
58static int deprecated_sysctl_warning(struct __sysctl_args *args); 59static int deprecated_sysctl_warning(struct __sysctl_args *args);
@@ -80,7 +81,7 @@ extern int percpu_pagelist_fraction;
80extern int compat_log; 81extern int compat_log;
81extern int maps_protect; 82extern int maps_protect;
82extern int sysctl_stat_interval; 83extern int sysctl_stat_interval;
83extern int audit_argv_kb; 84extern int latencytop_enabled;
84 85
85/* Constants used for minimum and maximum */ 86/* Constants used for minimum and maximum */
86#ifdef CONFIG_DETECT_SOFTLOCKUP 87#ifdef CONFIG_DETECT_SOFTLOCKUP
@@ -156,8 +157,16 @@ static int proc_dointvec_taint(struct ctl_table *table, int write, struct file *
156#endif 157#endif
157 158
158static struct ctl_table root_table[]; 159static struct ctl_table root_table[];
159static struct ctl_table_header root_table_header = 160static struct ctl_table_root sysctl_table_root;
160 { root_table, LIST_HEAD_INIT(root_table_header.ctl_entry) }; 161static struct ctl_table_header root_table_header = {
162 .ctl_table = root_table,
163 .ctl_entry = LIST_HEAD_INIT(sysctl_table_root.header_list),
164 .root = &sysctl_table_root,
165};
166static struct ctl_table_root sysctl_table_root = {
167 .root_list = LIST_HEAD_INIT(sysctl_table_root.root_list),
168 .header_list = LIST_HEAD_INIT(root_table_header.ctl_entry),
169};
161 170
162static struct ctl_table kern_table[]; 171static struct ctl_table kern_table[];
163static struct ctl_table vm_table[]; 172static struct ctl_table vm_table[];
@@ -191,14 +200,6 @@ static struct ctl_table root_table[] = {
191 .mode = 0555, 200 .mode = 0555,
192 .child = vm_table, 201 .child = vm_table,
193 }, 202 },
194#ifdef CONFIG_NET
195 {
196 .ctl_name = CTL_NET,
197 .procname = "net",
198 .mode = 0555,
199 .child = net_table,
200 },
201#endif
202 { 203 {
203 .ctl_name = CTL_FS, 204 .ctl_name = CTL_FS,
204 .procname = "fs", 205 .procname = "fs",
@@ -225,21 +226,24 @@ static struct ctl_table root_table[] = {
225}; 226};
226 227
227#ifdef CONFIG_SCHED_DEBUG 228#ifdef CONFIG_SCHED_DEBUG
228static unsigned long min_sched_granularity_ns = 100000; /* 100 usecs */ 229static int min_sched_granularity_ns = 100000; /* 100 usecs */
229static unsigned long max_sched_granularity_ns = 1000000000; /* 1 second */ 230static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */
230static unsigned long min_wakeup_granularity_ns; /* 0 usecs */ 231static int min_wakeup_granularity_ns; /* 0 usecs */
231static unsigned long max_wakeup_granularity_ns = 1000000000; /* 1 second */ 232static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */
232#endif 233#endif
233 234
234static struct ctl_table kern_table[] = { 235static struct ctl_table kern_table[] = {
235#ifdef CONFIG_SCHED_DEBUG 236#ifdef CONFIG_SCHED_DEBUG
236 { 237 {
237 .ctl_name = CTL_UNNUMBERED, 238 .ctl_name = CTL_UNNUMBERED,
238 .procname = "sched_nr_latency", 239 .procname = "sched_min_granularity_ns",
239 .data = &sysctl_sched_nr_latency, 240 .data = &sysctl_sched_min_granularity,
240 .maxlen = sizeof(unsigned int), 241 .maxlen = sizeof(unsigned int),
241 .mode = 0644, 242 .mode = 0644,
242 .proc_handler = &proc_dointvec, 243 .proc_handler = &sched_nr_latency_handler,
244 .strategy = &sysctl_intvec,
245 .extra1 = &min_sched_granularity_ns,
246 .extra2 = &max_sched_granularity_ns,
243 }, 247 },
244 { 248 {
245 .ctl_name = CTL_UNNUMBERED, 249 .ctl_name = CTL_UNNUMBERED,
@@ -247,7 +251,7 @@ static struct ctl_table kern_table[] = {
247 .data = &sysctl_sched_latency, 251 .data = &sysctl_sched_latency,
248 .maxlen = sizeof(unsigned int), 252 .maxlen = sizeof(unsigned int),
249 .mode = 0644, 253 .mode = 0644,
250 .proc_handler = &proc_dointvec_minmax, 254 .proc_handler = &sched_nr_latency_handler,
251 .strategy = &sysctl_intvec, 255 .strategy = &sysctl_intvec,
252 .extra1 = &min_sched_granularity_ns, 256 .extra1 = &min_sched_granularity_ns,
253 .extra2 = &max_sched_granularity_ns, 257 .extra2 = &max_sched_granularity_ns,
@@ -298,6 +302,48 @@ static struct ctl_table kern_table[] = {
298 .mode = 0644, 302 .mode = 0644,
299 .proc_handler = &proc_dointvec, 303 .proc_handler = &proc_dointvec,
300 }, 304 },
305 {
306 .ctl_name = CTL_UNNUMBERED,
307 .procname = "sched_nr_migrate",
308 .data = &sysctl_sched_nr_migrate,
309 .maxlen = sizeof(unsigned int),
310 .mode = 0644,
311 .proc_handler = &proc_dointvec,
312 },
313 {
314 .ctl_name = CTL_UNNUMBERED,
315 .procname = "sched_rt_period_ms",
316 .data = &sysctl_sched_rt_period,
317 .maxlen = sizeof(unsigned int),
318 .mode = 0644,
319 .proc_handler = &proc_dointvec,
320 },
321 {
322 .ctl_name = CTL_UNNUMBERED,
323 .procname = "sched_rt_ratio",
324 .data = &sysctl_sched_rt_ratio,
325 .maxlen = sizeof(unsigned int),
326 .mode = 0644,
327 .proc_handler = &proc_dointvec,
328 },
329#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
330 {
331 .ctl_name = CTL_UNNUMBERED,
332 .procname = "sched_min_bal_int_shares",
333 .data = &sysctl_sched_min_bal_int_shares,
334 .maxlen = sizeof(unsigned int),
335 .mode = 0644,
336 .proc_handler = &proc_dointvec,
337 },
338 {
339 .ctl_name = CTL_UNNUMBERED,
340 .procname = "sched_max_bal_int_shares",
341 .data = &sysctl_sched_max_bal_int_shares,
342 .maxlen = sizeof(unsigned int),
343 .mode = 0644,
344 .proc_handler = &proc_dointvec,
345 },
346#endif
301#endif 347#endif
302 { 348 {
303 .ctl_name = CTL_UNNUMBERED, 349 .ctl_name = CTL_UNNUMBERED,
@@ -343,16 +389,6 @@ static struct ctl_table kern_table[] = {
343 .mode = 0644, 389 .mode = 0644,
344 .proc_handler = &proc_dointvec, 390 .proc_handler = &proc_dointvec,
345 }, 391 },
346#ifdef CONFIG_AUDITSYSCALL
347 {
348 .ctl_name = CTL_UNNUMBERED,
349 .procname = "audit_argv_kb",
350 .data = &audit_argv_kb,
351 .maxlen = sizeof(int),
352 .mode = 0644,
353 .proc_handler = &proc_dointvec,
354 },
355#endif
356 { 392 {
357 .ctl_name = KERN_CORE_PATTERN, 393 .ctl_name = KERN_CORE_PATTERN,
358 .procname = "core_pattern", 394 .procname = "core_pattern",
@@ -371,6 +407,15 @@ static struct ctl_table kern_table[] = {
371 .proc_handler = &proc_dointvec_taint, 407 .proc_handler = &proc_dointvec_taint,
372 }, 408 },
373#endif 409#endif
410#ifdef CONFIG_LATENCYTOP
411 {
412 .procname = "latencytop",
413 .data = &latencytop_enabled,
414 .maxlen = sizeof(int),
415 .mode = 0644,
416 .proc_handler = &proc_dointvec,
417 },
418#endif
374#ifdef CONFIG_SECURITY_CAPABILITIES 419#ifdef CONFIG_SECURITY_CAPABILITIES
375 { 420 {
376 .procname = "cap-bound", 421 .procname = "cap-bound",
@@ -672,6 +717,14 @@ static struct ctl_table kern_table[] = {
672 .mode = 0644, 717 .mode = 0644,
673 .proc_handler = &proc_dointvec, 718 .proc_handler = &proc_dointvec,
674 }, 719 },
720 {
721 .ctl_name = CTL_UNNUMBERED,
722 .procname = "io_delay_type",
723 .data = &io_delay_type,
724 .maxlen = sizeof(int),
725 .mode = 0644,
726 .proc_handler = &proc_dointvec,
727 },
675#endif 728#endif
676#if defined(CONFIG_MMU) 729#if defined(CONFIG_MMU)
677 { 730 {
@@ -717,13 +770,40 @@ static struct ctl_table kern_table[] = {
717 .ctl_name = CTL_UNNUMBERED, 770 .ctl_name = CTL_UNNUMBERED,
718 .procname = "softlockup_thresh", 771 .procname = "softlockup_thresh",
719 .data = &softlockup_thresh, 772 .data = &softlockup_thresh,
720 .maxlen = sizeof(int), 773 .maxlen = sizeof(unsigned long),
721 .mode = 0644, 774 .mode = 0644,
722 .proc_handler = &proc_dointvec_minmax, 775 .proc_handler = &proc_doulongvec_minmax,
723 .strategy = &sysctl_intvec, 776 .strategy = &sysctl_intvec,
724 .extra1 = &one, 777 .extra1 = &one,
725 .extra2 = &sixty, 778 .extra2 = &sixty,
726 }, 779 },
780 {
781 .ctl_name = CTL_UNNUMBERED,
782 .procname = "hung_task_check_count",
783 .data = &sysctl_hung_task_check_count,
784 .maxlen = sizeof(unsigned long),
785 .mode = 0644,
786 .proc_handler = &proc_doulongvec_minmax,
787 .strategy = &sysctl_intvec,
788 },
789 {
790 .ctl_name = CTL_UNNUMBERED,
791 .procname = "hung_task_timeout_secs",
792 .data = &sysctl_hung_task_timeout_secs,
793 .maxlen = sizeof(unsigned long),
794 .mode = 0644,
795 .proc_handler = &proc_doulongvec_minmax,
796 .strategy = &sysctl_intvec,
797 },
798 {
799 .ctl_name = CTL_UNNUMBERED,
800 .procname = "hung_task_warnings",
801 .data = &sysctl_hung_task_warnings,
802 .maxlen = sizeof(unsigned long),
803 .mode = 0644,
804 .proc_handler = &proc_doulongvec_minmax,
805 .strategy = &sysctl_intvec,
806 },
727#endif 807#endif
728#ifdef CONFIG_COMPAT 808#ifdef CONFIG_COMPAT
729 { 809 {
@@ -895,11 +975,11 @@ static struct ctl_table vm_table[] = {
895 }, 975 },
896 { 976 {
897 .ctl_name = CTL_UNNUMBERED, 977 .ctl_name = CTL_UNNUMBERED,
898 .procname = "hugetlb_dynamic_pool", 978 .procname = "nr_overcommit_hugepages",
899 .data = &hugetlb_dynamic_pool, 979 .data = &nr_overcommit_huge_pages,
900 .maxlen = sizeof(hugetlb_dynamic_pool), 980 .maxlen = sizeof(nr_overcommit_huge_pages),
901 .mode = 0644, 981 .mode = 0644,
902 .proc_handler = &proc_dointvec, 982 .proc_handler = &proc_doulongvec_minmax,
903 }, 983 },
904#endif 984#endif
905 { 985 {
@@ -1289,12 +1369,27 @@ void sysctl_head_finish(struct ctl_table_header *head)
1289 spin_unlock(&sysctl_lock); 1369 spin_unlock(&sysctl_lock);
1290} 1370}
1291 1371
1292struct ctl_table_header *sysctl_head_next(struct ctl_table_header *prev) 1372static struct list_head *
1373lookup_header_list(struct ctl_table_root *root, struct nsproxy *namespaces)
1374{
1375 struct list_head *header_list;
1376 header_list = &root->header_list;
1377 if (root->lookup)
1378 header_list = root->lookup(root, namespaces);
1379 return header_list;
1380}
1381
1382struct ctl_table_header *__sysctl_head_next(struct nsproxy *namespaces,
1383 struct ctl_table_header *prev)
1293{ 1384{
1385 struct ctl_table_root *root;
1386 struct list_head *header_list;
1294 struct ctl_table_header *head; 1387 struct ctl_table_header *head;
1295 struct list_head *tmp; 1388 struct list_head *tmp;
1389
1296 spin_lock(&sysctl_lock); 1390 spin_lock(&sysctl_lock);
1297 if (prev) { 1391 if (prev) {
1392 head = prev;
1298 tmp = &prev->ctl_entry; 1393 tmp = &prev->ctl_entry;
1299 unuse_table(prev); 1394 unuse_table(prev);
1300 goto next; 1395 goto next;
@@ -1308,14 +1403,38 @@ struct ctl_table_header *sysctl_head_next(struct ctl_table_header *prev)
1308 spin_unlock(&sysctl_lock); 1403 spin_unlock(&sysctl_lock);
1309 return head; 1404 return head;
1310 next: 1405 next:
1406 root = head->root;
1311 tmp = tmp->next; 1407 tmp = tmp->next;
1312 if (tmp == &root_table_header.ctl_entry) 1408 header_list = lookup_header_list(root, namespaces);
1313 break; 1409 if (tmp != header_list)
1410 continue;
1411
1412 do {
1413 root = list_entry(root->root_list.next,
1414 struct ctl_table_root, root_list);
1415 if (root == &sysctl_table_root)
1416 goto out;
1417 header_list = lookup_header_list(root, namespaces);
1418 } while (list_empty(header_list));
1419 tmp = header_list->next;
1314 } 1420 }
1421out:
1315 spin_unlock(&sysctl_lock); 1422 spin_unlock(&sysctl_lock);
1316 return NULL; 1423 return NULL;
1317} 1424}
1318 1425
1426struct ctl_table_header *sysctl_head_next(struct ctl_table_header *prev)
1427{
1428 return __sysctl_head_next(current->nsproxy, prev);
1429}
1430
1431void register_sysctl_root(struct ctl_table_root *root)
1432{
1433 spin_lock(&sysctl_lock);
1434 list_add_tail(&root->root_list, &sysctl_table_root.root_list);
1435 spin_unlock(&sysctl_lock);
1436}
1437
1319#ifdef CONFIG_SYSCTL_SYSCALL 1438#ifdef CONFIG_SYSCTL_SYSCALL
1320int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp, 1439int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp,
1321 void __user *newval, size_t newlen) 1440 void __user *newval, size_t newlen)
@@ -1472,18 +1591,21 @@ static __init int sysctl_init(void)
1472{ 1591{
1473 int err; 1592 int err;
1474 sysctl_set_parent(NULL, root_table); 1593 sysctl_set_parent(NULL, root_table);
1475 err = sysctl_check_table(root_table); 1594 err = sysctl_check_table(current->nsproxy, root_table);
1476 return 0; 1595 return 0;
1477} 1596}
1478 1597
1479core_initcall(sysctl_init); 1598core_initcall(sysctl_init);
1480 1599
1481/** 1600/**
1482 * register_sysctl_table - register a sysctl hierarchy 1601 * __register_sysctl_paths - register a sysctl hierarchy
1602 * @root: List of sysctl headers to register on
1603 * @namespaces: Data to compute which lists of sysctl entries are visible
1604 * @path: The path to the directory the sysctl table is in.
1483 * @table: the top-level table structure 1605 * @table: the top-level table structure
1484 * 1606 *
1485 * Register a sysctl table hierarchy. @table should be a filled in ctl_table 1607 * Register a sysctl table hierarchy. @table should be a filled in ctl_table
1486 * array. An entry with a ctl_name of 0 terminates the table. 1608 * array. A completely 0 filled entry terminates the table.
1487 * 1609 *
1488 * The members of the &struct ctl_table structure are used as follows: 1610 * The members of the &struct ctl_table structure are used as follows:
1489 * 1611 *
@@ -1546,25 +1668,99 @@ core_initcall(sysctl_init);
1546 * This routine returns %NULL on a failure to register, and a pointer 1668 * This routine returns %NULL on a failure to register, and a pointer
1547 * to the table header on success. 1669 * to the table header on success.
1548 */ 1670 */
1549struct ctl_table_header *register_sysctl_table(struct ctl_table * table) 1671struct ctl_table_header *__register_sysctl_paths(
1672 struct ctl_table_root *root,
1673 struct nsproxy *namespaces,
1674 const struct ctl_path *path, struct ctl_table *table)
1550{ 1675{
1551 struct ctl_table_header *tmp; 1676 struct list_head *header_list;
1552 tmp = kmalloc(sizeof(struct ctl_table_header), GFP_KERNEL); 1677 struct ctl_table_header *header;
1553 if (!tmp) 1678 struct ctl_table *new, **prevp;
1679 unsigned int n, npath;
1680
1681 /* Count the path components */
1682 for (npath = 0; path[npath].ctl_name || path[npath].procname; ++npath)
1683 ;
1684
1685 /*
1686 * For each path component, allocate a 2-element ctl_table array.
1687 * The first array element will be filled with the sysctl entry
1688 * for this, the second will be the sentinel (ctl_name == 0).
1689 *
1690 * We allocate everything in one go so that we don't have to
1691 * worry about freeing additional memory in unregister_sysctl_table.
1692 */
1693 header = kzalloc(sizeof(struct ctl_table_header) +
1694 (2 * npath * sizeof(struct ctl_table)), GFP_KERNEL);
1695 if (!header)
1554 return NULL; 1696 return NULL;
1555 tmp->ctl_table = table; 1697
1556 INIT_LIST_HEAD(&tmp->ctl_entry); 1698 new = (struct ctl_table *) (header + 1);
1557 tmp->used = 0; 1699
1558 tmp->unregistering = NULL; 1700 /* Now connect the dots */
1559 sysctl_set_parent(NULL, table); 1701 prevp = &header->ctl_table;
1560 if (sysctl_check_table(tmp->ctl_table)) { 1702 for (n = 0; n < npath; ++n, ++path) {
1561 kfree(tmp); 1703 /* Copy the procname */
1704 new->procname = path->procname;
1705 new->ctl_name = path->ctl_name;
1706 new->mode = 0555;
1707
1708 *prevp = new;
1709 prevp = &new->child;
1710
1711 new += 2;
1712 }
1713 *prevp = table;
1714 header->ctl_table_arg = table;
1715
1716 INIT_LIST_HEAD(&header->ctl_entry);
1717 header->used = 0;
1718 header->unregistering = NULL;
1719 header->root = root;
1720 sysctl_set_parent(NULL, header->ctl_table);
1721 if (sysctl_check_table(namespaces, header->ctl_table)) {
1722 kfree(header);
1562 return NULL; 1723 return NULL;
1563 } 1724 }
1564 spin_lock(&sysctl_lock); 1725 spin_lock(&sysctl_lock);
1565 list_add_tail(&tmp->ctl_entry, &root_table_header.ctl_entry); 1726 header_list = lookup_header_list(root, namespaces);
1727 list_add_tail(&header->ctl_entry, header_list);
1566 spin_unlock(&sysctl_lock); 1728 spin_unlock(&sysctl_lock);
1567 return tmp; 1729
1730 return header;
1731}
1732
1733/**
 1734 * register_sysctl_paths - register a sysctl table hierarchy
1735 * @path: The path to the directory the sysctl table is in.
1736 * @table: the top-level table structure
1737 *
1738 * Register a sysctl table hierarchy. @table should be a filled in ctl_table
1739 * array. A completely 0 filled entry terminates the table.
1740 *
1741 * See __register_sysctl_paths for more details.
1742 */
1743struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path,
1744 struct ctl_table *table)
1745{
1746 return __register_sysctl_paths(&sysctl_table_root, current->nsproxy,
1747 path, table);
1748}
1749
1750/**
1751 * register_sysctl_table - register a sysctl table hierarchy
1752 * @table: the top-level table structure
1753 *
1754 * Register a sysctl table hierarchy. @table should be a filled in ctl_table
1755 * array. A completely 0 filled entry terminates the table.
1756 *
1757 * See register_sysctl_paths for more details.
1758 */
1759struct ctl_table_header *register_sysctl_table(struct ctl_table *table)
1760{
1761 static const struct ctl_path null_path[] = { {} };
1762
1763 return register_sysctl_paths(null_path, table);
1568} 1764}
1569 1765
1570/** 1766/**
@@ -1577,6 +1773,10 @@ struct ctl_table_header *register_sysctl_table(struct ctl_table * table)
1577void unregister_sysctl_table(struct ctl_table_header * header) 1773void unregister_sysctl_table(struct ctl_table_header * header)
1578{ 1774{
1579 might_sleep(); 1775 might_sleep();
1776
1777 if (header == NULL)
1778 return;
1779
1580 spin_lock(&sysctl_lock); 1780 spin_lock(&sysctl_lock);
1581 start_unregistering(header); 1781 start_unregistering(header);
1582 spin_unlock(&sysctl_lock); 1782 spin_unlock(&sysctl_lock);
@@ -1589,6 +1789,12 @@ struct ctl_table_header *register_sysctl_table(struct ctl_table * table)
1589 return NULL; 1789 return NULL;
1590} 1790}
1591 1791
1792struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path,
1793 struct ctl_table *table)
1794{
1795 return NULL;
1796}
1797
1592void unregister_sysctl_table(struct ctl_table_header * table) 1798void unregister_sysctl_table(struct ctl_table_header * table)
1593{ 1799{
1594} 1800}
@@ -2609,6 +2815,10 @@ static int deprecated_sysctl_warning(struct __sysctl_args *args)
2609 int name[CTL_MAXNAME]; 2815 int name[CTL_MAXNAME];
2610 int i; 2816 int i;
2611 2817
2818 /* Check args->nlen. */
2819 if (args->nlen < 0 || args->nlen > CTL_MAXNAME)
2820 return -ENOTDIR;
2821
2612 /* Read in the sysctl name for better debug message logging */ 2822 /* Read in the sysctl name for better debug message logging */
2613 for (i = 0; i < args->nlen; i++) 2823 for (i = 0; i < args->nlen; i++)
2614 if (get_user(name[i], args->name + i)) 2824 if (get_user(name[i], args->name + i))
@@ -2643,6 +2853,7 @@ EXPORT_SYMBOL(proc_dostring);
2643EXPORT_SYMBOL(proc_doulongvec_minmax); 2853EXPORT_SYMBOL(proc_doulongvec_minmax);
2644EXPORT_SYMBOL(proc_doulongvec_ms_jiffies_minmax); 2854EXPORT_SYMBOL(proc_doulongvec_ms_jiffies_minmax);
2645EXPORT_SYMBOL(register_sysctl_table); 2855EXPORT_SYMBOL(register_sysctl_table);
2856EXPORT_SYMBOL(register_sysctl_paths);
2646EXPORT_SYMBOL(sysctl_intvec); 2857EXPORT_SYMBOL(sysctl_intvec);
2647EXPORT_SYMBOL(sysctl_jiffies); 2858EXPORT_SYMBOL(sysctl_jiffies);
2648EXPORT_SYMBOL(sysctl_ms_jiffies); 2859EXPORT_SYMBOL(sysctl_ms_jiffies);
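
A minimal sketch of how a caller might use the new path-based registration API added above (the "frobnicate" directory, frob_enabled and the other frob_* names are illustrative, not taken from this patch):

	#include <linux/sysctl.h>
	#include <linux/module.h>
	#include <linux/init.h>

	static int frob_enabled;			/* hypothetical tunable */

	static struct ctl_table frob_table[] = {
		{
			.ctl_name	= CTL_UNNUMBERED,
			.procname	= "enabled",
			.data		= &frob_enabled,
			.maxlen		= sizeof(int),
			.mode		= 0644,
			.proc_handler	= proc_dointvec,
		},
		{}				/* completely 0 filled sentinel */
	};

	static const struct ctl_path frob_path[] = {
		{ .procname = "kernel",     .ctl_name = CTL_KERN },
		{ .procname = "frobnicate", .ctl_name = CTL_UNNUMBERED },
		{}
	};

	static struct ctl_table_header *frob_header;

	static int __init frob_sysctl_init(void)
	{
		/* Builds kernel/frobnicate/ from frob_path, then hangs frob_table off it */
		frob_header = register_sysctl_paths(frob_path, frob_table);
		return frob_header ? 0 : -ENOMEM;
	}

	static void __exit frob_sysctl_exit(void)
	{
		unregister_sysctl_table(frob_header);
	}

	module_init(frob_sysctl_init);
	module_exit(frob_sysctl_exit);
	MODULE_LICENSE("GPL");

Because the directory ctl_tables are allocated together with the header, unregister_sysctl_table(frob_header) releases everything in one call, and the NULL check added to it above makes that safe even when registration failed.
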
diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c
index ed6fe51df77a..c3206fa50048 100644
--- a/kernel/sysctl_check.c
+++ b/kernel/sysctl_check.c
@@ -1,6 +1,5 @@
1#include <linux/stat.h> 1#include <linux/stat.h>
2#include <linux/sysctl.h> 2#include <linux/sysctl.h>
3#include "../arch/s390/appldata/appldata.h"
4#include "../fs/xfs/linux-2.6/xfs_sysctl.h" 3#include "../fs/xfs/linux-2.6/xfs_sysctl.h"
5#include <linux/sunrpc/debug.h> 4#include <linux/sunrpc/debug.h>
6#include <linux/string.h> 5#include <linux/string.h>
@@ -96,7 +95,7 @@ static struct trans_ctl_table trans_kern_table[] = {
96 95
97 { KERN_PTY, "pty", trans_pty_table }, 96 { KERN_PTY, "pty", trans_pty_table },
98 { KERN_NGROUPS_MAX, "ngroups_max" }, 97 { KERN_NGROUPS_MAX, "ngroups_max" },
99 { KERN_SPARC_SCONS_PWROFF, "scons_poweroff" }, 98 { KERN_SPARC_SCONS_PWROFF, "scons-poweroff" },
100 { KERN_HZ_TIMER, "hz_timer" }, 99 { KERN_HZ_TIMER, "hz_timer" },
101 { KERN_UNKNOWN_NMI_PANIC, "unknown_nmi_panic" }, 100 { KERN_UNKNOWN_NMI_PANIC, "unknown_nmi_panic" },
102 { KERN_BOOTLOADER_TYPE, "bootloader_type" }, 101 { KERN_BOOTLOADER_TYPE, "bootloader_type" },
@@ -140,9 +139,6 @@ static struct trans_ctl_table trans_vm_table[] = {
140 { VM_PANIC_ON_OOM, "panic_on_oom" }, 139 { VM_PANIC_ON_OOM, "panic_on_oom" },
141 { VM_VDSO_ENABLED, "vdso_enabled" }, 140 { VM_VDSO_ENABLED, "vdso_enabled" },
142 { VM_MIN_SLAB, "min_slab_ratio" }, 141 { VM_MIN_SLAB, "min_slab_ratio" },
143 { VM_CMM_PAGES, "cmm_pages" },
144 { VM_CMM_TIMED_PAGES, "cmm_timed_pages" },
145 { VM_CMM_TIMEOUT, "cmm_timeout" },
146 142
147 {} 143 {}
148}; 144};
@@ -237,36 +233,6 @@ static struct trans_ctl_table trans_net_ipv4_conf_table[] = {
237 {} 233 {}
238}; 234};
239 235
240
241static struct trans_ctl_table trans_net_ipv4_vs_table[] = {
242 { NET_IPV4_VS_AMEMTHRESH, "amemthresh" },
243 { NET_IPV4_VS_DEBUG_LEVEL, "debug_level" },
244 { NET_IPV4_VS_AMDROPRATE, "am_droprate" },
245 { NET_IPV4_VS_DROP_ENTRY, "drop_entry" },
246 { NET_IPV4_VS_DROP_PACKET, "drop_packet" },
247 { NET_IPV4_VS_SECURE_TCP, "secure_tcp" },
248 { NET_IPV4_VS_TO_ES, "timeout_established" },
249 { NET_IPV4_VS_TO_SS, "timeout_synsent" },
250 { NET_IPV4_VS_TO_SR, "timeout_synrecv" },
251 { NET_IPV4_VS_TO_FW, "timeout_finwait" },
252 { NET_IPV4_VS_TO_TW, "timeout_timewait" },
253 { NET_IPV4_VS_TO_CL, "timeout_close" },
254 { NET_IPV4_VS_TO_CW, "timeout_closewait" },
255 { NET_IPV4_VS_TO_LA, "timeout_lastack" },
256 { NET_IPV4_VS_TO_LI, "timeout_listen" },
257 { NET_IPV4_VS_TO_SA, "timeout_synack" },
258 { NET_IPV4_VS_TO_UDP, "timeout_udp" },
259 { NET_IPV4_VS_TO_ICMP, "timeout_icmp" },
260 { NET_IPV4_VS_CACHE_BYPASS, "cache_bypass" },
261 { NET_IPV4_VS_EXPIRE_NODEST_CONN, "expire_nodest_conn" },
262 { NET_IPV4_VS_EXPIRE_QUIESCENT_TEMPLATE, "expire_quiescent_template" },
263 { NET_IPV4_VS_SYNC_THRESHOLD, "sync_threshold" },
264 { NET_IPV4_VS_NAT_ICMP_SEND, "nat_icmp_send" },
265 { NET_IPV4_VS_LBLC_EXPIRE, "lblc_expiration" },
266 { NET_IPV4_VS_LBLCR_EXPIRE, "lblcr_expiration" },
267 {}
268};
269
270static struct trans_ctl_table trans_net_neigh_vars_table[] = { 236static struct trans_ctl_table trans_net_neigh_vars_table[] = {
271 { NET_NEIGH_MCAST_SOLICIT, "mcast_solicit" }, 237 { NET_NEIGH_MCAST_SOLICIT, "mcast_solicit" },
272 { NET_NEIGH_UCAST_SOLICIT, "ucast_solicit" }, 238 { NET_NEIGH_UCAST_SOLICIT, "ucast_solicit" },
@@ -341,7 +307,6 @@ static struct trans_ctl_table trans_net_ipv4_table[] = {
341 { NET_IPV4_ROUTE, "route", trans_net_ipv4_route_table }, 307 { NET_IPV4_ROUTE, "route", trans_net_ipv4_route_table },
342 /* NET_IPV4_FIB_HASH unused */ 308 /* NET_IPV4_FIB_HASH unused */
343 { NET_IPV4_NETFILTER, "netfilter", trans_net_ipv4_netfilter_table }, 309 { NET_IPV4_NETFILTER, "netfilter", trans_net_ipv4_netfilter_table },
344 { NET_IPV4_VS, "vs", trans_net_ipv4_vs_table },
345 310
346 { NET_IPV4_TCP_TIMESTAMPS, "tcp_timestamps" }, 311 { NET_IPV4_TCP_TIMESTAMPS, "tcp_timestamps" },
347 { NET_IPV4_TCP_WINDOW_SCALING, "tcp_window_scaling" }, 312 { NET_IPV4_TCP_WINDOW_SCALING, "tcp_window_scaling" },
@@ -462,7 +427,7 @@ static struct trans_ctl_table trans_net_netrom_table[] = {
462 {} 427 {}
463}; 428};
464 429
465static struct trans_ctl_table trans_net_ax25_table[] = { 430static struct trans_ctl_table trans_net_ax25_param_table[] = {
466 { NET_AX25_IP_DEFAULT_MODE, "ip_default_mode" }, 431 { NET_AX25_IP_DEFAULT_MODE, "ip_default_mode" },
467 { NET_AX25_DEFAULT_MODE, "ax25_default_mode" }, 432 { NET_AX25_DEFAULT_MODE, "ax25_default_mode" },
468 { NET_AX25_BACKOFF_TYPE, "backoff_type" }, 433 { NET_AX25_BACKOFF_TYPE, "backoff_type" },
@@ -480,6 +445,11 @@ static struct trans_ctl_table trans_net_ax25_table[] = {
480 {} 445 {}
481}; 446};
482 447
448static struct trans_ctl_table trans_net_ax25_table[] = {
449 { 0, NULL, trans_net_ax25_param_table },
450 {}
451};
452
483static struct trans_ctl_table trans_net_bridge_table[] = { 453static struct trans_ctl_table trans_net_bridge_table[] = {
484 { NET_BRIDGE_NF_CALL_ARPTABLES, "bridge-nf-call-arptables" }, 454 { NET_BRIDGE_NF_CALL_ARPTABLES, "bridge-nf-call-arptables" },
485 { NET_BRIDGE_NF_CALL_IPTABLES, "bridge-nf-call-iptables" }, 455 { NET_BRIDGE_NF_CALL_IPTABLES, "bridge-nf-call-iptables" },
@@ -738,7 +708,7 @@ static struct trans_ctl_table trans_net_table[] = {
738 { NET_ROSE, "rose", trans_net_rose_table }, 708 { NET_ROSE, "rose", trans_net_rose_table },
739 { NET_IPV6, "ipv6", trans_net_ipv6_table }, 709 { NET_IPV6, "ipv6", trans_net_ipv6_table },
740 { NET_X25, "x25", trans_net_x25_table }, 710 { NET_X25, "x25", trans_net_x25_table },
741 { NET_TR, "tr", trans_net_tr_table }, 711 { NET_TR, "token-ring", trans_net_tr_table },
742 { NET_DECNET, "decnet", trans_net_decnet_table }, 712 { NET_DECNET, "decnet", trans_net_decnet_table },
743 /* NET_ECONET not used */ 713 /* NET_ECONET not used */
744 { NET_SCTP, "sctp", trans_net_sctp_table }, 714 { NET_SCTP, "sctp", trans_net_sctp_table },
@@ -1219,16 +1189,6 @@ static struct trans_ctl_table trans_arlan_table[] = {
1219 {} 1189 {}
1220}; 1190};
1221 1191
1222static struct trans_ctl_table trans_appldata_table[] = {
1223 { CTL_APPLDATA_TIMER, "timer" },
1224 { CTL_APPLDATA_INTERVAL, "interval" },
1225 { CTL_APPLDATA_OS, "os" },
1226 { CTL_APPLDATA_NET_SUM, "net_sum" },
1227 { CTL_APPLDATA_MEM, "mem" },
1228 {}
1229
1230};
1231
1232static struct trans_ctl_table trans_s390dbf_table[] = { 1192static struct trans_ctl_table trans_s390dbf_table[] = {
1233 { 5678 /* CTL_S390DBF_STOPPABLE */, "debug_stoppable" }, 1193 { 5678 /* CTL_S390DBF_STOPPABLE */, "debug_stoppable" },
1234 { 5679 /* CTL_S390DBF_ACTIVE */, "debug_active" }, 1194 { 5679 /* CTL_S390DBF_ACTIVE */, "debug_active" },
@@ -1273,7 +1233,6 @@ static struct trans_ctl_table trans_root_table[] = {
1273 { CTL_ABI, "abi" }, 1233 { CTL_ABI, "abi" },
1274 /* CTL_CPU not used */ 1234 /* CTL_CPU not used */
1275 { CTL_ARLAN, "arlan", trans_arlan_table }, 1235 { CTL_ARLAN, "arlan", trans_arlan_table },
1276 { CTL_APPLDATA, "appldata", trans_appldata_table },
1277 { CTL_S390DBF, "s390dbf", trans_s390dbf_table }, 1236 { CTL_S390DBF, "s390dbf", trans_s390dbf_table },
1278 { CTL_SUNRPC, "sunrpc", trans_sunrpc_table }, 1237 { CTL_SUNRPC, "sunrpc", trans_sunrpc_table },
1279 { CTL_PM, "pm", trans_pm_table }, 1238 { CTL_PM, "pm", trans_pm_table },
@@ -1383,7 +1342,8 @@ static void sysctl_repair_table(struct ctl_table *table)
1383 } 1342 }
1384} 1343}
1385 1344
1386static struct ctl_table *sysctl_check_lookup(struct ctl_table *table) 1345static struct ctl_table *sysctl_check_lookup(struct nsproxy *namespaces,
1346 struct ctl_table *table)
1387{ 1347{
1388 struct ctl_table_header *head; 1348 struct ctl_table_header *head;
1389 struct ctl_table *ref, *test; 1349 struct ctl_table *ref, *test;
@@ -1391,8 +1351,8 @@ static struct ctl_table *sysctl_check_lookup(struct ctl_table *table)
1391 1351
1392 depth = sysctl_depth(table); 1352 depth = sysctl_depth(table);
1393 1353
1394 for (head = sysctl_head_next(NULL); head; 1354 for (head = __sysctl_head_next(namespaces, NULL); head;
1395 head = sysctl_head_next(head)) { 1355 head = __sysctl_head_next(namespaces, head)) {
1396 cur_depth = depth; 1356 cur_depth = depth;
1397 ref = head->ctl_table; 1357 ref = head->ctl_table;
1398repeat: 1358repeat:
@@ -1432,17 +1392,19 @@ static void set_fail(const char **fail, struct ctl_table *table, const char *str
1432 printk(KERN_ERR "sysctl table check failed: "); 1392 printk(KERN_ERR "sysctl table check failed: ");
1433 sysctl_print_path(table); 1393 sysctl_print_path(table);
1434 printk(" %s\n", *fail); 1394 printk(" %s\n", *fail);
1395 dump_stack();
1435 } 1396 }
1436 *fail = str; 1397 *fail = str;
1437} 1398}
1438 1399
1439static int sysctl_check_dir(struct ctl_table *table) 1400static int sysctl_check_dir(struct nsproxy *namespaces,
1401 struct ctl_table *table)
1440{ 1402{
1441 struct ctl_table *ref; 1403 struct ctl_table *ref;
1442 int error; 1404 int error;
1443 1405
1444 error = 0; 1406 error = 0;
1445 ref = sysctl_check_lookup(table); 1407 ref = sysctl_check_lookup(namespaces, table);
1446 if (ref) { 1408 if (ref) {
1447 int match = 0; 1409 int match = 0;
1448 if ((!table->procname && !ref->procname) || 1410 if ((!table->procname && !ref->procname) ||
@@ -1467,11 +1429,12 @@ static int sysctl_check_dir(struct ctl_table *table)
1467 return error; 1429 return error;
1468} 1430}
1469 1431
1470static void sysctl_check_leaf(struct ctl_table *table, const char **fail) 1432static void sysctl_check_leaf(struct nsproxy *namespaces,
1433 struct ctl_table *table, const char **fail)
1471{ 1434{
1472 struct ctl_table *ref; 1435 struct ctl_table *ref;
1473 1436
1474 ref = sysctl_check_lookup(table); 1437 ref = sysctl_check_lookup(namespaces, table);
1475 if (ref && (ref != table)) 1438 if (ref && (ref != table))
1476 set_fail(fail, table, "Sysctl already exists"); 1439 set_fail(fail, table, "Sysctl already exists");
1477} 1440}
@@ -1495,7 +1458,7 @@ static void sysctl_check_bin_path(struct ctl_table *table, const char **fail)
1495 } 1458 }
1496} 1459}
1497 1460
1498int sysctl_check_table(struct ctl_table *table) 1461int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table)
1499{ 1462{
1500 int error = 0; 1463 int error = 0;
1501 for (; table->ctl_name || table->procname; table++) { 1464 for (; table->ctl_name || table->procname; table++) {
@@ -1525,7 +1488,7 @@ int sysctl_check_table(struct ctl_table *table)
1525 set_fail(&fail, table, "Directory with extra1"); 1488 set_fail(&fail, table, "Directory with extra1");
1526 if (table->extra2) 1489 if (table->extra2)
1527 set_fail(&fail, table, "Directory with extra2"); 1490 set_fail(&fail, table, "Directory with extra2");
1528 if (sysctl_check_dir(table)) 1491 if (sysctl_check_dir(namespaces, table))
1529 set_fail(&fail, table, "Inconsistent directory names"); 1492 set_fail(&fail, table, "Inconsistent directory names");
1530 } else { 1493 } else {
1531 if ((table->strategy == sysctl_data) || 1494 if ((table->strategy == sysctl_data) ||
@@ -1574,7 +1537,7 @@ int sysctl_check_table(struct ctl_table *table)
1574 if (!table->procname && table->proc_handler) 1537 if (!table->procname && table->proc_handler)
1575 set_fail(&fail, table, "proc_handler without procname"); 1538 set_fail(&fail, table, "proc_handler without procname");
1576#endif 1539#endif
1577 sysctl_check_leaf(table, &fail); 1540 sysctl_check_leaf(namespaces, table, &fail);
1578 } 1541 }
1579 sysctl_check_bin_path(table, &fail); 1542 sysctl_check_bin_path(table, &fail);
1580 if (fail) { 1543 if (fail) {
@@ -1582,7 +1545,7 @@ int sysctl_check_table(struct ctl_table *table)
1582 error = -EINVAL; 1545 error = -EINVAL;
1583 } 1546 }
1584 if (table->child) 1547 if (table->child)
1585 error |= sysctl_check_table(table->child); 1548 error |= sysctl_check_table(namespaces, table->child);
1586 } 1549 }
1587 return error; 1550 return error;
1588} 1551}
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 354e74bc17c1..07e86a828073 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -398,31 +398,31 @@ static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
398 398
399 fd = nla_get_u32(info->attrs[CGROUPSTATS_CMD_ATTR_FD]); 399 fd = nla_get_u32(info->attrs[CGROUPSTATS_CMD_ATTR_FD]);
400 file = fget_light(fd, &fput_needed); 400 file = fget_light(fd, &fput_needed);
401 if (file) { 401 if (!file)
402 size = nla_total_size(sizeof(struct cgroupstats)); 402 return 0;
403 403
404 rc = prepare_reply(info, CGROUPSTATS_CMD_NEW, &rep_skb, 404 size = nla_total_size(sizeof(struct cgroupstats));
405 size);
406 if (rc < 0)
407 goto err;
408 405
409 na = nla_reserve(rep_skb, CGROUPSTATS_TYPE_CGROUP_STATS, 406 rc = prepare_reply(info, CGROUPSTATS_CMD_NEW, &rep_skb,
410 sizeof(struct cgroupstats)); 407 size);
411 stats = nla_data(na); 408 if (rc < 0)
412 memset(stats, 0, sizeof(*stats)); 409 goto err;
413 410
414 rc = cgroupstats_build(stats, file->f_dentry); 411 na = nla_reserve(rep_skb, CGROUPSTATS_TYPE_CGROUP_STATS,
415 if (rc < 0) 412 sizeof(struct cgroupstats));
416 goto err; 413 stats = nla_data(na);
414 memset(stats, 0, sizeof(*stats));
417 415
418 fput_light(file, fput_needed); 416 rc = cgroupstats_build(stats, file->f_dentry);
419 return send_reply(rep_skb, info->snd_pid); 417 if (rc < 0) {
418 nlmsg_free(rep_skb);
419 goto err;
420 } 420 }
421 421
422 rc = send_reply(rep_skb, info->snd_pid);
423
422err: 424err:
423 if (file) 425 fput_light(file, fput_needed);
424 fput_light(file, fput_needed);
425 nlmsg_free(rep_skb);
426 return rc; 426 return rc;
427} 427}
428 428
diff --git a/kernel/test_kprobes.c b/kernel/test_kprobes.c
new file mode 100644
index 000000000000..88cdb109e13c
--- /dev/null
+++ b/kernel/test_kprobes.c
@@ -0,0 +1,216 @@
1/*
2 * test_kprobes.c - simple sanity test for *probes
3 *
4 * Copyright IBM Corp. 2008
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it would be useful, but
12 * WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
14 * the GNU General Public License for more details.
15 */
16
17#include <linux/kernel.h>
18#include <linux/kprobes.h>
19#include <linux/random.h>
20
21#define div_factor 3
22
23static u32 rand1, preh_val, posth_val, jph_val;
24static int errors, handler_errors, num_tests;
25
26static noinline u32 kprobe_target(u32 value)
27{
28 /*
29 * gcc ignores noinline on some architectures unless we stuff
30 * sufficient lard into the function. The get_kprobe() here is
31 * just for that.
32 *
33 * NOTE: We aren't concerned about the correctness of get_kprobe()
34 * here; hence, this call is neither under !preempt nor with the
35 * kprobe_mutex held. This is fine(tm)
36 */
37 if (get_kprobe((void *)0xdeadbeef))
38 printk(KERN_INFO "Kprobe smoke test: probe on 0xdeadbeef!\n");
39
40 return (value / div_factor);
41}
42
43static int kp_pre_handler(struct kprobe *p, struct pt_regs *regs)
44{
45 preh_val = (rand1 / div_factor);
46 return 0;
47}
48
49static void kp_post_handler(struct kprobe *p, struct pt_regs *regs,
50 unsigned long flags)
51{
52 if (preh_val != (rand1 / div_factor)) {
53 handler_errors++;
54 printk(KERN_ERR "Kprobe smoke test failed: "
55 "incorrect value in post_handler\n");
56 }
57 posth_val = preh_val + div_factor;
58}
59
60static struct kprobe kp = {
61 .symbol_name = "kprobe_target",
62 .pre_handler = kp_pre_handler,
63 .post_handler = kp_post_handler
64};
65
66static int test_kprobe(void)
67{
68 int ret;
69
70 ret = register_kprobe(&kp);
71 if (ret < 0) {
72 printk(KERN_ERR "Kprobe smoke test failed: "
73 "register_kprobe returned %d\n", ret);
74 return ret;
75 }
76
77 ret = kprobe_target(rand1);
78 unregister_kprobe(&kp);
79
80 if (preh_val == 0) {
81 printk(KERN_ERR "Kprobe smoke test failed: "
82 "kprobe pre_handler not called\n");
83 handler_errors++;
84 }
85
86 if (posth_val == 0) {
87 printk(KERN_ERR "Kprobe smoke test failed: "
88 "kprobe post_handler not called\n");
89 handler_errors++;
90 }
91
92 return 0;
93}
94
95static u32 j_kprobe_target(u32 value)
96{
97 if (value != rand1) {
98 handler_errors++;
99 printk(KERN_ERR "Kprobe smoke test failed: "
100 "incorrect value in jprobe handler\n");
101 }
102
103 jph_val = rand1;
104 jprobe_return();
105 return 0;
106}
107
108static struct jprobe jp = {
109 .entry = j_kprobe_target,
110 .kp.symbol_name = "kprobe_target"
111};
112
113static int test_jprobe(void)
114{
115 int ret;
116
117 ret = register_jprobe(&jp);
118 if (ret < 0) {
119 printk(KERN_ERR "Kprobe smoke test failed: "
120 "register_jprobe returned %d\n", ret);
121 return ret;
122 }
123
124 ret = kprobe_target(rand1);
125 unregister_jprobe(&jp);
126 if (jph_val == 0) {
127 printk(KERN_ERR "Kprobe smoke test failed: "
128 "jprobe handler not called\n");
129 handler_errors++;
130 }
131
132 return 0;
133}
134
135#ifdef CONFIG_KRETPROBES
136static u32 krph_val;
137
138static int return_handler(struct kretprobe_instance *ri, struct pt_regs *regs)
139{
140 unsigned long ret = regs_return_value(regs);
141
142 if (ret != (rand1 / div_factor)) {
143 handler_errors++;
144 printk(KERN_ERR "Kprobe smoke test failed: "
145 "incorrect value in kretprobe handler\n");
146 }
147
148 krph_val = (rand1 / div_factor);
149 return 0;
150}
151
152static struct kretprobe rp = {
153 .handler = return_handler,
154 .kp.symbol_name = "kprobe_target"
155};
156
157static int test_kretprobe(void)
158{
159 int ret;
160
161 ret = register_kretprobe(&rp);
162 if (ret < 0) {
163 printk(KERN_ERR "Kprobe smoke test failed: "
164 "register_kretprobe returned %d\n", ret);
165 return ret;
166 }
167
168 ret = kprobe_target(rand1);
169 unregister_kretprobe(&rp);
170 if (krph_val == 0) {
171 printk(KERN_ERR "Kprobe smoke test failed: "
172 "kretprobe handler not called\n");
173 handler_errors++;
174 }
175
176 return 0;
177}
178#endif /* CONFIG_KRETPROBES */
179
180int init_test_probes(void)
181{
182 int ret;
183
184 do {
185 rand1 = random32();
186 } while (rand1 <= div_factor);
187
188 printk(KERN_INFO "Kprobe smoke test started\n");
189 num_tests++;
190 ret = test_kprobe();
191 if (ret < 0)
192 errors++;
193
194 num_tests++;
195 ret = test_jprobe();
196 if (ret < 0)
197 errors++;
198
199#ifdef CONFIG_KRETPROBES
200 num_tests++;
201 ret = test_kretprobe();
202 if (ret < 0)
203 errors++;
204#endif /* CONFIG_KRETPROBES */
205
206 if (errors)
207 printk(KERN_ERR "BUG: Kprobe smoke test: %d out of "
208 "%d tests failed\n", errors, num_tests);
209 else if (handler_errors)
210 printk(KERN_ERR "BUG: Kprobe smoke test: %d error(s) "
211 "running handlers\n", handler_errors);
212 else
213 printk(KERN_INFO "Kprobe smoke test passed successfully\n");
214
215 return 0;
216}
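
How init_test_probes() gets invoked is not part of this file; the expectation (an assumption here, not shown in this patch) is a sanity-test config option and a call from the kprobes initialization path. One possible wiring, sketched with illustrative names:

	#include <linux/init.h>
	#include <linux/kernel.h>

	/* Assumed option: CONFIG_KPROBES_SANITY_TEST selects building test_kprobes.o */
	#ifdef CONFIG_KPROBES_SANITY_TEST
	extern int init_test_probes(void);
	#else
	static inline int init_test_probes(void)
	{
		return 0;
	}
	#endif

	static int __init kprobes_smoke_test(void)
	{
		/* Run once the kprobes core is up; failures are reported via printk only */
		return init_test_probes();
	}
	late_initcall(kprobes_smoke_test);
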
diff --git a/kernel/time.c b/kernel/time.c
index 09d3c45c4da7..4064c0566e77 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -129,6 +129,7 @@ static inline void warp_clock(void)
129 write_seqlock_irq(&xtime_lock); 129 write_seqlock_irq(&xtime_lock);
130 wall_to_monotonic.tv_sec -= sys_tz.tz_minuteswest * 60; 130 wall_to_monotonic.tv_sec -= sys_tz.tz_minuteswest * 60;
131 xtime.tv_sec += sys_tz.tz_minuteswest * 60; 131 xtime.tv_sec += sys_tz.tz_minuteswest * 60;
132 update_xtime_cache(0);
132 write_sequnlock_irq(&xtime_lock); 133 write_sequnlock_irq(&xtime_lock);
133 clock_was_set(); 134 clock_was_set();
134} 135}
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 822beebe664a..3e59fce6dd43 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -41,6 +41,11 @@ unsigned long clockevent_delta2ns(unsigned long latch,
41{ 41{
42 u64 clc = ((u64) latch << evt->shift); 42 u64 clc = ((u64) latch << evt->shift);
43 43
44 if (unlikely(!evt->mult)) {
45 evt->mult = 1;
46 WARN_ON(1);
47 }
48
44 do_div(clc, evt->mult); 49 do_div(clc, evt->mult);
45 if (clc < 1000) 50 if (clc < 1000)
46 clc = 1000; 51 clc = 1000;
@@ -78,6 +83,11 @@ int clockevents_program_event(struct clock_event_device *dev, ktime_t expires,
78 unsigned long long clc; 83 unsigned long long clc;
79 int64_t delta; 84 int64_t delta;
80 85
86 if (unlikely(expires.tv64 < 0)) {
87 WARN_ON_ONCE(1);
88 return -ETIME;
89 }
90
81 delta = ktime_to_ns(ktime_sub(expires, now)); 91 delta = ktime_to_ns(ktime_sub(expires, now));
82 92
83 if (delta <= 0) 93 if (delta <= 0)
@@ -146,6 +156,14 @@ static void clockevents_notify_released(void)
146void clockevents_register_device(struct clock_event_device *dev) 156void clockevents_register_device(struct clock_event_device *dev)
147{ 157{
148 BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED); 158 BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED);
159 /*
160 * A nsec2cyc multiplicator of 0 is invalid and we'd crash
161 * on it, so fix it up and emit a warning:
162 */
163 if (unlikely(!dev->mult)) {
164 dev->mult = 1;
165 WARN_ON(1);
166 }
149 167
150 spin_lock(&clockevents_lock); 168 spin_lock(&clockevents_lock);
151 169
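
The two guards added above protect the conversion clockevent_delta2ns() performs: ns = (latch << shift) / mult, so a device registered with mult == 0 would divide by zero inside do_div(), and a negative ktime expiry would produce a bogus delta. A standalone user-space model of that arithmetic, with illustrative PIT-like values (not kernel code):

	#include <stdint.h>
	#include <stdio.h>

	/* Mirror of the clockevent_delta2ns() arithmetic that WARN_ON(!mult) guards. */
	static uint64_t delta2ns(unsigned long latch, uint32_t mult, uint32_t shift)
	{
		uint64_t clc = (uint64_t)latch << shift;	/* scale cycles up */

		if (mult == 0)			/* the new clamp-and-warn path */
			mult = 1;
		clc /= mult;			/* cycles -> nanoseconds */
		if (clc < 1000)			/* lower clamp, as in the hunk above */
			clc = 1000;
		if (clc > 0x7fffffffULL)	/* upper clamp, analogous to the kernel's limit */
			clc = 0x7fffffffULL;
		return clc;
	}

	int main(void)
	{
		/* Roughly a 1.193182 MHz PIT with shift 32: mult ~= (freq << 32) / 1e9 */
		uint32_t mult = 5124677, shift = 32;

		printf("latch 11932 -> %llu ns\n",		/* ~10 ms */
		       (unsigned long long)delta2ns(11932, mult, shift));
		printf("mult 0 is clamped -> %llu ns\n",
		       (unsigned long long)delta2ns(11932, 0, shift));
		return 0;
	}
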
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index c8a9d13874df..6e9259a5d501 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -142,8 +142,13 @@ static void clocksource_watchdog(unsigned long data)
142 } 142 }
143 143
144 if (!list_empty(&watchdog_list)) { 144 if (!list_empty(&watchdog_list)) {
145 __mod_timer(&watchdog_timer, 145 /* Cycle through CPUs to check if the CPUs stay synchronized to
146 watchdog_timer.expires + WATCHDOG_INTERVAL); 146 * each other. */
147 int next_cpu = next_cpu(raw_smp_processor_id(), cpu_online_map);
148 if (next_cpu >= NR_CPUS)
149 next_cpu = first_cpu(cpu_online_map);
150 watchdog_timer.expires += WATCHDOG_INTERVAL;
151 add_timer_on(&watchdog_timer, next_cpu);
147 } 152 }
148 spin_unlock(&watchdog_lock); 153 spin_unlock(&watchdog_lock);
149} 154}
@@ -165,7 +170,7 @@ static void clocksource_check_watchdog(struct clocksource *cs)
165 if (!started && watchdog) { 170 if (!started && watchdog) {
166 watchdog_last = watchdog->read(); 171 watchdog_last = watchdog->read();
167 watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL; 172 watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL;
168 add_timer(&watchdog_timer); 173 add_timer_on(&watchdog_timer, first_cpu(cpu_online_map));
169 } 174 }
170 } else { 175 } else {
171 if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) 176 if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS)
@@ -175,7 +180,7 @@ static void clocksource_check_watchdog(struct clocksource *cs)
175 if (watchdog) 180 if (watchdog)
176 del_timer(&watchdog_timer); 181 del_timer(&watchdog_timer);
177 watchdog = cs; 182 watchdog = cs;
178 init_timer(&watchdog_timer); 183 init_timer_deferrable(&watchdog_timer);
179 watchdog_timer.function = clocksource_watchdog; 184 watchdog_timer.function = clocksource_watchdog;
180 185
181 /* Reset watchdog cycles */ 186 /* Reset watchdog cycles */
@@ -186,7 +191,8 @@ static void clocksource_check_watchdog(struct clocksource *cs)
186 watchdog_last = watchdog->read(); 191 watchdog_last = watchdog->read();
187 watchdog_timer.expires = 192 watchdog_timer.expires =
188 jiffies + WATCHDOG_INTERVAL; 193 jiffies + WATCHDOG_INTERVAL;
189 add_timer(&watchdog_timer); 194 add_timer_on(&watchdog_timer,
195 first_cpu(cpu_online_map));
190 } 196 }
191 } 197 }
192 } 198 }
@@ -331,6 +337,21 @@ void clocksource_change_rating(struct clocksource *cs, int rating)
331 spin_unlock_irqrestore(&clocksource_lock, flags); 337 spin_unlock_irqrestore(&clocksource_lock, flags);
332} 338}
333 339
340/**
341 * clocksource_unregister - remove a registered clocksource
342 */
343void clocksource_unregister(struct clocksource *cs)
344{
345 unsigned long flags;
346
347 spin_lock_irqsave(&clocksource_lock, flags);
348 list_del(&cs->list);
349 if (clocksource_override == cs)
350 clocksource_override = NULL;
351 next_clocksource = select_clocksource();
352 spin_unlock_irqrestore(&clocksource_lock, flags);
353}
354
334#ifdef CONFIG_SYSFS 355#ifdef CONFIG_SYSFS
335/** 356/**
336 * sysfs_show_current_clocksources - sysfs interface for current clocksource 357 * sysfs_show_current_clocksources - sysfs interface for current clocksource
@@ -441,7 +462,7 @@ static SYSDEV_ATTR(available_clocksource, 0600,
441 sysfs_show_available_clocksources, NULL); 462 sysfs_show_available_clocksources, NULL);
442 463
443static struct sysdev_class clocksource_sysclass = { 464static struct sysdev_class clocksource_sysclass = {
444 set_kset_name("clocksource"), 465 .name = "clocksource",
445}; 466};
446 467
447static struct sys_device device_clocksource = { 468static struct sys_device device_clocksource = {
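
clocksource_unregister() is new in this hunk; previously a clocksource could only be registered or have its rating changed. A hedged sketch of a driver pairing it with registration on module unload (FOO_HZ, the foo_* names and the MMIO counter are illustrative, not from this patch):

	#include <linux/clocksource.h>
	#include <linux/module.h>
	#include <linux/io.h>

	#define FOO_HZ	1000000				/* hypothetical 1 MHz counter */

	static void __iomem *foo_counter_reg;		/* mapped elsewhere in the driver */

	static cycle_t foo_cs_read(void)
	{
		return (cycle_t)readl(foo_counter_reg);
	}

	static struct clocksource foo_cs = {
		.name	= "foo",
		.rating	= 200,
		.read	= foo_cs_read,
		.mask	= CLOCKSOURCE_MASK(32),
		.shift	= 20,
		.flags	= CLOCK_SOURCE_IS_CONTINUOUS,
	};

	static int __init foo_cs_init(void)
	{
		foo_cs.mult = clocksource_hz2mult(FOO_HZ, foo_cs.shift);
		return clocksource_register(&foo_cs);
	}

	static void __exit foo_cs_exit(void)
	{
		/* New API from the hunk above: drops the source from the list,
		 * clears it as override if needed, and reselects the best
		 * remaining clocksource. */
		clocksource_unregister(&foo_cs);
	}

	module_init(foo_cs_init);
	module_exit(foo_cs_exit);
	MODULE_LICENSE("GPL");
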
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index de6a2d6b3ebb..e64efaf957e8 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -205,7 +205,7 @@ static void sync_cmos_clock(unsigned long dummy)
205 return; 205 return;
206 206
207 getnstimeofday(&now); 207 getnstimeofday(&now);
208 if (abs(xtime.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec / 2) 208 if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec / 2)
209 fail = update_persistent_clock(now); 209 fail = update_persistent_clock(now);
210 210
211 next.tv_nsec = (NSEC_PER_SEC / 2) - now.tv_nsec; 211 next.tv_nsec = (NSEC_PER_SEC / 2) - now.tv_nsec;
@@ -249,10 +249,12 @@ int do_adjtimex(struct timex *txc)
249 249
250 /* Now we validate the data before disabling interrupts */ 250 /* Now we validate the data before disabling interrupts */
251 251
252 if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT) 252 if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT) {
253 /* singleshot must not be used with any other mode bits */ 253 /* singleshot must not be used with any other mode bits */
254 if (txc->modes != ADJ_OFFSET_SINGLESHOT) 254 if (txc->modes != ADJ_OFFSET_SINGLESHOT &&
255 txc->modes != ADJ_OFFSET_SS_READ)
255 return -EINVAL; 256 return -EINVAL;
257 }
256 258
257 if (txc->modes != ADJ_OFFSET_SINGLESHOT && (txc->modes & ADJ_OFFSET)) 259 if (txc->modes != ADJ_OFFSET_SINGLESHOT && (txc->modes & ADJ_OFFSET))
258 /* adjustment Offset limited to +- .512 seconds */ 260 /* adjustment Offset limited to +- .512 seconds */
@@ -372,7 +374,8 @@ int do_adjtimex(struct timex *txc)
372leave: if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0) 374leave: if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0)
373 result = TIME_ERROR; 375 result = TIME_ERROR;
374 376
375 if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT) 377 if ((txc->modes == ADJ_OFFSET_SINGLESHOT) ||
378 (txc->modes == ADJ_OFFSET_SS_READ))
376 txc->offset = save_adjust; 379 txc->offset = save_adjust;
377 else 380 else
378 txc->offset = ((long)shift_right(time_offset, SHIFT_UPDATE)) * 381 txc->offset = ((long)shift_right(time_offset, SHIFT_UPDATE)) *
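
The ntp.c change lets ADJ_OFFSET_SS_READ pass validation and report the remaining singleshot offset instead of failing with -EINVAL. A user-space sketch of the read-only query this enables (assumes libc exposes the constant; the kernel header of this era defines it as 0xa001):

	#include <stdio.h>
	#include <sys/timex.h>

	#ifndef ADJ_OFFSET_SS_READ
	#define ADJ_OFFSET_SS_READ	0xa001	/* value from the kernel header of this era */
	#endif

	int main(void)
	{
		struct timex tx = { .modes = ADJ_OFFSET_SS_READ };

		if (adjtimex(&tx) == -1) {
			perror("adjtimex");
			return 1;
		}
		/* the offset is reported in microseconds */
		printf("remaining singleshot offset: %ld us\n", tx.offset);
		return 0;
	}
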
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 8cfb8b2ce773..e1bd50cbbf5d 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -126,9 +126,9 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu)
126/* 126/*
127 * Broadcast the event to the cpus, which are set in the mask 127 * Broadcast the event to the cpus, which are set in the mask
128 */ 128 */
129int tick_do_broadcast(cpumask_t mask) 129static void tick_do_broadcast(cpumask_t mask)
130{ 130{
131 int ret = 0, cpu = smp_processor_id(); 131 int cpu = smp_processor_id();
132 struct tick_device *td; 132 struct tick_device *td;
133 133
134 /* 134 /*
@@ -138,7 +138,6 @@ int tick_do_broadcast(cpumask_t mask)
138 cpu_clear(cpu, mask); 138 cpu_clear(cpu, mask);
139 td = &per_cpu(tick_cpu_device, cpu); 139 td = &per_cpu(tick_cpu_device, cpu);
140 td->evtdev->event_handler(td->evtdev); 140 td->evtdev->event_handler(td->evtdev);
141 ret = 1;
142 } 141 }
143 142
144 if (!cpus_empty(mask)) { 143 if (!cpus_empty(mask)) {
@@ -151,9 +150,7 @@ int tick_do_broadcast(cpumask_t mask)
151 cpu = first_cpu(mask); 150 cpu = first_cpu(mask);
152 td = &per_cpu(tick_cpu_device, cpu); 151 td = &per_cpu(tick_cpu_device, cpu);
153 td->evtdev->broadcast(mask); 152 td->evtdev->broadcast(mask);
154 ret = 1;
155 } 153 }
156 return ret;
157} 154}
158 155
159/* 156/*
@@ -384,45 +381,19 @@ int tick_resume_broadcast_oneshot(struct clock_event_device *bc)
384} 381}
385 382
386/* 383/*
387 * Reprogram the broadcast device:
388 *
389 * Called with tick_broadcast_lock held and interrupts disabled.
390 */
391static int tick_broadcast_reprogram(void)
392{
393 ktime_t expires = { .tv64 = KTIME_MAX };
394 struct tick_device *td;
395 int cpu;
396
397 /*
398 * Find the event which expires next:
399 */
400 for (cpu = first_cpu(tick_broadcast_oneshot_mask); cpu != NR_CPUS;
401 cpu = next_cpu(cpu, tick_broadcast_oneshot_mask)) {
402 td = &per_cpu(tick_cpu_device, cpu);
403 if (td->evtdev->next_event.tv64 < expires.tv64)
404 expires = td->evtdev->next_event;
405 }
406
407 if (expires.tv64 == KTIME_MAX)
408 return 0;
409
410 return tick_broadcast_set_event(expires, 0);
411}
412
413/*
414 * Handle oneshot mode broadcasting 384 * Handle oneshot mode broadcasting
415 */ 385 */
416static void tick_handle_oneshot_broadcast(struct clock_event_device *dev) 386static void tick_handle_oneshot_broadcast(struct clock_event_device *dev)
417{ 387{
418 struct tick_device *td; 388 struct tick_device *td;
419 cpumask_t mask; 389 cpumask_t mask;
420 ktime_t now; 390 ktime_t now, next_event;
421 int cpu; 391 int cpu;
422 392
423 spin_lock(&tick_broadcast_lock); 393 spin_lock(&tick_broadcast_lock);
424again: 394again:
425 dev->next_event.tv64 = KTIME_MAX; 395 dev->next_event.tv64 = KTIME_MAX;
396 next_event.tv64 = KTIME_MAX;
426 mask = CPU_MASK_NONE; 397 mask = CPU_MASK_NONE;
427 now = ktime_get(); 398 now = ktime_get();
428 /* Find all expired events */ 399 /* Find all expired events */
@@ -431,19 +402,31 @@ again:
431 td = &per_cpu(tick_cpu_device, cpu); 402 td = &per_cpu(tick_cpu_device, cpu);
432 if (td->evtdev->next_event.tv64 <= now.tv64) 403 if (td->evtdev->next_event.tv64 <= now.tv64)
433 cpu_set(cpu, mask); 404 cpu_set(cpu, mask);
405 else if (td->evtdev->next_event.tv64 < next_event.tv64)
406 next_event.tv64 = td->evtdev->next_event.tv64;
434 } 407 }
435 408
436 /* 409 /*
437 * Wakeup the cpus which have an expired event. The broadcast 410 * Wakeup the cpus which have an expired event.
438 * device is reprogrammed in the return from idle code. 411 */
412 tick_do_broadcast(mask);
413
414 /*
415 * Two reasons for reprogram:
416 *
417 * - The global event did not expire any CPU local
418 * events. This happens in dyntick mode, as the maximum PIT
419 * delta is quite small.
420 *
421 * - There are pending events on sleeping CPUs which were not
422 * in the event mask
439 */ 423 */
440 if (!tick_do_broadcast(mask)) { 424 if (next_event.tv64 != KTIME_MAX) {
441 /* 425 /*
442 * The global event did not expire any CPU local 426 * Rearm the broadcast device. If event expired,
443 * events. This happens in dyntick mode, as the 427 * repeat the above
444 * maximum PIT delta is quite small.
445 */ 428 */
446 if (tick_broadcast_reprogram()) 429 if (tick_broadcast_set_event(next_event, 0))
447 goto again; 430 goto again;
448 } 431 }
449 spin_unlock(&tick_broadcast_lock); 432 spin_unlock(&tick_broadcast_lock);
@@ -508,7 +491,7 @@ static void tick_broadcast_clear_oneshot(int cpu)
508} 491}
509 492
510/** 493/**
511 * tick_broadcast_setup_highres - setup the broadcast device for highres 494 * tick_broadcast_setup_oneshot - setup the broadcast device
512 */ 495 */
513void tick_broadcast_setup_oneshot(struct clock_event_device *bc) 496void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
514{ 497{
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index bb13f2724905..f13f2b7f4fd4 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -70,8 +70,6 @@ static inline int tick_resume_broadcast_oneshot(struct clock_event_device *bc)
70 * Broadcasting support 70 * Broadcasting support
71 */ 71 */
72#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST 72#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
73extern int tick_do_broadcast(cpumask_t mask);
74
75extern int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu); 73extern int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu);
76extern int tick_check_broadcast_device(struct clock_event_device *dev); 74extern int tick_check_broadcast_device(struct clock_event_device *dev);
77extern int tick_is_broadcast_device(struct clock_event_device *dev); 75extern int tick_is_broadcast_device(struct clock_event_device *dev);
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 10a1347597fd..88267f0a8471 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -9,7 +9,7 @@
9 * 9 *
10 * Started by: Thomas Gleixner and Ingo Molnar 10 * Started by: Thomas Gleixner and Ingo Molnar
11 * 11 *
12 * For licencing details see kernel-base/COPYING 12 * Distribute under GPLv2.
13 */ 13 */
14#include <linux/cpu.h> 14#include <linux/cpu.h>
15#include <linux/err.h> 15#include <linux/err.h>
@@ -133,14 +133,55 @@ void tick_nohz_update_jiffies(void)
133 if (!ts->tick_stopped) 133 if (!ts->tick_stopped)
134 return; 134 return;
135 135
136 touch_softlockup_watchdog();
137
136 cpu_clear(cpu, nohz_cpu_mask); 138 cpu_clear(cpu, nohz_cpu_mask);
137 now = ktime_get(); 139 now = ktime_get();
140 ts->idle_waketime = now;
138 141
139 local_irq_save(flags); 142 local_irq_save(flags);
140 tick_do_update_jiffies64(now); 143 tick_do_update_jiffies64(now);
141 local_irq_restore(flags); 144 local_irq_restore(flags);
142} 145}
143 146
147void tick_nohz_stop_idle(int cpu)
148{
149 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
150
151 if (ts->idle_active) {
152 ktime_t now, delta;
153 now = ktime_get();
154 delta = ktime_sub(now, ts->idle_entrytime);
155 ts->idle_lastupdate = now;
156 ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
157 ts->idle_active = 0;
158 }
159}
160
161static ktime_t tick_nohz_start_idle(int cpu)
162{
163 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
164 ktime_t now, delta;
165
166 now = ktime_get();
167 if (ts->idle_active) {
168 delta = ktime_sub(now, ts->idle_entrytime);
169 ts->idle_lastupdate = now;
170 ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
171 }
172 ts->idle_entrytime = now;
173 ts->idle_active = 1;
174 return now;
175}
176
177u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
178{
179 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
180
181 *last_update_time = ktime_to_us(ts->idle_lastupdate);
182 return ktime_to_us(ts->idle_sleeptime);
183}
184
144/** 185/**
145 * tick_nohz_stop_sched_tick - stop the idle tick from the idle task 186 * tick_nohz_stop_sched_tick - stop the idle tick from the idle task
146 * 187 *
@@ -151,14 +192,16 @@ void tick_nohz_update_jiffies(void)
151void tick_nohz_stop_sched_tick(void) 192void tick_nohz_stop_sched_tick(void)
152{ 193{
153 unsigned long seq, last_jiffies, next_jiffies, delta_jiffies, flags; 194 unsigned long seq, last_jiffies, next_jiffies, delta_jiffies, flags;
195 unsigned long rt_jiffies;
154 struct tick_sched *ts; 196 struct tick_sched *ts;
155 ktime_t last_update, expires, now, delta; 197 ktime_t last_update, expires, now;
156 struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; 198 struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev;
157 int cpu; 199 int cpu;
158 200
159 local_irq_save(flags); 201 local_irq_save(flags);
160 202
161 cpu = smp_processor_id(); 203 cpu = smp_processor_id();
204 now = tick_nohz_start_idle(cpu);
162 ts = &per_cpu(tick_cpu_sched, cpu); 205 ts = &per_cpu(tick_cpu_sched, cpu);
163 206
164 /* 207 /*
@@ -190,19 +233,7 @@ void tick_nohz_stop_sched_tick(void)
190 } 233 }
191 } 234 }
192 235
193 now = ktime_get();
194 /*
195 * When called from irq_exit we need to account the idle sleep time
196 * correctly.
197 */
198 if (ts->tick_stopped) {
199 delta = ktime_sub(now, ts->idle_entrytime);
200 ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
201 }
202
203 ts->idle_entrytime = now;
204 ts->idle_calls++; 236 ts->idle_calls++;
205
206 /* Read jiffies and the time when jiffies were updated last */ 237 /* Read jiffies and the time when jiffies were updated last */
207 do { 238 do {
208 seq = read_seqbegin(&xtime_lock); 239 seq = read_seqbegin(&xtime_lock);
@@ -214,6 +245,10 @@ void tick_nohz_stop_sched_tick(void)
214 next_jiffies = get_next_timer_interrupt(last_jiffies); 245 next_jiffies = get_next_timer_interrupt(last_jiffies);
215 delta_jiffies = next_jiffies - last_jiffies; 246 delta_jiffies = next_jiffies - last_jiffies;
216 247
248 rt_jiffies = rt_needs_cpu(cpu);
249 if (rt_jiffies && rt_jiffies < delta_jiffies)
250 delta_jiffies = rt_jiffies;
251
217 if (rcu_needs_cpu(cpu)) 252 if (rcu_needs_cpu(cpu))
218 delta_jiffies = 1; 253 delta_jiffies = 1;
219 /* 254 /*
@@ -289,7 +324,7 @@ void tick_nohz_stop_sched_tick(void)
289 /* Check, if the timer was already in the past */ 324 /* Check, if the timer was already in the past */
290 if (hrtimer_active(&ts->sched_timer)) 325 if (hrtimer_active(&ts->sched_timer))
291 goto out; 326 goto out;
292 } else if(!tick_program_event(expires, 0)) 327 } else if (!tick_program_event(expires, 0))
293 goto out; 328 goto out;
294 /* 329 /*
295 * We are past the event already. So we crossed a 330 * We are past the event already. So we crossed a
@@ -320,10 +355,8 @@ ktime_t tick_nohz_get_sleep_length(void)
320 return ts->sleep_length; 355 return ts->sleep_length;
321} 356}
322 357
323EXPORT_SYMBOL_GPL(tick_nohz_get_sleep_length);
324
325/** 358/**
326 * nohz_restart_sched_tick - restart the idle tick from the idle task 359 * tick_nohz_restart_sched_tick - restart the idle tick from the idle task
327 * 360 *
328 * Restart the idle tick when the CPU is woken up from idle 361 * Restart the idle tick when the CPU is woken up from idle
329 */ 362 */
@@ -332,23 +365,22 @@ void tick_nohz_restart_sched_tick(void)
332 int cpu = smp_processor_id(); 365 int cpu = smp_processor_id();
333 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); 366 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
334 unsigned long ticks; 367 unsigned long ticks;
335 ktime_t now, delta; 368 ktime_t now;
336 369
337 if (!ts->tick_stopped) 370 local_irq_disable();
371 tick_nohz_stop_idle(cpu);
372
373 if (!ts->tick_stopped) {
374 local_irq_enable();
338 return; 375 return;
376 }
339 377
340 /* Update jiffies first */ 378 /* Update jiffies first */
341 now = ktime_get();
342
343 local_irq_disable();
344 select_nohz_load_balancer(0); 379 select_nohz_load_balancer(0);
380 now = ktime_get();
345 tick_do_update_jiffies64(now); 381 tick_do_update_jiffies64(now);
346 cpu_clear(cpu, nohz_cpu_mask); 382 cpu_clear(cpu, nohz_cpu_mask);
347 383
348 /* Account the idle time */
349 delta = ktime_sub(now, ts->idle_entrytime);
350 ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
351
352 /* 384 /*
353 * We stopped the tick in idle. Update process times would miss the 385 * We stopped the tick in idle. Update process times would miss the
354 * time we slept as update_process_times does only a 1 tick 386 * time we slept as update_process_times does only a 1 tick
@@ -369,6 +401,7 @@ void tick_nohz_restart_sched_tick(void)
369 * Cancel the scheduled timer and restore the tick 401 * Cancel the scheduled timer and restore the tick
370 */ 402 */
371 ts->tick_stopped = 0; 403 ts->tick_stopped = 0;
404 ts->idle_exittime = now;
372 hrtimer_cancel(&ts->sched_timer); 405 hrtimer_cancel(&ts->sched_timer);
373 ts->sched_timer.expires = ts->idle_tick; 406 ts->sched_timer.expires = ts->idle_tick;
374 407
@@ -502,14 +535,13 @@ static inline void tick_nohz_switch_to_nohz(void) { }
502 */ 535 */
503#ifdef CONFIG_HIGH_RES_TIMERS 536#ifdef CONFIG_HIGH_RES_TIMERS
504/* 537/*
505 * We rearm the timer until we get disabled by the idle code 538 * We rearm the timer until we get disabled by the idle code.
506 * Called with interrupts disabled and timer->base->cpu_base->lock held. 539 * Called with interrupts disabled and timer->base->cpu_base->lock held.
507 */ 540 */
508static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) 541static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
509{ 542{
510 struct tick_sched *ts = 543 struct tick_sched *ts =
511 container_of(timer, struct tick_sched, sched_timer); 544 container_of(timer, struct tick_sched, sched_timer);
512 struct hrtimer_cpu_base *base = timer->base->cpu_base;
513 struct pt_regs *regs = get_irq_regs(); 545 struct pt_regs *regs = get_irq_regs();
514 ktime_t now = ktime_get(); 546 ktime_t now = ktime_get();
515 int cpu = smp_processor_id(); 547 int cpu = smp_processor_id();
@@ -547,15 +579,8 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
547 touch_softlockup_watchdog(); 579 touch_softlockup_watchdog();
548 ts->idle_jiffies++; 580 ts->idle_jiffies++;
549 } 581 }
550 /*
551 * update_process_times() might take tasklist_lock, hence
552 * drop the base lock. sched-tick hrtimers are per-CPU and
553 * never accessible by userspace APIs, so this is safe to do.
554 */
555 spin_unlock(&base->lock);
556 update_process_times(user_mode(regs)); 582 update_process_times(user_mode(regs));
557 profile_tick(CPU_PROFILING); 583 profile_tick(CPU_PROFILING);
558 spin_lock(&base->lock);
559 } 584 }
560 585
561 /* Do not restart, when we are in the idle loop */ 586 /* Do not restart, when we are in the idle loop */
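
The new tick_nohz_start_idle()/tick_nohz_stop_idle() bookkeeping feeds get_cpu_idle_time_us(), which reports accumulated idle time and the timestamp of its last update, both in microseconds. A hedged sketch of the kind of consumer this is aimed at, e.g. a governor sampling per-CPU load; the cpu_busy_percent() helper and the prev_* arrays are illustrative, and the declaration is assumed to be exported via linux/tick.h elsewhere in this series:

	#include <linux/types.h>
	#include <linux/threads.h>
	#include <linux/kernel.h>
	#include <linux/tick.h>
	#include <asm/div64.h>

	static u64 prev_idle_us[NR_CPUS];
	static u64 prev_wall_us[NR_CPUS];

	/* Percentage of the last sampling window the CPU spent non-idle. */
	static unsigned int cpu_busy_percent(int cpu)
	{
		u64 wall_us, idle_us, d_wall, d_idle, busy;

		/* wall_us = time of the last idle-accounting update, used as
		 * the window reference here */
		idle_us = get_cpu_idle_time_us(cpu, &wall_us);
		d_idle = idle_us - prev_idle_us[cpu];
		d_wall = wall_us - prev_wall_us[cpu];
		prev_idle_us[cpu] = idle_us;
		prev_wall_us[cpu] = wall_us;

		if (!d_wall || d_idle > d_wall)
			return 0;

		busy = (d_wall - d_idle) * 100;
		do_div(busy, (u32)d_wall);	/* sampling windows fit in 32 bits of us */
		return (unsigned int)busy;
	}
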
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index e5e466b27598..cd5dbc4579c9 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -47,7 +47,7 @@ struct timespec wall_to_monotonic __attribute__ ((aligned (16)));
47static unsigned long total_sleep_time; /* seconds */ 47static unsigned long total_sleep_time; /* seconds */
48 48
49static struct timespec xtime_cache __attribute__ ((aligned (16))); 49static struct timespec xtime_cache __attribute__ ((aligned (16)));
50static inline void update_xtime_cache(u64 nsec) 50void update_xtime_cache(u64 nsec)
51{ 51{
52 xtime_cache = xtime; 52 xtime_cache = xtime;
53 timespec_add_ns(&xtime_cache, nsec); 53 timespec_add_ns(&xtime_cache, nsec);
@@ -82,13 +82,12 @@ static inline s64 __get_nsec_offset(void)
82} 82}
83 83
84/** 84/**
85 * __get_realtime_clock_ts - Returns the time of day in a timespec 85 * getnstimeofday - Returns the time of day in a timespec
86 * @ts: pointer to the timespec to be set 86 * @ts: pointer to the timespec to be set
87 * 87 *
88 * Returns the time of day in a timespec. Used by 88 * Returns the time of day in a timespec.
89 * do_gettimeofday() and get_realtime_clock_ts().
90 */ 89 */
91static inline void __get_realtime_clock_ts(struct timespec *ts) 90void getnstimeofday(struct timespec *ts)
92{ 91{
93 unsigned long seq; 92 unsigned long seq;
94 s64 nsecs; 93 s64 nsecs;
@@ -104,30 +103,19 @@ static inline void __get_realtime_clock_ts(struct timespec *ts)
104 timespec_add_ns(ts, nsecs); 103 timespec_add_ns(ts, nsecs);
105} 104}
106 105
107/**
108 * getnstimeofday - Returns the time of day in a timespec
109 * @ts: pointer to the timespec to be set
110 *
111 * Returns the time of day in a timespec.
112 */
113void getnstimeofday(struct timespec *ts)
114{
115 __get_realtime_clock_ts(ts);
116}
117
118EXPORT_SYMBOL(getnstimeofday); 106EXPORT_SYMBOL(getnstimeofday);
119 107
120/** 108/**
121 * do_gettimeofday - Returns the time of day in a timeval 109 * do_gettimeofday - Returns the time of day in a timeval
122 * @tv: pointer to the timeval to be set 110 * @tv: pointer to the timeval to be set
123 * 111 *
124 * NOTE: Users should be converted to using get_realtime_clock_ts() 112 * NOTE: Users should be converted to using getnstimeofday()
125 */ 113 */
126void do_gettimeofday(struct timeval *tv) 114void do_gettimeofday(struct timeval *tv)
127{ 115{
128 struct timespec now; 116 struct timespec now;
129 117
130 __get_realtime_clock_ts(&now); 118 getnstimeofday(&now);
131 tv->tv_sec = now.tv_sec; 119 tv->tv_sec = now.tv_sec;
132 tv->tv_usec = now.tv_nsec/1000; 120 tv->tv_usec = now.tv_nsec/1000;
133} 121}
@@ -157,6 +145,7 @@ int do_settimeofday(struct timespec *tv)
157 145
158 set_normalized_timespec(&xtime, sec, nsec); 146 set_normalized_timespec(&xtime, sec, nsec);
159 set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec); 147 set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec);
148 update_xtime_cache(0);
160 149
161 clock->error = 0; 150 clock->error = 0;
162 ntp_clear(); 151 ntp_clear();
@@ -198,7 +187,8 @@ static void change_clocksource(void)
198 187
199 clock->error = 0; 188 clock->error = 0;
200 clock->xtime_nsec = 0; 189 clock->xtime_nsec = 0;
201 clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH); 190 clocksource_calculate_interval(clock,
191 (unsigned long)(current_tick_length()>>TICK_LENGTH_SHIFT));
202 192
203 tick_clock_notify(); 193 tick_clock_notify();
204 194
@@ -255,15 +245,16 @@ void __init timekeeping_init(void)
255 ntp_clear(); 245 ntp_clear();
256 246
257 clock = clocksource_get_next(); 247 clock = clocksource_get_next();
258 clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH); 248 clocksource_calculate_interval(clock,
249 (unsigned long)(current_tick_length()>>TICK_LENGTH_SHIFT));
259 clock->cycle_last = clocksource_read(clock); 250 clock->cycle_last = clocksource_read(clock);
260 251
261 xtime.tv_sec = sec; 252 xtime.tv_sec = sec;
262 xtime.tv_nsec = 0; 253 xtime.tv_nsec = 0;
263 set_normalized_timespec(&wall_to_monotonic, 254 set_normalized_timespec(&wall_to_monotonic,
264 -xtime.tv_sec, -xtime.tv_nsec); 255 -xtime.tv_sec, -xtime.tv_nsec);
256 update_xtime_cache(0);
265 total_sleep_time = 0; 257 total_sleep_time = 0;
266
267 write_sequnlock_irqrestore(&xtime_lock, flags); 258 write_sequnlock_irqrestore(&xtime_lock, flags);
268} 259}
269 260
@@ -300,6 +291,7 @@ static int timekeeping_resume(struct sys_device *dev)
300 } 291 }
301 /* Make sure that we have the correct xtime reference */ 292 /* Make sure that we have the correct xtime reference */
302 timespec_add_ns(&xtime, timekeeping_suspend_nsecs); 293 timespec_add_ns(&xtime, timekeeping_suspend_nsecs);
294 update_xtime_cache(0);
303 /* re-base the last cycle value */ 295 /* re-base the last cycle value */
304 clock->cycle_last = clocksource_read(clock); 296 clock->cycle_last = clocksource_read(clock);
305 clock->error = 0; 297 clock->error = 0;
@@ -335,9 +327,9 @@ static int timekeeping_suspend(struct sys_device *dev, pm_message_t state)
335 327
336/* sysfs resume/suspend bits for timekeeping */ 328/* sysfs resume/suspend bits for timekeeping */
337static struct sysdev_class timekeeping_sysclass = { 329static struct sysdev_class timekeeping_sysclass = {
330 .name = "timekeeping",
338 .resume = timekeeping_resume, 331 .resume = timekeeping_resume,
339 .suspend = timekeeping_suspend, 332 .suspend = timekeeping_suspend,
340 set_kset_name("timekeeping"),
341}; 333};
342 334
343static struct sys_device device_timer = { 335static struct sys_device device_timer = {
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index fdb2e03d4fe0..d3d94c1a0fd2 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -129,7 +129,8 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now)
129 struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu); 129 struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu);
130 int i; 130 int i;
131 131
132 SEQ_printf(m, "\ncpu: %d\n", cpu); 132 SEQ_printf(m, "\n");
133 SEQ_printf(m, "cpu: %d\n", cpu);
133 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { 134 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
134 SEQ_printf(m, " clock %d:\n", i); 135 SEQ_printf(m, " clock %d:\n", i);
135 print_base(m, cpu_base->clock_base + i, now); 136 print_base(m, cpu_base->clock_base + i, now);
@@ -165,6 +166,8 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now)
165 P(idle_calls); 166 P(idle_calls);
166 P(idle_sleeps); 167 P(idle_sleeps);
167 P_ns(idle_entrytime); 168 P_ns(idle_entrytime);
169 P_ns(idle_waketime);
170 P_ns(idle_exittime);
168 P_ns(idle_sleeptime); 171 P_ns(idle_sleeptime);
169 P(last_jiffies); 172 P(last_jiffies);
170 P(next_jiffies); 173 P(next_jiffies);
@@ -184,7 +187,8 @@ print_tickdevice(struct seq_file *m, struct tick_device *td)
184{ 187{
185 struct clock_event_device *dev = td->evtdev; 188 struct clock_event_device *dev = td->evtdev;
186 189
187 SEQ_printf(m, "\nTick Device: mode: %d\n", td->mode); 190 SEQ_printf(m, "\n");
191 SEQ_printf(m, "Tick Device: mode: %d\n", td->mode);
188 192
189 SEQ_printf(m, "Clock Event Device: "); 193 SEQ_printf(m, "Clock Event Device: ");
190 if (!dev) { 194 if (!dev) {
diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c
index c36bb7ed0301..417da8c5bc72 100644
--- a/kernel/time/timer_stats.c
+++ b/kernel/time/timer_stats.c
@@ -26,7 +26,7 @@
26 * the pid and cmdline from the owner process if applicable. 26 * the pid and cmdline from the owner process if applicable.
27 * 27 *
28 * Start/stop data collection: 28 * Start/stop data collection:
29 * # echo 1[0] >/proc/timer_stats 29 * # echo [1|0] >/proc/timer_stats
30 * 30 *
31 * Display the information collected so far: 31 * Display the information collected so far:
32 * # cat /proc/timer_stats 32 * # cat /proc/timer_stats
diff --git a/kernel/timer.c b/kernel/timer.c
index fb4e67d5dd60..9fbb472b8cf0 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -58,59 +58,57 @@ EXPORT_SYMBOL(jiffies_64);
58#define TVN_MASK (TVN_SIZE - 1) 58#define TVN_MASK (TVN_SIZE - 1)
59#define TVR_MASK (TVR_SIZE - 1) 59#define TVR_MASK (TVR_SIZE - 1)
60 60
61typedef struct tvec_s { 61struct tvec {
62 struct list_head vec[TVN_SIZE]; 62 struct list_head vec[TVN_SIZE];
63} tvec_t; 63};
64 64
65typedef struct tvec_root_s { 65struct tvec_root {
66 struct list_head vec[TVR_SIZE]; 66 struct list_head vec[TVR_SIZE];
67} tvec_root_t; 67};
68 68
69struct tvec_t_base_s { 69struct tvec_base {
70 spinlock_t lock; 70 spinlock_t lock;
71 struct timer_list *running_timer; 71 struct timer_list *running_timer;
72 unsigned long timer_jiffies; 72 unsigned long timer_jiffies;
73 tvec_root_t tv1; 73 struct tvec_root tv1;
74 tvec_t tv2; 74 struct tvec tv2;
75 tvec_t tv3; 75 struct tvec tv3;
76 tvec_t tv4; 76 struct tvec tv4;
77 tvec_t tv5; 77 struct tvec tv5;
78} ____cacheline_aligned; 78} ____cacheline_aligned;
79 79
80typedef struct tvec_t_base_s tvec_base_t; 80struct tvec_base boot_tvec_bases;
81
82tvec_base_t boot_tvec_bases;
83EXPORT_SYMBOL(boot_tvec_bases); 81EXPORT_SYMBOL(boot_tvec_bases);
84static DEFINE_PER_CPU(tvec_base_t *, tvec_bases) = &boot_tvec_bases; 82static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) = &boot_tvec_bases;
85 83
86/* 84/*
87 * Note that all tvec_bases is 2 byte aligned and lower bit of 85 * Note that all tvec_bases are 2 byte aligned and lower bit of
88 * base in timer_list is guaranteed to be zero. Use the LSB for 86 * base in timer_list is guaranteed to be zero. Use the LSB for
89 * the new flag to indicate whether the timer is deferrable 87 * the new flag to indicate whether the timer is deferrable
90 */ 88 */
91#define TBASE_DEFERRABLE_FLAG (0x1) 89#define TBASE_DEFERRABLE_FLAG (0x1)
92 90
93/* Functions below help us manage 'deferrable' flag */ 91/* Functions below help us manage 'deferrable' flag */
94static inline unsigned int tbase_get_deferrable(tvec_base_t *base) 92static inline unsigned int tbase_get_deferrable(struct tvec_base *base)
95{ 93{
96 return ((unsigned int)(unsigned long)base & TBASE_DEFERRABLE_FLAG); 94 return ((unsigned int)(unsigned long)base & TBASE_DEFERRABLE_FLAG);
97} 95}
98 96
99static inline tvec_base_t *tbase_get_base(tvec_base_t *base) 97static inline struct tvec_base *tbase_get_base(struct tvec_base *base)
100{ 98{
101 return ((tvec_base_t *)((unsigned long)base & ~TBASE_DEFERRABLE_FLAG)); 99 return ((struct tvec_base *)((unsigned long)base & ~TBASE_DEFERRABLE_FLAG));
102} 100}
103 101
104static inline void timer_set_deferrable(struct timer_list *timer) 102static inline void timer_set_deferrable(struct timer_list *timer)
105{ 103{
106 timer->base = ((tvec_base_t *)((unsigned long)(timer->base) | 104 timer->base = ((struct tvec_base *)((unsigned long)(timer->base) |
107 TBASE_DEFERRABLE_FLAG)); 105 TBASE_DEFERRABLE_FLAG));
108} 106}
109 107
110static inline void 108static inline void
111timer_set_base(struct timer_list *timer, tvec_base_t *new_base) 109timer_set_base(struct timer_list *timer, struct tvec_base *new_base)
112{ 110{
113 timer->base = (tvec_base_t *)((unsigned long)(new_base) | 111 timer->base = (struct tvec_base *)((unsigned long)(new_base) |
114 tbase_get_deferrable(timer->base)); 112 tbase_get_deferrable(timer->base));
115} 113}
116 114
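
The hunk above replaces the tvec_t typedefs with plain struct names but keeps the TBASE_DEFERRABLE_FLAG trick: because every struct tvec_base is at least 2-byte aligned, bit 0 of timer->base is always zero and can be borrowed to mark a timer as deferrable. A minimal userspace C sketch of that round trip follows (the struct and helper names are invented for illustration; this is not kernel code):

/*
 * Userspace sketch of LSB pointer tagging: store a one-bit flag in the
 * low bit of an aligned pointer and recover both cleanly.
 */
#include <assert.h>
#include <stdio.h>

#define FLAG_DEFERRABLE 0x1UL

struct base {
	int dummy;
};	/* any sensible ABI aligns this to at least 2 bytes */

static inline unsigned long get_flag(struct base *tagged)
{
	return (unsigned long)tagged & FLAG_DEFERRABLE;
}

static inline struct base *get_base(struct base *tagged)
{
	return (struct base *)((unsigned long)tagged & ~FLAG_DEFERRABLE);
}

static inline struct base *set_flag(struct base *base)
{
	return (struct base *)((unsigned long)base | FLAG_DEFERRABLE);
}

int main(void)
{
	static struct base b;
	struct base *tagged = set_flag(&b);

	assert(get_base(tagged) == &b);	/* pointer survives the round trip */
	assert(get_flag(tagged) == 1);	/* flag is recoverable */
	printf("flag=%lu base=%p\n", get_flag(tagged), (void *)get_base(tagged));
	return 0;
}
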
@@ -246,7 +244,7 @@ unsigned long round_jiffies_relative(unsigned long j)
246EXPORT_SYMBOL_GPL(round_jiffies_relative); 244EXPORT_SYMBOL_GPL(round_jiffies_relative);
247 245
248 246
249static inline void set_running_timer(tvec_base_t *base, 247static inline void set_running_timer(struct tvec_base *base,
250 struct timer_list *timer) 248 struct timer_list *timer)
251{ 249{
252#ifdef CONFIG_SMP 250#ifdef CONFIG_SMP
@@ -254,7 +252,7 @@ static inline void set_running_timer(tvec_base_t *base,
254#endif 252#endif
255} 253}
256 254
257static void internal_add_timer(tvec_base_t *base, struct timer_list *timer) 255static void internal_add_timer(struct tvec_base *base, struct timer_list *timer)
258{ 256{
259 unsigned long expires = timer->expires; 257 unsigned long expires = timer->expires;
260 unsigned long idx = expires - base->timer_jiffies; 258 unsigned long idx = expires - base->timer_jiffies;
@@ -371,14 +369,14 @@ static inline void detach_timer(struct timer_list *timer,
371 * possible to set timer->base = NULL and drop the lock: the timer remains 369 * possible to set timer->base = NULL and drop the lock: the timer remains
372 * locked. 370 * locked.
373 */ 371 */
374static tvec_base_t *lock_timer_base(struct timer_list *timer, 372static struct tvec_base *lock_timer_base(struct timer_list *timer,
375 unsigned long *flags) 373 unsigned long *flags)
376 __acquires(timer->base->lock) 374 __acquires(timer->base->lock)
377{ 375{
378 tvec_base_t *base; 376 struct tvec_base *base;
379 377
380 for (;;) { 378 for (;;) {
381 tvec_base_t *prelock_base = timer->base; 379 struct tvec_base *prelock_base = timer->base;
382 base = tbase_get_base(prelock_base); 380 base = tbase_get_base(prelock_base);
383 if (likely(base != NULL)) { 381 if (likely(base != NULL)) {
384 spin_lock_irqsave(&base->lock, *flags); 382 spin_lock_irqsave(&base->lock, *flags);
@@ -393,7 +391,7 @@ static tvec_base_t *lock_timer_base(struct timer_list *timer,
393 391
394int __mod_timer(struct timer_list *timer, unsigned long expires) 392int __mod_timer(struct timer_list *timer, unsigned long expires)
395{ 393{
396 tvec_base_t *base, *new_base; 394 struct tvec_base *base, *new_base;
397 unsigned long flags; 395 unsigned long flags;
398 int ret = 0; 396 int ret = 0;
399 397
@@ -445,7 +443,7 @@ EXPORT_SYMBOL(__mod_timer);
445 */ 443 */
446void add_timer_on(struct timer_list *timer, int cpu) 444void add_timer_on(struct timer_list *timer, int cpu)
447{ 445{
448 tvec_base_t *base = per_cpu(tvec_bases, cpu); 446 struct tvec_base *base = per_cpu(tvec_bases, cpu);
449 unsigned long flags; 447 unsigned long flags;
450 448
451 timer_stats_timer_set_start_info(timer); 449 timer_stats_timer_set_start_info(timer);
@@ -508,7 +506,7 @@ EXPORT_SYMBOL(mod_timer);
508 */ 506 */
509int del_timer(struct timer_list *timer) 507int del_timer(struct timer_list *timer)
510{ 508{
511 tvec_base_t *base; 509 struct tvec_base *base;
512 unsigned long flags; 510 unsigned long flags;
513 int ret = 0; 511 int ret = 0;
514 512
@@ -539,7 +537,7 @@ EXPORT_SYMBOL(del_timer);
539 */ 537 */
540int try_to_del_timer_sync(struct timer_list *timer) 538int try_to_del_timer_sync(struct timer_list *timer)
541{ 539{
542 tvec_base_t *base; 540 struct tvec_base *base;
543 unsigned long flags; 541 unsigned long flags;
544 int ret = -1; 542 int ret = -1;
545 543
@@ -591,7 +589,7 @@ int del_timer_sync(struct timer_list *timer)
591EXPORT_SYMBOL(del_timer_sync); 589EXPORT_SYMBOL(del_timer_sync);
592#endif 590#endif
593 591
594static int cascade(tvec_base_t *base, tvec_t *tv, int index) 592static int cascade(struct tvec_base *base, struct tvec *tv, int index)
595{ 593{
596 /* cascade all the timers from tv up one level */ 594 /* cascade all the timers from tv up one level */
597 struct timer_list *timer, *tmp; 595 struct timer_list *timer, *tmp;
@@ -620,7 +618,7 @@ static int cascade(tvec_base_t *base, tvec_t *tv, int index)
620 * This function cascades all vectors and executes all expired timer 618 * This function cascades all vectors and executes all expired timer
621 * vectors. 619 * vectors.
622 */ 620 */
623static inline void __run_timers(tvec_base_t *base) 621static inline void __run_timers(struct tvec_base *base)
624{ 622{
625 struct timer_list *timer; 623 struct timer_list *timer;
626 624
@@ -657,7 +655,7 @@ static inline void __run_timers(tvec_base_t *base)
657 int preempt_count = preempt_count(); 655 int preempt_count = preempt_count();
658 fn(data); 656 fn(data);
659 if (preempt_count != preempt_count()) { 657 if (preempt_count != preempt_count()) {
660 printk(KERN_WARNING "huh, entered %p " 658 printk(KERN_ERR "huh, entered %p "
661 "with preempt_count %08x, exited" 659 "with preempt_count %08x, exited"
662 " with %08x?\n", 660 " with %08x?\n",
663 fn, preempt_count, 661 fn, preempt_count,
@@ -678,13 +676,13 @@ static inline void __run_timers(tvec_base_t *base)
678 * is used on S/390 to stop all activity when a cpus is idle. 676 * is used on S/390 to stop all activity when a cpus is idle.
679 * This functions needs to be called disabled. 677 * This functions needs to be called disabled.
680 */ 678 */
681static unsigned long __next_timer_interrupt(tvec_base_t *base) 679static unsigned long __next_timer_interrupt(struct tvec_base *base)
682{ 680{
683 unsigned long timer_jiffies = base->timer_jiffies; 681 unsigned long timer_jiffies = base->timer_jiffies;
684 unsigned long expires = timer_jiffies + NEXT_TIMER_MAX_DELTA; 682 unsigned long expires = timer_jiffies + NEXT_TIMER_MAX_DELTA;
685 int index, slot, array, found = 0; 683 int index, slot, array, found = 0;
686 struct timer_list *nte; 684 struct timer_list *nte;
687 tvec_t *varray[4]; 685 struct tvec *varray[4];
688 686
689 /* Look for timer events in tv1. */ 687 /* Look for timer events in tv1. */
690 index = slot = timer_jiffies & TVR_MASK; 688 index = slot = timer_jiffies & TVR_MASK;
@@ -716,7 +714,7 @@ cascade:
716 varray[3] = &base->tv5; 714 varray[3] = &base->tv5;
717 715
718 for (array = 0; array < 4; array++) { 716 for (array = 0; array < 4; array++) {
719 tvec_t *varp = varray[array]; 717 struct tvec *varp = varray[array];
720 718
721 index = slot = timer_jiffies & TVN_MASK; 719 index = slot = timer_jiffies & TVN_MASK;
722 do { 720 do {
@@ -790,12 +788,12 @@ static unsigned long cmp_next_hrtimer_event(unsigned long now,
790} 788}
791 789
792/** 790/**
793 * next_timer_interrupt - return the jiffy of the next pending timer 791 * get_next_timer_interrupt - return the jiffy of the next pending timer
794 * @now: current time (in jiffies) 792 * @now: current time (in jiffies)
795 */ 793 */
796unsigned long get_next_timer_interrupt(unsigned long now) 794unsigned long get_next_timer_interrupt(unsigned long now)
797{ 795{
798 tvec_base_t *base = __get_cpu_var(tvec_bases); 796 struct tvec_base *base = __get_cpu_var(tvec_bases);
799 unsigned long expires; 797 unsigned long expires;
800 798
801 spin_lock(&base->lock); 799 spin_lock(&base->lock);
@@ -817,6 +815,19 @@ unsigned long next_timer_interrupt(void)
817 815
818#endif 816#endif
819 817
818#ifndef CONFIG_VIRT_CPU_ACCOUNTING
819void account_process_tick(struct task_struct *p, int user_tick)
820{
821 if (user_tick) {
822 account_user_time(p, jiffies_to_cputime(1));
823 account_user_time_scaled(p, jiffies_to_cputime(1));
824 } else {
825 account_system_time(p, HARDIRQ_OFFSET, jiffies_to_cputime(1));
826 account_system_time_scaled(p, jiffies_to_cputime(1));
827 }
828}
829#endif
830
820/* 831/*
821 * Called from the timer interrupt handler to charge one tick to the current 832 * Called from the timer interrupt handler to charge one tick to the current
822 * process. user_tick is 1 if the tick is user time, 0 for system. 833 * process. user_tick is 1 if the tick is user time, 0 for system.
@@ -827,13 +838,7 @@ void update_process_times(int user_tick)
827 int cpu = smp_processor_id(); 838 int cpu = smp_processor_id();
828 839
829 /* Note: this timer irq context must be accounted for as well. */ 840 /* Note: this timer irq context must be accounted for as well. */
830 if (user_tick) { 841 account_process_tick(p, user_tick);
831 account_user_time(p, jiffies_to_cputime(1));
832 account_user_time_scaled(p, jiffies_to_cputime(1));
833 } else {
834 account_system_time(p, HARDIRQ_OFFSET, jiffies_to_cputime(1));
835 account_system_time_scaled(p, jiffies_to_cputime(1));
836 }
837 run_local_timers(); 842 run_local_timers();
838 if (rcu_pending(cpu)) 843 if (rcu_pending(cpu))
839 rcu_check_callbacks(cpu, user_tick); 844 rcu_check_callbacks(cpu, user_tick);
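
The two hunks above factor the user/system accounting split out of update_process_times() into account_process_tick(), compiled only when CONFIG_VIRT_CPU_ACCOUNTING is unset so architectures with their own CPU accounting can provide a replacement. A small userspace sketch of the same per-task split (the struct and counters are invented for illustration, not the kernel's):

/* Userspace sketch: charge one tick to either user or system time. */
#include <stdio.h>

struct task {
	unsigned long utime;	/* ticks charged as user time */
	unsigned long stime;	/* ticks charged as system time */
};

static void account_tick(struct task *p, int user_tick)
{
	if (user_tick)
		p->utime++;
	else
		p->stime++;
}

int main(void)
{
	struct task t = { 0, 0 };

	account_tick(&t, 1);	/* tick arrived while running in user mode */
	account_tick(&t, 0);	/* tick arrived while in the kernel */
	printf("utime=%lu stime=%lu\n", t.utime, t.stime);
	return 0;
}
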
@@ -887,9 +892,9 @@ static inline void calc_load(unsigned long ticks)
887 */ 892 */
888static void run_timer_softirq(struct softirq_action *h) 893static void run_timer_softirq(struct softirq_action *h)
889{ 894{
890 tvec_base_t *base = __get_cpu_var(tvec_bases); 895 struct tvec_base *base = __get_cpu_var(tvec_bases);
891 896
892 hrtimer_run_queues(); 897 hrtimer_run_pending();
893 898
894 if (time_after_eq(jiffies, base->timer_jiffies)) 899 if (time_after_eq(jiffies, base->timer_jiffies))
895 __run_timers(base); 900 __run_timers(base);
@@ -900,6 +905,7 @@ static void run_timer_softirq(struct softirq_action *h)
900 */ 905 */
901void run_local_timers(void) 906void run_local_timers(void)
902{ 907{
908 hrtimer_run_queues();
903 raise_softirq(TIMER_SOFTIRQ); 909 raise_softirq(TIMER_SOFTIRQ);
904 softlockup_tick(); 910 softlockup_tick();
905} 911}
@@ -971,7 +977,7 @@ asmlinkage long sys_getppid(void)
971 int pid; 977 int pid;
972 978
973 rcu_read_lock(); 979 rcu_read_lock();
974 pid = task_ppid_nr_ns(current, current->nsproxy->pid_ns); 980 pid = task_tgid_nr_ns(current->real_parent, current->nsproxy->pid_ns);
975 rcu_read_unlock(); 981 rcu_read_unlock();
976 982
977 return pid; 983 return pid;
@@ -1093,6 +1099,13 @@ signed long __sched schedule_timeout_interruptible(signed long timeout)
1093} 1099}
1094EXPORT_SYMBOL(schedule_timeout_interruptible); 1100EXPORT_SYMBOL(schedule_timeout_interruptible);
1095 1101
1102signed long __sched schedule_timeout_killable(signed long timeout)
1103{
1104 __set_current_state(TASK_KILLABLE);
1105 return schedule_timeout(timeout);
1106}
1107EXPORT_SYMBOL(schedule_timeout_killable);
1108
1096signed long __sched schedule_timeout_uninterruptible(signed long timeout) 1109signed long __sched schedule_timeout_uninterruptible(signed long timeout)
1097{ 1110{
1098 __set_current_state(TASK_UNINTERRUPTIBLE); 1111 __set_current_state(TASK_UNINTERRUPTIBLE);
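
schedule_timeout_killable(), added above, follows the same shape as the existing interruptible and uninterruptible wrappers: set the sleep state, then delegate to schedule_timeout(). A rough userspace sketch of that wrapper pattern, with stand-in state values and a stubbed wait routine rather than the kernel's TASK_* constants and scheduler:

/* Userspace sketch: each wrapper only picks the state, then delegates. */
#include <stdio.h>

enum sleep_state { STATE_INTERRUPTIBLE, STATE_UNINTERRUPTIBLE, STATE_KILLABLE };

static enum sleep_state current_state;

static long wait_for(long timeout)
{
	/* a real implementation would sleep until the timeout or a wakeup */
	printf("sleeping in state %d for %ld ticks\n", current_state, timeout);
	return 0;
}

static long sleep_interruptible(long timeout)
{
	current_state = STATE_INTERRUPTIBLE;
	return wait_for(timeout);
}

static long sleep_killable(long timeout)
{
	/* the new variant: woken only by fatal signals */
	current_state = STATE_KILLABLE;
	return wait_for(timeout);
}

int main(void)
{
	sleep_interruptible(10);
	sleep_killable(10);
	return 0;
}
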
@@ -1212,11 +1225,11 @@ asmlinkage long sys_sysinfo(struct sysinfo __user *info)
1212 */ 1225 */
1213static struct lock_class_key base_lock_keys[NR_CPUS]; 1226static struct lock_class_key base_lock_keys[NR_CPUS];
1214 1227
1215static int __devinit init_timers_cpu(int cpu) 1228static int __cpuinit init_timers_cpu(int cpu)
1216{ 1229{
1217 int j; 1230 int j;
1218 tvec_base_t *base; 1231 struct tvec_base *base;
1219 static char __devinitdata tvec_base_done[NR_CPUS]; 1232 static char __cpuinitdata tvec_base_done[NR_CPUS];
1220 1233
1221 if (!tvec_base_done[cpu]) { 1234 if (!tvec_base_done[cpu]) {
1222 static char boot_done; 1235 static char boot_done;
@@ -1270,7 +1283,7 @@ static int __devinit init_timers_cpu(int cpu)
1270} 1283}
1271 1284
1272#ifdef CONFIG_HOTPLUG_CPU 1285#ifdef CONFIG_HOTPLUG_CPU
1273static void migrate_timer_list(tvec_base_t *new_base, struct list_head *head) 1286static void migrate_timer_list(struct tvec_base *new_base, struct list_head *head)
1274{ 1287{
1275 struct timer_list *timer; 1288 struct timer_list *timer;
1276 1289
@@ -1282,10 +1295,10 @@ static void migrate_timer_list(tvec_base_t *new_base, struct list_head *head)
1282 } 1295 }
1283} 1296}
1284 1297
1285static void __devinit migrate_timers(int cpu) 1298static void __cpuinit migrate_timers(int cpu)
1286{ 1299{
1287 tvec_base_t *old_base; 1300 struct tvec_base *old_base;
1288 tvec_base_t *new_base; 1301 struct tvec_base *new_base;
1289 int i; 1302 int i;
1290 1303
1291 BUG_ON(cpu_online(cpu)); 1304 BUG_ON(cpu_online(cpu));
diff --git a/kernel/user.c b/kernel/user.c
index 0f3aa0234107..bc1c48d35cb3 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -115,7 +115,7 @@ static void sched_switch_user(struct task_struct *p) { }
115 115
116#if defined(CONFIG_FAIR_USER_SCHED) && defined(CONFIG_SYSFS) 116#if defined(CONFIG_FAIR_USER_SCHED) && defined(CONFIG_SYSFS)
117 117
118static struct kobject uids_kobject; /* represents /sys/kernel/uids directory */ 118static struct kset *uids_kset; /* represents the /sys/kernel/uids/ directory */
119static DEFINE_MUTEX(uids_mutex); 119static DEFINE_MUTEX(uids_mutex);
120 120
121static inline void uids_mutex_lock(void) 121static inline void uids_mutex_lock(void)
@@ -128,86 +128,83 @@ static inline void uids_mutex_unlock(void)
128 mutex_unlock(&uids_mutex); 128 mutex_unlock(&uids_mutex);
129} 129}
130 130
131/* return cpu shares held by the user */ 131/* uid directory attributes */
132static ssize_t cpu_shares_show(struct kset *kset, char *buffer) 132static ssize_t cpu_shares_show(struct kobject *kobj,
133 struct kobj_attribute *attr,
134 char *buf)
133{ 135{
134 struct user_struct *up = container_of(kset, struct user_struct, kset); 136 struct user_struct *up = container_of(kobj, struct user_struct, kobj);
135 137
136 return sprintf(buffer, "%lu\n", sched_group_shares(up->tg)); 138 return sprintf(buf, "%lu\n", sched_group_shares(up->tg));
137} 139}
138 140
139/* modify cpu shares held by the user */ 141static ssize_t cpu_shares_store(struct kobject *kobj,
140static ssize_t cpu_shares_store(struct kset *kset, const char *buffer, 142 struct kobj_attribute *attr,
141 size_t size) 143 const char *buf, size_t size)
142{ 144{
143 struct user_struct *up = container_of(kset, struct user_struct, kset); 145 struct user_struct *up = container_of(kobj, struct user_struct, kobj);
144 unsigned long shares; 146 unsigned long shares;
145 int rc; 147 int rc;
146 148
147 sscanf(buffer, "%lu", &shares); 149 sscanf(buf, "%lu", &shares);
148 150
149 rc = sched_group_set_shares(up->tg, shares); 151 rc = sched_group_set_shares(up->tg, shares);
150 152
151 return (rc ? rc : size); 153 return (rc ? rc : size);
152} 154}
153 155
154static void user_attr_init(struct subsys_attribute *sa, char *name, int mode) 156static struct kobj_attribute cpu_share_attr =
157 __ATTR(cpu_share, 0644, cpu_shares_show, cpu_shares_store);
158
159/* default attributes per uid directory */
160static struct attribute *uids_attributes[] = {
161 &cpu_share_attr.attr,
162 NULL
163};
164
165/* the lifetime of user_struct is not managed by the core (now) */
166static void uids_release(struct kobject *kobj)
155{ 167{
156 sa->attr.name = name; 168 return;
157 sa->attr.mode = mode;
158 sa->show = cpu_shares_show;
159 sa->store = cpu_shares_store;
160} 169}
161 170
162/* Create "/sys/kernel/uids/<uid>" directory and 171static struct kobj_type uids_ktype = {
163 * "/sys/kernel/uids/<uid>/cpu_share" file for this user. 172 .sysfs_ops = &kobj_sysfs_ops,
164 */ 173 .default_attrs = uids_attributes,
165static int user_kobject_create(struct user_struct *up) 174 .release = uids_release,
175};
176
177/* create /sys/kernel/uids/<uid>/cpu_share file for this user */
178static int uids_user_create(struct user_struct *up)
166{ 179{
167 struct kset *kset = &up->kset; 180 struct kobject *kobj = &up->kobj;
168 struct kobject *kobj = &kset->kobj;
169 int error; 181 int error;
170 182
171 memset(kset, 0, sizeof(struct kset)); 183 memset(kobj, 0, sizeof(struct kobject));
172 kobj->parent = &uids_kobject; /* create under /sys/kernel/uids dir */ 184 kobj->kset = uids_kset;
173 kobject_set_name(kobj, "%d", up->uid); 185 error = kobject_init_and_add(kobj, &uids_ktype, NULL, "%d", up->uid);
174 kset_init(kset); 186 if (error) {
175 user_attr_init(&up->user_attr, "cpu_share", 0644); 187 kobject_put(kobj);
176
177 error = kobject_add(kobj);
178 if (error)
179 goto done; 188 goto done;
180 189 }
181 error = sysfs_create_file(kobj, &up->user_attr.attr);
182 if (error)
183 kobject_del(kobj);
184 190
185 kobject_uevent(kobj, KOBJ_ADD); 191 kobject_uevent(kobj, KOBJ_ADD);
186
187done: 192done:
188 return error; 193 return error;
189} 194}
190 195
191/* create these in sysfs filesystem: 196/* create these entries in sysfs:
192 * "/sys/kernel/uids" directory 197 * "/sys/kernel/uids" directory
193 * "/sys/kernel/uids/0" directory (for root user) 198 * "/sys/kernel/uids/0" directory (for root user)
194 * "/sys/kernel/uids/0/cpu_share" file (for root user) 199 * "/sys/kernel/uids/0/cpu_share" file (for root user)
195 */ 200 */
196int __init uids_kobject_init(void) 201int __init uids_sysfs_init(void)
197{ 202{
198 int error; 203 uids_kset = kset_create_and_add("uids", NULL, kernel_kobj);
199 204 if (!uids_kset)
200 /* create under /sys/kernel dir */ 205 return -ENOMEM;
201 uids_kobject.parent = &kernel_subsys.kobj;
202 uids_kobject.kset = &kernel_subsys;
203 kobject_set_name(&uids_kobject, "uids");
204 kobject_init(&uids_kobject);
205 206
206 error = kobject_add(&uids_kobject); 207 return uids_user_create(&root_user);
207 if (!error)
208 error = user_kobject_create(&root_user);
209
210 return error;
211} 208}
212 209
213/* work function to remove sysfs directory for a user and free up 210/* work function to remove sysfs directory for a user and free up
@@ -216,7 +213,6 @@ int __init uids_kobject_init(void)
216static void remove_user_sysfs_dir(struct work_struct *w) 213static void remove_user_sysfs_dir(struct work_struct *w)
217{ 214{
218 struct user_struct *up = container_of(w, struct user_struct, work); 215 struct user_struct *up = container_of(w, struct user_struct, work);
219 struct kobject *kobj = &up->kset.kobj;
220 unsigned long flags; 216 unsigned long flags;
221 int remove_user = 0; 217 int remove_user = 0;
222 218
@@ -238,9 +234,9 @@ static void remove_user_sysfs_dir(struct work_struct *w)
238 if (!remove_user) 234 if (!remove_user)
239 goto done; 235 goto done;
240 236
241 sysfs_remove_file(kobj, &up->user_attr.attr); 237 kobject_uevent(&up->kobj, KOBJ_REMOVE);
242 kobject_uevent(kobj, KOBJ_REMOVE); 238 kobject_del(&up->kobj);
243 kobject_del(kobj); 239 kobject_put(&up->kobj);
244 240
245 sched_destroy_user(up); 241 sched_destroy_user(up);
246 key_put(up->uid_keyring); 242 key_put(up->uid_keyring);
@@ -267,7 +263,8 @@ static inline void free_user(struct user_struct *up, unsigned long flags)
267 263
268#else /* CONFIG_FAIR_USER_SCHED && CONFIG_SYSFS */ 264#else /* CONFIG_FAIR_USER_SCHED && CONFIG_SYSFS */
269 265
270static inline int user_kobject_create(struct user_struct *up) { return 0; } 266int uids_sysfs_init(void) { return 0; }
267static inline int uids_user_create(struct user_struct *up) { return 0; }
271static inline void uids_mutex_lock(void) { } 268static inline void uids_mutex_lock(void) { }
272static inline void uids_mutex_unlock(void) { } 269static inline void uids_mutex_unlock(void) { }
273 270
@@ -322,9 +319,9 @@ void free_uid(struct user_struct *up)
322struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid) 319struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid)
323{ 320{
324 struct hlist_head *hashent = uidhashentry(ns, uid); 321 struct hlist_head *hashent = uidhashentry(ns, uid);
325 struct user_struct *up; 322 struct user_struct *up, *new;
326 323
327 /* Make uid_hash_find() + user_kobject_create() + uid_hash_insert() 324 /* Make uid_hash_find() + uids_user_create() + uid_hash_insert()
328 * atomic. 325 * atomic.
329 */ 326 */
330 uids_mutex_lock(); 327 uids_mutex_lock();
@@ -334,11 +331,10 @@ struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid)
334 spin_unlock_irq(&uidhash_lock); 331 spin_unlock_irq(&uidhash_lock);
335 332
336 if (!up) { 333 if (!up) {
337 struct user_struct *new;
338
339 new = kmem_cache_alloc(uid_cachep, GFP_KERNEL); 334 new = kmem_cache_alloc(uid_cachep, GFP_KERNEL);
340 if (!new) 335 if (!new)
341 return NULL; 336 goto out_unlock;
337
342 new->uid = uid; 338 new->uid = uid;
343 atomic_set(&new->__count, 1); 339 atomic_set(&new->__count, 1);
344 atomic_set(&new->processes, 0); 340 atomic_set(&new->processes, 0);
@@ -353,26 +349,14 @@ struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid)
353#endif 349#endif
354 new->locked_shm = 0; 350 new->locked_shm = 0;
355 351
356 if (alloc_uid_keyring(new, current) < 0) { 352 if (alloc_uid_keyring(new, current) < 0)
357 kmem_cache_free(uid_cachep, new); 353 goto out_free_user;
358 return NULL;
359 }
360 354
361 if (sched_create_user(new) < 0) { 355 if (sched_create_user(new) < 0)
362 key_put(new->uid_keyring); 356 goto out_put_keys;
363 key_put(new->session_keyring);
364 kmem_cache_free(uid_cachep, new);
365 return NULL;
366 }
367 357
368 if (user_kobject_create(new)) { 358 if (uids_user_create(new))
369 sched_destroy_user(new); 359 goto out_destoy_sched;
370 key_put(new->uid_keyring);
371 key_put(new->session_keyring);
372 kmem_cache_free(uid_cachep, new);
373 uids_mutex_unlock();
374 return NULL;
375 }
376 360
377 /* 361 /*
378 * Before adding this, check whether we raced 362 * Before adding this, check whether we raced
@@ -400,6 +384,17 @@ struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid)
400 uids_mutex_unlock(); 384 uids_mutex_unlock();
401 385
402 return up; 386 return up;
387
388out_destoy_sched:
389 sched_destroy_user(new);
390out_put_keys:
391 key_put(new->uid_keyring);
392 key_put(new->session_keyring);
393out_free_user:
394 kmem_cache_free(uid_cachep, new);
395out_unlock:
396 uids_mutex_unlock();
397 return NULL;
403} 398}
404 399
405void switch_uid(struct user_struct *new_user) 400void switch_uid(struct user_struct *new_user)
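
The alloc_uid() hunk above consolidates its failure handling into a chain of goto labels, so each later failure unwinds only what was already set up instead of repeating the cleanup in every branch. A small userspace C sketch of the same unwind style (the resources are placeholders invented for illustration):

/* Userspace sketch: goto-based error unwinding with one label per step. */
#include <stdio.h>
#include <stdlib.h>

struct thing {
	char *a;
	char *b;
};

static struct thing *make_thing(void)
{
	struct thing *t = malloc(sizeof(*t));
	if (!t)
		goto out;

	t->a = malloc(16);
	if (!t->a)
		goto out_free_thing;

	t->b = malloc(16);
	if (!t->b)
		goto out_free_a;

	return t;

out_free_a:
	free(t->a);
out_free_thing:
	free(t);
out:
	return NULL;
}

int main(void)
{
	struct thing *t = make_thing();

	printf("%s\n", t ? "created" : "failed");
	if (t) {
		free(t->b);
		free(t->a);
		free(t);
	}
	return 0;
}
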
diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c
index c76c06466bfd..fe3a56c2256d 100644
--- a/kernel/utsname_sysctl.c
+++ b/kernel/utsname_sysctl.c
@@ -18,6 +18,10 @@
18static void *get_uts(ctl_table *table, int write) 18static void *get_uts(ctl_table *table, int write)
19{ 19{
20 char *which = table->data; 20 char *which = table->data;
21 struct uts_namespace *uts_ns;
22
23 uts_ns = current->nsproxy->uts_ns;
24 which = (which - (char *)&init_uts_ns) + (char *)uts_ns;
21 25
22 if (!write) 26 if (!write)
23 down_read(&uts_sem); 27 down_read(&uts_sem);
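
The get_uts() change above translates a pointer into init_uts_ns into the corresponding field of the current task's UTS namespace by preserving the field's byte offset. A userspace sketch of that offset-preserving translation (struct layout and names invented for illustration):

/* Userspace sketch: reapply a field's offset to a different instance. */
#include <stdio.h>

struct ns {
	char sysname[16];
	char nodename[16];
};

static struct ns init_ns = { "Linux", "default" };

static char *translate(char *field_in_init, struct ns *current_ns)
{
	/* offset of the field inside init_ns, applied to current_ns */
	return (field_in_init - (char *)&init_ns) + (char *)current_ns;
}

int main(void)
{
	struct ns container = { "Linux", "container-1" };
	char *p = translate(init_ns.nodename, &container);

	printf("%s\n", p);	/* prints "container-1" */
	return 0;
}
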
diff --git a/kernel/wait.c b/kernel/wait.c
index 444ddbfaefc4..f9876888a569 100644
--- a/kernel/wait.c
+++ b/kernel/wait.c
@@ -215,7 +215,7 @@ void fastcall __wake_up_bit(wait_queue_head_t *wq, void *word, int bit)
215{ 215{
216 struct wait_bit_key key = __WAIT_BIT_KEY_INITIALIZER(word, bit); 216 struct wait_bit_key key = __WAIT_BIT_KEY_INITIALIZER(word, bit);
217 if (waitqueue_active(wq)) 217 if (waitqueue_active(wq))
218 __wake_up(wq, TASK_INTERRUPTIBLE|TASK_UNINTERRUPTIBLE, 1, &key); 218 __wake_up(wq, TASK_NORMAL, 1, &key);
219} 219}
220EXPORT_SYMBOL(__wake_up_bit); 220EXPORT_SYMBOL(__wake_up_bit);
221 221
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 52d5e7c9a8e6..52db48e7f6e7 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -67,9 +67,8 @@ struct workqueue_struct {
67#endif 67#endif
68}; 68};
69 69
70/* All the per-cpu workqueues on the system, for hotplug cpu to add/remove 70/* Serializes the accesses to the list of workqueues. */
71 threads to each one as cpus come/go. */ 71static DEFINE_SPINLOCK(workqueue_lock);
72static DEFINE_MUTEX(workqueue_mutex);
73static LIST_HEAD(workqueues); 72static LIST_HEAD(workqueues);
74 73
75static int singlethread_cpu __read_mostly; 74static int singlethread_cpu __read_mostly;
@@ -592,8 +591,6 @@ EXPORT_SYMBOL(schedule_delayed_work_on);
592 * Returns zero on success. 591 * Returns zero on success.
593 * Returns -ve errno on failure. 592 * Returns -ve errno on failure.
594 * 593 *
595 * Appears to be racy against CPU hotplug.
596 *
597 * schedule_on_each_cpu() is very slow. 594 * schedule_on_each_cpu() is very slow.
598 */ 595 */
599int schedule_on_each_cpu(work_func_t func) 596int schedule_on_each_cpu(work_func_t func)
@@ -605,7 +602,7 @@ int schedule_on_each_cpu(work_func_t func)
605 if (!works) 602 if (!works)
606 return -ENOMEM; 603 return -ENOMEM;
607 604
608 preempt_disable(); /* CPU hotplug */ 605 get_online_cpus();
609 for_each_online_cpu(cpu) { 606 for_each_online_cpu(cpu) {
610 struct work_struct *work = per_cpu_ptr(works, cpu); 607 struct work_struct *work = per_cpu_ptr(works, cpu);
611 608
@@ -613,8 +610,8 @@ int schedule_on_each_cpu(work_func_t func)
613 set_bit(WORK_STRUCT_PENDING, work_data_bits(work)); 610 set_bit(WORK_STRUCT_PENDING, work_data_bits(work));
614 __queue_work(per_cpu_ptr(keventd_wq->cpu_wq, cpu), work); 611 __queue_work(per_cpu_ptr(keventd_wq->cpu_wq, cpu), work);
615 } 612 }
616 preempt_enable();
617 flush_workqueue(keventd_wq); 613 flush_workqueue(keventd_wq);
614 put_online_cpus();
618 free_percpu(works); 615 free_percpu(works);
619 return 0; 616 return 0;
620} 617}
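
The schedule_on_each_cpu() hunk above swaps preempt_disable() for get_online_cpus()/put_online_cpus(), and the later hunks drop the CPU_LOCK_ACQUIRE/CPU_LOCK_RELEASE notifier cases in favour of the same calls. A kernel-style sketch of that bracketed iteration pattern (not part of the patch; the example names are invented):

/*
 * Sketch: hold off CPU hotplug while walking the set of online CPUs,
 * instead of relying on a private mutex taken from notifier events.
 */
#include <linux/cpu.h>
#include <linux/cpumask.h>
#include <linux/kernel.h>
#include <linux/percpu.h>

static DEFINE_PER_CPU(unsigned long, example_counter);

static void example_touch_all_online(void)
{
	int cpu;

	get_online_cpus();		/* block CPU hotplug while we walk */
	for_each_online_cpu(cpu)
		per_cpu(example_counter, cpu)++;
	put_online_cpus();		/* hotplug may proceed again */
}
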
@@ -722,7 +719,8 @@ static void start_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu)
722struct workqueue_struct *__create_workqueue_key(const char *name, 719struct workqueue_struct *__create_workqueue_key(const char *name,
723 int singlethread, 720 int singlethread,
724 int freezeable, 721 int freezeable,
725 struct lock_class_key *key) 722 struct lock_class_key *key,
723 const char *lock_name)
726{ 724{
727 struct workqueue_struct *wq; 725 struct workqueue_struct *wq;
728 struct cpu_workqueue_struct *cwq; 726 struct cpu_workqueue_struct *cwq;
@@ -739,7 +737,7 @@ struct workqueue_struct *__create_workqueue_key(const char *name,
739 } 737 }
740 738
741 wq->name = name; 739 wq->name = name;
742 lockdep_init_map(&wq->lockdep_map, name, key, 0); 740 lockdep_init_map(&wq->lockdep_map, lock_name, key, 0);
743 wq->singlethread = singlethread; 741 wq->singlethread = singlethread;
744 wq->freezeable = freezeable; 742 wq->freezeable = freezeable;
745 INIT_LIST_HEAD(&wq->list); 743 INIT_LIST_HEAD(&wq->list);
@@ -749,8 +747,10 @@ struct workqueue_struct *__create_workqueue_key(const char *name,
749 err = create_workqueue_thread(cwq, singlethread_cpu); 747 err = create_workqueue_thread(cwq, singlethread_cpu);
750 start_workqueue_thread(cwq, -1); 748 start_workqueue_thread(cwq, -1);
751 } else { 749 } else {
752 mutex_lock(&workqueue_mutex); 750 get_online_cpus();
751 spin_lock(&workqueue_lock);
753 list_add(&wq->list, &workqueues); 752 list_add(&wq->list, &workqueues);
753 spin_unlock(&workqueue_lock);
754 754
755 for_each_possible_cpu(cpu) { 755 for_each_possible_cpu(cpu) {
756 cwq = init_cpu_workqueue(wq, cpu); 756 cwq = init_cpu_workqueue(wq, cpu);
@@ -759,7 +759,7 @@ struct workqueue_struct *__create_workqueue_key(const char *name,
759 err = create_workqueue_thread(cwq, cpu); 759 err = create_workqueue_thread(cwq, cpu);
760 start_workqueue_thread(cwq, cpu); 760 start_workqueue_thread(cwq, cpu);
761 } 761 }
762 mutex_unlock(&workqueue_mutex); 762 put_online_cpus();
763 } 763 }
764 764
765 if (err) { 765 if (err) {
@@ -774,7 +774,7 @@ static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu)
774{ 774{
775 /* 775 /*
776 * Our caller is either destroy_workqueue() or CPU_DEAD, 776 * Our caller is either destroy_workqueue() or CPU_DEAD,
777 * workqueue_mutex protects cwq->thread 777 * get_online_cpus() protects cwq->thread.
778 */ 778 */
779 if (cwq->thread == NULL) 779 if (cwq->thread == NULL)
780 return; 780 return;
@@ -809,9 +809,11 @@ void destroy_workqueue(struct workqueue_struct *wq)
809 struct cpu_workqueue_struct *cwq; 809 struct cpu_workqueue_struct *cwq;
810 int cpu; 810 int cpu;
811 811
812 mutex_lock(&workqueue_mutex); 812 get_online_cpus();
813 spin_lock(&workqueue_lock);
813 list_del(&wq->list); 814 list_del(&wq->list);
814 mutex_unlock(&workqueue_mutex); 815 spin_unlock(&workqueue_lock);
816 put_online_cpus();
815 817
816 for_each_cpu_mask(cpu, *cpu_map) { 818 for_each_cpu_mask(cpu, *cpu_map) {
817 cwq = per_cpu_ptr(wq->cpu_wq, cpu); 819 cwq = per_cpu_ptr(wq->cpu_wq, cpu);
@@ -834,13 +836,6 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
834 action &= ~CPU_TASKS_FROZEN; 836 action &= ~CPU_TASKS_FROZEN;
835 837
836 switch (action) { 838 switch (action) {
837 case CPU_LOCK_ACQUIRE:
838 mutex_lock(&workqueue_mutex);
839 return NOTIFY_OK;
840
841 case CPU_LOCK_RELEASE:
842 mutex_unlock(&workqueue_mutex);
843 return NOTIFY_OK;
844 839
845 case CPU_UP_PREPARE: 840 case CPU_UP_PREPARE:
846 cpu_set(cpu, cpu_populated_map); 841 cpu_set(cpu, cpu_populated_map);
@@ -853,7 +848,8 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
853 case CPU_UP_PREPARE: 848 case CPU_UP_PREPARE:
854 if (!create_workqueue_thread(cwq, cpu)) 849 if (!create_workqueue_thread(cwq, cpu))
855 break; 850 break;
856 printk(KERN_ERR "workqueue for %i failed\n", cpu); 851 printk(KERN_ERR "workqueue [%s] for %i failed\n",
852 wq->name, cpu);
857 return NOTIFY_BAD; 853 return NOTIFY_BAD;
858 854
859 case CPU_ONLINE: 855 case CPU_ONLINE: