Diffstat (limited to 'kernel')
92 files changed, 8290 insertions, 3240 deletions
diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz
index 4af15802ccd4..526128a2e622 100644
--- a/kernel/Kconfig.hz
+++ b/kernel/Kconfig.hz
@@ -54,3 +54,5 @@ config HZ
54 | default 300 if HZ_300 | 54 | default 300 if HZ_300 |
55 | default 1000 if HZ_1000 | 55 | default 1000 if HZ_1000 |
56 | 56 | ||
57 | config SCHED_HRTICK | ||
58 | def_bool HIGH_RES_TIMERS && X86 | ||
diff --git a/kernel/Kconfig.instrumentation b/kernel/Kconfig.instrumentation
index f5f2c769d95e..468f47ad7503 100644
--- a/kernel/Kconfig.instrumentation
+++ b/kernel/Kconfig.instrumentation
@@ -20,8 +20,8 @@ config PROFILING
20 | 20 | ||
21 | config OPROFILE | 21 | config OPROFILE |
22 | tristate "OProfile system profiling (EXPERIMENTAL)" | 22 | tristate "OProfile system profiling (EXPERIMENTAL)" |
23 | depends on PROFILING | 23 | depends on PROFILING && !UML |
24 | depends on ALPHA || ARM || BLACKFIN || X86_32 || IA64 || M32R || MIPS || PARISC || PPC || S390 || SUPERH || SPARC || X86_64 | 24 | depends on ARCH_SUPPORTS_OPROFILE || ALPHA || ARM || BLACKFIN || IA64 || M32R || PARISC || PPC || S390 || SUPERH || SPARC |
25 | help | 25 | help |
26 | OProfile is a profiling system capable of profiling the | 26 | OProfile is a profiling system capable of profiling the |
27 | whole system, include the kernel, kernel modules, libraries, | 27 | whole system, include the kernel, kernel modules, libraries, |
@@ -31,7 +31,7 @@ config OPROFILE
31 | 31 | ||
32 | config KPROBES | 32 | config KPROBES |
33 | bool "Kprobes" | 33 | bool "Kprobes" |
34 | depends on KALLSYMS && MODULES | 34 | depends on KALLSYMS && MODULES && !UML |
35 | depends on X86_32 || IA64 || PPC || S390 || SPARC64 || X86_64 || AVR32 | 35 | depends on X86_32 || IA64 || PPC || S390 || SPARC64 || X86_64 || AVR32 |
36 | help | 36 | help |
37 | Kprobes allows you to trap at almost any kernel address and | 37 | Kprobes allows you to trap at almost any kernel address and |
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index c64ce9c14207..0669b70fa6a3 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -52,14 +52,13 @@ config PREEMPT
52 | 52 | ||
53 | endchoice | 53 | endchoice |
54 | 54 | ||
55 | config PREEMPT_BKL | 55 | config RCU_TRACE |
56 | bool "Preempt The Big Kernel Lock" | 56 | bool "Enable tracing for RCU - currently stats in debugfs" |
57 | depends on SMP || PREEMPT | 57 | select DEBUG_FS |
58 | default y | 58 | default y |
59 | help | 59 | help |
60 | This option reduces the latency of the kernel by making the | 60 | This option provides tracing in RCU which presents stats |
61 | big kernel lock preemptible. | 61 | in debugfs for debugging RCU implementation. |
62 | 62 | ||
63 | Say Y here if you are building a kernel for a desktop system. | 63 | Say Y here if you want to enable RCU tracing |
64 | Say N if you are unsure. | 64 | Say N if you are unsure. |
65 | |||
diff --git a/kernel/Makefile b/kernel/Makefile
index f60afe742599..8885627ea021 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -36,14 +36,15 @@ obj-$(CONFIG_KALLSYMS) += kallsyms.o
36 | obj-$(CONFIG_PM) += power/ | 36 | obj-$(CONFIG_PM) += power/ |
37 | obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o | 37 | obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o |
38 | obj-$(CONFIG_KEXEC) += kexec.o | 38 | obj-$(CONFIG_KEXEC) += kexec.o |
39 | obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o | ||
39 | obj-$(CONFIG_COMPAT) += compat.o | 40 | obj-$(CONFIG_COMPAT) += compat.o |
40 | obj-$(CONFIG_CGROUPS) += cgroup.o | 41 | obj-$(CONFIG_CGROUPS) += cgroup.o |
41 | obj-$(CONFIG_CGROUP_DEBUG) += cgroup_debug.o | 42 | obj-$(CONFIG_CGROUP_DEBUG) += cgroup_debug.o |
42 | obj-$(CONFIG_CPUSETS) += cpuset.o | 43 | obj-$(CONFIG_CPUSETS) += cpuset.o |
43 | obj-$(CONFIG_CGROUP_CPUACCT) += cpu_acct.o | ||
44 | obj-$(CONFIG_CGROUP_NS) += ns_cgroup.o | 44 | obj-$(CONFIG_CGROUP_NS) += ns_cgroup.o |
45 | obj-$(CONFIG_IKCONFIG) += configs.o | 45 | obj-$(CONFIG_IKCONFIG) += configs.o |
46 | obj-$(CONFIG_STOP_MACHINE) += stop_machine.o | 46 | obj-$(CONFIG_STOP_MACHINE) += stop_machine.o |
47 | obj-$(CONFIG_KPROBES_SANITY_TEST) += test_kprobes.o | ||
47 | obj-$(CONFIG_AUDIT) += audit.o auditfilter.o | 48 | obj-$(CONFIG_AUDIT) += audit.o auditfilter.o |
48 | obj-$(CONFIG_AUDITSYSCALL) += auditsc.o | 49 | obj-$(CONFIG_AUDITSYSCALL) += auditsc.o |
49 | obj-$(CONFIG_AUDIT_TREE) += audit_tree.o | 50 | obj-$(CONFIG_AUDIT_TREE) += audit_tree.o |
@@ -53,11 +54,17 @@ obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o
53 | obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ | 54 | obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ |
54 | obj-$(CONFIG_SECCOMP) += seccomp.o | 55 | obj-$(CONFIG_SECCOMP) += seccomp.o |
55 | obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o | 56 | obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o |
57 | obj-$(CONFIG_CLASSIC_RCU) += rcuclassic.o | ||
58 | obj-$(CONFIG_PREEMPT_RCU) += rcupreempt.o | ||
59 | ifeq ($(CONFIG_PREEMPT_RCU),y) | ||
60 | obj-$(CONFIG_RCU_TRACE) += rcupreempt_trace.o | ||
61 | endif | ||
56 | obj-$(CONFIG_RELAY) += relay.o | 62 | obj-$(CONFIG_RELAY) += relay.o |
57 | obj-$(CONFIG_SYSCTL) += utsname_sysctl.o | 63 | obj-$(CONFIG_SYSCTL) += utsname_sysctl.o |
58 | obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o | 64 | obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o |
59 | obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o | 65 | obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o |
60 | obj-$(CONFIG_MARKERS) += marker.o | 66 | obj-$(CONFIG_MARKERS) += marker.o |
67 | obj-$(CONFIG_LATENCYTOP) += latencytop.o | ||
61 | 68 | ||
62 | ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y) | 69 | ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y) |
63 | # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is | 70 | # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is |
diff --git a/kernel/acct.c b/kernel/acct.c
index fce53d8df8a7..521dfa53cb99 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -413,7 +413,7 @@ static u32 encode_float(u64 value)
413 | * The acct_process() call is the workhorse of the process | 413 | * The acct_process() call is the workhorse of the process |
414 | * accounting system. The struct acct is built here and then written | 414 | * accounting system. The struct acct is built here and then written |
415 | * into the accounting file. This function should only be called from | 415 | * into the accounting file. This function should only be called from |
416 | * do_exit(). | 416 | * do_exit() or when switching to a different output file. |
417 | */ | 417 | */ |
418 | 418 | ||
419 | /* | 419 | /* |
@@ -482,7 +482,7 @@ static void do_acct_process(struct file *file)
482 | #endif | 482 | #endif |
483 | #if ACCT_VERSION==3 | 483 | #if ACCT_VERSION==3 |
484 | ac.ac_pid = current->tgid; | 484 | ac.ac_pid = current->tgid; |
485 | ac.ac_ppid = current->parent->tgid; | 485 | ac.ac_ppid = current->real_parent->tgid; |
486 | #endif | 486 | #endif |
487 | 487 | ||
488 | spin_lock_irq(¤t->sighand->siglock); | 488 | spin_lock_irq(¤t->sighand->siglock); |
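The acct.c hunk above switches ac_ppid from current->parent->tgid to current->real_parent->tgid. A minimal userspace sketch of why the two pointers can differ (illustrative struct only, not the kernel's task_struct): while a process is being ptraced the kernel points ->parent at the tracer, but ->real_parent still names the process that forked it, which is what accounting should report.

#include <stdio.h>

/* Illustrative stand-in for the kernel's task_struct; only the two
 * pointers the hunk above chooses between are modelled here. */
struct task_like {
	int tgid;
	struct task_like *parent;      /* re-pointed at the tracer during ptrace */
	struct task_like *real_parent; /* always the forking parent */
};

int main(void)
{
	struct task_like shell  = { .tgid = 100 };
	struct task_like tracer = { .tgid = 300, .parent = &shell, .real_parent = &shell };
	struct task_like child  = { .tgid = 200, .parent = &shell, .real_parent = &shell };

	child.parent = &tracer;	/* effect of a ptrace attach */

	/* accounting wants 100 (the real parent), not 300 (the tracer) */
	printf("parent=%d real_parent=%d\n",
	       child.parent->tgid, child.real_parent->tgid);
	return 0;
}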
diff --git a/kernel/audit.c b/kernel/audit.c
index f93c2713017d..c8555b180213 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -66,10 +66,11 @@
66 | * (Initialization happens after skb_init is called.) */ | 66 | * (Initialization happens after skb_init is called.) */ |
67 | static int audit_initialized; | 67 | static int audit_initialized; |
68 | 68 | ||
69 | /* 0 - no auditing | 69 | #define AUDIT_OFF 0 |
70 | * 1 - auditing enabled | 70 | #define AUDIT_ON 1 |
71 | * 2 - auditing enabled and configuration is locked/unchangeable. */ | 71 | #define AUDIT_LOCKED 2 |
72 | int audit_enabled; | 72 | int audit_enabled; |
73 | int audit_ever_enabled; | ||
73 | 74 | ||
74 | /* Default state when kernel boots without any parameters. */ | 75 | /* Default state when kernel boots without any parameters. */ |
75 | static int audit_default; | 76 | static int audit_default; |
@@ -152,8 +153,10 @@ struct audit_buffer {
152 | 153 | ||
153 | static void audit_set_pid(struct audit_buffer *ab, pid_t pid) | 154 | static void audit_set_pid(struct audit_buffer *ab, pid_t pid) |
154 | { | 155 | { |
155 | struct nlmsghdr *nlh = nlmsg_hdr(ab->skb); | 156 | if (ab) { |
156 | nlh->nlmsg_pid = pid; | 157 | struct nlmsghdr *nlh = nlmsg_hdr(ab->skb); |
158 | nlh->nlmsg_pid = pid; | ||
159 | } | ||
157 | } | 160 | } |
158 | 161 | ||
159 | void audit_panic(const char *message) | 162 | void audit_panic(const char *message) |
@@ -163,7 +166,8 @@ void audit_panic(const char *message)
163 | case AUDIT_FAIL_SILENT: | 166 | case AUDIT_FAIL_SILENT: |
164 | break; | 167 | break; |
165 | case AUDIT_FAIL_PRINTK: | 168 | case AUDIT_FAIL_PRINTK: |
166 | printk(KERN_ERR "audit: %s\n", message); | 169 | if (printk_ratelimit()) |
170 | printk(KERN_ERR "audit: %s\n", message); | ||
167 | break; | 171 | break; |
168 | case AUDIT_FAIL_PANIC: | 172 | case AUDIT_FAIL_PANIC: |
169 | panic("audit: %s\n", message); | 173 | panic("audit: %s\n", message); |
@@ -231,161 +235,107 @@ void audit_log_lost(const char *message)
231 | } | 235 | } |
232 | 236 | ||
233 | if (print) { | 237 | if (print) { |
234 | printk(KERN_WARNING | 238 | if (printk_ratelimit()) |
235 | "audit: audit_lost=%d audit_rate_limit=%d audit_backlog_limit=%d\n", | 239 | printk(KERN_WARNING |
236 | atomic_read(&audit_lost), | 240 | "audit: audit_lost=%d audit_rate_limit=%d " |
237 | audit_rate_limit, | 241 | "audit_backlog_limit=%d\n", |
238 | audit_backlog_limit); | 242 | atomic_read(&audit_lost), |
243 | audit_rate_limit, | ||
244 | audit_backlog_limit); | ||
239 | audit_panic(message); | 245 | audit_panic(message); |
240 | } | 246 | } |
241 | } | 247 | } |
242 | 248 | ||
243 | static int audit_set_rate_limit(int limit, uid_t loginuid, u32 sid) | 249 | static int audit_log_config_change(char *function_name, int new, int old, |
250 | uid_t loginuid, u32 sid, int allow_changes) | ||
244 | { | 251 | { |
245 | int res, rc = 0, old = audit_rate_limit; | 252 | struct audit_buffer *ab; |
246 | 253 | int rc = 0; | |
247 | /* check if we are locked */ | ||
248 | if (audit_enabled == 2) | ||
249 | res = 0; | ||
250 | else | ||
251 | res = 1; | ||
252 | 254 | ||
255 | ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); | ||
256 | audit_log_format(ab, "%s=%d old=%d by auid=%u", function_name, new, | ||
257 | old, loginuid); | ||
253 | if (sid) { | 258 | if (sid) { |
254 | char *ctx = NULL; | 259 | char *ctx = NULL; |
255 | u32 len; | 260 | u32 len; |
256 | if ((rc = selinux_sid_to_string(sid, &ctx, &len)) == 0) { | 261 | |
257 | audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, | 262 | rc = selinux_sid_to_string(sid, &ctx, &len); |
258 | "audit_rate_limit=%d old=%d by auid=%u" | 263 | if (rc) { |
259 | " subj=%s res=%d", | 264 | audit_log_format(ab, " sid=%u", sid); |
260 | limit, old, loginuid, ctx, res); | 265 | allow_changes = 0; /* Something weird, deny request */ |
266 | } else { | ||
267 | audit_log_format(ab, " subj=%s", ctx); | ||
261 | kfree(ctx); | 268 | kfree(ctx); |
262 | } else | 269 | } |
263 | res = 0; /* Something weird, deny request */ | ||
264 | } | 270 | } |
265 | audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, | 271 | audit_log_format(ab, " res=%d", allow_changes); |
266 | "audit_rate_limit=%d old=%d by auid=%u res=%d", | 272 | audit_log_end(ab); |
267 | limit, old, loginuid, res); | ||
268 | |||
269 | /* If we are allowed, make the change */ | ||
270 | if (res == 1) | ||
271 | audit_rate_limit = limit; | ||
272 | /* Not allowed, update reason */ | ||
273 | else if (rc == 0) | ||
274 | rc = -EPERM; | ||
275 | return rc; | 273 | return rc; |
276 | } | 274 | } |
277 | 275 | ||
278 | static int audit_set_backlog_limit(int limit, uid_t loginuid, u32 sid) | 276 | static int audit_do_config_change(char *function_name, int *to_change, |
277 | int new, uid_t loginuid, u32 sid) | ||
279 | { | 278 | { |
280 | int res, rc = 0, old = audit_backlog_limit; | 279 | int allow_changes, rc = 0, old = *to_change; |
281 | 280 | ||
282 | /* check if we are locked */ | 281 | /* check if we are locked */ |
283 | if (audit_enabled == 2) | 282 | if (audit_enabled == AUDIT_LOCKED) |
284 | res = 0; | 283 | allow_changes = 0; |
285 | else | 284 | else |
286 | res = 1; | 285 | allow_changes = 1; |
287 | 286 | ||
288 | if (sid) { | 287 | if (audit_enabled != AUDIT_OFF) { |
289 | char *ctx = NULL; | 288 | rc = audit_log_config_change(function_name, new, old, |
290 | u32 len; | 289 | loginuid, sid, allow_changes); |
291 | if ((rc = selinux_sid_to_string(sid, &ctx, &len)) == 0) { | 290 | if (rc) |
292 | audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, | 291 | allow_changes = 0; |
293 | "audit_backlog_limit=%d old=%d by auid=%u" | ||
294 | " subj=%s res=%d", | ||
295 | limit, old, loginuid, ctx, res); | ||
296 | kfree(ctx); | ||
297 | } else | ||
298 | res = 0; /* Something weird, deny request */ | ||
299 | } | 292 | } |
300 | audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, | ||
301 | "audit_backlog_limit=%d old=%d by auid=%u res=%d", | ||
302 | limit, old, loginuid, res); | ||
303 | 293 | ||
304 | /* If we are allowed, make the change */ | 294 | /* If we are allowed, make the change */ |
305 | if (res == 1) | 295 | if (allow_changes == 1) |
306 | audit_backlog_limit = limit; | 296 | *to_change = new; |
307 | /* Not allowed, update reason */ | 297 | /* Not allowed, update reason */ |
308 | else if (rc == 0) | 298 | else if (rc == 0) |
309 | rc = -EPERM; | 299 | rc = -EPERM; |
310 | return rc; | 300 | return rc; |
311 | } | 301 | } |
312 | 302 | ||
313 | static int audit_set_enabled(int state, uid_t loginuid, u32 sid) | 303 | static int audit_set_rate_limit(int limit, uid_t loginuid, u32 sid) |
314 | { | 304 | { |
315 | int res, rc = 0, old = audit_enabled; | 305 | return audit_do_config_change("audit_rate_limit", &audit_rate_limit, |
306 | limit, loginuid, sid); | ||
307 | } | ||
308 | |||
309 | static int audit_set_backlog_limit(int limit, uid_t loginuid, u32 sid) | ||
310 | { | ||
311 | return audit_do_config_change("audit_backlog_limit", &audit_backlog_limit, | ||
312 | limit, loginuid, sid); | ||
313 | } | ||
316 | 314 | ||
317 | if (state < 0 || state > 2) | 315 | static int audit_set_enabled(int state, uid_t loginuid, u32 sid) |
316 | { | ||
317 | int rc; | ||
318 | if (state < AUDIT_OFF || state > AUDIT_LOCKED) | ||
318 | return -EINVAL; | 319 | return -EINVAL; |
319 | 320 | ||
320 | /* check if we are locked */ | 321 | rc = audit_do_config_change("audit_enabled", &audit_enabled, state, |
321 | if (audit_enabled == 2) | 322 | loginuid, sid); |
322 | res = 0; | ||
323 | else | ||
324 | res = 1; | ||
325 | 323 | ||
326 | if (sid) { | 324 | if (!rc) |
327 | char *ctx = NULL; | 325 | audit_ever_enabled |= !!state; |
328 | u32 len; | ||
329 | if ((rc = selinux_sid_to_string(sid, &ctx, &len)) == 0) { | ||
330 | audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, | ||
331 | "audit_enabled=%d old=%d by auid=%u" | ||
332 | " subj=%s res=%d", | ||
333 | state, old, loginuid, ctx, res); | ||
334 | kfree(ctx); | ||
335 | } else | ||
336 | res = 0; /* Something weird, deny request */ | ||
337 | } | ||
338 | audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, | ||
339 | "audit_enabled=%d old=%d by auid=%u res=%d", | ||
340 | state, old, loginuid, res); | ||
341 | 326 | ||
342 | /* If we are allowed, make the change */ | ||
343 | if (res == 1) | ||
344 | audit_enabled = state; | ||
345 | /* Not allowed, update reason */ | ||
346 | else if (rc == 0) | ||
347 | rc = -EPERM; | ||
348 | return rc; | 327 | return rc; |
349 | } | 328 | } |
350 | 329 | ||
351 | static int audit_set_failure(int state, uid_t loginuid, u32 sid) | 330 | static int audit_set_failure(int state, uid_t loginuid, u32 sid) |
352 | { | 331 | { |
353 | int res, rc = 0, old = audit_failure; | ||
354 | |||
355 | if (state != AUDIT_FAIL_SILENT | 332 | if (state != AUDIT_FAIL_SILENT |
356 | && state != AUDIT_FAIL_PRINTK | 333 | && state != AUDIT_FAIL_PRINTK |
357 | && state != AUDIT_FAIL_PANIC) | 334 | && state != AUDIT_FAIL_PANIC) |
358 | return -EINVAL; | 335 | return -EINVAL; |
359 | 336 | ||
360 | /* check if we are locked */ | 337 | return audit_do_config_change("audit_failure", &audit_failure, state, |
361 | if (audit_enabled == 2) | 338 | loginuid, sid); |
362 | res = 0; | ||
363 | else | ||
364 | res = 1; | ||
365 | |||
366 | if (sid) { | ||
367 | char *ctx = NULL; | ||
368 | u32 len; | ||
369 | if ((rc = selinux_sid_to_string(sid, &ctx, &len)) == 0) { | ||
370 | audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, | ||
371 | "audit_failure=%d old=%d by auid=%u" | ||
372 | " subj=%s res=%d", | ||
373 | state, old, loginuid, ctx, res); | ||
374 | kfree(ctx); | ||
375 | } else | ||
376 | res = 0; /* Something weird, deny request */ | ||
377 | } | ||
378 | audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, | ||
379 | "audit_failure=%d old=%d by auid=%u res=%d", | ||
380 | state, old, loginuid, res); | ||
381 | |||
382 | /* If we are allowed, make the change */ | ||
383 | if (res == 1) | ||
384 | audit_failure = state; | ||
385 | /* Not allowed, update reason */ | ||
386 | else if (rc == 0) | ||
387 | rc = -EPERM; | ||
388 | return rc; | ||
389 | } | 339 | } |
390 | 340 | ||
391 | static int kauditd_thread(void *dummy) | 341 | static int kauditd_thread(void *dummy) |
@@ -405,7 +355,11 @@ static int kauditd_thread(void *dummy)
405 | audit_pid = 0; | 355 | audit_pid = 0; |
406 | } | 356 | } |
407 | } else { | 357 | } else { |
408 | printk(KERN_NOTICE "%s\n", skb->data + NLMSG_SPACE(0)); | 358 | if (printk_ratelimit()) |
359 | printk(KERN_NOTICE "%s\n", skb->data + | ||
360 | NLMSG_SPACE(0)); | ||
361 | else | ||
362 | audit_log_lost("printk limit exceeded\n"); | ||
409 | kfree_skb(skb); | 363 | kfree_skb(skb); |
410 | } | 364 | } |
411 | } else { | 365 | } else { |
@@ -573,6 +527,33 @@ static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type)
573 | return err; | 527 | return err; |
574 | } | 528 | } |
575 | 529 | ||
530 | static int audit_log_common_recv_msg(struct audit_buffer **ab, u16 msg_type, | ||
531 | u32 pid, u32 uid, uid_t auid, u32 sid) | ||
532 | { | ||
533 | int rc = 0; | ||
534 | char *ctx = NULL; | ||
535 | u32 len; | ||
536 | |||
537 | if (!audit_enabled) { | ||
538 | *ab = NULL; | ||
539 | return rc; | ||
540 | } | ||
541 | |||
542 | *ab = audit_log_start(NULL, GFP_KERNEL, msg_type); | ||
543 | audit_log_format(*ab, "user pid=%d uid=%u auid=%u", | ||
544 | pid, uid, auid); | ||
545 | if (sid) { | ||
546 | rc = selinux_sid_to_string(sid, &ctx, &len); | ||
547 | if (rc) | ||
548 | audit_log_format(*ab, " ssid=%u", sid); | ||
549 | else | ||
550 | audit_log_format(*ab, " subj=%s", ctx); | ||
551 | kfree(ctx); | ||
552 | } | ||
553 | |||
554 | return rc; | ||
555 | } | ||
556 | |||
576 | static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) | 557 | static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) |
577 | { | 558 | { |
578 | u32 uid, pid, seq, sid; | 559 | u32 uid, pid, seq, sid; |
@@ -583,7 +564,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
583 | u16 msg_type = nlh->nlmsg_type; | 564 | u16 msg_type = nlh->nlmsg_type; |
584 | uid_t loginuid; /* loginuid of sender */ | 565 | uid_t loginuid; /* loginuid of sender */ |
585 | struct audit_sig_info *sig_data; | 566 | struct audit_sig_info *sig_data; |
586 | char *ctx; | 567 | char *ctx = NULL; |
587 | u32 len; | 568 | u32 len; |
588 | 569 | ||
589 | err = audit_netlink_ok(skb, msg_type); | 570 | err = audit_netlink_ok(skb, msg_type); |
@@ -634,23 +615,14 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
634 | if (err < 0) return err; | 615 | if (err < 0) return err; |
635 | } | 616 | } |
636 | if (status_get->mask & AUDIT_STATUS_PID) { | 617 | if (status_get->mask & AUDIT_STATUS_PID) { |
637 | int old = audit_pid; | 618 | int new_pid = status_get->pid; |
638 | if (sid) { | 619 | |
639 | if ((err = selinux_sid_to_string( | 620 | if (audit_enabled != AUDIT_OFF) |
640 | sid, &ctx, &len))) | 621 | audit_log_config_change("audit_pid", new_pid, |
641 | return err; | 622 | audit_pid, loginuid, |
642 | else | 623 | sid, 1); |
643 | audit_log(NULL, GFP_KERNEL, | 624 | |
644 | AUDIT_CONFIG_CHANGE, | 625 | audit_pid = new_pid; |
645 | "audit_pid=%d old=%d by auid=%u subj=%s", | ||
646 | status_get->pid, old, | ||
647 | loginuid, ctx); | ||
648 | kfree(ctx); | ||
649 | } else | ||
650 | audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, | ||
651 | "audit_pid=%d old=%d by auid=%u", | ||
652 | status_get->pid, old, loginuid); | ||
653 | audit_pid = status_get->pid; | ||
654 | } | 626 | } |
655 | if (status_get->mask & AUDIT_STATUS_RATE_LIMIT) | 627 | if (status_get->mask & AUDIT_STATUS_RATE_LIMIT) |
656 | err = audit_set_rate_limit(status_get->rate_limit, | 628 | err = audit_set_rate_limit(status_get->rate_limit, |
@@ -673,64 +645,35 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
673 | if (err) | 645 | if (err) |
674 | break; | 646 | break; |
675 | } | 647 | } |
676 | ab = audit_log_start(NULL, GFP_KERNEL, msg_type); | 648 | audit_log_common_recv_msg(&ab, msg_type, pid, uid, |
677 | if (ab) { | 649 | loginuid, sid); |
678 | audit_log_format(ab, | 650 | |
679 | "user pid=%d uid=%u auid=%u", | 651 | if (msg_type != AUDIT_USER_TTY) |
680 | pid, uid, loginuid); | 652 | audit_log_format(ab, " msg='%.1024s'", |
681 | if (sid) { | 653 | (char *)data); |
682 | if (selinux_sid_to_string( | 654 | else { |
683 | sid, &ctx, &len)) { | 655 | int size; |
684 | audit_log_format(ab, | 656 | |
685 | " ssid=%u", sid); | 657 | audit_log_format(ab, " msg="); |
686 | /* Maybe call audit_panic? */ | 658 | size = nlmsg_len(nlh); |
687 | } else | 659 | audit_log_n_untrustedstring(ab, size, |
688 | audit_log_format(ab, | 660 | data); |
689 | " subj=%s", ctx); | ||
690 | kfree(ctx); | ||
691 | } | ||
692 | if (msg_type != AUDIT_USER_TTY) | ||
693 | audit_log_format(ab, " msg='%.1024s'", | ||
694 | (char *)data); | ||
695 | else { | ||
696 | int size; | ||
697 | |||
698 | audit_log_format(ab, " msg="); | ||
699 | size = nlmsg_len(nlh); | ||
700 | audit_log_n_untrustedstring(ab, size, | ||
701 | data); | ||
702 | } | ||
703 | audit_set_pid(ab, pid); | ||
704 | audit_log_end(ab); | ||
705 | } | 661 | } |
662 | audit_set_pid(ab, pid); | ||
663 | audit_log_end(ab); | ||
706 | } | 664 | } |
707 | break; | 665 | break; |
708 | case AUDIT_ADD: | 666 | case AUDIT_ADD: |
709 | case AUDIT_DEL: | 667 | case AUDIT_DEL: |
710 | if (nlmsg_len(nlh) < sizeof(struct audit_rule)) | 668 | if (nlmsg_len(nlh) < sizeof(struct audit_rule)) |
711 | return -EINVAL; | 669 | return -EINVAL; |
712 | if (audit_enabled == 2) { | 670 | if (audit_enabled == AUDIT_LOCKED) { |
713 | ab = audit_log_start(NULL, GFP_KERNEL, | 671 | audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, pid, |
714 | AUDIT_CONFIG_CHANGE); | 672 | uid, loginuid, sid); |
715 | if (ab) { | 673 | |
716 | audit_log_format(ab, | 674 | audit_log_format(ab, " audit_enabled=%d res=0", |
717 | "pid=%d uid=%u auid=%u", | 675 | audit_enabled); |
718 | pid, uid, loginuid); | 676 | audit_log_end(ab); |
719 | if (sid) { | ||
720 | if (selinux_sid_to_string( | ||
721 | sid, &ctx, &len)) { | ||
722 | audit_log_format(ab, | ||
723 | " ssid=%u", sid); | ||
724 | /* Maybe call audit_panic? */ | ||
725 | } else | ||
726 | audit_log_format(ab, | ||
727 | " subj=%s", ctx); | ||
728 | kfree(ctx); | ||
729 | } | ||
730 | audit_log_format(ab, " audit_enabled=%d res=0", | ||
731 | audit_enabled); | ||
732 | audit_log_end(ab); | ||
733 | } | ||
734 | return -EPERM; | 677 | return -EPERM; |
735 | } | 678 | } |
736 | /* fallthrough */ | 679 | /* fallthrough */ |
@@ -743,28 +686,13 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
743 | case AUDIT_DEL_RULE: | 686 | case AUDIT_DEL_RULE: |
744 | if (nlmsg_len(nlh) < sizeof(struct audit_rule_data)) | 687 | if (nlmsg_len(nlh) < sizeof(struct audit_rule_data)) |
745 | return -EINVAL; | 688 | return -EINVAL; |
746 | if (audit_enabled == 2) { | 689 | if (audit_enabled == AUDIT_LOCKED) { |
747 | ab = audit_log_start(NULL, GFP_KERNEL, | 690 | audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, pid, |
748 | AUDIT_CONFIG_CHANGE); | 691 | uid, loginuid, sid); |
749 | if (ab) { | 692 | |
750 | audit_log_format(ab, | 693 | audit_log_format(ab, " audit_enabled=%d res=0", |
751 | "pid=%d uid=%u auid=%u", | 694 | audit_enabled); |
752 | pid, uid, loginuid); | 695 | audit_log_end(ab); |
753 | if (sid) { | ||
754 | if (selinux_sid_to_string( | ||
755 | sid, &ctx, &len)) { | ||
756 | audit_log_format(ab, | ||
757 | " ssid=%u", sid); | ||
758 | /* Maybe call audit_panic? */ | ||
759 | } else | ||
760 | audit_log_format(ab, | ||
761 | " subj=%s", ctx); | ||
762 | kfree(ctx); | ||
763 | } | ||
764 | audit_log_format(ab, " audit_enabled=%d res=0", | ||
765 | audit_enabled); | ||
766 | audit_log_end(ab); | ||
767 | } | ||
768 | return -EPERM; | 696 | return -EPERM; |
769 | } | 697 | } |
770 | /* fallthrough */ | 698 | /* fallthrough */ |
@@ -775,19 +703,10 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
775 | break; | 703 | break; |
776 | case AUDIT_TRIM: | 704 | case AUDIT_TRIM: |
777 | audit_trim_trees(); | 705 | audit_trim_trees(); |
778 | ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); | 706 | |
779 | if (!ab) | 707 | audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, pid, |
780 | break; | 708 | uid, loginuid, sid); |
781 | audit_log_format(ab, "auid=%u", loginuid); | 709 | |
782 | if (sid) { | ||
783 | u32 len; | ||
784 | ctx = NULL; | ||
785 | if (selinux_sid_to_string(sid, &ctx, &len)) | ||
786 | audit_log_format(ab, " ssid=%u", sid); | ||
787 | else | ||
788 | audit_log_format(ab, " subj=%s", ctx); | ||
789 | kfree(ctx); | ||
790 | } | ||
791 | audit_log_format(ab, " op=trim res=1"); | 710 | audit_log_format(ab, " op=trim res=1"); |
792 | audit_log_end(ab); | 711 | audit_log_end(ab); |
793 | break; | 712 | break; |
@@ -817,22 +736,9 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
817 | /* OK, here comes... */ | 736 | /* OK, here comes... */ |
818 | err = audit_tag_tree(old, new); | 737 | err = audit_tag_tree(old, new); |
819 | 738 | ||
820 | ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); | 739 | audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE, pid, |
821 | if (!ab) { | 740 | uid, loginuid, sid); |
822 | kfree(old); | 741 | |
823 | kfree(new); | ||
824 | break; | ||
825 | } | ||
826 | audit_log_format(ab, "auid=%u", loginuid); | ||
827 | if (sid) { | ||
828 | u32 len; | ||
829 | ctx = NULL; | ||
830 | if (selinux_sid_to_string(sid, &ctx, &len)) | ||
831 | audit_log_format(ab, " ssid=%u", sid); | ||
832 | else | ||
833 | audit_log_format(ab, " subj=%s", ctx); | ||
834 | kfree(ctx); | ||
835 | } | ||
836 | audit_log_format(ab, " op=make_equiv old="); | 742 | audit_log_format(ab, " op=make_equiv old="); |
837 | audit_log_untrustedstring(ab, old); | 743 | audit_log_untrustedstring(ab, old); |
838 | audit_log_format(ab, " new="); | 744 | audit_log_format(ab, " new="); |
@@ -965,6 +871,7 @@ static int __init audit_init(void)
965 | skb_queue_head_init(&audit_skb_queue); | 871 | skb_queue_head_init(&audit_skb_queue); |
966 | audit_initialized = 1; | 872 | audit_initialized = 1; |
967 | audit_enabled = audit_default; | 873 | audit_enabled = audit_default; |
874 | audit_ever_enabled |= !!audit_default; | ||
968 | 875 | ||
969 | /* Register the callback with selinux. This callback will be invoked | 876 | /* Register the callback with selinux. This callback will be invoked |
970 | * when a new policy is loaded. */ | 877 | * when a new policy is loaded. */ |
@@ -992,8 +899,10 @@ static int __init audit_enable(char *str)
992 | printk(KERN_INFO "audit: %s%s\n", | 899 | printk(KERN_INFO "audit: %s%s\n", |
993 | audit_default ? "enabled" : "disabled", | 900 | audit_default ? "enabled" : "disabled", |
994 | audit_initialized ? "" : " (after initialization)"); | 901 | audit_initialized ? "" : " (after initialization)"); |
995 | if (audit_initialized) | 902 | if (audit_initialized) { |
996 | audit_enabled = audit_default; | 903 | audit_enabled = audit_default; |
904 | audit_ever_enabled |= !!audit_default; | ||
905 | } | ||
997 | return 1; | 906 | return 1; |
998 | } | 907 | } |
999 | 908 | ||
@@ -1130,7 +1039,7 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
1130 | { | 1039 | { |
1131 | struct audit_buffer *ab = NULL; | 1040 | struct audit_buffer *ab = NULL; |
1132 | struct timespec t; | 1041 | struct timespec t; |
1133 | unsigned int serial; | 1042 | unsigned int uninitialized_var(serial); |
1134 | int reserve; | 1043 | int reserve; |
1135 | unsigned long timeout_start = jiffies; | 1044 | unsigned long timeout_start = jiffies; |
1136 | 1045 | ||
@@ -1164,7 +1073,7 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
1164 | remove_wait_queue(&audit_backlog_wait, &wait); | 1073 | remove_wait_queue(&audit_backlog_wait, &wait); |
1165 | continue; | 1074 | continue; |
1166 | } | 1075 | } |
1167 | if (audit_rate_check()) | 1076 | if (audit_rate_check() && printk_ratelimit()) |
1168 | printk(KERN_WARNING | 1077 | printk(KERN_WARNING |
1169 | "audit: audit_backlog=%d > " | 1078 | "audit: audit_backlog=%d > " |
1170 | "audit_backlog_limit=%d\n", | 1079 | "audit_backlog_limit=%d\n", |
@@ -1200,13 +1109,17 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask,
1200 | static inline int audit_expand(struct audit_buffer *ab, int extra) | 1109 | static inline int audit_expand(struct audit_buffer *ab, int extra) |
1201 | { | 1110 | { |
1202 | struct sk_buff *skb = ab->skb; | 1111 | struct sk_buff *skb = ab->skb; |
1203 | int ret = pskb_expand_head(skb, skb_headroom(skb), extra, | 1112 | int oldtail = skb_tailroom(skb); |
1204 | ab->gfp_mask); | 1113 | int ret = pskb_expand_head(skb, 0, extra, ab->gfp_mask); |
1114 | int newtail = skb_tailroom(skb); | ||
1115 | |||
1205 | if (ret < 0) { | 1116 | if (ret < 0) { |
1206 | audit_log_lost("out of memory in audit_expand"); | 1117 | audit_log_lost("out of memory in audit_expand"); |
1207 | return 0; | 1118 | return 0; |
1208 | } | 1119 | } |
1209 | return skb_tailroom(skb); | 1120 | |
1121 | skb->truesize += newtail - oldtail; | ||
1122 | return newtail; | ||
1210 | } | 1123 | } |
1211 | 1124 | ||
1212 | /* | 1125 | /* |
@@ -1245,6 +1158,7 @@ static void audit_log_vformat(struct audit_buffer *ab, const char *fmt,
1245 | goto out; | 1158 | goto out; |
1246 | len = vsnprintf(skb_tail_pointer(skb), avail, fmt, args2); | 1159 | len = vsnprintf(skb_tail_pointer(skb), avail, fmt, args2); |
1247 | } | 1160 | } |
1161 | va_end(args2); | ||
1248 | if (len > 0) | 1162 | if (len > 0) |
1249 | skb_put(skb, len); | 1163 | skb_put(skb, len); |
1250 | out: | 1164 | out: |
@@ -1346,6 +1260,21 @@ static void audit_log_n_string(struct audit_buffer *ab, size_t slen,
1346 | } | 1260 | } |
1347 | 1261 | ||
1348 | /** | 1262 | /** |
1263 | * audit_string_contains_control - does a string need to be logged in hex | ||
1264 | * @string - string to be checked | ||
1265 | * @len - max length of the string to check | ||
1266 | */ | ||
1267 | int audit_string_contains_control(const char *string, size_t len) | ||
1268 | { | ||
1269 | const unsigned char *p; | ||
1270 | for (p = string; p < (const unsigned char *)string + len && *p; p++) { | ||
1271 | if (*p == '"' || *p < 0x21 || *p > 0x7f) | ||
1272 | return 1; | ||
1273 | } | ||
1274 | return 0; | ||
1275 | } | ||
1276 | |||
1277 | /** | ||
1349 | * audit_log_n_untrustedstring - log a string that may contain random characters | 1278 | * audit_log_n_untrustedstring - log a string that may contain random characters |
1350 | * @ab: audit_buffer | 1279 | * @ab: audit_buffer |
1351 | * @len: lenth of string (not including trailing null) | 1280 | * @len: lenth of string (not including trailing null) |
@@ -1359,19 +1288,13 @@ static void audit_log_n_string(struct audit_buffer *ab, size_t slen,
1359 | * The caller specifies the number of characters in the string to log, which may | 1288 | * The caller specifies the number of characters in the string to log, which may |
1360 | * or may not be the entire string. | 1289 | * or may not be the entire string. |
1361 | */ | 1290 | */ |
1362 | const char *audit_log_n_untrustedstring(struct audit_buffer *ab, size_t len, | 1291 | void audit_log_n_untrustedstring(struct audit_buffer *ab, size_t len, |
1363 | const char *string) | 1292 | const char *string) |
1364 | { | 1293 | { |
1365 | const unsigned char *p; | 1294 | if (audit_string_contains_control(string, len)) |
1366 | 1295 | audit_log_hex(ab, string, len); | |
1367 | for (p = string; p < (const unsigned char *)string + len && *p; p++) { | 1296 | else |
1368 | if (*p == '"' || *p < 0x21 || *p > 0x7f) { | 1297 | audit_log_n_string(ab, len, string); |
1369 | audit_log_hex(ab, string, len); | ||
1370 | return string + len + 1; | ||
1371 | } | ||
1372 | } | ||
1373 | audit_log_n_string(ab, len, string); | ||
1374 | return p + 1; | ||
1375 | } | 1298 | } |
1376 | 1299 | ||
1377 | /** | 1300 | /** |
@@ -1382,9 +1305,9 @@ const char *audit_log_n_untrustedstring(struct audit_buffer *ab, size_t len,
1382 | * Same as audit_log_n_untrustedstring(), except that strlen is used to | 1305 | * Same as audit_log_n_untrustedstring(), except that strlen is used to |
1383 | * determine string length. | 1306 | * determine string length. |
1384 | */ | 1307 | */ |
1385 | const char *audit_log_untrustedstring(struct audit_buffer *ab, const char *string) | 1308 | void audit_log_untrustedstring(struct audit_buffer *ab, const char *string) |
1386 | { | 1309 | { |
1387 | return audit_log_n_untrustedstring(ab, strlen(string), string); | 1310 | audit_log_n_untrustedstring(ab, strlen(string), string); |
1388 | } | 1311 | } |
1389 | 1312 | ||
1390 | /* This is a helper-function to print the escaped d_path */ | 1313 | /* This is a helper-function to print the escaped d_path */ |
@@ -1433,8 +1356,11 @@ void audit_log_end(struct audit_buffer *ab)
1433 | skb_queue_tail(&audit_skb_queue, ab->skb); | 1356 | skb_queue_tail(&audit_skb_queue, ab->skb); |
1434 | ab->skb = NULL; | 1357 | ab->skb = NULL; |
1435 | wake_up_interruptible(&kauditd_wait); | 1358 | wake_up_interruptible(&kauditd_wait); |
1359 | } else if (printk_ratelimit()) { | ||
1360 | struct nlmsghdr *nlh = nlmsg_hdr(ab->skb); | ||
1361 | printk(KERN_NOTICE "type=%d %s\n", nlh->nlmsg_type, ab->skb->data + NLMSG_SPACE(0)); | ||
1436 | } else { | 1362 | } else { |
1437 | printk(KERN_NOTICE "%s\n", ab->skb->data + NLMSG_SPACE(0)); | 1363 | audit_log_lost("printk limit exceeded\n"); |
1438 | } | 1364 | } |
1439 | } | 1365 | } |
1440 | audit_buffer_free(ab); | 1366 | audit_buffer_free(ab); |
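The audit.c changes above fold four nearly identical setters (audit_set_rate_limit, audit_set_backlog_limit, audit_set_enabled, audit_set_failure) into one audit_do_config_change() helper that logs the old and new value and only then applies the change when auditing is not locked. A compilable userspace sketch of that pattern; the printf stands in for the kernel's AUDIT_CONFIG_CHANGE record and the names are illustrative, not the kernel API:

#include <stdio.h>

enum { AUDIT_OFF, AUDIT_ON, AUDIT_LOCKED };	/* mirrors the new #defines */

static int audit_enabled = AUDIT_ON;

/* Log "knob=new old=old res=allowed", then update only when allowed. */
static int do_config_change(const char *name, int *to_change, int new_val)
{
	int old = *to_change;
	int allow_changes = (audit_enabled != AUDIT_LOCKED);

	printf("%s=%d old=%d res=%d\n", name, new_val, old, allow_changes);
	if (!allow_changes)
		return -1;	/* the kernel helper returns -EPERM here */
	*to_change = new_val;
	return 0;
}

int main(void)
{
	int rate_limit = 0;

	do_config_change("audit_rate_limit", &rate_limit, 100);	/* applied */
	audit_enabled = AUDIT_LOCKED;
	do_config_change("audit_rate_limit", &rate_limit, 200);	/* denied */
	printf("audit_rate_limit is now %d\n", rate_limit);		/* still 100 */
	return 0;
}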
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 5d96f2cc7be8..6f19fd477aac 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -95,6 +95,8 @@ extern struct inotify_handle *audit_ih;
95 | /* Inotify events we care about. */ | 95 | /* Inotify events we care about. */ |
96 | #define AUDIT_IN_WATCH IN_MOVE|IN_CREATE|IN_DELETE|IN_DELETE_SELF|IN_MOVE_SELF | 96 | #define AUDIT_IN_WATCH IN_MOVE|IN_CREATE|IN_DELETE|IN_DELETE_SELF|IN_MOVE_SELF |
97 | 97 | ||
98 | extern int audit_enabled; | ||
99 | |||
98 | void audit_free_parent(struct inotify_watch *i_watch) | 100 | void audit_free_parent(struct inotify_watch *i_watch) |
99 | { | 101 | { |
100 | struct audit_parent *parent; | 102 | struct audit_parent *parent; |
@@ -974,7 +976,6 @@ static void audit_update_watch(struct audit_parent *parent,
974 | struct audit_watch *owatch, *nwatch, *nextw; | 976 | struct audit_watch *owatch, *nwatch, *nextw; |
975 | struct audit_krule *r, *nextr; | 977 | struct audit_krule *r, *nextr; |
976 | struct audit_entry *oentry, *nentry; | 978 | struct audit_entry *oentry, *nentry; |
977 | struct audit_buffer *ab; | ||
978 | 979 | ||
979 | mutex_lock(&audit_filter_mutex); | 980 | mutex_lock(&audit_filter_mutex); |
980 | list_for_each_entry_safe(owatch, nextw, &parent->watches, wlist) { | 981 | list_for_each_entry_safe(owatch, nextw, &parent->watches, wlist) { |
@@ -1014,13 +1015,18 @@ static void audit_update_watch(struct audit_parent *parent,
1014 | call_rcu(&oentry->rcu, audit_free_rule_rcu); | 1015 | call_rcu(&oentry->rcu, audit_free_rule_rcu); |
1015 | } | 1016 | } |
1016 | 1017 | ||
1017 | ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); | 1018 | if (audit_enabled) { |
1018 | audit_log_format(ab, "op=updated rules specifying path="); | 1019 | struct audit_buffer *ab; |
1019 | audit_log_untrustedstring(ab, owatch->path); | 1020 | ab = audit_log_start(NULL, GFP_KERNEL, |
1020 | audit_log_format(ab, " with dev=%u ino=%lu\n", dev, ino); | 1021 | AUDIT_CONFIG_CHANGE); |
1021 | audit_log_format(ab, " list=%d res=1", r->listnr); | 1022 | audit_log_format(ab, |
1022 | audit_log_end(ab); | 1023 | "op=updated rules specifying path="); |
1023 | 1024 | audit_log_untrustedstring(ab, owatch->path); | |
1025 | audit_log_format(ab, " with dev=%u ino=%lu\n", | ||
1026 | dev, ino); | ||
1027 | audit_log_format(ab, " list=%d res=1", r->listnr); | ||
1028 | audit_log_end(ab); | ||
1029 | } | ||
1024 | audit_remove_watch(owatch); | 1030 | audit_remove_watch(owatch); |
1025 | goto add_watch_to_parent; /* event applies to a single watch */ | 1031 | goto add_watch_to_parent; /* event applies to a single watch */ |
1026 | } | 1032 | } |
@@ -1039,25 +1045,28 @@ static void audit_remove_parent_watches(struct audit_parent *parent)
1039 | struct audit_watch *w, *nextw; | 1045 | struct audit_watch *w, *nextw; |
1040 | struct audit_krule *r, *nextr; | 1046 | struct audit_krule *r, *nextr; |
1041 | struct audit_entry *e; | 1047 | struct audit_entry *e; |
1042 | struct audit_buffer *ab; | ||
1043 | 1048 | ||
1044 | mutex_lock(&audit_filter_mutex); | 1049 | mutex_lock(&audit_filter_mutex); |
1045 | parent->flags |= AUDIT_PARENT_INVALID; | 1050 | parent->flags |= AUDIT_PARENT_INVALID; |
1046 | list_for_each_entry_safe(w, nextw, &parent->watches, wlist) { | 1051 | list_for_each_entry_safe(w, nextw, &parent->watches, wlist) { |
1047 | list_for_each_entry_safe(r, nextr, &w->rules, rlist) { | 1052 | list_for_each_entry_safe(r, nextr, &w->rules, rlist) { |
1048 | e = container_of(r, struct audit_entry, rule); | 1053 | e = container_of(r, struct audit_entry, rule); |
1049 | 1054 | if (audit_enabled) { | |
1050 | ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); | 1055 | struct audit_buffer *ab; |
1051 | audit_log_format(ab, "op=remove rule path="); | 1056 | ab = audit_log_start(NULL, GFP_KERNEL, |
1052 | audit_log_untrustedstring(ab, w->path); | 1057 | AUDIT_CONFIG_CHANGE); |
1053 | if (r->filterkey) { | 1058 | audit_log_format(ab, "op=remove rule path="); |
1054 | audit_log_format(ab, " key="); | 1059 | audit_log_untrustedstring(ab, w->path); |
1055 | audit_log_untrustedstring(ab, r->filterkey); | 1060 | if (r->filterkey) { |
1056 | } else | 1061 | audit_log_format(ab, " key="); |
1057 | audit_log_format(ab, " key=(null)"); | 1062 | audit_log_untrustedstring(ab, |
1058 | audit_log_format(ab, " list=%d res=1", r->listnr); | 1063 | r->filterkey); |
1059 | audit_log_end(ab); | 1064 | } else |
1060 | 1065 | audit_log_format(ab, " key=(null)"); | |
1066 | audit_log_format(ab, " list=%d res=1", | ||
1067 | r->listnr); | ||
1068 | audit_log_end(ab); | ||
1069 | } | ||
1061 | list_del(&r->rlist); | 1070 | list_del(&r->rlist); |
1062 | list_del_rcu(&e->list); | 1071 | list_del_rcu(&e->list); |
1063 | call_rcu(&e->rcu, audit_free_rule_rcu); | 1072 | call_rcu(&e->rcu, audit_free_rule_rcu); |
@@ -1495,6 +1504,9 @@ static void audit_log_rule_change(uid_t loginuid, u32 sid, char *action,
1495 | { | 1504 | { |
1496 | struct audit_buffer *ab; | 1505 | struct audit_buffer *ab; |
1497 | 1506 | ||
1507 | if (!audit_enabled) | ||
1508 | return; | ||
1509 | |||
1498 | ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); | 1510 | ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); |
1499 | if (!ab) | 1511 | if (!ab) |
1500 | return; | 1512 | return; |
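auditfilter.c logs watch paths through audit_log_untrustedstring(), which after this patch defers to the new audit_string_contains_control() helper in audit.c: a string containing a double quote, a byte below 0x21 (space and control characters), or a byte above 0x7f is emitted as hex rather than as a quoted string. A standalone restatement of that check, in userspace C for illustration only:

#include <stdio.h>
#include <string.h>

/* Same test as audit_string_contains_control() above, restated so it
 * can be tried directly. */
static int contains_control(const char *string, size_t len)
{
	const unsigned char *p;

	for (p = (const unsigned char *)string;
	     p < (const unsigned char *)string + len && *p; p++) {
		if (*p == '"' || *p < 0x21 || *p > 0x7f)
			return 1;
	}
	return 0;
}

int main(void)
{
	const char *plain = "/etc/passwd";
	const char *odd   = "/tmp/two words";	/* the space forces hex logging */

	printf("%s -> %s\n", plain,
	       contains_control(plain, strlen(plain)) ? "hex" : "quoted");
	printf("%s -> %s\n", odd,
	       contains_control(odd, strlen(odd)) ? "hex" : "quoted");
	return 0;
}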
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index bce9ecdb7712..1c06ecf38d7b 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -70,6 +70,7 @@
70 | #include "audit.h" | 70 | #include "audit.h" |
71 | 71 | ||
72 | extern struct list_head audit_filter_list[]; | 72 | extern struct list_head audit_filter_list[]; |
73 | extern int audit_ever_enabled; | ||
73 | 74 | ||
74 | /* AUDIT_NAMES is the number of slots we reserve in the audit_context | 75 | /* AUDIT_NAMES is the number of slots we reserve in the audit_context |
75 | * for saving names from getname(). */ | 76 | * for saving names from getname(). */ |
@@ -78,6 +79,9 @@ extern struct list_head audit_filter_list[];
78 | /* Indicates that audit should log the full pathname. */ | 79 | /* Indicates that audit should log the full pathname. */ |
79 | #define AUDIT_NAME_FULL -1 | 80 | #define AUDIT_NAME_FULL -1 |
80 | 81 | ||
82 | /* no execve audit message should be longer than this (userspace limits) */ | ||
83 | #define MAX_EXECVE_AUDIT_LEN 7500 | ||
84 | |||
81 | /* number of audit rules */ | 85 | /* number of audit rules */ |
82 | int audit_n_rules; | 86 | int audit_n_rules; |
83 | 87 | ||
@@ -176,7 +180,11 @@ struct audit_aux_data_fd_pair {
176 | struct audit_aux_data_pids { | 180 | struct audit_aux_data_pids { |
177 | struct audit_aux_data d; | 181 | struct audit_aux_data d; |
178 | pid_t target_pid[AUDIT_AUX_PIDS]; | 182 | pid_t target_pid[AUDIT_AUX_PIDS]; |
183 | uid_t target_auid[AUDIT_AUX_PIDS]; | ||
184 | uid_t target_uid[AUDIT_AUX_PIDS]; | ||
185 | unsigned int target_sessionid[AUDIT_AUX_PIDS]; | ||
179 | u32 target_sid[AUDIT_AUX_PIDS]; | 186 | u32 target_sid[AUDIT_AUX_PIDS]; |
187 | char target_comm[AUDIT_AUX_PIDS][TASK_COMM_LEN]; | ||
180 | int pid_count; | 188 | int pid_count; |
181 | }; | 189 | }; |
182 | 190 | ||
@@ -192,7 +200,6 @@ struct audit_context {
192 | enum audit_state state; | 200 | enum audit_state state; |
193 | unsigned int serial; /* serial number for record */ | 201 | unsigned int serial; /* serial number for record */ |
194 | struct timespec ctime; /* time of syscall entry */ | 202 | struct timespec ctime; /* time of syscall entry */ |
195 | uid_t loginuid; /* login uid (identity) */ | ||
196 | int major; /* syscall number */ | 203 | int major; /* syscall number */ |
197 | unsigned long argv[4]; /* syscall arguments */ | 204 | unsigned long argv[4]; /* syscall arguments */ |
198 | int return_valid; /* return code is valid */ | 205 | int return_valid; /* return code is valid */ |
@@ -215,7 +222,11 @@ struct audit_context {
215 | int arch; | 222 | int arch; |
216 | 223 | ||
217 | pid_t target_pid; | 224 | pid_t target_pid; |
225 | uid_t target_auid; | ||
226 | uid_t target_uid; | ||
227 | unsigned int target_sessionid; | ||
218 | u32 target_sid; | 228 | u32 target_sid; |
229 | char target_comm[TASK_COMM_LEN]; | ||
219 | 230 | ||
220 | struct audit_tree_refs *trees, *first_trees; | 231 | struct audit_tree_refs *trees, *first_trees; |
221 | int tree_count; | 232 | int tree_count; |
@@ -506,7 +517,7 @@ static int audit_filter_rules(struct task_struct *tsk,
506 | case AUDIT_LOGINUID: | 517 | case AUDIT_LOGINUID: |
507 | result = 0; | 518 | result = 0; |
508 | if (ctx) | 519 | if (ctx) |
509 | result = audit_comparator(ctx->loginuid, f->op, f->val); | 520 | result = audit_comparator(tsk->loginuid, f->op, f->val); |
510 | break; | 521 | break; |
511 | case AUDIT_SUBJ_USER: | 522 | case AUDIT_SUBJ_USER: |
512 | case AUDIT_SUBJ_ROLE: | 523 | case AUDIT_SUBJ_ROLE: |
@@ -702,7 +713,24 @@ static inline struct audit_context *audit_get_context(struct task_struct *tsk,
702 | if (likely(!context)) | 713 | if (likely(!context)) |
703 | return NULL; | 714 | return NULL; |
704 | context->return_valid = return_valid; | 715 | context->return_valid = return_valid; |
705 | context->return_code = return_code; | 716 | |
717 | /* | ||
718 | * we need to fix up the return code in the audit logs if the actual | ||
719 | * return codes are later going to be fixed up by the arch specific | ||
720 | * signal handlers | ||
721 | * | ||
722 | * This is actually a test for: | ||
723 | * (rc == ERESTARTSYS ) || (rc == ERESTARTNOINTR) || | ||
724 | * (rc == ERESTARTNOHAND) || (rc == ERESTART_RESTARTBLOCK) | ||
725 | * | ||
726 | * but is faster than a bunch of || | ||
727 | */ | ||
728 | if (unlikely(return_code <= -ERESTARTSYS) && | ||
729 | (return_code >= -ERESTART_RESTARTBLOCK) && | ||
730 | (return_code != -ENOIOCTLCMD)) | ||
731 | context->return_code = -EINTR; | ||
732 | else | ||
733 | context->return_code = return_code; | ||
706 | 734 | ||
707 | if (context->in_syscall && !context->dummy && !context->auditable) { | 735 | if (context->in_syscall && !context->dummy && !context->auditable) { |
708 | enum audit_state state; | 736 | enum audit_state state; |
@@ -783,11 +811,8 @@ static inline void audit_free_aux(struct audit_context *context)
783 | static inline void audit_zero_context(struct audit_context *context, | 811 | static inline void audit_zero_context(struct audit_context *context, |
784 | enum audit_state state) | 812 | enum audit_state state) |
785 | { | 813 | { |
786 | uid_t loginuid = context->loginuid; | ||
787 | |||
788 | memset(context, 0, sizeof(*context)); | 814 | memset(context, 0, sizeof(*context)); |
789 | context->state = state; | 815 | context->state = state; |
790 | context->loginuid = loginuid; | ||
791 | } | 816 | } |
792 | 817 | ||
793 | static inline struct audit_context *audit_alloc_context(enum audit_state state) | 818 | static inline struct audit_context *audit_alloc_context(enum audit_state state) |
@@ -814,7 +839,7 @@ int audit_alloc(struct task_struct *tsk)
814 | struct audit_context *context; | 839 | struct audit_context *context; |
815 | enum audit_state state; | 840 | enum audit_state state; |
816 | 841 | ||
817 | if (likely(!audit_enabled)) | 842 | if (likely(!audit_ever_enabled)) |
818 | return 0; /* Return if not auditing. */ | 843 | return 0; /* Return if not auditing. */ |
819 | 844 | ||
820 | state = audit_filter_task(tsk); | 845 | state = audit_filter_task(tsk); |
@@ -826,11 +851,6 @@ int audit_alloc(struct task_struct *tsk)
826 | return -ENOMEM; | 851 | return -ENOMEM; |
827 | } | 852 | } |
828 | 853 | ||
829 | /* Preserve login uid */ | ||
830 | context->loginuid = -1; | ||
831 | if (current->audit_context) | ||
832 | context->loginuid = current->audit_context->loginuid; | ||
833 | |||
834 | tsk->audit_context = context; | 854 | tsk->audit_context = context; |
835 | set_tsk_thread_flag(tsk, TIF_SYSCALL_AUDIT); | 855 | set_tsk_thread_flag(tsk, TIF_SYSCALL_AUDIT); |
836 | return 0; | 856 | return 0; |
@@ -922,7 +942,8 @@ static void audit_log_task_info(struct audit_buffer *ab, struct task_struct *tsk
922 | } | 942 | } |
923 | 943 | ||
924 | static int audit_log_pid_context(struct audit_context *context, pid_t pid, | 944 | static int audit_log_pid_context(struct audit_context *context, pid_t pid, |
925 | u32 sid) | 945 | uid_t auid, uid_t uid, unsigned int sessionid, |
946 | u32 sid, char *comm) | ||
926 | { | 947 | { |
927 | struct audit_buffer *ab; | 948 | struct audit_buffer *ab; |
928 | char *s = NULL; | 949 | char *s = NULL; |
@@ -931,68 +952,204 @@ static int audit_log_pid_context(struct audit_context *context, pid_t pid,
931 | 952 | ||
932 | ab = audit_log_start(context, GFP_KERNEL, AUDIT_OBJ_PID); | 953 | ab = audit_log_start(context, GFP_KERNEL, AUDIT_OBJ_PID); |
933 | if (!ab) | 954 | if (!ab) |
934 | return 1; | 955 | return rc; |
935 | 956 | ||
957 | audit_log_format(ab, "opid=%d oauid=%d ouid=%d oses=%d", pid, auid, | ||
958 | uid, sessionid); | ||
936 | if (selinux_sid_to_string(sid, &s, &len)) { | 959 | if (selinux_sid_to_string(sid, &s, &len)) { |
937 | audit_log_format(ab, "opid=%d obj=(none)", pid); | 960 | audit_log_format(ab, " obj=(none)"); |
938 | rc = 1; | 961 | rc = 1; |
939 | } else | 962 | } else |
940 | audit_log_format(ab, "opid=%d obj=%s", pid, s); | 963 | audit_log_format(ab, " obj=%s", s); |
964 | audit_log_format(ab, " ocomm="); | ||
965 | audit_log_untrustedstring(ab, comm); | ||
941 | audit_log_end(ab); | 966 | audit_log_end(ab); |
942 | kfree(s); | 967 | kfree(s); |
943 | 968 | ||
944 | return rc; | 969 | return rc; |
945 | } | 970 | } |
946 | 971 | ||
947 | static void audit_log_execve_info(struct audit_buffer *ab, | 972 | /* |
948 | struct audit_aux_data_execve *axi) | 973 | * to_send and len_sent accounting are very loose estimates. We aren't |
974 | * really worried about a hard cap to MAX_EXECVE_AUDIT_LEN so much as being | ||
975 | * within about 500 bytes (next page boundry) | ||
976 | * | ||
977 | * why snprintf? an int is up to 12 digits long. if we just assumed when | ||
978 | * logging that a[%d]= was going to be 16 characters long we would be wasting | ||
979 | * space in every audit message. In one 7500 byte message we can log up to | ||
980 | * about 1000 min size arguments. That comes down to about 50% waste of space | ||
981 | * if we didn't do the snprintf to find out how long arg_num_len was. | ||
982 | */ | ||
983 | static int audit_log_single_execve_arg(struct audit_context *context, | ||
984 | struct audit_buffer **ab, | ||
985 | int arg_num, | ||
986 | size_t *len_sent, | ||
987 | const char __user *p, | ||
988 | char *buf) | ||
949 | { | 989 | { |
950 | int i; | 990 | char arg_num_len_buf[12]; |
951 | long len, ret; | 991 | const char __user *tmp_p = p; |
952 | const char __user *p; | 992 | /* how many digits are in arg_num? 3 is the length of a=\n */ |
953 | char *buf; | 993 | size_t arg_num_len = snprintf(arg_num_len_buf, 12, "%d", arg_num) + 3; |
994 | size_t len, len_left, to_send; | ||
995 | size_t max_execve_audit_len = MAX_EXECVE_AUDIT_LEN; | ||
996 | unsigned int i, has_cntl = 0, too_long = 0; | ||
997 | int ret; | ||
998 | |||
999 | /* strnlen_user includes the null we don't want to send */ | ||
1000 | len_left = len = strnlen_user(p, MAX_ARG_STRLEN) - 1; | ||
954 | 1001 | ||
955 | if (axi->mm != current->mm) | 1002 | /* |
956 | return; /* execve failed, no additional info */ | 1003 | * We just created this mm, if we can't find the strings |
957 | 1004 | * we just copied into it something is _very_ wrong. Similar | |
958 | p = (const char __user *)axi->mm->arg_start; | 1005 | * for strings that are too long, we should not have created |
1006 | * any. | ||
1007 | */ | ||
1008 | if (unlikely((len = -1) || len > MAX_ARG_STRLEN - 1)) { | ||
1009 | WARN_ON(1); | ||
1010 | send_sig(SIGKILL, current, 0); | ||
1011 | } | ||
959 | 1012 | ||
960 | for (i = 0; i < axi->argc; i++, p += len) { | 1013 | /* walk the whole argument looking for non-ascii chars */ |
961 | len = strnlen_user(p, MAX_ARG_STRLEN); | 1014 | do { |
1015 | if (len_left > MAX_EXECVE_AUDIT_LEN) | ||
1016 | to_send = MAX_EXECVE_AUDIT_LEN; | ||
1017 | else | ||
1018 | to_send = len_left; | ||
1019 | ret = copy_from_user(buf, tmp_p, to_send); | ||
962 | /* | 1020 | /* |
963 | * We just created this mm, if we can't find the strings | 1021 | * There is no reason for this copy to be short. We just |
964 | * we just copied into it something is _very_ wrong. Similar | 1022 | * copied them here, and the mm hasn't been exposed to user- |
965 | * for strings that are too long, we should not have created | 1023 | * space yet. |
966 | * any. | ||
967 | */ | 1024 | */ |
968 | if (!len || len > MAX_ARG_STRLEN) { | 1025 | if (ret) { |
969 | WARN_ON(1); | 1026 | WARN_ON(1); |
970 | send_sig(SIGKILL, current, 0); | 1027 | send_sig(SIGKILL, current, 0); |
971 | } | 1028 | } |
972 | 1029 | buf[to_send] = '\0'; | |
973 | buf = kmalloc(len, GFP_KERNEL); | 1030 | has_cntl = audit_string_contains_control(buf, to_send); |
974 | if (!buf) { | 1031 | if (has_cntl) { |
975 | audit_panic("out of memory for argv string\n"); | 1032 | /* |
1033 | * hex messages get logged as 2 bytes, so we can only | ||
1034 | * send half as much in each message | ||
1035 | */ | ||
1036 | max_execve_audit_len = MAX_EXECVE_AUDIT_LEN / 2; | ||
976 | break; | 1037 | break; |
977 | } | 1038 | } |
1039 | len_left -= to_send; | ||
1040 | tmp_p += to_send; | ||
1041 | } while (len_left > 0); | ||
1042 | |||
1043 | len_left = len; | ||
1044 | |||
1045 | if (len > max_execve_audit_len) | ||
1046 | too_long = 1; | ||
1047 | |||
1048 | /* rewalk the argument actually logging the message */ | ||
1049 | for (i = 0; len_left > 0; i++) { | ||
1050 | int room_left; | ||
1051 | |||
1052 | if (len_left > max_execve_audit_len) | ||
1053 | to_send = max_execve_audit_len; | ||
1054 | else | ||
1055 | to_send = len_left; | ||
1056 | |||
1057 | /* do we have space left to send this argument in this ab? */ | ||
1058 | room_left = MAX_EXECVE_AUDIT_LEN - arg_num_len - *len_sent; | ||
1059 | if (has_cntl) | ||
1060 | room_left -= (to_send * 2); | ||
1061 | else | ||
1062 | room_left -= to_send; | ||
1063 | if (room_left < 0) { | ||
1064 | *len_sent = 0; | ||
1065 | audit_log_end(*ab); | ||
1066 | *ab = audit_log_start(context, GFP_KERNEL, AUDIT_EXECVE); | ||
1067 | if (!*ab) | ||
1068 | return 0; | ||
1069 | } | ||
978 | 1070 | ||
979 | ret = copy_from_user(buf, p, len); | ||
980 | /* | 1071 | /* |
981 | * There is no reason for this copy to be short. We just | 1072 | * first record needs to say how long the original string was |
982 | * copied them here, and the mm hasn't been exposed to user- | 1073 | * so we can be sure nothing was lost. |
983 | * space yet. | 1074 | */ |
1075 | if ((i == 0) && (too_long)) | ||
1076 | audit_log_format(*ab, "a%d_len=%ld ", arg_num, | ||
1077 | has_cntl ? 2*len : len); | ||
1078 | |||
1079 | /* | ||
1080 | * normally arguments are small enough to fit and we already | ||
1081 | * filled buf above when we checked for control characters | ||
1082 | * so don't bother with another copy_from_user | ||
984 | */ | 1083 | */ |
1084 | if (len >= max_execve_audit_len) | ||
1085 | ret = copy_from_user(buf, p, to_send); | ||
1086 | else | ||
1087 | ret = 0; | ||
985 | if (ret) { | 1088 | if (ret) { |
986 | WARN_ON(1); | 1089 | WARN_ON(1); |
987 | send_sig(SIGKILL, current, 0); | 1090 | send_sig(SIGKILL, current, 0); |
988 | } | 1091 | } |
1092 | buf[to_send] = '\0'; | ||
1093 | |||
1094 | /* actually log it */ | ||
1095 | audit_log_format(*ab, "a%d", arg_num); | ||
1096 | if (too_long) | ||
1097 | audit_log_format(*ab, "[%d]", i); | ||
1098 | audit_log_format(*ab, "="); | ||
1099 | if (has_cntl) | ||
1100 | audit_log_hex(*ab, buf, to_send); | ||
1101 | else | ||
1102 | audit_log_format(*ab, "\"%s\"", buf); | ||
1103 | audit_log_format(*ab, "\n"); | ||
1104 | |||
1105 | p += to_send; | ||
1106 | len_left -= to_send; | ||
1107 | *len_sent += arg_num_len; | ||
1108 | if (has_cntl) | ||
1109 | *len_sent += to_send * 2; | ||
1110 | else | ||
1111 | *len_sent += to_send; | ||
1112 | } | ||
1113 | /* include the null we didn't log */ | ||
1114 | return len + 1; | ||
1115 | } | ||
989 | 1116 | ||
990 | audit_log_format(ab, "a%d=", i); | 1117 | static void audit_log_execve_info(struct audit_context *context, |
991 | audit_log_untrustedstring(ab, buf); | 1118 | struct audit_buffer **ab, |
992 | audit_log_format(ab, "\n"); | 1119 | struct audit_aux_data_execve *axi) |
1120 | { | ||
1121 | int i; | ||
1122 | size_t len, len_sent = 0; | ||
1123 | const char __user *p; | ||
1124 | char *buf; | ||
1125 | |||
1126 | if (axi->mm != current->mm) | ||
1127 | return; /* execve failed, no additional info */ | ||
1128 | |||
1129 | p = (const char __user *)axi->mm->arg_start; | ||
1130 | |||
1131 | audit_log_format(*ab, "argc=%d ", axi->argc); | ||
1132 | |||
1133 | /* | ||
1134 | * we need some kernel buffer to hold the userspace args. Just | ||
1135 | * allocate one big one rather than allocating one of the right size | ||
1136 | * for every single argument inside audit_log_single_execve_arg() | ||
1137 | * should be <8k allocation so should be pretty safe. | ||
1138 | */ | ||
1139 | buf = kmalloc(MAX_EXECVE_AUDIT_LEN + 1, GFP_KERNEL); | ||
1140 | if (!buf) { | ||
1141 | audit_panic("out of memory for argv string\n"); | ||
1142 | return; | ||
1143 | } | ||
993 | 1144 | ||
994 | kfree(buf); | 1145 | for (i = 0; i < axi->argc; i++) { |
1146 | len = audit_log_single_execve_arg(context, ab, i, | ||
1147 | &len_sent, p, buf); | ||
1148 | if (len <= 0) | ||
1149 | break; | ||
1150 | p += len; | ||
995 | } | 1151 | } |
1152 | kfree(buf); | ||
996 | } | 1153 | } |
997 | 1154 | ||
998 | static void audit_log_exit(struct audit_context *context, struct task_struct *tsk) | 1155 | static void audit_log_exit(struct audit_context *context, struct task_struct *tsk) |
@@ -1039,7 +1196,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts | |||
1039 | " a0=%lx a1=%lx a2=%lx a3=%lx items=%d" | 1196 | " a0=%lx a1=%lx a2=%lx a3=%lx items=%d" |
1040 | " ppid=%d pid=%d auid=%u uid=%u gid=%u" | 1197 | " ppid=%d pid=%d auid=%u uid=%u gid=%u" |
1041 | " euid=%u suid=%u fsuid=%u" | 1198 | " euid=%u suid=%u fsuid=%u" |
1042 | " egid=%u sgid=%u fsgid=%u tty=%s", | 1199 | " egid=%u sgid=%u fsgid=%u tty=%s ses=%u", |
1043 | context->argv[0], | 1200 | context->argv[0], |
1044 | context->argv[1], | 1201 | context->argv[1], |
1045 | context->argv[2], | 1202 | context->argv[2], |
@@ -1047,11 +1204,12 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts | |||
1047 | context->name_count, | 1204 | context->name_count, |
1048 | context->ppid, | 1205 | context->ppid, |
1049 | context->pid, | 1206 | context->pid, |
1050 | context->loginuid, | 1207 | tsk->loginuid, |
1051 | context->uid, | 1208 | context->uid, |
1052 | context->gid, | 1209 | context->gid, |
1053 | context->euid, context->suid, context->fsuid, | 1210 | context->euid, context->suid, context->fsuid, |
1054 | context->egid, context->sgid, context->fsgid, tty); | 1211 | context->egid, context->sgid, context->fsgid, tty, |
1212 | tsk->sessionid); | ||
1055 | 1213 | ||
1056 | mutex_unlock(&tty_mutex); | 1214 | mutex_unlock(&tty_mutex); |
1057 | 1215 | ||
@@ -1135,7 +1293,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts | |||
1135 | 1293 | ||
1136 | case AUDIT_EXECVE: { | 1294 | case AUDIT_EXECVE: { |
1137 | struct audit_aux_data_execve *axi = (void *)aux; | 1295 | struct audit_aux_data_execve *axi = (void *)aux; |
1138 | audit_log_execve_info(ab, axi); | 1296 | audit_log_execve_info(context, &ab, axi); |
1139 | break; } | 1297 | break; } |
1140 | 1298 | ||
1141 | case AUDIT_SOCKETCALL: { | 1299 | case AUDIT_SOCKETCALL: { |
@@ -1168,13 +1326,19 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts | |||
1168 | 1326 | ||
1169 | for (i = 0; i < axs->pid_count; i++) | 1327 | for (i = 0; i < axs->pid_count; i++) |
1170 | if (audit_log_pid_context(context, axs->target_pid[i], | 1328 | if (audit_log_pid_context(context, axs->target_pid[i], |
1171 | axs->target_sid[i])) | 1329 | axs->target_auid[i], |
1330 | axs->target_uid[i], | ||
1331 | axs->target_sessionid[i], | ||
1332 | axs->target_sid[i], | ||
1333 | axs->target_comm[i])) | ||
1172 | call_panic = 1; | 1334 | call_panic = 1; |
1173 | } | 1335 | } |
1174 | 1336 | ||
1175 | if (context->target_pid && | 1337 | if (context->target_pid && |
1176 | audit_log_pid_context(context, context->target_pid, | 1338 | audit_log_pid_context(context, context->target_pid, |
1177 | context->target_sid)) | 1339 | context->target_auid, context->target_uid, |
1340 | context->target_sessionid, | ||
1341 | context->target_sid, context->target_comm)) | ||
1178 | call_panic = 1; | 1342 | call_panic = 1; |
1179 | 1343 | ||
1180 | if (context->pwd && context->pwdmnt) { | 1344 | if (context->pwd && context->pwdmnt) { |
@@ -1242,6 +1406,11 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts | |||
1242 | 1406 | ||
1243 | audit_log_end(ab); | 1407 | audit_log_end(ab); |
1244 | } | 1408 | } |
1409 | |||
1410 | /* Send end of event record to help user space know we are finished */ | ||
1411 | ab = audit_log_start(context, GFP_KERNEL, AUDIT_EOE); | ||
1412 | if (ab) | ||
1413 | audit_log_end(ab); | ||
1245 | if (call_panic) | 1414 | if (call_panic) |
1246 | audit_panic("error converting sid to string"); | 1415 | audit_panic("error converting sid to string"); |
1247 | } | 1416 | } |
@@ -1766,6 +1935,9 @@ void auditsc_get_stamp(struct audit_context *ctx, | |||
1766 | ctx->auditable = 1; | 1935 | ctx->auditable = 1; |
1767 | } | 1936 | } |
1768 | 1937 | ||
1938 | /* global counter which is incremented every time something logs in */ | ||
1939 | static atomic_t session_id = ATOMIC_INIT(0); | ||
1940 | |||
1769 | /** | 1941 | /** |
1770 | * audit_set_loginuid - set a task's audit_context loginuid | 1942 | * audit_set_loginuid - set a task's audit_context loginuid |
1771 | * @task: task whose audit context is being modified | 1943 | * @task: task whose audit context is being modified |
@@ -1777,41 +1949,29 @@ void auditsc_get_stamp(struct audit_context *ctx, | |||
1777 | */ | 1949 | */ |
1778 | int audit_set_loginuid(struct task_struct *task, uid_t loginuid) | 1950 | int audit_set_loginuid(struct task_struct *task, uid_t loginuid) |
1779 | { | 1951 | { |
1952 | unsigned int sessionid = atomic_inc_return(&session_id); | ||
1780 | struct audit_context *context = task->audit_context; | 1953 | struct audit_context *context = task->audit_context; |
1781 | 1954 | ||
1782 | if (context) { | 1955 | if (context && context->in_syscall) { |
1783 | /* Only log if audit is enabled */ | 1956 | struct audit_buffer *ab; |
1784 | if (context->in_syscall) { | 1957 | |
1785 | struct audit_buffer *ab; | 1958 | ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN); |
1786 | 1959 | if (ab) { | |
1787 | ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN); | 1960 | audit_log_format(ab, "login pid=%d uid=%u " |
1788 | if (ab) { | 1961 | "old auid=%u new auid=%u" |
1789 | audit_log_format(ab, "login pid=%d uid=%u " | 1962 | " old ses=%u new ses=%u", |
1790 | "old auid=%u new auid=%u", | 1963 | task->pid, task->uid, |
1791 | task->pid, task->uid, | 1964 | task->loginuid, loginuid, |
1792 | context->loginuid, loginuid); | 1965 | task->sessionid, sessionid); |
1793 | audit_log_end(ab); | 1966 | audit_log_end(ab); |
1794 | } | ||
1795 | } | 1967 | } |
1796 | context->loginuid = loginuid; | ||
1797 | } | 1968 | } |
1969 | task->sessionid = sessionid; | ||
1970 | task->loginuid = loginuid; | ||
1798 | return 0; | 1971 | return 0; |
1799 | } | 1972 | } |
1800 | 1973 | ||
1801 | /** | 1974 | /** |
1802 | * audit_get_loginuid - get the loginuid for an audit_context | ||
1803 | * @ctx: the audit_context | ||
1804 | * | ||
1805 | * Returns the context's loginuid or -1 if @ctx is NULL. | ||
1806 | */ | ||
1807 | uid_t audit_get_loginuid(struct audit_context *ctx) | ||
1808 | { | ||
1809 | return ctx ? ctx->loginuid : -1; | ||
1810 | } | ||
1811 | |||
1812 | EXPORT_SYMBOL(audit_get_loginuid); | ||
1813 | |||
1814 | /** | ||
1815 | * __audit_mq_open - record audit data for a POSIX MQ open | 1975 | * __audit_mq_open - record audit data for a POSIX MQ open |
1816 | * @oflag: open flag | 1976 | * @oflag: open flag |
1817 | * @mode: mode bits | 1977 | * @mode: mode bits |
@@ -2070,8 +2230,6 @@ int __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode | |||
2070 | return 0; | 2230 | return 0; |
2071 | } | 2231 | } |
2072 | 2232 | ||
2073 | int audit_argv_kb = 32; | ||
2074 | |||
2075 | int audit_bprm(struct linux_binprm *bprm) | 2233 | int audit_bprm(struct linux_binprm *bprm) |
2076 | { | 2234 | { |
2077 | struct audit_aux_data_execve *ax; | 2235 | struct audit_aux_data_execve *ax; |
@@ -2080,14 +2238,6 @@ int audit_bprm(struct linux_binprm *bprm) | |||
2080 | if (likely(!audit_enabled || !context || context->dummy)) | 2238 | if (likely(!audit_enabled || !context || context->dummy)) |
2081 | return 0; | 2239 | return 0; |
2082 | 2240 | ||
2083 | /* | ||
2084 | * Even though the stack code doesn't limit the arg+env size any more, | ||
2085 | * the audit code requires that _all_ arguments be logged in a single | ||
2086 | * netlink skb. Hence cap it :-( | ||
2087 | */ | ||
2088 | if (bprm->argv_len > (audit_argv_kb << 10)) | ||
2089 | return -E2BIG; | ||
2090 | |||
2091 | ax = kmalloc(sizeof(*ax), GFP_KERNEL); | 2241 | ax = kmalloc(sizeof(*ax), GFP_KERNEL); |
2092 | if (!ax) | 2242 | if (!ax) |
2093 | return -ENOMEM; | 2243 | return -ENOMEM; |
@@ -2193,7 +2343,11 @@ void __audit_ptrace(struct task_struct *t) | |||
2193 | struct audit_context *context = current->audit_context; | 2343 | struct audit_context *context = current->audit_context; |
2194 | 2344 | ||
2195 | context->target_pid = t->pid; | 2345 | context->target_pid = t->pid; |
2346 | context->target_auid = audit_get_loginuid(t); | ||
2347 | context->target_uid = t->uid; | ||
2348 | context->target_sessionid = audit_get_sessionid(t); | ||
2196 | selinux_get_task_sid(t, &context->target_sid); | 2349 | selinux_get_task_sid(t, &context->target_sid); |
2350 | memcpy(context->target_comm, t->comm, TASK_COMM_LEN); | ||
2197 | } | 2351 | } |
2198 | 2352 | ||
2199 | /** | 2353 | /** |
@@ -2216,8 +2370,8 @@ int __audit_signal_info(int sig, struct task_struct *t) | |||
2216 | if (audit_pid && t->tgid == audit_pid) { | 2370 | if (audit_pid && t->tgid == audit_pid) { |
2217 | if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1) { | 2371 | if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1) { |
2218 | audit_sig_pid = tsk->pid; | 2372 | audit_sig_pid = tsk->pid; |
2219 | if (ctx) | 2373 | if (tsk->loginuid != -1) |
2220 | audit_sig_uid = ctx->loginuid; | 2374 | audit_sig_uid = tsk->loginuid; |
2221 | else | 2375 | else |
2222 | audit_sig_uid = tsk->uid; | 2376 | audit_sig_uid = tsk->uid; |
2223 | selinux_get_task_sid(tsk, &audit_sig_sid); | 2377 | selinux_get_task_sid(tsk, &audit_sig_sid); |
@@ -2230,7 +2384,11 @@ int __audit_signal_info(int sig, struct task_struct *t) | |||
2230 | * in audit_context */ | 2384 | * in audit_context */ |
2231 | if (!ctx->target_pid) { | 2385 | if (!ctx->target_pid) { |
2232 | ctx->target_pid = t->tgid; | 2386 | ctx->target_pid = t->tgid; |
2387 | ctx->target_auid = audit_get_loginuid(t); | ||
2388 | ctx->target_uid = t->uid; | ||
2389 | ctx->target_sessionid = audit_get_sessionid(t); | ||
2233 | selinux_get_task_sid(t, &ctx->target_sid); | 2390 | selinux_get_task_sid(t, &ctx->target_sid); |
2391 | memcpy(ctx->target_comm, t->comm, TASK_COMM_LEN); | ||
2234 | return 0; | 2392 | return 0; |
2235 | } | 2393 | } |
2236 | 2394 | ||
@@ -2247,7 +2405,11 @@ int __audit_signal_info(int sig, struct task_struct *t) | |||
2247 | BUG_ON(axp->pid_count >= AUDIT_AUX_PIDS); | 2405 | BUG_ON(axp->pid_count >= AUDIT_AUX_PIDS); |
2248 | 2406 | ||
2249 | axp->target_pid[axp->pid_count] = t->tgid; | 2407 | axp->target_pid[axp->pid_count] = t->tgid; |
2408 | axp->target_auid[axp->pid_count] = audit_get_loginuid(t); | ||
2409 | axp->target_uid[axp->pid_count] = t->uid; | ||
2410 | axp->target_sessionid[axp->pid_count] = audit_get_sessionid(t); | ||
2250 | selinux_get_task_sid(t, &axp->target_sid[axp->pid_count]); | 2411 | selinux_get_task_sid(t, &axp->target_sid[axp->pid_count]); |
2412 | memcpy(axp->target_comm[axp->pid_count], t->comm, TASK_COMM_LEN); | ||
2251 | axp->pid_count++; | 2413 | axp->pid_count++; |
2252 | 2414 | ||
2253 | return 0; | 2415 | return 0; |
@@ -2264,6 +2426,8 @@ void audit_core_dumps(long signr) | |||
2264 | { | 2426 | { |
2265 | struct audit_buffer *ab; | 2427 | struct audit_buffer *ab; |
2266 | u32 sid; | 2428 | u32 sid; |
2429 | uid_t auid = audit_get_loginuid(current); | ||
2430 | unsigned int sessionid = audit_get_sessionid(current); | ||
2267 | 2431 | ||
2268 | if (!audit_enabled) | 2432 | if (!audit_enabled) |
2269 | return; | 2433 | return; |
@@ -2272,9 +2436,8 @@ void audit_core_dumps(long signr) | |||
2272 | return; | 2436 | return; |
2273 | 2437 | ||
2274 | ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND); | 2438 | ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_ANOM_ABEND); |
2275 | audit_log_format(ab, "auid=%u uid=%u gid=%u", | 2439 | audit_log_format(ab, "auid=%u uid=%u gid=%u ses=%u", |
2276 | audit_get_loginuid(current->audit_context), | 2440 | auid, current->uid, current->gid, sessionid); |
2277 | current->uid, current->gid); | ||
2278 | selinux_get_task_sid(current, &sid); | 2441 | selinux_get_task_sid(current, &sid); |
2279 | if (sid) { | 2442 | if (sid) { |
2280 | char *ctx = NULL; | 2443 | char *ctx = NULL; |
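
The chunking scheme implemented by audit_log_single_execve_arg() above is easier to follow outside the kernel. The sketch below is a plain user-space model of the record layout only: the 7500 value chosen for MAX_EXECVE_AUDIT_LEN is assumed, printf() stands in for audit_log_format()/audit_log_hex(), and log_execve_arg() is an illustrative name, not a kernel function.

#include <ctype.h>
#include <stdio.h>
#include <string.h>

#define MAX_EXECVE_AUDIT_LEN 7500	/* assumed chunk size */

/*
 * Emit one argument the way the kernel code above does: long arguments are
 * split into a<N>[<i>]= chunks, and any chunk containing control characters
 * is hex-encoded (doubling its logged length).
 */
static void log_execve_arg(int arg_num, const char *arg)
{
	size_t len_left = strlen(arg);
	const char *p = arg;
	int too_long = len_left > MAX_EXECVE_AUDIT_LEN;
	int i;

	for (i = 0; len_left > 0; i++) {
		size_t to_send = len_left > MAX_EXECVE_AUDIT_LEN ?
					MAX_EXECVE_AUDIT_LEN : len_left;
		int has_cntl = 0;
		size_t j;

		for (j = 0; j < to_send; j++)
			if (iscntrl((unsigned char)p[j]))
				has_cntl = 1;

		/* first record of an over-long argument carries the total */
		if (i == 0 && too_long)
			printf("a%d_len=%zu ", arg_num,
			       has_cntl ? 2 * len_left : len_left);

		printf("a%d", arg_num);
		if (too_long)
			printf("[%d]", i);
		printf("=");
		if (has_cntl)
			for (j = 0; j < to_send; j++)
				printf("%02X", (unsigned char)p[j]);
		else
			printf("\"%.*s\"", (int)to_send, p);
		printf("\n");

		p += to_send;
		len_left -= to_send;
	}
}

int main(void)
{
	log_execve_arg(0, "/bin/true");
	log_execve_arg(1, "arg with a\ttab gets hex encoded");
	return 0;
}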
diff --git a/kernel/backtracetest.c b/kernel/backtracetest.c new file mode 100644 index 000000000000..d1a7605c5b8f --- /dev/null +++ b/kernel/backtracetest.c | |||
@@ -0,0 +1,48 @@ | |||
1 | /* | ||
2 | * Simple stack backtrace regression test module | ||
3 | * | ||
4 | * (C) Copyright 2008 Intel Corporation | ||
5 | * Author: Arjan van de Ven <arjan@linux.intel.com> | ||
6 | * | ||
7 | * This program is free software; you can redistribute it and/or | ||
8 | * modify it under the terms of the GNU General Public License | ||
9 | * as published by the Free Software Foundation; version 2 | ||
10 | * of the License. | ||
11 | */ | ||
12 | |||
13 | #include <linux/module.h> | ||
14 | #include <linux/sched.h> | ||
15 | #include <linux/delay.h> | ||
16 | |||
17 | static struct timer_list backtrace_timer; | ||
18 | |||
19 | static void backtrace_test_timer(unsigned long data) | ||
20 | { | ||
21 | printk("Testing a backtrace from irq context.\n"); | ||
22 | printk("The following trace is a kernel self test and not a bug!\n"); | ||
23 | dump_stack(); | ||
24 | } | ||
25 | static int backtrace_regression_test(void) | ||
26 | { | ||
27 | printk("====[ backtrace testing ]===========\n"); | ||
28 | printk("Testing a backtrace from process context.\n"); | ||
29 | printk("The following trace is a kernel self test and not a bug!\n"); | ||
30 | dump_stack(); | ||
31 | |||
32 | init_timer(&backtrace_timer); | ||
33 | backtrace_timer.function = backtrace_test_timer; | ||
34 | mod_timer(&backtrace_timer, jiffies + 10); | ||
35 | |||
36 | msleep(10); | ||
37 | printk("====[ end of backtrace testing ]====\n"); | ||
38 | return 0; | ||
39 | } | ||
40 | |||
41 | static void exitf(void) | ||
42 | { | ||
43 | } | ||
44 | |||
45 | module_init(backtrace_regression_test); | ||
46 | module_exit(exitf); | ||
47 | MODULE_LICENSE("GPL"); | ||
48 | MODULE_AUTHOR("Arjan van de Ven <arjan@linux.intel.com>"); | ||
diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 3fe21e19c96e..1a3c23936d43 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c | |||
@@ -1,6 +1,4 @@ | |||
1 | /* | 1 | /* |
2 | * kernel/cgroup.c | ||
3 | * | ||
4 | * Generic process-grouping system. | 2 | * Generic process-grouping system. |
5 | * | 3 | * |
6 | * Based originally on the cpuset system, extracted by Paul Menage | 4 | * Based originally on the cpuset system, extracted by Paul Menage |
@@ -2200,7 +2198,8 @@ static void cgroup_init_subsys(struct cgroup_subsys *ss) | |||
2200 | { | 2198 | { |
2201 | struct cgroup_subsys_state *css; | 2199 | struct cgroup_subsys_state *css; |
2202 | struct list_head *l; | 2200 | struct list_head *l; |
2203 | printk(KERN_ERR "Initializing cgroup subsys %s\n", ss->name); | 2201 | |
2202 | printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name); | ||
2204 | 2203 | ||
2205 | /* Create the top cgroup state for this subsystem */ | 2204 | /* Create the top cgroup state for this subsystem */ |
2206 | ss->root = &rootnode; | 2205 | ss->root = &rootnode; |
@@ -2273,7 +2272,7 @@ int __init cgroup_init_early(void) | |||
2273 | BUG_ON(!ss->create); | 2272 | BUG_ON(!ss->create); |
2274 | BUG_ON(!ss->destroy); | 2273 | BUG_ON(!ss->destroy); |
2275 | if (ss->subsys_id != i) { | 2274 | if (ss->subsys_id != i) { |
2276 | printk(KERN_ERR "Subsys %s id == %d\n", | 2275 | printk(KERN_ERR "cgroup: Subsys %s id == %d\n", |
2277 | ss->name, ss->subsys_id); | 2276 | ss->name, ss->subsys_id); |
2278 | BUG(); | 2277 | BUG(); |
2279 | } | 2278 | } |
@@ -2605,7 +2604,7 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys) | |||
2605 | dentry = lookup_one_len(nodename, parent->dentry, strlen(nodename)); | 2604 | dentry = lookup_one_len(nodename, parent->dentry, strlen(nodename)); |
2606 | if (IS_ERR(dentry)) { | 2605 | if (IS_ERR(dentry)) { |
2607 | printk(KERN_INFO | 2606 | printk(KERN_INFO |
2608 | "Couldn't allocate dentry for %s: %ld\n", nodename, | 2607 | "cgroup: Couldn't allocate dentry for %s: %ld\n", nodename, |
2609 | PTR_ERR(dentry)); | 2608 | PTR_ERR(dentry)); |
2610 | ret = PTR_ERR(dentry); | 2609 | ret = PTR_ERR(dentry); |
2611 | goto out_release; | 2610 | goto out_release; |
diff --git a/kernel/cpu.c b/kernel/cpu.c index 6b3a0c15144f..e0d3a4f56ecb 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c | |||
@@ -15,9 +15,8 @@ | |||
15 | #include <linux/stop_machine.h> | 15 | #include <linux/stop_machine.h> |
16 | #include <linux/mutex.h> | 16 | #include <linux/mutex.h> |
17 | 17 | ||
18 | /* This protects CPUs going up and down... */ | 18 | /* Serializes the updates to cpu_online_map, cpu_present_map */ |
19 | static DEFINE_MUTEX(cpu_add_remove_lock); | 19 | static DEFINE_MUTEX(cpu_add_remove_lock); |
20 | static DEFINE_MUTEX(cpu_bitmask_lock); | ||
21 | 20 | ||
22 | static __cpuinitdata RAW_NOTIFIER_HEAD(cpu_chain); | 21 | static __cpuinitdata RAW_NOTIFIER_HEAD(cpu_chain); |
23 | 22 | ||
@@ -26,52 +25,123 @@ static __cpuinitdata RAW_NOTIFIER_HEAD(cpu_chain); | |||
26 | */ | 25 | */ |
27 | static int cpu_hotplug_disabled; | 26 | static int cpu_hotplug_disabled; |
28 | 27 | ||
29 | #ifdef CONFIG_HOTPLUG_CPU | 28 | static struct { |
29 | struct task_struct *active_writer; | ||
30 | struct mutex lock; /* Synchronizes accesses to refcount, */ | ||
31 | /* | ||
32 | * Also blocks the new readers during | ||
33 | * an ongoing cpu hotplug operation. | ||
34 | */ | ||
35 | int refcount; | ||
36 | wait_queue_head_t writer_queue; | ||
37 | } cpu_hotplug; | ||
30 | 38 | ||
31 | /* Crappy recursive lock-takers in cpufreq! Complain loudly about idiots */ | 39 | #define writer_exists() (cpu_hotplug.active_writer != NULL) |
32 | static struct task_struct *recursive; | ||
33 | static int recursive_depth; | ||
34 | 40 | ||
35 | void lock_cpu_hotplug(void) | 41 | void __init cpu_hotplug_init(void) |
36 | { | 42 | { |
37 | struct task_struct *tsk = current; | 43 | cpu_hotplug.active_writer = NULL; |
38 | 44 | mutex_init(&cpu_hotplug.lock); | |
39 | if (tsk == recursive) { | 45 | cpu_hotplug.refcount = 0; |
40 | static int warnings = 10; | 46 | init_waitqueue_head(&cpu_hotplug.writer_queue); |
41 | if (warnings) { | 47 | } |
42 | printk(KERN_ERR "Lukewarm IQ detected in hotplug locking\n"); | 48 | |
43 | WARN_ON(1); | 49 | #ifdef CONFIG_HOTPLUG_CPU |
44 | warnings--; | 50 | |
45 | } | 51 | void get_online_cpus(void) |
46 | recursive_depth++; | 52 | { |
53 | might_sleep(); | ||
54 | if (cpu_hotplug.active_writer == current) | ||
47 | return; | 55 | return; |
48 | } | 56 | mutex_lock(&cpu_hotplug.lock); |
49 | mutex_lock(&cpu_bitmask_lock); | 57 | cpu_hotplug.refcount++; |
50 | recursive = tsk; | 58 | mutex_unlock(&cpu_hotplug.lock); |
59 | |||
51 | } | 60 | } |
52 | EXPORT_SYMBOL_GPL(lock_cpu_hotplug); | 61 | EXPORT_SYMBOL_GPL(get_online_cpus); |
53 | 62 | ||
54 | void unlock_cpu_hotplug(void) | 63 | void put_online_cpus(void) |
55 | { | 64 | { |
56 | WARN_ON(recursive != current); | 65 | if (cpu_hotplug.active_writer == current) |
57 | if (recursive_depth) { | ||
58 | recursive_depth--; | ||
59 | return; | 66 | return; |
60 | } | 67 | mutex_lock(&cpu_hotplug.lock); |
61 | recursive = NULL; | 68 | cpu_hotplug.refcount--; |
62 | mutex_unlock(&cpu_bitmask_lock); | 69 | |
70 | if (unlikely(writer_exists()) && !cpu_hotplug.refcount) | ||
71 | wake_up(&cpu_hotplug.writer_queue); | ||
72 | |||
73 | mutex_unlock(&cpu_hotplug.lock); | ||
74 | |||
63 | } | 75 | } |
64 | EXPORT_SYMBOL_GPL(unlock_cpu_hotplug); | 76 | EXPORT_SYMBOL_GPL(put_online_cpus); |
65 | 77 | ||
66 | #endif /* CONFIG_HOTPLUG_CPU */ | 78 | #endif /* CONFIG_HOTPLUG_CPU */ |
67 | 79 | ||
80 | /* | ||
81 | * The following two API's must be used when attempting | ||
82 | * to serialize the updates to cpu_online_map, cpu_present_map. | ||
83 | */ | ||
84 | void cpu_maps_update_begin(void) | ||
85 | { | ||
86 | mutex_lock(&cpu_add_remove_lock); | ||
87 | } | ||
88 | |||
89 | void cpu_maps_update_done(void) | ||
90 | { | ||
91 | mutex_unlock(&cpu_add_remove_lock); | ||
92 | } | ||
93 | |||
94 | /* | ||
95 | * This ensures that the hotplug operation can begin only when the | ||
96 | * refcount goes to zero. | ||
97 | * | ||
98 | * Note that during a cpu-hotplug operation, the new readers, if any, | ||
99 | * will be blocked by the cpu_hotplug.lock | ||
100 | * | ||
101 | * Since cpu_hotplug_begin() is always called after invoking | ||
102 | * cpu_maps_update_begin(), we can be sure that only one writer is active. | ||
103 | * | ||
104 | * Note that theoretically, there is a possibility of a livelock: | ||
105 | * - Refcount goes to zero, last reader wakes up the sleeping | ||
106 | * writer. | ||
107 | * - Last reader unlocks the cpu_hotplug.lock. | ||
108 | * - A new reader arrives at this moment, bumps up the refcount. | ||
109 | * - The writer acquires the cpu_hotplug.lock, finds the refcount | ||
110 | * non-zero and goes to sleep again. | ||
111 | * | ||
112 | * However, this is very difficult to achieve in practice since | ||
113 | * get_online_cpus() is not an API which is called all that often. | ||
114 | * | ||
115 | */ | ||
116 | static void cpu_hotplug_begin(void) | ||
117 | { | ||
118 | DECLARE_WAITQUEUE(wait, current); | ||
119 | |||
120 | mutex_lock(&cpu_hotplug.lock); | ||
121 | |||
122 | cpu_hotplug.active_writer = current; | ||
123 | add_wait_queue_exclusive(&cpu_hotplug.writer_queue, &wait); | ||
124 | while (cpu_hotplug.refcount) { | ||
125 | set_current_state(TASK_UNINTERRUPTIBLE); | ||
126 | mutex_unlock(&cpu_hotplug.lock); | ||
127 | schedule(); | ||
128 | mutex_lock(&cpu_hotplug.lock); | ||
129 | } | ||
130 | remove_wait_queue_locked(&cpu_hotplug.writer_queue, &wait); | ||
131 | } | ||
132 | |||
133 | static void cpu_hotplug_done(void) | ||
134 | { | ||
135 | cpu_hotplug.active_writer = NULL; | ||
136 | mutex_unlock(&cpu_hotplug.lock); | ||
137 | } | ||
68 | /* Need to know about CPUs going up/down? */ | 138 | /* Need to know about CPUs going up/down? */ |
69 | int __cpuinit register_cpu_notifier(struct notifier_block *nb) | 139 | int __cpuinit register_cpu_notifier(struct notifier_block *nb) |
70 | { | 140 | { |
71 | int ret; | 141 | int ret; |
72 | mutex_lock(&cpu_add_remove_lock); | 142 | cpu_maps_update_begin(); |
73 | ret = raw_notifier_chain_register(&cpu_chain, nb); | 143 | ret = raw_notifier_chain_register(&cpu_chain, nb); |
74 | mutex_unlock(&cpu_add_remove_lock); | 144 | cpu_maps_update_done(); |
75 | return ret; | 145 | return ret; |
76 | } | 146 | } |
77 | 147 | ||
@@ -81,9 +151,9 @@ EXPORT_SYMBOL(register_cpu_notifier); | |||
81 | 151 | ||
82 | void unregister_cpu_notifier(struct notifier_block *nb) | 152 | void unregister_cpu_notifier(struct notifier_block *nb) |
83 | { | 153 | { |
84 | mutex_lock(&cpu_add_remove_lock); | 154 | cpu_maps_update_begin(); |
85 | raw_notifier_chain_unregister(&cpu_chain, nb); | 155 | raw_notifier_chain_unregister(&cpu_chain, nb); |
86 | mutex_unlock(&cpu_add_remove_lock); | 156 | cpu_maps_update_done(); |
87 | } | 157 | } |
88 | EXPORT_SYMBOL(unregister_cpu_notifier); | 158 | EXPORT_SYMBOL(unregister_cpu_notifier); |
89 | 159 | ||
@@ -147,7 +217,7 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen) | |||
147 | if (!cpu_online(cpu)) | 217 | if (!cpu_online(cpu)) |
148 | return -EINVAL; | 218 | return -EINVAL; |
149 | 219 | ||
150 | raw_notifier_call_chain(&cpu_chain, CPU_LOCK_ACQUIRE, hcpu); | 220 | cpu_hotplug_begin(); |
151 | err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE | mod, | 221 | err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE | mod, |
152 | hcpu, -1, &nr_calls); | 222 | hcpu, -1, &nr_calls); |
153 | if (err == NOTIFY_BAD) { | 223 | if (err == NOTIFY_BAD) { |
@@ -166,9 +236,7 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen) | |||
166 | cpu_clear(cpu, tmp); | 236 | cpu_clear(cpu, tmp); |
167 | set_cpus_allowed(current, tmp); | 237 | set_cpus_allowed(current, tmp); |
168 | 238 | ||
169 | mutex_lock(&cpu_bitmask_lock); | ||
170 | p = __stop_machine_run(take_cpu_down, &tcd_param, cpu); | 239 | p = __stop_machine_run(take_cpu_down, &tcd_param, cpu); |
171 | mutex_unlock(&cpu_bitmask_lock); | ||
172 | 240 | ||
173 | if (IS_ERR(p) || cpu_online(cpu)) { | 241 | if (IS_ERR(p) || cpu_online(cpu)) { |
174 | /* CPU didn't die: tell everyone. Can't complain. */ | 242 | /* CPU didn't die: tell everyone. Can't complain. */ |
@@ -202,7 +270,7 @@ out_thread: | |||
202 | out_allowed: | 270 | out_allowed: |
203 | set_cpus_allowed(current, old_allowed); | 271 | set_cpus_allowed(current, old_allowed); |
204 | out_release: | 272 | out_release: |
205 | raw_notifier_call_chain(&cpu_chain, CPU_LOCK_RELEASE, hcpu); | 273 | cpu_hotplug_done(); |
206 | return err; | 274 | return err; |
207 | } | 275 | } |
208 | 276 | ||
@@ -210,13 +278,13 @@ int cpu_down(unsigned int cpu) | |||
210 | { | 278 | { |
211 | int err = 0; | 279 | int err = 0; |
212 | 280 | ||
213 | mutex_lock(&cpu_add_remove_lock); | 281 | cpu_maps_update_begin(); |
214 | if (cpu_hotplug_disabled) | 282 | if (cpu_hotplug_disabled) |
215 | err = -EBUSY; | 283 | err = -EBUSY; |
216 | else | 284 | else |
217 | err = _cpu_down(cpu, 0); | 285 | err = _cpu_down(cpu, 0); |
218 | 286 | ||
219 | mutex_unlock(&cpu_add_remove_lock); | 287 | cpu_maps_update_done(); |
220 | return err; | 288 | return err; |
221 | } | 289 | } |
222 | #endif /*CONFIG_HOTPLUG_CPU*/ | 290 | #endif /*CONFIG_HOTPLUG_CPU*/ |
@@ -231,7 +299,7 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen) | |||
231 | if (cpu_online(cpu) || !cpu_present(cpu)) | 299 | if (cpu_online(cpu) || !cpu_present(cpu)) |
232 | return -EINVAL; | 300 | return -EINVAL; |
233 | 301 | ||
234 | raw_notifier_call_chain(&cpu_chain, CPU_LOCK_ACQUIRE, hcpu); | 302 | cpu_hotplug_begin(); |
235 | ret = __raw_notifier_call_chain(&cpu_chain, CPU_UP_PREPARE | mod, hcpu, | 303 | ret = __raw_notifier_call_chain(&cpu_chain, CPU_UP_PREPARE | mod, hcpu, |
236 | -1, &nr_calls); | 304 | -1, &nr_calls); |
237 | if (ret == NOTIFY_BAD) { | 305 | if (ret == NOTIFY_BAD) { |
@@ -243,9 +311,7 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen) | |||
243 | } | 311 | } |
244 | 312 | ||
245 | /* Arch-specific enabling code. */ | 313 | /* Arch-specific enabling code. */ |
246 | mutex_lock(&cpu_bitmask_lock); | ||
247 | ret = __cpu_up(cpu); | 314 | ret = __cpu_up(cpu); |
248 | mutex_unlock(&cpu_bitmask_lock); | ||
249 | if (ret != 0) | 315 | if (ret != 0) |
250 | goto out_notify; | 316 | goto out_notify; |
251 | BUG_ON(!cpu_online(cpu)); | 317 | BUG_ON(!cpu_online(cpu)); |
@@ -257,7 +323,7 @@ out_notify: | |||
257 | if (ret != 0) | 323 | if (ret != 0) |
258 | __raw_notifier_call_chain(&cpu_chain, | 324 | __raw_notifier_call_chain(&cpu_chain, |
259 | CPU_UP_CANCELED | mod, hcpu, nr_calls, NULL); | 325 | CPU_UP_CANCELED | mod, hcpu, nr_calls, NULL); |
260 | raw_notifier_call_chain(&cpu_chain, CPU_LOCK_RELEASE, hcpu); | 326 | cpu_hotplug_done(); |
261 | 327 | ||
262 | return ret; | 328 | return ret; |
263 | } | 329 | } |
@@ -275,13 +341,13 @@ int __cpuinit cpu_up(unsigned int cpu) | |||
275 | return -EINVAL; | 341 | return -EINVAL; |
276 | } | 342 | } |
277 | 343 | ||
278 | mutex_lock(&cpu_add_remove_lock); | 344 | cpu_maps_update_begin(); |
279 | if (cpu_hotplug_disabled) | 345 | if (cpu_hotplug_disabled) |
280 | err = -EBUSY; | 346 | err = -EBUSY; |
281 | else | 347 | else |
282 | err = _cpu_up(cpu, 0); | 348 | err = _cpu_up(cpu, 0); |
283 | 349 | ||
284 | mutex_unlock(&cpu_add_remove_lock); | 350 | cpu_maps_update_done(); |
285 | return err; | 351 | return err; |
286 | } | 352 | } |
287 | 353 | ||
@@ -292,7 +358,7 @@ int disable_nonboot_cpus(void) | |||
292 | { | 358 | { |
293 | int cpu, first_cpu, error = 0; | 359 | int cpu, first_cpu, error = 0; |
294 | 360 | ||
295 | mutex_lock(&cpu_add_remove_lock); | 361 | cpu_maps_update_begin(); |
296 | first_cpu = first_cpu(cpu_online_map); | 362 | first_cpu = first_cpu(cpu_online_map); |
297 | /* We take down all of the non-boot CPUs in one shot to avoid races | 363 | /* We take down all of the non-boot CPUs in one shot to avoid races |
298 | * with the userspace trying to use the CPU hotplug at the same time | 364 | * with the userspace trying to use the CPU hotplug at the same time |
@@ -319,7 +385,7 @@ int disable_nonboot_cpus(void) | |||
319 | } else { | 385 | } else { |
320 | printk(KERN_ERR "Non-boot CPUs are not disabled\n"); | 386 | printk(KERN_ERR "Non-boot CPUs are not disabled\n"); |
321 | } | 387 | } |
322 | mutex_unlock(&cpu_add_remove_lock); | 388 | cpu_maps_update_done(); |
323 | return error; | 389 | return error; |
324 | } | 390 | } |
325 | 391 | ||
@@ -328,7 +394,7 @@ void enable_nonboot_cpus(void) | |||
328 | int cpu, error; | 394 | int cpu, error; |
329 | 395 | ||
330 | /* Allow everyone to use the CPU hotplug again */ | 396 | /* Allow everyone to use the CPU hotplug again */ |
331 | mutex_lock(&cpu_add_remove_lock); | 397 | cpu_maps_update_begin(); |
332 | cpu_hotplug_disabled = 0; | 398 | cpu_hotplug_disabled = 0; |
333 | if (cpus_empty(frozen_cpus)) | 399 | if (cpus_empty(frozen_cpus)) |
334 | goto out; | 400 | goto out; |
@@ -344,6 +410,6 @@ void enable_nonboot_cpus(void) | |||
344 | } | 410 | } |
345 | cpus_clear(frozen_cpus); | 411 | cpus_clear(frozen_cpus); |
346 | out: | 412 | out: |
347 | mutex_unlock(&cpu_add_remove_lock); | 413 | cpu_maps_update_done(); |
348 | } | 414 | } |
349 | #endif /* CONFIG_PM_SLEEP_SMP */ | 415 | #endif /* CONFIG_PM_SLEEP_SMP */ |
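
The refcount-plus-writer-wait protocol behind get_online_cpus()/put_online_cpus() and cpu_hotplug_begin()/cpu_hotplug_done() in the kernel/cpu.c hunks above can be modelled in user space. The pthread sketch below only illustrates the locking idea: a condition variable replaces the kernel wait queue, the have_writer flag and the *_model names are inventions of the sketch, and it is not the kernel implementation.

#include <pthread.h>

static struct {
	pthread_t active_writer;	/* valid only while have_writer != 0 */
	int have_writer;
	pthread_mutex_t lock;		/* protects refcount and blocks new
					 * readers for the whole operation */
	pthread_cond_t writer_queue;
	int refcount;
} hp = {
	.lock = PTHREAD_MUTEX_INITIALIZER,
	.writer_queue = PTHREAD_COND_INITIALIZER,
};

static void get_online_cpus_model(void)
{
	/* the active writer may re-enter the read side without deadlocking */
	if (hp.have_writer && pthread_equal(hp.active_writer, pthread_self()))
		return;
	pthread_mutex_lock(&hp.lock);
	hp.refcount++;
	pthread_mutex_unlock(&hp.lock);
}

static void put_online_cpus_model(void)
{
	if (hp.have_writer && pthread_equal(hp.active_writer, pthread_self()))
		return;
	pthread_mutex_lock(&hp.lock);
	if (--hp.refcount == 0)
		pthread_cond_signal(&hp.writer_queue);	/* last reader out */
	pthread_mutex_unlock(&hp.lock);
}

static void cpu_hotplug_begin_model(void)
{
	pthread_mutex_lock(&hp.lock);	/* new readers block here from now on */
	hp.active_writer = pthread_self();
	hp.have_writer = 1;
	while (hp.refcount)		/* wait for existing readers to drain */
		pthread_cond_wait(&hp.writer_queue, &hp.lock);
	/* returns with hp.lock held, exactly as cpu_hotplug_begin() does */
}

static void cpu_hotplug_done_model(void)
{
	hp.have_writer = 0;
	pthread_mutex_unlock(&hp.lock);
}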
diff --git a/kernel/cpu_acct.c b/kernel/cpu_acct.c deleted file mode 100644 index 731e47e7f164..000000000000 --- a/kernel/cpu_acct.c +++ /dev/null | |||
@@ -1,186 +0,0 @@ | |||
1 | /* | ||
2 | * kernel/cpu_acct.c - CPU accounting cgroup subsystem | ||
3 | * | ||
4 | * Copyright (C) Google Inc, 2006 | ||
5 | * | ||
6 | * Developed by Paul Menage (menage@google.com) and Balbir Singh | ||
7 | * (balbir@in.ibm.com) | ||
8 | * | ||
9 | */ | ||
10 | |||
11 | /* | ||
12 | * Example cgroup subsystem for reporting total CPU usage of tasks in a | ||
13 | * cgroup, along with percentage load over a time interval | ||
14 | */ | ||
15 | |||
16 | #include <linux/module.h> | ||
17 | #include <linux/cgroup.h> | ||
18 | #include <linux/fs.h> | ||
19 | #include <linux/rcupdate.h> | ||
20 | |||
21 | #include <asm/div64.h> | ||
22 | |||
23 | struct cpuacct { | ||
24 | struct cgroup_subsys_state css; | ||
25 | spinlock_t lock; | ||
26 | /* total time used by this class */ | ||
27 | cputime64_t time; | ||
28 | |||
29 | /* time when next load calculation occurs */ | ||
30 | u64 next_interval_check; | ||
31 | |||
32 | /* time used in current period */ | ||
33 | cputime64_t current_interval_time; | ||
34 | |||
35 | /* time used in last period */ | ||
36 | cputime64_t last_interval_time; | ||
37 | }; | ||
38 | |||
39 | struct cgroup_subsys cpuacct_subsys; | ||
40 | |||
41 | static inline struct cpuacct *cgroup_ca(struct cgroup *cont) | ||
42 | { | ||
43 | return container_of(cgroup_subsys_state(cont, cpuacct_subsys_id), | ||
44 | struct cpuacct, css); | ||
45 | } | ||
46 | |||
47 | static inline struct cpuacct *task_ca(struct task_struct *task) | ||
48 | { | ||
49 | return container_of(task_subsys_state(task, cpuacct_subsys_id), | ||
50 | struct cpuacct, css); | ||
51 | } | ||
52 | |||
53 | #define INTERVAL (HZ * 10) | ||
54 | |||
55 | static inline u64 next_interval_boundary(u64 now) | ||
56 | { | ||
57 | /* calculate the next interval boundary beyond the | ||
58 | * current time */ | ||
59 | do_div(now, INTERVAL); | ||
60 | return (now + 1) * INTERVAL; | ||
61 | } | ||
62 | |||
63 | static struct cgroup_subsys_state *cpuacct_create( | ||
64 | struct cgroup_subsys *ss, struct cgroup *cont) | ||
65 | { | ||
66 | struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL); | ||
67 | |||
68 | if (!ca) | ||
69 | return ERR_PTR(-ENOMEM); | ||
70 | spin_lock_init(&ca->lock); | ||
71 | ca->next_interval_check = next_interval_boundary(get_jiffies_64()); | ||
72 | return &ca->css; | ||
73 | } | ||
74 | |||
75 | static void cpuacct_destroy(struct cgroup_subsys *ss, | ||
76 | struct cgroup *cont) | ||
77 | { | ||
78 | kfree(cgroup_ca(cont)); | ||
79 | } | ||
80 | |||
81 | /* Lazily update the load calculation if necessary. Called with ca locked */ | ||
82 | static void cpuusage_update(struct cpuacct *ca) | ||
83 | { | ||
84 | u64 now = get_jiffies_64(); | ||
85 | |||
86 | /* If we're not due for an update, return */ | ||
87 | if (ca->next_interval_check > now) | ||
88 | return; | ||
89 | |||
90 | if (ca->next_interval_check <= (now - INTERVAL)) { | ||
91 | /* If it's been more than an interval since the last | ||
92 | * check, then catch up - the last interval must have | ||
93 | * been zero load */ | ||
94 | ca->last_interval_time = 0; | ||
95 | ca->next_interval_check = next_interval_boundary(now); | ||
96 | } else { | ||
97 | /* If a steal takes the last interval time negative, | ||
98 | * then we just ignore it */ | ||
99 | if ((s64)ca->current_interval_time > 0) | ||
100 | ca->last_interval_time = ca->current_interval_time; | ||
101 | else | ||
102 | ca->last_interval_time = 0; | ||
103 | ca->next_interval_check += INTERVAL; | ||
104 | } | ||
105 | ca->current_interval_time = 0; | ||
106 | } | ||
107 | |||
108 | static u64 cpuusage_read(struct cgroup *cont, struct cftype *cft) | ||
109 | { | ||
110 | struct cpuacct *ca = cgroup_ca(cont); | ||
111 | u64 time; | ||
112 | |||
113 | spin_lock_irq(&ca->lock); | ||
114 | cpuusage_update(ca); | ||
115 | time = cputime64_to_jiffies64(ca->time); | ||
116 | spin_unlock_irq(&ca->lock); | ||
117 | |||
118 | /* Convert 64-bit jiffies to seconds */ | ||
119 | time *= 1000; | ||
120 | do_div(time, HZ); | ||
121 | return time; | ||
122 | } | ||
123 | |||
124 | static u64 load_read(struct cgroup *cont, struct cftype *cft) | ||
125 | { | ||
126 | struct cpuacct *ca = cgroup_ca(cont); | ||
127 | u64 time; | ||
128 | |||
129 | /* Find the time used in the previous interval */ | ||
130 | spin_lock_irq(&ca->lock); | ||
131 | cpuusage_update(ca); | ||
132 | time = cputime64_to_jiffies64(ca->last_interval_time); | ||
133 | spin_unlock_irq(&ca->lock); | ||
134 | |||
135 | /* Convert time to a percentage, to give the load in the | ||
136 | * previous period */ | ||
137 | time *= 100; | ||
138 | do_div(time, INTERVAL); | ||
139 | |||
140 | return time; | ||
141 | } | ||
142 | |||
143 | static struct cftype files[] = { | ||
144 | { | ||
145 | .name = "usage", | ||
146 | .read_uint = cpuusage_read, | ||
147 | }, | ||
148 | { | ||
149 | .name = "load", | ||
150 | .read_uint = load_read, | ||
151 | } | ||
152 | }; | ||
153 | |||
154 | static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cont) | ||
155 | { | ||
156 | return cgroup_add_files(cont, ss, files, ARRAY_SIZE(files)); | ||
157 | } | ||
158 | |||
159 | void cpuacct_charge(struct task_struct *task, cputime_t cputime) | ||
160 | { | ||
161 | |||
162 | struct cpuacct *ca; | ||
163 | unsigned long flags; | ||
164 | |||
165 | if (!cpuacct_subsys.active) | ||
166 | return; | ||
167 | rcu_read_lock(); | ||
168 | ca = task_ca(task); | ||
169 | if (ca) { | ||
170 | spin_lock_irqsave(&ca->lock, flags); | ||
171 | cpuusage_update(ca); | ||
172 | ca->time = cputime64_add(ca->time, cputime); | ||
173 | ca->current_interval_time = | ||
174 | cputime64_add(ca->current_interval_time, cputime); | ||
175 | spin_unlock_irqrestore(&ca->lock, flags); | ||
176 | } | ||
177 | rcu_read_unlock(); | ||
178 | } | ||
179 | |||
180 | struct cgroup_subsys cpuacct_subsys = { | ||
181 | .name = "cpuacct", | ||
182 | .create = cpuacct_create, | ||
183 | .destroy = cpuacct_destroy, | ||
184 | .populate = cpuacct_populate, | ||
185 | .subsys_id = cpuacct_subsys_id, | ||
186 | }; | ||
diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 50f5dc463688..cfaf6419d817 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c | |||
@@ -537,10 +537,10 @@ static int cpusets_overlap(struct cpuset *a, struct cpuset *b) | |||
537 | * | 537 | * |
538 | * Call with cgroup_mutex held. May take callback_mutex during | 538 | * Call with cgroup_mutex held. May take callback_mutex during |
539 | * call due to the kfifo_alloc() and kmalloc() calls. May nest | 539 | * call due to the kfifo_alloc() and kmalloc() calls. May nest |
540 | * a call to the lock_cpu_hotplug()/unlock_cpu_hotplug() pair. | 540 | * a call to the get_online_cpus()/put_online_cpus() pair. |
541 | * Must not be called holding callback_mutex, because we must not | 541 | * Must not be called holding callback_mutex, because we must not |
542 | * call lock_cpu_hotplug() while holding callback_mutex. Elsewhere | 542 | * call get_online_cpus() while holding callback_mutex. Elsewhere |
543 | * the kernel nests callback_mutex inside lock_cpu_hotplug() calls. | 543 | * the kernel nests callback_mutex inside get_online_cpus() calls. |
544 | * So the reverse nesting would risk an ABBA deadlock. | 544 | * So the reverse nesting would risk an ABBA deadlock. |
545 | * | 545 | * |
546 | * The three key local variables below are: | 546 | * The three key local variables below are: |
@@ -691,9 +691,9 @@ restart: | |||
691 | 691 | ||
692 | rebuild: | 692 | rebuild: |
693 | /* Have scheduler rebuild sched domains */ | 693 | /* Have scheduler rebuild sched domains */ |
694 | lock_cpu_hotplug(); | 694 | get_online_cpus(); |
695 | partition_sched_domains(ndoms, doms); | 695 | partition_sched_domains(ndoms, doms); |
696 | unlock_cpu_hotplug(); | 696 | put_online_cpus(); |
697 | 697 | ||
698 | done: | 698 | done: |
699 | if (q && !IS_ERR(q)) | 699 | if (q && !IS_ERR(q)) |
@@ -1617,10 +1617,10 @@ static struct cgroup_subsys_state *cpuset_create( | |||
1617 | * | 1617 | * |
1618 | * If the cpuset being removed has its flag 'sched_load_balance' | 1618 | * If the cpuset being removed has its flag 'sched_load_balance' |
1619 | * enabled, then simulate turning sched_load_balance off, which | 1619 | * enabled, then simulate turning sched_load_balance off, which |
1620 | * will call rebuild_sched_domains(). The lock_cpu_hotplug() | 1620 | * will call rebuild_sched_domains(). The get_online_cpus() |
1621 | * call in rebuild_sched_domains() must not be made while holding | 1621 | * call in rebuild_sched_domains() must not be made while holding |
1622 | * callback_mutex. Elsewhere the kernel nests callback_mutex inside | 1622 | * callback_mutex. Elsewhere the kernel nests callback_mutex inside |
1623 | * lock_cpu_hotplug() calls. So the reverse nesting would risk an | 1623 | * get_online_cpus() calls. So the reverse nesting would risk an |
1624 | * ABBA deadlock. | 1624 | * ABBA deadlock. |
1625 | */ | 1625 | */ |
1626 | 1626 | ||
diff --git a/kernel/exit.c b/kernel/exit.c index f1aec27f1df0..bfb1c0e940e8 100644 --- a/kernel/exit.c +++ b/kernel/exit.c | |||
@@ -249,7 +249,7 @@ static int has_stopped_jobs(struct pid *pgrp) | |||
249 | struct task_struct *p; | 249 | struct task_struct *p; |
250 | 250 | ||
251 | do_each_pid_task(pgrp, PIDTYPE_PGID, p) { | 251 | do_each_pid_task(pgrp, PIDTYPE_PGID, p) { |
252 | if (p->state != TASK_STOPPED) | 252 | if (!task_is_stopped(p)) |
253 | continue; | 253 | continue; |
254 | retval = 1; | 254 | retval = 1; |
255 | break; | 255 | break; |
@@ -614,7 +614,7 @@ reparent_thread(struct task_struct *p, struct task_struct *father, int traced) | |||
614 | p->parent = p->real_parent; | 614 | p->parent = p->real_parent; |
615 | add_parent(p); | 615 | add_parent(p); |
616 | 616 | ||
617 | if (p->state == TASK_TRACED) { | 617 | if (task_is_traced(p)) { |
618 | /* | 618 | /* |
619 | * If it was at a trace stop, turn it into | 619 | * If it was at a trace stop, turn it into |
620 | * a normal stop since it's no longer being | 620 | * a normal stop since it's no longer being |
@@ -1357,7 +1357,7 @@ static int wait_task_stopped(struct task_struct *p, int delayed_group_leader, | |||
1357 | int __user *stat_addr, struct rusage __user *ru) | 1357 | int __user *stat_addr, struct rusage __user *ru) |
1358 | { | 1358 | { |
1359 | int retval, exit_code; | 1359 | int retval, exit_code; |
1360 | struct pid_namespace *ns; | 1360 | pid_t pid; |
1361 | 1361 | ||
1362 | if (!p->exit_code) | 1362 | if (!p->exit_code) |
1363 | return 0; | 1363 | return 0; |
@@ -1376,21 +1376,19 @@ static int wait_task_stopped(struct task_struct *p, int delayed_group_leader, | |||
1376 | * keep holding onto the tasklist_lock while we call getrusage and | 1376 | * keep holding onto the tasklist_lock while we call getrusage and |
1377 | * possibly take page faults for user memory. | 1377 | * possibly take page faults for user memory. |
1378 | */ | 1378 | */ |
1379 | ns = current->nsproxy->pid_ns; | 1379 | pid = task_pid_nr_ns(p, current->nsproxy->pid_ns); |
1380 | get_task_struct(p); | 1380 | get_task_struct(p); |
1381 | read_unlock(&tasklist_lock); | 1381 | read_unlock(&tasklist_lock); |
1382 | 1382 | ||
1383 | if (unlikely(noreap)) { | 1383 | if (unlikely(noreap)) { |
1384 | pid_t pid = task_pid_nr_ns(p, ns); | ||
1385 | uid_t uid = p->uid; | 1384 | uid_t uid = p->uid; |
1386 | int why = (p->ptrace & PT_PTRACED) ? CLD_TRAPPED : CLD_STOPPED; | 1385 | int why = (p->ptrace & PT_PTRACED) ? CLD_TRAPPED : CLD_STOPPED; |
1387 | 1386 | ||
1388 | exit_code = p->exit_code; | 1387 | exit_code = p->exit_code; |
1389 | if (unlikely(!exit_code) || | 1388 | if (unlikely(!exit_code) || unlikely(p->exit_state)) |
1390 | unlikely(p->state & TASK_TRACED)) | ||
1391 | goto bail_ref; | 1389 | goto bail_ref; |
1392 | return wait_noreap_copyout(p, pid, uid, | 1390 | return wait_noreap_copyout(p, pid, uid, |
1393 | why, (exit_code << 8) | 0x7f, | 1391 | why, exit_code, |
1394 | infop, ru); | 1392 | infop, ru); |
1395 | } | 1393 | } |
1396 | 1394 | ||
@@ -1452,11 +1450,11 @@ bail_ref: | |||
1452 | if (!retval && infop) | 1450 | if (!retval && infop) |
1453 | retval = put_user(exit_code, &infop->si_status); | 1451 | retval = put_user(exit_code, &infop->si_status); |
1454 | if (!retval && infop) | 1452 | if (!retval && infop) |
1455 | retval = put_user(task_pid_nr_ns(p, ns), &infop->si_pid); | 1453 | retval = put_user(pid, &infop->si_pid); |
1456 | if (!retval && infop) | 1454 | if (!retval && infop) |
1457 | retval = put_user(p->uid, &infop->si_uid); | 1455 | retval = put_user(p->uid, &infop->si_uid); |
1458 | if (!retval) | 1456 | if (!retval) |
1459 | retval = task_pid_nr_ns(p, ns); | 1457 | retval = pid; |
1460 | put_task_struct(p); | 1458 | put_task_struct(p); |
1461 | 1459 | ||
1462 | BUG_ON(!retval); | 1460 | BUG_ON(!retval); |
@@ -1565,60 +1563,51 @@ repeat: | |||
1565 | } | 1563 | } |
1566 | allowed = 1; | 1564 | allowed = 1; |
1567 | 1565 | ||
1568 | switch (p->state) { | 1566 | if (task_is_stopped_or_traced(p)) { |
1569 | case TASK_TRACED: | ||
1570 | /* | ||
1571 | * When we hit the race with PTRACE_ATTACH, | ||
1572 | * we will not report this child. But the | ||
1573 | * race means it has not yet been moved to | ||
1574 | * our ptrace_children list, so we need to | ||
1575 | * set the flag here to avoid a spurious ECHILD | ||
1576 | * when the race happens with the only child. | ||
1577 | */ | ||
1578 | flag = 1; | ||
1579 | if (!my_ptrace_child(p)) | ||
1580 | continue; | ||
1581 | /*FALLTHROUGH*/ | ||
1582 | case TASK_STOPPED: | ||
1583 | /* | 1567 | /* |
1584 | * It's stopped now, so it might later | 1568 | * It's stopped now, so it might later |
1585 | * continue, exit, or stop again. | 1569 | * continue, exit, or stop again. |
1570 | * | ||
1571 | * When we hit the race with PTRACE_ATTACH, we | ||
1572 | * will not report this child. But the race | ||
1573 | * means it has not yet been moved to our | ||
1574 | * ptrace_children list, so we need to set the | ||
1575 | * flag here to avoid a spurious ECHILD when | ||
1576 | * the race happens with the only child. | ||
1586 | */ | 1577 | */ |
1587 | flag = 1; | 1578 | flag = 1; |
1588 | if (!(options & WUNTRACED) && | 1579 | |
1589 | !my_ptrace_child(p)) | 1580 | if (!my_ptrace_child(p)) { |
1590 | continue; | 1581 | if (task_is_traced(p)) |
1582 | continue; | ||
1583 | if (!(options & WUNTRACED)) | ||
1584 | continue; | ||
1585 | } | ||
1586 | |||
1591 | retval = wait_task_stopped(p, ret == 2, | 1587 | retval = wait_task_stopped(p, ret == 2, |
1592 | (options & WNOWAIT), | 1588 | (options & WNOWAIT), infop, |
1593 | infop, | 1589 | stat_addr, ru); |
1594 | stat_addr, ru); | ||
1595 | if (retval == -EAGAIN) | 1590 | if (retval == -EAGAIN) |
1596 | goto repeat; | 1591 | goto repeat; |
1597 | if (retval != 0) /* He released the lock. */ | 1592 | if (retval != 0) /* He released the lock. */ |
1598 | goto end; | 1593 | goto end; |
1599 | break; | 1594 | } else if (p->exit_state == EXIT_DEAD) { |
1600 | default: | 1595 | continue; |
1601 | // case EXIT_DEAD: | 1596 | } else if (p->exit_state == EXIT_ZOMBIE) { |
1602 | if (p->exit_state == EXIT_DEAD) | 1597 | /* |
1598 | * Eligible but we cannot release it yet: | ||
1599 | */ | ||
1600 | if (ret == 2) | ||
1601 | goto check_continued; | ||
1602 | if (!likely(options & WEXITED)) | ||
1603 | continue; | 1603 | continue; |
1604 | // case EXIT_ZOMBIE: | 1604 | retval = wait_task_zombie(p, |
1605 | if (p->exit_state == EXIT_ZOMBIE) { | 1605 | (options & WNOWAIT), infop, |
1606 | /* | 1606 | stat_addr, ru); |
1607 | * Eligible but we cannot release | 1607 | /* He released the lock. */ |
1608 | * it yet: | 1608 | if (retval != 0) |
1609 | */ | 1609 | goto end; |
1610 | if (ret == 2) | 1610 | } else { |
1611 | goto check_continued; | ||
1612 | if (!likely(options & WEXITED)) | ||
1613 | continue; | ||
1614 | retval = wait_task_zombie( | ||
1615 | p, (options & WNOWAIT), | ||
1616 | infop, stat_addr, ru); | ||
1617 | /* He released the lock. */ | ||
1618 | if (retval != 0) | ||
1619 | goto end; | ||
1620 | break; | ||
1621 | } | ||
1622 | check_continued: | 1611 | check_continued: |
1623 | /* | 1612 | /* |
1624 | * It's running now, so it might later | 1613 | * It's running now, so it might later |
@@ -1627,12 +1616,11 @@ check_continued: | |||
1627 | flag = 1; | 1616 | flag = 1; |
1628 | if (!unlikely(options & WCONTINUED)) | 1617 | if (!unlikely(options & WCONTINUED)) |
1629 | continue; | 1618 | continue; |
1630 | retval = wait_task_continued( | 1619 | retval = wait_task_continued(p, |
1631 | p, (options & WNOWAIT), | 1620 | (options & WNOWAIT), infop, |
1632 | infop, stat_addr, ru); | 1621 | stat_addr, ru); |
1633 | if (retval != 0) /* He released the lock. */ | 1622 | if (retval != 0) /* He released the lock. */ |
1634 | goto end; | 1623 | goto end; |
1635 | break; | ||
1636 | } | 1624 | } |
1637 | } | 1625 | } |
1638 | if (!flag) { | 1626 | if (!flag) { |
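
The kernel/exit.c conversion above replaces direct comparisons against TASK_STOPPED/TASK_TRACED with task_is_stopped(), task_is_traced() and task_is_stopped_or_traced() predicates that are not defined in these hunks. Presumably they are simple state-mask tests along the lines of the sketch below; the __TASK_* bit names are an assumption, not quoted from this patch.

/* Hypothetical sketch of the predicates used by the do_wait() rework above. */
#define task_is_traced(task)		(((task)->state & __TASK_TRACED) != 0)
#define task_is_stopped(task)		(((task)->state & __TASK_STOPPED) != 0)
#define task_is_stopped_or_traced(task)	\
		(((task)->state & (__TASK_STOPPED | __TASK_TRACED)) != 0)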
diff --git a/kernel/extable.c b/kernel/extable.c index 7fe262855317..a26cb2e17023 100644 --- a/kernel/extable.c +++ b/kernel/extable.c | |||
@@ -46,7 +46,8 @@ int core_kernel_text(unsigned long addr) | |||
46 | addr <= (unsigned long)_etext) | 46 | addr <= (unsigned long)_etext) |
47 | return 1; | 47 | return 1; |
48 | 48 | ||
49 | if (addr >= (unsigned long)_sinittext && | 49 | if (system_state == SYSTEM_BOOTING && |
50 | addr >= (unsigned long)_sinittext && | ||
50 | addr <= (unsigned long)_einittext) | 51 | addr <= (unsigned long)_einittext) |
51 | return 1; | 52 | return 1; |
52 | return 0; | 53 | return 0; |
diff --git a/kernel/fork.c b/kernel/fork.c index ddafdfac9456..05e0b6f4365b 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
@@ -51,6 +51,7 @@ | |||
51 | #include <linux/random.h> | 51 | #include <linux/random.h> |
52 | #include <linux/tty.h> | 52 | #include <linux/tty.h> |
53 | #include <linux/proc_fs.h> | 53 | #include <linux/proc_fs.h> |
54 | #include <linux/blkdev.h> | ||
54 | 55 | ||
55 | #include <asm/pgtable.h> | 56 | #include <asm/pgtable.h> |
56 | #include <asm/pgalloc.h> | 57 | #include <asm/pgalloc.h> |
@@ -392,6 +393,7 @@ void fastcall __mmdrop(struct mm_struct *mm) | |||
392 | destroy_context(mm); | 393 | destroy_context(mm); |
393 | free_mm(mm); | 394 | free_mm(mm); |
394 | } | 395 | } |
396 | EXPORT_SYMBOL_GPL(__mmdrop); | ||
395 | 397 | ||
396 | /* | 398 | /* |
397 | * Decrement the use count and release all resources for an mm. | 399 | * Decrement the use count and release all resources for an mm. |
@@ -791,6 +793,31 @@ out: | |||
791 | return error; | 793 | return error; |
792 | } | 794 | } |
793 | 795 | ||
796 | static int copy_io(unsigned long clone_flags, struct task_struct *tsk) | ||
797 | { | ||
798 | #ifdef CONFIG_BLOCK | ||
799 | struct io_context *ioc = current->io_context; | ||
800 | |||
801 | if (!ioc) | ||
802 | return 0; | ||
803 | /* | ||
804 | * Share io context with parent, if CLONE_IO is set | ||
805 | */ | ||
806 | if (clone_flags & CLONE_IO) { | ||
807 | tsk->io_context = ioc_task_link(ioc); | ||
808 | if (unlikely(!tsk->io_context)) | ||
809 | return -ENOMEM; | ||
810 | } else if (ioprio_valid(ioc->ioprio)) { | ||
811 | tsk->io_context = alloc_io_context(GFP_KERNEL, -1); | ||
812 | if (unlikely(!tsk->io_context)) | ||
813 | return -ENOMEM; | ||
814 | |||
815 | tsk->io_context->ioprio = ioc->ioprio; | ||
816 | } | ||
817 | #endif | ||
818 | return 0; | ||
819 | } | ||
820 | |||
794 | /* | 821 | /* |
795 | * Helper to unshare the files of the current task. | 822 | * Helper to unshare the files of the current task. |
796 | * We don't want to expose copy_files internals to | 823 | * We don't want to expose copy_files internals to |
@@ -1045,6 +1072,10 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1045 | copy_flags(clone_flags, p); | 1072 | copy_flags(clone_flags, p); |
1046 | INIT_LIST_HEAD(&p->children); | 1073 | INIT_LIST_HEAD(&p->children); |
1047 | INIT_LIST_HEAD(&p->sibling); | 1074 | INIT_LIST_HEAD(&p->sibling); |
1075 | #ifdef CONFIG_PREEMPT_RCU | ||
1076 | p->rcu_read_lock_nesting = 0; | ||
1077 | p->rcu_flipctr_idx = 0; | ||
1078 | #endif /* #ifdef CONFIG_PREEMPT_RCU */ | ||
1048 | p->vfork_done = NULL; | 1079 | p->vfork_done = NULL; |
1049 | spin_lock_init(&p->alloc_lock); | 1080 | spin_lock_init(&p->alloc_lock); |
1050 | 1081 | ||
@@ -1056,6 +1087,13 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1056 | p->gtime = cputime_zero; | 1087 | p->gtime = cputime_zero; |
1057 | p->utimescaled = cputime_zero; | 1088 | p->utimescaled = cputime_zero; |
1058 | p->stimescaled = cputime_zero; | 1089 | p->stimescaled = cputime_zero; |
1090 | p->prev_utime = cputime_zero; | ||
1091 | p->prev_stime = cputime_zero; | ||
1092 | |||
1093 | #ifdef CONFIG_DETECT_SOFTLOCKUP | ||
1094 | p->last_switch_count = 0; | ||
1095 | p->last_switch_timestamp = 0; | ||
1096 | #endif | ||
1059 | 1097 | ||
1060 | #ifdef CONFIG_TASK_XACCT | 1098 | #ifdef CONFIG_TASK_XACCT |
1061 | p->rchar = 0; /* I/O counter: bytes read */ | 1099 | p->rchar = 0; /* I/O counter: bytes read */ |
@@ -1121,6 +1159,9 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1121 | p->blocked_on = NULL; /* not blocked yet */ | 1159 | p->blocked_on = NULL; /* not blocked yet */ |
1122 | #endif | 1160 | #endif |
1123 | 1161 | ||
1162 | /* Perform scheduler related setup. Assign this task to a CPU. */ | ||
1163 | sched_fork(p, clone_flags); | ||
1164 | |||
1124 | if ((retval = security_task_alloc(p))) | 1165 | if ((retval = security_task_alloc(p))) |
1125 | goto bad_fork_cleanup_policy; | 1166 | goto bad_fork_cleanup_policy; |
1126 | if ((retval = audit_alloc(p))) | 1167 | if ((retval = audit_alloc(p))) |
@@ -1142,15 +1183,17 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1142 | goto bad_fork_cleanup_mm; | 1183 | goto bad_fork_cleanup_mm; |
1143 | if ((retval = copy_namespaces(clone_flags, p))) | 1184 | if ((retval = copy_namespaces(clone_flags, p))) |
1144 | goto bad_fork_cleanup_keys; | 1185 | goto bad_fork_cleanup_keys; |
1186 | if ((retval = copy_io(clone_flags, p))) | ||
1187 | goto bad_fork_cleanup_namespaces; | ||
1145 | retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs); | 1188 | retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs); |
1146 | if (retval) | 1189 | if (retval) |
1147 | goto bad_fork_cleanup_namespaces; | 1190 | goto bad_fork_cleanup_io; |
1148 | 1191 | ||
1149 | if (pid != &init_struct_pid) { | 1192 | if (pid != &init_struct_pid) { |
1150 | retval = -ENOMEM; | 1193 | retval = -ENOMEM; |
1151 | pid = alloc_pid(task_active_pid_ns(p)); | 1194 | pid = alloc_pid(task_active_pid_ns(p)); |
1152 | if (!pid) | 1195 | if (!pid) |
1153 | goto bad_fork_cleanup_namespaces; | 1196 | goto bad_fork_cleanup_io; |
1154 | 1197 | ||
1155 | if (clone_flags & CLONE_NEWPID) { | 1198 | if (clone_flags & CLONE_NEWPID) { |
1156 | retval = pid_ns_prepare_proc(task_active_pid_ns(p)); | 1199 | retval = pid_ns_prepare_proc(task_active_pid_ns(p)); |
@@ -1191,6 +1234,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1191 | #ifdef TIF_SYSCALL_EMU | 1234 | #ifdef TIF_SYSCALL_EMU |
1192 | clear_tsk_thread_flag(p, TIF_SYSCALL_EMU); | 1235 | clear_tsk_thread_flag(p, TIF_SYSCALL_EMU); |
1193 | #endif | 1236 | #endif |
1237 | clear_all_latency_tracing(p); | ||
1194 | 1238 | ||
1195 | /* Our parent execution domain becomes current domain | 1239 | /* Our parent execution domain becomes current domain |
1196 | These must match for thread signalling to apply */ | 1240 | These must match for thread signalling to apply */ |
@@ -1210,9 +1254,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1210 | INIT_LIST_HEAD(&p->ptrace_children); | 1254 | INIT_LIST_HEAD(&p->ptrace_children); |
1211 | INIT_LIST_HEAD(&p->ptrace_list); | 1255 | INIT_LIST_HEAD(&p->ptrace_list); |
1212 | 1256 | ||
1213 | /* Perform scheduler related setup. Assign this task to a CPU. */ | ||
1214 | sched_fork(p, clone_flags); | ||
1215 | |||
1216 | /* Now that the task is set up, run cgroup callbacks if | 1257 | /* Now that the task is set up, run cgroup callbacks if |
1217 | * necessary. We need to run them before the task is visible | 1258 | * necessary. We need to run them before the task is visible |
1218 | * on the tasklist. */ | 1259 | * on the tasklist. */ |
@@ -1222,9 +1263,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1222 | /* Need tasklist lock for parent etc handling! */ | 1263 | /* Need tasklist lock for parent etc handling! */ |
1223 | write_lock_irq(&tasklist_lock); | 1264 | write_lock_irq(&tasklist_lock); |
1224 | 1265 | ||
1225 | /* for sys_ioprio_set(IOPRIO_WHO_PGRP) */ | ||
1226 | p->ioprio = current->ioprio; | ||
1227 | |||
1228 | /* | 1266 | /* |
1229 | * The task hasn't been attached yet, so its cpus_allowed mask will | 1267 | * The task hasn't been attached yet, so its cpus_allowed mask will |
1230 | * not be changed, nor will its assigned CPU. | 1268 | * not be changed, nor will its assigned CPU. |
@@ -1235,6 +1273,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1235 | * parent's CPU). This avoids alot of nasty races. | 1273 | * parent's CPU). This avoids alot of nasty races. |
1236 | */ | 1274 | */ |
1237 | p->cpus_allowed = current->cpus_allowed; | 1275 | p->cpus_allowed = current->cpus_allowed; |
1276 | p->rt.nr_cpus_allowed = current->rt.nr_cpus_allowed; | ||
1238 | if (unlikely(!cpu_isset(task_cpu(p), p->cpus_allowed) || | 1277 | if (unlikely(!cpu_isset(task_cpu(p), p->cpus_allowed) || |
1239 | !cpu_online(task_cpu(p)))) | 1278 | !cpu_online(task_cpu(p)))) |
1240 | set_task_cpu(p, smp_processor_id()); | 1279 | set_task_cpu(p, smp_processor_id()); |
@@ -1290,23 +1329,14 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1290 | __ptrace_link(p, current->parent); | 1329 | __ptrace_link(p, current->parent); |
1291 | 1330 | ||
1292 | if (thread_group_leader(p)) { | 1331 | if (thread_group_leader(p)) { |
1293 | if (clone_flags & CLONE_NEWPID) { | 1332 | if (clone_flags & CLONE_NEWPID) |
1294 | p->nsproxy->pid_ns->child_reaper = p; | 1333 | p->nsproxy->pid_ns->child_reaper = p; |
1295 | p->signal->tty = NULL; | ||
1296 | set_task_pgrp(p, p->pid); | ||
1297 | set_task_session(p, p->pid); | ||
1298 | attach_pid(p, PIDTYPE_PGID, pid); | ||
1299 | attach_pid(p, PIDTYPE_SID, pid); | ||
1300 | } else { | ||
1301 | p->signal->tty = current->signal->tty; | ||
1302 | set_task_pgrp(p, task_pgrp_nr(current)); | ||
1303 | set_task_session(p, task_session_nr(current)); | ||
1304 | attach_pid(p, PIDTYPE_PGID, | ||
1305 | task_pgrp(current)); | ||
1306 | attach_pid(p, PIDTYPE_SID, | ||
1307 | task_session(current)); | ||
1308 | } | ||
1309 | 1334 | ||
1335 | p->signal->tty = current->signal->tty; | ||
1336 | set_task_pgrp(p, task_pgrp_nr(current)); | ||
1337 | set_task_session(p, task_session_nr(current)); | ||
1338 | attach_pid(p, PIDTYPE_PGID, task_pgrp(current)); | ||
1339 | attach_pid(p, PIDTYPE_SID, task_session(current)); | ||
1310 | list_add_tail_rcu(&p->tasks, &init_task.tasks); | 1340 | list_add_tail_rcu(&p->tasks, &init_task.tasks); |
1311 | __get_cpu_var(process_counts)++; | 1341 | __get_cpu_var(process_counts)++; |
1312 | } | 1342 | } |
@@ -1324,6 +1354,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, | |||
1324 | bad_fork_free_pid: | 1354 | bad_fork_free_pid: |
1325 | if (pid != &init_struct_pid) | 1355 | if (pid != &init_struct_pid) |
1326 | free_pid(pid); | 1356 | free_pid(pid); |
1357 | bad_fork_cleanup_io: | ||
1358 | put_io_context(p->io_context); | ||
1327 | bad_fork_cleanup_namespaces: | 1359 | bad_fork_cleanup_namespaces: |
1328 | exit_task_namespaces(p); | 1360 | exit_task_namespaces(p); |
1329 | bad_fork_cleanup_keys: | 1361 | bad_fork_cleanup_keys: |
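
The copy_process() hunks above add a bad_fork_cleanup_io: label so that a fork which fails after the I/O context has been copied (the corresponding setup is outside the quoted hunks) also drops that context on the way out. For readers unfamiliar with the kernel's unwind convention, the sketch below (plain userspace C, not taken from this diff; all names are invented) shows the same goto-ladder pattern: every acquisition gets a matching cleanup label, and failure paths jump to the labels in reverse order of acquisition.

    #include <stdio.h>
    #include <stdlib.h>

    /* Illustrative only: the reverse-order unwind pattern copy_process()
     * uses; none of these names exist in the kernel. */
    static int setup_all(void)
    {
            void *io_ctx, *namespaces, *keys;

            io_ctx = malloc(32);
            if (!io_ctx)
                    goto bad_alloc_io;
            namespaces = malloc(32);
            if (!namespaces)
                    goto bad_cleanup_io;
            keys = malloc(32);
            if (!keys)
                    goto bad_cleanup_namespaces;

            /* success: in real code the objects would be kept */
            free(keys);
            free(namespaces);
            free(io_ctx);
            return 0;

    bad_cleanup_namespaces:
            free(namespaces);               /* undo in reverse order */
    bad_cleanup_io:
            free(io_ctx);
    bad_alloc_io:
            return -1;
    }

    int main(void)
    {
            return setup_all() ? EXIT_FAILURE : EXIT_SUCCESS;
    }
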
diff --git a/kernel/futex.c b/kernel/futex.c index 32710451dc20..a6baaec44b8f 100644 --- a/kernel/futex.c +++ b/kernel/futex.c | |||
@@ -109,6 +109,9 @@ struct futex_q { | |||
109 | /* Optional priority inheritance state: */ | 109 | /* Optional priority inheritance state: */ |
110 | struct futex_pi_state *pi_state; | 110 | struct futex_pi_state *pi_state; |
111 | struct task_struct *task; | 111 | struct task_struct *task; |
112 | |||
113 | /* Bitset for the optional bitmasked wakeup */ | ||
114 | u32 bitset; | ||
112 | }; | 115 | }; |
113 | 116 | ||
114 | /* | 117 | /* |
@@ -181,8 +184,8 @@ static inline int match_futex(union futex_key *key1, union futex_key *key2) | |||
181 | * For other futexes, it points to &current->mm->mmap_sem and | 184 | * For other futexes, it points to &current->mm->mmap_sem and |
182 | * caller must have taken the reader lock. but NOT any spinlocks. | 185 | * caller must have taken the reader lock. but NOT any spinlocks. |
183 | */ | 186 | */ |
184 | int get_futex_key(u32 __user *uaddr, struct rw_semaphore *fshared, | 187 | static int get_futex_key(u32 __user *uaddr, struct rw_semaphore *fshared, |
185 | union futex_key *key) | 188 | union futex_key *key) |
186 | { | 189 | { |
187 | unsigned long address = (unsigned long)uaddr; | 190 | unsigned long address = (unsigned long)uaddr; |
188 | struct mm_struct *mm = current->mm; | 191 | struct mm_struct *mm = current->mm; |
@@ -268,14 +271,13 @@ int get_futex_key(u32 __user *uaddr, struct rw_semaphore *fshared, | |||
268 | } | 271 | } |
269 | return err; | 272 | return err; |
270 | } | 273 | } |
271 | EXPORT_SYMBOL_GPL(get_futex_key); | ||
272 | 274 | ||
273 | /* | 275 | /* |
274 | * Take a reference to the resource addressed by a key. | 276 | * Take a reference to the resource addressed by a key. |
275 | * Can be called while holding spinlocks. | 277 | * Can be called while holding spinlocks. |
276 | * | 278 | * |
277 | */ | 279 | */ |
278 | inline void get_futex_key_refs(union futex_key *key) | 280 | static void get_futex_key_refs(union futex_key *key) |
279 | { | 281 | { |
280 | if (key->both.ptr == 0) | 282 | if (key->both.ptr == 0) |
281 | return; | 283 | return; |
@@ -288,13 +290,12 @@ inline void get_futex_key_refs(union futex_key *key) | |||
288 | break; | 290 | break; |
289 | } | 291 | } |
290 | } | 292 | } |
291 | EXPORT_SYMBOL_GPL(get_futex_key_refs); | ||
292 | 293 | ||
293 | /* | 294 | /* |
294 | * Drop a reference to the resource addressed by a key. | 295 | * Drop a reference to the resource addressed by a key. |
295 | * The hash bucket spinlock must not be held. | 296 | * The hash bucket spinlock must not be held. |
296 | */ | 297 | */ |
297 | void drop_futex_key_refs(union futex_key *key) | 298 | static void drop_futex_key_refs(union futex_key *key) |
298 | { | 299 | { |
299 | if (!key->both.ptr) | 300 | if (!key->both.ptr) |
300 | return; | 301 | return; |
@@ -307,7 +308,6 @@ void drop_futex_key_refs(union futex_key *key) | |||
307 | break; | 308 | break; |
308 | } | 309 | } |
309 | } | 310 | } |
310 | EXPORT_SYMBOL_GPL(drop_futex_key_refs); | ||
311 | 311 | ||
312 | static u32 cmpxchg_futex_value_locked(u32 __user *uaddr, u32 uval, u32 newval) | 312 | static u32 cmpxchg_futex_value_locked(u32 __user *uaddr, u32 uval, u32 newval) |
313 | { | 313 | { |
@@ -661,7 +661,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) | |||
661 | 661 | ||
662 | if (curval == -EFAULT) | 662 | if (curval == -EFAULT) |
663 | ret = -EFAULT; | 663 | ret = -EFAULT; |
664 | if (curval != uval) | 664 | else if (curval != uval) |
665 | ret = -EINVAL; | 665 | ret = -EINVAL; |
666 | if (ret) { | 666 | if (ret) { |
667 | spin_unlock(&pi_state->pi_mutex.wait_lock); | 667 | spin_unlock(&pi_state->pi_mutex.wait_lock); |
@@ -725,7 +725,7 @@ double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2) | |||
725 | * to this virtual address: | 725 | * to this virtual address: |
726 | */ | 726 | */ |
727 | static int futex_wake(u32 __user *uaddr, struct rw_semaphore *fshared, | 727 | static int futex_wake(u32 __user *uaddr, struct rw_semaphore *fshared, |
728 | int nr_wake) | 728 | int nr_wake, u32 bitset) |
729 | { | 729 | { |
730 | struct futex_hash_bucket *hb; | 730 | struct futex_hash_bucket *hb; |
731 | struct futex_q *this, *next; | 731 | struct futex_q *this, *next; |
@@ -733,6 +733,9 @@ static int futex_wake(u32 __user *uaddr, struct rw_semaphore *fshared, | |||
733 | union futex_key key; | 733 | union futex_key key; |
734 | int ret; | 734 | int ret; |
735 | 735 | ||
736 | if (!bitset) | ||
737 | return -EINVAL; | ||
738 | |||
736 | futex_lock_mm(fshared); | 739 | futex_lock_mm(fshared); |
737 | 740 | ||
738 | ret = get_futex_key(uaddr, fshared, &key); | 741 | ret = get_futex_key(uaddr, fshared, &key); |
@@ -749,6 +752,11 @@ static int futex_wake(u32 __user *uaddr, struct rw_semaphore *fshared, | |||
749 | ret = -EINVAL; | 752 | ret = -EINVAL; |
750 | break; | 753 | break; |
751 | } | 754 | } |
755 | |||
756 | /* Check if one of the bits is set in both bitsets */ | ||
757 | if (!(this->bitset & bitset)) | ||
758 | continue; | ||
759 | |||
752 | wake_futex(this); | 760 | wake_futex(this); |
753 | if (++ret >= nr_wake) | 761 | if (++ret >= nr_wake) |
754 | break; | 762 | break; |
@@ -1100,15 +1108,15 @@ static void unqueue_me_pi(struct futex_q *q) | |||
1100 | } | 1108 | } |
1101 | 1109 | ||
1102 | /* | 1110 | /* |
1103 | * Fixup the pi_state owner with current. | 1111 | * Fixup the pi_state owner with the new owner. |
1104 | * | 1112 | * |
1105 | * Must be called with hash bucket lock held and mm->sem held for non | 1113 | * Must be called with hash bucket lock held and mm->sem held for non |
1106 | * private futexes. | 1114 | * private futexes. |
1107 | */ | 1115 | */ |
1108 | static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, | 1116 | static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, |
1109 | struct task_struct *curr) | 1117 | struct task_struct *newowner) |
1110 | { | 1118 | { |
1111 | u32 newtid = task_pid_vnr(curr) | FUTEX_WAITERS; | 1119 | u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS; |
1112 | struct futex_pi_state *pi_state = q->pi_state; | 1120 | struct futex_pi_state *pi_state = q->pi_state; |
1113 | u32 uval, curval, newval; | 1121 | u32 uval, curval, newval; |
1114 | int ret; | 1122 | int ret; |
@@ -1122,12 +1130,12 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, | |||
1122 | } else | 1130 | } else |
1123 | newtid |= FUTEX_OWNER_DIED; | 1131 | newtid |= FUTEX_OWNER_DIED; |
1124 | 1132 | ||
1125 | pi_state->owner = curr; | 1133 | pi_state->owner = newowner; |
1126 | 1134 | ||
1127 | spin_lock_irq(&curr->pi_lock); | 1135 | spin_lock_irq(&newowner->pi_lock); |
1128 | WARN_ON(!list_empty(&pi_state->list)); | 1136 | WARN_ON(!list_empty(&pi_state->list)); |
1129 | list_add(&pi_state->list, &curr->pi_state_list); | 1137 | list_add(&pi_state->list, &newowner->pi_state_list); |
1130 | spin_unlock_irq(&curr->pi_lock); | 1138 | spin_unlock_irq(&newowner->pi_lock); |
1131 | 1139 | ||
1132 | /* | 1140 | /* |
1133 | * We own it, so we have to replace the pending owner | 1141 | * We own it, so we have to replace the pending owner |
@@ -1152,14 +1160,14 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, | |||
1152 | 1160 | ||
1153 | /* | 1161 | /* |
1154 | * In case we must use restart_block to restart a futex_wait, | 1162 | * In case we must use restart_block to restart a futex_wait, |
1155 | * we encode in the 'arg3' shared capability | 1163 | * we encode in the 'flags' shared capability |
1156 | */ | 1164 | */ |
1157 | #define ARG3_SHARED 1 | 1165 | #define FLAGS_SHARED 1 |
1158 | 1166 | ||
1159 | static long futex_wait_restart(struct restart_block *restart); | 1167 | static long futex_wait_restart(struct restart_block *restart); |
1160 | 1168 | ||
1161 | static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared, | 1169 | static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared, |
1162 | u32 val, ktime_t *abs_time) | 1170 | u32 val, ktime_t *abs_time, u32 bitset) |
1163 | { | 1171 | { |
1164 | struct task_struct *curr = current; | 1172 | struct task_struct *curr = current; |
1165 | DECLARE_WAITQUEUE(wait, curr); | 1173 | DECLARE_WAITQUEUE(wait, curr); |
@@ -1170,7 +1178,11 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared, | |||
1170 | struct hrtimer_sleeper t; | 1178 | struct hrtimer_sleeper t; |
1171 | int rem = 0; | 1179 | int rem = 0; |
1172 | 1180 | ||
1181 | if (!bitset) | ||
1182 | return -EINVAL; | ||
1183 | |||
1173 | q.pi_state = NULL; | 1184 | q.pi_state = NULL; |
1185 | q.bitset = bitset; | ||
1174 | retry: | 1186 | retry: |
1175 | futex_lock_mm(fshared); | 1187 | futex_lock_mm(fshared); |
1176 | 1188 | ||
@@ -1255,6 +1267,8 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared, | |||
1255 | t.timer.expires = *abs_time; | 1267 | t.timer.expires = *abs_time; |
1256 | 1268 | ||
1257 | hrtimer_start(&t.timer, t.timer.expires, HRTIMER_MODE_ABS); | 1269 | hrtimer_start(&t.timer, t.timer.expires, HRTIMER_MODE_ABS); |
1270 | if (!hrtimer_active(&t.timer)) | ||
1271 | t.task = NULL; | ||
1258 | 1272 | ||
1259 | /* | 1273 | /* |
1260 | * the timer could have already expired, in which | 1274 | * the timer could have already expired, in which |
@@ -1293,12 +1307,14 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared, | |||
1293 | struct restart_block *restart; | 1307 | struct restart_block *restart; |
1294 | restart = &current_thread_info()->restart_block; | 1308 | restart = &current_thread_info()->restart_block; |
1295 | restart->fn = futex_wait_restart; | 1309 | restart->fn = futex_wait_restart; |
1296 | restart->arg0 = (unsigned long)uaddr; | 1310 | restart->futex.uaddr = (u32 *)uaddr; |
1297 | restart->arg1 = (unsigned long)val; | 1311 | restart->futex.val = val; |
1298 | restart->arg2 = (unsigned long)abs_time; | 1312 | restart->futex.time = abs_time->tv64; |
1299 | restart->arg3 = 0; | 1313 | restart->futex.bitset = bitset; |
1314 | restart->futex.flags = 0; | ||
1315 | |||
1300 | if (fshared) | 1316 | if (fshared) |
1301 | restart->arg3 |= ARG3_SHARED; | 1317 | restart->futex.flags |= FLAGS_SHARED; |
1302 | return -ERESTART_RESTARTBLOCK; | 1318 | return -ERESTART_RESTARTBLOCK; |
1303 | } | 1319 | } |
1304 | 1320 | ||
@@ -1313,15 +1329,16 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared, | |||
1313 | 1329 | ||
1314 | static long futex_wait_restart(struct restart_block *restart) | 1330 | static long futex_wait_restart(struct restart_block *restart) |
1315 | { | 1331 | { |
1316 | u32 __user *uaddr = (u32 __user *)restart->arg0; | 1332 | u32 __user *uaddr = (u32 __user *)restart->futex.uaddr; |
1317 | u32 val = (u32)restart->arg1; | ||
1318 | ktime_t *abs_time = (ktime_t *)restart->arg2; | ||
1319 | struct rw_semaphore *fshared = NULL; | 1333 | struct rw_semaphore *fshared = NULL; |
1334 | ktime_t t; | ||
1320 | 1335 | ||
1336 | t.tv64 = restart->futex.time; | ||
1321 | restart->fn = do_no_restart_syscall; | 1337 | restart->fn = do_no_restart_syscall; |
1322 | if (restart->arg3 & ARG3_SHARED) | 1338 | if (restart->futex.flags & FLAGS_SHARED) |
1323 | fshared = &current->mm->mmap_sem; | 1339 | fshared = &current->mm->mmap_sem; |
1324 | return (long)futex_wait(uaddr, fshared, val, abs_time); | 1340 | return (long)futex_wait(uaddr, fshared, restart->futex.val, &t, |
1341 | restart->futex.bitset); | ||
1325 | } | 1342 | } |
1326 | 1343 | ||
1327 | 1344 | ||
@@ -1510,9 +1527,37 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, | |||
1510 | * when we were on the way back before we locked the | 1527 | * when we were on the way back before we locked the |
1511 | * hash bucket. | 1528 | * hash bucket. |
1512 | */ | 1529 | */ |
1513 | if (q.pi_state->owner == curr && | 1530 | if (q.pi_state->owner == curr) { |
1514 | rt_mutex_trylock(&q.pi_state->pi_mutex)) { | 1531 | /* |
1515 | ret = 0; | 1532 | * Try to get the rt_mutex now. This might |
1533 | * fail as some other task acquired the | ||
1534 | * rt_mutex after we removed ourself from the | ||
1535 | * rt_mutex waiters list. | ||
1536 | */ | ||
1537 | if (rt_mutex_trylock(&q.pi_state->pi_mutex)) | ||
1538 | ret = 0; | ||
1539 | else { | ||
1540 | /* | ||
1541 | * pi_state is incorrect, some other | ||
1542 | * task did a lock steal and we | ||
1543 | * returned due to timeout or signal | ||
1544 | * without taking the rt_mutex. Too | ||
1545 | * late. We can access the | ||
1546 | * rt_mutex_owner without locking, as | ||
1547 | * the other task is now blocked on | ||
1548 | * the hash bucket lock. Fix the state | ||
1549 | * up. | ||
1550 | */ | ||
1551 | struct task_struct *owner; | ||
1552 | int res; | ||
1553 | |||
1554 | owner = rt_mutex_owner(&q.pi_state->pi_mutex); | ||
1555 | res = fixup_pi_state_owner(uaddr, &q, owner); | ||
1556 | |||
1557 | /* propagate -EFAULT, if the fixup failed */ | ||
1558 | if (res) | ||
1559 | ret = res; | ||
1560 | } | ||
1516 | } else { | 1561 | } else { |
1517 | /* | 1562 | /* |
1518 | * Paranoia check. If we did not take the lock | 1563 | * Paranoia check. If we did not take the lock |
@@ -1914,7 +1959,8 @@ retry: | |||
1914 | * PI futexes happens in exit_pi_state(): | 1959 | * PI futexes happens in exit_pi_state(): |
1915 | */ | 1960 | */ |
1916 | if (!pi && (uval & FUTEX_WAITERS)) | 1961 | if (!pi && (uval & FUTEX_WAITERS)) |
1917 | futex_wake(uaddr, &curr->mm->mmap_sem, 1); | 1962 | futex_wake(uaddr, &curr->mm->mmap_sem, 1, |
1963 | FUTEX_BITSET_MATCH_ANY); | ||
1918 | } | 1964 | } |
1919 | return 0; | 1965 | return 0; |
1920 | } | 1966 | } |
@@ -2014,10 +2060,14 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, | |||
2014 | 2060 | ||
2015 | switch (cmd) { | 2061 | switch (cmd) { |
2016 | case FUTEX_WAIT: | 2062 | case FUTEX_WAIT: |
2017 | ret = futex_wait(uaddr, fshared, val, timeout); | 2063 | val3 = FUTEX_BITSET_MATCH_ANY; |
2064 | case FUTEX_WAIT_BITSET: | ||
2065 | ret = futex_wait(uaddr, fshared, val, timeout, val3); | ||
2018 | break; | 2066 | break; |
2019 | case FUTEX_WAKE: | 2067 | case FUTEX_WAKE: |
2020 | ret = futex_wake(uaddr, fshared, val); | 2068 | val3 = FUTEX_BITSET_MATCH_ANY; |
2069 | case FUTEX_WAKE_BITSET: | ||
2070 | ret = futex_wake(uaddr, fshared, val, val3); | ||
2021 | break; | 2071 | break; |
2022 | case FUTEX_FD: | 2072 | case FUTEX_FD: |
2023 | /* non-zero val means F_SETOWN(getpid()) & F_SETSIG(val) */ | 2073 | /* non-zero val means F_SETOWN(getpid()) & F_SETSIG(val) */ |
@@ -2057,7 +2107,8 @@ asmlinkage long sys_futex(u32 __user *uaddr, int op, u32 val, | |||
2057 | u32 val2 = 0; | 2107 | u32 val2 = 0; |
2058 | int cmd = op & FUTEX_CMD_MASK; | 2108 | int cmd = op & FUTEX_CMD_MASK; |
2059 | 2109 | ||
2060 | if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI)) { | 2110 | if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI || |
2111 | cmd == FUTEX_WAIT_BITSET)) { | ||
2061 | if (copy_from_user(&ts, utime, sizeof(ts)) != 0) | 2112 | if (copy_from_user(&ts, utime, sizeof(ts)) != 0) |
2062 | return -EFAULT; | 2113 | return -EFAULT; |
2063 | if (!timespec_valid(&ts)) | 2114 | if (!timespec_valid(&ts)) |
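
The futex.c changes above introduce bitset-based wait and wake: a waiter stores a 32-bit mask in futex_q.bitset, futex_wake() only wakes waiters whose mask intersects the waker's, and FUTEX_BITSET_MATCH_ANY restores the old wake-everybody semantics. A minimal userspace sketch of the intended use follows; it assumes a C library without wrappers for the new opcodes, so the constants are defined locally (with the values this series adds to linux/futex.h) and the raw syscall is used.

    #include <stdint.h>
    #include <stdio.h>
    #include <unistd.h>
    #include <sys/syscall.h>
    #include <linux/futex.h>

    /* Not present in older installed headers; values from this series. */
    #ifndef FUTEX_WAIT_BITSET
    #define FUTEX_WAIT_BITSET       9
    #define FUTEX_WAKE_BITSET       10
    #define FUTEX_BITSET_MATCH_ANY  0xffffffff
    #endif

    static int futex_word;                  /* the 32-bit futex */

    /* Block while *uaddr == val, wake only on a matching bitset. */
    static long futex_wait_bits(int *uaddr, int val, uint32_t bits)
    {
            return syscall(SYS_futex, uaddr, FUTEX_WAIT_BITSET, val,
                           NULL, NULL, bits);
    }

    static long futex_wake_bits(int *uaddr, int nr, uint32_t bits)
    {
            return syscall(SYS_futex, uaddr, FUTEX_WAKE_BITSET, nr,
                           NULL, NULL, bits);
    }

    int main(void)
    {
            long woken;

            /* Returns immediately (EWOULDBLOCK) because *uaddr != 1. */
            futex_wait_bits(&futex_word, 1, 0x1);

            /* Wake at most one waiter whose bitset has bit 0 set. */
            woken = futex_wake_bits(&futex_word, 1, 0x1);
            printf("woke %ld waiter(s)\n", woken);
            return 0;
    }

As the checks at the top of futex_wait() and futex_wake() show, a zero bitset is rejected with -EINVAL by both new operations.
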
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c index 00b572666cc7..133d558db452 100644 --- a/kernel/futex_compat.c +++ b/kernel/futex_compat.c | |||
@@ -30,6 +30,15 @@ fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry, | |||
30 | return 0; | 30 | return 0; |
31 | } | 31 | } |
32 | 32 | ||
33 | static void __user *futex_uaddr(struct robust_list *entry, | ||
34 | compat_long_t futex_offset) | ||
35 | { | ||
36 | compat_uptr_t base = ptr_to_compat(entry); | ||
37 | void __user *uaddr = compat_ptr(base + futex_offset); | ||
38 | |||
39 | return uaddr; | ||
40 | } | ||
41 | |||
33 | /* | 42 | /* |
34 | * Walk curr->robust_list (very carefully, it's a userspace list!) | 43 | * Walk curr->robust_list (very carefully, it's a userspace list!) |
35 | * and mark any locks found there dead, and notify any waiters. | 44 | * and mark any locks found there dead, and notify any waiters. |
@@ -76,11 +85,12 @@ void compat_exit_robust_list(struct task_struct *curr) | |||
76 | * A pending lock might already be on the list, so | 85 | * A pending lock might already be on the list, so |
77 | * don't process it twice: | 86 | * don't process it twice: |
78 | */ | 87 | */ |
79 | if (entry != pending) | 88 | if (entry != pending) { |
80 | if (handle_futex_death((void __user *)entry + futex_offset, | 89 | void __user *uaddr = futex_uaddr(entry, futex_offset); |
81 | curr, pi)) | ||
82 | return; | ||
83 | 90 | ||
91 | if (handle_futex_death(uaddr, curr, pi)) | ||
92 | return; | ||
93 | } | ||
84 | if (rc) | 94 | if (rc) |
85 | return; | 95 | return; |
86 | uentry = next_uentry; | 96 | uentry = next_uentry; |
@@ -94,9 +104,11 @@ void compat_exit_robust_list(struct task_struct *curr) | |||
94 | 104 | ||
95 | cond_resched(); | 105 | cond_resched(); |
96 | } | 106 | } |
97 | if (pending) | 107 | if (pending) { |
98 | handle_futex_death((void __user *)pending + futex_offset, | 108 | void __user *uaddr = futex_uaddr(pending, futex_offset); |
99 | curr, pip); | 109 | |
110 | handle_futex_death(uaddr, curr, pip); | ||
111 | } | ||
100 | } | 112 | } |
101 | 113 | ||
102 | asmlinkage long | 114 | asmlinkage long |
@@ -155,7 +167,8 @@ asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val, | |||
155 | int val2 = 0; | 167 | int val2 = 0; |
156 | int cmd = op & FUTEX_CMD_MASK; | 168 | int cmd = op & FUTEX_CMD_MASK; |
157 | 169 | ||
158 | if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI)) { | 170 | if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI || |
171 | cmd == FUTEX_WAIT_BITSET)) { | ||
159 | if (get_compat_timespec(&ts, utime)) | 172 | if (get_compat_timespec(&ts, utime)) |
160 | return -EFAULT; | 173 | return -EFAULT; |
161 | if (!timespec_valid(&ts)) | 174 | if (!timespec_valid(&ts)) |
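
The compat path now computes the user-space lock address with futex_uaddr() (compat pointer plus the registered futex_offset) before calling handle_futex_death(). The list being walked is registered per thread with set_robust_list(2), normally by the C library; the sketch below registers an empty list by hand purely to illustrate the layout the walker expects (raw syscall, no glibc wrapper assumed).

    #include <stdio.h>
    #include <unistd.h>
    #include <sys/syscall.h>
    #include <linux/futex.h>

    /* Per-thread robust list head; pthreads normally owns this. */
    static struct robust_list_head head = {
            .list            = { &head.list },  /* empty list points at itself */
            .futex_offset    = 0,               /* offset of the lock word in each entry */
            .list_op_pending = NULL,
    };

    int main(void)
    {
            if (syscall(SYS_set_robust_list, &head, sizeof(head)))
                    perror("set_robust_list");
            else
                    printf("robust list registered\n");
            return 0;
    }
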
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index b6d2ff7e37ee..1069998fe25f 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c | |||
@@ -325,6 +325,22 @@ unsigned long ktime_divns(const ktime_t kt, s64 div) | |||
325 | } | 325 | } |
326 | #endif /* BITS_PER_LONG >= 64 */ | 326 | #endif /* BITS_PER_LONG >= 64 */ |
327 | 327 | ||
328 | /* | ||
329 | * Check, whether the timer is on the callback pending list | ||
330 | */ | ||
331 | static inline int hrtimer_cb_pending(const struct hrtimer *timer) | ||
332 | { | ||
333 | return timer->state & HRTIMER_STATE_PENDING; | ||
334 | } | ||
335 | |||
336 | /* | ||
337 | * Remove a timer from the callback pending list | ||
338 | */ | ||
339 | static inline void hrtimer_remove_cb_pending(struct hrtimer *timer) | ||
340 | { | ||
341 | list_del_init(&timer->cb_entry); | ||
342 | } | ||
343 | |||
328 | /* High resolution timer related functions */ | 344 | /* High resolution timer related functions */ |
329 | #ifdef CONFIG_HIGH_RES_TIMERS | 345 | #ifdef CONFIG_HIGH_RES_TIMERS |
330 | 346 | ||
@@ -494,29 +510,12 @@ void hres_timers_resume(void) | |||
494 | } | 510 | } |
495 | 511 | ||
496 | /* | 512 | /* |
497 | * Check, whether the timer is on the callback pending list | ||
498 | */ | ||
499 | static inline int hrtimer_cb_pending(const struct hrtimer *timer) | ||
500 | { | ||
501 | return timer->state & HRTIMER_STATE_PENDING; | ||
502 | } | ||
503 | |||
504 | /* | ||
505 | * Remove a timer from the callback pending list | ||
506 | */ | ||
507 | static inline void hrtimer_remove_cb_pending(struct hrtimer *timer) | ||
508 | { | ||
509 | list_del_init(&timer->cb_entry); | ||
510 | } | ||
511 | |||
512 | /* | ||
513 | * Initialize the high resolution related parts of cpu_base | 513 | * Initialize the high resolution related parts of cpu_base |
514 | */ | 514 | */ |
515 | static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) | 515 | static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) |
516 | { | 516 | { |
517 | base->expires_next.tv64 = KTIME_MAX; | 517 | base->expires_next.tv64 = KTIME_MAX; |
518 | base->hres_active = 0; | 518 | base->hres_active = 0; |
519 | INIT_LIST_HEAD(&base->cb_pending); | ||
520 | } | 519 | } |
521 | 520 | ||
522 | /* | 521 | /* |
@@ -524,7 +523,6 @@ static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) | |||
524 | */ | 523 | */ |
525 | static inline void hrtimer_init_timer_hres(struct hrtimer *timer) | 524 | static inline void hrtimer_init_timer_hres(struct hrtimer *timer) |
526 | { | 525 | { |
527 | INIT_LIST_HEAD(&timer->cb_entry); | ||
528 | } | 526 | } |
529 | 527 | ||
530 | /* | 528 | /* |
@@ -602,7 +600,7 @@ static int hrtimer_switch_to_hres(void) | |||
602 | /* "Retrigger" the interrupt to get things going */ | 600 | /* "Retrigger" the interrupt to get things going */ |
603 | retrigger_next_event(NULL); | 601 | retrigger_next_event(NULL); |
604 | local_irq_restore(flags); | 602 | local_irq_restore(flags); |
605 | printk(KERN_INFO "Switched to high resolution mode on CPU %d\n", | 603 | printk(KERN_DEBUG "Switched to high resolution mode on CPU %d\n", |
606 | smp_processor_id()); | 604 | smp_processor_id()); |
607 | return 1; | 605 | return 1; |
608 | } | 606 | } |
@@ -618,10 +616,13 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer, | |||
618 | { | 616 | { |
619 | return 0; | 617 | return 0; |
620 | } | 618 | } |
621 | static inline int hrtimer_cb_pending(struct hrtimer *timer) { return 0; } | ||
622 | static inline void hrtimer_remove_cb_pending(struct hrtimer *timer) { } | ||
623 | static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { } | 619 | static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { } |
624 | static inline void hrtimer_init_timer_hres(struct hrtimer *timer) { } | 620 | static inline void hrtimer_init_timer_hres(struct hrtimer *timer) { } |
621 | static inline int hrtimer_reprogram(struct hrtimer *timer, | ||
622 | struct hrtimer_clock_base *base) | ||
623 | { | ||
624 | return 0; | ||
625 | } | ||
625 | 626 | ||
626 | #endif /* CONFIG_HIGH_RES_TIMERS */ | 627 | #endif /* CONFIG_HIGH_RES_TIMERS */ |
627 | 628 | ||
@@ -850,6 +851,14 @@ hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode) | |||
850 | #ifdef CONFIG_TIME_LOW_RES | 851 | #ifdef CONFIG_TIME_LOW_RES |
851 | tim = ktime_add(tim, base->resolution); | 852 | tim = ktime_add(tim, base->resolution); |
852 | #endif | 853 | #endif |
854 | /* | ||
855 | * Careful here: User space might have asked for a | ||
856 | * very long sleep, so the add above might result in a | ||
857 | * negative number, which enqueues the timer in front | ||
858 | * of the queue. | ||
859 | */ | ||
860 | if (tim.tv64 < 0) | ||
861 | tim.tv64 = KTIME_MAX; | ||
853 | } | 862 | } |
854 | timer->expires = tim; | 863 | timer->expires = tim; |
855 | 864 | ||
@@ -993,6 +1002,7 @@ void hrtimer_init(struct hrtimer *timer, clockid_t clock_id, | |||
993 | clock_id = CLOCK_MONOTONIC; | 1002 | clock_id = CLOCK_MONOTONIC; |
994 | 1003 | ||
995 | timer->base = &cpu_base->clock_base[clock_id]; | 1004 | timer->base = &cpu_base->clock_base[clock_id]; |
1005 | INIT_LIST_HEAD(&timer->cb_entry); | ||
996 | hrtimer_init_timer_hres(timer); | 1006 | hrtimer_init_timer_hres(timer); |
997 | 1007 | ||
998 | #ifdef CONFIG_TIMER_STATS | 1008 | #ifdef CONFIG_TIMER_STATS |
@@ -1022,6 +1032,85 @@ int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp) | |||
1022 | } | 1032 | } |
1023 | EXPORT_SYMBOL_GPL(hrtimer_get_res); | 1033 | EXPORT_SYMBOL_GPL(hrtimer_get_res); |
1024 | 1034 | ||
1035 | static void run_hrtimer_pending(struct hrtimer_cpu_base *cpu_base) | ||
1036 | { | ||
1037 | spin_lock_irq(&cpu_base->lock); | ||
1038 | |||
1039 | while (!list_empty(&cpu_base->cb_pending)) { | ||
1040 | enum hrtimer_restart (*fn)(struct hrtimer *); | ||
1041 | struct hrtimer *timer; | ||
1042 | int restart; | ||
1043 | |||
1044 | timer = list_entry(cpu_base->cb_pending.next, | ||
1045 | struct hrtimer, cb_entry); | ||
1046 | |||
1047 | timer_stats_account_hrtimer(timer); | ||
1048 | |||
1049 | fn = timer->function; | ||
1050 | __remove_hrtimer(timer, timer->base, HRTIMER_STATE_CALLBACK, 0); | ||
1051 | spin_unlock_irq(&cpu_base->lock); | ||
1052 | |||
1053 | restart = fn(timer); | ||
1054 | |||
1055 | spin_lock_irq(&cpu_base->lock); | ||
1056 | |||
1057 | timer->state &= ~HRTIMER_STATE_CALLBACK; | ||
1058 | if (restart == HRTIMER_RESTART) { | ||
1059 | BUG_ON(hrtimer_active(timer)); | ||
1060 | /* | ||
1061 | * Enqueue the timer, allow reprogramming of the event | ||
1062 | * device | ||
1063 | */ | ||
1064 | enqueue_hrtimer(timer, timer->base, 1); | ||
1065 | } else if (hrtimer_active(timer)) { | ||
1066 | /* | ||
1067 | * If the timer was rearmed on another CPU, reprogram | ||
1068 | * the event device. | ||
1069 | */ | ||
1070 | if (timer->base->first == &timer->node) | ||
1071 | hrtimer_reprogram(timer, timer->base); | ||
1072 | } | ||
1073 | } | ||
1074 | spin_unlock_irq(&cpu_base->lock); | ||
1075 | } | ||
1076 | |||
1077 | static void __run_hrtimer(struct hrtimer *timer) | ||
1078 | { | ||
1079 | struct hrtimer_clock_base *base = timer->base; | ||
1080 | struct hrtimer_cpu_base *cpu_base = base->cpu_base; | ||
1081 | enum hrtimer_restart (*fn)(struct hrtimer *); | ||
1082 | int restart; | ||
1083 | |||
1084 | __remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0); | ||
1085 | timer_stats_account_hrtimer(timer); | ||
1086 | |||
1087 | fn = timer->function; | ||
1088 | if (timer->cb_mode == HRTIMER_CB_IRQSAFE_NO_SOFTIRQ) { | ||
1089 | /* | ||
1090 | * Used for scheduler timers, avoid lock inversion with | ||
1091 | * rq->lock and tasklist_lock. | ||
1092 | * | ||
1093 | * These timers are required to deal with enqueue expiry | ||
1094 | * themselves and are not allowed to migrate. | ||
1095 | */ | ||
1096 | spin_unlock(&cpu_base->lock); | ||
1097 | restart = fn(timer); | ||
1098 | spin_lock(&cpu_base->lock); | ||
1099 | } else | ||
1100 | restart = fn(timer); | ||
1101 | |||
1102 | /* | ||
1103 | * Note: We clear the CALLBACK bit after enqueue_hrtimer to avoid | ||
1104 | * reprogramming of the event hardware. This happens at the end of this | ||
1105 | * function anyway. | ||
1106 | */ | ||
1107 | if (restart != HRTIMER_NORESTART) { | ||
1108 | BUG_ON(timer->state != HRTIMER_STATE_CALLBACK); | ||
1109 | enqueue_hrtimer(timer, base, 0); | ||
1110 | } | ||
1111 | timer->state &= ~HRTIMER_STATE_CALLBACK; | ||
1112 | } | ||
1113 | |||
1025 | #ifdef CONFIG_HIGH_RES_TIMERS | 1114 | #ifdef CONFIG_HIGH_RES_TIMERS |
1026 | 1115 | ||
1027 | /* | 1116 | /* |
@@ -1079,21 +1168,7 @@ void hrtimer_interrupt(struct clock_event_device *dev) | |||
1079 | continue; | 1168 | continue; |
1080 | } | 1169 | } |
1081 | 1170 | ||
1082 | __remove_hrtimer(timer, base, | 1171 | __run_hrtimer(timer); |
1083 | HRTIMER_STATE_CALLBACK, 0); | ||
1084 | timer_stats_account_hrtimer(timer); | ||
1085 | |||
1086 | /* | ||
1087 | * Note: We clear the CALLBACK bit after | ||
1088 | * enqueue_hrtimer to avoid reprogramming of | ||
1089 | * the event hardware. This happens at the end | ||
1090 | * of this function anyway. | ||
1091 | */ | ||
1092 | if (timer->function(timer) != HRTIMER_NORESTART) { | ||
1093 | BUG_ON(timer->state != HRTIMER_STATE_CALLBACK); | ||
1094 | enqueue_hrtimer(timer, base, 0); | ||
1095 | } | ||
1096 | timer->state &= ~HRTIMER_STATE_CALLBACK; | ||
1097 | } | 1172 | } |
1098 | spin_unlock(&cpu_base->lock); | 1173 | spin_unlock(&cpu_base->lock); |
1099 | base++; | 1174 | base++; |
@@ -1114,52 +1189,41 @@ void hrtimer_interrupt(struct clock_event_device *dev) | |||
1114 | 1189 | ||
1115 | static void run_hrtimer_softirq(struct softirq_action *h) | 1190 | static void run_hrtimer_softirq(struct softirq_action *h) |
1116 | { | 1191 | { |
1117 | struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); | 1192 | run_hrtimer_pending(&__get_cpu_var(hrtimer_bases)); |
1118 | 1193 | } | |
1119 | spin_lock_irq(&cpu_base->lock); | ||
1120 | |||
1121 | while (!list_empty(&cpu_base->cb_pending)) { | ||
1122 | enum hrtimer_restart (*fn)(struct hrtimer *); | ||
1123 | struct hrtimer *timer; | ||
1124 | int restart; | ||
1125 | |||
1126 | timer = list_entry(cpu_base->cb_pending.next, | ||
1127 | struct hrtimer, cb_entry); | ||
1128 | 1194 | ||
1129 | timer_stats_account_hrtimer(timer); | 1195 | #endif /* CONFIG_HIGH_RES_TIMERS */ |
1130 | 1196 | ||
1131 | fn = timer->function; | 1197 | /* |
1132 | __remove_hrtimer(timer, timer->base, HRTIMER_STATE_CALLBACK, 0); | 1198 | * Called from timer softirq every jiffy, expire hrtimers: |
1133 | spin_unlock_irq(&cpu_base->lock); | 1199 | * |
1200 | * For HRT its the fall back code to run the softirq in the timer | ||
1201 | * softirq context in case the hrtimer initialization failed or has | ||
1202 | * not been done yet. | ||
1203 | */ | ||
1204 | void hrtimer_run_pending(void) | ||
1205 | { | ||
1206 | struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); | ||
1134 | 1207 | ||
1135 | restart = fn(timer); | 1208 | if (hrtimer_hres_active()) |
1209 | return; | ||
1136 | 1210 | ||
1137 | spin_lock_irq(&cpu_base->lock); | 1211 | /* |
1212 | * This _is_ ugly: We have to check in the softirq context, | ||
1213 | * whether we can switch to highres and / or nohz mode. The | ||
1214 | * clocksource switch happens in the timer interrupt with | ||
1215 | * xtime_lock held. Notification from there only sets the | ||
1216 | * check bit in the tick_oneshot code, otherwise we might | ||
1217 | * deadlock vs. xtime_lock. | ||
1218 | */ | ||
1219 | if (tick_check_oneshot_change(!hrtimer_is_hres_enabled())) | ||
1220 | hrtimer_switch_to_hres(); | ||
1138 | 1221 | ||
1139 | timer->state &= ~HRTIMER_STATE_CALLBACK; | 1222 | run_hrtimer_pending(cpu_base); |
1140 | if (restart == HRTIMER_RESTART) { | ||
1141 | BUG_ON(hrtimer_active(timer)); | ||
1142 | /* | ||
1143 | * Enqueue the timer, allow reprogramming of the event | ||
1144 | * device | ||
1145 | */ | ||
1146 | enqueue_hrtimer(timer, timer->base, 1); | ||
1147 | } else if (hrtimer_active(timer)) { | ||
1148 | /* | ||
1149 | * If the timer was rearmed on another CPU, reprogram | ||
1150 | * the event device. | ||
1151 | */ | ||
1152 | if (timer->base->first == &timer->node) | ||
1153 | hrtimer_reprogram(timer, timer->base); | ||
1154 | } | ||
1155 | } | ||
1156 | spin_unlock_irq(&cpu_base->lock); | ||
1157 | } | 1223 | } |
1158 | 1224 | ||
1159 | #endif /* CONFIG_HIGH_RES_TIMERS */ | ||
1160 | |||
1161 | /* | 1225 | /* |
1162 | * Expire the per base hrtimer-queue: | 1226 | * Called from hardirq context every jiffy |
1163 | */ | 1227 | */ |
1164 | static inline void run_hrtimer_queue(struct hrtimer_cpu_base *cpu_base, | 1228 | static inline void run_hrtimer_queue(struct hrtimer_cpu_base *cpu_base, |
1165 | int index) | 1229 | int index) |
@@ -1173,46 +1237,27 @@ static inline void run_hrtimer_queue(struct hrtimer_cpu_base *cpu_base, | |||
1173 | if (base->get_softirq_time) | 1237 | if (base->get_softirq_time) |
1174 | base->softirq_time = base->get_softirq_time(); | 1238 | base->softirq_time = base->get_softirq_time(); |
1175 | 1239 | ||
1176 | spin_lock_irq(&cpu_base->lock); | 1240 | spin_lock(&cpu_base->lock); |
1177 | 1241 | ||
1178 | while ((node = base->first)) { | 1242 | while ((node = base->first)) { |
1179 | struct hrtimer *timer; | 1243 | struct hrtimer *timer; |
1180 | enum hrtimer_restart (*fn)(struct hrtimer *); | ||
1181 | int restart; | ||
1182 | 1244 | ||
1183 | timer = rb_entry(node, struct hrtimer, node); | 1245 | timer = rb_entry(node, struct hrtimer, node); |
1184 | if (base->softirq_time.tv64 <= timer->expires.tv64) | 1246 | if (base->softirq_time.tv64 <= timer->expires.tv64) |
1185 | break; | 1247 | break; |
1186 | 1248 | ||
1187 | #ifdef CONFIG_HIGH_RES_TIMERS | 1249 | if (timer->cb_mode == HRTIMER_CB_SOFTIRQ) { |
1188 | WARN_ON_ONCE(timer->cb_mode == HRTIMER_CB_IRQSAFE_NO_SOFTIRQ); | 1250 | __remove_hrtimer(timer, base, HRTIMER_STATE_PENDING, 0); |
1189 | #endif | 1251 | list_add_tail(&timer->cb_entry, |
1190 | timer_stats_account_hrtimer(timer); | 1252 | &base->cpu_base->cb_pending); |
1191 | 1253 | continue; | |
1192 | fn = timer->function; | ||
1193 | __remove_hrtimer(timer, base, HRTIMER_STATE_CALLBACK, 0); | ||
1194 | spin_unlock_irq(&cpu_base->lock); | ||
1195 | |||
1196 | restart = fn(timer); | ||
1197 | |||
1198 | spin_lock_irq(&cpu_base->lock); | ||
1199 | |||
1200 | timer->state &= ~HRTIMER_STATE_CALLBACK; | ||
1201 | if (restart != HRTIMER_NORESTART) { | ||
1202 | BUG_ON(hrtimer_active(timer)); | ||
1203 | enqueue_hrtimer(timer, base, 0); | ||
1204 | } | 1254 | } |
1255 | |||
1256 | __run_hrtimer(timer); | ||
1205 | } | 1257 | } |
1206 | spin_unlock_irq(&cpu_base->lock); | 1258 | spin_unlock(&cpu_base->lock); |
1207 | } | 1259 | } |
1208 | 1260 | ||
1209 | /* | ||
1210 | * Called from timer softirq every jiffy, expire hrtimers: | ||
1211 | * | ||
1212 | * For HRT its the fall back code to run the softirq in the timer | ||
1213 | * softirq context in case the hrtimer initialization failed or has | ||
1214 | * not been done yet. | ||
1215 | */ | ||
1216 | void hrtimer_run_queues(void) | 1261 | void hrtimer_run_queues(void) |
1217 | { | 1262 | { |
1218 | struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); | 1263 | struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases); |
@@ -1221,18 +1266,6 @@ void hrtimer_run_queues(void) | |||
1221 | if (hrtimer_hres_active()) | 1266 | if (hrtimer_hres_active()) |
1222 | return; | 1267 | return; |
1223 | 1268 | ||
1224 | /* | ||
1225 | * This _is_ ugly: We have to check in the softirq context, | ||
1226 | * whether we can switch to highres and / or nohz mode. The | ||
1227 | * clocksource switch happens in the timer interrupt with | ||
1228 | * xtime_lock held. Notification from there only sets the | ||
1229 | * check bit in the tick_oneshot code, otherwise we might | ||
1230 | * deadlock vs. xtime_lock. | ||
1231 | */ | ||
1232 | if (tick_check_oneshot_change(!hrtimer_is_hres_enabled())) | ||
1233 | if (hrtimer_switch_to_hres()) | ||
1234 | return; | ||
1235 | |||
1236 | hrtimer_get_softirq_time(cpu_base); | 1269 | hrtimer_get_softirq_time(cpu_base); |
1237 | 1270 | ||
1238 | for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) | 1271 | for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) |
@@ -1260,7 +1293,7 @@ void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task) | |||
1260 | sl->timer.function = hrtimer_wakeup; | 1293 | sl->timer.function = hrtimer_wakeup; |
1261 | sl->task = task; | 1294 | sl->task = task; |
1262 | #ifdef CONFIG_HIGH_RES_TIMERS | 1295 | #ifdef CONFIG_HIGH_RES_TIMERS |
1263 | sl->timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_RESTART; | 1296 | sl->timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; |
1264 | #endif | 1297 | #endif |
1265 | } | 1298 | } |
1266 | 1299 | ||
@@ -1271,6 +1304,8 @@ static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mod | |||
1271 | do { | 1304 | do { |
1272 | set_current_state(TASK_INTERRUPTIBLE); | 1305 | set_current_state(TASK_INTERRUPTIBLE); |
1273 | hrtimer_start(&t->timer, t->timer.expires, mode); | 1306 | hrtimer_start(&t->timer, t->timer.expires, mode); |
1307 | if (!hrtimer_active(&t->timer)) | ||
1308 | t->task = NULL; | ||
1274 | 1309 | ||
1275 | if (likely(t->task)) | 1310 | if (likely(t->task)) |
1276 | schedule(); | 1311 | schedule(); |
@@ -1280,6 +1315,8 @@ static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mod | |||
1280 | 1315 | ||
1281 | } while (t->task && !signal_pending(current)); | 1316 | } while (t->task && !signal_pending(current)); |
1282 | 1317 | ||
1318 | __set_current_state(TASK_RUNNING); | ||
1319 | |||
1283 | return t->task == NULL; | 1320 | return t->task == NULL; |
1284 | } | 1321 | } |
1285 | 1322 | ||
@@ -1370,7 +1407,7 @@ sys_nanosleep(struct timespec __user *rqtp, struct timespec __user *rmtp) | |||
1370 | /* | 1407 | /* |
1371 | * Functions related to boot-time initialization: | 1408 | * Functions related to boot-time initialization: |
1372 | */ | 1409 | */ |
1373 | static void __devinit init_hrtimers_cpu(int cpu) | 1410 | static void __cpuinit init_hrtimers_cpu(int cpu) |
1374 | { | 1411 | { |
1375 | struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu); | 1412 | struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu); |
1376 | int i; | 1413 | int i; |
@@ -1381,6 +1418,7 @@ static void __devinit init_hrtimers_cpu(int cpu) | |||
1381 | for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) | 1418 | for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) |
1382 | cpu_base->clock_base[i].cpu_base = cpu_base; | 1419 | cpu_base->clock_base[i].cpu_base = cpu_base; |
1383 | 1420 | ||
1421 | INIT_LIST_HEAD(&cpu_base->cb_pending); | ||
1384 | hrtimer_init_hres(cpu_base); | 1422 | hrtimer_init_hres(cpu_base); |
1385 | } | 1423 | } |
1386 | 1424 | ||
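
The hrtimer.c rework above factors callback execution into __run_hrtimer() and run_hrtimer_pending(): HRTIMER_CB_SOFTIRQ timers are queued on cpu_base->cb_pending, while HRTIMER_CB_IRQSAFE_NO_SOFTIRQ timers (now used by hrtimer_sleeper) run directly with only cpu_base->lock dropped. None of this changes the driver-visible API; for reference, a minimal in-kernel user against the interfaces of this series (the module boilerplate is illustrative, not from this diff):

    #include <linux/hrtimer.h>
    #include <linux/ktime.h>
    #include <linux/kernel.h>
    #include <linux/module.h>

    static struct hrtimer demo_timer;

    static enum hrtimer_restart demo_timer_fn(struct hrtimer *timer)
    {
            printk(KERN_INFO "demo hrtimer fired\n");
            return HRTIMER_NORESTART;               /* one-shot */
    }

    static int __init demo_init(void)
    {
            hrtimer_init(&demo_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
            demo_timer.function = demo_timer_fn;
            /* expire 100ms from now; cb_mode decides where the callback runs */
            hrtimer_start(&demo_timer, ktime_set(0, 100 * NSEC_PER_MSEC),
                          HRTIMER_MODE_REL);
            return 0;
    }

    static void __exit demo_exit(void)
    {
            hrtimer_cancel(&demo_timer);
    }

    module_init(demo_init);
    module_exit(demo_exit);
    MODULE_LICENSE("GPL");
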
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 9b5dff6b3f6a..44019ce30a14 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c | |||
@@ -297,18 +297,13 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc) | |||
297 | 297 | ||
298 | if (unlikely(desc->status & IRQ_INPROGRESS)) | 298 | if (unlikely(desc->status & IRQ_INPROGRESS)) |
299 | goto out_unlock; | 299 | goto out_unlock; |
300 | desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); | ||
300 | kstat_cpu(cpu).irqs[irq]++; | 301 | kstat_cpu(cpu).irqs[irq]++; |
301 | 302 | ||
302 | action = desc->action; | 303 | action = desc->action; |
303 | if (unlikely(!action || (desc->status & IRQ_DISABLED))) { | 304 | if (unlikely(!action || (desc->status & IRQ_DISABLED))) |
304 | if (desc->chip->mask) | ||
305 | desc->chip->mask(irq); | ||
306 | desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); | ||
307 | desc->status |= IRQ_PENDING; | ||
308 | goto out_unlock; | 305 | goto out_unlock; |
309 | } | ||
310 | 306 | ||
311 | desc->status &= ~(IRQ_REPLAY | IRQ_WAITING | IRQ_PENDING); | ||
312 | desc->status |= IRQ_INPROGRESS; | 307 | desc->status |= IRQ_INPROGRESS; |
313 | spin_unlock(&desc->lock); | 308 | spin_unlock(&desc->lock); |
314 | 309 | ||
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index e391cbb1f566..dc335ad27525 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c | |||
@@ -178,9 +178,11 @@ fastcall unsigned int __do_IRQ(unsigned int irq) | |||
178 | */ | 178 | */ |
179 | if (desc->chip->ack) | 179 | if (desc->chip->ack) |
180 | desc->chip->ack(irq); | 180 | desc->chip->ack(irq); |
181 | action_ret = handle_IRQ_event(irq, desc->action); | 181 | if (likely(!(desc->status & IRQ_DISABLED))) { |
182 | if (!noirqdebug) | 182 | action_ret = handle_IRQ_event(irq, desc->action); |
183 | note_interrupt(irq, desc, action_ret); | 183 | if (!noirqdebug) |
184 | note_interrupt(irq, desc, action_ret); | ||
185 | } | ||
184 | desc->chip->end(irq); | 186 | desc->chip->end(irq); |
185 | return 1; | 187 | return 1; |
186 | } | 188 | } |
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 1f314221d534..438a01464287 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c | |||
@@ -479,6 +479,9 @@ void free_irq(unsigned int irq, void *dev_id) | |||
479 | return; | 479 | return; |
480 | } | 480 | } |
481 | printk(KERN_ERR "Trying to free already-free IRQ %d\n", irq); | 481 | printk(KERN_ERR "Trying to free already-free IRQ %d\n", irq); |
482 | #ifdef CONFIG_DEBUG_SHIRQ | ||
483 | dump_stack(); | ||
484 | #endif | ||
482 | spin_unlock_irqrestore(&desc->lock, flags); | 485 | spin_unlock_irqrestore(&desc->lock, flags); |
483 | return; | 486 | return; |
484 | } | 487 | } |
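
The free_irq() hunk only adds a dump_stack() under CONFIG_DEBUG_SHIRQ when no action matches the given dev_id, which makes double-free bugs visible in the log. For context, the dev_id cookie has to be the same one passed to request_irq(); a hedged sketch (the IRQ number and names are made up):

    #include <linux/interrupt.h>
    #include <linux/module.h>

    #define DEMO_IRQ 16                     /* assumption: example IRQ line */

    static int demo_token;                  /* dev_id cookie, required for shared IRQs */

    static irqreturn_t demo_handler(int irq, void *dev_id)
    {
            return IRQ_HANDLED;
    }

    static int __init demo_init(void)
    {
            return request_irq(DEMO_IRQ, demo_handler, IRQF_SHARED,
                               "demo-device", &demo_token);
    }

    static void __exit demo_exit(void)
    {
            /* Must pass the same dev_id; a second call would hit the
             * "Trying to free already-free IRQ" path patched above. */
            free_irq(DEMO_IRQ, &demo_token);
    }

    module_init(demo_init);
    module_exit(demo_exit);
    MODULE_LICENSE("GPL");
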
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index 50b81b98046a..c2f2ccb0549a 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c | |||
@@ -75,6 +75,18 @@ static int irq_affinity_write_proc(struct file *file, const char __user *buffer, | |||
75 | 75 | ||
76 | #endif | 76 | #endif |
77 | 77 | ||
78 | static int irq_spurious_read(char *page, char **start, off_t off, | ||
79 | int count, int *eof, void *data) | ||
80 | { | ||
81 | struct irq_desc *d = &irq_desc[(long) data]; | ||
82 | return sprintf(page, "count %u\n" | ||
83 | "unhandled %u\n" | ||
84 | "last_unhandled %u ms\n", | ||
85 | d->irq_count, | ||
86 | d->irqs_unhandled, | ||
87 | jiffies_to_msecs(d->last_unhandled)); | ||
88 | } | ||
89 | |||
78 | #define MAX_NAMELEN 128 | 90 | #define MAX_NAMELEN 128 |
79 | 91 | ||
80 | static int name_unique(unsigned int irq, struct irqaction *new_action) | 92 | static int name_unique(unsigned int irq, struct irqaction *new_action) |
@@ -118,6 +130,7 @@ void register_handler_proc(unsigned int irq, struct irqaction *action) | |||
118 | void register_irq_proc(unsigned int irq) | 130 | void register_irq_proc(unsigned int irq) |
119 | { | 131 | { |
120 | char name [MAX_NAMELEN]; | 132 | char name [MAX_NAMELEN]; |
133 | struct proc_dir_entry *entry; | ||
121 | 134 | ||
122 | if (!root_irq_dir || | 135 | if (!root_irq_dir || |
123 | (irq_desc[irq].chip == &no_irq_chip) || | 136 | (irq_desc[irq].chip == &no_irq_chip) || |
@@ -132,8 +145,6 @@ void register_irq_proc(unsigned int irq) | |||
132 | 145 | ||
133 | #ifdef CONFIG_SMP | 146 | #ifdef CONFIG_SMP |
134 | { | 147 | { |
135 | struct proc_dir_entry *entry; | ||
136 | |||
137 | /* create /proc/irq/<irq>/smp_affinity */ | 148 | /* create /proc/irq/<irq>/smp_affinity */ |
138 | entry = create_proc_entry("smp_affinity", 0600, irq_desc[irq].dir); | 149 | entry = create_proc_entry("smp_affinity", 0600, irq_desc[irq].dir); |
139 | 150 | ||
@@ -144,6 +155,12 @@ void register_irq_proc(unsigned int irq) | |||
144 | } | 155 | } |
145 | } | 156 | } |
146 | #endif | 157 | #endif |
158 | |||
159 | entry = create_proc_entry("spurious", 0444, irq_desc[irq].dir); | ||
160 | if (entry) { | ||
161 | entry->data = (void *)(long)irq; | ||
162 | entry->read_proc = irq_spurious_read; | ||
163 | } | ||
147 | } | 164 | } |
148 | 165 | ||
149 | #undef MAX_NAMELEN | 166 | #undef MAX_NAMELEN |
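
With the proc.c change every interrupt gets a read-only /proc/irq/<irq>/spurious file exporting the counters the spurious-IRQ detector keeps (irq_count, irqs_unhandled, last_unhandled). A trivial userspace reader, with the IRQ number being an assumption about the local machine:

    #include <stdio.h>

    int main(void)
    {
            char buf[256];
            size_t n;
            /* assumption: IRQ 16 exists on this machine */
            FILE *f = fopen("/proc/irq/16/spurious", "r");

            if (!f) {
                    perror("fopen");
                    return 1;
            }
            n = fread(buf, 1, sizeof(buf) - 1, f);
            buf[n] = '\0';
            fputs(buf, stdout);     /* count / unhandled / last_unhandled */
            fclose(f);
            return 0;
    }
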
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 32b161972fad..a6b2bc831dd0 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c | |||
@@ -10,6 +10,7 @@ | |||
10 | #include <linux/module.h> | 10 | #include <linux/module.h> |
11 | #include <linux/kallsyms.h> | 11 | #include <linux/kallsyms.h> |
12 | #include <linux/interrupt.h> | 12 | #include <linux/interrupt.h> |
13 | #include <linux/moduleparam.h> | ||
13 | 14 | ||
14 | static int irqfixup __read_mostly; | 15 | static int irqfixup __read_mostly; |
15 | 16 | ||
@@ -225,6 +226,8 @@ int noirqdebug_setup(char *str) | |||
225 | } | 226 | } |
226 | 227 | ||
227 | __setup("noirqdebug", noirqdebug_setup); | 228 | __setup("noirqdebug", noirqdebug_setup); |
229 | module_param(noirqdebug, bool, 0644); | ||
230 | MODULE_PARM_DESC(noirqdebug, "Disable irq lockup detection when true"); | ||
228 | 231 | ||
229 | static int __init irqfixup_setup(char *str) | 232 | static int __init irqfixup_setup(char *str) |
230 | { | 233 | { |
@@ -236,6 +239,8 @@ static int __init irqfixup_setup(char *str) | |||
236 | } | 239 | } |
237 | 240 | ||
238 | __setup("irqfixup", irqfixup_setup); | 241 | __setup("irqfixup", irqfixup_setup); |
242 | module_param(irqfixup, int, 0644); | ||
243 | MODULE_PARM_DESC("irqfixup", "0: No fixup, 1: irqfixup mode 2: irqpoll mode"); | ||
239 | 244 | ||
240 | static int __init irqpoll_setup(char *str) | 245 | static int __init irqpoll_setup(char *str) |
241 | { | 246 | { |
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 474219a41929..7dadc71ce516 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c | |||
@@ -32,9 +32,14 @@ | |||
32 | 32 | ||
33 | /* These will be re-linked against their real values during the second link stage */ | 33 | /* These will be re-linked against their real values during the second link stage */ |
34 | extern const unsigned long kallsyms_addresses[] __attribute__((weak)); | 34 | extern const unsigned long kallsyms_addresses[] __attribute__((weak)); |
35 | extern const unsigned long kallsyms_num_syms __attribute__((weak)); | ||
36 | extern const u8 kallsyms_names[] __attribute__((weak)); | 35 | extern const u8 kallsyms_names[] __attribute__((weak)); |
37 | 36 | ||
37 | /* tell the compiler that the count isn't in the small data section if the arch | ||
38 | * has one (eg: FRV) | ||
39 | */ | ||
40 | extern const unsigned long kallsyms_num_syms | ||
41 | __attribute__((weak, section(".rodata"))); | ||
42 | |||
38 | extern const u8 kallsyms_token_table[] __attribute__((weak)); | 43 | extern const u8 kallsyms_token_table[] __attribute__((weak)); |
39 | extern const u16 kallsyms_token_index[] __attribute__((weak)); | 44 | extern const u16 kallsyms_token_index[] __attribute__((weak)); |
40 | 45 | ||
@@ -228,10 +233,11 @@ static unsigned long get_symbol_pos(unsigned long addr, | |||
228 | int kallsyms_lookup_size_offset(unsigned long addr, unsigned long *symbolsize, | 233 | int kallsyms_lookup_size_offset(unsigned long addr, unsigned long *symbolsize, |
229 | unsigned long *offset) | 234 | unsigned long *offset) |
230 | { | 235 | { |
236 | char namebuf[KSYM_NAME_LEN]; | ||
231 | if (is_ksym_addr(addr)) | 237 | if (is_ksym_addr(addr)) |
232 | return !!get_symbol_pos(addr, symbolsize, offset); | 238 | return !!get_symbol_pos(addr, symbolsize, offset); |
233 | 239 | ||
234 | return !!module_address_lookup(addr, symbolsize, offset, NULL); | 240 | return !!module_address_lookup(addr, symbolsize, offset, NULL, namebuf); |
235 | } | 241 | } |
236 | 242 | ||
237 | /* | 243 | /* |
@@ -246,8 +252,6 @@ const char *kallsyms_lookup(unsigned long addr, | |||
246 | unsigned long *offset, | 252 | unsigned long *offset, |
247 | char **modname, char *namebuf) | 253 | char **modname, char *namebuf) |
248 | { | 254 | { |
249 | const char *msym; | ||
250 | |||
251 | namebuf[KSYM_NAME_LEN - 1] = 0; | 255 | namebuf[KSYM_NAME_LEN - 1] = 0; |
252 | namebuf[0] = 0; | 256 | namebuf[0] = 0; |
253 | 257 | ||
@@ -263,10 +267,8 @@ const char *kallsyms_lookup(unsigned long addr, | |||
263 | } | 267 | } |
264 | 268 | ||
265 | /* see if it's in a module */ | 269 | /* see if it's in a module */ |
266 | msym = module_address_lookup(addr, symbolsize, offset, modname); | 270 | return module_address_lookup(addr, symbolsize, offset, modname, |
267 | if (msym) | 271 | namebuf); |
268 | return strncpy(namebuf, msym, KSYM_NAME_LEN - 1); | ||
269 | |||
270 | return NULL; | 272 | return NULL; |
271 | } | 273 | } |
272 | 274 | ||
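
module_address_lookup() now fills a caller-provided name buffer instead of handing back a pointer into module memory, so kallsyms_lookup() simply forwards its namebuf. Callers are unaffected; a hedged sketch of typical use of the public helper (demo_resolve is an invented name):

    #include <linux/kallsyms.h>
    #include <linux/kernel.h>

    /* Resolve a kernel text address to "name+offset/size [module]". */
    static void demo_resolve(unsigned long addr)
    {
            char namebuf[KSYM_NAME_LEN];
            unsigned long size, offset;
            char *modname;
            const char *name;

            name = kallsyms_lookup(addr, &size, &offset, &modname, namebuf);
            if (!name)
                    printk(KERN_INFO "0x%lx: no symbol\n", addr);
            else if (modname)
                    printk(KERN_INFO "0x%lx: %s+0x%lx/0x%lx [%s]\n",
                           addr, name, offset, size, modname);
            else
                    printk(KERN_INFO "0x%lx: %s+0x%lx/0x%lx\n",
                           addr, name, offset, size);
    }
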
diff --git a/kernel/kexec.c b/kernel/kexec.c index aa74a1ef2da8..9a26eec9eb04 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c | |||
@@ -1404,6 +1404,7 @@ static int __init crash_save_vmcoreinfo_init(void) | |||
1404 | VMCOREINFO_OFFSET(list_head, next); | 1404 | VMCOREINFO_OFFSET(list_head, next); |
1405 | VMCOREINFO_OFFSET(list_head, prev); | 1405 | VMCOREINFO_OFFSET(list_head, prev); |
1406 | VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER); | 1406 | VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER); |
1407 | VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES); | ||
1407 | VMCOREINFO_NUMBER(NR_FREE_PAGES); | 1408 | VMCOREINFO_NUMBER(NR_FREE_PAGES); |
1408 | 1409 | ||
1409 | arch_crash_save_vmcoreinfo(); | 1410 | arch_crash_save_vmcoreinfo(); |
diff --git a/kernel/kmod.c b/kernel/kmod.c index c6a4f8aebeba..bb7df2a28bd7 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c | |||
@@ -451,13 +451,11 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, | |||
451 | enum umh_wait wait) | 451 | enum umh_wait wait) |
452 | { | 452 | { |
453 | DECLARE_COMPLETION_ONSTACK(done); | 453 | DECLARE_COMPLETION_ONSTACK(done); |
454 | int retval; | 454 | int retval = 0; |
455 | 455 | ||
456 | helper_lock(); | 456 | helper_lock(); |
457 | if (sub_info->path[0] == '\0') { | 457 | if (sub_info->path[0] == '\0') |
458 | retval = 0; | ||
459 | goto out; | 458 | goto out; |
460 | } | ||
461 | 459 | ||
462 | if (!khelper_wq || usermodehelper_disabled) { | 460 | if (!khelper_wq || usermodehelper_disabled) { |
463 | retval = -EBUSY; | 461 | retval = -EBUSY; |
@@ -468,13 +466,14 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, | |||
468 | sub_info->wait = wait; | 466 | sub_info->wait = wait; |
469 | 467 | ||
470 | queue_work(khelper_wq, &sub_info->work); | 468 | queue_work(khelper_wq, &sub_info->work); |
471 | if (wait == UMH_NO_WAIT) /* task has freed sub_info */ | 469 | if (wait == UMH_NO_WAIT) /* task has freed sub_info */ |
472 | return 0; | 470 | goto unlock; |
473 | wait_for_completion(&done); | 471 | wait_for_completion(&done); |
474 | retval = sub_info->retval; | 472 | retval = sub_info->retval; |
475 | 473 | ||
476 | out: | 474 | out: |
477 | call_usermodehelper_freeinfo(sub_info); | 475 | call_usermodehelper_freeinfo(sub_info); |
476 | unlock: | ||
478 | helper_unlock(); | 477 | helper_unlock(); |
479 | return retval; | 478 | return retval; |
480 | } | 479 | } |
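
The kmod.c fix keeps the helper_lock()/helper_unlock() pair balanced when the UMH_NO_WAIT path returns early. Callers are unaffected; a sketch of the usual setup/exec pairing against the interface of this series (the helper path and environment are illustrative, and the three-argument call_usermodehelper_setup() of this kernel version is assumed):

    #include <linux/kmod.h>
    #include <linux/errno.h>

    /* Run a userspace helper and wait for it to exit. */
    static int demo_run_helper(void)
    {
            char *argv[] = { "/bin/true", NULL };
            char *envp[] = { "HOME=/",
                             "PATH=/sbin:/bin:/usr/sbin:/usr/bin", NULL };
            struct subprocess_info *info;

            info = call_usermodehelper_setup(argv[0], argv, envp);
            if (!info)
                    return -ENOMEM;

            /* UMH_WAIT_PROC: block until the helper process has exited */
            return call_usermodehelper_exec(info, UMH_WAIT_PROC);
    }
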
diff --git a/kernel/kprobes.c b/kernel/kprobes.c index e3a5d817ac9b..d0493eafea3e 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c | |||
@@ -824,6 +824,8 @@ static int __init init_kprobes(void) | |||
824 | if (!err) | 824 | if (!err) |
825 | err = register_die_notifier(&kprobe_exceptions_nb); | 825 | err = register_die_notifier(&kprobe_exceptions_nb); |
826 | 826 | ||
827 | if (!err) | ||
828 | init_test_probes(); | ||
827 | return err; | 829 | return err; |
828 | } | 830 | } |
829 | 831 | ||
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 65daa5373ca6..e53bc30e9ba5 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c | |||
@@ -17,30 +17,34 @@ | |||
17 | #include <linux/sched.h> | 17 | #include <linux/sched.h> |
18 | 18 | ||
19 | #define KERNEL_ATTR_RO(_name) \ | 19 | #define KERNEL_ATTR_RO(_name) \ |
20 | static struct subsys_attribute _name##_attr = __ATTR_RO(_name) | 20 | static struct kobj_attribute _name##_attr = __ATTR_RO(_name) |
21 | 21 | ||
22 | #define KERNEL_ATTR_RW(_name) \ | 22 | #define KERNEL_ATTR_RW(_name) \ |
23 | static struct subsys_attribute _name##_attr = \ | 23 | static struct kobj_attribute _name##_attr = \ |
24 | __ATTR(_name, 0644, _name##_show, _name##_store) | 24 | __ATTR(_name, 0644, _name##_show, _name##_store) |
25 | 25 | ||
26 | #if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET) | 26 | #if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET) |
27 | /* current uevent sequence number */ | 27 | /* current uevent sequence number */ |
28 | static ssize_t uevent_seqnum_show(struct kset *kset, char *page) | 28 | static ssize_t uevent_seqnum_show(struct kobject *kobj, |
29 | struct kobj_attribute *attr, char *buf) | ||
29 | { | 30 | { |
30 | return sprintf(page, "%llu\n", (unsigned long long)uevent_seqnum); | 31 | return sprintf(buf, "%llu\n", (unsigned long long)uevent_seqnum); |
31 | } | 32 | } |
32 | KERNEL_ATTR_RO(uevent_seqnum); | 33 | KERNEL_ATTR_RO(uevent_seqnum); |
33 | 34 | ||
34 | /* uevent helper program, used during early boot */ | 35 | /* uevent helper program, used during early boot */ |
35 | static ssize_t uevent_helper_show(struct kset *kset, char *page) | 36 | static ssize_t uevent_helper_show(struct kobject *kobj, |
37 | struct kobj_attribute *attr, char *buf) | ||
36 | { | 38 | { |
37 | return sprintf(page, "%s\n", uevent_helper); | 39 | return sprintf(buf, "%s\n", uevent_helper); |
38 | } | 40 | } |
39 | static ssize_t uevent_helper_store(struct kset *kset, const char *page, size_t count) | 41 | static ssize_t uevent_helper_store(struct kobject *kobj, |
42 | struct kobj_attribute *attr, | ||
43 | const char *buf, size_t count) | ||
40 | { | 44 | { |
41 | if (count+1 > UEVENT_HELPER_PATH_LEN) | 45 | if (count+1 > UEVENT_HELPER_PATH_LEN) |
42 | return -ENOENT; | 46 | return -ENOENT; |
43 | memcpy(uevent_helper, page, count); | 47 | memcpy(uevent_helper, buf, count); |
44 | uevent_helper[count] = '\0'; | 48 | uevent_helper[count] = '\0'; |
45 | if (count && uevent_helper[count-1] == '\n') | 49 | if (count && uevent_helper[count-1] == '\n') |
46 | uevent_helper[count-1] = '\0'; | 50 | uevent_helper[count-1] = '\0'; |
@@ -50,21 +54,24 @@ KERNEL_ATTR_RW(uevent_helper); | |||
50 | #endif | 54 | #endif |
51 | 55 | ||
52 | #ifdef CONFIG_KEXEC | 56 | #ifdef CONFIG_KEXEC |
53 | static ssize_t kexec_loaded_show(struct kset *kset, char *page) | 57 | static ssize_t kexec_loaded_show(struct kobject *kobj, |
58 | struct kobj_attribute *attr, char *buf) | ||
54 | { | 59 | { |
55 | return sprintf(page, "%d\n", !!kexec_image); | 60 | return sprintf(buf, "%d\n", !!kexec_image); |
56 | } | 61 | } |
57 | KERNEL_ATTR_RO(kexec_loaded); | 62 | KERNEL_ATTR_RO(kexec_loaded); |
58 | 63 | ||
59 | static ssize_t kexec_crash_loaded_show(struct kset *kset, char *page) | 64 | static ssize_t kexec_crash_loaded_show(struct kobject *kobj, |
65 | struct kobj_attribute *attr, char *buf) | ||
60 | { | 66 | { |
61 | return sprintf(page, "%d\n", !!kexec_crash_image); | 67 | return sprintf(buf, "%d\n", !!kexec_crash_image); |
62 | } | 68 | } |
63 | KERNEL_ATTR_RO(kexec_crash_loaded); | 69 | KERNEL_ATTR_RO(kexec_crash_loaded); |
64 | 70 | ||
65 | static ssize_t vmcoreinfo_show(struct kset *kset, char *page) | 71 | static ssize_t vmcoreinfo_show(struct kobject *kobj, |
72 | struct kobj_attribute *attr, char *buf) | ||
66 | { | 73 | { |
67 | return sprintf(page, "%lx %x\n", | 74 | return sprintf(buf, "%lx %x\n", |
68 | paddr_vmcoreinfo_note(), | 75 | paddr_vmcoreinfo_note(), |
69 | (unsigned int)vmcoreinfo_max_size); | 76 | (unsigned int)vmcoreinfo_max_size); |
70 | } | 77 | } |
@@ -94,8 +101,8 @@ static struct bin_attribute notes_attr = { | |||
94 | .read = &notes_read, | 101 | .read = &notes_read, |
95 | }; | 102 | }; |
96 | 103 | ||
97 | decl_subsys(kernel, NULL, NULL); | 104 | struct kobject *kernel_kobj; |
98 | EXPORT_SYMBOL_GPL(kernel_subsys); | 105 | EXPORT_SYMBOL_GPL(kernel_kobj); |
99 | 106 | ||
100 | static struct attribute * kernel_attrs[] = { | 107 | static struct attribute * kernel_attrs[] = { |
101 | #if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET) | 108 | #if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET) |
@@ -116,24 +123,39 @@ static struct attribute_group kernel_attr_group = { | |||
116 | 123 | ||
117 | static int __init ksysfs_init(void) | 124 | static int __init ksysfs_init(void) |
118 | { | 125 | { |
119 | int error = subsystem_register(&kernel_subsys); | 126 | int error; |
120 | if (!error) | ||
121 | error = sysfs_create_group(&kernel_subsys.kobj, | ||
122 | &kernel_attr_group); | ||
123 | 127 | ||
124 | if (!error && notes_size > 0) { | 128 | kernel_kobj = kobject_create_and_add("kernel", NULL); |
125 | notes_attr.size = notes_size; | 129 | if (!kernel_kobj) { |
126 | error = sysfs_create_bin_file(&kernel_subsys.kobj, | 130 | error = -ENOMEM; |
127 | &notes_attr); | 131 | goto exit; |
128 | } | 132 | } |
133 | error = sysfs_create_group(kernel_kobj, &kernel_attr_group); | ||
134 | if (error) | ||
135 | goto kset_exit; | ||
129 | 136 | ||
130 | /* | 137 | if (notes_size > 0) { |
131 | * Create "/sys/kernel/uids" directory and corresponding root user's | 138 | notes_attr.size = notes_size; |
132 | * directory under it. | 139 | error = sysfs_create_bin_file(kernel_kobj, &notes_attr); |
133 | */ | 140 | if (error) |
134 | if (!error) | 141 | goto group_exit; |
135 | error = uids_kobject_init(); | 142 | } |
136 | 143 | ||
144 | /* create the /sys/kernel/uids/ directory */ | ||
145 | error = uids_sysfs_init(); | ||
146 | if (error) | ||
147 | goto notes_exit; | ||
148 | |||
149 | return 0; | ||
150 | |||
151 | notes_exit: | ||
152 | if (notes_size > 0) | ||
153 | sysfs_remove_bin_file(kernel_kobj, &notes_attr); | ||
154 | group_exit: | ||
155 | sysfs_remove_group(kernel_kobj, &kernel_attr_group); | ||
156 | kset_exit: | ||
157 | kobject_put(kernel_kobj); | ||
158 | exit: | ||
137 | return error; | 159 | return error; |
138 | } | 160 | } |
139 | 161 | ||
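
ksysfs.c moves from decl_subsys()/subsys_attribute to a plain kobject: kobject_create_and_add("kernel", NULL) creates /sys/kernel, attributes become struct kobj_attribute, and the show/store callbacks take (kobject, attribute, buffer). The same pattern for a hypothetical attribute group hung below /sys/kernel (all demo_* names are invented):

    #include <linux/kobject.h>
    #include <linux/sysfs.h>
    #include <linux/kernel.h>
    #include <linux/module.h>

    static int demo_value;

    static ssize_t demo_value_show(struct kobject *kobj,
                                   struct kobj_attribute *attr, char *buf)
    {
            return sprintf(buf, "%d\n", demo_value);
    }

    static ssize_t demo_value_store(struct kobject *kobj,
                                    struct kobj_attribute *attr,
                                    const char *buf, size_t count)
    {
            sscanf(buf, "%d", &demo_value);
            return count;
    }

    static struct kobj_attribute demo_value_attr =
            __ATTR(demo_value, 0644, demo_value_show, demo_value_store);

    static struct attribute *demo_attrs[] = {
            &demo_value_attr.attr,
            NULL,
    };

    static struct attribute_group demo_group = {
            .attrs = demo_attrs,
    };

    static struct kobject *demo_kobj;

    static int __init demo_init(void)
    {
            /* creates /sys/kernel/demo (hypothetical name) */
            demo_kobj = kobject_create_and_add("demo", kernel_kobj);
            if (!demo_kobj)
                    return -ENOMEM;
            return sysfs_create_group(demo_kobj, &demo_group);
    }

    static void __exit demo_exit(void)
    {
            sysfs_remove_group(demo_kobj, &demo_group);
            kobject_put(demo_kobj);
    }

    module_init(demo_init);
    module_exit(demo_exit);
    MODULE_LICENSE("GPL");
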
diff --git a/kernel/kthread.c b/kernel/kthread.c index dcfe724300eb..0ac887882f90 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c | |||
@@ -15,6 +15,8 @@ | |||
15 | #include <linux/mutex.h> | 15 | #include <linux/mutex.h> |
16 | #include <asm/semaphore.h> | 16 | #include <asm/semaphore.h> |
17 | 17 | ||
18 | #define KTHREAD_NICE_LEVEL (-5) | ||
19 | |||
18 | static DEFINE_SPINLOCK(kthread_create_lock); | 20 | static DEFINE_SPINLOCK(kthread_create_lock); |
19 | static LIST_HEAD(kthread_create_list); | 21 | static LIST_HEAD(kthread_create_list); |
20 | struct task_struct *kthreadd_task; | 22 | struct task_struct *kthreadd_task; |
@@ -94,10 +96,18 @@ static void create_kthread(struct kthread_create_info *create) | |||
94 | if (pid < 0) { | 96 | if (pid < 0) { |
95 | create->result = ERR_PTR(pid); | 97 | create->result = ERR_PTR(pid); |
96 | } else { | 98 | } else { |
99 | struct sched_param param = { .sched_priority = 0 }; | ||
97 | wait_for_completion(&create->started); | 100 | wait_for_completion(&create->started); |
98 | read_lock(&tasklist_lock); | 101 | read_lock(&tasklist_lock); |
99 | create->result = find_task_by_pid(pid); | 102 | create->result = find_task_by_pid(pid); |
100 | read_unlock(&tasklist_lock); | 103 | read_unlock(&tasklist_lock); |
104 | /* | ||
105 | * root may have changed our (kthreadd's) priority or CPU mask. | ||
106 | * The kernel thread should not inherit these properties. | ||
107 | */ | ||
108 | sched_setscheduler(create->result, SCHED_NORMAL, ¶m); | ||
109 | set_user_nice(create->result, KTHREAD_NICE_LEVEL); | ||
110 | set_cpus_allowed(create->result, CPU_MASK_ALL); | ||
101 | } | 111 | } |
102 | complete(&create->done); | 112 | complete(&create->done); |
103 | } | 113 | } |
@@ -221,7 +231,7 @@ int kthreadd(void *unused) | |||
221 | /* Setup a clean context for our children to inherit. */ | 231 | /* Setup a clean context for our children to inherit. */ |
222 | set_task_comm(tsk, "kthreadd"); | 232 | set_task_comm(tsk, "kthreadd"); |
223 | ignore_signals(tsk); | 233 | ignore_signals(tsk); |
224 | set_user_nice(tsk, -5); | 234 | set_user_nice(tsk, KTHREAD_NICE_LEVEL); |
225 | set_cpus_allowed(tsk, CPU_MASK_ALL); | 235 | set_cpus_allowed(tsk, CPU_MASK_ALL); |
226 | 236 | ||
227 | current->flags |= PF_NOFREEZE; | 237 | current->flags |= PF_NOFREEZE; |
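With kthreadd spawning every kernel thread, create_kthread() now explicitly resets each child to SCHED_NORMAL, the KTHREAD_NICE_LEVEL nice value and a full CPU mask, so a new thread does not inherit whatever policy or affinity root may have applied to kthreadd itself. A rough userspace analogue of the same reset, using the POSIX scheduling calls rather than the kernel-internal helpers (illustrative only):

    #define _GNU_SOURCE
    #include <sched.h>
    #include <stdio.h>
    #include <sys/resource.h>

    int main(void)
    {
        struct sched_param param = { .sched_priority = 0 };
        cpu_set_t all;
        int cpu;

        /* back to the default time-sharing policy, like SCHED_NORMAL */
        if (sched_setscheduler(0, SCHED_OTHER, &param))
            perror("sched_setscheduler");

        /* modest boost, like KTHREAD_NICE_LEVEL (-5); needs CAP_SYS_NICE */
        if (setpriority(PRIO_PROCESS, 0, -5))
            perror("setpriority");

        /* allow every CPU again, like set_cpus_allowed(..., CPU_MASK_ALL) */
        CPU_ZERO(&all);
        for (cpu = 0; cpu < CPU_SETSIZE; cpu++)
            CPU_SET(cpu, &all);
        if (sched_setaffinity(0, sizeof(all), &all))
            perror("sched_setaffinity");

        return 0;
    }
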
diff --git a/kernel/latencytop.c b/kernel/latencytop.c new file mode 100644 index 000000000000..b4e3c85abe74 --- /dev/null +++ b/kernel/latencytop.c | |||
@@ -0,0 +1,239 @@ | |||
1 | /* | ||
2 | * latencytop.c: Latency display infrastructure | ||
3 | * | ||
4 | * (C) Copyright 2008 Intel Corporation | ||
5 | * Author: Arjan van de Ven <arjan@linux.intel.com> | ||
6 | * | ||
7 | * This program is free software; you can redistribute it and/or | ||
8 | * modify it under the terms of the GNU General Public License | ||
9 | * as published by the Free Software Foundation; version 2 | ||
10 | * of the License. | ||
11 | */ | ||
12 | #include <linux/latencytop.h> | ||
13 | #include <linux/kallsyms.h> | ||
14 | #include <linux/seq_file.h> | ||
15 | #include <linux/notifier.h> | ||
16 | #include <linux/spinlock.h> | ||
17 | #include <linux/proc_fs.h> | ||
18 | #include <linux/module.h> | ||
19 | #include <linux/sched.h> | ||
20 | #include <linux/list.h> | ||
21 | #include <linux/slab.h> | ||
22 | #include <linux/stacktrace.h> | ||
23 | |||
24 | static DEFINE_SPINLOCK(latency_lock); | ||
25 | |||
26 | #define MAXLR 128 | ||
27 | static struct latency_record latency_record[MAXLR]; | ||
28 | |||
29 | int latencytop_enabled; | ||
30 | |||
31 | void clear_all_latency_tracing(struct task_struct *p) | ||
32 | { | ||
33 | unsigned long flags; | ||
34 | |||
35 | if (!latencytop_enabled) | ||
36 | return; | ||
37 | |||
38 | spin_lock_irqsave(&latency_lock, flags); | ||
39 | memset(&p->latency_record, 0, sizeof(p->latency_record)); | ||
40 | p->latency_record_count = 0; | ||
41 | spin_unlock_irqrestore(&latency_lock, flags); | ||
42 | } | ||
43 | |||
44 | static void clear_global_latency_tracing(void) | ||
45 | { | ||
46 | unsigned long flags; | ||
47 | |||
48 | spin_lock_irqsave(&latency_lock, flags); | ||
49 | memset(&latency_record, 0, sizeof(latency_record)); | ||
50 | spin_unlock_irqrestore(&latency_lock, flags); | ||
51 | } | ||
52 | |||
53 | static void __sched | ||
54 | account_global_scheduler_latency(struct task_struct *tsk, struct latency_record *lat) | ||
55 | { | ||
56 | int firstnonnull = MAXLR + 1; | ||
57 | int i; | ||
58 | |||
59 | if (!latencytop_enabled) | ||
60 | return; | ||
61 | |||
62 | /* skip kernel threads for now */ | ||
63 | if (!tsk->mm) | ||
64 | return; | ||
65 | |||
66 | for (i = 0; i < MAXLR; i++) { | ||
67 | int q; | ||
68 | int same = 1; | ||
69 | /* Nothing stored: */ | ||
70 | if (!latency_record[i].backtrace[0]) { | ||
71 | if (firstnonnull > i) | ||
72 | firstnonnull = i; | ||
73 | continue; | ||
74 | } | ||
75 | for (q = 0 ; q < LT_BACKTRACEDEPTH ; q++) { | ||
76 | if (latency_record[i].backtrace[q] != | ||
77 | lat->backtrace[q]) | ||
78 | same = 0; | ||
79 | if (same && lat->backtrace[q] == 0) | ||
80 | break; | ||
81 | if (same && lat->backtrace[q] == ULONG_MAX) | ||
82 | break; | ||
83 | } | ||
84 | if (same) { | ||
85 | latency_record[i].count++; | ||
86 | latency_record[i].time += lat->time; | ||
87 | if (lat->time > latency_record[i].max) | ||
88 | latency_record[i].max = lat->time; | ||
89 | return; | ||
90 | } | ||
91 | } | ||
92 | |||
93 | i = firstnonnull; | ||
94 | if (i >= MAXLR - 1) | ||
95 | return; | ||
96 | |||
97 | /* Allocated a new one: */ | ||
98 | memcpy(&latency_record[i], lat, sizeof(struct latency_record)); | ||
99 | } | ||
100 | |||
101 | static inline void store_stacktrace(struct task_struct *tsk, struct latency_record *lat) | ||
102 | { | ||
103 | struct stack_trace trace; | ||
104 | |||
105 | memset(&trace, 0, sizeof(trace)); | ||
106 | trace.max_entries = LT_BACKTRACEDEPTH; | ||
107 | trace.entries = &lat->backtrace[0]; | ||
108 | trace.skip = 0; | ||
109 | save_stack_trace_tsk(tsk, &trace); | ||
110 | } | ||
111 | |||
112 | void __sched | ||
113 | account_scheduler_latency(struct task_struct *tsk, int usecs, int inter) | ||
114 | { | ||
115 | unsigned long flags; | ||
116 | int i, q; | ||
117 | struct latency_record lat; | ||
118 | |||
119 | if (!latencytop_enabled) | ||
120 | return; | ||
121 | |||
122 | /* Long interruptible waits are generally user requested... */ | ||
123 | if (inter && usecs > 5000) | ||
124 | return; | ||
125 | |||
126 | memset(&lat, 0, sizeof(lat)); | ||
127 | lat.count = 1; | ||
128 | lat.time = usecs; | ||
129 | lat.max = usecs; | ||
130 | store_stacktrace(tsk, &lat); | ||
131 | |||
132 | spin_lock_irqsave(&latency_lock, flags); | ||
133 | |||
134 | account_global_scheduler_latency(tsk, &lat); | ||
135 | |||
136 | /* | ||
137 | * short term hack; if we're > 32 we stop; future we recycle: | ||
138 | */ | ||
139 | tsk->latency_record_count++; | ||
140 | if (tsk->latency_record_count >= LT_SAVECOUNT) | ||
141 | goto out_unlock; | ||
142 | |||
143 | for (i = 0; i < LT_SAVECOUNT ; i++) { | ||
144 | struct latency_record *mylat; | ||
145 | int same = 1; | ||
146 | mylat = &tsk->latency_record[i]; | ||
147 | for (q = 0 ; q < LT_BACKTRACEDEPTH ; q++) { | ||
148 | if (mylat->backtrace[q] != | ||
149 | lat.backtrace[q]) | ||
150 | same = 0; | ||
151 | if (same && lat.backtrace[q] == 0) | ||
152 | break; | ||
153 | if (same && lat.backtrace[q] == ULONG_MAX) | ||
154 | break; | ||
155 | } | ||
156 | if (same) { | ||
157 | mylat->count++; | ||
158 | mylat->time += lat.time; | ||
159 | if (lat.time > mylat->max) | ||
160 | mylat->max = lat.time; | ||
161 | goto out_unlock; | ||
162 | } | ||
163 | } | ||
164 | |||
165 | /* Allocated a new one: */ | ||
166 | i = tsk->latency_record_count; | ||
167 | memcpy(&tsk->latency_record[i], &lat, sizeof(struct latency_record)); | ||
168 | |||
169 | out_unlock: | ||
170 | spin_unlock_irqrestore(&latency_lock, flags); | ||
171 | } | ||
172 | |||
173 | static int lstats_show(struct seq_file *m, void *v) | ||
174 | { | ||
175 | int i; | ||
176 | |||
177 | seq_puts(m, "Latency Top version : v0.1\n"); | ||
178 | |||
179 | for (i = 0; i < MAXLR; i++) { | ||
180 | if (latency_record[i].backtrace[0]) { | ||
181 | int q; | ||
182 | seq_printf(m, "%i %li %li ", | ||
183 | latency_record[i].count, | ||
184 | latency_record[i].time, | ||
185 | latency_record[i].max); | ||
186 | for (q = 0; q < LT_BACKTRACEDEPTH; q++) { | ||
187 | char sym[KSYM_NAME_LEN]; | ||
188 | char *c; | ||
189 | if (!latency_record[i].backtrace[q]) | ||
190 | break; | ||
191 | if (latency_record[i].backtrace[q] == ULONG_MAX) | ||
192 | break; | ||
193 | sprint_symbol(sym, latency_record[i].backtrace[q]); | ||
194 | c = strchr(sym, '+'); | ||
195 | if (c) | ||
196 | *c = 0; | ||
197 | seq_printf(m, "%s ", sym); | ||
198 | } | ||
199 | seq_printf(m, "\n"); | ||
200 | } | ||
201 | } | ||
202 | return 0; | ||
203 | } | ||
204 | |||
205 | static ssize_t | ||
206 | lstats_write(struct file *file, const char __user *buf, size_t count, | ||
207 | loff_t *offs) | ||
208 | { | ||
209 | clear_global_latency_tracing(); | ||
210 | |||
211 | return count; | ||
212 | } | ||
213 | |||
214 | static int lstats_open(struct inode *inode, struct file *filp) | ||
215 | { | ||
216 | return single_open(filp, lstats_show, NULL); | ||
217 | } | ||
218 | |||
219 | static struct file_operations lstats_fops = { | ||
220 | .open = lstats_open, | ||
221 | .read = seq_read, | ||
222 | .write = lstats_write, | ||
223 | .llseek = seq_lseek, | ||
224 | .release = single_release, | ||
225 | }; | ||
226 | |||
227 | static int __init init_lstats_procfs(void) | ||
228 | { | ||
229 | struct proc_dir_entry *pe; | ||
230 | |||
231 | pe = create_proc_entry("latency_stats", 0644, NULL); | ||
232 | if (!pe) | ||
233 | return -ENOMEM; | ||
234 | |||
235 | pe->proc_fops = &lstats_fops; | ||
236 | |||
237 | return 0; | ||
238 | } | ||
239 | __initcall(init_lstats_procfs); | ||
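In account_global_scheduler_latency() above, a sample is merged into an existing record only when every backtrace entry matches up to the 0/ULONG_MAX terminator; otherwise it is copied into the first empty slot, and silently dropped once the table is full. A standalone sketch of that match-or-insert logic; the structure fields mirror the kernel's latency_record, but the program itself is only an illustration:

    #include <limits.h>
    #include <stdio.h>
    #include <string.h>

    #define LT_BACKTRACEDEPTH 12
    #define MAXLR 128

    struct latency_record {
        unsigned int count;
        unsigned long time, max;
        unsigned long backtrace[LT_BACKTRACEDEPTH];
    };

    static struct latency_record records[MAXLR];

    static void account(const struct latency_record *lat)
    {
        int i, firstnonnull = MAXLR + 1;

        for (i = 0; i < MAXLR; i++) {
            int q, same = 1;

            if (!records[i].backtrace[0]) {     /* empty slot */
                if (firstnonnull > i)
                    firstnonnull = i;
                continue;
            }
            for (q = 0; q < LT_BACKTRACEDEPTH; q++) {
                if (records[i].backtrace[q] != lat->backtrace[q])
                    same = 0;
                if (same && (lat->backtrace[q] == 0 ||
                             lat->backtrace[q] == ULONG_MAX))
                    break;                      /* terminator: full match */
            }
            if (same) {                         /* accumulate into the record */
                records[i].count++;
                records[i].time += lat->time;
                if (lat->time > records[i].max)
                    records[i].max = lat->time;
                return;
            }
        }
        if (firstnonnull < MAXLR - 1)           /* room left: start a new record */
            memcpy(&records[firstnonnull], lat, sizeof(*lat));
    }

    int main(void)
    {
        struct latency_record lat = { .count = 1, .time = 300, .max = 300,
                                      .backtrace = { 0x1234, 0x5678, 0 } };

        account(&lat);      /* creates the record */
        account(&lat);      /* merges into it */
        printf("count=%u time=%lu max=%lu\n",
               records[0].count, records[0].time, records[0].max);
        return 0;
    }
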
diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 55fe0c7cd95f..3574379f4d62 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c | |||
@@ -2424,7 +2424,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, | |||
2424 | return 0; | 2424 | return 0; |
2425 | 2425 | ||
2426 | /* | 2426 | /* |
2427 | * Calculate the chain hash: it's the combined has of all the | 2427 | * Calculate the chain hash: it's the combined hash of all the |
2428 | * lock keys along the dependency chain. We save the hash value | 2428 | * lock keys along the dependency chain. We save the hash value |
2429 | * at every step so that we can get the current hash easily | 2429 | * at every step so that we can get the current hash easily |
2430 | * after unlock. The chain hash is then used to cache dependency | 2430 | * after unlock. The chain hash is then used to cache dependency |
@@ -2654,10 +2654,15 @@ static void check_flags(unsigned long flags) | |||
2654 | if (!debug_locks) | 2654 | if (!debug_locks) |
2655 | return; | 2655 | return; |
2656 | 2656 | ||
2657 | if (irqs_disabled_flags(flags)) | 2657 | if (irqs_disabled_flags(flags)) { |
2658 | DEBUG_LOCKS_WARN_ON(current->hardirqs_enabled); | 2658 | if (DEBUG_LOCKS_WARN_ON(current->hardirqs_enabled)) { |
2659 | else | 2659 | printk("possible reason: unannotated irqs-off.\n"); |
2660 | DEBUG_LOCKS_WARN_ON(!current->hardirqs_enabled); | 2660 | } |
2661 | } else { | ||
2662 | if (DEBUG_LOCKS_WARN_ON(!current->hardirqs_enabled)) { | ||
2663 | printk("possible reason: unannotated irqs-on.\n"); | ||
2664 | } | ||
2665 | } | ||
2661 | 2666 | ||
2662 | /* | 2667 | /* |
2663 | * We don't accurately track softirq state in e.g. | 2668 | * We don't accurately track softirq state in e.g. |
@@ -2927,7 +2932,7 @@ static void zap_class(struct lock_class *class) | |||
2927 | 2932 | ||
2928 | } | 2933 | } |
2929 | 2934 | ||
2930 | static inline int within(void *addr, void *start, unsigned long size) | 2935 | static inline int within(const void *addr, void *start, unsigned long size) |
2931 | { | 2936 | { |
2932 | return addr >= start && addr < start + size; | 2937 | return addr >= start && addr < start + size; |
2933 | } | 2938 | } |
@@ -2938,9 +2943,10 @@ void lockdep_free_key_range(void *start, unsigned long size) | |||
2938 | struct list_head *head; | 2943 | struct list_head *head; |
2939 | unsigned long flags; | 2944 | unsigned long flags; |
2940 | int i; | 2945 | int i; |
2946 | int locked; | ||
2941 | 2947 | ||
2942 | raw_local_irq_save(flags); | 2948 | raw_local_irq_save(flags); |
2943 | graph_lock(); | 2949 | locked = graph_lock(); |
2944 | 2950 | ||
2945 | /* | 2951 | /* |
2946 | * Unhash all classes that were created by this module: | 2952 | * Unhash all classes that were created by this module: |
@@ -2949,12 +2955,16 @@ void lockdep_free_key_range(void *start, unsigned long size) | |||
2949 | head = classhash_table + i; | 2955 | head = classhash_table + i; |
2950 | if (list_empty(head)) | 2956 | if (list_empty(head)) |
2951 | continue; | 2957 | continue; |
2952 | list_for_each_entry_safe(class, next, head, hash_entry) | 2958 | list_for_each_entry_safe(class, next, head, hash_entry) { |
2953 | if (within(class->key, start, size)) | 2959 | if (within(class->key, start, size)) |
2954 | zap_class(class); | 2960 | zap_class(class); |
2961 | else if (within(class->name, start, size)) | ||
2962 | zap_class(class); | ||
2963 | } | ||
2955 | } | 2964 | } |
2956 | 2965 | ||
2957 | graph_unlock(); | 2966 | if (locked) |
2967 | graph_unlock(); | ||
2958 | raw_local_irq_restore(flags); | 2968 | raw_local_irq_restore(flags); |
2959 | } | 2969 | } |
2960 | 2970 | ||
@@ -2964,6 +2974,7 @@ void lockdep_reset_lock(struct lockdep_map *lock) | |||
2964 | struct list_head *head; | 2974 | struct list_head *head; |
2965 | unsigned long flags; | 2975 | unsigned long flags; |
2966 | int i, j; | 2976 | int i, j; |
2977 | int locked; | ||
2967 | 2978 | ||
2968 | raw_local_irq_save(flags); | 2979 | raw_local_irq_save(flags); |
2969 | 2980 | ||
@@ -2982,7 +2993,7 @@ void lockdep_reset_lock(struct lockdep_map *lock) | |||
2982 | * Debug check: in the end all mapped classes should | 2993 | * Debug check: in the end all mapped classes should |
2983 | * be gone. | 2994 | * be gone. |
2984 | */ | 2995 | */ |
2985 | graph_lock(); | 2996 | locked = graph_lock(); |
2986 | for (i = 0; i < CLASSHASH_SIZE; i++) { | 2997 | for (i = 0; i < CLASSHASH_SIZE; i++) { |
2987 | head = classhash_table + i; | 2998 | head = classhash_table + i; |
2988 | if (list_empty(head)) | 2999 | if (list_empty(head)) |
@@ -2995,7 +3006,8 @@ void lockdep_reset_lock(struct lockdep_map *lock) | |||
2995 | } | 3006 | } |
2996 | } | 3007 | } |
2997 | } | 3008 | } |
2998 | graph_unlock(); | 3009 | if (locked) |
3010 | graph_unlock(); | ||
2999 | 3011 | ||
3000 | out_restore: | 3012 | out_restore: |
3001 | raw_local_irq_restore(flags); | 3013 | raw_local_irq_restore(flags); |
@@ -3054,11 +3066,6 @@ void __init lockdep_info(void) | |||
3054 | #endif | 3066 | #endif |
3055 | } | 3067 | } |
3056 | 3068 | ||
3057 | static inline int in_range(const void *start, const void *addr, const void *end) | ||
3058 | { | ||
3059 | return addr >= start && addr <= end; | ||
3060 | } | ||
3061 | |||
3062 | static void | 3069 | static void |
3063 | print_freed_lock_bug(struct task_struct *curr, const void *mem_from, | 3070 | print_freed_lock_bug(struct task_struct *curr, const void *mem_from, |
3064 | const void *mem_to, struct held_lock *hlock) | 3071 | const void *mem_to, struct held_lock *hlock) |
@@ -3080,6 +3087,13 @@ print_freed_lock_bug(struct task_struct *curr, const void *mem_from, | |||
3080 | dump_stack(); | 3087 | dump_stack(); |
3081 | } | 3088 | } |
3082 | 3089 | ||
3090 | static inline int not_in_range(const void* mem_from, unsigned long mem_len, | ||
3091 | const void* lock_from, unsigned long lock_len) | ||
3092 | { | ||
3093 | return lock_from + lock_len <= mem_from || | ||
3094 | mem_from + mem_len <= lock_from; | ||
3095 | } | ||
3096 | |||
3083 | /* | 3097 | /* |
3084 | * Called when kernel memory is freed (or unmapped), or if a lock | 3098 | * Called when kernel memory is freed (or unmapped), or if a lock |
3085 | * is destroyed or reinitialized - this code checks whether there is | 3099 | * is destroyed or reinitialized - this code checks whether there is |
@@ -3087,7 +3101,6 @@ print_freed_lock_bug(struct task_struct *curr, const void *mem_from, | |||
3087 | */ | 3101 | */ |
3088 | void debug_check_no_locks_freed(const void *mem_from, unsigned long mem_len) | 3102 | void debug_check_no_locks_freed(const void *mem_from, unsigned long mem_len) |
3089 | { | 3103 | { |
3090 | const void *mem_to = mem_from + mem_len, *lock_from, *lock_to; | ||
3091 | struct task_struct *curr = current; | 3104 | struct task_struct *curr = current; |
3092 | struct held_lock *hlock; | 3105 | struct held_lock *hlock; |
3093 | unsigned long flags; | 3106 | unsigned long flags; |
@@ -3100,14 +3113,11 @@ void debug_check_no_locks_freed(const void *mem_from, unsigned long mem_len) | |||
3100 | for (i = 0; i < curr->lockdep_depth; i++) { | 3113 | for (i = 0; i < curr->lockdep_depth; i++) { |
3101 | hlock = curr->held_locks + i; | 3114 | hlock = curr->held_locks + i; |
3102 | 3115 | ||
3103 | lock_from = (void *)hlock->instance; | 3116 | if (not_in_range(mem_from, mem_len, hlock->instance, |
3104 | lock_to = (void *)(hlock->instance + 1); | 3117 | sizeof(*hlock->instance))) |
3105 | |||
3106 | if (!in_range(mem_from, lock_from, mem_to) && | ||
3107 | !in_range(mem_from, lock_to, mem_to)) | ||
3108 | continue; | 3118 | continue; |
3109 | 3119 | ||
3110 | print_freed_lock_bug(curr, mem_from, mem_to, hlock); | 3120 | print_freed_lock_bug(curr, mem_from, mem_from + mem_len, hlock); |
3111 | break; | 3121 | break; |
3112 | } | 3122 | } |
3113 | local_irq_restore(flags); | 3123 | local_irq_restore(flags); |
@@ -3173,6 +3183,13 @@ retry: | |||
3173 | printk(" locked it.\n"); | 3183 | printk(" locked it.\n"); |
3174 | 3184 | ||
3175 | do_each_thread(g, p) { | 3185 | do_each_thread(g, p) { |
3186 | /* | ||
3187 | * It's not reliable to print a task's held locks | ||
3188 | * if it's not sleeping (or if it's not the current | ||
3189 | * task): | ||
3190 | */ | ||
3191 | if (p->state == TASK_RUNNING && p != current) | ||
3192 | continue; | ||
3176 | if (p->lockdep_depth) | 3193 | if (p->lockdep_depth) |
3177 | lockdep_print_held_locks(p); | 3194 | lockdep_print_held_locks(p); |
3178 | if (!unlock) | 3195 | if (!unlock) |
@@ -3189,7 +3206,11 @@ retry: | |||
3189 | 3206 | ||
3190 | EXPORT_SYMBOL_GPL(debug_show_all_locks); | 3207 | EXPORT_SYMBOL_GPL(debug_show_all_locks); |
3191 | 3208 | ||
3192 | void debug_show_held_locks(struct task_struct *task) | 3209 | /* |
3210 | * Careful: only use this function if you are sure that | ||
3211 | * the task cannot run in parallel! | ||
3212 | */ | ||
3213 | void __debug_show_held_locks(struct task_struct *task) | ||
3193 | { | 3214 | { |
3194 | if (unlikely(!debug_locks)) { | 3215 | if (unlikely(!debug_locks)) { |
3195 | printk("INFO: lockdep is turned off.\n"); | 3216 | printk("INFO: lockdep is turned off.\n"); |
@@ -3197,6 +3218,12 @@ void debug_show_held_locks(struct task_struct *task) | |||
3197 | } | 3218 | } |
3198 | lockdep_print_held_locks(task); | 3219 | lockdep_print_held_locks(task); |
3199 | } | 3220 | } |
3221 | EXPORT_SYMBOL_GPL(__debug_show_held_locks); | ||
3222 | |||
3223 | void debug_show_held_locks(struct task_struct *task) | ||
3224 | { | ||
3225 | __debug_show_held_locks(task); | ||
3226 | } | ||
3200 | 3227 | ||
3201 | EXPORT_SYMBOL_GPL(debug_show_held_locks); | 3228 | EXPORT_SYMBOL_GPL(debug_show_held_locks); |
3202 | 3229 | ||
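The lockdep change replaces the old end-point test in debug_check_no_locks_freed() with not_in_range(), which checks genuine interval overlap: two ranges are disjoint exactly when one ends at or before the start of the other. A tiny standalone check of that predicate (the buffer and assertions are illustrative, not taken from the kernel):

    #include <assert.h>
    #include <stdio.h>

    /* 1 iff [lock_from, lock_from+lock_len) and [mem_from, mem_from+mem_len) do not overlap */
    static int not_in_range(const void *mem_from, unsigned long mem_len,
                            const void *lock_from, unsigned long lock_len)
    {
        return (const char *)lock_from + lock_len <= (const char *)mem_from ||
               (const char *)mem_from + mem_len <= (const char *)lock_from;
    }

    int main(void)
    {
        char buf[64];

        /* lock fully inside the freed region: overlap */
        assert(!not_in_range(buf, 64, buf + 16, 8));
        /* freed region fully inside the lock: the old end-point test missed this */
        assert(!not_in_range(buf + 16, 8, buf, 64));
        /* adjacent but disjoint ranges */
        assert(not_in_range(buf, 16, buf + 16, 8));

        puts("overlap checks ok");
        return 0;
    }
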
diff --git a/kernel/marker.c b/kernel/marker.c index ccb48d9a3657..5323cfaedbce 100644 --- a/kernel/marker.c +++ b/kernel/marker.c | |||
@@ -28,7 +28,7 @@ extern struct marker __start___markers[]; | |||
28 | extern struct marker __stop___markers[]; | 28 | extern struct marker __stop___markers[]; |
29 | 29 | ||
30 | /* | 30 | /* |
31 | * module_mutex nests inside markers_mutex. Markers mutex protects the builtin | 31 | * markers_mutex nests inside module_mutex. Markers mutex protects the builtin |
32 | * and module markers, the hash table and deferred_sync. | 32 | * and module markers, the hash table and deferred_sync. |
33 | */ | 33 | */ |
34 | static DEFINE_MUTEX(markers_mutex); | 34 | static DEFINE_MUTEX(markers_mutex); |
@@ -257,7 +257,6 @@ static void disable_marker(struct marker *elem) | |||
257 | * @refcount: number of references left to the given probe_module (out) | 257 | * @refcount: number of references left to the given probe_module (out) |
258 | * | 258 | * |
259 | * Updates the probe callback corresponding to a range of markers. | 259 | * Updates the probe callback corresponding to a range of markers. |
260 | * Must be called with markers_mutex held. | ||
261 | */ | 260 | */ |
262 | void marker_update_probe_range(struct marker *begin, | 261 | void marker_update_probe_range(struct marker *begin, |
263 | struct marker *end, struct module *probe_module, | 262 | struct marker *end, struct module *probe_module, |
@@ -266,6 +265,7 @@ void marker_update_probe_range(struct marker *begin, | |||
266 | struct marker *iter; | 265 | struct marker *iter; |
267 | struct marker_entry *mark_entry; | 266 | struct marker_entry *mark_entry; |
268 | 267 | ||
268 | mutex_lock(&markers_mutex); | ||
269 | for (iter = begin; iter < end; iter++) { | 269 | for (iter = begin; iter < end; iter++) { |
270 | mark_entry = get_marker(iter->name); | 270 | mark_entry = get_marker(iter->name); |
271 | if (mark_entry && mark_entry->refcount) { | 271 | if (mark_entry && mark_entry->refcount) { |
@@ -281,6 +281,7 @@ void marker_update_probe_range(struct marker *begin, | |||
281 | disable_marker(iter); | 281 | disable_marker(iter); |
282 | } | 282 | } |
283 | } | 283 | } |
284 | mutex_unlock(&markers_mutex); | ||
284 | } | 285 | } |
285 | 286 | ||
286 | /* | 287 | /* |
@@ -293,7 +294,6 @@ static void marker_update_probes(struct module *probe_module) | |||
293 | { | 294 | { |
294 | int refcount = 0; | 295 | int refcount = 0; |
295 | 296 | ||
296 | mutex_lock(&markers_mutex); | ||
297 | /* Core kernel markers */ | 297 | /* Core kernel markers */ |
298 | marker_update_probe_range(__start___markers, | 298 | marker_update_probe_range(__start___markers, |
299 | __stop___markers, probe_module, &refcount); | 299 | __stop___markers, probe_module, &refcount); |
@@ -303,7 +303,6 @@ static void marker_update_probes(struct module *probe_module) | |||
303 | synchronize_sched(); | 303 | synchronize_sched(); |
304 | deferred_sync = 0; | 304 | deferred_sync = 0; |
305 | } | 305 | } |
306 | mutex_unlock(&markers_mutex); | ||
307 | } | 306 | } |
308 | 307 | ||
309 | /** | 308 | /** |
@@ -320,7 +319,7 @@ int marker_probe_register(const char *name, const char *format, | |||
320 | marker_probe_func *probe, void *private) | 319 | marker_probe_func *probe, void *private) |
321 | { | 320 | { |
322 | struct marker_entry *entry; | 321 | struct marker_entry *entry; |
323 | int ret = 0, need_update = 0; | 322 | int ret = 0; |
324 | 323 | ||
325 | mutex_lock(&markers_mutex); | 324 | mutex_lock(&markers_mutex); |
326 | entry = get_marker(name); | 325 | entry = get_marker(name); |
@@ -335,11 +334,11 @@ int marker_probe_register(const char *name, const char *format, | |||
335 | ret = add_marker(name, format, probe, private); | 334 | ret = add_marker(name, format, probe, private); |
336 | if (ret) | 335 | if (ret) |
337 | goto end; | 336 | goto end; |
338 | need_update = 1; | 337 | mutex_unlock(&markers_mutex); |
338 | marker_update_probes(NULL); | ||
339 | return ret; | ||
339 | end: | 340 | end: |
340 | mutex_unlock(&markers_mutex); | 341 | mutex_unlock(&markers_mutex); |
341 | if (need_update) | ||
342 | marker_update_probes(NULL); | ||
343 | return ret; | 342 | return ret; |
344 | } | 343 | } |
345 | EXPORT_SYMBOL_GPL(marker_probe_register); | 344 | EXPORT_SYMBOL_GPL(marker_probe_register); |
@@ -355,7 +354,6 @@ void *marker_probe_unregister(const char *name) | |||
355 | struct module *probe_module; | 354 | struct module *probe_module; |
356 | struct marker_entry *entry; | 355 | struct marker_entry *entry; |
357 | void *private; | 356 | void *private; |
358 | int need_update = 0; | ||
359 | 357 | ||
360 | mutex_lock(&markers_mutex); | 358 | mutex_lock(&markers_mutex); |
361 | entry = get_marker(name); | 359 | entry = get_marker(name); |
@@ -368,11 +366,11 @@ void *marker_probe_unregister(const char *name) | |||
368 | probe_module = __module_text_address((unsigned long)entry->probe); | 366 | probe_module = __module_text_address((unsigned long)entry->probe); |
369 | private = remove_marker(name); | 367 | private = remove_marker(name); |
370 | deferred_sync = 1; | 368 | deferred_sync = 1; |
371 | need_update = 1; | 369 | mutex_unlock(&markers_mutex); |
370 | marker_update_probes(probe_module); | ||
371 | return private; | ||
372 | end: | 372 | end: |
373 | mutex_unlock(&markers_mutex); | 373 | mutex_unlock(&markers_mutex); |
374 | if (need_update) | ||
375 | marker_update_probes(probe_module); | ||
376 | return private; | 374 | return private; |
377 | } | 375 | } |
378 | EXPORT_SYMBOL_GPL(marker_probe_unregister); | 376 | EXPORT_SYMBOL_GPL(marker_probe_unregister); |
@@ -392,7 +390,6 @@ void *marker_probe_unregister_private_data(void *private) | |||
392 | struct marker_entry *entry; | 390 | struct marker_entry *entry; |
393 | int found = 0; | 391 | int found = 0; |
394 | unsigned int i; | 392 | unsigned int i; |
395 | int need_update = 0; | ||
396 | 393 | ||
397 | mutex_lock(&markers_mutex); | 394 | mutex_lock(&markers_mutex); |
398 | for (i = 0; i < MARKER_TABLE_SIZE; i++) { | 395 | for (i = 0; i < MARKER_TABLE_SIZE; i++) { |
@@ -414,11 +411,11 @@ iter_end: | |||
414 | probe_module = __module_text_address((unsigned long)entry->probe); | 411 | probe_module = __module_text_address((unsigned long)entry->probe); |
415 | private = remove_marker(entry->name); | 412 | private = remove_marker(entry->name); |
416 | deferred_sync = 1; | 413 | deferred_sync = 1; |
417 | need_update = 1; | 414 | mutex_unlock(&markers_mutex); |
415 | marker_update_probes(probe_module); | ||
416 | return private; | ||
418 | end: | 417 | end: |
419 | mutex_unlock(&markers_mutex); | 418 | mutex_unlock(&markers_mutex); |
420 | if (need_update) | ||
421 | marker_update_probes(probe_module); | ||
422 | return private; | 419 | return private; |
423 | } | 420 | } |
424 | EXPORT_SYMBOL_GPL(marker_probe_unregister_private_data); | 421 | EXPORT_SYMBOL_GPL(marker_probe_unregister_private_data); |
@@ -434,7 +431,7 @@ EXPORT_SYMBOL_GPL(marker_probe_unregister_private_data); | |||
434 | int marker_arm(const char *name) | 431 | int marker_arm(const char *name) |
435 | { | 432 | { |
436 | struct marker_entry *entry; | 433 | struct marker_entry *entry; |
437 | int ret = 0, need_update = 0; | 434 | int ret = 0; |
438 | 435 | ||
439 | mutex_lock(&markers_mutex); | 436 | mutex_lock(&markers_mutex); |
440 | entry = get_marker(name); | 437 | entry = get_marker(name); |
@@ -447,11 +444,9 @@ int marker_arm(const char *name) | |||
447 | */ | 444 | */ |
448 | if (entry->refcount++) | 445 | if (entry->refcount++) |
449 | goto end; | 446 | goto end; |
450 | need_update = 1; | ||
451 | end: | 447 | end: |
452 | mutex_unlock(&markers_mutex); | 448 | mutex_unlock(&markers_mutex); |
453 | if (need_update) | 449 | marker_update_probes(NULL); |
454 | marker_update_probes(NULL); | ||
455 | return ret; | 450 | return ret; |
456 | } | 451 | } |
457 | EXPORT_SYMBOL_GPL(marker_arm); | 452 | EXPORT_SYMBOL_GPL(marker_arm); |
@@ -467,7 +462,7 @@ EXPORT_SYMBOL_GPL(marker_arm); | |||
467 | int marker_disarm(const char *name) | 462 | int marker_disarm(const char *name) |
468 | { | 463 | { |
469 | struct marker_entry *entry; | 464 | struct marker_entry *entry; |
470 | int ret = 0, need_update = 0; | 465 | int ret = 0; |
471 | 466 | ||
472 | mutex_lock(&markers_mutex); | 467 | mutex_lock(&markers_mutex); |
473 | entry = get_marker(name); | 468 | entry = get_marker(name); |
@@ -486,11 +481,9 @@ int marker_disarm(const char *name) | |||
486 | ret = -EPERM; | 481 | ret = -EPERM; |
487 | goto end; | 482 | goto end; |
488 | } | 483 | } |
489 | need_update = 1; | ||
490 | end: | 484 | end: |
491 | mutex_unlock(&markers_mutex); | 485 | mutex_unlock(&markers_mutex); |
492 | if (need_update) | 486 | marker_update_probes(NULL); |
493 | marker_update_probes(NULL); | ||
494 | return ret; | 487 | return ret; |
495 | } | 488 | } |
496 | EXPORT_SYMBOL_GPL(marker_disarm); | 489 | EXPORT_SYMBOL_GPL(marker_disarm); |
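The marker changes move marker_update_probes() outside markers_mutex: now that markers_mutex nests inside module_mutex, the update path (which walks modules under module_mutex) must run with the registry lock already released. A generic sketch of that drop-the-inner-lock-before-calling-out pattern using pthread mutexes; registry_lock and modules_lock are illustrative names, not the kernel's:

    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t modules_lock  = PTHREAD_MUTEX_INITIALIZER; /* outer */
    static pthread_mutex_t registry_lock = PTHREAD_MUTEX_INITIALIZER; /* inner */

    /* walks the module list; takes the outer lock */
    static void update_probes(void)
    {
        pthread_mutex_lock(&modules_lock);
        puts("updating probes under modules_lock");
        pthread_mutex_unlock(&modules_lock);
    }

    static int register_probe(void)
    {
        pthread_mutex_lock(&registry_lock);
        /* ... add the probe to the registry ... */
        pthread_mutex_unlock(&registry_lock);

        /* call out with no locks held, so the modules_lock -> registry_lock
         * ordering used elsewhere cannot deadlock against this path */
        update_probes();
        return 0;
    }

    int main(void)
    {
        return register_probe();
    }
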
diff --git a/kernel/module.c b/kernel/module.c index 3202c9950073..bd60278ee703 100644 --- a/kernel/module.c +++ b/kernel/module.c | |||
@@ -47,8 +47,6 @@ | |||
47 | #include <asm/cacheflush.h> | 47 | #include <asm/cacheflush.h> |
48 | #include <linux/license.h> | 48 | #include <linux/license.h> |
49 | 49 | ||
50 | extern int module_sysfs_initialized; | ||
51 | |||
52 | #if 0 | 50 | #if 0 |
53 | #define DEBUGP printk | 51 | #define DEBUGP printk |
54 | #else | 52 | #else |
@@ -67,6 +65,9 @@ extern int module_sysfs_initialized; | |||
67 | static DEFINE_MUTEX(module_mutex); | 65 | static DEFINE_MUTEX(module_mutex); |
68 | static LIST_HEAD(modules); | 66 | static LIST_HEAD(modules); |
69 | 67 | ||
68 | /* Waiting for a module to finish initializing? */ | ||
69 | static DECLARE_WAIT_QUEUE_HEAD(module_wq); | ||
70 | |||
70 | static BLOCKING_NOTIFIER_HEAD(module_notify_list); | 71 | static BLOCKING_NOTIFIER_HEAD(module_notify_list); |
71 | 72 | ||
72 | int register_module_notifier(struct notifier_block * nb) | 73 | int register_module_notifier(struct notifier_block * nb) |
@@ -81,12 +82,16 @@ int unregister_module_notifier(struct notifier_block * nb) | |||
81 | } | 82 | } |
82 | EXPORT_SYMBOL(unregister_module_notifier); | 83 | EXPORT_SYMBOL(unregister_module_notifier); |
83 | 84 | ||
84 | /* We require a truly strong try_module_get() */ | 85 | /* We require a truly strong try_module_get(): 0 means success; an error is |
86 | returned due to ongoing or failed initialization etc. */ | ||
85 | static inline int strong_try_module_get(struct module *mod) | 87 | static inline int strong_try_module_get(struct module *mod) |
86 | { | 88 | { |
87 | if (mod && mod->state == MODULE_STATE_COMING) | 89 | if (mod && mod->state == MODULE_STATE_COMING) |
90 | return -EBUSY; | ||
91 | if (try_module_get(mod)) | ||
88 | return 0; | 92 | return 0; |
89 | return try_module_get(mod); | 93 | else |
94 | return -ENOENT; | ||
90 | } | 95 | } |
91 | 96 | ||
92 | static inline void add_taint_module(struct module *mod, unsigned flag) | 97 | static inline void add_taint_module(struct module *mod, unsigned flag) |
@@ -425,6 +430,14 @@ static unsigned int find_pcpusec(Elf_Ehdr *hdr, | |||
425 | return find_sec(hdr, sechdrs, secstrings, ".data.percpu"); | 430 | return find_sec(hdr, sechdrs, secstrings, ".data.percpu"); |
426 | } | 431 | } |
427 | 432 | ||
433 | static void percpu_modcopy(void *pcpudest, const void *from, unsigned long size) | ||
434 | { | ||
435 | int cpu; | ||
436 | |||
437 | for_each_possible_cpu(cpu) | ||
438 | memcpy(pcpudest + per_cpu_offset(cpu), from, size); | ||
439 | } | ||
440 | |||
428 | static int percpu_modinit(void) | 441 | static int percpu_modinit(void) |
429 | { | 442 | { |
430 | pcpu_num_used = 2; | 443 | pcpu_num_used = 2; |
@@ -497,6 +510,8 @@ static struct module_attribute modinfo_##field = { \ | |||
497 | MODINFO_ATTR(version); | 510 | MODINFO_ATTR(version); |
498 | MODINFO_ATTR(srcversion); | 511 | MODINFO_ATTR(srcversion); |
499 | 512 | ||
513 | static char last_unloaded_module[MODULE_NAME_LEN+1]; | ||
514 | |||
500 | #ifdef CONFIG_MODULE_UNLOAD | 515 | #ifdef CONFIG_MODULE_UNLOAD |
501 | /* Init the unload section of the module. */ | 516 | /* Init the unload section of the module. */ |
502 | static void module_unload_init(struct module *mod) | 517 | static void module_unload_init(struct module *mod) |
@@ -538,11 +553,21 @@ static int already_uses(struct module *a, struct module *b) | |||
538 | static int use_module(struct module *a, struct module *b) | 553 | static int use_module(struct module *a, struct module *b) |
539 | { | 554 | { |
540 | struct module_use *use; | 555 | struct module_use *use; |
541 | int no_warn; | 556 | int no_warn, err; |
542 | 557 | ||
543 | if (b == NULL || already_uses(a, b)) return 1; | 558 | if (b == NULL || already_uses(a, b)) return 1; |
544 | 559 | ||
545 | if (!strong_try_module_get(b)) | 560 | /* If we're interrupted or time out, we fail. */ |
561 | if (wait_event_interruptible_timeout( | ||
562 | module_wq, (err = strong_try_module_get(b)) != -EBUSY, | ||
563 | 30 * HZ) <= 0) { | ||
564 | printk("%s: gave up waiting for init of module %s.\n", | ||
565 | a->name, b->name); | ||
566 | return 0; | ||
567 | } | ||
568 | |||
569 | /* If strong_try_module_get() returned a different error, we fail. */ | ||
570 | if (err) | ||
546 | return 0; | 571 | return 0; |
547 | 572 | ||
548 | DEBUGP("Allocating new usage for %s.\n", a->name); | 573 | DEBUGP("Allocating new usage for %s.\n", a->name); |
@@ -720,6 +745,8 @@ sys_delete_module(const char __user *name_user, unsigned int flags) | |||
720 | mod->exit(); | 745 | mod->exit(); |
721 | mutex_lock(&module_mutex); | 746 | mutex_lock(&module_mutex); |
722 | } | 747 | } |
748 | /* Store the name of the last unloaded module for diagnostic purposes */ | ||
749 | strlcpy(last_unloaded_module, mod->name, sizeof(last_unloaded_module)); | ||
723 | free_module(mod); | 750 | free_module(mod); |
724 | 751 | ||
725 | out: | 752 | out: |
@@ -813,7 +840,7 @@ static inline void module_unload_free(struct module *mod) | |||
813 | 840 | ||
814 | static inline int use_module(struct module *a, struct module *b) | 841 | static inline int use_module(struct module *a, struct module *b) |
815 | { | 842 | { |
816 | return strong_try_module_get(b); | 843 | return strong_try_module_get(b) == 0; |
817 | } | 844 | } |
818 | 845 | ||
819 | static inline void module_unload_init(struct module *mod) | 846 | static inline void module_unload_init(struct module *mod) |
@@ -952,7 +979,8 @@ static unsigned long resolve_symbol(Elf_Shdr *sechdrs, | |||
952 | ret = __find_symbol(name, &owner, &crc, | 979 | ret = __find_symbol(name, &owner, &crc, |
953 | !(mod->taints & TAINT_PROPRIETARY_MODULE)); | 980 | !(mod->taints & TAINT_PROPRIETARY_MODULE)); |
954 | if (ret) { | 981 | if (ret) { |
955 | /* use_module can fail due to OOM, or module unloading */ | 982 | /* use_module can fail due to OOM, |
983 | or module initialization or unloading */ | ||
956 | if (!check_version(sechdrs, versindex, name, mod, crc) || | 984 | if (!check_version(sechdrs, versindex, name, mod, crc) || |
957 | !use_module(mod, owner)) | 985 | !use_module(mod, owner)) |
958 | ret = 0; | 986 | ret = 0; |
@@ -1120,7 +1148,7 @@ static void add_notes_attrs(struct module *mod, unsigned int nsect, | |||
1120 | ++loaded; | 1148 | ++loaded; |
1121 | } | 1149 | } |
1122 | 1150 | ||
1123 | notes_attrs->dir = kobject_add_dir(&mod->mkobj.kobj, "notes"); | 1151 | notes_attrs->dir = kobject_create_and_add("notes", &mod->mkobj.kobj); |
1124 | if (!notes_attrs->dir) | 1152 | if (!notes_attrs->dir) |
1125 | goto out; | 1153 | goto out; |
1126 | 1154 | ||
@@ -1210,6 +1238,7 @@ void module_remove_modinfo_attrs(struct module *mod) | |||
1210 | int mod_sysfs_init(struct module *mod) | 1238 | int mod_sysfs_init(struct module *mod) |
1211 | { | 1239 | { |
1212 | int err; | 1240 | int err; |
1241 | struct kobject *kobj; | ||
1213 | 1242 | ||
1214 | if (!module_sysfs_initialized) { | 1243 | if (!module_sysfs_initialized) { |
1215 | printk(KERN_ERR "%s: module sysfs not initialized\n", | 1244 | printk(KERN_ERR "%s: module sysfs not initialized\n", |
@@ -1217,15 +1246,25 @@ int mod_sysfs_init(struct module *mod) | |||
1217 | err = -EINVAL; | 1246 | err = -EINVAL; |
1218 | goto out; | 1247 | goto out; |
1219 | } | 1248 | } |
1220 | memset(&mod->mkobj.kobj, 0, sizeof(mod->mkobj.kobj)); | 1249 | |
1221 | err = kobject_set_name(&mod->mkobj.kobj, "%s", mod->name); | 1250 | kobj = kset_find_obj(module_kset, mod->name); |
1222 | if (err) | 1251 | if (kobj) { |
1252 | printk(KERN_ERR "%s: module is already loaded\n", mod->name); | ||
1253 | kobject_put(kobj); | ||
1254 | err = -EINVAL; | ||
1223 | goto out; | 1255 | goto out; |
1224 | kobj_set_kset_s(&mod->mkobj, module_subsys); | 1256 | } |
1257 | |||
1225 | mod->mkobj.mod = mod; | 1258 | mod->mkobj.mod = mod; |
1226 | 1259 | ||
1227 | kobject_init(&mod->mkobj.kobj); | 1260 | memset(&mod->mkobj.kobj, 0, sizeof(mod->mkobj.kobj)); |
1261 | mod->mkobj.kobj.kset = module_kset; | ||
1262 | err = kobject_init_and_add(&mod->mkobj.kobj, &module_ktype, NULL, | ||
1263 | "%s", mod->name); | ||
1264 | if (err) | ||
1265 | kobject_put(&mod->mkobj.kobj); | ||
1228 | 1266 | ||
1267 | /* delay uevent until full sysfs population */ | ||
1229 | out: | 1268 | out: |
1230 | return err; | 1269 | return err; |
1231 | } | 1270 | } |
@@ -1236,12 +1275,7 @@ int mod_sysfs_setup(struct module *mod, | |||
1236 | { | 1275 | { |
1237 | int err; | 1276 | int err; |
1238 | 1277 | ||
1239 | /* delay uevent until full sysfs population */ | 1278 | mod->holders_dir = kobject_create_and_add("holders", &mod->mkobj.kobj); |
1240 | err = kobject_add(&mod->mkobj.kobj); | ||
1241 | if (err) | ||
1242 | goto out; | ||
1243 | |||
1244 | mod->holders_dir = kobject_add_dir(&mod->mkobj.kobj, "holders"); | ||
1245 | if (!mod->holders_dir) { | 1279 | if (!mod->holders_dir) { |
1246 | err = -ENOMEM; | 1280 | err = -ENOMEM; |
1247 | goto out_unreg; | 1281 | goto out_unreg; |
@@ -1261,11 +1295,9 @@ int mod_sysfs_setup(struct module *mod, | |||
1261 | out_unreg_param: | 1295 | out_unreg_param: |
1262 | module_param_sysfs_remove(mod); | 1296 | module_param_sysfs_remove(mod); |
1263 | out_unreg_holders: | 1297 | out_unreg_holders: |
1264 | kobject_unregister(mod->holders_dir); | 1298 | kobject_put(mod->holders_dir); |
1265 | out_unreg: | 1299 | out_unreg: |
1266 | kobject_del(&mod->mkobj.kobj); | ||
1267 | kobject_put(&mod->mkobj.kobj); | 1300 | kobject_put(&mod->mkobj.kobj); |
1268 | out: | ||
1269 | return err; | 1301 | return err; |
1270 | } | 1302 | } |
1271 | #endif | 1303 | #endif |
@@ -1274,9 +1306,20 @@ static void mod_kobject_remove(struct module *mod) | |||
1274 | { | 1306 | { |
1275 | module_remove_modinfo_attrs(mod); | 1307 | module_remove_modinfo_attrs(mod); |
1276 | module_param_sysfs_remove(mod); | 1308 | module_param_sysfs_remove(mod); |
1277 | kobject_unregister(mod->mkobj.drivers_dir); | 1309 | kobject_put(mod->mkobj.drivers_dir); |
1278 | kobject_unregister(mod->holders_dir); | 1310 | kobject_put(mod->holders_dir); |
1279 | kobject_unregister(&mod->mkobj.kobj); | 1311 | kobject_put(&mod->mkobj.kobj); |
1312 | } | ||
1313 | |||
1314 | /* | ||
1315 | * link the module while the whole machine is stopped with interrupts off | ||
1316 | * - this defends against kallsyms not taking locks | ||
1317 | */ | ||
1318 | static int __link_module(void *_mod) | ||
1319 | { | ||
1320 | struct module *mod = _mod; | ||
1321 | list_add(&mod->list, &modules); | ||
1322 | return 0; | ||
1280 | } | 1323 | } |
1281 | 1324 | ||
1282 | /* | 1325 | /* |
@@ -1328,7 +1371,7 @@ void *__symbol_get(const char *symbol) | |||
1328 | 1371 | ||
1329 | preempt_disable(); | 1372 | preempt_disable(); |
1330 | value = __find_symbol(symbol, &owner, &crc, 1); | 1373 | value = __find_symbol(symbol, &owner, &crc, 1); |
1331 | if (value && !strong_try_module_get(owner)) | 1374 | if (value && strong_try_module_get(owner) != 0) |
1332 | value = 0; | 1375 | value = 0; |
1333 | preempt_enable(); | 1376 | preempt_enable(); |
1334 | 1377 | ||
@@ -1369,7 +1412,7 @@ dup: | |||
1369 | return ret; | 1412 | return ret; |
1370 | } | 1413 | } |
1371 | 1414 | ||
1372 | /* Change all symbols so that sh_value encodes the pointer directly. */ | 1415 | /* Change all symbols so that st_value encodes the pointer directly. */ |
1373 | static int simplify_symbols(Elf_Shdr *sechdrs, | 1416 | static int simplify_symbols(Elf_Shdr *sechdrs, |
1374 | unsigned int symindex, | 1417 | unsigned int symindex, |
1375 | const char *strtab, | 1418 | const char *strtab, |
@@ -1882,16 +1925,16 @@ static struct module *load_module(void __user *umod, | |||
1882 | /* Now we've moved module, initialize linked lists, etc. */ | 1925 | /* Now we've moved module, initialize linked lists, etc. */ |
1883 | module_unload_init(mod); | 1926 | module_unload_init(mod); |
1884 | 1927 | ||
1885 | /* Initialize kobject, so we can reference it. */ | 1928 | /* add kobject, so we can reference it. */ |
1886 | err = mod_sysfs_init(mod); | 1929 | err = mod_sysfs_init(mod); |
1887 | if (err) | 1930 | if (err) |
1888 | goto cleanup; | 1931 | goto free_unload; |
1889 | 1932 | ||
1890 | /* Set up license info based on the info section */ | 1933 | /* Set up license info based on the info section */ |
1891 | set_license(mod, get_modinfo(sechdrs, infoindex, "license")); | 1934 | set_license(mod, get_modinfo(sechdrs, infoindex, "license")); |
1892 | 1935 | ||
1893 | if (strcmp(mod->name, "ndiswrapper") == 0) | 1936 | if (strcmp(mod->name, "ndiswrapper") == 0) |
1894 | add_taint(TAINT_PROPRIETARY_MODULE); | 1937 | add_taint_module(mod, TAINT_PROPRIETARY_MODULE); |
1895 | if (strcmp(mod->name, "driverloader") == 0) | 1938 | if (strcmp(mod->name, "driverloader") == 0) |
1896 | add_taint_module(mod, TAINT_PROPRIETARY_MODULE); | 1939 | add_taint_module(mod, TAINT_PROPRIETARY_MODULE); |
1897 | 1940 | ||
@@ -2021,6 +2064,11 @@ static struct module *load_module(void __user *umod, | |||
2021 | printk(KERN_WARNING "%s: Ignoring obsolete parameters\n", | 2064 | printk(KERN_WARNING "%s: Ignoring obsolete parameters\n", |
2022 | mod->name); | 2065 | mod->name); |
2023 | 2066 | ||
2067 | /* Now sew it into the lists so we can get lockdep and oops | ||
2068 | * info during argument parsing. No one should access us, since | ||
2069 | * strong_try_module_get() will fail. */ | ||
2070 | stop_machine_run(__link_module, mod, NR_CPUS); | ||
2071 | |||
2024 | /* Size of section 0 is 0, so this works well if no params */ | 2072 | /* Size of section 0 is 0, so this works well if no params */ |
2025 | err = parse_args(mod->name, mod->args, | 2073 | err = parse_args(mod->name, mod->args, |
2026 | (struct kernel_param *) | 2074 | (struct kernel_param *) |
@@ -2029,7 +2077,7 @@ static struct module *load_module(void __user *umod, | |||
2029 | / sizeof(struct kernel_param), | 2077 | / sizeof(struct kernel_param), |
2030 | NULL); | 2078 | NULL); |
2031 | if (err < 0) | 2079 | if (err < 0) |
2032 | goto arch_cleanup; | 2080 | goto unlink; |
2033 | 2081 | ||
2034 | err = mod_sysfs_setup(mod, | 2082 | err = mod_sysfs_setup(mod, |
2035 | (struct kernel_param *) | 2083 | (struct kernel_param *) |
@@ -2037,7 +2085,7 @@ static struct module *load_module(void __user *umod, | |||
2037 | sechdrs[setupindex].sh_size | 2085 | sechdrs[setupindex].sh_size |
2038 | / sizeof(struct kernel_param)); | 2086 | / sizeof(struct kernel_param)); |
2039 | if (err < 0) | 2087 | if (err < 0) |
2040 | goto arch_cleanup; | 2088 | goto unlink; |
2041 | add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs); | 2089 | add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs); |
2042 | add_notes_attrs(mod, hdr->e_shnum, secstrings, sechdrs); | 2090 | add_notes_attrs(mod, hdr->e_shnum, secstrings, sechdrs); |
2043 | 2091 | ||
@@ -2052,9 +2100,13 @@ static struct module *load_module(void __user *umod, | |||
2052 | /* Done! */ | 2100 | /* Done! */ |
2053 | return mod; | 2101 | return mod; |
2054 | 2102 | ||
2055 | arch_cleanup: | 2103 | unlink: |
2104 | stop_machine_run(__unlink_module, mod, NR_CPUS); | ||
2056 | module_arch_cleanup(mod); | 2105 | module_arch_cleanup(mod); |
2057 | cleanup: | 2106 | cleanup: |
2107 | kobject_del(&mod->mkobj.kobj); | ||
2108 | kobject_put(&mod->mkobj.kobj); | ||
2109 | free_unload: | ||
2058 | module_unload_free(mod); | 2110 | module_unload_free(mod); |
2059 | module_free(mod, mod->module_init); | 2111 | module_free(mod, mod->module_init); |
2060 | free_core: | 2112 | free_core: |
@@ -2074,17 +2126,6 @@ static struct module *load_module(void __user *umod, | |||
2074 | goto free_hdr; | 2126 | goto free_hdr; |
2075 | } | 2127 | } |
2076 | 2128 | ||
2077 | /* | ||
2078 | * link the module with the whole machine is stopped with interrupts off | ||
2079 | * - this defends against kallsyms not taking locks | ||
2080 | */ | ||
2081 | static int __link_module(void *_mod) | ||
2082 | { | ||
2083 | struct module *mod = _mod; | ||
2084 | list_add(&mod->list, &modules); | ||
2085 | return 0; | ||
2086 | } | ||
2087 | |||
2088 | /* This is where the real work happens */ | 2129 | /* This is where the real work happens */ |
2089 | asmlinkage long | 2130 | asmlinkage long |
2090 | sys_init_module(void __user *umod, | 2131 | sys_init_module(void __user *umod, |
@@ -2109,10 +2150,6 @@ sys_init_module(void __user *umod, | |||
2109 | return PTR_ERR(mod); | 2150 | return PTR_ERR(mod); |
2110 | } | 2151 | } |
2111 | 2152 | ||
2112 | /* Now sew it into the lists. They won't access us, since | ||
2113 | strong_try_module_get() will fail. */ | ||
2114 | stop_machine_run(__link_module, mod, NR_CPUS); | ||
2115 | |||
2116 | /* Drop lock so they can recurse */ | 2153 | /* Drop lock so they can recurse */ |
2117 | mutex_unlock(&module_mutex); | 2154 | mutex_unlock(&module_mutex); |
2118 | 2155 | ||
@@ -2131,6 +2168,7 @@ sys_init_module(void __user *umod, | |||
2131 | mutex_lock(&module_mutex); | 2168 | mutex_lock(&module_mutex); |
2132 | free_module(mod); | 2169 | free_module(mod); |
2133 | mutex_unlock(&module_mutex); | 2170 | mutex_unlock(&module_mutex); |
2171 | wake_up(&module_wq); | ||
2134 | return ret; | 2172 | return ret; |
2135 | } | 2173 | } |
2136 | 2174 | ||
@@ -2145,6 +2183,7 @@ sys_init_module(void __user *umod, | |||
2145 | mod->init_size = 0; | 2183 | mod->init_size = 0; |
2146 | mod->init_text_size = 0; | 2184 | mod->init_text_size = 0; |
2147 | mutex_unlock(&module_mutex); | 2185 | mutex_unlock(&module_mutex); |
2186 | wake_up(&module_wq); | ||
2148 | 2187 | ||
2149 | return 0; | 2188 | return 0; |
2150 | } | 2189 | } |
@@ -2209,32 +2248,41 @@ static const char *get_ksymbol(struct module *mod, | |||
2209 | return mod->strtab + mod->symtab[best].st_name; | 2248 | return mod->strtab + mod->symtab[best].st_name; |
2210 | } | 2249 | } |
2211 | 2250 | ||
2212 | /* For kallsyms to ask for address resolution. NULL means not found. | 2251 | /* For kallsyms to ask for address resolution. NULL means not found. Careful |
2213 | We don't lock, as this is used for oops resolution and races are a | 2252 | * not to lock to avoid deadlock on oopses, simply disable preemption. */ |
2214 | lesser concern. */ | 2253 | char *module_address_lookup(unsigned long addr, |
2215 | const char *module_address_lookup(unsigned long addr, | 2254 | unsigned long *size, |
2216 | unsigned long *size, | 2255 | unsigned long *offset, |
2217 | unsigned long *offset, | 2256 | char **modname, |
2218 | char **modname) | 2257 | char *namebuf) |
2219 | { | 2258 | { |
2220 | struct module *mod; | 2259 | struct module *mod; |
2260 | const char *ret = NULL; | ||
2221 | 2261 | ||
2262 | preempt_disable(); | ||
2222 | list_for_each_entry(mod, &modules, list) { | 2263 | list_for_each_entry(mod, &modules, list) { |
2223 | if (within(addr, mod->module_init, mod->init_size) | 2264 | if (within(addr, mod->module_init, mod->init_size) |
2224 | || within(addr, mod->module_core, mod->core_size)) { | 2265 | || within(addr, mod->module_core, mod->core_size)) { |
2225 | if (modname) | 2266 | if (modname) |
2226 | *modname = mod->name; | 2267 | *modname = mod->name; |
2227 | return get_ksymbol(mod, addr, size, offset); | 2268 | ret = get_ksymbol(mod, addr, size, offset); |
2269 | break; | ||
2228 | } | 2270 | } |
2229 | } | 2271 | } |
2230 | return NULL; | 2272 | /* Make a copy in here where it's safe */ |
2273 | if (ret) { | ||
2274 | strncpy(namebuf, ret, KSYM_NAME_LEN - 1); | ||
2275 | ret = namebuf; | ||
2276 | } | ||
2277 | preempt_enable(); | ||
2278 | return (char *)ret; | ||
2231 | } | 2279 | } |
2232 | 2280 | ||
2233 | int lookup_module_symbol_name(unsigned long addr, char *symname) | 2281 | int lookup_module_symbol_name(unsigned long addr, char *symname) |
2234 | { | 2282 | { |
2235 | struct module *mod; | 2283 | struct module *mod; |
2236 | 2284 | ||
2237 | mutex_lock(&module_mutex); | 2285 | preempt_disable(); |
2238 | list_for_each_entry(mod, &modules, list) { | 2286 | list_for_each_entry(mod, &modules, list) { |
2239 | if (within(addr, mod->module_init, mod->init_size) || | 2287 | if (within(addr, mod->module_init, mod->init_size) || |
2240 | within(addr, mod->module_core, mod->core_size)) { | 2288 | within(addr, mod->module_core, mod->core_size)) { |
@@ -2244,12 +2292,12 @@ int lookup_module_symbol_name(unsigned long addr, char *symname) | |||
2244 | if (!sym) | 2292 | if (!sym) |
2245 | goto out; | 2293 | goto out; |
2246 | strlcpy(symname, sym, KSYM_NAME_LEN); | 2294 | strlcpy(symname, sym, KSYM_NAME_LEN); |
2247 | mutex_unlock(&module_mutex); | 2295 | preempt_enable(); |
2248 | return 0; | 2296 | return 0; |
2249 | } | 2297 | } |
2250 | } | 2298 | } |
2251 | out: | 2299 | out: |
2252 | mutex_unlock(&module_mutex); | 2300 | preempt_enable(); |
2253 | return -ERANGE; | 2301 | return -ERANGE; |
2254 | } | 2302 | } |
2255 | 2303 | ||
@@ -2258,7 +2306,7 @@ int lookup_module_symbol_attrs(unsigned long addr, unsigned long *size, | |||
2258 | { | 2306 | { |
2259 | struct module *mod; | 2307 | struct module *mod; |
2260 | 2308 | ||
2261 | mutex_lock(&module_mutex); | 2309 | preempt_disable(); |
2262 | list_for_each_entry(mod, &modules, list) { | 2310 | list_for_each_entry(mod, &modules, list) { |
2263 | if (within(addr, mod->module_init, mod->init_size) || | 2311 | if (within(addr, mod->module_init, mod->init_size) || |
2264 | within(addr, mod->module_core, mod->core_size)) { | 2312 | within(addr, mod->module_core, mod->core_size)) { |
@@ -2271,12 +2319,12 @@ int lookup_module_symbol_attrs(unsigned long addr, unsigned long *size, | |||
2271 | strlcpy(modname, mod->name, MODULE_NAME_LEN); | 2319 | strlcpy(modname, mod->name, MODULE_NAME_LEN); |
2272 | if (name) | 2320 | if (name) |
2273 | strlcpy(name, sym, KSYM_NAME_LEN); | 2321 | strlcpy(name, sym, KSYM_NAME_LEN); |
2274 | mutex_unlock(&module_mutex); | 2322 | preempt_enable(); |
2275 | return 0; | 2323 | return 0; |
2276 | } | 2324 | } |
2277 | } | 2325 | } |
2278 | out: | 2326 | out: |
2279 | mutex_unlock(&module_mutex); | 2327 | preempt_enable(); |
2280 | return -ERANGE; | 2328 | return -ERANGE; |
2281 | } | 2329 | } |
2282 | 2330 | ||
@@ -2285,7 +2333,7 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type, | |||
2285 | { | 2333 | { |
2286 | struct module *mod; | 2334 | struct module *mod; |
2287 | 2335 | ||
2288 | mutex_lock(&module_mutex); | 2336 | preempt_disable(); |
2289 | list_for_each_entry(mod, &modules, list) { | 2337 | list_for_each_entry(mod, &modules, list) { |
2290 | if (symnum < mod->num_symtab) { | 2338 | if (symnum < mod->num_symtab) { |
2291 | *value = mod->symtab[symnum].st_value; | 2339 | *value = mod->symtab[symnum].st_value; |
@@ -2294,12 +2342,12 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type, | |||
2294 | KSYM_NAME_LEN); | 2342 | KSYM_NAME_LEN); |
2295 | strlcpy(module_name, mod->name, MODULE_NAME_LEN); | 2343 | strlcpy(module_name, mod->name, MODULE_NAME_LEN); |
2296 | *exported = is_exported(name, mod); | 2344 | *exported = is_exported(name, mod); |
2297 | mutex_unlock(&module_mutex); | 2345 | preempt_enable(); |
2298 | return 0; | 2346 | return 0; |
2299 | } | 2347 | } |
2300 | symnum -= mod->num_symtab; | 2348 | symnum -= mod->num_symtab; |
2301 | } | 2349 | } |
2302 | mutex_unlock(&module_mutex); | 2350 | preempt_enable(); |
2303 | return -ERANGE; | 2351 | return -ERANGE; |
2304 | } | 2352 | } |
2305 | 2353 | ||
@@ -2322,6 +2370,7 @@ unsigned long module_kallsyms_lookup_name(const char *name) | |||
2322 | unsigned long ret = 0; | 2370 | unsigned long ret = 0; |
2323 | 2371 | ||
2324 | /* Don't lock: we're in enough trouble already. */ | 2372 | /* Don't lock: we're in enough trouble already. */ |
2373 | preempt_disable(); | ||
2325 | if ((colon = strchr(name, ':')) != NULL) { | 2374 | if ((colon = strchr(name, ':')) != NULL) { |
2326 | *colon = '\0'; | 2375 | *colon = '\0'; |
2327 | if ((mod = find_module(name)) != NULL) | 2376 | if ((mod = find_module(name)) != NULL) |
@@ -2332,6 +2381,7 @@ unsigned long module_kallsyms_lookup_name(const char *name) | |||
2332 | if ((ret = mod_find_symname(mod, name)) != 0) | 2381 | if ((ret = mod_find_symname(mod, name)) != 0) |
2333 | break; | 2382 | break; |
2334 | } | 2383 | } |
2384 | preempt_enable(); | ||
2335 | return ret; | 2385 | return ret; |
2336 | } | 2386 | } |
2337 | #endif /* CONFIG_KALLSYMS */ | 2387 | #endif /* CONFIG_KALLSYMS */ |
@@ -2353,21 +2403,30 @@ static void m_stop(struct seq_file *m, void *p) | |||
2353 | mutex_unlock(&module_mutex); | 2403 | mutex_unlock(&module_mutex); |
2354 | } | 2404 | } |
2355 | 2405 | ||
2356 | static char *taint_flags(unsigned int taints, char *buf) | 2406 | static char *module_flags(struct module *mod, char *buf) |
2357 | { | 2407 | { |
2358 | int bx = 0; | 2408 | int bx = 0; |
2359 | 2409 | ||
2360 | if (taints) { | 2410 | if (mod->taints || |
2411 | mod->state == MODULE_STATE_GOING || | ||
2412 | mod->state == MODULE_STATE_COMING) { | ||
2361 | buf[bx++] = '('; | 2413 | buf[bx++] = '('; |
2362 | if (taints & TAINT_PROPRIETARY_MODULE) | 2414 | if (mod->taints & TAINT_PROPRIETARY_MODULE) |
2363 | buf[bx++] = 'P'; | 2415 | buf[bx++] = 'P'; |
2364 | if (taints & TAINT_FORCED_MODULE) | 2416 | if (mod->taints & TAINT_FORCED_MODULE) |
2365 | buf[bx++] = 'F'; | 2417 | buf[bx++] = 'F'; |
2366 | /* | 2418 | /* |
2367 | * TAINT_FORCED_RMMOD: could be added. | 2419 | * TAINT_FORCED_RMMOD: could be added. |
2368 | * TAINT_UNSAFE_SMP, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't | 2420 | * TAINT_UNSAFE_SMP, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't |
2369 | * apply to modules. | 2421 | * apply to modules. |
2370 | */ | 2422 | */ |
2423 | |||
2424 | /* Show a - for module-is-being-unloaded */ | ||
2425 | if (mod->state == MODULE_STATE_GOING) | ||
2426 | buf[bx++] = '-'; | ||
2427 | /* Show a + for module-is-being-loaded */ | ||
2428 | if (mod->state == MODULE_STATE_COMING) | ||
2429 | buf[bx++] = '+'; | ||
2371 | buf[bx++] = ')'; | 2430 | buf[bx++] = ')'; |
2372 | } | 2431 | } |
2373 | buf[bx] = '\0'; | 2432 | buf[bx] = '\0'; |
@@ -2394,7 +2453,7 @@ static int m_show(struct seq_file *m, void *p) | |||
2394 | 2453 | ||
2395 | /* Taints info */ | 2454 | /* Taints info */ |
2396 | if (mod->taints) | 2455 | if (mod->taints) |
2397 | seq_printf(m, " %s", taint_flags(mod->taints, buf)); | 2456 | seq_printf(m, " %s", module_flags(mod, buf)); |
2398 | 2457 | ||
2399 | seq_printf(m, "\n"); | 2458 | seq_printf(m, "\n"); |
2400 | return 0; | 2459 | return 0; |
@@ -2489,97 +2548,12 @@ void print_modules(void) | |||
2489 | 2548 | ||
2490 | printk("Modules linked in:"); | 2549 | printk("Modules linked in:"); |
2491 | list_for_each_entry(mod, &modules, list) | 2550 | list_for_each_entry(mod, &modules, list) |
2492 | printk(" %s%s", mod->name, taint_flags(mod->taints, buf)); | 2551 | printk(" %s%s", mod->name, module_flags(mod, buf)); |
2552 | if (last_unloaded_module[0]) | ||
2553 | printk(" [last unloaded: %s]", last_unloaded_module); | ||
2493 | printk("\n"); | 2554 | printk("\n"); |
2494 | } | 2555 | } |
2495 | 2556 | ||
2496 | #ifdef CONFIG_SYSFS | ||
2497 | static char *make_driver_name(struct device_driver *drv) | ||
2498 | { | ||
2499 | char *driver_name; | ||
2500 | |||
2501 | driver_name = kmalloc(strlen(drv->name) + strlen(drv->bus->name) + 2, | ||
2502 | GFP_KERNEL); | ||
2503 | if (!driver_name) | ||
2504 | return NULL; | ||
2505 | |||
2506 | sprintf(driver_name, "%s:%s", drv->bus->name, drv->name); | ||
2507 | return driver_name; | ||
2508 | } | ||
2509 | |||
2510 | static void module_create_drivers_dir(struct module_kobject *mk) | ||
2511 | { | ||
2512 | if (!mk || mk->drivers_dir) | ||
2513 | return; | ||
2514 | |||
2515 | mk->drivers_dir = kobject_add_dir(&mk->kobj, "drivers"); | ||
2516 | } | ||
2517 | |||
2518 | void module_add_driver(struct module *mod, struct device_driver *drv) | ||
2519 | { | ||
2520 | char *driver_name; | ||
2521 | int no_warn; | ||
2522 | struct module_kobject *mk = NULL; | ||
2523 | |||
2524 | if (!drv) | ||
2525 | return; | ||
2526 | |||
2527 | if (mod) | ||
2528 | mk = &mod->mkobj; | ||
2529 | else if (drv->mod_name) { | ||
2530 | struct kobject *mkobj; | ||
2531 | |||
2532 | /* Lookup built-in module entry in /sys/modules */ | ||
2533 | mkobj = kset_find_obj(&module_subsys, drv->mod_name); | ||
2534 | if (mkobj) { | ||
2535 | mk = container_of(mkobj, struct module_kobject, kobj); | ||
2536 | /* remember our module structure */ | ||
2537 | drv->mkobj = mk; | ||
2538 | /* kset_find_obj took a reference */ | ||
2539 | kobject_put(mkobj); | ||
2540 | } | ||
2541 | } | ||
2542 | |||
2543 | if (!mk) | ||
2544 | return; | ||
2545 | |||
2546 | /* Don't check return codes; these calls are idempotent */ | ||
2547 | no_warn = sysfs_create_link(&drv->kobj, &mk->kobj, "module"); | ||
2548 | driver_name = make_driver_name(drv); | ||
2549 | if (driver_name) { | ||
2550 | module_create_drivers_dir(mk); | ||
2551 | no_warn = sysfs_create_link(mk->drivers_dir, &drv->kobj, | ||
2552 | driver_name); | ||
2553 | kfree(driver_name); | ||
2554 | } | ||
2555 | } | ||
2556 | EXPORT_SYMBOL(module_add_driver); | ||
2557 | |||
2558 | void module_remove_driver(struct device_driver *drv) | ||
2559 | { | ||
2560 | struct module_kobject *mk = NULL; | ||
2561 | char *driver_name; | ||
2562 | |||
2563 | if (!drv) | ||
2564 | return; | ||
2565 | |||
2566 | sysfs_remove_link(&drv->kobj, "module"); | ||
2567 | |||
2568 | if (drv->owner) | ||
2569 | mk = &drv->owner->mkobj; | ||
2570 | else if (drv->mkobj) | ||
2571 | mk = drv->mkobj; | ||
2572 | if (mk && mk->drivers_dir) { | ||
2573 | driver_name = make_driver_name(drv); | ||
2574 | if (driver_name) { | ||
2575 | sysfs_remove_link(mk->drivers_dir, driver_name); | ||
2576 | kfree(driver_name); | ||
2577 | } | ||
2578 | } | ||
2579 | } | ||
2580 | EXPORT_SYMBOL(module_remove_driver); | ||
2581 | #endif | ||
2582 | |||
2583 | #ifdef CONFIG_MODVERSIONS | 2557 | #ifdef CONFIG_MODVERSIONS |
2584 | /* Generate the signature for struct module here, too, for modversions. */ | 2558 | /* Generate the signature for struct module here, too, for modversions. */ |
2585 | void struct_module(struct module *mod) { return; } | 2559 | void struct_module(struct module *mod) { return; } |
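The module.c hunk above folds the module's load/unload state into the same flag string as its taint marks, renames the helper from taint_flags() to module_flags(), and reuses it in both /proc/modules (m_show) and print_modules(), which now also reports the last unloaded module. As a rough sketch only, reconstructed from the fragments visible in this hunk (the taint-character handling is abbreviated and the opening parenthesis is inferred from the closing one), the helper takes roughly this shape inside kernel/module.c:

/* Hedged sketch, not the patch's exact implementation. Assumes buf has
 * room for the handful of single-character flags. */
static char *module_flags_sketch(struct module *mod, char *buf)
{
	int bx = 0;

	if (mod->taints || mod->state == MODULE_STATE_GOING ||
	    mod->state == MODULE_STATE_COMING) {
		buf[bx++] = '(';
		/* ... taint characters such as 'P' or 'F' would go here ... */
		if (mod->state == MODULE_STATE_GOING)	/* being unloaded */
			buf[bx++] = '-';
		if (mod->state == MODULE_STATE_COMING)	/* still loading */
			buf[bx++] = '+';
		buf[bx++] = ')';
	}
	buf[bx] = '\0';
	return buf;
}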
diff --git a/kernel/mutex.c b/kernel/mutex.c index d7fe50cc556f..d9ec9b666250 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c | |||
@@ -166,9 +166,12 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, | |||
166 | * got a signal? (This code gets eliminated in the | 166 | * got a signal? (This code gets eliminated in the |
167 | * TASK_UNINTERRUPTIBLE case.) | 167 | * TASK_UNINTERRUPTIBLE case.) |
168 | */ | 168 | */ |
169 | if (unlikely(state == TASK_INTERRUPTIBLE && | 169 | if (unlikely((state == TASK_INTERRUPTIBLE && |
170 | signal_pending(task))) { | 170 | signal_pending(task)) || |
171 | mutex_remove_waiter(lock, &waiter, task_thread_info(task)); | 171 | (state == TASK_KILLABLE && |
172 | fatal_signal_pending(task)))) { | ||
173 | mutex_remove_waiter(lock, &waiter, | ||
174 | task_thread_info(task)); | ||
172 | mutex_release(&lock->dep_map, 1, ip); | 175 | mutex_release(&lock->dep_map, 1, ip); |
173 | spin_unlock_mutex(&lock->wait_lock, flags); | 176 | spin_unlock_mutex(&lock->wait_lock, flags); |
174 | 177 | ||
@@ -211,6 +214,14 @@ mutex_lock_nested(struct mutex *lock, unsigned int subclass) | |||
211 | EXPORT_SYMBOL_GPL(mutex_lock_nested); | 214 | EXPORT_SYMBOL_GPL(mutex_lock_nested); |
212 | 215 | ||
213 | int __sched | 216 | int __sched |
217 | mutex_lock_killable_nested(struct mutex *lock, unsigned int subclass) | ||
218 | { | ||
219 | might_sleep(); | ||
220 | return __mutex_lock_common(lock, TASK_KILLABLE, subclass, _RET_IP_); | ||
221 | } | ||
222 | EXPORT_SYMBOL_GPL(mutex_lock_killable_nested); | ||
223 | |||
224 | int __sched | ||
214 | mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass) | 225 | mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass) |
215 | { | 226 | { |
216 | might_sleep(); | 227 | might_sleep(); |
@@ -272,6 +283,9 @@ __mutex_unlock_slowpath(atomic_t *lock_count) | |||
272 | * mutex_lock_interruptible() and mutex_trylock(). | 283 | * mutex_lock_interruptible() and mutex_trylock(). |
273 | */ | 284 | */ |
274 | static int fastcall noinline __sched | 285 | static int fastcall noinline __sched |
286 | __mutex_lock_killable_slowpath(atomic_t *lock_count); | ||
287 | |||
288 | static noinline int fastcall __sched | ||
275 | __mutex_lock_interruptible_slowpath(atomic_t *lock_count); | 289 | __mutex_lock_interruptible_slowpath(atomic_t *lock_count); |
276 | 290 | ||
277 | /*** | 291 | /*** |
@@ -294,6 +308,14 @@ int fastcall __sched mutex_lock_interruptible(struct mutex *lock) | |||
294 | 308 | ||
295 | EXPORT_SYMBOL(mutex_lock_interruptible); | 309 | EXPORT_SYMBOL(mutex_lock_interruptible); |
296 | 310 | ||
311 | int fastcall __sched mutex_lock_killable(struct mutex *lock) | ||
312 | { | ||
313 | might_sleep(); | ||
314 | return __mutex_fastpath_lock_retval | ||
315 | (&lock->count, __mutex_lock_killable_slowpath); | ||
316 | } | ||
317 | EXPORT_SYMBOL(mutex_lock_killable); | ||
318 | |||
297 | static void fastcall noinline __sched | 319 | static void fastcall noinline __sched |
298 | __mutex_lock_slowpath(atomic_t *lock_count) | 320 | __mutex_lock_slowpath(atomic_t *lock_count) |
299 | { | 321 | { |
@@ -303,6 +325,14 @@ __mutex_lock_slowpath(atomic_t *lock_count) | |||
303 | } | 325 | } |
304 | 326 | ||
305 | static int fastcall noinline __sched | 327 | static int fastcall noinline __sched |
328 | __mutex_lock_killable_slowpath(atomic_t *lock_count) | ||
329 | { | ||
330 | struct mutex *lock = container_of(lock_count, struct mutex, count); | ||
331 | |||
332 | return __mutex_lock_common(lock, TASK_KILLABLE, 0, _RET_IP_); | ||
333 | } | ||
334 | |||
335 | static noinline int fastcall __sched | ||
306 | __mutex_lock_interruptible_slowpath(atomic_t *lock_count) | 336 | __mutex_lock_interruptible_slowpath(atomic_t *lock_count) |
307 | { | 337 | { |
308 | struct mutex *lock = container_of(lock_count, struct mutex, count); | 338 | struct mutex *lock = container_of(lock_count, struct mutex, count); |
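The new TASK_KILLABLE variants let a mutex sleeper abandon the wait only for fatal signals, so callers do not have to cope with every -EINTR the way mutex_lock_interruptible() forces them to. A minimal caller sketch; the lock and function names below are illustrative, not from the patch:

#include <linux/mutex.h>
#include <linux/errno.h>

static DEFINE_MUTEX(demo_lock);		/* illustrative lock, not from the patch */

static int demo_op(void)
{
	/* Non-zero return means a fatal signal (e.g. SIGKILL) arrived while
	 * waiting; ordinary signals keep the task sleeping. */
	if (mutex_lock_killable(&demo_lock))
		return -EINTR;

	/* ... critical section ... */

	mutex_unlock(&demo_lock);
	return 0;
}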
diff --git a/kernel/panic.c b/kernel/panic.c index 6f6e03e91595..d9e90cfe3298 100644 --- a/kernel/panic.c +++ b/kernel/panic.c | |||
@@ -19,6 +19,8 @@ | |||
19 | #include <linux/nmi.h> | 19 | #include <linux/nmi.h> |
20 | #include <linux/kexec.h> | 20 | #include <linux/kexec.h> |
21 | #include <linux/debug_locks.h> | 21 | #include <linux/debug_locks.h> |
22 | #include <linux/random.h> | ||
23 | #include <linux/kallsyms.h> | ||
22 | 24 | ||
23 | int panic_on_oops; | 25 | int panic_on_oops; |
24 | int tainted; | 26 | int tainted; |
@@ -266,13 +268,52 @@ void oops_enter(void) | |||
266 | } | 268 | } |
267 | 269 | ||
268 | /* | 270 | /* |
271 | * 64-bit random ID for oopses: | ||
272 | */ | ||
273 | static u64 oops_id; | ||
274 | |||
275 | static int init_oops_id(void) | ||
276 | { | ||
277 | if (!oops_id) | ||
278 | get_random_bytes(&oops_id, sizeof(oops_id)); | ||
279 | |||
280 | return 0; | ||
281 | } | ||
282 | late_initcall(init_oops_id); | ||
283 | |||
284 | static void print_oops_end_marker(void) | ||
285 | { | ||
286 | init_oops_id(); | ||
287 | printk(KERN_WARNING "---[ end trace %016llx ]---\n", | ||
288 | (unsigned long long)oops_id); | ||
289 | } | ||
290 | |||
291 | /* | ||
269 | * Called when the architecture exits its oops handler, after printing | 292 | * Called when the architecture exits its oops handler, after printing |
270 | * everything. | 293 | * everything. |
271 | */ | 294 | */ |
272 | void oops_exit(void) | 295 | void oops_exit(void) |
273 | { | 296 | { |
274 | do_oops_enter_exit(); | 297 | do_oops_enter_exit(); |
298 | print_oops_end_marker(); | ||
299 | } | ||
300 | |||
301 | #ifdef WANT_WARN_ON_SLOWPATH | ||
302 | void warn_on_slowpath(const char *file, int line) | ||
303 | { | ||
304 | char function[KSYM_SYMBOL_LEN]; | ||
305 | unsigned long caller = (unsigned long) __builtin_return_address(0); | ||
306 | sprint_symbol(function, caller); | ||
307 | |||
308 | printk(KERN_WARNING "------------[ cut here ]------------\n"); | ||
309 | printk(KERN_WARNING "WARNING: at %s:%d %s()\n", file, | ||
310 | line, function); | ||
311 | print_modules(); | ||
312 | dump_stack(); | ||
313 | print_oops_end_marker(); | ||
275 | } | 314 | } |
315 | EXPORT_SYMBOL(warn_on_slowpath); | ||
316 | #endif | ||
276 | 317 | ||
277 | #ifdef CONFIG_CC_STACKPROTECTOR | 318 | #ifdef CONFIG_CC_STACKPROTECTOR |
278 | /* | 319 | /* |
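warn_on_slowpath() gives architectures that define WANT_WARN_ON_SLOWPATH an out-of-line WARN_ON() body, so every warning prints the cut-here banner, the module list, a stack dump, and the same "end trace" marker that oops_exit() now emits. A hedged sketch of how such an architecture could route WARN_ON() to it; this is an approximation, not the actual header change:

/* Approximation only: the real macro lives in the arch/asm-generic
 * headers and may differ in detail. */
#ifdef WANT_WARN_ON_SLOWPATH
#define WARN_ON(condition) ({					\
	int __ret_warn_on = !!(condition);			\
	if (unlikely(__ret_warn_on))				\
		warn_on_slowpath(__FILE__, __LINE__);		\
	unlikely(__ret_warn_on);				\
})
#endif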
diff --git a/kernel/params.c b/kernel/params.c index 16f269e9ddc9..42fe5e6126c0 100644 --- a/kernel/params.c +++ b/kernel/params.c | |||
@@ -376,8 +376,6 @@ int param_get_string(char *buffer, struct kernel_param *kp) | |||
376 | 376 | ||
377 | extern struct kernel_param __start___param[], __stop___param[]; | 377 | extern struct kernel_param __start___param[], __stop___param[]; |
378 | 378 | ||
379 | #define MAX_KBUILD_MODNAME KOBJ_NAME_LEN | ||
380 | |||
381 | struct param_attribute | 379 | struct param_attribute |
382 | { | 380 | { |
383 | struct module_attribute mattr; | 381 | struct module_attribute mattr; |
@@ -472,7 +470,7 @@ param_sysfs_setup(struct module_kobject *mk, | |||
472 | sizeof(mp->grp.attrs[0])); | 470 | sizeof(mp->grp.attrs[0])); |
473 | size[1] = (valid_attrs + 1) * sizeof(mp->grp.attrs[0]); | 471 | size[1] = (valid_attrs + 1) * sizeof(mp->grp.attrs[0]); |
474 | 472 | ||
475 | mp = kmalloc(size[0] + size[1], GFP_KERNEL); | 473 | mp = kzalloc(size[0] + size[1], GFP_KERNEL); |
476 | if (!mp) | 474 | if (!mp) |
477 | return ERR_PTR(-ENOMEM); | 475 | return ERR_PTR(-ENOMEM); |
478 | 476 | ||
@@ -560,11 +558,10 @@ static void __init kernel_param_sysfs_setup(const char *name, | |||
560 | BUG_ON(!mk); | 558 | BUG_ON(!mk); |
561 | 559 | ||
562 | mk->mod = THIS_MODULE; | 560 | mk->mod = THIS_MODULE; |
563 | kobj_set_kset_s(mk, module_subsys); | 561 | mk->kobj.kset = module_kset; |
564 | kobject_set_name(&mk->kobj, name); | 562 | ret = kobject_init_and_add(&mk->kobj, &module_ktype, NULL, "%s", name); |
565 | kobject_init(&mk->kobj); | ||
566 | ret = kobject_add(&mk->kobj); | ||
567 | if (ret) { | 563 | if (ret) { |
564 | kobject_put(&mk->kobj); | ||
568 | printk(KERN_ERR "Module '%s' failed to be added to sysfs, " | 565 | printk(KERN_ERR "Module '%s' failed to be added to sysfs, " |
569 | "error number %d\n", name, ret); | 566 | "error number %d\n", name, ret); |
570 | printk(KERN_ERR "The system will be unstable now.\n"); | 567 | printk(KERN_ERR "The system will be unstable now.\n"); |
@@ -588,23 +585,20 @@ static void __init param_sysfs_builtin(void) | |||
588 | { | 585 | { |
589 | struct kernel_param *kp, *kp_begin = NULL; | 586 | struct kernel_param *kp, *kp_begin = NULL; |
590 | unsigned int i, name_len, count = 0; | 587 | unsigned int i, name_len, count = 0; |
591 | char modname[MAX_KBUILD_MODNAME + 1] = ""; | 588 | char modname[MODULE_NAME_LEN + 1] = ""; |
592 | 589 | ||
593 | for (i=0; i < __stop___param - __start___param; i++) { | 590 | for (i=0; i < __stop___param - __start___param; i++) { |
594 | char *dot; | 591 | char *dot; |
595 | size_t kplen; | 592 | size_t max_name_len; |
596 | 593 | ||
597 | kp = &__start___param[i]; | 594 | kp = &__start___param[i]; |
598 | kplen = strlen(kp->name); | 595 | max_name_len = |
596 | min_t(size_t, MODULE_NAME_LEN, strlen(kp->name)); | ||
599 | 597 | ||
600 | /* We do not handle args without periods. */ | 598 | dot = memchr(kp->name, '.', max_name_len); |
601 | if (kplen > MAX_KBUILD_MODNAME) { | ||
602 | DEBUGP("kernel parameter name is too long: %s\n", kp->name); | ||
603 | continue; | ||
604 | } | ||
605 | dot = memchr(kp->name, '.', kplen); | ||
606 | if (!dot) { | 599 | if (!dot) { |
607 | DEBUGP("couldn't find period in %s\n", kp->name); | 600 | DEBUGP("couldn't find period in first %d characters " |
601 | "of %s\n", MODULE_NAME_LEN, kp->name); | ||
608 | continue; | 602 | continue; |
609 | } | 603 | } |
610 | name_len = dot - kp->name; | 604 | name_len = dot - kp->name; |
@@ -682,8 +676,6 @@ static struct sysfs_ops module_sysfs_ops = { | |||
682 | .store = module_attr_store, | 676 | .store = module_attr_store, |
683 | }; | 677 | }; |
684 | 678 | ||
685 | static struct kobj_type module_ktype; | ||
686 | |||
687 | static int uevent_filter(struct kset *kset, struct kobject *kobj) | 679 | static int uevent_filter(struct kset *kset, struct kobject *kobj) |
688 | { | 680 | { |
689 | struct kobj_type *ktype = get_ktype(kobj); | 681 | struct kobj_type *ktype = get_ktype(kobj); |
@@ -697,10 +689,10 @@ static struct kset_uevent_ops module_uevent_ops = { | |||
697 | .filter = uevent_filter, | 689 | .filter = uevent_filter, |
698 | }; | 690 | }; |
699 | 691 | ||
700 | decl_subsys(module, &module_ktype, &module_uevent_ops); | 692 | struct kset *module_kset; |
701 | int module_sysfs_initialized; | 693 | int module_sysfs_initialized; |
702 | 694 | ||
703 | static struct kobj_type module_ktype = { | 695 | struct kobj_type module_ktype = { |
704 | .sysfs_ops = &module_sysfs_ops, | 696 | .sysfs_ops = &module_sysfs_ops, |
705 | }; | 697 | }; |
706 | 698 | ||
@@ -709,13 +701,11 @@ static struct kobj_type module_ktype = { | |||
709 | */ | 701 | */ |
710 | static int __init param_sysfs_init(void) | 702 | static int __init param_sysfs_init(void) |
711 | { | 703 | { |
712 | int ret; | 704 | module_kset = kset_create_and_add("module", &module_uevent_ops, NULL); |
713 | 705 | if (!module_kset) { | |
714 | ret = subsystem_register(&module_subsys); | 706 | printk(KERN_WARNING "%s (%d): error creating kset\n", |
715 | if (ret < 0) { | 707 | __FILE__, __LINE__); |
716 | printk(KERN_WARNING "%s (%d): subsystem_register error: %d\n", | 708 | return -ENOMEM; |
717 | __FILE__, __LINE__, ret); | ||
718 | return ret; | ||
719 | } | 709 | } |
720 | module_sysfs_initialized = 1; | 710 | module_sysfs_initialized = 1; |
721 | 711 | ||
@@ -725,14 +715,7 @@ static int __init param_sysfs_init(void) | |||
725 | } | 715 | } |
726 | subsys_initcall(param_sysfs_init); | 716 | subsys_initcall(param_sysfs_init); |
727 | 717 | ||
728 | #else | 718 | #endif /* CONFIG_SYSFS */ |
729 | #if 0 | ||
730 | static struct sysfs_ops module_sysfs_ops = { | ||
731 | .show = NULL, | ||
732 | .store = NULL, | ||
733 | }; | ||
734 | #endif | ||
735 | #endif | ||
736 | 719 | ||
737 | EXPORT_SYMBOL(param_set_byte); | 720 | EXPORT_SYMBOL(param_set_byte); |
738 | EXPORT_SYMBOL(param_get_byte); | 721 | EXPORT_SYMBOL(param_get_byte); |
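The params.c conversion replaces the old decl_subsys()/subsystem_register() machinery with a dynamically created kset plus kobject_init_and_add(), and the caller now drops its kobject reference itself when registration fails. The same pattern reduced to a standalone sketch; the "demo" names and ktype are placeholders, not part of the patch:

#include <linux/kobject.h>

static struct kset *demo_kset;			/* placeholder names */
static struct kobject demo_kobj;
static struct kobj_type demo_ktype;		/* assume sysfs_ops etc. are filled in elsewhere */

static int __init demo_sysfs_init(void)
{
	int ret;

	demo_kset = kset_create_and_add("demo", NULL, NULL);
	if (!demo_kset)
		return -ENOMEM;

	demo_kobj.kset = demo_kset;
	ret = kobject_init_and_add(&demo_kobj, &demo_ktype, NULL, "%s", "demo0");
	if (ret)
		kobject_put(&demo_kobj);	/* on failure, the caller drops the reference */
	return ret;
}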
diff --git a/kernel/pid.c b/kernel/pid.c index d1db36b94674..f815455431bf 100644 --- a/kernel/pid.c +++ b/kernel/pid.c | |||
@@ -537,6 +537,7 @@ err_alloc: | |||
537 | return NULL; | 537 | return NULL; |
538 | } | 538 | } |
539 | 539 | ||
540 | #ifdef CONFIG_PID_NS | ||
540 | static struct pid_namespace *create_pid_namespace(int level) | 541 | static struct pid_namespace *create_pid_namespace(int level) |
541 | { | 542 | { |
542 | struct pid_namespace *ns; | 543 | struct pid_namespace *ns; |
@@ -621,6 +622,7 @@ void free_pid_ns(struct kref *kref) | |||
621 | if (parent != NULL) | 622 | if (parent != NULL) |
622 | put_pid_ns(parent); | 623 | put_pid_ns(parent); |
623 | } | 624 | } |
625 | #endif /* CONFIG_PID_NS */ | ||
624 | 626 | ||
625 | void zap_pid_ns_processes(struct pid_namespace *pid_ns) | 627 | void zap_pid_ns_processes(struct pid_namespace *pid_ns) |
626 | { | 628 | { |
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 68c96376e84a..0b7c82ac467e 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c | |||
@@ -967,6 +967,7 @@ static void check_thread_timers(struct task_struct *tsk, | |||
967 | { | 967 | { |
968 | int maxfire; | 968 | int maxfire; |
969 | struct list_head *timers = tsk->cpu_timers; | 969 | struct list_head *timers = tsk->cpu_timers; |
970 | struct signal_struct *const sig = tsk->signal; | ||
970 | 971 | ||
971 | maxfire = 20; | 972 | maxfire = 20; |
972 | tsk->it_prof_expires = cputime_zero; | 973 | tsk->it_prof_expires = cputime_zero; |
@@ -1011,6 +1012,35 @@ static void check_thread_timers(struct task_struct *tsk, | |||
1011 | t->firing = 1; | 1012 | t->firing = 1; |
1012 | list_move_tail(&t->entry, firing); | 1013 | list_move_tail(&t->entry, firing); |
1013 | } | 1014 | } |
1015 | |||
1016 | /* | ||
1017 | * Check for the special case thread timers. | ||
1018 | */ | ||
1019 | if (sig->rlim[RLIMIT_RTTIME].rlim_cur != RLIM_INFINITY) { | ||
1020 | unsigned long hard = sig->rlim[RLIMIT_RTTIME].rlim_max; | ||
1021 | unsigned long *soft = &sig->rlim[RLIMIT_RTTIME].rlim_cur; | ||
1022 | |||
1023 | if (hard != RLIM_INFINITY && | ||
1024 | tsk->rt.timeout > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) { | ||
1025 | /* | ||
1026 | * At the hard limit, we just die. | ||
1027 | * No need to calculate anything else now. | ||
1028 | */ | ||
1029 | __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk); | ||
1030 | return; | ||
1031 | } | ||
1032 | if (tsk->rt.timeout > DIV_ROUND_UP(*soft, USEC_PER_SEC/HZ)) { | ||
1033 | /* | ||
1034 | * At the soft limit, send a SIGXCPU every second. | ||
1035 | */ | ||
1036 | if (sig->rlim[RLIMIT_RTTIME].rlim_cur | ||
1037 | < sig->rlim[RLIMIT_RTTIME].rlim_max) { | ||
1038 | sig->rlim[RLIMIT_RTTIME].rlim_cur += | ||
1039 | USEC_PER_SEC; | ||
1040 | } | ||
1041 | __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk); | ||
1042 | } | ||
1043 | } | ||
1014 | } | 1044 | } |
1015 | 1045 | ||
1016 | /* | 1046 | /* |
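The new RLIMIT_RTTIME check gives real-time tasks a CPU-time budget: past the soft limit the task receives SIGXCPU roughly once a second (the soft limit is bumped by one second each time until it reaches the hard limit), and past the hard limit the task is killed outright. From userspace the limit is expressed in microseconds via setrlimit(); a minimal illustration, assuming the libc headers expose RLIMIT_RTTIME:

#include <sys/resource.h>

/* Illustrative only: 0.5 s of RT CPU time before SIGXCPU warnings start,
 * 1 s before the kernel sends SIGKILL. */
int demo_limit_rt_runtime(void)
{
	struct rlimit rl = {
		.rlim_cur = 500000,	/* soft limit, microseconds -> SIGXCPU */
		.rlim_max = 1000000,	/* hard limit, microseconds -> SIGKILL */
	};

	return setrlimit(RLIMIT_RTTIME, &rl);
}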
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 8e186c678149..ef9b802738a5 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig | |||
@@ -44,9 +44,30 @@ config PM_VERBOSE | |||
44 | ---help--- | 44 | ---help--- |
45 | This option enables verbose messages from the Power Management code. | 45 | This option enables verbose messages from the Power Management code. |
46 | 46 | ||
47 | config CAN_PM_TRACE | ||
48 | def_bool y | ||
49 | depends on PM_DEBUG && PM_SLEEP && EXPERIMENTAL | ||
50 | |||
47 | config PM_TRACE | 51 | config PM_TRACE |
52 | bool | ||
53 | help | ||
54 | This enables code to save the last PM event point across | ||
55 | reboot. The architecture needs to support this, x86 for | ||
56 | example does by saving things in the RTC, see below. | ||
57 | |||
58 | The architecture specific code must provide the extern | ||
59 | functions from <linux/resume-trace.h> as well as the | ||
60 | <asm/resume-trace.h> header with a TRACE_RESUME() macro. | ||
61 | |||
62 | The way the information is presented is architecture- | ||
63 | dependent, x86 will print the information during a | ||
64 | late_initcall. | ||
65 | |||
66 | config PM_TRACE_RTC | ||
48 | bool "Suspend/resume event tracing" | 67 | bool "Suspend/resume event tracing" |
49 | depends on PM_DEBUG && X86 && PM_SLEEP && EXPERIMENTAL | 68 | depends on CAN_PM_TRACE |
69 | depends on X86 | ||
70 | select PM_TRACE | ||
50 | default n | 71 | default n |
51 | ---help--- | 72 | ---help--- |
52 | This enables some cheesy code to save the last PM event point in the | 73 | This enables some cheesy code to save the last PM event point in the |
@@ -63,7 +84,8 @@ config PM_TRACE | |||
63 | 84 | ||
64 | config PM_SLEEP_SMP | 85 | config PM_SLEEP_SMP |
65 | bool | 86 | bool |
66 | depends on SUSPEND_SMP_POSSIBLE || HIBERNATION_SMP_POSSIBLE | 87 | depends on SMP |
88 | depends on ARCH_SUSPEND_POSSIBLE || ARCH_HIBERNATION_POSSIBLE | ||
67 | depends on PM_SLEEP | 89 | depends on PM_SLEEP |
68 | select HOTPLUG_CPU | 90 | select HOTPLUG_CPU |
69 | default y | 91 | default y |
@@ -73,46 +95,29 @@ config PM_SLEEP | |||
73 | depends on SUSPEND || HIBERNATION | 95 | depends on SUSPEND || HIBERNATION |
74 | default y | 96 | default y |
75 | 97 | ||
76 | config SUSPEND_UP_POSSIBLE | ||
77 | bool | ||
78 | depends on (X86 && !X86_VOYAGER) || PPC || ARM || BLACKFIN || MIPS \ | ||
79 | || SUPERH || FRV | ||
80 | depends on !SMP | ||
81 | default y | ||
82 | |||
83 | config SUSPEND_SMP_POSSIBLE | ||
84 | bool | ||
85 | depends on (X86 && !X86_VOYAGER) \ | ||
86 | || (PPC && (PPC_PSERIES || PPC_PMAC)) || ARM | ||
87 | depends on SMP | ||
88 | default y | ||
89 | |||
90 | config SUSPEND | 98 | config SUSPEND |
91 | bool "Suspend to RAM and standby" | 99 | bool "Suspend to RAM and standby" |
92 | depends on PM | 100 | depends on PM && ARCH_SUSPEND_POSSIBLE |
93 | depends on SUSPEND_UP_POSSIBLE || SUSPEND_SMP_POSSIBLE | ||
94 | default y | 101 | default y |
95 | ---help--- | 102 | ---help--- |
96 | Allow the system to enter sleep states in which main memory is | 103 | Allow the system to enter sleep states in which main memory is |
97 | powered and thus its contents are preserved, such as the | 104 | powered and thus its contents are preserved, such as the |
98 | suspend-to-RAM state (i.e. the ACPI S3 state). | 105 | suspend-to-RAM state (e.g. the ACPI S3 state). |
99 | 106 | ||
100 | config HIBERNATION_UP_POSSIBLE | 107 | config SUSPEND_FREEZER |
101 | bool | 108 | bool "Enable freezer for suspend to RAM/standby" \ |
102 | depends on X86 || PPC64_SWSUSP || PPC32 | 109 | if ARCH_WANTS_FREEZER_CONTROL || BROKEN |
103 | depends on !SMP | 110 | depends on SUSPEND |
104 | default y | 111 | default y |
112 | help | ||
113 | This allows you to turn off the freezer for suspend. If this is | ||
114 | done, no tasks are frozen for suspend to RAM/standby. | ||
105 | 115 | ||
106 | config HIBERNATION_SMP_POSSIBLE | 116 | Turning OFF this setting is NOT recommended! If in doubt, say Y. |
107 | bool | ||
108 | depends on (X86 && !X86_VOYAGER) || PPC64_SWSUSP | ||
109 | depends on SMP | ||
110 | default y | ||
111 | 117 | ||
112 | config HIBERNATION | 118 | config HIBERNATION |
113 | bool "Hibernation (aka 'suspend to disk')" | 119 | bool "Hibernation (aka 'suspend to disk')" |
114 | depends on PM && SWAP | 120 | depends on PM && SWAP && ARCH_HIBERNATION_POSSIBLE |
115 | depends on HIBERNATION_UP_POSSIBLE || HIBERNATION_SMP_POSSIBLE | ||
116 | ---help--- | 121 | ---help--- |
117 | Enable the suspend to disk (STD) functionality, which is usually | 122 | Enable the suspend to disk (STD) functionality, which is usually |
118 | called "hibernation" in user interfaces. STD checkpoints the | 123 | called "hibernation" in user interfaces. STD checkpoints the |
diff --git a/kernel/power/disk.c b/kernel/power/disk.c index 8b15f777010a..d09da0895174 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c | |||
@@ -54,8 +54,8 @@ static struct platform_hibernation_ops *hibernation_ops; | |||
54 | 54 | ||
55 | void hibernation_set_ops(struct platform_hibernation_ops *ops) | 55 | void hibernation_set_ops(struct platform_hibernation_ops *ops) |
56 | { | 56 | { |
57 | if (ops && !(ops->start && ops->pre_snapshot && ops->finish | 57 | if (ops && !(ops->begin && ops->end && ops->pre_snapshot |
58 | && ops->prepare && ops->enter && ops->pre_restore | 58 | && ops->prepare && ops->finish && ops->enter && ops->pre_restore |
59 | && ops->restore_cleanup)) { | 59 | && ops->restore_cleanup)) { |
60 | WARN_ON(1); | 60 | WARN_ON(1); |
61 | return; | 61 | return; |
@@ -70,15 +70,55 @@ void hibernation_set_ops(struct platform_hibernation_ops *ops) | |||
70 | mutex_unlock(&pm_mutex); | 70 | mutex_unlock(&pm_mutex); |
71 | } | 71 | } |
72 | 72 | ||
73 | #ifdef CONFIG_PM_DEBUG | ||
74 | static void hibernation_debug_sleep(void) | ||
75 | { | ||
76 | printk(KERN_INFO "hibernation debug: Waiting for 5 seconds.\n"); | ||
77 | mdelay(5000); | ||
78 | } | ||
79 | |||
80 | static int hibernation_testmode(int mode) | ||
81 | { | ||
82 | if (hibernation_mode == mode) { | ||
83 | hibernation_debug_sleep(); | ||
84 | return 1; | ||
85 | } | ||
86 | return 0; | ||
87 | } | ||
88 | |||
89 | static int hibernation_test(int level) | ||
90 | { | ||
91 | if (pm_test_level == level) { | ||
92 | hibernation_debug_sleep(); | ||
93 | return 1; | ||
94 | } | ||
95 | return 0; | ||
96 | } | ||
97 | #else /* !CONFIG_PM_DEBUG */ | ||
98 | static int hibernation_testmode(int mode) { return 0; } | ||
99 | static int hibernation_test(int level) { return 0; } | ||
100 | #endif /* !CONFIG_PM_DEBUG */ | ||
101 | |||
73 | /** | 102 | /** |
74 | * platform_start - tell the platform driver that we're starting | 103 | * platform_begin - tell the platform driver that we're starting |
75 | * hibernation | 104 | * hibernation |
76 | */ | 105 | */ |
77 | 106 | ||
78 | static int platform_start(int platform_mode) | 107 | static int platform_begin(int platform_mode) |
79 | { | 108 | { |
80 | return (platform_mode && hibernation_ops) ? | 109 | return (platform_mode && hibernation_ops) ? |
81 | hibernation_ops->start() : 0; | 110 | hibernation_ops->begin() : 0; |
111 | } | ||
112 | |||
113 | /** | ||
114 | * platform_end - tell the platform driver that we've entered the | ||
115 | * working state | ||
116 | */ | ||
117 | |||
118 | static void platform_end(int platform_mode) | ||
119 | { | ||
120 | if (platform_mode && hibernation_ops) | ||
121 | hibernation_ops->end(); | ||
82 | } | 122 | } |
83 | 123 | ||
84 | /** | 124 | /** |
@@ -162,19 +202,25 @@ int create_image(int platform_mode) | |||
162 | */ | 202 | */ |
163 | error = device_power_down(PMSG_FREEZE); | 203 | error = device_power_down(PMSG_FREEZE); |
164 | if (error) { | 204 | if (error) { |
165 | printk(KERN_ERR "Some devices failed to power down, " | 205 | printk(KERN_ERR "PM: Some devices failed to power down, " |
166 | KERN_ERR "aborting suspend\n"); | 206 | "aborting hibernation\n"); |
167 | goto Enable_irqs; | 207 | goto Enable_irqs; |
168 | } | 208 | } |
169 | 209 | ||
210 | if (hibernation_test(TEST_CORE)) | ||
211 | goto Power_up; | ||
212 | |||
213 | in_suspend = 1; | ||
170 | save_processor_state(); | 214 | save_processor_state(); |
171 | error = swsusp_arch_suspend(); | 215 | error = swsusp_arch_suspend(); |
172 | if (error) | 216 | if (error) |
173 | printk(KERN_ERR "Error %d while creating the image\n", error); | 217 | printk(KERN_ERR "PM: Error %d creating hibernation image\n", |
218 | error); | ||
174 | /* Restore control flow magically appears here */ | 219 | /* Restore control flow magically appears here */ |
175 | restore_processor_state(); | 220 | restore_processor_state(); |
176 | if (!in_suspend) | 221 | if (!in_suspend) |
177 | platform_leave(platform_mode); | 222 | platform_leave(platform_mode); |
223 | Power_up: | ||
178 | /* NOTE: device_power_up() is just a resume() for devices | 224 | /* NOTE: device_power_up() is just a resume() for devices |
179 | * that suspended with irqs off ... no overall powerup. | 225 | * that suspended with irqs off ... no overall powerup. |
180 | */ | 226 | */ |
@@ -202,36 +248,90 @@ int hibernation_snapshot(int platform_mode) | |||
202 | if (error) | 248 | if (error) |
203 | return error; | 249 | return error; |
204 | 250 | ||
205 | error = platform_start(platform_mode); | 251 | error = platform_begin(platform_mode); |
206 | if (error) | 252 | if (error) |
207 | return error; | 253 | goto Close; |
208 | 254 | ||
209 | suspend_console(); | 255 | suspend_console(); |
210 | error = device_suspend(PMSG_FREEZE); | 256 | error = device_suspend(PMSG_FREEZE); |
211 | if (error) | 257 | if (error) |
212 | goto Resume_console; | 258 | goto Resume_console; |
213 | 259 | ||
214 | error = platform_pre_snapshot(platform_mode); | 260 | if (hibernation_test(TEST_DEVICES)) |
215 | if (error) | ||
216 | goto Resume_devices; | 261 | goto Resume_devices; |
217 | 262 | ||
263 | error = platform_pre_snapshot(platform_mode); | ||
264 | if (error || hibernation_test(TEST_PLATFORM)) | ||
265 | goto Finish; | ||
266 | |||
218 | error = disable_nonboot_cpus(); | 267 | error = disable_nonboot_cpus(); |
219 | if (!error) { | 268 | if (!error) { |
220 | if (hibernation_mode != HIBERNATION_TEST) { | 269 | if (hibernation_test(TEST_CPUS)) |
221 | in_suspend = 1; | 270 | goto Enable_cpus; |
222 | error = create_image(platform_mode); | 271 | |
223 | /* Control returns here after successful restore */ | 272 | if (hibernation_testmode(HIBERNATION_TEST)) |
224 | } else { | 273 | goto Enable_cpus; |
225 | printk("swsusp debug: Waiting for 5 seconds.\n"); | 274 | |
226 | mdelay(5000); | 275 | error = create_image(platform_mode); |
227 | } | 276 | /* Control returns here after successful restore */ |
228 | } | 277 | } |
278 | Enable_cpus: | ||
229 | enable_nonboot_cpus(); | 279 | enable_nonboot_cpus(); |
230 | Resume_devices: | 280 | Finish: |
231 | platform_finish(platform_mode); | 281 | platform_finish(platform_mode); |
282 | Resume_devices: | ||
232 | device_resume(); | 283 | device_resume(); |
233 | Resume_console: | 284 | Resume_console: |
234 | resume_console(); | 285 | resume_console(); |
286 | Close: | ||
287 | platform_end(platform_mode); | ||
288 | return error; | ||
289 | } | ||
290 | |||
291 | /** | ||
292 | * resume_target_kernel - prepare devices that need to be suspended with | ||
293 | * interrupts off, restore the contents of highmem that have not been | ||
294 | * restored yet from the image and run the low level code that will restore | ||
295 | * the remaining contents of memory and switch to the just restored target | ||
296 | * kernel. | ||
297 | */ | ||
298 | |||
299 | static int resume_target_kernel(void) | ||
300 | { | ||
301 | int error; | ||
302 | |||
303 | local_irq_disable(); | ||
304 | error = device_power_down(PMSG_PRETHAW); | ||
305 | if (error) { | ||
306 | printk(KERN_ERR "PM: Some devices failed to power down, " | ||
307 | "aborting resume\n"); | ||
308 | goto Enable_irqs; | ||
309 | } | ||
310 | /* We'll ignore saved state, but this gets preempt count (etc) right */ | ||
311 | save_processor_state(); | ||
312 | error = restore_highmem(); | ||
313 | if (!error) { | ||
314 | error = swsusp_arch_resume(); | ||
315 | /* | ||
316 | * The code below is only ever reached in case of a failure. | ||
317 | * Otherwise execution continues at place where | ||
318 | * swsusp_arch_suspend() was called | ||
319 | */ | ||
320 | BUG_ON(!error); | ||
321 | /* This call to restore_highmem() undoes the previous one */ | ||
322 | restore_highmem(); | ||
323 | } | ||
324 | /* | ||
325 | * The only reason why swsusp_arch_resume() can fail is memory being | ||
326 | * very tight, so we have to free it as soon as we can to avoid | ||
327 | * subsequent failures | ||
328 | */ | ||
329 | swsusp_free(); | ||
330 | restore_processor_state(); | ||
331 | touch_softlockup_watchdog(); | ||
332 | device_power_up(); | ||
333 | Enable_irqs: | ||
334 | local_irq_enable(); | ||
235 | return error; | 335 | return error; |
236 | } | 336 | } |
237 | 337 | ||
@@ -258,7 +358,7 @@ int hibernation_restore(int platform_mode) | |||
258 | if (!error) { | 358 | if (!error) { |
259 | error = disable_nonboot_cpus(); | 359 | error = disable_nonboot_cpus(); |
260 | if (!error) | 360 | if (!error) |
261 | error = swsusp_resume(); | 361 | error = resume_target_kernel(); |
262 | enable_nonboot_cpus(); | 362 | enable_nonboot_cpus(); |
263 | } | 363 | } |
264 | platform_restore_cleanup(platform_mode); | 364 | platform_restore_cleanup(platform_mode); |
@@ -286,9 +386,9 @@ int hibernation_platform_enter(void) | |||
286 | * hibernation_ops->finish() before saving the image, so we should let | 386 | * hibernation_ops->finish() before saving the image, so we should let |
287 | * the firmware know that we're going to enter the sleep state after all | 387 | * the firmware know that we're going to enter the sleep state after all |
288 | */ | 388 | */ |
289 | error = hibernation_ops->start(); | 389 | error = hibernation_ops->begin(); |
290 | if (error) | 390 | if (error) |
291 | return error; | 391 | goto Close; |
292 | 392 | ||
293 | suspend_console(); | 393 | suspend_console(); |
294 | error = device_suspend(PMSG_SUSPEND); | 394 | error = device_suspend(PMSG_SUSPEND); |
@@ -322,6 +422,8 @@ int hibernation_platform_enter(void) | |||
322 | device_resume(); | 422 | device_resume(); |
323 | Resume_console: | 423 | Resume_console: |
324 | resume_console(); | 424 | resume_console(); |
425 | Close: | ||
426 | hibernation_ops->end(); | ||
325 | return error; | 427 | return error; |
326 | } | 428 | } |
327 | 429 | ||
@@ -352,24 +454,17 @@ static void power_down(void) | |||
352 | * Valid image is on the disk, if we continue we risk serious data | 454 | * Valid image is on the disk, if we continue we risk serious data |
353 | * corruption after resume. | 455 | * corruption after resume. |
354 | */ | 456 | */ |
355 | printk(KERN_CRIT "Please power me down manually\n"); | 457 | printk(KERN_CRIT "PM: Please power down manually\n"); |
356 | while(1); | 458 | while(1); |
357 | } | 459 | } |
358 | 460 | ||
359 | static void unprepare_processes(void) | ||
360 | { | ||
361 | thaw_processes(); | ||
362 | pm_restore_console(); | ||
363 | } | ||
364 | |||
365 | static int prepare_processes(void) | 461 | static int prepare_processes(void) |
366 | { | 462 | { |
367 | int error = 0; | 463 | int error = 0; |
368 | 464 | ||
369 | pm_prepare_console(); | ||
370 | if (freeze_processes()) { | 465 | if (freeze_processes()) { |
371 | error = -EBUSY; | 466 | error = -EBUSY; |
372 | unprepare_processes(); | 467 | thaw_processes(); |
373 | } | 468 | } |
374 | return error; | 469 | return error; |
375 | } | 470 | } |
@@ -389,6 +484,7 @@ int hibernate(void) | |||
389 | goto Unlock; | 484 | goto Unlock; |
390 | } | 485 | } |
391 | 486 | ||
487 | pm_prepare_console(); | ||
392 | error = pm_notifier_call_chain(PM_HIBERNATION_PREPARE); | 488 | error = pm_notifier_call_chain(PM_HIBERNATION_PREPARE); |
393 | if (error) | 489 | if (error) |
394 | goto Exit; | 490 | goto Exit; |
@@ -398,7 +494,7 @@ int hibernate(void) | |||
398 | if (error) | 494 | if (error) |
399 | goto Exit; | 495 | goto Exit; |
400 | 496 | ||
401 | printk("Syncing filesystems ... "); | 497 | printk(KERN_INFO "PM: Syncing filesystems ... "); |
402 | sys_sync(); | 498 | sys_sync(); |
403 | printk("done.\n"); | 499 | printk("done.\n"); |
404 | 500 | ||
@@ -406,11 +502,12 @@ int hibernate(void) | |||
406 | if (error) | 502 | if (error) |
407 | goto Finish; | 503 | goto Finish; |
408 | 504 | ||
409 | if (hibernation_mode == HIBERNATION_TESTPROC) { | 505 | if (hibernation_test(TEST_FREEZER)) |
410 | printk("swsusp debug: Waiting for 5 seconds.\n"); | ||
411 | mdelay(5000); | ||
412 | goto Thaw; | 506 | goto Thaw; |
413 | } | 507 | |
508 | if (hibernation_testmode(HIBERNATION_TESTPROC)) | ||
509 | goto Thaw; | ||
510 | |||
414 | error = hibernation_snapshot(hibernation_mode == HIBERNATION_PLATFORM); | 511 | error = hibernation_snapshot(hibernation_mode == HIBERNATION_PLATFORM); |
415 | if (in_suspend && !error) { | 512 | if (in_suspend && !error) { |
416 | unsigned int flags = 0; | 513 | unsigned int flags = 0; |
@@ -427,11 +524,12 @@ int hibernate(void) | |||
427 | swsusp_free(); | 524 | swsusp_free(); |
428 | } | 525 | } |
429 | Thaw: | 526 | Thaw: |
430 | unprepare_processes(); | 527 | thaw_processes(); |
431 | Finish: | 528 | Finish: |
432 | free_basic_memory_bitmaps(); | 529 | free_basic_memory_bitmaps(); |
433 | Exit: | 530 | Exit: |
434 | pm_notifier_call_chain(PM_POST_HIBERNATION); | 531 | pm_notifier_call_chain(PM_POST_HIBERNATION); |
532 | pm_restore_console(); | ||
435 | atomic_inc(&snapshot_device_available); | 533 | atomic_inc(&snapshot_device_available); |
436 | Unlock: | 534 | Unlock: |
437 | mutex_unlock(&pm_mutex); | 535 | mutex_unlock(&pm_mutex); |
@@ -456,29 +554,40 @@ static int software_resume(void) | |||
456 | int error; | 554 | int error; |
457 | unsigned int flags; | 555 | unsigned int flags; |
458 | 556 | ||
459 | mutex_lock(&pm_mutex); | 557 | /* |
558 | * name_to_dev_t() below takes a sysfs buffer mutex when sysfs | ||
559 | * is configured into the kernel. Since the regular hibernate | ||
560 | * trigger path is via sysfs which takes a buffer mutex before | ||
561 | * calling hibernate functions (which take pm_mutex) this can | ||
562 | * cause lockdep to complain about a possible ABBA deadlock | ||
563 | * which cannot happen since we're in the boot code here and | ||
564 | * sysfs can't be invoked yet. Therefore, we use a subclass | ||
565 | * here to avoid lockdep complaining. | ||
566 | */ | ||
567 | mutex_lock_nested(&pm_mutex, SINGLE_DEPTH_NESTING); | ||
460 | if (!swsusp_resume_device) { | 568 | if (!swsusp_resume_device) { |
461 | if (!strlen(resume_file)) { | 569 | if (!strlen(resume_file)) { |
462 | mutex_unlock(&pm_mutex); | 570 | mutex_unlock(&pm_mutex); |
463 | return -ENOENT; | 571 | return -ENOENT; |
464 | } | 572 | } |
465 | swsusp_resume_device = name_to_dev_t(resume_file); | 573 | swsusp_resume_device = name_to_dev_t(resume_file); |
466 | pr_debug("swsusp: Resume From Partition %s\n", resume_file); | 574 | pr_debug("PM: Resume from partition %s\n", resume_file); |
467 | } else { | 575 | } else { |
468 | pr_debug("swsusp: Resume From Partition %d:%d\n", | 576 | pr_debug("PM: Resume from partition %d:%d\n", |
469 | MAJOR(swsusp_resume_device), MINOR(swsusp_resume_device)); | 577 | MAJOR(swsusp_resume_device), |
578 | MINOR(swsusp_resume_device)); | ||
470 | } | 579 | } |
471 | 580 | ||
472 | if (noresume) { | 581 | if (noresume) { |
473 | /** | 582 | /** |
474 | * FIXME: If noresume is specified, we need to find the partition | 583 | * FIXME: If noresume is specified, we need to find the |
475 | * and reset it back to normal swap space. | 584 | * partition and reset it back to normal swap space. |
476 | */ | 585 | */ |
477 | mutex_unlock(&pm_mutex); | 586 | mutex_unlock(&pm_mutex); |
478 | return 0; | 587 | return 0; |
479 | } | 588 | } |
480 | 589 | ||
481 | pr_debug("PM: Checking swsusp image.\n"); | 590 | pr_debug("PM: Checking hibernation image.\n"); |
482 | error = swsusp_check(); | 591 | error = swsusp_check(); |
483 | if (error) | 592 | if (error) |
484 | goto Unlock; | 593 | goto Unlock; |
@@ -489,6 +598,11 @@ static int software_resume(void) | |||
489 | goto Unlock; | 598 | goto Unlock; |
490 | } | 599 | } |
491 | 600 | ||
601 | pm_prepare_console(); | ||
602 | error = pm_notifier_call_chain(PM_RESTORE_PREPARE); | ||
603 | if (error) | ||
604 | goto Finish; | ||
605 | |||
492 | error = create_basic_memory_bitmaps(); | 606 | error = create_basic_memory_bitmaps(); |
493 | if (error) | 607 | if (error) |
494 | goto Finish; | 608 | goto Finish; |
@@ -500,7 +614,7 @@ static int software_resume(void) | |||
500 | goto Done; | 614 | goto Done; |
501 | } | 615 | } |
502 | 616 | ||
503 | pr_debug("PM: Reading swsusp image.\n"); | 617 | pr_debug("PM: Reading hibernation image.\n"); |
504 | 618 | ||
505 | error = swsusp_read(&flags); | 619 | error = swsusp_read(&flags); |
506 | if (!error) | 620 | if (!error) |
@@ -508,10 +622,12 @@ static int software_resume(void) | |||
508 | 622 | ||
509 | printk(KERN_ERR "PM: Restore failed, recovering.\n"); | 623 | printk(KERN_ERR "PM: Restore failed, recovering.\n"); |
510 | swsusp_free(); | 624 | swsusp_free(); |
511 | unprepare_processes(); | 625 | thaw_processes(); |
512 | Done: | 626 | Done: |
513 | free_basic_memory_bitmaps(); | 627 | free_basic_memory_bitmaps(); |
514 | Finish: | 628 | Finish: |
629 | pm_notifier_call_chain(PM_POST_RESTORE); | ||
630 | pm_restore_console(); | ||
515 | atomic_inc(&snapshot_device_available); | 631 | atomic_inc(&snapshot_device_available); |
516 | /* For success case, the suspend path will release the lock */ | 632 | /* For success case, the suspend path will release the lock */ |
517 | Unlock: | 633 | Unlock: |
@@ -557,7 +673,8 @@ static const char * const hibernation_modes[] = { | |||
557 | * supports it (as determined by having hibernation_ops). | 673 | * supports it (as determined by having hibernation_ops). |
558 | */ | 674 | */ |
559 | 675 | ||
560 | static ssize_t disk_show(struct kset *kset, char *buf) | 676 | static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr, |
677 | char *buf) | ||
561 | { | 678 | { |
562 | int i; | 679 | int i; |
563 | char *start = buf; | 680 | char *start = buf; |
@@ -587,7 +704,8 @@ static ssize_t disk_show(struct kset *kset, char *buf) | |||
587 | } | 704 | } |
588 | 705 | ||
589 | 706 | ||
590 | static ssize_t disk_store(struct kset *kset, const char *buf, size_t n) | 707 | static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr, |
708 | const char *buf, size_t n) | ||
591 | { | 709 | { |
592 | int error = 0; | 710 | int error = 0; |
593 | int i; | 711 | int i; |
@@ -624,7 +742,7 @@ static ssize_t disk_store(struct kset *kset, const char *buf, size_t n) | |||
624 | error = -EINVAL; | 742 | error = -EINVAL; |
625 | 743 | ||
626 | if (!error) | 744 | if (!error) |
627 | pr_debug("PM: suspend-to-disk mode set to '%s'\n", | 745 | pr_debug("PM: Hibernation mode set to '%s'\n", |
628 | hibernation_modes[mode]); | 746 | hibernation_modes[mode]); |
629 | mutex_unlock(&pm_mutex); | 747 | mutex_unlock(&pm_mutex); |
630 | return error ? error : n; | 748 | return error ? error : n; |
@@ -632,13 +750,15 @@ static ssize_t disk_store(struct kset *kset, const char *buf, size_t n) | |||
632 | 750 | ||
633 | power_attr(disk); | 751 | power_attr(disk); |
634 | 752 | ||
635 | static ssize_t resume_show(struct kset *kset, char *buf) | 753 | static ssize_t resume_show(struct kobject *kobj, struct kobj_attribute *attr, |
754 | char *buf) | ||
636 | { | 755 | { |
637 | return sprintf(buf,"%d:%d\n", MAJOR(swsusp_resume_device), | 756 | return sprintf(buf,"%d:%d\n", MAJOR(swsusp_resume_device), |
638 | MINOR(swsusp_resume_device)); | 757 | MINOR(swsusp_resume_device)); |
639 | } | 758 | } |
640 | 759 | ||
641 | static ssize_t resume_store(struct kset *kset, const char *buf, size_t n) | 760 | static ssize_t resume_store(struct kobject *kobj, struct kobj_attribute *attr, |
761 | const char *buf, size_t n) | ||
642 | { | 762 | { |
643 | unsigned int maj, min; | 763 | unsigned int maj, min; |
644 | dev_t res; | 764 | dev_t res; |
@@ -654,7 +774,7 @@ static ssize_t resume_store(struct kset *kset, const char *buf, size_t n) | |||
654 | mutex_lock(&pm_mutex); | 774 | mutex_lock(&pm_mutex); |
655 | swsusp_resume_device = res; | 775 | swsusp_resume_device = res; |
656 | mutex_unlock(&pm_mutex); | 776 | mutex_unlock(&pm_mutex); |
657 | printk("Attempting manual resume\n"); | 777 | printk(KERN_INFO "PM: Starting manual resume from disk\n"); |
658 | noresume = 0; | 778 | noresume = 0; |
659 | software_resume(); | 779 | software_resume(); |
660 | ret = n; | 780 | ret = n; |
@@ -664,12 +784,14 @@ static ssize_t resume_store(struct kset *kset, const char *buf, size_t n) | |||
664 | 784 | ||
665 | power_attr(resume); | 785 | power_attr(resume); |
666 | 786 | ||
667 | static ssize_t image_size_show(struct kset *kset, char *buf) | 787 | static ssize_t image_size_show(struct kobject *kobj, struct kobj_attribute *attr, |
788 | char *buf) | ||
668 | { | 789 | { |
669 | return sprintf(buf, "%lu\n", image_size); | 790 | return sprintf(buf, "%lu\n", image_size); |
670 | } | 791 | } |
671 | 792 | ||
672 | static ssize_t image_size_store(struct kset *kset, const char *buf, size_t n) | 793 | static ssize_t image_size_store(struct kobject *kobj, struct kobj_attribute *attr, |
794 | const char *buf, size_t n) | ||
673 | { | 795 | { |
674 | unsigned long size; | 796 | unsigned long size; |
675 | 797 | ||
@@ -698,7 +820,7 @@ static struct attribute_group attr_group = { | |||
698 | 820 | ||
699 | static int __init pm_disk_init(void) | 821 | static int __init pm_disk_init(void) |
700 | { | 822 | { |
701 | return sysfs_create_group(&power_subsys.kobj, &attr_group); | 823 | return sysfs_create_group(power_kobj, &attr_group); |
702 | } | 824 | } |
703 | 825 | ||
704 | core_initcall(pm_disk_init); | 826 | core_initcall(pm_disk_init); |
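hibernation_set_ops() now insists on begin()/end() callbacks (replacing the old start()) in addition to the snapshot, prepare/finish, enter and restore hooks, and both hibernation_snapshot() and hibernation_platform_enter() bracket the whole transition with them. A heavily hedged sketch of a platform driver's registration: the prototypes are inferred from how the callbacks are invoked in this file (int-returning where an error is checked, void otherwise) and the "acme" names are invented.

#include <linux/suspend.h>

/* Stub callbacks; a real driver talks to platform firmware here.
 * NB: the full interface may include further callbacks (e.g. .leave)
 * that do not appear in this hunk and are omitted from the sketch. */
static int  acme_begin(void)           { return 0; }
static void acme_end(void)             { }
static int  acme_pre_snapshot(void)    { return 0; }
static int  acme_prepare(void)         { return 0; }
static void acme_finish(void)          { }
static int  acme_enter(void)           { return 0; }
static int  acme_pre_restore(void)     { return 0; }
static void acme_restore_cleanup(void) { }

static struct platform_hibernation_ops acme_hibernation_ops = {
	.begin		 = acme_begin,
	.end		 = acme_end,
	.pre_snapshot	 = acme_pre_snapshot,
	.prepare	 = acme_prepare,
	.finish		 = acme_finish,
	.enter		 = acme_enter,
	.pre_restore	 = acme_pre_restore,
	.restore_cleanup = acme_restore_cleanup,
};

static int __init acme_pm_init(void)
{
	hibernation_set_ops(&acme_hibernation_ops);
	return 0;
}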
diff --git a/kernel/power/main.c b/kernel/power/main.c index 3cdf95b1dc92..6a6d5eb3524e 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c | |||
@@ -24,10 +24,112 @@ | |||
24 | 24 | ||
25 | #include "power.h" | 25 | #include "power.h" |
26 | 26 | ||
27 | BLOCKING_NOTIFIER_HEAD(pm_chain_head); | ||
28 | |||
29 | DEFINE_MUTEX(pm_mutex); | 27 | DEFINE_MUTEX(pm_mutex); |
30 | 28 | ||
29 | unsigned int pm_flags; | ||
30 | EXPORT_SYMBOL(pm_flags); | ||
31 | |||
32 | #ifdef CONFIG_PM_SLEEP | ||
33 | |||
34 | /* Routines for PM-transition notifications */ | ||
35 | |||
36 | static BLOCKING_NOTIFIER_HEAD(pm_chain_head); | ||
37 | |||
38 | int register_pm_notifier(struct notifier_block *nb) | ||
39 | { | ||
40 | return blocking_notifier_chain_register(&pm_chain_head, nb); | ||
41 | } | ||
42 | EXPORT_SYMBOL_GPL(register_pm_notifier); | ||
43 | |||
44 | int unregister_pm_notifier(struct notifier_block *nb) | ||
45 | { | ||
46 | return blocking_notifier_chain_unregister(&pm_chain_head, nb); | ||
47 | } | ||
48 | EXPORT_SYMBOL_GPL(unregister_pm_notifier); | ||
49 | |||
50 | int pm_notifier_call_chain(unsigned long val) | ||
51 | { | ||
52 | return (blocking_notifier_call_chain(&pm_chain_head, val, NULL) | ||
53 | == NOTIFY_BAD) ? -EINVAL : 0; | ||
54 | } | ||
55 | |||
56 | #ifdef CONFIG_PM_DEBUG | ||
57 | int pm_test_level = TEST_NONE; | ||
58 | |||
59 | static int suspend_test(int level) | ||
60 | { | ||
61 | if (pm_test_level == level) { | ||
62 | printk(KERN_INFO "suspend debug: Waiting for 5 seconds.\n"); | ||
63 | mdelay(5000); | ||
64 | return 1; | ||
65 | } | ||
66 | return 0; | ||
67 | } | ||
68 | |||
69 | static const char * const pm_tests[__TEST_AFTER_LAST] = { | ||
70 | [TEST_NONE] = "none", | ||
71 | [TEST_CORE] = "core", | ||
72 | [TEST_CPUS] = "processors", | ||
73 | [TEST_PLATFORM] = "platform", | ||
74 | [TEST_DEVICES] = "devices", | ||
75 | [TEST_FREEZER] = "freezer", | ||
76 | }; | ||
77 | |||
78 | static ssize_t pm_test_show(struct kobject *kobj, struct kobj_attribute *attr, | ||
79 | char *buf) | ||
80 | { | ||
81 | char *s = buf; | ||
82 | int level; | ||
83 | |||
84 | for (level = TEST_FIRST; level <= TEST_MAX; level++) | ||
85 | if (pm_tests[level]) { | ||
86 | if (level == pm_test_level) | ||
87 | s += sprintf(s, "[%s] ", pm_tests[level]); | ||
88 | else | ||
89 | s += sprintf(s, "%s ", pm_tests[level]); | ||
90 | } | ||
91 | |||
92 | if (s != buf) | ||
93 | /* convert the last space to a newline */ | ||
94 | *(s-1) = '\n'; | ||
95 | |||
96 | return (s - buf); | ||
97 | } | ||
98 | |||
99 | static ssize_t pm_test_store(struct kobject *kobj, struct kobj_attribute *attr, | ||
100 | const char *buf, size_t n) | ||
101 | { | ||
102 | const char * const *s; | ||
103 | int level; | ||
104 | char *p; | ||
105 | int len; | ||
106 | int error = -EINVAL; | ||
107 | |||
108 | p = memchr(buf, '\n', n); | ||
109 | len = p ? p - buf : n; | ||
110 | |||
111 | mutex_lock(&pm_mutex); | ||
112 | |||
113 | level = TEST_FIRST; | ||
114 | for (s = &pm_tests[level]; level <= TEST_MAX; s++, level++) | ||
115 | if (*s && len == strlen(*s) && !strncmp(buf, *s, len)) { | ||
116 | pm_test_level = level; | ||
117 | error = 0; | ||
118 | break; | ||
119 | } | ||
120 | |||
121 | mutex_unlock(&pm_mutex); | ||
122 | |||
123 | return error ? error : n; | ||
124 | } | ||
125 | |||
126 | power_attr(pm_test); | ||
127 | #else /* !CONFIG_PM_DEBUG */ | ||
128 | static inline int suspend_test(int level) { return 0; } | ||
129 | #endif /* !CONFIG_PM_DEBUG */ | ||
130 | |||
131 | #endif /* CONFIG_PM_SLEEP */ | ||
132 | |||
31 | #ifdef CONFIG_SUSPEND | 133 | #ifdef CONFIG_SUSPEND |
32 | 134 | ||
33 | /* This is just an arbitrary number */ | 135 | /* This is just an arbitrary number */ |
@@ -73,13 +175,13 @@ static int suspend_prepare(void) | |||
73 | if (!suspend_ops || !suspend_ops->enter) | 175 | if (!suspend_ops || !suspend_ops->enter) |
74 | return -EPERM; | 176 | return -EPERM; |
75 | 177 | ||
178 | pm_prepare_console(); | ||
179 | |||
76 | error = pm_notifier_call_chain(PM_SUSPEND_PREPARE); | 180 | error = pm_notifier_call_chain(PM_SUSPEND_PREPARE); |
77 | if (error) | 181 | if (error) |
78 | goto Finish; | 182 | goto Finish; |
79 | 183 | ||
80 | pm_prepare_console(); | 184 | if (suspend_freeze_processes()) { |
81 | |||
82 | if (freeze_processes()) { | ||
83 | error = -EAGAIN; | 185 | error = -EAGAIN; |
84 | goto Thaw; | 186 | goto Thaw; |
85 | } | 187 | } |
@@ -97,10 +199,10 @@ static int suspend_prepare(void) | |||
97 | return 0; | 199 | return 0; |
98 | 200 | ||
99 | Thaw: | 201 | Thaw: |
100 | thaw_processes(); | 202 | suspend_thaw_processes(); |
101 | pm_restore_console(); | ||
102 | Finish: | 203 | Finish: |
103 | pm_notifier_call_chain(PM_POST_SUSPEND); | 204 | pm_notifier_call_chain(PM_POST_SUSPEND); |
205 | pm_restore_console(); | ||
104 | return error; | 206 | return error; |
105 | } | 207 | } |
106 | 208 | ||
@@ -130,10 +232,13 @@ static int suspend_enter(suspend_state_t state) | |||
130 | BUG_ON(!irqs_disabled()); | 232 | BUG_ON(!irqs_disabled()); |
131 | 233 | ||
132 | if ((error = device_power_down(PMSG_SUSPEND))) { | 234 | if ((error = device_power_down(PMSG_SUSPEND))) { |
133 | printk(KERN_ERR "Some devices failed to power down\n"); | 235 | printk(KERN_ERR "PM: Some devices failed to power down\n"); |
134 | goto Done; | 236 | goto Done; |
135 | } | 237 | } |
136 | error = suspend_ops->enter(state); | 238 | |
239 | if (!suspend_test(TEST_CORE)) | ||
240 | error = suspend_ops->enter(state); | ||
241 | |||
137 | device_power_up(); | 242 | device_power_up(); |
138 | Done: | 243 | Done: |
139 | arch_suspend_enable_irqs(); | 244 | arch_suspend_enable_irqs(); |
@@ -142,8 +247,8 @@ static int suspend_enter(suspend_state_t state) | |||
142 | } | 247 | } |
143 | 248 | ||
144 | /** | 249 | /** |
145 | * suspend_devices_and_enter - suspend devices and enter the desired system sleep | 250 | * suspend_devices_and_enter - suspend devices and enter the desired system |
146 | * state. | 251 | * sleep state. |
147 | * @state: state to enter | 252 | * @state: state to enter |
148 | */ | 253 | */ |
149 | int suspend_devices_and_enter(suspend_state_t state) | 254 | int suspend_devices_and_enter(suspend_state_t state) |
@@ -153,33 +258,45 @@ int suspend_devices_and_enter(suspend_state_t state) | |||
153 | if (!suspend_ops) | 258 | if (!suspend_ops) |
154 | return -ENOSYS; | 259 | return -ENOSYS; |
155 | 260 | ||
156 | if (suspend_ops->set_target) { | 261 | if (suspend_ops->begin) { |
157 | error = suspend_ops->set_target(state); | 262 | error = suspend_ops->begin(state); |
158 | if (error) | 263 | if (error) |
159 | return error; | 264 | goto Close; |
160 | } | 265 | } |
161 | suspend_console(); | 266 | suspend_console(); |
162 | error = device_suspend(PMSG_SUSPEND); | 267 | error = device_suspend(PMSG_SUSPEND); |
163 | if (error) { | 268 | if (error) { |
164 | printk(KERN_ERR "Some devices failed to suspend\n"); | 269 | printk(KERN_ERR "PM: Some devices failed to suspend\n"); |
165 | goto Resume_console; | 270 | goto Resume_console; |
166 | } | 271 | } |
272 | |||
273 | if (suspend_test(TEST_DEVICES)) | ||
274 | goto Resume_devices; | ||
275 | |||
167 | if (suspend_ops->prepare) { | 276 | if (suspend_ops->prepare) { |
168 | error = suspend_ops->prepare(); | 277 | error = suspend_ops->prepare(); |
169 | if (error) | 278 | if (error) |
170 | goto Resume_devices; | 279 | goto Resume_devices; |
171 | } | 280 | } |
281 | |||
282 | if (suspend_test(TEST_PLATFORM)) | ||
283 | goto Finish; | ||
284 | |||
172 | error = disable_nonboot_cpus(); | 285 | error = disable_nonboot_cpus(); |
173 | if (!error) | 286 | if (!error && !suspend_test(TEST_CPUS)) |
174 | suspend_enter(state); | 287 | suspend_enter(state); |
175 | 288 | ||
176 | enable_nonboot_cpus(); | 289 | enable_nonboot_cpus(); |
290 | Finish: | ||
177 | if (suspend_ops->finish) | 291 | if (suspend_ops->finish) |
178 | suspend_ops->finish(); | 292 | suspend_ops->finish(); |
179 | Resume_devices: | 293 | Resume_devices: |
180 | device_resume(); | 294 | device_resume(); |
181 | Resume_console: | 295 | Resume_console: |
182 | resume_console(); | 296 | resume_console(); |
297 | Close: | ||
298 | if (suspend_ops->end) | ||
299 | suspend_ops->end(); | ||
183 | return error; | 300 | return error; |
184 | } | 301 | } |
185 | 302 | ||
@@ -191,9 +308,9 @@ int suspend_devices_and_enter(suspend_state_t state) | |||
191 | */ | 308 | */ |
192 | static void suspend_finish(void) | 309 | static void suspend_finish(void) |
193 | { | 310 | { |
194 | thaw_processes(); | 311 | suspend_thaw_processes(); |
195 | pm_restore_console(); | ||
196 | pm_notifier_call_chain(PM_POST_SUSPEND); | 312 | pm_notifier_call_chain(PM_POST_SUSPEND); |
313 | pm_restore_console(); | ||
197 | } | 314 | } |
198 | 315 | ||
199 | 316 | ||
@@ -235,17 +352,22 @@ static int enter_state(suspend_state_t state) | |||
235 | if (!mutex_trylock(&pm_mutex)) | 352 | if (!mutex_trylock(&pm_mutex)) |
236 | return -EBUSY; | 353 | return -EBUSY; |
237 | 354 | ||
238 | printk("Syncing filesystems ... "); | 355 | printk(KERN_INFO "PM: Syncing filesystems ... "); |
239 | sys_sync(); | 356 | sys_sync(); |
240 | printk("done.\n"); | 357 | printk("done.\n"); |
241 | 358 | ||
242 | pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]); | 359 | pr_debug("PM: Preparing system for %s sleep\n", pm_states[state]); |
243 | if ((error = suspend_prepare())) | 360 | error = suspend_prepare(); |
361 | if (error) | ||
244 | goto Unlock; | 362 | goto Unlock; |
245 | 363 | ||
364 | if (suspend_test(TEST_FREEZER)) | ||
365 | goto Finish; | ||
366 | |||
246 | pr_debug("PM: Entering %s sleep\n", pm_states[state]); | 367 | pr_debug("PM: Entering %s sleep\n", pm_states[state]); |
247 | error = suspend_devices_and_enter(state); | 368 | error = suspend_devices_and_enter(state); |
248 | 369 | ||
370 | Finish: | ||
249 | pr_debug("PM: Finishing wakeup.\n"); | 371 | pr_debug("PM: Finishing wakeup.\n"); |
250 | suspend_finish(); | 372 | suspend_finish(); |
251 | Unlock: | 373 | Unlock: |
@@ -273,8 +395,7 @@ EXPORT_SYMBOL(pm_suspend); | |||
273 | 395 | ||
274 | #endif /* CONFIG_SUSPEND */ | 396 | #endif /* CONFIG_SUSPEND */ |
275 | 397 | ||
276 | decl_subsys(power,NULL,NULL); | 398 | struct kobject *power_kobj; |
277 | |||
278 | 399 | ||
279 | /** | 400 | /** |
280 | * state - control system power state. | 401 | * state - control system power state. |
@@ -287,7 +408,8 @@ decl_subsys(power,NULL,NULL); | |||
287 | * proper enumerated value, and initiates a suspend transition. | 408 | * proper enumerated value, and initiates a suspend transition. |
288 | */ | 409 | */ |
289 | 410 | ||
290 | static ssize_t state_show(struct kset *kset, char *buf) | 411 | static ssize_t state_show(struct kobject *kobj, struct kobj_attribute *attr, |
412 | char *buf) | ||
291 | { | 413 | { |
292 | char *s = buf; | 414 | char *s = buf; |
293 | #ifdef CONFIG_SUSPEND | 415 | #ifdef CONFIG_SUSPEND |
@@ -308,7 +430,8 @@ static ssize_t state_show(struct kset *kset, char *buf) | |||
308 | return (s - buf); | 430 | return (s - buf); |
309 | } | 431 | } |
310 | 432 | ||
311 | static ssize_t state_store(struct kset *kset, const char *buf, size_t n) | 433 | static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr, |
434 | const char *buf, size_t n) | ||
312 | { | 435 | { |
313 | #ifdef CONFIG_SUSPEND | 436 | #ifdef CONFIG_SUSPEND |
314 | suspend_state_t state = PM_SUSPEND_STANDBY; | 437 | suspend_state_t state = PM_SUSPEND_STANDBY; |
@@ -345,13 +468,15 @@ power_attr(state); | |||
345 | #ifdef CONFIG_PM_TRACE | 468 | #ifdef CONFIG_PM_TRACE |
346 | int pm_trace_enabled; | 469 | int pm_trace_enabled; |
347 | 470 | ||
348 | static ssize_t pm_trace_show(struct kset *kset, char *buf) | 471 | static ssize_t pm_trace_show(struct kobject *kobj, struct kobj_attribute *attr, |
472 | char *buf) | ||
349 | { | 473 | { |
350 | return sprintf(buf, "%d\n", pm_trace_enabled); | 474 | return sprintf(buf, "%d\n", pm_trace_enabled); |
351 | } | 475 | } |
352 | 476 | ||
353 | static ssize_t | 477 | static ssize_t |
354 | pm_trace_store(struct kset *kset, const char *buf, size_t n) | 478 | pm_trace_store(struct kobject *kobj, struct kobj_attribute *attr, |
479 | const char *buf, size_t n) | ||
355 | { | 480 | { |
356 | int val; | 481 | int val; |
357 | 482 | ||
@@ -363,18 +488,18 @@ pm_trace_store(struct kset *kset, const char *buf, size_t n) | |||
363 | } | 488 | } |
364 | 489 | ||
365 | power_attr(pm_trace); | 490 | power_attr(pm_trace); |
491 | #endif /* CONFIG_PM_TRACE */ | ||
366 | 492 | ||
367 | static struct attribute * g[] = { | 493 | static struct attribute * g[] = { |
368 | &state_attr.attr, | 494 | &state_attr.attr, |
495 | #ifdef CONFIG_PM_TRACE | ||
369 | &pm_trace_attr.attr, | 496 | &pm_trace_attr.attr, |
497 | #endif | ||
498 | #if defined(CONFIG_PM_SLEEP) && defined(CONFIG_PM_DEBUG) | ||
499 | &pm_test_attr.attr, | ||
500 | #endif | ||
370 | NULL, | 501 | NULL, |
371 | }; | 502 | }; |
372 | #else | ||
373 | static struct attribute * g[] = { | ||
374 | &state_attr.attr, | ||
375 | NULL, | ||
376 | }; | ||
377 | #endif /* CONFIG_PM_TRACE */ | ||
378 | 503 | ||
379 | static struct attribute_group attr_group = { | 504 | static struct attribute_group attr_group = { |
380 | .attrs = g, | 505 | .attrs = g, |
@@ -383,10 +508,10 @@ static struct attribute_group attr_group = { | |||
383 | 508 | ||
384 | static int __init pm_init(void) | 509 | static int __init pm_init(void) |
385 | { | 510 | { |
386 | int error = subsystem_register(&power_subsys); | 511 | power_kobj = kobject_create_and_add("power", NULL); |
387 | if (!error) | 512 | if (!power_kobj) |
388 | error = sysfs_create_group(&power_subsys.kobj,&attr_group); | 513 | return -ENOMEM; |
389 | return error; | 514 | return sysfs_create_group(power_kobj, &attr_group); |
390 | } | 515 | } |
391 | 516 | ||
392 | core_initcall(pm_init); | 517 | core_initcall(pm_init); |
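main.c now owns the PM-transition notifier chain behind CONFIG_PM_SLEEP and exports register_pm_notifier()/unregister_pm_notifier(), while the new pm_test attribute (surfacing as /sys/power/pm_test when CONFIG_PM_DEBUG is set) lets a suspend be cut short at the freezer, devices, platform, processors or core stage. A small consumer sketch of the notifier interface; the "demo" names are illustrative:

#include <linux/init.h>
#include <linux/notifier.h>
#include <linux/suspend.h>

static int demo_pm_notify(struct notifier_block *nb, unsigned long event,
			  void *unused)
{
	switch (event) {
	case PM_SUSPEND_PREPARE:
	case PM_HIBERNATION_PREPARE:
		/* quiesce the driver before tasks are frozen */
		break;
	case PM_POST_SUSPEND:
	case PM_POST_HIBERNATION:
		/* resume normal operation after thaw */
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block demo_pm_nb = {
	.notifier_call = demo_pm_notify,
};

static int __init demo_init(void)
{
	return register_pm_notifier(&demo_pm_nb);
}

static void __exit demo_exit(void)
{
	unregister_pm_notifier(&demo_pm_nb);
}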
diff --git a/kernel/power/pm.c b/kernel/power/pm.c index c50d15266c10..60c73fa670d5 100644 --- a/kernel/power/pm.c +++ b/kernel/power/pm.c | |||
@@ -27,8 +27,6 @@ | |||
27 | #include <linux/interrupt.h> | 27 | #include <linux/interrupt.h> |
28 | #include <linux/mutex.h> | 28 | #include <linux/mutex.h> |
29 | 29 | ||
30 | int pm_active; | ||
31 | |||
32 | /* | 30 | /* |
33 | * Locking notes: | 31 | * Locking notes: |
34 | * pm_devs_lock can be a semaphore providing pm ops are not called | 32 | * pm_devs_lock can be a semaphore providing pm ops are not called |
@@ -204,6 +202,4 @@ int pm_send_all(pm_request_t rqst, void *data) | |||
204 | 202 | ||
205 | EXPORT_SYMBOL(pm_register); | 203 | EXPORT_SYMBOL(pm_register); |
206 | EXPORT_SYMBOL(pm_send_all); | 204 | EXPORT_SYMBOL(pm_send_all); |
207 | EXPORT_SYMBOL(pm_active); | ||
208 | |||
209 | 205 | ||
diff --git a/kernel/power/power.h b/kernel/power/power.h index 195dc4611764..700f44ec8406 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h | |||
@@ -1,5 +1,7 @@ | |||
1 | #include <linux/suspend.h> | 1 | #include <linux/suspend.h> |
2 | #include <linux/suspend_ioctls.h> | ||
2 | #include <linux/utsname.h> | 3 | #include <linux/utsname.h> |
4 | #include <linux/freezer.h> | ||
3 | 5 | ||
4 | struct swsusp_info { | 6 | struct swsusp_info { |
5 | struct new_utsname uts; | 7 | struct new_utsname uts; |
@@ -54,7 +56,7 @@ extern int pfn_is_nosave(unsigned long); | |||
54 | extern struct mutex pm_mutex; | 56 | extern struct mutex pm_mutex; |
55 | 57 | ||
56 | #define power_attr(_name) \ | 58 | #define power_attr(_name) \ |
57 | static struct subsys_attribute _name##_attr = { \ | 59 | static struct kobj_attribute _name##_attr = { \ |
58 | .attr = { \ | 60 | .attr = { \ |
59 | .name = __stringify(_name), \ | 61 | .name = __stringify(_name), \ |
60 | .mode = 0644, \ | 62 | .mode = 0644, \ |
@@ -63,8 +65,6 @@ static struct subsys_attribute _name##_attr = { \ | |||
63 | .store = _name##_store, \ | 65 | .store = _name##_store, \ |
64 | } | 66 | } |
65 | 67 | ||
66 | extern struct kset power_subsys; | ||
67 | |||
68 | /* Preferred image size in bytes (default 500 MB) */ | 68 | /* Preferred image size in bytes (default 500 MB) */ |
69 | extern unsigned long image_size; | 69 | extern unsigned long image_size; |
70 | extern int in_suspend; | 70 | extern int in_suspend; |
@@ -130,42 +130,12 @@ struct snapshot_handle { | |||
130 | #define data_of(handle) ((handle).buffer + (handle).buf_offset) | 130 | #define data_of(handle) ((handle).buffer + (handle).buf_offset) |
131 | 131 | ||
132 | extern unsigned int snapshot_additional_pages(struct zone *zone); | 132 | extern unsigned int snapshot_additional_pages(struct zone *zone); |
133 | extern unsigned long snapshot_get_image_size(void); | ||
133 | extern int snapshot_read_next(struct snapshot_handle *handle, size_t count); | 134 | extern int snapshot_read_next(struct snapshot_handle *handle, size_t count); |
134 | extern int snapshot_write_next(struct snapshot_handle *handle, size_t count); | 135 | extern int snapshot_write_next(struct snapshot_handle *handle, size_t count); |
135 | extern void snapshot_write_finalize(struct snapshot_handle *handle); | 136 | extern void snapshot_write_finalize(struct snapshot_handle *handle); |
136 | extern int snapshot_image_loaded(struct snapshot_handle *handle); | 137 | extern int snapshot_image_loaded(struct snapshot_handle *handle); |
137 | 138 | ||
138 | /* | ||
139 | * This structure is used to pass the values needed for the identification | ||
140 | * of the resume swap area from a user space to the kernel via the | ||
141 | * SNAPSHOT_SET_SWAP_AREA ioctl | ||
142 | */ | ||
143 | struct resume_swap_area { | ||
144 | loff_t offset; | ||
145 | u_int32_t dev; | ||
146 | } __attribute__((packed)); | ||
147 | |||
148 | #define SNAPSHOT_IOC_MAGIC '3' | ||
149 | #define SNAPSHOT_FREEZE _IO(SNAPSHOT_IOC_MAGIC, 1) | ||
150 | #define SNAPSHOT_UNFREEZE _IO(SNAPSHOT_IOC_MAGIC, 2) | ||
151 | #define SNAPSHOT_ATOMIC_SNAPSHOT _IOW(SNAPSHOT_IOC_MAGIC, 3, void *) | ||
152 | #define SNAPSHOT_ATOMIC_RESTORE _IO(SNAPSHOT_IOC_MAGIC, 4) | ||
153 | #define SNAPSHOT_FREE _IO(SNAPSHOT_IOC_MAGIC, 5) | ||
154 | #define SNAPSHOT_SET_IMAGE_SIZE _IOW(SNAPSHOT_IOC_MAGIC, 6, unsigned long) | ||
155 | #define SNAPSHOT_AVAIL_SWAP _IOR(SNAPSHOT_IOC_MAGIC, 7, void *) | ||
156 | #define SNAPSHOT_GET_SWAP_PAGE _IOR(SNAPSHOT_IOC_MAGIC, 8, void *) | ||
157 | #define SNAPSHOT_FREE_SWAP_PAGES _IO(SNAPSHOT_IOC_MAGIC, 9) | ||
158 | #define SNAPSHOT_SET_SWAP_FILE _IOW(SNAPSHOT_IOC_MAGIC, 10, unsigned int) | ||
159 | #define SNAPSHOT_S2RAM _IO(SNAPSHOT_IOC_MAGIC, 11) | ||
160 | #define SNAPSHOT_PMOPS _IOW(SNAPSHOT_IOC_MAGIC, 12, unsigned int) | ||
161 | #define SNAPSHOT_SET_SWAP_AREA _IOW(SNAPSHOT_IOC_MAGIC, 13, \ | ||
162 | struct resume_swap_area) | ||
163 | #define SNAPSHOT_IOC_MAXNR 13 | ||
164 | |||
165 | #define PMOPS_PREPARE 1 | ||
166 | #define PMOPS_ENTER 2 | ||
167 | #define PMOPS_FINISH 3 | ||
168 | |||
169 | /* If unset, the snapshot device cannot be open. */ | 139 | /* If unset, the snapshot device cannot be open. */ |
170 | extern atomic_t snapshot_device_available; | 140 | extern atomic_t snapshot_device_available; |
171 | 141 | ||
@@ -183,7 +153,6 @@ extern int swsusp_swap_in_use(void); | |||
183 | extern int swsusp_check(void); | 153 | extern int swsusp_check(void); |
184 | extern int swsusp_shrink_memory(void); | 154 | extern int swsusp_shrink_memory(void); |
185 | extern void swsusp_free(void); | 155 | extern void swsusp_free(void); |
186 | extern int swsusp_resume(void); | ||
187 | extern int swsusp_read(unsigned int *flags_p); | 156 | extern int swsusp_read(unsigned int *flags_p); |
188 | extern int swsusp_write(unsigned int flags); | 157 | extern int swsusp_write(unsigned int flags); |
189 | extern void swsusp_close(void); | 158 | extern void swsusp_close(void); |
@@ -203,11 +172,56 @@ static inline int suspend_devices_and_enter(suspend_state_t state) | |||
203 | } | 172 | } |
204 | #endif /* !CONFIG_SUSPEND */ | 173 | #endif /* !CONFIG_SUSPEND */ |
205 | 174 | ||
206 | /* kernel/power/common.c */ | 175 | #ifdef CONFIG_PM_SLEEP |
207 | extern struct blocking_notifier_head pm_chain_head; | 176 | /* kernel/power/main.c */ |
177 | extern int pm_notifier_call_chain(unsigned long val); | ||
178 | #endif | ||
179 | |||
180 | #ifdef CONFIG_HIGHMEM | ||
181 | unsigned int count_highmem_pages(void); | ||
182 | int restore_highmem(void); | ||
183 | #else | ||
184 | static inline unsigned int count_highmem_pages(void) { return 0; } | ||
185 | static inline int restore_highmem(void) { return 0; } | ||
186 | #endif | ||
187 | |||
188 | /* | ||
189 | * Suspend test levels | ||
190 | */ | ||
191 | enum { | ||
192 | /* keep first */ | ||
193 | TEST_NONE, | ||
194 | TEST_CORE, | ||
195 | TEST_CPUS, | ||
196 | TEST_PLATFORM, | ||
197 | TEST_DEVICES, | ||
198 | TEST_FREEZER, | ||
199 | /* keep last */ | ||
200 | __TEST_AFTER_LAST | ||
201 | }; | ||
208 | 202 | ||
209 | static inline int pm_notifier_call_chain(unsigned long val) | 203 | #define TEST_FIRST TEST_NONE |
204 | #define TEST_MAX (__TEST_AFTER_LAST - 1) | ||
205 | |||
206 | extern int pm_test_level; | ||
207 | |||
208 | #ifdef CONFIG_SUSPEND_FREEZER | ||
209 | static inline int suspend_freeze_processes(void) | ||
210 | { | ||
211 | return freeze_processes(); | ||
212 | } | ||
213 | |||
214 | static inline void suspend_thaw_processes(void) | ||
210 | { | 215 | { |
211 | return (blocking_notifier_call_chain(&pm_chain_head, val, NULL) | 216 | thaw_processes(); |
212 | == NOTIFY_BAD) ? -EINVAL : 0; | ||
213 | } | 217 | } |
218 | #else | ||
219 | static inline int suspend_freeze_processes(void) | ||
220 | { | ||
221 | return 0; | ||
222 | } | ||
223 | |||
224 | static inline void suspend_thaw_processes(void) | ||
225 | { | ||
226 | } | ||
227 | #endif | ||
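The suspend test levels and freezer wrappers moved into kernel/power/power.h give the suspend core a single integer knob, pm_test_level, bounded by TEST_FIRST and TEST_MAX. A hedged sketch of how such a bounded level would be validated before use (the helper name below is illustrative, not from this patch):

/* illustrative only: accept a test level supplied from user space */
static int pm_test_level_set(int level)
{
        if (level < TEST_FIRST || level > TEST_MAX)
                return -EINVAL;
        pm_test_level = level;
        return 0;
}

The in-tree pm_test attribute builds on the same TEST_FIRST..TEST_MAX range.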
diff --git a/kernel/power/process.c b/kernel/power/process.c index 6533923e711b..7c2118f9597f 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c | |||
@@ -86,9 +86,9 @@ static void fake_signal_wake_up(struct task_struct *p, int resume) | |||
86 | 86 | ||
87 | static void send_fake_signal(struct task_struct *p) | 87 | static void send_fake_signal(struct task_struct *p) |
88 | { | 88 | { |
89 | if (p->state == TASK_STOPPED) | 89 | if (task_is_stopped(p)) |
90 | force_sig_specific(SIGSTOP, p); | 90 | force_sig_specific(SIGSTOP, p); |
91 | fake_signal_wake_up(p, p->state == TASK_STOPPED); | 91 | fake_signal_wake_up(p, task_is_stopped(p)); |
92 | } | 92 | } |
93 | 93 | ||
94 | static int has_mm(struct task_struct *p) | 94 | static int has_mm(struct task_struct *p) |
@@ -182,7 +182,7 @@ static int try_to_freeze_tasks(int freeze_user_space) | |||
182 | if (frozen(p) || !freezeable(p)) | 182 | if (frozen(p) || !freezeable(p)) |
183 | continue; | 183 | continue; |
184 | 184 | ||
185 | if (p->state == TASK_TRACED && frozen(p->parent)) { | 185 | if (task_is_traced(p) && frozen(p->parent)) { |
186 | cancel_freezing(p); | 186 | cancel_freezing(p); |
187 | continue; | 187 | continue; |
188 | } | 188 | } |
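kernel/power/process.c (and ptrace.c further down) stop comparing ->state for equality and use the new task_is_stopped()/task_is_traced() predicates instead. The point is that the state word is becoming a bit mask, so extra bits set alongside the stopped or traced flag would make an == test miss the task; the helpers test only the relevant bit. Roughly (simplified, not the exact in-tree definitions):

#define task_is_stopped(task)   ((task)->state & __TASK_STOPPED)
#define task_is_traced(task)    ((task)->state & __TASK_TRACED)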
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 78039b477d2b..f6a5df934f8d 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c | |||
@@ -635,7 +635,7 @@ __register_nosave_region(unsigned long start_pfn, unsigned long end_pfn, | |||
635 | region->end_pfn = end_pfn; | 635 | region->end_pfn = end_pfn; |
636 | list_add_tail(®ion->list, &nosave_regions); | 636 | list_add_tail(®ion->list, &nosave_regions); |
637 | Report: | 637 | Report: |
638 | printk("swsusp: Registered nosave memory region: %016lx - %016lx\n", | 638 | printk(KERN_INFO "PM: Registered nosave memory: %016lx - %016lx\n", |
639 | start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT); | 639 | start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT); |
640 | } | 640 | } |
641 | 641 | ||
@@ -704,7 +704,7 @@ static void mark_nosave_pages(struct memory_bitmap *bm) | |||
704 | list_for_each_entry(region, &nosave_regions, list) { | 704 | list_for_each_entry(region, &nosave_regions, list) { |
705 | unsigned long pfn; | 705 | unsigned long pfn; |
706 | 706 | ||
707 | printk("swsusp: Marking nosave pages: %016lx - %016lx\n", | 707 | pr_debug("PM: Marking nosave pages: %016lx - %016lx\n", |
708 | region->start_pfn << PAGE_SHIFT, | 708 | region->start_pfn << PAGE_SHIFT, |
709 | region->end_pfn << PAGE_SHIFT); | 709 | region->end_pfn << PAGE_SHIFT); |
710 | 710 | ||
@@ -749,7 +749,7 @@ int create_basic_memory_bitmaps(void) | |||
749 | free_pages_map = bm2; | 749 | free_pages_map = bm2; |
750 | mark_nosave_pages(forbidden_pages_map); | 750 | mark_nosave_pages(forbidden_pages_map); |
751 | 751 | ||
752 | printk("swsusp: Basic memory bitmaps created\n"); | 752 | pr_debug("PM: Basic memory bitmaps created\n"); |
753 | 753 | ||
754 | return 0; | 754 | return 0; |
755 | 755 | ||
@@ -784,7 +784,7 @@ void free_basic_memory_bitmaps(void) | |||
784 | memory_bm_free(bm2, PG_UNSAFE_CLEAR); | 784 | memory_bm_free(bm2, PG_UNSAFE_CLEAR); |
785 | kfree(bm2); | 785 | kfree(bm2); |
786 | 786 | ||
787 | printk("swsusp: Basic memory bitmaps freed\n"); | 787 | pr_debug("PM: Basic memory bitmaps freed\n"); |
788 | } | 788 | } |
789 | 789 | ||
790 | /** | 790 | /** |
@@ -872,7 +872,6 @@ unsigned int count_highmem_pages(void) | |||
872 | } | 872 | } |
873 | #else | 873 | #else |
874 | static inline void *saveable_highmem_page(unsigned long pfn) { return NULL; } | 874 | static inline void *saveable_highmem_page(unsigned long pfn) { return NULL; } |
875 | static inline unsigned int count_highmem_pages(void) { return 0; } | ||
876 | #endif /* CONFIG_HIGHMEM */ | 875 | #endif /* CONFIG_HIGHMEM */ |
877 | 876 | ||
878 | /** | 877 | /** |
@@ -1089,7 +1088,7 @@ static int enough_free_mem(unsigned int nr_pages, unsigned int nr_highmem) | |||
1089 | } | 1088 | } |
1090 | 1089 | ||
1091 | nr_pages += count_pages_for_highmem(nr_highmem); | 1090 | nr_pages += count_pages_for_highmem(nr_highmem); |
1092 | pr_debug("swsusp: Normal pages needed: %u + %u + %u, available pages: %u\n", | 1091 | pr_debug("PM: Normal pages needed: %u + %u + %u, available pages: %u\n", |
1093 | nr_pages, PAGES_FOR_IO, meta, free); | 1092 | nr_pages, PAGES_FOR_IO, meta, free); |
1094 | 1093 | ||
1095 | return free > nr_pages + PAGES_FOR_IO + meta; | 1094 | return free > nr_pages + PAGES_FOR_IO + meta; |
@@ -1202,20 +1201,20 @@ asmlinkage int swsusp_save(void) | |||
1202 | { | 1201 | { |
1203 | unsigned int nr_pages, nr_highmem; | 1202 | unsigned int nr_pages, nr_highmem; |
1204 | 1203 | ||
1205 | printk("swsusp: critical section: \n"); | 1204 | printk(KERN_INFO "PM: Creating hibernation image: \n"); |
1206 | 1205 | ||
1207 | drain_local_pages(); | 1206 | drain_local_pages(); |
1208 | nr_pages = count_data_pages(); | 1207 | nr_pages = count_data_pages(); |
1209 | nr_highmem = count_highmem_pages(); | 1208 | nr_highmem = count_highmem_pages(); |
1210 | printk("swsusp: Need to copy %u pages\n", nr_pages + nr_highmem); | 1209 | printk(KERN_INFO "PM: Need to copy %u pages\n", nr_pages + nr_highmem); |
1211 | 1210 | ||
1212 | if (!enough_free_mem(nr_pages, nr_highmem)) { | 1211 | if (!enough_free_mem(nr_pages, nr_highmem)) { |
1213 | printk(KERN_ERR "swsusp: Not enough free memory\n"); | 1212 | printk(KERN_ERR "PM: Not enough free memory\n"); |
1214 | return -ENOMEM; | 1213 | return -ENOMEM; |
1215 | } | 1214 | } |
1216 | 1215 | ||
1217 | if (swsusp_alloc(&orig_bm, ©_bm, nr_pages, nr_highmem)) { | 1216 | if (swsusp_alloc(&orig_bm, ©_bm, nr_pages, nr_highmem)) { |
1218 | printk(KERN_ERR "swsusp: Memory allocation failed\n"); | 1217 | printk(KERN_ERR "PM: Memory allocation failed\n"); |
1219 | return -ENOMEM; | 1218 | return -ENOMEM; |
1220 | } | 1219 | } |
1221 | 1220 | ||
@@ -1235,7 +1234,8 @@ asmlinkage int swsusp_save(void) | |||
1235 | nr_copy_pages = nr_pages; | 1234 | nr_copy_pages = nr_pages; |
1236 | nr_meta_pages = DIV_ROUND_UP(nr_pages * sizeof(long), PAGE_SIZE); | 1235 | nr_meta_pages = DIV_ROUND_UP(nr_pages * sizeof(long), PAGE_SIZE); |
1237 | 1236 | ||
1238 | printk("swsusp: critical section: done (%d pages copied)\n", nr_pages); | 1237 | printk(KERN_INFO "PM: Hibernation image created (%d pages copied)\n", |
1238 | nr_pages); | ||
1239 | 1239 | ||
1240 | return 0; | 1240 | return 0; |
1241 | } | 1241 | } |
@@ -1264,12 +1264,17 @@ static char *check_image_kernel(struct swsusp_info *info) | |||
1264 | } | 1264 | } |
1265 | #endif /* CONFIG_ARCH_HIBERNATION_HEADER */ | 1265 | #endif /* CONFIG_ARCH_HIBERNATION_HEADER */ |
1266 | 1266 | ||
1267 | unsigned long snapshot_get_image_size(void) | ||
1268 | { | ||
1269 | return nr_copy_pages + nr_meta_pages + 1; | ||
1270 | } | ||
1271 | |||
1267 | static int init_header(struct swsusp_info *info) | 1272 | static int init_header(struct swsusp_info *info) |
1268 | { | 1273 | { |
1269 | memset(info, 0, sizeof(struct swsusp_info)); | 1274 | memset(info, 0, sizeof(struct swsusp_info)); |
1270 | info->num_physpages = num_physpages; | 1275 | info->num_physpages = num_physpages; |
1271 | info->image_pages = nr_copy_pages; | 1276 | info->image_pages = nr_copy_pages; |
1272 | info->pages = nr_copy_pages + nr_meta_pages + 1; | 1277 | info->pages = snapshot_get_image_size(); |
1273 | info->size = info->pages; | 1278 | info->size = info->pages; |
1274 | info->size <<= PAGE_SHIFT; | 1279 | info->size <<= PAGE_SHIFT; |
1275 | return init_header_complete(info); | 1280 | return init_header_complete(info); |
@@ -1429,7 +1434,7 @@ static int check_header(struct swsusp_info *info) | |||
1429 | if (!reason && info->num_physpages != num_physpages) | 1434 | if (!reason && info->num_physpages != num_physpages) |
1430 | reason = "memory size"; | 1435 | reason = "memory size"; |
1431 | if (reason) { | 1436 | if (reason) { |
1432 | printk(KERN_ERR "swsusp: Resume mismatch: %s\n", reason); | 1437 | printk(KERN_ERR "PM: Image mismatch: %s\n", reason); |
1433 | return -EPERM; | 1438 | return -EPERM; |
1434 | } | 1439 | } |
1435 | return 0; | 1440 | return 0; |
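The new snapshot_get_image_size() in snapshot.c counts the image in pages: the copied data pages, the metadata (PFN list) pages, plus one page for the swsusp_info header; init_header() and the SNAPSHOT_GET_IMAGE_SIZE ioctl then shift by PAGE_SHIFT to get bytes. A worked example with assumed numbers, on a 64-bit box with 4 KiB pages:

/* nr_copy_pages = 100000, so nr_meta_pages = DIV_ROUND_UP(100000 * 8, 4096) = 196 */
unsigned long pages = 100000 + 196 + 1;            /* 100197 pages */
loff_t bytes = (loff_t)pages << PAGE_SHIFT;        /* 100197 * 4096 = 410406912, roughly 391 MiB */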
diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 917aba100575..a0abf9a463f9 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c | |||
@@ -28,8 +28,6 @@ | |||
28 | 28 | ||
29 | #include "power.h" | 29 | #include "power.h" |
30 | 30 | ||
31 | extern char resume_file[]; | ||
32 | |||
33 | #define SWSUSP_SIG "S1SUSPEND" | 31 | #define SWSUSP_SIG "S1SUSPEND" |
34 | 32 | ||
35 | struct swsusp_header { | 33 | struct swsusp_header { |
@@ -73,7 +71,8 @@ static int submit(int rw, pgoff_t page_off, struct page *page, | |||
73 | bio->bi_end_io = end_swap_bio_read; | 71 | bio->bi_end_io = end_swap_bio_read; |
74 | 72 | ||
75 | if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { | 73 | if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { |
76 | printk("swsusp: ERROR: adding page to bio at %ld\n", page_off); | 74 | printk(KERN_ERR "PM: Adding page to bio failed at %ld\n", |
75 | page_off); | ||
77 | bio_put(bio); | 76 | bio_put(bio); |
78 | return -EFAULT; | 77 | return -EFAULT; |
79 | } | 78 | } |
@@ -153,7 +152,7 @@ static int mark_swapfiles(sector_t start, unsigned int flags) | |||
153 | error = bio_write_page(swsusp_resume_block, | 152 | error = bio_write_page(swsusp_resume_block, |
154 | swsusp_header, NULL); | 153 | swsusp_header, NULL); |
155 | } else { | 154 | } else { |
156 | printk(KERN_ERR "swsusp: Swap header not found!\n"); | 155 | printk(KERN_ERR "PM: Swap header not found!\n"); |
157 | error = -ENODEV; | 156 | error = -ENODEV; |
158 | } | 157 | } |
159 | return error; | 158 | return error; |
@@ -325,7 +324,8 @@ static int save_image(struct swap_map_handle *handle, | |||
325 | struct timeval start; | 324 | struct timeval start; |
326 | struct timeval stop; | 325 | struct timeval stop; |
327 | 326 | ||
328 | printk("Saving image data pages (%u pages) ... ", nr_to_write); | 327 | printk(KERN_INFO "PM: Saving image data pages (%u pages) ... ", |
328 | nr_to_write); | ||
329 | m = nr_to_write / 100; | 329 | m = nr_to_write / 100; |
330 | if (!m) | 330 | if (!m) |
331 | m = 1; | 331 | m = 1; |
@@ -365,7 +365,7 @@ static int enough_swap(unsigned int nr_pages) | |||
365 | { | 365 | { |
366 | unsigned int free_swap = count_swap_pages(root_swap, 1); | 366 | unsigned int free_swap = count_swap_pages(root_swap, 1); |
367 | 367 | ||
368 | pr_debug("swsusp: free swap pages: %u\n", free_swap); | 368 | pr_debug("PM: Free swap pages: %u\n", free_swap); |
369 | return free_swap > nr_pages + PAGES_FOR_IO; | 369 | return free_swap > nr_pages + PAGES_FOR_IO; |
370 | } | 370 | } |
371 | 371 | ||
@@ -388,7 +388,7 @@ int swsusp_write(unsigned int flags) | |||
388 | 388 | ||
389 | error = swsusp_swap_check(); | 389 | error = swsusp_swap_check(); |
390 | if (error) { | 390 | if (error) { |
391 | printk(KERN_ERR "swsusp: Cannot find swap device, try " | 391 | printk(KERN_ERR "PM: Cannot find swap device, try " |
392 | "swapon -a.\n"); | 392 | "swapon -a.\n"); |
393 | return error; | 393 | return error; |
394 | } | 394 | } |
@@ -402,7 +402,7 @@ int swsusp_write(unsigned int flags) | |||
402 | } | 402 | } |
403 | header = (struct swsusp_info *)data_of(snapshot); | 403 | header = (struct swsusp_info *)data_of(snapshot); |
404 | if (!enough_swap(header->pages)) { | 404 | if (!enough_swap(header->pages)) { |
405 | printk(KERN_ERR "swsusp: Not enough free swap\n"); | 405 | printk(KERN_ERR "PM: Not enough free swap\n"); |
406 | error = -ENOSPC; | 406 | error = -ENOSPC; |
407 | goto out; | 407 | goto out; |
408 | } | 408 | } |
@@ -417,7 +417,7 @@ int swsusp_write(unsigned int flags) | |||
417 | 417 | ||
418 | if (!error) { | 418 | if (!error) { |
419 | flush_swap_writer(&handle); | 419 | flush_swap_writer(&handle); |
420 | printk("S"); | 420 | printk(KERN_INFO "PM: S"); |
421 | error = mark_swapfiles(start, flags); | 421 | error = mark_swapfiles(start, flags); |
422 | printk("|\n"); | 422 | printk("|\n"); |
423 | } | 423 | } |
@@ -507,7 +507,8 @@ static int load_image(struct swap_map_handle *handle, | |||
507 | int err2; | 507 | int err2; |
508 | unsigned nr_pages; | 508 | unsigned nr_pages; |
509 | 509 | ||
510 | printk("Loading image data pages (%u pages) ... ", nr_to_read); | 510 | printk(KERN_INFO "PM: Loading image data pages (%u pages) ... ", |
511 | nr_to_read); | ||
511 | m = nr_to_read / 100; | 512 | m = nr_to_read / 100; |
512 | if (!m) | 513 | if (!m) |
513 | m = 1; | 514 | m = 1; |
@@ -558,7 +559,7 @@ int swsusp_read(unsigned int *flags_p) | |||
558 | 559 | ||
559 | *flags_p = swsusp_header->flags; | 560 | *flags_p = swsusp_header->flags; |
560 | if (IS_ERR(resume_bdev)) { | 561 | if (IS_ERR(resume_bdev)) { |
561 | pr_debug("swsusp: block device not initialised\n"); | 562 | pr_debug("PM: Image device not initialised\n"); |
562 | return PTR_ERR(resume_bdev); | 563 | return PTR_ERR(resume_bdev); |
563 | } | 564 | } |
564 | 565 | ||
@@ -577,9 +578,9 @@ int swsusp_read(unsigned int *flags_p) | |||
577 | blkdev_put(resume_bdev); | 578 | blkdev_put(resume_bdev); |
578 | 579 | ||
579 | if (!error) | 580 | if (!error) |
580 | pr_debug("swsusp: Reading resume file was successful\n"); | 581 | pr_debug("PM: Image successfully loaded\n"); |
581 | else | 582 | else |
582 | pr_debug("swsusp: Error %d resuming\n", error); | 583 | pr_debug("PM: Error %d resuming\n", error); |
583 | return error; | 584 | return error; |
584 | } | 585 | } |
585 | 586 | ||
@@ -611,13 +612,13 @@ int swsusp_check(void) | |||
611 | if (error) | 612 | if (error) |
612 | blkdev_put(resume_bdev); | 613 | blkdev_put(resume_bdev); |
613 | else | 614 | else |
614 | pr_debug("swsusp: Signature found, resuming\n"); | 615 | pr_debug("PM: Signature found, resuming\n"); |
615 | } else { | 616 | } else { |
616 | error = PTR_ERR(resume_bdev); | 617 | error = PTR_ERR(resume_bdev); |
617 | } | 618 | } |
618 | 619 | ||
619 | if (error) | 620 | if (error) |
620 | pr_debug("swsusp: Error %d check for resume file\n", error); | 621 | pr_debug("PM: Error %d checking image file\n", error); |
621 | 622 | ||
622 | return error; | 623 | return error; |
623 | } | 624 | } |
@@ -629,7 +630,7 @@ int swsusp_check(void) | |||
629 | void swsusp_close(void) | 630 | void swsusp_close(void) |
630 | { | 631 | { |
631 | if (IS_ERR(resume_bdev)) { | 632 | if (IS_ERR(resume_bdev)) { |
632 | pr_debug("swsusp: block device not initialised\n"); | 633 | pr_debug("PM: Image device not initialised\n"); |
633 | return; | 634 | return; |
634 | } | 635 | } |
635 | 636 | ||
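The message changes in swap.c follow one rule: every printk gets an explicit loglevel plus the "PM: " prefix, so nothing falls back to the default message loglevel, and verbose progress chatter moves to pr_debug(), which compiles away unless the file defines DEBUG. For example:

printk(KERN_ERR "PM: Swap header not found!\n");   /* always reaches the log */
pr_debug("PM: Free swap pages: %u\n", free_swap);  /* emitted only in a DEBUG build of this file */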
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c index e1722d3155f1..023ff2a31d89 100644 --- a/kernel/power/swsusp.c +++ b/kernel/power/swsusp.c | |||
@@ -64,14 +64,6 @@ unsigned long image_size = 500 * 1024 * 1024; | |||
64 | 64 | ||
65 | int in_suspend __nosavedata = 0; | 65 | int in_suspend __nosavedata = 0; |
66 | 66 | ||
67 | #ifdef CONFIG_HIGHMEM | ||
68 | unsigned int count_highmem_pages(void); | ||
69 | int restore_highmem(void); | ||
70 | #else | ||
71 | static inline int restore_highmem(void) { return 0; } | ||
72 | static inline unsigned int count_highmem_pages(void) { return 0; } | ||
73 | #endif | ||
74 | |||
75 | /** | 67 | /** |
76 | * The following functions are used for tracing the allocated | 68 | * The following functions are used for tracing the allocated |
77 | * swap pages, so that they can be freed in case of an error. | 69 | * swap pages, so that they can be freed in case of an error. |
@@ -196,7 +188,8 @@ void swsusp_show_speed(struct timeval *start, struct timeval *stop, | |||
196 | centisecs = 1; /* avoid div-by-zero */ | 188 | centisecs = 1; /* avoid div-by-zero */ |
197 | k = nr_pages * (PAGE_SIZE / 1024); | 189 | k = nr_pages * (PAGE_SIZE / 1024); |
198 | kps = (k * 100) / centisecs; | 190 | kps = (k * 100) / centisecs; |
199 | printk("%s %d kbytes in %d.%02d seconds (%d.%02d MB/s)\n", msg, k, | 191 | printk(KERN_INFO "PM: %s %d kbytes in %d.%02d seconds (%d.%02d MB/s)\n", |
192 | msg, k, | ||
200 | centisecs / 100, centisecs % 100, | 193 | centisecs / 100, centisecs % 100, |
201 | kps / 1000, (kps % 1000) / 10); | 194 | kps / 1000, (kps % 1000) / 10); |
202 | } | 195 | } |
@@ -227,7 +220,7 @@ int swsusp_shrink_memory(void) | |||
227 | char *p = "-\\|/"; | 220 | char *p = "-\\|/"; |
228 | struct timeval start, stop; | 221 | struct timeval start, stop; |
229 | 222 | ||
230 | printk("Shrinking memory... "); | 223 | printk(KERN_INFO "PM: Shrinking memory... "); |
231 | do_gettimeofday(&start); | 224 | do_gettimeofday(&start); |
232 | do { | 225 | do { |
233 | long size, highmem_size; | 226 | long size, highmem_size; |
@@ -269,38 +262,3 @@ int swsusp_shrink_memory(void) | |||
269 | 262 | ||
270 | return 0; | 263 | return 0; |
271 | } | 264 | } |
272 | |||
273 | int swsusp_resume(void) | ||
274 | { | ||
275 | int error; | ||
276 | |||
277 | local_irq_disable(); | ||
278 | /* NOTE: device_power_down() is just a suspend() with irqs off; | ||
279 | * it has no special "power things down" semantics | ||
280 | */ | ||
281 | if (device_power_down(PMSG_PRETHAW)) | ||
282 | printk(KERN_ERR "Some devices failed to power down, very bad\n"); | ||
283 | /* We'll ignore saved state, but this gets preempt count (etc) right */ | ||
284 | save_processor_state(); | ||
285 | error = restore_highmem(); | ||
286 | if (!error) { | ||
287 | error = swsusp_arch_resume(); | ||
288 | /* The code below is only ever reached in case of a failure. | ||
289 | * Otherwise execution continues at place where | ||
290 | * swsusp_arch_suspend() was called | ||
291 | */ | ||
292 | BUG_ON(!error); | ||
293 | /* This call to restore_highmem() undos the previous one */ | ||
294 | restore_highmem(); | ||
295 | } | ||
296 | /* The only reason why swsusp_arch_resume() can fail is memory being | ||
297 | * very tight, so we have to free it as soon as we can to avoid | ||
298 | * subsequent failures | ||
299 | */ | ||
300 | swsusp_free(); | ||
301 | restore_processor_state(); | ||
302 | touch_softlockup_watchdog(); | ||
303 | device_power_up(); | ||
304 | local_irq_enable(); | ||
305 | return error; | ||
306 | } | ||
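swsusp_show_speed() in swsusp.c works in fixed point: kilobytes moved times 100, divided by elapsed centiseconds, then printed as MB/s with two decimals. A quick check of the formula with assumed inputs and 4 KiB pages:

/* 32768 pages in 2.50 s: k = 32768 * (4096 / 1024) = 131072 KiB, centisecs = 250 */
int kps = (131072 * 100) / 250;    /* 52428 KiB/s */
/* printed via "%d.%02d MB/s" as 52428 / 1000 = 52 and (52428 % 1000) / 10 = 42, i.e. 52.42 MB/s */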
diff --git a/kernel/power/user.c b/kernel/power/user.c index 5bd321bcbb75..f5512cb3aa86 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c | |||
@@ -28,6 +28,29 @@ | |||
28 | 28 | ||
29 | #include "power.h" | 29 | #include "power.h" |
30 | 30 | ||
31 | /* | ||
32 | * NOTE: The SNAPSHOT_SET_SWAP_FILE and SNAPSHOT_PMOPS ioctls are obsolete and | ||
33 | * will be removed in the future. They are only preserved here for | ||
34 | * compatibility with existing userland utilities. | ||
35 | */ | ||
36 | #define SNAPSHOT_SET_SWAP_FILE _IOW(SNAPSHOT_IOC_MAGIC, 10, unsigned int) | ||
37 | #define SNAPSHOT_PMOPS _IOW(SNAPSHOT_IOC_MAGIC, 12, unsigned int) | ||
38 | |||
39 | #define PMOPS_PREPARE 1 | ||
40 | #define PMOPS_ENTER 2 | ||
41 | #define PMOPS_FINISH 3 | ||
42 | |||
43 | /* | ||
44 | * NOTE: The following ioctl definitions are wrong and have been replaced with | ||
45 | * correct ones. They are only preserved here for compatibility with existing | ||
46 | * userland utilities and will be removed in the future. | ||
47 | */ | ||
48 | #define SNAPSHOT_ATOMIC_SNAPSHOT _IOW(SNAPSHOT_IOC_MAGIC, 3, void *) | ||
49 | #define SNAPSHOT_SET_IMAGE_SIZE _IOW(SNAPSHOT_IOC_MAGIC, 6, unsigned long) | ||
50 | #define SNAPSHOT_AVAIL_SWAP _IOR(SNAPSHOT_IOC_MAGIC, 7, void *) | ||
51 | #define SNAPSHOT_GET_SWAP_PAGE _IOR(SNAPSHOT_IOC_MAGIC, 8, void *) | ||
52 | |||
53 | |||
31 | #define SNAPSHOT_MINOR 231 | 54 | #define SNAPSHOT_MINOR 231 |
32 | 55 | ||
33 | static struct snapshot_data { | 56 | static struct snapshot_data { |
@@ -36,7 +59,7 @@ static struct snapshot_data { | |||
36 | int mode; | 59 | int mode; |
37 | char frozen; | 60 | char frozen; |
38 | char ready; | 61 | char ready; |
39 | char platform_suspend; | 62 | char platform_support; |
40 | } snapshot_state; | 63 | } snapshot_state; |
41 | 64 | ||
42 | atomic_t snapshot_device_available = ATOMIC_INIT(1); | 65 | atomic_t snapshot_device_available = ATOMIC_INIT(1); |
@@ -44,6 +67,7 @@ atomic_t snapshot_device_available = ATOMIC_INIT(1); | |||
44 | static int snapshot_open(struct inode *inode, struct file *filp) | 67 | static int snapshot_open(struct inode *inode, struct file *filp) |
45 | { | 68 | { |
46 | struct snapshot_data *data; | 69 | struct snapshot_data *data; |
70 | int error; | ||
47 | 71 | ||
48 | if (!atomic_add_unless(&snapshot_device_available, -1, 0)) | 72 | if (!atomic_add_unless(&snapshot_device_available, -1, 0)) |
49 | return -EBUSY; | 73 | return -EBUSY; |
@@ -64,13 +88,23 @@ static int snapshot_open(struct inode *inode, struct file *filp) | |||
64 | data->swap = swsusp_resume_device ? | 88 | data->swap = swsusp_resume_device ? |
65 | swap_type_of(swsusp_resume_device, 0, NULL) : -1; | 89 | swap_type_of(swsusp_resume_device, 0, NULL) : -1; |
66 | data->mode = O_RDONLY; | 90 | data->mode = O_RDONLY; |
91 | error = pm_notifier_call_chain(PM_RESTORE_PREPARE); | ||
92 | if (error) | ||
93 | pm_notifier_call_chain(PM_POST_RESTORE); | ||
67 | } else { | 94 | } else { |
68 | data->swap = -1; | 95 | data->swap = -1; |
69 | data->mode = O_WRONLY; | 96 | data->mode = O_WRONLY; |
97 | error = pm_notifier_call_chain(PM_HIBERNATION_PREPARE); | ||
98 | if (error) | ||
99 | pm_notifier_call_chain(PM_POST_HIBERNATION); | ||
100 | } | ||
101 | if (error) { | ||
102 | atomic_inc(&snapshot_device_available); | ||
103 | return error; | ||
70 | } | 104 | } |
71 | data->frozen = 0; | 105 | data->frozen = 0; |
72 | data->ready = 0; | 106 | data->ready = 0; |
73 | data->platform_suspend = 0; | 107 | data->platform_support = 0; |
74 | 108 | ||
75 | return 0; | 109 | return 0; |
76 | } | 110 | } |
@@ -88,6 +122,8 @@ static int snapshot_release(struct inode *inode, struct file *filp) | |||
88 | thaw_processes(); | 122 | thaw_processes(); |
89 | mutex_unlock(&pm_mutex); | 123 | mutex_unlock(&pm_mutex); |
90 | } | 124 | } |
125 | pm_notifier_call_chain(data->mode == O_WRONLY ? | ||
126 | PM_POST_HIBERNATION : PM_POST_RESTORE); | ||
91 | atomic_inc(&snapshot_device_available); | 127 | atomic_inc(&snapshot_device_available); |
92 | return 0; | 128 | return 0; |
93 | } | 129 | } |
@@ -133,7 +169,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, | |||
133 | { | 169 | { |
134 | int error = 0; | 170 | int error = 0; |
135 | struct snapshot_data *data; | 171 | struct snapshot_data *data; |
136 | loff_t avail; | 172 | loff_t size; |
137 | sector_t offset; | 173 | sector_t offset; |
138 | 174 | ||
139 | if (_IOC_TYPE(cmd) != SNAPSHOT_IOC_MAGIC) | 175 | if (_IOC_TYPE(cmd) != SNAPSHOT_IOC_MAGIC) |
@@ -151,18 +187,13 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, | |||
151 | if (data->frozen) | 187 | if (data->frozen) |
152 | break; | 188 | break; |
153 | mutex_lock(&pm_mutex); | 189 | mutex_lock(&pm_mutex); |
154 | error = pm_notifier_call_chain(PM_HIBERNATION_PREPARE); | 190 | printk("Syncing filesystems ... "); |
155 | if (!error) { | 191 | sys_sync(); |
156 | printk("Syncing filesystems ... "); | 192 | printk("done.\n"); |
157 | sys_sync(); | 193 | |
158 | printk("done.\n"); | 194 | error = freeze_processes(); |
159 | |||
160 | error = freeze_processes(); | ||
161 | if (error) | ||
162 | thaw_processes(); | ||
163 | } | ||
164 | if (error) | 195 | if (error) |
165 | pm_notifier_call_chain(PM_POST_HIBERNATION); | 196 | thaw_processes(); |
166 | mutex_unlock(&pm_mutex); | 197 | mutex_unlock(&pm_mutex); |
167 | if (!error) | 198 | if (!error) |
168 | data->frozen = 1; | 199 | data->frozen = 1; |
@@ -173,19 +204,19 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, | |||
173 | break; | 204 | break; |
174 | mutex_lock(&pm_mutex); | 205 | mutex_lock(&pm_mutex); |
175 | thaw_processes(); | 206 | thaw_processes(); |
176 | pm_notifier_call_chain(PM_POST_HIBERNATION); | ||
177 | mutex_unlock(&pm_mutex); | 207 | mutex_unlock(&pm_mutex); |
178 | data->frozen = 0; | 208 | data->frozen = 0; |
179 | break; | 209 | break; |
180 | 210 | ||
211 | case SNAPSHOT_CREATE_IMAGE: | ||
181 | case SNAPSHOT_ATOMIC_SNAPSHOT: | 212 | case SNAPSHOT_ATOMIC_SNAPSHOT: |
182 | if (data->mode != O_RDONLY || !data->frozen || data->ready) { | 213 | if (data->mode != O_RDONLY || !data->frozen || data->ready) { |
183 | error = -EPERM; | 214 | error = -EPERM; |
184 | break; | 215 | break; |
185 | } | 216 | } |
186 | error = hibernation_snapshot(data->platform_suspend); | 217 | error = hibernation_snapshot(data->platform_support); |
187 | if (!error) | 218 | if (!error) |
188 | error = put_user(in_suspend, (unsigned int __user *)arg); | 219 | error = put_user(in_suspend, (int __user *)arg); |
189 | if (!error) | 220 | if (!error) |
190 | data->ready = 1; | 221 | data->ready = 1; |
191 | break; | 222 | break; |
@@ -197,7 +228,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, | |||
197 | error = -EPERM; | 228 | error = -EPERM; |
198 | break; | 229 | break; |
199 | } | 230 | } |
200 | error = hibernation_restore(data->platform_suspend); | 231 | error = hibernation_restore(data->platform_support); |
201 | break; | 232 | break; |
202 | 233 | ||
203 | case SNAPSHOT_FREE: | 234 | case SNAPSHOT_FREE: |
@@ -206,16 +237,29 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, | |||
206 | data->ready = 0; | 237 | data->ready = 0; |
207 | break; | 238 | break; |
208 | 239 | ||
240 | case SNAPSHOT_PREF_IMAGE_SIZE: | ||
209 | case SNAPSHOT_SET_IMAGE_SIZE: | 241 | case SNAPSHOT_SET_IMAGE_SIZE: |
210 | image_size = arg; | 242 | image_size = arg; |
211 | break; | 243 | break; |
212 | 244 | ||
245 | case SNAPSHOT_GET_IMAGE_SIZE: | ||
246 | if (!data->ready) { | ||
247 | error = -ENODATA; | ||
248 | break; | ||
249 | } | ||
250 | size = snapshot_get_image_size(); | ||
251 | size <<= PAGE_SHIFT; | ||
252 | error = put_user(size, (loff_t __user *)arg); | ||
253 | break; | ||
254 | |||
255 | case SNAPSHOT_AVAIL_SWAP_SIZE: | ||
213 | case SNAPSHOT_AVAIL_SWAP: | 256 | case SNAPSHOT_AVAIL_SWAP: |
214 | avail = count_swap_pages(data->swap, 1); | 257 | size = count_swap_pages(data->swap, 1); |
215 | avail <<= PAGE_SHIFT; | 258 | size <<= PAGE_SHIFT; |
216 | error = put_user(avail, (loff_t __user *)arg); | 259 | error = put_user(size, (loff_t __user *)arg); |
217 | break; | 260 | break; |
218 | 261 | ||
262 | case SNAPSHOT_ALLOC_SWAP_PAGE: | ||
219 | case SNAPSHOT_GET_SWAP_PAGE: | 263 | case SNAPSHOT_GET_SWAP_PAGE: |
220 | if (data->swap < 0 || data->swap >= MAX_SWAPFILES) { | 264 | if (data->swap < 0 || data->swap >= MAX_SWAPFILES) { |
221 | error = -ENODEV; | 265 | error = -ENODEV; |
@@ -224,7 +268,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, | |||
224 | offset = alloc_swapdev_block(data->swap); | 268 | offset = alloc_swapdev_block(data->swap); |
225 | if (offset) { | 269 | if (offset) { |
226 | offset <<= PAGE_SHIFT; | 270 | offset <<= PAGE_SHIFT; |
227 | error = put_user(offset, (sector_t __user *)arg); | 271 | error = put_user(offset, (loff_t __user *)arg); |
228 | } else { | 272 | } else { |
229 | error = -ENOSPC; | 273 | error = -ENOSPC; |
230 | } | 274 | } |
@@ -238,7 +282,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, | |||
238 | free_all_swap_pages(data->swap); | 282 | free_all_swap_pages(data->swap); |
239 | break; | 283 | break; |
240 | 284 | ||
241 | case SNAPSHOT_SET_SWAP_FILE: | 285 | case SNAPSHOT_SET_SWAP_FILE: /* This ioctl is deprecated */ |
242 | if (!swsusp_swap_in_use()) { | 286 | if (!swsusp_swap_in_use()) { |
243 | /* | 287 | /* |
244 | * User space encodes device types as two-byte values, | 288 | * User space encodes device types as two-byte values, |
@@ -275,26 +319,33 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp, | |||
275 | mutex_unlock(&pm_mutex); | 319 | mutex_unlock(&pm_mutex); |
276 | break; | 320 | break; |
277 | 321 | ||
278 | case SNAPSHOT_PMOPS: | 322 | case SNAPSHOT_PLATFORM_SUPPORT: |
323 | data->platform_support = !!arg; | ||
324 | break; | ||
325 | |||
326 | case SNAPSHOT_POWER_OFF: | ||
327 | if (data->platform_support) | ||
328 | error = hibernation_platform_enter(); | ||
329 | break; | ||
330 | |||
331 | case SNAPSHOT_PMOPS: /* This ioctl is deprecated */ | ||
279 | error = -EINVAL; | 332 | error = -EINVAL; |
280 | 333 | ||
281 | switch (arg) { | 334 | switch (arg) { |
282 | 335 | ||
283 | case PMOPS_PREPARE: | 336 | case PMOPS_PREPARE: |
284 | data->platform_suspend = 1; | 337 | data->platform_support = 1; |
285 | error = 0; | 338 | error = 0; |
286 | break; | 339 | break; |
287 | 340 | ||
288 | case PMOPS_ENTER: | 341 | case PMOPS_ENTER: |
289 | if (data->platform_suspend) | 342 | if (data->platform_support) |
290 | error = hibernation_platform_enter(); | 343 | error = hibernation_platform_enter(); |
291 | |||
292 | break; | 344 | break; |
293 | 345 | ||
294 | case PMOPS_FINISH: | 346 | case PMOPS_FINISH: |
295 | if (data->platform_suspend) | 347 | if (data->platform_support) |
296 | error = 0; | 348 | error = 0; |
297 | |||
298 | break; | 349 | break; |
299 | 350 | ||
300 | default: | 351 | default: |
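The user.c changes replace the deprecated SNAPSHOT_ATOMIC_SNAPSHOT/SNAPSHOT_PMOPS style of operation with the SNAPSHOT_CREATE_IMAGE, SNAPSHOT_GET_IMAGE_SIZE, SNAPSHOT_PLATFORM_SUPPORT and SNAPSHOT_POWER_OFF family, and move the PM notifier calls from the FREEZE/UNFREEZE ioctls into open()/release(). A hedged user-space sketch of the image-creation side, assuming the new definitions come from <linux/suspend_ioctls.h> and the misc device (minor 231) is available as /dev/snapshot; error handling is trimmed:

#define _GNU_SOURCE
#include <sys/types.h>
#include <sys/ioctl.h>
#include <fcntl.h>
#include <unistd.h>
#include <linux/suspend_ioctls.h>

int main(void)
{
        int dev = open("/dev/snapshot", O_RDONLY);  /* the mode in which SNAPSHOT_CREATE_IMAGE is allowed */
        int in_suspend = 0;
        loff_t size;

        ioctl(dev, SNAPSHOT_PLATFORM_SUPPORT, 1);   /* optional: use the platform (ACPI) path */
        ioctl(dev, SNAPSHOT_FREEZE, 0);
        ioctl(dev, SNAPSHOT_CREATE_IMAGE, &in_suspend);
        if (in_suspend) {                           /* this instance must now save the image */
                ioctl(dev, SNAPSHOT_GET_IMAGE_SIZE, &size);
                /* ... read() the image from dev and write it out to swap, then: */
                ioctl(dev, SNAPSHOT_POWER_OFF, 0);
        }
        ioctl(dev, SNAPSHOT_UNFREEZE, 0);
        close(dev);
        return 0;
}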
diff --git a/kernel/printk.c b/kernel/printk.c index a30fe33de395..29ae1e99cde0 100644 --- a/kernel/printk.c +++ b/kernel/printk.c | |||
@@ -36,6 +36,13 @@ | |||
36 | 36 | ||
37 | #include <asm/uaccess.h> | 37 | #include <asm/uaccess.h> |
38 | 38 | ||
39 | /* | ||
40 | * Architectures can override it: | ||
41 | */ | ||
42 | void __attribute__((weak)) early_printk(const char *fmt, ...) | ||
43 | { | ||
44 | } | ||
45 | |||
39 | #define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT) | 46 | #define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT) |
40 | 47 | ||
41 | /* printk's without a loglevel use this.. */ | 48 | /* printk's without a loglevel use this.. */ |
@@ -448,10 +455,10 @@ static int __init ignore_loglevel_setup(char *str) | |||
448 | ignore_loglevel = 1; | 455 | ignore_loglevel = 1; |
449 | printk(KERN_INFO "debug: ignoring loglevel setting.\n"); | 456 | printk(KERN_INFO "debug: ignoring loglevel setting.\n"); |
450 | 457 | ||
451 | return 1; | 458 | return 0; |
452 | } | 459 | } |
453 | 460 | ||
454 | __setup("ignore_loglevel", ignore_loglevel_setup); | 461 | early_param("ignore_loglevel", ignore_loglevel_setup); |
455 | 462 | ||
456 | /* | 463 | /* |
457 | * Write out chars from start to end - 1 inclusive | 464 | * Write out chars from start to end - 1 inclusive |
@@ -573,11 +580,6 @@ static int __init printk_time_setup(char *str) | |||
573 | 580 | ||
574 | __setup("time", printk_time_setup); | 581 | __setup("time", printk_time_setup); |
575 | 582 | ||
576 | __attribute__((weak)) unsigned long long printk_clock(void) | ||
577 | { | ||
578 | return sched_clock(); | ||
579 | } | ||
580 | |||
581 | /* Check if we have any console registered that can be called early in boot. */ | 583 | /* Check if we have any console registered that can be called early in boot. */ |
582 | static int have_callable_console(void) | 584 | static int have_callable_console(void) |
583 | { | 585 | { |
@@ -628,30 +630,57 @@ asmlinkage int printk(const char *fmt, ...) | |||
628 | /* cpu currently holding logbuf_lock */ | 630 | /* cpu currently holding logbuf_lock */ |
629 | static volatile unsigned int printk_cpu = UINT_MAX; | 631 | static volatile unsigned int printk_cpu = UINT_MAX; |
630 | 632 | ||
633 | const char printk_recursion_bug_msg [] = | ||
634 | KERN_CRIT "BUG: recent printk recursion!\n"; | ||
635 | static int printk_recursion_bug; | ||
636 | |||
631 | asmlinkage int vprintk(const char *fmt, va_list args) | 637 | asmlinkage int vprintk(const char *fmt, va_list args) |
632 | { | 638 | { |
639 | static int log_level_unknown = 1; | ||
640 | static char printk_buf[1024]; | ||
641 | |||
633 | unsigned long flags; | 642 | unsigned long flags; |
634 | int printed_len; | 643 | int printed_len = 0; |
644 | int this_cpu; | ||
635 | char *p; | 645 | char *p; |
636 | static char printk_buf[1024]; | ||
637 | static int log_level_unknown = 1; | ||
638 | 646 | ||
639 | boot_delay_msec(); | 647 | boot_delay_msec(); |
640 | 648 | ||
641 | preempt_disable(); | 649 | preempt_disable(); |
642 | if (unlikely(oops_in_progress) && printk_cpu == smp_processor_id()) | ||
643 | /* If a crash is occurring during printk() on this CPU, | ||
644 | * make sure we can't deadlock */ | ||
645 | zap_locks(); | ||
646 | |||
647 | /* This stops the holder of console_sem just where we want him */ | 650 | /* This stops the holder of console_sem just where we want him */ |
648 | raw_local_irq_save(flags); | 651 | raw_local_irq_save(flags); |
652 | this_cpu = smp_processor_id(); | ||
653 | |||
654 | /* | ||
655 | * Ouch, printk recursed into itself! | ||
656 | */ | ||
657 | if (unlikely(printk_cpu == this_cpu)) { | ||
658 | /* | ||
659 | * If a crash is occurring during printk() on this CPU, | ||
660 | * then try to get the crash message out but make sure | ||
661 | * we can't deadlock. Otherwise just return to avoid the | ||
662 | * recursion and return - but flag the recursion so that | ||
663 | * it can be printed at the next appropriate moment: | ||
664 | */ | ||
665 | if (!oops_in_progress) { | ||
666 | printk_recursion_bug = 1; | ||
667 | goto out_restore_irqs; | ||
668 | } | ||
669 | zap_locks(); | ||
670 | } | ||
671 | |||
649 | lockdep_off(); | 672 | lockdep_off(); |
650 | spin_lock(&logbuf_lock); | 673 | spin_lock(&logbuf_lock); |
651 | printk_cpu = smp_processor_id(); | 674 | printk_cpu = this_cpu; |
652 | 675 | ||
676 | if (printk_recursion_bug) { | ||
677 | printk_recursion_bug = 0; | ||
678 | strcpy(printk_buf, printk_recursion_bug_msg); | ||
679 | printed_len = sizeof(printk_recursion_bug_msg); | ||
680 | } | ||
653 | /* Emit the output into the temporary buffer */ | 681 | /* Emit the output into the temporary buffer */ |
654 | printed_len = vscnprintf(printk_buf, sizeof(printk_buf), fmt, args); | 682 | printed_len += vscnprintf(printk_buf + printed_len, |
683 | sizeof(printk_buf), fmt, args); | ||
655 | 684 | ||
656 | /* | 685 | /* |
657 | * Copy the output into log_buf. If the caller didn't provide | 686 | * Copy the output into log_buf. If the caller didn't provide |
@@ -680,7 +709,7 @@ asmlinkage int vprintk(const char *fmt, va_list args) | |||
680 | loglev_char = default_message_loglevel | 709 | loglev_char = default_message_loglevel |
681 | + '0'; | 710 | + '0'; |
682 | } | 711 | } |
683 | t = printk_clock(); | 712 | t = cpu_clock(printk_cpu); |
684 | nanosec_rem = do_div(t, 1000000000); | 713 | nanosec_rem = do_div(t, 1000000000); |
685 | tlen = sprintf(tbuf, | 714 | tlen = sprintf(tbuf, |
686 | "<%c>[%5lu.%06lu] ", | 715 | "<%c>[%5lu.%06lu] ", |
@@ -744,6 +773,7 @@ asmlinkage int vprintk(const char *fmt, va_list args) | |||
744 | printk_cpu = UINT_MAX; | 773 | printk_cpu = UINT_MAX; |
745 | spin_unlock(&logbuf_lock); | 774 | spin_unlock(&logbuf_lock); |
746 | lockdep_on(); | 775 | lockdep_on(); |
776 | out_restore_irqs: | ||
747 | raw_local_irq_restore(flags); | 777 | raw_local_irq_restore(flags); |
748 | } | 778 | } |
749 | 779 | ||
@@ -817,7 +847,7 @@ __setup("console=", console_setup); | |||
817 | * commonly to provide a default console (ie from PROM variables) when | 847 | * commonly to provide a default console (ie from PROM variables) when |
818 | * the user has not supplied one. | 848 | * the user has not supplied one. |
819 | */ | 849 | */ |
820 | int __init add_preferred_console(char *name, int idx, char *options) | 850 | int add_preferred_console(char *name, int idx, char *options) |
821 | { | 851 | { |
822 | struct console_cmdline *c; | 852 | struct console_cmdline *c; |
823 | int i; | 853 | int i; |
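printk.c now takes its timestamp from cpu_clock() on the CPU doing the printk instead of the old weak printk_clock(), and flags per-CPU recursion rather than unconditionally zapping locks. cpu_clock() returns nanoseconds, which the existing format code splits with do_div(); for example:

/* say cpu_clock(printk_cpu) returned 5123456789 ns */
unsigned long long t = 5123456789ULL;
unsigned long nanosec_rem = do_div(t, 1000000000);   /* t = 5, nanosec_rem = 123456789 */
/* "<%c>[%5lu.%06lu] " then prints "[    5.123456] " once nanosec_rem is divided by 1000 */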
diff --git a/kernel/profile.c b/kernel/profile.c index 5e95330e5120..e64c2da11c0f 100644 --- a/kernel/profile.c +++ b/kernel/profile.c | |||
@@ -52,7 +52,7 @@ static DEFINE_PER_CPU(int, cpu_profile_flip); | |||
52 | static DEFINE_MUTEX(profile_flip_mutex); | 52 | static DEFINE_MUTEX(profile_flip_mutex); |
53 | #endif /* CONFIG_SMP */ | 53 | #endif /* CONFIG_SMP */ |
54 | 54 | ||
55 | static int __init profile_setup(char * str) | 55 | static int __init profile_setup(char *str) |
56 | { | 56 | { |
57 | static char __initdata schedstr[] = "schedule"; | 57 | static char __initdata schedstr[] = "schedule"; |
58 | static char __initdata sleepstr[] = "sleep"; | 58 | static char __initdata sleepstr[] = "sleep"; |
@@ -104,28 +104,28 @@ __setup("profile=", profile_setup); | |||
104 | 104 | ||
105 | void __init profile_init(void) | 105 | void __init profile_init(void) |
106 | { | 106 | { |
107 | if (!prof_on) | 107 | if (!prof_on) |
108 | return; | 108 | return; |
109 | 109 | ||
110 | /* only text is profiled */ | 110 | /* only text is profiled */ |
111 | prof_len = (_etext - _stext) >> prof_shift; | 111 | prof_len = (_etext - _stext) >> prof_shift; |
112 | prof_buffer = alloc_bootmem(prof_len*sizeof(atomic_t)); | 112 | prof_buffer = alloc_bootmem(prof_len*sizeof(atomic_t)); |
113 | } | 113 | } |
114 | 114 | ||
115 | /* Profile event notifications */ | 115 | /* Profile event notifications */ |
116 | 116 | ||
117 | #ifdef CONFIG_PROFILING | 117 | #ifdef CONFIG_PROFILING |
118 | 118 | ||
119 | static BLOCKING_NOTIFIER_HEAD(task_exit_notifier); | 119 | static BLOCKING_NOTIFIER_HEAD(task_exit_notifier); |
120 | static ATOMIC_NOTIFIER_HEAD(task_free_notifier); | 120 | static ATOMIC_NOTIFIER_HEAD(task_free_notifier); |
121 | static BLOCKING_NOTIFIER_HEAD(munmap_notifier); | 121 | static BLOCKING_NOTIFIER_HEAD(munmap_notifier); |
122 | 122 | ||
123 | void profile_task_exit(struct task_struct * task) | 123 | void profile_task_exit(struct task_struct *task) |
124 | { | 124 | { |
125 | blocking_notifier_call_chain(&task_exit_notifier, 0, task); | 125 | blocking_notifier_call_chain(&task_exit_notifier, 0, task); |
126 | } | 126 | } |
127 | 127 | ||
128 | int profile_handoff_task(struct task_struct * task) | 128 | int profile_handoff_task(struct task_struct *task) |
129 | { | 129 | { |
130 | int ret; | 130 | int ret; |
131 | ret = atomic_notifier_call_chain(&task_free_notifier, 0, task); | 131 | ret = atomic_notifier_call_chain(&task_free_notifier, 0, task); |
@@ -137,52 +137,55 @@ void profile_munmap(unsigned long addr) | |||
137 | blocking_notifier_call_chain(&munmap_notifier, 0, (void *)addr); | 137 | blocking_notifier_call_chain(&munmap_notifier, 0, (void *)addr); |
138 | } | 138 | } |
139 | 139 | ||
140 | int task_handoff_register(struct notifier_block * n) | 140 | int task_handoff_register(struct notifier_block *n) |
141 | { | 141 | { |
142 | return atomic_notifier_chain_register(&task_free_notifier, n); | 142 | return atomic_notifier_chain_register(&task_free_notifier, n); |
143 | } | 143 | } |
144 | EXPORT_SYMBOL_GPL(task_handoff_register); | ||
144 | 145 | ||
145 | int task_handoff_unregister(struct notifier_block * n) | 146 | int task_handoff_unregister(struct notifier_block *n) |
146 | { | 147 | { |
147 | return atomic_notifier_chain_unregister(&task_free_notifier, n); | 148 | return atomic_notifier_chain_unregister(&task_free_notifier, n); |
148 | } | 149 | } |
150 | EXPORT_SYMBOL_GPL(task_handoff_unregister); | ||
149 | 151 | ||
150 | int profile_event_register(enum profile_type type, struct notifier_block * n) | 152 | int profile_event_register(enum profile_type type, struct notifier_block *n) |
151 | { | 153 | { |
152 | int err = -EINVAL; | 154 | int err = -EINVAL; |
153 | 155 | ||
154 | switch (type) { | 156 | switch (type) { |
155 | case PROFILE_TASK_EXIT: | 157 | case PROFILE_TASK_EXIT: |
156 | err = blocking_notifier_chain_register( | 158 | err = blocking_notifier_chain_register( |
157 | &task_exit_notifier, n); | 159 | &task_exit_notifier, n); |
158 | break; | 160 | break; |
159 | case PROFILE_MUNMAP: | 161 | case PROFILE_MUNMAP: |
160 | err = blocking_notifier_chain_register( | 162 | err = blocking_notifier_chain_register( |
161 | &munmap_notifier, n); | 163 | &munmap_notifier, n); |
162 | break; | 164 | break; |
163 | } | 165 | } |
164 | 166 | ||
165 | return err; | 167 | return err; |
166 | } | 168 | } |
169 | EXPORT_SYMBOL_GPL(profile_event_register); | ||
167 | 170 | ||
168 | 171 | int profile_event_unregister(enum profile_type type, struct notifier_block *n) | |
169 | int profile_event_unregister(enum profile_type type, struct notifier_block * n) | ||
170 | { | 172 | { |
171 | int err = -EINVAL; | 173 | int err = -EINVAL; |
172 | 174 | ||
173 | switch (type) { | 175 | switch (type) { |
174 | case PROFILE_TASK_EXIT: | 176 | case PROFILE_TASK_EXIT: |
175 | err = blocking_notifier_chain_unregister( | 177 | err = blocking_notifier_chain_unregister( |
176 | &task_exit_notifier, n); | 178 | &task_exit_notifier, n); |
177 | break; | 179 | break; |
178 | case PROFILE_MUNMAP: | 180 | case PROFILE_MUNMAP: |
179 | err = blocking_notifier_chain_unregister( | 181 | err = blocking_notifier_chain_unregister( |
180 | &munmap_notifier, n); | 182 | &munmap_notifier, n); |
181 | break; | 183 | break; |
182 | } | 184 | } |
183 | 185 | ||
184 | return err; | 186 | return err; |
185 | } | 187 | } |
188 | EXPORT_SYMBOL_GPL(profile_event_unregister); | ||
186 | 189 | ||
187 | int register_timer_hook(int (*hook)(struct pt_regs *)) | 190 | int register_timer_hook(int (*hook)(struct pt_regs *)) |
188 | { | 191 | { |
@@ -191,6 +194,7 @@ int register_timer_hook(int (*hook)(struct pt_regs *)) | |||
191 | timer_hook = hook; | 194 | timer_hook = hook; |
192 | return 0; | 195 | return 0; |
193 | } | 196 | } |
197 | EXPORT_SYMBOL_GPL(register_timer_hook); | ||
194 | 198 | ||
195 | void unregister_timer_hook(int (*hook)(struct pt_regs *)) | 199 | void unregister_timer_hook(int (*hook)(struct pt_regs *)) |
196 | { | 200 | { |
@@ -199,13 +203,7 @@ void unregister_timer_hook(int (*hook)(struct pt_regs *)) | |||
199 | /* make sure all CPUs see the NULL hook */ | 203 | /* make sure all CPUs see the NULL hook */ |
200 | synchronize_sched(); /* Allow ongoing interrupts to complete. */ | 204 | synchronize_sched(); /* Allow ongoing interrupts to complete. */ |
201 | } | 205 | } |
202 | |||
203 | EXPORT_SYMBOL_GPL(register_timer_hook); | ||
204 | EXPORT_SYMBOL_GPL(unregister_timer_hook); | 206 | EXPORT_SYMBOL_GPL(unregister_timer_hook); |
205 | EXPORT_SYMBOL_GPL(task_handoff_register); | ||
206 | EXPORT_SYMBOL_GPL(task_handoff_unregister); | ||
207 | EXPORT_SYMBOL_GPL(profile_event_register); | ||
208 | EXPORT_SYMBOL_GPL(profile_event_unregister); | ||
209 | 207 | ||
210 | #endif /* CONFIG_PROFILING */ | 208 | #endif /* CONFIG_PROFILING */ |
211 | 209 | ||
@@ -366,7 +364,7 @@ static int __devinit profile_cpu_callback(struct notifier_block *info, | |||
366 | per_cpu(cpu_profile_hits, cpu)[0] = page_address(page); | 364 | per_cpu(cpu_profile_hits, cpu)[0] = page_address(page); |
367 | } | 365 | } |
368 | break; | 366 | break; |
369 | out_free: | 367 | out_free: |
370 | page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[1]); | 368 | page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[1]); |
371 | per_cpu(cpu_profile_hits, cpu)[1] = NULL; | 369 | per_cpu(cpu_profile_hits, cpu)[1] = NULL; |
372 | __free_page(page); | 370 | __free_page(page); |
@@ -409,7 +407,6 @@ void profile_hits(int type, void *__pc, unsigned int nr_hits) | |||
409 | atomic_add(nr_hits, &prof_buffer[min(pc, prof_len - 1)]); | 407 | atomic_add(nr_hits, &prof_buffer[min(pc, prof_len - 1)]); |
410 | } | 408 | } |
411 | #endif /* !CONFIG_SMP */ | 409 | #endif /* !CONFIG_SMP */ |
412 | |||
413 | EXPORT_SYMBOL_GPL(profile_hits); | 410 | EXPORT_SYMBOL_GPL(profile_hits); |
414 | 411 | ||
415 | void profile_tick(int type) | 412 | void profile_tick(int type) |
@@ -427,7 +424,7 @@ void profile_tick(int type) | |||
427 | #include <asm/uaccess.h> | 424 | #include <asm/uaccess.h> |
428 | #include <asm/ptrace.h> | 425 | #include <asm/ptrace.h> |
429 | 426 | ||
430 | static int prof_cpu_mask_read_proc (char *page, char **start, off_t off, | 427 | static int prof_cpu_mask_read_proc(char *page, char **start, off_t off, |
431 | int count, int *eof, void *data) | 428 | int count, int *eof, void *data) |
432 | { | 429 | { |
433 | int len = cpumask_scnprintf(page, count, *(cpumask_t *)data); | 430 | int len = cpumask_scnprintf(page, count, *(cpumask_t *)data); |
@@ -437,8 +434,8 @@ static int prof_cpu_mask_read_proc (char *page, char **start, off_t off, | |||
437 | return len; | 434 | return len; |
438 | } | 435 | } |
439 | 436 | ||
440 | static int prof_cpu_mask_write_proc (struct file *file, const char __user *buffer, | 437 | static int prof_cpu_mask_write_proc(struct file *file, |
441 | unsigned long count, void *data) | 438 | const char __user *buffer, unsigned long count, void *data) |
442 | { | 439 | { |
443 | cpumask_t *mask = (cpumask_t *)data; | 440 | cpumask_t *mask = (cpumask_t *)data; |
444 | unsigned long full_count = count, err; | 441 | unsigned long full_count = count, err; |
@@ -457,7 +454,8 @@ void create_prof_cpu_mask(struct proc_dir_entry *root_irq_dir) | |||
457 | struct proc_dir_entry *entry; | 454 | struct proc_dir_entry *entry; |
458 | 455 | ||
459 | /* create /proc/irq/prof_cpu_mask */ | 456 | /* create /proc/irq/prof_cpu_mask */ |
460 | if (!(entry = create_proc_entry("prof_cpu_mask", 0600, root_irq_dir))) | 457 | entry = create_proc_entry("prof_cpu_mask", 0600, root_irq_dir); |
458 | if (!entry) | ||
461 | return; | 459 | return; |
462 | entry->data = (void *)&prof_cpu_mask; | 460 | entry->data = (void *)&prof_cpu_mask; |
463 | entry->read_proc = prof_cpu_mask_read_proc; | 461 | entry->read_proc = prof_cpu_mask_read_proc; |
@@ -475,7 +473,7 @@ read_profile(struct file *file, char __user *buf, size_t count, loff_t *ppos) | |||
475 | { | 473 | { |
476 | unsigned long p = *ppos; | 474 | unsigned long p = *ppos; |
477 | ssize_t read; | 475 | ssize_t read; |
478 | char * pnt; | 476 | char *pnt; |
479 | unsigned int sample_step = 1 << prof_shift; | 477 | unsigned int sample_step = 1 << prof_shift; |
480 | 478 | ||
481 | profile_flip_buffers(); | 479 | profile_flip_buffers(); |
@@ -486,12 +484,12 @@ read_profile(struct file *file, char __user *buf, size_t count, loff_t *ppos) | |||
486 | read = 0; | 484 | read = 0; |
487 | 485 | ||
488 | while (p < sizeof(unsigned int) && count > 0) { | 486 | while (p < sizeof(unsigned int) && count > 0) { |
489 | if (put_user(*((char *)(&sample_step)+p),buf)) | 487 | if (put_user(*((char *)(&sample_step)+p), buf)) |
490 | return -EFAULT; | 488 | return -EFAULT; |
491 | buf++; p++; count--; read++; | 489 | buf++; p++; count--; read++; |
492 | } | 490 | } |
493 | pnt = (char *)prof_buffer + p - sizeof(atomic_t); | 491 | pnt = (char *)prof_buffer + p - sizeof(atomic_t); |
494 | if (copy_to_user(buf,(void *)pnt,count)) | 492 | if (copy_to_user(buf, (void *)pnt, count)) |
495 | return -EFAULT; | 493 | return -EFAULT; |
496 | read += count; | 494 | read += count; |
497 | *ppos += read; | 495 | *ppos += read; |
@@ -508,7 +506,7 @@ static ssize_t write_profile(struct file *file, const char __user *buf, | |||
508 | size_t count, loff_t *ppos) | 506 | size_t count, loff_t *ppos) |
509 | { | 507 | { |
510 | #ifdef CONFIG_SMP | 508 | #ifdef CONFIG_SMP |
511 | extern int setup_profiling_timer (unsigned int multiplier); | 509 | extern int setup_profiling_timer(unsigned int multiplier); |
512 | 510 | ||
513 | if (count == sizeof(int)) { | 511 | if (count == sizeof(int)) { |
514 | unsigned int multiplier; | 512 | unsigned int multiplier; |
@@ -591,7 +589,8 @@ static int __init create_proc_profile(void) | |||
591 | return 0; | 589 | return 0; |
592 | if (create_hash_tables()) | 590 | if (create_hash_tables()) |
593 | return -1; | 591 | return -1; |
594 | if (!(entry = create_proc_entry("profile", S_IWUSR | S_IRUGO, NULL))) | 592 | entry = create_proc_entry("profile", S_IWUSR | S_IRUGO, NULL); |
593 | if (!entry) | ||
595 | return 0; | 594 | return 0; |
596 | entry->proc_fops = &proc_profile_operations; | 595 | entry->proc_fops = &proc_profile_operations; |
597 | entry->size = (1+prof_len) * sizeof(atomic_t); | 596 | entry->size = (1+prof_len) * sizeof(atomic_t); |
diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 7c76f2ffaeaa..b0d4ab4dfd3d 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c | |||
@@ -51,7 +51,7 @@ void __ptrace_link(struct task_struct *child, struct task_struct *new_parent) | |||
51 | void ptrace_untrace(struct task_struct *child) | 51 | void ptrace_untrace(struct task_struct *child) |
52 | { | 52 | { |
53 | spin_lock(&child->sighand->siglock); | 53 | spin_lock(&child->sighand->siglock); |
54 | if (child->state == TASK_TRACED) { | 54 | if (task_is_traced(child)) { |
55 | if (child->signal->flags & SIGNAL_STOP_STOPPED) { | 55 | if (child->signal->flags & SIGNAL_STOP_STOPPED) { |
56 | child->state = TASK_STOPPED; | 56 | child->state = TASK_STOPPED; |
57 | } else { | 57 | } else { |
@@ -79,7 +79,7 @@ void __ptrace_unlink(struct task_struct *child) | |||
79 | add_parent(child); | 79 | add_parent(child); |
80 | } | 80 | } |
81 | 81 | ||
82 | if (child->state == TASK_TRACED) | 82 | if (task_is_traced(child)) |
83 | ptrace_untrace(child); | 83 | ptrace_untrace(child); |
84 | } | 84 | } |
85 | 85 | ||
@@ -103,9 +103,9 @@ int ptrace_check_attach(struct task_struct *child, int kill) | |||
103 | && child->signal != NULL) { | 103 | && child->signal != NULL) { |
104 | ret = 0; | 104 | ret = 0; |
105 | spin_lock_irq(&child->sighand->siglock); | 105 | spin_lock_irq(&child->sighand->siglock); |
106 | if (child->state == TASK_STOPPED) { | 106 | if (task_is_stopped(child)) { |
107 | child->state = TASK_TRACED; | 107 | child->state = TASK_TRACED; |
108 | } else if (child->state != TASK_TRACED && !kill) { | 108 | } else if (!task_is_traced(child) && !kill) { |
109 | ret = -ESRCH; | 109 | ret = -ESRCH; |
110 | } | 110 | } |
111 | spin_unlock_irq(&child->sighand->siglock); | 111 | spin_unlock_irq(&child->sighand->siglock); |
@@ -120,7 +120,7 @@ int ptrace_check_attach(struct task_struct *child, int kill) | |||
120 | return ret; | 120 | return ret; |
121 | } | 121 | } |
122 | 122 | ||
123 | static int may_attach(struct task_struct *task) | 123 | int __ptrace_may_attach(struct task_struct *task) |
124 | { | 124 | { |
125 | /* May we inspect the given task? | 125 | /* May we inspect the given task? |
126 | * This check is used both for attaching with ptrace | 126 | * This check is used both for attaching with ptrace |
@@ -154,7 +154,7 @@ int ptrace_may_attach(struct task_struct *task) | |||
154 | { | 154 | { |
155 | int err; | 155 | int err; |
156 | task_lock(task); | 156 | task_lock(task); |
157 | err = may_attach(task); | 157 | err = __ptrace_may_attach(task); |
158 | task_unlock(task); | 158 | task_unlock(task); |
159 | return !err; | 159 | return !err; |
160 | } | 160 | } |
@@ -196,7 +196,7 @@ repeat: | |||
196 | /* the same process cannot be attached many times */ | 196 | /* the same process cannot be attached many times */ |
197 | if (task->ptrace & PT_PTRACED) | 197 | if (task->ptrace & PT_PTRACED) |
198 | goto bad; | 198 | goto bad; |
199 | retval = may_attach(task); | 199 | retval = __ptrace_may_attach(task); |
200 | if (retval) | 200 | if (retval) |
201 | goto bad; | 201 | goto bad; |
202 | 202 | ||
@@ -366,12 +366,73 @@ static int ptrace_setsiginfo(struct task_struct *child, siginfo_t __user * data) | |||
366 | return error; | 366 | return error; |
367 | } | 367 | } |
368 | 368 | ||
369 | |||
370 | #ifdef PTRACE_SINGLESTEP | ||
371 | #define is_singlestep(request) ((request) == PTRACE_SINGLESTEP) | ||
372 | #else | ||
373 | #define is_singlestep(request) 0 | ||
374 | #endif | ||
375 | |||
376 | #ifdef PTRACE_SINGLEBLOCK | ||
377 | #define is_singleblock(request) ((request) == PTRACE_SINGLEBLOCK) | ||
378 | #else | ||
379 | #define is_singleblock(request) 0 | ||
380 | #endif | ||
381 | |||
382 | #ifdef PTRACE_SYSEMU | ||
383 | #define is_sysemu_singlestep(request) ((request) == PTRACE_SYSEMU_SINGLESTEP) | ||
384 | #else | ||
385 | #define is_sysemu_singlestep(request) 0 | ||
386 | #endif | ||
387 | |||
388 | static int ptrace_resume(struct task_struct *child, long request, long data) | ||
389 | { | ||
390 | if (!valid_signal(data)) | ||
391 | return -EIO; | ||
392 | |||
393 | if (request == PTRACE_SYSCALL) | ||
394 | set_tsk_thread_flag(child, TIF_SYSCALL_TRACE); | ||
395 | else | ||
396 | clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE); | ||
397 | |||
398 | #ifdef TIF_SYSCALL_EMU | ||
399 | if (request == PTRACE_SYSEMU || request == PTRACE_SYSEMU_SINGLESTEP) | ||
400 | set_tsk_thread_flag(child, TIF_SYSCALL_EMU); | ||
401 | else | ||
402 | clear_tsk_thread_flag(child, TIF_SYSCALL_EMU); | ||
403 | #endif | ||
404 | |||
405 | if (is_singleblock(request)) { | ||
406 | if (unlikely(!arch_has_block_step())) | ||
407 | return -EIO; | ||
408 | user_enable_block_step(child); | ||
409 | } else if (is_singlestep(request) || is_sysemu_singlestep(request)) { | ||
410 | if (unlikely(!arch_has_single_step())) | ||
411 | return -EIO; | ||
412 | user_enable_single_step(child); | ||
413 | } | ||
414 | else | ||
415 | user_disable_single_step(child); | ||
416 | |||
417 | child->exit_code = data; | ||
418 | wake_up_process(child); | ||
419 | |||
420 | return 0; | ||
421 | } | ||
422 | |||
369 | int ptrace_request(struct task_struct *child, long request, | 423 | int ptrace_request(struct task_struct *child, long request, |
370 | long addr, long data) | 424 | long addr, long data) |
371 | { | 425 | { |
372 | int ret = -EIO; | 426 | int ret = -EIO; |
373 | 427 | ||
374 | switch (request) { | 428 | switch (request) { |
429 | case PTRACE_PEEKTEXT: | ||
430 | case PTRACE_PEEKDATA: | ||
431 | return generic_ptrace_peekdata(child, addr, data); | ||
432 | case PTRACE_POKETEXT: | ||
433 | case PTRACE_POKEDATA: | ||
434 | return generic_ptrace_pokedata(child, addr, data); | ||
435 | |||
375 | #ifdef PTRACE_OLDSETOPTIONS | 436 | #ifdef PTRACE_OLDSETOPTIONS |
376 | case PTRACE_OLDSETOPTIONS: | 437 | case PTRACE_OLDSETOPTIONS: |
377 | #endif | 438 | #endif |
@@ -390,6 +451,26 @@ int ptrace_request(struct task_struct *child, long request, | |||
390 | case PTRACE_DETACH: /* detach a process that was attached. */ | 451 | case PTRACE_DETACH: /* detach a process that was attached. */ |
391 | ret = ptrace_detach(child, data); | 452 | ret = ptrace_detach(child, data); |
392 | break; | 453 | break; |
454 | |||
455 | #ifdef PTRACE_SINGLESTEP | ||
456 | case PTRACE_SINGLESTEP: | ||
457 | #endif | ||
458 | #ifdef PTRACE_SINGLEBLOCK | ||
459 | case PTRACE_SINGLEBLOCK: | ||
460 | #endif | ||
461 | #ifdef PTRACE_SYSEMU | ||
462 | case PTRACE_SYSEMU: | ||
463 | case PTRACE_SYSEMU_SINGLESTEP: | ||
464 | #endif | ||
465 | case PTRACE_SYSCALL: | ||
466 | case PTRACE_CONT: | ||
467 | return ptrace_resume(child, request, data); | ||
468 | |||
469 | case PTRACE_KILL: | ||
470 | if (child->exit_state) /* already dead */ | ||
471 | return 0; | ||
472 | return ptrace_resume(child, request, SIGKILL); | ||
473 | |||
393 | default: | 474 | default: |
394 | break; | 475 | break; |
395 | } | 476 | } |
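The new ptrace_resume() folds PTRACE_CONT, PTRACE_SYSCALL, PTRACE_KILL and the single-step/sysemu variants into one generic helper, so architectures that route these requests through ptrace_request() no longer need their own copies. From user space the path is exercised with the usual ptrace(2) calls; a minimal tracer (hedged sketch, error handling trimmed) that stops a child at every syscall boundary:

#include <signal.h>
#include <unistd.h>
#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/wait.h>

int main(void)
{
	pid_t pid = fork();

	if (pid == 0) {					/* tracee */
		ptrace(PTRACE_TRACEME, 0, NULL, NULL);
		raise(SIGSTOP);				/* hand control to the tracer */
		execlp("true", "true", (char *)NULL);
		_exit(127);
	}

	int status;
	waitpid(pid, &status, 0);			/* initial stop */
	while (WIFSTOPPED(status)) {
		/* Resume until the next syscall entry or exit. */
		if (ptrace(PTRACE_SYSCALL, pid, NULL, NULL) == -1)
			break;
		waitpid(pid, &status, 0);
	}
	return 0;
}

On architectures that use the generic path, each PTRACE_SYSCALL above lands in ptrace_resume(), which sets TIF_SYSCALL_TRACE and wakes the child.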
@@ -470,6 +551,8 @@ asmlinkage long sys_ptrace(long request, long pid, long addr, long data) | |||
470 | lock_kernel(); | 551 | lock_kernel(); |
471 | if (request == PTRACE_TRACEME) { | 552 | if (request == PTRACE_TRACEME) { |
472 | ret = ptrace_traceme(); | 553 | ret = ptrace_traceme(); |
554 | if (!ret) | ||
555 | arch_ptrace_attach(current); | ||
473 | goto out; | 556 | goto out; |
474 | } | 557 | } |
475 | 558 | ||
@@ -524,3 +607,87 @@ int generic_ptrace_pokedata(struct task_struct *tsk, long addr, long data) | |||
524 | copied = access_process_vm(tsk, addr, &data, sizeof(data), 1); | 607 | copied = access_process_vm(tsk, addr, &data, sizeof(data), 1); |
525 | return (copied == sizeof(data)) ? 0 : -EIO; | 608 | return (copied == sizeof(data)) ? 0 : -EIO; |
526 | } | 609 | } |
610 | |||
611 | #ifdef CONFIG_COMPAT | ||
612 | #include <linux/compat.h> | ||
613 | |||
614 | int compat_ptrace_request(struct task_struct *child, compat_long_t request, | ||
615 | compat_ulong_t addr, compat_ulong_t data) | ||
616 | { | ||
617 | compat_ulong_t __user *datap = compat_ptr(data); | ||
618 | compat_ulong_t word; | ||
619 | int ret; | ||
620 | |||
621 | switch (request) { | ||
622 | case PTRACE_PEEKTEXT: | ||
623 | case PTRACE_PEEKDATA: | ||
624 | ret = access_process_vm(child, addr, &word, sizeof(word), 0); | ||
625 | if (ret != sizeof(word)) | ||
626 | ret = -EIO; | ||
627 | else | ||
628 | ret = put_user(word, datap); | ||
629 | break; | ||
630 | |||
631 | case PTRACE_POKETEXT: | ||
632 | case PTRACE_POKEDATA: | ||
633 | ret = access_process_vm(child, addr, &data, sizeof(data), 1); | ||
634 | ret = (ret != sizeof(data) ? -EIO : 0); | ||
635 | break; | ||
636 | |||
637 | case PTRACE_GETEVENTMSG: | ||
638 | ret = put_user((compat_ulong_t) child->ptrace_message, datap); | ||
639 | break; | ||
640 | |||
641 | default: | ||
642 | ret = ptrace_request(child, request, addr, data); | ||
643 | } | ||
644 | |||
645 | return ret; | ||
646 | } | ||
647 | |||
648 | #ifdef __ARCH_WANT_COMPAT_SYS_PTRACE | ||
649 | asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid, | ||
650 | compat_long_t addr, compat_long_t data) | ||
651 | { | ||
652 | struct task_struct *child; | ||
653 | long ret; | ||
654 | |||
655 | /* | ||
656 | * This lock_kernel fixes a subtle race with suid exec | ||
657 | */ | ||
658 | lock_kernel(); | ||
659 | if (request == PTRACE_TRACEME) { | ||
660 | ret = ptrace_traceme(); | ||
661 | goto out; | ||
662 | } | ||
663 | |||
664 | child = ptrace_get_task_struct(pid); | ||
665 | if (IS_ERR(child)) { | ||
666 | ret = PTR_ERR(child); | ||
667 | goto out; | ||
668 | } | ||
669 | |||
670 | if (request == PTRACE_ATTACH) { | ||
671 | ret = ptrace_attach(child); | ||
672 | /* | ||
673 | * Some architectures need to do book-keeping after | ||
674 | * a ptrace attach. | ||
675 | */ | ||
676 | if (!ret) | ||
677 | arch_ptrace_attach(child); | ||
678 | goto out_put_task_struct; | ||
679 | } | ||
680 | |||
681 | ret = ptrace_check_attach(child, request == PTRACE_KILL); | ||
682 | if (!ret) | ||
683 | ret = compat_arch_ptrace(child, request, addr, data); | ||
684 | |||
685 | out_put_task_struct: | ||
686 | put_task_struct(child); | ||
687 | out: | ||
688 | unlock_kernel(); | ||
689 | return ret; | ||
690 | } | ||
691 | #endif /* __ARCH_WANT_COMPAT_SYS_PTRACE */ | ||
692 | |||
693 | #endif /* CONFIG_COMPAT */ | ||
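The compat path transfers compat_ulong_t-sized words through access_process_vm(), so a 32-bit tracer keeps seeing 4-byte words on a 64-bit kernel. For reference, the native word transfer that generic_ptrace_peekdata() and this compat code both implement looks like this from user space (hedged sketch; the errno handling matters because PTRACE_PEEKDATA returns the word itself):

#include <errno.h>
#include <sys/ptrace.h>
#include <sys/types.h>

/* Read one word of a (stopped) tracee's memory; returns 0 on success. */
static int peek_word(pid_t pid, void *addr, long *out)
{
	errno = 0;
	long word = ptrace(PTRACE_PEEKDATA, pid, addr, NULL);

	if (word == -1 && errno != 0)	/* -1 is also a valid word value */
		return -1;
	*out = word;
	return 0;
}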
diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c new file mode 100644 index 000000000000..f4ffbd0f306f --- /dev/null +++ b/kernel/rcuclassic.c | |||
@@ -0,0 +1,575 @@ | |||
1 | /* | ||
2 | * Read-Copy Update mechanism for mutual exclusion | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or modify | ||
5 | * it under the terms of the GNU General Public License as published by | ||
6 | * the Free Software Foundation; either version 2 of the License, or | ||
7 | * (at your option) any later version. | ||
8 | * | ||
9 | * This program is distributed in the hope that it will be useful, | ||
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
12 | * GNU General Public License for more details. | ||
13 | * | ||
14 | * You should have received a copy of the GNU General Public License | ||
15 | * along with this program; if not, write to the Free Software | ||
16 | * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. | ||
17 | * | ||
18 | * Copyright IBM Corporation, 2001 | ||
19 | * | ||
20 | * Authors: Dipankar Sarma <dipankar@in.ibm.com> | ||
21 | * Manfred Spraul <manfred@colorfullife.com> | ||
22 | * | ||
23 | * Based on the original work by Paul McKenney <paulmck@us.ibm.com> | ||
24 | * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen. | ||
25 | * Papers: | ||
26 | * http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf | ||
27 | * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001) | ||
28 | * | ||
29 | * For detailed explanation of Read-Copy Update mechanism see - | ||
30 | * Documentation/RCU | ||
31 | * | ||
32 | */ | ||
33 | #include <linux/types.h> | ||
34 | #include <linux/kernel.h> | ||
35 | #include <linux/init.h> | ||
36 | #include <linux/spinlock.h> | ||
37 | #include <linux/smp.h> | ||
38 | #include <linux/rcupdate.h> | ||
39 | #include <linux/interrupt.h> | ||
40 | #include <linux/sched.h> | ||
41 | #include <asm/atomic.h> | ||
42 | #include <linux/bitops.h> | ||
43 | #include <linux/module.h> | ||
44 | #include <linux/completion.h> | ||
45 | #include <linux/moduleparam.h> | ||
46 | #include <linux/percpu.h> | ||
47 | #include <linux/notifier.h> | ||
48 | #include <linux/cpu.h> | ||
49 | #include <linux/mutex.h> | ||
50 | |||
51 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | ||
52 | static struct lock_class_key rcu_lock_key; | ||
53 | struct lockdep_map rcu_lock_map = | ||
54 | STATIC_LOCKDEP_MAP_INIT("rcu_read_lock", &rcu_lock_key); | ||
55 | EXPORT_SYMBOL_GPL(rcu_lock_map); | ||
56 | #endif | ||
57 | |||
58 | |||
59 | /* Definition for rcupdate control block. */ | ||
60 | static struct rcu_ctrlblk rcu_ctrlblk = { | ||
61 | .cur = -300, | ||
62 | .completed = -300, | ||
63 | .lock = __SPIN_LOCK_UNLOCKED(&rcu_ctrlblk.lock), | ||
64 | .cpumask = CPU_MASK_NONE, | ||
65 | }; | ||
66 | static struct rcu_ctrlblk rcu_bh_ctrlblk = { | ||
67 | .cur = -300, | ||
68 | .completed = -300, | ||
69 | .lock = __SPIN_LOCK_UNLOCKED(&rcu_bh_ctrlblk.lock), | ||
70 | .cpumask = CPU_MASK_NONE, | ||
71 | }; | ||
72 | |||
73 | DEFINE_PER_CPU(struct rcu_data, rcu_data) = { 0L }; | ||
74 | DEFINE_PER_CPU(struct rcu_data, rcu_bh_data) = { 0L }; | ||
75 | |||
76 | static int blimit = 10; | ||
77 | static int qhimark = 10000; | ||
78 | static int qlowmark = 100; | ||
79 | |||
80 | #ifdef CONFIG_SMP | ||
81 | static void force_quiescent_state(struct rcu_data *rdp, | ||
82 | struct rcu_ctrlblk *rcp) | ||
83 | { | ||
84 | int cpu; | ||
85 | cpumask_t cpumask; | ||
86 | set_need_resched(); | ||
87 | if (unlikely(!rcp->signaled)) { | ||
88 | rcp->signaled = 1; | ||
89 | /* | ||
90 | * Don't send IPI to itself. With irqs disabled, | ||
91 | * rdp->cpu is the current cpu. | ||
92 | */ | ||
93 | cpumask = rcp->cpumask; | ||
94 | cpu_clear(rdp->cpu, cpumask); | ||
95 | for_each_cpu_mask(cpu, cpumask) | ||
96 | smp_send_reschedule(cpu); | ||
97 | } | ||
98 | } | ||
99 | #else | ||
100 | static inline void force_quiescent_state(struct rcu_data *rdp, | ||
101 | struct rcu_ctrlblk *rcp) | ||
102 | { | ||
103 | set_need_resched(); | ||
104 | } | ||
105 | #endif | ||
106 | |||
107 | /** | ||
108 | * call_rcu - Queue an RCU callback for invocation after a grace period. | ||
109 | * @head: structure to be used for queueing the RCU updates. | ||
110 | * @func: actual update function to be invoked after the grace period | ||
111 | * | ||
112 | * The update function will be invoked some time after a full grace | ||
113 | * period elapses, in other words after all currently executing RCU | ||
114 | * read-side critical sections have completed. RCU read-side critical | ||
115 | * sections are delimited by rcu_read_lock() and rcu_read_unlock(), | ||
116 | * and may be nested. | ||
117 | */ | ||
118 | void call_rcu(struct rcu_head *head, | ||
119 | void (*func)(struct rcu_head *rcu)) | ||
120 | { | ||
121 | unsigned long flags; | ||
122 | struct rcu_data *rdp; | ||
123 | |||
124 | head->func = func; | ||
125 | head->next = NULL; | ||
126 | local_irq_save(flags); | ||
127 | rdp = &__get_cpu_var(rcu_data); | ||
128 | *rdp->nxttail = head; | ||
129 | rdp->nxttail = &head->next; | ||
130 | if (unlikely(++rdp->qlen > qhimark)) { | ||
131 | rdp->blimit = INT_MAX; | ||
132 | force_quiescent_state(rdp, &rcu_ctrlblk); | ||
133 | } | ||
134 | local_irq_restore(flags); | ||
135 | } | ||
136 | EXPORT_SYMBOL_GPL(call_rcu); | ||
137 | |||
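A typical update-side use of call_rcu() defers freeing until all pre-existing readers have finished. The struct and function names below are illustrative, not taken from this file (hedged sketch):

#include <linux/kernel.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct foo {
	int data;
	struct rcu_head rcu;		/* embedded so call_rcu() can queue it */
};

static struct foo *global_foo;		/* readers dereference under rcu_read_lock() */

static void foo_reclaim(struct rcu_head *head)
{
	/* Runs after a grace period: no reader can still hold a reference. */
	kfree(container_of(head, struct foo, rcu));
}

static void foo_replace(struct foo *new)
{
	struct foo *old = global_foo;

	rcu_assign_pointer(global_foo, new);	/* publish the new version */
	if (old)
		call_rcu(&old->rcu, foo_reclaim);	/* reclaim the old one later */
}

The same pattern applies to call_rcu_bh() below when the readers use rcu_read_lock_bh()/rcu_read_unlock_bh().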
138 | /** | ||
139 | * call_rcu_bh - Queue an RCU callback for invocation after a quicker grace period. | ||
140 | * @head: structure to be used for queueing the RCU updates. | ||
141 | * @func: actual update function to be invoked after the grace period | ||
142 | * | ||
143 | * The update function will be invoked some time after a full grace | ||
144 | * period elapses, in other words after all currently executing RCU | ||
145 | * read-side critical sections have completed. call_rcu_bh() assumes | ||
146 | * that the read-side critical sections end on completion of a softirq | ||
147 | * handler. This means that read-side critical sections in process | ||
148 | * context must not be interrupted by softirqs. This interface is to be | ||
149 | * used when most of the read-side critical sections are in softirq context. | ||
150 | * RCU read-side critical sections are delimited by rcu_read_lock() and | ||
151 | * rcu_read_unlock(), if in interrupt context, or rcu_read_lock_bh() | ||
152 | * and rcu_read_unlock_bh(), if in process context. These may be nested. | ||
153 | */ | ||
154 | void call_rcu_bh(struct rcu_head *head, | ||
155 | void (*func)(struct rcu_head *rcu)) | ||
156 | { | ||
157 | unsigned long flags; | ||
158 | struct rcu_data *rdp; | ||
159 | |||
160 | head->func = func; | ||
161 | head->next = NULL; | ||
162 | local_irq_save(flags); | ||
163 | rdp = &__get_cpu_var(rcu_bh_data); | ||
164 | *rdp->nxttail = head; | ||
165 | rdp->nxttail = &head->next; | ||
166 | |||
167 | if (unlikely(++rdp->qlen > qhimark)) { | ||
168 | rdp->blimit = INT_MAX; | ||
169 | force_quiescent_state(rdp, &rcu_bh_ctrlblk); | ||
170 | } | ||
171 | |||
172 | local_irq_restore(flags); | ||
173 | } | ||
174 | EXPORT_SYMBOL_GPL(call_rcu_bh); | ||
175 | |||
176 | /* | ||
177 | * Return the number of RCU batches processed thus far. Useful | ||
178 | * for debug and statistics. | ||
179 | */ | ||
180 | long rcu_batches_completed(void) | ||
181 | { | ||
182 | return rcu_ctrlblk.completed; | ||
183 | } | ||
184 | EXPORT_SYMBOL_GPL(rcu_batches_completed); | ||
185 | |||
186 | /* | ||
187 | * Return the number of RCU batches processed thus far. Useful | ||
188 | * for debug and statistics. | ||
189 | */ | ||
190 | long rcu_batches_completed_bh(void) | ||
191 | { | ||
192 | return rcu_bh_ctrlblk.completed; | ||
193 | } | ||
194 | EXPORT_SYMBOL_GPL(rcu_batches_completed_bh); | ||
195 | |||
196 | /* Raises the softirq for processing rcu_callbacks. */ | ||
197 | static inline void raise_rcu_softirq(void) | ||
198 | { | ||
199 | raise_softirq(RCU_SOFTIRQ); | ||
200 | /* | ||
201 | * The smp_mb() here is required to ensure that this cpu's | ||
202 | * __rcu_process_callbacks() reads the most recently updated | ||
203 | * value of rcu->cur. | ||
204 | */ | ||
205 | smp_mb(); | ||
206 | } | ||
207 | |||
208 | /* | ||
209 | * Invoke the completed RCU callbacks. They are expected to be in | ||
210 | * a per-cpu list. | ||
211 | */ | ||
212 | static void rcu_do_batch(struct rcu_data *rdp) | ||
213 | { | ||
214 | struct rcu_head *next, *list; | ||
215 | int count = 0; | ||
216 | |||
217 | list = rdp->donelist; | ||
218 | while (list) { | ||
219 | next = list->next; | ||
220 | prefetch(next); | ||
221 | list->func(list); | ||
222 | list = next; | ||
223 | if (++count >= rdp->blimit) | ||
224 | break; | ||
225 | } | ||
226 | rdp->donelist = list; | ||
227 | |||
228 | local_irq_disable(); | ||
229 | rdp->qlen -= count; | ||
230 | local_irq_enable(); | ||
231 | if (rdp->blimit == INT_MAX && rdp->qlen <= qlowmark) | ||
232 | rdp->blimit = blimit; | ||
233 | |||
234 | if (!rdp->donelist) | ||
235 | rdp->donetail = &rdp->donelist; | ||
236 | else | ||
237 | raise_rcu_softirq(); | ||
238 | } | ||
239 | |||
240 | /* | ||
241 | * Grace period handling: | ||
242 | * The grace period handling consists of two steps: | ||
243 | * - A new grace period is started. | ||
244 | * This is done by rcu_start_batch. The start is not broadcasted to | ||
245 | * all cpus, they must pick this up by comparing rcp->cur with | ||
246 | * rdp->quiescbatch. All cpus are recorded in the | ||
247 | * rcu_ctrlblk.cpumask bitmap. | ||
248 | * - All cpus must go through a quiescent state. | ||
249 | * Since the start of the grace period is not broadcasted, at least two | ||
250 | * calls to rcu_check_quiescent_state are required: | ||
251 | * The first call just notices that a new grace period is running. The | ||
252 | * following calls check if there was a quiescent state since the beginning | ||
253 | * of the grace period. If so, it updates rcu_ctrlblk.cpumask. If | ||
254 | * the bitmap is empty, then the grace period is completed. | ||
255 | * rcu_check_quiescent_state calls rcu_start_batch() to start the next grace | ||
256 | * period (if necessary). | ||
257 | */ | ||
258 | /* | ||
259 | * Register a new batch of callbacks, and start it up if there is currently no | ||
260 | * active batch and the batch to be registered has not already occurred. | ||
261 | * Caller must hold rcu_ctrlblk.lock. | ||
262 | */ | ||
263 | static void rcu_start_batch(struct rcu_ctrlblk *rcp) | ||
264 | { | ||
265 | if (rcp->next_pending && | ||
266 | rcp->completed == rcp->cur) { | ||
267 | rcp->next_pending = 0; | ||
268 | /* | ||
269 | * next_pending == 0 must be visible in | ||
270 | * __rcu_process_callbacks() before it can see new value of cur. | ||
271 | */ | ||
272 | smp_wmb(); | ||
273 | rcp->cur++; | ||
274 | |||
275 | /* | ||
276 | * Accessing nohz_cpu_mask before incrementing rcp->cur needs a | ||
277 | * barrier. Otherwise it can cause tickless idle CPUs to be | ||
278 | * included in rcp->cpumask, which will extend grace periods | ||
279 | * unnecessarily. | ||
280 | */ | ||
281 | smp_mb(); | ||
282 | cpus_andnot(rcp->cpumask, cpu_online_map, nohz_cpu_mask); | ||
283 | |||
284 | rcp->signaled = 0; | ||
285 | } | ||
286 | } | ||
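To make the cur/completed/cpumask interplay described in the comment block above concrete, here is a deliberately simplified stand-alone model (a user-space sketch: locking, the per-CPU callback lists and the nohz mask are all omitted, and the constants are made up):

/* Toy user-space model of the classic grace-period bookkeeping above.
 * Simplification assumptions: four CPUs, no locking, no nohz handling. */
#include <stdio.h>

#define NCPUS 4

static long cur = -300, completed = -300;	/* same initial values as above */
static int next_pending;
static unsigned int cpumask;	/* bit i set => CPU i still owes a quiescent state */

static void start_batch(void)
{
	if (next_pending && completed == cur) {
		next_pending = 0;
		cur++;					/* open a new grace period */
		cpumask = (1u << NCPUS) - 1;		/* wait on every "online" CPU */
	}
}

static void cpu_quiescent(int cpu)
{
	cpumask &= ~(1u << cpu);
	if (!cpumask) {			/* last CPU reported: grace period done */
		completed = cur;
		start_batch();		/* start the next one if callbacks are queued */
	}
}

int main(void)
{
	next_pending = 1;		/* somebody queued a callback */
	start_batch();
	for (int cpu = 0; cpu < NCPUS; cpu++)
		cpu_quiescent(cpu);	/* e.g. each CPU context-switched */
	printf("completed batch %ld\n", completed);	/* prints -299 */
	return 0;
}

Running it prints "completed batch -299": the queued batch completes only after every simulated CPU has reported a quiescent state, which is exactly the job cpu_quiet() performs below.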
287 | |||
288 | /* | ||
289 | * A CPU went through a quiescent state since the beginning of the grace period. | ||
290 | * Clear it from the cpu mask and complete the grace period if it was the last | ||
291 | * CPU. Start another grace period if someone has further entries pending. | ||
292 | */ | ||
293 | static void cpu_quiet(int cpu, struct rcu_ctrlblk *rcp) | ||
294 | { | ||
295 | cpu_clear(cpu, rcp->cpumask); | ||
296 | if (cpus_empty(rcp->cpumask)) { | ||
297 | /* batch completed ! */ | ||
298 | rcp->completed = rcp->cur; | ||
299 | rcu_start_batch(rcp); | ||
300 | } | ||
301 | } | ||
302 | |||
303 | /* | ||
304 | * Check if the cpu has gone through a quiescent state (say context | ||
305 | * switch). If so and if it already hasn't done so in this RCU | ||
306 | * quiescent cycle, then indicate that it has done so. | ||
307 | */ | ||
308 | static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp, | ||
309 | struct rcu_data *rdp) | ||
310 | { | ||
311 | if (rdp->quiescbatch != rcp->cur) { | ||
312 | /* start new grace period: */ | ||
313 | rdp->qs_pending = 1; | ||
314 | rdp->passed_quiesc = 0; | ||
315 | rdp->quiescbatch = rcp->cur; | ||
316 | return; | ||
317 | } | ||
318 | |||
319 | /* Grace period already completed for this cpu? | ||
320 | * qs_pending is checked instead of the actual bitmap to avoid | ||
321 | * cacheline thrashing. | ||
322 | */ | ||
323 | if (!rdp->qs_pending) | ||
324 | return; | ||
325 | |||
326 | /* | ||
327 | * Was there a quiescent state since the beginning of the grace | ||
328 | * period? If no, then exit and wait for the next call. | ||
329 | */ | ||
330 | if (!rdp->passed_quiesc) | ||
331 | return; | ||
332 | rdp->qs_pending = 0; | ||
333 | |||
334 | spin_lock(&rcp->lock); | ||
335 | /* | ||
336 | * rdp->quiescbatch/rcp->cur and the cpu bitmap can come out of sync | ||
337 | * during cpu startup. Ignore the quiescent state. | ||
338 | */ | ||
339 | if (likely(rdp->quiescbatch == rcp->cur)) | ||
340 | cpu_quiet(rdp->cpu, rcp); | ||
341 | |||
342 | spin_unlock(&rcp->lock); | ||
343 | } | ||
344 | |||
345 | |||
346 | #ifdef CONFIG_HOTPLUG_CPU | ||
347 | |||
348 | /* Warning! Helper for rcu_offline_cpu(). Do not use elsewhere without reviewing | ||
349 | * the locking requirements: the list it is pulling from has to belong to a cpu | ||
350 | * which is dead and hence not processing interrupts. | ||
351 | */ | ||
352 | static void rcu_move_batch(struct rcu_data *this_rdp, struct rcu_head *list, | ||
353 | struct rcu_head **tail) | ||
354 | { | ||
355 | local_irq_disable(); | ||
356 | *this_rdp->nxttail = list; | ||
357 | if (list) | ||
358 | this_rdp->nxttail = tail; | ||
359 | local_irq_enable(); | ||
360 | } | ||
361 | |||
362 | static void __rcu_offline_cpu(struct rcu_data *this_rdp, | ||
363 | struct rcu_ctrlblk *rcp, struct rcu_data *rdp) | ||
364 | { | ||
365 | /* if the cpu going offline owns the grace period | ||
366 | * we can block indefinitely waiting for it, so flush | ||
367 | * it here | ||
368 | */ | ||
369 | spin_lock_bh(&rcp->lock); | ||
370 | if (rcp->cur != rcp->completed) | ||
371 | cpu_quiet(rdp->cpu, rcp); | ||
372 | spin_unlock_bh(&rcp->lock); | ||
373 | rcu_move_batch(this_rdp, rdp->donelist, rdp->donetail); | ||
374 | rcu_move_batch(this_rdp, rdp->curlist, rdp->curtail); | ||
375 | rcu_move_batch(this_rdp, rdp->nxtlist, rdp->nxttail); | ||
376 | } | ||
377 | |||
378 | static void rcu_offline_cpu(int cpu) | ||
379 | { | ||
380 | struct rcu_data *this_rdp = &get_cpu_var(rcu_data); | ||
381 | struct rcu_data *this_bh_rdp = &get_cpu_var(rcu_bh_data); | ||
382 | |||
383 | __rcu_offline_cpu(this_rdp, &rcu_ctrlblk, | ||
384 | &per_cpu(rcu_data, cpu)); | ||
385 | __rcu_offline_cpu(this_bh_rdp, &rcu_bh_ctrlblk, | ||
386 | &per_cpu(rcu_bh_data, cpu)); | ||
387 | put_cpu_var(rcu_data); | ||
388 | put_cpu_var(rcu_bh_data); | ||
389 | } | ||
390 | |||
391 | #else | ||
392 | |||
393 | static void rcu_offline_cpu(int cpu) | ||
394 | { | ||
395 | } | ||
396 | |||
397 | #endif | ||
398 | |||
399 | /* | ||
400 | * This does the RCU processing work from softirq context. | ||
401 | */ | ||
402 | static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp, | ||
403 | struct rcu_data *rdp) | ||
404 | { | ||
405 | if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch)) { | ||
406 | *rdp->donetail = rdp->curlist; | ||
407 | rdp->donetail = rdp->curtail; | ||
408 | rdp->curlist = NULL; | ||
409 | rdp->curtail = &rdp->curlist; | ||
410 | } | ||
411 | |||
412 | if (rdp->nxtlist && !rdp->curlist) { | ||
413 | local_irq_disable(); | ||
414 | rdp->curlist = rdp->nxtlist; | ||
415 | rdp->curtail = rdp->nxttail; | ||
416 | rdp->nxtlist = NULL; | ||
417 | rdp->nxttail = &rdp->nxtlist; | ||
418 | local_irq_enable(); | ||
419 | |||
420 | /* | ||
421 | * start the next batch of callbacks | ||
422 | */ | ||
423 | |||
424 | /* determine batch number */ | ||
425 | rdp->batch = rcp->cur + 1; | ||
426 | /* see the comment and corresponding wmb() in | ||
427 | * the rcu_start_batch() | ||
428 | */ | ||
429 | smp_rmb(); | ||
430 | |||
431 | if (!rcp->next_pending) { | ||
432 | /* and start it/schedule start if it's a new batch */ | ||
433 | spin_lock(&rcp->lock); | ||
434 | rcp->next_pending = 1; | ||
435 | rcu_start_batch(rcp); | ||
436 | spin_unlock(&rcp->lock); | ||
437 | } | ||
438 | } | ||
439 | |||
440 | rcu_check_quiescent_state(rcp, rdp); | ||
441 | if (rdp->donelist) | ||
442 | rcu_do_batch(rdp); | ||
443 | } | ||
444 | |||
445 | static void rcu_process_callbacks(struct softirq_action *unused) | ||
446 | { | ||
447 | __rcu_process_callbacks(&rcu_ctrlblk, &__get_cpu_var(rcu_data)); | ||
448 | __rcu_process_callbacks(&rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data)); | ||
449 | } | ||
450 | |||
451 | static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) | ||
452 | { | ||
453 | /* This cpu has pending rcu entries and the grace period | ||
454 | * for them has completed. | ||
455 | */ | ||
456 | if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch)) | ||
457 | return 1; | ||
458 | |||
459 | /* This cpu has no pending entries, but there are new entries */ | ||
460 | if (!rdp->curlist && rdp->nxtlist) | ||
461 | return 1; | ||
462 | |||
463 | /* This cpu has finished callbacks to invoke */ | ||
464 | if (rdp->donelist) | ||
465 | return 1; | ||
466 | |||
467 | /* The rcu core waits for a quiescent state from the cpu */ | ||
468 | if (rdp->quiescbatch != rcp->cur || rdp->qs_pending) | ||
469 | return 1; | ||
470 | |||
471 | /* nothing to do */ | ||
472 | return 0; | ||
473 | } | ||
474 | |||
475 | /* | ||
476 | * Check to see if there is any immediate RCU-related work to be done | ||
477 | * by the current CPU, returning 1 if so. This function is part of the | ||
478 | * RCU implementation; it is -not- an exported member of the RCU API. | ||
479 | */ | ||
480 | int rcu_pending(int cpu) | ||
481 | { | ||
482 | return __rcu_pending(&rcu_ctrlblk, &per_cpu(rcu_data, cpu)) || | ||
483 | __rcu_pending(&rcu_bh_ctrlblk, &per_cpu(rcu_bh_data, cpu)); | ||
484 | } | ||
485 | |||
486 | /* | ||
487 | * Check to see if any future RCU-related work will need to be done | ||
488 | * by the current CPU, even if none need be done immediately, returning | ||
489 | * 1 if so. This function is part of the RCU implementation; it is -not- | ||
490 | * an exported member of the RCU API. | ||
491 | */ | ||
492 | int rcu_needs_cpu(int cpu) | ||
493 | { | ||
494 | struct rcu_data *rdp = &per_cpu(rcu_data, cpu); | ||
495 | struct rcu_data *rdp_bh = &per_cpu(rcu_bh_data, cpu); | ||
496 | |||
497 | return (!!rdp->curlist || !!rdp_bh->curlist || rcu_pending(cpu)); | ||
498 | } | ||
499 | |||
500 | void rcu_check_callbacks(int cpu, int user) | ||
501 | { | ||
502 | if (user || | ||
503 | (idle_cpu(cpu) && !in_softirq() && | ||
504 | hardirq_count() <= (1 << HARDIRQ_SHIFT))) { | ||
505 | rcu_qsctr_inc(cpu); | ||
506 | rcu_bh_qsctr_inc(cpu); | ||
507 | } else if (!in_softirq()) | ||
508 | rcu_bh_qsctr_inc(cpu); | ||
509 | raise_rcu_softirq(); | ||
510 | } | ||
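These hooks are driven from the periodic tick: the caller is expected to poll rcu_pending() and, when it returns nonzero, invoke rcu_check_callbacks(), which records quiescent states and raises RCU_SOFTIRQ. A hedged sketch of that glue (the function name is illustrative, not part of this patch):

/* Hedged sketch of the timer-tick glue expected to drive the code above;
 * the function name is illustrative and not part of this patch. */
static void example_scheduler_tick(int cpu, int user_tick)
{
	if (rcu_pending(cpu))
		rcu_check_callbacks(cpu, user_tick);	/* may raise RCU_SOFTIRQ */
}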
511 | |||
512 | static void rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp, | ||
513 | struct rcu_data *rdp) | ||
514 | { | ||
515 | memset(rdp, 0, sizeof(*rdp)); | ||
516 | rdp->curtail = &rdp->curlist; | ||
517 | rdp->nxttail = &rdp->nxtlist; | ||
518 | rdp->donetail = &rdp->donelist; | ||
519 | rdp->quiescbatch = rcp->completed; | ||
520 | rdp->qs_pending = 0; | ||
521 | rdp->cpu = cpu; | ||
522 | rdp->blimit = blimit; | ||
523 | } | ||
524 | |||
525 | static void __cpuinit rcu_online_cpu(int cpu) | ||
526 | { | ||
527 | struct rcu_data *rdp = &per_cpu(rcu_data, cpu); | ||
528 | struct rcu_data *bh_rdp = &per_cpu(rcu_bh_data, cpu); | ||
529 | |||
530 | rcu_init_percpu_data(cpu, &rcu_ctrlblk, rdp); | ||
531 | rcu_init_percpu_data(cpu, &rcu_bh_ctrlblk, bh_rdp); | ||
532 | open_softirq(RCU_SOFTIRQ, rcu_process_callbacks, NULL); | ||
533 | } | ||
534 | |||
535 | static int __cpuinit rcu_cpu_notify(struct notifier_block *self, | ||
536 | unsigned long action, void *hcpu) | ||
537 | { | ||
538 | long cpu = (long)hcpu; | ||
539 | |||
540 | switch (action) { | ||
541 | case CPU_UP_PREPARE: | ||
542 | case CPU_UP_PREPARE_FROZEN: | ||
543 | rcu_online_cpu(cpu); | ||
544 | break; | ||
545 | case CPU_DEAD: | ||
546 | case CPU_DEAD_FROZEN: | ||
547 | rcu_offline_cpu(cpu); | ||
548 | break; | ||
549 | default: | ||
550 | break; | ||
551 | } | ||
552 | return NOTIFY_OK; | ||
553 | } | ||
554 | |||
555 | static struct notifier_block __cpuinitdata rcu_nb = { | ||
556 | .notifier_call = rcu_cpu_notify, | ||
557 | }; | ||
558 | |||
559 | /* | ||
560 | * Initializes the RCU mechanism. Assumed to be called early, | ||
561 | * that is, before the local timer (SMP) or jiffy timer (uniprocessor) is set up. | ||
562 | * Note that rcu_qsctr and friends are implicitly | ||
563 | * initialized due to the choice of ``0'' for RCU_CTR_INVALID. | ||
564 | */ | ||
565 | void __init __rcu_init(void) | ||
566 | { | ||
567 | rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, | ||
568 | (void *)(long)smp_processor_id()); | ||
569 | /* Register notifier for non-boot CPUs */ | ||
570 | register_cpu_notifier(&rcu_nb); | ||
571 | } | ||
572 | |||
573 | module_param(blimit, int, 0); | ||
574 | module_param(qhimark, int, 0); | ||
575 | module_param(qlowmark, int, 0); | ||
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index a66d4d1615f7..760dfc233a00 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c | |||
@@ -15,7 +15,7 @@ | |||
15 | * along with this program; if not, write to the Free Software | 15 | * along with this program; if not, write to the Free Software |
16 | * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. | 16 | * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. |
17 | * | 17 | * |
18 | * Copyright (C) IBM Corporation, 2001 | 18 | * Copyright IBM Corporation, 2001 |
19 | * | 19 | * |
20 | * Authors: Dipankar Sarma <dipankar@in.ibm.com> | 20 | * Authors: Dipankar Sarma <dipankar@in.ibm.com> |
21 | * Manfred Spraul <manfred@colorfullife.com> | 21 | * Manfred Spraul <manfred@colorfullife.com> |
@@ -35,165 +35,57 @@ | |||
35 | #include <linux/init.h> | 35 | #include <linux/init.h> |
36 | #include <linux/spinlock.h> | 36 | #include <linux/spinlock.h> |
37 | #include <linux/smp.h> | 37 | #include <linux/smp.h> |
38 | #include <linux/rcupdate.h> | ||
39 | #include <linux/interrupt.h> | 38 | #include <linux/interrupt.h> |
40 | #include <linux/sched.h> | 39 | #include <linux/sched.h> |
41 | #include <asm/atomic.h> | 40 | #include <asm/atomic.h> |
42 | #include <linux/bitops.h> | 41 | #include <linux/bitops.h> |
43 | #include <linux/module.h> | ||
44 | #include <linux/completion.h> | 42 | #include <linux/completion.h> |
45 | #include <linux/moduleparam.h> | ||
46 | #include <linux/percpu.h> | 43 | #include <linux/percpu.h> |
47 | #include <linux/notifier.h> | 44 | #include <linux/notifier.h> |
48 | #include <linux/cpu.h> | 45 | #include <linux/cpu.h> |
49 | #include <linux/mutex.h> | 46 | #include <linux/mutex.h> |
47 | #include <linux/module.h> | ||
50 | 48 | ||
51 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | 49 | struct rcu_synchronize { |
52 | static struct lock_class_key rcu_lock_key; | 50 | struct rcu_head head; |
53 | struct lockdep_map rcu_lock_map = | 51 | struct completion completion; |
54 | STATIC_LOCKDEP_MAP_INIT("rcu_read_lock", &rcu_lock_key); | ||
55 | |||
56 | EXPORT_SYMBOL_GPL(rcu_lock_map); | ||
57 | #endif | ||
58 | |||
59 | /* Definition for rcupdate control block. */ | ||
60 | static struct rcu_ctrlblk rcu_ctrlblk = { | ||
61 | .cur = -300, | ||
62 | .completed = -300, | ||
63 | .lock = __SPIN_LOCK_UNLOCKED(&rcu_ctrlblk.lock), | ||
64 | .cpumask = CPU_MASK_NONE, | ||
65 | }; | ||
66 | static struct rcu_ctrlblk rcu_bh_ctrlblk = { | ||
67 | .cur = -300, | ||
68 | .completed = -300, | ||
69 | .lock = __SPIN_LOCK_UNLOCKED(&rcu_bh_ctrlblk.lock), | ||
70 | .cpumask = CPU_MASK_NONE, | ||
71 | }; | 52 | }; |
72 | 53 | ||
73 | DEFINE_PER_CPU(struct rcu_data, rcu_data) = { 0L }; | 54 | static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL}; |
74 | DEFINE_PER_CPU(struct rcu_data, rcu_bh_data) = { 0L }; | ||
75 | |||
76 | /* Fake initialization required by compiler */ | ||
77 | static DEFINE_PER_CPU(struct tasklet_struct, rcu_tasklet) = {NULL}; | ||
78 | static int blimit = 10; | ||
79 | static int qhimark = 10000; | ||
80 | static int qlowmark = 100; | ||
81 | |||
82 | static atomic_t rcu_barrier_cpu_count; | 55 | static atomic_t rcu_barrier_cpu_count; |
83 | static DEFINE_MUTEX(rcu_barrier_mutex); | 56 | static DEFINE_MUTEX(rcu_barrier_mutex); |
84 | static struct completion rcu_barrier_completion; | 57 | static struct completion rcu_barrier_completion; |
85 | 58 | ||
86 | #ifdef CONFIG_SMP | 59 | /* Because of FASTCALL declaration of complete, we use this wrapper */ |
87 | static void force_quiescent_state(struct rcu_data *rdp, | 60 | static void wakeme_after_rcu(struct rcu_head *head) |
88 | struct rcu_ctrlblk *rcp) | ||
89 | { | ||
90 | int cpu; | ||
91 | cpumask_t cpumask; | ||
92 | set_need_resched(); | ||
93 | if (unlikely(!rcp->signaled)) { | ||
94 | rcp->signaled = 1; | ||
95 | /* | ||
96 | * Don't send IPI to itself. With irqs disabled, | ||
97 | * rdp->cpu is the current cpu. | ||
98 | */ | ||
99 | cpumask = rcp->cpumask; | ||
100 | cpu_clear(rdp->cpu, cpumask); | ||
101 | for_each_cpu_mask(cpu, cpumask) | ||
102 | smp_send_reschedule(cpu); | ||
103 | } | ||
104 | } | ||
105 | #else | ||
106 | static inline void force_quiescent_state(struct rcu_data *rdp, | ||
107 | struct rcu_ctrlblk *rcp) | ||
108 | { | 61 | { |
109 | set_need_resched(); | 62 | struct rcu_synchronize *rcu; |
63 | |||
64 | rcu = container_of(head, struct rcu_synchronize, head); | ||
65 | complete(&rcu->completion); | ||
110 | } | 66 | } |
111 | #endif | ||
112 | 67 | ||
113 | /** | 68 | /** |
114 | * call_rcu - Queue an RCU callback for invocation after a grace period. | 69 | * synchronize_rcu - wait until a grace period has elapsed. |
115 | * @head: structure to be used for queueing the RCU updates. | ||
116 | * @func: actual update function to be invoked after the grace period | ||
117 | * | 70 | * |
118 | * The update function will be invoked some time after a full grace | 71 | * Control will return to the caller some time after a full grace |
119 | * period elapses, in other words after all currently executing RCU | 72 | * period has elapsed, in other words after all currently executing RCU |
120 | * read-side critical sections have completed. RCU read-side critical | 73 | * read-side critical sections have completed. RCU read-side critical |
121 | * sections are delimited by rcu_read_lock() and rcu_read_unlock(), | 74 | * sections are delimited by rcu_read_lock() and rcu_read_unlock(), |
122 | * and may be nested. | 75 | * and may be nested. |
123 | */ | 76 | */ |
124 | void fastcall call_rcu(struct rcu_head *head, | 77 | void synchronize_rcu(void) |
125 | void (*func)(struct rcu_head *rcu)) | ||
126 | { | ||
127 | unsigned long flags; | ||
128 | struct rcu_data *rdp; | ||
129 | |||
130 | head->func = func; | ||
131 | head->next = NULL; | ||
132 | local_irq_save(flags); | ||
133 | rdp = &__get_cpu_var(rcu_data); | ||
134 | *rdp->nxttail = head; | ||
135 | rdp->nxttail = &head->next; | ||
136 | if (unlikely(++rdp->qlen > qhimark)) { | ||
137 | rdp->blimit = INT_MAX; | ||
138 | force_quiescent_state(rdp, &rcu_ctrlblk); | ||
139 | } | ||
140 | local_irq_restore(flags); | ||
141 | } | ||
142 | |||
143 | /** | ||
144 | * call_rcu_bh - Queue an RCU callback for invocation after a quicker grace period. | ||
145 | * @head: structure to be used for queueing the RCU updates. | ||
146 | * @func: actual update function to be invoked after the grace period | ||
147 | * | ||
148 | * The update function will be invoked some time after a full grace | ||
149 | * period elapses, in other words after all currently executing RCU | ||
150 | * read-side critical sections have completed. call_rcu_bh() assumes | ||
151 | * that the read-side critical sections end on completion of a softirq | ||
152 | * handler. This means that read-side critical sections in process | ||
153 | * context must not be interrupted by softirqs. This interface is to be | ||
154 | * used when most of the read-side critical sections are in softirq context. | ||
155 | * RCU read-side critical sections are delimited by rcu_read_lock() and | ||
156 | * rcu_read_unlock(), if in interrupt context, or rcu_read_lock_bh() | ||
157 | * and rcu_read_unlock_bh(), if in process context. These may be nested. | ||
158 | */ | ||
159 | void fastcall call_rcu_bh(struct rcu_head *head, | ||
160 | void (*func)(struct rcu_head *rcu)) | ||
161 | { | 78 | { |
162 | unsigned long flags; | 79 | struct rcu_synchronize rcu; |
163 | struct rcu_data *rdp; | ||
164 | |||
165 | head->func = func; | ||
166 | head->next = NULL; | ||
167 | local_irq_save(flags); | ||
168 | rdp = &__get_cpu_var(rcu_bh_data); | ||
169 | *rdp->nxttail = head; | ||
170 | rdp->nxttail = &head->next; | ||
171 | |||
172 | if (unlikely(++rdp->qlen > qhimark)) { | ||
173 | rdp->blimit = INT_MAX; | ||
174 | force_quiescent_state(rdp, &rcu_bh_ctrlblk); | ||
175 | } | ||
176 | |||
177 | local_irq_restore(flags); | ||
178 | } | ||
179 | 80 | ||
180 | /* | 81 | init_completion(&rcu.completion); |
181 | * Return the number of RCU batches processed thus far. Useful | 82 | /* Will wake me after RCU finished */ |
182 | * for debug and statistics. | 83 | call_rcu(&rcu.head, wakeme_after_rcu); |
183 | */ | ||
184 | long rcu_batches_completed(void) | ||
185 | { | ||
186 | return rcu_ctrlblk.completed; | ||
187 | } | ||
188 | 84 | ||
189 | /* | 85 | /* Wait for it */ |
190 | * Return the number of RCU batches processed thus far. Useful | 86 | wait_for_completion(&rcu.completion); |
191 | * for debug and statistics. | ||
192 | */ | ||
193 | long rcu_batches_completed_bh(void) | ||
194 | { | ||
195 | return rcu_bh_ctrlblk.completed; | ||
196 | } | 87 | } |
88 | EXPORT_SYMBOL_GPL(synchronize_rcu); | ||
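With this move, synchronize_rcu() is built purely on call_rcu() plus a completion, so it works for any implementation that provides call_rcu(). The classic blocking removal pattern it enables (hedged sketch; the list, lock and struct names are illustrative):

#include <linux/list.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

struct item {
	struct list_head list;
	int key;
};

static LIST_HEAD(item_list);		/* readers walk it under rcu_read_lock() */
static DEFINE_SPINLOCK(item_lock);	/* serializes updaters only */

static void item_del(struct item *it)
{
	spin_lock(&item_lock);
	list_del_rcu(&it->list);	/* unpublish: new readers cannot find it */
	spin_unlock(&item_lock);

	synchronize_rcu();		/* wait out readers that already saw it */
	kfree(it);			/* nobody can still be using it now */
}

call_rcu() avoids the blocking at the cost of embedding an rcu_head in the object, as in the earlier call_rcu() example.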
197 | 89 | ||
198 | static void rcu_barrier_callback(struct rcu_head *notused) | 90 | static void rcu_barrier_callback(struct rcu_head *notused) |
199 | { | 91 | { |
@@ -207,10 +99,8 @@ static void rcu_barrier_callback(struct rcu_head *notused) | |||
207 | static void rcu_barrier_func(void *notused) | 99 | static void rcu_barrier_func(void *notused) |
208 | { | 100 | { |
209 | int cpu = smp_processor_id(); | 101 | int cpu = smp_processor_id(); |
210 | struct rcu_data *rdp = &per_cpu(rcu_data, cpu); | 102 | struct rcu_head *head = &per_cpu(rcu_barrier_head, cpu); |
211 | struct rcu_head *head; | ||
212 | 103 | ||
213 | head = &rdp->barrier; | ||
214 | atomic_inc(&rcu_barrier_cpu_count); | 104 | atomic_inc(&rcu_barrier_cpu_count); |
215 | call_rcu(head, rcu_barrier_callback); | 105 | call_rcu(head, rcu_barrier_callback); |
216 | } | 106 | } |
@@ -225,420 +115,24 @@ void rcu_barrier(void) | |||
225 | mutex_lock(&rcu_barrier_mutex); | 115 | mutex_lock(&rcu_barrier_mutex); |
226 | init_completion(&rcu_barrier_completion); | 116 | init_completion(&rcu_barrier_completion); |
227 | atomic_set(&rcu_barrier_cpu_count, 0); | 117 | atomic_set(&rcu_barrier_cpu_count, 0); |
118 | /* | ||
119 | * The queueing of callbacks in all CPUs must be atomic with | ||
120 | * respect to RCU, otherwise one CPU may queue a callback, | ||
121 | * wait for a grace period, decrement barrier count and call | ||
122 | * complete(), while other CPUs have not yet queued anything. | ||
123 | * So, we need to make sure that grace periods cannot complete | ||
124 | * until all the callbacks are queued. | ||
125 | */ | ||
126 | rcu_read_lock(); | ||
228 | on_each_cpu(rcu_barrier_func, NULL, 0, 1); | 127 | on_each_cpu(rcu_barrier_func, NULL, 0, 1); |
128 | rcu_read_unlock(); | ||
229 | wait_for_completion(&rcu_barrier_completion); | 129 | wait_for_completion(&rcu_barrier_completion); |
230 | mutex_unlock(&rcu_barrier_mutex); | 130 | mutex_unlock(&rcu_barrier_mutex); |
231 | } | 131 | } |
232 | EXPORT_SYMBOL_GPL(rcu_barrier); | 132 | EXPORT_SYMBOL_GPL(rcu_barrier); |
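rcu_barrier() waits for all callbacks already handed to call_rcu() to be invoked, which is what module unload needs before callback code and data disappear. A hedged sketch of the usual exit-path ordering (the teardown helper is hypothetical):

#include <linux/module.h>
#include <linux/rcupdate.h>

static void __exit example_exit(void)
{
	/* 1. Stop queueing new callbacks (unregister notifiers, hooks, ...);
	 *    this helper is hypothetical. */
	example_unregister_everything();

	/* 2. Wait for every callback already passed to call_rcu() to run.
	 *    synchronize_rcu() alone is not enough: it waits for a grace
	 *    period, not for the callbacks themselves to be invoked. */
	rcu_barrier();

	/* 3. Only now is it safe to free module-owned data and return. */
}
module_exit(example_exit);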
233 | 133 | ||
234 | /* | ||
235 | * Invoke the completed RCU callbacks. They are expected to be in | ||
236 | * a per-cpu list. | ||
237 | */ | ||
238 | static void rcu_do_batch(struct rcu_data *rdp) | ||
239 | { | ||
240 | struct rcu_head *next, *list; | ||
241 | int count = 0; | ||
242 | |||
243 | list = rdp->donelist; | ||
244 | while (list) { | ||
245 | next = list->next; | ||
246 | prefetch(next); | ||
247 | list->func(list); | ||
248 | list = next; | ||
249 | if (++count >= rdp->blimit) | ||
250 | break; | ||
251 | } | ||
252 | rdp->donelist = list; | ||
253 | |||
254 | local_irq_disable(); | ||
255 | rdp->qlen -= count; | ||
256 | local_irq_enable(); | ||
257 | if (rdp->blimit == INT_MAX && rdp->qlen <= qlowmark) | ||
258 | rdp->blimit = blimit; | ||
259 | |||
260 | if (!rdp->donelist) | ||
261 | rdp->donetail = &rdp->donelist; | ||
262 | else | ||
263 | tasklet_schedule(&per_cpu(rcu_tasklet, rdp->cpu)); | ||
264 | } | ||
265 | |||
266 | /* | ||
267 | * Grace period handling: | ||
268 | * The grace period handling consists of two steps: | ||
269 | * - A new grace period is started. | ||
270 | * This is done by rcu_start_batch. The start is not broadcasted to | ||
271 | * all cpus, they must pick this up by comparing rcp->cur with | ||
272 | * rdp->quiescbatch. All cpus are recorded in the | ||
273 | * rcu_ctrlblk.cpumask bitmap. | ||
274 | * - All cpus must go through a quiescent state. | ||
275 | * Since the start of the grace period is not broadcasted, at least two | ||
276 | * calls to rcu_check_quiescent_state are required: | ||
277 | * The first call just notices that a new grace period is running. The | ||
278 | * following calls check if there was a quiescent state since the beginning | ||
279 | * of the grace period. If so, it updates rcu_ctrlblk.cpumask. If | ||
280 | * the bitmap is empty, then the grace period is completed. | ||
281 | * rcu_check_quiescent_state calls rcu_start_batch(0) to start the next grace | ||
282 | * period (if necessary). | ||
283 | */ | ||
284 | /* | ||
285 | * Register a new batch of callbacks, and start it up if there is currently no | ||
286 | * active batch and the batch to be registered has not already occurred. | ||
287 | * Caller must hold rcu_ctrlblk.lock. | ||
288 | */ | ||
289 | static void rcu_start_batch(struct rcu_ctrlblk *rcp) | ||
290 | { | ||
291 | if (rcp->next_pending && | ||
292 | rcp->completed == rcp->cur) { | ||
293 | rcp->next_pending = 0; | ||
294 | /* | ||
295 | * next_pending == 0 must be visible in | ||
296 | * __rcu_process_callbacks() before it can see new value of cur. | ||
297 | */ | ||
298 | smp_wmb(); | ||
299 | rcp->cur++; | ||
300 | |||
301 | /* | ||
302 | * Accessing nohz_cpu_mask before incrementing rcp->cur needs a | ||
303 | * barrier. Otherwise it can cause tickless idle CPUs to be | ||
304 | * included in rcp->cpumask, which will extend grace periods | ||
305 | * unnecessarily. | ||
306 | */ | ||
307 | smp_mb(); | ||
308 | cpus_andnot(rcp->cpumask, cpu_online_map, nohz_cpu_mask); | ||
309 | |||
310 | rcp->signaled = 0; | ||
311 | } | ||
312 | } | ||
313 | |||
314 | /* | ||
315 | * cpu went through a quiescent state since the beginning of the grace period. | ||
316 | * Clear it from the cpu mask and complete the grace period if it was the last | ||
317 | * cpu. Start another grace period if someone has further entries pending | ||
318 | */ | ||
319 | static void cpu_quiet(int cpu, struct rcu_ctrlblk *rcp) | ||
320 | { | ||
321 | cpu_clear(cpu, rcp->cpumask); | ||
322 | if (cpus_empty(rcp->cpumask)) { | ||
323 | /* batch completed ! */ | ||
324 | rcp->completed = rcp->cur; | ||
325 | rcu_start_batch(rcp); | ||
326 | } | ||
327 | } | ||
328 | |||
329 | /* | ||
330 | * Check if the cpu has gone through a quiescent state (say context | ||
331 | * switch). If so and if it already hasn't done so in this RCU | ||
332 | * quiescent cycle, then indicate that it has done so. | ||
333 | */ | ||
334 | static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp, | ||
335 | struct rcu_data *rdp) | ||
336 | { | ||
337 | if (rdp->quiescbatch != rcp->cur) { | ||
338 | /* start new grace period: */ | ||
339 | rdp->qs_pending = 1; | ||
340 | rdp->passed_quiesc = 0; | ||
341 | rdp->quiescbatch = rcp->cur; | ||
342 | return; | ||
343 | } | ||
344 | |||
345 | /* Grace period already completed for this cpu? | ||
346 | * qs_pending is checked instead of the actual bitmap to avoid | ||
347 | * cacheline thrashing. | ||
348 | */ | ||
349 | if (!rdp->qs_pending) | ||
350 | return; | ||
351 | |||
352 | /* | ||
353 | * Was there a quiescent state since the beginning of the grace | ||
354 | * period? If no, then exit and wait for the next call. | ||
355 | */ | ||
356 | if (!rdp->passed_quiesc) | ||
357 | return; | ||
358 | rdp->qs_pending = 0; | ||
359 | |||
360 | spin_lock(&rcp->lock); | ||
361 | /* | ||
362 | * rdp->quiescbatch/rcp->cur and the cpu bitmap can come out of sync | ||
363 | * during cpu startup. Ignore the quiescent state. | ||
364 | */ | ||
365 | if (likely(rdp->quiescbatch == rcp->cur)) | ||
366 | cpu_quiet(rdp->cpu, rcp); | ||
367 | |||
368 | spin_unlock(&rcp->lock); | ||
369 | } | ||
370 | |||
371 | |||
372 | #ifdef CONFIG_HOTPLUG_CPU | ||
373 | |||
374 | /* warning! helper for rcu_offline_cpu. do not use elsewhere without reviewing | ||
375 | * locking requirements, the list it's pulling from has to belong to a cpu | ||
376 | * which is dead and hence not processing interrupts. | ||
377 | */ | ||
378 | static void rcu_move_batch(struct rcu_data *this_rdp, struct rcu_head *list, | ||
379 | struct rcu_head **tail) | ||
380 | { | ||
381 | local_irq_disable(); | ||
382 | *this_rdp->nxttail = list; | ||
383 | if (list) | ||
384 | this_rdp->nxttail = tail; | ||
385 | local_irq_enable(); | ||
386 | } | ||
387 | |||
388 | static void __rcu_offline_cpu(struct rcu_data *this_rdp, | ||
389 | struct rcu_ctrlblk *rcp, struct rcu_data *rdp) | ||
390 | { | ||
391 | /* if the cpu going offline owns the grace period | ||
392 | * we can block indefinitely waiting for it, so flush | ||
393 | * it here | ||
394 | */ | ||
395 | spin_lock_bh(&rcp->lock); | ||
396 | if (rcp->cur != rcp->completed) | ||
397 | cpu_quiet(rdp->cpu, rcp); | ||
398 | spin_unlock_bh(&rcp->lock); | ||
399 | rcu_move_batch(this_rdp, rdp->curlist, rdp->curtail); | ||
400 | rcu_move_batch(this_rdp, rdp->nxtlist, rdp->nxttail); | ||
401 | rcu_move_batch(this_rdp, rdp->donelist, rdp->donetail); | ||
402 | } | ||
403 | |||
404 | static void rcu_offline_cpu(int cpu) | ||
405 | { | ||
406 | struct rcu_data *this_rdp = &get_cpu_var(rcu_data); | ||
407 | struct rcu_data *this_bh_rdp = &get_cpu_var(rcu_bh_data); | ||
408 | |||
409 | __rcu_offline_cpu(this_rdp, &rcu_ctrlblk, | ||
410 | &per_cpu(rcu_data, cpu)); | ||
411 | __rcu_offline_cpu(this_bh_rdp, &rcu_bh_ctrlblk, | ||
412 | &per_cpu(rcu_bh_data, cpu)); | ||
413 | put_cpu_var(rcu_data); | ||
414 | put_cpu_var(rcu_bh_data); | ||
415 | tasklet_kill_immediate(&per_cpu(rcu_tasklet, cpu), cpu); | ||
416 | } | ||
417 | |||
418 | #else | ||
419 | |||
420 | static void rcu_offline_cpu(int cpu) | ||
421 | { | ||
422 | } | ||
423 | |||
424 | #endif | ||
425 | |||
426 | /* | ||
427 | * This does the RCU processing work from tasklet context. | ||
428 | */ | ||
429 | static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp, | ||
430 | struct rcu_data *rdp) | ||
431 | { | ||
432 | if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch)) { | ||
433 | *rdp->donetail = rdp->curlist; | ||
434 | rdp->donetail = rdp->curtail; | ||
435 | rdp->curlist = NULL; | ||
436 | rdp->curtail = &rdp->curlist; | ||
437 | } | ||
438 | |||
439 | if (rdp->nxtlist && !rdp->curlist) { | ||
440 | local_irq_disable(); | ||
441 | rdp->curlist = rdp->nxtlist; | ||
442 | rdp->curtail = rdp->nxttail; | ||
443 | rdp->nxtlist = NULL; | ||
444 | rdp->nxttail = &rdp->nxtlist; | ||
445 | local_irq_enable(); | ||
446 | |||
447 | /* | ||
448 | * start the next batch of callbacks | ||
449 | */ | ||
450 | |||
451 | /* determine batch number */ | ||
452 | rdp->batch = rcp->cur + 1; | ||
453 | /* see the comment and corresponding wmb() in | ||
454 | * the rcu_start_batch() | ||
455 | */ | ||
456 | smp_rmb(); | ||
457 | |||
458 | if (!rcp->next_pending) { | ||
459 | /* and start it/schedule start if it's a new batch */ | ||
460 | spin_lock(&rcp->lock); | ||
461 | rcp->next_pending = 1; | ||
462 | rcu_start_batch(rcp); | ||
463 | spin_unlock(&rcp->lock); | ||
464 | } | ||
465 | } | ||
466 | |||
467 | rcu_check_quiescent_state(rcp, rdp); | ||
468 | if (rdp->donelist) | ||
469 | rcu_do_batch(rdp); | ||
470 | } | ||
471 | |||
472 | static void rcu_process_callbacks(unsigned long unused) | ||
473 | { | ||
474 | __rcu_process_callbacks(&rcu_ctrlblk, &__get_cpu_var(rcu_data)); | ||
475 | __rcu_process_callbacks(&rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data)); | ||
476 | } | ||
477 | |||
478 | static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) | ||
479 | { | ||
480 | /* This cpu has pending rcu entries and the grace period | ||
481 | * for them has completed. | ||
482 | */ | ||
483 | if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch)) | ||
484 | return 1; | ||
485 | |||
486 | /* This cpu has no pending entries, but there are new entries */ | ||
487 | if (!rdp->curlist && rdp->nxtlist) | ||
488 | return 1; | ||
489 | |||
490 | /* This cpu has finished callbacks to invoke */ | ||
491 | if (rdp->donelist) | ||
492 | return 1; | ||
493 | |||
494 | /* The rcu core waits for a quiescent state from the cpu */ | ||
495 | if (rdp->quiescbatch != rcp->cur || rdp->qs_pending) | ||
496 | return 1; | ||
497 | |||
498 | /* nothing to do */ | ||
499 | return 0; | ||
500 | } | ||
501 | |||
502 | /* | ||
503 | * Check to see if there is any immediate RCU-related work to be done | ||
504 | * by the current CPU, returning 1 if so. This function is part of the | ||
505 | * RCU implementation; it is -not- an exported member of the RCU API. | ||
506 | */ | ||
507 | int rcu_pending(int cpu) | ||
508 | { | ||
509 | return __rcu_pending(&rcu_ctrlblk, &per_cpu(rcu_data, cpu)) || | ||
510 | __rcu_pending(&rcu_bh_ctrlblk, &per_cpu(rcu_bh_data, cpu)); | ||
511 | } | ||
512 | |||
513 | /* | ||
514 | * Check to see if any future RCU-related work will need to be done | ||
515 | * by the current CPU, even if none need be done immediately, returning | ||
516 | * 1 if so. This function is part of the RCU implementation; it is -not- | ||
517 | * an exported member of the RCU API. | ||
518 | */ | ||
519 | int rcu_needs_cpu(int cpu) | ||
520 | { | ||
521 | struct rcu_data *rdp = &per_cpu(rcu_data, cpu); | ||
522 | struct rcu_data *rdp_bh = &per_cpu(rcu_bh_data, cpu); | ||
523 | |||
524 | return (!!rdp->curlist || !!rdp_bh->curlist || rcu_pending(cpu)); | ||
525 | } | ||
526 | |||
527 | void rcu_check_callbacks(int cpu, int user) | ||
528 | { | ||
529 | if (user || | ||
530 | (idle_cpu(cpu) && !in_softirq() && | ||
531 | hardirq_count() <= (1 << HARDIRQ_SHIFT))) { | ||
532 | rcu_qsctr_inc(cpu); | ||
533 | rcu_bh_qsctr_inc(cpu); | ||
534 | } else if (!in_softirq()) | ||
535 | rcu_bh_qsctr_inc(cpu); | ||
536 | tasklet_schedule(&per_cpu(rcu_tasklet, cpu)); | ||
537 | } | ||
538 | |||
539 | static void rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp, | ||
540 | struct rcu_data *rdp) | ||
541 | { | ||
542 | memset(rdp, 0, sizeof(*rdp)); | ||
543 | rdp->curtail = &rdp->curlist; | ||
544 | rdp->nxttail = &rdp->nxtlist; | ||
545 | rdp->donetail = &rdp->donelist; | ||
546 | rdp->quiescbatch = rcp->completed; | ||
547 | rdp->qs_pending = 0; | ||
548 | rdp->cpu = cpu; | ||
549 | rdp->blimit = blimit; | ||
550 | } | ||
551 | |||
552 | static void __devinit rcu_online_cpu(int cpu) | ||
553 | { | ||
554 | struct rcu_data *rdp = &per_cpu(rcu_data, cpu); | ||
555 | struct rcu_data *bh_rdp = &per_cpu(rcu_bh_data, cpu); | ||
556 | |||
557 | rcu_init_percpu_data(cpu, &rcu_ctrlblk, rdp); | ||
558 | rcu_init_percpu_data(cpu, &rcu_bh_ctrlblk, bh_rdp); | ||
559 | tasklet_init(&per_cpu(rcu_tasklet, cpu), rcu_process_callbacks, 0UL); | ||
560 | } | ||
561 | |||
562 | static int __cpuinit rcu_cpu_notify(struct notifier_block *self, | ||
563 | unsigned long action, void *hcpu) | ||
564 | { | ||
565 | long cpu = (long)hcpu; | ||
566 | switch (action) { | ||
567 | case CPU_UP_PREPARE: | ||
568 | case CPU_UP_PREPARE_FROZEN: | ||
569 | rcu_online_cpu(cpu); | ||
570 | break; | ||
571 | case CPU_DEAD: | ||
572 | case CPU_DEAD_FROZEN: | ||
573 | rcu_offline_cpu(cpu); | ||
574 | break; | ||
575 | default: | ||
576 | break; | ||
577 | } | ||
578 | return NOTIFY_OK; | ||
579 | } | ||
580 | |||
581 | static struct notifier_block __cpuinitdata rcu_nb = { | ||
582 | .notifier_call = rcu_cpu_notify, | ||
583 | }; | ||
584 | |||
585 | /* | ||
586 | * Initializes rcu mechanism. Assumed to be called early. | ||
587 | * That is before local timer(SMP) or jiffie timer (uniproc) is setup. | ||
588 | * Note that rcu_qsctr and friends are implicitly | ||
589 | * initialized due to the choice of ``0'' for RCU_CTR_INVALID. | ||
590 | */ | ||
591 | void __init rcu_init(void) | 134 | void __init rcu_init(void) |
592 | { | 135 | { |
593 | rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, | 136 | __rcu_init(); |
594 | (void *)(long)smp_processor_id()); | ||
595 | /* Register notifier for non-boot CPUs */ | ||
596 | register_cpu_notifier(&rcu_nb); | ||
597 | } | ||
598 | |||
599 | struct rcu_synchronize { | ||
600 | struct rcu_head head; | ||
601 | struct completion completion; | ||
602 | }; | ||
603 | |||
604 | /* Because of FASTCALL declaration of complete, we use this wrapper */ | ||
605 | static void wakeme_after_rcu(struct rcu_head *head) | ||
606 | { | ||
607 | struct rcu_synchronize *rcu; | ||
608 | |||
609 | rcu = container_of(head, struct rcu_synchronize, head); | ||
610 | complete(&rcu->completion); | ||
611 | } | 137 | } |
612 | 138 | ||
613 | /** | ||
614 | * synchronize_rcu - wait until a grace period has elapsed. | ||
615 | * | ||
616 | * Control will return to the caller some time after a full grace | ||
617 | * period has elapsed, in other words after all currently executing RCU | ||
618 | * read-side critical sections have completed. RCU read-side critical | ||
619 | * sections are delimited by rcu_read_lock() and rcu_read_unlock(), | ||
620 | * and may be nested. | ||
621 | * | ||
622 | * If your read-side code is not protected by rcu_read_lock(), do -not- | ||
623 | * use synchronize_rcu(). | ||
624 | */ | ||
625 | void synchronize_rcu(void) | ||
626 | { | ||
627 | struct rcu_synchronize rcu; | ||
628 | |||
629 | init_completion(&rcu.completion); | ||
630 | /* Will wake me after RCU finished */ | ||
631 | call_rcu(&rcu.head, wakeme_after_rcu); | ||
632 | |||
633 | /* Wait for it */ | ||
634 | wait_for_completion(&rcu.completion); | ||
635 | } | ||
636 | |||
637 | module_param(blimit, int, 0); | ||
638 | module_param(qhimark, int, 0); | ||
639 | module_param(qlowmark, int, 0); | ||
640 | EXPORT_SYMBOL_GPL(rcu_batches_completed); | ||
641 | EXPORT_SYMBOL_GPL(rcu_batches_completed_bh); | ||
642 | EXPORT_SYMBOL_GPL(call_rcu); | ||
643 | EXPORT_SYMBOL_GPL(call_rcu_bh); | ||
644 | EXPORT_SYMBOL_GPL(synchronize_rcu); | ||
diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c new file mode 100644 index 000000000000..987cfb7ade89 --- /dev/null +++ b/kernel/rcupreempt.c | |||
@@ -0,0 +1,953 @@ | |||
1 | /* | ||
2 | * Read-Copy Update mechanism for mutual exclusion, realtime implementation | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or modify | ||
5 | * it under the terms of the GNU General Public License as published by | ||
6 | * the Free Software Foundation; either version 2 of the License, or | ||
7 | * (at your option) any later version. | ||
8 | * | ||
9 | * This program is distributed in the hope that it will be useful, | ||
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
12 | * GNU General Public License for more details. | ||
13 | * | ||
14 | * You should have received a copy of the GNU General Public License | ||
15 | * along with this program; if not, write to the Free Software | ||
16 | * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. | ||
17 | * | ||
18 | * Copyright IBM Corporation, 2006 | ||
19 | * | ||
20 | * Authors: Paul E. McKenney <paulmck@us.ibm.com> | ||
21 | * With thanks to Esben Nielsen, Bill Huey, and Ingo Molnar | ||
22 | * for pushing me away from locks and towards counters, and | ||
23 | * to Suparna Bhattacharya for pushing me completely away | ||
24 | * from atomic instructions on the read side. | ||
25 | * | ||
26 | * Papers: http://www.rdrop.com/users/paulmck/RCU | ||
27 | * | ||
28 | * Design Document: http://lwn.net/Articles/253651/ | ||
29 | * | ||
30 | * For detailed explanation of Read-Copy Update mechanism see - | ||
31 | * Documentation/RCU/ *.txt | ||
32 | * | ||
33 | */ | ||
34 | #include <linux/types.h> | ||
35 | #include <linux/kernel.h> | ||
36 | #include <linux/init.h> | ||
37 | #include <linux/spinlock.h> | ||
38 | #include <linux/smp.h> | ||
39 | #include <linux/rcupdate.h> | ||
40 | #include <linux/interrupt.h> | ||
41 | #include <linux/sched.h> | ||
42 | #include <asm/atomic.h> | ||
43 | #include <linux/bitops.h> | ||
44 | #include <linux/module.h> | ||
45 | #include <linux/completion.h> | ||
46 | #include <linux/moduleparam.h> | ||
47 | #include <linux/percpu.h> | ||
48 | #include <linux/notifier.h> | ||
49 | #include <linux/rcupdate.h> | ||
50 | #include <linux/cpu.h> | ||
51 | #include <linux/random.h> | ||
52 | #include <linux/delay.h> | ||
53 | #include <linux/byteorder/swabb.h> | ||
54 | #include <linux/cpumask.h> | ||
55 | #include <linux/rcupreempt_trace.h> | ||
56 | |||
57 | /* | ||
58 | * Macro that prevents the compiler from reordering accesses, but does | ||
59 | * absolutely -nothing- to prevent CPUs from reordering. This is used | ||
60 | * only to mediate communication between mainline code and hardware | ||
61 | * interrupt and NMI handlers. | ||
62 | */ | ||
63 | #define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x)) | ||
64 | |||
65 | /* | ||
66 | * PREEMPT_RCU data structures. | ||
67 | */ | ||
68 | |||
69 | /* | ||
70 | * GP_STAGES specifies the number of times the state machine has | ||
71 | * to go through all the rcu_try_flip_states (see below) | ||
72 | * in a single Grace Period. | ||
73 | * | ||
74 | * GP in GP_STAGES stands for Grace Period ;) | ||
75 | */ | ||
76 | #define GP_STAGES 2 | ||
77 | struct rcu_data { | ||
78 | spinlock_t lock; /* Protect rcu_data fields. */ | ||
79 | long completed; /* Number of last completed batch. */ | ||
80 | int waitlistcount; | ||
81 | struct tasklet_struct rcu_tasklet; | ||
82 | struct rcu_head *nextlist; | ||
83 | struct rcu_head **nexttail; | ||
84 | struct rcu_head *waitlist[GP_STAGES]; | ||
85 | struct rcu_head **waittail[GP_STAGES]; | ||
86 | struct rcu_head *donelist; | ||
87 | struct rcu_head **donetail; | ||
88 | long rcu_flipctr[2]; | ||
89 | #ifdef CONFIG_RCU_TRACE | ||
90 | struct rcupreempt_trace trace; | ||
91 | #endif /* #ifdef CONFIG_RCU_TRACE */ | ||
92 | }; | ||
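The nextlist/nexttail, waitlist[]/waittail[] and donelist/donetail pairs above all use the same tail-pointer idiom: the tail points at the last callback's ->next field (or at the list head when the list is empty), so appending a callback or splicing an entire list is O(1) with no end-of-list scan. A stripped-down, user-space sketch of the idiom; the types and function names are simplified and purely illustrative.

	#include <stddef.h>

	struct cb {
		struct cb *next;
	};

	/* Empty list: head is NULL and the tail points at the head itself. */
	struct cb *list;
	struct cb **tail = &list;

	void enqueue(struct cb *p)
	{
		p->next = NULL;
		*tail = p;		/* link onto the end ...            */
		tail = &p->next;	/* ... and advance the tail pointer */
	}

	struct cb *dst;
	struct cb **dsttail = &dst;

	/* Splice the whole source list onto dst in O(1), the same move that
	 * __rcu_advance_callbacks() and rcu_offline_cpu() perform below. */
	void splice_all(void)
	{
		*dsttail = list;
		if (list) {
			dsttail = tail;
			list = NULL;
			tail = &list;
		}
	}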
93 | |||
94 | /* | ||
95 | * States for rcu_try_flip() and friends. | ||
96 | */ | ||
97 | |||
98 | enum rcu_try_flip_states { | ||
99 | |||
100 | /* | ||
100 | * Stay here if nothing is happening. Flip the counter if something | ||
101 | * starts happening. Denoted by "I". | ||
103 | */ | ||
104 | rcu_try_flip_idle_state, | ||
105 | |||
106 | /* | ||
107 | * Wait here for all CPUs to notice that the counter has flipped. This | ||
108 | * prevents the old set of counters from ever being incremented once | ||
109 | * we leave this state, which in turn is necessary because we cannot | ||
110 | * test any individual counter for zero -- we can only check the sum. | ||
111 | * Denoted by "A". | ||
112 | */ | ||
113 | rcu_try_flip_waitack_state, | ||
114 | |||
115 | /* | ||
116 | * Wait here for the sum of the old per-CPU counters to reach zero. | ||
117 | * Denoted by "Z". | ||
118 | */ | ||
119 | rcu_try_flip_waitzero_state, | ||
120 | |||
121 | /* | ||
122 | * Wait here for each of the other CPUs to execute a memory barrier. | ||
123 | * This is necessary to ensure that these other CPUs really have | ||
124 | * completed executing their RCU read-side critical sections, despite | ||
125 | * their CPUs wildly reordering memory. Denoted by "M". | ||
126 | */ | ||
127 | rcu_try_flip_waitmb_state, | ||
128 | }; | ||
129 | |||
130 | struct rcu_ctrlblk { | ||
131 | spinlock_t fliplock; /* Protect state-machine transitions. */ | ||
132 | long completed; /* Number of last completed batch. */ | ||
133 | enum rcu_try_flip_states rcu_try_flip_state; /* The current state of | ||
134 | the rcu state machine */ | ||
135 | }; | ||
136 | |||
137 | static DEFINE_PER_CPU(struct rcu_data, rcu_data); | ||
138 | static struct rcu_ctrlblk rcu_ctrlblk = { | ||
139 | .fliplock = __SPIN_LOCK_UNLOCKED(rcu_ctrlblk.fliplock), | ||
140 | .completed = 0, | ||
141 | .rcu_try_flip_state = rcu_try_flip_idle_state, | ||
142 | }; | ||
143 | |||
144 | |||
145 | #ifdef CONFIG_RCU_TRACE | ||
146 | static char *rcu_try_flip_state_names[] = | ||
147 | { "idle", "waitack", "waitzero", "waitmb" }; | ||
148 | #endif /* #ifdef CONFIG_RCU_TRACE */ | ||
149 | |||
150 | static cpumask_t rcu_cpu_online_map __read_mostly = CPU_MASK_NONE; | ||
151 | |||
152 | /* | ||
153 | * Enum and per-CPU flag to determine when each CPU has seen | ||
154 | * the most recent counter flip. | ||
155 | */ | ||
156 | |||
157 | enum rcu_flip_flag_values { | ||
158 | rcu_flip_seen, /* Steady/initial state, last flip seen. */ | ||
159 | /* Only GP detector can update. */ | ||
160 | rcu_flipped /* Flip just completed, need confirmation. */ | ||
161 | /* Only corresponding CPU can update. */ | ||
162 | }; | ||
163 | static DEFINE_PER_CPU_SHARED_ALIGNED(enum rcu_flip_flag_values, rcu_flip_flag) | ||
164 | = rcu_flip_seen; | ||
165 | |||
166 | /* | ||
167 | * Enum and per-CPU flag to determine when each CPU has executed the | ||
168 | * needed memory barrier to fence in memory references from its last RCU | ||
169 | * read-side critical section in the just-completed grace period. | ||
170 | */ | ||
171 | |||
172 | enum rcu_mb_flag_values { | ||
173 | rcu_mb_done, /* Steady/initial state, no mb()s required. */ | ||
174 | /* Only GP detector can update. */ | ||
175 | rcu_mb_needed /* Flip just completed, need an mb(). */ | ||
176 | /* Only corresponding CPU can update. */ | ||
177 | }; | ||
178 | static DEFINE_PER_CPU_SHARED_ALIGNED(enum rcu_mb_flag_values, rcu_mb_flag) | ||
179 | = rcu_mb_done; | ||
180 | |||
181 | /* | ||
182 | * RCU_DATA_ME: find the current CPU's rcu_data structure. | ||
183 | * RCU_DATA_CPU: find the specified CPU's rcu_data structure. | ||
184 | */ | ||
185 | #define RCU_DATA_ME() (&__get_cpu_var(rcu_data)) | ||
186 | #define RCU_DATA_CPU(cpu) (&per_cpu(rcu_data, cpu)) | ||
187 | |||
188 | /* | ||
189 | * Helper macro for tracing when the appropriate rcu_data is not | ||
190 | * cached in a local variable, but where the CPU number is so cached. | ||
191 | */ | ||
192 | #define RCU_TRACE_CPU(f, cpu) RCU_TRACE(f, &(RCU_DATA_CPU(cpu)->trace)); | ||
193 | |||
194 | /* | ||
195 | * Helper macro for tracing when the appropriate rcu_data is not | ||
196 | * cached in a local variable. | ||
197 | */ | ||
198 | #define RCU_TRACE_ME(f) RCU_TRACE(f, &(RCU_DATA_ME()->trace)); | ||
199 | |||
200 | /* | ||
201 | * Helper macro for tracing when the appropriate rcu_data is pointed | ||
202 | * to by a local variable. | ||
203 | */ | ||
204 | #define RCU_TRACE_RDP(f, rdp) RCU_TRACE(f, &((rdp)->trace)); | ||
205 | |||
206 | /* | ||
207 | * Return the number of RCU batches processed thus far. Useful | ||
208 | * for debug and statistics. | ||
209 | */ | ||
210 | long rcu_batches_completed(void) | ||
211 | { | ||
212 | return rcu_ctrlblk.completed; | ||
213 | } | ||
214 | EXPORT_SYMBOL_GPL(rcu_batches_completed); | ||
215 | |||
216 | EXPORT_SYMBOL_GPL(rcu_batches_completed_bh); | ||
217 | |||
218 | void __rcu_read_lock(void) | ||
219 | { | ||
220 | int idx; | ||
221 | struct task_struct *t = current; | ||
222 | int nesting; | ||
223 | |||
224 | nesting = ACCESS_ONCE(t->rcu_read_lock_nesting); | ||
225 | if (nesting != 0) { | ||
226 | |||
227 | /* An earlier rcu_read_lock() covers us, just count it. */ | ||
228 | |||
229 | t->rcu_read_lock_nesting = nesting + 1; | ||
230 | |||
231 | } else { | ||
232 | unsigned long flags; | ||
233 | |||
234 | /* | ||
235 | * We disable interrupts for the following reasons: | ||
236 | * - If we get a scheduling-clock interrupt here, and we | ||
237 | * end up acking the counter flip, it is like a promise | ||
238 | * that we will never increment the old counter again. | ||
239 | * We would break that promise if that scheduling-clock | ||
240 | * interrupt happened between the time we pick the | ||
241 | * .completed field and the time that we increment our | ||
242 | * counter. | ||
243 | * | ||
244 | * - We don't want to be preempted out here. | ||
245 | * | ||
246 | * NMIs can still occur, of course, and might themselves | ||
247 | * contain rcu_read_lock(). | ||
248 | */ | ||
249 | |||
250 | local_irq_save(flags); | ||
251 | |||
252 | /* | ||
253 | * Outermost nesting of rcu_read_lock(), so increment | ||
254 | * the current counter for the current CPU. Use volatile | ||
255 | * casts to prevent the compiler from reordering. | ||
256 | */ | ||
257 | |||
258 | idx = ACCESS_ONCE(rcu_ctrlblk.completed) & 0x1; | ||
259 | ACCESS_ONCE(RCU_DATA_ME()->rcu_flipctr[idx])++; | ||
260 | |||
261 | /* | ||
262 | * Now that the per-CPU counter has been incremented, we | ||
263 | * are protected from races with rcu_read_lock() invoked | ||
264 | * from NMI handlers on this CPU. We can therefore safely | ||
265 | * increment the nesting counter, relieving further NMIs | ||
266 | * of the need to increment the per-CPU counter. | ||
267 | */ | ||
268 | |||
269 | ACCESS_ONCE(t->rcu_read_lock_nesting) = nesting + 1; | ||
270 | |||
271 | /* | ||
272 | * Now that we have prevented any NMIs from storing | ||
273 | * to the ->rcu_flipctr_idx, we can safely use it to | ||
274 | * remember which counter to decrement in the matching | ||
275 | * rcu_read_unlock(). | ||
276 | */ | ||
277 | |||
278 | ACCESS_ONCE(t->rcu_flipctr_idx) = idx; | ||
279 | local_irq_restore(flags); | ||
280 | } | ||
281 | } | ||
282 | EXPORT_SYMBOL_GPL(__rcu_read_lock); | ||
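__rcu_read_lock() above is what backs rcu_read_lock() in this preemptible implementation: a reader only bumps a per-task nesting count and, for the outermost section, one per-CPU counter, so the read side stays free of locks and atomics. A hedged reader-side sketch, reusing the hypothetical struct foo and global_foo from the earlier update-side example:

	#include <linux/rcupdate.h>

	/* Lockless read side: pairs with the foo_update() sketch above. */
	static int foo_get_data(void)
	{
		struct foo *p;
		int val = 0;

		rcu_read_lock();		/* ends up in __rcu_read_lock() */
		p = rcu_dereference(global_foo);
		if (p)
			val = p->data;
		rcu_read_unlock();		/* ends up in __rcu_read_unlock() */
		return val;
	}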
283 | |||
284 | void __rcu_read_unlock(void) | ||
285 | { | ||
286 | int idx; | ||
287 | struct task_struct *t = current; | ||
288 | int nesting; | ||
289 | |||
290 | nesting = ACCESS_ONCE(t->rcu_read_lock_nesting); | ||
291 | if (nesting > 1) { | ||
292 | |||
293 | /* | ||
294 | * We are still protected by the enclosing rcu_read_lock(), | ||
295 | * so simply decrement the counter. | ||
296 | */ | ||
297 | |||
298 | t->rcu_read_lock_nesting = nesting - 1; | ||
299 | |||
300 | } else { | ||
301 | unsigned long flags; | ||
302 | |||
303 | /* | ||
304 | * Disable local interrupts to prevent the grace-period | ||
305 | * detection state machine from seeing us half-done. | ||
306 | * NMIs can still occur, of course, and might themselves | ||
307 | * contain rcu_read_lock() and rcu_read_unlock(). | ||
308 | */ | ||
309 | |||
310 | local_irq_save(flags); | ||
311 | |||
312 | /* | ||
313 | * Outermost nesting of rcu_read_unlock(), so we must | ||
314 | * decrement the current counter for the current CPU. | ||
315 | * This must be done carefully, because NMIs can | ||
316 | * occur at any point in this code, and any rcu_read_lock() | ||
317 | * and rcu_read_unlock() pairs in the NMI handlers | ||
318 | * must interact non-destructively with this code. | ||
319 | * Lots of volatile casts, and -very- careful ordering. | ||
320 | * | ||
321 | * Changes to this code, including this one, must be | ||
322 | * inspected, validated, and tested extremely carefully!!! | ||
323 | */ | ||
324 | |||
325 | /* | ||
326 | * First, pick up the index. | ||
327 | */ | ||
328 | |||
329 | idx = ACCESS_ONCE(t->rcu_flipctr_idx); | ||
330 | |||
331 | /* | ||
332 | * Now that we have fetched the counter index, it is | ||
333 | * safe to decrement the per-task RCU nesting counter. | ||
334 | * After this, any interrupts or NMIs will increment and | ||
335 | * decrement the per-CPU counters. | ||
336 | */ | ||
337 | ACCESS_ONCE(t->rcu_read_lock_nesting) = nesting - 1; | ||
338 | |||
339 | /* | ||
340 | * The task's nesting count has now been decremented, so | ||
341 | * NMIs that occur after this point will route their | ||
342 | * rcu_read_lock() calls through this "else" clause, and | ||
343 | * will thus start incrementing the per-CPU counter on | ||
344 | * their own. They will also clobber ->rcu_flipctr_idx, | ||
345 | * but that is OK, since we have already fetched it. | ||
346 | */ | ||
347 | |||
348 | ACCESS_ONCE(RCU_DATA_ME()->rcu_flipctr[idx])--; | ||
349 | local_irq_restore(flags); | ||
350 | } | ||
351 | } | ||
352 | EXPORT_SYMBOL_GPL(__rcu_read_unlock); | ||
353 | |||
354 | /* | ||
355 | * If a global counter flip has occurred since the last time that we | ||
356 | * advanced callbacks, advance them. Hardware interrupts must be | ||
357 | * disabled when calling this function. | ||
358 | */ | ||
359 | static void __rcu_advance_callbacks(struct rcu_data *rdp) | ||
360 | { | ||
361 | int cpu; | ||
362 | int i; | ||
363 | int wlc = 0; | ||
364 | |||
365 | if (rdp->completed != rcu_ctrlblk.completed) { | ||
366 | if (rdp->waitlist[GP_STAGES - 1] != NULL) { | ||
367 | *rdp->donetail = rdp->waitlist[GP_STAGES - 1]; | ||
368 | rdp->donetail = rdp->waittail[GP_STAGES - 1]; | ||
369 | RCU_TRACE_RDP(rcupreempt_trace_move2done, rdp); | ||
370 | } | ||
371 | for (i = GP_STAGES - 2; i >= 0; i--) { | ||
372 | if (rdp->waitlist[i] != NULL) { | ||
373 | rdp->waitlist[i + 1] = rdp->waitlist[i]; | ||
374 | rdp->waittail[i + 1] = rdp->waittail[i]; | ||
375 | wlc++; | ||
376 | } else { | ||
377 | rdp->waitlist[i + 1] = NULL; | ||
378 | rdp->waittail[i + 1] = | ||
379 | &rdp->waitlist[i + 1]; | ||
380 | } | ||
381 | } | ||
382 | if (rdp->nextlist != NULL) { | ||
383 | rdp->waitlist[0] = rdp->nextlist; | ||
384 | rdp->waittail[0] = rdp->nexttail; | ||
385 | wlc++; | ||
386 | rdp->nextlist = NULL; | ||
387 | rdp->nexttail = &rdp->nextlist; | ||
388 | RCU_TRACE_RDP(rcupreempt_trace_move2wait, rdp); | ||
389 | } else { | ||
390 | rdp->waitlist[0] = NULL; | ||
391 | rdp->waittail[0] = &rdp->waitlist[0]; | ||
392 | } | ||
393 | rdp->waitlistcount = wlc; | ||
394 | rdp->completed = rcu_ctrlblk.completed; | ||
395 | } | ||
396 | |||
397 | /* | ||
398 | * Check to see if this CPU needs to report that it has seen | ||
399 | * the most recent counter flip, thereby declaring that all | ||
400 | * subsequent rcu_read_lock() invocations will respect this flip. | ||
401 | */ | ||
402 | |||
403 | cpu = raw_smp_processor_id(); | ||
404 | if (per_cpu(rcu_flip_flag, cpu) == rcu_flipped) { | ||
405 | smp_mb(); /* Subsequent counter accesses must see new value */ | ||
406 | per_cpu(rcu_flip_flag, cpu) = rcu_flip_seen; | ||
407 | smp_mb(); /* Subsequent RCU read-side critical sections */ | ||
408 | /* seen -after- acknowledgement. */ | ||
409 | } | ||
410 | } | ||
411 | |||
412 | /* | ||
413 | * Get here when RCU is idle. Decide whether we need to | ||
414 | * move out of idle state, and return non-zero if so. | ||
415 | * "Straightforward" approach for the moment, might later | ||
416 | * use callback-list lengths, grace-period duration, or | ||
417 | * some such to determine when to exit idle state. | ||
418 | * Might also need a pre-idle test that does not acquire | ||
419 | * the lock, but let's get the simple case working first... | ||
420 | */ | ||
421 | |||
422 | static int | ||
423 | rcu_try_flip_idle(void) | ||
424 | { | ||
425 | int cpu; | ||
426 | |||
427 | RCU_TRACE_ME(rcupreempt_trace_try_flip_i1); | ||
428 | if (!rcu_pending(smp_processor_id())) { | ||
429 | RCU_TRACE_ME(rcupreempt_trace_try_flip_ie1); | ||
430 | return 0; | ||
431 | } | ||
432 | |||
433 | /* | ||
434 | * Do the flip. | ||
435 | */ | ||
436 | |||
437 | RCU_TRACE_ME(rcupreempt_trace_try_flip_g1); | ||
438 | rcu_ctrlblk.completed++; /* stands in for rcu_try_flip_g2 */ | ||
439 | |||
440 | /* | ||
441 | * Need a memory barrier so that other CPUs see the new | ||
442 | * counter value before they see the subsequent change of all | ||
443 | * the rcu_flip_flag instances to rcu_flipped. | ||
444 | */ | ||
445 | |||
446 | smp_mb(); /* see above block comment. */ | ||
447 | |||
448 | /* Now ask each CPU for acknowledgement of the flip. */ | ||
449 | |||
450 | for_each_cpu_mask(cpu, rcu_cpu_online_map) | ||
451 | per_cpu(rcu_flip_flag, cpu) = rcu_flipped; | ||
452 | |||
453 | return 1; | ||
454 | } | ||
455 | |||
456 | /* | ||
457 | * Wait for CPUs to acknowledge the flip. | ||
458 | */ | ||
459 | |||
460 | static int | ||
461 | rcu_try_flip_waitack(void) | ||
462 | { | ||
463 | int cpu; | ||
464 | |||
465 | RCU_TRACE_ME(rcupreempt_trace_try_flip_a1); | ||
466 | for_each_cpu_mask(cpu, rcu_cpu_online_map) | ||
467 | if (per_cpu(rcu_flip_flag, cpu) != rcu_flip_seen) { | ||
468 | RCU_TRACE_ME(rcupreempt_trace_try_flip_ae1); | ||
469 | return 0; | ||
470 | } | ||
471 | |||
472 | /* | ||
473 | * Make sure our checks above don't bleed into subsequent | ||
474 | * waiting for the sum of the counters to reach zero. | ||
475 | */ | ||
476 | |||
477 | smp_mb(); /* see above block comment. */ | ||
478 | RCU_TRACE_ME(rcupreempt_trace_try_flip_a2); | ||
479 | return 1; | ||
480 | } | ||
481 | |||
482 | /* | ||
483 | * Wait for collective ``last'' counter to reach zero, | ||
484 | * then tell all CPUs to do an end-of-grace-period memory barrier. | ||
485 | */ | ||
486 | |||
487 | static int | ||
488 | rcu_try_flip_waitzero(void) | ||
489 | { | ||
490 | int cpu; | ||
491 | int lastidx = !(rcu_ctrlblk.completed & 0x1); | ||
492 | int sum = 0; | ||
493 | |||
494 | /* Check to see if the sum of the "last" counters is zero. */ | ||
495 | |||
496 | RCU_TRACE_ME(rcupreempt_trace_try_flip_z1); | ||
497 | for_each_cpu_mask(cpu, rcu_cpu_online_map) | ||
498 | sum += RCU_DATA_CPU(cpu)->rcu_flipctr[lastidx]; | ||
499 | if (sum != 0) { | ||
500 | RCU_TRACE_ME(rcupreempt_trace_try_flip_ze1); | ||
501 | return 0; | ||
502 | } | ||
503 | |||
504 | /* | ||
505 | * This ensures that the other CPUs see the call for | ||
506 | * memory barriers -after- the sum to zero has been | ||
507 | * detected here. | ||
508 | */ | ||
509 | smp_mb(); /* ^^^^^^^^^^^^ */ | ||
510 | |||
511 | /* Call for a memory barrier from each CPU. */ | ||
512 | for_each_cpu_mask(cpu, rcu_cpu_online_map) | ||
513 | per_cpu(rcu_mb_flag, cpu) = rcu_mb_needed; | ||
514 | |||
515 | RCU_TRACE_ME(rcupreempt_trace_try_flip_z2); | ||
516 | return 1; | ||
517 | } | ||
518 | |||
519 | /* | ||
520 | * Wait for all CPUs to do their end-of-grace-period memory barrier. | ||
521 | * Return 1 once all CPUs have done so. | ||
522 | */ | ||
523 | |||
524 | static int | ||
525 | rcu_try_flip_waitmb(void) | ||
526 | { | ||
527 | int cpu; | ||
528 | |||
529 | RCU_TRACE_ME(rcupreempt_trace_try_flip_m1); | ||
530 | for_each_cpu_mask(cpu, rcu_cpu_online_map) | ||
531 | if (per_cpu(rcu_mb_flag, cpu) != rcu_mb_done) { | ||
532 | RCU_TRACE_ME(rcupreempt_trace_try_flip_me1); | ||
533 | return 0; | ||
534 | } | ||
535 | |||
536 | smp_mb(); /* Ensure that the above checks precede any following flip. */ | ||
537 | RCU_TRACE_ME(rcupreempt_trace_try_flip_m2); | ||
538 | return 1; | ||
539 | } | ||
540 | |||
541 | /* | ||
542 | * Attempt a single flip of the counters. Remember, a single flip does | ||
543 | * -not- constitute a grace period. Instead, the interval between | ||
544 | * at least GP_STAGES consecutive flips is a grace period. | ||
545 | * | ||
546 | * If anyone is nuts enough to run this CONFIG_PREEMPT_RCU implementation | ||
547 | * on a large SMP, they might want to use a hierarchical organization of | ||
548 | * the per-CPU-counter pairs. | ||
549 | */ | ||
550 | static void rcu_try_flip(void) | ||
551 | { | ||
552 | unsigned long flags; | ||
553 | |||
554 | RCU_TRACE_ME(rcupreempt_trace_try_flip_1); | ||
555 | if (unlikely(!spin_trylock_irqsave(&rcu_ctrlblk.fliplock, flags))) { | ||
556 | RCU_TRACE_ME(rcupreempt_trace_try_flip_e1); | ||
557 | return; | ||
558 | } | ||
559 | |||
560 | /* | ||
561 | * Take the next transition(s) through the RCU grace-period | ||
562 | * flip-counter state machine. | ||
563 | */ | ||
564 | |||
565 | switch (rcu_ctrlblk.rcu_try_flip_state) { | ||
566 | case rcu_try_flip_idle_state: | ||
567 | if (rcu_try_flip_idle()) | ||
568 | rcu_ctrlblk.rcu_try_flip_state = | ||
569 | rcu_try_flip_waitack_state; | ||
570 | break; | ||
571 | case rcu_try_flip_waitack_state: | ||
572 | if (rcu_try_flip_waitack()) | ||
573 | rcu_ctrlblk.rcu_try_flip_state = | ||
574 | rcu_try_flip_waitzero_state; | ||
575 | break; | ||
576 | case rcu_try_flip_waitzero_state: | ||
577 | if (rcu_try_flip_waitzero()) | ||
578 | rcu_ctrlblk.rcu_try_flip_state = | ||
579 | rcu_try_flip_waitmb_state; | ||
580 | break; | ||
581 | case rcu_try_flip_waitmb_state: | ||
582 | if (rcu_try_flip_waitmb()) | ||
583 | rcu_ctrlblk.rcu_try_flip_state = | ||
584 | rcu_try_flip_idle_state; | ||
585 | } | ||
586 | spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags); | ||
587 | } | ||
588 | |||
589 | /* | ||
590 | * Check to see if this CPU needs to do a memory barrier in order to | ||
591 | * ensure that any prior RCU read-side critical sections have committed | ||
592 | * their counter manipulations and critical-section memory references | ||
593 | * before declaring the grace period to be completed. | ||
594 | */ | ||
595 | static void rcu_check_mb(int cpu) | ||
596 | { | ||
597 | if (per_cpu(rcu_mb_flag, cpu) == rcu_mb_needed) { | ||
598 | smp_mb(); /* Ensure RCU read-side accesses are visible. */ | ||
599 | per_cpu(rcu_mb_flag, cpu) = rcu_mb_done; | ||
600 | } | ||
601 | } | ||
602 | |||
603 | void rcu_check_callbacks(int cpu, int user) | ||
604 | { | ||
605 | unsigned long flags; | ||
606 | struct rcu_data *rdp = RCU_DATA_CPU(cpu); | ||
607 | |||
608 | rcu_check_mb(cpu); | ||
609 | if (rcu_ctrlblk.completed == rdp->completed) | ||
610 | rcu_try_flip(); | ||
611 | spin_lock_irqsave(&rdp->lock, flags); | ||
612 | RCU_TRACE_RDP(rcupreempt_trace_check_callbacks, rdp); | ||
613 | __rcu_advance_callbacks(rdp); | ||
614 | if (rdp->donelist == NULL) { | ||
615 | spin_unlock_irqrestore(&rdp->lock, flags); | ||
616 | } else { | ||
617 | spin_unlock_irqrestore(&rdp->lock, flags); | ||
618 | raise_softirq(RCU_SOFTIRQ); | ||
619 | } | ||
620 | } | ||
621 | |||
622 | /* | ||
623 | * Needed by dynticks, to make sure all RCU processing has finished | ||
624 | * when we go idle: | ||
625 | */ | ||
626 | void rcu_advance_callbacks(int cpu, int user) | ||
627 | { | ||
628 | unsigned long flags; | ||
629 | struct rcu_data *rdp = RCU_DATA_CPU(cpu); | ||
630 | |||
631 | if (rcu_ctrlblk.completed == rdp->completed) { | ||
632 | rcu_try_flip(); | ||
633 | if (rcu_ctrlblk.completed == rdp->completed) | ||
634 | return; | ||
635 | } | ||
636 | spin_lock_irqsave(&rdp->lock, flags); | ||
637 | RCU_TRACE_RDP(rcupreempt_trace_check_callbacks, rdp); | ||
638 | __rcu_advance_callbacks(rdp); | ||
639 | spin_unlock_irqrestore(&rdp->lock, flags); | ||
640 | } | ||
641 | |||
642 | #ifdef CONFIG_HOTPLUG_CPU | ||
643 | #define rcu_offline_cpu_enqueue(srclist, srctail, dstlist, dsttail) do { \ | ||
644 | *dsttail = srclist; \ | ||
645 | if (srclist != NULL) { \ | ||
646 | dsttail = srctail; \ | ||
647 | srclist = NULL; \ | ||
648 | srctail = &srclist;\ | ||
649 | } \ | ||
650 | } while (0) | ||
651 | |||
652 | void rcu_offline_cpu(int cpu) | ||
653 | { | ||
654 | int i; | ||
655 | struct rcu_head *list = NULL; | ||
656 | unsigned long flags; | ||
657 | struct rcu_data *rdp = RCU_DATA_CPU(cpu); | ||
658 | struct rcu_head **tail = &list; | ||
659 | |||
660 | /* | ||
661 | * Remove all callbacks from the newly dead CPU, retaining order. | ||
662 | * Otherwise rcu_barrier() will fail. | ||
663 | */ | ||
664 | |||
665 | spin_lock_irqsave(&rdp->lock, flags); | ||
666 | rcu_offline_cpu_enqueue(rdp->donelist, rdp->donetail, list, tail); | ||
667 | for (i = GP_STAGES - 1; i >= 0; i--) | ||
668 | rcu_offline_cpu_enqueue(rdp->waitlist[i], rdp->waittail[i], | ||
669 | list, tail); | ||
670 | rcu_offline_cpu_enqueue(rdp->nextlist, rdp->nexttail, list, tail); | ||
671 | spin_unlock_irqrestore(&rdp->lock, flags); | ||
672 | rdp->waitlistcount = 0; | ||
673 | |||
674 | /* Disengage the newly dead CPU from the grace-period computation. */ | ||
675 | |||
676 | spin_lock_irqsave(&rcu_ctrlblk.fliplock, flags); | ||
677 | rcu_check_mb(cpu); | ||
678 | if (per_cpu(rcu_flip_flag, cpu) == rcu_flipped) { | ||
679 | smp_mb(); /* Subsequent counter accesses must see new value */ | ||
680 | per_cpu(rcu_flip_flag, cpu) = rcu_flip_seen; | ||
681 | smp_mb(); /* Subsequent RCU read-side critical sections */ | ||
682 | /* seen -after- acknowledgement. */ | ||
683 | } | ||
684 | |||
685 | RCU_DATA_ME()->rcu_flipctr[0] += RCU_DATA_CPU(cpu)->rcu_flipctr[0]; | ||
686 | RCU_DATA_ME()->rcu_flipctr[1] += RCU_DATA_CPU(cpu)->rcu_flipctr[1]; | ||
687 | |||
688 | RCU_DATA_CPU(cpu)->rcu_flipctr[0] = 0; | ||
689 | RCU_DATA_CPU(cpu)->rcu_flipctr[1] = 0; | ||
690 | |||
691 | cpu_clear(cpu, rcu_cpu_online_map); | ||
692 | |||
693 | spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags); | ||
694 | |||
695 | /* | ||
696 | * Place the removed callbacks on the current CPU's queue. | ||
697 | * Make them all start a new grace period: simple approach, | ||
698 | * in theory could starve a given set of callbacks, but | ||
699 | * you would need to be doing some serious CPU hotplugging | ||
700 | * to make this happen. If this becomes a problem, adding | ||
701 | * a synchronize_rcu() to the hotplug path would be a simple | ||
702 | * fix. | ||
703 | */ | ||
704 | |||
705 | rdp = RCU_DATA_ME(); | ||
706 | spin_lock_irqsave(&rdp->lock, flags); | ||
707 | *rdp->nexttail = list; | ||
708 | if (list) | ||
709 | rdp->nexttail = tail; | ||
710 | spin_unlock_irqrestore(&rdp->lock, flags); | ||
711 | } | ||
712 | |||
713 | void __devinit rcu_online_cpu(int cpu) | ||
714 | { | ||
715 | unsigned long flags; | ||
716 | |||
717 | spin_lock_irqsave(&rcu_ctrlblk.fliplock, flags); | ||
718 | cpu_set(cpu, rcu_cpu_online_map); | ||
719 | spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags); | ||
720 | } | ||
721 | |||
722 | #else /* #ifdef CONFIG_HOTPLUG_CPU */ | ||
723 | |||
724 | void rcu_offline_cpu(int cpu) | ||
725 | { | ||
726 | } | ||
727 | |||
728 | void __devinit rcu_online_cpu(int cpu) | ||
729 | { | ||
730 | } | ||
731 | |||
732 | #endif /* #else #ifdef CONFIG_HOTPLUG_CPU */ | ||
733 | |||
734 | static void rcu_process_callbacks(struct softirq_action *unused) | ||
735 | { | ||
736 | unsigned long flags; | ||
737 | struct rcu_head *next, *list; | ||
738 | struct rcu_data *rdp = RCU_DATA_ME(); | ||
739 | |||
740 | spin_lock_irqsave(&rdp->lock, flags); | ||
741 | list = rdp->donelist; | ||
742 | if (list == NULL) { | ||
743 | spin_unlock_irqrestore(&rdp->lock, flags); | ||
744 | return; | ||
745 | } | ||
746 | rdp->donelist = NULL; | ||
747 | rdp->donetail = &rdp->donelist; | ||
748 | RCU_TRACE_RDP(rcupreempt_trace_done_remove, rdp); | ||
749 | spin_unlock_irqrestore(&rdp->lock, flags); | ||
750 | while (list) { | ||
751 | next = list->next; | ||
752 | list->func(list); | ||
753 | list = next; | ||
754 | RCU_TRACE_ME(rcupreempt_trace_invoke); | ||
755 | } | ||
756 | } | ||
757 | |||
758 | void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) | ||
759 | { | ||
760 | unsigned long flags; | ||
761 | struct rcu_data *rdp; | ||
762 | |||
763 | head->func = func; | ||
764 | head->next = NULL; | ||
765 | local_irq_save(flags); | ||
766 | rdp = RCU_DATA_ME(); | ||
767 | spin_lock(&rdp->lock); | ||
768 | __rcu_advance_callbacks(rdp); | ||
769 | *rdp->nexttail = head; | ||
770 | rdp->nexttail = &head->next; | ||
771 | RCU_TRACE_RDP(rcupreempt_trace_next_add, rdp); | ||
772 | spin_unlock(&rdp->lock); | ||
773 | local_irq_restore(flags); | ||
774 | } | ||
775 | EXPORT_SYMBOL_GPL(call_rcu); | ||
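call_rcu() above simply appends the callback to the current CPU's nextlist under the per-CPU lock; the callback itself is invoked later from rcu_process_callbacks(). A typical callback recovers its enclosing object with container_of(), as in this sketch that again uses the hypothetical struct foo from the earlier examples:

	#include <linux/rcupdate.h>
	#include <linux/slab.h>

	static void foo_reclaim(struct rcu_head *head)
	{
		struct foo *p = container_of(head, struct foo, rcu);

		kfree(p);
	}

	/* Non-blocking variant of foo_update(): defer the free instead of
	 * waiting for the grace period in place. */
	static void foo_free_deferred(struct foo *old)
	{
		call_rcu(&old->rcu, foo_reclaim);
	}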
776 | |||
777 | /* | ||
778 | * Wait until all currently running preempt_disable() code segments | ||
779 | * (including hardware-irq-disable segments) complete. Note that | ||
780 | * in -rt this does -not- necessarily result in all currently executing | ||
781 | * interrupt -handlers- having completed. | ||
782 | */ | ||
783 | void __synchronize_sched(void) | ||
784 | { | ||
785 | cpumask_t oldmask; | ||
786 | int cpu; | ||
787 | |||
788 | if (sched_getaffinity(0, &oldmask) < 0) | ||
789 | oldmask = cpu_possible_map; | ||
790 | for_each_online_cpu(cpu) { | ||
791 | sched_setaffinity(0, cpumask_of_cpu(cpu)); | ||
792 | schedule(); | ||
793 | } | ||
794 | sched_setaffinity(0, oldmask); | ||
795 | } | ||
796 | EXPORT_SYMBOL_GPL(__synchronize_sched); | ||
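__synchronize_sched() above waits for preempt-disabled (and irq-disabled) regions rather than rcu_read_lock() sections, by briefly running on every online CPU in turn. A sketch of the kind of reader it pairs with; whether synchronize_sched() maps onto this helper depends on the rcupdate.h glue, so treat the pairing as illustrative, and global_foo is again the hypothetical pointer from the earlier sketches.

	#include <linux/preempt.h>

	/* Sched-style read side: preemption disabled instead of rcu_read_lock().
	 * The matching updater would wait with synchronize_sched() rather than
	 * synchronize_rcu() before freeing old data. */
	static int foo_peek_nopreempt(void)
	{
		int val = 0;

		preempt_disable();
		if (global_foo)
			val = global_foo->data;
		preempt_enable();
		return val;
	}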
797 | |||
798 | /* | ||
799 | * Check to see if any future RCU-related work will need to be done | ||
800 | * by the current CPU, even if none need be done immediately, returning | ||
801 | * 1 if so. Assumes that notifiers would take care of handling any | ||
802 | * outstanding requests from the RCU core. | ||
803 | * | ||
804 | * This function is part of the RCU implementation; it is -not- | ||
805 | * an exported member of the RCU API. | ||
806 | */ | ||
807 | int rcu_needs_cpu(int cpu) | ||
808 | { | ||
809 | struct rcu_data *rdp = RCU_DATA_CPU(cpu); | ||
810 | |||
811 | return (rdp->donelist != NULL || | ||
812 | !!rdp->waitlistcount || | ||
813 | rdp->nextlist != NULL); | ||
814 | } | ||
815 | |||
816 | int rcu_pending(int cpu) | ||
817 | { | ||
818 | struct rcu_data *rdp = RCU_DATA_CPU(cpu); | ||
819 | |||
820 | /* The CPU has at least one callback queued somewhere. */ | ||
821 | |||
822 | if (rdp->donelist != NULL || | ||
823 | !!rdp->waitlistcount || | ||
824 | rdp->nextlist != NULL) | ||
825 | return 1; | ||
826 | |||
827 | /* The RCU core needs an acknowledgement from this CPU. */ | ||
828 | |||
829 | if ((per_cpu(rcu_flip_flag, cpu) == rcu_flipped) || | ||
830 | (per_cpu(rcu_mb_flag, cpu) == rcu_mb_needed)) | ||
831 | return 1; | ||
832 | |||
833 | /* This CPU has fallen behind the global grace-period number. */ | ||
834 | |||
835 | if (rdp->completed != rcu_ctrlblk.completed) | ||
836 | return 1; | ||
837 | |||
838 | /* Nothing needed from this CPU. */ | ||
839 | |||
840 | return 0; | ||
841 | } | ||
842 | |||
843 | static int __cpuinit rcu_cpu_notify(struct notifier_block *self, | ||
844 | unsigned long action, void *hcpu) | ||
845 | { | ||
846 | long cpu = (long)hcpu; | ||
847 | |||
848 | switch (action) { | ||
849 | case CPU_UP_PREPARE: | ||
850 | case CPU_UP_PREPARE_FROZEN: | ||
851 | rcu_online_cpu(cpu); | ||
852 | break; | ||
853 | case CPU_UP_CANCELED: | ||
854 | case CPU_UP_CANCELED_FROZEN: | ||
855 | case CPU_DEAD: | ||
856 | case CPU_DEAD_FROZEN: | ||
857 | rcu_offline_cpu(cpu); | ||
858 | break; | ||
859 | default: | ||
860 | break; | ||
861 | } | ||
862 | return NOTIFY_OK; | ||
863 | } | ||
864 | |||
865 | static struct notifier_block __cpuinitdata rcu_nb = { | ||
866 | .notifier_call = rcu_cpu_notify, | ||
867 | }; | ||
868 | |||
869 | void __init __rcu_init(void) | ||
870 | { | ||
871 | int cpu; | ||
872 | int i; | ||
873 | struct rcu_data *rdp; | ||
874 | |||
875 | printk(KERN_NOTICE "Preemptible RCU implementation.\n"); | ||
876 | for_each_possible_cpu(cpu) { | ||
877 | rdp = RCU_DATA_CPU(cpu); | ||
878 | spin_lock_init(&rdp->lock); | ||
879 | rdp->completed = 0; | ||
880 | rdp->waitlistcount = 0; | ||
881 | rdp->nextlist = NULL; | ||
882 | rdp->nexttail = &rdp->nextlist; | ||
883 | for (i = 0; i < GP_STAGES; i++) { | ||
884 | rdp->waitlist[i] = NULL; | ||
885 | rdp->waittail[i] = &rdp->waitlist[i]; | ||
886 | } | ||
887 | rdp->donelist = NULL; | ||
888 | rdp->donetail = &rdp->donelist; | ||
889 | rdp->rcu_flipctr[0] = 0; | ||
890 | rdp->rcu_flipctr[1] = 0; | ||
891 | } | ||
892 | register_cpu_notifier(&rcu_nb); | ||
893 | |||
894 | /* | ||
895 | * We don't need protection against CPU-Hotplug here | ||
896 | * since | ||
897 | * a) If a CPU comes online while we are iterating over the | ||
898 | * cpu_online_map below, we would only end up making a | ||
899 | * duplicate call to rcu_online_cpu() which sets the corresponding | ||
900 | * CPU's mask in the rcu_cpu_online_map. | ||
901 | * | ||
902 | * b) A CPU cannot go offline at this point in time since the user | ||
903 | * does not have access to the sysfs interface, nor do we | ||
904 | * suspend the system. | ||
905 | */ | ||
906 | for_each_online_cpu(cpu) | ||
907 | rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, (void *)(long) cpu); | ||
908 | |||
909 | open_softirq(RCU_SOFTIRQ, rcu_process_callbacks, NULL); | ||
910 | } | ||
911 | |||
912 | /* | ||
913 | * Deprecated, use synchronize_rcu() or synchronize_sched() instead. | ||
914 | */ | ||
915 | void synchronize_kernel(void) | ||
916 | { | ||
917 | synchronize_rcu(); | ||
918 | } | ||
919 | |||
920 | #ifdef CONFIG_RCU_TRACE | ||
921 | long *rcupreempt_flipctr(int cpu) | ||
922 | { | ||
923 | return &RCU_DATA_CPU(cpu)->rcu_flipctr[0]; | ||
924 | } | ||
925 | EXPORT_SYMBOL_GPL(rcupreempt_flipctr); | ||
926 | |||
927 | int rcupreempt_flip_flag(int cpu) | ||
928 | { | ||
929 | return per_cpu(rcu_flip_flag, cpu); | ||
930 | } | ||
931 | EXPORT_SYMBOL_GPL(rcupreempt_flip_flag); | ||
932 | |||
933 | int rcupreempt_mb_flag(int cpu) | ||
934 | { | ||
935 | return per_cpu(rcu_mb_flag, cpu); | ||
936 | } | ||
937 | EXPORT_SYMBOL_GPL(rcupreempt_mb_flag); | ||
938 | |||
939 | char *rcupreempt_try_flip_state_name(void) | ||
940 | { | ||
941 | return rcu_try_flip_state_names[rcu_ctrlblk.rcu_try_flip_state]; | ||
942 | } | ||
943 | EXPORT_SYMBOL_GPL(rcupreempt_try_flip_state_name); | ||
944 | |||
945 | struct rcupreempt_trace *rcupreempt_trace_cpu(int cpu) | ||
946 | { | ||
947 | struct rcu_data *rdp = RCU_DATA_CPU(cpu); | ||
948 | |||
949 | return &rdp->trace; | ||
950 | } | ||
951 | EXPORT_SYMBOL_GPL(rcupreempt_trace_cpu); | ||
952 | |||
953 | #endif /* #ifdef CONFIG_RCU_TRACE */ | ||
diff --git a/kernel/rcupreempt_trace.c b/kernel/rcupreempt_trace.c new file mode 100644 index 000000000000..49ac4947af24 --- /dev/null +++ b/kernel/rcupreempt_trace.c | |||
@@ -0,0 +1,330 @@ | |||
1 | /* | ||
2 | * Read-Copy Update tracing for realtime implementation | ||
3 | * | ||
4 | * This program is free software; you can redistribute it and/or modify | ||
5 | * it under the terms of the GNU General Public License as published by | ||
6 | * the Free Software Foundation; either version 2 of the License, or | ||
7 | * (at your option) any later version. | ||
8 | * | ||
9 | * This program is distributed in the hope that it will be useful, | ||
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
12 | * GNU General Public License for more details. | ||
13 | * | ||
14 | * You should have received a copy of the GNU General Public License | ||
15 | * along with this program; if not, write to the Free Software | ||
16 | * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. | ||
17 | * | ||
18 | * Copyright IBM Corporation, 2006 | ||
19 | * | ||
20 | * Papers: http://www.rdrop.com/users/paulmck/RCU | ||
21 | * | ||
22 | * For detailed explanation of Read-Copy Update mechanism see - | ||
23 | * Documentation/RCU/ *.txt | ||
24 | * | ||
25 | */ | ||
26 | #include <linux/types.h> | ||
27 | #include <linux/kernel.h> | ||
28 | #include <linux/init.h> | ||
29 | #include <linux/spinlock.h> | ||
30 | #include <linux/smp.h> | ||
31 | #include <linux/rcupdate.h> | ||
32 | #include <linux/interrupt.h> | ||
33 | #include <linux/sched.h> | ||
34 | #include <asm/atomic.h> | ||
35 | #include <linux/bitops.h> | ||
36 | #include <linux/module.h> | ||
37 | #include <linux/completion.h> | ||
38 | #include <linux/moduleparam.h> | ||
39 | #include <linux/percpu.h> | ||
40 | #include <linux/notifier.h> | ||
41 | #include <linux/rcupdate.h> | ||
42 | #include <linux/cpu.h> | ||
43 | #include <linux/mutex.h> | ||
44 | #include <linux/rcupreempt_trace.h> | ||
45 | #include <linux/debugfs.h> | ||
46 | |||
47 | static struct mutex rcupreempt_trace_mutex; | ||
48 | static char *rcupreempt_trace_buf; | ||
49 | #define RCUPREEMPT_TRACE_BUF_SIZE 4096 | ||
50 | |||
51 | void rcupreempt_trace_move2done(struct rcupreempt_trace *trace) | ||
52 | { | ||
53 | trace->done_length += trace->wait_length; | ||
54 | trace->done_add += trace->wait_length; | ||
55 | trace->wait_length = 0; | ||
56 | } | ||
57 | void rcupreempt_trace_move2wait(struct rcupreempt_trace *trace) | ||
58 | { | ||
59 | trace->wait_length += trace->next_length; | ||
60 | trace->wait_add += trace->next_length; | ||
61 | trace->next_length = 0; | ||
62 | } | ||
63 | void rcupreempt_trace_try_flip_1(struct rcupreempt_trace *trace) | ||
64 | { | ||
65 | atomic_inc(&trace->rcu_try_flip_1); | ||
66 | } | ||
67 | void rcupreempt_trace_try_flip_e1(struct rcupreempt_trace *trace) | ||
68 | { | ||
69 | atomic_inc(&trace->rcu_try_flip_e1); | ||
70 | } | ||
71 | void rcupreempt_trace_try_flip_i1(struct rcupreempt_trace *trace) | ||
72 | { | ||
73 | trace->rcu_try_flip_i1++; | ||
74 | } | ||
75 | void rcupreempt_trace_try_flip_ie1(struct rcupreempt_trace *trace) | ||
76 | { | ||
77 | trace->rcu_try_flip_ie1++; | ||
78 | } | ||
79 | void rcupreempt_trace_try_flip_g1(struct rcupreempt_trace *trace) | ||
80 | { | ||
81 | trace->rcu_try_flip_g1++; | ||
82 | } | ||
83 | void rcupreempt_trace_try_flip_a1(struct rcupreempt_trace *trace) | ||
84 | { | ||
85 | trace->rcu_try_flip_a1++; | ||
86 | } | ||
87 | void rcupreempt_trace_try_flip_ae1(struct rcupreempt_trace *trace) | ||
88 | { | ||
89 | trace->rcu_try_flip_ae1++; | ||
90 | } | ||
91 | void rcupreempt_trace_try_flip_a2(struct rcupreempt_trace *trace) | ||
92 | { | ||
93 | trace->rcu_try_flip_a2++; | ||
94 | } | ||
95 | void rcupreempt_trace_try_flip_z1(struct rcupreempt_trace *trace) | ||
96 | { | ||
97 | trace->rcu_try_flip_z1++; | ||
98 | } | ||
99 | void rcupreempt_trace_try_flip_ze1(struct rcupreempt_trace *trace) | ||
100 | { | ||
101 | trace->rcu_try_flip_ze1++; | ||
102 | } | ||
103 | void rcupreempt_trace_try_flip_z2(struct rcupreempt_trace *trace) | ||
104 | { | ||
105 | trace->rcu_try_flip_z2++; | ||
106 | } | ||
107 | void rcupreempt_trace_try_flip_m1(struct rcupreempt_trace *trace) | ||
108 | { | ||
109 | trace->rcu_try_flip_m1++; | ||
110 | } | ||
111 | void rcupreempt_trace_try_flip_me1(struct rcupreempt_trace *trace) | ||
112 | { | ||
113 | trace->rcu_try_flip_me1++; | ||
114 | } | ||
115 | void rcupreempt_trace_try_flip_m2(struct rcupreempt_trace *trace) | ||
116 | { | ||
117 | trace->rcu_try_flip_m2++; | ||
118 | } | ||
119 | void rcupreempt_trace_check_callbacks(struct rcupreempt_trace *trace) | ||
120 | { | ||
121 | trace->rcu_check_callbacks++; | ||
122 | } | ||
123 | void rcupreempt_trace_done_remove(struct rcupreempt_trace *trace) | ||
124 | { | ||
125 | trace->done_remove += trace->done_length; | ||
126 | trace->done_length = 0; | ||
127 | } | ||
128 | void rcupreempt_trace_invoke(struct rcupreempt_trace *trace) | ||
129 | { | ||
130 | atomic_inc(&trace->done_invoked); | ||
131 | } | ||
132 | void rcupreempt_trace_next_add(struct rcupreempt_trace *trace) | ||
133 | { | ||
134 | trace->next_add++; | ||
135 | trace->next_length++; | ||
136 | } | ||
137 | |||
138 | static void rcupreempt_trace_sum(struct rcupreempt_trace *sp) | ||
139 | { | ||
140 | struct rcupreempt_trace *cp; | ||
141 | int cpu; | ||
142 | |||
143 | memset(sp, 0, sizeof(*sp)); | ||
144 | for_each_possible_cpu(cpu) { | ||
145 | cp = rcupreempt_trace_cpu(cpu); | ||
146 | sp->next_length += cp->next_length; | ||
147 | sp->next_add += cp->next_add; | ||
148 | sp->wait_length += cp->wait_length; | ||
149 | sp->wait_add += cp->wait_add; | ||
150 | sp->done_length += cp->done_length; | ||
151 | sp->done_add += cp->done_add; | ||
152 | sp->done_remove += cp->done_remove; | ||
153 | atomic_set(&sp->done_invoked, atomic_read(&cp->done_invoked)); | ||
154 | sp->rcu_check_callbacks += cp->rcu_check_callbacks; | ||
155 | atomic_set(&sp->rcu_try_flip_1, | ||
156 | atomic_read(&cp->rcu_try_flip_1)); | ||
157 | atomic_set(&sp->rcu_try_flip_e1, | ||
158 | atomic_read(&cp->rcu_try_flip_e1)); | ||
159 | sp->rcu_try_flip_i1 += cp->rcu_try_flip_i1; | ||
160 | sp->rcu_try_flip_ie1 += cp->rcu_try_flip_ie1; | ||
161 | sp->rcu_try_flip_g1 += cp->rcu_try_flip_g1; | ||
162 | sp->rcu_try_flip_a1 += cp->rcu_try_flip_a1; | ||
163 | sp->rcu_try_flip_ae1 += cp->rcu_try_flip_ae1; | ||
164 | sp->rcu_try_flip_a2 += cp->rcu_try_flip_a2; | ||
165 | sp->rcu_try_flip_z1 += cp->rcu_try_flip_z1; | ||
166 | sp->rcu_try_flip_ze1 += cp->rcu_try_flip_ze1; | ||
167 | sp->rcu_try_flip_z2 += cp->rcu_try_flip_z2; | ||
168 | sp->rcu_try_flip_m1 += cp->rcu_try_flip_m1; | ||
169 | sp->rcu_try_flip_me1 += cp->rcu_try_flip_me1; | ||
170 | sp->rcu_try_flip_m2 += cp->rcu_try_flip_m2; | ||
171 | } | ||
172 | } | ||
173 | |||
174 | static ssize_t rcustats_read(struct file *filp, char __user *buffer, | ||
175 | size_t count, loff_t *ppos) | ||
176 | { | ||
177 | struct rcupreempt_trace trace; | ||
178 | ssize_t bcount; | ||
179 | int cnt = 0; | ||
180 | |||
181 | rcupreempt_trace_sum(&trace); | ||
182 | mutex_lock(&rcupreempt_trace_mutex); | ||
183 | snprintf(&rcupreempt_trace_buf[cnt], RCUPREEMPT_TRACE_BUF_SIZE - cnt, | ||
184 | "ggp=%ld rcc=%ld\n", | ||
185 | rcu_batches_completed(), | ||
186 | trace.rcu_check_callbacks); | ||
187 | snprintf(&rcupreempt_trace_buf[cnt], RCUPREEMPT_TRACE_BUF_SIZE - cnt, | ||
188 | "na=%ld nl=%ld wa=%ld wl=%ld da=%ld dl=%ld dr=%ld di=%d\n" | ||
189 | "1=%d e1=%d i1=%ld ie1=%ld g1=%ld a1=%ld ae1=%ld a2=%ld\n" | ||
190 | "z1=%ld ze1=%ld z2=%ld m1=%ld me1=%ld m2=%ld\n", | ||
191 | |||
192 | trace.next_add, trace.next_length, | ||
193 | trace.wait_add, trace.wait_length, | ||
194 | trace.done_add, trace.done_length, | ||
195 | trace.done_remove, atomic_read(&trace.done_invoked), | ||
196 | atomic_read(&trace.rcu_try_flip_1), | ||
197 | atomic_read(&trace.rcu_try_flip_e1), | ||
198 | trace.rcu_try_flip_i1, trace.rcu_try_flip_ie1, | ||
199 | trace.rcu_try_flip_g1, | ||
200 | trace.rcu_try_flip_a1, trace.rcu_try_flip_ae1, | ||
201 | trace.rcu_try_flip_a2, | ||
202 | trace.rcu_try_flip_z1, trace.rcu_try_flip_ze1, | ||
203 | trace.rcu_try_flip_z2, | ||
204 | trace.rcu_try_flip_m1, trace.rcu_try_flip_me1, | ||
205 | trace.rcu_try_flip_m2); | ||
206 | bcount = simple_read_from_buffer(buffer, count, ppos, | ||
207 | rcupreempt_trace_buf, strlen(rcupreempt_trace_buf)); | ||
208 | mutex_unlock(&rcupreempt_trace_mutex); | ||
209 | return bcount; | ||
210 | } | ||
211 | |||
212 | static ssize_t rcugp_read(struct file *filp, char __user *buffer, | ||
213 | size_t count, loff_t *ppos) | ||
214 | { | ||
215 | long oldgp = rcu_batches_completed(); | ||
216 | ssize_t bcount; | ||
217 | |||
218 | mutex_lock(&rcupreempt_trace_mutex); | ||
219 | synchronize_rcu(); | ||
220 | snprintf(rcupreempt_trace_buf, RCUPREEMPT_TRACE_BUF_SIZE, | ||
221 | "oldggp=%ld newggp=%ld\n", oldgp, rcu_batches_completed()); | ||
222 | bcount = simple_read_from_buffer(buffer, count, ppos, | ||
223 | rcupreempt_trace_buf, strlen(rcupreempt_trace_buf)); | ||
224 | mutex_unlock(&rcupreempt_trace_mutex); | ||
225 | return bcount; | ||
226 | } | ||
227 | |||
228 | static ssize_t rcuctrs_read(struct file *filp, char __user *buffer, | ||
229 | size_t count, loff_t *ppos) | ||
230 | { | ||
231 | int cnt = 0; | ||
232 | int cpu; | ||
233 | int f = rcu_batches_completed() & 0x1; | ||
234 | ssize_t bcount; | ||
235 | |||
236 | mutex_lock(&rcupreempt_trace_mutex); | ||
237 | |||
238 | cnt += snprintf(&rcupreempt_trace_buf[cnt], RCUPREEMPT_TRACE_BUF_SIZE, | ||
239 | "CPU last cur F M\n"); | ||
240 | for_each_online_cpu(cpu) { | ||
241 | long *flipctr = rcupreempt_flipctr(cpu); | ||
242 | cnt += snprintf(&rcupreempt_trace_buf[cnt], | ||
243 | RCUPREEMPT_TRACE_BUF_SIZE - cnt, | ||
244 | "%3d %4ld %3ld %d %d\n", | ||
245 | cpu, | ||
246 | flipctr[!f], | ||
247 | flipctr[f], | ||
248 | rcupreempt_flip_flag(cpu), | ||
249 | rcupreempt_mb_flag(cpu)); | ||
250 | } | ||
251 | cnt += snprintf(&rcupreempt_trace_buf[cnt], | ||
252 | RCUPREEMPT_TRACE_BUF_SIZE - cnt, | ||
253 | "ggp = %ld, state = %s\n", | ||
254 | rcu_batches_completed(), | ||
255 | rcupreempt_try_flip_state_name()); | ||
256 | cnt += snprintf(&rcupreempt_trace_buf[cnt], | ||
257 | RCUPREEMPT_TRACE_BUF_SIZE - cnt, | ||
258 | "\n"); | ||
259 | bcount = simple_read_from_buffer(buffer, count, ppos, | ||
260 | rcupreempt_trace_buf, strlen(rcupreempt_trace_buf)); | ||
261 | mutex_unlock(&rcupreempt_trace_mutex); | ||
262 | return bcount; | ||
263 | } | ||
264 | |||
265 | static struct file_operations rcustats_fops = { | ||
266 | .owner = THIS_MODULE, | ||
267 | .read = rcustats_read, | ||
268 | }; | ||
269 | |||
270 | static struct file_operations rcugp_fops = { | ||
271 | .owner = THIS_MODULE, | ||
272 | .read = rcugp_read, | ||
273 | }; | ||
274 | |||
275 | static struct file_operations rcuctrs_fops = { | ||
276 | .owner = THIS_MODULE, | ||
277 | .read = rcuctrs_read, | ||
278 | }; | ||
279 | |||
280 | static struct dentry *rcudir, *statdir, *ctrsdir, *gpdir; | ||
281 | static int rcupreempt_debugfs_init(void) | ||
282 | { | ||
283 | rcudir = debugfs_create_dir("rcu", NULL); | ||
284 | if (!rcudir) | ||
285 | goto out; | ||
286 | statdir = debugfs_create_file("rcustats", 0444, rcudir, | ||
287 | NULL, &rcustats_fops); | ||
288 | if (!statdir) | ||
289 | goto free_out; | ||
290 | |||
291 | gpdir = debugfs_create_file("rcugp", 0444, rcudir, NULL, &rcugp_fops); | ||
292 | if (!gpdir) | ||
293 | goto free_out; | ||
294 | |||
295 | ctrsdir = debugfs_create_file("rcuctrs", 0444, rcudir, | ||
296 | NULL, &rcuctrs_fops); | ||
297 | if (!ctrsdir) | ||
298 | goto free_out; | ||
299 | return 0; | ||
300 | free_out: | ||
301 | if (statdir) | ||
302 | debugfs_remove(statdir); | ||
303 | if (gpdir) | ||
304 | debugfs_remove(gpdir); | ||
305 | debugfs_remove(rcudir); | ||
306 | out: | ||
307 | return 1; | ||
308 | } | ||
309 | |||
310 | static int __init rcupreempt_trace_init(void) | ||
311 | { | ||
312 | mutex_init(&rcupreempt_trace_mutex); | ||
313 | rcupreempt_trace_buf = kmalloc(RCUPREEMPT_TRACE_BUF_SIZE, GFP_KERNEL); | ||
314 | if (!rcupreempt_trace_buf) | ||
315 | return 1; | ||
316 | return rcupreempt_debugfs_init(); | ||
317 | } | ||
318 | |||
319 | static void __exit rcupreempt_trace_cleanup(void) | ||
320 | { | ||
321 | debugfs_remove(statdir); | ||
322 | debugfs_remove(gpdir); | ||
323 | debugfs_remove(ctrsdir); | ||
324 | debugfs_remove(rcudir); | ||
325 | kfree(rcupreempt_trace_buf); | ||
326 | } | ||
327 | |||
328 | |||
329 | module_init(rcupreempt_trace_init); | ||
330 | module_exit(rcupreempt_trace_cleanup); | ||
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index c3e165c2318f..fd599829e72a 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c | |||
@@ -726,11 +726,11 @@ static void rcu_torture_shuffle_tasks(void) | |||
726 | cpumask_t tmp_mask = CPU_MASK_ALL; | 726 | cpumask_t tmp_mask = CPU_MASK_ALL; |
727 | int i; | 727 | int i; |
728 | 728 | ||
729 | lock_cpu_hotplug(); | 729 | get_online_cpus(); |
730 | 730 | ||
731 | /* No point in shuffling if there is only one online CPU (ex: UP) */ | 731 | /* No point in shuffling if there is only one online CPU (ex: UP) */ |
732 | if (num_online_cpus() == 1) { | 732 | if (num_online_cpus() == 1) { |
733 | unlock_cpu_hotplug(); | 733 | put_online_cpus(); |
734 | return; | 734 | return; |
735 | } | 735 | } |
736 | 736 | ||
@@ -762,7 +762,7 @@ static void rcu_torture_shuffle_tasks(void) | |||
762 | else | 762 | else |
763 | rcu_idle_cpu--; | 763 | rcu_idle_cpu--; |
764 | 764 | ||
765 | unlock_cpu_hotplug(); | 765 | put_online_cpus(); |
766 | } | 766 | } |
767 | 767 | ||
768 | /* Shuffle tasks across CPUs, with the intent of allowing each CPU in the | 768 | /* Shuffle tasks across CPUs, with the intent of allowing each CPU in the |
diff --git a/kernel/resource.c b/kernel/resource.c index a358142ff48f..2eb553d9b517 100644 --- a/kernel/resource.c +++ b/kernel/resource.c | |||
@@ -277,7 +277,7 @@ walk_memory_resource(unsigned long start_pfn, unsigned long nr_pages, void *arg, | |||
277 | int ret = -1; | 277 | int ret = -1; |
278 | res.start = (u64) start_pfn << PAGE_SHIFT; | 278 | res.start = (u64) start_pfn << PAGE_SHIFT; |
279 | res.end = ((u64)(start_pfn + nr_pages) << PAGE_SHIFT) - 1; | 279 | res.end = ((u64)(start_pfn + nr_pages) << PAGE_SHIFT) - 1; |
280 | res.flags = IORESOURCE_MEM; | 280 | res.flags = IORESOURCE_MEM | IORESOURCE_BUSY; |
281 | orig_end = res.end; | 281 | orig_end = res.end; |
282 | while ((res.start < res.end) && (find_next_system_ram(&res) >= 0)) { | 282 | while ((res.start < res.end) && (find_next_system_ram(&res) >= 0)) { |
283 | pfn = (unsigned long)(res.start >> PAGE_SHIFT); | 283 | pfn = (unsigned long)(res.start >> PAGE_SHIFT); |
diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c index e3055ba69159..092e4c620af9 100644 --- a/kernel/rtmutex-tester.c +++ b/kernel/rtmutex-tester.c | |||
@@ -394,7 +394,7 @@ static SYSDEV_ATTR(status, 0600, sysfs_test_status, NULL); | |||
394 | static SYSDEV_ATTR(command, 0600, NULL, sysfs_test_command); | 394 | static SYSDEV_ATTR(command, 0600, NULL, sysfs_test_command); |
395 | 395 | ||
396 | static struct sysdev_class rttest_sysclass = { | 396 | static struct sysdev_class rttest_sysclass = { |
397 | set_kset_name("rttest"), | 397 | .name = "rttest", |
398 | }; | 398 | }; |
399 | 399 | ||
400 | static int init_test_thread(int id) | 400 | static int init_test_thread(int id) |
diff --git a/kernel/rwsem.c b/kernel/rwsem.c index 1ec620c03064..cae050b05f5e 100644 --- a/kernel/rwsem.c +++ b/kernel/rwsem.c | |||
@@ -6,6 +6,7 @@ | |||
6 | 6 | ||
7 | #include <linux/types.h> | 7 | #include <linux/types.h> |
8 | #include <linux/kernel.h> | 8 | #include <linux/kernel.h> |
9 | #include <linux/sched.h> | ||
9 | #include <linux/module.h> | 10 | #include <linux/module.h> |
10 | #include <linux/rwsem.h> | 11 | #include <linux/rwsem.h> |
11 | 12 | ||
@@ -15,7 +16,7 @@ | |||
15 | /* | 16 | /* |
16 | * lock for reading | 17 | * lock for reading |
17 | */ | 18 | */ |
18 | void down_read(struct rw_semaphore *sem) | 19 | void __sched down_read(struct rw_semaphore *sem) |
19 | { | 20 | { |
20 | might_sleep(); | 21 | might_sleep(); |
21 | rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_); | 22 | rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_); |
@@ -42,7 +43,7 @@ EXPORT_SYMBOL(down_read_trylock); | |||
42 | /* | 43 | /* |
43 | * lock for writing | 44 | * lock for writing |
44 | */ | 45 | */ |
45 | void down_write(struct rw_semaphore *sem) | 46 | void __sched down_write(struct rw_semaphore *sem) |
46 | { | 47 | { |
47 | might_sleep(); | 48 | might_sleep(); |
48 | rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_); | 49 | rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_); |
diff --git a/kernel/sched.c b/kernel/sched.c index b4fbbc440453..9474b23c28bf 100644 --- a/kernel/sched.c +++ b/kernel/sched.c | |||
@@ -22,6 +22,8 @@ | |||
22 | * by Peter Williams | 22 | * by Peter Williams |
23 | * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith | 23 | * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith |
24 | * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri | 24 | * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri |
25 | * 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins, | ||
26 | * Thomas Gleixner, Mike Kravetz | ||
25 | */ | 27 | */ |
26 | 28 | ||
27 | #include <linux/mm.h> | 29 | #include <linux/mm.h> |
@@ -52,7 +54,6 @@ | |||
52 | #include <linux/cpu.h> | 54 | #include <linux/cpu.h> |
53 | #include <linux/cpuset.h> | 55 | #include <linux/cpuset.h> |
54 | #include <linux/percpu.h> | 56 | #include <linux/percpu.h> |
55 | #include <linux/cpu_acct.h> | ||
56 | #include <linux/kthread.h> | 57 | #include <linux/kthread.h> |
57 | #include <linux/seq_file.h> | 58 | #include <linux/seq_file.h> |
58 | #include <linux/sysctl.h> | 59 | #include <linux/sysctl.h> |
@@ -64,6 +65,7 @@ | |||
64 | #include <linux/reciprocal_div.h> | 65 | #include <linux/reciprocal_div.h> |
65 | #include <linux/unistd.h> | 66 | #include <linux/unistd.h> |
66 | #include <linux/pagemap.h> | 67 | #include <linux/pagemap.h> |
68 | #include <linux/hrtimer.h> | ||
67 | 69 | ||
68 | #include <asm/tlb.h> | 70 | #include <asm/tlb.h> |
69 | #include <asm/irq_regs.h> | 71 | #include <asm/irq_regs.h> |
@@ -75,7 +77,7 @@ | |||
75 | */ | 77 | */ |
76 | unsigned long long __attribute__((weak)) sched_clock(void) | 78 | unsigned long long __attribute__((weak)) sched_clock(void) |
77 | { | 79 | { |
78 | return (unsigned long long)jiffies * (1000000000 / HZ); | 80 | return (unsigned long long)jiffies * (NSEC_PER_SEC / HZ); |
79 | } | 81 | } |
80 | 82 | ||
81 | /* | 83 | /* |
@@ -97,10 +99,9 @@ unsigned long long __attribute__((weak)) sched_clock(void) | |||
97 | #define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) | 99 | #define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) |
98 | 100 | ||
99 | /* | 101 | /* |
100 | * Some helpers for converting nanosecond timing to jiffy resolution | 102 | * Helpers for converting nanosecond timing to jiffy resolution |
101 | */ | 103 | */ |
102 | #define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (1000000000 / HZ)) | 104 | #define NS_TO_JIFFIES(TIME) ((unsigned long)(TIME) / (NSEC_PER_SEC / HZ)) |
103 | #define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ)) | ||
104 | 105 | ||
105 | #define NICE_0_LOAD SCHED_LOAD_SCALE | 106 | #define NICE_0_LOAD SCHED_LOAD_SCALE |
106 | #define NICE_0_SHIFT SCHED_LOAD_SHIFT | 107 | #define NICE_0_SHIFT SCHED_LOAD_SHIFT |
@@ -160,6 +161,8 @@ struct rt_prio_array { | |||
160 | 161 | ||
161 | struct cfs_rq; | 162 | struct cfs_rq; |
162 | 163 | ||
164 | static LIST_HEAD(task_groups); | ||
165 | |||
163 | /* task group related information */ | 166 | /* task group related information */ |
164 | struct task_group { | 167 | struct task_group { |
165 | #ifdef CONFIG_FAIR_CGROUP_SCHED | 168 | #ifdef CONFIG_FAIR_CGROUP_SCHED |
@@ -169,9 +172,50 @@ struct task_group { | |||
169 | struct sched_entity **se; | 172 | struct sched_entity **se; |
170 | /* runqueue "owned" by this group on each cpu */ | 173 | /* runqueue "owned" by this group on each cpu */ |
171 | struct cfs_rq **cfs_rq; | 174 | struct cfs_rq **cfs_rq; |
175 | |||
176 | struct sched_rt_entity **rt_se; | ||
177 | struct rt_rq **rt_rq; | ||
178 | |||
179 | unsigned int rt_ratio; | ||
180 | |||
181 | /* | ||
182 | * The shares assigned to a task group govern how much of the cpu | ||
183 | * bandwidth is allocated to the group. The more shares a group has, | ||
184 | * the more cpu bandwidth is allocated to it. | ||
185 | * | ||
186 | * For example, let's say that there are three task groups, A, B and C, | ||
187 | * which have been assigned shares of 1000, 2000 and 3000 respectively. Then, | ||
188 | * cpu bandwidth allocated by the scheduler to task groups A, B and C | ||
189 | * should be: | ||
190 | * | ||
191 | * Bw(A) = 1000/(1000+2000+3000) * 100 = 16.66% | ||
192 | * Bw(B) = 2000/(1000+2000+3000) * 100 = 33.33% | ||
193 | * Bw(C) = 3000/(1000+2000+3000) * 100 = 50% | ||
194 | * | ||
195 | * The weight assigned to a task group's schedulable entities on every | ||
196 | * cpu (task_group.se[a_cpu]->load.weight) is derived from the task | ||
197 | * group's shares. For example, let's say that task group A has been | ||
198 | * assigned shares of 1000 and there are two CPUs in the system. Then, | ||
199 | * | ||
200 | * tg_A->se[0]->load.weight = tg_A->se[1]->load.weight = 1000; | ||
201 | * | ||
202 | * Note: It's not necessary that each of a task group's schedulable | ||
203 | * entities have the same weight on all CPUs. If the group | ||
204 | * has 2 of its tasks on CPU0 and 1 task on CPU1, then a | ||
205 | * better distribution of weight could be: | ||
206 | * | ||
207 | * tg_A->se[0]->load.weight = 2/3 * 2000 = 1333 | ||
208 | * tg_A->se[1]->load.weight = 1/2 * 2000 = 667 | ||
209 | * | ||
210 | * rebalance_shares() is responsible for distributing the shares of a | ||
211 | * task group like this among the group's schedulable entities across | ||
212 | * cpus. | ||
213 | * | ||
214 | */ | ||
172 | unsigned long shares; | 215 | unsigned long shares; |
173 | /* spinlock to serialize modification to shares */ | 216 | |
174 | spinlock_t lock; | 217 | struct rcu_head rcu; |
218 | struct list_head list; | ||
175 | }; | 219 | }; |
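In the tg_A example from the comment above, the group's total weight across the two CPUs stays at 2 * 1000, and rebalance_shares() is described as splitting that total in proportion to where the group's tasks actually run. A small stand-alone sketch of just that arithmetic (integer division makes the second value print as 666 rather than the 667 quoted above); the real logic lives in rebalance_shares() and is not reproduced here.

	#include <stdio.h>

	int main(void)
	{
		const unsigned long shares = 1000;	/* tg_A's shares */
		const unsigned long ncpus = 2;
		const unsigned long tasks[] = { 2, 1 };	/* runnable tasks per cpu */
		unsigned long total_tasks = 3, total_weight = shares * ncpus, cpu;

		for (cpu = 0; cpu < ncpus; cpu++)
			printf("tg_A->se[%lu]->load.weight = %lu\n",
			       cpu, total_weight * tasks[cpu] / total_tasks);
		return 0;
	}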
176 | 220 | ||
177 | /* Default task group's sched entity on each cpu */ | 221 | /* Default task group's sched entity on each cpu */ |
@@ -179,24 +223,51 @@ static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); | |||
179 | /* Default task group's cfs_rq on each cpu */ | 223 | /* Default task group's cfs_rq on each cpu */ |
180 | static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; | 224 | static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; |
181 | 225 | ||
226 | static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); | ||
227 | static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; | ||
228 | |||
182 | static struct sched_entity *init_sched_entity_p[NR_CPUS]; | 229 | static struct sched_entity *init_sched_entity_p[NR_CPUS]; |
183 | static struct cfs_rq *init_cfs_rq_p[NR_CPUS]; | 230 | static struct cfs_rq *init_cfs_rq_p[NR_CPUS]; |
184 | 231 | ||
232 | static struct sched_rt_entity *init_sched_rt_entity_p[NR_CPUS]; | ||
233 | static struct rt_rq *init_rt_rq_p[NR_CPUS]; | ||
234 | |||
235 | /* task_group_mutex serializes add/remove of task groups and also changes to | ||
236 | * a task group's cpu shares. | ||
237 | */ | ||
238 | static DEFINE_MUTEX(task_group_mutex); | ||
239 | |||
240 | /* doms_cur_mutex serializes access to doms_cur[] array */ | ||
241 | static DEFINE_MUTEX(doms_cur_mutex); | ||
242 | |||
243 | #ifdef CONFIG_SMP | ||
244 | /* kernel thread that runs rebalance_shares() periodically */ | ||
245 | static struct task_struct *lb_monitor_task; | ||
246 | static int load_balance_monitor(void *unused); | ||
247 | #endif | ||
248 | |||
249 | static void set_se_shares(struct sched_entity *se, unsigned long shares); | ||
250 | |||
185 | /* Default task group. | 251 | /* Default task group. |
186 | * Every task in the system belongs to this group at bootup. | 252 | * Every task in the system belongs to this group at bootup. |
187 | */ | 253 | */ |
188 | struct task_group init_task_group = { | 254 | struct task_group init_task_group = { |
189 | .se = init_sched_entity_p, | 255 | .se = init_sched_entity_p, |
190 | .cfs_rq = init_cfs_rq_p, | 256 | .cfs_rq = init_cfs_rq_p, |
257 | |||
258 | .rt_se = init_sched_rt_entity_p, | ||
259 | .rt_rq = init_rt_rq_p, | ||
191 | }; | 260 | }; |
192 | 261 | ||
193 | #ifdef CONFIG_FAIR_USER_SCHED | 262 | #ifdef CONFIG_FAIR_USER_SCHED |
194 | # define INIT_TASK_GRP_LOAD 2*NICE_0_LOAD | 263 | # define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD) |
195 | #else | 264 | #else |
196 | # define INIT_TASK_GRP_LOAD NICE_0_LOAD | 265 | # define INIT_TASK_GROUP_LOAD NICE_0_LOAD |
197 | #endif | 266 | #endif |
198 | 267 | ||
199 | static int init_task_group_load = INIT_TASK_GRP_LOAD; | 268 | #define MIN_GROUP_SHARES 2 |
269 | |||
270 | static int init_task_group_load = INIT_TASK_GROUP_LOAD; | ||
200 | 271 | ||
201 | /* return group to which a task belongs */ | 272 | /* return group to which a task belongs */ |
202 | static inline struct task_group *task_group(struct task_struct *p) | 273 | static inline struct task_group *task_group(struct task_struct *p) |
@@ -209,22 +280,48 @@ static inline struct task_group *task_group(struct task_struct *p) | |||
209 | tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id), | 280 | tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id), |
210 | struct task_group, css); | 281 | struct task_group, css); |
211 | #else | 282 | #else |
212 | tg = &init_task_group; | 283 | tg = &init_task_group; |
213 | #endif | 284 | #endif |
214 | |||
215 | return tg; | 285 | return tg; |
216 | } | 286 | } |
217 | 287 | ||
218 | /* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ | 288 | /* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ |
219 | static inline void set_task_cfs_rq(struct task_struct *p) | 289 | static inline void set_task_rq(struct task_struct *p, unsigned int cpu) |
290 | { | ||
291 | p->se.cfs_rq = task_group(p)->cfs_rq[cpu]; | ||
292 | p->se.parent = task_group(p)->se[cpu]; | ||
293 | |||
294 | p->rt.rt_rq = task_group(p)->rt_rq[cpu]; | ||
295 | p->rt.parent = task_group(p)->rt_se[cpu]; | ||
296 | } | ||
297 | |||
298 | static inline void lock_task_group_list(void) | ||
220 | { | 299 | { |
221 | p->se.cfs_rq = task_group(p)->cfs_rq[task_cpu(p)]; | 300 | mutex_lock(&task_group_mutex); |
222 | p->se.parent = task_group(p)->se[task_cpu(p)]; | 301 | } |
302 | |||
303 | static inline void unlock_task_group_list(void) | ||
304 | { | ||
305 | mutex_unlock(&task_group_mutex); | ||
306 | } | ||
307 | |||
308 | static inline void lock_doms_cur(void) | ||
309 | { | ||
310 | mutex_lock(&doms_cur_mutex); | ||
311 | } | ||
312 | |||
313 | static inline void unlock_doms_cur(void) | ||
314 | { | ||
315 | mutex_unlock(&doms_cur_mutex); | ||
223 | } | 316 | } |
224 | 317 | ||
225 | #else | 318 | #else |
226 | 319 | ||
227 | static inline void set_task_cfs_rq(struct task_struct *p) { } | 320 | static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } |
321 | static inline void lock_task_group_list(void) { } | ||
322 | static inline void unlock_task_group_list(void) { } | ||
323 | static inline void lock_doms_cur(void) { } | ||
324 | static inline void unlock_doms_cur(void) { } | ||
228 | 325 | ||
229 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 326 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
230 | 327 | ||
@@ -249,26 +346,72 @@ struct cfs_rq { | |||
249 | #ifdef CONFIG_FAIR_GROUP_SCHED | 346 | #ifdef CONFIG_FAIR_GROUP_SCHED |
250 | struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ | 347 | struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ |
251 | 348 | ||
252 | /* leaf cfs_rqs are those that hold tasks (lowest schedulable entity in | 349 | /* |
350 | * leaf cfs_rqs are those that hold tasks (lowest schedulable entity in | ||
253 | * a hierarchy). Non-leaf lrqs hold other higher schedulable entities | 351 | * a hierarchy). Non-leaf lrqs hold other higher schedulable entities |
254 | * (like users, containers etc.) | 352 | * (like users, containers etc.) |
255 | * | 353 | * |
256 | * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This | 354 | * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This |
257 | * list is used during load balance. | 355 | * list is used during load balance. |
258 | */ | 356 | */ |
259 | struct list_head leaf_cfs_rq_list; /* Better name : task_cfs_rq_list? */ | 357 | struct list_head leaf_cfs_rq_list; |
260 | struct task_group *tg; /* group that "owns" this runqueue */ | 358 | struct task_group *tg; /* group that "owns" this runqueue */ |
261 | struct rcu_head rcu; | ||
262 | #endif | 359 | #endif |
263 | }; | 360 | }; |
264 | 361 | ||
265 | /* Real-Time classes' related field in a runqueue: */ | 362 | /* Real-Time classes' related field in a runqueue: */ |
266 | struct rt_rq { | 363 | struct rt_rq { |
267 | struct rt_prio_array active; | 364 | struct rt_prio_array active; |
268 | int rt_load_balance_idx; | 365 | unsigned long rt_nr_running; |
269 | struct list_head *rt_load_balance_head, *rt_load_balance_curr; | 366 | #if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED |
367 | int highest_prio; /* highest queued rt task prio */ | ||
368 | #endif | ||
369 | #ifdef CONFIG_SMP | ||
370 | unsigned long rt_nr_migratory; | ||
371 | int overloaded; | ||
372 | #endif | ||
373 | int rt_throttled; | ||
374 | u64 rt_time; | ||
375 | |||
376 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
377 | struct rq *rq; | ||
378 | struct list_head leaf_rt_rq_list; | ||
379 | struct task_group *tg; | ||
380 | struct sched_rt_entity *rt_se; | ||
381 | #endif | ||
270 | }; | 382 | }; |
271 | 383 | ||
384 | #ifdef CONFIG_SMP | ||
385 | |||
386 | /* | ||
387 | * We add the notion of a root-domain which will be used to define per-domain | ||
388 | * variables. Each exclusive cpuset essentially defines an island domain by | ||
389 | * fully partitioning the member cpus from any other cpuset. Whenever a new | ||
390 | * exclusive cpuset is created, we also create and attach a new root-domain | ||
391 | * object. | ||
392 | * | ||
393 | */ | ||
394 | struct root_domain { | ||
395 | atomic_t refcount; | ||
396 | cpumask_t span; | ||
397 | cpumask_t online; | ||
398 | |||
399 | /* | ||
400 | * The "RT overload" flag: it gets set if a CPU has more than | ||
401 | * one runnable RT task. | ||
402 | */ | ||
403 | cpumask_t rto_mask; | ||
404 | atomic_t rto_count; | ||
405 | }; | ||
406 | |||
407 | /* | ||
408 | * By default the system creates a single root-domain with all cpus as | ||
409 | * members (mimicking the global state we have today). | ||
410 | */ | ||
411 | static struct root_domain def_root_domain; | ||
412 | |||
413 | #endif | ||
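As a rough illustration of how an "RT overload" mask such as rto_mask can be consumed, the sketch below is a userspace model only: a plain 64-bit mask and an int stand in for cpumask_t and the atomic rto_count, and the scan loop is a simplified stand-in for the kernel's push/pull logic, not its actual implementation.

#include <stdio.h>
#include <stdint.h>

#define NR_CPUS 8

/* simplified stand-ins for root_domain.rto_mask / rto_count */
static uint64_t rto_mask;
static int rto_count;

/* a CPU gained a second runnable RT task: mark it overloaded */
static void rt_set_overload(int cpu)
{
        if (!(rto_mask & (1ULL << cpu))) {
                rto_mask |= 1ULL << cpu;
                rto_count++;
        }
}

/* a puller scans the mask for CPUs it could pull RT tasks from */
static void scan_overloaded(int this_cpu)
{
        int cpu;

        if (!rto_count)         /* cheap check before touching the mask */
                return;

        for (cpu = 0; cpu < NR_CPUS; cpu++) {
                if (cpu == this_cpu || !(rto_mask & (1ULL << cpu)))
                        continue;
                printf("cpu%d: candidate source of RT tasks\n", cpu);
        }
}

int main(void)
{
        rt_set_overload(2);
        rt_set_overload(5);
        scan_overloaded(0);
        return 0;
}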
414 | |||
272 | /* | 415 | /* |
273 | * This is the main, per-CPU runqueue data structure. | 416 | * This is the main, per-CPU runqueue data structure. |
274 | * | 417 | * |
@@ -297,11 +440,15 @@ struct rq { | |||
297 | u64 nr_switches; | 440 | u64 nr_switches; |
298 | 441 | ||
299 | struct cfs_rq cfs; | 442 | struct cfs_rq cfs; |
443 | struct rt_rq rt; | ||
444 | u64 rt_period_expire; | ||
445 | int rt_throttled; | ||
446 | |||
300 | #ifdef CONFIG_FAIR_GROUP_SCHED | 447 | #ifdef CONFIG_FAIR_GROUP_SCHED |
301 | /* list of leaf cfs_rq on this cpu: */ | 448 | /* list of leaf cfs_rq on this cpu: */ |
302 | struct list_head leaf_cfs_rq_list; | 449 | struct list_head leaf_cfs_rq_list; |
450 | struct list_head leaf_rt_rq_list; | ||
303 | #endif | 451 | #endif |
304 | struct rt_rq rt; | ||
305 | 452 | ||
306 | /* | 453 | /* |
307 | * This is part of a global counter where only the total sum | 454 | * This is part of a global counter where only the total sum |
@@ -318,7 +465,7 @@ struct rq { | |||
318 | u64 clock, prev_clock_raw; | 465 | u64 clock, prev_clock_raw; |
319 | s64 clock_max_delta; | 466 | s64 clock_max_delta; |
320 | 467 | ||
321 | unsigned int clock_warps, clock_overflows; | 468 | unsigned int clock_warps, clock_overflows, clock_underflows; |
322 | u64 idle_clock; | 469 | u64 idle_clock; |
323 | unsigned int clock_deep_idle_events; | 470 | unsigned int clock_deep_idle_events; |
324 | u64 tick_timestamp; | 471 | u64 tick_timestamp; |
@@ -326,6 +473,7 @@ struct rq { | |||
326 | atomic_t nr_iowait; | 473 | atomic_t nr_iowait; |
327 | 474 | ||
328 | #ifdef CONFIG_SMP | 475 | #ifdef CONFIG_SMP |
476 | struct root_domain *rd; | ||
329 | struct sched_domain *sd; | 477 | struct sched_domain *sd; |
330 | 478 | ||
331 | /* For active balancing */ | 479 | /* For active balancing */ |
@@ -338,6 +486,12 @@ struct rq { | |||
338 | struct list_head migration_queue; | 486 | struct list_head migration_queue; |
339 | #endif | 487 | #endif |
340 | 488 | ||
489 | #ifdef CONFIG_SCHED_HRTICK | ||
490 | unsigned long hrtick_flags; | ||
491 | ktime_t hrtick_expire; | ||
492 | struct hrtimer hrtick_timer; | ||
493 | #endif | ||
494 | |||
341 | #ifdef CONFIG_SCHEDSTATS | 495 | #ifdef CONFIG_SCHEDSTATS |
342 | /* latency stats */ | 496 | /* latency stats */ |
343 | struct sched_info rq_sched_info; | 497 | struct sched_info rq_sched_info; |
@@ -364,7 +518,6 @@ struct rq { | |||
364 | }; | 518 | }; |
365 | 519 | ||
366 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); | 520 | static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); |
367 | static DEFINE_MUTEX(sched_hotcpu_mutex); | ||
368 | 521 | ||
369 | static inline void check_preempt_curr(struct rq *rq, struct task_struct *p) | 522 | static inline void check_preempt_curr(struct rq *rq, struct task_struct *p) |
370 | { | 523 | { |
@@ -442,6 +595,23 @@ static void update_rq_clock(struct rq *rq) | |||
442 | #define task_rq(p) cpu_rq(task_cpu(p)) | 595 | #define task_rq(p) cpu_rq(task_cpu(p)) |
443 | #define cpu_curr(cpu) (cpu_rq(cpu)->curr) | 596 | #define cpu_curr(cpu) (cpu_rq(cpu)->curr) |
444 | 597 | ||
598 | unsigned long rt_needs_cpu(int cpu) | ||
599 | { | ||
600 | struct rq *rq = cpu_rq(cpu); | ||
601 | u64 delta; | ||
602 | |||
603 | if (!rq->rt_throttled) | ||
604 | return 0; | ||
605 | |||
606 | if (rq->clock > rq->rt_period_expire) | ||
607 | return 1; | ||
608 | |||
609 | delta = rq->rt_period_expire - rq->clock; | ||
610 | do_div(delta, NSEC_PER_SEC / HZ); | ||
611 | |||
612 | return (unsigned long)delta; | ||
613 | } | ||
614 | |||
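A small worked example of the division step in rt_needs_cpu() above, as a standalone C snippet (HZ and the clock values here are made up for illustration): with HZ=1000 there are 1,000,000 ns per jiffy, so a 5 ms gap until rt_period_expire comes back as 5 jiffies.

#include <stdio.h>
#include <stdint.h>

#define HZ            1000              /* illustrative */
#define NSEC_PER_SEC  1000000000ULL

int main(void)
{
        uint64_t clock = 100000000ULL;                  /* made-up rq->clock */
        uint64_t rt_period_expire = clock + 5000000ULL; /* expires in 5 ms */
        uint64_t delta = rt_period_expire - clock;

        delta /= (NSEC_PER_SEC / HZ);   /* same division do_div() performs */
        printf("%llu jiffies\n", (unsigned long long)delta); /* prints 5 */
        return 0;
}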
445 | /* | 615 | /* |
446 | * Tunables that become constants when CONFIG_SCHED_DEBUG is off: | 616 | * Tunables that become constants when CONFIG_SCHED_DEBUG is off: |
447 | */ | 617 | */ |
@@ -456,24 +626,47 @@ static void update_rq_clock(struct rq *rq) | |||
456 | */ | 626 | */ |
457 | enum { | 627 | enum { |
458 | SCHED_FEAT_NEW_FAIR_SLEEPERS = 1, | 628 | SCHED_FEAT_NEW_FAIR_SLEEPERS = 1, |
459 | SCHED_FEAT_START_DEBIT = 2, | 629 | SCHED_FEAT_WAKEUP_PREEMPT = 2, |
460 | SCHED_FEAT_TREE_AVG = 4, | 630 | SCHED_FEAT_START_DEBIT = 4, |
461 | SCHED_FEAT_APPROX_AVG = 8, | 631 | SCHED_FEAT_TREE_AVG = 8, |
462 | SCHED_FEAT_WAKEUP_PREEMPT = 16, | 632 | SCHED_FEAT_APPROX_AVG = 16, |
463 | SCHED_FEAT_PREEMPT_RESTRICT = 32, | 633 | SCHED_FEAT_HRTICK = 32, |
634 | SCHED_FEAT_DOUBLE_TICK = 64, | ||
464 | }; | 635 | }; |
465 | 636 | ||
466 | const_debug unsigned int sysctl_sched_features = | 637 | const_debug unsigned int sysctl_sched_features = |
467 | SCHED_FEAT_NEW_FAIR_SLEEPERS * 1 | | 638 | SCHED_FEAT_NEW_FAIR_SLEEPERS * 1 | |
639 | SCHED_FEAT_WAKEUP_PREEMPT * 1 | | ||
468 | SCHED_FEAT_START_DEBIT * 1 | | 640 | SCHED_FEAT_START_DEBIT * 1 | |
469 | SCHED_FEAT_TREE_AVG * 0 | | 641 | SCHED_FEAT_TREE_AVG * 0 | |
470 | SCHED_FEAT_APPROX_AVG * 0 | | 642 | SCHED_FEAT_APPROX_AVG * 0 | |
471 | SCHED_FEAT_WAKEUP_PREEMPT * 1 | | 643 | SCHED_FEAT_HRTICK * 1 | |
472 | SCHED_FEAT_PREEMPT_RESTRICT * 1; | 644 | SCHED_FEAT_DOUBLE_TICK * 0; |
473 | 645 | ||
474 | #define sched_feat(x) (sysctl_sched_features & SCHED_FEAT_##x) | 646 | #define sched_feat(x) (sysctl_sched_features & SCHED_FEAT_##x) |
475 | 647 | ||
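For reference, the default feature mask built above works out to 1 + 2 + 4 + 32 = 39. A standalone C sketch reproducing the same bit arithmetic (enum values copied from the hunk above):

#include <stdio.h>

enum {
        SCHED_FEAT_NEW_FAIR_SLEEPERS    = 1,
        SCHED_FEAT_WAKEUP_PREEMPT       = 2,
        SCHED_FEAT_START_DEBIT          = 4,
        SCHED_FEAT_TREE_AVG             = 8,
        SCHED_FEAT_APPROX_AVG           = 16,
        SCHED_FEAT_HRTICK               = 32,
        SCHED_FEAT_DOUBLE_TICK          = 64,
};

int main(void)
{
        unsigned int features =
                SCHED_FEAT_NEW_FAIR_SLEEPERS * 1 |
                SCHED_FEAT_WAKEUP_PREEMPT * 1 |
                SCHED_FEAT_START_DEBIT * 1 |
                SCHED_FEAT_TREE_AVG * 0 |
                SCHED_FEAT_APPROX_AVG * 0 |
                SCHED_FEAT_HRTICK * 1 |
                SCHED_FEAT_DOUBLE_TICK * 0;

        /* sleepers, wakeup-preempt, start-debit and hrtick enabled: 39 */
        printf("sysctl_sched_features default = %u\n", features);
        return 0;
}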
476 | /* | 648 | /* |
649 | * Number of tasks to iterate in a single balance run. | ||
650 | * Limited because this is done with IRQs disabled. | ||
651 | */ | ||
652 | const_debug unsigned int sysctl_sched_nr_migrate = 32; | ||
653 | |||
654 | /* | ||
655 | * period over which we measure -rt task cpu usage in ms. | ||
656 | * default: 1s | ||
657 | */ | ||
658 | const_debug unsigned int sysctl_sched_rt_period = 1000; | ||
659 | |||
660 | #define SCHED_RT_FRAC_SHIFT 16 | ||
661 | #define SCHED_RT_FRAC (1UL << SCHED_RT_FRAC_SHIFT) | ||
662 | |||
663 | /* | ||
664 | * ratio of time -rt tasks may consume. | ||
665 | * default: 95% | ||
666 | */ | ||
667 | const_debug unsigned int sysctl_sched_rt_ratio = 62259; | ||
668 | |||
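The 62259 default above is a 16-bit fixed-point fraction: with SCHED_RT_FRAC = 1 << 16 = 65536, the ratio is 62259/65536 ≈ 0.95, i.e. the advertised 95%. A quick standalone check:

#include <stdio.h>

#define SCHED_RT_FRAC_SHIFT     16
#define SCHED_RT_FRAC           (1UL << SCHED_RT_FRAC_SHIFT)

int main(void)
{
        unsigned int rt_ratio = 62259;  /* default from the hunk above */

        printf("%.2f%% of each period may go to -rt tasks\n",
               100.0 * rt_ratio / SCHED_RT_FRAC);       /* prints 95.00% */
        return 0;
}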
669 | /* | ||
477 | * For kernel-internal use: high-speed (but slightly incorrect) per-cpu | 670 | * For kernel-internal use: high-speed (but slightly incorrect) per-cpu |
478 | * clock constructed from sched_clock(): | 671 | * clock constructed from sched_clock(): |
479 | */ | 672 | */ |
@@ -485,7 +678,12 @@ unsigned long long cpu_clock(int cpu) | |||
485 | 678 | ||
486 | local_irq_save(flags); | 679 | local_irq_save(flags); |
487 | rq = cpu_rq(cpu); | 680 | rq = cpu_rq(cpu); |
488 | update_rq_clock(rq); | 681 | /* |
682 | * Only call sched_clock() if the scheduler has already been | ||
683 | * initialized (some code might call cpu_clock() very early): | ||
684 | */ | ||
685 | if (rq->idle) | ||
686 | update_rq_clock(rq); | ||
489 | now = rq->clock; | 687 | now = rq->clock; |
490 | local_irq_restore(flags); | 688 | local_irq_restore(flags); |
491 | 689 | ||
@@ -500,10 +698,15 @@ EXPORT_SYMBOL_GPL(cpu_clock); | |||
500 | # define finish_arch_switch(prev) do { } while (0) | 698 | # define finish_arch_switch(prev) do { } while (0) |
501 | #endif | 699 | #endif |
502 | 700 | ||
701 | static inline int task_current(struct rq *rq, struct task_struct *p) | ||
702 | { | ||
703 | return rq->curr == p; | ||
704 | } | ||
705 | |||
503 | #ifndef __ARCH_WANT_UNLOCKED_CTXSW | 706 | #ifndef __ARCH_WANT_UNLOCKED_CTXSW |
504 | static inline int task_running(struct rq *rq, struct task_struct *p) | 707 | static inline int task_running(struct rq *rq, struct task_struct *p) |
505 | { | 708 | { |
506 | return rq->curr == p; | 709 | return task_current(rq, p); |
507 | } | 710 | } |
508 | 711 | ||
509 | static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) | 712 | static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next) |
@@ -532,7 +735,7 @@ static inline int task_running(struct rq *rq, struct task_struct *p) | |||
532 | #ifdef CONFIG_SMP | 735 | #ifdef CONFIG_SMP |
533 | return p->oncpu; | 736 | return p->oncpu; |
534 | #else | 737 | #else |
535 | return rq->curr == p; | 738 | return task_current(rq, p); |
536 | #endif | 739 | #endif |
537 | } | 740 | } |
538 | 741 | ||
@@ -588,7 +791,7 @@ static inline struct rq *__task_rq_lock(struct task_struct *p) | |||
588 | 791 | ||
589 | /* | 792 | /* |
590 | * task_rq_lock - lock the runqueue a given task resides on and disable | 793 | * task_rq_lock - lock the runqueue a given task resides on and disable |
591 | * interrupts. Note the ordering: we can safely lookup the task_rq without | 794 | * interrupts. Note the ordering: we can safely lookup the task_rq without |
592 | * explicitly disabling preemption. | 795 | * explicitly disabling preemption. |
593 | */ | 796 | */ |
594 | static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) | 797 | static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) |
@@ -666,9 +869,177 @@ void sched_clock_idle_wakeup_event(u64 delta_ns) | |||
666 | rq->prev_clock_raw = now; | 869 | rq->prev_clock_raw = now; |
667 | rq->clock += delta_ns; | 870 | rq->clock += delta_ns; |
668 | spin_unlock(&rq->lock); | 871 | spin_unlock(&rq->lock); |
872 | touch_softlockup_watchdog(); | ||
669 | } | 873 | } |
670 | EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); | 874 | EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); |
671 | 875 | ||
876 | static void __resched_task(struct task_struct *p, int tif_bit); | ||
877 | |||
878 | static inline void resched_task(struct task_struct *p) | ||
879 | { | ||
880 | __resched_task(p, TIF_NEED_RESCHED); | ||
881 | } | ||
882 | |||
883 | #ifdef CONFIG_SCHED_HRTICK | ||
884 | /* | ||
885 | * Use HR-timers to deliver accurate preemption points. | ||
886 | * | ||
887 | * It's all a bit involved since we cannot program an hrtimer while holding | ||
888 | * the rq->lock. So what we do is store state in rq->hrtick_* and ask for a | ||
889 | * reschedule event. | ||
890 | * | ||
891 | * When we get rescheduled we reprogram the hrtick_timer outside of the | ||
892 | * rq->lock. | ||
893 | */ | ||
894 | static inline void resched_hrt(struct task_struct *p) | ||
895 | { | ||
896 | __resched_task(p, TIF_HRTICK_RESCHED); | ||
897 | } | ||
898 | |||
899 | static inline void resched_rq(struct rq *rq) | ||
900 | { | ||
901 | unsigned long flags; | ||
902 | |||
903 | spin_lock_irqsave(&rq->lock, flags); | ||
904 | resched_task(rq->curr); | ||
905 | spin_unlock_irqrestore(&rq->lock, flags); | ||
906 | } | ||
907 | |||
908 | enum { | ||
909 | HRTICK_SET, /* re-program hrtick_timer */ | ||
910 | HRTICK_RESET, /* not a new slice */ | ||
911 | }; | ||
912 | |||
913 | /* | ||
914 | * Use hrtick when: | ||
915 | * - enabled by features | ||
916 | * - hrtimer is actually high res | ||
917 | */ | ||
918 | static inline int hrtick_enabled(struct rq *rq) | ||
919 | { | ||
920 | if (!sched_feat(HRTICK)) | ||
921 | return 0; | ||
922 | return hrtimer_is_hres_active(&rq->hrtick_timer); | ||
923 | } | ||
924 | |||
925 | /* | ||
926 | * Called to set the hrtick timer state. | ||
927 | * | ||
928 | * called with rq->lock held and irqs disabled | ||
929 | */ | ||
930 | static void hrtick_start(struct rq *rq, u64 delay, int reset) | ||
931 | { | ||
932 | assert_spin_locked(&rq->lock); | ||
933 | |||
934 | /* | ||
935 | * preempt at: now + delay | ||
936 | */ | ||
937 | rq->hrtick_expire = | ||
938 | ktime_add_ns(rq->hrtick_timer.base->get_time(), delay); | ||
939 | /* | ||
940 | * indicate we need to program the timer | ||
941 | */ | ||
942 | __set_bit(HRTICK_SET, &rq->hrtick_flags); | ||
943 | if (reset) | ||
944 | __set_bit(HRTICK_RESET, &rq->hrtick_flags); | ||
945 | |||
946 | /* | ||
947 | * New slices are called from the schedule path and don't need a | ||
948 | * forced reschedule. | ||
949 | */ | ||
950 | if (reset) | ||
951 | resched_hrt(rq->curr); | ||
952 | } | ||
953 | |||
954 | static void hrtick_clear(struct rq *rq) | ||
955 | { | ||
956 | if (hrtimer_active(&rq->hrtick_timer)) | ||
957 | hrtimer_cancel(&rq->hrtick_timer); | ||
958 | } | ||
959 | |||
960 | /* | ||
961 | * Update the timer from the possible pending state. | ||
962 | */ | ||
963 | static void hrtick_set(struct rq *rq) | ||
964 | { | ||
965 | ktime_t time; | ||
966 | int set, reset; | ||
967 | unsigned long flags; | ||
968 | |||
969 | WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); | ||
970 | |||
971 | spin_lock_irqsave(&rq->lock, flags); | ||
972 | set = __test_and_clear_bit(HRTICK_SET, &rq->hrtick_flags); | ||
973 | reset = __test_and_clear_bit(HRTICK_RESET, &rq->hrtick_flags); | ||
974 | time = rq->hrtick_expire; | ||
975 | clear_thread_flag(TIF_HRTICK_RESCHED); | ||
976 | spin_unlock_irqrestore(&rq->lock, flags); | ||
977 | |||
978 | if (set) { | ||
979 | hrtimer_start(&rq->hrtick_timer, time, HRTIMER_MODE_ABS); | ||
980 | if (reset && !hrtimer_active(&rq->hrtick_timer)) | ||
981 | resched_rq(rq); | ||
982 | } else | ||
983 | hrtick_clear(rq); | ||
984 | } | ||
985 | |||
986 | /* | ||
987 | * High-resolution timer tick. | ||
988 | * Runs from hardirq context with interrupts disabled. | ||
989 | */ | ||
990 | static enum hrtimer_restart hrtick(struct hrtimer *timer) | ||
991 | { | ||
992 | struct rq *rq = container_of(timer, struct rq, hrtick_timer); | ||
993 | |||
994 | WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); | ||
995 | |||
996 | spin_lock(&rq->lock); | ||
997 | __update_rq_clock(rq); | ||
998 | rq->curr->sched_class->task_tick(rq, rq->curr, 1); | ||
999 | spin_unlock(&rq->lock); | ||
1000 | |||
1001 | return HRTIMER_NORESTART; | ||
1002 | } | ||
1003 | |||
1004 | static inline void init_rq_hrtick(struct rq *rq) | ||
1005 | { | ||
1006 | rq->hrtick_flags = 0; | ||
1007 | hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); | ||
1008 | rq->hrtick_timer.function = hrtick; | ||
1009 | rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; | ||
1010 | } | ||
1011 | |||
1012 | void hrtick_resched(void) | ||
1013 | { | ||
1014 | struct rq *rq; | ||
1015 | unsigned long flags; | ||
1016 | |||
1017 | if (!test_thread_flag(TIF_HRTICK_RESCHED)) | ||
1018 | return; | ||
1019 | |||
1020 | local_irq_save(flags); | ||
1021 | rq = cpu_rq(smp_processor_id()); | ||
1022 | hrtick_set(rq); | ||
1023 | local_irq_restore(flags); | ||
1024 | } | ||
1025 | #else | ||
1026 | static inline void hrtick_clear(struct rq *rq) | ||
1027 | { | ||
1028 | } | ||
1029 | |||
1030 | static inline void hrtick_set(struct rq *rq) | ||
1031 | { | ||
1032 | } | ||
1033 | |||
1034 | static inline void init_rq_hrtick(struct rq *rq) | ||
1035 | { | ||
1036 | } | ||
1037 | |||
1038 | void hrtick_resched(void) | ||
1039 | { | ||
1040 | } | ||
1041 | #endif | ||
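The pattern the whole CONFIG_SCHED_HRTICK block above relies on is "record the request under the lock, program the timer after the lock is dropped". The sketch below is a userspace model only, under obvious assumptions: a pthread mutex stands in for rq->lock, a plain int for the HRTICK_SET bit, and printf for actually arming an hrtimer (compile with -pthread).

#include <stdio.h>
#include <pthread.h>
#include <stdint.h>

/* minimal model of the deferred-hrtick pattern */
struct fake_rq {
        pthread_mutex_t lock;
        int             hrtick_pending; /* stands in for HRTICK_SET */
        uint64_t        hrtick_expire;  /* stands in for rq->hrtick_expire */
};

static void request_hrtick(struct fake_rq *rq, uint64_t expire_ns)
{
        /* called with rq->lock held, like hrtick_start() */
        rq->hrtick_expire = expire_ns;
        rq->hrtick_pending = 1;
}

static void flush_hrtick(struct fake_rq *rq)
{
        uint64_t expire;
        int pending;

        pthread_mutex_lock(&rq->lock);
        pending = rq->hrtick_pending;
        expire = rq->hrtick_expire;
        rq->hrtick_pending = 0;
        pthread_mutex_unlock(&rq->lock);

        if (pending)    /* outside the lock: safe to program the timer */
                printf("arming timer for %llu ns\n",
                       (unsigned long long)expire);
}

int main(void)
{
        struct fake_rq rq = { .lock = PTHREAD_MUTEX_INITIALIZER };

        pthread_mutex_lock(&rq.lock);
        request_hrtick(&rq, 1500000);   /* 1.5 ms slice, made-up value */
        pthread_mutex_unlock(&rq.lock);

        flush_hrtick(&rq);              /* like hrtick_set() after schedule() */
        return 0;
}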
1042 | |||
672 | /* | 1043 | /* |
673 | * resched_task - mark a task 'to be rescheduled now'. | 1044 | * resched_task - mark a task 'to be rescheduled now'. |
674 | * | 1045 | * |
@@ -682,16 +1053,16 @@ EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); | |||
682 | #define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) | 1053 | #define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) |
683 | #endif | 1054 | #endif |
684 | 1055 | ||
685 | static void resched_task(struct task_struct *p) | 1056 | static void __resched_task(struct task_struct *p, int tif_bit) |
686 | { | 1057 | { |
687 | int cpu; | 1058 | int cpu; |
688 | 1059 | ||
689 | assert_spin_locked(&task_rq(p)->lock); | 1060 | assert_spin_locked(&task_rq(p)->lock); |
690 | 1061 | ||
691 | if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED))) | 1062 | if (unlikely(test_tsk_thread_flag(p, tif_bit))) |
692 | return; | 1063 | return; |
693 | 1064 | ||
694 | set_tsk_thread_flag(p, TIF_NEED_RESCHED); | 1065 | set_tsk_thread_flag(p, tif_bit); |
695 | 1066 | ||
696 | cpu = task_cpu(p); | 1067 | cpu = task_cpu(p); |
697 | if (cpu == smp_processor_id()) | 1068 | if (cpu == smp_processor_id()) |
@@ -714,10 +1085,10 @@ static void resched_cpu(int cpu) | |||
714 | spin_unlock_irqrestore(&rq->lock, flags); | 1085 | spin_unlock_irqrestore(&rq->lock, flags); |
715 | } | 1086 | } |
716 | #else | 1087 | #else |
717 | static inline void resched_task(struct task_struct *p) | 1088 | static void __resched_task(struct task_struct *p, int tif_bit) |
718 | { | 1089 | { |
719 | assert_spin_locked(&task_rq(p)->lock); | 1090 | assert_spin_locked(&task_rq(p)->lock); |
720 | set_tsk_need_resched(p); | 1091 | set_tsk_thread_flag(p, tif_bit); |
721 | } | 1092 | } |
722 | #endif | 1093 | #endif |
723 | 1094 | ||
@@ -776,7 +1147,7 @@ static inline void update_load_sub(struct load_weight *lw, unsigned long dec) | |||
776 | * To aid in avoiding the subversion of "niceness" due to uneven distribution | 1147 | * To aid in avoiding the subversion of "niceness" due to uneven distribution |
777 | * of tasks with abnormal "nice" values across CPUs the contribution that | 1148 | * of tasks with abnormal "nice" values across CPUs the contribution that |
778 | * each task makes to its run queue's load is weighted according to its | 1149 | * each task makes to its run queue's load is weighted according to its |
779 | * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a | 1150 | * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a |
780 | * scaled version of the new time slice allocation that they receive on time | 1151 | * scaled version of the new time slice allocation that they receive on time |
781 | * slice expiry etc. | 1152 | * slice expiry etc. |
782 | */ | 1153 | */ |
@@ -851,6 +1222,29 @@ iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
851 | struct rq_iterator *iterator); | 1222 | struct rq_iterator *iterator); |
852 | #endif | 1223 | #endif |
853 | 1224 | ||
1225 | #ifdef CONFIG_CGROUP_CPUACCT | ||
1226 | static void cpuacct_charge(struct task_struct *tsk, u64 cputime); | ||
1227 | #else | ||
1228 | static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} | ||
1229 | #endif | ||
1230 | |||
1231 | static inline void inc_cpu_load(struct rq *rq, unsigned long load) | ||
1232 | { | ||
1233 | update_load_add(&rq->load, load); | ||
1234 | } | ||
1235 | |||
1236 | static inline void dec_cpu_load(struct rq *rq, unsigned long load) | ||
1237 | { | ||
1238 | update_load_sub(&rq->load, load); | ||
1239 | } | ||
1240 | |||
1241 | #ifdef CONFIG_SMP | ||
1242 | static unsigned long source_load(int cpu, int type); | ||
1243 | static unsigned long target_load(int cpu, int type); | ||
1244 | static unsigned long cpu_avg_load_per_task(int cpu); | ||
1245 | static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); | ||
1246 | #endif /* CONFIG_SMP */ | ||
1247 | |||
854 | #include "sched_stats.h" | 1248 | #include "sched_stats.h" |
855 | #include "sched_idletask.c" | 1249 | #include "sched_idletask.c" |
856 | #include "sched_fair.c" | 1250 | #include "sched_fair.c" |
@@ -861,41 +1255,14 @@ iter_move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
861 | 1255 | ||
862 | #define sched_class_highest (&rt_sched_class) | 1256 | #define sched_class_highest (&rt_sched_class) |
863 | 1257 | ||
864 | /* | 1258 | static void inc_nr_running(struct rq *rq) |
865 | * Update delta_exec, delta_fair fields for rq. | ||
866 | * | ||
867 | * delta_fair clock advances at a rate inversely proportional to | ||
868 | * total load (rq->load.weight) on the runqueue, while | ||
869 | * delta_exec advances at the same rate as wall-clock (provided | ||
870 | * cpu is not idle). | ||
871 | * | ||
872 | * delta_exec / delta_fair is a measure of the (smoothened) load on this | ||
873 | * runqueue over any given interval. This (smoothened) load is used | ||
874 | * during load balance. | ||
875 | * | ||
876 | * This function is called /before/ updating rq->load | ||
877 | * and when switching tasks. | ||
878 | */ | ||
879 | static inline void inc_load(struct rq *rq, const struct task_struct *p) | ||
880 | { | ||
881 | update_load_add(&rq->load, p->se.load.weight); | ||
882 | } | ||
883 | |||
884 | static inline void dec_load(struct rq *rq, const struct task_struct *p) | ||
885 | { | ||
886 | update_load_sub(&rq->load, p->se.load.weight); | ||
887 | } | ||
888 | |||
889 | static void inc_nr_running(struct task_struct *p, struct rq *rq) | ||
890 | { | 1259 | { |
891 | rq->nr_running++; | 1260 | rq->nr_running++; |
892 | inc_load(rq, p); | ||
893 | } | 1261 | } |
894 | 1262 | ||
895 | static void dec_nr_running(struct task_struct *p, struct rq *rq) | 1263 | static void dec_nr_running(struct rq *rq) |
896 | { | 1264 | { |
897 | rq->nr_running--; | 1265 | rq->nr_running--; |
898 | dec_load(rq, p); | ||
899 | } | 1266 | } |
900 | 1267 | ||
901 | static void set_load_weight(struct task_struct *p) | 1268 | static void set_load_weight(struct task_struct *p) |
@@ -983,11 +1350,11 @@ static int effective_prio(struct task_struct *p) | |||
983 | */ | 1350 | */ |
984 | static void activate_task(struct rq *rq, struct task_struct *p, int wakeup) | 1351 | static void activate_task(struct rq *rq, struct task_struct *p, int wakeup) |
985 | { | 1352 | { |
986 | if (p->state == TASK_UNINTERRUPTIBLE) | 1353 | if (task_contributes_to_load(p)) |
987 | rq->nr_uninterruptible--; | 1354 | rq->nr_uninterruptible--; |
988 | 1355 | ||
989 | enqueue_task(rq, p, wakeup); | 1356 | enqueue_task(rq, p, wakeup); |
990 | inc_nr_running(p, rq); | 1357 | inc_nr_running(rq); |
991 | } | 1358 | } |
992 | 1359 | ||
993 | /* | 1360 | /* |
@@ -995,11 +1362,11 @@ static void activate_task(struct rq *rq, struct task_struct *p, int wakeup) | |||
995 | */ | 1362 | */ |
996 | static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep) | 1363 | static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep) |
997 | { | 1364 | { |
998 | if (p->state == TASK_UNINTERRUPTIBLE) | 1365 | if (task_contributes_to_load(p)) |
999 | rq->nr_uninterruptible++; | 1366 | rq->nr_uninterruptible++; |
1000 | 1367 | ||
1001 | dequeue_task(rq, p, sleep); | 1368 | dequeue_task(rq, p, sleep); |
1002 | dec_nr_running(p, rq); | 1369 | dec_nr_running(rq); |
1003 | } | 1370 | } |
1004 | 1371 | ||
1005 | /** | 1372 | /** |
@@ -1019,10 +1386,28 @@ unsigned long weighted_cpuload(const int cpu) | |||
1019 | 1386 | ||
1020 | static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) | 1387 | static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) |
1021 | { | 1388 | { |
1389 | set_task_rq(p, cpu); | ||
1022 | #ifdef CONFIG_SMP | 1390 | #ifdef CONFIG_SMP |
1391 | /* | ||
1392 | * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be | ||
1393 | * successfully executed on another CPU. We must ensure that updates of | ||
1394 | * per-task data have been completed by this moment. | ||
1395 | */ | ||
1396 | smp_wmb(); | ||
1023 | task_thread_info(p)->cpu = cpu; | 1397 | task_thread_info(p)->cpu = cpu; |
1024 | #endif | 1398 | #endif |
1025 | set_task_cfs_rq(p); | 1399 | } |
1400 | |||
1401 | static inline void check_class_changed(struct rq *rq, struct task_struct *p, | ||
1402 | const struct sched_class *prev_class, | ||
1403 | int oldprio, int running) | ||
1404 | { | ||
1405 | if (prev_class != p->sched_class) { | ||
1406 | if (prev_class->switched_from) | ||
1407 | prev_class->switched_from(rq, p, running); | ||
1408 | p->sched_class->switched_to(rq, p, running); | ||
1409 | } else | ||
1410 | p->sched_class->prio_changed(rq, p, oldprio, running); | ||
1026 | } | 1411 | } |
1027 | 1412 | ||
1028 | #ifdef CONFIG_SMP | 1413 | #ifdef CONFIG_SMP |
@@ -1030,7 +1415,7 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) | |||
1030 | /* | 1415 | /* |
1031 | * Is this task likely cache-hot: | 1416 | * Is this task likely cache-hot: |
1032 | */ | 1417 | */ |
1033 | static inline int | 1418 | static int |
1034 | task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) | 1419 | task_hot(struct task_struct *p, u64 now, struct sched_domain *sd) |
1035 | { | 1420 | { |
1036 | s64 delta; | 1421 | s64 delta; |
@@ -1255,7 +1640,7 @@ static unsigned long target_load(int cpu, int type) | |||
1255 | /* | 1640 | /* |
1256 | * Return the average load per task on the cpu's run queue | 1641 | * Return the average load per task on the cpu's run queue |
1257 | */ | 1642 | */ |
1258 | static inline unsigned long cpu_avg_load_per_task(int cpu) | 1643 | static unsigned long cpu_avg_load_per_task(int cpu) |
1259 | { | 1644 | { |
1260 | struct rq *rq = cpu_rq(cpu); | 1645 | struct rq *rq = cpu_rq(cpu); |
1261 | unsigned long total = weighted_cpuload(cpu); | 1646 | unsigned long total = weighted_cpuload(cpu); |
@@ -1412,58 +1797,6 @@ static int sched_balance_self(int cpu, int flag) | |||
1412 | 1797 | ||
1413 | #endif /* CONFIG_SMP */ | 1798 | #endif /* CONFIG_SMP */ |
1414 | 1799 | ||
1415 | /* | ||
1416 | * wake_idle() will wake a task on an idle cpu if task->cpu is | ||
1417 | * not idle and an idle cpu is available. The span of cpus to | ||
1418 | * search starts with cpus closest then further out as needed, | ||
1419 | * so we always favor a closer, idle cpu. | ||
1420 | * | ||
1421 | * Returns the CPU we should wake onto. | ||
1422 | */ | ||
1423 | #if defined(ARCH_HAS_SCHED_WAKE_IDLE) | ||
1424 | static int wake_idle(int cpu, struct task_struct *p) | ||
1425 | { | ||
1426 | cpumask_t tmp; | ||
1427 | struct sched_domain *sd; | ||
1428 | int i; | ||
1429 | |||
1430 | /* | ||
1431 | * If it is idle, then it is the best cpu to run this task. | ||
1432 | * | ||
1433 | * This cpu is also the best, if it has more than one task already. | ||
1434 | * Siblings must be also busy(in most cases) as they didn't already | ||
1435 | * pickup the extra load from this cpu and hence we need not check | ||
1436 | * sibling runqueue info. This will avoid the checks and cache miss | ||
1437 | * penalities associated with that. | ||
1438 | */ | ||
1439 | if (idle_cpu(cpu) || cpu_rq(cpu)->nr_running > 1) | ||
1440 | return cpu; | ||
1441 | |||
1442 | for_each_domain(cpu, sd) { | ||
1443 | if (sd->flags & SD_WAKE_IDLE) { | ||
1444 | cpus_and(tmp, sd->span, p->cpus_allowed); | ||
1445 | for_each_cpu_mask(i, tmp) { | ||
1446 | if (idle_cpu(i)) { | ||
1447 | if (i != task_cpu(p)) { | ||
1448 | schedstat_inc(p, | ||
1449 | se.nr_wakeups_idle); | ||
1450 | } | ||
1451 | return i; | ||
1452 | } | ||
1453 | } | ||
1454 | } else { | ||
1455 | break; | ||
1456 | } | ||
1457 | } | ||
1458 | return cpu; | ||
1459 | } | ||
1460 | #else | ||
1461 | static inline int wake_idle(int cpu, struct task_struct *p) | ||
1462 | { | ||
1463 | return cpu; | ||
1464 | } | ||
1465 | #endif | ||
1466 | |||
1467 | /*** | 1800 | /*** |
1468 | * try_to_wake_up - wake up a thread | 1801 | * try_to_wake_up - wake up a thread |
1469 | * @p: the to-be-woken-up thread | 1802 | * @p: the to-be-woken-up thread |
@@ -1484,11 +1817,6 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) | |||
1484 | unsigned long flags; | 1817 | unsigned long flags; |
1485 | long old_state; | 1818 | long old_state; |
1486 | struct rq *rq; | 1819 | struct rq *rq; |
1487 | #ifdef CONFIG_SMP | ||
1488 | struct sched_domain *sd, *this_sd = NULL; | ||
1489 | unsigned long load, this_load; | ||
1490 | int new_cpu; | ||
1491 | #endif | ||
1492 | 1820 | ||
1493 | rq = task_rq_lock(p, &flags); | 1821 | rq = task_rq_lock(p, &flags); |
1494 | old_state = p->state; | 1822 | old_state = p->state; |
@@ -1506,92 +1834,9 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) | |||
1506 | if (unlikely(task_running(rq, p))) | 1834 | if (unlikely(task_running(rq, p))) |
1507 | goto out_activate; | 1835 | goto out_activate; |
1508 | 1836 | ||
1509 | new_cpu = cpu; | 1837 | cpu = p->sched_class->select_task_rq(p, sync); |
1510 | 1838 | if (cpu != orig_cpu) { | |
1511 | schedstat_inc(rq, ttwu_count); | 1839 | set_task_cpu(p, cpu); |
1512 | if (cpu == this_cpu) { | ||
1513 | schedstat_inc(rq, ttwu_local); | ||
1514 | goto out_set_cpu; | ||
1515 | } | ||
1516 | |||
1517 | for_each_domain(this_cpu, sd) { | ||
1518 | if (cpu_isset(cpu, sd->span)) { | ||
1519 | schedstat_inc(sd, ttwu_wake_remote); | ||
1520 | this_sd = sd; | ||
1521 | break; | ||
1522 | } | ||
1523 | } | ||
1524 | |||
1525 | if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed))) | ||
1526 | goto out_set_cpu; | ||
1527 | |||
1528 | /* | ||
1529 | * Check for affine wakeup and passive balancing possibilities. | ||
1530 | */ | ||
1531 | if (this_sd) { | ||
1532 | int idx = this_sd->wake_idx; | ||
1533 | unsigned int imbalance; | ||
1534 | |||
1535 | imbalance = 100 + (this_sd->imbalance_pct - 100) / 2; | ||
1536 | |||
1537 | load = source_load(cpu, idx); | ||
1538 | this_load = target_load(this_cpu, idx); | ||
1539 | |||
1540 | new_cpu = this_cpu; /* Wake to this CPU if we can */ | ||
1541 | |||
1542 | if (this_sd->flags & SD_WAKE_AFFINE) { | ||
1543 | unsigned long tl = this_load; | ||
1544 | unsigned long tl_per_task; | ||
1545 | |||
1546 | /* | ||
1547 | * Attract cache-cold tasks on sync wakeups: | ||
1548 | */ | ||
1549 | if (sync && !task_hot(p, rq->clock, this_sd)) | ||
1550 | goto out_set_cpu; | ||
1551 | |||
1552 | schedstat_inc(p, se.nr_wakeups_affine_attempts); | ||
1553 | tl_per_task = cpu_avg_load_per_task(this_cpu); | ||
1554 | |||
1555 | /* | ||
1556 | * If sync wakeup then subtract the (maximum possible) | ||
1557 | * effect of the currently running task from the load | ||
1558 | * of the current CPU: | ||
1559 | */ | ||
1560 | if (sync) | ||
1561 | tl -= current->se.load.weight; | ||
1562 | |||
1563 | if ((tl <= load && | ||
1564 | tl + target_load(cpu, idx) <= tl_per_task) || | ||
1565 | 100*(tl + p->se.load.weight) <= imbalance*load) { | ||
1566 | /* | ||
1567 | * This domain has SD_WAKE_AFFINE and | ||
1568 | * p is cache cold in this domain, and | ||
1569 | * there is no bad imbalance. | ||
1570 | */ | ||
1571 | schedstat_inc(this_sd, ttwu_move_affine); | ||
1572 | schedstat_inc(p, se.nr_wakeups_affine); | ||
1573 | goto out_set_cpu; | ||
1574 | } | ||
1575 | } | ||
1576 | |||
1577 | /* | ||
1578 | * Start passive balancing when half the imbalance_pct | ||
1579 | * limit is reached. | ||
1580 | */ | ||
1581 | if (this_sd->flags & SD_WAKE_BALANCE) { | ||
1582 | if (imbalance*this_load <= 100*load) { | ||
1583 | schedstat_inc(this_sd, ttwu_move_balance); | ||
1584 | schedstat_inc(p, se.nr_wakeups_passive); | ||
1585 | goto out_set_cpu; | ||
1586 | } | ||
1587 | } | ||
1588 | } | ||
1589 | |||
1590 | new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */ | ||
1591 | out_set_cpu: | ||
1592 | new_cpu = wake_idle(new_cpu, p); | ||
1593 | if (new_cpu != cpu) { | ||
1594 | set_task_cpu(p, new_cpu); | ||
1595 | task_rq_unlock(rq, &flags); | 1840 | task_rq_unlock(rq, &flags); |
1596 | /* might preempt at this point */ | 1841 | /* might preempt at this point */ |
1597 | rq = task_rq_lock(p, &flags); | 1842 | rq = task_rq_lock(p, &flags); |
@@ -1605,6 +1850,21 @@ out_set_cpu: | |||
1605 | cpu = task_cpu(p); | 1850 | cpu = task_cpu(p); |
1606 | } | 1851 | } |
1607 | 1852 | ||
1853 | #ifdef CONFIG_SCHEDSTATS | ||
1854 | schedstat_inc(rq, ttwu_count); | ||
1855 | if (cpu == this_cpu) | ||
1856 | schedstat_inc(rq, ttwu_local); | ||
1857 | else { | ||
1858 | struct sched_domain *sd; | ||
1859 | for_each_domain(this_cpu, sd) { | ||
1860 | if (cpu_isset(cpu, sd->span)) { | ||
1861 | schedstat_inc(sd, ttwu_wake_remote); | ||
1862 | break; | ||
1863 | } | ||
1864 | } | ||
1865 | } | ||
1866 | #endif | ||
1867 | |||
1608 | out_activate: | 1868 | out_activate: |
1609 | #endif /* CONFIG_SMP */ | 1869 | #endif /* CONFIG_SMP */ |
1610 | schedstat_inc(p, se.nr_wakeups); | 1870 | schedstat_inc(p, se.nr_wakeups); |
@@ -1623,6 +1883,10 @@ out_activate: | |||
1623 | 1883 | ||
1624 | out_running: | 1884 | out_running: |
1625 | p->state = TASK_RUNNING; | 1885 | p->state = TASK_RUNNING; |
1886 | #ifdef CONFIG_SMP | ||
1887 | if (p->sched_class->task_wake_up) | ||
1888 | p->sched_class->task_wake_up(rq, p); | ||
1889 | #endif | ||
1626 | out: | 1890 | out: |
1627 | task_rq_unlock(rq, &flags); | 1891 | task_rq_unlock(rq, &flags); |
1628 | 1892 | ||
@@ -1631,8 +1895,7 @@ out: | |||
1631 | 1895 | ||
1632 | int fastcall wake_up_process(struct task_struct *p) | 1896 | int fastcall wake_up_process(struct task_struct *p) |
1633 | { | 1897 | { |
1634 | return try_to_wake_up(p, TASK_STOPPED | TASK_TRACED | | 1898 | return try_to_wake_up(p, TASK_ALL, 0); |
1635 | TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0); | ||
1636 | } | 1899 | } |
1637 | EXPORT_SYMBOL(wake_up_process); | 1900 | EXPORT_SYMBOL(wake_up_process); |
1638 | 1901 | ||
@@ -1665,7 +1928,7 @@ static void __sched_fork(struct task_struct *p) | |||
1665 | p->se.wait_max = 0; | 1928 | p->se.wait_max = 0; |
1666 | #endif | 1929 | #endif |
1667 | 1930 | ||
1668 | INIT_LIST_HEAD(&p->run_list); | 1931 | INIT_LIST_HEAD(&p->rt.run_list); |
1669 | p->se.on_rq = 0; | 1932 | p->se.on_rq = 0; |
1670 | 1933 | ||
1671 | #ifdef CONFIG_PREEMPT_NOTIFIERS | 1934 | #ifdef CONFIG_PREEMPT_NOTIFIERS |
@@ -1742,9 +2005,13 @@ void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags) | |||
1742 | * management (if any): | 2005 | * management (if any): |
1743 | */ | 2006 | */ |
1744 | p->sched_class->task_new(rq, p); | 2007 | p->sched_class->task_new(rq, p); |
1745 | inc_nr_running(p, rq); | 2008 | inc_nr_running(rq); |
1746 | } | 2009 | } |
1747 | check_preempt_curr(rq, p); | 2010 | check_preempt_curr(rq, p); |
2011 | #ifdef CONFIG_SMP | ||
2012 | if (p->sched_class->task_wake_up) | ||
2013 | p->sched_class->task_wake_up(rq, p); | ||
2014 | #endif | ||
1748 | task_rq_unlock(rq, &flags); | 2015 | task_rq_unlock(rq, &flags); |
1749 | } | 2016 | } |
1750 | 2017 | ||
@@ -1839,7 +2106,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev, | |||
1839 | * and do any other architecture-specific cleanup actions. | 2106 | * and do any other architecture-specific cleanup actions. |
1840 | * | 2107 | * |
1841 | * Note that we may have delayed dropping an mm in context_switch(). If | 2108 | * Note that we may have delayed dropping an mm in context_switch(). If |
1842 | * so, we finish that here outside of the runqueue lock. (Doing it | 2109 | * so, we finish that here outside of the runqueue lock. (Doing it |
1843 | * with the lock held can cause deadlocks; see schedule() for | 2110 | * with the lock held can cause deadlocks; see schedule() for |
1844 | * details.) | 2111 | * details.) |
1845 | */ | 2112 | */ |
@@ -1865,6 +2132,11 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) | |||
1865 | prev_state = prev->state; | 2132 | prev_state = prev->state; |
1866 | finish_arch_switch(prev); | 2133 | finish_arch_switch(prev); |
1867 | finish_lock_switch(rq, prev); | 2134 | finish_lock_switch(rq, prev); |
2135 | #ifdef CONFIG_SMP | ||
2136 | if (current->sched_class->post_schedule) | ||
2137 | current->sched_class->post_schedule(rq); | ||
2138 | #endif | ||
2139 | |||
1868 | fire_sched_in_preempt_notifiers(current); | 2140 | fire_sched_in_preempt_notifiers(current); |
1869 | if (mm) | 2141 | if (mm) |
1870 | mmdrop(mm); | 2142 | mmdrop(mm); |
@@ -2098,11 +2370,13 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2) | |||
2098 | /* | 2370 | /* |
2099 | * double_lock_balance - lock the busiest runqueue, this_rq is locked already. | 2371 | * double_lock_balance - lock the busiest runqueue, this_rq is locked already. |
2100 | */ | 2372 | */ |
2101 | static void double_lock_balance(struct rq *this_rq, struct rq *busiest) | 2373 | static int double_lock_balance(struct rq *this_rq, struct rq *busiest) |
2102 | __releases(this_rq->lock) | 2374 | __releases(this_rq->lock) |
2103 | __acquires(busiest->lock) | 2375 | __acquires(busiest->lock) |
2104 | __acquires(this_rq->lock) | 2376 | __acquires(this_rq->lock) |
2105 | { | 2377 | { |
2378 | int ret = 0; | ||
2379 | |||
2106 | if (unlikely(!irqs_disabled())) { | 2380 | if (unlikely(!irqs_disabled())) { |
2107 | /* printk() doesn't work well under rq->lock */ | 2381 | /* printk() doesn't work well under rq->lock */ |
2108 | spin_unlock(&this_rq->lock); | 2382 | spin_unlock(&this_rq->lock); |
@@ -2113,15 +2387,17 @@ static void double_lock_balance(struct rq *this_rq, struct rq *busiest) | |||
2113 | spin_unlock(&this_rq->lock); | 2387 | spin_unlock(&this_rq->lock); |
2114 | spin_lock(&busiest->lock); | 2388 | spin_lock(&busiest->lock); |
2115 | spin_lock(&this_rq->lock); | 2389 | spin_lock(&this_rq->lock); |
2390 | ret = 1; | ||
2116 | } else | 2391 | } else |
2117 | spin_lock(&busiest->lock); | 2392 | spin_lock(&busiest->lock); |
2118 | } | 2393 | } |
2394 | return ret; | ||
2119 | } | 2395 | } |
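The return value added to double_lock_balance() above tells the caller whether this_rq->lock was dropped and re-taken (which can invalidate anything read under it). The deadlock-avoidance idea itself is the usual "take both locks in a fixed (address) order, backing off if you hold the wrong one first". Below is a minimal userspace sketch of that rule with pthread mutexes, purely illustrative and not the kernel's code (compile with -pthread):

#include <pthread.h>
#include <stdio.h>

/* lock two locks in a fixed (address) order to avoid AB-BA deadlock;
 * returns 1 if the already-held lock had to be dropped and re-taken,
 * mirroring the new return value of double_lock_balance() */
static int double_lock(pthread_mutex_t *held, pthread_mutex_t *other)
{
        int ret = 0;

        if (pthread_mutex_trylock(other)) {
                if (other < held) {             /* wrong order: back off */
                        pthread_mutex_unlock(held);
                        pthread_mutex_lock(other);
                        pthread_mutex_lock(held);
                        ret = 1;
                } else
                        pthread_mutex_lock(other);
        }
        return ret;
}

static pthread_mutex_t a = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t b = PTHREAD_MUTEX_INITIALIZER;

int main(void)
{
        pthread_mutex_lock(&a);
        printf("dropped and re-took? %d\n", double_lock(&a, &b));
        pthread_mutex_unlock(&b);
        pthread_mutex_unlock(&a);
        return 0;
}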
2120 | 2396 | ||
2121 | /* | 2397 | /* |
2122 | * If dest_cpu is allowed for this process, migrate the task to it. | 2398 | * If dest_cpu is allowed for this process, migrate the task to it. |
2123 | * This is accomplished by forcing the cpu_allowed mask to only | 2399 | * This is accomplished by forcing the cpu_allowed mask to only |
2124 | * allow dest_cpu, which will force the cpu onto dest_cpu. Then | 2400 | * allow dest_cpu, which will force the cpu onto dest_cpu. Then |
2125 | * the cpu_allowed mask is restored. | 2401 | * the cpu_allowed mask is restored. |
2126 | */ | 2402 | */ |
2127 | static void sched_migrate_task(struct task_struct *p, int dest_cpu) | 2403 | static void sched_migrate_task(struct task_struct *p, int dest_cpu) |
@@ -2237,7 +2513,7 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
2237 | enum cpu_idle_type idle, int *all_pinned, | 2513 | enum cpu_idle_type idle, int *all_pinned, |
2238 | int *this_best_prio, struct rq_iterator *iterator) | 2514 | int *this_best_prio, struct rq_iterator *iterator) |
2239 | { | 2515 | { |
2240 | int pulled = 0, pinned = 0, skip_for_load; | 2516 | int loops = 0, pulled = 0, pinned = 0, skip_for_load; |
2241 | struct task_struct *p; | 2517 | struct task_struct *p; |
2242 | long rem_load_move = max_load_move; | 2518 | long rem_load_move = max_load_move; |
2243 | 2519 | ||
@@ -2251,10 +2527,10 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
2251 | */ | 2527 | */ |
2252 | p = iterator->start(iterator->arg); | 2528 | p = iterator->start(iterator->arg); |
2253 | next: | 2529 | next: |
2254 | if (!p) | 2530 | if (!p || loops++ > sysctl_sched_nr_migrate) |
2255 | goto out; | 2531 | goto out; |
2256 | /* | 2532 | /* |
2257 | * To help distribute high priority tasks accross CPUs we don't | 2533 | * To help distribute high priority tasks across CPUs we don't |
2258 | * skip a task if it will be the highest priority task (i.e. smallest | 2534 | * skip a task if it will be the highest priority task (i.e. smallest |
2259 | * prio value) on its new queue regardless of its load weight | 2535 | * prio value) on its new queue regardless of its load weight |
2260 | */ | 2536 | */ |
@@ -2271,8 +2547,7 @@ next: | |||
2271 | rem_load_move -= p->se.load.weight; | 2547 | rem_load_move -= p->se.load.weight; |
2272 | 2548 | ||
2273 | /* | 2549 | /* |
2274 | * We only want to steal up to the prescribed number of tasks | 2550 | * We only want to steal up to the prescribed amount of weighted load. |
2275 | * and the prescribed amount of weighted load. | ||
2276 | */ | 2551 | */ |
2277 | if (rem_load_move > 0) { | 2552 | if (rem_load_move > 0) { |
2278 | if (p->prio < *this_best_prio) | 2553 | if (p->prio < *this_best_prio) |
@@ -2567,7 +2842,7 @@ group_next: | |||
2567 | * tasks around. Thus we look for the minimum possible imbalance. | 2842 | * tasks around. Thus we look for the minimum possible imbalance. |
2568 | * Negative imbalances (*we* are more loaded than anyone else) will | 2843 | * Negative imbalances (*we* are more loaded than anyone else) will |
2569 | * be counted as no imbalance for these purposes -- we can't fix that | 2844 | * be counted as no imbalance for these purposes -- we can't fix that |
2570 | * by pulling tasks to us. Be careful of negative numbers as they'll | 2845 | * by pulling tasks to us. Be careful of negative numbers as they'll |
2571 | * appear as very large values with unsigned longs. | 2846 | * appear as very large values with unsigned longs. |
2572 | */ | 2847 | */ |
2573 | if (max_load <= busiest_load_per_task) | 2848 | if (max_load <= busiest_load_per_task) |
@@ -3002,7 +3277,7 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) | |||
3002 | 3277 | ||
3003 | /* | 3278 | /* |
3004 | * This condition is "impossible", if it occurs | 3279 | * This condition is "impossible", if it occurs |
3005 | * we need to fix it. Originally reported by | 3280 | * we need to fix it. Originally reported by |
3006 | * Bjorn Helgaas on a 128-cpu setup. | 3281 | * Bjorn Helgaas on a 128-cpu setup. |
3007 | */ | 3282 | */ |
3008 | BUG_ON(busiest_rq == target_rq); | 3283 | BUG_ON(busiest_rq == target_rq); |
@@ -3034,7 +3309,7 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) | |||
3034 | #ifdef CONFIG_NO_HZ | 3309 | #ifdef CONFIG_NO_HZ |
3035 | static struct { | 3310 | static struct { |
3036 | atomic_t load_balancer; | 3311 | atomic_t load_balancer; |
3037 | cpumask_t cpu_mask; | 3312 | cpumask_t cpu_mask; |
3038 | } nohz ____cacheline_aligned = { | 3313 | } nohz ____cacheline_aligned = { |
3039 | .load_balancer = ATOMIC_INIT(-1), | 3314 | .load_balancer = ATOMIC_INIT(-1), |
3040 | .cpu_mask = CPU_MASK_NONE, | 3315 | .cpu_mask = CPU_MASK_NONE, |
@@ -3315,7 +3590,7 @@ unsigned long long task_sched_runtime(struct task_struct *p) | |||
3315 | 3590 | ||
3316 | rq = task_rq_lock(p, &flags); | 3591 | rq = task_rq_lock(p, &flags); |
3317 | ns = p->se.sum_exec_runtime; | 3592 | ns = p->se.sum_exec_runtime; |
3318 | if (rq->curr == p) { | 3593 | if (task_current(rq, p)) { |
3319 | update_rq_clock(rq); | 3594 | update_rq_clock(rq); |
3320 | delta_exec = rq->clock - p->se.exec_start; | 3595 | delta_exec = rq->clock - p->se.exec_start; |
3321 | if ((s64)delta_exec > 0) | 3596 | if ((s64)delta_exec > 0) |
@@ -3335,13 +3610,9 @@ void account_user_time(struct task_struct *p, cputime_t cputime) | |||
3335 | { | 3610 | { |
3336 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; | 3611 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; |
3337 | cputime64_t tmp; | 3612 | cputime64_t tmp; |
3338 | struct rq *rq = this_rq(); | ||
3339 | 3613 | ||
3340 | p->utime = cputime_add(p->utime, cputime); | 3614 | p->utime = cputime_add(p->utime, cputime); |
3341 | 3615 | ||
3342 | if (p != rq->idle) | ||
3343 | cpuacct_charge(p, cputime); | ||
3344 | |||
3345 | /* Add user time to cpustat. */ | 3616 | /* Add user time to cpustat. */ |
3346 | tmp = cputime_to_cputime64(cputime); | 3617 | tmp = cputime_to_cputime64(cputime); |
3347 | if (TASK_NICE(p) > 0) | 3618 | if (TASK_NICE(p) > 0) |
@@ -3355,7 +3626,7 @@ void account_user_time(struct task_struct *p, cputime_t cputime) | |||
3355 | * @p: the process that the cpu time gets accounted to | 3626 | * @p: the process that the cpu time gets accounted to |
3356 | * @cputime: the cpu time spent in virtual machine since the last update | 3627 | * @cputime: the cpu time spent in virtual machine since the last update |
3357 | */ | 3628 | */ |
3358 | void account_guest_time(struct task_struct *p, cputime_t cputime) | 3629 | static void account_guest_time(struct task_struct *p, cputime_t cputime) |
3359 | { | 3630 | { |
3360 | cputime64_t tmp; | 3631 | cputime64_t tmp; |
3361 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; | 3632 | struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; |
@@ -3392,10 +3663,8 @@ void account_system_time(struct task_struct *p, int hardirq_offset, | |||
3392 | struct rq *rq = this_rq(); | 3663 | struct rq *rq = this_rq(); |
3393 | cputime64_t tmp; | 3664 | cputime64_t tmp; |
3394 | 3665 | ||
3395 | if (p->flags & PF_VCPU) { | 3666 | if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) |
3396 | account_guest_time(p, cputime); | 3667 | return account_guest_time(p, cputime); |
3397 | return; | ||
3398 | } | ||
3399 | 3668 | ||
3400 | p->stime = cputime_add(p->stime, cputime); | 3669 | p->stime = cputime_add(p->stime, cputime); |
3401 | 3670 | ||
@@ -3405,10 +3674,9 @@ void account_system_time(struct task_struct *p, int hardirq_offset, | |||
3405 | cpustat->irq = cputime64_add(cpustat->irq, tmp); | 3674 | cpustat->irq = cputime64_add(cpustat->irq, tmp); |
3406 | else if (softirq_count()) | 3675 | else if (softirq_count()) |
3407 | cpustat->softirq = cputime64_add(cpustat->softirq, tmp); | 3676 | cpustat->softirq = cputime64_add(cpustat->softirq, tmp); |
3408 | else if (p != rq->idle) { | 3677 | else if (p != rq->idle) |
3409 | cpustat->system = cputime64_add(cpustat->system, tmp); | 3678 | cpustat->system = cputime64_add(cpustat->system, tmp); |
3410 | cpuacct_charge(p, cputime); | 3679 | else if (atomic_read(&rq->nr_iowait) > 0) |
3411 | } else if (atomic_read(&rq->nr_iowait) > 0) | ||
3412 | cpustat->iowait = cputime64_add(cpustat->iowait, tmp); | 3680 | cpustat->iowait = cputime64_add(cpustat->iowait, tmp); |
3413 | else | 3681 | else |
3414 | cpustat->idle = cputime64_add(cpustat->idle, tmp); | 3682 | cpustat->idle = cputime64_add(cpustat->idle, tmp); |
@@ -3444,10 +3712,8 @@ void account_steal_time(struct task_struct *p, cputime_t steal) | |||
3444 | cpustat->iowait = cputime64_add(cpustat->iowait, tmp); | 3712 | cpustat->iowait = cputime64_add(cpustat->iowait, tmp); |
3445 | else | 3713 | else |
3446 | cpustat->idle = cputime64_add(cpustat->idle, tmp); | 3714 | cpustat->idle = cputime64_add(cpustat->idle, tmp); |
3447 | } else { | 3715 | } else |
3448 | cpustat->steal = cputime64_add(cpustat->steal, tmp); | 3716 | cpustat->steal = cputime64_add(cpustat->steal, tmp); |
3449 | cpuacct_charge(p, -tmp); | ||
3450 | } | ||
3451 | } | 3717 | } |
3452 | 3718 | ||
3453 | /* | 3719 | /* |
@@ -3469,12 +3735,14 @@ void scheduler_tick(void) | |||
3469 | /* | 3735 | /* |
3470 | * Let rq->clock advance by at least TICK_NSEC: | 3736 | * Let rq->clock advance by at least TICK_NSEC: |
3471 | */ | 3737 | */ |
3472 | if (unlikely(rq->clock < next_tick)) | 3738 | if (unlikely(rq->clock < next_tick)) { |
3473 | rq->clock = next_tick; | 3739 | rq->clock = next_tick; |
3740 | rq->clock_underflows++; | ||
3741 | } | ||
3474 | rq->tick_timestamp = rq->clock; | 3742 | rq->tick_timestamp = rq->clock; |
3475 | update_cpu_load(rq); | 3743 | update_cpu_load(rq); |
3476 | if (curr != rq->idle) /* FIXME: needed? */ | 3744 | curr->sched_class->task_tick(rq, curr, 0); |
3477 | curr->sched_class->task_tick(rq, curr); | 3745 | update_sched_rt_period(rq); |
3478 | spin_unlock(&rq->lock); | 3746 | spin_unlock(&rq->lock); |
3479 | 3747 | ||
3480 | #ifdef CONFIG_SMP | 3748 | #ifdef CONFIG_SMP |
@@ -3547,7 +3815,7 @@ static noinline void __schedule_bug(struct task_struct *prev) | |||
3547 | static inline void schedule_debug(struct task_struct *prev) | 3815 | static inline void schedule_debug(struct task_struct *prev) |
3548 | { | 3816 | { |
3549 | /* | 3817 | /* |
3550 | * Test if we are atomic. Since do_exit() needs to call into | 3818 | * Test if we are atomic. Since do_exit() needs to call into |
3551 | * schedule() atomically, we ignore that path for now. | 3819 | * schedule() atomically, we ignore that path for now. |
3552 | * Otherwise, whine if we are scheduling when we should not be. | 3820 | * Otherwise, whine if we are scheduling when we should not be. |
3553 | */ | 3821 | */ |
@@ -3620,6 +3888,8 @@ need_resched_nonpreemptible: | |||
3620 | 3888 | ||
3621 | schedule_debug(prev); | 3889 | schedule_debug(prev); |
3622 | 3890 | ||
3891 | hrtick_clear(rq); | ||
3892 | |||
3623 | /* | 3893 | /* |
3624 | * Do the rq-clock update outside the rq lock: | 3894 | * Do the rq-clock update outside the rq lock: |
3625 | */ | 3895 | */ |
@@ -3638,6 +3908,11 @@ need_resched_nonpreemptible: | |||
3638 | switch_count = &prev->nvcsw; | 3908 | switch_count = &prev->nvcsw; |
3639 | } | 3909 | } |
3640 | 3910 | ||
3911 | #ifdef CONFIG_SMP | ||
3912 | if (prev->sched_class->pre_schedule) | ||
3913 | prev->sched_class->pre_schedule(rq, prev); | ||
3914 | #endif | ||
3915 | |||
3641 | if (unlikely(!rq->nr_running)) | 3916 | if (unlikely(!rq->nr_running)) |
3642 | idle_balance(cpu, rq); | 3917 | idle_balance(cpu, rq); |
3643 | 3918 | ||
@@ -3652,14 +3927,20 @@ need_resched_nonpreemptible: | |||
3652 | ++*switch_count; | 3927 | ++*switch_count; |
3653 | 3928 | ||
3654 | context_switch(rq, prev, next); /* unlocks the rq */ | 3929 | context_switch(rq, prev, next); /* unlocks the rq */ |
3930 | /* | ||
3931 | * the context switch might have flipped the stack from under | ||
3932 | * us, hence refresh the local variables. | ||
3933 | */ | ||
3934 | cpu = smp_processor_id(); | ||
3935 | rq = cpu_rq(cpu); | ||
3655 | } else | 3936 | } else |
3656 | spin_unlock_irq(&rq->lock); | 3937 | spin_unlock_irq(&rq->lock); |
3657 | 3938 | ||
3658 | if (unlikely(reacquire_kernel_lock(current) < 0)) { | 3939 | hrtick_set(rq); |
3659 | cpu = smp_processor_id(); | 3940 | |
3660 | rq = cpu_rq(cpu); | 3941 | if (unlikely(reacquire_kernel_lock(current) < 0)) |
3661 | goto need_resched_nonpreemptible; | 3942 | goto need_resched_nonpreemptible; |
3662 | } | 3943 | |
3663 | preempt_enable_no_resched(); | 3944 | preempt_enable_no_resched(); |
3664 | if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) | 3945 | if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) |
3665 | goto need_resched; | 3946 | goto need_resched; |
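
Two things in the schedule() hunks above are worth calling out: on SMP, a new optional per-class pre_schedule() hook is invoked for the outgoing task before the next task is picked, and after context_switch() the local cpu/rq variables are recomputed because the switch may have flipped the stack from under us. Below is a small sketch of the optional-hook dispatch only; the struct names, rt_class/fair_class and the printed actions are invented for illustration, and just the "call the hook only if the class provides one" shape comes from the patch:

#include <stdio.h>

struct rq { int cpu; };
struct task;

/* Per-policy operations; optional hooks may be left NULL. */
struct sched_class_ops {
	const char *name;
	void (*pre_schedule)(struct rq *rq, struct task *prev); /* optional */
};

struct task { const struct sched_class_ops *class; };

static void rt_pre_schedule(struct rq *rq, struct task *prev)
{
	(void)prev;
	printf("cpu%d: rt pre_schedule hook runs\n", rq->cpu);
}

static const struct sched_class_ops rt_class   = { "rt", rt_pre_schedule };
static const struct sched_class_ops fair_class = { "fair", NULL };

/* Only classes that implement the hook get called before picking next. */
static void call_pre_schedule(struct rq *rq, struct task *prev)
{
	if (prev->class->pre_schedule)
		prev->class->pre_schedule(rq, prev);
}

int main(void)
{
	struct rq rq = { 0 };
	struct task rt_task = { &rt_class };
	struct task fair_task = { &fair_class };

	call_pre_schedule(&rq, &rt_task);   /* fires */
	call_pre_schedule(&rq, &fair_task); /* silently skipped */
	return 0;
}

Leaving the pointer NULL is how a class opts out, which keeps the hot path to one test and one indirect call.
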
@@ -3669,19 +3950,18 @@ EXPORT_SYMBOL(schedule); | |||
3669 | #ifdef CONFIG_PREEMPT | 3950 | #ifdef CONFIG_PREEMPT |
3670 | /* | 3951 | /* |
3671 | * this is the entry point to schedule() from in-kernel preemption | 3952 | * this is the entry point to schedule() from in-kernel preemption |
3672 | * off of preempt_enable. Kernel preemptions off return from interrupt | 3953 | * off of preempt_enable. Kernel preemptions off return from interrupt |
3673 | * occur there and call schedule directly. | 3954 | * occur there and call schedule directly. |
3674 | */ | 3955 | */ |
3675 | asmlinkage void __sched preempt_schedule(void) | 3956 | asmlinkage void __sched preempt_schedule(void) |
3676 | { | 3957 | { |
3677 | struct thread_info *ti = current_thread_info(); | 3958 | struct thread_info *ti = current_thread_info(); |
3678 | #ifdef CONFIG_PREEMPT_BKL | ||
3679 | struct task_struct *task = current; | 3959 | struct task_struct *task = current; |
3680 | int saved_lock_depth; | 3960 | int saved_lock_depth; |
3681 | #endif | 3961 | |
3682 | /* | 3962 | /* |
3683 | * If there is a non-zero preempt_count or interrupts are disabled, | 3963 | * If there is a non-zero preempt_count or interrupts are disabled, |
3684 | * we do not want to preempt the current task. Just return.. | 3964 | * we do not want to preempt the current task. Just return.. |
3685 | */ | 3965 | */ |
3686 | if (likely(ti->preempt_count || irqs_disabled())) | 3966 | if (likely(ti->preempt_count || irqs_disabled())) |
3687 | return; | 3967 | return; |
@@ -3694,14 +3974,10 @@ asmlinkage void __sched preempt_schedule(void) | |||
3694 | * clear ->lock_depth so that schedule() doesnt | 3974 | * clear ->lock_depth so that schedule() doesnt |
3695 | * auto-release the semaphore: | 3975 | * auto-release the semaphore: |
3696 | */ | 3976 | */ |
3697 | #ifdef CONFIG_PREEMPT_BKL | ||
3698 | saved_lock_depth = task->lock_depth; | 3977 | saved_lock_depth = task->lock_depth; |
3699 | task->lock_depth = -1; | 3978 | task->lock_depth = -1; |
3700 | #endif | ||
3701 | schedule(); | 3979 | schedule(); |
3702 | #ifdef CONFIG_PREEMPT_BKL | ||
3703 | task->lock_depth = saved_lock_depth; | 3980 | task->lock_depth = saved_lock_depth; |
3704 | #endif | ||
3705 | sub_preempt_count(PREEMPT_ACTIVE); | 3981 | sub_preempt_count(PREEMPT_ACTIVE); |
3706 | 3982 | ||
3707 | /* | 3983 | /* |
@@ -3722,10 +3998,9 @@ EXPORT_SYMBOL(preempt_schedule); | |||
3722 | asmlinkage void __sched preempt_schedule_irq(void) | 3998 | asmlinkage void __sched preempt_schedule_irq(void) |
3723 | { | 3999 | { |
3724 | struct thread_info *ti = current_thread_info(); | 4000 | struct thread_info *ti = current_thread_info(); |
3725 | #ifdef CONFIG_PREEMPT_BKL | ||
3726 | struct task_struct *task = current; | 4001 | struct task_struct *task = current; |
3727 | int saved_lock_depth; | 4002 | int saved_lock_depth; |
3728 | #endif | 4003 | |
3729 | /* Catch callers which need to be fixed */ | 4004 | /* Catch callers which need to be fixed */ |
3730 | BUG_ON(ti->preempt_count || !irqs_disabled()); | 4005 | BUG_ON(ti->preempt_count || !irqs_disabled()); |
3731 | 4006 | ||
@@ -3737,16 +4012,12 @@ asmlinkage void __sched preempt_schedule_irq(void) | |||
3737 | * clear ->lock_depth so that schedule() doesnt | 4012 | * clear ->lock_depth so that schedule() doesnt |
3738 | * auto-release the semaphore: | 4013 | * auto-release the semaphore: |
3739 | */ | 4014 | */ |
3740 | #ifdef CONFIG_PREEMPT_BKL | ||
3741 | saved_lock_depth = task->lock_depth; | 4015 | saved_lock_depth = task->lock_depth; |
3742 | task->lock_depth = -1; | 4016 | task->lock_depth = -1; |
3743 | #endif | ||
3744 | local_irq_enable(); | 4017 | local_irq_enable(); |
3745 | schedule(); | 4018 | schedule(); |
3746 | local_irq_disable(); | 4019 | local_irq_disable(); |
3747 | #ifdef CONFIG_PREEMPT_BKL | ||
3748 | task->lock_depth = saved_lock_depth; | 4020 | task->lock_depth = saved_lock_depth; |
3749 | #endif | ||
3750 | sub_preempt_count(PREEMPT_ACTIVE); | 4021 | sub_preempt_count(PREEMPT_ACTIVE); |
3751 | 4022 | ||
3752 | /* | 4023 | /* |
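
With PREEMPT_BKL no longer a separate config option, both preemption entry points above now unconditionally save the task's lock_depth, set it to -1 so the nested schedule() will not auto-release the big kernel lock on the task's behalf, and restore it afterwards. A toy sketch of that save/neutralize/restore idiom (userspace analogue; schedule_stub() and preempt_sketch() are made-up names):

#include <stdio.h>

struct task { int lock_depth; /* -1 means "does not hold the big lock" */ };

static void schedule_stub(struct task *t)
{
	/* The real schedule() would release the big lock if lock_depth >= 0. */
	printf("schedule() sees lock_depth=%d\n", t->lock_depth);
}

/* Preempt without letting the nested schedule() drop the task's lock. */
static void preempt_sketch(struct task *t)
{
	int saved_lock_depth = t->lock_depth;

	t->lock_depth = -1;               /* hide the lock from schedule() */
	schedule_stub(t);
	t->lock_depth = saved_lock_depth; /* hand the lock state back */
}

int main(void)
{
	struct task t = { 2 };

	preempt_sketch(&t);
	printf("after preemption: lock_depth=%d\n", t.lock_depth);
	return 0;
}
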
@@ -3767,12 +4038,12 @@ int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, | |||
3767 | EXPORT_SYMBOL(default_wake_function); | 4038 | EXPORT_SYMBOL(default_wake_function); |
3768 | 4039 | ||
3769 | /* | 4040 | /* |
3770 | * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just | 4041 | * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just |
3771 | * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve | 4042 | * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve |
3772 | * number) then we wake all the non-exclusive tasks and one exclusive task. | 4043 | * number) then we wake all the non-exclusive tasks and one exclusive task. |
3773 | * | 4044 | * |
3774 | * There are circumstances in which we can try to wake a task which has already | 4045 | * There are circumstances in which we can try to wake a task which has already |
3775 | * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns | 4046 | * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns |
3776 | * zero in this (rare) case, and we handle it by continuing to scan the queue. | 4047 | * zero in this (rare) case, and we handle it by continuing to scan the queue. |
3777 | */ | 4048 | */ |
3778 | static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, | 4049 | static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, |
@@ -3852,8 +4123,7 @@ void complete(struct completion *x) | |||
3852 | 4123 | ||
3853 | spin_lock_irqsave(&x->wait.lock, flags); | 4124 | spin_lock_irqsave(&x->wait.lock, flags); |
3854 | x->done++; | 4125 | x->done++; |
3855 | __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, | 4126 | __wake_up_common(&x->wait, TASK_NORMAL, 1, 0, NULL); |
3856 | 1, 0, NULL); | ||
3857 | spin_unlock_irqrestore(&x->wait.lock, flags); | 4127 | spin_unlock_irqrestore(&x->wait.lock, flags); |
3858 | } | 4128 | } |
3859 | EXPORT_SYMBOL(complete); | 4129 | EXPORT_SYMBOL(complete); |
@@ -3864,8 +4134,7 @@ void complete_all(struct completion *x) | |||
3864 | 4134 | ||
3865 | spin_lock_irqsave(&x->wait.lock, flags); | 4135 | spin_lock_irqsave(&x->wait.lock, flags); |
3866 | x->done += UINT_MAX/2; | 4136 | x->done += UINT_MAX/2; |
3867 | __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, | 4137 | __wake_up_common(&x->wait, TASK_NORMAL, 0, 0, NULL); |
3868 | 0, 0, NULL); | ||
3869 | spin_unlock_irqrestore(&x->wait.lock, flags); | 4138 | spin_unlock_irqrestore(&x->wait.lock, flags); |
3870 | } | 4139 | } |
3871 | EXPORT_SYMBOL(complete_all); | 4140 | EXPORT_SYMBOL(complete_all); |
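
complete() and complete_all() above now pass TASK_NORMAL instead of spelling out TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE at every call site; the wakeup still matches any sleeper whose state intersects that mask. A sketch of the bitmask test (the numeric state values mirror the kernel's but only matter here as distinct bits; struct waiter and wake_matches() are illustrative):

#include <stdio.h>

#define TASK_INTERRUPTIBLE   1
#define TASK_UNINTERRUPTIBLE 2
#define TASK_NORMAL          (TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE)

struct waiter { const char *name; unsigned int state; };

/* A waiter is eligible if its sleep state intersects the wakeup mask. */
static int wake_matches(const struct waiter *w, unsigned int mode)
{
	return (w->state & mode) != 0;
}

int main(void)
{
	struct waiter a = { "interruptible sleeper", TASK_INTERRUPTIBLE };
	struct waiter b = { "uninterruptible sleeper", TASK_UNINTERRUPTIBLE };

	printf("%s: %d\n", a.name, wake_matches(&a, TASK_NORMAL)); /* 1 */
	printf("%s: %d\n", b.name, wake_matches(&b, TASK_NORMAL)); /* 1 */
	return 0;
}
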
@@ -3879,8 +4148,10 @@ do_wait_for_common(struct completion *x, long timeout, int state) | |||
3879 | wait.flags |= WQ_FLAG_EXCLUSIVE; | 4148 | wait.flags |= WQ_FLAG_EXCLUSIVE; |
3880 | __add_wait_queue_tail(&x->wait, &wait); | 4149 | __add_wait_queue_tail(&x->wait, &wait); |
3881 | do { | 4150 | do { |
3882 | if (state == TASK_INTERRUPTIBLE && | 4151 | if ((state == TASK_INTERRUPTIBLE && |
3883 | signal_pending(current)) { | 4152 | signal_pending(current)) || |
4153 | (state == TASK_KILLABLE && | ||
4154 | fatal_signal_pending(current))) { | ||
3884 | __remove_wait_queue(&x->wait, &wait); | 4155 | __remove_wait_queue(&x->wait, &wait); |
3885 | return -ERESTARTSYS; | 4156 | return -ERESTARTSYS; |
3886 | } | 4157 | } |
@@ -3940,6 +4211,15 @@ wait_for_completion_interruptible_timeout(struct completion *x, | |||
3940 | } | 4211 | } |
3941 | EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); | 4212 | EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); |
3942 | 4213 | ||
4214 | int __sched wait_for_completion_killable(struct completion *x) | ||
4215 | { | ||
4216 | long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE); | ||
4217 | if (t == -ERESTARTSYS) | ||
4218 | return t; | ||
4219 | return 0; | ||
4220 | } | ||
4221 | EXPORT_SYMBOL(wait_for_completion_killable); | ||
4222 | |||
3943 | static long __sched | 4223 | static long __sched |
3944 | sleep_on_common(wait_queue_head_t *q, int state, long timeout) | 4224 | sleep_on_common(wait_queue_head_t *q, int state, long timeout) |
3945 | { | 4225 | { |
@@ -4003,6 +4283,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) | |||
4003 | unsigned long flags; | 4283 | unsigned long flags; |
4004 | int oldprio, on_rq, running; | 4284 | int oldprio, on_rq, running; |
4005 | struct rq *rq; | 4285 | struct rq *rq; |
4286 | const struct sched_class *prev_class = p->sched_class; | ||
4006 | 4287 | ||
4007 | BUG_ON(prio < 0 || prio > MAX_PRIO); | 4288 | BUG_ON(prio < 0 || prio > MAX_PRIO); |
4008 | 4289 | ||
@@ -4011,7 +4292,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) | |||
4011 | 4292 | ||
4012 | oldprio = p->prio; | 4293 | oldprio = p->prio; |
4013 | on_rq = p->se.on_rq; | 4294 | on_rq = p->se.on_rq; |
4014 | running = task_running(rq, p); | 4295 | running = task_current(rq, p); |
4015 | if (on_rq) { | 4296 | if (on_rq) { |
4016 | dequeue_task(rq, p, 0); | 4297 | dequeue_task(rq, p, 0); |
4017 | if (running) | 4298 | if (running) |
@@ -4028,18 +4309,10 @@ void rt_mutex_setprio(struct task_struct *p, int prio) | |||
4028 | if (on_rq) { | 4309 | if (on_rq) { |
4029 | if (running) | 4310 | if (running) |
4030 | p->sched_class->set_curr_task(rq); | 4311 | p->sched_class->set_curr_task(rq); |
4312 | |||
4031 | enqueue_task(rq, p, 0); | 4313 | enqueue_task(rq, p, 0); |
4032 | /* | 4314 | |
4033 | * Reschedule if we are currently running on this runqueue and | 4315 | check_class_changed(rq, p, prev_class, oldprio, running); |
4034 | * our priority decreased, or if we are not currently running on | ||
4035 | * this runqueue and our priority is higher than the current's | ||
4036 | */ | ||
4037 | if (running) { | ||
4038 | if (p->prio > oldprio) | ||
4039 | resched_task(rq->curr); | ||
4040 | } else { | ||
4041 | check_preempt_curr(rq, p); | ||
4042 | } | ||
4043 | } | 4316 | } |
4044 | task_rq_unlock(rq, &flags); | 4317 | task_rq_unlock(rq, &flags); |
4045 | } | 4318 | } |
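
rt_mutex_setprio() above (and sched_setscheduler() further down) stop open-coding the "reschedule or preemption-check" decision after a requeue and hand it to the shared check_class_changed() helper, which also has to cope with the task having switched scheduling class. The sketch below simply restates the branch that was removed, factored into a function with the resched/preempt actions stubbed; it is not the body of check_class_changed() itself:

#include <stdio.h>

struct rq { int cpu; };
struct task { int prio; };

static void resched_stub(struct rq *rq)
{
	printf("resched current task on cpu%d\n", rq->cpu);
}

static void check_preempt_stub(struct rq *rq, struct task *p)
{
	printf("cpu%d: does prio %d preempt the current task?\n", rq->cpu, p->prio);
}

/*
 * The removed branch: a running task is rescheduled only if its priority got
 * numerically worse; a queued-but-not-running task is checked for preemption.
 */
static void prio_changed_decision(struct rq *rq, struct task *p,
				  int oldprio, int running)
{
	if (running) {
		if (p->prio > oldprio)
			resched_stub(rq);
	} else {
		check_preempt_stub(rq, p);
	}
}

int main(void)
{
	struct rq rq = { 1 };
	struct task p = { 120 };

	prio_changed_decision(&rq, &p, 100, 1); /* running, prio got worse */
	prio_changed_decision(&rq, &p, 140, 0); /* not running */
	return 0;
}
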
@@ -4071,10 +4344,8 @@ void set_user_nice(struct task_struct *p, long nice) | |||
4071 | goto out_unlock; | 4344 | goto out_unlock; |
4072 | } | 4345 | } |
4073 | on_rq = p->se.on_rq; | 4346 | on_rq = p->se.on_rq; |
4074 | if (on_rq) { | 4347 | if (on_rq) |
4075 | dequeue_task(rq, p, 0); | 4348 | dequeue_task(rq, p, 0); |
4076 | dec_load(rq, p); | ||
4077 | } | ||
4078 | 4349 | ||
4079 | p->static_prio = NICE_TO_PRIO(nice); | 4350 | p->static_prio = NICE_TO_PRIO(nice); |
4080 | set_load_weight(p); | 4351 | set_load_weight(p); |
@@ -4084,7 +4355,6 @@ void set_user_nice(struct task_struct *p, long nice) | |||
4084 | 4355 | ||
4085 | if (on_rq) { | 4356 | if (on_rq) { |
4086 | enqueue_task(rq, p, 0); | 4357 | enqueue_task(rq, p, 0); |
4087 | inc_load(rq, p); | ||
4088 | /* | 4358 | /* |
4089 | * If the task increased its priority or is running and | 4359 | * If the task increased its priority or is running and |
4090 | * lowered its priority, then reschedule its CPU: | 4360 | * lowered its priority, then reschedule its CPU: |
@@ -4242,6 +4512,7 @@ int sched_setscheduler(struct task_struct *p, int policy, | |||
4242 | { | 4512 | { |
4243 | int retval, oldprio, oldpolicy = -1, on_rq, running; | 4513 | int retval, oldprio, oldpolicy = -1, on_rq, running; |
4244 | unsigned long flags; | 4514 | unsigned long flags; |
4515 | const struct sched_class *prev_class = p->sched_class; | ||
4245 | struct rq *rq; | 4516 | struct rq *rq; |
4246 | 4517 | ||
4247 | /* may grab non-irq protected spin_locks */ | 4518 | /* may grab non-irq protected spin_locks */ |
@@ -4322,7 +4593,7 @@ recheck: | |||
4322 | } | 4593 | } |
4323 | update_rq_clock(rq); | 4594 | update_rq_clock(rq); |
4324 | on_rq = p->se.on_rq; | 4595 | on_rq = p->se.on_rq; |
4325 | running = task_running(rq, p); | 4596 | running = task_current(rq, p); |
4326 | if (on_rq) { | 4597 | if (on_rq) { |
4327 | deactivate_task(rq, p, 0); | 4598 | deactivate_task(rq, p, 0); |
4328 | if (running) | 4599 | if (running) |
@@ -4335,18 +4606,10 @@ recheck: | |||
4335 | if (on_rq) { | 4606 | if (on_rq) { |
4336 | if (running) | 4607 | if (running) |
4337 | p->sched_class->set_curr_task(rq); | 4608 | p->sched_class->set_curr_task(rq); |
4609 | |||
4338 | activate_task(rq, p, 0); | 4610 | activate_task(rq, p, 0); |
4339 | /* | 4611 | |
4340 | * Reschedule if we are currently running on this runqueue and | 4612 | check_class_changed(rq, p, prev_class, oldprio, running); |
4341 | * our priority decreased, or if we are not currently running on | ||
4342 | * this runqueue and our priority is higher than the current's | ||
4343 | */ | ||
4344 | if (running) { | ||
4345 | if (p->prio > oldprio) | ||
4346 | resched_task(rq->curr); | ||
4347 | } else { | ||
4348 | check_preempt_curr(rq, p); | ||
4349 | } | ||
4350 | } | 4613 | } |
4351 | __task_rq_unlock(rq); | 4614 | __task_rq_unlock(rq); |
4352 | spin_unlock_irqrestore(&p->pi_lock, flags); | 4615 | spin_unlock_irqrestore(&p->pi_lock, flags); |
@@ -4385,8 +4648,8 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) | |||
4385 | * @policy: new policy. | 4648 | * @policy: new policy. |
4386 | * @param: structure containing the new RT priority. | 4649 | * @param: structure containing the new RT priority. |
4387 | */ | 4650 | */ |
4388 | asmlinkage long sys_sched_setscheduler(pid_t pid, int policy, | 4651 | asmlinkage long |
4389 | struct sched_param __user *param) | 4652 | sys_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) |
4390 | { | 4653 | { |
4391 | /* negative values for policy are not valid */ | 4654 | /* negative values for policy are not valid */ |
4392 | if (policy < 0) | 4655 | if (policy < 0) |
@@ -4474,19 +4737,19 @@ long sched_setaffinity(pid_t pid, cpumask_t new_mask) | |||
4474 | struct task_struct *p; | 4737 | struct task_struct *p; |
4475 | int retval; | 4738 | int retval; |
4476 | 4739 | ||
4477 | mutex_lock(&sched_hotcpu_mutex); | 4740 | get_online_cpus(); |
4478 | read_lock(&tasklist_lock); | 4741 | read_lock(&tasklist_lock); |
4479 | 4742 | ||
4480 | p = find_process_by_pid(pid); | 4743 | p = find_process_by_pid(pid); |
4481 | if (!p) { | 4744 | if (!p) { |
4482 | read_unlock(&tasklist_lock); | 4745 | read_unlock(&tasklist_lock); |
4483 | mutex_unlock(&sched_hotcpu_mutex); | 4746 | put_online_cpus(); |
4484 | return -ESRCH; | 4747 | return -ESRCH; |
4485 | } | 4748 | } |
4486 | 4749 | ||
4487 | /* | 4750 | /* |
4488 | * It is not safe to call set_cpus_allowed with the | 4751 | * It is not safe to call set_cpus_allowed with the |
4489 | * tasklist_lock held. We will bump the task_struct's | 4752 | * tasklist_lock held. We will bump the task_struct's |
4490 | * usage count and then drop tasklist_lock. | 4753 | * usage count and then drop tasklist_lock. |
4491 | */ | 4754 | */ |
4492 | get_task_struct(p); | 4755 | get_task_struct(p); |
@@ -4520,7 +4783,7 @@ long sched_setaffinity(pid_t pid, cpumask_t new_mask) | |||
4520 | } | 4783 | } |
4521 | out_unlock: | 4784 | out_unlock: |
4522 | put_task_struct(p); | 4785 | put_task_struct(p); |
4523 | mutex_unlock(&sched_hotcpu_mutex); | 4786 | put_online_cpus(); |
4524 | return retval; | 4787 | return retval; |
4525 | } | 4788 | } |
4526 | 4789 | ||
@@ -4577,7 +4840,7 @@ long sched_getaffinity(pid_t pid, cpumask_t *mask) | |||
4577 | struct task_struct *p; | 4840 | struct task_struct *p; |
4578 | int retval; | 4841 | int retval; |
4579 | 4842 | ||
4580 | mutex_lock(&sched_hotcpu_mutex); | 4843 | get_online_cpus(); |
4581 | read_lock(&tasklist_lock); | 4844 | read_lock(&tasklist_lock); |
4582 | 4845 | ||
4583 | retval = -ESRCH; | 4846 | retval = -ESRCH; |
@@ -4593,7 +4856,7 @@ long sched_getaffinity(pid_t pid, cpumask_t *mask) | |||
4593 | 4856 | ||
4594 | out_unlock: | 4857 | out_unlock: |
4595 | read_unlock(&tasklist_lock); | 4858 | read_unlock(&tasklist_lock); |
4596 | mutex_unlock(&sched_hotcpu_mutex); | 4859 | put_online_cpus(); |
4597 | 4860 | ||
4598 | return retval; | 4861 | return retval; |
4599 | } | 4862 | } |
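
sched_setaffinity()/sched_getaffinity() above drop the scheduler-private sched_hotcpu_mutex in favour of the generic get_online_cpus()/put_online_cpus() pair, holding off CPU hotplug only for the lookup and update. A loose userspace analogue of that acquire / early-return-with-release / release shape, using a pthread rwlock purely for illustration (the kernel primitive is not a rwlock, and find_process_stub() and the -3 return are invented):

#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t hotplug_lock = PTHREAD_RWLOCK_INITIALIZER;

static int find_process_stub(int pid) { return pid > 0; }

/* Hold off "hotplug" writers only for the duration of the lookup/update. */
static int set_affinity_sketch(int pid)
{
	pthread_rwlock_rdlock(&hotplug_lock);         /* ~ get_online_cpus() */
	if (!find_process_stub(pid)) {
		pthread_rwlock_unlock(&hotplug_lock); /* every exit path releases */
		return -3;                            /* ~ -ESRCH */
	}
	/* ... update the mask while the set of CPUs cannot change ... */
	pthread_rwlock_unlock(&hotplug_lock);         /* ~ put_online_cpus() */
	return 0;
}

int main(void)
{
	printf("%d %d\n", set_affinity_sketch(42), set_affinity_sketch(-1));
	return 0;
}
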
@@ -4667,7 +4930,8 @@ static void __cond_resched(void) | |||
4667 | } while (need_resched()); | 4930 | } while (need_resched()); |
4668 | } | 4931 | } |
4669 | 4932 | ||
4670 | int __sched cond_resched(void) | 4933 | #if !defined(CONFIG_PREEMPT) || defined(CONFIG_PREEMPT_VOLUNTARY) |
4934 | int __sched _cond_resched(void) | ||
4671 | { | 4935 | { |
4672 | if (need_resched() && !(preempt_count() & PREEMPT_ACTIVE) && | 4936 | if (need_resched() && !(preempt_count() & PREEMPT_ACTIVE) && |
4673 | system_state == SYSTEM_RUNNING) { | 4937 | system_state == SYSTEM_RUNNING) { |
@@ -4676,31 +4940,28 @@ int __sched cond_resched(void) | |||
4676 | } | 4940 | } |
4677 | return 0; | 4941 | return 0; |
4678 | } | 4942 | } |
4679 | EXPORT_SYMBOL(cond_resched); | 4943 | EXPORT_SYMBOL(_cond_resched); |
4944 | #endif | ||
4680 | 4945 | ||
4681 | /* | 4946 | /* |
4682 | * cond_resched_lock() - if a reschedule is pending, drop the given lock, | 4947 | * cond_resched_lock() - if a reschedule is pending, drop the given lock, |
4683 | * call schedule, and on return reacquire the lock. | 4948 | * call schedule, and on return reacquire the lock. |
4684 | * | 4949 | * |
4685 | * This works OK both with and without CONFIG_PREEMPT. We do strange low-level | 4950 | * This works OK both with and without CONFIG_PREEMPT. We do strange low-level |
4686 | * operations here to prevent schedule() from being called twice (once via | 4951 | * operations here to prevent schedule() from being called twice (once via |
4687 | * spin_unlock(), once by hand). | 4952 | * spin_unlock(), once by hand). |
4688 | */ | 4953 | */ |
4689 | int cond_resched_lock(spinlock_t *lock) | 4954 | int cond_resched_lock(spinlock_t *lock) |
4690 | { | 4955 | { |
4956 | int resched = need_resched() && system_state == SYSTEM_RUNNING; | ||
4691 | int ret = 0; | 4957 | int ret = 0; |
4692 | 4958 | ||
4693 | if (need_lockbreak(lock)) { | 4959 | if (spin_needbreak(lock) || resched) { |
4694 | spin_unlock(lock); | 4960 | spin_unlock(lock); |
4695 | cpu_relax(); | 4961 | if (resched && need_resched()) |
4696 | ret = 1; | 4962 | __cond_resched(); |
4697 | spin_lock(lock); | 4963 | else |
4698 | } | 4964 | cpu_relax(); |
4699 | if (need_resched() && system_state == SYSTEM_RUNNING) { | ||
4700 | spin_release(&lock->dep_map, 1, _THIS_IP_); | ||
4701 | _raw_spin_unlock(lock); | ||
4702 | preempt_enable_no_resched(); | ||
4703 | __cond_resched(); | ||
4704 | ret = 1; | 4965 | ret = 1; |
4705 | spin_lock(lock); | 4966 | spin_lock(lock); |
4706 | } | 4967 | } |
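
cond_resched_lock() above is reworked so the lock is dropped whenever the lock itself is contended (spin_needbreak()) or a reschedule was already pending on entry, and only the latter case actually calls into the scheduler; otherwise the CPU is merely relaxed before the lock is retaken. A sketch of the new control flow with every primitive stubbed out (the real code also requires system_state == SYSTEM_RUNNING before honouring need_resched(), which is omitted here):

#include <stdio.h>

static int spin_needbreak_stub(void) { return 0; } /* nobody else wants the lock */
static int need_resched_stub(void)   { return 1; } /* but a resched is pending */

static void spin_lock_stub(void)    { }
static void spin_unlock_stub(void)  { }
static void cond_resched_stub(void) { printf("rescheduled\n"); }
static void cpu_relax_stub(void)    { printf("relaxed\n"); }

/* Returns 1 when the lock was dropped for any reason, 0 otherwise. */
static int cond_resched_lock_sketch(void)
{
	int resched = need_resched_stub();
	int ret = 0;

	if (spin_needbreak_stub() || resched) {
		spin_unlock_stub();
		if (resched && need_resched_stub())
			cond_resched_stub();   /* give the CPU away */
		else
			cpu_relax_stub();      /* just let the waiter in */
		ret = 1;
		spin_lock_stub();
	}
	return ret;
}

int main(void)
{
	printf("dropped=%d\n", cond_resched_lock_sketch());
	return 0;
}
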
@@ -4736,7 +4997,7 @@ void __sched yield(void) | |||
4736 | EXPORT_SYMBOL(yield); | 4997 | EXPORT_SYMBOL(yield); |
4737 | 4998 | ||
4738 | /* | 4999 | /* |
4739 | * This task is about to go to sleep on IO. Increment rq->nr_iowait so | 5000 | * This task is about to go to sleep on IO. Increment rq->nr_iowait so |
4740 | * that process accounting knows that this is a task in IO wait state. | 5001 | * that process accounting knows that this is a task in IO wait state. |
4741 | * | 5002 | * |
4742 | * But don't do that if it is a deliberate, throttling IO wait (this task | 5003 | * But don't do that if it is a deliberate, throttling IO wait (this task |
@@ -4845,17 +5106,21 @@ long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval) | |||
4845 | if (retval) | 5106 | if (retval) |
4846 | goto out_unlock; | 5107 | goto out_unlock; |
4847 | 5108 | ||
4848 | if (p->policy == SCHED_FIFO) | 5109 | /* |
4849 | time_slice = 0; | 5110 | * Time slice is 0 for SCHED_FIFO tasks and for SCHED_OTHER |
4850 | else if (p->policy == SCHED_RR) | 5111 | * tasks that are on an otherwise idle runqueue: |
5112 | */ | ||
5113 | time_slice = 0; | ||
5114 | if (p->policy == SCHED_RR) { | ||
4851 | time_slice = DEF_TIMESLICE; | 5115 | time_slice = DEF_TIMESLICE; |
4852 | else { | 5116 | } else { |
4853 | struct sched_entity *se = &p->se; | 5117 | struct sched_entity *se = &p->se; |
4854 | unsigned long flags; | 5118 | unsigned long flags; |
4855 | struct rq *rq; | 5119 | struct rq *rq; |
4856 | 5120 | ||
4857 | rq = task_rq_lock(p, &flags); | 5121 | rq = task_rq_lock(p, &flags); |
4858 | time_slice = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se)); | 5122 | if (rq->cfs.load.weight) |
5123 | time_slice = NS_TO_JIFFIES(sched_slice(&rq->cfs, se)); | ||
4859 | task_rq_unlock(rq, &flags); | 5124 | task_rq_unlock(rq, &flags); |
4860 | } | 5125 | } |
4861 | read_unlock(&tasklist_lock); | 5126 | read_unlock(&tasklist_lock); |
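
Per the new comment above, sys_sched_rr_get_interval() reports a 0 time slice for SCHED_FIFO tasks and for SCHED_OTHER tasks on an otherwise idle runqueue, DEF_TIMESLICE for SCHED_RR, and a CFS-derived slice otherwise. The sketch below restates that selection in simplified form; it follows the comment's statement of intent rather than copying the kernel path line for line, and the millisecond constants are illustrative:

#include <stdio.h>

enum { SCHED_OTHER, SCHED_FIFO, SCHED_RR };

#define DEF_TIMESLICE_MS 100 /* illustrative; the kernel value is in jiffies */

/* rq_load == 0 models "otherwise idle runqueue"; cfs_slice_ms is stubbed. */
static unsigned int reported_timeslice_ms(int policy, unsigned long rq_load,
					  unsigned int cfs_slice_ms)
{
	unsigned int time_slice = 0;           /* FIFO and idle-queue OTHER */

	if (policy == SCHED_RR)
		time_slice = DEF_TIMESLICE_MS;
	else if (policy == SCHED_OTHER && rq_load)
		time_slice = cfs_slice_ms;     /* ~ NS_TO_JIFFIES(sched_slice()) */
	return time_slice;
}

int main(void)
{
	printf("FIFO: %u\n", reported_timeslice_ms(SCHED_FIFO, 3, 12));
	printf("RR:   %u\n", reported_timeslice_ms(SCHED_RR, 3, 12));
	printf("CFS:  %u (idle rq: %u)\n",
	       reported_timeslice_ms(SCHED_OTHER, 3, 12),
	       reported_timeslice_ms(SCHED_OTHER, 0, 12));
	return 0;
}
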
@@ -4870,7 +5135,7 @@ out_unlock: | |||
4870 | 5135 | ||
4871 | static const char stat_nam[] = "RSDTtZX"; | 5136 | static const char stat_nam[] = "RSDTtZX"; |
4872 | 5137 | ||
4873 | static void show_task(struct task_struct *p) | 5138 | void sched_show_task(struct task_struct *p) |
4874 | { | 5139 | { |
4875 | unsigned long free = 0; | 5140 | unsigned long free = 0; |
4876 | unsigned state; | 5141 | unsigned state; |
@@ -4898,10 +5163,9 @@ static void show_task(struct task_struct *p) | |||
4898 | } | 5163 | } |
4899 | #endif | 5164 | #endif |
4900 | printk(KERN_CONT "%5lu %5d %6d\n", free, | 5165 | printk(KERN_CONT "%5lu %5d %6d\n", free, |
4901 | task_pid_nr(p), task_pid_nr(p->parent)); | 5166 | task_pid_nr(p), task_pid_nr(p->real_parent)); |
4902 | 5167 | ||
4903 | if (state != TASK_RUNNING) | 5168 | show_stack(p, NULL); |
4904 | show_stack(p, NULL); | ||
4905 | } | 5169 | } |
4906 | 5170 | ||
4907 | void show_state_filter(unsigned long state_filter) | 5171 | void show_state_filter(unsigned long state_filter) |
@@ -4923,7 +5187,7 @@ void show_state_filter(unsigned long state_filter) | |||
4923 | */ | 5187 | */ |
4924 | touch_nmi_watchdog(); | 5188 | touch_nmi_watchdog(); |
4925 | if (!state_filter || (p->state & state_filter)) | 5189 | if (!state_filter || (p->state & state_filter)) |
4926 | show_task(p); | 5190 | sched_show_task(p); |
4927 | } while_each_thread(g, p); | 5191 | } while_each_thread(g, p); |
4928 | 5192 | ||
4929 | touch_all_softlockup_watchdogs(); | 5193 | touch_all_softlockup_watchdogs(); |
@@ -4972,11 +5236,8 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) | |||
4972 | spin_unlock_irqrestore(&rq->lock, flags); | 5236 | spin_unlock_irqrestore(&rq->lock, flags); |
4973 | 5237 | ||
4974 | /* Set the preempt count _outside_ the spinlocks! */ | 5238 | /* Set the preempt count _outside_ the spinlocks! */ |
4975 | #if defined(CONFIG_PREEMPT) && !defined(CONFIG_PREEMPT_BKL) | ||
4976 | task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0); | ||
4977 | #else | ||
4978 | task_thread_info(idle)->preempt_count = 0; | 5239 | task_thread_info(idle)->preempt_count = 0; |
4979 | #endif | 5240 | |
4980 | /* | 5241 | /* |
4981 | * The idle tasks have their own, simple scheduling class: | 5242 | * The idle tasks have their own, simple scheduling class: |
4982 | */ | 5243 | */ |
@@ -4992,6 +5253,32 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) | |||
4992 | */ | 5253 | */ |
4993 | cpumask_t nohz_cpu_mask = CPU_MASK_NONE; | 5254 | cpumask_t nohz_cpu_mask = CPU_MASK_NONE; |
4994 | 5255 | ||
5256 | /* | ||
5257 | * Increase the granularity value when there are more CPUs, | ||
5258 | * because with more CPUs the 'effective latency' as visible | ||
5259 | * to users decreases. But the relationship is not linear, | ||
5260 | * so pick a second-best guess by going with the log2 of the | ||
5261 | * number of CPUs. | ||
5262 | * | ||
5263 | * This idea comes from the SD scheduler of Con Kolivas: | ||
5264 | */ | ||
5265 | static inline void sched_init_granularity(void) | ||
5266 | { | ||
5267 | unsigned int factor = 1 + ilog2(num_online_cpus()); | ||
5268 | const unsigned long limit = 200000000; | ||
5269 | |||
5270 | sysctl_sched_min_granularity *= factor; | ||
5271 | if (sysctl_sched_min_granularity > limit) | ||
5272 | sysctl_sched_min_granularity = limit; | ||
5273 | |||
5274 | sysctl_sched_latency *= factor; | ||
5275 | if (sysctl_sched_latency > limit) | ||
5276 | sysctl_sched_latency = limit; | ||
5277 | |||
5278 | sysctl_sched_wakeup_granularity *= factor; | ||
5279 | sysctl_sched_batch_wakeup_granularity *= factor; | ||
5280 | } | ||
5281 | |||
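
The new sched_init_granularity() above scales the CFS latency/granularity tunables by factor = 1 + ilog2(num_online_cpus()) and clamps the first two at 200000000 ns, on the theory (credited to Con Kolivas' SD scheduler) that perceived latency grows roughly logarithmically with CPU count. A small worked example of the factor and the clamp; the 20 ms base value is an illustrative default, not taken from this patch:

#include <stdio.h>

/* integer log2 for n >= 1, like ilog2() for runtime values */
static unsigned int ilog2_u(unsigned int n)
{
	unsigned int r = 0;

	while (n >>= 1)
		r++;
	return r;
}

int main(void)
{
	const unsigned long limit_ns = 200000000UL;  /* 200 ms cap from the patch */
	unsigned long latency_ns = 20000000UL;       /* illustrative 20 ms base */
	unsigned int cpus;

	for (cpus = 1; cpus <= 1024; cpus *= 4) {
		unsigned int factor = 1 + ilog2_u(cpus);
		unsigned long scaled = latency_ns * factor;

		if (scaled > limit_ns)
			scaled = limit_ns;           /* the clamp in the patch */
		printf("%4u cpus: factor %2u -> %lu ns\n", cpus, factor, scaled);
	}
	return 0;
}
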
4995 | #ifdef CONFIG_SMP | 5282 | #ifdef CONFIG_SMP |
4996 | /* | 5283 | /* |
4997 | * This is how migration works: | 5284 | * This is how migration works: |
@@ -5015,7 +5302,7 @@ cpumask_t nohz_cpu_mask = CPU_MASK_NONE; | |||
5015 | * is removed from the allowed bitmask. | 5302 | * is removed from the allowed bitmask. |
5016 | * | 5303 | * |
5017 | * NOTE: the caller must have a valid reference to the task, the | 5304 | * NOTE: the caller must have a valid reference to the task, the |
5018 | * task must not exit() & deallocate itself prematurely. The | 5305 | * task must not exit() & deallocate itself prematurely. The |
5019 | * call is not atomic; no spinlocks may be held. | 5306 | * call is not atomic; no spinlocks may be held. |
5020 | */ | 5307 | */ |
5021 | int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask) | 5308 | int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask) |
@@ -5031,7 +5318,13 @@ int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask) | |||
5031 | goto out; | 5318 | goto out; |
5032 | } | 5319 | } |
5033 | 5320 | ||
5034 | p->cpus_allowed = new_mask; | 5321 | if (p->sched_class->set_cpus_allowed) |
5322 | p->sched_class->set_cpus_allowed(p, &new_mask); | ||
5323 | else { | ||
5324 | p->cpus_allowed = new_mask; | ||
5325 | p->rt.nr_cpus_allowed = cpus_weight(new_mask); | ||
5326 | } | ||
5327 | |||
5035 | /* Can the task run on the task's current CPU? If so, we're done */ | 5328 | /* Can the task run on the task's current CPU? If so, we're done */ |
5036 | if (cpu_isset(task_cpu(p), new_mask)) | 5329 | if (cpu_isset(task_cpu(p), new_mask)) |
5037 | goto out; | 5330 | goto out; |
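
When the class does not supply its own set_cpus_allowed() hook, the default path above now also refreshes p->rt.nr_cpus_allowed by counting the bits of the new mask with cpus_weight(); the RT balancer elsewhere in this series uses that count to tell pinned tasks from migratable ones. A sketch of the bookkeeping with the CPU mask modelled as a 64-bit word (struct task_affinity and mask_weight() are illustrative):

#include <stdint.h>
#include <stdio.h>

struct task_affinity {
	uint64_t cpus_allowed;         /* one bit per CPU, up to 64 here */
	unsigned int nr_cpus_allowed;
};

static unsigned int mask_weight(uint64_t mask)   /* ~ cpus_weight() */
{
	unsigned int w = 0;

	while (mask) {
		mask &= mask - 1;  /* clear the lowest set bit */
		w++;
	}
	return w;
}

static void set_cpus_allowed_sketch(struct task_affinity *t, uint64_t new_mask)
{
	t->cpus_allowed = new_mask;
	t->nr_cpus_allowed = mask_weight(new_mask);
}

int main(void)
{
	struct task_affinity t;

	set_cpus_allowed_sketch(&t, 0x5); /* CPUs 0 and 2 */
	printf("allowed on %u cpu(s)\n", t.nr_cpus_allowed);
	return 0;
}
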
@@ -5052,7 +5345,7 @@ out: | |||
5052 | EXPORT_SYMBOL_GPL(set_cpus_allowed); | 5345 | EXPORT_SYMBOL_GPL(set_cpus_allowed); |
5053 | 5346 | ||
5054 | /* | 5347 | /* |
5055 | * Move (not current) task off this cpu, onto dest cpu. We're doing | 5348 | * Move (not current) task off this cpu, onto dest cpu. We're doing |
5056 | * this because either it can't run here any more (set_cpus_allowed() | 5349 | * this because either it can't run here any more (set_cpus_allowed() |
5057 | * away from this CPU, or CPU going down), or because we're | 5350 | * away from this CPU, or CPU going down), or because we're |
5058 | * attempting to rebalance this task on exec (sched_exec). | 5351 | * attempting to rebalance this task on exec (sched_exec). |
@@ -5197,7 +5490,7 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) | |||
5197 | * Try to stay on the same cpuset, where the | 5490 | * Try to stay on the same cpuset, where the |
5198 | * current cpuset may be a subset of all cpus. | 5491 | * current cpuset may be a subset of all cpus. |
5199 | * The cpuset_cpus_allowed_locked() variant of | 5492 | * The cpuset_cpus_allowed_locked() variant of |
5200 | * cpuset_cpus_allowed() will not block. It must be | 5493 | * cpuset_cpus_allowed() will not block. It must be |
5201 | * called within calls to cpuset_lock/cpuset_unlock. | 5494 | * called within calls to cpuset_lock/cpuset_unlock. |
5202 | */ | 5495 | */ |
5203 | rq = task_rq_lock(p, &flags); | 5496 | rq = task_rq_lock(p, &flags); |
@@ -5210,10 +5503,11 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p) | |||
5210 | * kernel threads (both mm NULL), since they never | 5503 | * kernel threads (both mm NULL), since they never |
5211 | * leave kernel. | 5504 | * leave kernel. |
5212 | */ | 5505 | */ |
5213 | if (p->mm && printk_ratelimit()) | 5506 | if (p->mm && printk_ratelimit()) { |
5214 | printk(KERN_INFO "process %d (%s) no " | 5507 | printk(KERN_INFO "process %d (%s) no " |
5215 | "longer affine to cpu%d\n", | 5508 | "longer affine to cpu%d\n", |
5216 | task_pid_nr(p), p->comm, dead_cpu); | 5509 | task_pid_nr(p), p->comm, dead_cpu); |
5510 | } | ||
5217 | } | 5511 | } |
5218 | } while (!__migrate_task_irq(p, dead_cpu, dest_cpu)); | 5512 | } while (!__migrate_task_irq(p, dead_cpu, dest_cpu)); |
5219 | } | 5513 | } |
@@ -5257,23 +5551,9 @@ static void migrate_live_tasks(int src_cpu) | |||
5257 | } | 5551 | } |
5258 | 5552 | ||
5259 | /* | 5553 | /* |
5260 | * activate_idle_task - move idle task to the _front_ of runqueue. | ||
5261 | */ | ||
5262 | static void activate_idle_task(struct task_struct *p, struct rq *rq) | ||
5263 | { | ||
5264 | update_rq_clock(rq); | ||
5265 | |||
5266 | if (p->state == TASK_UNINTERRUPTIBLE) | ||
5267 | rq->nr_uninterruptible--; | ||
5268 | |||
5269 | enqueue_task(rq, p, 0); | ||
5270 | inc_nr_running(p, rq); | ||
5271 | } | ||
5272 | |||
5273 | /* | ||
5274 | * Schedules idle task to be the next runnable task on current CPU. | 5554 | * Schedules idle task to be the next runnable task on current CPU. |
5275 | * It does so by boosting its priority to highest possible and adding it to | 5555 | * It does so by boosting its priority to highest possible. |
5276 | * the _front_ of the runqueue. Used by CPU offline code. | 5556 | * Used by CPU offline code. |
5277 | */ | 5557 | */ |
5278 | void sched_idle_next(void) | 5558 | void sched_idle_next(void) |
5279 | { | 5559 | { |
@@ -5293,8 +5573,8 @@ void sched_idle_next(void) | |||
5293 | 5573 | ||
5294 | __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1); | 5574 | __setscheduler(rq, p, SCHED_FIFO, MAX_RT_PRIO-1); |
5295 | 5575 | ||
5296 | /* Add idle task to the _front_ of its priority queue: */ | 5576 | update_rq_clock(rq); |
5297 | activate_idle_task(p, rq); | 5577 | activate_task(rq, p, 0); |
5298 | 5578 | ||
5299 | spin_unlock_irqrestore(&rq->lock, flags); | 5579 | spin_unlock_irqrestore(&rq->lock, flags); |
5300 | } | 5580 | } |
@@ -5329,7 +5609,7 @@ static void migrate_dead(unsigned int dead_cpu, struct task_struct *p) | |||
5329 | 5609 | ||
5330 | /* | 5610 | /* |
5331 | * Drop lock around migration; if someone else moves it, | 5611 | * Drop lock around migration; if someone else moves it, |
5332 | * that's OK. No task can be added to this CPU, so iteration is | 5612 | * that's OK. No task can be added to this CPU, so iteration is |
5333 | * fine. | 5613 | * fine. |
5334 | */ | 5614 | */ |
5335 | spin_unlock_irq(&rq->lock); | 5615 | spin_unlock_irq(&rq->lock); |
@@ -5365,7 +5645,7 @@ static struct ctl_table sd_ctl_dir[] = { | |||
5365 | .procname = "sched_domain", | 5645 | .procname = "sched_domain", |
5366 | .mode = 0555, | 5646 | .mode = 0555, |
5367 | }, | 5647 | }, |
5368 | {0,}, | 5648 | {0, }, |
5369 | }; | 5649 | }; |
5370 | 5650 | ||
5371 | static struct ctl_table sd_ctl_root[] = { | 5651 | static struct ctl_table sd_ctl_root[] = { |
@@ -5375,7 +5655,7 @@ static struct ctl_table sd_ctl_root[] = { | |||
5375 | .mode = 0555, | 5655 | .mode = 0555, |
5376 | .child = sd_ctl_dir, | 5656 | .child = sd_ctl_dir, |
5377 | }, | 5657 | }, |
5378 | {0,}, | 5658 | {0, }, |
5379 | }; | 5659 | }; |
5380 | 5660 | ||
5381 | static struct ctl_table *sd_alloc_ctl_entry(int n) | 5661 | static struct ctl_table *sd_alloc_ctl_entry(int n) |
@@ -5393,7 +5673,7 @@ static void sd_free_ctl_entry(struct ctl_table **tablep) | |||
5393 | /* | 5673 | /* |
5394 | * In the intermediate directories, both the child directory and | 5674 | * In the intermediate directories, both the child directory and |
5395 | * procname are dynamically allocated and could fail but the mode | 5675 | * procname are dynamically allocated and could fail but the mode |
5396 | * will always be set. In the lowest directory the names are | 5676 | * will always be set. In the lowest directory the names are |
5397 | * static strings and all have proc handlers. | 5677 | * static strings and all have proc handlers. |
5398 | */ | 5678 | */ |
5399 | for (entry = *tablep; entry->mode; entry++) { | 5679 | for (entry = *tablep; entry->mode; entry++) { |
@@ -5455,7 +5735,7 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd) | |||
5455 | return table; | 5735 | return table; |
5456 | } | 5736 | } |
5457 | 5737 | ||
5458 | static ctl_table * sd_alloc_ctl_cpu_table(int cpu) | 5738 | static ctl_table *sd_alloc_ctl_cpu_table(int cpu) |
5459 | { | 5739 | { |
5460 | struct ctl_table *entry, *table; | 5740 | struct ctl_table *entry, *table; |
5461 | struct sched_domain *sd; | 5741 | struct sched_domain *sd; |
@@ -5536,9 +5816,6 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
5536 | struct rq *rq; | 5816 | struct rq *rq; |
5537 | 5817 | ||
5538 | switch (action) { | 5818 | switch (action) { |
5539 | case CPU_LOCK_ACQUIRE: | ||
5540 | mutex_lock(&sched_hotcpu_mutex); | ||
5541 | break; | ||
5542 | 5819 | ||
5543 | case CPU_UP_PREPARE: | 5820 | case CPU_UP_PREPARE: |
5544 | case CPU_UP_PREPARE_FROZEN: | 5821 | case CPU_UP_PREPARE_FROZEN: |
@@ -5557,6 +5834,15 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
5557 | case CPU_ONLINE_FROZEN: | 5834 | case CPU_ONLINE_FROZEN: |
5558 | /* Strictly unnecessary, as first user will wake it. */ | 5835 | /* Strictly unnecessary, as first user will wake it. */ |
5559 | wake_up_process(cpu_rq(cpu)->migration_thread); | 5836 | wake_up_process(cpu_rq(cpu)->migration_thread); |
5837 | |||
5838 | /* Update our root-domain */ | ||
5839 | rq = cpu_rq(cpu); | ||
5840 | spin_lock_irqsave(&rq->lock, flags); | ||
5841 | if (rq->rd) { | ||
5842 | BUG_ON(!cpu_isset(cpu, rq->rd->span)); | ||
5843 | cpu_set(cpu, rq->rd->online); | ||
5844 | } | ||
5845 | spin_unlock_irqrestore(&rq->lock, flags); | ||
5560 | break; | 5846 | break; |
5561 | 5847 | ||
5562 | #ifdef CONFIG_HOTPLUG_CPU | 5848 | #ifdef CONFIG_HOTPLUG_CPU |
@@ -5564,7 +5850,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
5564 | case CPU_UP_CANCELED_FROZEN: | 5850 | case CPU_UP_CANCELED_FROZEN: |
5565 | if (!cpu_rq(cpu)->migration_thread) | 5851 | if (!cpu_rq(cpu)->migration_thread) |
5566 | break; | 5852 | break; |
5567 | /* Unbind it from offline cpu so it can run. Fall thru. */ | 5853 | /* Unbind it from offline cpu so it can run. Fall thru. */ |
5568 | kthread_bind(cpu_rq(cpu)->migration_thread, | 5854 | kthread_bind(cpu_rq(cpu)->migration_thread, |
5569 | any_online_cpu(cpu_online_map)); | 5855 | any_online_cpu(cpu_online_map)); |
5570 | kthread_stop(cpu_rq(cpu)->migration_thread); | 5856 | kthread_stop(cpu_rq(cpu)->migration_thread); |
@@ -5591,9 +5877,11 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
5591 | migrate_nr_uninterruptible(rq); | 5877 | migrate_nr_uninterruptible(rq); |
5592 | BUG_ON(rq->nr_running != 0); | 5878 | BUG_ON(rq->nr_running != 0); |
5593 | 5879 | ||
5594 | /* No need to migrate the tasks: it was best-effort if | 5880 | /* |
5595 | * they didn't take sched_hotcpu_mutex. Just wake up | 5881 | * No need to migrate the tasks: it was best-effort if |
5596 | * the requestors. */ | 5882 | * they didn't take sched_hotcpu_mutex. Just wake up |
5883 | * the requestors. | ||
5884 | */ | ||
5597 | spin_lock_irq(&rq->lock); | 5885 | spin_lock_irq(&rq->lock); |
5598 | while (!list_empty(&rq->migration_queue)) { | 5886 | while (!list_empty(&rq->migration_queue)) { |
5599 | struct migration_req *req; | 5887 | struct migration_req *req; |
@@ -5605,10 +5893,18 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
5605 | } | 5893 | } |
5606 | spin_unlock_irq(&rq->lock); | 5894 | spin_unlock_irq(&rq->lock); |
5607 | break; | 5895 | break; |
5608 | #endif | 5896 | |
5609 | case CPU_LOCK_RELEASE: | 5897 | case CPU_DOWN_PREPARE: |
5610 | mutex_unlock(&sched_hotcpu_mutex); | 5898 | /* Update our root-domain */ |
5899 | rq = cpu_rq(cpu); | ||
5900 | spin_lock_irqsave(&rq->lock, flags); | ||
5901 | if (rq->rd) { | ||
5902 | BUG_ON(!cpu_isset(cpu, rq->rd->span)); | ||
5903 | cpu_clear(cpu, rq->rd->online); | ||
5904 | } | ||
5905 | spin_unlock_irqrestore(&rq->lock, flags); | ||
5611 | break; | 5906 | break; |
5907 | #endif | ||
5612 | } | 5908 | } |
5613 | return NOTIFY_OK; | 5909 | return NOTIFY_OK; |
5614 | } | 5910 | } |
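
migration_call() above gains two root-domain updates done under rq->lock: the CPU is added to rd->online on CPU_ONLINE and removed again on CPU_DOWN_PREPARE, each guarded by a BUG_ON that the CPU really belongs to the domain's span, while the old CPU_LOCK_ACQUIRE/RELEASE cases disappear along with sched_hotcpu_mutex. A compressed sketch of the two branches with the masks as plain bit words and the locking elided (all *_sketch names are invented):

#include <stdint.h>
#include <assert.h>
#include <stdio.h>

enum { CPU_ONLINE, CPU_DOWN_PREPARE };

struct root_domain_sketch {
	uint64_t span;   /* CPUs that belong to this root domain */
	uint64_t online; /* subset of span that is currently online */
};

static void notifier_sketch(struct root_domain_sketch *rd, int action, int cpu)
{
	uint64_t bit = 1ULL << cpu;

	assert(rd->span & bit);       /* ~ BUG_ON(!cpu_isset(cpu, rd->span)) */
	switch (action) {
	case CPU_ONLINE:
		rd->online |= bit;
		break;
	case CPU_DOWN_PREPARE:
		rd->online &= ~bit;
		break;
	}
}

int main(void)
{
	struct root_domain_sketch rd = { 0xf, 0x1 };

	notifier_sketch(&rd, CPU_ONLINE, 2);
	notifier_sketch(&rd, CPU_DOWN_PREPARE, 0);
	printf("online mask: %#llx\n", (unsigned long long)rd.online);
	return 0;
}
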
@@ -5621,7 +5917,7 @@ static struct notifier_block __cpuinitdata migration_notifier = { | |||
5621 | .priority = 10 | 5917 | .priority = 10 |
5622 | }; | 5918 | }; |
5623 | 5919 | ||
5624 | int __init migration_init(void) | 5920 | void __init migration_init(void) |
5625 | { | 5921 | { |
5626 | void *cpu = (void *)(long)smp_processor_id(); | 5922 | void *cpu = (void *)(long)smp_processor_id(); |
5627 | int err; | 5923 | int err; |
@@ -5631,8 +5927,6 @@ int __init migration_init(void) | |||
5631 | BUG_ON(err == NOTIFY_BAD); | 5927 | BUG_ON(err == NOTIFY_BAD); |
5632 | migration_call(&migration_notifier, CPU_ONLINE, cpu); | 5928 | migration_call(&migration_notifier, CPU_ONLINE, cpu); |
5633 | register_cpu_notifier(&migration_notifier); | 5929 | register_cpu_notifier(&migration_notifier); |
5634 | |||
5635 | return 0; | ||
5636 | } | 5930 | } |
5637 | #endif | 5931 | #endif |
5638 | 5932 | ||
@@ -5798,11 +6092,76 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) | |||
5798 | return 1; | 6092 | return 1; |
5799 | } | 6093 | } |
5800 | 6094 | ||
6095 | static void rq_attach_root(struct rq *rq, struct root_domain *rd) | ||
6096 | { | ||
6097 | unsigned long flags; | ||
6098 | const struct sched_class *class; | ||
6099 | |||
6100 | spin_lock_irqsave(&rq->lock, flags); | ||
6101 | |||
6102 | if (rq->rd) { | ||
6103 | struct root_domain *old_rd = rq->rd; | ||
6104 | |||
6105 | for (class = sched_class_highest; class; class = class->next) { | ||
6106 | if (class->leave_domain) | ||
6107 | class->leave_domain(rq); | ||
6108 | } | ||
6109 | |||
6110 | cpu_clear(rq->cpu, old_rd->span); | ||
6111 | cpu_clear(rq->cpu, old_rd->online); | ||
6112 | |||
6113 | if (atomic_dec_and_test(&old_rd->refcount)) | ||
6114 | kfree(old_rd); | ||
6115 | } | ||
6116 | |||
6117 | atomic_inc(&rd->refcount); | ||
6118 | rq->rd = rd; | ||
6119 | |||
6120 | cpu_set(rq->cpu, rd->span); | ||
6121 | if (cpu_isset(rq->cpu, cpu_online_map)) | ||
6122 | cpu_set(rq->cpu, rd->online); | ||
6123 | |||
6124 | for (class = sched_class_highest; class; class = class->next) { | ||
6125 | if (class->join_domain) | ||
6126 | class->join_domain(rq); | ||
6127 | } | ||
6128 | |||
6129 | spin_unlock_irqrestore(&rq->lock, flags); | ||
6130 | } | ||
6131 | |||
6132 | static void init_rootdomain(struct root_domain *rd) | ||
6133 | { | ||
6134 | memset(rd, 0, sizeof(*rd)); | ||
6135 | |||
6136 | cpus_clear(rd->span); | ||
6137 | cpus_clear(rd->online); | ||
6138 | } | ||
6139 | |||
6140 | static void init_defrootdomain(void) | ||
6141 | { | ||
6142 | init_rootdomain(&def_root_domain); | ||
6143 | atomic_set(&def_root_domain.refcount, 1); | ||
6144 | } | ||
6145 | |||
6146 | static struct root_domain *alloc_rootdomain(void) | ||
6147 | { | ||
6148 | struct root_domain *rd; | ||
6149 | |||
6150 | rd = kmalloc(sizeof(*rd), GFP_KERNEL); | ||
6151 | if (!rd) | ||
6152 | return NULL; | ||
6153 | |||
6154 | init_rootdomain(rd); | ||
6155 | |||
6156 | return rd; | ||
6157 | } | ||
6158 | |||
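
rq_attach_root() above defines the ownership rule for the new root domains: detach from the old domain (run the per-class leave_domain() hooks, clear this runqueue's bits, drop a reference and free on the last one), then attach to the new domain (take a reference, set the bits, run join_domain()). A userspace sketch of just the reference-count handoff, using C11 atomics; all *_sketch names are invented:

#include <stdatomic.h>
#include <stdlib.h>
#include <stdio.h>

struct root_domain_sketch {
	atomic_int refcount;
};

struct rq_sketch {
	struct root_domain_sketch *rd;
};

static struct root_domain_sketch *alloc_rootdomain_sketch(void)
{
	struct root_domain_sketch *rd = calloc(1, sizeof(*rd));

	if (rd)
		atomic_init(&rd->refcount, 0);
	return rd;
}

/* Drop the old domain (free it on the last reference), then pin the new one. */
static void rq_attach_root_sketch(struct rq_sketch *rq,
				  struct root_domain_sketch *rd)
{
	if (rq->rd) {
		struct root_domain_sketch *old = rq->rd;

		if (atomic_fetch_sub(&old->refcount, 1) == 1)
			free(old);
	}
	atomic_fetch_add(&rd->refcount, 1);
	rq->rd = rd;
}

int main(void)
{
	struct root_domain_sketch *a = alloc_rootdomain_sketch();
	struct root_domain_sketch *b = alloc_rootdomain_sketch();
	struct rq_sketch rq = { NULL };

	rq_attach_root_sketch(&rq, a);
	rq_attach_root_sketch(&rq, b);   /* 'a' is freed: last reference gone */
	printf("b refcount: %d\n", atomic_load(&b->refcount));
	free(b);
	return 0;
}

Note that init_defrootdomain() above pre-sets def_root_domain's refcount to 1, so the static default domain can never reach zero and be freed.
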
5801 | /* | 6159 | /* |
5802 | * Attach the domain 'sd' to 'cpu' as its base domain. Callers must | 6160 | * Attach the domain 'sd' to 'cpu' as its base domain. Callers must |
5803 | * hold the hotplug lock. | 6161 | * hold the hotplug lock. |
5804 | */ | 6162 | */ |
5805 | static void cpu_attach_domain(struct sched_domain *sd, int cpu) | 6163 | static void |
6164 | cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) | ||
5806 | { | 6165 | { |
5807 | struct rq *rq = cpu_rq(cpu); | 6166 | struct rq *rq = cpu_rq(cpu); |
5808 | struct sched_domain *tmp; | 6167 | struct sched_domain *tmp; |
@@ -5827,6 +6186,7 @@ static void cpu_attach_domain(struct sched_domain *sd, int cpu) | |||
5827 | 6186 | ||
5828 | sched_domain_debug(sd, cpu); | 6187 | sched_domain_debug(sd, cpu); |
5829 | 6188 | ||
6189 | rq_attach_root(rq, rd); | ||
5830 | rcu_assign_pointer(rq->sd, sd); | 6190 | rcu_assign_pointer(rq->sd, sd); |
5831 | } | 6191 | } |
5832 | 6192 | ||
@@ -5903,7 +6263,7 @@ init_sched_build_groups(cpumask_t span, const cpumask_t *cpu_map, | |||
5903 | * @node: node whose sched_domain we're building | 6263 | * @node: node whose sched_domain we're building |
5904 | * @used_nodes: nodes already in the sched_domain | 6264 | * @used_nodes: nodes already in the sched_domain |
5905 | * | 6265 | * |
5906 | * Find the next node to include in a given scheduling domain. Simply | 6266 | * Find the next node to include in a given scheduling domain. Simply |
5907 | * finds the closest node not already in the @used_nodes map. | 6267 | * finds the closest node not already in the @used_nodes map. |
5908 | * | 6268 | * |
5909 | * Should use nodemask_t. | 6269 | * Should use nodemask_t. |
@@ -5943,7 +6303,7 @@ static int find_next_best_node(int node, unsigned long *used_nodes) | |||
5943 | * @node: node whose cpumask we're constructing | 6303 | * @node: node whose cpumask we're constructing |
5944 | * @size: number of nodes to include in this span | 6304 | * @size: number of nodes to include in this span |
5945 | * | 6305 | * |
5946 | * Given a node, construct a good cpumask for its sched_domain to span. It | 6306 | * Given a node, construct a good cpumask for its sched_domain to span. It |
5947 | * should be one that prevents unnecessary balancing, but also spreads tasks | 6307 | * should be one that prevents unnecessary balancing, but also spreads tasks |
5948 | * out optimally. | 6308 | * out optimally. |
5949 | */ | 6309 | */ |
@@ -5980,8 +6340,8 @@ int sched_smt_power_savings = 0, sched_mc_power_savings = 0; | |||
5980 | static DEFINE_PER_CPU(struct sched_domain, cpu_domains); | 6340 | static DEFINE_PER_CPU(struct sched_domain, cpu_domains); |
5981 | static DEFINE_PER_CPU(struct sched_group, sched_group_cpus); | 6341 | static DEFINE_PER_CPU(struct sched_group, sched_group_cpus); |
5982 | 6342 | ||
5983 | static int cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, | 6343 | static int |
5984 | struct sched_group **sg) | 6344 | cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg) |
5985 | { | 6345 | { |
5986 | if (sg) | 6346 | if (sg) |
5987 | *sg = &per_cpu(sched_group_cpus, cpu); | 6347 | *sg = &per_cpu(sched_group_cpus, cpu); |
@@ -5998,8 +6358,8 @@ static DEFINE_PER_CPU(struct sched_group, sched_group_core); | |||
5998 | #endif | 6358 | #endif |
5999 | 6359 | ||
6000 | #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) | 6360 | #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) |
6001 | static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map, | 6361 | static int |
6002 | struct sched_group **sg) | 6362 | cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg) |
6003 | { | 6363 | { |
6004 | int group; | 6364 | int group; |
6005 | cpumask_t mask = per_cpu(cpu_sibling_map, cpu); | 6365 | cpumask_t mask = per_cpu(cpu_sibling_map, cpu); |
@@ -6010,8 +6370,8 @@ static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map, | |||
6010 | return group; | 6370 | return group; |
6011 | } | 6371 | } |
6012 | #elif defined(CONFIG_SCHED_MC) | 6372 | #elif defined(CONFIG_SCHED_MC) |
6013 | static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map, | 6373 | static int |
6014 | struct sched_group **sg) | 6374 | cpu_to_core_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg) |
6015 | { | 6375 | { |
6016 | if (sg) | 6376 | if (sg) |
6017 | *sg = &per_cpu(sched_group_core, cpu); | 6377 | *sg = &per_cpu(sched_group_core, cpu); |
@@ -6022,8 +6382,8 @@ static int cpu_to_core_group(int cpu, const cpumask_t *cpu_map, | |||
6022 | static DEFINE_PER_CPU(struct sched_domain, phys_domains); | 6382 | static DEFINE_PER_CPU(struct sched_domain, phys_domains); |
6023 | static DEFINE_PER_CPU(struct sched_group, sched_group_phys); | 6383 | static DEFINE_PER_CPU(struct sched_group, sched_group_phys); |
6024 | 6384 | ||
6025 | static int cpu_to_phys_group(int cpu, const cpumask_t *cpu_map, | 6385 | static int |
6026 | struct sched_group **sg) | 6386 | cpu_to_phys_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg) |
6027 | { | 6387 | { |
6028 | int group; | 6388 | int group; |
6029 | #ifdef CONFIG_SCHED_MC | 6389 | #ifdef CONFIG_SCHED_MC |
@@ -6195,6 +6555,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) | |||
6195 | static int build_sched_domains(const cpumask_t *cpu_map) | 6555 | static int build_sched_domains(const cpumask_t *cpu_map) |
6196 | { | 6556 | { |
6197 | int i; | 6557 | int i; |
6558 | struct root_domain *rd; | ||
6198 | #ifdef CONFIG_NUMA | 6559 | #ifdef CONFIG_NUMA |
6199 | struct sched_group **sched_group_nodes = NULL; | 6560 | struct sched_group **sched_group_nodes = NULL; |
6200 | int sd_allnodes = 0; | 6561 | int sd_allnodes = 0; |
@@ -6203,7 +6564,7 @@ static int build_sched_domains(const cpumask_t *cpu_map) | |||
6203 | * Allocate the per-node list of sched groups | 6564 | * Allocate the per-node list of sched groups |
6204 | */ | 6565 | */ |
6205 | sched_group_nodes = kcalloc(MAX_NUMNODES, sizeof(struct sched_group *), | 6566 | sched_group_nodes = kcalloc(MAX_NUMNODES, sizeof(struct sched_group *), |
6206 | GFP_KERNEL); | 6567 | GFP_KERNEL); |
6207 | if (!sched_group_nodes) { | 6568 | if (!sched_group_nodes) { |
6208 | printk(KERN_WARNING "Can not alloc sched group node list\n"); | 6569 | printk(KERN_WARNING "Can not alloc sched group node list\n"); |
6209 | return -ENOMEM; | 6570 | return -ENOMEM; |
@@ -6211,6 +6572,12 @@ static int build_sched_domains(const cpumask_t *cpu_map) | |||
6211 | sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes; | 6572 | sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes; |
6212 | #endif | 6573 | #endif |
6213 | 6574 | ||
6575 | rd = alloc_rootdomain(); | ||
6576 | if (!rd) { | ||
6577 | printk(KERN_WARNING "Cannot alloc root domain\n"); | ||
6578 | return -ENOMEM; | ||
6579 | } | ||
6580 | |||
6214 | /* | 6581 | /* |
6215 | * Set up domains for cpus specified by the cpu_map. | 6582 | * Set up domains for cpus specified by the cpu_map. |
6216 | */ | 6583 | */ |
@@ -6427,7 +6794,7 @@ static int build_sched_domains(const cpumask_t *cpu_map) | |||
6427 | #else | 6794 | #else |
6428 | sd = &per_cpu(phys_domains, i); | 6795 | sd = &per_cpu(phys_domains, i); |
6429 | #endif | 6796 | #endif |
6430 | cpu_attach_domain(sd, i); | 6797 | cpu_attach_domain(sd, rd, i); |
6431 | } | 6798 | } |
6432 | 6799 | ||
6433 | return 0; | 6800 | return 0; |
@@ -6450,7 +6817,7 @@ static int ndoms_cur; /* number of sched domains in 'doms_cur' */ | |||
6450 | static cpumask_t fallback_doms; | 6817 | static cpumask_t fallback_doms; |
6451 | 6818 | ||
6452 | /* | 6819 | /* |
6453 | * Set up scheduler domains and groups. Callers must hold the hotplug lock. | 6820 | * Set up scheduler domains and groups. Callers must hold the hotplug lock. |
6454 | * For now this just excludes isolated cpus, but could be used to | 6821 | * For now this just excludes isolated cpus, but could be used to |
6455 | * exclude other special cases in the future. | 6822 | * exclude other special cases in the future. |
6456 | */ | 6823 | */ |
@@ -6485,26 +6852,26 @@ static void detach_destroy_domains(const cpumask_t *cpu_map) | |||
6485 | unregister_sched_domain_sysctl(); | 6852 | unregister_sched_domain_sysctl(); |
6486 | 6853 | ||
6487 | for_each_cpu_mask(i, *cpu_map) | 6854 | for_each_cpu_mask(i, *cpu_map) |
6488 | cpu_attach_domain(NULL, i); | 6855 | cpu_attach_domain(NULL, &def_root_domain, i); |
6489 | synchronize_sched(); | 6856 | synchronize_sched(); |
6490 | arch_destroy_sched_domains(cpu_map); | 6857 | arch_destroy_sched_domains(cpu_map); |
6491 | } | 6858 | } |
6492 | 6859 | ||
6493 | /* | 6860 | /* |
6494 | * Partition sched domains as specified by the 'ndoms_new' | 6861 | * Partition sched domains as specified by the 'ndoms_new' |
6495 | * cpumasks in the array doms_new[] of cpumasks. This compares | 6862 | * cpumasks in the array doms_new[] of cpumasks. This compares |
6496 | * doms_new[] to the current sched domain partitioning, doms_cur[]. | 6863 | * doms_new[] to the current sched domain partitioning, doms_cur[]. |
6497 | * It destroys each deleted domain and builds each new domain. | 6864 | * It destroys each deleted domain and builds each new domain. |
6498 | * | 6865 | * |
6499 | * 'doms_new' is an array of cpumask_t's of length 'ndoms_new'. | 6866 | * 'doms_new' is an array of cpumask_t's of length 'ndoms_new'. |
6500 | * The masks don't intersect (don't overlap.) We should setup one | 6867 | * The masks don't intersect (don't overlap.) We should setup one |
6501 | * sched domain for each mask. CPUs not in any of the cpumasks will | 6868 | * sched domain for each mask. CPUs not in any of the cpumasks will |
6502 | * not be load balanced. If the same cpumask appears both in the | 6869 | * not be load balanced. If the same cpumask appears both in the |
6503 | * current 'doms_cur' domains and in the new 'doms_new', we can leave | 6870 | * current 'doms_cur' domains and in the new 'doms_new', we can leave |
6504 | * it as it is. | 6871 | * it as it is. |
6505 | * | 6872 | * |
6506 | * The passed in 'doms_new' should be kmalloc'd. This routine takes | 6873 | * The passed in 'doms_new' should be kmalloc'd. This routine takes |
6507 | * ownership of it and will kfree it when done with it. If the caller | 6874 | * ownership of it and will kfree it when done with it. If the caller |
6508 | * failed the kmalloc call, then it can pass in doms_new == NULL, | 6875 | * failed the kmalloc call, then it can pass in doms_new == NULL, |
6509 | * and partition_sched_domains() will fallback to the single partition | 6876 | * and partition_sched_domains() will fallback to the single partition |
6510 | * 'fallback_doms'. | 6877 | * 'fallback_doms'. |
@@ -6515,6 +6882,8 @@ void partition_sched_domains(int ndoms_new, cpumask_t *doms_new) | |||
6515 | { | 6882 | { |
6516 | int i, j; | 6883 | int i, j; |
6517 | 6884 | ||
6885 | lock_doms_cur(); | ||
6886 | |||
6518 | /* always unregister in case we don't destroy any domains */ | 6887 | /* always unregister in case we don't destroy any domains */ |
6519 | unregister_sched_domain_sysctl(); | 6888 | unregister_sched_domain_sysctl(); |
6520 | 6889 | ||
@@ -6555,6 +6924,8 @@ match2: | |||
6555 | ndoms_cur = ndoms_new; | 6924 | ndoms_cur = ndoms_new; |
6556 | 6925 | ||
6557 | register_sched_domain_sysctl(); | 6926 | register_sched_domain_sysctl(); |
6927 | |||
6928 | unlock_doms_cur(); | ||
6558 | } | 6929 | } |
6559 | 6930 | ||
6560 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) | 6931 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) |
@@ -6562,10 +6933,10 @@ static int arch_reinit_sched_domains(void) | |||
6562 | { | 6933 | { |
6563 | int err; | 6934 | int err; |
6564 | 6935 | ||
6565 | mutex_lock(&sched_hotcpu_mutex); | 6936 | get_online_cpus(); |
6566 | detach_destroy_domains(&cpu_online_map); | 6937 | detach_destroy_domains(&cpu_online_map); |
6567 | err = arch_init_sched_domains(&cpu_online_map); | 6938 | err = arch_init_sched_domains(&cpu_online_map); |
6568 | mutex_unlock(&sched_hotcpu_mutex); | 6939 | put_online_cpus(); |
6569 | 6940 | ||
6570 | return err; | 6941 | return err; |
6571 | } | 6942 | } |
@@ -6634,7 +7005,7 @@ int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls) | |||
6634 | #endif | 7005 | #endif |
6635 | 7006 | ||
6636 | /* | 7007 | /* |
6637 | * Force a reinitialization of the sched domains hierarchy. The domains | 7008 | * Force a reinitialization of the sched domains hierarchy. The domains |
6638 | * and groups cannot be updated in place without racing with the balancing | 7009 | * and groups cannot be updated in place without racing with the balancing |
6639 | * code, so we temporarily attach all running cpus to the NULL domain | 7010 | * code, so we temporarily attach all running cpus to the NULL domain |
6640 | * which will prevent rebalancing while the sched domains are recalculated. | 7011 | * which will prevent rebalancing while the sched domains are recalculated. |
@@ -6676,30 +7047,44 @@ void __init sched_init_smp(void) | |||
6676 | { | 7047 | { |
6677 | cpumask_t non_isolated_cpus; | 7048 | cpumask_t non_isolated_cpus; |
6678 | 7049 | ||
6679 | mutex_lock(&sched_hotcpu_mutex); | 7050 | get_online_cpus(); |
6680 | arch_init_sched_domains(&cpu_online_map); | 7051 | arch_init_sched_domains(&cpu_online_map); |
6681 | cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map); | 7052 | cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map); |
6682 | if (cpus_empty(non_isolated_cpus)) | 7053 | if (cpus_empty(non_isolated_cpus)) |
6683 | cpu_set(smp_processor_id(), non_isolated_cpus); | 7054 | cpu_set(smp_processor_id(), non_isolated_cpus); |
6684 | mutex_unlock(&sched_hotcpu_mutex); | 7055 | put_online_cpus(); |
6685 | /* XXX: Theoretical race here - CPU may be hotplugged now */ | 7056 | /* XXX: Theoretical race here - CPU may be hotplugged now */ |
6686 | hotcpu_notifier(update_sched_domains, 0); | 7057 | hotcpu_notifier(update_sched_domains, 0); |
6687 | 7058 | ||
6688 | /* Move init over to a non-isolated CPU */ | 7059 | /* Move init over to a non-isolated CPU */ |
6689 | if (set_cpus_allowed(current, non_isolated_cpus) < 0) | 7060 | if (set_cpus_allowed(current, non_isolated_cpus) < 0) |
6690 | BUG(); | 7061 | BUG(); |
7062 | sched_init_granularity(); | ||
7063 | |||
7064 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
7065 | if (nr_cpu_ids == 1) | ||
7066 | return; | ||
7067 | |||
7068 | lb_monitor_task = kthread_create(load_balance_monitor, NULL, | ||
7069 | "group_balance"); | ||
7070 | if (!IS_ERR(lb_monitor_task)) { | ||
7071 | lb_monitor_task->flags |= PF_NOFREEZE; | ||
7072 | wake_up_process(lb_monitor_task); | ||
7073 | } else { | ||
7074 | printk(KERN_ERR "Could not create load balance monitor thread" | ||
7075 | "(error = %ld) \n", PTR_ERR(lb_monitor_task)); | ||
7076 | } | ||
7077 | #endif | ||
6691 | } | 7078 | } |
6692 | #else | 7079 | #else |
6693 | void __init sched_init_smp(void) | 7080 | void __init sched_init_smp(void) |
6694 | { | 7081 | { |
7082 | sched_init_granularity(); | ||
6695 | } | 7083 | } |
6696 | #endif /* CONFIG_SMP */ | 7084 | #endif /* CONFIG_SMP */ |
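
Under CONFIG_FAIR_GROUP_SCHED the SMP init path above now spawns a "group_balance" kthread; note that kthread_create() follows the kernel's error-pointer convention, returning an errno encoded in the pointer itself and checked with IS_ERR()/PTR_ERR() rather than returning NULL. A simplified standalone analogue of that idiom; MAX_ERRNO and the *_sketch helpers are local stand-ins, not the kernel's <linux/err.h>:

#include <stdio.h>
#include <stdint.h>

#define MAX_ERRNO 4095

static void *ERR_PTR_sketch(long err)      { return (void *)(intptr_t)err; }
static long  PTR_ERR_sketch(const void *p) { return (long)(intptr_t)p; }
static int   IS_ERR_sketch(const void *p)
{
	/* the top MAX_ERRNO addresses are reserved for encoded errors */
	return (uintptr_t)p >= (uintptr_t)(-MAX_ERRNO);
}

static void *create_worker_sketch(int fail)
{
	static int worker;            /* stands in for a real task struct */

	return fail ? ERR_PTR_sketch(-12 /* ENOMEM */) : &worker;
}

int main(void)
{
	void *t = create_worker_sketch(1);

	if (IS_ERR_sketch(t))
		printf("could not create worker (error = %ld)\n", PTR_ERR_sketch(t));
	else
		printf("worker created\n");
	return 0;
}
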
6697 | 7085 | ||
6698 | int in_sched_functions(unsigned long addr) | 7086 | int in_sched_functions(unsigned long addr) |
6699 | { | 7087 | { |
6700 | /* Linker adds these: start and end of __sched functions */ | ||
6701 | extern char __sched_text_start[], __sched_text_end[]; | ||
6702 | |||
6703 | return in_lock_functions(addr) || | 7088 | return in_lock_functions(addr) || |
6704 | (addr >= (unsigned long)__sched_text_start | 7089 | (addr >= (unsigned long)__sched_text_start |
6705 | && addr < (unsigned long)__sched_text_end); | 7090 | && addr < (unsigned long)__sched_text_end); |
@@ -6714,13 +7099,87 @@ static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq) | |||
6714 | cfs_rq->min_vruntime = (u64)(-(1LL << 20)); | 7099 | cfs_rq->min_vruntime = (u64)(-(1LL << 20)); |
6715 | } | 7100 | } |
6716 | 7101 | ||
7102 | static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) | ||
7103 | { | ||
7104 | struct rt_prio_array *array; | ||
7105 | int i; | ||
7106 | |||
7107 | array = &rt_rq->active; | ||
7108 | for (i = 0; i < MAX_RT_PRIO; i++) { | ||
7109 | INIT_LIST_HEAD(array->queue + i); | ||
7110 | __clear_bit(i, array->bitmap); | ||
7111 | } | ||
7112 | /* delimiter for bitsearch: */ | ||
7113 | __set_bit(MAX_RT_PRIO, array->bitmap); | ||
7114 | |||
7115 | #if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED | ||
7116 | rt_rq->highest_prio = MAX_RT_PRIO; | ||
7117 | #endif | ||
7118 | #ifdef CONFIG_SMP | ||
7119 | rt_rq->rt_nr_migratory = 0; | ||
7120 | rt_rq->overloaded = 0; | ||
7121 | #endif | ||
7122 | |||
7123 | rt_rq->rt_time = 0; | ||
7124 | rt_rq->rt_throttled = 0; | ||
7125 | |||
7126 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
7127 | rt_rq->rq = rq; | ||
7128 | #endif | ||
7129 | } | ||
7130 | |||
7131 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
7132 | static void init_tg_cfs_entry(struct rq *rq, struct task_group *tg, | ||
7133 | struct cfs_rq *cfs_rq, struct sched_entity *se, | ||
7134 | int cpu, int add) | ||
7135 | { | ||
7136 | tg->cfs_rq[cpu] = cfs_rq; | ||
7137 | init_cfs_rq(cfs_rq, rq); | ||
7138 | cfs_rq->tg = tg; | ||
7139 | if (add) | ||
7140 | list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); | ||
7141 | |||
7142 | tg->se[cpu] = se; | ||
7143 | se->cfs_rq = &rq->cfs; | ||
7144 | se->my_q = cfs_rq; | ||
7145 | se->load.weight = tg->shares; | ||
7146 | se->load.inv_weight = div64_64(1ULL<<32, se->load.weight); | ||
7147 | se->parent = NULL; | ||
7148 | } | ||
7149 | |||
7150 | static void init_tg_rt_entry(struct rq *rq, struct task_group *tg, | ||
7151 | struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, | ||
7152 | int cpu, int add) | ||
7153 | { | ||
7154 | tg->rt_rq[cpu] = rt_rq; | ||
7155 | init_rt_rq(rt_rq, rq); | ||
7156 | rt_rq->tg = tg; | ||
7157 | rt_rq->rt_se = rt_se; | ||
7158 | if (add) | ||
7159 | list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list); | ||
7160 | |||
7161 | tg->rt_se[cpu] = rt_se; | ||
7162 | rt_se->rt_rq = &rq->rt; | ||
7163 | rt_se->my_q = rt_rq; | ||
7164 | rt_se->parent = NULL; | ||
7165 | INIT_LIST_HEAD(&rt_se->run_list); | ||
7166 | } | ||
7167 | #endif | ||
7168 | |||
6717 | void __init sched_init(void) | 7169 | void __init sched_init(void) |
6718 | { | 7170 | { |
6719 | int highest_cpu = 0; | 7171 | int highest_cpu = 0; |
6720 | int i, j; | 7172 | int i, j; |
6721 | 7173 | ||
7174 | #ifdef CONFIG_SMP | ||
7175 | init_defrootdomain(); | ||
7176 | #endif | ||
7177 | |||
7178 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
7179 | list_add(&init_task_group.list, &task_groups); | ||
7180 | #endif | ||
7181 | |||
6722 | for_each_possible_cpu(i) { | 7182 | for_each_possible_cpu(i) { |
6723 | struct rt_prio_array *array; | ||
6724 | struct rq *rq; | 7183 | struct rq *rq; |
6725 | 7184 | ||
6726 | rq = cpu_rq(i); | 7185 | rq = cpu_rq(i); |
@@ -6729,52 +7188,39 @@ void __init sched_init(void) | |||
6729 | rq->nr_running = 0; | 7188 | rq->nr_running = 0; |
6730 | rq->clock = 1; | 7189 | rq->clock = 1; |
6731 | init_cfs_rq(&rq->cfs, rq); | 7190 | init_cfs_rq(&rq->cfs, rq); |
7191 | init_rt_rq(&rq->rt, rq); | ||
6732 | #ifdef CONFIG_FAIR_GROUP_SCHED | 7192 | #ifdef CONFIG_FAIR_GROUP_SCHED |
6733 | INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); | ||
6734 | { | ||
6735 | struct cfs_rq *cfs_rq = &per_cpu(init_cfs_rq, i); | ||
6736 | struct sched_entity *se = | ||
6737 | &per_cpu(init_sched_entity, i); | ||
6738 | |||
6739 | init_cfs_rq_p[i] = cfs_rq; | ||
6740 | init_cfs_rq(cfs_rq, rq); | ||
6741 | cfs_rq->tg = &init_task_group; | ||
6742 | list_add(&cfs_rq->leaf_cfs_rq_list, | ||
6743 | &rq->leaf_cfs_rq_list); | ||
6744 | |||
6745 | init_sched_entity_p[i] = se; | ||
6746 | se->cfs_rq = &rq->cfs; | ||
6747 | se->my_q = cfs_rq; | ||
6748 | se->load.weight = init_task_group_load; | ||
6749 | se->load.inv_weight = | ||
6750 | div64_64(1ULL<<32, init_task_group_load); | ||
6751 | se->parent = NULL; | ||
6752 | } | ||
6753 | init_task_group.shares = init_task_group_load; | 7193 | init_task_group.shares = init_task_group_load; |
6754 | spin_lock_init(&init_task_group.lock); | 7194 | INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); |
7195 | init_tg_cfs_entry(rq, &init_task_group, | ||
7196 | &per_cpu(init_cfs_rq, i), | ||
7197 | &per_cpu(init_sched_entity, i), i, 1); | ||
7198 | |||
7199 | init_task_group.rt_ratio = sysctl_sched_rt_ratio; /* XXX */ | ||
7200 | INIT_LIST_HEAD(&rq->leaf_rt_rq_list); | ||
7201 | init_tg_rt_entry(rq, &init_task_group, | ||
7202 | &per_cpu(init_rt_rq, i), | ||
7203 | &per_cpu(init_sched_rt_entity, i), i, 1); | ||
6755 | #endif | 7204 | #endif |
7205 | rq->rt_period_expire = 0; | ||
7206 | rq->rt_throttled = 0; | ||
6756 | 7207 | ||
6757 | for (j = 0; j < CPU_LOAD_IDX_MAX; j++) | 7208 | for (j = 0; j < CPU_LOAD_IDX_MAX; j++) |
6758 | rq->cpu_load[j] = 0; | 7209 | rq->cpu_load[j] = 0; |
6759 | #ifdef CONFIG_SMP | 7210 | #ifdef CONFIG_SMP |
6760 | rq->sd = NULL; | 7211 | rq->sd = NULL; |
7212 | rq->rd = NULL; | ||
6761 | rq->active_balance = 0; | 7213 | rq->active_balance = 0; |
6762 | rq->next_balance = jiffies; | 7214 | rq->next_balance = jiffies; |
6763 | rq->push_cpu = 0; | 7215 | rq->push_cpu = 0; |
6764 | rq->cpu = i; | 7216 | rq->cpu = i; |
6765 | rq->migration_thread = NULL; | 7217 | rq->migration_thread = NULL; |
6766 | INIT_LIST_HEAD(&rq->migration_queue); | 7218 | INIT_LIST_HEAD(&rq->migration_queue); |
7219 | rq_attach_root(rq, &def_root_domain); | ||
6767 | #endif | 7220 | #endif |
7221 | init_rq_hrtick(rq); | ||
6768 | atomic_set(&rq->nr_iowait, 0); | 7222 | atomic_set(&rq->nr_iowait, 0); |
6769 | |||
6770 | array = &rq->rt.active; | ||
6771 | for (j = 0; j < MAX_RT_PRIO; j++) { | ||
6772 | INIT_LIST_HEAD(array->queue + j); | ||
6773 | __clear_bit(j, array->bitmap); | ||
6774 | } | ||
6775 | highest_cpu = i; | 7223 | highest_cpu = i; |
6776 | /* delimiter for bitsearch: */ | ||
6777 | __set_bit(MAX_RT_PRIO, array->bitmap); | ||
6778 | } | 7224 | } |
6779 | 7225 | ||
6780 | set_load_weight(&init_task); | 7226 | set_load_weight(&init_task); |
@@ -6925,8 +7371,8 @@ struct task_struct *curr_task(int cpu) | |||
6925 | * @p: the task pointer to set. | 7371 | * @p: the task pointer to set. |
6926 | * | 7372 | * |
6927 | * Description: This function must only be used when non-maskable interrupts | 7373 | * Description: This function must only be used when non-maskable interrupts |
6928 | * are serviced on a separate stack. It allows the architecture to switch the | 7374 | * are serviced on a separate stack. It allows the architecture to switch the |
6929 | * notion of the current task on a cpu in a non-blocking manner. This function | 7375 | * notion of the current task on a cpu in a non-blocking manner. This function |
6930 | * must be called with all CPUs synchronized, and interrupts disabled, and | 7376 | * must be called with all CPUs synchronized, and interrupts disabled, and |
6931 | * the caller must save the original value of the current task (see | 7377 | * the caller must save the original value of the current task (see |
6932 | * curr_task() above) and restore that value before reenabling interrupts and | 7378 | * curr_task() above) and restore that value before reenabling interrupts and |
@@ -6943,12 +7389,187 @@ void set_curr_task(int cpu, struct task_struct *p) | |||
6943 | 7389 | ||
6944 | #ifdef CONFIG_FAIR_GROUP_SCHED | 7390 | #ifdef CONFIG_FAIR_GROUP_SCHED |
6945 | 7391 | ||
7392 | #ifdef CONFIG_SMP | ||
7393 | /* | ||
7394 | * distribute shares of all task groups among their schedulable entities, | ||
7395 | * to reflect load distribution across cpus. | ||
7396 | */ | ||
7397 | static int rebalance_shares(struct sched_domain *sd, int this_cpu) | ||
7398 | { | ||
7399 | struct cfs_rq *cfs_rq; | ||
7400 | struct rq *rq = cpu_rq(this_cpu); | ||
7401 | cpumask_t sdspan = sd->span; | ||
7402 | int balanced = 1; | ||
7403 | |||
7404 | /* Walk thr' all the task groups that we have */ | ||
7405 | for_each_leaf_cfs_rq(rq, cfs_rq) { | ||
7406 | int i; | ||
7407 | unsigned long total_load = 0, total_shares; | ||
7408 | struct task_group *tg = cfs_rq->tg; | ||
7409 | |||
7410 | /* Gather total task load of this group across cpus */ | ||
7411 | for_each_cpu_mask(i, sdspan) | ||
7412 | total_load += tg->cfs_rq[i]->load.weight; | ||
7413 | |||
7414 | /* Nothing to do if this group has no load */ | ||
7415 | if (!total_load) | ||
7416 | continue; | ||
7417 | |||
7418 | /* | ||
7419 | * tg->shares represents the number of cpu shares the task group | ||
7420 | * is eligible to hold on a single cpu. On N cpus, it is | ||
7421 | * eligible to hold (N * tg->shares) number of cpu shares. | ||
7422 | */ | ||
7423 | total_shares = tg->shares * cpus_weight(sdspan); | ||
7424 | |||
7425 | /* | ||
7426 | * redistribute total_shares across cpus as per the task load | ||
7427 | * distribution. | ||
7428 | */ | ||
7429 | for_each_cpu_mask(i, sdspan) { | ||
7430 | unsigned long local_load, local_shares; | ||
7431 | |||
7432 | local_load = tg->cfs_rq[i]->load.weight; | ||
7433 | local_shares = (local_load * total_shares) / total_load; | ||
7434 | if (!local_shares) | ||
7435 | local_shares = MIN_GROUP_SHARES; | ||
7436 | if (local_shares == tg->se[i]->load.weight) | ||
7437 | continue; | ||
7438 | |||
7439 | spin_lock_irq(&cpu_rq(i)->lock); | ||
7440 | set_se_shares(tg->se[i], local_shares); | ||
7441 | spin_unlock_irq(&cpu_rq(i)->lock); | ||
7442 | balanced = 0; | ||
7443 | } | ||
7444 | } | ||
7445 | |||
7446 | return balanced; | ||
7447 | } | ||
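A quick worked pass of the redistribution above, with illustrative numbers rather than anything taken from the patch: assume a group with tg->shares = 1024 spanning a 2-cpu domain whose per-cpu group loads are 3072 and 1024. Then total_shares = 1024 * 2 = 2048 and total_load = 4096, so:

	local_shares(cpu0) = (3072 * 2048) / 4096 = 1536
	local_shares(cpu1) = (1024 * 2048) / 4096 =  512

The busier cpu's group entity ends up carrying three quarters of the group's weight, and the per-cpu shares still sum to total_shares.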
7448 | |||
7449 | /* | ||
7450 | * How frequently should we rebalance_shares() across cpus? | ||
7451 | * | ||
7452 | * The more frequently we rebalance shares, the more accurate is the fairness | ||
7453 | * of cpu bandwidth distribution between task groups. However higher frequency | ||
7454 | * also implies increased scheduling overhead. | ||
7455 | * | ||
7456 | * sysctl_sched_min_bal_int_shares represents the minimum interval between | ||
7457 | * consecutive calls to rebalance_shares() in the same sched domain. | ||
7458 | * | ||
7459 | * sysctl_sched_max_bal_int_shares represents the maximum interval between | ||
7460 | * consecutive calls to rebalance_shares() in the same sched domain. | ||
7461 | * | ||
7462 | * These settings allow for the appropriate trade-off between accuracy of | ||
7463 | * fairness and the associated overhead. | ||
7464 | * | ||
7465 | */ | ||
7466 | |||
7467 | /* default: 8ms, units: milliseconds */ | ||
7468 | const_debug unsigned int sysctl_sched_min_bal_int_shares = 8; | ||
7469 | |||
7470 | /* default: 128ms, units: milliseconds */ | ||
7471 | const_debug unsigned int sysctl_sched_max_bal_int_shares = 128; | ||
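Taken together with the doubling/reset logic in load_balance_monitor() below, these defaults give an adaptive interval: on a system where every pass finds the groups already balanced, the sleep time backs off as 8 -> 16 -> 32 -> 64 -> 128 ms and then stays at 128 ms; as soon as rebalance_shares() has to adjust any group, the interval snaps back to the 8 ms minimum.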
7472 | |||
7473 | /* kernel thread that runs rebalance_shares() periodically */ | ||
7474 | static int load_balance_monitor(void *unused) | ||
7475 | { | ||
7476 | unsigned int timeout = sysctl_sched_min_bal_int_shares; | ||
7477 | struct sched_param schedparm; | ||
7478 | int ret; | ||
7479 | |||
7480 | /* | ||
7481 | * We don't want this thread's execution to be limited by the shares | ||
7482 | * assigned to the default group (init_task_group). Hence make it run | ||
7483 | * as a SCHED_RR RT task at the lowest priority. | ||
7484 | */ | ||
7485 | schedparm.sched_priority = 1; | ||
7486 | ret = sched_setscheduler(current, SCHED_RR, &schedparm); | ||
7487 | if (ret) | ||
7488 | printk(KERN_ERR "Couldn't set SCHED_RR policy for load balance" | ||
7489 | " monitor thread (error = %d)\n", ret); | ||
7490 | |||
7491 | while (!kthread_should_stop()) { | ||
7492 | int i, cpu, balanced = 1; | ||
7493 | |||
7494 | /* Prevent cpus going down or coming up */ | ||
7495 | get_online_cpus(); | ||
7496 | /* lockout changes to doms_cur[] array */ | ||
7497 | lock_doms_cur(); | ||
7498 | /* | ||
7499 | * Enter a rcu read-side critical section to safely walk rq->sd | ||
7500 | * chain on various cpus and to walk task group list | ||
7501 | * (rq->leaf_cfs_rq_list) in rebalance_shares(). | ||
7502 | */ | ||
7503 | rcu_read_lock(); | ||
7504 | |||
7505 | for (i = 0; i < ndoms_cur; i++) { | ||
7506 | cpumask_t cpumap = doms_cur[i]; | ||
7507 | struct sched_domain *sd = NULL, *sd_prev = NULL; | ||
7508 | |||
7509 | cpu = first_cpu(cpumap); | ||
7510 | |||
7511 | /* Find the highest domain at which to balance shares */ | ||
7512 | for_each_domain(cpu, sd) { | ||
7513 | if (!(sd->flags & SD_LOAD_BALANCE)) | ||
7514 | continue; | ||
7515 | sd_prev = sd; | ||
7516 | } | ||
7517 | |||
7518 | sd = sd_prev; | ||
7519 | /* sd == NULL? No load balance reqd in this domain */ | ||
7520 | if (!sd) | ||
7521 | continue; | ||
7522 | |||
7523 | balanced &= rebalance_shares(sd, cpu); | ||
7524 | } | ||
7525 | |||
7526 | rcu_read_unlock(); | ||
7527 | |||
7528 | unlock_doms_cur(); | ||
7529 | put_online_cpus(); | ||
7530 | |||
7531 | if (!balanced) | ||
7532 | timeout = sysctl_sched_min_bal_int_shares; | ||
7533 | else if (timeout < sysctl_sched_max_bal_int_shares) | ||
7534 | timeout *= 2; | ||
7535 | |||
7536 | msleep_interruptible(timeout); | ||
7537 | } | ||
7538 | |||
7539 | return 0; | ||
7540 | } | ||
7541 | #endif /* CONFIG_SMP */ | ||
7542 | |||
7543 | static void free_sched_group(struct task_group *tg) | ||
7544 | { | ||
7545 | int i; | ||
7546 | |||
7547 | for_each_possible_cpu(i) { | ||
7548 | if (tg->cfs_rq) | ||
7549 | kfree(tg->cfs_rq[i]); | ||
7550 | if (tg->se) | ||
7551 | kfree(tg->se[i]); | ||
7552 | if (tg->rt_rq) | ||
7553 | kfree(tg->rt_rq[i]); | ||
7554 | if (tg->rt_se) | ||
7555 | kfree(tg->rt_se[i]); | ||
7556 | } | ||
7557 | |||
7558 | kfree(tg->cfs_rq); | ||
7559 | kfree(tg->se); | ||
7560 | kfree(tg->rt_rq); | ||
7561 | kfree(tg->rt_se); | ||
7562 | kfree(tg); | ||
7563 | } | ||
7564 | |||
6946 | /* allocate runqueue etc for a new task group */ | 7565 | /* allocate runqueue etc for a new task group */ |
6947 | struct task_group *sched_create_group(void) | 7566 | struct task_group *sched_create_group(void) |
6948 | { | 7567 | { |
6949 | struct task_group *tg; | 7568 | struct task_group *tg; |
6950 | struct cfs_rq *cfs_rq; | 7569 | struct cfs_rq *cfs_rq; |
6951 | struct sched_entity *se; | 7570 | struct sched_entity *se; |
7571 | struct rt_rq *rt_rq; | ||
7572 | struct sched_rt_entity *rt_se; | ||
6952 | struct rq *rq; | 7573 | struct rq *rq; |
6953 | int i; | 7574 | int i; |
6954 | 7575 | ||
@@ -6962,97 +7583,89 @@ struct task_group *sched_create_group(void) | |||
6962 | tg->se = kzalloc(sizeof(se) * NR_CPUS, GFP_KERNEL); | 7583 | tg->se = kzalloc(sizeof(se) * NR_CPUS, GFP_KERNEL); |
6963 | if (!tg->se) | 7584 | if (!tg->se) |
6964 | goto err; | 7585 | goto err; |
7586 | tg->rt_rq = kzalloc(sizeof(rt_rq) * NR_CPUS, GFP_KERNEL); | ||
7587 | if (!tg->rt_rq) | ||
7588 | goto err; | ||
7589 | tg->rt_se = kzalloc(sizeof(rt_se) * NR_CPUS, GFP_KERNEL); | ||
7590 | if (!tg->rt_se) | ||
7591 | goto err; | ||
7592 | |||
7593 | tg->shares = NICE_0_LOAD; | ||
7594 | tg->rt_ratio = 0; /* XXX */ | ||
6965 | 7595 | ||
6966 | for_each_possible_cpu(i) { | 7596 | for_each_possible_cpu(i) { |
6967 | rq = cpu_rq(i); | 7597 | rq = cpu_rq(i); |
6968 | 7598 | ||
6969 | cfs_rq = kmalloc_node(sizeof(struct cfs_rq), GFP_KERNEL, | 7599 | cfs_rq = kmalloc_node(sizeof(struct cfs_rq), |
6970 | cpu_to_node(i)); | 7600 | GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); |
6971 | if (!cfs_rq) | 7601 | if (!cfs_rq) |
6972 | goto err; | 7602 | goto err; |
6973 | 7603 | ||
6974 | se = kmalloc_node(sizeof(struct sched_entity), GFP_KERNEL, | 7604 | se = kmalloc_node(sizeof(struct sched_entity), |
6975 | cpu_to_node(i)); | 7605 | GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); |
6976 | if (!se) | 7606 | if (!se) |
6977 | goto err; | 7607 | goto err; |
6978 | 7608 | ||
6979 | memset(cfs_rq, 0, sizeof(struct cfs_rq)); | 7609 | rt_rq = kmalloc_node(sizeof(struct rt_rq), |
6980 | memset(se, 0, sizeof(struct sched_entity)); | 7610 | GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); |
7611 | if (!rt_rq) | ||
7612 | goto err; | ||
6981 | 7613 | ||
6982 | tg->cfs_rq[i] = cfs_rq; | 7614 | rt_se = kmalloc_node(sizeof(struct sched_rt_entity), |
6983 | init_cfs_rq(cfs_rq, rq); | 7615 | GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); |
6984 | cfs_rq->tg = tg; | 7616 | if (!rt_se) |
7617 | goto err; | ||
6985 | 7618 | ||
6986 | tg->se[i] = se; | 7619 | init_tg_cfs_entry(rq, tg, cfs_rq, se, i, 0); |
6987 | se->cfs_rq = &rq->cfs; | 7620 | init_tg_rt_entry(rq, tg, rt_rq, rt_se, i, 0); |
6988 | se->my_q = cfs_rq; | ||
6989 | se->load.weight = NICE_0_LOAD; | ||
6990 | se->load.inv_weight = div64_64(1ULL<<32, NICE_0_LOAD); | ||
6991 | se->parent = NULL; | ||
6992 | } | 7621 | } |
6993 | 7622 | ||
7623 | lock_task_group_list(); | ||
6994 | for_each_possible_cpu(i) { | 7624 | for_each_possible_cpu(i) { |
6995 | rq = cpu_rq(i); | 7625 | rq = cpu_rq(i); |
6996 | cfs_rq = tg->cfs_rq[i]; | 7626 | cfs_rq = tg->cfs_rq[i]; |
6997 | list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); | 7627 | list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); |
7628 | rt_rq = tg->rt_rq[i]; | ||
7629 | list_add_rcu(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list); | ||
6998 | } | 7630 | } |
6999 | 7631 | list_add_rcu(&tg->list, &task_groups); | |
7000 | tg->shares = NICE_0_LOAD; | 7632 | unlock_task_group_list(); |
7001 | spin_lock_init(&tg->lock); | ||
7002 | 7633 | ||
7003 | return tg; | 7634 | return tg; |
7004 | 7635 | ||
7005 | err: | 7636 | err: |
7006 | for_each_possible_cpu(i) { | 7637 | free_sched_group(tg); |
7007 | if (tg->cfs_rq) | ||
7008 | kfree(tg->cfs_rq[i]); | ||
7009 | if (tg->se) | ||
7010 | kfree(tg->se[i]); | ||
7011 | } | ||
7012 | kfree(tg->cfs_rq); | ||
7013 | kfree(tg->se); | ||
7014 | kfree(tg); | ||
7015 | |||
7016 | return ERR_PTR(-ENOMEM); | 7638 | return ERR_PTR(-ENOMEM); |
7017 | } | 7639 | } |
7018 | 7640 | ||
7019 | /* rcu callback to free various structures associated with a task group */ | 7641 | /* rcu callback to free various structures associated with a task group */ |
7020 | static void free_sched_group(struct rcu_head *rhp) | 7642 | static void free_sched_group_rcu(struct rcu_head *rhp) |
7021 | { | 7643 | { |
7022 | struct cfs_rq *cfs_rq = container_of(rhp, struct cfs_rq, rcu); | ||
7023 | struct task_group *tg = cfs_rq->tg; | ||
7024 | struct sched_entity *se; | ||
7025 | int i; | ||
7026 | |||
7027 | /* now it should be safe to free those cfs_rqs */ | 7644 | /* now it should be safe to free those cfs_rqs */ |
7028 | for_each_possible_cpu(i) { | 7645 | free_sched_group(container_of(rhp, struct task_group, rcu)); |
7029 | cfs_rq = tg->cfs_rq[i]; | ||
7030 | kfree(cfs_rq); | ||
7031 | |||
7032 | se = tg->se[i]; | ||
7033 | kfree(se); | ||
7034 | } | ||
7035 | |||
7036 | kfree(tg->cfs_rq); | ||
7037 | kfree(tg->se); | ||
7038 | kfree(tg); | ||
7039 | } | 7646 | } |
7040 | 7647 | ||
7041 | /* Destroy runqueue etc associated with a task group */ | 7648 | /* Destroy runqueue etc associated with a task group */ |
7042 | void sched_destroy_group(struct task_group *tg) | 7649 | void sched_destroy_group(struct task_group *tg) |
7043 | { | 7650 | { |
7044 | struct cfs_rq *cfs_rq; | 7651 | struct cfs_rq *cfs_rq = NULL; |
7652 | struct rt_rq *rt_rq = NULL; | ||
7045 | int i; | 7653 | int i; |
7046 | 7654 | ||
7655 | lock_task_group_list(); | ||
7047 | for_each_possible_cpu(i) { | 7656 | for_each_possible_cpu(i) { |
7048 | cfs_rq = tg->cfs_rq[i]; | 7657 | cfs_rq = tg->cfs_rq[i]; |
7049 | list_del_rcu(&cfs_rq->leaf_cfs_rq_list); | 7658 | list_del_rcu(&cfs_rq->leaf_cfs_rq_list); |
7659 | rt_rq = tg->rt_rq[i]; | ||
7660 | list_del_rcu(&rt_rq->leaf_rt_rq_list); | ||
7050 | } | 7661 | } |
7662 | list_del_rcu(&tg->list); | ||
7663 | unlock_task_group_list(); | ||
7051 | 7664 | ||
7052 | cfs_rq = tg->cfs_rq[0]; | 7665 | BUG_ON(!cfs_rq); |
7053 | 7666 | ||
7054 | /* wait for possible concurrent references to cfs_rqs complete */ | 7667 | /* wait for possible concurrent references to cfs_rqs complete */ |
7055 | call_rcu(&cfs_rq->rcu, free_sched_group); | 7668 | call_rcu(&tg->rcu, free_sched_group_rcu); |
7056 | } | 7669 | } |
7057 | 7670 | ||
7058 | /* change task's runqueue when it moves between groups. | 7671 | /* change task's runqueue when it moves between groups. |
@@ -7068,12 +7681,9 @@ void sched_move_task(struct task_struct *tsk) | |||
7068 | 7681 | ||
7069 | rq = task_rq_lock(tsk, &flags); | 7682 | rq = task_rq_lock(tsk, &flags); |
7070 | 7683 | ||
7071 | if (tsk->sched_class != &fair_sched_class) | ||
7072 | goto done; | ||
7073 | |||
7074 | update_rq_clock(rq); | 7684 | update_rq_clock(rq); |
7075 | 7685 | ||
7076 | running = task_running(rq, tsk); | 7686 | running = task_current(rq, tsk); |
7077 | on_rq = tsk->se.on_rq; | 7687 | on_rq = tsk->se.on_rq; |
7078 | 7688 | ||
7079 | if (on_rq) { | 7689 | if (on_rq) { |
@@ -7082,7 +7692,7 @@ void sched_move_task(struct task_struct *tsk) | |||
7082 | tsk->sched_class->put_prev_task(rq, tsk); | 7692 | tsk->sched_class->put_prev_task(rq, tsk); |
7083 | } | 7693 | } |
7084 | 7694 | ||
7085 | set_task_cfs_rq(tsk); | 7695 | set_task_rq(tsk, task_cpu(tsk)); |
7086 | 7696 | ||
7087 | if (on_rq) { | 7697 | if (on_rq) { |
7088 | if (unlikely(running)) | 7698 | if (unlikely(running)) |
@@ -7090,45 +7700,82 @@ void sched_move_task(struct task_struct *tsk) | |||
7090 | enqueue_task(rq, tsk, 0); | 7700 | enqueue_task(rq, tsk, 0); |
7091 | } | 7701 | } |
7092 | 7702 | ||
7093 | done: | ||
7094 | task_rq_unlock(rq, &flags); | 7703 | task_rq_unlock(rq, &flags); |
7095 | } | 7704 | } |
7096 | 7705 | ||
7706 | /* rq->lock to be locked by caller */ | ||
7097 | static void set_se_shares(struct sched_entity *se, unsigned long shares) | 7707 | static void set_se_shares(struct sched_entity *se, unsigned long shares) |
7098 | { | 7708 | { |
7099 | struct cfs_rq *cfs_rq = se->cfs_rq; | 7709 | struct cfs_rq *cfs_rq = se->cfs_rq; |
7100 | struct rq *rq = cfs_rq->rq; | 7710 | struct rq *rq = cfs_rq->rq; |
7101 | int on_rq; | 7711 | int on_rq; |
7102 | 7712 | ||
7103 | spin_lock_irq(&rq->lock); | 7713 | if (!shares) |
7714 | shares = MIN_GROUP_SHARES; | ||
7104 | 7715 | ||
7105 | on_rq = se->on_rq; | 7716 | on_rq = se->on_rq; |
7106 | if (on_rq) | 7717 | if (on_rq) { |
7107 | dequeue_entity(cfs_rq, se, 0); | 7718 | dequeue_entity(cfs_rq, se, 0); |
7719 | dec_cpu_load(rq, se->load.weight); | ||
7720 | } | ||
7108 | 7721 | ||
7109 | se->load.weight = shares; | 7722 | se->load.weight = shares; |
7110 | se->load.inv_weight = div64_64((1ULL<<32), shares); | 7723 | se->load.inv_weight = div64_64((1ULL<<32), shares); |
7111 | 7724 | ||
7112 | if (on_rq) | 7725 | if (on_rq) { |
7113 | enqueue_entity(cfs_rq, se, 0); | 7726 | enqueue_entity(cfs_rq, se, 0); |
7114 | 7727 | inc_cpu_load(rq, se->load.weight); | |
7115 | spin_unlock_irq(&rq->lock); | 7728 | } |
7116 | } | 7729 | } |
7117 | 7730 | ||
7118 | int sched_group_set_shares(struct task_group *tg, unsigned long shares) | 7731 | int sched_group_set_shares(struct task_group *tg, unsigned long shares) |
7119 | { | 7732 | { |
7120 | int i; | 7733 | int i; |
7734 | struct cfs_rq *cfs_rq; | ||
7735 | struct rq *rq; | ||
7121 | 7736 | ||
7122 | spin_lock(&tg->lock); | 7737 | lock_task_group_list(); |
7123 | if (tg->shares == shares) | 7738 | if (tg->shares == shares) |
7124 | goto done; | 7739 | goto done; |
7125 | 7740 | ||
7741 | if (shares < MIN_GROUP_SHARES) | ||
7742 | shares = MIN_GROUP_SHARES; | ||
7743 | |||
7744 | /* | ||
7745 | * Prevent any load balance activity (rebalance_shares, | ||
7746 | * load_balance_fair) from referring to this group first, | ||
7747 | * by taking it off the rq->leaf_cfs_rq_list on each cpu. | ||
7748 | */ | ||
7749 | for_each_possible_cpu(i) { | ||
7750 | cfs_rq = tg->cfs_rq[i]; | ||
7751 | list_del_rcu(&cfs_rq->leaf_cfs_rq_list); | ||
7752 | } | ||
7753 | |||
7754 | /* wait for any ongoing reference to this group to finish */ | ||
7755 | synchronize_sched(); | ||
7756 | |||
7757 | /* | ||
7758 | * Now we are free to modify the group's share on each cpu | ||
7759 | * w/o tripping rebalance_share or load_balance_fair. | ||
7760 | */ | ||
7126 | tg->shares = shares; | 7761 | tg->shares = shares; |
7127 | for_each_possible_cpu(i) | 7762 | for_each_possible_cpu(i) { |
7763 | spin_lock_irq(&cpu_rq(i)->lock); | ||
7128 | set_se_shares(tg->se[i], shares); | 7764 | set_se_shares(tg->se[i], shares); |
7765 | spin_unlock_irq(&cpu_rq(i)->lock); | ||
7766 | } | ||
7129 | 7767 | ||
7768 | /* | ||
7769 | * Enable load balance activity on this group, by inserting it back on | ||
7770 | * each cpu's rq->leaf_cfs_rq_list. | ||
7771 | */ | ||
7772 | for_each_possible_cpu(i) { | ||
7773 | rq = cpu_rq(i); | ||
7774 | cfs_rq = tg->cfs_rq[i]; | ||
7775 | list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); | ||
7776 | } | ||
7130 | done: | 7777 | done: |
7131 | spin_unlock(&tg->lock); | 7778 | unlock_task_group_list(); |
7132 | return 0; | 7779 | return 0; |
7133 | } | 7780 | } |
7134 | 7781 | ||
@@ -7137,6 +7784,31 @@ unsigned long sched_group_shares(struct task_group *tg) | |||
7137 | return tg->shares; | 7784 | return tg->shares; |
7138 | } | 7785 | } |
7139 | 7786 | ||
7787 | /* | ||
7788 | * Ensure the total rt_ratio <= sysctl_sched_rt_ratio | ||
7789 | */ | ||
7790 | int sched_group_set_rt_ratio(struct task_group *tg, unsigned long rt_ratio) | ||
7791 | { | ||
7792 | struct task_group *tgi; | ||
7793 | unsigned long total = 0; | ||
7794 | |||
7795 | rcu_read_lock(); | ||
7796 | list_for_each_entry_rcu(tgi, &task_groups, list) | ||
7797 | total += tgi->rt_ratio; | ||
7798 | rcu_read_unlock(); | ||
7799 | |||
7800 | if (total + rt_ratio - tg->rt_ratio > sysctl_sched_rt_ratio) | ||
7801 | return -EINVAL; | ||
7802 | |||
7803 | tg->rt_ratio = rt_ratio; | ||
7804 | return 0; | ||
7805 | } | ||
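A worked check of the admission test above, with made-up numbers: if sysctl_sched_rt_ratio allowed a total of 62 and two other groups already held rt_ratio values of 20 and 30 (total = 50, counting this group's current 0), then raising this group to 15 would fail (50 + 15 - 0 = 65 > 62) and sched_group_set_rt_ratio() would return -EINVAL, while raising it to 10 would succeed and leave 2 units of headroom.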
7806 | |||
7807 | unsigned long sched_group_rt_ratio(struct task_group *tg) | ||
7808 | { | ||
7809 | return tg->rt_ratio; | ||
7810 | } | ||
7811 | |||
7140 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 7812 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
7141 | 7813 | ||
7142 | #ifdef CONFIG_FAIR_CGROUP_SCHED | 7814 | #ifdef CONFIG_FAIR_CGROUP_SCHED |
@@ -7173,16 +7845,17 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp) | |||
7173 | return &tg->css; | 7845 | return &tg->css; |
7174 | } | 7846 | } |
7175 | 7847 | ||
7176 | static void cpu_cgroup_destroy(struct cgroup_subsys *ss, | 7848 | static void |
7177 | struct cgroup *cgrp) | 7849 | cpu_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp) |
7178 | { | 7850 | { |
7179 | struct task_group *tg = cgroup_tg(cgrp); | 7851 | struct task_group *tg = cgroup_tg(cgrp); |
7180 | 7852 | ||
7181 | sched_destroy_group(tg); | 7853 | sched_destroy_group(tg); |
7182 | } | 7854 | } |
7183 | 7855 | ||
7184 | static int cpu_cgroup_can_attach(struct cgroup_subsys *ss, | 7856 | static int |
7185 | struct cgroup *cgrp, struct task_struct *tsk) | 7857 | cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, |
7858 | struct task_struct *tsk) | ||
7186 | { | 7859 | { |
7187 | /* We don't support RT-tasks being in separate groups */ | 7860 | /* We don't support RT-tasks being in separate groups */ |
7188 | if (tsk->sched_class != &fair_sched_class) | 7861 | if (tsk->sched_class != &fair_sched_class) |
@@ -7211,26 +7884,169 @@ static u64 cpu_shares_read_uint(struct cgroup *cgrp, struct cftype *cft) | |||
7211 | return (u64) tg->shares; | 7884 | return (u64) tg->shares; |
7212 | } | 7885 | } |
7213 | 7886 | ||
7214 | static struct cftype cpu_shares = { | 7887 | static int cpu_rt_ratio_write_uint(struct cgroup *cgrp, struct cftype *cftype, |
7215 | .name = "shares", | 7888 | u64 rt_ratio_val) |
7216 | .read_uint = cpu_shares_read_uint, | 7889 | { |
7217 | .write_uint = cpu_shares_write_uint, | 7890 | return sched_group_set_rt_ratio(cgroup_tg(cgrp), rt_ratio_val); |
7891 | } | ||
7892 | |||
7893 | static u64 cpu_rt_ratio_read_uint(struct cgroup *cgrp, struct cftype *cft) | ||
7894 | { | ||
7895 | struct task_group *tg = cgroup_tg(cgrp); | ||
7896 | |||
7897 | return (u64) tg->rt_ratio; | ||
7898 | } | ||
7899 | |||
7900 | static struct cftype cpu_files[] = { | ||
7901 | { | ||
7902 | .name = "shares", | ||
7903 | .read_uint = cpu_shares_read_uint, | ||
7904 | .write_uint = cpu_shares_write_uint, | ||
7905 | }, | ||
7906 | { | ||
7907 | .name = "rt_ratio", | ||
7908 | .read_uint = cpu_rt_ratio_read_uint, | ||
7909 | .write_uint = cpu_rt_ratio_write_uint, | ||
7910 | }, | ||
7218 | }; | 7911 | }; |
7219 | 7912 | ||
7220 | static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont) | 7913 | static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont) |
7221 | { | 7914 | { |
7222 | return cgroup_add_file(cont, ss, &cpu_shares); | 7915 | return cgroup_add_files(cont, ss, cpu_files, ARRAY_SIZE(cpu_files)); |
7223 | } | 7916 | } |
7224 | 7917 | ||
7225 | struct cgroup_subsys cpu_cgroup_subsys = { | 7918 | struct cgroup_subsys cpu_cgroup_subsys = { |
7226 | .name = "cpu", | 7919 | .name = "cpu", |
7227 | .create = cpu_cgroup_create, | 7920 | .create = cpu_cgroup_create, |
7228 | .destroy = cpu_cgroup_destroy, | 7921 | .destroy = cpu_cgroup_destroy, |
7229 | .can_attach = cpu_cgroup_can_attach, | 7922 | .can_attach = cpu_cgroup_can_attach, |
7230 | .attach = cpu_cgroup_attach, | 7923 | .attach = cpu_cgroup_attach, |
7231 | .populate = cpu_cgroup_populate, | 7924 | .populate = cpu_cgroup_populate, |
7232 | .subsys_id = cpu_cgroup_subsys_id, | 7925 | .subsys_id = cpu_cgroup_subsys_id, |
7233 | .early_init = 1, | 7926 | .early_init = 1, |
7234 | }; | 7927 | }; |
7235 | 7928 | ||
7236 | #endif /* CONFIG_FAIR_CGROUP_SCHED */ | 7929 | #endif /* CONFIG_FAIR_CGROUP_SCHED */ |
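For context on how the two cftype entries above are consumed (a sketch, not part of the patch): each group created under a mounted cpu-controller hierarchy exposes them as cpu.shares and cpu.rt_ratio files. A minimal userspace snippet follows, assuming the hierarchy is mounted at /dev/cgroup and a group named "browsers" already exists; both the mount point and the group name are assumptions.

	/* sketch: bump a group's weight to twice the 1024 (NICE_0_LOAD) default; paths are assumed */
	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>

	int main(void)
	{
		const char *path = "/dev/cgroup/browsers/cpu.shares";	/* assumed mount point */
		int fd = open(path, O_WRONLY);

		if (fd < 0) {
			perror("open");
			return 1;
		}
		/* the write is handled by cpu_shares_write_uint() registered above */
		if (write(fd, "2048", strlen("2048")) < 0)
			perror("write");
		close(fd);
		return 0;
	}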
7930 | |||
7931 | #ifdef CONFIG_CGROUP_CPUACCT | ||
7932 | |||
7933 | /* | ||
7934 | * CPU accounting code for task groups. | ||
7935 | * | ||
7936 | * Based on the work by Paul Menage (menage@google.com) and Balbir Singh | ||
7937 | * (balbir@in.ibm.com). | ||
7938 | */ | ||
7939 | |||
7940 | /* track cpu usage of a group of tasks */ | ||
7941 | struct cpuacct { | ||
7942 | struct cgroup_subsys_state css; | ||
7943 | /* cpuusage holds pointer to a u64-type object on every cpu */ | ||
7944 | u64 *cpuusage; | ||
7945 | }; | ||
7946 | |||
7947 | struct cgroup_subsys cpuacct_subsys; | ||
7948 | |||
7949 | /* return cpu accounting group corresponding to this container */ | ||
7950 | static inline struct cpuacct *cgroup_ca(struct cgroup *cont) | ||
7951 | { | ||
7952 | return container_of(cgroup_subsys_state(cont, cpuacct_subsys_id), | ||
7953 | struct cpuacct, css); | ||
7954 | } | ||
7955 | |||
7956 | /* return cpu accounting group to which this task belongs */ | ||
7957 | static inline struct cpuacct *task_ca(struct task_struct *tsk) | ||
7958 | { | ||
7959 | return container_of(task_subsys_state(tsk, cpuacct_subsys_id), | ||
7960 | struct cpuacct, css); | ||
7961 | } | ||
7962 | |||
7963 | /* create a new cpu accounting group */ | ||
7964 | static struct cgroup_subsys_state *cpuacct_create( | ||
7965 | struct cgroup_subsys *ss, struct cgroup *cont) | ||
7966 | { | ||
7967 | struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL); | ||
7968 | |||
7969 | if (!ca) | ||
7970 | return ERR_PTR(-ENOMEM); | ||
7971 | |||
7972 | ca->cpuusage = alloc_percpu(u64); | ||
7973 | if (!ca->cpuusage) { | ||
7974 | kfree(ca); | ||
7975 | return ERR_PTR(-ENOMEM); | ||
7976 | } | ||
7977 | |||
7978 | return &ca->css; | ||
7979 | } | ||
7980 | |||
7981 | /* destroy an existing cpu accounting group */ | ||
7982 | static void | ||
7983 | cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cont) | ||
7984 | { | ||
7985 | struct cpuacct *ca = cgroup_ca(cont); | ||
7986 | |||
7987 | free_percpu(ca->cpuusage); | ||
7988 | kfree(ca); | ||
7989 | } | ||
7990 | |||
7991 | /* return total cpu usage (in nanoseconds) of a group */ | ||
7992 | static u64 cpuusage_read(struct cgroup *cont, struct cftype *cft) | ||
7993 | { | ||
7994 | struct cpuacct *ca = cgroup_ca(cont); | ||
7995 | u64 totalcpuusage = 0; | ||
7996 | int i; | ||
7997 | |||
7998 | for_each_possible_cpu(i) { | ||
7999 | u64 *cpuusage = percpu_ptr(ca->cpuusage, i); | ||
8000 | |||
8001 | /* | ||
8002 | * Take rq->lock to make 64-bit addition safe on 32-bit | ||
8003 | * platforms. | ||
8004 | */ | ||
8005 | spin_lock_irq(&cpu_rq(i)->lock); | ||
8006 | totalcpuusage += *cpuusage; | ||
8007 | spin_unlock_irq(&cpu_rq(i)->lock); | ||
8008 | } | ||
8009 | |||
8010 | return totalcpuusage; | ||
8011 | } | ||
8012 | |||
8013 | static struct cftype files[] = { | ||
8014 | { | ||
8015 | .name = "usage", | ||
8016 | .read_uint = cpuusage_read, | ||
8017 | }, | ||
8018 | }; | ||
8019 | |||
8020 | static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cont) | ||
8021 | { | ||
8022 | return cgroup_add_files(cont, ss, files, ARRAY_SIZE(files)); | ||
8023 | } | ||
8024 | |||
8025 | /* | ||
8026 | * charge this task's execution time to its accounting group. | ||
8027 | * | ||
8028 | * called with rq->lock held. | ||
8029 | */ | ||
8030 | static void cpuacct_charge(struct task_struct *tsk, u64 cputime) | ||
8031 | { | ||
8032 | struct cpuacct *ca; | ||
8033 | |||
8034 | if (!cpuacct_subsys.active) | ||
8035 | return; | ||
8036 | |||
8037 | ca = task_ca(tsk); | ||
8038 | if (ca) { | ||
8039 | u64 *cpuusage = percpu_ptr(ca->cpuusage, task_cpu(tsk)); | ||
8040 | |||
8041 | *cpuusage += cputime; | ||
8042 | } | ||
8043 | } | ||
8044 | |||
8045 | struct cgroup_subsys cpuacct_subsys = { | ||
8046 | .name = "cpuacct", | ||
8047 | .create = cpuacct_create, | ||
8048 | .destroy = cpuacct_destroy, | ||
8049 | .populate = cpuacct_populate, | ||
8050 | .subsys_id = cpuacct_subsys_id, | ||
8051 | }; | ||
8052 | #endif /* CONFIG_CGROUP_CPUACCT */ | ||
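By the same token, the cpuacct subsystem's single "usage" file shows up as cpuacct.usage in each group directory and reports the nanoseconds accumulated by cpuacct_charge() and summed by cpuusage_read(). A small reader sketch, again assuming a mount point (/dev/cpuacct) and group name that are not part of the patch:

	/* sketch: print a group's accumulated cpu time in nanoseconds; paths are assumed */
	#include <stdio.h>

	int main(void)
	{
		unsigned long long ns = 0;
		FILE *f = fopen("/dev/cpuacct/browsers/cpuacct.usage", "r");

		if (!f) {
			perror("fopen");
			return 1;
		}
		if (fscanf(f, "%llu", &ns) == 1)
			printf("group cpu time: %llu ns (%.3f s)\n", ns, ns / 1e9);
		fclose(f);
		return 0;
	}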
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index e6fb392e5164..4b5e24cf2f4a 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c | |||
@@ -31,9 +31,9 @@ | |||
31 | /* | 31 | /* |
32 | * Ease the printing of nsec fields: | 32 | * Ease the printing of nsec fields: |
33 | */ | 33 | */ |
34 | static long long nsec_high(long long nsec) | 34 | static long long nsec_high(unsigned long long nsec) |
35 | { | 35 | { |
36 | if (nsec < 0) { | 36 | if ((long long)nsec < 0) { |
37 | nsec = -nsec; | 37 | nsec = -nsec; |
38 | do_div(nsec, 1000000); | 38 | do_div(nsec, 1000000); |
39 | return -nsec; | 39 | return -nsec; |
@@ -43,9 +43,9 @@ static long long nsec_high(long long nsec) | |||
43 | return nsec; | 43 | return nsec; |
44 | } | 44 | } |
45 | 45 | ||
46 | static unsigned long nsec_low(long long nsec) | 46 | static unsigned long nsec_low(unsigned long long nsec) |
47 | { | 47 | { |
48 | if (nsec < 0) | 48 | if ((long long)nsec < 0) |
49 | nsec = -nsec; | 49 | nsec = -nsec; |
50 | 50 | ||
51 | return do_div(nsec, 1000000); | 51 | return do_div(nsec, 1000000); |
@@ -80,6 +80,7 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) | |||
80 | static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) | 80 | static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) |
81 | { | 81 | { |
82 | struct task_struct *g, *p; | 82 | struct task_struct *g, *p; |
83 | unsigned long flags; | ||
83 | 84 | ||
84 | SEQ_printf(m, | 85 | SEQ_printf(m, |
85 | "\nrunnable tasks:\n" | 86 | "\nrunnable tasks:\n" |
@@ -88,7 +89,7 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) | |||
88 | "------------------------------------------------------" | 89 | "------------------------------------------------------" |
89 | "----------------------------------------------------\n"); | 90 | "----------------------------------------------------\n"); |
90 | 91 | ||
91 | read_lock_irq(&tasklist_lock); | 92 | read_lock_irqsave(&tasklist_lock, flags); |
92 | 93 | ||
93 | do_each_thread(g, p) { | 94 | do_each_thread(g, p) { |
94 | if (!p->se.on_rq || task_cpu(p) != rq_cpu) | 95 | if (!p->se.on_rq || task_cpu(p) != rq_cpu) |
@@ -97,7 +98,7 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) | |||
97 | print_task(m, rq, p); | 98 | print_task(m, rq, p); |
98 | } while_each_thread(g, p); | 99 | } while_each_thread(g, p); |
99 | 100 | ||
100 | read_unlock_irq(&tasklist_lock); | 101 | read_unlock_irqrestore(&tasklist_lock, flags); |
101 | } | 102 | } |
102 | 103 | ||
103 | void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) | 104 | void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) |
@@ -178,6 +179,7 @@ static void print_cpu(struct seq_file *m, int cpu) | |||
178 | PN(prev_clock_raw); | 179 | PN(prev_clock_raw); |
179 | P(clock_warps); | 180 | P(clock_warps); |
180 | P(clock_overflows); | 181 | P(clock_overflows); |
182 | P(clock_underflows); | ||
181 | P(clock_deep_idle_events); | 183 | P(clock_deep_idle_events); |
182 | PN(clock_max_delta); | 184 | PN(clock_max_delta); |
183 | P(cpu_load[0]); | 185 | P(cpu_load[0]); |
@@ -198,7 +200,7 @@ static int sched_debug_show(struct seq_file *m, void *v) | |||
198 | u64 now = ktime_to_ns(ktime_get()); | 200 | u64 now = ktime_to_ns(ktime_get()); |
199 | int cpu; | 201 | int cpu; |
200 | 202 | ||
201 | SEQ_printf(m, "Sched Debug Version: v0.06-v22, %s %.*s\n", | 203 | SEQ_printf(m, "Sched Debug Version: v0.07, %s %.*s\n", |
202 | init_utsname()->release, | 204 | init_utsname()->release, |
203 | (int)strcspn(init_utsname()->version, " "), | 205 | (int)strcspn(init_utsname()->version, " "), |
204 | init_utsname()->version); | 206 | init_utsname()->version); |
@@ -210,7 +212,7 @@ static int sched_debug_show(struct seq_file *m, void *v) | |||
210 | #define PN(x) \ | 212 | #define PN(x) \ |
211 | SEQ_printf(m, " .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x)) | 213 | SEQ_printf(m, " .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x)) |
212 | PN(sysctl_sched_latency); | 214 | PN(sysctl_sched_latency); |
213 | PN(sysctl_sched_nr_latency); | 215 | PN(sysctl_sched_min_granularity); |
214 | PN(sysctl_sched_wakeup_granularity); | 216 | PN(sysctl_sched_wakeup_granularity); |
215 | PN(sysctl_sched_batch_wakeup_granularity); | 217 | PN(sysctl_sched_batch_wakeup_granularity); |
216 | PN(sysctl_sched_child_runs_first); | 218 | PN(sysctl_sched_child_runs_first); |
@@ -298,6 +300,8 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) | |||
298 | PN(se.exec_max); | 300 | PN(se.exec_max); |
299 | PN(se.slice_max); | 301 | PN(se.slice_max); |
300 | PN(se.wait_max); | 302 | PN(se.wait_max); |
303 | PN(se.wait_sum); | ||
304 | P(se.wait_count); | ||
301 | P(sched_info.bkl_count); | 305 | P(sched_info.bkl_count); |
302 | P(se.nr_migrations); | 306 | P(se.nr_migrations); |
303 | P(se.nr_migrations_cold); | 307 | P(se.nr_migrations_cold); |
@@ -326,10 +330,12 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) | |||
326 | avg_atom = -1LL; | 330 | avg_atom = -1LL; |
327 | 331 | ||
328 | avg_per_cpu = p->se.sum_exec_runtime; | 332 | avg_per_cpu = p->se.sum_exec_runtime; |
329 | if (p->se.nr_migrations) | 333 | if (p->se.nr_migrations) { |
330 | avg_per_cpu = div64_64(avg_per_cpu, p->se.nr_migrations); | 334 | avg_per_cpu = div64_64(avg_per_cpu, |
331 | else | 335 | p->se.nr_migrations); |
336 | } else { | ||
332 | avg_per_cpu = -1LL; | 337 | avg_per_cpu = -1LL; |
338 | } | ||
333 | 339 | ||
334 | __PN(avg_atom); | 340 | __PN(avg_atom); |
335 | __PN(avg_per_cpu); | 341 | __PN(avg_per_cpu); |
@@ -363,6 +369,8 @@ void proc_sched_set_task(struct task_struct *p) | |||
363 | { | 369 | { |
364 | #ifdef CONFIG_SCHEDSTATS | 370 | #ifdef CONFIG_SCHEDSTATS |
365 | p->se.wait_max = 0; | 371 | p->se.wait_max = 0; |
372 | p->se.wait_sum = 0; | ||
373 | p->se.wait_count = 0; | ||
366 | p->se.sleep_max = 0; | 374 | p->se.sleep_max = 0; |
367 | p->se.sum_sleep_runtime = 0; | 375 | p->se.sum_sleep_runtime = 0; |
368 | p->se.block_max = 0; | 376 | p->se.block_max = 0; |
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 9971831b560e..6c091d6e159d 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c | |||
@@ -20,9 +20,11 @@ | |||
20 | * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> | 20 | * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> |
21 | */ | 21 | */ |
22 | 22 | ||
23 | #include <linux/latencytop.h> | ||
24 | |||
23 | /* | 25 | /* |
24 | * Targeted preemption latency for CPU-bound tasks: | 26 | * Targeted preemption latency for CPU-bound tasks: |
25 | * (default: 20ms, units: nanoseconds) | 27 | * (default: 20ms * (1 + ilog(ncpus)), units: nanoseconds) |
26 | * | 28 | * |
27 | * NOTE: this latency value is not the same as the concept of | 29 | * NOTE: this latency value is not the same as the concept of |
28 | * 'timeslice length' - timeslices in CFS are of variable length | 30 | * 'timeslice length' - timeslices in CFS are of variable length |
@@ -32,19 +34,24 @@ | |||
32 | * (to see the precise effective timeslice length of your workload, | 34 | * (to see the precise effective timeslice length of your workload, |
33 | * run vmstat and monitor the context-switches (cs) field) | 35 | * run vmstat and monitor the context-switches (cs) field) |
34 | */ | 36 | */ |
35 | const_debug unsigned int sysctl_sched_latency = 20000000ULL; | 37 | unsigned int sysctl_sched_latency = 20000000ULL; |
36 | 38 | ||
37 | /* | 39 | /* |
38 | * After fork, child runs first. (default) If set to 0 then | 40 | * Minimal preemption granularity for CPU-bound tasks: |
39 | * parent will (try to) run first. | 41 | * (default: 4 msec * (1 + ilog(ncpus)), units: nanoseconds) |
40 | */ | 42 | */ |
41 | const_debug unsigned int sysctl_sched_child_runs_first = 1; | 43 | unsigned int sysctl_sched_min_granularity = 4000000ULL; |
42 | 44 | ||
43 | /* | 45 | /* |
44 | * Minimal preemption granularity for CPU-bound tasks: | 46 | * is kept at sysctl_sched_latency / sysctl_sched_min_granularity |
45 | * (default: 2 msec, units: nanoseconds) | ||
46 | */ | 47 | */ |
47 | const_debug unsigned int sysctl_sched_nr_latency = 20; | 48 | static unsigned int sched_nr_latency = 5; |
49 | |||
50 | /* | ||
51 | * After fork, child runs first. (default) If set to 0 then | ||
52 | * parent will (try to) run first. | ||
53 | */ | ||
54 | const_debug unsigned int sysctl_sched_child_runs_first = 1; | ||
48 | 55 | ||
49 | /* | 56 | /* |
50 | * sys_sched_yield() compat mode | 57 | * sys_sched_yield() compat mode |
@@ -56,23 +63,23 @@ unsigned int __read_mostly sysctl_sched_compat_yield; | |||
56 | 63 | ||
57 | /* | 64 | /* |
58 | * SCHED_BATCH wake-up granularity. | 65 | * SCHED_BATCH wake-up granularity. |
59 | * (default: 10 msec, units: nanoseconds) | 66 | * (default: 10 msec * (1 + ilog(ncpus)), units: nanoseconds) |
60 | * | 67 | * |
61 | * This option delays the preemption effects of decoupled workloads | 68 | * This option delays the preemption effects of decoupled workloads |
62 | * and reduces their over-scheduling. Synchronous workloads will still | 69 | * and reduces their over-scheduling. Synchronous workloads will still |
63 | * have immediate wakeup/sleep latencies. | 70 | * have immediate wakeup/sleep latencies. |
64 | */ | 71 | */ |
65 | const_debug unsigned int sysctl_sched_batch_wakeup_granularity = 10000000UL; | 72 | unsigned int sysctl_sched_batch_wakeup_granularity = 10000000UL; |
66 | 73 | ||
67 | /* | 74 | /* |
68 | * SCHED_OTHER wake-up granularity. | 75 | * SCHED_OTHER wake-up granularity. |
69 | * (default: 10 msec, units: nanoseconds) | 76 | * (default: 10 msec * (1 + ilog(ncpus)), units: nanoseconds) |
70 | * | 77 | * |
71 | * This option delays the preemption effects of decoupled workloads | 78 | * This option delays the preemption effects of decoupled workloads |
72 | * and reduces their over-scheduling. Synchronous workloads will still | 79 | * and reduces their over-scheduling. Synchronous workloads will still |
73 | * have immediate wakeup/sleep latencies. | 80 | * have immediate wakeup/sleep latencies. |
74 | */ | 81 | */ |
75 | const_debug unsigned int sysctl_sched_wakeup_granularity = 10000000UL; | 82 | unsigned int sysctl_sched_wakeup_granularity = 10000000UL; |
76 | 83 | ||
77 | const_debug unsigned int sysctl_sched_migration_cost = 500000UL; | 84 | const_debug unsigned int sysctl_sched_migration_cost = 500000UL; |
78 | 85 | ||
@@ -212,6 +219,22 @@ static inline struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) | |||
212 | * Scheduling class statistics methods: | 219 | * Scheduling class statistics methods: |
213 | */ | 220 | */ |
214 | 221 | ||
222 | #ifdef CONFIG_SCHED_DEBUG | ||
223 | int sched_nr_latency_handler(struct ctl_table *table, int write, | ||
224 | struct file *filp, void __user *buffer, size_t *lenp, | ||
225 | loff_t *ppos) | ||
226 | { | ||
227 | int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); | ||
228 | |||
229 | if (ret || !write) | ||
230 | return ret; | ||
231 | |||
232 | sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency, | ||
233 | sysctl_sched_min_granularity); | ||
234 | |||
235 | return 0; | ||
236 | } | ||
237 | #endif | ||
215 | 238 | ||
216 | /* | 239 | /* |
217 | * The idea is to set a period in which each task runs once. | 240 | * The idea is to set a period in which each task runs once. |
@@ -224,11 +247,11 @@ static inline struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) | |||
224 | static u64 __sched_period(unsigned long nr_running) | 247 | static u64 __sched_period(unsigned long nr_running) |
225 | { | 248 | { |
226 | u64 period = sysctl_sched_latency; | 249 | u64 period = sysctl_sched_latency; |
227 | unsigned long nr_latency = sysctl_sched_nr_latency; | 250 | unsigned long nr_latency = sched_nr_latency; |
228 | 251 | ||
229 | if (unlikely(nr_running > nr_latency)) { | 252 | if (unlikely(nr_running > nr_latency)) { |
253 | period = sysctl_sched_min_granularity; | ||
230 | period *= nr_running; | 254 | period *= nr_running; |
231 | do_div(period, nr_latency); | ||
232 | } | 255 | } |
233 | 256 | ||
234 | return period; | 257 | return period; |
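Worked numbers for the period calculation above, using the base defaults set earlier in this patch (sysctl_sched_latency = 20 ms and sysctl_sched_min_granularity = 4 ms, before the boot-time "* (1 + ilog(ncpus))" scaling mentioned in the comments, so sched_nr_latency = DIV_ROUND_UP(20, 4) = 5): with up to 5 runnable tasks the period stays at the 20 ms latency target, giving equal-weight tasks at least 4 ms each; with 8 runnable tasks the nr_running > nr_latency branch takes over and the period stretches to 4 ms * 8 = 32 ms, holding the per-task slice at the 4 ms floor instead of shrinking it further.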
@@ -259,6 +282,7 @@ static u64 __sched_vslice(unsigned long rq_weight, unsigned long nr_running) | |||
259 | { | 282 | { |
260 | u64 vslice = __sched_period(nr_running); | 283 | u64 vslice = __sched_period(nr_running); |
261 | 284 | ||
285 | vslice *= NICE_0_LOAD; | ||
262 | do_div(vslice, rq_weight); | 286 | do_div(vslice, rq_weight); |
263 | 287 | ||
264 | return vslice; | 288 | return vslice; |
@@ -329,6 +353,12 @@ static void update_curr(struct cfs_rq *cfs_rq) | |||
329 | 353 | ||
330 | __update_curr(cfs_rq, curr, delta_exec); | 354 | __update_curr(cfs_rq, curr, delta_exec); |
331 | curr->exec_start = now; | 355 | curr->exec_start = now; |
356 | |||
357 | if (entity_is_task(curr)) { | ||
358 | struct task_struct *curtask = task_of(curr); | ||
359 | |||
360 | cpuacct_charge(curtask, delta_exec); | ||
361 | } | ||
332 | } | 362 | } |
333 | 363 | ||
334 | static inline void | 364 | static inline void |
@@ -355,6 +385,9 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
355 | { | 385 | { |
356 | schedstat_set(se->wait_max, max(se->wait_max, | 386 | schedstat_set(se->wait_max, max(se->wait_max, |
357 | rq_of(cfs_rq)->clock - se->wait_start)); | 387 | rq_of(cfs_rq)->clock - se->wait_start)); |
388 | schedstat_set(se->wait_count, se->wait_count + 1); | ||
389 | schedstat_set(se->wait_sum, se->wait_sum + | ||
390 | rq_of(cfs_rq)->clock - se->wait_start); | ||
358 | schedstat_set(se->wait_start, 0); | 391 | schedstat_set(se->wait_start, 0); |
359 | } | 392 | } |
360 | 393 | ||
@@ -406,6 +439,7 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
406 | #ifdef CONFIG_SCHEDSTATS | 439 | #ifdef CONFIG_SCHEDSTATS |
407 | if (se->sleep_start) { | 440 | if (se->sleep_start) { |
408 | u64 delta = rq_of(cfs_rq)->clock - se->sleep_start; | 441 | u64 delta = rq_of(cfs_rq)->clock - se->sleep_start; |
442 | struct task_struct *tsk = task_of(se); | ||
409 | 443 | ||
410 | if ((s64)delta < 0) | 444 | if ((s64)delta < 0) |
411 | delta = 0; | 445 | delta = 0; |
@@ -415,9 +449,12 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
415 | 449 | ||
416 | se->sleep_start = 0; | 450 | se->sleep_start = 0; |
417 | se->sum_sleep_runtime += delta; | 451 | se->sum_sleep_runtime += delta; |
452 | |||
453 | account_scheduler_latency(tsk, delta >> 10, 1); | ||
418 | } | 454 | } |
419 | if (se->block_start) { | 455 | if (se->block_start) { |
420 | u64 delta = rq_of(cfs_rq)->clock - se->block_start; | 456 | u64 delta = rq_of(cfs_rq)->clock - se->block_start; |
457 | struct task_struct *tsk = task_of(se); | ||
421 | 458 | ||
422 | if ((s64)delta < 0) | 459 | if ((s64)delta < 0) |
423 | delta = 0; | 460 | delta = 0; |
@@ -434,11 +471,11 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) | |||
434 | * time that the task spent sleeping: | 471 | * time that the task spent sleeping: |
435 | */ | 472 | */ |
436 | if (unlikely(prof_on == SLEEP_PROFILING)) { | 473 | if (unlikely(prof_on == SLEEP_PROFILING)) { |
437 | struct task_struct *tsk = task_of(se); | ||
438 | 474 | ||
439 | profile_hits(SLEEP_PROFILING, (void *)get_wchan(tsk), | 475 | profile_hits(SLEEP_PROFILING, (void *)get_wchan(tsk), |
440 | delta >> 20); | 476 | delta >> 20); |
441 | } | 477 | } |
478 | account_scheduler_latency(tsk, delta >> 10, 0); | ||
442 | } | 479 | } |
443 | #endif | 480 | #endif |
444 | } | 481 | } |
@@ -472,19 +509,25 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) | |||
472 | } else if (sched_feat(APPROX_AVG) && cfs_rq->nr_running) | 509 | } else if (sched_feat(APPROX_AVG) && cfs_rq->nr_running) |
473 | vruntime += sched_vslice(cfs_rq)/2; | 510 | vruntime += sched_vslice(cfs_rq)/2; |
474 | 511 | ||
512 | /* | ||
513 | * The 'current' period is already promised to the current tasks, | ||
514 | * however the extra weight of the new task will slow them down a | ||
515 | * little, place the new task so that it fits in the slot that | ||
516 | * stays open at the end. | ||
517 | */ | ||
475 | if (initial && sched_feat(START_DEBIT)) | 518 | if (initial && sched_feat(START_DEBIT)) |
476 | vruntime += sched_vslice_add(cfs_rq, se); | 519 | vruntime += sched_vslice_add(cfs_rq, se); |
477 | 520 | ||
478 | if (!initial) { | 521 | if (!initial) { |
479 | if (sched_feat(NEW_FAIR_SLEEPERS) && entity_is_task(se) && | 522 | /* sleeps up to a single latency don't count. */ |
480 | task_of(se)->policy != SCHED_BATCH) | 523 | if (sched_feat(NEW_FAIR_SLEEPERS)) |
481 | vruntime -= sysctl_sched_latency; | 524 | vruntime -= sysctl_sched_latency; |
482 | 525 | ||
483 | vruntime = max_t(s64, vruntime, se->vruntime); | 526 | /* ensure we never gain time by being placed backwards. */ |
527 | vruntime = max_vruntime(se->vruntime, vruntime); | ||
484 | } | 528 | } |
485 | 529 | ||
486 | se->vruntime = vruntime; | 530 | se->vruntime = vruntime; |
487 | |||
488 | } | 531 | } |
489 | 532 | ||
490 | static void | 533 | static void |
@@ -517,7 +560,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) | |||
517 | 560 | ||
518 | update_stats_dequeue(cfs_rq, se); | 561 | update_stats_dequeue(cfs_rq, se); |
519 | if (sleep) { | 562 | if (sleep) { |
520 | se->peer_preempt = 0; | ||
521 | #ifdef CONFIG_SCHEDSTATS | 563 | #ifdef CONFIG_SCHEDSTATS |
522 | if (entity_is_task(se)) { | 564 | if (entity_is_task(se)) { |
523 | struct task_struct *tsk = task_of(se); | 565 | struct task_struct *tsk = task_of(se); |
@@ -545,10 +587,8 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) | |||
545 | 587 | ||
546 | ideal_runtime = sched_slice(cfs_rq, curr); | 588 | ideal_runtime = sched_slice(cfs_rq, curr); |
547 | delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; | 589 | delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; |
548 | if (delta_exec > ideal_runtime || | 590 | if (delta_exec > ideal_runtime) |
549 | (sched_feat(PREEMPT_RESTRICT) && curr->peer_preempt)) | ||
550 | resched_task(rq_of(cfs_rq)->curr); | 591 | resched_task(rq_of(cfs_rq)->curr); |
551 | curr->peer_preempt = 0; | ||
552 | } | 592 | } |
553 | 593 | ||
554 | static void | 594 | static void |
@@ -611,13 +651,29 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) | |||
611 | cfs_rq->curr = NULL; | 651 | cfs_rq->curr = NULL; |
612 | } | 652 | } |
613 | 653 | ||
614 | static void entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) | 654 | static void |
655 | entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) | ||
615 | { | 656 | { |
616 | /* | 657 | /* |
617 | * Update run-time statistics of the 'current'. | 658 | * Update run-time statistics of the 'current'. |
618 | */ | 659 | */ |
619 | update_curr(cfs_rq); | 660 | update_curr(cfs_rq); |
620 | 661 | ||
662 | #ifdef CONFIG_SCHED_HRTICK | ||
663 | /* | ||
664 | * queued ticks are scheduled to match the slice, so don't bother | ||
665 | * validating it and just reschedule. | ||
666 | */ | ||
667 | if (queued) | ||
668 | return resched_task(rq_of(cfs_rq)->curr); | ||
669 | /* | ||
670 | * don't let the period tick interfere with the hrtick preemption | ||
671 | */ | ||
672 | if (!sched_feat(DOUBLE_TICK) && | ||
673 | hrtimer_active(&rq_of(cfs_rq)->hrtick_timer)) | ||
674 | return; | ||
675 | #endif | ||
676 | |||
621 | if (cfs_rq->nr_running > 1 || !sched_feat(WAKEUP_PREEMPT)) | 677 | if (cfs_rq->nr_running > 1 || !sched_feat(WAKEUP_PREEMPT)) |
622 | check_preempt_tick(cfs_rq, curr); | 678 | check_preempt_tick(cfs_rq, curr); |
623 | } | 679 | } |
@@ -659,7 +715,7 @@ static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu) | |||
659 | 715 | ||
660 | /* Iterate thr' all leaf cfs_rq's on a runqueue */ | 716 | /* Iterate thr' all leaf cfs_rq's on a runqueue */ |
661 | #define for_each_leaf_cfs_rq(rq, cfs_rq) \ | 717 | #define for_each_leaf_cfs_rq(rq, cfs_rq) \ |
662 | list_for_each_entry(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list) | 718 | list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list) |
663 | 719 | ||
664 | /* Do the two (enqueued) entities belong to the same group ? */ | 720 | /* Do the two (enqueued) entities belong to the same group ? */ |
665 | static inline int | 721 | static inline int |
@@ -676,6 +732,8 @@ static inline struct sched_entity *parent_entity(struct sched_entity *se) | |||
676 | return se->parent; | 732 | return se->parent; |
677 | } | 733 | } |
678 | 734 | ||
735 | #define GROUP_IMBALANCE_PCT 20 | ||
736 | |||
679 | #else /* CONFIG_FAIR_GROUP_SCHED */ | 737 | #else /* CONFIG_FAIR_GROUP_SCHED */ |
680 | 738 | ||
681 | #define for_each_sched_entity(se) \ | 739 | #define for_each_sched_entity(se) \ |
@@ -721,6 +779,43 @@ static inline struct sched_entity *parent_entity(struct sched_entity *se) | |||
721 | 779 | ||
722 | #endif /* CONFIG_FAIR_GROUP_SCHED */ | 780 | #endif /* CONFIG_FAIR_GROUP_SCHED */ |
723 | 781 | ||
782 | #ifdef CONFIG_SCHED_HRTICK | ||
783 | static void hrtick_start_fair(struct rq *rq, struct task_struct *p) | ||
784 | { | ||
785 | int requeue = rq->curr == p; | ||
786 | struct sched_entity *se = &p->se; | ||
787 | struct cfs_rq *cfs_rq = cfs_rq_of(se); | ||
788 | |||
789 | WARN_ON(task_rq(p) != rq); | ||
790 | |||
791 | if (hrtick_enabled(rq) && cfs_rq->nr_running > 1) { | ||
792 | u64 slice = sched_slice(cfs_rq, se); | ||
793 | u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime; | ||
794 | s64 delta = slice - ran; | ||
795 | |||
796 | if (delta < 0) { | ||
797 | if (rq->curr == p) | ||
798 | resched_task(p); | ||
799 | return; | ||
800 | } | ||
801 | |||
802 | /* | ||
803 | * Don't schedule slices shorter than 10000ns, that just | ||
804 | * doesn't make sense. Rely on vruntime for fairness. | ||
805 | */ | ||
806 | if (!requeue) | ||
807 | delta = max(10000LL, delta); | ||
808 | |||
809 | hrtick_start(rq, delta, requeue); | ||
810 | } | ||
811 | } | ||
812 | #else | ||
813 | static inline void | ||
814 | hrtick_start_fair(struct rq *rq, struct task_struct *p) | ||
815 | { | ||
816 | } | ||
817 | #endif | ||
818 | |||
724 | /* | 819 | /* |
725 | * The enqueue_task method is called before nr_running is | 820 | * The enqueue_task method is called before nr_running is |
726 | * increased. Here we update the fair scheduling stats and | 821 | * increased. Here we update the fair scheduling stats and |
@@ -729,15 +824,28 @@ static inline struct sched_entity *parent_entity(struct sched_entity *se) | |||
729 | static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup) | 824 | static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup) |
730 | { | 825 | { |
731 | struct cfs_rq *cfs_rq; | 826 | struct cfs_rq *cfs_rq; |
732 | struct sched_entity *se = &p->se; | 827 | struct sched_entity *se = &p->se, |
828 | *topse = NULL; /* Highest schedulable entity */ | ||
829 | int incload = 1; | ||
733 | 830 | ||
734 | for_each_sched_entity(se) { | 831 | for_each_sched_entity(se) { |
735 | if (se->on_rq) | 832 | topse = se; |
833 | if (se->on_rq) { | ||
834 | incload = 0; | ||
736 | break; | 835 | break; |
836 | } | ||
737 | cfs_rq = cfs_rq_of(se); | 837 | cfs_rq = cfs_rq_of(se); |
738 | enqueue_entity(cfs_rq, se, wakeup); | 838 | enqueue_entity(cfs_rq, se, wakeup); |
739 | wakeup = 1; | 839 | wakeup = 1; |
740 | } | 840 | } |
841 | /* Increment cpu load if we just enqueued the first task of a group on | ||
842 | * 'rq->cpu'. 'topse' represents the group to which task 'p' belongs | ||
843 | * at the highest grouping level. | ||
844 | */ | ||
845 | if (incload) | ||
846 | inc_cpu_load(rq, topse->load.weight); | ||
847 | |||
848 | hrtick_start_fair(rq, rq->curr); | ||
741 | } | 849 | } |
742 | 850 | ||
743 | /* | 851 | /* |
@@ -748,16 +856,30 @@ static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup) | |||
748 | static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep) | 856 | static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep) |
749 | { | 857 | { |
750 | struct cfs_rq *cfs_rq; | 858 | struct cfs_rq *cfs_rq; |
751 | struct sched_entity *se = &p->se; | 859 | struct sched_entity *se = &p->se, |
860 | *topse = NULL; /* Highest schedulable entity */ | ||
861 | int decload = 1; | ||
752 | 862 | ||
753 | for_each_sched_entity(se) { | 863 | for_each_sched_entity(se) { |
864 | topse = se; | ||
754 | cfs_rq = cfs_rq_of(se); | 865 | cfs_rq = cfs_rq_of(se); |
755 | dequeue_entity(cfs_rq, se, sleep); | 866 | dequeue_entity(cfs_rq, se, sleep); |
756 | /* Don't dequeue parent if it has other entities besides us */ | 867 | /* Don't dequeue parent if it has other entities besides us */ |
757 | if (cfs_rq->load.weight) | 868 | if (cfs_rq->load.weight) { |
869 | if (parent_entity(se)) | ||
870 | decload = 0; | ||
758 | break; | 871 | break; |
872 | } | ||
759 | sleep = 1; | 873 | sleep = 1; |
760 | } | 874 | } |
875 | /* Decrement cpu load if we just dequeued the last task of a group on | ||
876 | * 'rq->cpu'. 'topse' represents the group to which task 'p' belongs | ||
877 | * at the highest grouping level. | ||
878 | */ | ||
879 | if (decload) | ||
880 | dec_cpu_load(rq, topse->load.weight); | ||
881 | |||
882 | hrtick_start_fair(rq, rq->curr); | ||
761 | } | 883 | } |
762 | 884 | ||
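Both hunks above only touch the cpu load when the outermost group entity actually changes state: on enqueue, walking up the hierarchy stops at the first ancestor that is already on the runqueue, and only if none is found does inc_cpu_load() add the top entity's weight. A toy two-level model of that rule (these structs are illustrative only, not kernel types):

    #include <stdio.h>

    struct entity {
        struct entity *parent;
        int on_rq;
        unsigned long weight;
    };

    /* Weight to add to the cpu load, or 0 if an ancestor was already
     * queued and the group is therefore already accounted for. */
    static unsigned long enqueue_weight(struct entity *se)
    {
        struct entity *topse = NULL;

        for (; se; se = se->parent) {
            topse = se;
            if (se->on_rq)
                return 0;        /* incload = 0 in the hunk above */
            se->on_rq = 1;       /* stands in for enqueue_entity() */
        }
        return topse->weight;    /* weight of the highest-level group */
    }

    int main(void)
    {
        struct entity group = { NULL, 0, 1024 };
        struct entity task1 = { &group, 0, 512 };
        struct entity task2 = { &group, 0, 512 };

        printf("first task adds %lu\n", enqueue_weight(&task1));  /* 1024 */
        printf("second task adds %lu\n", enqueue_weight(&task2)); /* 0 */
        return 0;
    }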
763 | /* | 885 | /* |
@@ -767,8 +889,9 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep) | |||
767 | */ | 889 | */ |
768 | static void yield_task_fair(struct rq *rq) | 890 | static void yield_task_fair(struct rq *rq) |
769 | { | 891 | { |
770 | struct cfs_rq *cfs_rq = task_cfs_rq(rq->curr); | 892 | struct task_struct *curr = rq->curr; |
771 | struct sched_entity *rightmost, *se = &rq->curr->se; | 893 | struct cfs_rq *cfs_rq = task_cfs_rq(curr); |
894 | struct sched_entity *rightmost, *se = &curr->se; | ||
772 | 895 | ||
773 | /* | 896 | /* |
774 | * Are we the only task in the tree? | 897 | * Are we the only task in the tree? |
@@ -776,7 +899,7 @@ static void yield_task_fair(struct rq *rq) | |||
776 | if (unlikely(cfs_rq->nr_running == 1)) | 899 | if (unlikely(cfs_rq->nr_running == 1)) |
777 | return; | 900 | return; |
778 | 901 | ||
779 | if (likely(!sysctl_sched_compat_yield)) { | 902 | if (likely(!sysctl_sched_compat_yield) && curr->policy != SCHED_BATCH) { |
780 | __update_rq_clock(rq); | 903 | __update_rq_clock(rq); |
781 | /* | 904 | /* |
782 | * Update run-time statistics of the 'current'. | 905 | * Update run-time statistics of the 'current'. |
@@ -804,6 +927,154 @@ static void yield_task_fair(struct rq *rq) | |||
804 | } | 927 | } |
805 | 928 | ||
806 | /* | 929 | /* |
930 | * wake_idle() will wake a task on an idle cpu if task->cpu is | ||
931 | * not idle and an idle cpu is available. The span of cpus to | ||
932 | * search starts with cpus closest then further out as needed, | ||
933 | * so we always favor a closer, idle cpu. | ||
934 | * | ||
935 | * Returns the CPU we should wake onto. | ||
936 | */ | ||
937 | #if defined(ARCH_HAS_SCHED_WAKE_IDLE) | ||
938 | static int wake_idle(int cpu, struct task_struct *p) | ||
939 | { | ||
940 | cpumask_t tmp; | ||
941 | struct sched_domain *sd; | ||
942 | int i; | ||
943 | |||
944 | /* | ||
945 | * If it is idle, then it is the best cpu to run this task. | ||
946 | * | ||
947 | * This cpu is also the best, if it has more than one task already. | ||
948 | * Siblings must also be busy (in most cases) as they didn't already | ||
949 | * pick up the extra load from this cpu and hence we need not check | ||
950 | * sibling runqueue info. This will avoid the checks and cache miss | ||
951 | * penalties associated with that. | ||
952 | */ | ||
953 | if (idle_cpu(cpu) || cpu_rq(cpu)->nr_running > 1) | ||
954 | return cpu; | ||
955 | |||
956 | for_each_domain(cpu, sd) { | ||
957 | if (sd->flags & SD_WAKE_IDLE) { | ||
958 | cpus_and(tmp, sd->span, p->cpus_allowed); | ||
959 | for_each_cpu_mask(i, tmp) { | ||
960 | if (idle_cpu(i)) { | ||
961 | if (i != task_cpu(p)) { | ||
962 | schedstat_inc(p, | ||
963 | se.nr_wakeups_idle); | ||
964 | } | ||
965 | return i; | ||
966 | } | ||
967 | } | ||
968 | } else { | ||
969 | break; | ||
970 | } | ||
971 | } | ||
972 | return cpu; | ||
973 | } | ||
974 | #else | ||
975 | static inline int wake_idle(int cpu, struct task_struct *p) | ||
976 | { | ||
977 | return cpu; | ||
978 | } | ||
979 | #endif | ||
980 | |||
981 | #ifdef CONFIG_SMP | ||
982 | static int select_task_rq_fair(struct task_struct *p, int sync) | ||
983 | { | ||
984 | int cpu, this_cpu; | ||
985 | struct rq *rq; | ||
986 | struct sched_domain *sd, *this_sd = NULL; | ||
987 | int new_cpu; | ||
988 | |||
989 | cpu = task_cpu(p); | ||
990 | rq = task_rq(p); | ||
991 | this_cpu = smp_processor_id(); | ||
992 | new_cpu = cpu; | ||
993 | |||
994 | if (cpu == this_cpu) | ||
995 | goto out_set_cpu; | ||
996 | |||
997 | for_each_domain(this_cpu, sd) { | ||
998 | if (cpu_isset(cpu, sd->span)) { | ||
999 | this_sd = sd; | ||
1000 | break; | ||
1001 | } | ||
1002 | } | ||
1003 | |||
1004 | if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed))) | ||
1005 | goto out_set_cpu; | ||
1006 | |||
1007 | /* | ||
1008 | * Check for affine wakeup and passive balancing possibilities. | ||
1009 | */ | ||
1010 | if (this_sd) { | ||
1011 | int idx = this_sd->wake_idx; | ||
1012 | unsigned int imbalance; | ||
1013 | unsigned long load, this_load; | ||
1014 | |||
1015 | imbalance = 100 + (this_sd->imbalance_pct - 100) / 2; | ||
1016 | |||
1017 | load = source_load(cpu, idx); | ||
1018 | this_load = target_load(this_cpu, idx); | ||
1019 | |||
1020 | new_cpu = this_cpu; /* Wake to this CPU if we can */ | ||
1021 | |||
1022 | if (this_sd->flags & SD_WAKE_AFFINE) { | ||
1023 | unsigned long tl = this_load; | ||
1024 | unsigned long tl_per_task; | ||
1025 | |||
1026 | /* | ||
1027 | * Attract cache-cold tasks on sync wakeups: | ||
1028 | */ | ||
1029 | if (sync && !task_hot(p, rq->clock, this_sd)) | ||
1030 | goto out_set_cpu; | ||
1031 | |||
1032 | schedstat_inc(p, se.nr_wakeups_affine_attempts); | ||
1033 | tl_per_task = cpu_avg_load_per_task(this_cpu); | ||
1034 | |||
1035 | /* | ||
1036 | * If sync wakeup then subtract the (maximum possible) | ||
1037 | * effect of the currently running task from the load | ||
1038 | * of the current CPU: | ||
1039 | */ | ||
1040 | if (sync) | ||
1041 | tl -= current->se.load.weight; | ||
1042 | |||
1043 | if ((tl <= load && | ||
1044 | tl + target_load(cpu, idx) <= tl_per_task) || | ||
1045 | 100*(tl + p->se.load.weight) <= imbalance*load) { | ||
1046 | /* | ||
1047 | * This domain has SD_WAKE_AFFINE and | ||
1048 | * p is cache cold in this domain, and | ||
1049 | * there is no bad imbalance. | ||
1050 | */ | ||
1051 | schedstat_inc(this_sd, ttwu_move_affine); | ||
1052 | schedstat_inc(p, se.nr_wakeups_affine); | ||
1053 | goto out_set_cpu; | ||
1054 | } | ||
1055 | } | ||
1056 | |||
1057 | /* | ||
1058 | * Start passive balancing when half the imbalance_pct | ||
1059 | * limit is reached. | ||
1060 | */ | ||
1061 | if (this_sd->flags & SD_WAKE_BALANCE) { | ||
1062 | if (imbalance*this_load <= 100*load) { | ||
1063 | schedstat_inc(this_sd, ttwu_move_balance); | ||
1064 | schedstat_inc(p, se.nr_wakeups_passive); | ||
1065 | goto out_set_cpu; | ||
1066 | } | ||
1067 | } | ||
1068 | } | ||
1069 | |||
1070 | new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */ | ||
1071 | out_set_cpu: | ||
1072 | return wake_idle(new_cpu, p); | ||
1073 | } | ||
1074 | #endif /* CONFIG_SMP */ | ||
1075 | |||
1076 | |||
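The SD_WAKE_AFFINE branch above pulls a wakee over to the waking CPU only when that does not create a bad imbalance; the second condition of the test scales the load comparison by the domain's imbalance_pct. A simplified numeric sketch of just that condition (all numbers are made up; imbalance_pct = 125 is only a typical value, not taken from this patch):

    #include <stdio.h>

    int main(void)
    {
        unsigned long load      = 2048;  /* load on the task's previous cpu */
        unsigned long this_load = 1024;  /* load on the waking cpu */
        unsigned long p_weight  = 1024;  /* weight of the woken task */
        unsigned int imbalance_pct = 125;
        unsigned int imbalance = 100 + (imbalance_pct - 100) / 2;  /* 112 */

        /* Second clause of the affine-wakeup test in the hunk above. */
        if (100 * (this_load + p_weight) <= imbalance * load)
            printf("affine wakeup: run the wakee on this cpu\n");
        else
            printf("no affine wakeup: leave the wakee where it was\n");
        return 0;
    }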
1077 | /* | ||
807 | * Preempt the current task with a newly woken task if needed: | 1078 | * Preempt the current task with a newly woken task if needed: |
808 | */ | 1079 | */ |
809 | static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) | 1080 | static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) |
@@ -811,7 +1082,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) | |||
811 | struct task_struct *curr = rq->curr; | 1082 | struct task_struct *curr = rq->curr; |
812 | struct cfs_rq *cfs_rq = task_cfs_rq(curr); | 1083 | struct cfs_rq *cfs_rq = task_cfs_rq(curr); |
813 | struct sched_entity *se = &curr->se, *pse = &p->se; | 1084 | struct sched_entity *se = &curr->se, *pse = &p->se; |
814 | s64 delta, gran; | 1085 | unsigned long gran; |
815 | 1086 | ||
816 | if (unlikely(rt_prio(p->prio))) { | 1087 | if (unlikely(rt_prio(p->prio))) { |
817 | update_rq_clock(rq); | 1088 | update_rq_clock(rq); |
@@ -826,28 +1097,29 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) | |||
826 | if (unlikely(p->policy == SCHED_BATCH)) | 1097 | if (unlikely(p->policy == SCHED_BATCH)) |
827 | return; | 1098 | return; |
828 | 1099 | ||
829 | if (sched_feat(WAKEUP_PREEMPT)) { | 1100 | if (!sched_feat(WAKEUP_PREEMPT)) |
830 | while (!is_same_group(se, pse)) { | 1101 | return; |
831 | se = parent_entity(se); | ||
832 | pse = parent_entity(pse); | ||
833 | } | ||
834 | 1102 | ||
835 | delta = se->vruntime - pse->vruntime; | 1103 | while (!is_same_group(se, pse)) { |
836 | gran = sysctl_sched_wakeup_granularity; | 1104 | se = parent_entity(se); |
837 | if (unlikely(se->load.weight != NICE_0_LOAD)) | 1105 | pse = parent_entity(pse); |
838 | gran = calc_delta_fair(gran, &se->load); | 1106 | } |
839 | 1107 | ||
840 | if (delta > gran) { | 1108 | gran = sysctl_sched_wakeup_granularity; |
841 | int now = !sched_feat(PREEMPT_RESTRICT); | 1109 | /* |
1110 | * More easily preempt - nice tasks, while not making | ||
1111 | * it harder for + nice tasks. | ||
1112 | */ | ||
1113 | if (unlikely(se->load.weight > NICE_0_LOAD)) | ||
1114 | gran = calc_delta_fair(gran, &se->load); | ||
842 | 1115 | ||
843 | if (now || p->prio < curr->prio || !se->peer_preempt++) | 1116 | if (pse->vruntime + gran < se->vruntime) |
844 | resched_task(curr); | 1117 | resched_task(curr); |
845 | } | ||
846 | } | ||
847 | } | 1118 | } |
848 | 1119 | ||
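With the change above, wakeup preemption no longer compares a signed delta against the granularity; the woken entity preempts only if its vruntime trails the current entity's by more than the (possibly weight-scaled) wakeup granularity. A small sketch of the comparison with invented vruntimes:

    #include <stdio.h>

    int main(void)
    {
        unsigned long long se_vruntime  = 20500000; /* current task */
        unsigned long long pse_vruntime = 19000000; /* freshly woken task */
        unsigned long gran = 1000000;               /* 1ms wakeup granularity */

        /* Same test as the hunk above. */
        if (pse_vruntime + gran < se_vruntime)
            printf("wakee preempts current\n");
        else
            printf("current keeps running\n");
        return 0;
    }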
849 | static struct task_struct *pick_next_task_fair(struct rq *rq) | 1120 | static struct task_struct *pick_next_task_fair(struct rq *rq) |
850 | { | 1121 | { |
1122 | struct task_struct *p; | ||
851 | struct cfs_rq *cfs_rq = &rq->cfs; | 1123 | struct cfs_rq *cfs_rq = &rq->cfs; |
852 | struct sched_entity *se; | 1124 | struct sched_entity *se; |
853 | 1125 | ||
@@ -859,7 +1131,10 @@ static struct task_struct *pick_next_task_fair(struct rq *rq) | |||
859 | cfs_rq = group_cfs_rq(se); | 1131 | cfs_rq = group_cfs_rq(se); |
860 | } while (cfs_rq); | 1132 | } while (cfs_rq); |
861 | 1133 | ||
862 | return task_of(se); | 1134 | p = task_of(se); |
1135 | hrtick_start_fair(rq, p); | ||
1136 | |||
1137 | return p; | ||
863 | } | 1138 | } |
864 | 1139 | ||
865 | /* | 1140 | /* |
@@ -916,25 +1191,6 @@ static struct task_struct *load_balance_next_fair(void *arg) | |||
916 | return __load_balance_iterator(cfs_rq, cfs_rq->rb_load_balance_curr); | 1191 | return __load_balance_iterator(cfs_rq, cfs_rq->rb_load_balance_curr); |
917 | } | 1192 | } |
918 | 1193 | ||
919 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
920 | static int cfs_rq_best_prio(struct cfs_rq *cfs_rq) | ||
921 | { | ||
922 | struct sched_entity *curr; | ||
923 | struct task_struct *p; | ||
924 | |||
925 | if (!cfs_rq->nr_running) | ||
926 | return MAX_PRIO; | ||
927 | |||
928 | curr = cfs_rq->curr; | ||
929 | if (!curr) | ||
930 | curr = __pick_next_entity(cfs_rq); | ||
931 | |||
932 | p = task_of(curr); | ||
933 | |||
934 | return p->prio; | ||
935 | } | ||
936 | #endif | ||
937 | |||
938 | static unsigned long | 1194 | static unsigned long |
939 | load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, | 1195 | load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, |
940 | unsigned long max_load_move, | 1196 | unsigned long max_load_move, |
@@ -944,28 +1200,45 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
944 | struct cfs_rq *busy_cfs_rq; | 1200 | struct cfs_rq *busy_cfs_rq; |
945 | long rem_load_move = max_load_move; | 1201 | long rem_load_move = max_load_move; |
946 | struct rq_iterator cfs_rq_iterator; | 1202 | struct rq_iterator cfs_rq_iterator; |
1203 | unsigned long load_moved; | ||
947 | 1204 | ||
948 | cfs_rq_iterator.start = load_balance_start_fair; | 1205 | cfs_rq_iterator.start = load_balance_start_fair; |
949 | cfs_rq_iterator.next = load_balance_next_fair; | 1206 | cfs_rq_iterator.next = load_balance_next_fair; |
950 | 1207 | ||
951 | for_each_leaf_cfs_rq(busiest, busy_cfs_rq) { | 1208 | for_each_leaf_cfs_rq(busiest, busy_cfs_rq) { |
952 | #ifdef CONFIG_FAIR_GROUP_SCHED | 1209 | #ifdef CONFIG_FAIR_GROUP_SCHED |
953 | struct cfs_rq *this_cfs_rq; | 1210 | struct cfs_rq *this_cfs_rq = busy_cfs_rq->tg->cfs_rq[this_cpu]; |
954 | long imbalance; | 1211 | unsigned long maxload, task_load, group_weight; |
955 | unsigned long maxload; | 1212 | unsigned long thisload, per_task_load; |
1213 | struct sched_entity *se = busy_cfs_rq->tg->se[busiest->cpu]; | ||
956 | 1214 | ||
957 | this_cfs_rq = cpu_cfs_rq(busy_cfs_rq, this_cpu); | 1215 | task_load = busy_cfs_rq->load.weight; |
1216 | group_weight = se->load.weight; | ||
958 | 1217 | ||
959 | imbalance = busy_cfs_rq->load.weight - this_cfs_rq->load.weight; | 1218 | /* |
960 | /* Don't pull if this_cfs_rq has more load than busy_cfs_rq */ | 1219 | * 'group_weight' is contributed by tasks of total weight |
961 | if (imbalance <= 0) | 1220 | * 'task_load'. To move 'rem_load_move' worth of weight only, |
1221 | * we need to move a maximum task load of: | ||
1222 | * | ||
1223 | * maxload = (remload / group_weight) * task_load; | ||
1224 | */ | ||
1225 | maxload = (rem_load_move * task_load) / group_weight; | ||
1226 | |||
1227 | if (!maxload || !task_load) | ||
962 | continue; | 1228 | continue; |
963 | 1229 | ||
964 | /* Don't pull more than imbalance/2 */ | 1230 | per_task_load = task_load / busy_cfs_rq->nr_running; |
965 | imbalance /= 2; | 1231 | /* |
966 | maxload = min(rem_load_move, imbalance); | 1232 | * balance_tasks will try to forcibly move at least one task if | ||
1233 | * possible (because of SCHED_LOAD_SCALE_FUZZ). Avoid that if | ||
1234 | * maxload is less than GROUP_IMBALANCE_PCT% of the per_task_load. | ||
1235 | */ | ||
1236 | if (100 * maxload < GROUP_IMBALANCE_PCT * per_task_load) | ||
1237 | continue; | ||
967 | 1238 | ||
968 | *this_best_prio = cfs_rq_best_prio(this_cfs_rq); | 1239 | /* Disable priority-based load balance */ |
1240 | *this_best_prio = 0; | ||
1241 | thisload = this_cfs_rq->load.weight; | ||
969 | #else | 1242 | #else |
970 | # define maxload rem_load_move | 1243 | # define maxload rem_load_move |
971 | #endif | 1244 | #endif |
@@ -974,11 +1247,33 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
974 | * load_balance_[start|next]_fair iterators | 1247 | * load_balance_[start|next]_fair iterators |
975 | */ | 1248 | */ |
976 | cfs_rq_iterator.arg = busy_cfs_rq; | 1249 | cfs_rq_iterator.arg = busy_cfs_rq; |
977 | rem_load_move -= balance_tasks(this_rq, this_cpu, busiest, | 1250 | load_moved = balance_tasks(this_rq, this_cpu, busiest, |
978 | maxload, sd, idle, all_pinned, | 1251 | maxload, sd, idle, all_pinned, |
979 | this_best_prio, | 1252 | this_best_prio, |
980 | &cfs_rq_iterator); | 1253 | &cfs_rq_iterator); |
981 | 1254 | ||
1255 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
1256 | /* | ||
1257 | * load_moved holds the task load that was moved. The | ||
1258 | * effective (group) weight moved would be: | ||
1259 | * load_moved_eff = load_moved/task_load * group_weight; | ||
1260 | */ | ||
1261 | load_moved = (group_weight * load_moved) / task_load; | ||
1262 | |||
1263 | /* Adjust shares on both cpus to reflect load_moved */ | ||
1264 | group_weight -= load_moved; | ||
1265 | set_se_shares(se, group_weight); | ||
1266 | |||
1267 | se = busy_cfs_rq->tg->se[this_cpu]; | ||
1268 | if (!thisload) | ||
1269 | group_weight = load_moved; | ||
1270 | else | ||
1271 | group_weight = se->load.weight + load_moved; | ||
1272 | set_se_shares(se, group_weight); | ||
1273 | #endif | ||
1274 | |||
1275 | rem_load_move -= load_moved; | ||
1276 | |||
982 | if (rem_load_move <= 0) | 1277 | if (rem_load_move <= 0) |
983 | break; | 1278 | break; |
984 | } | 1279 | } |
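The group-aware balancing above converts between task weight and group weight twice: rem_load_move is first scaled into a per-group maxload before calling balance_tasks(), and the task weight actually moved is then scaled back into group weight so the group's shares can be adjusted on both CPUs. A worked example with invented numbers:

    #include <stdio.h>

    int main(void)
    {
        unsigned long rem_load_move = 512;  /* weight still to be moved */
        unsigned long task_load     = 3072; /* sum of task weights in the group */
        unsigned long group_weight  = 1024; /* weight of the group's entity */

        /* maxload = (remload / group_weight) * task_load */
        unsigned long maxload = (rem_load_move * task_load) / group_weight;

        /* Suppose balance_tasks() moved this much task weight: */
        unsigned long load_moved = 1024;

        /* load_moved_eff = load_moved / task_load * group_weight */
        unsigned long moved_eff = (group_weight * load_moved) / task_load;

        printf("maxload     = %lu task-weight units\n", maxload);    /* 1536 */
        printf("group shift = %lu group-weight units\n", moved_eff); /* 341 */
        return 0;
    }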
@@ -1014,18 +1309,18 @@ move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
1014 | /* | 1309 | /* |
1015 | * scheduler tick hitting a task of our scheduling class: | 1310 | * scheduler tick hitting a task of our scheduling class: |
1016 | */ | 1311 | */ |
1017 | static void task_tick_fair(struct rq *rq, struct task_struct *curr) | 1312 | static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) |
1018 | { | 1313 | { |
1019 | struct cfs_rq *cfs_rq; | 1314 | struct cfs_rq *cfs_rq; |
1020 | struct sched_entity *se = &curr->se; | 1315 | struct sched_entity *se = &curr->se; |
1021 | 1316 | ||
1022 | for_each_sched_entity(se) { | 1317 | for_each_sched_entity(se) { |
1023 | cfs_rq = cfs_rq_of(se); | 1318 | cfs_rq = cfs_rq_of(se); |
1024 | entity_tick(cfs_rq, se); | 1319 | entity_tick(cfs_rq, se, queued); |
1025 | } | 1320 | } |
1026 | } | 1321 | } |
1027 | 1322 | ||
1028 | #define swap(a,b) do { typeof(a) tmp = (a); (a) = (b); (b) = tmp; } while (0) | 1323 | #define swap(a, b) do { typeof(a) tmp = (a); (a) = (b); (b) = tmp; } while (0) |
1029 | 1324 | ||
1030 | /* | 1325 | /* |
1031 | * Share the fairness runtime between parent and child, thus the | 1326 | * Share the fairness runtime between parent and child, thus the |
@@ -1045,8 +1340,9 @@ static void task_new_fair(struct rq *rq, struct task_struct *p) | |||
1045 | update_curr(cfs_rq); | 1340 | update_curr(cfs_rq); |
1046 | place_entity(cfs_rq, se, 1); | 1341 | place_entity(cfs_rq, se, 1); |
1047 | 1342 | ||
1343 | /* 'curr' will be NULL if the child belongs to a different group */ | ||
1048 | if (sysctl_sched_child_runs_first && this_cpu == task_cpu(p) && | 1344 | if (sysctl_sched_child_runs_first && this_cpu == task_cpu(p) && |
1049 | curr->vruntime < se->vruntime) { | 1345 | curr && curr->vruntime < se->vruntime) { |
1050 | /* | 1346 | /* |
1051 | * Upon rescheduling, sched_class::put_prev_task() will place | 1347 | * Upon rescheduling, sched_class::put_prev_task() will place |
1052 | * 'current' within the tree based on its new key value. | 1348 | * 'current' within the tree based on its new key value. |
@@ -1054,11 +1350,46 @@ static void task_new_fair(struct rq *rq, struct task_struct *p) | |||
1054 | swap(curr->vruntime, se->vruntime); | 1350 | swap(curr->vruntime, se->vruntime); |
1055 | } | 1351 | } |
1056 | 1352 | ||
1057 | se->peer_preempt = 0; | ||
1058 | enqueue_task_fair(rq, p, 0); | 1353 | enqueue_task_fair(rq, p, 0); |
1059 | resched_task(rq->curr); | 1354 | resched_task(rq->curr); |
1060 | } | 1355 | } |
1061 | 1356 | ||
1357 | /* | ||
1358 | * Priority of the task has changed. Check to see if we preempt | ||
1359 | * the current task. | ||
1360 | */ | ||
1361 | static void prio_changed_fair(struct rq *rq, struct task_struct *p, | ||
1362 | int oldprio, int running) | ||
1363 | { | ||
1364 | /* | ||
1365 | * Reschedule if we are currently running on this runqueue and | ||
1366 | * our priority decreased, or if we are not currently running on | ||
1367 | * this runqueue and our priority is higher than the current's | ||
1368 | */ | ||
1369 | if (running) { | ||
1370 | if (p->prio > oldprio) | ||
1371 | resched_task(rq->curr); | ||
1372 | } else | ||
1373 | check_preempt_curr(rq, p); | ||
1374 | } | ||
1375 | |||
1376 | /* | ||
1377 | * We switched to the sched_fair class. | ||
1378 | */ | ||
1379 | static void switched_to_fair(struct rq *rq, struct task_struct *p, | ||
1380 | int running) | ||
1381 | { | ||
1382 | /* | ||
1383 | * We were most likely switched from sched_rt, so | ||
1384 | * kick off the schedule if running, otherwise just see | ||
1385 | * if we can still preempt the current task. | ||
1386 | */ | ||
1387 | if (running) | ||
1388 | resched_task(rq->curr); | ||
1389 | else | ||
1390 | check_preempt_curr(rq, p); | ||
1391 | } | ||
1392 | |||
1062 | /* Account for a task changing its policy or group. | 1393 | /* Account for a task changing its policy or group. |
1063 | * | 1394 | * |
1064 | * This routine is mostly called to set cfs_rq->curr field when a task | 1395 | * This routine is mostly called to set cfs_rq->curr field when a task |
@@ -1080,6 +1411,9 @@ static const struct sched_class fair_sched_class = { | |||
1080 | .enqueue_task = enqueue_task_fair, | 1411 | .enqueue_task = enqueue_task_fair, |
1081 | .dequeue_task = dequeue_task_fair, | 1412 | .dequeue_task = dequeue_task_fair, |
1082 | .yield_task = yield_task_fair, | 1413 | .yield_task = yield_task_fair, |
1414 | #ifdef CONFIG_SMP | ||
1415 | .select_task_rq = select_task_rq_fair, | ||
1416 | #endif /* CONFIG_SMP */ | ||
1083 | 1417 | ||
1084 | .check_preempt_curr = check_preempt_wakeup, | 1418 | .check_preempt_curr = check_preempt_wakeup, |
1085 | 1419 | ||
@@ -1094,6 +1428,9 @@ static const struct sched_class fair_sched_class = { | |||
1094 | .set_curr_task = set_curr_task_fair, | 1428 | .set_curr_task = set_curr_task_fair, |
1095 | .task_tick = task_tick_fair, | 1429 | .task_tick = task_tick_fair, |
1096 | .task_new = task_new_fair, | 1430 | .task_new = task_new_fair, |
1431 | |||
1432 | .prio_changed = prio_changed_fair, | ||
1433 | .switched_to = switched_to_fair, | ||
1097 | }; | 1434 | }; |
1098 | 1435 | ||
1099 | #ifdef CONFIG_SCHED_DEBUG | 1436 | #ifdef CONFIG_SCHED_DEBUG |
@@ -1104,7 +1441,9 @@ static void print_cfs_stats(struct seq_file *m, int cpu) | |||
1104 | #ifdef CONFIG_FAIR_GROUP_SCHED | 1441 | #ifdef CONFIG_FAIR_GROUP_SCHED |
1105 | print_cfs_rq(m, cpu, &cpu_rq(cpu)->cfs); | 1442 | print_cfs_rq(m, cpu, &cpu_rq(cpu)->cfs); |
1106 | #endif | 1443 | #endif |
1444 | rcu_read_lock(); | ||
1107 | for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq) | 1445 | for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq) |
1108 | print_cfs_rq(m, cpu, cfs_rq); | 1446 | print_cfs_rq(m, cpu, cfs_rq); |
1447 | rcu_read_unlock(); | ||
1109 | } | 1448 | } |
1110 | #endif | 1449 | #endif |
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c index bf9c25c15b8b..2bcafa375633 100644 --- a/kernel/sched_idletask.c +++ b/kernel/sched_idletask.c | |||
@@ -5,6 +5,12 @@ | |||
5 | * handled in sched_fair.c) | 5 | * handled in sched_fair.c) |
6 | */ | 6 | */ |
7 | 7 | ||
8 | #ifdef CONFIG_SMP | ||
9 | static int select_task_rq_idle(struct task_struct *p, int sync) | ||
10 | { | ||
11 | return task_cpu(p); /* IDLE tasks are never migrated */ | ||
12 | } | ||
13 | #endif /* CONFIG_SMP */ | ||
8 | /* | 14 | /* |
9 | * Idle tasks are unconditionally rescheduled: | 15 | * Idle tasks are unconditionally rescheduled: |
10 | */ | 16 | */ |
@@ -55,7 +61,7 @@ move_one_task_idle(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
55 | } | 61 | } |
56 | #endif | 62 | #endif |
57 | 63 | ||
58 | static void task_tick_idle(struct rq *rq, struct task_struct *curr) | 64 | static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued) |
59 | { | 65 | { |
60 | } | 66 | } |
61 | 67 | ||
@@ -63,6 +69,33 @@ static void set_curr_task_idle(struct rq *rq) | |||
63 | { | 69 | { |
64 | } | 70 | } |
65 | 71 | ||
72 | static void switched_to_idle(struct rq *rq, struct task_struct *p, | ||
73 | int running) | ||
74 | { | ||
75 | /* Can this actually happen?? */ | ||
76 | if (running) | ||
77 | resched_task(rq->curr); | ||
78 | else | ||
79 | check_preempt_curr(rq, p); | ||
80 | } | ||
81 | |||
82 | static void prio_changed_idle(struct rq *rq, struct task_struct *p, | ||
83 | int oldprio, int running) | ||
84 | { | ||
85 | /* This can happen for hotplugged CPUs */ | ||
86 | |||
87 | /* | ||
88 | * Reschedule if we are currently running on this runqueue and | ||
89 | * our priority decreased, or if we are not currently running on | ||
90 | * this runqueue and our priority is higher than the current's | ||
91 | */ | ||
92 | if (running) { | ||
93 | if (p->prio > oldprio) | ||
94 | resched_task(rq->curr); | ||
95 | } else | ||
96 | check_preempt_curr(rq, p); | ||
97 | } | ||
98 | |||
66 | /* | 99 | /* |
67 | * Simple, special scheduling class for the per-CPU idle tasks: | 100 | * Simple, special scheduling class for the per-CPU idle tasks: |
68 | */ | 101 | */ |
@@ -72,6 +105,9 @@ const struct sched_class idle_sched_class = { | |||
72 | 105 | ||
73 | /* dequeue is not valid, we print a debug message there: */ | 106 | /* dequeue is not valid, we print a debug message there: */ |
74 | .dequeue_task = dequeue_task_idle, | 107 | .dequeue_task = dequeue_task_idle, |
108 | #ifdef CONFIG_SMP | ||
109 | .select_task_rq = select_task_rq_idle, | ||
110 | #endif /* CONFIG_SMP */ | ||
75 | 111 | ||
76 | .check_preempt_curr = check_preempt_curr_idle, | 112 | .check_preempt_curr = check_preempt_curr_idle, |
77 | 113 | ||
@@ -85,5 +121,9 @@ const struct sched_class idle_sched_class = { | |||
85 | 121 | ||
86 | .set_curr_task = set_curr_task_idle, | 122 | .set_curr_task = set_curr_task_idle, |
87 | .task_tick = task_tick_idle, | 123 | .task_tick = task_tick_idle, |
124 | |||
125 | .prio_changed = prio_changed_idle, | ||
126 | .switched_to = switched_to_idle, | ||
127 | |||
88 | /* no .task_new for idle tasks */ | 128 | /* no .task_new for idle tasks */ |
89 | }; | 129 | }; |
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 8abd752a0ebd..274b40d7bef2 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c | |||
@@ -3,6 +3,217 @@ | |||
3 | * policies) | 3 | * policies) |
4 | */ | 4 | */ |
5 | 5 | ||
6 | #ifdef CONFIG_SMP | ||
7 | |||
8 | static inline int rt_overloaded(struct rq *rq) | ||
9 | { | ||
10 | return atomic_read(&rq->rd->rto_count); | ||
11 | } | ||
12 | |||
13 | static inline void rt_set_overload(struct rq *rq) | ||
14 | { | ||
15 | cpu_set(rq->cpu, rq->rd->rto_mask); | ||
16 | /* | ||
17 | * Make sure the mask is visible before we set | ||
18 | * the overload count. That is checked to determine | ||
19 | * if we should look at the mask. It would be a shame | ||
20 | * if we looked at the mask, but the mask was not | ||
21 | * updated yet. | ||
22 | */ | ||
23 | wmb(); | ||
24 | atomic_inc(&rq->rd->rto_count); | ||
25 | } | ||
26 | |||
27 | static inline void rt_clear_overload(struct rq *rq) | ||
28 | { | ||
29 | /* the order here really doesn't matter */ | ||
30 | atomic_dec(&rq->rd->rto_count); | ||
31 | cpu_clear(rq->cpu, rq->rd->rto_mask); | ||
32 | } | ||
33 | |||
34 | static void update_rt_migration(struct rq *rq) | ||
35 | { | ||
36 | if (rq->rt.rt_nr_migratory && (rq->rt.rt_nr_running > 1)) { | ||
37 | if (!rq->rt.overloaded) { | ||
38 | rt_set_overload(rq); | ||
39 | rq->rt.overloaded = 1; | ||
40 | } | ||
41 | } else if (rq->rt.overloaded) { | ||
42 | rt_clear_overload(rq); | ||
43 | rq->rt.overloaded = 0; | ||
44 | } | ||
45 | } | ||
46 | #endif /* CONFIG_SMP */ | ||
47 | |||
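update_rt_migration() above keeps the root-domain overload mask in sync with a simple predicate: a runqueue is RT-overloaded only while it holds more than one RT task and at least one of them is allowed to migrate. A one-function sketch of that predicate:

    #include <stdio.h>

    /* Mirror of the condition in update_rt_migration() above. */
    static int rt_overload(unsigned int nr_running, unsigned int nr_migratory)
    {
        return nr_migratory && nr_running > 1;
    }

    int main(void)
    {
        printf("%d\n", rt_overload(2, 1)); /* 1: a pushable task exists */
        printf("%d\n", rt_overload(2, 0)); /* 0: every RT task is pinned */
        printf("%d\n", rt_overload(1, 1)); /* 0: only one RT task queued */
        return 0;
    }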
48 | static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se) | ||
49 | { | ||
50 | return container_of(rt_se, struct task_struct, rt); | ||
51 | } | ||
52 | |||
53 | static inline int on_rt_rq(struct sched_rt_entity *rt_se) | ||
54 | { | ||
55 | return !list_empty(&rt_se->run_list); | ||
56 | } | ||
57 | |||
58 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
59 | |||
60 | static inline unsigned int sched_rt_ratio(struct rt_rq *rt_rq) | ||
61 | { | ||
62 | if (!rt_rq->tg) | ||
63 | return SCHED_RT_FRAC; | ||
64 | |||
65 | return rt_rq->tg->rt_ratio; | ||
66 | } | ||
67 | |||
68 | #define for_each_leaf_rt_rq(rt_rq, rq) \ | ||
69 | list_for_each_entry(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list) | ||
70 | |||
71 | static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq) | ||
72 | { | ||
73 | return rt_rq->rq; | ||
74 | } | ||
75 | |||
76 | static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se) | ||
77 | { | ||
78 | return rt_se->rt_rq; | ||
79 | } | ||
80 | |||
81 | #define for_each_sched_rt_entity(rt_se) \ | ||
82 | for (; rt_se; rt_se = rt_se->parent) | ||
83 | |||
84 | static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se) | ||
85 | { | ||
86 | return rt_se->my_q; | ||
87 | } | ||
88 | |||
89 | static void enqueue_rt_entity(struct sched_rt_entity *rt_se); | ||
90 | static void dequeue_rt_entity(struct sched_rt_entity *rt_se); | ||
91 | |||
92 | static void sched_rt_ratio_enqueue(struct rt_rq *rt_rq) | ||
93 | { | ||
94 | struct sched_rt_entity *rt_se = rt_rq->rt_se; | ||
95 | |||
96 | if (rt_se && !on_rt_rq(rt_se) && rt_rq->rt_nr_running) { | ||
97 | struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr; | ||
98 | |||
99 | enqueue_rt_entity(rt_se); | ||
100 | if (rt_rq->highest_prio < curr->prio) | ||
101 | resched_task(curr); | ||
102 | } | ||
103 | } | ||
104 | |||
105 | static void sched_rt_ratio_dequeue(struct rt_rq *rt_rq) | ||
106 | { | ||
107 | struct sched_rt_entity *rt_se = rt_rq->rt_se; | ||
108 | |||
109 | if (rt_se && on_rt_rq(rt_se)) | ||
110 | dequeue_rt_entity(rt_se); | ||
111 | } | ||
112 | |||
113 | #else | ||
114 | |||
115 | static inline unsigned int sched_rt_ratio(struct rt_rq *rt_rq) | ||
116 | { | ||
117 | return sysctl_sched_rt_ratio; | ||
118 | } | ||
119 | |||
120 | #define for_each_leaf_rt_rq(rt_rq, rq) \ | ||
121 | for (rt_rq = &rq->rt; rt_rq; rt_rq = NULL) | ||
122 | |||
123 | static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq) | ||
124 | { | ||
125 | return container_of(rt_rq, struct rq, rt); | ||
126 | } | ||
127 | |||
128 | static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se) | ||
129 | { | ||
130 | struct task_struct *p = rt_task_of(rt_se); | ||
131 | struct rq *rq = task_rq(p); | ||
132 | |||
133 | return &rq->rt; | ||
134 | } | ||
135 | |||
136 | #define for_each_sched_rt_entity(rt_se) \ | ||
137 | for (; rt_se; rt_se = NULL) | ||
138 | |||
139 | static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se) | ||
140 | { | ||
141 | return NULL; | ||
142 | } | ||
143 | |||
144 | static inline void sched_rt_ratio_enqueue(struct rt_rq *rt_rq) | ||
145 | { | ||
146 | } | ||
147 | |||
148 | static inline void sched_rt_ratio_dequeue(struct rt_rq *rt_rq) | ||
149 | { | ||
150 | } | ||
151 | |||
152 | #endif | ||
153 | |||
154 | static inline int rt_se_prio(struct sched_rt_entity *rt_se) | ||
155 | { | ||
156 | #ifdef CONFIG_FAIR_GROUP_SCHED | ||
157 | struct rt_rq *rt_rq = group_rt_rq(rt_se); | ||
158 | |||
159 | if (rt_rq) | ||
160 | return rt_rq->highest_prio; | ||
161 | #endif | ||
162 | |||
163 | return rt_task_of(rt_se)->prio; | ||
164 | } | ||
165 | |||
166 | static int sched_rt_ratio_exceeded(struct rt_rq *rt_rq) | ||
167 | { | ||
168 | unsigned int rt_ratio = sched_rt_ratio(rt_rq); | ||
169 | u64 period, ratio; | ||
170 | |||
171 | if (rt_ratio == SCHED_RT_FRAC) | ||
172 | return 0; | ||
173 | |||
174 | if (rt_rq->rt_throttled) | ||
175 | return 1; | ||
176 | |||
177 | period = (u64)sysctl_sched_rt_period * NSEC_PER_MSEC; | ||
178 | ratio = (period * rt_ratio) >> SCHED_RT_FRAC_SHIFT; | ||
179 | |||
180 | if (rt_rq->rt_time > ratio) { | ||
181 | struct rq *rq = rq_of_rt_rq(rt_rq); | ||
182 | |||
183 | rq->rt_throttled = 1; | ||
184 | rt_rq->rt_throttled = 1; | ||
185 | |||
186 | sched_rt_ratio_dequeue(rt_rq); | ||
187 | return 1; | ||
188 | } | ||
189 | |||
190 | return 0; | ||
191 | } | ||
192 | |||
193 | static void update_sched_rt_period(struct rq *rq) | ||
194 | { | ||
195 | struct rt_rq *rt_rq; | ||
196 | u64 period; | ||
197 | |||
198 | while (rq->clock > rq->rt_period_expire) { | ||
199 | period = (u64)sysctl_sched_rt_period * NSEC_PER_MSEC; | ||
200 | rq->rt_period_expire += period; | ||
201 | |||
202 | for_each_leaf_rt_rq(rt_rq, rq) { | ||
203 | unsigned long rt_ratio = sched_rt_ratio(rt_rq); | ||
204 | u64 ratio = (period * rt_ratio) >> SCHED_RT_FRAC_SHIFT; | ||
205 | |||
206 | rt_rq->rt_time -= min(rt_rq->rt_time, ratio); | ||
207 | if (rt_rq->rt_throttled) { | ||
208 | rt_rq->rt_throttled = 0; | ||
209 | sched_rt_ratio_enqueue(rt_rq); | ||
210 | } | ||
211 | } | ||
212 | |||
213 | rq->rt_throttled = 0; | ||
214 | } | ||
215 | } | ||
216 | |||
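sched_rt_ratio_exceeded() above throttles an rt_rq once its accumulated runtime passes the fraction of the period allowed by the group's rt_ratio, and update_sched_rt_period() later refunds one period's worth of budget and unthrottles. A user-space sketch of the budget arithmetic; the 16-bit fixed-point shift and the 1000ms period are assumptions for illustration, not values confirmed by this patch:

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
        unsigned int frac_shift = 16;                      /* assumed shift */
        uint64_t period = 1000ULL * 1000000ULL;            /* 1000ms in ns */
        unsigned int rt_ratio = (95u << frac_shift) / 100; /* ~95% of period */

        uint64_t budget  = (period * rt_ratio) >> frac_shift;
        uint64_t rt_time = 960000000ULL;                   /* 960ms of RT runtime */

        if (rt_time > budget)
            printf("throttle: used %llu ns of a %llu ns budget\n",
                   (unsigned long long)rt_time, (unsigned long long)budget);
        else
            printf("within budget\n");
        return 0;
    }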
6 | /* | 217 | /* |
7 | * Update the current task's runtime statistics. Skip current tasks that | 218 | * Update the current task's runtime statistics. Skip current tasks that |
8 | * are not in our scheduling class. | 219 | * are not in our scheduling class. |
@@ -10,6 +221,8 @@ | |||
10 | static void update_curr_rt(struct rq *rq) | 221 | static void update_curr_rt(struct rq *rq) |
11 | { | 222 | { |
12 | struct task_struct *curr = rq->curr; | 223 | struct task_struct *curr = rq->curr; |
224 | struct sched_rt_entity *rt_se = &curr->rt; | ||
225 | struct rt_rq *rt_rq = rt_rq_of_se(rt_se); | ||
13 | u64 delta_exec; | 226 | u64 delta_exec; |
14 | 227 | ||
15 | if (!task_has_rt_policy(curr)) | 228 | if (!task_has_rt_policy(curr)) |
@@ -23,47 +236,229 @@ static void update_curr_rt(struct rq *rq) | |||
23 | 236 | ||
24 | curr->se.sum_exec_runtime += delta_exec; | 237 | curr->se.sum_exec_runtime += delta_exec; |
25 | curr->se.exec_start = rq->clock; | 238 | curr->se.exec_start = rq->clock; |
239 | cpuacct_charge(curr, delta_exec); | ||
240 | |||
241 | rt_rq->rt_time += delta_exec; | ||
242 | /* | ||
243 | * might make it a tad more accurate: | ||
244 | * | ||
245 | * update_sched_rt_period(rq); | ||
246 | */ | ||
247 | if (sched_rt_ratio_exceeded(rt_rq)) | ||
248 | resched_task(curr); | ||
26 | } | 249 | } |
27 | 250 | ||
28 | static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup) | 251 | static inline |
252 | void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) | ||
253 | { | ||
254 | WARN_ON(!rt_prio(rt_se_prio(rt_se))); | ||
255 | rt_rq->rt_nr_running++; | ||
256 | #if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED | ||
257 | if (rt_se_prio(rt_se) < rt_rq->highest_prio) | ||
258 | rt_rq->highest_prio = rt_se_prio(rt_se); | ||
259 | #endif | ||
260 | #ifdef CONFIG_SMP | ||
261 | if (rt_se->nr_cpus_allowed > 1) { | ||
262 | struct rq *rq = rq_of_rt_rq(rt_rq); | ||
263 | rq->rt.rt_nr_migratory++; | ||
264 | } | ||
265 | |||
266 | update_rt_migration(rq_of_rt_rq(rt_rq)); | ||
267 | #endif | ||
268 | } | ||
269 | |||
270 | static inline | ||
271 | void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) | ||
272 | { | ||
273 | WARN_ON(!rt_prio(rt_se_prio(rt_se))); | ||
274 | WARN_ON(!rt_rq->rt_nr_running); | ||
275 | rt_rq->rt_nr_running--; | ||
276 | #if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED | ||
277 | if (rt_rq->rt_nr_running) { | ||
278 | struct rt_prio_array *array; | ||
279 | |||
280 | WARN_ON(rt_se_prio(rt_se) < rt_rq->highest_prio); | ||
281 | if (rt_se_prio(rt_se) == rt_rq->highest_prio) { | ||
282 | /* recalculate */ | ||
283 | array = &rt_rq->active; | ||
284 | rt_rq->highest_prio = | ||
285 | sched_find_first_bit(array->bitmap); | ||
286 | } /* otherwise leave rq->highest prio alone */ | ||
287 | } else | ||
288 | rt_rq->highest_prio = MAX_RT_PRIO; | ||
289 | #endif | ||
290 | #ifdef CONFIG_SMP | ||
291 | if (rt_se->nr_cpus_allowed > 1) { | ||
292 | struct rq *rq = rq_of_rt_rq(rt_rq); | ||
293 | rq->rt.rt_nr_migratory--; | ||
294 | } | ||
295 | |||
296 | update_rt_migration(rq_of_rt_rq(rt_rq)); | ||
297 | #endif /* CONFIG_SMP */ | ||
298 | } | ||
299 | |||
300 | static void enqueue_rt_entity(struct sched_rt_entity *rt_se) | ||
301 | { | ||
302 | struct rt_rq *rt_rq = rt_rq_of_se(rt_se); | ||
303 | struct rt_prio_array *array = &rt_rq->active; | ||
304 | struct rt_rq *group_rq = group_rt_rq(rt_se); | ||
305 | |||
306 | if (group_rq && group_rq->rt_throttled) | ||
307 | return; | ||
308 | |||
309 | list_add_tail(&rt_se->run_list, array->queue + rt_se_prio(rt_se)); | ||
310 | __set_bit(rt_se_prio(rt_se), array->bitmap); | ||
311 | |||
312 | inc_rt_tasks(rt_se, rt_rq); | ||
313 | } | ||
314 | |||
315 | static void dequeue_rt_entity(struct sched_rt_entity *rt_se) | ||
316 | { | ||
317 | struct rt_rq *rt_rq = rt_rq_of_se(rt_se); | ||
318 | struct rt_prio_array *array = &rt_rq->active; | ||
319 | |||
320 | list_del_init(&rt_se->run_list); | ||
321 | if (list_empty(array->queue + rt_se_prio(rt_se))) | ||
322 | __clear_bit(rt_se_prio(rt_se), array->bitmap); | ||
323 | |||
324 | dec_rt_tasks(rt_se, rt_rq); | ||
325 | } | ||
326 | |||
327 | /* | ||
328 | * Because the prio of an upper entry depends on the lower | ||
329 | * entries, we must remove entries top - down. | ||
330 | * | ||
331 | * XXX: O(1/2 h^2) because we can only walk up, not down the chain. | ||
332 | * doesn't matter much for now, as h=2 for GROUP_SCHED. | ||
333 | */ | ||
334 | static void dequeue_rt_stack(struct task_struct *p) | ||
29 | { | 335 | { |
30 | struct rt_prio_array *array = &rq->rt.active; | 336 | struct sched_rt_entity *rt_se, *top_se; |
31 | 337 | ||
32 | list_add_tail(&p->run_list, array->queue + p->prio); | 338 | /* |
33 | __set_bit(p->prio, array->bitmap); | 339 | * dequeue all, top - down. |
340 | */ | ||
341 | do { | ||
342 | rt_se = &p->rt; | ||
343 | top_se = NULL; | ||
344 | for_each_sched_rt_entity(rt_se) { | ||
345 | if (on_rt_rq(rt_se)) | ||
346 | top_se = rt_se; | ||
347 | } | ||
348 | if (top_se) | ||
349 | dequeue_rt_entity(top_se); | ||
350 | } while (top_se); | ||
34 | } | 351 | } |
35 | 352 | ||
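dequeue_rt_stack() above must remove entities from the top of the hierarchy down, because a parent's queued priority depends on its children; it does so by repeatedly walking up from the task, remembering the highest entity still queued, and dequeuing that one. A toy model of that ordering (illustrative types only):

    #include <stdio.h>

    struct ent {
        const char *name;
        struct ent *parent;
        int on_rq;
    };

    int main(void)
    {
        struct ent group = { "group entity", NULL, 1 };
        struct ent task  = { "task entity", &group, 1 };

        for (;;) {
            struct ent *se, *top = NULL;

            /* Walk up and remember the highest queued entity. */
            for (se = &task; se; se = se->parent)
                if (se->on_rq)
                    top = se;
            if (!top)
                break;
            top->on_rq = 0;
            printf("dequeued %s\n", top->name); /* group first, then task */
        }
        return 0;
    }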
36 | /* | 353 | /* |
37 | * Adding/removing a task to/from a priority array: | 354 | * Adding/removing a task to/from a priority array: |
38 | */ | 355 | */ |
356 | static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup) | ||
357 | { | ||
358 | struct sched_rt_entity *rt_se = &p->rt; | ||
359 | |||
360 | if (wakeup) | ||
361 | rt_se->timeout = 0; | ||
362 | |||
363 | dequeue_rt_stack(p); | ||
364 | |||
365 | /* | ||
366 | * enqueue everybody, bottom - up. | ||
367 | */ | ||
368 | for_each_sched_rt_entity(rt_se) | ||
369 | enqueue_rt_entity(rt_se); | ||
370 | |||
371 | inc_cpu_load(rq, p->se.load.weight); | ||
372 | } | ||
373 | |||
39 | static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) | 374 | static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) |
40 | { | 375 | { |
41 | struct rt_prio_array *array = &rq->rt.active; | 376 | struct sched_rt_entity *rt_se = &p->rt; |
377 | struct rt_rq *rt_rq; | ||
42 | 378 | ||
43 | update_curr_rt(rq); | 379 | update_curr_rt(rq); |
44 | 380 | ||
45 | list_del(&p->run_list); | 381 | dequeue_rt_stack(p); |
46 | if (list_empty(array->queue + p->prio)) | 382 | |
47 | __clear_bit(p->prio, array->bitmap); | 383 | /* |
384 | * re-enqueue all non-empty rt_rq entities. | ||
385 | */ | ||
386 | for_each_sched_rt_entity(rt_se) { | ||
387 | rt_rq = group_rt_rq(rt_se); | ||
388 | if (rt_rq && rt_rq->rt_nr_running) | ||
389 | enqueue_rt_entity(rt_se); | ||
390 | } | ||
391 | |||
392 | dec_cpu_load(rq, p->se.load.weight); | ||
48 | } | 393 | } |
49 | 394 | ||
50 | /* | 395 | /* |
51 | * Put task to the end of the run list without the overhead of dequeue | 396 | * Put task to the end of the run list without the overhead of dequeue |
52 | * followed by enqueue. | 397 | * followed by enqueue. |
53 | */ | 398 | */ |
399 | static | ||
400 | void requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se) | ||
401 | { | ||
402 | struct rt_prio_array *array = &rt_rq->active; | ||
403 | |||
404 | list_move_tail(&rt_se->run_list, array->queue + rt_se_prio(rt_se)); | ||
405 | } | ||
406 | |||
54 | static void requeue_task_rt(struct rq *rq, struct task_struct *p) | 407 | static void requeue_task_rt(struct rq *rq, struct task_struct *p) |
55 | { | 408 | { |
56 | struct rt_prio_array *array = &rq->rt.active; | 409 | struct sched_rt_entity *rt_se = &p->rt; |
410 | struct rt_rq *rt_rq; | ||
57 | 411 | ||
58 | list_move_tail(&p->run_list, array->queue + p->prio); | 412 | for_each_sched_rt_entity(rt_se) { |
413 | rt_rq = rt_rq_of_se(rt_se); | ||
414 | requeue_rt_entity(rt_rq, rt_se); | ||
415 | } | ||
59 | } | 416 | } |
60 | 417 | ||
61 | static void | 418 | static void yield_task_rt(struct rq *rq) |
62 | yield_task_rt(struct rq *rq) | ||
63 | { | 419 | { |
64 | requeue_task_rt(rq, rq->curr); | 420 | requeue_task_rt(rq, rq->curr); |
65 | } | 421 | } |
66 | 422 | ||
423 | #ifdef CONFIG_SMP | ||
424 | static int find_lowest_rq(struct task_struct *task); | ||
425 | |||
426 | static int select_task_rq_rt(struct task_struct *p, int sync) | ||
427 | { | ||
428 | struct rq *rq = task_rq(p); | ||
429 | |||
430 | /* | ||
431 | * If the current task is an RT task, then | ||
432 | * try to see if we can wake this RT task up on another | ||
433 | * runqueue. Otherwise simply start this RT task | ||
434 | * on its current runqueue. | ||
435 | * | ||
436 | * We want to avoid overloading runqueues. Even if | ||
437 | * the RT task is of higher priority than the current RT task. | ||
438 | * RT tasks behave differently than other tasks. If | ||
439 | * one gets preempted, we try to push it off to another queue. | ||
440 | * So trying to keep a preempting RT task on the same | ||
441 | * cache hot CPU will force the running RT task to | ||
442 | * a cold CPU. So we waste all the cache for the lower | ||
443 | * RT task in hopes of saving some cache for an RT task | ||
444 | * that is just being woken and probably will have a | ||
445 | * cold cache anyway. | ||
446 | */ | ||
447 | if (unlikely(rt_task(rq->curr)) && | ||
448 | (p->rt.nr_cpus_allowed > 1)) { | ||
449 | int cpu = find_lowest_rq(p); | ||
450 | |||
451 | return (cpu == -1) ? task_cpu(p) : cpu; | ||
452 | } | ||
453 | |||
454 | /* | ||
455 | * Otherwise, just let it ride on the affined RQ and the | ||
456 | * post-schedule router will push the preempted task away | ||
457 | */ | ||
458 | return task_cpu(p); | ||
459 | } | ||
460 | #endif /* CONFIG_SMP */ | ||
461 | |||
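select_task_rq_rt() above only redirects a wakeup when the runqueue the task last ran on is already busy with an RT task and the wakee is allowed on more than one CPU; otherwise the task stays put and a later push handles any conflict. A toy decision function with invented inputs:

    #include <stdio.h>

    /* Illustrative model of the placement rule above; none of these
     * parameters correspond to real kernel structures. */
    static int pick_cpu(int task_cpu, int curr_is_rt,
                        int nr_cpus_allowed, int lowest_rq_cpu)
    {
        if (curr_is_rt && nr_cpus_allowed > 1)
            return lowest_rq_cpu == -1 ? task_cpu : lowest_rq_cpu;
        return task_cpu;
    }

    int main(void)
    {
        printf("%d\n", pick_cpu(0, 1, 4, 3)); /* 3: move to a quieter cpu */
        printf("%d\n", pick_cpu(0, 0, 4, 3)); /* 0: no RT task to disturb */
        printf("%d\n", pick_cpu(0, 1, 1, 3)); /* 0: the wakee is pinned */
        return 0;
    }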
67 | /* | 462 | /* |
68 | * Preempt the current task with a newly woken task if needed: | 463 | * Preempt the current task with a newly woken task if needed: |
69 | */ | 464 | */ |
@@ -73,25 +468,48 @@ static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p) | |||
73 | resched_task(rq->curr); | 468 | resched_task(rq->curr); |
74 | } | 469 | } |
75 | 470 | ||
76 | static struct task_struct *pick_next_task_rt(struct rq *rq) | 471 | static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq, |
472 | struct rt_rq *rt_rq) | ||
77 | { | 473 | { |
78 | struct rt_prio_array *array = &rq->rt.active; | 474 | struct rt_prio_array *array = &rt_rq->active; |
79 | struct task_struct *next; | 475 | struct sched_rt_entity *next = NULL; |
80 | struct list_head *queue; | 476 | struct list_head *queue; |
81 | int idx; | 477 | int idx; |
82 | 478 | ||
83 | idx = sched_find_first_bit(array->bitmap); | 479 | idx = sched_find_first_bit(array->bitmap); |
84 | if (idx >= MAX_RT_PRIO) | 480 | BUG_ON(idx >= MAX_RT_PRIO); |
85 | return NULL; | ||
86 | 481 | ||
87 | queue = array->queue + idx; | 482 | queue = array->queue + idx; |
88 | next = list_entry(queue->next, struct task_struct, run_list); | 483 | next = list_entry(queue->next, struct sched_rt_entity, run_list); |
89 | |||
90 | next->se.exec_start = rq->clock; | ||
91 | 484 | ||
92 | return next; | 485 | return next; |
93 | } | 486 | } |
94 | 487 | ||
488 | static struct task_struct *pick_next_task_rt(struct rq *rq) | ||
489 | { | ||
490 | struct sched_rt_entity *rt_se; | ||
491 | struct task_struct *p; | ||
492 | struct rt_rq *rt_rq; | ||
493 | |||
494 | rt_rq = &rq->rt; | ||
495 | |||
496 | if (unlikely(!rt_rq->rt_nr_running)) | ||
497 | return NULL; | ||
498 | |||
499 | if (sched_rt_ratio_exceeded(rt_rq)) | ||
500 | return NULL; | ||
501 | |||
502 | do { | ||
503 | rt_se = pick_next_rt_entity(rq, rt_rq); | ||
504 | BUG_ON(!rt_se); | ||
505 | rt_rq = group_rt_rq(rt_se); | ||
506 | } while (rt_rq); | ||
507 | |||
508 | p = rt_task_of(rt_se); | ||
509 | p->se.exec_start = rq->clock; | ||
510 | return p; | ||
511 | } | ||
512 | |||
95 | static void put_prev_task_rt(struct rq *rq, struct task_struct *p) | 513 | static void put_prev_task_rt(struct rq *rq, struct task_struct *p) |
96 | { | 514 | { |
97 | update_curr_rt(rq); | 515 | update_curr_rt(rq); |
@@ -99,76 +517,448 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p) | |||
99 | } | 517 | } |
100 | 518 | ||
101 | #ifdef CONFIG_SMP | 519 | #ifdef CONFIG_SMP |
102 | /* | 520 | |
103 | * Load-balancing iterator. Note: while the runqueue stays locked | 521 | /* Only try algorithms three times */ |
104 | * during the whole iteration, the current task might be | 522 | #define RT_MAX_TRIES 3 |
105 | * dequeued so the iterator has to be dequeue-safe. Here we | 523 | |
106 | * achieve that by always pre-iterating before returning | 524 | static int double_lock_balance(struct rq *this_rq, struct rq *busiest); |
107 | * the current task: | 525 | static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep); |
108 | */ | 526 | |
109 | static struct task_struct *load_balance_start_rt(void *arg) | 527 | static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) |
110 | { | 528 | { |
111 | struct rq *rq = arg; | 529 | if (!task_running(rq, p) && |
112 | struct rt_prio_array *array = &rq->rt.active; | 530 | (cpu < 0 || cpu_isset(cpu, p->cpus_allowed)) && |
113 | struct list_head *head, *curr; | 531 | (p->rt.nr_cpus_allowed > 1)) |
114 | struct task_struct *p; | 532 | return 1; |
533 | return 0; | ||
534 | } | ||
535 | |||
536 | /* Return the second highest RT task, NULL otherwise */ | ||
537 | static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu) | ||
538 | { | ||
539 | struct task_struct *next = NULL; | ||
540 | struct sched_rt_entity *rt_se; | ||
541 | struct rt_prio_array *array; | ||
542 | struct rt_rq *rt_rq; | ||
115 | int idx; | 543 | int idx; |
116 | 544 | ||
117 | idx = sched_find_first_bit(array->bitmap); | 545 | for_each_leaf_rt_rq(rt_rq, rq) { |
118 | if (idx >= MAX_RT_PRIO) | 546 | array = &rt_rq->active; |
119 | return NULL; | 547 | idx = sched_find_first_bit(array->bitmap); |
548 | next_idx: | ||
549 | if (idx >= MAX_RT_PRIO) | ||
550 | continue; | ||
551 | if (next && next->prio < idx) | ||
552 | continue; | ||
553 | list_for_each_entry(rt_se, array->queue + idx, run_list) { | ||
554 | struct task_struct *p = rt_task_of(rt_se); | ||
555 | if (pick_rt_task(rq, p, cpu)) { | ||
556 | next = p; | ||
557 | break; | ||
558 | } | ||
559 | } | ||
560 | if (!next) { | ||
561 | idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx+1); | ||
562 | goto next_idx; | ||
563 | } | ||
564 | } | ||
565 | |||
566 | return next; | ||
567 | } | ||
120 | 568 | ||
121 | head = array->queue + idx; | 569 | static DEFINE_PER_CPU(cpumask_t, local_cpu_mask); |
122 | curr = head->prev; | ||
123 | 570 | ||
124 | p = list_entry(curr, struct task_struct, run_list); | 571 | static int find_lowest_cpus(struct task_struct *task, cpumask_t *lowest_mask) |
572 | { | ||
573 | int lowest_prio = -1; | ||
574 | int lowest_cpu = -1; | ||
575 | int count = 0; | ||
576 | int cpu; | ||
125 | 577 | ||
126 | curr = curr->prev; | 578 | cpus_and(*lowest_mask, task_rq(task)->rd->online, task->cpus_allowed); |
127 | 579 | ||
128 | rq->rt.rt_load_balance_idx = idx; | 580 | /* |
129 | rq->rt.rt_load_balance_head = head; | 581 | * Scan each rq for the lowest prio. |
130 | rq->rt.rt_load_balance_curr = curr; | 582 | */ |
583 | for_each_cpu_mask(cpu, *lowest_mask) { | ||
584 | struct rq *rq = cpu_rq(cpu); | ||
131 | 585 | ||
132 | return p; | 586 | /* We look for lowest RT prio or non-rt CPU */ |
587 | if (rq->rt.highest_prio >= MAX_RT_PRIO) { | ||
588 | /* | ||
589 | * if we already found a low RT queue | ||
590 | * and now we found this non-rt queue | ||
591 | * clear the mask and set our bit. | ||
592 | * Otherwise just return the queue as is | ||
593 | * and the count==1 will cause the algorithm | ||
594 | * to use the first bit found. | ||
595 | */ | ||
596 | if (lowest_cpu != -1) { | ||
597 | cpus_clear(*lowest_mask); | ||
598 | cpu_set(rq->cpu, *lowest_mask); | ||
599 | } | ||
600 | return 1; | ||
601 | } | ||
602 | |||
603 | /* no locking for now */ | ||
604 | if ((rq->rt.highest_prio > task->prio) | ||
605 | && (rq->rt.highest_prio >= lowest_prio)) { | ||
606 | if (rq->rt.highest_prio > lowest_prio) { | ||
607 | /* new low - clear old data */ | ||
608 | lowest_prio = rq->rt.highest_prio; | ||
609 | lowest_cpu = cpu; | ||
610 | count = 0; | ||
611 | } | ||
612 | count++; | ||
613 | } else | ||
614 | cpu_clear(cpu, *lowest_mask); | ||
615 | } | ||
616 | |||
617 | /* | ||
618 | * Clear out all the set bits that represent | ||
619 | * runqueues that were of higher prio than | ||
620 | * the lowest_prio. | ||
621 | */ | ||
622 | if (lowest_cpu > 0) { | ||
623 | /* | ||
624 | * Perhaps we could add another cpumask op to | ||
625 | * zero out bits. Like cpu_zero_bits(cpumask, nrbits); | ||
626 | * Then that could be optimized to use memset and such. | ||
627 | */ | ||
628 | for_each_cpu_mask(cpu, *lowest_mask) { | ||
629 | if (cpu >= lowest_cpu) | ||
630 | break; | ||
631 | cpu_clear(cpu, *lowest_mask); | ||
632 | } | ||
633 | } | ||
634 | |||
635 | return count; | ||
133 | } | 636 | } |
134 | 637 | ||
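find_lowest_cpus() above builds a mask of CPUs whose highest queued RT priority is lower than the task's (a numerically larger prio value), treating a CPU with no RT work at all as an immediate best choice. A standalone sketch of the scan over an invented per-cpu priority table:

    #include <stdio.h>

    #define NCPUS       4
    #define MAX_RT_PRIO 100

    int main(void)
    {
        /* Per-cpu highest queued RT prio (lower number = higher prio);
         * MAX_RT_PRIO means the cpu runs no RT task. Invented values. */
        int highest_prio[NCPUS] = { 40, 60, 60, 45 };
        int task_prio = 50;
        int cpu, lowest_prio = -1, count = 0;

        for (cpu = 0; cpu < NCPUS; cpu++) {
            if (highest_prio[cpu] >= MAX_RT_PRIO) {
                printf("cpu%d has no RT work: ideal target\n", cpu);
                return 0;
            }
            /* Keep only cpus running strictly lower-prio RT tasks,
             * tracking the lowest priority level seen so far. */
            if (highest_prio[cpu] > task_prio &&
                highest_prio[cpu] >= lowest_prio) {
                if (highest_prio[cpu] > lowest_prio) {
                    lowest_prio = highest_prio[cpu];
                    count = 0;
                }
                count++;
                printf("cpu%d is a candidate (prio %d)\n",
                       cpu, highest_prio[cpu]);
            }
        }
        printf("%d candidate cpu(s) at prio level %d\n", count, lowest_prio);
        return 0;
    }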
135 | static struct task_struct *load_balance_next_rt(void *arg) | 638 | static inline int pick_optimal_cpu(int this_cpu, cpumask_t *mask) |
136 | { | 639 | { |
137 | struct rq *rq = arg; | 640 | int first; |
138 | struct rt_prio_array *array = &rq->rt.active; | 641 | |
139 | struct list_head *head, *curr; | 642 | /* "this_cpu" is cheaper to preempt than a remote processor */ |
140 | struct task_struct *p; | 643 | if ((this_cpu != -1) && cpu_isset(this_cpu, *mask)) |
141 | int idx; | 644 | return this_cpu; |
645 | |||
646 | first = first_cpu(*mask); | ||
647 | if (first != NR_CPUS) | ||
648 | return first; | ||
649 | |||
650 | return -1; | ||
651 | } | ||
652 | |||
653 | static int find_lowest_rq(struct task_struct *task) | ||
654 | { | ||
655 | struct sched_domain *sd; | ||
656 | cpumask_t *lowest_mask = &__get_cpu_var(local_cpu_mask); | ||
657 | int this_cpu = smp_processor_id(); | ||
658 | int cpu = task_cpu(task); | ||
659 | int count = find_lowest_cpus(task, lowest_mask); | ||
660 | |||
661 | if (!count) | ||
662 | return -1; /* No targets found */ | ||
142 | 663 | ||
143 | idx = rq->rt.rt_load_balance_idx; | 664 | /* |
144 | head = rq->rt.rt_load_balance_head; | 665 | * There is no sense in performing an optimal search if only one |
145 | curr = rq->rt.rt_load_balance_curr; | 666 | * target is found. |
667 | */ | ||
668 | if (count == 1) | ||
669 | return first_cpu(*lowest_mask); | ||
670 | |||
671 | /* | ||
672 | * At this point we have built a mask of cpus representing the | ||
673 | * lowest priority tasks in the system. Now we want to elect | ||
674 | * the best one based on our affinity and topology. | ||
675 | * | ||
676 | * We prioritize the last cpu that the task executed on since | ||
677 | * it is most likely cache-hot in that location. | ||
678 | */ | ||
679 | if (cpu_isset(cpu, *lowest_mask)) | ||
680 | return cpu; | ||
146 | 681 | ||
147 | /* | 682 | /* |
148 | * If we arrived back to the head again then | 683 | * Otherwise, we consult the sched_domains span maps to figure |
149 | * iterate to the next queue (if any): | 684 | * out which cpu is logically closest to our hot cache data. |
150 | */ | 685 | */ |
151 | if (unlikely(head == curr)) { | 686 | if (this_cpu == cpu) |
152 | int next_idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx+1); | 687 | this_cpu = -1; /* Skip this_cpu opt if the same */ |
153 | 688 | ||
154 | if (next_idx >= MAX_RT_PRIO) | 689 | for_each_domain(cpu, sd) { |
155 | return NULL; | 690 | if (sd->flags & SD_WAKE_AFFINE) { |
691 | cpumask_t domain_mask; | ||
692 | int best_cpu; | ||
156 | 693 | ||
157 | idx = next_idx; | 694 | cpus_and(domain_mask, sd->span, *lowest_mask); |
158 | head = array->queue + idx; | ||
159 | curr = head->prev; | ||
160 | 695 | ||
161 | rq->rt.rt_load_balance_idx = idx; | 696 | best_cpu = pick_optimal_cpu(this_cpu, |
162 | rq->rt.rt_load_balance_head = head; | 697 | &domain_mask); |
698 | if (best_cpu != -1) | ||
699 | return best_cpu; | ||
700 | } | ||
163 | } | 701 | } |
164 | 702 | ||
165 | p = list_entry(curr, struct task_struct, run_list); | 703 | /* |
704 | * And finally, if there were no matches within the domains | ||
705 | * just give the caller *something* to work with from the compatible | ||
706 | * locations. | ||
707 | */ | ||
708 | return pick_optimal_cpu(this_cpu, lowest_mask); | ||
709 | } | ||
166 | 710 | ||
167 | curr = curr->prev; | 711 | /* Will lock the rq it finds */ |
712 | static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) | ||
713 | { | ||
714 | struct rq *lowest_rq = NULL; | ||
715 | int tries; | ||
716 | int cpu; | ||
168 | 717 | ||
169 | rq->rt.rt_load_balance_curr = curr; | 718 | for (tries = 0; tries < RT_MAX_TRIES; tries++) { |
719 | cpu = find_lowest_rq(task); | ||
170 | 720 | ||
171 | return p; | 721 | if ((cpu == -1) || (cpu == rq->cpu)) |
722 | break; | ||
723 | |||
724 | lowest_rq = cpu_rq(cpu); | ||
725 | |||
726 | /* if the prio of this runqueue changed, try again */ | ||
727 | if (double_lock_balance(rq, lowest_rq)) { | ||
728 | /* | ||
729 | * We had to unlock the run queue. In | ||
730 | * the meantime, the task could have | ||
731 | * migrated already or had its affinity changed. | ||
732 | * Also make sure that it wasn't scheduled on its rq. | ||
733 | */ | ||
734 | if (unlikely(task_rq(task) != rq || | ||
735 | !cpu_isset(lowest_rq->cpu, | ||
736 | task->cpus_allowed) || | ||
737 | task_running(rq, task) || | ||
738 | !task->se.on_rq)) { | ||
739 | |||
740 | spin_unlock(&lowest_rq->lock); | ||
741 | lowest_rq = NULL; | ||
742 | break; | ||
743 | } | ||
744 | } | ||
745 | |||
746 | /* If this rq is still suitable, use it. */ | ||
747 | if (lowest_rq->rt.highest_prio > task->prio) | ||
748 | break; | ||
749 | |||
750 | /* try again */ | ||
751 | spin_unlock(&lowest_rq->lock); | ||
752 | lowest_rq = NULL; | ||
753 | } | ||
754 | |||
755 | return lowest_rq; | ||
756 | } | ||
757 | |||
758 | /* | ||
759 | * If the current CPU has more than one RT task, see if a | ||
760 | * non-running task can migrate over to a CPU that is running a task | ||
761 | * of lesser priority. | ||
762 | */ | ||
763 | static int push_rt_task(struct rq *rq) | ||
764 | { | ||
765 | struct task_struct *next_task; | ||
766 | struct rq *lowest_rq; | ||
767 | int ret = 0; | ||
768 | int paranoid = RT_MAX_TRIES; | ||
769 | |||
770 | if (!rq->rt.overloaded) | ||
771 | return 0; | ||
772 | |||
773 | next_task = pick_next_highest_task_rt(rq, -1); | ||
774 | if (!next_task) | ||
775 | return 0; | ||
776 | |||
777 | retry: | ||
778 | if (unlikely(next_task == rq->curr)) { | ||
779 | WARN_ON(1); | ||
780 | return 0; | ||
781 | } | ||
782 | |||
783 | /* | ||
784 | * It's possible that the next_task slipped in with a | ||
785 | * higher priority than current. If that's the case, | ||
786 | * just reschedule current. | ||
787 | */ | ||
788 | if (unlikely(next_task->prio < rq->curr->prio)) { | ||
789 | resched_task(rq->curr); | ||
790 | return 0; | ||
791 | } | ||
792 | |||
793 | /* We might release rq lock */ | ||
794 | get_task_struct(next_task); | ||
795 | |||
796 | /* find_lock_lowest_rq locks the rq if found */ | ||
797 | lowest_rq = find_lock_lowest_rq(next_task, rq); | ||
798 | if (!lowest_rq) { | ||
799 | struct task_struct *task; | ||
800 | /* | ||
801 | * find_lock_lowest_rq releases rq->lock | ||
802 | * so it is possible that next_task has changed. | ||
803 | * If it has, then try again. | ||
804 | */ | ||
805 | task = pick_next_highest_task_rt(rq, -1); | ||
806 | if (unlikely(task != next_task) && task && paranoid--) { | ||
807 | put_task_struct(next_task); | ||
808 | next_task = task; | ||
809 | goto retry; | ||
810 | } | ||
811 | goto out; | ||
812 | } | ||
813 | |||
814 | deactivate_task(rq, next_task, 0); | ||
815 | set_task_cpu(next_task, lowest_rq->cpu); | ||
816 | activate_task(lowest_rq, next_task, 0); | ||
817 | |||
818 | resched_task(lowest_rq->curr); | ||
819 | |||
820 | spin_unlock(&lowest_rq->lock); | ||
821 | |||
822 | ret = 1; | ||
823 | out: | ||
824 | put_task_struct(next_task); | ||
825 | |||
826 | return ret; | ||
827 | } | ||
828 | |||
829 | /* | ||
830 | * TODO: Currently we just use the second highest prio task on | ||
831 | * the queue, and stop when it can't migrate (or there's | ||
832 | * no more RT tasks). There may be a case where a lower | ||
833 | * priority RT task has a different affinity than the | ||
834 | * higher RT task. In this case the lower RT task could | ||
835 | * possibly be able to migrate whereas the higher priority | ||
836 | * RT task could not. We currently ignore this issue. | ||
837 | * Enhancements are welcome! | ||
838 | */ | ||
839 | static void push_rt_tasks(struct rq *rq) | ||
840 | { | ||
841 | /* push_rt_task will return true if it moved an RT task */ | ||
842 | while (push_rt_task(rq)) | ||
843 | ; | ||
844 | } | ||
845 | |||
846 | static int pull_rt_task(struct rq *this_rq) | ||
847 | { | ||
848 | int this_cpu = this_rq->cpu, ret = 0, cpu; | ||
849 | struct task_struct *p, *next; | ||
850 | struct rq *src_rq; | ||
851 | |||
852 | if (likely(!rt_overloaded(this_rq))) | ||
853 | return 0; | ||
854 | |||
855 | next = pick_next_task_rt(this_rq); | ||
856 | |||
857 | for_each_cpu_mask(cpu, this_rq->rd->rto_mask) { | ||
858 | if (this_cpu == cpu) | ||
859 | continue; | ||
860 | |||
861 | src_rq = cpu_rq(cpu); | ||
862 | /* | ||
863 | * We can potentially drop this_rq's lock in | ||
864 | * double_lock_balance, and another CPU could | ||
865 | * steal our next task - hence we must cause | ||
866 | * the caller to recalculate the next task | ||
867 | * in that case: | ||
868 | */ | ||
869 | if (double_lock_balance(this_rq, src_rq)) { | ||
870 | struct task_struct *old_next = next; | ||
871 | |||
872 | next = pick_next_task_rt(this_rq); | ||
873 | if (next != old_next) | ||
874 | ret = 1; | ||
875 | } | ||
876 | |||
877 | /* | ||
878 | * Are there still pullable RT tasks? | ||
879 | */ | ||
880 | if (src_rq->rt.rt_nr_running <= 1) | ||
881 | goto skip; | ||
882 | |||
883 | p = pick_next_highest_task_rt(src_rq, this_cpu); | ||
884 | |||
885 | /* | ||
886 | * Do we have an RT task that preempts | ||
887 | * the to-be-scheduled task? | ||
888 | */ | ||
889 | if (p && (!next || (p->prio < next->prio))) { | ||
890 | WARN_ON(p == src_rq->curr); | ||
891 | WARN_ON(!p->se.on_rq); | ||
892 | |||
893 | /* | ||
894 | * There's a chance that p is higher in priority | ||
895 | * than what's currently running on its cpu. | ||
896 | * This is just that p is waking up and hasn't | ||
897 | * had a chance to schedule. We only pull | ||
898 | * p if it is lower in priority than the | ||
899 | * current task on the run queue or | ||
900 | * this_rq's next task is lower in prio than | ||
901 | * the current task on that rq. | ||
902 | */ | ||
903 | if (p->prio < src_rq->curr->prio || | ||
904 | (next && next->prio < src_rq->curr->prio)) | ||
905 | goto skip; | ||
906 | |||
907 | ret = 1; | ||
908 | |||
909 | deactivate_task(src_rq, p, 0); | ||
910 | set_task_cpu(p, this_cpu); | ||
911 | activate_task(this_rq, p, 0); | ||
912 | /* | ||
913 | * We continue with the search, just in | ||
914 | * case there's an even higher prio task | ||
915 | * in another runqueue. (low likelihood | ||
916 | * but possible) | ||
917 | * | ||
918 | * Update next so that we won't pick a task | ||
919 | * on another cpu with a priority lower (or equal) | ||
920 | * than the one we just picked. | ||
921 | */ | ||
922 | next = p; | ||
923 | |||
924 | } | ||
925 | skip: | ||
926 | spin_unlock(&src_rq->lock); | ||
927 | } | ||
928 | |||
929 | return ret; | ||
930 | } | ||
931 | |||
932 | static void pre_schedule_rt(struct rq *rq, struct task_struct *prev) | ||
933 | { | ||
934 | /* Try to pull RT tasks here if we lower this rq's prio */ | ||
935 | if (unlikely(rt_task(prev)) && rq->rt.highest_prio > prev->prio) | ||
936 | pull_rt_task(rq); | ||
937 | } | ||
938 | |||
939 | static void post_schedule_rt(struct rq *rq) | ||
940 | { | ||
941 | /* | ||
942 | * If we have more than one rt_task queued, then | ||
943 | * see if we can push the other rt_tasks off to other CPUs. | ||
944 | * Note we may release the rq lock, and since | ||
945 | * the lock was owned by prev, we need to release it | ||
946 | * first via finish_lock_switch and then reacquire it here. | ||
947 | */ | ||
948 | if (unlikely(rq->rt.overloaded)) { | ||
949 | spin_lock_irq(&rq->lock); | ||
950 | push_rt_tasks(rq); | ||
951 | spin_unlock_irq(&rq->lock); | ||
952 | } | ||
953 | } | ||
954 | |||
955 | |||
956 | static void task_wake_up_rt(struct rq *rq, struct task_struct *p) | ||
957 | { | ||
958 | if (!task_running(rq, p) && | ||
959 | (p->prio >= rq->rt.highest_prio) && | ||
960 | rq->rt.overloaded) | ||
961 | push_rt_tasks(rq); | ||
172 | } | 962 | } |
173 | 963 | ||
174 | static unsigned long | 964 | static unsigned long |
@@ -177,36 +967,170 @@ load_balance_rt(struct rq *this_rq, int this_cpu, struct rq *busiest, | |||
177 | struct sched_domain *sd, enum cpu_idle_type idle, | 967 | struct sched_domain *sd, enum cpu_idle_type idle, |
178 | int *all_pinned, int *this_best_prio) | 968 | int *all_pinned, int *this_best_prio) |
179 | { | 969 | { |
180 | struct rq_iterator rt_rq_iterator; | 970 | /* don't touch RT tasks */ |
181 | 971 | return 0; | |
182 | rt_rq_iterator.start = load_balance_start_rt; | ||
183 | rt_rq_iterator.next = load_balance_next_rt; | ||
184 | /* pass 'busiest' rq argument into | ||
185 | * load_balance_[start|next]_rt iterators | ||
186 | */ | ||
187 | rt_rq_iterator.arg = busiest; | ||
188 | |||
189 | return balance_tasks(this_rq, this_cpu, busiest, max_load_move, sd, | ||
190 | idle, all_pinned, this_best_prio, &rt_rq_iterator); | ||
191 | } | 972 | } |
192 | 973 | ||
193 | static int | 974 | static int |
194 | move_one_task_rt(struct rq *this_rq, int this_cpu, struct rq *busiest, | 975 | move_one_task_rt(struct rq *this_rq, int this_cpu, struct rq *busiest, |
195 | struct sched_domain *sd, enum cpu_idle_type idle) | 976 | struct sched_domain *sd, enum cpu_idle_type idle) |
196 | { | 977 | { |
197 | struct rq_iterator rt_rq_iterator; | 978 | /* don't touch RT tasks */ |
979 | return 0; | ||
980 | } | ||
981 | |||
982 | static void set_cpus_allowed_rt(struct task_struct *p, cpumask_t *new_mask) | ||
983 | { | ||
984 | int weight = cpus_weight(*new_mask); | ||
198 | 985 | ||
199 | rt_rq_iterator.start = load_balance_start_rt; | 986 | BUG_ON(!rt_task(p)); |
200 | rt_rq_iterator.next = load_balance_next_rt; | 987 | |
201 | rt_rq_iterator.arg = busiest; | 988 | /* |
989 | * Update the migration status of the RQ if we have an RT task | ||
990 | * which is running AND changing its weight value. | ||
991 | */ | ||
992 | if (p->se.on_rq && (weight != p->rt.nr_cpus_allowed)) { | ||
993 | struct rq *rq = task_rq(p); | ||
202 | 994 | ||
203 | return iter_move_one_task(this_rq, this_cpu, busiest, sd, idle, | 995 | if ((p->rt.nr_cpus_allowed <= 1) && (weight > 1)) { |
204 | &rt_rq_iterator); | 996 | rq->rt.rt_nr_migratory++; |
997 | } else if ((p->rt.nr_cpus_allowed > 1) && (weight <= 1)) { | ||
998 | BUG_ON(!rq->rt.rt_nr_migratory); | ||
999 | rq->rt.rt_nr_migratory--; | ||
1000 | } | ||
1001 | |||
1002 | update_rt_migration(rq); | ||
1003 | } | ||
1004 | |||
1005 | p->cpus_allowed = *new_mask; | ||
1006 | p->rt.nr_cpus_allowed = weight; | ||
205 | } | 1007 | } |
206 | #endif | ||
207 | 1008 | ||
208 | static void task_tick_rt(struct rq *rq, struct task_struct *p) | 1009 | /* Assumes rq->lock is held */ |
1010 | static void join_domain_rt(struct rq *rq) | ||
209 | { | 1011 | { |
1012 | if (rq->rt.overloaded) | ||
1013 | rt_set_overload(rq); | ||
1014 | } | ||
1015 | |||
1016 | /* Assumes rq->lock is held */ | ||
1017 | static void leave_domain_rt(struct rq *rq) | ||
1018 | { | ||
1019 | if (rq->rt.overloaded) | ||
1020 | rt_clear_overload(rq); | ||
1021 | } | ||
1022 | |||
1023 | /* | ||
1024 | * When switching away from the rt queue, we may end up in a position | ||
1025 | * where we want to pull RT tasks from other runqueues. | ||
1026 | */ | ||
1027 | static void switched_from_rt(struct rq *rq, struct task_struct *p, | ||
1028 | int running) | ||
1029 | { | ||
1030 | /* | ||
1031 | * If there are other RT tasks then we will reschedule | ||
1032 | * and the scheduling of the other RT tasks will handle | ||
1033 | * the balancing. But if we are the last RT task | ||
1034 | * we may need to handle the pulling of RT tasks | ||
1035 | * now. | ||
1036 | */ | ||
1037 | if (!rq->rt.rt_nr_running) | ||
1038 | pull_rt_task(rq); | ||
1039 | } | ||
1040 | #endif /* CONFIG_SMP */ | ||
1041 | |||
1042 | /* | ||
1043 | * When switching a task to RT, we may overload the runqueue | ||
1044 | * with RT tasks. In this case we try to push them off to | ||
1045 | * other runqueues. | ||
1046 | */ | ||
1047 | static void switched_to_rt(struct rq *rq, struct task_struct *p, | ||
1048 | int running) | ||
1049 | { | ||
1050 | int check_resched = 1; | ||
1051 | |||
1052 | /* | ||
1053 | * If we are already running, then there's nothing | ||
1054 | * that needs to be done. But if we are not running | ||
1055 | * we may need to preempt the current running task. | ||
1056 | * If that current running task is also an RT task | ||
1057 | * then see if we can move to another run queue. | ||
1058 | */ | ||
1059 | if (!running) { | ||
1060 | #ifdef CONFIG_SMP | ||
1061 | if (rq->rt.overloaded && push_rt_task(rq) && | ||
1062 | /* Don't resched if we changed runqueues */ | ||
1063 | rq != task_rq(p)) | ||
1064 | check_resched = 0; | ||
1065 | #endif /* CONFIG_SMP */ | ||
1066 | if (check_resched && p->prio < rq->curr->prio) | ||
1067 | resched_task(rq->curr); | ||
1068 | } | ||
1069 | } | ||
1070 | |||
1071 | /* | ||
1072 | * Priority of the task has changed. This may cause | ||
1073 | * us to initiate a push or pull. | ||
1074 | */ | ||
1075 | static void prio_changed_rt(struct rq *rq, struct task_struct *p, | ||
1076 | int oldprio, int running) | ||
1077 | { | ||
1078 | if (running) { | ||
1079 | #ifdef CONFIG_SMP | ||
1080 | /* | ||
1081 | * If our priority decreases while running, we | ||
1082 | * may need to pull tasks to this runqueue. | ||
1083 | */ | ||
1084 | if (oldprio < p->prio) | ||
1085 | pull_rt_task(rq); | ||
1086 | /* | ||
1087 | * If there's a higher priority task waiting to run | ||
1088 | * then reschedule. | ||
1089 | */ | ||
1090 | if (p->prio > rq->rt.highest_prio) | ||
1091 | resched_task(p); | ||
1092 | #else | ||
1093 | /* For UP simply resched on drop of prio */ | ||
1094 | if (oldprio < p->prio) | ||
1095 | resched_task(p); | ||
1096 | #endif /* CONFIG_SMP */ | ||
1097 | } else { | ||
1098 | /* | ||
1099 | * This task is not running, but if it is | ||
1100 | * higher in priority than the currently running task | ||
1101 | * then reschedule. | ||
1102 | */ | ||
1103 | if (p->prio < rq->curr->prio) | ||
1104 | resched_task(rq->curr); | ||
1105 | } | ||
1106 | } | ||
1107 | |||
1108 | static void watchdog(struct rq *rq, struct task_struct *p) | ||
1109 | { | ||
1110 | unsigned long soft, hard; | ||
1111 | |||
1112 | if (!p->signal) | ||
1113 | return; | ||
1114 | |||
1115 | soft = p->signal->rlim[RLIMIT_RTTIME].rlim_cur; | ||
1116 | hard = p->signal->rlim[RLIMIT_RTTIME].rlim_max; | ||
1117 | |||
1118 | if (soft != RLIM_INFINITY) { | ||
1119 | unsigned long next; | ||
1120 | |||
1121 | p->rt.timeout++; | ||
1122 | next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ); | ||
1123 | if (p->rt.timeout > next) | ||
1124 | p->it_sched_expires = p->se.sum_exec_runtime; | ||
1125 | } | ||
1126 | } | ||
1127 | |||
1128 | static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) | ||
1129 | { | ||
1130 | update_curr_rt(rq); | ||
1131 | |||
1132 | watchdog(rq, p); | ||
1133 | |||
210 | /* | 1134 | /* |
211 | * RR tasks need a special form of timeslice management. | 1135 | * RR tasks need a special form of timeslice management. |
212 | * FIFO tasks have no timeslices. | 1136 | * FIFO tasks have no timeslices. |
@@ -214,16 +1138,16 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p) | |||
214 | if (p->policy != SCHED_RR) | 1138 | if (p->policy != SCHED_RR) |
215 | return; | 1139 | return; |
216 | 1140 | ||
217 | if (--p->time_slice) | 1141 | if (--p->rt.time_slice) |
218 | return; | 1142 | return; |
219 | 1143 | ||
220 | p->time_slice = DEF_TIMESLICE; | 1144 | p->rt.time_slice = DEF_TIMESLICE; |
221 | 1145 | ||
222 | /* | 1146 | /* |
223 | * Requeue to the end of queue if we are not the only element | 1147 | * Requeue to the end of queue if we are not the only element |
224 | * on the queue: | 1148 | * on the queue: |
225 | */ | 1149 | */ |
226 | if (p->run_list.prev != p->run_list.next) { | 1150 | if (p->rt.run_list.prev != p->rt.run_list.next) { |
227 | requeue_task_rt(rq, p); | 1151 | requeue_task_rt(rq, p); |
228 | set_tsk_need_resched(p); | 1152 | set_tsk_need_resched(p); |
229 | } | 1153 | } |
@@ -241,6 +1165,9 @@ const struct sched_class rt_sched_class = { | |||
241 | .enqueue_task = enqueue_task_rt, | 1165 | .enqueue_task = enqueue_task_rt, |
242 | .dequeue_task = dequeue_task_rt, | 1166 | .dequeue_task = dequeue_task_rt, |
243 | .yield_task = yield_task_rt, | 1167 | .yield_task = yield_task_rt, |
1168 | #ifdef CONFIG_SMP | ||
1169 | .select_task_rq = select_task_rq_rt, | ||
1170 | #endif /* CONFIG_SMP */ | ||
244 | 1171 | ||
245 | .check_preempt_curr = check_preempt_curr_rt, | 1172 | .check_preempt_curr = check_preempt_curr_rt, |
246 | 1173 | ||
@@ -250,8 +1177,18 @@ const struct sched_class rt_sched_class = { | |||
250 | #ifdef CONFIG_SMP | 1177 | #ifdef CONFIG_SMP |
251 | .load_balance = load_balance_rt, | 1178 | .load_balance = load_balance_rt, |
252 | .move_one_task = move_one_task_rt, | 1179 | .move_one_task = move_one_task_rt, |
1180 | .set_cpus_allowed = set_cpus_allowed_rt, | ||
1181 | .join_domain = join_domain_rt, | ||
1182 | .leave_domain = leave_domain_rt, | ||
1183 | .pre_schedule = pre_schedule_rt, | ||
1184 | .post_schedule = post_schedule_rt, | ||
1185 | .task_wake_up = task_wake_up_rt, | ||
1186 | .switched_from = switched_from_rt, | ||
253 | #endif | 1187 | #endif |
254 | 1188 | ||
255 | .set_curr_task = set_curr_task_rt, | 1189 | .set_curr_task = set_curr_task_rt, |
256 | .task_tick = task_tick_rt, | 1190 | .task_tick = task_tick_rt, |
1191 | |||
1192 | .prio_changed = prio_changed_rt, | ||
1193 | .switched_to = switched_to_rt, | ||
257 | }; | 1194 | }; |
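
The watchdog() hook added above converts the per-task RLIMIT_RTTIME budget, expressed in microseconds, into scheduler ticks via DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ). A minimal userspace sketch of that arithmetic follows; HZ=1000 and the demo function name are assumptions for illustration only, since the real value is a kernel build-time constant.

#include <stdio.h>

#define USEC_PER_SEC 1000000UL
#define HZ 1000UL                      /* assumption: CONFIG_HZ=1000 */
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

static unsigned long rttime_limit_in_ticks(unsigned long soft_us,
                                           unsigned long hard_us)
{
        unsigned long limit_us = soft_us < hard_us ? soft_us : hard_us;

        /* one tick is USEC_PER_SEC/HZ microseconds; round the budget up */
        return DIV_ROUND_UP(limit_us, USEC_PER_SEC / HZ);
}

int main(void)
{
        /* a 950ms soft limit becomes 950 ticks at HZ=1000 */
        printf("%lu\n", rttime_limit_in_ticks(950000, 1000000));
        return 0;
}

Because the division rounds up, even a budget smaller than one tick still yields a one-tick limit, so a tiny soft limit is not silently ignored.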
diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h index ef1a7df80ea2..5b32433e7ee5 100644 --- a/kernel/sched_stats.h +++ b/kernel/sched_stats.h | |||
@@ -52,7 +52,8 @@ static int show_schedstat(struct seq_file *seq, void *v) | |||
52 | sd->lb_nobusyq[itype], | 52 | sd->lb_nobusyq[itype], |
53 | sd->lb_nobusyg[itype]); | 53 | sd->lb_nobusyg[itype]); |
54 | } | 54 | } |
55 | seq_printf(seq, " %u %u %u %u %u %u %u %u %u %u %u %u\n", | 55 | seq_printf(seq, |
56 | " %u %u %u %u %u %u %u %u %u %u %u %u\n", | ||
56 | sd->alb_count, sd->alb_failed, sd->alb_pushed, | 57 | sd->alb_count, sd->alb_failed, sd->alb_pushed, |
57 | sd->sbe_count, sd->sbe_balanced, sd->sbe_pushed, | 58 | sd->sbe_count, sd->sbe_balanced, sd->sbe_pushed, |
58 | sd->sbf_count, sd->sbf_balanced, sd->sbf_pushed, | 59 | sd->sbf_count, sd->sbf_balanced, sd->sbf_pushed, |
@@ -127,7 +128,7 @@ rq_sched_info_depart(struct rq *rq, unsigned long long delta) | |||
127 | # define schedstat_set(var, val) do { } while (0) | 128 | # define schedstat_set(var, val) do { } while (0) |
128 | #endif | 129 | #endif |
129 | 130 | ||
130 | #ifdef CONFIG_SCHEDSTATS | 131 | #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) |
131 | /* | 132 | /* |
132 | * Called when a process is dequeued from the active array and given | 133 | * Called when a process is dequeued from the active array and given |
133 | * the cpu. We should note that with the exception of interactive | 134 | * the cpu. We should note that with the exception of interactive |
@@ -155,7 +156,7 @@ static inline void sched_info_dequeued(struct task_struct *t) | |||
155 | */ | 156 | */ |
156 | static void sched_info_arrive(struct task_struct *t) | 157 | static void sched_info_arrive(struct task_struct *t) |
157 | { | 158 | { |
158 | unsigned long long now = sched_clock(), delta = 0; | 159 | unsigned long long now = task_rq(t)->clock, delta = 0; |
159 | 160 | ||
160 | if (t->sched_info.last_queued) | 161 | if (t->sched_info.last_queued) |
161 | delta = now - t->sched_info.last_queued; | 162 | delta = now - t->sched_info.last_queued; |
@@ -186,7 +187,7 @@ static inline void sched_info_queued(struct task_struct *t) | |||
186 | { | 187 | { |
187 | if (unlikely(sched_info_on())) | 188 | if (unlikely(sched_info_on())) |
188 | if (!t->sched_info.last_queued) | 189 | if (!t->sched_info.last_queued) |
189 | t->sched_info.last_queued = sched_clock(); | 190 | t->sched_info.last_queued = task_rq(t)->clock; |
190 | } | 191 | } |
191 | 192 | ||
192 | /* | 193 | /* |
@@ -195,7 +196,8 @@ static inline void sched_info_queued(struct task_struct *t) | |||
195 | */ | 196 | */ |
196 | static inline void sched_info_depart(struct task_struct *t) | 197 | static inline void sched_info_depart(struct task_struct *t) |
197 | { | 198 | { |
198 | unsigned long long delta = sched_clock() - t->sched_info.last_arrival; | 199 | unsigned long long delta = task_rq(t)->clock - |
200 | t->sched_info.last_arrival; | ||
199 | 201 | ||
200 | t->sched_info.cpu_time += delta; | 202 | t->sched_info.cpu_time += delta; |
201 | rq_sched_info_depart(task_rq(t), delta); | 203 | rq_sched_info_depart(task_rq(t), delta); |
@@ -231,5 +233,5 @@ sched_info_switch(struct task_struct *prev, struct task_struct *next) | |||
231 | #else | 233 | #else |
232 | #define sched_info_queued(t) do { } while (0) | 234 | #define sched_info_queued(t) do { } while (0) |
233 | #define sched_info_switch(t, next) do { } while (0) | 235 | #define sched_info_switch(t, next) do { } while (0) |
234 | #endif /* CONFIG_SCHEDSTATS */ | 236 | #endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */ |
235 | 237 | ||
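
The sched_stats.h hunks above move delay accounting from sched_clock() to the owning runqueue's clock so queued-wait and on-cpu deltas are measured against a single time base. The standalone model below mirrors the arrive/depart bookkeeping with a caller-supplied clock value; the struct and function names echo the kernel's sched_info fields, but everything here is illustrative rather than the in-tree implementation.

#include <stdio.h>

struct demo_sched_info {
        unsigned long long last_queued;   /* when the task became runnable */
        unsigned long long last_arrival;  /* when it was handed the cpu    */
        unsigned long long run_delay;     /* total time spent waiting      */
        unsigned long long cpu_time;      /* total time spent running      */
};

/* task becomes runnable at rq-clock 'now' (nanoseconds) */
static void demo_sched_info_queued(struct demo_sched_info *si,
                                   unsigned long long now)
{
        if (!si->last_queued)
                si->last_queued = now;
}

/* task gets the cpu at rq-clock 'now' */
static void demo_sched_info_arrive(struct demo_sched_info *si,
                                   unsigned long long now)
{
        if (si->last_queued)
                si->run_delay += now - si->last_queued;
        si->last_queued = 0;
        si->last_arrival = now;
}

/* task leaves the cpu at rq-clock 'now' */
static void demo_sched_info_depart(struct demo_sched_info *si,
                                   unsigned long long now)
{
        si->cpu_time += now - si->last_arrival;
}

int main(void)
{
        struct demo_sched_info si = { 0 };

        demo_sched_info_queued(&si, 1000);
        demo_sched_info_arrive(&si, 1400);   /* waited 400ns */
        demo_sched_info_depart(&si, 2400);   /* ran 1000ns   */
        printf("run_delay=%llu cpu_time=%llu\n", si.run_delay, si.cpu_time);
        return 0;
}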
diff --git a/kernel/signal.c b/kernel/signal.c index 12006308c7eb..4333b6dbb424 100644 --- a/kernel/signal.c +++ b/kernel/signal.c | |||
@@ -55,7 +55,7 @@ static int sig_ignored(struct task_struct *t, int sig) | |||
55 | * signal handler may change by the time it is | 55 | * signal handler may change by the time it is |
56 | * unblocked. | 56 | * unblocked. |
57 | */ | 57 | */ |
58 | if (sigismember(&t->blocked, sig)) | 58 | if (sigismember(&t->blocked, sig) || sigismember(&t->real_blocked, sig)) |
59 | return 0; | 59 | return 0; |
60 | 60 | ||
61 | /* Is it explicitly or implicitly ignored? */ | 61 | /* Is it explicitly or implicitly ignored? */ |
@@ -124,7 +124,7 @@ void recalc_sigpending_and_wake(struct task_struct *t) | |||
124 | 124 | ||
125 | void recalc_sigpending(void) | 125 | void recalc_sigpending(void) |
126 | { | 126 | { |
127 | if (!recalc_sigpending_tsk(current)) | 127 | if (!recalc_sigpending_tsk(current) && !freezing(current)) |
128 | clear_thread_flag(TIF_SIGPENDING); | 128 | clear_thread_flag(TIF_SIGPENDING); |
129 | 129 | ||
130 | } | 130 | } |
@@ -456,15 +456,15 @@ void signal_wake_up(struct task_struct *t, int resume) | |||
456 | set_tsk_thread_flag(t, TIF_SIGPENDING); | 456 | set_tsk_thread_flag(t, TIF_SIGPENDING); |
457 | 457 | ||
458 | /* | 458 | /* |
459 | * For SIGKILL, we want to wake it up in the stopped/traced case. | 459 | * For SIGKILL, we want to wake it up in the stopped/traced/killable |
460 | * We don't check t->state here because there is a race with it | 460 | * case. We don't check t->state here because there is a race with it |
461 | * executing another processor and just now entering stopped state. | 461 | * executing another processor and just now entering stopped state. |
462 | * By using wake_up_state, we ensure the process will wake up and | 462 | * By using wake_up_state, we ensure the process will wake up and |
463 | * handle its death signal. | 463 | * handle its death signal. |
464 | */ | 464 | */ |
465 | mask = TASK_INTERRUPTIBLE; | 465 | mask = TASK_INTERRUPTIBLE; |
466 | if (resume) | 466 | if (resume) |
467 | mask |= TASK_STOPPED | TASK_TRACED; | 467 | mask |= TASK_WAKEKILL; |
468 | if (!wake_up_state(t, mask)) | 468 | if (!wake_up_state(t, mask)) |
469 | kick_process(t); | 469 | kick_process(t); |
470 | } | 470 | } |
@@ -620,7 +620,7 @@ static void handle_stop_signal(int sig, struct task_struct *p) | |||
620 | * Wake up the stopped thread _after_ setting | 620 | * Wake up the stopped thread _after_ setting |
621 | * TIF_SIGPENDING | 621 | * TIF_SIGPENDING |
622 | */ | 622 | */ |
623 | state = TASK_STOPPED; | 623 | state = __TASK_STOPPED; |
624 | if (sig_user_defined(t, SIGCONT) && !sigismember(&t->blocked, SIGCONT)) { | 624 | if (sig_user_defined(t, SIGCONT) && !sigismember(&t->blocked, SIGCONT)) { |
625 | set_tsk_thread_flag(t, TIF_SIGPENDING); | 625 | set_tsk_thread_flag(t, TIF_SIGPENDING); |
626 | state |= TASK_INTERRUPTIBLE; | 626 | state |= TASK_INTERRUPTIBLE; |
@@ -732,14 +732,14 @@ static void print_fatal_signal(struct pt_regs *regs, int signr) | |||
732 | printk("%s/%d: potentially unexpected fatal signal %d.\n", | 732 | printk("%s/%d: potentially unexpected fatal signal %d.\n", |
733 | current->comm, task_pid_nr(current), signr); | 733 | current->comm, task_pid_nr(current), signr); |
734 | 734 | ||
735 | #ifdef __i386__ | 735 | #if defined(__i386__) && !defined(__arch_um__) |
736 | printk("code at %08lx: ", regs->eip); | 736 | printk("code at %08lx: ", regs->ip); |
737 | { | 737 | { |
738 | int i; | 738 | int i; |
739 | for (i = 0; i < 16; i++) { | 739 | for (i = 0; i < 16; i++) { |
740 | unsigned char insn; | 740 | unsigned char insn; |
741 | 741 | ||
742 | __get_user(insn, (unsigned char *)(regs->eip + i)); | 742 | __get_user(insn, (unsigned char *)(regs->ip + i)); |
743 | printk("%02x ", insn); | 743 | printk("%02x ", insn); |
744 | } | 744 | } |
745 | } | 745 | } |
@@ -838,7 +838,7 @@ static inline int wants_signal(int sig, struct task_struct *p) | |||
838 | return 0; | 838 | return 0; |
839 | if (sig == SIGKILL) | 839 | if (sig == SIGKILL) |
840 | return 1; | 840 | return 1; |
841 | if (p->state & (TASK_STOPPED | TASK_TRACED)) | 841 | if (task_is_stopped_or_traced(p)) |
842 | return 0; | 842 | return 0; |
843 | return task_curr(p) || !signal_pending(p); | 843 | return task_curr(p) || !signal_pending(p); |
844 | } | 844 | } |
@@ -994,6 +994,12 @@ void zap_other_threads(struct task_struct *p) | |||
994 | } | 994 | } |
995 | } | 995 | } |
996 | 996 | ||
997 | int fastcall __fatal_signal_pending(struct task_struct *tsk) | ||
998 | { | ||
999 | return sigismember(&tsk->pending.signal, SIGKILL); | ||
1000 | } | ||
1001 | EXPORT_SYMBOL(__fatal_signal_pending); | ||
1002 | |||
997 | /* | 1003 | /* |
998 | * Must be called under rcu_read_lock() or with tasklist_lock read-held. | 1004 | * Must be called under rcu_read_lock() or with tasklist_lock read-held. |
999 | */ | 1005 | */ |
@@ -1441,7 +1447,7 @@ void do_notify_parent(struct task_struct *tsk, int sig) | |||
1441 | BUG_ON(sig == -1); | 1447 | BUG_ON(sig == -1); |
1442 | 1448 | ||
1443 | /* do_notify_parent_cldstop should have been called instead. */ | 1449 | /* do_notify_parent_cldstop should have been called instead. */ |
1444 | BUG_ON(tsk->state & (TASK_STOPPED|TASK_TRACED)); | 1450 | BUG_ON(task_is_stopped_or_traced(tsk)); |
1445 | 1451 | ||
1446 | BUG_ON(!tsk->ptrace && | 1452 | BUG_ON(!tsk->ptrace && |
1447 | (tsk->group_leader != tsk || !thread_group_empty(tsk))); | 1453 | (tsk->group_leader != tsk || !thread_group_empty(tsk))); |
@@ -1729,7 +1735,7 @@ static int do_signal_stop(int signr) | |||
1729 | * so this check has no races. | 1735 | * so this check has no races. |
1730 | */ | 1736 | */ |
1731 | if (!t->exit_state && | 1737 | if (!t->exit_state && |
1732 | !(t->state & (TASK_STOPPED|TASK_TRACED))) { | 1738 | !task_is_stopped_or_traced(t)) { |
1733 | stop_count++; | 1739 | stop_count++; |
1734 | signal_wake_up(t, 0); | 1740 | signal_wake_up(t, 0); |
1735 | } | 1741 | } |
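
The new __fatal_signal_pending() helper above reduces to a sigismember() test for SIGKILL on the task's private pending set, exported so sleep primitives can honour killable waits. The sketch below exercises the same primitive with the POSIX sigset_t API in userspace; it is an analogue for illustration, not kernel code.

#include <signal.h>
#include <stdio.h>

/* Mirrors the shape of __fatal_signal_pending(): "is SIGKILL queued?" */
static int fatal_signal_pending_demo(const sigset_t *pending)
{
        return sigismember(pending, SIGKILL);
}

int main(void)
{
        sigset_t pending;

        sigemptyset(&pending);
        printf("before: %d\n", fatal_signal_pending_demo(&pending)); /* 0 */

        sigaddset(&pending, SIGKILL);
        printf("after:  %d\n", fatal_signal_pending_demo(&pending)); /* 1 */
        return 0;
}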
diff --git a/kernel/softirq.c b/kernel/softirq.c index bd89bc4eb0b9..d7837d45419e 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c | |||
@@ -3,7 +3,9 @@ | |||
3 | * | 3 | * |
4 | * Copyright (C) 1992 Linus Torvalds | 4 | * Copyright (C) 1992 Linus Torvalds |
5 | * | 5 | * |
6 | * Rewritten. Old one was good in 2.2, but in 2.3 it was immoral. --ANK (990903) | 6 | * Distribute under GPLv2. |
7 | * | ||
8 | * Rewritten. Old one was good in 2.2, but in 2.3 it was immoral. --ANK (990903) | ||
7 | */ | 9 | */ |
8 | 10 | ||
9 | #include <linux/module.h> | 11 | #include <linux/module.h> |
@@ -278,9 +280,14 @@ asmlinkage void do_softirq(void) | |||
278 | */ | 280 | */ |
279 | void irq_enter(void) | 281 | void irq_enter(void) |
280 | { | 282 | { |
283 | #ifdef CONFIG_NO_HZ | ||
284 | int cpu = smp_processor_id(); | ||
285 | if (idle_cpu(cpu) && !in_interrupt()) | ||
286 | tick_nohz_stop_idle(cpu); | ||
287 | #endif | ||
281 | __irq_enter(); | 288 | __irq_enter(); |
282 | #ifdef CONFIG_NO_HZ | 289 | #ifdef CONFIG_NO_HZ |
283 | if (idle_cpu(smp_processor_id())) | 290 | if (idle_cpu(cpu)) |
284 | tick_nohz_update_jiffies(); | 291 | tick_nohz_update_jiffies(); |
285 | #endif | 292 | #endif |
286 | } | 293 | } |
diff --git a/kernel/softlockup.c b/kernel/softlockup.c index 11df812263c8..7c2da88db4ed 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c | |||
@@ -8,6 +8,7 @@ | |||
8 | */ | 8 | */ |
9 | #include <linux/mm.h> | 9 | #include <linux/mm.h> |
10 | #include <linux/cpu.h> | 10 | #include <linux/cpu.h> |
11 | #include <linux/nmi.h> | ||
11 | #include <linux/init.h> | 12 | #include <linux/init.h> |
12 | #include <linux/delay.h> | 13 | #include <linux/delay.h> |
13 | #include <linux/freezer.h> | 14 | #include <linux/freezer.h> |
@@ -23,8 +24,8 @@ static DEFINE_PER_CPU(unsigned long, touch_timestamp); | |||
23 | static DEFINE_PER_CPU(unsigned long, print_timestamp); | 24 | static DEFINE_PER_CPU(unsigned long, print_timestamp); |
24 | static DEFINE_PER_CPU(struct task_struct *, watchdog_task); | 25 | static DEFINE_PER_CPU(struct task_struct *, watchdog_task); |
25 | 26 | ||
26 | static int did_panic; | 27 | static int __read_mostly did_panic; |
27 | int softlockup_thresh = 10; | 28 | unsigned long __read_mostly softlockup_thresh = 60; |
28 | 29 | ||
29 | static int | 30 | static int |
30 | softlock_panic(struct notifier_block *this, unsigned long event, void *ptr) | 31 | softlock_panic(struct notifier_block *this, unsigned long event, void *ptr) |
@@ -45,7 +46,7 @@ static struct notifier_block panic_block = { | |||
45 | */ | 46 | */ |
46 | static unsigned long get_timestamp(int this_cpu) | 47 | static unsigned long get_timestamp(int this_cpu) |
47 | { | 48 | { |
48 | return cpu_clock(this_cpu) >> 30; /* 2^30 ~= 10^9 */ | 49 | return cpu_clock(this_cpu) >> 30LL; /* 2^30 ~= 10^9 */ |
49 | } | 50 | } |
50 | 51 | ||
51 | void touch_softlockup_watchdog(void) | 52 | void touch_softlockup_watchdog(void) |
@@ -104,7 +105,7 @@ void softlockup_tick(void) | |||
104 | if (now > (touch_timestamp + 1)) | 105 | if (now > (touch_timestamp + 1)) |
105 | wake_up_process(per_cpu(watchdog_task, this_cpu)); | 106 | wake_up_process(per_cpu(watchdog_task, this_cpu)); |
106 | 107 | ||
107 | /* Warn about unreasonable 10+ seconds delays: */ | 108 | /* Warn about unreasonable delays: */ |
108 | if (now <= (touch_timestamp + softlockup_thresh)) | 109 | if (now <= (touch_timestamp + softlockup_thresh)) |
109 | return; | 110 | return; |
110 | 111 | ||
@@ -122,11 +123,93 @@ void softlockup_tick(void) | |||
122 | } | 123 | } |
123 | 124 | ||
124 | /* | 125 | /* |
126 | * Have a reasonable limit on the number of tasks checked: | ||
127 | */ | ||
128 | unsigned long __read_mostly sysctl_hung_task_check_count = 1024; | ||
129 | |||
130 | /* | ||
131 | * Zero means infinite timeout - no checking done: | ||
132 | */ | ||
133 | unsigned long __read_mostly sysctl_hung_task_timeout_secs = 120; | ||
134 | |||
135 | unsigned long __read_mostly sysctl_hung_task_warnings = 10; | ||
136 | |||
137 | /* | ||
138 | * Only do the hung-tasks check on one CPU: | ||
139 | */ | ||
140 | static int check_cpu __read_mostly = -1; | ||
141 | |||
142 | static void check_hung_task(struct task_struct *t, unsigned long now) | ||
143 | { | ||
144 | unsigned long switch_count = t->nvcsw + t->nivcsw; | ||
145 | |||
146 | if (t->flags & PF_FROZEN) | ||
147 | return; | ||
148 | |||
149 | if (switch_count != t->last_switch_count || !t->last_switch_timestamp) { | ||
150 | t->last_switch_count = switch_count; | ||
151 | t->last_switch_timestamp = now; | ||
152 | return; | ||
153 | } | ||
154 | if ((long)(now - t->last_switch_timestamp) < | ||
155 | sysctl_hung_task_timeout_secs) | ||
156 | return; | ||
157 | if (sysctl_hung_task_warnings < 0) | ||
158 | return; | ||
159 | sysctl_hung_task_warnings--; | ||
160 | |||
161 | /* | ||
162 | * Ok, the task did not get scheduled for more than 2 minutes, | ||
163 | * complain: | ||
164 | */ | ||
165 | printk(KERN_ERR "INFO: task %s:%d blocked for more than " | ||
166 | "%ld seconds.\n", t->comm, t->pid, | ||
167 | sysctl_hung_task_timeout_secs); | ||
168 | printk(KERN_ERR "\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\"" | ||
169 | " disables this message.\n"); | ||
170 | sched_show_task(t); | ||
171 | __debug_show_held_locks(t); | ||
172 | |||
173 | t->last_switch_timestamp = now; | ||
174 | touch_nmi_watchdog(); | ||
175 | } | ||
176 | |||
177 | /* | ||
178 | * Check whether a TASK_UNINTERRUPTIBLE task has not been woken up for | ||
179 | * a really long time (120 seconds by default). If that happens, print out | ||
180 | * a warning. | ||
181 | */ | ||
182 | static void check_hung_uninterruptible_tasks(int this_cpu) | ||
183 | { | ||
184 | int max_count = sysctl_hung_task_check_count; | ||
185 | unsigned long now = get_timestamp(this_cpu); | ||
186 | struct task_struct *g, *t; | ||
187 | |||
188 | /* | ||
189 | * If the system crashed already then all bets are off, | ||
190 | * do not report extra hung tasks: | ||
191 | */ | ||
192 | if ((tainted & TAINT_DIE) || did_panic) | ||
193 | return; | ||
194 | |||
195 | read_lock(&tasklist_lock); | ||
196 | do_each_thread(g, t) { | ||
197 | if (!--max_count) | ||
198 | goto unlock; | ||
199 | if (t->state & TASK_UNINTERRUPTIBLE) | ||
200 | check_hung_task(t, now); | ||
201 | } while_each_thread(g, t); | ||
202 | unlock: | ||
203 | read_unlock(&tasklist_lock); | ||
204 | } | ||
205 | |||
206 | /* | ||
125 | * The watchdog thread - runs every second and touches the timestamp. | 207 | * The watchdog thread - runs every second and touches the timestamp. |
126 | */ | 208 | */ |
127 | static int watchdog(void *__bind_cpu) | 209 | static int watchdog(void *__bind_cpu) |
128 | { | 210 | { |
129 | struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; | 211 | struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; |
212 | int this_cpu = (long)__bind_cpu; | ||
130 | 213 | ||
131 | sched_setscheduler(current, SCHED_FIFO, ¶m); | 214 | sched_setscheduler(current, SCHED_FIFO, ¶m); |
132 | 215 | ||
@@ -135,13 +218,23 @@ static int watchdog(void *__bind_cpu) | |||
135 | 218 | ||
136 | /* | 219 | /* |
137 | * Run briefly once per second to reset the softlockup timestamp. | 220 | * Run briefly once per second to reset the softlockup timestamp. |
138 | * If this gets delayed for more than 10 seconds then the | 221 | * If this gets delayed for more than 60 seconds then the |
139 | * debug-printout triggers in softlockup_tick(). | 222 | * debug-printout triggers in softlockup_tick(). |
140 | */ | 223 | */ |
141 | while (!kthread_should_stop()) { | 224 | while (!kthread_should_stop()) { |
142 | set_current_state(TASK_INTERRUPTIBLE); | 225 | set_current_state(TASK_INTERRUPTIBLE); |
143 | touch_softlockup_watchdog(); | 226 | touch_softlockup_watchdog(); |
144 | schedule(); | 227 | schedule(); |
228 | |||
229 | if (kthread_should_stop()) | ||
230 | break; | ||
231 | |||
232 | if (this_cpu != check_cpu) | ||
233 | continue; | ||
234 | |||
235 | if (sysctl_hung_task_timeout_secs) | ||
236 | check_hung_uninterruptible_tasks(this_cpu); | ||
237 | |||
145 | } | 238 | } |
146 | 239 | ||
147 | return 0; | 240 | return 0; |
@@ -171,9 +264,20 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
171 | break; | 264 | break; |
172 | case CPU_ONLINE: | 265 | case CPU_ONLINE: |
173 | case CPU_ONLINE_FROZEN: | 266 | case CPU_ONLINE_FROZEN: |
267 | check_cpu = any_online_cpu(cpu_online_map); | ||
174 | wake_up_process(per_cpu(watchdog_task, hotcpu)); | 268 | wake_up_process(per_cpu(watchdog_task, hotcpu)); |
175 | break; | 269 | break; |
176 | #ifdef CONFIG_HOTPLUG_CPU | 270 | #ifdef CONFIG_HOTPLUG_CPU |
271 | case CPU_DOWN_PREPARE: | ||
272 | case CPU_DOWN_PREPARE_FROZEN: | ||
273 | if (hotcpu == check_cpu) { | ||
274 | cpumask_t temp_cpu_online_map = cpu_online_map; | ||
275 | |||
276 | cpu_clear(hotcpu, temp_cpu_online_map); | ||
277 | check_cpu = any_online_cpu(temp_cpu_online_map); | ||
278 | } | ||
279 | break; | ||
280 | |||
177 | case CPU_UP_CANCELED: | 281 | case CPU_UP_CANCELED: |
178 | case CPU_UP_CANCELED_FROZEN: | 282 | case CPU_UP_CANCELED_FROZEN: |
179 | if (!per_cpu(watchdog_task, hotcpu)) | 283 | if (!per_cpu(watchdog_task, hotcpu)) |
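
The hung-task detector added above keys off the sum of voluntary and involuntary context switches: if that count does not move for sysctl_hung_task_timeout_secs while a task stays in TASK_UNINTERRUPTIBLE, a warning is printed. The compact userspace model below reproduces that state machine; field names follow the patch, timestamps are plain seconds, and the PF_FROZEN and warning-budget checks are omitted for brevity.

#include <stdio.h>

struct demo_task {
        unsigned long nvcsw, nivcsw;            /* context-switch counters */
        unsigned long last_switch_count;
        unsigned long last_switch_timestamp;    /* seconds */
};

static unsigned long hung_task_timeout_secs = 120;

/* Returns 1 if the task would be reported as hung at time 'now'. */
static int demo_check_hung_task(struct demo_task *t, unsigned long now)
{
        unsigned long switch_count = t->nvcsw + t->nivcsw;

        if (switch_count != t->last_switch_count || !t->last_switch_timestamp) {
                /* task made progress (or is new): rearm the timestamp */
                t->last_switch_count = switch_count;
                t->last_switch_timestamp = now;
                return 0;
        }
        return (long)(now - t->last_switch_timestamp) >=
               (long)hung_task_timeout_secs;
}

int main(void)
{
        struct demo_task t = { .nvcsw = 10, .nivcsw = 2 };

        demo_check_hung_task(&t, 100);                 /* arms the timestamp  */
        printf("%d\n", demo_check_hung_task(&t, 150)); /* 0: only 50s stuck   */
        printf("%d\n", demo_check_hung_task(&t, 230)); /* 1: 130s, no switch  */
        return 0;
}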
diff --git a/kernel/spinlock.c b/kernel/spinlock.c index cd72424c2662..ae28c8245123 100644 --- a/kernel/spinlock.c +++ b/kernel/spinlock.c | |||
@@ -65,8 +65,7 @@ EXPORT_SYMBOL(_write_trylock); | |||
65 | * even on CONFIG_PREEMPT, because lockdep assumes that interrupts are | 65 | * even on CONFIG_PREEMPT, because lockdep assumes that interrupts are |
66 | * not re-enabled during lock-acquire (which the preempt-spin-ops do): | 66 | * not re-enabled during lock-acquire (which the preempt-spin-ops do): |
67 | */ | 67 | */ |
68 | #if !defined(CONFIG_PREEMPT) || !defined(CONFIG_SMP) || \ | 68 | #if !defined(CONFIG_GENERIC_LOCKBREAK) || defined(CONFIG_DEBUG_LOCK_ALLOC) |
69 | defined(CONFIG_DEBUG_LOCK_ALLOC) | ||
70 | 69 | ||
71 | void __lockfunc _read_lock(rwlock_t *lock) | 70 | void __lockfunc _read_lock(rwlock_t *lock) |
72 | { | 71 | { |
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 319821ef78af..51b5ee53571a 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c | |||
@@ -203,13 +203,13 @@ int stop_machine_run(int (*fn)(void *), void *data, unsigned int cpu) | |||
203 | int ret; | 203 | int ret; |
204 | 204 | ||
205 | /* No CPUs can come up or down during this. */ | 205 | /* No CPUs can come up or down during this. */ |
206 | lock_cpu_hotplug(); | 206 | get_online_cpus(); |
207 | p = __stop_machine_run(fn, data, cpu); | 207 | p = __stop_machine_run(fn, data, cpu); |
208 | if (!IS_ERR(p)) | 208 | if (!IS_ERR(p)) |
209 | ret = kthread_stop(p); | 209 | ret = kthread_stop(p); |
210 | else | 210 | else |
211 | ret = PTR_ERR(p); | 211 | ret = PTR_ERR(p); |
212 | unlock_cpu_hotplug(); | 212 | put_online_cpus(); |
213 | 213 | ||
214 | return ret; | 214 | return ret; |
215 | } | 215 | } |
diff --git a/kernel/sys.c b/kernel/sys.c index 304b5410d746..d1fe71eb4546 100644 --- a/kernel/sys.c +++ b/kernel/sys.c | |||
@@ -1750,7 +1750,7 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3, | |||
1750 | } | 1750 | } |
1751 | 1751 | ||
1752 | asmlinkage long sys_getcpu(unsigned __user *cpup, unsigned __user *nodep, | 1752 | asmlinkage long sys_getcpu(unsigned __user *cpup, unsigned __user *nodep, |
1753 | struct getcpu_cache __user *cache) | 1753 | struct getcpu_cache __user *unused) |
1754 | { | 1754 | { |
1755 | int err = 0; | 1755 | int err = 0; |
1756 | int cpu = raw_smp_processor_id(); | 1756 | int cpu = raw_smp_processor_id(); |
@@ -1758,24 +1758,6 @@ asmlinkage long sys_getcpu(unsigned __user *cpup, unsigned __user *nodep, | |||
1758 | err |= put_user(cpu, cpup); | 1758 | err |= put_user(cpu, cpup); |
1759 | if (nodep) | 1759 | if (nodep) |
1760 | err |= put_user(cpu_to_node(cpu), nodep); | 1760 | err |= put_user(cpu_to_node(cpu), nodep); |
1761 | if (cache) { | ||
1762 | /* | ||
1763 | * The cache is not needed for this implementation, | ||
1764 | * but make sure user programs pass something | ||
1765 | * valid. vsyscall implementations can instead make | ||
1766 | * good use of the cache. Only use t0 and t1 because | ||
1767 | * these are available in both 32bit and 64bit ABI (no | ||
1768 | * need for a compat_getcpu). 32bit has enough | ||
1769 | * padding | ||
1770 | */ | ||
1771 | unsigned long t0, t1; | ||
1772 | get_user(t0, &cache->blob[0]); | ||
1773 | get_user(t1, &cache->blob[1]); | ||
1774 | t0++; | ||
1775 | t1++; | ||
1776 | put_user(t0, &cache->blob[0]); | ||
1777 | put_user(t1, &cache->blob[1]); | ||
1778 | } | ||
1779 | return err ? -EFAULT : 0; | 1761 | return err ? -EFAULT : 0; |
1780 | } | 1762 | } |
1781 | 1763 | ||
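
With the hunk above, sys_getcpu() ignores its getcpu_cache argument entirely, so callers may simply pass NULL. A small userspace sketch of a raw getcpu call follows; SYS_getcpu availability depends on the architecture and libc headers, so treat this as an assumption rather than a guaranteed interface.

#define _GNU_SOURCE
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>

int main(void)
{
        unsigned int cpu = 0, node = 0;

        /* Third argument is the (now unused) getcpu_cache pointer. */
        if (syscall(SYS_getcpu, &cpu, &node, NULL) == 0)
                printf("cpu=%u node=%u\n", cpu, node);
        else
                perror("getcpu");
        return 0;
}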
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 52c7a151e298..beee5b3b68a2 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c | |||
@@ -40,10 +40,14 @@ cond_syscall(sys_recvfrom); | |||
40 | cond_syscall(sys_recv); | 40 | cond_syscall(sys_recv); |
41 | cond_syscall(sys_socket); | 41 | cond_syscall(sys_socket); |
42 | cond_syscall(sys_setsockopt); | 42 | cond_syscall(sys_setsockopt); |
43 | cond_syscall(compat_sys_setsockopt); | ||
43 | cond_syscall(sys_getsockopt); | 44 | cond_syscall(sys_getsockopt); |
45 | cond_syscall(compat_sys_getsockopt); | ||
44 | cond_syscall(sys_shutdown); | 46 | cond_syscall(sys_shutdown); |
45 | cond_syscall(sys_sendmsg); | 47 | cond_syscall(sys_sendmsg); |
48 | cond_syscall(compat_sys_sendmsg); | ||
46 | cond_syscall(sys_recvmsg); | 49 | cond_syscall(sys_recvmsg); |
50 | cond_syscall(compat_sys_recvmsg); | ||
47 | cond_syscall(sys_socketcall); | 51 | cond_syscall(sys_socketcall); |
48 | cond_syscall(sys_futex); | 52 | cond_syscall(sys_futex); |
49 | cond_syscall(compat_sys_futex); | 53 | cond_syscall(compat_sys_futex); |
@@ -127,6 +131,7 @@ cond_syscall(sys32_sysctl); | |||
127 | cond_syscall(ppc_rtas); | 131 | cond_syscall(ppc_rtas); |
128 | cond_syscall(sys_spu_run); | 132 | cond_syscall(sys_spu_run); |
129 | cond_syscall(sys_spu_create); | 133 | cond_syscall(sys_spu_create); |
134 | cond_syscall(sys_subpage_prot); | ||
130 | 135 | ||
131 | /* mmu depending weak syscall entries */ | 136 | /* mmu depending weak syscall entries */ |
132 | cond_syscall(sys_mprotect); | 137 | cond_syscall(sys_mprotect); |
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 3b4efbe26445..7cb1ac3e6fff 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
@@ -53,6 +53,7 @@ | |||
53 | #ifdef CONFIG_X86 | 53 | #ifdef CONFIG_X86 |
54 | #include <asm/nmi.h> | 54 | #include <asm/nmi.h> |
55 | #include <asm/stacktrace.h> | 55 | #include <asm/stacktrace.h> |
56 | #include <asm/io.h> | ||
56 | #endif | 57 | #endif |
57 | 58 | ||
58 | static int deprecated_sysctl_warning(struct __sysctl_args *args); | 59 | static int deprecated_sysctl_warning(struct __sysctl_args *args); |
@@ -80,7 +81,7 @@ extern int percpu_pagelist_fraction; | |||
80 | extern int compat_log; | 81 | extern int compat_log; |
81 | extern int maps_protect; | 82 | extern int maps_protect; |
82 | extern int sysctl_stat_interval; | 83 | extern int sysctl_stat_interval; |
83 | extern int audit_argv_kb; | 84 | extern int latencytop_enabled; |
84 | 85 | ||
85 | /* Constants used for minimum and maximum */ | 86 | /* Constants used for minimum and maximum */ |
86 | #ifdef CONFIG_DETECT_SOFTLOCKUP | 87 | #ifdef CONFIG_DETECT_SOFTLOCKUP |
@@ -156,8 +157,16 @@ static int proc_dointvec_taint(struct ctl_table *table, int write, struct file * | |||
156 | #endif | 157 | #endif |
157 | 158 | ||
158 | static struct ctl_table root_table[]; | 159 | static struct ctl_table root_table[]; |
159 | static struct ctl_table_header root_table_header = | 160 | static struct ctl_table_root sysctl_table_root; |
160 | { root_table, LIST_HEAD_INIT(root_table_header.ctl_entry) }; | 161 | static struct ctl_table_header root_table_header = { |
162 | .ctl_table = root_table, | ||
163 | .ctl_entry = LIST_HEAD_INIT(sysctl_table_root.header_list), | ||
164 | .root = &sysctl_table_root, | ||
165 | }; | ||
166 | static struct ctl_table_root sysctl_table_root = { | ||
167 | .root_list = LIST_HEAD_INIT(sysctl_table_root.root_list), | ||
168 | .header_list = LIST_HEAD_INIT(root_table_header.ctl_entry), | ||
169 | }; | ||
161 | 170 | ||
162 | static struct ctl_table kern_table[]; | 171 | static struct ctl_table kern_table[]; |
163 | static struct ctl_table vm_table[]; | 172 | static struct ctl_table vm_table[]; |
@@ -191,14 +200,6 @@ static struct ctl_table root_table[] = { | |||
191 | .mode = 0555, | 200 | .mode = 0555, |
192 | .child = vm_table, | 201 | .child = vm_table, |
193 | }, | 202 | }, |
194 | #ifdef CONFIG_NET | ||
195 | { | ||
196 | .ctl_name = CTL_NET, | ||
197 | .procname = "net", | ||
198 | .mode = 0555, | ||
199 | .child = net_table, | ||
200 | }, | ||
201 | #endif | ||
202 | { | 203 | { |
203 | .ctl_name = CTL_FS, | 204 | .ctl_name = CTL_FS, |
204 | .procname = "fs", | 205 | .procname = "fs", |
@@ -225,21 +226,24 @@ static struct ctl_table root_table[] = { | |||
225 | }; | 226 | }; |
226 | 227 | ||
227 | #ifdef CONFIG_SCHED_DEBUG | 228 | #ifdef CONFIG_SCHED_DEBUG |
228 | static unsigned long min_sched_granularity_ns = 100000; /* 100 usecs */ | 229 | static int min_sched_granularity_ns = 100000; /* 100 usecs */ |
229 | static unsigned long max_sched_granularity_ns = 1000000000; /* 1 second */ | 230 | static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */ |
230 | static unsigned long min_wakeup_granularity_ns; /* 0 usecs */ | 231 | static int min_wakeup_granularity_ns; /* 0 usecs */ |
231 | static unsigned long max_wakeup_granularity_ns = 1000000000; /* 1 second */ | 232 | static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */ |
232 | #endif | 233 | #endif |
233 | 234 | ||
234 | static struct ctl_table kern_table[] = { | 235 | static struct ctl_table kern_table[] = { |
235 | #ifdef CONFIG_SCHED_DEBUG | 236 | #ifdef CONFIG_SCHED_DEBUG |
236 | { | 237 | { |
237 | .ctl_name = CTL_UNNUMBERED, | 238 | .ctl_name = CTL_UNNUMBERED, |
238 | .procname = "sched_nr_latency", | 239 | .procname = "sched_min_granularity_ns", |
239 | .data = &sysctl_sched_nr_latency, | 240 | .data = &sysctl_sched_min_granularity, |
240 | .maxlen = sizeof(unsigned int), | 241 | .maxlen = sizeof(unsigned int), |
241 | .mode = 0644, | 242 | .mode = 0644, |
242 | .proc_handler = &proc_dointvec, | 243 | .proc_handler = &sched_nr_latency_handler, |
244 | .strategy = &sysctl_intvec, | ||
245 | .extra1 = &min_sched_granularity_ns, | ||
246 | .extra2 = &max_sched_granularity_ns, | ||
243 | }, | 247 | }, |
244 | { | 248 | { |
245 | .ctl_name = CTL_UNNUMBERED, | 249 | .ctl_name = CTL_UNNUMBERED, |
@@ -247,7 +251,7 @@ static struct ctl_table kern_table[] = { | |||
247 | .data = &sysctl_sched_latency, | 251 | .data = &sysctl_sched_latency, |
248 | .maxlen = sizeof(unsigned int), | 252 | .maxlen = sizeof(unsigned int), |
249 | .mode = 0644, | 253 | .mode = 0644, |
250 | .proc_handler = &proc_dointvec_minmax, | 254 | .proc_handler = &sched_nr_latency_handler, |
251 | .strategy = &sysctl_intvec, | 255 | .strategy = &sysctl_intvec, |
252 | .extra1 = &min_sched_granularity_ns, | 256 | .extra1 = &min_sched_granularity_ns, |
253 | .extra2 = &max_sched_granularity_ns, | 257 | .extra2 = &max_sched_granularity_ns, |
@@ -298,6 +302,48 @@ static struct ctl_table kern_table[] = { | |||
298 | .mode = 0644, | 302 | .mode = 0644, |
299 | .proc_handler = &proc_dointvec, | 303 | .proc_handler = &proc_dointvec, |
300 | }, | 304 | }, |
305 | { | ||
306 | .ctl_name = CTL_UNNUMBERED, | ||
307 | .procname = "sched_nr_migrate", | ||
308 | .data = &sysctl_sched_nr_migrate, | ||
309 | .maxlen = sizeof(unsigned int), | ||
310 | .mode = 0644, | ||
311 | .proc_handler = &proc_dointvec, | ||
312 | }, | ||
313 | { | ||
314 | .ctl_name = CTL_UNNUMBERED, | ||
315 | .procname = "sched_rt_period_ms", | ||
316 | .data = &sysctl_sched_rt_period, | ||
317 | .maxlen = sizeof(unsigned int), | ||
318 | .mode = 0644, | ||
319 | .proc_handler = &proc_dointvec, | ||
320 | }, | ||
321 | { | ||
322 | .ctl_name = CTL_UNNUMBERED, | ||
323 | .procname = "sched_rt_ratio", | ||
324 | .data = &sysctl_sched_rt_ratio, | ||
325 | .maxlen = sizeof(unsigned int), | ||
326 | .mode = 0644, | ||
327 | .proc_handler = &proc_dointvec, | ||
328 | }, | ||
329 | #if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP) | ||
330 | { | ||
331 | .ctl_name = CTL_UNNUMBERED, | ||
332 | .procname = "sched_min_bal_int_shares", | ||
333 | .data = &sysctl_sched_min_bal_int_shares, | ||
334 | .maxlen = sizeof(unsigned int), | ||
335 | .mode = 0644, | ||
336 | .proc_handler = &proc_dointvec, | ||
337 | }, | ||
338 | { | ||
339 | .ctl_name = CTL_UNNUMBERED, | ||
340 | .procname = "sched_max_bal_int_shares", | ||
341 | .data = &sysctl_sched_max_bal_int_shares, | ||
342 | .maxlen = sizeof(unsigned int), | ||
343 | .mode = 0644, | ||
344 | .proc_handler = &proc_dointvec, | ||
345 | }, | ||
346 | #endif | ||
301 | #endif | 347 | #endif |
302 | { | 348 | { |
303 | .ctl_name = CTL_UNNUMBERED, | 349 | .ctl_name = CTL_UNNUMBERED, |
@@ -343,16 +389,6 @@ static struct ctl_table kern_table[] = { | |||
343 | .mode = 0644, | 389 | .mode = 0644, |
344 | .proc_handler = &proc_dointvec, | 390 | .proc_handler = &proc_dointvec, |
345 | }, | 391 | }, |
346 | #ifdef CONFIG_AUDITSYSCALL | ||
347 | { | ||
348 | .ctl_name = CTL_UNNUMBERED, | ||
349 | .procname = "audit_argv_kb", | ||
350 | .data = &audit_argv_kb, | ||
351 | .maxlen = sizeof(int), | ||
352 | .mode = 0644, | ||
353 | .proc_handler = &proc_dointvec, | ||
354 | }, | ||
355 | #endif | ||
356 | { | 392 | { |
357 | .ctl_name = KERN_CORE_PATTERN, | 393 | .ctl_name = KERN_CORE_PATTERN, |
358 | .procname = "core_pattern", | 394 | .procname = "core_pattern", |
@@ -371,6 +407,15 @@ static struct ctl_table kern_table[] = { | |||
371 | .proc_handler = &proc_dointvec_taint, | 407 | .proc_handler = &proc_dointvec_taint, |
372 | }, | 408 | }, |
373 | #endif | 409 | #endif |
410 | #ifdef CONFIG_LATENCYTOP | ||
411 | { | ||
412 | .procname = "latencytop", | ||
413 | .data = &latencytop_enabled, | ||
414 | .maxlen = sizeof(int), | ||
415 | .mode = 0644, | ||
416 | .proc_handler = &proc_dointvec, | ||
417 | }, | ||
418 | #endif | ||
374 | #ifdef CONFIG_SECURITY_CAPABILITIES | 419 | #ifdef CONFIG_SECURITY_CAPABILITIES |
375 | { | 420 | { |
376 | .procname = "cap-bound", | 421 | .procname = "cap-bound", |
@@ -672,6 +717,14 @@ static struct ctl_table kern_table[] = { | |||
672 | .mode = 0644, | 717 | .mode = 0644, |
673 | .proc_handler = &proc_dointvec, | 718 | .proc_handler = &proc_dointvec, |
674 | }, | 719 | }, |
720 | { | ||
721 | .ctl_name = CTL_UNNUMBERED, | ||
722 | .procname = "io_delay_type", | ||
723 | .data = &io_delay_type, | ||
724 | .maxlen = sizeof(int), | ||
725 | .mode = 0644, | ||
726 | .proc_handler = &proc_dointvec, | ||
727 | }, | ||
675 | #endif | 728 | #endif |
676 | #if defined(CONFIG_MMU) | 729 | #if defined(CONFIG_MMU) |
677 | { | 730 | { |
@@ -717,13 +770,40 @@ static struct ctl_table kern_table[] = { | |||
717 | .ctl_name = CTL_UNNUMBERED, | 770 | .ctl_name = CTL_UNNUMBERED, |
718 | .procname = "softlockup_thresh", | 771 | .procname = "softlockup_thresh", |
719 | .data = &softlockup_thresh, | 772 | .data = &softlockup_thresh, |
720 | .maxlen = sizeof(int), | 773 | .maxlen = sizeof(unsigned long), |
721 | .mode = 0644, | 774 | .mode = 0644, |
722 | .proc_handler = &proc_dointvec_minmax, | 775 | .proc_handler = &proc_doulongvec_minmax, |
723 | .strategy = &sysctl_intvec, | 776 | .strategy = &sysctl_intvec, |
724 | .extra1 = &one, | 777 | .extra1 = &one, |
725 | .extra2 = &sixty, | 778 | .extra2 = &sixty, |
726 | }, | 779 | }, |
780 | { | ||
781 | .ctl_name = CTL_UNNUMBERED, | ||
782 | .procname = "hung_task_check_count", | ||
783 | .data = &sysctl_hung_task_check_count, | ||
784 | .maxlen = sizeof(unsigned long), | ||
785 | .mode = 0644, | ||
786 | .proc_handler = &proc_doulongvec_minmax, | ||
787 | .strategy = &sysctl_intvec, | ||
788 | }, | ||
789 | { | ||
790 | .ctl_name = CTL_UNNUMBERED, | ||
791 | .procname = "hung_task_timeout_secs", | ||
792 | .data = &sysctl_hung_task_timeout_secs, | ||
793 | .maxlen = sizeof(unsigned long), | ||
794 | .mode = 0644, | ||
795 | .proc_handler = &proc_doulongvec_minmax, | ||
796 | .strategy = &sysctl_intvec, | ||
797 | }, | ||
798 | { | ||
799 | .ctl_name = CTL_UNNUMBERED, | ||
800 | .procname = "hung_task_warnings", | ||
801 | .data = &sysctl_hung_task_warnings, | ||
802 | .maxlen = sizeof(unsigned long), | ||
803 | .mode = 0644, | ||
804 | .proc_handler = &proc_doulongvec_minmax, | ||
805 | .strategy = &sysctl_intvec, | ||
806 | }, | ||
727 | #endif | 807 | #endif |
728 | #ifdef CONFIG_COMPAT | 808 | #ifdef CONFIG_COMPAT |
729 | { | 809 | { |
@@ -895,11 +975,11 @@ static struct ctl_table vm_table[] = { | |||
895 | }, | 975 | }, |
896 | { | 976 | { |
897 | .ctl_name = CTL_UNNUMBERED, | 977 | .ctl_name = CTL_UNNUMBERED, |
898 | .procname = "hugetlb_dynamic_pool", | 978 | .procname = "nr_overcommit_hugepages", |
899 | .data = &hugetlb_dynamic_pool, | 979 | .data = &nr_overcommit_huge_pages, |
900 | .maxlen = sizeof(hugetlb_dynamic_pool), | 980 | .maxlen = sizeof(nr_overcommit_huge_pages), |
901 | .mode = 0644, | 981 | .mode = 0644, |
902 | .proc_handler = &proc_dointvec, | 982 | .proc_handler = &proc_doulongvec_minmax, |
903 | }, | 983 | }, |
904 | #endif | 984 | #endif |
905 | { | 985 | { |
@@ -1289,12 +1369,27 @@ void sysctl_head_finish(struct ctl_table_header *head) | |||
1289 | spin_unlock(&sysctl_lock); | 1369 | spin_unlock(&sysctl_lock); |
1290 | } | 1370 | } |
1291 | 1371 | ||
1292 | struct ctl_table_header *sysctl_head_next(struct ctl_table_header *prev) | 1372 | static struct list_head * |
1373 | lookup_header_list(struct ctl_table_root *root, struct nsproxy *namespaces) | ||
1374 | { | ||
1375 | struct list_head *header_list; | ||
1376 | header_list = &root->header_list; | ||
1377 | if (root->lookup) | ||
1378 | header_list = root->lookup(root, namespaces); | ||
1379 | return header_list; | ||
1380 | } | ||
1381 | |||
1382 | struct ctl_table_header *__sysctl_head_next(struct nsproxy *namespaces, | ||
1383 | struct ctl_table_header *prev) | ||
1293 | { | 1384 | { |
1385 | struct ctl_table_root *root; | ||
1386 | struct list_head *header_list; | ||
1294 | struct ctl_table_header *head; | 1387 | struct ctl_table_header *head; |
1295 | struct list_head *tmp; | 1388 | struct list_head *tmp; |
1389 | |||
1296 | spin_lock(&sysctl_lock); | 1390 | spin_lock(&sysctl_lock); |
1297 | if (prev) { | 1391 | if (prev) { |
1392 | head = prev; | ||
1298 | tmp = &prev->ctl_entry; | 1393 | tmp = &prev->ctl_entry; |
1299 | unuse_table(prev); | 1394 | unuse_table(prev); |
1300 | goto next; | 1395 | goto next; |
@@ -1308,14 +1403,38 @@ struct ctl_table_header *sysctl_head_next(struct ctl_table_header *prev) | |||
1308 | spin_unlock(&sysctl_lock); | 1403 | spin_unlock(&sysctl_lock); |
1309 | return head; | 1404 | return head; |
1310 | next: | 1405 | next: |
1406 | root = head->root; | ||
1311 | tmp = tmp->next; | 1407 | tmp = tmp->next; |
1312 | if (tmp == &root_table_header.ctl_entry) | 1408 | header_list = lookup_header_list(root, namespaces); |
1313 | break; | 1409 | if (tmp != header_list) |
1410 | continue; | ||
1411 | |||
1412 | do { | ||
1413 | root = list_entry(root->root_list.next, | ||
1414 | struct ctl_table_root, root_list); | ||
1415 | if (root == &sysctl_table_root) | ||
1416 | goto out; | ||
1417 | header_list = lookup_header_list(root, namespaces); | ||
1418 | } while (list_empty(header_list)); | ||
1419 | tmp = header_list->next; | ||
1314 | } | 1420 | } |
1421 | out: | ||
1315 | spin_unlock(&sysctl_lock); | 1422 | spin_unlock(&sysctl_lock); |
1316 | return NULL; | 1423 | return NULL; |
1317 | } | 1424 | } |
1318 | 1425 | ||
1426 | struct ctl_table_header *sysctl_head_next(struct ctl_table_header *prev) | ||
1427 | { | ||
1428 | return __sysctl_head_next(current->nsproxy, prev); | ||
1429 | } | ||
1430 | |||
1431 | void register_sysctl_root(struct ctl_table_root *root) | ||
1432 | { | ||
1433 | spin_lock(&sysctl_lock); | ||
1434 | list_add_tail(&root->root_list, &sysctl_table_root.root_list); | ||
1435 | spin_unlock(&sysctl_lock); | ||
1436 | } | ||
1437 | |||
1319 | #ifdef CONFIG_SYSCTL_SYSCALL | 1438 | #ifdef CONFIG_SYSCTL_SYSCALL |
1320 | int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp, | 1439 | int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp, |
1321 | void __user *newval, size_t newlen) | 1440 | void __user *newval, size_t newlen) |
@@ -1472,18 +1591,21 @@ static __init int sysctl_init(void) | |||
1472 | { | 1591 | { |
1473 | int err; | 1592 | int err; |
1474 | sysctl_set_parent(NULL, root_table); | 1593 | sysctl_set_parent(NULL, root_table); |
1475 | err = sysctl_check_table(root_table); | 1594 | err = sysctl_check_table(current->nsproxy, root_table); |
1476 | return 0; | 1595 | return 0; |
1477 | } | 1596 | } |
1478 | 1597 | ||
1479 | core_initcall(sysctl_init); | 1598 | core_initcall(sysctl_init); |
1480 | 1599 | ||
1481 | /** | 1600 | /** |
1482 | * register_sysctl_table - register a sysctl hierarchy | 1601 | * __register_sysctl_paths - register a sysctl hierarchy |
1602 | * @root: List of sysctl headers to register on | ||
1603 | * @namespaces: Data to compute which lists of sysctl entries are visible | ||
1604 | * @path: The path to the directory the sysctl table is in. | ||
1483 | * @table: the top-level table structure | 1605 | * @table: the top-level table structure |
1484 | * | 1606 | * |
1485 | * Register a sysctl table hierarchy. @table should be a filled in ctl_table | 1607 | * Register a sysctl table hierarchy. @table should be a filled in ctl_table |
1486 | * array. An entry with a ctl_name of 0 terminates the table. | 1608 | * array. A completely 0 filled entry terminates the table. |
1487 | * | 1609 | * |
1488 | * The members of the &struct ctl_table structure are used as follows: | 1610 | * The members of the &struct ctl_table structure are used as follows: |
1489 | * | 1611 | * |
@@ -1546,25 +1668,99 @@ core_initcall(sysctl_init); | |||
1546 | * This routine returns %NULL on a failure to register, and a pointer | 1668 | * This routine returns %NULL on a failure to register, and a pointer |
1547 | * to the table header on success. | 1669 | * to the table header on success. |
1548 | */ | 1670 | */ |
1549 | struct ctl_table_header *register_sysctl_table(struct ctl_table * table) | 1671 | struct ctl_table_header *__register_sysctl_paths( |
1672 | struct ctl_table_root *root, | ||
1673 | struct nsproxy *namespaces, | ||
1674 | const struct ctl_path *path, struct ctl_table *table) | ||
1550 | { | 1675 | { |
1551 | struct ctl_table_header *tmp; | 1676 | struct list_head *header_list; |
1552 | tmp = kmalloc(sizeof(struct ctl_table_header), GFP_KERNEL); | 1677 | struct ctl_table_header *header; |
1553 | if (!tmp) | 1678 | struct ctl_table *new, **prevp; |
1679 | unsigned int n, npath; | ||
1680 | |||
1681 | /* Count the path components */ | ||
1682 | for (npath = 0; path[npath].ctl_name || path[npath].procname; ++npath) | ||
1683 | ; | ||
1684 | |||
1685 | /* | ||
1686 | * For each path component, allocate a 2-element ctl_table array. | ||
1687 | * The first array element will be filled with the sysctl entry | ||
1688 | * for this path component; the second will be the sentinel (ctl_name == 0). | ||
1689 | * | ||
1690 | * We allocate everything in one go so that we don't have to | ||
1691 | * worry about freeing additional memory in unregister_sysctl_table. | ||
1692 | */ | ||
1693 | header = kzalloc(sizeof(struct ctl_table_header) + | ||
1694 | (2 * npath * sizeof(struct ctl_table)), GFP_KERNEL); | ||
1695 | if (!header) | ||
1554 | return NULL; | 1696 | return NULL; |
1555 | tmp->ctl_table = table; | 1697 | |
1556 | INIT_LIST_HEAD(&tmp->ctl_entry); | 1698 | new = (struct ctl_table *) (header + 1); |
1557 | tmp->used = 0; | 1699 | |
1558 | tmp->unregistering = NULL; | 1700 | /* Now connect the dots */ |
1559 | sysctl_set_parent(NULL, table); | 1701 | prevp = &header->ctl_table; |
1560 | if (sysctl_check_table(tmp->ctl_table)) { | 1702 | for (n = 0; n < npath; ++n, ++path) { |
1561 | kfree(tmp); | 1703 | /* Copy the procname */ |
1704 | new->procname = path->procname; | ||
1705 | new->ctl_name = path->ctl_name; | ||
1706 | new->mode = 0555; | ||
1707 | |||
1708 | *prevp = new; | ||
1709 | prevp = &new->child; | ||
1710 | |||
1711 | new += 2; | ||
1712 | } | ||
1713 | *prevp = table; | ||
1714 | header->ctl_table_arg = table; | ||
1715 | |||
1716 | INIT_LIST_HEAD(&header->ctl_entry); | ||
1717 | header->used = 0; | ||
1718 | header->unregistering = NULL; | ||
1719 | header->root = root; | ||
1720 | sysctl_set_parent(NULL, header->ctl_table); | ||
1721 | if (sysctl_check_table(namespaces, header->ctl_table)) { | ||
1722 | kfree(header); | ||
1562 | return NULL; | 1723 | return NULL; |
1563 | } | 1724 | } |
1564 | spin_lock(&sysctl_lock); | 1725 | spin_lock(&sysctl_lock); |
1565 | list_add_tail(&tmp->ctl_entry, &root_table_header.ctl_entry); | 1726 | header_list = lookup_header_list(root, namespaces); |
1727 | list_add_tail(&header->ctl_entry, header_list); | ||
1566 | spin_unlock(&sysctl_lock); | 1728 | spin_unlock(&sysctl_lock); |
1567 | return tmp; | 1729 | |
1730 | return header; | ||
1731 | } | ||
1732 | |||
1733 | /** | ||
1734 | * register_sysctl_paths - register a sysctl table hierarchy | ||
1735 | * @path: The path to the directory the sysctl table is in. | ||
1736 | * @table: the top-level table structure | ||
1737 | * | ||
1738 | * Register a sysctl table hierarchy. @table should be a filled in ctl_table | ||
1739 | * array. A completely 0 filled entry terminates the table. | ||
1740 | * | ||
1741 | * See __register_sysctl_paths for more details. | ||
1742 | */ | ||
1743 | struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path, | ||
1744 | struct ctl_table *table) | ||
1745 | { | ||
1746 | return __register_sysctl_paths(&sysctl_table_root, current->nsproxy, | ||
1747 | path, table); | ||
1748 | } | ||
1749 | |||
1750 | /** | ||
1751 | * register_sysctl_table - register a sysctl table hierarchy | ||
1752 | * @table: the top-level table structure | ||
1753 | * | ||
1754 | * Register a sysctl table hierarchy. @table should be a filled in ctl_table | ||
1755 | * array. A completely 0 filled entry terminates the table. | ||
1756 | * | ||
1757 | * See register_sysctl_paths for more details. | ||
1758 | */ | ||
1759 | struct ctl_table_header *register_sysctl_table(struct ctl_table *table) | ||
1760 | { | ||
1761 | static const struct ctl_path null_path[] = { {} }; | ||
1762 | |||
1763 | return register_sysctl_paths(null_path, table); | ||
1568 | } | 1764 | } |
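The new path-based registration above lets callers describe the directory part of a sysctl hierarchy with a compact ctl_path array instead of hand-building nested ctl_table directories. A minimal sketch of a caller, assuming the 2.6.24-era interfaces shown in this hunk (the "example" names, the my_value variable and the use of CTL_UNNUMBERED are illustrative, not taken from the patch):

    #include <linux/sysctl.h>

    /* Illustrative only: would show up as /proc/sys/kernel/example/value. */
    static int my_value;

    static struct ctl_table example_table[] = {
        {
            .ctl_name     = CTL_UNNUMBERED,
            .procname     = "value",
            .data         = &my_value,
            .maxlen       = sizeof(int),
            .mode         = 0644,
            .proc_handler = &proc_dointvec,
        },
        {}  /* a completely zero-filled entry terminates the table */
    };

    static const struct ctl_path example_path[] = {
        { .procname = "kernel",  .ctl_name = CTL_KERN },
        { .procname = "example", .ctl_name = CTL_UNNUMBERED },
        {}
    };

    static struct ctl_table_header *example_header;

    static int __init example_init(void)
    {
        example_header = register_sysctl_paths(example_path, example_table);
        return example_header ? 0 : -ENOMEM;
    }

    static void __exit example_exit(void)
    {
        /* unregister_sysctl_table() now tolerates a NULL header */
        unregister_sysctl_table(example_header);
    }

Each path component becomes one of the 2-element ctl_table arrays allocated by __register_sysctl_paths(), which is why the whole header can still be released with a single kfree() on unregister.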
1569 | 1765 | ||
1570 | /** | 1766 | /** |
@@ -1577,6 +1773,10 @@ struct ctl_table_header *register_sysctl_table(struct ctl_table * table) | |||
1577 | void unregister_sysctl_table(struct ctl_table_header * header) | 1773 | void unregister_sysctl_table(struct ctl_table_header * header) |
1578 | { | 1774 | { |
1579 | might_sleep(); | 1775 | might_sleep(); |
1776 | |||
1777 | if (header == NULL) | ||
1778 | return; | ||
1779 | |||
1580 | spin_lock(&sysctl_lock); | 1780 | spin_lock(&sysctl_lock); |
1581 | start_unregistering(header); | 1781 | start_unregistering(header); |
1582 | spin_unlock(&sysctl_lock); | 1782 | spin_unlock(&sysctl_lock); |
@@ -1589,6 +1789,12 @@ struct ctl_table_header *register_sysctl_table(struct ctl_table * table) | |||
1589 | return NULL; | 1789 | return NULL; |
1590 | } | 1790 | } |
1591 | 1791 | ||
1792 | struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path, | ||
1793 | struct ctl_table *table) | ||
1794 | { | ||
1795 | return NULL; | ||
1796 | } | ||
1797 | |||
1592 | void unregister_sysctl_table(struct ctl_table_header * table) | 1798 | void unregister_sysctl_table(struct ctl_table_header * table) |
1593 | { | 1799 | { |
1594 | } | 1800 | } |
@@ -2609,6 +2815,10 @@ static int deprecated_sysctl_warning(struct __sysctl_args *args) | |||
2609 | int name[CTL_MAXNAME]; | 2815 | int name[CTL_MAXNAME]; |
2610 | int i; | 2816 | int i; |
2611 | 2817 | ||
2818 | /* Check args->nlen. */ | ||
2819 | if (args->nlen < 0 || args->nlen > CTL_MAXNAME) | ||
2820 | return -ENOTDIR; | ||
2821 | |||
2612 | /* Read in the sysctl name for better debug message logging */ | 2822 | /* Read in the sysctl name for better debug message logging */ |
2613 | for (i = 0; i < args->nlen; i++) | 2823 | for (i = 0; i < args->nlen; i++) |
2614 | if (get_user(name[i], args->name + i)) | 2824 | if (get_user(name[i], args->name + i)) |
@@ -2643,6 +2853,7 @@ EXPORT_SYMBOL(proc_dostring); | |||
2643 | EXPORT_SYMBOL(proc_doulongvec_minmax); | 2853 | EXPORT_SYMBOL(proc_doulongvec_minmax); |
2644 | EXPORT_SYMBOL(proc_doulongvec_ms_jiffies_minmax); | 2854 | EXPORT_SYMBOL(proc_doulongvec_ms_jiffies_minmax); |
2645 | EXPORT_SYMBOL(register_sysctl_table); | 2855 | EXPORT_SYMBOL(register_sysctl_table); |
2856 | EXPORT_SYMBOL(register_sysctl_paths); | ||
2646 | EXPORT_SYMBOL(sysctl_intvec); | 2857 | EXPORT_SYMBOL(sysctl_intvec); |
2647 | EXPORT_SYMBOL(sysctl_jiffies); | 2858 | EXPORT_SYMBOL(sysctl_jiffies); |
2648 | EXPORT_SYMBOL(sysctl_ms_jiffies); | 2859 | EXPORT_SYMBOL(sysctl_ms_jiffies); |
diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c index ed6fe51df77a..c3206fa50048 100644 --- a/kernel/sysctl_check.c +++ b/kernel/sysctl_check.c | |||
@@ -1,6 +1,5 @@ | |||
1 | #include <linux/stat.h> | 1 | #include <linux/stat.h> |
2 | #include <linux/sysctl.h> | 2 | #include <linux/sysctl.h> |
3 | #include "../arch/s390/appldata/appldata.h" | ||
4 | #include "../fs/xfs/linux-2.6/xfs_sysctl.h" | 3 | #include "../fs/xfs/linux-2.6/xfs_sysctl.h" |
5 | #include <linux/sunrpc/debug.h> | 4 | #include <linux/sunrpc/debug.h> |
6 | #include <linux/string.h> | 5 | #include <linux/string.h> |
@@ -96,7 +95,7 @@ static struct trans_ctl_table trans_kern_table[] = { | |||
96 | 95 | ||
97 | { KERN_PTY, "pty", trans_pty_table }, | 96 | { KERN_PTY, "pty", trans_pty_table }, |
98 | { KERN_NGROUPS_MAX, "ngroups_max" }, | 97 | { KERN_NGROUPS_MAX, "ngroups_max" }, |
99 | { KERN_SPARC_SCONS_PWROFF, "scons_poweroff" }, | 98 | { KERN_SPARC_SCONS_PWROFF, "scons-poweroff" }, |
100 | { KERN_HZ_TIMER, "hz_timer" }, | 99 | { KERN_HZ_TIMER, "hz_timer" }, |
101 | { KERN_UNKNOWN_NMI_PANIC, "unknown_nmi_panic" }, | 100 | { KERN_UNKNOWN_NMI_PANIC, "unknown_nmi_panic" }, |
102 | { KERN_BOOTLOADER_TYPE, "bootloader_type" }, | 101 | { KERN_BOOTLOADER_TYPE, "bootloader_type" }, |
@@ -140,9 +139,6 @@ static struct trans_ctl_table trans_vm_table[] = { | |||
140 | { VM_PANIC_ON_OOM, "panic_on_oom" }, | 139 | { VM_PANIC_ON_OOM, "panic_on_oom" }, |
141 | { VM_VDSO_ENABLED, "vdso_enabled" }, | 140 | { VM_VDSO_ENABLED, "vdso_enabled" }, |
142 | { VM_MIN_SLAB, "min_slab_ratio" }, | 141 | { VM_MIN_SLAB, "min_slab_ratio" }, |
143 | { VM_CMM_PAGES, "cmm_pages" }, | ||
144 | { VM_CMM_TIMED_PAGES, "cmm_timed_pages" }, | ||
145 | { VM_CMM_TIMEOUT, "cmm_timeout" }, | ||
146 | 142 | ||
147 | {} | 143 | {} |
148 | }; | 144 | }; |
@@ -237,36 +233,6 @@ static struct trans_ctl_table trans_net_ipv4_conf_table[] = { | |||
237 | {} | 233 | {} |
238 | }; | 234 | }; |
239 | 235 | ||
240 | |||
241 | static struct trans_ctl_table trans_net_ipv4_vs_table[] = { | ||
242 | { NET_IPV4_VS_AMEMTHRESH, "amemthresh" }, | ||
243 | { NET_IPV4_VS_DEBUG_LEVEL, "debug_level" }, | ||
244 | { NET_IPV4_VS_AMDROPRATE, "am_droprate" }, | ||
245 | { NET_IPV4_VS_DROP_ENTRY, "drop_entry" }, | ||
246 | { NET_IPV4_VS_DROP_PACKET, "drop_packet" }, | ||
247 | { NET_IPV4_VS_SECURE_TCP, "secure_tcp" }, | ||
248 | { NET_IPV4_VS_TO_ES, "timeout_established" }, | ||
249 | { NET_IPV4_VS_TO_SS, "timeout_synsent" }, | ||
250 | { NET_IPV4_VS_TO_SR, "timeout_synrecv" }, | ||
251 | { NET_IPV4_VS_TO_FW, "timeout_finwait" }, | ||
252 | { NET_IPV4_VS_TO_TW, "timeout_timewait" }, | ||
253 | { NET_IPV4_VS_TO_CL, "timeout_close" }, | ||
254 | { NET_IPV4_VS_TO_CW, "timeout_closewait" }, | ||
255 | { NET_IPV4_VS_TO_LA, "timeout_lastack" }, | ||
256 | { NET_IPV4_VS_TO_LI, "timeout_listen" }, | ||
257 | { NET_IPV4_VS_TO_SA, "timeout_synack" }, | ||
258 | { NET_IPV4_VS_TO_UDP, "timeout_udp" }, | ||
259 | { NET_IPV4_VS_TO_ICMP, "timeout_icmp" }, | ||
260 | { NET_IPV4_VS_CACHE_BYPASS, "cache_bypass" }, | ||
261 | { NET_IPV4_VS_EXPIRE_NODEST_CONN, "expire_nodest_conn" }, | ||
262 | { NET_IPV4_VS_EXPIRE_QUIESCENT_TEMPLATE, "expire_quiescent_template" }, | ||
263 | { NET_IPV4_VS_SYNC_THRESHOLD, "sync_threshold" }, | ||
264 | { NET_IPV4_VS_NAT_ICMP_SEND, "nat_icmp_send" }, | ||
265 | { NET_IPV4_VS_LBLC_EXPIRE, "lblc_expiration" }, | ||
266 | { NET_IPV4_VS_LBLCR_EXPIRE, "lblcr_expiration" }, | ||
267 | {} | ||
268 | }; | ||
269 | |||
270 | static struct trans_ctl_table trans_net_neigh_vars_table[] = { | 236 | static struct trans_ctl_table trans_net_neigh_vars_table[] = { |
271 | { NET_NEIGH_MCAST_SOLICIT, "mcast_solicit" }, | 237 | { NET_NEIGH_MCAST_SOLICIT, "mcast_solicit" }, |
272 | { NET_NEIGH_UCAST_SOLICIT, "ucast_solicit" }, | 238 | { NET_NEIGH_UCAST_SOLICIT, "ucast_solicit" }, |
@@ -341,7 +307,6 @@ static struct trans_ctl_table trans_net_ipv4_table[] = { | |||
341 | { NET_IPV4_ROUTE, "route", trans_net_ipv4_route_table }, | 307 | { NET_IPV4_ROUTE, "route", trans_net_ipv4_route_table }, |
342 | /* NET_IPV4_FIB_HASH unused */ | 308 | /* NET_IPV4_FIB_HASH unused */ |
343 | { NET_IPV4_NETFILTER, "netfilter", trans_net_ipv4_netfilter_table }, | 309 | { NET_IPV4_NETFILTER, "netfilter", trans_net_ipv4_netfilter_table }, |
344 | { NET_IPV4_VS, "vs", trans_net_ipv4_vs_table }, | ||
345 | 310 | ||
346 | { NET_IPV4_TCP_TIMESTAMPS, "tcp_timestamps" }, | 311 | { NET_IPV4_TCP_TIMESTAMPS, "tcp_timestamps" }, |
347 | { NET_IPV4_TCP_WINDOW_SCALING, "tcp_window_scaling" }, | 312 | { NET_IPV4_TCP_WINDOW_SCALING, "tcp_window_scaling" }, |
@@ -462,7 +427,7 @@ static struct trans_ctl_table trans_net_netrom_table[] = { | |||
462 | {} | 427 | {} |
463 | }; | 428 | }; |
464 | 429 | ||
465 | static struct trans_ctl_table trans_net_ax25_table[] = { | 430 | static struct trans_ctl_table trans_net_ax25_param_table[] = { |
466 | { NET_AX25_IP_DEFAULT_MODE, "ip_default_mode" }, | 431 | { NET_AX25_IP_DEFAULT_MODE, "ip_default_mode" }, |
467 | { NET_AX25_DEFAULT_MODE, "ax25_default_mode" }, | 432 | { NET_AX25_DEFAULT_MODE, "ax25_default_mode" }, |
468 | { NET_AX25_BACKOFF_TYPE, "backoff_type" }, | 433 | { NET_AX25_BACKOFF_TYPE, "backoff_type" }, |
@@ -480,6 +445,11 @@ static struct trans_ctl_table trans_net_ax25_table[] = { | |||
480 | {} | 445 | {} |
481 | }; | 446 | }; |
482 | 447 | ||
448 | static struct trans_ctl_table trans_net_ax25_table[] = { | ||
449 | { 0, NULL, trans_net_ax25_param_table }, | ||
450 | {} | ||
451 | }; | ||
452 | |||
483 | static struct trans_ctl_table trans_net_bridge_table[] = { | 453 | static struct trans_ctl_table trans_net_bridge_table[] = { |
484 | { NET_BRIDGE_NF_CALL_ARPTABLES, "bridge-nf-call-arptables" }, | 454 | { NET_BRIDGE_NF_CALL_ARPTABLES, "bridge-nf-call-arptables" }, |
485 | { NET_BRIDGE_NF_CALL_IPTABLES, "bridge-nf-call-iptables" }, | 455 | { NET_BRIDGE_NF_CALL_IPTABLES, "bridge-nf-call-iptables" }, |
@@ -738,7 +708,7 @@ static struct trans_ctl_table trans_net_table[] = { | |||
738 | { NET_ROSE, "rose", trans_net_rose_table }, | 708 | { NET_ROSE, "rose", trans_net_rose_table }, |
739 | { NET_IPV6, "ipv6", trans_net_ipv6_table }, | 709 | { NET_IPV6, "ipv6", trans_net_ipv6_table }, |
740 | { NET_X25, "x25", trans_net_x25_table }, | 710 | { NET_X25, "x25", trans_net_x25_table }, |
741 | { NET_TR, "tr", trans_net_tr_table }, | 711 | { NET_TR, "token-ring", trans_net_tr_table }, |
742 | { NET_DECNET, "decnet", trans_net_decnet_table }, | 712 | { NET_DECNET, "decnet", trans_net_decnet_table }, |
743 | /* NET_ECONET not used */ | 713 | /* NET_ECONET not used */ |
744 | { NET_SCTP, "sctp", trans_net_sctp_table }, | 714 | { NET_SCTP, "sctp", trans_net_sctp_table }, |
@@ -1219,16 +1189,6 @@ static struct trans_ctl_table trans_arlan_table[] = { | |||
1219 | {} | 1189 | {} |
1220 | }; | 1190 | }; |
1221 | 1191 | ||
1222 | static struct trans_ctl_table trans_appldata_table[] = { | ||
1223 | { CTL_APPLDATA_TIMER, "timer" }, | ||
1224 | { CTL_APPLDATA_INTERVAL, "interval" }, | ||
1225 | { CTL_APPLDATA_OS, "os" }, | ||
1226 | { CTL_APPLDATA_NET_SUM, "net_sum" }, | ||
1227 | { CTL_APPLDATA_MEM, "mem" }, | ||
1228 | {} | ||
1229 | |||
1230 | }; | ||
1231 | |||
1232 | static struct trans_ctl_table trans_s390dbf_table[] = { | 1192 | static struct trans_ctl_table trans_s390dbf_table[] = { |
1233 | { 5678 /* CTL_S390DBF_STOPPABLE */, "debug_stoppable" }, | 1193 | { 5678 /* CTL_S390DBF_STOPPABLE */, "debug_stoppable" }, |
1234 | { 5679 /* CTL_S390DBF_ACTIVE */, "debug_active" }, | 1194 | { 5679 /* CTL_S390DBF_ACTIVE */, "debug_active" }, |
@@ -1273,7 +1233,6 @@ static struct trans_ctl_table trans_root_table[] = { | |||
1273 | { CTL_ABI, "abi" }, | 1233 | { CTL_ABI, "abi" }, |
1274 | /* CTL_CPU not used */ | 1234 | /* CTL_CPU not used */ |
1275 | { CTL_ARLAN, "arlan", trans_arlan_table }, | 1235 | { CTL_ARLAN, "arlan", trans_arlan_table }, |
1276 | { CTL_APPLDATA, "appldata", trans_appldata_table }, | ||
1277 | { CTL_S390DBF, "s390dbf", trans_s390dbf_table }, | 1236 | { CTL_S390DBF, "s390dbf", trans_s390dbf_table }, |
1278 | { CTL_SUNRPC, "sunrpc", trans_sunrpc_table }, | 1237 | { CTL_SUNRPC, "sunrpc", trans_sunrpc_table }, |
1279 | { CTL_PM, "pm", trans_pm_table }, | 1238 | { CTL_PM, "pm", trans_pm_table }, |
@@ -1383,7 +1342,8 @@ static void sysctl_repair_table(struct ctl_table *table) | |||
1383 | } | 1342 | } |
1384 | } | 1343 | } |
1385 | 1344 | ||
1386 | static struct ctl_table *sysctl_check_lookup(struct ctl_table *table) | 1345 | static struct ctl_table *sysctl_check_lookup(struct nsproxy *namespaces, |
1346 | struct ctl_table *table) | ||
1387 | { | 1347 | { |
1388 | struct ctl_table_header *head; | 1348 | struct ctl_table_header *head; |
1389 | struct ctl_table *ref, *test; | 1349 | struct ctl_table *ref, *test; |
@@ -1391,8 +1351,8 @@ static struct ctl_table *sysctl_check_lookup(struct ctl_table *table) | |||
1391 | 1351 | ||
1392 | depth = sysctl_depth(table); | 1352 | depth = sysctl_depth(table); |
1393 | 1353 | ||
1394 | for (head = sysctl_head_next(NULL); head; | 1354 | for (head = __sysctl_head_next(namespaces, NULL); head; |
1395 | head = sysctl_head_next(head)) { | 1355 | head = __sysctl_head_next(namespaces, head)) { |
1396 | cur_depth = depth; | 1356 | cur_depth = depth; |
1397 | ref = head->ctl_table; | 1357 | ref = head->ctl_table; |
1398 | repeat: | 1358 | repeat: |
@@ -1432,17 +1392,19 @@ static void set_fail(const char **fail, struct ctl_table *table, const char *str | |||
1432 | printk(KERN_ERR "sysctl table check failed: "); | 1392 | printk(KERN_ERR "sysctl table check failed: "); |
1433 | sysctl_print_path(table); | 1393 | sysctl_print_path(table); |
1434 | printk(" %s\n", *fail); | 1394 | printk(" %s\n", *fail); |
1395 | dump_stack(); | ||
1435 | } | 1396 | } |
1436 | *fail = str; | 1397 | *fail = str; |
1437 | } | 1398 | } |
1438 | 1399 | ||
1439 | static int sysctl_check_dir(struct ctl_table *table) | 1400 | static int sysctl_check_dir(struct nsproxy *namespaces, |
1401 | struct ctl_table *table) | ||
1440 | { | 1402 | { |
1441 | struct ctl_table *ref; | 1403 | struct ctl_table *ref; |
1442 | int error; | 1404 | int error; |
1443 | 1405 | ||
1444 | error = 0; | 1406 | error = 0; |
1445 | ref = sysctl_check_lookup(table); | 1407 | ref = sysctl_check_lookup(namespaces, table); |
1446 | if (ref) { | 1408 | if (ref) { |
1447 | int match = 0; | 1409 | int match = 0; |
1448 | if ((!table->procname && !ref->procname) || | 1410 | if ((!table->procname && !ref->procname) || |
@@ -1467,11 +1429,12 @@ static int sysctl_check_dir(struct ctl_table *table) | |||
1467 | return error; | 1429 | return error; |
1468 | } | 1430 | } |
1469 | 1431 | ||
1470 | static void sysctl_check_leaf(struct ctl_table *table, const char **fail) | 1432 | static void sysctl_check_leaf(struct nsproxy *namespaces, |
1433 | struct ctl_table *table, const char **fail) | ||
1471 | { | 1434 | { |
1472 | struct ctl_table *ref; | 1435 | struct ctl_table *ref; |
1473 | 1436 | ||
1474 | ref = sysctl_check_lookup(table); | 1437 | ref = sysctl_check_lookup(namespaces, table); |
1475 | if (ref && (ref != table)) | 1438 | if (ref && (ref != table)) |
1476 | set_fail(fail, table, "Sysctl already exists"); | 1439 | set_fail(fail, table, "Sysctl already exists"); |
1477 | } | 1440 | } |
@@ -1495,7 +1458,7 @@ static void sysctl_check_bin_path(struct ctl_table *table, const char **fail) | |||
1495 | } | 1458 | } |
1496 | } | 1459 | } |
1497 | 1460 | ||
1498 | int sysctl_check_table(struct ctl_table *table) | 1461 | int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table) |
1499 | { | 1462 | { |
1500 | int error = 0; | 1463 | int error = 0; |
1501 | for (; table->ctl_name || table->procname; table++) { | 1464 | for (; table->ctl_name || table->procname; table++) { |
@@ -1525,7 +1488,7 @@ int sysctl_check_table(struct ctl_table *table) | |||
1525 | set_fail(&fail, table, "Directory with extra1"); | 1488 | set_fail(&fail, table, "Directory with extra1"); |
1526 | if (table->extra2) | 1489 | if (table->extra2) |
1527 | set_fail(&fail, table, "Directory with extra2"); | 1490 | set_fail(&fail, table, "Directory with extra2"); |
1528 | if (sysctl_check_dir(table)) | 1491 | if (sysctl_check_dir(namespaces, table)) |
1529 | set_fail(&fail, table, "Inconsistent directory names"); | 1492 | set_fail(&fail, table, "Inconsistent directory names"); |
1530 | } else { | 1493 | } else { |
1531 | if ((table->strategy == sysctl_data) || | 1494 | if ((table->strategy == sysctl_data) || |
@@ -1574,7 +1537,7 @@ int sysctl_check_table(struct ctl_table *table) | |||
1574 | if (!table->procname && table->proc_handler) | 1537 | if (!table->procname && table->proc_handler) |
1575 | set_fail(&fail, table, "proc_handler without procname"); | 1538 | set_fail(&fail, table, "proc_handler without procname"); |
1576 | #endif | 1539 | #endif |
1577 | sysctl_check_leaf(table, &fail); | 1540 | sysctl_check_leaf(namespaces, table, &fail); |
1578 | } | 1541 | } |
1579 | sysctl_check_bin_path(table, &fail); | 1542 | sysctl_check_bin_path(table, &fail); |
1580 | if (fail) { | 1543 | if (fail) { |
@@ -1582,7 +1545,7 @@ int sysctl_check_table(struct ctl_table *table) | |||
1582 | error = -EINVAL; | 1545 | error = -EINVAL; |
1583 | } | 1546 | } |
1584 | if (table->child) | 1547 | if (table->child) |
1585 | error |= sysctl_check_table(table->child); | 1548 | error |= sysctl_check_table(namespaces, table->child); |
1586 | } | 1549 | } |
1587 | return error; | 1550 | return error; |
1588 | } | 1551 | } |
diff --git a/kernel/taskstats.c b/kernel/taskstats.c index 354e74bc17c1..07e86a828073 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c | |||
@@ -398,31 +398,31 @@ static int cgroupstats_user_cmd(struct sk_buff *skb, struct genl_info *info) | |||
398 | 398 | ||
399 | fd = nla_get_u32(info->attrs[CGROUPSTATS_CMD_ATTR_FD]); | 399 | fd = nla_get_u32(info->attrs[CGROUPSTATS_CMD_ATTR_FD]); |
400 | file = fget_light(fd, &fput_needed); | 400 | file = fget_light(fd, &fput_needed); |
401 | if (file) { | 401 | if (!file) |
402 | size = nla_total_size(sizeof(struct cgroupstats)); | 402 | return 0; |
403 | 403 | ||
404 | rc = prepare_reply(info, CGROUPSTATS_CMD_NEW, &rep_skb, | 404 | size = nla_total_size(sizeof(struct cgroupstats)); |
405 | size); | ||
406 | if (rc < 0) | ||
407 | goto err; | ||
408 | 405 | ||
409 | na = nla_reserve(rep_skb, CGROUPSTATS_TYPE_CGROUP_STATS, | 406 | rc = prepare_reply(info, CGROUPSTATS_CMD_NEW, &rep_skb, |
410 | sizeof(struct cgroupstats)); | 407 | size); |
411 | stats = nla_data(na); | 408 | if (rc < 0) |
412 | memset(stats, 0, sizeof(*stats)); | 409 | goto err; |
413 | 410 | ||
414 | rc = cgroupstats_build(stats, file->f_dentry); | 411 | na = nla_reserve(rep_skb, CGROUPSTATS_TYPE_CGROUP_STATS, |
415 | if (rc < 0) | 412 | sizeof(struct cgroupstats)); |
416 | goto err; | 413 | stats = nla_data(na); |
414 | memset(stats, 0, sizeof(*stats)); | ||
417 | 415 | ||
418 | fput_light(file, fput_needed); | 416 | rc = cgroupstats_build(stats, file->f_dentry); |
419 | return send_reply(rep_skb, info->snd_pid); | 417 | if (rc < 0) { |
418 | nlmsg_free(rep_skb); | ||
419 | goto err; | ||
420 | } | 420 | } |
421 | 421 | ||
422 | rc = send_reply(rep_skb, info->snd_pid); | ||
423 | |||
422 | err: | 424 | err: |
423 | if (file) | 425 | fput_light(file, fput_needed); |
424 | fput_light(file, fput_needed); | ||
425 | nlmsg_free(rep_skb); | ||
426 | return rc; | 426 | return rc; |
427 | } | 427 | } |
428 | 428 | ||
diff --git a/kernel/test_kprobes.c b/kernel/test_kprobes.c new file mode 100644 index 000000000000..88cdb109e13c --- /dev/null +++ b/kernel/test_kprobes.c | |||
@@ -0,0 +1,216 @@ | |||
1 | /* | ||
2 | * test_kprobes.c - simple sanity test for *probes | ||
3 | * | ||
4 | * Copyright IBM Corp. 2008 | ||
5 | * | ||
6 | * This program is free software; you can redistribute it and/or modify | ||
7 | * it under the terms of the GNU General Public License as published by | ||
8 | * the Free Software Foundation; either version 2 of the License, or | ||
9 | * (at your option) any later version. | ||
10 | * | ||
11 | * This program is distributed in the hope that it would be useful, but | ||
12 | * WITHOUT ANY WARRANTY; without even the implied warranty of | ||
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See | ||
14 | * the GNU General Public License for more details. | ||
15 | */ | ||
16 | |||
17 | #include <linux/kernel.h> | ||
18 | #include <linux/kprobes.h> | ||
19 | #include <linux/random.h> | ||
20 | |||
21 | #define div_factor 3 | ||
22 | |||
23 | static u32 rand1, preh_val, posth_val, jph_val; | ||
24 | static int errors, handler_errors, num_tests; | ||
25 | |||
26 | static noinline u32 kprobe_target(u32 value) | ||
27 | { | ||
28 | /* | ||
29 | * gcc ignores noinline on some architectures unless we stuff | ||
30 | * sufficient lard into the function. The get_kprobe() here is | ||
31 | * just for that. | ||
32 | * | ||
33 | * NOTE: We aren't concerned about the correctness of get_kprobe() | ||
34 | * here; hence, this call is neither under !preempt nor with the | ||
35 | * kprobe_mutex held. This is fine(tm) | ||
36 | */ | ||
37 | if (get_kprobe((void *)0xdeadbeef)) | ||
38 | printk(KERN_INFO "Kprobe smoke test: probe on 0xdeadbeef!\n"); | ||
39 | |||
40 | return (value / div_factor); | ||
41 | } | ||
42 | |||
43 | static int kp_pre_handler(struct kprobe *p, struct pt_regs *regs) | ||
44 | { | ||
45 | preh_val = (rand1 / div_factor); | ||
46 | return 0; | ||
47 | } | ||
48 | |||
49 | static void kp_post_handler(struct kprobe *p, struct pt_regs *regs, | ||
50 | unsigned long flags) | ||
51 | { | ||
52 | if (preh_val != (rand1 / div_factor)) { | ||
53 | handler_errors++; | ||
54 | printk(KERN_ERR "Kprobe smoke test failed: " | ||
55 | "incorrect value in post_handler\n"); | ||
56 | } | ||
57 | posth_val = preh_val + div_factor; | ||
58 | } | ||
59 | |||
60 | static struct kprobe kp = { | ||
61 | .symbol_name = "kprobe_target", | ||
62 | .pre_handler = kp_pre_handler, | ||
63 | .post_handler = kp_post_handler | ||
64 | }; | ||
65 | |||
66 | static int test_kprobe(void) | ||
67 | { | ||
68 | int ret; | ||
69 | |||
70 | ret = register_kprobe(&kp); | ||
71 | if (ret < 0) { | ||
72 | printk(KERN_ERR "Kprobe smoke test failed: " | ||
73 | "register_kprobe returned %d\n", ret); | ||
74 | return ret; | ||
75 | } | ||
76 | |||
77 | ret = kprobe_target(rand1); | ||
78 | unregister_kprobe(&kp); | ||
79 | |||
80 | if (preh_val == 0) { | ||
81 | printk(KERN_ERR "Kprobe smoke test failed: " | ||
82 | "kprobe pre_handler not called\n"); | ||
83 | handler_errors++; | ||
84 | } | ||
85 | |||
86 | if (posth_val == 0) { | ||
87 | printk(KERN_ERR "Kprobe smoke test failed: " | ||
88 | "kprobe post_handler not called\n"); | ||
89 | handler_errors++; | ||
90 | } | ||
91 | |||
92 | return 0; | ||
93 | } | ||
94 | |||
95 | static u32 j_kprobe_target(u32 value) | ||
96 | { | ||
97 | if (value != rand1) { | ||
98 | handler_errors++; | ||
99 | printk(KERN_ERR "Kprobe smoke test failed: " | ||
100 | "incorrect value in jprobe handler\n"); | ||
101 | } | ||
102 | |||
103 | jph_val = rand1; | ||
104 | jprobe_return(); | ||
105 | return 0; | ||
106 | } | ||
107 | |||
108 | static struct jprobe jp = { | ||
109 | .entry = j_kprobe_target, | ||
110 | .kp.symbol_name = "kprobe_target" | ||
111 | }; | ||
112 | |||
113 | static int test_jprobe(void) | ||
114 | { | ||
115 | int ret; | ||
116 | |||
117 | ret = register_jprobe(&jp); | ||
118 | if (ret < 0) { | ||
119 | printk(KERN_ERR "Kprobe smoke test failed: " | ||
120 | "register_jprobe returned %d\n", ret); | ||
121 | return ret; | ||
122 | } | ||
123 | |||
124 | ret = kprobe_target(rand1); | ||
125 | unregister_jprobe(&jp); | ||
126 | if (jph_val == 0) { | ||
127 | printk(KERN_ERR "Kprobe smoke test failed: " | ||
128 | "jprobe handler not called\n"); | ||
129 | handler_errors++; | ||
130 | } | ||
131 | |||
132 | return 0; | ||
133 | } | ||
134 | |||
135 | #ifdef CONFIG_KRETPROBES | ||
136 | static u32 krph_val; | ||
137 | |||
138 | static int return_handler(struct kretprobe_instance *ri, struct pt_regs *regs) | ||
139 | { | ||
140 | unsigned long ret = regs_return_value(regs); | ||
141 | |||
142 | if (ret != (rand1 / div_factor)) { | ||
143 | handler_errors++; | ||
144 | printk(KERN_ERR "Kprobe smoke test failed: " | ||
145 | "incorrect value in kretprobe handler\n"); | ||
146 | } | ||
147 | |||
148 | krph_val = (rand1 / div_factor); | ||
149 | return 0; | ||
150 | } | ||
151 | |||
152 | static struct kretprobe rp = { | ||
153 | .handler = return_handler, | ||
154 | .kp.symbol_name = "kprobe_target" | ||
155 | }; | ||
156 | |||
157 | static int test_kretprobe(void) | ||
158 | { | ||
159 | int ret; | ||
160 | |||
161 | ret = register_kretprobe(&rp); | ||
162 | if (ret < 0) { | ||
163 | printk(KERN_ERR "Kprobe smoke test failed: " | ||
164 | "register_kretprobe returned %d\n", ret); | ||
165 | return ret; | ||
166 | } | ||
167 | |||
168 | ret = kprobe_target(rand1); | ||
169 | unregister_kretprobe(&rp); | ||
170 | if (krph_val == 0) { | ||
171 | printk(KERN_ERR "Kprobe smoke test failed: " | ||
172 | "kretprobe handler not called\n"); | ||
173 | handler_errors++; | ||
174 | } | ||
175 | |||
176 | return 0; | ||
177 | } | ||
178 | #endif /* CONFIG_KRETPROBES */ | ||
179 | |||
180 | int init_test_probes(void) | ||
181 | { | ||
182 | int ret; | ||
183 | |||
184 | do { | ||
185 | rand1 = random32(); | ||
186 | } while (rand1 <= div_factor); | ||
187 | |||
188 | printk(KERN_INFO "Kprobe smoke test started\n"); | ||
189 | num_tests++; | ||
190 | ret = test_kprobe(); | ||
191 | if (ret < 0) | ||
192 | errors++; | ||
193 | |||
194 | num_tests++; | ||
195 | ret = test_jprobe(); | ||
196 | if (ret < 0) | ||
197 | errors++; | ||
198 | |||
199 | #ifdef CONFIG_KRETPROBES | ||
200 | num_tests++; | ||
201 | ret = test_kretprobe(); | ||
202 | if (ret < 0) | ||
203 | errors++; | ||
204 | #endif /* CONFIG_KRETPROBES */ | ||
205 | |||
206 | if (errors) | ||
207 | printk(KERN_ERR "BUG: Kprobe smoke test: %d out of " | ||
208 | "%d tests failed\n", errors, num_tests); | ||
209 | else if (handler_errors) | ||
210 | printk(KERN_ERR "BUG: Kprobe smoke test: %d error(s) " | ||
211 | "running handlers\n", handler_errors); | ||
212 | else | ||
213 | printk(KERN_INFO "Kprobe smoke test passed successfully\n"); | ||
214 | |||
215 | return 0; | ||
216 | } | ||
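The new file only provides init_test_probes(); the call site is not part of this hunk. Presumably it is run once from the kprobes initialization path behind a config option; a sketch of such a hook, where the CONFIG_KPROBES_SANITY_TEST name and the placement inside init_kprobes() are assumptions rather than something shown in this diff:

    /* kernel/kprobes.c -- assumed call site, sketch only */
    static int __init init_kprobes(void)
    {
        int err = 0;

        /* ... existing kprobes setup elided ... */

    #ifdef CONFIG_KPROBES_SANITY_TEST
        if (!err)
            err = init_test_probes();   /* boot-time smoke test */
    #endif
        return err;
    }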
diff --git a/kernel/time.c b/kernel/time.c index 09d3c45c4da7..4064c0566e77 100644 --- a/kernel/time.c +++ b/kernel/time.c | |||
@@ -129,6 +129,7 @@ static inline void warp_clock(void) | |||
129 | write_seqlock_irq(&xtime_lock); | 129 | write_seqlock_irq(&xtime_lock); |
130 | wall_to_monotonic.tv_sec -= sys_tz.tz_minuteswest * 60; | 130 | wall_to_monotonic.tv_sec -= sys_tz.tz_minuteswest * 60; |
131 | xtime.tv_sec += sys_tz.tz_minuteswest * 60; | 131 | xtime.tv_sec += sys_tz.tz_minuteswest * 60; |
132 | update_xtime_cache(0); | ||
132 | write_sequnlock_irq(&xtime_lock); | 133 | write_sequnlock_irq(&xtime_lock); |
133 | clock_was_set(); | 134 | clock_was_set(); |
134 | } | 135 | } |
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 822beebe664a..3e59fce6dd43 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c | |||
@@ -41,6 +41,11 @@ unsigned long clockevent_delta2ns(unsigned long latch, | |||
41 | { | 41 | { |
42 | u64 clc = ((u64) latch << evt->shift); | 42 | u64 clc = ((u64) latch << evt->shift); |
43 | 43 | ||
44 | if (unlikely(!evt->mult)) { | ||
45 | evt->mult = 1; | ||
46 | WARN_ON(1); | ||
47 | } | ||
48 | |||
44 | do_div(clc, evt->mult); | 49 | do_div(clc, evt->mult); |
45 | if (clc < 1000) | 50 | if (clc < 1000) |
46 | clc = 1000; | 51 | clc = 1000; |
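clockevent_delta2ns() converts a device latch value to nanoseconds as roughly ns = (latch << shift) / mult, so a mult of 0 would trap inside do_div(); the new guard substitutes 1 and warns instead of crashing. A worked example with hypothetical device numbers (not from the patch):

    /* A ~1 MHz event device: mult ~= (1e6 Hz / 1e9 ns) << 32 ~= 4294967. */
    static unsigned long example_delta2ns(void)
    {
        struct clock_event_device dev = {
            .shift = 32,
            .mult  = 4294967,
        };

        /* 1000 cycles at 1 MHz should come out near 1 ms: */
        return clockevent_delta2ns(1000, &dev);   /* ~= 1000000 ns */
    }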
@@ -78,6 +83,11 @@ int clockevents_program_event(struct clock_event_device *dev, ktime_t expires, | |||
78 | unsigned long long clc; | 83 | unsigned long long clc; |
79 | int64_t delta; | 84 | int64_t delta; |
80 | 85 | ||
86 | if (unlikely(expires.tv64 < 0)) { | ||
87 | WARN_ON_ONCE(1); | ||
88 | return -ETIME; | ||
89 | } | ||
90 | |||
81 | delta = ktime_to_ns(ktime_sub(expires, now)); | 91 | delta = ktime_to_ns(ktime_sub(expires, now)); |
82 | 92 | ||
83 | if (delta <= 0) | 93 | if (delta <= 0) |
@@ -146,6 +156,14 @@ static void clockevents_notify_released(void) | |||
146 | void clockevents_register_device(struct clock_event_device *dev) | 156 | void clockevents_register_device(struct clock_event_device *dev) |
147 | { | 157 | { |
148 | BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED); | 158 | BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED); |
159 | /* | ||
160 | * A nsec2cyc multiplicator of 0 is invalid and we'd crash | ||
161 | * on it, so fix it up and emit a warning: | ||
162 | */ | ||
163 | if (unlikely(!dev->mult)) { | ||
164 | dev->mult = 1; | ||
165 | WARN_ON(1); | ||
166 | } | ||
149 | 167 | ||
150 | spin_lock(&clockevents_lock); | 168 | spin_lock(&clockevents_lock); |
151 | 169 | ||
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index c8a9d13874df..6e9259a5d501 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c | |||
@@ -142,8 +142,13 @@ static void clocksource_watchdog(unsigned long data) | |||
142 | } | 142 | } |
143 | 143 | ||
144 | if (!list_empty(&watchdog_list)) { | 144 | if (!list_empty(&watchdog_list)) { |
145 | __mod_timer(&watchdog_timer, | 145 | /* Cycle through CPUs to check if the CPUs stay synchronized to |
146 | watchdog_timer.expires + WATCHDOG_INTERVAL); | 146 | * each other. */ |
147 | int next_cpu = next_cpu(raw_smp_processor_id(), cpu_online_map); | ||
148 | if (next_cpu >= NR_CPUS) | ||
149 | next_cpu = first_cpu(cpu_online_map); | ||
150 | watchdog_timer.expires += WATCHDOG_INTERVAL; | ||
151 | add_timer_on(&watchdog_timer, next_cpu); | ||
147 | } | 152 | } |
148 | spin_unlock(&watchdog_lock); | 153 | spin_unlock(&watchdog_lock); |
149 | } | 154 | } |
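Instead of re-arming the watchdog timer on whichever CPU it last ran on, the loop above now walks the online map so successive expiries land on different CPUs and each one gets cross-checked against the watchdog clocksource. The rotation idiom on its own, using the same 2.6.24 cpumask helpers (sketch, hypothetical function name):

    /* Pick the online CPU after 'cpu', wrapping back to the first one. */
    static int pick_next_watchdog_cpu(int cpu)
    {
        int next = next_cpu(cpu, cpu_online_map);

        if (next >= NR_CPUS)              /* ran off the end of the map */
            next = first_cpu(cpu_online_map);
        return next;
    }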
@@ -165,7 +170,7 @@ static void clocksource_check_watchdog(struct clocksource *cs) | |||
165 | if (!started && watchdog) { | 170 | if (!started && watchdog) { |
166 | watchdog_last = watchdog->read(); | 171 | watchdog_last = watchdog->read(); |
167 | watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL; | 172 | watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL; |
168 | add_timer(&watchdog_timer); | 173 | add_timer_on(&watchdog_timer, first_cpu(cpu_online_map)); |
169 | } | 174 | } |
170 | } else { | 175 | } else { |
171 | if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) | 176 | if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) |
@@ -175,7 +180,7 @@ static void clocksource_check_watchdog(struct clocksource *cs) | |||
175 | if (watchdog) | 180 | if (watchdog) |
176 | del_timer(&watchdog_timer); | 181 | del_timer(&watchdog_timer); |
177 | watchdog = cs; | 182 | watchdog = cs; |
178 | init_timer(&watchdog_timer); | 183 | init_timer_deferrable(&watchdog_timer); |
179 | watchdog_timer.function = clocksource_watchdog; | 184 | watchdog_timer.function = clocksource_watchdog; |
180 | 185 | ||
181 | /* Reset watchdog cycles */ | 186 | /* Reset watchdog cycles */ |
@@ -186,7 +191,8 @@ static void clocksource_check_watchdog(struct clocksource *cs) | |||
186 | watchdog_last = watchdog->read(); | 191 | watchdog_last = watchdog->read(); |
187 | watchdog_timer.expires = | 192 | watchdog_timer.expires = |
188 | jiffies + WATCHDOG_INTERVAL; | 193 | jiffies + WATCHDOG_INTERVAL; |
189 | add_timer(&watchdog_timer); | 194 | add_timer_on(&watchdog_timer, |
195 | first_cpu(cpu_online_map)); | ||
190 | } | 196 | } |
191 | } | 197 | } |
192 | } | 198 | } |
@@ -331,6 +337,21 @@ void clocksource_change_rating(struct clocksource *cs, int rating) | |||
331 | spin_unlock_irqrestore(&clocksource_lock, flags); | 337 | spin_unlock_irqrestore(&clocksource_lock, flags); |
332 | } | 338 | } |
333 | 339 | ||
340 | /** | ||
341 | * clocksource_unregister - remove a registered clocksource | ||
342 | */ | ||
343 | void clocksource_unregister(struct clocksource *cs) | ||
344 | { | ||
345 | unsigned long flags; | ||
346 | |||
347 | spin_lock_irqsave(&clocksource_lock, flags); | ||
348 | list_del(&cs->list); | ||
349 | if (clocksource_override == cs) | ||
350 | clocksource_override = NULL; | ||
351 | next_clocksource = select_clocksource(); | ||
352 | spin_unlock_irqrestore(&clocksource_lock, flags); | ||
353 | } | ||
354 | |||
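clocksource_unregister() is new here; until now a registered clocksource could only be demoted with clocksource_change_rating(), never removed. A sketch of the register/unregister pairing from a driver's point of view (the "mydev" names, MYDEV_HZ and the counter-reading helper are illustrative assumptions, not from the patch):

    /* Assumed hardware access, stands in for reading a free-running counter. */
    static cycle_t mydev_read(void)
    {
        return (cycle_t)mydev_read_counter();   /* assumed helper */
    }

    static struct clocksource clocksource_mydev = {
        .name   = "mydev",
        .rating = 200,
        .read   = mydev_read,
        .mask   = CLOCKSOURCE_MASK(32),
        .shift  = 20,
        .flags  = CLOCK_SOURCE_IS_CONTINUOUS,
    };

    static int __init mydev_init(void)
    {
        clocksource_mydev.mult =
            clocksource_hz2mult(MYDEV_HZ, clocksource_mydev.shift);
        return clocksource_register(&clocksource_mydev);
    }

    static void __exit mydev_exit(void)
    {
        /* New with this patch: drop the clocksource entirely on unload. */
        clocksource_unregister(&clocksource_mydev);
    }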
334 | #ifdef CONFIG_SYSFS | 355 | #ifdef CONFIG_SYSFS |
335 | /** | 356 | /** |
336 | * sysfs_show_current_clocksources - sysfs interface for current clocksource | 357 | * sysfs_show_current_clocksources - sysfs interface for current clocksource |
@@ -441,7 +462,7 @@ static SYSDEV_ATTR(available_clocksource, 0600, | |||
441 | sysfs_show_available_clocksources, NULL); | 462 | sysfs_show_available_clocksources, NULL); |
442 | 463 | ||
443 | static struct sysdev_class clocksource_sysclass = { | 464 | static struct sysdev_class clocksource_sysclass = { |
444 | set_kset_name("clocksource"), | 465 | .name = "clocksource", |
445 | }; | 466 | }; |
446 | 467 | ||
447 | static struct sys_device device_clocksource = { | 468 | static struct sys_device device_clocksource = { |
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index de6a2d6b3ebb..e64efaf957e8 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c | |||
@@ -205,7 +205,7 @@ static void sync_cmos_clock(unsigned long dummy) | |||
205 | return; | 205 | return; |
206 | 206 | ||
207 | getnstimeofday(&now); | 207 | getnstimeofday(&now); |
208 | if (abs(xtime.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec / 2) | 208 | if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec / 2) |
209 | fail = update_persistent_clock(now); | 209 | fail = update_persistent_clock(now); |
210 | 210 | ||
211 | next.tv_nsec = (NSEC_PER_SEC / 2) - now.tv_nsec; | 211 | next.tv_nsec = (NSEC_PER_SEC / 2) - now.tv_nsec; |
@@ -249,10 +249,12 @@ int do_adjtimex(struct timex *txc) | |||
249 | 249 | ||
250 | /* Now we validate the data before disabling interrupts */ | 250 | /* Now we validate the data before disabling interrupts */ |
251 | 251 | ||
252 | if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT) | 252 | if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT) { |
253 | /* singleshot must not be used with any other mode bits */ | 253 | /* singleshot must not be used with any other mode bits */ |
254 | if (txc->modes != ADJ_OFFSET_SINGLESHOT) | 254 | if (txc->modes != ADJ_OFFSET_SINGLESHOT && |
255 | txc->modes != ADJ_OFFSET_SS_READ) | ||
255 | return -EINVAL; | 256 | return -EINVAL; |
257 | } | ||
256 | 258 | ||
257 | if (txc->modes != ADJ_OFFSET_SINGLESHOT && (txc->modes & ADJ_OFFSET)) | 259 | if (txc->modes != ADJ_OFFSET_SINGLESHOT && (txc->modes & ADJ_OFFSET)) |
258 | /* adjustment Offset limited to +- .512 seconds */ | 260 | /* adjustment Offset limited to +- .512 seconds */ |
@@ -372,7 +374,8 @@ int do_adjtimex(struct timex *txc) | |||
372 | leave: if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0) | 374 | leave: if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0) |
373 | result = TIME_ERROR; | 375 | result = TIME_ERROR; |
374 | 376 | ||
375 | if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT) | 377 | if ((txc->modes == ADJ_OFFSET_SINGLESHOT) || |
378 | (txc->modes == ADJ_OFFSET_SS_READ)) | ||
376 | txc->offset = save_adjust; | 379 | txc->offset = save_adjust; |
377 | else | 380 | else |
378 | txc->offset = ((long)shift_right(time_offset, SHIFT_UPDATE)) * | 381 | txc->offset = ((long)shift_right(time_offset, SHIFT_UPDATE)) * |
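The validation change above accepts ADJ_OFFSET_SS_READ as a stand-alone mode, so a caller can read back the remainder of a pending single-shot adjustment without injecting a new offset. A user-space sketch, assuming a libc and kernel headers that expose adjtimex(2) and the ADJ_OFFSET_SS_READ constant:

    #include <stdio.h>
    #include <sys/timex.h>

    int main(void)
    {
        struct timex txc = { .modes = ADJ_OFFSET_SS_READ };

        /* Read-only query: txc.offset reports what is left of a previous
         * ADJ_OFFSET_SINGLESHOT adjustment, nothing new is applied. */
        if (adjtimex(&txc) < 0) {
            perror("adjtimex");
            return 1;
        }
        printf("pending singleshot offset: %ld\n", txc.offset);
        return 0;
    }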
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 8cfb8b2ce773..e1bd50cbbf5d 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c | |||
@@ -126,9 +126,9 @@ int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu) | |||
126 | /* | 126 | /* |
127 | * Broadcast the event to the cpus, which are set in the mask | 127 | * Broadcast the event to the cpus, which are set in the mask |
128 | */ | 128 | */ |
129 | int tick_do_broadcast(cpumask_t mask) | 129 | static void tick_do_broadcast(cpumask_t mask) |
130 | { | 130 | { |
131 | int ret = 0, cpu = smp_processor_id(); | 131 | int cpu = smp_processor_id(); |
132 | struct tick_device *td; | 132 | struct tick_device *td; |
133 | 133 | ||
134 | /* | 134 | /* |
@@ -138,7 +138,6 @@ int tick_do_broadcast(cpumask_t mask) | |||
138 | cpu_clear(cpu, mask); | 138 | cpu_clear(cpu, mask); |
139 | td = &per_cpu(tick_cpu_device, cpu); | 139 | td = &per_cpu(tick_cpu_device, cpu); |
140 | td->evtdev->event_handler(td->evtdev); | 140 | td->evtdev->event_handler(td->evtdev); |
141 | ret = 1; | ||
142 | } | 141 | } |
143 | 142 | ||
144 | if (!cpus_empty(mask)) { | 143 | if (!cpus_empty(mask)) { |
@@ -151,9 +150,7 @@ int tick_do_broadcast(cpumask_t mask) | |||
151 | cpu = first_cpu(mask); | 150 | cpu = first_cpu(mask); |
152 | td = &per_cpu(tick_cpu_device, cpu); | 151 | td = &per_cpu(tick_cpu_device, cpu); |
153 | td->evtdev->broadcast(mask); | 152 | td->evtdev->broadcast(mask); |
154 | ret = 1; | ||
155 | } | 153 | } |
156 | return ret; | ||
157 | } | 154 | } |
158 | 155 | ||
159 | /* | 156 | /* |
@@ -384,45 +381,19 @@ int tick_resume_broadcast_oneshot(struct clock_event_device *bc) | |||
384 | } | 381 | } |
385 | 382 | ||
386 | /* | 383 | /* |
387 | * Reprogram the broadcast device: | ||
388 | * | ||
389 | * Called with tick_broadcast_lock held and interrupts disabled. | ||
390 | */ | ||
391 | static int tick_broadcast_reprogram(void) | ||
392 | { | ||
393 | ktime_t expires = { .tv64 = KTIME_MAX }; | ||
394 | struct tick_device *td; | ||
395 | int cpu; | ||
396 | |||
397 | /* | ||
398 | * Find the event which expires next: | ||
399 | */ | ||
400 | for (cpu = first_cpu(tick_broadcast_oneshot_mask); cpu != NR_CPUS; | ||
401 | cpu = next_cpu(cpu, tick_broadcast_oneshot_mask)) { | ||
402 | td = &per_cpu(tick_cpu_device, cpu); | ||
403 | if (td->evtdev->next_event.tv64 < expires.tv64) | ||
404 | expires = td->evtdev->next_event; | ||
405 | } | ||
406 | |||
407 | if (expires.tv64 == KTIME_MAX) | ||
408 | return 0; | ||
409 | |||
410 | return tick_broadcast_set_event(expires, 0); | ||
411 | } | ||
412 | |||
413 | /* | ||
414 | * Handle oneshot mode broadcasting | 384 | * Handle oneshot mode broadcasting |
415 | */ | 385 | */ |
416 | static void tick_handle_oneshot_broadcast(struct clock_event_device *dev) | 386 | static void tick_handle_oneshot_broadcast(struct clock_event_device *dev) |
417 | { | 387 | { |
418 | struct tick_device *td; | 388 | struct tick_device *td; |
419 | cpumask_t mask; | 389 | cpumask_t mask; |
420 | ktime_t now; | 390 | ktime_t now, next_event; |
421 | int cpu; | 391 | int cpu; |
422 | 392 | ||
423 | spin_lock(&tick_broadcast_lock); | 393 | spin_lock(&tick_broadcast_lock); |
424 | again: | 394 | again: |
425 | dev->next_event.tv64 = KTIME_MAX; | 395 | dev->next_event.tv64 = KTIME_MAX; |
396 | next_event.tv64 = KTIME_MAX; | ||
426 | mask = CPU_MASK_NONE; | 397 | mask = CPU_MASK_NONE; |
427 | now = ktime_get(); | 398 | now = ktime_get(); |
428 | /* Find all expired events */ | 399 | /* Find all expired events */ |
@@ -431,19 +402,31 @@ again: | |||
431 | td = &per_cpu(tick_cpu_device, cpu); | 402 | td = &per_cpu(tick_cpu_device, cpu); |
432 | if (td->evtdev->next_event.tv64 <= now.tv64) | 403 | if (td->evtdev->next_event.tv64 <= now.tv64) |
433 | cpu_set(cpu, mask); | 404 | cpu_set(cpu, mask); |
405 | else if (td->evtdev->next_event.tv64 < next_event.tv64) | ||
406 | next_event.tv64 = td->evtdev->next_event.tv64; | ||
434 | } | 407 | } |
435 | 408 | ||
436 | /* | 409 | /* |
437 | * Wakeup the cpus which have an expired event. The broadcast | 410 | * Wakeup the cpus which have an expired event. |
438 | * device is reprogrammed in the return from idle code. | 411 | */ |
412 | tick_do_broadcast(mask); | ||
413 | |||
414 | /* | ||
415 | * Two reasons for reprogram: | ||
416 | * | ||
417 | * - The global event did not expire any CPU local | ||
418 | * events. This happens in dyntick mode, as the maximum PIT | ||
419 | * delta is quite small. | ||
420 | * | ||
421 | * - There are pending events on sleeping CPUs which were not | ||
422 | * in the event mask | ||
439 | */ | 423 | */ |
440 | if (!tick_do_broadcast(mask)) { | 424 | if (next_event.tv64 != KTIME_MAX) { |
441 | /* | 425 | /* |
442 | * The global event did not expire any CPU local | 426 | * Rearm the broadcast device. If event expired, |
443 | * events. This happens in dyntick mode, as the | 427 | * repeat the above |
444 | * maximum PIT delta is quite small. | ||
445 | */ | 428 | */ |
446 | if (tick_broadcast_reprogram()) | 429 | if (tick_broadcast_set_event(next_event, 0)) |
447 | goto again; | 430 | goto again; |
448 | } | 431 | } |
449 | spin_unlock(&tick_broadcast_lock); | 432 | spin_unlock(&tick_broadcast_lock); |
@@ -508,7 +491,7 @@ static void tick_broadcast_clear_oneshot(int cpu) | |||
508 | } | 491 | } |
509 | 492 | ||
510 | /** | 493 | /** |
511 | * tick_broadcast_setup_highres - setup the broadcast device for highres | 494 | * tick_broadcast_setup_oneshot - setup the broadcast device |
512 | */ | 495 | */ |
513 | void tick_broadcast_setup_oneshot(struct clock_event_device *bc) | 496 | void tick_broadcast_setup_oneshot(struct clock_event_device *bc) |
514 | { | 497 | { |
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index bb13f2724905..f13f2b7f4fd4 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h | |||
@@ -70,8 +70,6 @@ static inline int tick_resume_broadcast_oneshot(struct clock_event_device *bc) | |||
70 | * Broadcasting support | 70 | * Broadcasting support |
71 | */ | 71 | */ |
72 | #ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST | 72 | #ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST |
73 | extern int tick_do_broadcast(cpumask_t mask); | ||
74 | |||
75 | extern int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu); | 73 | extern int tick_device_uses_broadcast(struct clock_event_device *dev, int cpu); |
76 | extern int tick_check_broadcast_device(struct clock_event_device *dev); | 74 | extern int tick_check_broadcast_device(struct clock_event_device *dev); |
77 | extern int tick_is_broadcast_device(struct clock_event_device *dev); | 75 | extern int tick_is_broadcast_device(struct clock_event_device *dev); |
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 10a1347597fd..88267f0a8471 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c | |||
@@ -9,7 +9,7 @@ | |||
9 | * | 9 | * |
10 | * Started by: Thomas Gleixner and Ingo Molnar | 10 | * Started by: Thomas Gleixner and Ingo Molnar |
11 | * | 11 | * |
12 | * For licencing details see kernel-base/COPYING | 12 | * Distribute under GPLv2. |
13 | */ | 13 | */ |
14 | #include <linux/cpu.h> | 14 | #include <linux/cpu.h> |
15 | #include <linux/err.h> | 15 | #include <linux/err.h> |
@@ -133,14 +133,55 @@ void tick_nohz_update_jiffies(void) | |||
133 | if (!ts->tick_stopped) | 133 | if (!ts->tick_stopped) |
134 | return; | 134 | return; |
135 | 135 | ||
136 | touch_softlockup_watchdog(); | ||
137 | |||
136 | cpu_clear(cpu, nohz_cpu_mask); | 138 | cpu_clear(cpu, nohz_cpu_mask); |
137 | now = ktime_get(); | 139 | now = ktime_get(); |
140 | ts->idle_waketime = now; | ||
138 | 141 | ||
139 | local_irq_save(flags); | 142 | local_irq_save(flags); |
140 | tick_do_update_jiffies64(now); | 143 | tick_do_update_jiffies64(now); |
141 | local_irq_restore(flags); | 144 | local_irq_restore(flags); |
142 | } | 145 | } |
143 | 146 | ||
147 | void tick_nohz_stop_idle(int cpu) | ||
148 | { | ||
149 | struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); | ||
150 | |||
151 | if (ts->idle_active) { | ||
152 | ktime_t now, delta; | ||
153 | now = ktime_get(); | ||
154 | delta = ktime_sub(now, ts->idle_entrytime); | ||
155 | ts->idle_lastupdate = now; | ||
156 | ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta); | ||
157 | ts->idle_active = 0; | ||
158 | } | ||
159 | } | ||
160 | |||
161 | static ktime_t tick_nohz_start_idle(int cpu) | ||
162 | { | ||
163 | struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); | ||
164 | ktime_t now, delta; | ||
165 | |||
166 | now = ktime_get(); | ||
167 | if (ts->idle_active) { | ||
168 | delta = ktime_sub(now, ts->idle_entrytime); | ||
169 | ts->idle_lastupdate = now; | ||
170 | ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta); | ||
171 | } | ||
172 | ts->idle_entrytime = now; | ||
173 | ts->idle_active = 1; | ||
174 | return now; | ||
175 | } | ||
176 | |||
177 | u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time) | ||
178 | { | ||
179 | struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); | ||
180 | |||
181 | *last_update_time = ktime_to_us(ts->idle_lastupdate); | ||
182 | return ktime_to_us(ts->idle_sleeptime); | ||
183 | } | ||
184 | |||
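get_cpu_idle_time_us() gives other kernel code, a cpufreq governor for instance, the accumulated per-CPU idle time in microseconds together with the timestamp of its last update. A sketch of how a consumer might turn two samples into an idle percentage (hypothetical caller, not part of this patch; assumes the sampling window fits in 32 bits of microseconds):

    static unsigned int sample_idle_pct(int cpu, u64 *prev_idle_us, u64 *prev_wall_us)
    {
        u64 wall_us, idle_us, d_wall, d_idle;

        idle_us = get_cpu_idle_time_us(cpu, &wall_us);

        d_wall = wall_us - *prev_wall_us;
        d_idle = idle_us - *prev_idle_us;

        *prev_wall_us = wall_us;
        *prev_idle_us = idle_us;

        if (!d_wall)
            return 0;

        d_idle *= 100;
        do_div(d_idle, (u32)d_wall);    /* do_div() modifies d_idle in place */
        return (unsigned int)d_idle;
    }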
144 | /** | 185 | /** |
145 | * tick_nohz_stop_sched_tick - stop the idle tick from the idle task | 186 | * tick_nohz_stop_sched_tick - stop the idle tick from the idle task |
146 | * | 187 | * |
@@ -151,14 +192,16 @@ void tick_nohz_update_jiffies(void) | |||
151 | void tick_nohz_stop_sched_tick(void) | 192 | void tick_nohz_stop_sched_tick(void) |
152 | { | 193 | { |
153 | unsigned long seq, last_jiffies, next_jiffies, delta_jiffies, flags; | 194 | unsigned long seq, last_jiffies, next_jiffies, delta_jiffies, flags; |
195 | unsigned long rt_jiffies; | ||
154 | struct tick_sched *ts; | 196 | struct tick_sched *ts; |
155 | ktime_t last_update, expires, now, delta; | 197 | ktime_t last_update, expires, now; |
156 | struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; | 198 | struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; |
157 | int cpu; | 199 | int cpu; |
158 | 200 | ||
159 | local_irq_save(flags); | 201 | local_irq_save(flags); |
160 | 202 | ||
161 | cpu = smp_processor_id(); | 203 | cpu = smp_processor_id(); |
204 | now = tick_nohz_start_idle(cpu); | ||
162 | ts = &per_cpu(tick_cpu_sched, cpu); | 205 | ts = &per_cpu(tick_cpu_sched, cpu); |
163 | 206 | ||
164 | /* | 207 | /* |
@@ -190,19 +233,7 @@ void tick_nohz_stop_sched_tick(void) | |||
190 | } | 233 | } |
191 | } | 234 | } |
192 | 235 | ||
193 | now = ktime_get(); | ||
194 | /* | ||
195 | * When called from irq_exit we need to account the idle sleep time | ||
196 | * correctly. | ||
197 | */ | ||
198 | if (ts->tick_stopped) { | ||
199 | delta = ktime_sub(now, ts->idle_entrytime); | ||
200 | ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta); | ||
201 | } | ||
202 | |||
203 | ts->idle_entrytime = now; | ||
204 | ts->idle_calls++; | 236 | ts->idle_calls++; |
205 | |||
206 | /* Read jiffies and the time when jiffies were updated last */ | 237 | /* Read jiffies and the time when jiffies were updated last */ |
207 | do { | 238 | do { |
208 | seq = read_seqbegin(&xtime_lock); | 239 | seq = read_seqbegin(&xtime_lock); |
@@ -214,6 +245,10 @@ void tick_nohz_stop_sched_tick(void) | |||
214 | next_jiffies = get_next_timer_interrupt(last_jiffies); | 245 | next_jiffies = get_next_timer_interrupt(last_jiffies); |
215 | delta_jiffies = next_jiffies - last_jiffies; | 246 | delta_jiffies = next_jiffies - last_jiffies; |
216 | 247 | ||
248 | rt_jiffies = rt_needs_cpu(cpu); | ||
249 | if (rt_jiffies && rt_jiffies < delta_jiffies) | ||
250 | delta_jiffies = rt_jiffies; | ||
251 | |||
217 | if (rcu_needs_cpu(cpu)) | 252 | if (rcu_needs_cpu(cpu)) |
218 | delta_jiffies = 1; | 253 | delta_jiffies = 1; |
219 | /* | 254 | /* |
@@ -289,7 +324,7 @@ void tick_nohz_stop_sched_tick(void) | |||
289 | /* Check, if the timer was already in the past */ | 324 | /* Check, if the timer was already in the past */ |
290 | if (hrtimer_active(&ts->sched_timer)) | 325 | if (hrtimer_active(&ts->sched_timer)) |
291 | goto out; | 326 | goto out; |
292 | } else if(!tick_program_event(expires, 0)) | 327 | } else if (!tick_program_event(expires, 0)) |
293 | goto out; | 328 | goto out; |
294 | /* | 329 | /* |
295 | * We are past the event already. So we crossed a | 330 | * We are past the event already. So we crossed a |
@@ -320,10 +355,8 @@ ktime_t tick_nohz_get_sleep_length(void) | |||
320 | return ts->sleep_length; | 355 | return ts->sleep_length; |
321 | } | 356 | } |
322 | 357 | ||
323 | EXPORT_SYMBOL_GPL(tick_nohz_get_sleep_length); | ||
324 | |||
325 | /** | 358 | /** |
326 | * nohz_restart_sched_tick - restart the idle tick from the idle task | 359 | * tick_nohz_restart_sched_tick - restart the idle tick from the idle task |
327 | * | 360 | * |
328 | * Restart the idle tick when the CPU is woken up from idle | 361 | * Restart the idle tick when the CPU is woken up from idle |
329 | */ | 362 | */ |
@@ -332,23 +365,22 @@ void tick_nohz_restart_sched_tick(void) | |||
332 | int cpu = smp_processor_id(); | 365 | int cpu = smp_processor_id(); |
333 | struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); | 366 | struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); |
334 | unsigned long ticks; | 367 | unsigned long ticks; |
335 | ktime_t now, delta; | 368 | ktime_t now; |
336 | 369 | ||
337 | if (!ts->tick_stopped) | 370 | local_irq_disable(); |
371 | tick_nohz_stop_idle(cpu); | ||
372 | |||
373 | if (!ts->tick_stopped) { | ||
374 | local_irq_enable(); | ||
338 | return; | 375 | return; |
376 | } | ||
339 | 377 | ||
340 | /* Update jiffies first */ | 378 | /* Update jiffies first */ |
341 | now = ktime_get(); | ||
342 | |||
343 | local_irq_disable(); | ||
344 | select_nohz_load_balancer(0); | 379 | select_nohz_load_balancer(0); |
380 | now = ktime_get(); | ||
345 | tick_do_update_jiffies64(now); | 381 | tick_do_update_jiffies64(now); |
346 | cpu_clear(cpu, nohz_cpu_mask); | 382 | cpu_clear(cpu, nohz_cpu_mask); |
347 | 383 | ||
348 | /* Account the idle time */ | ||
349 | delta = ktime_sub(now, ts->idle_entrytime); | ||
350 | ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta); | ||
351 | |||
352 | /* | 384 | /* |
353 | * We stopped the tick in idle. Update process times would miss the | 385 | * We stopped the tick in idle. Update process times would miss the |
354 | * time we slept as update_process_times does only a 1 tick | 386 | * time we slept as update_process_times does only a 1 tick |
@@ -369,6 +401,7 @@ void tick_nohz_restart_sched_tick(void) | |||
369 | * Cancel the scheduled timer and restore the tick | 401 | * Cancel the scheduled timer and restore the tick |
370 | */ | 402 | */ |
371 | ts->tick_stopped = 0; | 403 | ts->tick_stopped = 0; |
404 | ts->idle_exittime = now; | ||
372 | hrtimer_cancel(&ts->sched_timer); | 405 | hrtimer_cancel(&ts->sched_timer); |
373 | ts->sched_timer.expires = ts->idle_tick; | 406 | ts->sched_timer.expires = ts->idle_tick; |
374 | 407 | ||
@@ -502,14 +535,13 @@ static inline void tick_nohz_switch_to_nohz(void) { } | |||
502 | */ | 535 | */ |
503 | #ifdef CONFIG_HIGH_RES_TIMERS | 536 | #ifdef CONFIG_HIGH_RES_TIMERS |
504 | /* | 537 | /* |
505 | * We rearm the timer until we get disabled by the idle code | 538 | * We rearm the timer until we get disabled by the idle code. |
506 | * Called with interrupts disabled and timer->base->cpu_base->lock held. | 539 | * Called with interrupts disabled and timer->base->cpu_base->lock held. |
507 | */ | 540 | */ |
508 | static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) | 541 | static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) |
509 | { | 542 | { |
510 | struct tick_sched *ts = | 543 | struct tick_sched *ts = |
511 | container_of(timer, struct tick_sched, sched_timer); | 544 | container_of(timer, struct tick_sched, sched_timer); |
512 | struct hrtimer_cpu_base *base = timer->base->cpu_base; | ||
513 | struct pt_regs *regs = get_irq_regs(); | 545 | struct pt_regs *regs = get_irq_regs(); |
514 | ktime_t now = ktime_get(); | 546 | ktime_t now = ktime_get(); |
515 | int cpu = smp_processor_id(); | 547 | int cpu = smp_processor_id(); |
@@ -547,15 +579,8 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) | |||
547 | touch_softlockup_watchdog(); | 579 | touch_softlockup_watchdog(); |
548 | ts->idle_jiffies++; | 580 | ts->idle_jiffies++; |
549 | } | 581 | } |
550 | /* | ||
551 | * update_process_times() might take tasklist_lock, hence | ||
552 | * drop the base lock. sched-tick hrtimers are per-CPU and | ||
553 | * never accessible by userspace APIs, so this is safe to do. | ||
554 | */ | ||
555 | spin_unlock(&base->lock); | ||
556 | update_process_times(user_mode(regs)); | 582 | update_process_times(user_mode(regs)); |
557 | profile_tick(CPU_PROFILING); | 583 | profile_tick(CPU_PROFILING); |
558 | spin_lock(&base->lock); | ||
559 | } | 584 | } |
560 | 585 | ||
561 | /* Do not restart, when we are in the idle loop */ | 586 | /* Do not restart, when we are in the idle loop */ |
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index e5e466b27598..cd5dbc4579c9 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c | |||
@@ -47,7 +47,7 @@ struct timespec wall_to_monotonic __attribute__ ((aligned (16))); | |||
47 | static unsigned long total_sleep_time; /* seconds */ | 47 | static unsigned long total_sleep_time; /* seconds */ |
48 | 48 | ||
49 | static struct timespec xtime_cache __attribute__ ((aligned (16))); | 49 | static struct timespec xtime_cache __attribute__ ((aligned (16))); |
50 | static inline void update_xtime_cache(u64 nsec) | 50 | void update_xtime_cache(u64 nsec) |
51 | { | 51 | { |
52 | xtime_cache = xtime; | 52 | xtime_cache = xtime; |
53 | timespec_add_ns(&xtime_cache, nsec); | 53 | timespec_add_ns(&xtime_cache, nsec); |
@@ -82,13 +82,12 @@ static inline s64 __get_nsec_offset(void) | |||
82 | } | 82 | } |
83 | 83 | ||
84 | /** | 84 | /** |
85 | * __get_realtime_clock_ts - Returns the time of day in a timespec | 85 | * getnstimeofday - Returns the time of day in a timespec |
86 | * @ts: pointer to the timespec to be set | 86 | * @ts: pointer to the timespec to be set |
87 | * | 87 | * |
88 | * Returns the time of day in a timespec. Used by | 88 | * Returns the time of day in a timespec. |
89 | * do_gettimeofday() and get_realtime_clock_ts(). | ||
90 | */ | 89 | */ |
91 | static inline void __get_realtime_clock_ts(struct timespec *ts) | 90 | void getnstimeofday(struct timespec *ts) |
92 | { | 91 | { |
93 | unsigned long seq; | 92 | unsigned long seq; |
94 | s64 nsecs; | 93 | s64 nsecs; |
@@ -104,30 +103,19 @@ static inline void __get_realtime_clock_ts(struct timespec *ts) | |||
104 | timespec_add_ns(ts, nsecs); | 103 | timespec_add_ns(ts, nsecs); |
105 | } | 104 | } |
106 | 105 | ||
107 | /** | ||
108 | * getnstimeofday - Returns the time of day in a timespec | ||
109 | * @ts: pointer to the timespec to be set | ||
110 | * | ||
111 | * Returns the time of day in a timespec. | ||
112 | */ | ||
113 | void getnstimeofday(struct timespec *ts) | ||
114 | { | ||
115 | __get_realtime_clock_ts(ts); | ||
116 | } | ||
117 | |||
118 | EXPORT_SYMBOL(getnstimeofday); | 106 | EXPORT_SYMBOL(getnstimeofday); |
119 | 107 | ||
120 | /** | 108 | /** |
121 | * do_gettimeofday - Returns the time of day in a timeval | 109 | * do_gettimeofday - Returns the time of day in a timeval |
122 | * @tv: pointer to the timeval to be set | 110 | * @tv: pointer to the timeval to be set |
123 | * | 111 | * |
124 | * NOTE: Users should be converted to using get_realtime_clock_ts() | 112 | * NOTE: Users should be converted to using getnstimeofday() |
125 | */ | 113 | */ |
126 | void do_gettimeofday(struct timeval *tv) | 114 | void do_gettimeofday(struct timeval *tv) |
127 | { | 115 | { |
128 | struct timespec now; | 116 | struct timespec now; |
129 | 117 | ||
130 | __get_realtime_clock_ts(&now); | 118 | getnstimeofday(&now); |
131 | tv->tv_sec = now.tv_sec; | 119 | tv->tv_sec = now.tv_sec; |
132 | tv->tv_usec = now.tv_nsec/1000; | 120 | tv->tv_usec = now.tv_nsec/1000; |
133 | } | 121 | } |
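With __get_realtime_clock_ts() folded away, getnstimeofday() is the single nanosecond-resolution entry point and do_gettimeofday() is reduced to a microsecond wrapper around it, which is what the updated NOTE now points callers at. An illustrative in-kernel caller (not part of the patch):

    /* Measure elapsed wall-clock time around some work, at ns resolution. */
    static s64 timed_work(void (*work)(void))
    {
        struct timespec before, after;

        getnstimeofday(&before);    /* preferred over do_gettimeofday() */
        work();
        getnstimeofday(&after);

        return timespec_to_ns(&after) - timespec_to_ns(&before);
    }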
@@ -157,6 +145,7 @@ int do_settimeofday(struct timespec *tv) | |||
157 | 145 | ||
158 | set_normalized_timespec(&xtime, sec, nsec); | 146 | set_normalized_timespec(&xtime, sec, nsec); |
159 | set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec); | 147 | set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec); |
148 | update_xtime_cache(0); | ||
160 | 149 | ||
161 | clock->error = 0; | 150 | clock->error = 0; |
162 | ntp_clear(); | 151 | ntp_clear(); |
@@ -198,7 +187,8 @@ static void change_clocksource(void) | |||
198 | 187 | ||
199 | clock->error = 0; | 188 | clock->error = 0; |
200 | clock->xtime_nsec = 0; | 189 | clock->xtime_nsec = 0; |
201 | clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH); | 190 | clocksource_calculate_interval(clock, |
191 | (unsigned long)(current_tick_length()>>TICK_LENGTH_SHIFT)); | ||
202 | 192 | ||
203 | tick_clock_notify(); | 193 | tick_clock_notify(); |
204 | 194 | ||
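Both call sites now feed clocksource_calculate_interval() the live, NTP-adjusted tick length instead of the compile-time NTP_INTERVAL_LENGTH constant. The quantity being passed, spelled out as a helper (sketch; the helper itself does not exist in the patch, and the units follow from current_tick_length() returning nanoseconds scaled up by TICK_LENGTH_SHIFT):

    /* Current NTP-adjusted tick length, scaled back down to plain nanoseconds. */
    static unsigned long ntp_tick_length_ns(void)
    {
        return (unsigned long)(current_tick_length() >> TICK_LENGTH_SHIFT);
    }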
@@ -255,15 +245,16 @@ void __init timekeeping_init(void) | |||
255 | ntp_clear(); | 245 | ntp_clear(); |
256 | 246 | ||
257 | clock = clocksource_get_next(); | 247 | clock = clocksource_get_next(); |
258 | clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH); | 248 | clocksource_calculate_interval(clock, |
249 | (unsigned long)(current_tick_length()>>TICK_LENGTH_SHIFT)); | ||
259 | clock->cycle_last = clocksource_read(clock); | 250 | clock->cycle_last = clocksource_read(clock); |
260 | 251 | ||
261 | xtime.tv_sec = sec; | 252 | xtime.tv_sec = sec; |
262 | xtime.tv_nsec = 0; | 253 | xtime.tv_nsec = 0; |
263 | set_normalized_timespec(&wall_to_monotonic, | 254 | set_normalized_timespec(&wall_to_monotonic, |
264 | -xtime.tv_sec, -xtime.tv_nsec); | 255 | -xtime.tv_sec, -xtime.tv_nsec); |
256 | update_xtime_cache(0); | ||
265 | total_sleep_time = 0; | 257 | total_sleep_time = 0; |
266 | |||
267 | write_sequnlock_irqrestore(&xtime_lock, flags); | 258 | write_sequnlock_irqrestore(&xtime_lock, flags); |
268 | } | 259 | } |
269 | 260 | ||
@@ -300,6 +291,7 @@ static int timekeeping_resume(struct sys_device *dev) | |||
300 | } | 291 | } |
301 | /* Make sure that we have the correct xtime reference */ | 292 | /* Make sure that we have the correct xtime reference */ |
302 | timespec_add_ns(&xtime, timekeeping_suspend_nsecs); | 293 | timespec_add_ns(&xtime, timekeeping_suspend_nsecs); |
294 | update_xtime_cache(0); | ||
303 | /* re-base the last cycle value */ | 295 | /* re-base the last cycle value */ |
304 | clock->cycle_last = clocksource_read(clock); | 296 | clock->cycle_last = clocksource_read(clock); |
305 | clock->error = 0; | 297 | clock->error = 0; |
@@ -335,9 +327,9 @@ static int timekeeping_suspend(struct sys_device *dev, pm_message_t state) | |||
335 | 327 | ||
336 | /* sysfs resume/suspend bits for timekeeping */ | 328 | /* sysfs resume/suspend bits for timekeeping */ |
337 | static struct sysdev_class timekeeping_sysclass = { | 329 | static struct sysdev_class timekeeping_sysclass = { |
330 | .name = "timekeeping", | ||
338 | .resume = timekeeping_resume, | 331 | .resume = timekeeping_resume, |
339 | .suspend = timekeeping_suspend, | 332 | .suspend = timekeeping_suspend, |
340 | set_kset_name("timekeeping"), | ||
341 | }; | 333 | }; |
342 | 334 | ||
343 | static struct sys_device device_timer = { | 335 | static struct sys_device device_timer = { |
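The timekeeping.c hunks above fold __get_realtime_clock_ts() into getnstimeofday() and turn do_gettimeofday() into a thin wrapper around it. For orientation, here is a rough sketch of the seqlock read loop that body relies on, matching the `seq`/`nsecs` locals and the timespec_add_ns() call visible in the hunk. It is a simplification, and __get_nsec_offset() is assumed here as the clocksource-delta helper; that helper is not shown in this diff.

	/* sketch of the getnstimeofday() body, not a verbatim copy */
	void getnstimeofday_sketch(struct timespec *ts)
	{
		unsigned long seq;
		s64 nsecs;

		do {
			seq = read_seqbegin(&xtime_lock);	/* retry if a writer ran meanwhile */
			*ts = xtime;				/* snapshot the wall-clock base */
			nsecs = __get_nsec_offset();		/* assumed helper: ns accumulated since last update */
		} while (read_seqretry(&xtime_lock, seq));

		timespec_add_ns(ts, nsecs);
	}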
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index fdb2e03d4fe0..d3d94c1a0fd2 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c | |||
@@ -129,7 +129,8 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now) | |||
129 | struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu); | 129 | struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu); |
130 | int i; | 130 | int i; |
131 | 131 | ||
132 | SEQ_printf(m, "\ncpu: %d\n", cpu); | 132 | SEQ_printf(m, "\n"); |
133 | SEQ_printf(m, "cpu: %d\n", cpu); | ||
133 | for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { | 134 | for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { |
134 | SEQ_printf(m, " clock %d:\n", i); | 135 | SEQ_printf(m, " clock %d:\n", i); |
135 | print_base(m, cpu_base->clock_base + i, now); | 136 | print_base(m, cpu_base->clock_base + i, now); |
@@ -165,6 +166,8 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now) | |||
165 | P(idle_calls); | 166 | P(idle_calls); |
166 | P(idle_sleeps); | 167 | P(idle_sleeps); |
167 | P_ns(idle_entrytime); | 168 | P_ns(idle_entrytime); |
169 | P_ns(idle_waketime); | ||
170 | P_ns(idle_exittime); | ||
168 | P_ns(idle_sleeptime); | 171 | P_ns(idle_sleeptime); |
169 | P(last_jiffies); | 172 | P(last_jiffies); |
170 | P(next_jiffies); | 173 | P(next_jiffies); |
@@ -184,7 +187,8 @@ print_tickdevice(struct seq_file *m, struct tick_device *td) | |||
184 | { | 187 | { |
185 | struct clock_event_device *dev = td->evtdev; | 188 | struct clock_event_device *dev = td->evtdev; |
186 | 189 | ||
187 | SEQ_printf(m, "\nTick Device: mode: %d\n", td->mode); | 190 | SEQ_printf(m, "\n"); |
191 | SEQ_printf(m, "Tick Device: mode: %d\n", td->mode); | ||
188 | 192 | ||
189 | SEQ_printf(m, "Clock Event Device: "); | 193 | SEQ_printf(m, "Clock Event Device: "); |
190 | if (!dev) { | 194 | if (!dev) { |
diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c index c36bb7ed0301..417da8c5bc72 100644 --- a/kernel/time/timer_stats.c +++ b/kernel/time/timer_stats.c | |||
@@ -26,7 +26,7 @@ | |||
26 | * the pid and cmdline from the owner process if applicable. | 26 | * the pid and cmdline from the owner process if applicable. |
27 | * | 27 | * |
28 | * Start/stop data collection: | 28 | * Start/stop data collection: |
29 | * # echo 1[0] >/proc/timer_stats | 29 | * # echo [1|0] >/proc/timer_stats |
30 | * | 30 | * |
31 | * Display the information collected so far: | 31 | * Display the information collected so far: |
32 | * # cat /proc/timer_stats | 32 | * # cat /proc/timer_stats |
diff --git a/kernel/timer.c b/kernel/timer.c index fb4e67d5dd60..9fbb472b8cf0 100644 --- a/kernel/timer.c +++ b/kernel/timer.c | |||
@@ -58,59 +58,57 @@ EXPORT_SYMBOL(jiffies_64); | |||
58 | #define TVN_MASK (TVN_SIZE - 1) | 58 | #define TVN_MASK (TVN_SIZE - 1) |
59 | #define TVR_MASK (TVR_SIZE - 1) | 59 | #define TVR_MASK (TVR_SIZE - 1) |
60 | 60 | ||
61 | typedef struct tvec_s { | 61 | struct tvec { |
62 | struct list_head vec[TVN_SIZE]; | 62 | struct list_head vec[TVN_SIZE]; |
63 | } tvec_t; | 63 | }; |
64 | 64 | ||
65 | typedef struct tvec_root_s { | 65 | struct tvec_root { |
66 | struct list_head vec[TVR_SIZE]; | 66 | struct list_head vec[TVR_SIZE]; |
67 | } tvec_root_t; | 67 | }; |
68 | 68 | ||
69 | struct tvec_t_base_s { | 69 | struct tvec_base { |
70 | spinlock_t lock; | 70 | spinlock_t lock; |
71 | struct timer_list *running_timer; | 71 | struct timer_list *running_timer; |
72 | unsigned long timer_jiffies; | 72 | unsigned long timer_jiffies; |
73 | tvec_root_t tv1; | 73 | struct tvec_root tv1; |
74 | tvec_t tv2; | 74 | struct tvec tv2; |
75 | tvec_t tv3; | 75 | struct tvec tv3; |
76 | tvec_t tv4; | 76 | struct tvec tv4; |
77 | tvec_t tv5; | 77 | struct tvec tv5; |
78 | } ____cacheline_aligned; | 78 | } ____cacheline_aligned; |
79 | 79 | ||
80 | typedef struct tvec_t_base_s tvec_base_t; | 80 | struct tvec_base boot_tvec_bases; |
81 | |||
82 | tvec_base_t boot_tvec_bases; | ||
83 | EXPORT_SYMBOL(boot_tvec_bases); | 81 | EXPORT_SYMBOL(boot_tvec_bases); |
84 | static DEFINE_PER_CPU(tvec_base_t *, tvec_bases) = &boot_tvec_bases; | 82 | static DEFINE_PER_CPU(struct tvec_base *, tvec_bases) = &boot_tvec_bases; |
85 | 83 | ||
86 | /* | 84 | /* |
87 | * Note that all tvec_bases is 2 byte aligned and lower bit of | 85 | * Note that all tvec_bases are 2 byte aligned and lower bit of |
88 | * base in timer_list is guaranteed to be zero. Use the LSB for | 86 | * base in timer_list is guaranteed to be zero. Use the LSB for |
89 | * the new flag to indicate whether the timer is deferrable | 87 | * the new flag to indicate whether the timer is deferrable |
90 | */ | 88 | */ |
91 | #define TBASE_DEFERRABLE_FLAG (0x1) | 89 | #define TBASE_DEFERRABLE_FLAG (0x1) |
92 | 90 | ||
93 | /* Functions below help us manage 'deferrable' flag */ | 91 | /* Functions below help us manage 'deferrable' flag */ |
94 | static inline unsigned int tbase_get_deferrable(tvec_base_t *base) | 92 | static inline unsigned int tbase_get_deferrable(struct tvec_base *base) |
95 | { | 93 | { |
96 | return ((unsigned int)(unsigned long)base & TBASE_DEFERRABLE_FLAG); | 94 | return ((unsigned int)(unsigned long)base & TBASE_DEFERRABLE_FLAG); |
97 | } | 95 | } |
98 | 96 | ||
99 | static inline tvec_base_t *tbase_get_base(tvec_base_t *base) | 97 | static inline struct tvec_base *tbase_get_base(struct tvec_base *base) |
100 | { | 98 | { |
101 | return ((tvec_base_t *)((unsigned long)base & ~TBASE_DEFERRABLE_FLAG)); | 99 | return ((struct tvec_base *)((unsigned long)base & ~TBASE_DEFERRABLE_FLAG)); |
102 | } | 100 | } |
103 | 101 | ||
104 | static inline void timer_set_deferrable(struct timer_list *timer) | 102 | static inline void timer_set_deferrable(struct timer_list *timer) |
105 | { | 103 | { |
106 | timer->base = ((tvec_base_t *)((unsigned long)(timer->base) | | 104 | timer->base = ((struct tvec_base *)((unsigned long)(timer->base) | |
107 | TBASE_DEFERRABLE_FLAG)); | 105 | TBASE_DEFERRABLE_FLAG)); |
108 | } | 106 | } |
109 | 107 | ||
110 | static inline void | 108 | static inline void |
111 | timer_set_base(struct timer_list *timer, tvec_base_t *new_base) | 109 | timer_set_base(struct timer_list *timer, struct tvec_base *new_base) |
112 | { | 110 | { |
113 | timer->base = (tvec_base_t *)((unsigned long)(new_base) | | 111 | timer->base = (struct tvec_base *)((unsigned long)(new_base) | |
114 | tbase_get_deferrable(timer->base)); | 112 | tbase_get_deferrable(timer->base)); |
115 | } | 113 | } |
116 | 114 | ||
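The typedef removal above keeps the existing trick of hiding the deferrable flag in bit 0 of the base pointer, which only works because struct tvec_base is at least 2-byte aligned, so that bit is always zero in a real pointer. A standalone userspace sketch of the same pointer-tagging idea, with hypothetical types that are not from the patch:

	#include <stdint.h>
	#include <stdio.h>

	#define FLAG_DEFERRABLE 0x1UL

	struct base {
		int dummy;
	} __attribute__((aligned(2)));		/* guarantees bit 0 of &base is free */

	static inline unsigned long get_flag(struct base *p)
	{
		return (uintptr_t)p & FLAG_DEFERRABLE;
	}

	static inline struct base *get_base(struct base *p)
	{
		return (struct base *)((uintptr_t)p & ~FLAG_DEFERRABLE);
	}

	static inline struct base *set_flag(struct base *p)
	{
		return (struct base *)((uintptr_t)p | FLAG_DEFERRABLE);
	}

	int main(void)
	{
		static struct base b;
		struct base *tagged = set_flag(&b);

		/* the flag and the real pointer can both be recovered */
		printf("flag=%lu base=%p\n", get_flag(tagged), (void *)get_base(tagged));
		return 0;
	}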
@@ -246,7 +244,7 @@ unsigned long round_jiffies_relative(unsigned long j) | |||
246 | EXPORT_SYMBOL_GPL(round_jiffies_relative); | 244 | EXPORT_SYMBOL_GPL(round_jiffies_relative); |
247 | 245 | ||
248 | 246 | ||
249 | static inline void set_running_timer(tvec_base_t *base, | 247 | static inline void set_running_timer(struct tvec_base *base, |
250 | struct timer_list *timer) | 248 | struct timer_list *timer) |
251 | { | 249 | { |
252 | #ifdef CONFIG_SMP | 250 | #ifdef CONFIG_SMP |
@@ -254,7 +252,7 @@ static inline void set_running_timer(tvec_base_t *base, | |||
254 | #endif | 252 | #endif |
255 | } | 253 | } |
256 | 254 | ||
257 | static void internal_add_timer(tvec_base_t *base, struct timer_list *timer) | 255 | static void internal_add_timer(struct tvec_base *base, struct timer_list *timer) |
258 | { | 256 | { |
259 | unsigned long expires = timer->expires; | 257 | unsigned long expires = timer->expires; |
260 | unsigned long idx = expires - base->timer_jiffies; | 258 | unsigned long idx = expires - base->timer_jiffies; |
@@ -371,14 +369,14 @@ static inline void detach_timer(struct timer_list *timer, | |||
371 | * possible to set timer->base = NULL and drop the lock: the timer remains | 369 | * possible to set timer->base = NULL and drop the lock: the timer remains |
372 | * locked. | 370 | * locked. |
373 | */ | 371 | */ |
374 | static tvec_base_t *lock_timer_base(struct timer_list *timer, | 372 | static struct tvec_base *lock_timer_base(struct timer_list *timer, |
375 | unsigned long *flags) | 373 | unsigned long *flags) |
376 | __acquires(timer->base->lock) | 374 | __acquires(timer->base->lock) |
377 | { | 375 | { |
378 | tvec_base_t *base; | 376 | struct tvec_base *base; |
379 | 377 | ||
380 | for (;;) { | 378 | for (;;) { |
381 | tvec_base_t *prelock_base = timer->base; | 379 | struct tvec_base *prelock_base = timer->base; |
382 | base = tbase_get_base(prelock_base); | 380 | base = tbase_get_base(prelock_base); |
383 | if (likely(base != NULL)) { | 381 | if (likely(base != NULL)) { |
384 | spin_lock_irqsave(&base->lock, *flags); | 382 | spin_lock_irqsave(&base->lock, *flags); |
@@ -393,7 +391,7 @@ static tvec_base_t *lock_timer_base(struct timer_list *timer, | |||
393 | 391 | ||
394 | int __mod_timer(struct timer_list *timer, unsigned long expires) | 392 | int __mod_timer(struct timer_list *timer, unsigned long expires) |
395 | { | 393 | { |
396 | tvec_base_t *base, *new_base; | 394 | struct tvec_base *base, *new_base; |
397 | unsigned long flags; | 395 | unsigned long flags; |
398 | int ret = 0; | 396 | int ret = 0; |
399 | 397 | ||
@@ -445,7 +443,7 @@ EXPORT_SYMBOL(__mod_timer); | |||
445 | */ | 443 | */ |
446 | void add_timer_on(struct timer_list *timer, int cpu) | 444 | void add_timer_on(struct timer_list *timer, int cpu) |
447 | { | 445 | { |
448 | tvec_base_t *base = per_cpu(tvec_bases, cpu); | 446 | struct tvec_base *base = per_cpu(tvec_bases, cpu); |
449 | unsigned long flags; | 447 | unsigned long flags; |
450 | 448 | ||
451 | timer_stats_timer_set_start_info(timer); | 449 | timer_stats_timer_set_start_info(timer); |
@@ -508,7 +506,7 @@ EXPORT_SYMBOL(mod_timer); | |||
508 | */ | 506 | */ |
509 | int del_timer(struct timer_list *timer) | 507 | int del_timer(struct timer_list *timer) |
510 | { | 508 | { |
511 | tvec_base_t *base; | 509 | struct tvec_base *base; |
512 | unsigned long flags; | 510 | unsigned long flags; |
513 | int ret = 0; | 511 | int ret = 0; |
514 | 512 | ||
@@ -539,7 +537,7 @@ EXPORT_SYMBOL(del_timer); | |||
539 | */ | 537 | */ |
540 | int try_to_del_timer_sync(struct timer_list *timer) | 538 | int try_to_del_timer_sync(struct timer_list *timer) |
541 | { | 539 | { |
542 | tvec_base_t *base; | 540 | struct tvec_base *base; |
543 | unsigned long flags; | 541 | unsigned long flags; |
544 | int ret = -1; | 542 | int ret = -1; |
545 | 543 | ||
@@ -591,7 +589,7 @@ int del_timer_sync(struct timer_list *timer) | |||
591 | EXPORT_SYMBOL(del_timer_sync); | 589 | EXPORT_SYMBOL(del_timer_sync); |
592 | #endif | 590 | #endif |
593 | 591 | ||
594 | static int cascade(tvec_base_t *base, tvec_t *tv, int index) | 592 | static int cascade(struct tvec_base *base, struct tvec *tv, int index) |
595 | { | 593 | { |
596 | /* cascade all the timers from tv up one level */ | 594 | /* cascade all the timers from tv up one level */ |
597 | struct timer_list *timer, *tmp; | 595 | struct timer_list *timer, *tmp; |
@@ -620,7 +618,7 @@ static int cascade(tvec_base_t *base, tvec_t *tv, int index) | |||
620 | * This function cascades all vectors and executes all expired timer | 618 | * This function cascades all vectors and executes all expired timer |
621 | * vectors. | 619 | * vectors. |
622 | */ | 620 | */ |
623 | static inline void __run_timers(tvec_base_t *base) | 621 | static inline void __run_timers(struct tvec_base *base) |
624 | { | 622 | { |
625 | struct timer_list *timer; | 623 | struct timer_list *timer; |
626 | 624 | ||
@@ -657,7 +655,7 @@ static inline void __run_timers(tvec_base_t *base) | |||
657 | int preempt_count = preempt_count(); | 655 | int preempt_count = preempt_count(); |
658 | fn(data); | 656 | fn(data); |
659 | if (preempt_count != preempt_count()) { | 657 | if (preempt_count != preempt_count()) { |
660 | printk(KERN_WARNING "huh, entered %p " | 658 | printk(KERN_ERR "huh, entered %p " |
661 | "with preempt_count %08x, exited" | 659 | "with preempt_count %08x, exited" |
662 | " with %08x?\n", | 660 | " with %08x?\n", |
663 | fn, preempt_count, | 661 | fn, preempt_count, |
@@ -678,13 +676,13 @@ static inline void __run_timers(tvec_base_t *base) | |||
678 | * is used on S/390 to stop all activity when a cpu is idle. | 676 | * is used on S/390 to stop all activity when a cpu is idle. |
679 | * This function needs to be called with interrupts disabled. | 677 | * This function needs to be called with interrupts disabled. |
680 | */ | 678 | */ |
681 | static unsigned long __next_timer_interrupt(tvec_base_t *base) | 679 | static unsigned long __next_timer_interrupt(struct tvec_base *base) |
682 | { | 680 | { |
683 | unsigned long timer_jiffies = base->timer_jiffies; | 681 | unsigned long timer_jiffies = base->timer_jiffies; |
684 | unsigned long expires = timer_jiffies + NEXT_TIMER_MAX_DELTA; | 682 | unsigned long expires = timer_jiffies + NEXT_TIMER_MAX_DELTA; |
685 | int index, slot, array, found = 0; | 683 | int index, slot, array, found = 0; |
686 | struct timer_list *nte; | 684 | struct timer_list *nte; |
687 | tvec_t *varray[4]; | 685 | struct tvec *varray[4]; |
688 | 686 | ||
689 | /* Look for timer events in tv1. */ | 687 | /* Look for timer events in tv1. */ |
690 | index = slot = timer_jiffies & TVR_MASK; | 688 | index = slot = timer_jiffies & TVR_MASK; |
@@ -716,7 +714,7 @@ cascade: | |||
716 | varray[3] = &base->tv5; | 714 | varray[3] = &base->tv5; |
717 | 715 | ||
718 | for (array = 0; array < 4; array++) { | 716 | for (array = 0; array < 4; array++) { |
719 | tvec_t *varp = varray[array]; | 717 | struct tvec *varp = varray[array]; |
720 | 718 | ||
721 | index = slot = timer_jiffies & TVN_MASK; | 719 | index = slot = timer_jiffies & TVN_MASK; |
722 | do { | 720 | do { |
@@ -790,12 +788,12 @@ static unsigned long cmp_next_hrtimer_event(unsigned long now, | |||
790 | } | 788 | } |
791 | 789 | ||
792 | /** | 790 | /** |
793 | * next_timer_interrupt - return the jiffy of the next pending timer | 791 | * get_next_timer_interrupt - return the jiffy of the next pending timer |
794 | * @now: current time (in jiffies) | 792 | * @now: current time (in jiffies) |
795 | */ | 793 | */ |
796 | unsigned long get_next_timer_interrupt(unsigned long now) | 794 | unsigned long get_next_timer_interrupt(unsigned long now) |
797 | { | 795 | { |
798 | tvec_base_t *base = __get_cpu_var(tvec_bases); | 796 | struct tvec_base *base = __get_cpu_var(tvec_bases); |
799 | unsigned long expires; | 797 | unsigned long expires; |
800 | 798 | ||
801 | spin_lock(&base->lock); | 799 | spin_lock(&base->lock); |
@@ -817,6 +815,19 @@ unsigned long next_timer_interrupt(void) | |||
817 | 815 | ||
818 | #endif | 816 | #endif |
819 | 817 | ||
818 | #ifndef CONFIG_VIRT_CPU_ACCOUNTING | ||
819 | void account_process_tick(struct task_struct *p, int user_tick) | ||
820 | { | ||
821 | if (user_tick) { | ||
822 | account_user_time(p, jiffies_to_cputime(1)); | ||
823 | account_user_time_scaled(p, jiffies_to_cputime(1)); | ||
824 | } else { | ||
825 | account_system_time(p, HARDIRQ_OFFSET, jiffies_to_cputime(1)); | ||
826 | account_system_time_scaled(p, jiffies_to_cputime(1)); | ||
827 | } | ||
828 | } | ||
829 | #endif | ||
830 | |||
820 | /* | 831 | /* |
821 | * Called from the timer interrupt handler to charge one tick to the current | 832 | * Called from the timer interrupt handler to charge one tick to the current |
822 | * process. user_tick is 1 if the tick is user time, 0 for system. | 833 | * process. user_tick is 1 if the tick is user time, 0 for system. |
@@ -827,13 +838,7 @@ void update_process_times(int user_tick) | |||
827 | int cpu = smp_processor_id(); | 838 | int cpu = smp_processor_id(); |
828 | 839 | ||
829 | /* Note: this timer irq context must be accounted for as well. */ | 840 | /* Note: this timer irq context must be accounted for as well. */ |
830 | if (user_tick) { | 841 | account_process_tick(p, user_tick); |
831 | account_user_time(p, jiffies_to_cputime(1)); | ||
832 | account_user_time_scaled(p, jiffies_to_cputime(1)); | ||
833 | } else { | ||
834 | account_system_time(p, HARDIRQ_OFFSET, jiffies_to_cputime(1)); | ||
835 | account_system_time_scaled(p, jiffies_to_cputime(1)); | ||
836 | } | ||
837 | run_local_timers(); | 842 | run_local_timers(); |
838 | if (rcu_pending(cpu)) | 843 | if (rcu_pending(cpu)) |
839 | rcu_check_callbacks(cpu, user_tick); | 844 | rcu_check_callbacks(cpu, user_tick); |
@@ -887,9 +892,9 @@ static inline void calc_load(unsigned long ticks) | |||
887 | */ | 892 | */ |
888 | static void run_timer_softirq(struct softirq_action *h) | 893 | static void run_timer_softirq(struct softirq_action *h) |
889 | { | 894 | { |
890 | tvec_base_t *base = __get_cpu_var(tvec_bases); | 895 | struct tvec_base *base = __get_cpu_var(tvec_bases); |
891 | 896 | ||
892 | hrtimer_run_queues(); | 897 | hrtimer_run_pending(); |
893 | 898 | ||
894 | if (time_after_eq(jiffies, base->timer_jiffies)) | 899 | if (time_after_eq(jiffies, base->timer_jiffies)) |
895 | __run_timers(base); | 900 | __run_timers(base); |
@@ -900,6 +905,7 @@ static void run_timer_softirq(struct softirq_action *h) | |||
900 | */ | 905 | */ |
901 | void run_local_timers(void) | 906 | void run_local_timers(void) |
902 | { | 907 | { |
908 | hrtimer_run_queues(); | ||
903 | raise_softirq(TIMER_SOFTIRQ); | 909 | raise_softirq(TIMER_SOFTIRQ); |
904 | softlockup_tick(); | 910 | softlockup_tick(); |
905 | } | 911 | } |
@@ -971,7 +977,7 @@ asmlinkage long sys_getppid(void) | |||
971 | int pid; | 977 | int pid; |
972 | 978 | ||
973 | rcu_read_lock(); | 979 | rcu_read_lock(); |
974 | pid = task_ppid_nr_ns(current, current->nsproxy->pid_ns); | 980 | pid = task_tgid_nr_ns(current->real_parent, current->nsproxy->pid_ns); |
975 | rcu_read_unlock(); | 981 | rcu_read_unlock(); |
976 | 982 | ||
977 | return pid; | 983 | return pid; |
@@ -1093,6 +1099,13 @@ signed long __sched schedule_timeout_interruptible(signed long timeout) | |||
1093 | } | 1099 | } |
1094 | EXPORT_SYMBOL(schedule_timeout_interruptible); | 1100 | EXPORT_SYMBOL(schedule_timeout_interruptible); |
1095 | 1101 | ||
1102 | signed long __sched schedule_timeout_killable(signed long timeout) | ||
1103 | { | ||
1104 | __set_current_state(TASK_KILLABLE); | ||
1105 | return schedule_timeout(timeout); | ||
1106 | } | ||
1107 | EXPORT_SYMBOL(schedule_timeout_killable); | ||
1108 | |||
1096 | signed long __sched schedule_timeout_uninterruptible(signed long timeout) | 1109 | signed long __sched schedule_timeout_uninterruptible(signed long timeout) |
1097 | { | 1110 | { |
1098 | __set_current_state(TASK_UNINTERRUPTIBLE); | 1111 | __set_current_state(TASK_UNINTERRUPTIBLE); |
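schedule_timeout_killable() above mirrors the existing *_interruptible/*_uninterruptible helpers but puts the task into the new TASK_KILLABLE state, so only fatal signals end the sleep early. A hedged sketch of a caller; the function name and the error convention are assumptions, not part of this patch:

	#include <linux/sched.h>
	#include <linux/jiffies.h>
	#include <linux/errno.h>

	/* Sleep for up to one second; only a fatal signal wakes us early. */
	static long wait_for_event_stub(void)
	{
		long left = schedule_timeout_killable(HZ);

		if (fatal_signal_pending(current))
			return -ERESTARTSYS;	/* assumed convention for this caller */
		return left;			/* 0 on timeout, >0 if woken sooner */
	}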
@@ -1212,11 +1225,11 @@ asmlinkage long sys_sysinfo(struct sysinfo __user *info) | |||
1212 | */ | 1225 | */ |
1213 | static struct lock_class_key base_lock_keys[NR_CPUS]; | 1226 | static struct lock_class_key base_lock_keys[NR_CPUS]; |
1214 | 1227 | ||
1215 | static int __devinit init_timers_cpu(int cpu) | 1228 | static int __cpuinit init_timers_cpu(int cpu) |
1216 | { | 1229 | { |
1217 | int j; | 1230 | int j; |
1218 | tvec_base_t *base; | 1231 | struct tvec_base *base; |
1219 | static char __devinitdata tvec_base_done[NR_CPUS]; | 1232 | static char __cpuinitdata tvec_base_done[NR_CPUS]; |
1220 | 1233 | ||
1221 | if (!tvec_base_done[cpu]) { | 1234 | if (!tvec_base_done[cpu]) { |
1222 | static char boot_done; | 1235 | static char boot_done; |
@@ -1270,7 +1283,7 @@ static int __devinit init_timers_cpu(int cpu) | |||
1270 | } | 1283 | } |
1271 | 1284 | ||
1272 | #ifdef CONFIG_HOTPLUG_CPU | 1285 | #ifdef CONFIG_HOTPLUG_CPU |
1273 | static void migrate_timer_list(tvec_base_t *new_base, struct list_head *head) | 1286 | static void migrate_timer_list(struct tvec_base *new_base, struct list_head *head) |
1274 | { | 1287 | { |
1275 | struct timer_list *timer; | 1288 | struct timer_list *timer; |
1276 | 1289 | ||
@@ -1282,10 +1295,10 @@ static void migrate_timer_list(tvec_base_t *new_base, struct list_head *head) | |||
1282 | } | 1295 | } |
1283 | } | 1296 | } |
1284 | 1297 | ||
1285 | static void __devinit migrate_timers(int cpu) | 1298 | static void __cpuinit migrate_timers(int cpu) |
1286 | { | 1299 | { |
1287 | tvec_base_t *old_base; | 1300 | struct tvec_base *old_base; |
1288 | tvec_base_t *new_base; | 1301 | struct tvec_base *new_base; |
1289 | int i; | 1302 | int i; |
1290 | 1303 | ||
1291 | BUG_ON(cpu_online(cpu)); | 1304 | BUG_ON(cpu_online(cpu)); |
diff --git a/kernel/user.c b/kernel/user.c index 0f3aa0234107..bc1c48d35cb3 100644 --- a/kernel/user.c +++ b/kernel/user.c | |||
@@ -115,7 +115,7 @@ static void sched_switch_user(struct task_struct *p) { } | |||
115 | 115 | ||
116 | #if defined(CONFIG_FAIR_USER_SCHED) && defined(CONFIG_SYSFS) | 116 | #if defined(CONFIG_FAIR_USER_SCHED) && defined(CONFIG_SYSFS) |
117 | 117 | ||
118 | static struct kobject uids_kobject; /* represents /sys/kernel/uids directory */ | 118 | static struct kset *uids_kset; /* represents the /sys/kernel/uids/ directory */ |
119 | static DEFINE_MUTEX(uids_mutex); | 119 | static DEFINE_MUTEX(uids_mutex); |
120 | 120 | ||
121 | static inline void uids_mutex_lock(void) | 121 | static inline void uids_mutex_lock(void) |
@@ -128,86 +128,83 @@ static inline void uids_mutex_unlock(void) | |||
128 | mutex_unlock(&uids_mutex); | 128 | mutex_unlock(&uids_mutex); |
129 | } | 129 | } |
130 | 130 | ||
131 | /* return cpu shares held by the user */ | 131 | /* uid directory attributes */ |
132 | static ssize_t cpu_shares_show(struct kset *kset, char *buffer) | 132 | static ssize_t cpu_shares_show(struct kobject *kobj, |
133 | struct kobj_attribute *attr, | ||
134 | char *buf) | ||
133 | { | 135 | { |
134 | struct user_struct *up = container_of(kset, struct user_struct, kset); | 136 | struct user_struct *up = container_of(kobj, struct user_struct, kobj); |
135 | 137 | ||
136 | return sprintf(buffer, "%lu\n", sched_group_shares(up->tg)); | 138 | return sprintf(buf, "%lu\n", sched_group_shares(up->tg)); |
137 | } | 139 | } |
138 | 140 | ||
139 | /* modify cpu shares held by the user */ | 141 | static ssize_t cpu_shares_store(struct kobject *kobj, |
140 | static ssize_t cpu_shares_store(struct kset *kset, const char *buffer, | 142 | struct kobj_attribute *attr, |
141 | size_t size) | 143 | const char *buf, size_t size) |
142 | { | 144 | { |
143 | struct user_struct *up = container_of(kset, struct user_struct, kset); | 145 | struct user_struct *up = container_of(kobj, struct user_struct, kobj); |
144 | unsigned long shares; | 146 | unsigned long shares; |
145 | int rc; | 147 | int rc; |
146 | 148 | ||
147 | sscanf(buffer, "%lu", &shares); | 149 | sscanf(buf, "%lu", &shares); |
148 | 150 | ||
149 | rc = sched_group_set_shares(up->tg, shares); | 151 | rc = sched_group_set_shares(up->tg, shares); |
150 | 152 | ||
151 | return (rc ? rc : size); | 153 | return (rc ? rc : size); |
152 | } | 154 | } |
153 | 155 | ||
154 | static void user_attr_init(struct subsys_attribute *sa, char *name, int mode) | 156 | static struct kobj_attribute cpu_share_attr = |
157 | __ATTR(cpu_share, 0644, cpu_shares_show, cpu_shares_store); | ||
158 | |||
159 | /* default attributes per uid directory */ | ||
160 | static struct attribute *uids_attributes[] = { | ||
161 | &cpu_share_attr.attr, | ||
162 | NULL | ||
163 | }; | ||
164 | |||
165 | /* the lifetime of user_struct is not managed by the core (now) */ | ||
166 | static void uids_release(struct kobject *kobj) | ||
155 | { | 167 | { |
156 | sa->attr.name = name; | 168 | return; |
157 | sa->attr.mode = mode; | ||
158 | sa->show = cpu_shares_show; | ||
159 | sa->store = cpu_shares_store; | ||
160 | } | 169 | } |
161 | 170 | ||
162 | /* Create "/sys/kernel/uids/<uid>" directory and | 171 | static struct kobj_type uids_ktype = { |
163 | * "/sys/kernel/uids/<uid>/cpu_share" file for this user. | 172 | .sysfs_ops = &kobj_sysfs_ops, |
164 | */ | 173 | .default_attrs = uids_attributes, |
165 | static int user_kobject_create(struct user_struct *up) | 174 | .release = uids_release, |
175 | }; | ||
176 | |||
177 | /* create /sys/kernel/uids/<uid>/cpu_share file for this user */ | ||
178 | static int uids_user_create(struct user_struct *up) | ||
166 | { | 179 | { |
167 | struct kset *kset = &up->kset; | 180 | struct kobject *kobj = &up->kobj; |
168 | struct kobject *kobj = &kset->kobj; | ||
169 | int error; | 181 | int error; |
170 | 182 | ||
171 | memset(kset, 0, sizeof(struct kset)); | 183 | memset(kobj, 0, sizeof(struct kobject)); |
172 | kobj->parent = &uids_kobject; /* create under /sys/kernel/uids dir */ | 184 | kobj->kset = uids_kset; |
173 | kobject_set_name(kobj, "%d", up->uid); | 185 | error = kobject_init_and_add(kobj, &uids_ktype, NULL, "%d", up->uid); |
174 | kset_init(kset); | 186 | if (error) { |
175 | user_attr_init(&up->user_attr, "cpu_share", 0644); | 187 | kobject_put(kobj); |
176 | |||
177 | error = kobject_add(kobj); | ||
178 | if (error) | ||
179 | goto done; | 188 | goto done; |
180 | 189 | } | |
181 | error = sysfs_create_file(kobj, &up->user_attr.attr); | ||
182 | if (error) | ||
183 | kobject_del(kobj); | ||
184 | 190 | ||
185 | kobject_uevent(kobj, KOBJ_ADD); | 191 | kobject_uevent(kobj, KOBJ_ADD); |
186 | |||
187 | done: | 192 | done: |
188 | return error; | 193 | return error; |
189 | } | 194 | } |
190 | 195 | ||
191 | /* create these in sysfs filesystem: | 196 | /* create these entries in sysfs: |
192 | * "/sys/kernel/uids" directory | 197 | * "/sys/kernel/uids" directory |
193 | * "/sys/kernel/uids/0" directory (for root user) | 198 | * "/sys/kernel/uids/0" directory (for root user) |
194 | * "/sys/kernel/uids/0/cpu_share" file (for root user) | 199 | * "/sys/kernel/uids/0/cpu_share" file (for root user) |
195 | */ | 200 | */ |
196 | int __init uids_kobject_init(void) | 201 | int __init uids_sysfs_init(void) |
197 | { | 202 | { |
198 | int error; | 203 | uids_kset = kset_create_and_add("uids", NULL, kernel_kobj); |
199 | 204 | if (!uids_kset) | |
200 | /* create under /sys/kernel dir */ | 205 | return -ENOMEM; |
201 | uids_kobject.parent = &kernel_subsys.kobj; | ||
202 | uids_kobject.kset = &kernel_subsys; | ||
203 | kobject_set_name(&uids_kobject, "uids"); | ||
204 | kobject_init(&uids_kobject); | ||
205 | 206 | ||
206 | error = kobject_add(&uids_kobject); | 207 | return uids_user_create(&root_user); |
207 | if (!error) | ||
208 | error = user_kobject_create(&root_user); | ||
209 | |||
210 | return error; | ||
211 | } | 208 | } |
212 | 209 | ||
213 | /* work function to remove sysfs directory for a user and free up | 210 | /* work function to remove sysfs directory for a user and free up |
@@ -216,7 +213,6 @@ int __init uids_kobject_init(void) | |||
216 | static void remove_user_sysfs_dir(struct work_struct *w) | 213 | static void remove_user_sysfs_dir(struct work_struct *w) |
217 | { | 214 | { |
218 | struct user_struct *up = container_of(w, struct user_struct, work); | 215 | struct user_struct *up = container_of(w, struct user_struct, work); |
219 | struct kobject *kobj = &up->kset.kobj; | ||
220 | unsigned long flags; | 216 | unsigned long flags; |
221 | int remove_user = 0; | 217 | int remove_user = 0; |
222 | 218 | ||
@@ -238,9 +234,9 @@ static void remove_user_sysfs_dir(struct work_struct *w) | |||
238 | if (!remove_user) | 234 | if (!remove_user) |
239 | goto done; | 235 | goto done; |
240 | 236 | ||
241 | sysfs_remove_file(kobj, &up->user_attr.attr); | 237 | kobject_uevent(&up->kobj, KOBJ_REMOVE); |
242 | kobject_uevent(kobj, KOBJ_REMOVE); | 238 | kobject_del(&up->kobj); |
243 | kobject_del(kobj); | 239 | kobject_put(&up->kobj); |
244 | 240 | ||
245 | sched_destroy_user(up); | 241 | sched_destroy_user(up); |
246 | key_put(up->uid_keyring); | 242 | key_put(up->uid_keyring); |
@@ -267,7 +263,8 @@ static inline void free_user(struct user_struct *up, unsigned long flags) | |||
267 | 263 | ||
268 | #else /* CONFIG_FAIR_USER_SCHED && CONFIG_SYSFS */ | 264 | #else /* CONFIG_FAIR_USER_SCHED && CONFIG_SYSFS */ |
269 | 265 | ||
270 | static inline int user_kobject_create(struct user_struct *up) { return 0; } | 266 | int uids_sysfs_init(void) { return 0; } |
267 | static inline int uids_user_create(struct user_struct *up) { return 0; } | ||
271 | static inline void uids_mutex_lock(void) { } | 268 | static inline void uids_mutex_lock(void) { } |
272 | static inline void uids_mutex_unlock(void) { } | 269 | static inline void uids_mutex_unlock(void) { } |
273 | 270 | ||
@@ -322,9 +319,9 @@ void free_uid(struct user_struct *up) | |||
322 | struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid) | 319 | struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid) |
323 | { | 320 | { |
324 | struct hlist_head *hashent = uidhashentry(ns, uid); | 321 | struct hlist_head *hashent = uidhashentry(ns, uid); |
325 | struct user_struct *up; | 322 | struct user_struct *up, *new; |
326 | 323 | ||
327 | /* Make uid_hash_find() + user_kobject_create() + uid_hash_insert() | 324 | /* Make uid_hash_find() + uids_user_create() + uid_hash_insert() |
328 | * atomic. | 325 | * atomic. |
329 | */ | 326 | */ |
330 | uids_mutex_lock(); | 327 | uids_mutex_lock(); |
@@ -334,11 +331,10 @@ struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid) | |||
334 | spin_unlock_irq(&uidhash_lock); | 331 | spin_unlock_irq(&uidhash_lock); |
335 | 332 | ||
336 | if (!up) { | 333 | if (!up) { |
337 | struct user_struct *new; | ||
338 | |||
339 | new = kmem_cache_alloc(uid_cachep, GFP_KERNEL); | 334 | new = kmem_cache_alloc(uid_cachep, GFP_KERNEL); |
340 | if (!new) | 335 | if (!new) |
341 | return NULL; | 336 | goto out_unlock; |
337 | |||
342 | new->uid = uid; | 338 | new->uid = uid; |
343 | atomic_set(&new->__count, 1); | 339 | atomic_set(&new->__count, 1); |
344 | atomic_set(&new->processes, 0); | 340 | atomic_set(&new->processes, 0); |
@@ -353,26 +349,14 @@ struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid) | |||
353 | #endif | 349 | #endif |
354 | new->locked_shm = 0; | 350 | new->locked_shm = 0; |
355 | 351 | ||
356 | if (alloc_uid_keyring(new, current) < 0) { | 352 | if (alloc_uid_keyring(new, current) < 0) |
357 | kmem_cache_free(uid_cachep, new); | 353 | goto out_free_user; |
358 | return NULL; | ||
359 | } | ||
360 | 354 | ||
361 | if (sched_create_user(new) < 0) { | 355 | if (sched_create_user(new) < 0) |
362 | key_put(new->uid_keyring); | 356 | goto out_put_keys; |
363 | key_put(new->session_keyring); | ||
364 | kmem_cache_free(uid_cachep, new); | ||
365 | return NULL; | ||
366 | } | ||
367 | 357 | ||
368 | if (user_kobject_create(new)) { | 358 | if (uids_user_create(new)) |
369 | sched_destroy_user(new); | 359 | goto out_destroy_sched; |
370 | key_put(new->uid_keyring); | ||
371 | key_put(new->session_keyring); | ||
372 | kmem_cache_free(uid_cachep, new); | ||
373 | uids_mutex_unlock(); | ||
374 | return NULL; | ||
375 | } | ||
376 | 360 | ||
377 | /* | 361 | /* |
378 | * Before adding this, check whether we raced | 362 | * Before adding this, check whether we raced |
@@ -400,6 +384,17 @@ struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid) | |||
400 | uids_mutex_unlock(); | 384 | uids_mutex_unlock(); |
401 | 385 | ||
402 | return up; | 386 | return up; |
387 | |||
388 | out_destroy_sched: | ||
389 | sched_destroy_user(new); | ||
390 | out_put_keys: | ||
391 | key_put(new->uid_keyring); | ||
392 | key_put(new->session_keyring); | ||
393 | out_free_user: | ||
394 | kmem_cache_free(uid_cachep, new); | ||
395 | out_unlock: | ||
396 | uids_mutex_unlock(); | ||
397 | return NULL; | ||
403 | } | 398 | } |
404 | 399 | ||
405 | void switch_uid(struct user_struct *new_user) | 400 | void switch_uid(struct user_struct *new_user) |
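The user.c conversion above replaces the open-coded kset embedded in user_struct with a plain kobject added to a kset created via kset_create_and_add(), and it registers the directory with kobject_init_and_add(). A minimal sketch of that pattern in isolation; the "demo" names are hypothetical and the per-uid attributes are omitted:

	#include <linux/module.h>
	#include <linux/kobject.h>
	#include <linux/slab.h>
	#include <linux/errno.h>

	struct demo_obj {
		struct kobject kobj;
		int value;
	};

	static struct kset *demo_kset;

	/* the kobject core owns the lifetime: release() runs on the final put */
	static void demo_release(struct kobject *kobj)
	{
		kfree(container_of(kobj, struct demo_obj, kobj));
	}

	static struct kobj_type demo_ktype = {
		.sysfs_ops	= &kobj_sysfs_ops,
		.release	= demo_release,
	};

	static int __init demo_init(void)
	{
		struct demo_obj *obj;
		int error;

		/* /sys/kernel/demo, like kset_create_and_add("uids", NULL, kernel_kobj) */
		demo_kset = kset_create_and_add("demo", NULL, kernel_kobj);
		if (!demo_kset)
			return -ENOMEM;

		obj = kzalloc(sizeof(*obj), GFP_KERNEL);
		if (!obj) {
			kset_unregister(demo_kset);
			return -ENOMEM;
		}

		obj->kobj.kset = demo_kset;
		error = kobject_init_and_add(&obj->kobj, &demo_ktype, NULL, "obj0");
		if (error) {
			kobject_put(&obj->kobj);	/* triggers demo_release() */
			kset_unregister(demo_kset);
			return error;
		}
		kobject_uevent(&obj->kobj, KOBJ_ADD);
		return 0;
	}
	module_init(demo_init);
	MODULE_LICENSE("GPL");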
diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c index c76c06466bfd..fe3a56c2256d 100644 --- a/kernel/utsname_sysctl.c +++ b/kernel/utsname_sysctl.c | |||
@@ -18,6 +18,10 @@ | |||
18 | static void *get_uts(ctl_table *table, int write) | 18 | static void *get_uts(ctl_table *table, int write) |
19 | { | 19 | { |
20 | char *which = table->data; | 20 | char *which = table->data; |
21 | struct uts_namespace *uts_ns; | ||
22 | |||
23 | uts_ns = current->nsproxy->uts_ns; | ||
24 | which = (which - (char *)&init_uts_ns) + (char *)uts_ns; | ||
21 | 25 | ||
22 | if (!write) | 26 | if (!write) |
23 | down_read(&uts_sem); | 27 | down_read(&uts_sem); |
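The get_uts() change above rebases the sysctl's data pointer from the global init_uts_ns template onto the caller's own uts namespace by reusing the field's byte offset. A standalone illustration of that pointer arithmetic, with hypothetical types and names:

	#include <stdio.h>

	struct ns {
		char sysname[16];
		char nodename[16];
	};

	static struct ns init_ns = { "Linux", "template" };

	/* translate a field pointer inside init_ns to the same field in another instance */
	static char *rebase(char *which, struct ns *current_ns)
	{
		return (which - (char *)&init_ns) + (char *)current_ns;
	}

	int main(void)
	{
		struct ns my_ns = { "Linux", "box42" };
		char *p = rebase(init_ns.nodename, &my_ns);

		printf("%s\n", p);	/* prints "box42" */
		return 0;
	}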
diff --git a/kernel/wait.c b/kernel/wait.c index 444ddbfaefc4..f9876888a569 100644 --- a/kernel/wait.c +++ b/kernel/wait.c | |||
@@ -215,7 +215,7 @@ void fastcall __wake_up_bit(wait_queue_head_t *wq, void *word, int bit) | |||
215 | { | 215 | { |
216 | struct wait_bit_key key = __WAIT_BIT_KEY_INITIALIZER(word, bit); | 216 | struct wait_bit_key key = __WAIT_BIT_KEY_INITIALIZER(word, bit); |
217 | if (waitqueue_active(wq)) | 217 | if (waitqueue_active(wq)) |
218 | __wake_up(wq, TASK_INTERRUPTIBLE|TASK_UNINTERRUPTIBLE, 1, &key); | 218 | __wake_up(wq, TASK_NORMAL, 1, &key); |
219 | } | 219 | } |
220 | EXPORT_SYMBOL(__wake_up_bit); | 220 | EXPORT_SYMBOL(__wake_up_bit); |
221 | 221 | ||
diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 52d5e7c9a8e6..52db48e7f6e7 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c | |||
@@ -67,9 +67,8 @@ struct workqueue_struct { | |||
67 | #endif | 67 | #endif |
68 | }; | 68 | }; |
69 | 69 | ||
70 | /* All the per-cpu workqueues on the system, for hotplug cpu to add/remove | 70 | /* Serializes the accesses to the list of workqueues. */ |
71 | threads to each one as cpus come/go. */ | 71 | static DEFINE_SPINLOCK(workqueue_lock); |
72 | static DEFINE_MUTEX(workqueue_mutex); | ||
73 | static LIST_HEAD(workqueues); | 72 | static LIST_HEAD(workqueues); |
74 | 73 | ||
75 | static int singlethread_cpu __read_mostly; | 74 | static int singlethread_cpu __read_mostly; |
@@ -592,8 +591,6 @@ EXPORT_SYMBOL(schedule_delayed_work_on); | |||
592 | * Returns zero on success. | 591 | * Returns zero on success. |
593 | * Returns -ve errno on failure. | 592 | * Returns -ve errno on failure. |
594 | * | 593 | * |
595 | * Appears to be racy against CPU hotplug. | ||
596 | * | ||
597 | * schedule_on_each_cpu() is very slow. | 594 | * schedule_on_each_cpu() is very slow. |
598 | */ | 595 | */ |
599 | int schedule_on_each_cpu(work_func_t func) | 596 | int schedule_on_each_cpu(work_func_t func) |
@@ -605,7 +602,7 @@ int schedule_on_each_cpu(work_func_t func) | |||
605 | if (!works) | 602 | if (!works) |
606 | return -ENOMEM; | 603 | return -ENOMEM; |
607 | 604 | ||
608 | preempt_disable(); /* CPU hotplug */ | 605 | get_online_cpus(); |
609 | for_each_online_cpu(cpu) { | 606 | for_each_online_cpu(cpu) { |
610 | struct work_struct *work = per_cpu_ptr(works, cpu); | 607 | struct work_struct *work = per_cpu_ptr(works, cpu); |
611 | 608 | ||
@@ -613,8 +610,8 @@ int schedule_on_each_cpu(work_func_t func) | |||
613 | set_bit(WORK_STRUCT_PENDING, work_data_bits(work)); | 610 | set_bit(WORK_STRUCT_PENDING, work_data_bits(work)); |
614 | __queue_work(per_cpu_ptr(keventd_wq->cpu_wq, cpu), work); | 611 | __queue_work(per_cpu_ptr(keventd_wq->cpu_wq, cpu), work); |
615 | } | 612 | } |
616 | preempt_enable(); | ||
617 | flush_workqueue(keventd_wq); | 613 | flush_workqueue(keventd_wq); |
614 | put_online_cpus(); | ||
618 | free_percpu(works); | 615 | free_percpu(works); |
619 | return 0; | 616 | return 0; |
620 | } | 617 | } |
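schedule_on_each_cpu() now brackets its loop with get_online_cpus()/put_online_cpus() instead of disabling preemption, so the per-cpu work can be queued from a sleepable context without racing CPU hotplug. A minimal sketch of that locking pattern as a standalone example, not taken from the patch:

	#include <linux/cpu.h>
	#include <linux/cpumask.h>
	#include <linux/kernel.h>

	static void visit_online_cpus(void)
	{
		int cpu;

		get_online_cpus();		/* pins the set of online CPUs; may sleep */
		for_each_online_cpu(cpu)
			printk(KERN_INFO "visiting cpu %d\n", cpu);
		put_online_cpus();
	}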
@@ -722,7 +719,8 @@ static void start_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu) | |||
722 | struct workqueue_struct *__create_workqueue_key(const char *name, | 719 | struct workqueue_struct *__create_workqueue_key(const char *name, |
723 | int singlethread, | 720 | int singlethread, |
724 | int freezeable, | 721 | int freezeable, |
725 | struct lock_class_key *key) | 722 | struct lock_class_key *key, |
723 | const char *lock_name) | ||
726 | { | 724 | { |
727 | struct workqueue_struct *wq; | 725 | struct workqueue_struct *wq; |
728 | struct cpu_workqueue_struct *cwq; | 726 | struct cpu_workqueue_struct *cwq; |
@@ -739,7 +737,7 @@ struct workqueue_struct *__create_workqueue_key(const char *name, | |||
739 | } | 737 | } |
740 | 738 | ||
741 | wq->name = name; | 739 | wq->name = name; |
742 | lockdep_init_map(&wq->lockdep_map, name, key, 0); | 740 | lockdep_init_map(&wq->lockdep_map, lock_name, key, 0); |
743 | wq->singlethread = singlethread; | 741 | wq->singlethread = singlethread; |
744 | wq->freezeable = freezeable; | 742 | wq->freezeable = freezeable; |
745 | INIT_LIST_HEAD(&wq->list); | 743 | INIT_LIST_HEAD(&wq->list); |
@@ -749,8 +747,10 @@ struct workqueue_struct *__create_workqueue_key(const char *name, | |||
749 | err = create_workqueue_thread(cwq, singlethread_cpu); | 747 | err = create_workqueue_thread(cwq, singlethread_cpu); |
750 | start_workqueue_thread(cwq, -1); | 748 | start_workqueue_thread(cwq, -1); |
751 | } else { | 749 | } else { |
752 | mutex_lock(&workqueue_mutex); | 750 | get_online_cpus(); |
751 | spin_lock(&workqueue_lock); | ||
753 | list_add(&wq->list, &workqueues); | 752 | list_add(&wq->list, &workqueues); |
753 | spin_unlock(&workqueue_lock); | ||
754 | 754 | ||
755 | for_each_possible_cpu(cpu) { | 755 | for_each_possible_cpu(cpu) { |
756 | cwq = init_cpu_workqueue(wq, cpu); | 756 | cwq = init_cpu_workqueue(wq, cpu); |
@@ -759,7 +759,7 @@ struct workqueue_struct *__create_workqueue_key(const char *name, | |||
759 | err = create_workqueue_thread(cwq, cpu); | 759 | err = create_workqueue_thread(cwq, cpu); |
760 | start_workqueue_thread(cwq, cpu); | 760 | start_workqueue_thread(cwq, cpu); |
761 | } | 761 | } |
762 | mutex_unlock(&workqueue_mutex); | 762 | put_online_cpus(); |
763 | } | 763 | } |
764 | 764 | ||
765 | if (err) { | 765 | if (err) { |
@@ -774,7 +774,7 @@ static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu) | |||
774 | { | 774 | { |
775 | /* | 775 | /* |
776 | * Our caller is either destroy_workqueue() or CPU_DEAD, | 776 | * Our caller is either destroy_workqueue() or CPU_DEAD, |
777 | * workqueue_mutex protects cwq->thread | 777 | * get_online_cpus() protects cwq->thread. |
778 | */ | 778 | */ |
779 | if (cwq->thread == NULL) | 779 | if (cwq->thread == NULL) |
780 | return; | 780 | return; |
@@ -809,9 +809,11 @@ void destroy_workqueue(struct workqueue_struct *wq) | |||
809 | struct cpu_workqueue_struct *cwq; | 809 | struct cpu_workqueue_struct *cwq; |
810 | int cpu; | 810 | int cpu; |
811 | 811 | ||
812 | mutex_lock(&workqueue_mutex); | 812 | get_online_cpus(); |
813 | spin_lock(&workqueue_lock); | ||
813 | list_del(&wq->list); | 814 | list_del(&wq->list); |
814 | mutex_unlock(&workqueue_mutex); | 815 | spin_unlock(&workqueue_lock); |
816 | put_online_cpus(); | ||
815 | 817 | ||
816 | for_each_cpu_mask(cpu, *cpu_map) { | 818 | for_each_cpu_mask(cpu, *cpu_map) { |
817 | cwq = per_cpu_ptr(wq->cpu_wq, cpu); | 819 | cwq = per_cpu_ptr(wq->cpu_wq, cpu); |
@@ -834,13 +836,6 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, | |||
834 | action &= ~CPU_TASKS_FROZEN; | 836 | action &= ~CPU_TASKS_FROZEN; |
835 | 837 | ||
836 | switch (action) { | 838 | switch (action) { |
837 | case CPU_LOCK_ACQUIRE: | ||
838 | mutex_lock(&workqueue_mutex); | ||
839 | return NOTIFY_OK; | ||
840 | |||
841 | case CPU_LOCK_RELEASE: | ||
842 | mutex_unlock(&workqueue_mutex); | ||
843 | return NOTIFY_OK; | ||
844 | 839 | ||
845 | case CPU_UP_PREPARE: | 840 | case CPU_UP_PREPARE: |
846 | cpu_set(cpu, cpu_populated_map); | 841 | cpu_set(cpu, cpu_populated_map); |
@@ -853,7 +848,8 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, | |||
853 | case CPU_UP_PREPARE: | 848 | case CPU_UP_PREPARE: |
854 | if (!create_workqueue_thread(cwq, cpu)) | 849 | if (!create_workqueue_thread(cwq, cpu)) |
855 | break; | 850 | break; |
856 | printk(KERN_ERR "workqueue for %i failed\n", cpu); | 851 | printk(KERN_ERR "workqueue [%s] for %i failed\n", |
852 | wq->name, cpu); | ||
857 | return NOTIFY_BAD; | 853 | return NOTIFY_BAD; |
858 | 854 | ||
859 | case CPU_ONLINE: | 855 | case CPU_ONLINE: |