aboutsummaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Makefile15
-rw-r--r--kernel/acct.c125
-rw-r--r--kernel/audit.c34
-rw-r--r--kernel/audit.h2
-rw-r--r--kernel/auditfilter.c272
-rw-r--r--kernel/auditsc.c254
-rw-r--r--kernel/capability.c10
-rw-r--r--kernel/compat.c40
-rw-r--r--kernel/configs.c1
-rw-r--r--kernel/cpu.c219
-rw-r--r--kernel/cpuset.c179
-rw-r--r--kernel/delayacct.c162
-rw-r--r--kernel/exec_domain.c1
-rw-r--r--kernel/exit.c114
-rw-r--r--kernel/fork.c119
-rw-r--r--kernel/futex.c1095
-rw-r--r--kernel/futex_compat.c48
-rw-r--r--kernel/hrtimer.c47
-rw-r--r--kernel/irq/Makefile2
-rw-r--r--kernel/irq/autoprobe.c56
-rw-r--r--kernel/irq/chip.c533
-rw-r--r--kernel/irq/handle.c165
-rw-r--r--kernel/irq/internals.h46
-rw-r--r--kernel/irq/manage.c209
-rw-r--r--kernel/irq/migration.c20
-rw-r--r--kernel/irq/proc.c30
-rw-r--r--kernel/irq/resend.c77
-rw-r--r--kernel/irq/spurious.c37
-rw-r--r--kernel/kallsyms.c4
-rw-r--r--kernel/kexec.c14
-rw-r--r--kernel/kfifo.c28
-rw-r--r--kernel/kmod.c20
-rw-r--r--kernel/kprobes.c59
-rw-r--r--kernel/ksysfs.c1
-rw-r--r--kernel/kthread.c63
-rw-r--r--kernel/lockdep.c2724
-rw-r--r--kernel/lockdep_internals.h78
-rw-r--r--kernel/lockdep_proc.c345
-rw-r--r--kernel/module.c198
-rw-r--r--kernel/mutex-debug.c404
-rw-r--r--kernel/mutex-debug.h111
-rw-r--r--kernel/mutex.c95
-rw-r--r--kernel/mutex.h25
-rw-r--r--kernel/panic.c16
-rw-r--r--kernel/params.c16
-rw-r--r--kernel/pid.c18
-rw-r--r--kernel/posix-cpu-timers.c101
-rw-r--r--kernel/posix-timers.c21
-rw-r--r--kernel/power/Kconfig58
-rw-r--r--kernel/power/Makefile2
-rw-r--r--kernel/power/disk.c13
-rw-r--r--kernel/power/main.c44
-rw-r--r--kernel/power/pm.c37
-rw-r--r--kernel/power/power.h63
-rw-r--r--kernel/power/process.c26
-rw-r--r--kernel/power/smp.c62
-rw-r--r--kernel/power/snapshot.c1241
-rw-r--r--kernel/power/swap.c288
-rw-r--r--kernel/power/swsusp.c32
-rw-r--r--kernel/power/user.c17
-rw-r--r--kernel/printk.c87
-rw-r--r--kernel/profile.c19
-rw-r--r--kernel/ptrace.c84
-rw-r--r--kernel/rcupdate.c24
-rw-r--r--kernel/rcutorture.c201
-rw-r--r--kernel/relay.c38
-rw-r--r--kernel/resource.c128
-rw-r--r--kernel/rtmutex-debug.c242
-rw-r--r--kernel/rtmutex-debug.h33
-rw-r--r--kernel/rtmutex-tester.c441
-rw-r--r--kernel/rtmutex.c990
-rw-r--r--kernel/rtmutex.h26
-rw-r--r--kernel/rtmutex_common.h123
-rw-r--r--kernel/rwsem.c147
-rw-r--r--kernel/sched.c2086
-rw-r--r--kernel/signal.c79
-rw-r--r--kernel/softirq.c171
-rw-r--r--kernel/softlockup.c11
-rw-r--r--kernel/spinlock.c96
-rw-r--r--kernel/stacktrace.c24
-rw-r--r--kernel/stop_machine.c4
-rw-r--r--kernel/sys.c45
-rw-r--r--kernel/sysctl.c218
-rw-r--r--kernel/taskstats.c564
-rw-r--r--kernel/time.c2
-rw-r--r--kernel/time/Makefile1
-rw-r--r--kernel/time/clocksource.c349
-rw-r--r--kernel/time/jiffies.c73
-rw-r--r--kernel/timer.c534
-rw-r--r--kernel/unwind.c941
-rw-r--r--kernel/wait.c9
-rw-r--r--kernel/workqueue.c182
92 files changed, 14786 insertions, 3322 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index f6ef00f4f90f..d62ec66c1af2 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -8,19 +8,30 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \
8 signal.o sys.o kmod.o workqueue.o pid.o \ 8 signal.o sys.o kmod.o workqueue.o pid.o \
9 rcupdate.o extable.o params.o posix-timers.o \ 9 rcupdate.o extable.o params.o posix-timers.o \
10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ 10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
11 hrtimer.o 11 hrtimer.o rwsem.o
12 12
13obj-$(CONFIG_STACKTRACE) += stacktrace.o
14obj-y += time/
13obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o 15obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o
16obj-$(CONFIG_LOCKDEP) += lockdep.o
17ifeq ($(CONFIG_PROC_FS),y)
18obj-$(CONFIG_LOCKDEP) += lockdep_proc.o
19endif
14obj-$(CONFIG_FUTEX) += futex.o 20obj-$(CONFIG_FUTEX) += futex.o
15ifeq ($(CONFIG_COMPAT),y) 21ifeq ($(CONFIG_COMPAT),y)
16obj-$(CONFIG_FUTEX) += futex_compat.o 22obj-$(CONFIG_FUTEX) += futex_compat.o
17endif 23endif
24obj-$(CONFIG_RT_MUTEXES) += rtmutex.o
25obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
26obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o
18obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o 27obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
19obj-$(CONFIG_SMP) += cpu.o spinlock.o 28obj-$(CONFIG_SMP) += cpu.o spinlock.o
20obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o 29obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
30obj-$(CONFIG_PROVE_LOCKING) += spinlock.o
21obj-$(CONFIG_UID16) += uid16.o 31obj-$(CONFIG_UID16) += uid16.o
22obj-$(CONFIG_MODULES) += module.o 32obj-$(CONFIG_MODULES) += module.o
23obj-$(CONFIG_KALLSYMS) += kallsyms.o 33obj-$(CONFIG_KALLSYMS) += kallsyms.o
34obj-$(CONFIG_STACK_UNWIND) += unwind.o
24obj-$(CONFIG_PM) += power/ 35obj-$(CONFIG_PM) += power/
25obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o 36obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
26obj-$(CONFIG_KEXEC) += kexec.o 37obj-$(CONFIG_KEXEC) += kexec.o
@@ -37,6 +48,8 @@ obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
37obj-$(CONFIG_SECCOMP) += seccomp.o 48obj-$(CONFIG_SECCOMP) += seccomp.o
38obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o 49obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
39obj-$(CONFIG_RELAY) += relay.o 50obj-$(CONFIG_RELAY) += relay.o
51obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
52obj-$(CONFIG_TASKSTATS) += taskstats.o
40 53
41ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y) 54ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
42# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is 55# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/acct.c b/kernel/acct.c
index 6802020e0ceb..f4330acead46 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -43,7 +43,6 @@
43 * a struct file opened for write. Fixed. 2/6/2000, AV. 43 * a struct file opened for write. Fixed. 2/6/2000, AV.
44 */ 44 */
45 45
46#include <linux/config.h>
47#include <linux/mm.h> 46#include <linux/mm.h>
48#include <linux/slab.h> 47#include <linux/slab.h>
49#include <linux/acct.h> 48#include <linux/acct.h>
@@ -75,7 +74,7 @@ int acct_parm[3] = {4, 2, 30};
75/* 74/*
76 * External references and all of the globals. 75 * External references and all of the globals.
77 */ 76 */
78static void do_acct_process(long, struct file *); 77static void do_acct_process(struct file *);
79 78
80/* 79/*
81 * This structure is used so that all the data protected by lock 80 * This structure is used so that all the data protected by lock
@@ -196,7 +195,7 @@ static void acct_file_reopen(struct file *file)
196 if (old_acct) { 195 if (old_acct) {
197 mnt_unpin(old_acct->f_vfsmnt); 196 mnt_unpin(old_acct->f_vfsmnt);
198 spin_unlock(&acct_globals.lock); 197 spin_unlock(&acct_globals.lock);
199 do_acct_process(0, old_acct); 198 do_acct_process(old_acct);
200 filp_close(old_acct, NULL); 199 filp_close(old_acct, NULL);
201 spin_lock(&acct_globals.lock); 200 spin_lock(&acct_globals.lock);
202 } 201 }
@@ -419,16 +418,15 @@ static u32 encode_float(u64 value)
419/* 418/*
420 * do_acct_process does all actual work. Caller holds the reference to file. 419 * do_acct_process does all actual work. Caller holds the reference to file.
421 */ 420 */
422static void do_acct_process(long exitcode, struct file *file) 421static void do_acct_process(struct file *file)
423{ 422{
423 struct pacct_struct *pacct = &current->signal->pacct;
424 acct_t ac; 424 acct_t ac;
425 mm_segment_t fs; 425 mm_segment_t fs;
426 unsigned long vsize;
427 unsigned long flim; 426 unsigned long flim;
428 u64 elapsed; 427 u64 elapsed;
429 u64 run_time; 428 u64 run_time;
430 struct timespec uptime; 429 struct timespec uptime;
431 unsigned long jiffies;
432 430
433 /* 431 /*
434 * First check to see if there is enough free_space to continue 432 * First check to see if there is enough free_space to continue
@@ -469,12 +467,6 @@ static void do_acct_process(long exitcode, struct file *file)
469#endif 467#endif
470 do_div(elapsed, AHZ); 468 do_div(elapsed, AHZ);
471 ac.ac_btime = xtime.tv_sec - elapsed; 469 ac.ac_btime = xtime.tv_sec - elapsed;
472 jiffies = cputime_to_jiffies(cputime_add(current->utime,
473 current->signal->utime));
474 ac.ac_utime = encode_comp_t(jiffies_to_AHZ(jiffies));
475 jiffies = cputime_to_jiffies(cputime_add(current->stime,
476 current->signal->stime));
477 ac.ac_stime = encode_comp_t(jiffies_to_AHZ(jiffies));
478 /* we really need to bite the bullet and change layout */ 470 /* we really need to bite the bullet and change layout */
479 ac.ac_uid = current->uid; 471 ac.ac_uid = current->uid;
480 ac.ac_gid = current->gid; 472 ac.ac_gid = current->gid;
@@ -491,42 +483,27 @@ static void do_acct_process(long exitcode, struct file *file)
491 ac.ac_ppid = current->parent->tgid; 483 ac.ac_ppid = current->parent->tgid;
492#endif 484#endif
493 485
494 read_lock(&tasklist_lock); /* pin current->signal */ 486 mutex_lock(&tty_mutex);
487 /* FIXME: Whoever is responsible for current->signal locking needs
488 to use the same locking all over the kernel and document it */
489 read_lock(&tasklist_lock);
495 ac.ac_tty = current->signal->tty ? 490 ac.ac_tty = current->signal->tty ?
496 old_encode_dev(tty_devnum(current->signal->tty)) : 0; 491 old_encode_dev(tty_devnum(current->signal->tty)) : 0;
497 read_unlock(&tasklist_lock); 492 read_unlock(&tasklist_lock);
498 493 mutex_unlock(&tty_mutex);
499 ac.ac_flag = 0; 494
500 if (current->flags & PF_FORKNOEXEC) 495 spin_lock_irq(&current->sighand->siglock);
501 ac.ac_flag |= AFORK; 496 ac.ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime)));
502 if (current->flags & PF_SUPERPRIV) 497 ac.ac_stime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_stime)));
503 ac.ac_flag |= ASU; 498 ac.ac_flag = pacct->ac_flag;
504 if (current->flags & PF_DUMPCORE) 499 ac.ac_mem = encode_comp_t(pacct->ac_mem);
505 ac.ac_flag |= ACORE; 500 ac.ac_minflt = encode_comp_t(pacct->ac_minflt);
506 if (current->flags & PF_SIGNALED) 501 ac.ac_majflt = encode_comp_t(pacct->ac_majflt);
507 ac.ac_flag |= AXSIG; 502 ac.ac_exitcode = pacct->ac_exitcode;
508 503 spin_unlock_irq(&current->sighand->siglock);
509 vsize = 0;
510 if (current->mm) {
511 struct vm_area_struct *vma;
512 down_read(&current->mm->mmap_sem);
513 vma = current->mm->mmap;
514 while (vma) {
515 vsize += vma->vm_end - vma->vm_start;
516 vma = vma->vm_next;
517 }
518 up_read(&current->mm->mmap_sem);
519 }
520 vsize = vsize / 1024;
521 ac.ac_mem = encode_comp_t(vsize);
522 ac.ac_io = encode_comp_t(0 /* current->io_usage */); /* %% */ 504 ac.ac_io = encode_comp_t(0 /* current->io_usage */); /* %% */
523 ac.ac_rw = encode_comp_t(ac.ac_io / 1024); 505 ac.ac_rw = encode_comp_t(ac.ac_io / 1024);
524 ac.ac_minflt = encode_comp_t(current->signal->min_flt +
525 current->min_flt);
526 ac.ac_majflt = encode_comp_t(current->signal->maj_flt +
527 current->maj_flt);
528 ac.ac_swaps = encode_comp_t(0); 506 ac.ac_swaps = encode_comp_t(0);
529 ac.ac_exitcode = exitcode;
530 507
531 /* 508 /*
532 * Kernel segment override to datasegment and write it 509 * Kernel segment override to datasegment and write it
@@ -546,12 +523,64 @@ static void do_acct_process(long exitcode, struct file *file)
546} 523}
547 524
548/** 525/**
526 * acct_init_pacct - initialize a new pacct_struct
527 * @pacct: per-process accounting info struct to initialize
528 */
529void acct_init_pacct(struct pacct_struct *pacct)
530{
531 memset(pacct, 0, sizeof(struct pacct_struct));
532 pacct->ac_utime = pacct->ac_stime = cputime_zero;
533}
534
535/**
536 * acct_collect - collect accounting information into pacct_struct
537 * @exitcode: task exit code
538 * @group_dead: not 0, if this thread is the last one in the process.
539 */
540void acct_collect(long exitcode, int group_dead)
541{
542 struct pacct_struct *pacct = &current->signal->pacct;
543 unsigned long vsize = 0;
544
545 if (group_dead && current->mm) {
546 struct vm_area_struct *vma;
547 down_read(&current->mm->mmap_sem);
548 vma = current->mm->mmap;
549 while (vma) {
550 vsize += vma->vm_end - vma->vm_start;
551 vma = vma->vm_next;
552 }
553 up_read(&current->mm->mmap_sem);
554 }
555
556 spin_lock_irq(&current->sighand->siglock);
557 if (group_dead)
558 pacct->ac_mem = vsize / 1024;
559 if (thread_group_leader(current)) {
560 pacct->ac_exitcode = exitcode;
561 if (current->flags & PF_FORKNOEXEC)
562 pacct->ac_flag |= AFORK;
563 }
564 if (current->flags & PF_SUPERPRIV)
565 pacct->ac_flag |= ASU;
566 if (current->flags & PF_DUMPCORE)
567 pacct->ac_flag |= ACORE;
568 if (current->flags & PF_SIGNALED)
569 pacct->ac_flag |= AXSIG;
570 pacct->ac_utime = cputime_add(pacct->ac_utime, current->utime);
571 pacct->ac_stime = cputime_add(pacct->ac_stime, current->stime);
572 pacct->ac_minflt += current->min_flt;
573 pacct->ac_majflt += current->maj_flt;
574 spin_unlock_irq(&current->sighand->siglock);
575}
576
577/**
549 * acct_process - now just a wrapper around do_acct_process 578 * acct_process - now just a wrapper around do_acct_process
550 * @exitcode: task exit code 579 * @exitcode: task exit code
551 * 580 *
552 * handles process accounting for an exiting task 581 * handles process accounting for an exiting task
553 */ 582 */
554void acct_process(long exitcode) 583void acct_process(void)
555{ 584{
556 struct file *file = NULL; 585 struct file *file = NULL;
557 586
@@ -570,7 +599,7 @@ void acct_process(long exitcode)
570 get_file(file); 599 get_file(file);
571 spin_unlock(&acct_globals.lock); 600 spin_unlock(&acct_globals.lock);
572 601
573 do_acct_process(exitcode, file); 602 do_acct_process(file);
574 fput(file); 603 fput(file);
575} 604}
576 605
@@ -599,9 +628,7 @@ void acct_update_integrals(struct task_struct *tsk)
599 */ 628 */
600void acct_clear_integrals(struct task_struct *tsk) 629void acct_clear_integrals(struct task_struct *tsk)
601{ 630{
602 if (tsk) { 631 tsk->acct_stimexpd = 0;
603 tsk->acct_stimexpd = 0; 632 tsk->acct_rss_mem1 = 0;
604 tsk->acct_rss_mem1 = 0; 633 tsk->acct_vm_mem1 = 0;
605 tsk->acct_vm_mem1 = 0;
606 }
607} 634}
diff --git a/kernel/audit.c b/kernel/audit.c
index 7dfac7031bd7..f9889ee77825 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -244,7 +244,7 @@ static int audit_set_rate_limit(int limit, uid_t loginuid, u32 sid)
244 char *ctx = NULL; 244 char *ctx = NULL;
245 u32 len; 245 u32 len;
246 int rc; 246 int rc;
247 if ((rc = selinux_ctxid_to_string(sid, &ctx, &len))) 247 if ((rc = selinux_sid_to_string(sid, &ctx, &len)))
248 return rc; 248 return rc;
249 else 249 else
250 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, 250 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
@@ -267,7 +267,7 @@ static int audit_set_backlog_limit(int limit, uid_t loginuid, u32 sid)
267 char *ctx = NULL; 267 char *ctx = NULL;
268 u32 len; 268 u32 len;
269 int rc; 269 int rc;
270 if ((rc = selinux_ctxid_to_string(sid, &ctx, &len))) 270 if ((rc = selinux_sid_to_string(sid, &ctx, &len)))
271 return rc; 271 return rc;
272 else 272 else
273 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, 273 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
@@ -293,7 +293,7 @@ static int audit_set_enabled(int state, uid_t loginuid, u32 sid)
293 char *ctx = NULL; 293 char *ctx = NULL;
294 u32 len; 294 u32 len;
295 int rc; 295 int rc;
296 if ((rc = selinux_ctxid_to_string(sid, &ctx, &len))) 296 if ((rc = selinux_sid_to_string(sid, &ctx, &len)))
297 return rc; 297 return rc;
298 else 298 else
299 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, 299 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
@@ -321,7 +321,7 @@ static int audit_set_failure(int state, uid_t loginuid, u32 sid)
321 char *ctx = NULL; 321 char *ctx = NULL;
322 u32 len; 322 u32 len;
323 int rc; 323 int rc;
324 if ((rc = selinux_ctxid_to_string(sid, &ctx, &len))) 324 if ((rc = selinux_sid_to_string(sid, &ctx, &len)))
325 return rc; 325 return rc;
326 else 326 else
327 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE, 327 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
@@ -445,7 +445,7 @@ void audit_send_reply(int pid, int seq, int type, int done, int multi,
445 * Check for appropriate CAP_AUDIT_ capabilities on incoming audit 445 * Check for appropriate CAP_AUDIT_ capabilities on incoming audit
446 * control messages. 446 * control messages.
447 */ 447 */
448static int audit_netlink_ok(kernel_cap_t eff_cap, u16 msg_type) 448static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type)
449{ 449{
450 int err = 0; 450 int err = 0;
451 451
@@ -459,13 +459,13 @@ static int audit_netlink_ok(kernel_cap_t eff_cap, u16 msg_type)
459 case AUDIT_DEL: 459 case AUDIT_DEL:
460 case AUDIT_DEL_RULE: 460 case AUDIT_DEL_RULE:
461 case AUDIT_SIGNAL_INFO: 461 case AUDIT_SIGNAL_INFO:
462 if (!cap_raised(eff_cap, CAP_AUDIT_CONTROL)) 462 if (security_netlink_recv(skb, CAP_AUDIT_CONTROL))
463 err = -EPERM; 463 err = -EPERM;
464 break; 464 break;
465 case AUDIT_USER: 465 case AUDIT_USER:
466 case AUDIT_FIRST_USER_MSG...AUDIT_LAST_USER_MSG: 466 case AUDIT_FIRST_USER_MSG...AUDIT_LAST_USER_MSG:
467 case AUDIT_FIRST_USER_MSG2...AUDIT_LAST_USER_MSG2: 467 case AUDIT_FIRST_USER_MSG2...AUDIT_LAST_USER_MSG2:
468 if (!cap_raised(eff_cap, CAP_AUDIT_WRITE)) 468 if (security_netlink_recv(skb, CAP_AUDIT_WRITE))
469 err = -EPERM; 469 err = -EPERM;
470 break; 470 break;
471 default: /* bad msg */ 471 default: /* bad msg */
@@ -488,7 +488,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
488 char *ctx; 488 char *ctx;
489 u32 len; 489 u32 len;
490 490
491 err = audit_netlink_ok(NETLINK_CB(skb).eff_cap, msg_type); 491 err = audit_netlink_ok(skb, msg_type);
492 if (err) 492 if (err)
493 return err; 493 return err;
494 494
@@ -538,7 +538,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
538 if (status_get->mask & AUDIT_STATUS_PID) { 538 if (status_get->mask & AUDIT_STATUS_PID) {
539 int old = audit_pid; 539 int old = audit_pid;
540 if (sid) { 540 if (sid) {
541 if ((err = selinux_ctxid_to_string( 541 if ((err = selinux_sid_to_string(
542 sid, &ctx, &len))) 542 sid, &ctx, &len)))
543 return err; 543 return err;
544 else 544 else
@@ -576,7 +576,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
576 "user pid=%d uid=%u auid=%u", 576 "user pid=%d uid=%u auid=%u",
577 pid, uid, loginuid); 577 pid, uid, loginuid);
578 if (sid) { 578 if (sid) {
579 if (selinux_ctxid_to_string( 579 if (selinux_sid_to_string(
580 sid, &ctx, &len)) { 580 sid, &ctx, &len)) {
581 audit_log_format(ab, 581 audit_log_format(ab,
582 " ssid=%u", sid); 582 " ssid=%u", sid);
@@ -614,7 +614,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
614 loginuid, sid); 614 loginuid, sid);
615 break; 615 break;
616 case AUDIT_SIGNAL_INFO: 616 case AUDIT_SIGNAL_INFO:
617 err = selinux_ctxid_to_string(audit_sig_sid, &ctx, &len); 617 err = selinux_sid_to_string(audit_sig_sid, &ctx, &len);
618 if (err) 618 if (err)
619 return err; 619 return err;
620 sig_data = kmalloc(sizeof(*sig_data) + len, GFP_KERNEL); 620 sig_data = kmalloc(sizeof(*sig_data) + len, GFP_KERNEL);
@@ -690,9 +690,7 @@ static const struct inotify_operations audit_inotify_ops = {
690/* Initialize audit support at boot time. */ 690/* Initialize audit support at boot time. */
691static int __init audit_init(void) 691static int __init audit_init(void)
692{ 692{
693#ifdef CONFIG_AUDITSYSCALL
694 int i; 693 int i;
695#endif
696 694
697 printk(KERN_INFO "audit: initializing netlink socket (%s)\n", 695 printk(KERN_INFO "audit: initializing netlink socket (%s)\n",
698 audit_default ? "enabled" : "disabled"); 696 audit_default ? "enabled" : "disabled");
@@ -717,10 +715,10 @@ static int __init audit_init(void)
717 audit_ih = inotify_init(&audit_inotify_ops); 715 audit_ih = inotify_init(&audit_inotify_ops);
718 if (IS_ERR(audit_ih)) 716 if (IS_ERR(audit_ih))
719 audit_panic("cannot initialize inotify handle"); 717 audit_panic("cannot initialize inotify handle");
718#endif
720 719
721 for (i = 0; i < AUDIT_INODE_BUCKETS; i++) 720 for (i = 0; i < AUDIT_INODE_BUCKETS; i++)
722 INIT_LIST_HEAD(&audit_inode_hash[i]); 721 INIT_LIST_HEAD(&audit_inode_hash[i]);
723#endif
724 722
725 return 0; 723 return 0;
726} 724}
@@ -818,7 +816,7 @@ err:
818 */ 816 */
819unsigned int audit_serial(void) 817unsigned int audit_serial(void)
820{ 818{
821 static spinlock_t serial_lock = SPIN_LOCK_UNLOCKED; 819 static DEFINE_SPINLOCK(serial_lock);
822 static unsigned int serial = 0; 820 static unsigned int serial = 0;
823 821
824 unsigned long flags; 822 unsigned long flags;
@@ -1030,6 +1028,9 @@ void audit_log_hex(struct audit_buffer *ab, const unsigned char *buf,
1030 struct sk_buff *skb; 1028 struct sk_buff *skb;
1031 static const unsigned char *hex = "0123456789ABCDEF"; 1029 static const unsigned char *hex = "0123456789ABCDEF";
1032 1030
1031 if (!ab)
1032 return;
1033
1033 BUG_ON(!ab->skb); 1034 BUG_ON(!ab->skb);
1034 skb = ab->skb; 1035 skb = ab->skb;
1035 avail = skb_tailroom(skb); 1036 avail = skb_tailroom(skb);
@@ -1062,6 +1063,9 @@ static void audit_log_n_string(struct audit_buffer *ab, size_t slen,
1062 unsigned char *ptr; 1063 unsigned char *ptr;
1063 struct sk_buff *skb; 1064 struct sk_buff *skb;
1064 1065
1066 if (!ab)
1067 return;
1068
1065 BUG_ON(!ab->skb); 1069 BUG_ON(!ab->skb);
1066 skb = ab->skb; 1070 skb = ab->skb;
1067 avail = skb_tailroom(skb); 1071 avail = skb_tailroom(skb);
diff --git a/kernel/audit.h b/kernel/audit.h
index 8323e4132a33..a3370232a390 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -81,6 +81,7 @@ struct audit_krule {
81 u32 mask[AUDIT_BITMASK_SIZE]; 81 u32 mask[AUDIT_BITMASK_SIZE];
82 u32 buflen; /* for data alloc on list rules */ 82 u32 buflen; /* for data alloc on list rules */
83 u32 field_count; 83 u32 field_count;
84 char *filterkey; /* ties events to rules */
84 struct audit_field *fields; 85 struct audit_field *fields;
85 struct audit_field *inode_f; /* quick access to an inode field */ 86 struct audit_field *inode_f; /* quick access to an inode field */
86 struct audit_watch *watch; /* associated watch */ 87 struct audit_watch *watch; /* associated watch */
@@ -103,6 +104,7 @@ static inline int audit_hash_ino(u32 ino)
103 return (ino & (AUDIT_INODE_BUCKETS-1)); 104 return (ino & (AUDIT_INODE_BUCKETS-1));
104} 105}
105 106
107extern int audit_match_class(int class, unsigned syscall);
106extern int audit_comparator(const u32 left, const u32 op, const u32 right); 108extern int audit_comparator(const u32 left, const u32 op, const u32 right);
107extern int audit_compare_dname_path(const char *dname, const char *path, 109extern int audit_compare_dname_path(const char *dname, const char *path,
108 int *dirlen); 110 int *dirlen);
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 4c99d2c586ed..1a58a81fb09d 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -141,6 +141,7 @@ static inline void audit_free_rule(struct audit_entry *e)
141 selinux_audit_rule_free(f->se_rule); 141 selinux_audit_rule_free(f->se_rule);
142 } 142 }
143 kfree(e->rule.fields); 143 kfree(e->rule.fields);
144 kfree(e->rule.filterkey);
144 kfree(e); 145 kfree(e);
145} 146}
146 147
@@ -278,6 +279,38 @@ static int audit_to_watch(struct audit_krule *krule, char *path, int len,
278 return 0; 279 return 0;
279} 280}
280 281
282static __u32 *classes[AUDIT_SYSCALL_CLASSES];
283
284int __init audit_register_class(int class, unsigned *list)
285{
286 __u32 *p = kzalloc(AUDIT_BITMASK_SIZE * sizeof(__u32), GFP_KERNEL);
287 if (!p)
288 return -ENOMEM;
289 while (*list != ~0U) {
290 unsigned n = *list++;
291 if (n >= AUDIT_BITMASK_SIZE * 32 - AUDIT_SYSCALL_CLASSES) {
292 kfree(p);
293 return -EINVAL;
294 }
295 p[AUDIT_WORD(n)] |= AUDIT_BIT(n);
296 }
297 if (class >= AUDIT_SYSCALL_CLASSES || classes[class]) {
298 kfree(p);
299 return -EINVAL;
300 }
301 classes[class] = p;
302 return 0;
303}
304
305int audit_match_class(int class, unsigned syscall)
306{
307 if (unlikely(syscall >= AUDIT_BITMASK_SIZE * sizeof(__u32)))
308 return 0;
309 if (unlikely(class >= AUDIT_SYSCALL_CLASSES || !classes[class]))
310 return 0;
311 return classes[class][AUDIT_WORD(syscall)] & AUDIT_BIT(syscall);
312}
313
281/* Common user-space to kernel rule translation. */ 314/* Common user-space to kernel rule translation. */
282static inline struct audit_entry *audit_to_entry_common(struct audit_rule *rule) 315static inline struct audit_entry *audit_to_entry_common(struct audit_rule *rule)
283{ 316{
@@ -321,6 +354,22 @@ static inline struct audit_entry *audit_to_entry_common(struct audit_rule *rule)
321 for (i = 0; i < AUDIT_BITMASK_SIZE; i++) 354 for (i = 0; i < AUDIT_BITMASK_SIZE; i++)
322 entry->rule.mask[i] = rule->mask[i]; 355 entry->rule.mask[i] = rule->mask[i];
323 356
357 for (i = 0; i < AUDIT_SYSCALL_CLASSES; i++) {
358 int bit = AUDIT_BITMASK_SIZE * 32 - i - 1;
359 __u32 *p = &entry->rule.mask[AUDIT_WORD(bit)];
360 __u32 *class;
361
362 if (!(*p & AUDIT_BIT(bit)))
363 continue;
364 *p &= ~AUDIT_BIT(bit);
365 class = classes[i];
366 if (class) {
367 int j;
368 for (j = 0; j < AUDIT_BITMASK_SIZE; j++)
369 entry->rule.mask[j] |= class[j];
370 }
371 }
372
324 return entry; 373 return entry;
325 374
326exit_err: 375exit_err:
@@ -364,6 +413,7 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule)
364 case AUDIT_PERS: 413 case AUDIT_PERS:
365 case AUDIT_ARCH: 414 case AUDIT_ARCH:
366 case AUDIT_MSGTYPE: 415 case AUDIT_MSGTYPE:
416 case AUDIT_PPID:
367 case AUDIT_DEVMAJOR: 417 case AUDIT_DEVMAJOR:
368 case AUDIT_DEVMINOR: 418 case AUDIT_DEVMINOR:
369 case AUDIT_EXIT: 419 case AUDIT_EXIT:
@@ -373,6 +423,10 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule)
373 case AUDIT_ARG2: 423 case AUDIT_ARG2:
374 case AUDIT_ARG3: 424 case AUDIT_ARG3:
375 break; 425 break;
426 case AUDIT_PERM:
427 if (f->val & ~15)
428 goto exit_free;
429 break;
376 case AUDIT_INODE: 430 case AUDIT_INODE:
377 err = audit_to_inode(&entry->rule, f); 431 err = audit_to_inode(&entry->rule, f);
378 if (err) 432 if (err)
@@ -402,6 +456,7 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule)
402 case AUDIT_EQUAL: 456 case AUDIT_EQUAL:
403 break; 457 break;
404 default: 458 default:
459 err = -EINVAL;
405 goto exit_free; 460 goto exit_free;
406 } 461 }
407 } 462 }
@@ -469,11 +524,16 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
469 case AUDIT_ARG2: 524 case AUDIT_ARG2:
470 case AUDIT_ARG3: 525 case AUDIT_ARG3:
471 break; 526 break;
472 case AUDIT_SE_USER: 527 case AUDIT_SUBJ_USER:
473 case AUDIT_SE_ROLE: 528 case AUDIT_SUBJ_ROLE:
474 case AUDIT_SE_TYPE: 529 case AUDIT_SUBJ_TYPE:
475 case AUDIT_SE_SEN: 530 case AUDIT_SUBJ_SEN:
476 case AUDIT_SE_CLR: 531 case AUDIT_SUBJ_CLR:
532 case AUDIT_OBJ_USER:
533 case AUDIT_OBJ_ROLE:
534 case AUDIT_OBJ_TYPE:
535 case AUDIT_OBJ_LEV_LOW:
536 case AUDIT_OBJ_LEV_HIGH:
477 str = audit_unpack_string(&bufp, &remain, f->val); 537 str = audit_unpack_string(&bufp, &remain, f->val);
478 if (IS_ERR(str)) 538 if (IS_ERR(str))
479 goto exit_free; 539 goto exit_free;
@@ -511,6 +571,20 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
511 if (err) 571 if (err)
512 goto exit_free; 572 goto exit_free;
513 break; 573 break;
574 case AUDIT_FILTERKEY:
575 err = -EINVAL;
576 if (entry->rule.filterkey || f->val > AUDIT_MAX_KEY_LEN)
577 goto exit_free;
578 str = audit_unpack_string(&bufp, &remain, f->val);
579 if (IS_ERR(str))
580 goto exit_free;
581 entry->rule.buflen += f->val;
582 entry->rule.filterkey = str;
583 break;
584 case AUDIT_PERM:
585 if (f->val & ~15)
586 goto exit_free;
587 break;
514 default: 588 default:
515 goto exit_free; 589 goto exit_free;
516 } 590 }
@@ -524,6 +598,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
524 case AUDIT_EQUAL: 598 case AUDIT_EQUAL:
525 break; 599 break;
526 default: 600 default:
601 err = -EINVAL;
527 goto exit_free; 602 goto exit_free;
528 } 603 }
529 } 604 }
@@ -600,11 +675,16 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule)
600 data->fields[i] = f->type; 675 data->fields[i] = f->type;
601 data->fieldflags[i] = f->op; 676 data->fieldflags[i] = f->op;
602 switch(f->type) { 677 switch(f->type) {
603 case AUDIT_SE_USER: 678 case AUDIT_SUBJ_USER:
604 case AUDIT_SE_ROLE: 679 case AUDIT_SUBJ_ROLE:
605 case AUDIT_SE_TYPE: 680 case AUDIT_SUBJ_TYPE:
606 case AUDIT_SE_SEN: 681 case AUDIT_SUBJ_SEN:
607 case AUDIT_SE_CLR: 682 case AUDIT_SUBJ_CLR:
683 case AUDIT_OBJ_USER:
684 case AUDIT_OBJ_ROLE:
685 case AUDIT_OBJ_TYPE:
686 case AUDIT_OBJ_LEV_LOW:
687 case AUDIT_OBJ_LEV_HIGH:
608 data->buflen += data->values[i] = 688 data->buflen += data->values[i] =
609 audit_pack_string(&bufp, f->se_str); 689 audit_pack_string(&bufp, f->se_str);
610 break; 690 break;
@@ -612,6 +692,10 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule)
612 data->buflen += data->values[i] = 692 data->buflen += data->values[i] =
613 audit_pack_string(&bufp, krule->watch->path); 693 audit_pack_string(&bufp, krule->watch->path);
614 break; 694 break;
695 case AUDIT_FILTERKEY:
696 data->buflen += data->values[i] =
697 audit_pack_string(&bufp, krule->filterkey);
698 break;
615 default: 699 default:
616 data->values[i] = f->val; 700 data->values[i] = f->val;
617 } 701 }
@@ -639,11 +723,16 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b)
639 return 1; 723 return 1;
640 724
641 switch(a->fields[i].type) { 725 switch(a->fields[i].type) {
642 case AUDIT_SE_USER: 726 case AUDIT_SUBJ_USER:
643 case AUDIT_SE_ROLE: 727 case AUDIT_SUBJ_ROLE:
644 case AUDIT_SE_TYPE: 728 case AUDIT_SUBJ_TYPE:
645 case AUDIT_SE_SEN: 729 case AUDIT_SUBJ_SEN:
646 case AUDIT_SE_CLR: 730 case AUDIT_SUBJ_CLR:
731 case AUDIT_OBJ_USER:
732 case AUDIT_OBJ_ROLE:
733 case AUDIT_OBJ_TYPE:
734 case AUDIT_OBJ_LEV_LOW:
735 case AUDIT_OBJ_LEV_HIGH:
647 if (strcmp(a->fields[i].se_str, b->fields[i].se_str)) 736 if (strcmp(a->fields[i].se_str, b->fields[i].se_str))
648 return 1; 737 return 1;
649 break; 738 break;
@@ -651,6 +740,11 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b)
651 if (strcmp(a->watch->path, b->watch->path)) 740 if (strcmp(a->watch->path, b->watch->path))
652 return 1; 741 return 1;
653 break; 742 break;
743 case AUDIT_FILTERKEY:
744 /* both filterkeys exist based on above type compare */
745 if (strcmp(a->filterkey, b->filterkey))
746 return 1;
747 break;
654 default: 748 default:
655 if (a->fields[i].val != b->fields[i].val) 749 if (a->fields[i].val != b->fields[i].val)
656 return 1; 750 return 1;
@@ -730,6 +824,7 @@ static struct audit_entry *audit_dupe_rule(struct audit_krule *old,
730 u32 fcount = old->field_count; 824 u32 fcount = old->field_count;
731 struct audit_entry *entry; 825 struct audit_entry *entry;
732 struct audit_krule *new; 826 struct audit_krule *new;
827 char *fk;
733 int i, err = 0; 828 int i, err = 0;
734 829
735 entry = audit_init_entry(fcount); 830 entry = audit_init_entry(fcount);
@@ -753,13 +848,25 @@ static struct audit_entry *audit_dupe_rule(struct audit_krule *old,
753 * the originals will all be freed when the old rule is freed. */ 848 * the originals will all be freed when the old rule is freed. */
754 for (i = 0; i < fcount; i++) { 849 for (i = 0; i < fcount; i++) {
755 switch (new->fields[i].type) { 850 switch (new->fields[i].type) {
756 case AUDIT_SE_USER: 851 case AUDIT_SUBJ_USER:
757 case AUDIT_SE_ROLE: 852 case AUDIT_SUBJ_ROLE:
758 case AUDIT_SE_TYPE: 853 case AUDIT_SUBJ_TYPE:
759 case AUDIT_SE_SEN: 854 case AUDIT_SUBJ_SEN:
760 case AUDIT_SE_CLR: 855 case AUDIT_SUBJ_CLR:
856 case AUDIT_OBJ_USER:
857 case AUDIT_OBJ_ROLE:
858 case AUDIT_OBJ_TYPE:
859 case AUDIT_OBJ_LEV_LOW:
860 case AUDIT_OBJ_LEV_HIGH:
761 err = audit_dupe_selinux_field(&new->fields[i], 861 err = audit_dupe_selinux_field(&new->fields[i],
762 &old->fields[i]); 862 &old->fields[i]);
863 break;
864 case AUDIT_FILTERKEY:
865 fk = kstrdup(old->filterkey, GFP_KERNEL);
866 if (unlikely(!fk))
867 err = -ENOMEM;
868 else
869 new->filterkey = fk;
763 } 870 }
764 if (err) { 871 if (err) {
765 audit_free_rule(entry); 872 audit_free_rule(entry);
@@ -824,7 +931,7 @@ static void audit_update_watch(struct audit_parent *parent,
824 } 931 }
825 932
826 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE); 933 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
827 audit_log_format(ab, "audit updated rules specifying watch="); 934 audit_log_format(ab, "audit updated rules specifying path=");
828 audit_log_untrustedstring(ab, owatch->path); 935 audit_log_untrustedstring(ab, owatch->path);
829 audit_log_format(ab, " with dev=%u ino=%lu\n", dev, ino); 936 audit_log_format(ab, " with dev=%u ino=%lu\n", dev, ino);
830 audit_log_end(ab); 937 audit_log_end(ab);
@@ -847,19 +954,28 @@ static void audit_remove_parent_watches(struct audit_parent *parent)
847 struct audit_watch *w, *nextw; 954 struct audit_watch *w, *nextw;
848 struct audit_krule *r, *nextr; 955 struct audit_krule *r, *nextr;
849 struct audit_entry *e; 956 struct audit_entry *e;
957 struct audit_buffer *ab;
850 958
851 mutex_lock(&audit_filter_mutex); 959 mutex_lock(&audit_filter_mutex);
852 parent->flags |= AUDIT_PARENT_INVALID; 960 parent->flags |= AUDIT_PARENT_INVALID;
853 list_for_each_entry_safe(w, nextw, &parent->watches, wlist) { 961 list_for_each_entry_safe(w, nextw, &parent->watches, wlist) {
854 list_for_each_entry_safe(r, nextr, &w->rules, rlist) { 962 list_for_each_entry_safe(r, nextr, &w->rules, rlist) {
855 e = container_of(r, struct audit_entry, rule); 963 e = container_of(r, struct audit_entry, rule);
964
965 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
966 audit_log_format(ab, "audit implicitly removed rule path=");
967 audit_log_untrustedstring(ab, w->path);
968 if (r->filterkey) {
969 audit_log_format(ab, " key=");
970 audit_log_untrustedstring(ab, r->filterkey);
971 } else
972 audit_log_format(ab, " key=(null)");
973 audit_log_format(ab, " list=%d", r->listnr);
974 audit_log_end(ab);
975
856 list_del(&r->rlist); 976 list_del(&r->rlist);
857 list_del_rcu(&e->list); 977 list_del_rcu(&e->list);
858 call_rcu(&e->rcu, audit_free_rule_rcu); 978 call_rcu(&e->rcu, audit_free_rule_rcu);
859
860 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
861 "audit implicitly removed rule from list=%d\n",
862 AUDIT_FILTER_EXIT);
863 } 979 }
864 audit_remove_watch(w); 980 audit_remove_watch(w);
865 } 981 }
@@ -1047,6 +1163,14 @@ static inline int audit_add_rule(struct audit_entry *entry,
1047 struct audit_watch *watch = entry->rule.watch; 1163 struct audit_watch *watch = entry->rule.watch;
1048 struct nameidata *ndp, *ndw; 1164 struct nameidata *ndp, *ndw;
1049 int h, err, putnd_needed = 0; 1165 int h, err, putnd_needed = 0;
1166#ifdef CONFIG_AUDITSYSCALL
1167 int dont_count = 0;
1168
1169 /* If either of these, don't count towards total */
1170 if (entry->rule.listnr == AUDIT_FILTER_USER ||
1171 entry->rule.listnr == AUDIT_FILTER_TYPE)
1172 dont_count = 1;
1173#endif
1050 1174
1051 if (inode_f) { 1175 if (inode_f) {
1052 h = audit_hash_ino(inode_f->val); 1176 h = audit_hash_ino(inode_f->val);
@@ -1087,6 +1211,10 @@ static inline int audit_add_rule(struct audit_entry *entry,
1087 } else { 1211 } else {
1088 list_add_tail_rcu(&entry->list, list); 1212 list_add_tail_rcu(&entry->list, list);
1089 } 1213 }
1214#ifdef CONFIG_AUDITSYSCALL
1215 if (!dont_count)
1216 audit_n_rules++;
1217#endif
1090 mutex_unlock(&audit_filter_mutex); 1218 mutex_unlock(&audit_filter_mutex);
1091 1219
1092 if (putnd_needed) 1220 if (putnd_needed)
@@ -1111,6 +1239,14 @@ static inline int audit_del_rule(struct audit_entry *entry,
1111 struct audit_watch *watch, *tmp_watch = entry->rule.watch; 1239 struct audit_watch *watch, *tmp_watch = entry->rule.watch;
1112 LIST_HEAD(inotify_list); 1240 LIST_HEAD(inotify_list);
1113 int h, ret = 0; 1241 int h, ret = 0;
1242#ifdef CONFIG_AUDITSYSCALL
1243 int dont_count = 0;
1244
1245 /* If either of these, don't count towards total */
1246 if (entry->rule.listnr == AUDIT_FILTER_USER ||
1247 entry->rule.listnr == AUDIT_FILTER_TYPE)
1248 dont_count = 1;
1249#endif
1114 1250
1115 if (inode_f) { 1251 if (inode_f) {
1116 h = audit_hash_ino(inode_f->val); 1252 h = audit_hash_ino(inode_f->val);
@@ -1148,6 +1284,10 @@ static inline int audit_del_rule(struct audit_entry *entry,
1148 list_del_rcu(&e->list); 1284 list_del_rcu(&e->list);
1149 call_rcu(&e->rcu, audit_free_rule_rcu); 1285 call_rcu(&e->rcu, audit_free_rule_rcu);
1150 1286
1287#ifdef CONFIG_AUDITSYSCALL
1288 if (!dont_count)
1289 audit_n_rules--;
1290#endif
1151 mutex_unlock(&audit_filter_mutex); 1291 mutex_unlock(&audit_filter_mutex);
1152 1292
1153 if (!list_empty(&inotify_list)) 1293 if (!list_empty(&inotify_list))
@@ -1245,6 +1385,34 @@ static void audit_list_rules(int pid, int seq, struct sk_buff_head *q)
1245 skb_queue_tail(q, skb); 1385 skb_queue_tail(q, skb);
1246} 1386}
1247 1387
1388/* Log rule additions and removals */
1389static void audit_log_rule_change(uid_t loginuid, u32 sid, char *action,
1390 struct audit_krule *rule, int res)
1391{
1392 struct audit_buffer *ab;
1393
1394 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
1395 if (!ab)
1396 return;
1397 audit_log_format(ab, "auid=%u", loginuid);
1398 if (sid) {
1399 char *ctx = NULL;
1400 u32 len;
1401 if (selinux_sid_to_string(sid, &ctx, &len))
1402 audit_log_format(ab, " ssid=%u", sid);
1403 else
1404 audit_log_format(ab, " subj=%s", ctx);
1405 kfree(ctx);
1406 }
1407 audit_log_format(ab, " %s rule key=", action);
1408 if (rule->filterkey)
1409 audit_log_untrustedstring(ab, rule->filterkey);
1410 else
1411 audit_log_format(ab, "(null)");
1412 audit_log_format(ab, " list=%d res=%d", rule->listnr, res);
1413 audit_log_end(ab);
1414}
1415
1248/** 1416/**
1249 * audit_receive_filter - apply all rules to the specified message type 1417 * audit_receive_filter - apply all rules to the specified message type
1250 * @type: audit message type 1418 * @type: audit message type
@@ -1304,24 +1472,7 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
1304 1472
1305 err = audit_add_rule(entry, 1473 err = audit_add_rule(entry,
1306 &audit_filter_list[entry->rule.listnr]); 1474 &audit_filter_list[entry->rule.listnr]);
1307 1475 audit_log_rule_change(loginuid, sid, "add", &entry->rule, !err);
1308 if (sid) {
1309 char *ctx = NULL;
1310 u32 len;
1311 if (selinux_ctxid_to_string(sid, &ctx, &len)) {
1312 /* Maybe call audit_panic? */
1313 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
1314 "auid=%u ssid=%u add rule to list=%d res=%d",
1315 loginuid, sid, entry->rule.listnr, !err);
1316 } else
1317 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
1318 "auid=%u subj=%s add rule to list=%d res=%d",
1319 loginuid, ctx, entry->rule.listnr, !err);
1320 kfree(ctx);
1321 } else
1322 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
1323 "auid=%u add rule to list=%d res=%d",
1324 loginuid, entry->rule.listnr, !err);
1325 1476
1326 if (err) 1477 if (err)
1327 audit_free_rule(entry); 1478 audit_free_rule(entry);
@@ -1337,24 +1488,8 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
1337 1488
1338 err = audit_del_rule(entry, 1489 err = audit_del_rule(entry,
1339 &audit_filter_list[entry->rule.listnr]); 1490 &audit_filter_list[entry->rule.listnr]);
1340 1491 audit_log_rule_change(loginuid, sid, "remove", &entry->rule,
1341 if (sid) { 1492 !err);
1342 char *ctx = NULL;
1343 u32 len;
1344 if (selinux_ctxid_to_string(sid, &ctx, &len)) {
1345 /* Maybe call audit_panic? */
1346 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
1347 "auid=%u ssid=%u remove rule from list=%d res=%d",
1348 loginuid, sid, entry->rule.listnr, !err);
1349 } else
1350 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
1351 "auid=%u subj=%s remove rule from list=%d res=%d",
1352 loginuid, ctx, entry->rule.listnr, !err);
1353 kfree(ctx);
1354 } else
1355 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
1356 "auid=%u remove rule from list=%d res=%d",
1357 loginuid, entry->rule.listnr, !err);
1358 1493
1359 audit_free_rule(entry); 1494 audit_free_rule(entry);
1360 break; 1495 break;
@@ -1514,11 +1649,16 @@ static inline int audit_rule_has_selinux(struct audit_krule *rule)
1514 for (i = 0; i < rule->field_count; i++) { 1649 for (i = 0; i < rule->field_count; i++) {
1515 struct audit_field *f = &rule->fields[i]; 1650 struct audit_field *f = &rule->fields[i];
1516 switch (f->type) { 1651 switch (f->type) {
1517 case AUDIT_SE_USER: 1652 case AUDIT_SUBJ_USER:
1518 case AUDIT_SE_ROLE: 1653 case AUDIT_SUBJ_ROLE:
1519 case AUDIT_SE_TYPE: 1654 case AUDIT_SUBJ_TYPE:
1520 case AUDIT_SE_SEN: 1655 case AUDIT_SUBJ_SEN:
1521 case AUDIT_SE_CLR: 1656 case AUDIT_SUBJ_CLR:
1657 case AUDIT_OBJ_USER:
1658 case AUDIT_OBJ_ROLE:
1659 case AUDIT_OBJ_TYPE:
1660 case AUDIT_OBJ_LEV_LOW:
1661 case AUDIT_OBJ_LEV_HIGH:
1522 return 1; 1662 return 1;
1523 } 1663 }
1524 } 1664 }
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 9ebd96fda295..105147631753 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -85,6 +85,9 @@ extern int audit_enabled;
85/* Indicates that audit should log the full pathname. */ 85/* Indicates that audit should log the full pathname. */
86#define AUDIT_NAME_FULL -1 86#define AUDIT_NAME_FULL -1
87 87
88/* number of audit rules */
89int audit_n_rules;
90
88/* When fs/namei.c:getname() is called, we store the pointer in name and 91/* When fs/namei.c:getname() is called, we store the pointer in name and
89 * we don't let putname() free it (instead we free all of the saved 92 * we don't let putname() free it (instead we free all of the saved
90 * pointers at syscall exit time). 93 * pointers at syscall exit time).
@@ -174,6 +177,7 @@ struct audit_aux_data_path {
174 177
175/* The per-task audit context. */ 178/* The per-task audit context. */
176struct audit_context { 179struct audit_context {
180 int dummy; /* must be the first element */
177 int in_syscall; /* 1 if task is in a syscall */ 181 int in_syscall; /* 1 if task is in a syscall */
178 enum audit_state state; 182 enum audit_state state;
179 unsigned int serial; /* serial number for record */ 183 unsigned int serial; /* serial number for record */
@@ -186,6 +190,7 @@ struct audit_context {
186 int auditable; /* 1 if record should be written */ 190 int auditable; /* 1 if record should be written */
187 int name_count; 191 int name_count;
188 struct audit_names names[AUDIT_NAMES]; 192 struct audit_names names[AUDIT_NAMES];
193 char * filterkey; /* key for rule that triggered record */
189 struct dentry * pwd; 194 struct dentry * pwd;
190 struct vfsmount * pwdmnt; 195 struct vfsmount * pwdmnt;
191 struct audit_context *previous; /* For nested syscalls */ 196 struct audit_context *previous; /* For nested syscalls */
@@ -204,6 +209,54 @@ struct audit_context {
204#endif 209#endif
205}; 210};
206 211
212#define ACC_MODE(x) ("\004\002\006\006"[(x)&O_ACCMODE])
213static inline int open_arg(int flags, int mask)
214{
215 int n = ACC_MODE(flags);
216 if (flags & (O_TRUNC | O_CREAT))
217 n |= AUDIT_PERM_WRITE;
218 return n & mask;
219}
220
221static int audit_match_perm(struct audit_context *ctx, int mask)
222{
223 unsigned n = ctx->major;
224 switch (audit_classify_syscall(ctx->arch, n)) {
225 case 0: /* native */
226 if ((mask & AUDIT_PERM_WRITE) &&
227 audit_match_class(AUDIT_CLASS_WRITE, n))
228 return 1;
229 if ((mask & AUDIT_PERM_READ) &&
230 audit_match_class(AUDIT_CLASS_READ, n))
231 return 1;
232 if ((mask & AUDIT_PERM_ATTR) &&
233 audit_match_class(AUDIT_CLASS_CHATTR, n))
234 return 1;
235 return 0;
236 case 1: /* 32bit on biarch */
237 if ((mask & AUDIT_PERM_WRITE) &&
238 audit_match_class(AUDIT_CLASS_WRITE_32, n))
239 return 1;
240 if ((mask & AUDIT_PERM_READ) &&
241 audit_match_class(AUDIT_CLASS_READ_32, n))
242 return 1;
243 if ((mask & AUDIT_PERM_ATTR) &&
244 audit_match_class(AUDIT_CLASS_CHATTR_32, n))
245 return 1;
246 return 0;
247 case 2: /* open */
248 return mask & ACC_MODE(ctx->argv[1]);
249 case 3: /* openat */
250 return mask & ACC_MODE(ctx->argv[2]);
251 case 4: /* socketcall */
252 return ((mask & AUDIT_PERM_WRITE) && ctx->argv[0] == SYS_BIND);
253 case 5: /* execve */
254 return mask & AUDIT_PERM_EXEC;
255 default:
256 return 0;
257 }
258}
259
207/* Determine if any context name data matches a rule's watch data */ 260/* Determine if any context name data matches a rule's watch data */
208/* Compare a task_struct with an audit_rule. Return 1 on match, 0 261/* Compare a task_struct with an audit_rule. Return 1 on match, 0
209 * otherwise. */ 262 * otherwise. */
@@ -320,11 +373,11 @@ static int audit_filter_rules(struct task_struct *tsk,
320 if (ctx) 373 if (ctx)
321 result = audit_comparator(ctx->loginuid, f->op, f->val); 374 result = audit_comparator(ctx->loginuid, f->op, f->val);
322 break; 375 break;
323 case AUDIT_SE_USER: 376 case AUDIT_SUBJ_USER:
324 case AUDIT_SE_ROLE: 377 case AUDIT_SUBJ_ROLE:
325 case AUDIT_SE_TYPE: 378 case AUDIT_SUBJ_TYPE:
326 case AUDIT_SE_SEN: 379 case AUDIT_SUBJ_SEN:
327 case AUDIT_SE_CLR: 380 case AUDIT_SUBJ_CLR:
328 /* NOTE: this may return negative values indicating 381 /* NOTE: this may return negative values indicating
329 a temporary error. We simply treat this as a 382 a temporary error. We simply treat this as a
330 match for now to avoid losing information that 383 match for now to avoid losing information that
@@ -332,7 +385,7 @@ static int audit_filter_rules(struct task_struct *tsk,
332 logged upon error */ 385 logged upon error */
333 if (f->se_rule) { 386 if (f->se_rule) {
334 if (need_sid) { 387 if (need_sid) {
335 selinux_task_ctxid(tsk, &sid); 388 selinux_get_task_sid(tsk, &sid);
336 need_sid = 0; 389 need_sid = 0;
337 } 390 }
338 result = selinux_audit_rule_match(sid, f->type, 391 result = selinux_audit_rule_match(sid, f->type,
@@ -341,6 +394,46 @@ static int audit_filter_rules(struct task_struct *tsk,
341 ctx); 394 ctx);
342 } 395 }
343 break; 396 break;
397 case AUDIT_OBJ_USER:
398 case AUDIT_OBJ_ROLE:
399 case AUDIT_OBJ_TYPE:
400 case AUDIT_OBJ_LEV_LOW:
401 case AUDIT_OBJ_LEV_HIGH:
402 /* The above note for AUDIT_SUBJ_USER...AUDIT_SUBJ_CLR
403 also applies here */
404 if (f->se_rule) {
405 /* Find files that match */
406 if (name) {
407 result = selinux_audit_rule_match(
408 name->osid, f->type, f->op,
409 f->se_rule, ctx);
410 } else if (ctx) {
411 for (j = 0; j < ctx->name_count; j++) {
412 if (selinux_audit_rule_match(
413 ctx->names[j].osid,
414 f->type, f->op,
415 f->se_rule, ctx)) {
416 ++result;
417 break;
418 }
419 }
420 }
421 /* Find ipc objects that match */
422 if (ctx) {
423 struct audit_aux_data *aux;
424 for (aux = ctx->aux; aux;
425 aux = aux->next) {
426 if (aux->type == AUDIT_IPC) {
427 struct audit_aux_data_ipcctl *axi = (void *)aux;
428 if (selinux_audit_rule_match(axi->osid, f->type, f->op, f->se_rule, ctx)) {
429 ++result;
430 break;
431 }
432 }
433 }
434 }
435 }
436 break;
344 case AUDIT_ARG0: 437 case AUDIT_ARG0:
345 case AUDIT_ARG1: 438 case AUDIT_ARG1:
346 case AUDIT_ARG2: 439 case AUDIT_ARG2:
@@ -348,11 +441,20 @@ static int audit_filter_rules(struct task_struct *tsk,
348 if (ctx) 441 if (ctx)
349 result = audit_comparator(ctx->argv[f->type-AUDIT_ARG0], f->op, f->val); 442 result = audit_comparator(ctx->argv[f->type-AUDIT_ARG0], f->op, f->val);
350 break; 443 break;
444 case AUDIT_FILTERKEY:
445 /* ignore this field for filtering */
446 result = 1;
447 break;
448 case AUDIT_PERM:
449 result = audit_match_perm(ctx, f->val);
450 break;
351 } 451 }
352 452
353 if (!result) 453 if (!result)
354 return 0; 454 return 0;
355 } 455 }
456 if (rule->filterkey)
457 ctx->filterkey = kstrdup(rule->filterkey, GFP_ATOMIC);
356 switch (rule->action) { 458 switch (rule->action) {
357 case AUDIT_NEVER: *state = AUDIT_DISABLED; break; 459 case AUDIT_NEVER: *state = AUDIT_DISABLED; break;
358 case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break; 460 case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break;
@@ -467,7 +569,7 @@ static inline struct audit_context *audit_get_context(struct task_struct *tsk,
467 context->return_valid = return_valid; 569 context->return_valid = return_valid;
468 context->return_code = return_code; 570 context->return_code = return_code;
469 571
470 if (context->in_syscall && !context->auditable) { 572 if (context->in_syscall && !context->dummy && !context->auditable) {
471 enum audit_state state; 573 enum audit_state state;
472 574
473 state = audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_EXIT]); 575 state = audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_EXIT]);
@@ -483,17 +585,7 @@ static inline struct audit_context *audit_get_context(struct task_struct *tsk,
483 } 585 }
484 586
485get_context: 587get_context:
486 context->pid = tsk->pid; 588
487 context->ppid = sys_getppid(); /* sic. tsk == current in all cases */
488 context->uid = tsk->uid;
489 context->gid = tsk->gid;
490 context->euid = tsk->euid;
491 context->suid = tsk->suid;
492 context->fsuid = tsk->fsuid;
493 context->egid = tsk->egid;
494 context->sgid = tsk->sgid;
495 context->fsgid = tsk->fsgid;
496 context->personality = tsk->personality;
497 tsk->audit_context = NULL; 589 tsk->audit_context = NULL;
498 return context; 590 return context;
499} 591}
@@ -627,6 +719,7 @@ static inline void audit_free_context(struct audit_context *context)
627 } 719 }
628 audit_free_names(context); 720 audit_free_names(context);
629 audit_free_aux(context); 721 audit_free_aux(context);
722 kfree(context->filterkey);
630 kfree(context); 723 kfree(context);
631 context = previous; 724 context = previous;
632 } while (context); 725 } while (context);
@@ -658,8 +751,7 @@ static void audit_log_task_context(struct audit_buffer *ab)
658 return; 751 return;
659 752
660error_path: 753error_path:
661 if (ctx) 754 kfree(ctx);
662 kfree(ctx);
663 audit_panic("error in audit_log_task_context"); 755 audit_panic("error in audit_log_task_context");
664 return; 756 return;
665} 757}
@@ -702,6 +794,17 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
702 const char *tty; 794 const char *tty;
703 795
704 /* tsk == current */ 796 /* tsk == current */
797 context->pid = tsk->pid;
798 context->ppid = sys_getppid(); /* sic. tsk == current in all cases */
799 context->uid = tsk->uid;
800 context->gid = tsk->gid;
801 context->euid = tsk->euid;
802 context->suid = tsk->suid;
803 context->fsuid = tsk->fsuid;
804 context->egid = tsk->egid;
805 context->sgid = tsk->sgid;
806 context->fsgid = tsk->fsgid;
807 context->personality = tsk->personality;
705 808
706 ab = audit_log_start(context, GFP_KERNEL, AUDIT_SYSCALL); 809 ab = audit_log_start(context, GFP_KERNEL, AUDIT_SYSCALL);
707 if (!ab) 810 if (!ab)
@@ -714,6 +817,8 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
714 audit_log_format(ab, " success=%s exit=%ld", 817 audit_log_format(ab, " success=%s exit=%ld",
715 (context->return_valid==AUDITSC_SUCCESS)?"yes":"no", 818 (context->return_valid==AUDITSC_SUCCESS)?"yes":"no",
716 context->return_code); 819 context->return_code);
820
821 mutex_lock(&tty_mutex);
717 if (tsk->signal && tsk->signal->tty && tsk->signal->tty->name) 822 if (tsk->signal && tsk->signal->tty && tsk->signal->tty->name)
718 tty = tsk->signal->tty->name; 823 tty = tsk->signal->tty->name;
719 else 824 else
@@ -735,7 +840,15 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
735 context->gid, 840 context->gid,
736 context->euid, context->suid, context->fsuid, 841 context->euid, context->suid, context->fsuid,
737 context->egid, context->sgid, context->fsgid, tty); 842 context->egid, context->sgid, context->fsgid, tty);
843
844 mutex_unlock(&tty_mutex);
845
738 audit_log_task_info(ab, tsk); 846 audit_log_task_info(ab, tsk);
847 if (context->filterkey) {
848 audit_log_format(ab, " key=");
849 audit_log_untrustedstring(ab, context->filterkey);
850 } else
851 audit_log_format(ab, " key=(null)");
739 audit_log_end(ab); 852 audit_log_end(ab);
740 853
741 for (aux = context->aux; aux; aux = aux->next) { 854 for (aux = context->aux; aux; aux = aux->next) {
@@ -790,7 +903,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
790 if (axi->osid != 0) { 903 if (axi->osid != 0) {
791 char *ctx = NULL; 904 char *ctx = NULL;
792 u32 len; 905 u32 len;
793 if (selinux_ctxid_to_string( 906 if (selinux_sid_to_string(
794 axi->osid, &ctx, &len)) { 907 axi->osid, &ctx, &len)) {
795 audit_log_format(ab, " osid=%u", 908 audit_log_format(ab, " osid=%u",
796 axi->osid); 909 axi->osid);
@@ -897,7 +1010,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
897 if (n->osid != 0) { 1010 if (n->osid != 0) {
898 char *ctx = NULL; 1011 char *ctx = NULL;
899 u32 len; 1012 u32 len;
900 if (selinux_ctxid_to_string( 1013 if (selinux_sid_to_string(
901 n->osid, &ctx, &len)) { 1014 n->osid, &ctx, &len)) {
902 audit_log_format(ab, " osid=%u", n->osid); 1015 audit_log_format(ab, " osid=%u", n->osid);
903 call_panic = 2; 1016 call_panic = 2;
@@ -1014,7 +1127,8 @@ void audit_syscall_entry(int arch, int major,
1014 context->argv[3] = a4; 1127 context->argv[3] = a4;
1015 1128
1016 state = context->state; 1129 state = context->state;
1017 if (state == AUDIT_SETUP_CONTEXT || state == AUDIT_BUILD_CONTEXT) 1130 context->dummy = !audit_n_rules;
1131 if (!context->dummy && (state == AUDIT_SETUP_CONTEXT || state == AUDIT_BUILD_CONTEXT))
1018 state = audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_ENTRY]); 1132 state = audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_ENTRY]);
1019 if (likely(state == AUDIT_DISABLED)) 1133 if (likely(state == AUDIT_DISABLED))
1020 return; 1134 return;
@@ -1061,6 +1175,8 @@ void audit_syscall_exit(int valid, long return_code)
1061 } else { 1175 } else {
1062 audit_free_names(context); 1176 audit_free_names(context);
1063 audit_free_aux(context); 1177 audit_free_aux(context);
1178 kfree(context->filterkey);
1179 context->filterkey = NULL;
1064 tsk->audit_context = context; 1180 tsk->audit_context = context;
1065 } 1181 }
1066} 1182}
@@ -1145,14 +1261,18 @@ void audit_putname(const char *name)
1145#endif 1261#endif
1146} 1262}
1147 1263
1148static void audit_inode_context(int idx, const struct inode *inode) 1264/* Copy inode data into an audit_names. */
1265static void audit_copy_inode(struct audit_names *name, const struct inode *inode)
1149{ 1266{
1150 struct audit_context *context = current->audit_context; 1267 name->ino = inode->i_ino;
1151 1268 name->dev = inode->i_sb->s_dev;
1152 selinux_get_inode_sid(inode, &context->names[idx].osid); 1269 name->mode = inode->i_mode;
1270 name->uid = inode->i_uid;
1271 name->gid = inode->i_gid;
1272 name->rdev = inode->i_rdev;
1273 selinux_get_inode_sid(inode, &name->osid);
1153} 1274}
1154 1275
1155
1156/** 1276/**
1157 * audit_inode - store the inode and device from a lookup 1277 * audit_inode - store the inode and device from a lookup
1158 * @name: name being audited 1278 * @name: name being audited
@@ -1186,20 +1306,14 @@ void __audit_inode(const char *name, const struct inode *inode)
1186 ++context->ino_count; 1306 ++context->ino_count;
1187#endif 1307#endif
1188 } 1308 }
1189 context->names[idx].ino = inode->i_ino; 1309 audit_copy_inode(&context->names[idx], inode);
1190 context->names[idx].dev = inode->i_sb->s_dev;
1191 context->names[idx].mode = inode->i_mode;
1192 context->names[idx].uid = inode->i_uid;
1193 context->names[idx].gid = inode->i_gid;
1194 context->names[idx].rdev = inode->i_rdev;
1195 audit_inode_context(idx, inode);
1196} 1310}
1197 1311
1198/** 1312/**
1199 * audit_inode_child - collect inode info for created/removed objects 1313 * audit_inode_child - collect inode info for created/removed objects
1200 * @dname: inode's dentry name 1314 * @dname: inode's dentry name
1201 * @inode: inode being audited 1315 * @inode: inode being audited
1202 * @pino: inode number of dentry parent 1316 * @parent: inode of dentry parent
1203 * 1317 *
1204 * For syscalls that create or remove filesystem objects, audit_inode 1318 * For syscalls that create or remove filesystem objects, audit_inode
1205 * can only collect information for the filesystem object's parent. 1319 * can only collect information for the filesystem object's parent.
@@ -1210,7 +1324,7 @@ void __audit_inode(const char *name, const struct inode *inode)
1210 * unsuccessful attempts. 1324 * unsuccessful attempts.
1211 */ 1325 */
1212void __audit_inode_child(const char *dname, const struct inode *inode, 1326void __audit_inode_child(const char *dname, const struct inode *inode,
1213 unsigned long pino) 1327 const struct inode *parent)
1214{ 1328{
1215 int idx; 1329 int idx;
1216 struct audit_context *context = current->audit_context; 1330 struct audit_context *context = current->audit_context;
@@ -1224,7 +1338,7 @@ void __audit_inode_child(const char *dname, const struct inode *inode,
1224 if (!dname) 1338 if (!dname)
1225 goto update_context; 1339 goto update_context;
1226 for (idx = 0; idx < context->name_count; idx++) 1340 for (idx = 0; idx < context->name_count; idx++)
1227 if (context->names[idx].ino == pino) { 1341 if (context->names[idx].ino == parent->i_ino) {
1228 const char *name = context->names[idx].name; 1342 const char *name = context->names[idx].name;
1229 1343
1230 if (!name) 1344 if (!name)
@@ -1248,16 +1362,47 @@ update_context:
1248 context->names[idx].name_len = AUDIT_NAME_FULL; 1362 context->names[idx].name_len = AUDIT_NAME_FULL;
1249 context->names[idx].name_put = 0; /* don't call __putname() */ 1363 context->names[idx].name_put = 0; /* don't call __putname() */
1250 1364
1251 if (inode) { 1365 if (!inode)
1252 context->names[idx].ino = inode->i_ino; 1366 context->names[idx].ino = (unsigned long)-1;
1253 context->names[idx].dev = inode->i_sb->s_dev; 1367 else
1254 context->names[idx].mode = inode->i_mode; 1368 audit_copy_inode(&context->names[idx], inode);
1255 context->names[idx].uid = inode->i_uid; 1369
1256 context->names[idx].gid = inode->i_gid; 1370 /* A parent was not found in audit_names, so copy the inode data for the
1257 context->names[idx].rdev = inode->i_rdev; 1371 * provided parent. */
1258 audit_inode_context(idx, inode); 1372 if (!found_name) {
1259 } else 1373 idx = context->name_count++;
1260 context->names[idx].ino = (unsigned long)-1; 1374#if AUDIT_DEBUG
1375 context->ino_count++;
1376#endif
1377 audit_copy_inode(&context->names[idx], parent);
1378 }
1379}
1380
1381/**
1382 * audit_inode_update - update inode info for last collected name
1383 * @inode: inode being audited
1384 *
1385 * When open() is called on an existing object with the O_CREAT flag, the inode
1386 * data audit initially collects is incorrect. This additional hook ensures
1387 * audit has the inode data for the actual object to be opened.
1388 */
1389void __audit_inode_update(const struct inode *inode)
1390{
1391 struct audit_context *context = current->audit_context;
1392 int idx;
1393
1394 if (!context->in_syscall || !inode)
1395 return;
1396
1397 if (context->name_count == 0) {
1398 context->name_count++;
1399#if AUDIT_DEBUG
1400 context->ino_count++;
1401#endif
1402 }
1403 idx = context->name_count - 1;
1404
1405 audit_copy_inode(&context->names[idx], inode);
1261} 1406}
1262 1407
1263/** 1408/**
@@ -1367,7 +1512,7 @@ int __audit_mq_open(int oflag, mode_t mode, struct mq_attr __user *u_attr)
1367 * @mqdes: MQ descriptor 1512 * @mqdes: MQ descriptor
1368 * @msg_len: Message length 1513 * @msg_len: Message length
1369 * @msg_prio: Message priority 1514 * @msg_prio: Message priority
1370 * @abs_timeout: Message timeout in absolute time 1515 * @u_abs_timeout: Message timeout in absolute time
1371 * 1516 *
1372 * Returns 0 for success or NULL context or < 0 on error. 1517 * Returns 0 for success or NULL context or < 0 on error.
1373 */ 1518 */
@@ -1409,8 +1554,8 @@ int __audit_mq_timedsend(mqd_t mqdes, size_t msg_len, unsigned int msg_prio,
1409 * __audit_mq_timedreceive - record audit data for a POSIX MQ timed receive 1554 * __audit_mq_timedreceive - record audit data for a POSIX MQ timed receive
1410 * @mqdes: MQ descriptor 1555 * @mqdes: MQ descriptor
1411 * @msg_len: Message length 1556 * @msg_len: Message length
1412 * @msg_prio: Message priority 1557 * @u_msg_prio: Message priority
1413 * @abs_timeout: Message timeout in absolute time 1558 * @u_abs_timeout: Message timeout in absolute time
1414 * 1559 *
1415 * Returns 0 for success or NULL context or < 0 on error. 1560 * Returns 0 for success or NULL context or < 0 on error.
1416 */ 1561 */
@@ -1558,7 +1703,6 @@ int __audit_ipc_obj(struct kern_ipc_perm *ipcp)
1558 * @uid: msgq user id 1703 * @uid: msgq user id
1559 * @gid: msgq group id 1704 * @gid: msgq group id
1560 * @mode: msgq mode (permissions) 1705 * @mode: msgq mode (permissions)
1561 * @ipcp: in-kernel IPC permissions
1562 * 1706 *
1563 * Returns 0 for success or NULL context or < 0 on error. 1707 * Returns 0 for success or NULL context or < 0 on error.
1564 */ 1708 */
@@ -1589,7 +1733,7 @@ int audit_bprm(struct linux_binprm *bprm)
1589 unsigned long p, next; 1733 unsigned long p, next;
1590 void *to; 1734 void *to;
1591 1735
1592 if (likely(!audit_enabled || !context)) 1736 if (likely(!audit_enabled || !context || context->dummy))
1593 return 0; 1737 return 0;
1594 1738
1595 ax = kmalloc(sizeof(*ax) + PAGE_SIZE * MAX_ARG_PAGES - bprm->p, 1739 ax = kmalloc(sizeof(*ax) + PAGE_SIZE * MAX_ARG_PAGES - bprm->p,
@@ -1627,7 +1771,7 @@ int audit_socketcall(int nargs, unsigned long *args)
1627 struct audit_aux_data_socketcall *ax; 1771 struct audit_aux_data_socketcall *ax;
1628 struct audit_context *context = current->audit_context; 1772 struct audit_context *context = current->audit_context;
1629 1773
1630 if (likely(!context)) 1774 if (likely(!context || context->dummy))
1631 return 0; 1775 return 0;
1632 1776
1633 ax = kmalloc(sizeof(*ax) + nargs * sizeof(unsigned long), GFP_KERNEL); 1777 ax = kmalloc(sizeof(*ax) + nargs * sizeof(unsigned long), GFP_KERNEL);
@@ -1655,7 +1799,7 @@ int audit_sockaddr(int len, void *a)
1655 struct audit_aux_data_sockaddr *ax; 1799 struct audit_aux_data_sockaddr *ax;
1656 struct audit_context *context = current->audit_context; 1800 struct audit_context *context = current->audit_context;
1657 1801
1658 if (likely(!context)) 1802 if (likely(!context || context->dummy))
1659 return 0; 1803 return 0;
1660 1804
1661 ax = kmalloc(sizeof(*ax) + len, GFP_KERNEL); 1805 ax = kmalloc(sizeof(*ax) + len, GFP_KERNEL);
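
The auditsc.c hunks above fold six per-field assignments into a single audit_copy_inode() helper and reuse it from __audit_inode(), __audit_inode_child() and the new __audit_inode_update(). A minimal standalone sketch of that consolidation pattern, using simplified stand-in structs rather than the kernel's struct inode and struct audit_names:

#include <stdio.h>

/* Toy stand-ins for the kernel structures; field names are illustrative only. */
struct toy_inode {
	unsigned long i_ino;
	unsigned int  i_mode;
	unsigned int  i_uid, i_gid;
};

struct toy_audit_name {
	unsigned long ino;
	unsigned int  mode;
	unsigned int  uid, gid;
};

/* One helper replaces the repeated field-by-field copies at every call site. */
static void toy_copy_inode(struct toy_audit_name *name, const struct toy_inode *inode)
{
	name->ino  = inode->i_ino;
	name->mode = inode->i_mode;
	name->uid  = inode->i_uid;
	name->gid  = inode->i_gid;
}

int main(void)
{
	struct toy_inode ino = { 42, 0100644, 1000, 1000 };
	struct toy_audit_name rec;

	toy_copy_inode(&rec, &ino);	/* what __audit_inode now does in one call */
	printf("ino=%lu mode=%o uid=%u gid=%u\n", rec.ino, rec.mode, rec.uid, rec.gid);
	return 0;
}

Keeping the copy in one place is what makes the new __audit_inode_update() hook cheap to add: it only has to pick the right slot and call the helper.
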
diff --git a/kernel/capability.c b/kernel/capability.c
index 1a4d8a40d3f9..edb845a6e84a 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -46,7 +46,7 @@ asmlinkage long sys_capget(cap_user_header_t header, cap_user_data_t dataptr)
46 int ret = 0; 46 int ret = 0;
47 pid_t pid; 47 pid_t pid;
48 __u32 version; 48 __u32 version;
49 task_t *target; 49 struct task_struct *target;
50 struct __user_cap_data_struct data; 50 struct __user_cap_data_struct data;
51 51
52 if (get_user(version, &header->version)) 52 if (get_user(version, &header->version))
@@ -96,7 +96,7 @@ static inline int cap_set_pg(int pgrp, kernel_cap_t *effective,
96 kernel_cap_t *inheritable, 96 kernel_cap_t *inheritable,
97 kernel_cap_t *permitted) 97 kernel_cap_t *permitted)
98{ 98{
99 task_t *g, *target; 99 struct task_struct *g, *target;
100 int ret = -EPERM; 100 int ret = -EPERM;
101 int found = 0; 101 int found = 0;
102 102
@@ -128,12 +128,12 @@ static inline int cap_set_all(kernel_cap_t *effective,
128 kernel_cap_t *inheritable, 128 kernel_cap_t *inheritable,
129 kernel_cap_t *permitted) 129 kernel_cap_t *permitted)
130{ 130{
131 task_t *g, *target; 131 struct task_struct *g, *target;
132 int ret = -EPERM; 132 int ret = -EPERM;
133 int found = 0; 133 int found = 0;
134 134
135 do_each_thread(g, target) { 135 do_each_thread(g, target) {
136 if (target == current || target->pid == 1) 136 if (target == current || is_init(target))
137 continue; 137 continue;
138 found = 1; 138 found = 1;
139 if (security_capset_check(target, effective, inheritable, 139 if (security_capset_check(target, effective, inheritable,
@@ -172,7 +172,7 @@ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data)
172{ 172{
173 kernel_cap_t inheritable, permitted, effective; 173 kernel_cap_t inheritable, permitted, effective;
174 __u32 version; 174 __u32 version;
175 task_t *target; 175 struct task_struct *target;
176 int ret; 176 int ret;
177 pid_t pid; 177 pid_t pid;
178 178
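
Besides converting task_t to struct task_struct, the cap_set_all() hunk replaces the bare target->pid == 1 test with is_init(target). A hedged sketch of the general idea, swapping a magic-number comparison for a named predicate (the toy helper below is illustrative, not the kernel's actual is_init() definition):

#include <stdio.h>
#include <stdbool.h>

struct toy_task { int pid; const char *comm; };

/* A named predicate documents intent better than "pid == 1" scattered at call sites. */
static bool toy_is_init(const struct toy_task *t)
{
	return t->pid == 1;	/* assumption: "init" simply means PID 1 here */
}

int main(void)
{
	struct toy_task tasks[] = { { 1, "init" }, { 1234, "bash" } };

	for (unsigned int i = 0; i < sizeof(tasks) / sizeof(tasks[0]); i++)
		if (!toy_is_init(&tasks[i]))
			printf("would consider capset on pid %d (%s)\n",
			       tasks[i].pid, tasks[i].comm);
	return 0;
}
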
diff --git a/kernel/compat.c b/kernel/compat.c
index 2f672332430f..75573e5d27b0 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -22,6 +22,7 @@
22#include <linux/security.h> 22#include <linux/security.h>
23#include <linux/timex.h> 23#include <linux/timex.h>
24#include <linux/migrate.h> 24#include <linux/migrate.h>
25#include <linux/posix-timers.h>
25 26
26#include <asm/uaccess.h> 27#include <asm/uaccess.h>
27 28
@@ -601,6 +602,30 @@ long compat_sys_clock_getres(clockid_t which_clock,
601 return err; 602 return err;
602} 603}
603 604
605static long compat_clock_nanosleep_restart(struct restart_block *restart)
606{
607 long err;
608 mm_segment_t oldfs;
609 struct timespec tu;
610 struct compat_timespec *rmtp = (struct compat_timespec *)(restart->arg1);
611
612 restart->arg1 = (unsigned long) &tu;
613 oldfs = get_fs();
614 set_fs(KERNEL_DS);
615 err = clock_nanosleep_restart(restart);
616 set_fs(oldfs);
617
618 if ((err == -ERESTART_RESTARTBLOCK) && rmtp &&
619 put_compat_timespec(&tu, rmtp))
620 return -EFAULT;
621
622 if (err == -ERESTART_RESTARTBLOCK) {
623 restart->fn = compat_clock_nanosleep_restart;
624 restart->arg1 = (unsigned long) rmtp;
625 }
626 return err;
627}
628
604long compat_sys_clock_nanosleep(clockid_t which_clock, int flags, 629long compat_sys_clock_nanosleep(clockid_t which_clock, int flags,
605 struct compat_timespec __user *rqtp, 630 struct compat_timespec __user *rqtp,
606 struct compat_timespec __user *rmtp) 631 struct compat_timespec __user *rmtp)
@@ -608,6 +633,7 @@ long compat_sys_clock_nanosleep(clockid_t which_clock, int flags,
608 long err; 633 long err;
609 mm_segment_t oldfs; 634 mm_segment_t oldfs;
610 struct timespec in, out; 635 struct timespec in, out;
636 struct restart_block *restart;
611 637
612 if (get_compat_timespec(&in, rqtp)) 638 if (get_compat_timespec(&in, rqtp))
613 return -EFAULT; 639 return -EFAULT;
@@ -618,9 +644,16 @@ long compat_sys_clock_nanosleep(clockid_t which_clock, int flags,
618 (struct timespec __user *) &in, 644 (struct timespec __user *) &in,
619 (struct timespec __user *) &out); 645 (struct timespec __user *) &out);
620 set_fs(oldfs); 646 set_fs(oldfs);
647
621 if ((err == -ERESTART_RESTARTBLOCK) && rmtp && 648 if ((err == -ERESTART_RESTARTBLOCK) && rmtp &&
622 put_compat_timespec(&out, rmtp)) 649 put_compat_timespec(&out, rmtp))
623 return -EFAULT; 650 return -EFAULT;
651
652 if (err == -ERESTART_RESTARTBLOCK) {
653 restart = &current_thread_info()->restart_block;
654 restart->fn = compat_clock_nanosleep_restart;
655 restart->arg1 = (unsigned long) rmtp;
656 }
624 return err; 657 return err;
625} 658}
626 659
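
compat_sys_clock_nanosleep() now arms a compat-specific restart handler: the caller's compat rmtp pointer is stashed in the restart block, and compat_clock_nanosleep_restart() reruns the native restart through a kernel-space timespec before copying the remaining time back out to the 32-bit structure. A rough userspace model of that "save the continuation state, resume through a private buffer" shape (every name below is invented for the sketch; none of this is the kernel API):

#include <stdio.h>

/* Invented stand-in for struct restart_block: what to call later, and with what state. */
struct toy_restart {
	int (*fn)(struct toy_restart *);
	long remaining;		/* state the interrupted call wants to resume with */
	long *user_rmtp;	/* where the caller asked for the remainder to be stored */
};

static int toy_sleep_interruptible(long ticks, long *remaining)
{
	*remaining = ticks / 2;	/* pretend we were interrupted halfway through */
	return -1;		/* "restart me later" */
}

static int toy_restart_fn(struct toy_restart *rb)
{
	long left;
	int err = toy_sleep_interruptible(rb->remaining, &left);

	if (err && rb->user_rmtp)
		*rb->user_rmtp = left;	/* copy the remainder out, as the compat wrapper does */
	rb->remaining = left;
	return err;
}

int main(void)
{
	long rmtp = 0;
	struct toy_restart rb = { toy_restart_fn, 0, &rmtp };

	if (toy_sleep_interruptible(100, &rb.remaining))
		rb.fn(&rb);	/* what the syscall-restart machinery would invoke */
	printf("remaining after restart: %ld\n", rmtp);
	return 0;
}
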
@@ -730,17 +763,10 @@ void
730sigset_from_compat (sigset_t *set, compat_sigset_t *compat) 763sigset_from_compat (sigset_t *set, compat_sigset_t *compat)
731{ 764{
732 switch (_NSIG_WORDS) { 765 switch (_NSIG_WORDS) {
733#if defined (__COMPAT_ENDIAN_SWAP__)
734 case 4: set->sig[3] = compat->sig[7] | (((long)compat->sig[6]) << 32 );
735 case 3: set->sig[2] = compat->sig[5] | (((long)compat->sig[4]) << 32 );
736 case 2: set->sig[1] = compat->sig[3] | (((long)compat->sig[2]) << 32 );
737 case 1: set->sig[0] = compat->sig[1] | (((long)compat->sig[0]) << 32 );
738#else
739 case 4: set->sig[3] = compat->sig[6] | (((long)compat->sig[7]) << 32 ); 766 case 4: set->sig[3] = compat->sig[6] | (((long)compat->sig[7]) << 32 );
740 case 3: set->sig[2] = compat->sig[4] | (((long)compat->sig[5]) << 32 ); 767 case 3: set->sig[2] = compat->sig[4] | (((long)compat->sig[5]) << 32 );
741 case 2: set->sig[1] = compat->sig[2] | (((long)compat->sig[3]) << 32 ); 768 case 2: set->sig[1] = compat->sig[2] | (((long)compat->sig[3]) << 32 );
742 case 1: set->sig[0] = compat->sig[0] | (((long)compat->sig[1]) << 32 ); 769 case 1: set->sig[0] = compat->sig[0] | (((long)compat->sig[1]) << 32 );
743#endif
744 } 770 }
745} 771}
746 772
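
The simplified sigset_from_compat() drops the __COMPAT_ENDIAN_SWAP__ branch and always rebuilds each 64-bit sig word from two 32-bit compat words, with the odd-indexed compat word shifted into the top half. A small self-contained check of that packing, using plain uint32_t/uint64_t instead of the kernel's sigset types:

#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

int main(void)
{
	uint32_t compat_sig[2] = { 0x89abcdefu, 0x01234567u };
	uint64_t sig;

	/* Same shape as: set->sig[0] = compat->sig[0] | (((long)compat->sig[1]) << 32); */
	sig = (uint64_t)compat_sig[0] | ((uint64_t)compat_sig[1] << 32);

	printf("sig[0] = 0x%016" PRIx64 "\n", sig);	/* prints 0x0123456789abcdef */
	return 0;
}
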
diff --git a/kernel/configs.c b/kernel/configs.c
index 009e1ebdcb88..f9e31974f4ad 100644
--- a/kernel/configs.c
+++ b/kernel/configs.c
@@ -23,7 +23,6 @@
23 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 23 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24 */ 24 */
25 25
26#include <linux/config.h>
27#include <linux/kernel.h> 26#include <linux/kernel.h>
28#include <linux/module.h> 27#include <linux/module.h>
29#include <linux/proc_fs.h> 28#include <linux/proc_fs.h>
diff --git a/kernel/cpu.c b/kernel/cpu.c
index fe2b8d0bfe4c..32c96628463e 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -13,66 +13,66 @@
13#include <linux/module.h> 13#include <linux/module.h>
14#include <linux/kthread.h> 14#include <linux/kthread.h>
15#include <linux/stop_machine.h> 15#include <linux/stop_machine.h>
16#include <asm/semaphore.h> 16#include <linux/mutex.h>
17 17
18/* This protects CPUs going up and down... */ 18/* This protects CPUs going up and down... */
19static DECLARE_MUTEX(cpucontrol); 19static DEFINE_MUTEX(cpu_add_remove_lock);
20static DEFINE_MUTEX(cpu_bitmask_lock);
20 21
21static BLOCKING_NOTIFIER_HEAD(cpu_chain); 22static __cpuinitdata BLOCKING_NOTIFIER_HEAD(cpu_chain);
22 23
23#ifdef CONFIG_HOTPLUG_CPU 24/* If set, cpu_up and cpu_down will return -EBUSY and do nothing.
24static struct task_struct *lock_cpu_hotplug_owner; 25 * Should always be manipulated under cpu_add_remove_lock
25static int lock_cpu_hotplug_depth; 26 */
26 27static int cpu_hotplug_disabled;
27static int __lock_cpu_hotplug(int interruptible)
28{
29 int ret = 0;
30 28
31 if (lock_cpu_hotplug_owner != current) { 29#ifdef CONFIG_HOTPLUG_CPU
32 if (interruptible)
33 ret = down_interruptible(&cpucontrol);
34 else
35 down(&cpucontrol);
36 }
37 30
38 /* 31/* Crappy recursive lock-takers in cpufreq! Complain loudly about idiots */
39 * Set only if we succeed in locking 32static struct task_struct *recursive;
40 */ 33static int recursive_depth;
41 if (!ret) {
42 lock_cpu_hotplug_depth++;
43 lock_cpu_hotplug_owner = current;
44 }
45
46 return ret;
47}
48 34
49void lock_cpu_hotplug(void) 35void lock_cpu_hotplug(void)
50{ 36{
51 __lock_cpu_hotplug(0); 37 struct task_struct *tsk = current;
38
39 if (tsk == recursive) {
40 static int warnings = 10;
41 if (warnings) {
42 printk(KERN_ERR "Lukewarm IQ detected in hotplug locking\n");
43 WARN_ON(1);
44 warnings--;
45 }
46 recursive_depth++;
47 return;
48 }
49 mutex_lock(&cpu_bitmask_lock);
50 recursive = tsk;
52} 51}
53EXPORT_SYMBOL_GPL(lock_cpu_hotplug); 52EXPORT_SYMBOL_GPL(lock_cpu_hotplug);
54 53
55void unlock_cpu_hotplug(void) 54void unlock_cpu_hotplug(void)
56{ 55{
57 if (--lock_cpu_hotplug_depth == 0) { 56 WARN_ON(recursive != current);
58 lock_cpu_hotplug_owner = NULL; 57 if (recursive_depth) {
59 up(&cpucontrol); 58 recursive_depth--;
59 return;
60 } 60 }
61 mutex_unlock(&cpu_bitmask_lock);
62 recursive = NULL;
61} 63}
62EXPORT_SYMBOL_GPL(unlock_cpu_hotplug); 64EXPORT_SYMBOL_GPL(unlock_cpu_hotplug);
63 65
64int lock_cpu_hotplug_interruptible(void)
65{
66 return __lock_cpu_hotplug(1);
67}
68EXPORT_SYMBOL_GPL(lock_cpu_hotplug_interruptible);
69#endif /* CONFIG_HOTPLUG_CPU */ 66#endif /* CONFIG_HOTPLUG_CPU */
70 67
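
The rewritten lock_cpu_hotplug()/unlock_cpu_hotplug() pair above tolerates (while loudly warning about) recursive callers: instead of blocking on its own mutex, it remembers the owning task and a recursion depth. A rough pthread sketch of that owner-plus-depth bookkeeping (a userspace approximation, not the kernel code; cpu_bitmask_lock, recursive and recursive_depth are modelled with plain globals):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t bitmask_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_t owner;
static int owner_valid;
static int depth;

static void toy_lock_hotplug(void)
{
	if (owner_valid && pthread_equal(owner, pthread_self())) {
		fprintf(stderr, "warning: recursive hotplug lock\n");
		depth++;		/* tolerate the recursion instead of deadlocking */
		return;
	}
	pthread_mutex_lock(&bitmask_lock);
	owner = pthread_self();
	owner_valid = 1;
}

static void toy_unlock_hotplug(void)
{
	if (depth) {			/* unwind recursive acquisitions first */
		depth--;
		return;
	}
	owner_valid = 0;
	pthread_mutex_unlock(&bitmask_lock);
}

int main(void)
{
	toy_lock_hotplug();
	toy_lock_hotplug();		/* recursive caller: warns and bumps the depth */
	toy_unlock_hotplug();
	toy_unlock_hotplug();		/* the final unlock really releases the mutex */
	puts("done");
	return 0;
}
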
71/* Need to know about CPUs going up/down? */ 68/* Need to know about CPUs going up/down? */
72int register_cpu_notifier(struct notifier_block *nb) 69int __cpuinit register_cpu_notifier(struct notifier_block *nb)
73{ 70{
74 return blocking_notifier_chain_register(&cpu_chain, nb); 71 return blocking_notifier_chain_register(&cpu_chain, nb);
75} 72}
73
74#ifdef CONFIG_HOTPLUG_CPU
75
76EXPORT_SYMBOL(register_cpu_notifier); 76EXPORT_SYMBOL(register_cpu_notifier);
77 77
78void unregister_cpu_notifier(struct notifier_block *nb) 78void unregister_cpu_notifier(struct notifier_block *nb)
@@ -81,7 +81,6 @@ void unregister_cpu_notifier(struct notifier_block *nb)
81} 81}
82EXPORT_SYMBOL(unregister_cpu_notifier); 82EXPORT_SYMBOL(unregister_cpu_notifier);
83 83
84#ifdef CONFIG_HOTPLUG_CPU
85static inline void check_for_tasks(int cpu) 84static inline void check_for_tasks(int cpu)
86{ 85{
87 struct task_struct *p; 86 struct task_struct *p;
@@ -114,32 +113,25 @@ static int take_cpu_down(void *unused)
114 return 0; 113 return 0;
115} 114}
116 115
117int cpu_down(unsigned int cpu) 116/* Requires cpu_add_remove_lock to be held */
117static int _cpu_down(unsigned int cpu)
118{ 118{
119 int err; 119 int err;
120 struct task_struct *p; 120 struct task_struct *p;
121 cpumask_t old_allowed, tmp; 121 cpumask_t old_allowed, tmp;
122 122
123 if ((err = lock_cpu_hotplug_interruptible()) != 0) 123 if (num_online_cpus() == 1)
124 return err; 124 return -EBUSY;
125 125
126 if (num_online_cpus() == 1) { 126 if (!cpu_online(cpu))
127 err = -EBUSY; 127 return -EINVAL;
128 goto out;
129 }
130
131 if (!cpu_online(cpu)) {
132 err = -EINVAL;
133 goto out;
134 }
135 128
136 err = blocking_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE, 129 err = blocking_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE,
137 (void *)(long)cpu); 130 (void *)(long)cpu);
138 if (err == NOTIFY_BAD) { 131 if (err == NOTIFY_BAD) {
139 printk("%s: attempt to take down CPU %u failed\n", 132 printk("%s: attempt to take down CPU %u failed\n",
140 __FUNCTION__, cpu); 133 __FUNCTION__, cpu);
141 err = -EINVAL; 134 return -EINVAL;
142 goto out;
143 } 135 }
144 136
145 /* Ensure that we are not runnable on dying cpu */ 137 /* Ensure that we are not runnable on dying cpu */
@@ -148,7 +140,10 @@ int cpu_down(unsigned int cpu)
148 cpu_clear(cpu, tmp); 140 cpu_clear(cpu, tmp);
149 set_cpus_allowed(current, tmp); 141 set_cpus_allowed(current, tmp);
150 142
143 mutex_lock(&cpu_bitmask_lock);
151 p = __stop_machine_run(take_cpu_down, NULL, cpu); 144 p = __stop_machine_run(take_cpu_down, NULL, cpu);
145 mutex_unlock(&cpu_bitmask_lock);
146
152 if (IS_ERR(p)) { 147 if (IS_ERR(p)) {
153 /* CPU didn't die: tell everyone. Can't complain. */ 148 /* CPU didn't die: tell everyone. Can't complain. */
154 if (blocking_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED, 149 if (blocking_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED,
@@ -184,24 +179,32 @@ out_thread:
184 err = kthread_stop(p); 179 err = kthread_stop(p);
185out_allowed: 180out_allowed:
186 set_cpus_allowed(current, old_allowed); 181 set_cpus_allowed(current, old_allowed);
187out: 182 return err;
188 unlock_cpu_hotplug(); 183}
184
185int cpu_down(unsigned int cpu)
186{
187 int err = 0;
188
189 mutex_lock(&cpu_add_remove_lock);
190 if (cpu_hotplug_disabled)
191 err = -EBUSY;
192 else
193 err = _cpu_down(cpu);
194
195 mutex_unlock(&cpu_add_remove_lock);
189 return err; 196 return err;
190} 197}
191#endif /*CONFIG_HOTPLUG_CPU*/ 198#endif /*CONFIG_HOTPLUG_CPU*/
192 199
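
cpu_down() is now a thin wrapper: it takes cpu_add_remove_lock, refuses when cpu_hotplug_disabled is set, and otherwise delegates to _cpu_down(), which assumes the lock is already held. A hedged pthread sketch of that "public entry point takes the lock, leading-underscore helper requires it" idiom (the names and return codes below are illustrative):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t add_remove_lock = PTHREAD_MUTEX_INITIALIZER;
static int hotplug_disabled;
static int online_cpus = 4;

/* Requires add_remove_lock to be held by the caller. */
static int _toy_cpu_down(unsigned int cpu)
{
	if (online_cpus == 1)
		return -1;	/* never take down the last CPU */
	online_cpus--;
	printf("cpu %u down, %d still online\n", cpu, online_cpus);
	return 0;
}

/* Public entry point: serializes callers and honours the disabled flag. */
static int toy_cpu_down(unsigned int cpu)
{
	int err;

	pthread_mutex_lock(&add_remove_lock);
	if (hotplug_disabled)
		err = -2;
	else
		err = _toy_cpu_down(cpu);
	pthread_mutex_unlock(&add_remove_lock);
	return err;
}

int main(void)
{
	toy_cpu_down(3);
	hotplug_disabled = 1;
	printf("while disabled: %d\n", toy_cpu_down(2));
	return 0;
}

Splitting the function this way is what lets disable_nonboot_cpus(), further down, call _cpu_down() repeatedly under a single acquisition of the lock.
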
193int __devinit cpu_up(unsigned int cpu) 200/* Requires cpu_add_remove_lock to be held */
201static int __devinit _cpu_up(unsigned int cpu)
194{ 202{
195 int ret; 203 int ret;
196 void *hcpu = (void *)(long)cpu; 204 void *hcpu = (void *)(long)cpu;
197 205
198 if ((ret = lock_cpu_hotplug_interruptible()) != 0) 206 if (cpu_online(cpu) || !cpu_present(cpu))
199 return ret; 207 return -EINVAL;
200
201 if (cpu_online(cpu) || !cpu_present(cpu)) {
202 ret = -EINVAL;
203 goto out;
204 }
205 208
206 ret = blocking_notifier_call_chain(&cpu_chain, CPU_UP_PREPARE, hcpu); 209 ret = blocking_notifier_call_chain(&cpu_chain, CPU_UP_PREPARE, hcpu);
207 if (ret == NOTIFY_BAD) { 210 if (ret == NOTIFY_BAD) {
@@ -212,7 +215,9 @@ int __devinit cpu_up(unsigned int cpu)
212 } 215 }
213 216
214 /* Arch-specific enabling code. */ 217 /* Arch-specific enabling code. */
218 mutex_lock(&cpu_bitmask_lock);
215 ret = __cpu_up(cpu); 219 ret = __cpu_up(cpu);
220 mutex_unlock(&cpu_bitmask_lock);
216 if (ret != 0) 221 if (ret != 0)
217 goto out_notify; 222 goto out_notify;
218 BUG_ON(!cpu_online(cpu)); 223 BUG_ON(!cpu_online(cpu));
@@ -224,7 +229,95 @@ out_notify:
224 if (ret != 0) 229 if (ret != 0)
225 blocking_notifier_call_chain(&cpu_chain, 230 blocking_notifier_call_chain(&cpu_chain,
226 CPU_UP_CANCELED, hcpu); 231 CPU_UP_CANCELED, hcpu);
227out: 232
228 unlock_cpu_hotplug();
229 return ret; 233 return ret;
230} 234}
235
236int __devinit cpu_up(unsigned int cpu)
237{
238 int err = 0;
239
240 mutex_lock(&cpu_add_remove_lock);
241 if (cpu_hotplug_disabled)
242 err = -EBUSY;
243 else
244 err = _cpu_up(cpu);
245
246 mutex_unlock(&cpu_add_remove_lock);
247 return err;
248}
249
250#ifdef CONFIG_SUSPEND_SMP
251static cpumask_t frozen_cpus;
252
253int disable_nonboot_cpus(void)
254{
255 int cpu, first_cpu, error;
256
257 mutex_lock(&cpu_add_remove_lock);
258 first_cpu = first_cpu(cpu_present_map);
259 if (!cpu_online(first_cpu)) {
260 error = _cpu_up(first_cpu);
261 if (error) {
262 printk(KERN_ERR "Could not bring CPU%d up.\n",
263 first_cpu);
264 goto out;
265 }
266 }
267 error = set_cpus_allowed(current, cpumask_of_cpu(first_cpu));
268 if (error) {
269 printk(KERN_ERR "Could not run on CPU%d\n", first_cpu);
270 goto out;
271 }
272 /* We take down all of the non-boot CPUs in one shot to avoid races
273 * with the userspace trying to use the CPU hotplug at the same time
274 */
275 cpus_clear(frozen_cpus);
276 printk("Disabling non-boot CPUs ...\n");
277 for_each_online_cpu(cpu) {
278 if (cpu == first_cpu)
279 continue;
280 error = _cpu_down(cpu);
281 if (!error) {
282 cpu_set(cpu, frozen_cpus);
283 printk("CPU%d is down\n", cpu);
284 } else {
285 printk(KERN_ERR "Error taking CPU%d down: %d\n",
286 cpu, error);
287 break;
288 }
289 }
290 if (!error) {
291 BUG_ON(num_online_cpus() > 1);
292 /* Make sure the CPUs won't be enabled by someone else */
293 cpu_hotplug_disabled = 1;
294 } else {
295 printk(KERN_ERR "Non-boot CPUs are not disabled");
296 }
297out:
298 mutex_unlock(&cpu_add_remove_lock);
299 return error;
300}
301
302void enable_nonboot_cpus(void)
303{
304 int cpu, error;
305
306 /* Allow everyone to use the CPU hotplug again */
307 mutex_lock(&cpu_add_remove_lock);
308 cpu_hotplug_disabled = 0;
309 mutex_unlock(&cpu_add_remove_lock);
310
311 printk("Enabling non-boot CPUs ...\n");
312 for_each_cpu_mask(cpu, frozen_cpus) {
313 error = cpu_up(cpu);
314 if (!error) {
315 printk("CPU%d is up\n", cpu);
316 continue;
317 }
318 printk(KERN_WARNING "Error taking CPU%d up: %d\n",
319 cpu, error);
320 }
321 cpus_clear(frozen_cpus);
322}
323#endif
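
disable_nonboot_cpus() records exactly which CPUs it managed to take down in frozen_cpus and sets cpu_hotplug_disabled so nothing re-enables them behind its back; enable_nonboot_cpus() clears the flag and brings back only that recorded set. A compact sketch of the same "remember what you changed, restore only that" pattern with a plain bitmask (the toy mask and functions are invented for illustration):

#include <stdio.h>

#define NCPUS 4

static unsigned int online_mask = 0xF;	/* all four toy CPUs start online */
static unsigned int frozen_mask;	/* the ones *we* took down */

static void toy_disable_nonboot(void)
{
	frozen_mask = 0;
	for (int cpu = 1; cpu < NCPUS; cpu++)	/* keep the boot CPU (0) online */
		if (online_mask & (1u << cpu)) {
			online_mask &= ~(1u << cpu);
			frozen_mask |= 1u << cpu;	/* remember it for the restore path */
		}
	printf("online after disable: 0x%x\n", online_mask);
}

static void toy_enable_nonboot(void)
{
	for (int cpu = 1; cpu < NCPUS; cpu++)
		if (frozen_mask & (1u << cpu))	/* only what we froze, nothing else */
			online_mask |= 1u << cpu;
	frozen_mask = 0;
	printf("online after enable: 0x%x\n", online_mask);
}

int main(void)
{
	online_mask &= ~(1u << 2);	/* pretend CPU2 was already offline beforehand */
	toy_disable_nonboot();		/* CPU2 is not recorded in frozen_mask... */
	toy_enable_nonboot();		/* ...so it stays offline afterwards */
	return 0;
}
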
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index b602f73fb38d..8c3c400cce91 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -18,7 +18,6 @@
18 * distribution for more details. 18 * distribution for more details.
19 */ 19 */
20 20
21#include <linux/config.h>
22#include <linux/cpu.h> 21#include <linux/cpu.h>
23#include <linux/cpumask.h> 22#include <linux/cpumask.h>
24#include <linux/cpuset.h> 23#include <linux/cpuset.h>
@@ -241,7 +240,7 @@ static struct super_block *cpuset_sb;
241 * A cpuset can only be deleted if both its 'count' of using tasks 240 * A cpuset can only be deleted if both its 'count' of using tasks
242 * is zero, and its list of 'children' cpusets is empty. Since all 241 * is zero, and its list of 'children' cpusets is empty. Since all
243 * tasks in the system use _some_ cpuset, and since there is always at 242 * tasks in the system use _some_ cpuset, and since there is always at
244 * least one task in the system (init, pid == 1), therefore, top_cpuset 243 * least one task in the system (init), therefore, top_cpuset
245 * always has either children cpusets and/or using tasks. So we don't 244 * always has either children cpusets and/or using tasks. So we don't
246 * need a special hack to ensure that top_cpuset cannot be deleted. 245 * need a special hack to ensure that top_cpuset cannot be deleted.
247 * 246 *
@@ -290,7 +289,6 @@ static struct inode *cpuset_new_inode(mode_t mode)
290 inode->i_mode = mode; 289 inode->i_mode = mode;
291 inode->i_uid = current->fsuid; 290 inode->i_uid = current->fsuid;
292 inode->i_gid = current->fsgid; 291 inode->i_gid = current->fsgid;
293 inode->i_blksize = PAGE_CACHE_SIZE;
294 inode->i_blocks = 0; 292 inode->i_blocks = 0;
295 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 293 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
296 inode->i_mapping->backing_dev_info = &cpuset_backing_dev_info; 294 inode->i_mapping->backing_dev_info = &cpuset_backing_dev_info;
@@ -763,6 +761,8 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
763 * 761 *
764 * Call with manage_mutex held. May nest a call to the 762 * Call with manage_mutex held. May nest a call to the
765 * lock_cpu_hotplug()/unlock_cpu_hotplug() pair. 763 * lock_cpu_hotplug()/unlock_cpu_hotplug() pair.
764 * Must not be called holding callback_mutex, because we must
765 * not call lock_cpu_hotplug() while holding callback_mutex.
766 */ 766 */
767 767
768static void update_cpu_domains(struct cpuset *cur) 768static void update_cpu_domains(struct cpuset *cur)
@@ -782,7 +782,7 @@ static void update_cpu_domains(struct cpuset *cur)
782 if (is_cpu_exclusive(c)) 782 if (is_cpu_exclusive(c))
783 cpus_andnot(pspan, pspan, c->cpus_allowed); 783 cpus_andnot(pspan, pspan, c->cpus_allowed);
784 } 784 }
785 if (is_removed(cur) || !is_cpu_exclusive(cur)) { 785 if (!is_cpu_exclusive(cur)) {
786 cpus_or(pspan, pspan, cur->cpus_allowed); 786 cpus_or(pspan, pspan, cur->cpus_allowed);
787 if (cpus_equal(pspan, cur->cpus_allowed)) 787 if (cpus_equal(pspan, cur->cpus_allowed))
788 return; 788 return;
@@ -815,6 +815,10 @@ static int update_cpumask(struct cpuset *cs, char *buf)
815 struct cpuset trialcs; 815 struct cpuset trialcs;
816 int retval, cpus_unchanged; 816 int retval, cpus_unchanged;
817 817
818 /* top_cpuset.cpus_allowed tracks cpu_online_map; it's read-only */
819 if (cs == &top_cpuset)
820 return -EACCES;
821
818 trialcs = *cs; 822 trialcs = *cs;
819 retval = cpulist_parse(buf, trialcs.cpus_allowed); 823 retval = cpulist_parse(buf, trialcs.cpus_allowed);
820 if (retval < 0) 824 if (retval < 0)
@@ -908,6 +912,10 @@ static int update_nodemask(struct cpuset *cs, char *buf)
908 int fudge; 912 int fudge;
909 int retval; 913 int retval;
910 914
915 /* top_cpuset.mems_allowed tracks node_online_map; it's read-only */
916 if (cs == &top_cpuset)
917 return -EACCES;
918
911 trialcs = *cs; 919 trialcs = *cs;
912 retval = nodelist_parse(buf, trialcs.mems_allowed); 920 retval = nodelist_parse(buf, trialcs.mems_allowed);
913 if (retval < 0) 921 if (retval < 0)
@@ -1064,7 +1072,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf)
1064} 1072}
1065 1073
1066/* 1074/*
1067 * Frequency meter - How fast is some event occuring? 1075 * Frequency meter - How fast is some event occurring?
1068 * 1076 *
1069 * These routines manage a digitally filtered, constant time based, 1077 * These routines manage a digitally filtered, constant time based,
1070 * event frequency meter. There are four routines: 1078 * event frequency meter. There are four routines:
@@ -1217,7 +1225,12 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
1217 1225
1218 task_lock(tsk); 1226 task_lock(tsk);
1219 oldcs = tsk->cpuset; 1227 oldcs = tsk->cpuset;
1220 if (!oldcs) { 1228 /*
1229 * After getting 'oldcs' cpuset ptr, be sure still not exiting.
1230 * If 'oldcs' might be the top_cpuset due to the_top_cpuset_hack
1231 * then fail this attach_task(), to avoid breaking top_cpuset.count.
1232 */
1233 if (tsk->flags & PF_EXITING) {
1221 task_unlock(tsk); 1234 task_unlock(tsk);
1222 mutex_unlock(&callback_mutex); 1235 mutex_unlock(&callback_mutex);
1223 put_task_struct(tsk); 1236 put_task_struct(tsk);
@@ -1918,6 +1931,17 @@ static int cpuset_mkdir(struct inode *dir, struct dentry *dentry, int mode)
1918 return cpuset_create(c_parent, dentry->d_name.name, mode | S_IFDIR); 1931 return cpuset_create(c_parent, dentry->d_name.name, mode | S_IFDIR);
1919} 1932}
1920 1933
1934/*
1935 * Locking note on the strange update_flag() call below:
1936 *
1937 * If the cpuset being removed is marked cpu_exclusive, then simulate
1938 * turning cpu_exclusive off, which will call update_cpu_domains().
1939 * The lock_cpu_hotplug() call in update_cpu_domains() must not be
1940 * made while holding callback_mutex. Elsewhere the kernel nests
1941 * callback_mutex inside lock_cpu_hotplug() calls. So the reverse
1942 * nesting would risk an ABBA deadlock.
1943 */
1944
1921static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry) 1945static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry)
1922{ 1946{
1923 struct cpuset *cs = dentry->d_fsdata; 1947 struct cpuset *cs = dentry->d_fsdata;
@@ -1937,11 +1961,16 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry)
1937 mutex_unlock(&manage_mutex); 1961 mutex_unlock(&manage_mutex);
1938 return -EBUSY; 1962 return -EBUSY;
1939 } 1963 }
1964 if (is_cpu_exclusive(cs)) {
1965 int retval = update_flag(CS_CPU_EXCLUSIVE, cs, "0");
1966 if (retval < 0) {
1967 mutex_unlock(&manage_mutex);
1968 return retval;
1969 }
1970 }
1940 parent = cs->parent; 1971 parent = cs->parent;
1941 mutex_lock(&callback_mutex); 1972 mutex_lock(&callback_mutex);
1942 set_bit(CS_REMOVED, &cs->flags); 1973 set_bit(CS_REMOVED, &cs->flags);
1943 if (is_cpu_exclusive(cs))
1944 update_cpu_domains(cs);
1945 list_del(&cs->sibling); /* delete my sibling from parent->children */ 1974 list_del(&cs->sibling); /* delete my sibling from parent->children */
1946 spin_lock(&cs->dentry->d_lock); 1975 spin_lock(&cs->dentry->d_lock);
1947 d = dget(cs->dentry); 1976 d = dget(cs->dentry);
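
The locking note added ahead of cpuset_rmdir() explains the reshuffle in this hunk: the cpu_exclusive flag is now cleared via update_flag() before callback_mutex is taken, because lock_cpu_hotplug() must never be acquired while callback_mutex is held; elsewhere the kernel nests them the other way around, and mixing the two orders would be an ABBA deadlock. A minimal two-mutex illustration of sticking to one consistent order (the mutex names below are placeholders, not the kernel's locks):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t hotplug_lock  = PTHREAD_MUTEX_INITIALIZER;	/* "outer" lock */
static pthread_mutex_t callback_lock = PTHREAD_MUTEX_INITIALIZER;	/* "inner" lock */

/* Every path takes hotplug_lock first and callback_lock second - never the
 * reverse - so two threads can never each hold one lock while waiting on the
 * other, which is the ABBA pattern the comment in cpuset.c warns about. */
static void do_update(const char *who)
{
	pthread_mutex_lock(&hotplug_lock);
	pthread_mutex_lock(&callback_lock);
	printf("%s: updated under both locks\n", who);
	pthread_mutex_unlock(&callback_lock);
	pthread_mutex_unlock(&hotplug_lock);
}

static void *worker(void *arg)
{
	do_update((const char *)arg);
	return NULL;
}

int main(void)
{
	pthread_t a, b;

	pthread_create(&a, NULL, worker, "thread A");
	pthread_create(&b, NULL, worker, "thread B");
	pthread_join(a, NULL);
	pthread_join(b, NULL);
	return 0;
}
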
@@ -2016,6 +2045,104 @@ out:
2016 return err; 2045 return err;
2017} 2046}
2018 2047
2048#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_MEMORY_HOTPLUG)
2049/*
2050 * If common_cpu_mem_hotplug_unplug(), below, unplugs any CPUs
2051 * or memory nodes, we need to walk over the cpuset hierarchy,
2052 * removing that CPU or node from all cpusets. If this removes the
2053 * last CPU or node from a cpuset, then the guarantee_online_cpus()
2054 * or guarantee_online_mems() code will use that emptied cpusets
2055 * parent online CPUs or nodes. Cpusets that were already empty of
2056 * CPUs or nodes are left empty.
2057 *
2058 * This routine is intentionally inefficient in a couple of regards.
2059 * It will check all cpusets in a subtree even if the top cpuset of
2060 * the subtree has no offline CPUs or nodes. It checks both CPUs and
2061 * nodes, even though the caller could have been coded to know that
2062 * only one of CPUs or nodes needed to be checked on a given call.
2063 * This was done to minimize text size rather than cpu cycles.
2064 *
2065 * Call with both manage_mutex and callback_mutex held.
2066 *
2067 * Recursive, on depth of cpuset subtree.
2068 */
2069
2070static void guarantee_online_cpus_mems_in_subtree(const struct cpuset *cur)
2071{
2072 struct cpuset *c;
2073
2074 /* Each of our child cpusets mems must be online */
2075 list_for_each_entry(c, &cur->children, sibling) {
2076 guarantee_online_cpus_mems_in_subtree(c);
2077 if (!cpus_empty(c->cpus_allowed))
2078 guarantee_online_cpus(c, &c->cpus_allowed);
2079 if (!nodes_empty(c->mems_allowed))
2080 guarantee_online_mems(c, &c->mems_allowed);
2081 }
2082}
2083
2084/*
2085 * The cpus_allowed and mems_allowed nodemasks in the top_cpuset track
2086 * cpu_online_map and node_online_map. Force the top cpuset to track
2087 * whats online after any CPU or memory node hotplug or unplug event.
2088 *
2089 * To ensure that we don't remove a CPU or node from the top cpuset
2090 * that is currently in use by a child cpuset (which would violate
2091 * the rule that cpusets must be subsets of their parent), we first
2092 * call the recursive routine guarantee_online_cpus_mems_in_subtree().
2093 *
2094 * Since there are two callers of this routine, one for CPU hotplug
2095 * events and one for memory node hotplug events, we could have coded
2096 * two separate routines here. We code it as a single common routine
2097 * in order to minimize text size.
2098 */
2099
2100static void common_cpu_mem_hotplug_unplug(void)
2101{
2102 mutex_lock(&manage_mutex);
2103 mutex_lock(&callback_mutex);
2104
2105 guarantee_online_cpus_mems_in_subtree(&top_cpuset);
2106 top_cpuset.cpus_allowed = cpu_online_map;
2107 top_cpuset.mems_allowed = node_online_map;
2108
2109 mutex_unlock(&callback_mutex);
2110 mutex_unlock(&manage_mutex);
2111}
2112#endif
2113
2114#ifdef CONFIG_HOTPLUG_CPU
2115/*
2116 * The top_cpuset tracks what CPUs and Memory Nodes are online,
2117 * period. This is necessary in order to make cpusets transparent
 2118 * (of no effect) on systems that are actively using CPU hotplug
2119 * but making no active use of cpusets.
2120 *
2121 * This routine ensures that top_cpuset.cpus_allowed tracks
2122 * cpu_online_map on each CPU hotplug (cpuhp) event.
2123 */
2124
2125static int cpuset_handle_cpuhp(struct notifier_block *nb,
2126 unsigned long phase, void *cpu)
2127{
2128 common_cpu_mem_hotplug_unplug();
2129 return 0;
2130}
2131#endif
2132
2133#ifdef CONFIG_MEMORY_HOTPLUG
2134/*
2135 * Keep top_cpuset.mems_allowed tracking node_online_map.
2136 * Call this routine anytime after you change node_online_map.
2137 * See also the previous routine cpuset_handle_cpuhp().
2138 */
2139
2140void cpuset_track_online_nodes()
2141{
2142 common_cpu_mem_hotplug_unplug();
2143}
2144#endif
2145
2019/** 2146/**
2020 * cpuset_init_smp - initialize cpus_allowed 2147 * cpuset_init_smp - initialize cpus_allowed
2021 * 2148 *
@@ -2026,6 +2153,8 @@ void __init cpuset_init_smp(void)
2026{ 2153{
2027 top_cpuset.cpus_allowed = cpu_online_map; 2154 top_cpuset.cpus_allowed = cpu_online_map;
2028 top_cpuset.mems_allowed = node_online_map; 2155 top_cpuset.mems_allowed = node_online_map;
2156
2157 hotcpu_notifier(cpuset_handle_cpuhp, 0);
2029} 2158}
2030 2159
2031/** 2160/**
@@ -2195,7 +2324,7 @@ int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl)
2195 int i; 2324 int i;
2196 2325
2197 for (i = 0; zl->zones[i]; i++) { 2326 for (i = 0; zl->zones[i]; i++) {
2198 int nid = zl->zones[i]->zone_pgdat->node_id; 2327 int nid = zone_to_nid(zl->zones[i]);
2199 2328
2200 if (node_isset(nid, current->mems_allowed)) 2329 if (node_isset(nid, current->mems_allowed))
2201 return 1; 2330 return 1;
@@ -2266,9 +2395,9 @@ int __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
2266 const struct cpuset *cs; /* current cpuset ancestors */ 2395 const struct cpuset *cs; /* current cpuset ancestors */
2267 int allowed; /* is allocation in zone z allowed? */ 2396 int allowed; /* is allocation in zone z allowed? */
2268 2397
2269 if (in_interrupt()) 2398 if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
2270 return 1; 2399 return 1;
2271 node = z->zone_pgdat->node_id; 2400 node = zone_to_nid(z);
2272 might_sleep_if(!(gfp_mask & __GFP_HARDWALL)); 2401 might_sleep_if(!(gfp_mask & __GFP_HARDWALL));
2273 if (node_isset(node, current->mems_allowed)) 2402 if (node_isset(node, current->mems_allowed))
2274 return 1; 2403 return 1;
@@ -2370,7 +2499,7 @@ EXPORT_SYMBOL_GPL(cpuset_mem_spread_node);
2370int cpuset_excl_nodes_overlap(const struct task_struct *p) 2499int cpuset_excl_nodes_overlap(const struct task_struct *p)
2371{ 2500{
2372 const struct cpuset *cs1, *cs2; /* my and p's cpuset ancestors */ 2501 const struct cpuset *cs1, *cs2; /* my and p's cpuset ancestors */
2373 int overlap = 0; /* do cpusets overlap? */ 2502 int overlap = 1; /* do cpusets overlap? */
2374 2503
2375 task_lock(current); 2504 task_lock(current);
2376 if (current->flags & PF_EXITING) { 2505 if (current->flags & PF_EXITING) {
@@ -2442,31 +2571,43 @@ void __cpuset_memory_pressure_bump(void)
2442 */ 2571 */
2443static int proc_cpuset_show(struct seq_file *m, void *v) 2572static int proc_cpuset_show(struct seq_file *m, void *v)
2444{ 2573{
2574 struct pid *pid;
2445 struct task_struct *tsk; 2575 struct task_struct *tsk;
2446 char *buf; 2576 char *buf;
2447 int retval = 0; 2577 int retval;
2448 2578
2579 retval = -ENOMEM;
2449 buf = kmalloc(PAGE_SIZE, GFP_KERNEL); 2580 buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
2450 if (!buf) 2581 if (!buf)
2451 return -ENOMEM; 2582 goto out;
2583
2584 retval = -ESRCH;
2585 pid = m->private;
2586 tsk = get_pid_task(pid, PIDTYPE_PID);
2587 if (!tsk)
2588 goto out_free;
2452 2589
2453 tsk = m->private; 2590 retval = -EINVAL;
2454 mutex_lock(&manage_mutex); 2591 mutex_lock(&manage_mutex);
2592
2455 retval = cpuset_path(tsk->cpuset, buf, PAGE_SIZE); 2593 retval = cpuset_path(tsk->cpuset, buf, PAGE_SIZE);
2456 if (retval < 0) 2594 if (retval < 0)
2457 goto out; 2595 goto out_unlock;
2458 seq_puts(m, buf); 2596 seq_puts(m, buf);
2459 seq_putc(m, '\n'); 2597 seq_putc(m, '\n');
2460out: 2598out_unlock:
2461 mutex_unlock(&manage_mutex); 2599 mutex_unlock(&manage_mutex);
2600 put_task_struct(tsk);
2601out_free:
2462 kfree(buf); 2602 kfree(buf);
2603out:
2463 return retval; 2604 return retval;
2464} 2605}
2465 2606
2466static int cpuset_open(struct inode *inode, struct file *file) 2607static int cpuset_open(struct inode *inode, struct file *file)
2467{ 2608{
2468 struct task_struct *tsk = PROC_I(inode)->task; 2609 struct pid *pid = PROC_I(inode)->pid;
2469 return single_open(file, proc_cpuset_show, tsk); 2610 return single_open(file, proc_cpuset_show, pid);
2470} 2611}
2471 2612
2472struct file_operations proc_cpuset_operations = { 2613struct file_operations proc_cpuset_operations = {
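
The reworked proc_cpuset_show() takes a struct pid, resolves it with get_pid_task(), and unwinds its failure paths through a chain of labels (out_unlock, out_free, out) so each acquired resource is released exactly once. A generic sketch of that goto-unwind style in plain C, with malloc and a FILE standing in for the kernel objects:

#include <stdio.h>
#include <stdlib.h>

static int show_something(const char *path)
{
	int retval = -1;
	char *buf;
	FILE *f;

	buf = malloc(4096);
	if (!buf)
		goto out;		/* nothing acquired yet */

	f = fopen(path, "r");
	if (!f)
		goto out_free;		/* only the buffer needs releasing */

	if (!fgets(buf, 4096, f))
		goto out_close;		/* buffer and file both need releasing */

	fputs(buf, stdout);
	retval = 0;
out_close:
	fclose(f);
out_free:
	free(buf);
out:
	return retval;
}

int main(void)
{
	return show_something("/proc/version") ? 1 : 0;
}

Each label releases exactly the resources acquired before the corresponding jump, which is why the labels appear in the reverse order of the acquisitions.
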
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
new file mode 100644
index 000000000000..36752f124c6a
--- /dev/null
+++ b/kernel/delayacct.c
@@ -0,0 +1,162 @@
1/* delayacct.c - per-task delay accounting
2 *
3 * Copyright (C) Shailabh Nagar, IBM Corp. 2006
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it would be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
13 * the GNU General Public License for more details.
14 */
15
16#include <linux/sched.h>
17#include <linux/slab.h>
18#include <linux/time.h>
19#include <linux/sysctl.h>
20#include <linux/delayacct.h>
21
22int delayacct_on __read_mostly = 1; /* Delay accounting turned on/off */
23kmem_cache_t *delayacct_cache;
24
25static int __init delayacct_setup_disable(char *str)
26{
27 delayacct_on = 0;
28 return 1;
29}
30__setup("nodelayacct", delayacct_setup_disable);
31
32void delayacct_init(void)
33{
34 delayacct_cache = kmem_cache_create("delayacct_cache",
35 sizeof(struct task_delay_info),
36 0,
37 SLAB_PANIC,
38 NULL, NULL);
39 delayacct_tsk_init(&init_task);
40}
41
42void __delayacct_tsk_init(struct task_struct *tsk)
43{
44 tsk->delays = kmem_cache_zalloc(delayacct_cache, SLAB_KERNEL);
45 if (tsk->delays)
46 spin_lock_init(&tsk->delays->lock);
47}
48
49/*
50 * Start accounting for a delay statistic using
51 * its starting timestamp (@start)
52 */
53
54static inline void delayacct_start(struct timespec *start)
55{
56 do_posix_clock_monotonic_gettime(start);
57}
58
59/*
60 * Finish delay accounting for a statistic using
 61 * its timestamps (@start, @end), accumulator (@total) and @count
62 */
63
64static void delayacct_end(struct timespec *start, struct timespec *end,
65 u64 *total, u32 *count)
66{
67 struct timespec ts;
68 s64 ns;
69
70 do_posix_clock_monotonic_gettime(end);
71 ts = timespec_sub(*end, *start);
72 ns = timespec_to_ns(&ts);
73 if (ns < 0)
74 return;
75
76 spin_lock(&current->delays->lock);
77 *total += ns;
78 (*count)++;
79 spin_unlock(&current->delays->lock);
80}
81
82void __delayacct_blkio_start(void)
83{
84 delayacct_start(&current->delays->blkio_start);
85}
86
87void __delayacct_blkio_end(void)
88{
89 if (current->delays->flags & DELAYACCT_PF_SWAPIN)
90 /* Swapin block I/O */
91 delayacct_end(&current->delays->blkio_start,
92 &current->delays->blkio_end,
93 &current->delays->swapin_delay,
94 &current->delays->swapin_count);
95 else /* Other block I/O */
96 delayacct_end(&current->delays->blkio_start,
97 &current->delays->blkio_end,
98 &current->delays->blkio_delay,
99 &current->delays->blkio_count);
100}
101
102int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
103{
104 s64 tmp;
105 struct timespec ts;
106 unsigned long t1,t2,t3;
107
108 /* Though tsk->delays accessed later, early exit avoids
109 * unnecessary returning of other data
110 */
111 if (!tsk->delays)
112 goto done;
113
114 tmp = (s64)d->cpu_run_real_total;
115 cputime_to_timespec(tsk->utime + tsk->stime, &ts);
116 tmp += timespec_to_ns(&ts);
117 d->cpu_run_real_total = (tmp < (s64)d->cpu_run_real_total) ? 0 : tmp;
118
119 /*
120 * No locking available for sched_info (and too expensive to add one)
121 * Mitigate by taking snapshot of values
122 */
123 t1 = tsk->sched_info.pcnt;
124 t2 = tsk->sched_info.run_delay;
125 t3 = tsk->sched_info.cpu_time;
126
127 d->cpu_count += t1;
128
129 jiffies_to_timespec(t2, &ts);
130 tmp = (s64)d->cpu_delay_total + timespec_to_ns(&ts);
131 d->cpu_delay_total = (tmp < (s64)d->cpu_delay_total) ? 0 : tmp;
132
133 tmp = (s64)d->cpu_run_virtual_total + (s64)jiffies_to_usecs(t3) * 1000;
134 d->cpu_run_virtual_total =
135 (tmp < (s64)d->cpu_run_virtual_total) ? 0 : tmp;
136
137 /* zero XXX_total, non-zero XXX_count implies XXX stat overflowed */
138
139 spin_lock(&tsk->delays->lock);
140 tmp = d->blkio_delay_total + tsk->delays->blkio_delay;
141 d->blkio_delay_total = (tmp < d->blkio_delay_total) ? 0 : tmp;
142 tmp = d->swapin_delay_total + tsk->delays->swapin_delay;
143 d->swapin_delay_total = (tmp < d->swapin_delay_total) ? 0 : tmp;
144 d->blkio_count += tsk->delays->blkio_count;
145 d->swapin_count += tsk->delays->swapin_count;
146 spin_unlock(&tsk->delays->lock);
147
148done:
149 return 0;
150}
151
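
__delayacct_add_tsk() folds each delay into the taskstats totals with a "tmp = total + delta; if it went backwards, report 0" guard, so an overflowed 64-bit total shows up as zero while the still-growing count flags that an overflow happened. A standalone check of a variant of that guard using unsigned arithmetic (the kernel version compares through signed s64 casts):

#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

/* Add delta to *total; on wrap-around, clamp the reported total to 0. */
static void add_clamped(uint64_t *total, uint64_t delta)
{
	uint64_t tmp = *total + delta;

	*total = (tmp < *total) ? 0 : tmp;
}

int main(void)
{
	uint64_t total = UINT64_MAX - 5;

	add_clamped(&total, 3);		/* still fits */
	printf("after +3:  %" PRIu64 "\n", total);
	add_clamped(&total, 10);	/* would wrap, so it is reported as 0 instead */
	printf("after +10: %" PRIu64 "\n", total);
	return 0;
}
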
152__u64 __delayacct_blkio_ticks(struct task_struct *tsk)
153{
154 __u64 ret;
155
156 spin_lock(&tsk->delays->lock);
157 ret = nsec_to_clock_t(tsk->delays->blkio_delay +
158 tsk->delays->swapin_delay);
159 spin_unlock(&tsk->delays->lock);
160 return ret;
161}
162
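
The core of the new delay accounting is the delayacct_start()/delayacct_end() pair added above: take a monotonic timestamp when a delay begins, another when it ends, convert the difference to nanoseconds, and fold it into a per-task total and count under the per-task lock. A userspace approximation of that bookkeeping using clock_gettime(CLOCK_MONOTONIC) (a sketch only; the kernel path goes through do_posix_clock_monotonic_gettime() and delays->lock):

#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>
#include <time.h>
#include <unistd.h>

static uint64_t delay_total_ns;
static uint32_t delay_count;

static void delay_start(struct timespec *start)
{
	clock_gettime(CLOCK_MONOTONIC, start);
}

static void delay_end(const struct timespec *start)
{
	struct timespec end;
	int64_t ns;

	clock_gettime(CLOCK_MONOTONIC, &end);
	ns = (int64_t)(end.tv_sec - start->tv_sec) * 1000000000LL
	   + (end.tv_nsec - start->tv_nsec);
	if (ns < 0)
		return;			/* ignore a negative delta, as delayacct_end() does */

	delay_total_ns += ns;		/* the kernel accumulates this under delays->lock */
	delay_count++;
}

int main(void)
{
	struct timespec start;

	delay_start(&start);
	usleep(10000);			/* stand-in for waiting on block I/O or swapin */
	delay_end(&start);

	printf("delays: %" PRIu32 ", total %" PRIu64 " ns\n",
	       delay_count, delay_total_ns);
	return 0;
}
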
diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c
index c01cead2cfd6..3c2eaea66b1e 100644
--- a/kernel/exec_domain.c
+++ b/kernel/exec_domain.c
@@ -7,7 +7,6 @@
7 * 2001-05-06 Complete rewrite, Christoph Hellwig (hch@infradead.org) 7 * 2001-05-06 Complete rewrite, Christoph Hellwig (hch@infradead.org)
8 */ 8 */
9 9
10#include <linux/config.h>
11#include <linux/init.h> 10#include <linux/init.h>
12#include <linux/kernel.h> 11#include <linux/kernel.h>
13#include <linux/kmod.h> 12#include <linux/kmod.h>
diff --git a/kernel/exit.c b/kernel/exit.c
index a3baf92462bd..2e4c13cba95a 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -4,7 +4,6 @@
4 * Copyright (C) 1991, 1992 Linus Torvalds 4 * Copyright (C) 1991, 1992 Linus Torvalds
5 */ 5 */
6 6
7#include <linux/config.h>
8#include <linux/mm.h> 7#include <linux/mm.h>
9#include <linux/slab.h> 8#include <linux/slab.h>
10#include <linux/interrupt.h> 9#include <linux/interrupt.h>
@@ -26,6 +25,8 @@
26#include <linux/mount.h> 25#include <linux/mount.h>
27#include <linux/proc_fs.h> 26#include <linux/proc_fs.h>
28#include <linux/mempolicy.h> 27#include <linux/mempolicy.h>
28#include <linux/taskstats_kern.h>
29#include <linux/delayacct.h>
29#include <linux/cpuset.h> 30#include <linux/cpuset.h>
30#include <linux/syscalls.h> 31#include <linux/syscalls.h>
31#include <linux/signal.h> 32#include <linux/signal.h>
@@ -36,6 +37,7 @@
36#include <linux/compat.h> 37#include <linux/compat.h>
37#include <linux/pipe_fs_i.h> 38#include <linux/pipe_fs_i.h>
38#include <linux/audit.h> /* for audit_free() */ 39#include <linux/audit.h> /* for audit_free() */
40#include <linux/resource.h>
39 41
40#include <asm/uaccess.h> 42#include <asm/uaccess.h>
41#include <asm/unistd.h> 43#include <asm/unistd.h>
@@ -45,8 +47,6 @@
45extern void sem_exit (void); 47extern void sem_exit (void);
46extern struct task_struct *child_reaper; 48extern struct task_struct *child_reaper;
47 49
48int getrusage(struct task_struct *, int, struct rusage __user *);
49
50static void exit_mm(struct task_struct * tsk); 50static void exit_mm(struct task_struct * tsk);
51 51
52static void __unhash_process(struct task_struct *p) 52static void __unhash_process(struct task_struct *p)
@@ -136,14 +136,10 @@ static void delayed_put_task_struct(struct rcu_head *rhp)
136 136
137void release_task(struct task_struct * p) 137void release_task(struct task_struct * p)
138{ 138{
139 struct task_struct *leader;
139 int zap_leader; 140 int zap_leader;
140 task_t *leader;
141 struct dentry *proc_dentry;
142
143repeat: 141repeat:
144 atomic_dec(&p->user->processes); 142 atomic_dec(&p->user->processes);
145 spin_lock(&p->proc_lock);
146 proc_dentry = proc_pid_unhash(p);
147 write_lock_irq(&tasklist_lock); 143 write_lock_irq(&tasklist_lock);
148 ptrace_unlink(p); 144 ptrace_unlink(p);
149 BUG_ON(!list_empty(&p->ptrace_list) || !list_empty(&p->ptrace_children)); 145 BUG_ON(!list_empty(&p->ptrace_list) || !list_empty(&p->ptrace_children));
@@ -172,8 +168,7 @@ repeat:
172 168
173 sched_exit(p); 169 sched_exit(p);
174 write_unlock_irq(&tasklist_lock); 170 write_unlock_irq(&tasklist_lock);
175 spin_unlock(&p->proc_lock); 171 proc_flush_task(p);
176 proc_pid_flush(proc_dentry);
177 release_thread(p); 172 release_thread(p);
178 call_rcu(&p->rcu, delayed_put_task_struct); 173 call_rcu(&p->rcu, delayed_put_task_struct);
179 174
@@ -216,7 +211,7 @@ out:
216 * 211 *
217 * "I ask you, have you ever known what it is to be an orphan?" 212 * "I ask you, have you ever known what it is to be an orphan?"
218 */ 213 */
219static int will_become_orphaned_pgrp(int pgrp, task_t *ignored_task) 214static int will_become_orphaned_pgrp(int pgrp, struct task_struct *ignored_task)
220{ 215{
221 struct task_struct *p; 216 struct task_struct *p;
222 int ret = 1; 217 int ret = 1;
@@ -224,7 +219,7 @@ static int will_become_orphaned_pgrp(int pgrp, task_t *ignored_task)
224 do_each_task_pid(pgrp, PIDTYPE_PGID, p) { 219 do_each_task_pid(pgrp, PIDTYPE_PGID, p) {
225 if (p == ignored_task 220 if (p == ignored_task
226 || p->exit_state 221 || p->exit_state
227 || p->real_parent->pid == 1) 222 || is_init(p->real_parent))
228 continue; 223 continue;
229 if (process_group(p->real_parent) != pgrp 224 if (process_group(p->real_parent) != pgrp
230 && p->real_parent->signal->session == p->signal->session) { 225 && p->real_parent->signal->session == p->signal->session) {
@@ -254,17 +249,6 @@ static int has_stopped_jobs(int pgrp)
254 do_each_task_pid(pgrp, PIDTYPE_PGID, p) { 249 do_each_task_pid(pgrp, PIDTYPE_PGID, p) {
255 if (p->state != TASK_STOPPED) 250 if (p->state != TASK_STOPPED)
256 continue; 251 continue;
257
258 /* If p is stopped by a debugger on a signal that won't
259 stop it, then don't count p as stopped. This isn't
260 perfect but it's a good approximation. */
261 if (unlikely (p->ptrace)
262 && p->exit_code != SIGSTOP
263 && p->exit_code != SIGTSTP
264 && p->exit_code != SIGTTOU
265 && p->exit_code != SIGTTIN)
266 continue;
267
268 retval = 1; 252 retval = 1;
269 break; 253 break;
270 } while_each_task_pid(pgrp, PIDTYPE_PGID, p); 254 } while_each_task_pid(pgrp, PIDTYPE_PGID, p);
@@ -297,9 +281,7 @@ static void reparent_to_init(void)
297 /* Set the exit signal to SIGCHLD so we signal init on exit */ 281 /* Set the exit signal to SIGCHLD so we signal init on exit */
298 current->exit_signal = SIGCHLD; 282 current->exit_signal = SIGCHLD;
299 283
300 if ((current->policy == SCHED_NORMAL || 284 if (!has_rt_policy(current) && (task_nice(current) < 0))
301 current->policy == SCHED_BATCH)
302 && (task_nice(current) < 0))
303 set_user_nice(current, 0); 285 set_user_nice(current, 0);
304 /* cpus_allowed? */ 286 /* cpus_allowed? */
305 /* rt_priority? */ 287 /* rt_priority? */
@@ -492,6 +474,18 @@ void fastcall put_files_struct(struct files_struct *files)
492 474
493EXPORT_SYMBOL(put_files_struct); 475EXPORT_SYMBOL(put_files_struct);
494 476
477void reset_files_struct(struct task_struct *tsk, struct files_struct *files)
478{
479 struct files_struct *old;
480
481 old = tsk->files;
482 task_lock(tsk);
483 tsk->files = files;
484 task_unlock(tsk);
485 put_files_struct(old);
486}
487EXPORT_SYMBOL(reset_files_struct);
488
495static inline void __exit_files(struct task_struct *tsk) 489static inline void __exit_files(struct task_struct *tsk)
496{ 490{
497 struct files_struct * files = tsk->files; 491 struct files_struct * files = tsk->files;
@@ -589,7 +583,8 @@ static void exit_mm(struct task_struct * tsk)
589 mmput(mm); 583 mmput(mm);
590} 584}
591 585
592static inline void choose_new_parent(task_t *p, task_t *reaper) 586static inline void
587choose_new_parent(struct task_struct *p, struct task_struct *reaper)
593{ 588{
594 /* 589 /*
595 * Make sure we're not reparenting to ourselves and that 590 * Make sure we're not reparenting to ourselves and that
@@ -599,7 +594,8 @@ static inline void choose_new_parent(task_t *p, task_t *reaper)
599 p->real_parent = reaper; 594 p->real_parent = reaper;
600} 595}
601 596
602static void reparent_thread(task_t *p, task_t *father, int traced) 597static void
598reparent_thread(struct task_struct *p, struct task_struct *father, int traced)
603{ 599{
604 /* We don't want people slaying init. */ 600 /* We don't want people slaying init. */
605 if (p->exit_signal != -1) 601 if (p->exit_signal != -1)
@@ -663,8 +659,8 @@ static void reparent_thread(task_t *p, task_t *father, int traced)
663 * group, and if no such member exists, give it to 659 * group, and if no such member exists, give it to
664 * the global child reaper process (ie "init") 660 * the global child reaper process (ie "init")
665 */ 661 */
666static void forget_original_parent(struct task_struct * father, 662static void
667 struct list_head *to_release) 663forget_original_parent(struct task_struct *father, struct list_head *to_release)
668{ 664{
669 struct task_struct *p, *reaper = father; 665 struct task_struct *p, *reaper = father;
670 struct list_head *_p, *_n; 666 struct list_head *_p, *_n;
@@ -687,7 +683,7 @@ static void forget_original_parent(struct task_struct * father,
687 */ 683 */
688 list_for_each_safe(_p, _n, &father->children) { 684 list_for_each_safe(_p, _n, &father->children) {
689 int ptrace; 685 int ptrace;
690 p = list_entry(_p,struct task_struct,sibling); 686 p = list_entry(_p, struct task_struct, sibling);
691 687
692 ptrace = p->ptrace; 688 ptrace = p->ptrace;
693 689
@@ -716,7 +712,7 @@ static void forget_original_parent(struct task_struct * father,
716 list_add(&p->ptrace_list, to_release); 712 list_add(&p->ptrace_list, to_release);
717 } 713 }
718 list_for_each_safe(_p, _n, &father->ptrace_children) { 714 list_for_each_safe(_p, _n, &father->ptrace_children) {
719 p = list_entry(_p,struct task_struct,ptrace_list); 715 p = list_entry(_p, struct task_struct, ptrace_list);
720 choose_new_parent(p, reaper); 716 choose_new_parent(p, reaper);
721 reparent_thread(p, father, 1); 717 reparent_thread(p, father, 1);
722 } 718 }
@@ -836,7 +832,7 @@ static void exit_notify(struct task_struct *tsk)
836 832
837 list_for_each_safe(_p, _n, &ptrace_dead) { 833 list_for_each_safe(_p, _n, &ptrace_dead) {
838 list_del_init(_p); 834 list_del_init(_p);
839 t = list_entry(_p,struct task_struct,ptrace_list); 835 t = list_entry(_p, struct task_struct, ptrace_list);
840 release_task(t); 836 release_task(t);
841 } 837 }
842 838
@@ -848,7 +844,9 @@ static void exit_notify(struct task_struct *tsk)
848fastcall NORET_TYPE void do_exit(long code) 844fastcall NORET_TYPE void do_exit(long code)
849{ 845{
850 struct task_struct *tsk = current; 846 struct task_struct *tsk = current;
847 struct taskstats *tidstats;
851 int group_dead; 848 int group_dead;
849 unsigned int mycpu;
852 850
853 profile_task_exit(tsk); 851 profile_task_exit(tsk);
854 852
@@ -886,6 +884,8 @@ fastcall NORET_TYPE void do_exit(long code)
886 current->comm, current->pid, 884 current->comm, current->pid,
887 preempt_count()); 885 preempt_count());
888 886
887 taskstats_exit_alloc(&tidstats, &mycpu);
888
889 acct_update_integrals(tsk); 889 acct_update_integrals(tsk);
890 if (tsk->mm) { 890 if (tsk->mm) {
891 update_hiwater_rss(tsk->mm); 891 update_hiwater_rss(tsk->mm);
@@ -895,18 +895,23 @@ fastcall NORET_TYPE void do_exit(long code)
895 if (group_dead) { 895 if (group_dead) {
896 hrtimer_cancel(&tsk->signal->real_timer); 896 hrtimer_cancel(&tsk->signal->real_timer);
897 exit_itimers(tsk->signal); 897 exit_itimers(tsk->signal);
898 acct_process(code);
899 } 898 }
899 acct_collect(code, group_dead);
900 if (unlikely(tsk->robust_list)) 900 if (unlikely(tsk->robust_list))
901 exit_robust_list(tsk); 901 exit_robust_list(tsk);
902#ifdef CONFIG_COMPAT 902#if defined(CONFIG_FUTEX) && defined(CONFIG_COMPAT)
903 if (unlikely(tsk->compat_robust_list)) 903 if (unlikely(tsk->compat_robust_list))
904 compat_exit_robust_list(tsk); 904 compat_exit_robust_list(tsk);
905#endif 905#endif
906 if (unlikely(tsk->audit_context)) 906 if (unlikely(tsk->audit_context))
907 audit_free(tsk); 907 audit_free(tsk);
908 taskstats_exit_send(tsk, tidstats, group_dead, mycpu);
909 taskstats_exit_free(tidstats);
910
908 exit_mm(tsk); 911 exit_mm(tsk);
909 912
913 if (group_dead)
914 acct_process();
910 exit_sem(tsk); 915 exit_sem(tsk);
911 __exit_files(tsk); 916 __exit_files(tsk);
912 __exit_fs(tsk); 917 __exit_fs(tsk);
@@ -930,9 +935,17 @@ fastcall NORET_TYPE void do_exit(long code)
930 tsk->mempolicy = NULL; 935 tsk->mempolicy = NULL;
931#endif 936#endif
932 /* 937 /*
933 * If DEBUG_MUTEXES is on, make sure we are holding no locks: 938 * This must happen late, after the PID is not
939 * hashed anymore:
934 */ 940 */
935 mutex_debug_check_no_locks_held(tsk); 941 if (unlikely(!list_empty(&tsk->pi_state_list)))
942 exit_pi_state_list(tsk);
943 if (unlikely(current->pi_state_cache))
944 kfree(current->pi_state_cache);
945 /*
946 * Make sure we are holding no locks:
947 */
948 debug_check_no_locks_held(tsk);
936 949
937 if (tsk->io_context) 950 if (tsk->io_context)
938 exit_io_context(); 951 exit_io_context();
@@ -940,15 +953,15 @@ fastcall NORET_TYPE void do_exit(long code)
940 if (tsk->splice_pipe) 953 if (tsk->splice_pipe)
941 __free_pipe_info(tsk->splice_pipe); 954 __free_pipe_info(tsk->splice_pipe);
942 955
943 /* PF_DEAD causes final put_task_struct after we schedule. */
944 preempt_disable(); 956 preempt_disable();
945 BUG_ON(tsk->flags & PF_DEAD); 957 /* causes final put_task_struct in finish_task_switch(). */
946 tsk->flags |= PF_DEAD; 958 tsk->state = TASK_DEAD;
947 959
948 schedule(); 960 schedule();
949 BUG(); 961 BUG();
950 /* Avoid "noreturn function does return". */ 962 /* Avoid "noreturn function does return". */
951 for (;;) ; 963 for (;;)
964 cpu_relax(); /* For when BUG is null */
952} 965}
953 966
954EXPORT_SYMBOL_GPL(do_exit); 967EXPORT_SYMBOL_GPL(do_exit);
@@ -957,7 +970,7 @@ NORET_TYPE void complete_and_exit(struct completion *comp, long code)
957{ 970{
958 if (comp) 971 if (comp)
959 complete(comp); 972 complete(comp);
960 973
961 do_exit(code); 974 do_exit(code);
962} 975}
963 976
@@ -1007,7 +1020,7 @@ asmlinkage void sys_exit_group(int error_code)
1007 do_group_exit((error_code & 0xff) << 8); 1020 do_group_exit((error_code & 0xff) << 8);
1008} 1021}
1009 1022
1010static int eligible_child(pid_t pid, int options, task_t *p) 1023static int eligible_child(pid_t pid, int options, struct task_struct *p)
1011{ 1024{
1012 if (pid > 0) { 1025 if (pid > 0) {
1013 if (p->pid != pid) 1026 if (p->pid != pid)
@@ -1039,7 +1052,7 @@ static int eligible_child(pid_t pid, int options, task_t *p)
1039 * Do not consider thread group leaders that are 1052 * Do not consider thread group leaders that are
1040 * in a non-empty thread group: 1053 * in a non-empty thread group:
1041 */ 1054 */
1042 if (current->tgid != p->tgid && delay_group_leader(p)) 1055 if (delay_group_leader(p))
1043 return 2; 1056 return 2;
1044 1057
1045 if (security_task_wait(p)) 1058 if (security_task_wait(p))
@@ -1048,12 +1061,13 @@ static int eligible_child(pid_t pid, int options, task_t *p)
1048 return 1; 1061 return 1;
1049} 1062}
1050 1063
1051static int wait_noreap_copyout(task_t *p, pid_t pid, uid_t uid, 1064static int wait_noreap_copyout(struct task_struct *p, pid_t pid, uid_t uid,
1052 int why, int status, 1065 int why, int status,
1053 struct siginfo __user *infop, 1066 struct siginfo __user *infop,
1054 struct rusage __user *rusagep) 1067 struct rusage __user *rusagep)
1055{ 1068{
1056 int retval = rusagep ? getrusage(p, RUSAGE_BOTH, rusagep) : 0; 1069 int retval = rusagep ? getrusage(p, RUSAGE_BOTH, rusagep) : 0;
1070
1057 put_task_struct(p); 1071 put_task_struct(p);
1058 if (!retval) 1072 if (!retval)
1059 retval = put_user(SIGCHLD, &infop->si_signo); 1073 retval = put_user(SIGCHLD, &infop->si_signo);
@@ -1078,7 +1092,7 @@ static int wait_noreap_copyout(task_t *p, pid_t pid, uid_t uid,
1078 * the lock and this task is uninteresting. If we return nonzero, we have 1092 * the lock and this task is uninteresting. If we return nonzero, we have
1079 * released the lock and the system call should return. 1093 * released the lock and the system call should return.
1080 */ 1094 */
1081static int wait_task_zombie(task_t *p, int noreap, 1095static int wait_task_zombie(struct task_struct *p, int noreap,
1082 struct siginfo __user *infop, 1096 struct siginfo __user *infop,
1083 int __user *stat_addr, struct rusage __user *ru) 1097 int __user *stat_addr, struct rusage __user *ru)
1084{ 1098{
@@ -1240,8 +1254,8 @@ static int wait_task_zombie(task_t *p, int noreap,
1240 * the lock and this task is uninteresting. If we return nonzero, we have 1254 * the lock and this task is uninteresting. If we return nonzero, we have
1241 * released the lock and the system call should return. 1255 * released the lock and the system call should return.
1242 */ 1256 */
1243static int wait_task_stopped(task_t *p, int delayed_group_leader, int noreap, 1257static int wait_task_stopped(struct task_struct *p, int delayed_group_leader,
1244 struct siginfo __user *infop, 1258 int noreap, struct siginfo __user *infop,
1245 int __user *stat_addr, struct rusage __user *ru) 1259 int __user *stat_addr, struct rusage __user *ru)
1246{ 1260{
1247 int retval, exit_code; 1261 int retval, exit_code;
@@ -1355,7 +1369,7 @@ bail_ref:
1355 * the lock and this task is uninteresting. If we return nonzero, we have 1369 * the lock and this task is uninteresting. If we return nonzero, we have
1356 * released the lock and the system call should return. 1370 * released the lock and the system call should return.
1357 */ 1371 */
1358static int wait_task_continued(task_t *p, int noreap, 1372static int wait_task_continued(struct task_struct *p, int noreap,
1359 struct siginfo __user *infop, 1373 struct siginfo __user *infop,
1360 int __user *stat_addr, struct rusage __user *ru) 1374 int __user *stat_addr, struct rusage __user *ru)
1361{ 1375{
@@ -1441,7 +1455,7 @@ repeat:
1441 int ret; 1455 int ret;
1442 1456
1443 list_for_each(_p,&tsk->children) { 1457 list_for_each(_p,&tsk->children) {
1444 p = list_entry(_p,struct task_struct,sibling); 1458 p = list_entry(_p, struct task_struct, sibling);
1445 1459
1446 ret = eligible_child(pid, options, p); 1460 ret = eligible_child(pid, options, p);
1447 if (!ret) 1461 if (!ret)
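Note on the exit.c hunks above: the patch retires the PF_DEAD flag in favour of the TASK_DEAD task state and moves the wait helpers from the old task_t typedef to struct task_struct. A minimal sketch of the reworked tail of do_exit(), assembled from the hunks above with the reasoning added as an inline comment (not a drop-in replacement):

	preempt_disable();
	/*
	 * finish_task_switch() on the next task performs the final
	 * put_task_struct() once it sees TASK_DEAD, so this task_struct
	 * must stay valid until the CPU has actually switched away;
	 * that is why preemption stays disabled across the state change.
	 */
	tsk->state = TASK_DEAD;
	schedule();
	BUG();
	/* Avoid "noreturn function does return" warnings: */
	for (;;)
		cpu_relax();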
diff --git a/kernel/fork.c b/kernel/fork.c
index 49adc0e8d47c..1c999f3e0b47 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -11,7 +11,6 @@
11 * management can be a bitch. See 'mm/memory.c': 'copy_page_range()' 11 * management can be a bitch. See 'mm/memory.c': 'copy_page_range()'
12 */ 12 */
13 13
14#include <linux/config.h>
15#include <linux/slab.h> 14#include <linux/slab.h>
16#include <linux/init.h> 15#include <linux/init.h>
17#include <linux/unistd.h> 16#include <linux/unistd.h>
@@ -44,6 +43,9 @@
44#include <linux/rmap.h> 43#include <linux/rmap.h>
45#include <linux/acct.h> 44#include <linux/acct.h>
46#include <linux/cn_proc.h> 45#include <linux/cn_proc.h>
46#include <linux/delayacct.h>
47#include <linux/taskstats_kern.h>
48#include <linux/random.h>
47 49
48#include <asm/pgtable.h> 50#include <asm/pgtable.h>
49#include <asm/pgalloc.h> 51#include <asm/pgalloc.h>
@@ -62,9 +64,7 @@ int max_threads; /* tunable limit on nr_threads */
62 64
63DEFINE_PER_CPU(unsigned long, process_counts) = 0; 65DEFINE_PER_CPU(unsigned long, process_counts) = 0;
64 66
65 __cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */ 67__cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */
66
67EXPORT_SYMBOL(tasklist_lock);
68 68
69int nr_processes(void) 69int nr_processes(void)
70{ 70{
@@ -104,6 +104,7 @@ static kmem_cache_t *mm_cachep;
104void free_task(struct task_struct *tsk) 104void free_task(struct task_struct *tsk)
105{ 105{
106 free_thread_info(tsk->thread_info); 106 free_thread_info(tsk->thread_info);
107 rt_mutex_debug_task_free(tsk);
107 free_task_struct(tsk); 108 free_task_struct(tsk);
108} 109}
109EXPORT_SYMBOL(free_task); 110EXPORT_SYMBOL(free_task);
@@ -117,6 +118,7 @@ void __put_task_struct(struct task_struct *tsk)
117 security_task_free(tsk); 118 security_task_free(tsk);
118 free_uid(tsk->user); 119 free_uid(tsk->user);
119 put_group_info(tsk->group_info); 120 put_group_info(tsk->group_info);
121 delayacct_tsk_free(tsk);
120 122
121 if (!profile_handoff_task(tsk)) 123 if (!profile_handoff_task(tsk))
122 free_task(tsk); 124 free_task(tsk);
@@ -174,10 +176,16 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
174 tsk->thread_info = ti; 176 tsk->thread_info = ti;
175 setup_thread_stack(tsk, orig); 177 setup_thread_stack(tsk, orig);
176 178
179#ifdef CONFIG_CC_STACKPROTECTOR
180 tsk->stack_canary = get_random_int();
181#endif
182
177 /* One for us, one for whoever does the "release_task()" (usually parent) */ 183 /* One for us, one for whoever does the "release_task()" (usually parent) */
178 atomic_set(&tsk->usage,2); 184 atomic_set(&tsk->usage,2);
179 atomic_set(&tsk->fs_excl, 0); 185 atomic_set(&tsk->fs_excl, 0);
186#ifdef CONFIG_BLK_DEV_IO_TRACE
180 tsk->btrace_seq = 0; 187 tsk->btrace_seq = 0;
188#endif
181 tsk->splice_pipe = NULL; 189 tsk->splice_pipe = NULL;
182 return tsk; 190 return tsk;
183} 191}
@@ -193,7 +201,10 @@ static inline int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
193 201
194 down_write(&oldmm->mmap_sem); 202 down_write(&oldmm->mmap_sem);
195 flush_cache_mm(oldmm); 203 flush_cache_mm(oldmm);
196 down_write(&mm->mmap_sem); 204 /*
205 * Not linked in yet - no deadlock potential:
206 */
207 down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING);
197 208
198 mm->locked_vm = 0; 209 mm->locked_vm = 0;
199 mm->mmap = NULL; 210 mm->mmap = NULL;
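The dup_mmap() hunk above takes the child's mmap_sem with down_write_nested(..., SINGLE_DEPTH_NESTING): parent and child mmap_sem belong to the same lock class, so lockdep needs an explicit hint that this nesting is intentional and bounded. A hedged sketch of the general annotation pattern; the helper name and the two-semaphore setup are illustrative only:

	#include <linux/lockdep.h>
	#include <linux/rwsem.h>

	/* Illustrative only: two rw_semaphores of the same lock class,
	 * where the second object is not yet reachable by anyone else. */
	static void copy_into_private_child(struct rw_semaphore *parent_sem,
					    struct rw_semaphore *child_sem)
	{
		down_write(parent_sem);
		/* Not linked in yet - no deadlock potential, tell lockdep so: */
		down_write_nested(child_sem, SINGLE_DEPTH_NESTING);

		/* ... copy state from the parent object to the child ... */

		up_write(child_sem);
		up_write(parent_sem);
	}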
@@ -817,6 +828,7 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts
817 if (clone_flags & CLONE_THREAD) { 828 if (clone_flags & CLONE_THREAD) {
818 atomic_inc(&current->signal->count); 829 atomic_inc(&current->signal->count);
819 atomic_inc(&current->signal->live); 830 atomic_inc(&current->signal->live);
831 taskstats_tgid_alloc(current->signal);
820 return 0; 832 return 0;
821 } 833 }
822 sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL); 834 sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL);
@@ -861,6 +873,7 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts
861 INIT_LIST_HEAD(&sig->cpu_timers[0]); 873 INIT_LIST_HEAD(&sig->cpu_timers[0]);
862 INIT_LIST_HEAD(&sig->cpu_timers[1]); 874 INIT_LIST_HEAD(&sig->cpu_timers[1]);
863 INIT_LIST_HEAD(&sig->cpu_timers[2]); 875 INIT_LIST_HEAD(&sig->cpu_timers[2]);
876 taskstats_tgid_init(sig);
864 877
865 task_lock(current->group_leader); 878 task_lock(current->group_leader);
866 memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim); 879 memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim);
@@ -874,6 +887,7 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts
874 tsk->it_prof_expires = 887 tsk->it_prof_expires =
875 secs_to_cputime(sig->rlim[RLIMIT_CPU].rlim_cur); 888 secs_to_cputime(sig->rlim[RLIMIT_CPU].rlim_cur);
876 } 889 }
890 acct_init_pacct(&sig->pacct);
877 891
878 return 0; 892 return 0;
879} 893}
@@ -881,6 +895,7 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts
881void __cleanup_signal(struct signal_struct *sig) 895void __cleanup_signal(struct signal_struct *sig)
882{ 896{
883 exit_thread_group_keys(sig); 897 exit_thread_group_keys(sig);
898 taskstats_tgid_free(sig);
884 kmem_cache_free(signal_cachep, sig); 899 kmem_cache_free(signal_cachep, sig);
885} 900}
886 901
@@ -912,6 +927,15 @@ asmlinkage long sys_set_tid_address(int __user *tidptr)
912 return current->pid; 927 return current->pid;
913} 928}
914 929
930static inline void rt_mutex_init_task(struct task_struct *p)
931{
932#ifdef CONFIG_RT_MUTEXES
933 spin_lock_init(&p->pi_lock);
934 plist_head_init(&p->pi_waiters, &p->pi_lock);
935 p->pi_blocked_on = NULL;
936#endif
937}
938
915/* 939/*
916 * This creates a new process as a copy of the old one, 940 * This creates a new process as a copy of the old one,
917 * but does not actually start it yet. 941 * but does not actually start it yet.
@@ -920,13 +944,13 @@ asmlinkage long sys_set_tid_address(int __user *tidptr)
920 * parts of the process environment (as per the clone 944 * parts of the process environment (as per the clone
921 * flags). The actual kick-off is left to the caller. 945 * flags). The actual kick-off is left to the caller.
922 */ 946 */
923static task_t *copy_process(unsigned long clone_flags, 947static struct task_struct *copy_process(unsigned long clone_flags,
924 unsigned long stack_start, 948 unsigned long stack_start,
925 struct pt_regs *regs, 949 struct pt_regs *regs,
926 unsigned long stack_size, 950 unsigned long stack_size,
927 int __user *parent_tidptr, 951 int __user *parent_tidptr,
928 int __user *child_tidptr, 952 int __user *child_tidptr,
929 int pid) 953 int pid)
930{ 954{
931 int retval; 955 int retval;
932 struct task_struct *p = NULL; 956 struct task_struct *p = NULL;
@@ -958,6 +982,10 @@ static task_t *copy_process(unsigned long clone_flags,
958 if (!p) 982 if (!p)
959 goto fork_out; 983 goto fork_out;
960 984
985#ifdef CONFIG_TRACE_IRQFLAGS
986 DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled);
987 DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled);
988#endif
961 retval = -EAGAIN; 989 retval = -EAGAIN;
962 if (atomic_read(&p->user->processes) >= 990 if (atomic_read(&p->user->processes) >=
963 p->signal->rlim[RLIMIT_NPROC].rlim_cur) { 991 p->signal->rlim[RLIMIT_NPROC].rlim_cur) {
@@ -985,20 +1013,18 @@ static task_t *copy_process(unsigned long clone_flags,
985 goto bad_fork_cleanup_put_domain; 1013 goto bad_fork_cleanup_put_domain;
986 1014
987 p->did_exec = 0; 1015 p->did_exec = 0;
1016 delayacct_tsk_init(p); /* Must remain after dup_task_struct() */
988 copy_flags(clone_flags, p); 1017 copy_flags(clone_flags, p);
989 p->pid = pid; 1018 p->pid = pid;
990 retval = -EFAULT; 1019 retval = -EFAULT;
991 if (clone_flags & CLONE_PARENT_SETTID) 1020 if (clone_flags & CLONE_PARENT_SETTID)
992 if (put_user(p->pid, parent_tidptr)) 1021 if (put_user(p->pid, parent_tidptr))
993 goto bad_fork_cleanup; 1022 goto bad_fork_cleanup_delays_binfmt;
994
995 p->proc_dentry = NULL;
996 1023
997 INIT_LIST_HEAD(&p->children); 1024 INIT_LIST_HEAD(&p->children);
998 INIT_LIST_HEAD(&p->sibling); 1025 INIT_LIST_HEAD(&p->sibling);
999 p->vfork_done = NULL; 1026 p->vfork_done = NULL;
1000 spin_lock_init(&p->alloc_lock); 1027 spin_lock_init(&p->alloc_lock);
1001 spin_lock_init(&p->proc_lock);
1002 1028
1003 clear_tsk_thread_flag(p, TIF_SIGPENDING); 1029 clear_tsk_thread_flag(p, TIF_SIGPENDING);
1004 init_sigpending(&p->pending); 1030 init_sigpending(&p->pending);
@@ -1035,6 +1061,32 @@ static task_t *copy_process(unsigned long clone_flags,
1035 } 1061 }
1036 mpol_fix_fork_child_flag(p); 1062 mpol_fix_fork_child_flag(p);
1037#endif 1063#endif
1064#ifdef CONFIG_TRACE_IRQFLAGS
1065 p->irq_events = 0;
1066#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
1067 p->hardirqs_enabled = 1;
1068#else
1069 p->hardirqs_enabled = 0;
1070#endif
1071 p->hardirq_enable_ip = 0;
1072 p->hardirq_enable_event = 0;
1073 p->hardirq_disable_ip = _THIS_IP_;
1074 p->hardirq_disable_event = 0;
1075 p->softirqs_enabled = 1;
1076 p->softirq_enable_ip = _THIS_IP_;
1077 p->softirq_enable_event = 0;
1078 p->softirq_disable_ip = 0;
1079 p->softirq_disable_event = 0;
1080 p->hardirq_context = 0;
1081 p->softirq_context = 0;
1082#endif
1083#ifdef CONFIG_LOCKDEP
1084 p->lockdep_depth = 0; /* no locks held yet */
1085 p->curr_chain_key = 0;
1086 p->lockdep_recursion = 0;
1087#endif
1088
1089 rt_mutex_init_task(p);
1038 1090
1039#ifdef CONFIG_DEBUG_MUTEXES 1091#ifdef CONFIG_DEBUG_MUTEXES
1040 p->blocked_on = NULL; /* not blocked yet */ 1092 p->blocked_on = NULL; /* not blocked yet */
@@ -1078,6 +1130,9 @@ static task_t *copy_process(unsigned long clone_flags,
1078#ifdef CONFIG_COMPAT 1130#ifdef CONFIG_COMPAT
1079 p->compat_robust_list = NULL; 1131 p->compat_robust_list = NULL;
1080#endif 1132#endif
1133 INIT_LIST_HEAD(&p->pi_state_list);
1134 p->pi_state_cache = NULL;
1135
1081 /* 1136 /*
1082 * sigaltstack should be cleared when sharing the same VM 1137 * sigaltstack should be cleared when sharing the same VM
1083 */ 1138 */
@@ -1095,7 +1150,6 @@ static task_t *copy_process(unsigned long clone_flags,
1095 1150
1096 /* Our parent execution domain becomes current domain 1151 /* Our parent execution domain becomes current domain
1097 These must match for thread signalling to apply */ 1152 These must match for thread signalling to apply */
1098
1099 p->parent_exec_id = p->self_exec_id; 1153 p->parent_exec_id = p->self_exec_id;
1100 1154
1101 /* ok, now we should be set up.. */ 1155 /* ok, now we should be set up.. */
@@ -1118,6 +1172,9 @@ static task_t *copy_process(unsigned long clone_flags,
1118 /* Need tasklist lock for parent etc handling! */ 1172 /* Need tasklist lock for parent etc handling! */
1119 write_lock_irq(&tasklist_lock); 1173 write_lock_irq(&tasklist_lock);
1120 1174
1175 /* for sys_ioprio_set(IOPRIO_WHO_PGRP) */
1176 p->ioprio = current->ioprio;
1177
1121 /* 1178 /*
1122 * The task hasn't been attached yet, so its cpus_allowed mask will 1179 * The task hasn't been attached yet, so its cpus_allowed mask will
1123 * not be changed, nor will its assigned CPU. 1180 * not be changed, nor will its assigned CPU.
@@ -1158,18 +1215,6 @@ static task_t *copy_process(unsigned long clone_flags,
1158 } 1215 }
1159 1216
1160 if (clone_flags & CLONE_THREAD) { 1217 if (clone_flags & CLONE_THREAD) {
1161 /*
1162 * Important: if an exit-all has been started then
1163 * do not create this new thread - the whole thread
1164 * group is supposed to exit anyway.
1165 */
1166 if (current->signal->flags & SIGNAL_GROUP_EXIT) {
1167 spin_unlock(&current->sighand->siglock);
1168 write_unlock_irq(&tasklist_lock);
1169 retval = -EAGAIN;
1170 goto bad_fork_cleanup_namespace;
1171 }
1172
1173 p->group_leader = current->group_leader; 1218 p->group_leader = current->group_leader;
1174 list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group); 1219 list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group);
1175 1220
@@ -1189,11 +1234,6 @@ static task_t *copy_process(unsigned long clone_flags,
1189 } 1234 }
1190 } 1235 }
1191 1236
1192 /*
1193 * inherit ioprio
1194 */
1195 p->ioprio = current->ioprio;
1196
1197 if (likely(p->pid)) { 1237 if (likely(p->pid)) {
1198 add_parent(p); 1238 add_parent(p);
1199 if (unlikely(p->ptrace & PT_PTRACED)) 1239 if (unlikely(p->ptrace & PT_PTRACED))
@@ -1246,7 +1286,8 @@ bad_fork_cleanup_policy:
1246bad_fork_cleanup_cpuset: 1286bad_fork_cleanup_cpuset:
1247#endif 1287#endif
1248 cpuset_exit(p); 1288 cpuset_exit(p);
1249bad_fork_cleanup: 1289bad_fork_cleanup_delays_binfmt:
1290 delayacct_tsk_free(p);
1250 if (p->binfmt) 1291 if (p->binfmt)
1251 module_put(p->binfmt->module); 1292 module_put(p->binfmt->module);
1252bad_fork_cleanup_put_domain: 1293bad_fork_cleanup_put_domain:
@@ -1267,9 +1308,9 @@ struct pt_regs * __devinit __attribute__((weak)) idle_regs(struct pt_regs *regs)
1267 return regs; 1308 return regs;
1268} 1309}
1269 1310
1270task_t * __devinit fork_idle(int cpu) 1311struct task_struct * __devinit fork_idle(int cpu)
1271{ 1312{
1272 task_t *task; 1313 struct task_struct *task;
1273 struct pt_regs regs; 1314 struct pt_regs regs;
1274 1315
1275 task = copy_process(CLONE_VM, 0, idle_regs(&regs), 0, NULL, NULL, 0); 1316 task = copy_process(CLONE_VM, 0, idle_regs(&regs), 0, NULL, NULL, 0);
@@ -1356,8 +1397,10 @@ long do_fork(unsigned long clone_flags,
1356 1397
1357 if (clone_flags & CLONE_VFORK) { 1398 if (clone_flags & CLONE_VFORK) {
1358 wait_for_completion(&vfork); 1399 wait_for_completion(&vfork);
1359 if (unlikely (current->ptrace & PT_TRACE_VFORK_DONE)) 1400 if (unlikely (current->ptrace & PT_TRACE_VFORK_DONE)) {
1401 current->ptrace_message = nr;
1360 ptrace_notify ((PTRACE_EVENT_VFORK_DONE << 8) | SIGTRAP); 1402 ptrace_notify ((PTRACE_EVENT_VFORK_DONE << 8) | SIGTRAP);
1403 }
1361 } 1404 }
1362 } else { 1405 } else {
1363 free_pid(pid); 1406 free_pid(pid);
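Taken together, the fork.c hunks above give every new task its PI-futex bookkeeping before it becomes visible: rt_mutex_init_task() prepares the priority-inheritance waiter machinery, and copy_process() initialises the list of pi_state objects the task may come to own. An abbreviated sketch, with field names copied from the hunks (the real task_struct carries far more members):

	/* Per-task PI-futex state set up during fork (abbreviated): */
	#ifdef CONFIG_RT_MUTEXES
		spin_lock_init(&p->pi_lock);			/* protects the plist below */
		plist_head_init(&p->pi_waiters, &p->pi_lock);	/* boosting waiters, prio-sorted */
		p->pi_blocked_on = NULL;			/* rt_mutex waiter we block on */
	#endif
		INIT_LIST_HEAD(&p->pi_state_list);		/* futex_pi_state objects we own */
		p->pi_state_cache = NULL;			/* filled by refill_pi_state_cache() */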
diff --git a/kernel/futex.c b/kernel/futex.c
index e1a380c77a5a..4b6770e9806d 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -12,6 +12,10 @@
12 * (C) Copyright 2006 Red Hat Inc, All Rights Reserved 12 * (C) Copyright 2006 Red Hat Inc, All Rights Reserved
13 * Thanks to Thomas Gleixner for suggestions, analysis and fixes. 13 * Thanks to Thomas Gleixner for suggestions, analysis and fixes.
14 * 14 *
15 * PI-futex support started by Ingo Molnar and Thomas Gleixner
16 * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
17 * Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
18 *
15 * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly 19 * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
16 * enough at me, Linus for the original (flawed) idea, Matthew 20 * enough at me, Linus for the original (flawed) idea, Matthew
17 * Kirkwood for proof-of-concept implementation. 21 * Kirkwood for proof-of-concept implementation.
@@ -46,6 +50,8 @@
46#include <linux/signal.h> 50#include <linux/signal.h>
47#include <asm/futex.h> 51#include <asm/futex.h>
48 52
53#include "rtmutex_common.h"
54
49#define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8) 55#define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8)
50 56
51/* 57/*
@@ -63,7 +69,7 @@ union futex_key {
63 int offset; 69 int offset;
64 } shared; 70 } shared;
65 struct { 71 struct {
66 unsigned long uaddr; 72 unsigned long address;
67 struct mm_struct *mm; 73 struct mm_struct *mm;
68 int offset; 74 int offset;
69 } private; 75 } private;
@@ -75,6 +81,27 @@ union futex_key {
75}; 81};
76 82
77/* 83/*
84 * Priority Inheritance state:
85 */
86struct futex_pi_state {
87 /*
88 * list of 'owned' pi_state instances - these have to be
89 * cleaned up in do_exit() if the task exits prematurely:
90 */
91 struct list_head list;
92
93 /*
94 * The PI object:
95 */
96 struct rt_mutex pi_mutex;
97
98 struct task_struct *owner;
99 atomic_t refcount;
100
101 union futex_key key;
102};
103
104/*
78 * We use this hashed waitqueue instead of a normal wait_queue_t, so 105 * We use this hashed waitqueue instead of a normal wait_queue_t, so
79 * we can wake only the relevant ones (hashed queues may be shared). 106 * we can wake only the relevant ones (hashed queues may be shared).
80 * 107 *
@@ -87,15 +114,19 @@ struct futex_q {
87 struct list_head list; 114 struct list_head list;
88 wait_queue_head_t waiters; 115 wait_queue_head_t waiters;
89 116
90 /* Which hash list lock to use. */ 117 /* Which hash list lock to use: */
91 spinlock_t *lock_ptr; 118 spinlock_t *lock_ptr;
92 119
93 /* Key which the futex is hashed on. */ 120 /* Key which the futex is hashed on: */
94 union futex_key key; 121 union futex_key key;
95 122
96 /* For fd, sigio sent using these. */ 123 /* For fd, sigio sent using these: */
97 int fd; 124 int fd;
98 struct file *filp; 125 struct file *filp;
126
127 /* Optional priority inheritance state: */
128 struct futex_pi_state *pi_state;
129 struct task_struct *task;
99}; 130};
100 131
101/* 132/*
@@ -144,8 +175,9 @@ static inline int match_futex(union futex_key *key1, union futex_key *key2)
144 * 175 *
145 * Should be called with &current->mm->mmap_sem but NOT any spinlocks. 176 * Should be called with &current->mm->mmap_sem but NOT any spinlocks.
146 */ 177 */
147static int get_futex_key(unsigned long uaddr, union futex_key *key) 178static int get_futex_key(u32 __user *uaddr, union futex_key *key)
148{ 179{
180 unsigned long address = (unsigned long)uaddr;
149 struct mm_struct *mm = current->mm; 181 struct mm_struct *mm = current->mm;
150 struct vm_area_struct *vma; 182 struct vm_area_struct *vma;
151 struct page *page; 183 struct page *page;
@@ -154,16 +186,16 @@ static int get_futex_key(unsigned long uaddr, union futex_key *key)
154 /* 186 /*
155 * The futex address must be "naturally" aligned. 187 * The futex address must be "naturally" aligned.
156 */ 188 */
157 key->both.offset = uaddr % PAGE_SIZE; 189 key->both.offset = address % PAGE_SIZE;
158 if (unlikely((key->both.offset % sizeof(u32)) != 0)) 190 if (unlikely((key->both.offset % sizeof(u32)) != 0))
159 return -EINVAL; 191 return -EINVAL;
160 uaddr -= key->both.offset; 192 address -= key->both.offset;
161 193
162 /* 194 /*
163 * The futex is hashed differently depending on whether 195 * The futex is hashed differently depending on whether
164 * it's in a shared or private mapping. So check vma first. 196 * it's in a shared or private mapping. So check vma first.
165 */ 197 */
166 vma = find_extend_vma(mm, uaddr); 198 vma = find_extend_vma(mm, address);
167 if (unlikely(!vma)) 199 if (unlikely(!vma))
168 return -EFAULT; 200 return -EFAULT;
169 201
@@ -184,7 +216,7 @@ static int get_futex_key(unsigned long uaddr, union futex_key *key)
184 */ 216 */
185 if (likely(!(vma->vm_flags & VM_MAYSHARE))) { 217 if (likely(!(vma->vm_flags & VM_MAYSHARE))) {
186 key->private.mm = mm; 218 key->private.mm = mm;
187 key->private.uaddr = uaddr; 219 key->private.address = address;
188 return 0; 220 return 0;
189 } 221 }
190 222
@@ -194,7 +226,7 @@ static int get_futex_key(unsigned long uaddr, union futex_key *key)
194 key->shared.inode = vma->vm_file->f_dentry->d_inode; 226 key->shared.inode = vma->vm_file->f_dentry->d_inode;
195 key->both.offset++; /* Bit 0 of offset indicates inode-based key. */ 227 key->both.offset++; /* Bit 0 of offset indicates inode-based key. */
196 if (likely(!(vma->vm_flags & VM_NONLINEAR))) { 228 if (likely(!(vma->vm_flags & VM_NONLINEAR))) {
197 key->shared.pgoff = (((uaddr - vma->vm_start) >> PAGE_SHIFT) 229 key->shared.pgoff = (((address - vma->vm_start) >> PAGE_SHIFT)
198 + vma->vm_pgoff); 230 + vma->vm_pgoff);
199 return 0; 231 return 0;
200 } 232 }
@@ -205,7 +237,7 @@ static int get_futex_key(unsigned long uaddr, union futex_key *key)
205 * from swap. But that's a lot of code to duplicate here 237 * from swap. But that's a lot of code to duplicate here
206 * for a rare case, so we simply fetch the page. 238 * for a rare case, so we simply fetch the page.
207 */ 239 */
208 err = get_user_pages(current, mm, uaddr, 1, 0, 0, &page, NULL); 240 err = get_user_pages(current, mm, address, 1, 0, 0, &page, NULL);
209 if (err >= 0) { 241 if (err >= 0) {
210 key->shared.pgoff = 242 key->shared.pgoff =
211 page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); 243 page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
@@ -246,18 +278,259 @@ static void drop_key_refs(union futex_key *key)
246 } 278 }
247} 279}
248 280
249static inline int get_futex_value_locked(int *dest, int __user *from) 281static inline int get_futex_value_locked(u32 *dest, u32 __user *from)
250{ 282{
251 int ret; 283 int ret;
252 284
253 inc_preempt_count(); 285 inc_preempt_count();
254 ret = __copy_from_user_inatomic(dest, from, sizeof(int)); 286 ret = __copy_from_user_inatomic(dest, from, sizeof(u32));
255 dec_preempt_count(); 287 dec_preempt_count();
256 288
257 return ret ? -EFAULT : 0; 289 return ret ? -EFAULT : 0;
258} 290}
259 291
260/* 292/*
293 * Fault handling. Called with current->mm->mmap_sem held.
294 */
295static int futex_handle_fault(unsigned long address, int attempt)
296{
297 struct vm_area_struct * vma;
298 struct mm_struct *mm = current->mm;
299
300 if (attempt > 2 || !(vma = find_vma(mm, address)) ||
301 vma->vm_start > address || !(vma->vm_flags & VM_WRITE))
302 return -EFAULT;
303
304 switch (handle_mm_fault(mm, vma, address, 1)) {
305 case VM_FAULT_MINOR:
306 current->min_flt++;
307 break;
308 case VM_FAULT_MAJOR:
309 current->maj_flt++;
310 break;
311 default:
312 return -EFAULT;
313 }
314 return 0;
315}
316
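futex_handle_fault() factors out the in-place fault handling that futex_wake_op() used to open-code. Its callers follow a two-stage retry: on the first -EFAULT they drop mmap_sem and let get_user() fault the page in the normal way; only on later attempts, still under mmap_sem, do they call futex_handle_fault(), with the attempt counter bounding the loop. A hedged, control-flow-only sketch of that caller pattern (hash-bucket locking elided, variable declarations abbreviated):

	int attempt = 0, ret;
	u32 dummy;

retry_full:
	down_read(&current->mm->mmap_sem);
retry:
	ret = futex_atomic_op_inuser(op, uaddr);	/* may fault in-atomic */
	if (unlikely(ret == -EFAULT)) {
		if (attempt++) {
			/* mmap_sem still held: try to resolve the fault here */
			if (futex_handle_fault((unsigned long)uaddr, attempt)) {
				up_read(&current->mm->mmap_sem);
				return -EFAULT;
			}
			goto retry;
		}
		/* first failure: drop mmap_sem, fault the page in, restart */
		up_read(&current->mm->mmap_sem);
		if (get_user(dummy, uaddr))
			return -EFAULT;
		goto retry_full;
	}
	up_read(&current->mm->mmap_sem);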
317/*
318 * PI code:
319 */
320static int refill_pi_state_cache(void)
321{
322 struct futex_pi_state *pi_state;
323
324 if (likely(current->pi_state_cache))
325 return 0;
326
327 pi_state = kmalloc(sizeof(*pi_state), GFP_KERNEL);
328
329 if (!pi_state)
330 return -ENOMEM;
331
332 memset(pi_state, 0, sizeof(*pi_state));
333 INIT_LIST_HEAD(&pi_state->list);
334 /* pi_mutex gets initialized later */
335 pi_state->owner = NULL;
336 atomic_set(&pi_state->refcount, 1);
337
338 current->pi_state_cache = pi_state;
339
340 return 0;
341}
342
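refill_pi_state_cache() exists so that the GFP_KERNEL allocation happens before any futex locks are taken; alloc_pi_state() can then hand out the cached object from under the hash-bucket spinlock, where sleeping is not allowed, and free_pi_state() recycles it back into the cache when possible. The usage in futex_lock_pi() further down follows this shape (sketch, with unrelated steps reduced to comments):

	if (refill_pi_state_cache())		/* kmalloc(GFP_KERNEL), may sleep */
		return -ENOMEM;

	down_read(&curr->mm->mmap_sem);
	hb = queue_lock(&q, -1, NULL);		/* takes hb->lock, a spinlock */

	/* ... atomic cmpxchg attempts on the futex word ... */

	ret = lookup_pi_state(uval, hb, &q);	/* may call alloc_pi_state(), which
						 * only consumes the cached object */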
343static struct futex_pi_state * alloc_pi_state(void)
344{
345 struct futex_pi_state *pi_state = current->pi_state_cache;
346
347 WARN_ON(!pi_state);
348 current->pi_state_cache = NULL;
349
350 return pi_state;
351}
352
353static void free_pi_state(struct futex_pi_state *pi_state)
354{
355 if (!atomic_dec_and_test(&pi_state->refcount))
356 return;
357
358 /*
359 * If pi_state->owner is NULL, the owner is most probably dying
360 * and has cleaned up the pi_state already
361 */
362 if (pi_state->owner) {
363 spin_lock_irq(&pi_state->owner->pi_lock);
364 list_del_init(&pi_state->list);
365 spin_unlock_irq(&pi_state->owner->pi_lock);
366
367 rt_mutex_proxy_unlock(&pi_state->pi_mutex, pi_state->owner);
368 }
369
370 if (current->pi_state_cache)
371 kfree(pi_state);
372 else {
373 /*
374 * pi_state->list is already empty.
375 * clear pi_state->owner.
376 * refcount is at 0 - put it back to 1.
377 */
378 pi_state->owner = NULL;
379 atomic_set(&pi_state->refcount, 1);
380 current->pi_state_cache = pi_state;
381 }
382}
383
384/*
385 * Look up the task based on what TID userspace gave us.
386 * We dont trust it.
387 */
388static struct task_struct * futex_find_get_task(pid_t pid)
389{
390 struct task_struct *p;
391
392 rcu_read_lock();
393 p = find_task_by_pid(pid);
394 if (!p)
395 goto out_unlock;
396 if ((current->euid != p->euid) && (current->euid != p->uid)) {
397 p = NULL;
398 goto out_unlock;
399 }
400 if (p->exit_state != 0) {
401 p = NULL;
402 goto out_unlock;
403 }
404 get_task_struct(p);
405out_unlock:
406 rcu_read_unlock();
407
408 return p;
409}
410
411/*
412 * This task is holding PI mutexes at exit time => bad.
413 * Kernel cleans up PI-state, but userspace is likely hosed.
414 * (Robust-futex cleanup is separate and might save the day for userspace.)
415 */
416void exit_pi_state_list(struct task_struct *curr)
417{
418 struct list_head *next, *head = &curr->pi_state_list;
419 struct futex_pi_state *pi_state;
420 struct futex_hash_bucket *hb;
421 union futex_key key;
422
423 /*
424 * We are a ZOMBIE and nobody can enqueue itself on
425 * pi_state_list anymore, but we have to be careful
426 * versus waiters unqueueing themselves:
427 */
428 spin_lock_irq(&curr->pi_lock);
429 while (!list_empty(head)) {
430
431 next = head->next;
432 pi_state = list_entry(next, struct futex_pi_state, list);
433 key = pi_state->key;
434 hb = hash_futex(&key);
435 spin_unlock_irq(&curr->pi_lock);
436
437 spin_lock(&hb->lock);
438
439 spin_lock_irq(&curr->pi_lock);
440 /*
441 * We dropped the pi-lock, so re-check whether this
442 * task still owns the PI-state:
443 */
444 if (head->next != next) {
445 spin_unlock(&hb->lock);
446 continue;
447 }
448
449 WARN_ON(pi_state->owner != curr);
450 WARN_ON(list_empty(&pi_state->list));
451 list_del_init(&pi_state->list);
452 pi_state->owner = NULL;
453 spin_unlock_irq(&curr->pi_lock);
454
455 rt_mutex_unlock(&pi_state->pi_mutex);
456
457 spin_unlock(&hb->lock);
458
459 spin_lock_irq(&curr->pi_lock);
460 }
461 spin_unlock_irq(&curr->pi_lock);
462}
463
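exit_pi_state_list() cannot take the hash-bucket lock while holding its own pi_lock (that would invert the order used everywhere else), so it samples an entry, drops pi_lock, takes hb->lock, retakes pi_lock and only proceeds if the list head still points at the sampled entry. A hedged, generalised sketch of that drop/relock/revalidate pattern; the struct, helper and lock names here are illustrative, not kernel API:

	spin_lock_irq(&owner_lock);
	while (!list_empty(head)) {
		struct list_head *next = head->next;
		struct pi_item *it = list_entry(next, struct pi_item, node);
		spinlock_t *outer = bucket_lock_for(it);	/* hypothetical helper */

		spin_unlock_irq(&owner_lock);	/* avoid lock-order inversion */
		spin_lock(outer);		/* correct order: bucket lock first */
		spin_lock_irq(&owner_lock);

		if (head->next != next) {	/* entry was removed meanwhile */
			spin_unlock(outer);
			continue;		/* re-sample under owner_lock */
		}

		list_del_init(&it->node);	/* now safe to detach */
		spin_unlock_irq(&owner_lock);
		/* ... hand the underlying lock on to the next waiter ... */
		spin_unlock(outer);
		spin_lock_irq(&owner_lock);
	}
	spin_unlock_irq(&owner_lock);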
464static int
465lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me)
466{
467 struct futex_pi_state *pi_state = NULL;
468 struct futex_q *this, *next;
469 struct list_head *head;
470 struct task_struct *p;
471 pid_t pid;
472
473 head = &hb->chain;
474
475 list_for_each_entry_safe(this, next, head, list) {
476 if (match_futex(&this->key, &me->key)) {
477 /*
478 * Another waiter already exists - bump up
479 * the refcount and return its pi_state:
480 */
481 pi_state = this->pi_state;
482 /*
483 * Userspace might have messed up non PI and PI futexes
484 */
485 if (unlikely(!pi_state))
486 return -EINVAL;
487
488 WARN_ON(!atomic_read(&pi_state->refcount));
489
490 atomic_inc(&pi_state->refcount);
491 me->pi_state = pi_state;
492
493 return 0;
494 }
495 }
496
497 /*
498 * We are the first waiter - try to look up the real owner and attach
499 * the new pi_state to it, but bail out when the owner died bit is set
500 * and TID = 0:
501 */
502 pid = uval & FUTEX_TID_MASK;
503 if (!pid && (uval & FUTEX_OWNER_DIED))
504 return -ESRCH;
505 p = futex_find_get_task(pid);
506 if (!p)
507 return -ESRCH;
508
509 pi_state = alloc_pi_state();
510
511 /*
512 * Initialize the pi_mutex in locked state and make 'p'
513 * the owner of it:
514 */
515 rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p);
516
517 /* Store the key for possible exit cleanups: */
518 pi_state->key = me->key;
519
520 spin_lock_irq(&p->pi_lock);
521 WARN_ON(!list_empty(&pi_state->list));
522 list_add(&pi_state->list, &p->pi_state_list);
523 pi_state->owner = p;
524 spin_unlock_irq(&p->pi_lock);
525
526 put_task_struct(p);
527
528 me->pi_state = pi_state;
529
530 return 0;
531}
532
533/*
261 * The hash bucket lock must be held when this is called. 534 * The hash bucket lock must be held when this is called.
262 * Afterwards, the futex_q must not be accessed. 535 * Afterwards, the futex_q must not be accessed.
263 */ 536 */
@@ -284,16 +557,105 @@ static void wake_futex(struct futex_q *q)
284 q->lock_ptr = NULL; 557 q->lock_ptr = NULL;
285} 558}
286 559
560static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
561{
562 struct task_struct *new_owner;
563 struct futex_pi_state *pi_state = this->pi_state;
564 u32 curval, newval;
565
566 if (!pi_state)
567 return -EINVAL;
568
569 new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
570
571 /*
572 * This happens when we have stolen the lock and the original
573 * pending owner did not enqueue itself back on the rt_mutex.
574 * Thats not a tragedy. We know that way, that a lock waiter
575 * is on the fly. We make the futex_q waiter the pending owner.
576 */
577 if (!new_owner)
578 new_owner = this->task;
579
580 /*
581 * We pass it to the next owner. (The WAITERS bit is always
582 * kept enabled while there is PI state around. We must also
583 * preserve the owner died bit.)
584 */
585 if (!(uval & FUTEX_OWNER_DIED)) {
586 newval = FUTEX_WAITERS | new_owner->pid;
587
588 inc_preempt_count();
589 curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
590 dec_preempt_count();
591 if (curval == -EFAULT)
592 return -EFAULT;
593 if (curval != uval)
594 return -EINVAL;
595 }
596
597 spin_lock_irq(&pi_state->owner->pi_lock);
598 WARN_ON(list_empty(&pi_state->list));
599 list_del_init(&pi_state->list);
600 spin_unlock_irq(&pi_state->owner->pi_lock);
601
602 spin_lock_irq(&new_owner->pi_lock);
603 WARN_ON(!list_empty(&pi_state->list));
604 list_add(&pi_state->list, &new_owner->pi_state_list);
605 pi_state->owner = new_owner;
606 spin_unlock_irq(&new_owner->pi_lock);
607
608 rt_mutex_unlock(&pi_state->pi_mutex);
609
610 return 0;
611}
612
613static int unlock_futex_pi(u32 __user *uaddr, u32 uval)
614{
615 u32 oldval;
616
617 /*
618 * There is no waiter, so we unlock the futex. The owner died
619 * bit has not to be preserved here. We are the owner:
620 */
621 inc_preempt_count();
622 oldval = futex_atomic_cmpxchg_inatomic(uaddr, uval, 0);
623 dec_preempt_count();
624
625 if (oldval == -EFAULT)
626 return oldval;
627 if (oldval != uval)
628 return -EAGAIN;
629
630 return 0;
631}
632
633/*
634 * Express the locking dependencies for lockdep:
635 */
636static inline void
637double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
638{
639 if (hb1 <= hb2) {
640 spin_lock(&hb1->lock);
641 if (hb1 < hb2)
642 spin_lock_nested(&hb2->lock, SINGLE_DEPTH_NESTING);
643 } else { /* hb1 > hb2 */
644 spin_lock(&hb2->lock);
645 spin_lock_nested(&hb1->lock, SINGLE_DEPTH_NESTING);
646 }
647}
648
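double_lock_hb() gives all two-bucket operations a single, address-based lock order, which rules out ABBA deadlocks between concurrent futex_wake_op()/futex_requeue() calls on the same pair of buckets, and spin_lock_nested() tells lockdep that taking a second lock of the same class is deliberate. The matching unlock sequence stays open-coded in the callers above; for reference it is simply:

	spin_unlock(&hb1->lock);
	if (hb1 != hb2)
		spin_unlock(&hb2->lock);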
287/* 649/*
288 * Wake up all waiters hashed on the physical page that is mapped 650 * Wake up all waiters hashed on the physical page that is mapped
289 * to this virtual address: 651 * to this virtual address:
290 */ 652 */
291static int futex_wake(unsigned long uaddr, int nr_wake) 653static int futex_wake(u32 __user *uaddr, int nr_wake)
292{ 654{
293 union futex_key key; 655 struct futex_hash_bucket *hb;
294 struct futex_hash_bucket *bh;
295 struct list_head *head;
296 struct futex_q *this, *next; 656 struct futex_q *this, *next;
657 struct list_head *head;
658 union futex_key key;
297 int ret; 659 int ret;
298 660
299 down_read(&current->mm->mmap_sem); 661 down_read(&current->mm->mmap_sem);
@@ -302,19 +664,23 @@ static int futex_wake(unsigned long uaddr, int nr_wake)
302 if (unlikely(ret != 0)) 664 if (unlikely(ret != 0))
303 goto out; 665 goto out;
304 666
305 bh = hash_futex(&key); 667 hb = hash_futex(&key);
306 spin_lock(&bh->lock); 668 spin_lock(&hb->lock);
307 head = &bh->chain; 669 head = &hb->chain;
308 670
309 list_for_each_entry_safe(this, next, head, list) { 671 list_for_each_entry_safe(this, next, head, list) {
310 if (match_futex (&this->key, &key)) { 672 if (match_futex (&this->key, &key)) {
673 if (this->pi_state) {
674 ret = -EINVAL;
675 break;
676 }
311 wake_futex(this); 677 wake_futex(this);
312 if (++ret >= nr_wake) 678 if (++ret >= nr_wake)
313 break; 679 break;
314 } 680 }
315 } 681 }
316 682
317 spin_unlock(&bh->lock); 683 spin_unlock(&hb->lock);
318out: 684out:
319 up_read(&current->mm->mmap_sem); 685 up_read(&current->mm->mmap_sem);
320 return ret; 686 return ret;
@@ -324,10 +690,12 @@ out:
324 * Wake up all waiters hashed on the physical page that is mapped 690 * Wake up all waiters hashed on the physical page that is mapped
325 * to this virtual address: 691 * to this virtual address:
326 */ 692 */
327static int futex_wake_op(unsigned long uaddr1, unsigned long uaddr2, int nr_wake, int nr_wake2, int op) 693static int
694futex_wake_op(u32 __user *uaddr1, u32 __user *uaddr2,
695 int nr_wake, int nr_wake2, int op)
328{ 696{
329 union futex_key key1, key2; 697 union futex_key key1, key2;
330 struct futex_hash_bucket *bh1, *bh2; 698 struct futex_hash_bucket *hb1, *hb2;
331 struct list_head *head; 699 struct list_head *head;
332 struct futex_q *this, *next; 700 struct futex_q *this, *next;
333 int ret, op_ret, attempt = 0; 701 int ret, op_ret, attempt = 0;
@@ -342,27 +710,25 @@ retryfull:
342 if (unlikely(ret != 0)) 710 if (unlikely(ret != 0))
343 goto out; 711 goto out;
344 712
345 bh1 = hash_futex(&key1); 713 hb1 = hash_futex(&key1);
346 bh2 = hash_futex(&key2); 714 hb2 = hash_futex(&key2);
347 715
348retry: 716retry:
349 if (bh1 < bh2) 717 double_lock_hb(hb1, hb2);
350 spin_lock(&bh1->lock);
351 spin_lock(&bh2->lock);
352 if (bh1 > bh2)
353 spin_lock(&bh1->lock);
354 718
355 op_ret = futex_atomic_op_inuser(op, (int __user *)uaddr2); 719 op_ret = futex_atomic_op_inuser(op, uaddr2);
356 if (unlikely(op_ret < 0)) { 720 if (unlikely(op_ret < 0)) {
357 int dummy; 721 u32 dummy;
358 722
359 spin_unlock(&bh1->lock); 723 spin_unlock(&hb1->lock);
360 if (bh1 != bh2) 724 if (hb1 != hb2)
361 spin_unlock(&bh2->lock); 725 spin_unlock(&hb2->lock);
362 726
363#ifndef CONFIG_MMU 727#ifndef CONFIG_MMU
364 /* we don't get EFAULT from MMU faults if we don't have an MMU, 728 /*
365 * but we might get them from range checking */ 729 * we don't get EFAULT from MMU faults if we don't have an MMU,
730 * but we might get them from range checking
731 */
366 ret = op_ret; 732 ret = op_ret;
367 goto out; 733 goto out;
368#endif 734#endif
@@ -372,47 +738,36 @@ retry:
372 goto out; 738 goto out;
373 } 739 }
374 740
375 /* futex_atomic_op_inuser needs to both read and write 741 /*
742 * futex_atomic_op_inuser needs to both read and write
376 * *(int __user *)uaddr2, but we can't modify it 743 * *(int __user *)uaddr2, but we can't modify it
377 * non-atomically. Therefore, if get_user below is not 744 * non-atomically. Therefore, if get_user below is not
378 * enough, we need to handle the fault ourselves, while 745 * enough, we need to handle the fault ourselves, while
379 * still holding the mmap_sem. */ 746 * still holding the mmap_sem.
747 */
380 if (attempt++) { 748 if (attempt++) {
381 struct vm_area_struct * vma; 749 if (futex_handle_fault((unsigned long)uaddr2,
382 struct mm_struct *mm = current->mm; 750 attempt)) {
383 751 ret = -EFAULT;
384 ret = -EFAULT;
385 if (attempt >= 2 ||
386 !(vma = find_vma(mm, uaddr2)) ||
387 vma->vm_start > uaddr2 ||
388 !(vma->vm_flags & VM_WRITE))
389 goto out;
390
391 switch (handle_mm_fault(mm, vma, uaddr2, 1)) {
392 case VM_FAULT_MINOR:
393 current->min_flt++;
394 break;
395 case VM_FAULT_MAJOR:
396 current->maj_flt++;
397 break;
398 default:
399 goto out; 752 goto out;
400 } 753 }
401 goto retry; 754 goto retry;
402 } 755 }
403 756
404 /* If we would have faulted, release mmap_sem, 757 /*
405 * fault it in and start all over again. */ 758 * If we would have faulted, release mmap_sem,
759 * fault it in and start all over again.
760 */
406 up_read(&current->mm->mmap_sem); 761 up_read(&current->mm->mmap_sem);
407 762
408 ret = get_user(dummy, (int __user *)uaddr2); 763 ret = get_user(dummy, uaddr2);
409 if (ret) 764 if (ret)
410 return ret; 765 return ret;
411 766
412 goto retryfull; 767 goto retryfull;
413 } 768 }
414 769
415 head = &bh1->chain; 770 head = &hb1->chain;
416 771
417 list_for_each_entry_safe(this, next, head, list) { 772 list_for_each_entry_safe(this, next, head, list) {
418 if (match_futex (&this->key, &key1)) { 773 if (match_futex (&this->key, &key1)) {
@@ -423,7 +778,7 @@ retry:
423 } 778 }
424 779
425 if (op_ret > 0) { 780 if (op_ret > 0) {
426 head = &bh2->chain; 781 head = &hb2->chain;
427 782
428 op_ret = 0; 783 op_ret = 0;
429 list_for_each_entry_safe(this, next, head, list) { 784 list_for_each_entry_safe(this, next, head, list) {
@@ -436,9 +791,9 @@ retry:
436 ret += op_ret; 791 ret += op_ret;
437 } 792 }
438 793
439 spin_unlock(&bh1->lock); 794 spin_unlock(&hb1->lock);
440 if (bh1 != bh2) 795 if (hb1 != hb2)
441 spin_unlock(&bh2->lock); 796 spin_unlock(&hb2->lock);
442out: 797out:
443 up_read(&current->mm->mmap_sem); 798 up_read(&current->mm->mmap_sem);
444 return ret; 799 return ret;
@@ -448,11 +803,11 @@ out:
448 * Requeue all waiters hashed on one physical page to another 803 * Requeue all waiters hashed on one physical page to another
449 * physical page. 804 * physical page.
450 */ 805 */
451static int futex_requeue(unsigned long uaddr1, unsigned long uaddr2, 806static int futex_requeue(u32 __user *uaddr1, u32 __user *uaddr2,
452 int nr_wake, int nr_requeue, int *valp) 807 int nr_wake, int nr_requeue, u32 *cmpval)
453{ 808{
454 union futex_key key1, key2; 809 union futex_key key1, key2;
455 struct futex_hash_bucket *bh1, *bh2; 810 struct futex_hash_bucket *hb1, *hb2;
456 struct list_head *head1; 811 struct list_head *head1;
457 struct futex_q *this, *next; 812 struct futex_q *this, *next;
458 int ret, drop_count = 0; 813 int ret, drop_count = 0;
@@ -467,68 +822,68 @@ static int futex_requeue(unsigned long uaddr1, unsigned long uaddr2,
467 if (unlikely(ret != 0)) 822 if (unlikely(ret != 0))
468 goto out; 823 goto out;
469 824
470 bh1 = hash_futex(&key1); 825 hb1 = hash_futex(&key1);
471 bh2 = hash_futex(&key2); 826 hb2 = hash_futex(&key2);
472 827
473 if (bh1 < bh2) 828 double_lock_hb(hb1, hb2);
474 spin_lock(&bh1->lock);
475 spin_lock(&bh2->lock);
476 if (bh1 > bh2)
477 spin_lock(&bh1->lock);
478 829
479 if (likely(valp != NULL)) { 830 if (likely(cmpval != NULL)) {
480 int curval; 831 u32 curval;
481 832
482 ret = get_futex_value_locked(&curval, (int __user *)uaddr1); 833 ret = get_futex_value_locked(&curval, uaddr1);
483 834
484 if (unlikely(ret)) { 835 if (unlikely(ret)) {
485 spin_unlock(&bh1->lock); 836 spin_unlock(&hb1->lock);
486 if (bh1 != bh2) 837 if (hb1 != hb2)
487 spin_unlock(&bh2->lock); 838 spin_unlock(&hb2->lock);
488 839
489 /* If we would have faulted, release mmap_sem, fault 840 /*
841 * If we would have faulted, release mmap_sem, fault
490 * it in and start all over again. 842 * it in and start all over again.
491 */ 843 */
492 up_read(&current->mm->mmap_sem); 844 up_read(&current->mm->mmap_sem);
493 845
494 ret = get_user(curval, (int __user *)uaddr1); 846 ret = get_user(curval, uaddr1);
495 847
496 if (!ret) 848 if (!ret)
497 goto retry; 849 goto retry;
498 850
499 return ret; 851 return ret;
500 } 852 }
501 if (curval != *valp) { 853 if (curval != *cmpval) {
502 ret = -EAGAIN; 854 ret = -EAGAIN;
503 goto out_unlock; 855 goto out_unlock;
504 } 856 }
505 } 857 }
506 858
507 head1 = &bh1->chain; 859 head1 = &hb1->chain;
508 list_for_each_entry_safe(this, next, head1, list) { 860 list_for_each_entry_safe(this, next, head1, list) {
509 if (!match_futex (&this->key, &key1)) 861 if (!match_futex (&this->key, &key1))
510 continue; 862 continue;
511 if (++ret <= nr_wake) { 863 if (++ret <= nr_wake) {
512 wake_futex(this); 864 wake_futex(this);
513 } else { 865 } else {
514 list_move_tail(&this->list, &bh2->chain); 866 /*
515 this->lock_ptr = &bh2->lock; 867 * If key1 and key2 hash to the same bucket, no need to
868 * requeue.
869 */
870 if (likely(head1 != &hb2->chain)) {
871 list_move_tail(&this->list, &hb2->chain);
872 this->lock_ptr = &hb2->lock;
873 }
516 this->key = key2; 874 this->key = key2;
517 get_key_refs(&key2); 875 get_key_refs(&key2);
518 drop_count++; 876 drop_count++;
519 877
520 if (ret - nr_wake >= nr_requeue) 878 if (ret - nr_wake >= nr_requeue)
521 break; 879 break;
522 /* Make sure to stop if key1 == key2 */
523 if (head1 == &bh2->chain && head1 != &next->list)
524 head1 = &this->list;
525 } 880 }
526 } 881 }
527 882
528out_unlock: 883out_unlock:
529 spin_unlock(&bh1->lock); 884 spin_unlock(&hb1->lock);
530 if (bh1 != bh2) 885 if (hb1 != hb2)
531 spin_unlock(&bh2->lock); 886 spin_unlock(&hb2->lock);
532 887
533 /* drop_key_refs() must be called outside the spinlocks. */ 888 /* drop_key_refs() must be called outside the spinlocks. */
534 while (--drop_count >= 0) 889 while (--drop_count >= 0)
@@ -543,7 +898,7 @@ out:
543static inline struct futex_hash_bucket * 898static inline struct futex_hash_bucket *
544queue_lock(struct futex_q *q, int fd, struct file *filp) 899queue_lock(struct futex_q *q, int fd, struct file *filp)
545{ 900{
546 struct futex_hash_bucket *bh; 901 struct futex_hash_bucket *hb;
547 902
548 q->fd = fd; 903 q->fd = fd;
549 q->filp = filp; 904 q->filp = filp;
@@ -551,23 +906,24 @@ queue_lock(struct futex_q *q, int fd, struct file *filp)
551 init_waitqueue_head(&q->waiters); 906 init_waitqueue_head(&q->waiters);
552 907
553 get_key_refs(&q->key); 908 get_key_refs(&q->key);
554 bh = hash_futex(&q->key); 909 hb = hash_futex(&q->key);
555 q->lock_ptr = &bh->lock; 910 q->lock_ptr = &hb->lock;
556 911
557 spin_lock(&bh->lock); 912 spin_lock(&hb->lock);
558 return bh; 913 return hb;
559} 914}
560 915
561static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *bh) 916static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
562{ 917{
563 list_add_tail(&q->list, &bh->chain); 918 list_add_tail(&q->list, &hb->chain);
564 spin_unlock(&bh->lock); 919 q->task = current;
920 spin_unlock(&hb->lock);
565} 921}
566 922
567static inline void 923static inline void
568queue_unlock(struct futex_q *q, struct futex_hash_bucket *bh) 924queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb)
569{ 925{
570 spin_unlock(&bh->lock); 926 spin_unlock(&hb->lock);
571 drop_key_refs(&q->key); 927 drop_key_refs(&q->key);
572} 928}
573 929
@@ -579,20 +935,22 @@ queue_unlock(struct futex_q *q, struct futex_hash_bucket *bh)
579/* The key must be already stored in q->key. */ 935/* The key must be already stored in q->key. */
580static void queue_me(struct futex_q *q, int fd, struct file *filp) 936static void queue_me(struct futex_q *q, int fd, struct file *filp)
581{ 937{
582 struct futex_hash_bucket *bh; 938 struct futex_hash_bucket *hb;
583 bh = queue_lock(q, fd, filp); 939
584 __queue_me(q, bh); 940 hb = queue_lock(q, fd, filp);
941 __queue_me(q, hb);
585} 942}
586 943
587/* Return 1 if we were still queued (ie. 0 means we were woken) */ 944/* Return 1 if we were still queued (ie. 0 means we were woken) */
588static int unqueue_me(struct futex_q *q) 945static int unqueue_me(struct futex_q *q)
589{ 946{
590 int ret = 0;
591 spinlock_t *lock_ptr; 947 spinlock_t *lock_ptr;
948 int ret = 0;
592 949
593 /* In the common case we don't take the spinlock, which is nice. */ 950 /* In the common case we don't take the spinlock, which is nice. */
594 retry: 951 retry:
595 lock_ptr = q->lock_ptr; 952 lock_ptr = q->lock_ptr;
953 barrier();
596 if (lock_ptr != 0) { 954 if (lock_ptr != 0) {
597 spin_lock(lock_ptr); 955 spin_lock(lock_ptr);
598 /* 956 /*
@@ -614,6 +972,9 @@ static int unqueue_me(struct futex_q *q)
614 } 972 }
615 WARN_ON(list_empty(&q->list)); 973 WARN_ON(list_empty(&q->list));
616 list_del(&q->list); 974 list_del(&q->list);
975
976 BUG_ON(q->pi_state);
977
617 spin_unlock(lock_ptr); 978 spin_unlock(lock_ptr);
618 ret = 1; 979 ret = 1;
619 } 980 }
@@ -622,21 +983,42 @@ static int unqueue_me(struct futex_q *q)
622 return ret; 983 return ret;
623} 984}
624 985
625static int futex_wait(unsigned long uaddr, int val, unsigned long time) 986/*
987 * PI futexes can not be requeued and must remove themself from the
988 * hash bucket. The hash bucket lock is held on entry and dropped here.
989 */
990static void unqueue_me_pi(struct futex_q *q, struct futex_hash_bucket *hb)
991{
992 WARN_ON(list_empty(&q->list));
993 list_del(&q->list);
994
995 BUG_ON(!q->pi_state);
996 free_pi_state(q->pi_state);
997 q->pi_state = NULL;
998
999 spin_unlock(&hb->lock);
1000
1001 drop_key_refs(&q->key);
1002}
1003
1004static int futex_wait(u32 __user *uaddr, u32 val, unsigned long time)
626{ 1005{
627 DECLARE_WAITQUEUE(wait, current); 1006 struct task_struct *curr = current;
628 int ret, curval; 1007 DECLARE_WAITQUEUE(wait, curr);
1008 struct futex_hash_bucket *hb;
629 struct futex_q q; 1009 struct futex_q q;
630 struct futex_hash_bucket *bh; 1010 u32 uval;
1011 int ret;
631 1012
1013 q.pi_state = NULL;
632 retry: 1014 retry:
633 down_read(&current->mm->mmap_sem); 1015 down_read(&curr->mm->mmap_sem);
634 1016
635 ret = get_futex_key(uaddr, &q.key); 1017 ret = get_futex_key(uaddr, &q.key);
636 if (unlikely(ret != 0)) 1018 if (unlikely(ret != 0))
637 goto out_release_sem; 1019 goto out_release_sem;
638 1020
639 bh = queue_lock(&q, -1, NULL); 1021 hb = queue_lock(&q, -1, NULL);
640 1022
641 /* 1023 /*
642 * Access the page AFTER the futex is queued. 1024 * Access the page AFTER the futex is queued.
@@ -658,37 +1040,35 @@ static int futex_wait(unsigned long uaddr, int val, unsigned long time)
658 * We hold the mmap semaphore, so the mapping cannot have changed 1040 * We hold the mmap semaphore, so the mapping cannot have changed
659 * since we looked it up in get_futex_key. 1041 * since we looked it up in get_futex_key.
660 */ 1042 */
661 1043 ret = get_futex_value_locked(&uval, uaddr);
662 ret = get_futex_value_locked(&curval, (int __user *)uaddr);
663 1044
664 if (unlikely(ret)) { 1045 if (unlikely(ret)) {
665 queue_unlock(&q, bh); 1046 queue_unlock(&q, hb);
666 1047
667 /* If we would have faulted, release mmap_sem, fault it in and 1048 /*
1049 * If we would have faulted, release mmap_sem, fault it in and
668 * start all over again. 1050 * start all over again.
669 */ 1051 */
670 up_read(&current->mm->mmap_sem); 1052 up_read(&curr->mm->mmap_sem);
671 1053
672 ret = get_user(curval, (int __user *)uaddr); 1054 ret = get_user(uval, uaddr);
673 1055
674 if (!ret) 1056 if (!ret)
675 goto retry; 1057 goto retry;
676 return ret; 1058 return ret;
677 } 1059 }
678 if (curval != val) { 1060 ret = -EWOULDBLOCK;
679 ret = -EWOULDBLOCK; 1061 if (uval != val)
680 queue_unlock(&q, bh); 1062 goto out_unlock_release_sem;
681 goto out_release_sem;
682 }
683 1063
684 /* Only actually queue if *uaddr contained val. */ 1064 /* Only actually queue if *uaddr contained val. */
685 __queue_me(&q, bh); 1065 __queue_me(&q, hb);
686 1066
687 /* 1067 /*
688 * Now the futex is queued and we have checked the data, we 1068 * Now the futex is queued and we have checked the data, we
689 * don't want to hold mmap_sem while we sleep. 1069 * don't want to hold mmap_sem while we sleep.
690 */ 1070 */
691 up_read(&current->mm->mmap_sem); 1071 up_read(&curr->mm->mmap_sem);
692 1072
693 /* 1073 /*
694 * There might have been scheduling since the queue_me(), as we 1074 * There might have been scheduling since the queue_me(), as we
@@ -720,12 +1100,367 @@ static int futex_wait(unsigned long uaddr, int val, unsigned long time)
720 return 0; 1100 return 0;
721 if (time == 0) 1101 if (time == 0)
722 return -ETIMEDOUT; 1102 return -ETIMEDOUT;
723 /* We expect signal_pending(current), but another thread may 1103 /*
724 * have handled it for us already. */ 1104 * We expect signal_pending(current), but another thread may
1105 * have handled it for us already.
1106 */
725 return -EINTR; 1107 return -EINTR;
726 1108
1109 out_unlock_release_sem:
1110 queue_unlock(&q, hb);
1111
727 out_release_sem: 1112 out_release_sem:
1113 up_read(&curr->mm->mmap_sem);
1114 return ret;
1115}
1116
1117/*
1118 * Userspace tried a 0 -> TID atomic transition of the futex value
1119 * and failed. The kernel side here does the whole locking operation:
1120 * if there are waiters then it will block, it does PI, etc. (Due to
1121 * races the kernel might see a 0 value of the futex too.)
1122 */
1123static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
1124 long nsec, int trylock)
1125{
1126 struct hrtimer_sleeper timeout, *to = NULL;
1127 struct task_struct *curr = current;
1128 struct futex_hash_bucket *hb;
1129 u32 uval, newval, curval;
1130 struct futex_q q;
1131 int ret, attempt = 0;
1132
1133 if (refill_pi_state_cache())
1134 return -ENOMEM;
1135
1136 if (sec != MAX_SCHEDULE_TIMEOUT) {
1137 to = &timeout;
1138 hrtimer_init(&to->timer, CLOCK_REALTIME, HRTIMER_ABS);
1139 hrtimer_init_sleeper(to, current);
1140 to->timer.expires = ktime_set(sec, nsec);
1141 }
1142
1143 q.pi_state = NULL;
1144 retry:
1145 down_read(&curr->mm->mmap_sem);
1146
1147 ret = get_futex_key(uaddr, &q.key);
1148 if (unlikely(ret != 0))
1149 goto out_release_sem;
1150
1151 hb = queue_lock(&q, -1, NULL);
1152
1153 retry_locked:
1154 /*
1155 * To avoid races, we attempt to take the lock here again
1156 * (by doing a 0 -> TID atomic cmpxchg), while holding all
1157 * the locks. It will most likely not succeed.
1158 */
1159 newval = current->pid;
1160
1161 inc_preempt_count();
1162 curval = futex_atomic_cmpxchg_inatomic(uaddr, 0, newval);
1163 dec_preempt_count();
1164
1165 if (unlikely(curval == -EFAULT))
1166 goto uaddr_faulted;
1167
1168 /* We own the lock already */
1169 if (unlikely((curval & FUTEX_TID_MASK) == current->pid)) {
1170 if (!detect && 0)
1171 force_sig(SIGKILL, current);
1172 ret = -EDEADLK;
1173 goto out_unlock_release_sem;
1174 }
1175
1176 /*
1177 * Surprise - we got the lock. Just return
1178 * to userspace:
1179 */
1180 if (unlikely(!curval))
1181 goto out_unlock_release_sem;
1182
1183 uval = curval;
1184 newval = uval | FUTEX_WAITERS;
1185
1186 inc_preempt_count();
1187 curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
1188 dec_preempt_count();
1189
1190 if (unlikely(curval == -EFAULT))
1191 goto uaddr_faulted;
1192 if (unlikely(curval != uval))
1193 goto retry_locked;
1194
1195 /*
1196 * We dont have the lock. Look up the PI state (or create it if
1197 * we are the first waiter):
1198 */
1199 ret = lookup_pi_state(uval, hb, &q);
1200
1201 if (unlikely(ret)) {
1202 /*
1203 * There were no waiters and the owner task lookup
1204 * failed. When the OWNER_DIED bit is set, then we
1205 * know that this is a robust futex and we actually
1206 * take the lock. This is safe as we are protected by
1207 * the hash bucket lock. We also set the waiters bit
1208 * unconditionally here, to simplify glibc handling of
1209 * multiple tasks racing to acquire the lock and
1210 * cleanup the problems which were left by the dead
1211 * owner.
1212 */
1213 if (curval & FUTEX_OWNER_DIED) {
1214 uval = newval;
1215 newval = current->pid |
1216 FUTEX_OWNER_DIED | FUTEX_WAITERS;
1217
1218 inc_preempt_count();
1219 curval = futex_atomic_cmpxchg_inatomic(uaddr,
1220 uval, newval);
1221 dec_preempt_count();
1222
1223 if (unlikely(curval == -EFAULT))
1224 goto uaddr_faulted;
1225 if (unlikely(curval != uval))
1226 goto retry_locked;
1227 ret = 0;
1228 }
1229 goto out_unlock_release_sem;
1230 }
1231
1232 /*
1233 * Only actually queue now that the atomic ops are done:
1234 */
1235 __queue_me(&q, hb);
1236
1237 /*
1238 * Now the futex is queued and we have checked the data, we
1239 * don't want to hold mmap_sem while we sleep.
1240 */
1241 up_read(&curr->mm->mmap_sem);
1242
1243 WARN_ON(!q.pi_state);
1244 /*
1245 * Block on the PI mutex:
1246 */
1247 if (!trylock)
1248 ret = rt_mutex_timed_lock(&q.pi_state->pi_mutex, to, 1);
1249 else {
1250 ret = rt_mutex_trylock(&q.pi_state->pi_mutex);
1251 /* Fixup the trylock return value: */
1252 ret = ret ? 0 : -EWOULDBLOCK;
1253 }
1254
1255 down_read(&curr->mm->mmap_sem);
1256 spin_lock(q.lock_ptr);
1257
1258 /*
1259 * Got the lock. We might not be the anticipated owner if we
1260 * did a lock-steal - fix up the PI-state in that case.
1261 */
1262 if (!ret && q.pi_state->owner != curr) {
1263 u32 newtid = current->pid | FUTEX_WAITERS;
1264
1265 /* Owner died? */
1266 if (q.pi_state->owner != NULL) {
1267 spin_lock_irq(&q.pi_state->owner->pi_lock);
1268 WARN_ON(list_empty(&q.pi_state->list));
1269 list_del_init(&q.pi_state->list);
1270 spin_unlock_irq(&q.pi_state->owner->pi_lock);
1271 } else
1272 newtid |= FUTEX_OWNER_DIED;
1273
1274 q.pi_state->owner = current;
1275
1276 spin_lock_irq(&current->pi_lock);
1277 WARN_ON(!list_empty(&q.pi_state->list));
1278 list_add(&q.pi_state->list, &current->pi_state_list);
1279 spin_unlock_irq(&current->pi_lock);
1280
1281 /* Unqueue and drop the lock */
1282 unqueue_me_pi(&q, hb);
1283 up_read(&curr->mm->mmap_sem);
1284 /*
1285 * We own it, so we have to replace the pending owner
1286 * TID. This must be atomic as we have preserve the
1287 * owner died bit here.
1288 */
1289 ret = get_user(uval, uaddr);
1290 while (!ret) {
1291 newval = (uval & FUTEX_OWNER_DIED) | newtid;
1292 curval = futex_atomic_cmpxchg_inatomic(uaddr,
1293 uval, newval);
1294 if (curval == -EFAULT)
1295 ret = -EFAULT;
1296 if (curval == uval)
1297 break;
1298 uval = curval;
1299 }
1300 } else {
1301 /*
1302 * Catch the rare case, where the lock was released
1303 * when we were on the way back before we locked
1304 * the hash bucket.
1305 */
1306 if (ret && q.pi_state->owner == curr) {
1307 if (rt_mutex_trylock(&q.pi_state->pi_mutex))
1308 ret = 0;
1309 }
1310 /* Unqueue and drop the lock */
1311 unqueue_me_pi(&q, hb);
1312 up_read(&curr->mm->mmap_sem);
1313 }
1314
1315 if (!detect && ret == -EDEADLK && 0)
1316 force_sig(SIGKILL, current);
1317
1318 return ret != -EINTR ? ret : -ERESTARTNOINTR;
1319
1320 out_unlock_release_sem:
1321 queue_unlock(&q, hb);
1322
1323 out_release_sem:
1324 up_read(&curr->mm->mmap_sem);
1325 return ret;
1326
1327 uaddr_faulted:
1328 /*
1329 * We have to r/w *(int __user *)uaddr, but we can't modify it
1330 * non-atomically. Therefore, if get_user below is not
1331 * enough, we need to handle the fault ourselves, while
1332 * still holding the mmap_sem.
1333 */
1334 if (attempt++) {
1335 if (futex_handle_fault((unsigned long)uaddr, attempt)) {
1336 ret = -EFAULT;
1337 goto out_unlock_release_sem;
1338 }
1339 goto retry_locked;
1340 }
1341
1342 queue_unlock(&q, hb);
1343 up_read(&curr->mm->mmap_sem);
1344
1345 ret = get_user(uval, uaddr);
1346 if (!ret && (uval != -EFAULT))
1347 goto retry;
1348
1349 return ret;
1350}
1351
1352/*
1353 * Userspace attempted a TID -> 0 atomic transition, and failed.
1354 * This is the in-kernel slowpath: we look up the PI state (if any),
1355 * and do the rt-mutex unlock.
1356 */
1357static int futex_unlock_pi(u32 __user *uaddr)
1358{
1359 struct futex_hash_bucket *hb;
1360 struct futex_q *this, *next;
1361 u32 uval;
1362 struct list_head *head;
1363 union futex_key key;
1364 int ret, attempt = 0;
1365
1366retry:
1367 if (get_user(uval, uaddr))
1368 return -EFAULT;
1369 /*
1370 * We release only a lock we actually own:
1371 */
1372 if ((uval & FUTEX_TID_MASK) != current->pid)
1373 return -EPERM;
1374 /*
1375 * First take all the futex related locks:
1376 */
1377 down_read(&current->mm->mmap_sem);
1378
1379 ret = get_futex_key(uaddr, &key);
1380 if (unlikely(ret != 0))
1381 goto out;
1382
1383 hb = hash_futex(&key);
1384 spin_lock(&hb->lock);
1385
1386retry_locked:
1387 /*
1388 * To avoid races, try to do the TID -> 0 atomic transition
1389 * again. If it succeeds then we can return without waking
1390 * anyone else up:
1391 */
1392 if (!(uval & FUTEX_OWNER_DIED)) {
1393 inc_preempt_count();
1394 uval = futex_atomic_cmpxchg_inatomic(uaddr, current->pid, 0);
1395 dec_preempt_count();
1396 }
1397
1398 if (unlikely(uval == -EFAULT))
1399 goto pi_faulted;
1400 /*
1401 * Rare case: we managed to release the lock atomically,
1402 * no need to wake anyone else up:
1403 */
1404 if (unlikely(uval == current->pid))
1405 goto out_unlock;
1406
1407 /*
1408 * Ok, other tasks may need to be woken up - check waiters
1409 * and do the wakeup if necessary:
1410 */
1411 head = &hb->chain;
1412
1413 list_for_each_entry_safe(this, next, head, list) {
1414 if (!match_futex (&this->key, &key))
1415 continue;
1416 ret = wake_futex_pi(uaddr, uval, this);
1417 /*
1418 * The atomic access to the futex value
1419 * generated a pagefault, so retry the
1420 * user-access and the wakeup:
1421 */
1422 if (ret == -EFAULT)
1423 goto pi_faulted;
1424 goto out_unlock;
1425 }
1426 /*
1427 * No waiters - kernel unlocks the futex:
1428 */
1429 if (!(uval & FUTEX_OWNER_DIED)) {
1430 ret = unlock_futex_pi(uaddr, uval);
1431 if (ret == -EFAULT)
1432 goto pi_faulted;
1433 }
1434
1435out_unlock:
1436 spin_unlock(&hb->lock);
1437out:
728 up_read(&current->mm->mmap_sem); 1438 up_read(&current->mm->mmap_sem);
1439
1440 return ret;
1441
1442pi_faulted:
1443 /*
1444 * We have to r/w *(int __user *)uaddr, but we can't modify it
1445 * non-atomically. Therefore, if get_user below is not
1446 * enough, we need to handle the fault ourselves, while
1447 * still holding the mmap_sem.
1448 */
1449 if (attempt++) {
1450 if (futex_handle_fault((unsigned long)uaddr, attempt)) {
1451 ret = -EFAULT;
1452 goto out_unlock;
1453 }
1454 goto retry_locked;
1455 }
1456
1457 spin_unlock(&hb->lock);
1458 up_read(&current->mm->mmap_sem);
1459
1460 ret = get_user(uval, uaddr);
1461 if (!ret && (uval != -EFAULT))
1462 goto retry;
1463
729 return ret; 1464 return ret;
730} 1465}
731 1466
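The kernel slowpath above only runs when the user-space fast path fails. A minimal sketch of that fast path, assuming a hypothetical futex_syscall() wrapper around sys_futex and GCC's __sync builtins (this is not the glibc implementation, only the contract the slowpath above relies on):

    /* Fast-path sketch: the kernel is entered only on contention.
     * futex_syscall() is a hypothetical wrapper around sys_futex. */
    static void pi_lock(int *uaddr, int tid)
    {
            /* uncontended: 0 -> TID in user space, the kernel never sees it */
            if (__sync_val_compare_and_swap(uaddr, 0, tid) != 0)
                    futex_syscall(uaddr, FUTEX_LOCK_PI, 0, NULL);
    }

    static void pi_unlock(int *uaddr, int tid)
    {
            /* uncontended: TID -> 0; anything else (waiters, OWNER_DIED)
             * forces the futex_unlock_pi() slowpath shown above */
            if (__sync_val_compare_and_swap(uaddr, tid, 0) != tid)
                    futex_syscall(uaddr, FUTEX_UNLOCK_PI, 0, NULL);
    }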
@@ -735,6 +1470,7 @@ static int futex_close(struct inode *inode, struct file *filp)
735 1470
736 unqueue_me(q); 1471 unqueue_me(q);
737 kfree(q); 1472 kfree(q);
1473
738 return 0; 1474 return 0;
739} 1475}
740 1476
@@ -766,7 +1502,7 @@ static struct file_operations futex_fops = {
766 * Signal allows caller to avoid the race which would occur if they 1502 * Signal allows caller to avoid the race which would occur if they
767 * set the sigio stuff up afterwards. 1503 * set the sigio stuff up afterwards.
768 */ 1504 */
769static int futex_fd(unsigned long uaddr, int signal) 1505static int futex_fd(u32 __user *uaddr, int signal)
770{ 1506{
771 struct futex_q *q; 1507 struct futex_q *q;
772 struct file *filp; 1508 struct file *filp;
@@ -803,6 +1539,7 @@ static int futex_fd(unsigned long uaddr, int signal)
803 err = -ENOMEM; 1539 err = -ENOMEM;
804 goto error; 1540 goto error;
805 } 1541 }
1542 q->pi_state = NULL;
806 1543
807 down_read(&current->mm->mmap_sem); 1544 down_read(&current->mm->mmap_sem);
808 err = get_futex_key(uaddr, &q->key); 1545 err = get_futex_key(uaddr, &q->key);
@@ -840,7 +1577,7 @@ error:
840 * Implementation: user-space maintains a per-thread list of locks it 1577 * Implementation: user-space maintains a per-thread list of locks it
841 * is holding. Upon do_exit(), the kernel carefully walks this list, 1578 * is holding. Upon do_exit(), the kernel carefully walks this list,
842 * and marks all locks that are owned by this thread with the 1579 * and marks all locks that are owned by this thread with the
843 * FUTEX_OWNER_DEAD bit, and wakes up a waiter (if any). The list is 1580 * FUTEX_OWNER_DIED bit, and wakes up a waiter (if any). The list is
844 * always manipulated with the lock held, so the list is private and 1581 * always manipulated with the lock held, so the list is private and
845 * per-thread. Userspace also maintains a per-thread 'list_op_pending' 1582 * per-thread. Userspace also maintains a per-thread 'list_op_pending'
846 * field, to allow the kernel to clean up if the thread dies after 1583 * field, to allow the kernel to clean up if the thread dies after
@@ -887,7 +1624,7 @@ sys_get_robust_list(int pid, struct robust_list_head __user **head_ptr,
887 struct task_struct *p; 1624 struct task_struct *p;
888 1625
889 ret = -ESRCH; 1626 ret = -ESRCH;
890 read_lock(&tasklist_lock); 1627 rcu_read_lock();
891 p = find_task_by_pid(pid); 1628 p = find_task_by_pid(pid);
892 if (!p) 1629 if (!p)
893 goto err_unlock; 1630 goto err_unlock;
@@ -896,7 +1633,7 @@ sys_get_robust_list(int pid, struct robust_list_head __user **head_ptr,
896 !capable(CAP_SYS_PTRACE)) 1633 !capable(CAP_SYS_PTRACE))
897 goto err_unlock; 1634 goto err_unlock;
898 head = p->robust_list; 1635 head = p->robust_list;
899 read_unlock(&tasklist_lock); 1636 rcu_read_unlock();
900 } 1637 }
901 1638
902 if (put_user(sizeof(*head), len_ptr)) 1639 if (put_user(sizeof(*head), len_ptr))
@@ -904,7 +1641,7 @@ sys_get_robust_list(int pid, struct robust_list_head __user **head_ptr,
904 return put_user(head, head_ptr); 1641 return put_user(head, head_ptr);
905 1642
906err_unlock: 1643err_unlock:
907 read_unlock(&tasklist_lock); 1644 rcu_read_unlock();
908 1645
909 return ret; 1646 return ret;
910} 1647}
@@ -913,9 +1650,9 @@ err_unlock:
913 * Process a futex-list entry, check whether it's owned by the 1650 * Process a futex-list entry, check whether it's owned by the
914 * dying task, and do notification if so: 1651 * dying task, and do notification if so:
915 */ 1652 */
916int handle_futex_death(u32 __user *uaddr, struct task_struct *curr) 1653int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi)
917{ 1654{
918 u32 uval; 1655 u32 uval, nval, mval;
919 1656
920retry: 1657retry:
921 if (get_user(uval, uaddr)) 1658 if (get_user(uval, uaddr))
@@ -932,17 +1669,45 @@ retry:
932 * thread-death.) The rest of the cleanup is done in 1669 * thread-death.) The rest of the cleanup is done in
933 * userspace. 1670 * userspace.
934 */ 1671 */
935 if (futex_atomic_cmpxchg_inatomic(uaddr, uval, 1672 mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;
936 uval | FUTEX_OWNER_DIED) != uval) 1673 nval = futex_atomic_cmpxchg_inatomic(uaddr, uval, mval);
1674
1675 if (nval == -EFAULT)
1676 return -1;
1677
1678 if (nval != uval)
937 goto retry; 1679 goto retry;
938 1680
939 if (uval & FUTEX_WAITERS) 1681 /*
940 futex_wake((unsigned long)uaddr, 1); 1682 * Wake robust non-PI futexes here. The wakeup of
1683 * PI futexes happens in exit_pi_state():
1684 */
1685 if (!pi) {
1686 if (uval & FUTEX_WAITERS)
1687 futex_wake(uaddr, 1);
1688 }
941 } 1689 }
942 return 0; 1690 return 0;
943} 1691}
944 1692
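For a non-PI robust futex, the FUTEX_OWNER_DIED bit set above is what the woken waiter inspects. A grossly simplified sketch of the waiter side (FUTEX_OWNER_DIED comes from linux/futex.h, __atomic_load_n is a GCC builtin; the real recovery protocol lives in the C library and is more involved):

    /* Sketch: what a woken waiter can conclude from the value that
     * handle_futex_death() left behind. */
    static int owner_died(unsigned int *uaddr)
    {
            return __atomic_load_n(uaddr, __ATOMIC_RELAXED) & FUTEX_OWNER_DIED;
    }

A waiter that sees the bit knows the previous owner exited while holding the lock, so the data the lock protects may need repair before it is treated as cleanly acquired.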
945/* 1693/*
1694 * Fetch a robust-list pointer. Bit 0 signals PI futexes:
1695 */
1696static inline int fetch_robust_entry(struct robust_list __user **entry,
1697 struct robust_list __user **head, int *pi)
1698{
1699 unsigned long uentry;
1700
1701 if (get_user(uentry, (unsigned long *)head))
1702 return -EFAULT;
1703
1704 *entry = (void *)(uentry & ~1UL);
1705 *pi = uentry & 1;
1706
1707 return 0;
1708}
1709
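The decoder above implies the following user-space encoding. A simplified, hypothetical enqueue, which ignores the ordering rules and the list_op_pending handshake that the real C library follows:

    /* Store the entry pointer with bit 0 set when the lock is a PI futex,
     * so fetch_robust_entry() above can recover both pieces. */
    static void robust_list_add(struct robust_list_head *head,
                                struct robust_list *entry, int is_pi)
    {
            unsigned long val = (unsigned long)entry | (is_pi ? 1UL : 0UL);

            entry->next = head->list.next;
            head->list.next = (struct robust_list *)val;
    }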
1710/*
946 * Walk curr->robust_list (very carefully, it's a userspace list!) 1711 * Walk curr->robust_list (very carefully, it's a userspace list!)
947 * and mark any locks found there dead, and notify any waiters. 1712 * and mark any locks found there dead, and notify any waiters.
948 * 1713 *
@@ -952,14 +1717,14 @@ void exit_robust_list(struct task_struct *curr)
952{ 1717{
953 struct robust_list_head __user *head = curr->robust_list; 1718 struct robust_list_head __user *head = curr->robust_list;
954 struct robust_list __user *entry, *pending; 1719 struct robust_list __user *entry, *pending;
955 unsigned int limit = ROBUST_LIST_LIMIT; 1720 unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
956 unsigned long futex_offset; 1721 unsigned long futex_offset;
957 1722
958 /* 1723 /*
959 * Fetch the list head (which was registered earlier, via 1724 * Fetch the list head (which was registered earlier, via
960 * sys_set_robust_list()): 1725 * sys_set_robust_list()):
961 */ 1726 */
962 if (get_user(entry, &head->list.next)) 1727 if (fetch_robust_entry(&entry, &head->list.next, &pi))
963 return; 1728 return;
964 /* 1729 /*
965 * Fetch the relative futex offset: 1730 * Fetch the relative futex offset:
@@ -970,24 +1735,25 @@ void exit_robust_list(struct task_struct *curr)
970 * Fetch any possibly pending lock-add first, and handle it 1735 * Fetch any possibly pending lock-add first, and handle it
971 * if it exists: 1736 * if it exists:
972 */ 1737 */
973 if (get_user(pending, &head->list_op_pending)) 1738 if (fetch_robust_entry(&pending, &head->list_op_pending, &pip))
974 return; 1739 return;
1740
975 if (pending) 1741 if (pending)
976 handle_futex_death((void *)pending + futex_offset, curr); 1742 handle_futex_death((void *)pending + futex_offset, curr, pip);
977 1743
978 while (entry != &head->list) { 1744 while (entry != &head->list) {
979 /* 1745 /*
980 * A pending lock might already be on the list, so 1746 * A pending lock might already be on the list, so
981 * dont process it twice: 1747 * don't process it twice:
982 */ 1748 */
983 if (entry != pending) 1749 if (entry != pending)
984 if (handle_futex_death((void *)entry + futex_offset, 1750 if (handle_futex_death((void *)entry + futex_offset,
985 curr)) 1751 curr, pi))
986 return; 1752 return;
987 /* 1753 /*
988 * Fetch the next entry in the list: 1754 * Fetch the next entry in the list:
989 */ 1755 */
990 if (get_user(entry, &entry->next)) 1756 if (fetch_robust_entry(&entry, &entry->next, &pi))
991 return; 1757 return;
992 /* 1758 /*
993 * Avoid excessively long or circular lists: 1759 * Avoid excessively long or circular lists:
@@ -999,8 +1765,8 @@ void exit_robust_list(struct task_struct *curr)
999 } 1765 }
1000} 1766}
1001 1767
1002long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout, 1768long do_futex(u32 __user *uaddr, int op, u32 val, unsigned long timeout,
1003 unsigned long uaddr2, int val2, int val3) 1769 u32 __user *uaddr2, u32 val2, u32 val3)
1004{ 1770{
1005 int ret; 1771 int ret;
1006 1772
@@ -1024,6 +1790,15 @@ long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout,
1024 case FUTEX_WAKE_OP: 1790 case FUTEX_WAKE_OP:
1025 ret = futex_wake_op(uaddr, uaddr2, val, val2, val3); 1791 ret = futex_wake_op(uaddr, uaddr2, val, val2, val3);
1026 break; 1792 break;
1793 case FUTEX_LOCK_PI:
1794 ret = futex_lock_pi(uaddr, val, timeout, val2, 0);
1795 break;
1796 case FUTEX_UNLOCK_PI:
1797 ret = futex_unlock_pi(uaddr);
1798 break;
1799 case FUTEX_TRYLOCK_PI:
1800 ret = futex_lock_pi(uaddr, 0, timeout, val2, 1);
1801 break;
1027 default: 1802 default:
1028 ret = -ENOSYS; 1803 ret = -ENOSYS;
1029 } 1804 }
@@ -1031,29 +1806,33 @@ long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout,
1031} 1806}
1032 1807
1033 1808
1034asmlinkage long sys_futex(u32 __user *uaddr, int op, int val, 1809asmlinkage long sys_futex(u32 __user *uaddr, int op, u32 val,
1035 struct timespec __user *utime, u32 __user *uaddr2, 1810 struct timespec __user *utime, u32 __user *uaddr2,
1036 int val3) 1811 u32 val3)
1037{ 1812{
1038 struct timespec t; 1813 struct timespec t;
1039 unsigned long timeout = MAX_SCHEDULE_TIMEOUT; 1814 unsigned long timeout = MAX_SCHEDULE_TIMEOUT;
1040 int val2 = 0; 1815 u32 val2 = 0;
1041 1816
1042 if (utime && (op == FUTEX_WAIT)) { 1817 if (utime && (op == FUTEX_WAIT || op == FUTEX_LOCK_PI)) {
1043 if (copy_from_user(&t, utime, sizeof(t)) != 0) 1818 if (copy_from_user(&t, utime, sizeof(t)) != 0)
1044 return -EFAULT; 1819 return -EFAULT;
1045 if (!timespec_valid(&t)) 1820 if (!timespec_valid(&t))
1046 return -EINVAL; 1821 return -EINVAL;
1047 timeout = timespec_to_jiffies(&t) + 1; 1822 if (op == FUTEX_WAIT)
1823 timeout = timespec_to_jiffies(&t) + 1;
1824 else {
1825 timeout = t.tv_sec;
1826 val2 = t.tv_nsec;
1827 }
1048 } 1828 }
1049 /* 1829 /*
1050 * requeue parameter in 'utime' if op == FUTEX_REQUEUE. 1830 * requeue parameter in 'utime' if op == FUTEX_REQUEUE.
1051 */ 1831 */
1052 if (op >= FUTEX_REQUEUE) 1832 if (op == FUTEX_REQUEUE || op == FUTEX_CMP_REQUEUE)
1053 val2 = (int) (unsigned long) utime; 1833 val2 = (u32) (unsigned long) utime;
1054 1834
1055 return do_futex((unsigned long)uaddr, op, val, timeout, 1835 return do_futex(uaddr, op, val, timeout, uaddr2, val2, val3);
1056 (unsigned long)uaddr2, val2, val3);
1057} 1836}
1058 1837
1059static int futexfs_get_sb(struct file_system_type *fs_type, 1838static int futexfs_get_sb(struct file_system_type *fs_type,
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index 1ab6a0ea3d14..c5cca3f65cb7 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -12,6 +12,23 @@
12 12
13#include <asm/uaccess.h> 13#include <asm/uaccess.h>
14 14
15
16/*
17 * Fetch a robust-list pointer. Bit 0 signals PI futexes:
18 */
19static inline int
20fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry,
21 compat_uptr_t *head, int *pi)
22{
23 if (get_user(*uentry, head))
24 return -EFAULT;
25
26 *entry = compat_ptr((*uentry) & ~1);
27 *pi = (unsigned int)(*uentry) & 1;
28
29 return 0;
30}
31
15/* 32/*
16 * Walk curr->robust_list (very carefully, it's a userspace list!) 33 * Walk curr->robust_list (very carefully, it's a userspace list!)
17 * and mark any locks found there dead, and notify any waiters. 34 * and mark any locks found there dead, and notify any waiters.
@@ -22,17 +39,16 @@ void compat_exit_robust_list(struct task_struct *curr)
22{ 39{
23 struct compat_robust_list_head __user *head = curr->compat_robust_list; 40 struct compat_robust_list_head __user *head = curr->compat_robust_list;
24 struct robust_list __user *entry, *pending; 41 struct robust_list __user *entry, *pending;
42 unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
25 compat_uptr_t uentry, upending; 43 compat_uptr_t uentry, upending;
26 unsigned int limit = ROBUST_LIST_LIMIT;
27 compat_long_t futex_offset; 44 compat_long_t futex_offset;
28 45
29 /* 46 /*
30 * Fetch the list head (which was registered earlier, via 47 * Fetch the list head (which was registered earlier, via
31 * sys_set_robust_list()): 48 * sys_set_robust_list()):
32 */ 49 */
33 if (get_user(uentry, &head->list.next)) 50 if (fetch_robust_entry(&uentry, &entry, &head->list.next, &pi))
34 return; 51 return;
35 entry = compat_ptr(uentry);
36 /* 52 /*
37 * Fetch the relative futex offset: 53 * Fetch the relative futex offset:
38 */ 54 */
@@ -42,11 +58,11 @@ void compat_exit_robust_list(struct task_struct *curr)
42 * Fetch any possibly pending lock-add first, and handle it 58 * Fetch any possibly pending lock-add first, and handle it
43 * if it exists: 59 * if it exists:
44 */ 60 */
45 if (get_user(upending, &head->list_op_pending)) 61 if (fetch_robust_entry(&upending, &pending,
62 &head->list_op_pending, &pip))
46 return; 63 return;
47 pending = compat_ptr(upending);
48 if (upending) 64 if (upending)
49 handle_futex_death((void *)pending + futex_offset, curr); 65 handle_futex_death((void *)pending + futex_offset, curr, pip);
50 66
51 while (compat_ptr(uentry) != &head->list) { 67 while (compat_ptr(uentry) != &head->list) {
52 /* 68 /*
@@ -55,15 +71,15 @@ void compat_exit_robust_list(struct task_struct *curr)
55 */ 71 */
56 if (entry != pending) 72 if (entry != pending)
57 if (handle_futex_death((void *)entry + futex_offset, 73 if (handle_futex_death((void *)entry + futex_offset,
58 curr)) 74 curr, pi))
59 return; 75 return;
60 76
61 /* 77 /*
62 * Fetch the next entry in the list: 78 * Fetch the next entry in the list:
63 */ 79 */
64 if (get_user(uentry, (compat_uptr_t *)&entry->next)) 80 if (fetch_robust_entry(&uentry, &entry,
81 (compat_uptr_t *)&entry->next, &pi))
65 return; 82 return;
66 entry = compat_ptr(uentry);
67 /* 83 /*
68 * Avoid excessively long or circular lists: 84 * Avoid excessively long or circular lists:
69 */ 85 */
@@ -129,16 +145,20 @@ asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val,
129 unsigned long timeout = MAX_SCHEDULE_TIMEOUT; 145 unsigned long timeout = MAX_SCHEDULE_TIMEOUT;
130 int val2 = 0; 146 int val2 = 0;
131 147
132 if (utime && (op == FUTEX_WAIT)) { 148 if (utime && (op == FUTEX_WAIT || op == FUTEX_LOCK_PI)) {
133 if (get_compat_timespec(&t, utime)) 149 if (get_compat_timespec(&t, utime))
134 return -EFAULT; 150 return -EFAULT;
135 if (!timespec_valid(&t)) 151 if (!timespec_valid(&t))
136 return -EINVAL; 152 return -EINVAL;
137 timeout = timespec_to_jiffies(&t) + 1; 153 if (op == FUTEX_WAIT)
154 timeout = timespec_to_jiffies(&t) + 1;
155 else {
156 timeout = t.tv_sec;
157 val2 = t.tv_nsec;
158 }
138 } 159 }
139 if (op >= FUTEX_REQUEUE) 160 if (op == FUTEX_REQUEUE || op == FUTEX_CMP_REQUEUE)
140 val2 = (int) (unsigned long) utime; 161 val2 = (int) (unsigned long) utime;
141 162
142 return do_futex((unsigned long)uaddr, op, val, timeout, 163 return do_futex(uaddr, op, val, timeout, uaddr2, val2, val3);
143 (unsigned long)uaddr2, val2, val3);
144} 164}
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 18324305724a..d0ba190dfeb6 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -98,7 +98,6 @@ static DEFINE_PER_CPU(struct hrtimer_base, hrtimer_bases[MAX_HRTIMER_BASES]) =
98 98
99/** 99/**
100 * ktime_get_ts - get the monotonic clock in timespec format 100 * ktime_get_ts - get the monotonic clock in timespec format
101 *
102 * @ts: pointer to timespec variable 101 * @ts: pointer to timespec variable
103 * 102 *
104 * The function calculates the monotonic clock from the realtime 103 * The function calculates the monotonic clock from the realtime
@@ -188,7 +187,7 @@ switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_base *base)
188{ 187{
189 struct hrtimer_base *new_base; 188 struct hrtimer_base *new_base;
190 189
191 new_base = &__get_cpu_var(hrtimer_bases[base->index]); 190 new_base = &__get_cpu_var(hrtimer_bases)[base->index];
192 191
193 if (base != new_base) { 192 if (base != new_base) {
194 /* 193 /*
@@ -238,7 +237,6 @@ lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
238# ifndef CONFIG_KTIME_SCALAR 237# ifndef CONFIG_KTIME_SCALAR
239/** 238/**
240 * ktime_add_ns - Add a scalar nanoseconds value to a ktime_t variable 239 * ktime_add_ns - Add a scalar nanoseconds value to a ktime_t variable
241 *
242 * @kt: addend 240 * @kt: addend
243 * @nsec: the scalar nsec value to add 241 * @nsec: the scalar nsec value to add
244 * 242 *
@@ -299,7 +297,6 @@ void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
299 297
300/** 298/**
301 * hrtimer_forward - forward the timer expiry 299 * hrtimer_forward - forward the timer expiry
302 *
303 * @timer: hrtimer to forward 300 * @timer: hrtimer to forward
304 * @now: forward past this time 301 * @now: forward past this time
305 * @interval: the interval to forward 302 * @interval: the interval to forward
@@ -411,7 +408,6 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_base *base)
411 408
412/** 409/**
413 * hrtimer_start - (re)start a relative timer on the current CPU 410 * hrtimer_start - (re)start a relative timer on the current CPU
414 *
415 * @timer: the timer to be added 411 * @timer: the timer to be added
416 * @tim: expiry time 412 * @tim: expiry time
417 * @mode: expiry mode: absolute (HRTIMER_ABS) or relative (HRTIMER_REL) 413 * @mode: expiry mode: absolute (HRTIMER_ABS) or relative (HRTIMER_REL)
@@ -460,14 +456,13 @@ EXPORT_SYMBOL_GPL(hrtimer_start);
460 456
461/** 457/**
462 * hrtimer_try_to_cancel - try to deactivate a timer 458 * hrtimer_try_to_cancel - try to deactivate a timer
463 *
464 * @timer: hrtimer to stop 459 * @timer: hrtimer to stop
465 * 460 *
466 * Returns: 461 * Returns:
467 * 0 when the timer was not active 462 * 0 when the timer was not active
468 * 1 when the timer was active 463 * 1 when the timer was active
469 * -1 when the timer is currently executing the callback function and 464 * -1 when the timer is currently executing the callback function and
470 * can not be stopped 465 * cannot be stopped
471 */ 466 */
472int hrtimer_try_to_cancel(struct hrtimer *timer) 467int hrtimer_try_to_cancel(struct hrtimer *timer)
473{ 468{
@@ -489,7 +484,6 @@ EXPORT_SYMBOL_GPL(hrtimer_try_to_cancel);
489 484
490/** 485/**
491 * hrtimer_cancel - cancel a timer and wait for the handler to finish. 486 * hrtimer_cancel - cancel a timer and wait for the handler to finish.
492 *
493 * @timer: the timer to be cancelled 487 * @timer: the timer to be cancelled
494 * 488 *
495 * Returns: 489 * Returns:
@@ -510,7 +504,6 @@ EXPORT_SYMBOL_GPL(hrtimer_cancel);
510 504
511/** 505/**
512 * hrtimer_get_remaining - get remaining time for the timer 506 * hrtimer_get_remaining - get remaining time for the timer
513 *
514 * @timer: the timer to read 507 * @timer: the timer to read
515 */ 508 */
516ktime_t hrtimer_get_remaining(const struct hrtimer *timer) 509ktime_t hrtimer_get_remaining(const struct hrtimer *timer)
@@ -564,7 +557,6 @@ ktime_t hrtimer_get_next_event(void)
564 557
565/** 558/**
566 * hrtimer_init - initialize a timer to the given clock 559 * hrtimer_init - initialize a timer to the given clock
567 *
568 * @timer: the timer to be initialized 560 * @timer: the timer to be initialized
569 * @clock_id: the clock to be used 561 * @clock_id: the clock to be used
570 * @mode: timer mode abs/rel 562 * @mode: timer mode abs/rel
@@ -576,7 +568,7 @@ void hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
576 568
577 memset(timer, 0, sizeof(struct hrtimer)); 569 memset(timer, 0, sizeof(struct hrtimer));
578 570
579 bases = per_cpu(hrtimer_bases, raw_smp_processor_id()); 571 bases = __raw_get_cpu_var(hrtimer_bases);
580 572
581 if (clock_id == CLOCK_REALTIME && mode != HRTIMER_ABS) 573 if (clock_id == CLOCK_REALTIME && mode != HRTIMER_ABS)
582 clock_id = CLOCK_MONOTONIC; 574 clock_id = CLOCK_MONOTONIC;
@@ -588,7 +580,6 @@ EXPORT_SYMBOL_GPL(hrtimer_init);
588 580
589/** 581/**
590 * hrtimer_get_res - get the timer resolution for a clock 582 * hrtimer_get_res - get the timer resolution for a clock
591 *
592 * @which_clock: which clock to query 583 * @which_clock: which clock to query
593 * @tp: pointer to timespec variable to store the resolution 584 * @tp: pointer to timespec variable to store the resolution
594 * 585 *
@@ -599,7 +590,7 @@ int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp)
599{ 590{
600 struct hrtimer_base *bases; 591 struct hrtimer_base *bases;
601 592
602 bases = per_cpu(hrtimer_bases, raw_smp_processor_id()); 593 bases = __raw_get_cpu_var(hrtimer_bases);
603 *tp = ktime_to_timespec(bases[which_clock].resolution); 594 *tp = ktime_to_timespec(bases[which_clock].resolution);
604 595
605 return 0; 596 return 0;
@@ -678,7 +669,7 @@ static int hrtimer_wakeup(struct hrtimer *timer)
678 return HRTIMER_NORESTART; 669 return HRTIMER_NORESTART;
679} 670}
680 671
681void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, task_t *task) 672void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task)
682{ 673{
683 sl->timer.function = hrtimer_wakeup; 674 sl->timer.function = hrtimer_wakeup;
684 sl->task = task; 675 sl->task = task;
@@ -702,7 +693,7 @@ static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mod
702 return t->task == NULL; 693 return t->task == NULL;
703} 694}
704 695
705static long __sched nanosleep_restart(struct restart_block *restart) 696long __sched hrtimer_nanosleep_restart(struct restart_block *restart)
706{ 697{
707 struct hrtimer_sleeper t; 698 struct hrtimer_sleeper t;
708 struct timespec __user *rmtp; 699 struct timespec __user *rmtp;
@@ -711,13 +702,13 @@ static long __sched nanosleep_restart(struct restart_block *restart)
711 702
712 restart->fn = do_no_restart_syscall; 703 restart->fn = do_no_restart_syscall;
713 704
714 hrtimer_init(&t.timer, restart->arg3, HRTIMER_ABS); 705 hrtimer_init(&t.timer, restart->arg0, HRTIMER_ABS);
715 t.timer.expires.tv64 = ((u64)restart->arg1 << 32) | (u64) restart->arg0; 706 t.timer.expires.tv64 = ((u64)restart->arg3 << 32) | (u64) restart->arg2;
716 707
717 if (do_nanosleep(&t, HRTIMER_ABS)) 708 if (do_nanosleep(&t, HRTIMER_ABS))
718 return 0; 709 return 0;
719 710
720 rmtp = (struct timespec __user *) restart->arg2; 711 rmtp = (struct timespec __user *) restart->arg1;
721 if (rmtp) { 712 if (rmtp) {
722 time = ktime_sub(t.timer.expires, t.timer.base->get_time()); 713 time = ktime_sub(t.timer.expires, t.timer.base->get_time());
723 if (time.tv64 <= 0) 714 if (time.tv64 <= 0)
@@ -727,7 +718,7 @@ static long __sched nanosleep_restart(struct restart_block *restart)
727 return -EFAULT; 718 return -EFAULT;
728 } 719 }
729 720
730 restart->fn = nanosleep_restart; 721 restart->fn = hrtimer_nanosleep_restart;
731 722
732 /* The other values in restart are already filled in */ 723 /* The other values in restart are already filled in */
733 return -ERESTART_RESTARTBLOCK; 724 return -ERESTART_RESTARTBLOCK;
@@ -760,11 +751,11 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
760 } 751 }
761 752
762 restart = &current_thread_info()->restart_block; 753 restart = &current_thread_info()->restart_block;
763 restart->fn = nanosleep_restart; 754 restart->fn = hrtimer_nanosleep_restart;
764 restart->arg0 = t.timer.expires.tv64 & 0xFFFFFFFF; 755 restart->arg0 = (unsigned long) t.timer.base->index;
765 restart->arg1 = t.timer.expires.tv64 >> 32; 756 restart->arg1 = (unsigned long) rmtp;
766 restart->arg2 = (unsigned long) rmtp; 757 restart->arg2 = t.timer.expires.tv64 & 0xFFFFFFFF;
767 restart->arg3 = (unsigned long) t.timer.base->index; 758 restart->arg3 = t.timer.expires.tv64 >> 32;
768 759
769 return -ERESTART_RESTARTBLOCK; 760 return -ERESTART_RESTARTBLOCK;
770} 761}
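For reference, the reshuffled restart_block layout that the two hunks above agree on (a summary of the assignments shown, nothing more):

    /*
     * restart->arg0 = t.timer.base->index        (clock to restart on)
     * restart->arg1 = rmtp                       (user pointer for remaining time)
     * restart->arg2 = expires.tv64 & 0xFFFFFFFF  (low 32 bits of expiry)
     * restart->arg3 = expires.tv64 >> 32         (high 32 bits of expiry)
     */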
@@ -791,8 +782,10 @@ static void __devinit init_hrtimers_cpu(int cpu)
791 struct hrtimer_base *base = per_cpu(hrtimer_bases, cpu); 782 struct hrtimer_base *base = per_cpu(hrtimer_bases, cpu);
792 int i; 783 int i;
793 784
794 for (i = 0; i < MAX_HRTIMER_BASES; i++, base++) 785 for (i = 0; i < MAX_HRTIMER_BASES; i++, base++) {
795 spin_lock_init(&base->lock); 786 spin_lock_init(&base->lock);
787 lockdep_set_class(&base->lock, &base->lock_key);
788 }
796} 789}
797 790
798#ifdef CONFIG_HOTPLUG_CPU 791#ifdef CONFIG_HOTPLUG_CPU
@@ -842,7 +835,7 @@ static void migrate_hrtimers(int cpu)
842} 835}
843#endif /* CONFIG_HOTPLUG_CPU */ 836#endif /* CONFIG_HOTPLUG_CPU */
844 837
845static int hrtimer_cpu_notify(struct notifier_block *self, 838static int __cpuinit hrtimer_cpu_notify(struct notifier_block *self,
846 unsigned long action, void *hcpu) 839 unsigned long action, void *hcpu)
847{ 840{
848 long cpu = (long)hcpu; 841 long cpu = (long)hcpu;
@@ -866,7 +859,7 @@ static int hrtimer_cpu_notify(struct notifier_block *self,
866 return NOTIFY_OK; 859 return NOTIFY_OK;
867} 860}
868 861
869static struct notifier_block hrtimers_nb = { 862static struct notifier_block __cpuinitdata hrtimers_nb = {
870 .notifier_call = hrtimer_cpu_notify, 863 .notifier_call = hrtimer_cpu_notify,
871}; 864};
872 865
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile
index 9f77f50d8143..1dab0ac3f797 100644
--- a/kernel/irq/Makefile
+++ b/kernel/irq/Makefile
@@ -1,5 +1,5 @@
1 1
2obj-y := handle.o manage.o spurious.o 2obj-y := handle.o manage.o spurious.o resend.o chip.o
3obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o 3obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o
4obj-$(CONFIG_PROC_FS) += proc.o 4obj-$(CONFIG_PROC_FS) += proc.o
5obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o 5obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o
diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c
index 3467097ca61a..533068cfb607 100644
--- a/kernel/irq/autoprobe.c
+++ b/kernel/irq/autoprobe.c
@@ -11,12 +11,14 @@
11#include <linux/interrupt.h> 11#include <linux/interrupt.h>
12#include <linux/delay.h> 12#include <linux/delay.h>
13 13
14#include "internals.h"
15
14/* 16/*
15 * Autodetection depends on the fact that any interrupt that 17 * Autodetection depends on the fact that any interrupt that
16 * comes in on to an unassigned handler will get stuck with 18 * comes in on to an unassigned handler will get stuck with
17 * "IRQ_WAITING" cleared and the interrupt disabled. 19 * "IRQ_WAITING" cleared and the interrupt disabled.
18 */ 20 */
19static DECLARE_MUTEX(probe_sem); 21static DEFINE_MUTEX(probing_active);
20 22
21/** 23/**
22 * probe_irq_on - begin an interrupt autodetect 24 * probe_irq_on - begin an interrupt autodetect
@@ -27,11 +29,11 @@ static DECLARE_MUTEX(probe_sem);
27 */ 29 */
28unsigned long probe_irq_on(void) 30unsigned long probe_irq_on(void)
29{ 31{
30 unsigned long val; 32 struct irq_desc *desc;
31 irq_desc_t *desc; 33 unsigned long mask;
32 unsigned int i; 34 unsigned int i;
33 35
34 down(&probe_sem); 36 mutex_lock(&probing_active);
35 /* 37 /*
36 * something may have generated an irq long ago and we want to 38 * something may have generated an irq long ago and we want to
37 * flush such a longstanding irq before considering it as spurious. 39 * flush such a longstanding irq before considering it as spurious.
@@ -40,8 +42,21 @@ unsigned long probe_irq_on(void)
40 desc = irq_desc + i; 42 desc = irq_desc + i;
41 43
42 spin_lock_irq(&desc->lock); 44 spin_lock_irq(&desc->lock);
43 if (!irq_desc[i].action) 45 if (!desc->action && !(desc->status & IRQ_NOPROBE)) {
44 irq_desc[i].handler->startup(i); 46 /*
47 * An old-style architecture might still have
48 * the handle_bad_irq handler there:
49 */
50 compat_irq_chip_set_default_handler(desc);
51
52 /*
53 * Some chips need to know about probing in
54 * progress:
55 */
56 if (desc->chip->set_type)
57 desc->chip->set_type(i, IRQ_TYPE_PROBE);
58 desc->chip->startup(i);
59 }
45 spin_unlock_irq(&desc->lock); 60 spin_unlock_irq(&desc->lock);
46 } 61 }
47 62
@@ -57,9 +72,9 @@ unsigned long probe_irq_on(void)
57 desc = irq_desc + i; 72 desc = irq_desc + i;
58 73
59 spin_lock_irq(&desc->lock); 74 spin_lock_irq(&desc->lock);
60 if (!desc->action) { 75 if (!desc->action && !(desc->status & IRQ_NOPROBE)) {
61 desc->status |= IRQ_AUTODETECT | IRQ_WAITING; 76 desc->status |= IRQ_AUTODETECT | IRQ_WAITING;
62 if (desc->handler->startup(i)) 77 if (desc->chip->startup(i))
63 desc->status |= IRQ_PENDING; 78 desc->status |= IRQ_PENDING;
64 } 79 }
65 spin_unlock_irq(&desc->lock); 80 spin_unlock_irq(&desc->lock);
@@ -73,11 +88,11 @@ unsigned long probe_irq_on(void)
73 /* 88 /*
74 * Now filter out any obviously spurious interrupts 89 * Now filter out any obviously spurious interrupts
75 */ 90 */
76 val = 0; 91 mask = 0;
77 for (i = 0; i < NR_IRQS; i++) { 92 for (i = 0; i < NR_IRQS; i++) {
78 irq_desc_t *desc = irq_desc + i;
79 unsigned int status; 93 unsigned int status;
80 94
95 desc = irq_desc + i;
81 spin_lock_irq(&desc->lock); 96 spin_lock_irq(&desc->lock);
82 status = desc->status; 97 status = desc->status;
83 98
@@ -85,17 +100,16 @@ unsigned long probe_irq_on(void)
85 /* It triggered already - consider it spurious. */ 100 /* It triggered already - consider it spurious. */
86 if (!(status & IRQ_WAITING)) { 101 if (!(status & IRQ_WAITING)) {
87 desc->status = status & ~IRQ_AUTODETECT; 102 desc->status = status & ~IRQ_AUTODETECT;
88 desc->handler->shutdown(i); 103 desc->chip->shutdown(i);
89 } else 104 } else
90 if (i < 32) 105 if (i < 32)
91 val |= 1 << i; 106 mask |= 1 << i;
92 } 107 }
93 spin_unlock_irq(&desc->lock); 108 spin_unlock_irq(&desc->lock);
94 } 109 }
95 110
96 return val; 111 return mask;
97} 112}
98
99EXPORT_SYMBOL(probe_irq_on); 113EXPORT_SYMBOL(probe_irq_on);
100 114
101/** 115/**
@@ -117,7 +131,7 @@ unsigned int probe_irq_mask(unsigned long val)
117 131
118 mask = 0; 132 mask = 0;
119 for (i = 0; i < NR_IRQS; i++) { 133 for (i = 0; i < NR_IRQS; i++) {
120 irq_desc_t *desc = irq_desc + i; 134 struct irq_desc *desc = irq_desc + i;
121 unsigned int status; 135 unsigned int status;
122 136
123 spin_lock_irq(&desc->lock); 137 spin_lock_irq(&desc->lock);
@@ -128,11 +142,11 @@ unsigned int probe_irq_mask(unsigned long val)
128 mask |= 1 << i; 142 mask |= 1 << i;
129 143
130 desc->status = status & ~IRQ_AUTODETECT; 144 desc->status = status & ~IRQ_AUTODETECT;
131 desc->handler->shutdown(i); 145 desc->chip->shutdown(i);
132 } 146 }
133 spin_unlock_irq(&desc->lock); 147 spin_unlock_irq(&desc->lock);
134 } 148 }
135 up(&probe_sem); 149 mutex_unlock(&probing_active);
136 150
137 return mask & val; 151 return mask & val;
138} 152}
@@ -160,7 +174,7 @@ int probe_irq_off(unsigned long val)
160 int i, irq_found = 0, nr_irqs = 0; 174 int i, irq_found = 0, nr_irqs = 0;
161 175
162 for (i = 0; i < NR_IRQS; i++) { 176 for (i = 0; i < NR_IRQS; i++) {
163 irq_desc_t *desc = irq_desc + i; 177 struct irq_desc *desc = irq_desc + i;
164 unsigned int status; 178 unsigned int status;
165 179
166 spin_lock_irq(&desc->lock); 180 spin_lock_irq(&desc->lock);
@@ -173,16 +187,16 @@ int probe_irq_off(unsigned long val)
173 nr_irqs++; 187 nr_irqs++;
174 } 188 }
175 desc->status = status & ~IRQ_AUTODETECT; 189 desc->status = status & ~IRQ_AUTODETECT;
176 desc->handler->shutdown(i); 190 desc->chip->shutdown(i);
177 } 191 }
178 spin_unlock_irq(&desc->lock); 192 spin_unlock_irq(&desc->lock);
179 } 193 }
180 up(&probe_sem); 194 mutex_unlock(&probing_active);
181 195
182 if (nr_irqs > 1) 196 if (nr_irqs > 1)
183 irq_found = -irq_found; 197 irq_found = -irq_found;
198
184 return irq_found; 199 return irq_found;
185} 200}
186
187EXPORT_SYMBOL(probe_irq_off); 201EXPORT_SYMBOL(probe_irq_off);
188 202
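The probing API itself is unchanged: a driver still brackets a self-test with probe_irq_on()/probe_irq_off(). A sketch, where my_device_trigger_irq() is a placeholder for whatever device-specific poke makes the hardware raise its interrupt:

    #include <linux/interrupt.h>
    #include <linux/delay.h>

    static int autoprobe_device_irq(void *dev)
    {
            unsigned long mask = probe_irq_on();

            my_device_trigger_irq(dev);     /* hypothetical: make the card fire */
            udelay(100);                    /* let the interrupt arrive */

            /* > 0: the probed irq, 0: nothing fired, < 0: several candidates */
            return probe_irq_off(mask);
    }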
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
new file mode 100644
index 000000000000..736cb0bd498f
--- /dev/null
+++ b/kernel/irq/chip.c
@@ -0,0 +1,533 @@
1/*
2 * linux/kernel/irq/chip.c
3 *
4 * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar
5 * Copyright (C) 2005-2006, Thomas Gleixner, Russell King
6 *
7 * This file contains the core interrupt handling code, for irq-chip
8 * based architectures.
9 *
10 * Detailed information is available in Documentation/DocBook/genericirq
11 */
12
13#include <linux/irq.h>
14#include <linux/module.h>
15#include <linux/interrupt.h>
16#include <linux/kernel_stat.h>
17
18#include "internals.h"
19
20/**
21 * set_irq_chip - set the irq chip for an irq
22 * @irq: irq number
23 * @chip: pointer to irq chip description structure
24 */
25int set_irq_chip(unsigned int irq, struct irq_chip *chip)
26{
27 struct irq_desc *desc;
28 unsigned long flags;
29
30 if (irq >= NR_IRQS) {
31 printk(KERN_ERR "Trying to install chip for IRQ%d\n", irq);
32 WARN_ON(1);
33 return -EINVAL;
34 }
35
36 if (!chip)
37 chip = &no_irq_chip;
38
39 desc = irq_desc + irq;
40 spin_lock_irqsave(&desc->lock, flags);
41 irq_chip_set_defaults(chip);
42 desc->chip = chip;
43 spin_unlock_irqrestore(&desc->lock, flags);
44
45 return 0;
46}
47EXPORT_SYMBOL(set_irq_chip);
48
49/**
50 * set_irq_type - set the irq type for an irq
51 * @irq: irq number
52 * @type: interrupt type - see include/linux/interrupt.h
53 */
54int set_irq_type(unsigned int irq, unsigned int type)
55{
56 struct irq_desc *desc;
57 unsigned long flags;
58 int ret = -ENXIO;
59
60 if (irq >= NR_IRQS) {
61 printk(KERN_ERR "Trying to set irq type for IRQ%d\n", irq);
62 return -ENODEV;
63 }
64
65 desc = irq_desc + irq;
66 if (desc->chip->set_type) {
67 spin_lock_irqsave(&desc->lock, flags);
68 ret = desc->chip->set_type(irq, type);
69 spin_unlock_irqrestore(&desc->lock, flags);
70 }
71 return ret;
72}
73EXPORT_SYMBOL(set_irq_type);
74
75/**
76 * set_irq_data - set irq type data for an irq
77 * @irq: Interrupt number
78 * @data: Pointer to interrupt specific data
79 *
80 * Set the hardware irq controller data for an irq
81 */
82int set_irq_data(unsigned int irq, void *data)
83{
84 struct irq_desc *desc;
85 unsigned long flags;
86
87 if (irq >= NR_IRQS) {
88 printk(KERN_ERR
89 "Trying to install controller data for IRQ%d\n", irq);
90 return -EINVAL;
91 }
92
93 desc = irq_desc + irq;
94 spin_lock_irqsave(&desc->lock, flags);
95 desc->handler_data = data;
96 spin_unlock_irqrestore(&desc->lock, flags);
97 return 0;
98}
99EXPORT_SYMBOL(set_irq_data);
100
101/**
102 * set_irq_chip_data - set irq chip data for an irq
103 * @irq: Interrupt number
104 * @data: Pointer to chip specific data
105 *
106 * Set the hardware irq chip data for an irq
107 */
108int set_irq_chip_data(unsigned int irq, void *data)
109{
110 struct irq_desc *desc = irq_desc + irq;
111 unsigned long flags;
112
113 if (irq >= NR_IRQS || !desc->chip) {
114 printk(KERN_ERR "BUG: bad set_irq_chip_data(IRQ#%d)\n", irq);
115 return -EINVAL;
116 }
117
118 spin_lock_irqsave(&desc->lock, flags);
119 desc->chip_data = data;
120 spin_unlock_irqrestore(&desc->lock, flags);
121
122 return 0;
123}
124EXPORT_SYMBOL(set_irq_chip_data);
125
126/*
127 * default enable function
128 */
129static void default_enable(unsigned int irq)
130{
131 struct irq_desc *desc = irq_desc + irq;
132
133 desc->chip->unmask(irq);
134 desc->status &= ~IRQ_MASKED;
135}
136
137/*
138 * default disable function
139 */
140static void default_disable(unsigned int irq)
141{
142 struct irq_desc *desc = irq_desc + irq;
143
144 if (!(desc->status & IRQ_DELAYED_DISABLE))
145 desc->chip->mask(irq);
146}
147
148/*
149 * default startup function
150 */
151static unsigned int default_startup(unsigned int irq)
152{
153 irq_desc[irq].chip->enable(irq);
154
155 return 0;
156}
157
158/*
159 * Fixup enable/disable function pointers
160 */
161void irq_chip_set_defaults(struct irq_chip *chip)
162{
163 if (!chip->enable)
164 chip->enable = default_enable;
165 if (!chip->disable)
166 chip->disable = default_disable;
167 if (!chip->startup)
168 chip->startup = default_startup;
169 if (!chip->shutdown)
170 chip->shutdown = chip->disable;
171 if (!chip->name)
172 chip->name = chip->typename;
173}
174
175static inline void mask_ack_irq(struct irq_desc *desc, int irq)
176{
177 if (desc->chip->mask_ack)
178 desc->chip->mask_ack(irq);
179 else {
180 desc->chip->mask(irq);
181 desc->chip->ack(irq);
182 }
183}
184
185/**
186 * handle_simple_irq - Simple and software-decoded IRQs.
187 * @irq: the interrupt number
188 * @desc: the interrupt description structure for this irq
189 * @regs: pointer to a register structure
190 *
191 * Simple interrupts are either sent from a demultiplexing interrupt
192 * handler or come from hardware, where no interrupt hardware control
193 * is necessary.
194 *
195 * Note: The caller is expected to handle the ack, clear, mask and
196 * unmask issues if necessary.
197 */
198void fastcall
199handle_simple_irq(unsigned int irq, struct irq_desc *desc, struct pt_regs *regs)
200{
201 struct irqaction *action;
202 irqreturn_t action_ret;
203 const unsigned int cpu = smp_processor_id();
204
205 spin_lock(&desc->lock);
206
207 if (unlikely(desc->status & IRQ_INPROGRESS))
208 goto out_unlock;
209 desc->status &= ~(IRQ_REPLAY | IRQ_WAITING);
210 kstat_cpu(cpu).irqs[irq]++;
211
212 action = desc->action;
213 if (unlikely(!action || (desc->status & IRQ_DISABLED)))
214 goto out_unlock;
215
216 desc->status |= IRQ_INPROGRESS;
217 spin_unlock(&desc->lock);
218
219 action_ret = handle_IRQ_event(irq, regs, action);
220 if (!noirqdebug)
221 note_interrupt(irq, desc, action_ret, regs);
222
223 spin_lock(&desc->lock);
224 desc->status &= ~IRQ_INPROGRESS;
225out_unlock:
226 spin_unlock(&desc->lock);
227}
228
229/**
230 * handle_level_irq - Level type irq handler
231 * @irq: the interrupt number
232 * @desc: the interrupt description structure for this irq
233 * @regs: pointer to a register structure
234 *
235 * Level type interrupts are active as long as the hardware line has
236 * the active level. This may require masking the interrupt and unmasking
237 * it after the associated handler has acknowledged the device, so that the
238 * interrupt line goes back to inactive.
239 */
240void fastcall
241handle_level_irq(unsigned int irq, struct irq_desc *desc, struct pt_regs *regs)
242{
243 unsigned int cpu = smp_processor_id();
244 struct irqaction *action;
245 irqreturn_t action_ret;
246
247 spin_lock(&desc->lock);
248 mask_ack_irq(desc, irq);
249
250 if (unlikely(desc->status & IRQ_INPROGRESS))
251 goto out_unlock;
252 desc->status &= ~(IRQ_REPLAY | IRQ_WAITING);
253 kstat_cpu(cpu).irqs[irq]++;
254
255 /*
256 * If it's disabled or no action is available,
257 * keep it masked and get out of here
258 */
259 action = desc->action;
260 if (unlikely(!action || (desc->status & IRQ_DISABLED))) {
261 desc->status |= IRQ_PENDING;
262 goto out_unlock;
263 }
264
265 desc->status |= IRQ_INPROGRESS;
266 desc->status &= ~IRQ_PENDING;
267 spin_unlock(&desc->lock);
268
269 action_ret = handle_IRQ_event(irq, regs, action);
270 if (!noirqdebug)
271 note_interrupt(irq, desc, action_ret, regs);
272
273 spin_lock(&desc->lock);
274 desc->status &= ~IRQ_INPROGRESS;
275 if (!(desc->status & IRQ_DISABLED) && desc->chip->unmask)
276 desc->chip->unmask(irq);
277out_unlock:
278 spin_unlock(&desc->lock);
279}
280
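handle_level_irq() only needs mask, ack and unmask from the chip; the enable/disable/startup defaults are filled in by irq_chip_set_defaults() above. A sketch of a hypothetical memory-mapped controller, with DEMO_IRQ_BASE and the DEMO_* register addresses made up for illustration:

    #include <linux/irq.h>
    #include <asm/io.h>

    static void demo_mask(unsigned int irq)
    {
            writel(1 << (irq - DEMO_IRQ_BASE), DEMO_MASK_SET);
    }

    static void demo_unmask(unsigned int irq)
    {
            writel(1 << (irq - DEMO_IRQ_BASE), DEMO_MASK_CLEAR);
    }

    static void demo_ack(unsigned int irq)
    {
            writel(1 << (irq - DEMO_IRQ_BASE), DEMO_INT_ACK);
    }

    static struct irq_chip demo_chip = {
            .name   = "DEMO",
            .ack    = demo_ack,
            .mask   = demo_mask,
            .unmask = demo_unmask,
    };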
281/**
282 * handle_fasteoi_irq - irq handler for transparent controllers
283 * @irq: the interrupt number
284 * @desc: the interrupt description structure for this irq
285 * @regs: pointer to a register structure
286 *
287 * Only a single callback will be issued to the chip: an ->eoi()
288 * call when the interrupt has been serviced. This enables support
289 * for modern forms of interrupt handlers, which handle the flow
290 * details in hardware, transparently.
291 */
292void fastcall
293handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc,
294 struct pt_regs *regs)
295{
296 unsigned int cpu = smp_processor_id();
297 struct irqaction *action;
298 irqreturn_t action_ret;
299
300 spin_lock(&desc->lock);
301
302 if (unlikely(desc->status & IRQ_INPROGRESS))
303 goto out;
304
305 desc->status &= ~(IRQ_REPLAY | IRQ_WAITING);
306 kstat_cpu(cpu).irqs[irq]++;
307
308 /*
309 * If it's disabled or no action is available,
310 * keep it masked and get out of here
311 */
312 action = desc->action;
313 if (unlikely(!action || (desc->status & IRQ_DISABLED))) {
314 desc->status |= IRQ_PENDING;
315 goto out;
316 }
317
318 desc->status |= IRQ_INPROGRESS;
319 desc->status &= ~IRQ_PENDING;
320 spin_unlock(&desc->lock);
321
322 action_ret = handle_IRQ_event(irq, regs, action);
323 if (!noirqdebug)
324 note_interrupt(irq, desc, action_ret, regs);
325
326 spin_lock(&desc->lock);
327 desc->status &= ~IRQ_INPROGRESS;
328out:
329 desc->chip->eoi(irq);
330
331 spin_unlock(&desc->lock);
332}
333
334/**
335 * handle_edge_irq - edge type IRQ handler
336 * @irq: the interrupt number
337 * @desc: the interrupt description structure for this irq
338 * @regs: pointer to a register structure
339 *
340 * The interrupt occurs on the falling and/or rising edge of a hardware
341 * signal. The occurrence is latched into the irq controller hardware
342 * and must be acked in order to be re-enabled. After the ack another
343 * interrupt can happen on the same source even before the first one
344 * is handled by the associated event handler. If this happens it
345 * might be necessary to disable (mask) the interrupt depending on the
346 * controller hardware. This requires re-enabling the interrupt inside
347 * the loop which handles the interrupts that arrived while
348 * the handler was running. Once all pending interrupts are handled, the
349 * loop is left.
350 */
351void fastcall
352handle_edge_irq(unsigned int irq, struct irq_desc *desc, struct pt_regs *regs)
353{
354 const unsigned int cpu = smp_processor_id();
355
356 spin_lock(&desc->lock);
357
358 desc->status &= ~(IRQ_REPLAY | IRQ_WAITING);
359
360 /*
361 * If we're currently running this IRQ, or it's disabled,
362 * we shouldn't process the IRQ. Mark it pending, handle
363 * the necessary masking and go out
364 */
365 if (unlikely((desc->status & (IRQ_INPROGRESS | IRQ_DISABLED)) ||
366 !desc->action)) {
367 desc->status |= (IRQ_PENDING | IRQ_MASKED);
368 mask_ack_irq(desc, irq);
369 goto out_unlock;
370 }
371
372 kstat_cpu(cpu).irqs[irq]++;
373
374 /* Start handling the irq */
375 desc->chip->ack(irq);
376
377 /* Mark the IRQ currently in progress.*/
378 desc->status |= IRQ_INPROGRESS;
379
380 do {
381 struct irqaction *action = desc->action;
382 irqreturn_t action_ret;
383
384 if (unlikely(!action)) {
385 desc->chip->mask(irq);
386 goto out_unlock;
387 }
388
389 /*
390 * When another irq arrived while we were handling
391 * one, we could have masked the irq.
392 * Re-enable it, if it was not disabled in the meantime.
393 */
394 if (unlikely((desc->status &
395 (IRQ_PENDING | IRQ_MASKED | IRQ_DISABLED)) ==
396 (IRQ_PENDING | IRQ_MASKED))) {
397 desc->chip->unmask(irq);
398 desc->status &= ~IRQ_MASKED;
399 }
400
401 desc->status &= ~IRQ_PENDING;
402 spin_unlock(&desc->lock);
403 action_ret = handle_IRQ_event(irq, regs, action);
404 if (!noirqdebug)
405 note_interrupt(irq, desc, action_ret, regs);
406 spin_lock(&desc->lock);
407
408 } while ((desc->status & (IRQ_PENDING | IRQ_DISABLED)) == IRQ_PENDING);
409
410 desc->status &= ~IRQ_INPROGRESS;
411out_unlock:
412 spin_unlock(&desc->lock);
413}
414
415#ifdef CONFIG_SMP
416/**
417 * handle_percpu_irq - Per CPU local irq handler
418 * @irq: the interrupt number
419 * @desc: the interrupt description structure for this irq
420 * @regs: pointer to a register structure
421 *
422 * Per CPU interrupts on SMP machines without locking requirements
423 */
424void fastcall
425handle_percpu_irq(unsigned int irq, struct irq_desc *desc, struct pt_regs *regs)
426{
427 irqreturn_t action_ret;
428
429 kstat_this_cpu.irqs[irq]++;
430
431 if (desc->chip->ack)
432 desc->chip->ack(irq);
433
434 action_ret = handle_IRQ_event(irq, regs, desc->action);
435 if (!noirqdebug)
436 note_interrupt(irq, desc, action_ret, regs);
437
438 if (desc->chip->eoi)
439 desc->chip->eoi(irq);
440}
441
442#endif /* CONFIG_SMP */
443
444void
445__set_irq_handler(unsigned int irq,
446 void fastcall (*handle)(unsigned int, irq_desc_t *,
447 struct pt_regs *),
448 int is_chained)
449{
450 struct irq_desc *desc;
451 unsigned long flags;
452
453 if (irq >= NR_IRQS) {
454 printk(KERN_ERR
455 "Trying to install type control for IRQ%d\n", irq);
456 return;
457 }
458
459 desc = irq_desc + irq;
460
461 if (!handle)
462 handle = handle_bad_irq;
463
464 if (desc->chip == &no_irq_chip) {
465 printk(KERN_WARNING "Trying to install %sinterrupt handler "
466 "for IRQ%d\n", is_chained ? "chained " : " ", irq);
467 /*
468 * Some ARM implementations install a handler for really dumb
469 * interrupt hardware without setting an irq_chip. This worked
470 * with the ARM no_irq_chip but the check in setup_irq would
471 * prevent us from setting up the interrupt at all. Switch it to
472 * dummy_irq_chip for easy transition.
473 */
474 desc->chip = &dummy_irq_chip;
475 }
476
477 spin_lock_irqsave(&desc->lock, flags);
478
479 /* Uninstall? */
480 if (handle == handle_bad_irq) {
481 if (desc->chip != &no_irq_chip) {
482 desc->chip->mask(irq);
483 desc->chip->ack(irq);
484 }
485 desc->status |= IRQ_DISABLED;
486 desc->depth = 1;
487 }
488 desc->handle_irq = handle;
489
490 if (handle != handle_bad_irq && is_chained) {
491 desc->status &= ~IRQ_DISABLED;
492 desc->status |= IRQ_NOREQUEST | IRQ_NOPROBE;
493 desc->depth = 0;
494 desc->chip->unmask(irq);
495 }
496 spin_unlock_irqrestore(&desc->lock, flags);
497}
498
499void
500set_irq_chip_and_handler(unsigned int irq, struct irq_chip *chip,
501 void fastcall (*handle)(unsigned int,
502 struct irq_desc *,
503 struct pt_regs *))
504{
505 set_irq_chip(irq, chip);
506 __set_irq_handler(irq, handle, 0);
507}
508
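A platform would typically wire a chip to its flow handler from the init_IRQ() path; continuing the hypothetical demo_chip from the earlier sketch, with DEMO_IRQ_BASE/DEMO_NR_IRQS still made-up names:

    #include <linux/init.h>
    #include <linux/irq.h>

    static void __init demo_init_irq(void)
    {
            unsigned int irq;

            for (irq = DEMO_IRQ_BASE; irq < DEMO_IRQ_BASE + DEMO_NR_IRQS; irq++)
                    set_irq_chip_and_handler(irq, &demo_chip, handle_level_irq);
    }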
509/*
510 * Get a descriptive string for the highlevel handler, for
511 * /proc/interrupts output:
512 */
513const char *
514handle_irq_name(void fastcall (*handle)(unsigned int, struct irq_desc *,
515 struct pt_regs *))
516{
517 if (handle == handle_level_irq)
518 return "level ";
519 if (handle == handle_fasteoi_irq)
520 return "fasteoi";
521 if (handle == handle_edge_irq)
522 return "edge ";
523 if (handle == handle_simple_irq)
524 return "simple ";
525#ifdef CONFIG_SMP
526 if (handle == handle_percpu_irq)
527 return "percpu ";
528#endif
529 if (handle == handle_bad_irq)
530 return "bad ";
531
532 return NULL;
533}
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 0f6530117105..4c6cdbaed661 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -1,9 +1,13 @@
1/* 1/*
2 * linux/kernel/irq/handle.c 2 * linux/kernel/irq/handle.c
3 * 3 *
4 * Copyright (C) 1992, 1998-2004 Linus Torvalds, Ingo Molnar 4 * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar
5 * Copyright (C) 2005-2006, Thomas Gleixner, Russell King
5 * 6 *
6 * This file contains the core interrupt handling code. 7 * This file contains the core interrupt handling code.
8 *
9 * Detailed information is available in Documentation/DocBook/genericirq
10 *
7 */ 11 */
8 12
9#include <linux/irq.h> 13#include <linux/irq.h>
@@ -14,11 +18,27 @@
14 18
15#include "internals.h" 19#include "internals.h"
16 20
21/**
22 * handle_bad_irq - handle spurious and unhandled irqs
23 * @irq: the interrupt number
24 * @desc: description of the interrupt
25 * @regs: pointer to a register structure
26 *
27 * Handles spurious and unhandled IRQs. It also prints a debug message.
28 */
29void fastcall
30handle_bad_irq(unsigned int irq, struct irq_desc *desc, struct pt_regs *regs)
31{
32 print_irq_desc(irq, desc);
33 kstat_this_cpu.irqs[irq]++;
34 ack_bad_irq(irq);
35}
36
17/* 37/*
18 * Linux has a controller-independent interrupt architecture. 38 * Linux has a controller-independent interrupt architecture.
19 * Every controller has a 'controller-template', that is used 39 * Every controller has a 'controller-template', that is used
20 * by the main code to do the right thing. Each driver-visible 40 * by the main code to do the right thing. Each driver-visible
21 * interrupt source is transparently wired to the apropriate 41 * interrupt source is transparently wired to the appropriate
22 * controller. Thus drivers need not be aware of the 42 * controller. Thus drivers need not be aware of the
23 * interrupt-controller. 43 * interrupt-controller.
24 * 44 *
@@ -28,41 +48,68 @@
28 * 48 *
29 * Controller mappings for all interrupt sources: 49 * Controller mappings for all interrupt sources:
30 */ 50 */
31irq_desc_t irq_desc[NR_IRQS] __cacheline_aligned = { 51struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned = {
32 [0 ... NR_IRQS-1] = { 52 [0 ... NR_IRQS-1] = {
33 .status = IRQ_DISABLED, 53 .status = IRQ_DISABLED,
34 .handler = &no_irq_type, 54 .chip = &no_irq_chip,
35 .lock = SPIN_LOCK_UNLOCKED 55 .handle_irq = handle_bad_irq,
56 .depth = 1,
57 .lock = SPIN_LOCK_UNLOCKED,
58#ifdef CONFIG_SMP
59 .affinity = CPU_MASK_ALL
60#endif
36 } 61 }
37}; 62};
38 63
39/* 64/*
40 * Generic 'no controller' code 65 * What should we do if we get a hw irq event on an illegal vector?
66 * Each architecture has to answer this themself.
41 */ 67 */
42static void end_none(unsigned int irq) { } 68static void ack_bad(unsigned int irq)
43static void enable_none(unsigned int irq) { }
44static void disable_none(unsigned int irq) { }
45static void shutdown_none(unsigned int irq) { }
46static unsigned int startup_none(unsigned int irq) { return 0; }
47
48static void ack_none(unsigned int irq)
49{ 69{
50 /* 70 print_irq_desc(irq, irq_desc + irq);
51 * 'what should we do if we get a hw irq event on an illegal vector'.
52 * each architecture has to answer this themself.
53 */
54 ack_bad_irq(irq); 71 ack_bad_irq(irq);
55} 72}
56 73
57struct hw_interrupt_type no_irq_type = { 74/*
58 .typename = "none", 75 * NOP functions
59 .startup = startup_none, 76 */
60 .shutdown = shutdown_none, 77static void noop(unsigned int irq)
61 .enable = enable_none, 78{
62 .disable = disable_none, 79}
63 .ack = ack_none, 80
64 .end = end_none, 81static unsigned int noop_ret(unsigned int irq)
65 .set_affinity = NULL 82{
83 return 0;
84}
85
86/*
87 * Generic no controller implementation
88 */
89struct irq_chip no_irq_chip = {
90 .name = "none",
91 .startup = noop_ret,
92 .shutdown = noop,
93 .enable = noop,
94 .disable = noop,
95 .ack = ack_bad,
96 .end = noop,
97};
98
99/*
100 * Generic dummy implementation which can be used for
101 * real dumb interrupt sources
102 */
103struct irq_chip dummy_irq_chip = {
104 .name = "dummy",
105 .startup = noop_ret,
106 .shutdown = noop,
107 .enable = noop,
108 .disable = noop,
109 .ack = noop,
110 .mask = noop,
111 .unmask = noop,
112 .end = noop,
66}; 113};
67 114
68/* 115/*
@@ -73,17 +120,24 @@ irqreturn_t no_action(int cpl, void *dev_id, struct pt_regs *regs)
73 return IRQ_NONE; 120 return IRQ_NONE;
74} 121}
75 122
76/* 123/**
77 * Have got an event to handle: 124 * handle_IRQ_event - irq action chain handler
125 * @irq: the interrupt number
126 * @regs: pointer to a register structure
127 * @action: the interrupt action chain for this irq
128 *
129 * Handles the action chain of an irq event
78 */ 130 */
79fastcall irqreturn_t handle_IRQ_event(unsigned int irq, struct pt_regs *regs, 131irqreturn_t handle_IRQ_event(unsigned int irq, struct pt_regs *regs,
80 struct irqaction *action) 132 struct irqaction *action)
81{ 133{
82 irqreturn_t ret, retval = IRQ_NONE; 134 irqreturn_t ret, retval = IRQ_NONE;
83 unsigned int status = 0; 135 unsigned int status = 0;
84 136
85 if (!(action->flags & SA_INTERRUPT)) 137 handle_dynamic_tick(action);
86 local_irq_enable(); 138
139 if (!(action->flags & IRQF_DISABLED))
140 local_irq_enable_in_hardirq();
87 141
88 do { 142 do {
89 ret = action->handler(irq, action->dev_id, regs); 143 ret = action->handler(irq, action->dev_id, regs);
@@ -93,22 +147,30 @@ fastcall irqreturn_t handle_IRQ_event(unsigned int irq, struct pt_regs *regs,
93 action = action->next; 147 action = action->next;
94 } while (action); 148 } while (action);
95 149
96 if (status & SA_SAMPLE_RANDOM) 150 if (status & IRQF_SAMPLE_RANDOM)
97 add_interrupt_randomness(irq); 151 add_interrupt_randomness(irq);
98 local_irq_disable(); 152 local_irq_disable();
99 153
100 return retval; 154 return retval;
101} 155}
102 156
103/* 157#ifndef CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ
104 * do_IRQ handles all normal device IRQ's (the special 158/**
159 * __do_IRQ - original all in one highlevel IRQ handler
160 * @irq: the interrupt number
161 * @regs: pointer to a register structure
162 *
163 * __do_IRQ handles all normal device IRQ's (the special
105 * SMP cross-CPU interrupts have their own specific 164 * SMP cross-CPU interrupts have their own specific
106 * handlers). 165 * handlers).
166 *
167 * This is the original x86 implementation which is used for every
168 * interrupt type.
107 */ 169 */
108fastcall unsigned int __do_IRQ(unsigned int irq, struct pt_regs *regs) 170fastcall unsigned int __do_IRQ(unsigned int irq, struct pt_regs *regs)
109{ 171{
110 irq_desc_t *desc = irq_desc + irq; 172 struct irq_desc *desc = irq_desc + irq;
111 struct irqaction * action; 173 struct irqaction *action;
112 unsigned int status; 174 unsigned int status;
113 175
114 kstat_this_cpu.irqs[irq]++; 176 kstat_this_cpu.irqs[irq]++;
@@ -118,16 +180,16 @@ fastcall unsigned int __do_IRQ(unsigned int irq, struct pt_regs *regs)
118 /* 180 /*
119 * No locking required for CPU-local interrupts: 181 * No locking required for CPU-local interrupts:
120 */ 182 */
121 if (desc->handler->ack) 183 if (desc->chip->ack)
122 desc->handler->ack(irq); 184 desc->chip->ack(irq);
123 action_ret = handle_IRQ_event(irq, regs, desc->action); 185 action_ret = handle_IRQ_event(irq, regs, desc->action);
124 desc->handler->end(irq); 186 desc->chip->end(irq);
125 return 1; 187 return 1;
126 } 188 }
127 189
128 spin_lock(&desc->lock); 190 spin_lock(&desc->lock);
129 if (desc->handler->ack) 191 if (desc->chip->ack)
130 desc->handler->ack(irq); 192 desc->chip->ack(irq);
131 /* 193 /*
132 * REPLAY is when Linux resends an IRQ that was dropped earlier 194 * REPLAY is when Linux resends an IRQ that was dropped earlier
133 * WAITING is used by probe to mark irqs that are being tested 195 * WAITING is used by probe to mark irqs that are being tested
@@ -187,9 +249,26 @@ out:
187 * The ->end() handler has to deal with interrupts which got 249 * The ->end() handler has to deal with interrupts which got
188 * disabled while the handler was running. 250 * disabled while the handler was running.
189 */ 251 */
190 desc->handler->end(irq); 252 desc->chip->end(irq);
191 spin_unlock(&desc->lock); 253 spin_unlock(&desc->lock);
192 254
193 return 1; 255 return 1;
194} 256}
257#endif
258
259#ifdef CONFIG_TRACE_IRQFLAGS
260
261/*
262 * lockdep: we want to handle all irq_desc locks as a single lock-class:
263 */
264static struct lock_class_key irq_desc_lock_class;
265
266void early_init_irq_lock_class(void)
267{
268 int i;
269
270 for (i = 0; i < NR_IRQS; i++)
271 lockdep_set_class(&irq_desc[i].lock, &irq_desc_lock_class);
272}
195 273
274#endif
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 46feba630266..08a849a22447 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -4,6 +4,12 @@
4 4
5extern int noirqdebug; 5extern int noirqdebug;
6 6
7/* Set default functions for irq_chip structures: */
8extern void irq_chip_set_defaults(struct irq_chip *chip);
9
10/* Set default handler: */
11extern void compat_irq_chip_set_default_handler(struct irq_desc *desc);
12
7#ifdef CONFIG_PROC_FS 13#ifdef CONFIG_PROC_FS
8extern void register_irq_proc(unsigned int irq); 14extern void register_irq_proc(unsigned int irq);
9extern void register_handler_proc(unsigned int irq, struct irqaction *action); 15extern void register_handler_proc(unsigned int irq, struct irqaction *action);
@@ -16,3 +22,43 @@ static inline void unregister_handler_proc(unsigned int irq,
16 struct irqaction *action) { } 22 struct irqaction *action) { }
17#endif 23#endif
18 24
25/*
26 * Debugging printout:
27 */
28
29#include <linux/kallsyms.h>
30
31#define P(f) if (desc->status & f) printk("%14s set\n", #f)
32
33static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc)
34{
35 printk("irq %d, desc: %p, depth: %d, count: %d, unhandled: %d\n",
36 irq, desc, desc->depth, desc->irq_count, desc->irqs_unhandled);
37 printk("->handle_irq(): %p, ", desc->handle_irq);
38 print_symbol("%s\n", (unsigned long)desc->handle_irq);
39 printk("->chip(): %p, ", desc->chip);
40 print_symbol("%s\n", (unsigned long)desc->chip);
41 printk("->action(): %p\n", desc->action);
42 if (desc->action) {
43 printk("->action->handler(): %p, ", desc->action->handler);
44 print_symbol("%s\n", (unsigned long)desc->action->handler);
45 }
46
47 P(IRQ_INPROGRESS);
48 P(IRQ_DISABLED);
49 P(IRQ_PENDING);
50 P(IRQ_REPLAY);
51 P(IRQ_AUTODETECT);
52 P(IRQ_WAITING);
53 P(IRQ_LEVEL);
54 P(IRQ_MASKED);
55#ifdef CONFIG_IRQ_PER_CPU
56 P(IRQ_PER_CPU);
57#endif
58 P(IRQ_NOPROBE);
59 P(IRQ_NOREQUEST);
60 P(IRQ_NOAUTOEN);
61}
62
63#undef P
64
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 1279e3499534..92be519eff26 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -1,12 +1,12 @@
1/* 1/*
2 * linux/kernel/irq/manage.c 2 * linux/kernel/irq/manage.c
3 * 3 *
4 * Copyright (C) 1992, 1998-2004 Linus Torvalds, Ingo Molnar 4 * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar
5 * Copyright (C) 2005-2006 Thomas Gleixner
5 * 6 *
6 * This file contains driver APIs to the irq subsystem. 7 * This file contains driver APIs to the irq subsystem.
7 */ 8 */
8 9
9#include <linux/config.h>
10#include <linux/irq.h> 10#include <linux/irq.h>
11#include <linux/module.h> 11#include <linux/module.h>
12#include <linux/random.h> 12#include <linux/random.h>
@@ -16,12 +16,6 @@
16 16
17#ifdef CONFIG_SMP 17#ifdef CONFIG_SMP
18 18
19cpumask_t irq_affinity[NR_IRQS] = { [0 ... NR_IRQS-1] = CPU_MASK_ALL };
20
21#if defined (CONFIG_GENERIC_PENDING_IRQ) || defined (CONFIG_IRQBALANCE)
22cpumask_t __cacheline_aligned pending_irq_cpumask[NR_IRQS];
23#endif
24
25/** 19/**
26 * synchronize_irq - wait for pending IRQ handlers (on other CPUs) 20 * synchronize_irq - wait for pending IRQ handlers (on other CPUs)
27 * @irq: interrupt number to wait for 21 * @irq: interrupt number to wait for
@@ -42,7 +36,6 @@ void synchronize_irq(unsigned int irq)
42 while (desc->status & IRQ_INPROGRESS) 36 while (desc->status & IRQ_INPROGRESS)
43 cpu_relax(); 37 cpu_relax();
44} 38}
45
46EXPORT_SYMBOL(synchronize_irq); 39EXPORT_SYMBOL(synchronize_irq);
47 40
48#endif 41#endif
@@ -60,7 +53,7 @@ EXPORT_SYMBOL(synchronize_irq);
60 */ 53 */
61void disable_irq_nosync(unsigned int irq) 54void disable_irq_nosync(unsigned int irq)
62{ 55{
63 irq_desc_t *desc = irq_desc + irq; 56 struct irq_desc *desc = irq_desc + irq;
64 unsigned long flags; 57 unsigned long flags;
65 58
66 if (irq >= NR_IRQS) 59 if (irq >= NR_IRQS)
@@ -69,11 +62,10 @@ void disable_irq_nosync(unsigned int irq)
69 spin_lock_irqsave(&desc->lock, flags); 62 spin_lock_irqsave(&desc->lock, flags);
70 if (!desc->depth++) { 63 if (!desc->depth++) {
71 desc->status |= IRQ_DISABLED; 64 desc->status |= IRQ_DISABLED;
72 desc->handler->disable(irq); 65 desc->chip->disable(irq);
73 } 66 }
74 spin_unlock_irqrestore(&desc->lock, flags); 67 spin_unlock_irqrestore(&desc->lock, flags);
75} 68}
76
77EXPORT_SYMBOL(disable_irq_nosync); 69EXPORT_SYMBOL(disable_irq_nosync);
78 70
79/** 71/**
@@ -90,7 +82,7 @@ EXPORT_SYMBOL(disable_irq_nosync);
90 */ 82 */
91void disable_irq(unsigned int irq) 83void disable_irq(unsigned int irq)
92{ 84{
93 irq_desc_t *desc = irq_desc + irq; 85 struct irq_desc *desc = irq_desc + irq;
94 86
95 if (irq >= NR_IRQS) 87 if (irq >= NR_IRQS)
96 return; 88 return;
@@ -99,7 +91,6 @@ void disable_irq(unsigned int irq)
99 if (desc->action) 91 if (desc->action)
100 synchronize_irq(irq); 92 synchronize_irq(irq);
101} 93}
102
103EXPORT_SYMBOL(disable_irq); 94EXPORT_SYMBOL(disable_irq);
104 95
105/** 96/**
@@ -114,7 +105,7 @@ EXPORT_SYMBOL(disable_irq);
114 */ 105 */
115void enable_irq(unsigned int irq) 106void enable_irq(unsigned int irq)
116{ 107{
117 irq_desc_t *desc = irq_desc + irq; 108 struct irq_desc *desc = irq_desc + irq;
118 unsigned long flags; 109 unsigned long flags;
119 110
120 if (irq >= NR_IRQS) 111 if (irq >= NR_IRQS)
@@ -123,17 +114,15 @@ void enable_irq(unsigned int irq)
123 spin_lock_irqsave(&desc->lock, flags); 114 spin_lock_irqsave(&desc->lock, flags);
124 switch (desc->depth) { 115 switch (desc->depth) {
125 case 0: 116 case 0:
117 printk(KERN_WARNING "Unbalanced enable for IRQ %d\n", irq);
126 WARN_ON(1); 118 WARN_ON(1);
127 break; 119 break;
128 case 1: { 120 case 1: {
129 unsigned int status = desc->status & ~IRQ_DISABLED; 121 unsigned int status = desc->status & ~IRQ_DISABLED;
130 122
131 desc->status = status; 123 /* Prevent probing on this irq: */
132 if ((status & (IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) { 124 desc->status = status | IRQ_NOPROBE;
133 desc->status = status | IRQ_REPLAY; 125 check_irq_resend(desc, irq);
134 hw_resend_irq(desc->handler,irq);
135 }
136 desc->handler->enable(irq);
137 /* fall-through */ 126 /* fall-through */
138 } 127 }
139 default: 128 default:
@@ -141,9 +130,53 @@ void enable_irq(unsigned int irq)
141 } 130 }
142 spin_unlock_irqrestore(&desc->lock, flags); 131 spin_unlock_irqrestore(&desc->lock, flags);
143} 132}
144
145EXPORT_SYMBOL(enable_irq); 133EXPORT_SYMBOL(enable_irq);
146 134
135/**
136 * set_irq_wake - control irq power management wakeup
137 * @irq: interrupt to control
138 * @on: enable/disable power management wakeup
139 *
140 * Enable/disable power management wakeup mode, which is
141 * disabled by default. Enables and disables must match,
142 * just as they match for non-wakeup mode support.
143 *
144 * Wakeup mode lets this IRQ wake the system from sleep
145 * states like "suspend to RAM".
146 */
147int set_irq_wake(unsigned int irq, unsigned int on)
148{
149 struct irq_desc *desc = irq_desc + irq;
150 unsigned long flags;
151 int ret = -ENXIO;
152 int (*set_wake)(unsigned, unsigned) = desc->chip->set_wake;
153
154 /* wakeup-capable irqs can be shared between drivers that
155 * don't need to have the same sleep mode behaviors.
156 */
157 spin_lock_irqsave(&desc->lock, flags);
158 if (on) {
159 if (desc->wake_depth++ == 0)
160 desc->status |= IRQ_WAKEUP;
161 else
162 set_wake = NULL;
163 } else {
164 if (desc->wake_depth == 0) {
165 printk(KERN_WARNING "Unbalanced IRQ %d "
166 "wake disable\n", irq);
167 WARN_ON(1);
168 } else if (--desc->wake_depth == 0)
169 desc->status &= ~IRQ_WAKEUP;
170 else
171 set_wake = NULL;
172 }
173 if (set_wake)
174 ret = desc->chip->set_wake(irq, on);
175 spin_unlock_irqrestore(&desc->lock, flags);
176 return ret;
177}
178EXPORT_SYMBOL(set_irq_wake);
179
147/* 180/*
148 * Internal function that tells the architecture code whether a 181 * Internal function that tells the architecture code whether a
149 * particular irq has been exclusively allocated or is available 182 * particular irq has been exclusively allocated or is available
@@ -153,22 +186,33 @@ int can_request_irq(unsigned int irq, unsigned long irqflags)
153{ 186{
154 struct irqaction *action; 187 struct irqaction *action;
155 188
156 if (irq >= NR_IRQS) 189 if (irq >= NR_IRQS || irq_desc[irq].status & IRQ_NOREQUEST)
157 return 0; 190 return 0;
158 191
159 action = irq_desc[irq].action; 192 action = irq_desc[irq].action;
160 if (action) 193 if (action)
161 if (irqflags & action->flags & SA_SHIRQ) 194 if (irqflags & action->flags & IRQF_SHARED)
162 action = NULL; 195 action = NULL;
163 196
164 return !action; 197 return !action;
165} 198}
166 199
200void compat_irq_chip_set_default_handler(struct irq_desc *desc)
201{
202 /*
203 * If the architecture still has not overridden
204 * the flow handler then zap the default. This
205 * should catch incorrect flow-type setting.
206 */
207 if (desc->handle_irq == &handle_bad_irq)
208 desc->handle_irq = NULL;
209}
210
167/* 211/*
168 * Internal function to register an irqaction - typically used to 212 * Internal function to register an irqaction - typically used to
169 * allocate special interrupts that are part of the architecture. 213 * allocate special interrupts that are part of the architecture.
170 */ 214 */
171int setup_irq(unsigned int irq, struct irqaction * new) 215int setup_irq(unsigned int irq, struct irqaction *new)
172{ 216{
173 struct irq_desc *desc = irq_desc + irq; 217 struct irq_desc *desc = irq_desc + irq;
174 struct irqaction *old, **p; 218 struct irqaction *old, **p;
@@ -178,14 +222,14 @@ int setup_irq(unsigned int irq, struct irqaction * new)
178 if (irq >= NR_IRQS) 222 if (irq >= NR_IRQS)
179 return -EINVAL; 223 return -EINVAL;
180 224
181 if (desc->handler == &no_irq_type) 225 if (desc->chip == &no_irq_chip)
182 return -ENOSYS; 226 return -ENOSYS;
183 /* 227 /*
184 * Some drivers like serial.c use request_irq() heavily, 228 * Some drivers like serial.c use request_irq() heavily,
185 * so we have to be careful not to interfere with a 229 * so we have to be careful not to interfere with a
186 * running system. 230 * running system.
187 */ 231 */
188 if (new->flags & SA_SAMPLE_RANDOM) { 232 if (new->flags & IRQF_SAMPLE_RANDOM) {
189 /* 233 /*
190 * This function might sleep, we want to call it first, 234 * This function might sleep, we want to call it first,
191 * outside of the atomic block. 235 * outside of the atomic block.
@@ -200,16 +244,24 @@ int setup_irq(unsigned int irq, struct irqaction * new)
200 /* 244 /*
201 * The following block of code has to be executed atomically 245 * The following block of code has to be executed atomically
202 */ 246 */
203 spin_lock_irqsave(&desc->lock,flags); 247 spin_lock_irqsave(&desc->lock, flags);
204 p = &desc->action; 248 p = &desc->action;
205 if ((old = *p) != NULL) { 249 old = *p;
206 /* Can't share interrupts unless both agree to */ 250 if (old) {
207 if (!(old->flags & new->flags & SA_SHIRQ)) 251 /*
252 * Can't share interrupts unless both agree to and are
253 * the same type (level, edge, polarity). So both flag
254 * fields must have IRQF_SHARED set and the bits which
255 * set the trigger type must match.
256 */
257 if (!((old->flags & new->flags) & IRQF_SHARED) ||
258 ((old->flags ^ new->flags) & IRQF_TRIGGER_MASK))
208 goto mismatch; 259 goto mismatch;
209 260
210#if defined(ARCH_HAS_IRQ_PER_CPU) && defined(SA_PERCPU_IRQ) 261#if defined(CONFIG_IRQ_PER_CPU)
211 /* All handlers must agree on per-cpuness */ 262 /* All handlers must agree on per-cpuness */
212 if ((old->flags & IRQ_PER_CPU) != (new->flags & IRQ_PER_CPU)) 263 if ((old->flags & IRQF_PERCPU) !=
264 (new->flags & IRQF_PERCPU))
213 goto mismatch; 265 goto mismatch;
214#endif 266#endif
215 267
@@ -222,20 +274,45 @@ int setup_irq(unsigned int irq, struct irqaction * new)
222 } 274 }
223 275
224 *p = new; 276 *p = new;
225#if defined(ARCH_HAS_IRQ_PER_CPU) && defined(SA_PERCPU_IRQ) 277#if defined(CONFIG_IRQ_PER_CPU)
226 if (new->flags & SA_PERCPU_IRQ) 278 if (new->flags & IRQF_PERCPU)
227 desc->status |= IRQ_PER_CPU; 279 desc->status |= IRQ_PER_CPU;
228#endif 280#endif
229 if (!shared) { 281 if (!shared) {
230 desc->depth = 0; 282 irq_chip_set_defaults(desc->chip);
231 desc->status &= ~(IRQ_DISABLED | IRQ_AUTODETECT | 283
232 IRQ_WAITING | IRQ_INPROGRESS); 284 /* Setup the type (level, edge polarity) if configured: */
233 if (desc->handler->startup) 285 if (new->flags & IRQF_TRIGGER_MASK) {
234 desc->handler->startup(irq); 286 if (desc->chip && desc->chip->set_type)
235 else 287 desc->chip->set_type(irq,
236 desc->handler->enable(irq); 288 new->flags & IRQF_TRIGGER_MASK);
289 else
290 /*
291 * IRQF_TRIGGER_* but the PIC does not support
292 * multiple flow-types?
293 */
294 printk(KERN_WARNING "No IRQF_TRIGGER set_type "
295 "function for IRQ %d (%s)\n", irq,
296 desc->chip ? desc->chip->name :
297 "unknown");
298 } else
299 compat_irq_chip_set_default_handler(desc);
300
301 desc->status &= ~(IRQ_AUTODETECT | IRQ_WAITING |
302 IRQ_INPROGRESS);
303
304 if (!(desc->status & IRQ_NOAUTOEN)) {
305 desc->depth = 0;
306 desc->status &= ~IRQ_DISABLED;
307 if (desc->chip->startup)
308 desc->chip->startup(irq);
309 else
310 desc->chip->enable(irq);
311 } else
312 /* Undo nested disables: */
313 desc->depth = 1;
237 } 314 }
238 spin_unlock_irqrestore(&desc->lock,flags); 315 spin_unlock_irqrestore(&desc->lock, flags);
239 316
240 new->irq = irq; 317 new->irq = irq;
241 register_irq_proc(irq); 318 register_irq_proc(irq);
@@ -246,8 +323,8 @@ int setup_irq(unsigned int irq, struct irqaction * new)
246 323
247mismatch: 324mismatch:
248 spin_unlock_irqrestore(&desc->lock, flags); 325 spin_unlock_irqrestore(&desc->lock, flags);
249 if (!(new->flags & SA_PROBEIRQ)) { 326 if (!(new->flags & IRQF_PROBE_SHARED)) {
250 printk(KERN_ERR "%s: irq handler mismatch\n", __FUNCTION__); 327 printk(KERN_ERR "IRQ handler type mismatch for IRQ %d\n", irq);
251 dump_stack(); 328 dump_stack();
252 } 329 }
253 return -EBUSY; 330 return -EBUSY;
@@ -278,10 +355,10 @@ void free_irq(unsigned int irq, void *dev_id)
278 return; 355 return;
279 356
280 desc = irq_desc + irq; 357 desc = irq_desc + irq;
281 spin_lock_irqsave(&desc->lock,flags); 358 spin_lock_irqsave(&desc->lock, flags);
282 p = &desc->action; 359 p = &desc->action;
283 for (;;) { 360 for (;;) {
284 struct irqaction * action = *p; 361 struct irqaction *action = *p;
285 362
286 if (action) { 363 if (action) {
287 struct irqaction **pp = p; 364 struct irqaction **pp = p;
@@ -295,18 +372,18 @@ void free_irq(unsigned int irq, void *dev_id)
295 372
296 /* Currently used only by UML, might disappear one day.*/ 373 /* Currently used only by UML, might disappear one day.*/
297#ifdef CONFIG_IRQ_RELEASE_METHOD 374#ifdef CONFIG_IRQ_RELEASE_METHOD
298 if (desc->handler->release) 375 if (desc->chip->release)
299 desc->handler->release(irq, dev_id); 376 desc->chip->release(irq, dev_id);
300#endif 377#endif
301 378
302 if (!desc->action) { 379 if (!desc->action) {
303 desc->status |= IRQ_DISABLED; 380 desc->status |= IRQ_DISABLED;
304 if (desc->handler->shutdown) 381 if (desc->chip->shutdown)
305 desc->handler->shutdown(irq); 382 desc->chip->shutdown(irq);
306 else 383 else
307 desc->handler->disable(irq); 384 desc->chip->disable(irq);
308 } 385 }
309 spin_unlock_irqrestore(&desc->lock,flags); 386 spin_unlock_irqrestore(&desc->lock, flags);
310 unregister_handler_proc(irq, action); 387 unregister_handler_proc(irq, action);
311 388
312 /* Make sure it's not being used on another CPU */ 389 /* Make sure it's not being used on another CPU */
@@ -314,12 +391,11 @@ void free_irq(unsigned int irq, void *dev_id)
314 kfree(action); 391 kfree(action);
315 return; 392 return;
316 } 393 }
317 printk(KERN_ERR "Trying to free free IRQ%d\n",irq); 394 printk(KERN_ERR "Trying to free already-free IRQ %d\n", irq);
318 spin_unlock_irqrestore(&desc->lock,flags); 395 spin_unlock_irqrestore(&desc->lock, flags);
319 return; 396 return;
320 } 397 }
321} 398}
322
323EXPORT_SYMBOL(free_irq); 399EXPORT_SYMBOL(free_irq);
324 400
325/** 401/**
@@ -346,28 +422,36 @@ EXPORT_SYMBOL(free_irq);
346 * 422 *
347 * Flags: 423 * Flags:
348 * 424 *
349 * SA_SHIRQ Interrupt is shared 425 * IRQF_SHARED Interrupt is shared
350 * SA_INTERRUPT Disable local interrupts while processing 426 * IRQF_DISABLED Disable local interrupts while processing
351 * SA_SAMPLE_RANDOM The interrupt can be used for entropy 427 * IRQF_SAMPLE_RANDOM The interrupt can be used for entropy
352 * 428 *
353 */ 429 */
354int request_irq(unsigned int irq, 430int request_irq(unsigned int irq,
355 irqreturn_t (*handler)(int, void *, struct pt_regs *), 431 irqreturn_t (*handler)(int, void *, struct pt_regs *),
356 unsigned long irqflags, const char * devname, void *dev_id) 432 unsigned long irqflags, const char *devname, void *dev_id)
357{ 433{
358 struct irqaction * action; 434 struct irqaction *action;
359 int retval; 435 int retval;
360 436
437#ifdef CONFIG_LOCKDEP
438 /*
439 * Lockdep wants atomic interrupt handlers:
440 */
441 irqflags |= SA_INTERRUPT;
442#endif
361 /* 443 /*
362 * Sanity-check: shared interrupts must pass in a real dev-ID, 444 * Sanity-check: shared interrupts must pass in a real dev-ID,
363 * otherwise we'll have trouble later trying to figure out 445 * otherwise we'll have trouble later trying to figure out
364 * which interrupt is which (messes up the interrupt freeing 446 * which interrupt is which (messes up the interrupt freeing
365 * logic etc). 447 * logic etc).
366 */ 448 */
367 if ((irqflags & SA_SHIRQ) && !dev_id) 449 if ((irqflags & IRQF_SHARED) && !dev_id)
368 return -EINVAL; 450 return -EINVAL;
369 if (irq >= NR_IRQS) 451 if (irq >= NR_IRQS)
370 return -EINVAL; 452 return -EINVAL;
453 if (irq_desc[irq].status & IRQ_NOREQUEST)
454 return -EINVAL;
371 if (!handler) 455 if (!handler)
372 return -EINVAL; 456 return -EINVAL;
373 457
@@ -390,6 +474,5 @@ int request_irq(unsigned int irq,
390 474
391 return retval; 475 return retval;
392} 476}
393
394EXPORT_SYMBOL(request_irq); 477EXPORT_SYMBOL(request_irq);
395 478
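
Editorial aside, not part of the patch: to make the IRQF_* flags, sharing rules and set_irq_wake() interface shown above concrete, here is a hedged driver-style sketch against the API this series introduces. The IRQ number, device structure and handler body are hypothetical; the three-argument handler signature matches the request_irq() prototype in the hunk above.

#include <linux/interrupt.h>
#include <linux/irq.h>

#define MYDEV_IRQ 17				/* hypothetical IRQ line */

struct mydev { int dummy; };			/* hypothetical device state */
static struct mydev mydev;

static irqreturn_t mydev_interrupt(int irq, void *dev_id, struct pt_regs *regs)
{
	/* A real driver would check its hardware status register here. */
	return IRQ_HANDLED;
}

static int mydev_setup_irq(void)
{
	int err;

	/* Shared, rising-edge triggered; dev_id is mandatory for IRQF_SHARED,
	 * and all sharers must agree on the IRQF_TRIGGER_* bits (see the
	 * setup_irq() mismatch check above). */
	err = request_irq(MYDEV_IRQ, mydev_interrupt,
			  IRQF_SHARED | IRQF_TRIGGER_RISING, "mydev", &mydev);
	if (err)
		return err;

	/* Allow this line to wake the system from suspend; enables and
	 * disables are refcounted via desc->wake_depth as shown above. */
	set_irq_wake(MYDEV_IRQ, 1);
	return 0;
}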
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index a12d00eb5e7c..a57ebe9fa6f6 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -3,19 +3,19 @@
3 3
4void set_pending_irq(unsigned int irq, cpumask_t mask) 4void set_pending_irq(unsigned int irq, cpumask_t mask)
5{ 5{
6 irq_desc_t *desc = irq_desc + irq; 6 struct irq_desc *desc = irq_desc + irq;
7 unsigned long flags; 7 unsigned long flags;
8 8
9 spin_lock_irqsave(&desc->lock, flags); 9 spin_lock_irqsave(&desc->lock, flags);
10 desc->move_irq = 1; 10 desc->move_irq = 1;
11 pending_irq_cpumask[irq] = mask; 11 irq_desc[irq].pending_mask = mask;
12 spin_unlock_irqrestore(&desc->lock, flags); 12 spin_unlock_irqrestore(&desc->lock, flags);
13} 13}
14 14
15void move_native_irq(int irq) 15void move_native_irq(int irq)
16{ 16{
17 struct irq_desc *desc = irq_desc + irq;
17 cpumask_t tmp; 18 cpumask_t tmp;
18 irq_desc_t *desc = irq_descp(irq);
19 19
20 if (likely(!desc->move_irq)) 20 if (likely(!desc->move_irq))
21 return; 21 return;
@@ -30,15 +30,15 @@ void move_native_irq(int irq)
30 30
31 desc->move_irq = 0; 31 desc->move_irq = 0;
32 32
33 if (unlikely(cpus_empty(pending_irq_cpumask[irq]))) 33 if (unlikely(cpus_empty(irq_desc[irq].pending_mask)))
34 return; 34 return;
35 35
36 if (!desc->handler->set_affinity) 36 if (!desc->chip->set_affinity)
37 return; 37 return;
38 38
39 assert_spin_locked(&desc->lock); 39 assert_spin_locked(&desc->lock);
40 40
41 cpus_and(tmp, pending_irq_cpumask[irq], cpu_online_map); 41 cpus_and(tmp, irq_desc[irq].pending_mask, cpu_online_map);
42 42
43 /* 43 /*
44 * If there was a valid mask to work with, please 44 * If there was a valid mask to work with, please
@@ -51,12 +51,12 @@ void move_native_irq(int irq)
51 */ 51 */
52 if (likely(!cpus_empty(tmp))) { 52 if (likely(!cpus_empty(tmp))) {
53 if (likely(!(desc->status & IRQ_DISABLED))) 53 if (likely(!(desc->status & IRQ_DISABLED)))
54 desc->handler->disable(irq); 54 desc->chip->disable(irq);
55 55
56 desc->handler->set_affinity(irq,tmp); 56 desc->chip->set_affinity(irq,tmp);
57 57
58 if (likely(!(desc->status & IRQ_DISABLED))) 58 if (likely(!(desc->status & IRQ_DISABLED)))
59 desc->handler->enable(irq); 59 desc->chip->enable(irq);
60 } 60 }
61 cpus_clear(pending_irq_cpumask[irq]); 61 cpus_clear(irq_desc[irq].pending_mask);
62} 62}
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index afacd6f585fa..607c7809ad01 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -12,15 +12,10 @@
12 12
13#include "internals.h" 13#include "internals.h"
14 14
15static struct proc_dir_entry *root_irq_dir, *irq_dir[NR_IRQS]; 15static struct proc_dir_entry *root_irq_dir;
16 16
17#ifdef CONFIG_SMP 17#ifdef CONFIG_SMP
18 18
19/*
20 * The /proc/irq/<irq>/smp_affinity values:
21 */
22static struct proc_dir_entry *smp_affinity_entry[NR_IRQS];
23
24#ifdef CONFIG_GENERIC_PENDING_IRQ 19#ifdef CONFIG_GENERIC_PENDING_IRQ
25void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val) 20void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val)
26{ 21{
@@ -36,15 +31,15 @@ void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val)
36void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val) 31void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val)
37{ 32{
38 set_balance_irq_affinity(irq, mask_val); 33 set_balance_irq_affinity(irq, mask_val);
39 irq_affinity[irq] = mask_val; 34 irq_desc[irq].affinity = mask_val;
40 irq_desc[irq].handler->set_affinity(irq, mask_val); 35 irq_desc[irq].chip->set_affinity(irq, mask_val);
41} 36}
42#endif 37#endif
43 38
44static int irq_affinity_read_proc(char *page, char **start, off_t off, 39static int irq_affinity_read_proc(char *page, char **start, off_t off,
45 int count, int *eof, void *data) 40 int count, int *eof, void *data)
46{ 41{
47 int len = cpumask_scnprintf(page, count, irq_affinity[(long)data]); 42 int len = cpumask_scnprintf(page, count, irq_desc[(long)data].affinity);
48 43
49 if (count - len < 2) 44 if (count - len < 2)
50 return -EINVAL; 45 return -EINVAL;
@@ -59,7 +54,7 @@ static int irq_affinity_write_proc(struct file *file, const char __user *buffer,
59 unsigned int irq = (int)(long)data, full_count = count, err; 54 unsigned int irq = (int)(long)data, full_count = count, err;
60 cpumask_t new_value, tmp; 55 cpumask_t new_value, tmp;
61 56
62 if (!irq_desc[irq].handler->set_affinity || no_irq_affinity) 57 if (!irq_desc[irq].chip->set_affinity || no_irq_affinity)
63 return -EIO; 58 return -EIO;
64 59
65 err = cpumask_parse(buffer, count, new_value); 60 err = cpumask_parse(buffer, count, new_value);
@@ -102,7 +97,7 @@ void register_handler_proc(unsigned int irq, struct irqaction *action)
102{ 97{
103 char name [MAX_NAMELEN]; 98 char name [MAX_NAMELEN];
104 99
105 if (!irq_dir[irq] || action->dir || !action->name || 100 if (!irq_desc[irq].dir || action->dir || !action->name ||
106 !name_unique(irq, action)) 101 !name_unique(irq, action))
107 return; 102 return;
108 103
@@ -110,7 +105,7 @@ void register_handler_proc(unsigned int irq, struct irqaction *action)
110 snprintf(name, MAX_NAMELEN, "%s", action->name); 105 snprintf(name, MAX_NAMELEN, "%s", action->name);
111 106
112 /* create /proc/irq/1234/handler/ */ 107 /* create /proc/irq/1234/handler/ */
113 action->dir = proc_mkdir(name, irq_dir[irq]); 108 action->dir = proc_mkdir(name, irq_desc[irq].dir);
114} 109}
115 110
116#undef MAX_NAMELEN 111#undef MAX_NAMELEN
@@ -122,22 +117,22 @@ void register_irq_proc(unsigned int irq)
122 char name [MAX_NAMELEN]; 117 char name [MAX_NAMELEN];
123 118
124 if (!root_irq_dir || 119 if (!root_irq_dir ||
125 (irq_desc[irq].handler == &no_irq_type) || 120 (irq_desc[irq].chip == &no_irq_chip) ||
126 irq_dir[irq]) 121 irq_desc[irq].dir)
127 return; 122 return;
128 123
129 memset(name, 0, MAX_NAMELEN); 124 memset(name, 0, MAX_NAMELEN);
130 sprintf(name, "%d", irq); 125 sprintf(name, "%d", irq);
131 126
132 /* create /proc/irq/1234 */ 127 /* create /proc/irq/1234 */
133 irq_dir[irq] = proc_mkdir(name, root_irq_dir); 128 irq_desc[irq].dir = proc_mkdir(name, root_irq_dir);
134 129
135#ifdef CONFIG_SMP 130#ifdef CONFIG_SMP
136 { 131 {
137 struct proc_dir_entry *entry; 132 struct proc_dir_entry *entry;
138 133
139 /* create /proc/irq/<irq>/smp_affinity */ 134 /* create /proc/irq/<irq>/smp_affinity */
140 entry = create_proc_entry("smp_affinity", 0600, irq_dir[irq]); 135 entry = create_proc_entry("smp_affinity", 0600, irq_desc[irq].dir);
141 136
142 if (entry) { 137 if (entry) {
143 entry->nlink = 1; 138 entry->nlink = 1;
@@ -145,7 +140,6 @@ void register_irq_proc(unsigned int irq)
145 entry->read_proc = irq_affinity_read_proc; 140 entry->read_proc = irq_affinity_read_proc;
146 entry->write_proc = irq_affinity_write_proc; 141 entry->write_proc = irq_affinity_write_proc;
147 } 142 }
148 smp_affinity_entry[irq] = entry;
149 } 143 }
150#endif 144#endif
151} 145}
@@ -155,7 +149,7 @@ void register_irq_proc(unsigned int irq)
155void unregister_handler_proc(unsigned int irq, struct irqaction *action) 149void unregister_handler_proc(unsigned int irq, struct irqaction *action)
156{ 150{
157 if (action->dir) 151 if (action->dir)
158 remove_proc_entry(action->dir->name, irq_dir[irq]); 152 remove_proc_entry(action->dir->name, irq_desc[irq].dir);
159} 153}
160 154
161void init_irq_proc(void) 155void init_irq_proc(void)
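
Editorial aside, not part of the patch: the proc entries created above are consumed from userspace. A small hedged userspace sketch (the IRQ number and CPU mask are hypothetical) that restricts an interrupt to CPUs 0-3 by writing a hex mask to /proc/irq/<irq>/smp_affinity:

#include <stdio.h>

int main(void)
{
	/* Hypothetical: restrict IRQ 19 to CPUs 0-3 (mask 0xf). */
	FILE *f = fopen("/proc/irq/19/smp_affinity", "w");

	if (!f) {
		perror("smp_affinity");
		return 1;
	}
	fprintf(f, "f\n");
	fclose(f);
	return 0;
}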
diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c
new file mode 100644
index 000000000000..35f10f7ff94a
--- /dev/null
+++ b/kernel/irq/resend.c
@@ -0,0 +1,77 @@
1/*
2 * linux/kernel/irq/resend.c
3 *
4 * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar
5 * Copyright (C) 2005-2006, Thomas Gleixner
6 *
7 * This file contains the IRQ-resend code
8 *
9 * If the interrupt is waiting to be processed, we try to re-run it.
10 * We can't directly run it from here since the caller might be in an
11 * interrupt-protected region. Not all irq controller chips can
12 * retrigger interrupts at the hardware level, so in those cases
13 * we allow the resending of IRQs via a tasklet.
14 */
15
16#include <linux/irq.h>
17#include <linux/module.h>
18#include <linux/random.h>
19#include <linux/interrupt.h>
20
21#include "internals.h"
22
23#ifdef CONFIG_HARDIRQS_SW_RESEND
24
25/* Bitmap to handle software resend of interrupts: */
26static DECLARE_BITMAP(irqs_resend, NR_IRQS);
27
28/*
29 * Run software resends of IRQ's
30 */
31static void resend_irqs(unsigned long arg)
32{
33 struct irq_desc *desc;
34 int irq;
35
36 while (!bitmap_empty(irqs_resend, NR_IRQS)) {
37 irq = find_first_bit(irqs_resend, NR_IRQS);
38 clear_bit(irq, irqs_resend);
39 desc = irq_desc + irq;
40 local_irq_disable();
41 desc->handle_irq(irq, desc, NULL);
42 local_irq_enable();
43 }
44}
45
46/* Tasklet to handle resend: */
47static DECLARE_TASKLET(resend_tasklet, resend_irqs, 0);
48
49#endif
50
51/*
52 * IRQ resend
53 *
54 * Is called with interrupts disabled and desc->lock held.
55 */
56void check_irq_resend(struct irq_desc *desc, unsigned int irq)
57{
58 unsigned int status = desc->status;
59
60 /*
61 * Make sure the interrupt is enabled, before resending it:
62 */
63 desc->chip->enable(irq);
64
65 if ((status & (IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) {
66 desc->status = (status & ~IRQ_PENDING) | IRQ_REPLAY;
67
68 if (!desc->chip || !desc->chip->retrigger ||
69 !desc->chip->retrigger(irq)) {
70#ifdef CONFIG_HARDIRQS_SW_RESEND
71 /* Set it pending and activate the softirq: */
72 set_bit(irq, irqs_resend);
73 tasklet_schedule(&resend_tasklet);
74#endif
75 }
76 }
77}
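
Editorial aside, not part of the patch: the software-resend path above defers work to a tasklet because it can be entered with interrupts disabled and desc->lock held. The same bitmap-plus-tasklet deferral pattern in isolation looks roughly like this (the names and event count are hypothetical):

#include <linux/interrupt.h>
#include <linux/bitmap.h>
#include <linux/bitops.h>

#define MY_NR_EVENTS 32				/* hypothetical */

static DECLARE_BITMAP(my_pending, MY_NR_EVENTS);

static void my_tasklet_fn(unsigned long data)
{
	int bit;

	/* Drain every bit that was set while we could not run directly. */
	while (!bitmap_empty(my_pending, MY_NR_EVENTS)) {
		bit = find_first_bit(my_pending, MY_NR_EVENTS);
		clear_bit(bit, my_pending);
		/* ... process event 'bit' here ... */
	}
}

static DECLARE_TASKLET(my_tasklet, my_tasklet_fn, 0);

/* Called from a context that cannot do the work immediately: */
static void my_mark_pending(int bit)
{
	set_bit(bit, my_pending);
	tasklet_schedule(&my_tasklet);
}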
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index b2fb3c18d06b..417e98092cf2 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -16,39 +16,39 @@ static int irqfixup __read_mostly;
16/* 16/*
17 * Recovery handler for misrouted interrupts. 17 * Recovery handler for misrouted interrupts.
18 */ 18 */
19
20static int misrouted_irq(int irq, struct pt_regs *regs) 19static int misrouted_irq(int irq, struct pt_regs *regs)
21{ 20{
22 int i; 21 int i;
23 irq_desc_t *desc;
24 int ok = 0; 22 int ok = 0;
25 int work = 0; /* Did we do work for a real IRQ */ 23 int work = 0; /* Did we do work for a real IRQ */
26 24
27 for(i = 1; i < NR_IRQS; i++) { 25 for (i = 1; i < NR_IRQS; i++) {
26 struct irq_desc *desc = irq_desc + i;
28 struct irqaction *action; 27 struct irqaction *action;
29 28
30 if (i == irq) /* Already tried */ 29 if (i == irq) /* Already tried */
31 continue; 30 continue;
32 desc = &irq_desc[i]; 31
33 spin_lock(&desc->lock); 32 spin_lock(&desc->lock);
34 action = desc->action;
35 /* Already running on another processor */ 33 /* Already running on another processor */
36 if (desc->status & IRQ_INPROGRESS) { 34 if (desc->status & IRQ_INPROGRESS) {
37 /* 35 /*
38 * Already running: If it is shared get the other 36 * Already running: If it is shared get the other
39 * CPU to go looking for our mystery interrupt too 37 * CPU to go looking for our mystery interrupt too
40 */ 38 */
41 if (desc->action && (desc->action->flags & SA_SHIRQ)) 39 if (desc->action && (desc->action->flags & IRQF_SHARED))
42 desc->status |= IRQ_PENDING; 40 desc->status |= IRQ_PENDING;
43 spin_unlock(&desc->lock); 41 spin_unlock(&desc->lock);
44 continue; 42 continue;
45 } 43 }
46 /* Honour the normal IRQ locking */ 44 /* Honour the normal IRQ locking */
47 desc->status |= IRQ_INPROGRESS; 45 desc->status |= IRQ_INPROGRESS;
46 action = desc->action;
48 spin_unlock(&desc->lock); 47 spin_unlock(&desc->lock);
48
49 while (action) { 49 while (action) {
50 /* Only shared IRQ handlers are safe to call */ 50 /* Only shared IRQ handlers are safe to call */
51 if (action->flags & SA_SHIRQ) { 51 if (action->flags & IRQF_SHARED) {
52 if (action->handler(i, action->dev_id, regs) == 52 if (action->handler(i, action->dev_id, regs) ==
53 IRQ_HANDLED) 53 IRQ_HANDLED)
54 ok = 1; 54 ok = 1;
@@ -62,9 +62,8 @@ static int misrouted_irq(int irq, struct pt_regs *regs)
62 62
63 /* 63 /*
64 * While we were looking for a fixup someone queued a real 64 * While we were looking for a fixup someone queued a real
65 * IRQ clashing with our walk 65 * IRQ clashing with our walk:
66 */ 66 */
67
68 while ((desc->status & IRQ_PENDING) && action) { 67 while ((desc->status & IRQ_PENDING) && action) {
69 /* 68 /*
70 * Perform real IRQ processing for the IRQ we deferred 69 * Perform real IRQ processing for the IRQ we deferred
@@ -80,8 +79,8 @@ static int misrouted_irq(int irq, struct pt_regs *regs)
80 * If we did actual work for the real IRQ line we must let the 79 * If we did actual work for the real IRQ line we must let the
81 * IRQ controller clean up too 80 * IRQ controller clean up too
82 */ 81 */
83 if(work) 82 if (work && desc->chip && desc->chip->end)
84 desc->handler->end(i); 83 desc->chip->end(i);
85 spin_unlock(&desc->lock); 84 spin_unlock(&desc->lock);
86 } 85 }
87 /* So the caller can adjust the irq error counts */ 86 /* So the caller can adjust the irq error counts */
@@ -100,7 +99,8 @@ static int misrouted_irq(int irq, struct pt_regs *regs)
100 */ 99 */
101 100
102static void 101static void
103__report_bad_irq(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret) 102__report_bad_irq(unsigned int irq, struct irq_desc *desc,
103 irqreturn_t action_ret)
104{ 104{
105 struct irqaction *action; 105 struct irqaction *action;
106 106
@@ -113,6 +113,7 @@ __report_bad_irq(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret)
113 } 113 }
114 dump_stack(); 114 dump_stack();
115 printk(KERN_ERR "handlers:\n"); 115 printk(KERN_ERR "handlers:\n");
116
116 action = desc->action; 117 action = desc->action;
117 while (action) { 118 while (action) {
118 printk(KERN_ERR "[<%p>]", action->handler); 119 printk(KERN_ERR "[<%p>]", action->handler);
@@ -123,7 +124,8 @@ __report_bad_irq(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret)
123 } 124 }
124} 125}
125 126
126static void report_bad_irq(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret) 127static void
128report_bad_irq(unsigned int irq, struct irq_desc *desc, irqreturn_t action_ret)
127{ 129{
128 static int count = 100; 130 static int count = 100;
129 131
@@ -133,8 +135,8 @@ static void report_bad_irq(unsigned int irq, irq_desc_t *desc, irqreturn_t actio
133 } 135 }
134} 136}
135 137
136void note_interrupt(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret, 138void note_interrupt(unsigned int irq, struct irq_desc *desc,
137 struct pt_regs *regs) 139 irqreturn_t action_ret, struct pt_regs *regs)
138{ 140{
139 if (unlikely(action_ret != IRQ_HANDLED)) { 141 if (unlikely(action_ret != IRQ_HANDLED)) {
140 desc->irqs_unhandled++; 142 desc->irqs_unhandled++;
@@ -166,7 +168,8 @@ void note_interrupt(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret,
166 */ 168 */
167 printk(KERN_EMERG "Disabling IRQ #%d\n", irq); 169 printk(KERN_EMERG "Disabling IRQ #%d\n", irq);
168 desc->status |= IRQ_DISABLED; 170 desc->status |= IRQ_DISABLED;
169 desc->handler->disable(irq); 171 desc->depth = 1;
172 desc->chip->disable(irq);
170 } 173 }
171 desc->irqs_unhandled = 0; 174 desc->irqs_unhandled = 0;
172} 175}
@@ -177,6 +180,7 @@ int __init noirqdebug_setup(char *str)
177{ 180{
178 noirqdebug = 1; 181 noirqdebug = 1;
179 printk(KERN_INFO "IRQ lockup detection disabled\n"); 182 printk(KERN_INFO "IRQ lockup detection disabled\n");
183
180 return 1; 184 return 1;
181} 185}
182 186
@@ -187,6 +191,7 @@ static int __init irqfixup_setup(char *str)
187 irqfixup = 1; 191 irqfixup = 1;
188 printk(KERN_WARNING "Misrouted IRQ fixup support enabled.\n"); 192 printk(KERN_WARNING "Misrouted IRQ fixup support enabled.\n");
189 printk(KERN_WARNING "This may impact system performance.\n"); 193 printk(KERN_WARNING "This may impact system performance.\n");
194
190 return 1; 195 return 1;
191} 196}
192 197
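
Editorial aside, not part of the patch: note_interrupt() above can only tell handled from unhandled interrupts if handlers report honestly. A hedged sketch of the expected contract for a shared line, with a purely hypothetical "pending" flag standing in for a hardware status read:

#include <linux/interrupt.h>

struct mydev {
	int raised;				/* hypothetical "our IRQ is pending" flag */
};

static irqreturn_t mydev_interrupt(int irq, void *dev_id, struct pt_regs *regs)
{
	struct mydev *dev = dev_id;

	/* Shared line: if our device did not raise it, say so, otherwise
	 * the spurious-IRQ accounting above cannot spot real lockups. */
	if (!dev->raised)
		return IRQ_NONE;

	dev->raised = 0;
	return IRQ_HANDLED;
}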
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 39277dd6bf90..ab16a5a4cfe9 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -275,8 +275,8 @@ static void upcase_if_global(struct kallsym_iter *iter)
275static int get_ksymbol_mod(struct kallsym_iter *iter) 275static int get_ksymbol_mod(struct kallsym_iter *iter)
276{ 276{
277 iter->owner = module_get_kallsym(iter->pos - kallsyms_num_syms, 277 iter->owner = module_get_kallsym(iter->pos - kallsyms_num_syms,
278 &iter->value, 278 &iter->value, &iter->type,
279 &iter->type, iter->name); 279 iter->name, sizeof(iter->name));
280 if (iter->owner == NULL) 280 if (iter->owner == NULL)
281 return 0; 281 return 0;
282 282
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 58f0f382597c..fcdd5d2bc3f4 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -40,7 +40,7 @@ struct resource crashk_res = {
40 40
41int kexec_should_crash(struct task_struct *p) 41int kexec_should_crash(struct task_struct *p)
42{ 42{
43 if (in_interrupt() || !p->pid || p->pid == 1 || panic_on_oops) 43 if (in_interrupt() || !p->pid || is_init(p) || panic_on_oops)
44 return 1; 44 return 1;
45 return 0; 45 return 0;
46} 46}
@@ -995,7 +995,8 @@ asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments,
995 image = xchg(dest_image, image); 995 image = xchg(dest_image, image);
996 996
997out: 997out:
998 xchg(&kexec_lock, 0); /* Release the mutex */ 998 locked = xchg(&kexec_lock, 0); /* Release the mutex */
999 BUG_ON(!locked);
999 kimage_free(image); 1000 kimage_free(image);
1000 1001
1001 return result; 1002 return result;
@@ -1042,7 +1043,6 @@ asmlinkage long compat_sys_kexec_load(unsigned long entry,
1042 1043
1043void crash_kexec(struct pt_regs *regs) 1044void crash_kexec(struct pt_regs *regs)
1044{ 1045{
1045 struct kimage *image;
1046 int locked; 1046 int locked;
1047 1047
1048 1048
@@ -1056,14 +1056,14 @@ void crash_kexec(struct pt_regs *regs)
1056 */ 1056 */
1057 locked = xchg(&kexec_lock, 1); 1057 locked = xchg(&kexec_lock, 1);
1058 if (!locked) { 1058 if (!locked) {
1059 image = xchg(&kexec_crash_image, NULL); 1059 if (kexec_crash_image) {
1060 if (image) {
1061 struct pt_regs fixed_regs; 1060 struct pt_regs fixed_regs;
1062 crash_setup_regs(&fixed_regs, regs); 1061 crash_setup_regs(&fixed_regs, regs);
1063 machine_crash_shutdown(&fixed_regs); 1062 machine_crash_shutdown(&fixed_regs);
1064 machine_kexec(image); 1063 machine_kexec(kexec_crash_image);
1065 } 1064 }
1066 xchg(&kexec_lock, 0); 1065 locked = xchg(&kexec_lock, 0);
1066 BUG_ON(!locked);
1067 } 1067 }
1068} 1068}
1069 1069
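
Editorial aside, not part of the patch: the kexec hunks above tighten the xchg()-based kexec_lock by asserting that the unlock really released a held lock. The same trylock/unlock pattern in isolation, sketched with hypothetical names (xchg() lives in asm/system.h on this era's tree):

#include <linux/kernel.h>
#include <asm/system.h>

static int my_lock;				/* 0 = free, 1 = held (hypothetical) */

static int my_trylock(void)
{
	/* Atomically swap in "held"; we got the lock iff it was free. */
	return xchg(&my_lock, 1) == 0;
}

static void my_unlock(void)
{
	/* Swap "free" back in and sanity-check that it was actually held,
	 * the same check the patch adds around kexec_lock above. */
	if (xchg(&my_lock, 0) == 0)
		printk(KERN_WARNING "my_unlock: lock was not held!\n");
}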
diff --git a/kernel/kfifo.c b/kernel/kfifo.c
index 64ab045c3d9d..5d1d907378a2 100644
--- a/kernel/kfifo.c
+++ b/kernel/kfifo.c
@@ -122,6 +122,13 @@ unsigned int __kfifo_put(struct kfifo *fifo,
122 122
123 len = min(len, fifo->size - fifo->in + fifo->out); 123 len = min(len, fifo->size - fifo->in + fifo->out);
124 124
125 /*
126 * Ensure that we sample the fifo->out index -before- we
127 * start putting bytes into the kfifo.
128 */
129
130 smp_mb();
131
125 /* first put the data starting from fifo->in to buffer end */ 132 /* first put the data starting from fifo->in to buffer end */
126 l = min(len, fifo->size - (fifo->in & (fifo->size - 1))); 133 l = min(len, fifo->size - (fifo->in & (fifo->size - 1)));
127 memcpy(fifo->buffer + (fifo->in & (fifo->size - 1)), buffer, l); 134 memcpy(fifo->buffer + (fifo->in & (fifo->size - 1)), buffer, l);
@@ -129,6 +136,13 @@ unsigned int __kfifo_put(struct kfifo *fifo,
129 /* then put the rest (if any) at the beginning of the buffer */ 136 /* then put the rest (if any) at the beginning of the buffer */
130 memcpy(fifo->buffer, buffer + l, len - l); 137 memcpy(fifo->buffer, buffer + l, len - l);
131 138
139 /*
140 * Ensure that we add the bytes to the kfifo -before-
141 * we update the fifo->in index.
142 */
143
144 smp_wmb();
145
132 fifo->in += len; 146 fifo->in += len;
133 147
134 return len; 148 return len;
@@ -154,6 +168,13 @@ unsigned int __kfifo_get(struct kfifo *fifo,
154 168
155 len = min(len, fifo->in - fifo->out); 169 len = min(len, fifo->in - fifo->out);
156 170
171 /*
172 * Ensure that we sample the fifo->in index -before- we
173 * start removing bytes from the kfifo.
174 */
175
176 smp_rmb();
177
157 /* first get the data from fifo->out until the end of the buffer */ 178 /* first get the data from fifo->out until the end of the buffer */
158 l = min(len, fifo->size - (fifo->out & (fifo->size - 1))); 179 l = min(len, fifo->size - (fifo->out & (fifo->size - 1)));
159 memcpy(buffer, fifo->buffer + (fifo->out & (fifo->size - 1)), l); 180 memcpy(buffer, fifo->buffer + (fifo->out & (fifo->size - 1)), l);
@@ -161,6 +182,13 @@ unsigned int __kfifo_get(struct kfifo *fifo,
161 /* then get the rest (if any) from the beginning of the buffer */ 182 /* then get the rest (if any) from the beginning of the buffer */
162 memcpy(buffer + l, fifo->buffer, len - l); 183 memcpy(buffer + l, fifo->buffer, len - l);
163 184
185 /*
186 * Ensure that we remove the bytes from the kfifo -before-
187 * we update the fifo->out index.
188 */
189
190 smp_mb();
191
164 fifo->out += len; 192 fifo->out += len;
165 193
166 return len; 194 return len;
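
Editorial aside, not part of the patch: the barriers added above implement the classic single-producer/single-consumer rule: publish the data before the index that makes it visible, and read the index before the data it covers. A stripped-down sketch of that rule with a hypothetical byte ring (size and names invented; the smp_*mb() primitives come from asm/system.h on this era's tree):

#include <linux/kernel.h>
#include <asm/system.h>

#define RING_SIZE 256				/* hypothetical, power of two */

static unsigned int ring_in, ring_out;		/* free-running indices */
static unsigned char ring[RING_SIZE];

/* Producer side (one writer only): */
static int ring_put(unsigned char c)
{
	if (ring_in - ring_out == RING_SIZE)
		return 0;			/* full */
	ring[ring_in & (RING_SIZE - 1)] = c;
	smp_wmb();				/* data before index, as in __kfifo_put() */
	ring_in++;
	return 1;
}

/* Consumer side (one reader only): */
static int ring_get(unsigned char *c)
{
	if (ring_in == ring_out)
		return 0;			/* empty */
	smp_rmb();				/* index before data, as in __kfifo_get() */
	*c = ring[ring_out & (RING_SIZE - 1)];
	smp_mb();				/* data before index update */
	ring_out++;
	return 1;
}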
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 20a997c73c3d..842f8015d7fd 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -20,7 +20,6 @@
20*/ 20*/
21#define __KERNEL_SYSCALLS__ 21#define __KERNEL_SYSCALLS__
22 22
23#include <linux/config.h>
24#include <linux/module.h> 23#include <linux/module.h>
25#include <linux/sched.h> 24#include <linux/sched.h>
26#include <linux/syscalls.h> 25#include <linux/syscalls.h>
@@ -177,6 +176,8 @@ static int wait_for_helper(void *data)
177 if (pid < 0) { 176 if (pid < 0) {
178 sub_info->retval = pid; 177 sub_info->retval = pid;
179 } else { 178 } else {
179 int ret;
180
180 /* 181 /*
181 * Normally it is bogus to call wait4() from in-kernel because 182 * Normally it is bogus to call wait4() from in-kernel because
182 * wait4() wants to write the exit code to a userspace address. 183 * wait4() wants to write the exit code to a userspace address.
@@ -186,7 +187,15 @@ static int wait_for_helper(void *data)
186 * 187 *
187 * Thus the __user pointer cast is valid here. 188 * Thus the __user pointer cast is valid here.
188 */ 189 */
189 sys_wait4(pid, (int __user *) &sub_info->retval, 0, NULL); 190 sys_wait4(pid, (int __user *)&ret, 0, NULL);
191
192 /*
193 * If ret is 0, either ____call_usermodehelper failed and the
194 * real error code is already in sub_info->retval or
195 * sub_info->retval is 0 anyway, so don't mess with it then.
196 */
197 if (ret)
198 sub_info->retval = ret;
190 } 199 }
191 200
192 complete(sub_info->complete); 201 complete(sub_info->complete);
@@ -198,11 +207,12 @@ static void __call_usermodehelper(void *data)
198{ 207{
199 struct subprocess_info *sub_info = data; 208 struct subprocess_info *sub_info = data;
200 pid_t pid; 209 pid_t pid;
210 int wait = sub_info->wait;
201 211
202 /* CLONE_VFORK: wait until the usermode helper has execve'd 212 /* CLONE_VFORK: wait until the usermode helper has execve'd
203 * successfully We need the data structures to stay around 213 * successfully We need the data structures to stay around
204 * until that is done. */ 214 * until that is done. */
205 if (sub_info->wait) 215 if (wait)
206 pid = kernel_thread(wait_for_helper, sub_info, 216 pid = kernel_thread(wait_for_helper, sub_info,
207 CLONE_FS | CLONE_FILES | SIGCHLD); 217 CLONE_FS | CLONE_FILES | SIGCHLD);
208 else 218 else
@@ -212,7 +222,7 @@ static void __call_usermodehelper(void *data)
212 if (pid < 0) { 222 if (pid < 0) {
213 sub_info->retval = pid; 223 sub_info->retval = pid;
214 complete(sub_info->complete); 224 complete(sub_info->complete);
215 } else if (!sub_info->wait) 225 } else if (!wait)
216 complete(sub_info->complete); 226 complete(sub_info->complete);
217} 227}
218 228
@@ -234,7 +244,7 @@ static void __call_usermodehelper(void *data)
234int call_usermodehelper_keys(char *path, char **argv, char **envp, 244int call_usermodehelper_keys(char *path, char **argv, char **envp,
235 struct key *session_keyring, int wait) 245 struct key *session_keyring, int wait)
236{ 246{
237 DECLARE_COMPLETION(done); 247 DECLARE_COMPLETION_ONSTACK(done);
238 struct subprocess_info sub_info = { 248 struct subprocess_info sub_info = {
239 .complete = &done, 249 .complete = &done,
240 .path = path, 250 .path = path,
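
Editorial aside, not part of the patch: the wait_for_helper() fix above only matters to callers that ask to wait for the helper's exit status. A hedged sketch of such a caller against the call_usermodehelper() wrapper of this era (the helper path, arguments and environment are hypothetical):

#include <linux/kmod.h>

static int run_my_helper(void)
{
	char *argv[] = { "/sbin/my-helper", "start", NULL };	/* hypothetical */
	char *envp[] = { "HOME=/", "PATH=/sbin:/bin:/usr/sbin:/usr/bin", NULL };

	/* wait == 1: block until the helper exits and return its status,
	 * which is exactly the path wait_for_helper() handles above. */
	return call_usermodehelper("/sbin/my-helper", argv, envp, 1);
}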
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 1fbf466a29aa..3f57dfdc8f92 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -47,11 +47,17 @@
47 47
48static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE]; 48static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE];
49static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE]; 49static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE];
50static atomic_t kprobe_count;
50 51
51DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */ 52DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */
52DEFINE_SPINLOCK(kretprobe_lock); /* Protects kretprobe_inst_table */ 53DEFINE_SPINLOCK(kretprobe_lock); /* Protects kretprobe_inst_table */
53static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; 54static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL;
54 55
56static struct notifier_block kprobe_page_fault_nb = {
57 .notifier_call = kprobe_exceptions_notify,
57 .priority = 0x7fffffff /* we need to be notified first */
59};
60
55#ifdef __ARCH_WANT_KPROBES_INSN_SLOT 61#ifdef __ARCH_WANT_KPROBES_INSN_SLOT
56/* 62/*
57 * kprobe->ainsn.insn points to the copy of the instruction to be 63 * kprobe->ainsn.insn points to the copy of the instruction to be
@@ -368,16 +374,15 @@ static inline void copy_kprobe(struct kprobe *old_p, struct kprobe *p)
368*/ 374*/
369static int __kprobes add_new_kprobe(struct kprobe *old_p, struct kprobe *p) 375static int __kprobes add_new_kprobe(struct kprobe *old_p, struct kprobe *p)
370{ 376{
371 struct kprobe *kp;
372
373 if (p->break_handler) { 377 if (p->break_handler) {
374 list_for_each_entry_rcu(kp, &old_p->list, list) { 378 if (old_p->break_handler)
375 if (kp->break_handler) 379 return -EEXIST;
376 return -EEXIST;
377 }
378 list_add_tail_rcu(&p->list, &old_p->list); 380 list_add_tail_rcu(&p->list, &old_p->list);
381 old_p->break_handler = aggr_break_handler;
379 } else 382 } else
380 list_add_rcu(&p->list, &old_p->list); 383 list_add_rcu(&p->list, &old_p->list);
384 if (p->post_handler && !old_p->post_handler)
385 old_p->post_handler = aggr_post_handler;
381 return 0; 386 return 0;
382} 387}
383 388
@@ -388,11 +393,14 @@ static int __kprobes add_new_kprobe(struct kprobe *old_p, struct kprobe *p)
388static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p) 393static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p)
389{ 394{
390 copy_kprobe(p, ap); 395 copy_kprobe(p, ap);
396 flush_insn_slot(ap);
391 ap->addr = p->addr; 397 ap->addr = p->addr;
392 ap->pre_handler = aggr_pre_handler; 398 ap->pre_handler = aggr_pre_handler;
393 ap->post_handler = aggr_post_handler;
394 ap->fault_handler = aggr_fault_handler; 399 ap->fault_handler = aggr_fault_handler;
395 ap->break_handler = aggr_break_handler; 400 if (p->post_handler)
401 ap->post_handler = aggr_post_handler;
402 if (p->break_handler)
403 ap->break_handler = aggr_break_handler;
396 404
397 INIT_LIST_HEAD(&ap->list); 405 INIT_LIST_HEAD(&ap->list);
398 list_add_rcu(&p->list, &ap->list); 406 list_add_rcu(&p->list, &ap->list);
@@ -464,6 +472,8 @@ static int __kprobes __register_kprobe(struct kprobe *p,
464 old_p = get_kprobe(p->addr); 472 old_p = get_kprobe(p->addr);
465 if (old_p) { 473 if (old_p) {
466 ret = register_aggr_kprobe(old_p, p); 474 ret = register_aggr_kprobe(old_p, p);
475 if (!ret)
476 atomic_inc(&kprobe_count);
467 goto out; 477 goto out;
468 } 478 }
469 479
@@ -474,6 +484,10 @@ static int __kprobes __register_kprobe(struct kprobe *p,
474 hlist_add_head_rcu(&p->hlist, 484 hlist_add_head_rcu(&p->hlist,
475 &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]); 485 &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]);
476 486
487 if (atomic_add_return(1, &kprobe_count) == \
488 (ARCH_INACTIVE_KPROBE_COUNT + 1))
489 register_page_fault_notifier(&kprobe_page_fault_nb);
490
477 arch_arm_kprobe(p); 491 arch_arm_kprobe(p);
478 492
479out: 493out:
@@ -536,14 +550,40 @@ valid_p:
536 kfree(old_p); 550 kfree(old_p);
537 } 551 }
538 arch_remove_kprobe(p); 552 arch_remove_kprobe(p);
553 } else {
554 mutex_lock(&kprobe_mutex);
555 if (p->break_handler)
556 old_p->break_handler = NULL;
557 if (p->post_handler){
558 list_for_each_entry_rcu(list_p, &old_p->list, list){
559 if (list_p->post_handler){
560 cleanup_p = 2;
561 break;
562 }
563 }
564 if (cleanup_p == 0)
565 old_p->post_handler = NULL;
566 }
567 mutex_unlock(&kprobe_mutex);
539 } 568 }
569
570 /* Call unregister_page_fault_notifier()
571 * if no probes are active
572 */
573 mutex_lock(&kprobe_mutex);
574 if (atomic_add_return(-1, &kprobe_count) == \
575 ARCH_INACTIVE_KPROBE_COUNT)
576 unregister_page_fault_notifier(&kprobe_page_fault_nb);
577 mutex_unlock(&kprobe_mutex);
578 return;
540} 579}
541 580
542static struct notifier_block kprobe_exceptions_nb = { 581static struct notifier_block kprobe_exceptions_nb = {
543 .notifier_call = kprobe_exceptions_notify, 582 .notifier_call = kprobe_exceptions_notify,
544 .priority = 0x7fffffff /* we need to notified first */ 583 .priority = 0x7fffffff /* we need to be notified first */
545}; 584};
546 585
586
547int __kprobes register_jprobe(struct jprobe *jp) 587int __kprobes register_jprobe(struct jprobe *jp)
548{ 588{
549 /* Todo: Verify probepoint is a function entry point */ 589 /* Todo: Verify probepoint is a function entry point */
@@ -652,6 +692,7 @@ static int __init init_kprobes(void)
652 INIT_HLIST_HEAD(&kprobe_table[i]); 692 INIT_HLIST_HEAD(&kprobe_table[i]);
653 INIT_HLIST_HEAD(&kretprobe_inst_table[i]); 693 INIT_HLIST_HEAD(&kretprobe_inst_table[i]);
654 } 694 }
695 atomic_set(&kprobe_count, 0);
655 696
656 err = arch_init_kprobes(); 697 err = arch_init_kprobes();
657 if (!err) 698 if (!err)
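
Editorial aside, not part of the patch: the kprobes changes above only install aggregate post/break handlers when a probe actually supplies them, and (un)register the page-fault notifier based on the active probe count. A hedged sketch of a module that registers one probe with both a pre- and a post-handler; the probe target (schedule) is only an illustration, any probeable kernel text address would do:

#include <linux/module.h>
#include <linux/kprobes.h>
#include <linux/sched.h>

/* Runs just before the probed instruction executes. */
static int my_pre(struct kprobe *p, struct pt_regs *regs)
{
	return 0;				/* 0 = let the probed insn run */
}

/* Runs after the probed instruction; only probes that set this pay for
 * the aggregate post_handler, per the changes above. */
static void my_post(struct kprobe *p, struct pt_regs *regs, unsigned long flags)
{
}

static struct kprobe my_kp = {
	.pre_handler	= my_pre,
	.post_handler	= my_post,
	.addr		= (kprobe_opcode_t *)schedule,	/* hypothetical target */
};

static int __init my_kprobe_init(void)
{
	return register_kprobe(&my_kp);
}

static void __exit my_kprobe_exit(void)
{
	unregister_kprobe(&my_kp);
}

module_init(my_kprobe_init);
module_exit(my_kprobe_exit);
MODULE_LICENSE("GPL");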
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 9e28478a17a5..e0ffe4ab0917 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -8,7 +8,6 @@
8 * 8 *
9 */ 9 */
10 10
11#include <linux/config.h>
12#include <linux/kobject.h> 11#include <linux/kobject.h>
13#include <linux/string.h> 12#include <linux/string.h>
14#include <linux/sysfs.h> 13#include <linux/sysfs.h>
diff --git a/kernel/kthread.c b/kernel/kthread.c
index c5f3c6613b6d..4f9c60ef95e8 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -45,6 +45,13 @@ struct kthread_stop_info
45static DEFINE_MUTEX(kthread_stop_lock); 45static DEFINE_MUTEX(kthread_stop_lock);
46static struct kthread_stop_info kthread_stop_info; 46static struct kthread_stop_info kthread_stop_info;
47 47
48/**
49 * kthread_should_stop - should this kthread return now?
50 *
51 * When someone calls kthread_stop on your kthread, it will be woken
52 * and this will return true. You should then return, and your return
53 * value will be passed through to kthread_stop().
54 */
48int kthread_should_stop(void) 55int kthread_should_stop(void)
49{ 56{
50 return (kthread_stop_info.k == current); 57 return (kthread_stop_info.k == current);
@@ -122,6 +129,25 @@ static void keventd_create_kthread(void *_create)
122 complete(&create->done); 129 complete(&create->done);
123} 130}
124 131
132/**
133 * kthread_create - create a kthread.
134 * @threadfn: the function to run until signal_pending(current).
135 * @data: data ptr for @threadfn.
136 * @namefmt: printf-style name for the thread.
137 *
138 * Description: This helper function creates and names a kernel
139 * thread. The thread will be stopped: use wake_up_process() to start
140 * it. See also kthread_run(), kthread_create_on_cpu().
141 *
142 * When woken, the thread will run @threadfn() with @data as its
143 * argument. @threadfn can either call do_exit() directly if it is a
144 * standalone thread for which no one will call kthread_stop(), or
145 * return when 'kthread_should_stop()' is true (which means
146 * kthread_stop() has been called). The return value should be zero
147 * or a negative error number; it will be passed to kthread_stop().
148 *
149 * Returns a task_struct or ERR_PTR(-ENOMEM).
150 */
125struct task_struct *kthread_create(int (*threadfn)(void *data), 151struct task_struct *kthread_create(int (*threadfn)(void *data),
126 void *data, 152 void *data,
127 const char namefmt[], 153 const char namefmt[],
@@ -156,6 +182,15 @@ struct task_struct *kthread_create(int (*threadfn)(void *data),
156} 182}
157EXPORT_SYMBOL(kthread_create); 183EXPORT_SYMBOL(kthread_create);
158 184
185/**
186 * kthread_bind - bind a just-created kthread to a cpu.
187 * @k: thread created by kthread_create().
188 * @cpu: cpu (might not be online, must be possible) for @k to run on.
189 *
190 * Description: This function is equivalent to set_cpus_allowed(),
191 * except that @cpu doesn't need to be online, and the thread must be
192 * stopped (i.e., just returned from kthread_create().
193 */
159void kthread_bind(struct task_struct *k, unsigned int cpu) 194void kthread_bind(struct task_struct *k, unsigned int cpu)
160{ 195{
161 BUG_ON(k->state != TASK_INTERRUPTIBLE); 196 BUG_ON(k->state != TASK_INTERRUPTIBLE);
@@ -166,14 +201,21 @@ void kthread_bind(struct task_struct *k, unsigned int cpu)
166} 201}
167EXPORT_SYMBOL(kthread_bind); 202EXPORT_SYMBOL(kthread_bind);
168 203
204/**
205 * kthread_stop - stop a thread created by kthread_create().
206 * @k: thread created by kthread_create().
207 *
208 * Sets kthread_should_stop() for @k to return true, wakes it, and
209 * waits for it to exit. Your threadfn() must not call do_exit()
210 * itself if you use this function! This can also be called after
211 * kthread_create() instead of calling wake_up_process(): the thread
212 * will exit without calling threadfn().
213 *
214 * Returns the result of threadfn(), or %-EINTR if wake_up_process()
215 * was never called.
216 */
169int kthread_stop(struct task_struct *k) 217int kthread_stop(struct task_struct *k)
170{ 218{
171 return kthread_stop_sem(k, NULL);
172}
173EXPORT_SYMBOL(kthread_stop);
174
175int kthread_stop_sem(struct task_struct *k, struct semaphore *s)
176{
177 int ret; 219 int ret;
178 220
179 mutex_lock(&kthread_stop_lock); 221 mutex_lock(&kthread_stop_lock);
@@ -187,10 +229,7 @@ int kthread_stop_sem(struct task_struct *k, struct semaphore *s)
187 229
188 /* Now set kthread_should_stop() to true, and wake it up. */ 230 /* Now set kthread_should_stop() to true, and wake it up. */
189 kthread_stop_info.k = k; 231 kthread_stop_info.k = k;
190 if (s) 232 wake_up_process(k);
191 up(s);
192 else
193 wake_up_process(k);
194 put_task_struct(k); 233 put_task_struct(k);
195 234
196 /* Once it dies, reset stop ptr, gather result and we're done. */ 235 /* Once it dies, reset stop ptr, gather result and we're done. */
@@ -201,7 +240,7 @@ int kthread_stop_sem(struct task_struct *k, struct semaphore *s)
201 240
202 return ret; 241 return ret;
203} 242}
204EXPORT_SYMBOL(kthread_stop_sem); 243EXPORT_SYMBOL(kthread_stop);
205 244
206static __init int helper_init(void) 245static __init int helper_init(void)
207{ 246{
@@ -210,5 +249,5 @@ static __init int helper_init(void)
210 249
211 return 0; 250 return 0;
212} 251}
213core_initcall(helper_init);
214 252
253core_initcall(helper_init);
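
Editorial aside, not part of the patch: the kerneldoc added above documents the kthread_create()/kthread_should_stop()/kthread_stop() contract. A hedged sketch of a typical client (thread name, sleep interval and work body are hypothetical):

#include <linux/kthread.h>
#include <linux/delay.h>
#include <linux/err.h>

static struct task_struct *my_task;

static int my_thread_fn(void *data)
{
	/* Loop until kthread_stop() makes kthread_should_stop() true. */
	while (!kthread_should_stop()) {
		/* ... do one unit of (hypothetical) work ... */
		msleep(100);
	}
	return 0;				/* returned by kthread_stop() */
}

static int my_start(void)
{
	my_task = kthread_run(my_thread_fn, NULL, "my-worker");
	if (IS_ERR(my_task))
		return PTR_ERR(my_task);
	return 0;
}

static void my_stop(void)
{
	kthread_stop(my_task);			/* wakes the thread and waits for it */
}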
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
new file mode 100644
index 000000000000..e596525669ed
--- /dev/null
+++ b/kernel/lockdep.c
@@ -0,0 +1,2724 @@
1/*
2 * kernel/lockdep.c
3 *
4 * Runtime locking correctness validator
5 *
6 * Started by Ingo Molnar:
7 *
8 * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
9 *
10 * this code maps all the lock dependencies as they occur in a live kernel
11 * and will warn about the following classes of locking bugs:
12 *
13 * - lock inversion scenarios
14 * - circular lock dependencies
15 * - hardirq/softirq safe/unsafe locking bugs
16 *
17 * Bugs are reported even if the current locking scenario does not cause
18 * any deadlock at this point.
19 *
20 * I.e. if anytime in the past two locks were taken in a different order,
21 * even if it happened for another task, even if those were different
22 * locks (but of the same class as this lock), this code will detect it.
23 *
24 * Thanks to Arjan van de Ven for coming up with the initial idea of
25 * mapping lock dependencies runtime.
26 */
27#include <linux/mutex.h>
28#include <linux/sched.h>
29#include <linux/delay.h>
30#include <linux/module.h>
31#include <linux/proc_fs.h>
32#include <linux/seq_file.h>
33#include <linux/spinlock.h>
34#include <linux/kallsyms.h>
35#include <linux/interrupt.h>
36#include <linux/stacktrace.h>
37#include <linux/debug_locks.h>
38#include <linux/irqflags.h>
39#include <linux/utsname.h>
40
41#include <asm/sections.h>
42
43#include "lockdep_internals.h"
44
45/*
46 * hash_lock: protects the lockdep hashes and class/list/hash allocators.
47 *
48 * This is one of the rare exceptions where it's justified
49 * to use a raw spinlock - we really dont want the spinlock
50 * code to recurse back into the lockdep code.
51 */
52static raw_spinlock_t hash_lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
53
54static int lockdep_initialized;
55
56unsigned long nr_list_entries;
57static struct lock_list list_entries[MAX_LOCKDEP_ENTRIES];
58
59/*
60 * Allocate a lockdep entry. (assumes hash_lock held, returns
61 * with NULL on failure)
62 */
63static struct lock_list *alloc_list_entry(void)
64{
65 if (nr_list_entries >= MAX_LOCKDEP_ENTRIES) {
66 __raw_spin_unlock(&hash_lock);
67 debug_locks_off();
68 printk("BUG: MAX_LOCKDEP_ENTRIES too low!\n");
69 printk("turning off the locking correctness validator.\n");
70 return NULL;
71 }
72 return list_entries + nr_list_entries++;
73}
74
75/*
76 * All data structures here are protected by the global debug_lock.
77 *
78 * Mutex key structs only get allocated, once during bootup, and never
79 * get freed - this significantly simplifies the debugging code.
80 */
81unsigned long nr_lock_classes;
82static struct lock_class lock_classes[MAX_LOCKDEP_KEYS];
83
84/*
85 * We keep a global list of all lock classes. The list only grows,
86 * never shrinks. The list is only accessed with the lockdep
87 * spinlock lock held.
88 */
89LIST_HEAD(all_lock_classes);
90
91/*
92 * The lockdep classes are in a hash-table as well, for fast lookup:
93 */
94#define CLASSHASH_BITS (MAX_LOCKDEP_KEYS_BITS - 1)
95#define CLASSHASH_SIZE (1UL << CLASSHASH_BITS)
96#define CLASSHASH_MASK (CLASSHASH_SIZE - 1)
97#define __classhashfn(key) ((((unsigned long)key >> CLASSHASH_BITS) + (unsigned long)key) & CLASSHASH_MASK)
98#define classhashentry(key) (classhash_table + __classhashfn((key)))
99
100static struct list_head classhash_table[CLASSHASH_SIZE];
101
102unsigned long nr_lock_chains;
103static struct lock_chain lock_chains[MAX_LOCKDEP_CHAINS];
104
105/*
106 * We put the lock dependency chains into a hash-table as well, to cache
107 * their existence:
108 */
109#define CHAINHASH_BITS (MAX_LOCKDEP_CHAINS_BITS-1)
110#define CHAINHASH_SIZE (1UL << CHAINHASH_BITS)
111#define CHAINHASH_MASK (CHAINHASH_SIZE - 1)
112#define __chainhashfn(chain) \
113 (((chain >> CHAINHASH_BITS) + chain) & CHAINHASH_MASK)
114#define chainhashentry(chain) (chainhash_table + __chainhashfn((chain)))
115
116static struct list_head chainhash_table[CHAINHASH_SIZE];
117
118/*
119 * The hash key of the lock dependency chains is a hash itself too:
120 * it's a hash of all locks taken up to that lock, including that lock.
121 * It's a 64-bit hash, because it's important for the keys to be
122 * unique.
123 */
124#define iterate_chain_key(key1, key2) \
125 (((key1) << MAX_LOCKDEP_KEYS_BITS) ^ \
126 ((key1) >> (64-MAX_LOCKDEP_KEYS_BITS)) ^ \
127 (key2))
128
129void lockdep_off(void)
130{
131 current->lockdep_recursion++;
132}
133
134EXPORT_SYMBOL(lockdep_off);
135
136void lockdep_on(void)
137{
138 current->lockdep_recursion--;
139}
140
141EXPORT_SYMBOL(lockdep_on);
142
143int lockdep_internal(void)
144{
145 return current->lockdep_recursion != 0;
146}
147
148EXPORT_SYMBOL(lockdep_internal);
149
150/*
151 * Debugging switches:
152 */
153
154#define VERBOSE 0
155#ifdef VERBOSE
156# define VERY_VERBOSE 0
157#endif
158
159#if VERBOSE
160# define HARDIRQ_VERBOSE 1
161# define SOFTIRQ_VERBOSE 1
162#else
163# define HARDIRQ_VERBOSE 0
164# define SOFTIRQ_VERBOSE 0
165#endif
166
167#if VERBOSE || HARDIRQ_VERBOSE || SOFTIRQ_VERBOSE
168/*
169 * Quick filtering for interesting events:
170 */
171static int class_filter(struct lock_class *class)
172{
173#if 0
174 /* Example */
175 if (class->name_version == 1 &&
176 !strcmp(class->name, "lockname"))
177 return 1;
178 if (class->name_version == 1 &&
179 !strcmp(class->name, "&struct->lockfield"))
180 return 1;
181#endif
182 /* Allow everything else. Returning 0 here would filter out everything else */
183 return 1;
184}
185#endif
186
187static int verbose(struct lock_class *class)
188{
189#if VERBOSE
190 return class_filter(class);
191#endif
192 return 0;
193}
194
195#ifdef CONFIG_TRACE_IRQFLAGS
196
197static int hardirq_verbose(struct lock_class *class)
198{
199#if HARDIRQ_VERBOSE
200 return class_filter(class);
201#endif
202 return 0;
203}
204
205static int softirq_verbose(struct lock_class *class)
206{
207#if SOFTIRQ_VERBOSE
208 return class_filter(class);
209#endif
210 return 0;
211}
212
213#endif
214
215/*
216 * Stack-trace: tightly packed array of stack backtrace
217 * addresses. Protected by the hash_lock.
218 */
219unsigned long nr_stack_trace_entries;
220static unsigned long stack_trace[MAX_STACK_TRACE_ENTRIES];
221
222static int save_trace(struct stack_trace *trace)
223{
224 trace->nr_entries = 0;
225 trace->max_entries = MAX_STACK_TRACE_ENTRIES - nr_stack_trace_entries;
226 trace->entries = stack_trace + nr_stack_trace_entries;
227
228 trace->skip = 3;
229 trace->all_contexts = 0;
230
231 /* Make sure not to recurse in case the unwinder
232  * needs to take locks. */
233 lockdep_off();
234 save_stack_trace(trace, NULL);
235 lockdep_on();
236
237 trace->max_entries = trace->nr_entries;
238
239 nr_stack_trace_entries += trace->nr_entries;
240 if (DEBUG_LOCKS_WARN_ON(nr_stack_trace_entries > MAX_STACK_TRACE_ENTRIES))
241 return 0;
242
243 if (nr_stack_trace_entries == MAX_STACK_TRACE_ENTRIES) {
244 __raw_spin_unlock(&hash_lock);
245 if (debug_locks_off()) {
246 printk("BUG: MAX_STACK_TRACE_ENTRIES too low!\n");
247 printk("turning off the locking correctness validator.\n");
248 dump_stack();
249 }
250 return 0;
251 }
252
253 return 1;
254}
255
256unsigned int nr_hardirq_chains;
257unsigned int nr_softirq_chains;
258unsigned int nr_process_chains;
259unsigned int max_lockdep_depth;
260unsigned int max_recursion_depth;
261
262#ifdef CONFIG_DEBUG_LOCKDEP
263/*
264 * We cannot printk in early bootup code. Even early_printk()
265 * might not work yet. So we mark any initialization errors and printk
266 * about it later on, in lockdep_info().
267 */
268static int lockdep_init_error;
269
270/*
271 * Various lockdep statistics:
272 */
273atomic_t chain_lookup_hits;
274atomic_t chain_lookup_misses;
275atomic_t hardirqs_on_events;
276atomic_t hardirqs_off_events;
277atomic_t redundant_hardirqs_on;
278atomic_t redundant_hardirqs_off;
279atomic_t softirqs_on_events;
280atomic_t softirqs_off_events;
281atomic_t redundant_softirqs_on;
282atomic_t redundant_softirqs_off;
283atomic_t nr_unused_locks;
284atomic_t nr_cyclic_checks;
285atomic_t nr_cyclic_check_recursions;
286atomic_t nr_find_usage_forwards_checks;
287atomic_t nr_find_usage_forwards_recursions;
288atomic_t nr_find_usage_backwards_checks;
289atomic_t nr_find_usage_backwards_recursions;
290# define debug_atomic_inc(ptr) atomic_inc(ptr)
291# define debug_atomic_dec(ptr) atomic_dec(ptr)
292# define debug_atomic_read(ptr) atomic_read(ptr)
293#else
294# define debug_atomic_inc(ptr) do { } while (0)
295# define debug_atomic_dec(ptr) do { } while (0)
296# define debug_atomic_read(ptr) 0
297#endif
298
299/*
300 * Locking printouts:
301 */
302
303static const char *usage_str[] =
304{
305 [LOCK_USED] = "initial-use ",
306 [LOCK_USED_IN_HARDIRQ] = "in-hardirq-W",
307 [LOCK_USED_IN_SOFTIRQ] = "in-softirq-W",
308 [LOCK_ENABLED_SOFTIRQS] = "softirq-on-W",
309 [LOCK_ENABLED_HARDIRQS] = "hardirq-on-W",
310 [LOCK_USED_IN_HARDIRQ_READ] = "in-hardirq-R",
311 [LOCK_USED_IN_SOFTIRQ_READ] = "in-softirq-R",
312 [LOCK_ENABLED_SOFTIRQS_READ] = "softirq-on-R",
313 [LOCK_ENABLED_HARDIRQS_READ] = "hardirq-on-R",
314};
315
316const char * __get_key_name(struct lockdep_subclass_key *key, char *str)
317{
318 unsigned long offs, size;
319 char *modname;
320
321 return kallsyms_lookup((unsigned long)key, &size, &offs, &modname, str);
322}
323
324void
325get_usage_chars(struct lock_class *class, char *c1, char *c2, char *c3, char *c4)
326{
327 *c1 = '.', *c2 = '.', *c3 = '.', *c4 = '.';
328
329 if (class->usage_mask & LOCKF_USED_IN_HARDIRQ)
330 *c1 = '+';
331 else
332 if (class->usage_mask & LOCKF_ENABLED_HARDIRQS)
333 *c1 = '-';
334
335 if (class->usage_mask & LOCKF_USED_IN_SOFTIRQ)
336 *c2 = '+';
337 else
338 if (class->usage_mask & LOCKF_ENABLED_SOFTIRQS)
339 *c2 = '-';
340
341 if (class->usage_mask & LOCKF_ENABLED_HARDIRQS_READ)
342 *c3 = '-';
343 if (class->usage_mask & LOCKF_USED_IN_HARDIRQ_READ) {
344 *c3 = '+';
345 if (class->usage_mask & LOCKF_ENABLED_HARDIRQS_READ)
346 *c3 = '?';
347 }
348
349 if (class->usage_mask & LOCKF_ENABLED_SOFTIRQS_READ)
350 *c4 = '-';
351 if (class->usage_mask & LOCKF_USED_IN_SOFTIRQ_READ) {
352 *c4 = '+';
353 if (class->usage_mask & LOCKF_ENABLED_SOFTIRQS_READ)
354 *c4 = '?';
355 }
356}
357
358static void print_lock_name(struct lock_class *class)
359{
360 char str[128], c1, c2, c3, c4;
361 const char *name;
362
363 get_usage_chars(class, &c1, &c2, &c3, &c4);
364
365 name = class->name;
366 if (!name) {
367 name = __get_key_name(class->key, str);
368 printk(" (%s", name);
369 } else {
370 printk(" (%s", name);
371 if (class->name_version > 1)
372 printk("#%d", class->name_version);
373 if (class->subclass)
374 printk("/%d", class->subclass);
375 }
376 printk("){%c%c%c%c}", c1, c2, c3, c4);
377}
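
A quick legend for the four usage characters printed above (editorial note, not part of the patch), summarizing what get_usage_chars() computes:

/*
 * Usage-character legend for the "){%c%c%c%c}" output of print_lock_name():
 *
 *   columns 1/2: write usage in hardirq/softirq context
 *   columns 3/4: read usage in hardirq/softirq context
 *
 *   '.'  no such usage recorded for this lock class
 *   '+'  the class was acquired in that irq context (irq-safe)
 *   '-'  the class was acquired with that irq type enabled (irq-unsafe)
 *   '?'  (read columns only) both of the above have been observed
 */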
378
379static void print_lockdep_cache(struct lockdep_map *lock)
380{
381 const char *name;
382 char str[128];
383
384 name = lock->name;
385 if (!name)
386 name = __get_key_name(lock->key->subkeys, str);
387
388 printk("%s", name);
389}
390
391static void print_lock(struct held_lock *hlock)
392{
393 print_lock_name(hlock->class);
394 printk(", at: ");
395 print_ip_sym(hlock->acquire_ip);
396}
397
398static void lockdep_print_held_locks(struct task_struct *curr)
399{
400 int i, depth = curr->lockdep_depth;
401
402 if (!depth) {
403 printk("no locks held by %s/%d.\n", curr->comm, curr->pid);
404 return;
405 }
406 printk("%d lock%s held by %s/%d:\n",
407 depth, depth > 1 ? "s" : "", curr->comm, curr->pid);
408
409 for (i = 0; i < depth; i++) {
410 printk(" #%d: ", i);
411 print_lock(curr->held_locks + i);
412 }
413}
414
415static void print_lock_class_header(struct lock_class *class, int depth)
416{
417 int bit;
418
419 printk("%*s->", depth, "");
420 print_lock_name(class);
421 printk(" ops: %lu", class->ops);
422 printk(" {\n");
423
424 for (bit = 0; bit < LOCK_USAGE_STATES; bit++) {
425 if (class->usage_mask & (1 << bit)) {
426 int len = depth;
427
428 len += printk("%*s %s", depth, "", usage_str[bit]);
429 len += printk(" at:\n");
430 print_stack_trace(class->usage_traces + bit, len);
431 }
432 }
433 printk("%*s }\n", depth, "");
434
435 printk("%*s ... key at: ",depth,"");
436 print_ip_sym((unsigned long)class->key);
437}
438
439/*
440 * printk all lock dependencies starting at <entry>:
441 */
442static void print_lock_dependencies(struct lock_class *class, int depth)
443{
444 struct lock_list *entry;
445
446 if (DEBUG_LOCKS_WARN_ON(depth >= 20))
447 return;
448
449 print_lock_class_header(class, depth);
450
451 list_for_each_entry(entry, &class->locks_after, entry) {
452 DEBUG_LOCKS_WARN_ON(!entry->class);
453 print_lock_dependencies(entry->class, depth + 1);
454
455 printk("%*s ... acquired at:\n",depth,"");
456 print_stack_trace(&entry->trace, 2);
457 printk("\n");
458 }
459}
460
461/*
462 * Add a new dependency to the head of the list:
463 */
464static int add_lock_to_list(struct lock_class *class, struct lock_class *this,
465 struct list_head *head, unsigned long ip)
466{
467 struct lock_list *entry;
468 /*
469 * Lock not present yet - get a new dependency struct and
470 * add it to the list:
471 */
472 entry = alloc_list_entry();
473 if (!entry)
474 return 0;
475
476 entry->class = this;
477 save_trace(&entry->trace);
478
479 /*
480 * Since we never remove from the dependency list, the list can
481 * be walked lockless by other CPUs, it's only allocation
482 * that must be protected by the spinlock. But this also means
483 * we must make new entries visible only once writes to the
484 * entry become visible - hence the RCU op:
485 */
486 list_add_tail_rcu(&entry->entry, head);
487
488 return 1;
489}
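
To illustrate the lockless-read guarantee that the RCU-style publish above provides, here is a hypothetical reader (editorial sketch, not part of the patch; class_has_direct_dep() is an invented name): it may walk locks_after without taking hash_lock, because entries are fully initialized before being linked in and are never removed - the same property check_noncircular() below relies on.

static int class_has_direct_dep(struct lock_class *source,
				struct lock_class *target)
{
	struct lock_list *entry;

	/* no hash_lock needed: the list only grows, via list_add_tail_rcu() */
	list_for_each_entry(entry, &source->locks_after, entry)
		if (entry->class == target)
			return 1;
	return 0;
}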
490
491/*
492 * Recursive, forwards-direction lock-dependency checking, used for
493 * both noncyclic checking and for hardirq-unsafe/softirq-unsafe
494 * checking.
495 *
496 * (to keep the stackframe of the recursive functions small we
497 * use these global variables, and we also mark various helper
498 * functions as noinline.)
499 */
500static struct held_lock *check_source, *check_target;
501
502/*
503 * Print a dependency chain entry (this is only done when a deadlock
504 * has been detected):
505 */
506static noinline int
507print_circular_bug_entry(struct lock_list *target, unsigned int depth)
508{
509 if (debug_locks_silent)
510 return 0;
511 printk("\n-> #%u", depth);
512 print_lock_name(target->class);
513 printk(":\n");
514 print_stack_trace(&target->trace, 6);
515
516 return 0;
517}
518
519static void print_kernel_version(void)
520{
521 printk("%s %.*s\n", system_utsname.release,
522 (int)strcspn(system_utsname.version, " "),
523 system_utsname.version);
524}
525
526/*
527 * When a circular dependency is detected, print the
528 * header first:
529 */
530static noinline int
531print_circular_bug_header(struct lock_list *entry, unsigned int depth)
532{
533 struct task_struct *curr = current;
534
535 __raw_spin_unlock(&hash_lock);
536 debug_locks_off();
537 if (debug_locks_silent)
538 return 0;
539
540 printk("\n=======================================================\n");
541 printk( "[ INFO: possible circular locking dependency detected ]\n");
542 print_kernel_version();
543 printk( "-------------------------------------------------------\n");
544 printk("%s/%d is trying to acquire lock:\n",
545 curr->comm, curr->pid);
546 print_lock(check_source);
547 printk("\nbut task is already holding lock:\n");
548 print_lock(check_target);
549 printk("\nwhich lock already depends on the new lock.\n\n");
550 printk("\nthe existing dependency chain (in reverse order) is:\n");
551
552 print_circular_bug_entry(entry, depth);
553
554 return 0;
555}
556
557static noinline int print_circular_bug_tail(void)
558{
559 struct task_struct *curr = current;
560 struct lock_list this;
561
562 if (debug_locks_silent)
563 return 0;
564
565 this.class = check_source->class;
566 save_trace(&this.trace);
567 print_circular_bug_entry(&this, 0);
568
569 printk("\nother info that might help us debug this:\n\n");
570 lockdep_print_held_locks(curr);
571
572 printk("\nstack backtrace:\n");
573 dump_stack();
574
575 return 0;
576}
577
578static int noinline print_infinite_recursion_bug(void)
579{
580 __raw_spin_unlock(&hash_lock);
581 DEBUG_LOCKS_WARN_ON(1);
582
583 return 0;
584}
585
586/*
587 * Prove that the dependency graph starting at <source> cannot
588 * lead to <target>. Print an error and return 0 if it does.
589 */
590static noinline int
591check_noncircular(struct lock_class *source, unsigned int depth)
592{
593 struct lock_list *entry;
594
595 debug_atomic_inc(&nr_cyclic_check_recursions);
596 if (depth > max_recursion_depth)
597 max_recursion_depth = depth;
598 if (depth >= 20)
599 return print_infinite_recursion_bug();
600 /*
601 * Check this lock's dependency list:
602 */
603 list_for_each_entry(entry, &source->locks_after, entry) {
604 if (entry->class == check_target->class)
605 return print_circular_bug_header(entry, depth+1);
606 debug_atomic_inc(&nr_cyclic_checks);
607 if (!check_noncircular(entry->class, depth+1))
608 return print_circular_bug_entry(entry, depth+1);
609 }
610 return 1;
611}
612
613static int very_verbose(struct lock_class *class)
614{
615#if VERY_VERBOSE
616 return class_filter(class);
617#endif
618 return 0;
619}
620#ifdef CONFIG_TRACE_IRQFLAGS
621
622/*
623 * Forwards and backwards subgraph searching, for the purposes of
624 * proving that two subgraphs can be connected by a new dependency
625 * without creating any illegal irq-safe -> irq-unsafe lock dependency.
626 */
627static enum lock_usage_bit find_usage_bit;
628static struct lock_class *forwards_match, *backwards_match;
629
630/*
631 * Find a node in the forwards-direction dependency sub-graph starting
632 * at <source> that matches <find_usage_bit>.
633 *
634 * Return 2 if such a node exists in the subgraph, and put that node
635 * into <forwards_match>.
636 *
637 * Return 1 otherwise and keep <forwards_match> unchanged.
638 * Return 0 on error.
639 */
640static noinline int
641find_usage_forwards(struct lock_class *source, unsigned int depth)
642{
643 struct lock_list *entry;
644 int ret;
645
646 if (depth > max_recursion_depth)
647 max_recursion_depth = depth;
648 if (depth >= 20)
649 return print_infinite_recursion_bug();
650
651 debug_atomic_inc(&nr_find_usage_forwards_checks);
652 if (source->usage_mask & (1 << find_usage_bit)) {
653 forwards_match = source;
654 return 2;
655 }
656
657 /*
658 * Check this lock's dependency list:
659 */
660 list_for_each_entry(entry, &source->locks_after, entry) {
661 debug_atomic_inc(&nr_find_usage_forwards_recursions);
662 ret = find_usage_forwards(entry->class, depth+1);
663 if (ret == 2 || ret == 0)
664 return ret;
665 }
666 return 1;
667}
668
669/*
670 * Find a node in the backwards-direction dependency sub-graph starting
671 * at <source> that matches <find_usage_bit>.
672 *
673 * Return 2 if such a node exists in the subgraph, and put that node
674 * into <backwards_match>.
675 *
676 * Return 1 otherwise and keep <backwards_match> unchanged.
677 * Return 0 on error.
678 */
679static noinline int
680find_usage_backwards(struct lock_class *source, unsigned int depth)
681{
682 struct lock_list *entry;
683 int ret;
684
685 if (depth > max_recursion_depth)
686 max_recursion_depth = depth;
687 if (depth >= 20)
688 return print_infinite_recursion_bug();
689
690 debug_atomic_inc(&nr_find_usage_backwards_checks);
691 if (source->usage_mask & (1 << find_usage_bit)) {
692 backwards_match = source;
693 return 2;
694 }
695
696 /*
697 * Check this lock's dependency list:
698 */
699 list_for_each_entry(entry, &source->locks_before, entry) {
700 debug_atomic_inc(&nr_find_usage_backwards_recursions);
701 ret = find_usage_backwards(entry->class, depth+1);
702 if (ret == 2 || ret == 0)
703 return ret;
704 }
705 return 1;
706}
707
708static int
709print_bad_irq_dependency(struct task_struct *curr,
710 struct held_lock *prev,
711 struct held_lock *next,
712 enum lock_usage_bit bit1,
713 enum lock_usage_bit bit2,
714 const char *irqclass)
715{
716 __raw_spin_unlock(&hash_lock);
717 debug_locks_off();
718 if (debug_locks_silent)
719 return 0;
720
721 printk("\n======================================================\n");
722 printk( "[ INFO: %s-safe -> %s-unsafe lock order detected ]\n",
723 irqclass, irqclass);
724 print_kernel_version();
725 printk( "------------------------------------------------------\n");
726 printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n",
727 curr->comm, curr->pid,
728 curr->hardirq_context, hardirq_count() >> HARDIRQ_SHIFT,
729 curr->softirq_context, softirq_count() >> SOFTIRQ_SHIFT,
730 curr->hardirqs_enabled,
731 curr->softirqs_enabled);
732 print_lock(next);
733
734 printk("\nand this task is already holding:\n");
735 print_lock(prev);
736 printk("which would create a new lock dependency:\n");
737 print_lock_name(prev->class);
738 printk(" ->");
739 print_lock_name(next->class);
740 printk("\n");
741
742 printk("\nbut this new dependency connects a %s-irq-safe lock:\n",
743 irqclass);
744 print_lock_name(backwards_match);
745 printk("\n... which became %s-irq-safe at:\n", irqclass);
746
747 print_stack_trace(backwards_match->usage_traces + bit1, 1);
748
749 printk("\nto a %s-irq-unsafe lock:\n", irqclass);
750 print_lock_name(forwards_match);
751 printk("\n... which became %s-irq-unsafe at:\n", irqclass);
752 printk("...");
753
754 print_stack_trace(forwards_match->usage_traces + bit2, 1);
755
756 printk("\nother info that might help us debug this:\n\n");
757 lockdep_print_held_locks(curr);
758
759 printk("\nthe %s-irq-safe lock's dependencies:\n", irqclass);
760 print_lock_dependencies(backwards_match, 0);
761
762 printk("\nthe %s-irq-unsafe lock's dependencies:\n", irqclass);
763 print_lock_dependencies(forwards_match, 0);
764
765 printk("\nstack backtrace:\n");
766 dump_stack();
767
768 return 0;
769}
770
771static int
772check_usage(struct task_struct *curr, struct held_lock *prev,
773 struct held_lock *next, enum lock_usage_bit bit_backwards,
774 enum lock_usage_bit bit_forwards, const char *irqclass)
775{
776 int ret;
777
778 find_usage_bit = bit_backwards;
779 /* fills in <backwards_match> */
780 ret = find_usage_backwards(prev->class, 0);
781 if (!ret || ret == 1)
782 return ret;
783
784 find_usage_bit = bit_forwards;
785 ret = find_usage_forwards(next->class, 0);
786 if (!ret || ret == 1)
787 return ret;
788 /* ret == 2 */
789 return print_bad_irq_dependency(curr, prev, next,
790 bit_backwards, bit_forwards, irqclass);
791}
792
793#endif
794
795static int
796print_deadlock_bug(struct task_struct *curr, struct held_lock *prev,
797 struct held_lock *next)
798{
799 debug_locks_off();
800 __raw_spin_unlock(&hash_lock);
801 if (debug_locks_silent)
802 return 0;
803
804 printk("\n=============================================\n");
805 printk( "[ INFO: possible recursive locking detected ]\n");
806 print_kernel_version();
807 printk( "---------------------------------------------\n");
808 printk("%s/%d is trying to acquire lock:\n",
809 curr->comm, curr->pid);
810 print_lock(next);
811 printk("\nbut task is already holding lock:\n");
812 print_lock(prev);
813
814 printk("\nother info that might help us debug this:\n");
815 lockdep_print_held_locks(curr);
816
817 printk("\nstack backtrace:\n");
818 dump_stack();
819
820 return 0;
821}
822
823/*
824 * Check whether we are holding such a class already.
825 *
826 * (Note that this has to be done separately, because the graph cannot
827 * detect such classes of deadlocks.)
828 *
829 * Returns: 0 on deadlock detected, 1 on OK, 2 on recursive read
830 */
831static int
832check_deadlock(struct task_struct *curr, struct held_lock *next,
833 struct lockdep_map *next_instance, int read)
834{
835 struct held_lock *prev;
836 int i;
837
838 for (i = 0; i < curr->lockdep_depth; i++) {
839 prev = curr->held_locks + i;
840 if (prev->class != next->class)
841 continue;
842 /*
843 * Allow read-after-read recursion of the same
844 * lock class (i.e. read_lock(lock)+read_lock(lock)):
845 */
846 if ((read == 2) && prev->read)
847 return 2;
848 return print_deadlock_bug(curr, prev, next);
849 }
850 return 1;
851}
852
853/*
854 * There was a chain-cache miss, and we are about to add a new dependency
855 * to a previous lock. We recursively validate the following rules:
856 *
857 * - would the adding of the <prev> -> <next> dependency create a
858 * circular dependency in the graph? [== circular deadlock]
859 *
860 * - does the new prev->next dependency connect any hardirq-safe lock
861 * (in the full backwards-subgraph starting at <prev>) with any
862 * hardirq-unsafe lock (in the full forwards-subgraph starting at
863 * <next>)? [== illegal lock inversion with hardirq contexts]
864 *
865 * - does the new prev->next dependency connect any softirq-safe lock
866 * (in the full backwards-subgraph starting at <prev>) with any
867 * softirq-unsafe lock (in the full forwards-subgraph starting at
868 * <next>)? [== illegal lock inversion with softirq contexts]
869 *
870 * any of these scenarios could lead to a deadlock.
871 *
872 * Then if all the validations pass, we add the forwards and backwards
873 * dependency.
874 */
875static int
876check_prev_add(struct task_struct *curr, struct held_lock *prev,
877 struct held_lock *next)
878{
879 struct lock_list *entry;
880 int ret;
881
882 /*
883 * Prove that the new <prev> -> <next> dependency would not
884 * create a circular dependency in the graph. (We do this by
885 * forward-recursing into the graph starting at <next>, and
886 * checking whether we can reach <prev>.)
887 *
888 * We are using global variables to control the recursion, to
889 * keep the stackframe size of the recursive functions low:
890 */
891 check_source = next;
892 check_target = prev;
893 if (!(check_noncircular(next->class, 0)))
894 return print_circular_bug_tail();
895
896#ifdef CONFIG_TRACE_IRQFLAGS
897 /*
898 * Prove that the new dependency does not connect a hardirq-safe
899 * lock with a hardirq-unsafe lock - to achieve this we search
900 * the backwards-subgraph starting at <prev>, and the
901 * forwards-subgraph starting at <next>:
902 */
903 if (!check_usage(curr, prev, next, LOCK_USED_IN_HARDIRQ,
904 LOCK_ENABLED_HARDIRQS, "hard"))
905 return 0;
906
907 /*
908 * Prove that the new dependency does not connect a hardirq-safe-read
909 * lock with a hardirq-unsafe lock - to achieve this we search
910 * the backwards-subgraph starting at <prev>, and the
911 * forwards-subgraph starting at <next>:
912 */
913 if (!check_usage(curr, prev, next, LOCK_USED_IN_HARDIRQ_READ,
914 LOCK_ENABLED_HARDIRQS, "hard-read"))
915 return 0;
916
917 /*
918 * Prove that the new dependency does not connect a softirq-safe
919 * lock with a softirq-unsafe lock - to achieve this we search
920 * the backwards-subgraph starting at <prev>, and the
921 * forwards-subgraph starting at <next>:
922 */
923 if (!check_usage(curr, prev, next, LOCK_USED_IN_SOFTIRQ,
924 LOCK_ENABLED_SOFTIRQS, "soft"))
925 return 0;
926 /*
927 * Prove that the new dependency does not connect a softirq-safe-read
928 * lock with a softirq-unsafe lock - to achieve this we search
929 * the backwards-subgraph starting at <prev>, and the
930 * forwards-subgraph starting at <next>:
931 */
932 if (!check_usage(curr, prev, next, LOCK_USED_IN_SOFTIRQ_READ,
933 LOCK_ENABLED_SOFTIRQS, "soft"))
934 return 0;
935#endif
936 /*
937 * For recursive read-locks we do all the dependency checks,
938 * but we don't store read-triggered dependencies (only
939 * write-triggered dependencies). This ensures that only the
940 * write-side dependencies matter, and that if for example a
941 * write-lock never takes any other locks, then the reads are
942 * equivalent to a NOP.
943 */
944 if (next->read == 2 || prev->read == 2)
945 return 1;
946 /*
947 * Is the <prev> -> <next> dependency already present?
948 *
949 * (this may occur even though this is a new chain: consider
950 * e.g. the L1 -> L2 -> L3 -> L4 and the L5 -> L1 -> L2 -> L3
951 * chains - the second one will be new, but L1 already has
952 * L2 added to its dependency list, due to the first chain.)
953 */
954 list_for_each_entry(entry, &prev->class->locks_after, entry) {
955 if (entry->class == next->class)
956 return 2;
957 }
958
959 /*
960 * Ok, all validations passed, add the new lock
961 * to the previous lock's dependency list:
962 */
963 ret = add_lock_to_list(prev->class, next->class,
964 &prev->class->locks_after, next->acquire_ip);
965 if (!ret)
966 return 0;
967 /*
968 * Return value of 2 signals 'dependency already added',
969 * in that case we don't have to add the backlink either.
970 */
971 if (ret == 2)
972 return 2;
973 ret = add_lock_to_list(next->class, prev->class,
974 &next->class->locks_before, next->acquire_ip);
975
976 /*
977 * Debugging printouts:
978 */
979 if (verbose(prev->class) || verbose(next->class)) {
980 __raw_spin_unlock(&hash_lock);
981 printk("\n new dependency: ");
982 print_lock_name(prev->class);
983 printk(" => ");
984 print_lock_name(next->class);
985 printk("\n");
986 dump_stack();
987 __raw_spin_lock(&hash_lock);
988 }
989 return 1;
990}
991
992/*
993 * Add the dependency to all directly-previous locks that are 'relevant'.
994 * The ones that are relevant are (in increasing distance from curr):
995 * all consecutive trylock entries and the final non-trylock entry - or
996 * the end of this context's lock-chain - whichever comes first.
997 */
998static int
999check_prevs_add(struct task_struct *curr, struct held_lock *next)
1000{
1001 int depth = curr->lockdep_depth;
1002 struct held_lock *hlock;
1003
1004 /*
1005 * Debugging checks.
1006 *
1007 * Depth must not be zero for a non-head lock:
1008 */
1009 if (!depth)
1010 goto out_bug;
1011 /*
1012 * At least two relevant locks must exist for this
1013 * to be a head:
1014 */
1015 if (curr->held_locks[depth].irq_context !=
1016 curr->held_locks[depth-1].irq_context)
1017 goto out_bug;
1018
1019 for (;;) {
1020 hlock = curr->held_locks + depth-1;
1021 /*
1022 * Only non-recursive-read entries get new dependencies
1023 * added:
1024 */
1025 if (hlock->read != 2) {
1026 check_prev_add(curr, hlock, next);
1027 /*
1028 * Stop after the first non-trylock entry,
1029 * as non-trylock entries have added their
1030 * own direct dependencies already, so this
1031 * lock is connected to them indirectly:
1032 */
1033 if (!hlock->trylock)
1034 break;
1035 }
1036 depth--;
1037 /*
1038 * End of lock-stack?
1039 */
1040 if (!depth)
1041 break;
1042 /*
1043 * Stop the search if we cross into another context:
1044 */
1045 if (curr->held_locks[depth].irq_context !=
1046 curr->held_locks[depth-1].irq_context)
1047 break;
1048 }
1049 return 1;
1050out_bug:
1051 __raw_spin_unlock(&hash_lock);
1052 DEBUG_LOCKS_WARN_ON(1);
1053
1054 return 0;
1055}
1056
1057
1058/*
1059 * Is this the address of a static object:
1060 */
1061static int static_obj(void *obj)
1062{
1063 unsigned long start = (unsigned long) &_stext,
1064 end = (unsigned long) &_end,
1065 addr = (unsigned long) obj;
1066#ifdef CONFIG_SMP
1067 int i;
1068#endif
1069
1070 /*
1071 * static variable?
1072 */
1073 if ((addr >= start) && (addr < end))
1074 return 1;
1075
1076#ifdef CONFIG_SMP
1077 /*
1078 * percpu var?
1079 */
1080 for_each_possible_cpu(i) {
1081 start = (unsigned long) &__per_cpu_start + per_cpu_offset(i);
1082 end = (unsigned long) &__per_cpu_end + per_cpu_offset(i);
1083
1084 if ((addr >= start) && (addr < end))
1085 return 1;
1086 }
1087#endif
1088
1089 /*
1090 * module var?
1091 */
1092 return is_module_address(addr);
1093}
1094
1095/*
1096 * To make lock name printouts unique, we calculate a
1097 * class->name_version generation counter:
1098 */
1099static int count_matching_names(struct lock_class *new_class)
1100{
1101 struct lock_class *class;
1102 int count = 0;
1103
1104 if (!new_class->name)
1105 return 0;
1106
1107 list_for_each_entry(class, &all_lock_classes, lock_entry) {
1108 if (new_class->key - new_class->subclass == class->key)
1109 return class->name_version;
1110 if (class->name && !strcmp(class->name, new_class->name))
1111 count = max(count, class->name_version);
1112 }
1113
1114 return count + 1;
1115}
1116
1117extern void __error_too_big_MAX_LOCKDEP_SUBCLASSES(void);
1118
1119/*
1120 * Look up a lock's class in the hash-table. Returns NULL if the class
1121 * has not been registered yet - registration (and caching of the result
1122 * in the lock object) is done by register_lock_class() below.
1123 */
1124static inline struct lock_class *
1125look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
1126{
1127 struct lockdep_subclass_key *key;
1128 struct list_head *hash_head;
1129 struct lock_class *class;
1130
1131#ifdef CONFIG_DEBUG_LOCKDEP
1132 /*
1133 * If the architecture calls into lockdep before initializing
1134 * the hashes then we'll warn about it later. (we cannot printk
1135 * right now)
1136 */
1137 if (unlikely(!lockdep_initialized)) {
1138 lockdep_init();
1139 lockdep_init_error = 1;
1140 }
1141#endif
1142
1143 /*
1144 * Static locks do not have their class-keys yet - for them the key
1145 * is the lock object itself:
1146 */
1147 if (unlikely(!lock->key))
1148 lock->key = (void *)lock;
1149
1150 /*
1151 * NOTE: the class-key must be unique. For dynamic locks, a static
1152 * lock_class_key variable is passed in through the mutex_init()
1153 * (or spin_lock_init()) call - which acts as the key. For static
1154 * locks we use the lock object itself as the key.
1155 */
1156 if (sizeof(struct lock_class_key) > sizeof(struct lock_class))
1157 __error_too_big_MAX_LOCKDEP_SUBCLASSES();
1158
1159 key = lock->key->subkeys + subclass;
1160
1161 hash_head = classhashentry(key);
1162
1163 /*
1164 * We can walk the hash lockfree, because the hash only
1165 * grows, and we are careful when adding entries to the end:
1166 */
1167 list_for_each_entry(class, hash_head, hash_entry)
1168 if (class->key == key)
1169 return class;
1170
1171 return NULL;
1172}
1173
1174/*
1175 * Register a lock's class in the hash-table, if the class is not present
1176 * yet. Otherwise we look it up. We cache the result in the lock object
1177 * itself, so actual lookup of the hash should be once per lock object.
1178 */
1179static inline struct lock_class *
1180register_lock_class(struct lockdep_map *lock, unsigned int subclass)
1181{
1182 struct lockdep_subclass_key *key;
1183 struct list_head *hash_head;
1184 struct lock_class *class;
1185
1186 class = look_up_lock_class(lock, subclass);
1187 if (likely(class))
1188 return class;
1189
1190 /*
1191 * Debug-check: all keys must be persistent!
1192 */
1193 if (!static_obj(lock->key)) {
1194 debug_locks_off();
1195 printk("INFO: trying to register non-static key.\n");
1196 printk("the code is fine but needs lockdep annotation.\n");
1197 printk("turning off the locking correctness validator.\n");
1198 dump_stack();
1199
1200 return NULL;
1201 }
1202
1203 key = lock->key->subkeys + subclass;
1204 hash_head = classhashentry(key);
1205
1206 __raw_spin_lock(&hash_lock);
1207 /*
1208 * We have to do the hash-walk again, to avoid races
1209 * with another CPU:
1210 */
1211 list_for_each_entry(class, hash_head, hash_entry)
1212 if (class->key == key)
1213 goto out_unlock_set;
1214 /*
1215 * Allocate a new key from the static array, and add it to
1216 * the hash:
1217 */
1218 if (nr_lock_classes >= MAX_LOCKDEP_KEYS) {
1219 __raw_spin_unlock(&hash_lock);
1220 debug_locks_off();
1221 printk("BUG: MAX_LOCKDEP_KEYS too low!\n");
1222 printk("turning off the locking correctness validator.\n");
1223 return NULL;
1224 }
1225 class = lock_classes + nr_lock_classes++;
1226 debug_atomic_inc(&nr_unused_locks);
1227 class->key = key;
1228 class->name = lock->name;
1229 class->subclass = subclass;
1230 INIT_LIST_HEAD(&class->lock_entry);
1231 INIT_LIST_HEAD(&class->locks_before);
1232 INIT_LIST_HEAD(&class->locks_after);
1233 class->name_version = count_matching_names(class);
1234 /*
1235 * We use RCU's safe list-add method to make
1236 * parallel walking of the hash-list safe:
1237 */
1238 list_add_tail_rcu(&class->hash_entry, hash_head);
1239
1240 if (verbose(class)) {
1241 __raw_spin_unlock(&hash_lock);
1242 printk("\nnew class %p: %s", class->key, class->name);
1243 if (class->name_version > 1)
1244 printk("#%d", class->name_version);
1245 printk("\n");
1246 dump_stack();
1247 __raw_spin_lock(&hash_lock);
1248 }
1249out_unlock_set:
1250 __raw_spin_unlock(&hash_lock);
1251
1252 if (!subclass)
1253 lock->class_cache = class;
1254
1255 DEBUG_LOCKS_WARN_ON(class->subclass != subclass);
1256
1257 return class;
1258}
1259
1260/*
1261 * Look up a dependency chain. If the key is not present yet then
1262 * add it and return 1 - in this case the new dependency chain will be
1263 * validated (hash_lock is then held). If the key is already hashed, return 0.
1264 */
1265static inline int lookup_chain_cache(u64 chain_key)
1266{
1267 struct list_head *hash_head = chainhashentry(chain_key);
1268 struct lock_chain *chain;
1269
1270 DEBUG_LOCKS_WARN_ON(!irqs_disabled());
1271 /*
1272 * We can walk it lock-free, because entries only get added
1273 * to the hash:
1274 */
1275 list_for_each_entry(chain, hash_head, entry) {
1276 if (chain->chain_key == chain_key) {
1277cache_hit:
1278 debug_atomic_inc(&chain_lookup_hits);
1279 /*
1280 * In the debugging case, force redundant checking
1281 * by returning 1:
1282 */
1283#ifdef CONFIG_DEBUG_LOCKDEP
1284 __raw_spin_lock(&hash_lock);
1285 return 1;
1286#endif
1287 return 0;
1288 }
1289 }
1290 /*
1291 * Allocate a new chain entry from the static array, and add
1292 * it to the hash:
1293 */
1294 __raw_spin_lock(&hash_lock);
1295 /*
1296 * We have to walk the chain again locked - to avoid duplicates:
1297 */
1298 list_for_each_entry(chain, hash_head, entry) {
1299 if (chain->chain_key == chain_key) {
1300 __raw_spin_unlock(&hash_lock);
1301 goto cache_hit;
1302 }
1303 }
1304 if (unlikely(nr_lock_chains >= MAX_LOCKDEP_CHAINS)) {
1305 __raw_spin_unlock(&hash_lock);
1306 debug_locks_off();
1307 printk("BUG: MAX_LOCKDEP_CHAINS too low!\n");
1308 printk("turning off the locking correctness validator.\n");
1309 return 0;
1310 }
1311 chain = lock_chains + nr_lock_chains++;
1312 chain->chain_key = chain_key;
1313 list_add_tail_rcu(&chain->entry, hash_head);
1314 debug_atomic_inc(&chain_lookup_misses);
1315#ifdef CONFIG_TRACE_IRQFLAGS
1316 if (current->hardirq_context)
1317 nr_hardirq_chains++;
1318 else {
1319 if (current->softirq_context)
1320 nr_softirq_chains++;
1321 else
1322 nr_process_chains++;
1323 }
1324#else
1325 nr_process_chains++;
1326#endif
1327
1328 return 1;
1329}
1330
1331/*
1332 * We are building curr_chain_key incrementally, so double-check
1333 * it from scratch, to make sure that it's done correctly:
1334 */
1335static void check_chain_key(struct task_struct *curr)
1336{
1337#ifdef CONFIG_DEBUG_LOCKDEP
1338 struct held_lock *hlock, *prev_hlock = NULL;
1339 unsigned int i, id;
1340 u64 chain_key = 0;
1341
1342 for (i = 0; i < curr->lockdep_depth; i++) {
1343 hlock = curr->held_locks + i;
1344 if (chain_key != hlock->prev_chain_key) {
1345 debug_locks_off();
1346 printk("hm#1, depth: %u [%u], %016Lx != %016Lx\n",
1347 curr->lockdep_depth, i,
1348 (unsigned long long)chain_key,
1349 (unsigned long long)hlock->prev_chain_key);
1350 WARN_ON(1);
1351 return;
1352 }
1353 id = hlock->class - lock_classes;
1354 DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS);
1355 if (prev_hlock && (prev_hlock->irq_context !=
1356 hlock->irq_context))
1357 chain_key = 0;
1358 chain_key = iterate_chain_key(chain_key, id);
1359 prev_hlock = hlock;
1360 }
1361 if (chain_key != curr->curr_chain_key) {
1362 debug_locks_off();
1363 printk("hm#2, depth: %u [%u], %016Lx != %016Lx\n",
1364 curr->lockdep_depth, i,
1365 (unsigned long long)chain_key,
1366 (unsigned long long)curr->curr_chain_key);
1367 WARN_ON(1);
1368 }
1369#endif
1370}
1371
1372#ifdef CONFIG_TRACE_IRQFLAGS
1373
1374/*
1375 * print irq inversion bug:
1376 */
1377static int
1378print_irq_inversion_bug(struct task_struct *curr, struct lock_class *other,
1379 struct held_lock *this, int forwards,
1380 const char *irqclass)
1381{
1382 __raw_spin_unlock(&hash_lock);
1383 debug_locks_off();
1384 if (debug_locks_silent)
1385 return 0;
1386
1387 printk("\n=========================================================\n");
1388 printk( "[ INFO: possible irq lock inversion dependency detected ]\n");
1389 print_kernel_version();
1390 printk( "---------------------------------------------------------\n");
1391 printk("%s/%d just changed the state of lock:\n",
1392 curr->comm, curr->pid);
1393 print_lock(this);
1394 if (forwards)
1395 printk("but this lock took another, %s-irq-unsafe lock in the past:\n", irqclass);
1396 else
1397 printk("but this lock was taken by another, %s-irq-safe lock in the past:\n", irqclass);
1398 print_lock_name(other);
1399 printk("\n\nand interrupts could create inverse lock ordering between them.\n\n");
1400
1401 printk("\nother info that might help us debug this:\n");
1402 lockdep_print_held_locks(curr);
1403
1404 printk("\nthe first lock's dependencies:\n");
1405 print_lock_dependencies(this->class, 0);
1406
1407 printk("\nthe second lock's dependencies:\n");
1408 print_lock_dependencies(other, 0);
1409
1410 printk("\nstack backtrace:\n");
1411 dump_stack();
1412
1413 return 0;
1414}
1415
1416/*
1417 * Prove that in the forwards-direction subgraph starting at <this>
1418 * there is no lock matching <mask>:
1419 */
1420static int
1421check_usage_forwards(struct task_struct *curr, struct held_lock *this,
1422 enum lock_usage_bit bit, const char *irqclass)
1423{
1424 int ret;
1425
1426 find_usage_bit = bit;
1427 /* fills in <forwards_match> */
1428 ret = find_usage_forwards(this->class, 0);
1429 if (!ret || ret == 1)
1430 return ret;
1431
1432 return print_irq_inversion_bug(curr, forwards_match, this, 1, irqclass);
1433}
1434
1435/*
1436 * Prove that in the backwards-direction subgraph starting at <this>
1437 * there is no lock matching <mask>:
1438 */
1439static int
1440check_usage_backwards(struct task_struct *curr, struct held_lock *this,
1441 enum lock_usage_bit bit, const char *irqclass)
1442{
1443 int ret;
1444
1445 find_usage_bit = bit;
1446 /* fills in <backwards_match> */
1447 ret = find_usage_backwards(this->class, 0);
1448 if (!ret || ret == 1)
1449 return ret;
1450
1451 return print_irq_inversion_bug(curr, backwards_match, this, 0, irqclass);
1452}
1453
1454static inline void print_irqtrace_events(struct task_struct *curr)
1455{
1456 printk("irq event stamp: %u\n", curr->irq_events);
1457 printk("hardirqs last enabled at (%u): ", curr->hardirq_enable_event);
1458 print_ip_sym(curr->hardirq_enable_ip);
1459 printk("hardirqs last disabled at (%u): ", curr->hardirq_disable_event);
1460 print_ip_sym(curr->hardirq_disable_ip);
1461 printk("softirqs last enabled at (%u): ", curr->softirq_enable_event);
1462 print_ip_sym(curr->softirq_enable_ip);
1463 printk("softirqs last disabled at (%u): ", curr->softirq_disable_event);
1464 print_ip_sym(curr->softirq_disable_ip);
1465}
1466
1467#else
1468static inline void print_irqtrace_events(struct task_struct *curr)
1469{
1470}
1471#endif
1472
1473static int
1474print_usage_bug(struct task_struct *curr, struct held_lock *this,
1475 enum lock_usage_bit prev_bit, enum lock_usage_bit new_bit)
1476{
1477 __raw_spin_unlock(&hash_lock);
1478 debug_locks_off();
1479 if (debug_locks_silent)
1480 return 0;
1481
1482 printk("\n=================================\n");
1483 printk( "[ INFO: inconsistent lock state ]\n");
1484 print_kernel_version();
1485 printk( "---------------------------------\n");
1486
1487 printk("inconsistent {%s} -> {%s} usage.\n",
1488 usage_str[prev_bit], usage_str[new_bit]);
1489
1490 printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] takes:\n",
1491 curr->comm, curr->pid,
1492 trace_hardirq_context(curr), hardirq_count() >> HARDIRQ_SHIFT,
1493 trace_softirq_context(curr), softirq_count() >> SOFTIRQ_SHIFT,
1494 trace_hardirqs_enabled(curr),
1495 trace_softirqs_enabled(curr));
1496 print_lock(this);
1497
1498 printk("{%s} state was registered at:\n", usage_str[prev_bit]);
1499 print_stack_trace(this->class->usage_traces + prev_bit, 1);
1500
1501 print_irqtrace_events(curr);
1502 printk("\nother info that might help us debug this:\n");
1503 lockdep_print_held_locks(curr);
1504
1505 printk("\nstack backtrace:\n");
1506 dump_stack();
1507
1508 return 0;
1509}
1510
1511/*
1512 * Print out an error if an invalid bit is set:
1513 */
1514static inline int
1515valid_state(struct task_struct *curr, struct held_lock *this,
1516 enum lock_usage_bit new_bit, enum lock_usage_bit bad_bit)
1517{
1518 if (unlikely(this->class->usage_mask & (1 << bad_bit)))
1519 return print_usage_bug(curr, this, bad_bit, new_bit);
1520 return 1;
1521}
1522
1523#define STRICT_READ_CHECKS 1
1524
1525/*
1526 * Mark a lock with a usage bit, and validate the state transition:
1527 */
1528static int mark_lock(struct task_struct *curr, struct held_lock *this,
1529 enum lock_usage_bit new_bit, unsigned long ip)
1530{
1531 unsigned int new_mask = 1 << new_bit, ret = 1;
1532
1533 /*
1534 * If already set then do not dirty the cacheline,
1535 * nor do any checks:
1536 */
1537 if (likely(this->class->usage_mask & new_mask))
1538 return 1;
1539
1540 __raw_spin_lock(&hash_lock);
1541 /*
1542 * Make sure we didn't race:
1543 */
1544 if (unlikely(this->class->usage_mask & new_mask)) {
1545 __raw_spin_unlock(&hash_lock);
1546 return 1;
1547 }
1548
1549 this->class->usage_mask |= new_mask;
1550
1551#ifdef CONFIG_TRACE_IRQFLAGS
1552 if (new_bit == LOCK_ENABLED_HARDIRQS ||
1553 new_bit == LOCK_ENABLED_HARDIRQS_READ)
1554 ip = curr->hardirq_enable_ip;
1555 else if (new_bit == LOCK_ENABLED_SOFTIRQS ||
1556 new_bit == LOCK_ENABLED_SOFTIRQS_READ)
1557 ip = curr->softirq_enable_ip;
1558#endif
1559 if (!save_trace(this->class->usage_traces + new_bit))
1560 return 0;
1561
1562 switch (new_bit) {
1563#ifdef CONFIG_TRACE_IRQFLAGS
1564 case LOCK_USED_IN_HARDIRQ:
1565 if (!valid_state(curr, this, new_bit, LOCK_ENABLED_HARDIRQS))
1566 return 0;
1567 if (!valid_state(curr, this, new_bit,
1568 LOCK_ENABLED_HARDIRQS_READ))
1569 return 0;
1570 /*
1571 * just marked it hardirq-safe, check that this lock
1572 * took no hardirq-unsafe lock in the past:
1573 */
1574 if (!check_usage_forwards(curr, this,
1575 LOCK_ENABLED_HARDIRQS, "hard"))
1576 return 0;
1577#if STRICT_READ_CHECKS
1578 /*
1579 * just marked it hardirq-safe, check that this lock
1580 * took no hardirq-unsafe-read lock in the past:
1581 */
1582 if (!check_usage_forwards(curr, this,
1583 LOCK_ENABLED_HARDIRQS_READ, "hard-read"))
1584 return 0;
1585#endif
1586 if (hardirq_verbose(this->class))
1587 ret = 2;
1588 break;
1589 case LOCK_USED_IN_SOFTIRQ:
1590 if (!valid_state(curr, this, new_bit, LOCK_ENABLED_SOFTIRQS))
1591 return 0;
1592 if (!valid_state(curr, this, new_bit,
1593 LOCK_ENABLED_SOFTIRQS_READ))
1594 return 0;
1595 /*
1596 * just marked it softirq-safe, check that this lock
1597 * took no softirq-unsafe lock in the past:
1598 */
1599 if (!check_usage_forwards(curr, this,
1600 LOCK_ENABLED_SOFTIRQS, "soft"))
1601 return 0;
1602#if STRICT_READ_CHECKS
1603 /*
1604 * just marked it softirq-safe, check that this lock
1605 * took no softirq-unsafe-read lock in the past:
1606 */
1607 if (!check_usage_forwards(curr, this,
1608 LOCK_ENABLED_SOFTIRQS_READ, "soft-read"))
1609 return 0;
1610#endif
1611 if (softirq_verbose(this->class))
1612 ret = 2;
1613 break;
1614 case LOCK_USED_IN_HARDIRQ_READ:
1615 if (!valid_state(curr, this, new_bit, LOCK_ENABLED_HARDIRQS))
1616 return 0;
1617 /*
1618 * just marked it hardirq-read-safe, check that this lock
1619 * took no hardirq-unsafe lock in the past:
1620 */
1621 if (!check_usage_forwards(curr, this,
1622 LOCK_ENABLED_HARDIRQS, "hard"))
1623 return 0;
1624 if (hardirq_verbose(this->class))
1625 ret = 2;
1626 break;
1627 case LOCK_USED_IN_SOFTIRQ_READ:
1628 if (!valid_state(curr, this, new_bit, LOCK_ENABLED_SOFTIRQS))
1629 return 0;
1630 /*
1631 * just marked it softirq-read-safe, check that this lock
1632 * took no softirq-unsafe lock in the past:
1633 */
1634 if (!check_usage_forwards(curr, this,
1635 LOCK_ENABLED_SOFTIRQS, "soft"))
1636 return 0;
1637 if (softirq_verbose(this->class))
1638 ret = 2;
1639 break;
1640 case LOCK_ENABLED_HARDIRQS:
1641 if (!valid_state(curr, this, new_bit, LOCK_USED_IN_HARDIRQ))
1642 return 0;
1643 if (!valid_state(curr, this, new_bit,
1644 LOCK_USED_IN_HARDIRQ_READ))
1645 return 0;
1646 /*
1647 * just marked it hardirq-unsafe, check that no hardirq-safe
1648 * lock in the system ever took it in the past:
1649 */
1650 if (!check_usage_backwards(curr, this,
1651 LOCK_USED_IN_HARDIRQ, "hard"))
1652 return 0;
1653#if STRICT_READ_CHECKS
1654 /*
1655 * just marked it hardirq-unsafe, check that no
1656 * hardirq-safe-read lock in the system ever took
1657 * it in the past:
1658 */
1659 if (!check_usage_backwards(curr, this,
1660 LOCK_USED_IN_HARDIRQ_READ, "hard-read"))
1661 return 0;
1662#endif
1663 if (hardirq_verbose(this->class))
1664 ret = 2;
1665 break;
1666 case LOCK_ENABLED_SOFTIRQS:
1667 if (!valid_state(curr, this, new_bit, LOCK_USED_IN_SOFTIRQ))
1668 return 0;
1669 if (!valid_state(curr, this, new_bit,
1670 LOCK_USED_IN_SOFTIRQ_READ))
1671 return 0;
1672 /*
1673 * just marked it softirq-unsafe, check that no softirq-safe
1674 * lock in the system ever took it in the past:
1675 */
1676 if (!check_usage_backwards(curr, this,
1677 LOCK_USED_IN_SOFTIRQ, "soft"))
1678 return 0;
1679#if STRICT_READ_CHECKS
1680 /*
1681 * just marked it softirq-unsafe, check that no
1682 * softirq-safe-read lock in the system ever took
1683 * it in the past:
1684 */
1685 if (!check_usage_backwards(curr, this,
1686 LOCK_USED_IN_SOFTIRQ_READ, "soft-read"))
1687 return 0;
1688#endif
1689 if (softirq_verbose(this->class))
1690 ret = 2;
1691 break;
1692 case LOCK_ENABLED_HARDIRQS_READ:
1693 if (!valid_state(curr, this, new_bit, LOCK_USED_IN_HARDIRQ))
1694 return 0;
1695#if STRICT_READ_CHECKS
1696 /*
1697 * just marked it hardirq-read-unsafe, check that no
1698 * hardirq-safe lock in the system ever took it in the past:
1699 */
1700 if (!check_usage_backwards(curr, this,
1701 LOCK_USED_IN_HARDIRQ, "hard"))
1702 return 0;
1703#endif
1704 if (hardirq_verbose(this->class))
1705 ret = 2;
1706 break;
1707 case LOCK_ENABLED_SOFTIRQS_READ:
1708 if (!valid_state(curr, this, new_bit, LOCK_USED_IN_SOFTIRQ))
1709 return 0;
1710#if STRICT_READ_CHECKS
1711 /*
1712 * just marked it softirq-read-unsafe, check that no
1713 * softirq-safe lock in the system ever took it in the past:
1714 */
1715 if (!check_usage_backwards(curr, this,
1716 LOCK_USED_IN_SOFTIRQ, "soft"))
1717 return 0;
1718#endif
1719 if (softirq_verbose(this->class))
1720 ret = 2;
1721 break;
1722#endif
1723 case LOCK_USED:
1724 /*
1725 * Add it to the global list of classes:
1726 */
1727 list_add_tail_rcu(&this->class->lock_entry, &all_lock_classes);
1728 debug_atomic_dec(&nr_unused_locks);
1729 break;
1730 default:
1731 debug_locks_off();
1732 WARN_ON(1);
1733 return 0;
1734 }
1735
1736 __raw_spin_unlock(&hash_lock);
1737
1738 /*
1739 * We must printk outside of the hash_lock:
1740 */
1741 if (ret == 2) {
1742 printk("\nmarked lock as {%s}:\n", usage_str[new_bit]);
1743 print_lock(this);
1744 print_irqtrace_events(curr);
1745 dump_stack();
1746 }
1747
1748 return ret;
1749}
1750
1751#ifdef CONFIG_TRACE_IRQFLAGS
1752/*
1753 * Mark all held locks with a usage bit:
1754 */
1755static int
1756mark_held_locks(struct task_struct *curr, int hardirq, unsigned long ip)
1757{
1758 enum lock_usage_bit usage_bit;
1759 struct held_lock *hlock;
1760 int i;
1761
1762 for (i = 0; i < curr->lockdep_depth; i++) {
1763 hlock = curr->held_locks + i;
1764
1765 if (hardirq) {
1766 if (hlock->read)
1767 usage_bit = LOCK_ENABLED_HARDIRQS_READ;
1768 else
1769 usage_bit = LOCK_ENABLED_HARDIRQS;
1770 } else {
1771 if (hlock->read)
1772 usage_bit = LOCK_ENABLED_SOFTIRQS_READ;
1773 else
1774 usage_bit = LOCK_ENABLED_SOFTIRQS;
1775 }
1776 if (!mark_lock(curr, hlock, usage_bit, ip))
1777 return 0;
1778 }
1779
1780 return 1;
1781}
1782
1783/*
1784 * Debugging helper: via this flag we know that we are in
1785 * 'early bootup code', and will warn about any invalid irqs-on event:
1786 */
1787static int early_boot_irqs_enabled;
1788
1789void early_boot_irqs_off(void)
1790{
1791 early_boot_irqs_enabled = 0;
1792}
1793
1794void early_boot_irqs_on(void)
1795{
1796 early_boot_irqs_enabled = 1;
1797}
1798
1799/*
1800 * Hardirqs will be enabled:
1801 */
1802void trace_hardirqs_on(void)
1803{
1804 struct task_struct *curr = current;
1805 unsigned long ip;
1806
1807 if (unlikely(!debug_locks || current->lockdep_recursion))
1808 return;
1809
1810 if (DEBUG_LOCKS_WARN_ON(unlikely(!early_boot_irqs_enabled)))
1811 return;
1812
1813 if (unlikely(curr->hardirqs_enabled)) {
1814 debug_atomic_inc(&redundant_hardirqs_on);
1815 return;
1816 }
1817 /* we'll do an OFF -> ON transition: */
1818 curr->hardirqs_enabled = 1;
1819 ip = (unsigned long) __builtin_return_address(0);
1820
1821 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
1822 return;
1823 if (DEBUG_LOCKS_WARN_ON(current->hardirq_context))
1824 return;
1825 /*
1826 * We are going to turn hardirqs on, so set the
1827 * usage bit for all held locks:
1828 */
1829 if (!mark_held_locks(curr, 1, ip))
1830 return;
1831 /*
1832 * If we have softirqs enabled, then set the usage
1833 * bit for all held locks. (disabled hardirqs prevented
1834 * this bit from being set before)
1835 */
1836 if (curr->softirqs_enabled)
1837 if (!mark_held_locks(curr, 0, ip))
1838 return;
1839
1840 curr->hardirq_enable_ip = ip;
1841 curr->hardirq_enable_event = ++curr->irq_events;
1842 debug_atomic_inc(&hardirqs_on_events);
1843}
1844
1845EXPORT_SYMBOL(trace_hardirqs_on);
1846
1847/*
1848 * Hardirqs were disabled:
1849 */
1850void trace_hardirqs_off(void)
1851{
1852 struct task_struct *curr = current;
1853
1854 if (unlikely(!debug_locks || current->lockdep_recursion))
1855 return;
1856
1857 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
1858 return;
1859
1860 if (curr->hardirqs_enabled) {
1861 /*
1862 * We have done an ON -> OFF transition:
1863 */
1864 curr->hardirqs_enabled = 0;
1865 curr->hardirq_disable_ip = _RET_IP_;
1866 curr->hardirq_disable_event = ++curr->irq_events;
1867 debug_atomic_inc(&hardirqs_off_events);
1868 } else
1869 debug_atomic_inc(&redundant_hardirqs_off);
1870}
1871
1872EXPORT_SYMBOL(trace_hardirqs_off);
1873
1874/*
1875 * Softirqs will be enabled:
1876 */
1877void trace_softirqs_on(unsigned long ip)
1878{
1879 struct task_struct *curr = current;
1880
1881 if (unlikely(!debug_locks))
1882 return;
1883
1884 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
1885 return;
1886
1887 if (curr->softirqs_enabled) {
1888 debug_atomic_inc(&redundant_softirqs_on);
1889 return;
1890 }
1891
1892 /*
1893 * We'll do an OFF -> ON transition:
1894 */
1895 curr->softirqs_enabled = 1;
1896 curr->softirq_enable_ip = ip;
1897 curr->softirq_enable_event = ++curr->irq_events;
1898 debug_atomic_inc(&softirqs_on_events);
1899 /*
1900 * We are going to turn softirqs on, so set the
1901 * usage bit for all held locks, if hardirqs are
1902 * enabled too:
1903 */
1904 if (curr->hardirqs_enabled)
1905 mark_held_locks(curr, 0, ip);
1906}
1907
1908/*
1909 * Softirqs were disabled:
1910 */
1911void trace_softirqs_off(unsigned long ip)
1912{
1913 struct task_struct *curr = current;
1914
1915 if (unlikely(!debug_locks))
1916 return;
1917
1918 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
1919 return;
1920
1921 if (curr->softirqs_enabled) {
1922 /*
1923 * We have done an ON -> OFF transition:
1924 */
1925 curr->softirqs_enabled = 0;
1926 curr->softirq_disable_ip = ip;
1927 curr->softirq_disable_event = ++curr->irq_events;
1928 debug_atomic_inc(&softirqs_off_events);
1929 DEBUG_LOCKS_WARN_ON(!softirq_count());
1930 } else
1931 debug_atomic_inc(&redundant_softirqs_off);
1932}
1933
1934#endif
1935
1936/*
1937 * Initialize a lock instance's lock-class mapping info:
1938 */
1939void lockdep_init_map(struct lockdep_map *lock, const char *name,
1940 struct lock_class_key *key)
1941{
1942 if (unlikely(!debug_locks))
1943 return;
1944
1945 if (DEBUG_LOCKS_WARN_ON(!key))
1946 return;
1947 if (DEBUG_LOCKS_WARN_ON(!name))
1948 return;
1949 /*
1950 * Sanity check, the lock-class key must be persistent:
1951 */
1952 if (!static_obj(key)) {
1953 printk("BUG: key %p not in .data!\n", key);
1954 DEBUG_LOCKS_WARN_ON(1);
1955 return;
1956 }
1957 lock->name = name;
1958 lock->key = key;
1959 lock->class_cache = NULL;
1960}
1961
1962EXPORT_SYMBOL_GPL(lockdep_init_map);
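
A hedged usage sketch (editorial, not part of the patch): open-coded initialization of a lockdep_map with a persistent key, mirroring what the comments above describe wrappers like spin_lock_init()/mutex_init() as doing; my_subsys_lock_key and my_subsys_init_map() are invented names.

static struct lock_class_key my_subsys_lock_key;	/* static, so it passes static_obj() */

static void my_subsys_init_map(struct lockdep_map *map)
{
	lockdep_init_map(map, "my_subsys_lock", &my_subsys_lock_key);
}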
1963
1964/*
1965 * This gets called for every mutex_lock*()/spin_lock*() operation.
1966 * We maintain the dependency maps and validate the locking attempt:
1967 */
1968static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
1969 int trylock, int read, int check, int hardirqs_off,
1970 unsigned long ip)
1971{
1972 struct task_struct *curr = current;
1973 struct lock_class *class = NULL;
1974 struct held_lock *hlock;
1975 unsigned int depth, id;
1976 int chain_head = 0;
1977 u64 chain_key;
1978
1979 if (unlikely(!debug_locks))
1980 return 0;
1981
1982 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
1983 return 0;
1984
1985 if (unlikely(subclass >= MAX_LOCKDEP_SUBCLASSES)) {
1986 debug_locks_off();
1987 printk("BUG: MAX_LOCKDEP_SUBCLASSES too low!\n");
1988 printk("turning off the locking correctness validator.\n");
1989 return 0;
1990 }
1991
1992 if (!subclass)
1993 class = lock->class_cache;
1994 /*
1995 * Not cached yet or subclass?
1996 */
1997 if (unlikely(!class)) {
1998 class = register_lock_class(lock, subclass);
1999 if (!class)
2000 return 0;
2001 }
2002 debug_atomic_inc((atomic_t *)&class->ops);
2003 if (very_verbose(class)) {
2004 printk("\nacquire class [%p] %s", class->key, class->name);
2005 if (class->name_version > 1)
2006 printk("#%d", class->name_version);
2007 printk("\n");
2008 dump_stack();
2009 }
2010
2011 /*
2012 * Add the lock to the list of currently held locks.
2013 * (we don't increase the depth just yet, up until the
2014 * dependency checks are done)
2015 */
2016 depth = curr->lockdep_depth;
2017 if (DEBUG_LOCKS_WARN_ON(depth >= MAX_LOCK_DEPTH))
2018 return 0;
2019
2020 hlock = curr->held_locks + depth;
2021
2022 hlock->class = class;
2023 hlock->acquire_ip = ip;
2024 hlock->instance = lock;
2025 hlock->trylock = trylock;
2026 hlock->read = read;
2027 hlock->check = check;
2028 hlock->hardirqs_off = hardirqs_off;
2029
2030 if (check != 2)
2031 goto out_calc_hash;
2032#ifdef CONFIG_TRACE_IRQFLAGS
2033 /*
2034 * If non-trylock use in a hardirq or softirq context, then
2035 * mark the lock as used in these contexts:
2036 */
2037 if (!trylock) {
2038 if (read) {
2039 if (curr->hardirq_context)
2040 if (!mark_lock(curr, hlock,
2041 LOCK_USED_IN_HARDIRQ_READ, ip))
2042 return 0;
2043 if (curr->softirq_context)
2044 if (!mark_lock(curr, hlock,
2045 LOCK_USED_IN_SOFTIRQ_READ, ip))
2046 return 0;
2047 } else {
2048 if (curr->hardirq_context)
2049 if (!mark_lock(curr, hlock, LOCK_USED_IN_HARDIRQ, ip))
2050 return 0;
2051 if (curr->softirq_context)
2052 if (!mark_lock(curr, hlock, LOCK_USED_IN_SOFTIRQ, ip))
2053 return 0;
2054 }
2055 }
2056 if (!hardirqs_off) {
2057 if (read) {
2058 if (!mark_lock(curr, hlock,
2059 LOCK_ENABLED_HARDIRQS_READ, ip))
2060 return 0;
2061 if (curr->softirqs_enabled)
2062 if (!mark_lock(curr, hlock,
2063 LOCK_ENABLED_SOFTIRQS_READ, ip))
2064 return 0;
2065 } else {
2066 if (!mark_lock(curr, hlock,
2067 LOCK_ENABLED_HARDIRQS, ip))
2068 return 0;
2069 if (curr->softirqs_enabled)
2070 if (!mark_lock(curr, hlock,
2071 LOCK_ENABLED_SOFTIRQS, ip))
2072 return 0;
2073 }
2074 }
2075#endif
2076 /* mark it as used: */
2077 if (!mark_lock(curr, hlock, LOCK_USED, ip))
2078 return 0;
2079out_calc_hash:
2080 /*
2081 * Calculate the chain hash: it's the combined hash of all the
2082 * lock keys along the dependency chain. We save the hash value
2083 * at every step so that we can get the current hash easily
2084 * after unlock. The chain hash is then used to cache dependency
2085 * results.
2086 *
2087 * The 'key ID' (the class index) is what drives the hash, because
2088 * it is more compact than class->key.
2089 */
2090 id = class - lock_classes;
2091 if (DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS))
2092 return 0;
2093
2094 chain_key = curr->curr_chain_key;
2095 if (!depth) {
2096 if (DEBUG_LOCKS_WARN_ON(chain_key != 0))
2097 return 0;
2098 chain_head = 1;
2099 }
2100
2101 hlock->prev_chain_key = chain_key;
2102
2103#ifdef CONFIG_TRACE_IRQFLAGS
2104 /*
2105 * Keep track of points where we cross into an interrupt context:
2106 */
2107 hlock->irq_context = 2*(curr->hardirq_context ? 1 : 0) +
2108 curr->softirq_context;
2109 if (depth) {
2110 struct held_lock *prev_hlock;
2111
2112 prev_hlock = curr->held_locks + depth-1;
2113 /*
2114 * If we cross into another context, reset the
2115 * hash key (this also prevents the checking and the
2116 * adding of the dependency to 'prev'):
2117 */
2118 if (prev_hlock->irq_context != hlock->irq_context) {
2119 chain_key = 0;
2120 chain_head = 1;
2121 }
2122 }
2123#endif
2124 chain_key = iterate_chain_key(chain_key, id);
2125 curr->curr_chain_key = chain_key;
2126
2127 /*
2128 * Trylock needs to maintain the stack of held locks, but it
2129 * does not add new dependencies, because trylock can be done
2130 * in any order.
2131 *
2132 * We look up the chain_key and do the O(N^2) check and update of
2133 * the dependencies only if this is a new dependency chain.
2134 * (If lookup_chain_cache() returns with 1 it acquires
2135 * hash_lock for us)
2136 */
2137 if (!trylock && (check == 2) && lookup_chain_cache(chain_key)) {
2138 /*
2139 * Check whether last held lock:
2140 *
2141 * - is irq-safe, if this lock is irq-unsafe
2142 * - is softirq-safe, if this lock is hardirq-unsafe
2143 *
2144 * And check whether the new lock's dependency graph
2145 * could lead back to the previous lock.
2146 *
2147 * any of these scenarios could lead to a deadlock. If all
2148 * validations pass, we add the new dependencies below.
2149 */
2150 int ret = check_deadlock(curr, hlock, lock, read);
2151
2152 if (!ret)
2153 return 0;
2154 /*
2155 * Mark recursive read, as we jump over it when
2156 * building dependencies (just like we jump over
2157 * trylock entries):
2158 */
2159 if (ret == 2)
2160 hlock->read = 2;
2161 /*
2162 * Add dependency only if this lock is not the head
2163 * of the chain, and if it's not a secondary read-lock:
2164 */
2165 if (!chain_head && ret != 2)
2166 if (!check_prevs_add(curr, hlock))
2167 return 0;
2168 __raw_spin_unlock(&hash_lock);
2169 }
2170 curr->lockdep_depth++;
2171 check_chain_key(curr);
2172 if (unlikely(curr->lockdep_depth >= MAX_LOCK_DEPTH)) {
2173 debug_locks_off();
2174 printk("BUG: MAX_LOCK_DEPTH too low!\n");
2175 printk("turning off the locking correctness validator.\n");
2176 return 0;
2177 }
2178 if (unlikely(curr->lockdep_depth > max_lockdep_depth))
2179 max_lockdep_depth = curr->lockdep_depth;
2180
2181 return 1;
2182}
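The chain-key bookkeeping above is easier to follow in isolation. Below is a standalone sketch in plain C, not kernel code: the mixer is an arbitrary stand-in for iterate_chain_key(), and the saved per-depth keys play the role of hlock->prev_chain_key.

/* Standalone sketch: a rolling "chain key" over a stack of lock-class
 * IDs.  The mix function is an arbitrary 64-bit mixer chosen for
 * illustration, not the kernel's iterate_chain_key(). */
#include <stdio.h>
#include <stdint.h>

#define MAX_DEPTH 48

static uint64_t mix_chain_key(uint64_t key, unsigned int id)
{
        key ^= id + 0x9e3779b97f4a7c15ULL;
        key ^= key >> 33;
        key *= 0xff51afd7ed558ccdULL;
        key ^= key >> 29;
        return key;
}

int main(void)
{
        unsigned int held[MAX_DEPTH] = { 3, 17, 42 };   /* class IDs */
        uint64_t prev_key[MAX_DEPTH], chain_key = 0;
        int depth = 3, i;

        /* acquire: remember the previous key, fold in the new class ID */
        for (i = 0; i < depth; i++) {
                prev_key[i] = chain_key;
                chain_key = mix_chain_key(chain_key, held[i]);
        }
        printf("chain key after %d locks: %016llx\n",
               depth, (unsigned long long)chain_key);

        /* release of the top lock restores the saved key */
        chain_key = prev_key[depth - 1];
        printf("chain key after release:  %016llx\n",
               (unsigned long long)chain_key);
        return 0;
}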
2183
2184static int
2185print_unlock_inbalance_bug(struct task_struct *curr, struct lockdep_map *lock,
2186 unsigned long ip)
2187{
2188 if (!debug_locks_off())
2189 return 0;
2190 if (debug_locks_silent)
2191 return 0;
2192
2193 printk("\n=====================================\n");
2194 printk( "[ BUG: bad unlock balance detected! ]\n");
2195 printk( "-------------------------------------\n");
2196 printk("%s/%d is trying to release lock (",
2197 curr->comm, curr->pid);
2198 print_lockdep_cache(lock);
2199 printk(") at:\n");
2200 print_ip_sym(ip);
2201 printk("but there are no more locks to release!\n");
2202 printk("\nother info that might help us debug this:\n");
2203 lockdep_print_held_locks(curr);
2204
2205 printk("\nstack backtrace:\n");
2206 dump_stack();
2207
2208 return 0;
2209}
2210
2211/*
2212 * Common debugging checks for both nested and non-nested unlock:
2213 */
2214static int check_unlock(struct task_struct *curr, struct lockdep_map *lock,
2215 unsigned long ip)
2216{
2217 if (unlikely(!debug_locks))
2218 return 0;
2219 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
2220 return 0;
2221
2222 if (curr->lockdep_depth <= 0)
2223 return print_unlock_inbalance_bug(curr, lock, ip);
2224
2225 return 1;
2226}
2227
2228/*
2229 * Remove the lock from the list of currently held locks in a
2230 * potentially non-nested (out of order) manner. This is a
2231 * relatively rare operation, as all the unlock APIs default
2232 * to nested mode (which uses lock_release()):
2233 */
2234static int
2235lock_release_non_nested(struct task_struct *curr,
2236 struct lockdep_map *lock, unsigned long ip)
2237{
2238 struct held_lock *hlock, *prev_hlock;
2239 unsigned int depth;
2240 int i;
2241
2242 /*
2243 * Check whether the lock exists in the current stack
2244 * of held locks:
2245 */
2246 depth = curr->lockdep_depth;
2247 if (DEBUG_LOCKS_WARN_ON(!depth))
2248 return 0;
2249
2250 prev_hlock = NULL;
2251 for (i = depth-1; i >= 0; i--) {
2252 hlock = curr->held_locks + i;
2253 /*
2254 * We must not cross into another context:
2255 */
2256 if (prev_hlock && prev_hlock->irq_context != hlock->irq_context)
2257 break;
2258 if (hlock->instance == lock)
2259 goto found_it;
2260 prev_hlock = hlock;
2261 }
2262 return print_unlock_inbalance_bug(curr, lock, ip);
2263
2264found_it:
2265 /*
2266 * We have the right lock to unlock, 'hlock' points to it.
2267 * Now we remove it from the stack, and add back the other
2268 * entries (if any), recalculating the hash along the way:
2269 */
2270 curr->lockdep_depth = i;
2271 curr->curr_chain_key = hlock->prev_chain_key;
2272
2273 for (i++; i < depth; i++) {
2274 hlock = curr->held_locks + i;
2275 if (!__lock_acquire(hlock->instance,
2276 hlock->class->subclass, hlock->trylock,
2277 hlock->read, hlock->check, hlock->hardirqs_off,
2278 hlock->acquire_ip))
2279 return 0;
2280 }
2281
2282 if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth - 1))
2283 return 0;
2284 return 1;
2285}
2286
2287/*
2288 * Remove the lock from the list of currently held locks - this gets
2289 * called on mutex_unlock()/spin_unlock*() (or on a failed
2290 * mutex_lock_interruptible()). This is done for unlocks that nest
2291 * perfectly. (i.e. the current top of the lock-stack is unlocked)
2292 */
2293static int lock_release_nested(struct task_struct *curr,
2294 struct lockdep_map *lock, unsigned long ip)
2295{
2296 struct held_lock *hlock;
2297 unsigned int depth;
2298
2299 /*
2300 * Pop off the top of the lock stack:
2301 */
2302 depth = curr->lockdep_depth - 1;
2303 hlock = curr->held_locks + depth;
2304
2305 /*
2306 * Is the unlock non-nested:
2307 */
2308 if (hlock->instance != lock)
2309 return lock_release_non_nested(curr, lock, ip);
2310 curr->lockdep_depth--;
2311
2312 if (DEBUG_LOCKS_WARN_ON(!depth && (hlock->prev_chain_key != 0)))
2313 return 0;
2314
2315 curr->curr_chain_key = hlock->prev_chain_key;
2316
2317#ifdef CONFIG_DEBUG_LOCKDEP
2318 hlock->prev_chain_key = 0;
2319 hlock->class = NULL;
2320 hlock->acquire_ip = 0;
2321 hlock->irq_context = 0;
2322#endif
2323 return 1;
2324}
2325
2326/*
2327 * Remove the lock from the list of currently held locks - this gets
2328 * called on mutex_unlock()/spin_unlock*() (or on a failed
2329 * mutex_lock_interruptible()). Handles both perfectly nested unlocks
2330 * and out-of-order (non-nested) unlocks.
2331 */
2332static void
2333__lock_release(struct lockdep_map *lock, int nested, unsigned long ip)
2334{
2335 struct task_struct *curr = current;
2336
2337 if (!check_unlock(curr, lock, ip))
2338 return;
2339
2340 if (nested) {
2341 if (!lock_release_nested(curr, lock, ip))
2342 return;
2343 } else {
2344 if (!lock_release_non_nested(curr, lock, ip))
2345 return;
2346 }
2347
2348 check_chain_key(curr);
2349}
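For reference, a minimal userspace sketch of the non-nested case handled by lock_release_non_nested() above: pop the stack back to the released entry, then re-add everything that sat above it. A plain array stands in for held_locks, and the re-add is a simple push rather than the kernel's __lock_acquire().

/* Illustrative sketch, plain C: releasing a lock that is not on top of
 * the held-lock stack means popping back to it and re-pushing the
 * entries that were above it. */
#include <stdio.h>

#define MAX_DEPTH 8

static int stack[MAX_DEPTH];
static int depth;

static void push(int id)
{
        stack[depth++] = id;
}

static int release(int id)
{
        int saved[MAX_DEPTH], nsaved = 0, i;

        for (i = depth - 1; i >= 0 && stack[i] != id; i--)
                saved[nsaved++] = stack[i];     /* locks held above 'id' */
        if (i < 0)
                return -1;                      /* unlock imbalance */
        depth = i;                              /* pop up to and incl. 'id' */
        while (nsaved--)
                push(saved[nsaved]);            /* re-push in original order */
        return 0;
}

int main(void)
{
        push(1); push(2); push(3);
        release(2);                             /* non-nested release */
        for (int i = 0; i < depth; i++)
                printf("%d ", stack[i]);        /* prints: 1 3 */
        printf("\n");
        return 0;
}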
2350
2351/*
2352 * Check whether we follow the irq-flags state precisely:
2353 */
2354static void check_flags(unsigned long flags)
2355{
2356#if defined(CONFIG_DEBUG_LOCKDEP) && defined(CONFIG_TRACE_IRQFLAGS)
2357 if (!debug_locks)
2358 return;
2359
2360 if (irqs_disabled_flags(flags))
2361 DEBUG_LOCKS_WARN_ON(current->hardirqs_enabled);
2362 else
2363 DEBUG_LOCKS_WARN_ON(!current->hardirqs_enabled);
2364
2365 /*
2366 * We don't accurately track softirq state in e.g.
2367 * hardirq contexts (such as on 4KSTACKS), so only
2368 * check if not in hardirq contexts:
2369 */
2370 if (!hardirq_count()) {
2371 if (softirq_count())
2372 DEBUG_LOCKS_WARN_ON(current->softirqs_enabled);
2373 else
2374 DEBUG_LOCKS_WARN_ON(!current->softirqs_enabled);
2375 }
2376
2377 if (!debug_locks)
2378 print_irqtrace_events(current);
2379#endif
2380}
2381
2382/*
2383 * We are not always called with irqs disabled - do that here,
2384 * and also avoid lockdep recursion:
2385 */
2386void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
2387 int trylock, int read, int check, unsigned long ip)
2388{
2389 unsigned long flags;
2390
2391 if (unlikely(current->lockdep_recursion))
2392 return;
2393
2394 raw_local_irq_save(flags);
2395 check_flags(flags);
2396
2397 current->lockdep_recursion = 1;
2398 __lock_acquire(lock, subclass, trylock, read, check,
2399 irqs_disabled_flags(flags), ip);
2400 current->lockdep_recursion = 0;
2401 raw_local_irq_restore(flags);
2402}
2403
2404EXPORT_SYMBOL_GPL(lock_acquire);
2405
2406void lock_release(struct lockdep_map *lock, int nested, unsigned long ip)
2407{
2408 unsigned long flags;
2409
2410 if (unlikely(current->lockdep_recursion))
2411 return;
2412
2413 raw_local_irq_save(flags);
2414 check_flags(flags);
2415 current->lockdep_recursion = 1;
2416 __lock_release(lock, nested, ip);
2417 current->lockdep_recursion = 0;
2418 raw_local_irq_restore(flags);
2419}
2420
2421EXPORT_SYMBOL_GPL(lock_release);
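A hedged userspace analogue of the recursion guard used by lock_acquire() and lock_release(): instrumentation that may itself take locks sets a per-thread flag so nested calls bail out instead of re-entering the validator. The __thread flag below is an assumed stand-in for current->lockdep_recursion.

/* Minimal sketch of a re-entrancy guard for instrumentation code. */
#include <stdio.h>

static __thread int in_validator;       /* assumption: per-thread flag */

static void validate(const char *what)
{
        if (in_validator)
                return;                 /* already inside: do nothing */
        in_validator = 1;
        printf("validating %s\n", what);/* this work may itself take locks */
        in_validator = 0;
}

int main(void)
{
        validate("lock A");
        return 0;
}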
2422
2423/*
2424 * Used by the testsuite, sanitize the validator state
2425 * after a simulated failure:
2426 */
2427
2428void lockdep_reset(void)
2429{
2430 unsigned long flags;
2431
2432 raw_local_irq_save(flags);
2433 current->curr_chain_key = 0;
2434 current->lockdep_depth = 0;
2435 current->lockdep_recursion = 0;
2436 memset(current->held_locks, 0, MAX_LOCK_DEPTH*sizeof(struct held_lock));
2437 nr_hardirq_chains = 0;
2438 nr_softirq_chains = 0;
2439 nr_process_chains = 0;
2440 debug_locks = 1;
2441 raw_local_irq_restore(flags);
2442}
2443
2444static void zap_class(struct lock_class *class)
2445{
2446 int i;
2447
2448 /*
2449 * Remove all dependencies this lock is
2450 * involved in:
2451 */
2452 for (i = 0; i < nr_list_entries; i++) {
2453 if (list_entries[i].class == class)
2454 list_del_rcu(&list_entries[i].entry);
2455 }
2456 /*
2457 * Unhash the class and remove it from the all_lock_classes list:
2458 */
2459 list_del_rcu(&class->hash_entry);
2460 list_del_rcu(&class->lock_entry);
2461
2462}
2463
2464static inline int within(void *addr, void *start, unsigned long size)
2465{
2466 return addr >= start && addr < start + size;
2467}
2468
2469void lockdep_free_key_range(void *start, unsigned long size)
2470{
2471 struct lock_class *class, *next;
2472 struct list_head *head;
2473 unsigned long flags;
2474 int i;
2475
2476 raw_local_irq_save(flags);
2477 __raw_spin_lock(&hash_lock);
2478
2479 /*
2480 * Unhash all classes that were created by this module:
2481 */
2482 for (i = 0; i < CLASSHASH_SIZE; i++) {
2483 head = classhash_table + i;
2484 if (list_empty(head))
2485 continue;
2486 list_for_each_entry_safe(class, next, head, hash_entry)
2487 if (within(class->key, start, size))
2488 zap_class(class);
2489 }
2490
2491 __raw_spin_unlock(&hash_lock);
2492 raw_local_irq_restore(flags);
2493}
2494
2495void lockdep_reset_lock(struct lockdep_map *lock)
2496{
2497 struct lock_class *class, *next;
2498 struct list_head *head;
2499 unsigned long flags;
2500 int i, j;
2501
2502 raw_local_irq_save(flags);
2503
2504 /*
2505 * Remove all classes this lock might have:
2506 */
2507 for (j = 0; j < MAX_LOCKDEP_SUBCLASSES; j++) {
2508 /*
2509 * If the class exists we look it up and zap it:
2510 */
2511 class = look_up_lock_class(lock, j);
2512 if (class)
2513 zap_class(class);
2514 }
2515 /*
2516 * Debug check: in the end all mapped classes should
2517 * be gone.
2518 */
2519 __raw_spin_lock(&hash_lock);
2520 for (i = 0; i < CLASSHASH_SIZE; i++) {
2521 head = classhash_table + i;
2522 if (list_empty(head))
2523 continue;
2524 list_for_each_entry_safe(class, next, head, hash_entry) {
2525 if (unlikely(class == lock->class_cache)) {
2526 __raw_spin_unlock(&hash_lock);
2527 DEBUG_LOCKS_WARN_ON(1);
2528 goto out_restore;
2529 }
2530 }
2531 }
2532 __raw_spin_unlock(&hash_lock);
2533
2534out_restore:
2535 raw_local_irq_restore(flags);
2536}
2537
2538void __init lockdep_init(void)
2539{
2540 int i;
2541
2542 /*
2543 * Some architectures have their own start_kernel()
2544 * code which calls lockdep_init(), while we also
2545 * call lockdep_init() from the start_kernel() itself,
2546 * and we want to initialize the hashes only once:
2547 */
2548 if (lockdep_initialized)
2549 return;
2550
2551 for (i = 0; i < CLASSHASH_SIZE; i++)
2552 INIT_LIST_HEAD(classhash_table + i);
2553
2554 for (i = 0; i < CHAINHASH_SIZE; i++)
2555 INIT_LIST_HEAD(chainhash_table + i);
2556
2557 lockdep_initialized = 1;
2558}
2559
2560void __init lockdep_info(void)
2561{
2562 printk("Lock dependency validator: Copyright (c) 2006 Red Hat, Inc., Ingo Molnar\n");
2563
2564 printk("... MAX_LOCKDEP_SUBCLASSES: %lu\n", MAX_LOCKDEP_SUBCLASSES);
2565 printk("... MAX_LOCK_DEPTH: %lu\n", MAX_LOCK_DEPTH);
2566 printk("... MAX_LOCKDEP_KEYS: %lu\n", MAX_LOCKDEP_KEYS);
2567 printk("... CLASSHASH_SIZE: %lu\n", CLASSHASH_SIZE);
2568 printk("... MAX_LOCKDEP_ENTRIES: %lu\n", MAX_LOCKDEP_ENTRIES);
2569 printk("... MAX_LOCKDEP_CHAINS: %lu\n", MAX_LOCKDEP_CHAINS);
2570 printk("... CHAINHASH_SIZE: %lu\n", CHAINHASH_SIZE);
2571
2572 printk(" memory used by lock dependency info: %lu kB\n",
2573 (sizeof(struct lock_class) * MAX_LOCKDEP_KEYS +
2574 sizeof(struct list_head) * CLASSHASH_SIZE +
2575 sizeof(struct lock_list) * MAX_LOCKDEP_ENTRIES +
2576 sizeof(struct lock_chain) * MAX_LOCKDEP_CHAINS +
2577 sizeof(struct list_head) * CHAINHASH_SIZE) / 1024);
2578
2579 printk(" per task-struct memory footprint: %lu bytes\n",
2580 sizeof(struct held_lock) * MAX_LOCK_DEPTH);
2581
2582#ifdef CONFIG_DEBUG_LOCKDEP
2583 if (lockdep_init_error)
2584 printk("WARNING: lockdep init error! Arch code didn't call lockdep_init() early enough?\n");
2585#endif
2586}
2587
2588static inline int in_range(const void *start, const void *addr, const void *end)
2589{
2590 return addr >= start && addr <= end;
2591}
2592
2593static void
2594print_freed_lock_bug(struct task_struct *curr, const void *mem_from,
2595 const void *mem_to, struct held_lock *hlock)
2596{
2597 if (!debug_locks_off())
2598 return;
2599 if (debug_locks_silent)
2600 return;
2601
2602 printk("\n=========================\n");
2603 printk( "[ BUG: held lock freed! ]\n");
2604 printk( "-------------------------\n");
2605 printk("%s/%d is freeing memory %p-%p, with a lock still held there!\n",
2606 curr->comm, curr->pid, mem_from, mem_to-1);
2607 print_lock(hlock);
2608 lockdep_print_held_locks(curr);
2609
2610 printk("\nstack backtrace:\n");
2611 dump_stack();
2612}
2613
2614/*
2615 * Called when kernel memory is freed (or unmapped), or if a lock
2616 * is destroyed or reinitialized - this code checks whether there is
2617 * any held lock in the memory range of <from> to <to>:
2618 */
2619void debug_check_no_locks_freed(const void *mem_from, unsigned long mem_len)
2620{
2621 const void *mem_to = mem_from + mem_len, *lock_from, *lock_to;
2622 struct task_struct *curr = current;
2623 struct held_lock *hlock;
2624 unsigned long flags;
2625 int i;
2626
2627 if (unlikely(!debug_locks))
2628 return;
2629
2630 local_irq_save(flags);
2631 for (i = 0; i < curr->lockdep_depth; i++) {
2632 hlock = curr->held_locks + i;
2633
2634 lock_from = (void *)hlock->instance;
2635 lock_to = (void *)(hlock->instance + 1);
2636
2637 if (!in_range(mem_from, lock_from, mem_to) &&
2638 !in_range(mem_from, lock_to, mem_to))
2639 continue;
2640
2641 print_freed_lock_bug(curr, mem_from, mem_to, hlock);
2642 break;
2643 }
2644 local_irq_restore(flags);
2645}
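A small standalone sketch of the overlap test used by debug_check_no_locks_freed(): a held lock intersects the freed range if either its start or its end address falls inside it. Plain C, with a local buffer standing in for kernel memory.

/* Standalone sketch of the range test above: report a lock whose start
 * or end address lies inside the freed region. */
#include <stdio.h>

static int in_range(const char *start, const char *addr, const char *end)
{
        return addr >= start && addr <= end;
}

int main(void)
{
        static char heap[64];
        char *mem_from = heap + 16, *mem_to = heap + 48;    /* freed region */
        char *lock_from = heap + 40, *lock_to = heap + 44;  /* a held lock */

        if (in_range(mem_from, lock_from, mem_to) ||
            in_range(mem_from, lock_to, mem_to))
                printf("BUG: held lock inside freed range\n");
        return 0;
}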
2646
2647static void print_held_locks_bug(struct task_struct *curr)
2648{
2649 if (!debug_locks_off())
2650 return;
2651 if (debug_locks_silent)
2652 return;
2653
2654 printk("\n=====================================\n");
2655 printk( "[ BUG: lock held at task exit time! ]\n");
2656 printk( "-------------------------------------\n");
2657 printk("%s/%d is exiting with locks still held!\n",
2658 curr->comm, curr->pid);
2659 lockdep_print_held_locks(curr);
2660
2661 printk("\nstack backtrace:\n");
2662 dump_stack();
2663}
2664
2665void debug_check_no_locks_held(struct task_struct *task)
2666{
2667 if (unlikely(task->lockdep_depth > 0))
2668 print_held_locks_bug(task);
2669}
2670
2671void debug_show_all_locks(void)
2672{
2673 struct task_struct *g, *p;
2674 int count = 10;
2675 int unlock = 1;
2676
2677 printk("\nShowing all locks held in the system:\n");
2678
2679 /*
2680 * Here we try to get the tasklist_lock as hard as possible,
2681 * if not successful after 2 seconds we ignore it (but keep
2682 * trying). This is to enable a debug printout even if a
2683 * tasklist_lock-holding task deadlocks or crashes.
2684 */
2685retry:
2686 if (!read_trylock(&tasklist_lock)) {
2687 if (count == 10)
2688 printk("hm, tasklist_lock locked, retrying... ");
2689 if (count) {
2690 count--;
2691 printk(" #%d", 10-count);
2692 mdelay(200);
2693 goto retry;
2694 }
2695 printk(" ignoring it.\n");
2696 unlock = 0;
2697 }
2698 if (count != 10)
2699 printk(" locked it.\n");
2700
2701 do_each_thread(g, p) {
2702 if (p->lockdep_depth)
2703 lockdep_print_held_locks(p);
2704 if (!unlock)
2705 if (read_trylock(&tasklist_lock))
2706 unlock = 1;
2707 } while_each_thread(g, p);
2708
2709 printk("\n");
2710 printk("=============================================\n\n");
2711
2712 if (unlock)
2713 read_unlock(&tasklist_lock);
2714}
2715
2716EXPORT_SYMBOL_GPL(debug_show_all_locks);
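The tasklist_lock retry loop above translates almost directly to userspace. A hedged sketch with a pthread rwlock standing in for tasklist_lock: try the read lock about ten times at 200ms intervals (~2 seconds total), then print without it rather than hang.

/* Sketch: best-effort read-lock with a bounded retry, so debug output
 * still appears even if the lock holder is wedged. */
#include <stdio.h>
#include <unistd.h>
#include <pthread.h>

static pthread_rwlock_t tasklist = PTHREAD_RWLOCK_INITIALIZER;

int main(void)
{
        int count = 10, locked = 1;

        while (pthread_rwlock_tryrdlock(&tasklist) != 0) {
                if (count-- == 0) {
                        locked = 0;             /* give up, keep printing */
                        break;
                }
                usleep(200 * 1000);             /* 200ms, ten tries = ~2s */
        }
        printf("printing debug state (%s)\n",
               locked ? "with the lock" : "without the lock");
        if (locked)
                pthread_rwlock_unlock(&tasklist);
        return 0;
}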
2717
2718void debug_show_held_locks(struct task_struct *task)
2719{
2720 lockdep_print_held_locks(task);
2721}
2722
2723EXPORT_SYMBOL_GPL(debug_show_held_locks);
2724
diff --git a/kernel/lockdep_internals.h b/kernel/lockdep_internals.h
new file mode 100644
index 000000000000..eab043c83bb2
--- /dev/null
+++ b/kernel/lockdep_internals.h
@@ -0,0 +1,78 @@
1/*
2 * kernel/lockdep_internals.h
3 *
4 * Runtime locking correctness validator
5 *
6 * lockdep subsystem internal functions and variables.
7 */
8
9/*
10 * MAX_LOCKDEP_ENTRIES is the maximum number of lock dependencies
11 * we track.
12 *
13 * We use the per-lock dependency maps in two ways: we grow them by adding
14 * every to-be-taken lock to each currently held lock's own dependency
15 * table (if it's not there yet), and we check them for lock-order
16 * conflicts and deadlocks.
17 */
18#define MAX_LOCKDEP_ENTRIES 8192UL
19
20#define MAX_LOCKDEP_KEYS_BITS 11
21#define MAX_LOCKDEP_KEYS (1UL << MAX_LOCKDEP_KEYS_BITS)
22
23#define MAX_LOCKDEP_CHAINS_BITS 13
24#define MAX_LOCKDEP_CHAINS (1UL << MAX_LOCKDEP_CHAINS_BITS)
25
26/*
27 * Stack-trace: tightly packed array of stack backtrace
28 * addresses. Protected by the hash_lock.
29 */
30#define MAX_STACK_TRACE_ENTRIES 262144UL
31
32extern struct list_head all_lock_classes;
33
34extern void
35get_usage_chars(struct lock_class *class, char *c1, char *c2, char *c3, char *c4);
36
37extern const char * __get_key_name(struct lockdep_subclass_key *key, char *str);
38
39extern unsigned long nr_lock_classes;
40extern unsigned long nr_list_entries;
41extern unsigned long nr_lock_chains;
42extern unsigned long nr_stack_trace_entries;
43
44extern unsigned int nr_hardirq_chains;
45extern unsigned int nr_softirq_chains;
46extern unsigned int nr_process_chains;
47extern unsigned int max_lockdep_depth;
48extern unsigned int max_recursion_depth;
49
50#ifdef CONFIG_DEBUG_LOCKDEP
51/*
52 * Various lockdep statistics:
53 */
54extern atomic_t chain_lookup_hits;
55extern atomic_t chain_lookup_misses;
56extern atomic_t hardirqs_on_events;
57extern atomic_t hardirqs_off_events;
58extern atomic_t redundant_hardirqs_on;
59extern atomic_t redundant_hardirqs_off;
60extern atomic_t softirqs_on_events;
61extern atomic_t softirqs_off_events;
62extern atomic_t redundant_softirqs_on;
63extern atomic_t redundant_softirqs_off;
64extern atomic_t nr_unused_locks;
65extern atomic_t nr_cyclic_checks;
66extern atomic_t nr_cyclic_check_recursions;
67extern atomic_t nr_find_usage_forwards_checks;
68extern atomic_t nr_find_usage_forwards_recursions;
69extern atomic_t nr_find_usage_backwards_checks;
70extern atomic_t nr_find_usage_backwards_recursions;
71# define debug_atomic_inc(ptr) atomic_inc(ptr)
72# define debug_atomic_dec(ptr) atomic_dec(ptr)
73# define debug_atomic_read(ptr) atomic_read(ptr)
74#else
75# define debug_atomic_inc(ptr) do { } while (0)
76# define debug_atomic_dec(ptr) do { } while (0)
77# define debug_atomic_read(ptr) 0
78#endif
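A back-of-the-envelope sketch of what these limits cost in memory, using the constants above. The per-entry sizes are hypothetical placeholders; the real structure sizes are config- and architecture-dependent, and the authoritative totals are printed by lockdep_info().

/* Rough cost estimate from the limits defined in this header.
 * The sz_* values are invented for illustration only. */
#include <stdio.h>

int main(void)
{
        unsigned long entries = 8192UL;          /* MAX_LOCKDEP_ENTRIES */
        unsigned long keys    = 1UL << 11;       /* MAX_LOCKDEP_KEYS */
        unsigned long chains  = 1UL << 13;       /* MAX_LOCKDEP_CHAINS */
        unsigned long traces  = 262144UL;        /* MAX_STACK_TRACE_ENTRIES */

        /* hypothetical per-entry sizes, for illustration only */
        unsigned long sz_class = 256, sz_list = 64, sz_chain = 32, sz_trace = 8;

        unsigned long total = keys * sz_class + entries * sz_list +
                              chains * sz_chain + traces * sz_trace;

        printf("approx. lockdep footprint: %lu kB\n", total / 1024);
        return 0;
}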
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
new file mode 100644
index 000000000000..f6e72eaab3fa
--- /dev/null
+++ b/kernel/lockdep_proc.c
@@ -0,0 +1,345 @@
1/*
2 * kernel/lockdep_proc.c
3 *
4 * Runtime locking correctness validator
5 *
6 * Started by Ingo Molnar:
7 *
8 * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
9 *
10 * Code for /proc/lockdep and /proc/lockdep_stats:
11 *
12 */
13#include <linux/sched.h>
14#include <linux/module.h>
15#include <linux/proc_fs.h>
16#include <linux/seq_file.h>
17#include <linux/kallsyms.h>
18#include <linux/debug_locks.h>
19
20#include "lockdep_internals.h"
21
22static void *l_next(struct seq_file *m, void *v, loff_t *pos)
23{
24 struct lock_class *class = v;
25
26 (*pos)++;
27
28 if (class->lock_entry.next != &all_lock_classes)
29 class = list_entry(class->lock_entry.next, struct lock_class,
30 lock_entry);
31 else
32 class = NULL;
33 m->private = class;
34
35 return class;
36}
37
38static void *l_start(struct seq_file *m, loff_t *pos)
39{
40 struct lock_class *class = m->private;
41
42 if (&class->lock_entry == all_lock_classes.next)
43 seq_printf(m, "all lock classes:\n");
44
45 return class;
46}
47
48static void l_stop(struct seq_file *m, void *v)
49{
50}
51
52static unsigned long count_forward_deps(struct lock_class *class)
53{
54 struct lock_list *entry;
55 unsigned long ret = 1;
56
57 /*
58 * Recurse this class's dependency list:
59 */
60 list_for_each_entry(entry, &class->locks_after, entry)
61 ret += count_forward_deps(entry->class);
62
63 return ret;
64}
65
66static unsigned long count_backward_deps(struct lock_class *class)
67{
68 struct lock_list *entry;
69 unsigned long ret = 1;
70
71 /*
72 * Recurse this class's dependency list:
73 */
74 list_for_each_entry(entry, &class->locks_before, entry)
75 ret += count_backward_deps(entry->class);
76
77 return ret;
78}
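A standalone sketch of the recursive counting above: each class counts itself plus everything reachable through its dependency list, so shared sub-dependencies are counted once per path, exactly as count_forward_deps() does. The adjacency table below is invented for illustration.

/* Sketch: recursive forward-dependency count over a tiny dependency
 * graph, mirroring count_forward_deps() above. */
#include <stdio.h>

#define NCLASSES 4

/* deps[i] lists the classes that may be taken after class i (-1 ends) */
static const int deps[NCLASSES][NCLASSES] = {
        { 1, 2, -1 },   /* 0 -> 1, 0 -> 2 */
        { 3, -1 },      /* 1 -> 3 */
        { 3, -1 },      /* 2 -> 3 */
        { -1 },         /* 3 has no forward deps */
};

static unsigned long count_forward(int class)
{
        unsigned long ret = 1;

        for (int i = 0; deps[class][i] >= 0; i++)
                ret += count_forward(deps[class][i]);
        return ret;
}

int main(void)
{
        /* class 3 is reached via both 1 and 2, so it is counted twice: 5 */
        printf("FD of class 0: %lu\n", count_forward(0));
        return 0;
}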
79
80static int l_show(struct seq_file *m, void *v)
81{
82 unsigned long nr_forward_deps, nr_backward_deps;
83 struct lock_class *class = m->private;
84 char str[128], c1, c2, c3, c4;
85 const char *name;
86
87 seq_printf(m, "%p", class->key);
88#ifdef CONFIG_DEBUG_LOCKDEP
89 seq_printf(m, " OPS:%8ld", class->ops);
90#endif
91 nr_forward_deps = count_forward_deps(class);
92 seq_printf(m, " FD:%5ld", nr_forward_deps);
93
94 nr_backward_deps = count_backward_deps(class);
95 seq_printf(m, " BD:%5ld", nr_backward_deps);
96
97 get_usage_chars(class, &c1, &c2, &c3, &c4);
98 seq_printf(m, " %c%c%c%c", c1, c2, c3, c4);
99
100 name = class->name;
101 if (!name) {
102 name = __get_key_name(class->key, str);
103 seq_printf(m, ": %s", name);
104 } else {
105 seq_printf(m, ": %s", name);
106 if (class->name_version > 1)
107 seq_printf(m, "#%d", class->name_version);
108 if (class->subclass)
109 seq_printf(m, "/%d", class->subclass);
110 }
111 seq_puts(m, "\n");
112
113 return 0;
114}
115
116static struct seq_operations lockdep_ops = {
117 .start = l_start,
118 .next = l_next,
119 .stop = l_stop,
120 .show = l_show,
121};
122
123static int lockdep_open(struct inode *inode, struct file *file)
124{
125 int res = seq_open(file, &lockdep_ops);
126 if (!res) {
127 struct seq_file *m = file->private_data;
128
129 if (!list_empty(&all_lock_classes))
130 m->private = list_entry(all_lock_classes.next,
131 struct lock_class, lock_entry);
132 else
133 m->private = NULL;
134 }
135 return res;
136}
137
138static struct file_operations proc_lockdep_operations = {
139 .open = lockdep_open,
140 .read = seq_read,
141 .llseek = seq_lseek,
142 .release = seq_release,
143};
144
145static void lockdep_stats_debug_show(struct seq_file *m)
146{
147#ifdef CONFIG_DEBUG_LOCKDEP
148 unsigned int hi1 = debug_atomic_read(&hardirqs_on_events),
149 hi2 = debug_atomic_read(&hardirqs_off_events),
150 hr1 = debug_atomic_read(&redundant_hardirqs_on),
151 hr2 = debug_atomic_read(&redundant_hardirqs_off),
152 si1 = debug_atomic_read(&softirqs_on_events),
153 si2 = debug_atomic_read(&softirqs_off_events),
154 sr1 = debug_atomic_read(&redundant_softirqs_on),
155 sr2 = debug_atomic_read(&redundant_softirqs_off);
156
157 seq_printf(m, " chain lookup misses: %11u\n",
158 debug_atomic_read(&chain_lookup_misses));
159 seq_printf(m, " chain lookup hits: %11u\n",
160 debug_atomic_read(&chain_lookup_hits));
161 seq_printf(m, " cyclic checks: %11u\n",
162 debug_atomic_read(&nr_cyclic_checks));
163 seq_printf(m, " cyclic-check recursions: %11u\n",
164 debug_atomic_read(&nr_cyclic_check_recursions));
165 seq_printf(m, " find-mask forwards checks: %11u\n",
166 debug_atomic_read(&nr_find_usage_forwards_checks));
167 seq_printf(m, " find-mask forwards recursions: %11u\n",
168 debug_atomic_read(&nr_find_usage_forwards_recursions));
169 seq_printf(m, " find-mask backwards checks: %11u\n",
170 debug_atomic_read(&nr_find_usage_backwards_checks));
171 seq_printf(m, " find-mask backwards recursions:%11u\n",
172 debug_atomic_read(&nr_find_usage_backwards_recursions));
173
174 seq_printf(m, " hardirq on events: %11u\n", hi1);
175 seq_printf(m, " hardirq off events: %11u\n", hi2);
176 seq_printf(m, " redundant hardirq ons: %11u\n", hr1);
177 seq_printf(m, " redundant hardirq offs: %11u\n", hr2);
178 seq_printf(m, " softirq on events: %11u\n", si1);
179 seq_printf(m, " softirq off events: %11u\n", si2);
180 seq_printf(m, " redundant softirq ons: %11u\n", sr1);
181 seq_printf(m, " redundant softirq offs: %11u\n", sr2);
182#endif
183}
184
185static int lockdep_stats_show(struct seq_file *m, void *v)
186{
187 struct lock_class *class;
188 unsigned long nr_unused = 0, nr_uncategorized = 0,
189 nr_irq_safe = 0, nr_irq_unsafe = 0,
190 nr_softirq_safe = 0, nr_softirq_unsafe = 0,
191 nr_hardirq_safe = 0, nr_hardirq_unsafe = 0,
192 nr_irq_read_safe = 0, nr_irq_read_unsafe = 0,
193 nr_softirq_read_safe = 0, nr_softirq_read_unsafe = 0,
194 nr_hardirq_read_safe = 0, nr_hardirq_read_unsafe = 0,
195 sum_forward_deps = 0, factor = 0;
196
197 list_for_each_entry(class, &all_lock_classes, lock_entry) {
198
199 if (class->usage_mask == 0)
200 nr_unused++;
201 if (class->usage_mask == LOCKF_USED)
202 nr_uncategorized++;
203 if (class->usage_mask & LOCKF_USED_IN_IRQ)
204 nr_irq_safe++;
205 if (class->usage_mask & LOCKF_ENABLED_IRQS)
206 nr_irq_unsafe++;
207 if (class->usage_mask & LOCKF_USED_IN_SOFTIRQ)
208 nr_softirq_safe++;
209 if (class->usage_mask & LOCKF_ENABLED_SOFTIRQS)
210 nr_softirq_unsafe++;
211 if (class->usage_mask & LOCKF_USED_IN_HARDIRQ)
212 nr_hardirq_safe++;
213 if (class->usage_mask & LOCKF_ENABLED_HARDIRQS)
214 nr_hardirq_unsafe++;
215 if (class->usage_mask & LOCKF_USED_IN_IRQ_READ)
216 nr_irq_read_safe++;
217 if (class->usage_mask & LOCKF_ENABLED_IRQS_READ)
218 nr_irq_read_unsafe++;
219 if (class->usage_mask & LOCKF_USED_IN_SOFTIRQ_READ)
220 nr_softirq_read_safe++;
221 if (class->usage_mask & LOCKF_ENABLED_SOFTIRQS_READ)
222 nr_softirq_read_unsafe++;
223 if (class->usage_mask & LOCKF_USED_IN_HARDIRQ_READ)
224 nr_hardirq_read_safe++;
225 if (class->usage_mask & LOCKF_ENABLED_HARDIRQS_READ)
226 nr_hardirq_read_unsafe++;
227
228 sum_forward_deps += count_forward_deps(class);
229 }
230#ifdef CONFIG_DEBUG_LOCKDEP
231 DEBUG_LOCKS_WARN_ON(debug_atomic_read(&nr_unused_locks) != nr_unused);
232#endif
233 seq_printf(m, " lock-classes: %11lu [max: %lu]\n",
234 nr_lock_classes, MAX_LOCKDEP_KEYS);
235 seq_printf(m, " direct dependencies: %11lu [max: %lu]\n",
236 nr_list_entries, MAX_LOCKDEP_ENTRIES);
237 seq_printf(m, " indirect dependencies: %11lu\n",
238 sum_forward_deps);
239
240 /*
241 * Total number of dependencies:
242 *
243 * All irq-safe locks may nest inside irq-unsafe locks,
244 * plus all the other known dependencies:
245 */
246 seq_printf(m, " all direct dependencies: %11lu\n",
247 nr_irq_unsafe * nr_irq_safe +
248 nr_hardirq_unsafe * nr_hardirq_safe +
249 nr_list_entries);
250
251 /*
252 * Estimated factor between direct and indirect
253 * dependencies:
254 */
255 if (nr_list_entries)
256 factor = sum_forward_deps / nr_list_entries;
257
258 seq_printf(m, " dependency chains: %11lu [max: %lu]\n",
259 nr_lock_chains, MAX_LOCKDEP_CHAINS);
260
261#ifdef CONFIG_TRACE_IRQFLAGS
262 seq_printf(m, " in-hardirq chains: %11u\n",
263 nr_hardirq_chains);
264 seq_printf(m, " in-softirq chains: %11u\n",
265 nr_softirq_chains);
266#endif
267 seq_printf(m, " in-process chains: %11u\n",
268 nr_process_chains);
269 seq_printf(m, " stack-trace entries: %11lu [max: %lu]\n",
270 nr_stack_trace_entries, MAX_STACK_TRACE_ENTRIES);
271 seq_printf(m, " combined max dependencies: %11u\n",
272 (nr_hardirq_chains + 1) *
273 (nr_softirq_chains + 1) *
274 (nr_process_chains + 1)
275 );
276 seq_printf(m, " hardirq-safe locks: %11lu\n",
277 nr_hardirq_safe);
278 seq_printf(m, " hardirq-unsafe locks: %11lu\n",
279 nr_hardirq_unsafe);
280 seq_printf(m, " softirq-safe locks: %11lu\n",
281 nr_softirq_safe);
282 seq_printf(m, " softirq-unsafe locks: %11lu\n",
283 nr_softirq_unsafe);
284 seq_printf(m, " irq-safe locks: %11lu\n",
285 nr_irq_safe);
286 seq_printf(m, " irq-unsafe locks: %11lu\n",
287 nr_irq_unsafe);
288
289 seq_printf(m, " hardirq-read-safe locks: %11lu\n",
290 nr_hardirq_read_safe);
291 seq_printf(m, " hardirq-read-unsafe locks: %11lu\n",
292 nr_hardirq_read_unsafe);
293 seq_printf(m, " softirq-read-safe locks: %11lu\n",
294 nr_softirq_read_safe);
295 seq_printf(m, " softirq-read-unsafe locks: %11lu\n",
296 nr_softirq_read_unsafe);
297 seq_printf(m, " irq-read-safe locks: %11lu\n",
298 nr_irq_read_safe);
299 seq_printf(m, " irq-read-unsafe locks: %11lu\n",
300 nr_irq_read_unsafe);
301
302 seq_printf(m, " uncategorized locks: %11lu\n",
303 nr_uncategorized);
304 seq_printf(m, " unused locks: %11lu\n",
305 nr_unused);
306 seq_printf(m, " max locking depth: %11u\n",
307 max_lockdep_depth);
308 seq_printf(m, " max recursion depth: %11u\n",
309 max_recursion_depth);
310 lockdep_stats_debug_show(m);
311 seq_printf(m, " debug_locks: %11u\n",
312 debug_locks);
313
314 return 0;
315}
316
317static int lockdep_stats_open(struct inode *inode, struct file *file)
318{
319 return single_open(file, lockdep_stats_show, NULL);
320}
321
322static struct file_operations proc_lockdep_stats_operations = {
323 .open = lockdep_stats_open,
324 .read = seq_read,
325 .llseek = seq_lseek,
326 .release = seq_release,
327};
328
329static int __init lockdep_proc_init(void)
330{
331 struct proc_dir_entry *entry;
332
333 entry = create_proc_entry("lockdep", S_IRUSR, NULL);
334 if (entry)
335 entry->proc_fops = &proc_lockdep_operations;
336
337 entry = create_proc_entry("lockdep_stats", S_IRUSR, NULL);
338 if (entry)
339 entry->proc_fops = &proc_lockdep_stats_operations;
340
341 return 0;
342}
343
344__initcall(lockdep_proc_init);
345
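Assuming a kernel built with lockdep so that the /proc entries registered above actually exist, a trivial userspace reader for the output of lockdep_stats_show():

/* Userspace sketch: dump /proc/lockdep_stats (assumes the file exists
 * and is readable on this kernel). */
#include <stdio.h>

int main(void)
{
        char line[256];
        FILE *f = fopen("/proc/lockdep_stats", "r");

        if (!f) {
                perror("/proc/lockdep_stats");
                return 1;
        }
        while (fgets(line, sizeof(line), f))
                fputs(line, stdout);
        fclose(f);
        return 0;
}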
diff --git a/kernel/module.c b/kernel/module.c
index bbe04862e1b0..05625d5dc758 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1,4 +1,4 @@
1/* Rewritten by Rusty Russell, on the backs of many others... 1/*
2 Copyright (C) 2002 Richard Henderson 2 Copyright (C) 2002 Richard Henderson
3 Copyright (C) 2001 Rusty Russell, 2002 Rusty Russell IBM. 3 Copyright (C) 2001 Rusty Russell, 2002 Rusty Russell IBM.
4 4
@@ -16,7 +16,6 @@
16 along with this program; if not, write to the Free Software 16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 17 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18*/ 18*/
19#include <linux/config.h>
20#include <linux/module.h> 19#include <linux/module.h>
21#include <linux/moduleloader.h> 20#include <linux/moduleloader.h>
22#include <linux/init.h> 21#include <linux/init.h>
@@ -40,9 +39,11 @@
40#include <linux/string.h> 39#include <linux/string.h>
41#include <linux/sched.h> 40#include <linux/sched.h>
42#include <linux/mutex.h> 41#include <linux/mutex.h>
42#include <linux/unwind.h>
43#include <asm/uaccess.h> 43#include <asm/uaccess.h>
44#include <asm/semaphore.h> 44#include <asm/semaphore.h>
45#include <asm/cacheflush.h> 45#include <asm/cacheflush.h>
46#include <linux/license.h>
46 47
47#if 0 48#if 0
48#define DEBUGP printk 49#define DEBUGP printk
@@ -120,9 +121,17 @@ extern const struct kernel_symbol __start___ksymtab_gpl[];
120extern const struct kernel_symbol __stop___ksymtab_gpl[]; 121extern const struct kernel_symbol __stop___ksymtab_gpl[];
121extern const struct kernel_symbol __start___ksymtab_gpl_future[]; 122extern const struct kernel_symbol __start___ksymtab_gpl_future[];
122extern const struct kernel_symbol __stop___ksymtab_gpl_future[]; 123extern const struct kernel_symbol __stop___ksymtab_gpl_future[];
124extern const struct kernel_symbol __start___ksymtab_unused[];
125extern const struct kernel_symbol __stop___ksymtab_unused[];
126extern const struct kernel_symbol __start___ksymtab_unused_gpl[];
127extern const struct kernel_symbol __stop___ksymtab_unused_gpl[];
128extern const struct kernel_symbol __start___ksymtab_gpl_future[];
129extern const struct kernel_symbol __stop___ksymtab_gpl_future[];
123extern const unsigned long __start___kcrctab[]; 130extern const unsigned long __start___kcrctab[];
124extern const unsigned long __start___kcrctab_gpl[]; 131extern const unsigned long __start___kcrctab_gpl[];
125extern const unsigned long __start___kcrctab_gpl_future[]; 132extern const unsigned long __start___kcrctab_gpl_future[];
133extern const unsigned long __start___kcrctab_unused[];
134extern const unsigned long __start___kcrctab_unused_gpl[];
126 135
127#ifndef CONFIG_MODVERSIONS 136#ifndef CONFIG_MODVERSIONS
128#define symversion(base, idx) NULL 137#define symversion(base, idx) NULL
@@ -142,6 +151,17 @@ static const struct kernel_symbol *lookup_symbol(const char *name,
142 return NULL; 151 return NULL;
143} 152}
144 153
154static void printk_unused_warning(const char *name)
155{
156 printk(KERN_WARNING "Symbol %s is marked as UNUSED, "
157 "however this module is using it.\n", name);
158 printk(KERN_WARNING "This symbol will go away in the future.\n");
159 printk(KERN_WARNING "Please evalute if this is the right api to use, "
160 "and if it really is, submit a report the linux kernel "
161 "mailinglist together with submitting your code for "
162 "inclusion.\n");
163}
164
145/* Find a symbol, return value, crc and module which owns it */ 165/* Find a symbol, return value, crc and module which owns it */
146static unsigned long __find_symbol(const char *name, 166static unsigned long __find_symbol(const char *name,
147 struct module **owner, 167 struct module **owner,
@@ -184,6 +204,25 @@ static unsigned long __find_symbol(const char *name,
184 return ks->value; 204 return ks->value;
185 } 205 }
186 206
207 ks = lookup_symbol(name, __start___ksymtab_unused,
208 __stop___ksymtab_unused);
209 if (ks) {
210 printk_unused_warning(name);
211 *crc = symversion(__start___kcrctab_unused,
212 (ks - __start___ksymtab_unused));
213 return ks->value;
214 }
215
216 if (gplok)
217 ks = lookup_symbol(name, __start___ksymtab_unused_gpl,
218 __stop___ksymtab_unused_gpl);
219 if (ks) {
220 printk_unused_warning(name);
221 *crc = symversion(__start___kcrctab_unused_gpl,
222 (ks - __start___ksymtab_unused_gpl));
223 return ks->value;
224 }
225
187 /* Now try modules. */ 226 /* Now try modules. */
188 list_for_each_entry(mod, &modules, list) { 227 list_for_each_entry(mod, &modules, list) {
189 *owner = mod; 228 *owner = mod;
@@ -202,6 +241,23 @@ static unsigned long __find_symbol(const char *name,
202 return ks->value; 241 return ks->value;
203 } 242 }
204 } 243 }
244 ks = lookup_symbol(name, mod->unused_syms, mod->unused_syms + mod->num_unused_syms);
245 if (ks) {
246 printk_unused_warning(name);
247 *crc = symversion(mod->unused_crcs, (ks - mod->unused_syms));
248 return ks->value;
249 }
250
251 if (gplok) {
252 ks = lookup_symbol(name, mod->unused_gpl_syms,
253 mod->unused_gpl_syms + mod->num_unused_gpl_syms);
254 if (ks) {
255 printk_unused_warning(name);
256 *crc = symversion(mod->unused_gpl_crcs,
257 (ks - mod->unused_gpl_syms));
258 return ks->value;
259 }
260 }
205 ks = lookup_symbol(name, mod->gpl_future_syms, 261 ks = lookup_symbol(name, mod->gpl_future_syms,
206 (mod->gpl_future_syms + 262 (mod->gpl_future_syms +
207 mod->num_gpl_future_syms)); 263 mod->num_gpl_future_syms));
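A sketch of the tiered lookup pattern used by __find_symbol(): search the normal export table first, fall back to the deprecated "unused" table, and warn when a match comes from the latter. The tables and names below are invented; this mirrors only the control flow, not the kernel's data layout.

/* Sketch: tiered symbol lookup with a deprecation warning. */
#include <stdio.h>
#include <string.h>

struct ksym { const char *name; unsigned long value; };

static const struct ksym ksymtab[]        = { { "printk_ish", 0x1000 } };
static const struct ksym ksymtab_unused[] = { { "old_api",    0x2000 } };

static unsigned long find_symbol(const char *name)
{
        for (size_t i = 0; i < sizeof(ksymtab) / sizeof(ksymtab[0]); i++)
                if (!strcmp(ksymtab[i].name, name))
                        return ksymtab[i].value;

        for (size_t i = 0; i < sizeof(ksymtab_unused) / sizeof(ksymtab_unused[0]); i++)
                if (!strcmp(ksymtab_unused[i].name, name)) {
                        fprintf(stderr, "warning: %s is deprecated\n", name);
                        return ksymtab_unused[i].value;
                }
        return 0;
}

int main(void)
{
        printf("old_api -> %#lx\n", find_symbol("old_api"));
        return 0;
}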
@@ -877,6 +933,15 @@ static ssize_t module_sect_show(struct module_attribute *mattr,
877 return sprintf(buf, "0x%lx\n", sattr->address); 933 return sprintf(buf, "0x%lx\n", sattr->address);
878} 934}
879 935
936static void free_sect_attrs(struct module_sect_attrs *sect_attrs)
937{
938 int section;
939
940 for (section = 0; section < sect_attrs->nsections; section++)
941 kfree(sect_attrs->attrs[section].name);
942 kfree(sect_attrs);
943}
944
880static void add_sect_attrs(struct module *mod, unsigned int nsect, 945static void add_sect_attrs(struct module *mod, unsigned int nsect,
881 char *secstrings, Elf_Shdr *sechdrs) 946 char *secstrings, Elf_Shdr *sechdrs)
882{ 947{
@@ -893,21 +958,26 @@ static void add_sect_attrs(struct module *mod, unsigned int nsect,
893 + nloaded * sizeof(sect_attrs->attrs[0]), 958 + nloaded * sizeof(sect_attrs->attrs[0]),
894 sizeof(sect_attrs->grp.attrs[0])); 959 sizeof(sect_attrs->grp.attrs[0]));
895 size[1] = (nloaded + 1) * sizeof(sect_attrs->grp.attrs[0]); 960 size[1] = (nloaded + 1) * sizeof(sect_attrs->grp.attrs[0]);
896 if (! (sect_attrs = kmalloc(size[0] + size[1], GFP_KERNEL))) 961 sect_attrs = kzalloc(size[0] + size[1], GFP_KERNEL);
962 if (sect_attrs == NULL)
897 return; 963 return;
898 964
899 /* Setup section attributes. */ 965 /* Setup section attributes. */
900 sect_attrs->grp.name = "sections"; 966 sect_attrs->grp.name = "sections";
901 sect_attrs->grp.attrs = (void *)sect_attrs + size[0]; 967 sect_attrs->grp.attrs = (void *)sect_attrs + size[0];
902 968
969 sect_attrs->nsections = 0;
903 sattr = &sect_attrs->attrs[0]; 970 sattr = &sect_attrs->attrs[0];
904 gattr = &sect_attrs->grp.attrs[0]; 971 gattr = &sect_attrs->grp.attrs[0];
905 for (i = 0; i < nsect; i++) { 972 for (i = 0; i < nsect; i++) {
906 if (! (sechdrs[i].sh_flags & SHF_ALLOC)) 973 if (! (sechdrs[i].sh_flags & SHF_ALLOC))
907 continue; 974 continue;
908 sattr->address = sechdrs[i].sh_addr; 975 sattr->address = sechdrs[i].sh_addr;
909 strlcpy(sattr->name, secstrings + sechdrs[i].sh_name, 976 sattr->name = kstrdup(secstrings + sechdrs[i].sh_name,
910 MODULE_SECT_NAME_LEN); 977 GFP_KERNEL);
978 if (sattr->name == NULL)
979 goto out;
980 sect_attrs->nsections++;
911 sattr->mattr.show = module_sect_show; 981 sattr->mattr.show = module_sect_show;
912 sattr->mattr.store = NULL; 982 sattr->mattr.store = NULL;
913 sattr->mattr.attr.name = sattr->name; 983 sattr->mattr.attr.name = sattr->name;
@@ -923,7 +993,7 @@ static void add_sect_attrs(struct module *mod, unsigned int nsect,
923 mod->sect_attrs = sect_attrs; 993 mod->sect_attrs = sect_attrs;
924 return; 994 return;
925 out: 995 out:
926 kfree(sect_attrs); 996 free_sect_attrs(sect_attrs);
927} 997}
928 998
929static void remove_sect_attrs(struct module *mod) 999static void remove_sect_attrs(struct module *mod)
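The kzalloc()/kstrdup()/free_sect_attrs() change above is the usual allocate-per-item, free-through-one-helper pattern, which lets a mid-loop allocation failure unwind cleanly. A minimal userspace sketch with invented names:

/* Sketch: duplicate each name, free everything through one helper so a
 * partial failure can be unwound. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct sect_attr { char *name; };
struct sect_attrs { int nsections; struct sect_attr attrs[8]; };

static void free_sect_attrs(struct sect_attrs *sa)
{
        for (int i = 0; i < sa->nsections; i++)
                free(sa->attrs[i].name);
        free(sa);
}

int main(void)
{
        const char *names[] = { ".text", ".data", ".bss" };
        struct sect_attrs *sa = calloc(1, sizeof(*sa));

        if (!sa)
                return 1;
        for (int i = 0; i < 3; i++) {
                sa->attrs[i].name = strdup(names[i]);
                if (!sa->attrs[i].name) {
                        free_sect_attrs(sa);    /* frees only what was set */
                        return 1;
                }
                sa->nsections++;
        }
        printf("%d sections named\n", sa->nsections);
        free_sect_attrs(sa);
        return 0;
}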
@@ -933,13 +1003,13 @@ static void remove_sect_attrs(struct module *mod)
933 &mod->sect_attrs->grp); 1003 &mod->sect_attrs->grp);
934 /* We are positive that no one is using any sect attrs 1004 /* We are positive that no one is using any sect attrs
935 * at this point. Deallocate immediately. */ 1005 * at this point. Deallocate immediately. */
936 kfree(mod->sect_attrs); 1006 free_sect_attrs(mod->sect_attrs);
937 mod->sect_attrs = NULL; 1007 mod->sect_attrs = NULL;
938 } 1008 }
939} 1009}
940 1010
941
942#else 1011#else
1012
943static inline void add_sect_attrs(struct module *mod, unsigned int nsect, 1013static inline void add_sect_attrs(struct module *mod, unsigned int nsect,
944 char *sectstrings, Elf_Shdr *sechdrs) 1014 char *sectstrings, Elf_Shdr *sechdrs)
945{ 1015{
@@ -998,6 +1068,12 @@ static int mod_sysfs_setup(struct module *mod,
998{ 1068{
999 int err; 1069 int err;
1000 1070
1071 if (!module_subsys.kset.subsys) {
1072 printk(KERN_ERR "%s: module_subsys not initialized\n",
1073 mod->name);
1074 err = -EINVAL;
1075 goto out;
1076 }
1001 memset(&mod->mkobj.kobj, 0, sizeof(mod->mkobj.kobj)); 1077 memset(&mod->mkobj.kobj, 0, sizeof(mod->mkobj.kobj));
1002 err = kobject_set_name(&mod->mkobj.kobj, "%s", mod->name); 1078 err = kobject_set_name(&mod->mkobj.kobj, "%s", mod->name);
1003 if (err) 1079 if (err)
@@ -1051,6 +1127,8 @@ static void free_module(struct module *mod)
1051 remove_sect_attrs(mod); 1127 remove_sect_attrs(mod);
1052 mod_kobject_remove(mod); 1128 mod_kobject_remove(mod);
1053 1129
1130 unwind_remove_table(mod->unwind_info, 0);
1131
1054 /* Arch-specific cleanup. */ 1132 /* Arch-specific cleanup. */
1055 module_arch_cleanup(mod); 1133 module_arch_cleanup(mod);
1056 1134
@@ -1063,6 +1141,9 @@ static void free_module(struct module *mod)
1063 if (mod->percpu) 1141 if (mod->percpu)
1064 percpu_modfree(mod->percpu); 1142 percpu_modfree(mod->percpu);
1065 1143
1144 /* Free lock-classes: */
1145 lockdep_free_key_range(mod->module_core, mod->core_size);
1146
1066 /* Finally, free the core (containing the module structure) */ 1147 /* Finally, free the core (containing the module structure) */
1067 module_free(mod, mod->module_core); 1148 module_free(mod, mod->module_core);
1068} 1149}
@@ -1248,16 +1329,6 @@ static void layout_sections(struct module *mod,
1248 } 1329 }
1249} 1330}
1250 1331
1251static inline int license_is_gpl_compatible(const char *license)
1252{
1253 return (strcmp(license, "GPL") == 0
1254 || strcmp(license, "GPL v2") == 0
1255 || strcmp(license, "GPL and additional rights") == 0
1256 || strcmp(license, "Dual BSD/GPL") == 0
1257 || strcmp(license, "Dual MIT/GPL") == 0
1258 || strcmp(license, "Dual MPL/GPL") == 0);
1259}
1260
1261static void set_license(struct module *mod, const char *license) 1332static void set_license(struct module *mod, const char *license)
1262{ 1333{
1263 if (!license) 1334 if (!license)
@@ -1326,7 +1397,7 @@ int is_exported(const char *name, const struct module *mod)
1326 if (!mod && lookup_symbol(name, __start___ksymtab, __stop___ksymtab)) 1397 if (!mod && lookup_symbol(name, __start___ksymtab, __stop___ksymtab))
1327 return 1; 1398 return 1;
1328 else 1399 else
1329 if (lookup_symbol(name, mod->syms, mod->syms + mod->num_syms)) 1400 if (mod && lookup_symbol(name, mod->syms, mod->syms + mod->num_syms))
1330 return 1; 1401 return 1;
1331 else 1402 else
1332 return 0; 1403 return 0;
@@ -1409,10 +1480,27 @@ static struct module *load_module(void __user *umod,
1409 Elf_Ehdr *hdr; 1480 Elf_Ehdr *hdr;
1410 Elf_Shdr *sechdrs; 1481 Elf_Shdr *sechdrs;
1411 char *secstrings, *args, *modmagic, *strtab = NULL; 1482 char *secstrings, *args, *modmagic, *strtab = NULL;
1412 unsigned int i, symindex = 0, strindex = 0, setupindex, exindex, 1483 unsigned int i;
1413 exportindex, modindex, obsparmindex, infoindex, gplindex, 1484 unsigned int symindex = 0;
1414 crcindex, gplcrcindex, versindex, pcpuindex, gplfutureindex, 1485 unsigned int strindex = 0;
1415 gplfuturecrcindex; 1486 unsigned int setupindex;
1487 unsigned int exindex;
1488 unsigned int exportindex;
1489 unsigned int modindex;
1490 unsigned int obsparmindex;
1491 unsigned int infoindex;
1492 unsigned int gplindex;
1493 unsigned int crcindex;
1494 unsigned int gplcrcindex;
1495 unsigned int versindex;
1496 unsigned int pcpuindex;
1497 unsigned int gplfutureindex;
1498 unsigned int gplfuturecrcindex;
1499 unsigned int unwindex = 0;
1500 unsigned int unusedindex;
1501 unsigned int unusedcrcindex;
1502 unsigned int unusedgplindex;
1503 unsigned int unusedgplcrcindex;
1416 struct module *mod; 1504 struct module *mod;
1417 long err = 0; 1505 long err = 0;
1418 void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */ 1506 void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */
@@ -1493,15 +1581,22 @@ static struct module *load_module(void __user *umod,
1493 exportindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab"); 1581 exportindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab");
1494 gplindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_gpl"); 1582 gplindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_gpl");
1495 gplfutureindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_gpl_future"); 1583 gplfutureindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_gpl_future");
1584 unusedindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_unused");
1585 unusedgplindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_unused_gpl");
1496 crcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab"); 1586 crcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab");
1497 gplcrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_gpl"); 1587 gplcrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_gpl");
1498 gplfuturecrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_gpl_future"); 1588 gplfuturecrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_gpl_future");
1589 unusedcrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_unused");
1590 unusedgplcrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_unused_gpl");
1499 setupindex = find_sec(hdr, sechdrs, secstrings, "__param"); 1591 setupindex = find_sec(hdr, sechdrs, secstrings, "__param");
1500 exindex = find_sec(hdr, sechdrs, secstrings, "__ex_table"); 1592 exindex = find_sec(hdr, sechdrs, secstrings, "__ex_table");
1501 obsparmindex = find_sec(hdr, sechdrs, secstrings, "__obsparm"); 1593 obsparmindex = find_sec(hdr, sechdrs, secstrings, "__obsparm");
1502 versindex = find_sec(hdr, sechdrs, secstrings, "__versions"); 1594 versindex = find_sec(hdr, sechdrs, secstrings, "__versions");
1503 infoindex = find_sec(hdr, sechdrs, secstrings, ".modinfo"); 1595 infoindex = find_sec(hdr, sechdrs, secstrings, ".modinfo");
1504 pcpuindex = find_pcpusec(hdr, sechdrs, secstrings); 1596 pcpuindex = find_pcpusec(hdr, sechdrs, secstrings);
1597#ifdef ARCH_UNWIND_SECTION_NAME
1598 unwindex = find_sec(hdr, sechdrs, secstrings, ARCH_UNWIND_SECTION_NAME);
1599#endif
1505 1600
1506 /* Don't keep modinfo section */ 1601 /* Don't keep modinfo section */
1507 sechdrs[infoindex].sh_flags &= ~(unsigned long)SHF_ALLOC; 1602 sechdrs[infoindex].sh_flags &= ~(unsigned long)SHF_ALLOC;
@@ -1510,6 +1605,8 @@ static struct module *load_module(void __user *umod,
1510 sechdrs[symindex].sh_flags |= SHF_ALLOC; 1605 sechdrs[symindex].sh_flags |= SHF_ALLOC;
1511 sechdrs[strindex].sh_flags |= SHF_ALLOC; 1606 sechdrs[strindex].sh_flags |= SHF_ALLOC;
1512#endif 1607#endif
1608 if (unwindex)
1609 sechdrs[unwindex].sh_flags |= SHF_ALLOC;
1513 1610
1514 /* Check module struct version now, before we try to use module. */ 1611 /* Check module struct version now, before we try to use module. */
1515 if (!check_modstruct_version(sechdrs, versindex, mod)) { 1612 if (!check_modstruct_version(sechdrs, versindex, mod)) {
@@ -1639,14 +1736,27 @@ static struct module *load_module(void __user *umod,
1639 mod->gpl_crcs = (void *)sechdrs[gplcrcindex].sh_addr; 1736 mod->gpl_crcs = (void *)sechdrs[gplcrcindex].sh_addr;
1640 mod->num_gpl_future_syms = sechdrs[gplfutureindex].sh_size / 1737 mod->num_gpl_future_syms = sechdrs[gplfutureindex].sh_size /
1641 sizeof(*mod->gpl_future_syms); 1738 sizeof(*mod->gpl_future_syms);
1739 mod->num_unused_syms = sechdrs[unusedindex].sh_size /
1740 sizeof(*mod->unused_syms);
1741 mod->num_unused_gpl_syms = sechdrs[unusedgplindex].sh_size /
1742 sizeof(*mod->unused_gpl_syms);
1642 mod->gpl_future_syms = (void *)sechdrs[gplfutureindex].sh_addr; 1743 mod->gpl_future_syms = (void *)sechdrs[gplfutureindex].sh_addr;
1643 if (gplfuturecrcindex) 1744 if (gplfuturecrcindex)
1644 mod->gpl_future_crcs = (void *)sechdrs[gplfuturecrcindex].sh_addr; 1745 mod->gpl_future_crcs = (void *)sechdrs[gplfuturecrcindex].sh_addr;
1645 1746
1747 mod->unused_syms = (void *)sechdrs[unusedindex].sh_addr;
1748 if (unusedcrcindex)
1749 mod->unused_crcs = (void *)sechdrs[unusedcrcindex].sh_addr;
1750 mod->unused_gpl_syms = (void *)sechdrs[unusedgplindex].sh_addr;
1751 if (unusedgplcrcindex)
1752 mod->unused_crcs = (void *)sechdrs[unusedgplcrcindex].sh_addr;
1753
1646#ifdef CONFIG_MODVERSIONS 1754#ifdef CONFIG_MODVERSIONS
1647 if ((mod->num_syms && !crcindex) || 1755 if ((mod->num_syms && !crcindex) ||
1648 (mod->num_gpl_syms && !gplcrcindex) || 1756 (mod->num_gpl_syms && !gplcrcindex) ||
1649 (mod->num_gpl_future_syms && !gplfuturecrcindex)) { 1757 (mod->num_gpl_future_syms && !gplfuturecrcindex) ||
1758 (mod->num_unused_syms && !unusedcrcindex) ||
1759 (mod->num_unused_gpl_syms && !unusedgplcrcindex)) {
1650 printk(KERN_WARNING "%s: No versions for exported symbols." 1760 printk(KERN_WARNING "%s: No versions for exported symbols."
1651 " Tainting kernel.\n", mod->name); 1761 " Tainting kernel.\n", mod->name);
1652 add_taint(TAINT_FORCED_MODULE); 1762 add_taint(TAINT_FORCED_MODULE);
@@ -1738,6 +1848,11 @@ static struct module *load_module(void __user *umod,
1738 goto arch_cleanup; 1848 goto arch_cleanup;
1739 add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs); 1849 add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs);
1740 1850
1851 /* Size of section 0 is 0, so this works well if no unwind info. */
1852 mod->unwind_info = unwind_add_table(mod,
1853 (void *)sechdrs[unwindex].sh_addr,
1854 sechdrs[unwindex].sh_size);
1855
1741 /* Get rid of temporary copy */ 1856 /* Get rid of temporary copy */
1742 vfree(hdr); 1857 vfree(hdr);
1743 1858
@@ -1836,6 +1951,7 @@ sys_init_module(void __user *umod,
1836 mod->state = MODULE_STATE_LIVE; 1951 mod->state = MODULE_STATE_LIVE;
1837 /* Drop initial reference. */ 1952 /* Drop initial reference. */
1838 module_put(mod); 1953 module_put(mod);
1954 unwind_remove_table(mod->unwind_info, 1);
1839 module_free(mod, mod->module_init); 1955 module_free(mod, mod->module_init);
1840 mod->module_init = NULL; 1956 mod->module_init = NULL;
1841 mod->init_size = 0; 1957 mod->init_size = 0;
@@ -1923,10 +2039,8 @@ const char *module_address_lookup(unsigned long addr,
1923 return NULL; 2039 return NULL;
1924} 2040}
1925 2041
1926struct module *module_get_kallsym(unsigned int symnum, 2042struct module *module_get_kallsym(unsigned int symnum, unsigned long *value,
1927 unsigned long *value, 2043 char *type, char *name, size_t namelen)
1928 char *type,
1929 char namebuf[128])
1930{ 2044{
1931 struct module *mod; 2045 struct module *mod;
1932 2046
@@ -1935,9 +2049,8 @@ struct module *module_get_kallsym(unsigned int symnum,
1935 if (symnum < mod->num_symtab) { 2049 if (symnum < mod->num_symtab) {
1936 *value = mod->symtab[symnum].st_value; 2050 *value = mod->symtab[symnum].st_value;
1937 *type = mod->symtab[symnum].st_info; 2051 *type = mod->symtab[symnum].st_info;
1938 strncpy(namebuf, 2052 strlcpy(name, mod->strtab + mod->symtab[symnum].st_name,
1939 mod->strtab + mod->symtab[symnum].st_name, 2053 namelen);
1940 127);
1941 mutex_unlock(&module_mutex); 2054 mutex_unlock(&module_mutex);
1942 return mod; 2055 return mod;
1943 } 2056 }
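Why the strncpy() to strlcpy() change matters, in a standalone sketch: strncpy() does not NUL-terminate when the source fills the destination. strlcpy() is a kernel/BSD helper rather than ISO C, so snprintf() serves as the always-terminating stand-in here.

/* Sketch: bounded string copy with and without guaranteed termination. */
#include <stdio.h>
#include <string.h>

int main(void)
{
        char dst[8];
        const char *src = "a_rather_long_symbol_name";

        strncpy(dst, src, sizeof(dst));         /* dst is NOT terminated */
        dst[sizeof(dst) - 1] = '\0';            /* manual fix-up needed */
        printf("strncpy + fix-up: %s\n", dst);

        snprintf(dst, sizeof(dst), "%s", src);  /* always terminated */
        printf("snprintf:         %s\n", dst);
        return 0;
}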
@@ -2066,6 +2179,29 @@ const struct exception_table_entry *search_module_extables(unsigned long addr)
2066 return e; 2179 return e;
2067} 2180}
2068 2181
2182/*
2183 * Is this a valid module address?
2184 */
2185int is_module_address(unsigned long addr)
2186{
2187 unsigned long flags;
2188 struct module *mod;
2189
2190 spin_lock_irqsave(&modlist_lock, flags);
2191
2192 list_for_each_entry(mod, &modules, list) {
2193 if (within(addr, mod->module_core, mod->core_size)) {
2194 spin_unlock_irqrestore(&modlist_lock, flags);
2195 return 1;
2196 }
2197 }
2198
2199 spin_unlock_irqrestore(&modlist_lock, flags);
2200
2201 return 0;
2202}
2203
2204
2069/* Is this a valid kernel address? We don't grab the lock: we are oopsing. */ 2205/* Is this a valid kernel address? We don't grab the lock: we are oopsing. */
2070struct module *__module_text_address(unsigned long addr) 2206struct module *__module_text_address(unsigned long addr)
2071{ 2207{
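A standalone sketch of the new is_module_address() helper: walk a list of loaded regions and test whether the address falls inside any core range. The region list and the modlist_lock locking are simplified away, and uintptr_t arithmetic stands in for the kernel's within() pointer math.

/* Sketch: is a given address inside any registered region? */
#include <stdio.h>
#include <stdint.h>

struct region { uintptr_t base; unsigned long size; };

static int is_module_address(const struct region *mods, int n, uintptr_t addr)
{
        for (int i = 0; i < n; i++)
                if (addr >= mods[i].base && addr < mods[i].base + mods[i].size)
                        return 1;
        return 0;
}

int main(void)
{
        static char core_a[4096], core_b[4096];
        struct region mods[] = {
                { (uintptr_t)core_a, sizeof(core_a) },
                { (uintptr_t)core_b, sizeof(core_b) },
        };

        printf("%d\n", is_module_address(mods, 2, (uintptr_t)(core_b + 100)));  /* expected: 1 */
        printf("%d\n", is_module_address(mods, 2, (uintptr_t)0x10));            /* expected: 0 */
        return 0;
}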
diff --git a/kernel/mutex-debug.c b/kernel/mutex-debug.c
index f4913c376950..e3203c654dda 100644
--- a/kernel/mutex-debug.c
+++ b/kernel/mutex-debug.c
@@ -16,395 +16,48 @@
16#include <linux/sched.h> 16#include <linux/sched.h>
17#include <linux/delay.h> 17#include <linux/delay.h>
18#include <linux/module.h> 18#include <linux/module.h>
19#include <linux/poison.h>
19#include <linux/spinlock.h> 20#include <linux/spinlock.h>
20#include <linux/kallsyms.h> 21#include <linux/kallsyms.h>
21#include <linux/interrupt.h> 22#include <linux/interrupt.h>
23#include <linux/debug_locks.h>
22 24
23#include "mutex-debug.h" 25#include "mutex-debug.h"
24 26
25/* 27/*
26 * We need a global lock when we walk through the multi-process
27 * lock tree. Only used in the deadlock-debugging case.
28 */
29DEFINE_SPINLOCK(debug_mutex_lock);
30
31/*
32 * All locks held by all tasks, in a single global list:
33 */
34LIST_HEAD(debug_mutex_held_locks);
35
36/*
37 * In the debug case we carry the caller's instruction pointer into
38 * other functions, but we dont want the function argument overhead
39 * in the nondebug case - hence these macros:
40 */
41#define __IP_DECL__ , unsigned long ip
42#define __IP__ , ip
43#define __RET_IP__ , (unsigned long)__builtin_return_address(0)
44
45/*
46 * "mutex debugging enabled" flag. We turn it off when we detect
47 * the first problem because we dont want to recurse back
48 * into the tracing code when doing error printk or
49 * executing a BUG():
50 */
51int debug_mutex_on = 1;
52
53static void printk_task(struct task_struct *p)
54{
55 if (p)
56 printk("%16s:%5d [%p, %3d]", p->comm, p->pid, p, p->prio);
57 else
58 printk("<none>");
59}
60
61static void printk_ti(struct thread_info *ti)
62{
63 if (ti)
64 printk_task(ti->task);
65 else
66 printk("<none>");
67}
68
69static void printk_task_short(struct task_struct *p)
70{
71 if (p)
72 printk("%s/%d [%p, %3d]", p->comm, p->pid, p, p->prio);
73 else
74 printk("<none>");
75}
76
77static void printk_lock(struct mutex *lock, int print_owner)
78{
79 printk(" [%p] {%s}\n", lock, lock->name);
80
81 if (print_owner && lock->owner) {
82 printk(".. held by: ");
83 printk_ti(lock->owner);
84 printk("\n");
85 }
86 if (lock->owner) {
87 printk("... acquired at: ");
88 print_symbol("%s\n", lock->acquire_ip);
89 }
90}
91
92/*
93 * printk locks held by a task:
94 */
95static void show_task_locks(struct task_struct *p)
96{
97 switch (p->state) {
98 case TASK_RUNNING: printk("R"); break;
99 case TASK_INTERRUPTIBLE: printk("S"); break;
100 case TASK_UNINTERRUPTIBLE: printk("D"); break;
101 case TASK_STOPPED: printk("T"); break;
102 case EXIT_ZOMBIE: printk("Z"); break;
103 case EXIT_DEAD: printk("X"); break;
104 default: printk("?"); break;
105 }
106 printk_task(p);
107 if (p->blocked_on) {
108 struct mutex *lock = p->blocked_on->lock;
109
110 printk(" blocked on mutex:");
111 printk_lock(lock, 1);
112 } else
113 printk(" (not blocked on mutex)\n");
114}
115
116/*
117 * printk all locks held in the system (if filter == NULL),
118 * or all locks belonging to a single task (if filter != NULL):
119 */
120void show_held_locks(struct task_struct *filter)
121{
122 struct list_head *curr, *cursor = NULL;
123 struct mutex *lock;
124 struct thread_info *t;
125 unsigned long flags;
126 int count = 0;
127
128 if (filter) {
129 printk("------------------------------\n");
130 printk("| showing all locks held by: | (");
131 printk_task_short(filter);
132 printk("):\n");
133 printk("------------------------------\n");
134 } else {
135 printk("---------------------------\n");
136 printk("| showing all locks held: |\n");
137 printk("---------------------------\n");
138 }
139
140 /*
141 * Play safe and acquire the global trace lock. We
142 * cannot printk with that lock held so we iterate
143 * very carefully:
144 */
145next:
146 debug_spin_lock_save(&debug_mutex_lock, flags);
147 list_for_each(curr, &debug_mutex_held_locks) {
148 if (cursor && curr != cursor)
149 continue;
150 lock = list_entry(curr, struct mutex, held_list);
151 t = lock->owner;
152 if (filter && (t != filter->thread_info))
153 continue;
154 count++;
155 cursor = curr->next;
156 debug_spin_lock_restore(&debug_mutex_lock, flags);
157
158 printk("\n#%03d: ", count);
159 printk_lock(lock, filter ? 0 : 1);
160 goto next;
161 }
162 debug_spin_lock_restore(&debug_mutex_lock, flags);
163 printk("\n");
164}
165
166void mutex_debug_show_all_locks(void)
167{
168 struct task_struct *g, *p;
169 int count = 10;
170 int unlock = 1;
171
172 printk("\nShowing all blocking locks in the system:\n");
173
174 /*
175 * Here we try to get the tasklist_lock as hard as possible,
176 * if not successful after 2 seconds we ignore it (but keep
177 * trying). This is to enable a debug printout even if a
178 * tasklist_lock-holding task deadlocks or crashes.
179 */
180retry:
181 if (!read_trylock(&tasklist_lock)) {
182 if (count == 10)
183 printk("hm, tasklist_lock locked, retrying... ");
184 if (count) {
185 count--;
186 printk(" #%d", 10-count);
187 mdelay(200);
188 goto retry;
189 }
190 printk(" ignoring it.\n");
191 unlock = 0;
192 }
193 if (count != 10)
194 printk(" locked it.\n");
195
196 do_each_thread(g, p) {
197 show_task_locks(p);
198 if (!unlock)
199 if (read_trylock(&tasklist_lock))
200 unlock = 1;
201 } while_each_thread(g, p);
202
203 printk("\n");
204 show_held_locks(NULL);
205 printk("=============================================\n\n");
206
207 if (unlock)
208 read_unlock(&tasklist_lock);
209}
210
211static void report_deadlock(struct task_struct *task, struct mutex *lock,
212 struct mutex *lockblk, unsigned long ip)
213{
214 printk("\n%s/%d is trying to acquire this lock:\n",
215 current->comm, current->pid);
216 printk_lock(lock, 1);
217 printk("... trying at: ");
218 print_symbol("%s\n", ip);
219 show_held_locks(current);
220
221 if (lockblk) {
222 printk("but %s/%d is deadlocking current task %s/%d!\n\n",
223 task->comm, task->pid, current->comm, current->pid);
224 printk("\n%s/%d is blocked on this lock:\n",
225 task->comm, task->pid);
226 printk_lock(lockblk, 1);
227
228 show_held_locks(task);
229
230 printk("\n%s/%d's [blocked] stackdump:\n\n",
231 task->comm, task->pid);
232 show_stack(task, NULL);
233 }
234
235 printk("\n%s/%d's [current] stackdump:\n\n",
236 current->comm, current->pid);
237 dump_stack();
238 mutex_debug_show_all_locks();
239 printk("[ turning off deadlock detection. Please report this. ]\n\n");
240 local_irq_disable();
241}
242
243/*
244 * Recursively check for mutex deadlocks:
245 */
246static int check_deadlock(struct mutex *lock, int depth,
247 struct thread_info *ti, unsigned long ip)
248{
249 struct mutex *lockblk;
250 struct task_struct *task;
251
252 if (!debug_mutex_on)
253 return 0;
254
255 ti = lock->owner;
256 if (!ti)
257 return 0;
258
259 task = ti->task;
260 lockblk = NULL;
261 if (task->blocked_on)
262 lockblk = task->blocked_on->lock;
263
264 /* Self-deadlock: */
265 if (current == task) {
266 DEBUG_OFF();
267 if (depth)
268 return 1;
269 printk("\n==========================================\n");
270 printk( "[ BUG: lock recursion deadlock detected! |\n");
271 printk( "------------------------------------------\n");
272 report_deadlock(task, lock, NULL, ip);
273 return 0;
274 }
275
276 /* Ugh, something corrupted the lock data structure? */
277 if (depth > 20) {
278 DEBUG_OFF();
279 printk("\n===========================================\n");
280 printk( "[ BUG: infinite lock dependency detected!? |\n");
281 printk( "-------------------------------------------\n");
282 report_deadlock(task, lock, lockblk, ip);
283 return 0;
284 }
285
286 /* Recursively check for dependencies: */
287 if (lockblk && check_deadlock(lockblk, depth+1, ti, ip)) {
288 printk("\n============================================\n");
289 printk( "[ BUG: circular locking deadlock detected! ]\n");
290 printk( "--------------------------------------------\n");
291 report_deadlock(task, lock, lockblk, ip);
292 return 0;
293 }
294 return 0;
295}
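check_deadlock() above walks the chain "lock, owning task, lock that task is blocked on, ..." and reports a deadlock if the chain leads back to the current task, or becomes implausibly long, which hints at corruption. The same walk reduced to a standalone toy, with invented task/lock structs rather than the kernel's:

#include <stdio.h>

struct task;

struct lock {
        struct task *owner;             /* NULL if unlocked             */
};

struct task {
        const char *name;
        struct lock *blocked_on;        /* lock this task is waiting on */
};

/* Returns 1 if 'self' acquiring 'want' would close a wait cycle. */
static int would_deadlock(struct task *self, struct lock *want)
{
        int depth;

        for (depth = 0; want && want->owner; depth++) {
                struct task *holder = want->owner;

                if (holder == self)
                        return 1;       /* chain leads back to us        */
                if (depth > 20)
                        return 1;       /* corrupted or absurd chain     */
                want = holder->blocked_on;
        }
        return 0;                       /* chain ends at a runnable task */
}

int main(void)
{
        struct lock A = { 0 }, B = { 0 };
        struct task t1 = { "t1", 0 }, t2 = { "t2", 0 };

        A.owner = &t1;                  /* t1 holds A ...                */
        t1.blocked_on = &B;             /* ... and waits for B           */
        B.owner = &t2;                  /* t2 holds B                    */

        printf("t2 -> A: %s\n", would_deadlock(&t2, &A) ? "deadlock" : "ok");
        return 0;
}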
296
297/*
298 * Called when a task exits, this function checks whether the
299 * task is holding any locks, and reports the first one if so:
300 */
301void mutex_debug_check_no_locks_held(struct task_struct *task)
302{
303 struct list_head *curr, *next;
304 struct thread_info *t;
305 unsigned long flags;
306 struct mutex *lock;
307
308 if (!debug_mutex_on)
309 return;
310
311 debug_spin_lock_save(&debug_mutex_lock, flags);
312 list_for_each_safe(curr, next, &debug_mutex_held_locks) {
313 lock = list_entry(curr, struct mutex, held_list);
314 t = lock->owner;
315 if (t != task->thread_info)
316 continue;
317 list_del_init(curr);
318 DEBUG_OFF();
319 debug_spin_lock_restore(&debug_mutex_lock, flags);
320
321 printk("BUG: %s/%d, lock held at task exit time!\n",
322 task->comm, task->pid);
323 printk_lock(lock, 1);
324 if (lock->owner != task->thread_info)
325 printk("exiting task is not even the owner??\n");
326 return;
327 }
328 debug_spin_lock_restore(&debug_mutex_lock, flags);
329}
330
331/*
332 * Called when kernel memory is freed (or unmapped), or if a mutex
333 * is destroyed or reinitialized - this code checks whether there is
334 * any held lock in the memory range of <from> to <to>:
335 */
336void mutex_debug_check_no_locks_freed(const void *from, unsigned long len)
337{
338 struct list_head *curr, *next;
339 const void *to = from + len;
340 unsigned long flags;
341 struct mutex *lock;
342 void *lock_addr;
343
344 if (!debug_mutex_on)
345 return;
346
347 debug_spin_lock_save(&debug_mutex_lock, flags);
348 list_for_each_safe(curr, next, &debug_mutex_held_locks) {
349 lock = list_entry(curr, struct mutex, held_list);
350 lock_addr = lock;
351 if (lock_addr < from || lock_addr >= to)
352 continue;
353 list_del_init(curr);
354 DEBUG_OFF();
355 debug_spin_lock_restore(&debug_mutex_lock, flags);
356
357 printk("BUG: %s/%d, active lock [%p(%p-%p)] freed!\n",
358 current->comm, current->pid, lock, from, to);
359 dump_stack();
360 printk_lock(lock, 1);
361 if (lock->owner != current_thread_info())
362 printk("freeing task is not even the owner??\n");
363 return;
364 }
365 debug_spin_lock_restore(&debug_mutex_lock, flags);
366}
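The check above boils down to an address-range test: any registered held lock whose address falls inside the region being freed is a bug. A compact userspace sketch of that range check, with a plain array standing in for debug_mutex_held_locks:

#include <stdio.h>

#define NLOCKS 3

static const char *held_locks[NLOCKS];          /* addresses of "held" locks */

static void check_no_locks_freed(const void *from, unsigned long len)
{
        const char *lo = from;
        const char *hi = lo + len;
        int i;

        for (i = 0; i < NLOCKS; i++) {
                const char *addr = held_locks[i];

                if (!addr || addr < lo || addr >= hi)
                        continue;               /* outside the freed range */
                printf("BUG: active lock %p inside freed range [%p-%p)\n",
                       (const void *)addr, (const void *)lo, (const void *)hi);
        }
}

int main(void)
{
        static char arena[256];

        held_locks[0] = &arena[64];             /* pretend a lock lives here */
        check_no_locks_freed(arena, sizeof(arena));
        return 0;
}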
367
368/*
369 * Must be called with lock->wait_lock held. 28 * Must be called with lock->wait_lock held.
370 */ 29 */
371void debug_mutex_set_owner(struct mutex *lock, 30void debug_mutex_set_owner(struct mutex *lock, struct thread_info *new_owner)
372 struct thread_info *new_owner __IP_DECL__)
373{ 31{
374 lock->owner = new_owner; 32 lock->owner = new_owner;
375 DEBUG_WARN_ON(!list_empty(&lock->held_list));
376 if (debug_mutex_on) {
377 list_add_tail(&lock->held_list, &debug_mutex_held_locks);
378 lock->acquire_ip = ip;
379 }
380} 33}
381 34
382void debug_mutex_init_waiter(struct mutex_waiter *waiter) 35void debug_mutex_lock_common(struct mutex *lock, struct mutex_waiter *waiter)
383{ 36{
384 memset(waiter, 0x11, sizeof(*waiter)); 37 memset(waiter, MUTEX_DEBUG_INIT, sizeof(*waiter));
385 waiter->magic = waiter; 38 waiter->magic = waiter;
386 INIT_LIST_HEAD(&waiter->list); 39 INIT_LIST_HEAD(&waiter->list);
387} 40}
388 41
389void debug_mutex_wake_waiter(struct mutex *lock, struct mutex_waiter *waiter) 42void debug_mutex_wake_waiter(struct mutex *lock, struct mutex_waiter *waiter)
390{ 43{
391 SMP_DEBUG_WARN_ON(!spin_is_locked(&lock->wait_lock)); 44 SMP_DEBUG_LOCKS_WARN_ON(!spin_is_locked(&lock->wait_lock));
392 DEBUG_WARN_ON(list_empty(&lock->wait_list)); 45 DEBUG_LOCKS_WARN_ON(list_empty(&lock->wait_list));
393 DEBUG_WARN_ON(waiter->magic != waiter); 46 DEBUG_LOCKS_WARN_ON(waiter->magic != waiter);
394 DEBUG_WARN_ON(list_empty(&waiter->list)); 47 DEBUG_LOCKS_WARN_ON(list_empty(&waiter->list));
395} 48}
396 49
397void debug_mutex_free_waiter(struct mutex_waiter *waiter) 50void debug_mutex_free_waiter(struct mutex_waiter *waiter)
398{ 51{
399 DEBUG_WARN_ON(!list_empty(&waiter->list)); 52 DEBUG_LOCKS_WARN_ON(!list_empty(&waiter->list));
400 memset(waiter, 0x22, sizeof(*waiter)); 53 memset(waiter, MUTEX_DEBUG_FREE, sizeof(*waiter));
401} 54}
402 55
403void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter, 56void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter,
404 struct thread_info *ti __IP_DECL__) 57 struct thread_info *ti)
405{ 58{
406 SMP_DEBUG_WARN_ON(!spin_is_locked(&lock->wait_lock)); 59 SMP_DEBUG_LOCKS_WARN_ON(!spin_is_locked(&lock->wait_lock));
407 check_deadlock(lock, 0, ti, ip); 60
408 /* Mark the current thread as blocked on the lock: */ 61 /* Mark the current thread as blocked on the lock: */
409 ti->task->blocked_on = waiter; 62 ti->task->blocked_on = waiter;
410 waiter->lock = lock; 63 waiter->lock = lock;
@@ -413,9 +66,9 @@ void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter,
413void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, 66void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter,
414 struct thread_info *ti) 67 struct thread_info *ti)
415{ 68{
416 DEBUG_WARN_ON(list_empty(&waiter->list)); 69 DEBUG_LOCKS_WARN_ON(list_empty(&waiter->list));
417 DEBUG_WARN_ON(waiter->task != ti->task); 70 DEBUG_LOCKS_WARN_ON(waiter->task != ti->task);
418 DEBUG_WARN_ON(ti->task->blocked_on != waiter); 71 DEBUG_LOCKS_WARN_ON(ti->task->blocked_on != waiter);
419 ti->task->blocked_on = NULL; 72 ti->task->blocked_on = NULL;
420 73
421 list_del_init(&waiter->list); 74 list_del_init(&waiter->list);
@@ -424,24 +77,23 @@ void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter,
424 77
425void debug_mutex_unlock(struct mutex *lock) 78void debug_mutex_unlock(struct mutex *lock)
426{ 79{
427 DEBUG_WARN_ON(lock->magic != lock); 80 DEBUG_LOCKS_WARN_ON(lock->owner != current_thread_info());
428 DEBUG_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next); 81 DEBUG_LOCKS_WARN_ON(lock->magic != lock);
429 DEBUG_WARN_ON(lock->owner != current_thread_info()); 82 DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next);
430 if (debug_mutex_on) { 83 DEBUG_LOCKS_WARN_ON(lock->owner != current_thread_info());
431 DEBUG_WARN_ON(list_empty(&lock->held_list));
432 list_del_init(&lock->held_list);
433 }
434} 84}
435 85
436void debug_mutex_init(struct mutex *lock, const char *name) 86void debug_mutex_init(struct mutex *lock, const char *name,
87 struct lock_class_key *key)
437{ 88{
89#ifdef CONFIG_DEBUG_LOCK_ALLOC
438 /* 90 /*
439 * Make sure we are not reinitializing a held lock: 91 * Make sure we are not reinitializing a held lock:
440 */ 92 */
441 mutex_debug_check_no_locks_freed((void *)lock, sizeof(*lock)); 93 debug_check_no_locks_freed((void *)lock, sizeof(*lock));
94 lockdep_init_map(&lock->dep_map, name, key);
95#endif
442 lock->owner = NULL; 96 lock->owner = NULL;
443 INIT_LIST_HEAD(&lock->held_list);
444 lock->name = name;
445 lock->magic = lock; 97 lock->magic = lock;
446} 98}
447 99
@@ -455,7 +107,7 @@ void debug_mutex_init(struct mutex *lock, const char *name)
455 */ 107 */
456void fastcall mutex_destroy(struct mutex *lock) 108void fastcall mutex_destroy(struct mutex *lock)
457{ 109{
458 DEBUG_WARN_ON(mutex_is_locked(lock)); 110 DEBUG_LOCKS_WARN_ON(mutex_is_locked(lock));
459 lock->magic = NULL; 111 lock->magic = NULL;
460} 112}
461 113
diff --git a/kernel/mutex-debug.h b/kernel/mutex-debug.h
index fd384050acb1..babfbdfc534b 100644
--- a/kernel/mutex-debug.h
+++ b/kernel/mutex-debug.h
@@ -10,125 +10,44 @@
10 * More details are in kernel/mutex-debug.c. 10 * More details are in kernel/mutex-debug.c.
11 */ 11 */
12 12
13extern spinlock_t debug_mutex_lock;
14extern struct list_head debug_mutex_held_locks;
15extern int debug_mutex_on;
16
17/*
18 * In the debug case we carry the caller's instruction pointer into
19 * other functions, but we dont want the function argument overhead
20 * in the nondebug case - hence these macros:
21 */
22#define __IP_DECL__ , unsigned long ip
23#define __IP__ , ip
24#define __RET_IP__ , (unsigned long)__builtin_return_address(0)
25
26/* 13/*
27 * This must be called with lock->wait_lock held. 14 * This must be called with lock->wait_lock held.
28 */ 15 */
29extern void debug_mutex_set_owner(struct mutex *lock, 16extern void
30 struct thread_info *new_owner __IP_DECL__); 17debug_mutex_set_owner(struct mutex *lock, struct thread_info *new_owner);
31 18
32static inline void debug_mutex_clear_owner(struct mutex *lock) 19static inline void debug_mutex_clear_owner(struct mutex *lock)
33{ 20{
34 lock->owner = NULL; 21 lock->owner = NULL;
35} 22}
36 23
37extern void debug_mutex_init_waiter(struct mutex_waiter *waiter); 24extern void debug_mutex_lock_common(struct mutex *lock,
25 struct mutex_waiter *waiter);
38extern void debug_mutex_wake_waiter(struct mutex *lock, 26extern void debug_mutex_wake_waiter(struct mutex *lock,
39 struct mutex_waiter *waiter); 27 struct mutex_waiter *waiter);
40extern void debug_mutex_free_waiter(struct mutex_waiter *waiter); 28extern void debug_mutex_free_waiter(struct mutex_waiter *waiter);
41extern void debug_mutex_add_waiter(struct mutex *lock, 29extern void debug_mutex_add_waiter(struct mutex *lock,
42 struct mutex_waiter *waiter, 30 struct mutex_waiter *waiter,
43 struct thread_info *ti __IP_DECL__); 31 struct thread_info *ti);
44extern void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, 32extern void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter,
45 struct thread_info *ti); 33 struct thread_info *ti);
46extern void debug_mutex_unlock(struct mutex *lock); 34extern void debug_mutex_unlock(struct mutex *lock);
47extern void debug_mutex_init(struct mutex *lock, const char *name); 35extern void debug_mutex_init(struct mutex *lock, const char *name,
48 36 struct lock_class_key *key);
49#define debug_spin_lock(lock) \
50 do { \
51 local_irq_disable(); \
52 if (debug_mutex_on) \
53 spin_lock(lock); \
54 } while (0)
55 37
56#define debug_spin_unlock(lock) \ 38#define spin_lock_mutex(lock, flags) \
57 do { \
58 if (debug_mutex_on) \
59 spin_unlock(lock); \
60 local_irq_enable(); \
61 preempt_check_resched(); \
62 } while (0)
63
64#define debug_spin_lock_save(lock, flags) \
65 do { \ 39 do { \
40 struct mutex *l = container_of(lock, struct mutex, wait_lock); \
41 \
42 DEBUG_LOCKS_WARN_ON(in_interrupt()); \
66 local_irq_save(flags); \ 43 local_irq_save(flags); \
67 if (debug_mutex_on) \ 44 __raw_spin_lock(&(lock)->raw_lock); \
68 spin_lock(lock); \ 45 DEBUG_LOCKS_WARN_ON(l->magic != l); \
69 } while (0) 46 } while (0)
70 47
71#define debug_spin_lock_restore(lock, flags) \ 48#define spin_unlock_mutex(lock, flags) \
72 do { \ 49 do { \
73 if (debug_mutex_on) \ 50 __raw_spin_unlock(&(lock)->raw_lock); \
74 spin_unlock(lock); \
75 local_irq_restore(flags); \ 51 local_irq_restore(flags); \
76 preempt_check_resched(); \ 52 preempt_check_resched(); \
77 } while (0) 53 } while (0)
78
79#define spin_lock_mutex(lock) \
80 do { \
81 struct mutex *l = container_of(lock, struct mutex, wait_lock); \
82 \
83 DEBUG_WARN_ON(in_interrupt()); \
84 debug_spin_lock(&debug_mutex_lock); \
85 spin_lock(lock); \
86 DEBUG_WARN_ON(l->magic != l); \
87 } while (0)
88
89#define spin_unlock_mutex(lock) \
90 do { \
91 spin_unlock(lock); \
92 debug_spin_unlock(&debug_mutex_lock); \
93 } while (0)
94
95#define DEBUG_OFF() \
96do { \
97 if (debug_mutex_on) { \
98 debug_mutex_on = 0; \
99 console_verbose(); \
100 if (spin_is_locked(&debug_mutex_lock)) \
101 spin_unlock(&debug_mutex_lock); \
102 } \
103} while (0)
104
105#define DEBUG_BUG() \
106do { \
107 if (debug_mutex_on) { \
108 DEBUG_OFF(); \
109 BUG(); \
110 } \
111} while (0)
112
113#define DEBUG_WARN_ON(c) \
114do { \
115 if (unlikely(c && debug_mutex_on)) { \
116 DEBUG_OFF(); \
117 WARN_ON(1); \
118 } \
119} while (0)
120
121# define DEBUG_BUG_ON(c) \
122do { \
123 if (unlikely(c)) \
124 DEBUG_BUG(); \
125} while (0)
126
127#ifdef CONFIG_SMP
128# define SMP_DEBUG_WARN_ON(c) DEBUG_WARN_ON(c)
129# define SMP_DEBUG_BUG_ON(c) DEBUG_BUG_ON(c)
130#else
131# define SMP_DEBUG_WARN_ON(c) do { } while (0)
132# define SMP_DEBUG_BUG_ON(c) do { } while (0)
133#endif
134
diff --git a/kernel/mutex.c b/kernel/mutex.c
index 5449b210d9ed..8c71cf72a497 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -17,6 +17,7 @@
17#include <linux/module.h> 17#include <linux/module.h>
18#include <linux/spinlock.h> 18#include <linux/spinlock.h>
19#include <linux/interrupt.h> 19#include <linux/interrupt.h>
20#include <linux/debug_locks.h>
20 21
21/* 22/*
22 * In the DEBUG case we are using the "NULL fastpath" for mutexes, 23 * In the DEBUG case we are using the "NULL fastpath" for mutexes,
@@ -38,13 +39,14 @@
38 * 39 *
39 * It is not allowed to initialize an already locked mutex. 40 * It is not allowed to initialize an already locked mutex.
40 */ 41 */
41void fastcall __mutex_init(struct mutex *lock, const char *name) 42void
43__mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key)
42{ 44{
43 atomic_set(&lock->count, 1); 45 atomic_set(&lock->count, 1);
44 spin_lock_init(&lock->wait_lock); 46 spin_lock_init(&lock->wait_lock);
45 INIT_LIST_HEAD(&lock->wait_list); 47 INIT_LIST_HEAD(&lock->wait_list);
46 48
47 debug_mutex_init(lock, name); 49 debug_mutex_init(lock, name, key);
48} 50}
49 51
50EXPORT_SYMBOL(__mutex_init); 52EXPORT_SYMBOL(__mutex_init);
@@ -56,7 +58,7 @@ EXPORT_SYMBOL(__mutex_init);
56 * branch is predicted by the CPU as default-untaken. 58 * branch is predicted by the CPU as default-untaken.
57 */ 59 */
58static void fastcall noinline __sched 60static void fastcall noinline __sched
59__mutex_lock_slowpath(atomic_t *lock_count __IP_DECL__); 61__mutex_lock_slowpath(atomic_t *lock_count);
60 62
61/*** 63/***
62 * mutex_lock - acquire the mutex 64 * mutex_lock - acquire the mutex
@@ -79,7 +81,7 @@ __mutex_lock_slowpath(atomic_t *lock_count __IP_DECL__);
79 * 81 *
80 * This function is similar to (but not equivalent to) down(). 82 * This function is similar to (but not equivalent to) down().
81 */ 83 */
82void fastcall __sched mutex_lock(struct mutex *lock) 84void inline fastcall __sched mutex_lock(struct mutex *lock)
83{ 85{
84 might_sleep(); 86 might_sleep();
85 /* 87 /*
@@ -92,7 +94,7 @@ void fastcall __sched mutex_lock(struct mutex *lock)
92EXPORT_SYMBOL(mutex_lock); 94EXPORT_SYMBOL(mutex_lock);
93 95
94static void fastcall noinline __sched 96static void fastcall noinline __sched
95__mutex_unlock_slowpath(atomic_t *lock_count __IP_DECL__); 97__mutex_unlock_slowpath(atomic_t *lock_count);
96 98
97/*** 99/***
98 * mutex_unlock - release the mutex 100 * mutex_unlock - release the mutex
@@ -120,17 +122,18 @@ EXPORT_SYMBOL(mutex_unlock);
120 * Lock a mutex (possibly interruptible), slowpath: 122 * Lock a mutex (possibly interruptible), slowpath:
121 */ 123 */
122static inline int __sched 124static inline int __sched
123__mutex_lock_common(struct mutex *lock, long state __IP_DECL__) 125__mutex_lock_common(struct mutex *lock, long state, unsigned int subclass)
124{ 126{
125 struct task_struct *task = current; 127 struct task_struct *task = current;
126 struct mutex_waiter waiter; 128 struct mutex_waiter waiter;
127 unsigned int old_val; 129 unsigned int old_val;
130 unsigned long flags;
128 131
129 debug_mutex_init_waiter(&waiter); 132 spin_lock_mutex(&lock->wait_lock, flags);
130 133
131 spin_lock_mutex(&lock->wait_lock); 134 debug_mutex_lock_common(lock, &waiter);
132 135 mutex_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
133 debug_mutex_add_waiter(lock, &waiter, task->thread_info, ip); 136 debug_mutex_add_waiter(lock, &waiter, task->thread_info);
134 137
135 /* add waiting tasks to the end of the waitqueue (FIFO): */ 138 /* add waiting tasks to the end of the waitqueue (FIFO): */
136 list_add_tail(&waiter.list, &lock->wait_list); 139 list_add_tail(&waiter.list, &lock->wait_list);
@@ -157,7 +160,8 @@ __mutex_lock_common(struct mutex *lock, long state __IP_DECL__)
157 if (unlikely(state == TASK_INTERRUPTIBLE && 160 if (unlikely(state == TASK_INTERRUPTIBLE &&
158 signal_pending(task))) { 161 signal_pending(task))) {
159 mutex_remove_waiter(lock, &waiter, task->thread_info); 162 mutex_remove_waiter(lock, &waiter, task->thread_info);
160 spin_unlock_mutex(&lock->wait_lock); 163 mutex_release(&lock->dep_map, 1, _RET_IP_);
164 spin_unlock_mutex(&lock->wait_lock, flags);
161 165
162 debug_mutex_free_waiter(&waiter); 166 debug_mutex_free_waiter(&waiter);
163 return -EINTR; 167 return -EINTR;
@@ -165,48 +169,57 @@ __mutex_lock_common(struct mutex *lock, long state __IP_DECL__)
165 __set_task_state(task, state); 169 __set_task_state(task, state);
166 170
167 /* didnt get the lock, go to sleep: */ 171 /* didnt get the lock, go to sleep: */
168 spin_unlock_mutex(&lock->wait_lock); 172 spin_unlock_mutex(&lock->wait_lock, flags);
169 schedule(); 173 schedule();
170 spin_lock_mutex(&lock->wait_lock); 174 spin_lock_mutex(&lock->wait_lock, flags);
171 } 175 }
172 176
173 /* got the lock - rejoice! */ 177 /* got the lock - rejoice! */
174 mutex_remove_waiter(lock, &waiter, task->thread_info); 178 mutex_remove_waiter(lock, &waiter, task->thread_info);
175 debug_mutex_set_owner(lock, task->thread_info __IP__); 179 debug_mutex_set_owner(lock, task->thread_info);
176 180
177 /* set it to 0 if there are no waiters left: */ 181 /* set it to 0 if there are no waiters left: */
178 if (likely(list_empty(&lock->wait_list))) 182 if (likely(list_empty(&lock->wait_list)))
179 atomic_set(&lock->count, 0); 183 atomic_set(&lock->count, 0);
180 184
181 spin_unlock_mutex(&lock->wait_lock); 185 spin_unlock_mutex(&lock->wait_lock, flags);
182 186
183 debug_mutex_free_waiter(&waiter); 187 debug_mutex_free_waiter(&waiter);
184 188
185 DEBUG_WARN_ON(list_empty(&lock->held_list));
186 DEBUG_WARN_ON(lock->owner != task->thread_info);
187
188 return 0; 189 return 0;
189} 190}
190 191
191static void fastcall noinline __sched 192static void fastcall noinline __sched
192__mutex_lock_slowpath(atomic_t *lock_count __IP_DECL__) 193__mutex_lock_slowpath(atomic_t *lock_count)
193{ 194{
194 struct mutex *lock = container_of(lock_count, struct mutex, count); 195 struct mutex *lock = container_of(lock_count, struct mutex, count);
195 196
196 __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE __IP__); 197 __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0);
198}
199
200#ifdef CONFIG_DEBUG_LOCK_ALLOC
201void __sched
202mutex_lock_nested(struct mutex *lock, unsigned int subclass)
203{
204 might_sleep();
205 __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, subclass);
197} 206}
198 207
208EXPORT_SYMBOL_GPL(mutex_lock_nested);
209#endif
210
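The slow path above queues a waiter, repeatedly tries to flip the counter to -1 ("locked, possible waiters"), and sleeps via schedule() until an unlock lets it observe the old value 1. Below is a loose userspace sketch of that counter protocol, assuming C11 atomics in place of atomic_xchg() and a pthread condition variable in place of the scheduler and the FIFO wait list; it illustrates the state machine, it is not the kernel implementation.

/* Build with: cc -pthread mutex_sketch.c */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

struct sketch_mutex {
        atomic_int count;               /* 1 unlocked, 0 locked, -1 contended */
        pthread_mutex_t wait_lock;      /* protects the (implicit) wait queue */
        pthread_cond_t wait;            /* stand-in for the FIFO waiter list  */
};

static void sketch_lock(struct sketch_mutex *m)
{
        int expected = 1;

        /* fast path: 1 -> 0 means we took a free mutex */
        if (atomic_compare_exchange_strong(&m->count, &expected, 0))
                return;

        pthread_mutex_lock(&m->wait_lock);
        /* slow path: force the counter to -1; if it was 1, we now own it */
        while (atomic_exchange(&m->count, -1) != 1)
                pthread_cond_wait(&m->wait, &m->wait_lock);  /* "schedule()" */

        /* got it; the kernel relaxes -1 to 0 only when its wait list is
         * empty, this sketch has no list and its unlock always signals */
        atomic_store(&m->count, 0);
        pthread_mutex_unlock(&m->wait_lock);
}

static void sketch_unlock(struct sketch_mutex *m)
{
        pthread_mutex_lock(&m->wait_lock);
        atomic_store(&m->count, 1);             /* release                */
        pthread_cond_signal(&m->wait);          /* wake the first waiter  */
        pthread_mutex_unlock(&m->wait_lock);
}

int main(void)
{
        struct sketch_mutex m = {
                1, PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER
        };

        sketch_lock(&m);
        printf("locked\n");
        sketch_unlock(&m);
        printf("unlocked\n");
        return 0;
}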
199/* 211/*
200 * Release the lock, slowpath: 212 * Release the lock, slowpath:
201 */ 213 */
202static fastcall noinline void 214static fastcall inline void
203__mutex_unlock_slowpath(atomic_t *lock_count __IP_DECL__) 215__mutex_unlock_common_slowpath(atomic_t *lock_count, int nested)
204{ 216{
205 struct mutex *lock = container_of(lock_count, struct mutex, count); 217 struct mutex *lock = container_of(lock_count, struct mutex, count);
218 unsigned long flags;
206 219
207 DEBUG_WARN_ON(lock->owner != current_thread_info()); 220 spin_lock_mutex(&lock->wait_lock, flags);
208 221 mutex_release(&lock->dep_map, nested, _RET_IP_);
209 spin_lock_mutex(&lock->wait_lock); 222 debug_mutex_unlock(lock);
210 223
211 /* 224 /*
212 * some architectures leave the lock unlocked in the fastpath failure 225 * some architectures leave the lock unlocked in the fastpath failure
@@ -216,8 +229,6 @@ __mutex_unlock_slowpath(atomic_t *lock_count __IP_DECL__)
216 if (__mutex_slowpath_needs_to_unlock()) 229 if (__mutex_slowpath_needs_to_unlock())
217 atomic_set(&lock->count, 1); 230 atomic_set(&lock->count, 1);
218 231
219 debug_mutex_unlock(lock);
220
221 if (!list_empty(&lock->wait_list)) { 232 if (!list_empty(&lock->wait_list)) {
222 /* get the first entry from the wait-list: */ 233 /* get the first entry from the wait-list: */
223 struct mutex_waiter *waiter = 234 struct mutex_waiter *waiter =
@@ -231,7 +242,16 @@ __mutex_unlock_slowpath(atomic_t *lock_count __IP_DECL__)
231 242
232 debug_mutex_clear_owner(lock); 243 debug_mutex_clear_owner(lock);
233 244
234 spin_unlock_mutex(&lock->wait_lock); 245 spin_unlock_mutex(&lock->wait_lock, flags);
246}
247
248/*
249 * Release the lock, slowpath:
250 */
251static fastcall noinline void
252__mutex_unlock_slowpath(atomic_t *lock_count)
253{
254 __mutex_unlock_common_slowpath(lock_count, 1);
235} 255}
236 256
237/* 257/*
@@ -239,7 +259,7 @@ __mutex_unlock_slowpath(atomic_t *lock_count __IP_DECL__)
239 * mutex_lock_interruptible() and mutex_trylock(). 259 * mutex_lock_interruptible() and mutex_trylock().
240 */ 260 */
241static int fastcall noinline __sched 261static int fastcall noinline __sched
242__mutex_lock_interruptible_slowpath(atomic_t *lock_count __IP_DECL__); 262__mutex_lock_interruptible_slowpath(atomic_t *lock_count);
243 263
244/*** 264/***
245 * mutex_lock_interruptible - acquire the mutex, interruptable 265 * mutex_lock_interruptible - acquire the mutex, interruptable
@@ -262,11 +282,11 @@ int fastcall __sched mutex_lock_interruptible(struct mutex *lock)
262EXPORT_SYMBOL(mutex_lock_interruptible); 282EXPORT_SYMBOL(mutex_lock_interruptible);
263 283
264static int fastcall noinline __sched 284static int fastcall noinline __sched
265__mutex_lock_interruptible_slowpath(atomic_t *lock_count __IP_DECL__) 285__mutex_lock_interruptible_slowpath(atomic_t *lock_count)
266{ 286{
267 struct mutex *lock = container_of(lock_count, struct mutex, count); 287 struct mutex *lock = container_of(lock_count, struct mutex, count);
268 288
269 return __mutex_lock_common(lock, TASK_INTERRUPTIBLE __IP__); 289 return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0);
270} 290}
271 291
272/* 292/*
@@ -276,18 +296,21 @@ __mutex_lock_interruptible_slowpath(atomic_t *lock_count __IP_DECL__)
276static inline int __mutex_trylock_slowpath(atomic_t *lock_count) 296static inline int __mutex_trylock_slowpath(atomic_t *lock_count)
277{ 297{
278 struct mutex *lock = container_of(lock_count, struct mutex, count); 298 struct mutex *lock = container_of(lock_count, struct mutex, count);
299 unsigned long flags;
279 int prev; 300 int prev;
280 301
281 spin_lock_mutex(&lock->wait_lock); 302 spin_lock_mutex(&lock->wait_lock, flags);
282 303
283 prev = atomic_xchg(&lock->count, -1); 304 prev = atomic_xchg(&lock->count, -1);
284 if (likely(prev == 1)) 305 if (likely(prev == 1)) {
285 debug_mutex_set_owner(lock, current_thread_info() __RET_IP__); 306 debug_mutex_set_owner(lock, current_thread_info());
307 mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_);
308 }
286 /* Set it back to 0 if there are no waiters: */ 309 /* Set it back to 0 if there are no waiters: */
287 if (likely(list_empty(&lock->wait_list))) 310 if (likely(list_empty(&lock->wait_list)))
288 atomic_set(&lock->count, 0); 311 atomic_set(&lock->count, 0);
289 312
290 spin_unlock_mutex(&lock->wait_lock); 313 spin_unlock_mutex(&lock->wait_lock, flags);
291 314
292 return prev == 1; 315 return prev == 1;
293} 316}
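__mutex_trylock_slowpath() above claims the mutex by atomically exchanging the counter with -1 and treating an old value of 1 as success, then relaxing the counter back to 0 ("locked, no waiters") when the wait list is empty. A single-threaded sketch of that exchange protocol using C11 atomic_exchange() in place of atomic_xchg(); the real code does the post-exchange fixup under wait_lock, which the sketch omits:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_int count = 1;            /* 1 unlocked, 0 locked, -1 contended */

static bool sketch_trylock(void)
{
        int prev = atomic_exchange(&count, -1);

        /* no wait queue in this sketch, so always relax -1 back to 0
         * ("locked, no waiters"), as the kernel does when its wait list
         * is empty -- there it happens under wait_lock */
        atomic_store(&count, 0);

        return prev == 1;               /* we own it only if it was free */
}

int main(void)
{
        printf("first  trylock: %s\n", sketch_trylock() ? "ok" : "busy");
        printf("second trylock: %s\n", sketch_trylock() ? "ok" : "busy");
        return 0;
}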
@@ -306,7 +329,7 @@ static inline int __mutex_trylock_slowpath(atomic_t *lock_count)
306 * This function must not be used in interrupt context. The 329 * This function must not be used in interrupt context. The
307 * mutex must be released by the same task that acquired it. 330 * mutex must be released by the same task that acquired it.
308 */ 331 */
309int fastcall mutex_trylock(struct mutex *lock) 332int fastcall __sched mutex_trylock(struct mutex *lock)
310{ 333{
311 return __mutex_fastpath_trylock(&lock->count, 334 return __mutex_fastpath_trylock(&lock->count,
312 __mutex_trylock_slowpath); 335 __mutex_trylock_slowpath);
diff --git a/kernel/mutex.h b/kernel/mutex.h
index 00fe84e7b672..a075dafbb290 100644
--- a/kernel/mutex.h
+++ b/kernel/mutex.h
@@ -9,27 +9,22 @@
9 * !CONFIG_DEBUG_MUTEXES case. Most of them are NOPs: 9 * !CONFIG_DEBUG_MUTEXES case. Most of them are NOPs:
10 */ 10 */
11 11
12#define spin_lock_mutex(lock) spin_lock(lock) 12#define spin_lock_mutex(lock, flags) \
13#define spin_unlock_mutex(lock) spin_unlock(lock) 13 do { spin_lock(lock); (void)(flags); } while (0)
14#define spin_unlock_mutex(lock, flags) \
15 do { spin_unlock(lock); (void)(flags); } while (0)
14#define mutex_remove_waiter(lock, waiter, ti) \ 16#define mutex_remove_waiter(lock, waiter, ti) \
15 __list_del((waiter)->list.prev, (waiter)->list.next) 17 __list_del((waiter)->list.prev, (waiter)->list.next)
16 18
17#define DEBUG_WARN_ON(c) do { } while (0)
18#define debug_mutex_set_owner(lock, new_owner) do { } while (0) 19#define debug_mutex_set_owner(lock, new_owner) do { } while (0)
19#define debug_mutex_clear_owner(lock) do { } while (0) 20#define debug_mutex_clear_owner(lock) do { } while (0)
20#define debug_mutex_init_waiter(waiter) do { } while (0)
21#define debug_mutex_wake_waiter(lock, waiter) do { } while (0) 21#define debug_mutex_wake_waiter(lock, waiter) do { } while (0)
22#define debug_mutex_free_waiter(waiter) do { } while (0) 22#define debug_mutex_free_waiter(waiter) do { } while (0)
23#define debug_mutex_add_waiter(lock, waiter, ti, ip) do { } while (0) 23#define debug_mutex_add_waiter(lock, waiter, ti) do { } while (0)
24#define debug_mutex_unlock(lock) do { } while (0) 24#define debug_mutex_unlock(lock) do { } while (0)
25#define debug_mutex_init(lock, name) do { } while (0) 25#define debug_mutex_init(lock, name, key) do { } while (0)
26
27/*
28 * Return-address parameters/declarations. They are very useful for
29 * debugging, but add overhead in the !DEBUG case - so we go the
30 * trouble of using this not too elegant but zero-cost solution:
31 */
32#define __IP_DECL__
33#define __IP__
34#define __RET_IP__
35 26
27static inline void
28debug_mutex_lock_common(struct mutex *lock, struct mutex_waiter *waiter)
29{
30}
diff --git a/kernel/panic.c b/kernel/panic.c
index cc2a4c9c36ac..525e365f7239 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -8,7 +8,6 @@
8 * This function is used through-out the kernel (including mm and fs) 8 * This function is used through-out the kernel (including mm and fs)
9 * to indicate a major problem. 9 * to indicate a major problem.
10 */ 10 */
11#include <linux/config.h>
12#include <linux/module.h> 11#include <linux/module.h>
13#include <linux/sched.h> 12#include <linux/sched.h>
14#include <linux/delay.h> 13#include <linux/delay.h>
@@ -19,6 +18,7 @@
19#include <linux/interrupt.h> 18#include <linux/interrupt.h>
20#include <linux/nmi.h> 19#include <linux/nmi.h>
21#include <linux/kexec.h> 20#include <linux/kexec.h>
21#include <linux/debug_locks.h>
22 22
23int panic_on_oops; 23int panic_on_oops;
24int tainted; 24int tainted;
@@ -173,6 +173,7 @@ const char *print_tainted(void)
173 173
174void add_taint(unsigned flag) 174void add_taint(unsigned flag)
175{ 175{
176 debug_locks = 0; /* can't trust the integrity of the kernel anymore */
176 tainted |= flag; 177 tainted |= flag;
177} 178}
178EXPORT_SYMBOL(add_taint); 179EXPORT_SYMBOL(add_taint);
@@ -257,6 +258,7 @@ int oops_may_print(void)
257 */ 258 */
258void oops_enter(void) 259void oops_enter(void)
259{ 260{
261 debug_locks_off(); /* can't trust the integrity of the kernel anymore */
260 do_oops_enter_exit(); 262 do_oops_enter_exit();
261} 263}
262 264
@@ -268,3 +270,15 @@ void oops_exit(void)
268{ 270{
269 do_oops_enter_exit(); 271 do_oops_enter_exit();
270} 272}
273
274#ifdef CONFIG_CC_STACKPROTECTOR
275/*
276 * Called when gcc's -fstack-protector feature is used, and
277 * gcc detects corruption of the on-stack canary value
278 */
279void __stack_chk_fail(void)
280{
281 panic("stack-protector: Kernel stack is corrupted");
282}
283EXPORT_SYMBOL(__stack_chk_fail);
284#endif
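For reference, the hook added above is what gcc's -fstack-protector emits calls to: the compiler plants a canary next to on-stack buffers, and the function epilogue calls __stack_chk_fail() if the canary was overwritten. A deliberately buggy userspace sketch that trips the equivalent glibc handler when built with "gcc -fstack-protector-all -O0"; in the kernel the handler is the panic() above.

/* Deliberately buggy -- run only as a demonstration. */
#include <stdio.h>
#include <string.h>

static void overflow(const char *src)
{
        char buf[8];

        strcpy(buf, src);               /* writes past buf, over the canary */
        printf("copied: %s\n", buf);
}                                       /* epilogue: canary check fails here */

int main(void)
{
        overflow("this string is much longer than eight bytes");
        return 0;                       /* not reached: __stack_chk_fail() aborts */
}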
diff --git a/kernel/params.c b/kernel/params.c
index af43ecdc8d9b..f406655d6653 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -15,7 +15,6 @@
15 along with this program; if not, write to the Free Software 15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 16 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17*/ 17*/
18#include <linux/config.h>
19#include <linux/moduleparam.h> 18#include <linux/moduleparam.h>
20#include <linux/kernel.h> 19#include <linux/kernel.h>
21#include <linux/string.h> 20#include <linux/string.h>
@@ -548,6 +547,7 @@ static void __init kernel_param_sysfs_setup(const char *name,
548 unsigned int name_skip) 547 unsigned int name_skip)
549{ 548{
550 struct module_kobject *mk; 549 struct module_kobject *mk;
550 int ret;
551 551
552 mk = kzalloc(sizeof(struct module_kobject), GFP_KERNEL); 552 mk = kzalloc(sizeof(struct module_kobject), GFP_KERNEL);
553 BUG_ON(!mk); 553 BUG_ON(!mk);
@@ -555,7 +555,8 @@ static void __init kernel_param_sysfs_setup(const char *name,
555 mk->mod = THIS_MODULE; 555 mk->mod = THIS_MODULE;
556 kobj_set_kset_s(mk, module_subsys); 556 kobj_set_kset_s(mk, module_subsys);
557 kobject_set_name(&mk->kobj, name); 557 kobject_set_name(&mk->kobj, name);
558 kobject_register(&mk->kobj); 558 ret = kobject_register(&mk->kobj);
559 BUG_ON(ret < 0);
559 560
560 /* no need to keep the kobject if no parameter is exported */ 561 /* no need to keep the kobject if no parameter is exported */
561 if (!param_sysfs_setup(mk, kparam, num_params, name_skip)) { 562 if (!param_sysfs_setup(mk, kparam, num_params, name_skip)) {
@@ -685,13 +686,20 @@ decl_subsys(module, &module_ktype, NULL);
685 */ 686 */
686static int __init param_sysfs_init(void) 687static int __init param_sysfs_init(void)
687{ 688{
688 subsystem_register(&module_subsys); 689 int ret;
690
691 ret = subsystem_register(&module_subsys);
692 if (ret < 0) {
693 printk(KERN_WARNING "%s (%d): subsystem_register error: %d\n",
694 __FILE__, __LINE__, ret);
695 return ret;
696 }
689 697
690 param_sysfs_builtin(); 698 param_sysfs_builtin();
691 699
692 return 0; 700 return 0;
693} 701}
694__initcall(param_sysfs_init); 702subsys_initcall(param_sysfs_init);
695 703
696EXPORT_SYMBOL(param_set_byte); 704EXPORT_SYMBOL(param_set_byte);
697EXPORT_SYMBOL(param_get_byte); 705EXPORT_SYMBOL(param_get_byte);
diff --git a/kernel/pid.c b/kernel/pid.c
index eeb836b65ca4..8387e8c68193 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -218,14 +218,11 @@ struct pid * fastcall find_pid(int nr)
218 return NULL; 218 return NULL;
219} 219}
220 220
221int fastcall attach_pid(task_t *task, enum pid_type type, int nr) 221int fastcall attach_pid(struct task_struct *task, enum pid_type type, int nr)
222{ 222{
223 struct pid_link *link; 223 struct pid_link *link;
224 struct pid *pid; 224 struct pid *pid;
225 225
226 WARN_ON(!task->pid); /* to be removed soon */
227 WARN_ON(!nr); /* to be removed soon */
228
229 link = &task->pids[type]; 226 link = &task->pids[type];
230 link->pid = pid = find_pid(nr); 227 link->pid = pid = find_pid(nr);
231 hlist_add_head_rcu(&link->node, &pid->tasks[type]); 228 hlist_add_head_rcu(&link->node, &pid->tasks[type]);
@@ -233,7 +230,7 @@ int fastcall attach_pid(task_t *task, enum pid_type type, int nr)
233 return 0; 230 return 0;
234} 231}
235 232
236void fastcall detach_pid(task_t *task, enum pid_type type) 233void fastcall detach_pid(struct task_struct *task, enum pid_type type)
237{ 234{
238 struct pid_link *link; 235 struct pid_link *link;
239 struct pid *pid; 236 struct pid *pid;
@@ -252,6 +249,15 @@ void fastcall detach_pid(task_t *task, enum pid_type type)
252 free_pid(pid); 249 free_pid(pid);
253} 250}
254 251
252/* transfer_pid is an optimization of attach_pid(new), detach_pid(old) */
253void fastcall transfer_pid(struct task_struct *old, struct task_struct *new,
254 enum pid_type type)
255{
256 new->pids[type].pid = old->pids[type].pid;
257 hlist_replace_rcu(&old->pids[type].node, &new->pids[type].node);
258 old->pids[type].pid = NULL;
259}
260
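transfer_pid() above swaps the new task's pid link into the exact list position of the old one, instead of doing a detach followed by an attach. The underlying node-replacement step, reduced to a plain doubly linked list; the kernel's hlist_replace_rcu() works on an hlist and adds RCU publication ordering, which this sketch ignores:

#include <stdio.h>

struct node {
        struct node *prev, *next;
        int pid;
};

/* Splice 'new' into the exact list position of 'old'; 'old' drops out. */
static void node_replace(struct node *old, struct node *new)
{
        new->prev = old->prev;
        new->next = old->next;
        new->prev->next = new;
        if (new->next)
                new->next->prev = new;
        old->prev = old->next = NULL;
}

int main(void)
{
        struct node head = { 0 }, a = { 0 }, b = { 0 };

        head.next = &a;                 /* head <-> a                     */
        a.prev = &head;
        a.pid = 42;
        b.pid = 42;                     /* the "new" task keeps the pid   */

        node_replace(&a, &b);

        printf("head->next is %s, pid %d\n",
               head.next == &b ? "the replacement" : "still the old node",
               head.next->pid);
        return 0;
}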
255struct task_struct * fastcall pid_task(struct pid *pid, enum pid_type type) 261struct task_struct * fastcall pid_task(struct pid *pid, enum pid_type type)
256{ 262{
257 struct task_struct *result = NULL; 263 struct task_struct *result = NULL;
@@ -267,7 +273,7 @@ struct task_struct * fastcall pid_task(struct pid *pid, enum pid_type type)
267/* 273/*
268 * Must be called under rcu_read_lock() or with tasklist_lock read-held. 274 * Must be called under rcu_read_lock() or with tasklist_lock read-held.
269 */ 275 */
270task_t *find_task_by_pid_type(int type, int nr) 276struct task_struct *find_task_by_pid_type(int type, int nr)
271{ 277{
272 return pid_task(find_pid(nr), type); 278 return pid_task(find_pid(nr), type);
273} 279}
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index d38d9ec3276c..479b16b44f79 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -1393,25 +1393,13 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
1393 } 1393 }
1394} 1394}
1395 1395
1396static long posix_cpu_clock_nanosleep_restart(struct restart_block *); 1396static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
1397 1397 struct timespec *rqtp, struct itimerspec *it)
1398int posix_cpu_nsleep(const clockid_t which_clock, int flags,
1399 struct timespec *rqtp, struct timespec __user *rmtp)
1400{ 1398{
1401 struct restart_block *restart_block =
1402 &current_thread_info()->restart_block;
1403 struct k_itimer timer; 1399 struct k_itimer timer;
1404 int error; 1400 int error;
1405 1401
1406 /* 1402 /*
1407 * Diagnose required errors first.
1408 */
1409 if (CPUCLOCK_PERTHREAD(which_clock) &&
1410 (CPUCLOCK_PID(which_clock) == 0 ||
1411 CPUCLOCK_PID(which_clock) == current->pid))
1412 return -EINVAL;
1413
1414 /*
1415 * Set up a temporary timer and then wait for it to go off. 1403 * Set up a temporary timer and then wait for it to go off.
1416 */ 1404 */
1417 memset(&timer, 0, sizeof timer); 1405 memset(&timer, 0, sizeof timer);
@@ -1422,11 +1410,12 @@ int posix_cpu_nsleep(const clockid_t which_clock, int flags,
1422 timer.it_process = current; 1410 timer.it_process = current;
1423 if (!error) { 1411 if (!error) {
1424 static struct itimerspec zero_it; 1412 static struct itimerspec zero_it;
1425 struct itimerspec it = { .it_value = *rqtp, 1413
1426 .it_interval = {} }; 1414 memset(it, 0, sizeof *it);
1415 it->it_value = *rqtp;
1427 1416
1428 spin_lock_irq(&timer.it_lock); 1417 spin_lock_irq(&timer.it_lock);
1429 error = posix_cpu_timer_set(&timer, flags, &it, NULL); 1418 error = posix_cpu_timer_set(&timer, flags, it, NULL);
1430 if (error) { 1419 if (error) {
1431 spin_unlock_irq(&timer.it_lock); 1420 spin_unlock_irq(&timer.it_lock);
1432 return error; 1421 return error;
@@ -1454,49 +1443,89 @@ int posix_cpu_nsleep(const clockid_t which_clock, int flags,
1454 * We were interrupted by a signal. 1443 * We were interrupted by a signal.
1455 */ 1444 */
1456 sample_to_timespec(which_clock, timer.it.cpu.expires, rqtp); 1445 sample_to_timespec(which_clock, timer.it.cpu.expires, rqtp);
1457 posix_cpu_timer_set(&timer, 0, &zero_it, &it); 1446 posix_cpu_timer_set(&timer, 0, &zero_it, it);
1458 spin_unlock_irq(&timer.it_lock); 1447 spin_unlock_irq(&timer.it_lock);
1459 1448
1460 if ((it.it_value.tv_sec | it.it_value.tv_nsec) == 0) { 1449 if ((it->it_value.tv_sec | it->it_value.tv_nsec) == 0) {
1461 /* 1450 /*
1462 * It actually did fire already. 1451 * It actually did fire already.
1463 */ 1452 */
1464 return 0; 1453 return 0;
1465 } 1454 }
1466 1455
1456 error = -ERESTART_RESTARTBLOCK;
1457 }
1458
1459 return error;
1460}
1461
1462int posix_cpu_nsleep(const clockid_t which_clock, int flags,
1463 struct timespec *rqtp, struct timespec __user *rmtp)
1464{
1465 struct restart_block *restart_block =
1466 &current_thread_info()->restart_block;
1467 struct itimerspec it;
1468 int error;
1469
1470 /*
1471 * Diagnose required errors first.
1472 */
1473 if (CPUCLOCK_PERTHREAD(which_clock) &&
1474 (CPUCLOCK_PID(which_clock) == 0 ||
1475 CPUCLOCK_PID(which_clock) == current->pid))
1476 return -EINVAL;
1477
1478 error = do_cpu_nanosleep(which_clock, flags, rqtp, &it);
1479
1480 if (error == -ERESTART_RESTARTBLOCK) {
1481
1482 if (flags & TIMER_ABSTIME)
1483 return -ERESTARTNOHAND;
1467 /* 1484 /*
1468 * Report back to the user the time still remaining. 1485 * Report back to the user the time still remaining.
1469 */ 1486 */
1470 if (rmtp != NULL && !(flags & TIMER_ABSTIME) && 1487 if (rmtp != NULL && copy_to_user(rmtp, &it.it_value, sizeof *rmtp))
1471 copy_to_user(rmtp, &it.it_value, sizeof *rmtp))
1472 return -EFAULT; 1488 return -EFAULT;
1473 1489
1474 restart_block->fn = posix_cpu_clock_nanosleep_restart; 1490 restart_block->fn = posix_cpu_nsleep_restart;
1475 /* Caller already set restart_block->arg1 */
1476 restart_block->arg0 = which_clock; 1491 restart_block->arg0 = which_clock;
1477 restart_block->arg1 = (unsigned long) rmtp; 1492 restart_block->arg1 = (unsigned long) rmtp;
1478 restart_block->arg2 = rqtp->tv_sec; 1493 restart_block->arg2 = rqtp->tv_sec;
1479 restart_block->arg3 = rqtp->tv_nsec; 1494 restart_block->arg3 = rqtp->tv_nsec;
1480
1481 error = -ERESTART_RESTARTBLOCK;
1482 } 1495 }
1483
1484 return error; 1496 return error;
1485} 1497}
1486 1498
1487static long 1499long posix_cpu_nsleep_restart(struct restart_block *restart_block)
1488posix_cpu_clock_nanosleep_restart(struct restart_block *restart_block)
1489{ 1500{
1490 clockid_t which_clock = restart_block->arg0; 1501 clockid_t which_clock = restart_block->arg0;
1491 struct timespec __user *rmtp; 1502 struct timespec __user *rmtp;
1492 struct timespec t; 1503 struct timespec t;
1504 struct itimerspec it;
1505 int error;
1493 1506
1494 rmtp = (struct timespec __user *) restart_block->arg1; 1507 rmtp = (struct timespec __user *) restart_block->arg1;
1495 t.tv_sec = restart_block->arg2; 1508 t.tv_sec = restart_block->arg2;
1496 t.tv_nsec = restart_block->arg3; 1509 t.tv_nsec = restart_block->arg3;
1497 1510
1498 restart_block->fn = do_no_restart_syscall; 1511 restart_block->fn = do_no_restart_syscall;
1499 return posix_cpu_nsleep(which_clock, TIMER_ABSTIME, &t, rmtp); 1512 error = do_cpu_nanosleep(which_clock, TIMER_ABSTIME, &t, &it);
1513
1514 if (error == -ERESTART_RESTARTBLOCK) {
1515 /*
1516 * Report back to the user the time still remaining.
1517 */
1518 if (rmtp != NULL && copy_to_user(rmtp, &it.it_value, sizeof *rmtp))
1519 return -EFAULT;
1520
1521 restart_block->fn = posix_cpu_nsleep_restart;
1522 restart_block->arg0 = which_clock;
1523 restart_block->arg1 = (unsigned long) rmtp;
1524 restart_block->arg2 = t.tv_sec;
1525 restart_block->arg3 = t.tv_nsec;
1526 }
1527 return error;
1528
1500} 1529}
1501 1530
1502 1531
@@ -1524,6 +1553,10 @@ static int process_cpu_nsleep(const clockid_t which_clock, int flags,
1524{ 1553{
1525 return posix_cpu_nsleep(PROCESS_CLOCK, flags, rqtp, rmtp); 1554 return posix_cpu_nsleep(PROCESS_CLOCK, flags, rqtp, rmtp);
1526} 1555}
1556static long process_cpu_nsleep_restart(struct restart_block *restart_block)
1557{
1558 return -EINVAL;
1559}
1527static int thread_cpu_clock_getres(const clockid_t which_clock, 1560static int thread_cpu_clock_getres(const clockid_t which_clock,
1528 struct timespec *tp) 1561 struct timespec *tp)
1529{ 1562{
@@ -1544,6 +1577,10 @@ static int thread_cpu_nsleep(const clockid_t which_clock, int flags,
1544{ 1577{
1545 return -EINVAL; 1578 return -EINVAL;
1546} 1579}
1580static long thread_cpu_nsleep_restart(struct restart_block *restart_block)
1581{
1582 return -EINVAL;
1583}
1547 1584
1548static __init int init_posix_cpu_timers(void) 1585static __init int init_posix_cpu_timers(void)
1549{ 1586{
@@ -1553,6 +1590,7 @@ static __init int init_posix_cpu_timers(void)
1553 .clock_set = do_posix_clock_nosettime, 1590 .clock_set = do_posix_clock_nosettime,
1554 .timer_create = process_cpu_timer_create, 1591 .timer_create = process_cpu_timer_create,
1555 .nsleep = process_cpu_nsleep, 1592 .nsleep = process_cpu_nsleep,
1593 .nsleep_restart = process_cpu_nsleep_restart,
1556 }; 1594 };
1557 struct k_clock thread = { 1595 struct k_clock thread = {
1558 .clock_getres = thread_cpu_clock_getres, 1596 .clock_getres = thread_cpu_clock_getres,
@@ -1560,6 +1598,7 @@ static __init int init_posix_cpu_timers(void)
1560 .clock_set = do_posix_clock_nosettime, 1598 .clock_set = do_posix_clock_nosettime,
1561 .timer_create = thread_cpu_timer_create, 1599 .timer_create = thread_cpu_timer_create,
1562 .nsleep = thread_cpu_nsleep, 1600 .nsleep = thread_cpu_nsleep,
1601 .nsleep_restart = thread_cpu_nsleep_restart,
1563 }; 1602 };
1564 1603
1565 register_posix_clock(CLOCK_PROCESS_CPUTIME_ID, &process); 1604 register_posix_clock(CLOCK_PROCESS_CPUTIME_ID, &process);
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index ac6dc8744429..e5ebcc1ec3a0 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -973,3 +973,24 @@ sys_clock_nanosleep(const clockid_t which_clock, int flags,
973 return CLOCK_DISPATCH(which_clock, nsleep, 973 return CLOCK_DISPATCH(which_clock, nsleep,
974 (which_clock, flags, &t, rmtp)); 974 (which_clock, flags, &t, rmtp));
975} 975}
976
977/*
978 * nanosleep_restart for monotonic and realtime clocks
979 */
980static int common_nsleep_restart(struct restart_block *restart_block)
981{
982 return hrtimer_nanosleep_restart(restart_block);
983}
984
985/*
986 * This will restart clock_nanosleep. This is required only by
987 * compat_clock_nanosleep_restart for now.
988 */
989long
990clock_nanosleep_restart(struct restart_block *restart_block)
991{
992 clockid_t which_clock = restart_block->arg0;
993
994 return CLOCK_DISPATCH(which_clock, nsleep_restart,
995 (restart_block));
996}
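The restart plumbing above exists so that a clock_nanosleep() interrupted by a signal can be transparently resumed; for an absolute sleep the kernel returns -ERESTARTNOHAND and simply re-runs the call with the same deadline. The userspace-visible analogue is the classic retry-on-EINTR loop around an absolute deadline, sketched below; CLOCK_MONOTONIC and the two-second deadline are arbitrary choices for the demo.

#include <errno.h>
#include <stdio.h>
#include <time.h>

int main(void)
{
        struct timespec deadline;
        int err;

        clock_gettime(CLOCK_MONOTONIC, &deadline);
        deadline.tv_sec += 2;                   /* sleep until now + 2 s    */

        do {
                err = clock_nanosleep(CLOCK_MONOTONIC, TIMER_ABSTIME,
                                      &deadline, NULL);
        } while (err == EINTR);                 /* signal: just restart,    */
                                                /* no remainder to track    */
        if (err)
                fprintf(stderr, "clock_nanosleep: %d\n", err);
        else
                printf("deadline reached\n");
        return 0;
}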
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index ce0dfb8f4a4e..825068ca3479 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -36,9 +36,49 @@ config PM_DEBUG
36 code. This is helpful when debugging and reporting various PM bugs, 36 code. This is helpful when debugging and reporting various PM bugs,
37 like suspend support. 37 like suspend support.
38 38
39config DISABLE_CONSOLE_SUSPEND
40 bool "Keep console(s) enabled during suspend/resume (DANGEROUS)"
41 depends on PM && PM_DEBUG
42 default n
43 ---help---
44 This option turns off the console suspend mechanism that prevents
45 debug messages from reaching the console during the suspend/resume
46 operations. This may be helpful when debugging device drivers'
47 suspend/resume routines, but may itself lead to problems, for example
48 if netconsole is used.
49
50config PM_TRACE
51 bool "Suspend/resume event tracing"
52 depends on PM && PM_DEBUG && X86_32 && EXPERIMENTAL
53 default n
54 ---help---
55 This enables some cheesy code to save the last PM event point in the
56 RTC across reboots, so that you can debug a machine that just hangs
57 during suspend (or more commonly, during resume).
58
59 To use this debugging feature you should attempt to suspend the machine,
60 then reboot it, then run
61
62 dmesg -s 1000000 | grep 'hash matches'
63
64 CAUTION: this option will cause your machine's real-time clock to be
65 set to an invalid time after a resume.
66
67config PM_SYSFS_DEPRECATED
68 bool "Driver model /sys/devices/.../power/state files (DEPRECATED)"
69 depends on PM && SYSFS
70 default n
71 help
72 The driver model started out with a sysfs file intended to provide
73 a userspace hook for device power management. This feature has never
74 worked very well, except for limited testing purposes, and so it will
75 be removed. It's not clear that a generic mechanism could really
76 handle the wide variability of device power states; any replacements
77 are likely to be bus or driver specific.
78
39config SOFTWARE_SUSPEND 79config SOFTWARE_SUSPEND
40 bool "Software Suspend" 80 bool "Software Suspend"
41 depends on PM && SWAP && (X86 && (!SMP || SUSPEND_SMP)) || ((FRV || PPC32) && !SMP) 81 depends on PM && SWAP && ((X86 && (!SMP || SUSPEND_SMP) && !X86_PAE) || ((FRV || PPC32) && !SMP))
42 ---help--- 82 ---help---
43 Enable the possibility of suspending the machine. 83 Enable the possibility of suspending the machine.
44 It doesn't need ACPI or APM. 84 It doesn't need ACPI or APM.
@@ -60,6 +100,10 @@ config SOFTWARE_SUSPEND
60 100
61 For more information take a look at <file:Documentation/power/swsusp.txt>. 101 For more information take a look at <file:Documentation/power/swsusp.txt>.
62 102
103 (For now, swsusp is incompatible with PAE aka HIGHMEM_64G on i386.
104 we need identity mapping for resume to work, and that is trivial
105 to get with 4MB pages, but less than trivial on PAE).
106
63config PM_STD_PARTITION 107config PM_STD_PARTITION
64 string "Default resume partition" 108 string "Default resume partition"
65 depends on SOFTWARE_SUSPEND 109 depends on SOFTWARE_SUSPEND
@@ -82,18 +126,6 @@ config PM_STD_PARTITION
82 suspended image to. It will simply pick the first available swap 126 suspended image to. It will simply pick the first available swap
83 device. 127 device.
84 128
85config SWSUSP_ENCRYPT
86 bool "Encrypt suspend image"
87 depends on SOFTWARE_SUSPEND && CRYPTO=y && (CRYPTO_AES=y || CRYPTO_AES_586=y || CRYPTO_AES_X86_64=y)
88 default ""
89 ---help---
90 To prevent data gathering from swap after resume you can encrypt
91 the suspend image with a temporary key that is deleted on
92 resume.
93
94 Note that the temporary key is stored unencrypted on disk while the
95 system is suspended.
96
97config SUSPEND_SMP 129config SUSPEND_SMP
98 bool 130 bool
99 depends on HOTPLUG_CPU && X86 && PM 131 depends on HOTPLUG_CPU && X86 && PM
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index 8d0af3d37a4b..38725f526afc 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -7,6 +7,4 @@ obj-y := main.o process.o console.o
7obj-$(CONFIG_PM_LEGACY) += pm.o 7obj-$(CONFIG_PM_LEGACY) += pm.o
8obj-$(CONFIG_SOFTWARE_SUSPEND) += swsusp.o disk.o snapshot.o swap.o user.o 8obj-$(CONFIG_SOFTWARE_SUSPEND) += swsusp.o disk.o snapshot.o swap.o user.o
9 9
10obj-$(CONFIG_SUSPEND_SMP) += smp.o
11
12obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o 10obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index 81d4d982f3f0..d72234942798 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -18,6 +18,7 @@
18#include <linux/fs.h> 18#include <linux/fs.h>
19#include <linux/mount.h> 19#include <linux/mount.h>
20#include <linux/pm.h> 20#include <linux/pm.h>
21#include <linux/cpu.h>
21 22
22#include "power.h" 23#include "power.h"
23 24
@@ -72,7 +73,10 @@ static int prepare_processes(void)
72 int error; 73 int error;
73 74
74 pm_prepare_console(); 75 pm_prepare_console();
75 disable_nonboot_cpus(); 76
77 error = disable_nonboot_cpus();
78 if (error)
79 goto enable_cpus;
76 80
77 if (freeze_processes()) { 81 if (freeze_processes()) {
78 error = -EBUSY; 82 error = -EBUSY;
@@ -84,6 +88,7 @@ static int prepare_processes(void)
84 return 0; 88 return 0;
85thaw: 89thaw:
86 thaw_processes(); 90 thaw_processes();
91enable_cpus:
87 enable_nonboot_cpus(); 92 enable_nonboot_cpus();
88 pm_restore_console(); 93 pm_restore_console();
89 return error; 94 return error;
@@ -98,7 +103,7 @@ static void unprepare_processes(void)
98} 103}
99 104
100/** 105/**
101 * pm_suspend_disk - The granpappy of power management. 106 * pm_suspend_disk - The granpappy of hibernation power management.
102 * 107 *
103 * If we're going through the firmware, then get it over with quickly. 108 * If we're going through the firmware, then get it over with quickly.
104 * 109 *
@@ -207,7 +212,7 @@ static int software_resume(void)
207 212
208 pr_debug("PM: Preparing devices for restore.\n"); 213 pr_debug("PM: Preparing devices for restore.\n");
209 214
210 if ((error = device_suspend(PMSG_FREEZE))) { 215 if ((error = device_suspend(PMSG_PRETHAW))) {
211 printk("Some devices failed to suspend\n"); 216 printk("Some devices failed to suspend\n");
212 swsusp_free(); 217 swsusp_free();
213 goto Thaw; 218 goto Thaw;
@@ -231,7 +236,7 @@ static int software_resume(void)
231late_initcall(software_resume); 236late_initcall(software_resume);
232 237
233 238
234static char * pm_disk_modes[] = { 239static const char * const pm_disk_modes[] = {
235 [PM_DISK_FIRMWARE] = "firmware", 240 [PM_DISK_FIRMWARE] = "firmware",
236 [PM_DISK_PLATFORM] = "platform", 241 [PM_DISK_PLATFORM] = "platform",
237 [PM_DISK_SHUTDOWN] = "shutdown", 242 [PM_DISK_SHUTDOWN] = "shutdown",
diff --git a/kernel/power/main.c b/kernel/power/main.c
index cdf0f07af92f..873228c71dab 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -16,6 +16,8 @@
16#include <linux/init.h> 16#include <linux/init.h>
17#include <linux/pm.h> 17#include <linux/pm.h>
18#include <linux/console.h> 18#include <linux/console.h>
19#include <linux/cpu.h>
20#include <linux/resume-trace.h>
19 21
20#include "power.h" 22#include "power.h"
21 23
@@ -51,7 +53,7 @@ void pm_set_ops(struct pm_ops * ops)
51 53
52static int suspend_prepare(suspend_state_t state) 54static int suspend_prepare(suspend_state_t state)
53{ 55{
54 int error = 0; 56 int error;
55 unsigned int free_pages; 57 unsigned int free_pages;
56 58
57 if (!pm_ops || !pm_ops->enter) 59 if (!pm_ops || !pm_ops->enter)
@@ -59,12 +61,9 @@ static int suspend_prepare(suspend_state_t state)
59 61
60 pm_prepare_console(); 62 pm_prepare_console();
61 63
62 disable_nonboot_cpus(); 64 error = disable_nonboot_cpus();
63 65 if (error)
64 if (num_online_cpus() != 1) {
65 error = -EPERM;
66 goto Enable_cpu; 66 goto Enable_cpu;
67 }
68 67
69 if (freeze_processes()) { 68 if (freeze_processes()) {
70 error = -EAGAIN; 69 error = -EAGAIN;
@@ -145,7 +144,7 @@ static void suspend_finish(suspend_state_t state)
145 144
146 145
147 146
148static char *pm_states[PM_SUSPEND_MAX] = { 147static const char * const pm_states[PM_SUSPEND_MAX] = {
149 [PM_SUSPEND_STANDBY] = "standby", 148 [PM_SUSPEND_STANDBY] = "standby",
150 [PM_SUSPEND_MEM] = "mem", 149 [PM_SUSPEND_MEM] = "mem",
151#ifdef CONFIG_SOFTWARE_SUSPEND 150#ifdef CONFIG_SOFTWARE_SUSPEND
@@ -262,7 +261,7 @@ static ssize_t state_show(struct subsystem * subsys, char * buf)
262static ssize_t state_store(struct subsystem * subsys, const char * buf, size_t n) 261static ssize_t state_store(struct subsystem * subsys, const char * buf, size_t n)
263{ 262{
264 suspend_state_t state = PM_SUSPEND_STANDBY; 263 suspend_state_t state = PM_SUSPEND_STANDBY;
265 char ** s; 264 const char * const *s;
266 char *p; 265 char *p;
267 int error; 266 int error;
268 int len; 267 int len;
@@ -283,10 +282,39 @@ static ssize_t state_store(struct subsystem * subsys, const char * buf, size_t n
283 282
284power_attr(state); 283power_attr(state);
285 284
285#ifdef CONFIG_PM_TRACE
286int pm_trace_enabled;
287
288static ssize_t pm_trace_show(struct subsystem * subsys, char * buf)
289{
290 return sprintf(buf, "%d\n", pm_trace_enabled);
291}
292
293static ssize_t
294pm_trace_store(struct subsystem * subsys, const char * buf, size_t n)
295{
296 int val;
297
298 if (sscanf(buf, "%d", &val) == 1) {
299 pm_trace_enabled = !!val;
300 return n;
301 }
302 return -EINVAL;
303}
304
305power_attr(pm_trace);
306
307static struct attribute * g[] = {
308 &state_attr.attr,
309 &pm_trace_attr.attr,
310 NULL,
311};
312#else
286static struct attribute * g[] = { 313static struct attribute * g[] = {
287 &state_attr.attr, 314 &state_attr.attr,
288 NULL, 315 NULL,
289}; 316};
317#endif /* CONFIG_PM_TRACE */
290 318
291static struct attribute_group attr_group = { 319static struct attribute_group attr_group = {
292 .attrs = g, 320 .attrs = g,
diff --git a/kernel/power/pm.c b/kernel/power/pm.c
index 84063ac8fcfc..c50d15266c10 100644
--- a/kernel/power/pm.c
+++ b/kernel/power/pm.c
@@ -75,42 +75,6 @@ struct pm_dev *pm_register(pm_dev_t type,
75 return dev; 75 return dev;
76} 76}
77 77
78static void __pm_unregister(struct pm_dev *dev)
79{
80 if (dev) {
81 list_del(&dev->entry);
82 kfree(dev);
83 }
84}
85
86/**
87 * pm_unregister_all - unregister all devices with matching callback
88 * @callback: callback function pointer
89 *
90 * Unregister every device that would call the callback passed. This
91 * is primarily meant as a helper function for loadable modules. It
92 * enables a module to give up all its managed devices without keeping
93 * its own private list.
94 */
95
96void pm_unregister_all(pm_callback callback)
97{
98 struct list_head *entry;
99
100 if (!callback)
101 return;
102
103 mutex_lock(&pm_devs_lock);
104 entry = pm_devs.next;
105 while (entry != &pm_devs) {
106 struct pm_dev *dev = list_entry(entry, struct pm_dev, entry);
107 entry = entry->next;
108 if (dev->callback == callback)
109 __pm_unregister(dev);
110 }
111 mutex_unlock(&pm_devs_lock);
112}
113
114/** 78/**
115 * pm_send - send request to a single device 79 * pm_send - send request to a single device
116 * @dev: device to send to 80 * @dev: device to send to
@@ -239,7 +203,6 @@ int pm_send_all(pm_request_t rqst, void *data)
239} 203}
240 204
241EXPORT_SYMBOL(pm_register); 205EXPORT_SYMBOL(pm_register);
242EXPORT_SYMBOL(pm_unregister_all);
243EXPORT_SYMBOL(pm_send_all); 206EXPORT_SYMBOL(pm_send_all);
244EXPORT_SYMBOL(pm_active); 207EXPORT_SYMBOL(pm_active);
245 208
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 98c41423f3b1..bfe999f7b272 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -38,8 +38,6 @@ extern struct subsystem power_subsys;
38/* References to section boundaries */ 38/* References to section boundaries */
39extern const void __nosave_begin, __nosave_end; 39extern const void __nosave_begin, __nosave_end;
40 40
41extern struct pbe *pagedir_nosave;
42
43/* Preferred image size in bytes (default 500 MB) */ 41/* Preferred image size in bytes (default 500 MB) */
44extern unsigned long image_size; 42extern unsigned long image_size;
45extern int in_suspend; 43extern int in_suspend;
@@ -50,21 +48,62 @@ extern asmlinkage int swsusp_arch_resume(void);
50 48
51extern unsigned int count_data_pages(void); 49extern unsigned int count_data_pages(void);
52 50
51/**
52 * Auxiliary structure used for reading the snapshot image data and
53 * metadata from and writing them to the list of page backup entries
54 * (PBEs) which is the main data structure of swsusp.
55 *
56 * Using struct snapshot_handle we can transfer the image, including its
57 * metadata, as a continuous sequence of bytes with the help of
58 * snapshot_read_next() and snapshot_write_next().
59 *
60 * The code that writes the image to a storage or transfers it to
61 * the user land is required to use snapshot_read_next() for this
62 * purpose and it should not make any assumptions regarding the internal
63 * structure of the image. Similarly, the code that reads the image from
64 * a storage or transfers it from the user land is required to use
65 * snapshot_write_next().
66 *
67 * This may allow us to change the internal structure of the image
68 * in the future with considerably less effort.
69 */
70
53struct snapshot_handle { 71struct snapshot_handle {
54 loff_t offset; 72 loff_t offset; /* number of the last byte ready for reading
55 unsigned int page; 73 * or writing in the sequence
56 unsigned int page_offset; 74 */
57 unsigned int prev; 75 unsigned int cur; /* number of the block of PAGE_SIZE bytes the
58 struct pbe *pbe, *last_pbe; 76 * next operation will refer to (ie. current)
59 void *buffer; 77 */
60 unsigned int buf_offset; 78 unsigned int cur_offset; /* offset with respect to the current
79 * block (for the next operation)
80 */
81 unsigned int prev; /* number of the block of PAGE_SIZE bytes that
82 * was the current one previously
83 */
84 void *buffer; /* address of the block to read from
85 * or write to
86 */
87 unsigned int buf_offset; /* location to read from or write to,
88 * given as a displacement from 'buffer'
89 */
90 int sync_read; /* Set to one to notify the caller of
91 * snapshot_write_next() that it may
92 * need to call wait_on_bio_chain()
93 */
61}; 94};
62 95
96/* This macro returns the address from/to which the caller of
97 * snapshot_read_next()/snapshot_write_next() is allowed to
98 * read/write data after the function returns
99 */
63#define data_of(handle) ((handle).buffer + (handle).buf_offset) 100#define data_of(handle) ((handle).buffer + (handle).buf_offset)
64 101
102extern unsigned int snapshot_additional_pages(struct zone *zone);
65extern int snapshot_read_next(struct snapshot_handle *handle, size_t count); 103extern int snapshot_read_next(struct snapshot_handle *handle, size_t count);
66extern int snapshot_write_next(struct snapshot_handle *handle, size_t count); 104extern int snapshot_write_next(struct snapshot_handle *handle, size_t count);
67int snapshot_image_loaded(struct snapshot_handle *handle); 105extern int snapshot_image_loaded(struct snapshot_handle *handle);
106extern void snapshot_free_unused_memory(struct snapshot_handle *handle);
68 107
69#define SNAPSHOT_IOC_MAGIC '3' 108#define SNAPSHOT_IOC_MAGIC '3'
70#define SNAPSHOT_FREEZE _IO(SNAPSHOT_IOC_MAGIC, 1) 109#define SNAPSHOT_FREEZE _IO(SNAPSHOT_IOC_MAGIC, 1)
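A minimal usage sketch of the calling convention that the snapshot_handle comment above describes (not part of the patch): drain the whole image through snapshot_read_next(), one chunk at a time. save_one_page() is a made-up stand-in for whatever actually stores the data.

static int stream_image_out(struct snapshot_handle *handle)
{
        int ret;

        memset(handle, 0, sizeof(*handle));
        while ((ret = snapshot_read_next(handle, PAGE_SIZE)) > 0) {
                /* data_of(*handle) is valid for exactly 'ret' bytes here */
                ret = save_one_page(data_of(*handle), ret);
                if (ret)
                        break;
        }
        return ret;     /* 0 once the image has been read completely */
}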
@@ -105,10 +144,6 @@ extern struct bitmap_page *alloc_bitmap(unsigned int nr_bits);
105extern unsigned long alloc_swap_page(int swap, struct bitmap_page *bitmap); 144extern unsigned long alloc_swap_page(int swap, struct bitmap_page *bitmap);
106extern void free_all_swap_pages(int swap, struct bitmap_page *bitmap); 145extern void free_all_swap_pages(int swap, struct bitmap_page *bitmap);
107 146
108extern unsigned int count_special_pages(void);
109extern int save_special_mem(void);
110extern int restore_special_mem(void);
111
112extern int swsusp_check(void); 147extern int swsusp_check(void);
113extern int swsusp_shrink_memory(void); 148extern int swsusp_shrink_memory(void);
114extern void swsusp_free(void); 149extern void swsusp_free(void);
diff --git a/kernel/power/process.c b/kernel/power/process.c
index b2a5f671d6cd..72e72d2c61e6 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -66,13 +66,25 @@ static inline void freeze_process(struct task_struct *p)
66 } 66 }
67} 67}
68 68
69static void cancel_freezing(struct task_struct *p)
70{
71 unsigned long flags;
72
73 if (freezing(p)) {
74 pr_debug(" clean up: %s\n", p->comm);
75 do_not_freeze(p);
76 spin_lock_irqsave(&p->sighand->siglock, flags);
77 recalc_sigpending_tsk(p);
78 spin_unlock_irqrestore(&p->sighand->siglock, flags);
79 }
80}
81
69/* 0 = success, else # of processes that we failed to stop */ 82/* 0 = success, else # of processes that we failed to stop */
70int freeze_processes(void) 83int freeze_processes(void)
71{ 84{
72 int todo, nr_user, user_frozen; 85 int todo, nr_user, user_frozen;
73 unsigned long start_time; 86 unsigned long start_time;
74 struct task_struct *g, *p; 87 struct task_struct *g, *p;
75 unsigned long flags;
76 88
77 printk( "Stopping tasks: " ); 89 printk( "Stopping tasks: " );
78 start_time = jiffies; 90 start_time = jiffies;
@@ -85,6 +97,10 @@ int freeze_processes(void)
85 continue; 97 continue;
86 if (frozen(p)) 98 if (frozen(p))
87 continue; 99 continue;
100 if (p->state == TASK_TRACED && frozen(p->parent)) {
101 cancel_freezing(p);
102 continue;
103 }
88 if (p->mm && !(p->flags & PF_BORROWED_MM)) { 104 if (p->mm && !(p->flags & PF_BORROWED_MM)) {
89 /* The task is a user-space one. 105 /* The task is a user-space one.
90 * Freeze it unless there's a vfork completion 106 * Freeze it unless there's a vfork completion
@@ -126,13 +142,7 @@ int freeze_processes(void)
126 do_each_thread(g, p) { 142 do_each_thread(g, p) {
127 if (freezeable(p) && !frozen(p)) 143 if (freezeable(p) && !frozen(p))
128 printk(KERN_ERR " %s\n", p->comm); 144 printk(KERN_ERR " %s\n", p->comm);
129 if (freezing(p)) { 145 cancel_freezing(p);
130 pr_debug(" clean up: %s\n", p->comm);
131 p->flags &= ~PF_FREEZE;
132 spin_lock_irqsave(&p->sighand->siglock, flags);
133 recalc_sigpending_tsk(p);
134 spin_unlock_irqrestore(&p->sighand->siglock, flags);
135 }
136 } while_each_thread(g, p); 146 } while_each_thread(g, p);
137 read_unlock(&tasklist_lock); 147 read_unlock(&tasklist_lock);
138 return todo; 148 return todo;
diff --git a/kernel/power/smp.c b/kernel/power/smp.c
deleted file mode 100644
index 5957312b2d68..000000000000
--- a/kernel/power/smp.c
+++ /dev/null
@@ -1,62 +0,0 @@
1/*
2 * drivers/power/smp.c - Functions for stopping other CPUs.
3 *
4 * Copyright 2004 Pavel Machek <pavel@suse.cz>
5 * Copyright (C) 2002-2003 Nigel Cunningham <ncunningham@clear.net.nz>
6 *
7 * This file is released under the GPLv2.
8 */
9
10#undef DEBUG
11
12#include <linux/smp_lock.h>
13#include <linux/interrupt.h>
14#include <linux/suspend.h>
15#include <linux/module.h>
16#include <linux/cpu.h>
17#include <asm/atomic.h>
18#include <asm/tlbflush.h>
19
20/* This is protected by pm_sem semaphore */
21static cpumask_t frozen_cpus;
22
23void disable_nonboot_cpus(void)
24{
25 int cpu, error;
26
27 error = 0;
28 cpus_clear(frozen_cpus);
29 printk("Freezing cpus ...\n");
30 for_each_online_cpu(cpu) {
31 if (cpu == 0)
32 continue;
33 error = cpu_down(cpu);
34 if (!error) {
35 cpu_set(cpu, frozen_cpus);
36 printk("CPU%d is down\n", cpu);
37 continue;
38 }
39 printk("Error taking cpu %d down: %d\n", cpu, error);
40 }
41 BUG_ON(raw_smp_processor_id() != 0);
42 if (error)
43 panic("cpus not sleeping");
44}
45
46void enable_nonboot_cpus(void)
47{
48 int cpu, error;
49
50 printk("Thawing cpus ...\n");
51 for_each_cpu_mask(cpu, frozen_cpus) {
52 error = cpu_up(cpu);
53 if (!error) {
54 printk("CPU%d is up\n", cpu);
55 continue;
56 }
57 printk("Error taking cpu %d up: %d\n", cpu, error);
58 panic("Not enough cpus");
59 }
60 cpus_clear(frozen_cpus);
61}
62
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 3d9284100b22..1b84313cbab5 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -34,95 +34,15 @@
34 34
35#include "power.h" 35#include "power.h"
36 36
37struct pbe *pagedir_nosave; 37/* List of PBEs used for creating and restoring the suspend image */
38struct pbe *restore_pblist;
39
38static unsigned int nr_copy_pages; 40static unsigned int nr_copy_pages;
39static unsigned int nr_meta_pages; 41static unsigned int nr_meta_pages;
40static unsigned long *buffer; 42static void *buffer;
41
42struct arch_saveable_page {
43 unsigned long start;
44 unsigned long end;
45 char *data;
46 struct arch_saveable_page *next;
47};
48static struct arch_saveable_page *arch_pages;
49
50int swsusp_add_arch_pages(unsigned long start, unsigned long end)
51{
52 struct arch_saveable_page *tmp;
53
54 while (start < end) {
55 tmp = kzalloc(sizeof(struct arch_saveable_page), GFP_KERNEL);
56 if (!tmp)
57 return -ENOMEM;
58 tmp->start = start;
59 tmp->end = ((start >> PAGE_SHIFT) + 1) << PAGE_SHIFT;
60 if (tmp->end > end)
61 tmp->end = end;
62 tmp->next = arch_pages;
63 start = tmp->end;
64 arch_pages = tmp;
65 }
66 return 0;
67}
68
69static unsigned int count_arch_pages(void)
70{
71 unsigned int count = 0;
72 struct arch_saveable_page *tmp = arch_pages;
73 while (tmp) {
74 count++;
75 tmp = tmp->next;
76 }
77 return count;
78}
79
80static int save_arch_mem(void)
81{
82 char *kaddr;
83 struct arch_saveable_page *tmp = arch_pages;
84 int offset;
85
86 pr_debug("swsusp: Saving arch specific memory");
87 while (tmp) {
88 tmp->data = (char *)__get_free_page(GFP_ATOMIC);
89 if (!tmp->data)
90 return -ENOMEM;
91 offset = tmp->start - (tmp->start & PAGE_MASK);
92 /* arch pages might haven't a 'struct page' */
93 kaddr = kmap_atomic_pfn(tmp->start >> PAGE_SHIFT, KM_USER0);
94 memcpy(tmp->data + offset, kaddr + offset,
95 tmp->end - tmp->start);
96 kunmap_atomic(kaddr, KM_USER0);
97
98 tmp = tmp->next;
99 }
100 return 0;
101}
102
103static int restore_arch_mem(void)
104{
105 char *kaddr;
106 struct arch_saveable_page *tmp = arch_pages;
107 int offset;
108
109 while (tmp) {
110 if (!tmp->data)
111 continue;
112 offset = tmp->start - (tmp->start & PAGE_MASK);
113 kaddr = kmap_atomic_pfn(tmp->start >> PAGE_SHIFT, KM_USER0);
114 memcpy(kaddr + offset, tmp->data + offset,
115 tmp->end - tmp->start);
116 kunmap_atomic(kaddr, KM_USER0);
117 free_page((long)tmp->data);
118 tmp->data = NULL;
119 tmp = tmp->next;
120 }
121 return 0;
122}
123 43
124#ifdef CONFIG_HIGHMEM 44#ifdef CONFIG_HIGHMEM
125static unsigned int count_highmem_pages(void) 45unsigned int count_highmem_pages(void)
126{ 46{
127 struct zone *zone; 47 struct zone *zone;
128 unsigned long zone_pfn; 48 unsigned long zone_pfn;
@@ -199,7 +119,7 @@ static int save_highmem_zone(struct zone *zone)
199 return 0; 119 return 0;
200} 120}
201 121
202static int save_highmem(void) 122int save_highmem(void)
203{ 123{
204 struct zone *zone; 124 struct zone *zone;
205 int res = 0; 125 int res = 0;
@@ -216,7 +136,7 @@ static int save_highmem(void)
216 return 0; 136 return 0;
217} 137}
218 138
219static int restore_highmem(void) 139int restore_highmem(void)
220{ 140{
221 printk("swsusp: Restoring Highmem\n"); 141 printk("swsusp: Restoring Highmem\n");
222 while (highmem_copy) { 142 while (highmem_copy) {
@@ -238,256 +158,637 @@ static inline int save_highmem(void) {return 0;}
238static inline int restore_highmem(void) {return 0;} 158static inline int restore_highmem(void) {return 0;}
239#endif 159#endif
240 160
241unsigned int count_special_pages(void) 161/**
162 * @safe_needed - on resume, for storing the PBE list and the image,
163 * we can only use memory pages that do not conflict with the pages
164 * used before suspend.
165 *
166 * The unsafe pages are marked with the PG_nosave_free flag
 167 * and we count them using allocated_unsafe_pages
168 */
169
170#define PG_ANY 0
171#define PG_SAFE 1
172#define PG_UNSAFE_CLEAR 1
173#define PG_UNSAFE_KEEP 0
174
175static unsigned int allocated_unsafe_pages;
176
177static void *alloc_image_page(gfp_t gfp_mask, int safe_needed)
242{ 178{
243 return count_arch_pages() + count_highmem_pages(); 179 void *res;
180
181 res = (void *)get_zeroed_page(gfp_mask);
182 if (safe_needed)
183 while (res && PageNosaveFree(virt_to_page(res))) {
184 /* The page is unsafe, mark it for swsusp_free() */
185 SetPageNosave(virt_to_page(res));
186 allocated_unsafe_pages++;
187 res = (void *)get_zeroed_page(gfp_mask);
188 }
189 if (res) {
190 SetPageNosave(virt_to_page(res));
191 SetPageNosaveFree(virt_to_page(res));
192 }
193 return res;
244} 194}
245 195
246int save_special_mem(void) 196unsigned long get_safe_page(gfp_t gfp_mask)
247{ 197{
248 int ret; 198 return (unsigned long)alloc_image_page(gfp_mask, PG_SAFE);
249 ret = save_arch_mem(); 199}
250 if (!ret) 200
251 ret = save_highmem(); 201/**
252 return ret; 202 * free_image_page - free page represented by @addr, allocated with
203 * alloc_image_page (page flags set by it must be cleared)
204 */
205
206static inline void free_image_page(void *addr, int clear_nosave_free)
207{
208 ClearPageNosave(virt_to_page(addr));
209 if (clear_nosave_free)
210 ClearPageNosaveFree(virt_to_page(addr));
211 free_page((unsigned long)addr);
212}
213
214/* struct linked_page is used to build chains of pages */
215
216#define LINKED_PAGE_DATA_SIZE (PAGE_SIZE - sizeof(void *))
217
218struct linked_page {
219 struct linked_page *next;
220 char data[LINKED_PAGE_DATA_SIZE];
221} __attribute__((packed));
222
223static inline void
224free_list_of_pages(struct linked_page *list, int clear_page_nosave)
225{
226 while (list) {
227 struct linked_page *lp = list->next;
228
229 free_image_page(list, clear_page_nosave);
230 list = lp;
231 }
232}
233
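A hedged illustration of the page allocation helpers above (the wrapper itself is hypothetical): on resume, get_safe_page() returns a zeroed page that is guaranteed not to collide with the image, and free_image_page() must be told to clear the PG_nosave* flags that were set on it.

static int use_one_safe_page(void)
{
        void *p = (void *)get_safe_page(GFP_ATOMIC);

        if (!p)
                return -ENOMEM;
        /* ... the page can be used freely while the image is restored ... */
        free_image_page(p, PG_UNSAFE_CLEAR);    /* clear PG_nosave* and free */
        return 0;
}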
234/**
235 * struct chain_allocator is used for allocating small objects out of
236 * a linked list of pages called 'the chain'.
237 *
238 * The chain grows each time when there is no room for a new object in
239 * the current page. The allocated objects cannot be freed individually.
240 * It is only possible to free them all at once, by freeing the entire
241 * chain.
242 *
243 * NOTE: The chain allocator may be inefficient if the allocated objects
244 * are not much smaller than PAGE_SIZE.
245 */
246
247struct chain_allocator {
248 struct linked_page *chain; /* the chain */
249 unsigned int used_space; /* total size of objects allocated out
250 * of the current page
251 */
252 gfp_t gfp_mask; /* mask for allocating pages */
253 int safe_needed; /* if set, only "safe" pages are allocated */
254};
255
256static void
257chain_init(struct chain_allocator *ca, gfp_t gfp_mask, int safe_needed)
258{
259 ca->chain = NULL;
260 ca->used_space = LINKED_PAGE_DATA_SIZE;
261 ca->gfp_mask = gfp_mask;
262 ca->safe_needed = safe_needed;
253} 263}
254 264
255int restore_special_mem(void) 265static void *chain_alloc(struct chain_allocator *ca, unsigned int size)
256{ 266{
257 int ret; 267 void *ret;
258 ret = restore_arch_mem(); 268
259 if (!ret) 269 if (LINKED_PAGE_DATA_SIZE - ca->used_space < size) {
260 ret = restore_highmem(); 270 struct linked_page *lp;
271
272 lp = alloc_image_page(ca->gfp_mask, ca->safe_needed);
273 if (!lp)
274 return NULL;
275
276 lp->next = ca->chain;
277 ca->chain = lp;
278 ca->used_space = 0;
279 }
280 ret = ca->chain->data + ca->used_space;
281 ca->used_space += size;
261 return ret; 282 return ret;
262} 283}
263 284
264static int pfn_is_nosave(unsigned long pfn) 285static void chain_free(struct chain_allocator *ca, int clear_page_nosave)
265{ 286{
266 unsigned long nosave_begin_pfn = __pa(&__nosave_begin) >> PAGE_SHIFT; 287 free_list_of_pages(ca->chain, clear_page_nosave);
267 unsigned long nosave_end_pfn = PAGE_ALIGN(__pa(&__nosave_end)) >> PAGE_SHIFT; 288 memset(ca, 0, sizeof(struct chain_allocator));
268 return (pfn >= nosave_begin_pfn) && (pfn < nosave_end_pfn);
269} 289}
270 290
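A hypothetical usage sketch of the chain allocator defined above: carve a number of small struct pbe objects (from <linux/suspend.h>) out of a chain of safe pages and release them all at once, since individual objects cannot be freed.

static int demo_chain_alloc(void)
{
        struct chain_allocator ca;
        struct pbe *p;
        int i;

        chain_init(&ca, GFP_ATOMIC, PG_SAFE);
        for (i = 0; i < 16; i++) {
                p = chain_alloc(&ca, sizeof(struct pbe));
                if (!p) {
                        chain_free(&ca, PG_UNSAFE_CLEAR);
                        return -ENOMEM;
                }
                p->orig_address = 0;
        }
        /* only the whole chain can be dropped */
        chain_free(&ca, PG_UNSAFE_CLEAR);
        return 0;
}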
271/** 291/**
272 * saveable - Determine whether a page should be cloned or not. 292 * Data types related to memory bitmaps.
273 * @pfn: The page 293 *
 294 * Memory bitmap is a structure consisting of many linked lists of
 295 * objects. The main list's elements are of type struct zone_bitmap
 296 * and each of them corresponds to one zone. For each zone bitmap
 297 * object there is a list of objects of type struct bm_block, each of
 298 * which represents a block of bit chunks in which information is
 299 * stored.
300 *
301 * struct memory_bitmap contains a pointer to the main list of zone
302 * bitmap objects, a struct bm_position used for browsing the bitmap,
303 * and a pointer to the list of pages used for allocating all of the
304 * zone bitmap objects and bitmap block objects.
305 *
306 * NOTE: It has to be possible to lay out the bitmap in memory
307 * using only allocations of order 0. Additionally, the bitmap is
308 * designed to work with arbitrary number of zones (this is over the
309 * top for now, but let's avoid making unnecessary assumptions ;-).
274 * 310 *
275 * We save a page if it's Reserved, and not in the range of pages 311 * struct zone_bitmap contains a pointer to a list of bitmap block
276 * statically defined as 'unsaveable', or if it isn't reserved, and 312 * objects and a pointer to the bitmap block object that has been
277 * isn't part of a free chunk of pages. 313 * most recently used for setting bits. Additionally, it contains the
314 * pfns that correspond to the start and end of the represented zone.
315 *
316 * struct bm_block contains a pointer to the memory page in which
317 * information is stored (in the form of a block of bit chunks
318 * of type unsigned long each). It also contains the pfns that
319 * correspond to the start and end of the represented memory area and
320 * the number of bit chunks in the block.
321 *
322 * NOTE: Memory bitmaps are used for two types of operations only:
323 * "set a bit" and "find the next bit set". Moreover, the searching
324 * is always carried out after all of the "set a bit" operations
 325 * on a given bitmap.
278 */ 326 */
279 327
280static int saveable(struct zone *zone, unsigned long *zone_pfn) 328#define BM_END_OF_MAP (~0UL)
329
330#define BM_CHUNKS_PER_BLOCK (PAGE_SIZE / sizeof(long))
331#define BM_BITS_PER_CHUNK (sizeof(long) << 3)
332#define BM_BITS_PER_BLOCK (PAGE_SIZE << 3)
333
334struct bm_block {
335 struct bm_block *next; /* next element of the list */
336 unsigned long start_pfn; /* pfn represented by the first bit */
337 unsigned long end_pfn; /* pfn represented by the last bit plus 1 */
338 unsigned int size; /* number of bit chunks */
339 unsigned long *data; /* chunks of bits representing pages */
340};
341
342struct zone_bitmap {
343 struct zone_bitmap *next; /* next element of the list */
344 unsigned long start_pfn; /* minimal pfn in this zone */
345 unsigned long end_pfn; /* maximal pfn in this zone plus 1 */
346 struct bm_block *bm_blocks; /* list of bitmap blocks */
347 struct bm_block *cur_block; /* recently used bitmap block */
348};
349
 350/* struct bm_position is used for browsing memory bitmaps */
351
352struct bm_position {
353 struct zone_bitmap *zone_bm;
354 struct bm_block *block;
355 int chunk;
356 int bit;
357};
358
359struct memory_bitmap {
360 struct zone_bitmap *zone_bm_list; /* list of zone bitmaps */
361 struct linked_page *p_list; /* list of pages used to store zone
362 * bitmap objects and bitmap block
363 * objects
364 */
365 struct bm_position cur; /* most recently used bit position */
366};
367
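An illustrative helper (not in the patch) showing how a pfn is located inside a bm_block, mirroring the arithmetic memory_bm_set_bit() uses further down; on a 64-bit configuration BM_BITS_PER_CHUNK is 64, so a pfn 200 pages into a block lands in chunk 3, bit 8.

static inline void bm_locate_bit(struct bm_block *bb, unsigned long pfn,
                                 unsigned int *chunk, unsigned int *bit)
{
        unsigned long offset = pfn - bb->start_pfn;

        *chunk = offset / BM_BITS_PER_CHUNK;    /* which unsigned long in bb->data */
        *bit = offset % BM_BITS_PER_CHUNK;      /* which bit inside that chunk */
}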
368/* Functions that operate on memory bitmaps */
369
370static inline void memory_bm_reset_chunk(struct memory_bitmap *bm)
281{ 371{
282 unsigned long pfn = *zone_pfn + zone->zone_start_pfn; 372 bm->cur.chunk = 0;
283 struct page *page; 373 bm->cur.bit = -1;
374}
284 375
285 if (!pfn_valid(pfn)) 376static void memory_bm_position_reset(struct memory_bitmap *bm)
286 return 0; 377{
378 struct zone_bitmap *zone_bm;
287 379
288 page = pfn_to_page(pfn); 380 zone_bm = bm->zone_bm_list;
289 if (PageNosave(page)) 381 bm->cur.zone_bm = zone_bm;
290 return 0; 382 bm->cur.block = zone_bm->bm_blocks;
291 if (PageReserved(page) && pfn_is_nosave(pfn)) 383 memory_bm_reset_chunk(bm);
292 return 0; 384}
293 if (PageNosaveFree(page)) 385
294 return 0; 386static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free);
295 387
296 return 1; 388/**
389 * create_bm_block_list - create a list of block bitmap objects
390 */
391
392static inline struct bm_block *
393create_bm_block_list(unsigned int nr_blocks, struct chain_allocator *ca)
394{
395 struct bm_block *bblist = NULL;
396
397 while (nr_blocks-- > 0) {
398 struct bm_block *bb;
399
400 bb = chain_alloc(ca, sizeof(struct bm_block));
401 if (!bb)
402 return NULL;
403
404 bb->next = bblist;
405 bblist = bb;
406 }
407 return bblist;
297} 408}
298 409
299unsigned int count_data_pages(void) 410/**
411 * create_zone_bm_list - create a list of zone bitmap objects
412 */
413
414static inline struct zone_bitmap *
415create_zone_bm_list(unsigned int nr_zones, struct chain_allocator *ca)
300{ 416{
301 struct zone *zone; 417 struct zone_bitmap *zbmlist = NULL;
302 unsigned long zone_pfn;
303 unsigned int n = 0;
304 418
305 for_each_zone (zone) { 419 while (nr_zones-- > 0) {
306 if (is_highmem(zone)) 420 struct zone_bitmap *zbm;
307 continue; 421
308 mark_free_pages(zone); 422 zbm = chain_alloc(ca, sizeof(struct zone_bitmap));
309 for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) 423 if (!zbm)
310 n += saveable(zone, &zone_pfn); 424 return NULL;
425
426 zbm->next = zbmlist;
427 zbmlist = zbm;
311 } 428 }
312 return n; 429 return zbmlist;
313} 430}
314 431
315static void copy_data_pages(struct pbe *pblist) 432/**
433 * memory_bm_create - allocate memory for a memory bitmap
434 */
435
436static int
437memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed)
316{ 438{
439 struct chain_allocator ca;
317 struct zone *zone; 440 struct zone *zone;
318 unsigned long zone_pfn; 441 struct zone_bitmap *zone_bm;
319 struct pbe *pbe, *p; 442 struct bm_block *bb;
443 unsigned int nr;
444
445 chain_init(&ca, gfp_mask, safe_needed);
446
447 /* Compute the number of zones */
448 nr = 0;
449 for_each_zone (zone)
450 if (populated_zone(zone) && !is_highmem(zone))
451 nr++;
452
 453 /* Allocate the list of zone bitmap objects */
454 zone_bm = create_zone_bm_list(nr, &ca);
455 bm->zone_bm_list = zone_bm;
456 if (!zone_bm) {
457 chain_free(&ca, PG_UNSAFE_CLEAR);
458 return -ENOMEM;
459 }
320 460
321 pbe = pblist; 461 /* Initialize the zone bitmap objects */
322 for_each_zone (zone) { 462 for_each_zone (zone) {
323 if (is_highmem(zone)) 463 unsigned long pfn;
464
465 if (!populated_zone(zone) || is_highmem(zone))
324 continue; 466 continue;
325 mark_free_pages(zone); 467
326 /* This is necessary for swsusp_free() */ 468 zone_bm->start_pfn = zone->zone_start_pfn;
327 for_each_pb_page (p, pblist) 469 zone_bm->end_pfn = zone->zone_start_pfn + zone->spanned_pages;
328 SetPageNosaveFree(virt_to_page(p)); 470 /* Allocate the list of bitmap block objects */
329 for_each_pbe (p, pblist) 471 nr = DIV_ROUND_UP(zone->spanned_pages, BM_BITS_PER_BLOCK);
330 SetPageNosaveFree(virt_to_page(p->address)); 472 bb = create_bm_block_list(nr, &ca);
331 for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) { 473 zone_bm->bm_blocks = bb;
332 if (saveable(zone, &zone_pfn)) { 474 zone_bm->cur_block = bb;
333 struct page *page; 475 if (!bb)
334 page = pfn_to_page(zone_pfn + zone->zone_start_pfn); 476 goto Free;
335 BUG_ON(!pbe); 477
336 pbe->orig_address = (unsigned long)page_address(page); 478 nr = zone->spanned_pages;
337 /* copy_page is not usable for copying task structs. */ 479 pfn = zone->zone_start_pfn;
338 memcpy((void *)pbe->address, (void *)pbe->orig_address, PAGE_SIZE); 480 /* Initialize the bitmap block objects */
339 pbe = pbe->next; 481 while (bb) {
482 unsigned long *ptr;
483
484 ptr = alloc_image_page(gfp_mask, safe_needed);
485 bb->data = ptr;
486 if (!ptr)
487 goto Free;
488
489 bb->start_pfn = pfn;
490 if (nr >= BM_BITS_PER_BLOCK) {
491 pfn += BM_BITS_PER_BLOCK;
492 bb->size = BM_CHUNKS_PER_BLOCK;
493 nr -= BM_BITS_PER_BLOCK;
494 } else {
495 /* This is executed only once in the loop */
496 pfn += nr;
497 bb->size = DIV_ROUND_UP(nr, BM_BITS_PER_CHUNK);
340 } 498 }
499 bb->end_pfn = pfn;
500 bb = bb->next;
341 } 501 }
502 zone_bm = zone_bm->next;
342 } 503 }
343 BUG_ON(pbe); 504 bm->p_list = ca.chain;
344} 505 memory_bm_position_reset(bm);
506 return 0;
345 507
508Free:
509 bm->p_list = ca.chain;
510 memory_bm_free(bm, PG_UNSAFE_CLEAR);
511 return -ENOMEM;
512}
346 513
347/** 514/**
348 * free_pagedir - free pages allocated with alloc_pagedir() 515 * memory_bm_free - free memory occupied by the memory bitmap @bm
349 */ 516 */
350 517
351static void free_pagedir(struct pbe *pblist, int clear_nosave_free) 518static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free)
352{ 519{
353 struct pbe *pbe; 520 struct zone_bitmap *zone_bm;
354 521
355 while (pblist) { 522 /* Free the list of bit blocks for each zone_bitmap object */
356 pbe = (pblist + PB_PAGE_SKIP)->next; 523 zone_bm = bm->zone_bm_list;
357 ClearPageNosave(virt_to_page(pblist)); 524 while (zone_bm) {
358 if (clear_nosave_free) 525 struct bm_block *bb;
359 ClearPageNosaveFree(virt_to_page(pblist)); 526
360 free_page((unsigned long)pblist); 527 bb = zone_bm->bm_blocks;
361 pblist = pbe; 528 while (bb) {
529 if (bb->data)
530 free_image_page(bb->data, clear_nosave_free);
531 bb = bb->next;
532 }
533 zone_bm = zone_bm->next;
362 } 534 }
535 free_list_of_pages(bm->p_list, clear_nosave_free);
536 bm->zone_bm_list = NULL;
363} 537}
364 538
365/** 539/**
366 * fill_pb_page - Create a list of PBEs on a given memory page 540 * memory_bm_set_bit - set the bit in the bitmap @bm that corresponds
 541 * to a given pfn. The cur.zone_bm member of @bm and the cur_block
 542 * member of the selected zone bitmap are updated.
543 *
 544 * If the bit cannot be set, the function returns -EINVAL.
367 */ 545 */
368 546
369static inline void fill_pb_page(struct pbe *pbpage) 547static int
548memory_bm_set_bit(struct memory_bitmap *bm, unsigned long pfn)
370{ 549{
371 struct pbe *p; 550 struct zone_bitmap *zone_bm;
372 551 struct bm_block *bb;
373 p = pbpage; 552
374 pbpage += PB_PAGE_SKIP; 553 /* Check if the pfn is from the current zone */
375 do 554 zone_bm = bm->cur.zone_bm;
376 p->next = p + 1; 555 if (pfn < zone_bm->start_pfn || pfn >= zone_bm->end_pfn) {
377 while (++p < pbpage); 556 zone_bm = bm->zone_bm_list;
557 /* We don't assume that the zones are sorted by pfns */
558 while (pfn < zone_bm->start_pfn || pfn >= zone_bm->end_pfn) {
559 zone_bm = zone_bm->next;
560 if (unlikely(!zone_bm))
561 return -EINVAL;
562 }
563 bm->cur.zone_bm = zone_bm;
564 }
565 /* Check if the pfn corresponds to the current bitmap block */
566 bb = zone_bm->cur_block;
567 if (pfn < bb->start_pfn)
568 bb = zone_bm->bm_blocks;
569
570 while (pfn >= bb->end_pfn) {
571 bb = bb->next;
572 if (unlikely(!bb))
573 return -EINVAL;
574 }
575 zone_bm->cur_block = bb;
576 pfn -= bb->start_pfn;
577 set_bit(pfn % BM_BITS_PER_CHUNK, bb->data + pfn / BM_BITS_PER_CHUNK);
578 return 0;
378} 579}
379 580
380/** 581/* Two auxiliary functions for memory_bm_next_pfn */
381 * create_pbe_list - Create a list of PBEs on top of a given chain
382 * of memory pages allocated with alloc_pagedir()
383 */
384 582
385static inline void create_pbe_list(struct pbe *pblist, unsigned int nr_pages) 583/* Find the first set bit in the given chunk, if there is one */
386{
387 struct pbe *pbpage, *p;
388 unsigned int num = PBES_PER_PAGE;
389 584
390 for_each_pb_page (pbpage, pblist) { 585static inline int next_bit_in_chunk(int bit, unsigned long *chunk_p)
391 if (num >= nr_pages) 586{
392 break; 587 bit++;
588 while (bit < BM_BITS_PER_CHUNK) {
589 if (test_bit(bit, chunk_p))
590 return bit;
393 591
394 fill_pb_page(pbpage); 592 bit++;
395 num += PBES_PER_PAGE;
396 }
397 if (pbpage) {
398 for (num -= PBES_PER_PAGE - 1, p = pbpage; num < nr_pages; p++, num++)
399 p->next = p + 1;
400 p->next = NULL;
401 } 593 }
594 return -1;
402} 595}
403 596
404static unsigned int unsafe_pages; 597/* Find a chunk containing some bits set in given block of bits */
598
599static inline int next_chunk_in_block(int n, struct bm_block *bb)
600{
601 n++;
602 while (n < bb->size) {
603 if (bb->data[n])
604 return n;
605
606 n++;
607 }
608 return -1;
609}
405 610
406/** 611/**
407 * @safe_needed - on resume, for storing the PBE list and the image, 612 * memory_bm_next_pfn - find the pfn that corresponds to the next set bit
408 * we can only use memory pages that do not conflict with the pages 613 * in the bitmap @bm. If the pfn cannot be found, BM_END_OF_MAP is
409 * used before suspend. 614 * returned.
410 * 615 *
411 * The unsafe pages are marked with the PG_nosave_free flag 616 * It is required to run memory_bm_position_reset() before the first call to
412 * and we count them using unsafe_pages 617 * this function.
413 */ 618 */
414 619
415static inline void *alloc_image_page(gfp_t gfp_mask, int safe_needed) 620static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm)
416{ 621{
417 void *res; 622 struct zone_bitmap *zone_bm;
418 623 struct bm_block *bb;
419 res = (void *)get_zeroed_page(gfp_mask); 624 int chunk;
420 if (safe_needed) 625 int bit;
421 while (res && PageNosaveFree(virt_to_page(res))) { 626
422 /* The page is unsafe, mark it for swsusp_free() */ 627 do {
423 SetPageNosave(virt_to_page(res)); 628 bb = bm->cur.block;
424 unsafe_pages++; 629 do {
425 res = (void *)get_zeroed_page(gfp_mask); 630 chunk = bm->cur.chunk;
631 bit = bm->cur.bit;
632 do {
633 bit = next_bit_in_chunk(bit, bb->data + chunk);
634 if (bit >= 0)
635 goto Return_pfn;
636
637 chunk = next_chunk_in_block(chunk, bb);
638 bit = -1;
639 } while (chunk >= 0);
640 bb = bb->next;
641 bm->cur.block = bb;
642 memory_bm_reset_chunk(bm);
643 } while (bb);
644 zone_bm = bm->cur.zone_bm->next;
645 if (zone_bm) {
646 bm->cur.zone_bm = zone_bm;
647 bm->cur.block = zone_bm->bm_blocks;
648 memory_bm_reset_chunk(bm);
426 } 649 }
427 if (res) { 650 } while (zone_bm);
428 SetPageNosave(virt_to_page(res)); 651 memory_bm_position_reset(bm);
429 SetPageNosaveFree(virt_to_page(res)); 652 return BM_END_OF_MAP;
430 } 653
654Return_pfn:
655 bm->cur.chunk = chunk;
656 bm->cur.bit = bit;
657 return bb->start_pfn + chunk * BM_BITS_PER_CHUNK + bit;
658}
659
660/**
661 * snapshot_additional_pages - estimate the number of additional pages
 662 * needed for setting up the suspend image data structures for a given
 663 * zone (usually the returned value is greater than the exact number)
664 */
665
666unsigned int snapshot_additional_pages(struct zone *zone)
667{
668 unsigned int res;
669
670 res = DIV_ROUND_UP(zone->spanned_pages, BM_BITS_PER_BLOCK);
671 res += DIV_ROUND_UP(res * sizeof(struct bm_block), PAGE_SIZE);
431 return res; 672 return res;
432} 673}
433 674
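As a rough worked example (assuming a 64-bit configuration with 4 KB pages, so BM_BITS_PER_BLOCK is 32768 and sizeof(struct bm_block) is about 40 bytes): a zone spanning 1,048,576 pages needs 32 bitmap blocks, and their descriptors fit into a single extra page, so the estimate comes to 33 additional pages.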
434unsigned long get_safe_page(gfp_t gfp_mask) 675/**
676 * pfn_is_nosave - check if given pfn is in the 'nosave' section
677 */
678
679static inline int pfn_is_nosave(unsigned long pfn)
435{ 680{
436 return (unsigned long)alloc_image_page(gfp_mask, 1); 681 unsigned long nosave_begin_pfn = __pa(&__nosave_begin) >> PAGE_SHIFT;
682 unsigned long nosave_end_pfn = PAGE_ALIGN(__pa(&__nosave_end)) >> PAGE_SHIFT;
683 return (pfn >= nosave_begin_pfn) && (pfn < nosave_end_pfn);
437} 684}
438 685
439/** 686/**
440 * alloc_pagedir - Allocate the page directory. 687 * saveable - Determine whether a page should be cloned or not.
441 * 688 * @pfn: The page
442 * First, determine exactly how many pages we need and
443 * allocate them.
444 *
445 * We arrange the pages in a chain: each page is an array of PBES_PER_PAGE
446 * struct pbe elements (pbes) and the last element in the page points
447 * to the next page.
448 * 689 *
449 * On each page we set up a list of struct_pbe elements. 690 * We save a page if it isn't Nosave, and is not in the range of pages
691 * statically defined as 'unsaveable', and it
692 * isn't a part of a free chunk of pages.
450 */ 693 */
451 694
452static struct pbe *alloc_pagedir(unsigned int nr_pages, gfp_t gfp_mask, 695static struct page *saveable_page(unsigned long pfn)
453 int safe_needed)
454{ 696{
455 unsigned int num; 697 struct page *page;
456 struct pbe *pblist, *pbe; 698
699 if (!pfn_valid(pfn))
700 return NULL;
457 701
458 if (!nr_pages) 702 page = pfn_to_page(pfn);
703
704 if (PageNosave(page))
705 return NULL;
706 if (PageReserved(page) && pfn_is_nosave(pfn))
459 return NULL; 707 return NULL;
708 if (PageNosaveFree(page))
709 return NULL;
710
711 return page;
712}
713
714unsigned int count_data_pages(void)
715{
716 struct zone *zone;
717 unsigned long pfn, max_zone_pfn;
718 unsigned int n = 0;
719
720 for_each_zone (zone) {
721 if (is_highmem(zone))
722 continue;
723 mark_free_pages(zone);
724 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
725 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
726 n += !!saveable_page(pfn);
727 }
728 return n;
729}
730
731static inline void copy_data_page(long *dst, long *src)
732{
733 int n;
734
735 /* copy_page and memcpy are not usable for copying task structs. */
736 for (n = PAGE_SIZE / sizeof(long); n; n--)
737 *dst++ = *src++;
738}
739
740static void
741copy_data_pages(struct memory_bitmap *copy_bm, struct memory_bitmap *orig_bm)
742{
743 struct zone *zone;
744 unsigned long pfn;
745
746 for_each_zone (zone) {
747 unsigned long max_zone_pfn;
460 748
461 pblist = alloc_image_page(gfp_mask, safe_needed); 749 if (is_highmem(zone))
462 /* FIXME: rewrite this ugly loop */ 750 continue;
463 for (pbe = pblist, num = PBES_PER_PAGE; pbe && num < nr_pages; 751
464 pbe = pbe->next, num += PBES_PER_PAGE) { 752 mark_free_pages(zone);
465 pbe += PB_PAGE_SKIP; 753 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
466 pbe->next = alloc_image_page(gfp_mask, safe_needed); 754 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
755 if (saveable_page(pfn))
756 memory_bm_set_bit(orig_bm, pfn);
467 } 757 }
468 if (!pbe) { /* get_zeroed_page() failed */ 758 memory_bm_position_reset(orig_bm);
469 free_pagedir(pblist, 1); 759 memory_bm_position_reset(copy_bm);
470 pblist = NULL; 760 do {
471 } else 761 pfn = memory_bm_next_pfn(orig_bm);
472 create_pbe_list(pblist, nr_pages); 762 if (likely(pfn != BM_END_OF_MAP)) {
473 return pblist; 763 struct page *page;
764 void *src;
765
766 page = pfn_to_page(pfn);
767 src = page_address(page);
768 page = pfn_to_page(memory_bm_next_pfn(copy_bm));
769 copy_data_page(page_address(page), src);
770 }
771 } while (pfn != BM_END_OF_MAP);
474} 772}
475 773
476/** 774/**
477 * Free pages we allocated for suspend. Suspend pages are alocated 775 * swsusp_free - free pages allocated for the suspend.
478 * before atomic copy, so we need to free them after resume. 776 *
 777 * Suspend pages are allocated before the atomic copy is made, so we
778 * need to release them after the resume.
479 */ 779 */
480 780
481void swsusp_free(void) 781void swsusp_free(void)
482{ 782{
483 struct zone *zone; 783 struct zone *zone;
484 unsigned long zone_pfn; 784 unsigned long pfn, max_zone_pfn;
485 785
486 for_each_zone(zone) { 786 for_each_zone(zone) {
487 for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) 787 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
488 if (pfn_valid(zone_pfn + zone->zone_start_pfn)) { 788 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
489 struct page *page; 789 if (pfn_valid(pfn)) {
490 page = pfn_to_page(zone_pfn + zone->zone_start_pfn); 790 struct page *page = pfn_to_page(pfn);
791
491 if (PageNosave(page) && PageNosaveFree(page)) { 792 if (PageNosave(page) && PageNosaveFree(page)) {
492 ClearPageNosave(page); 793 ClearPageNosave(page);
493 ClearPageNosaveFree(page); 794 ClearPageNosaveFree(page);
@@ -497,7 +798,7 @@ void swsusp_free(void)
497 } 798 }
498 nr_copy_pages = 0; 799 nr_copy_pages = 0;
499 nr_meta_pages = 0; 800 nr_meta_pages = 0;
500 pagedir_nosave = NULL; 801 restore_pblist = NULL;
501 buffer = NULL; 802 buffer = NULL;
502} 803}
503 804
@@ -512,46 +813,57 @@ void swsusp_free(void)
512static int enough_free_mem(unsigned int nr_pages) 813static int enough_free_mem(unsigned int nr_pages)
513{ 814{
514 struct zone *zone; 815 struct zone *zone;
515 unsigned int n = 0; 816 unsigned int free = 0, meta = 0;
516 817
517 for_each_zone (zone) 818 for_each_zone (zone)
518 if (!is_highmem(zone)) 819 if (!is_highmem(zone)) {
519 n += zone->free_pages; 820 free += zone->free_pages;
520 pr_debug("swsusp: available memory: %u pages\n", n); 821 meta += snapshot_additional_pages(zone);
521 return n > (nr_pages + PAGES_FOR_IO + 822 }
522 (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE);
523}
524 823
525static int alloc_data_pages(struct pbe *pblist, gfp_t gfp_mask, int safe_needed) 824 pr_debug("swsusp: pages needed: %u + %u + %u, available pages: %u\n",
526{ 825 nr_pages, PAGES_FOR_IO, meta, free);
527 struct pbe *p;
528 826
529 for_each_pbe (p, pblist) { 827 return free > nr_pages + PAGES_FOR_IO + meta;
530 p->address = (unsigned long)alloc_image_page(gfp_mask, safe_needed);
531 if (!p->address)
532 return -ENOMEM;
533 }
534 return 0;
535} 828}
536 829
537static struct pbe *swsusp_alloc(unsigned int nr_pages) 830static int
831swsusp_alloc(struct memory_bitmap *orig_bm, struct memory_bitmap *copy_bm,
832 unsigned int nr_pages)
538{ 833{
539 struct pbe *pblist; 834 int error;
540 835
541 if (!(pblist = alloc_pagedir(nr_pages, GFP_ATOMIC | __GFP_COLD, 0))) { 836 error = memory_bm_create(orig_bm, GFP_ATOMIC | __GFP_COLD, PG_ANY);
542 printk(KERN_ERR "suspend: Allocating pagedir failed.\n"); 837 if (error)
543 return NULL; 838 goto Free;
544 }
545 839
546 if (alloc_data_pages(pblist, GFP_ATOMIC | __GFP_COLD, 0)) { 840 error = memory_bm_create(copy_bm, GFP_ATOMIC | __GFP_COLD, PG_ANY);
547 printk(KERN_ERR "suspend: Allocating image pages failed.\n"); 841 if (error)
548 swsusp_free(); 842 goto Free;
549 return NULL; 843
844 while (nr_pages-- > 0) {
845 struct page *page = alloc_page(GFP_ATOMIC | __GFP_COLD);
846 if (!page)
847 goto Free;
848
849 SetPageNosave(page);
850 SetPageNosaveFree(page);
851 memory_bm_set_bit(copy_bm, page_to_pfn(page));
550 } 852 }
853 return 0;
551 854
552 return pblist; 855Free:
856 swsusp_free();
857 return -ENOMEM;
553} 858}
554 859
860/* Memory bitmap used for marking saveable pages */
861static struct memory_bitmap orig_bm;
862/* Memory bitmap used for marking allocated pages that will contain the copies
863 * of saveable pages
864 */
865static struct memory_bitmap copy_bm;
866
555asmlinkage int swsusp_save(void) 867asmlinkage int swsusp_save(void)
556{ 868{
557 unsigned int nr_pages; 869 unsigned int nr_pages;
@@ -562,25 +874,19 @@ asmlinkage int swsusp_save(void)
562 nr_pages = count_data_pages(); 874 nr_pages = count_data_pages();
563 printk("swsusp: Need to copy %u pages\n", nr_pages); 875 printk("swsusp: Need to copy %u pages\n", nr_pages);
564 876
565 pr_debug("swsusp: pages needed: %u + %lu + %u, free: %u\n",
566 nr_pages,
567 (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE,
568 PAGES_FOR_IO, nr_free_pages());
569
570 if (!enough_free_mem(nr_pages)) { 877 if (!enough_free_mem(nr_pages)) {
571 printk(KERN_ERR "swsusp: Not enough free memory\n"); 878 printk(KERN_ERR "swsusp: Not enough free memory\n");
572 return -ENOMEM; 879 return -ENOMEM;
573 } 880 }
574 881
575 pagedir_nosave = swsusp_alloc(nr_pages); 882 if (swsusp_alloc(&orig_bm, &copy_bm, nr_pages))
576 if (!pagedir_nosave)
577 return -ENOMEM; 883 return -ENOMEM;
578 884
579 /* During allocating of suspend pagedir, new cold pages may appear. 885 /* During allocating of suspend pagedir, new cold pages may appear.
580 * Kill them. 886 * Kill them.
581 */ 887 */
582 drain_local_pages(); 888 drain_local_pages();
583 copy_data_pages(pagedir_nosave); 889 copy_data_pages(&copy_bm, &orig_bm);
584 890
585 /* 891 /*
586 * End of critical section. From now on, we can write to memory, 892 * End of critical section. From now on, we can write to memory,
@@ -609,22 +915,20 @@ static void init_header(struct swsusp_info *info)
609} 915}
610 916
611/** 917/**
612 * pack_orig_addresses - the .orig_address fields of the PBEs from the 918 * pack_pfns - pfns corresponding to the set bits found in the bitmap @bm
613 * list starting at @pbe are stored in the array @buf[] (1 page) 919 * are stored in the array @buf[] (1 page at a time)
614 */ 920 */
615 921
616static inline struct pbe *pack_orig_addresses(unsigned long *buf, struct pbe *pbe) 922static inline void
923pack_pfns(unsigned long *buf, struct memory_bitmap *bm)
617{ 924{
618 int j; 925 int j;
619 926
620 for (j = 0; j < PAGE_SIZE / sizeof(long) && pbe; j++) { 927 for (j = 0; j < PAGE_SIZE / sizeof(long); j++) {
621 buf[j] = pbe->orig_address; 928 buf[j] = memory_bm_next_pfn(bm);
622 pbe = pbe->next; 929 if (unlikely(buf[j] == BM_END_OF_MAP))
930 break;
623 } 931 }
624 if (!pbe)
625 for (; j < PAGE_SIZE / sizeof(long); j++)
626 buf[j] = 0;
627 return pbe;
628} 932}
629 933
630/** 934/**
@@ -651,37 +955,39 @@ static inline struct pbe *pack_orig_addresses(unsigned long *buf, struct pbe *pb
651 955
652int snapshot_read_next(struct snapshot_handle *handle, size_t count) 956int snapshot_read_next(struct snapshot_handle *handle, size_t count)
653{ 957{
654 if (handle->page > nr_meta_pages + nr_copy_pages) 958 if (handle->cur > nr_meta_pages + nr_copy_pages)
655 return 0; 959 return 0;
960
656 if (!buffer) { 961 if (!buffer) {
657 /* This makes the buffer be freed by swsusp_free() */ 962 /* This makes the buffer be freed by swsusp_free() */
658 buffer = alloc_image_page(GFP_ATOMIC, 0); 963 buffer = alloc_image_page(GFP_ATOMIC, PG_ANY);
659 if (!buffer) 964 if (!buffer)
660 return -ENOMEM; 965 return -ENOMEM;
661 } 966 }
662 if (!handle->offset) { 967 if (!handle->offset) {
663 init_header((struct swsusp_info *)buffer); 968 init_header((struct swsusp_info *)buffer);
664 handle->buffer = buffer; 969 handle->buffer = buffer;
665 handle->pbe = pagedir_nosave; 970 memory_bm_position_reset(&orig_bm);
971 memory_bm_position_reset(&copy_bm);
666 } 972 }
667 if (handle->prev < handle->page) { 973 if (handle->prev < handle->cur) {
668 if (handle->page <= nr_meta_pages) { 974 if (handle->cur <= nr_meta_pages) {
669 handle->pbe = pack_orig_addresses(buffer, handle->pbe); 975 memset(buffer, 0, PAGE_SIZE);
670 if (!handle->pbe) 976 pack_pfns(buffer, &orig_bm);
671 handle->pbe = pagedir_nosave;
672 } else { 977 } else {
673 handle->buffer = (void *)handle->pbe->address; 978 unsigned long pfn = memory_bm_next_pfn(&copy_bm);
674 handle->pbe = handle->pbe->next; 979
980 handle->buffer = page_address(pfn_to_page(pfn));
675 } 981 }
676 handle->prev = handle->page; 982 handle->prev = handle->cur;
677 } 983 }
678 handle->buf_offset = handle->page_offset; 984 handle->buf_offset = handle->cur_offset;
679 if (handle->page_offset + count >= PAGE_SIZE) { 985 if (handle->cur_offset + count >= PAGE_SIZE) {
680 count = PAGE_SIZE - handle->page_offset; 986 count = PAGE_SIZE - handle->cur_offset;
681 handle->page_offset = 0; 987 handle->cur_offset = 0;
682 handle->page++; 988 handle->cur++;
683 } else { 989 } else {
684 handle->page_offset += count; 990 handle->cur_offset += count;
685 } 991 }
686 handle->offset += count; 992 handle->offset += count;
687 return count; 993 return count;
@@ -693,47 +999,50 @@ int snapshot_read_next(struct snapshot_handle *handle, size_t count)
693 * had been used before suspend 999 * had been used before suspend
694 */ 1000 */
695 1001
696static int mark_unsafe_pages(struct pbe *pblist) 1002static int mark_unsafe_pages(struct memory_bitmap *bm)
697{ 1003{
698 struct zone *zone; 1004 struct zone *zone;
699 unsigned long zone_pfn; 1005 unsigned long pfn, max_zone_pfn;
700 struct pbe *p;
701
702 if (!pblist) /* a sanity check */
703 return -EINVAL;
704 1006
705 /* Clear page flags */ 1007 /* Clear page flags */
706 for_each_zone (zone) { 1008 for_each_zone (zone) {
707 for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) 1009 max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages;
708 if (pfn_valid(zone_pfn + zone->zone_start_pfn)) 1010 for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++)
709 ClearPageNosaveFree(pfn_to_page(zone_pfn + 1011 if (pfn_valid(pfn))
710 zone->zone_start_pfn)); 1012 ClearPageNosaveFree(pfn_to_page(pfn));
711 } 1013 }
712 1014
713 /* Mark orig addresses */ 1015 /* Mark pages that correspond to the "original" pfns as "unsafe" */
714 for_each_pbe (p, pblist) { 1016 memory_bm_position_reset(bm);
715 if (virt_addr_valid(p->orig_address)) 1017 do {
716 SetPageNosaveFree(virt_to_page(p->orig_address)); 1018 pfn = memory_bm_next_pfn(bm);
717 else 1019 if (likely(pfn != BM_END_OF_MAP)) {
718 return -EFAULT; 1020 if (likely(pfn_valid(pfn)))
719 } 1021 SetPageNosaveFree(pfn_to_page(pfn));
1022 else
1023 return -EFAULT;
1024 }
1025 } while (pfn != BM_END_OF_MAP);
720 1026
721 unsafe_pages = 0; 1027 allocated_unsafe_pages = 0;
722 1028
723 return 0; 1029 return 0;
724} 1030}
725 1031
726static void copy_page_backup_list(struct pbe *dst, struct pbe *src) 1032static void
1033duplicate_memory_bitmap(struct memory_bitmap *dst, struct memory_bitmap *src)
727{ 1034{
728 /* We assume both lists contain the same number of elements */ 1035 unsigned long pfn;
729 while (src) { 1036
730 dst->orig_address = src->orig_address; 1037 memory_bm_position_reset(src);
731 dst = dst->next; 1038 pfn = memory_bm_next_pfn(src);
732 src = src->next; 1039 while (pfn != BM_END_OF_MAP) {
1040 memory_bm_set_bit(dst, pfn);
1041 pfn = memory_bm_next_pfn(src);
733 } 1042 }
734} 1043}
735 1044
736static int check_header(struct swsusp_info *info) 1045static inline int check_header(struct swsusp_info *info)
737{ 1046{
738 char *reason = NULL; 1047 char *reason = NULL;
739 1048
@@ -760,19 +1069,14 @@ static int check_header(struct swsusp_info *info)
760 * load header - check the image header and copy data from it 1069 * load header - check the image header and copy data from it
761 */ 1070 */
762 1071
763static int load_header(struct snapshot_handle *handle, 1072static int
764 struct swsusp_info *info) 1073load_header(struct swsusp_info *info)
765{ 1074{
766 int error; 1075 int error;
767 struct pbe *pblist;
768 1076
1077 restore_pblist = NULL;
769 error = check_header(info); 1078 error = check_header(info);
770 if (!error) { 1079 if (!error) {
771 pblist = alloc_pagedir(info->image_pages, GFP_ATOMIC, 0);
772 if (!pblist)
773 return -ENOMEM;
774 pagedir_nosave = pblist;
775 handle->pbe = pblist;
776 nr_copy_pages = info->image_pages; 1080 nr_copy_pages = info->image_pages;
777 nr_meta_pages = info->pages - info->image_pages - 1; 1081 nr_meta_pages = info->pages - info->image_pages - 1;
778 } 1082 }
@@ -780,113 +1084,137 @@ static int load_header(struct snapshot_handle *handle,
780} 1084}
781 1085
782/** 1086/**
783 * unpack_orig_addresses - copy the elements of @buf[] (1 page) to 1087 * unpack_orig_pfns - for each element of @buf[] (1 page at a time) set
784 * the PBEs in the list starting at @pbe 1088 * the corresponding bit in the memory bitmap @bm
785 */ 1089 */
786 1090
787static inline struct pbe *unpack_orig_addresses(unsigned long *buf, 1091static inline void
788 struct pbe *pbe) 1092unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm)
789{ 1093{
790 int j; 1094 int j;
791 1095
792 for (j = 0; j < PAGE_SIZE / sizeof(long) && pbe; j++) { 1096 for (j = 0; j < PAGE_SIZE / sizeof(long); j++) {
793 pbe->orig_address = buf[j]; 1097 if (unlikely(buf[j] == BM_END_OF_MAP))
794 pbe = pbe->next; 1098 break;
1099
1100 memory_bm_set_bit(bm, buf[j]);
795 } 1101 }
796 return pbe;
797} 1102}
798 1103
799/** 1104/**
800 * prepare_image - use metadata contained in the PBE list 1105 * prepare_image - use the memory bitmap @bm to mark the pages that will
801 * pointed to by pagedir_nosave to mark the pages that will 1106 * be overwritten in the process of restoring the system memory state
802 * be overwritten in the process of restoring the system 1107 * from the suspend image ("unsafe" pages) and allocate memory for the
803 * memory state from the image ("unsafe" pages) and allocate 1108 * image.
804 * memory for the image
805 * 1109 *
806 * The idea is to allocate the PBE list first and then 1110 * The idea is to allocate a new memory bitmap first and then allocate
807 * allocate as many pages as it's needed for the image data, 1111 * as many pages as needed for the image data, but not to assign these
808 * but not to assign these pages to the PBEs initially. 1112 * pages to specific tasks initially. Instead, we just mark them as
809 * Instead, we just mark them as allocated and create a list 1113 * allocated and create a list of "safe" pages that will be used later.
810 * of "safe" which will be used later
811 */ 1114 */
812 1115
813struct safe_page { 1116#define PBES_PER_LINKED_PAGE (LINKED_PAGE_DATA_SIZE / sizeof(struct pbe))
814 struct safe_page *next;
815 char padding[PAGE_SIZE - sizeof(void *)];
816};
817 1117
818static struct safe_page *safe_pages; 1118static struct linked_page *safe_pages_list;
819 1119
820static int prepare_image(struct snapshot_handle *handle) 1120static int
1121prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm)
821{ 1122{
822 int error = 0; 1123 unsigned int nr_pages;
823 unsigned int nr_pages = nr_copy_pages; 1124 struct linked_page *sp_list, *lp;
824 struct pbe *p, *pblist = NULL; 1125 int error;
825 1126
826 p = pagedir_nosave; 1127 error = mark_unsafe_pages(bm);
827 error = mark_unsafe_pages(p); 1128 if (error)
828 if (!error) { 1129 goto Free;
829 pblist = alloc_pagedir(nr_pages, GFP_ATOMIC, 1); 1130
830 if (pblist) 1131 error = memory_bm_create(new_bm, GFP_ATOMIC, PG_SAFE);
831 copy_page_backup_list(pblist, p); 1132 if (error)
832 free_pagedir(p, 0); 1133 goto Free;
833 if (!pblist) 1134
1135 duplicate_memory_bitmap(new_bm, bm);
1136 memory_bm_free(bm, PG_UNSAFE_KEEP);
1137 /* Reserve some safe pages for potential later use.
1138 *
1139 * NOTE: This way we make sure there will be enough safe pages for the
1140 * chain_alloc() in get_buffer(). It is a bit wasteful, but
1141 * nr_copy_pages cannot be greater than 50% of the memory anyway.
1142 */
1143 sp_list = NULL;
 1144 /* nr_copy_pages cannot be less than allocated_unsafe_pages */
1145 nr_pages = nr_copy_pages - allocated_unsafe_pages;
1146 nr_pages = DIV_ROUND_UP(nr_pages, PBES_PER_LINKED_PAGE);
1147 while (nr_pages > 0) {
1148 lp = alloc_image_page(GFP_ATOMIC, PG_SAFE);
1149 if (!lp) {
834 error = -ENOMEM; 1150 error = -ENOMEM;
1151 goto Free;
1152 }
1153 lp->next = sp_list;
1154 sp_list = lp;
1155 nr_pages--;
835 } 1156 }
836 safe_pages = NULL; 1157 /* Preallocate memory for the image */
837 if (!error && nr_pages > unsafe_pages) { 1158 safe_pages_list = NULL;
838 nr_pages -= unsafe_pages; 1159 nr_pages = nr_copy_pages - allocated_unsafe_pages;
839 while (nr_pages--) { 1160 while (nr_pages > 0) {
840 struct safe_page *ptr; 1161 lp = (struct linked_page *)get_zeroed_page(GFP_ATOMIC);
841 1162 if (!lp) {
842 ptr = (struct safe_page *)get_zeroed_page(GFP_ATOMIC); 1163 error = -ENOMEM;
843 if (!ptr) { 1164 goto Free;
844 error = -ENOMEM; 1165 }
845 break; 1166 if (!PageNosaveFree(virt_to_page(lp))) {
846 } 1167 /* The page is "safe", add it to the list */
847 if (!PageNosaveFree(virt_to_page(ptr))) { 1168 lp->next = safe_pages_list;
848 /* The page is "safe", add it to the list */ 1169 safe_pages_list = lp;
849 ptr->next = safe_pages;
850 safe_pages = ptr;
851 }
852 /* Mark the page as allocated */
853 SetPageNosave(virt_to_page(ptr));
854 SetPageNosaveFree(virt_to_page(ptr));
855 } 1170 }
1171 /* Mark the page as allocated */
1172 SetPageNosave(virt_to_page(lp));
1173 SetPageNosaveFree(virt_to_page(lp));
1174 nr_pages--;
856 } 1175 }
857 if (!error) { 1176 /* Free the reserved safe pages so that chain_alloc() can use them */
858 pagedir_nosave = pblist; 1177 while (sp_list) {
859 } else { 1178 lp = sp_list->next;
860 handle->pbe = NULL; 1179 free_image_page(sp_list, PG_UNSAFE_CLEAR);
861 swsusp_free(); 1180 sp_list = lp;
862 } 1181 }
1182 return 0;
1183
1184Free:
1185 swsusp_free();
863 return error; 1186 return error;
864} 1187}
865 1188
866static void *get_buffer(struct snapshot_handle *handle) 1189/**
1190 * get_buffer - compute the address that snapshot_write_next() should
1191 * set for its caller to write to.
1192 */
1193
1194static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca)
867{ 1195{
868 struct pbe *pbe = handle->pbe, *last = handle->last_pbe; 1196 struct pbe *pbe;
869 struct page *page = virt_to_page(pbe->orig_address); 1197 struct page *page = pfn_to_page(memory_bm_next_pfn(bm));
870 1198
871 if (PageNosave(page) && PageNosaveFree(page)) { 1199 if (PageNosave(page) && PageNosaveFree(page))
872 /* 1200 /* We have allocated the "original" page frame and we can
873 * We have allocated the "original" page frame and we can 1201 * use it directly to store the loaded page.
874 * use it directly to store the read page
875 */ 1202 */
876 pbe->address = 0; 1203 return page_address(page);
877 if (last && last->next) 1204
878 last->next = NULL; 1205 /* The "original" page frame has not been allocated and we have to
879 return (void *)pbe->orig_address; 1206 * use a "safe" page frame to store the loaded page.
880 }
881 /*
882 * The "original" page frame has not been allocated and we have to
883 * use a "safe" page frame to store the read page
884 */ 1207 */
885 pbe->address = (unsigned long)safe_pages; 1208 pbe = chain_alloc(ca, sizeof(struct pbe));
886 safe_pages = safe_pages->next; 1209 if (!pbe) {
887 if (last) 1210 swsusp_free();
888 last->next = pbe; 1211 return NULL;
889 handle->last_pbe = pbe; 1212 }
1213 pbe->orig_address = (unsigned long)page_address(page);
1214 pbe->address = (unsigned long)safe_pages_list;
1215 safe_pages_list = safe_pages_list->next;
1216 pbe->next = restore_pblist;
1217 restore_pblist = pbe;
890 return (void *)pbe->address; 1218 return (void *)pbe->address;
891} 1219}
892 1220
@@ -914,46 +1242,60 @@ static void *get_buffer(struct snapshot_handle *handle)
914 1242
915int snapshot_write_next(struct snapshot_handle *handle, size_t count) 1243int snapshot_write_next(struct snapshot_handle *handle, size_t count)
916{ 1244{
1245 static struct chain_allocator ca;
917 int error = 0; 1246 int error = 0;
918 1247
919 if (handle->prev && handle->page > nr_meta_pages + nr_copy_pages) 1248 /* Check if we have already loaded the entire image */
1249 if (handle->prev && handle->cur > nr_meta_pages + nr_copy_pages)
920 return 0; 1250 return 0;
1251
921 if (!buffer) { 1252 if (!buffer) {
922 /* This makes the buffer be freed by swsusp_free() */ 1253 /* This makes the buffer be freed by swsusp_free() */
923 buffer = alloc_image_page(GFP_ATOMIC, 0); 1254 buffer = alloc_image_page(GFP_ATOMIC, PG_ANY);
924 if (!buffer) 1255 if (!buffer)
925 return -ENOMEM; 1256 return -ENOMEM;
926 } 1257 }
927 if (!handle->offset) 1258 if (!handle->offset)
928 handle->buffer = buffer; 1259 handle->buffer = buffer;
929 if (handle->prev < handle->page) { 1260 handle->sync_read = 1;
930 if (!handle->prev) { 1261 if (handle->prev < handle->cur) {
931 error = load_header(handle, (struct swsusp_info *)buffer); 1262 if (handle->prev == 0) {
1263 error = load_header(buffer);
1264 if (error)
1265 return error;
1266
1267 error = memory_bm_create(&copy_bm, GFP_ATOMIC, PG_ANY);
932 if (error) 1268 if (error)
933 return error; 1269 return error;
1270
934 } else if (handle->prev <= nr_meta_pages) { 1271 } else if (handle->prev <= nr_meta_pages) {
935 handle->pbe = unpack_orig_addresses(buffer, handle->pbe); 1272 unpack_orig_pfns(buffer, &copy_bm);
936 if (!handle->pbe) { 1273 if (handle->prev == nr_meta_pages) {
937 error = prepare_image(handle); 1274 error = prepare_image(&orig_bm, &copy_bm);
938 if (error) 1275 if (error)
939 return error; 1276 return error;
940 handle->pbe = pagedir_nosave; 1277
941 handle->last_pbe = NULL; 1278 chain_init(&ca, GFP_ATOMIC, PG_SAFE);
942 handle->buffer = get_buffer(handle); 1279 memory_bm_position_reset(&orig_bm);
1280 restore_pblist = NULL;
1281 handle->buffer = get_buffer(&orig_bm, &ca);
1282 handle->sync_read = 0;
1283 if (!handle->buffer)
1284 return -ENOMEM;
943 } 1285 }
944 } else { 1286 } else {
945 handle->pbe = handle->pbe->next; 1287 handle->buffer = get_buffer(&orig_bm, &ca);
946 handle->buffer = get_buffer(handle); 1288 handle->sync_read = 0;
947 } 1289 }
948 handle->prev = handle->page; 1290 handle->prev = handle->cur;
949 } 1291 }
950 handle->buf_offset = handle->page_offset; 1292 handle->buf_offset = handle->cur_offset;
951 if (handle->page_offset + count >= PAGE_SIZE) { 1293 if (handle->cur_offset + count >= PAGE_SIZE) {
952 count = PAGE_SIZE - handle->page_offset; 1294 count = PAGE_SIZE - handle->cur_offset;
953 handle->page_offset = 0; 1295 handle->cur_offset = 0;
954 handle->page++; 1296 handle->cur++;
955 } else { 1297 } else {
956 handle->page_offset += count; 1298 handle->cur_offset += count;
957 } 1299 }
958 handle->offset += count; 1300 handle->offset += count;
959 return count; 1301 return count;
@@ -961,6 +1303,13 @@ int snapshot_write_next(struct snapshot_handle *handle, size_t count)
961 1303
962int snapshot_image_loaded(struct snapshot_handle *handle) 1304int snapshot_image_loaded(struct snapshot_handle *handle)
963{ 1305{
964 return !(!handle->pbe || handle->pbe->next || !nr_copy_pages || 1306 return !(!nr_copy_pages ||
965 handle->page <= nr_meta_pages + nr_copy_pages); 1307 handle->cur <= nr_meta_pages + nr_copy_pages);
1308}
1309
1310void snapshot_free_unused_memory(struct snapshot_handle *handle)
1311{
1312 /* Free only if we have loaded the image entirely */
1313 if (handle->prev && handle->cur > nr_meta_pages + nr_copy_pages)
1314 memory_bm_free(&orig_bm, PG_UNSAFE_CLEAR);
966} 1315}
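A hedged sketch of the consumer side that the snapshot_handle comment in power.h describes: feed the image back one chunk at a time and honour the sync_read hint. load_one_page() is a made-up stand-in for the swap reading done in kernel/power/swap.c, and wait_on_bio_chain() is the helper defined there.

static int stream_image_in(struct snapshot_handle *handle)
{
        struct bio *bio_chain = NULL;
        int ret;

        memset(handle, 0, sizeof(*handle));
        for (;;) {
                ret = snapshot_write_next(handle, PAGE_SIZE);
                if (ret <= 0)
                        break;
                /* read the next 'ret' bytes of the image into data_of(*handle) */
                ret = load_one_page(data_of(*handle), ret, &bio_chain);
                if (!ret && handle->sync_read)
                        ret = wait_on_bio_chain(&bio_chain);
                if (ret)
                        break;
        }
        if (!ret)
                ret = wait_on_bio_chain(&bio_chain);
        return ret;     /* 0 once the image has been written back completely */
}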
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 044b8e0c1025..9b2ee5344dee 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -22,6 +22,7 @@
22#include <linux/device.h> 22#include <linux/device.h>
23#include <linux/buffer_head.h> 23#include <linux/buffer_head.h>
24#include <linux/bio.h> 24#include <linux/bio.h>
25#include <linux/blkdev.h>
25#include <linux/swap.h> 26#include <linux/swap.h>
26#include <linux/swapops.h> 27#include <linux/swapops.h>
27#include <linux/pm.h> 28#include <linux/pm.h>
@@ -49,18 +50,16 @@ static int mark_swapfiles(swp_entry_t start)
49{ 50{
50 int error; 51 int error;
51 52
52 rw_swap_page_sync(READ, 53 rw_swap_page_sync(READ, swp_entry(root_swap, 0),
53 swp_entry(root_swap, 0), 54 virt_to_page((unsigned long)&swsusp_header), NULL);
54 virt_to_page((unsigned long)&swsusp_header));
55 if (!memcmp("SWAP-SPACE",swsusp_header.sig, 10) || 55 if (!memcmp("SWAP-SPACE",swsusp_header.sig, 10) ||
56 !memcmp("SWAPSPACE2",swsusp_header.sig, 10)) { 56 !memcmp("SWAPSPACE2",swsusp_header.sig, 10)) {
57 memcpy(swsusp_header.orig_sig,swsusp_header.sig, 10); 57 memcpy(swsusp_header.orig_sig,swsusp_header.sig, 10);
58 memcpy(swsusp_header.sig,SWSUSP_SIG, 10); 58 memcpy(swsusp_header.sig,SWSUSP_SIG, 10);
59 swsusp_header.image = start; 59 swsusp_header.image = start;
60 error = rw_swap_page_sync(WRITE, 60 error = rw_swap_page_sync(WRITE, swp_entry(root_swap, 0),
61 swp_entry(root_swap, 0), 61 virt_to_page((unsigned long)&swsusp_header),
62 virt_to_page((unsigned long) 62 NULL);
63 &swsusp_header));
64 } else { 63 } else {
65 pr_debug("swsusp: Partition is not swap space.\n"); 64 pr_debug("swsusp: Partition is not swap space.\n");
66 error = -ENODEV; 65 error = -ENODEV;
@@ -88,16 +87,37 @@ static int swsusp_swap_check(void) /* This is called before saving image */
88 * write_page - Write one page to given swap location. 87 * write_page - Write one page to given swap location.
89 * @buf: Address we're writing. 88 * @buf: Address we're writing.
90 * @offset: Offset of the swap page we're writing to. 89 * @offset: Offset of the swap page we're writing to.
90 * @bio_chain: Link the next write BIO here
91 */ 91 */
92 92
93static int write_page(void *buf, unsigned long offset) 93static int write_page(void *buf, unsigned long offset, struct bio **bio_chain)
94{ 94{
95 swp_entry_t entry; 95 swp_entry_t entry;
96 int error = -ENOSPC; 96 int error = -ENOSPC;
97 97
98 if (offset) { 98 if (offset) {
99 struct page *page = virt_to_page(buf);
100
101 if (bio_chain) {
102 /*
103 * Whether or not we successfully allocated a copy page,
104 * we take a ref on the page here. It gets undone in
105 * wait_on_bio_chain().
106 */
107 struct page *page_copy;
108 page_copy = alloc_page(GFP_ATOMIC);
109 if (page_copy == NULL) {
110 WARN_ON_ONCE(1);
111 bio_chain = NULL; /* Go synchronous */
112 get_page(page);
113 } else {
114 memcpy(page_address(page_copy),
115 page_address(page), PAGE_SIZE);
116 page = page_copy;
117 }
118 }
99 entry = swp_entry(root_swap, offset); 119 entry = swp_entry(root_swap, offset);
100 error = rw_swap_page_sync(WRITE, entry, virt_to_page(buf)); 120 error = rw_swap_page_sync(WRITE, entry, page, bio_chain);
101 } 121 }
102 return error; 122 return error;
103} 123}
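For asynchronous writes the patch copies the source page into a freshly allocated GFP_ATOMIC page before submitting it, so the source buffer can be reused for the next page while the I/O is still in flight; if the atomic allocation fails it takes a reference on the original page and forces the synchronous path instead. A small user-space sketch of that "copy if you can, otherwise block" decision follows; toy_write, prepare_write and the want_async flag are stand-ins, not the kernel API.

/* Userspace sketch of the write_page() fallback shown above: prefer a
 * private copy so the write can complete asynchronously, otherwise go
 * synchronous on the original buffer. All names here are illustrative.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define TOY_PAGE_SIZE 4096u

struct toy_write {
	void *buf;   /* what actually gets written */
	int async;   /* 1: caller may continue, 0: caller must wait */
};

static struct toy_write prepare_write(const void *src, int want_async)
{
	struct toy_write w = { .buf = (void *)src, .async = 0 };

	if (want_async) {
		void *copy = malloc(TOY_PAGE_SIZE); /* stands in for alloc_page(GFP_ATOMIC) */
		if (copy) {
			memcpy(copy, src, TOY_PAGE_SIZE);
			w.buf = copy;
			w.async = 1;
		}
		/* else: allocation failed, keep the original buffer and force
		 * the synchronous path, like the bio_chain = NULL fallback. */
	}
	return w;
}

int main(void)
{
	char page[TOY_PAGE_SIZE] = "image data";
	struct toy_write w = prepare_write(page, 1);

	printf("async=%d, using %s buffer\n", w.async,
	       w.buf == page ? "the original" : "a private copy");
	if (w.buf != page)
		free(w.buf);
	return 0;
}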
@@ -146,6 +166,26 @@ static void release_swap_writer(struct swap_map_handle *handle)
146 handle->bitmap = NULL; 166 handle->bitmap = NULL;
147} 167}
148 168
169static void show_speed(struct timeval *start, struct timeval *stop,
170 unsigned nr_pages, char *msg)
171{
172 s64 elapsed_centisecs64;
173 int centisecs;
174 int k;
175 int kps;
176
177 elapsed_centisecs64 = timeval_to_ns(stop) - timeval_to_ns(start);
178 do_div(elapsed_centisecs64, NSEC_PER_SEC / 100);
179 centisecs = elapsed_centisecs64;
180 if (centisecs == 0)
181 centisecs = 1; /* avoid div-by-zero */
182 k = nr_pages * (PAGE_SIZE / 1024);
183 kps = (k * 100) / centisecs;
184 printk("%s %d kbytes in %d.%02d seconds (%d.%02d MB/s)\n", msg, k,
185 centisecs / 100, centisecs % 100,
186 kps / 1000, (kps % 1000) / 10);
187}
188
149static int get_swap_writer(struct swap_map_handle *handle) 189static int get_swap_writer(struct swap_map_handle *handle)
150{ 190{
151 handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_KERNEL); 191 handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_KERNEL);
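show_speed() works in centiseconds to keep the arithmetic in integers: the elapsed time in nanoseconds is divided by NSEC_PER_SEC/100, the page count is converted to kbytes, and kps = k * 100 / centisecs. For example, 25600 pages of 4 KiB (100 MB) written in 12.80 s gives kps = 102400 * 100 / 1280 = 8000, printed as 8.00 MB/s. The same arithmetic as a standalone program, with made-up sample inputs:

/* The show_speed() arithmetic, extracted into a standalone program.
 * The sample inputs (25600 pages, 12.8 s) are invented for illustration.
 */
#include <stdio.h>
#include <stdint.h>

#define TOY_PAGE_SIZE 4096u
#define NSEC_PER_SEC  1000000000LL

int main(void)
{
	unsigned nr_pages = 25600;            /* hypothetical */
	int64_t elapsed_ns = 12800000000LL;   /* 12.8 s, hypothetical */

	int centisecs = (int)(elapsed_ns / (NSEC_PER_SEC / 100));
	if (centisecs == 0)
		centisecs = 1;                     /* avoid div-by-zero, as in the patch */
	int k = nr_pages * (TOY_PAGE_SIZE / 1024); /* kbytes transferred */
	int kps = (k * 100) / centisecs;           /* kbytes per second */

	printf("Wrote %d kbytes in %d.%02d seconds (%d.%02d MB/s)\n",
	       k, centisecs / 100, centisecs % 100, kps / 1000, (kps % 1000) / 10);
	return 0;
}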
@@ -165,37 +205,70 @@ static int get_swap_writer(struct swap_map_handle *handle)
165 return 0; 205 return 0;
166} 206}
167 207
168static int swap_write_page(struct swap_map_handle *handle, void *buf) 208static int wait_on_bio_chain(struct bio **bio_chain)
169{ 209{
170 int error; 210 struct bio *bio;
211 struct bio *next_bio;
212 int ret = 0;
213
214 if (bio_chain == NULL)
215 return 0;
216
217 bio = *bio_chain;
218 if (bio == NULL)
219 return 0;
220 while (bio) {
221 struct page *page;
222
223 next_bio = bio->bi_private;
224 page = bio->bi_io_vec[0].bv_page;
225 wait_on_page_locked(page);
226 if (!PageUptodate(page) || PageError(page))
227 ret = -EIO;
228 put_page(page);
229 bio_put(bio);
230 bio = next_bio;
231 }
232 *bio_chain = NULL;
233 return ret;
234}
235
236static int swap_write_page(struct swap_map_handle *handle, void *buf,
237 struct bio **bio_chain)
238{
239 int error = 0;
171 unsigned long offset; 240 unsigned long offset;
172 241
173 if (!handle->cur) 242 if (!handle->cur)
174 return -EINVAL; 243 return -EINVAL;
175 offset = alloc_swap_page(root_swap, handle->bitmap); 244 offset = alloc_swap_page(root_swap, handle->bitmap);
176 error = write_page(buf, offset); 245 error = write_page(buf, offset, bio_chain);
177 if (error) 246 if (error)
178 return error; 247 return error;
179 handle->cur->entries[handle->k++] = offset; 248 handle->cur->entries[handle->k++] = offset;
180 if (handle->k >= MAP_PAGE_ENTRIES) { 249 if (handle->k >= MAP_PAGE_ENTRIES) {
250 error = wait_on_bio_chain(bio_chain);
251 if (error)
252 goto out;
181 offset = alloc_swap_page(root_swap, handle->bitmap); 253 offset = alloc_swap_page(root_swap, handle->bitmap);
182 if (!offset) 254 if (!offset)
183 return -ENOSPC; 255 return -ENOSPC;
184 handle->cur->next_swap = offset; 256 handle->cur->next_swap = offset;
185 error = write_page(handle->cur, handle->cur_swap); 257 error = write_page(handle->cur, handle->cur_swap, NULL);
186 if (error) 258 if (error)
187 return error; 259 goto out;
188 memset(handle->cur, 0, PAGE_SIZE); 260 memset(handle->cur, 0, PAGE_SIZE);
189 handle->cur_swap = offset; 261 handle->cur_swap = offset;
190 handle->k = 0; 262 handle->k = 0;
191 } 263 }
192 return 0; 264out:
265 return error;
193} 266}
194 267
195static int flush_swap_writer(struct swap_map_handle *handle) 268static int flush_swap_writer(struct swap_map_handle *handle)
196{ 269{
197 if (handle->cur && handle->cur_swap) 270 if (handle->cur && handle->cur_swap)
198 return write_page(handle->cur, handle->cur_swap); 271 return write_page(handle->cur, handle->cur_swap, NULL);
199 else 272 else
200 return -EINVAL; 273 return -EINVAL;
201} 274}
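The "bio chain" is simply a singly linked list of in-flight requests threaded through bio->bi_private: each newly submitted bio points at the previous chain head, and wait_on_bio_chain() later walks the list, waits for each page, records any error and drops the references. The same pattern, modelled in user space with a toy request type instead of struct bio, looks like this:

/* Userspace model of the bio chaining/wait pattern shown above.
 * struct toy_req stands in for struct bio; "done"/"error" stand in for
 * the page up-to-date and error bits.
 */
#include <stdio.h>
#include <stdlib.h>

struct toy_req {
	struct toy_req *next;  /* plays the role of bio->bi_private */
	int done;
	int error;
};

/* Push a newly submitted request onto the chain head. */
static void chain_push(struct toy_req **chain, struct toy_req *req)
{
	req->next = *chain;
	*chain = req;
}

/* Walk the chain, "wait" for each request and keep the first error,
 * mirroring wait_on_bio_chain(). */
static int chain_wait(struct toy_req **chain)
{
	int ret = 0;
	struct toy_req *req = chain ? *chain : NULL;

	while (req) {
		struct toy_req *next = req->next;
		/* a real implementation would block here until req->done */
		if (!req->done || req->error)
			ret = -5;  /* -EIO */
		free(req);
		req = next;
	}
	if (chain)
		*chain = NULL;
	return ret;
}

int main(void)
{
	struct toy_req *chain = NULL;

	for (int i = 0; i < 3; i++) {
		struct toy_req *r = calloc(1, sizeof(*r));
		r->done = 1;
		chain_push(&chain, r);
	}
	printf("chain_wait() -> %d\n", chain_wait(&chain));
	return 0;
}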
@@ -206,21 +279,29 @@ static int flush_swap_writer(struct swap_map_handle *handle)
206 279
207static int save_image(struct swap_map_handle *handle, 280static int save_image(struct swap_map_handle *handle,
208 struct snapshot_handle *snapshot, 281 struct snapshot_handle *snapshot,
209 unsigned int nr_pages) 282 unsigned int nr_to_write)
210{ 283{
211 unsigned int m; 284 unsigned int m;
212 int ret; 285 int ret;
213 int error = 0; 286 int error = 0;
287 int nr_pages;
288 int err2;
289 struct bio *bio;
290 struct timeval start;
291 struct timeval stop;
214 292
215 printk("Saving image data pages (%u pages) ... ", nr_pages); 293 printk("Saving image data pages (%u pages) ... ", nr_to_write);
216 m = nr_pages / 100; 294 m = nr_to_write / 100;
217 if (!m) 295 if (!m)
218 m = 1; 296 m = 1;
219 nr_pages = 0; 297 nr_pages = 0;
298 bio = NULL;
299 do_gettimeofday(&start);
220 do { 300 do {
221 ret = snapshot_read_next(snapshot, PAGE_SIZE); 301 ret = snapshot_read_next(snapshot, PAGE_SIZE);
222 if (ret > 0) { 302 if (ret > 0) {
223 error = swap_write_page(handle, data_of(*snapshot)); 303 error = swap_write_page(handle, data_of(*snapshot),
304 &bio);
224 if (error) 305 if (error)
225 break; 306 break;
226 if (!(nr_pages % m)) 307 if (!(nr_pages % m))
@@ -228,8 +309,13 @@ static int save_image(struct swap_map_handle *handle,
228 nr_pages++; 309 nr_pages++;
229 } 310 }
230 } while (ret > 0); 311 } while (ret > 0);
312 err2 = wait_on_bio_chain(&bio);
313 do_gettimeofday(&stop);
314 if (!error)
315 error = err2;
231 if (!error) 316 if (!error)
232 printk("\b\b\b\bdone\n"); 317 printk("\b\b\b\bdone\n");
318 show_speed(&start, &stop, nr_to_write, "Wrote");
233 return error; 319 return error;
234} 320}
235 321
@@ -245,8 +331,7 @@ static int enough_swap(unsigned int nr_pages)
245 unsigned int free_swap = count_swap_pages(root_swap, 1); 331 unsigned int free_swap = count_swap_pages(root_swap, 1);
246 332
247 pr_debug("swsusp: free swap pages: %u\n", free_swap); 333 pr_debug("swsusp: free swap pages: %u\n", free_swap);
248 return free_swap > (nr_pages + PAGES_FOR_IO + 334 return free_swap > nr_pages + PAGES_FOR_IO;
249 (nr_pages + PBES_PER_PAGE - 1) / PBES_PER_PAGE);
250} 335}
251 336
252/** 337/**
@@ -263,11 +348,11 @@ int swsusp_write(void)
263 struct swap_map_handle handle; 348 struct swap_map_handle handle;
264 struct snapshot_handle snapshot; 349 struct snapshot_handle snapshot;
265 struct swsusp_info *header; 350 struct swsusp_info *header;
266 unsigned long start;
267 int error; 351 int error;
268 352
269 if ((error = swsusp_swap_check())) { 353 if ((error = swsusp_swap_check())) {
270 printk(KERN_ERR "swsusp: Cannot find swap device, try swapon -a.\n"); 354 printk(KERN_ERR "swsusp: Cannot find swap device, try "
355 "swapon -a.\n");
271 return error; 356 return error;
272 } 357 }
273 memset(&snapshot, 0, sizeof(struct snapshot_handle)); 358 memset(&snapshot, 0, sizeof(struct snapshot_handle));
@@ -281,16 +366,17 @@ int swsusp_write(void)
281 } 366 }
282 error = get_swap_writer(&handle); 367 error = get_swap_writer(&handle);
283 if (!error) { 368 if (!error) {
284 start = handle.cur_swap; 369 unsigned long start = handle.cur_swap;
285 error = swap_write_page(&handle, header); 370 error = swap_write_page(&handle, header, NULL);
286 } 371 if (!error)
287 if (!error) 372 error = save_image(&handle, &snapshot,
288 error = save_image(&handle, &snapshot, header->pages - 1); 373 header->pages - 1);
289 if (!error) { 374 if (!error) {
290 flush_swap_writer(&handle); 375 flush_swap_writer(&handle);
291 printk("S"); 376 printk("S");
292 error = mark_swapfiles(swp_entry(root_swap, start)); 377 error = mark_swapfiles(swp_entry(root_swap, start));
293 printk("|\n"); 378 printk("|\n");
379 }
294 } 380 }
295 if (error) 381 if (error)
296 free_all_swap_pages(root_swap, handle.bitmap); 382 free_all_swap_pages(root_swap, handle.bitmap);
@@ -298,25 +384,6 @@ int swsusp_write(void)
298 return error; 384 return error;
299} 385}
300 386
301/*
302 * Using bio to read from swap.
303 * This code requires a bit more work than just using buffer heads
304 * but, it is the recommended way for 2.5/2.6.
305 * The following are to signal the beginning and end of I/O. Bios
306 * finish asynchronously, while we want them to happen synchronously.
307 * A simple atomic_t, and a wait loop take care of this problem.
308 */
309
310static atomic_t io_done = ATOMIC_INIT(0);
311
312static int end_io(struct bio *bio, unsigned int num, int err)
313{
314 if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
315 panic("I/O error reading memory image");
316 atomic_set(&io_done, 0);
317 return 0;
318}
319
320static struct block_device *resume_bdev; 387static struct block_device *resume_bdev;
321 388
322/** 389/**
@@ -324,15 +391,15 @@ static struct block_device *resume_bdev;
324 * @rw: READ or WRITE. 391 * @rw: READ or WRITE.
325 * @off physical offset of page. 392 * @off physical offset of page.
326 * @page: page we're reading or writing. 393 * @page: page we're reading or writing.
 394 * @bio_chain: list of pending bios (for async reading)
327 * 395 *
328 * Straight from the textbook - allocate and initialize the bio. 396 * Straight from the textbook - allocate and initialize the bio.
329 * If we're writing, make sure the page is marked as dirty. 397 * If we're reading, make sure the page is marked as dirty.
330 * Then submit it and wait. 398 * Then submit it and, if @bio_chain == NULL, wait.
331 */ 399 */
332 400static int submit(int rw, pgoff_t page_off, struct page *page,
333static int submit(int rw, pgoff_t page_off, void *page) 401 struct bio **bio_chain)
334{ 402{
335 int error = 0;
336 struct bio *bio; 403 struct bio *bio;
337 404
338 bio = bio_alloc(GFP_ATOMIC, 1); 405 bio = bio_alloc(GFP_ATOMIC, 1);
@@ -340,33 +407,40 @@ static int submit(int rw, pgoff_t page_off, void *page)
340 return -ENOMEM; 407 return -ENOMEM;
341 bio->bi_sector = page_off * (PAGE_SIZE >> 9); 408 bio->bi_sector = page_off * (PAGE_SIZE >> 9);
342 bio->bi_bdev = resume_bdev; 409 bio->bi_bdev = resume_bdev;
343 bio->bi_end_io = end_io; 410 bio->bi_end_io = end_swap_bio_read;
344 411
345 if (bio_add_page(bio, virt_to_page(page), PAGE_SIZE, 0) < PAGE_SIZE) { 412 if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
346 printk("swsusp: ERROR: adding page to bio at %ld\n",page_off); 413 printk("swsusp: ERROR: adding page to bio at %ld\n", page_off);
347 error = -EFAULT; 414 bio_put(bio);
348 goto Done; 415 return -EFAULT;
349 } 416 }
350 417
351 atomic_set(&io_done, 1); 418 lock_page(page);
352 submit_bio(rw | (1 << BIO_RW_SYNC), bio); 419 bio_get(bio);
353 while (atomic_read(&io_done)) 420
354 yield(); 421 if (bio_chain == NULL) {
355 if (rw == READ) 422 submit_bio(rw | (1 << BIO_RW_SYNC), bio);
356 bio_set_pages_dirty(bio); 423 wait_on_page_locked(page);
357 Done: 424 if (rw == READ)
358 bio_put(bio); 425 bio_set_pages_dirty(bio);
359 return error; 426 bio_put(bio);
427 } else {
428 get_page(page);
429 bio->bi_private = *bio_chain;
430 *bio_chain = bio;
431 submit_bio(rw | (1 << BIO_RW_SYNC), bio);
432 }
433 return 0;
360} 434}
361 435
362static int bio_read_page(pgoff_t page_off, void *page) 436static int bio_read_page(pgoff_t page_off, void *addr, struct bio **bio_chain)
363{ 437{
364 return submit(READ, page_off, page); 438 return submit(READ, page_off, virt_to_page(addr), bio_chain);
365} 439}
366 440
367static int bio_write_page(pgoff_t page_off, void *page) 441static int bio_write_page(pgoff_t page_off, void *addr)
368{ 442{
369 return submit(WRITE, page_off, page); 443 return submit(WRITE, page_off, virt_to_page(addr), NULL);
370} 444}
371 445
372/** 446/**
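submit() now serves both callers from one place: the page is locked before the bio goes out, so the synchronous path can simply wait_on_page_locked() and then drop the bio, while the asynchronous path takes extra references on the page and the bio, links the bio into the chain, and leaves the final put_page()/bio_put() to wait_on_bio_chain(). In other words, whoever does the waiting holds the last pin. A user-space model of that ownership hand-off is below; toy_page and the helpers are invented for the example.

/* Userspace model of the reference hand-off in submit() above: the
 * synchronous path finishes immediately, the asynchronous path pins the
 * object and parks it on a chain for a later walker to release.
 */
#include <stdio.h>
#include <stdlib.h>

struct toy_page {
	struct toy_page *next;
	int refs;
};

static void toy_put(struct toy_page *p)
{
	if (--p->refs == 0) {
		printf("page released\n");
		free(p);
	}
}

static void toy_submit(struct toy_page *p, struct toy_page **chain)
{
	if (!chain) {
		/* synchronous path: the I/O is "waited for" right here,
		 * nothing outlives this call, so no extra pin is needed */
		printf("sync I/O complete\n");
	} else {
		/* asynchronous path: pin the page (get_page() in the patch)
		 * and park it on the chain; the chain walker drops the pin */
		p->refs++;
		p->next = *chain;
		*chain = p;
		printf("async I/O queued\n");
	}
}

static void toy_wait_chain(struct toy_page **chain)
{
	struct toy_page *p = *chain;

	while (p) {
		struct toy_page *next = p->next;
		toy_put(p);            /* undo the pin taken at submit time */
		p = next;
	}
	*chain = NULL;
}

int main(void)
{
	struct toy_page *chain = NULL;
	struct toy_page *a = calloc(1, sizeof(*a));
	struct toy_page *b = calloc(1, sizeof(*b));

	a->refs = b->refs = 1;         /* the caller's own references */
	toy_submit(a, NULL);           /* synchronous */
	toy_submit(b, &chain);         /* asynchronous */
	toy_wait_chain(&chain);        /* like the final wait_on_bio_chain() */
	toy_put(a);                    /* caller drops its references */
	toy_put(b);
	return 0;
}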
@@ -391,7 +465,7 @@ static int get_swap_reader(struct swap_map_handle *handle,
391 handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_ATOMIC); 465 handle->cur = (struct swap_map_page *)get_zeroed_page(GFP_ATOMIC);
392 if (!handle->cur) 466 if (!handle->cur)
393 return -ENOMEM; 467 return -ENOMEM;
394 error = bio_read_page(swp_offset(start), handle->cur); 468 error = bio_read_page(swp_offset(start), handle->cur, NULL);
395 if (error) { 469 if (error) {
396 release_swap_reader(handle); 470 release_swap_reader(handle);
397 return error; 471 return error;
@@ -400,7 +474,8 @@ static int get_swap_reader(struct swap_map_handle *handle,
400 return 0; 474 return 0;
401} 475}
402 476
403static int swap_read_page(struct swap_map_handle *handle, void *buf) 477static int swap_read_page(struct swap_map_handle *handle, void *buf,
478 struct bio **bio_chain)
404{ 479{
405 unsigned long offset; 480 unsigned long offset;
406 int error; 481 int error;
@@ -410,16 +485,17 @@ static int swap_read_page(struct swap_map_handle *handle, void *buf)
410 offset = handle->cur->entries[handle->k]; 485 offset = handle->cur->entries[handle->k];
411 if (!offset) 486 if (!offset)
412 return -EFAULT; 487 return -EFAULT;
413 error = bio_read_page(offset, buf); 488 error = bio_read_page(offset, buf, bio_chain);
414 if (error) 489 if (error)
415 return error; 490 return error;
416 if (++handle->k >= MAP_PAGE_ENTRIES) { 491 if (++handle->k >= MAP_PAGE_ENTRIES) {
492 error = wait_on_bio_chain(bio_chain);
417 handle->k = 0; 493 handle->k = 0;
418 offset = handle->cur->next_swap; 494 offset = handle->cur->next_swap;
419 if (!offset) 495 if (!offset)
420 release_swap_reader(handle); 496 release_swap_reader(handle);
421 else 497 else if (!error)
422 error = bio_read_page(offset, handle->cur); 498 error = bio_read_page(offset, handle->cur, NULL);
423 } 499 }
424 return error; 500 return error;
425} 501}
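swap_read_page() walks a two-level structure: each swap map page holds MAP_PAGE_ENTRIES offsets of data pages plus a next_swap link to the following map page, so the reader consumes entries until the current map page is exhausted, then drains the bio chain and loads the next map page synchronously. A user-space model of that traversal, with a fabricated map layout, is sketched below; toy_map_page and TOY_MAP_ENTRIES are invented names, not the on-disk format.

/* Userspace model of the swap-map traversal shown above. The map is a
 * linked list of pages, each carrying a fixed number of data-page
 * offsets plus a pointer to the next map page.
 */
#include <stdio.h>

#define TOY_MAP_ENTRIES 4   /* the real MAP_PAGE_ENTRIES is much larger */

struct toy_map_page {
	unsigned long entries[TOY_MAP_ENTRIES];  /* data-page offsets, 0 = unused */
	const struct toy_map_page *next;         /* next map page, NULL = end */
};

static void walk_map(const struct toy_map_page *cur)
{
	int k = 0;

	while (cur) {
		unsigned long off = cur->entries[k];
		if (!off)
			break;                   /* -EFAULT in the kernel version */
		printf("read data page at swap offset %lu\n", off);
		if (++k >= TOY_MAP_ENTRIES) {
			/* kernel: wait_on_bio_chain(), then read the next
			 * map page synchronously */
			k = 0;
			cur = cur->next;
		}
	}
}

int main(void)
{
	struct toy_map_page second = { { 9, 10, 0, 0 }, NULL };
	struct toy_map_page first  = { { 5, 6, 7, 8 }, &second };

	walk_map(&first);
	return 0;
}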
@@ -432,33 +508,49 @@ static int swap_read_page(struct swap_map_handle *handle, void *buf)
432 508
433static int load_image(struct swap_map_handle *handle, 509static int load_image(struct swap_map_handle *handle,
434 struct snapshot_handle *snapshot, 510 struct snapshot_handle *snapshot,
435 unsigned int nr_pages) 511 unsigned int nr_to_read)
436{ 512{
437 unsigned int m; 513 unsigned int m;
438 int ret;
439 int error = 0; 514 int error = 0;
515 struct timeval start;
516 struct timeval stop;
517 struct bio *bio;
518 int err2;
519 unsigned nr_pages;
440 520
441 printk("Loading image data pages (%u pages) ... ", nr_pages); 521 printk("Loading image data pages (%u pages) ... ", nr_to_read);
442 m = nr_pages / 100; 522 m = nr_to_read / 100;
443 if (!m) 523 if (!m)
444 m = 1; 524 m = 1;
445 nr_pages = 0; 525 nr_pages = 0;
446 do { 526 bio = NULL;
447 ret = snapshot_write_next(snapshot, PAGE_SIZE); 527 do_gettimeofday(&start);
448 if (ret > 0) { 528 for ( ; ; ) {
449 error = swap_read_page(handle, data_of(*snapshot)); 529 error = snapshot_write_next(snapshot, PAGE_SIZE);
450 if (error) 530 if (error <= 0)
451 break; 531 break;
452 if (!(nr_pages % m)) 532 error = swap_read_page(handle, data_of(*snapshot), &bio);
453 printk("\b\b\b\b%3d%%", nr_pages / m); 533 if (error)
454 nr_pages++; 534 break;
455 } 535 if (snapshot->sync_read)
456 } while (ret > 0); 536 error = wait_on_bio_chain(&bio);
537 if (error)
538 break;
539 if (!(nr_pages % m))
540 printk("\b\b\b\b%3d%%", nr_pages / m);
541 nr_pages++;
542 }
543 err2 = wait_on_bio_chain(&bio);
544 do_gettimeofday(&stop);
545 if (!error)
546 error = err2;
457 if (!error) { 547 if (!error) {
458 printk("\b\b\b\bdone\n"); 548 printk("\b\b\b\bdone\n");
549 snapshot_free_unused_memory(snapshot);
459 if (!snapshot_image_loaded(snapshot)) 550 if (!snapshot_image_loaded(snapshot))
460 error = -ENODATA; 551 error = -ENODATA;
461 } 552 }
553 show_speed(&start, &stop, nr_to_read, "Read");
462 return error; 554 return error;
463} 555}
464 556
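The rewritten load_image() loop overlaps reads with the restore work: each data page is queued on the bio chain, but whenever the snapshot layer reports that the page it just handed out must be read synchronously (snapshot->sync_read), the loop drains the chain before continuing, and it drains once more at the end, folding any late I/O error into the result. The control flow reduces to the skeleton below, in user space with stubbed I/O; next_chunk(), queue_read() and drain() are placeholders, not kernel interfaces.

/* Skeleton of the load_image() control flow above. The chunk source and
 * the I/O are stubbed out; only the loop shape matches the patch.
 */
#include <stdio.h>

struct chunk { int last; int sync_read; };

static int next_chunk(struct chunk *c, int i)
{
	c->last = (i >= 5);
	c->sync_read = (i % 3 == 0);   /* pretend every third chunk is synchronous */
	return c->last ? 0 : 1;        /* mirrors snapshot_write_next() <= 0 at the end */
}

static int queue_read(int i)  { printf("queued read %d\n", i); return 0; }
static int drain(void)        { printf("drained pending reads\n"); return 0; }

int main(void)
{
	struct chunk c;
	int i = 0, error = 0, err2;

	for (;;) {
		if (next_chunk(&c, i) <= 0)
			break;
		error = queue_read(i);
		if (error)
			break;
		if (c.sync_read)
			error = drain();   /* must finish before the next step */
		if (error)
			break;
		i++;
	}
	err2 = drain();                    /* final drain, like wait_on_bio_chain() */
	if (!error)
		error = err2;              /* keep the first error seen */
	printf("load finished with %d\n", error);
	return 0;
}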
@@ -481,7 +573,7 @@ int swsusp_read(void)
481 header = (struct swsusp_info *)data_of(snapshot); 573 header = (struct swsusp_info *)data_of(snapshot);
482 error = get_swap_reader(&handle, swsusp_header.image); 574 error = get_swap_reader(&handle, swsusp_header.image);
483 if (!error) 575 if (!error)
484 error = swap_read_page(&handle, header); 576 error = swap_read_page(&handle, header, NULL);
485 if (!error) 577 if (!error)
486 error = load_image(&handle, &snapshot, header->pages - 1); 578 error = load_image(&handle, &snapshot, header->pages - 1);
487 release_swap_reader(&handle); 579 release_swap_reader(&handle);
@@ -507,7 +599,7 @@ int swsusp_check(void)
507 if (!IS_ERR(resume_bdev)) { 599 if (!IS_ERR(resume_bdev)) {
508 set_blocksize(resume_bdev, PAGE_SIZE); 600 set_blocksize(resume_bdev, PAGE_SIZE);
509 memset(&swsusp_header, 0, sizeof(swsusp_header)); 601 memset(&swsusp_header, 0, sizeof(swsusp_header));
510 if ((error = bio_read_page(0, &swsusp_header))) 602 if ((error = bio_read_page(0, &swsusp_header, NULL)))
511 return error; 603 return error;
512 if (!memcmp(SWSUSP_SIG, swsusp_header.sig, 10)) { 604 if (!memcmp(SWSUSP_SIG, swsusp_header.sig, 10)) {
513 memcpy(swsusp_header.sig, swsusp_header.orig_sig, 10); 605 memcpy(swsusp_header.sig, swsusp_header.orig_sig, 10);
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index f0ee4e7780d6..0b66659dc516 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -62,6 +62,16 @@ unsigned long image_size = 500 * 1024 * 1024;
62 62
63int in_suspend __nosavedata = 0; 63int in_suspend __nosavedata = 0;
64 64
65#ifdef CONFIG_HIGHMEM
66unsigned int count_highmem_pages(void);
67int save_highmem(void);
68int restore_highmem(void);
69#else
70static inline int save_highmem(void) { return 0; }
71static inline int restore_highmem(void) { return 0; }
72static inline unsigned int count_highmem_pages(void) { return 0; }
73#endif
74
65/** 75/**
66 * The following functions are used for tracing the allocated 76 * The following functions are used for tracing the allocated
67 * swap pages, so that they can be freed in case of an error. 77 * swap pages, so that they can be freed in case of an error.
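The block above uses the usual kernel idiom for optional functionality: with CONFIG_HIGHMEM set, the real count_highmem_pages()/save_highmem()/restore_highmem() are declared, otherwise static inline no-op stubs keep the callers compiling with no #ifdefs at the call sites. A generic, self-contained illustration of the idiom, with a made-up CONFIG_FOO feature and foo_* functions, is:

/* Generic sketch of the "real declaration vs. static inline stub" idiom
 * used above. CONFIG_FOO, foo_save() and foo_restore() are invented;
 * compile with -DCONFIG_FOO to pick the real versions.
 */
#include <stdio.h>

#ifdef CONFIG_FOO
int foo_save(void);       /* real implementations live elsewhere */
int foo_restore(void);
#else
static inline int foo_save(void)    { return 0; }  /* feature absent: no-op */
static inline int foo_restore(void) { return 0; }
#endif

#ifdef CONFIG_FOO
/* For this standalone example only: trivial "real" bodies. */
int foo_save(void)    { printf("saving foo state\n");    return 0; }
int foo_restore(void) { printf("restoring foo state\n"); return 0; }
#endif

int main(void)
{
	/* Call sites stay identical either way. */
	if (foo_save() == 0 && foo_restore() == 0)
		printf("done\n");
	return 0;
}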
@@ -182,15 +192,14 @@ int swsusp_shrink_memory(void)
182 192
183 printk("Shrinking memory... "); 193 printk("Shrinking memory... ");
184 do { 194 do {
185 size = 2 * count_special_pages(); 195 size = 2 * count_highmem_pages();
186 size += size / 50 + count_data_pages(); 196 size += size / 50 + count_data_pages() + PAGES_FOR_IO;
187 size += (size + PBES_PER_PAGE - 1) / PBES_PER_PAGE +
188 PAGES_FOR_IO;
189 tmp = size; 197 tmp = size;
190 for_each_zone (zone) 198 for_each_zone (zone)
191 if (!is_highmem(zone) && populated_zone(zone)) { 199 if (!is_highmem(zone) && populated_zone(zone)) {
192 tmp -= zone->free_pages; 200 tmp -= zone->free_pages;
193 tmp += zone->lowmem_reserve[ZONE_NORMAL]; 201 tmp += zone->lowmem_reserve[ZONE_NORMAL];
202 tmp += snapshot_additional_pages(zone);
194 } 203 }
195 if (tmp > 0) { 204 if (tmp > 0) {
196 tmp = __shrink_memory(tmp); 205 tmp = __shrink_memory(tmp);
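The new target computed in swsusp_shrink_memory() is: twice the highmem page count, plus 2% slack (size/50), plus the saveable lowmem data pages and PAGES_FOR_IO; from that it subtracts what each non-highmem zone can already supply (free pages minus the lowmem reserve, minus the extra bookkeeping pages snapshot_additional_pages() will need), and only a positive remainder is reclaimed. A worked sketch of the arithmetic with invented page counts:

/* Worked example of the swsusp_shrink_memory() estimate above, with
 * invented numbers; this is arithmetic only, not the kernel API.
 */
#include <stdio.h>

int main(void)
{
	long highmem_pages  = 20000;  /* hypothetical */
	long data_pages     = 60000;  /* hypothetical saveable lowmem pages */
	long pages_for_io   = 1024;   /* stand-in for PAGES_FOR_IO */
	long zone_free      = 30000;  /* hypothetical free lowmem pages */
	long lowmem_reserve = 2000;
	long additional     = 500;    /* stand-in for snapshot_additional_pages() */

	long size = 2 * highmem_pages;
	size += size / 50 + data_pages + pages_for_io;

	long tmp = size;
	tmp -= zone_free;
	tmp += lowmem_reserve;
	tmp += additional;

	printf("target %ld pages, still need to free %ld pages\n",
	       size, tmp > 0 ? tmp : 0);
	return 0;
}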
@@ -226,7 +235,7 @@ int swsusp_suspend(void)
226 goto Enable_irqs; 235 goto Enable_irqs;
227 } 236 }
228 237
229 if ((error = save_special_mem())) { 238 if ((error = save_highmem())) {
230 printk(KERN_ERR "swsusp: Not enough free pages for highmem\n"); 239 printk(KERN_ERR "swsusp: Not enough free pages for highmem\n");
231 goto Restore_highmem; 240 goto Restore_highmem;
232 } 241 }
@@ -237,7 +246,10 @@ int swsusp_suspend(void)
237 /* Restore control flow magically appears here */ 246 /* Restore control flow magically appears here */
238 restore_processor_state(); 247 restore_processor_state();
239Restore_highmem: 248Restore_highmem:
240 restore_special_mem(); 249 restore_highmem();
250 /* NOTE: device_power_up() is just a resume() for devices
251 * that suspended with irqs off ... no overall powerup.
252 */
241 device_power_up(); 253 device_power_up();
242Enable_irqs: 254Enable_irqs:
243 local_irq_enable(); 255 local_irq_enable();
@@ -247,8 +259,12 @@ Enable_irqs:
247int swsusp_resume(void) 259int swsusp_resume(void)
248{ 260{
249 int error; 261 int error;
262
250 local_irq_disable(); 263 local_irq_disable();
251 if (device_power_down(PMSG_FREEZE)) 264 /* NOTE: device_power_down() is just a suspend() with irqs off;
265 * it has no special "power things down" semantics
266 */
267 if (device_power_down(PMSG_PRETHAW))
252 printk(KERN_ERR "Some devices failed to power down, very bad\n"); 268 printk(KERN_ERR "Some devices failed to power down, very bad\n");
253 /* We'll ignore saved state, but this gets preempt count (etc) right */ 269 /* We'll ignore saved state, but this gets preempt count (etc) right */
254 save_processor_state(); 270 save_processor_state();
@@ -263,7 +279,7 @@ int swsusp_resume(void)
263 */ 279 */
264 swsusp_free(); 280 swsusp_free();
265 restore_processor_state(); 281 restore_processor_state();
266 restore_special_mem(); 282 restore_highmem();
267 touch_softlockup_watchdog(); 283 touch_softlockup_watchdog();
268 device_power_up(); 284 device_power_up();
269 local_irq_enable(); 285 local_irq_enable();
diff --git a/kernel/power/user.c b/kernel/power/user.c
index 3f1539fbe48a..72825c853cd7 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -19,6 +19,7 @@
19#include <linux/swapops.h> 19#include <linux/swapops.h>
20#include <linux/pm.h> 20#include <linux/pm.h>
21#include <linux/fs.h> 21#include <linux/fs.h>
22#include <linux/cpu.h>
22 23
23#include <asm/uaccess.h> 24#include <asm/uaccess.h>
24 25
@@ -139,12 +140,15 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
139 if (data->frozen) 140 if (data->frozen)
140 break; 141 break;
141 down(&pm_sem); 142 down(&pm_sem);
142 disable_nonboot_cpus(); 143 error = disable_nonboot_cpus();
143 if (freeze_processes()) { 144 if (!error) {
144 thaw_processes(); 145 error = freeze_processes();
145 enable_nonboot_cpus(); 146 if (error) {
146 error = -EBUSY; 147 thaw_processes();
148 error = -EBUSY;
149 }
147 } 150 }
151 enable_nonboot_cpus();
148 up(&pm_sem); 152 up(&pm_sem);
149 if (!error) 153 if (!error)
150 data->frozen = 1; 154 data->frozen = 1;
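The reworked SNAPSHOT_FREEZE path undoes things in the right order: disable_nonboot_cpus() can now fail, freeze_processes() is attempted only if it succeeded, a failed freeze is rolled back with thaw_processes(), and enable_nonboot_cpus() runs unconditionally on the way out so the CPUs are never left offline. The same unwind shape, in a minimal standalone form with stubbed steps (the function names below are stand-ins):

/* Minimal sketch of the error-unwinding order used above. The step
 * functions are stubs; flip their return values to see the rollback.
 */
#include <stdio.h>

static int disable_cpus(void)  { printf("cpus down\n");    return 0; }
static void enable_cpus(void)  { printf("cpus up\n"); }
static int freeze_tasks(void)  { printf("tasks frozen\n"); return 0; }
static void thaw_tasks(void)   { printf("tasks thawed\n"); }

static int do_freeze(void)
{
	int error = disable_cpus();

	if (!error) {
		error = freeze_tasks();
		if (error) {
			thaw_tasks();      /* roll back the partial freeze */
			error = -16;       /* -EBUSY, as in the patch */
		}
	}
	enable_cpus();                     /* always re-enable, success or not */
	return error;
}

int main(void)
{
	printf("do_freeze() -> %d\n", do_freeze());
	return 0;
}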
@@ -189,9 +193,10 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
189 error = -EPERM; 193 error = -EPERM;
190 break; 194 break;
191 } 195 }
196 snapshot_free_unused_memory(&data->handle);
192 down(&pm_sem); 197 down(&pm_sem);
193 pm_prepare_console(); 198 pm_prepare_console();
194 error = device_suspend(PMSG_FREEZE); 199 error = device_suspend(PMSG_PRETHAW);
195 if (!error) { 200 if (!error) {
196 error = swsusp_resume(); 201 error = swsusp_resume();
197 device_resume(); 202 device_resume();
diff --git a/kernel/printk.c b/kernel/printk.c
index 19a955619294..771f5e861bcd 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -24,8 +24,8 @@
24#include <linux/console.h> 24#include <linux/console.h>
25#include <linux/init.h> 25#include <linux/init.h>
26#include <linux/module.h> 26#include <linux/module.h>
27#include <linux/moduleparam.h>
27#include <linux/interrupt.h> /* For in_interrupt() */ 28#include <linux/interrupt.h> /* For in_interrupt() */
28#include <linux/config.h>
29#include <linux/delay.h> 29#include <linux/delay.h>
30#include <linux/smp.h> 30#include <linux/smp.h>
31#include <linux/security.h> 31#include <linux/security.h>
@@ -52,7 +52,7 @@ int console_printk[4] = {
52 DEFAULT_CONSOLE_LOGLEVEL, /* default_console_loglevel */ 52 DEFAULT_CONSOLE_LOGLEVEL, /* default_console_loglevel */
53}; 53};
54 54
55EXPORT_SYMBOL(console_printk); 55EXPORT_UNUSED_SYMBOL(console_printk); /* June 2006 */
56 56
57/* 57/*
58 * Low lever drivers may need that to know if they can schedule in 58 * Low lever drivers may need that to know if they can schedule in
@@ -327,7 +327,9 @@ static void __call_console_drivers(unsigned long start, unsigned long end)
327 struct console *con; 327 struct console *con;
328 328
329 for (con = console_drivers; con; con = con->next) { 329 for (con = console_drivers; con; con = con->next) {
330 if ((con->flags & CON_ENABLED) && con->write) 330 if ((con->flags & CON_ENABLED) && con->write &&
331 (cpu_online(smp_processor_id()) ||
332 (con->flags & CON_ANYTIME)))
331 con->write(con, &LOG_BUF(start), end - start); 333 con->write(con, &LOG_BUF(start), end - start);
332 } 334 }
333} 335}
@@ -437,6 +439,7 @@ static int printk_time = 1;
437#else 439#else
438static int printk_time = 0; 440static int printk_time = 0;
439#endif 441#endif
442module_param(printk_time, int, S_IRUGO | S_IWUSR);
440 443
441static int __init printk_time_setup(char *str) 444static int __init printk_time_setup(char *str)
442{ 445{
@@ -453,6 +456,18 @@ __attribute__((weak)) unsigned long long printk_clock(void)
453 return sched_clock(); 456 return sched_clock();
454} 457}
455 458
459/* Check if we have any console registered that can be called early in boot. */
460static int have_callable_console(void)
461{
462 struct console *con;
463
464 for (con = console_drivers; con; con = con->next)
465 if (con->flags & CON_ANYTIME)
466 return 1;
467
468 return 0;
469}
470
456/** 471/**
457 * printk - print a kernel message 472 * printk - print a kernel message
458 * @fmt: format string 473 * @fmt: format string
@@ -503,7 +518,9 @@ asmlinkage int vprintk(const char *fmt, va_list args)
503 zap_locks(); 518 zap_locks();
504 519
505 /* This stops the holder of console_sem just where we want him */ 520 /* This stops the holder of console_sem just where we want him */
506 spin_lock_irqsave(&logbuf_lock, flags); 521 local_irq_save(flags);
522 lockdep_off();
523 spin_lock(&logbuf_lock);
507 printk_cpu = smp_processor_id(); 524 printk_cpu = smp_processor_id();
508 525
509 /* Emit the output into the temporary buffer */ 526 /* Emit the output into the temporary buffer */
@@ -566,27 +583,31 @@ asmlinkage int vprintk(const char *fmt, va_list args)
566 log_level_unknown = 1; 583 log_level_unknown = 1;
567 } 584 }
568 585
569 if (!cpu_online(smp_processor_id())) { 586 if (!down_trylock(&console_sem)) {
570 /* 587 /*
571 * Some console drivers may assume that per-cpu resources have 588 * We own the drivers. We can drop the spinlock and
572 * been allocated. So don't allow them to be called by this 589 * let release_console_sem() print the text, maybe ...
573 * CPU until it is officially up. We shouldn't be calling into
574 * random console drivers on a CPU which doesn't exist yet..
575 */ 590 */
576 printk_cpu = UINT_MAX;
577 spin_unlock_irqrestore(&logbuf_lock, flags);
578 goto out;
579 }
580 if (!down_trylock(&console_sem)) {
581 console_locked = 1; 591 console_locked = 1;
592 printk_cpu = UINT_MAX;
593 spin_unlock(&logbuf_lock);
594
582 /* 595 /*
583 * We own the drivers. We can drop the spinlock and let 596 * Console drivers may assume that per-cpu resources have
584 * release_console_sem() print the text 597 * been allocated. So unless they're explicitly marked as
598 * being able to cope (CON_ANYTIME) don't call them until
599 * this CPU is officially up.
585 */ 600 */
586 printk_cpu = UINT_MAX; 601 if (cpu_online(smp_processor_id()) || have_callable_console()) {
587 spin_unlock_irqrestore(&logbuf_lock, flags); 602 console_may_schedule = 0;
588 console_may_schedule = 0; 603 release_console_sem();
589 release_console_sem(); 604 } else {
605 /* Release by hand to avoid flushing the buffer. */
606 console_locked = 0;
607 up(&console_sem);
608 }
609 lockdep_on();
610 local_irq_restore(flags);
590 } else { 611 } else {
591 /* 612 /*
592 * Someone else owns the drivers. We drop the spinlock, which 613 * Someone else owns the drivers. We drop the spinlock, which
@@ -594,9 +615,11 @@ asmlinkage int vprintk(const char *fmt, va_list args)
594 * console drivers with the output which we just produced. 615 * console drivers with the output which we just produced.
595 */ 616 */
596 printk_cpu = UINT_MAX; 617 printk_cpu = UINT_MAX;
597 spin_unlock_irqrestore(&logbuf_lock, flags); 618 spin_unlock(&logbuf_lock);
619 lockdep_on();
620 local_irq_restore(flags);
598 } 621 }
599out: 622
600 preempt_enable(); 623 preempt_enable();
601 return printed_len; 624 return printed_len;
602} 625}
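Taken together, the printk changes make the flush decision explicit: the log buffer is always filled under logbuf_lock (with interrupts off and lockdep disabled around the lock), then the CPU tries down_trylock(&console_sem); if it gets the semaphore it calls the console drivers only when this CPU is already online or some registered console is marked CON_ANYTIME, otherwise it releases the semaphore by hand without flushing and leaves the text in the buffer. A compact user-space model of that decision, with the locking and console state faked, is:

/* Userspace model of the "may this CPU flush the consoles now?" decision
 * described above. Everything here is faked; got_sem stands in for a
 * successful down_trylock(&console_sem).
 */
#include <stdio.h>

struct toy_console { int enabled; int anytime; };

static struct toy_console consoles[] = {
	{ .enabled = 1, .anytime = 0 },
	{ .enabled = 1, .anytime = 1 },   /* e.g. an early/boot console */
};

/* Mirrors have_callable_console(): CON_ANYTIME only; CON_ENABLED is
 * checked again for each write in __call_console_drivers(). */
static int have_callable_console(void)
{
	for (unsigned i = 0; i < sizeof(consoles) / sizeof(consoles[0]); i++)
		if (consoles[i].anytime)
			return 1;
	return 0;
}

static const char *printk_action(int got_sem, int cpu_online)
{
	if (!got_sem)
		return "buffered; the current console owner will flush it";
	if (cpu_online || have_callable_console())
		return "flush consoles now";
	return "release semaphore without flushing (CPU not up yet)";
}

int main(void)
{
	printf("%s\n", printk_action(1, 0));
	printf("%s\n", printk_action(1, 1));
	printf("%s\n", printk_action(0, 1));
	return 0;
}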
@@ -698,6 +721,7 @@ int __init add_preferred_console(char *name, int idx, char *options)
698 return 0; 721 return 0;
699} 722}
700 723
724#ifndef CONFIG_DISABLE_CONSOLE_SUSPEND
701/** 725/**
702 * suspend_console - suspend the console subsystem 726 * suspend_console - suspend the console subsystem
703 * 727 *
@@ -705,6 +729,7 @@ int __init add_preferred_console(char *name, int idx, char *options)
705 */ 729 */
706void suspend_console(void) 730void suspend_console(void)
707{ 731{
732 printk("Suspending console(s)\n");
708 acquire_console_sem(); 733 acquire_console_sem();
709 console_suspended = 1; 734 console_suspended = 1;
710} 735}
@@ -714,6 +739,7 @@ void resume_console(void)
714 console_suspended = 0; 739 console_suspended = 0;
715 release_console_sem(); 740 release_console_sem();
716} 741}
742#endif /* CONFIG_DISABLE_CONSOLE_SUSPEND */
717 743
718/** 744/**
719 * acquire_console_sem - lock the console system for exclusive use. 745 * acquire_console_sem - lock the console system for exclusive use.
@@ -750,7 +776,7 @@ int is_console_locked(void)
750{ 776{
751 return console_locked; 777 return console_locked;
752} 778}
753EXPORT_SYMBOL(is_console_locked); 779EXPORT_UNUSED_SYMBOL(is_console_locked); /* June 2006 */
754 780
755/** 781/**
756 * release_console_sem - unlock the console system 782 * release_console_sem - unlock the console system
@@ -776,6 +802,9 @@ void release_console_sem(void)
776 up(&secondary_console_sem); 802 up(&secondary_console_sem);
777 return; 803 return;
778 } 804 }
805
806 console_may_schedule = 0;
807
779 for ( ; ; ) { 808 for ( ; ; ) {
780 spin_lock_irqsave(&logbuf_lock, flags); 809 spin_lock_irqsave(&logbuf_lock, flags);
781 wake_klogd |= log_start - log_end; 810 wake_klogd |= log_start - log_end;
@@ -789,11 +818,17 @@ void release_console_sem(void)
789 local_irq_restore(flags); 818 local_irq_restore(flags);
790 } 819 }
791 console_locked = 0; 820 console_locked = 0;
792 console_may_schedule = 0;
793 up(&console_sem); 821 up(&console_sem);
794 spin_unlock_irqrestore(&logbuf_lock, flags); 822 spin_unlock_irqrestore(&logbuf_lock, flags);
795 if (wake_klogd && !oops_in_progress && waitqueue_active(&log_wait)) 823 if (wake_klogd && !oops_in_progress && waitqueue_active(&log_wait)) {
796 wake_up_interruptible(&log_wait); 824 /*
825 * If we printk from within the lock dependency code,
826 * from within the scheduler code, then do not lock
827 * up due to self-recursion:
828 */
829 if (!lockdep_internal())
830 wake_up_interruptible(&log_wait);
831 }
797} 832}
798EXPORT_SYMBOL(release_console_sem); 833EXPORT_SYMBOL(release_console_sem);
799 834
diff --git a/kernel/profile.c b/kernel/profile.c
index 68afe121e507..fb660c7d35ba 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -13,7 +13,6 @@
13 * to resolve timer interrupt livelocks, William Irwin, Oracle, 2004 13 * to resolve timer interrupt livelocks, William Irwin, Oracle, 2004
14 */ 14 */
15 15
16#include <linux/config.h>
17#include <linux/module.h> 16#include <linux/module.h>
18#include <linux/profile.h> 17#include <linux/profile.h>
19#include <linux/bootmem.h> 18#include <linux/bootmem.h>
@@ -299,7 +298,7 @@ out:
299} 298}
300 299
301#ifdef CONFIG_HOTPLUG_CPU 300#ifdef CONFIG_HOTPLUG_CPU
302static int profile_cpu_callback(struct notifier_block *info, 301static int __devinit profile_cpu_callback(struct notifier_block *info,
303 unsigned long action, void *__cpu) 302 unsigned long action, void *__cpu)
304{ 303{
305 int node, cpu = (unsigned long)__cpu; 304 int node, cpu = (unsigned long)__cpu;
@@ -310,13 +309,17 @@ static int profile_cpu_callback(struct notifier_block *info,
310 node = cpu_to_node(cpu); 309 node = cpu_to_node(cpu);
311 per_cpu(cpu_profile_flip, cpu) = 0; 310 per_cpu(cpu_profile_flip, cpu) = 0;
312 if (!per_cpu(cpu_profile_hits, cpu)[1]) { 311 if (!per_cpu(cpu_profile_hits, cpu)[1]) {
313 page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0); 312 page = alloc_pages_node(node,
313 GFP_KERNEL | __GFP_ZERO | GFP_THISNODE,
314 0);
314 if (!page) 315 if (!page)
315 return NOTIFY_BAD; 316 return NOTIFY_BAD;
316 per_cpu(cpu_profile_hits, cpu)[1] = page_address(page); 317 per_cpu(cpu_profile_hits, cpu)[1] = page_address(page);
317 } 318 }
318 if (!per_cpu(cpu_profile_hits, cpu)[0]) { 319 if (!per_cpu(cpu_profile_hits, cpu)[0]) {
319 page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0); 320 page = alloc_pages_node(node,
321 GFP_KERNEL | __GFP_ZERO | GFP_THISNODE,
322 0);
320 if (!page) 323 if (!page)
321 goto out_free; 324 goto out_free;
322 per_cpu(cpu_profile_hits, cpu)[0] = page_address(page); 325 per_cpu(cpu_profile_hits, cpu)[0] = page_address(page);
@@ -492,12 +495,16 @@ static int __init create_hash_tables(void)
492 int node = cpu_to_node(cpu); 495 int node = cpu_to_node(cpu);
493 struct page *page; 496 struct page *page;
494 497
495 page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0); 498 page = alloc_pages_node(node,
499 GFP_KERNEL | __GFP_ZERO | GFP_THISNODE,
500 0);
496 if (!page) 501 if (!page)
497 goto out_cleanup; 502 goto out_cleanup;
498 per_cpu(cpu_profile_hits, cpu)[1] 503 per_cpu(cpu_profile_hits, cpu)[1]
499 = (struct profile_hit *)page_address(page); 504 = (struct profile_hit *)page_address(page);
500 page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0); 505 page = alloc_pages_node(node,
506 GFP_KERNEL | __GFP_ZERO | GFP_THISNODE,
507 0);
501 if (!page) 508 if (!page)
502 goto out_cleanup; 509 goto out_cleanup;
503 per_cpu(cpu_profile_hits, cpu)[0] 510 per_cpu(cpu_profile_hits, cpu)[0]
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 921c22ad16e4..4d50e06fd745 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -28,7 +28,7 @@
28 * 28 *
29 * Must be called with the tasklist lock write-held. 29 * Must be called with the tasklist lock write-held.
30 */ 30 */
31void __ptrace_link(task_t *child, task_t *new_parent) 31void __ptrace_link(struct task_struct *child, struct task_struct *new_parent)
32{ 32{
33 BUG_ON(!list_empty(&child->ptrace_list)); 33 BUG_ON(!list_empty(&child->ptrace_list));
34 if (child->parent == new_parent) 34 if (child->parent == new_parent)
@@ -46,7 +46,7 @@ void __ptrace_link(task_t *child, task_t *new_parent)
46 * TASK_TRACED, resume it now. 46 * TASK_TRACED, resume it now.
47 * Requires that irqs be disabled. 47 * Requires that irqs be disabled.
48 */ 48 */
49void ptrace_untrace(task_t *child) 49void ptrace_untrace(struct task_struct *child)
50{ 50{
51 spin_lock(&child->sighand->siglock); 51 spin_lock(&child->sighand->siglock);
52 if (child->state == TASK_TRACED) { 52 if (child->state == TASK_TRACED) {
@@ -65,7 +65,7 @@ void ptrace_untrace(task_t *child)
65 * 65 *
66 * Must be called with the tasklist lock write-held. 66 * Must be called with the tasklist lock write-held.
67 */ 67 */
68void __ptrace_unlink(task_t *child) 68void __ptrace_unlink(struct task_struct *child)
69{ 69{
70 BUG_ON(!child->ptrace); 70 BUG_ON(!child->ptrace);
71 71
@@ -120,8 +120,18 @@ int ptrace_check_attach(struct task_struct *child, int kill)
120 120
121static int may_attach(struct task_struct *task) 121static int may_attach(struct task_struct *task)
122{ 122{
123 if (!task->mm) 123 /* May we inspect the given task?
124 return -EPERM; 124 * This check is used both for attaching with ptrace
125 * and for allowing access to sensitive information in /proc.
126 *
127 * ptrace_attach denies several cases that /proc allows
128 * because setting up the necessary parent/child relationship
129 * or halting the specified task is impossible.
130 */
131 int dumpable = 0;
132 /* Don't let security modules deny introspection */
133 if (task == current)
134 return 0;
125 if (((current->uid != task->euid) || 135 if (((current->uid != task->euid) ||
126 (current->uid != task->suid) || 136 (current->uid != task->suid) ||
127 (current->uid != task->uid) || 137 (current->uid != task->uid) ||
@@ -130,7 +140,9 @@ static int may_attach(struct task_struct *task)
130 (current->gid != task->gid)) && !capable(CAP_SYS_PTRACE)) 140 (current->gid != task->gid)) && !capable(CAP_SYS_PTRACE))
131 return -EPERM; 141 return -EPERM;
132 smp_rmb(); 142 smp_rmb();
133 if (!task->mm->dumpable && !capable(CAP_SYS_PTRACE)) 143 if (task->mm)
144 dumpable = task->mm->dumpable;
145 if (!dumpable && !capable(CAP_SYS_PTRACE))
134 return -EPERM; 146 return -EPERM;
135 147
136 return security_ptrace(current, task); 148 return security_ptrace(current, task);
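may_attach() now doubles as the generic "may we inspect this task?" check: a task may always introspect itself, the uid/gid comparison (or CAP_SYS_PTRACE) gates cross-user access, and a missing mm is treated as non-dumpable rather than an outright error, so kernel threads stay visible to /proc while ptrace_attach() refuses them separately with its new !task->mm check. A condensed user-space version of that ordering, with simplified task fields and a stubbed capability check:

/* Condensed userspace model of the may_attach() ordering above. The
 * fields and the capability check are simplified stand-ins.
 */
#include <stdio.h>
#include <stdbool.h>

struct toy_task {
	unsigned uid;
	bool has_mm;     /* kernel threads have no mm */
	bool dumpable;
};

static bool capable_sys_ptrace(void) { return false; }  /* pretend unprivileged */

static int may_inspect(const struct toy_task *cur, const struct toy_task *task)
{
	if (task == cur)
		return 0;                         /* always allow self-inspection */
	if (cur->uid != task->uid && !capable_sys_ptrace())
		return -1;                        /* -EPERM: different user */
	bool dumpable = task->has_mm ? task->dumpable : false;
	if (!dumpable && !capable_sys_ptrace())
		return -1;                        /* -EPERM: protected or mm-less task */
	return 0;                                 /* security_ptrace() would run here */
}

int main(void)
{
	struct toy_task me    = { .uid = 1000, .has_mm = true,  .dumpable = true };
	struct toy_task other = { .uid = 1000, .has_mm = true,  .dumpable = false };
	struct toy_task kthr  = { .uid = 0,    .has_mm = false, .dumpable = false };

	printf("self: %d, non-dumpable peer: %d, kernel thread: %d\n",
	       may_inspect(&me, &me), may_inspect(&me, &other), may_inspect(&me, &kthr));
	return 0;
}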
@@ -176,6 +188,8 @@ repeat:
176 goto repeat; 188 goto repeat;
177 } 189 }
178 190
191 if (!task->mm)
192 goto bad;
179 /* the same process cannot be attached many times */ 193 /* the same process cannot be attached many times */
180 if (task->ptrace & PT_PTRACED) 194 if (task->ptrace & PT_PTRACED)
181 goto bad; 195 goto bad;
@@ -200,7 +214,7 @@ out:
200 return retval; 214 return retval;
201} 215}
202 216
203void __ptrace_detach(struct task_struct *child, unsigned int data) 217static inline void __ptrace_detach(struct task_struct *child, unsigned int data)
204{ 218{
205 child->exit_code = data; 219 child->exit_code = data;
206 /* .. re-parent .. */ 220 /* .. re-parent .. */
@@ -219,6 +233,7 @@ int ptrace_detach(struct task_struct *child, unsigned int data)
219 ptrace_disable(child); 233 ptrace_disable(child);
220 234
221 write_lock_irq(&tasklist_lock); 235 write_lock_irq(&tasklist_lock);
236 /* protect against de_thread()->release_task() */
222 if (child->ptrace) 237 if (child->ptrace)
223 __ptrace_detach(child, data); 238 __ptrace_detach(child, data);
224 write_unlock_irq(&tasklist_lock); 239 write_unlock_irq(&tasklist_lock);
@@ -226,60 +241,6 @@ int ptrace_detach(struct task_struct *child, unsigned int data)
226 return 0; 241 return 0;
227} 242}
228 243
229/*
230 * Access another process' address space.
231 * Source/target buffer must be kernel space,
232 * Do not walk the page table directly, use get_user_pages
233 */
234
235int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write)
236{
237 struct mm_struct *mm;
238 struct vm_area_struct *vma;
239 struct page *page;
240 void *old_buf = buf;
241
242 mm = get_task_mm(tsk);
243 if (!mm)
244 return 0;
245
246 down_read(&mm->mmap_sem);
247 /* ignore errors, just check how much was sucessfully transfered */
248 while (len) {
249 int bytes, ret, offset;
250 void *maddr;
251
252 ret = get_user_pages(tsk, mm, addr, 1,
253 write, 1, &page, &vma);
254 if (ret <= 0)
255 break;
256
257 bytes = len;
258 offset = addr & (PAGE_SIZE-1);
259 if (bytes > PAGE_SIZE-offset)
260 bytes = PAGE_SIZE-offset;
261
262 maddr = kmap(page);
263 if (write) {
264 copy_to_user_page(vma, page, addr,
265 maddr + offset, buf, bytes);
266 set_page_dirty_lock(page);
267 } else {
268 copy_from_user_page(vma, page, addr,
269 buf, maddr + offset, bytes);
270 }
271 kunmap(page);
272 page_cache_release(page);
273 len -= bytes;
274 buf += bytes;
275 addr += bytes;
276 }
277 up_read(&mm->mmap_sem);
278 mmput(mm);
279
280 return buf - old_buf;
281}
282
283int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst, int len) 244int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst, int len)
284{ 245{
285 int copied = 0; 246 int copied = 0;
@@ -479,6 +440,7 @@ struct task_struct *ptrace_get_task_struct(pid_t pid)
479 child = find_task_by_pid(pid); 440 child = find_task_by_pid(pid);
480 if (child) 441 if (child)
481 get_task_struct(child); 442 get_task_struct(child);
443
482 read_unlock(&tasklist_lock); 444 read_unlock(&tasklist_lock);
483 if (!child) 445 if (!child)
484 return ERR_PTR(-ESRCH); 446 return ERR_PTR(-ESRCH);
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 20e9710fc21c..523e46483b99 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -53,13 +53,13 @@
53static struct rcu_ctrlblk rcu_ctrlblk = { 53static struct rcu_ctrlblk rcu_ctrlblk = {
54 .cur = -300, 54 .cur = -300,
55 .completed = -300, 55 .completed = -300,
56 .lock = SPIN_LOCK_UNLOCKED, 56 .lock = __SPIN_LOCK_UNLOCKED(&rcu_ctrlblk.lock),
57 .cpumask = CPU_MASK_NONE, 57 .cpumask = CPU_MASK_NONE,
58}; 58};
59static struct rcu_ctrlblk rcu_bh_ctrlblk = { 59static struct rcu_ctrlblk rcu_bh_ctrlblk = {
60 .cur = -300, 60 .cur = -300,
61 .completed = -300, 61 .completed = -300,
62 .lock = SPIN_LOCK_UNLOCKED, 62 .lock = __SPIN_LOCK_UNLOCKED(&rcu_bh_ctrlblk.lock),
63 .cpumask = CPU_MASK_NONE, 63 .cpumask = CPU_MASK_NONE,
64}; 64};
65 65
@@ -182,6 +182,15 @@ long rcu_batches_completed(void)
182 return rcu_ctrlblk.completed; 182 return rcu_ctrlblk.completed;
183} 183}
184 184
185/*
186 * Return the number of RCU batches processed thus far. Useful
187 * for debug and statistics.
188 */
189long rcu_batches_completed_bh(void)
190{
191 return rcu_bh_ctrlblk.completed;
192}
193
185static void rcu_barrier_callback(struct rcu_head *notused) 194static void rcu_barrier_callback(struct rcu_head *notused)
186{ 195{
187 if (atomic_dec_and_test(&rcu_barrier_cpu_count)) 196 if (atomic_dec_and_test(&rcu_barrier_cpu_count))
@@ -232,12 +241,16 @@ static void rcu_do_batch(struct rcu_data *rdp)
232 next = rdp->donelist = list->next; 241 next = rdp->donelist = list->next;
233 list->func(list); 242 list->func(list);
234 list = next; 243 list = next;
235 rdp->qlen--;
236 if (++count >= rdp->blimit) 244 if (++count >= rdp->blimit)
237 break; 245 break;
238 } 246 }
247
248 local_irq_disable();
249 rdp->qlen -= count;
250 local_irq_enable();
239 if (rdp->blimit == INT_MAX && rdp->qlen <= qlowmark) 251 if (rdp->blimit == INT_MAX && rdp->qlen <= qlowmark)
240 rdp->blimit = blimit; 252 rdp->blimit = blimit;
253
241 if (!rdp->donelist) 254 if (!rdp->donelist)
242 rdp->donetail = &rdp->donelist; 255 rdp->donetail = &rdp->donelist;
243 else 256 else
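rcu_do_batch() used to decrement rdp->qlen once per callback with interrupts enabled; since call_rcu() increments qlen from interrupt context on the same CPU, that read-modify-write could race and the counter could drift. The fix counts locally while invoking callbacks and applies a single qlen -= count inside a short irq-off window. The shape of the fix, with the irq helpers stubbed out for a standalone sketch:

/* Sketch of the qlen adjustment pattern above: process the list with
 * interrupts enabled, count locally, then fold the count back into the
 * shared counter inside an irq-off window. The irq helpers are stubs.
 */
#include <stdio.h>

static void local_irq_disable(void) { /* stub */ }
static void local_irq_enable(void)  { /* stub */ }

struct toy_rdp { long qlen; };

static void do_batch(struct toy_rdp *rdp, int ncallbacks, int blimit)
{
	int count = 0;

	/* invoke callbacks with interrupts enabled */
	for (int i = 0; i < ncallbacks && count < blimit; i++)
		count++;                 /* list->func(list) would run here */

	/* single adjustment, protected against a concurrent call_rcu()
	 * bumping qlen from interrupt context on this CPU */
	local_irq_disable();
	rdp->qlen -= count;
	local_irq_enable();
}

int main(void)
{
	struct toy_rdp rdp = { .qlen = 10 };

	do_batch(&rdp, 10, 4);
	printf("qlen now %ld\n", rdp.qlen);
	return 0;
}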
@@ -539,7 +552,7 @@ static void __devinit rcu_online_cpu(int cpu)
539 tasklet_init(&per_cpu(rcu_tasklet, cpu), rcu_process_callbacks, 0UL); 552 tasklet_init(&per_cpu(rcu_tasklet, cpu), rcu_process_callbacks, 0UL);
540} 553}
541 554
542static int rcu_cpu_notify(struct notifier_block *self, 555static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
543 unsigned long action, void *hcpu) 556 unsigned long action, void *hcpu)
544{ 557{
545 long cpu = (long)hcpu; 558 long cpu = (long)hcpu;
@@ -556,7 +569,7 @@ static int rcu_cpu_notify(struct notifier_block *self,
556 return NOTIFY_OK; 569 return NOTIFY_OK;
557} 570}
558 571
559static struct notifier_block rcu_nb = { 572static struct notifier_block __cpuinitdata rcu_nb = {
560 .notifier_call = rcu_cpu_notify, 573 .notifier_call = rcu_cpu_notify,
561}; 574};
562 575
@@ -619,6 +632,7 @@ module_param(qlowmark, int, 0);
619module_param(rsinterval, int, 0); 632module_param(rsinterval, int, 0);
620#endif 633#endif
621EXPORT_SYMBOL_GPL(rcu_batches_completed); 634EXPORT_SYMBOL_GPL(rcu_batches_completed);
635EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
622EXPORT_SYMBOL_GPL(call_rcu); 636EXPORT_SYMBOL_GPL(call_rcu);
623EXPORT_SYMBOL_GPL(call_rcu_bh); 637EXPORT_SYMBOL_GPL(call_rcu_bh);
624EXPORT_SYMBOL_GPL(synchronize_rcu); 638EXPORT_SYMBOL_GPL(synchronize_rcu);
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 8154e7589d12..4f2c4272d59c 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Read-Copy Update /proc-based torture test facility 2 * Read-Copy Update module-based torture test facility
3 * 3 *
4 * This program is free software; you can redistribute it and/or modify 4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by 5 * it under the terms of the GNU General Public License as published by
@@ -53,6 +53,7 @@ static int stat_interval; /* Interval between stats, in seconds. */
53static int verbose; /* Print more debug info. */ 53static int verbose; /* Print more debug info. */
54static int test_no_idle_hz; /* Test RCU's support for tickless idle CPUs. */ 54static int test_no_idle_hz; /* Test RCU's support for tickless idle CPUs. */
55static int shuffle_interval = 5; /* Interval between shuffles (in sec)*/ 55static int shuffle_interval = 5; /* Interval between shuffles (in sec)*/
56static char *torture_type = "rcu"; /* What to torture. */
56 57
57module_param(nreaders, int, 0); 58module_param(nreaders, int, 0);
58MODULE_PARM_DESC(nreaders, "Number of RCU reader threads"); 59MODULE_PARM_DESC(nreaders, "Number of RCU reader threads");
@@ -64,13 +65,16 @@ module_param(test_no_idle_hz, bool, 0);
64MODULE_PARM_DESC(test_no_idle_hz, "Test support for tickless idle CPUs"); 65MODULE_PARM_DESC(test_no_idle_hz, "Test support for tickless idle CPUs");
65module_param(shuffle_interval, int, 0); 66module_param(shuffle_interval, int, 0);
66MODULE_PARM_DESC(shuffle_interval, "Number of seconds between shuffles"); 67MODULE_PARM_DESC(shuffle_interval, "Number of seconds between shuffles");
67#define TORTURE_FLAG "rcutorture: " 68module_param(torture_type, charp, 0);
69MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh)");
70
71#define TORTURE_FLAG "-torture:"
68#define PRINTK_STRING(s) \ 72#define PRINTK_STRING(s) \
69 do { printk(KERN_ALERT TORTURE_FLAG s "\n"); } while (0) 73 do { printk(KERN_ALERT "%s" TORTURE_FLAG s "\n", torture_type); } while (0)
70#define VERBOSE_PRINTK_STRING(s) \ 74#define VERBOSE_PRINTK_STRING(s) \
71 do { if (verbose) printk(KERN_ALERT TORTURE_FLAG s "\n"); } while (0) 75 do { if (verbose) printk(KERN_ALERT "%s" TORTURE_FLAG s "\n", torture_type); } while (0)
72#define VERBOSE_PRINTK_ERRSTRING(s) \ 76#define VERBOSE_PRINTK_ERRSTRING(s) \
73 do { if (verbose) printk(KERN_ALERT TORTURE_FLAG "!!! " s "\n"); } while (0) 77 do { if (verbose) printk(KERN_ALERT "%s" TORTURE_FLAG "!!! " s "\n", torture_type); } while (0)
74 78
75static char printk_buf[4096]; 79static char printk_buf[4096];
76 80
@@ -139,28 +143,6 @@ rcu_torture_free(struct rcu_torture *p)
139 spin_unlock_bh(&rcu_torture_lock); 143 spin_unlock_bh(&rcu_torture_lock);
140} 144}
141 145
142static void
143rcu_torture_cb(struct rcu_head *p)
144{
145 int i;
146 struct rcu_torture *rp = container_of(p, struct rcu_torture, rtort_rcu);
147
148 if (fullstop) {
149 /* Test is ending, just drop callbacks on the floor. */
150 /* The next initialization will pick up the pieces. */
151 return;
152 }
153 i = rp->rtort_pipe_count;
154 if (i > RCU_TORTURE_PIPE_LEN)
155 i = RCU_TORTURE_PIPE_LEN;
156 atomic_inc(&rcu_torture_wcount[i]);
157 if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) {
158 rp->rtort_mbtest = 0;
159 rcu_torture_free(rp);
160 } else
161 call_rcu(p, rcu_torture_cb);
162}
163
164struct rcu_random_state { 146struct rcu_random_state {
165 unsigned long rrs_state; 147 unsigned long rrs_state;
166 unsigned long rrs_count; 148 unsigned long rrs_count;
@@ -191,6 +173,119 @@ rcu_random(struct rcu_random_state *rrsp)
191} 173}
192 174
193/* 175/*
176 * Operations vector for selecting different types of tests.
177 */
178
179struct rcu_torture_ops {
180 void (*init)(void);
181 void (*cleanup)(void);
182 int (*readlock)(void);
183 void (*readunlock)(int idx);
184 int (*completed)(void);
185 void (*deferredfree)(struct rcu_torture *p);
186 int (*stats)(char *page);
187 char *name;
188};
189static struct rcu_torture_ops *cur_ops = NULL;
190
191/*
192 * Definitions for rcu torture testing.
193 */
194
195static int rcu_torture_read_lock(void) __acquires(RCU)
196{
197 rcu_read_lock();
198 return 0;
199}
200
201static void rcu_torture_read_unlock(int idx) __releases(RCU)
202{
203 rcu_read_unlock();
204}
205
206static int rcu_torture_completed(void)
207{
208 return rcu_batches_completed();
209}
210
211static void
212rcu_torture_cb(struct rcu_head *p)
213{
214 int i;
215 struct rcu_torture *rp = container_of(p, struct rcu_torture, rtort_rcu);
216
217 if (fullstop) {
218 /* Test is ending, just drop callbacks on the floor. */
219 /* The next initialization will pick up the pieces. */
220 return;
221 }
222 i = rp->rtort_pipe_count;
223 if (i > RCU_TORTURE_PIPE_LEN)
224 i = RCU_TORTURE_PIPE_LEN;
225 atomic_inc(&rcu_torture_wcount[i]);
226 if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) {
227 rp->rtort_mbtest = 0;
228 rcu_torture_free(rp);
229 } else
230 cur_ops->deferredfree(rp);
231}
232
233static void rcu_torture_deferred_free(struct rcu_torture *p)
234{
235 call_rcu(&p->rtort_rcu, rcu_torture_cb);
236}
237
238static struct rcu_torture_ops rcu_ops = {
239 .init = NULL,
240 .cleanup = NULL,
241 .readlock = rcu_torture_read_lock,
242 .readunlock = rcu_torture_read_unlock,
243 .completed = rcu_torture_completed,
244 .deferredfree = rcu_torture_deferred_free,
245 .stats = NULL,
246 .name = "rcu"
247};
248
249/*
250 * Definitions for rcu_bh torture testing.
251 */
252
253static int rcu_bh_torture_read_lock(void) __acquires(RCU_BH)
254{
255 rcu_read_lock_bh();
256 return 0;
257}
258
259static void rcu_bh_torture_read_unlock(int idx) __releases(RCU_BH)
260{
261 rcu_read_unlock_bh();
262}
263
264static int rcu_bh_torture_completed(void)
265{
266 return rcu_batches_completed_bh();
267}
268
269static void rcu_bh_torture_deferred_free(struct rcu_torture *p)
270{
271 call_rcu_bh(&p->rtort_rcu, rcu_torture_cb);
272}
273
274static struct rcu_torture_ops rcu_bh_ops = {
275 .init = NULL,
276 .cleanup = NULL,
277 .readlock = rcu_bh_torture_read_lock,
278 .readunlock = rcu_bh_torture_read_unlock,
279 .completed = rcu_bh_torture_completed,
280 .deferredfree = rcu_bh_torture_deferred_free,
281 .stats = NULL,
282 .name = "rcu_bh"
283};
284
285static struct rcu_torture_ops *torture_ops[] =
286 { &rcu_ops, &rcu_bh_ops, NULL };
287
288/*
194 * RCU torture writer kthread. Repeatedly substitutes a new structure 289 * RCU torture writer kthread. Repeatedly substitutes a new structure
195 * for that pointed to by rcu_torture_current, freeing the old structure 290 * for that pointed to by rcu_torture_current, freeing the old structure
196 * after a series of grace periods (the "pipeline"). 291 * after a series of grace periods (the "pipeline").
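The new struct rcu_torture_ops makes the test table-driven: each RCU flavour supplies its own readlock/readunlock/completed/deferredfree hooks (plus optional init/cleanup/stats), the torture_type module parameter selects an entry from torture_ops[] by name, and the rest of the code only ever calls through cur_ops. The pattern, reduced to a self-contained example with two invented back ends ("fast" and "slow"), is:

/* Self-contained illustration of the ops-vector dispatch used above.
 * The two back ends are invented; only the selection and indirection
 * pattern matches the rcutorture change.
 */
#include <stdio.h>
#include <string.h>

struct toy_ops {
	void (*readlock)(void);
	void (*readunlock)(void);
	const char *name;
};

static void fast_lock(void)   { printf("fast readlock\n"); }
static void fast_unlock(void) { printf("fast readunlock\n"); }
static void slow_lock(void)   { printf("slow readlock\n"); }
static void slow_unlock(void) { printf("slow readunlock\n"); }

static struct toy_ops fast_ops = { fast_lock, fast_unlock, "fast" };
static struct toy_ops slow_ops = { slow_lock, slow_unlock, "slow" };
static struct toy_ops *all_ops[] = { &fast_ops, &slow_ops, NULL };

static struct toy_ops *pick_ops(const char *type)
{
	for (int i = 0; all_ops[i]; i++)
		if (strcmp(type, all_ops[i]->name) == 0)
			return all_ops[i];
	return NULL;                        /* unknown type: caller bails out */
}

int main(void)
{
	struct toy_ops *cur_ops = pick_ops("slow");   /* like torture_type=rcu_bh */

	if (!cur_ops) {
		fprintf(stderr, "invalid type\n");
		return 1;
	}
	cur_ops->readlock();
	cur_ops->readunlock();
	return 0;
}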
@@ -209,8 +304,6 @@ rcu_torture_writer(void *arg)
209 304
210 do { 305 do {
211 schedule_timeout_uninterruptible(1); 306 schedule_timeout_uninterruptible(1);
212 if (rcu_batches_completed() == oldbatch)
213 continue;
214 if ((rp = rcu_torture_alloc()) == NULL) 307 if ((rp = rcu_torture_alloc()) == NULL)
215 continue; 308 continue;
216 rp->rtort_pipe_count = 0; 309 rp->rtort_pipe_count = 0;
@@ -225,10 +318,10 @@ rcu_torture_writer(void *arg)
225 i = RCU_TORTURE_PIPE_LEN; 318 i = RCU_TORTURE_PIPE_LEN;
226 atomic_inc(&rcu_torture_wcount[i]); 319 atomic_inc(&rcu_torture_wcount[i]);
227 old_rp->rtort_pipe_count++; 320 old_rp->rtort_pipe_count++;
228 call_rcu(&old_rp->rtort_rcu, rcu_torture_cb); 321 cur_ops->deferredfree(old_rp);
229 } 322 }
230 rcu_torture_current_version++; 323 rcu_torture_current_version++;
231 oldbatch = rcu_batches_completed(); 324 oldbatch = cur_ops->completed();
232 } while (!kthread_should_stop() && !fullstop); 325 } while (!kthread_should_stop() && !fullstop);
233 VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping"); 326 VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping");
234 while (!kthread_should_stop()) 327 while (!kthread_should_stop())
@@ -246,6 +339,7 @@ static int
246rcu_torture_reader(void *arg) 339rcu_torture_reader(void *arg)
247{ 340{
248 int completed; 341 int completed;
342 int idx;
249 DEFINE_RCU_RANDOM(rand); 343 DEFINE_RCU_RANDOM(rand);
250 struct rcu_torture *p; 344 struct rcu_torture *p;
251 int pipe_count; 345 int pipe_count;
@@ -254,12 +348,12 @@ rcu_torture_reader(void *arg)
254 set_user_nice(current, 19); 348 set_user_nice(current, 19);
255 349
256 do { 350 do {
257 rcu_read_lock(); 351 idx = cur_ops->readlock();
258 completed = rcu_batches_completed(); 352 completed = cur_ops->completed();
259 p = rcu_dereference(rcu_torture_current); 353 p = rcu_dereference(rcu_torture_current);
260 if (p == NULL) { 354 if (p == NULL) {
261 /* Wait for rcu_torture_writer to get underway */ 355 /* Wait for rcu_torture_writer to get underway */
262 rcu_read_unlock(); 356 cur_ops->readunlock(idx);
263 schedule_timeout_interruptible(HZ); 357 schedule_timeout_interruptible(HZ);
264 continue; 358 continue;
265 } 359 }
@@ -273,14 +367,14 @@ rcu_torture_reader(void *arg)
273 pipe_count = RCU_TORTURE_PIPE_LEN; 367 pipe_count = RCU_TORTURE_PIPE_LEN;
274 } 368 }
275 ++__get_cpu_var(rcu_torture_count)[pipe_count]; 369 ++__get_cpu_var(rcu_torture_count)[pipe_count];
276 completed = rcu_batches_completed() - completed; 370 completed = cur_ops->completed() - completed;
277 if (completed > RCU_TORTURE_PIPE_LEN) { 371 if (completed > RCU_TORTURE_PIPE_LEN) {
278 /* Should not happen, but... */ 372 /* Should not happen, but... */
279 completed = RCU_TORTURE_PIPE_LEN; 373 completed = RCU_TORTURE_PIPE_LEN;
280 } 374 }
281 ++__get_cpu_var(rcu_torture_batch)[completed]; 375 ++__get_cpu_var(rcu_torture_batch)[completed];
282 preempt_enable(); 376 preempt_enable();
283 rcu_read_unlock(); 377 cur_ops->readunlock(idx);
284 schedule(); 378 schedule();
285 } while (!kthread_should_stop() && !fullstop); 379 } while (!kthread_should_stop() && !fullstop);
286 VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping"); 380 VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping");
@@ -311,7 +405,7 @@ rcu_torture_printk(char *page)
311 if (pipesummary[i] != 0) 405 if (pipesummary[i] != 0)
312 break; 406 break;
313 } 407 }
314 cnt += sprintf(&page[cnt], "rcutorture: "); 408 cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG);
315 cnt += sprintf(&page[cnt], 409 cnt += sprintf(&page[cnt],
316 "rtc: %p ver: %ld tfle: %d rta: %d rtaf: %d rtf: %d " 410 "rtc: %p ver: %ld tfle: %d rta: %d rtaf: %d rtf: %d "
317 "rtmbe: %d", 411 "rtmbe: %d",
@@ -324,7 +418,7 @@ rcu_torture_printk(char *page)
324 atomic_read(&n_rcu_torture_mberror)); 418 atomic_read(&n_rcu_torture_mberror));
325 if (atomic_read(&n_rcu_torture_mberror) != 0) 419 if (atomic_read(&n_rcu_torture_mberror) != 0)
326 cnt += sprintf(&page[cnt], " !!!"); 420 cnt += sprintf(&page[cnt], " !!!");
327 cnt += sprintf(&page[cnt], "\nrcutorture: "); 421 cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG);
328 if (i > 1) { 422 if (i > 1) {
329 cnt += sprintf(&page[cnt], "!!! "); 423 cnt += sprintf(&page[cnt], "!!! ");
330 atomic_inc(&n_rcu_torture_error); 424 atomic_inc(&n_rcu_torture_error);
@@ -332,17 +426,19 @@ rcu_torture_printk(char *page)
332 cnt += sprintf(&page[cnt], "Reader Pipe: "); 426 cnt += sprintf(&page[cnt], "Reader Pipe: ");
333 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) 427 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++)
334 cnt += sprintf(&page[cnt], " %ld", pipesummary[i]); 428 cnt += sprintf(&page[cnt], " %ld", pipesummary[i]);
335 cnt += sprintf(&page[cnt], "\nrcutorture: "); 429 cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG);
336 cnt += sprintf(&page[cnt], "Reader Batch: "); 430 cnt += sprintf(&page[cnt], "Reader Batch: ");
337 for (i = 0; i < RCU_TORTURE_PIPE_LEN; i++) 431 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++)
338 cnt += sprintf(&page[cnt], " %ld", batchsummary[i]); 432 cnt += sprintf(&page[cnt], " %ld", batchsummary[i]);
339 cnt += sprintf(&page[cnt], "\nrcutorture: "); 433 cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG);
340 cnt += sprintf(&page[cnt], "Free-Block Circulation: "); 434 cnt += sprintf(&page[cnt], "Free-Block Circulation: ");
341 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { 435 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) {
342 cnt += sprintf(&page[cnt], " %d", 436 cnt += sprintf(&page[cnt], " %d",
343 atomic_read(&rcu_torture_wcount[i])); 437 atomic_read(&rcu_torture_wcount[i]));
344 } 438 }
345 cnt += sprintf(&page[cnt], "\n"); 439 cnt += sprintf(&page[cnt], "\n");
440 if (cur_ops->stats != NULL)
441 cnt += cur_ops->stats(&page[cnt]);
346 return cnt; 442 return cnt;
347} 443}
348 444
@@ -444,11 +540,11 @@ rcu_torture_shuffle(void *arg)
444static inline void 540static inline void
445rcu_torture_print_module_parms(char *tag) 541rcu_torture_print_module_parms(char *tag)
446{ 542{
447 printk(KERN_ALERT TORTURE_FLAG "--- %s: nreaders=%d " 543 printk(KERN_ALERT "%s" TORTURE_FLAG "--- %s: nreaders=%d "
448 "stat_interval=%d verbose=%d test_no_idle_hz=%d " 544 "stat_interval=%d verbose=%d test_no_idle_hz=%d "
449 "shuffle_interval = %d\n", 545 "shuffle_interval = %d\n",
450 tag, nrealreaders, stat_interval, verbose, test_no_idle_hz, 546 torture_type, tag, nrealreaders, stat_interval, verbose,
451 shuffle_interval); 547 test_no_idle_hz, shuffle_interval);
452} 548}
453 549
454static void 550static void
@@ -493,6 +589,9 @@ rcu_torture_cleanup(void)
493 rcu_barrier(); 589 rcu_barrier();
494 590
495 rcu_torture_stats_print(); /* -After- the stats thread is stopped! */ 591 rcu_torture_stats_print(); /* -After- the stats thread is stopped! */
592
593 if (cur_ops->cleanup != NULL)
594 cur_ops->cleanup();
496 if (atomic_read(&n_rcu_torture_error)) 595 if (atomic_read(&n_rcu_torture_error))
497 rcu_torture_print_module_parms("End of test: FAILURE"); 596 rcu_torture_print_module_parms("End of test: FAILURE");
498 else 597 else
@@ -508,6 +607,20 @@ rcu_torture_init(void)
508 607
509 /* Process args and tell the world that the torturer is on the job. */ 608 /* Process args and tell the world that the torturer is on the job. */
510 609
610 for (i = 0; cur_ops = torture_ops[i], cur_ops != NULL; i++) {
611 cur_ops = torture_ops[i];
612 if (strcmp(torture_type, cur_ops->name) == 0) {
613 break;
614 }
615 }
616 if (cur_ops == NULL) {
617 printk(KERN_ALERT "rcutorture: invalid torture type: \"%s\"\n",
618 torture_type);
619 return (-EINVAL);
620 }
621 if (cur_ops->init != NULL)
622 cur_ops->init(); /* no "goto unwind" prior to this point!!! */
623
511 if (nreaders >= 0) 624 if (nreaders >= 0)
512 nrealreaders = nreaders; 625 nrealreaders = nreaders;
513 else 626 else
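The rcutorture hunks above route every read-side and statistics operation through a cur_ops operations table chosen at init time by the torture_type module parameter (readlock/readunlock tokens, completed(), optional stats() and cleanup()). Below is a minimal standalone sketch of that name-keyed ops-table selection pattern; the struct members and values are simplified stand-ins inferred from the calls visible in the hunks, not the kernel's real definitions.

#include <stdio.h>
#include <string.h>

/* Simplified stand-in for the kernel's per-flavour operations table. */
struct torture_ops {
	const char *name;
	void (*init)(void);		/* optional per-flavour setup */
	int  (*readlock)(void);		/* returns a token for readunlock */
	void (*readunlock)(int idx);
	long (*completed)(void);	/* grace periods completed so far */
};

static void rcu_flavour_init(void)	{ puts("rcu flavour selected"); }
static int  rcu_readlock(void)		{ return 0; }
static void rcu_readunlock(int idx)	{ (void)idx; }
static long rcu_completed(void)		{ return 42; }

static struct torture_ops rcu_ops = {
	.name = "rcu",
	.init = rcu_flavour_init,
	.readlock = rcu_readlock,
	.readunlock = rcu_readunlock,
	.completed = rcu_completed,
};

static struct torture_ops *torture_ops_table[] = { &rcu_ops, NULL };

/* Mirrors the selection loop added to rcu_torture_init(). */
static struct torture_ops *pick_ops(const char *torture_type)
{
	struct torture_ops *ops;
	int i;

	for (i = 0; (ops = torture_ops_table[i]) != NULL; i++)
		if (strcmp(torture_type, ops->name) == 0)
			return ops;
	return NULL;
}

int main(void)
{
	struct torture_ops *cur_ops = pick_ops("rcu");

	if (cur_ops == NULL) {
		fprintf(stderr, "invalid torture type\n");
		return 1;
	}
	if (cur_ops->init != NULL)
		cur_ops->init();
	printf("completed: %ld\n", cur_ops->completed());
	return 0;
}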
diff --git a/kernel/relay.c b/kernel/relay.c
index 33345e73485c..1d63ecddfa70 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -95,7 +95,7 @@ int relay_mmap_buf(struct rchan_buf *buf, struct vm_area_struct *vma)
95 * @buf: the buffer struct 95 * @buf: the buffer struct
96 * @size: total size of the buffer 96 * @size: total size of the buffer
97 * 97 *
98 * Returns a pointer to the resulting buffer, NULL if unsuccessful. The 98 * Returns a pointer to the resulting buffer, %NULL if unsuccessful. The
99 * passed in size will get page aligned, if it isn't already. 99 * passed in size will get page aligned, if it isn't already.
100 */ 100 */
101static void *relay_alloc_buf(struct rchan_buf *buf, size_t *size) 101static void *relay_alloc_buf(struct rchan_buf *buf, size_t *size)
@@ -132,10 +132,9 @@ depopulate:
132 132
133/** 133/**
134 * relay_create_buf - allocate and initialize a channel buffer 134 * relay_create_buf - allocate and initialize a channel buffer
135 * @alloc_size: size of the buffer to allocate 135 * @chan: the relay channel
136 * @n_subbufs: number of sub-buffers in the channel
137 * 136 *
138 * Returns channel buffer if successful, NULL otherwise 137 * Returns channel buffer if successful, %NULL otherwise.
139 */ 138 */
140struct rchan_buf *relay_create_buf(struct rchan *chan) 139struct rchan_buf *relay_create_buf(struct rchan *chan)
141{ 140{
@@ -163,6 +162,7 @@ free_buf:
163 162
164/** 163/**
165 * relay_destroy_channel - free the channel struct 164 * relay_destroy_channel - free the channel struct
165 * @kref: target kernel reference that contains the relay channel
166 * 166 *
167 * Should only be called from kref_put(). 167 * Should only be called from kref_put().
168 */ 168 */
@@ -194,6 +194,7 @@ void relay_destroy_buf(struct rchan_buf *buf)
194 194
195/** 195/**
196 * relay_remove_buf - remove a channel buffer 196 * relay_remove_buf - remove a channel buffer
197 * @kref: target kernel reference that contains the relay buffer
197 * 198 *
 198 * Removes the file from the filesystem, which also frees the 199
199 * rchan_buf_struct and the channel buffer. Should only be called from 200 * rchan_buf_struct and the channel buffer. Should only be called from
@@ -374,7 +375,7 @@ void relay_reset(struct rchan *chan)
374} 375}
375EXPORT_SYMBOL_GPL(relay_reset); 376EXPORT_SYMBOL_GPL(relay_reset);
376 377
377/** 378/*
378 * relay_open_buf - create a new relay channel buffer 379 * relay_open_buf - create a new relay channel buffer
379 * 380 *
380 * Internal - used by relay_open(). 381 * Internal - used by relay_open().
@@ -448,12 +449,12 @@ static inline void setup_callbacks(struct rchan *chan,
448/** 449/**
449 * relay_open - create a new relay channel 450 * relay_open - create a new relay channel
450 * @base_filename: base name of files to create 451 * @base_filename: base name of files to create
451 * @parent: dentry of parent directory, NULL for root directory 452 * @parent: dentry of parent directory, %NULL for root directory
452 * @subbuf_size: size of sub-buffers 453 * @subbuf_size: size of sub-buffers
453 * @n_subbufs: number of sub-buffers 454 * @n_subbufs: number of sub-buffers
454 * @cb: client callback functions 455 * @cb: client callback functions
455 * 456 *
456 * Returns channel pointer if successful, NULL otherwise. 457 * Returns channel pointer if successful, %NULL otherwise.
457 * 458 *
458 * Creates a channel buffer for each cpu using the sizes and 459 * Creates a channel buffer for each cpu using the sizes and
459 * attributes specified. The created channel buffer files 460 * attributes specified. The created channel buffer files
@@ -585,7 +586,7 @@ EXPORT_SYMBOL_GPL(relay_switch_subbuf);
585 * subbufs_consumed should be the number of sub-buffers newly consumed, 586 * subbufs_consumed should be the number of sub-buffers newly consumed,
586 * not the total consumed. 587 * not the total consumed.
587 * 588 *
588 * NOTE: kernel clients don't need to call this function if the channel 589 * NOTE: Kernel clients don't need to call this function if the channel
589 * mode is 'overwrite'. 590 * mode is 'overwrite'.
590 */ 591 */
591void relay_subbufs_consumed(struct rchan *chan, 592void relay_subbufs_consumed(struct rchan *chan,
@@ -641,7 +642,7 @@ EXPORT_SYMBOL_GPL(relay_close);
641 * relay_flush - close the channel 642 * relay_flush - close the channel
642 * @chan: the channel 643 * @chan: the channel
643 * 644 *
644 * Flushes all channel buffers i.e. forces buffer switch. 645 * Flushes all channel buffers, i.e. forces buffer switch.
645 */ 646 */
646void relay_flush(struct rchan *chan) 647void relay_flush(struct rchan *chan)
647{ 648{
@@ -669,7 +670,7 @@ EXPORT_SYMBOL_GPL(relay_flush);
669 */ 670 */
670static int relay_file_open(struct inode *inode, struct file *filp) 671static int relay_file_open(struct inode *inode, struct file *filp)
671{ 672{
672 struct rchan_buf *buf = inode->u.generic_ip; 673 struct rchan_buf *buf = inode->i_private;
673 kref_get(&buf->kref); 674 kref_get(&buf->kref);
674 filp->private_data = buf; 675 filp->private_data = buf;
675 676
@@ -729,7 +730,7 @@ static int relay_file_release(struct inode *inode, struct file *filp)
729 return 0; 730 return 0;
730} 731}
731 732
732/** 733/*
733 * relay_file_read_consume - update the consumed count for the buffer 734 * relay_file_read_consume - update the consumed count for the buffer
734 */ 735 */
735static void relay_file_read_consume(struct rchan_buf *buf, 736static void relay_file_read_consume(struct rchan_buf *buf,
@@ -756,7 +757,7 @@ static void relay_file_read_consume(struct rchan_buf *buf,
756 } 757 }
757} 758}
758 759
759/** 760/*
760 * relay_file_read_avail - boolean, are there unconsumed bytes available? 761 * relay_file_read_avail - boolean, are there unconsumed bytes available?
761 */ 762 */
762static int relay_file_read_avail(struct rchan_buf *buf, size_t read_pos) 763static int relay_file_read_avail(struct rchan_buf *buf, size_t read_pos)
@@ -793,6 +794,8 @@ static int relay_file_read_avail(struct rchan_buf *buf, size_t read_pos)
793 794
794/** 795/**
795 * relay_file_read_subbuf_avail - return bytes available in sub-buffer 796 * relay_file_read_subbuf_avail - return bytes available in sub-buffer
797 * @read_pos: file read position
798 * @buf: relay channel buffer
796 */ 799 */
797static size_t relay_file_read_subbuf_avail(size_t read_pos, 800static size_t relay_file_read_subbuf_avail(size_t read_pos,
798 struct rchan_buf *buf) 801 struct rchan_buf *buf)
@@ -818,6 +821,8 @@ static size_t relay_file_read_subbuf_avail(size_t read_pos,
818 821
819/** 822/**
820 * relay_file_read_start_pos - find the first available byte to read 823 * relay_file_read_start_pos - find the first available byte to read
824 * @read_pos: file read position
825 * @buf: relay channel buffer
821 * 826 *
822 * If the read_pos is in the middle of padding, return the 827 * If the read_pos is in the middle of padding, return the
823 * position of the first actually available byte, otherwise 828 * position of the first actually available byte, otherwise
@@ -844,6 +849,9 @@ static size_t relay_file_read_start_pos(size_t read_pos,
844 849
845/** 850/**
846 * relay_file_read_end_pos - return the new read position 851 * relay_file_read_end_pos - return the new read position
852 * @read_pos: file read position
853 * @buf: relay channel buffer
854 * @count: number of bytes to be read
847 */ 855 */
848static size_t relay_file_read_end_pos(struct rchan_buf *buf, 856static size_t relay_file_read_end_pos(struct rchan_buf *buf,
849 size_t read_pos, 857 size_t read_pos,
@@ -865,7 +873,7 @@ static size_t relay_file_read_end_pos(struct rchan_buf *buf,
865 return end_pos; 873 return end_pos;
866} 874}
867 875
868/** 876/*
869 * subbuf_read_actor - read up to one subbuf's worth of data 877 * subbuf_read_actor - read up to one subbuf's worth of data
870 */ 878 */
871static int subbuf_read_actor(size_t read_start, 879static int subbuf_read_actor(size_t read_start,
@@ -890,7 +898,7 @@ static int subbuf_read_actor(size_t read_start,
890 return ret; 898 return ret;
891} 899}
892 900
893/** 901/*
894 * subbuf_send_actor - send up to one subbuf's worth of data 902 * subbuf_send_actor - send up to one subbuf's worth of data
895 */ 903 */
896static int subbuf_send_actor(size_t read_start, 904static int subbuf_send_actor(size_t read_start,
@@ -933,7 +941,7 @@ typedef int (*subbuf_actor_t) (size_t read_start,
933 read_descriptor_t *desc, 941 read_descriptor_t *desc,
934 read_actor_t actor); 942 read_actor_t actor);
935 943
936/** 944/*
937 * relay_file_read_subbufs - read count bytes, bridging subbuf boundaries 945 * relay_file_read_subbufs - read count bytes, bridging subbuf boundaries
938 */ 946 */
939static inline ssize_t relay_file_read_subbufs(struct file *filp, 947static inline ssize_t relay_file_read_subbufs(struct file *filp,
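The relay.c hunks are mostly kernel-doc cleanups, but together they spell out the client-facing contract: relay_open(base_filename, parent, subbuf_size, n_subbufs, cb) returns a channel or NULL, relay_flush() forces a sub-buffer switch, and relay_close() tears the channel down. The module sketch below shows that usage under stated assumptions: relay_write() is the inline from <linux/relay.h>, and passing a NULL parent dentry and a NULL callback struct is assumed to be acceptable here, whereas a real client would normally supply rchan_callbacks so its per-cpu buffer files are created where it wants them.

#include <linux/module.h>
#include <linux/errno.h>
#include <linux/relay.h>

static struct rchan *sketch_chan;

static int __init relay_sketch_init(void)
{
	/* Eight 4 KiB sub-buffers per cpu; file base name "sketch". */
	sketch_chan = relay_open("sketch", NULL, 4096, 8, NULL);
	if (!sketch_chan)
		return -ENOMEM;

	{
		static const char msg[] = "hello from the relay sketch\n";

		relay_write(sketch_chan, msg, sizeof(msg) - 1);
	}
	return 0;
}

static void __exit relay_sketch_exit(void)
{
	relay_flush(sketch_chan);	/* force a sub-buffer switch */
	relay_close(sketch_chan);
}

module_init(relay_sketch_init);
module_exit(relay_sketch_exit);
MODULE_LICENSE("GPL");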
diff --git a/kernel/resource.c b/kernel/resource.c
index e3080fcc66a3..9db38a1a7520 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -7,7 +7,6 @@
7 * Arbitrary resource management. 7 * Arbitrary resource management.
8 */ 8 */
9 9
10#include <linux/config.h>
11#include <linux/module.h> 10#include <linux/module.h>
12#include <linux/sched.h> 11#include <linux/sched.h>
13#include <linux/errno.h> 12#include <linux/errno.h>
@@ -23,20 +22,18 @@
23 22
24struct resource ioport_resource = { 23struct resource ioport_resource = {
25 .name = "PCI IO", 24 .name = "PCI IO",
26 .start = 0x0000, 25 .start = 0,
27 .end = IO_SPACE_LIMIT, 26 .end = IO_SPACE_LIMIT,
28 .flags = IORESOURCE_IO, 27 .flags = IORESOURCE_IO,
29}; 28};
30
31EXPORT_SYMBOL(ioport_resource); 29EXPORT_SYMBOL(ioport_resource);
32 30
33struct resource iomem_resource = { 31struct resource iomem_resource = {
34 .name = "PCI mem", 32 .name = "PCI mem",
35 .start = 0UL, 33 .start = 0,
36 .end = ~0UL, 34 .end = -1,
37 .flags = IORESOURCE_MEM, 35 .flags = IORESOURCE_MEM,
38}; 36};
39
40EXPORT_SYMBOL(iomem_resource); 37EXPORT_SYMBOL(iomem_resource);
41 38
42static DEFINE_RWLOCK(resource_lock); 39static DEFINE_RWLOCK(resource_lock);
@@ -83,10 +80,10 @@ static int r_show(struct seq_file *m, void *v)
83 for (depth = 0, p = r; depth < MAX_IORES_LEVEL; depth++, p = p->parent) 80 for (depth = 0, p = r; depth < MAX_IORES_LEVEL; depth++, p = p->parent)
84 if (p->parent == root) 81 if (p->parent == root)
85 break; 82 break;
86 seq_printf(m, "%*s%0*lx-%0*lx : %s\n", 83 seq_printf(m, "%*s%0*llx-%0*llx : %s\n",
87 depth * 2, "", 84 depth * 2, "",
88 width, r->start, 85 width, (unsigned long long) r->start,
89 width, r->end, 86 width, (unsigned long long) r->end,
90 r->name ? r->name : "<BAD>"); 87 r->name ? r->name : "<BAD>");
91 return 0; 88 return 0;
92} 89}
@@ -151,8 +148,8 @@ __initcall(ioresources_init);
151/* Return the conflict entry if you can't request it */ 148/* Return the conflict entry if you can't request it */
152static struct resource * __request_resource(struct resource *root, struct resource *new) 149static struct resource * __request_resource(struct resource *root, struct resource *new)
153{ 150{
154 unsigned long start = new->start; 151 resource_size_t start = new->start;
155 unsigned long end = new->end; 152 resource_size_t end = new->end;
156 struct resource *tmp, **p; 153 struct resource *tmp, **p;
157 154
158 if (end < start) 155 if (end < start)
@@ -232,15 +229,55 @@ int release_resource(struct resource *old)
232 229
233EXPORT_SYMBOL(release_resource); 230EXPORT_SYMBOL(release_resource);
234 231
232#ifdef CONFIG_MEMORY_HOTPLUG
233/*
 234 * Finds the lowest memory resource that exists within [res->start, res->end).
 235 * The caller must specify res->start, res->end and res->flags.
 236 * If found, returns 0 and res is overwritten; if not found, returns -1.
237 */
238int find_next_system_ram(struct resource *res)
239{
240 resource_size_t start, end;
241 struct resource *p;
242
243 BUG_ON(!res);
244
245 start = res->start;
246 end = res->end;
247 BUG_ON(start >= end);
248
249 read_lock(&resource_lock);
250 for (p = iomem_resource.child; p ; p = p->sibling) {
251 /* system ram is just marked as IORESOURCE_MEM */
252 if (p->flags != res->flags)
253 continue;
254 if (p->start > end) {
255 p = NULL;
256 break;
257 }
258 if ((p->end >= start) && (p->start < end))
259 break;
260 }
261 read_unlock(&resource_lock);
262 if (!p)
263 return -1;
264 /* copy data */
265 if (res->start < p->start)
266 res->start = p->start;
267 if (res->end > p->end)
268 res->end = p->end;
269 return 0;
270}
271#endif
272
235/* 273/*
236 * Find empty slot in the resource tree given range and alignment. 274 * Find empty slot in the resource tree given range and alignment.
237 */ 275 */
238static int find_resource(struct resource *root, struct resource *new, 276static int find_resource(struct resource *root, struct resource *new,
239 unsigned long size, 277 resource_size_t size, resource_size_t min,
240 unsigned long min, unsigned long max, 278 resource_size_t max, resource_size_t align,
241 unsigned long align,
242 void (*alignf)(void *, struct resource *, 279 void (*alignf)(void *, struct resource *,
243 unsigned long, unsigned long), 280 resource_size_t, resource_size_t),
244 void *alignf_data) 281 void *alignf_data)
245{ 282{
246 struct resource *this = root->child; 283 struct resource *this = root->child;
@@ -282,11 +319,10 @@ static int find_resource(struct resource *root, struct resource *new,
282 * Allocate empty slot in the resource tree given range and alignment. 319 * Allocate empty slot in the resource tree given range and alignment.
283 */ 320 */
284int allocate_resource(struct resource *root, struct resource *new, 321int allocate_resource(struct resource *root, struct resource *new,
285 unsigned long size, 322 resource_size_t size, resource_size_t min,
286 unsigned long min, unsigned long max, 323 resource_size_t max, resource_size_t align,
287 unsigned long align,
288 void (*alignf)(void *, struct resource *, 324 void (*alignf)(void *, struct resource *,
289 unsigned long, unsigned long), 325 resource_size_t, resource_size_t),
290 void *alignf_data) 326 void *alignf_data)
291{ 327{
292 int err; 328 int err;
@@ -308,12 +344,11 @@ EXPORT_SYMBOL(allocate_resource);
308 * 344 *
309 * Returns 0 on success, -EBUSY if the resource can't be inserted. 345 * Returns 0 on success, -EBUSY if the resource can't be inserted.
310 * 346 *
311 * This function is equivalent of request_resource when no conflict 347 * This function is equivalent to request_resource when no conflict
312 * happens. If a conflict happens, and the conflicting resources 348 * happens. If a conflict happens, and the conflicting resources
313 * entirely fit within the range of the new resource, then the new 349 * entirely fit within the range of the new resource, then the new
314 * resource is inserted and the conflicting resources become childs of 350 * resource is inserted and the conflicting resources become children of
315 * the new resource. Otherwise the new resource becomes the child of 351 * the new resource.
316 * the conflicting resource
317 */ 352 */
318int insert_resource(struct resource *parent, struct resource *new) 353int insert_resource(struct resource *parent, struct resource *new)
319{ 354{
@@ -321,20 +356,21 @@ int insert_resource(struct resource *parent, struct resource *new)
321 struct resource *first, *next; 356 struct resource *first, *next;
322 357
323 write_lock(&resource_lock); 358 write_lock(&resource_lock);
324 begin:
325 result = 0;
326 first = __request_resource(parent, new);
327 if (!first)
328 goto out;
329 359
330 result = -EBUSY; 360 for (;; parent = first) {
331 if (first == parent) 361 result = 0;
332 goto out; 362 first = __request_resource(parent, new);
363 if (!first)
364 goto out;
365
366 result = -EBUSY;
367 if (first == parent)
368 goto out;
333 369
334 /* Resource fully contained by the clashing resource? Recurse into it */ 370 if ((first->start > new->start) || (first->end < new->end))
335 if (first->start <= new->start && first->end >= new->end) { 371 break;
336 parent = first; 372 if ((first->start == new->start) && (first->end == new->end))
337 goto begin; 373 break;
338 } 374 }
339 375
340 for (next = first; ; next = next->sibling) { 376 for (next = first; ; next = next->sibling) {
@@ -371,17 +407,15 @@ int insert_resource(struct resource *parent, struct resource *new)
371 return result; 407 return result;
372} 408}
373 409
374EXPORT_SYMBOL(insert_resource);
375
376/* 410/*
377 * Given an existing resource, change its start and size to match the 411 * Given an existing resource, change its start and size to match the
378 * arguments. Returns -EBUSY if it can't fit. Existing children of 412 * arguments. Returns -EBUSY if it can't fit. Existing children of
379 * the resource are assumed to be immutable. 413 * the resource are assumed to be immutable.
380 */ 414 */
381int adjust_resource(struct resource *res, unsigned long start, unsigned long size) 415int adjust_resource(struct resource *res, resource_size_t start, resource_size_t size)
382{ 416{
383 struct resource *tmp, *parent = res->parent; 417 struct resource *tmp, *parent = res->parent;
384 unsigned long end = start + size - 1; 418 resource_size_t end = start + size - 1;
385 int result = -EBUSY; 419 int result = -EBUSY;
386 420
387 write_lock(&resource_lock); 421 write_lock(&resource_lock);
@@ -428,7 +462,9 @@ EXPORT_SYMBOL(adjust_resource);
428 * 462 *
429 * Release-region releases a matching busy region. 463 * Release-region releases a matching busy region.
430 */ 464 */
431struct resource * __request_region(struct resource *parent, unsigned long start, unsigned long n, const char *name) 465struct resource * __request_region(struct resource *parent,
466 resource_size_t start, resource_size_t n,
467 const char *name)
432{ 468{
433 struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL); 469 struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL);
434 470
@@ -464,7 +500,8 @@ struct resource * __request_region(struct resource *parent, unsigned long start,
464 500
465EXPORT_SYMBOL(__request_region); 501EXPORT_SYMBOL(__request_region);
466 502
467int __check_region(struct resource *parent, unsigned long start, unsigned long n) 503int __check_region(struct resource *parent, resource_size_t start,
504 resource_size_t n)
468{ 505{
469 struct resource * res; 506 struct resource * res;
470 507
@@ -479,10 +516,11 @@ int __check_region(struct resource *parent, unsigned long start, unsigned long n
479 516
480EXPORT_SYMBOL(__check_region); 517EXPORT_SYMBOL(__check_region);
481 518
482void __release_region(struct resource *parent, unsigned long start, unsigned long n) 519void __release_region(struct resource *parent, resource_size_t start,
520 resource_size_t n)
483{ 521{
484 struct resource **p; 522 struct resource **p;
485 unsigned long end; 523 resource_size_t end;
486 524
487 p = &parent->child; 525 p = &parent->child;
488 end = start + n - 1; 526 end = start + n - 1;
@@ -511,7 +549,9 @@ void __release_region(struct resource *parent, unsigned long start, unsigned lon
511 549
512 write_unlock(&resource_lock); 550 write_unlock(&resource_lock);
513 551
514 printk(KERN_WARNING "Trying to free nonexistent resource <%08lx-%08lx>\n", start, end); 552 printk(KERN_WARNING "Trying to free nonexistent resource "
553 "<%016llx-%016llx>\n", (unsigned long long)start,
554 (unsigned long long)end);
515} 555}
516 556
517EXPORT_SYMBOL(__release_region); 557EXPORT_SYMBOL(__release_region);
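Two things change in resource.c: the interfaces move from unsigned long to resource_size_t (hence the unsigned long long casts when printing), and find_next_system_ram() is added for memory hotplug, walking the start-sorted children of iomem_resource for the first entry that intersects the requested range and clipping the request to it. Below is a small userspace sketch of that overlap-and-clip walk; struct res is a simplified stand-in for struct resource, not the kernel type.

#include <stdio.h>
#include <stdint.h>

struct res {
	uint64_t start, end;	/* inclusive range, like struct resource */
	struct res *sibling;	/* next child, sorted by start */
};

/* Clip [*start, *end] to the first overlapping entry; -1 if none. */
static int next_overlap(const struct res *child, uint64_t *start, uint64_t *end)
{
	const struct res *p;

	for (p = child; p != NULL; p = p->sibling) {
		if (p->start > *end)
			return -1;	/* list is sorted: nothing further can match */
		if (p->end >= *start && p->start <= *end)
			break;
	}
	if (p == NULL)
		return -1;
	if (*start < p->start)
		*start = p->start;
	if (*end > p->end)
		*end = p->end;
	return 0;
}

int main(void)
{
	struct res high = { 0x200000, 0x2fffff, NULL };
	struct res low  = { 0x100000, 0x1fffff, &high };
	uint64_t start = 0x180000, end = 0x280000;

	if (next_overlap(&low, &start, &end) == 0)
		printf("first RAM slice: %016llx-%016llx\n",
		       (unsigned long long)start, (unsigned long long)end);
	return 0;
}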
diff --git a/kernel/rtmutex-debug.c b/kernel/rtmutex-debug.c
new file mode 100644
index 000000000000..0c1faa950af7
--- /dev/null
+++ b/kernel/rtmutex-debug.c
@@ -0,0 +1,242 @@
1/*
2 * RT-Mutexes: blocking mutual exclusion locks with PI support
3 *
4 * started by Ingo Molnar and Thomas Gleixner:
5 *
6 * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
7 * Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
8 *
9 * This code is based on the rt.c implementation in the preempt-rt tree.
10 * Portions of said code are
11 *
12 * Copyright (C) 2004 LynuxWorks, Inc., Igor Manyilov, Bill Huey
13 * Copyright (C) 2006 Esben Nielsen
14 * Copyright (C) 2006 Kihon Technologies Inc.,
15 * Steven Rostedt <rostedt@goodmis.org>
16 *
17 * See rt.c in preempt-rt for proper credits and further information
18 */
19#include <linux/config.h>
20#include <linux/sched.h>
21#include <linux/delay.h>
22#include <linux/module.h>
23#include <linux/spinlock.h>
24#include <linux/kallsyms.h>
25#include <linux/syscalls.h>
26#include <linux/interrupt.h>
27#include <linux/plist.h>
28#include <linux/fs.h>
29#include <linux/debug_locks.h>
30
31#include "rtmutex_common.h"
32
33#ifdef CONFIG_DEBUG_RT_MUTEXES
34# include "rtmutex-debug.h"
35#else
36# include "rtmutex.h"
37#endif
38
39# define TRACE_WARN_ON(x) WARN_ON(x)
40# define TRACE_BUG_ON(x) BUG_ON(x)
41
42# define TRACE_OFF() \
43do { \
44 if (rt_trace_on) { \
45 rt_trace_on = 0; \
46 console_verbose(); \
47 if (spin_is_locked(&current->pi_lock)) \
48 spin_unlock(&current->pi_lock); \
49 } \
50} while (0)
51
52# define TRACE_OFF_NOLOCK() \
53do { \
54 if (rt_trace_on) { \
55 rt_trace_on = 0; \
56 console_verbose(); \
57 } \
58} while (0)
59
60# define TRACE_BUG_LOCKED() \
61do { \
62 TRACE_OFF(); \
63 BUG(); \
64} while (0)
65
66# define TRACE_WARN_ON_LOCKED(c) \
67do { \
68 if (unlikely(c)) { \
69 TRACE_OFF(); \
70 WARN_ON(1); \
71 } \
72} while (0)
73
74# define TRACE_BUG_ON_LOCKED(c) \
75do { \
76 if (unlikely(c)) \
77 TRACE_BUG_LOCKED(); \
78} while (0)
79
80#ifdef CONFIG_SMP
81# define SMP_TRACE_BUG_ON_LOCKED(c) TRACE_BUG_ON_LOCKED(c)
82#else
83# define SMP_TRACE_BUG_ON_LOCKED(c) do { } while (0)
84#endif
85
86/*
87 * deadlock detection flag. We turn it off when we detect
 88 * the first problem because we don't want to recurse back
89 * into the tracing code when doing error printk or
90 * executing a BUG():
91 */
92int rt_trace_on = 1;
93
94void deadlock_trace_off(void)
95{
96 rt_trace_on = 0;
97}
98
99static void printk_task(struct task_struct *p)
100{
101 if (p)
102 printk("%16s:%5d [%p, %3d]", p->comm, p->pid, p, p->prio);
103 else
104 printk("<none>");
105}
106
107static void printk_lock(struct rt_mutex *lock, int print_owner)
108{
109 if (lock->name)
110 printk(" [%p] {%s}\n",
111 lock, lock->name);
112 else
113 printk(" [%p] {%s:%d}\n",
114 lock, lock->file, lock->line);
115
116 if (print_owner && rt_mutex_owner(lock)) {
117 printk(".. ->owner: %p\n", lock->owner);
118 printk(".. held by: ");
119 printk_task(rt_mutex_owner(lock));
120 printk("\n");
121 }
122}
123
124void rt_mutex_debug_task_free(struct task_struct *task)
125{
126 WARN_ON(!plist_head_empty(&task->pi_waiters));
127 WARN_ON(task->pi_blocked_on);
128}
129
130/*
131 * We fill out the fields in the waiter to store the information about
132 * the deadlock. We print when we return. act_waiter can be NULL in
133 * case of a remove waiter operation.
134 */
135void debug_rt_mutex_deadlock(int detect, struct rt_mutex_waiter *act_waiter,
136 struct rt_mutex *lock)
137{
138 struct task_struct *task;
139
140 if (!rt_trace_on || detect || !act_waiter)
141 return;
142
143 task = rt_mutex_owner(act_waiter->lock);
144 if (task && task != current) {
145 act_waiter->deadlock_task_pid = task->pid;
146 act_waiter->deadlock_lock = lock;
147 }
148}
149
150void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter)
151{
152 struct task_struct *task;
153
154 if (!waiter->deadlock_lock || !rt_trace_on)
155 return;
156
157 task = find_task_by_pid(waiter->deadlock_task_pid);
158 if (!task)
159 return;
160
161 TRACE_OFF_NOLOCK();
162
163 printk("\n============================================\n");
164 printk( "[ BUG: circular locking deadlock detected! ]\n");
165 printk( "--------------------------------------------\n");
166 printk("%s/%d is deadlocking current task %s/%d\n\n",
167 task->comm, task->pid, current->comm, current->pid);
168
169 printk("\n1) %s/%d is trying to acquire this lock:\n",
170 current->comm, current->pid);
171 printk_lock(waiter->lock, 1);
172
173 printk("\n2) %s/%d is blocked on this lock:\n", task->comm, task->pid);
174 printk_lock(waiter->deadlock_lock, 1);
175
176 debug_show_held_locks(current);
177 debug_show_held_locks(task);
178
179 printk("\n%s/%d's [blocked] stackdump:\n\n", task->comm, task->pid);
180 show_stack(task, NULL);
181 printk("\n%s/%d's [current] stackdump:\n\n",
182 current->comm, current->pid);
183 dump_stack();
184 debug_show_all_locks();
185
186 printk("[ turning off deadlock detection."
187 "Please report this trace. ]\n\n");
188 local_irq_disable();
189}
190
191void debug_rt_mutex_lock(struct rt_mutex *lock)
192{
193}
194
195void debug_rt_mutex_unlock(struct rt_mutex *lock)
196{
197 TRACE_WARN_ON_LOCKED(rt_mutex_owner(lock) != current);
198}
199
200void
201debug_rt_mutex_proxy_lock(struct rt_mutex *lock, struct task_struct *powner)
202{
203}
204
205void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock)
206{
207 TRACE_WARN_ON_LOCKED(!rt_mutex_owner(lock));
208}
209
210void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter)
211{
212 memset(waiter, 0x11, sizeof(*waiter));
213 plist_node_init(&waiter->list_entry, MAX_PRIO);
214 plist_node_init(&waiter->pi_list_entry, MAX_PRIO);
215}
216
217void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter)
218{
219 TRACE_WARN_ON(!plist_node_empty(&waiter->list_entry));
220 TRACE_WARN_ON(!plist_node_empty(&waiter->pi_list_entry));
221 TRACE_WARN_ON(waiter->task);
222 memset(waiter, 0x22, sizeof(*waiter));
223}
224
225void debug_rt_mutex_init(struct rt_mutex *lock, const char *name)
226{
227 /*
228 * Make sure we are not reinitializing a held lock:
229 */
230 debug_check_no_locks_freed((void *)lock, sizeof(*lock));
231 lock->name = name;
232}
233
234void
235rt_mutex_deadlock_account_lock(struct rt_mutex *lock, struct task_struct *task)
236{
237}
238
239void rt_mutex_deadlock_account_unlock(struct task_struct *task)
240{
241}
242
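The debug code above guards every check with rt_trace_on and turns it off via TRACE_OFF() the moment a problem is reported, so the report path (console output, stack dumps) cannot recurse back into the checks it came from. A standalone sketch of that report-once pattern follows; the names here are illustrative, not the kernel's.

#include <stdio.h>

static int trace_on = 1;

#define TRACE_WARN_ON(cond)						\
	do {								\
		if (trace_on && (cond)) {				\
			trace_on = 0;	/* report once, then go quiet */\
			fprintf(stderr, "warning: %s\n", #cond);	\
		}							\
	} while (0)

int main(void)
{
	int owner_mismatch = 1;

	TRACE_WARN_ON(owner_mismatch);	/* prints the warning */
	TRACE_WARN_ON(owner_mismatch);	/* suppressed: tracing already off */
	return 0;
}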
diff --git a/kernel/rtmutex-debug.h b/kernel/rtmutex-debug.h
new file mode 100644
index 000000000000..14193d596d78
--- /dev/null
+++ b/kernel/rtmutex-debug.h
@@ -0,0 +1,33 @@
1/*
2 * RT-Mutexes: blocking mutual exclusion locks with PI support
3 *
4 * started by Ingo Molnar and Thomas Gleixner:
5 *
6 * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
7 * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
8 *
9 * This file contains macros used solely by rtmutex.c. Debug version.
10 */
11
12extern void
13rt_mutex_deadlock_account_lock(struct rt_mutex *lock, struct task_struct *task);
14extern void rt_mutex_deadlock_account_unlock(struct task_struct *task);
15extern void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter);
16extern void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter);
17extern void debug_rt_mutex_init(struct rt_mutex *lock, const char *name);
18extern void debug_rt_mutex_lock(struct rt_mutex *lock);
19extern void debug_rt_mutex_unlock(struct rt_mutex *lock);
20extern void debug_rt_mutex_proxy_lock(struct rt_mutex *lock,
21 struct task_struct *powner);
22extern void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock);
23extern void debug_rt_mutex_deadlock(int detect, struct rt_mutex_waiter *waiter,
24 struct rt_mutex *lock);
25extern void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter);
26# define debug_rt_mutex_reset_waiter(w) \
27 do { (w)->deadlock_lock = NULL; } while (0)
28
29static inline int debug_rt_mutex_detect_deadlock(struct rt_mutex_waiter *waiter,
30 int detect)
31{
32 return (waiter != NULL);
33}
diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c
new file mode 100644
index 000000000000..948bd8f643e2
--- /dev/null
+++ b/kernel/rtmutex-tester.c
@@ -0,0 +1,441 @@
1/*
2 * RT-Mutex-tester: scriptable tester for rt mutexes
3 *
4 * started by Thomas Gleixner:
5 *
6 * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
7 *
8 */
9#include <linux/config.h>
10#include <linux/kthread.h>
11#include <linux/module.h>
12#include <linux/sched.h>
13#include <linux/smp_lock.h>
14#include <linux/spinlock.h>
15#include <linux/sysdev.h>
16#include <linux/timer.h>
17
18#include "rtmutex.h"
19
20#define MAX_RT_TEST_THREADS 8
21#define MAX_RT_TEST_MUTEXES 8
22
23static spinlock_t rttest_lock;
24static atomic_t rttest_event;
25
26struct test_thread_data {
27 int opcode;
28 int opdata;
29 int mutexes[MAX_RT_TEST_MUTEXES];
30 int bkl;
31 int event;
32 struct sys_device sysdev;
33};
34
35static struct test_thread_data thread_data[MAX_RT_TEST_THREADS];
36static struct task_struct *threads[MAX_RT_TEST_THREADS];
37static struct rt_mutex mutexes[MAX_RT_TEST_MUTEXES];
38
39enum test_opcodes {
40 RTTEST_NOP = 0,
41 RTTEST_SCHEDOT, /* 1 Sched other, data = nice */
42 RTTEST_SCHEDRT, /* 2 Sched fifo, data = prio */
43 RTTEST_LOCK, /* 3 Lock uninterruptible, data = lockindex */
44 RTTEST_LOCKNOWAIT, /* 4 Lock uninterruptible no wait in wakeup, data = lockindex */
45 RTTEST_LOCKINT, /* 5 Lock interruptible, data = lockindex */
46 RTTEST_LOCKINTNOWAIT, /* 6 Lock interruptible no wait in wakeup, data = lockindex */
47 RTTEST_LOCKCONT, /* 7 Continue locking after the wakeup delay */
48 RTTEST_UNLOCK, /* 8 Unlock, data = lockindex */
49 RTTEST_LOCKBKL, /* 9 Lock BKL */
50 RTTEST_UNLOCKBKL, /* 10 Unlock BKL */
51 RTTEST_SIGNAL, /* 11 Signal other test thread, data = thread id */
52 RTTEST_RESETEVENT = 98, /* 98 Reset event counter */
53 RTTEST_RESET = 99, /* 99 Reset all pending operations */
54};
55
56static int handle_op(struct test_thread_data *td, int lockwakeup)
57{
58 int i, id, ret = -EINVAL;
59
60 switch(td->opcode) {
61
62 case RTTEST_NOP:
63 return 0;
64
65 case RTTEST_LOCKCONT:
66 td->mutexes[td->opdata] = 1;
67 td->event = atomic_add_return(1, &rttest_event);
68 return 0;
69
70 case RTTEST_RESET:
71 for (i = 0; i < MAX_RT_TEST_MUTEXES; i++) {
72 if (td->mutexes[i] == 4) {
73 rt_mutex_unlock(&mutexes[i]);
74 td->mutexes[i] = 0;
75 }
76 }
77
78 if (!lockwakeup && td->bkl == 4) {
79 unlock_kernel();
80 td->bkl = 0;
81 }
82 return 0;
83
84 case RTTEST_RESETEVENT:
85 atomic_set(&rttest_event, 0);
86 return 0;
87
88 default:
89 if (lockwakeup)
90 return ret;
91 }
92
93 switch(td->opcode) {
94
95 case RTTEST_LOCK:
96 case RTTEST_LOCKNOWAIT:
97 id = td->opdata;
98 if (id < 0 || id >= MAX_RT_TEST_MUTEXES)
99 return ret;
100
101 td->mutexes[id] = 1;
102 td->event = atomic_add_return(1, &rttest_event);
103 rt_mutex_lock(&mutexes[id]);
104 td->event = atomic_add_return(1, &rttest_event);
105 td->mutexes[id] = 4;
106 return 0;
107
108 case RTTEST_LOCKINT:
109 case RTTEST_LOCKINTNOWAIT:
110 id = td->opdata;
111 if (id < 0 || id >= MAX_RT_TEST_MUTEXES)
112 return ret;
113
114 td->mutexes[id] = 1;
115 td->event = atomic_add_return(1, &rttest_event);
116 ret = rt_mutex_lock_interruptible(&mutexes[id], 0);
117 td->event = atomic_add_return(1, &rttest_event);
118 td->mutexes[id] = ret ? 0 : 4;
119 return ret ? -EINTR : 0;
120
121 case RTTEST_UNLOCK:
122 id = td->opdata;
123 if (id < 0 || id >= MAX_RT_TEST_MUTEXES || td->mutexes[id] != 4)
124 return ret;
125
126 td->event = atomic_add_return(1, &rttest_event);
127 rt_mutex_unlock(&mutexes[id]);
128 td->event = atomic_add_return(1, &rttest_event);
129 td->mutexes[id] = 0;
130 return 0;
131
132 case RTTEST_LOCKBKL:
133 if (td->bkl)
134 return 0;
135 td->bkl = 1;
136 lock_kernel();
137 td->bkl = 4;
138 return 0;
139
140 case RTTEST_UNLOCKBKL:
141 if (td->bkl != 4)
142 break;
143 unlock_kernel();
144 td->bkl = 0;
145 return 0;
146
147 default:
148 break;
149 }
150 return ret;
151}
152
153/*
154 * Schedule replacement for rtsem_down(). Only called for threads with
155 * PF_MUTEX_TESTER set.
156 *
157 * This allows us to have finegrained control over the event flow.
158 *
159 */
160void schedule_rt_mutex_test(struct rt_mutex *mutex)
161{
162 int tid, op, dat;
163 struct test_thread_data *td;
164
165 /* We have to lookup the task */
166 for (tid = 0; tid < MAX_RT_TEST_THREADS; tid++) {
167 if (threads[tid] == current)
168 break;
169 }
170
171 BUG_ON(tid == MAX_RT_TEST_THREADS);
172
173 td = &thread_data[tid];
174
175 op = td->opcode;
176 dat = td->opdata;
177
178 switch (op) {
179 case RTTEST_LOCK:
180 case RTTEST_LOCKINT:
181 case RTTEST_LOCKNOWAIT:
182 case RTTEST_LOCKINTNOWAIT:
183 if (mutex != &mutexes[dat])
184 break;
185
186 if (td->mutexes[dat] != 1)
187 break;
188
189 td->mutexes[dat] = 2;
190 td->event = atomic_add_return(1, &rttest_event);
191 break;
192
193 case RTTEST_LOCKBKL:
194 default:
195 break;
196 }
197
198 schedule();
199
200
201 switch (op) {
202 case RTTEST_LOCK:
203 case RTTEST_LOCKINT:
204 if (mutex != &mutexes[dat])
205 return;
206
207 if (td->mutexes[dat] != 2)
208 return;
209
210 td->mutexes[dat] = 3;
211 td->event = atomic_add_return(1, &rttest_event);
212 break;
213
214 case RTTEST_LOCKNOWAIT:
215 case RTTEST_LOCKINTNOWAIT:
216 if (mutex != &mutexes[dat])
217 return;
218
219 if (td->mutexes[dat] != 2)
220 return;
221
222 td->mutexes[dat] = 1;
223 td->event = atomic_add_return(1, &rttest_event);
224 return;
225
226 case RTTEST_LOCKBKL:
227 return;
228 default:
229 return;
230 }
231
232 td->opcode = 0;
233
234 for (;;) {
235 set_current_state(TASK_INTERRUPTIBLE);
236
237 if (td->opcode > 0) {
238 int ret;
239
240 set_current_state(TASK_RUNNING);
241 ret = handle_op(td, 1);
242 set_current_state(TASK_INTERRUPTIBLE);
243 if (td->opcode == RTTEST_LOCKCONT)
244 break;
245 td->opcode = ret;
246 }
247
248 /* Wait for the next command to be executed */
249 schedule();
250 }
251
252 /* Restore previous command and data */
253 td->opcode = op;
254 td->opdata = dat;
255}
256
257static int test_func(void *data)
258{
259 struct test_thread_data *td = data;
260 int ret;
261
262 current->flags |= PF_MUTEX_TESTER;
263 allow_signal(SIGHUP);
264
265 for(;;) {
266
267 set_current_state(TASK_INTERRUPTIBLE);
268
269 if (td->opcode > 0) {
270 set_current_state(TASK_RUNNING);
271 ret = handle_op(td, 0);
272 set_current_state(TASK_INTERRUPTIBLE);
273 td->opcode = ret;
274 }
275
276 /* Wait for the next command to be executed */
277 schedule();
278 try_to_freeze();
279
280 if (signal_pending(current))
281 flush_signals(current);
282
283 if(kthread_should_stop())
284 break;
285 }
286 return 0;
287}
288
289/**
290 * sysfs_test_command - interface for test commands
291 * @dev: thread reference
292 * @buf: command for actual step
293 * @count: length of buffer
294 *
295 * command syntax:
296 *
297 * opcode:data
298 */
299static ssize_t sysfs_test_command(struct sys_device *dev, const char *buf,
300 size_t count)
301{
302 struct sched_param schedpar;
303 struct test_thread_data *td;
304 char cmdbuf[32];
305 int op, dat, tid, ret;
306
307 td = container_of(dev, struct test_thread_data, sysdev);
308 tid = td->sysdev.id;
309
310 /* strings from sysfs write are not 0 terminated! */
311 if (count >= sizeof(cmdbuf))
312 return -EINVAL;
313
 314 /* strip off \n: */
315 if (buf[count-1] == '\n')
316 count--;
317 if (count < 1)
318 return -EINVAL;
319
320 memcpy(cmdbuf, buf, count);
321 cmdbuf[count] = 0;
322
323 if (sscanf(cmdbuf, "%d:%d", &op, &dat) != 2)
324 return -EINVAL;
325
326 switch (op) {
327 case RTTEST_SCHEDOT:
328 schedpar.sched_priority = 0;
329 ret = sched_setscheduler(threads[tid], SCHED_NORMAL, &schedpar);
330 if (ret)
331 return ret;
332 set_user_nice(current, 0);
333 break;
334
335 case RTTEST_SCHEDRT:
336 schedpar.sched_priority = dat;
337 ret = sched_setscheduler(threads[tid], SCHED_FIFO, &schedpar);
338 if (ret)
339 return ret;
340 break;
341
342 case RTTEST_SIGNAL:
343 send_sig(SIGHUP, threads[tid], 0);
344 break;
345
346 default:
347 if (td->opcode > 0)
348 return -EBUSY;
349 td->opdata = dat;
350 td->opcode = op;
351 wake_up_process(threads[tid]);
352 }
353
354 return count;
355}
356
357/**
358 * sysfs_test_status - sysfs interface for rt tester
359 * @dev: thread to query
360 * @buf: char buffer to be filled with thread status info
361 */
362static ssize_t sysfs_test_status(struct sys_device *dev, char *buf)
363{
364 struct test_thread_data *td;
365 struct task_struct *tsk;
366 char *curr = buf;
367 int i;
368
369 td = container_of(dev, struct test_thread_data, sysdev);
370 tsk = threads[td->sysdev.id];
371
372 spin_lock(&rttest_lock);
373
374 curr += sprintf(curr,
375 "O: %4d, E:%8d, S: 0x%08lx, P: %4d, N: %4d, B: %p, K: %d, M:",
376 td->opcode, td->event, tsk->state,
377 (MAX_RT_PRIO - 1) - tsk->prio,
378 (MAX_RT_PRIO - 1) - tsk->normal_prio,
379 tsk->pi_blocked_on, td->bkl);
380
381 for (i = MAX_RT_TEST_MUTEXES - 1; i >=0 ; i--)
382 curr += sprintf(curr, "%d", td->mutexes[i]);
383
384 spin_unlock(&rttest_lock);
385
386 curr += sprintf(curr, ", T: %p, R: %p\n", tsk,
387 mutexes[td->sysdev.id].owner);
388
389 return curr - buf;
390}
391
392static SYSDEV_ATTR(status, 0600, sysfs_test_status, NULL);
393static SYSDEV_ATTR(command, 0600, NULL, sysfs_test_command);
394
395static struct sysdev_class rttest_sysclass = {
396 set_kset_name("rttest"),
397};
398
399static int init_test_thread(int id)
400{
401 thread_data[id].sysdev.cls = &rttest_sysclass;
402 thread_data[id].sysdev.id = id;
403
404 threads[id] = kthread_run(test_func, &thread_data[id], "rt-test-%d", id);
405 if (IS_ERR(threads[id]))
406 return PTR_ERR(threads[id]);
407
408 return sysdev_register(&thread_data[id].sysdev);
409}
410
411static int init_rttest(void)
412{
413 int ret, i;
414
415 spin_lock_init(&rttest_lock);
416
417 for (i = 0; i < MAX_RT_TEST_MUTEXES; i++)
418 rt_mutex_init(&mutexes[i]);
419
420 ret = sysdev_class_register(&rttest_sysclass);
421 if (ret)
422 return ret;
423
424 for (i = 0; i < MAX_RT_TEST_THREADS; i++) {
425 ret = init_test_thread(i);
426 if (ret)
427 break;
428 ret = sysdev_create_file(&thread_data[i].sysdev, &attr_status);
429 if (ret)
430 break;
431 ret = sysdev_create_file(&thread_data[i].sysdev, &attr_command);
432 if (ret)
433 break;
434 }
435
436 printk("Initializing RT-Tester: %s\n", ret ? "Failed" : "OK" );
437
438 return ret;
439}
440
441device_initcall(init_rttest);
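Each tester thread exposes a command and a status attribute, and sysfs_test_command() parses single "opcode:data" strings against the enum near the top of the file. The sketch below drives one thread from userspace; the /sys/devices/system/rttest/rttestN/ path is an assumption derived from the "rttest" sysdev class and the per-thread id, so verify it on the running kernel before relying on it.

#include <stdio.h>

/* Write one "opcode:data" command to the assumed sysfs file of a thread. */
static int rttest_cmd(int thread, const char *cmd)
{
	char path[128];
	FILE *f;

	snprintf(path, sizeof(path),
		 "/sys/devices/system/rttest/rttest%d/command", thread);
	f = fopen(path, "w");
	if (f == NULL)
		return -1;
	fputs(cmd, f);
	return fclose(f);
}

int main(void)
{
	rttest_cmd(0, "2:80");	/* RTTEST_SCHEDRT: thread 0 -> SCHED_FIFO prio 80 */
	rttest_cmd(0, "3:0");	/* RTTEST_LOCK:    thread 0 takes test mutex 0 */
	rttest_cmd(0, "8:0");	/* RTTEST_UNLOCK:  thread 0 releases test mutex 0 */
	return 0;
}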
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
new file mode 100644
index 000000000000..4ab17da46fd8
--- /dev/null
+++ b/kernel/rtmutex.c
@@ -0,0 +1,990 @@
1/*
2 * RT-Mutexes: simple blocking mutual exclusion locks with PI support
3 *
4 * started by Ingo Molnar and Thomas Gleixner.
5 *
6 * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
7 * Copyright (C) 2005-2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
8 * Copyright (C) 2005 Kihon Technologies Inc., Steven Rostedt
9 * Copyright (C) 2006 Esben Nielsen
10 *
11 * See Documentation/rt-mutex-design.txt for details.
12 */
13#include <linux/spinlock.h>
14#include <linux/module.h>
15#include <linux/sched.h>
16#include <linux/timer.h>
17
18#include "rtmutex_common.h"
19
20#ifdef CONFIG_DEBUG_RT_MUTEXES
21# include "rtmutex-debug.h"
22#else
23# include "rtmutex.h"
24#endif
25
26/*
27 * lock->owner state tracking:
28 *
29 * lock->owner holds the task_struct pointer of the owner. Bit 0 and 1
30 * are used to keep track of the "owner is pending" and "lock has
31 * waiters" state.
32 *
33 * owner bit1 bit0
34 * NULL 0 0 lock is free (fast acquire possible)
35 * NULL 0 1 invalid state
36 * NULL 1 0 Transitional State*
37 * NULL 1 1 invalid state
38 * taskpointer 0 0 lock is held (fast release possible)
39 * taskpointer 0 1 task is pending owner
40 * taskpointer 1 0 lock is held and has waiters
41 * taskpointer 1 1 task is pending owner and lock has more waiters
42 *
43 * Pending ownership is assigned to the top (highest priority)
44 * waiter of the lock, when the lock is released. The thread is woken
45 * up and can now take the lock. Until the lock is taken (bit 0
46 * cleared) a competing higher priority thread can steal the lock
47 * which puts the woken up thread back on the waiters list.
48 *
49 * The fast atomic compare exchange based acquire and release is only
50 * possible when bit 0 and 1 of lock->owner are 0.
51 *
52 * (*) There's a small time where the owner can be NULL and the
53 * "lock has waiters" bit is set. This can happen when grabbing the lock.
54 * To prevent a cmpxchg of the owner releasing the lock, we need to set this
55 * bit before looking at the lock, hence the reason this is a transitional
56 * state.
57 */
58
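As an aside on the owner encoding described in the comment above: the two status bits live in the low bits of the pointer-aligned task_struct pointer, and everything else is the owner pointer itself. The standalone sketch below shows that packing in isolation; RT_MUTEX_HAS_WAITERS and RT_MUTEX_OWNER_PENDING match names used elsewhere in this file, while RT_MUTEX_OWNER_MASKALL and the stand-in struct are assumptions for illustration only.

#include <stdio.h>
#include <stdint.h>

#define RT_MUTEX_OWNER_PENDING	1UL	/* bit 0: task is pending owner */
#define RT_MUTEX_HAS_WAITERS	2UL	/* bit 1: lock has waiters */
#define RT_MUTEX_OWNER_MASKALL	3UL

struct task {			/* stand-in with alignment wider than 2 bytes */
	long prio;
	char comm[16];
};

static struct task *owner_of(uintptr_t word)
{
	return (struct task *)(word & ~(uintptr_t)RT_MUTEX_OWNER_MASKALL);
}

static void show(uintptr_t owner)
{
	printf("owner=%s pending=%lu waiters=%lu\n", owner_of(owner)->comm,
	       (unsigned long)(owner & RT_MUTEX_OWNER_PENDING),
	       (unsigned long)((owner & RT_MUTEX_HAS_WAITERS) >> 1));
}

int main(void)
{
	static struct task t = { .prio = 120, .comm = "rt-test-0" };

	show((uintptr_t)&t | RT_MUTEX_OWNER_PENDING);	/* taskpointer 0 1 */
	show((uintptr_t)&t | RT_MUTEX_HAS_WAITERS);	/* taskpointer 1 0 */
	return 0;
}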
59static void
60rt_mutex_set_owner(struct rt_mutex *lock, struct task_struct *owner,
61 unsigned long mask)
62{
63 unsigned long val = (unsigned long)owner | mask;
64
65 if (rt_mutex_has_waiters(lock))
66 val |= RT_MUTEX_HAS_WAITERS;
67
68 lock->owner = (struct task_struct *)val;
69}
70
71static inline void clear_rt_mutex_waiters(struct rt_mutex *lock)
72{
73 lock->owner = (struct task_struct *)
74 ((unsigned long)lock->owner & ~RT_MUTEX_HAS_WAITERS);
75}
76
77static void fixup_rt_mutex_waiters(struct rt_mutex *lock)
78{
79 if (!rt_mutex_has_waiters(lock))
80 clear_rt_mutex_waiters(lock);
81}
82
83/*
84 * We can speed up the acquire/release, if the architecture
85 * supports cmpxchg and if there's no debugging state to be set up
86 */
87#if defined(__HAVE_ARCH_CMPXCHG) && !defined(CONFIG_DEBUG_RT_MUTEXES)
88# define rt_mutex_cmpxchg(l,c,n) (cmpxchg(&l->owner, c, n) == c)
89static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
90{
91 unsigned long owner, *p = (unsigned long *) &lock->owner;
92
93 do {
94 owner = *p;
95 } while (cmpxchg(p, owner, owner | RT_MUTEX_HAS_WAITERS) != owner);
96}
97#else
98# define rt_mutex_cmpxchg(l,c,n) (0)
99static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
100{
101 lock->owner = (struct task_struct *)
102 ((unsigned long)lock->owner | RT_MUTEX_HAS_WAITERS);
103}
104#endif
105
106/*
107 * Calculate task priority from the waiter list priority
108 *
109 * Return task->normal_prio when the waiter list is empty or when
110 * the waiter is not allowed to do priority boosting
111 */
112int rt_mutex_getprio(struct task_struct *task)
113{
114 if (likely(!task_has_pi_waiters(task)))
115 return task->normal_prio;
116
117 return min(task_top_pi_waiter(task)->pi_list_entry.prio,
118 task->normal_prio);
119}
120
121/*
122 * Adjust the priority of a task, after its pi_waiters got modified.
123 *
124 * This can be both boosting and unboosting. task->pi_lock must be held.
125 */
126static void __rt_mutex_adjust_prio(struct task_struct *task)
127{
128 int prio = rt_mutex_getprio(task);
129
130 if (task->prio != prio)
131 rt_mutex_setprio(task, prio);
132}
133
134/*
135 * Adjust task priority (undo boosting). Called from the exit path of
136 * rt_mutex_slowunlock() and rt_mutex_slowlock().
137 *
138 * (Note: We do this outside of the protection of lock->wait_lock to
139 * allow the lock to be taken while or before we readjust the priority
140 * of task. We do not use the spin_xx_mutex() variants here as we are
141 * outside of the debug path.)
142 */
143static void rt_mutex_adjust_prio(struct task_struct *task)
144{
145 unsigned long flags;
146
147 spin_lock_irqsave(&task->pi_lock, flags);
148 __rt_mutex_adjust_prio(task);
149 spin_unlock_irqrestore(&task->pi_lock, flags);
150}
151
152/*
153 * Max number of times we'll walk the boosting chain:
154 */
155int max_lock_depth = 1024;
156
157/*
158 * Adjust the priority chain. Also used for deadlock detection.
159 * Decreases task's usage by one - may thus free the task.
160 * Returns 0 or -EDEADLK.
161 */
162static int rt_mutex_adjust_prio_chain(struct task_struct *task,
163 int deadlock_detect,
164 struct rt_mutex *orig_lock,
165 struct rt_mutex_waiter *orig_waiter,
166 struct task_struct *top_task)
167{
168 struct rt_mutex *lock;
169 struct rt_mutex_waiter *waiter, *top_waiter = orig_waiter;
170 int detect_deadlock, ret = 0, depth = 0;
171 unsigned long flags;
172
173 detect_deadlock = debug_rt_mutex_detect_deadlock(orig_waiter,
174 deadlock_detect);
175
176 /*
177 * The (de)boosting is a step by step approach with a lot of
 178 * pitfalls. We want this to be preemptible and we want to hold a
179 * maximum of two locks per step. So we have to check
180 * carefully whether things change under us.
181 */
182 again:
183 if (++depth > max_lock_depth) {
184 static int prev_max;
185
186 /*
187 * Print this only once. If the admin changes the limit,
188 * print a new message when reaching the limit again.
189 */
190 if (prev_max != max_lock_depth) {
191 prev_max = max_lock_depth;
192 printk(KERN_WARNING "Maximum lock depth %d reached "
193 "task: %s (%d)\n", max_lock_depth,
194 top_task->comm, top_task->pid);
195 }
196 put_task_struct(task);
197
198 return deadlock_detect ? -EDEADLK : 0;
199 }
200 retry:
201 /*
202 * Task can not go away as we did a get_task() before !
203 */
204 spin_lock_irqsave(&task->pi_lock, flags);
205
206 waiter = task->pi_blocked_on;
207 /*
208 * Check whether the end of the boosting chain has been
209 * reached or the state of the chain has changed while we
210 * dropped the locks.
211 */
212 if (!waiter || !waiter->task)
213 goto out_unlock_pi;
214
215 if (top_waiter && (!task_has_pi_waiters(task) ||
216 top_waiter != task_top_pi_waiter(task)))
217 goto out_unlock_pi;
218
219 /*
220 * When deadlock detection is off then we check, if further
221 * priority adjustment is necessary.
222 */
223 if (!detect_deadlock && waiter->list_entry.prio == task->prio)
224 goto out_unlock_pi;
225
226 lock = waiter->lock;
227 if (!spin_trylock(&lock->wait_lock)) {
228 spin_unlock_irqrestore(&task->pi_lock, flags);
229 cpu_relax();
230 goto retry;
231 }
232
233 /* Deadlock detection */
234 if (lock == orig_lock || rt_mutex_owner(lock) == top_task) {
235 debug_rt_mutex_deadlock(deadlock_detect, orig_waiter, lock);
236 spin_unlock(&lock->wait_lock);
237 ret = deadlock_detect ? -EDEADLK : 0;
238 goto out_unlock_pi;
239 }
240
241 top_waiter = rt_mutex_top_waiter(lock);
242
243 /* Requeue the waiter */
244 plist_del(&waiter->list_entry, &lock->wait_list);
245 waiter->list_entry.prio = task->prio;
246 plist_add(&waiter->list_entry, &lock->wait_list);
247
248 /* Release the task */
249 spin_unlock_irqrestore(&task->pi_lock, flags);
250 put_task_struct(task);
251
252 /* Grab the next task */
253 task = rt_mutex_owner(lock);
254 get_task_struct(task);
255 spin_lock_irqsave(&task->pi_lock, flags);
256
257 if (waiter == rt_mutex_top_waiter(lock)) {
258 /* Boost the owner */
259 plist_del(&top_waiter->pi_list_entry, &task->pi_waiters);
260 waiter->pi_list_entry.prio = waiter->list_entry.prio;
261 plist_add(&waiter->pi_list_entry, &task->pi_waiters);
262 __rt_mutex_adjust_prio(task);
263
264 } else if (top_waiter == waiter) {
265 /* Deboost the owner */
266 plist_del(&waiter->pi_list_entry, &task->pi_waiters);
267 waiter = rt_mutex_top_waiter(lock);
268 waiter->pi_list_entry.prio = waiter->list_entry.prio;
269 plist_add(&waiter->pi_list_entry, &task->pi_waiters);
270 __rt_mutex_adjust_prio(task);
271 }
272
273 spin_unlock_irqrestore(&task->pi_lock, flags);
274
275 top_waiter = rt_mutex_top_waiter(lock);
276 spin_unlock(&lock->wait_lock);
277
278 if (!detect_deadlock && waiter != top_waiter)
279 goto out_put_task;
280
281 goto again;
282
283 out_unlock_pi:
284 spin_unlock_irqrestore(&task->pi_lock, flags);
285 out_put_task:
286 put_task_struct(task);
287
288 return ret;
289}
290
291/*
292 * Optimization: check if we can steal the lock from the
293 * assigned pending owner [which might not have taken the
294 * lock yet]:
295 */
296static inline int try_to_steal_lock(struct rt_mutex *lock)
297{
298 struct task_struct *pendowner = rt_mutex_owner(lock);
299 struct rt_mutex_waiter *next;
300 unsigned long flags;
301
302 if (!rt_mutex_owner_pending(lock))
303 return 0;
304
305 if (pendowner == current)
306 return 1;
307
308 spin_lock_irqsave(&pendowner->pi_lock, flags);
309 if (current->prio >= pendowner->prio) {
310 spin_unlock_irqrestore(&pendowner->pi_lock, flags);
311 return 0;
312 }
313
314 /*
315 * Check if a waiter is enqueued on the pending owners
316 * pi_waiters list. Remove it and readjust pending owners
317 * priority.
318 */
319 if (likely(!rt_mutex_has_waiters(lock))) {
320 spin_unlock_irqrestore(&pendowner->pi_lock, flags);
321 return 1;
322 }
323
324 /* No chain handling, pending owner is not blocked on anything: */
325 next = rt_mutex_top_waiter(lock);
326 plist_del(&next->pi_list_entry, &pendowner->pi_waiters);
327 __rt_mutex_adjust_prio(pendowner);
328 spin_unlock_irqrestore(&pendowner->pi_lock, flags);
329
330 /*
331 * We are going to steal the lock and a waiter was
332 * enqueued on the pending owners pi_waiters queue. So
333 * we have to enqueue this waiter into
334 * current->pi_waiters list. This covers the case,
335 * where current is boosted because it holds another
336 * lock and gets unboosted because the booster is
337 * interrupted, so we would delay a waiter with higher
338 * priority as current->normal_prio.
339 *
340 * Note: in the rare case of a SCHED_OTHER task changing
341 * its priority and thus stealing the lock, next->task
342 * might be current:
343 */
344 if (likely(next->task != current)) {
345 spin_lock_irqsave(&current->pi_lock, flags);
346 plist_add(&next->pi_list_entry, &current->pi_waiters);
347 __rt_mutex_adjust_prio(current);
348 spin_unlock_irqrestore(&current->pi_lock, flags);
349 }
350 return 1;
351}
352
353/*
354 * Try to take an rt-mutex
355 *
356 * This fails
357 * - when the lock has a real owner
358 * - when a different pending owner exists and has higher priority than current
359 *
360 * Must be called with lock->wait_lock held.
361 */
362static int try_to_take_rt_mutex(struct rt_mutex *lock)
363{
364 /*
365 * We have to be careful here if the atomic speedups are
366 * enabled, such that, when
367 * - no other waiter is on the lock
368 * - the lock has been released since we did the cmpxchg
369 * the lock can be released or taken while we are doing the
370 * checks and marking the lock with RT_MUTEX_HAS_WAITERS.
371 *
372 * The atomic acquire/release aware variant of
373 * mark_rt_mutex_waiters uses a cmpxchg loop. After setting
374 * the WAITERS bit, the atomic release / acquire can not
375 * happen anymore and lock->wait_lock protects us from the
376 * non-atomic case.
377 *
378 * Note, that this might set lock->owner =
379 * RT_MUTEX_HAS_WAITERS in the case the lock is not contended
380 * any more. This is fixed up when we take the ownership.
381 * This is the transitional state explained at the top of this file.
382 */
383 mark_rt_mutex_waiters(lock);
384
385 if (rt_mutex_owner(lock) && !try_to_steal_lock(lock))
386 return 0;
387
388 /* We got the lock. */
389 debug_rt_mutex_lock(lock);
390
391 rt_mutex_set_owner(lock, current, 0);
392
393 rt_mutex_deadlock_account_lock(lock, current);
394
395 return 1;
396}
397
398/*
399 * Task blocks on lock.
400 *
401 * Prepare waiter and propagate pi chain
402 *
403 * This must be called with lock->wait_lock held.
404 */
405static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
406 struct rt_mutex_waiter *waiter,
407 int detect_deadlock)
408{
409 struct task_struct *owner = rt_mutex_owner(lock);
410 struct rt_mutex_waiter *top_waiter = waiter;
411 unsigned long flags;
412 int chain_walk = 0, res;
413
414 spin_lock_irqsave(&current->pi_lock, flags);
415 __rt_mutex_adjust_prio(current);
416 waiter->task = current;
417 waiter->lock = lock;
418 plist_node_init(&waiter->list_entry, current->prio);
419 plist_node_init(&waiter->pi_list_entry, current->prio);
420
421 /* Get the top priority waiter on the lock */
422 if (rt_mutex_has_waiters(lock))
423 top_waiter = rt_mutex_top_waiter(lock);
424 plist_add(&waiter->list_entry, &lock->wait_list);
425
426 current->pi_blocked_on = waiter;
427
428 spin_unlock_irqrestore(&current->pi_lock, flags);
429
430 if (waiter == rt_mutex_top_waiter(lock)) {
431 spin_lock_irqsave(&owner->pi_lock, flags);
432 plist_del(&top_waiter->pi_list_entry, &owner->pi_waiters);
433 plist_add(&waiter->pi_list_entry, &owner->pi_waiters);
434
435 __rt_mutex_adjust_prio(owner);
436 if (owner->pi_blocked_on)
437 chain_walk = 1;
438 spin_unlock_irqrestore(&owner->pi_lock, flags);
439 }
440 else if (debug_rt_mutex_detect_deadlock(waiter, detect_deadlock))
441 chain_walk = 1;
442
443 if (!chain_walk)
444 return 0;
445
446 /*
447 * The owner can't disappear while holding a lock,
448 * so the owner struct is protected by wait_lock.
449 * Gets dropped in rt_mutex_adjust_prio_chain()!
450 */
451 get_task_struct(owner);
452
453 spin_unlock(&lock->wait_lock);
454
455 res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock, waiter,
456 current);
457
458 spin_lock(&lock->wait_lock);
459
460 return res;
461}
462
463/*
464 * Wake up the next waiter on the lock.
465 *
466 * Remove the top waiter from the current tasks waiter list and from
467 * the lock waiter list. Set it as pending owner. Then wake it up.
468 *
469 * Called with lock->wait_lock held.
470 */
471static void wakeup_next_waiter(struct rt_mutex *lock)
472{
473 struct rt_mutex_waiter *waiter;
474 struct task_struct *pendowner;
475 unsigned long flags;
476
477 spin_lock_irqsave(&current->pi_lock, flags);
478
479 waiter = rt_mutex_top_waiter(lock);
480 plist_del(&waiter->list_entry, &lock->wait_list);
481
482 /*
483 * Remove it from current->pi_waiters. We do not adjust a
484 * possible priority boost right now. We execute wakeup in the
485 * boosted mode and go back to normal after releasing
486 * lock->wait_lock.
487 */
488 plist_del(&waiter->pi_list_entry, &current->pi_waiters);
489 pendowner = waiter->task;
490 waiter->task = NULL;
491
492 rt_mutex_set_owner(lock, pendowner, RT_MUTEX_OWNER_PENDING);
493
494 spin_unlock_irqrestore(&current->pi_lock, flags);
495
496 /*
497 * Clear the pi_blocked_on variable and enqueue a possible
498 * waiter into the pi_waiters list of the pending owner. This
499 * prevents that in case the pending owner gets unboosted a
500 * waiter with higher priority than pending-owner->normal_prio
501 * is blocked on the unboosted (pending) owner.
502 */
503 spin_lock_irqsave(&pendowner->pi_lock, flags);
504
505 WARN_ON(!pendowner->pi_blocked_on);
506 WARN_ON(pendowner->pi_blocked_on != waiter);
507 WARN_ON(pendowner->pi_blocked_on->lock != lock);
508
509 pendowner->pi_blocked_on = NULL;
510
511 if (rt_mutex_has_waiters(lock)) {
512 struct rt_mutex_waiter *next;
513
514 next = rt_mutex_top_waiter(lock);
515 plist_add(&next->pi_list_entry, &pendowner->pi_waiters);
516 }
517 spin_unlock_irqrestore(&pendowner->pi_lock, flags);
518
519 wake_up_process(pendowner);
520}
521
522/*
523 * Remove a waiter from a lock
524 *
525 * Must be called with lock->wait_lock held
526 */
527static void remove_waiter(struct rt_mutex *lock,
528 struct rt_mutex_waiter *waiter)
529{
530 int first = (waiter == rt_mutex_top_waiter(lock));
531 struct task_struct *owner = rt_mutex_owner(lock);
532 unsigned long flags;
533 int chain_walk = 0;
534
535 spin_lock_irqsave(&current->pi_lock, flags);
536 plist_del(&waiter->list_entry, &lock->wait_list);
537 waiter->task = NULL;
538 current->pi_blocked_on = NULL;
539 spin_unlock_irqrestore(&current->pi_lock, flags);
540
541 if (first && owner != current) {
542
543 spin_lock_irqsave(&owner->pi_lock, flags);
544
545 plist_del(&waiter->pi_list_entry, &owner->pi_waiters);
546
547 if (rt_mutex_has_waiters(lock)) {
548 struct rt_mutex_waiter *next;
549
550 next = rt_mutex_top_waiter(lock);
551 plist_add(&next->pi_list_entry, &owner->pi_waiters);
552 }
553 __rt_mutex_adjust_prio(owner);
554
555 if (owner->pi_blocked_on)
556 chain_walk = 1;
557
558 spin_unlock_irqrestore(&owner->pi_lock, flags);
559 }
560
561 WARN_ON(!plist_node_empty(&waiter->pi_list_entry));
562
563 if (!chain_walk)
564 return;
565
566 /* gets dropped in rt_mutex_adjust_prio_chain()! */
567 get_task_struct(owner);
568
569 spin_unlock(&lock->wait_lock);
570
571 rt_mutex_adjust_prio_chain(owner, 0, lock, NULL, current);
572
573 spin_lock(&lock->wait_lock);
574}
575
576/*
577 * Recheck the pi chain, in case we got a priority change
578 *
579 * Called from sched_setscheduler
580 */
581void rt_mutex_adjust_pi(struct task_struct *task)
582{
583 struct rt_mutex_waiter *waiter;
584 unsigned long flags;
585
586 spin_lock_irqsave(&task->pi_lock, flags);
587
588 waiter = task->pi_blocked_on;
589 if (!waiter || waiter->list_entry.prio == task->prio) {
590 spin_unlock_irqrestore(&task->pi_lock, flags);
591 return;
592 }
593
594 spin_unlock_irqrestore(&task->pi_lock, flags);
595
596 /* gets dropped in rt_mutex_adjust_prio_chain()! */
597 get_task_struct(task);
598 rt_mutex_adjust_prio_chain(task, 0, NULL, NULL, task);
599}
600
601/*
602 * Slow path lock function:
603 */
604static int __sched
605rt_mutex_slowlock(struct rt_mutex *lock, int state,
606 struct hrtimer_sleeper *timeout,
607 int detect_deadlock)
608{
609 struct rt_mutex_waiter waiter;
610 int ret = 0;
611
612 debug_rt_mutex_init_waiter(&waiter);
613 waiter.task = NULL;
614
615 spin_lock(&lock->wait_lock);
616
617 /* Try to acquire the lock again: */
618 if (try_to_take_rt_mutex(lock)) {
619 spin_unlock(&lock->wait_lock);
620 return 0;
621 }
622
623 set_current_state(state);
624
625	/* Set up the timer when timeout != NULL */
626 if (unlikely(timeout))
627 hrtimer_start(&timeout->timer, timeout->timer.expires,
628 HRTIMER_ABS);
629
630 for (;;) {
631 /* Try to acquire the lock: */
632 if (try_to_take_rt_mutex(lock))
633 break;
634
635 /*
636 * TASK_INTERRUPTIBLE checks for signals and
637 * timeout. Ignored otherwise.
638 */
639 if (unlikely(state == TASK_INTERRUPTIBLE)) {
640 /* Signal pending? */
641 if (signal_pending(current))
642 ret = -EINTR;
643 if (timeout && !timeout->task)
644 ret = -ETIMEDOUT;
645 if (ret)
646 break;
647 }
648
649 /*
650 * waiter.task is NULL the first time we come here and
651 * when we have been woken up by the previous owner
652 * but the lock got stolen by a higher prio task.
653 */
654 if (!waiter.task) {
655 ret = task_blocks_on_rt_mutex(lock, &waiter,
656 detect_deadlock);
657 /*
658 * If we got woken up by the owner then start loop
659 * all over without going into schedule to try
660 * to get the lock now:
661 */
662 if (unlikely(!waiter.task))
663 continue;
664
665 if (unlikely(ret))
666 break;
667 }
668
669 spin_unlock(&lock->wait_lock);
670
671 debug_rt_mutex_print_deadlock(&waiter);
672
673 if (waiter.task)
674 schedule_rt_mutex(lock);
675
676 spin_lock(&lock->wait_lock);
677 set_current_state(state);
678 }
679
680 set_current_state(TASK_RUNNING);
681
682 if (unlikely(waiter.task))
683 remove_waiter(lock, &waiter);
684
685 /*
686 * try_to_take_rt_mutex() sets the waiter bit
687 * unconditionally. We might have to fix that up.
688 */
689 fixup_rt_mutex_waiters(lock);
690
691 spin_unlock(&lock->wait_lock);
692
693 /* Remove pending timer: */
694 if (unlikely(timeout))
695 hrtimer_cancel(&timeout->timer);
696
697 /*
698	 * Readjust priority if we did not get the lock. We might
699 * have been the pending owner and boosted. Since we did not
700 * take the lock, the PI boost has to go.
701 */
702 if (unlikely(ret))
703 rt_mutex_adjust_prio(current);
704
705 debug_rt_mutex_free_waiter(&waiter);
706
707 return ret;
708}
709
710/*
711 * Slow path try-lock function:
712 */
713static inline int
714rt_mutex_slowtrylock(struct rt_mutex *lock)
715{
716 int ret = 0;
717
718 spin_lock(&lock->wait_lock);
719
720 if (likely(rt_mutex_owner(lock) != current)) {
721
722 ret = try_to_take_rt_mutex(lock);
723 /*
724 * try_to_take_rt_mutex() sets the lock waiters
725 * bit unconditionally. Clean this up.
726 */
727 fixup_rt_mutex_waiters(lock);
728 }
729
730 spin_unlock(&lock->wait_lock);
731
732 return ret;
733}
734
735/*
736 * Slow path to release a rt-mutex:
737 */
738static void __sched
739rt_mutex_slowunlock(struct rt_mutex *lock)
740{
741 spin_lock(&lock->wait_lock);
742
743 debug_rt_mutex_unlock(lock);
744
745 rt_mutex_deadlock_account_unlock(current);
746
747 if (!rt_mutex_has_waiters(lock)) {
748 lock->owner = NULL;
749 spin_unlock(&lock->wait_lock);
750 return;
751 }
752
753 wakeup_next_waiter(lock);
754
755 spin_unlock(&lock->wait_lock);
756
757 /* Undo pi boosting if necessary: */
758 rt_mutex_adjust_prio(current);
759}
760
761/*
762 * Debug-aware fast / slowpath lock, trylock and unlock
763 *
764 * The atomic acquire/release ops are compiled away when either the
765 * architecture does not support cmpxchg or debugging is enabled.
766 */
767static inline int
768rt_mutex_fastlock(struct rt_mutex *lock, int state,
769 int detect_deadlock,
770 int (*slowfn)(struct rt_mutex *lock, int state,
771 struct hrtimer_sleeper *timeout,
772 int detect_deadlock))
773{
774 if (!detect_deadlock && likely(rt_mutex_cmpxchg(lock, NULL, current))) {
775 rt_mutex_deadlock_account_lock(lock, current);
776 return 0;
777 } else
778 return slowfn(lock, state, NULL, detect_deadlock);
779}
780
781static inline int
782rt_mutex_timed_fastlock(struct rt_mutex *lock, int state,
783 struct hrtimer_sleeper *timeout, int detect_deadlock,
784 int (*slowfn)(struct rt_mutex *lock, int state,
785 struct hrtimer_sleeper *timeout,
786 int detect_deadlock))
787{
788 if (!detect_deadlock && likely(rt_mutex_cmpxchg(lock, NULL, current))) {
789 rt_mutex_deadlock_account_lock(lock, current);
790 return 0;
791 } else
792 return slowfn(lock, state, timeout, detect_deadlock);
793}
794
795static inline int
796rt_mutex_fasttrylock(struct rt_mutex *lock,
797 int (*slowfn)(struct rt_mutex *lock))
798{
799 if (likely(rt_mutex_cmpxchg(lock, NULL, current))) {
800 rt_mutex_deadlock_account_lock(lock, current);
801 return 1;
802 }
803 return slowfn(lock);
804}
805
806static inline void
807rt_mutex_fastunlock(struct rt_mutex *lock,
808 void (*slowfn)(struct rt_mutex *lock))
809{
810 if (likely(rt_mutex_cmpxchg(lock, current, NULL)))
811 rt_mutex_deadlock_account_unlock(current);
812 else
813 slowfn(lock);
814}
815
816/**
817 * rt_mutex_lock - lock a rt_mutex
818 *
819 * @lock: the rt_mutex to be locked
820 */
821void __sched rt_mutex_lock(struct rt_mutex *lock)
822{
823 might_sleep();
824
825 rt_mutex_fastlock(lock, TASK_UNINTERRUPTIBLE, 0, rt_mutex_slowlock);
826}
827EXPORT_SYMBOL_GPL(rt_mutex_lock);
828
829/**
830 * rt_mutex_lock_interruptible - lock a rt_mutex, interruptible by signals
831 *
832 * @lock: the rt_mutex to be locked
833 * @detect_deadlock: deadlock detection on/off
834 *
835 * Returns:
836 * 0 on success
837 * -EINTR when interrupted by a signal
838 * -EDEADLK when the lock would deadlock (when deadlock detection is on)
839 */
840int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock,
841 int detect_deadlock)
842{
843 might_sleep();
844
845 return rt_mutex_fastlock(lock, TASK_INTERRUPTIBLE,
846 detect_deadlock, rt_mutex_slowlock);
847}
848EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible);
849
850/**
851 * rt_mutex_timed_lock - lock a rt_mutex interruptibly, with a timeout;
852 *					the timeout structure is provided
853 *					by the caller
854 *
855 * @lock: the rt_mutex to be locked
856 * @timeout: timeout structure or NULL (no timeout)
857 * @detect_deadlock: deadlock detection on/off
858 *
859 * Returns:
860 * 0 on success
861 * -EINTR when interrupted by a signal
862 * -ETIMEDOUT when the timeout expired
863 * -EDEADLK when the lock would deadlock (when deadlock detection is on)
864 */
865int
866rt_mutex_timed_lock(struct rt_mutex *lock, struct hrtimer_sleeper *timeout,
867 int detect_deadlock)
868{
869 might_sleep();
870
871 return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout,
872 detect_deadlock, rt_mutex_slowlock);
873}
874EXPORT_SYMBOL_GPL(rt_mutex_timed_lock);
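A rough usage sketch of the timed variant follows. It assumes the hrtimer_sleeper helpers of this kernel series (hrtimer_init(), hrtimer_init_sleeper(), the HRTIMER_ABS mode) and an absolute expiry supplied by the caller; treat it as an illustration rather than code from the patch. Note that rt_mutex_slowlock() arms the timer itself, so the caller only has to fill in the sleeper:

static int lock_with_deadline(struct rt_mutex *lock, ktime_t abs_expiry)
{
	struct hrtimer_sleeper timeout;

	/* Absolute-time sleeper; the slowpath starts it when we block. */
	hrtimer_init(&timeout.timer, CLOCK_MONOTONIC, HRTIMER_ABS);
	hrtimer_init_sleeper(&timeout, current);
	timeout.timer.expires = abs_expiry;

	/* 0 on success, -EINTR on a signal, -ETIMEDOUT past the deadline. */
	return rt_mutex_timed_lock(lock, &timeout, 0);
}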
875
876/**
877 * rt_mutex_trylock - try to lock a rt_mutex
878 *
879 * @lock: the rt_mutex to be locked
880 *
881 * Returns 1 on success and 0 on contention
882 */
883int __sched rt_mutex_trylock(struct rt_mutex *lock)
884{
885 return rt_mutex_fasttrylock(lock, rt_mutex_slowtrylock);
886}
887EXPORT_SYMBOL_GPL(rt_mutex_trylock);
888
889/**
890 * rt_mutex_unlock - unlock a rt_mutex
891 *
892 * @lock: the rt_mutex to be unlocked
893 */
894void __sched rt_mutex_unlock(struct rt_mutex *lock)
895{
896 rt_mutex_fastunlock(lock, rt_mutex_slowunlock);
897}
898EXPORT_SYMBOL_GPL(rt_mutex_unlock);
899
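For orientation, a minimal critical-section sketch built on the calls exported above; it assumes a struct rt_mutex that has already been initialized elsewhere (see __rt_mutex_init() below), and the names are illustrative:

static struct rt_mutex my_lock;	/* assumed initialized elsewhere */
static int my_counter;

static void counter_inc(void)
{
	rt_mutex_lock(&my_lock);	/* may sleep; boosts the owner if we have to wait */
	my_counter++;
	rt_mutex_unlock(&my_lock);	/* drops any PI boost we picked up from waiters */
}

static int counter_inc_nonblocking(void)
{
	if (!rt_mutex_trylock(&my_lock))
		return -EBUSY;		/* 0 from trylock means contention, no blocking */
	my_counter++;
	rt_mutex_unlock(&my_lock);
	return 0;
}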
900/**
901 * rt_mutex_destroy - mark a mutex unusable
902 * @lock: the mutex to be destroyed
903 *
904 * This function marks the mutex uninitialized, and any subsequent
905 * use of the mutex is forbidden. The mutex must not be locked when
906 * this function is called.
907 */
908void rt_mutex_destroy(struct rt_mutex *lock)
909{
910 WARN_ON(rt_mutex_is_locked(lock));
911#ifdef CONFIG_DEBUG_RT_MUTEXES
912 lock->magic = NULL;
913#endif
914}
915
916EXPORT_SYMBOL_GPL(rt_mutex_destroy);
917
918/**
919 * __rt_mutex_init - initialize the rt lock
920 *
921 * @lock: the rt lock to be initialized
922 *
923 * Initialize the rt lock to the unlocked state.
924 *
925 * Initializing a locked rt lock is not allowed.
926 */
927void __rt_mutex_init(struct rt_mutex *lock, const char *name)
928{
929 lock->owner = NULL;
930 spin_lock_init(&lock->wait_lock);
931 plist_head_init(&lock->wait_list, &lock->wait_lock);
932
933 debug_rt_mutex_init(lock, name);
934}
935EXPORT_SYMBOL_GPL(__rt_mutex_init);
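Callers normally do not invoke __rt_mutex_init() directly; the following is a hedged sketch of the expected wrappers, assuming the DEFINE_RT_MUTEX() and rt_mutex_init() helpers that this patch series adds to include/linux/rtmutex.h:

/* Compile-time initialization of a global lock. */
static DEFINE_RT_MUTEX(global_lock);

/* Run-time initialization of an embedded lock. */
struct my_object {
	struct rt_mutex lock;
	int data;
};

static void my_object_init(struct my_object *obj)
{
	rt_mutex_init(&obj->lock);	/* wraps __rt_mutex_init(&obj->lock, "name") */
	obj->data = 0;
}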
936
937/**
938 * rt_mutex_init_proxy_locked - initialize and lock a rt_mutex on behalf of a
939 * proxy owner
940 *
941 * @lock: the rt_mutex to be locked
942 * @proxy_owner: the task to set as owner
943 *
944 * No locking. The caller has to do the serializing itself.
945 * Special API call for PI-futex support
946 */
947void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
948 struct task_struct *proxy_owner)
949{
950 __rt_mutex_init(lock, NULL);
951 debug_rt_mutex_proxy_lock(lock, proxy_owner);
952 rt_mutex_set_owner(lock, proxy_owner, 0);
953 rt_mutex_deadlock_account_lock(lock, proxy_owner);
954}
955
956/**
957 * rt_mutex_proxy_unlock - release a lock on behalf of owner
958 *
959 * @lock: the rt_mutex to be unlocked
960 *
961 * No locking. The caller has to do the serializing itself.
962 * Special API call for PI-futex support
963 */
964void rt_mutex_proxy_unlock(struct rt_mutex *lock,
965 struct task_struct *proxy_owner)
966{
967 debug_rt_mutex_proxy_unlock(lock);
968 rt_mutex_set_owner(lock, NULL, 0);
969 rt_mutex_deadlock_account_unlock(proxy_owner);
970}
971
972/**
973 * rt_mutex_next_owner - return the next owner of the lock
974 *
975 * @lock: the rt lock to query
976 *
977 * Returns the next owner of the lock or NULL
978 *
979 * Caller has to serialize against other accessors to the lock
980 * itself.
981 *
982 * Special API call for PI-futex support
983 */
984struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock)
985{
986 if (!rt_mutex_has_waiters(lock))
987 return NULL;
988
989 return rt_mutex_top_waiter(lock)->task;
990}
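The proxy calls above are consumed by the PI-futex code, which acquires and releases kernel rt_mutexes on behalf of user-space tasks. Below is a heavily condensed, hypothetical sketch of that pattern; the wrapper names and the surrounding serialization are assumptions, not taken from kernel/futex.c:

/*
 * Attach: mark 'owner' as holding the kernel-side lock without it ever
 * having called rt_mutex_lock(). The caller must already hold whatever
 * lock serializes this pi state.
 */
static void pi_state_attach(struct rt_mutex *lock, struct task_struct *owner)
{
	rt_mutex_init_proxy_locked(lock, owner);
}

/* Query which task, if any, is queued to take the lock next. */
static struct task_struct *pi_state_top_waiter(struct rt_mutex *lock)
{
	return rt_mutex_next_owner(lock);	/* NULL when nobody waits */
}

/* Detach: release the lock on behalf of 'owner', again serialized by the caller. */
static void pi_state_detach(struct rt_mutex *lock, struct task_struct *owner)
{
	rt_mutex_proxy_unlock(lock, owner);
}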
diff --git a/kernel/rtmutex.h b/kernel/rtmutex.h
new file mode 100644
index 000000000000..a1a1dd06421d
--- /dev/null
+++ b/kernel/rtmutex.h
@@ -0,0 +1,26 @@
1/*
2 * RT-Mutexes: blocking mutual exclusion locks with PI support
3 *
4 * started by Ingo Molnar and Thomas Gleixner:
5 *
6 * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
7 * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
8 *
9 * This file contains macros used solely by rtmutex.c.
10 * Non-debug version.
11 */
12
13#define rt_mutex_deadlock_check(l) (0)
14#define rt_mutex_deadlock_account_lock(m, t) do { } while (0)
15#define rt_mutex_deadlock_account_unlock(l) do { } while (0)
16#define debug_rt_mutex_init_waiter(w) do { } while (0)
17#define debug_rt_mutex_free_waiter(w) do { } while (0)
18#define debug_rt_mutex_lock(l) do { } while (0)
19#define debug_rt_mutex_proxy_lock(l,p) do { } while (0)
20#define debug_rt_mutex_proxy_unlock(l) do { } while (0)
21#define debug_rt_mutex_unlock(l) do { } while (0)
22#define debug_rt_mutex_init(m, n) do { } while (0)
23#define debug_rt_mutex_deadlock(d, a ,l) do { } while (0)
24#define debug_rt_mutex_print_deadlock(w) do { } while (0)
25#define debug_rt_mutex_detect_deadlock(w,d) (d)
26#define debug_rt_mutex_reset_waiter(w) do { } while (0)
diff --git a/kernel/rtmutex_common.h b/kernel/rtmutex_common.h
new file mode 100644
index 000000000000..9c75856e791e
--- /dev/null
+++ b/kernel/rtmutex_common.h
@@ -0,0 +1,123 @@
1/*
2 * RT Mutexes: blocking mutual exclusion locks with PI support
3 *
4 * started by Ingo Molnar and Thomas Gleixner:
5 *
6 * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
7 * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
8 *
9 * This file contains the private data structure and API definitions.
10 */
11
12#ifndef __KERNEL_RTMUTEX_COMMON_H
13#define __KERNEL_RTMUTEX_COMMON_H
14
15#include <linux/rtmutex.h>
16
17/*
18 * The in-kernel rtmutex tester is independent of rtmutex debugging. We
19 * call schedule_rt_mutex_test() instead of schedule() for the tasks which
20 * belong to the tester. That way we can delay the wakeup path of those
21 * threads to provoke lock stealing and testing of complex boosting scenarios.
22 */
23#ifdef CONFIG_RT_MUTEX_TESTER
24
25extern void schedule_rt_mutex_test(struct rt_mutex *lock);
26
27#define schedule_rt_mutex(_lock) \
28 do { \
29 if (!(current->flags & PF_MUTEX_TESTER)) \
30 schedule(); \
31 else \
32 schedule_rt_mutex_test(_lock); \
33 } while (0)
34
35#else
36# define schedule_rt_mutex(_lock) schedule()
37#endif
38
39/*
40 * This is the control structure for tasks blocked on a rt_mutex,
41 * which is allocated on the kernel stack of the blocked task.
42 *
43 * @list_entry: pi node to enqueue into the mutex waiters list
44 * @pi_list_entry: pi node to enqueue into the mutex owner waiters list
45 * @task: task reference to the blocked task
46 */
47struct rt_mutex_waiter {
48 struct plist_node list_entry;
49 struct plist_node pi_list_entry;
50 struct task_struct *task;
51 struct rt_mutex *lock;
52#ifdef CONFIG_DEBUG_RT_MUTEXES
53 unsigned long ip;
54 pid_t deadlock_task_pid;
55 struct rt_mutex *deadlock_lock;
56#endif
57};
58
59/*
60 * Various helpers to access the waiters-plist:
61 */
62static inline int rt_mutex_has_waiters(struct rt_mutex *lock)
63{
64 return !plist_head_empty(&lock->wait_list);
65}
66
67static inline struct rt_mutex_waiter *
68rt_mutex_top_waiter(struct rt_mutex *lock)
69{
70 struct rt_mutex_waiter *w;
71
72 w = plist_first_entry(&lock->wait_list, struct rt_mutex_waiter,
73 list_entry);
74 BUG_ON(w->lock != lock);
75
76 return w;
77}
78
79static inline int task_has_pi_waiters(struct task_struct *p)
80{
81 return !plist_head_empty(&p->pi_waiters);
82}
83
84static inline struct rt_mutex_waiter *
85task_top_pi_waiter(struct task_struct *p)
86{
87 return plist_first_entry(&p->pi_waiters, struct rt_mutex_waiter,
88 pi_list_entry);
89}
90
91/*
92 * lock->owner state tracking:
93 */
94#define RT_MUTEX_OWNER_PENDING 1UL
95#define RT_MUTEX_HAS_WAITERS 2UL
96#define RT_MUTEX_OWNER_MASKALL 3UL
97
98static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock)
99{
100 return (struct task_struct *)
101 ((unsigned long)lock->owner & ~RT_MUTEX_OWNER_MASKALL);
102}
103
104static inline struct task_struct *rt_mutex_real_owner(struct rt_mutex *lock)
105{
106 return (struct task_struct *)
107 ((unsigned long)lock->owner & ~RT_MUTEX_HAS_WAITERS);
108}
109
110static inline unsigned long rt_mutex_owner_pending(struct rt_mutex *lock)
111{
112 return (unsigned long)lock->owner & RT_MUTEX_OWNER_PENDING;
113}
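The helpers above work because the two low bits of lock->owner double as state flags, relying on task_struct pointers being at least 4-byte aligned. A small stand-alone sketch of the same pointer-tagging idea (ordinary user-space C, not kernel code):

#include <assert.h>
#include <stdio.h>

#define OWNER_PENDING	1UL	/* mirrors RT_MUTEX_OWNER_PENDING */
#define HAS_WAITERS	2UL	/* mirrors RT_MUTEX_HAS_WAITERS   */
#define MASKALL		3UL	/* mirrors RT_MUTEX_OWNER_MASKALL */

struct task { int pid; };

int main(void)
{
	static struct task t = { .pid = 42 };	/* aligned, so the low bits are free */
	unsigned long owner_word;

	/* Encode: owner pointer plus "pending owner" and "has waiters" flags. */
	owner_word = (unsigned long)&t | OWNER_PENDING | HAS_WAITERS;

	/* Decode the way rt_mutex_owner() and rt_mutex_owner_pending() do. */
	struct task *owner = (struct task *)(owner_word & ~MASKALL);
	int pending = (owner_word & OWNER_PENDING) != 0;
	int waiters = (owner_word & HAS_WAITERS) != 0;

	assert(owner == &t);
	printf("pid=%d pending=%d has_waiters=%d\n", owner->pid, pending, waiters);
	return 0;
}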
114
115/*
116 * PI-futex support (proxy locking functions, etc.):
117 */
118extern struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock);
119extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
120 struct task_struct *proxy_owner);
121extern void rt_mutex_proxy_unlock(struct rt_mutex *lock,
122 struct task_struct *proxy_owner);
123#endif
diff --git a/kernel/rwsem.c b/kernel/rwsem.c
new file mode 100644
index 000000000000..291ded556aa0
--- /dev/null
+++ b/kernel/rwsem.c
@@ -0,0 +1,147 @@
1/* kernel/rwsem.c: R/W semaphores, public implementation
2 *
3 * Written by David Howells (dhowells@redhat.com).
4 * Derived from asm-i386/semaphore.h
5 */
6
7#include <linux/types.h>
8#include <linux/kernel.h>
9#include <linux/module.h>
10#include <linux/rwsem.h>
11
12#include <asm/system.h>
13#include <asm/atomic.h>
14
15/*
16 * lock for reading
17 */
18void down_read(struct rw_semaphore *sem)
19{
20 might_sleep();
21 rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_);
22
23 __down_read(sem);
24}
25
26EXPORT_SYMBOL(down_read);
27
28/*
29 * trylock for reading -- returns 1 if successful, 0 if contention
30 */
31int down_read_trylock(struct rw_semaphore *sem)
32{
33 int ret = __down_read_trylock(sem);
34
35 if (ret == 1)
36 rwsem_acquire_read(&sem->dep_map, 0, 1, _RET_IP_);
37 return ret;
38}
39
40EXPORT_SYMBOL(down_read_trylock);
41
42/*
43 * lock for writing
44 */
45void down_write(struct rw_semaphore *sem)
46{
47 might_sleep();
48 rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_);
49
50 __down_write(sem);
51}
52
53EXPORT_SYMBOL(down_write);
54
55/*
56 * trylock for writing -- returns 1 if successful, 0 if contention
57 */
58int down_write_trylock(struct rw_semaphore *sem)
59{
60 int ret = __down_write_trylock(sem);
61
62 if (ret == 1)
63 rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_);
64 return ret;
65}
66
67EXPORT_SYMBOL(down_write_trylock);
68
69/*
70 * release a read lock
71 */
72void up_read(struct rw_semaphore *sem)
73{
74 rwsem_release(&sem->dep_map, 1, _RET_IP_);
75
76 __up_read(sem);
77}
78
79EXPORT_SYMBOL(up_read);
80
81/*
82 * release a write lock
83 */
84void up_write(struct rw_semaphore *sem)
85{
86 rwsem_release(&sem->dep_map, 1, _RET_IP_);
87
88 __up_write(sem);
89}
90
91EXPORT_SYMBOL(up_write);
92
93/*
94 * downgrade write lock to read lock
95 */
96void downgrade_write(struct rw_semaphore *sem)
97{
98 /*
99 * lockdep: a downgraded write will live on as a write
100 * dependency.
101 */
102 __downgrade_write(sem);
103}
104
105EXPORT_SYMBOL(downgrade_write);
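A short usage sketch of the wrappers above, assuming the usual DECLARE_RWSEM() initializer from linux/rwsem.h; the variable names are illustrative:

static DECLARE_RWSEM(cfg_sem);
static int cfg_value;

static int cfg_read(void)
{
	int v;

	down_read(&cfg_sem);		/* shared: many readers may hold it at once */
	v = cfg_value;
	up_read(&cfg_sem);
	return v;
}

static void cfg_update(int v)
{
	down_write(&cfg_sem);		/* exclusive; tracked by lockdep when enabled */
	cfg_value = v;
	downgrade_write(&cfg_sem);	/* let readers in, but keep writers out */
	/* ... read back consistently alongside other readers ... */
	up_read(&cfg_sem);
}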
106
107#ifdef CONFIG_DEBUG_LOCK_ALLOC
108
109void down_read_nested(struct rw_semaphore *sem, int subclass)
110{
111 might_sleep();
112 rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_);
113
114 __down_read(sem);
115}
116
117EXPORT_SYMBOL(down_read_nested);
118
119void down_read_non_owner(struct rw_semaphore *sem)
120{
121 might_sleep();
122
123 __down_read(sem);
124}
125
126EXPORT_SYMBOL(down_read_non_owner);
127
128void down_write_nested(struct rw_semaphore *sem, int subclass)
129{
130 might_sleep();
131 rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_);
132
133 __down_write_nested(sem, subclass);
134}
135
136EXPORT_SYMBOL(down_write_nested);
137
138void up_read_non_owner(struct rw_semaphore *sem)
139{
140 __up_read(sem);
141}
142
143EXPORT_SYMBOL(up_read_non_owner);
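The _nested and _non_owner variants above exist for the benefit of the lock validator (the _non_owner calls deliberately skip its owner tracking). A hedged sketch of the nesting annotation, assuming two rw_semaphores of the same lock class that are always taken in parent-then-child order; the subclass constant is hypothetical, and without CONFIG_DEBUG_LOCK_ALLOC down_read_nested() is expected to fall back to plain down_read():

#define CHILD_SEM_NESTING	1	/* hypothetical lockdep subclass */

static void lock_parent_and_child(struct rw_semaphore *parent,
				  struct rw_semaphore *child)
{
	down_read(parent);				/* subclass 0 */
	down_read_nested(child, CHILD_SEM_NESTING);	/* tell lockdep it is nested */
}

static void unlock_parent_and_child(struct rw_semaphore *parent,
				    struct rw_semaphore *child)
{
	up_read(child);
	up_read(parent);
}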
144
145#endif
146
147
diff --git a/kernel/sched.c b/kernel/sched.c
index 5dbc42694477..74f169ac0773 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -30,6 +30,7 @@
30#include <linux/capability.h> 30#include <linux/capability.h>
31#include <linux/completion.h> 31#include <linux/completion.h>
32#include <linux/kernel_stat.h> 32#include <linux/kernel_stat.h>
33#include <linux/debug_locks.h>
33#include <linux/security.h> 34#include <linux/security.h>
34#include <linux/notifier.h> 35#include <linux/notifier.h>
35#include <linux/profile.h> 36#include <linux/profile.h>
@@ -50,6 +51,7 @@
50#include <linux/times.h> 51#include <linux/times.h>
51#include <linux/acct.h> 52#include <linux/acct.h>
52#include <linux/kprobes.h> 53#include <linux/kprobes.h>
54#include <linux/delayacct.h>
53#include <asm/tlb.h> 55#include <asm/tlb.h>
54 56
55#include <asm/unistd.h> 57#include <asm/unistd.h>
@@ -168,29 +170,28 @@
168 */ 170 */
169 171
170#define SCALE_PRIO(x, prio) \ 172#define SCALE_PRIO(x, prio) \
171 max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO/2), MIN_TIMESLICE) 173 max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE)
172 174
173static unsigned int task_timeslice(task_t *p) 175static unsigned int static_prio_timeslice(int static_prio)
174{ 176{
175 if (p->static_prio < NICE_TO_PRIO(0)) 177 if (static_prio < NICE_TO_PRIO(0))
176 return SCALE_PRIO(DEF_TIMESLICE*4, p->static_prio); 178 return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio);
177 else 179 else
178 return SCALE_PRIO(DEF_TIMESLICE, p->static_prio); 180 return SCALE_PRIO(DEF_TIMESLICE, static_prio);
181}
182
183static inline unsigned int task_timeslice(struct task_struct *p)
184{
185 return static_prio_timeslice(p->static_prio);
179} 186}
180#define task_hot(p, now, sd) ((long long) ((now) - (p)->last_ran) \
181 < (long long) (sd)->cache_hot_time)
182 187
183/* 188/*
184 * These are the runqueue data structures: 189 * These are the runqueue data structures:
185 */ 190 */
186 191
187#define BITMAP_SIZE ((((MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long))
188
189typedef struct runqueue runqueue_t;
190
191struct prio_array { 192struct prio_array {
192 unsigned int nr_active; 193 unsigned int nr_active;
193 unsigned long bitmap[BITMAP_SIZE]; 194 DECLARE_BITMAP(bitmap, MAX_PRIO+1); /* include 1 bit for delimiter */
194 struct list_head queue[MAX_PRIO]; 195 struct list_head queue[MAX_PRIO];
195}; 196};
196 197
@@ -201,7 +202,7 @@ struct prio_array {
201 * (such as the load balancing or the thread migration code), lock 202 * (such as the load balancing or the thread migration code), lock
202 * acquire operations must be ordered by ascending &runqueue. 203 * acquire operations must be ordered by ascending &runqueue.
203 */ 204 */
204struct runqueue { 205struct rq {
205 spinlock_t lock; 206 spinlock_t lock;
206 207
207 /* 208 /*
@@ -209,6 +210,7 @@ struct runqueue {
209 * remote CPUs use both these fields when doing load calculation. 210 * remote CPUs use both these fields when doing load calculation.
210 */ 211 */
211 unsigned long nr_running; 212 unsigned long nr_running;
213 unsigned long raw_weighted_load;
212#ifdef CONFIG_SMP 214#ifdef CONFIG_SMP
213 unsigned long cpu_load[3]; 215 unsigned long cpu_load[3];
214#endif 216#endif
@@ -224,9 +226,9 @@ struct runqueue {
224 226
225 unsigned long expired_timestamp; 227 unsigned long expired_timestamp;
226 unsigned long long timestamp_last_tick; 228 unsigned long long timestamp_last_tick;
227 task_t *curr, *idle; 229 struct task_struct *curr, *idle;
228 struct mm_struct *prev_mm; 230 struct mm_struct *prev_mm;
229 prio_array_t *active, *expired, arrays[2]; 231 struct prio_array *active, *expired, arrays[2];
230 int best_expired_prio; 232 int best_expired_prio;
231 atomic_t nr_iowait; 233 atomic_t nr_iowait;
232 234
@@ -236,10 +238,10 @@ struct runqueue {
236 /* For active balancing */ 238 /* For active balancing */
237 int active_balance; 239 int active_balance;
238 int push_cpu; 240 int push_cpu;
241 int cpu; /* cpu of this runqueue */
239 242
240 task_t *migration_thread; 243 struct task_struct *migration_thread;
241 struct list_head migration_queue; 244 struct list_head migration_queue;
242 int cpu;
243#endif 245#endif
244 246
245#ifdef CONFIG_SCHEDSTATS 247#ifdef CONFIG_SCHEDSTATS
@@ -261,9 +263,19 @@ struct runqueue {
261 unsigned long ttwu_cnt; 263 unsigned long ttwu_cnt;
262 unsigned long ttwu_local; 264 unsigned long ttwu_local;
263#endif 265#endif
266 struct lock_class_key rq_lock_key;
264}; 267};
265 268
266static DEFINE_PER_CPU(struct runqueue, runqueues); 269static DEFINE_PER_CPU(struct rq, runqueues);
270
271static inline int cpu_of(struct rq *rq)
272{
273#ifdef CONFIG_SMP
274 return rq->cpu;
275#else
276 return 0;
277#endif
278}
267 279
268/* 280/*
269 * The domain tree (rq->sd) is protected by RCU's quiescent state transition. 281 * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
@@ -272,8 +284,8 @@ static DEFINE_PER_CPU(struct runqueue, runqueues);
272 * The domain tree of any CPU may only be accessed from within 284 * The domain tree of any CPU may only be accessed from within
273 * preempt-disabled sections. 285 * preempt-disabled sections.
274 */ 286 */
275#define for_each_domain(cpu, domain) \ 287#define for_each_domain(cpu, __sd) \
276for (domain = rcu_dereference(cpu_rq(cpu)->sd); domain; domain = domain->parent) 288 for (__sd = rcu_dereference(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)
277 289
278#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) 290#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
279#define this_rq() (&__get_cpu_var(runqueues)) 291#define this_rq() (&__get_cpu_var(runqueues))
@@ -288,26 +300,33 @@ for (domain = rcu_dereference(cpu_rq(cpu)->sd); domain; domain = domain->parent)
288#endif 300#endif
289 301
290#ifndef __ARCH_WANT_UNLOCKED_CTXSW 302#ifndef __ARCH_WANT_UNLOCKED_CTXSW
291static inline int task_running(runqueue_t *rq, task_t *p) 303static inline int task_running(struct rq *rq, struct task_struct *p)
292{ 304{
293 return rq->curr == p; 305 return rq->curr == p;
294} 306}
295 307
296static inline void prepare_lock_switch(runqueue_t *rq, task_t *next) 308static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
297{ 309{
298} 310}
299 311
300static inline void finish_lock_switch(runqueue_t *rq, task_t *prev) 312static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
301{ 313{
302#ifdef CONFIG_DEBUG_SPINLOCK 314#ifdef CONFIG_DEBUG_SPINLOCK
303 /* this is a valid case when another task releases the spinlock */ 315 /* this is a valid case when another task releases the spinlock */
304 rq->lock.owner = current; 316 rq->lock.owner = current;
305#endif 317#endif
318 /*
319 * If we are tracking spinlock dependencies then we have to
320 * fix up the runqueue lock - which gets 'carried over' from
321 * prev into current:
322 */
323 spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
324
306 spin_unlock_irq(&rq->lock); 325 spin_unlock_irq(&rq->lock);
307} 326}
308 327
309#else /* __ARCH_WANT_UNLOCKED_CTXSW */ 328#else /* __ARCH_WANT_UNLOCKED_CTXSW */
310static inline int task_running(runqueue_t *rq, task_t *p) 329static inline int task_running(struct rq *rq, struct task_struct *p)
311{ 330{
312#ifdef CONFIG_SMP 331#ifdef CONFIG_SMP
313 return p->oncpu; 332 return p->oncpu;
@@ -316,7 +335,7 @@ static inline int task_running(runqueue_t *rq, task_t *p)
316#endif 335#endif
317} 336}
318 337
319static inline void prepare_lock_switch(runqueue_t *rq, task_t *next) 338static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
320{ 339{
321#ifdef CONFIG_SMP 340#ifdef CONFIG_SMP
322 /* 341 /*
@@ -333,7 +352,7 @@ static inline void prepare_lock_switch(runqueue_t *rq, task_t *next)
333#endif 352#endif
334} 353}
335 354
336static inline void finish_lock_switch(runqueue_t *rq, task_t *prev) 355static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
337{ 356{
338#ifdef CONFIG_SMP 357#ifdef CONFIG_SMP
339 /* 358 /*
@@ -351,14 +370,33 @@ static inline void finish_lock_switch(runqueue_t *rq, task_t *prev)
351#endif /* __ARCH_WANT_UNLOCKED_CTXSW */ 370#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
352 371
353/* 372/*
373 * __task_rq_lock - lock the runqueue a given task resides on.
374 * Must be called interrupts disabled.
375 */
376static inline struct rq *__task_rq_lock(struct task_struct *p)
377 __acquires(rq->lock)
378{
379 struct rq *rq;
380
381repeat_lock_task:
382 rq = task_rq(p);
383 spin_lock(&rq->lock);
384 if (unlikely(rq != task_rq(p))) {
385 spin_unlock(&rq->lock);
386 goto repeat_lock_task;
387 }
388 return rq;
389}
390
391/*
354 * task_rq_lock - lock the runqueue a given task resides on and disable 392 * task_rq_lock - lock the runqueue a given task resides on and disable
355 * interrupts. Note the ordering: we can safely lookup the task_rq without 393 * interrupts. Note the ordering: we can safely lookup the task_rq without
356 * explicitly disabling preemption. 394 * explicitly disabling preemption.
357 */ 395 */
358static inline runqueue_t *task_rq_lock(task_t *p, unsigned long *flags) 396static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
359 __acquires(rq->lock) 397 __acquires(rq->lock)
360{ 398{
361 struct runqueue *rq; 399 struct rq *rq;
362 400
363repeat_lock_task: 401repeat_lock_task:
364 local_irq_save(*flags); 402 local_irq_save(*flags);
@@ -371,7 +409,13 @@ repeat_lock_task:
371 return rq; 409 return rq;
372} 410}
373 411
374static inline void task_rq_unlock(runqueue_t *rq, unsigned long *flags) 412static inline void __task_rq_unlock(struct rq *rq)
413 __releases(rq->lock)
414{
415 spin_unlock(&rq->lock);
416}
417
418static inline void task_rq_unlock(struct rq *rq, unsigned long *flags)
375 __releases(rq->lock) 419 __releases(rq->lock)
376{ 420{
377 spin_unlock_irqrestore(&rq->lock, *flags); 421 spin_unlock_irqrestore(&rq->lock, *flags);
@@ -391,7 +435,7 @@ static int show_schedstat(struct seq_file *seq, void *v)
391 seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION); 435 seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION);
392 seq_printf(seq, "timestamp %lu\n", jiffies); 436 seq_printf(seq, "timestamp %lu\n", jiffies);
393 for_each_online_cpu(cpu) { 437 for_each_online_cpu(cpu) {
394 runqueue_t *rq = cpu_rq(cpu); 438 struct rq *rq = cpu_rq(cpu);
395#ifdef CONFIG_SMP 439#ifdef CONFIG_SMP
396 struct sched_domain *sd; 440 struct sched_domain *sd;
397 int dcnt = 0; 441 int dcnt = 0;
@@ -468,9 +512,36 @@ struct file_operations proc_schedstat_operations = {
468 .release = single_release, 512 .release = single_release,
469}; 513};
470 514
515/*
516 * Expects runqueue lock to be held for atomicity of update
517 */
518static inline void
519rq_sched_info_arrive(struct rq *rq, unsigned long delta_jiffies)
520{
521 if (rq) {
522 rq->rq_sched_info.run_delay += delta_jiffies;
523 rq->rq_sched_info.pcnt++;
524 }
525}
526
527/*
528 * Expects runqueue lock to be held for atomicity of update
529 */
530static inline void
531rq_sched_info_depart(struct rq *rq, unsigned long delta_jiffies)
532{
533 if (rq)
534 rq->rq_sched_info.cpu_time += delta_jiffies;
535}
471# define schedstat_inc(rq, field) do { (rq)->field++; } while (0) 536# define schedstat_inc(rq, field) do { (rq)->field++; } while (0)
472# define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0) 537# define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0)
473#else /* !CONFIG_SCHEDSTATS */ 538#else /* !CONFIG_SCHEDSTATS */
539static inline void
540rq_sched_info_arrive(struct rq *rq, unsigned long delta_jiffies)
541{}
542static inline void
543rq_sched_info_depart(struct rq *rq, unsigned long delta_jiffies)
544{}
474# define schedstat_inc(rq, field) do { } while (0) 545# define schedstat_inc(rq, field) do { } while (0)
475# define schedstat_add(rq, field, amt) do { } while (0) 546# define schedstat_add(rq, field, amt) do { } while (0)
476#endif 547#endif
@@ -478,10 +549,10 @@ struct file_operations proc_schedstat_operations = {
478/* 549/*
479 * rq_lock - lock a given runqueue and disable interrupts. 550 * rq_lock - lock a given runqueue and disable interrupts.
480 */ 551 */
481static inline runqueue_t *this_rq_lock(void) 552static inline struct rq *this_rq_lock(void)
482 __acquires(rq->lock) 553 __acquires(rq->lock)
483{ 554{
484 runqueue_t *rq; 555 struct rq *rq;
485 556
486 local_irq_disable(); 557 local_irq_disable();
487 rq = this_rq(); 558 rq = this_rq();
@@ -490,7 +561,7 @@ static inline runqueue_t *this_rq_lock(void)
490 return rq; 561 return rq;
491} 562}
492 563
493#ifdef CONFIG_SCHEDSTATS 564#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
494/* 565/*
495 * Called when a process is dequeued from the active array and given 566 * Called when a process is dequeued from the active array and given
496 * the cpu. We should note that with the exception of interactive 567 * the cpu. We should note that with the exception of interactive
@@ -506,7 +577,7 @@ static inline runqueue_t *this_rq_lock(void)
506 * long it was from the *first* time it was queued to the time that it 577 * long it was from the *first* time it was queued to the time that it
507 * finally hit a cpu. 578 * finally hit a cpu.
508 */ 579 */
509static inline void sched_info_dequeued(task_t *t) 580static inline void sched_info_dequeued(struct task_struct *t)
510{ 581{
511 t->sched_info.last_queued = 0; 582 t->sched_info.last_queued = 0;
512} 583}
@@ -516,23 +587,18 @@ static inline void sched_info_dequeued(task_t *t)
516 * long it was waiting to run. We also note when it began so that we 587 * long it was waiting to run. We also note when it began so that we
517 * can keep stats on how long its timeslice is. 588 * can keep stats on how long its timeslice is.
518 */ 589 */
519static void sched_info_arrive(task_t *t) 590static void sched_info_arrive(struct task_struct *t)
520{ 591{
521 unsigned long now = jiffies, diff = 0; 592 unsigned long now = jiffies, delta_jiffies = 0;
522 struct runqueue *rq = task_rq(t);
523 593
524 if (t->sched_info.last_queued) 594 if (t->sched_info.last_queued)
525 diff = now - t->sched_info.last_queued; 595 delta_jiffies = now - t->sched_info.last_queued;
526 sched_info_dequeued(t); 596 sched_info_dequeued(t);
527 t->sched_info.run_delay += diff; 597 t->sched_info.run_delay += delta_jiffies;
528 t->sched_info.last_arrival = now; 598 t->sched_info.last_arrival = now;
529 t->sched_info.pcnt++; 599 t->sched_info.pcnt++;
530 600
531 if (!rq) 601 rq_sched_info_arrive(task_rq(t), delta_jiffies);
532 return;
533
534 rq->rq_sched_info.run_delay += diff;
535 rq->rq_sched_info.pcnt++;
536} 602}
537 603
538/* 604/*
@@ -550,25 +616,23 @@ static void sched_info_arrive(task_t *t)
550 * the timestamp if it is already not set. It's assumed that 616 * the timestamp if it is already not set. It's assumed that
551 * sched_info_dequeued() will clear that stamp when appropriate. 617 * sched_info_dequeued() will clear that stamp when appropriate.
552 */ 618 */
553static inline void sched_info_queued(task_t *t) 619static inline void sched_info_queued(struct task_struct *t)
554{ 620{
555 if (!t->sched_info.last_queued) 621 if (unlikely(sched_info_on()))
556 t->sched_info.last_queued = jiffies; 622 if (!t->sched_info.last_queued)
623 t->sched_info.last_queued = jiffies;
557} 624}
558 625
559/* 626/*
560 * Called when a process ceases being the active-running process, either 627 * Called when a process ceases being the active-running process, either
561 * voluntarily or involuntarily. Now we can calculate how long we ran. 628 * voluntarily or involuntarily. Now we can calculate how long we ran.
562 */ 629 */
563static inline void sched_info_depart(task_t *t) 630static inline void sched_info_depart(struct task_struct *t)
564{ 631{
565 struct runqueue *rq = task_rq(t); 632 unsigned long delta_jiffies = jiffies - t->sched_info.last_arrival;
566 unsigned long diff = jiffies - t->sched_info.last_arrival;
567
568 t->sched_info.cpu_time += diff;
569 633
570 if (rq) 634 t->sched_info.cpu_time += delta_jiffies;
571 rq->rq_sched_info.cpu_time += diff; 635 rq_sched_info_depart(task_rq(t), delta_jiffies);
572} 636}
573 637
574/* 638/*
@@ -576,9 +640,10 @@ static inline void sched_info_depart(task_t *t)
576 * their time slice. (This may also be called when switching to or from 640 * their time slice. (This may also be called when switching to or from
577 * the idle task.) We are only called when prev != next. 641 * the idle task.) We are only called when prev != next.
578 */ 642 */
579static inline void sched_info_switch(task_t *prev, task_t *next) 643static inline void
644__sched_info_switch(struct task_struct *prev, struct task_struct *next)
580{ 645{
581 struct runqueue *rq = task_rq(prev); 646 struct rq *rq = task_rq(prev);
582 647
583 /* 648 /*
584 * prev now departs the cpu. It's not interesting to record 649 * prev now departs the cpu. It's not interesting to record
@@ -591,15 +656,21 @@ static inline void sched_info_switch(task_t *prev, task_t *next)
591 if (next != rq->idle) 656 if (next != rq->idle)
592 sched_info_arrive(next); 657 sched_info_arrive(next);
593} 658}
659static inline void
660sched_info_switch(struct task_struct *prev, struct task_struct *next)
661{
662 if (unlikely(sched_info_on()))
663 __sched_info_switch(prev, next);
664}
594#else 665#else
595#define sched_info_queued(t) do { } while (0) 666#define sched_info_queued(t) do { } while (0)
596#define sched_info_switch(t, next) do { } while (0) 667#define sched_info_switch(t, next) do { } while (0)
597#endif /* CONFIG_SCHEDSTATS */ 668#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */
598 669
599/* 670/*
600 * Adding/removing a task to/from a priority array: 671 * Adding/removing a task to/from a priority array:
601 */ 672 */
602static void dequeue_task(struct task_struct *p, prio_array_t *array) 673static void dequeue_task(struct task_struct *p, struct prio_array *array)
603{ 674{
604 array->nr_active--; 675 array->nr_active--;
605 list_del(&p->run_list); 676 list_del(&p->run_list);
@@ -607,7 +678,7 @@ static void dequeue_task(struct task_struct *p, prio_array_t *array)
607 __clear_bit(p->prio, array->bitmap); 678 __clear_bit(p->prio, array->bitmap);
608} 679}
609 680
610static void enqueue_task(struct task_struct *p, prio_array_t *array) 681static void enqueue_task(struct task_struct *p, struct prio_array *array)
611{ 682{
612 sched_info_queued(p); 683 sched_info_queued(p);
613 list_add_tail(&p->run_list, array->queue + p->prio); 684 list_add_tail(&p->run_list, array->queue + p->prio);
@@ -620,12 +691,13 @@ static void enqueue_task(struct task_struct *p, prio_array_t *array)
620 * Put task to the end of the run list without the overhead of dequeue 691 * Put task to the end of the run list without the overhead of dequeue
621 * followed by enqueue. 692 * followed by enqueue.
622 */ 693 */
623static void requeue_task(struct task_struct *p, prio_array_t *array) 694static void requeue_task(struct task_struct *p, struct prio_array *array)
624{ 695{
625 list_move_tail(&p->run_list, array->queue + p->prio); 696 list_move_tail(&p->run_list, array->queue + p->prio);
626} 697}
627 698
628static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array) 699static inline void
700enqueue_task_head(struct task_struct *p, struct prio_array *array)
629{ 701{
630 list_add(&p->run_list, array->queue + p->prio); 702 list_add(&p->run_list, array->queue + p->prio);
631 __set_bit(p->prio, array->bitmap); 703 __set_bit(p->prio, array->bitmap);
@@ -634,7 +706,7 @@ static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array)
634} 706}
635 707
636/* 708/*
637 * effective_prio - return the priority that is based on the static 709 * __normal_prio - return the priority that is based on the static
638 * priority but is modified by bonuses/penalties. 710 * priority but is modified by bonuses/penalties.
639 * 711 *
640 * We scale the actual sleep average [0 .... MAX_SLEEP_AVG] 712 * We scale the actual sleep average [0 .... MAX_SLEEP_AVG]
@@ -647,13 +719,11 @@ static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array)
647 * 719 *
648 * Both properties are important to certain workloads. 720 * Both properties are important to certain workloads.
649 */ 721 */
650static int effective_prio(task_t *p) 722
723static inline int __normal_prio(struct task_struct *p)
651{ 724{
652 int bonus, prio; 725 int bonus, prio;
653 726
654 if (rt_task(p))
655 return p->prio;
656
657 bonus = CURRENT_BONUS(p) - MAX_BONUS / 2; 727 bonus = CURRENT_BONUS(p) - MAX_BONUS / 2;
658 728
659 prio = p->static_prio - bonus; 729 prio = p->static_prio - bonus;
@@ -665,57 +735,165 @@ static int effective_prio(task_t *p)
665} 735}
666 736
667/* 737/*
738 * To aid in avoiding the subversion of "niceness" due to uneven distribution
739 * of tasks with abnormal "nice" values across CPUs, the contribution that
740 * each task makes to its run queue's load is weighted according to its
741 * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a
742 * scaled version of the new time slice allocation that they receive on time
743 * slice expiry etc.
744 */
745
746/*
747 * Assume: static_prio_timeslice(NICE_TO_PRIO(0)) == DEF_TIMESLICE
748 * If static_prio_timeslice() is ever changed to break this assumption then
749 * this code will need modification
750 */
751#define TIME_SLICE_NICE_ZERO DEF_TIMESLICE
752#define LOAD_WEIGHT(lp) \
753 (((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO)
754#define PRIO_TO_LOAD_WEIGHT(prio) \
755 LOAD_WEIGHT(static_prio_timeslice(prio))
756#define RTPRIO_TO_LOAD_WEIGHT(rp) \
757 (PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + LOAD_WEIGHT(rp))
758
759static void set_load_weight(struct task_struct *p)
760{
761 if (has_rt_policy(p)) {
762#ifdef CONFIG_SMP
763 if (p == task_rq(p)->migration_thread)
764 /*
765 * The migration thread does the actual balancing.
766 * Giving its load any weight will skew balancing
767 * adversely.
768 */
769 p->load_weight = 0;
770 else
771#endif
772 p->load_weight = RTPRIO_TO_LOAD_WEIGHT(p->rt_priority);
773 } else
774 p->load_weight = PRIO_TO_LOAD_WEIGHT(p->static_prio);
775}
776
777static inline void
778inc_raw_weighted_load(struct rq *rq, const struct task_struct *p)
779{
780 rq->raw_weighted_load += p->load_weight;
781}
782
783static inline void
784dec_raw_weighted_load(struct rq *rq, const struct task_struct *p)
785{
786 rq->raw_weighted_load -= p->load_weight;
787}
788
789static inline void inc_nr_running(struct task_struct *p, struct rq *rq)
790{
791 rq->nr_running++;
792 inc_raw_weighted_load(rq, p);
793}
794
795static inline void dec_nr_running(struct task_struct *p, struct rq *rq)
796{
797 rq->nr_running--;
798 dec_raw_weighted_load(rq, p);
799}
800
801/*
802 * Calculate the expected normal priority: i.e. priority
803 * without taking RT-inheritance into account. Might be
804 * boosted by interactivity modifiers. Changes upon fork,
805 * setprio syscalls, and whenever the interactivity
806 * estimator recalculates.
807 */
808static inline int normal_prio(struct task_struct *p)
809{
810 int prio;
811
812 if (has_rt_policy(p))
813 prio = MAX_RT_PRIO-1 - p->rt_priority;
814 else
815 prio = __normal_prio(p);
816 return prio;
817}
818
819/*
820 * Calculate the current priority, i.e. the priority
821 * taken into account by the scheduler. This value might
822 * be boosted by RT tasks, or might be boosted by
823 * interactivity modifiers. Will be RT if the task got
824 * RT-boosted. If not then it returns p->normal_prio.
825 */
826static int effective_prio(struct task_struct *p)
827{
828 p->normal_prio = normal_prio(p);
829 /*
830 * If we are RT tasks or we were boosted to RT priority,
831 * keep the priority unchanged. Otherwise, update priority
832 * to the normal priority:
833 */
834 if (!rt_prio(p->prio))
835 return p->normal_prio;
836 return p->prio;
837}
838
839/*
668 * __activate_task - move a task to the runqueue. 840 * __activate_task - move a task to the runqueue.
669 */ 841 */
670static void __activate_task(task_t *p, runqueue_t *rq) 842static void __activate_task(struct task_struct *p, struct rq *rq)
671{ 843{
672 prio_array_t *target = rq->active; 844 struct prio_array *target = rq->active;
673 845
674 if (batch_task(p)) 846 if (batch_task(p))
675 target = rq->expired; 847 target = rq->expired;
676 enqueue_task(p, target); 848 enqueue_task(p, target);
677 rq->nr_running++; 849 inc_nr_running(p, rq);
678} 850}
679 851
680/* 852/*
681 * __activate_idle_task - move idle task to the _front_ of runqueue. 853 * __activate_idle_task - move idle task to the _front_ of runqueue.
682 */ 854 */
683static inline void __activate_idle_task(task_t *p, runqueue_t *rq) 855static inline void __activate_idle_task(struct task_struct *p, struct rq *rq)
684{ 856{
685 enqueue_task_head(p, rq->active); 857 enqueue_task_head(p, rq->active);
686 rq->nr_running++; 858 inc_nr_running(p, rq);
687} 859}
688 860
689static int recalc_task_prio(task_t *p, unsigned long long now) 861/*
862 * Recalculate p->normal_prio and p->prio after having slept,
863 * updating the sleep-average too:
864 */
865static int recalc_task_prio(struct task_struct *p, unsigned long long now)
690{ 866{
691 /* Caller must always ensure 'now >= p->timestamp' */ 867 /* Caller must always ensure 'now >= p->timestamp' */
692 unsigned long long __sleep_time = now - p->timestamp; 868 unsigned long sleep_time = now - p->timestamp;
693 unsigned long sleep_time;
694 869
695 if (batch_task(p)) 870 if (batch_task(p))
696 sleep_time = 0; 871 sleep_time = 0;
697 else {
698 if (__sleep_time > NS_MAX_SLEEP_AVG)
699 sleep_time = NS_MAX_SLEEP_AVG;
700 else
701 sleep_time = (unsigned long)__sleep_time;
702 }
703 872
704 if (likely(sleep_time > 0)) { 873 if (likely(sleep_time > 0)) {
705 /* 874 /*
706 * User tasks that sleep a long time are categorised as 875 * This ceiling is set to the lowest priority that would allow
707 * idle. They will only have their sleep_avg increased to a 876 * a task to be reinserted into the active array on timeslice
708 * level that makes them just interactive priority to stay 877 * completion.
709 * active yet prevent them suddenly becoming cpu hogs and
710 * starving other processes.
711 */ 878 */
712 if (p->mm && sleep_time > INTERACTIVE_SLEEP(p)) { 879 unsigned long ceiling = INTERACTIVE_SLEEP(p);
713 unsigned long ceiling;
714 880
715 ceiling = JIFFIES_TO_NS(MAX_SLEEP_AVG - 881 if (p->mm && sleep_time > ceiling && p->sleep_avg < ceiling) {
716 DEF_TIMESLICE); 882 /*
717 if (p->sleep_avg < ceiling) 883 * Prevents user tasks from achieving best priority
718 p->sleep_avg = ceiling; 884 * with one single large enough sleep.
885 */
886 p->sleep_avg = ceiling;
887 /*
888 * Using INTERACTIVE_SLEEP() as a ceiling places a
889 * nice(0) task 1ms sleep away from promotion, and
890 * gives it 700ms to round-robin with no chance of
891 * being demoted. This is more than generous, so
892 * mark this sleep as non-interactive to prevent the
893 * on-runqueue bonus logic from intervening should
894 * this task not receive cpu immediately.
895 */
896 p->sleep_type = SLEEP_NONINTERACTIVE;
719 } else { 897 } else {
720 /* 898 /*
721 * Tasks waking from uninterruptible sleep are 899 * Tasks waking from uninterruptible sleep are
@@ -723,12 +901,12 @@ static int recalc_task_prio(task_t *p, unsigned long long now)
723 * are likely to be waiting on I/O 901 * are likely to be waiting on I/O
724 */ 902 */
725 if (p->sleep_type == SLEEP_NONINTERACTIVE && p->mm) { 903 if (p->sleep_type == SLEEP_NONINTERACTIVE && p->mm) {
726 if (p->sleep_avg >= INTERACTIVE_SLEEP(p)) 904 if (p->sleep_avg >= ceiling)
727 sleep_time = 0; 905 sleep_time = 0;
728 else if (p->sleep_avg + sleep_time >= 906 else if (p->sleep_avg + sleep_time >=
729 INTERACTIVE_SLEEP(p)) { 907 ceiling) {
730 p->sleep_avg = INTERACTIVE_SLEEP(p); 908 p->sleep_avg = ceiling;
731 sleep_time = 0; 909 sleep_time = 0;
732 } 910 }
733 } 911 }
734 912
@@ -742,9 +920,9 @@ static int recalc_task_prio(task_t *p, unsigned long long now)
742 */ 920 */
743 p->sleep_avg += sleep_time; 921 p->sleep_avg += sleep_time;
744 922
745 if (p->sleep_avg > NS_MAX_SLEEP_AVG)
746 p->sleep_avg = NS_MAX_SLEEP_AVG;
747 } 923 }
924 if (p->sleep_avg > NS_MAX_SLEEP_AVG)
925 p->sleep_avg = NS_MAX_SLEEP_AVG;
748 } 926 }
749 927
750 return effective_prio(p); 928 return effective_prio(p);
@@ -756,7 +934,7 @@ static int recalc_task_prio(task_t *p, unsigned long long now)
756 * Update all the scheduling statistics stuff. (sleep average 934 * Update all the scheduling statistics stuff. (sleep average
757 * calculation, priority modifiers, etc.) 935 * calculation, priority modifiers, etc.)
758 */ 936 */
759static void activate_task(task_t *p, runqueue_t *rq, int local) 937static void activate_task(struct task_struct *p, struct rq *rq, int local)
760{ 938{
761 unsigned long long now; 939 unsigned long long now;
762 940
@@ -764,7 +942,7 @@ static void activate_task(task_t *p, runqueue_t *rq, int local)
764#ifdef CONFIG_SMP 942#ifdef CONFIG_SMP
765 if (!local) { 943 if (!local) {
766 /* Compensate for drifting sched_clock */ 944 /* Compensate for drifting sched_clock */
767 runqueue_t *this_rq = this_rq(); 945 struct rq *this_rq = this_rq();
768 now = (now - this_rq->timestamp_last_tick) 946 now = (now - this_rq->timestamp_last_tick)
769 + rq->timestamp_last_tick; 947 + rq->timestamp_last_tick;
770 } 948 }
@@ -803,9 +981,9 @@ static void activate_task(task_t *p, runqueue_t *rq, int local)
803/* 981/*
804 * deactivate_task - remove a task from the runqueue. 982 * deactivate_task - remove a task from the runqueue.
805 */ 983 */
806static void deactivate_task(struct task_struct *p, runqueue_t *rq) 984static void deactivate_task(struct task_struct *p, struct rq *rq)
807{ 985{
808 rq->nr_running--; 986 dec_nr_running(p, rq);
809 dequeue_task(p, p->array); 987 dequeue_task(p, p->array);
810 p->array = NULL; 988 p->array = NULL;
811} 989}
@@ -818,7 +996,12 @@ static void deactivate_task(struct task_struct *p, runqueue_t *rq)
818 * the target CPU. 996 * the target CPU.
819 */ 997 */
820#ifdef CONFIG_SMP 998#ifdef CONFIG_SMP
821static void resched_task(task_t *p) 999
1000#ifndef tsk_is_polling
1001#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
1002#endif
1003
1004static void resched_task(struct task_struct *p)
822{ 1005{
823 int cpu; 1006 int cpu;
824 1007
@@ -833,13 +1016,13 @@ static void resched_task(task_t *p)
833 if (cpu == smp_processor_id()) 1016 if (cpu == smp_processor_id())
834 return; 1017 return;
835 1018
836 /* NEED_RESCHED must be visible before we test POLLING_NRFLAG */ 1019 /* NEED_RESCHED must be visible before we test polling */
837 smp_mb(); 1020 smp_mb();
838 if (!test_tsk_thread_flag(p, TIF_POLLING_NRFLAG)) 1021 if (!tsk_is_polling(p))
839 smp_send_reschedule(cpu); 1022 smp_send_reschedule(cpu);
840} 1023}
841#else 1024#else
842static inline void resched_task(task_t *p) 1025static inline void resched_task(struct task_struct *p)
843{ 1026{
844 assert_spin_locked(&task_rq(p)->lock); 1027 assert_spin_locked(&task_rq(p)->lock);
845 set_tsk_need_resched(p); 1028 set_tsk_need_resched(p);
@@ -850,28 +1033,35 @@ static inline void resched_task(task_t *p)
850 * task_curr - is this task currently executing on a CPU? 1033 * task_curr - is this task currently executing on a CPU?
851 * @p: the task in question. 1034 * @p: the task in question.
852 */ 1035 */
853inline int task_curr(const task_t *p) 1036inline int task_curr(const struct task_struct *p)
854{ 1037{
855 return cpu_curr(task_cpu(p)) == p; 1038 return cpu_curr(task_cpu(p)) == p;
856} 1039}
857 1040
1041/* Used instead of source_load when we know the type == 0 */
1042unsigned long weighted_cpuload(const int cpu)
1043{
1044 return cpu_rq(cpu)->raw_weighted_load;
1045}
1046
858#ifdef CONFIG_SMP 1047#ifdef CONFIG_SMP
859typedef struct { 1048struct migration_req {
860 struct list_head list; 1049 struct list_head list;
861 1050
862 task_t *task; 1051 struct task_struct *task;
863 int dest_cpu; 1052 int dest_cpu;
864 1053
865 struct completion done; 1054 struct completion done;
866} migration_req_t; 1055};
867 1056
868/* 1057/*
869 * The task's runqueue lock must be held. 1058 * The task's runqueue lock must be held.
870 * Returns true if you have to wait for migration thread. 1059 * Returns true if you have to wait for migration thread.
871 */ 1060 */
872static int migrate_task(task_t *p, int dest_cpu, migration_req_t *req) 1061static int
1062migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
873{ 1063{
874 runqueue_t *rq = task_rq(p); 1064 struct rq *rq = task_rq(p);
875 1065
876 /* 1066 /*
877 * If the task is not on a runqueue (and not running), then 1067 * If the task is not on a runqueue (and not running), then
@@ -886,6 +1076,7 @@ static int migrate_task(task_t *p, int dest_cpu, migration_req_t *req)
886 req->task = p; 1076 req->task = p;
887 req->dest_cpu = dest_cpu; 1077 req->dest_cpu = dest_cpu;
888 list_add(&req->list, &rq->migration_queue); 1078 list_add(&req->list, &rq->migration_queue);
1079
889 return 1; 1080 return 1;
890} 1081}
891 1082
@@ -898,10 +1089,10 @@ static int migrate_task(task_t *p, int dest_cpu, migration_req_t *req)
898 * smp_call_function() if an IPI is sent by the same process we are 1089 * smp_call_function() if an IPI is sent by the same process we are
899 * waiting to become inactive. 1090 * waiting to become inactive.
900 */ 1091 */
901void wait_task_inactive(task_t *p) 1092void wait_task_inactive(struct task_struct *p)
902{ 1093{
903 unsigned long flags; 1094 unsigned long flags;
904 runqueue_t *rq; 1095 struct rq *rq;
905 int preempted; 1096 int preempted;
906 1097
907repeat: 1098repeat:
@@ -932,7 +1123,7 @@ repeat:
932 * to another CPU then no harm is done and the purpose has been 1123 * to another CPU then no harm is done and the purpose has been
933 * achieved as well. 1124 * achieved as well.
934 */ 1125 */
935void kick_process(task_t *p) 1126void kick_process(struct task_struct *p)
936{ 1127{
937 int cpu; 1128 int cpu;
938 1129
@@ -944,32 +1135,45 @@ void kick_process(task_t *p)
944} 1135}
945 1136
946/* 1137/*
947 * Return a low guess at the load of a migration-source cpu. 1138 * Return a low guess at the load of a migration-source cpu weighted
1139 * according to the scheduling class and "nice" value.
948 * 1140 *
949 * We want to under-estimate the load of migration sources, to 1141 * We want to under-estimate the load of migration sources, to
950 * balance conservatively. 1142 * balance conservatively.
951 */ 1143 */
952static inline unsigned long source_load(int cpu, int type) 1144static inline unsigned long source_load(int cpu, int type)
953{ 1145{
954 runqueue_t *rq = cpu_rq(cpu); 1146 struct rq *rq = cpu_rq(cpu);
955 unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE; 1147
956 if (type == 0) 1148 if (type == 0)
957 return load_now; 1149 return rq->raw_weighted_load;
958 1150
959 return min(rq->cpu_load[type-1], load_now); 1151 return min(rq->cpu_load[type-1], rq->raw_weighted_load);
960} 1152}
961 1153
962/* 1154/*
963 * Return a high guess at the load of a migration-target cpu 1155 * Return a high guess at the load of a migration-target cpu weighted
1156 * according to the scheduling class and "nice" value.
964 */ 1157 */
965static inline unsigned long target_load(int cpu, int type) 1158static inline unsigned long target_load(int cpu, int type)
966{ 1159{
967 runqueue_t *rq = cpu_rq(cpu); 1160 struct rq *rq = cpu_rq(cpu);
968 unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE; 1161
969 if (type == 0) 1162 if (type == 0)
970 return load_now; 1163 return rq->raw_weighted_load;
1164
1165 return max(rq->cpu_load[type-1], rq->raw_weighted_load);
1166}
971 1167
972 return max(rq->cpu_load[type-1], load_now); 1168/*
1169 * Return the average load per task on the cpu's run queue
1170 */
1171static inline unsigned long cpu_avg_load_per_task(int cpu)
1172{
1173 struct rq *rq = cpu_rq(cpu);
1174 unsigned long n = rq->nr_running;
1175
1176 return n ? rq->raw_weighted_load / n : SCHED_LOAD_SCALE;
973} 1177}
974 1178
975/* 1179/*
@@ -1042,7 +1246,7 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
1042 cpus_and(tmp, group->cpumask, p->cpus_allowed); 1246 cpus_and(tmp, group->cpumask, p->cpus_allowed);
1043 1247
1044 for_each_cpu_mask(i, tmp) { 1248 for_each_cpu_mask(i, tmp) {
1045 load = source_load(i, 0); 1249 load = weighted_cpuload(i);
1046 1250
1047 if (load < min_load || (load == min_load && i == this_cpu)) { 1251 if (load < min_load || (load == min_load && i == this_cpu)) {
1048 min_load = load; 1252 min_load = load;
@@ -1069,9 +1273,15 @@ static int sched_balance_self(int cpu, int flag)
1069 struct task_struct *t = current; 1273 struct task_struct *t = current;
1070 struct sched_domain *tmp, *sd = NULL; 1274 struct sched_domain *tmp, *sd = NULL;
1071 1275
1072 for_each_domain(cpu, tmp) 1276 for_each_domain(cpu, tmp) {
1277 /*
1278 * If power savings logic is enabled for a domain, stop there.
1279 */
1280 if (tmp->flags & SD_POWERSAVINGS_BALANCE)
1281 break;
1073 if (tmp->flags & flag) 1282 if (tmp->flags & flag)
1074 sd = tmp; 1283 sd = tmp;
1284 }
1075 1285
1076 while (sd) { 1286 while (sd) {
1077 cpumask_t span; 1287 cpumask_t span;
@@ -1116,7 +1326,7 @@ nextlevel:
1116 * Returns the CPU we should wake onto. 1326 * Returns the CPU we should wake onto.
1117 */ 1327 */
1118#if defined(ARCH_HAS_SCHED_WAKE_IDLE) 1328#if defined(ARCH_HAS_SCHED_WAKE_IDLE)
1119static int wake_idle(int cpu, task_t *p) 1329static int wake_idle(int cpu, struct task_struct *p)
1120{ 1330{
1121 cpumask_t tmp; 1331 cpumask_t tmp;
1122 struct sched_domain *sd; 1332 struct sched_domain *sd;
@@ -1139,7 +1349,7 @@ static int wake_idle(int cpu, task_t *p)
1139 return cpu; 1349 return cpu;
1140} 1350}
1141#else 1351#else
1142static inline int wake_idle(int cpu, task_t *p) 1352static inline int wake_idle(int cpu, struct task_struct *p)
1143{ 1353{
1144 return cpu; 1354 return cpu;
1145} 1355}
@@ -1159,15 +1369,15 @@ static inline int wake_idle(int cpu, task_t *p)
1159 * 1369 *
1160 * returns failure only if the task is already active. 1370 * returns failure only if the task is already active.
1161 */ 1371 */
1162static int try_to_wake_up(task_t *p, unsigned int state, int sync) 1372static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
1163{ 1373{
1164 int cpu, this_cpu, success = 0; 1374 int cpu, this_cpu, success = 0;
1165 unsigned long flags; 1375 unsigned long flags;
1166 long old_state; 1376 long old_state;
1167 runqueue_t *rq; 1377 struct rq *rq;
1168#ifdef CONFIG_SMP 1378#ifdef CONFIG_SMP
1169 unsigned long load, this_load;
1170 struct sched_domain *sd, *this_sd = NULL; 1379 struct sched_domain *sd, *this_sd = NULL;
1380 unsigned long load, this_load;
1171 int new_cpu; 1381 int new_cpu;
1172#endif 1382#endif
1173 1383
@@ -1221,17 +1431,19 @@ static int try_to_wake_up(task_t *p, unsigned int state, int sync)
1221 1431
1222 if (this_sd->flags & SD_WAKE_AFFINE) { 1432 if (this_sd->flags & SD_WAKE_AFFINE) {
1223 unsigned long tl = this_load; 1433 unsigned long tl = this_load;
1434 unsigned long tl_per_task = cpu_avg_load_per_task(this_cpu);
1435
1224 /* 1436 /*
1225 * If sync wakeup then subtract the (maximum possible) 1437 * If sync wakeup then subtract the (maximum possible)
1226 * effect of the currently running task from the load 1438 * effect of the currently running task from the load
1227 * of the current CPU: 1439 * of the current CPU:
1228 */ 1440 */
1229 if (sync) 1441 if (sync)
1230 tl -= SCHED_LOAD_SCALE; 1442 tl -= current->load_weight;
1231 1443
1232 if ((tl <= load && 1444 if ((tl <= load &&
1233 tl + target_load(cpu, idx) <= SCHED_LOAD_SCALE) || 1445 tl + target_load(cpu, idx) <= tl_per_task) ||
1234 100*(tl + SCHED_LOAD_SCALE) <= imbalance*load) { 1446 100*(tl + p->load_weight) <= imbalance*load) {
1235 /* 1447 /*
1236 * This domain has SD_WAKE_AFFINE and 1448 * This domain has SD_WAKE_AFFINE and
1237 * p is cache cold in this domain, and 1449 * p is cache cold in this domain, and
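
The affine-wakeup test in the hunk above now reasons in load_weight units: on a sync wakeup the waker's weight is discounted, and the wakee is pulled to this CPU only if this CPU stays below the remote CPU's average load per task, or the relative imbalance still favours it. A sketch of just that decision with the loads passed in as plain numbers (the weights and the 125% imbalance figure below are illustrative, not taken from the kernel):

#include <stdio.h>

/*
 * Decide whether to wake the task on this CPU (affine) instead of the CPU
 * it last ran on.  All arguments are weighted loads.
 */
static int wake_affine(unsigned long this_load,      /* tl */
                       unsigned long remote_load,    /* load on prev CPU */
                       unsigned long remote_target,  /* target_load(cpu, idx) */
                       unsigned long tl_per_task,    /* cpu_avg_load_per_task */
                       unsigned long curr_weight,    /* current->load_weight */
                       unsigned long p_weight,       /* p->load_weight */
                       unsigned int imbalance_pct,
                       int sync)
{
    unsigned long tl = this_load;

    /* A sync wakeup means "current" is about to sleep: discount its weight. */
    if (sync)
        tl -= curr_weight;

    return (tl <= remote_load && tl + remote_target <= tl_per_task) ||
           100 * (tl + p_weight) <= imbalance_pct * remote_load;
}

int main(void)
{
    /* this CPU lightly loaded, previous CPU busier: pull the wakee here */
    printf("%d\n", wake_affine(128, 384, 384, 128, 128, 128, 125, 1));
    return 0;
}
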
@@ -1315,15 +1527,14 @@ out:
1315 return success; 1527 return success;
1316} 1528}
1317 1529
1318int fastcall wake_up_process(task_t *p) 1530int fastcall wake_up_process(struct task_struct *p)
1319{ 1531{
1320 return try_to_wake_up(p, TASK_STOPPED | TASK_TRACED | 1532 return try_to_wake_up(p, TASK_STOPPED | TASK_TRACED |
1321 TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0); 1533 TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0);
1322} 1534}
1323
1324EXPORT_SYMBOL(wake_up_process); 1535EXPORT_SYMBOL(wake_up_process);
1325 1536
1326int fastcall wake_up_state(task_t *p, unsigned int state) 1537int fastcall wake_up_state(struct task_struct *p, unsigned int state)
1327{ 1538{
1328 return try_to_wake_up(p, state, 0); 1539 return try_to_wake_up(p, state, 0);
1329} 1540}
@@ -1332,7 +1543,7 @@ int fastcall wake_up_state(task_t *p, unsigned int state)
1332 * Perform scheduler related setup for a newly forked process p. 1543 * Perform scheduler related setup for a newly forked process p.
1333 * p is forked by current. 1544 * p is forked by current.
1334 */ 1545 */
1335void fastcall sched_fork(task_t *p, int clone_flags) 1546void fastcall sched_fork(struct task_struct *p, int clone_flags)
1336{ 1547{
1337 int cpu = get_cpu(); 1548 int cpu = get_cpu();
1338 1549
@@ -1348,10 +1559,17 @@ void fastcall sched_fork(task_t *p, int clone_flags)
1348 * event cannot wake it up and insert it on the runqueue either. 1559 * event cannot wake it up and insert it on the runqueue either.
1349 */ 1560 */
1350 p->state = TASK_RUNNING; 1561 p->state = TASK_RUNNING;
1562
1563 /*
1564 * Make sure we do not leak PI boosting priority to the child:
1565 */
1566 p->prio = current->normal_prio;
1567
1351 INIT_LIST_HEAD(&p->run_list); 1568 INIT_LIST_HEAD(&p->run_list);
1352 p->array = NULL; 1569 p->array = NULL;
1353#ifdef CONFIG_SCHEDSTATS 1570#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
1354 memset(&p->sched_info, 0, sizeof(p->sched_info)); 1571 if (unlikely(sched_info_on()))
1572 memset(&p->sched_info, 0, sizeof(p->sched_info));
1355#endif 1573#endif
1356#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) 1574#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
1357 p->oncpu = 0; 1575 p->oncpu = 0;
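
sched_fork() above resets the child's priority to the parent's normal_prio, so a priority temporarily boosted by rt-mutex priority inheritance does not leak across fork. A toy illustration of that rule (the two-field struct is a stand-in for the real task_struct; lower numbers mean higher priority):

#include <stdio.h>

struct task {
    int prio;        /* effective priority, possibly PI-boosted */
    int normal_prio; /* priority without any boosting */
};

static void sched_fork_prio(const struct task *parent, struct task *child)
{
    /* never inherit a PI boost: start from the un-boosted priority */
    child->prio = parent->normal_prio;
    child->normal_prio = parent->normal_prio;
}

int main(void)
{
    struct task parent = { .prio = 90, .normal_prio = 120 }; /* boosted */
    struct task child;

    sched_fork_prio(&parent, &child);
    printf("child prio=%d\n", child.prio);   /* 120, not 90 */
    return 0;
}
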
@@ -1394,11 +1612,11 @@ void fastcall sched_fork(task_t *p, int clone_flags)
1394 * that must be done for every newly created context, then puts the task 1612 * that must be done for every newly created context, then puts the task
1395 * on the runqueue and wakes it. 1613 * on the runqueue and wakes it.
1396 */ 1614 */
1397void fastcall wake_up_new_task(task_t *p, unsigned long clone_flags) 1615void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
1398{ 1616{
1617 struct rq *rq, *this_rq;
1399 unsigned long flags; 1618 unsigned long flags;
1400 int this_cpu, cpu; 1619 int this_cpu, cpu;
1401 runqueue_t *rq, *this_rq;
1402 1620
1403 rq = task_rq_lock(p, &flags); 1621 rq = task_rq_lock(p, &flags);
1404 BUG_ON(p->state != TASK_RUNNING); 1622 BUG_ON(p->state != TASK_RUNNING);
@@ -1427,10 +1645,11 @@ void fastcall wake_up_new_task(task_t *p, unsigned long clone_flags)
1427 __activate_task(p, rq); 1645 __activate_task(p, rq);
1428 else { 1646 else {
1429 p->prio = current->prio; 1647 p->prio = current->prio;
1648 p->normal_prio = current->normal_prio;
1430 list_add_tail(&p->run_list, &current->run_list); 1649 list_add_tail(&p->run_list, &current->run_list);
1431 p->array = current->array; 1650 p->array = current->array;
1432 p->array->nr_active++; 1651 p->array->nr_active++;
1433 rq->nr_running++; 1652 inc_nr_running(p, rq);
1434 } 1653 }
1435 set_need_resched(); 1654 set_need_resched();
1436 } else 1655 } else
@@ -1477,10 +1696,10 @@ void fastcall wake_up_new_task(task_t *p, unsigned long clone_flags)
1477 * artificially, because any timeslice recovered here 1696 * artificially, because any timeslice recovered here
1478 * was given away by the parent in the first place.) 1697 * was given away by the parent in the first place.)
1479 */ 1698 */
1480void fastcall sched_exit(task_t *p) 1699void fastcall sched_exit(struct task_struct *p)
1481{ 1700{
1482 unsigned long flags; 1701 unsigned long flags;
1483 runqueue_t *rq; 1702 struct rq *rq;
1484 1703
1485 /* 1704 /*
1486 * If the child was a (relative-) CPU hog then decrease 1705 * If the child was a (relative-) CPU hog then decrease
@@ -1511,7 +1730,7 @@ void fastcall sched_exit(task_t *p)
1511 * prepare_task_switch sets up locking and calls architecture specific 1730 * prepare_task_switch sets up locking and calls architecture specific
1512 * hooks. 1731 * hooks.
1513 */ 1732 */
1514static inline void prepare_task_switch(runqueue_t *rq, task_t *next) 1733static inline void prepare_task_switch(struct rq *rq, struct task_struct *next)
1515{ 1734{
1516 prepare_lock_switch(rq, next); 1735 prepare_lock_switch(rq, next);
1517 prepare_arch_switch(next); 1736 prepare_arch_switch(next);
@@ -1532,31 +1751,31 @@ static inline void prepare_task_switch(runqueue_t *rq, task_t *next)
1532 * with the lock held can cause deadlocks; see schedule() for 1751 * with the lock held can cause deadlocks; see schedule() for
1533 * details.) 1752 * details.)
1534 */ 1753 */
1535static inline void finish_task_switch(runqueue_t *rq, task_t *prev) 1754static inline void finish_task_switch(struct rq *rq, struct task_struct *prev)
1536 __releases(rq->lock) 1755 __releases(rq->lock)
1537{ 1756{
1538 struct mm_struct *mm = rq->prev_mm; 1757 struct mm_struct *mm = rq->prev_mm;
1539 unsigned long prev_task_flags; 1758 long prev_state;
1540 1759
1541 rq->prev_mm = NULL; 1760 rq->prev_mm = NULL;
1542 1761
1543 /* 1762 /*
1544 * A task struct has one reference for the use as "current". 1763 * A task struct has one reference for the use as "current".
1545 * If a task dies, then it sets EXIT_ZOMBIE in tsk->exit_state and 1764 * If a task dies, then it sets TASK_DEAD in tsk->state and calls
1546 * calls schedule one last time. The schedule call will never return, 1765 * schedule one last time. The schedule call will never return, and
1547 * and the scheduled task must drop that reference. 1766 * the scheduled task must drop that reference.
1548 * The test for EXIT_ZOMBIE must occur while the runqueue locks are 1767 * The test for TASK_DEAD must occur while the runqueue locks are
1549 * still held, otherwise prev could be scheduled on another cpu, die 1768 * still held, otherwise prev could be scheduled on another cpu, die
1550 * there before we look at prev->state, and then the reference would 1769 * there before we look at prev->state, and then the reference would
1551 * be dropped twice. 1770 * be dropped twice.
1552 * Manfred Spraul <manfred@colorfullife.com> 1771 * Manfred Spraul <manfred@colorfullife.com>
1553 */ 1772 */
1554 prev_task_flags = prev->flags; 1773 prev_state = prev->state;
1555 finish_arch_switch(prev); 1774 finish_arch_switch(prev);
1556 finish_lock_switch(rq, prev); 1775 finish_lock_switch(rq, prev);
1557 if (mm) 1776 if (mm)
1558 mmdrop(mm); 1777 mmdrop(mm);
1559 if (unlikely(prev_task_flags & PF_DEAD)) { 1778 if (unlikely(prev_state == TASK_DEAD)) {
1560 /* 1779 /*
1561 * Remove function-return probe instances associated with this 1780 * Remove function-return probe instances associated with this
1562 * task and put them back on the free list. 1781 * task and put them back on the free list.
@@ -1570,10 +1789,11 @@ static inline void finish_task_switch(runqueue_t *rq, task_t *prev)
1570 * schedule_tail - first thing a freshly forked thread must call. 1789 * schedule_tail - first thing a freshly forked thread must call.
1571 * @prev: the thread we just switched away from. 1790 * @prev: the thread we just switched away from.
1572 */ 1791 */
1573asmlinkage void schedule_tail(task_t *prev) 1792asmlinkage void schedule_tail(struct task_struct *prev)
1574 __releases(rq->lock) 1793 __releases(rq->lock)
1575{ 1794{
1576 runqueue_t *rq = this_rq(); 1795 struct rq *rq = this_rq();
1796
1577 finish_task_switch(rq, prev); 1797 finish_task_switch(rq, prev);
1578#ifdef __ARCH_WANT_UNLOCKED_CTXSW 1798#ifdef __ARCH_WANT_UNLOCKED_CTXSW
1579 /* In this case, finish_task_switch does not reenable preemption */ 1799 /* In this case, finish_task_switch does not reenable preemption */
@@ -1587,8 +1807,9 @@ asmlinkage void schedule_tail(task_t *prev)
1587 * context_switch - switch to the new MM and the new 1807 * context_switch - switch to the new MM and the new
1588 * thread's register state. 1808 * thread's register state.
1589 */ 1809 */
1590static inline 1810static inline struct task_struct *
1591task_t * context_switch(runqueue_t *rq, task_t *prev, task_t *next) 1811context_switch(struct rq *rq, struct task_struct *prev,
1812 struct task_struct *next)
1592{ 1813{
1593 struct mm_struct *mm = next->mm; 1814 struct mm_struct *mm = next->mm;
1594 struct mm_struct *oldmm = prev->active_mm; 1815 struct mm_struct *oldmm = prev->active_mm;
@@ -1605,6 +1826,15 @@ task_t * context_switch(runqueue_t *rq, task_t *prev, task_t *next)
1605 WARN_ON(rq->prev_mm); 1826 WARN_ON(rq->prev_mm);
1606 rq->prev_mm = oldmm; 1827 rq->prev_mm = oldmm;
1607 } 1828 }
1829 /*

 1830 * The runqueue lock will be released by the next
1831 * task (which is an invalid locking op but in the case
1832 * of the scheduler it's an obvious special-case), so we
1833 * do an early lockdep release here:
1834 */
1835#ifndef __ARCH_WANT_UNLOCKED_CTXSW
1836 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
1837#endif
1608 1838
1609 /* Here we just switch the register state and the stack. */ 1839 /* Here we just switch the register state and the stack. */
1610 switch_to(prev, next, prev); 1840 switch_to(prev, next, prev);
@@ -1648,7 +1878,8 @@ unsigned long nr_uninterruptible(void)
1648 1878
1649unsigned long long nr_context_switches(void) 1879unsigned long long nr_context_switches(void)
1650{ 1880{
1651 unsigned long long i, sum = 0; 1881 int i;
1882 unsigned long long sum = 0;
1652 1883
1653 for_each_possible_cpu(i) 1884 for_each_possible_cpu(i)
1654 sum += cpu_rq(i)->nr_switches; 1885 sum += cpu_rq(i)->nr_switches;
@@ -1684,15 +1915,21 @@ unsigned long nr_active(void)
1684#ifdef CONFIG_SMP 1915#ifdef CONFIG_SMP
1685 1916
1686/* 1917/*
1918 * Is this task likely cache-hot:
1919 */
1920static inline int
1921task_hot(struct task_struct *p, unsigned long long now, struct sched_domain *sd)
1922{
1923 return (long long)(now - p->last_ran) < (long long)sd->cache_hot_time;
1924}
1925
1926/*
1687 * double_rq_lock - safely lock two runqueues 1927 * double_rq_lock - safely lock two runqueues
1688 * 1928 *
1689 * We must take them in cpu order to match code in
1690 * dependent_sleeper and wake_dependent_sleeper.
1691 *
1692 * Note this does not disable interrupts like task_rq_lock, 1929 * Note this does not disable interrupts like task_rq_lock,
1693 * you need to do so manually before calling. 1930 * you need to do so manually before calling.
1694 */ 1931 */
1695static void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2) 1932static void double_rq_lock(struct rq *rq1, struct rq *rq2)
1696 __acquires(rq1->lock) 1933 __acquires(rq1->lock)
1697 __acquires(rq2->lock) 1934 __acquires(rq2->lock)
1698{ 1935{
@@ -1700,7 +1937,7 @@ static void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2)
1700 spin_lock(&rq1->lock); 1937 spin_lock(&rq1->lock);
1701 __acquire(rq2->lock); /* Fake it out ;) */ 1938 __acquire(rq2->lock); /* Fake it out ;) */
1702 } else { 1939 } else {
1703 if (rq1->cpu < rq2->cpu) { 1940 if (rq1 < rq2) {
1704 spin_lock(&rq1->lock); 1941 spin_lock(&rq1->lock);
1705 spin_lock(&rq2->lock); 1942 spin_lock(&rq2->lock);
1706 } else { 1943 } else {
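
double_rq_lock() now orders the two runqueue locks by the runqueues' addresses instead of by CPU number; any globally consistent order is enough to rule out an AB/BA deadlock. The same discipline in user space, sketched with pthread mutexes:

#include <pthread.h>
#include <stdio.h>

/*
 * When two locks must be taken together, take them in a globally
 * consistent order -- here, by ascending address -- so two threads
 * locking the same pair can never deadlock.
 */
static void lock_pair(pthread_mutex_t *a, pthread_mutex_t *b)
{
    if (a == b) {
        pthread_mutex_lock(a);
        return;
    }
    if (a < b) {
        pthread_mutex_lock(a);
        pthread_mutex_lock(b);
    } else {
        pthread_mutex_lock(b);
        pthread_mutex_lock(a);
    }
}

static void unlock_pair(pthread_mutex_t *a, pthread_mutex_t *b)
{
    pthread_mutex_unlock(a);
    if (a != b)
        pthread_mutex_unlock(b);
}

int main(void)
{
    pthread_mutex_t m1 = PTHREAD_MUTEX_INITIALIZER;
    pthread_mutex_t m2 = PTHREAD_MUTEX_INITIALIZER;

    lock_pair(&m2, &m1);    /* acquired in address order regardless */
    unlock_pair(&m2, &m1);
    puts("locked and unlocked in a consistent order");
    return 0;
}
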
@@ -1716,7 +1953,7 @@ static void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2)
1716 * Note this does not restore interrupts like task_rq_unlock, 1953 * Note this does not restore interrupts like task_rq_unlock,
1717 * you need to do so manually after calling. 1954 * you need to do so manually after calling.
1718 */ 1955 */
1719static void double_rq_unlock(runqueue_t *rq1, runqueue_t *rq2) 1956static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1720 __releases(rq1->lock) 1957 __releases(rq1->lock)
1721 __releases(rq2->lock) 1958 __releases(rq2->lock)
1722{ 1959{
@@ -1730,13 +1967,13 @@ static void double_rq_unlock(runqueue_t *rq1, runqueue_t *rq2)
1730/* 1967/*
1731 * double_lock_balance - lock the busiest runqueue, this_rq is locked already. 1968 * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
1732 */ 1969 */
1733static void double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest) 1970static void double_lock_balance(struct rq *this_rq, struct rq *busiest)
1734 __releases(this_rq->lock) 1971 __releases(this_rq->lock)
1735 __acquires(busiest->lock) 1972 __acquires(busiest->lock)
1736 __acquires(this_rq->lock) 1973 __acquires(this_rq->lock)
1737{ 1974{
1738 if (unlikely(!spin_trylock(&busiest->lock))) { 1975 if (unlikely(!spin_trylock(&busiest->lock))) {
1739 if (busiest->cpu < this_rq->cpu) { 1976 if (busiest < this_rq) {
1740 spin_unlock(&this_rq->lock); 1977 spin_unlock(&this_rq->lock);
1741 spin_lock(&busiest->lock); 1978 spin_lock(&busiest->lock);
1742 spin_lock(&this_rq->lock); 1979 spin_lock(&this_rq->lock);
@@ -1751,11 +1988,11 @@ static void double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest)
1751 * allow dest_cpu, which will force the cpu onto dest_cpu. Then 1988 * allow dest_cpu, which will force the cpu onto dest_cpu. Then
1752 * the cpu_allowed mask is restored. 1989 * the cpu_allowed mask is restored.
1753 */ 1990 */
1754static void sched_migrate_task(task_t *p, int dest_cpu) 1991static void sched_migrate_task(struct task_struct *p, int dest_cpu)
1755{ 1992{
1756 migration_req_t req; 1993 struct migration_req req;
1757 runqueue_t *rq;
1758 unsigned long flags; 1994 unsigned long flags;
1995 struct rq *rq;
1759 1996
1760 rq = task_rq_lock(p, &flags); 1997 rq = task_rq_lock(p, &flags);
1761 if (!cpu_isset(dest_cpu, p->cpus_allowed) 1998 if (!cpu_isset(dest_cpu, p->cpus_allowed)
@@ -1766,11 +2003,13 @@ static void sched_migrate_task(task_t *p, int dest_cpu)
1766 if (migrate_task(p, dest_cpu, &req)) { 2003 if (migrate_task(p, dest_cpu, &req)) {
1767 /* Need to wait for migration thread (might exit: take ref). */ 2004 /* Need to wait for migration thread (might exit: take ref). */
1768 struct task_struct *mt = rq->migration_thread; 2005 struct task_struct *mt = rq->migration_thread;
2006
1769 get_task_struct(mt); 2007 get_task_struct(mt);
1770 task_rq_unlock(rq, &flags); 2008 task_rq_unlock(rq, &flags);
1771 wake_up_process(mt); 2009 wake_up_process(mt);
1772 put_task_struct(mt); 2010 put_task_struct(mt);
1773 wait_for_completion(&req.done); 2011 wait_for_completion(&req.done);
2012
1774 return; 2013 return;
1775 } 2014 }
1776out: 2015out:
@@ -1794,14 +2033,14 @@ void sched_exec(void)
1794 * pull_task - move a task from a remote runqueue to the local runqueue. 2033 * pull_task - move a task from a remote runqueue to the local runqueue.
1795 * Both runqueues must be locked. 2034 * Both runqueues must be locked.
1796 */ 2035 */
1797static 2036static void pull_task(struct rq *src_rq, struct prio_array *src_array,
1798void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p, 2037 struct task_struct *p, struct rq *this_rq,
1799 runqueue_t *this_rq, prio_array_t *this_array, int this_cpu) 2038 struct prio_array *this_array, int this_cpu)
1800{ 2039{
1801 dequeue_task(p, src_array); 2040 dequeue_task(p, src_array);
1802 src_rq->nr_running--; 2041 dec_nr_running(p, src_rq);
1803 set_task_cpu(p, this_cpu); 2042 set_task_cpu(p, this_cpu);
1804 this_rq->nr_running++; 2043 inc_nr_running(p, this_rq);
1805 enqueue_task(p, this_array); 2044 enqueue_task(p, this_array);
1806 p->timestamp = (p->timestamp - src_rq->timestamp_last_tick) 2045 p->timestamp = (p->timestamp - src_rq->timestamp_last_tick)
1807 + this_rq->timestamp_last_tick; 2046 + this_rq->timestamp_last_tick;
@@ -1817,7 +2056,7 @@ void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p,
1817 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? 2056 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
1818 */ 2057 */
1819static 2058static
1820int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu, 2059int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
1821 struct sched_domain *sd, enum idle_type idle, 2060 struct sched_domain *sd, enum idle_type idle,
1822 int *all_pinned) 2061 int *all_pinned)
1823{ 2062{
@@ -1848,26 +2087,42 @@ int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu,
1848 return 1; 2087 return 1;
1849} 2088}
1850 2089
2090#define rq_best_prio(rq) min((rq)->curr->prio, (rq)->best_expired_prio)
2091
1851/* 2092/*
1852 * move_tasks tries to move up to max_nr_move tasks from busiest to this_rq, 2093 * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted
1853 * as part of a balancing operation within "domain". Returns the number of 2094 * load from busiest to this_rq, as part of a balancing operation within
1854 * tasks moved. 2095 * "domain". Returns the number of tasks moved.
1855 * 2096 *
1856 * Called with both runqueues locked. 2097 * Called with both runqueues locked.
1857 */ 2098 */
1858static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest, 2099static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
1859 unsigned long max_nr_move, struct sched_domain *sd, 2100 unsigned long max_nr_move, unsigned long max_load_move,
1860 enum idle_type idle, int *all_pinned) 2101 struct sched_domain *sd, enum idle_type idle,
2102 int *all_pinned)
1861{ 2103{
1862 prio_array_t *array, *dst_array; 2104 int idx, pulled = 0, pinned = 0, this_best_prio, best_prio,
2105 best_prio_seen, skip_for_load;
2106 struct prio_array *array, *dst_array;
1863 struct list_head *head, *curr; 2107 struct list_head *head, *curr;
1864 int idx, pulled = 0, pinned = 0; 2108 struct task_struct *tmp;
1865 task_t *tmp; 2109 long rem_load_move;
1866 2110
1867 if (max_nr_move == 0) 2111 if (max_nr_move == 0 || max_load_move == 0)
1868 goto out; 2112 goto out;
1869 2113
2114 rem_load_move = max_load_move;
1870 pinned = 1; 2115 pinned = 1;
2116 this_best_prio = rq_best_prio(this_rq);
2117 best_prio = rq_best_prio(busiest);
2118 /*
2119 * Enable handling of the case where there is more than one task
2120 * with the best priority. If the current running task is one
2121 * of those with prio==best_prio we know it won't be moved
2122 * and therefore it's safe to override the skip (based on load) of
2123 * any task we find with that prio.
2124 */
2125 best_prio_seen = best_prio == busiest->curr->prio;
1871 2126
1872 /* 2127 /*
1873 * We first consider expired tasks. Those will likely not be 2128 * We first consider expired tasks. Those will likely not be
@@ -1903,11 +2158,22 @@ skip_bitmap:
1903 head = array->queue + idx; 2158 head = array->queue + idx;
1904 curr = head->prev; 2159 curr = head->prev;
1905skip_queue: 2160skip_queue:
1906 tmp = list_entry(curr, task_t, run_list); 2161 tmp = list_entry(curr, struct task_struct, run_list);
1907 2162
1908 curr = curr->prev; 2163 curr = curr->prev;
1909 2164
1910 if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle, &pinned)) { 2165 /*
 2166 * To help distribute high priority tasks across CPUs we don't
2167 * skip a task if it will be the highest priority task (i.e. smallest
2168 * prio value) on its new queue regardless of its load weight
2169 */
2170 skip_for_load = tmp->load_weight > rem_load_move;
2171 if (skip_for_load && idx < this_best_prio)
2172 skip_for_load = !best_prio_seen && idx == best_prio;
2173 if (skip_for_load ||
2174 !can_migrate_task(tmp, busiest, this_cpu, sd, idle, &pinned)) {
2175
2176 best_prio_seen |= idx == best_prio;
1911 if (curr != head) 2177 if (curr != head)
1912 goto skip_queue; 2178 goto skip_queue;
1913 idx++; 2179 idx++;
@@ -1921,9 +2187,15 @@ skip_queue:
1921 2187
1922 pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu); 2188 pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu);
1923 pulled++; 2189 pulled++;
2190 rem_load_move -= tmp->load_weight;
1924 2191
1925 /* We only want to steal up to the prescribed number of tasks. */ 2192 /*
1926 if (pulled < max_nr_move) { 2193 * We only want to steal up to the prescribed number of tasks
2194 * and the prescribed amount of weighted load.
2195 */
2196 if (pulled < max_nr_move && rem_load_move > 0) {
2197 if (idx < this_best_prio)
2198 this_best_prio = idx;
1927 if (curr != head) 2199 if (curr != head)
1928 goto skip_queue; 2200 goto skip_queue;
1929 idx++; 2201 idx++;
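
move_tasks() now tracks two budgets, a task count and a remaining amount of weighted load, and skips candidates heavier than what is still wanted. A condensed sketch of that loop over an array of task weights (the best_prio_seen override from the hunk above is omitted for brevity):

#include <stdio.h>

/*
 * "Pull" candidate tasks until either the task-count cap or the
 * weighted-load cap is exhausted, skipping tasks that are heavier than
 * the load we still want to move.
 */
static int move_tasks_sketch(const unsigned long *weight, int ntasks,
                             int max_nr_move, long max_load_move)
{
    long rem_load_move = max_load_move;
    int i, pulled = 0;

    for (i = 0; i < ntasks && pulled < max_nr_move && rem_load_move > 0; i++) {
        if ((long)weight[i] > rem_load_move)
            continue;                /* too heavy for what is left to move */
        rem_load_move -= weight[i];
        pulled++;
        printf("pulled task %d (weight %lu), load still wanted: %ld\n",
               i, weight[i], rem_load_move);
    }
    return pulled;
}

int main(void)
{
    unsigned long w[] = { 128, 512, 128, 256 };

    printf("moved %d tasks\n", move_tasks_sketch(w, 4, 3, 400));
    return 0;
}
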
@@ -1944,19 +2216,30 @@ out:
1944 2216
1945/* 2217/*
1946 * find_busiest_group finds and returns the busiest CPU group within the 2218 * find_busiest_group finds and returns the busiest CPU group within the
1947 * domain. It calculates and returns the number of tasks which should be 2219 * domain. It calculates and returns the amount of weighted load which
1948 * moved to restore balance via the imbalance parameter. 2220 * should be moved to restore balance via the imbalance parameter.
1949 */ 2221 */
1950static struct sched_group * 2222static struct sched_group *
1951find_busiest_group(struct sched_domain *sd, int this_cpu, 2223find_busiest_group(struct sched_domain *sd, int this_cpu,
1952 unsigned long *imbalance, enum idle_type idle, int *sd_idle) 2224 unsigned long *imbalance, enum idle_type idle, int *sd_idle,
2225 cpumask_t *cpus)
1953{ 2226{
1954 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; 2227 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
1955 unsigned long max_load, avg_load, total_load, this_load, total_pwr; 2228 unsigned long max_load, avg_load, total_load, this_load, total_pwr;
1956 unsigned long max_pull; 2229 unsigned long max_pull;
2230 unsigned long busiest_load_per_task, busiest_nr_running;
2231 unsigned long this_load_per_task, this_nr_running;
1957 int load_idx; 2232 int load_idx;
2233#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
2234 int power_savings_balance = 1;
2235 unsigned long leader_nr_running = 0, min_load_per_task = 0;
2236 unsigned long min_nr_running = ULONG_MAX;
2237 struct sched_group *group_min = NULL, *group_leader = NULL;
2238#endif
1958 2239
1959 max_load = this_load = total_load = total_pwr = 0; 2240 max_load = this_load = total_load = total_pwr = 0;
2241 busiest_load_per_task = busiest_nr_running = 0;
2242 this_load_per_task = this_nr_running = 0;
1960 if (idle == NOT_IDLE) 2243 if (idle == NOT_IDLE)
1961 load_idx = sd->busy_idx; 2244 load_idx = sd->busy_idx;
1962 else if (idle == NEWLY_IDLE) 2245 else if (idle == NEWLY_IDLE)
@@ -1965,16 +2248,24 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
1965 load_idx = sd->idle_idx; 2248 load_idx = sd->idle_idx;
1966 2249
1967 do { 2250 do {
1968 unsigned long load; 2251 unsigned long load, group_capacity;
1969 int local_group; 2252 int local_group;
1970 int i; 2253 int i;
2254 unsigned long sum_nr_running, sum_weighted_load;
1971 2255
1972 local_group = cpu_isset(this_cpu, group->cpumask); 2256 local_group = cpu_isset(this_cpu, group->cpumask);
1973 2257
1974 /* Tally up the load of all CPUs in the group */ 2258 /* Tally up the load of all CPUs in the group */
1975 avg_load = 0; 2259 sum_weighted_load = sum_nr_running = avg_load = 0;
1976 2260
1977 for_each_cpu_mask(i, group->cpumask) { 2261 for_each_cpu_mask(i, group->cpumask) {
2262 struct rq *rq;
2263
2264 if (!cpu_isset(i, *cpus))
2265 continue;
2266
2267 rq = cpu_rq(i);
2268
1978 if (*sd_idle && !idle_cpu(i)) 2269 if (*sd_idle && !idle_cpu(i))
1979 *sd_idle = 0; 2270 *sd_idle = 0;
1980 2271
@@ -1985,6 +2276,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
1985 load = source_load(i, load_idx); 2276 load = source_load(i, load_idx);
1986 2277
1987 avg_load += load; 2278 avg_load += load;
2279 sum_nr_running += rq->nr_running;
2280 sum_weighted_load += rq->raw_weighted_load;
1988 } 2281 }
1989 2282
1990 total_load += avg_load; 2283 total_load += avg_load;
@@ -1993,17 +2286,80 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
1993 /* Adjust by relative CPU power of the group */ 2286 /* Adjust by relative CPU power of the group */
1994 avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power; 2287 avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
1995 2288
2289 group_capacity = group->cpu_power / SCHED_LOAD_SCALE;
2290
1996 if (local_group) { 2291 if (local_group) {
1997 this_load = avg_load; 2292 this_load = avg_load;
1998 this = group; 2293 this = group;
1999 } else if (avg_load > max_load) { 2294 this_nr_running = sum_nr_running;
2295 this_load_per_task = sum_weighted_load;
2296 } else if (avg_load > max_load &&
2297 sum_nr_running > group_capacity) {
2000 max_load = avg_load; 2298 max_load = avg_load;
2001 busiest = group; 2299 busiest = group;
2300 busiest_nr_running = sum_nr_running;
2301 busiest_load_per_task = sum_weighted_load;
2302 }
2303
2304#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
2305 /*
2306 * Busy processors will not participate in power savings
2307 * balance.
2308 */
2309 if (idle == NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
2310 goto group_next;
2311
2312 /*
2313 * If the local group is idle or completely loaded
2314 * no need to do power savings balance at this domain
2315 */
2316 if (local_group && (this_nr_running >= group_capacity ||
2317 !this_nr_running))
2318 power_savings_balance = 0;
2319
2320 /*
2321 * If a group is already running at full capacity or idle,
2322 * don't include that group in power savings calculations
2323 */
2324 if (!power_savings_balance || sum_nr_running >= group_capacity
2325 || !sum_nr_running)
2326 goto group_next;
2327
2328 /*
2329 * Calculate the group which has the least non-idle load.
2330 * This is the group from where we need to pick up the load
2331 * for saving power
2332 */
2333 if ((sum_nr_running < min_nr_running) ||
2334 (sum_nr_running == min_nr_running &&
2335 first_cpu(group->cpumask) <
2336 first_cpu(group_min->cpumask))) {
2337 group_min = group;
2338 min_nr_running = sum_nr_running;
2339 min_load_per_task = sum_weighted_load /
2340 sum_nr_running;
2341 }
2342
2343 /*
 2344 * Calculate the group which is nearly at its
 2345 * capacity but still has some space to pick up some load
 2346 * from other groups and save more power
2347 */
2348 if (sum_nr_running <= group_capacity - 1) {
2349 if (sum_nr_running > leader_nr_running ||
2350 (sum_nr_running == leader_nr_running &&
2351 first_cpu(group->cpumask) >
2352 first_cpu(group_leader->cpumask))) {
2353 group_leader = group;
2354 leader_nr_running = sum_nr_running;
2355 }
2002 } 2356 }
2357group_next:
2358#endif
2003 group = group->next; 2359 group = group->next;
2004 } while (group != sd->groups); 2360 } while (group != sd->groups);
2005 2361
2006 if (!busiest || this_load >= max_load || max_load <= SCHED_LOAD_SCALE) 2362 if (!busiest || this_load >= max_load || busiest_nr_running == 0)
2007 goto out_balanced; 2363 goto out_balanced;
2008 2364
2009 avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr; 2365 avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
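
With CONFIG_SCHED_MC/SMT power-savings balancing, find_busiest_group() additionally remembers the group with the fewest runnable tasks (to be emptied) and the most loaded group that still has a free slot (to absorb them). A rough sketch of that selection over plain arrays; the first_cpu() tie-breaking and the power_savings_balance gating from the hunk above are left out:

#include <stdio.h>

/*
 * Among groups that are neither idle nor full, pick the one with the
 * fewest runnable tasks (group_min) and the most loaded one that still
 * has a free slot (group_leader).
 */
static void pick_power_groups(const unsigned long *nr_running,
                              const unsigned long *capacity,
                              int ngroups, int *group_min, int *group_leader)
{
    unsigned long min_nr = ~0UL, leader_nr = 0;
    int g;

    *group_min = *group_leader = -1;
    for (g = 0; g < ngroups; g++) {
        if (!nr_running[g] || nr_running[g] >= capacity[g])
            continue;                        /* idle or already full */
        if (nr_running[g] < min_nr) {
            min_nr = nr_running[g];
            *group_min = g;
        }
        if (nr_running[g] <= capacity[g] - 1 && nr_running[g] > leader_nr) {
            leader_nr = nr_running[g];
            *group_leader = g;
        }
    }
}

int main(void)
{
    unsigned long nr[] = { 1, 3, 0, 2 }, cap[] = { 4, 4, 4, 4 };
    int gmin, glead;

    pick_power_groups(nr, cap, 4, &gmin, &glead);
    printf("empty group %d into group %d\n", gmin, glead);
    return 0;
}
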
@@ -2012,6 +2368,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2012 100*max_load <= sd->imbalance_pct*this_load) 2368 100*max_load <= sd->imbalance_pct*this_load)
2013 goto out_balanced; 2369 goto out_balanced;
2014 2370
2371 busiest_load_per_task /= busiest_nr_running;
2015 /* 2372 /*
2016 * We're trying to get all the cpus to the average_load, so we don't 2373 * We're trying to get all the cpus to the average_load, so we don't
2017 * want to push ourselves above the average load, nor do we wish to 2374 * want to push ourselves above the average load, nor do we wish to
@@ -2023,21 +2380,49 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2023 * by pulling tasks to us. Be careful of negative numbers as they'll 2380 * by pulling tasks to us. Be careful of negative numbers as they'll
2024 * appear as very large values with unsigned longs. 2381 * appear as very large values with unsigned longs.
2025 */ 2382 */
2383 if (max_load <= busiest_load_per_task)
2384 goto out_balanced;
2385
2386 /*
2387 * In the presence of smp nice balancing, certain scenarios can have
 2388 * max load less than avg load (as we skip the groups at or below
 2389 * its cpu_power, while calculating max_load)
2390 */
2391 if (max_load < avg_load) {
2392 *imbalance = 0;
2393 goto small_imbalance;
2394 }
2026 2395
2027 /* Don't want to pull so many tasks that a group would go idle */ 2396 /* Don't want to pull so many tasks that a group would go idle */
2028 max_pull = min(max_load - avg_load, max_load - SCHED_LOAD_SCALE); 2397 max_pull = min(max_load - avg_load, max_load - busiest_load_per_task);
2029 2398
2030 /* How much load to actually move to equalise the imbalance */ 2399 /* How much load to actually move to equalise the imbalance */
2031 *imbalance = min(max_pull * busiest->cpu_power, 2400 *imbalance = min(max_pull * busiest->cpu_power,
2032 (avg_load - this_load) * this->cpu_power) 2401 (avg_load - this_load) * this->cpu_power)
2033 / SCHED_LOAD_SCALE; 2402 / SCHED_LOAD_SCALE;
2034 2403
2035 if (*imbalance < SCHED_LOAD_SCALE) { 2404 /*
2036 unsigned long pwr_now = 0, pwr_move = 0; 2405 * if *imbalance is less than the average load per runnable task
 2037 unsigned long tmp; 2406 * there is no guarantee that any tasks will be moved so we'll have
2407 * a think about bumping its value to force at least one task to be
2408 * moved
2409 */
2410 if (*imbalance < busiest_load_per_task) {
2411 unsigned long tmp, pwr_now, pwr_move;
2412 unsigned int imbn;
2413
2414small_imbalance:
2415 pwr_move = pwr_now = 0;
2416 imbn = 2;
2417 if (this_nr_running) {
2418 this_load_per_task /= this_nr_running;
2419 if (busiest_load_per_task > this_load_per_task)
2420 imbn = 1;
2421 } else
2422 this_load_per_task = SCHED_LOAD_SCALE;
2038 2423
2039 if (max_load - this_load >= SCHED_LOAD_SCALE*2) { 2424 if (max_load - this_load >= busiest_load_per_task * imbn) {
2040 *imbalance = 1; 2425 *imbalance = busiest_load_per_task;
2041 return busiest; 2426 return busiest;
2042 } 2427 }
2043 2428
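
The imbalance is now computed in weighted-load units: pull at most enough to reach the average, never more than would take the busiest group below one task's worth of load, and if the result is smaller than the busiest group's average task weight, bump it so at least one task can move. A sketch of that arithmetic, assuming the guard conditions from the hunk above (max_load above both avg_load and the per-task load) already hold; the pwr_now/pwr_move throughput estimate of the real small_imbalance path is omitted:

#include <stdio.h>

#define SCHED_LOAD_SCALE 128UL

static unsigned long min_ul(unsigned long a, unsigned long b)
{
    return a < b ? a : b;
}

static unsigned long compute_imbalance(unsigned long max_load,
                                       unsigned long avg_load,
                                       unsigned long this_load,
                                       unsigned long busiest_load_per_task,
                                       unsigned long busiest_power,
                                       unsigned long this_power)
{
    unsigned long max_pull, imbalance;

    /* don't pull the busiest group below the average or below one task */
    max_pull = min_ul(max_load - avg_load, max_load - busiest_load_per_task);
    imbalance = min_ul(max_pull * busiest_power,
                       (avg_load - this_load) * this_power) / SCHED_LOAD_SCALE;

    /* too small to move anything: round up to one average task */
    if (imbalance < busiest_load_per_task)
        imbalance = busiest_load_per_task;
    return imbalance;
}

int main(void)
{
    printf("imbalance=%lu\n",
           compute_imbalance(4 * SCHED_LOAD_SCALE, 2 * SCHED_LOAD_SCALE,
                             1 * SCHED_LOAD_SCALE, SCHED_LOAD_SCALE,
                             SCHED_LOAD_SCALE, SCHED_LOAD_SCALE));
    return 0;
}
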
@@ -2047,39 +2432,47 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2047 * moving them. 2432 * moving them.
2048 */ 2433 */
2049 2434
2050 pwr_now += busiest->cpu_power*min(SCHED_LOAD_SCALE, max_load); 2435 pwr_now += busiest->cpu_power *
2051 pwr_now += this->cpu_power*min(SCHED_LOAD_SCALE, this_load); 2436 min(busiest_load_per_task, max_load);
2437 pwr_now += this->cpu_power *
2438 min(this_load_per_task, this_load);
2052 pwr_now /= SCHED_LOAD_SCALE; 2439 pwr_now /= SCHED_LOAD_SCALE;
2053 2440
2054 /* Amount of load we'd subtract */ 2441 /* Amount of load we'd subtract */
2055 tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/busiest->cpu_power; 2442 tmp = busiest_load_per_task*SCHED_LOAD_SCALE/busiest->cpu_power;
2056 if (max_load > tmp) 2443 if (max_load > tmp)
2057 pwr_move += busiest->cpu_power*min(SCHED_LOAD_SCALE, 2444 pwr_move += busiest->cpu_power *
2058 max_load - tmp); 2445 min(busiest_load_per_task, max_load - tmp);
2059 2446
2060 /* Amount of load we'd add */ 2447 /* Amount of load we'd add */
2061 if (max_load*busiest->cpu_power < 2448 if (max_load*busiest->cpu_power <
2062 SCHED_LOAD_SCALE*SCHED_LOAD_SCALE) 2449 busiest_load_per_task*SCHED_LOAD_SCALE)
2063 tmp = max_load*busiest->cpu_power/this->cpu_power; 2450 tmp = max_load*busiest->cpu_power/this->cpu_power;
2064 else 2451 else
2065 tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/this->cpu_power; 2452 tmp = busiest_load_per_task*SCHED_LOAD_SCALE/this->cpu_power;
2066 pwr_move += this->cpu_power*min(SCHED_LOAD_SCALE, this_load + tmp); 2453 pwr_move += this->cpu_power*min(this_load_per_task, this_load + tmp);
2067 pwr_move /= SCHED_LOAD_SCALE; 2454 pwr_move /= SCHED_LOAD_SCALE;
2068 2455
2069 /* Move if we gain throughput */ 2456 /* Move if we gain throughput */
2070 if (pwr_move <= pwr_now) 2457 if (pwr_move <= pwr_now)
2071 goto out_balanced; 2458 goto out_balanced;
2072 2459
2073 *imbalance = 1; 2460 *imbalance = busiest_load_per_task;
2074 return busiest;
2075 } 2461 }
2076 2462
2077 /* Get rid of the scaling factor, rounding down as we divide */
2078 *imbalance = *imbalance / SCHED_LOAD_SCALE;
2079 return busiest; 2463 return busiest;
2080 2464
2081out_balanced: 2465out_balanced:
2466#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
2467 if (idle == NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
2468 goto ret;
2082 2469
2470 if (this == group_leader && group_leader != group_min) {
2471 *imbalance = min_load_per_task;
2472 return group_min;
2473 }
2474ret:
2475#endif
2083 *imbalance = 0; 2476 *imbalance = 0;
2084 return NULL; 2477 return NULL;
2085} 2478}
@@ -2087,19 +2480,27 @@ out_balanced:
2087/* 2480/*
2088 * find_busiest_queue - find the busiest runqueue among the cpus in group. 2481 * find_busiest_queue - find the busiest runqueue among the cpus in group.
2089 */ 2482 */
2090static runqueue_t *find_busiest_queue(struct sched_group *group, 2483static struct rq *
2091 enum idle_type idle) 2484find_busiest_queue(struct sched_group *group, enum idle_type idle,
2485 unsigned long imbalance, cpumask_t *cpus)
2092{ 2486{
2093 unsigned long load, max_load = 0; 2487 struct rq *busiest = NULL, *rq;
2094 runqueue_t *busiest = NULL; 2488 unsigned long max_load = 0;
2095 int i; 2489 int i;
2096 2490
2097 for_each_cpu_mask(i, group->cpumask) { 2491 for_each_cpu_mask(i, group->cpumask) {
2098 load = source_load(i, 0);
2099 2492
2100 if (load > max_load) { 2493 if (!cpu_isset(i, *cpus))
2101 max_load = load; 2494 continue;
2102 busiest = cpu_rq(i); 2495
2496 rq = cpu_rq(i);
2497
2498 if (rq->nr_running == 1 && rq->raw_weighted_load > imbalance)
2499 continue;
2500
2501 if (rq->raw_weighted_load > max_load) {
2502 max_load = rq->raw_weighted_load;
2503 busiest = rq;
2103 } 2504 }
2104 } 2505 }
2105 2506
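
find_busiest_queue() now picks the queue with the largest raw_weighted_load and skips queues whose single runnable task already exceeds the wanted imbalance: pulling that one heavy task would overshoot, and the currently running task cannot be migrated by move_tasks() anyway. A sketch over a small array of runqueues:

#include <stdio.h>

struct rq_sketch {
    unsigned long raw_weighted_load;
    unsigned long nr_running;
};

static int find_busiest_queue_sketch(const struct rq_sketch *rq, int n,
                                     unsigned long imbalance)
{
    unsigned long max_load = 0;
    int i, busiest = -1;

    for (i = 0; i < n; i++) {
        /* a lone task heavier than the imbalance is not worth chasing */
        if (rq[i].nr_running == 1 && rq[i].raw_weighted_load > imbalance)
            continue;
        if (rq[i].raw_weighted_load > max_load) {
            max_load = rq[i].raw_weighted_load;
            busiest = i;
        }
    }
    return busiest;
}

int main(void)
{
    struct rq_sketch rq[] = {
        { .raw_weighted_load = 256, .nr_running = 2 },
        { .raw_weighted_load = 900, .nr_running = 1 },  /* one heavy task */
        { .raw_weighted_load = 384, .nr_running = 3 },
    };

    printf("busiest cpu: %d\n", find_busiest_queue_sketch(rq, 3, 400));
    return 0;
}
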
@@ -2112,34 +2513,41 @@ static runqueue_t *find_busiest_queue(struct sched_group *group,
2112 */ 2513 */
2113#define MAX_PINNED_INTERVAL 512 2514#define MAX_PINNED_INTERVAL 512
2114 2515
2516static inline unsigned long minus_1_or_zero(unsigned long n)
2517{
2518 return n > 0 ? n - 1 : 0;
2519}
2520
2115/* 2521/*
2116 * Check this_cpu to ensure it is balanced within domain. Attempt to move 2522 * Check this_cpu to ensure it is balanced within domain. Attempt to move
2117 * tasks if there is an imbalance. 2523 * tasks if there is an imbalance.
2118 * 2524 *
2119 * Called with this_rq unlocked. 2525 * Called with this_rq unlocked.
2120 */ 2526 */
2121static int load_balance(int this_cpu, runqueue_t *this_rq, 2527static int load_balance(int this_cpu, struct rq *this_rq,
2122 struct sched_domain *sd, enum idle_type idle) 2528 struct sched_domain *sd, enum idle_type idle)
2123{ 2529{
2530 int nr_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
2124 struct sched_group *group; 2531 struct sched_group *group;
2125 runqueue_t *busiest;
2126 unsigned long imbalance; 2532 unsigned long imbalance;
2127 int nr_moved, all_pinned = 0; 2533 struct rq *busiest;
2128 int active_balance = 0; 2534 cpumask_t cpus = CPU_MASK_ALL;
2129 int sd_idle = 0;
2130 2535
2131 if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER) 2536 if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
2537 !sched_smt_power_savings)
2132 sd_idle = 1; 2538 sd_idle = 1;
2133 2539
2134 schedstat_inc(sd, lb_cnt[idle]); 2540 schedstat_inc(sd, lb_cnt[idle]);
2135 2541
2136 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle); 2542redo:
2543 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
2544 &cpus);
2137 if (!group) { 2545 if (!group) {
2138 schedstat_inc(sd, lb_nobusyg[idle]); 2546 schedstat_inc(sd, lb_nobusyg[idle]);
2139 goto out_balanced; 2547 goto out_balanced;
2140 } 2548 }
2141 2549
2142 busiest = find_busiest_queue(group, idle); 2550 busiest = find_busiest_queue(group, idle, imbalance, &cpus);
2143 if (!busiest) { 2551 if (!busiest) {
2144 schedstat_inc(sd, lb_nobusyq[idle]); 2552 schedstat_inc(sd, lb_nobusyq[idle]);
2145 goto out_balanced; 2553 goto out_balanced;
@@ -2159,12 +2567,17 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
2159 */ 2567 */
2160 double_rq_lock(this_rq, busiest); 2568 double_rq_lock(this_rq, busiest);
2161 nr_moved = move_tasks(this_rq, this_cpu, busiest, 2569 nr_moved = move_tasks(this_rq, this_cpu, busiest,
2162 imbalance, sd, idle, &all_pinned); 2570 minus_1_or_zero(busiest->nr_running),
2571 imbalance, sd, idle, &all_pinned);
2163 double_rq_unlock(this_rq, busiest); 2572 double_rq_unlock(this_rq, busiest);
2164 2573
2165 /* All tasks on this runqueue were pinned by CPU affinity */ 2574 /* All tasks on this runqueue were pinned by CPU affinity */
2166 if (unlikely(all_pinned)) 2575 if (unlikely(all_pinned)) {
2576 cpu_clear(cpu_of(busiest), cpus);
2577 if (!cpus_empty(cpus))
2578 goto redo;
2167 goto out_balanced; 2579 goto out_balanced;
2580 }
2168 } 2581 }
2169 2582
2170 if (!nr_moved) { 2583 if (!nr_moved) {
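
When every task on the chosen busiest queue turns out to be pinned by CPU affinity, load_balance() now clears that CPU from a candidate mask and redoes group selection instead of giving up. The retry shape, reduced to a plain bitmask (try_pull() below is a hypothetical stand-in for the find_busiest_group()/move_tasks() sequence):

#include <stdio.h>

static int try_pull(int cpu)
{
    return cpu == 2;              /* pretend only CPU 2 has movable tasks */
}

static int balance_with_retry(unsigned int candidate_mask)
{
    while (candidate_mask) {
        int cpu, busiest = -1;

        /* pick the lowest-numbered remaining candidate as "busiest" */
        for (cpu = 0; cpu < 32; cpu++)
            if (candidate_mask & (1u << cpu)) {
                busiest = cpu;
                break;
            }
        if (try_pull(busiest))
            return busiest;
        candidate_mask &= ~(1u << busiest);  /* all pinned: exclude, redo */
    }
    return -1;
}

int main(void)
{
    printf("pulled from cpu %d\n", balance_with_retry(0x7));  /* CPUs 0-2 */
    return 0;
}
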
@@ -2216,7 +2629,8 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
2216 sd->balance_interval *= 2; 2629 sd->balance_interval *= 2;
2217 } 2630 }
2218 2631
2219 if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER) 2632 if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
2633 !sched_smt_power_savings)
2220 return -1; 2634 return -1;
2221 return nr_moved; 2635 return nr_moved;
2222 2636
@@ -2231,7 +2645,8 @@ out_one_pinned:
2231 (sd->balance_interval < sd->max_interval)) 2645 (sd->balance_interval < sd->max_interval))
2232 sd->balance_interval *= 2; 2646 sd->balance_interval *= 2;
2233 2647
2234 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER) 2648 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
2649 !sched_smt_power_savings)
2235 return -1; 2650 return -1;
2236 return 0; 2651 return 0;
2237} 2652}
@@ -2243,26 +2658,30 @@ out_one_pinned:
2243 * Called from schedule when this_rq is about to become idle (NEWLY_IDLE). 2658 * Called from schedule when this_rq is about to become idle (NEWLY_IDLE).
2244 * this_rq is locked. 2659 * this_rq is locked.
2245 */ 2660 */
2246static int load_balance_newidle(int this_cpu, runqueue_t *this_rq, 2661static int
2247 struct sched_domain *sd) 2662load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
2248{ 2663{
2249 struct sched_group *group; 2664 struct sched_group *group;
2250 runqueue_t *busiest = NULL; 2665 struct rq *busiest = NULL;
2251 unsigned long imbalance; 2666 unsigned long imbalance;
2252 int nr_moved = 0; 2667 int nr_moved = 0;
2253 int sd_idle = 0; 2668 int sd_idle = 0;
2669 cpumask_t cpus = CPU_MASK_ALL;
2254 2670
2255 if (sd->flags & SD_SHARE_CPUPOWER) 2671 if (sd->flags & SD_SHARE_CPUPOWER && !sched_smt_power_savings)
2256 sd_idle = 1; 2672 sd_idle = 1;
2257 2673
2258 schedstat_inc(sd, lb_cnt[NEWLY_IDLE]); 2674 schedstat_inc(sd, lb_cnt[NEWLY_IDLE]);
2259 group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE, &sd_idle); 2675redo:
2676 group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE,
2677 &sd_idle, &cpus);
2260 if (!group) { 2678 if (!group) {
2261 schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]); 2679 schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]);
2262 goto out_balanced; 2680 goto out_balanced;
2263 } 2681 }
2264 2682
2265 busiest = find_busiest_queue(group, NEWLY_IDLE); 2683 busiest = find_busiest_queue(group, NEWLY_IDLE, imbalance,
2684 &cpus);
2266 if (!busiest) { 2685 if (!busiest) {
2267 schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]); 2686 schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]);
2268 goto out_balanced; 2687 goto out_balanced;
@@ -2277,8 +2696,15 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
2277 /* Attempt to move tasks */ 2696 /* Attempt to move tasks */
2278 double_lock_balance(this_rq, busiest); 2697 double_lock_balance(this_rq, busiest);
2279 nr_moved = move_tasks(this_rq, this_cpu, busiest, 2698 nr_moved = move_tasks(this_rq, this_cpu, busiest,
2699 minus_1_or_zero(busiest->nr_running),
2280 imbalance, sd, NEWLY_IDLE, NULL); 2700 imbalance, sd, NEWLY_IDLE, NULL);
2281 spin_unlock(&busiest->lock); 2701 spin_unlock(&busiest->lock);
2702
2703 if (!nr_moved) {
2704 cpu_clear(cpu_of(busiest), cpus);
2705 if (!cpus_empty(cpus))
2706 goto redo;
2707 }
2282 } 2708 }
2283 2709
2284 if (!nr_moved) { 2710 if (!nr_moved) {
@@ -2292,9 +2718,11 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
2292 2718
2293out_balanced: 2719out_balanced:
2294 schedstat_inc(sd, lb_balanced[NEWLY_IDLE]); 2720 schedstat_inc(sd, lb_balanced[NEWLY_IDLE]);
2295 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER) 2721 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
2722 !sched_smt_power_savings)
2296 return -1; 2723 return -1;
2297 sd->nr_balance_failed = 0; 2724 sd->nr_balance_failed = 0;
2725
2298 return 0; 2726 return 0;
2299} 2727}
2300 2728
@@ -2302,16 +2730,15 @@ out_balanced:
2302 * idle_balance is called by schedule() if this_cpu is about to become 2730 * idle_balance is called by schedule() if this_cpu is about to become
2303 * idle. Attempts to pull tasks from other CPUs. 2731 * idle. Attempts to pull tasks from other CPUs.
2304 */ 2732 */
2305static void idle_balance(int this_cpu, runqueue_t *this_rq) 2733static void idle_balance(int this_cpu, struct rq *this_rq)
2306{ 2734{
2307 struct sched_domain *sd; 2735 struct sched_domain *sd;
2308 2736
2309 for_each_domain(this_cpu, sd) { 2737 for_each_domain(this_cpu, sd) {
2310 if (sd->flags & SD_BALANCE_NEWIDLE) { 2738 if (sd->flags & SD_BALANCE_NEWIDLE) {
2311 if (load_balance_newidle(this_cpu, this_rq, sd)) { 2739 /* If we've pulled tasks over stop searching: */
2312 /* We've pulled tasks over so stop searching */ 2740 if (load_balance_newidle(this_cpu, this_rq, sd))
2313 break; 2741 break;
2314 }
2315 } 2742 }
2316 } 2743 }
2317} 2744}
@@ -2324,14 +2751,14 @@ static void idle_balance(int this_cpu, runqueue_t *this_rq)
2324 * 2751 *
2325 * Called with busiest_rq locked. 2752 * Called with busiest_rq locked.
2326 */ 2753 */
2327static void active_load_balance(runqueue_t *busiest_rq, int busiest_cpu) 2754static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
2328{ 2755{
2329 struct sched_domain *sd;
2330 runqueue_t *target_rq;
2331 int target_cpu = busiest_rq->push_cpu; 2756 int target_cpu = busiest_rq->push_cpu;
2757 struct sched_domain *sd;
2758 struct rq *target_rq;
2332 2759
2760 /* Is there any task to move? */
2333 if (busiest_rq->nr_running <= 1) 2761 if (busiest_rq->nr_running <= 1)
2334 /* no task to move */
2335 return; 2762 return;
2336 2763
2337 target_rq = cpu_rq(target_cpu); 2764 target_rq = cpu_rq(target_cpu);
@@ -2347,21 +2774,22 @@ static void active_load_balance(runqueue_t *busiest_rq, int busiest_cpu)
2347 double_lock_balance(busiest_rq, target_rq); 2774 double_lock_balance(busiest_rq, target_rq);
2348 2775
2349 /* Search for an sd spanning us and the target CPU. */ 2776 /* Search for an sd spanning us and the target CPU. */
2350 for_each_domain(target_cpu, sd) 2777 for_each_domain(target_cpu, sd) {
2351 if ((sd->flags & SD_LOAD_BALANCE) && 2778 if ((sd->flags & SD_LOAD_BALANCE) &&
2352 cpu_isset(busiest_cpu, sd->span)) 2779 cpu_isset(busiest_cpu, sd->span))
2353 break; 2780 break;
2781 }
2354 2782
2355 if (unlikely(sd == NULL)) 2783 if (likely(sd)) {
2356 goto out; 2784 schedstat_inc(sd, alb_cnt);
2357
2358 schedstat_inc(sd, alb_cnt);
2359 2785
2360 if (move_tasks(target_rq, target_cpu, busiest_rq, 1, sd, SCHED_IDLE, NULL)) 2786 if (move_tasks(target_rq, target_cpu, busiest_rq, 1,
2361 schedstat_inc(sd, alb_pushed); 2787 RTPRIO_TO_LOAD_WEIGHT(100), sd, SCHED_IDLE,
2362 else 2788 NULL))
2363 schedstat_inc(sd, alb_failed); 2789 schedstat_inc(sd, alb_pushed);
2364out: 2790 else
2791 schedstat_inc(sd, alb_failed);
2792 }
2365 spin_unlock(&target_rq->lock); 2793 spin_unlock(&target_rq->lock);
2366} 2794}
2367 2795
@@ -2374,23 +2802,27 @@ out:
2374 * Balancing parameters are set up in arch_init_sched_domains. 2802 * Balancing parameters are set up in arch_init_sched_domains.
2375 */ 2803 */
2376 2804
2377/* Don't have all balancing operations going off at once */ 2805/* Don't have all balancing operations going off at once: */
2378#define CPU_OFFSET(cpu) (HZ * cpu / NR_CPUS) 2806static inline unsigned long cpu_offset(int cpu)
2807{
2808 return jiffies + cpu * HZ / NR_CPUS;
2809}
2379 2810
2380static void rebalance_tick(int this_cpu, runqueue_t *this_rq, 2811static void
2381 enum idle_type idle) 2812rebalance_tick(int this_cpu, struct rq *this_rq, enum idle_type idle)
2382{ 2813{
2383 unsigned long old_load, this_load; 2814 unsigned long this_load, interval, j = cpu_offset(this_cpu);
2384 unsigned long j = jiffies + CPU_OFFSET(this_cpu);
2385 struct sched_domain *sd; 2815 struct sched_domain *sd;
2386 int i; 2816 int i, scale;
2817
2818 this_load = this_rq->raw_weighted_load;
2819
2820 /* Update our load: */
2821 for (i = 0, scale = 1; i < 3; i++, scale <<= 1) {
2822 unsigned long old_load, new_load;
2387 2823
2388 this_load = this_rq->nr_running * SCHED_LOAD_SCALE;
2389 /* Update our load */
2390 for (i = 0; i < 3; i++) {
2391 unsigned long new_load = this_load;
2392 int scale = 1 << i;
2393 old_load = this_rq->cpu_load[i]; 2824 old_load = this_rq->cpu_load[i];
2825 new_load = this_load;
2394 /* 2826 /*
2395 * Round up the averaging division if load is increasing. This 2827 * Round up the averaging division if load is increasing. This
2396 * prevents us from getting stuck on 9 if the load is 10, for 2828 * prevents us from getting stuck on 9 if the load is 10, for
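
rebalance_tick() above staggers each CPU's balancing point across a jiffy (cpu_offset()) and folds raw_weighted_load into three decaying cpu_load[] averages, rounding the division up while the load rises so the average cannot get stuck just below a stable higher value. The update statement itself lies outside the hunk shown, so the sketch below follows the comment's description rather than quoting the kernel:

#include <stdio.h>

/*
 * Index i averages over roughly 2^i ticks; rounding up while the load is
 * increasing prevents the average from sticking at 9 when the load is 10.
 */
static void update_cpu_load(unsigned long cpu_load[3], unsigned long this_load)
{
    int i, scale;

    for (i = 0, scale = 1; i < 3; i++, scale <<= 1) {
        unsigned long old_load = cpu_load[i];
        unsigned long new_load = this_load;

        if (new_load > old_load)
            new_load += scale - 1;           /* round the average up */
        cpu_load[i] = (old_load * (scale - 1) + new_load) / scale;
    }
}

int main(void)
{
    unsigned long load[3] = { 0, 0, 0 };
    int tick;

    for (tick = 0; tick < 4; tick++) {
        update_cpu_load(load, 10);
        printf("tick %d: %lu %lu %lu\n", tick, load[0], load[1], load[2]);
    }
    return 0;
}
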
@@ -2402,8 +2834,6 @@ static void rebalance_tick(int this_cpu, runqueue_t *this_rq,
2402 } 2834 }
2403 2835
2404 for_each_domain(this_cpu, sd) { 2836 for_each_domain(this_cpu, sd) {
2405 unsigned long interval;
2406
2407 if (!(sd->flags & SD_LOAD_BALANCE)) 2837 if (!(sd->flags & SD_LOAD_BALANCE))
2408 continue; 2838 continue;
2409 2839
@@ -2433,17 +2863,18 @@ static void rebalance_tick(int this_cpu, runqueue_t *this_rq,
2433/* 2863/*
2434 * on UP we do not need to balance between CPUs: 2864 * on UP we do not need to balance between CPUs:
2435 */ 2865 */
2436static inline void rebalance_tick(int cpu, runqueue_t *rq, enum idle_type idle) 2866static inline void rebalance_tick(int cpu, struct rq *rq, enum idle_type idle)
2437{ 2867{
2438} 2868}
2439static inline void idle_balance(int cpu, runqueue_t *rq) 2869static inline void idle_balance(int cpu, struct rq *rq)
2440{ 2870{
2441} 2871}
2442#endif 2872#endif
2443 2873
2444static inline int wake_priority_sleeper(runqueue_t *rq) 2874static inline int wake_priority_sleeper(struct rq *rq)
2445{ 2875{
2446 int ret = 0; 2876 int ret = 0;
2877
2447#ifdef CONFIG_SCHED_SMT 2878#ifdef CONFIG_SCHED_SMT
2448 spin_lock(&rq->lock); 2879 spin_lock(&rq->lock);
2449 /* 2880 /*
@@ -2467,25 +2898,26 @@ EXPORT_PER_CPU_SYMBOL(kstat);
2467 * This is called on clock ticks and on context switches. 2898 * This is called on clock ticks and on context switches.
2468 * Bank in p->sched_time the ns elapsed since the last tick or switch. 2899 * Bank in p->sched_time the ns elapsed since the last tick or switch.
2469 */ 2900 */
2470static inline void update_cpu_clock(task_t *p, runqueue_t *rq, 2901static inline void
2471 unsigned long long now) 2902update_cpu_clock(struct task_struct *p, struct rq *rq, unsigned long long now)
2472{ 2903{
2473 unsigned long long last = max(p->timestamp, rq->timestamp_last_tick); 2904 p->sched_time += now - max(p->timestamp, rq->timestamp_last_tick);
2474 p->sched_time += now - last;
2475} 2905}
2476 2906
2477/* 2907/*
2478 * Return current->sched_time plus any more ns on the sched_clock 2908 * Return current->sched_time plus any more ns on the sched_clock
2479 * that have not yet been banked. 2909 * that have not yet been banked.
2480 */ 2910 */
2481unsigned long long current_sched_time(const task_t *tsk) 2911unsigned long long current_sched_time(const struct task_struct *p)
2482{ 2912{
2483 unsigned long long ns; 2913 unsigned long long ns;
2484 unsigned long flags; 2914 unsigned long flags;
2915
2485 local_irq_save(flags); 2916 local_irq_save(flags);
2486 ns = max(tsk->timestamp, task_rq(tsk)->timestamp_last_tick); 2917 ns = max(p->timestamp, task_rq(p)->timestamp_last_tick);
2487 ns = tsk->sched_time + (sched_clock() - ns); 2918 ns = p->sched_time + sched_clock() - ns;
2488 local_irq_restore(flags); 2919 local_irq_restore(flags);
2920
2489 return ns; 2921 return ns;
2490} 2922}
2491 2923
@@ -2499,11 +2931,16 @@ unsigned long long current_sched_time(const task_t *tsk)
2499 * increasing number of running tasks. We also ignore the interactivity 2931 * increasing number of running tasks. We also ignore the interactivity
2500 * if a better static_prio task has expired: 2932 * if a better static_prio task has expired:
2501 */ 2933 */
2502#define EXPIRED_STARVING(rq) \ 2934static inline int expired_starving(struct rq *rq)
2503 ((STARVATION_LIMIT && ((rq)->expired_timestamp && \ 2935{
2504 (jiffies - (rq)->expired_timestamp >= \ 2936 if (rq->curr->static_prio > rq->best_expired_prio)
2505 STARVATION_LIMIT * ((rq)->nr_running) + 1))) || \ 2937 return 1;
2506 ((rq)->curr->static_prio > (rq)->best_expired_prio)) 2938 if (!STARVATION_LIMIT || !rq->expired_timestamp)
2939 return 0;
2940 if (jiffies - rq->expired_timestamp > STARVATION_LIMIT * rq->nr_running)
2941 return 1;
2942 return 0;
2943}
2507 2944
2508/* 2945/*
2509 * Account user cpu time to a process. 2946 * Account user cpu time to a process.
@@ -2536,7 +2973,7 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
2536 cputime_t cputime) 2973 cputime_t cputime)
2537{ 2974{
2538 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 2975 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
2539 runqueue_t *rq = this_rq(); 2976 struct rq *rq = this_rq();
2540 cputime64_t tmp; 2977 cputime64_t tmp;
2541 2978
2542 p->stime = cputime_add(p->stime, cputime); 2979 p->stime = cputime_add(p->stime, cputime);
@@ -2566,7 +3003,7 @@ void account_steal_time(struct task_struct *p, cputime_t steal)
2566{ 3003{
2567 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 3004 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
2568 cputime64_t tmp = cputime_to_cputime64(steal); 3005 cputime64_t tmp = cputime_to_cputime64(steal);
2569 runqueue_t *rq = this_rq(); 3006 struct rq *rq = this_rq();
2570 3007
2571 if (p == rq->idle) { 3008 if (p == rq->idle) {
2572 p->stime = cputime_add(p->stime, steal); 3009 p->stime = cputime_add(p->stime, steal);
@@ -2587,10 +3024,10 @@ void account_steal_time(struct task_struct *p, cputime_t steal)
2587 */ 3024 */
2588void scheduler_tick(void) 3025void scheduler_tick(void)
2589{ 3026{
2590 int cpu = smp_processor_id();
2591 runqueue_t *rq = this_rq();
2592 task_t *p = current;
2593 unsigned long long now = sched_clock(); 3027 unsigned long long now = sched_clock();
3028 struct task_struct *p = current;
3029 int cpu = smp_processor_id();
3030 struct rq *rq = cpu_rq(cpu);
2594 3031
2595 update_cpu_clock(p, rq, now); 3032 update_cpu_clock(p, rq, now);
2596 3033
@@ -2640,7 +3077,7 @@ void scheduler_tick(void)
2640 3077
2641 if (!rq->expired_timestamp) 3078 if (!rq->expired_timestamp)
2642 rq->expired_timestamp = jiffies; 3079 rq->expired_timestamp = jiffies;
2643 if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) { 3080 if (!TASK_INTERACTIVE(p) || expired_starving(rq)) {
2644 enqueue_task(p, rq->expired); 3081 enqueue_task(p, rq->expired);
2645 if (p->static_prio < rq->best_expired_prio) 3082 if (p->static_prio < rq->best_expired_prio)
2646 rq->best_expired_prio = p->static_prio; 3083 rq->best_expired_prio = p->static_prio;
@@ -2679,55 +3116,42 @@ out:
2679} 3116}
2680 3117
2681#ifdef CONFIG_SCHED_SMT 3118#ifdef CONFIG_SCHED_SMT
2682static inline void wakeup_busy_runqueue(runqueue_t *rq) 3119static inline void wakeup_busy_runqueue(struct rq *rq)
2683{ 3120{
2684 /* If an SMT runqueue is sleeping due to priority reasons wake it up */ 3121 /* If an SMT runqueue is sleeping due to priority reasons wake it up */
2685 if (rq->curr == rq->idle && rq->nr_running) 3122 if (rq->curr == rq->idle && rq->nr_running)
2686 resched_task(rq->idle); 3123 resched_task(rq->idle);
2687} 3124}
2688 3125
2689static void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq) 3126/*
3127 * Called with interrupt disabled and this_rq's runqueue locked.
3128 */
3129static void wake_sleeping_dependent(int this_cpu)
2690{ 3130{
2691 struct sched_domain *tmp, *sd = NULL; 3131 struct sched_domain *tmp, *sd = NULL;
2692 cpumask_t sibling_map;
2693 int i; 3132 int i;
2694 3133
2695 for_each_domain(this_cpu, tmp) 3134 for_each_domain(this_cpu, tmp) {
2696 if (tmp->flags & SD_SHARE_CPUPOWER) 3135 if (tmp->flags & SD_SHARE_CPUPOWER) {
2697 sd = tmp; 3136 sd = tmp;
3137 break;
3138 }
3139 }
2698 3140
2699 if (!sd) 3141 if (!sd)
2700 return; 3142 return;
2701 3143
2702 /* 3144 for_each_cpu_mask(i, sd->span) {
2703 * Unlock the current runqueue because we have to lock in 3145 struct rq *smt_rq = cpu_rq(i);
2704 * CPU order to avoid deadlocks. Caller knows that we might
2705 * unlock. We keep IRQs disabled.
2706 */
2707 spin_unlock(&this_rq->lock);
2708
2709 sibling_map = sd->span;
2710
2711 for_each_cpu_mask(i, sibling_map)
2712 spin_lock(&cpu_rq(i)->lock);
2713 /*
2714 * We clear this CPU from the mask. This both simplifies the
2715 * inner loop and keps this_rq locked when we exit:
2716 */
2717 cpu_clear(this_cpu, sibling_map);
2718 3146
2719 for_each_cpu_mask(i, sibling_map) { 3147 if (i == this_cpu)
2720 runqueue_t *smt_rq = cpu_rq(i); 3148 continue;
3149 if (unlikely(!spin_trylock(&smt_rq->lock)))
3150 continue;
2721 3151
2722 wakeup_busy_runqueue(smt_rq); 3152 wakeup_busy_runqueue(smt_rq);
3153 spin_unlock(&smt_rq->lock);
2723 } 3154 }
2724
2725 for_each_cpu_mask(i, sibling_map)
2726 spin_unlock(&cpu_rq(i)->lock);
2727 /*
2728 * We exit with this_cpu's rq still held and IRQs
2729 * still disabled:
2730 */
2731} 3155}
2732 3156
2733/* 3157/*
@@ -2735,57 +3159,53 @@ static void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
2735 * utilize, if another task runs on a sibling. This models the 3159 * utilize, if another task runs on a sibling. This models the
2736 * slowdown effect of other tasks running on siblings: 3160 * slowdown effect of other tasks running on siblings:
2737 */ 3161 */
2738static inline unsigned long smt_slice(task_t *p, struct sched_domain *sd) 3162static inline unsigned long
3163smt_slice(struct task_struct *p, struct sched_domain *sd)
2739{ 3164{
2740 return p->time_slice * (100 - sd->per_cpu_gain) / 100; 3165 return p->time_slice * (100 - sd->per_cpu_gain) / 100;
2741} 3166}
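
The smt_slice() scaling above is easiest to see with concrete numbers. A minimal standalone sketch of the same arithmetic (plain userspace C; the 100-tick slice and the gain of 25 are made-up inputs for illustration, not scheduler defaults):

	#include <stdio.h>

	/* Mirrors the smt_slice() formula: a busy SMT sibling is assumed to
	 * leave (100 - per_cpu_gain) percent of the timeslice usable. */
	static unsigned long smt_slice_demo(unsigned long time_slice,
					    unsigned int per_cpu_gain)
	{
		return time_slice * (100 - per_cpu_gain) / 100;
	}

	int main(void)
	{
		/* 100 ticks with a gain of 25 -> an effective 75-tick slice. */
		printf("%lu\n", smt_slice_demo(100, 25));
		return 0;
	}
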
2742 3167
2743static int dependent_sleeper(int this_cpu, runqueue_t *this_rq) 3168/*
 3169 * To minimise lock contention and avoid dropping this_rq's runqueue lock, we
 3170 * only trylock the sibling runqueues and bypass those runqueues if we fail to
 3171 * acquire their lock. As we only trylock, the normal locking order does not
 3172 * need to be obeyed.
3173 */
3174static int
3175dependent_sleeper(int this_cpu, struct rq *this_rq, struct task_struct *p)
2744{ 3176{
2745 struct sched_domain *tmp, *sd = NULL; 3177 struct sched_domain *tmp, *sd = NULL;
2746 cpumask_t sibling_map;
2747 prio_array_t *array;
2748 int ret = 0, i; 3178 int ret = 0, i;
2749 task_t *p;
2750 3179
2751 for_each_domain(this_cpu, tmp) 3180 /* kernel/rt threads do not participate in dependent sleeping */
2752 if (tmp->flags & SD_SHARE_CPUPOWER) 3181 if (!p->mm || rt_task(p))
3182 return 0;
3183
3184 for_each_domain(this_cpu, tmp) {
3185 if (tmp->flags & SD_SHARE_CPUPOWER) {
2753 sd = tmp; 3186 sd = tmp;
3187 break;
3188 }
3189 }
2754 3190
2755 if (!sd) 3191 if (!sd)
2756 return 0; 3192 return 0;
2757 3193
2758 /* 3194 for_each_cpu_mask(i, sd->span) {
2759 * The same locking rules and details apply as for 3195 struct task_struct *smt_curr;
2760 * wake_sleeping_dependent(): 3196 struct rq *smt_rq;
2761 */
2762 spin_unlock(&this_rq->lock);
2763 sibling_map = sd->span;
2764 for_each_cpu_mask(i, sibling_map)
2765 spin_lock(&cpu_rq(i)->lock);
2766 cpu_clear(this_cpu, sibling_map);
2767 3197
2768 /* 3198 if (i == this_cpu)
2769 * Establish next task to be run - it might have gone away because 3199 continue;
2770 * we released the runqueue lock above:
2771 */
2772 if (!this_rq->nr_running)
2773 goto out_unlock;
2774 array = this_rq->active;
2775 if (!array->nr_active)
2776 array = this_rq->expired;
2777 BUG_ON(!array->nr_active);
2778 3200
2779 p = list_entry(array->queue[sched_find_first_bit(array->bitmap)].next, 3201 smt_rq = cpu_rq(i);
2780 task_t, run_list); 3202 if (unlikely(!spin_trylock(&smt_rq->lock)))
3203 continue;
2781 3204
2782 for_each_cpu_mask(i, sibling_map) { 3205 smt_curr = smt_rq->curr;
2783 runqueue_t *smt_rq = cpu_rq(i);
2784 task_t *smt_curr = smt_rq->curr;
2785 3206
2786 /* Kernel threads do not participate in dependent sleeping */ 3207 if (!smt_curr->mm)
2787 if (!p->mm || !smt_curr->mm || rt_task(p)) 3208 goto unlock;
2788 goto check_smt_task;
2789 3209
2790 /* 3210 /*
2791 * If a user task with lower static priority than the 3211 * If a user task with lower static priority than the
@@ -2803,49 +3223,23 @@ static int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
2803 if ((jiffies % DEF_TIMESLICE) > 3223 if ((jiffies % DEF_TIMESLICE) >
2804 (sd->per_cpu_gain * DEF_TIMESLICE / 100)) 3224 (sd->per_cpu_gain * DEF_TIMESLICE / 100))
2805 ret = 1; 3225 ret = 1;
2806 } else 3226 } else {
2807 if (smt_curr->static_prio < p->static_prio && 3227 if (smt_curr->static_prio < p->static_prio &&
2808 !TASK_PREEMPTS_CURR(p, smt_rq) && 3228 !TASK_PREEMPTS_CURR(p, smt_rq) &&
2809 smt_slice(smt_curr, sd) > task_timeslice(p)) 3229 smt_slice(smt_curr, sd) > task_timeslice(p))
2810 ret = 1; 3230 ret = 1;
2811
2812check_smt_task:
2813 if ((!smt_curr->mm && smt_curr != smt_rq->idle) ||
2814 rt_task(smt_curr))
2815 continue;
2816 if (!p->mm) {
2817 wakeup_busy_runqueue(smt_rq);
2818 continue;
2819 }
2820
2821 /*
2822 * Reschedule a lower priority task on the SMT sibling for
2823 * it to be put to sleep, or wake it up if it has been put to
2824 * sleep for priority reasons to see if it should run now.
2825 */
2826 if (rt_task(p)) {
2827 if ((jiffies % DEF_TIMESLICE) >
2828 (sd->per_cpu_gain * DEF_TIMESLICE / 100))
2829 resched_task(smt_curr);
2830 } else {
2831 if (TASK_PREEMPTS_CURR(p, smt_rq) &&
2832 smt_slice(p, sd) > task_timeslice(smt_curr))
2833 resched_task(smt_curr);
2834 else
2835 wakeup_busy_runqueue(smt_rq);
2836 } 3231 }
3232unlock:
3233 spin_unlock(&smt_rq->lock);
2837 } 3234 }
2838out_unlock:
2839 for_each_cpu_mask(i, sibling_map)
2840 spin_unlock(&cpu_rq(i)->lock);
2841 return ret; 3235 return ret;
2842} 3236}
2843#else 3237#else
2844static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq) 3238static inline void wake_sleeping_dependent(int this_cpu)
2845{ 3239{
2846} 3240}
2847 3241static inline int
2848static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq) 3242dependent_sleeper(int this_cpu, struct rq *this_rq, struct task_struct *p)
2849{ 3243{
2850 return 0; 3244 return 0;
2851} 3245}
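
Both wake_sleeping_dependent() and dependent_sleeper() now walk sd->span and spin_trylock() each sibling runqueue, skipping any they cannot get, instead of locking all siblings in CPU order. A minimal userspace sketch of that trylock-and-skip pattern, using pthread spinlocks in place of runqueue locks (the two-sibling setup and names are illustrative only):

	#include <pthread.h>
	#include <stdio.h>

	#define NR_SIBLINGS 2
	static pthread_spinlock_t sibling_lock[NR_SIBLINGS];

	/* Visit every sibling but never block on its lock: if the trylock
	 * fails we simply skip that sibling, so no lock ordering is needed. */
	static void poke_siblings(int this_cpu)
	{
		int i;

		for (i = 0; i < NR_SIBLINGS; i++) {
			if (i == this_cpu)
				continue;
			if (pthread_spin_trylock(&sibling_lock[i]))
				continue;	/* busy: bypass it, as the kernel code does */
			/* ... inspect or resched the sibling here ... */
			pthread_spin_unlock(&sibling_lock[i]);
		}
	}

	int main(void)
	{
		int i;

		for (i = 0; i < NR_SIBLINGS; i++)
			pthread_spin_init(&sibling_lock[i], PTHREAD_PROCESS_PRIVATE);
		poke_siblings(0);
		puts("done");
		return 0;
	}
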
@@ -2858,12 +3252,13 @@ void fastcall add_preempt_count(int val)
2858 /* 3252 /*
2859 * Underflow? 3253 * Underflow?
2860 */ 3254 */
2861 BUG_ON((preempt_count() < 0)); 3255 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
3256 return;
2862 preempt_count() += val; 3257 preempt_count() += val;
2863 /* 3258 /*
2864 * Spinlock count overflowing soon? 3259 * Spinlock count overflowing soon?
2865 */ 3260 */
2866 BUG_ON((preempt_count() & PREEMPT_MASK) >= PREEMPT_MASK-10); 3261 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= PREEMPT_MASK-10);
2867} 3262}
2868EXPORT_SYMBOL(add_preempt_count); 3263EXPORT_SYMBOL(add_preempt_count);
2869 3264
@@ -2872,11 +3267,15 @@ void fastcall sub_preempt_count(int val)
2872 /* 3267 /*
2873 * Underflow? 3268 * Underflow?
2874 */ 3269 */
2875 BUG_ON(val > preempt_count()); 3270 if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
3271 return;
2876 /* 3272 /*
2877 * Is the spinlock portion underflowing? 3273 * Is the spinlock portion underflowing?
2878 */ 3274 */
2879 BUG_ON((val < PREEMPT_MASK) && !(preempt_count() & PREEMPT_MASK)); 3275 if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
3276 !(preempt_count() & PREEMPT_MASK)))
3277 return;
3278
2880 preempt_count() -= val; 3279 preempt_count() -= val;
2881} 3280}
2882EXPORT_SYMBOL(sub_preempt_count); 3281EXPORT_SYMBOL(sub_preempt_count);
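
add_preempt_count()/sub_preempt_count() now warn and return via DEBUG_LOCKS_WARN_ON() instead of BUG_ON()ing the box. The checks themselves are plain range tests; a stubbed restatement for illustration (the mask value and counter below are placeholders, not the kernel's PREEMPT_MASK or preempt_count()):

	#include <stdio.h>

	#define FAKE_PREEMPT_MASK 0x000000ff	/* placeholder mask */
	static int fake_count;			/* placeholder for preempt_count() */

	/* Same shape as the patched helpers: warn and bail out on bogus
	 * values instead of crashing. */
	static void add_count(int val)
	{
		if (fake_count < 0) {
			fprintf(stderr, "preempt count underflow\n");
			return;
		}
		fake_count += val;
		if ((fake_count & FAKE_PREEMPT_MASK) >= FAKE_PREEMPT_MASK - 10)
			fprintf(stderr, "spinlock count overflowing soon\n");
	}

	static void sub_count(int val)
	{
		if (val > fake_count) {
			fprintf(stderr, "preempt count underflow\n");
			return;
		}
		fake_count -= val;
	}

	int main(void)
	{
		add_count(1);
		sub_count(1);
		sub_count(1);		/* triggers the underflow warning */
		return 0;
	}
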
@@ -2894,14 +3293,14 @@ static inline int interactive_sleep(enum sleep_type sleep_type)
2894 */ 3293 */
2895asmlinkage void __sched schedule(void) 3294asmlinkage void __sched schedule(void)
2896{ 3295{
2897 long *switch_count; 3296 struct task_struct *prev, *next;
2898 task_t *prev, *next; 3297 struct prio_array *array;
2899 runqueue_t *rq;
2900 prio_array_t *array;
2901 struct list_head *queue; 3298 struct list_head *queue;
2902 unsigned long long now; 3299 unsigned long long now;
2903 unsigned long run_time; 3300 unsigned long run_time;
2904 int cpu, idx, new_prio; 3301 int cpu, idx, new_prio;
3302 long *switch_count;
3303 struct rq *rq;
2905 3304
2906 /* 3305 /*
2907 * Test if we are atomic. Since do_exit() needs to call into 3306 * Test if we are atomic. Since do_exit() needs to call into
@@ -2949,9 +3348,6 @@ need_resched_nonpreemptible:
2949 3348
2950 spin_lock_irq(&rq->lock); 3349 spin_lock_irq(&rq->lock);
2951 3350
2952 if (unlikely(prev->flags & PF_DEAD))
2953 prev->state = EXIT_DEAD;
2954
2955 switch_count = &prev->nivcsw; 3351 switch_count = &prev->nivcsw;
2956 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { 3352 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
2957 switch_count = &prev->nvcsw; 3353 switch_count = &prev->nvcsw;
@@ -2967,32 +3363,13 @@ need_resched_nonpreemptible:
2967 3363
2968 cpu = smp_processor_id(); 3364 cpu = smp_processor_id();
2969 if (unlikely(!rq->nr_running)) { 3365 if (unlikely(!rq->nr_running)) {
2970go_idle:
2971 idle_balance(cpu, rq); 3366 idle_balance(cpu, rq);
2972 if (!rq->nr_running) { 3367 if (!rq->nr_running) {
2973 next = rq->idle; 3368 next = rq->idle;
2974 rq->expired_timestamp = 0; 3369 rq->expired_timestamp = 0;
2975 wake_sleeping_dependent(cpu, rq); 3370 wake_sleeping_dependent(cpu);
2976 /*
2977 * wake_sleeping_dependent() might have released
2978 * the runqueue, so break out if we got new
2979 * tasks meanwhile:
2980 */
2981 if (!rq->nr_running)
2982 goto switch_tasks;
2983 }
2984 } else {
2985 if (dependent_sleeper(cpu, rq)) {
2986 next = rq->idle;
2987 goto switch_tasks; 3371 goto switch_tasks;
2988 } 3372 }
2989 /*
2990 * dependent_sleeper() releases and reacquires the runqueue
2991 * lock, hence go into the idle loop if the rq went
2992 * empty meanwhile:
2993 */
2994 if (unlikely(!rq->nr_running))
2995 goto go_idle;
2996 } 3373 }
2997 3374
2998 array = rq->active; 3375 array = rq->active;
@@ -3010,7 +3387,7 @@ go_idle:
3010 3387
3011 idx = sched_find_first_bit(array->bitmap); 3388 idx = sched_find_first_bit(array->bitmap);
3012 queue = array->queue + idx; 3389 queue = array->queue + idx;
3013 next = list_entry(queue->next, task_t, run_list); 3390 next = list_entry(queue->next, struct task_struct, run_list);
3014 3391
3015 if (!rt_task(next) && interactive_sleep(next->sleep_type)) { 3392 if (!rt_task(next) && interactive_sleep(next->sleep_type)) {
3016 unsigned long long delta = now - next->timestamp; 3393 unsigned long long delta = now - next->timestamp;
@@ -3030,6 +3407,8 @@ go_idle:
3030 } 3407 }
3031 } 3408 }
3032 next->sleep_type = SLEEP_NORMAL; 3409 next->sleep_type = SLEEP_NORMAL;
3410 if (dependent_sleeper(cpu, rq, next))
3411 next = rq->idle;
3033switch_tasks: 3412switch_tasks:
3034 if (next == rq->idle) 3413 if (next == rq->idle)
3035 schedstat_inc(rq, sched_goidle); 3414 schedstat_inc(rq, sched_goidle);
@@ -3071,12 +3450,11 @@ switch_tasks:
3071 if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) 3450 if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
3072 goto need_resched; 3451 goto need_resched;
3073} 3452}
3074
3075EXPORT_SYMBOL(schedule); 3453EXPORT_SYMBOL(schedule);
3076 3454
3077#ifdef CONFIG_PREEMPT 3455#ifdef CONFIG_PREEMPT
3078/* 3456/*
3079 * this is is the entry point to schedule() from in-kernel preemption 3457 * this is the entry point to schedule() from in-kernel preemption
3080 * off of preempt_enable. Kernel preemptions off return from interrupt 3458 * off of preempt_enable. Kernel preemptions off return from interrupt
3081 * occur there and call schedule directly. 3459 * occur there and call schedule directly.
3082 */ 3460 */
@@ -3116,11 +3494,10 @@ need_resched:
3116 if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) 3494 if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
3117 goto need_resched; 3495 goto need_resched;
3118} 3496}
3119
3120EXPORT_SYMBOL(preempt_schedule); 3497EXPORT_SYMBOL(preempt_schedule);
3121 3498
3122/* 3499/*
3123 * this is is the entry point to schedule() from kernel preemption 3500 * this is the entry point to schedule() from kernel preemption
3124 * off of irq context. 3501 * off of irq context.
3125 * Note, that this is called and return with irqs disabled. This will 3502 * Note, that this is called and return with irqs disabled. This will
3126 * protect us against recursive calling from irq. 3503 * protect us against recursive calling from irq.
@@ -3132,7 +3509,7 @@ asmlinkage void __sched preempt_schedule_irq(void)
3132 struct task_struct *task = current; 3509 struct task_struct *task = current;
3133 int saved_lock_depth; 3510 int saved_lock_depth;
3134#endif 3511#endif
3135 /* Catch callers which need to be fixed*/ 3512 /* Catch callers which need to be fixed */
3136 BUG_ON(ti->preempt_count || !irqs_disabled()); 3513 BUG_ON(ti->preempt_count || !irqs_disabled());
3137 3514
3138need_resched: 3515need_resched:
@@ -3165,10 +3542,8 @@ need_resched:
3165int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, 3542int default_wake_function(wait_queue_t *curr, unsigned mode, int sync,
3166 void *key) 3543 void *key)
3167{ 3544{
3168 task_t *p = curr->private; 3545 return try_to_wake_up(curr->private, mode, sync);
3169 return try_to_wake_up(p, mode, sync);
3170} 3546}
3171
3172EXPORT_SYMBOL(default_wake_function); 3547EXPORT_SYMBOL(default_wake_function);
3173 3548
3174/* 3549/*
@@ -3186,13 +3561,11 @@ static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
3186 struct list_head *tmp, *next; 3561 struct list_head *tmp, *next;
3187 3562
3188 list_for_each_safe(tmp, next, &q->task_list) { 3563 list_for_each_safe(tmp, next, &q->task_list) {
3189 wait_queue_t *curr; 3564 wait_queue_t *curr = list_entry(tmp, wait_queue_t, task_list);
3190 unsigned flags; 3565 unsigned flags = curr->flags;
3191 curr = list_entry(tmp, wait_queue_t, task_list); 3566
3192 flags = curr->flags;
3193 if (curr->func(curr, mode, sync, key) && 3567 if (curr->func(curr, mode, sync, key) &&
3194 (flags & WQ_FLAG_EXCLUSIVE) && 3568 (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
3195 !--nr_exclusive)
3196 break; 3569 break;
3197 } 3570 }
3198} 3571}
@@ -3213,7 +3586,6 @@ void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode,
3213 __wake_up_common(q, mode, nr_exclusive, 0, key); 3586 __wake_up_common(q, mode, nr_exclusive, 0, key);
3214 spin_unlock_irqrestore(&q->lock, flags); 3587 spin_unlock_irqrestore(&q->lock, flags);
3215} 3588}
3216
3217EXPORT_SYMBOL(__wake_up); 3589EXPORT_SYMBOL(__wake_up);
3218 3590
3219/* 3591/*
@@ -3282,6 +3654,7 @@ EXPORT_SYMBOL(complete_all);
3282void fastcall __sched wait_for_completion(struct completion *x) 3654void fastcall __sched wait_for_completion(struct completion *x)
3283{ 3655{
3284 might_sleep(); 3656 might_sleep();
3657
3285 spin_lock_irq(&x->wait.lock); 3658 spin_lock_irq(&x->wait.lock);
3286 if (!x->done) { 3659 if (!x->done) {
3287 DECLARE_WAITQUEUE(wait, current); 3660 DECLARE_WAITQUEUE(wait, current);
@@ -3426,7 +3799,6 @@ void fastcall __sched interruptible_sleep_on(wait_queue_head_t *q)
3426 schedule(); 3799 schedule();
3427 SLEEP_ON_TAIL 3800 SLEEP_ON_TAIL
3428} 3801}
3429
3430EXPORT_SYMBOL(interruptible_sleep_on); 3802EXPORT_SYMBOL(interruptible_sleep_on);
3431 3803
3432long fastcall __sched 3804long fastcall __sched
@@ -3442,7 +3814,6 @@ interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
3442 3814
3443 return timeout; 3815 return timeout;
3444} 3816}
3445
3446EXPORT_SYMBOL(interruptible_sleep_on_timeout); 3817EXPORT_SYMBOL(interruptible_sleep_on_timeout);
3447 3818
3448void fastcall __sched sleep_on(wait_queue_head_t *q) 3819void fastcall __sched sleep_on(wait_queue_head_t *q)
@@ -3455,7 +3826,6 @@ void fastcall __sched sleep_on(wait_queue_head_t *q)
3455 schedule(); 3826 schedule();
3456 SLEEP_ON_TAIL 3827 SLEEP_ON_TAIL
3457} 3828}
3458
3459EXPORT_SYMBOL(sleep_on); 3829EXPORT_SYMBOL(sleep_on);
3460 3830
3461long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout) 3831long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
@@ -3473,12 +3843,65 @@ long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
3473 3843
3474EXPORT_SYMBOL(sleep_on_timeout); 3844EXPORT_SYMBOL(sleep_on_timeout);
3475 3845
3476void set_user_nice(task_t *p, long nice) 3846#ifdef CONFIG_RT_MUTEXES
3847
3848/*
3849 * rt_mutex_setprio - set the current priority of a task
3850 * @p: task
3851 * @prio: prio value (kernel-internal form)
3852 *
3853 * This function changes the 'effective' priority of a task. It does
3854 * not touch ->normal_prio like __setscheduler().
3855 *
3856 * Used by the rt_mutex code to implement priority inheritance logic.
3857 */
3858void rt_mutex_setprio(struct task_struct *p, int prio)
3859{
3860 struct prio_array *array;
3861 unsigned long flags;
3862 struct rq *rq;
3863 int oldprio;
3864
3865 BUG_ON(prio < 0 || prio > MAX_PRIO);
3866
3867 rq = task_rq_lock(p, &flags);
3868
3869 oldprio = p->prio;
3870 array = p->array;
3871 if (array)
3872 dequeue_task(p, array);
3873 p->prio = prio;
3874
3875 if (array) {
3876 /*
3877 * If changing to an RT priority then queue it
3878 * in the active array!
3879 */
3880 if (rt_task(p))
3881 array = rq->active;
3882 enqueue_task(p, array);
3883 /*
3884 * Reschedule if we are currently running on this runqueue and
3885 * our priority decreased, or if we are not currently running on
3886 * this runqueue and our priority is higher than the current's
3887 */
3888 if (task_running(rq, p)) {
3889 if (p->prio > oldprio)
3890 resched_task(rq->curr);
3891 } else if (TASK_PREEMPTS_CURR(p, rq))
3892 resched_task(rq->curr);
3893 }
3894 task_rq_unlock(rq, &flags);
3895}
3896
3897#endif
3898
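
rt_mutex_setprio() is the hook the rtmutex code uses to boost a lock owner to a waiter's effective priority without touching ->normal_prio. A rough userspace analogue of the same priority-inheritance idea, using a POSIX PTHREAD_PRIO_INHERIT mutex (the single-threaded setup only demonstrates the attribute; it is not how the kernel path itself works):

	#define _GNU_SOURCE
	#include <pthread.h>
	#include <stdio.h>

	int main(void)
	{
		pthread_mutexattr_t attr;
		pthread_mutex_t lock;

		pthread_mutexattr_init(&attr);
		/* With PTHREAD_PRIO_INHERIT a low-priority owner is boosted to
		 * the priority of the highest-priority waiter -- the kind of
		 * effective-priority adjustment rt_mutex_setprio() performs
		 * for kernel rtmutexes. */
		if (pthread_mutexattr_setprotocol(&attr, PTHREAD_PRIO_INHERIT) != 0)
			fprintf(stderr, "priority inheritance not supported here\n");
		pthread_mutex_init(&lock, &attr);

		pthread_mutex_lock(&lock);
		/* ... critical section: owner may be boosted while others block ... */
		pthread_mutex_unlock(&lock);

		pthread_mutex_destroy(&lock);
		pthread_mutexattr_destroy(&attr);
		return 0;
	}
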
3899void set_user_nice(struct task_struct *p, long nice)
3477{ 3900{
3901 struct prio_array *array;
3902 int old_prio, delta;
3478 unsigned long flags; 3903 unsigned long flags;
3479 prio_array_t *array; 3904 struct rq *rq;
3480 runqueue_t *rq;
3481 int old_prio, new_prio, delta;
3482 3905
3483 if (TASK_NICE(p) == nice || nice < -20 || nice > 19) 3906 if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
3484 return; 3907 return;
@@ -3493,22 +3916,25 @@ void set_user_nice(task_t *p, long nice)
 3493 * it won't have any effect on scheduling until the task is 3916 * it won't have any effect on scheduling until the task is
3494 * not SCHED_NORMAL/SCHED_BATCH: 3917 * not SCHED_NORMAL/SCHED_BATCH:
3495 */ 3918 */
3496 if (rt_task(p)) { 3919 if (has_rt_policy(p)) {
3497 p->static_prio = NICE_TO_PRIO(nice); 3920 p->static_prio = NICE_TO_PRIO(nice);
3498 goto out_unlock; 3921 goto out_unlock;
3499 } 3922 }
3500 array = p->array; 3923 array = p->array;
3501 if (array) 3924 if (array) {
3502 dequeue_task(p, array); 3925 dequeue_task(p, array);
3926 dec_raw_weighted_load(rq, p);
3927 }
3503 3928
3504 old_prio = p->prio;
3505 new_prio = NICE_TO_PRIO(nice);
3506 delta = new_prio - old_prio;
3507 p->static_prio = NICE_TO_PRIO(nice); 3929 p->static_prio = NICE_TO_PRIO(nice);
3508 p->prio += delta; 3930 set_load_weight(p);
3931 old_prio = p->prio;
3932 p->prio = effective_prio(p);
3933 delta = p->prio - old_prio;
3509 3934
3510 if (array) { 3935 if (array) {
3511 enqueue_task(p, array); 3936 enqueue_task(p, array);
3937 inc_raw_weighted_load(rq, p);
3512 /* 3938 /*
3513 * If the task increased its priority or is running and 3939 * If the task increased its priority or is running and
3514 * lowered its priority, then reschedule its CPU: 3940 * lowered its priority, then reschedule its CPU:
@@ -3519,7 +3945,6 @@ void set_user_nice(task_t *p, long nice)
3519out_unlock: 3945out_unlock:
3520 task_rq_unlock(rq, &flags); 3946 task_rq_unlock(rq, &flags);
3521} 3947}
3522
3523EXPORT_SYMBOL(set_user_nice); 3948EXPORT_SYMBOL(set_user_nice);
3524 3949
3525/* 3950/*
@@ -3527,10 +3952,11 @@ EXPORT_SYMBOL(set_user_nice);
3527 * @p: task 3952 * @p: task
3528 * @nice: nice value 3953 * @nice: nice value
3529 */ 3954 */
3530int can_nice(const task_t *p, const int nice) 3955int can_nice(const struct task_struct *p, const int nice)
3531{ 3956{
3532 /* convert nice value [19,-20] to rlimit style value [1,40] */ 3957 /* convert nice value [19,-20] to rlimit style value [1,40] */
3533 int nice_rlim = 20 - nice; 3958 int nice_rlim = 20 - nice;
3959
3534 return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur || 3960 return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur ||
3535 capable(CAP_SYS_NICE)); 3961 capable(CAP_SYS_NICE));
3536} 3962}
@@ -3546,8 +3972,7 @@ int can_nice(const task_t *p, const int nice)
3546 */ 3972 */
3547asmlinkage long sys_nice(int increment) 3973asmlinkage long sys_nice(int increment)
3548{ 3974{
3549 int retval; 3975 long nice, retval;
3550 long nice;
3551 3976
3552 /* 3977 /*
3553 * Setpriority might change our priority at the same moment. 3978 * Setpriority might change our priority at the same moment.
@@ -3586,7 +4011,7 @@ asmlinkage long sys_nice(int increment)
3586 * RT tasks are offset by -200. Normal tasks are centered 4011 * RT tasks are offset by -200. Normal tasks are centered
3587 * around 0, value goes from -16 to +15. 4012 * around 0, value goes from -16 to +15.
3588 */ 4013 */
3589int task_prio(const task_t *p) 4014int task_prio(const struct task_struct *p)
3590{ 4015{
3591 return p->prio - MAX_RT_PRIO; 4016 return p->prio - MAX_RT_PRIO;
3592} 4017}
@@ -3595,7 +4020,7 @@ int task_prio(const task_t *p)
3595 * task_nice - return the nice value of a given task. 4020 * task_nice - return the nice value of a given task.
3596 * @p: the task in question. 4021 * @p: the task in question.
3597 */ 4022 */
3598int task_nice(const task_t *p) 4023int task_nice(const struct task_struct *p)
3599{ 4024{
3600 return TASK_NICE(p); 4025 return TASK_NICE(p);
3601} 4026}
@@ -3614,7 +4039,7 @@ int idle_cpu(int cpu)
3614 * idle_task - return the idle task for a given cpu. 4039 * idle_task - return the idle task for a given cpu.
3615 * @cpu: the processor in question. 4040 * @cpu: the processor in question.
3616 */ 4041 */
3617task_t *idle_task(int cpu) 4042struct task_struct *idle_task(int cpu)
3618{ 4043{
3619 return cpu_rq(cpu)->idle; 4044 return cpu_rq(cpu)->idle;
3620} 4045}
@@ -3623,7 +4048,7 @@ task_t *idle_task(int cpu)
3623 * find_process_by_pid - find a process with a matching PID value. 4048 * find_process_by_pid - find a process with a matching PID value.
3624 * @pid: the pid in question. 4049 * @pid: the pid in question.
3625 */ 4050 */
3626static inline task_t *find_process_by_pid(pid_t pid) 4051static inline struct task_struct *find_process_by_pid(pid_t pid)
3627{ 4052{
3628 return pid ? find_task_by_pid(pid) : current; 4053 return pid ? find_task_by_pid(pid) : current;
3629} 4054}
@@ -3632,18 +4057,18 @@ static inline task_t *find_process_by_pid(pid_t pid)
3632static void __setscheduler(struct task_struct *p, int policy, int prio) 4057static void __setscheduler(struct task_struct *p, int policy, int prio)
3633{ 4058{
3634 BUG_ON(p->array); 4059 BUG_ON(p->array);
4060
3635 p->policy = policy; 4061 p->policy = policy;
3636 p->rt_priority = prio; 4062 p->rt_priority = prio;
3637 if (policy != SCHED_NORMAL && policy != SCHED_BATCH) { 4063 p->normal_prio = normal_prio(p);
3638 p->prio = MAX_RT_PRIO-1 - p->rt_priority; 4064 /* we are holding p->pi_lock already */
3639 } else { 4065 p->prio = rt_mutex_getprio(p);
3640 p->prio = p->static_prio; 4066 /*
3641 /* 4067 * SCHED_BATCH tasks are treated as perpetual CPU hogs:
3642 * SCHED_BATCH tasks are treated as perpetual CPU hogs: 4068 */
3643 */ 4069 if (policy == SCHED_BATCH)
3644 if (policy == SCHED_BATCH) 4070 p->sleep_avg = 0;
3645 p->sleep_avg = 0; 4071 set_load_weight(p);
3646 }
3647} 4072}
3648 4073
3649/** 4074/**
@@ -3652,16 +4077,19 @@ static void __setscheduler(struct task_struct *p, int policy, int prio)
3652 * @p: the task in question. 4077 * @p: the task in question.
3653 * @policy: new policy. 4078 * @policy: new policy.
3654 * @param: structure containing the new RT priority. 4079 * @param: structure containing the new RT priority.
4080 *
 4081 * NOTE: the task may already be dead
3655 */ 4082 */
3656int sched_setscheduler(struct task_struct *p, int policy, 4083int sched_setscheduler(struct task_struct *p, int policy,
3657 struct sched_param *param) 4084 struct sched_param *param)
3658{ 4085{
3659 int retval; 4086 int retval, oldprio, oldpolicy = -1;
3660 int oldprio, oldpolicy = -1; 4087 struct prio_array *array;
3661 prio_array_t *array;
3662 unsigned long flags; 4088 unsigned long flags;
3663 runqueue_t *rq; 4089 struct rq *rq;
3664 4090
4091 /* may grab non-irq protected spin_locks */
4092 BUG_ON(in_interrupt());
3665recheck: 4093recheck:
3666 /* double check policy once rq lock held */ 4094 /* double check policy once rq lock held */
3667 if (policy < 0) 4095 if (policy < 0)
@@ -3678,28 +4106,32 @@ recheck:
3678 (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) || 4106 (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) ||
3679 (!p->mm && param->sched_priority > MAX_RT_PRIO-1)) 4107 (!p->mm && param->sched_priority > MAX_RT_PRIO-1))
3680 return -EINVAL; 4108 return -EINVAL;
3681 if ((policy == SCHED_NORMAL || policy == SCHED_BATCH) 4109 if (is_rt_policy(policy) != (param->sched_priority != 0))
3682 != (param->sched_priority == 0))
3683 return -EINVAL; 4110 return -EINVAL;
3684 4111
3685 /* 4112 /*
3686 * Allow unprivileged RT tasks to decrease priority: 4113 * Allow unprivileged RT tasks to decrease priority:
3687 */ 4114 */
3688 if (!capable(CAP_SYS_NICE)) { 4115 if (!capable(CAP_SYS_NICE)) {
3689 /* 4116 if (is_rt_policy(policy)) {
3690 * can't change policy, except between SCHED_NORMAL 4117 unsigned long rlim_rtprio;
3691 * and SCHED_BATCH: 4118 unsigned long flags;
3692 */ 4119
3693 if (((policy != SCHED_NORMAL && p->policy != SCHED_BATCH) && 4120 if (!lock_task_sighand(p, &flags))
3694 (policy != SCHED_BATCH && p->policy != SCHED_NORMAL)) && 4121 return -ESRCH;
3695 !p->signal->rlim[RLIMIT_RTPRIO].rlim_cur) 4122 rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur;
3696 return -EPERM; 4123 unlock_task_sighand(p, &flags);
3697 /* can't increase priority */ 4124
3698 if ((policy != SCHED_NORMAL && policy != SCHED_BATCH) && 4125 /* can't set/change the rt policy */
3699 param->sched_priority > p->rt_priority && 4126 if (policy != p->policy && !rlim_rtprio)
3700 param->sched_priority > 4127 return -EPERM;
3701 p->signal->rlim[RLIMIT_RTPRIO].rlim_cur) 4128
3702 return -EPERM; 4129 /* can't increase priority */
4130 if (param->sched_priority > p->rt_priority &&
4131 param->sched_priority > rlim_rtprio)
4132 return -EPERM;
4133 }
4134
3703 /* can't change other user's priorities */ 4135 /* can't change other user's priorities */
3704 if ((current->euid != p->euid) && 4136 if ((current->euid != p->euid) &&
3705 (current->euid != p->uid)) 4137 (current->euid != p->uid))
@@ -3710,14 +4142,20 @@ recheck:
3710 if (retval) 4142 if (retval)
3711 return retval; 4143 return retval;
3712 /* 4144 /*
4145 * make sure no PI-waiters arrive (or leave) while we are
4146 * changing the priority of the task:
4147 */
4148 spin_lock_irqsave(&p->pi_lock, flags);
4149 /*
 3713 * To be able to change p->policy safely, the appropriate 4150 * To be able to change p->policy safely, the appropriate
3714 * runqueue lock must be held. 4151 * runqueue lock must be held.
3715 */ 4152 */
3716 rq = task_rq_lock(p, &flags); 4153 rq = __task_rq_lock(p);
3717 /* recheck policy now with rq lock held */ 4154 /* recheck policy now with rq lock held */
3718 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { 4155 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
3719 policy = oldpolicy = -1; 4156 policy = oldpolicy = -1;
3720 task_rq_unlock(rq, &flags); 4157 __task_rq_unlock(rq);
4158 spin_unlock_irqrestore(&p->pi_lock, flags);
3721 goto recheck; 4159 goto recheck;
3722 } 4160 }
3723 array = p->array; 4161 array = p->array;
@@ -3738,7 +4176,11 @@ recheck:
3738 } else if (TASK_PREEMPTS_CURR(p, rq)) 4176 } else if (TASK_PREEMPTS_CURR(p, rq))
3739 resched_task(rq->curr); 4177 resched_task(rq->curr);
3740 } 4178 }
3741 task_rq_unlock(rq, &flags); 4179 __task_rq_unlock(rq);
4180 spin_unlock_irqrestore(&p->pi_lock, flags);
4181
4182 rt_mutex_adjust_pi(p);
4183
3742 return 0; 4184 return 0;
3743} 4185}
3744EXPORT_SYMBOL_GPL(sched_setscheduler); 4186EXPORT_SYMBOL_GPL(sched_setscheduler);
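
The reworked permission check above keys off RLIMIT_RTPRIO and is_rt_policy() rather than the old policy-pair comparison. From userspace the same path is reached through the sched_setscheduler() syscall; a minimal caller (the priority value 10 is arbitrary):

	#include <sched.h>
	#include <stdio.h>
	#include <string.h>
	#include <errno.h>

	int main(void)
	{
		struct sched_param param;

		memset(&param, 0, sizeof(param));
		param.sched_priority = 10;	/* must be non-zero for an RT policy */

		/* Lands in the kernel-side sched_setscheduler() shown above;
		 * without CAP_SYS_NICE this fails with EPERM unless
		 * RLIMIT_RTPRIO permits the requested priority. */
		if (sched_setscheduler(0, SCHED_FIFO, &param) == -1)
			fprintf(stderr, "sched_setscheduler: %s\n", strerror(errno));
		else
			puts("now running SCHED_FIFO");
		return 0;
	}
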
@@ -3746,22 +4188,22 @@ EXPORT_SYMBOL_GPL(sched_setscheduler);
3746static int 4188static int
3747do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) 4189do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
3748{ 4190{
3749 int retval;
3750 struct sched_param lparam; 4191 struct sched_param lparam;
3751 struct task_struct *p; 4192 struct task_struct *p;
4193 int retval;
3752 4194
3753 if (!param || pid < 0) 4195 if (!param || pid < 0)
3754 return -EINVAL; 4196 return -EINVAL;
3755 if (copy_from_user(&lparam, param, sizeof(struct sched_param))) 4197 if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
3756 return -EFAULT; 4198 return -EFAULT;
3757 read_lock_irq(&tasklist_lock); 4199
4200 rcu_read_lock();
4201 retval = -ESRCH;
3758 p = find_process_by_pid(pid); 4202 p = find_process_by_pid(pid);
3759 if (!p) { 4203 if (p != NULL)
3760 read_unlock_irq(&tasklist_lock); 4204 retval = sched_setscheduler(p, policy, &lparam);
3761 return -ESRCH; 4205 rcu_read_unlock();
3762 } 4206
3763 retval = sched_setscheduler(p, policy, &lparam);
3764 read_unlock_irq(&tasklist_lock);
3765 return retval; 4207 return retval;
3766} 4208}
3767 4209
@@ -3797,8 +4239,8 @@ asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param)
3797 */ 4239 */
3798asmlinkage long sys_sched_getscheduler(pid_t pid) 4240asmlinkage long sys_sched_getscheduler(pid_t pid)
3799{ 4241{
4242 struct task_struct *p;
3800 int retval = -EINVAL; 4243 int retval = -EINVAL;
3801 task_t *p;
3802 4244
3803 if (pid < 0) 4245 if (pid < 0)
3804 goto out_nounlock; 4246 goto out_nounlock;
@@ -3825,8 +4267,8 @@ out_nounlock:
3825asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param) 4267asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param)
3826{ 4268{
3827 struct sched_param lp; 4269 struct sched_param lp;
4270 struct task_struct *p;
3828 int retval = -EINVAL; 4271 int retval = -EINVAL;
3829 task_t *p;
3830 4272
3831 if (!param || pid < 0) 4273 if (!param || pid < 0)
3832 goto out_nounlock; 4274 goto out_nounlock;
@@ -3859,9 +4301,9 @@ out_unlock:
3859 4301
3860long sched_setaffinity(pid_t pid, cpumask_t new_mask) 4302long sched_setaffinity(pid_t pid, cpumask_t new_mask)
3861{ 4303{
3862 task_t *p;
3863 int retval;
3864 cpumask_t cpus_allowed; 4304 cpumask_t cpus_allowed;
4305 struct task_struct *p;
4306 int retval;
3865 4307
3866 lock_cpu_hotplug(); 4308 lock_cpu_hotplug();
3867 read_lock(&tasklist_lock); 4309 read_lock(&tasklist_lock);
@@ -3947,8 +4389,8 @@ cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL;
3947 4389
3948long sched_getaffinity(pid_t pid, cpumask_t *mask) 4390long sched_getaffinity(pid_t pid, cpumask_t *mask)
3949{ 4391{
4392 struct task_struct *p;
3950 int retval; 4393 int retval;
3951 task_t *p;
3952 4394
3953 lock_cpu_hotplug(); 4395 lock_cpu_hotplug();
3954 read_lock(&tasklist_lock); 4396 read_lock(&tasklist_lock);
@@ -4007,9 +4449,8 @@ asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len,
4007 */ 4449 */
4008asmlinkage long sys_sched_yield(void) 4450asmlinkage long sys_sched_yield(void)
4009{ 4451{
4010 runqueue_t *rq = this_rq_lock(); 4452 struct rq *rq = this_rq_lock();
4011 prio_array_t *array = current->array; 4453 struct prio_array *array = current->array, *target = rq->expired;
4012 prio_array_t *target = rq->expired;
4013 4454
4014 schedstat_inc(rq, yld_cnt); 4455 schedstat_inc(rq, yld_cnt);
4015 /* 4456 /*
@@ -4043,6 +4484,7 @@ asmlinkage long sys_sched_yield(void)
4043 * no need to preempt or enable interrupts: 4484 * no need to preempt or enable interrupts:
4044 */ 4485 */
4045 __release(rq->lock); 4486 __release(rq->lock);
4487 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
4046 _raw_spin_unlock(&rq->lock); 4488 _raw_spin_unlock(&rq->lock);
4047 preempt_enable_no_resched(); 4489 preempt_enable_no_resched();
4048 4490
@@ -4051,7 +4493,16 @@ asmlinkage long sys_sched_yield(void)
4051 return 0; 4493 return 0;
4052} 4494}
4053 4495
4054static inline void __cond_resched(void) 4496static inline int __resched_legal(int expected_preempt_count)
4497{
4498 if (unlikely(preempt_count() != expected_preempt_count))
4499 return 0;
4500 if (unlikely(system_state != SYSTEM_RUNNING))
4501 return 0;
4502 return 1;
4503}
4504
4505static void __cond_resched(void)
4055{ 4506{
4056#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP 4507#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
4057 __might_sleep(__FILE__, __LINE__); 4508 __might_sleep(__FILE__, __LINE__);
@@ -4061,10 +4512,6 @@ static inline void __cond_resched(void)
4061 * PREEMPT_ACTIVE, which could trigger a second 4512 * PREEMPT_ACTIVE, which could trigger a second
4062 * cond_resched() call. 4513 * cond_resched() call.
4063 */ 4514 */
4064 if (unlikely(preempt_count()))
4065 return;
4066 if (unlikely(system_state != SYSTEM_RUNNING))
4067 return;
4068 do { 4515 do {
4069 add_preempt_count(PREEMPT_ACTIVE); 4516 add_preempt_count(PREEMPT_ACTIVE);
4070 schedule(); 4517 schedule();
@@ -4074,13 +4521,12 @@ static inline void __cond_resched(void)
4074 4521
4075int __sched cond_resched(void) 4522int __sched cond_resched(void)
4076{ 4523{
4077 if (need_resched()) { 4524 if (need_resched() && __resched_legal(0)) {
4078 __cond_resched(); 4525 __cond_resched();
4079 return 1; 4526 return 1;
4080 } 4527 }
4081 return 0; 4528 return 0;
4082} 4529}
4083
4084EXPORT_SYMBOL(cond_resched); 4530EXPORT_SYMBOL(cond_resched);
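
cond_resched() and its variants now share the __resched_legal() guard instead of open-coding the preempt-count and system-state tests. Restated as a standalone function with both kernel inputs stubbed out (the fake_* variables are placeholders, not the real preempt_count() or system_state):

	#include <stdio.h>

	/* Stand-ins for kernel state, for illustration only. */
	static int fake_preempt_count;
	static enum { FAKE_BOOTING, FAKE_RUNNING } fake_system_state = FAKE_RUNNING;

	/* Rescheduling is "legal" only when we hold exactly the expected
	 * number of preemption disables and the system is fully up -- the
	 * two checks __resched_legal() makes before __cond_resched(). */
	static int resched_legal(int expected_preempt_count)
	{
		if (fake_preempt_count != expected_preempt_count)
			return 0;
		if (fake_system_state != FAKE_RUNNING)
			return 0;
		return 1;
	}

	int main(void)
	{
		printf("legal at count 0: %d\n", resched_legal(0));
		fake_preempt_count = 1;		/* e.g. inside cond_resched_lock() */
		printf("legal at count 1: %d\n", resched_legal(1));
		return 0;
	}
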
4085 4531
4086/* 4532/*
@@ -4101,7 +4547,8 @@ int cond_resched_lock(spinlock_t *lock)
4101 ret = 1; 4547 ret = 1;
4102 spin_lock(lock); 4548 spin_lock(lock);
4103 } 4549 }
4104 if (need_resched()) { 4550 if (need_resched() && __resched_legal(1)) {
4551 spin_release(&lock->dep_map, 1, _THIS_IP_);
4105 _raw_spin_unlock(lock); 4552 _raw_spin_unlock(lock);
4106 preempt_enable_no_resched(); 4553 preempt_enable_no_resched();
4107 __cond_resched(); 4554 __cond_resched();
@@ -4110,25 +4557,24 @@ int cond_resched_lock(spinlock_t *lock)
4110 } 4557 }
4111 return ret; 4558 return ret;
4112} 4559}
4113
4114EXPORT_SYMBOL(cond_resched_lock); 4560EXPORT_SYMBOL(cond_resched_lock);
4115 4561
4116int __sched cond_resched_softirq(void) 4562int __sched cond_resched_softirq(void)
4117{ 4563{
4118 BUG_ON(!in_softirq()); 4564 BUG_ON(!in_softirq());
4119 4565
4120 if (need_resched()) { 4566 if (need_resched() && __resched_legal(0)) {
4121 __local_bh_enable(); 4567 raw_local_irq_disable();
4568 _local_bh_enable();
4569 raw_local_irq_enable();
4122 __cond_resched(); 4570 __cond_resched();
4123 local_bh_disable(); 4571 local_bh_disable();
4124 return 1; 4572 return 1;
4125 } 4573 }
4126 return 0; 4574 return 0;
4127} 4575}
4128
4129EXPORT_SYMBOL(cond_resched_softirq); 4576EXPORT_SYMBOL(cond_resched_softirq);
4130 4577
4131
4132/** 4578/**
4133 * yield - yield the current processor to other threads. 4579 * yield - yield the current processor to other threads.
4134 * 4580 *
@@ -4140,7 +4586,6 @@ void __sched yield(void)
4140 set_current_state(TASK_RUNNING); 4586 set_current_state(TASK_RUNNING);
4141 sys_sched_yield(); 4587 sys_sched_yield();
4142} 4588}
4143
4144EXPORT_SYMBOL(yield); 4589EXPORT_SYMBOL(yield);
4145 4590
4146/* 4591/*
@@ -4152,23 +4597,26 @@ EXPORT_SYMBOL(yield);
4152 */ 4597 */
4153void __sched io_schedule(void) 4598void __sched io_schedule(void)
4154{ 4599{
4155 struct runqueue *rq = &per_cpu(runqueues, raw_smp_processor_id()); 4600 struct rq *rq = &__raw_get_cpu_var(runqueues);
4156 4601
4602 delayacct_blkio_start();
4157 atomic_inc(&rq->nr_iowait); 4603 atomic_inc(&rq->nr_iowait);
4158 schedule(); 4604 schedule();
4159 atomic_dec(&rq->nr_iowait); 4605 atomic_dec(&rq->nr_iowait);
4606 delayacct_blkio_end();
4160} 4607}
4161
4162EXPORT_SYMBOL(io_schedule); 4608EXPORT_SYMBOL(io_schedule);
4163 4609
4164long __sched io_schedule_timeout(long timeout) 4610long __sched io_schedule_timeout(long timeout)
4165{ 4611{
4166 struct runqueue *rq = &per_cpu(runqueues, raw_smp_processor_id()); 4612 struct rq *rq = &__raw_get_cpu_var(runqueues);
4167 long ret; 4613 long ret;
4168 4614
4615 delayacct_blkio_start();
4169 atomic_inc(&rq->nr_iowait); 4616 atomic_inc(&rq->nr_iowait);
4170 ret = schedule_timeout(timeout); 4617 ret = schedule_timeout(timeout);
4171 atomic_dec(&rq->nr_iowait); 4618 atomic_dec(&rq->nr_iowait);
4619 delayacct_blkio_end();
4172 return ret; 4620 return ret;
4173} 4621}
4174 4622
@@ -4230,9 +4678,9 @@ asmlinkage long sys_sched_get_priority_min(int policy)
4230asmlinkage 4678asmlinkage
4231long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval) 4679long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval)
4232{ 4680{
4681 struct task_struct *p;
4233 int retval = -EINVAL; 4682 int retval = -EINVAL;
4234 struct timespec t; 4683 struct timespec t;
4235 task_t *p;
4236 4684
4237 if (pid < 0) 4685 if (pid < 0)
4238 goto out_nounlock; 4686 goto out_nounlock;
@@ -4247,7 +4695,7 @@ long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval)
4247 if (retval) 4695 if (retval)
4248 goto out_unlock; 4696 goto out_unlock;
4249 4697
4250 jiffies_to_timespec(p->policy & SCHED_FIFO ? 4698 jiffies_to_timespec(p->policy == SCHED_FIFO ?
4251 0 : task_timeslice(p), &t); 4699 0 : task_timeslice(p), &t);
4252 read_unlock(&tasklist_lock); 4700 read_unlock(&tasklist_lock);
4253 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; 4701 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
@@ -4260,35 +4708,36 @@ out_unlock:
4260 4708
4261static inline struct task_struct *eldest_child(struct task_struct *p) 4709static inline struct task_struct *eldest_child(struct task_struct *p)
4262{ 4710{
4263 if (list_empty(&p->children)) return NULL; 4711 if (list_empty(&p->children))
4712 return NULL;
4264 return list_entry(p->children.next,struct task_struct,sibling); 4713 return list_entry(p->children.next,struct task_struct,sibling);
4265} 4714}
4266 4715
4267static inline struct task_struct *older_sibling(struct task_struct *p) 4716static inline struct task_struct *older_sibling(struct task_struct *p)
4268{ 4717{
4269 if (p->sibling.prev==&p->parent->children) return NULL; 4718 if (p->sibling.prev==&p->parent->children)
4719 return NULL;
4270 return list_entry(p->sibling.prev,struct task_struct,sibling); 4720 return list_entry(p->sibling.prev,struct task_struct,sibling);
4271} 4721}
4272 4722
4273static inline struct task_struct *younger_sibling(struct task_struct *p) 4723static inline struct task_struct *younger_sibling(struct task_struct *p)
4274{ 4724{
4275 if (p->sibling.next==&p->parent->children) return NULL; 4725 if (p->sibling.next==&p->parent->children)
4726 return NULL;
4276 return list_entry(p->sibling.next,struct task_struct,sibling); 4727 return list_entry(p->sibling.next,struct task_struct,sibling);
4277} 4728}
4278 4729
4279static void show_task(task_t *p) 4730static const char stat_nam[] = "RSDTtZX";
4731
4732static void show_task(struct task_struct *p)
4280{ 4733{
4281 task_t *relative; 4734 struct task_struct *relative;
4282 unsigned state;
4283 unsigned long free = 0; 4735 unsigned long free = 0;
4284 static const char *stat_nam[] = { "R", "S", "D", "T", "t", "Z", "X" }; 4736 unsigned state;
4285 4737
4286 printk("%-13.13s ", p->comm);
4287 state = p->state ? __ffs(p->state) + 1 : 0; 4738 state = p->state ? __ffs(p->state) + 1 : 0;
4288 if (state < ARRAY_SIZE(stat_nam)) 4739 printk("%-13.13s %c", p->comm,
4289 printk(stat_nam[state]); 4740 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
4290 else
4291 printk("?");
4292#if (BITS_PER_LONG == 32) 4741#if (BITS_PER_LONG == 32)
4293 if (state == TASK_RUNNING) 4742 if (state == TASK_RUNNING)
4294 printk(" running "); 4743 printk(" running ");
@@ -4332,7 +4781,7 @@ static void show_task(task_t *p)
4332 4781
4333void show_state(void) 4782void show_state(void)
4334{ 4783{
4335 task_t *g, *p; 4784 struct task_struct *g, *p;
4336 4785
4337#if (BITS_PER_LONG == 32) 4786#if (BITS_PER_LONG == 32)
4338 printk("\n" 4787 printk("\n"
@@ -4354,7 +4803,7 @@ void show_state(void)
4354 } while_each_thread(g, p); 4803 } while_each_thread(g, p);
4355 4804
4356 read_unlock(&tasklist_lock); 4805 read_unlock(&tasklist_lock);
4357 mutex_debug_show_all_locks(); 4806 debug_show_all_locks();
4358} 4807}
4359 4808
4360/** 4809/**
@@ -4365,15 +4814,15 @@ void show_state(void)
4365 * NOTE: this function does not set the idle thread's NEED_RESCHED 4814 * NOTE: this function does not set the idle thread's NEED_RESCHED
4366 * flag, to make booting more robust. 4815 * flag, to make booting more robust.
4367 */ 4816 */
4368void __devinit init_idle(task_t *idle, int cpu) 4817void __devinit init_idle(struct task_struct *idle, int cpu)
4369{ 4818{
4370 runqueue_t *rq = cpu_rq(cpu); 4819 struct rq *rq = cpu_rq(cpu);
4371 unsigned long flags; 4820 unsigned long flags;
4372 4821
4373 idle->timestamp = sched_clock(); 4822 idle->timestamp = sched_clock();
4374 idle->sleep_avg = 0; 4823 idle->sleep_avg = 0;
4375 idle->array = NULL; 4824 idle->array = NULL;
4376 idle->prio = MAX_PRIO; 4825 idle->prio = idle->normal_prio = MAX_PRIO;
4377 idle->state = TASK_RUNNING; 4826 idle->state = TASK_RUNNING;
4378 idle->cpus_allowed = cpumask_of_cpu(cpu); 4827 idle->cpus_allowed = cpumask_of_cpu(cpu);
4379 set_task_cpu(idle, cpu); 4828 set_task_cpu(idle, cpu);
@@ -4406,7 +4855,7 @@ cpumask_t nohz_cpu_mask = CPU_MASK_NONE;
4406/* 4855/*
4407 * This is how migration works: 4856 * This is how migration works:
4408 * 4857 *
4409 * 1) we queue a migration_req_t structure in the source CPU's 4858 * 1) we queue a struct migration_req structure in the source CPU's
4410 * runqueue and wake up that CPU's migration thread. 4859 * runqueue and wake up that CPU's migration thread.
4411 * 2) we down() the locked semaphore => thread blocks. 4860 * 2) we down() the locked semaphore => thread blocks.
4412 * 3) migration thread wakes up (implicitly it forces the migrated 4861 * 3) migration thread wakes up (implicitly it forces the migrated
@@ -4428,12 +4877,12 @@ cpumask_t nohz_cpu_mask = CPU_MASK_NONE;
4428 * task must not exit() & deallocate itself prematurely. The 4877 * task must not exit() & deallocate itself prematurely. The
4429 * call is not atomic; no spinlocks may be held. 4878 * call is not atomic; no spinlocks may be held.
4430 */ 4879 */
4431int set_cpus_allowed(task_t *p, cpumask_t new_mask) 4880int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask)
4432{ 4881{
4882 struct migration_req req;
4433 unsigned long flags; 4883 unsigned long flags;
4884 struct rq *rq;
4434 int ret = 0; 4885 int ret = 0;
4435 migration_req_t req;
4436 runqueue_t *rq;
4437 4886
4438 rq = task_rq_lock(p, &flags); 4887 rq = task_rq_lock(p, &flags);
4439 if (!cpus_intersects(new_mask, cpu_online_map)) { 4888 if (!cpus_intersects(new_mask, cpu_online_map)) {
@@ -4456,9 +4905,9 @@ int set_cpus_allowed(task_t *p, cpumask_t new_mask)
4456 } 4905 }
4457out: 4906out:
4458 task_rq_unlock(rq, &flags); 4907 task_rq_unlock(rq, &flags);
4908
4459 return ret; 4909 return ret;
4460} 4910}
4461
4462EXPORT_SYMBOL_GPL(set_cpus_allowed); 4911EXPORT_SYMBOL_GPL(set_cpus_allowed);
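
set_cpus_allowed() is the kernel end of CPU affinity and, per the "This is how migration works" comment above, queues a struct migration_req when the task has to move. A minimal userspace trigger via sched_setaffinity(), pinning the calling process to CPU 0 (requires the glibc CPU_* macros, hence _GNU_SOURCE):

	#define _GNU_SOURCE
	#include <sched.h>
	#include <stdio.h>

	int main(void)
	{
		cpu_set_t mask;

		CPU_ZERO(&mask);
		CPU_SET(0, &mask);		/* allow CPU 0 only */

		/* Ends up in the kernel's set_cpus_allowed(); if the task
		 * currently runs elsewhere, a migration_req is queued and the
		 * per-CPU migration thread moves it, as described above. */
		if (sched_setaffinity(0, sizeof(mask), &mask) == -1) {
			perror("sched_setaffinity");
			return 1;
		}
		puts("pinned to CPU 0");
		return 0;
	}
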
4463 4912
4464/* 4913/*
@@ -4469,13 +4918,16 @@ EXPORT_SYMBOL_GPL(set_cpus_allowed);
4469 * 4918 *
4470 * So we race with normal scheduler movements, but that's OK, as long 4919 * So we race with normal scheduler movements, but that's OK, as long
4471 * as the task is no longer on this CPU. 4920 * as the task is no longer on this CPU.
4921 *
4922 * Returns non-zero if task was successfully migrated.
4472 */ 4923 */
4473static void __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) 4924static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
4474{ 4925{
4475 runqueue_t *rq_dest, *rq_src; 4926 struct rq *rq_dest, *rq_src;
4927 int ret = 0;
4476 4928
4477 if (unlikely(cpu_is_offline(dest_cpu))) 4929 if (unlikely(cpu_is_offline(dest_cpu)))
4478 return; 4930 return ret;
4479 4931
4480 rq_src = cpu_rq(src_cpu); 4932 rq_src = cpu_rq(src_cpu);
4481 rq_dest = cpu_rq(dest_cpu); 4933 rq_dest = cpu_rq(dest_cpu);
@@ -4499,13 +4951,14 @@ static void __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
4499 p->timestamp = p->timestamp - rq_src->timestamp_last_tick 4951 p->timestamp = p->timestamp - rq_src->timestamp_last_tick
4500 + rq_dest->timestamp_last_tick; 4952 + rq_dest->timestamp_last_tick;
4501 deactivate_task(p, rq_src); 4953 deactivate_task(p, rq_src);
4502 activate_task(p, rq_dest, 0); 4954 __activate_task(p, rq_dest);
4503 if (TASK_PREEMPTS_CURR(p, rq_dest)) 4955 if (TASK_PREEMPTS_CURR(p, rq_dest))
4504 resched_task(rq_dest->curr); 4956 resched_task(rq_dest->curr);
4505 } 4957 }
4506 4958 ret = 1;
4507out: 4959out:
4508 double_rq_unlock(rq_src, rq_dest); 4960 double_rq_unlock(rq_src, rq_dest);
4961 return ret;
4509} 4962}
4510 4963
4511/* 4964/*
@@ -4515,16 +4968,16 @@ out:
4515 */ 4968 */
4516static int migration_thread(void *data) 4969static int migration_thread(void *data)
4517{ 4970{
4518 runqueue_t *rq;
4519 int cpu = (long)data; 4971 int cpu = (long)data;
4972 struct rq *rq;
4520 4973
4521 rq = cpu_rq(cpu); 4974 rq = cpu_rq(cpu);
4522 BUG_ON(rq->migration_thread != current); 4975 BUG_ON(rq->migration_thread != current);
4523 4976
4524 set_current_state(TASK_INTERRUPTIBLE); 4977 set_current_state(TASK_INTERRUPTIBLE);
4525 while (!kthread_should_stop()) { 4978 while (!kthread_should_stop()) {
4979 struct migration_req *req;
4526 struct list_head *head; 4980 struct list_head *head;
4527 migration_req_t *req;
4528 4981
4529 try_to_freeze(); 4982 try_to_freeze();
4530 4983
@@ -4548,7 +5001,7 @@ static int migration_thread(void *data)
4548 set_current_state(TASK_INTERRUPTIBLE); 5001 set_current_state(TASK_INTERRUPTIBLE);
4549 continue; 5002 continue;
4550 } 5003 }
4551 req = list_entry(head->next, migration_req_t, list); 5004 req = list_entry(head->next, struct migration_req, list);
4552 list_del_init(head->next); 5005 list_del_init(head->next);
4553 5006
4554 spin_unlock(&rq->lock); 5007 spin_unlock(&rq->lock);
@@ -4573,36 +5026,42 @@ wait_to_die:
4573 5026
4574#ifdef CONFIG_HOTPLUG_CPU 5027#ifdef CONFIG_HOTPLUG_CPU
4575/* Figure out where task on dead CPU should go, use force if necessary. */ 5028/* Figure out where task on dead CPU should go, use force if necessary. */
4576static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *tsk) 5029static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
4577{ 5030{
4578 int dest_cpu; 5031 unsigned long flags;
4579 cpumask_t mask; 5032 cpumask_t mask;
5033 struct rq *rq;
5034 int dest_cpu;
4580 5035
5036restart:
4581 /* On same node? */ 5037 /* On same node? */
4582 mask = node_to_cpumask(cpu_to_node(dead_cpu)); 5038 mask = node_to_cpumask(cpu_to_node(dead_cpu));
4583 cpus_and(mask, mask, tsk->cpus_allowed); 5039 cpus_and(mask, mask, p->cpus_allowed);
4584 dest_cpu = any_online_cpu(mask); 5040 dest_cpu = any_online_cpu(mask);
4585 5041
4586 /* On any allowed CPU? */ 5042 /* On any allowed CPU? */
4587 if (dest_cpu == NR_CPUS) 5043 if (dest_cpu == NR_CPUS)
4588 dest_cpu = any_online_cpu(tsk->cpus_allowed); 5044 dest_cpu = any_online_cpu(p->cpus_allowed);
4589 5045
4590 /* No more Mr. Nice Guy. */ 5046 /* No more Mr. Nice Guy. */
4591 if (dest_cpu == NR_CPUS) { 5047 if (dest_cpu == NR_CPUS) {
4592 cpus_setall(tsk->cpus_allowed); 5048 rq = task_rq_lock(p, &flags);
4593 dest_cpu = any_online_cpu(tsk->cpus_allowed); 5049 cpus_setall(p->cpus_allowed);
5050 dest_cpu = any_online_cpu(p->cpus_allowed);
5051 task_rq_unlock(rq, &flags);
4594 5052
4595 /* 5053 /*
4596 * Don't tell them about moving exiting tasks or 5054 * Don't tell them about moving exiting tasks or
4597 * kernel threads (both mm NULL), since they never 5055 * kernel threads (both mm NULL), since they never
4598 * leave kernel. 5056 * leave kernel.
4599 */ 5057 */
4600 if (tsk->mm && printk_ratelimit()) 5058 if (p->mm && printk_ratelimit())
4601 printk(KERN_INFO "process %d (%s) no " 5059 printk(KERN_INFO "process %d (%s) no "
4602 "longer affine to cpu%d\n", 5060 "longer affine to cpu%d\n",
4603 tsk->pid, tsk->comm, dead_cpu); 5061 p->pid, p->comm, dead_cpu);
4604 } 5062 }
4605 __migrate_task(tsk, dead_cpu, dest_cpu); 5063 if (!__migrate_task(p, dead_cpu, dest_cpu))
5064 goto restart;
4606} 5065}
4607 5066
4608/* 5067/*
@@ -4612,9 +5071,9 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *tsk)
4612 * their home CPUs. So we just add the counter to another CPU's counter, 5071 * their home CPUs. So we just add the counter to another CPU's counter,
4613 * to keep the global sum constant after CPU-down: 5072 * to keep the global sum constant after CPU-down:
4614 */ 5073 */
4615static void migrate_nr_uninterruptible(runqueue_t *rq_src) 5074static void migrate_nr_uninterruptible(struct rq *rq_src)
4616{ 5075{
4617 runqueue_t *rq_dest = cpu_rq(any_online_cpu(CPU_MASK_ALL)); 5076 struct rq *rq_dest = cpu_rq(any_online_cpu(CPU_MASK_ALL));
4618 unsigned long flags; 5077 unsigned long flags;
4619 5078
4620 local_irq_save(flags); 5079 local_irq_save(flags);
@@ -4628,48 +5087,51 @@ static void migrate_nr_uninterruptible(runqueue_t *rq_src)
4628/* Run through task list and migrate tasks from the dead cpu. */ 5087/* Run through task list and migrate tasks from the dead cpu. */
4629static void migrate_live_tasks(int src_cpu) 5088static void migrate_live_tasks(int src_cpu)
4630{ 5089{
4631 struct task_struct *tsk, *t; 5090 struct task_struct *p, *t;
4632 5091
4633 write_lock_irq(&tasklist_lock); 5092 write_lock_irq(&tasklist_lock);
4634 5093
4635 do_each_thread(t, tsk) { 5094 do_each_thread(t, p) {
4636 if (tsk == current) 5095 if (p == current)
4637 continue; 5096 continue;
4638 5097
4639 if (task_cpu(tsk) == src_cpu) 5098 if (task_cpu(p) == src_cpu)
4640 move_task_off_dead_cpu(src_cpu, tsk); 5099 move_task_off_dead_cpu(src_cpu, p);
4641 } while_each_thread(t, tsk); 5100 } while_each_thread(t, p);
4642 5101
4643 write_unlock_irq(&tasklist_lock); 5102 write_unlock_irq(&tasklist_lock);
4644} 5103}
4645 5104
4646/* Schedules idle task to be the next runnable task on current CPU. 5105/* Schedules idle task to be the next runnable task on current CPU.
4647 * It does so by boosting its priority to highest possible and adding it to 5106 * It does so by boosting its priority to highest possible and adding it to
4648 * the _front_ of runqueue. Used by CPU offline code. 5107 * the _front_ of the runqueue. Used by CPU offline code.
4649 */ 5108 */
4650void sched_idle_next(void) 5109void sched_idle_next(void)
4651{ 5110{
4652 int cpu = smp_processor_id(); 5111 int this_cpu = smp_processor_id();
4653 runqueue_t *rq = this_rq(); 5112 struct rq *rq = cpu_rq(this_cpu);
4654 struct task_struct *p = rq->idle; 5113 struct task_struct *p = rq->idle;
4655 unsigned long flags; 5114 unsigned long flags;
4656 5115
4657 /* cpu has to be offline */ 5116 /* cpu has to be offline */
4658 BUG_ON(cpu_online(cpu)); 5117 BUG_ON(cpu_online(this_cpu));
4659 5118
4660 /* Strictly not necessary since rest of the CPUs are stopped by now 5119 /*
4661 * and interrupts disabled on current cpu. 5120 * Strictly not necessary since rest of the CPUs are stopped by now
5121 * and interrupts disabled on the current cpu.
4662 */ 5122 */
4663 spin_lock_irqsave(&rq->lock, flags); 5123 spin_lock_irqsave(&rq->lock, flags);
4664 5124
4665 __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1); 5125 __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1);
4666 /* Add idle task to _front_ of it's priority queue */ 5126
5127 /* Add idle task to the _front_ of its priority queue: */
4667 __activate_idle_task(p, rq); 5128 __activate_idle_task(p, rq);
4668 5129
4669 spin_unlock_irqrestore(&rq->lock, flags); 5130 spin_unlock_irqrestore(&rq->lock, flags);
4670} 5131}
4671 5132
4672/* Ensures that the idle task is using init_mm right before its cpu goes 5133/*
5134 * Ensures that the idle task is using init_mm right before its cpu goes
4673 * offline. 5135 * offline.
4674 */ 5136 */
4675void idle_task_exit(void) 5137void idle_task_exit(void)
@@ -4683,17 +5145,17 @@ void idle_task_exit(void)
4683 mmdrop(mm); 5145 mmdrop(mm);
4684} 5146}
4685 5147
4686static void migrate_dead(unsigned int dead_cpu, task_t *tsk) 5148static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
4687{ 5149{
4688 struct runqueue *rq = cpu_rq(dead_cpu); 5150 struct rq *rq = cpu_rq(dead_cpu);
4689 5151
4690 /* Must be exiting, otherwise would be on tasklist. */ 5152 /* Must be exiting, otherwise would be on tasklist. */
4691 BUG_ON(tsk->exit_state != EXIT_ZOMBIE && tsk->exit_state != EXIT_DEAD); 5153 BUG_ON(p->exit_state != EXIT_ZOMBIE && p->exit_state != EXIT_DEAD);
4692 5154
4693 /* Cannot have done final schedule yet: would have vanished. */ 5155 /* Cannot have done final schedule yet: would have vanished. */
4694 BUG_ON(tsk->flags & PF_DEAD); 5156 BUG_ON(p->state == TASK_DEAD);
4695 5157
4696 get_task_struct(tsk); 5158 get_task_struct(p);
4697 5159
4698 /* 5160 /*
4699 * Drop lock around migration; if someone else moves it, 5161 * Drop lock around migration; if someone else moves it,
@@ -4701,25 +5163,25 @@ static void migrate_dead(unsigned int dead_cpu, task_t *tsk)
4701 * fine. 5163 * fine.
4702 */ 5164 */
4703 spin_unlock_irq(&rq->lock); 5165 spin_unlock_irq(&rq->lock);
4704 move_task_off_dead_cpu(dead_cpu, tsk); 5166 move_task_off_dead_cpu(dead_cpu, p);
4705 spin_lock_irq(&rq->lock); 5167 spin_lock_irq(&rq->lock);
4706 5168
4707 put_task_struct(tsk); 5169 put_task_struct(p);
4708} 5170}
4709 5171
4710/* release_task() removes task from tasklist, so we won't find dead tasks. */ 5172/* release_task() removes task from tasklist, so we won't find dead tasks. */
4711static void migrate_dead_tasks(unsigned int dead_cpu) 5173static void migrate_dead_tasks(unsigned int dead_cpu)
4712{ 5174{
4713 unsigned arr, i; 5175 struct rq *rq = cpu_rq(dead_cpu);
4714 struct runqueue *rq = cpu_rq(dead_cpu); 5176 unsigned int arr, i;
4715 5177
4716 for (arr = 0; arr < 2; arr++) { 5178 for (arr = 0; arr < 2; arr++) {
4717 for (i = 0; i < MAX_PRIO; i++) { 5179 for (i = 0; i < MAX_PRIO; i++) {
4718 struct list_head *list = &rq->arrays[arr].queue[i]; 5180 struct list_head *list = &rq->arrays[arr].queue[i];
5181
4719 while (!list_empty(list)) 5182 while (!list_empty(list))
4720 migrate_dead(dead_cpu, 5183 migrate_dead(dead_cpu, list_entry(list->next,
4721 list_entry(list->next, task_t, 5184 struct task_struct, run_list));
4722 run_list));
4723 } 5185 }
4724 } 5186 }
4725} 5187}
@@ -4729,13 +5191,13 @@ static void migrate_dead_tasks(unsigned int dead_cpu)
4729 * migration_call - callback that gets triggered when a CPU is added. 5191 * migration_call - callback that gets triggered when a CPU is added.
4730 * Here we can start up the necessary migration thread for the new CPU. 5192 * Here we can start up the necessary migration thread for the new CPU.
4731 */ 5193 */
4732static int migration_call(struct notifier_block *nfb, unsigned long action, 5194static int __cpuinit
4733 void *hcpu) 5195migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
4734{ 5196{
4735 int cpu = (long)hcpu;
4736 struct task_struct *p; 5197 struct task_struct *p;
4737 struct runqueue *rq; 5198 int cpu = (long)hcpu;
4738 unsigned long flags; 5199 unsigned long flags;
5200 struct rq *rq;
4739 5201
4740 switch (action) { 5202 switch (action) {
4741 case CPU_UP_PREPARE: 5203 case CPU_UP_PREPARE:
@@ -4750,18 +5212,23 @@ static int migration_call(struct notifier_block *nfb, unsigned long action,
4750 task_rq_unlock(rq, &flags); 5212 task_rq_unlock(rq, &flags);
4751 cpu_rq(cpu)->migration_thread = p; 5213 cpu_rq(cpu)->migration_thread = p;
4752 break; 5214 break;
5215
4753 case CPU_ONLINE: 5216 case CPU_ONLINE:
 4754 /* Strictly unnecessary, as first user will wake it. */ 5217 /* Strictly unnecessary, as first user will wake it. */
4755 wake_up_process(cpu_rq(cpu)->migration_thread); 5218 wake_up_process(cpu_rq(cpu)->migration_thread);
4756 break; 5219 break;
5220
4757#ifdef CONFIG_HOTPLUG_CPU 5221#ifdef CONFIG_HOTPLUG_CPU
4758 case CPU_UP_CANCELED: 5222 case CPU_UP_CANCELED:
5223 if (!cpu_rq(cpu)->migration_thread)
5224 break;
4759 /* Unbind it from offline cpu so it can run. Fall thru. */ 5225 /* Unbind it from offline cpu so it can run. Fall thru. */
4760 kthread_bind(cpu_rq(cpu)->migration_thread, 5226 kthread_bind(cpu_rq(cpu)->migration_thread,
4761 any_online_cpu(cpu_online_map)); 5227 any_online_cpu(cpu_online_map));
4762 kthread_stop(cpu_rq(cpu)->migration_thread); 5228 kthread_stop(cpu_rq(cpu)->migration_thread);
4763 cpu_rq(cpu)->migration_thread = NULL; 5229 cpu_rq(cpu)->migration_thread = NULL;
4764 break; 5230 break;
5231
4765 case CPU_DEAD: 5232 case CPU_DEAD:
4766 migrate_live_tasks(cpu); 5233 migrate_live_tasks(cpu);
4767 rq = cpu_rq(cpu); 5234 rq = cpu_rq(cpu);
@@ -4782,9 +5249,10 @@ static int migration_call(struct notifier_block *nfb, unsigned long action,
4782 * the requestors. */ 5249 * the requestors. */
4783 spin_lock_irq(&rq->lock); 5250 spin_lock_irq(&rq->lock);
4784 while (!list_empty(&rq->migration_queue)) { 5251 while (!list_empty(&rq->migration_queue)) {
4785 migration_req_t *req; 5252 struct migration_req *req;
5253
4786 req = list_entry(rq->migration_queue.next, 5254 req = list_entry(rq->migration_queue.next,
4787 migration_req_t, list); 5255 struct migration_req, list);
4788 list_del_init(&req->list); 5256 list_del_init(&req->list);
4789 complete(&req->done); 5257 complete(&req->done);
4790 } 5258 }
@@ -4798,7 +5266,7 @@ static int migration_call(struct notifier_block *nfb, unsigned long action,
4798/* Register at highest priority so that task migration (migrate_all_tasks) 5266/* Register at highest priority so that task migration (migrate_all_tasks)
4799 * happens before everything else. 5267 * happens before everything else.
4800 */ 5268 */
4801static struct notifier_block migration_notifier = { 5269static struct notifier_block __cpuinitdata migration_notifier = {
4802 .notifier_call = migration_call, 5270 .notifier_call = migration_call,
4803 .priority = 10 5271 .priority = 10
4804}; 5272};
@@ -4806,10 +5274,14 @@ static struct notifier_block migration_notifier = {
4806int __init migration_init(void) 5274int __init migration_init(void)
4807{ 5275{
4808 void *cpu = (void *)(long)smp_processor_id(); 5276 void *cpu = (void *)(long)smp_processor_id();
4809 /* Start one for boot CPU. */ 5277 int err;
4810 migration_call(&migration_notifier, CPU_UP_PREPARE, cpu); 5278
5279 /* Start one for the boot CPU: */
5280 err = migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
5281 BUG_ON(err == NOTIFY_BAD);
4811 migration_call(&migration_notifier, CPU_ONLINE, cpu); 5282 migration_call(&migration_notifier, CPU_ONLINE, cpu);
4812 register_cpu_notifier(&migration_notifier); 5283 register_cpu_notifier(&migration_notifier);
5284
4813 return 0; 5285 return 0;
4814} 5286}
4815#endif 5287#endif
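
The hunk above makes migration_init() check the boot-CPU bootstrap call and refuse to continue if preparation failed. A minimal userspace sketch of that pattern, assuming nothing about the real notifier API -- notify_fn is an invented callback, and NOTIFY_OK/NOTIFY_BAD are redefined locally for the sketch rather than taken from the kernel headers:

#include <assert.h>
#include <stdio.h>

#define NOTIFY_OK   0
#define NOTIFY_BAD  1   /* local stand-ins, not the kernel's values */

/* Hypothetical callback loosely mirroring migration_call()'s first two steps. */
static int notify_fn(unsigned long action, void *data)
{
	long cpu = (long)data;

	switch (action) {
	case 0:	/* "UP_PREPARE": allocate per-CPU state; this step may fail */
		printf("prepare cpu %ld\n", cpu);
		return NOTIFY_OK;
	case 1:	/* "ONLINE": start using the state prepared above */
		printf("online cpu %ld\n", cpu);
		return NOTIFY_OK;
	}
	return NOTIFY_OK;
}

int main(void)
{
	void *boot_cpu = (void *)0L;

	/* Bootstrap the boot CPU by hand and refuse to continue on failure,
	 * analogous to the new BUG_ON(err == NOTIFY_BAD) in migration_init(). */
	int err = notify_fn(0, boot_cpu);
	assert(err != NOTIFY_BAD);
	notify_fn(1, boot_cpu);
	/* registering the callback for future hotplug events would follow here */
	return 0;
}

The point of asserting on the bootstrap result is that a failed preparation step for the boot CPU leaves nothing sane to fall back on, so continuing silently would only defer the crash.
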
@@ -4905,7 +5377,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
4905 } while (sd); 5377 } while (sd);
4906} 5378}
4907#else 5379#else
4908#define sched_domain_debug(sd, cpu) {} 5380# define sched_domain_debug(sd, cpu) do { } while (0)
4909#endif 5381#endif
4910 5382
4911static int sd_degenerate(struct sched_domain *sd) 5383static int sd_degenerate(struct sched_domain *sd)
@@ -4931,8 +5403,8 @@ static int sd_degenerate(struct sched_domain *sd)
4931 return 1; 5403 return 1;
4932} 5404}
4933 5405
4934static int sd_parent_degenerate(struct sched_domain *sd, 5406static int
4935 struct sched_domain *parent) 5407sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
4936{ 5408{
4937 unsigned long cflags = sd->flags, pflags = parent->flags; 5409 unsigned long cflags = sd->flags, pflags = parent->flags;
4938 5410
@@ -4965,7 +5437,7 @@ static int sd_parent_degenerate(struct sched_domain *sd,
4965 */ 5437 */
4966static void cpu_attach_domain(struct sched_domain *sd, int cpu) 5438static void cpu_attach_domain(struct sched_domain *sd, int cpu)
4967{ 5439{
4968 runqueue_t *rq = cpu_rq(cpu); 5440 struct rq *rq = cpu_rq(cpu);
4969 struct sched_domain *tmp; 5441 struct sched_domain *tmp;
4970 5442
4971 /* Remove the sched domains which do not contribute to scheduling. */ 5443 /* Remove the sched domains which do not contribute to scheduling. */
@@ -5227,8 +5699,8 @@ static void touch_cache(void *__cache, unsigned long __size)
5227/* 5699/*
5228 * Measure the cache-cost of one task migration. Returns in units of nsec. 5700 * Measure the cache-cost of one task migration. Returns in units of nsec.
5229 */ 5701 */
5230static unsigned long long measure_one(void *cache, unsigned long size, 5702static unsigned long long
5231 int source, int target) 5703measure_one(void *cache, unsigned long size, int source, int target)
5232{ 5704{
5233 cpumask_t mask, saved_mask; 5705 cpumask_t mask, saved_mask;
5234 unsigned long long t0, t1, t2, t3, cost; 5706 unsigned long long t0, t1, t2, t3, cost;
@@ -5380,7 +5852,7 @@ static unsigned long long measure_migration_cost(int cpu1, int cpu2)
5380 cache = vmalloc(max_size); 5852 cache = vmalloc(max_size);
5381 if (!cache) { 5853 if (!cache) {
5382 printk("could not vmalloc %d bytes for cache!\n", 2*max_size); 5854 printk("could not vmalloc %d bytes for cache!\n", 2*max_size);
5383 return 1000000; // return 1 msec on very small boxen 5855 return 1000000; /* return 1 msec on very small boxen */
5384 } 5856 }
5385 5857
5386 while (size <= max_size) { 5858 while (size <= max_size) {
@@ -5578,9 +6050,9 @@ static int find_next_best_node(int node, unsigned long *used_nodes)
5578 */ 6050 */
5579static cpumask_t sched_domain_node_span(int node) 6051static cpumask_t sched_domain_node_span(int node)
5580{ 6052{
5581 int i;
5582 cpumask_t span, nodemask;
5583 DECLARE_BITMAP(used_nodes, MAX_NUMNODES); 6053 DECLARE_BITMAP(used_nodes, MAX_NUMNODES);
6054 cpumask_t span, nodemask;
6055 int i;
5584 6056
5585 cpus_clear(span); 6057 cpus_clear(span);
5586 bitmap_zero(used_nodes, MAX_NUMNODES); 6058 bitmap_zero(used_nodes, MAX_NUMNODES);
@@ -5591,6 +6063,7 @@ static cpumask_t sched_domain_node_span(int node)
5591 6063
5592 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { 6064 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
5593 int next_node = find_next_best_node(node, used_nodes); 6065 int next_node = find_next_best_node(node, used_nodes);
6066
5594 nodemask = node_to_cpumask(next_node); 6067 nodemask = node_to_cpumask(next_node);
5595 cpus_or(span, span, nodemask); 6068 cpus_or(span, span, nodemask);
5596 } 6069 }
@@ -5599,22 +6072,27 @@ static cpumask_t sched_domain_node_span(int node)
5599} 6072}
5600#endif 6073#endif
5601 6074
6075int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
6076
5602/* 6077/*
5603 * At the moment, CONFIG_SCHED_SMT is never defined, but leave it in so we 6078 * SMT sched-domains:
5604 * can switch it on easily if needed.
5605 */ 6079 */
5606#ifdef CONFIG_SCHED_SMT 6080#ifdef CONFIG_SCHED_SMT
5607static DEFINE_PER_CPU(struct sched_domain, cpu_domains); 6081static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
5608static struct sched_group sched_group_cpus[NR_CPUS]; 6082static struct sched_group sched_group_cpus[NR_CPUS];
6083
5609static int cpu_to_cpu_group(int cpu) 6084static int cpu_to_cpu_group(int cpu)
5610{ 6085{
5611 return cpu; 6086 return cpu;
5612} 6087}
5613#endif 6088#endif
5614 6089
6090/*
6091 * multi-core sched-domains:
6092 */
5615#ifdef CONFIG_SCHED_MC 6093#ifdef CONFIG_SCHED_MC
5616static DEFINE_PER_CPU(struct sched_domain, core_domains); 6094static DEFINE_PER_CPU(struct sched_domain, core_domains);
5617static struct sched_group sched_group_core[NR_CPUS]; 6095static struct sched_group *sched_group_core_bycpu[NR_CPUS];
5618#endif 6096#endif
5619 6097
5620#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) 6098#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
@@ -5630,10 +6108,11 @@ static int cpu_to_core_group(int cpu)
5630#endif 6108#endif
5631 6109
5632static DEFINE_PER_CPU(struct sched_domain, phys_domains); 6110static DEFINE_PER_CPU(struct sched_domain, phys_domains);
5633static struct sched_group sched_group_phys[NR_CPUS]; 6111static struct sched_group *sched_group_phys_bycpu[NR_CPUS];
6112
5634static int cpu_to_phys_group(int cpu) 6113static int cpu_to_phys_group(int cpu)
5635{ 6114{
5636#if defined(CONFIG_SCHED_MC) 6115#ifdef CONFIG_SCHED_MC
5637 cpumask_t mask = cpu_coregroup_map(cpu); 6116 cpumask_t mask = cpu_coregroup_map(cpu);
5638 return first_cpu(mask); 6117 return first_cpu(mask);
5639#elif defined(CONFIG_SCHED_SMT) 6118#elif defined(CONFIG_SCHED_SMT)
@@ -5687,13 +6166,74 @@ next_sg:
5687} 6166}
5688#endif 6167#endif
5689 6168
6169/* Free memory allocated for various sched_group structures */
6170static void free_sched_groups(const cpumask_t *cpu_map)
6171{
6172 int cpu;
6173#ifdef CONFIG_NUMA
6174 int i;
6175
6176 for_each_cpu_mask(cpu, *cpu_map) {
6177 struct sched_group *sched_group_allnodes
6178 = sched_group_allnodes_bycpu[cpu];
6179 struct sched_group **sched_group_nodes
6180 = sched_group_nodes_bycpu[cpu];
6181
6182 if (sched_group_allnodes) {
6183 kfree(sched_group_allnodes);
6184 sched_group_allnodes_bycpu[cpu] = NULL;
6185 }
6186
6187 if (!sched_group_nodes)
6188 continue;
6189
6190 for (i = 0; i < MAX_NUMNODES; i++) {
6191 cpumask_t nodemask = node_to_cpumask(i);
6192 struct sched_group *oldsg, *sg = sched_group_nodes[i];
6193
6194 cpus_and(nodemask, nodemask, *cpu_map);
6195 if (cpus_empty(nodemask))
6196 continue;
6197
6198 if (sg == NULL)
6199 continue;
6200 sg = sg->next;
6201next_sg:
6202 oldsg = sg;
6203 sg = sg->next;
6204 kfree(oldsg);
6205 if (oldsg != sched_group_nodes[i])
6206 goto next_sg;
6207 }
6208 kfree(sched_group_nodes);
6209 sched_group_nodes_bycpu[cpu] = NULL;
6210 }
6211#endif
6212 for_each_cpu_mask(cpu, *cpu_map) {
6213 if (sched_group_phys_bycpu[cpu]) {
6214 kfree(sched_group_phys_bycpu[cpu]);
6215 sched_group_phys_bycpu[cpu] = NULL;
6216 }
6217#ifdef CONFIG_SCHED_MC
6218 if (sched_group_core_bycpu[cpu]) {
6219 kfree(sched_group_core_bycpu[cpu]);
6220 sched_group_core_bycpu[cpu] = NULL;
6221 }
6222#endif
6223 }
6224}
6225
5690/* 6226/*
5691 * Build sched domains for a given set of cpus and attach the sched domains 6227 * Build sched domains for a given set of cpus and attach the sched domains
5692 * to the individual cpus 6228 * to the individual cpus
5693 */ 6229 */
5694void build_sched_domains(const cpumask_t *cpu_map) 6230static int build_sched_domains(const cpumask_t *cpu_map)
5695{ 6231{
5696 int i; 6232 int i;
6233 struct sched_group *sched_group_phys = NULL;
6234#ifdef CONFIG_SCHED_MC
6235 struct sched_group *sched_group_core = NULL;
6236#endif
5697#ifdef CONFIG_NUMA 6237#ifdef CONFIG_NUMA
5698 struct sched_group **sched_group_nodes = NULL; 6238 struct sched_group **sched_group_nodes = NULL;
5699 struct sched_group *sched_group_allnodes = NULL; 6239 struct sched_group *sched_group_allnodes = NULL;
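
free_sched_groups() above tears down each per-node group ring with the oldsg/next_sg loop. A self-contained sketch of that circular-list teardown; struct node and the three-element ring are invented for the example:

#include <stdio.h>
#include <stdlib.h>

/* Invented stand-in for struct sched_group: a circular singly linked list. */
struct node {
	int id;
	struct node *next;
};

/* Free every element of a circular list, starting at head->next and
 * ending with head itself -- the same order as the next_sg loop above. */
static void free_ring(struct node *head)
{
	struct node *old, *cur = head->next;

	do {
		old = cur;
		cur = cur->next;	/* read the link before freeing old */
		printf("freeing node %d\n", old->id);
		free(old);
	} while (old != head);
}

int main(void)
{
	/* Build a three-element ring: 0 -> 1 -> 2 -> back to 0. */
	struct node *head = malloc(sizeof(*head));
	struct node *b = malloc(sizeof(*b));
	struct node *c = malloc(sizeof(*c));

	if (!head || !b || !c)
		return 1;

	head->id = 0; b->id = 1; c->id = 2;
	head->next = b; b->next = c; c->next = head;

	free_ring(head);
	return 0;
}
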
@@ -5701,11 +6241,11 @@ void build_sched_domains(const cpumask_t *cpu_map)
5701 /* 6241 /*
5702 * Allocate the per-node list of sched groups 6242 * Allocate the per-node list of sched groups
5703 */ 6243 */
5704 sched_group_nodes = kmalloc(sizeof(struct sched_group*)*MAX_NUMNODES, 6244 sched_group_nodes = kzalloc(sizeof(struct sched_group*)*MAX_NUMNODES,
5705 GFP_ATOMIC); 6245 GFP_KERNEL);
5706 if (!sched_group_nodes) { 6246 if (!sched_group_nodes) {
5707 printk(KERN_WARNING "Can not alloc sched group node list\n"); 6247 printk(KERN_WARNING "Can not alloc sched group node list\n");
5708 return; 6248 return -ENOMEM;
5709 } 6249 }
5710 sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes; 6250 sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes;
5711#endif 6251#endif
@@ -5731,7 +6271,7 @@ void build_sched_domains(const cpumask_t *cpu_map)
5731 if (!sched_group_allnodes) { 6271 if (!sched_group_allnodes) {
5732 printk(KERN_WARNING 6272 printk(KERN_WARNING
5733 "Can not alloc allnodes sched group\n"); 6273 "Can not alloc allnodes sched group\n");
5734 break; 6274 goto error;
5735 } 6275 }
5736 sched_group_allnodes_bycpu[i] 6276 sched_group_allnodes_bycpu[i]
5737 = sched_group_allnodes; 6277 = sched_group_allnodes;
@@ -5752,6 +6292,18 @@ void build_sched_domains(const cpumask_t *cpu_map)
5752 cpus_and(sd->span, sd->span, *cpu_map); 6292 cpus_and(sd->span, sd->span, *cpu_map);
5753#endif 6293#endif
5754 6294
6295 if (!sched_group_phys) {
6296 sched_group_phys
6297 = kmalloc(sizeof(struct sched_group) * NR_CPUS,
6298 GFP_KERNEL);
6299 if (!sched_group_phys) {
6300 printk (KERN_WARNING "Can not alloc phys sched"
6301 " group\n");
6302 goto error;
6303 }
6304 sched_group_phys_bycpu[i] = sched_group_phys;
6305 }
6306
5755 p = sd; 6307 p = sd;
5756 sd = &per_cpu(phys_domains, i); 6308 sd = &per_cpu(phys_domains, i);
5757 group = cpu_to_phys_group(i); 6309 group = cpu_to_phys_group(i);
@@ -5761,6 +6313,18 @@ void build_sched_domains(const cpumask_t *cpu_map)
5761 sd->groups = &sched_group_phys[group]; 6313 sd->groups = &sched_group_phys[group];
5762 6314
5763#ifdef CONFIG_SCHED_MC 6315#ifdef CONFIG_SCHED_MC
6316 if (!sched_group_core) {
6317 sched_group_core
6318 = kmalloc(sizeof(struct sched_group) * NR_CPUS,
6319 GFP_KERNEL);
6320 if (!sched_group_core) {
6321 printk (KERN_WARNING "Can not alloc core sched"
6322 " group\n");
6323 goto error;
6324 }
6325 sched_group_core_bycpu[i] = sched_group_core;
6326 }
6327
5764 p = sd; 6328 p = sd;
5765 sd = &per_cpu(core_domains, i); 6329 sd = &per_cpu(core_domains, i);
5766 group = cpu_to_core_group(i); 6330 group = cpu_to_core_group(i);
@@ -5844,24 +6408,21 @@ void build_sched_domains(const cpumask_t *cpu_map)
5844 domainspan = sched_domain_node_span(i); 6408 domainspan = sched_domain_node_span(i);
5845 cpus_and(domainspan, domainspan, *cpu_map); 6409 cpus_and(domainspan, domainspan, *cpu_map);
5846 6410
5847 sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL); 6411 sg = kmalloc_node(sizeof(struct sched_group), GFP_KERNEL, i);
6412 if (!sg) {
6413 printk(KERN_WARNING "Can not alloc domain group for "
6414 "node %d\n", i);
6415 goto error;
6416 }
5848 sched_group_nodes[i] = sg; 6417 sched_group_nodes[i] = sg;
5849 for_each_cpu_mask(j, nodemask) { 6418 for_each_cpu_mask(j, nodemask) {
5850 struct sched_domain *sd; 6419 struct sched_domain *sd;
5851 sd = &per_cpu(node_domains, j); 6420 sd = &per_cpu(node_domains, j);
5852 sd->groups = sg; 6421 sd->groups = sg;
5853 if (sd->groups == NULL) {
5854 /* Turn off balancing if we have no groups */
5855 sd->flags = 0;
5856 }
5857 }
5858 if (!sg) {
5859 printk(KERN_WARNING
5860 "Can not alloc domain group for node %d\n", i);
5861 continue;
5862 } 6422 }
5863 sg->cpu_power = 0; 6423 sg->cpu_power = 0;
5864 sg->cpumask = nodemask; 6424 sg->cpumask = nodemask;
6425 sg->next = sg;
5865 cpus_or(covered, covered, nodemask); 6426 cpus_or(covered, covered, nodemask);
5866 prev = sg; 6427 prev = sg;
5867 6428
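
This hunk replaces the old per-allocation break/continue handling with a single goto error path that unwinds everything built so far and reports -ENOMEM. A hedged userspace sketch of that shape; build_all, free_all and NPARTS are invented names, not scheduler code:

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

#define NPARTS 3

static void *parts[NPARTS];

/* Hypothetical cleanup helper, playing the role of free_sched_groups(). */
static void free_all(void)
{
	for (int i = 0; i < NPARTS; i++) {
		free(parts[i]);
		parts[i] = NULL;
	}
}

/* Build several pieces; on any failure fall through to one error label. */
static int build_all(void)
{
	for (int i = 0; i < NPARTS; i++) {
		parts[i] = malloc(64);
		if (!parts[i]) {
			fprintf(stderr, "Can not alloc part %d\n", i);
			goto error;
		}
	}
	return 0;

error:
	free_all();		/* undo whatever was allocated before the failure */
	return -ENOMEM;
}

int main(void)
{
	int err = build_all();

	if (!err)
		free_all();
	return err ? 1 : 0;
}
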
@@ -5880,54 +6441,90 @@ void build_sched_domains(const cpumask_t *cpu_map)
5880 if (cpus_empty(tmp)) 6441 if (cpus_empty(tmp))
5881 continue; 6442 continue;
5882 6443
5883 sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL); 6444 sg = kmalloc_node(sizeof(struct sched_group),
6445 GFP_KERNEL, i);
5884 if (!sg) { 6446 if (!sg) {
5885 printk(KERN_WARNING 6447 printk(KERN_WARNING
5886 "Can not alloc domain group for node %d\n", j); 6448 "Can not alloc domain group for node %d\n", j);
5887 break; 6449 goto error;
5888 } 6450 }
5889 sg->cpu_power = 0; 6451 sg->cpu_power = 0;
5890 sg->cpumask = tmp; 6452 sg->cpumask = tmp;
6453 sg->next = prev->next;
5891 cpus_or(covered, covered, tmp); 6454 cpus_or(covered, covered, tmp);
5892 prev->next = sg; 6455 prev->next = sg;
5893 prev = sg; 6456 prev = sg;
5894 } 6457 }
5895 prev->next = sched_group_nodes[i];
5896 } 6458 }
5897#endif 6459#endif
5898 6460
5899 /* Calculate CPU power for physical packages and nodes */ 6461 /* Calculate CPU power for physical packages and nodes */
6462#ifdef CONFIG_SCHED_SMT
5900 for_each_cpu_mask(i, *cpu_map) { 6463 for_each_cpu_mask(i, *cpu_map) {
5901 int power;
5902 struct sched_domain *sd; 6464 struct sched_domain *sd;
5903#ifdef CONFIG_SCHED_SMT
5904 sd = &per_cpu(cpu_domains, i); 6465 sd = &per_cpu(cpu_domains, i);
5905 power = SCHED_LOAD_SCALE; 6466 sd->groups->cpu_power = SCHED_LOAD_SCALE;
5906 sd->groups->cpu_power = power; 6467 }
5907#endif 6468#endif
5908#ifdef CONFIG_SCHED_MC 6469#ifdef CONFIG_SCHED_MC
6470 for_each_cpu_mask(i, *cpu_map) {
6471 int power;
6472 struct sched_domain *sd;
5909 sd = &per_cpu(core_domains, i); 6473 sd = &per_cpu(core_domains, i);
5910 power = SCHED_LOAD_SCALE + (cpus_weight(sd->groups->cpumask)-1) 6474 if (sched_smt_power_savings)
6475 power = SCHED_LOAD_SCALE * cpus_weight(sd->groups->cpumask);
6476 else
6477 power = SCHED_LOAD_SCALE + (cpus_weight(sd->groups->cpumask)-1)
5911 * SCHED_LOAD_SCALE / 10; 6478 * SCHED_LOAD_SCALE / 10;
5912 sd->groups->cpu_power = power; 6479 sd->groups->cpu_power = power;
6480 }
6481#endif
5913 6482
6483 for_each_cpu_mask(i, *cpu_map) {
6484 struct sched_domain *sd;
6485#ifdef CONFIG_SCHED_MC
5914 sd = &per_cpu(phys_domains, i); 6486 sd = &per_cpu(phys_domains, i);
6487 if (i != first_cpu(sd->groups->cpumask))
6488 continue;
5915 6489
5916 /* 6490 sd->groups->cpu_power = 0;
5917 * This has to be < 2 * SCHED_LOAD_SCALE 6491 if (sched_mc_power_savings || sched_smt_power_savings) {
5918 * Lets keep it SCHED_LOAD_SCALE, so that 6492 int j;
5919 * while calculating NUMA group's cpu_power 6493
5920 * we can simply do 6494 for_each_cpu_mask(j, sd->groups->cpumask) {
5921 * numa_group->cpu_power += phys_group->cpu_power; 6495 struct sched_domain *sd1;
5922 * 6496 sd1 = &per_cpu(core_domains, j);
5923 * See "only add power once for each physical pkg" 6497 /*
5924 * comment below 6498 * for each core we will add once
5925 */ 6499 * to the group in physical domain
5926 sd->groups->cpu_power = SCHED_LOAD_SCALE; 6500 */
6501 if (j != first_cpu(sd1->groups->cpumask))
6502 continue;
6503
6504 if (sched_smt_power_savings)
6505 sd->groups->cpu_power += sd1->groups->cpu_power;
6506 else
6507 sd->groups->cpu_power += SCHED_LOAD_SCALE;
6508 }
6509 } else
6510 /*
6511 * This has to be < 2 * SCHED_LOAD_SCALE
6512 * Lets keep it SCHED_LOAD_SCALE, so that
6513 * while calculating NUMA group's cpu_power
6514 * we can simply do
6515 * numa_group->cpu_power += phys_group->cpu_power;
6516 *
6517 * See "only add power once for each physical pkg"
6518 * comment below
6519 */
6520 sd->groups->cpu_power = SCHED_LOAD_SCALE;
5927#else 6521#else
6522 int power;
5928 sd = &per_cpu(phys_domains, i); 6523 sd = &per_cpu(phys_domains, i);
5929 power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE * 6524 if (sched_smt_power_savings)
5930 (cpus_weight(sd->groups->cpumask)-1) / 10; 6525 power = SCHED_LOAD_SCALE * cpus_weight(sd->groups->cpumask);
6526 else
6527 power = SCHED_LOAD_SCALE;
5931 sd->groups->cpu_power = power; 6528 sd->groups->cpu_power = power;
5932#endif 6529#endif
5933 } 6530 }
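
To make the cpu_power arithmetic above concrete: assuming SCHED_LOAD_SCALE is 128 (an assumption of this example, not something stated in the hunk) and a 2-thread sibling mask, the default formula gives 140 while the power-savings formula counts every sibling at full scale and gives 256:

#include <stdio.h>

#define SCHED_LOAD_SCALE 128UL	/* assumed value, for illustration only */

int main(void)
{
	unsigned long weight = 2;	/* e.g. a 2-thread SMT core */

	/* Default: one full unit plus 10% per additional sibling. */
	unsigned long normal = SCHED_LOAD_SCALE +
			(weight - 1) * SCHED_LOAD_SCALE / 10;

	/* Power savings: every sibling counts at full scale, so the balancer
	 * prefers to fill one package before spreading to another. */
	unsigned long powersave = SCHED_LOAD_SCALE * weight;

	printf("normal=%lu powersave=%lu\n", normal, powersave);	/* 140, 256 */
	return 0;
}
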
@@ -5936,7 +6533,12 @@ void build_sched_domains(const cpumask_t *cpu_map)
5936 for (i = 0; i < MAX_NUMNODES; i++) 6533 for (i = 0; i < MAX_NUMNODES; i++)
5937 init_numa_sched_groups_power(sched_group_nodes[i]); 6534 init_numa_sched_groups_power(sched_group_nodes[i]);
5938 6535
5939 init_numa_sched_groups_power(sched_group_allnodes); 6536 if (sched_group_allnodes) {
6537 int group = cpu_to_allnodes_group(first_cpu(*cpu_map));
6538 struct sched_group *sg = &sched_group_allnodes[group];
6539
6540 init_numa_sched_groups_power(sg);
6541 }
5940#endif 6542#endif
5941 6543
5942 /* Attach the domains */ 6544 /* Attach the domains */
@@ -5955,13 +6557,20 @@ void build_sched_domains(const cpumask_t *cpu_map)
5955 * Tune cache-hot values: 6557 * Tune cache-hot values:
5956 */ 6558 */
5957 calibrate_migration_costs(cpu_map); 6559 calibrate_migration_costs(cpu_map);
6560
6561 return 0;
6562
6563error:
6564 free_sched_groups(cpu_map);
6565 return -ENOMEM;
5958} 6566}
5959/* 6567/*
5960 * Set up scheduler domains and groups. Callers must hold the hotplug lock. 6568 * Set up scheduler domains and groups. Callers must hold the hotplug lock.
5961 */ 6569 */
5962static void arch_init_sched_domains(const cpumask_t *cpu_map) 6570static int arch_init_sched_domains(const cpumask_t *cpu_map)
5963{ 6571{
5964 cpumask_t cpu_default_map; 6572 cpumask_t cpu_default_map;
6573 int err;
5965 6574
5966 /* 6575 /*
5967 * Setup mask for cpus without special case scheduling requirements. 6576 * Setup mask for cpus without special case scheduling requirements.
@@ -5970,51 +6579,14 @@ static void arch_init_sched_domains(const cpumask_t *cpu_map)
5970 */ 6579 */
5971 cpus_andnot(cpu_default_map, *cpu_map, cpu_isolated_map); 6580 cpus_andnot(cpu_default_map, *cpu_map, cpu_isolated_map);
5972 6581
5973 build_sched_domains(&cpu_default_map); 6582 err = build_sched_domains(&cpu_default_map);
6583
6584 return err;
5974} 6585}
5975 6586
5976static void arch_destroy_sched_domains(const cpumask_t *cpu_map) 6587static void arch_destroy_sched_domains(const cpumask_t *cpu_map)
5977{ 6588{
5978#ifdef CONFIG_NUMA 6589 free_sched_groups(cpu_map);
5979 int i;
5980 int cpu;
5981
5982 for_each_cpu_mask(cpu, *cpu_map) {
5983 struct sched_group *sched_group_allnodes
5984 = sched_group_allnodes_bycpu[cpu];
5985 struct sched_group **sched_group_nodes
5986 = sched_group_nodes_bycpu[cpu];
5987
5988 if (sched_group_allnodes) {
5989 kfree(sched_group_allnodes);
5990 sched_group_allnodes_bycpu[cpu] = NULL;
5991 }
5992
5993 if (!sched_group_nodes)
5994 continue;
5995
5996 for (i = 0; i < MAX_NUMNODES; i++) {
5997 cpumask_t nodemask = node_to_cpumask(i);
5998 struct sched_group *oldsg, *sg = sched_group_nodes[i];
5999
6000 cpus_and(nodemask, nodemask, *cpu_map);
6001 if (cpus_empty(nodemask))
6002 continue;
6003
6004 if (sg == NULL)
6005 continue;
6006 sg = sg->next;
6007next_sg:
6008 oldsg = sg;
6009 sg = sg->next;
6010 kfree(oldsg);
6011 if (oldsg != sched_group_nodes[i])
6012 goto next_sg;
6013 }
6014 kfree(sched_group_nodes);
6015 sched_group_nodes_bycpu[cpu] = NULL;
6016 }
6017#endif
6018} 6590}
6019 6591
6020/* 6592/*
@@ -6039,9 +6611,10 @@ static void detach_destroy_domains(const cpumask_t *cpu_map)
6039 * correct sched domains 6611 * correct sched domains
6040 * Call with hotplug lock held 6612 * Call with hotplug lock held
6041 */ 6613 */
6042void partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2) 6614int partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2)
6043{ 6615{
6044 cpumask_t change_map; 6616 cpumask_t change_map;
6617 int err = 0;
6045 6618
6046 cpus_and(*partition1, *partition1, cpu_online_map); 6619 cpus_and(*partition1, *partition1, cpu_online_map);
6047 cpus_and(*partition2, *partition2, cpu_online_map); 6620 cpus_and(*partition2, *partition2, cpu_online_map);
@@ -6050,10 +6623,89 @@ void partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2)
6050 /* Detach sched domains from all of the affected cpus */ 6623 /* Detach sched domains from all of the affected cpus */
6051 detach_destroy_domains(&change_map); 6624 detach_destroy_domains(&change_map);
6052 if (!cpus_empty(*partition1)) 6625 if (!cpus_empty(*partition1))
6053 build_sched_domains(partition1); 6626 err = build_sched_domains(partition1);
6054 if (!cpus_empty(*partition2)) 6627 if (!err && !cpus_empty(*partition2))
6055 build_sched_domains(partition2); 6628 err = build_sched_domains(partition2);
6629
6630 return err;
6631}
6632
6633#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
6634int arch_reinit_sched_domains(void)
6635{
6636 int err;
6637
6638 lock_cpu_hotplug();
6639 detach_destroy_domains(&cpu_online_map);
6640 err = arch_init_sched_domains(&cpu_online_map);
6641 unlock_cpu_hotplug();
6642
6643 return err;
6644}
6645
6646static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
6647{
6648 int ret;
6649
6650 if (buf[0] != '0' && buf[0] != '1')
6651 return -EINVAL;
6652
6653 if (smt)
6654 sched_smt_power_savings = (buf[0] == '1');
6655 else
6656 sched_mc_power_savings = (buf[0] == '1');
6657
6658 ret = arch_reinit_sched_domains();
6659
6660 return ret ? ret : count;
6661}
6662
6663int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
6664{
6665 int err = 0;
6666
6667#ifdef CONFIG_SCHED_SMT
6668 if (smt_capable())
6669 err = sysfs_create_file(&cls->kset.kobj,
6670 &attr_sched_smt_power_savings.attr);
6671#endif
6672#ifdef CONFIG_SCHED_MC
6673 if (!err && mc_capable())
6674 err = sysfs_create_file(&cls->kset.kobj,
6675 &attr_sched_mc_power_savings.attr);
6676#endif
6677 return err;
6678}
6679#endif
6680
6681#ifdef CONFIG_SCHED_MC
6682static ssize_t sched_mc_power_savings_show(struct sys_device *dev, char *page)
6683{
6684 return sprintf(page, "%u\n", sched_mc_power_savings);
6685}
6686static ssize_t sched_mc_power_savings_store(struct sys_device *dev,
6687 const char *buf, size_t count)
6688{
6689 return sched_power_savings_store(buf, count, 0);
6690}
6691SYSDEV_ATTR(sched_mc_power_savings, 0644, sched_mc_power_savings_show,
6692 sched_mc_power_savings_store);
6693#endif
6694
6695#ifdef CONFIG_SCHED_SMT
6696static ssize_t sched_smt_power_savings_show(struct sys_device *dev, char *page)
6697{
6698 return sprintf(page, "%u\n", sched_smt_power_savings);
6699}
6700static ssize_t sched_smt_power_savings_store(struct sys_device *dev,
6701 const char *buf, size_t count)
6702{
6703 return sched_power_savings_store(buf, count, 1);
6056} 6704}
6705SYSDEV_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_show,
6706 sched_smt_power_savings_store);
6707#endif
6708
6057 6709
6058#ifdef CONFIG_HOTPLUG_CPU 6710#ifdef CONFIG_HOTPLUG_CPU
6059/* 6711/*
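
sched_power_savings_store() accepts only a leading '0' or '1' and otherwise returns -EINVAL; on success a sysfs store reports the number of bytes consumed. A small userspace sketch of just that validation step -- parse_bool_store is an invented helper, not part of the patch:

#include <errno.h>
#include <stdio.h>

/* Invented helper mirroring sched_power_savings_store()'s input checks. */
static long parse_bool_store(const char *buf, size_t count, int *value)
{
	if (count < 1 || (buf[0] != '0' && buf[0] != '1'))
		return -EINVAL;

	*value = (buf[0] == '1');
	return (long)count;	/* sysfs store methods return bytes consumed */
}

int main(void)
{
	int v;
	long ret = parse_bool_store("1\n", 2, &v);

	printf("ret=%ld value=%d\n", ret, v);			/* ret=2 value=1 */
	printf("bad=%ld\n", parse_bool_store("x", 1, &v));	/* -EINVAL */
	return 0;
}
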
@@ -6108,6 +6760,7 @@ int in_sched_functions(unsigned long addr)
6108{ 6760{
6109 /* Linker adds these: start and end of __sched functions */ 6761 /* Linker adds these: start and end of __sched functions */
6110 extern char __sched_text_start[], __sched_text_end[]; 6762 extern char __sched_text_start[], __sched_text_end[];
6763
6111 return in_lock_functions(addr) || 6764 return in_lock_functions(addr) ||
6112 (addr >= (unsigned long)__sched_text_start 6765 (addr >= (unsigned long)__sched_text_start
6113 && addr < (unsigned long)__sched_text_end); 6766 && addr < (unsigned long)__sched_text_end);
@@ -6115,14 +6768,15 @@ int in_sched_functions(unsigned long addr)
6115 6768
6116void __init sched_init(void) 6769void __init sched_init(void)
6117{ 6770{
6118 runqueue_t *rq;
6119 int i, j, k; 6771 int i, j, k;
6120 6772
6121 for_each_possible_cpu(i) { 6773 for_each_possible_cpu(i) {
6122 prio_array_t *array; 6774 struct prio_array *array;
6775 struct rq *rq;
6123 6776
6124 rq = cpu_rq(i); 6777 rq = cpu_rq(i);
6125 spin_lock_init(&rq->lock); 6778 spin_lock_init(&rq->lock);
6779 lockdep_set_class(&rq->lock, &rq->rq_lock_key);
6126 rq->nr_running = 0; 6780 rq->nr_running = 0;
6127 rq->active = rq->arrays; 6781 rq->active = rq->arrays;
6128 rq->expired = rq->arrays + 1; 6782 rq->expired = rq->arrays + 1;
@@ -6134,9 +6788,9 @@ void __init sched_init(void)
6134 rq->cpu_load[j] = 0; 6788 rq->cpu_load[j] = 0;
6135 rq->active_balance = 0; 6789 rq->active_balance = 0;
6136 rq->push_cpu = 0; 6790 rq->push_cpu = 0;
6791 rq->cpu = i;
6137 rq->migration_thread = NULL; 6792 rq->migration_thread = NULL;
6138 INIT_LIST_HEAD(&rq->migration_queue); 6793 INIT_LIST_HEAD(&rq->migration_queue);
6139 rq->cpu = i;
6140#endif 6794#endif
6141 atomic_set(&rq->nr_iowait, 0); 6795 atomic_set(&rq->nr_iowait, 0);
6142 6796
@@ -6151,6 +6805,12 @@ void __init sched_init(void)
6151 } 6805 }
6152 } 6806 }
6153 6807
6808 set_load_weight(&init_task);
6809
6810#ifdef CONFIG_RT_MUTEXES
6811 plist_head_init(&init_task.pi_waiters, &init_task.pi_lock);
6812#endif
6813
6154 /* 6814 /*
6155 * The boot idle thread does lazy MMU switching as well: 6815 * The boot idle thread does lazy MMU switching as well:
6156 */ 6816 */
@@ -6169,7 +6829,7 @@ void __init sched_init(void)
6169#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP 6829#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
6170void __might_sleep(char *file, int line) 6830void __might_sleep(char *file, int line)
6171{ 6831{
6172#if defined(in_atomic) 6832#ifdef in_atomic
6173 static unsigned long prev_jiffy; /* ratelimiting */ 6833 static unsigned long prev_jiffy; /* ratelimiting */
6174 6834
6175 if ((in_atomic() || irqs_disabled()) && 6835 if ((in_atomic() || irqs_disabled()) &&
@@ -6191,17 +6851,18 @@ EXPORT_SYMBOL(__might_sleep);
6191#ifdef CONFIG_MAGIC_SYSRQ 6851#ifdef CONFIG_MAGIC_SYSRQ
6192void normalize_rt_tasks(void) 6852void normalize_rt_tasks(void)
6193{ 6853{
6854 struct prio_array *array;
6194 struct task_struct *p; 6855 struct task_struct *p;
6195 prio_array_t *array;
6196 unsigned long flags; 6856 unsigned long flags;
6197 runqueue_t *rq; 6857 struct rq *rq;
6198 6858
6199 read_lock_irq(&tasklist_lock); 6859 read_lock_irq(&tasklist_lock);
6200 for_each_process (p) { 6860 for_each_process(p) {
6201 if (!rt_task(p)) 6861 if (!rt_task(p))
6202 continue; 6862 continue;
6203 6863
6204 rq = task_rq_lock(p, &flags); 6864 spin_lock_irqsave(&p->pi_lock, flags);
6865 rq = __task_rq_lock(p);
6205 6866
6206 array = p->array; 6867 array = p->array;
6207 if (array) 6868 if (array)
@@ -6212,7 +6873,8 @@ void normalize_rt_tasks(void)
6212 resched_task(rq->curr); 6873 resched_task(rq->curr);
6213 } 6874 }
6214 6875
6215 task_rq_unlock(rq, &flags); 6876 __task_rq_unlock(rq);
6877 spin_unlock_irqrestore(&p->pi_lock, flags);
6216 } 6878 }
6217 read_unlock_irq(&tasklist_lock); 6879 read_unlock_irq(&tasklist_lock);
6218} 6880}
@@ -6236,7 +6898,7 @@ void normalize_rt_tasks(void)
6236 * 6898 *
6237 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! 6899 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
6238 */ 6900 */
6239task_t *curr_task(int cpu) 6901struct task_struct *curr_task(int cpu)
6240{ 6902{
6241 return cpu_curr(cpu); 6903 return cpu_curr(cpu);
6242} 6904}
@@ -6256,7 +6918,7 @@ task_t *curr_task(int cpu)
6256 * 6918 *
6257 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! 6919 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
6258 */ 6920 */
6259void set_curr_task(int cpu, task_t *p) 6921void set_curr_task(int cpu, struct task_struct *p)
6260{ 6922{
6261 cpu_curr(cpu) = p; 6923 cpu_curr(cpu) = p;
6262} 6924}
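
The normalize_rt_tasks() change above takes p->pi_lock before the runqueue lock and releases them in reverse order. As a rough pthreads analogy of that fixed lock ordering -- the two mutexes are stand-ins, and none of the priority-inheritance machinery is modeled:

#include <pthread.h>
#include <stdio.h>

/* Stand-ins for p->pi_lock and rq->lock. */
static pthread_mutex_t pi_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t rq_lock = PTHREAD_MUTEX_INITIALIZER;

/* Every path that needs both locks takes them in the same order,
 * which is what rules out ABBA deadlocks between the two. */
static void adjust_task(void)
{
	pthread_mutex_lock(&pi_lock);	/* outer lock first */
	pthread_mutex_lock(&rq_lock);	/* then the inner one */

	puts("requeue task");

	pthread_mutex_unlock(&rq_lock);	/* release inner first */
	pthread_mutex_unlock(&pi_lock);
}

int main(void)
{
	adjust_task();
	return 0;
}
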
diff --git a/kernel/signal.c b/kernel/signal.c
index 1b3c921737e2..fb5da6d19f14 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -10,7 +10,6 @@
10 * to allow signals to be sent reliably. 10 * to allow signals to be sent reliably.
11 */ 11 */
12 12
13#include <linux/config.h>
14#include <linux/slab.h> 13#include <linux/slab.h>
15#include <linux/module.h> 14#include <linux/module.h>
16#include <linux/smp_lock.h> 15#include <linux/smp_lock.h>
@@ -418,9 +417,8 @@ static int collect_signal(int sig, struct sigpending *list, siginfo_t *info)
418static int __dequeue_signal(struct sigpending *pending, sigset_t *mask, 417static int __dequeue_signal(struct sigpending *pending, sigset_t *mask,
419 siginfo_t *info) 418 siginfo_t *info)
420{ 419{
421 int sig = 0; 420 int sig = next_signal(pending, mask);
422 421
423 sig = next_signal(pending, mask);
424 if (sig) { 422 if (sig) {
425 if (current->notifier) { 423 if (current->notifier) {
426 if (sigismember(current->notifier_mask, sig)) { 424 if (sigismember(current->notifier_mask, sig)) {
@@ -433,9 +431,7 @@ static int __dequeue_signal(struct sigpending *pending, sigset_t *mask,
433 431
434 if (!collect_signal(sig, pending, info)) 432 if (!collect_signal(sig, pending, info))
435 sig = 0; 433 sig = 0;
436
437 } 434 }
438 recalc_sigpending();
439 435
440 return sig; 436 return sig;
441} 437}
@@ -452,6 +448,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
452 if (!signr) 448 if (!signr)
453 signr = __dequeue_signal(&tsk->signal->shared_pending, 449 signr = __dequeue_signal(&tsk->signal->shared_pending,
454 mask, info); 450 mask, info);
451 recalc_sigpending_tsk(tsk);
455 if (signr && unlikely(sig_kernel_stop(signr))) { 452 if (signr && unlikely(sig_kernel_stop(signr))) {
456 /* 453 /*
457 * Set a marker that we have dequeued a stop signal. Our 454 * Set a marker that we have dequeued a stop signal. Our
@@ -584,7 +581,7 @@ static int check_kill_permission(int sig, struct siginfo *info,
584 && !capable(CAP_KILL)) 581 && !capable(CAP_KILL))
585 return error; 582 return error;
586 583
587 error = security_task_kill(t, info, sig); 584 error = security_task_kill(t, info, sig, 0);
588 if (!error) 585 if (!error)
589 audit_signal_info(sig, t); /* Let audit system see the signal */ 586 audit_signal_info(sig, t); /* Let audit system see the signal */
590 return error; 587 return error;
@@ -792,22 +789,31 @@ out:
792/* 789/*
793 * Force a signal that the process can't ignore: if necessary 790 * Force a signal that the process can't ignore: if necessary
794 * we unblock the signal and change any SIG_IGN to SIG_DFL. 791 * we unblock the signal and change any SIG_IGN to SIG_DFL.
792 *
793 * Note: If we unblock the signal, we always reset it to SIG_DFL,
794 * since we do not want to have a signal handler that was blocked
795 * be invoked when user space had explicitly blocked it.
796 *
797 * We don't want to have recursive SIGSEGV's etc, for example.
795 */ 798 */
796
797int 799int
798force_sig_info(int sig, struct siginfo *info, struct task_struct *t) 800force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
799{ 801{
800 unsigned long int flags; 802 unsigned long int flags;
801 int ret; 803 int ret, blocked, ignored;
804 struct k_sigaction *action;
802 805
803 spin_lock_irqsave(&t->sighand->siglock, flags); 806 spin_lock_irqsave(&t->sighand->siglock, flags);
804 if (t->sighand->action[sig-1].sa.sa_handler == SIG_IGN) { 807 action = &t->sighand->action[sig-1];
805 t->sighand->action[sig-1].sa.sa_handler = SIG_DFL; 808 ignored = action->sa.sa_handler == SIG_IGN;
806 } 809 blocked = sigismember(&t->blocked, sig);
807 if (sigismember(&t->blocked, sig)) { 810 if (blocked || ignored) {
808 sigdelset(&t->blocked, sig); 811 action->sa.sa_handler = SIG_DFL;
812 if (blocked) {
813 sigdelset(&t->blocked, sig);
814 recalc_sigpending_tsk(t);
815 }
809 } 816 }
810 recalc_sigpending_tsk(t);
811 ret = specific_send_sig_info(sig, info, t); 817 ret = specific_send_sig_info(sig, info, t);
812 spin_unlock_irqrestore(&t->sighand->siglock, flags); 818 spin_unlock_irqrestore(&t->sighand->siglock, flags);
813 819
@@ -1107,7 +1113,7 @@ kill_proc_info(int sig, struct siginfo *info, pid_t pid)
1107 1113
1108/* like kill_proc_info(), but doesn't use uid/euid of "current" */ 1114/* like kill_proc_info(), but doesn't use uid/euid of "current" */
1109int kill_proc_info_as_uid(int sig, struct siginfo *info, pid_t pid, 1115int kill_proc_info_as_uid(int sig, struct siginfo *info, pid_t pid,
1110 uid_t uid, uid_t euid) 1116 uid_t uid, uid_t euid, u32 secid)
1111{ 1117{
1112 int ret = -EINVAL; 1118 int ret = -EINVAL;
1113 struct task_struct *p; 1119 struct task_struct *p;
@@ -1127,6 +1133,9 @@ int kill_proc_info_as_uid(int sig, struct siginfo *info, pid_t pid,
1127 ret = -EPERM; 1133 ret = -EPERM;
1128 goto out_unlock; 1134 goto out_unlock;
1129 } 1135 }
1136 ret = security_task_kill(p, info, sig, secid);
1137 if (ret)
1138 goto out_unlock;
1130 if (sig && p->sighand) { 1139 if (sig && p->sighand) {
1131 unsigned long flags; 1140 unsigned long flags;
1132 spin_lock_irqsave(&p->sighand->siglock, flags); 1141 spin_lock_irqsave(&p->sighand->siglock, flags);
@@ -1531,6 +1540,35 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, int why)
1531 spin_unlock_irqrestore(&sighand->siglock, flags); 1540 spin_unlock_irqrestore(&sighand->siglock, flags);
1532} 1541}
1533 1542
1543static inline int may_ptrace_stop(void)
1544{
1545 if (!likely(current->ptrace & PT_PTRACED))
1546 return 0;
1547
1548 if (unlikely(current->parent == current->real_parent &&
1549 (current->ptrace & PT_ATTACHED)))
1550 return 0;
1551
1552 if (unlikely(current->signal == current->parent->signal) &&
1553 unlikely(current->signal->flags & SIGNAL_GROUP_EXIT))
1554 return 0;
1555
1556 /*
1557 * Are we in the middle of do_coredump?
1558 * If so and our tracer is also part of the coredump stopping
1559 * is a deadlock situation, and pointless because our tracer
1560 * is dead so don't allow us to stop.
1561 * If SIGKILL was already sent before the caller unlocked
1562 * ->siglock we must see ->core_waiters != 0. Otherwise it
1563 * is safe to enter schedule().
1564 */
1565 if (unlikely(current->mm->core_waiters) &&
1566 unlikely(current->mm == current->parent->mm))
1567 return 0;
1568
1569 return 1;
1570}
1571
1534/* 1572/*
1535 * This must be called with current->sighand->siglock held. 1573 * This must be called with current->sighand->siglock held.
1536 * 1574 *
@@ -1559,11 +1597,7 @@ static void ptrace_stop(int exit_code, int nostop_code, siginfo_t *info)
1559 spin_unlock_irq(&current->sighand->siglock); 1597 spin_unlock_irq(&current->sighand->siglock);
1560 try_to_freeze(); 1598 try_to_freeze();
1561 read_lock(&tasklist_lock); 1599 read_lock(&tasklist_lock);
1562 if (likely(current->ptrace & PT_PTRACED) && 1600 if (may_ptrace_stop()) {
1563 likely(current->parent != current->real_parent ||
1564 !(current->ptrace & PT_ATTACHED)) &&
1565 (likely(current->parent->signal != current->signal) ||
1566 !unlikely(current->signal->flags & SIGNAL_GROUP_EXIT))) {
1567 do_notify_parent_cldstop(current, CLD_TRAPPED); 1601 do_notify_parent_cldstop(current, CLD_TRAPPED);
1568 read_unlock(&tasklist_lock); 1602 read_unlock(&tasklist_lock);
1569 schedule(); 1603 schedule();
@@ -2541,6 +2575,11 @@ asmlinkage long sys_rt_sigsuspend(sigset_t __user *unewset, size_t sigsetsize)
2541} 2575}
2542#endif /* __ARCH_WANT_SYS_RT_SIGSUSPEND */ 2576#endif /* __ARCH_WANT_SYS_RT_SIGSUSPEND */
2543 2577
2578__attribute__((weak)) const char *arch_vma_name(struct vm_area_struct *vma)
2579{
2580 return NULL;
2581}
2582
2544void __init signals_init(void) 2583void __init signals_init(void)
2545{ 2584{
2546 sigqueue_cachep = 2585 sigqueue_cachep =
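
The reworked force_sig_info() resets a blocked or ignored signal to SIG_DFL and unblocks it before delivery, so the signal cannot be swallowed by a stale handler or mask. A userspace analogue using plain POSIX calls -- this is not the kernel path, only the same idea:

#include <signal.h>
#include <stdio.h>

int main(void)
{
	sigset_t set;

	/* Start out ignoring and blocking SIGTERM, like a task that has
	 * SIG_IGN installed and the signal in its blocked mask. */
	signal(SIGTERM, SIG_IGN);
	sigemptyset(&set);
	sigaddset(&set, SIGTERM);
	sigprocmask(SIG_BLOCK, &set, NULL);

	/* "Force" it: reset to the default action and unblock, then send.
	 * With SIG_DFL restored, delivery terminates the process. */
	signal(SIGTERM, SIG_DFL);
	sigprocmask(SIG_UNBLOCK, &set, NULL);

	printf("about to raise SIGTERM\n");
	fflush(stdout);
	raise(SIGTERM);

	/* Not reached: the default action for SIGTERM is termination. */
	return 0;
}
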
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 336f92d64e2e..bf25015dce16 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -62,6 +62,137 @@ static inline void wakeup_softirqd(void)
62} 62}
63 63
64/* 64/*
65 * This one is for softirq.c-internal use,
66 * where hardirqs are disabled legitimately:
67 */
68#ifdef CONFIG_TRACE_IRQFLAGS
69static void __local_bh_disable(unsigned long ip)
70{
71 unsigned long flags;
72
73 WARN_ON_ONCE(in_irq());
74
75 raw_local_irq_save(flags);
76 add_preempt_count(SOFTIRQ_OFFSET);
77 /*
78 * Were softirqs turned off above:
79 */
80 if (softirq_count() == SOFTIRQ_OFFSET)
81 trace_softirqs_off(ip);
82 raw_local_irq_restore(flags);
83}
84#else /* !CONFIG_TRACE_IRQFLAGS */
85static inline void __local_bh_disable(unsigned long ip)
86{
87 add_preempt_count(SOFTIRQ_OFFSET);
88 barrier();
89}
90#endif /* CONFIG_TRACE_IRQFLAGS */
91
92void local_bh_disable(void)
93{
94 __local_bh_disable((unsigned long)__builtin_return_address(0));
95}
96
97EXPORT_SYMBOL(local_bh_disable);
98
99void __local_bh_enable(void)
100{
101 WARN_ON_ONCE(in_irq());
102
103 /*
104 * softirqs should never be enabled by __local_bh_enable(),
105 * it always nests inside local_bh_enable() sections:
106 */
107 WARN_ON_ONCE(softirq_count() == SOFTIRQ_OFFSET);
108
109 sub_preempt_count(SOFTIRQ_OFFSET);
110}
111EXPORT_SYMBOL_GPL(__local_bh_enable);
112
113/*
114 * Special-case - softirqs can safely be enabled in
115 * cond_resched_softirq(), or by __do_softirq(),
116 * without processing still-pending softirqs:
117 */
118void _local_bh_enable(void)
119{
120 WARN_ON_ONCE(in_irq());
121 WARN_ON_ONCE(!irqs_disabled());
122
123 if (softirq_count() == SOFTIRQ_OFFSET)
124 trace_softirqs_on((unsigned long)__builtin_return_address(0));
125 sub_preempt_count(SOFTIRQ_OFFSET);
126}
127
128EXPORT_SYMBOL(_local_bh_enable);
129
130void local_bh_enable(void)
131{
132#ifdef CONFIG_TRACE_IRQFLAGS
133 unsigned long flags;
134
135 WARN_ON_ONCE(in_irq());
136#endif
137 WARN_ON_ONCE(irqs_disabled());
138
139#ifdef CONFIG_TRACE_IRQFLAGS
140 local_irq_save(flags);
141#endif
142 /*
143 * Are softirqs going to be turned on now:
144 */
145 if (softirq_count() == SOFTIRQ_OFFSET)
146 trace_softirqs_on((unsigned long)__builtin_return_address(0));
147 /*
148 * Keep preemption disabled until we are done with
149 * softirq processing:
150 */
151 sub_preempt_count(SOFTIRQ_OFFSET - 1);
152
153 if (unlikely(!in_interrupt() && local_softirq_pending()))
154 do_softirq();
155
156 dec_preempt_count();
157#ifdef CONFIG_TRACE_IRQFLAGS
158 local_irq_restore(flags);
159#endif
160 preempt_check_resched();
161}
162EXPORT_SYMBOL(local_bh_enable);
163
164void local_bh_enable_ip(unsigned long ip)
165{
166#ifdef CONFIG_TRACE_IRQFLAGS
167 unsigned long flags;
168
169 WARN_ON_ONCE(in_irq());
170
171 local_irq_save(flags);
172#endif
173 /*
174 * Are softirqs going to be turned on now:
175 */
176 if (softirq_count() == SOFTIRQ_OFFSET)
177 trace_softirqs_on(ip);
178 /*
179 * Keep preemption disabled until we are done with
180 * softirq processing:
181 */
182 sub_preempt_count(SOFTIRQ_OFFSET - 1);
183
184 if (unlikely(!in_interrupt() && local_softirq_pending()))
185 do_softirq();
186
187 dec_preempt_count();
188#ifdef CONFIG_TRACE_IRQFLAGS
189 local_irq_restore(flags);
190#endif
191 preempt_check_resched();
192}
193EXPORT_SYMBOL(local_bh_enable_ip);
194
195/*
65 * We restart softirq processing MAX_SOFTIRQ_RESTART times, 196 * We restart softirq processing MAX_SOFTIRQ_RESTART times,
66 * and we fall back to softirqd after that. 197 * and we fall back to softirqd after that.
67 * 198 *
@@ -80,8 +211,11 @@ asmlinkage void __do_softirq(void)
80 int cpu; 211 int cpu;
81 212
82 pending = local_softirq_pending(); 213 pending = local_softirq_pending();
214 account_system_vtime(current);
215
216 __local_bh_disable((unsigned long)__builtin_return_address(0));
217 trace_softirq_enter();
83 218
84 local_bh_disable();
85 cpu = smp_processor_id(); 219 cpu = smp_processor_id();
86restart: 220restart:
87 /* Reset the pending bitmask before enabling irqs */ 221 /* Reset the pending bitmask before enabling irqs */
@@ -109,7 +243,10 @@ restart:
109 if (pending) 243 if (pending)
110 wakeup_softirqd(); 244 wakeup_softirqd();
111 245
112 __local_bh_enable(); 246 trace_softirq_exit();
247
248 account_system_vtime(current);
249 _local_bh_enable();
113} 250}
114 251
115#ifndef __ARCH_HAS_DO_SOFTIRQ 252#ifndef __ARCH_HAS_DO_SOFTIRQ
@@ -136,23 +273,6 @@ EXPORT_SYMBOL(do_softirq);
136 273
137#endif 274#endif
138 275
139void local_bh_enable(void)
140{
141 WARN_ON(irqs_disabled());
142 /*
143 * Keep preemption disabled until we are done with
144 * softirq processing:
145 */
146 sub_preempt_count(SOFTIRQ_OFFSET - 1);
147
148 if (unlikely(!in_interrupt() && local_softirq_pending()))
149 do_softirq();
150
151 dec_preempt_count();
152 preempt_check_resched();
153}
154EXPORT_SYMBOL(local_bh_enable);
155
156#ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED 276#ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED
157# define invoke_softirq() __do_softirq() 277# define invoke_softirq() __do_softirq()
158#else 278#else
@@ -165,6 +285,7 @@ EXPORT_SYMBOL(local_bh_enable);
165void irq_exit(void) 285void irq_exit(void)
166{ 286{
167 account_system_vtime(current); 287 account_system_vtime(current);
288 trace_hardirq_exit();
168 sub_preempt_count(IRQ_EXIT_OFFSET); 289 sub_preempt_count(IRQ_EXIT_OFFSET);
169 if (!in_interrupt() && local_softirq_pending()) 290 if (!in_interrupt() && local_softirq_pending())
170 invoke_softirq(); 291 invoke_softirq();
@@ -208,8 +329,6 @@ void open_softirq(int nr, void (*action)(struct softirq_action*), void *data)
208 softirq_vec[nr].action = action; 329 softirq_vec[nr].action = action;
209} 330}
210 331
211EXPORT_SYMBOL(open_softirq);
212
213/* Tasklets */ 332/* Tasklets */
214struct tasklet_head 333struct tasklet_head
215{ 334{
@@ -446,7 +565,7 @@ static void takeover_tasklets(unsigned int cpu)
446} 565}
447#endif /* CONFIG_HOTPLUG_CPU */ 566#endif /* CONFIG_HOTPLUG_CPU */
448 567
449static int cpu_callback(struct notifier_block *nfb, 568static int __cpuinit cpu_callback(struct notifier_block *nfb,
450 unsigned long action, 569 unsigned long action,
451 void *hcpu) 570 void *hcpu)
452{ 571{
@@ -470,6 +589,8 @@ static int cpu_callback(struct notifier_block *nfb,
470 break; 589 break;
471#ifdef CONFIG_HOTPLUG_CPU 590#ifdef CONFIG_HOTPLUG_CPU
472 case CPU_UP_CANCELED: 591 case CPU_UP_CANCELED:
592 if (!per_cpu(ksoftirqd, hotcpu))
593 break;
473 /* Unbind so it can run. Fall thru. */ 594 /* Unbind so it can run. Fall thru. */
474 kthread_bind(per_cpu(ksoftirqd, hotcpu), 595 kthread_bind(per_cpu(ksoftirqd, hotcpu),
475 any_online_cpu(cpu_online_map)); 596 any_online_cpu(cpu_online_map));
@@ -484,14 +605,16 @@ static int cpu_callback(struct notifier_block *nfb,
484 return NOTIFY_OK; 605 return NOTIFY_OK;
485} 606}
486 607
487static struct notifier_block cpu_nfb = { 608static struct notifier_block __cpuinitdata cpu_nfb = {
488 .notifier_call = cpu_callback 609 .notifier_call = cpu_callback
489}; 610};
490 611
491__init int spawn_ksoftirqd(void) 612__init int spawn_ksoftirqd(void)
492{ 613{
493 void *cpu = (void *)(long)smp_processor_id(); 614 void *cpu = (void *)(long)smp_processor_id();
494 cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); 615 int err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
616
617 BUG_ON(err == NOTIFY_BAD);
495 cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); 618 cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
496 register_cpu_notifier(&cpu_nfb); 619 register_cpu_notifier(&cpu_nfb);
497 return 0; 620 return 0;
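
The new local_bh_enable() first drops SOFTIRQ_OFFSET - 1 from the preempt count, so pending softirqs run while one preemption reference is still held, and only then does the final decrement. A simplified counting sketch of that trick; bh_disable/bh_enable/do_pending are invented, the SOFTIRQ_OFFSET value is assumed, and the lockdep/irq tracing is left out:

#include <stdio.h>

#define SOFTIRQ_OFFSET 256	/* assumed offset, for illustration only */

static int preempt_count;
static int pending_work = 1;

static void do_pending(void)
{
	printf("running deferred work (preempt_count=%d)\n", preempt_count);
	pending_work = 0;
}

static void bh_disable(void)
{
	preempt_count += SOFTIRQ_OFFSET;
}

static void bh_enable(void)
{
	/* Re-enable "softirqs" but keep one preemption reference held while
	 * pending work runs, mirroring sub_preempt_count(SOFTIRQ_OFFSET - 1). */
	preempt_count -= SOFTIRQ_OFFSET - 1;

	if (pending_work && preempt_count == 1)
		do_pending();

	preempt_count--;	/* the final dec_preempt_count() */
}

int main(void)
{
	bh_disable();
	bh_enable();
	printf("final preempt_count=%d\n", preempt_count);
	return 0;
}
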
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index 14c7faf02909..50afeb813305 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -36,7 +36,7 @@ static struct notifier_block panic_block = {
36 36
37void touch_softlockup_watchdog(void) 37void touch_softlockup_watchdog(void)
38{ 38{
39 per_cpu(touch_timestamp, raw_smp_processor_id()) = jiffies; 39 __raw_get_cpu_var(touch_timestamp) = jiffies;
40} 40}
41EXPORT_SYMBOL(touch_softlockup_watchdog); 41EXPORT_SYMBOL(touch_softlockup_watchdog);
42 42
@@ -104,7 +104,7 @@ static int watchdog(void * __bind_cpu)
104/* 104/*
105 * Create/destroy watchdog threads as CPUs come and go: 105 * Create/destroy watchdog threads as CPUs come and go:
106 */ 106 */
107static int 107static int __cpuinit
108cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) 108cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
109{ 109{
110 int hotcpu = (unsigned long)hcpu; 110 int hotcpu = (unsigned long)hcpu;
@@ -127,6 +127,8 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
127 break; 127 break;
128#ifdef CONFIG_HOTPLUG_CPU 128#ifdef CONFIG_HOTPLUG_CPU
129 case CPU_UP_CANCELED: 129 case CPU_UP_CANCELED:
130 if (!per_cpu(watchdog_task, hotcpu))
131 break;
130 /* Unbind so it can run. Fall thru. */ 132 /* Unbind so it can run. Fall thru. */
131 kthread_bind(per_cpu(watchdog_task, hotcpu), 133 kthread_bind(per_cpu(watchdog_task, hotcpu),
132 any_online_cpu(cpu_online_map)); 134 any_online_cpu(cpu_online_map));
@@ -140,15 +142,16 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
140 return NOTIFY_OK; 142 return NOTIFY_OK;
141} 143}
142 144
143static struct notifier_block cpu_nfb = { 145static struct notifier_block __cpuinitdata cpu_nfb = {
144 .notifier_call = cpu_callback 146 .notifier_call = cpu_callback
145}; 147};
146 148
147__init void spawn_softlockup_task(void) 149__init void spawn_softlockup_task(void)
148{ 150{
149 void *cpu = (void *)(long)smp_processor_id(); 151 void *cpu = (void *)(long)smp_processor_id();
152 int err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
150 153
151 cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); 154 BUG_ON(err == NOTIFY_BAD);
152 cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); 155 cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
153 register_cpu_notifier(&cpu_nfb); 156 register_cpu_notifier(&cpu_nfb);
154 157
diff --git a/kernel/spinlock.c b/kernel/spinlock.c
index d1b810782bc4..d48143eafbfd 100644
--- a/kernel/spinlock.c
+++ b/kernel/spinlock.c
@@ -7,31 +7,27 @@
7 * 7 *
8 * This file contains the spinlock/rwlock implementations for the 8 * This file contains the spinlock/rwlock implementations for the
9 * SMP and the DEBUG_SPINLOCK cases. (UP-nondebug inlines them) 9 * SMP and the DEBUG_SPINLOCK cases. (UP-nondebug inlines them)
10 *
11 * Note that some architectures have special knowledge about the
12 * stack frames of these functions in their profile_pc. If you
13 * change anything significant here that could change the stack
14 * frame, contact the architecture maintainers.
10 */ 15 */
11 16
12#include <linux/config.h>
13#include <linux/linkage.h> 17#include <linux/linkage.h>
14#include <linux/preempt.h> 18#include <linux/preempt.h>
15#include <linux/spinlock.h> 19#include <linux/spinlock.h>
16#include <linux/interrupt.h> 20#include <linux/interrupt.h>
21#include <linux/debug_locks.h>
17#include <linux/module.h> 22#include <linux/module.h>
18 23
19/*
20 * Generic declaration of the raw read_trylock() function,
21 * architectures are supposed to optimize this:
22 */
23int __lockfunc generic__raw_read_trylock(raw_rwlock_t *lock)
24{
25 __raw_read_lock(lock);
26 return 1;
27}
28EXPORT_SYMBOL(generic__raw_read_trylock);
29
30int __lockfunc _spin_trylock(spinlock_t *lock) 24int __lockfunc _spin_trylock(spinlock_t *lock)
31{ 25{
32 preempt_disable(); 26 preempt_disable();
33 if (_raw_spin_trylock(lock)) 27 if (_raw_spin_trylock(lock)) {
28 spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
34 return 1; 29 return 1;
30 }
35 31
36 preempt_enable(); 32 preempt_enable();
37 return 0; 33 return 0;
@@ -41,8 +37,10 @@ EXPORT_SYMBOL(_spin_trylock);
41int __lockfunc _read_trylock(rwlock_t *lock) 37int __lockfunc _read_trylock(rwlock_t *lock)
42{ 38{
43 preempt_disable(); 39 preempt_disable();
44 if (_raw_read_trylock(lock)) 40 if (_raw_read_trylock(lock)) {
41 rwlock_acquire_read(&lock->dep_map, 0, 1, _RET_IP_);
45 return 1; 42 return 1;
43 }
46 44
47 preempt_enable(); 45 preempt_enable();
48 return 0; 46 return 0;
@@ -52,19 +50,28 @@ EXPORT_SYMBOL(_read_trylock);
52int __lockfunc _write_trylock(rwlock_t *lock) 50int __lockfunc _write_trylock(rwlock_t *lock)
53{ 51{
54 preempt_disable(); 52 preempt_disable();
55 if (_raw_write_trylock(lock)) 53 if (_raw_write_trylock(lock)) {
54 rwlock_acquire(&lock->dep_map, 0, 1, _RET_IP_);
56 return 1; 55 return 1;
56 }
57 57
58 preempt_enable(); 58 preempt_enable();
59 return 0; 59 return 0;
60} 60}
61EXPORT_SYMBOL(_write_trylock); 61EXPORT_SYMBOL(_write_trylock);
62 62
63#if !defined(CONFIG_PREEMPT) || !defined(CONFIG_SMP) 63/*
64 * If lockdep is enabled then we use the non-preemption spin-ops
65 * even on CONFIG_PREEMPT, because lockdep assumes that interrupts are
66 * not re-enabled during lock-acquire (which the preempt-spin-ops do):
67 */
68#if !defined(CONFIG_PREEMPT) || !defined(CONFIG_SMP) || \
69 defined(CONFIG_DEBUG_LOCK_ALLOC)
64 70
65void __lockfunc _read_lock(rwlock_t *lock) 71void __lockfunc _read_lock(rwlock_t *lock)
66{ 72{
67 preempt_disable(); 73 preempt_disable();
74 rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_);
68 _raw_read_lock(lock); 75 _raw_read_lock(lock);
69} 76}
70EXPORT_SYMBOL(_read_lock); 77EXPORT_SYMBOL(_read_lock);
@@ -75,7 +82,17 @@ unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock)
75 82
76 local_irq_save(flags); 83 local_irq_save(flags);
77 preempt_disable(); 84 preempt_disable();
85 spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
86 /*
87 * On lockdep we don't want the hand-coded irq-enable of
88 * _raw_spin_lock_flags() code, because lockdep assumes
89 * that interrupts are not re-enabled during lock-acquire:
90 */
91#ifdef CONFIG_PROVE_LOCKING
92 _raw_spin_lock(lock);
93#else
78 _raw_spin_lock_flags(lock, &flags); 94 _raw_spin_lock_flags(lock, &flags);
95#endif
79 return flags; 96 return flags;
80} 97}
81EXPORT_SYMBOL(_spin_lock_irqsave); 98EXPORT_SYMBOL(_spin_lock_irqsave);
@@ -84,6 +101,7 @@ void __lockfunc _spin_lock_irq(spinlock_t *lock)
84{ 101{
85 local_irq_disable(); 102 local_irq_disable();
86 preempt_disable(); 103 preempt_disable();
104 spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
87 _raw_spin_lock(lock); 105 _raw_spin_lock(lock);
88} 106}
89EXPORT_SYMBOL(_spin_lock_irq); 107EXPORT_SYMBOL(_spin_lock_irq);
@@ -92,6 +110,7 @@ void __lockfunc _spin_lock_bh(spinlock_t *lock)
92{ 110{
93 local_bh_disable(); 111 local_bh_disable();
94 preempt_disable(); 112 preempt_disable();
113 spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
95 _raw_spin_lock(lock); 114 _raw_spin_lock(lock);
96} 115}
97EXPORT_SYMBOL(_spin_lock_bh); 116EXPORT_SYMBOL(_spin_lock_bh);
@@ -102,6 +121,7 @@ unsigned long __lockfunc _read_lock_irqsave(rwlock_t *lock)
102 121
103 local_irq_save(flags); 122 local_irq_save(flags);
104 preempt_disable(); 123 preempt_disable();
124 rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_);
105 _raw_read_lock(lock); 125 _raw_read_lock(lock);
106 return flags; 126 return flags;
107} 127}
@@ -111,6 +131,7 @@ void __lockfunc _read_lock_irq(rwlock_t *lock)
111{ 131{
112 local_irq_disable(); 132 local_irq_disable();
113 preempt_disable(); 133 preempt_disable();
134 rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_);
114 _raw_read_lock(lock); 135 _raw_read_lock(lock);
115} 136}
116EXPORT_SYMBOL(_read_lock_irq); 137EXPORT_SYMBOL(_read_lock_irq);
@@ -119,6 +140,7 @@ void __lockfunc _read_lock_bh(rwlock_t *lock)
119{ 140{
120 local_bh_disable(); 141 local_bh_disable();
121 preempt_disable(); 142 preempt_disable();
143 rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_);
122 _raw_read_lock(lock); 144 _raw_read_lock(lock);
123} 145}
124EXPORT_SYMBOL(_read_lock_bh); 146EXPORT_SYMBOL(_read_lock_bh);
@@ -129,6 +151,7 @@ unsigned long __lockfunc _write_lock_irqsave(rwlock_t *lock)
129 151
130 local_irq_save(flags); 152 local_irq_save(flags);
131 preempt_disable(); 153 preempt_disable();
154 rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_);
132 _raw_write_lock(lock); 155 _raw_write_lock(lock);
133 return flags; 156 return flags;
134} 157}
@@ -138,6 +161,7 @@ void __lockfunc _write_lock_irq(rwlock_t *lock)
138{ 161{
139 local_irq_disable(); 162 local_irq_disable();
140 preempt_disable(); 163 preempt_disable();
164 rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_);
141 _raw_write_lock(lock); 165 _raw_write_lock(lock);
142} 166}
143EXPORT_SYMBOL(_write_lock_irq); 167EXPORT_SYMBOL(_write_lock_irq);
@@ -146,6 +170,7 @@ void __lockfunc _write_lock_bh(rwlock_t *lock)
146{ 170{
147 local_bh_disable(); 171 local_bh_disable();
148 preempt_disable(); 172 preempt_disable();
173 rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_);
149 _raw_write_lock(lock); 174 _raw_write_lock(lock);
150} 175}
151EXPORT_SYMBOL(_write_lock_bh); 176EXPORT_SYMBOL(_write_lock_bh);
@@ -153,6 +178,7 @@ EXPORT_SYMBOL(_write_lock_bh);
153void __lockfunc _spin_lock(spinlock_t *lock) 178void __lockfunc _spin_lock(spinlock_t *lock)
154{ 179{
155 preempt_disable(); 180 preempt_disable();
181 spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
156 _raw_spin_lock(lock); 182 _raw_spin_lock(lock);
157} 183}
158 184
@@ -161,6 +187,7 @@ EXPORT_SYMBOL(_spin_lock);
161void __lockfunc _write_lock(rwlock_t *lock) 187void __lockfunc _write_lock(rwlock_t *lock)
162{ 188{
163 preempt_disable(); 189 preempt_disable();
190 rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_);
164 _raw_write_lock(lock); 191 _raw_write_lock(lock);
165} 192}
166 193
@@ -256,8 +283,22 @@ BUILD_LOCK_OPS(write, rwlock);
256 283
257#endif /* CONFIG_PREEMPT */ 284#endif /* CONFIG_PREEMPT */
258 285
286#ifdef CONFIG_DEBUG_LOCK_ALLOC
287
288void __lockfunc _spin_lock_nested(spinlock_t *lock, int subclass)
289{
290 preempt_disable();
291 spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
292 _raw_spin_lock(lock);
293}
294
295EXPORT_SYMBOL(_spin_lock_nested);
296
297#endif
298
259void __lockfunc _spin_unlock(spinlock_t *lock) 299void __lockfunc _spin_unlock(spinlock_t *lock)
260{ 300{
301 spin_release(&lock->dep_map, 1, _RET_IP_);
261 _raw_spin_unlock(lock); 302 _raw_spin_unlock(lock);
262 preempt_enable(); 303 preempt_enable();
263} 304}
@@ -265,6 +306,7 @@ EXPORT_SYMBOL(_spin_unlock);
265 306
266void __lockfunc _write_unlock(rwlock_t *lock) 307void __lockfunc _write_unlock(rwlock_t *lock)
267{ 308{
309 rwlock_release(&lock->dep_map, 1, _RET_IP_);
268 _raw_write_unlock(lock); 310 _raw_write_unlock(lock);
269 preempt_enable(); 311 preempt_enable();
270} 312}
@@ -272,6 +314,7 @@ EXPORT_SYMBOL(_write_unlock);
272 314
273void __lockfunc _read_unlock(rwlock_t *lock) 315void __lockfunc _read_unlock(rwlock_t *lock)
274{ 316{
317 rwlock_release(&lock->dep_map, 1, _RET_IP_);
275 _raw_read_unlock(lock); 318 _raw_read_unlock(lock);
276 preempt_enable(); 319 preempt_enable();
277} 320}
@@ -279,6 +322,7 @@ EXPORT_SYMBOL(_read_unlock);
279 322
280void __lockfunc _spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags) 323void __lockfunc _spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags)
281{ 324{
325 spin_release(&lock->dep_map, 1, _RET_IP_);
282 _raw_spin_unlock(lock); 326 _raw_spin_unlock(lock);
283 local_irq_restore(flags); 327 local_irq_restore(flags);
284 preempt_enable(); 328 preempt_enable();
@@ -287,6 +331,7 @@ EXPORT_SYMBOL(_spin_unlock_irqrestore);
287 331
288void __lockfunc _spin_unlock_irq(spinlock_t *lock) 332void __lockfunc _spin_unlock_irq(spinlock_t *lock)
289{ 333{
334 spin_release(&lock->dep_map, 1, _RET_IP_);
290 _raw_spin_unlock(lock); 335 _raw_spin_unlock(lock);
291 local_irq_enable(); 336 local_irq_enable();
292 preempt_enable(); 337 preempt_enable();
@@ -295,14 +340,16 @@ EXPORT_SYMBOL(_spin_unlock_irq);
295 340
296void __lockfunc _spin_unlock_bh(spinlock_t *lock) 341void __lockfunc _spin_unlock_bh(spinlock_t *lock)
297{ 342{
343 spin_release(&lock->dep_map, 1, _RET_IP_);
298 _raw_spin_unlock(lock); 344 _raw_spin_unlock(lock);
299 preempt_enable_no_resched(); 345 preempt_enable_no_resched();
300 local_bh_enable(); 346 local_bh_enable_ip((unsigned long)__builtin_return_address(0));
301} 347}
302EXPORT_SYMBOL(_spin_unlock_bh); 348EXPORT_SYMBOL(_spin_unlock_bh);
303 349
304void __lockfunc _read_unlock_irqrestore(rwlock_t *lock, unsigned long flags) 350void __lockfunc _read_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
305{ 351{
352 rwlock_release(&lock->dep_map, 1, _RET_IP_);
306 _raw_read_unlock(lock); 353 _raw_read_unlock(lock);
307 local_irq_restore(flags); 354 local_irq_restore(flags);
308 preempt_enable(); 355 preempt_enable();
@@ -311,6 +358,7 @@ EXPORT_SYMBOL(_read_unlock_irqrestore);
311 358
312void __lockfunc _read_unlock_irq(rwlock_t *lock) 359void __lockfunc _read_unlock_irq(rwlock_t *lock)
313{ 360{
361 rwlock_release(&lock->dep_map, 1, _RET_IP_);
314 _raw_read_unlock(lock); 362 _raw_read_unlock(lock);
315 local_irq_enable(); 363 local_irq_enable();
316 preempt_enable(); 364 preempt_enable();
@@ -319,14 +367,16 @@ EXPORT_SYMBOL(_read_unlock_irq);
319 367
320void __lockfunc _read_unlock_bh(rwlock_t *lock) 368void __lockfunc _read_unlock_bh(rwlock_t *lock)
321{ 369{
370 rwlock_release(&lock->dep_map, 1, _RET_IP_);
322 _raw_read_unlock(lock); 371 _raw_read_unlock(lock);
323 preempt_enable_no_resched(); 372 preempt_enable_no_resched();
324 local_bh_enable(); 373 local_bh_enable_ip((unsigned long)__builtin_return_address(0));
325} 374}
326EXPORT_SYMBOL(_read_unlock_bh); 375EXPORT_SYMBOL(_read_unlock_bh);
327 376
328void __lockfunc _write_unlock_irqrestore(rwlock_t *lock, unsigned long flags) 377void __lockfunc _write_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
329{ 378{
379 rwlock_release(&lock->dep_map, 1, _RET_IP_);
330 _raw_write_unlock(lock); 380 _raw_write_unlock(lock);
331 local_irq_restore(flags); 381 local_irq_restore(flags);
332 preempt_enable(); 382 preempt_enable();
@@ -335,6 +385,7 @@ EXPORT_SYMBOL(_write_unlock_irqrestore);
335 385
336void __lockfunc _write_unlock_irq(rwlock_t *lock) 386void __lockfunc _write_unlock_irq(rwlock_t *lock)
337{ 387{
388 rwlock_release(&lock->dep_map, 1, _RET_IP_);
338 _raw_write_unlock(lock); 389 _raw_write_unlock(lock);
339 local_irq_enable(); 390 local_irq_enable();
340 preempt_enable(); 391 preempt_enable();
@@ -343,9 +394,10 @@ EXPORT_SYMBOL(_write_unlock_irq);
343 394
344void __lockfunc _write_unlock_bh(rwlock_t *lock) 395void __lockfunc _write_unlock_bh(rwlock_t *lock)
345{ 396{
397 rwlock_release(&lock->dep_map, 1, _RET_IP_);
346 _raw_write_unlock(lock); 398 _raw_write_unlock(lock);
347 preempt_enable_no_resched(); 399 preempt_enable_no_resched();
348 local_bh_enable(); 400 local_bh_enable_ip((unsigned long)__builtin_return_address(0));
349} 401}
350EXPORT_SYMBOL(_write_unlock_bh); 402EXPORT_SYMBOL(_write_unlock_bh);
351 403
@@ -353,11 +405,13 @@ int __lockfunc _spin_trylock_bh(spinlock_t *lock)
353{ 405{
354 local_bh_disable(); 406 local_bh_disable();
355 preempt_disable(); 407 preempt_disable();
356 if (_raw_spin_trylock(lock)) 408 if (_raw_spin_trylock(lock)) {
409 spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
357 return 1; 410 return 1;
411 }
358 412
359 preempt_enable_no_resched(); 413 preempt_enable_no_resched();
360 local_bh_enable(); 414 local_bh_enable_ip((unsigned long)__builtin_return_address(0));
361 return 0; 415 return 0;
362} 416}
363EXPORT_SYMBOL(_spin_trylock_bh); 417EXPORT_SYMBOL(_spin_trylock_bh);
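The hunks above thread lockdep's acquire/release hooks through every spinlock and rwlock wrapper and add a _nested variant under CONFIG_DEBUG_LOCK_ALLOC. A hedged sketch of how a caller might use the nested API when it intentionally takes two locks of the same lock class; struct pipe_end and both function names are illustrative, not part of the patch:

#include <linux/spinlock.h>

struct pipe_end {
	spinlock_t lock;
	/* ... payload ... */
};

/* Callers must pass the two ends in a globally consistent order. */
static void lock_both_ends(struct pipe_end *first, struct pipe_end *second)
{
	spin_lock(&first->lock);
	/* Tell lockdep this second acquisition of the same class is intended. */
	spin_lock_nested(&second->lock, SINGLE_DEPTH_NESTING);
}

static void unlock_both_ends(struct pipe_end *first, struct pipe_end *second)
{
	spin_unlock(&second->lock);
	spin_unlock(&first->lock);
}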
diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c
new file mode 100644
index 000000000000..b71816e47a30
--- /dev/null
+++ b/kernel/stacktrace.c
@@ -0,0 +1,24 @@
1/*
2 * kernel/stacktrace.c
3 *
4 * Stack trace management functions
5 *
6 * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
7 */
8#include <linux/sched.h>
9#include <linux/kallsyms.h>
10#include <linux/stacktrace.h>
11
12void print_stack_trace(struct stack_trace *trace, int spaces)
13{
14 int i, j;
15
16 for (i = 0; i < trace->nr_entries; i++) {
17 unsigned long ip = trace->entries[i];
18
19 for (j = 0; j < spaces + 1; j++)
20 printk(" ");
21 print_ip_sym(ip);
22 }
23}
24
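A hedged usage sketch for the new helper: print_stack_trace() only walks trace->entries and symbolizes each address, indented by the given number of spaces. The capture side (save_stack_trace()) is arch-provided and its signature has varied across versions, so this sketch fills the buffer by hand; the function name is illustrative:

#include <linux/kernel.h>
#include <linux/stacktrace.h>

static void dump_current_ip(void)
{
	unsigned long entries[8];
	struct stack_trace trace = {
		.nr_entries	= 0,
		.max_entries	= 8,
		.entries	= entries,
	};

	/* Normally filled by the arch's stack-saving helper. */
	entries[trace.nr_entries++] = _RET_IP_;

	print_stack_trace(&trace, 2);	/* indent each printed symbol */
}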
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index dcfb5d731466..12458040e665 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -1,3 +1,6 @@
1/* Copyright 2005 Rusty Russell rusty@rustcorp.com.au IBM Corporation.
2 * GPL v2 and any later version.
3 */
1#include <linux/stop_machine.h> 4#include <linux/stop_machine.h>
2#include <linux/kthread.h> 5#include <linux/kthread.h>
3#include <linux/sched.h> 6#include <linux/sched.h>
@@ -111,7 +114,6 @@ static int stop_machine(void)
111 /* If some failed, kill them all. */ 114 /* If some failed, kill them all. */
112 if (ret < 0) { 115 if (ret < 0) {
113 stopmachine_set_state(STOPMACHINE_EXIT); 116 stopmachine_set_state(STOPMACHINE_EXIT);
114 up(&stopmachine_mutex);
115 return ret; 117 return ret;
116 } 118 }
117 119
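For context, a hedged sketch of how this file is normally driven: a short function handed to stop_machine_run() executes while every other CPU is parked in the stop-machine threads. flip_global_flag() and example_flip() are illustrative names:

#include <linux/stop_machine.h>
#include <linux/threads.h>

static int flip_global_flag(void *data)
{
	int *flag = data;

	*flag = !*flag;		/* runs while all other CPUs are quiesced */
	return 0;
}

static int example_flip(int *flag)
{
	/* Passing NR_CPUS asks for "any CPU" in this era of the API. */
	return stop_machine_run(flip_global_flag, flag, NR_CPUS);
}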
diff --git a/kernel/sys.c b/kernel/sys.c
index 90930b28d2ca..b88806c66244 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -4,7 +4,6 @@
4 * Copyright (C) 1991, 1992 Linus Torvalds 4 * Copyright (C) 1991, 1992 Linus Torvalds
5 */ 5 */
6 6
7#include <linux/config.h>
8#include <linux/module.h> 7#include <linux/module.h>
9#include <linux/mm.h> 8#include <linux/mm.h>
10#include <linux/utsname.h> 9#include <linux/utsname.h>
@@ -29,6 +28,7 @@
29#include <linux/tty.h> 28#include <linux/tty.h>
30#include <linux/signal.h> 29#include <linux/signal.h>
31#include <linux/cn_proc.h> 30#include <linux/cn_proc.h>
31#include <linux/getcpu.h>
32 32
33#include <linux/compat.h> 33#include <linux/compat.h>
34#include <linux/syscalls.h> 34#include <linux/syscalls.h>
@@ -137,14 +137,15 @@ static int __kprobes notifier_call_chain(struct notifier_block **nl,
137 unsigned long val, void *v) 137 unsigned long val, void *v)
138{ 138{
139 int ret = NOTIFY_DONE; 139 int ret = NOTIFY_DONE;
140 struct notifier_block *nb; 140 struct notifier_block *nb, *next_nb;
141 141
142 nb = rcu_dereference(*nl); 142 nb = rcu_dereference(*nl);
143 while (nb) { 143 while (nb) {
144 next_nb = rcu_dereference(nb->next);
144 ret = nb->notifier_call(nb, val, v); 145 ret = nb->notifier_call(nb, val, v);
145 if ((ret & NOTIFY_STOP_MASK) == NOTIFY_STOP_MASK) 146 if ((ret & NOTIFY_STOP_MASK) == NOTIFY_STOP_MASK)
146 break; 147 break;
147 nb = rcu_dereference(nb->next); 148 nb = next_nb;
148 } 149 }
149 return ret; 150 return ret;
150} 151}
@@ -588,7 +589,7 @@ void emergency_restart(void)
588} 589}
589EXPORT_SYMBOL_GPL(emergency_restart); 590EXPORT_SYMBOL_GPL(emergency_restart);
590 591
591void kernel_restart_prepare(char *cmd) 592static void kernel_restart_prepare(char *cmd)
592{ 593{
593 blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd); 594 blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd);
594 system_state = SYSTEM_RESTART; 595 system_state = SYSTEM_RESTART;
@@ -611,7 +612,6 @@ void kernel_restart(char *cmd)
611 } else { 612 } else {
612 printk(KERN_EMERG "Restarting system with command '%s'.\n", cmd); 613 printk(KERN_EMERG "Restarting system with command '%s'.\n", cmd);
613 } 614 }
614 printk(".\n");
615 machine_restart(cmd); 615 machine_restart(cmd);
616} 616}
617EXPORT_SYMBOL_GPL(kernel_restart); 617EXPORT_SYMBOL_GPL(kernel_restart);
@@ -622,7 +622,7 @@ EXPORT_SYMBOL_GPL(kernel_restart);
622 * Move into place and start executing a preloaded standalone 622 * Move into place and start executing a preloaded standalone
623 * executable. If nothing was preloaded return an error. 623 * executable. If nothing was preloaded return an error.
624 */ 624 */
625void kernel_kexec(void) 625static void kernel_kexec(void)
626{ 626{
627#ifdef CONFIG_KEXEC 627#ifdef CONFIG_KEXEC
628 struct kimage *image; 628 struct kimage *image;
@@ -636,7 +636,6 @@ void kernel_kexec(void)
636 machine_kexec(image); 636 machine_kexec(image);
637#endif 637#endif
638} 638}
639EXPORT_SYMBOL_GPL(kernel_kexec);
640 639
641void kernel_shutdown_prepare(enum system_states state) 640void kernel_shutdown_prepare(enum system_states state)
642{ 641{
@@ -1984,7 +1983,7 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
1984 error = current->mm->dumpable; 1983 error = current->mm->dumpable;
1985 break; 1984 break;
1986 case PR_SET_DUMPABLE: 1985 case PR_SET_DUMPABLE:
1987 if (arg2 < 0 || arg2 > 2) { 1986 if (arg2 < 0 || arg2 > 1) {
1988 error = -EINVAL; 1987 error = -EINVAL;
1989 break; 1988 break;
1990 } 1989 }
@@ -2063,3 +2062,33 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
2063 } 2062 }
2064 return error; 2063 return error;
2065} 2064}
2065
2066asmlinkage long sys_getcpu(unsigned __user *cpup, unsigned __user *nodep,
2067 struct getcpu_cache __user *cache)
2068{
2069 int err = 0;
2070 int cpu = raw_smp_processor_id();
2071 if (cpup)
2072 err |= put_user(cpu, cpup);
2073 if (nodep)
2074 err |= put_user(cpu_to_node(cpu), nodep);
2075 if (cache) {
2076 /*
2077 * The cache is not needed for this implementation,
2078 * but make sure user programs pass something
2079 * valid. vsyscall implementations can instead make
2080 * good use of the cache. Only use t0 and t1 because
2081 * these are available in both 32bit and 64bit ABI (no
2082 * need for a compat_getcpu). 32bit has enough
2083 * padding
2084 */
2085 unsigned long t0, t1;
2086 get_user(t0, &cache->blob[0]);
2087 get_user(t1, &cache->blob[1]);
2088 t0++;
2089 t1++;
2090 put_user(t0, &cache->blob[0]);
2091 put_user(t1, &cache->blob[1]);
2092 }
2093 return err ? -EFAULT : 0;
2094}
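A hedged userspace sketch exercising the new getcpu() system call directly; __NR_getcpu is assumed to be exported by the installed kernel headers (a libc wrapper came later), and the third argument is the optional getcpu_cache, left NULL here as the comment above permits:

#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>

int main(void)
{
	unsigned cpu, node;

	if (syscall(__NR_getcpu, &cpu, &node, NULL) != 0) {
		perror("getcpu");
		return 1;
	}
	printf("running on cpu %u, node %u\n", cpu, node);
	return 0;
}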
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index eb8bd214e7d7..c57c4532e296 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -18,7 +18,6 @@
18 * Removed it and replaced it with older style, 03/23/00, Bill Wendling 18 * Removed it and replaced it with older style, 03/23/00, Bill Wendling
19 */ 19 */
20 20
21#include <linux/config.h>
22#include <linux/module.h> 21#include <linux/module.h>
23#include <linux/mm.h> 22#include <linux/mm.h>
24#include <linux/swap.h> 23#include <linux/swap.h>
@@ -53,6 +52,10 @@
53extern int proc_nr_files(ctl_table *table, int write, struct file *filp, 52extern int proc_nr_files(ctl_table *table, int write, struct file *filp,
54 void __user *buffer, size_t *lenp, loff_t *ppos); 53 void __user *buffer, size_t *lenp, loff_t *ppos);
55 54
55#ifdef CONFIG_X86
56#include <asm/nmi.h>
57#endif
58
56#if defined(CONFIG_SYSCTL) 59#if defined(CONFIG_SYSCTL)
57 60
58/* External variables not in a header file. */ 61/* External variables not in a header file. */
@@ -73,12 +76,7 @@ extern int printk_ratelimit_burst;
73extern int pid_max_min, pid_max_max; 76extern int pid_max_min, pid_max_max;
74extern int sysctl_drop_caches; 77extern int sysctl_drop_caches;
75extern int percpu_pagelist_fraction; 78extern int percpu_pagelist_fraction;
76 79extern int compat_log;
77#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
78int unknown_nmi_panic;
79extern int proc_unknown_nmi_panic(ctl_table *, int, struct file *,
80 void __user *, size_t *, loff_t *);
81#endif
82 80
83/* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ 81/* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */
84static int maxolduid = 65535; 82static int maxolduid = 65535;
@@ -132,8 +130,15 @@ extern int acct_parm[];
132extern int no_unaligned_warning; 130extern int no_unaligned_warning;
133#endif 131#endif
134 132
135static int parse_table(int __user *, int, void __user *, size_t __user *, void __user *, size_t, 133#ifdef CONFIG_RT_MUTEXES
136 ctl_table *, void **); 134extern int max_lock_depth;
135#endif
136
137#ifdef CONFIG_SYSCTL_SYSCALL
138static int parse_table(int __user *, int, void __user *, size_t __user *,
139 void __user *, size_t, ctl_table *, void **);
140#endif
141
137static int proc_doutsstring(ctl_table *table, int write, struct file *filp, 142static int proc_doutsstring(ctl_table *table, int write, struct file *filp,
138 void __user *buffer, size_t *lenp, loff_t *ppos); 143 void __user *buffer, size_t *lenp, loff_t *ppos);
139 144
@@ -143,7 +148,6 @@ static struct ctl_table_header root_table_header =
143 148
144static ctl_table kern_table[]; 149static ctl_table kern_table[];
145static ctl_table vm_table[]; 150static ctl_table vm_table[];
146static ctl_table proc_table[];
147static ctl_table fs_table[]; 151static ctl_table fs_table[];
148static ctl_table debug_table[]; 152static ctl_table debug_table[];
149static ctl_table dev_table[]; 153static ctl_table dev_table[];
@@ -161,7 +165,7 @@ int sysctl_legacy_va_layout;
161 165
162/* /proc declarations: */ 166/* /proc declarations: */
163 167
164#ifdef CONFIG_PROC_FS 168#ifdef CONFIG_PROC_SYSCTL
165 169
166static ssize_t proc_readsys(struct file *, char __user *, size_t, loff_t *); 170static ssize_t proc_readsys(struct file *, char __user *, size_t, loff_t *);
167static ssize_t proc_writesys(struct file *, const char __user *, size_t, loff_t *); 171static ssize_t proc_writesys(struct file *, const char __user *, size_t, loff_t *);
@@ -203,12 +207,6 @@ static ctl_table root_table[] = {
203 }, 207 },
204#endif 208#endif
205 { 209 {
206 .ctl_name = CTL_PROC,
207 .procname = "proc",
208 .mode = 0555,
209 .child = proc_table,
210 },
211 {
212 .ctl_name = CTL_FS, 210 .ctl_name = CTL_FS,
213 .procname = "fs", 211 .procname = "fs",
214 .mode = 0555, 212 .mode = 0555,
@@ -631,11 +629,27 @@ static ctl_table kern_table[] = {
631 .data = &unknown_nmi_panic, 629 .data = &unknown_nmi_panic,
632 .maxlen = sizeof (int), 630 .maxlen = sizeof (int),
633 .mode = 0644, 631 .mode = 0644,
634 .proc_handler = &proc_unknown_nmi_panic, 632 .proc_handler = &proc_dointvec,
633 },
634 {
635 .ctl_name = KERN_NMI_WATCHDOG,
636 .procname = "nmi_watchdog",
637 .data = &nmi_watchdog_enabled,
638 .maxlen = sizeof (int),
639 .mode = 0644,
640 .proc_handler = &proc_nmi_enabled,
635 }, 641 },
636#endif 642#endif
637#if defined(CONFIG_X86) 643#if defined(CONFIG_X86)
638 { 644 {
645 .ctl_name = KERN_PANIC_ON_NMI,
646 .procname = "panic_on_unrecovered_nmi",
647 .data = &panic_on_unrecovered_nmi,
648 .maxlen = sizeof(int),
649 .mode = 0644,
650 .proc_handler = &proc_dointvec,
651 },
652 {
639 .ctl_name = KERN_BOOTLOADER_TYPE, 653 .ctl_name = KERN_BOOTLOADER_TYPE,
640 .procname = "bootloader_type", 654 .procname = "bootloader_type",
641 .data = &bootloader_type, 655 .data = &bootloader_type,
@@ -684,6 +698,27 @@ static ctl_table kern_table[] = {
684 .proc_handler = &proc_dointvec, 698 .proc_handler = &proc_dointvec,
685 }, 699 },
686#endif 700#endif
701#ifdef CONFIG_COMPAT
702 {
703 .ctl_name = KERN_COMPAT_LOG,
704 .procname = "compat-log",
705 .data = &compat_log,
706 .maxlen = sizeof (int),
707 .mode = 0644,
708 .proc_handler = &proc_dointvec,
709 },
710#endif
711#ifdef CONFIG_RT_MUTEXES
712 {
713 .ctl_name = KERN_MAX_LOCK_DEPTH,
714 .procname = "max_lock_depth",
715 .data = &max_lock_depth,
716 .maxlen = sizeof(int),
717 .mode = 0644,
718 .proc_handler = &proc_dointvec,
719 },
720#endif
721
687 { .ctl_name = 0 } 722 { .ctl_name = 0 }
688}; 723};
689 724
@@ -915,19 +950,40 @@ static ctl_table vm_table[] = {
915 .extra1 = &zero, 950 .extra1 = &zero,
916 }, 951 },
917 { 952 {
918 .ctl_name = VM_ZONE_RECLAIM_INTERVAL, 953 .ctl_name = VM_MIN_UNMAPPED,
919 .procname = "zone_reclaim_interval", 954 .procname = "min_unmapped_ratio",
920 .data = &zone_reclaim_interval, 955 .data = &sysctl_min_unmapped_ratio,
921 .maxlen = sizeof(zone_reclaim_interval), 956 .maxlen = sizeof(sysctl_min_unmapped_ratio),
922 .mode = 0644, 957 .mode = 0644,
923 .proc_handler = &proc_dointvec_jiffies, 958 .proc_handler = &sysctl_min_unmapped_ratio_sysctl_handler,
924 .strategy = &sysctl_jiffies, 959 .strategy = &sysctl_intvec,
960 .extra1 = &zero,
961 .extra2 = &one_hundred,
962 },
963 {
964 .ctl_name = VM_MIN_SLAB,
965 .procname = "min_slab_ratio",
966 .data = &sysctl_min_slab_ratio,
967 .maxlen = sizeof(sysctl_min_slab_ratio),
968 .mode = 0644,
969 .proc_handler = &sysctl_min_slab_ratio_sysctl_handler,
970 .strategy = &sysctl_intvec,
971 .extra1 = &zero,
972 .extra2 = &one_hundred,
973 },
974#endif
975#ifdef CONFIG_X86_32
976 {
977 .ctl_name = VM_VDSO_ENABLED,
978 .procname = "vdso_enabled",
979 .data = &vdso_enabled,
980 .maxlen = sizeof(vdso_enabled),
981 .mode = 0644,
982 .proc_handler = &proc_dointvec,
983 .strategy = &sysctl_intvec,
984 .extra1 = &zero,
925 }, 985 },
926#endif 986#endif
927 { .ctl_name = 0 }
928};
929
930static ctl_table proc_table[] = {
931 { .ctl_name = 0 } 987 { .ctl_name = 0 }
932}; 988};
933 989
@@ -1110,12 +1166,13 @@ static void start_unregistering(struct ctl_table_header *p)
1110 1166
1111void __init sysctl_init(void) 1167void __init sysctl_init(void)
1112{ 1168{
1113#ifdef CONFIG_PROC_FS 1169#ifdef CONFIG_PROC_SYSCTL
1114 register_proc_table(root_table, proc_sys_root, &root_table_header); 1170 register_proc_table(root_table, proc_sys_root, &root_table_header);
1115 init_irq_proc(); 1171 init_irq_proc();
1116#endif 1172#endif
1117} 1173}
1118 1174
1175#ifdef CONFIG_SYSCTL_SYSCALL
1119int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp, 1176int do_sysctl(int __user *name, int nlen, void __user *oldval, size_t __user *oldlenp,
1120 void __user *newval, size_t newlen) 1177 void __user *newval, size_t newlen)
1121{ 1178{
@@ -1169,6 +1226,7 @@ asmlinkage long sys_sysctl(struct __sysctl_args __user *args)
1169 unlock_kernel(); 1226 unlock_kernel();
1170 return error; 1227 return error;
1171} 1228}
1229#endif /* CONFIG_SYSCTL_SYSCALL */
1172 1230
1173/* 1231/*
1174 * ctl_perm does NOT grant the superuser all rights automatically, because 1232 * ctl_perm does NOT grant the superuser all rights automatically, because
@@ -1195,6 +1253,7 @@ static inline int ctl_perm(ctl_table *table, int op)
1195 return test_perm(table->mode, op); 1253 return test_perm(table->mode, op);
1196} 1254}
1197 1255
1256#ifdef CONFIG_SYSCTL_SYSCALL
1198static int parse_table(int __user *name, int nlen, 1257static int parse_table(int __user *name, int nlen,
1199 void __user *oldval, size_t __user *oldlenp, 1258 void __user *oldval, size_t __user *oldlenp,
1200 void __user *newval, size_t newlen, 1259 void __user *newval, size_t newlen,
@@ -1284,6 +1343,7 @@ int do_sysctl_strategy (ctl_table *table,
1284 } 1343 }
1285 return 0; 1344 return 0;
1286} 1345}
1346#endif /* CONFIG_SYSCTL_SYSCALL */
1287 1347
1288/** 1348/**
1289 * register_sysctl_table - register a sysctl hierarchy 1349 * register_sysctl_table - register a sysctl hierarchy
@@ -1371,7 +1431,7 @@ struct ctl_table_header *register_sysctl_table(ctl_table * table,
1371 else 1431 else
1372 list_add_tail(&tmp->ctl_entry, &root_table_header.ctl_entry); 1432 list_add_tail(&tmp->ctl_entry, &root_table_header.ctl_entry);
1373 spin_unlock(&sysctl_lock); 1433 spin_unlock(&sysctl_lock);
1374#ifdef CONFIG_PROC_FS 1434#ifdef CONFIG_PROC_SYSCTL
1375 register_proc_table(table, proc_sys_root, tmp); 1435 register_proc_table(table, proc_sys_root, tmp);
1376#endif 1436#endif
1377 return tmp; 1437 return tmp;
@@ -1389,18 +1449,31 @@ void unregister_sysctl_table(struct ctl_table_header * header)
1389 might_sleep(); 1449 might_sleep();
1390 spin_lock(&sysctl_lock); 1450 spin_lock(&sysctl_lock);
1391 start_unregistering(header); 1451 start_unregistering(header);
1392#ifdef CONFIG_PROC_FS 1452#ifdef CONFIG_PROC_SYSCTL
1393 unregister_proc_table(header->ctl_table, proc_sys_root); 1453 unregister_proc_table(header->ctl_table, proc_sys_root);
1394#endif 1454#endif
1395 spin_unlock(&sysctl_lock); 1455 spin_unlock(&sysctl_lock);
1396 kfree(header); 1456 kfree(header);
1397} 1457}
1398 1458
1459#else /* !CONFIG_SYSCTL */
1460struct ctl_table_header * register_sysctl_table(ctl_table * table,
1461 int insert_at_head)
1462{
1463 return NULL;
1464}
1465
1466void unregister_sysctl_table(struct ctl_table_header * table)
1467{
1468}
1469
1470#endif /* CONFIG_SYSCTL */
1471
1399/* 1472/*
1400 * /proc/sys support 1473 * /proc/sys support
1401 */ 1474 */
1402 1475
1403#ifdef CONFIG_PROC_FS 1476#ifdef CONFIG_PROC_SYSCTL
1404 1477
1405/* Scan the sysctl entries in table and add them all into /proc */ 1478/* Scan the sysctl entries in table and add them all into /proc */
1406static void register_proc_table(ctl_table * table, struct proc_dir_entry *root, void *set) 1479static void register_proc_table(ctl_table * table, struct proc_dir_entry *root, void *set)
@@ -1839,7 +1912,7 @@ int proc_dointvec_bset(ctl_table *table, int write, struct file *filp,
1839 return -EPERM; 1912 return -EPERM;
1840 } 1913 }
1841 1914
1842 op = (current->pid == 1) ? OP_SET : OP_AND; 1915 op = is_init(current) ? OP_SET : OP_AND;
1843 return do_proc_dointvec(table,write,filp,buffer,lenp,ppos, 1916 return do_proc_dointvec(table,write,filp,buffer,lenp,ppos,
1844 do_proc_dointvec_bset_conv,&op); 1917 do_proc_dointvec_bset_conv,&op);
1845} 1918}
@@ -2262,6 +2335,7 @@ int proc_doulongvec_ms_jiffies_minmax(ctl_table *table, int write,
2262#endif /* CONFIG_PROC_FS */ 2335#endif /* CONFIG_PROC_FS */
2263 2336
2264 2337
2338#ifdef CONFIG_SYSCTL_SYSCALL
2265/* 2339/*
2266 * General sysctl support routines 2340 * General sysctl support routines
2267 */ 2341 */
@@ -2404,11 +2478,19 @@ int sysctl_ms_jiffies(ctl_table *table, int __user *name, int nlen,
2404 return 1; 2478 return 1;
2405} 2479}
2406 2480
2407#else /* CONFIG_SYSCTL */ 2481#else /* CONFIG_SYSCTL_SYSCALL */
2408 2482
2409 2483
2410asmlinkage long sys_sysctl(struct __sysctl_args __user *args) 2484asmlinkage long sys_sysctl(struct __sysctl_args __user *args)
2411{ 2485{
2486 static int msg_count;
2487
2488 if (msg_count < 5) {
2489 msg_count++;
2490 printk(KERN_INFO
2491 "warning: process `%s' used the removed sysctl "
2492 "system call\n", current->comm);
2493 }
2412 return -ENOSYS; 2494 return -ENOSYS;
2413} 2495}
2414 2496
@@ -2440,73 +2522,7 @@ int sysctl_ms_jiffies(ctl_table *table, int __user *name, int nlen,
2440 return -ENOSYS; 2522 return -ENOSYS;
2441} 2523}
2442 2524
2443int proc_dostring(ctl_table *table, int write, struct file *filp, 2525#endif /* CONFIG_SYSCTL_SYSCALL */
2444 void __user *buffer, size_t *lenp, loff_t *ppos)
2445{
2446 return -ENOSYS;
2447}
2448
2449int proc_dointvec(ctl_table *table, int write, struct file *filp,
2450 void __user *buffer, size_t *lenp, loff_t *ppos)
2451{
2452 return -ENOSYS;
2453}
2454
2455int proc_dointvec_bset(ctl_table *table, int write, struct file *filp,
2456 void __user *buffer, size_t *lenp, loff_t *ppos)
2457{
2458 return -ENOSYS;
2459}
2460
2461int proc_dointvec_minmax(ctl_table *table, int write, struct file *filp,
2462 void __user *buffer, size_t *lenp, loff_t *ppos)
2463{
2464 return -ENOSYS;
2465}
2466
2467int proc_dointvec_jiffies(ctl_table *table, int write, struct file *filp,
2468 void __user *buffer, size_t *lenp, loff_t *ppos)
2469{
2470 return -ENOSYS;
2471}
2472
2473int proc_dointvec_userhz_jiffies(ctl_table *table, int write, struct file *filp,
2474 void __user *buffer, size_t *lenp, loff_t *ppos)
2475{
2476 return -ENOSYS;
2477}
2478
2479int proc_dointvec_ms_jiffies(ctl_table *table, int write, struct file *filp,
2480 void __user *buffer, size_t *lenp, loff_t *ppos)
2481{
2482 return -ENOSYS;
2483}
2484
2485int proc_doulongvec_minmax(ctl_table *table, int write, struct file *filp,
2486 void __user *buffer, size_t *lenp, loff_t *ppos)
2487{
2488 return -ENOSYS;
2489}
2490
2491int proc_doulongvec_ms_jiffies_minmax(ctl_table *table, int write,
2492 struct file *filp,
2493 void __user *buffer,
2494 size_t *lenp, loff_t *ppos)
2495{
2496 return -ENOSYS;
2497}
2498
2499struct ctl_table_header * register_sysctl_table(ctl_table * table,
2500 int insert_at_head)
2501{
2502 return NULL;
2503}
2504
2505void unregister_sysctl_table(struct ctl_table_header * table)
2506{
2507}
2508
2509#endif /* CONFIG_SYSCTL */
2510 2526
2511/* 2527/*
2512 * No sense putting this after each symbol definition, twice, 2528 * No sense putting this after each symbol definition, twice,
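Since the plumbing above keeps the two-argument register_sysctl_table() (which now returns NULL when CONFIG_SYSCTL is off), here is a hedged sketch of a module registering its own entry under /proc/sys/kernel; the names, the .ctl_name value, and the module boilerplate are illustrative only:

#include <linux/sysctl.h>
#include <linux/module.h>

static int example_value;

static ctl_table example_table[] = {
	{
		.ctl_name	= 1234,		/* placeholder: a real module picks an unused id */
		.procname	= "example_value",
		.data		= &example_value,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= &proc_dointvec,
	},
	{ .ctl_name = 0 }
};

static ctl_table example_root[] = {
	{
		.ctl_name	= CTL_KERN,
		.procname	= "kernel",
		.mode		= 0555,
		.child		= example_table,
	},
	{ .ctl_name = 0 }
};

static struct ctl_table_header *example_header;

static int __init example_sysctl_init(void)
{
	example_header = register_sysctl_table(example_root, 0);
	return example_header ? 0 : -ENOMEM;
}

static void __exit example_sysctl_exit(void)
{
	if (example_header)
		unregister_sysctl_table(example_header);
}

module_init(example_sysctl_init);
module_exit(example_sysctl_exit);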
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
new file mode 100644
index 000000000000..2ed4040d0dc5
--- /dev/null
+++ b/kernel/taskstats.c
@@ -0,0 +1,564 @@
1/*
2 * taskstats.c - Export per-task statistics to userland
3 *
4 * Copyright (C) Shailabh Nagar, IBM Corp. 2006
5 * (C) Balbir Singh, IBM Corp. 2006
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 */
18
19#include <linux/kernel.h>
20#include <linux/taskstats_kern.h>
21#include <linux/delayacct.h>
22#include <linux/cpumask.h>
23#include <linux/percpu.h>
24#include <net/genetlink.h>
25#include <asm/atomic.h>
26
27/*
28 * Maximum length of a cpumask that can be specified in
29 * the TASKSTATS_CMD_ATTR_REGISTER/DEREGISTER_CPUMASK attribute
30 */
31#define TASKSTATS_CPUMASK_MAXLEN (100+6*NR_CPUS)
32
33static DEFINE_PER_CPU(__u32, taskstats_seqnum) = { 0 };
34static int family_registered;
35kmem_cache_t *taskstats_cache;
36
37static struct genl_family family = {
38 .id = GENL_ID_GENERATE,
39 .name = TASKSTATS_GENL_NAME,
40 .version = TASKSTATS_GENL_VERSION,
41 .maxattr = TASKSTATS_CMD_ATTR_MAX,
42};
43
44static struct nla_policy taskstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1]
45__read_mostly = {
46 [TASKSTATS_CMD_ATTR_PID] = { .type = NLA_U32 },
47 [TASKSTATS_CMD_ATTR_TGID] = { .type = NLA_U32 },
48 [TASKSTATS_CMD_ATTR_REGISTER_CPUMASK] = { .type = NLA_STRING },
49 [TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK] = { .type = NLA_STRING },};
50
51struct listener {
52 struct list_head list;
53 pid_t pid;
54 char valid;
55};
56
57struct listener_list {
58 struct rw_semaphore sem;
59 struct list_head list;
60};
61static DEFINE_PER_CPU(struct listener_list, listener_array);
62
63enum actions {
64 REGISTER,
65 DEREGISTER,
66 CPU_DONT_CARE
67};
68
69static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp,
70 void **replyp, size_t size)
71{
72 struct sk_buff *skb;
73 void *reply;
74
75 /*
76 * If new attributes are added, please revisit this allocation
77 */
78 skb = nlmsg_new(size, GFP_KERNEL);
79 if (!skb)
80 return -ENOMEM;
81
82 if (!info) {
83 int seq = get_cpu_var(taskstats_seqnum)++;
84 put_cpu_var(taskstats_seqnum);
85
86 reply = genlmsg_put(skb, 0, seq,
87 family.id, 0, 0,
88 cmd, family.version);
89 } else
90 reply = genlmsg_put(skb, info->snd_pid, info->snd_seq,
91 family.id, 0, 0,
92 cmd, family.version);
93 if (reply == NULL) {
94 nlmsg_free(skb);
95 return -EINVAL;
96 }
97
98 *skbp = skb;
99 *replyp = reply;
100 return 0;
101}
102
103/*
104 * Send taskstats data in @skb to listener with nl_pid @pid
105 */
106static int send_reply(struct sk_buff *skb, pid_t pid)
107{
108 struct genlmsghdr *genlhdr = nlmsg_data((struct nlmsghdr *)skb->data);
109 void *reply = genlmsg_data(genlhdr);
110 int rc;
111
112 rc = genlmsg_end(skb, reply);
113 if (rc < 0) {
114 nlmsg_free(skb);
115 return rc;
116 }
117
118 return genlmsg_unicast(skb, pid);
119}
120
121/*
122 * Send taskstats data in @skb to listeners registered for @cpu's exit data
123 */
124static void send_cpu_listeners(struct sk_buff *skb, unsigned int cpu)
125{
126 struct genlmsghdr *genlhdr = nlmsg_data((struct nlmsghdr *)skb->data);
127 struct listener_list *listeners;
128 struct listener *s, *tmp;
129 struct sk_buff *skb_next, *skb_cur = skb;
130 void *reply = genlmsg_data(genlhdr);
131 int rc, delcount = 0;
132
133 rc = genlmsg_end(skb, reply);
134 if (rc < 0) {
135 nlmsg_free(skb);
136 return;
137 }
138
139 rc = 0;
140 listeners = &per_cpu(listener_array, cpu);
141 down_read(&listeners->sem);
142 list_for_each_entry(s, &listeners->list, list) {
143 skb_next = NULL;
144 if (!list_is_last(&s->list, &listeners->list)) {
145 skb_next = skb_clone(skb_cur, GFP_KERNEL);
146 if (!skb_next)
147 break;
148 }
149 rc = genlmsg_unicast(skb_cur, s->pid);
150 if (rc == -ECONNREFUSED) {
151 s->valid = 0;
152 delcount++;
153 }
154 skb_cur = skb_next;
155 }
156 up_read(&listeners->sem);
157
158 if (skb_cur)
159 nlmsg_free(skb_cur);
160
161 if (!delcount)
162 return;
163
164 /* Delete invalidated entries */
165 down_write(&listeners->sem);
166 list_for_each_entry_safe(s, tmp, &listeners->list, list) {
167 if (!s->valid) {
168 list_del(&s->list);
169 kfree(s);
170 }
171 }
172 up_write(&listeners->sem);
173}
174
175static int fill_pid(pid_t pid, struct task_struct *pidtsk,
176 struct taskstats *stats)
177{
178 int rc = 0;
179 struct task_struct *tsk = pidtsk;
180
181 if (!pidtsk) {
182 read_lock(&tasklist_lock);
183 tsk = find_task_by_pid(pid);
184 if (!tsk) {
185 read_unlock(&tasklist_lock);
186 return -ESRCH;
187 }
188 get_task_struct(tsk);
189 read_unlock(&tasklist_lock);
190 } else
191 get_task_struct(tsk);
192
193 /*
194 * Each accounting subsystem adds calls to its functions to
195 * fill in relevant parts of struct taskstats as follows

196 *
197 * per-task-foo(stats, tsk);
198 */
199
200 delayacct_add_tsk(stats, tsk);
201 stats->version = TASKSTATS_VERSION;
202
203 /* Define err: label here if needed */
204 put_task_struct(tsk);
205 return rc;
206
207}
208
209static int fill_tgid(pid_t tgid, struct task_struct *tgidtsk,
210 struct taskstats *stats)
211{
212 struct task_struct *tsk, *first;
213 unsigned long flags;
214
215 /*
216 * Add additional stats from live tasks except zombie thread group
217 * leaders who are already counted with the dead tasks
218 */
219 first = tgidtsk;
220 if (!first) {
221 read_lock(&tasklist_lock);
222 first = find_task_by_pid(tgid);
223 if (!first) {
224 read_unlock(&tasklist_lock);
225 return -ESRCH;
226 }
227 get_task_struct(first);
228 read_unlock(&tasklist_lock);
229 } else
230 get_task_struct(first);
231
232 /* Start with stats from dead tasks */
233 spin_lock_irqsave(&first->signal->stats_lock, flags);
234 if (first->signal->stats)
235 memcpy(stats, first->signal->stats, sizeof(*stats));
236 spin_unlock_irqrestore(&first->signal->stats_lock, flags);
237
238 tsk = first;
239 read_lock(&tasklist_lock);
240 do {
241 if (tsk->exit_state == EXIT_ZOMBIE && thread_group_leader(tsk))
242 continue;
243 /*
244 * Accounting subsystem can call its functions here to
245 * fill in relevant parts of struct taskstats as follows
246 *
247 * per-task-foo(stats, tsk);
248 */
249 delayacct_add_tsk(stats, tsk);
250
251 } while_each_thread(first, tsk);
252 read_unlock(&tasklist_lock);
253 stats->version = TASKSTATS_VERSION;
254
255 /*
256 * Accounting subsystems can also add calls here to modify
257 * fields of taskstats.
258 */
259
260 return 0;
261}
262
263
264static void fill_tgid_exit(struct task_struct *tsk)
265{
266 unsigned long flags;
267
268 spin_lock_irqsave(&tsk->signal->stats_lock, flags);
269 if (!tsk->signal->stats)
270 goto ret;
271
272 /*
273 * Each accounting subsystem calls its functions here to
274 * accumulate its per-task stats for tsk, into the per-tgid structure
275 *
276 * per-task-foo(tsk->signal->stats, tsk);
277 */
278 delayacct_add_tsk(tsk->signal->stats, tsk);
279ret:
280 spin_unlock_irqrestore(&tsk->signal->stats_lock, flags);
281 return;
282}
283
284static int add_del_listener(pid_t pid, cpumask_t *maskp, int isadd)
285{
286 struct listener_list *listeners;
287 struct listener *s, *tmp;
288 unsigned int cpu;
289 cpumask_t mask = *maskp;
290
291 if (!cpus_subset(mask, cpu_possible_map))
292 return -EINVAL;
293
294 if (isadd == REGISTER) {
295 for_each_cpu_mask(cpu, mask) {
296 s = kmalloc_node(sizeof(struct listener), GFP_KERNEL,
297 cpu_to_node(cpu));
298 if (!s)
299 goto cleanup;
300 s->pid = pid;
301 INIT_LIST_HEAD(&s->list);
302 s->valid = 1;
303
304 listeners = &per_cpu(listener_array, cpu);
305 down_write(&listeners->sem);
306 list_add(&s->list, &listeners->list);
307 up_write(&listeners->sem);
308 }
309 return 0;
310 }
311
312 /* Deregister or cleanup */
313cleanup:
314 for_each_cpu_mask(cpu, mask) {
315 listeners = &per_cpu(listener_array, cpu);
316 down_write(&listeners->sem);
317 list_for_each_entry_safe(s, tmp, &listeners->list, list) {
318 if (s->pid == pid) {
319 list_del(&s->list);
320 kfree(s);
321 break;
322 }
323 }
324 up_write(&listeners->sem);
325 }
326 return 0;
327}
328
329static int parse(struct nlattr *na, cpumask_t *mask)
330{
331 char *data;
332 int len;
333 int ret;
334
335 if (na == NULL)
336 return 1;
337 len = nla_len(na);
338 if (len > TASKSTATS_CPUMASK_MAXLEN)
339 return -E2BIG;
340 if (len < 1)
341 return -EINVAL;
342 data = kmalloc(len, GFP_KERNEL);
343 if (!data)
344 return -ENOMEM;
345 nla_strlcpy(data, na, len);
346 ret = cpulist_parse(data, *mask);
347 kfree(data);
348 return ret;
349}
350
351static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
352{
353 int rc = 0;
354 struct sk_buff *rep_skb;
355 struct taskstats stats;
356 void *reply;
357 size_t size;
358 struct nlattr *na;
359 cpumask_t mask;
360
361 rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], &mask);
362 if (rc < 0)
363 return rc;
364 if (rc == 0)
365 return add_del_listener(info->snd_pid, &mask, REGISTER);
366
367 rc = parse(info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK], &mask);
368 if (rc < 0)
369 return rc;
370 if (rc == 0)
371 return add_del_listener(info->snd_pid, &mask, DEREGISTER);
372
373 /*
374 * Size includes space for nested attributes
375 */
376 size = nla_total_size(sizeof(u32)) +
377 nla_total_size(sizeof(struct taskstats)) + nla_total_size(0);
378
379 memset(&stats, 0, sizeof(stats));
380 rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, &reply, size);
381 if (rc < 0)
382 return rc;
383
384 if (info->attrs[TASKSTATS_CMD_ATTR_PID]) {
385 u32 pid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_PID]);
386 rc = fill_pid(pid, NULL, &stats);
387 if (rc < 0)
388 goto err;
389
390 na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_PID);
391 NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_PID, pid);
392 NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS,
393 stats);
394 } else if (info->attrs[TASKSTATS_CMD_ATTR_TGID]) {
395 u32 tgid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_TGID]);
396 rc = fill_tgid(tgid, NULL, &stats);
397 if (rc < 0)
398 goto err;
399
400 na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_TGID);
401 NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_TGID, tgid);
402 NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS,
403 stats);
404 } else {
405 rc = -EINVAL;
406 goto err;
407 }
408
409 nla_nest_end(rep_skb, na);
410
411 return send_reply(rep_skb, info->snd_pid);
412
413nla_put_failure:
414 return genlmsg_cancel(rep_skb, reply);
415err:
416 nlmsg_free(rep_skb);
417 return rc;
418}
419
420void taskstats_exit_alloc(struct taskstats **ptidstats, unsigned int *mycpu)
421{
422 struct listener_list *listeners;
423 struct taskstats *tmp;
424 /*
425 * This is the cpu on which the task is exiting currently and will
426 * be the one for which the exit event is sent, even if the cpu
427 * on which this function is running changes later.
428 */
429 *mycpu = raw_smp_processor_id();
430
431 *ptidstats = NULL;
432 tmp = kmem_cache_zalloc(taskstats_cache, SLAB_KERNEL);
433 if (!tmp)
434 return;
435
436 listeners = &per_cpu(listener_array, *mycpu);
437 down_read(&listeners->sem);
438 if (!list_empty(&listeners->list)) {
439 *ptidstats = tmp;
440 tmp = NULL;
441 }
442 up_read(&listeners->sem);
443 kfree(tmp);
444}
445
446/* Send pid data out on exit */
447void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats,
448 int group_dead, unsigned int mycpu)
449{
450 int rc;
451 struct sk_buff *rep_skb;
452 void *reply;
453 size_t size;
454 int is_thread_group;
455 struct nlattr *na;
456 unsigned long flags;
457
458 if (!family_registered || !tidstats)
459 return;
460
461 spin_lock_irqsave(&tsk->signal->stats_lock, flags);
462 is_thread_group = tsk->signal->stats ? 1 : 0;
463 spin_unlock_irqrestore(&tsk->signal->stats_lock, flags);
464
465 rc = 0;
466 /*
467 * Size includes space for nested attributes
468 */
469 size = nla_total_size(sizeof(u32)) +
470 nla_total_size(sizeof(struct taskstats)) + nla_total_size(0);
471
472 if (is_thread_group)
473 size = 2 * size; /* PID + STATS + TGID + STATS */
474
475 rc = prepare_reply(NULL, TASKSTATS_CMD_NEW, &rep_skb, &reply, size);
476 if (rc < 0)
477 goto ret;
478
479 rc = fill_pid(tsk->pid, tsk, tidstats);
480 if (rc < 0)
481 goto err_skb;
482
483 na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_PID);
484 NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_PID, (u32)tsk->pid);
485 NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS,
486 *tidstats);
487 nla_nest_end(rep_skb, na);
488
489 if (!is_thread_group)
490 goto send;
491
492 /*
493 * tsk has/had a thread group so fill the tsk->signal->stats structure
494 * Doesn't matter if tsk is the leader or the last group member leaving
495 */
496
497 fill_tgid_exit(tsk);
498 if (!group_dead)
499 goto send;
500
501 na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_TGID);
502 NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_TGID, (u32)tsk->tgid);
503 /* No locking needed for tsk->signal->stats since group is dead */
504 NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS,
505 *tsk->signal->stats);
506 nla_nest_end(rep_skb, na);
507
508send:
509 send_cpu_listeners(rep_skb, mycpu);
510 return;
511
512nla_put_failure:
513 genlmsg_cancel(rep_skb, reply);
514 goto ret;
515err_skb:
516 nlmsg_free(rep_skb);
517ret:
518 return;
519}
520
521static struct genl_ops taskstats_ops = {
522 .cmd = TASKSTATS_CMD_GET,
523 .doit = taskstats_user_cmd,
524 .policy = taskstats_cmd_get_policy,
525};
526
527/* Needed early in initialization */
528void __init taskstats_init_early(void)
529{
530 unsigned int i;
531
532 taskstats_cache = kmem_cache_create("taskstats_cache",
533 sizeof(struct taskstats),
534 0, SLAB_PANIC, NULL, NULL);
535 for_each_possible_cpu(i) {
536 INIT_LIST_HEAD(&(per_cpu(listener_array, i).list));
537 init_rwsem(&(per_cpu(listener_array, i).sem));
538 }
539}
540
541static int __init taskstats_init(void)
542{
543 int rc;
544
545 rc = genl_register_family(&family);
546 if (rc)
547 return rc;
548
549 rc = genl_register_ops(&family, &taskstats_ops);
550 if (rc < 0)
551 goto err;
552
553 family_registered = 1;
554 return 0;
555err:
556 genl_unregister_family(&family);
557 return rc;
558}
559
560/*
561 * late initcall ensures initialization of statistics collection
562 * mechanisms precedes initialization of the taskstats interface
563 */
564late_initcall(taskstats_init);
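A hedged back-of-the-envelope for the reply sizing used in taskstats_user_cmd() and taskstats_exit_send(): each nested TASKSTATS_TYPE_AGGR_* group costs one empty attribute header plus a u32 pid/tgid attribute plus the struct taskstats payload, each rounded to netlink's 4-byte alignment. The assumed sizeof(struct taskstats) below is illustrative and varies with TASKSTATS_VERSION:

#include <stdio.h>

#define NLA_HDRLEN		4
#define NLA_ALIGN(len)		(((len) + 3) & ~3)
#define nla_total_size(payload)	NLA_ALIGN(NLA_HDRLEN + (payload))

int main(void)
{
	unsigned int taskstats_size = 328;	/* assumption, not the real value */
	unsigned int per_aggr = nla_total_size(sizeof(unsigned int))	/* TASKSTATS_TYPE_PID/TGID */
			      + nla_total_size(taskstats_size)		/* TASKSTATS_TYPE_STATS */
			      + nla_total_size(0);			/* AGGR_* nest header */

	/* A thread-group exit carries both the PID and the TGID aggregate. */
	printf("one aggregate: %u bytes, thread-group exit: %u bytes\n",
	       per_aggr, 2 * per_aggr);
	return 0;
}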
diff --git a/kernel/time.c b/kernel/time.c
index b00ddc71cedb..5bd489747643 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -523,6 +523,7 @@ EXPORT_SYMBOL(do_gettimeofday);
523 523
524 524
525#else 525#else
526#ifndef CONFIG_GENERIC_TIME
526/* 527/*
527 * Simulate gettimeofday using do_gettimeofday which only allows a timeval 528 * Simulate gettimeofday using do_gettimeofday which only allows a timeval
528 * and therefore only yields usec accuracy 529 * and therefore only yields usec accuracy
@@ -537,6 +538,7 @@ void getnstimeofday(struct timespec *tv)
537} 538}
538EXPORT_SYMBOL_GPL(getnstimeofday); 539EXPORT_SYMBOL_GPL(getnstimeofday);
539#endif 540#endif
541#endif
540 542
541/* Converts Gregorian date to seconds since 1970-01-01 00:00:00. 543/* Converts Gregorian date to seconds since 1970-01-01 00:00:00.
542 * Assumes input in normal date format, i.e. 1980-12-31 23:59:59 544 * Assumes input in normal date format, i.e. 1980-12-31 23:59:59
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
new file mode 100644
index 000000000000..e1dfd8e86cce
--- /dev/null
+++ b/kernel/time/Makefile
@@ -0,0 +1 @@
obj-y += clocksource.o jiffies.o
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
new file mode 100644
index 000000000000..74eca5939bd9
--- /dev/null
+++ b/kernel/time/clocksource.c
@@ -0,0 +1,349 @@
1/*
2 * linux/kernel/time/clocksource.c
3 *
4 * This file contains the functions which manage clocksource drivers.
5 *
6 * Copyright (C) 2004, 2005 IBM, John Stultz (johnstul@us.ibm.com)
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
21 *
22 * TODO WishList:
23 * o Allow clocksource drivers to be unregistered
24 * o get rid of clocksource_jiffies extern
25 */
26
27#include <linux/clocksource.h>
28#include <linux/sysdev.h>
29#include <linux/init.h>
30#include <linux/module.h>
31
32/* XXX - Would like a better way for initializing curr_clocksource */
33extern struct clocksource clocksource_jiffies;
34
35/*[Clocksource internal variables]---------
36 * curr_clocksource:
37 * currently selected clocksource. Initialized to clocksource_jiffies.
38 * next_clocksource:
39 * pending next selected clocksource.
40 * clocksource_list:
41 * linked list with the registered clocksources
42 * clocksource_lock:
43 * protects manipulations to curr_clocksource and next_clocksource
44 * and the clocksource_list
45 * override_name:
46 * Name of the user-specified clocksource.
47 */
48static struct clocksource *curr_clocksource = &clocksource_jiffies;
49static struct clocksource *next_clocksource;
50static LIST_HEAD(clocksource_list);
51static DEFINE_SPINLOCK(clocksource_lock);
52static char override_name[32];
53static int finished_booting;
54
55/* clocksource_done_booting - Called near the end of bootup
56 *
57 * Hack to avoid lots of clocksource churn at boot time
58 */
59static int __init clocksource_done_booting(void)
60{
61 finished_booting = 1;
62 return 0;
63}
64
65late_initcall(clocksource_done_booting);
66
67/**
68 * clocksource_get_next - Returns the selected clocksource
69 *
70 */
71struct clocksource *clocksource_get_next(void)
72{
73 unsigned long flags;
74
75 spin_lock_irqsave(&clocksource_lock, flags);
76 if (next_clocksource && finished_booting) {
77 curr_clocksource = next_clocksource;
78 next_clocksource = NULL;
79 }
80 spin_unlock_irqrestore(&clocksource_lock, flags);
81
82 return curr_clocksource;
83}
84
85/**
86 * select_clocksource - Finds the best registered clocksource.
87 *
88 * Private function. Must hold clocksource_lock when called.
89 *
90 * Looks through the list of registered clocksources, returning
91 * the one with the highest rating value. If there is a clocksource
92 * name that matches the override string, it returns that clocksource.
93 */
94static struct clocksource *select_clocksource(void)
95{
96 struct clocksource *best = NULL;
97 struct list_head *tmp;
98
99 list_for_each(tmp, &clocksource_list) {
100 struct clocksource *src;
101
102 src = list_entry(tmp, struct clocksource, list);
103 if (!best)
104 best = src;
105
106 /* check for override: */
107 if (strlen(src->name) == strlen(override_name) &&
108 !strcmp(src->name, override_name)) {
109 best = src;
110 break;
111 }
112 /* pick the highest rating: */
113 if (src->rating > best->rating)
114 best = src;
115 }
116
117 return best;
118}
119
120/**
121 * is_registered_source - Checks if clocksource is registered
122 * @c: pointer to a clocksource
123 *
124 * Private helper function. Must hold clocksource_lock when called.
125 *
126 * Returns one if the clocksource is already registered, zero otherwise.
127 */
128static int is_registered_source(struct clocksource *c)
129{
130 int len = strlen(c->name);
131 struct list_head *tmp;
132
133 list_for_each(tmp, &clocksource_list) {
134 struct clocksource *src;
135
136 src = list_entry(tmp, struct clocksource, list);
137 if (strlen(src->name) == len && !strcmp(src->name, c->name))
138 return 1;
139 }
140
141 return 0;
142}
143
144/**
145 * clocksource_register - Used to install new clocksources
146 * @t: clocksource to be registered
147 *
148 * Returns -EBUSY if registration fails, zero otherwise.
149 */
150int clocksource_register(struct clocksource *c)
151{
152 int ret = 0;
153 unsigned long flags;
154
155 spin_lock_irqsave(&clocksource_lock, flags);
156 /* check if clocksource is already registered */
157 if (is_registered_source(c)) {
158 printk("register_clocksource: Cannot register %s. "
159 "Already registered!", c->name);
160 ret = -EBUSY;
161 } else {
162 /* register it */
163 list_add(&c->list, &clocksource_list);
164 /* scan the registered clocksources, and pick the best one */
165 next_clocksource = select_clocksource();
166 }
167 spin_unlock_irqrestore(&clocksource_lock, flags);
168 return ret;
169}
170EXPORT_SYMBOL(clocksource_register);
171
172/**
173 * clocksource_reselect - Rescan list for next clocksource
174 *
175 * A quick helper function to be used if a clocksource changes its
176 * rating. Forces the clocksource list to be re-scanned for the best
177 * clocksource.
178 */
179void clocksource_reselect(void)
180{
181 unsigned long flags;
182
183 spin_lock_irqsave(&clocksource_lock, flags);
184 next_clocksource = select_clocksource();
185 spin_unlock_irqrestore(&clocksource_lock, flags);
186}
187EXPORT_SYMBOL(clocksource_reselect);
188
189/**
190 * sysfs_show_current_clocksources - sysfs interface for current clocksource
191 * @dev: unused
192 * @buf: char buffer to be filled with clocksource list
193 *
194 * Provides sysfs interface for listing current clocksource.
195 */
196static ssize_t
197sysfs_show_current_clocksources(struct sys_device *dev, char *buf)
198{
199 char *curr = buf;
200
201 spin_lock_irq(&clocksource_lock);
202 curr += sprintf(curr, "%s ", curr_clocksource->name);
203 spin_unlock_irq(&clocksource_lock);
204
205 curr += sprintf(curr, "\n");
206
207 return curr - buf;
208}
209
210/**
211 * sysfs_override_clocksource - interface for manually overriding clocksource
212 * @dev: unused
213 * @buf: name of override clocksource
214 * @count: length of buffer
215 *
216 * Takes input from sysfs interface for manually overriding the default
217 * clocksource selection.
218 */
219static ssize_t sysfs_override_clocksource(struct sys_device *dev,
220 const char *buf, size_t count)
221{
222 size_t ret = count;
223 /* strings from sysfs write are not 0 terminated! */
224 if (count >= sizeof(override_name))
225 return -EINVAL;
226
227 /* strip off \n: */
228 if (buf[count-1] == '\n')
229 count--;
230 if (count < 1)
231 return -EINVAL;
232
233 spin_lock_irq(&clocksource_lock);
234
235 /* copy the name given: */
236 memcpy(override_name, buf, count);
237 override_name[count] = 0;
238
239 /* try to select it: */
240 next_clocksource = select_clocksource();
241
242 spin_unlock_irq(&clocksource_lock);
243
244 return ret;
245}
246
247/**
248 * sysfs_show_available_clocksources - sysfs interface for listing clocksource
249 * @dev: unused
250 * @buf: char buffer to be filled with clocksource list
251 *
252 * Provides sysfs interface for listing registered clocksources
253 */
254static ssize_t
255sysfs_show_available_clocksources(struct sys_device *dev, char *buf)
256{
257 struct list_head *tmp;
258 char *curr = buf;
259
260 spin_lock_irq(&clocksource_lock);
261 list_for_each(tmp, &clocksource_list) {
262 struct clocksource *src;
263
264 src = list_entry(tmp, struct clocksource, list);
265 curr += sprintf(curr, "%s ", src->name);
266 }
267 spin_unlock_irq(&clocksource_lock);
268
269 curr += sprintf(curr, "\n");
270
271 return curr - buf;
272}
273
274/*
275 * Sysfs setup bits:
276 */
277static SYSDEV_ATTR(current_clocksource, 0600, sysfs_show_current_clocksources,
278 sysfs_override_clocksource);
279
280static SYSDEV_ATTR(available_clocksource, 0600,
281 sysfs_show_available_clocksources, NULL);
282
283static struct sysdev_class clocksource_sysclass = {
284 set_kset_name("clocksource"),
285};
286
287static struct sys_device device_clocksource = {
288 .id = 0,
289 .cls = &clocksource_sysclass,
290};
291
292static int __init init_clocksource_sysfs(void)
293{
294 int error = sysdev_class_register(&clocksource_sysclass);
295
296 if (!error)
297 error = sysdev_register(&device_clocksource);
298 if (!error)
299 error = sysdev_create_file(
300 &device_clocksource,
301 &attr_current_clocksource);
302 if (!error)
303 error = sysdev_create_file(
304 &device_clocksource,
305 &attr_available_clocksource);
306 return error;
307}
308
309device_initcall(init_clocksource_sysfs);
310
311/**
312 * boot_override_clocksource - boot clock override
313 * @str: override name
314 *
315 * Takes a clocksource= boot argument and uses it
316 * as the clocksource override name.
317 */
318static int __init boot_override_clocksource(char* str)
319{
320 unsigned long flags;
321 spin_lock_irqsave(&clocksource_lock, flags);
322 if (str)
323 strlcpy(override_name, str, sizeof(override_name));
324 spin_unlock_irqrestore(&clocksource_lock, flags);
325 return 1;
326}
327
328__setup("clocksource=", boot_override_clocksource);
329
330/**
331 * boot_override_clock - Compatibility layer for deprecated boot option
332 * @str: override name
333 *
334 * DEPRECATED! Takes a clock= boot argument and uses it
335 * as the clocksource override name
336 */
337static int __init boot_override_clock(char* str)
338{
339 if (!strcmp(str, "pmtmr")) {
340 printk("Warning: clock=pmtmr is deprecated. "
341 "Use clocksource=acpi_pm.\n");
342 return boot_override_clocksource("acpi_pm");
343 }
344 printk("Warning! clock= boot option is deprecated. "
345 "Use clocksource=xyz\n");
346 return boot_override_clocksource(str);
347}
348
349__setup("clock=", boot_override_clock);
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
new file mode 100644
index 000000000000..126bb30c4afe
--- /dev/null
+++ b/kernel/time/jiffies.c
@@ -0,0 +1,73 @@
1/***********************************************************************
2* linux/kernel/time/jiffies.c
3*
4* This file contains the jiffies based clocksource.
5*
6* Copyright (C) 2004, 2005 IBM, John Stultz (johnstul@us.ibm.com)
7*
8* This program is free software; you can redistribute it and/or modify
9* it under the terms of the GNU General Public License as published by
10* the Free Software Foundation; either version 2 of the License, or
11* (at your option) any later version.
12*
13* This program is distributed in the hope that it will be useful,
14* but WITHOUT ANY WARRANTY; without even the implied warranty of
15* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16* GNU General Public License for more details.
17*
18* You should have received a copy of the GNU General Public License
19* along with this program; if not, write to the Free Software
20* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
21*
22************************************************************************/
23#include <linux/clocksource.h>
24#include <linux/jiffies.h>
25#include <linux/init.h>
26
27/* The Jiffies based clocksource is the lowest common
28 * denominator clock source which should function on
29 * all systems. It has the same coarse resolution as
30 * the timer interrupt frequency HZ and it suffers
31 * inaccuracies caused by missed or lost timer
32 * interrupts and the inability for the timer
33 * interrupt hardware to accurately tick at the
34 * requested HZ value. It is also not recommended
35 * for "tick-less" systems.
36 */
37#define NSEC_PER_JIFFY ((u32)((((u64)NSEC_PER_SEC)<<8)/ACTHZ))
38
39/* Since jiffies uses a simple NSEC_PER_JIFFY multiplier
40 * conversion, the .shift value could be zero. However
41 * this would make NTP adjustments impossible as they are
42 * in units of 1/2^.shift. Thus we use JIFFIES_SHIFT to
43 * shift both the numerator and denominator the same
44 * amount, and give ntp adjustments in units of 1/2^8
45 *
46 * The value 8 is somewhat carefully chosen, as anything
47 * larger can result in overflows. NSEC_PER_JIFFY grows as
48 * HZ shrinks, so values greater than 8 overflow 32bits when
49 * HZ=100.
50 */
51#define JIFFIES_SHIFT 8
52
53static cycle_t jiffies_read(void)
54{
55 return (cycle_t) jiffies;
56}
57
58struct clocksource clocksource_jiffies = {
59 .name = "jiffies",
60 .rating = 0, /* lowest rating*/
61 .read = jiffies_read,
62 .mask = 0xffffffff, /*32bits*/
63 .mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */
64 .shift = JIFFIES_SHIFT,
65 .is_continuous = 0, /* tick based, not free running */
66};
67
68static int __init init_jiffies_clocksource(void)
69{
70 return clocksource_register(&clocksource_jiffies);
71}
72
73module_init(init_jiffies_clocksource);
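A hedged sanity check of the shift comment above, assuming HZ=100 so NSEC_PER_JIFFY is roughly 10,000,000 ns: shifted by 8 the resulting mult still fits the 32-bit field, shifted by 9 it no longer does:

#include <stdio.h>

int main(void)
{
	unsigned long long nsec_per_jiffy = 10000000ULL;	/* HZ = 100 */

	printf("shift 8: %llu (fits in u32: %s)\n",
	       nsec_per_jiffy << 8,
	       (nsec_per_jiffy << 8) <= 0xffffffffULL ? "yes" : "no");
	printf("shift 9: %llu (fits in u32: %s)\n",
	       nsec_per_jiffy << 9,
	       (nsec_per_jiffy << 9) <= 0xffffffffULL ? "yes" : "no");
	return 0;
}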
diff --git a/kernel/timer.c b/kernel/timer.c
index f35b3939e937..4f55622b0d38 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -84,7 +84,7 @@ typedef struct tvec_t_base_s tvec_base_t;
84 84
85tvec_base_t boot_tvec_bases; 85tvec_base_t boot_tvec_bases;
86EXPORT_SYMBOL(boot_tvec_bases); 86EXPORT_SYMBOL(boot_tvec_bases);
87static DEFINE_PER_CPU(tvec_base_t *, tvec_bases) = { &boot_tvec_bases }; 87static DEFINE_PER_CPU(tvec_base_t *, tvec_bases) = &boot_tvec_bases;
88 88
89static inline void set_running_timer(tvec_base_t *base, 89static inline void set_running_timer(tvec_base_t *base,
90 struct timer_list *timer) 90 struct timer_list *timer)
@@ -136,7 +136,7 @@ static void internal_add_timer(tvec_base_t *base, struct timer_list *timer)
136 list_add_tail(&timer->entry, vec); 136 list_add_tail(&timer->entry, vec);
137} 137}
138 138
139/*** 139/**
140 * init_timer - initialize a timer. 140 * init_timer - initialize a timer.
141 * @timer: the timer to be initialized 141 * @timer: the timer to be initialized
142 * 142 *
@@ -146,7 +146,7 @@ static void internal_add_timer(tvec_base_t *base, struct timer_list *timer)
146void fastcall init_timer(struct timer_list *timer) 146void fastcall init_timer(struct timer_list *timer)
147{ 147{
148 timer->entry.next = NULL; 148 timer->entry.next = NULL;
149 timer->base = per_cpu(tvec_bases, raw_smp_processor_id()); 149 timer->base = __raw_get_cpu_var(tvec_bases);
150} 150}
151EXPORT_SYMBOL(init_timer); 151EXPORT_SYMBOL(init_timer);
152 152
@@ -175,6 +175,7 @@ static inline void detach_timer(struct timer_list *timer,
175 */ 175 */
176static tvec_base_t *lock_timer_base(struct timer_list *timer, 176static tvec_base_t *lock_timer_base(struct timer_list *timer,
177 unsigned long *flags) 177 unsigned long *flags)
178 __acquires(timer->base->lock)
178{ 179{
179 tvec_base_t *base; 180 tvec_base_t *base;
180 181
@@ -235,7 +236,7 @@ int __mod_timer(struct timer_list *timer, unsigned long expires)
235 236
236EXPORT_SYMBOL(__mod_timer); 237EXPORT_SYMBOL(__mod_timer);
237 238
238/*** 239/**
239 * add_timer_on - start a timer on a particular CPU 240 * add_timer_on - start a timer on a particular CPU
240 * @timer: the timer to be added 241 * @timer: the timer to be added
241 * @cpu: the CPU to start it on 242 * @cpu: the CPU to start it on
@@ -255,9 +256,10 @@ void add_timer_on(struct timer_list *timer, int cpu)
255} 256}
256 257
257 258
258/*** 259/**
259 * mod_timer - modify a timer's timeout 260 * mod_timer - modify a timer's timeout
260 * @timer: the timer to be modified 261 * @timer: the timer to be modified
262 * @expires: new timeout in jiffies
261 * 263 *
262 * mod_timer is a more efficient way to update the expire field of an 264 * mod_timer is a more efficient way to update the expire field of an
263 * active timer (if the timer is inactive it will be activated) 265 * active timer (if the timer is inactive it will be activated)
@@ -291,7 +293,7 @@ int mod_timer(struct timer_list *timer, unsigned long expires)
291 293
292EXPORT_SYMBOL(mod_timer); 294EXPORT_SYMBOL(mod_timer);
293 295
294/*** 296/**
295 * del_timer - deactivate a timer. 297 * del_timer - deactivate a timer.
296 * @timer: the timer to be deactivated 298 * @timer: the timer to be deactivated
297 * 299 *
@@ -323,7 +325,10 @@ int del_timer(struct timer_list *timer)
323EXPORT_SYMBOL(del_timer); 325EXPORT_SYMBOL(del_timer);
324 326
325#ifdef CONFIG_SMP 327#ifdef CONFIG_SMP
326/* 328/**
329 * try_to_del_timer_sync - Try to deactivate a timer
330 * @timer: the timer to deactivate
331 *
327 * This function tries to deactivate a timer. Upon successful (ret >= 0) 332 * This function tries to deactivate a timer. Upon successful (ret >= 0)
328 * exit the timer is not queued and the handler is not running on any CPU. 333 * exit the timer is not queued and the handler is not running on any CPU.
329 * 334 *
@@ -351,7 +356,7 @@ out:
351 return ret; 356 return ret;
352} 357}
353 358
354/*** 359/**
355 * del_timer_sync - deactivate a timer and wait for the handler to finish. 360 * del_timer_sync - deactivate a timer and wait for the handler to finish.
356 * @timer: the timer to be deactivated 361 * @timer: the timer to be deactivated
357 * 362 *
@@ -374,6 +379,7 @@ int del_timer_sync(struct timer_list *timer)
374 int ret = try_to_del_timer_sync(timer); 379 int ret = try_to_del_timer_sync(timer);
375 if (ret >= 0) 380 if (ret >= 0)
376 return ret; 381 return ret;
382 cpu_relax();
377 } 383 }
378} 384}
379 385
@@ -400,15 +406,15 @@ static int cascade(tvec_base_t *base, tvec_t *tv, int index)
400 return index; 406 return index;
401} 407}
402 408
403/*** 409#define INDEX(N) ((base->timer_jiffies >> (TVR_BITS + (N) * TVN_BITS)) & TVN_MASK)
410
411/**
404 * __run_timers - run all expired timers (if any) on this CPU. 412 * __run_timers - run all expired timers (if any) on this CPU.
405 * @base: the timer vector to be processed. 413 * @base: the timer vector to be processed.
406 * 414 *
407 * This function cascades all vectors and executes all expired timer 415 * This function cascades all vectors and executes all expired timer
408 * vectors. 416 * vectors.
409 */ 417 */
410#define INDEX(N) (base->timer_jiffies >> (TVR_BITS + N * TVN_BITS)) & TVN_MASK
411
412static inline void __run_timers(tvec_base_t *base) 418static inline void __run_timers(tvec_base_t *base)
413{ 419{
414 struct timer_list *timer; 420 struct timer_list *timer;
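To make the relocated INDEX() macro above concrete, here is a small stand-alone sketch of the slot calculation it performs for the cascading timer-wheel levels; TVR_BITS=8 and TVN_BITS=6 match the usual (CONFIG_BASE_SMALL=0) configuration, and index_of() is an illustrative name, not a kernel symbol.

#include <stdio.h>

#define TVR_BITS 8			/* 256 slots in the first-level wheel */
#define TVN_BITS 6			/* 64 slots in each cascade level */
#define TVN_MASK ((1 << TVN_BITS) - 1)

/* Same computation as INDEX(N): which slot of cascade level N a given
 * timer_jiffies value selects. */
static unsigned index_of(unsigned long timer_jiffies, int n)
{
	return (timer_jiffies >> (TVR_BITS + n * TVN_BITS)) & TVN_MASK;
}

int main(void)
{
	unsigned long timer_jiffies = 0x12345678UL;
	int n;

	for (n = 0; n < 4; n++)		/* cascade levels tv2..tv5 */
		printf("level %d -> slot %u\n", n, index_of(timer_jiffies, n));
	return 0;
}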
@@ -597,7 +603,6 @@ long time_tolerance = MAXFREQ; /* frequency tolerance (ppm) */
597long time_precision = 1; /* clock precision (us) */ 603long time_precision = 1; /* clock precision (us) */
598long time_maxerror = NTP_PHASE_LIMIT; /* maximum error (us) */ 604long time_maxerror = NTP_PHASE_LIMIT; /* maximum error (us) */
599long time_esterror = NTP_PHASE_LIMIT; /* estimated error (us) */ 605long time_esterror = NTP_PHASE_LIMIT; /* estimated error (us) */
600static long time_phase; /* phase offset (scaled us) */
601long time_freq = (((NSEC_PER_SEC + HZ/2) % HZ - HZ/2) << SHIFT_USEC) / NSEC_PER_USEC; 606long time_freq = (((NSEC_PER_SEC + HZ/2) % HZ - HZ/2) << SHIFT_USEC) / NSEC_PER_USEC;
602 /* frequency offset (scaled ppm)*/ 607 /* frequency offset (scaled ppm)*/
603static long time_adj; /* tick adjust (scaled 1 / HZ) */ 608static long time_adj; /* tick adjust (scaled 1 / HZ) */
@@ -747,27 +752,14 @@ static long adjtime_adjustment(void)
747} 752}
748 753
749/* in the NTP reference this is called "hardclock()" */ 754/* in the NTP reference this is called "hardclock()" */
750static void update_wall_time_one_tick(void) 755static void update_ntp_one_tick(void)
751{ 756{
752 long time_adjust_step, delta_nsec; 757 long time_adjust_step;
753 758
754 time_adjust_step = adjtime_adjustment(); 759 time_adjust_step = adjtime_adjustment();
755 if (time_adjust_step) 760 if (time_adjust_step)
756 /* Reduce by this step the amount of time left */ 761 /* Reduce by this step the amount of time left */
757 time_adjust -= time_adjust_step; 762 time_adjust -= time_adjust_step;
758 delta_nsec = tick_nsec + time_adjust_step * 1000;
759 /*
760 * Advance the phase, once it gets to one microsecond, then
761 * advance the tick more.
762 */
763 time_phase += time_adj;
764 if ((time_phase >= FINENSEC) || (time_phase <= -FINENSEC)) {
765 long ltemp = shift_right(time_phase, (SHIFT_SCALE - 10));
766 time_phase -= ltemp << (SHIFT_SCALE - 10);
767 delta_nsec += ltemp;
768 }
769 xtime.tv_nsec += delta_nsec;
770 time_interpolator_update(delta_nsec);
771 763
772 /* Changes by adjtime() do not take effect till next tick. */ 764 /* Changes by adjtime() do not take effect till next tick. */
773 if (time_next_adjust != 0) { 765 if (time_next_adjust != 0) {
@@ -780,36 +772,404 @@ static void update_wall_time_one_tick(void)
780 * Return how long ticks are at the moment, that is, how much time 772 * Return how long ticks are at the moment, that is, how much time
781 * update_wall_time_one_tick will add to xtime next time we call it 773 * update_wall_time_one_tick will add to xtime next time we call it
782 * (assuming no calls to do_adjtimex in the meantime). 774 * (assuming no calls to do_adjtimex in the meantime).
783 * The return value is in fixed-point nanoseconds with SHIFT_SCALE-10 775 * The return value is in fixed-point nanoseconds shifted by the
784 * bits to the right of the binary point. 776 * specified number of bits to the right of the binary point.
785 * This function has no side-effects. 777 * This function has no side-effects.
786 */ 778 */
787u64 current_tick_length(void) 779u64 current_tick_length(void)
788{ 780{
789 long delta_nsec; 781 long delta_nsec;
782 u64 ret;
790 783
784 /* calculate the finest interval NTP will allow.
785 * ie: nanosecond value shifted by (SHIFT_SCALE - 10)
786 */
791 delta_nsec = tick_nsec + adjtime_adjustment() * 1000; 787 delta_nsec = tick_nsec + adjtime_adjustment() * 1000;
792 return ((u64) delta_nsec << (SHIFT_SCALE - 10)) + time_adj; 788 ret = (u64)delta_nsec << TICK_LENGTH_SHIFT;
789 ret += (s64)time_adj << (TICK_LENGTH_SHIFT - (SHIFT_SCALE - 10));
790
791 return ret;
793} 792}
794 793
795/* 794/* XXX - all of this timekeeping code should be later moved to time.c */
796 * Using a loop looks inefficient, but "ticks" is 795#include <linux/clocksource.h>
797 * usually just one (we shouldn't be losing ticks, 796static struct clocksource *clock; /* pointer to current clocksource */
798 * we're doing this this way mainly for interrupt 797
799 * latency reasons, not because we think we'll 798#ifdef CONFIG_GENERIC_TIME
800 * have lots of lost timer ticks 799/**
800 * __get_nsec_offset - Returns nanoseconds since last call to update_wall_time()
801 *
802 * private function, must hold xtime_lock lock when being
803 * called. Returns the number of nanoseconds since the
804 * last call to update_wall_time() (adjusted by NTP scaling)
805 */
806static inline s64 __get_nsec_offset(void)
807{
808 cycle_t cycle_now, cycle_delta;
809 s64 ns_offset;
810
811 /* read clocksource: */
812 cycle_now = clocksource_read(clock);
813
814 /* calculate the delta since the last update_wall_time: */
815 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
816
817 /* convert to nanoseconds: */
818 ns_offset = cyc2ns(clock, cycle_delta);
819
820 return ns_offset;
821}
822
823/**
824 * __get_realtime_clock_ts - Returns the time of day in a timespec
825 * @ts: pointer to the timespec to be set
826 *
827 * Returns the time of day in a timespec. Used by
828 * do_gettimeofday() and get_realtime_clock_ts().
829 */
830static inline void __get_realtime_clock_ts(struct timespec *ts)
831{
832 unsigned long seq;
833 s64 nsecs;
834
835 do {
836 seq = read_seqbegin(&xtime_lock);
837
838 *ts = xtime;
839 nsecs = __get_nsec_offset();
840
841 } while (read_seqretry(&xtime_lock, seq));
842
843 timespec_add_ns(ts, nsecs);
844}
845
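The read_seqbegin()/read_seqretry() loop used by __get_realtime_clock_ts() above (and again by timekeeping_is_continuous() below) is the lockless-reader half of a seqlock; a minimal single-threaded user-space sketch of the idea, with illustrative names and without the memory barriers a real implementation needs, might look like this:

#include <stdio.h>

struct snap { unsigned seq; long sec; long nsec; };

static struct snap shared = { 0, 100, 0 };

static unsigned read_begin(void)  { return shared.seq; }
/* retry if a write was in progress (odd) or completed during the read */
static int read_retry(unsigned s) { return (s & 1) || s != shared.seq; }

static void writer_update(long sec, long nsec)
{
	shared.seq++;		/* odd: write in progress */
	shared.sec = sec;
	shared.nsec = nsec;
	shared.seq++;		/* even again: consistent snapshot */
}

int main(void)
{
	long sec, nsec;
	unsigned seq;

	do {
		seq = read_begin();
		sec = shared.sec;
		nsec = shared.nsec;
	} while (read_retry(seq));

	writer_update(101, 500);
	printf("%ld.%09ld\n", sec, nsec);
	return 0;
}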
846/**
847 * getnstimeofday - Returns the time of day in a timespec
848 * @ts: pointer to the timespec to be set
849 *
850 * Returns the time of day in a timespec.
851 */
852void getnstimeofday(struct timespec *ts)
853{
854 __get_realtime_clock_ts(ts);
855}
856
857EXPORT_SYMBOL(getnstimeofday);
858
859/**
860 * do_gettimeofday - Returns the time of day in a timeval
861 * @tv: pointer to the timeval to be set
862 *
863 * NOTE: Users should be converted to using get_realtime_clock_ts()
801 */ 864 */
802static void update_wall_time(unsigned long ticks) 865void do_gettimeofday(struct timeval *tv)
803{ 866{
867 struct timespec now;
868
869 __get_realtime_clock_ts(&now);
870 tv->tv_sec = now.tv_sec;
871 tv->tv_usec = now.tv_nsec/1000;
872}
873
874EXPORT_SYMBOL(do_gettimeofday);
875/**
876 * do_settimeofday - Sets the time of day
877 * @tv: pointer to the timespec variable containing the new time
878 *
879 * Sets the time of day to the new time, updates NTP and notifies hrtimers
880 */
881int do_settimeofday(struct timespec *tv)
882{
883 unsigned long flags;
884 time_t wtm_sec, sec = tv->tv_sec;
885 long wtm_nsec, nsec = tv->tv_nsec;
886
887 if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC)
888 return -EINVAL;
889
890 write_seqlock_irqsave(&xtime_lock, flags);
891
892 nsec -= __get_nsec_offset();
893
894 wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec);
895 wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec);
896
897 set_normalized_timespec(&xtime, sec, nsec);
898 set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec);
899
900 clock->error = 0;
901 ntp_clear();
902
903 write_sequnlock_irqrestore(&xtime_lock, flags);
904
905 /* signal hrtimers about time change */
906 clock_was_set();
907
908 return 0;
909}
910
911EXPORT_SYMBOL(do_settimeofday);
912
913/**
914 * change_clocksource - Swaps clocksources if a new one is available
915 *
916 * Accumulates current time interval and initializes new clocksource
917 */
918static int change_clocksource(void)
919{
920 struct clocksource *new;
921 cycle_t now;
922 u64 nsec;
923 new = clocksource_get_next();
924 if (clock != new) {
925 now = clocksource_read(new);
926 nsec = __get_nsec_offset();
927 timespec_add_ns(&xtime, nsec);
928
929 clock = new;
930 clock->cycle_last = now;
931 printk(KERN_INFO "Time: %s clocksource has been installed.\n",
932 clock->name);
933 return 1;
934 } else if (clock->update_callback) {
935 return clock->update_callback();
936 }
937 return 0;
938}
939#else
940#define change_clocksource() (0)
941#endif
942
943/**
944 * timekeeping_is_continuous - check to see if timekeeping is free running
945 */
946int timekeeping_is_continuous(void)
947{
948 unsigned long seq;
949 int ret;
950
804 do { 951 do {
805 ticks--; 952 seq = read_seqbegin(&xtime_lock);
806 update_wall_time_one_tick(); 953
807 if (xtime.tv_nsec >= 1000000000) { 954 ret = clock->is_continuous;
808 xtime.tv_nsec -= 1000000000; 955
956 } while (read_seqretry(&xtime_lock, seq));
957
958 return ret;
959}
960
961/*
962 * timekeeping_init - Initializes the clocksource and common timekeeping values
963 */
964void __init timekeeping_init(void)
965{
966 unsigned long flags;
967
968 write_seqlock_irqsave(&xtime_lock, flags);
969 clock = clocksource_get_next();
970 clocksource_calculate_interval(clock, tick_nsec);
971 clock->cycle_last = clocksource_read(clock);
972 ntp_clear();
973 write_sequnlock_irqrestore(&xtime_lock, flags);
974}
975
976
977static int timekeeping_suspended;
978/**
979 * timekeeping_resume - Resumes the generic timekeeping subsystem.
980 * @dev: unused
981 *
982 * This is for the generic clocksource timekeeping.
983 * xtime/wall_to_monotonic/jiffies/wall_jiffies/etc are
984 * still managed by arch specific suspend/resume code.
985 */
986static int timekeeping_resume(struct sys_device *dev)
987{
988 unsigned long flags;
989
990 write_seqlock_irqsave(&xtime_lock, flags);
991 /* restart the last cycle value */
992 clock->cycle_last = clocksource_read(clock);
993 clock->error = 0;
994 timekeeping_suspended = 0;
995 write_sequnlock_irqrestore(&xtime_lock, flags);
996 return 0;
997}
998
999static int timekeeping_suspend(struct sys_device *dev, pm_message_t state)
1000{
1001 unsigned long flags;
1002
1003 write_seqlock_irqsave(&xtime_lock, flags);
1004 timekeeping_suspended = 1;
1005 write_sequnlock_irqrestore(&xtime_lock, flags);
1006 return 0;
1007}
1008
1009/* sysfs resume/suspend bits for timekeeping */
1010static struct sysdev_class timekeeping_sysclass = {
1011 .resume = timekeeping_resume,
1012 .suspend = timekeeping_suspend,
1013 set_kset_name("timekeeping"),
1014};
1015
1016static struct sys_device device_timer = {
1017 .id = 0,
1018 .cls = &timekeeping_sysclass,
1019};
1020
1021static int __init timekeeping_init_device(void)
1022{
1023 int error = sysdev_class_register(&timekeeping_sysclass);
1024 if (!error)
1025 error = sysdev_register(&device_timer);
1026 return error;
1027}
1028
1029device_initcall(timekeeping_init_device);
1030
1031/*
1032 * If the error is already larger, we look ahead even further
1033 * to compensate for late or lost adjustments.
1034 */
1035static __always_inline int clocksource_bigadjust(s64 error, s64 *interval, s64 *offset)
1036{
1037 s64 tick_error, i;
1038 u32 look_ahead, adj;
1039 s32 error2, mult;
1040
1041 /*
1042 * Use the current error value to determine how much to look ahead.
1043 * The larger the error the slower we adjust for it to avoid problems
1044 * with losing too many ticks, otherwise we would overadjust and
1045 * produce an even larger error. The smaller the adjustment the
1046 * faster we try to adjust for it, as lost ticks can do less harm
1047 * here. This is tuned so that an error of about 1 msec is adjusted
1048 * within about 1 sec (or 2^20 nsec in 2^SHIFT_HZ ticks).
1049 */
1050 error2 = clock->error >> (TICK_LENGTH_SHIFT + 22 - 2 * SHIFT_HZ);
1051 error2 = abs(error2);
1052 for (look_ahead = 0; error2 > 0; look_ahead++)
1053 error2 >>= 2;
1054
1055 /*
1056 * Now calculate the error in (1 << look_ahead) ticks, but first
1057 * remove the single look ahead already included in the error.
1058 */
1059 tick_error = current_tick_length() >> (TICK_LENGTH_SHIFT - clock->shift + 1);
1060 tick_error -= clock->xtime_interval >> 1;
1061 error = ((error - tick_error) >> look_ahead) + tick_error;
1062
1063 /* Finally calculate the adjustment shift value. */
1064 i = *interval;
1065 mult = 1;
1066 if (error < 0) {
1067 error = -error;
1068 *interval = -*interval;
1069 *offset = -*offset;
1070 mult = -1;
1071 }
1072 for (adj = 0; error > i; adj++)
1073 error >>= 1;
1074
1075 *interval <<= adj;
1076 *offset <<= adj;
1077 return mult << adj;
1078}
1079
1080/*
1081 * Adjust the multiplier to reduce the error value; this is optimized
1082 * for the most common adjustments of -1, 0 and 1, but for other values
1083 * we can do a bit more work.
1084 */
1085static void clocksource_adjust(struct clocksource *clock, s64 offset)
1086{
1087 s64 error, interval = clock->cycle_interval;
1088 int adj;
1089
1090 error = clock->error >> (TICK_LENGTH_SHIFT - clock->shift - 1);
1091 if (error > interval) {
1092 error >>= 2;
1093 if (likely(error <= interval))
1094 adj = 1;
1095 else
1096 adj = clocksource_bigadjust(error, &interval, &offset);
1097 } else if (error < -interval) {
1098 error >>= 2;
1099 if (likely(error >= -interval)) {
1100 adj = -1;
1101 interval = -interval;
1102 offset = -offset;
1103 } else
1104 adj = clocksource_bigadjust(error, &interval, &offset);
1105 } else
1106 return;
1107
1108 clock->mult += adj;
1109 clock->xtime_interval += interval;
1110 clock->xtime_nsec -= offset;
1111 clock->error -= (interval - offset) << (TICK_LENGTH_SHIFT - clock->shift);
1112}
1113
1114/**
1115 * update_wall_time - Uses the current clocksource to increment the wall time
1116 *
1117 * Called from the timer interrupt, must hold a write on xtime_lock.
1118 */
1119static void update_wall_time(void)
1120{
1121 cycle_t offset;
1122
1123 /* Make sure we're fully resumed: */
1124 if (unlikely(timekeeping_suspended))
1125 return;
1126
1127#ifdef CONFIG_GENERIC_TIME
1128 offset = (clocksource_read(clock) - clock->cycle_last) & clock->mask;
1129#else
1130 offset = clock->cycle_interval;
1131#endif
1132 clock->xtime_nsec += (s64)xtime.tv_nsec << clock->shift;
1133
1134 /* normally this loop will run just once, however in the
1135 * case of lost or late ticks, it will accumulate correctly.
1136 */
1137 while (offset >= clock->cycle_interval) {
1138 /* accumulate one interval */
1139 clock->xtime_nsec += clock->xtime_interval;
1140 clock->cycle_last += clock->cycle_interval;
1141 offset -= clock->cycle_interval;
1142
1143 if (clock->xtime_nsec >= (u64)NSEC_PER_SEC << clock->shift) {
1144 clock->xtime_nsec -= (u64)NSEC_PER_SEC << clock->shift;
809 xtime.tv_sec++; 1145 xtime.tv_sec++;
810 second_overflow(); 1146 second_overflow();
811 } 1147 }
812 } while (ticks); 1148
1149 /* interpolator bits */
1150 time_interpolator_update(clock->xtime_interval
1151 >> clock->shift);
1152 /* increment the NTP state machine */
1153 update_ntp_one_tick();
1154
1155 /* accumulate error between NTP and clock interval */
1156 clock->error += current_tick_length();
1157 clock->error -= clock->xtime_interval << (TICK_LENGTH_SHIFT - clock->shift);
1158 }
1159
1160 /* correct the clock when NTP error is too big */
1161 clocksource_adjust(clock, offset);
1162
1163 /* store full nanoseconds into xtime */
1164 xtime.tv_nsec = (s64)clock->xtime_nsec >> clock->shift;
1165 clock->xtime_nsec -= (s64)xtime.tv_nsec << clock->shift;
1166
1167 /* check to see if there is a new clocksource to use */
1168 if (change_clocksource()) {
1169 clock->error = 0;
1170 clock->xtime_nsec = 0;
1171 clocksource_calculate_interval(clock, tick_nsec);
1172 }
813} 1173}
814 1174
815/* 1175/*
@@ -862,10 +1222,8 @@ static inline void calc_load(unsigned long ticks)
862 unsigned long active_tasks; /* fixed-point */ 1222 unsigned long active_tasks; /* fixed-point */
863 static int count = LOAD_FREQ; 1223 static int count = LOAD_FREQ;
864 1224
865 count -= ticks; 1225 active_tasks = count_active_tasks();
866 if (count < 0) { 1226 for (count -= ticks; count < 0; count += LOAD_FREQ) {
867 count += LOAD_FREQ;
868 active_tasks = count_active_tasks();
869 CALC_LOAD(avenrun[0], EXP_1, active_tasks); 1227 CALC_LOAD(avenrun[0], EXP_1, active_tasks);
870 CALC_LOAD(avenrun[1], EXP_5, active_tasks); 1228 CALC_LOAD(avenrun[1], EXP_5, active_tasks);
871 CALC_LOAD(avenrun[2], EXP_15, active_tasks); 1229 CALC_LOAD(avenrun[2], EXP_15, active_tasks);
@@ -880,7 +1238,7 @@ unsigned long wall_jiffies = INITIAL_JIFFIES;
880 * playing with xtime and avenrun. 1238 * playing with xtime and avenrun.
881 */ 1239 */
882#ifndef ARCH_HAVE_XTIME_LOCK 1240#ifndef ARCH_HAVE_XTIME_LOCK
883seqlock_t xtime_lock __cacheline_aligned_in_smp = SEQLOCK_UNLOCKED; 1241__cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock);
884 1242
885EXPORT_SYMBOL(xtime_lock); 1243EXPORT_SYMBOL(xtime_lock);
886#endif 1244#endif
@@ -910,15 +1268,10 @@ void run_local_timers(void)
910 * Called by the timer interrupt. xtime_lock must already be taken 1268 * Called by the timer interrupt. xtime_lock must already be taken
911 * by the timer IRQ! 1269 * by the timer IRQ!
912 */ 1270 */
913static inline void update_times(void) 1271static inline void update_times(unsigned long ticks)
914{ 1272{
915 unsigned long ticks; 1273 wall_jiffies += ticks;
916 1274 update_wall_time();
917 ticks = jiffies - wall_jiffies;
918 if (ticks) {
919 wall_jiffies += ticks;
920 update_wall_time(ticks);
921 }
922 calc_load(ticks); 1275 calc_load(ticks);
923} 1276}
924 1277
@@ -928,12 +1281,10 @@ static inline void update_times(void)
928 * jiffies is defined in the linker script... 1281 * jiffies is defined in the linker script...
929 */ 1282 */
930 1283
931void do_timer(struct pt_regs *regs) 1284void do_timer(unsigned long ticks)
932{ 1285{
933 jiffies_64++; 1286 jiffies_64 += ticks;
934 /* prevent loading jiffies before storing new jiffies_64 value. */ 1287 update_times(ticks);
935 barrier();
936 update_times();
937} 1288}
938 1289
939#ifdef __ARCH_WANT_SYS_ALARM 1290#ifdef __ARCH_WANT_SYS_ALARM
@@ -971,46 +1322,19 @@ asmlinkage long sys_getpid(void)
971} 1322}
972 1323
973/* 1324/*
974 * Accessing ->group_leader->real_parent is not SMP-safe, it could 1325 * Accessing ->real_parent is not SMP-safe, it could
975 * change from under us. However, rather than getting any lock 1326 * change from under us. However, we can use a stale
976 * we can use an optimistic algorithm: get the parent 1327 * value of ->real_parent under rcu_read_lock(), see
977 * pid, and go back and check that the parent is still 1328 * release_task()->call_rcu(delayed_put_task_struct).
978 * the same. If it has changed (which is extremely unlikely
979 * indeed), we just try again..
980 *
981 * NOTE! This depends on the fact that even if we _do_
982 * get an old value of "parent", we can happily dereference
983 * the pointer (it was and remains a dereferencable kernel pointer
984 * no matter what): we just can't necessarily trust the result
985 * until we know that the parent pointer is valid.
986 *
987 * NOTE2: ->group_leader never changes from under us.
988 */ 1329 */
989asmlinkage long sys_getppid(void) 1330asmlinkage long sys_getppid(void)
990{ 1331{
991 int pid; 1332 int pid;
992 struct task_struct *me = current;
993 struct task_struct *parent;
994 1333
995 parent = me->group_leader->real_parent; 1334 rcu_read_lock();
996 for (;;) { 1335 pid = rcu_dereference(current->real_parent)->tgid;
997 pid = parent->tgid; 1336 rcu_read_unlock();
998#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT)
999{
1000 struct task_struct *old = parent;
1001 1337
1002 /*
1003 * Make sure we read the pid before re-reading the
1004 * parent pointer:
1005 */
1006 smp_rmb();
1007 parent = me->group_leader->real_parent;
1008 if (old != parent)
1009 continue;
1010}
1011#endif
1012 break;
1013 }
1014 return pid; 1338 return pid;
1015} 1339}
1016 1340
@@ -1042,7 +1366,7 @@ asmlinkage long sys_getegid(void)
1042 1366
1043static void process_timeout(unsigned long __data) 1367static void process_timeout(unsigned long __data)
1044{ 1368{
1045 wake_up_process((task_t *)__data); 1369 wake_up_process((struct task_struct *)__data);
1046} 1370}
1047 1371
1048/** 1372/**
@@ -1144,8 +1468,9 @@ asmlinkage long sys_gettid(void)
1144 return current->pid; 1468 return current->pid;
1145} 1469}
1146 1470
1147/* 1471/**
1148 * sys_sysinfo - fill in sysinfo struct 1472 * sys_sysinfo - fill in sysinfo struct
1473 * @info: pointer to buffer to fill
1149 */ 1474 */
1150asmlinkage long sys_sysinfo(struct sysinfo __user *info) 1475asmlinkage long sys_sysinfo(struct sysinfo __user *info)
1151{ 1476{
@@ -1233,6 +1558,13 @@ asmlinkage long sys_sysinfo(struct sysinfo __user *info)
1233 return 0; 1558 return 0;
1234} 1559}
1235 1560
1561/*
1562 * lockdep: we want to track each per-CPU base as a separate lock-class,
1563 * but timer-bases are kmalloc()-ed, so we need to attach separate
1564 * keys to them:
1565 */
1566static struct lock_class_key base_lock_keys[NR_CPUS];
1567
1236static int __devinit init_timers_cpu(int cpu) 1568static int __devinit init_timers_cpu(int cpu)
1237{ 1569{
1238 int j; 1570 int j;
@@ -1268,6 +1600,8 @@ static int __devinit init_timers_cpu(int cpu)
1268 } 1600 }
1269 1601
1270 spin_lock_init(&base->lock); 1602 spin_lock_init(&base->lock);
1603 lockdep_set_class(&base->lock, base_lock_keys + cpu);
1604
1271 for (j = 0; j < TVN_SIZE; j++) { 1605 for (j = 0; j < TVN_SIZE; j++) {
1272 INIT_LIST_HEAD(base->tv5.vec + j); 1606 INIT_LIST_HEAD(base->tv5.vec + j);
1273 INIT_LIST_HEAD(base->tv4.vec + j); 1607 INIT_LIST_HEAD(base->tv4.vec + j);
@@ -1326,7 +1660,7 @@ static void __devinit migrate_timers(int cpu)
1326} 1660}
1327#endif /* CONFIG_HOTPLUG_CPU */ 1661#endif /* CONFIG_HOTPLUG_CPU */
1328 1662
1329static int timer_cpu_notify(struct notifier_block *self, 1663static int __cpuinit timer_cpu_notify(struct notifier_block *self,
1330 unsigned long action, void *hcpu) 1664 unsigned long action, void *hcpu)
1331{ 1665{
1332 long cpu = (long)hcpu; 1666 long cpu = (long)hcpu;
@@ -1346,15 +1680,17 @@ static int timer_cpu_notify(struct notifier_block *self,
1346 return NOTIFY_OK; 1680 return NOTIFY_OK;
1347} 1681}
1348 1682
1349static struct notifier_block timers_nb = { 1683static struct notifier_block __cpuinitdata timers_nb = {
1350 .notifier_call = timer_cpu_notify, 1684 .notifier_call = timer_cpu_notify,
1351}; 1685};
1352 1686
1353 1687
1354void __init init_timers(void) 1688void __init init_timers(void)
1355{ 1689{
1356 timer_cpu_notify(&timers_nb, (unsigned long)CPU_UP_PREPARE, 1690 int err = timer_cpu_notify(&timers_nb, (unsigned long)CPU_UP_PREPARE,
1357 (void *)(long)smp_processor_id()); 1691 (void *)(long)smp_processor_id());
1692
1693 BUG_ON(err == NOTIFY_BAD);
1358 register_cpu_notifier(&timers_nb); 1694 register_cpu_notifier(&timers_nb);
1359 open_softirq(TIMER_SOFTIRQ, run_timer_softirq, NULL); 1695 open_softirq(TIMER_SOFTIRQ, run_timer_softirq, NULL);
1360} 1696}
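The heart of the new GENERIC_TIME path in this file is the accumulation loop in update_wall_time(): read the free-running counter and fold whole clock->cycle_interval chunks into shifted fixed-point nanoseconds until less than one interval remains. Below is a self-contained user-space sketch of just that loop; struct fake_clock, SHIFT and the 1 MHz / HZ=1000 numbers are made-up stand-ins, and the real code additionally feeds the NTP error back through clocksource_adjust().

#include <stdio.h>
#include <stdint.h>

#define SHIFT 10			/* fixed point: nanoseconds << SHIFT */

struct fake_clock {
	uint64_t cycle_last;		/* counter value at last accumulation */
	uint64_t cycle_interval;	/* counter cycles per tick */
	uint64_t xtime_interval;	/* nanoseconds per tick << SHIFT */
	uint64_t xtime_nsec;		/* accumulated nanoseconds << SHIFT */
	uint64_t xtime_sec;
};

static void accumulate(struct fake_clock *c, uint64_t counter_now)
{
	uint64_t offset = counter_now - c->cycle_last;

	while (offset >= c->cycle_interval) {
		/* accumulate one tick worth of time */
		c->xtime_nsec += c->xtime_interval;
		c->cycle_last += c->cycle_interval;
		offset -= c->cycle_interval;

		if (c->xtime_nsec >= (uint64_t)1000000000 << SHIFT) {
			c->xtime_nsec -= (uint64_t)1000000000 << SHIFT;
			c->xtime_sec++;
		}
	}
}

int main(void)
{
	/* 1 MHz counter, HZ=1000: 1000 cycles and 1,000,000 ns per tick */
	struct fake_clock c = {
		.cycle_last = 0,
		.cycle_interval = 1000,
		.xtime_interval = 1000000ULL << SHIFT,
	};

	accumulate(&c, 3500);	/* 3 full ticks, 500 cycles left over */
	printf("sec=%llu nsec=%llu\n",
	       (unsigned long long)c.xtime_sec,
	       (unsigned long long)(c.xtime_nsec >> SHIFT));
	return 0;
}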
diff --git a/kernel/unwind.c b/kernel/unwind.c
new file mode 100644
index 000000000000..2e2368607aab
--- /dev/null
+++ b/kernel/unwind.c
@@ -0,0 +1,941 @@
1/*
2 * Copyright (C) 2002-2006 Novell, Inc.
3 * Jan Beulich <jbeulich@novell.com>
4 * This code is released under version 2 of the GNU GPL.
5 *
6 * A simple API for unwinding kernel stacks. This is used for
7 * debugging and error reporting purposes. The kernel doesn't need
8 * full-blown stack unwinding with all the bells and whistles, so there
9 * is not much point in implementing the full Dwarf2 unwind API.
10 */
11
12#include <linux/unwind.h>
13#include <linux/module.h>
14#include <linux/delay.h>
15#include <linux/stop_machine.h>
16#include <asm/sections.h>
17#include <asm/uaccess.h>
18#include <asm/unaligned.h>
19
20extern char __start_unwind[], __end_unwind[];
21
22#define MAX_STACK_DEPTH 8
23
24#define EXTRA_INFO(f) { \
25 BUILD_BUG_ON_ZERO(offsetof(struct unwind_frame_info, f) \
26 % FIELD_SIZEOF(struct unwind_frame_info, f)) \
27 + offsetof(struct unwind_frame_info, f) \
28 / FIELD_SIZEOF(struct unwind_frame_info, f), \
29 FIELD_SIZEOF(struct unwind_frame_info, f) \
30 }
31#define PTREGS_INFO(f) EXTRA_INFO(regs.f)
32
33static const struct {
34 unsigned offs:BITS_PER_LONG / 2;
35 unsigned width:BITS_PER_LONG / 2;
36} reg_info[] = {
37 UNW_REGISTER_INFO
38};
39
40#undef PTREGS_INFO
41#undef EXTRA_INFO
42
43#ifndef REG_INVALID
44#define REG_INVALID(r) (reg_info[r].width == 0)
45#endif
46
47#define DW_CFA_nop 0x00
48#define DW_CFA_set_loc 0x01
49#define DW_CFA_advance_loc1 0x02
50#define DW_CFA_advance_loc2 0x03
51#define DW_CFA_advance_loc4 0x04
52#define DW_CFA_offset_extended 0x05
53#define DW_CFA_restore_extended 0x06
54#define DW_CFA_undefined 0x07
55#define DW_CFA_same_value 0x08
56#define DW_CFA_register 0x09
57#define DW_CFA_remember_state 0x0a
58#define DW_CFA_restore_state 0x0b
59#define DW_CFA_def_cfa 0x0c
60#define DW_CFA_def_cfa_register 0x0d
61#define DW_CFA_def_cfa_offset 0x0e
62#define DW_CFA_def_cfa_expression 0x0f
63#define DW_CFA_expression 0x10
64#define DW_CFA_offset_extended_sf 0x11
65#define DW_CFA_def_cfa_sf 0x12
66#define DW_CFA_def_cfa_offset_sf 0x13
67#define DW_CFA_val_offset 0x14
68#define DW_CFA_val_offset_sf 0x15
69#define DW_CFA_val_expression 0x16
70#define DW_CFA_lo_user 0x1c
71#define DW_CFA_GNU_window_save 0x2d
72#define DW_CFA_GNU_args_size 0x2e
73#define DW_CFA_GNU_negative_offset_extended 0x2f
74#define DW_CFA_hi_user 0x3f
75
76#define DW_EH_PE_FORM 0x07
77#define DW_EH_PE_native 0x00
78#define DW_EH_PE_leb128 0x01
79#define DW_EH_PE_data2 0x02
80#define DW_EH_PE_data4 0x03
81#define DW_EH_PE_data8 0x04
82#define DW_EH_PE_signed 0x08
83#define DW_EH_PE_ADJUST 0x70
84#define DW_EH_PE_abs 0x00
85#define DW_EH_PE_pcrel 0x10
86#define DW_EH_PE_textrel 0x20
87#define DW_EH_PE_datarel 0x30
88#define DW_EH_PE_funcrel 0x40
89#define DW_EH_PE_aligned 0x50
90#define DW_EH_PE_indirect 0x80
91#define DW_EH_PE_omit 0xff
92
93typedef unsigned long uleb128_t;
94typedef signed long sleb128_t;
95
96static struct unwind_table {
97 struct {
98 unsigned long pc;
99 unsigned long range;
100 } core, init;
101 const void *address;
102 unsigned long size;
103 struct unwind_table *link;
104 const char *name;
105} root_table;
106
107struct unwind_item {
108 enum item_location {
109 Nowhere,
110 Memory,
111 Register,
112 Value
113 } where;
114 uleb128_t value;
115};
116
117struct unwind_state {
118 uleb128_t loc, org;
119 const u8 *cieStart, *cieEnd;
120 uleb128_t codeAlign;
121 sleb128_t dataAlign;
122 struct cfa {
123 uleb128_t reg, offs;
124 } cfa;
125 struct unwind_item regs[ARRAY_SIZE(reg_info)];
126 unsigned stackDepth:8;
127 unsigned version:8;
128 const u8 *label;
129 const u8 *stack[MAX_STACK_DEPTH];
130};
131
132static const struct cfa badCFA = { ARRAY_SIZE(reg_info), 1 };
133
134static struct unwind_table *find_table(unsigned long pc)
135{
136 struct unwind_table *table;
137
138 for (table = &root_table; table; table = table->link)
139 if ((pc >= table->core.pc
140 && pc < table->core.pc + table->core.range)
141 || (pc >= table->init.pc
142 && pc < table->init.pc + table->init.range))
143 break;
144
145 return table;
146}
147
148static void init_unwind_table(struct unwind_table *table,
149 const char *name,
150 const void *core_start,
151 unsigned long core_size,
152 const void *init_start,
153 unsigned long init_size,
154 const void *table_start,
155 unsigned long table_size)
156{
157 table->core.pc = (unsigned long)core_start;
158 table->core.range = core_size;
159 table->init.pc = (unsigned long)init_start;
160 table->init.range = init_size;
161 table->address = table_start;
162 table->size = table_size;
163 table->link = NULL;
164 table->name = name;
165}
166
167void __init unwind_init(void)
168{
169 init_unwind_table(&root_table, "kernel",
170 _text, _end - _text,
171 NULL, 0,
172 __start_unwind, __end_unwind - __start_unwind);
173}
174
175#ifdef CONFIG_MODULES
176
177static struct unwind_table *last_table;
178
179/* Must be called with module_mutex held. */
180void *unwind_add_table(struct module *module,
181 const void *table_start,
182 unsigned long table_size)
183{
184 struct unwind_table *table;
185
186 if (table_size <= 0)
187 return NULL;
188
189 table = kmalloc(sizeof(*table), GFP_KERNEL);
190 if (!table)
191 return NULL;
192
193 init_unwind_table(table, module->name,
194 module->module_core, module->core_size,
195 module->module_init, module->init_size,
196 table_start, table_size);
197
198 if (last_table)
199 last_table->link = table;
200 else
201 root_table.link = table;
202 last_table = table;
203
204 return table;
205}
206
207struct unlink_table_info
208{
209 struct unwind_table *table;
210 int init_only;
211};
212
213static int unlink_table(void *arg)
214{
215 struct unlink_table_info *info = arg;
216 struct unwind_table *table = info->table, *prev;
217
218 for (prev = &root_table; prev->link && prev->link != table; prev = prev->link)
219 ;
220
221 if (prev->link) {
222 if (info->init_only) {
223 table->init.pc = 0;
224 table->init.range = 0;
225 info->table = NULL;
226 } else {
227 prev->link = table->link;
228 if (!prev->link)
229 last_table = prev;
230 }
231 } else
232 info->table = NULL;
233
234 return 0;
235}
236
237/* Must be called with module_mutex held. */
238void unwind_remove_table(void *handle, int init_only)
239{
240 struct unwind_table *table = handle;
241 struct unlink_table_info info;
242
243 if (!table || table == &root_table)
244 return;
245
246 if (init_only && table == last_table) {
247 table->init.pc = 0;
248 table->init.range = 0;
249 return;
250 }
251
252 info.table = table;
253 info.init_only = init_only;
254 stop_machine_run(unlink_table, &info, NR_CPUS);
255
256 if (info.table)
257 kfree(table);
258}
259
260#endif /* CONFIG_MODULES */
261
262static uleb128_t get_uleb128(const u8 **pcur, const u8 *end)
263{
264 const u8 *cur = *pcur;
265 uleb128_t value;
266 unsigned shift;
267
268 for (shift = 0, value = 0; cur < end; shift += 7) {
269 if (shift + 7 > 8 * sizeof(value)
270 && (*cur & 0x7fU) >= (1U << (8 * sizeof(value) - shift))) {
271 cur = end + 1;
272 break;
273 }
274 value |= (uleb128_t)(*cur & 0x7f) << shift;
275 if (!(*cur++ & 0x80))
276 break;
277 }
278 *pcur = cur;
279
280 return value;
281}
282
283static sleb128_t get_sleb128(const u8 **pcur, const u8 *end)
284{
285 const u8 *cur = *pcur;
286 sleb128_t value;
287 unsigned shift;
288
289 for (shift = 0, value = 0; cur < end; shift += 7) {
290 if (shift + 7 > 8 * sizeof(value)
291 && (*cur & 0x7fU) >= (1U << (8 * sizeof(value) - shift))) {
292 cur = end + 1;
293 break;
294 }
295 value |= (sleb128_t)(*cur & 0x7f) << shift;
296 if (!(*cur & 0x80)) {
297 value |= -(*cur++ & 0x40) << shift;
298 break;
299 }
300 }
301 *pcur = cur;
302
303 return value;
304}
305
306static unsigned long read_pointer(const u8 **pLoc,
307 const void *end,
308 signed ptrType)
309{
310 unsigned long value = 0;
311 union {
312 const u8 *p8;
313 const u16 *p16u;
314 const s16 *p16s;
315 const u32 *p32u;
316 const s32 *p32s;
317 const unsigned long *pul;
318 } ptr;
319
320 if (ptrType < 0 || ptrType == DW_EH_PE_omit)
321 return 0;
322 ptr.p8 = *pLoc;
323 switch(ptrType & DW_EH_PE_FORM) {
324 case DW_EH_PE_data2:
325 if (end < (const void *)(ptr.p16u + 1))
326 return 0;
327 if(ptrType & DW_EH_PE_signed)
328 value = get_unaligned(ptr.p16s++);
329 else
330 value = get_unaligned(ptr.p16u++);
331 break;
332 case DW_EH_PE_data4:
333#ifdef CONFIG_64BIT
334 if (end < (const void *)(ptr.p32u + 1))
335 return 0;
336 if(ptrType & DW_EH_PE_signed)
337 value = get_unaligned(ptr.p32s++);
338 else
339 value = get_unaligned(ptr.p32u++);
340 break;
341 case DW_EH_PE_data8:
342 BUILD_BUG_ON(sizeof(u64) != sizeof(value));
343#else
344 BUILD_BUG_ON(sizeof(u32) != sizeof(value));
345#endif
346 case DW_EH_PE_native:
347 if (end < (const void *)(ptr.pul + 1))
348 return 0;
349 value = get_unaligned(ptr.pul++);
350 break;
351 case DW_EH_PE_leb128:
352 BUILD_BUG_ON(sizeof(uleb128_t) > sizeof(value));
353 value = ptrType & DW_EH_PE_signed
354 ? get_sleb128(&ptr.p8, end)
355 : get_uleb128(&ptr.p8, end);
356 if ((const void *)ptr.p8 > end)
357 return 0;
358 break;
359 default:
360 return 0;
361 }
362 switch(ptrType & DW_EH_PE_ADJUST) {
363 case DW_EH_PE_abs:
364 break;
365 case DW_EH_PE_pcrel:
366 value += (unsigned long)*pLoc;
367 break;
368 default:
369 return 0;
370 }
371 if ((ptrType & DW_EH_PE_indirect)
372 && __get_user(value, (unsigned long *)value))
373 return 0;
374 *pLoc = ptr.p8;
375
376 return value;
377}
378
379static signed fde_pointer_type(const u32 *cie)
380{
381 const u8 *ptr = (const u8 *)(cie + 2);
382 unsigned version = *ptr;
383
384 if (version != 1)
385 return -1; /* unsupported */
386 if (*++ptr) {
387 const char *aug;
388 const u8 *end = (const u8 *)(cie + 1) + *cie;
389 uleb128_t len;
390
391 /* check if augmentation size is first (and thus present) */
392 if (*ptr != 'z')
393 return -1;
394 /* check if augmentation string is nul-terminated */
395 if ((ptr = memchr(aug = (const void *)ptr, 0, end - ptr)) == NULL)
396 return -1;
397 ++ptr; /* skip terminator */
398 get_uleb128(&ptr, end); /* skip code alignment */
399 get_sleb128(&ptr, end); /* skip data alignment */
400 /* skip return address column */
401 version <= 1 ? (void)++ptr : (void)get_uleb128(&ptr, end);
402 len = get_uleb128(&ptr, end); /* augmentation length */
403 if (ptr + len < ptr || ptr + len > end)
404 return -1;
405 end = ptr + len;
406 while (*++aug) {
407 if (ptr >= end)
408 return -1;
409 switch(*aug) {
410 case 'L':
411 ++ptr;
412 break;
413 case 'P': {
414 signed ptrType = *ptr++;
415
416 if (!read_pointer(&ptr, end, ptrType) || ptr > end)
417 return -1;
418 }
419 break;
420 case 'R':
421 return *ptr;
422 default:
423 return -1;
424 }
425 }
426 }
427 return DW_EH_PE_native|DW_EH_PE_abs;
428}
429
430static int advance_loc(unsigned long delta, struct unwind_state *state)
431{
432 state->loc += delta * state->codeAlign;
433
434 return delta > 0;
435}
436
437static void set_rule(uleb128_t reg,
438 enum item_location where,
439 uleb128_t value,
440 struct unwind_state *state)
441{
442 if (reg < ARRAY_SIZE(state->regs)) {
443 state->regs[reg].where = where;
444 state->regs[reg].value = value;
445 }
446}
447
448static int processCFI(const u8 *start,
449 const u8 *end,
450 unsigned long targetLoc,
451 signed ptrType,
452 struct unwind_state *state)
453{
454 union {
455 const u8 *p8;
456 const u16 *p16;
457 const u32 *p32;
458 } ptr;
459 int result = 1;
460
461 if (start != state->cieStart) {
462 state->loc = state->org;
463 result = processCFI(state->cieStart, state->cieEnd, 0, ptrType, state);
464 if (targetLoc == 0 && state->label == NULL)
465 return result;
466 }
467 for (ptr.p8 = start; result && ptr.p8 < end; ) {
468 switch(*ptr.p8 >> 6) {
469 uleb128_t value;
470
471 case 0:
472 switch(*ptr.p8++) {
473 case DW_CFA_nop:
474 break;
475 case DW_CFA_set_loc:
476 if ((state->loc = read_pointer(&ptr.p8, end, ptrType)) == 0)
477 result = 0;
478 break;
479 case DW_CFA_advance_loc1:
480 result = ptr.p8 < end && advance_loc(*ptr.p8++, state);
481 break;
482 case DW_CFA_advance_loc2:
483 result = ptr.p8 <= end + 2
484 && advance_loc(*ptr.p16++, state);
485 break;
486 case DW_CFA_advance_loc4:
487 result = ptr.p8 <= end + 4
488 && advance_loc(*ptr.p32++, state);
489 break;
490 case DW_CFA_offset_extended:
491 value = get_uleb128(&ptr.p8, end);
492 set_rule(value, Memory, get_uleb128(&ptr.p8, end), state);
493 break;
494 case DW_CFA_val_offset:
495 value = get_uleb128(&ptr.p8, end);
496 set_rule(value, Value, get_uleb128(&ptr.p8, end), state);
497 break;
498 case DW_CFA_offset_extended_sf:
499 value = get_uleb128(&ptr.p8, end);
500 set_rule(value, Memory, get_sleb128(&ptr.p8, end), state);
501 break;
502 case DW_CFA_val_offset_sf:
503 value = get_uleb128(&ptr.p8, end);
504 set_rule(value, Value, get_sleb128(&ptr.p8, end), state);
505 break;
506 case DW_CFA_restore_extended:
507 case DW_CFA_undefined:
508 case DW_CFA_same_value:
509 set_rule(get_uleb128(&ptr.p8, end), Nowhere, 0, state);
510 break;
511 case DW_CFA_register:
512 value = get_uleb128(&ptr.p8, end);
513 set_rule(value,
514 Register,
515 get_uleb128(&ptr.p8, end), state);
516 break;
517 case DW_CFA_remember_state:
518 if (ptr.p8 == state->label) {
519 state->label = NULL;
520 return 1;
521 }
522 if (state->stackDepth >= MAX_STACK_DEPTH)
523 return 0;
524 state->stack[state->stackDepth++] = ptr.p8;
525 break;
526 case DW_CFA_restore_state:
527 if (state->stackDepth) {
528 const uleb128_t loc = state->loc;
529 const u8 *label = state->label;
530
531 state->label = state->stack[state->stackDepth - 1];
532 memcpy(&state->cfa, &badCFA, sizeof(state->cfa));
533 memset(state->regs, 0, sizeof(state->regs));
534 state->stackDepth = 0;
535 result = processCFI(start, end, 0, ptrType, state);
536 state->loc = loc;
537 state->label = label;
538 } else
539 return 0;
540 break;
541 case DW_CFA_def_cfa:
542 state->cfa.reg = get_uleb128(&ptr.p8, end);
543 /*nobreak*/
544 case DW_CFA_def_cfa_offset:
545 state->cfa.offs = get_uleb128(&ptr.p8, end);
546 break;
547 case DW_CFA_def_cfa_sf:
548 state->cfa.reg = get_uleb128(&ptr.p8, end);
549 /*nobreak*/
550 case DW_CFA_def_cfa_offset_sf:
551 state->cfa.offs = get_sleb128(&ptr.p8, end)
552 * state->dataAlign;
553 break;
554 case DW_CFA_def_cfa_register:
555 state->cfa.reg = get_uleb128(&ptr.p8, end);
556 break;
557 /*todo case DW_CFA_def_cfa_expression: */
558 /*todo case DW_CFA_expression: */
559 /*todo case DW_CFA_val_expression: */
560 case DW_CFA_GNU_args_size:
561 get_uleb128(&ptr.p8, end);
562 break;
563 case DW_CFA_GNU_negative_offset_extended:
564 value = get_uleb128(&ptr.p8, end);
565 set_rule(value,
566 Memory,
567 (uleb128_t)0 - get_uleb128(&ptr.p8, end), state);
568 break;
569 case DW_CFA_GNU_window_save:
570 default:
571 result = 0;
572 break;
573 }
574 break;
575 case 1:
576 result = advance_loc(*ptr.p8++ & 0x3f, state);
577 break;
578 case 2:
579 value = *ptr.p8++ & 0x3f;
580 set_rule(value, Memory, get_uleb128(&ptr.p8, end), state);
581 break;
582 case 3:
583 set_rule(*ptr.p8++ & 0x3f, Nowhere, 0, state);
584 break;
585 }
586 if (ptr.p8 > end)
587 result = 0;
588 if (result && targetLoc != 0 && targetLoc < state->loc)
589 return 1;
590 }
591
592 return result
593 && ptr.p8 == end
594 && (targetLoc == 0
595 || (/*todo While in theory this should apply, gcc in practice omits
596 everything past the function prolog, and hence the location
597 never reaches the end of the function.
598 targetLoc < state->loc &&*/ state->label == NULL));
599}
600
601/* Unwind to the previous frame. Returns 0 if successful, negative
602 * number in case of an error. */
603int unwind(struct unwind_frame_info *frame)
604{
605#define FRAME_REG(r, t) (((t *)frame)[reg_info[r].offs])
606 const u32 *fde = NULL, *cie = NULL;
607 const u8 *ptr = NULL, *end = NULL;
608 unsigned long pc = UNW_PC(frame) - frame->call_frame;
609 unsigned long startLoc = 0, endLoc = 0, cfa;
610 unsigned i;
611 signed ptrType = -1;
612 uleb128_t retAddrReg = 0;
613 struct unwind_table *table;
614 struct unwind_state state;
615
616 if (UNW_PC(frame) == 0)
617 return -EINVAL;
618 if ((table = find_table(pc)) != NULL
619 && !(table->size & (sizeof(*fde) - 1))) {
620 unsigned long tableSize = table->size;
621
622 for (fde = table->address;
623 tableSize > sizeof(*fde) && tableSize - sizeof(*fde) >= *fde;
624 tableSize -= sizeof(*fde) + *fde,
625 fde += 1 + *fde / sizeof(*fde)) {
626 if (!*fde || (*fde & (sizeof(*fde) - 1)))
627 break;
628 if (!fde[1])
629 continue; /* this is a CIE */
630 if ((fde[1] & (sizeof(*fde) - 1))
631 || fde[1] > (unsigned long)(fde + 1)
632 - (unsigned long)table->address)
633 continue; /* this is not a valid FDE */
634 cie = fde + 1 - fde[1] / sizeof(*fde);
635 if (*cie <= sizeof(*cie) + 4
636 || *cie >= fde[1] - sizeof(*fde)
637 || (*cie & (sizeof(*cie) - 1))
638 || cie[1]
639 || (ptrType = fde_pointer_type(cie)) < 0) {
640 cie = NULL; /* this is not a (valid) CIE */
641 continue;
642 }
643 ptr = (const u8 *)(fde + 2);
644 startLoc = read_pointer(&ptr,
645 (const u8 *)(fde + 1) + *fde,
646 ptrType);
647 endLoc = startLoc
648 + read_pointer(&ptr,
649 (const u8 *)(fde + 1) + *fde,
650 ptrType & DW_EH_PE_indirect
651 ? ptrType
652 : ptrType & (DW_EH_PE_FORM|DW_EH_PE_signed));
653 if (pc >= startLoc && pc < endLoc)
654 break;
655 cie = NULL;
656 }
657 }
658 if (cie != NULL) {
659 memset(&state, 0, sizeof(state));
660 state.cieEnd = ptr; /* keep here temporarily */
661 ptr = (const u8 *)(cie + 2);
662 end = (const u8 *)(cie + 1) + *cie;
663 frame->call_frame = 1;
664 if ((state.version = *ptr) != 1)
665 cie = NULL; /* unsupported version */
666 else if (*++ptr) {
667 /* check if augmentation size is first (and thus present) */
668 if (*ptr == 'z') {
669 while (++ptr < end && *ptr) {
670 switch(*ptr) {
671 /* check for ignorable (or already handled)
672 * nul-terminated augmentation string */
673 case 'L':
674 case 'P':
675 case 'R':
676 continue;
677 case 'S':
678 frame->call_frame = 0;
679 continue;
680 default:
681 break;
682 }
683 break;
684 }
685 }
686 if (ptr >= end || *ptr)
687 cie = NULL;
688 }
689 ++ptr;
690 }
691 if (cie != NULL) {
692 /* get code alignment factor */
693 state.codeAlign = get_uleb128(&ptr, end);
694 /* get data alignment factor */
695 state.dataAlign = get_sleb128(&ptr, end);
696 if (state.codeAlign == 0 || state.dataAlign == 0 || ptr >= end)
697 cie = NULL;
698 else {
699 retAddrReg = state.version <= 1 ? *ptr++ : get_uleb128(&ptr, end);
700 /* skip augmentation */
701 if (((const char *)(cie + 2))[1] == 'z')
702 ptr += get_uleb128(&ptr, end);
703 if (ptr > end
704 || retAddrReg >= ARRAY_SIZE(reg_info)
705 || REG_INVALID(retAddrReg)
706 || reg_info[retAddrReg].width != sizeof(unsigned long))
707 cie = NULL;
708 }
709 }
710 if (cie != NULL) {
711 state.cieStart = ptr;
712 ptr = state.cieEnd;
713 state.cieEnd = end;
714 end = (const u8 *)(fde + 1) + *fde;
715 /* skip augmentation */
716 if (((const char *)(cie + 2))[1] == 'z') {
717 uleb128_t augSize = get_uleb128(&ptr, end);
718
719 if ((ptr += augSize) > end)
720 fde = NULL;
721 }
722 }
723 if (cie == NULL || fde == NULL) {
724#ifdef CONFIG_FRAME_POINTER
725 unsigned long top, bottom;
726#endif
727
728#ifdef CONFIG_FRAME_POINTER
729 top = STACK_TOP(frame->task);
730 bottom = STACK_BOTTOM(frame->task);
731# if FRAME_RETADDR_OFFSET < 0
732 if (UNW_SP(frame) < top
733 && UNW_FP(frame) <= UNW_SP(frame)
734 && bottom < UNW_FP(frame)
735# else
736 if (UNW_SP(frame) > top
737 && UNW_FP(frame) >= UNW_SP(frame)
738 && bottom > UNW_FP(frame)
739# endif
740 && !((UNW_SP(frame) | UNW_FP(frame))
741 & (sizeof(unsigned long) - 1))) {
742 unsigned long link;
743
744 if (!__get_user(link,
745 (unsigned long *)(UNW_FP(frame)
746 + FRAME_LINK_OFFSET))
747# if FRAME_RETADDR_OFFSET < 0
748 && link > bottom && link < UNW_FP(frame)
749# else
750 && link > UNW_FP(frame) && link < bottom
751# endif
752 && !(link & (sizeof(link) - 1))
753 && !__get_user(UNW_PC(frame),
754 (unsigned long *)(UNW_FP(frame)
755 + FRAME_RETADDR_OFFSET))) {
756 UNW_SP(frame) = UNW_FP(frame) + FRAME_RETADDR_OFFSET
757# if FRAME_RETADDR_OFFSET < 0
758 -
759# else
760 +
761# endif
762 sizeof(UNW_PC(frame));
763 UNW_FP(frame) = link;
764 return 0;
765 }
766 }
767#endif
768 return -ENXIO;
769 }
770 state.org = startLoc;
771 memcpy(&state.cfa, &badCFA, sizeof(state.cfa));
772 /* process instructions */
773 if (!processCFI(ptr, end, pc, ptrType, &state)
774 || state.loc > endLoc
775 || state.regs[retAddrReg].where == Nowhere
776 || state.cfa.reg >= ARRAY_SIZE(reg_info)
777 || reg_info[state.cfa.reg].width != sizeof(unsigned long)
778 || state.cfa.offs % sizeof(unsigned long))
779 return -EIO;
780 /* update frame */
781#ifndef CONFIG_AS_CFI_SIGNAL_FRAME
782 if(frame->call_frame
783 && !UNW_DEFAULT_RA(state.regs[retAddrReg], state.dataAlign))
784 frame->call_frame = 0;
785#endif
786 cfa = FRAME_REG(state.cfa.reg, unsigned long) + state.cfa.offs;
787 startLoc = min((unsigned long)UNW_SP(frame), cfa);
788 endLoc = max((unsigned long)UNW_SP(frame), cfa);
789 if (STACK_LIMIT(startLoc) != STACK_LIMIT(endLoc)) {
790 startLoc = min(STACK_LIMIT(cfa), cfa);
791 endLoc = max(STACK_LIMIT(cfa), cfa);
792 }
793#ifndef CONFIG_64BIT
794# define CASES CASE(8); CASE(16); CASE(32)
795#else
796# define CASES CASE(8); CASE(16); CASE(32); CASE(64)
797#endif
798 for (i = 0; i < ARRAY_SIZE(state.regs); ++i) {
799 if (REG_INVALID(i)) {
800 if (state.regs[i].where == Nowhere)
801 continue;
802 return -EIO;
803 }
804 switch(state.regs[i].where) {
805 default:
806 break;
807 case Register:
808 if (state.regs[i].value >= ARRAY_SIZE(reg_info)
809 || REG_INVALID(state.regs[i].value)
810 || reg_info[i].width > reg_info[state.regs[i].value].width)
811 return -EIO;
812 switch(reg_info[state.regs[i].value].width) {
813#define CASE(n) \
814 case sizeof(u##n): \
815 state.regs[i].value = FRAME_REG(state.regs[i].value, \
816 const u##n); \
817 break
818 CASES;
819#undef CASE
820 default:
821 return -EIO;
822 }
823 break;
824 }
825 }
826 for (i = 0; i < ARRAY_SIZE(state.regs); ++i) {
827 if (REG_INVALID(i))
828 continue;
829 switch(state.regs[i].where) {
830 case Nowhere:
831 if (reg_info[i].width != sizeof(UNW_SP(frame))
832 || &FRAME_REG(i, __typeof__(UNW_SP(frame)))
833 != &UNW_SP(frame))
834 continue;
835 UNW_SP(frame) = cfa;
836 break;
837 case Register:
838 switch(reg_info[i].width) {
839#define CASE(n) case sizeof(u##n): \
840 FRAME_REG(i, u##n) = state.regs[i].value; \
841 break
842 CASES;
843#undef CASE
844 default:
845 return -EIO;
846 }
847 break;
848 case Value:
849 if (reg_info[i].width != sizeof(unsigned long))
850 return -EIO;
851 FRAME_REG(i, unsigned long) = cfa + state.regs[i].value
852 * state.dataAlign;
853 break;
854 case Memory: {
855 unsigned long addr = cfa + state.regs[i].value
856 * state.dataAlign;
857
858 if ((state.regs[i].value * state.dataAlign)
859 % sizeof(unsigned long)
860 || addr < startLoc
861 || addr + sizeof(unsigned long) < addr
862 || addr + sizeof(unsigned long) > endLoc)
863 return -EIO;
864 switch(reg_info[i].width) {
865#define CASE(n) case sizeof(u##n): \
866 __get_user(FRAME_REG(i, u##n), (u##n *)addr); \
867 break
868 CASES;
869#undef CASE
870 default:
871 return -EIO;
872 }
873 }
874 break;
875 }
876 }
877
878 return 0;
879#undef CASES
880#undef FRAME_REG
881}
882EXPORT_SYMBOL(unwind);
883
884int unwind_init_frame_info(struct unwind_frame_info *info,
885 struct task_struct *tsk,
886 /*const*/ struct pt_regs *regs)
887{
888 info->task = tsk;
889 info->call_frame = 0;
890 arch_unw_init_frame_info(info, regs);
891
892 return 0;
893}
894EXPORT_SYMBOL(unwind_init_frame_info);
895
896/*
897 * Prepare to unwind a blocked task.
898 */
899int unwind_init_blocked(struct unwind_frame_info *info,
900 struct task_struct *tsk)
901{
902 info->task = tsk;
903 info->call_frame = 0;
904 arch_unw_init_blocked(info);
905
906 return 0;
907}
908EXPORT_SYMBOL(unwind_init_blocked);
909
910/*
911 * Prepare to unwind the currently running thread.
912 */
913int unwind_init_running(struct unwind_frame_info *info,
914 asmlinkage int (*callback)(struct unwind_frame_info *,
915 void *arg),
916 void *arg)
917{
918 info->task = current;
919 info->call_frame = 0;
920
921 return arch_unwind_init_running(info, callback, arg);
922}
923EXPORT_SYMBOL(unwind_init_running);
924
925/*
926 * Unwind until the return pointer is in user-land (or until an error
927 * occurs). Returns 0 if successful, negative number in case of
928 * error.
929 */
930int unwind_to_user(struct unwind_frame_info *info)
931{
932 while (!arch_unw_user_mode(info)) {
933 int err = unwind(info);
934
935 if (err < 0)
936 return err;
937 }
938
939 return 0;
940}
941EXPORT_SYMBOL(unwind_to_user);
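The get_uleb128()/get_sleb128() helpers above decode DWARF's LEB128 variable-length integers, which the CFI parser uses for nearly every operand. A stand-alone sketch of the unsigned variant with a small self-test follows; decode_uleb128 is an illustrative name, and unlike the kernel version it does not guard against overlong encodings.

#include <stdio.h>
#include <stddef.h>

/* Decode one unsigned LEB128 value from buf[0..len), as the unwinder's
 * get_uleb128() does: 7 payload bits per byte, least significant group
 * first, high bit set on every byte except the last. */
static unsigned long decode_uleb128(const unsigned char *buf, size_t len,
				    size_t *used)
{
	unsigned long value = 0;
	unsigned shift = 0;
	size_t i;

	for (i = 0; i < len; i++) {
		value |= (unsigned long)(buf[i] & 0x7f) << shift;
		shift += 7;
		if (!(buf[i] & 0x80)) {
			i++;		/* count the terminating byte */
			break;
		}
	}
	*used = i;
	return value;
}

int main(void)
{
	/* 0xe5 0x8e 0x26 encodes 624485, the classic DWARF spec example */
	const unsigned char enc[] = { 0xe5, 0x8e, 0x26 };
	size_t used;
	unsigned long v = decode_uleb128(enc, sizeof(enc), &used);

	printf("value=%lu bytes=%zu\n", v, used);	/* value=624485 bytes=3 */
	return 0;
}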
diff --git a/kernel/wait.c b/kernel/wait.c
index 791681cfea98..59a82f63275d 100644
--- a/kernel/wait.c
+++ b/kernel/wait.c
@@ -3,7 +3,6 @@
3 * 3 *
4 * (C) 2004 William Irwin, Oracle 4 * (C) 2004 William Irwin, Oracle
5 */ 5 */
6#include <linux/config.h>
7#include <linux/init.h> 6#include <linux/init.h>
8#include <linux/module.h> 7#include <linux/module.h>
9#include <linux/sched.h> 8#include <linux/sched.h>
@@ -11,6 +10,14 @@
11#include <linux/wait.h> 10#include <linux/wait.h>
12#include <linux/hash.h> 11#include <linux/hash.h>
13 12
13void init_waitqueue_head(wait_queue_head_t *q)
14{
15 spin_lock_init(&q->lock);
16 INIT_LIST_HEAD(&q->task_list);
17}
18
19EXPORT_SYMBOL(init_waitqueue_head);
20
14void fastcall add_wait_queue(wait_queue_head_t *q, wait_queue_t *wait) 21void fastcall add_wait_queue(wait_queue_head_t *q, wait_queue_t *wait)
15{ 22{
16 unsigned long flags; 23 unsigned long flags;
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 740c5abceb07..835fe28b87a8 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -51,7 +51,7 @@ struct cpu_workqueue_struct {
51 wait_queue_head_t work_done; 51 wait_queue_head_t work_done;
52 52
53 struct workqueue_struct *wq; 53 struct workqueue_struct *wq;
54 task_t *thread; 54 struct task_struct *thread;
55 55
56 int run_depth; /* Detect run_workqueue() recursion depth */ 56 int run_depth; /* Detect run_workqueue() recursion depth */
57} ____cacheline_aligned; 57} ____cacheline_aligned;
@@ -68,7 +68,7 @@ struct workqueue_struct {
68 68
69/* All the per-cpu workqueues on the system, for hotplug cpu to add/remove 69/* All the per-cpu workqueues on the system, for hotplug cpu to add/remove
70 threads to each one as cpus come/go. */ 70 threads to each one as cpus come/go. */
71static DEFINE_SPINLOCK(workqueue_lock); 71static DEFINE_MUTEX(workqueue_mutex);
72static LIST_HEAD(workqueues); 72static LIST_HEAD(workqueues);
73 73
74static int singlethread_cpu; 74static int singlethread_cpu;
@@ -93,9 +93,12 @@ static void __queue_work(struct cpu_workqueue_struct *cwq,
93 spin_unlock_irqrestore(&cwq->lock, flags); 93 spin_unlock_irqrestore(&cwq->lock, flags);
94} 94}
95 95
96/* 96/**
97 * Queue work on a workqueue. Return non-zero if it was successfully 97 * queue_work - queue work on a workqueue
98 * added. 98 * @wq: workqueue to use
99 * @work: work to queue
100 *
101 * Returns non-zero if it was successfully added.
99 * 102 *
100 * We queue the work to the CPU it was submitted, but there is no 103 * We queue the work to the CPU it was submitted, but there is no
101 * guarantee that it will be processed by that CPU. 104 * guarantee that it will be processed by that CPU.
@@ -114,6 +117,7 @@ int fastcall queue_work(struct workqueue_struct *wq, struct work_struct *work)
114 put_cpu(); 117 put_cpu();
115 return ret; 118 return ret;
116} 119}
120EXPORT_SYMBOL_GPL(queue_work);
117 121
118static void delayed_work_timer_fn(unsigned long __data) 122static void delayed_work_timer_fn(unsigned long __data)
119{ 123{
@@ -127,6 +131,14 @@ static void delayed_work_timer_fn(unsigned long __data)
127 __queue_work(per_cpu_ptr(wq->cpu_wq, cpu), work); 131 __queue_work(per_cpu_ptr(wq->cpu_wq, cpu), work);
128} 132}
129 133
134/**
135 * queue_delayed_work - queue work on a workqueue after delay
136 * @wq: workqueue to use
137 * @work: work to queue
138 * @delay: number of jiffies to wait before queueing
139 *
140 * Returns non-zero if it was successfully added.
141 */
130int fastcall queue_delayed_work(struct workqueue_struct *wq, 142int fastcall queue_delayed_work(struct workqueue_struct *wq,
131 struct work_struct *work, unsigned long delay) 143 struct work_struct *work, unsigned long delay)
132{ 144{
@@ -147,6 +159,38 @@ int fastcall queue_delayed_work(struct workqueue_struct *wq,
147 } 159 }
148 return ret; 160 return ret;
149} 161}
162EXPORT_SYMBOL_GPL(queue_delayed_work);
163
164/**
165 * queue_delayed_work_on - queue work on specific CPU after delay
166 * @cpu: CPU number to execute work on
167 * @wq: workqueue to use
168 * @work: work to queue
169 * @delay: number of jiffies to wait before queueing
170 *
171 * Returns non-zero if it was successfully added.
172 */
173int queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
174 struct work_struct *work, unsigned long delay)
175{
176 int ret = 0;
177 struct timer_list *timer = &work->timer;
178
179 if (!test_and_set_bit(0, &work->pending)) {
180 BUG_ON(timer_pending(timer));
181 BUG_ON(!list_empty(&work->entry));
182
183 /* This stores wq for the moment, for the timer_fn */
184 work->wq_data = wq;
185 timer->expires = jiffies + delay;
186 timer->data = (unsigned long)work;
187 timer->function = delayed_work_timer_fn;
188 add_timer_on(timer, cpu);
189 ret = 1;
190 }
191 return ret;
192}
193EXPORT_SYMBOL_GPL(queue_delayed_work_on);
150 194
151static void run_workqueue(struct cpu_workqueue_struct *cwq) 195static void run_workqueue(struct cpu_workqueue_struct *cwq)
152{ 196{
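As a usage illustration for the queue_delayed_work_on() helper added above, here is a kernel-module-style sketch using the three-argument INIT_WORK() form visible in these hunks; poll_wq, poll_work and my_poll_fn are made-up names, and this only builds against a kernel tree of this vintage.

#include <linux/module.h>
#include <linux/init.h>
#include <linux/workqueue.h>
#include <linux/jiffies.h>
#include <linux/errno.h>

static struct workqueue_struct *poll_wq;
static struct work_struct poll_work;

static void my_poll_fn(void *data)
{
	/* runs later in process context on the CPU it was queued to */
}

static int __init my_poll_start(void)
{
	poll_wq = create_workqueue("mypoll");
	if (!poll_wq)
		return -ENOMEM;

	INIT_WORK(&poll_work, my_poll_fn, NULL);
	/* fire my_poll_fn() on CPU 0 roughly one second from now */
	if (!queue_delayed_work_on(0, poll_wq, &poll_work, HZ)) {
		destroy_workqueue(poll_wq);
		return -EINVAL;
	}
	return 0;
}

static void __exit my_poll_stop(void)
{
	cancel_delayed_work(&poll_work);	/* stop the pending timer */
	destroy_workqueue(poll_wq);		/* flushes queued work first */
}

module_init(my_poll_start);
module_exit(my_poll_stop);
MODULE_LICENSE("GPL");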
@@ -251,8 +295,9 @@ static void flush_cpu_workqueue(struct cpu_workqueue_struct *cwq)
251 } 295 }
252} 296}
253 297
254/* 298/**
255 * flush_workqueue - ensure that any scheduled work has run to completion. 299 * flush_workqueue - ensure that any scheduled work has run to completion.
300 * @wq: workqueue to flush
256 * 301 *
257 * Forces execution of the workqueue and blocks until its completion. 302 * Forces execution of the workqueue and blocks until its completion.
258 * This is typically used in driver shutdown handlers. 303 * This is typically used in driver shutdown handlers.
@@ -275,12 +320,13 @@ void fastcall flush_workqueue(struct workqueue_struct *wq)
275 } else { 320 } else {
276 int cpu; 321 int cpu;
277 322
278 lock_cpu_hotplug(); 323 mutex_lock(&workqueue_mutex);
279 for_each_online_cpu(cpu) 324 for_each_online_cpu(cpu)
280 flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, cpu)); 325 flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, cpu));
281 unlock_cpu_hotplug(); 326 mutex_unlock(&workqueue_mutex);
282 } 327 }
283} 328}
329EXPORT_SYMBOL_GPL(flush_workqueue);
284 330
285static struct task_struct *create_workqueue_thread(struct workqueue_struct *wq, 331static struct task_struct *create_workqueue_thread(struct workqueue_struct *wq,
286 int cpu) 332 int cpu)
@@ -325,8 +371,7 @@ struct workqueue_struct *__create_workqueue(const char *name,
325 } 371 }
326 372
327 wq->name = name; 373 wq->name = name;
328 /* We don't need the distraction of CPUs appearing and vanishing. */ 374 mutex_lock(&workqueue_mutex);
329 lock_cpu_hotplug();
330 if (singlethread) { 375 if (singlethread) {
331 INIT_LIST_HEAD(&wq->list); 376 INIT_LIST_HEAD(&wq->list);
332 p = create_workqueue_thread(wq, singlethread_cpu); 377 p = create_workqueue_thread(wq, singlethread_cpu);
@@ -335,9 +380,7 @@ struct workqueue_struct *__create_workqueue(const char *name,
335 else 380 else
336 wake_up_process(p); 381 wake_up_process(p);
337 } else { 382 } else {
338 spin_lock(&workqueue_lock);
339 list_add(&wq->list, &workqueues); 383 list_add(&wq->list, &workqueues);
340 spin_unlock(&workqueue_lock);
341 for_each_online_cpu(cpu) { 384 for_each_online_cpu(cpu) {
342 p = create_workqueue_thread(wq, cpu); 385 p = create_workqueue_thread(wq, cpu);
343 if (p) { 386 if (p) {
@@ -347,7 +390,7 @@ struct workqueue_struct *__create_workqueue(const char *name,
347 destroy = 1; 390 destroy = 1;
348 } 391 }
349 } 392 }
350 unlock_cpu_hotplug(); 393 mutex_unlock(&workqueue_mutex);
351 394
352 /* 395 /*
353 * Was there any error during startup? If yes then clean up: 396 * Was there any error during startup? If yes then clean up:
@@ -358,6 +401,7 @@ struct workqueue_struct *__create_workqueue(const char *name,
358 } 401 }
359 return wq; 402 return wq;
360} 403}
404EXPORT_SYMBOL_GPL(__create_workqueue);
361 405
362static void cleanup_workqueue_thread(struct workqueue_struct *wq, int cpu) 406static void cleanup_workqueue_thread(struct workqueue_struct *wq, int cpu)
363{ 407{
@@ -374,6 +418,12 @@ static void cleanup_workqueue_thread(struct workqueue_struct *wq, int cpu)
374 kthread_stop(p); 418 kthread_stop(p);
375} 419}
376 420
421/**
422 * destroy_workqueue - safely terminate a workqueue
423 * @wq: target workqueue
424 *
425 * Safely destroy a workqueue. All work currently pending will be done first.
426 */
377void destroy_workqueue(struct workqueue_struct *wq) 427void destroy_workqueue(struct workqueue_struct *wq)
378{ 428{
379 int cpu; 429 int cpu;
@@ -381,69 +431,94 @@ void destroy_workqueue(struct workqueue_struct *wq)
381 flush_workqueue(wq); 431 flush_workqueue(wq);
382 432
383 /* We don't need the distraction of CPUs appearing and vanishing. */ 433 /* We don't need the distraction of CPUs appearing and vanishing. */
384 lock_cpu_hotplug(); 434 mutex_lock(&workqueue_mutex);
385 if (is_single_threaded(wq)) 435 if (is_single_threaded(wq))
386 cleanup_workqueue_thread(wq, singlethread_cpu); 436 cleanup_workqueue_thread(wq, singlethread_cpu);
387 else { 437 else {
388 for_each_online_cpu(cpu) 438 for_each_online_cpu(cpu)
389 cleanup_workqueue_thread(wq, cpu); 439 cleanup_workqueue_thread(wq, cpu);
390 spin_lock(&workqueue_lock);
391 list_del(&wq->list); 440 list_del(&wq->list);
392 spin_unlock(&workqueue_lock);
393 } 441 }
394 unlock_cpu_hotplug(); 442 mutex_unlock(&workqueue_mutex);
395 free_percpu(wq->cpu_wq); 443 free_percpu(wq->cpu_wq);
396 kfree(wq); 444 kfree(wq);
397} 445}
446EXPORT_SYMBOL_GPL(destroy_workqueue);
398 447
399 static struct workqueue_struct *keventd_wq; 448 static struct workqueue_struct *keventd_wq;
400 449
450 /**
451 * schedule_work - put work task in global workqueue
452 * @work: job to be done
453 *
454 * This puts a job in the kernel-global workqueue.
455 */
401 int fastcall schedule_work(struct work_struct *work) 456 int fastcall schedule_work(struct work_struct *work)
402 { 457 {
403 return queue_work(keventd_wq, work); 458 return queue_work(keventd_wq, work);
404 } 459 }
460 EXPORT_SYMBOL(schedule_work);
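A minimal sketch of the common caller pattern (illustrative, not from this patch), using the era's three-argument DECLARE_WORK():

#include <linux/workqueue.h>

static void hello_fn(void *data)
{
	/* Executed later by keventd, in process context. */
}

static DECLARE_WORK(hello_work, hello_fn, NULL);

static void poke_keventd(void)
{
	/* Returns 0 without requeueing if hello_work is already pending. */
	schedule_work(&hello_work);
}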
405 461
462 /**
463 * schedule_delayed_work - put work task in global workqueue after delay
464 * @work: job to be done
465 * @delay: number of jiffies to wait
466 *
467 * After waiting for a given time this puts a job in the kernel-global
468 * workqueue.
469 */
406 int fastcall schedule_delayed_work(struct work_struct *work, unsigned long delay) 470 int fastcall schedule_delayed_work(struct work_struct *work, unsigned long delay)
407 { 471 {
408 return queue_delayed_work(keventd_wq, work, delay); 472 return queue_delayed_work(keventd_wq, work, delay);
409 } 473 }
474 EXPORT_SYMBOL(schedule_delayed_work);
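By way of illustration (names invented here), a self-rearming poll at roughly half-second intervals could be written as:

#include <linux/jiffies.h>
#include <linux/workqueue.h>

static void poll_fn(void *data);
static DECLARE_WORK(poll_work, poll_fn, NULL);

static void poll_fn(void *data)
{
	/* ... do the periodic work ... */
	schedule_delayed_work(&poll_work, msecs_to_jiffies(500));	/* re-arm */
}

static void start_polling(void)
{
	schedule_delayed_work(&poll_work, msecs_to_jiffies(500));
}

Stopping such a self-rearming item reliably is what cancel_rearming_delayed_work()/cancel_rearming_delayed_workqueue(), documented further down in this file, are for.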
410 475
476 /**
477 * schedule_delayed_work_on - queue work in global workqueue on CPU after delay
478 * @cpu: cpu to use
479 * @work: job to be done
480 * @delay: number of jiffies to wait
481 *
482 * After waiting for a given time, this puts a job in the kernel-global
483 * workqueue on the specified CPU.
484 */
411 int schedule_delayed_work_on(int cpu, 485 int schedule_delayed_work_on(int cpu,
412 struct work_struct *work, unsigned long delay) 486 struct work_struct *work, unsigned long delay)
413 { 487 {
414 int ret = 0; 488 return queue_delayed_work_on(cpu, keventd_wq, work, delay);
415 struct timer_list *timer = &work->timer;
416
417 if (!test_and_set_bit(0, &work->pending)) {
418 BUG_ON(timer_pending(timer));
419 BUG_ON(!list_empty(&work->entry));
420 /* This stores keventd_wq for the moment, for the timer_fn */
421 work->wq_data = keventd_wq;
422 timer->expires = jiffies + delay;
423 timer->data = (unsigned long)work;
424 timer->function = delayed_work_timer_fn;
425 add_timer_on(timer, cpu);
426 ret = 1;
427 }
428 return ret;
429 } 489 }
490 EXPORT_SYMBOL(schedule_delayed_work_on);
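After this change schedule_delayed_work_on() is a thin wrapper around queue_delayed_work_on() on the keventd workqueue. A caller pinning delayed work to one CPU might look like this (illustrative names, assuming the three-argument DECLARE_WORK()):

#include <linux/jiffies.h>
#include <linux/workqueue.h>

static void local_tick_fn(void *data)
{
	/* Runs on the keventd thread of the CPU it was queued on. */
}

static DECLARE_WORK(local_tick_work, local_tick_fn, NULL);

static void kick_cpu(int cpu)
{
	/* Fire on @cpu roughly one second from now. */
	schedule_delayed_work_on(cpu, &local_tick_work, HZ);
}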
430 491
431 int schedule_on_each_cpu(void (*func) (void *info), void *info) 492 /**
493 * schedule_on_each_cpu - call a function on each online CPU from keventd
494 * @func: the function to call
495 * @info: a pointer to pass to func()
496 *
497 * Returns zero on success.
498 * Returns -ve errno on failure.
499 *
500 * Appears to be racy against CPU hotplug.
501 *
502 * schedule_on_each_cpu() is very slow.
503 */
504 int schedule_on_each_cpu(void (*func)(void *info), void *info)
432 { 505 {
433 int cpu; 506 int cpu;
434 struct work_struct *work; 507 struct work_struct *works;
435
436 work = kmalloc(NR_CPUS * sizeof(struct work_struct), GFP_KERNEL);
437 508
438 if (!work) 509 works = alloc_percpu(struct work_struct);
510 if (!works)
439 return -ENOMEM; 511 return -ENOMEM;
512
513 mutex_lock(&workqueue_mutex);
440 for_each_online_cpu(cpu) { 514 for_each_online_cpu(cpu) {
441 INIT_WORK(work + cpu, func, info); 515 INIT_WORK(per_cpu_ptr(works, cpu), func, info);
442 __queue_work(per_cpu_ptr(keventd_wq->cpu_wq, cpu), 516 __queue_work(per_cpu_ptr(keventd_wq->cpu_wq, cpu),
443 work + cpu); 517 per_cpu_ptr(works, cpu));
444 } 518 }
519 mutex_unlock(&workqueue_mutex);
445 flush_workqueue(keventd_wq); 520 flush_workqueue(keventd_wq);
446 kfree(work); 521 free_percpu(works);
447 return 0; 522 return 0;
448 } 523 }
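The rewritten schedule_on_each_cpu() above allocates its work items per-CPU and waits for them via flush_workqueue(). A caller-side sketch with hypothetical function names:

#include <linux/workqueue.h>

static void reset_local_counters(void *unused)
{
	/* Runs once on every online CPU, in keventd process context. */
}

static int reset_all_counters(void)
{
	/* Blocks until the function has run everywhere; 0 on success, -ENOMEM on failure. */
	return schedule_on_each_cpu(reset_local_counters, NULL);
}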
449 524
@@ -451,6 +526,7 @@ void flush_scheduled_work(void)
451 { 526 {
452 flush_workqueue(keventd_wq); 527 flush_workqueue(keventd_wq);
453 } 528 }
529 EXPORT_SYMBOL(flush_scheduled_work);
454 530
455/** 531/**
456 * cancel_rearming_delayed_workqueue - reliably kill off a delayed 532 * cancel_rearming_delayed_workqueue - reliably kill off a delayed
@@ -547,7 +623,7 @@ static void take_over_work(struct workqueue_struct *wq, unsigned int cpu)
547 } 623 }
548 624
549 /* We're holding the cpucontrol mutex here */ 625 /* We're holding the cpucontrol mutex here */
550 static int workqueue_cpu_callback(struct notifier_block *nfb, 626 static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
551 unsigned long action, 627 unsigned long action,
552 void *hcpu) 628 void *hcpu)
553 { 629 {
@@ -556,6 +632,7 @@ static int workqueue_cpu_callback(struct notifier_block *nfb,
556 632
557 switch (action) { 633 switch (action) {
558 case CPU_UP_PREPARE: 634 case CPU_UP_PREPARE:
635 mutex_lock(&workqueue_mutex);
559 /* Create a new workqueue thread for it. */ 636 /* Create a new workqueue thread for it. */
560 list_for_each_entry(wq, &workqueues, list) { 637 list_for_each_entry(wq, &workqueues, list) {
561 if (!create_workqueue_thread(wq, hotcpu)) { 638 if (!create_workqueue_thread(wq, hotcpu)) {
@@ -574,15 +651,27 @@ static int workqueue_cpu_callback(struct notifier_block *nfb,
574 kthread_bind(cwq->thread, hotcpu); 651 kthread_bind(cwq->thread, hotcpu);
575 wake_up_process(cwq->thread); 652 wake_up_process(cwq->thread);
576 } 653 }
654 mutex_unlock(&workqueue_mutex);
577 break; 655 break;
578 656
579 case CPU_UP_CANCELED: 657 case CPU_UP_CANCELED:
580 list_for_each_entry(wq, &workqueues, list) { 658 list_for_each_entry(wq, &workqueues, list) {
659 if (!per_cpu_ptr(wq->cpu_wq, hotcpu)->thread)
660 continue;
581 /* Unbind so it can run. */ 661 /* Unbind so it can run. */
582 kthread_bind(per_cpu_ptr(wq->cpu_wq, hotcpu)->thread, 662 kthread_bind(per_cpu_ptr(wq->cpu_wq, hotcpu)->thread,
583 any_online_cpu(cpu_online_map)); 663 any_online_cpu(cpu_online_map));
584 cleanup_workqueue_thread(wq, hotcpu); 664 cleanup_workqueue_thread(wq, hotcpu);
585 } 665 }
666 mutex_unlock(&workqueue_mutex);
667 break;
668
669 case CPU_DOWN_PREPARE:
670 mutex_lock(&workqueue_mutex);
671 break;
672
673 case CPU_DOWN_FAILED:
674 mutex_unlock(&workqueue_mutex);
586 break; 675 break;
587 676
588 case CPU_DEAD: 677 case CPU_DEAD:
@@ -590,6 +679,7 @@ static int workqueue_cpu_callback(struct notifier_block *nfb,
590 cleanup_workqueue_thread(wq, hotcpu); 679 cleanup_workqueue_thread(wq, hotcpu);
591 list_for_each_entry(wq, &workqueues, list) 680 list_for_each_entry(wq, &workqueues, list)
592 take_over_work(wq, hotcpu); 681 take_over_work(wq, hotcpu);
682 mutex_unlock(&workqueue_mutex);
593 break; 683 break;
594 } 684 }
595 685
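The net effect of the notifier changes above is a simple pairing: workqueue_mutex is taken when a CPU starts coming up or going down, and released once the transition completes or is cancelled. A condensed, stand-alone sketch of that pairing (using a local mutex, since workqueue_mutex is static to this file; not the literal kernel code):

#include <linux/cpu.h>
#include <linux/mutex.h>
#include <linux/notifier.h>

static DEFINE_MUTEX(example_mutex);	/* stands in for workqueue_mutex */

static int example_cpu_callback(struct notifier_block *nfb,
				unsigned long action, void *hcpu)
{
	switch (action) {
	case CPU_UP_PREPARE:
	case CPU_DOWN_PREPARE:
		mutex_lock(&example_mutex);	/* freeze workqueue state for the transition */
		break;
	case CPU_ONLINE:
	case CPU_UP_CANCELED:
	case CPU_DOWN_FAILED:
	case CPU_DEAD:
		mutex_unlock(&example_mutex);	/* transition finished or aborted */
		break;
	}
	return NOTIFY_OK;
}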
@@ -605,13 +695,3 @@ void init_workqueues(void)
605 BUG_ON(!keventd_wq); 695 BUG_ON(!keventd_wq);
606 } 696 }
607 697
608 EXPORT_SYMBOL_GPL(__create_workqueue);
609 EXPORT_SYMBOL_GPL(queue_work);
610 EXPORT_SYMBOL_GPL(queue_delayed_work);
611 EXPORT_SYMBOL_GPL(flush_workqueue);
612 EXPORT_SYMBOL_GPL(destroy_workqueue);
613
614 EXPORT_SYMBOL(schedule_work);
615 EXPORT_SYMBOL(schedule_delayed_work);
616 EXPORT_SYMBOL(schedule_delayed_work_on);
617 EXPORT_SYMBOL(flush_scheduled_work);