Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile | 16
-rw-r--r--  kernel/acct.c | 119
-rw-r--r--  kernel/audit.c | 215
-rw-r--r--  kernel/audit.h | 62
-rw-r--r--  kernel/auditfilter.c | 1106
-rw-r--r--  kernel/auditsc.c | 716
-rw-r--r--  kernel/capability.c | 8
-rw-r--r--  kernel/compat.c | 30
-rw-r--r--  kernel/configs.c | 1
-rw-r--r--  kernel/cpu.c | 18
-rw-r--r--  kernel/cpuset.c | 45
-rw-r--r--  kernel/delayacct.c | 178
-rw-r--r--  kernel/exec_domain.c | 1
-rw-r--r--  kernel/exit.c | 87
-rw-r--r--  kernel/fork.c | 94
-rw-r--r--  kernel/futex.c | 1091
-rw-r--r--  kernel/futex_compat.c | 14
-rw-r--r--  kernel/hrtimer.c | 29
-rw-r--r--  kernel/intermodule.c | 184
-rw-r--r--  kernel/irq/Makefile | 2
-rw-r--r--  kernel/irq/autoprobe.c | 56
-rw-r--r--  kernel/irq/chip.c | 537
-rw-r--r--  kernel/irq/handle.c | 161
-rw-r--r--  kernel/irq/internals.h | 46
-rw-r--r--  kernel/irq/manage.c | 185
-rw-r--r--  kernel/irq/migration.c | 22
-rw-r--r--  kernel/irq/proc.c | 33
-rw-r--r--  kernel/irq/resend.c | 78
-rw-r--r--  kernel/irq/spurious.c | 49
-rw-r--r--  kernel/kallsyms.c | 4
-rw-r--r--  kernel/kexec.c | 12
-rw-r--r--  kernel/kmod.c | 3
-rw-r--r--  kernel/kprobes.c | 58
-rw-r--r--  kernel/ksysfs.c | 20
-rw-r--r--  kernel/kthread.c | 63
-rw-r--r--  kernel/lockdep.c | 2704
-rw-r--r--  kernel/lockdep_internals.h | 78
-rw-r--r--  kernel/lockdep_proc.c | 345
-rw-r--r--  kernel/module.c | 166
-rw-r--r--  kernel/mutex-debug.c | 404
-rw-r--r--  kernel/mutex-debug.h | 111
-rw-r--r--  kernel/mutex.c | 95
-rw-r--r--  kernel/mutex.h | 25
-rw-r--r--  kernel/panic.c | 3
-rw-r--r--  kernel/params.c | 1
-rw-r--r--  kernel/pid.c | 6
-rw-r--r--  kernel/posix-cpu-timers.c | 48
-rw-r--r--  kernel/power/Kconfig | 30
-rw-r--r--  kernel/power/disk.c | 2
-rw-r--r--  kernel/power/main.c | 8
-rw-r--r--  kernel/power/pm.c | 37
-rw-r--r--  kernel/power/power.h | 2
-rw-r--r--  kernel/power/snapshot.c | 158
-rw-r--r--  kernel/power/swap.c | 26
-rw-r--r--  kernel/power/swsusp.c | 20
-rw-r--r--  kernel/printk.c | 108
-rw-r--r--  kernel/profile.c | 3
-rw-r--r--  kernel/ptrace.c | 29
-rw-r--r--  kernel/rcupdate.c | 31
-rw-r--r--  kernel/rcutorture.c | 201
-rw-r--r--  kernel/resource.c | 93
-rw-r--r--  kernel/rtmutex-debug.c | 242
-rw-r--r--  kernel/rtmutex-debug.h | 33
-rw-r--r--  kernel/rtmutex-tester.c | 441
-rw-r--r--  kernel/rtmutex.c | 989
-rw-r--r--  kernel/rtmutex.h | 26
-rw-r--r--  kernel/rtmutex_common.h | 123
-rw-r--r--  kernel/rwsem.c | 147
-rw-r--r--  kernel/sched.c | 1972
-rw-r--r--  kernel/signal.c | 45
-rw-r--r--  kernel/softirq.c | 149
-rw-r--r--  kernel/softlockup.c | 8
-rw-r--r--  kernel/spinlock.c | 80
-rw-r--r--  kernel/stacktrace.c | 24
-rw-r--r--  kernel/sys.c | 83
-rw-r--r--  kernel/sys_ni.c | 2
-rw-r--r--  kernel/sysctl.c | 79
-rw-r--r--  kernel/taskstats.c | 568
-rw-r--r--  kernel/time.c | 2
-rw-r--r--  kernel/time/Makefile | 1
-rw-r--r--  kernel/time/clocksource.c | 349
-rw-r--r--  kernel/time/jiffies.c | 73
-rw-r--r--  kernel/timer.c | 472
-rw-r--r--  kernel/unwind.c | 918
-rw-r--r--  kernel/user.c | 4
-rw-r--r--  kernel/wait.c | 9
-rw-r--r--  kernel/workqueue.c | 95
87 files changed, 14238 insertions, 2773 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 58908f9d156a..d62ec66c1af2 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -8,20 +8,30 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \
8 signal.o sys.o kmod.o workqueue.o pid.o \ 8 signal.o sys.o kmod.o workqueue.o pid.o \
9 rcupdate.o extable.o params.o posix-timers.o \ 9 rcupdate.o extable.o params.o posix-timers.o \
10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ 10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
11 hrtimer.o 11 hrtimer.o rwsem.o
12 12
13obj-$(CONFIG_STACKTRACE) += stacktrace.o
14obj-y += time/
13obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o 15obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o
16obj-$(CONFIG_LOCKDEP) += lockdep.o
17ifeq ($(CONFIG_PROC_FS),y)
18obj-$(CONFIG_LOCKDEP) += lockdep_proc.o
19endif
14obj-$(CONFIG_FUTEX) += futex.o 20obj-$(CONFIG_FUTEX) += futex.o
15ifeq ($(CONFIG_COMPAT),y) 21ifeq ($(CONFIG_COMPAT),y)
16obj-$(CONFIG_FUTEX) += futex_compat.o 22obj-$(CONFIG_FUTEX) += futex_compat.o
17endif 23endif
24obj-$(CONFIG_RT_MUTEXES) += rtmutex.o
25obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
26obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o
18obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o 27obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
19obj-$(CONFIG_SMP) += cpu.o spinlock.o 28obj-$(CONFIG_SMP) += cpu.o spinlock.o
20obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o 29obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
30obj-$(CONFIG_PROVE_LOCKING) += spinlock.o
21obj-$(CONFIG_UID16) += uid16.o 31obj-$(CONFIG_UID16) += uid16.o
22obj-$(CONFIG_MODULES) += module.o 32obj-$(CONFIG_MODULES) += module.o
23obj-$(CONFIG_OBSOLETE_INTERMODULE) += intermodule.o
24obj-$(CONFIG_KALLSYMS) += kallsyms.o 33obj-$(CONFIG_KALLSYMS) += kallsyms.o
34obj-$(CONFIG_STACK_UNWIND) += unwind.o
25obj-$(CONFIG_PM) += power/ 35obj-$(CONFIG_PM) += power/
26obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o 36obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
27obj-$(CONFIG_KEXEC) += kexec.o 37obj-$(CONFIG_KEXEC) += kexec.o
@@ -38,6 +48,8 @@ obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
38obj-$(CONFIG_SECCOMP) += seccomp.o 48obj-$(CONFIG_SECCOMP) += seccomp.o
39obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o 49obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
40obj-$(CONFIG_RELAY) += relay.o 50obj-$(CONFIG_RELAY) += relay.o
51obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
52obj-$(CONFIG_TASKSTATS) += taskstats.o
41 53
42ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y) 54ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
43# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is 55# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/acct.c b/kernel/acct.c
index b327f4d20104..2a7c933651c7 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -43,7 +43,6 @@
43 * a struct file opened for write. Fixed. 2/6/2000, AV. 43 * a struct file opened for write. Fixed. 2/6/2000, AV.
44 */ 44 */
45 45
46#include <linux/config.h>
47#include <linux/mm.h> 46#include <linux/mm.h>
48#include <linux/slab.h> 47#include <linux/slab.h>
49#include <linux/acct.h> 48#include <linux/acct.h>
@@ -75,7 +74,7 @@ int acct_parm[3] = {4, 2, 30};
75/* 74/*
76 * External references and all of the globals. 75 * External references and all of the globals.
77 */ 76 */
78static void do_acct_process(long, struct file *); 77static void do_acct_process(struct file *);
79 78
80/* 79/*
81 * This structure is used so that all the data protected by lock 80 * This structure is used so that all the data protected by lock
@@ -118,7 +117,7 @@ static int check_free_space(struct file *file)
118 spin_unlock(&acct_globals.lock); 117 spin_unlock(&acct_globals.lock);
119 118
120 /* May block */ 119 /* May block */
121 if (vfs_statfs(file->f_dentry->d_inode->i_sb, &sbuf)) 120 if (vfs_statfs(file->f_dentry, &sbuf))
122 return res; 121 return res;
123 suspend = sbuf.f_blocks * SUSPEND; 122 suspend = sbuf.f_blocks * SUSPEND;
124 resume = sbuf.f_blocks * RESUME; 123 resume = sbuf.f_blocks * RESUME;
@@ -196,7 +195,7 @@ static void acct_file_reopen(struct file *file)
196 if (old_acct) { 195 if (old_acct) {
197 mnt_unpin(old_acct->f_vfsmnt); 196 mnt_unpin(old_acct->f_vfsmnt);
198 spin_unlock(&acct_globals.lock); 197 spin_unlock(&acct_globals.lock);
199 do_acct_process(0, old_acct); 198 do_acct_process(old_acct);
200 filp_close(old_acct, NULL); 199 filp_close(old_acct, NULL);
201 spin_lock(&acct_globals.lock); 200 spin_lock(&acct_globals.lock);
202 } 201 }
@@ -419,16 +418,15 @@ static u32 encode_float(u64 value)
419/* 418/*
420 * do_acct_process does all actual work. Caller holds the reference to file. 419 * do_acct_process does all actual work. Caller holds the reference to file.
421 */ 420 */
422static void do_acct_process(long exitcode, struct file *file) 421static void do_acct_process(struct file *file)
423{ 422{
423 struct pacct_struct *pacct = &current->signal->pacct;
424 acct_t ac; 424 acct_t ac;
425 mm_segment_t fs; 425 mm_segment_t fs;
426 unsigned long vsize;
427 unsigned long flim; 426 unsigned long flim;
428 u64 elapsed; 427 u64 elapsed;
429 u64 run_time; 428 u64 run_time;
430 struct timespec uptime; 429 struct timespec uptime;
431 unsigned long jiffies;
432 430
433 /* 431 /*
434 * First check to see if there is enough free_space to continue 432 * First check to see if there is enough free_space to continue
@@ -469,12 +467,6 @@ static void do_acct_process(long exitcode, struct file *file)
469#endif 467#endif
470 do_div(elapsed, AHZ); 468 do_div(elapsed, AHZ);
471 ac.ac_btime = xtime.tv_sec - elapsed; 469 ac.ac_btime = xtime.tv_sec - elapsed;
472 jiffies = cputime_to_jiffies(cputime_add(current->utime,
473 current->signal->utime));
474 ac.ac_utime = encode_comp_t(jiffies_to_AHZ(jiffies));
475 jiffies = cputime_to_jiffies(cputime_add(current->stime,
476 current->signal->stime));
477 ac.ac_stime = encode_comp_t(jiffies_to_AHZ(jiffies));
478 /* we really need to bite the bullet and change layout */ 470 /* we really need to bite the bullet and change layout */
479 ac.ac_uid = current->uid; 471 ac.ac_uid = current->uid;
480 ac.ac_gid = current->gid; 472 ac.ac_gid = current->gid;
@@ -496,37 +488,18 @@ static void do_acct_process(long exitcode, struct file *file)
496 old_encode_dev(tty_devnum(current->signal->tty)) : 0; 488 old_encode_dev(tty_devnum(current->signal->tty)) : 0;
497 read_unlock(&tasklist_lock); 489 read_unlock(&tasklist_lock);
498 490
499 ac.ac_flag = 0; 491 spin_lock_irq(&current->sighand->siglock);
500 if (current->flags & PF_FORKNOEXEC) 492 ac.ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime)));
501 ac.ac_flag |= AFORK; 493 ac.ac_stime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_stime)));
502 if (current->flags & PF_SUPERPRIV) 494 ac.ac_flag = pacct->ac_flag;
503 ac.ac_flag |= ASU; 495 ac.ac_mem = encode_comp_t(pacct->ac_mem);
504 if (current->flags & PF_DUMPCORE) 496 ac.ac_minflt = encode_comp_t(pacct->ac_minflt);
505 ac.ac_flag |= ACORE; 497 ac.ac_majflt = encode_comp_t(pacct->ac_majflt);
506 if (current->flags & PF_SIGNALED) 498 ac.ac_exitcode = pacct->ac_exitcode;
507 ac.ac_flag |= AXSIG; 499 spin_unlock_irq(&current->sighand->siglock);
508
509 vsize = 0;
510 if (current->mm) {
511 struct vm_area_struct *vma;
512 down_read(&current->mm->mmap_sem);
513 vma = current->mm->mmap;
514 while (vma) {
515 vsize += vma->vm_end - vma->vm_start;
516 vma = vma->vm_next;
517 }
518 up_read(&current->mm->mmap_sem);
519 }
520 vsize = vsize / 1024;
521 ac.ac_mem = encode_comp_t(vsize);
522 ac.ac_io = encode_comp_t(0 /* current->io_usage */); /* %% */ 500 ac.ac_io = encode_comp_t(0 /* current->io_usage */); /* %% */
523 ac.ac_rw = encode_comp_t(ac.ac_io / 1024); 501 ac.ac_rw = encode_comp_t(ac.ac_io / 1024);
524 ac.ac_minflt = encode_comp_t(current->signal->min_flt +
525 current->min_flt);
526 ac.ac_majflt = encode_comp_t(current->signal->maj_flt +
527 current->maj_flt);
528 ac.ac_swaps = encode_comp_t(0); 502 ac.ac_swaps = encode_comp_t(0);
529 ac.ac_exitcode = exitcode;
530 503
531 /* 504 /*
532 * Kernel segment override to datasegment and write it 505 * Kernel segment override to datasegment and write it
@@ -546,12 +519,64 @@ static void do_acct_process(long exitcode, struct file *file)
546} 519}
547 520
548/** 521/**
522 * acct_init_pacct - initialize a new pacct_struct
523 * @pacct: per-process accounting info struct to initialize
524 */
525void acct_init_pacct(struct pacct_struct *pacct)
526{
527 memset(pacct, 0, sizeof(struct pacct_struct));
528 pacct->ac_utime = pacct->ac_stime = cputime_zero;
529}
530
531/**
532 * acct_collect - collect accounting information into pacct_struct
533 * @exitcode: task exit code
534 * @group_dead: not 0, if this thread is the last one in the process.
535 */
536void acct_collect(long exitcode, int group_dead)
537{
538 struct pacct_struct *pacct = &current->signal->pacct;
539 unsigned long vsize = 0;
540
541 if (group_dead && current->mm) {
542 struct vm_area_struct *vma;
543 down_read(&current->mm->mmap_sem);
544 vma = current->mm->mmap;
545 while (vma) {
546 vsize += vma->vm_end - vma->vm_start;
547 vma = vma->vm_next;
548 }
549 up_read(&current->mm->mmap_sem);
550 }
551
552 spin_lock_irq(&current->sighand->siglock);
553 if (group_dead)
554 pacct->ac_mem = vsize / 1024;
555 if (thread_group_leader(current)) {
556 pacct->ac_exitcode = exitcode;
557 if (current->flags & PF_FORKNOEXEC)
558 pacct->ac_flag |= AFORK;
559 }
560 if (current->flags & PF_SUPERPRIV)
561 pacct->ac_flag |= ASU;
562 if (current->flags & PF_DUMPCORE)
563 pacct->ac_flag |= ACORE;
564 if (current->flags & PF_SIGNALED)
565 pacct->ac_flag |= AXSIG;
566 pacct->ac_utime = cputime_add(pacct->ac_utime, current->utime);
567 pacct->ac_stime = cputime_add(pacct->ac_stime, current->stime);
568 pacct->ac_minflt += current->min_flt;
569 pacct->ac_majflt += current->maj_flt;
570 spin_unlock_irq(&current->sighand->siglock);
571}
572
573/**
549 * acct_process - now just a wrapper around do_acct_process 574 * acct_process - now just a wrapper around do_acct_process
550 * @exitcode: task exit code 575 * @exitcode: task exit code
551 * 576 *
552 * handles process accounting for an exiting task 577 * handles process accounting for an exiting task
553 */ 578 */
554void acct_process(long exitcode) 579void acct_process(void)
555{ 580{
556 struct file *file = NULL; 581 struct file *file = NULL;
557 582
@@ -570,7 +595,7 @@ void acct_process(long exitcode)
570 get_file(file); 595 get_file(file);
571 spin_unlock(&acct_globals.lock); 596 spin_unlock(&acct_globals.lock);
572 597
573 do_acct_process(exitcode, file); 598 do_acct_process(file);
574 fput(file); 599 fput(file);
575} 600}
576 601
@@ -599,9 +624,7 @@ void acct_update_integrals(struct task_struct *tsk)
599 */ 624 */
600void acct_clear_integrals(struct task_struct *tsk) 625void acct_clear_integrals(struct task_struct *tsk)
601{ 626{
602 if (tsk) { 627 tsk->acct_stimexpd = 0;
603 tsk->acct_stimexpd = 0; 628 tsk->acct_rss_mem1 = 0;
604 tsk->acct_rss_mem1 = 0; 629 tsk->acct_vm_mem1 = 0;
605 tsk->acct_vm_mem1 = 0;
606 }
607} 630}
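
The kernel/acct.c changes above split process accounting in two: acct_collect() aggregates per-thread statistics into signal->pacct under siglock as threads exit, and do_acct_process() later writes a single record from that aggregate. The sketch below is a hedged illustration of how the two entry points fit together; the function name example_exit_accounting and the group_dead gating are assumptions, and the real call sites live in kernel/exit.c, which is not part of this diff.

#include <linux/acct.h>

/* Hypothetical exit-path driver for the acct_collect()/acct_process() split. */
static void example_exit_accounting(long code, int group_dead)
{
	/* per-thread: fold utime/stime, fault counts, flags and the exit
	 * code into current->signal->pacct under siglock */
	acct_collect(code, group_dead);

	/* one record for the whole thread group; do_acct_process() now
	 * reads pacct->ac_exitcode instead of taking the code as an arg */
	if (group_dead)
		acct_process();
}
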
diff --git a/kernel/audit.c b/kernel/audit.c
index df57b493e1cb..d417ca1db79b 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -56,6 +56,7 @@
56#include <linux/skbuff.h> 56#include <linux/skbuff.h>
57#include <linux/netlink.h> 57#include <linux/netlink.h>
58#include <linux/selinux.h> 58#include <linux/selinux.h>
59#include <linux/inotify.h>
59 60
60#include "audit.h" 61#include "audit.h"
61 62
@@ -89,6 +90,7 @@ static int audit_backlog_wait_overflow = 0;
89/* The identity of the user shutting down the audit system. */ 90/* The identity of the user shutting down the audit system. */
90uid_t audit_sig_uid = -1; 91uid_t audit_sig_uid = -1;
91pid_t audit_sig_pid = -1; 92pid_t audit_sig_pid = -1;
93u32 audit_sig_sid = 0;
92 94
93/* Records can be lost in several ways: 95/* Records can be lost in several ways:
94 0) [suppressed in audit_alloc] 96 0) [suppressed in audit_alloc]
@@ -102,6 +104,12 @@ static atomic_t audit_lost = ATOMIC_INIT(0);
102/* The netlink socket. */ 104/* The netlink socket. */
103static struct sock *audit_sock; 105static struct sock *audit_sock;
104 106
107/* Inotify handle. */
108struct inotify_handle *audit_ih;
109
110/* Hash for inode-based rules */
111struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS];
112
105/* The audit_freelist is a list of pre-allocated audit buffers (if more 113/* The audit_freelist is a list of pre-allocated audit buffers (if more
106 * than AUDIT_MAXFREE are in use, the audit buffer is freed instead of 114 * than AUDIT_MAXFREE are in use, the audit buffer is freed instead of
107 * being placed on the freelist). */ 115 * being placed on the freelist). */
@@ -114,10 +122,8 @@ static struct task_struct *kauditd_task;
114static DECLARE_WAIT_QUEUE_HEAD(kauditd_wait); 122static DECLARE_WAIT_QUEUE_HEAD(kauditd_wait);
115static DECLARE_WAIT_QUEUE_HEAD(audit_backlog_wait); 123static DECLARE_WAIT_QUEUE_HEAD(audit_backlog_wait);
116 124
117/* The netlink socket is only to be read by 1 CPU, which lets us assume 125/* Serialize requests from userspace. */
118 * that list additions and deletions never happen simultaneously in 126static DEFINE_MUTEX(audit_cmd_mutex);
119 * auditsc.c */
120DEFINE_MUTEX(audit_netlink_mutex);
121 127
122/* AUDIT_BUFSIZ is the size of the temporary buffer used for formatting 128/* AUDIT_BUFSIZ is the size of the temporary buffer used for formatting
123 * audit records. Since printk uses a 1024 byte buffer, this buffer 129 * audit records. Since printk uses a 1024 byte buffer, this buffer
@@ -250,7 +256,7 @@ static int audit_set_rate_limit(int limit, uid_t loginuid, u32 sid)
250 "audit_rate_limit=%d old=%d by auid=%u", 256 "audit_rate_limit=%d old=%d by auid=%u",
251 limit, old, loginuid); 257 limit, old, loginuid);
252 audit_rate_limit = limit; 258 audit_rate_limit = limit;
253 return old; 259 return 0;
254} 260}
255 261
256static int audit_set_backlog_limit(int limit, uid_t loginuid, u32 sid) 262static int audit_set_backlog_limit(int limit, uid_t loginuid, u32 sid)
@@ -273,7 +279,7 @@ static int audit_set_backlog_limit(int limit, uid_t loginuid, u32 sid)
273 "audit_backlog_limit=%d old=%d by auid=%u", 279 "audit_backlog_limit=%d old=%d by auid=%u",
274 limit, old, loginuid); 280 limit, old, loginuid);
275 audit_backlog_limit = limit; 281 audit_backlog_limit = limit;
276 return old; 282 return 0;
277} 283}
278 284
279static int audit_set_enabled(int state, uid_t loginuid, u32 sid) 285static int audit_set_enabled(int state, uid_t loginuid, u32 sid)
@@ -299,7 +305,7 @@ static int audit_set_enabled(int state, uid_t loginuid, u32 sid)
299 "audit_enabled=%d old=%d by auid=%u", 305 "audit_enabled=%d old=%d by auid=%u",
300 state, old, loginuid); 306 state, old, loginuid);
301 audit_enabled = state; 307 audit_enabled = state;
302 return old; 308 return 0;
303} 309}
304 310
305static int audit_set_failure(int state, uid_t loginuid, u32 sid) 311static int audit_set_failure(int state, uid_t loginuid, u32 sid)
@@ -327,7 +333,7 @@ static int audit_set_failure(int state, uid_t loginuid, u32 sid)
327 "audit_failure=%d old=%d by auid=%u", 333 "audit_failure=%d old=%d by auid=%u",
328 state, old, loginuid); 334 state, old, loginuid);
329 audit_failure = state; 335 audit_failure = state;
330 return old; 336 return 0;
331} 337}
332 338
333static int kauditd_thread(void *dummy) 339static int kauditd_thread(void *dummy)
@@ -363,9 +369,52 @@ static int kauditd_thread(void *dummy)
363 remove_wait_queue(&kauditd_wait, &wait); 369 remove_wait_queue(&kauditd_wait, &wait);
364 } 370 }
365 } 371 }
372}
373
374int audit_send_list(void *_dest)
375{
376 struct audit_netlink_list *dest = _dest;
377 int pid = dest->pid;
378 struct sk_buff *skb;
379
380 /* wait for parent to finish and send an ACK */
381 mutex_lock(&audit_cmd_mutex);
382 mutex_unlock(&audit_cmd_mutex);
383
384 while ((skb = __skb_dequeue(&dest->q)) != NULL)
385 netlink_unicast(audit_sock, skb, pid, 0);
386
387 kfree(dest);
388
366 return 0; 389 return 0;
367} 390}
368 391
392struct sk_buff *audit_make_reply(int pid, int seq, int type, int done,
393 int multi, void *payload, int size)
394{
395 struct sk_buff *skb;
396 struct nlmsghdr *nlh;
397 int len = NLMSG_SPACE(size);
398 void *data;
399 int flags = multi ? NLM_F_MULTI : 0;
400 int t = done ? NLMSG_DONE : type;
401
402 skb = alloc_skb(len, GFP_KERNEL);
403 if (!skb)
404 return NULL;
405
406 nlh = NLMSG_PUT(skb, pid, seq, t, size);
407 nlh->nlmsg_flags = flags;
408 data = NLMSG_DATA(nlh);
409 memcpy(data, payload, size);
410 return skb;
411
412nlmsg_failure: /* Used by NLMSG_PUT */
413 if (skb)
414 kfree_skb(skb);
415 return NULL;
416}
417
369/** 418/**
370 * audit_send_reply - send an audit reply message via netlink 419 * audit_send_reply - send an audit reply message via netlink
371 * @pid: process id to send reply to 420 * @pid: process id to send reply to
@@ -383,36 +432,20 @@ void audit_send_reply(int pid, int seq, int type, int done, int multi,
383 void *payload, int size) 432 void *payload, int size)
384{ 433{
385 struct sk_buff *skb; 434 struct sk_buff *skb;
386 struct nlmsghdr *nlh; 435 skb = audit_make_reply(pid, seq, type, done, multi, payload, size);
387 int len = NLMSG_SPACE(size);
388 void *data;
389 int flags = multi ? NLM_F_MULTI : 0;
390 int t = done ? NLMSG_DONE : type;
391
392 skb = alloc_skb(len, GFP_KERNEL);
393 if (!skb) 436 if (!skb)
394 return; 437 return;
395
396 nlh = NLMSG_PUT(skb, pid, seq, t, size);
397 nlh->nlmsg_flags = flags;
398 data = NLMSG_DATA(nlh);
399 memcpy(data, payload, size);
400
401 /* Ignore failure. It'll only happen if the sender goes away, 438 /* Ignore failure. It'll only happen if the sender goes away,
402 because our timeout is set to infinite. */ 439 because our timeout is set to infinite. */
403 netlink_unicast(audit_sock, skb, pid, 0); 440 netlink_unicast(audit_sock, skb, pid, 0);
404 return; 441 return;
405
406nlmsg_failure: /* Used by NLMSG_PUT */
407 if (skb)
408 kfree_skb(skb);
409} 442}
410 443
411/* 444/*
412 * Check for appropriate CAP_AUDIT_ capabilities on incoming audit 445 * Check for appropriate CAP_AUDIT_ capabilities on incoming audit
413 * control messages. 446 * control messages.
414 */ 447 */
415static int audit_netlink_ok(kernel_cap_t eff_cap, u16 msg_type) 448static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type)
416{ 449{
417 int err = 0; 450 int err = 0;
418 451
@@ -426,13 +459,13 @@ static int audit_netlink_ok(kernel_cap_t eff_cap, u16 msg_type)
426 case AUDIT_DEL: 459 case AUDIT_DEL:
427 case AUDIT_DEL_RULE: 460 case AUDIT_DEL_RULE:
428 case AUDIT_SIGNAL_INFO: 461 case AUDIT_SIGNAL_INFO:
429 if (!cap_raised(eff_cap, CAP_AUDIT_CONTROL)) 462 if (security_netlink_recv(skb, CAP_AUDIT_CONTROL))
430 err = -EPERM; 463 err = -EPERM;
431 break; 464 break;
432 case AUDIT_USER: 465 case AUDIT_USER:
433 case AUDIT_FIRST_USER_MSG...AUDIT_LAST_USER_MSG: 466 case AUDIT_FIRST_USER_MSG...AUDIT_LAST_USER_MSG:
434 case AUDIT_FIRST_USER_MSG2...AUDIT_LAST_USER_MSG2: 467 case AUDIT_FIRST_USER_MSG2...AUDIT_LAST_USER_MSG2:
435 if (!cap_raised(eff_cap, CAP_AUDIT_WRITE)) 468 if (security_netlink_recv(skb, CAP_AUDIT_WRITE))
436 err = -EPERM; 469 err = -EPERM;
437 break; 470 break;
438 default: /* bad msg */ 471 default: /* bad msg */
@@ -451,9 +484,11 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
451 struct audit_buffer *ab; 484 struct audit_buffer *ab;
452 u16 msg_type = nlh->nlmsg_type; 485 u16 msg_type = nlh->nlmsg_type;
453 uid_t loginuid; /* loginuid of sender */ 486 uid_t loginuid; /* loginuid of sender */
454 struct audit_sig_info sig_data; 487 struct audit_sig_info *sig_data;
488 char *ctx;
489 u32 len;
455 490
456 err = audit_netlink_ok(NETLINK_CB(skb).eff_cap, msg_type); 491 err = audit_netlink_ok(skb, msg_type);
457 if (err) 492 if (err)
458 return err; 493 return err;
459 494
@@ -503,12 +538,9 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
503 if (status_get->mask & AUDIT_STATUS_PID) { 538 if (status_get->mask & AUDIT_STATUS_PID) {
504 int old = audit_pid; 539 int old = audit_pid;
505 if (sid) { 540 if (sid) {
506 char *ctx = NULL; 541 if ((err = selinux_ctxid_to_string(
507 u32 len;
508 int rc;
509 if ((rc = selinux_ctxid_to_string(
510 sid, &ctx, &len))) 542 sid, &ctx, &len)))
511 return rc; 543 return err;
512 else 544 else
513 audit_log(NULL, GFP_KERNEL, 545 audit_log(NULL, GFP_KERNEL,
514 AUDIT_CONFIG_CHANGE, 546 AUDIT_CONFIG_CHANGE,
@@ -523,10 +555,10 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
523 audit_pid = status_get->pid; 555 audit_pid = status_get->pid;
524 } 556 }
525 if (status_get->mask & AUDIT_STATUS_RATE_LIMIT) 557 if (status_get->mask & AUDIT_STATUS_RATE_LIMIT)
526 audit_set_rate_limit(status_get->rate_limit, 558 err = audit_set_rate_limit(status_get->rate_limit,
527 loginuid, sid); 559 loginuid, sid);
528 if (status_get->mask & AUDIT_STATUS_BACKLOG_LIMIT) 560 if (status_get->mask & AUDIT_STATUS_BACKLOG_LIMIT)
529 audit_set_backlog_limit(status_get->backlog_limit, 561 err = audit_set_backlog_limit(status_get->backlog_limit,
530 loginuid, sid); 562 loginuid, sid);
531 break; 563 break;
532 case AUDIT_USER: 564 case AUDIT_USER:
@@ -544,8 +576,6 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
544 "user pid=%d uid=%u auid=%u", 576 "user pid=%d uid=%u auid=%u",
545 pid, uid, loginuid); 577 pid, uid, loginuid);
546 if (sid) { 578 if (sid) {
547 char *ctx = NULL;
548 u32 len;
549 if (selinux_ctxid_to_string( 579 if (selinux_ctxid_to_string(
550 sid, &ctx, &len)) { 580 sid, &ctx, &len)) {
551 audit_log_format(ab, 581 audit_log_format(ab,
@@ -584,10 +614,21 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
584 loginuid, sid); 614 loginuid, sid);
585 break; 615 break;
586 case AUDIT_SIGNAL_INFO: 616 case AUDIT_SIGNAL_INFO:
587 sig_data.uid = audit_sig_uid; 617 err = selinux_ctxid_to_string(audit_sig_sid, &ctx, &len);
588 sig_data.pid = audit_sig_pid; 618 if (err)
619 return err;
620 sig_data = kmalloc(sizeof(*sig_data) + len, GFP_KERNEL);
621 if (!sig_data) {
622 kfree(ctx);
623 return -ENOMEM;
624 }
625 sig_data->uid = audit_sig_uid;
626 sig_data->pid = audit_sig_pid;
627 memcpy(sig_data->ctx, ctx, len);
628 kfree(ctx);
589 audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_SIGNAL_INFO, 629 audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_SIGNAL_INFO,
590 0, 0, &sig_data, sizeof(sig_data)); 630 0, 0, sig_data, sizeof(*sig_data) + len);
631 kfree(sig_data);
591 break; 632 break;
592 default: 633 default:
593 err = -EINVAL; 634 err = -EINVAL;
@@ -629,20 +670,30 @@ static void audit_receive(struct sock *sk, int length)
629 struct sk_buff *skb; 670 struct sk_buff *skb;
630 unsigned int qlen; 671 unsigned int qlen;
631 672
632 mutex_lock(&audit_netlink_mutex); 673 mutex_lock(&audit_cmd_mutex);
633 674
634 for (qlen = skb_queue_len(&sk->sk_receive_queue); qlen; qlen--) { 675 for (qlen = skb_queue_len(&sk->sk_receive_queue); qlen; qlen--) {
635 skb = skb_dequeue(&sk->sk_receive_queue); 676 skb = skb_dequeue(&sk->sk_receive_queue);
636 audit_receive_skb(skb); 677 audit_receive_skb(skb);
637 kfree_skb(skb); 678 kfree_skb(skb);
638 } 679 }
639 mutex_unlock(&audit_netlink_mutex); 680 mutex_unlock(&audit_cmd_mutex);
640} 681}
641 682
683#ifdef CONFIG_AUDITSYSCALL
684static const struct inotify_operations audit_inotify_ops = {
685 .handle_event = audit_handle_ievent,
686 .destroy_watch = audit_free_parent,
687};
688#endif
642 689
643/* Initialize audit support at boot time. */ 690/* Initialize audit support at boot time. */
644static int __init audit_init(void) 691static int __init audit_init(void)
645{ 692{
693#ifdef CONFIG_AUDITSYSCALL
694 int i;
695#endif
696
646 printk(KERN_INFO "audit: initializing netlink socket (%s)\n", 697 printk(KERN_INFO "audit: initializing netlink socket (%s)\n",
647 audit_default ? "enabled" : "disabled"); 698 audit_default ? "enabled" : "disabled");
648 audit_sock = netlink_kernel_create(NETLINK_AUDIT, 0, audit_receive, 699 audit_sock = netlink_kernel_create(NETLINK_AUDIT, 0, audit_receive,
@@ -661,6 +712,16 @@ static int __init audit_init(void)
661 selinux_audit_set_callback(&selinux_audit_rule_update); 712 selinux_audit_set_callback(&selinux_audit_rule_update);
662 713
663 audit_log(NULL, GFP_KERNEL, AUDIT_KERNEL, "initialized"); 714 audit_log(NULL, GFP_KERNEL, AUDIT_KERNEL, "initialized");
715
716#ifdef CONFIG_AUDITSYSCALL
717 audit_ih = inotify_init(&audit_inotify_ops);
718 if (IS_ERR(audit_ih))
719 audit_panic("cannot initialize inotify handle");
720
721 for (i = 0; i < AUDIT_INODE_BUCKETS; i++)
722 INIT_LIST_HEAD(&audit_inode_hash[i]);
723#endif
724
664 return 0; 725 return 0;
665} 726}
666__initcall(audit_init); 727__initcall(audit_init);
@@ -690,10 +751,12 @@ static void audit_buffer_free(struct audit_buffer *ab)
690 kfree_skb(ab->skb); 751 kfree_skb(ab->skb);
691 752
692 spin_lock_irqsave(&audit_freelist_lock, flags); 753 spin_lock_irqsave(&audit_freelist_lock, flags);
693 if (++audit_freelist_count > AUDIT_MAXFREE) 754 if (audit_freelist_count > AUDIT_MAXFREE)
694 kfree(ab); 755 kfree(ab);
695 else 756 else {
757 audit_freelist_count++;
696 list_add(&ab->list, &audit_freelist); 758 list_add(&ab->list, &audit_freelist);
759 }
697 spin_unlock_irqrestore(&audit_freelist_lock, flags); 760 spin_unlock_irqrestore(&audit_freelist_lock, flags);
698} 761}
699 762
@@ -755,7 +818,7 @@ err:
755 */ 818 */
756unsigned int audit_serial(void) 819unsigned int audit_serial(void)
757{ 820{
758 static spinlock_t serial_lock = SPIN_LOCK_UNLOCKED; 821 static DEFINE_SPINLOCK(serial_lock);
759 static unsigned int serial = 0; 822 static unsigned int serial = 0;
760 823
761 unsigned long flags; 824 unsigned long flags;
@@ -988,28 +1051,76 @@ void audit_log_hex(struct audit_buffer *ab, const unsigned char *buf,
988 skb_put(skb, len << 1); /* new string is twice the old string */ 1051 skb_put(skb, len << 1); /* new string is twice the old string */
989} 1052}
990 1053
1054/*
1055 * Format a string of no more than slen characters into the audit buffer,
1056 * enclosed in quote marks.
1057 */
1058static void audit_log_n_string(struct audit_buffer *ab, size_t slen,
1059 const char *string)
1060{
1061 int avail, new_len;
1062 unsigned char *ptr;
1063 struct sk_buff *skb;
1064
1065 BUG_ON(!ab->skb);
1066 skb = ab->skb;
1067 avail = skb_tailroom(skb);
1068 new_len = slen + 3; /* enclosing quotes + null terminator */
1069 if (new_len > avail) {
1070 avail = audit_expand(ab, new_len);
1071 if (!avail)
1072 return;
1073 }
1074 ptr = skb->tail;
1075 *ptr++ = '"';
1076 memcpy(ptr, string, slen);
1077 ptr += slen;
1078 *ptr++ = '"';
1079 *ptr = 0;
1080 skb_put(skb, slen + 2); /* don't include null terminator */
1081}
1082
991/** 1083/**
992 * audit_log_unstrustedstring - log a string that may contain random characters 1084 * audit_log_n_unstrustedstring - log a string that may contain random characters
993 * @ab: audit_buffer 1085 * @ab: audit_buffer
1086 * @len: lenth of string (not including trailing null)
994 * @string: string to be logged 1087 * @string: string to be logged
995 * 1088 *
996 * This code will escape a string that is passed to it if the string 1089 * This code will escape a string that is passed to it if the string
997 * contains a control character, unprintable character, double quote mark, 1090 * contains a control character, unprintable character, double quote mark,
998 * or a space. Unescaped strings will start and end with a double quote mark. 1091 * or a space. Unescaped strings will start and end with a double quote mark.
999 * Strings that are escaped are printed in hex (2 digits per char). 1092 * Strings that are escaped are printed in hex (2 digits per char).
1093 *
1094 * The caller specifies the number of characters in the string to log, which may
1095 * or may not be the entire string.
1000 */ 1096 */
1001void audit_log_untrustedstring(struct audit_buffer *ab, const char *string) 1097const char *audit_log_n_untrustedstring(struct audit_buffer *ab, size_t len,
1098 const char *string)
1002{ 1099{
1003 const unsigned char *p = string; 1100 const unsigned char *p = string;
1004 1101
1005 while (*p) { 1102 while (*p) {
1006 if (*p == '"' || *p < 0x21 || *p > 0x7f) { 1103 if (*p == '"' || *p < 0x21 || *p > 0x7f) {
1007 audit_log_hex(ab, string, strlen(string)); 1104 audit_log_hex(ab, string, len);
1008 return; 1105 return string + len + 1;
1009 } 1106 }
1010 p++; 1107 p++;
1011 } 1108 }
1012 audit_log_format(ab, "\"%s\"", string); 1109 audit_log_n_string(ab, len, string);
1110 return p + 1;
1111}
1112
1113/**
1114 * audit_log_unstrustedstring - log a string that may contain random characters
1115 * @ab: audit_buffer
1116 * @string: string to be logged
1117 *
1118 * Same as audit_log_n_unstrustedstring(), except that strlen is used to
1119 * determine string length.
1120 */
1121const char *audit_log_untrustedstring(struct audit_buffer *ab, const char *string)
1122{
1123 return audit_log_n_untrustedstring(ab, strlen(string), string);
1013} 1124}
1014 1125
1015/* This is a helper-function to print the escaped d_path */ 1126/* This is a helper-function to print the escaped d_path */
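
kernel/audit.c now separates building a netlink reply (audit_make_reply()) from delivering it, and adds audit_send_list(), which a helper thread can run to drain a queued audit_netlink_list once audit_cmd_mutex has been dropped. The sketch below is an assumption about the intended usage pattern, not an excerpt: the name example_send_rule_list and the single-payload shape are illustrative, while audit_make_reply(), audit_send_list() and struct audit_netlink_list come from this diff.

#include <linux/audit.h>
#include <linux/err.h>
#include <linux/kthread.h>
#include <linux/skbuff.h>
#include <linux/slab.h>
#include "audit.h"

/* Hypothetical command handler: queue NLM_F_MULTI replies, hand them off. */
static int example_send_rule_list(int pid, int seq, void *payload, int len)
{
	struct audit_netlink_list *dest;
	struct task_struct *tsk;
	struct sk_buff *skb;

	dest = kmalloc(sizeof(*dest), GFP_KERNEL);
	if (!dest)
		return -ENOMEM;
	dest->pid = pid;
	skb_queue_head_init(&dest->q);

	/* build each multi-part reply without sending it ... */
	skb = audit_make_reply(pid, seq, AUDIT_LIST, 0, 1, payload, len);
	if (skb)
		skb_queue_tail(&dest->q, skb);
	/* ... and finish with an NLMSG_DONE terminator */
	skb = audit_make_reply(pid, seq, AUDIT_LIST, 1, 1, NULL, 0);
	if (skb)
		skb_queue_tail(&dest->q, skb);

	/* audit_send_list() briefly takes audit_cmd_mutex so the ACK for
	 * this request goes out first, then unicasts each queued skb */
	tsk = kthread_run(audit_send_list, dest, "audit_send_list");
	if (IS_ERR(tsk)) {
		skb_queue_purge(&dest->q);
		kfree(dest);
		return PTR_ERR(tsk);
	}
	return 0;
}
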
diff --git a/kernel/audit.h b/kernel/audit.h
index 6f733920fd32..6aa33b848cf2 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -19,9 +19,9 @@
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */ 20 */
21 21
22#include <linux/mutex.h>
23#include <linux/fs.h> 22#include <linux/fs.h>
24#include <linux/audit.h> 23#include <linux/audit.h>
24#include <linux/skbuff.h>
25 25
26/* 0 = no checking 26/* 0 = no checking
27 1 = put_count checking 27 1 = put_count checking
@@ -53,6 +53,18 @@ enum audit_state {
53}; 53};
54 54
55/* Rule lists */ 55/* Rule lists */
56struct audit_parent;
57
58struct audit_watch {
59 atomic_t count; /* reference count */
60 char *path; /* insertion path */
61 dev_t dev; /* associated superblock device */
62 unsigned long ino; /* associated inode number */
63 struct audit_parent *parent; /* associated parent */
64 struct list_head wlist; /* entry in parent->watches list */
65 struct list_head rules; /* associated rules */
66};
67
56struct audit_field { 68struct audit_field {
57 u32 type; 69 u32 type;
58 u32 val; 70 u32 val;
@@ -69,7 +81,11 @@ struct audit_krule {
69 u32 mask[AUDIT_BITMASK_SIZE]; 81 u32 mask[AUDIT_BITMASK_SIZE];
70 u32 buflen; /* for data alloc on list rules */ 82 u32 buflen; /* for data alloc on list rules */
71 u32 field_count; 83 u32 field_count;
84 char *filterkey; /* ties events to rules */
72 struct audit_field *fields; 85 struct audit_field *fields;
86 struct audit_field *inode_f; /* quick access to an inode field */
87 struct audit_watch *watch; /* associated watch */
88 struct list_head rlist; /* entry in audit_watch.rules list */
73}; 89};
74 90
75struct audit_entry { 91struct audit_entry {
@@ -78,15 +94,53 @@ struct audit_entry {
78 struct audit_krule rule; 94 struct audit_krule rule;
79}; 95};
80 96
81
82extern int audit_pid; 97extern int audit_pid;
83extern int audit_comparator(const u32 left, const u32 op, const u32 right);
84 98
99#define AUDIT_INODE_BUCKETS 32
100extern struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS];
101
102static inline int audit_hash_ino(u32 ino)
103{
104 return (ino & (AUDIT_INODE_BUCKETS-1));
105}
106
107extern int audit_comparator(const u32 left, const u32 op, const u32 right);
108extern int audit_compare_dname_path(const char *dname, const char *path,
109 int *dirlen);
110extern struct sk_buff * audit_make_reply(int pid, int seq, int type,
111 int done, int multi,
112 void *payload, int size);
85extern void audit_send_reply(int pid, int seq, int type, 113extern void audit_send_reply(int pid, int seq, int type,
86 int done, int multi, 114 int done, int multi,
87 void *payload, int size); 115 void *payload, int size);
88extern void audit_log_lost(const char *message); 116extern void audit_log_lost(const char *message);
89extern void audit_panic(const char *message); 117extern void audit_panic(const char *message);
90extern struct mutex audit_netlink_mutex;
91 118
119struct audit_netlink_list {
120 int pid;
121 struct sk_buff_head q;
122};
123
124int audit_send_list(void *);
125
126struct inotify_watch;
127extern void audit_free_parent(struct inotify_watch *);
128extern void audit_handle_ievent(struct inotify_watch *, u32, u32, u32,
129 const char *, struct inode *);
92extern int selinux_audit_rule_update(void); 130extern int selinux_audit_rule_update(void);
131
132#ifdef CONFIG_AUDITSYSCALL
133extern void __audit_signal_info(int sig, struct task_struct *t);
134static inline void audit_signal_info(int sig, struct task_struct *t)
135{
136 if (unlikely(audit_pid && t->tgid == audit_pid))
137 __audit_signal_info(sig, t);
138}
139extern enum audit_state audit_filter_inodes(struct task_struct *,
140 struct audit_context *);
141extern void audit_set_auditable(struct audit_context *);
142#else
143#define audit_signal_info(s,t)
144#define audit_filter_inodes(t,c) AUDIT_DISABLED
145#define audit_set_auditable(c)
146#endif
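
kernel/audit.h now exposes an inode-keyed hash of rule entries (audit_inode_hash[] plus audit_hash_ino()) and a struct audit_watch that records the dev/ino pair a watched path resolved to. A hedged sketch of a reader walking one bucket under RCU follows; the function name is hypothetical, and the real consumers are audit_filter_inodes() in auditsc.c and the watch code in auditfilter.c.

#include <linux/list.h>
#include <linux/rcupdate.h>
#include "audit.h"

/* Hypothetical lookup: does any watch-backed rule cover this dev/ino? */
static int example_inode_has_watch_rule(dev_t dev, unsigned long ino)
{
	struct audit_entry *e;
	int h = audit_hash_ino((u32)ino);
	int found = 0;

	rcu_read_lock();
	list_for_each_entry_rcu(e, &audit_inode_hash[h], list) {
		struct audit_watch *w = e->rule.watch;

		/* pure AUDIT_INODE rules have no watch; watch rules carry
		 * the dev/ino their path resolved to */
		if (w && w->dev == dev && w->ino == ino) {
			found = 1;
			break;
		}
	}
	rcu_read_unlock();
	return found;
}
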
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 7c134906d689..5b4e16276ca0 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -22,13 +22,59 @@
22#include <linux/kernel.h> 22#include <linux/kernel.h>
23#include <linux/audit.h> 23#include <linux/audit.h>
24#include <linux/kthread.h> 24#include <linux/kthread.h>
25#include <linux/mutex.h>
26#include <linux/fs.h>
27#include <linux/namei.h>
25#include <linux/netlink.h> 28#include <linux/netlink.h>
29#include <linux/sched.h>
30#include <linux/inotify.h>
26#include <linux/selinux.h> 31#include <linux/selinux.h>
27#include "audit.h" 32#include "audit.h"
28 33
29/* There are three lists of rules -- one to search at task creation 34/*
30 * time, one to search at syscall entry time, and another to search at 35 * Locking model:
31 * syscall exit time. */ 36 *
37 * audit_filter_mutex:
38 * Synchronizes writes and blocking reads of audit's filterlist
39 * data. Rcu is used to traverse the filterlist and access
40 * contents of structs audit_entry, audit_watch and opaque
41 * selinux rules during filtering. If modified, these structures
42 * must be copied and replace their counterparts in the filterlist.
43 * An audit_parent struct is not accessed during filtering, so may
44 * be written directly provided audit_filter_mutex is held.
45 */
46
47/*
48 * Reference counting:
49 *
50 * audit_parent: lifetime is from audit_init_parent() to receipt of an IN_IGNORED
51 * event. Each audit_watch holds a reference to its associated parent.
52 *
53 * audit_watch: if added to lists, lifetime is from audit_init_watch() to
54 * audit_remove_watch(). Additionally, an audit_watch may exist
55 * temporarily to assist in searching existing filter data. Each
56 * audit_krule holds a reference to its associated watch.
57 */
58
59struct audit_parent {
60 struct list_head ilist; /* entry in inotify registration list */
61 struct list_head watches; /* associated watches */
62 struct inotify_watch wdata; /* inotify watch data */
63 unsigned flags; /* status flags */
64};
65
66/*
67 * audit_parent status flags:
68 *
69 * AUDIT_PARENT_INVALID - set anytime rules/watches are auto-removed due to
70 * a filesystem event to ensure we're adding audit watches to a valid parent.
71 * Technically not needed for IN_DELETE_SELF or IN_UNMOUNT events, as we cannot
72 * receive them while we have nameidata, but must be used for IN_MOVE_SELF which
73 * we can receive while holding nameidata.
74 */
75#define AUDIT_PARENT_INVALID 0x001
76
77/* Audit filter lists, defined in <linux/audit.h> */
32struct list_head audit_filter_list[AUDIT_NR_FILTERS] = { 78struct list_head audit_filter_list[AUDIT_NR_FILTERS] = {
33 LIST_HEAD_INIT(audit_filter_list[0]), 79 LIST_HEAD_INIT(audit_filter_list[0]),
34 LIST_HEAD_INIT(audit_filter_list[1]), 80 LIST_HEAD_INIT(audit_filter_list[1]),
@@ -41,9 +87,53 @@ struct list_head audit_filter_list[AUDIT_NR_FILTERS] = {
41#endif 87#endif
42}; 88};
43 89
90static DEFINE_MUTEX(audit_filter_mutex);
91
92/* Inotify handle */
93extern struct inotify_handle *audit_ih;
94
95/* Inotify events we care about. */
96#define AUDIT_IN_WATCH IN_MOVE|IN_CREATE|IN_DELETE|IN_DELETE_SELF|IN_MOVE_SELF
97
98void audit_free_parent(struct inotify_watch *i_watch)
99{
100 struct audit_parent *parent;
101
102 parent = container_of(i_watch, struct audit_parent, wdata);
103 WARN_ON(!list_empty(&parent->watches));
104 kfree(parent);
105}
106
107static inline void audit_get_watch(struct audit_watch *watch)
108{
109 atomic_inc(&watch->count);
110}
111
112static void audit_put_watch(struct audit_watch *watch)
113{
114 if (atomic_dec_and_test(&watch->count)) {
115 WARN_ON(watch->parent);
116 WARN_ON(!list_empty(&watch->rules));
117 kfree(watch->path);
118 kfree(watch);
119 }
120}
121
122static void audit_remove_watch(struct audit_watch *watch)
123{
124 list_del(&watch->wlist);
125 put_inotify_watch(&watch->parent->wdata);
126 watch->parent = NULL;
127 audit_put_watch(watch); /* match initial get */
128}
129
44static inline void audit_free_rule(struct audit_entry *e) 130static inline void audit_free_rule(struct audit_entry *e)
45{ 131{
46 int i; 132 int i;
133
134 /* some rules don't have associated watches */
135 if (e->rule.watch)
136 audit_put_watch(e->rule.watch);
47 if (e->rule.fields) 137 if (e->rule.fields)
48 for (i = 0; i < e->rule.field_count; i++) { 138 for (i = 0; i < e->rule.field_count; i++) {
49 struct audit_field *f = &e->rule.fields[i]; 139 struct audit_field *f = &e->rule.fields[i];
@@ -51,6 +141,7 @@ static inline void audit_free_rule(struct audit_entry *e)
51 selinux_audit_rule_free(f->se_rule); 141 selinux_audit_rule_free(f->se_rule);
52 } 142 }
53 kfree(e->rule.fields); 143 kfree(e->rule.fields);
144 kfree(e->rule.filterkey);
54 kfree(e); 145 kfree(e);
55} 146}
56 147
@@ -60,6 +151,50 @@ static inline void audit_free_rule_rcu(struct rcu_head *head)
60 audit_free_rule(e); 151 audit_free_rule(e);
61} 152}
62 153
154/* Initialize a parent watch entry. */
155static struct audit_parent *audit_init_parent(struct nameidata *ndp)
156{
157 struct audit_parent *parent;
158 s32 wd;
159
160 parent = kzalloc(sizeof(*parent), GFP_KERNEL);
161 if (unlikely(!parent))
162 return ERR_PTR(-ENOMEM);
163
164 INIT_LIST_HEAD(&parent->watches);
165 parent->flags = 0;
166
167 inotify_init_watch(&parent->wdata);
168 /* grab a ref so inotify watch hangs around until we take audit_filter_mutex */
169 get_inotify_watch(&parent->wdata);
170 wd = inotify_add_watch(audit_ih, &parent->wdata, ndp->dentry->d_inode,
171 AUDIT_IN_WATCH);
172 if (wd < 0) {
173 audit_free_parent(&parent->wdata);
174 return ERR_PTR(wd);
175 }
176
177 return parent;
178}
179
180/* Initialize a watch entry. */
181static struct audit_watch *audit_init_watch(char *path)
182{
183 struct audit_watch *watch;
184
185 watch = kzalloc(sizeof(*watch), GFP_KERNEL);
186 if (unlikely(!watch))
187 return ERR_PTR(-ENOMEM);
188
189 INIT_LIST_HEAD(&watch->rules);
190 atomic_set(&watch->count, 1);
191 watch->path = path;
192 watch->dev = (dev_t)-1;
193 watch->ino = (unsigned long)-1;
194
195 return watch;
196}
197
63/* Initialize an audit filterlist entry. */ 198/* Initialize an audit filterlist entry. */
64static inline struct audit_entry *audit_init_entry(u32 field_count) 199static inline struct audit_entry *audit_init_entry(u32 field_count)
65{ 200{
@@ -107,6 +242,66 @@ static char *audit_unpack_string(void **bufp, size_t *remain, size_t len)
107 return str; 242 return str;
108} 243}
109 244
245/* Translate an inode field to kernel respresentation. */
246static inline int audit_to_inode(struct audit_krule *krule,
247 struct audit_field *f)
248{
249 if (krule->listnr != AUDIT_FILTER_EXIT ||
250 krule->watch || krule->inode_f)
251 return -EINVAL;
252
253 krule->inode_f = f;
254 return 0;
255}
256
257/* Translate a watch string to kernel respresentation. */
258static int audit_to_watch(struct audit_krule *krule, char *path, int len,
259 u32 op)
260{
261 struct audit_watch *watch;
262
263 if (!audit_ih)
264 return -EOPNOTSUPP;
265
266 if (path[0] != '/' || path[len-1] == '/' ||
267 krule->listnr != AUDIT_FILTER_EXIT ||
268 op & ~AUDIT_EQUAL ||
269 krule->inode_f || krule->watch) /* 1 inode # per rule, for hash */
270 return -EINVAL;
271
272 watch = audit_init_watch(path);
273 if (unlikely(IS_ERR(watch)))
274 return PTR_ERR(watch);
275
276 audit_get_watch(watch);
277 krule->watch = watch;
278
279 return 0;
280}
281
282static __u32 *classes[AUDIT_SYSCALL_CLASSES];
283
284int __init audit_register_class(int class, unsigned *list)
285{
286 __u32 *p = kzalloc(AUDIT_BITMASK_SIZE * sizeof(__u32), GFP_KERNEL);
287 if (!p)
288 return -ENOMEM;
289 while (*list != ~0U) {
290 unsigned n = *list++;
291 if (n >= AUDIT_BITMASK_SIZE * 32 - AUDIT_SYSCALL_CLASSES) {
292 kfree(p);
293 return -EINVAL;
294 }
295 p[AUDIT_WORD(n)] |= AUDIT_BIT(n);
296 }
297 if (class >= AUDIT_SYSCALL_CLASSES || classes[class]) {
298 kfree(p);
299 return -EINVAL;
300 }
301 classes[class] = p;
302 return 0;
303}
304
110/* Common user-space to kernel rule translation. */ 305/* Common user-space to kernel rule translation. */
111static inline struct audit_entry *audit_to_entry_common(struct audit_rule *rule) 306static inline struct audit_entry *audit_to_entry_common(struct audit_rule *rule)
112{ 307{
@@ -128,8 +323,11 @@ static inline struct audit_entry *audit_to_entry_common(struct audit_rule *rule)
128#endif 323#endif
129 ; 324 ;
130 } 325 }
131 if (rule->action != AUDIT_NEVER && rule->action != AUDIT_POSSIBLE && 326 if (unlikely(rule->action == AUDIT_POSSIBLE)) {
132 rule->action != AUDIT_ALWAYS) 327 printk(KERN_ERR "AUDIT_POSSIBLE is deprecated\n");
328 goto exit_err;
329 }
330 if (rule->action != AUDIT_NEVER && rule->action != AUDIT_ALWAYS)
133 goto exit_err; 331 goto exit_err;
134 if (rule->field_count > AUDIT_MAX_FIELDS) 332 if (rule->field_count > AUDIT_MAX_FIELDS)
135 goto exit_err; 333 goto exit_err;
@@ -147,6 +345,22 @@ static inline struct audit_entry *audit_to_entry_common(struct audit_rule *rule)
147 for (i = 0; i < AUDIT_BITMASK_SIZE; i++) 345 for (i = 0; i < AUDIT_BITMASK_SIZE; i++)
148 entry->rule.mask[i] = rule->mask[i]; 346 entry->rule.mask[i] = rule->mask[i];
149 347
348 for (i = 0; i < AUDIT_SYSCALL_CLASSES; i++) {
349 int bit = AUDIT_BITMASK_SIZE * 32 - i - 1;
350 __u32 *p = &entry->rule.mask[AUDIT_WORD(bit)];
351 __u32 *class;
352
353 if (!(*p & AUDIT_BIT(bit)))
354 continue;
355 *p &= ~AUDIT_BIT(bit);
356 class = classes[i];
357 if (class) {
358 int j;
359 for (j = 0; j < AUDIT_BITMASK_SIZE; j++)
360 entry->rule.mask[j] |= class[j];
361 }
362 }
363
150 return entry; 364 return entry;
151 365
152exit_err: 366exit_err:
@@ -158,6 +372,7 @@ exit_err:
158static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule) 372static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule)
159{ 373{
160 struct audit_entry *entry; 374 struct audit_entry *entry;
375 struct audit_field *f;
161 int err = 0; 376 int err = 0;
162 int i; 377 int i;
163 378
@@ -172,14 +387,37 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule)
172 f->type = rule->fields[i] & ~(AUDIT_NEGATE|AUDIT_OPERATORS); 387 f->type = rule->fields[i] & ~(AUDIT_NEGATE|AUDIT_OPERATORS);
173 f->val = rule->values[i]; 388 f->val = rule->values[i];
174 389
175 if (f->type & AUDIT_UNUSED_BITS || 390 err = -EINVAL;
176 f->type == AUDIT_SE_USER || 391 switch(f->type) {
177 f->type == AUDIT_SE_ROLE || 392 default:
178 f->type == AUDIT_SE_TYPE ||
179 f->type == AUDIT_SE_SEN ||
180 f->type == AUDIT_SE_CLR) {
181 err = -EINVAL;
182 goto exit_free; 393 goto exit_free;
394 case AUDIT_PID:
395 case AUDIT_UID:
396 case AUDIT_EUID:
397 case AUDIT_SUID:
398 case AUDIT_FSUID:
399 case AUDIT_GID:
400 case AUDIT_EGID:
401 case AUDIT_SGID:
402 case AUDIT_FSGID:
403 case AUDIT_LOGINUID:
404 case AUDIT_PERS:
405 case AUDIT_ARCH:
406 case AUDIT_MSGTYPE:
407 case AUDIT_DEVMAJOR:
408 case AUDIT_DEVMINOR:
409 case AUDIT_EXIT:
410 case AUDIT_SUCCESS:
411 case AUDIT_ARG0:
412 case AUDIT_ARG1:
413 case AUDIT_ARG2:
414 case AUDIT_ARG3:
415 break;
416 case AUDIT_INODE:
417 err = audit_to_inode(&entry->rule, f);
418 if (err)
419 goto exit_free;
420 break;
183 } 421 }
184 422
185 entry->rule.vers_ops = (f->op & AUDIT_OPERATORS) ? 2 : 1; 423 entry->rule.vers_ops = (f->op & AUDIT_OPERATORS) ? 2 : 1;
@@ -196,6 +434,18 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule)
196 } 434 }
197 } 435 }
198 436
437 f = entry->rule.inode_f;
438 if (f) {
439 switch(f->op) {
440 case AUDIT_NOT_EQUAL:
441 entry->rule.inode_f = NULL;
442 case AUDIT_EQUAL:
443 break;
444 default:
445 goto exit_free;
446 }
447 }
448
199exit_nofree: 449exit_nofree:
200 return entry; 450 return entry;
201 451
@@ -210,6 +460,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
210{ 460{
211 int err = 0; 461 int err = 0;
212 struct audit_entry *entry; 462 struct audit_entry *entry;
463 struct audit_field *f;
213 void *bufp; 464 void *bufp;
214 size_t remain = datasz - sizeof(struct audit_rule_data); 465 size_t remain = datasz - sizeof(struct audit_rule_data);
215 int i; 466 int i;
@@ -235,11 +486,39 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
235 f->se_str = NULL; 486 f->se_str = NULL;
236 f->se_rule = NULL; 487 f->se_rule = NULL;
237 switch(f->type) { 488 switch(f->type) {
238 case AUDIT_SE_USER: 489 case AUDIT_PID:
239 case AUDIT_SE_ROLE: 490 case AUDIT_UID:
240 case AUDIT_SE_TYPE: 491 case AUDIT_EUID:
241 case AUDIT_SE_SEN: 492 case AUDIT_SUID:
242 case AUDIT_SE_CLR: 493 case AUDIT_FSUID:
494 case AUDIT_GID:
495 case AUDIT_EGID:
496 case AUDIT_SGID:
497 case AUDIT_FSGID:
498 case AUDIT_LOGINUID:
499 case AUDIT_PERS:
500 case AUDIT_ARCH:
501 case AUDIT_MSGTYPE:
502 case AUDIT_PPID:
503 case AUDIT_DEVMAJOR:
504 case AUDIT_DEVMINOR:
505 case AUDIT_EXIT:
506 case AUDIT_SUCCESS:
507 case AUDIT_ARG0:
508 case AUDIT_ARG1:
509 case AUDIT_ARG2:
510 case AUDIT_ARG3:
511 break;
512 case AUDIT_SUBJ_USER:
513 case AUDIT_SUBJ_ROLE:
514 case AUDIT_SUBJ_TYPE:
515 case AUDIT_SUBJ_SEN:
516 case AUDIT_SUBJ_CLR:
517 case AUDIT_OBJ_USER:
518 case AUDIT_OBJ_ROLE:
519 case AUDIT_OBJ_TYPE:
520 case AUDIT_OBJ_LEV_LOW:
521 case AUDIT_OBJ_LEV_HIGH:
243 str = audit_unpack_string(&bufp, &remain, f->val); 522 str = audit_unpack_string(&bufp, &remain, f->val);
244 if (IS_ERR(str)) 523 if (IS_ERR(str))
245 goto exit_free; 524 goto exit_free;
@@ -260,6 +539,47 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
260 } else 539 } else
261 f->se_str = str; 540 f->se_str = str;
262 break; 541 break;
542 case AUDIT_WATCH:
543 str = audit_unpack_string(&bufp, &remain, f->val);
544 if (IS_ERR(str))
545 goto exit_free;
546 entry->rule.buflen += f->val;
547
548 err = audit_to_watch(&entry->rule, str, f->val, f->op);
549 if (err) {
550 kfree(str);
551 goto exit_free;
552 }
553 break;
554 case AUDIT_INODE:
555 err = audit_to_inode(&entry->rule, f);
556 if (err)
557 goto exit_free;
558 break;
559 case AUDIT_FILTERKEY:
560 err = -EINVAL;
561 if (entry->rule.filterkey || f->val > AUDIT_MAX_KEY_LEN)
562 goto exit_free;
563 str = audit_unpack_string(&bufp, &remain, f->val);
564 if (IS_ERR(str))
565 goto exit_free;
566 entry->rule.buflen += f->val;
567 entry->rule.filterkey = str;
568 break;
569 default:
570 goto exit_free;
571 }
572 }
573
574 f = entry->rule.inode_f;
575 if (f) {
576 switch(f->op) {
577 case AUDIT_NOT_EQUAL:
578 entry->rule.inode_f = NULL;
579 case AUDIT_EQUAL:
580 break;
581 default:
582 goto exit_free;
263 } 583 }
264 } 584 }
265 585
@@ -291,7 +611,7 @@ static struct audit_rule *audit_krule_to_rule(struct audit_krule *krule)
291 611
292 rule = kmalloc(sizeof(*rule), GFP_KERNEL); 612 rule = kmalloc(sizeof(*rule), GFP_KERNEL);
293 if (unlikely(!rule)) 613 if (unlikely(!rule))
294 return ERR_PTR(-ENOMEM); 614 return NULL;
295 memset(rule, 0, sizeof(*rule)); 615 memset(rule, 0, sizeof(*rule));
296 616
297 rule->flags = krule->flags | krule->listnr; 617 rule->flags = krule->flags | krule->listnr;
@@ -322,7 +642,7 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule)
322 642
323 data = kmalloc(sizeof(*data) + krule->buflen, GFP_KERNEL); 643 data = kmalloc(sizeof(*data) + krule->buflen, GFP_KERNEL);
324 if (unlikely(!data)) 644 if (unlikely(!data))
325 return ERR_PTR(-ENOMEM); 645 return NULL;
326 memset(data, 0, sizeof(*data)); 646 memset(data, 0, sizeof(*data));
327 647
328 data->flags = krule->flags | krule->listnr; 648 data->flags = krule->flags | krule->listnr;
@@ -335,14 +655,27 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule)
335 data->fields[i] = f->type; 655 data->fields[i] = f->type;
336 data->fieldflags[i] = f->op; 656 data->fieldflags[i] = f->op;
337 switch(f->type) { 657 switch(f->type) {
338 case AUDIT_SE_USER: 658 case AUDIT_SUBJ_USER:
339 case AUDIT_SE_ROLE: 659 case AUDIT_SUBJ_ROLE:
340 case AUDIT_SE_TYPE: 660 case AUDIT_SUBJ_TYPE:
341 case AUDIT_SE_SEN: 661 case AUDIT_SUBJ_SEN:
342 case AUDIT_SE_CLR: 662 case AUDIT_SUBJ_CLR:
663 case AUDIT_OBJ_USER:
664 case AUDIT_OBJ_ROLE:
665 case AUDIT_OBJ_TYPE:
666 case AUDIT_OBJ_LEV_LOW:
667 case AUDIT_OBJ_LEV_HIGH:
343 data->buflen += data->values[i] = 668 data->buflen += data->values[i] =
344 audit_pack_string(&bufp, f->se_str); 669 audit_pack_string(&bufp, f->se_str);
345 break; 670 break;
671 case AUDIT_WATCH:
672 data->buflen += data->values[i] =
673 audit_pack_string(&bufp, krule->watch->path);
674 break;
675 case AUDIT_FILTERKEY:
676 data->buflen += data->values[i] =
677 audit_pack_string(&bufp, krule->filterkey);
678 break;
346 default: 679 default:
347 data->values[i] = f->val; 680 data->values[i] = f->val;
348 } 681 }
@@ -370,14 +703,28 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b)
370 return 1; 703 return 1;
371 704
372 switch(a->fields[i].type) { 705 switch(a->fields[i].type) {
373 case AUDIT_SE_USER: 706 case AUDIT_SUBJ_USER:
374 case AUDIT_SE_ROLE: 707 case AUDIT_SUBJ_ROLE:
375 case AUDIT_SE_TYPE: 708 case AUDIT_SUBJ_TYPE:
376 case AUDIT_SE_SEN: 709 case AUDIT_SUBJ_SEN:
377 case AUDIT_SE_CLR: 710 case AUDIT_SUBJ_CLR:
711 case AUDIT_OBJ_USER:
712 case AUDIT_OBJ_ROLE:
713 case AUDIT_OBJ_TYPE:
714 case AUDIT_OBJ_LEV_LOW:
715 case AUDIT_OBJ_LEV_HIGH:
378 if (strcmp(a->fields[i].se_str, b->fields[i].se_str)) 716 if (strcmp(a->fields[i].se_str, b->fields[i].se_str))
379 return 1; 717 return 1;
380 break; 718 break;
719 case AUDIT_WATCH:
720 if (strcmp(a->watch->path, b->watch->path))
721 return 1;
722 break;
723 case AUDIT_FILTERKEY:
724 /* both filterkeys exist based on above type compare */
725 if (strcmp(a->filterkey, b->filterkey))
726 return 1;
727 break;
381 default: 728 default:
382 if (a->fields[i].val != b->fields[i].val) 729 if (a->fields[i].val != b->fields[i].val)
383 return 1; 730 return 1;
@@ -391,6 +738,32 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b)
391 return 0; 738 return 0;
392} 739}
393 740
741/* Duplicate the given audit watch. The new watch's rules list is initialized
742 * to an empty list and wlist is undefined. */
743static struct audit_watch *audit_dupe_watch(struct audit_watch *old)
744{
745 char *path;
746 struct audit_watch *new;
747
748 path = kstrdup(old->path, GFP_KERNEL);
749 if (unlikely(!path))
750 return ERR_PTR(-ENOMEM);
751
752 new = audit_init_watch(path);
753 if (unlikely(IS_ERR(new))) {
754 kfree(path);
755 goto out;
756 }
757
758 new->dev = old->dev;
759 new->ino = old->ino;
760 get_inotify_watch(&old->parent->wdata);
761 new->parent = old->parent;
762
763out:
764 return new;
765}
766
394/* Duplicate selinux field information. The se_rule is opaque, so must be 767/* Duplicate selinux field information. The se_rule is opaque, so must be
395 * re-initialized. */ 768 * re-initialized. */
396static inline int audit_dupe_selinux_field(struct audit_field *df, 769static inline int audit_dupe_selinux_field(struct audit_field *df,
@@ -422,12 +795,16 @@ static inline int audit_dupe_selinux_field(struct audit_field *df,
422/* Duplicate an audit rule. This will be a deep copy with the exception 795/* Duplicate an audit rule. This will be a deep copy with the exception
423 * of the watch - that pointer is carried over. The selinux specific fields 796 * of the watch - that pointer is carried over. The selinux specific fields
424 * will be updated in the copy. The point is to be able to replace the old 797 * will be updated in the copy. The point is to be able to replace the old
425 * rule with the new rule in the filterlist, then free the old rule. */ 798 * rule with the new rule in the filterlist, then free the old rule.
426static struct audit_entry *audit_dupe_rule(struct audit_krule *old) 799 * The rlist element is undefined; list manipulations are handled apart from
800 * the initial copy. */
801static struct audit_entry *audit_dupe_rule(struct audit_krule *old,
802 struct audit_watch *watch)
427{ 803{
428 u32 fcount = old->field_count; 804 u32 fcount = old->field_count;
429 struct audit_entry *entry; 805 struct audit_entry *entry;
430 struct audit_krule *new; 806 struct audit_krule *new;
807 char *fk;
431 int i, err = 0; 808 int i, err = 0;
432 809
433 entry = audit_init_entry(fcount); 810 entry = audit_init_entry(fcount);
@@ -442,6 +819,8 @@ static struct audit_entry *audit_dupe_rule(struct audit_krule *old)
442 for (i = 0; i < AUDIT_BITMASK_SIZE; i++) 819 for (i = 0; i < AUDIT_BITMASK_SIZE; i++)
443 new->mask[i] = old->mask[i]; 820 new->mask[i] = old->mask[i];
444 new->buflen = old->buflen; 821 new->buflen = old->buflen;
822 new->inode_f = old->inode_f;
823 new->watch = NULL;
445 new->field_count = old->field_count; 824 new->field_count = old->field_count;
446 memcpy(new->fields, old->fields, sizeof(struct audit_field) * fcount); 825 memcpy(new->fields, old->fields, sizeof(struct audit_field) * fcount);
447 826
@@ -449,13 +828,25 @@ static struct audit_entry *audit_dupe_rule(struct audit_krule *old)
449 * the originals will all be freed when the old rule is freed. */ 828 * the originals will all be freed when the old rule is freed. */
450 for (i = 0; i < fcount; i++) { 829 for (i = 0; i < fcount; i++) {
451 switch (new->fields[i].type) { 830 switch (new->fields[i].type) {
452 case AUDIT_SE_USER: 831 case AUDIT_SUBJ_USER:
453 case AUDIT_SE_ROLE: 832 case AUDIT_SUBJ_ROLE:
454 case AUDIT_SE_TYPE: 833 case AUDIT_SUBJ_TYPE:
455 case AUDIT_SE_SEN: 834 case AUDIT_SUBJ_SEN:
456 case AUDIT_SE_CLR: 835 case AUDIT_SUBJ_CLR:
836 case AUDIT_OBJ_USER:
837 case AUDIT_OBJ_ROLE:
838 case AUDIT_OBJ_TYPE:
839 case AUDIT_OBJ_LEV_LOW:
840 case AUDIT_OBJ_LEV_HIGH:
457 err = audit_dupe_selinux_field(&new->fields[i], 841 err = audit_dupe_selinux_field(&new->fields[i],
458 &old->fields[i]); 842 &old->fields[i]);
843 break;
844 case AUDIT_FILTERKEY:
845 fk = kstrdup(old->filterkey, GFP_KERNEL);
846 if (unlikely(!fk))
847 err = -ENOMEM;
848 else
849 new->filterkey = fk;
459 } 850 }
460 if (err) { 851 if (err) {
461 audit_free_rule(entry); 852 audit_free_rule(entry);
@@ -463,68 +854,409 @@ static struct audit_entry *audit_dupe_rule(struct audit_krule *old)
463 } 854 }
464 } 855 }
465 856
857 if (watch) {
858 audit_get_watch(watch);
859 new->watch = watch;
860 }
861
466 return entry; 862 return entry;
467} 863}
468 864
469/* Add rule to given filterlist if not a duplicate. Protected by 865/* Update inode info in audit rules based on filesystem event. */
470 * audit_netlink_mutex. */ 866static void audit_update_watch(struct audit_parent *parent,
867 const char *dname, dev_t dev,
868 unsigned long ino, unsigned invalidating)
869{
870 struct audit_watch *owatch, *nwatch, *nextw;
871 struct audit_krule *r, *nextr;
872 struct audit_entry *oentry, *nentry;
873 struct audit_buffer *ab;
874
875 mutex_lock(&audit_filter_mutex);
876 list_for_each_entry_safe(owatch, nextw, &parent->watches, wlist) {
877 if (audit_compare_dname_path(dname, owatch->path, NULL))
878 continue;
879
880 /* If the update involves invalidating rules, do the inode-based
881 * filtering now, so we don't omit records. */
882 if (invalidating &&
883 audit_filter_inodes(current, current->audit_context) == AUDIT_RECORD_CONTEXT)
884 audit_set_auditable(current->audit_context);
885
886 nwatch = audit_dupe_watch(owatch);
887 if (unlikely(IS_ERR(nwatch))) {
888 mutex_unlock(&audit_filter_mutex);
889 audit_panic("error updating watch, skipping");
890 return;
891 }
892 nwatch->dev = dev;
893 nwatch->ino = ino;
894
895 list_for_each_entry_safe(r, nextr, &owatch->rules, rlist) {
896
897 oentry = container_of(r, struct audit_entry, rule);
898 list_del(&oentry->rule.rlist);
899 list_del_rcu(&oentry->list);
900
901 nentry = audit_dupe_rule(&oentry->rule, nwatch);
902 if (unlikely(IS_ERR(nentry)))
903 audit_panic("error updating watch, removing");
904 else {
905 int h = audit_hash_ino((u32)ino);
906 list_add(&nentry->rule.rlist, &nwatch->rules);
907 list_add_rcu(&nentry->list, &audit_inode_hash[h]);
908 }
909
910 call_rcu(&oentry->rcu, audit_free_rule_rcu);
911 }
912
913 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
914 audit_log_format(ab, "audit updated rules specifying watch=");
915 audit_log_untrustedstring(ab, owatch->path);
916 audit_log_format(ab, " with dev=%u ino=%lu\n", dev, ino);
917 audit_log_end(ab);
918
919 audit_remove_watch(owatch);
920 goto add_watch_to_parent; /* event applies to a single watch */
921 }
922 mutex_unlock(&audit_filter_mutex);
923 return;
924
925add_watch_to_parent:
926 list_add(&nwatch->wlist, &parent->watches);
927 mutex_unlock(&audit_filter_mutex);
928 return;
929}
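The delete/replace sequence above relies on RCU: old entries are unlinked with list_del_rcu(), replacements are added with list_add_rcu(), and the old memory is reclaimed by call_rcu() only after a grace period. A minimal sketch of the lockless reader side this protects, mirroring audit_filter_syscall() and audit_filter_inodes() later in this patch:

        rcu_read_lock();
        list_for_each_entry_rcu(e, &audit_inode_hash[h], list) {
                /* e is either an old or a replacement entry, never freed
                 * memory: audit_free_rule_rcu() runs only after all current
                 * RCU read-side critical sections have completed. */
        }
        rcu_read_unlock();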
930
931/* Remove all watches & rules associated with a parent that is going away. */
932static void audit_remove_parent_watches(struct audit_parent *parent)
933{
934 struct audit_watch *w, *nextw;
935 struct audit_krule *r, *nextr;
936 struct audit_entry *e;
937
938 mutex_lock(&audit_filter_mutex);
939 parent->flags |= AUDIT_PARENT_INVALID;
940 list_for_each_entry_safe(w, nextw, &parent->watches, wlist) {
941 list_for_each_entry_safe(r, nextr, &w->rules, rlist) {
942 e = container_of(r, struct audit_entry, rule);
943 list_del(&r->rlist);
944 list_del_rcu(&e->list);
945 call_rcu(&e->rcu, audit_free_rule_rcu);
946
947 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
948 "audit implicitly removed rule from list=%d\n",
949 AUDIT_FILTER_EXIT);
950 }
951 audit_remove_watch(w);
952 }
953 mutex_unlock(&audit_filter_mutex);
954}
955
956/* Unregister inotify watches for parents on in_list.
957 * Generates an IN_IGNORED event. */
958static void audit_inotify_unregister(struct list_head *in_list)
959{
960 struct audit_parent *p, *n;
961
962 list_for_each_entry_safe(p, n, in_list, ilist) {
963 list_del(&p->ilist);
964 inotify_rm_watch(audit_ih, &p->wdata);
965 /* the put matching the get in audit_do_del_rule() */
966 put_inotify_watch(&p->wdata);
967 }
968}
969
970/* Find an existing audit rule.
971 * Caller must hold audit_filter_mutex to prevent stale rule data. */
972static struct audit_entry *audit_find_rule(struct audit_entry *entry,
973 struct list_head *list)
974{
975 struct audit_entry *e, *found = NULL;
976 int h;
977
978 if (entry->rule.watch) {
979 /* we don't know the inode number, so must walk entire hash */
980 for (h = 0; h < AUDIT_INODE_BUCKETS; h++) {
981 list = &audit_inode_hash[h];
982 list_for_each_entry(e, list, list)
983 if (!audit_compare_rule(&entry->rule, &e->rule)) {
984 found = e;
985 goto out;
986 }
987 }
988 goto out;
989 }
990
991 list_for_each_entry(e, list, list)
992 if (!audit_compare_rule(&entry->rule, &e->rule)) {
993 found = e;
994 goto out;
995 }
996
997out:
998 return found;
999}
1000
1001/* Get path information necessary for adding watches. */
1002static int audit_get_nd(char *path, struct nameidata **ndp,
1003 struct nameidata **ndw)
1004{
1005 struct nameidata *ndparent, *ndwatch;
1006 int err;
1007
1008 ndparent = kmalloc(sizeof(*ndparent), GFP_KERNEL);
1009 if (unlikely(!ndparent))
1010 return -ENOMEM;
1011
1012 ndwatch = kmalloc(sizeof(*ndwatch), GFP_KERNEL);
1013 if (unlikely(!ndwatch)) {
1014 kfree(ndparent);
1015 return -ENOMEM;
1016 }
1017
1018 err = path_lookup(path, LOOKUP_PARENT, ndparent);
1019 if (err) {
1020 kfree(ndparent);
1021 kfree(ndwatch);
1022 return err;
1023 }
1024
1025 err = path_lookup(path, 0, ndwatch);
1026 if (err) {
1027 kfree(ndwatch);
1028 ndwatch = NULL;
1029 }
1030
1031 *ndp = ndparent;
1032 *ndw = ndwatch;
1033
1034 return 0;
1035}
1036
1037/* Release resources used for watch path information. */
1038static void audit_put_nd(struct nameidata *ndp, struct nameidata *ndw)
1039{
1040 if (ndp) {
1041 path_release(ndp);
1042 kfree(ndp);
1043 }
1044 if (ndw) {
1045 path_release(ndw);
1046 kfree(ndw);
1047 }
1048}
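A usage sketch for the two helpers above, assuming a rule watching /etc/passwd (the path is purely illustrative). The LOOKUP_PARENT lookup resolves the directory that will carry the inotify watch; the second lookup resolves the watched file itself and is allowed to fail, since the file need not exist yet:

        struct nameidata *ndp, *ndw;
        int err;

        err = audit_get_nd(watch->path, &ndp, &ndw);    /* e.g. "/etc/passwd" */
        if (err)
                return err;
        /* ndp refers to "/etc"; ndw refers to "/etc/passwd", or is NULL if the
         * file does not exist, in which case watch->dev/ino stay at their
         * unset (-1) values until an inotify event fills them in. */
        audit_put_nd(ndp, ndw);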
1049
1050/* Associate the given rule with an existing parent inotify_watch.
1051 * Caller must hold audit_filter_mutex. */
1052static void audit_add_to_parent(struct audit_krule *krule,
1053 struct audit_parent *parent)
1054{
1055 struct audit_watch *w, *watch = krule->watch;
1056 int watch_found = 0;
1057
1058 list_for_each_entry(w, &parent->watches, wlist) {
1059 if (strcmp(watch->path, w->path))
1060 continue;
1061
1062 watch_found = 1;
1063
1064 /* put krule's and initial refs to temporary watch */
1065 audit_put_watch(watch);
1066 audit_put_watch(watch);
1067
1068 audit_get_watch(w);
1069 krule->watch = watch = w;
1070 break;
1071 }
1072
1073 if (!watch_found) {
1074 get_inotify_watch(&parent->wdata);
1075 watch->parent = parent;
1076
1077 list_add(&watch->wlist, &parent->watches);
1078 }
1079 list_add(&krule->rlist, &watch->rules);
1080}
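The two audit_put_watch() calls above are easiest to follow as a reference-count trace. Assuming the temporary watch arrives holding two references (the rule's reference plus the initial one taken when the rule was parsed), the duplicate-path bookkeeping works out as:

        /* temporary watch:   2 -> 1 -> 0     (freed)
         * existing watch w:  n -> n + 1      (new reference now held by krule)
         * parent:            unchanged       (w already holds its parent ref) */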
1081
1082/* Find a matching watch entry, or add this one.
1083 * Caller must hold audit_filter_mutex. */
1084static int audit_add_watch(struct audit_krule *krule, struct nameidata *ndp,
1085 struct nameidata *ndw)
1086{
1087 struct audit_watch *watch = krule->watch;
1088 struct inotify_watch *i_watch;
1089 struct audit_parent *parent;
1090 int ret = 0;
1091
1092 /* update watch filter fields */
1093 if (ndw) {
1094 watch->dev = ndw->dentry->d_inode->i_sb->s_dev;
1095 watch->ino = ndw->dentry->d_inode->i_ino;
1096 }
1097
1098 /* The audit_filter_mutex must not be held during inotify calls because
1099 * we hold it during inotify event callback processing. If an existing
1100 * inotify watch is found, inotify_find_watch() grabs a reference before
1101 * returning.
1102 */
1103 mutex_unlock(&audit_filter_mutex);
1104
1105 if (inotify_find_watch(audit_ih, ndp->dentry->d_inode, &i_watch) < 0) {
1106 parent = audit_init_parent(ndp);
1107 if (IS_ERR(parent)) {
1108 /* caller expects mutex locked */
1109 mutex_lock(&audit_filter_mutex);
1110 return PTR_ERR(parent);
1111 }
1112 } else
1113 parent = container_of(i_watch, struct audit_parent, wdata);
1114
1115 mutex_lock(&audit_filter_mutex);
1116
1117 /* parent was moved before we took audit_filter_mutex */
1118 if (parent->flags & AUDIT_PARENT_INVALID)
1119 ret = -ENOENT;
1120 else
1121 audit_add_to_parent(krule, parent);
1122
1123 /* match get in audit_init_parent or inotify_find_watch */
1124 put_inotify_watch(&parent->wdata);
1125 return ret;
1126}
1127
1128/* Add rule to given filterlist if not a duplicate. */
471static inline int audit_add_rule(struct audit_entry *entry, 1129static inline int audit_add_rule(struct audit_entry *entry,
472 struct list_head *list) 1130 struct list_head *list)
473{ 1131{
474 struct audit_entry *e; 1132 struct audit_entry *e;
1133 struct audit_field *inode_f = entry->rule.inode_f;
1134 struct audit_watch *watch = entry->rule.watch;
1135 struct nameidata *ndp, *ndw;
1136 int h, err, putnd_needed = 0;
1137
1138 if (inode_f) {
1139 h = audit_hash_ino(inode_f->val);
1140 list = &audit_inode_hash[h];
1141 }
475 1142
476 /* Do not use the _rcu iterator here, since this is the only 1143 mutex_lock(&audit_filter_mutex);
477 * addition routine. */ 1144 e = audit_find_rule(entry, list);
478 list_for_each_entry(e, list, list) { 1145 mutex_unlock(&audit_filter_mutex);
479 if (!audit_compare_rule(&entry->rule, &e->rule)) 1146 if (e) {
480 return -EEXIST; 1147 err = -EEXIST;
1148 goto error;
1149 }
1150
1151 /* Avoid calling path_lookup under audit_filter_mutex. */
1152 if (watch) {
1153 err = audit_get_nd(watch->path, &ndp, &ndw);
1154 if (err)
1155 goto error;
1156 putnd_needed = 1;
1157 }
1158
1159 mutex_lock(&audit_filter_mutex);
1160 if (watch) {
1161 /* audit_filter_mutex is dropped and re-taken during this call */
1162 err = audit_add_watch(&entry->rule, ndp, ndw);
1163 if (err) {
1164 mutex_unlock(&audit_filter_mutex);
1165 goto error;
1166 }
1167 h = audit_hash_ino((u32)watch->ino);
1168 list = &audit_inode_hash[h];
481 } 1169 }
482 1170
483 if (entry->rule.flags & AUDIT_FILTER_PREPEND) { 1171 if (entry->rule.flags & AUDIT_FILTER_PREPEND) {
484 list_add_rcu(&entry->list, list); 1172 list_add_rcu(&entry->list, list);
1173 entry->rule.flags &= ~AUDIT_FILTER_PREPEND;
485 } else { 1174 } else {
486 list_add_tail_rcu(&entry->list, list); 1175 list_add_tail_rcu(&entry->list, list);
487 } 1176 }
1177 mutex_unlock(&audit_filter_mutex);
488 1178
489 return 0; 1179 if (putnd_needed)
1180 audit_put_nd(ndp, ndw);
1181
1182 return 0;
1183
1184error:
1185 if (putnd_needed)
1186 audit_put_nd(ndp, ndw);
1187 if (watch)
1188 audit_put_watch(watch); /* tmp watch, matches initial get */
1189 return err;
490} 1190}
491 1191
492/* Remove an existing rule from filterlist. Protected by 1192/* Remove an existing rule from filterlist. */
493 * audit_netlink_mutex. */
494static inline int audit_del_rule(struct audit_entry *entry, 1193static inline int audit_del_rule(struct audit_entry *entry,
495 struct list_head *list) 1194 struct list_head *list)
496{ 1195{
497 struct audit_entry *e; 1196 struct audit_entry *e;
1197 struct audit_field *inode_f = entry->rule.inode_f;
1198 struct audit_watch *watch, *tmp_watch = entry->rule.watch;
1199 LIST_HEAD(inotify_list);
1200 int h, ret = 0;
1201
1202 if (inode_f) {
1203 h = audit_hash_ino(inode_f->val);
1204 list = &audit_inode_hash[h];
1205 }
498 1206
499 /* Do not use the _rcu iterator here, since this is the only 1207 mutex_lock(&audit_filter_mutex);
500 * deletion routine. */ 1208 e = audit_find_rule(entry, list);
501 list_for_each_entry(e, list, list) { 1209 if (!e) {
502 if (!audit_compare_rule(&entry->rule, &e->rule)) { 1210 mutex_unlock(&audit_filter_mutex);
503 list_del_rcu(&e->list); 1211 ret = -ENOENT;
504 call_rcu(&e->rcu, audit_free_rule_rcu); 1212 goto out;
505 return 0; 1213 }
1214
1215 watch = e->rule.watch;
1216 if (watch) {
1217 struct audit_parent *parent = watch->parent;
1218
1219 list_del(&e->rule.rlist);
1220
1221 if (list_empty(&watch->rules)) {
1222 audit_remove_watch(watch);
1223
1224 if (list_empty(&parent->watches)) {
1225 /* Put parent on the inotify un-registration
1226 * list. Grab a reference before releasing
1227 * audit_filter_mutex, to be released in
1228 * audit_inotify_unregister(). */
1229 list_add(&parent->ilist, &inotify_list);
1230 get_inotify_watch(&parent->wdata);
1231 }
506 } 1232 }
507 } 1233 }
508 return -ENOENT; /* No matching rule */ 1234
1235 list_del_rcu(&e->list);
1236 call_rcu(&e->rcu, audit_free_rule_rcu);
1237
1238 mutex_unlock(&audit_filter_mutex);
1239
1240 if (!list_empty(&inotify_list))
1241 audit_inotify_unregister(&inotify_list);
1242
1243out:
1244 if (tmp_watch)
1245 audit_put_watch(tmp_watch); /* match initial get */
1246
1247 return ret;
509} 1248}
510 1249
511/* List rules using struct audit_rule. Exists for backward 1250/* List rules using struct audit_rule. Exists for backward
512 * compatibility with userspace. */ 1251 * compatibility with userspace. */
513static int audit_list(void *_dest) 1252static void audit_list(int pid, int seq, struct sk_buff_head *q)
514{ 1253{
515 int pid, seq; 1254 struct sk_buff *skb;
516 int *dest = _dest;
517 struct audit_entry *entry; 1255 struct audit_entry *entry;
518 int i; 1256 int i;
519 1257
520 pid = dest[0]; 1258 /* This is a blocking read, so use audit_filter_mutex instead of rcu
521 seq = dest[1]; 1259 * iterator to sync with list writers. */
522 kfree(dest);
523
524 mutex_lock(&audit_netlink_mutex);
525
526 /* The *_rcu iterators not needed here because we are
527 always called with audit_netlink_mutex held. */
528 for (i=0; i<AUDIT_NR_FILTERS; i++) { 1260 for (i=0; i<AUDIT_NR_FILTERS; i++) {
529 list_for_each_entry(entry, &audit_filter_list[i], list) { 1261 list_for_each_entry(entry, &audit_filter_list[i], list) {
530 struct audit_rule *rule; 1262 struct audit_rule *rule;
@@ -532,33 +1264,41 @@ static int audit_list(void *_dest)
532 rule = audit_krule_to_rule(&entry->rule); 1264 rule = audit_krule_to_rule(&entry->rule);
533 if (unlikely(!rule)) 1265 if (unlikely(!rule))
534 break; 1266 break;
535 audit_send_reply(pid, seq, AUDIT_LIST, 0, 1, 1267 skb = audit_make_reply(pid, seq, AUDIT_LIST, 0, 1,
536 rule, sizeof(*rule)); 1268 rule, sizeof(*rule));
1269 if (skb)
1270 skb_queue_tail(q, skb);
537 kfree(rule); 1271 kfree(rule);
538 } 1272 }
539 } 1273 }
540 audit_send_reply(pid, seq, AUDIT_LIST, 1, 1, NULL, 0); 1274 for (i = 0; i < AUDIT_INODE_BUCKETS; i++) {
541 1275 list_for_each_entry(entry, &audit_inode_hash[i], list) {
542 mutex_unlock(&audit_netlink_mutex); 1276 struct audit_rule *rule;
543 return 0; 1277
1278 rule = audit_krule_to_rule(&entry->rule);
1279 if (unlikely(!rule))
1280 break;
1281 skb = audit_make_reply(pid, seq, AUDIT_LIST, 0, 1,
1282 rule, sizeof(*rule));
1283 if (skb)
1284 skb_queue_tail(q, skb);
1285 kfree(rule);
1286 }
1287 }
1288 skb = audit_make_reply(pid, seq, AUDIT_LIST, 1, 1, NULL, 0);
1289 if (skb)
1290 skb_queue_tail(q, skb);
544} 1291}
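The queue built here is drained by audit_send_list(), which lives in kernel/audit.c and is only referenced (via kthread_run) in audit_receive_filter() below. A plausible sketch of that consumer; the body and the use of audit_sock (the audit netlink socket in kernel/audit.c) are assumptions, not part of this hunk:

        static int audit_send_list(void *_dest)
        {
                struct audit_netlink_list *dest = _dest;
                struct sk_buff *skb;

                /* drain the replies queued under audit_filter_mutex and
                 * unicast each one to the requesting process */
                while ((skb = __skb_dequeue(&dest->q)) != NULL)
                        netlink_unicast(audit_sock, skb, dest->pid, 0);
                kfree(dest);
                return 0;
        }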
545 1292
546/* List rules using struct audit_rule_data. */ 1293/* List rules using struct audit_rule_data. */
547static int audit_list_rules(void *_dest) 1294static void audit_list_rules(int pid, int seq, struct sk_buff_head *q)
548{ 1295{
549 int pid, seq; 1296 struct sk_buff *skb;
550 int *dest = _dest;
551 struct audit_entry *e; 1297 struct audit_entry *e;
552 int i; 1298 int i;
553 1299
554 pid = dest[0]; 1300 /* This is a blocking read, so use audit_filter_mutex instead of rcu
555 seq = dest[1]; 1301 * iterator to sync with list writers. */
556 kfree(dest);
557
558 mutex_lock(&audit_netlink_mutex);
559
560 /* The *_rcu iterators not needed here because we are
561 always called with audit_netlink_mutex held. */
562 for (i=0; i<AUDIT_NR_FILTERS; i++) { 1302 for (i=0; i<AUDIT_NR_FILTERS; i++) {
563 list_for_each_entry(e, &audit_filter_list[i], list) { 1303 list_for_each_entry(e, &audit_filter_list[i], list) {
564 struct audit_rule_data *data; 1304 struct audit_rule_data *data;
@@ -566,15 +1306,58 @@ static int audit_list_rules(void *_dest)
566 data = audit_krule_to_data(&e->rule); 1306 data = audit_krule_to_data(&e->rule);
567 if (unlikely(!data)) 1307 if (unlikely(!data))
568 break; 1308 break;
569 audit_send_reply(pid, seq, AUDIT_LIST_RULES, 0, 1, 1309 skb = audit_make_reply(pid, seq, AUDIT_LIST_RULES, 0, 1,
570 data, sizeof(*data)); 1310 data, sizeof(*data) + data->buflen);
1311 if (skb)
1312 skb_queue_tail(q, skb);
571 kfree(data); 1313 kfree(data);
572 } 1314 }
573 } 1315 }
574 audit_send_reply(pid, seq, AUDIT_LIST_RULES, 1, 1, NULL, 0); 1316 for (i=0; i<AUDIT_INODE_BUCKETS; i++) {
1317 list_for_each_entry(e, &audit_inode_hash[i], list) {
1318 struct audit_rule_data *data;
575 1319
576 mutex_unlock(&audit_netlink_mutex); 1320 data = audit_krule_to_data(&e->rule);
577 return 0; 1321 if (unlikely(!data))
1322 break;
1323 skb = audit_make_reply(pid, seq, AUDIT_LIST_RULES, 0, 1,
1324 data, sizeof(*data) + data->buflen);
1325 if (skb)
1326 skb_queue_tail(q, skb);
1327 kfree(data);
1328 }
1329 }
1330 skb = audit_make_reply(pid, seq, AUDIT_LIST_RULES, 1, 1, NULL, 0);
1331 if (skb)
1332 skb_queue_tail(q, skb);
1333}
1334
1335/* Log rule additions and removals */
1336static void audit_log_rule_change(uid_t loginuid, u32 sid, char *action,
1337 struct audit_krule *rule, int res)
1338{
1339 struct audit_buffer *ab;
1340
1341 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
1342 if (!ab)
1343 return;
1344 audit_log_format(ab, "auid=%u", loginuid);
1345 if (sid) {
1346 char *ctx = NULL;
1347 u32 len;
1348 if (selinux_ctxid_to_string(sid, &ctx, &len))
1349 audit_log_format(ab, " ssid=%u", sid);
1350 else
1351 audit_log_format(ab, " subj=%s", ctx);
1352 kfree(ctx);
1353 }
1354 audit_log_format(ab, " %s rule key=", action);
1355 if (rule->filterkey)
1356 audit_log_untrustedstring(ab, rule->filterkey);
1357 else
1358 audit_log_format(ab, "(null)");
1359 audit_log_format(ab, " list=%d res=%d", rule->listnr, res);
1360 audit_log_end(ab);
578} 1361}
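Pieced together from the format strings above, a successful add of a keyed rule to the syscall-exit list produces a CONFIG_CHANGE payload along these lines (values illustrative; list=4 assumes AUDIT_FILTER_EXIT, the subj= field appears only when an SELinux sid is available, and the exact key rendering depends on audit_log_untrustedstring()):

        auid=500 subj=root:auditadm_r:auditctl_t:s0 add rule key=passwd-watch list=4 res=1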
579 1362
580/** 1363/**
@@ -592,7 +1375,7 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
592 size_t datasz, uid_t loginuid, u32 sid) 1375 size_t datasz, uid_t loginuid, u32 sid)
593{ 1376{
594 struct task_struct *tsk; 1377 struct task_struct *tsk;
595 int *dest; 1378 struct audit_netlink_list *dest;
596 int err = 0; 1379 int err = 0;
597 struct audit_entry *entry; 1380 struct audit_entry *entry;
598 1381
@@ -605,18 +1388,22 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
605 * happen if we're actually running in the context of auditctl 1388 * happen if we're actually running in the context of auditctl
606 * trying to _send_ the stuff */ 1389 * trying to _send_ the stuff */
607 1390
608 dest = kmalloc(2 * sizeof(int), GFP_KERNEL); 1391 dest = kmalloc(sizeof(struct audit_netlink_list), GFP_KERNEL);
609 if (!dest) 1392 if (!dest)
610 return -ENOMEM; 1393 return -ENOMEM;
611 dest[0] = pid; 1394 dest->pid = pid;
612 dest[1] = seq; 1395 skb_queue_head_init(&dest->q);
613 1396
1397 mutex_lock(&audit_filter_mutex);
614 if (type == AUDIT_LIST) 1398 if (type == AUDIT_LIST)
615 tsk = kthread_run(audit_list, dest, "audit_list"); 1399 audit_list(pid, seq, &dest->q);
616 else 1400 else
617 tsk = kthread_run(audit_list_rules, dest, 1401 audit_list_rules(pid, seq, &dest->q);
618 "audit_list_rules"); 1402 mutex_unlock(&audit_filter_mutex);
1403
1404 tsk = kthread_run(audit_send_list, dest, "audit_send_list");
619 if (IS_ERR(tsk)) { 1405 if (IS_ERR(tsk)) {
1406 skb_queue_purge(&dest->q);
620 kfree(dest); 1407 kfree(dest);
621 err = PTR_ERR(tsk); 1408 err = PTR_ERR(tsk);
622 } 1409 }
@@ -632,23 +1419,7 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
632 1419
633 err = audit_add_rule(entry, 1420 err = audit_add_rule(entry,
634 &audit_filter_list[entry->rule.listnr]); 1421 &audit_filter_list[entry->rule.listnr]);
635 if (sid) { 1422 audit_log_rule_change(loginuid, sid, "add", &entry->rule, !err);
636 char *ctx = NULL;
637 u32 len;
638 if (selinux_ctxid_to_string(sid, &ctx, &len)) {
639 /* Maybe call audit_panic? */
640 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
641 "auid=%u ssid=%u add rule to list=%d res=%d",
642 loginuid, sid, entry->rule.listnr, !err);
643 } else
644 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
645 "auid=%u subj=%s add rule to list=%d res=%d",
646 loginuid, ctx, entry->rule.listnr, !err);
647 kfree(ctx);
648 } else
649 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
650 "auid=%u add rule to list=%d res=%d",
651 loginuid, entry->rule.listnr, !err);
652 1423
653 if (err) 1424 if (err)
654 audit_free_rule(entry); 1425 audit_free_rule(entry);
@@ -664,24 +1435,8 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
664 1435
665 err = audit_del_rule(entry, 1436 err = audit_del_rule(entry,
666 &audit_filter_list[entry->rule.listnr]); 1437 &audit_filter_list[entry->rule.listnr]);
667 1438 audit_log_rule_change(loginuid, sid, "remove", &entry->rule,
668 if (sid) { 1439 !err);
669 char *ctx = NULL;
670 u32 len;
671 if (selinux_ctxid_to_string(sid, &ctx, &len)) {
672 /* Maybe call audit_panic? */
673 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
674 "auid=%u ssid=%u remove rule from list=%d res=%d",
675 loginuid, sid, entry->rule.listnr, !err);
676 } else
677 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
678 "auid=%u subj=%s remove rule from list=%d res=%d",
679 loginuid, ctx, entry->rule.listnr, !err);
680 kfree(ctx);
681 } else
682 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
683 "auid=%u remove rule from list=%d res=%d",
684 loginuid, entry->rule.listnr, !err);
685 1440
686 audit_free_rule(entry); 1441 audit_free_rule(entry);
687 break; 1442 break;
@@ -712,7 +1467,43 @@ int audit_comparator(const u32 left, const u32 op, const u32 right)
712 return 0; 1467 return 0;
713} 1468}
714 1469
1470/* Compare given dentry name with last component in given path,
1471 * return of 0 indicates a match. */
1472int audit_compare_dname_path(const char *dname, const char *path,
1473 int *dirlen)
1474{
1475 int dlen, plen;
1476 const char *p;
1477
1478 if (!dname || !path)
1479 return 1;
715 1480
1481 dlen = strlen(dname);
1482 plen = strlen(path);
1483 if (plen < dlen)
1484 return 1;
1485
1486 /* disregard trailing slashes */
1487 p = path + plen - 1;
1488 while ((*p == '/') && (p > path))
1489 p--;
1490
1491 /* find last path component */
1492 p = p - dlen + 1;
1493 if (p < path)
1494 return 1;
1495 else if (p > path) {
1496 if (*--p != '/')
1497 return 1;
1498 else
1499 p++;
1500 }
1501
1502 /* return length of path's directory component */
1503 if (dirlen)
1504 *dirlen = p - path;
1505 return strncmp(p, dname, dlen);
1506}
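Worked examples for the comparison above, traced directly from the code; on a match, *dirlen receives the length of the directory component including its trailing slash:

        /* audit_compare_dname_path("passwd", "/etc/passwd", &dirlen)
         *      dlen = 6, plen = 11, last component found at path + 5
         *      -> returns 0 (match), dirlen = 5 ("/etc/")
         * audit_compare_dname_path("passwd", "/etc/passwd/", &dirlen)
         *      trailing slashes are disregarded -> same result
         * audit_compare_dname_path("passwd", "passwd", &dirlen)
         *      -> returns 0 (match), dirlen = 0 (relative path; the directory
         *         component is the cwd, see the name_len == 0 case logged in
         *         auditsc.c below)
         * audit_compare_dname_path("passwd", "/etc/passwd.bak", NULL)
         *      the byte before the final dlen characters is 's', not '/'
         *      -> returns 1 (no match) */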
716 1507
717static int audit_filter_user_rules(struct netlink_skb_parms *cb, 1508static int audit_filter_user_rules(struct netlink_skb_parms *cb,
718 struct audit_krule *rule, 1509 struct audit_krule *rule,
@@ -744,7 +1535,6 @@ static int audit_filter_user_rules(struct netlink_skb_parms *cb,
744 } 1535 }
745 switch (rule->action) { 1536 switch (rule->action) {
746 case AUDIT_NEVER: *state = AUDIT_DISABLED; break; 1537 case AUDIT_NEVER: *state = AUDIT_DISABLED; break;
747 case AUDIT_POSSIBLE: *state = AUDIT_BUILD_CONTEXT; break;
748 case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break; 1538 case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break;
749 } 1539 }
750 return 1; 1540 return 1;
@@ -806,11 +1596,16 @@ static inline int audit_rule_has_selinux(struct audit_krule *rule)
806 for (i = 0; i < rule->field_count; i++) { 1596 for (i = 0; i < rule->field_count; i++) {
807 struct audit_field *f = &rule->fields[i]; 1597 struct audit_field *f = &rule->fields[i];
808 switch (f->type) { 1598 switch (f->type) {
809 case AUDIT_SE_USER: 1599 case AUDIT_SUBJ_USER:
810 case AUDIT_SE_ROLE: 1600 case AUDIT_SUBJ_ROLE:
811 case AUDIT_SE_TYPE: 1601 case AUDIT_SUBJ_TYPE:
812 case AUDIT_SE_SEN: 1602 case AUDIT_SUBJ_SEN:
813 case AUDIT_SE_CLR: 1603 case AUDIT_SUBJ_CLR:
1604 case AUDIT_OBJ_USER:
1605 case AUDIT_OBJ_ROLE:
1606 case AUDIT_OBJ_TYPE:
1607 case AUDIT_OBJ_LEV_LOW:
1608 case AUDIT_OBJ_LEV_HIGH:
814 return 1; 1609 return 1;
815 } 1610 }
816 } 1611 }
@@ -826,32 +1621,65 @@ static inline int audit_rule_has_selinux(struct audit_krule *rule)
826int selinux_audit_rule_update(void) 1621int selinux_audit_rule_update(void)
827{ 1622{
828 struct audit_entry *entry, *n, *nentry; 1623 struct audit_entry *entry, *n, *nentry;
1624 struct audit_watch *watch;
829 int i, err = 0; 1625 int i, err = 0;
830 1626
831 /* audit_netlink_mutex synchronizes the writers */ 1627 /* audit_filter_mutex synchronizes the writers */
832 mutex_lock(&audit_netlink_mutex); 1628 mutex_lock(&audit_filter_mutex);
833 1629
834 for (i = 0; i < AUDIT_NR_FILTERS; i++) { 1630 for (i = 0; i < AUDIT_NR_FILTERS; i++) {
835 list_for_each_entry_safe(entry, n, &audit_filter_list[i], list) { 1631 list_for_each_entry_safe(entry, n, &audit_filter_list[i], list) {
836 if (!audit_rule_has_selinux(&entry->rule)) 1632 if (!audit_rule_has_selinux(&entry->rule))
837 continue; 1633 continue;
838 1634
839 nentry = audit_dupe_rule(&entry->rule); 1635 watch = entry->rule.watch;
1636 nentry = audit_dupe_rule(&entry->rule, watch);
840 if (unlikely(IS_ERR(nentry))) { 1637 if (unlikely(IS_ERR(nentry))) {
841 /* save the first error encountered for the 1638 /* save the first error encountered for the
842 * return value */ 1639 * return value */
843 if (!err) 1640 if (!err)
844 err = PTR_ERR(nentry); 1641 err = PTR_ERR(nentry);
845 audit_panic("error updating selinux filters"); 1642 audit_panic("error updating selinux filters");
1643 if (watch)
1644 list_del(&entry->rule.rlist);
846 list_del_rcu(&entry->list); 1645 list_del_rcu(&entry->list);
847 } else { 1646 } else {
1647 if (watch) {
1648 list_add(&nentry->rule.rlist,
1649 &watch->rules);
1650 list_del(&entry->rule.rlist);
1651 }
848 list_replace_rcu(&entry->list, &nentry->list); 1652 list_replace_rcu(&entry->list, &nentry->list);
849 } 1653 }
850 call_rcu(&entry->rcu, audit_free_rule_rcu); 1654 call_rcu(&entry->rcu, audit_free_rule_rcu);
851 } 1655 }
852 } 1656 }
853 1657
854 mutex_unlock(&audit_netlink_mutex); 1658 mutex_unlock(&audit_filter_mutex);
855 1659
856 return err; 1660 return err;
857} 1661}
1662
1663/* Update watch data in audit rules based on inotify events. */
1664void audit_handle_ievent(struct inotify_watch *i_watch, u32 wd, u32 mask,
1665 u32 cookie, const char *dname, struct inode *inode)
1666{
1667 struct audit_parent *parent;
1668
1669 parent = container_of(i_watch, struct audit_parent, wdata);
1670
1671 if (mask & (IN_CREATE|IN_MOVED_TO) && inode)
1672 audit_update_watch(parent, dname, inode->i_sb->s_dev,
1673 inode->i_ino, 0);
1674 else if (mask & (IN_DELETE|IN_MOVED_FROM))
1675 audit_update_watch(parent, dname, (dev_t)-1, (unsigned long)-1, 1);
1676 /* inotify automatically removes the watch and sends IN_IGNORED */
1677 else if (mask & (IN_DELETE_SELF|IN_UNMOUNT))
1678 audit_remove_parent_watches(parent);
1679 /* inotify does not remove the watch, so remove it manually */
1680 else if (mask & IN_MOVE_SELF) {
1681 audit_remove_parent_watches(parent);
1682 inotify_remove_watch_locked(audit_ih, i_watch);
1683 } else if (mask & IN_IGNORED)
1684 put_inotify_watch(i_watch);
1685}
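This handler is the event callback for the kernel-internal inotify interface; the registration happens earlier in kernel/auditfilter.c and is not shown in this hunk. A sketch of what that wiring is assumed to look like (the operations-table layout, the init call and the destroy helper's name are assumptions):

        static const struct inotify_operations audit_inotify_ops = {
                .handle_event   = audit_handle_ievent,
                .destroy_watch  = audit_free_parent,    /* assumed helper name */
        };

        /* at init time, also assumed: audit_ih = inotify_init(&audit_inotify_ops); */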
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 1c03a4ed1b27..ae40ac8c39e7 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -3,7 +3,7 @@
3 * 3 *
4 * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina. 4 * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina.
5 * Copyright 2005 Hewlett-Packard Development Company, L.P. 5 * Copyright 2005 Hewlett-Packard Development Company, L.P.
6 * Copyright (C) 2005 IBM Corporation 6 * Copyright (C) 2005, 2006 IBM Corporation
7 * All Rights Reserved. 7 * All Rights Reserved.
8 * 8 *
9 * This program is free software; you can redistribute it and/or modify 9 * This program is free software; you can redistribute it and/or modify
@@ -29,6 +29,9 @@
29 * this file -- see entry.S) is based on a GPL'd patch written by 29 * this file -- see entry.S) is based on a GPL'd patch written by
30 * okir@suse.de and Copyright 2003 SuSE Linux AG. 30 * okir@suse.de and Copyright 2003 SuSE Linux AG.
31 * 31 *
32 * POSIX message queue support added by George Wilson <ltcgcw@us.ibm.com>,
33 * 2006.
34 *
32 * The support of additional filter rules compares (>, <, >=, <=) was 35 * The support of additional filter rules compares (>, <, >=, <=) was
33 * added by Dustin Kirkland <dustin.kirkland@us.ibm.com>, 2005. 36 * added by Dustin Kirkland <dustin.kirkland@us.ibm.com>, 2005.
34 * 37 *
@@ -49,6 +52,7 @@
49#include <linux/module.h> 52#include <linux/module.h>
50#include <linux/mount.h> 53#include <linux/mount.h>
51#include <linux/socket.h> 54#include <linux/socket.h>
55#include <linux/mqueue.h>
52#include <linux/audit.h> 56#include <linux/audit.h>
53#include <linux/personality.h> 57#include <linux/personality.h>
54#include <linux/time.h> 58#include <linux/time.h>
@@ -59,6 +63,8 @@
59#include <linux/list.h> 63#include <linux/list.h>
60#include <linux/tty.h> 64#include <linux/tty.h>
61#include <linux/selinux.h> 65#include <linux/selinux.h>
66#include <linux/binfmts.h>
67#include <linux/syscalls.h>
62 68
63#include "audit.h" 69#include "audit.h"
64 70
@@ -76,6 +82,9 @@ extern int audit_enabled;
76 * path_lookup. */ 82 * path_lookup. */
77#define AUDIT_NAMES_RESERVED 7 83#define AUDIT_NAMES_RESERVED 7
78 84
85/* Indicates that audit should log the full pathname. */
86#define AUDIT_NAME_FULL -1
87
79/* When fs/namei.c:getname() is called, we store the pointer in name and 88/* When fs/namei.c:getname() is called, we store the pointer in name and
80 * we don't let putname() free it (instead we free all of the saved 89 * we don't let putname() free it (instead we free all of the saved
81 * pointers at syscall exit time). 90 * pointers at syscall exit time).
@@ -83,8 +92,9 @@ extern int audit_enabled;
83 * Further, in fs/namei.c:path_lookup() we store the inode and device. */ 92 * Further, in fs/namei.c:path_lookup() we store the inode and device. */
84struct audit_names { 93struct audit_names {
85 const char *name; 94 const char *name;
95 int name_len; /* number of name's characters to log */
96 unsigned name_put; /* call __putname() for this name */
86 unsigned long ino; 97 unsigned long ino;
87 unsigned long pino;
88 dev_t dev; 98 dev_t dev;
89 umode_t mode; 99 umode_t mode;
90 uid_t uid; 100 uid_t uid;
@@ -100,6 +110,33 @@ struct audit_aux_data {
100 110
101#define AUDIT_AUX_IPCPERM 0 111#define AUDIT_AUX_IPCPERM 0
102 112
113struct audit_aux_data_mq_open {
114 struct audit_aux_data d;
115 int oflag;
116 mode_t mode;
117 struct mq_attr attr;
118};
119
120struct audit_aux_data_mq_sendrecv {
121 struct audit_aux_data d;
122 mqd_t mqdes;
123 size_t msg_len;
124 unsigned int msg_prio;
125 struct timespec abs_timeout;
126};
127
128struct audit_aux_data_mq_notify {
129 struct audit_aux_data d;
130 mqd_t mqdes;
131 struct sigevent notification;
132};
133
134struct audit_aux_data_mq_getsetattr {
135 struct audit_aux_data d;
136 mqd_t mqdes;
137 struct mq_attr mqstat;
138};
139
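Each of these aux structures is filled in by the matching __audit_mq_*() hook and chained onto the current syscall context, following the pattern of the existing audit_aux_data handlers in this file (the tail of __audit_mq_open() is cut off at the end of this diff). A sketch of the attach step, with the field copies elided and the allocation flags assumed:

        struct audit_aux_data_mq_open *ax;

        ax = kmalloc(sizeof(*ax), GFP_ATOMIC);
        if (!ax)
                return -ENOMEM;
        /* ... copy oflag, mode and the user's mq_attr into *ax ... */
        ax->d.type = AUDIT_MQ_OPEN;
        ax->d.next = context->aux;
        context->aux = (void *)ax;
        return 0;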
103struct audit_aux_data_ipcctl { 140struct audit_aux_data_ipcctl {
104 struct audit_aux_data d; 141 struct audit_aux_data d;
105 struct ipc_perm p; 142 struct ipc_perm p;
@@ -110,6 +147,13 @@ struct audit_aux_data_ipcctl {
110 u32 osid; 147 u32 osid;
111}; 148};
112 149
150struct audit_aux_data_execve {
151 struct audit_aux_data d;
152 int argc;
153 int envc;
154 char mem[0];
155};
156
113struct audit_aux_data_socketcall { 157struct audit_aux_data_socketcall {
114 struct audit_aux_data d; 158 struct audit_aux_data d;
115 int nargs; 159 int nargs;
@@ -142,13 +186,14 @@ struct audit_context {
142 int auditable; /* 1 if record should be written */ 186 int auditable; /* 1 if record should be written */
143 int name_count; 187 int name_count;
144 struct audit_names names[AUDIT_NAMES]; 188 struct audit_names names[AUDIT_NAMES];
189 char * filterkey; /* key for rule that triggered record */
145 struct dentry * pwd; 190 struct dentry * pwd;
146 struct vfsmount * pwdmnt; 191 struct vfsmount * pwdmnt;
147 struct audit_context *previous; /* For nested syscalls */ 192 struct audit_context *previous; /* For nested syscalls */
148 struct audit_aux_data *aux; 193 struct audit_aux_data *aux;
149 194
150 /* Save things to print about task_struct */ 195 /* Save things to print about task_struct */
151 pid_t pid; 196 pid_t pid, ppid;
152 uid_t uid, euid, suid, fsuid; 197 uid_t uid, euid, suid, fsuid;
153 gid_t gid, egid, sgid, fsgid; 198 gid_t gid, egid, sgid, fsgid;
154 unsigned long personality; 199 unsigned long personality;
@@ -160,12 +205,13 @@ struct audit_context {
160#endif 205#endif
161}; 206};
162 207
163 208/* Determine if any context name data matches a rule's watch data */
164/* Compare a task_struct with an audit_rule. Return 1 on match, 0 209/* Compare a task_struct with an audit_rule. Return 1 on match, 0
165 * otherwise. */ 210 * otherwise. */
166static int audit_filter_rules(struct task_struct *tsk, 211static int audit_filter_rules(struct task_struct *tsk,
167 struct audit_krule *rule, 212 struct audit_krule *rule,
168 struct audit_context *ctx, 213 struct audit_context *ctx,
214 struct audit_names *name,
169 enum audit_state *state) 215 enum audit_state *state)
170{ 216{
171 int i, j, need_sid = 1; 217 int i, j, need_sid = 1;
@@ -179,6 +225,10 @@ static int audit_filter_rules(struct task_struct *tsk,
179 case AUDIT_PID: 225 case AUDIT_PID:
180 result = audit_comparator(tsk->pid, f->op, f->val); 226 result = audit_comparator(tsk->pid, f->op, f->val);
181 break; 227 break;
228 case AUDIT_PPID:
229 if (ctx)
230 result = audit_comparator(ctx->ppid, f->op, f->val);
231 break;
182 case AUDIT_UID: 232 case AUDIT_UID:
183 result = audit_comparator(tsk->uid, f->op, f->val); 233 result = audit_comparator(tsk->uid, f->op, f->val);
184 break; 234 break;
@@ -224,7 +274,10 @@ static int audit_filter_rules(struct task_struct *tsk,
224 } 274 }
225 break; 275 break;
226 case AUDIT_DEVMAJOR: 276 case AUDIT_DEVMAJOR:
227 if (ctx) { 277 if (name)
278 result = audit_comparator(MAJOR(name->dev),
279 f->op, f->val);
280 else if (ctx) {
228 for (j = 0; j < ctx->name_count; j++) { 281 for (j = 0; j < ctx->name_count; j++) {
229 if (audit_comparator(MAJOR(ctx->names[j].dev), f->op, f->val)) { 282 if (audit_comparator(MAJOR(ctx->names[j].dev), f->op, f->val)) {
230 ++result; 283 ++result;
@@ -234,7 +287,10 @@ static int audit_filter_rules(struct task_struct *tsk,
234 } 287 }
235 break; 288 break;
236 case AUDIT_DEVMINOR: 289 case AUDIT_DEVMINOR:
237 if (ctx) { 290 if (name)
291 result = audit_comparator(MINOR(name->dev),
292 f->op, f->val);
293 else if (ctx) {
238 for (j = 0; j < ctx->name_count; j++) { 294 for (j = 0; j < ctx->name_count; j++) {
239 if (audit_comparator(MINOR(ctx->names[j].dev), f->op, f->val)) { 295 if (audit_comparator(MINOR(ctx->names[j].dev), f->op, f->val)) {
240 ++result; 296 ++result;
@@ -244,26 +300,32 @@ static int audit_filter_rules(struct task_struct *tsk,
244 } 300 }
245 break; 301 break;
246 case AUDIT_INODE: 302 case AUDIT_INODE:
247 if (ctx) { 303 if (name)
304 result = (name->ino == f->val);
305 else if (ctx) {
248 for (j = 0; j < ctx->name_count; j++) { 306 for (j = 0; j < ctx->name_count; j++) {
249 if (audit_comparator(ctx->names[j].ino, f->op, f->val) || 307 if (audit_comparator(ctx->names[j].ino, f->op, f->val)) {
250 audit_comparator(ctx->names[j].pino, f->op, f->val)) {
251 ++result; 308 ++result;
252 break; 309 break;
253 } 310 }
254 } 311 }
255 } 312 }
256 break; 313 break;
314 case AUDIT_WATCH:
315 if (name && rule->watch->ino != (unsigned long)-1)
316 result = (name->dev == rule->watch->dev &&
317 name->ino == rule->watch->ino);
318 break;
257 case AUDIT_LOGINUID: 319 case AUDIT_LOGINUID:
258 result = 0; 320 result = 0;
259 if (ctx) 321 if (ctx)
260 result = audit_comparator(ctx->loginuid, f->op, f->val); 322 result = audit_comparator(ctx->loginuid, f->op, f->val);
261 break; 323 break;
262 case AUDIT_SE_USER: 324 case AUDIT_SUBJ_USER:
263 case AUDIT_SE_ROLE: 325 case AUDIT_SUBJ_ROLE:
264 case AUDIT_SE_TYPE: 326 case AUDIT_SUBJ_TYPE:
265 case AUDIT_SE_SEN: 327 case AUDIT_SUBJ_SEN:
266 case AUDIT_SE_CLR: 328 case AUDIT_SUBJ_CLR:
267 /* NOTE: this may return negative values indicating 329 /* NOTE: this may return negative values indicating
268 a temporary error. We simply treat this as a 330 a temporary error. We simply treat this as a
269 match for now to avoid losing information that 331 match for now to avoid losing information that
@@ -280,6 +342,46 @@ static int audit_filter_rules(struct task_struct *tsk,
280 ctx); 342 ctx);
281 } 343 }
282 break; 344 break;
345 case AUDIT_OBJ_USER:
346 case AUDIT_OBJ_ROLE:
347 case AUDIT_OBJ_TYPE:
348 case AUDIT_OBJ_LEV_LOW:
349 case AUDIT_OBJ_LEV_HIGH:
350 /* The above note for AUDIT_SUBJ_USER...AUDIT_SUBJ_CLR
351 also applies here */
352 if (f->se_rule) {
353 /* Find files that match */
354 if (name) {
355 result = selinux_audit_rule_match(
356 name->osid, f->type, f->op,
357 f->se_rule, ctx);
358 } else if (ctx) {
359 for (j = 0; j < ctx->name_count; j++) {
360 if (selinux_audit_rule_match(
361 ctx->names[j].osid,
362 f->type, f->op,
363 f->se_rule, ctx)) {
364 ++result;
365 break;
366 }
367 }
368 }
369 /* Find ipc objects that match */
370 if (ctx) {
371 struct audit_aux_data *aux;
372 for (aux = ctx->aux; aux;
373 aux = aux->next) {
374 if (aux->type == AUDIT_IPC) {
375 struct audit_aux_data_ipcctl *axi = (void *)aux;
376 if (selinux_audit_rule_match(axi->osid, f->type, f->op, f->se_rule, ctx)) {
377 ++result;
378 break;
379 }
380 }
381 }
382 }
383 }
384 break;
283 case AUDIT_ARG0: 385 case AUDIT_ARG0:
284 case AUDIT_ARG1: 386 case AUDIT_ARG1:
285 case AUDIT_ARG2: 387 case AUDIT_ARG2:
@@ -287,14 +389,19 @@ static int audit_filter_rules(struct task_struct *tsk,
287 if (ctx) 389 if (ctx)
288 result = audit_comparator(ctx->argv[f->type-AUDIT_ARG0], f->op, f->val); 390 result = audit_comparator(ctx->argv[f->type-AUDIT_ARG0], f->op, f->val);
289 break; 391 break;
392 case AUDIT_FILTERKEY:
393 /* ignore this field for filtering */
394 result = 1;
395 break;
290 } 396 }
291 397
292 if (!result) 398 if (!result)
293 return 0; 399 return 0;
294 } 400 }
401 if (rule->filterkey)
402 ctx->filterkey = kstrdup(rule->filterkey, GFP_ATOMIC);
295 switch (rule->action) { 403 switch (rule->action) {
296 case AUDIT_NEVER: *state = AUDIT_DISABLED; break; 404 case AUDIT_NEVER: *state = AUDIT_DISABLED; break;
297 case AUDIT_POSSIBLE: *state = AUDIT_BUILD_CONTEXT; break;
298 case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break; 405 case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break;
299 } 406 }
300 return 1; 407 return 1;
@@ -311,7 +418,7 @@ static enum audit_state audit_filter_task(struct task_struct *tsk)
311 418
312 rcu_read_lock(); 419 rcu_read_lock();
313 list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_TASK], list) { 420 list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_TASK], list) {
314 if (audit_filter_rules(tsk, &e->rule, NULL, &state)) { 421 if (audit_filter_rules(tsk, &e->rule, NULL, NULL, &state)) {
315 rcu_read_unlock(); 422 rcu_read_unlock();
316 return state; 423 return state;
317 } 424 }
@@ -341,8 +448,47 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk,
341 int bit = AUDIT_BIT(ctx->major); 448 int bit = AUDIT_BIT(ctx->major);
342 449
343 list_for_each_entry_rcu(e, list, list) { 450 list_for_each_entry_rcu(e, list, list) {
344 if ((e->rule.mask[word] & bit) == bit 451 if ((e->rule.mask[word] & bit) == bit &&
345 && audit_filter_rules(tsk, &e->rule, ctx, &state)) { 452 audit_filter_rules(tsk, &e->rule, ctx, NULL,
453 &state)) {
454 rcu_read_unlock();
455 return state;
456 }
457 }
458 }
459 rcu_read_unlock();
460 return AUDIT_BUILD_CONTEXT;
461}
462
463/* At syscall exit time, this filter is called if any audit_names[] have been
464 * collected during syscall processing. We only check rules in sublists at hash
465 * buckets applicable to the inode numbers in audit_names[].
466 * Regarding audit_state, same rules apply as for audit_filter_syscall().
467 */
468enum audit_state audit_filter_inodes(struct task_struct *tsk,
469 struct audit_context *ctx)
470{
471 int i;
472 struct audit_entry *e;
473 enum audit_state state;
474
475 if (audit_pid && tsk->tgid == audit_pid)
476 return AUDIT_DISABLED;
477
478 rcu_read_lock();
479 for (i = 0; i < ctx->name_count; i++) {
480 int word = AUDIT_WORD(ctx->major);
481 int bit = AUDIT_BIT(ctx->major);
482 struct audit_names *n = &ctx->names[i];
483 int h = audit_hash_ino((u32)n->ino);
484 struct list_head *list = &audit_inode_hash[h];
485
486 if (list_empty(list))
487 continue;
488
489 list_for_each_entry_rcu(e, list, list) {
490 if ((e->rule.mask[word] & bit) == bit &&
491 audit_filter_rules(tsk, &e->rule, ctx, n, &state)) {
346 rcu_read_unlock(); 492 rcu_read_unlock();
347 return state; 493 return state;
348 } 494 }
@@ -352,6 +498,11 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk,
352 return AUDIT_BUILD_CONTEXT; 498 return AUDIT_BUILD_CONTEXT;
353} 499}
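Both syscall filters gate on (e->rule.mask[word] & bit), the rule's per-syscall bitmap. The helpers come from include/linux/audit.h (not shown here) and are assumed to reduce to roughly:

        word = ctx->major / 32;                 /* AUDIT_WORD(ctx->major) */
        bit  = 1 << (ctx->major % 32);          /* AUDIT_BIT(ctx->major)  */
        /* the rule applies to this syscall iff (rule->mask[word] & bit) != 0 */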
354 500
501void audit_set_auditable(struct audit_context *ctx)
502{
503 ctx->auditable = 1;
504}
505
355static inline struct audit_context *audit_get_context(struct task_struct *tsk, 506static inline struct audit_context *audit_get_context(struct task_struct *tsk,
356 int return_valid, 507 int return_valid,
357 int return_code) 508 int return_code)
@@ -365,12 +516,22 @@ static inline struct audit_context *audit_get_context(struct task_struct *tsk,
365 516
366 if (context->in_syscall && !context->auditable) { 517 if (context->in_syscall && !context->auditable) {
367 enum audit_state state; 518 enum audit_state state;
519
368 state = audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_EXIT]); 520 state = audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_EXIT]);
521 if (state == AUDIT_RECORD_CONTEXT) {
522 context->auditable = 1;
523 goto get_context;
524 }
525
526 state = audit_filter_inodes(tsk, context);
369 if (state == AUDIT_RECORD_CONTEXT) 527 if (state == AUDIT_RECORD_CONTEXT)
370 context->auditable = 1; 528 context->auditable = 1;
529
371 } 530 }
372 531
532get_context:
373 context->pid = tsk->pid; 533 context->pid = tsk->pid;
534 context->ppid = sys_getppid(); /* sic. tsk == current in all cases */
374 context->uid = tsk->uid; 535 context->uid = tsk->uid;
375 context->gid = tsk->gid; 536 context->gid = tsk->gid;
376 context->euid = tsk->euid; 537 context->euid = tsk->euid;
@@ -413,7 +574,7 @@ static inline void audit_free_names(struct audit_context *context)
413#endif 574#endif
414 575
415 for (i = 0; i < context->name_count; i++) { 576 for (i = 0; i < context->name_count; i++) {
416 if (context->names[i].name) 577 if (context->names[i].name && context->names[i].name_put)
417 __putname(context->names[i].name); 578 __putname(context->names[i].name);
418 } 579 }
419 context->name_count = 0; 580 context->name_count = 0;
@@ -513,6 +674,7 @@ static inline void audit_free_context(struct audit_context *context)
513 } 674 }
514 audit_free_names(context); 675 audit_free_names(context);
515 audit_free_aux(context); 676 audit_free_aux(context);
677 kfree(context->filterkey);
516 kfree(context); 678 kfree(context);
517 context = previous; 679 context = previous;
518 } while (context); 680 } while (context);
@@ -544,8 +706,7 @@ static void audit_log_task_context(struct audit_buffer *ab)
544 return; 706 return;
545 707
546error_path: 708error_path:
547 if (ctx) 709 kfree(ctx);
548 kfree(ctx);
549 audit_panic("error in audit_log_task_context"); 710 audit_panic("error in audit_log_task_context");
550 return; 711 return;
551} 712}
@@ -606,7 +767,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
606 tty = "(none)"; 767 tty = "(none)";
607 audit_log_format(ab, 768 audit_log_format(ab,
608 " a0=%lx a1=%lx a2=%lx a3=%lx items=%d" 769 " a0=%lx a1=%lx a2=%lx a3=%lx items=%d"
609 " pid=%d auid=%u uid=%u gid=%u" 770 " ppid=%d pid=%d auid=%u uid=%u gid=%u"
610 " euid=%u suid=%u fsuid=%u" 771 " euid=%u suid=%u fsuid=%u"
611 " egid=%u sgid=%u fsgid=%u tty=%s", 772 " egid=%u sgid=%u fsgid=%u tty=%s",
612 context->argv[0], 773 context->argv[0],
@@ -614,6 +775,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
614 context->argv[2], 775 context->argv[2],
615 context->argv[3], 776 context->argv[3],
616 context->name_count, 777 context->name_count,
778 context->ppid,
617 context->pid, 779 context->pid,
618 context->loginuid, 780 context->loginuid,
619 context->uid, 781 context->uid,
@@ -621,6 +783,11 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
621 context->euid, context->suid, context->fsuid, 783 context->euid, context->suid, context->fsuid,
622 context->egid, context->sgid, context->fsgid, tty); 784 context->egid, context->sgid, context->fsgid, tty);
623 audit_log_task_info(ab, tsk); 785 audit_log_task_info(ab, tsk);
786 if (context->filterkey) {
787 audit_log_format(ab, " key=");
788 audit_log_untrustedstring(ab, context->filterkey);
789 } else
790 audit_log_format(ab, " key=(null)");
624 audit_log_end(ab); 791 audit_log_end(ab);
625 792
626 for (aux = context->aux; aux; aux = aux->next) { 793 for (aux = context->aux; aux; aux = aux->next) {
@@ -630,11 +797,48 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
630 continue; /* audit_panic has been called */ 797 continue; /* audit_panic has been called */
631 798
632 switch (aux->type) { 799 switch (aux->type) {
800 case AUDIT_MQ_OPEN: {
801 struct audit_aux_data_mq_open *axi = (void *)aux;
802 audit_log_format(ab,
803 "oflag=0x%x mode=%#o mq_flags=0x%lx mq_maxmsg=%ld "
804 "mq_msgsize=%ld mq_curmsgs=%ld",
805 axi->oflag, axi->mode, axi->attr.mq_flags,
806 axi->attr.mq_maxmsg, axi->attr.mq_msgsize,
807 axi->attr.mq_curmsgs);
808 break; }
809
810 case AUDIT_MQ_SENDRECV: {
811 struct audit_aux_data_mq_sendrecv *axi = (void *)aux;
812 audit_log_format(ab,
813 "mqdes=%d msg_len=%zd msg_prio=%u "
814 "abs_timeout_sec=%ld abs_timeout_nsec=%ld",
815 axi->mqdes, axi->msg_len, axi->msg_prio,
816 axi->abs_timeout.tv_sec, axi->abs_timeout.tv_nsec);
817 break; }
818
819 case AUDIT_MQ_NOTIFY: {
820 struct audit_aux_data_mq_notify *axi = (void *)aux;
821 audit_log_format(ab,
822 "mqdes=%d sigev_signo=%d",
823 axi->mqdes,
824 axi->notification.sigev_signo);
825 break; }
826
827 case AUDIT_MQ_GETSETATTR: {
828 struct audit_aux_data_mq_getsetattr *axi = (void *)aux;
829 audit_log_format(ab,
830 "mqdes=%d mq_flags=0x%lx mq_maxmsg=%ld mq_msgsize=%ld "
831 "mq_curmsgs=%ld ",
832 axi->mqdes,
833 axi->mqstat.mq_flags, axi->mqstat.mq_maxmsg,
834 axi->mqstat.mq_msgsize, axi->mqstat.mq_curmsgs);
835 break; }
836
633 case AUDIT_IPC: { 837 case AUDIT_IPC: {
634 struct audit_aux_data_ipcctl *axi = (void *)aux; 838 struct audit_aux_data_ipcctl *axi = (void *)aux;
635 audit_log_format(ab, 839 audit_log_format(ab,
636 " qbytes=%lx iuid=%u igid=%u mode=%x", 840 "ouid=%u ogid=%u mode=%x",
637 axi->qbytes, axi->uid, axi->gid, axi->mode); 841 axi->uid, axi->gid, axi->mode);
638 if (axi->osid != 0) { 842 if (axi->osid != 0) {
639 char *ctx = NULL; 843 char *ctx = NULL;
640 u32 len; 844 u32 len;
@@ -652,19 +856,18 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
652 case AUDIT_IPC_SET_PERM: { 856 case AUDIT_IPC_SET_PERM: {
653 struct audit_aux_data_ipcctl *axi = (void *)aux; 857 struct audit_aux_data_ipcctl *axi = (void *)aux;
654 audit_log_format(ab, 858 audit_log_format(ab,
655 " new qbytes=%lx new iuid=%u new igid=%u new mode=%x", 859 "qbytes=%lx ouid=%u ogid=%u mode=%x",
656 axi->qbytes, axi->uid, axi->gid, axi->mode); 860 axi->qbytes, axi->uid, axi->gid, axi->mode);
657 if (axi->osid != 0) { 861 break; }
658 char *ctx = NULL; 862
659 u32 len; 863 case AUDIT_EXECVE: {
660 if (selinux_ctxid_to_string( 864 struct audit_aux_data_execve *axi = (void *)aux;
661 axi->osid, &ctx, &len)) { 865 int i;
662 audit_log_format(ab, " osid=%u", 866 const char *p;
663 axi->osid); 867 for (i = 0, p = axi->mem; i < axi->argc; i++) {
664 call_panic = 1; 868 audit_log_format(ab, "a%d=", i);
665 } else 869 p = audit_log_untrustedstring(ab, p);
666 audit_log_format(ab, " obj=%s", ctx); 870 audit_log_format(ab, "\n");
667 kfree(ctx);
668 } 871 }
669 break; } 872 break; }
670 873
@@ -700,8 +903,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
700 } 903 }
701 } 904 }
702 for (i = 0; i < context->name_count; i++) { 905 for (i = 0; i < context->name_count; i++) {
703 unsigned long ino = context->names[i].ino; 906 struct audit_names *n = &context->names[i];
704 unsigned long pino = context->names[i].pino;
705 907
706 ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH); 908 ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH);
707 if (!ab) 909 if (!ab)
@@ -709,33 +911,47 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
709 911
710 audit_log_format(ab, "item=%d", i); 912 audit_log_format(ab, "item=%d", i);
711 913
712 audit_log_format(ab, " name="); 914 if (n->name) {
713 if (context->names[i].name) 915 switch(n->name_len) {
714 audit_log_untrustedstring(ab, context->names[i].name); 916 case AUDIT_NAME_FULL:
715 else 917 /* log the full path */
716 audit_log_format(ab, "(null)"); 918 audit_log_format(ab, " name=");
717 919 audit_log_untrustedstring(ab, n->name);
718 if (pino != (unsigned long)-1) 920 break;
719 audit_log_format(ab, " parent=%lu", pino); 921 case 0:
720 if (ino != (unsigned long)-1) 922 /* name was specified as a relative path and the
721 audit_log_format(ab, " inode=%lu", ino); 923 * directory component is the cwd */
722 if ((pino != (unsigned long)-1) || (ino != (unsigned long)-1)) 924 audit_log_d_path(ab, " name=", context->pwd,
723 audit_log_format(ab, " dev=%02x:%02x mode=%#o" 925 context->pwdmnt);
724 " ouid=%u ogid=%u rdev=%02x:%02x", 926 break;
725 MAJOR(context->names[i].dev), 927 default:
726 MINOR(context->names[i].dev), 928 /* log the name's directory component */
727 context->names[i].mode, 929 audit_log_format(ab, " name=");
728 context->names[i].uid, 930 audit_log_n_untrustedstring(ab, n->name_len,
729 context->names[i].gid, 931 n->name);
730 MAJOR(context->names[i].rdev), 932 }
731 MINOR(context->names[i].rdev)); 933 } else
732 if (context->names[i].osid != 0) { 934 audit_log_format(ab, " name=(null)");
935
936 if (n->ino != (unsigned long)-1) {
937 audit_log_format(ab, " inode=%lu"
938 " dev=%02x:%02x mode=%#o"
939 " ouid=%u ogid=%u rdev=%02x:%02x",
940 n->ino,
941 MAJOR(n->dev),
942 MINOR(n->dev),
943 n->mode,
944 n->uid,
945 n->gid,
946 MAJOR(n->rdev),
947 MINOR(n->rdev));
948 }
949 if (n->osid != 0) {
733 char *ctx = NULL; 950 char *ctx = NULL;
734 u32 len; 951 u32 len;
735 if (selinux_ctxid_to_string( 952 if (selinux_ctxid_to_string(
736 context->names[i].osid, &ctx, &len)) { 953 n->osid, &ctx, &len)) {
737 audit_log_format(ab, " osid=%u", 954 audit_log_format(ab, " osid=%u", n->osid);
738 context->names[i].osid);
739 call_panic = 2; 955 call_panic = 2;
740 } else 956 } else
741 audit_log_format(ab, " obj=%s", ctx); 957 audit_log_format(ab, " obj=%s", ctx);
@@ -897,6 +1113,8 @@ void audit_syscall_exit(int valid, long return_code)
897 } else { 1113 } else {
898 audit_free_names(context); 1114 audit_free_names(context);
899 audit_free_aux(context); 1115 audit_free_aux(context);
1116 kfree(context->filterkey);
1117 context->filterkey = NULL;
900 tsk->audit_context = context; 1118 tsk->audit_context = context;
901 } 1119 }
902} 1120}
@@ -908,11 +1126,11 @@ void audit_syscall_exit(int valid, long return_code)
908 * Add a name to the list of audit names for this context. 1126 * Add a name to the list of audit names for this context.
909 * Called from fs/namei.c:getname(). 1127 * Called from fs/namei.c:getname().
910 */ 1128 */
911void audit_getname(const char *name) 1129void __audit_getname(const char *name)
912{ 1130{
913 struct audit_context *context = current->audit_context; 1131 struct audit_context *context = current->audit_context;
914 1132
915 if (!context || IS_ERR(name) || !name) 1133 if (IS_ERR(name) || !name)
916 return; 1134 return;
917 1135
918 if (!context->in_syscall) { 1136 if (!context->in_syscall) {
@@ -925,6 +1143,8 @@ void audit_getname(const char *name)
925 } 1143 }
926 BUG_ON(context->name_count >= AUDIT_NAMES); 1144 BUG_ON(context->name_count >= AUDIT_NAMES);
927 context->names[context->name_count].name = name; 1145 context->names[context->name_count].name = name;
1146 context->names[context->name_count].name_len = AUDIT_NAME_FULL;
1147 context->names[context->name_count].name_put = 1;
928 context->names[context->name_count].ino = (unsigned long)-1; 1148 context->names[context->name_count].ino = (unsigned long)-1;
929 ++context->name_count; 1149 ++context->name_count;
930 if (!context->pwd) { 1150 if (!context->pwd) {
@@ -991,11 +1211,10 @@ static void audit_inode_context(int idx, const struct inode *inode)
991 * audit_inode - store the inode and device from a lookup 1211 * audit_inode - store the inode and device from a lookup
992 * @name: name being audited 1212 * @name: name being audited
993 * @inode: inode being audited 1213 * @inode: inode being audited
994 * @flags: lookup flags (as used in path_lookup())
995 * 1214 *
996 * Called from fs/namei.c:path_lookup(). 1215 * Called from fs/namei.c:path_lookup().
997 */ 1216 */
998void __audit_inode(const char *name, const struct inode *inode, unsigned flags) 1217void __audit_inode(const char *name, const struct inode *inode)
999{ 1218{
1000 int idx; 1219 int idx;
1001 struct audit_context *context = current->audit_context; 1220 struct audit_context *context = current->audit_context;
@@ -1021,20 +1240,13 @@ void __audit_inode(const char *name, const struct inode *inode, unsigned flags)
1021 ++context->ino_count; 1240 ++context->ino_count;
1022#endif 1241#endif
1023 } 1242 }
1243 context->names[idx].ino = inode->i_ino;
1024 context->names[idx].dev = inode->i_sb->s_dev; 1244 context->names[idx].dev = inode->i_sb->s_dev;
1025 context->names[idx].mode = inode->i_mode; 1245 context->names[idx].mode = inode->i_mode;
1026 context->names[idx].uid = inode->i_uid; 1246 context->names[idx].uid = inode->i_uid;
1027 context->names[idx].gid = inode->i_gid; 1247 context->names[idx].gid = inode->i_gid;
1028 context->names[idx].rdev = inode->i_rdev; 1248 context->names[idx].rdev = inode->i_rdev;
1029 audit_inode_context(idx, inode); 1249 audit_inode_context(idx, inode);
1030 if ((flags & LOOKUP_PARENT) && (strcmp(name, "/") != 0) &&
1031 (strcmp(name, ".") != 0)) {
1032 context->names[idx].ino = (unsigned long)-1;
1033 context->names[idx].pino = inode->i_ino;
1034 } else {
1035 context->names[idx].ino = inode->i_ino;
1036 context->names[idx].pino = (unsigned long)-1;
1037 }
1038} 1250}
1039 1251
1040/** 1252/**
@@ -1056,51 +1268,40 @@ void __audit_inode_child(const char *dname, const struct inode *inode,
1056{ 1268{
1057 int idx; 1269 int idx;
1058 struct audit_context *context = current->audit_context; 1270 struct audit_context *context = current->audit_context;
1271 const char *found_name = NULL;
1272 int dirlen = 0;
1059 1273
1060 if (!context->in_syscall) 1274 if (!context->in_syscall)
1061 return; 1275 return;
1062 1276
1063 /* determine matching parent */ 1277 /* determine matching parent */
1064 if (dname) 1278 if (!dname)
1065 for (idx = 0; idx < context->name_count; idx++) 1279 goto update_context;
1066 if (context->names[idx].pino == pino) { 1280 for (idx = 0; idx < context->name_count; idx++)
1067 const char *n; 1281 if (context->names[idx].ino == pino) {
1068 const char *name = context->names[idx].name; 1282 const char *name = context->names[idx].name;
1069 int dlen = strlen(dname); 1283
1070 int nlen = name ? strlen(name) : 0; 1284 if (!name)
1071 1285 continue;
1072 if (nlen < dlen) 1286
1073 continue; 1287 if (audit_compare_dname_path(dname, name, &dirlen) == 0) {
1074 1288 context->names[idx].name_len = dirlen;
1075 /* disregard trailing slashes */ 1289 found_name = name;
1076 n = name + nlen - 1; 1290 break;
1077 while ((*n == '/') && (n > name))
1078 n--;
1079
1080 /* find last path component */
1081 n = n - dlen + 1;
1082 if (n < name)
1083 continue;
1084 else if (n > name) {
1085 if (*--n != '/')
1086 continue;
1087 else
1088 n++;
1089 }
1090
1091 if (strncmp(n, dname, dlen) == 0)
1092 goto update_context;
1093 } 1291 }
1292 }
1094 1293
1095 /* catch-all in case match not found */ 1294update_context:
1096 idx = context->name_count++; 1295 idx = context->name_count++;
1097 context->names[idx].name = NULL;
1098 context->names[idx].pino = pino;
1099#if AUDIT_DEBUG 1296#if AUDIT_DEBUG
1100 context->ino_count++; 1297 context->ino_count++;
1101#endif 1298#endif
1299 /* Re-use the name belonging to the slot for a matching parent directory.
1300 * All names for this context are relinquished in audit_free_names() */
1301 context->names[idx].name = found_name;
1302 context->names[idx].name_len = AUDIT_NAME_FULL;
1303 context->names[idx].name_put = 0; /* don't call __putname() */
1102 1304
1103update_context:
1104 if (inode) { 1305 if (inode) {
1105 context->names[idx].ino = inode->i_ino; 1306 context->names[idx].ino = inode->i_ino;
1106 context->names[idx].dev = inode->i_sb->s_dev; 1307 context->names[idx].dev = inode->i_sb->s_dev;
@@ -1109,7 +1310,8 @@ update_context:
1109 context->names[idx].gid = inode->i_gid; 1310 context->names[idx].gid = inode->i_gid;
1110 context->names[idx].rdev = inode->i_rdev; 1311 context->names[idx].rdev = inode->i_rdev;
1111 audit_inode_context(idx, inode); 1312 audit_inode_context(idx, inode);
1112 } 1313 } else
1314 context->names[idx].ino = (unsigned long)-1;
1113} 1315}
1114 1316
1115/** 1317/**
@@ -1142,18 +1344,23 @@ void auditsc_get_stamp(struct audit_context *ctx,
1142 */ 1344 */
1143int audit_set_loginuid(struct task_struct *task, uid_t loginuid) 1345int audit_set_loginuid(struct task_struct *task, uid_t loginuid)
1144{ 1346{
1145 if (task->audit_context) { 1347 struct audit_context *context = task->audit_context;
1146 struct audit_buffer *ab; 1348
1147 1349 if (context) {
1148 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN); 1350 /* Only log if audit is enabled */
1149 if (ab) { 1351 if (context->in_syscall) {
1150 audit_log_format(ab, "login pid=%d uid=%u " 1352 struct audit_buffer *ab;
1151 "old auid=%u new auid=%u", 1353
1152 task->pid, task->uid, 1354 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN);
1153 task->audit_context->loginuid, loginuid); 1355 if (ab) {
1154 audit_log_end(ab); 1356 audit_log_format(ab, "login pid=%d uid=%u "
1357 "old auid=%u new auid=%u",
1358 task->pid, task->uid,
1359 context->loginuid, loginuid);
1360 audit_log_end(ab);
1361 }
1155 } 1362 }
1156 task->audit_context->loginuid = loginuid; 1363 context->loginuid = loginuid;
1157 } 1364 }
1158 return 0; 1365 return 0;
1159} 1366}
@@ -1170,16 +1377,193 @@ uid_t audit_get_loginuid(struct audit_context *ctx)
1170} 1377}
1171 1378
1172/** 1379/**
1173 * audit_ipc_obj - record audit data for ipc object 1380 * __audit_mq_open - record audit data for a POSIX MQ open
1174 * @ipcp: ipc permissions 1381 * @oflag: open flag
1382 * @mode: mode bits
1383 * @u_attr: queue attributes
1175 * 1384 *
1176 * Returns 0 for success or NULL context or < 0 on error. 1385 * Returns 0 for success or NULL context or < 0 on error.
1177 */ 1386 */
1178int audit_ipc_obj(struct kern_ipc_perm *ipcp) 1387int __audit_mq_open(int oflag, mode_t mode, struct mq_attr __user *u_attr)
1179{ 1388{
1180 struct audit_aux_data_ipcctl *ax; 1389 struct audit_aux_data_mq_open *ax;
1390 struct audit_context *context = current->audit_context;
1391
1392 if (!audit_enabled)
1393 return 0;
1394
1395 if (likely(!context))
1396 return 0;
1397
1398 ax = kmalloc(sizeof(*ax), GFP_ATOMIC);
1399 if (!ax)
1400 return -ENOMEM;
1401
1402 if (u_attr != NULL) {
1403 if (copy_from_user(&ax->attr, u_attr, sizeof(ax->attr))) {
1404 kfree(ax);
1405 return -EFAULT;
1406 }
1407 } else
1408 memset(&ax->attr, 0, sizeof(ax->attr));
1409
1410 ax->oflag = oflag;
1411 ax->mode = mode;
1412
1413 ax->d.type = AUDIT_MQ_OPEN;
1414 ax->d.next = context->aux;
1415 context->aux = (void *)ax;
1416 return 0;
1417}
1418
1419/**
1420 * __audit_mq_timedsend - record audit data for a POSIX MQ timed send
1421 * @mqdes: MQ descriptor
1422 * @msg_len: Message length
1423 * @msg_prio: Message priority
1424 * @u_abs_timeout: Message timeout in absolute time
1425 *
1426 * Returns 0 for success or NULL context or < 0 on error.
1427 */
1428int __audit_mq_timedsend(mqd_t mqdes, size_t msg_len, unsigned int msg_prio,
1429 const struct timespec __user *u_abs_timeout)
1430{
1431 struct audit_aux_data_mq_sendrecv *ax;
1432 struct audit_context *context = current->audit_context;
1433
1434 if (!audit_enabled)
1435 return 0;
1436
1437 if (likely(!context))
1438 return 0;
1439
1440 ax = kmalloc(sizeof(*ax), GFP_ATOMIC);
1441 if (!ax)
1442 return -ENOMEM;
1443
1444 if (u_abs_timeout != NULL) {
1445 if (copy_from_user(&ax->abs_timeout, u_abs_timeout, sizeof(ax->abs_timeout))) {
1446 kfree(ax);
1447 return -EFAULT;
1448 }
1449 } else
1450 memset(&ax->abs_timeout, 0, sizeof(ax->abs_timeout));
1451
1452 ax->mqdes = mqdes;
1453 ax->msg_len = msg_len;
1454 ax->msg_prio = msg_prio;
1455
1456 ax->d.type = AUDIT_MQ_SENDRECV;
1457 ax->d.next = context->aux;
1458 context->aux = (void *)ax;
1459 return 0;
1460}
1461
1462/**
1463 * __audit_mq_timedreceive - record audit data for a POSIX MQ timed receive
1464 * @mqdes: MQ descriptor
1465 * @msg_len: Message length
1466 * @u_msg_prio: Message priority
1467 * @u_abs_timeout: Message timeout in absolute time
1468 *
1469 * Returns 0 for success or NULL context or < 0 on error.
1470 */
1471int __audit_mq_timedreceive(mqd_t mqdes, size_t msg_len,
1472 unsigned int __user *u_msg_prio,
1473 const struct timespec __user *u_abs_timeout)
1474{
1475 struct audit_aux_data_mq_sendrecv *ax;
1476 struct audit_context *context = current->audit_context;
1477
1478 if (!audit_enabled)
1479 return 0;
1480
1481 if (likely(!context))
1482 return 0;
1483
1484 ax = kmalloc(sizeof(*ax), GFP_ATOMIC);
1485 if (!ax)
1486 return -ENOMEM;
1487
1488 if (u_msg_prio != NULL) {
1489 if (get_user(ax->msg_prio, u_msg_prio)) {
1490 kfree(ax);
1491 return -EFAULT;
1492 }
1493 } else
1494 ax->msg_prio = 0;
1495
1496 if (u_abs_timeout != NULL) {
1497 if (copy_from_user(&ax->abs_timeout, u_abs_timeout, sizeof(ax->abs_timeout))) {
1498 kfree(ax);
1499 return -EFAULT;
1500 }
1501 } else
1502 memset(&ax->abs_timeout, 0, sizeof(ax->abs_timeout));
1503
1504 ax->mqdes = mqdes;
1505 ax->msg_len = msg_len;
1506
1507 ax->d.type = AUDIT_MQ_SENDRECV;
1508 ax->d.next = context->aux;
1509 context->aux = (void *)ax;
1510 return 0;
1511}
1512
1513/**
1514 * __audit_mq_notify - record audit data for a POSIX MQ notify
1515 * @mqdes: MQ descriptor
1516 * @u_notification: Notification event
1517 *
1518 * Returns 0 for success or NULL context or < 0 on error.
1519 */
1520
1521int __audit_mq_notify(mqd_t mqdes, const struct sigevent __user *u_notification)
1522{
1523 struct audit_aux_data_mq_notify *ax;
1524 struct audit_context *context = current->audit_context;
1525
1526 if (!audit_enabled)
1527 return 0;
1528
1529 if (likely(!context))
1530 return 0;
1531
1532 ax = kmalloc(sizeof(*ax), GFP_ATOMIC);
1533 if (!ax)
1534 return -ENOMEM;
1535
1536 if (u_notification != NULL) {
1537 if (copy_from_user(&ax->notification, u_notification, sizeof(ax->notification))) {
1538 kfree(ax);
1539 return -EFAULT;
1540 }
1541 } else
1542 memset(&ax->notification, 0, sizeof(ax->notification));
1543
1544 ax->mqdes = mqdes;
1545
1546 ax->d.type = AUDIT_MQ_NOTIFY;
1547 ax->d.next = context->aux;
1548 context->aux = (void *)ax;
1549 return 0;
1550}
1551
1552/**
1553 * __audit_mq_getsetattr - record audit data for a POSIX MQ get/set attribute
1554 * @mqdes: MQ descriptor
1555 * @mqstat: MQ flags
1556 *
1557 * Returns 0 for success or NULL context or < 0 on error.
1558 */
1559int __audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat)
1560{
1561 struct audit_aux_data_mq_getsetattr *ax;
1181 struct audit_context *context = current->audit_context; 1562 struct audit_context *context = current->audit_context;
1182 1563
1564 if (!audit_enabled)
1565 return 0;
1566
1183 if (likely(!context)) 1567 if (likely(!context))
1184 return 0; 1568 return 0;
1185 1569
@@ -1187,6 +1571,30 @@ int audit_ipc_obj(struct kern_ipc_perm *ipcp)
1187 if (!ax) 1571 if (!ax)
1188 return -ENOMEM; 1572 return -ENOMEM;
1189 1573
1574 ax->mqdes = mqdes;
1575 ax->mqstat = *mqstat;
1576
1577 ax->d.type = AUDIT_MQ_GETSETATTR;
1578 ax->d.next = context->aux;
1579 context->aux = (void *)ax;
1580 return 0;
1581}
1582
1583/**
1584 * audit_ipc_obj - record audit data for ipc object
1585 * @ipcp: ipc permissions
1586 *
1587 * Returns 0 for success or NULL context or < 0 on error.
1588 */
1589int __audit_ipc_obj(struct kern_ipc_perm *ipcp)
1590{
1591 struct audit_aux_data_ipcctl *ax;
1592 struct audit_context *context = current->audit_context;
1593
1594 ax = kmalloc(sizeof(*ax), GFP_ATOMIC);
1595 if (!ax)
1596 return -ENOMEM;
1597
1190 ax->uid = ipcp->uid; 1598 ax->uid = ipcp->uid;
1191 ax->gid = ipcp->gid; 1599 ax->gid = ipcp->gid;
1192 ax->mode = ipcp->mode; 1600 ax->mode = ipcp->mode;
@@ -1207,14 +1615,11 @@ int audit_ipc_obj(struct kern_ipc_perm *ipcp)
1207 * 1615 *
1208 * Returns 0 for success or NULL context or < 0 on error. 1616 * Returns 0 for success or NULL context or < 0 on error.
1209 */ 1617 */
1210int audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode, struct kern_ipc_perm *ipcp) 1618int __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode)
1211{ 1619{
1212 struct audit_aux_data_ipcctl *ax; 1620 struct audit_aux_data_ipcctl *ax;
1213 struct audit_context *context = current->audit_context; 1621 struct audit_context *context = current->audit_context;
1214 1622
1215 if (likely(!context))
1216 return 0;
1217
1218 ax = kmalloc(sizeof(*ax), GFP_ATOMIC); 1623 ax = kmalloc(sizeof(*ax), GFP_ATOMIC);
1219 if (!ax) 1624 if (!ax)
1220 return -ENOMEM; 1625 return -ENOMEM;
@@ -1223,7 +1628,6 @@ int audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode,
1223 ax->uid = uid; 1628 ax->uid = uid;
1224 ax->gid = gid; 1629 ax->gid = gid;
1225 ax->mode = mode; 1630 ax->mode = mode;
1226 selinux_get_ipc_sid(ipcp, &ax->osid);
1227 1631
1228 ax->d.type = AUDIT_IPC_SET_PERM; 1632 ax->d.type = AUDIT_IPC_SET_PERM;
1229 ax->d.next = context->aux; 1633 ax->d.next = context->aux;
@@ -1231,6 +1635,39 @@ int audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode,
1231 return 0; 1635 return 0;
1232} 1636}
1233 1637
1638int audit_bprm(struct linux_binprm *bprm)
1639{
1640 struct audit_aux_data_execve *ax;
1641 struct audit_context *context = current->audit_context;
1642 unsigned long p, next;
1643 void *to;
1644
1645 if (likely(!audit_enabled || !context))
1646 return 0;
1647
1648 ax = kmalloc(sizeof(*ax) + PAGE_SIZE * MAX_ARG_PAGES - bprm->p,
1649 GFP_KERNEL);
1650 if (!ax)
1651 return -ENOMEM;
1652
1653 ax->argc = bprm->argc;
1654 ax->envc = bprm->envc;
1655 for (p = bprm->p, to = ax->mem; p < MAX_ARG_PAGES*PAGE_SIZE; p = next) {
1656 struct page *page = bprm->page[p / PAGE_SIZE];
1657 void *kaddr = kmap(page);
1658 next = (p + PAGE_SIZE) & ~(PAGE_SIZE - 1);
1659 memcpy(to, kaddr + (p & (PAGE_SIZE - 1)), next - p);
1660 to += next - p;
1661 kunmap(page);
1662 }
1663
1664 ax->d.type = AUDIT_EXECVE;
1665 ax->d.next = context->aux;
1666 context->aux = (void *)ax;
1667 return 0;
1668}
1669
1670
1234/** 1671/**
1235 * audit_socketcall - record audit data for sys_socketcall 1672 * audit_socketcall - record audit data for sys_socketcall
1236 * @nargs: number of args 1673 * @nargs: number of args
@@ -1325,19 +1762,20 @@ int audit_avc_path(struct dentry *dentry, struct vfsmount *mnt)
1325 * If the audit subsystem is being terminated, record the task (pid) 1762 * If the audit subsystem is being terminated, record the task (pid)
1326 * and uid that is doing that. 1763 * and uid that is doing that.
1327 */ 1764 */
1328void audit_signal_info(int sig, struct task_struct *t) 1765void __audit_signal_info(int sig, struct task_struct *t)
1329{ 1766{
1330 extern pid_t audit_sig_pid; 1767 extern pid_t audit_sig_pid;
1331 extern uid_t audit_sig_uid; 1768 extern uid_t audit_sig_uid;
1332 1769 extern u32 audit_sig_sid;
1333 if (unlikely(audit_pid && t->tgid == audit_pid)) { 1770
1334 if (sig == SIGTERM || sig == SIGHUP) { 1771 if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1) {
1335 struct audit_context *ctx = current->audit_context; 1772 struct task_struct *tsk = current;
1336 audit_sig_pid = current->pid; 1773 struct audit_context *ctx = tsk->audit_context;
1337 if (ctx) 1774 audit_sig_pid = tsk->pid;
1338 audit_sig_uid = ctx->loginuid; 1775 if (ctx)
1339 else 1776 audit_sig_uid = ctx->loginuid;
1340 audit_sig_uid = current->uid; 1777 else
1341 } 1778 audit_sig_uid = tsk->uid;
1779 selinux_get_task_sid(tsk, &audit_sig_sid);
1342 } 1780 }
1343} 1781}
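Taken together, the auditsc.c changes add a family of __audit_mq_* hooks that attach POSIX message-queue activity (AUDIT_MQ_OPEN, AUDIT_MQ_SENDRECV, AUDIT_MQ_NOTIFY, AUDIT_MQ_GETSETATTR) to the current syscall's audit context as auxiliary records. A small userspace program that would exercise the open and timed-send hooks is sketched below; it assumes audit is enabled with suitable rules loaded, the queue name is arbitrary, and it links with -lrt.

#include <fcntl.h>
#include <mqueue.h>
#include <stdio.h>
#include <time.h>

int main(void)
{
	struct mq_attr attr = { .mq_flags = 0, .mq_maxmsg = 4, .mq_msgsize = 64 };
	struct timespec to;
	mqd_t q;

	q = mq_open("/audit-demo", O_CREAT | O_RDWR, 0600, &attr);	/* reaches __audit_mq_open() */
	if (q == (mqd_t)-1) {
		perror("mq_open");
		return 1;
	}

	clock_gettime(CLOCK_REALTIME, &to);
	to.tv_sec += 1;
	if (mq_timedsend(q, "hello", 5, 0, &to) == -1)			/* reaches __audit_mq_timedsend() */
		perror("mq_timedsend");

	mq_close(q);
	mq_unlink("/audit-demo");
	return 0;
}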
diff --git a/kernel/capability.c b/kernel/capability.c
index 1a4d8a40d3f9..c7685ad00a97 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -46,7 +46,7 @@ asmlinkage long sys_capget(cap_user_header_t header, cap_user_data_t dataptr)
46 int ret = 0; 46 int ret = 0;
47 pid_t pid; 47 pid_t pid;
48 __u32 version; 48 __u32 version;
49 task_t *target; 49 struct task_struct *target;
50 struct __user_cap_data_struct data; 50 struct __user_cap_data_struct data;
51 51
52 if (get_user(version, &header->version)) 52 if (get_user(version, &header->version))
@@ -96,7 +96,7 @@ static inline int cap_set_pg(int pgrp, kernel_cap_t *effective,
96 kernel_cap_t *inheritable, 96 kernel_cap_t *inheritable,
97 kernel_cap_t *permitted) 97 kernel_cap_t *permitted)
98{ 98{
99 task_t *g, *target; 99 struct task_struct *g, *target;
100 int ret = -EPERM; 100 int ret = -EPERM;
101 int found = 0; 101 int found = 0;
102 102
@@ -128,7 +128,7 @@ static inline int cap_set_all(kernel_cap_t *effective,
128 kernel_cap_t *inheritable, 128 kernel_cap_t *inheritable,
129 kernel_cap_t *permitted) 129 kernel_cap_t *permitted)
130{ 130{
131 task_t *g, *target; 131 struct task_struct *g, *target;
132 int ret = -EPERM; 132 int ret = -EPERM;
133 int found = 0; 133 int found = 0;
134 134
@@ -172,7 +172,7 @@ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data)
172{ 172{
173 kernel_cap_t inheritable, permitted, effective; 173 kernel_cap_t inheritable, permitted, effective;
174 __u32 version; 174 __u32 version;
175 task_t *target; 175 struct task_struct *target;
176 int ret; 176 int ret;
177 pid_t pid; 177 pid_t pid;
178 178
diff --git a/kernel/compat.c b/kernel/compat.c
index c1601a84f8d8..126dee9530aa 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -21,6 +21,7 @@
21#include <linux/unistd.h> 21#include <linux/unistd.h>
22#include <linux/security.h> 22#include <linux/security.h>
23#include <linux/timex.h> 23#include <linux/timex.h>
24#include <linux/migrate.h>
24 25
25#include <asm/uaccess.h> 26#include <asm/uaccess.h>
26 27
@@ -729,17 +730,10 @@ void
729sigset_from_compat (sigset_t *set, compat_sigset_t *compat) 730sigset_from_compat (sigset_t *set, compat_sigset_t *compat)
730{ 731{
731 switch (_NSIG_WORDS) { 732 switch (_NSIG_WORDS) {
732#if defined (__COMPAT_ENDIAN_SWAP__)
733 case 4: set->sig[3] = compat->sig[7] | (((long)compat->sig[6]) << 32 );
734 case 3: set->sig[2] = compat->sig[5] | (((long)compat->sig[4]) << 32 );
735 case 2: set->sig[1] = compat->sig[3] | (((long)compat->sig[2]) << 32 );
736 case 1: set->sig[0] = compat->sig[1] | (((long)compat->sig[0]) << 32 );
737#else
738 case 4: set->sig[3] = compat->sig[6] | (((long)compat->sig[7]) << 32 ); 733 case 4: set->sig[3] = compat->sig[6] | (((long)compat->sig[7]) << 32 );
739 case 3: set->sig[2] = compat->sig[4] | (((long)compat->sig[5]) << 32 ); 734 case 3: set->sig[2] = compat->sig[4] | (((long)compat->sig[5]) << 32 );
740 case 2: set->sig[1] = compat->sig[2] | (((long)compat->sig[3]) << 32 ); 735 case 2: set->sig[1] = compat->sig[2] | (((long)compat->sig[3]) << 32 );
741 case 1: set->sig[0] = compat->sig[0] | (((long)compat->sig[1]) << 32 ); 736 case 1: set->sig[0] = compat->sig[0] | (((long)compat->sig[1]) << 32 );
742#endif
743 } 737 }
744} 738}
745 739
@@ -934,3 +928,25 @@ asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp)
934 928
935 return ret; 929 return ret;
936} 930}
931
932#ifdef CONFIG_NUMA
933asmlinkage long compat_sys_move_pages(pid_t pid, unsigned long nr_pages,
934 compat_uptr_t __user *pages32,
935 const int __user *nodes,
936 int __user *status,
937 int flags)
938{
939 const void __user * __user *pages;
940 int i;
941
942 pages = compat_alloc_user_space(nr_pages * sizeof(void *));
943 for (i = 0; i < nr_pages; i++) {
944 compat_uptr_t p;
945
946 if (get_user(p, pages32 + i) ||
947 put_user(compat_ptr(p), pages + i))
948 return -EFAULT;
949 }
950 return sys_move_pages(pid, nr_pages, pages, nodes, status, flags);
951}
952#endif
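The new compat_sys_move_pages() wrapper has to widen the caller's array of 32-bit user pointers into native void * values before handing the request to sys_move_pages(); in the kernel this goes through compat_alloc_user_space() with get_user()/put_user() per entry. The shape of that conversion, reduced to a self-contained userspace sketch (the compat_uptr_t typedef and widen_page_list() here are illustrative, not kernel API):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

typedef uint32_t compat_uptr_t;	/* a user pointer as a 32-bit task passes it in */

/* Widen nr 32-bit pointer values into a native pointer array. */
static void **widen_page_list(const compat_uptr_t *pages32, unsigned long nr)
{
	void **pages = malloc(nr * sizeof(*pages));
	unsigned long i;

	if (!pages)
		return NULL;
	for (i = 0; i < nr; i++)
		pages[i] = (void *)(uintptr_t)pages32[i];
	return pages;
}

int main(void)
{
	compat_uptr_t pages32[2] = { 0x1000, 0x2000 };
	void **pages = widen_page_list(pages32, 2);

	if (!pages)
		return 1;
	printf("%p %p\n", pages[0], pages[1]);
	free(pages);
	return 0;
}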
diff --git a/kernel/configs.c b/kernel/configs.c
index 009e1ebdcb88..f9e31974f4ad 100644
--- a/kernel/configs.c
+++ b/kernel/configs.c
@@ -23,7 +23,6 @@
23 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 23 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24 */ 24 */
25 25
26#include <linux/config.h>
27#include <linux/kernel.h> 26#include <linux/kernel.h>
28#include <linux/module.h> 27#include <linux/module.h>
29#include <linux/proc_fs.h> 28#include <linux/proc_fs.h>
diff --git a/kernel/cpu.c b/kernel/cpu.c
index fe2b8d0bfe4c..70fbf2e83766 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -13,12 +13,12 @@
13#include <linux/module.h> 13#include <linux/module.h>
14#include <linux/kthread.h> 14#include <linux/kthread.h>
15#include <linux/stop_machine.h> 15#include <linux/stop_machine.h>
16#include <asm/semaphore.h> 16#include <linux/mutex.h>
17 17
18/* This protects CPUs going up and down... */ 18/* This protects CPUs going up and down... */
19static DECLARE_MUTEX(cpucontrol); 19static DEFINE_MUTEX(cpucontrol);
20 20
21static BLOCKING_NOTIFIER_HEAD(cpu_chain); 21static __cpuinitdata BLOCKING_NOTIFIER_HEAD(cpu_chain);
22 22
23#ifdef CONFIG_HOTPLUG_CPU 23#ifdef CONFIG_HOTPLUG_CPU
24static struct task_struct *lock_cpu_hotplug_owner; 24static struct task_struct *lock_cpu_hotplug_owner;
@@ -30,9 +30,9 @@ static int __lock_cpu_hotplug(int interruptible)
30 30
31 if (lock_cpu_hotplug_owner != current) { 31 if (lock_cpu_hotplug_owner != current) {
32 if (interruptible) 32 if (interruptible)
33 ret = down_interruptible(&cpucontrol); 33 ret = mutex_lock_interruptible(&cpucontrol);
34 else 34 else
35 down(&cpucontrol); 35 mutex_lock(&cpucontrol);
36 } 36 }
37 37
38 /* 38 /*
@@ -56,7 +56,7 @@ void unlock_cpu_hotplug(void)
56{ 56{
57 if (--lock_cpu_hotplug_depth == 0) { 57 if (--lock_cpu_hotplug_depth == 0) {
58 lock_cpu_hotplug_owner = NULL; 58 lock_cpu_hotplug_owner = NULL;
59 up(&cpucontrol); 59 mutex_unlock(&cpucontrol);
60 } 60 }
61} 61}
62EXPORT_SYMBOL_GPL(unlock_cpu_hotplug); 62EXPORT_SYMBOL_GPL(unlock_cpu_hotplug);
@@ -69,10 +69,13 @@ EXPORT_SYMBOL_GPL(lock_cpu_hotplug_interruptible);
69#endif /* CONFIG_HOTPLUG_CPU */ 69#endif /* CONFIG_HOTPLUG_CPU */
70 70
71/* Need to know about CPUs going up/down? */ 71/* Need to know about CPUs going up/down? */
72int register_cpu_notifier(struct notifier_block *nb) 72int __cpuinit register_cpu_notifier(struct notifier_block *nb)
73{ 73{
74 return blocking_notifier_chain_register(&cpu_chain, nb); 74 return blocking_notifier_chain_register(&cpu_chain, nb);
75} 75}
76
77#ifdef CONFIG_HOTPLUG_CPU
78
76EXPORT_SYMBOL(register_cpu_notifier); 79EXPORT_SYMBOL(register_cpu_notifier);
77 80
78void unregister_cpu_notifier(struct notifier_block *nb) 81void unregister_cpu_notifier(struct notifier_block *nb)
@@ -81,7 +84,6 @@ void unregister_cpu_notifier(struct notifier_block *nb)
81} 84}
82EXPORT_SYMBOL(unregister_cpu_notifier); 85EXPORT_SYMBOL(unregister_cpu_notifier);
83 86
84#ifdef CONFIG_HOTPLUG_CPU
85static inline void check_for_tasks(int cpu) 87static inline void check_for_tasks(int cpu)
86{ 88{
87 struct task_struct *p; 89 struct task_struct *p;
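In cpu.c the cpucontrol semaphore becomes a real mutex (DEFINE_MUTEX plus mutex_lock_interruptible()/mutex_unlock()), which gives the lock a well-defined owner and lets the mutex debugging code check it. A rough userspace analogue of the recursive lock_cpu_hotplug()/unlock_cpu_hotplug() pattern built on a pthread mutex is sketched below (link with -lpthread); the owner/depth handling is simplified and only meant as a single-threaded demonstration.

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t cpucontrol = PTHREAD_MUTEX_INITIALIZER;
static pthread_t hotplug_owner;
static int hotplug_depth;

static void lock_hotplug(void)
{
	if (hotplug_depth == 0 || !pthread_equal(hotplug_owner, pthread_self())) {
		pthread_mutex_lock(&cpucontrol);
		hotplug_owner = pthread_self();
	}
	hotplug_depth++;
}

static void unlock_hotplug(void)
{
	if (--hotplug_depth == 0)
		pthread_mutex_unlock(&cpucontrol);
}

int main(void)
{
	lock_hotplug();
	lock_hotplug();		/* nested acquisition by the owning thread */
	puts("CPU hotplug excluded here");
	unlock_hotplug();
	unlock_hotplug();
	return 0;
}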
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index ab81fdd4572b..c232dc077438 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -18,7 +18,6 @@
18 * distribution for more details. 18 * distribution for more details.
19 */ 19 */
20 20
21#include <linux/config.h>
22#include <linux/cpu.h> 21#include <linux/cpu.h>
23#include <linux/cpumask.h> 22#include <linux/cpumask.h>
24#include <linux/cpuset.h> 23#include <linux/cpuset.h>
@@ -41,6 +40,7 @@
41#include <linux/rcupdate.h> 40#include <linux/rcupdate.h>
42#include <linux/sched.h> 41#include <linux/sched.h>
43#include <linux/seq_file.h> 42#include <linux/seq_file.h>
43#include <linux/security.h>
44#include <linux/slab.h> 44#include <linux/slab.h>
45#include <linux/smp_lock.h> 45#include <linux/smp_lock.h>
46#include <linux/spinlock.h> 46#include <linux/spinlock.h>
@@ -392,11 +392,11 @@ static int cpuset_fill_super(struct super_block *sb, void *unused_data,
392 return 0; 392 return 0;
393} 393}
394 394
395static struct super_block *cpuset_get_sb(struct file_system_type *fs_type, 395static int cpuset_get_sb(struct file_system_type *fs_type,
396 int flags, const char *unused_dev_name, 396 int flags, const char *unused_dev_name,
397 void *data) 397 void *data, struct vfsmount *mnt)
398{ 398{
399 return get_sb_single(fs_type, flags, data, cpuset_fill_super); 399 return get_sb_single(fs_type, flags, data, cpuset_fill_super, mnt);
400} 400}
401 401
402static struct file_system_type cpuset_fs_type = { 402static struct file_system_type cpuset_fs_type = {
@@ -1063,7 +1063,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf)
1063} 1063}
1064 1064
1065/* 1065/*
1066 * Frequency meter - How fast is some event occuring? 1066 * Frequency meter - How fast is some event occurring?
1067 * 1067 *
1068 * These routines manage a digitally filtered, constant time based, 1068 * These routines manage a digitally filtered, constant time based,
1069 * event frequency meter. There are four routines: 1069 * event frequency meter. There are four routines:
@@ -1177,6 +1177,7 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
1177 cpumask_t cpus; 1177 cpumask_t cpus;
1178 nodemask_t from, to; 1178 nodemask_t from, to;
1179 struct mm_struct *mm; 1179 struct mm_struct *mm;
1180 int retval;
1180 1181
1181 if (sscanf(pidbuf, "%d", &pid) != 1) 1182 if (sscanf(pidbuf, "%d", &pid) != 1)
1182 return -EIO; 1183 return -EIO;
@@ -1205,6 +1206,12 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
1205 get_task_struct(tsk); 1206 get_task_struct(tsk);
1206 } 1207 }
1207 1208
1209 retval = security_task_setscheduler(tsk, 0, NULL);
1210 if (retval) {
1211 put_task_struct(tsk);
1212 return retval;
1213 }
1214
1208 mutex_lock(&callback_mutex); 1215 mutex_lock(&callback_mutex);
1209 1216
1210 task_lock(tsk); 1217 task_lock(tsk);
@@ -2434,31 +2441,43 @@ void __cpuset_memory_pressure_bump(void)
2434 */ 2441 */
2435static int proc_cpuset_show(struct seq_file *m, void *v) 2442static int proc_cpuset_show(struct seq_file *m, void *v)
2436{ 2443{
2444 struct pid *pid;
2437 struct task_struct *tsk; 2445 struct task_struct *tsk;
2438 char *buf; 2446 char *buf;
2439 int retval = 0; 2447 int retval;
2440 2448
2449 retval = -ENOMEM;
2441 buf = kmalloc(PAGE_SIZE, GFP_KERNEL); 2450 buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
2442 if (!buf) 2451 if (!buf)
2443 return -ENOMEM; 2452 goto out;
2444 2453
2445 tsk = m->private; 2454 retval = -ESRCH;
2455 pid = m->private;
2456 tsk = get_pid_task(pid, PIDTYPE_PID);
2457 if (!tsk)
2458 goto out_free;
2459
2460 retval = -EINVAL;
2446 mutex_lock(&manage_mutex); 2461 mutex_lock(&manage_mutex);
2462
2447 retval = cpuset_path(tsk->cpuset, buf, PAGE_SIZE); 2463 retval = cpuset_path(tsk->cpuset, buf, PAGE_SIZE);
2448 if (retval < 0) 2464 if (retval < 0)
2449 goto out; 2465 goto out_unlock;
2450 seq_puts(m, buf); 2466 seq_puts(m, buf);
2451 seq_putc(m, '\n'); 2467 seq_putc(m, '\n');
2452out: 2468out_unlock:
2453 mutex_unlock(&manage_mutex); 2469 mutex_unlock(&manage_mutex);
2470 put_task_struct(tsk);
2471out_free:
2454 kfree(buf); 2472 kfree(buf);
2473out:
2455 return retval; 2474 return retval;
2456} 2475}
2457 2476
2458static int cpuset_open(struct inode *inode, struct file *file) 2477static int cpuset_open(struct inode *inode, struct file *file)
2459{ 2478{
2460 struct task_struct *tsk = PROC_I(inode)->task; 2479 struct pid *pid = PROC_I(inode)->pid;
2461 return single_open(file, proc_cpuset_show, tsk); 2480 return single_open(file, proc_cpuset_show, pid);
2462} 2481}
2463 2482
2464struct file_operations proc_cpuset_operations = { 2483struct file_operations proc_cpuset_operations = {
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
new file mode 100644
index 000000000000..f05392d64267
--- /dev/null
+++ b/kernel/delayacct.c
@@ -0,0 +1,178 @@
1/* delayacct.c - per-task delay accounting
2 *
3 * Copyright (C) Shailabh Nagar, IBM Corp. 2006
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it would be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
13 * the GNU General Public License for more details.
14 */
15
16#include <linux/sched.h>
17#include <linux/slab.h>
18#include <linux/time.h>
19#include <linux/sysctl.h>
20#include <linux/delayacct.h>
21
22int delayacct_on __read_mostly; /* Delay accounting turned on/off */
23kmem_cache_t *delayacct_cache;
24
25static int __init delayacct_setup_enable(char *str)
26{
27 delayacct_on = 1;
28 return 1;
29}
30__setup("delayacct", delayacct_setup_enable);
31
32void delayacct_init(void)
33{
34 delayacct_cache = kmem_cache_create("delayacct_cache",
35 sizeof(struct task_delay_info),
36 0,
37 SLAB_PANIC,
38 NULL, NULL);
39 delayacct_tsk_init(&init_task);
40}
41
42void __delayacct_tsk_init(struct task_struct *tsk)
43{
44 spin_lock_init(&tsk->delays_lock);
45 /* No need to acquire tsk->delays_lock for allocation here unless
46 __delayacct_tsk_init called after tsk is attached to tasklist
47 */
48 tsk->delays = kmem_cache_zalloc(delayacct_cache, SLAB_KERNEL);
49 if (tsk->delays)
50 spin_lock_init(&tsk->delays->lock);
51}
52
53void __delayacct_tsk_exit(struct task_struct *tsk)
54{
55 struct task_delay_info *delays = tsk->delays;
56 spin_lock(&tsk->delays_lock);
57 tsk->delays = NULL;
58 spin_unlock(&tsk->delays_lock);
59 kmem_cache_free(delayacct_cache, delays);
60}
61
62/*
63 * Start accounting for a delay statistic using
64 * its starting timestamp (@start)
65 */
66
67static inline void delayacct_start(struct timespec *start)
68{
69 do_posix_clock_monotonic_gettime(start);
70}
71
72/*
73 * Finish delay accounting for a statistic using
 74 * its timestamps (@start, @end), accumulator (@total) and @count
75 */
76
77static void delayacct_end(struct timespec *start, struct timespec *end,
78 u64 *total, u32 *count)
79{
80 struct timespec ts;
81 s64 ns;
82
83 do_posix_clock_monotonic_gettime(end);
84 ts = timespec_sub(*end, *start);
85 ns = timespec_to_ns(&ts);
86 if (ns < 0)
87 return;
88
89 spin_lock(&current->delays->lock);
90 *total += ns;
91 (*count)++;
92 spin_unlock(&current->delays->lock);
93}
94
95void __delayacct_blkio_start(void)
96{
97 delayacct_start(&current->delays->blkio_start);
98}
99
100void __delayacct_blkio_end(void)
101{
102 if (current->delays->flags & DELAYACCT_PF_SWAPIN)
103 /* Swapin block I/O */
104 delayacct_end(&current->delays->blkio_start,
105 &current->delays->blkio_end,
106 &current->delays->swapin_delay,
107 &current->delays->swapin_count);
108 else /* Other block I/O */
109 delayacct_end(&current->delays->blkio_start,
110 &current->delays->blkio_end,
111 &current->delays->blkio_delay,
112 &current->delays->blkio_count);
113}
114
115int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
116{
117 s64 tmp;
118 struct timespec ts;
119 unsigned long t1,t2,t3;
120
121 spin_lock(&tsk->delays_lock);
122
123 /* Though tsk->delays accessed later, early exit avoids
124 * unnecessary returning of other data
125 */
126 if (!tsk->delays)
127 goto done;
128
129 tmp = (s64)d->cpu_run_real_total;
130 cputime_to_timespec(tsk->utime + tsk->stime, &ts);
131 tmp += timespec_to_ns(&ts);
132 d->cpu_run_real_total = (tmp < (s64)d->cpu_run_real_total) ? 0 : tmp;
133
134 /*
135 * No locking available for sched_info (and too expensive to add one)
136 * Mitigate by taking snapshot of values
137 */
138 t1 = tsk->sched_info.pcnt;
139 t2 = tsk->sched_info.run_delay;
140 t3 = tsk->sched_info.cpu_time;
141
142 d->cpu_count += t1;
143
144 jiffies_to_timespec(t2, &ts);
145 tmp = (s64)d->cpu_delay_total + timespec_to_ns(&ts);
146 d->cpu_delay_total = (tmp < (s64)d->cpu_delay_total) ? 0 : tmp;
147
148 tmp = (s64)d->cpu_run_virtual_total + (s64)jiffies_to_usecs(t3) * 1000;
149 d->cpu_run_virtual_total =
150 (tmp < (s64)d->cpu_run_virtual_total) ? 0 : tmp;
151
152 /* zero XXX_total, non-zero XXX_count implies XXX stat overflowed */
153
154 spin_lock(&tsk->delays->lock);
155 tmp = d->blkio_delay_total + tsk->delays->blkio_delay;
156 d->blkio_delay_total = (tmp < d->blkio_delay_total) ? 0 : tmp;
157 tmp = d->swapin_delay_total + tsk->delays->swapin_delay;
158 d->swapin_delay_total = (tmp < d->swapin_delay_total) ? 0 : tmp;
159 d->blkio_count += tsk->delays->blkio_count;
160 d->swapin_count += tsk->delays->swapin_count;
161 spin_unlock(&tsk->delays->lock);
162
163done:
164 spin_unlock(&tsk->delays_lock);
165 return 0;
166}
167
168__u64 __delayacct_blkio_ticks(struct task_struct *tsk)
169{
170 __u64 ret;
171
172 spin_lock(&tsk->delays->lock);
173 ret = nsec_to_clock_t(tsk->delays->blkio_delay +
174 tsk->delays->swapin_delay);
175 spin_unlock(&tsk->delays->lock);
176 return ret;
177}
178
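The core of the new delayacct.c is the delayacct_start()/delayacct_end() pair: bracket a delay with monotonic timestamps, convert the difference to nanoseconds, and accumulate it together with a count under the per-task delays lock. The same bookkeeping, reduced to a standalone userspace sketch in which usleep() stands in for a block-I/O wait:

#include <stdint.h>
#include <stdio.h>
#include <time.h>
#include <unistd.h>

static uint64_t delay_total_ns;
static uint32_t delay_count;

static void delay_end(const struct timespec *start)
{
	struct timespec end;
	int64_t ns;

	clock_gettime(CLOCK_MONOTONIC, &end);
	ns = (int64_t)(end.tv_sec - start->tv_sec) * 1000000000LL +
	     (end.tv_nsec - start->tv_nsec);
	if (ns < 0)			/* negative interval: drop the sample */
		return;
	delay_total_ns += ns;
	delay_count++;
}

int main(void)
{
	struct timespec start;

	clock_gettime(CLOCK_MONOTONIC, &start);
	usleep(10000);			/* stand-in for waiting on block I/O */
	delay_end(&start);
	printf("%u delay(s), %llu ns total\n",
	       delay_count, (unsigned long long)delay_total_ns);
	return 0;
}

__delayacct_add_tsk() then folds these per-task totals into a struct taskstats, clamping each accumulator to zero when an addition overflows.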
diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c
index c01cead2cfd6..3c2eaea66b1e 100644
--- a/kernel/exec_domain.c
+++ b/kernel/exec_domain.c
@@ -7,7 +7,6 @@
7 * 2001-05-06 Complete rewrite, Christoph Hellwig (hch@infradead.org) 7 * 2001-05-06 Complete rewrite, Christoph Hellwig (hch@infradead.org)
8 */ 8 */
9 9
10#include <linux/config.h>
11#include <linux/init.h> 10#include <linux/init.h>
12#include <linux/kernel.h> 11#include <linux/kernel.h>
13#include <linux/kmod.h> 12#include <linux/kmod.h>
diff --git a/kernel/exit.c b/kernel/exit.c
index e95b93282210..dba194a8d416 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -4,7 +4,6 @@
4 * Copyright (C) 1991, 1992 Linus Torvalds 4 * Copyright (C) 1991, 1992 Linus Torvalds
5 */ 5 */
6 6
7#include <linux/config.h>
8#include <linux/mm.h> 7#include <linux/mm.h>
9#include <linux/slab.h> 8#include <linux/slab.h>
10#include <linux/interrupt.h> 9#include <linux/interrupt.h>
@@ -26,6 +25,8 @@
26#include <linux/mount.h> 25#include <linux/mount.h>
27#include <linux/proc_fs.h> 26#include <linux/proc_fs.h>
28#include <linux/mempolicy.h> 27#include <linux/mempolicy.h>
28#include <linux/taskstats_kern.h>
29#include <linux/delayacct.h>
29#include <linux/cpuset.h> 30#include <linux/cpuset.h>
30#include <linux/syscalls.h> 31#include <linux/syscalls.h>
31#include <linux/signal.h> 32#include <linux/signal.h>
@@ -36,6 +37,7 @@
36#include <linux/compat.h> 37#include <linux/compat.h>
37#include <linux/pipe_fs_i.h> 38#include <linux/pipe_fs_i.h>
38#include <linux/audit.h> /* for audit_free() */ 39#include <linux/audit.h> /* for audit_free() */
40#include <linux/resource.h>
39 41
40#include <asm/uaccess.h> 42#include <asm/uaccess.h>
41#include <asm/unistd.h> 43#include <asm/unistd.h>
@@ -45,8 +47,6 @@
45extern void sem_exit (void); 47extern void sem_exit (void);
46extern struct task_struct *child_reaper; 48extern struct task_struct *child_reaper;
47 49
48int getrusage(struct task_struct *, int, struct rusage __user *);
49
50static void exit_mm(struct task_struct * tsk); 50static void exit_mm(struct task_struct * tsk);
51 51
52static void __unhash_process(struct task_struct *p) 52static void __unhash_process(struct task_struct *p)
@@ -136,14 +136,10 @@ static void delayed_put_task_struct(struct rcu_head *rhp)
136 136
137void release_task(struct task_struct * p) 137void release_task(struct task_struct * p)
138{ 138{
139 struct task_struct *leader;
139 int zap_leader; 140 int zap_leader;
140 task_t *leader;
141 struct dentry *proc_dentry;
142
143repeat: 141repeat:
144 atomic_dec(&p->user->processes); 142 atomic_dec(&p->user->processes);
145 spin_lock(&p->proc_lock);
146 proc_dentry = proc_pid_unhash(p);
147 write_lock_irq(&tasklist_lock); 143 write_lock_irq(&tasklist_lock);
148 ptrace_unlink(p); 144 ptrace_unlink(p);
149 BUG_ON(!list_empty(&p->ptrace_list) || !list_empty(&p->ptrace_children)); 145 BUG_ON(!list_empty(&p->ptrace_list) || !list_empty(&p->ptrace_children));
@@ -172,8 +168,7 @@ repeat:
172 168
173 sched_exit(p); 169 sched_exit(p);
174 write_unlock_irq(&tasklist_lock); 170 write_unlock_irq(&tasklist_lock);
175 spin_unlock(&p->proc_lock); 171 proc_flush_task(p);
176 proc_pid_flush(proc_dentry);
177 release_thread(p); 172 release_thread(p);
178 call_rcu(&p->rcu, delayed_put_task_struct); 173 call_rcu(&p->rcu, delayed_put_task_struct);
179 174
@@ -216,7 +211,7 @@ out:
216 * 211 *
217 * "I ask you, have you ever known what it is to be an orphan?" 212 * "I ask you, have you ever known what it is to be an orphan?"
218 */ 213 */
219static int will_become_orphaned_pgrp(int pgrp, task_t *ignored_task) 214static int will_become_orphaned_pgrp(int pgrp, struct task_struct *ignored_task)
220{ 215{
221 struct task_struct *p; 216 struct task_struct *p;
222 int ret = 1; 217 int ret = 1;
@@ -579,7 +574,7 @@ static void exit_mm(struct task_struct * tsk)
579 down_read(&mm->mmap_sem); 574 down_read(&mm->mmap_sem);
580 } 575 }
581 atomic_inc(&mm->mm_count); 576 atomic_inc(&mm->mm_count);
582 if (mm != tsk->active_mm) BUG(); 577 BUG_ON(mm != tsk->active_mm);
583 /* more a memory barrier than a real lock */ 578 /* more a memory barrier than a real lock */
584 task_lock(tsk); 579 task_lock(tsk);
585 tsk->mm = NULL; 580 tsk->mm = NULL;
@@ -589,7 +584,8 @@ static void exit_mm(struct task_struct * tsk)
589 mmput(mm); 584 mmput(mm);
590} 585}
591 586
592static inline void choose_new_parent(task_t *p, task_t *reaper) 587static inline void
588choose_new_parent(struct task_struct *p, struct task_struct *reaper)
593{ 589{
594 /* 590 /*
595 * Make sure we're not reparenting to ourselves and that 591 * Make sure we're not reparenting to ourselves and that
@@ -599,7 +595,8 @@ static inline void choose_new_parent(task_t *p, task_t *reaper)
599 p->real_parent = reaper; 595 p->real_parent = reaper;
600} 596}
601 597
602static void reparent_thread(task_t *p, task_t *father, int traced) 598static void
599reparent_thread(struct task_struct *p, struct task_struct *father, int traced)
603{ 600{
604 /* We don't want people slaying init. */ 601 /* We don't want people slaying init. */
605 if (p->exit_signal != -1) 602 if (p->exit_signal != -1)
@@ -663,8 +660,8 @@ static void reparent_thread(task_t *p, task_t *father, int traced)
663 * group, and if no such member exists, give it to 660 * group, and if no such member exists, give it to
664 * the global child reaper process (ie "init") 661 * the global child reaper process (ie "init")
665 */ 662 */
666static void forget_original_parent(struct task_struct * father, 663static void
667 struct list_head *to_release) 664forget_original_parent(struct task_struct *father, struct list_head *to_release)
668{ 665{
669 struct task_struct *p, *reaper = father; 666 struct task_struct *p, *reaper = father;
670 struct list_head *_p, *_n; 667 struct list_head *_p, *_n;
@@ -687,7 +684,7 @@ static void forget_original_parent(struct task_struct * father,
687 */ 684 */
688 list_for_each_safe(_p, _n, &father->children) { 685 list_for_each_safe(_p, _n, &father->children) {
689 int ptrace; 686 int ptrace;
690 p = list_entry(_p,struct task_struct,sibling); 687 p = list_entry(_p, struct task_struct, sibling);
691 688
692 ptrace = p->ptrace; 689 ptrace = p->ptrace;
693 690
@@ -716,7 +713,7 @@ static void forget_original_parent(struct task_struct * father,
716 list_add(&p->ptrace_list, to_release); 713 list_add(&p->ptrace_list, to_release);
717 } 714 }
718 list_for_each_safe(_p, _n, &father->ptrace_children) { 715 list_for_each_safe(_p, _n, &father->ptrace_children) {
719 p = list_entry(_p,struct task_struct,ptrace_list); 716 p = list_entry(_p, struct task_struct, ptrace_list);
720 choose_new_parent(p, reaper); 717 choose_new_parent(p, reaper);
721 reparent_thread(p, father, 1); 718 reparent_thread(p, father, 1);
722 } 719 }
@@ -836,7 +833,7 @@ static void exit_notify(struct task_struct *tsk)
836 833
837 list_for_each_safe(_p, _n, &ptrace_dead) { 834 list_for_each_safe(_p, _n, &ptrace_dead) {
838 list_del_init(_p); 835 list_del_init(_p);
839 t = list_entry(_p,struct task_struct,ptrace_list); 836 t = list_entry(_p, struct task_struct, ptrace_list);
840 release_task(t); 837 release_task(t);
841 } 838 }
842 839
@@ -848,7 +845,9 @@ static void exit_notify(struct task_struct *tsk)
848fastcall NORET_TYPE void do_exit(long code) 845fastcall NORET_TYPE void do_exit(long code)
849{ 846{
850 struct task_struct *tsk = current; 847 struct task_struct *tsk = current;
848 struct taskstats *tidstats;
851 int group_dead; 849 int group_dead;
850 unsigned int mycpu;
852 851
853 profile_task_exit(tsk); 852 profile_task_exit(tsk);
854 853
@@ -881,19 +880,13 @@ fastcall NORET_TYPE void do_exit(long code)
881 880
882 tsk->flags |= PF_EXITING; 881 tsk->flags |= PF_EXITING;
883 882
884 /*
885 * Make sure we don't try to process any timer firings
886 * while we are already exiting.
887 */
888 tsk->it_virt_expires = cputime_zero;
889 tsk->it_prof_expires = cputime_zero;
890 tsk->it_sched_expires = 0;
891
892 if (unlikely(in_atomic())) 883 if (unlikely(in_atomic()))
893 printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n", 884 printk(KERN_INFO "note: %s[%d] exited with preempt_count %d\n",
894 current->comm, current->pid, 885 current->comm, current->pid,
895 preempt_count()); 886 preempt_count());
896 887
888 taskstats_exit_alloc(&tidstats, &mycpu);
889
897 acct_update_integrals(tsk); 890 acct_update_integrals(tsk);
898 if (tsk->mm) { 891 if (tsk->mm) {
899 update_hiwater_rss(tsk->mm); 892 update_hiwater_rss(tsk->mm);
@@ -903,18 +896,24 @@ fastcall NORET_TYPE void do_exit(long code)
903 if (group_dead) { 896 if (group_dead) {
904 hrtimer_cancel(&tsk->signal->real_timer); 897 hrtimer_cancel(&tsk->signal->real_timer);
905 exit_itimers(tsk->signal); 898 exit_itimers(tsk->signal);
906 acct_process(code);
907 } 899 }
900 acct_collect(code, group_dead);
908 if (unlikely(tsk->robust_list)) 901 if (unlikely(tsk->robust_list))
909 exit_robust_list(tsk); 902 exit_robust_list(tsk);
910#ifdef CONFIG_COMPAT 903#if defined(CONFIG_FUTEX) && defined(CONFIG_COMPAT)
911 if (unlikely(tsk->compat_robust_list)) 904 if (unlikely(tsk->compat_robust_list))
912 compat_exit_robust_list(tsk); 905 compat_exit_robust_list(tsk);
913#endif 906#endif
914 if (unlikely(tsk->audit_context)) 907 if (unlikely(tsk->audit_context))
915 audit_free(tsk); 908 audit_free(tsk);
909 taskstats_exit_send(tsk, tidstats, group_dead, mycpu);
910 taskstats_exit_free(tidstats);
911 delayacct_tsk_exit(tsk);
912
916 exit_mm(tsk); 913 exit_mm(tsk);
917 914
915 if (group_dead)
916 acct_process();
918 exit_sem(tsk); 917 exit_sem(tsk);
919 __exit_files(tsk); 918 __exit_files(tsk);
920 __exit_fs(tsk); 919 __exit_fs(tsk);
@@ -938,9 +937,17 @@ fastcall NORET_TYPE void do_exit(long code)
938 tsk->mempolicy = NULL; 937 tsk->mempolicy = NULL;
939#endif 938#endif
940 /* 939 /*
941 * If DEBUG_MUTEXES is on, make sure we are holding no locks: 940 * This must happen late, after the PID is not
941 * hashed anymore:
942 */ 942 */
943 mutex_debug_check_no_locks_held(tsk); 943 if (unlikely(!list_empty(&tsk->pi_state_list)))
944 exit_pi_state_list(tsk);
945 if (unlikely(current->pi_state_cache))
946 kfree(current->pi_state_cache);
947 /*
948 * Make sure we are holding no locks:
949 */
950 debug_check_no_locks_held(tsk);
944 951
945 if (tsk->io_context) 952 if (tsk->io_context)
946 exit_io_context(); 953 exit_io_context();
@@ -1015,7 +1022,7 @@ asmlinkage void sys_exit_group(int error_code)
1015 do_group_exit((error_code & 0xff) << 8); 1022 do_group_exit((error_code & 0xff) << 8);
1016} 1023}
1017 1024
1018static int eligible_child(pid_t pid, int options, task_t *p) 1025static int eligible_child(pid_t pid, int options, struct task_struct *p)
1019{ 1026{
1020 if (pid > 0) { 1027 if (pid > 0) {
1021 if (p->pid != pid) 1028 if (p->pid != pid)
@@ -1056,12 +1063,13 @@ static int eligible_child(pid_t pid, int options, task_t *p)
1056 return 1; 1063 return 1;
1057} 1064}
1058 1065
1059static int wait_noreap_copyout(task_t *p, pid_t pid, uid_t uid, 1066static int wait_noreap_copyout(struct task_struct *p, pid_t pid, uid_t uid,
1060 int why, int status, 1067 int why, int status,
1061 struct siginfo __user *infop, 1068 struct siginfo __user *infop,
1062 struct rusage __user *rusagep) 1069 struct rusage __user *rusagep)
1063{ 1070{
1064 int retval = rusagep ? getrusage(p, RUSAGE_BOTH, rusagep) : 0; 1071 int retval = rusagep ? getrusage(p, RUSAGE_BOTH, rusagep) : 0;
1072
1065 put_task_struct(p); 1073 put_task_struct(p);
1066 if (!retval) 1074 if (!retval)
1067 retval = put_user(SIGCHLD, &infop->si_signo); 1075 retval = put_user(SIGCHLD, &infop->si_signo);
@@ -1086,7 +1094,7 @@ static int wait_noreap_copyout(task_t *p, pid_t pid, uid_t uid,
1086 * the lock and this task is uninteresting. If we return nonzero, we have 1094 * the lock and this task is uninteresting. If we return nonzero, we have
1087 * released the lock and the system call should return. 1095 * released the lock and the system call should return.
1088 */ 1096 */
1089static int wait_task_zombie(task_t *p, int noreap, 1097static int wait_task_zombie(struct task_struct *p, int noreap,
1090 struct siginfo __user *infop, 1098 struct siginfo __user *infop,
1091 int __user *stat_addr, struct rusage __user *ru) 1099 int __user *stat_addr, struct rusage __user *ru)
1092{ 1100{
@@ -1248,8 +1256,8 @@ static int wait_task_zombie(task_t *p, int noreap,
1248 * the lock and this task is uninteresting. If we return nonzero, we have 1256 * the lock and this task is uninteresting. If we return nonzero, we have
1249 * released the lock and the system call should return. 1257 * released the lock and the system call should return.
1250 */ 1258 */
1251static int wait_task_stopped(task_t *p, int delayed_group_leader, int noreap, 1259static int wait_task_stopped(struct task_struct *p, int delayed_group_leader,
1252 struct siginfo __user *infop, 1260 int noreap, struct siginfo __user *infop,
1253 int __user *stat_addr, struct rusage __user *ru) 1261 int __user *stat_addr, struct rusage __user *ru)
1254{ 1262{
1255 int retval, exit_code; 1263 int retval, exit_code;
@@ -1363,7 +1371,7 @@ bail_ref:
1363 * the lock and this task is uninteresting. If we return nonzero, we have 1371 * the lock and this task is uninteresting. If we return nonzero, we have
1364 * released the lock and the system call should return. 1372 * released the lock and the system call should return.
1365 */ 1373 */
1366static int wait_task_continued(task_t *p, int noreap, 1374static int wait_task_continued(struct task_struct *p, int noreap,
1367 struct siginfo __user *infop, 1375 struct siginfo __user *infop,
1368 int __user *stat_addr, struct rusage __user *ru) 1376 int __user *stat_addr, struct rusage __user *ru)
1369{ 1377{
@@ -1449,7 +1457,7 @@ repeat:
1449 int ret; 1457 int ret;
1450 1458
1451 list_for_each(_p,&tsk->children) { 1459 list_for_each(_p,&tsk->children) {
1452 p = list_entry(_p,struct task_struct,sibling); 1460 p = list_entry(_p, struct task_struct, sibling);
1453 1461
1454 ret = eligible_child(pid, options, p); 1462 ret = eligible_child(pid, options, p);
1455 if (!ret) 1463 if (!ret)
@@ -1538,8 +1546,7 @@ check_continued:
1538 if (options & __WNOTHREAD) 1546 if (options & __WNOTHREAD)
1539 break; 1547 break;
1540 tsk = next_thread(tsk); 1548 tsk = next_thread(tsk);
1541 if (tsk->signal != current->signal) 1549 BUG_ON(tsk->signal != current->signal);
1542 BUG();
1543 } while (tsk != current); 1550 } while (tsk != current);
1544 1551
1545 read_unlock(&tasklist_lock); 1552 read_unlock(&tasklist_lock);
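Among the exit.c changes, the wait_task_* helpers keep copying resource usage out to the waiter (wait_noreap_copyout() still calls getrusage(p, RUSAGE_BOTH, ...)); the local getrusage() prototype is simply dropped in favour of the <linux/resource.h> include, and the task_t typedef gives way to struct task_struct. From userspace the same accounting becomes visible once a child is reaped; a minimal, portable consumer:

#include <stdio.h>
#include <sys/resource.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	struct rusage ru;
	int status;
	pid_t pid = fork();

	if (pid < 0) {
		perror("fork");
		return 1;
	}
	if (pid == 0) {				/* child: burn a little CPU, then exit */
		for (volatile long i = 0; i < 10000000; i++)
			;
		_exit(0);
	}
	if (waitpid(pid, &status, 0) < 0) {
		perror("waitpid");
		return 1;
	}
	getrusage(RUSAGE_CHILDREN, &ru);	/* accumulated usage of reaped children */
	printf("child exited %d, children utime %ld.%06ld s\n",
	       WEXITSTATUS(status),
	       (long)ru.ru_utime.tv_sec, (long)ru.ru_utime.tv_usec);
	return 0;
}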
diff --git a/kernel/fork.c b/kernel/fork.c
index ac8100e3088a..1b0f7b1e0881 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -11,7 +11,6 @@
11 * management can be a bitch. See 'mm/memory.c': 'copy_page_range()' 11 * management can be a bitch. See 'mm/memory.c': 'copy_page_range()'
12 */ 12 */
13 13
14#include <linux/config.h>
15#include <linux/slab.h> 14#include <linux/slab.h>
16#include <linux/init.h> 15#include <linux/init.h>
17#include <linux/unistd.h> 16#include <linux/unistd.h>
@@ -44,6 +43,8 @@
44#include <linux/rmap.h> 43#include <linux/rmap.h>
45#include <linux/acct.h> 44#include <linux/acct.h>
46#include <linux/cn_proc.h> 45#include <linux/cn_proc.h>
46#include <linux/delayacct.h>
47#include <linux/taskstats_kern.h>
47 48
48#include <asm/pgtable.h> 49#include <asm/pgtable.h>
49#include <asm/pgalloc.h> 50#include <asm/pgalloc.h>
@@ -62,9 +63,7 @@ int max_threads; /* tunable limit on nr_threads */
62 63
63DEFINE_PER_CPU(unsigned long, process_counts) = 0; 64DEFINE_PER_CPU(unsigned long, process_counts) = 0;
64 65
65 __cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */ 66__cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */
66
67EXPORT_SYMBOL(tasklist_lock);
68 67
69int nr_processes(void) 68int nr_processes(void)
70{ 69{
@@ -104,6 +103,7 @@ static kmem_cache_t *mm_cachep;
104void free_task(struct task_struct *tsk) 103void free_task(struct task_struct *tsk)
105{ 104{
106 free_thread_info(tsk->thread_info); 105 free_thread_info(tsk->thread_info);
106 rt_mutex_debug_task_free(tsk);
107 free_task_struct(tsk); 107 free_task_struct(tsk);
108} 108}
109EXPORT_SYMBOL(free_task); 109EXPORT_SYMBOL(free_task);
@@ -193,7 +193,10 @@ static inline int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
193 193
194 down_write(&oldmm->mmap_sem); 194 down_write(&oldmm->mmap_sem);
195 flush_cache_mm(oldmm); 195 flush_cache_mm(oldmm);
196 down_write(&mm->mmap_sem); 196 /*
197 * Not linked in yet - no deadlock potential:
198 */
199 down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING);
197 200
198 mm->locked_vm = 0; 201 mm->locked_vm = 0;
199 mm->mmap = NULL; 202 mm->mmap = NULL;
@@ -368,6 +371,8 @@ void fastcall __mmdrop(struct mm_struct *mm)
368 */ 371 */
369void mmput(struct mm_struct *mm) 372void mmput(struct mm_struct *mm)
370{ 373{
374 might_sleep();
375
371 if (atomic_dec_and_test(&mm->mm_users)) { 376 if (atomic_dec_and_test(&mm->mm_users)) {
372 exit_aio(mm); 377 exit_aio(mm);
373 exit_mmap(mm); 378 exit_mmap(mm);
@@ -623,6 +628,7 @@ out:
623/* 628/*
624 * Allocate a new files structure and copy contents from the 629 * Allocate a new files structure and copy contents from the
625 * passed in files structure. 630 * passed in files structure.
631 * errorp will be valid only when the returned files_struct is NULL.
626 */ 632 */
627static struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) 633static struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
628{ 634{
@@ -631,6 +637,7 @@ static struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
631 int open_files, size, i, expand; 637 int open_files, size, i, expand;
632 struct fdtable *old_fdt, *new_fdt; 638 struct fdtable *old_fdt, *new_fdt;
633 639
640 *errorp = -ENOMEM;
634 newf = alloc_files(); 641 newf = alloc_files();
635 if (!newf) 642 if (!newf)
636 goto out; 643 goto out;
@@ -744,7 +751,6 @@ static int copy_files(unsigned long clone_flags, struct task_struct * tsk)
744 * break this. 751 * break this.
745 */ 752 */
746 tsk->files = NULL; 753 tsk->files = NULL;
747 error = -ENOMEM;
748 newf = dup_fd(oldf, &error); 754 newf = dup_fd(oldf, &error);
749 if (!newf) 755 if (!newf)
750 goto out; 756 goto out;
@@ -814,6 +820,7 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts
814 if (clone_flags & CLONE_THREAD) { 820 if (clone_flags & CLONE_THREAD) {
815 atomic_inc(&current->signal->count); 821 atomic_inc(&current->signal->count);
816 atomic_inc(&current->signal->live); 822 atomic_inc(&current->signal->live);
823 taskstats_tgid_alloc(current->signal);
817 return 0; 824 return 0;
818 } 825 }
819 sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL); 826 sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL);
@@ -858,6 +865,7 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts
858 INIT_LIST_HEAD(&sig->cpu_timers[0]); 865 INIT_LIST_HEAD(&sig->cpu_timers[0]);
859 INIT_LIST_HEAD(&sig->cpu_timers[1]); 866 INIT_LIST_HEAD(&sig->cpu_timers[1]);
860 INIT_LIST_HEAD(&sig->cpu_timers[2]); 867 INIT_LIST_HEAD(&sig->cpu_timers[2]);
868 taskstats_tgid_init(sig);
861 869
862 task_lock(current->group_leader); 870 task_lock(current->group_leader);
863 memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim); 871 memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim);
@@ -871,6 +879,7 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts
871 tsk->it_prof_expires = 879 tsk->it_prof_expires =
872 secs_to_cputime(sig->rlim[RLIMIT_CPU].rlim_cur); 880 secs_to_cputime(sig->rlim[RLIMIT_CPU].rlim_cur);
873 } 881 }
882 acct_init_pacct(&sig->pacct);
874 883
875 return 0; 884 return 0;
876} 885}
@@ -878,6 +887,7 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts
878void __cleanup_signal(struct signal_struct *sig) 887void __cleanup_signal(struct signal_struct *sig)
879{ 888{
880 exit_thread_group_keys(sig); 889 exit_thread_group_keys(sig);
890 taskstats_tgid_free(sig);
881 kmem_cache_free(signal_cachep, sig); 891 kmem_cache_free(signal_cachep, sig);
882} 892}
883 893
@@ -909,6 +919,15 @@ asmlinkage long sys_set_tid_address(int __user *tidptr)
909 return current->pid; 919 return current->pid;
910} 920}
911 921
922static inline void rt_mutex_init_task(struct task_struct *p)
923{
924#ifdef CONFIG_RT_MUTEXES
925 spin_lock_init(&p->pi_lock);
926 plist_head_init(&p->pi_waiters, &p->pi_lock);
927 p->pi_blocked_on = NULL;
928#endif
929}
930
912/* 931/*
913 * This creates a new process as a copy of the old one, 932 * This creates a new process as a copy of the old one,
914 * but does not actually start it yet. 933 * but does not actually start it yet.
@@ -917,13 +936,13 @@ asmlinkage long sys_set_tid_address(int __user *tidptr)
917 * parts of the process environment (as per the clone 936 * parts of the process environment (as per the clone
918 * flags). The actual kick-off is left to the caller. 937 * flags). The actual kick-off is left to the caller.
919 */ 938 */
920static task_t *copy_process(unsigned long clone_flags, 939static struct task_struct *copy_process(unsigned long clone_flags,
921 unsigned long stack_start, 940 unsigned long stack_start,
922 struct pt_regs *regs, 941 struct pt_regs *regs,
923 unsigned long stack_size, 942 unsigned long stack_size,
924 int __user *parent_tidptr, 943 int __user *parent_tidptr,
925 int __user *child_tidptr, 944 int __user *child_tidptr,
926 int pid) 945 int pid)
927{ 946{
928 int retval; 947 int retval;
929 struct task_struct *p = NULL; 948 struct task_struct *p = NULL;
@@ -955,6 +974,10 @@ static task_t *copy_process(unsigned long clone_flags,
955 if (!p) 974 if (!p)
956 goto fork_out; 975 goto fork_out;
957 976
977#ifdef CONFIG_TRACE_IRQFLAGS
978 DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled);
979 DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled);
980#endif
958 retval = -EAGAIN; 981 retval = -EAGAIN;
959 if (atomic_read(&p->user->processes) >= 982 if (atomic_read(&p->user->processes) >=
960 p->signal->rlim[RLIMIT_NPROC].rlim_cur) { 983 p->signal->rlim[RLIMIT_NPROC].rlim_cur) {
@@ -982,6 +1005,7 @@ static task_t *copy_process(unsigned long clone_flags,
982 goto bad_fork_cleanup_put_domain; 1005 goto bad_fork_cleanup_put_domain;
983 1006
984 p->did_exec = 0; 1007 p->did_exec = 0;
1008 delayacct_tsk_init(p); /* Must remain after dup_task_struct() */
985 copy_flags(clone_flags, p); 1009 copy_flags(clone_flags, p);
986 p->pid = pid; 1010 p->pid = pid;
987 retval = -EFAULT; 1011 retval = -EFAULT;
@@ -989,13 +1013,10 @@ static task_t *copy_process(unsigned long clone_flags,
989 if (put_user(p->pid, parent_tidptr)) 1013 if (put_user(p->pid, parent_tidptr))
990 goto bad_fork_cleanup; 1014 goto bad_fork_cleanup;
991 1015
992 p->proc_dentry = NULL;
993
994 INIT_LIST_HEAD(&p->children); 1016 INIT_LIST_HEAD(&p->children);
995 INIT_LIST_HEAD(&p->sibling); 1017 INIT_LIST_HEAD(&p->sibling);
996 p->vfork_done = NULL; 1018 p->vfork_done = NULL;
997 spin_lock_init(&p->alloc_lock); 1019 spin_lock_init(&p->alloc_lock);
998 spin_lock_init(&p->proc_lock);
999 1020
1000 clear_tsk_thread_flag(p, TIF_SIGPENDING); 1021 clear_tsk_thread_flag(p, TIF_SIGPENDING);
1001 init_sigpending(&p->pending); 1022 init_sigpending(&p->pending);
@@ -1032,6 +1053,28 @@ static task_t *copy_process(unsigned long clone_flags,
1032 } 1053 }
1033 mpol_fix_fork_child_flag(p); 1054 mpol_fix_fork_child_flag(p);
1034#endif 1055#endif
1056#ifdef CONFIG_TRACE_IRQFLAGS
1057 p->irq_events = 0;
1058 p->hardirqs_enabled = 0;
1059 p->hardirq_enable_ip = 0;
1060 p->hardirq_enable_event = 0;
1061 p->hardirq_disable_ip = _THIS_IP_;
1062 p->hardirq_disable_event = 0;
1063 p->softirqs_enabled = 1;
1064 p->softirq_enable_ip = _THIS_IP_;
1065 p->softirq_enable_event = 0;
1066 p->softirq_disable_ip = 0;
1067 p->softirq_disable_event = 0;
1068 p->hardirq_context = 0;
1069 p->softirq_context = 0;
1070#endif
1071#ifdef CONFIG_LOCKDEP
1072 p->lockdep_depth = 0; /* no locks held yet */
1073 p->curr_chain_key = 0;
1074 p->lockdep_recursion = 0;
1075#endif
1076
1077 rt_mutex_init_task(p);
1035 1078
1036#ifdef CONFIG_DEBUG_MUTEXES 1079#ifdef CONFIG_DEBUG_MUTEXES
1037 p->blocked_on = NULL; /* not blocked yet */ 1080 p->blocked_on = NULL; /* not blocked yet */
@@ -1075,6 +1118,9 @@ static task_t *copy_process(unsigned long clone_flags,
1075#ifdef CONFIG_COMPAT 1118#ifdef CONFIG_COMPAT
1076 p->compat_robust_list = NULL; 1119 p->compat_robust_list = NULL;
1077#endif 1120#endif
1121 INIT_LIST_HEAD(&p->pi_state_list);
1122 p->pi_state_cache = NULL;
1123
1078 /* 1124 /*
1079 * sigaltstack should be cleared when sharing the same VM 1125 * sigaltstack should be cleared when sharing the same VM
1080 */ 1126 */
@@ -1155,18 +1201,6 @@ static task_t *copy_process(unsigned long clone_flags,
1155 } 1201 }
1156 1202
1157 if (clone_flags & CLONE_THREAD) { 1203 if (clone_flags & CLONE_THREAD) {
1158 /*
1159 * Important: if an exit-all has been started then
1160 * do not create this new thread - the whole thread
1161 * group is supposed to exit anyway.
1162 */
1163 if (current->signal->flags & SIGNAL_GROUP_EXIT) {
1164 spin_unlock(&current->sighand->siglock);
1165 write_unlock_irq(&tasklist_lock);
1166 retval = -EAGAIN;
1167 goto bad_fork_cleanup_namespace;
1168 }
1169
1170 p->group_leader = current->group_leader; 1204 p->group_leader = current->group_leader;
1171 list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group); 1205 list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group);
1172 1206
@@ -1264,9 +1298,9 @@ struct pt_regs * __devinit __attribute__((weak)) idle_regs(struct pt_regs *regs)
1264 return regs; 1298 return regs;
1265} 1299}
1266 1300
1267task_t * __devinit fork_idle(int cpu) 1301struct task_struct * __devinit fork_idle(int cpu)
1268{ 1302{
1269 task_t *task; 1303 struct task_struct *task;
1270 struct pt_regs regs; 1304 struct pt_regs regs;
1271 1305
1272 task = copy_process(CLONE_VM, 0, idle_regs(&regs), 0, NULL, NULL, 0); 1306 task = copy_process(CLONE_VM, 0, idle_regs(&regs), 0, NULL, NULL, 0);
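copy_process() now also initializes the per-task priority-inheritance state (rt_mutex_init_task(), pi_state_list, pi_state_cache) alongside the new lockdep and irq-tracing fields. That pi_state plumbing is what backs priority-inheriting futexes, which userspace reaches through PTHREAD_PRIO_INHERIT mutexes; a minimal consumer is sketched below (assumes a glibc and kernel with PI-futex support; link with -lpthread).

#include <pthread.h>
#include <stdio.h>

int main(void)
{
	pthread_mutexattr_t attr;
	pthread_mutex_t m;

	pthread_mutexattr_init(&attr);
	if (pthread_mutexattr_setprotocol(&attr, PTHREAD_PRIO_INHERIT) != 0) {
		fprintf(stderr, "PI mutexes not supported here\n");
		return 1;
	}
	pthread_mutex_init(&m, &attr);
	pthread_mutex_lock(&m);		/* uncontended: handled in userspace */
	pthread_mutex_unlock(&m);	/* contention would go through the PI futex ops */
	pthread_mutex_destroy(&m);
	pthread_mutexattr_destroy(&attr);
	return 0;
}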
diff --git a/kernel/futex.c b/kernel/futex.c
index 5699c512057b..cf0c8e21d1ab 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -12,6 +12,10 @@
12 * (C) Copyright 2006 Red Hat Inc, All Rights Reserved 12 * (C) Copyright 2006 Red Hat Inc, All Rights Reserved
13 * Thanks to Thomas Gleixner for suggestions, analysis and fixes. 13 * Thanks to Thomas Gleixner for suggestions, analysis and fixes.
14 * 14 *
15 * PI-futex support started by Ingo Molnar and Thomas Gleixner
16 * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
17 * Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
18 *
15 * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly 19 * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
16 * enough at me, Linus for the original (flawed) idea, Matthew 20 * enough at me, Linus for the original (flawed) idea, Matthew
17 * Kirkwood for proof-of-concept implementation. 21 * Kirkwood for proof-of-concept implementation.
@@ -46,6 +50,8 @@
46#include <linux/signal.h> 50#include <linux/signal.h>
47#include <asm/futex.h> 51#include <asm/futex.h>
48 52
53#include "rtmutex_common.h"
54
49#define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8) 55#define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8)
50 56
51/* 57/*
@@ -63,7 +69,7 @@ union futex_key {
63 int offset; 69 int offset;
64 } shared; 70 } shared;
65 struct { 71 struct {
66 unsigned long uaddr; 72 unsigned long address;
67 struct mm_struct *mm; 73 struct mm_struct *mm;
68 int offset; 74 int offset;
69 } private; 75 } private;
@@ -75,6 +81,27 @@ union futex_key {
75}; 81};
76 82
77/* 83/*
84 * Priority Inheritance state:
85 */
86struct futex_pi_state {
87 /*
88 * list of 'owned' pi_state instances - these have to be
89 * cleaned up in do_exit() if the task exits prematurely:
90 */
91 struct list_head list;
92
93 /*
94 * The PI object:
95 */
96 struct rt_mutex pi_mutex;
97
98 struct task_struct *owner;
99 atomic_t refcount;
100
101 union futex_key key;
102};
103
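For orientation (illustration only, not part of this diff): the pi_state above shadows a 32-bit futex word whose layout is shared with userspace. A minimal sketch of that layout, assuming the FUTEX_WAITERS, FUTEX_OWNER_DIED and FUTEX_TID_MASK definitions from <linux/futex.h>; the helper names are made up:

static inline pid_t futex_owner_tid(u32 uval)
{
	return uval & FUTEX_TID_MASK;		/* bits 0-29: TID of the current owner */
}

static inline int futex_has_waiters(u32 uval)
{
	return !!(uval & FUTEX_WAITERS);	/* bit 31: kernel has queued waiters */
}

static inline int futex_owner_died(u32 uval)
{
	return !!(uval & FUTEX_OWNER_DIED);	/* bit 30: owner exited without unlocking */
}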
104/*
78 * We use this hashed waitqueue instead of a normal wait_queue_t, so 105 * We use this hashed waitqueue instead of a normal wait_queue_t, so
79 * we can wake only the relevant ones (hashed queues may be shared). 106 * we can wake only the relevant ones (hashed queues may be shared).
80 * 107 *
@@ -87,15 +114,19 @@ struct futex_q {
87 struct list_head list; 114 struct list_head list;
88 wait_queue_head_t waiters; 115 wait_queue_head_t waiters;
89 116
90 /* Which hash list lock to use. */ 117 /* Which hash list lock to use: */
91 spinlock_t *lock_ptr; 118 spinlock_t *lock_ptr;
92 119
93 /* Key which the futex is hashed on. */ 120 /* Key which the futex is hashed on: */
94 union futex_key key; 121 union futex_key key;
95 122
96 /* For fd, sigio sent using these. */ 123 /* For fd, sigio sent using these: */
97 int fd; 124 int fd;
98 struct file *filp; 125 struct file *filp;
126
127 /* Optional priority inheritance state: */
128 struct futex_pi_state *pi_state;
129 struct task_struct *task;
99}; 130};
100 131
101/* 132/*
@@ -144,8 +175,9 @@ static inline int match_futex(union futex_key *key1, union futex_key *key2)
144 * 175 *
145 * Should be called with &current->mm->mmap_sem but NOT any spinlocks. 176 * Should be called with &current->mm->mmap_sem but NOT any spinlocks.
146 */ 177 */
147static int get_futex_key(unsigned long uaddr, union futex_key *key) 178static int get_futex_key(u32 __user *uaddr, union futex_key *key)
148{ 179{
180 unsigned long address = (unsigned long)uaddr;
149 struct mm_struct *mm = current->mm; 181 struct mm_struct *mm = current->mm;
150 struct vm_area_struct *vma; 182 struct vm_area_struct *vma;
151 struct page *page; 183 struct page *page;
@@ -154,16 +186,16 @@ static int get_futex_key(unsigned long uaddr, union futex_key *key)
154 /* 186 /*
155 * The futex address must be "naturally" aligned. 187 * The futex address must be "naturally" aligned.
156 */ 188 */
157 key->both.offset = uaddr % PAGE_SIZE; 189 key->both.offset = address % PAGE_SIZE;
158 if (unlikely((key->both.offset % sizeof(u32)) != 0)) 190 if (unlikely((key->both.offset % sizeof(u32)) != 0))
159 return -EINVAL; 191 return -EINVAL;
160 uaddr -= key->both.offset; 192 address -= key->both.offset;
161 193
162 /* 194 /*
163 * The futex is hashed differently depending on whether 195 * The futex is hashed differently depending on whether
164 * it's in a shared or private mapping. So check vma first. 196 * it's in a shared or private mapping. So check vma first.
165 */ 197 */
166 vma = find_extend_vma(mm, uaddr); 198 vma = find_extend_vma(mm, address);
167 if (unlikely(!vma)) 199 if (unlikely(!vma))
168 return -EFAULT; 200 return -EFAULT;
169 201
@@ -184,7 +216,7 @@ static int get_futex_key(unsigned long uaddr, union futex_key *key)
184 */ 216 */
185 if (likely(!(vma->vm_flags & VM_MAYSHARE))) { 217 if (likely(!(vma->vm_flags & VM_MAYSHARE))) {
186 key->private.mm = mm; 218 key->private.mm = mm;
187 key->private.uaddr = uaddr; 219 key->private.address = address;
188 return 0; 220 return 0;
189 } 221 }
190 222
@@ -194,7 +226,7 @@ static int get_futex_key(unsigned long uaddr, union futex_key *key)
194 key->shared.inode = vma->vm_file->f_dentry->d_inode; 226 key->shared.inode = vma->vm_file->f_dentry->d_inode;
195 key->both.offset++; /* Bit 0 of offset indicates inode-based key. */ 227 key->both.offset++; /* Bit 0 of offset indicates inode-based key. */
196 if (likely(!(vma->vm_flags & VM_NONLINEAR))) { 228 if (likely(!(vma->vm_flags & VM_NONLINEAR))) {
197 key->shared.pgoff = (((uaddr - vma->vm_start) >> PAGE_SHIFT) 229 key->shared.pgoff = (((address - vma->vm_start) >> PAGE_SHIFT)
198 + vma->vm_pgoff); 230 + vma->vm_pgoff);
199 return 0; 231 return 0;
200 } 232 }
@@ -205,7 +237,7 @@ static int get_futex_key(unsigned long uaddr, union futex_key *key)
205 * from swap. But that's a lot of code to duplicate here 237 * from swap. But that's a lot of code to duplicate here
206 * for a rare case, so we simply fetch the page. 238 * for a rare case, so we simply fetch the page.
207 */ 239 */
208 err = get_user_pages(current, mm, uaddr, 1, 0, 0, &page, NULL); 240 err = get_user_pages(current, mm, address, 1, 0, 0, &page, NULL);
209 if (err >= 0) { 241 if (err >= 0) {
210 key->shared.pgoff = 242 key->shared.pgoff =
211 page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); 243 page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
@@ -246,18 +278,250 @@ static void drop_key_refs(union futex_key *key)
246 } 278 }
247} 279}
248 280
249static inline int get_futex_value_locked(int *dest, int __user *from) 281static inline int get_futex_value_locked(u32 *dest, u32 __user *from)
250{ 282{
251 int ret; 283 int ret;
252 284
253 inc_preempt_count(); 285 inc_preempt_count();
254 ret = __copy_from_user_inatomic(dest, from, sizeof(int)); 286 ret = __copy_from_user_inatomic(dest, from, sizeof(u32));
255 dec_preempt_count(); 287 dec_preempt_count();
256 288
257 return ret ? -EFAULT : 0; 289 return ret ? -EFAULT : 0;
258} 290}
259 291
260/* 292/*
293 * Fault handling. Called with current->mm->mmap_sem held.
294 */
295static int futex_handle_fault(unsigned long address, int attempt)
296{
297 struct vm_area_struct * vma;
298 struct mm_struct *mm = current->mm;
299
300 if (attempt >= 2 || !(vma = find_vma(mm, address)) ||
301 vma->vm_start > address || !(vma->vm_flags & VM_WRITE))
302 return -EFAULT;
303
304 switch (handle_mm_fault(mm, vma, address, 1)) {
305 case VM_FAULT_MINOR:
306 current->min_flt++;
307 break;
308 case VM_FAULT_MAJOR:
309 current->maj_flt++;
310 break;
311 default:
312 return -EFAULT;
313 }
314 return 0;
315}
316
317/*
318 * PI code:
319 */
320static int refill_pi_state_cache(void)
321{
322 struct futex_pi_state *pi_state;
323
324 if (likely(current->pi_state_cache))
325 return 0;
326
327 pi_state = kmalloc(sizeof(*pi_state), GFP_KERNEL);
328
329 if (!pi_state)
330 return -ENOMEM;
331
332 memset(pi_state, 0, sizeof(*pi_state));
333 INIT_LIST_HEAD(&pi_state->list);
334 /* pi_mutex gets initialized later */
335 pi_state->owner = NULL;
336 atomic_set(&pi_state->refcount, 1);
337
338 current->pi_state_cache = pi_state;
339
340 return 0;
341}
342
343static struct futex_pi_state * alloc_pi_state(void)
344{
345 struct futex_pi_state *pi_state = current->pi_state_cache;
346
347 WARN_ON(!pi_state);
348 current->pi_state_cache = NULL;
349
350 return pi_state;
351}
352
353static void free_pi_state(struct futex_pi_state *pi_state)
354{
355 if (!atomic_dec_and_test(&pi_state->refcount))
356 return;
357
358 /*
359 * If pi_state->owner is NULL, the owner is most probably dying
360 * and has cleaned up the pi_state already
361 */
362 if (pi_state->owner) {
363 spin_lock_irq(&pi_state->owner->pi_lock);
364 list_del_init(&pi_state->list);
365 spin_unlock_irq(&pi_state->owner->pi_lock);
366
367 rt_mutex_proxy_unlock(&pi_state->pi_mutex, pi_state->owner);
368 }
369
370 if (current->pi_state_cache)
371 kfree(pi_state);
372 else {
373 /*
374 * pi_state->list is already empty.
375 * clear pi_state->owner.
376 * refcount is at 0 - put it back to 1.
377 */
378 pi_state->owner = NULL;
379 atomic_set(&pi_state->refcount, 1);
380 current->pi_state_cache = pi_state;
381 }
382}
383
384/*
385 * Look up the task based on what TID userspace gave us.
386 * We don't trust it.
387 */
388static struct task_struct * futex_find_get_task(pid_t pid)
389{
390 struct task_struct *p;
391
392 read_lock(&tasklist_lock);
393 p = find_task_by_pid(pid);
394 if (!p)
395 goto out_unlock;
396 if ((current->euid != p->euid) && (current->euid != p->uid)) {
397 p = NULL;
398 goto out_unlock;
399 }
400 if (p->state == EXIT_ZOMBIE || p->exit_state == EXIT_ZOMBIE) {
401 p = NULL;
402 goto out_unlock;
403 }
404 get_task_struct(p);
405out_unlock:
406 read_unlock(&tasklist_lock);
407
408 return p;
409}
410
411/*
412 * This task is holding PI mutexes at exit time => bad.
413 * Kernel cleans up PI-state, but userspace is likely hosed.
414 * (Robust-futex cleanup is separate and might save the day for userspace.)
415 */
416void exit_pi_state_list(struct task_struct *curr)
417{
418 struct futex_hash_bucket *hb;
419 struct list_head *next, *head = &curr->pi_state_list;
420 struct futex_pi_state *pi_state;
421 union futex_key key;
422
423 /*
424 * We are a ZOMBIE and nobody can enqueue itself on
425 * pi_state_list anymore, but we have to be careful
426 * versus waiters unqueueing themselves
427 */
428 spin_lock_irq(&curr->pi_lock);
429 while (!list_empty(head)) {
430
431 next = head->next;
432 pi_state = list_entry(next, struct futex_pi_state, list);
433 key = pi_state->key;
434 spin_unlock_irq(&curr->pi_lock);
435
436 hb = hash_futex(&key);
437 spin_lock(&hb->lock);
438
439 spin_lock_irq(&curr->pi_lock);
440 if (head->next != next) {
441 spin_unlock(&hb->lock);
442 continue;
443 }
444
445 list_del_init(&pi_state->list);
446
447 WARN_ON(pi_state->owner != curr);
448
449 pi_state->owner = NULL;
450 spin_unlock_irq(&curr->pi_lock);
451
452 rt_mutex_unlock(&pi_state->pi_mutex);
453
454 spin_unlock(&hb->lock);
455
456 spin_lock_irq(&curr->pi_lock);
457 }
458 spin_unlock_irq(&curr->pi_lock);
459}
460
461static int
462lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me)
463{
464 struct futex_pi_state *pi_state = NULL;
465 struct futex_q *this, *next;
466 struct list_head *head;
467 struct task_struct *p;
468 pid_t pid;
469
470 head = &hb->chain;
471
472 list_for_each_entry_safe(this, next, head, list) {
473 if (match_futex (&this->key, &me->key)) {
474 /*
475 * Another waiter already exists - bump up
476 * the refcount and return its pi_state:
477 */
478 pi_state = this->pi_state;
479 /*
480 * Userspace might have messed up non-PI and PI futexes
481 */
482 if (unlikely(!pi_state))
483 return -EINVAL;
484
485 atomic_inc(&pi_state->refcount);
486 me->pi_state = pi_state;
487
488 return 0;
489 }
490 }
491
492 /*
493 * We are the first waiter - try to look up the real owner and
494 * attach the new pi_state to it:
495 */
496 pid = uval & FUTEX_TID_MASK;
497 p = futex_find_get_task(pid);
498 if (!p)
499 return -ESRCH;
500
501 pi_state = alloc_pi_state();
502
503 /*
504 * Initialize the pi_mutex in locked state and make 'p'
505 * the owner of it:
506 */
507 rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p);
508
509 /* Store the key for possible exit cleanups: */
510 pi_state->key = me->key;
511
512 spin_lock_irq(&p->pi_lock);
513 list_add(&pi_state->list, &p->pi_state_list);
514 pi_state->owner = p;
515 spin_unlock_irq(&p->pi_lock);
516
517 put_task_struct(p);
518
519 me->pi_state = pi_state;
520
521 return 0;
522}
523
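A hedged userspace illustration of the checks above: if the futex word carries a TID that does not name a live task (and FUTEX_OWNER_DIED is not set), futex_find_get_task() fails and FUTEX_LOCK_PI is expected to come back with ESRCH. Sketch only; the exact error can differ on other kernel versions:

#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <linux/futex.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	uint32_t futex_word = 999999;	/* pretend "owner" TID, presumably not a live task */

	if (syscall(SYS_futex, &futex_word, FUTEX_LOCK_PI, 0, NULL, NULL, 0) == -1)
		printf("FUTEX_LOCK_PI: %s\n", strerror(errno));	/* typically ESRCH */
	return 0;
}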
524/*
261 * The hash bucket lock must be held when this is called. 525 * The hash bucket lock must be held when this is called.
262 * Afterwards, the futex_q must not be accessed. 526 * Afterwards, the futex_q must not be accessed.
263 */ 527 */
@@ -284,16 +548,96 @@ static void wake_futex(struct futex_q *q)
284 q->lock_ptr = NULL; 548 q->lock_ptr = NULL;
285} 549}
286 550
551static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
552{
553 struct task_struct *new_owner;
554 struct futex_pi_state *pi_state = this->pi_state;
555 u32 curval, newval;
556
557 if (!pi_state)
558 return -EINVAL;
559
560 new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
561
562 /*
563 * This happens when we have stolen the lock and the original
564 * pending owner did not enqueue itself back on the rt_mutex.
565 * That's not a tragedy. It simply tells us that a lock waiter
566 * is in flight. We make the futex_q waiter the pending owner.
567 */
568 if (!new_owner)
569 new_owner = this->task;
570
571 /*
572 * We pass it to the next owner. (The WAITERS bit is always
573 * kept enabled while there is PI state around. We must also
574 * preserve the owner died bit.)
575 */
576 newval = (uval & FUTEX_OWNER_DIED) | FUTEX_WAITERS | new_owner->pid;
577
578 inc_preempt_count();
579 curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
580 dec_preempt_count();
581
582 if (curval == -EFAULT)
583 return -EFAULT;
584 if (curval != uval)
585 return -EINVAL;
586
587 list_del_init(&pi_state->owner->pi_state_list);
588 list_add(&pi_state->list, &new_owner->pi_state_list);
589 pi_state->owner = new_owner;
590 rt_mutex_unlock(&pi_state->pi_mutex);
591
592 return 0;
593}
594
595static int unlock_futex_pi(u32 __user *uaddr, u32 uval)
596{
597 u32 oldval;
598
599 /*
600 * There is no waiter, so we unlock the futex. The owner died
601 * bit does not have to be preserved here. We are the owner:
602 */
603 inc_preempt_count();
604 oldval = futex_atomic_cmpxchg_inatomic(uaddr, uval, 0);
605 dec_preempt_count();
606
607 if (oldval == -EFAULT)
608 return oldval;
609 if (oldval != uval)
610 return -EAGAIN;
611
612 return 0;
613}
614
615/*
616 * Express the locking dependencies for lockdep:
617 */
618static inline void
619double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
620{
621 if (hb1 <= hb2) {
622 spin_lock(&hb1->lock);
623 if (hb1 < hb2)
624 spin_lock_nested(&hb2->lock, SINGLE_DEPTH_NESTING);
625 } else { /* hb1 > hb2 */
626 spin_lock(&hb2->lock);
627 spin_lock_nested(&hb1->lock, SINGLE_DEPTH_NESTING);
628 }
629}
630
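A note on the helper above (illustration, not from the patch): both hash-bucket locks belong to the same lock class, so lockdep would otherwise treat the second spin_lock() as a recursive acquisition; taking them in address order and annotating the inner one with SINGLE_DEPTH_NESTING states the intent, and the fixed order also rules out ABBA deadlocks between callers that pass the buckets the other way round. The same idiom for any two locks of one class might look like:

static void double_lock(spinlock_t *a, spinlock_t *b)
{
	if (a == b) {
		spin_lock(a);				/* only one lock to take */
	} else if (a < b) {
		spin_lock(a);
		spin_lock_nested(b, SINGLE_DEPTH_NESTING);
	} else {
		spin_lock(b);
		spin_lock_nested(a, SINGLE_DEPTH_NESTING);
	}
}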
287/* 631/*
288 * Wake up all waiters hashed on the physical page that is mapped 632 * Wake up all waiters hashed on the physical page that is mapped
289 * to this virtual address: 633 * to this virtual address:
290 */ 634 */
291static int futex_wake(unsigned long uaddr, int nr_wake) 635static int futex_wake(u32 __user *uaddr, int nr_wake)
292{ 636{
293 union futex_key key; 637 struct futex_hash_bucket *hb;
294 struct futex_hash_bucket *bh;
295 struct list_head *head;
296 struct futex_q *this, *next; 638 struct futex_q *this, *next;
639 struct list_head *head;
640 union futex_key key;
297 int ret; 641 int ret;
298 642
299 down_read(&current->mm->mmap_sem); 643 down_read(&current->mm->mmap_sem);
@@ -302,19 +646,23 @@ static int futex_wake(unsigned long uaddr, int nr_wake)
302 if (unlikely(ret != 0)) 646 if (unlikely(ret != 0))
303 goto out; 647 goto out;
304 648
305 bh = hash_futex(&key); 649 hb = hash_futex(&key);
306 spin_lock(&bh->lock); 650 spin_lock(&hb->lock);
307 head = &bh->chain; 651 head = &hb->chain;
308 652
309 list_for_each_entry_safe(this, next, head, list) { 653 list_for_each_entry_safe(this, next, head, list) {
310 if (match_futex (&this->key, &key)) { 654 if (match_futex (&this->key, &key)) {
655 if (this->pi_state) {
656 ret = -EINVAL;
657 break;
658 }
311 wake_futex(this); 659 wake_futex(this);
312 if (++ret >= nr_wake) 660 if (++ret >= nr_wake)
313 break; 661 break;
314 } 662 }
315 } 663 }
316 664
317 spin_unlock(&bh->lock); 665 spin_unlock(&hb->lock);
318out: 666out:
319 up_read(&current->mm->mmap_sem); 667 up_read(&current->mm->mmap_sem);
320 return ret; 668 return ret;
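Hedged userspace illustration of the new -EINVAL branch: once a task has been queued by FUTEX_LOCK_PI, waking the same word with the non-PI FUTEX_WAKE op is refused rather than silently bypassing the PI state. The helper name is made up; same #includes as the FUTEX_LOCK_PI sketch earlier on this page:

/* Returns the number of tasks woken, -EINVAL if PI waiters are queued, -1 on other errors. */
static long wake_one_non_pi(uint32_t *uaddr)
{
	long r = syscall(SYS_futex, uaddr, FUTEX_WAKE, 1, NULL, NULL, 0);

	if (r == -1 && errno == EINVAL)
		return -EINVAL;		/* a futex_q with pi_state hangs off this word */
	return r;
}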
@@ -324,10 +672,12 @@ out:
324 * Wake up all waiters hashed on the physical page that is mapped 672 * Wake up all waiters hashed on the physical page that is mapped
325 * to this virtual address: 673 * to this virtual address:
326 */ 674 */
327static int futex_wake_op(unsigned long uaddr1, unsigned long uaddr2, int nr_wake, int nr_wake2, int op) 675static int
676futex_wake_op(u32 __user *uaddr1, u32 __user *uaddr2,
677 int nr_wake, int nr_wake2, int op)
328{ 678{
329 union futex_key key1, key2; 679 union futex_key key1, key2;
330 struct futex_hash_bucket *bh1, *bh2; 680 struct futex_hash_bucket *hb1, *hb2;
331 struct list_head *head; 681 struct list_head *head;
332 struct futex_q *this, *next; 682 struct futex_q *this, *next;
333 int ret, op_ret, attempt = 0; 683 int ret, op_ret, attempt = 0;
@@ -342,27 +692,25 @@ retryfull:
342 if (unlikely(ret != 0)) 692 if (unlikely(ret != 0))
343 goto out; 693 goto out;
344 694
345 bh1 = hash_futex(&key1); 695 hb1 = hash_futex(&key1);
346 bh2 = hash_futex(&key2); 696 hb2 = hash_futex(&key2);
347 697
348retry: 698retry:
349 if (bh1 < bh2) 699 double_lock_hb(hb1, hb2);
350 spin_lock(&bh1->lock);
351 spin_lock(&bh2->lock);
352 if (bh1 > bh2)
353 spin_lock(&bh1->lock);
354 700
355 op_ret = futex_atomic_op_inuser(op, (int __user *)uaddr2); 701 op_ret = futex_atomic_op_inuser(op, uaddr2);
356 if (unlikely(op_ret < 0)) { 702 if (unlikely(op_ret < 0)) {
357 int dummy; 703 u32 dummy;
358 704
359 spin_unlock(&bh1->lock); 705 spin_unlock(&hb1->lock);
360 if (bh1 != bh2) 706 if (hb1 != hb2)
361 spin_unlock(&bh2->lock); 707 spin_unlock(&hb2->lock);
362 708
363#ifndef CONFIG_MMU 709#ifndef CONFIG_MMU
364 /* we don't get EFAULT from MMU faults if we don't have an MMU, 710 /*
365 * but we might get them from range checking */ 711 * we don't get EFAULT from MMU faults if we don't have an MMU,
712 * but we might get them from range checking
713 */
366 ret = op_ret; 714 ret = op_ret;
367 goto out; 715 goto out;
368#endif 716#endif
@@ -372,47 +720,34 @@ retry:
372 goto out; 720 goto out;
373 } 721 }
374 722
375 /* futex_atomic_op_inuser needs to both read and write 723 /*
724 * futex_atomic_op_inuser needs to both read and write
376 * *(int __user *)uaddr2, but we can't modify it 725 * *(int __user *)uaddr2, but we can't modify it
377 * non-atomically. Therefore, if get_user below is not 726 * non-atomically. Therefore, if get_user below is not
378 * enough, we need to handle the fault ourselves, while 727 * enough, we need to handle the fault ourselves, while
379 * still holding the mmap_sem. */ 728 * still holding the mmap_sem.
729 */
380 if (attempt++) { 730 if (attempt++) {
381 struct vm_area_struct * vma; 731 if (futex_handle_fault((unsigned long)uaddr2,
382 struct mm_struct *mm = current->mm; 732 attempt))
383
384 ret = -EFAULT;
385 if (attempt >= 2 ||
386 !(vma = find_vma(mm, uaddr2)) ||
387 vma->vm_start > uaddr2 ||
388 !(vma->vm_flags & VM_WRITE))
389 goto out;
390
391 switch (handle_mm_fault(mm, vma, uaddr2, 1)) {
392 case VM_FAULT_MINOR:
393 current->min_flt++;
394 break;
395 case VM_FAULT_MAJOR:
396 current->maj_flt++;
397 break;
398 default:
399 goto out; 733 goto out;
400 }
401 goto retry; 734 goto retry;
402 } 735 }
403 736
404 /* If we would have faulted, release mmap_sem, 737 /*
405 * fault it in and start all over again. */ 738 * If we would have faulted, release mmap_sem,
739 * fault it in and start all over again.
740 */
406 up_read(&current->mm->mmap_sem); 741 up_read(&current->mm->mmap_sem);
407 742
408 ret = get_user(dummy, (int __user *)uaddr2); 743 ret = get_user(dummy, uaddr2);
409 if (ret) 744 if (ret)
410 return ret; 745 return ret;
411 746
412 goto retryfull; 747 goto retryfull;
413 } 748 }
414 749
415 head = &bh1->chain; 750 head = &hb1->chain;
416 751
417 list_for_each_entry_safe(this, next, head, list) { 752 list_for_each_entry_safe(this, next, head, list) {
418 if (match_futex (&this->key, &key1)) { 753 if (match_futex (&this->key, &key1)) {
@@ -423,7 +758,7 @@ retry:
423 } 758 }
424 759
425 if (op_ret > 0) { 760 if (op_ret > 0) {
426 head = &bh2->chain; 761 head = &hb2->chain;
427 762
428 op_ret = 0; 763 op_ret = 0;
429 list_for_each_entry_safe(this, next, head, list) { 764 list_for_each_entry_safe(this, next, head, list) {
@@ -436,9 +771,9 @@ retry:
436 ret += op_ret; 771 ret += op_ret;
437 } 772 }
438 773
439 spin_unlock(&bh1->lock); 774 spin_unlock(&hb1->lock);
440 if (bh1 != bh2) 775 if (hb1 != hb2)
441 spin_unlock(&bh2->lock); 776 spin_unlock(&hb2->lock);
442out: 777out:
443 up_read(&current->mm->mmap_sem); 778 up_read(&current->mm->mmap_sem);
444 return ret; 779 return ret;
@@ -448,11 +783,11 @@ out:
448 * Requeue all waiters hashed on one physical page to another 783 * Requeue all waiters hashed on one physical page to another
449 * physical page. 784 * physical page.
450 */ 785 */
451static int futex_requeue(unsigned long uaddr1, unsigned long uaddr2, 786static int futex_requeue(u32 __user *uaddr1, u32 __user *uaddr2,
452 int nr_wake, int nr_requeue, int *valp) 787 int nr_wake, int nr_requeue, u32 *cmpval)
453{ 788{
454 union futex_key key1, key2; 789 union futex_key key1, key2;
455 struct futex_hash_bucket *bh1, *bh2; 790 struct futex_hash_bucket *hb1, *hb2;
456 struct list_head *head1; 791 struct list_head *head1;
457 struct futex_q *this, *next; 792 struct futex_q *this, *next;
458 int ret, drop_count = 0; 793 int ret, drop_count = 0;
@@ -467,68 +802,68 @@ static int futex_requeue(unsigned long uaddr1, unsigned long uaddr2,
467 if (unlikely(ret != 0)) 802 if (unlikely(ret != 0))
468 goto out; 803 goto out;
469 804
470 bh1 = hash_futex(&key1); 805 hb1 = hash_futex(&key1);
471 bh2 = hash_futex(&key2); 806 hb2 = hash_futex(&key2);
472 807
473 if (bh1 < bh2) 808 double_lock_hb(hb1, hb2);
474 spin_lock(&bh1->lock);
475 spin_lock(&bh2->lock);
476 if (bh1 > bh2)
477 spin_lock(&bh1->lock);
478 809
479 if (likely(valp != NULL)) { 810 if (likely(cmpval != NULL)) {
480 int curval; 811 u32 curval;
481 812
482 ret = get_futex_value_locked(&curval, (int __user *)uaddr1); 813 ret = get_futex_value_locked(&curval, uaddr1);
483 814
484 if (unlikely(ret)) { 815 if (unlikely(ret)) {
485 spin_unlock(&bh1->lock); 816 spin_unlock(&hb1->lock);
486 if (bh1 != bh2) 817 if (hb1 != hb2)
487 spin_unlock(&bh2->lock); 818 spin_unlock(&hb2->lock);
488 819
489 /* If we would have faulted, release mmap_sem, fault 820 /*
821 * If we would have faulted, release mmap_sem, fault
490 * it in and start all over again. 822 * it in and start all over again.
491 */ 823 */
492 up_read(&current->mm->mmap_sem); 824 up_read(&current->mm->mmap_sem);
493 825
494 ret = get_user(curval, (int __user *)uaddr1); 826 ret = get_user(curval, uaddr1);
495 827
496 if (!ret) 828 if (!ret)
497 goto retry; 829 goto retry;
498 830
499 return ret; 831 return ret;
500 } 832 }
501 if (curval != *valp) { 833 if (curval != *cmpval) {
502 ret = -EAGAIN; 834 ret = -EAGAIN;
503 goto out_unlock; 835 goto out_unlock;
504 } 836 }
505 } 837 }
506 838
507 head1 = &bh1->chain; 839 head1 = &hb1->chain;
508 list_for_each_entry_safe(this, next, head1, list) { 840 list_for_each_entry_safe(this, next, head1, list) {
509 if (!match_futex (&this->key, &key1)) 841 if (!match_futex (&this->key, &key1))
510 continue; 842 continue;
511 if (++ret <= nr_wake) { 843 if (++ret <= nr_wake) {
512 wake_futex(this); 844 wake_futex(this);
513 } else { 845 } else {
514 list_move_tail(&this->list, &bh2->chain); 846 /*
515 this->lock_ptr = &bh2->lock; 847 * If key1 and key2 hash to the same bucket, no need to
848 * requeue.
849 */
850 if (likely(head1 != &hb2->chain)) {
851 list_move_tail(&this->list, &hb2->chain);
852 this->lock_ptr = &hb2->lock;
853 }
516 this->key = key2; 854 this->key = key2;
517 get_key_refs(&key2); 855 get_key_refs(&key2);
518 drop_count++; 856 drop_count++;
519 857
520 if (ret - nr_wake >= nr_requeue) 858 if (ret - nr_wake >= nr_requeue)
521 break; 859 break;
522 /* Make sure to stop if key1 == key2 */
523 if (head1 == &bh2->chain && head1 != &next->list)
524 head1 = &this->list;
525 } 860 }
526 } 861 }
527 862
528out_unlock: 863out_unlock:
529 spin_unlock(&bh1->lock); 864 spin_unlock(&hb1->lock);
530 if (bh1 != bh2) 865 if (hb1 != hb2)
531 spin_unlock(&bh2->lock); 866 spin_unlock(&hb2->lock);
532 867
533 /* drop_key_refs() must be called outside the spinlocks. */ 868 /* drop_key_refs() must be called outside the spinlocks. */
534 while (--drop_count >= 0) 869 while (--drop_count >= 0)
@@ -543,7 +878,7 @@ out:
543static inline struct futex_hash_bucket * 878static inline struct futex_hash_bucket *
544queue_lock(struct futex_q *q, int fd, struct file *filp) 879queue_lock(struct futex_q *q, int fd, struct file *filp)
545{ 880{
546 struct futex_hash_bucket *bh; 881 struct futex_hash_bucket *hb;
547 882
548 q->fd = fd; 883 q->fd = fd;
549 q->filp = filp; 884 q->filp = filp;
@@ -551,23 +886,24 @@ queue_lock(struct futex_q *q, int fd, struct file *filp)
551 init_waitqueue_head(&q->waiters); 886 init_waitqueue_head(&q->waiters);
552 887
553 get_key_refs(&q->key); 888 get_key_refs(&q->key);
554 bh = hash_futex(&q->key); 889 hb = hash_futex(&q->key);
555 q->lock_ptr = &bh->lock; 890 q->lock_ptr = &hb->lock;
556 891
557 spin_lock(&bh->lock); 892 spin_lock(&hb->lock);
558 return bh; 893 return hb;
559} 894}
560 895
561static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *bh) 896static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
562{ 897{
563 list_add_tail(&q->list, &bh->chain); 898 list_add_tail(&q->list, &hb->chain);
564 spin_unlock(&bh->lock); 899 q->task = current;
900 spin_unlock(&hb->lock);
565} 901}
566 902
567static inline void 903static inline void
568queue_unlock(struct futex_q *q, struct futex_hash_bucket *bh) 904queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb)
569{ 905{
570 spin_unlock(&bh->lock); 906 spin_unlock(&hb->lock);
571 drop_key_refs(&q->key); 907 drop_key_refs(&q->key);
572} 908}
573 909
@@ -579,16 +915,17 @@ queue_unlock(struct futex_q *q, struct futex_hash_bucket *bh)
579/* The key must be already stored in q->key. */ 915/* The key must be already stored in q->key. */
580static void queue_me(struct futex_q *q, int fd, struct file *filp) 916static void queue_me(struct futex_q *q, int fd, struct file *filp)
581{ 917{
582 struct futex_hash_bucket *bh; 918 struct futex_hash_bucket *hb;
583 bh = queue_lock(q, fd, filp); 919
584 __queue_me(q, bh); 920 hb = queue_lock(q, fd, filp);
921 __queue_me(q, hb);
585} 922}
586 923
587/* Return 1 if we were still queued (ie. 0 means we were woken) */ 924/* Return 1 if we were still queued (ie. 0 means we were woken) */
588static int unqueue_me(struct futex_q *q) 925static int unqueue_me(struct futex_q *q)
589{ 926{
590 int ret = 0;
591 spinlock_t *lock_ptr; 927 spinlock_t *lock_ptr;
928 int ret = 0;
592 929
593 /* In the common case we don't take the spinlock, which is nice. */ 930 /* In the common case we don't take the spinlock, which is nice. */
594 retry: 931 retry:
@@ -614,6 +951,9 @@ static int unqueue_me(struct futex_q *q)
614 } 951 }
615 WARN_ON(list_empty(&q->list)); 952 WARN_ON(list_empty(&q->list));
616 list_del(&q->list); 953 list_del(&q->list);
954
955 BUG_ON(q->pi_state);
956
617 spin_unlock(lock_ptr); 957 spin_unlock(lock_ptr);
618 ret = 1; 958 ret = 1;
619 } 959 }
@@ -622,21 +962,42 @@ static int unqueue_me(struct futex_q *q)
622 return ret; 962 return ret;
623} 963}
624 964
625static int futex_wait(unsigned long uaddr, int val, unsigned long time) 965/*
966 * PI futexes cannot be requeued and must remove themselves from the
967 * hash bucket. The hash bucket lock is held on entry and dropped here.
968 */
969static void unqueue_me_pi(struct futex_q *q, struct futex_hash_bucket *hb)
626{ 970{
627 DECLARE_WAITQUEUE(wait, current); 971 WARN_ON(list_empty(&q->list));
628 int ret, curval; 972 list_del(&q->list);
973
974 BUG_ON(!q->pi_state);
975 free_pi_state(q->pi_state);
976 q->pi_state = NULL;
977
978 spin_unlock(&hb->lock);
979
980 drop_key_refs(&q->key);
981}
982
983static int futex_wait(u32 __user *uaddr, u32 val, unsigned long time)
984{
985 struct task_struct *curr = current;
986 DECLARE_WAITQUEUE(wait, curr);
987 struct futex_hash_bucket *hb;
629 struct futex_q q; 988 struct futex_q q;
630 struct futex_hash_bucket *bh; 989 u32 uval;
990 int ret;
631 991
992 q.pi_state = NULL;
632 retry: 993 retry:
633 down_read(&current->mm->mmap_sem); 994 down_read(&curr->mm->mmap_sem);
634 995
635 ret = get_futex_key(uaddr, &q.key); 996 ret = get_futex_key(uaddr, &q.key);
636 if (unlikely(ret != 0)) 997 if (unlikely(ret != 0))
637 goto out_release_sem; 998 goto out_release_sem;
638 999
639 bh = queue_lock(&q, -1, NULL); 1000 hb = queue_lock(&q, -1, NULL);
640 1001
641 /* 1002 /*
642 * Access the page AFTER the futex is queued. 1003 * Access the page AFTER the futex is queued.
@@ -658,37 +1019,35 @@ static int futex_wait(unsigned long uaddr, int val, unsigned long time)
658 * We hold the mmap semaphore, so the mapping cannot have changed 1019 * We hold the mmap semaphore, so the mapping cannot have changed
659 * since we looked it up in get_futex_key. 1020 * since we looked it up in get_futex_key.
660 */ 1021 */
661 1022 ret = get_futex_value_locked(&uval, uaddr);
662 ret = get_futex_value_locked(&curval, (int __user *)uaddr);
663 1023
664 if (unlikely(ret)) { 1024 if (unlikely(ret)) {
665 queue_unlock(&q, bh); 1025 queue_unlock(&q, hb);
666 1026
667 /* If we would have faulted, release mmap_sem, fault it in and 1027 /*
1028 * If we would have faulted, release mmap_sem, fault it in and
668 * start all over again. 1029 * start all over again.
669 */ 1030 */
670 up_read(&current->mm->mmap_sem); 1031 up_read(&curr->mm->mmap_sem);
671 1032
672 ret = get_user(curval, (int __user *)uaddr); 1033 ret = get_user(uval, uaddr);
673 1034
674 if (!ret) 1035 if (!ret)
675 goto retry; 1036 goto retry;
676 return ret; 1037 return ret;
677 } 1038 }
678 if (curval != val) { 1039 ret = -EWOULDBLOCK;
679 ret = -EWOULDBLOCK; 1040 if (uval != val)
680 queue_unlock(&q, bh); 1041 goto out_unlock_release_sem;
681 goto out_release_sem;
682 }
683 1042
684 /* Only actually queue if *uaddr contained val. */ 1043 /* Only actually queue if *uaddr contained val. */
685 __queue_me(&q, bh); 1044 __queue_me(&q, hb);
686 1045
687 /* 1046 /*
688 * Now the futex is queued and we have checked the data, we 1047 * Now the futex is queued and we have checked the data, we
689 * don't want to hold mmap_sem while we sleep. 1048 * don't want to hold mmap_sem while we sleep.
690 */ 1049 */
691 up_read(&current->mm->mmap_sem); 1050 up_read(&curr->mm->mmap_sem);
692 1051
693 /* 1052 /*
694 * There might have been scheduling since the queue_me(), as we 1053 * There might have been scheduling since the queue_me(), as we
@@ -720,12 +1079,421 @@ static int futex_wait(unsigned long uaddr, int val, unsigned long time)
720 return 0; 1079 return 0;
721 if (time == 0) 1080 if (time == 0)
722 return -ETIMEDOUT; 1081 return -ETIMEDOUT;
723 /* We expect signal_pending(current), but another thread may 1082 /*
724 * have handled it for us already. */ 1083 * We expect signal_pending(current), but another thread may
1084 * have handled it for us already.
1085 */
725 return -EINTR; 1086 return -EINTR;
726 1087
1088 out_unlock_release_sem:
1089 queue_unlock(&q, hb);
1090
727 out_release_sem: 1091 out_release_sem:
1092 up_read(&curr->mm->mmap_sem);
1093 return ret;
1094}
1095
1096/*
1097 * Userspace tried a 0 -> TID atomic transition of the futex value
1098 * and failed. The kernel side here does the whole locking operation:
1099 * if there are waiters then it will block, it does PI, etc. (Due to
1100 * races the kernel might see a 0 value of the futex too.)
1101 */
1102static int do_futex_lock_pi(u32 __user *uaddr, int detect, int trylock,
1103 struct hrtimer_sleeper *to)
1104{
1105 struct task_struct *curr = current;
1106 struct futex_hash_bucket *hb;
1107 u32 uval, newval, curval;
1108 struct futex_q q;
1109 int ret, attempt = 0;
1110
1111 if (refill_pi_state_cache())
1112 return -ENOMEM;
1113
1114 q.pi_state = NULL;
1115 retry:
1116 down_read(&curr->mm->mmap_sem);
1117
1118 ret = get_futex_key(uaddr, &q.key);
1119 if (unlikely(ret != 0))
1120 goto out_release_sem;
1121
1122 hb = queue_lock(&q, -1, NULL);
1123
1124 retry_locked:
1125 /*
1126 * To avoid races, we attempt to take the lock here again
1127 * (by doing a 0 -> TID atomic cmpxchg), while holding all
1128 * the locks. It will most likely not succeed.
1129 */
1130 newval = current->pid;
1131
1132 inc_preempt_count();
1133 curval = futex_atomic_cmpxchg_inatomic(uaddr, 0, newval);
1134 dec_preempt_count();
1135
1136 if (unlikely(curval == -EFAULT))
1137 goto uaddr_faulted;
1138
1139 /* We own the lock already */
1140 if (unlikely((curval & FUTEX_TID_MASK) == current->pid)) {
1141 if (!detect && 0)
1142 force_sig(SIGKILL, current);
1143 ret = -EDEADLK;
1144 goto out_unlock_release_sem;
1145 }
1146
1147 /*
1148 * Surprise - we got the lock. Just return
1149 * to userspace:
1150 */
1151 if (unlikely(!curval))
1152 goto out_unlock_release_sem;
1153
1154 uval = curval;
1155 newval = uval | FUTEX_WAITERS;
1156
1157 inc_preempt_count();
1158 curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
1159 dec_preempt_count();
1160
1161 if (unlikely(curval == -EFAULT))
1162 goto uaddr_faulted;
1163 if (unlikely(curval != uval))
1164 goto retry_locked;
1165
1166 /*
1167 * We don't have the lock. Look up the PI state (or create it if
1168 * we are the first waiter):
1169 */
1170 ret = lookup_pi_state(uval, hb, &q);
1171
1172 if (unlikely(ret)) {
1173 /*
1174 * There were no waiters and the owner task lookup
1175 * failed. When the OWNER_DIED bit is set, then we
1176 * know that this is a robust futex and we actually
1177 * take the lock. This is safe as we are protected by
1178 * the hash bucket lock. We also set the waiters bit
1179 * unconditionally here, to simplify glibc handling of
1180 * multiple tasks racing to acquire the lock and
1181 * clean up the problems which were left by the dead
1182 * owner.
1183 */
1184 if (curval & FUTEX_OWNER_DIED) {
1185 uval = newval;
1186 newval = current->pid |
1187 FUTEX_OWNER_DIED | FUTEX_WAITERS;
1188
1189 inc_preempt_count();
1190 curval = futex_atomic_cmpxchg_inatomic(uaddr,
1191 uval, newval);
1192 dec_preempt_count();
1193
1194 if (unlikely(curval == -EFAULT))
1195 goto uaddr_faulted;
1196 if (unlikely(curval != uval))
1197 goto retry_locked;
1198 ret = 0;
1199 }
1200 goto out_unlock_release_sem;
1201 }
1202
1203 /*
1204 * Only actually queue now that the atomic ops are done:
1205 */
1206 __queue_me(&q, hb);
1207
1208 /*
1209 * Now the futex is queued and we have checked the data, we
1210 * don't want to hold mmap_sem while we sleep.
1211 */
1212 up_read(&curr->mm->mmap_sem);
1213
1214 WARN_ON(!q.pi_state);
1215 /*
1216 * Block on the PI mutex:
1217 */
1218 if (!trylock)
1219 ret = rt_mutex_timed_lock(&q.pi_state->pi_mutex, to, 1);
1220 else {
1221 ret = rt_mutex_trylock(&q.pi_state->pi_mutex);
1222 /* Fixup the trylock return value: */
1223 ret = ret ? 0 : -EWOULDBLOCK;
1224 }
1225
1226 down_read(&curr->mm->mmap_sem);
1227 spin_lock(q.lock_ptr);
1228
1229 /*
1230 * Got the lock. We might not be the anticipated owner if we
1231 * did a lock-steal - fix up the PI-state in that case.
1232 */
1233 if (!ret && q.pi_state->owner != curr) {
1234 u32 newtid = current->pid | FUTEX_WAITERS;
1235
1236 /* Owner died? */
1237 if (q.pi_state->owner != NULL) {
1238 spin_lock_irq(&q.pi_state->owner->pi_lock);
1239 list_del_init(&q.pi_state->list);
1240 spin_unlock_irq(&q.pi_state->owner->pi_lock);
1241 } else
1242 newtid |= FUTEX_OWNER_DIED;
1243
1244 q.pi_state->owner = current;
1245
1246 spin_lock_irq(&current->pi_lock);
1247 list_add(&q.pi_state->list, &current->pi_state_list);
1248 spin_unlock_irq(&current->pi_lock);
1249
1250 /* Unqueue and drop the lock */
1251 unqueue_me_pi(&q, hb);
1252 up_read(&curr->mm->mmap_sem);
1253 /*
1254 * We own it, so we have to replace the pending owner
1255 * TID. This must be atomic as we have preserve the
1256 * owner died bit here.
1257 */
1258 ret = get_user(uval, uaddr);
1259 while (!ret) {
1260 newval = (uval & FUTEX_OWNER_DIED) | newtid;
1261 curval = futex_atomic_cmpxchg_inatomic(uaddr,
1262 uval, newval);
1263 if (curval == -EFAULT)
1264 ret = -EFAULT;
1265 if (curval == uval)
1266 break;
1267 uval = curval;
1268 }
1269 } else {
1270 /*
1271 * Catch the rare case where the lock was released
1272 * when we were on the way back before we locked
1273 * the hash bucket.
1274 */
1275 if (ret && q.pi_state->owner == curr) {
1276 if (rt_mutex_trylock(&q.pi_state->pi_mutex))
1277 ret = 0;
1278 }
1279 /* Unqueue and drop the lock */
1280 unqueue_me_pi(&q, hb);
1281 up_read(&curr->mm->mmap_sem);
1282 }
1283
1284 if (!detect && ret == -EDEADLK && 0)
1285 force_sig(SIGKILL, current);
1286
1287 return ret;
1288
1289 out_unlock_release_sem:
1290 queue_unlock(&q, hb);
1291
1292 out_release_sem:
1293 up_read(&curr->mm->mmap_sem);
1294 return ret;
1295
1296 uaddr_faulted:
1297 /*
1298 * We have to r/w *(int __user *)uaddr, but we can't modify it
1299 * non-atomically. Therefore, if get_user below is not
1300 * enough, we need to handle the fault ourselves, while
1301 * still holding the mmap_sem.
1302 */
1303 if (attempt++) {
1304 if (futex_handle_fault((unsigned long)uaddr, attempt))
1305 goto out_unlock_release_sem;
1306
1307 goto retry_locked;
1308 }
1309
1310 queue_unlock(&q, hb);
1311 up_read(&curr->mm->mmap_sem);
1312
1313 ret = get_user(uval, uaddr);
1314 if (!ret && (uval != -EFAULT))
1315 goto retry;
1316
1317 return ret;
1318}
1319
1320/*
1321 * Restart handler
1322 */
1323static long futex_lock_pi_restart(struct restart_block *restart)
1324{
1325 struct hrtimer_sleeper timeout, *to = NULL;
1326 int ret;
1327
1328 restart->fn = do_no_restart_syscall;
1329
1330 if (restart->arg2 || restart->arg3) {
1331 to = &timeout;
1332 hrtimer_init(&to->timer, CLOCK_REALTIME, HRTIMER_ABS);
1333 hrtimer_init_sleeper(to, current);
1334 to->timer.expires.tv64 = ((u64)restart->arg1 << 32) |
1335 (u64) restart->arg0;
1336 }
1337
1338 pr_debug("lock_pi restart: %p, %d\n",
1339 (u32 __user *)restart->arg0, current->pid);
1340
1341 ret = do_futex_lock_pi((u32 __user *)restart->arg0, restart->arg1,
1342 0, to);
1343
1344 if (ret != -EINTR)
1345 return ret;
1346
1347 restart->fn = futex_lock_pi_restart;
1348
1349 /* The other values are filled in */
1350 return -ERESTART_RESTARTBLOCK;
1351}
1352
1353/*
1354 * Called from the syscall entry below.
1355 */
1356static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
1357 long nsec, int trylock)
1358{
1359 struct hrtimer_sleeper timeout, *to = NULL;
1360 struct restart_block *restart;
1361 int ret;
1362
1363 if (sec != MAX_SCHEDULE_TIMEOUT) {
1364 to = &timeout;
1365 hrtimer_init(&to->timer, CLOCK_REALTIME, HRTIMER_ABS);
1366 hrtimer_init_sleeper(to, current);
1367 to->timer.expires = ktime_set(sec, nsec);
1368 }
1369
1370 ret = do_futex_lock_pi(uaddr, detect, trylock, to);
1371
1372 if (ret != -EINTR)
1373 return ret;
1374
1375 pr_debug("lock_pi interrupted: %p, %d\n", uaddr, current->pid);
1376
1377 restart = &current_thread_info()->restart_block;
1378 restart->fn = futex_lock_pi_restart;
1379 restart->arg0 = (unsigned long) uaddr;
1380 restart->arg1 = detect;
1381 if (to) {
1382 restart->arg2 = to->timer.expires.tv64 & 0xFFFFFFFF;
1383 restart->arg3 = to->timer.expires.tv64 >> 32;
1384 } else
1385 restart->arg2 = restart->arg3 = 0;
1386
1387 return -ERESTART_RESTARTBLOCK;
1388}
1389
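To connect the kernel path above with its caller, a hedged sketch of the userspace side (not glibc's actual code; pi_lock and futex_word are made-up names): the fast path is a pure 0 -> TID cmpxchg on the futex word, and FUTEX_LOCK_PI is only entered on contention. Assumes <linux/futex.h>, the SYS_futex/SYS_gettid syscall numbers and C11 atomics; real code must also handle errors other than EINTR. The matching unlock is sketched after futex_unlock_pi() below.

#include <errno.h>
#include <stdatomic.h>
#include <stdint.h>
#include <linux/futex.h>
#include <sys/syscall.h>
#include <unistd.h>

static void pi_lock(_Atomic uint32_t *futex_word)
{
	uint32_t expected = 0;
	uint32_t tid = (uint32_t)syscall(SYS_gettid);

	/* Fast path: 0 -> TID in userspace, no kernel entry at all. */
	if (atomic_compare_exchange_strong(futex_word, &expected, tid))
		return;

	/* Contended: the kernel queues us with priority inheritance. */
	while (syscall(SYS_futex, (uint32_t *)futex_word, FUTEX_LOCK_PI,
		       0, NULL, NULL, 0) == -1 && errno == EINTR)
		;	/* retry if interrupted by a signal */
}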
1390/*
1391 * Userspace attempted a TID -> 0 atomic transition, and failed.
1392 * This is the in-kernel slowpath: we look up the PI state (if any),
1393 * and do the rt-mutex unlock.
1394 */
1395static int futex_unlock_pi(u32 __user *uaddr)
1396{
1397 struct futex_hash_bucket *hb;
1398 struct futex_q *this, *next;
1399 u32 uval;
1400 struct list_head *head;
1401 union futex_key key;
1402 int ret, attempt = 0;
1403
1404retry:
1405 if (get_user(uval, uaddr))
1406 return -EFAULT;
1407 /*
1408 * We release only a lock we actually own:
1409 */
1410 if ((uval & FUTEX_TID_MASK) != current->pid)
1411 return -EPERM;
1412 /*
1413 * First take all the futex related locks:
1414 */
1415 down_read(&current->mm->mmap_sem);
1416
1417 ret = get_futex_key(uaddr, &key);
1418 if (unlikely(ret != 0))
1419 goto out;
1420
1421 hb = hash_futex(&key);
1422 spin_lock(&hb->lock);
1423
1424retry_locked:
1425 /*
1426 * To avoid races, try to do the TID -> 0 atomic transition
1427 * again. If it succeeds then we can return without waking
1428 * anyone else up:
1429 */
1430 inc_preempt_count();
1431 uval = futex_atomic_cmpxchg_inatomic(uaddr, current->pid, 0);
1432 dec_preempt_count();
1433
1434 if (unlikely(uval == -EFAULT))
1435 goto pi_faulted;
1436 /*
1437 * Rare case: we managed to release the lock atomically,
1438 * no need to wake anyone else up:
1439 */
1440 if (unlikely(uval == current->pid))
1441 goto out_unlock;
1442
1443 /*
1444 * Ok, other tasks may need to be woken up - check waiters
1445 * and do the wakeup if necessary:
1446 */
1447 head = &hb->chain;
1448
1449 list_for_each_entry_safe(this, next, head, list) {
1450 if (!match_futex (&this->key, &key))
1451 continue;
1452 ret = wake_futex_pi(uaddr, uval, this);
1453 /*
1454 * The atomic access to the futex value
1455 * generated a pagefault, so retry the
1456 * user-access and the wakeup:
1457 */
1458 if (ret == -EFAULT)
1459 goto pi_faulted;
1460 goto out_unlock;
1461 }
1462 /*
1463 * No waiters - kernel unlocks the futex:
1464 */
1465 ret = unlock_futex_pi(uaddr, uval);
1466 if (ret == -EFAULT)
1467 goto pi_faulted;
1468
1469out_unlock:
1470 spin_unlock(&hb->lock);
1471out:
728 up_read(&current->mm->mmap_sem); 1472 up_read(&current->mm->mmap_sem);
1473
1474 return ret;
1475
1476pi_faulted:
1477 /*
1478 * We have to r/w *(int __user *)uaddr, but we can't modify it
1479 * non-atomically. Therefore, if get_user below is not
1480 * enough, we need to handle the fault ourselves, while
1481 * still holding the mmap_sem.
1482 */
1483 if (attempt++) {
1484 if (futex_handle_fault((unsigned long)uaddr, attempt))
1485 goto out_unlock;
1486
1487 goto retry_locked;
1488 }
1489
1490 spin_unlock(&hb->lock);
1491 up_read(&current->mm->mmap_sem);
1492
1493 ret = get_user(uval, uaddr);
1494 if (!ret && (uval != -EFAULT))
1495 goto retry;
1496
729 return ret; 1497 return ret;
730} 1498}
731 1499
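And the matching release from userspace (same assumptions as the locking sketch above): the TID -> 0 cmpxchg only succeeds while the word holds nothing but our own TID; once FUTEX_WAITERS (or FUTEX_OWNER_DIED) is set, the kernel has to hand the lock to the top-priority waiter via the slowpath above.

static void pi_unlock(_Atomic uint32_t *futex_word)
{
	uint32_t expected = (uint32_t)syscall(SYS_gettid);

	/* Fast path: TID -> 0 while no waiter/owner-died bits are set. */
	if (atomic_compare_exchange_strong(futex_word, &expected, 0))
		return;

	/* Slow path: wake_futex_pi()/unlock_futex_pi() do the handover. */
	syscall(SYS_futex, (uint32_t *)futex_word, FUTEX_UNLOCK_PI, 0, NULL, NULL, 0);
}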
@@ -735,6 +1503,7 @@ static int futex_close(struct inode *inode, struct file *filp)
735 1503
736 unqueue_me(q); 1504 unqueue_me(q);
737 kfree(q); 1505 kfree(q);
1506
738 return 0; 1507 return 0;
739} 1508}
740 1509
@@ -766,7 +1535,7 @@ static struct file_operations futex_fops = {
766 * Signal allows caller to avoid the race which would occur if they 1535 * Signal allows caller to avoid the race which would occur if they
767 * set the sigio stuff up afterwards. 1536 * set the sigio stuff up afterwards.
768 */ 1537 */
769static int futex_fd(unsigned long uaddr, int signal) 1538static int futex_fd(u32 __user *uaddr, int signal)
770{ 1539{
771 struct futex_q *q; 1540 struct futex_q *q;
772 struct file *filp; 1541 struct file *filp;
@@ -803,6 +1572,7 @@ static int futex_fd(unsigned long uaddr, int signal)
803 err = -ENOMEM; 1572 err = -ENOMEM;
804 goto error; 1573 goto error;
805 } 1574 }
1575 q->pi_state = NULL;
806 1576
807 down_read(&current->mm->mmap_sem); 1577 down_read(&current->mm->mmap_sem);
808 err = get_futex_key(uaddr, &q->key); 1578 err = get_futex_key(uaddr, &q->key);
@@ -840,7 +1610,7 @@ error:
840 * Implementation: user-space maintains a per-thread list of locks it 1610 * Implementation: user-space maintains a per-thread list of locks it
841 * is holding. Upon do_exit(), the kernel carefully walks this list, 1611 * is holding. Upon do_exit(), the kernel carefully walks this list,
842 * and marks all locks that are owned by this thread with the 1612 * and marks all locks that are owned by this thread with the
843 * FUTEX_OWNER_DEAD bit, and wakes up a waiter (if any). The list is 1613 * FUTEX_OWNER_DIED bit, and wakes up a waiter (if any). The list is
844 * always manipulated with the lock held, so the list is private and 1614 * always manipulated with the lock held, so the list is private and
845 * per-thread. Userspace also maintains a per-thread 'list_op_pending' 1615 * per-thread. Userspace also maintains a per-thread 'list_op_pending'
846 * field, to allow the kernel to clean up if the thread dies after 1616 * field, to allow the kernel to clean up if the thread dies after
@@ -915,7 +1685,7 @@ err_unlock:
915 */ 1685 */
916int handle_futex_death(u32 __user *uaddr, struct task_struct *curr) 1686int handle_futex_death(u32 __user *uaddr, struct task_struct *curr)
917{ 1687{
918 u32 uval; 1688 u32 uval, nval;
919 1689
920retry: 1690retry:
921 if (get_user(uval, uaddr)) 1691 if (get_user(uval, uaddr))
@@ -932,12 +1702,16 @@ retry:
932 * thread-death.) The rest of the cleanup is done in 1702 * thread-death.) The rest of the cleanup is done in
933 * userspace. 1703 * userspace.
934 */ 1704 */
935 if (futex_atomic_cmpxchg_inatomic(uaddr, uval, 1705 nval = futex_atomic_cmpxchg_inatomic(uaddr, uval,
936 uval | FUTEX_OWNER_DIED) != uval) 1706 uval | FUTEX_OWNER_DIED);
1707 if (nval == -EFAULT)
1708 return -1;
1709
1710 if (nval != uval)
937 goto retry; 1711 goto retry;
938 1712
939 if (uval & FUTEX_WAITERS) 1713 if (uval & FUTEX_WAITERS)
940 futex_wake((unsigned long)uaddr, 1); 1714 futex_wake(uaddr, 1);
941 } 1715 }
942 return 0; 1716 return 0;
943} 1717}
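A hedged sketch of what the next locker sees after handle_futex_death(): the kernel has OR-ed FUTEX_OWNER_DIED into the word, so a successful FUTEX_LOCK_PI should be followed by a check for that bit and whatever consistency repair the application needs (glibc reports this situation as EOWNERDEAD from pthread_mutex_lock() on robust mutexes). Same assumptions and made-up helper naming as the earlier userspace sketches:

static int pi_lock_robust(_Atomic uint32_t *futex_word)
{
	if (syscall(SYS_futex, (uint32_t *)futex_word, FUTEX_LOCK_PI,
		    0, NULL, NULL, 0) != 0)
		return -errno;

	if (atomic_load(futex_word) & FUTEX_OWNER_DIED)
		return 1;	/* owner died: caller must repair the protected data */
	return 0;
}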
@@ -978,7 +1752,7 @@ void exit_robust_list(struct task_struct *curr)
978 while (entry != &head->list) { 1752 while (entry != &head->list) {
979 /* 1753 /*
980 * A pending lock might already be on the list, so 1754 * A pending lock might already be on the list, so
981 * dont process it twice: 1755 * don't process it twice:
982 */ 1756 */
983 if (entry != pending) 1757 if (entry != pending)
984 if (handle_futex_death((void *)entry + futex_offset, 1758 if (handle_futex_death((void *)entry + futex_offset,
@@ -999,8 +1773,8 @@ void exit_robust_list(struct task_struct *curr)
999 } 1773 }
1000} 1774}
1001 1775
1002long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout, 1776long do_futex(u32 __user *uaddr, int op, u32 val, unsigned long timeout,
1003 unsigned long uaddr2, int val2, int val3) 1777 u32 __user *uaddr2, u32 val2, u32 val3)
1004{ 1778{
1005 int ret; 1779 int ret;
1006 1780
@@ -1024,6 +1798,15 @@ long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout,
1024 case FUTEX_WAKE_OP: 1798 case FUTEX_WAKE_OP:
1025 ret = futex_wake_op(uaddr, uaddr2, val, val2, val3); 1799 ret = futex_wake_op(uaddr, uaddr2, val, val2, val3);
1026 break; 1800 break;
1801 case FUTEX_LOCK_PI:
1802 ret = futex_lock_pi(uaddr, val, timeout, val2, 0);
1803 break;
1804 case FUTEX_UNLOCK_PI:
1805 ret = futex_unlock_pi(uaddr);
1806 break;
1807 case FUTEX_TRYLOCK_PI:
1808 ret = futex_lock_pi(uaddr, 0, timeout, val2, 1);
1809 break;
1027 default: 1810 default:
1028 ret = -ENOSYS; 1811 ret = -ENOSYS;
1029 } 1812 }
@@ -1031,36 +1814,40 @@ long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout,
1031} 1814}
1032 1815
1033 1816
1034asmlinkage long sys_futex(u32 __user *uaddr, int op, int val, 1817asmlinkage long sys_futex(u32 __user *uaddr, int op, u32 val,
1035 struct timespec __user *utime, u32 __user *uaddr2, 1818 struct timespec __user *utime, u32 __user *uaddr2,
1036 int val3) 1819 u32 val3)
1037{ 1820{
1038 struct timespec t; 1821 struct timespec t;
1039 unsigned long timeout = MAX_SCHEDULE_TIMEOUT; 1822 unsigned long timeout = MAX_SCHEDULE_TIMEOUT;
1040 int val2 = 0; 1823 u32 val2 = 0;
1041 1824
1042 if (utime && (op == FUTEX_WAIT)) { 1825 if (utime && (op == FUTEX_WAIT || op == FUTEX_LOCK_PI)) {
1043 if (copy_from_user(&t, utime, sizeof(t)) != 0) 1826 if (copy_from_user(&t, utime, sizeof(t)) != 0)
1044 return -EFAULT; 1827 return -EFAULT;
1045 if (!timespec_valid(&t)) 1828 if (!timespec_valid(&t))
1046 return -EINVAL; 1829 return -EINVAL;
1047 timeout = timespec_to_jiffies(&t) + 1; 1830 if (op == FUTEX_WAIT)
1831 timeout = timespec_to_jiffies(&t) + 1;
1832 else {
1833 timeout = t.tv_sec;
1834 val2 = t.tv_nsec;
1835 }
1048 } 1836 }
1049 /* 1837 /*
1050 * requeue parameter in 'utime' if op == FUTEX_REQUEUE. 1838 * requeue parameter in 'utime' if op == FUTEX_REQUEUE.
1051 */ 1839 */
1052 if (op >= FUTEX_REQUEUE) 1840 if (op == FUTEX_REQUEUE || op == FUTEX_CMP_REQUEUE)
1053 val2 = (int) (unsigned long) utime; 1841 val2 = (u32) (unsigned long) utime;
1054 1842
1055 return do_futex((unsigned long)uaddr, op, val, timeout, 1843 return do_futex(uaddr, op, val, timeout, uaddr2, val2, val3);
1056 (unsigned long)uaddr2, val2, val3);
1057} 1844}
1058 1845
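One caller-side consequence of the utime handling above (hedged sketch, same assumptions as the other userspace examples, plus <time.h>): for FUTEX_LOCK_PI the timespec is an absolute CLOCK_REALTIME deadline, split into seconds ('timeout') and nanoseconds ('val2') for do_futex(), whereas FUTEX_WAIT keeps its relative timeout.

/* Try to take a PI futex, giving up 'secs' seconds from now. */
static int pi_lock_deadline(uint32_t *uaddr, int secs)
{
	struct timespec deadline;

	clock_gettime(CLOCK_REALTIME, &deadline);
	deadline.tv_sec += secs;

	if (syscall(SYS_futex, uaddr, FUTEX_LOCK_PI, 0, &deadline, NULL, 0) == 0)
		return 0;
	return -errno;	/* -ETIMEDOUT once the deadline has passed */
}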
1059static struct super_block * 1846static int futexfs_get_sb(struct file_system_type *fs_type,
1060futexfs_get_sb(struct file_system_type *fs_type, 1847 int flags, const char *dev_name, void *data,
1061 int flags, const char *dev_name, void *data) 1848 struct vfsmount *mnt)
1062{ 1849{
1063 return get_sb_pseudo(fs_type, "futex", NULL, 0xBAD1DEA); 1850 return get_sb_pseudo(fs_type, "futex", NULL, 0xBAD1DEA, mnt);
1064} 1851}
1065 1852
1066static struct file_system_type futex_fs_type = { 1853static struct file_system_type futex_fs_type = {
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index 1ab6a0ea3d14..d1d92b441fb7 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -129,16 +129,20 @@ asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val,
129 unsigned long timeout = MAX_SCHEDULE_TIMEOUT; 129 unsigned long timeout = MAX_SCHEDULE_TIMEOUT;
130 int val2 = 0; 130 int val2 = 0;
131 131
132 if (utime && (op == FUTEX_WAIT)) { 132 if (utime && (op == FUTEX_WAIT || op == FUTEX_LOCK_PI)) {
133 if (get_compat_timespec(&t, utime)) 133 if (get_compat_timespec(&t, utime))
134 return -EFAULT; 134 return -EFAULT;
135 if (!timespec_valid(&t)) 135 if (!timespec_valid(&t))
136 return -EINVAL; 136 return -EINVAL;
137 timeout = timespec_to_jiffies(&t) + 1; 137 if (op == FUTEX_WAIT)
138 timeout = timespec_to_jiffies(&t) + 1;
139 else {
140 timeout = t.tv_sec;
141 val2 = t.tv_nsec;
142 }
138 } 143 }
139 if (op >= FUTEX_REQUEUE) 144 if (op == FUTEX_REQUEUE || op == FUTEX_CMP_REQUEUE)
140 val2 = (int) (unsigned long) utime; 145 val2 = (int) (unsigned long) utime;
141 146
142 return do_futex((unsigned long)uaddr, op, val, timeout, 147 return do_futex(uaddr, op, val, timeout, uaddr2, val2, val3);
143 (unsigned long)uaddr2, val2, val3);
144} 148}
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 01fa2ae98a85..d17766d40dab 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -98,7 +98,6 @@ static DEFINE_PER_CPU(struct hrtimer_base, hrtimer_bases[MAX_HRTIMER_BASES]) =
98 98
99/** 99/**
100 * ktime_get_ts - get the monotonic clock in timespec format 100 * ktime_get_ts - get the monotonic clock in timespec format
101 *
102 * @ts: pointer to timespec variable 101 * @ts: pointer to timespec variable
103 * 102 *
104 * The function calculates the monotonic clock from the realtime 103 * The function calculates the monotonic clock from the realtime
@@ -238,7 +237,6 @@ lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
238# ifndef CONFIG_KTIME_SCALAR 237# ifndef CONFIG_KTIME_SCALAR
239/** 238/**
240 * ktime_add_ns - Add a scalar nanoseconds value to a ktime_t variable 239 * ktime_add_ns - Add a scalar nanoseconds value to a ktime_t variable
241 *
242 * @kt: addend 240 * @kt: addend
243 * @nsec: the scalar nsec value to add 241 * @nsec: the scalar nsec value to add
244 * 242 *
@@ -299,7 +297,6 @@ void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
299 297
300/** 298/**
301 * hrtimer_forward - forward the timer expiry 299 * hrtimer_forward - forward the timer expiry
302 *
303 * @timer: hrtimer to forward 300 * @timer: hrtimer to forward
304 * @now: forward past this time 301 * @now: forward past this time
305 * @interval: the interval to forward 302 * @interval: the interval to forward
@@ -393,7 +390,7 @@ static void __remove_hrtimer(struct hrtimer *timer, struct hrtimer_base *base)
393 if (base->first == &timer->node) 390 if (base->first == &timer->node)
394 base->first = rb_next(&timer->node); 391 base->first = rb_next(&timer->node);
395 rb_erase(&timer->node, &base->active); 392 rb_erase(&timer->node, &base->active);
396 timer->node.rb_parent = HRTIMER_INACTIVE; 393 rb_set_parent(&timer->node, &timer->node);
397} 394}
398 395
399/* 396/*
@@ -411,7 +408,6 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_base *base)
411 408
412/** 409/**
413 * hrtimer_start - (re)start an relative timer on the current CPU 410 * hrtimer_start - (re)start an relative timer on the current CPU
414 *
415 * @timer: the timer to be added 411 * @timer: the timer to be added
416 * @tim: expiry time 412 * @tim: expiry time
417 * @mode: expiry mode: absolute (HRTIMER_ABS) or relative (HRTIMER_REL) 413 * @mode: expiry mode: absolute (HRTIMER_ABS) or relative (HRTIMER_REL)
@@ -460,14 +456,13 @@ EXPORT_SYMBOL_GPL(hrtimer_start);
460 456
461/** 457/**
462 * hrtimer_try_to_cancel - try to deactivate a timer 458 * hrtimer_try_to_cancel - try to deactivate a timer
463 *
464 * @timer: hrtimer to stop 459 * @timer: hrtimer to stop
465 * 460 *
466 * Returns: 461 * Returns:
467 * 0 when the timer was not active 462 * 0 when the timer was not active
468 * 1 when the timer was active 463 * 1 when the timer was active
469 * -1 when the timer is currently executing the callback function and 464 * -1 when the timer is currently executing the callback function and
470 * can not be stopped 465 * cannot be stopped
471 */ 466 */
472int hrtimer_try_to_cancel(struct hrtimer *timer) 467int hrtimer_try_to_cancel(struct hrtimer *timer)
473{ 468{
@@ -489,7 +484,6 @@ EXPORT_SYMBOL_GPL(hrtimer_try_to_cancel);
489 484
490/** 485/**
491 * hrtimer_cancel - cancel a timer and wait for the handler to finish. 486 * hrtimer_cancel - cancel a timer and wait for the handler to finish.
492 *
493 * @timer: the timer to be cancelled 487 * @timer: the timer to be cancelled
494 * 488 *
495 * Returns: 489 * Returns:
@@ -510,7 +504,6 @@ EXPORT_SYMBOL_GPL(hrtimer_cancel);
510 504
511/** 505/**
512 * hrtimer_get_remaining - get remaining time for the timer 506 * hrtimer_get_remaining - get remaining time for the timer
513 *
514 * @timer: the timer to read 507 * @timer: the timer to read
515 */ 508 */
516ktime_t hrtimer_get_remaining(const struct hrtimer *timer) 509ktime_t hrtimer_get_remaining(const struct hrtimer *timer)
@@ -564,7 +557,6 @@ ktime_t hrtimer_get_next_event(void)
564 557
565/** 558/**
566 * hrtimer_init - initialize a timer to the given clock 559 * hrtimer_init - initialize a timer to the given clock
567 *
568 * @timer: the timer to be initialized 560 * @timer: the timer to be initialized
569 * @clock_id: the clock to be used 561 * @clock_id: the clock to be used
570 * @mode: timer mode abs/rel 562 * @mode: timer mode abs/rel
@@ -576,19 +568,18 @@ void hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
576 568
577 memset(timer, 0, sizeof(struct hrtimer)); 569 memset(timer, 0, sizeof(struct hrtimer));
578 570
579 bases = per_cpu(hrtimer_bases, raw_smp_processor_id()); 571 bases = __raw_get_cpu_var(hrtimer_bases);
580 572
581 if (clock_id == CLOCK_REALTIME && mode != HRTIMER_ABS) 573 if (clock_id == CLOCK_REALTIME && mode != HRTIMER_ABS)
582 clock_id = CLOCK_MONOTONIC; 574 clock_id = CLOCK_MONOTONIC;
583 575
584 timer->base = &bases[clock_id]; 576 timer->base = &bases[clock_id];
585 timer->node.rb_parent = HRTIMER_INACTIVE; 577 rb_set_parent(&timer->node, &timer->node);
586} 578}
587EXPORT_SYMBOL_GPL(hrtimer_init); 579EXPORT_SYMBOL_GPL(hrtimer_init);
588 580
589/** 581/**
590 * hrtimer_get_res - get the timer resolution for a clock 582 * hrtimer_get_res - get the timer resolution for a clock
591 *
592 * @which_clock: which clock to query 583 * @which_clock: which clock to query
593 * @tp: pointer to timespec variable to store the resolution 584 * @tp: pointer to timespec variable to store the resolution
594 * 585 *
@@ -599,7 +590,7 @@ int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp)
599{ 590{
600 struct hrtimer_base *bases; 591 struct hrtimer_base *bases;
601 592
602 bases = per_cpu(hrtimer_bases, raw_smp_processor_id()); 593 bases = __raw_get_cpu_var(hrtimer_bases);
603 *tp = ktime_to_timespec(bases[which_clock].resolution); 594 *tp = ktime_to_timespec(bases[which_clock].resolution);
604 595
605 return 0; 596 return 0;
@@ -678,7 +669,7 @@ static int hrtimer_wakeup(struct hrtimer *timer)
678 return HRTIMER_NORESTART; 669 return HRTIMER_NORESTART;
679} 670}
680 671
681void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, task_t *task) 672void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task)
682{ 673{
683 sl->timer.function = hrtimer_wakeup; 674 sl->timer.function = hrtimer_wakeup;
684 sl->task = task; 675 sl->task = task;
@@ -791,8 +782,10 @@ static void __devinit init_hrtimers_cpu(int cpu)
791 struct hrtimer_base *base = per_cpu(hrtimer_bases, cpu); 782 struct hrtimer_base *base = per_cpu(hrtimer_bases, cpu);
792 int i; 783 int i;
793 784
794 for (i = 0; i < MAX_HRTIMER_BASES; i++, base++) 785 for (i = 0; i < MAX_HRTIMER_BASES; i++, base++) {
795 spin_lock_init(&base->lock); 786 spin_lock_init(&base->lock);
787 lockdep_set_class(&base->lock, &base->lock_key);
788 }
796} 789}
797 790
798#ifdef CONFIG_HOTPLUG_CPU 791#ifdef CONFIG_HOTPLUG_CPU
@@ -842,7 +835,7 @@ static void migrate_hrtimers(int cpu)
842} 835}
843#endif /* CONFIG_HOTPLUG_CPU */ 836#endif /* CONFIG_HOTPLUG_CPU */
844 837
845static int hrtimer_cpu_notify(struct notifier_block *self, 838static int __devinit hrtimer_cpu_notify(struct notifier_block *self,
846 unsigned long action, void *hcpu) 839 unsigned long action, void *hcpu)
847{ 840{
848 long cpu = (long)hcpu; 841 long cpu = (long)hcpu;
@@ -866,7 +859,7 @@ static int hrtimer_cpu_notify(struct notifier_block *self,
866 return NOTIFY_OK; 859 return NOTIFY_OK;
867} 860}
868 861
869static struct notifier_block hrtimers_nb = { 862static struct notifier_block __devinitdata hrtimers_nb = {
870 .notifier_call = hrtimer_cpu_notify, 863 .notifier_call = hrtimer_cpu_notify,
871}; 864};
872 865
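For reference, the driver-facing hrtimer conventions touched above look like this in use. A minimal sketch, assuming the API as it stands at this point in the tree (hrtimer_init()/hrtimer_start(), HRTIMER_REL, a callback returning HRTIMER_NORESTART); my_timer, my_timeout() and my_arm_timer() are invented names, not part of the patch.

#include <linux/hrtimer.h>
#include <linux/ktime.h>
#include <linux/time.h>

static struct hrtimer my_timer;

/* Runs in hard-irq context; keep it short. */
static int my_timeout(struct hrtimer *timer)
{
	/* ... wake somebody up, schedule work, etc. ... */
	return HRTIMER_NORESTART;	/* one-shot: do not re-arm */
}

static void my_arm_timer(void)
{
	/* Relative timer: CLOCK_REALTIME would be remapped to CLOCK_MONOTONIC. */
	hrtimer_init(&my_timer, CLOCK_MONOTONIC, HRTIMER_REL);
	my_timer.function = my_timeout;

	/* Fire 500ms from now. */
	hrtimer_start(&my_timer, ktime_set(0, 500 * NSEC_PER_MSEC), HRTIMER_REL);
}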
diff --git a/kernel/intermodule.c b/kernel/intermodule.c
deleted file mode 100644
index 55b1e5b85db9..000000000000
--- a/kernel/intermodule.c
+++ /dev/null
@@ -1,184 +0,0 @@
1/* Deprecated, do not use. Moved from module.c to here. --RR */
2
3/* Written by Keith Owens <kaos@ocs.com.au> Oct 2000 */
4#include <linux/module.h>
5#include <linux/kmod.h>
6#include <linux/spinlock.h>
7#include <linux/list.h>
8#include <linux/slab.h>
9
10/* inter_module functions are always available, even when the kernel is
11 * compiled without modules. Consumers of inter_module_xxx routines
12 * will always work, even when both are built into the kernel, this
13 * approach removes lots of #ifdefs in mainline code.
14 */
15
16static struct list_head ime_list = LIST_HEAD_INIT(ime_list);
17static DEFINE_SPINLOCK(ime_lock);
18static int kmalloc_failed;
19
20struct inter_module_entry {
21 struct list_head list;
22 const char *im_name;
23 struct module *owner;
24 const void *userdata;
25};
26
27/**
28 * inter_module_register - register a new set of inter module data.
29 * @im_name: an arbitrary string to identify the data, must be unique
30 * @owner: module that is registering the data, always use THIS_MODULE
31 * @userdata: pointer to arbitrary userdata to be registered
32 *
33 * Description: Check that the im_name has not already been registered,
34 * complain if it has. For new data, add it to the inter_module_entry
35 * list.
36 */
37void inter_module_register(const char *im_name, struct module *owner, const void *userdata)
38{
39 struct list_head *tmp;
40 struct inter_module_entry *ime, *ime_new;
41
42 if (!(ime_new = kzalloc(sizeof(*ime), GFP_KERNEL))) {
43 /* Overloaded kernel, not fatal */
44 printk(KERN_ERR
45 "Aiee, inter_module_register: cannot kmalloc entry for '%s'\n",
46 im_name);
47 kmalloc_failed = 1;
48 return;
49 }
50 ime_new->im_name = im_name;
51 ime_new->owner = owner;
52 ime_new->userdata = userdata;
53
54 spin_lock(&ime_lock);
55 list_for_each(tmp, &ime_list) {
56 ime = list_entry(tmp, struct inter_module_entry, list);
57 if (strcmp(ime->im_name, im_name) == 0) {
58 spin_unlock(&ime_lock);
59 kfree(ime_new);
60 /* Program logic error, fatal */
61 printk(KERN_ERR "inter_module_register: duplicate im_name '%s'", im_name);
62 BUG();
63 }
64 }
65 list_add(&(ime_new->list), &ime_list);
66 spin_unlock(&ime_lock);
67}
68
69/**
70 * inter_module_unregister - unregister a set of inter module data.
71 * @im_name: an arbitrary string to identify the data, must be unique
72 *
73 * Description: Check that the im_name has been registered, complain if
74 * it has not. For existing data, remove it from the
75 * inter_module_entry list.
76 */
77void inter_module_unregister(const char *im_name)
78{
79 struct list_head *tmp;
80 struct inter_module_entry *ime;
81
82 spin_lock(&ime_lock);
83 list_for_each(tmp, &ime_list) {
84 ime = list_entry(tmp, struct inter_module_entry, list);
85 if (strcmp(ime->im_name, im_name) == 0) {
86 list_del(&(ime->list));
87 spin_unlock(&ime_lock);
88 kfree(ime);
89 return;
90 }
91 }
92 spin_unlock(&ime_lock);
93 if (kmalloc_failed) {
94 printk(KERN_ERR
95 "inter_module_unregister: no entry for '%s', "
96 "probably caused by previous kmalloc failure\n",
97 im_name);
98 return;
99 }
100 else {
101 /* Program logic error, fatal */
102 printk(KERN_ERR "inter_module_unregister: no entry for '%s'", im_name);
103 BUG();
104 }
105}
106
107/**
108 * inter_module_get - return arbitrary userdata from another module.
109 * @im_name: an arbitrary string to identify the data, must be unique
110 *
111 * Description: If the im_name has not been registered, return NULL.
112 * Try to increment the use count on the owning module, if that fails
113 * then return NULL. Otherwise return the userdata.
114 */
115static const void *inter_module_get(const char *im_name)
116{
117 struct list_head *tmp;
118 struct inter_module_entry *ime;
119 const void *result = NULL;
120
121 spin_lock(&ime_lock);
122 list_for_each(tmp, &ime_list) {
123 ime = list_entry(tmp, struct inter_module_entry, list);
124 if (strcmp(ime->im_name, im_name) == 0) {
125 if (try_module_get(ime->owner))
126 result = ime->userdata;
127 break;
128 }
129 }
130 spin_unlock(&ime_lock);
131 return(result);
132}
133
134/**
135 * inter_module_get_request - im get with automatic request_module.
136 * @im_name: an arbitrary string to identify the data, must be unique
137 * @modname: module that is expected to register im_name
138 *
139 * Description: If inter_module_get fails, do request_module then retry.
140 */
141const void *inter_module_get_request(const char *im_name, const char *modname)
142{
143 const void *result = inter_module_get(im_name);
144 if (!result) {
145 request_module("%s", modname);
146 result = inter_module_get(im_name);
147 }
148 return(result);
149}
150
151/**
152 * inter_module_put - release use of data from another module.
153 * @im_name: an arbitrary string to identify the data, must be unique
154 *
155 * Description: If the im_name has not been registered, complain,
156 * otherwise decrement the use count on the owning module.
157 */
158void inter_module_put(const char *im_name)
159{
160 struct list_head *tmp;
161 struct inter_module_entry *ime;
162
163 spin_lock(&ime_lock);
164 list_for_each(tmp, &ime_list) {
165 ime = list_entry(tmp, struct inter_module_entry, list);
166 if (strcmp(ime->im_name, im_name) == 0) {
167 if (ime->owner)
168 module_put(ime->owner);
169 spin_unlock(&ime_lock);
170 return;
171 }
172 }
173 spin_unlock(&ime_lock);
174 printk(KERN_ERR "inter_module_put: no entry for '%s'", im_name);
175 BUG();
176}
177
178EXPORT_SYMBOL(inter_module_register);
179EXPORT_SYMBOL(inter_module_unregister);
180EXPORT_SYMBOL(inter_module_get_request);
181EXPORT_SYMBOL(inter_module_put);
182
183MODULE_LICENSE("GPL");
184
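For context on what this removal retires: the deprecated API let one module publish a pointer under a string key and another module look it up while pinning the provider's refcount. A hedged sketch of the old usage pattern; struct example_ops, the "im_example_ops" key and the "example_provider" module name are made up for illustration, and the two halves would normally live in two separate modules.

#include <linux/module.h>
#include <linux/init.h>
#include <linux/errno.h>

struct example_ops {
	int (*do_thing)(void);
};

static int example_do_thing(void) { return 0; }
static struct example_ops my_ops = { .do_thing = example_do_thing };

/* Provider module: publish the pointer under a well-known key. */
static int __init provider_init(void)
{
	inter_module_register("im_example_ops", THIS_MODULE, &my_ops);
	return 0;
}

static void __exit provider_exit(void)
{
	inter_module_unregister("im_example_ops");
}

/* Consumer module: look the pointer up, loading the provider if needed. */
static int __init consumer_init(void)
{
	const struct example_ops *ops =
		inter_module_get_request("im_example_ops", "example_provider");

	if (!ops)
		return -ENODEV;
	ops->do_thing();
	inter_module_put("im_example_ops");	/* drops the provider's refcount */
	return 0;
}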
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile
index 9f77f50d8143..1dab0ac3f797 100644
--- a/kernel/irq/Makefile
+++ b/kernel/irq/Makefile
@@ -1,5 +1,5 @@
1 1
2obj-y := handle.o manage.o spurious.o 2obj-y := handle.o manage.o spurious.o resend.o chip.o
3obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o 3obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o
4obj-$(CONFIG_PROC_FS) += proc.o 4obj-$(CONFIG_PROC_FS) += proc.o
5obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o 5obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o
diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c
index 3467097ca61a..533068cfb607 100644
--- a/kernel/irq/autoprobe.c
+++ b/kernel/irq/autoprobe.c
@@ -11,12 +11,14 @@
11#include <linux/interrupt.h> 11#include <linux/interrupt.h>
12#include <linux/delay.h> 12#include <linux/delay.h>
13 13
14#include "internals.h"
15
14/* 16/*
15 * Autodetection depends on the fact that any interrupt that 17 * Autodetection depends on the fact that any interrupt that
16 * comes in on to an unassigned handler will get stuck with 18 * comes in on to an unassigned handler will get stuck with
17 * "IRQ_WAITING" cleared and the interrupt disabled. 19 * "IRQ_WAITING" cleared and the interrupt disabled.
18 */ 20 */
19static DECLARE_MUTEX(probe_sem); 21static DEFINE_MUTEX(probing_active);
20 22
21/** 23/**
22 * probe_irq_on - begin an interrupt autodetect 24 * probe_irq_on - begin an interrupt autodetect
@@ -27,11 +29,11 @@ static DECLARE_MUTEX(probe_sem);
27 */ 29 */
28unsigned long probe_irq_on(void) 30unsigned long probe_irq_on(void)
29{ 31{
30 unsigned long val; 32 struct irq_desc *desc;
31 irq_desc_t *desc; 33 unsigned long mask;
32 unsigned int i; 34 unsigned int i;
33 35
34 down(&probe_sem); 36 mutex_lock(&probing_active);
35 /* 37 /*
36 * something may have generated an irq long ago and we want to 38 * something may have generated an irq long ago and we want to
37 * flush such a longstanding irq before considering it as spurious. 39 * flush such a longstanding irq before considering it as spurious.
@@ -40,8 +42,21 @@ unsigned long probe_irq_on(void)
40 desc = irq_desc + i; 42 desc = irq_desc + i;
41 43
42 spin_lock_irq(&desc->lock); 44 spin_lock_irq(&desc->lock);
43 if (!irq_desc[i].action) 45 if (!desc->action && !(desc->status & IRQ_NOPROBE)) {
44 irq_desc[i].handler->startup(i); 46 /*
47 * An old-style architecture might still have
48 * the handle_bad_irq handler there:
49 */
50 compat_irq_chip_set_default_handler(desc);
51
52 /*
53 * Some chips need to know about probing in
54 * progress:
55 */
56 if (desc->chip->set_type)
57 desc->chip->set_type(i, IRQ_TYPE_PROBE);
58 desc->chip->startup(i);
59 }
45 spin_unlock_irq(&desc->lock); 60 spin_unlock_irq(&desc->lock);
46 } 61 }
47 62
@@ -57,9 +72,9 @@ unsigned long probe_irq_on(void)
57 desc = irq_desc + i; 72 desc = irq_desc + i;
58 73
59 spin_lock_irq(&desc->lock); 74 spin_lock_irq(&desc->lock);
60 if (!desc->action) { 75 if (!desc->action && !(desc->status & IRQ_NOPROBE)) {
61 desc->status |= IRQ_AUTODETECT | IRQ_WAITING; 76 desc->status |= IRQ_AUTODETECT | IRQ_WAITING;
62 if (desc->handler->startup(i)) 77 if (desc->chip->startup(i))
63 desc->status |= IRQ_PENDING; 78 desc->status |= IRQ_PENDING;
64 } 79 }
65 spin_unlock_irq(&desc->lock); 80 spin_unlock_irq(&desc->lock);
@@ -73,11 +88,11 @@ unsigned long probe_irq_on(void)
73 /* 88 /*
74 * Now filter out any obviously spurious interrupts 89 * Now filter out any obviously spurious interrupts
75 */ 90 */
76 val = 0; 91 mask = 0;
77 for (i = 0; i < NR_IRQS; i++) { 92 for (i = 0; i < NR_IRQS; i++) {
78 irq_desc_t *desc = irq_desc + i;
79 unsigned int status; 93 unsigned int status;
80 94
95 desc = irq_desc + i;
81 spin_lock_irq(&desc->lock); 96 spin_lock_irq(&desc->lock);
82 status = desc->status; 97 status = desc->status;
83 98
@@ -85,17 +100,16 @@ unsigned long probe_irq_on(void)
85 /* It triggered already - consider it spurious. */ 100 /* It triggered already - consider it spurious. */
86 if (!(status & IRQ_WAITING)) { 101 if (!(status & IRQ_WAITING)) {
87 desc->status = status & ~IRQ_AUTODETECT; 102 desc->status = status & ~IRQ_AUTODETECT;
88 desc->handler->shutdown(i); 103 desc->chip->shutdown(i);
89 } else 104 } else
90 if (i < 32) 105 if (i < 32)
91 val |= 1 << i; 106 mask |= 1 << i;
92 } 107 }
93 spin_unlock_irq(&desc->lock); 108 spin_unlock_irq(&desc->lock);
94 } 109 }
95 110
96 return val; 111 return mask;
97} 112}
98
99EXPORT_SYMBOL(probe_irq_on); 113EXPORT_SYMBOL(probe_irq_on);
100 114
101/** 115/**
@@ -117,7 +131,7 @@ unsigned int probe_irq_mask(unsigned long val)
117 131
118 mask = 0; 132 mask = 0;
119 for (i = 0; i < NR_IRQS; i++) { 133 for (i = 0; i < NR_IRQS; i++) {
120 irq_desc_t *desc = irq_desc + i; 134 struct irq_desc *desc = irq_desc + i;
121 unsigned int status; 135 unsigned int status;
122 136
123 spin_lock_irq(&desc->lock); 137 spin_lock_irq(&desc->lock);
@@ -128,11 +142,11 @@ unsigned int probe_irq_mask(unsigned long val)
128 mask |= 1 << i; 142 mask |= 1 << i;
129 143
130 desc->status = status & ~IRQ_AUTODETECT; 144 desc->status = status & ~IRQ_AUTODETECT;
131 desc->handler->shutdown(i); 145 desc->chip->shutdown(i);
132 } 146 }
133 spin_unlock_irq(&desc->lock); 147 spin_unlock_irq(&desc->lock);
134 } 148 }
135 up(&probe_sem); 149 mutex_unlock(&probing_active);
136 150
137 return mask & val; 151 return mask & val;
138} 152}
@@ -160,7 +174,7 @@ int probe_irq_off(unsigned long val)
160 int i, irq_found = 0, nr_irqs = 0; 174 int i, irq_found = 0, nr_irqs = 0;
161 175
162 for (i = 0; i < NR_IRQS; i++) { 176 for (i = 0; i < NR_IRQS; i++) {
163 irq_desc_t *desc = irq_desc + i; 177 struct irq_desc *desc = irq_desc + i;
164 unsigned int status; 178 unsigned int status;
165 179
166 spin_lock_irq(&desc->lock); 180 spin_lock_irq(&desc->lock);
@@ -173,16 +187,16 @@ int probe_irq_off(unsigned long val)
173 nr_irqs++; 187 nr_irqs++;
174 } 188 }
175 desc->status = status & ~IRQ_AUTODETECT; 189 desc->status = status & ~IRQ_AUTODETECT;
176 desc->handler->shutdown(i); 190 desc->chip->shutdown(i);
177 } 191 }
178 spin_unlock_irq(&desc->lock); 192 spin_unlock_irq(&desc->lock);
179 } 193 }
180 up(&probe_sem); 194 mutex_unlock(&probing_active);
181 195
182 if (nr_irqs > 1) 196 if (nr_irqs > 1)
183 irq_found = -irq_found; 197 irq_found = -irq_found;
198
184 return irq_found; 199 return irq_found;
185} 200}
186
187EXPORT_SYMBOL(probe_irq_off); 201EXPORT_SYMBOL(probe_irq_off);
188 202
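As a reminder of how the probing helpers reworked above are driven from a driver, a hedged sketch of the classic autodetect sequence; trigger_device_interrupt() stands in for whatever device-specific poke makes the card raise its line.

#include <linux/interrupt.h>
#include <linux/delay.h>
#include <linux/errno.h>

/* Device-specific: cause the hardware to generate one interrupt. */
static void trigger_device_interrupt(void)
{
	/* e.g. set a "raise test interrupt" bit in an I/O register */
}

static int my_detect_irq(void)
{
	unsigned long mask;
	int irq;

	mask = probe_irq_on();		/* start up all unassigned lines */
	trigger_device_interrupt();
	mdelay(10);			/* give the interrupt time to arrive */
	irq = probe_irq_off(mask);	/* >0: the irq, 0: none, <0: several fired */

	return (irq > 0) ? irq : -ENODEV;
}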
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
new file mode 100644
index 000000000000..9336f2e89e40
--- /dev/null
+++ b/kernel/irq/chip.c
@@ -0,0 +1,537 @@
1/*
2 * linux/kernel/irq/chip.c
3 *
4 * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar
5 * Copyright (C) 2005-2006, Thomas Gleixner, Russell King
6 *
7 * This file contains the core interrupt handling code, for irq-chip
8 * based architectures.
9 *
10 * Detailed information is available in Documentation/DocBook/genericirq
11 */
12
13#include <linux/irq.h>
14#include <linux/module.h>
15#include <linux/interrupt.h>
16#include <linux/kernel_stat.h>
17
18#include "internals.h"
19
20/**
21 * set_irq_chip - set the irq chip for an irq
22 * @irq: irq number
23 * @chip: pointer to irq chip description structure
24 */
25int set_irq_chip(unsigned int irq, struct irq_chip *chip)
26{
27 struct irq_desc *desc;
28 unsigned long flags;
29
30 if (irq >= NR_IRQS) {
31 printk(KERN_ERR "Trying to install chip for IRQ%d\n", irq);
32 WARN_ON(1);
33 return -EINVAL;
34 }
35
36 if (!chip)
37 chip = &no_irq_chip;
38
39 desc = irq_desc + irq;
40 spin_lock_irqsave(&desc->lock, flags);
41 irq_chip_set_defaults(chip);
42 desc->chip = chip;
43 /*
44 * For compatibility only:
45 */
46 desc->chip = chip;
47 spin_unlock_irqrestore(&desc->lock, flags);
48
49 return 0;
50}
51EXPORT_SYMBOL(set_irq_chip);
52
53/**
54 * set_irq_type - set the irq type for an irq
55 * @irq: irq number
56 * @type: interrupt type - see include/linux/interrupt.h
57 */
58int set_irq_type(unsigned int irq, unsigned int type)
59{
60 struct irq_desc *desc;
61 unsigned long flags;
62 int ret = -ENXIO;
63
64 if (irq >= NR_IRQS) {
65 printk(KERN_ERR "Trying to set irq type for IRQ%d\n", irq);
66 return -ENODEV;
67 }
68
69 desc = irq_desc + irq;
70 if (desc->chip->set_type) {
71 spin_lock_irqsave(&desc->lock, flags);
72 ret = desc->chip->set_type(irq, type);
73 spin_unlock_irqrestore(&desc->lock, flags);
74 }
75 return ret;
76}
77EXPORT_SYMBOL(set_irq_type);
78
79/**
80 * set_irq_data - set irq type data for an irq
81 * @irq: Interrupt number
82 * @data: Pointer to interrupt specific data
83 *
84 * Set the hardware irq controller data for an irq
85 */
86int set_irq_data(unsigned int irq, void *data)
87{
88 struct irq_desc *desc;
89 unsigned long flags;
90
91 if (irq >= NR_IRQS) {
92 printk(KERN_ERR
93 "Trying to install controller data for IRQ%d\n", irq);
94 return -EINVAL;
95 }
96
97 desc = irq_desc + irq;
98 spin_lock_irqsave(&desc->lock, flags);
99 desc->handler_data = data;
100 spin_unlock_irqrestore(&desc->lock, flags);
101 return 0;
102}
103EXPORT_SYMBOL(set_irq_data);
104
105/**
106 * set_irq_chip_data - set irq chip data for an irq
107 * @irq: Interrupt number
108 * @data: Pointer to chip specific data
109 *
110 * Set the hardware irq chip data for an irq
111 */
112int set_irq_chip_data(unsigned int irq, void *data)
113{
114 struct irq_desc *desc = irq_desc + irq;
115 unsigned long flags;
116
117 if (irq >= NR_IRQS || !desc->chip) {
118 printk(KERN_ERR "BUG: bad set_irq_chip_data(IRQ#%d)\n", irq);
119 return -EINVAL;
120 }
121
122 spin_lock_irqsave(&desc->lock, flags);
123 desc->chip_data = data;
124 spin_unlock_irqrestore(&desc->lock, flags);
125
126 return 0;
127}
128EXPORT_SYMBOL(set_irq_chip_data);
129
130/*
131 * default enable function
132 */
133static void default_enable(unsigned int irq)
134{
135 struct irq_desc *desc = irq_desc + irq;
136
137 desc->chip->unmask(irq);
138 desc->status &= ~IRQ_MASKED;
139}
140
141/*
142 * default disable function
143 */
144static void default_disable(unsigned int irq)
145{
146 struct irq_desc *desc = irq_desc + irq;
147
148 if (!(desc->status & IRQ_DELAYED_DISABLE))
149 irq_desc[irq].chip->mask(irq);
150}
151
152/*
153 * default startup function
154 */
155static unsigned int default_startup(unsigned int irq)
156{
157 irq_desc[irq].chip->enable(irq);
158
159 return 0;
160}
161
162/*
163 * Fixup enable/disable function pointers
164 */
165void irq_chip_set_defaults(struct irq_chip *chip)
166{
167 if (!chip->enable)
168 chip->enable = default_enable;
169 if (!chip->disable)
170 chip->disable = default_disable;
171 if (!chip->startup)
172 chip->startup = default_startup;
173 if (!chip->shutdown)
174 chip->shutdown = chip->disable;
175 if (!chip->name)
176 chip->name = chip->typename;
177}
178
179static inline void mask_ack_irq(struct irq_desc *desc, int irq)
180{
181 if (desc->chip->mask_ack)
182 desc->chip->mask_ack(irq);
183 else {
184 desc->chip->mask(irq);
185 desc->chip->ack(irq);
186 }
187}
188
189/**
190 * handle_simple_irq - Simple and software-decoded IRQs.
191 * @irq: the interrupt number
192 * @desc: the interrupt description structure for this irq
193 * @regs: pointer to a register structure
194 *
195 * Simple interrupts are either sent from a demultiplexing interrupt
196 * handler or come from hardware, where no interrupt hardware control
197 * is necessary.
198 *
199 * Note: The caller is expected to handle the ack, clear, mask and
200 * unmask issues if necessary.
201 */
202void fastcall
203handle_simple_irq(unsigned int irq, struct irq_desc *desc, struct pt_regs *regs)
204{
205 struct irqaction *action;
206 irqreturn_t action_ret;
207 const unsigned int cpu = smp_processor_id();
208
209 spin_lock(&desc->lock);
210
211 if (unlikely(desc->status & IRQ_INPROGRESS))
212 goto out_unlock;
213 desc->status &= ~(IRQ_REPLAY | IRQ_WAITING);
214 kstat_cpu(cpu).irqs[irq]++;
215
216 action = desc->action;
217 if (unlikely(!action || (desc->status & IRQ_DISABLED)))
218 goto out_unlock;
219
220 desc->status |= IRQ_INPROGRESS;
221 spin_unlock(&desc->lock);
222
223 action_ret = handle_IRQ_event(irq, regs, action);
224 if (!noirqdebug)
225 note_interrupt(irq, desc, action_ret, regs);
226
227 spin_lock(&desc->lock);
228 desc->status &= ~IRQ_INPROGRESS;
229out_unlock:
230 spin_unlock(&desc->lock);
231}
232
233/**
234 * handle_level_irq - Level type irq handler
235 * @irq: the interrupt number
236 * @desc: the interrupt description structure for this irq
237 * @regs: pointer to a register structure
238 *
 239 * Level type interrupts are active as long as the hardware line has
 240 * the active level. This may require masking the interrupt and unmasking
241 * it after the associated handler has acknowledged the device, so the
242 * interrupt line is back to inactive.
243 */
244void fastcall
245handle_level_irq(unsigned int irq, struct irq_desc *desc, struct pt_regs *regs)
246{
247 unsigned int cpu = smp_processor_id();
248 struct irqaction *action;
249 irqreturn_t action_ret;
250
251 spin_lock(&desc->lock);
252 mask_ack_irq(desc, irq);
253
254 if (unlikely(desc->status & IRQ_INPROGRESS))
255 goto out;
256 desc->status &= ~(IRQ_REPLAY | IRQ_WAITING);
257 kstat_cpu(cpu).irqs[irq]++;
258
259 /*
 260 * If it's disabled or no action is available,
 261 * keep it masked and get out of here
262 */
263 action = desc->action;
264 if (unlikely(!action || (desc->status & IRQ_DISABLED))) {
265 desc->status |= IRQ_PENDING;
266 goto out;
267 }
268
269 desc->status |= IRQ_INPROGRESS;
270 desc->status &= ~IRQ_PENDING;
271 spin_unlock(&desc->lock);
272
273 action_ret = handle_IRQ_event(irq, regs, action);
274 if (!noirqdebug)
275 note_interrupt(irq, desc, action_ret, regs);
276
277 spin_lock(&desc->lock);
278 desc->status &= ~IRQ_INPROGRESS;
279out:
280 if (!(desc->status & IRQ_DISABLED) && desc->chip->unmask)
281 desc->chip->unmask(irq);
282 spin_unlock(&desc->lock);
283}
284
285/**
286 * handle_fasteoi_irq - irq handler for transparent controllers
287 * @irq: the interrupt number
288 * @desc: the interrupt description structure for this irq
289 * @regs: pointer to a register structure
290 *
291 * Only a single callback will be issued to the chip: an ->eoi()
292 * call when the interrupt has been serviced. This enables support
293 * for modern forms of interrupt handlers, which handle the flow
294 * details in hardware, transparently.
295 */
296void fastcall
297handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc,
298 struct pt_regs *regs)
299{
300 unsigned int cpu = smp_processor_id();
301 struct irqaction *action;
302 irqreturn_t action_ret;
303
304 spin_lock(&desc->lock);
305
306 if (unlikely(desc->status & IRQ_INPROGRESS))
307 goto out;
308
309 desc->status &= ~(IRQ_REPLAY | IRQ_WAITING);
310 kstat_cpu(cpu).irqs[irq]++;
311
312 /*
 313 * If it's disabled or no action is available,
 314 * keep it masked and get out of here
315 */
316 action = desc->action;
317 if (unlikely(!action || (desc->status & IRQ_DISABLED))) {
318 desc->status |= IRQ_PENDING;
319 goto out;
320 }
321
322 desc->status |= IRQ_INPROGRESS;
323 desc->status &= ~IRQ_PENDING;
324 spin_unlock(&desc->lock);
325
326 action_ret = handle_IRQ_event(irq, regs, action);
327 if (!noirqdebug)
328 note_interrupt(irq, desc, action_ret, regs);
329
330 spin_lock(&desc->lock);
331 desc->status &= ~IRQ_INPROGRESS;
332out:
333 desc->chip->eoi(irq);
334
335 spin_unlock(&desc->lock);
336}
337
338/**
339 * handle_edge_irq - edge type IRQ handler
340 * @irq: the interrupt number
341 * @desc: the interrupt description structure for this irq
342 * @regs: pointer to a register structure
343 *
 344 * The interrupt occurs on the falling and/or rising edge of a hardware
 345 * signal. The occurrence is latched into the irq controller hardware
 346 * and must be acked in order to be re-enabled. After the ack another
 347 * interrupt can happen on the same source even before the first one
 348 * is handled by the associated event handler. If this happens it
 349 * might be necessary to disable (mask) the interrupt, depending on the
 350 * controller hardware. This requires re-enabling the interrupt inside
 351 * the loop which handles the interrupts which have arrived while
352 * the handler was running. If all pending interrupts are handled, the
353 * loop is left.
354 */
355void fastcall
356handle_edge_irq(unsigned int irq, struct irq_desc *desc, struct pt_regs *regs)
357{
358 const unsigned int cpu = smp_processor_id();
359
360 spin_lock(&desc->lock);
361
362 desc->status &= ~(IRQ_REPLAY | IRQ_WAITING);
363
364 /*
 365 * If we're currently running this IRQ, or it's disabled,
366 * we shouldn't process the IRQ. Mark it pending, handle
367 * the necessary masking and go out
368 */
369 if (unlikely((desc->status & (IRQ_INPROGRESS | IRQ_DISABLED)) ||
370 !desc->action)) {
371 desc->status |= (IRQ_PENDING | IRQ_MASKED);
372 mask_ack_irq(desc, irq);
373 goto out_unlock;
374 }
375
376 kstat_cpu(cpu).irqs[irq]++;
377
378 /* Start handling the irq */
379 desc->chip->ack(irq);
380
381 /* Mark the IRQ currently in progress.*/
382 desc->status |= IRQ_INPROGRESS;
383
384 do {
385 struct irqaction *action = desc->action;
386 irqreturn_t action_ret;
387
388 if (unlikely(!action)) {
389 desc->chip->mask(irq);
390 goto out_unlock;
391 }
392
393 /*
394 * When another irq arrived while we were handling
395 * one, we could have masked the irq.
 396 * Re-enable it, if it was not disabled in the meantime.
397 */
398 if (unlikely((desc->status &
399 (IRQ_PENDING | IRQ_MASKED | IRQ_DISABLED)) ==
400 (IRQ_PENDING | IRQ_MASKED))) {
401 desc->chip->unmask(irq);
402 desc->status &= ~IRQ_MASKED;
403 }
404
405 desc->status &= ~IRQ_PENDING;
406 spin_unlock(&desc->lock);
407 action_ret = handle_IRQ_event(irq, regs, action);
408 if (!noirqdebug)
409 note_interrupt(irq, desc, action_ret, regs);
410 spin_lock(&desc->lock);
411
412 } while ((desc->status & (IRQ_PENDING | IRQ_DISABLED)) == IRQ_PENDING);
413
414 desc->status &= ~IRQ_INPROGRESS;
415out_unlock:
416 spin_unlock(&desc->lock);
417}
418
419#ifdef CONFIG_SMP
420/**
 421 * handle_percpu_irq - Per CPU local irq handler
422 * @irq: the interrupt number
423 * @desc: the interrupt description structure for this irq
424 * @regs: pointer to a register structure
425 *
426 * Per CPU interrupts on SMP machines without locking requirements
427 */
428void fastcall
429handle_percpu_irq(unsigned int irq, struct irq_desc *desc, struct pt_regs *regs)
430{
431 irqreturn_t action_ret;
432
433 kstat_this_cpu.irqs[irq]++;
434
435 if (desc->chip->ack)
436 desc->chip->ack(irq);
437
438 action_ret = handle_IRQ_event(irq, regs, desc->action);
439 if (!noirqdebug)
440 note_interrupt(irq, desc, action_ret, regs);
441
442 if (desc->chip->eoi)
443 desc->chip->eoi(irq);
444}
445
446#endif /* CONFIG_SMP */
447
448void
449__set_irq_handler(unsigned int irq,
450 void fastcall (*handle)(unsigned int, irq_desc_t *,
451 struct pt_regs *),
452 int is_chained)
453{
454 struct irq_desc *desc;
455 unsigned long flags;
456
457 if (irq >= NR_IRQS) {
458 printk(KERN_ERR
459 "Trying to install type control for IRQ%d\n", irq);
460 return;
461 }
462
463 desc = irq_desc + irq;
464
465 if (!handle)
466 handle = handle_bad_irq;
467
468 if (desc->chip == &no_irq_chip) {
469 printk(KERN_WARNING "Trying to install %sinterrupt handler "
470 "for IRQ%d\n", is_chained ? "chained " : " ", irq);
471 /*
472 * Some ARM implementations install a handler for really dumb
473 * interrupt hardware without setting an irq_chip. This worked
474 * with the ARM no_irq_chip but the check in setup_irq would
 475 * prevent us from setting up the interrupt at all. Switch it to
476 * dummy_irq_chip for easy transition.
477 */
478 desc->chip = &dummy_irq_chip;
479 }
480
481 spin_lock_irqsave(&desc->lock, flags);
482
483 /* Uninstall? */
484 if (handle == handle_bad_irq) {
485 if (desc->chip != &no_irq_chip) {
486 desc->chip->mask(irq);
487 desc->chip->ack(irq);
488 }
489 desc->status |= IRQ_DISABLED;
490 desc->depth = 1;
491 }
492 desc->handle_irq = handle;
493
494 if (handle != handle_bad_irq && is_chained) {
495 desc->status &= ~IRQ_DISABLED;
496 desc->status |= IRQ_NOREQUEST | IRQ_NOPROBE;
497 desc->depth = 0;
498 desc->chip->unmask(irq);
499 }
500 spin_unlock_irqrestore(&desc->lock, flags);
501}
502
503void
504set_irq_chip_and_handler(unsigned int irq, struct irq_chip *chip,
505 void fastcall (*handle)(unsigned int,
506 struct irq_desc *,
507 struct pt_regs *))
508{
509 set_irq_chip(irq, chip);
510 __set_irq_handler(irq, handle, 0);
511}
512
513/*
514 * Get a descriptive string for the highlevel handler, for
515 * /proc/interrupts output:
516 */
517const char *
518handle_irq_name(void fastcall (*handle)(unsigned int, struct irq_desc *,
519 struct pt_regs *))
520{
521 if (handle == handle_level_irq)
522 return "level ";
523 if (handle == handle_fasteoi_irq)
524 return "fasteoi";
525 if (handle == handle_edge_irq)
526 return "edge ";
527 if (handle == handle_simple_irq)
528 return "simple ";
529#ifdef CONFIG_SMP
530 if (handle == handle_percpu_irq)
531 return "percpu ";
532#endif
533 if (handle == handle_bad_irq)
534 return "bad ";
535
536 return NULL;
537}
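To make the intended use of the new flow-handler API concrete, a hedged sketch of how platform code might wire a level-triggered interrupt controller through it; my_gpio_chip, its mask/unmask/ack callbacks and the MY_GPIO_* constants are invented, not taken from any real board file.

#include <linux/irq.h>
#include <linux/init.h>

/* Hypothetical low-level callbacks provided by the platform. */
static void my_gpio_mask(unsigned int irq)	{ /* set bit in mask register */ }
static void my_gpio_unmask(unsigned int irq)	{ /* clear bit in mask register */ }
static void my_gpio_ack(unsigned int irq)	{ /* clear latched status bit */ }

static struct irq_chip my_gpio_chip = {
	.name	= "my-gpio",
	.ack	= my_gpio_ack,
	.mask	= my_gpio_mask,
	.unmask	= my_gpio_unmask,
	/* enable/disable/startup/shutdown are filled in by irq_chip_set_defaults() */
};

#define MY_GPIO_IRQ_BASE	64
#define MY_GPIO_NR_IRQS		16

static void __init my_gpio_init_irqs(void)
{
	unsigned int irq;

	for (irq = MY_GPIO_IRQ_BASE; irq < MY_GPIO_IRQ_BASE + MY_GPIO_NR_IRQS; irq++)
		/* level lines: handle_level_irq masks+acks before, unmasks after */
		set_irq_chip_and_handler(irq, &my_gpio_chip, handle_level_irq);
}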
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 51df337b37db..fc4e906aedbd 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -1,9 +1,13 @@
1/* 1/*
2 * linux/kernel/irq/handle.c 2 * linux/kernel/irq/handle.c
3 * 3 *
4 * Copyright (C) 1992, 1998-2004 Linus Torvalds, Ingo Molnar 4 * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar
5 * Copyright (C) 2005-2006, Thomas Gleixner, Russell King
5 * 6 *
6 * This file contains the core interrupt handling code. 7 * This file contains the core interrupt handling code.
8 *
9 * Detailed information is available in Documentation/DocBook/genericirq
10 *
7 */ 11 */
8 12
9#include <linux/irq.h> 13#include <linux/irq.h>
@@ -14,11 +18,22 @@
14 18
15#include "internals.h" 19#include "internals.h"
16 20
21/**
22 * handle_bad_irq - handle spurious and unhandled irqs
23 */
24void fastcall
25handle_bad_irq(unsigned int irq, struct irq_desc *desc, struct pt_regs *regs)
26{
27 print_irq_desc(irq, desc);
28 kstat_this_cpu.irqs[irq]++;
29 ack_bad_irq(irq);
30}
31
17/* 32/*
18 * Linux has a controller-independent interrupt architecture. 33 * Linux has a controller-independent interrupt architecture.
19 * Every controller has a 'controller-template', that is used 34 * Every controller has a 'controller-template', that is used
20 * by the main code to do the right thing. Each driver-visible 35 * by the main code to do the right thing. Each driver-visible
21 * interrupt source is transparently wired to the apropriate 36 * interrupt source is transparently wired to the appropriate
22 * controller. Thus drivers need not be aware of the 37 * controller. Thus drivers need not be aware of the
23 * interrupt-controller. 38 * interrupt-controller.
24 * 39 *
@@ -28,41 +43,68 @@
28 * 43 *
29 * Controller mappings for all interrupt sources: 44 * Controller mappings for all interrupt sources:
30 */ 45 */
31irq_desc_t irq_desc[NR_IRQS] __cacheline_aligned = { 46struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned = {
32 [0 ... NR_IRQS-1] = { 47 [0 ... NR_IRQS-1] = {
33 .status = IRQ_DISABLED, 48 .status = IRQ_DISABLED,
34 .handler = &no_irq_type, 49 .chip = &no_irq_chip,
35 .lock = SPIN_LOCK_UNLOCKED 50 .handle_irq = handle_bad_irq,
51 .depth = 1,
52 .lock = SPIN_LOCK_UNLOCKED,
53#ifdef CONFIG_SMP
54 .affinity = CPU_MASK_ALL
55#endif
36 } 56 }
37}; 57};
38 58
39/* 59/*
40 * Generic 'no controller' code 60 * What should we do if we get a hw irq event on an illegal vector?
 61 * Each architecture has to answer this for itself.
41 */ 62 */
42static void end_none(unsigned int irq) { } 63static void ack_bad(unsigned int irq)
43static void enable_none(unsigned int irq) { }
44static void disable_none(unsigned int irq) { }
45static void shutdown_none(unsigned int irq) { }
46static unsigned int startup_none(unsigned int irq) { return 0; }
47
48static void ack_none(unsigned int irq)
49{ 64{
50 /* 65 print_irq_desc(irq, irq_desc + irq);
51 * 'what should we do if we get a hw irq event on an illegal vector'.
52 * each architecture has to answer this themself.
53 */
54 ack_bad_irq(irq); 66 ack_bad_irq(irq);
55} 67}
56 68
57struct hw_interrupt_type no_irq_type = { 69/*
58 .typename = "none", 70 * NOP functions
59 .startup = startup_none, 71 */
60 .shutdown = shutdown_none, 72static void noop(unsigned int irq)
61 .enable = enable_none, 73{
62 .disable = disable_none, 74}
63 .ack = ack_none, 75
64 .end = end_none, 76static unsigned int noop_ret(unsigned int irq)
65 .set_affinity = NULL 77{
78 return 0;
79}
80
81/*
82 * Generic no controller implementation
83 */
84struct irq_chip no_irq_chip = {
85 .name = "none",
86 .startup = noop_ret,
87 .shutdown = noop,
88 .enable = noop,
89 .disable = noop,
90 .ack = ack_bad,
91 .end = noop,
92};
93
94/*
95 * Generic dummy implementation which can be used for
96 * real dumb interrupt sources
97 */
98struct irq_chip dummy_irq_chip = {
99 .name = "dummy",
100 .startup = noop_ret,
101 .shutdown = noop,
102 .enable = noop,
103 .disable = noop,
104 .ack = noop,
105 .mask = noop,
106 .unmask = noop,
107 .end = noop,
66}; 108};
67 109
68/* 110/*
@@ -73,16 +115,24 @@ irqreturn_t no_action(int cpl, void *dev_id, struct pt_regs *regs)
73 return IRQ_NONE; 115 return IRQ_NONE;
74} 116}
75 117
76/* 118/**
77 * Have got an event to handle: 119 * handle_IRQ_event - irq action chain handler
120 * @irq: the interrupt number
121 * @regs: pointer to a register structure
122 * @action: the interrupt action chain for this irq
123 *
124 * Handles the action chain of an irq event
78 */ 125 */
79fastcall int handle_IRQ_event(unsigned int irq, struct pt_regs *regs, 126irqreturn_t handle_IRQ_event(unsigned int irq, struct pt_regs *regs,
80 struct irqaction *action) 127 struct irqaction *action)
81{ 128{
82 int ret, retval = 0, status = 0; 129 irqreturn_t ret, retval = IRQ_NONE;
130 unsigned int status = 0;
131
132 handle_dynamic_tick(action);
83 133
84 if (!(action->flags & SA_INTERRUPT)) 134 if (!(action->flags & IRQF_DISABLED))
85 local_irq_enable(); 135 local_irq_enable_in_hardirq();
86 136
87 do { 137 do {
88 ret = action->handler(irq, action->dev_id, regs); 138 ret = action->handler(irq, action->dev_id, regs);
@@ -92,22 +142,29 @@ fastcall int handle_IRQ_event(unsigned int irq, struct pt_regs *regs,
92 action = action->next; 142 action = action->next;
93 } while (action); 143 } while (action);
94 144
95 if (status & SA_SAMPLE_RANDOM) 145 if (status & IRQF_SAMPLE_RANDOM)
96 add_interrupt_randomness(irq); 146 add_interrupt_randomness(irq);
97 local_irq_disable(); 147 local_irq_disable();
98 148
99 return retval; 149 return retval;
100} 150}
101 151
102/* 152/**
103 * do_IRQ handles all normal device IRQ's (the special 153 * __do_IRQ - original all in one highlevel IRQ handler
154 * @irq: the interrupt number
155 * @regs: pointer to a register structure
156 *
157 * __do_IRQ handles all normal device IRQ's (the special
104 * SMP cross-CPU interrupts have their own specific 158 * SMP cross-CPU interrupts have their own specific
105 * handlers). 159 * handlers).
160 *
161 * This is the original x86 implementation which is used for every
162 * interrupt type.
106 */ 163 */
107fastcall unsigned int __do_IRQ(unsigned int irq, struct pt_regs *regs) 164fastcall unsigned int __do_IRQ(unsigned int irq, struct pt_regs *regs)
108{ 165{
109 irq_desc_t *desc = irq_desc + irq; 166 struct irq_desc *desc = irq_desc + irq;
110 struct irqaction * action; 167 struct irqaction *action;
111 unsigned int status; 168 unsigned int status;
112 169
113 kstat_this_cpu.irqs[irq]++; 170 kstat_this_cpu.irqs[irq]++;
@@ -117,16 +174,16 @@ fastcall unsigned int __do_IRQ(unsigned int irq, struct pt_regs *regs)
117 /* 174 /*
118 * No locking required for CPU-local interrupts: 175 * No locking required for CPU-local interrupts:
119 */ 176 */
120 if (desc->handler->ack) 177 if (desc->chip->ack)
121 desc->handler->ack(irq); 178 desc->chip->ack(irq);
122 action_ret = handle_IRQ_event(irq, regs, desc->action); 179 action_ret = handle_IRQ_event(irq, regs, desc->action);
123 desc->handler->end(irq); 180 desc->chip->end(irq);
124 return 1; 181 return 1;
125 } 182 }
126 183
127 spin_lock(&desc->lock); 184 spin_lock(&desc->lock);
128 if (desc->handler->ack) 185 if (desc->chip->ack)
129 desc->handler->ack(irq); 186 desc->chip->ack(irq);
130 /* 187 /*
131 * REPLAY is when Linux resends an IRQ that was dropped earlier 188 * REPLAY is when Linux resends an IRQ that was dropped earlier
132 * WAITING is used by probe to mark irqs that are being tested 189 * WAITING is used by probe to mark irqs that are being tested
@@ -186,9 +243,25 @@ out:
186 * The ->end() handler has to deal with interrupts which got 243 * The ->end() handler has to deal with interrupts which got
187 * disabled while the handler was running. 244 * disabled while the handler was running.
188 */ 245 */
189 desc->handler->end(irq); 246 desc->chip->end(irq);
190 spin_unlock(&desc->lock); 247 spin_unlock(&desc->lock);
191 248
192 return 1; 249 return 1;
193} 250}
194 251
252#ifdef CONFIG_TRACE_IRQFLAGS
253
254/*
255 * lockdep: we want to handle all irq_desc locks as a single lock-class:
256 */
257static struct lock_class_key irq_desc_lock_class;
258
259void early_init_irq_lock_class(void)
260{
261 int i;
262
263 for (i = 0; i < NR_IRQS; i++)
264 lockdep_set_class(&irq_desc[i].lock, &irq_desc_lock_class);
265}
266
267#endif
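The CONFIG_TRACE_IRQFLAGS hunk above collapses all NR_IRQS descriptor locks into a single lockdep class; the same pattern applies to any array of locks that are used identically. A minimal sketch under that assumption; my_table_locks and my_table_lock_key are invented names.

#include <linux/spinlock.h>
#include <linux/lockdep.h>
#include <linux/init.h>

#define MY_TABLE_SIZE	32

static spinlock_t my_table_locks[MY_TABLE_SIZE];

/* One key => lockdep treats every lock in the array as the same class. */
static struct lock_class_key my_table_lock_key;

static void __init my_table_init_locks(void)
{
	int i;

	for (i = 0; i < MY_TABLE_SIZE; i++) {
		spin_lock_init(&my_table_locks[i]);
		lockdep_set_class(&my_table_locks[i], &my_table_lock_key);
	}
}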
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 46feba630266..08a849a22447 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -4,6 +4,12 @@
4 4
5extern int noirqdebug; 5extern int noirqdebug;
6 6
7/* Set default functions for irq_chip structures: */
8extern void irq_chip_set_defaults(struct irq_chip *chip);
9
10/* Set default handler: */
11extern void compat_irq_chip_set_default_handler(struct irq_desc *desc);
12
7#ifdef CONFIG_PROC_FS 13#ifdef CONFIG_PROC_FS
8extern void register_irq_proc(unsigned int irq); 14extern void register_irq_proc(unsigned int irq);
9extern void register_handler_proc(unsigned int irq, struct irqaction *action); 15extern void register_handler_proc(unsigned int irq, struct irqaction *action);
@@ -16,3 +22,43 @@ static inline void unregister_handler_proc(unsigned int irq,
16 struct irqaction *action) { } 22 struct irqaction *action) { }
17#endif 23#endif
18 24
25/*
26 * Debugging printout:
27 */
28
29#include <linux/kallsyms.h>
30
31#define P(f) if (desc->status & f) printk("%14s set\n", #f)
32
33static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc)
34{
35 printk("irq %d, desc: %p, depth: %d, count: %d, unhandled: %d\n",
36 irq, desc, desc->depth, desc->irq_count, desc->irqs_unhandled);
37 printk("->handle_irq(): %p, ", desc->handle_irq);
38 print_symbol("%s\n", (unsigned long)desc->handle_irq);
39 printk("->chip(): %p, ", desc->chip);
40 print_symbol("%s\n", (unsigned long)desc->chip);
41 printk("->action(): %p\n", desc->action);
42 if (desc->action) {
43 printk("->action->handler(): %p, ", desc->action->handler);
44 print_symbol("%s\n", (unsigned long)desc->action->handler);
45 }
46
47 P(IRQ_INPROGRESS);
48 P(IRQ_DISABLED);
49 P(IRQ_PENDING);
50 P(IRQ_REPLAY);
51 P(IRQ_AUTODETECT);
52 P(IRQ_WAITING);
53 P(IRQ_LEVEL);
54 P(IRQ_MASKED);
55#ifdef CONFIG_IRQ_PER_CPU
56 P(IRQ_PER_CPU);
57#endif
58 P(IRQ_NOPROBE);
59 P(IRQ_NOREQUEST);
60 P(IRQ_NOAUTOEN);
61}
62
63#undef P
64
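The P() macro above leans on the preprocessor's stringizing operator so each status bit prints under its own name; the same trick works for any flag dump. A tiny illustrative sketch (SHOW(), my_dump_flags() and the FLAG_* values are made up):

#include <linux/kernel.h>

#define FLAG_READY	0x01
#define FLAG_BUSY	0x02

/* #f turns the macro argument into the string that gets printed. */
#define SHOW(f)	do { if (flags & (f)) printk("%14s set\n", #f); } while (0)

static void my_dump_flags(unsigned int flags)
{
	SHOW(FLAG_READY);
	SHOW(FLAG_BUSY);
}

#undef SHOW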
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 1279e3499534..4e461438e48b 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -1,12 +1,12 @@
1/* 1/*
2 * linux/kernel/irq/manage.c 2 * linux/kernel/irq/manage.c
3 * 3 *
4 * Copyright (C) 1992, 1998-2004 Linus Torvalds, Ingo Molnar 4 * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar
5 * Copyright (C) 2005-2006 Thomas Gleixner
5 * 6 *
6 * This file contains driver APIs to the irq subsystem. 7 * This file contains driver APIs to the irq subsystem.
7 */ 8 */
8 9
9#include <linux/config.h>
10#include <linux/irq.h> 10#include <linux/irq.h>
11#include <linux/module.h> 11#include <linux/module.h>
12#include <linux/random.h> 12#include <linux/random.h>
@@ -16,12 +16,6 @@
16 16
17#ifdef CONFIG_SMP 17#ifdef CONFIG_SMP
18 18
19cpumask_t irq_affinity[NR_IRQS] = { [0 ... NR_IRQS-1] = CPU_MASK_ALL };
20
21#if defined (CONFIG_GENERIC_PENDING_IRQ) || defined (CONFIG_IRQBALANCE)
22cpumask_t __cacheline_aligned pending_irq_cpumask[NR_IRQS];
23#endif
24
25/** 19/**
26 * synchronize_irq - wait for pending IRQ handlers (on other CPUs) 20 * synchronize_irq - wait for pending IRQ handlers (on other CPUs)
27 * @irq: interrupt number to wait for 21 * @irq: interrupt number to wait for
@@ -42,7 +36,6 @@ void synchronize_irq(unsigned int irq)
42 while (desc->status & IRQ_INPROGRESS) 36 while (desc->status & IRQ_INPROGRESS)
43 cpu_relax(); 37 cpu_relax();
44} 38}
45
46EXPORT_SYMBOL(synchronize_irq); 39EXPORT_SYMBOL(synchronize_irq);
47 40
48#endif 41#endif
@@ -60,7 +53,7 @@ EXPORT_SYMBOL(synchronize_irq);
60 */ 53 */
61void disable_irq_nosync(unsigned int irq) 54void disable_irq_nosync(unsigned int irq)
62{ 55{
63 irq_desc_t *desc = irq_desc + irq; 56 struct irq_desc *desc = irq_desc + irq;
64 unsigned long flags; 57 unsigned long flags;
65 58
66 if (irq >= NR_IRQS) 59 if (irq >= NR_IRQS)
@@ -69,11 +62,10 @@ void disable_irq_nosync(unsigned int irq)
69 spin_lock_irqsave(&desc->lock, flags); 62 spin_lock_irqsave(&desc->lock, flags);
70 if (!desc->depth++) { 63 if (!desc->depth++) {
71 desc->status |= IRQ_DISABLED; 64 desc->status |= IRQ_DISABLED;
72 desc->handler->disable(irq); 65 desc->chip->disable(irq);
73 } 66 }
74 spin_unlock_irqrestore(&desc->lock, flags); 67 spin_unlock_irqrestore(&desc->lock, flags);
75} 68}
76
77EXPORT_SYMBOL(disable_irq_nosync); 69EXPORT_SYMBOL(disable_irq_nosync);
78 70
79/** 71/**
@@ -90,7 +82,7 @@ EXPORT_SYMBOL(disable_irq_nosync);
90 */ 82 */
91void disable_irq(unsigned int irq) 83void disable_irq(unsigned int irq)
92{ 84{
93 irq_desc_t *desc = irq_desc + irq; 85 struct irq_desc *desc = irq_desc + irq;
94 86
95 if (irq >= NR_IRQS) 87 if (irq >= NR_IRQS)
96 return; 88 return;
@@ -99,7 +91,6 @@ void disable_irq(unsigned int irq)
99 if (desc->action) 91 if (desc->action)
100 synchronize_irq(irq); 92 synchronize_irq(irq);
101} 93}
102
103EXPORT_SYMBOL(disable_irq); 94EXPORT_SYMBOL(disable_irq);
104 95
105/** 96/**
@@ -114,7 +105,7 @@ EXPORT_SYMBOL(disable_irq);
114 */ 105 */
115void enable_irq(unsigned int irq) 106void enable_irq(unsigned int irq)
116{ 107{
117 irq_desc_t *desc = irq_desc + irq; 108 struct irq_desc *desc = irq_desc + irq;
118 unsigned long flags; 109 unsigned long flags;
119 110
120 if (irq >= NR_IRQS) 111 if (irq >= NR_IRQS)
@@ -123,17 +114,15 @@ void enable_irq(unsigned int irq)
123 spin_lock_irqsave(&desc->lock, flags); 114 spin_lock_irqsave(&desc->lock, flags);
124 switch (desc->depth) { 115 switch (desc->depth) {
125 case 0: 116 case 0:
117 printk(KERN_WARNING "Unbalanced enable for IRQ %d\n", irq);
126 WARN_ON(1); 118 WARN_ON(1);
127 break; 119 break;
128 case 1: { 120 case 1: {
129 unsigned int status = desc->status & ~IRQ_DISABLED; 121 unsigned int status = desc->status & ~IRQ_DISABLED;
130 122
131 desc->status = status; 123 /* Prevent probing on this irq: */
132 if ((status & (IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) { 124 desc->status = status | IRQ_NOPROBE;
133 desc->status = status | IRQ_REPLAY; 125 check_irq_resend(desc, irq);
134 hw_resend_irq(desc->handler,irq);
135 }
136 desc->handler->enable(irq);
137 /* fall-through */ 126 /* fall-through */
138 } 127 }
139 default: 128 default:
@@ -141,9 +130,29 @@ void enable_irq(unsigned int irq)
141 } 130 }
142 spin_unlock_irqrestore(&desc->lock, flags); 131 spin_unlock_irqrestore(&desc->lock, flags);
143} 132}
144
145EXPORT_SYMBOL(enable_irq); 133EXPORT_SYMBOL(enable_irq);
146 134
135/**
136 * set_irq_wake - control irq power management wakeup
137 * @irq: interrupt to control
138 * @on: enable/disable power management wakeup
139 *
140 * Enable/disable power management wakeup mode
141 */
142int set_irq_wake(unsigned int irq, unsigned int on)
143{
144 struct irq_desc *desc = irq_desc + irq;
145 unsigned long flags;
146 int ret = -ENXIO;
147
148 spin_lock_irqsave(&desc->lock, flags);
149 if (desc->chip->set_wake)
150 ret = desc->chip->set_wake(irq, on);
151 spin_unlock_irqrestore(&desc->lock, flags);
152 return ret;
153}
154EXPORT_SYMBOL(set_irq_wake);
155
147/* 156/*
148 * Internal function that tells the architecture code whether a 157 * Internal function that tells the architecture code whether a
149 * particular irq has been exclusively allocated or is available 158 * particular irq has been exclusively allocated or is available
@@ -153,22 +162,33 @@ int can_request_irq(unsigned int irq, unsigned long irqflags)
153{ 162{
154 struct irqaction *action; 163 struct irqaction *action;
155 164
156 if (irq >= NR_IRQS) 165 if (irq >= NR_IRQS || irq_desc[irq].status & IRQ_NOREQUEST)
157 return 0; 166 return 0;
158 167
159 action = irq_desc[irq].action; 168 action = irq_desc[irq].action;
160 if (action) 169 if (action)
161 if (irqflags & action->flags & SA_SHIRQ) 170 if (irqflags & action->flags & IRQF_SHARED)
162 action = NULL; 171 action = NULL;
163 172
164 return !action; 173 return !action;
165} 174}
166 175
176void compat_irq_chip_set_default_handler(struct irq_desc *desc)
177{
178 /*
 179 * If the architecture still has not overridden
180 * the flow handler then zap the default. This
181 * should catch incorrect flow-type setting.
182 */
183 if (desc->handle_irq == &handle_bad_irq)
184 desc->handle_irq = NULL;
185}
186
167/* 187/*
168 * Internal function to register an irqaction - typically used to 188 * Internal function to register an irqaction - typically used to
169 * allocate special interrupts that are part of the architecture. 189 * allocate special interrupts that are part of the architecture.
170 */ 190 */
171int setup_irq(unsigned int irq, struct irqaction * new) 191int setup_irq(unsigned int irq, struct irqaction *new)
172{ 192{
173 struct irq_desc *desc = irq_desc + irq; 193 struct irq_desc *desc = irq_desc + irq;
174 struct irqaction *old, **p; 194 struct irqaction *old, **p;
@@ -178,14 +198,14 @@ int setup_irq(unsigned int irq, struct irqaction * new)
178 if (irq >= NR_IRQS) 198 if (irq >= NR_IRQS)
179 return -EINVAL; 199 return -EINVAL;
180 200
181 if (desc->handler == &no_irq_type) 201 if (desc->chip == &no_irq_chip)
182 return -ENOSYS; 202 return -ENOSYS;
183 /* 203 /*
184 * Some drivers like serial.c use request_irq() heavily, 204 * Some drivers like serial.c use request_irq() heavily,
185 * so we have to be careful not to interfere with a 205 * so we have to be careful not to interfere with a
186 * running system. 206 * running system.
187 */ 207 */
188 if (new->flags & SA_SAMPLE_RANDOM) { 208 if (new->flags & IRQF_SAMPLE_RANDOM) {
189 /* 209 /*
190 * This function might sleep, we want to call it first, 210 * This function might sleep, we want to call it first,
191 * outside of the atomic block. 211 * outside of the atomic block.
@@ -200,16 +220,24 @@ int setup_irq(unsigned int irq, struct irqaction * new)
200 /* 220 /*
201 * The following block of code has to be executed atomically 221 * The following block of code has to be executed atomically
202 */ 222 */
203 spin_lock_irqsave(&desc->lock,flags); 223 spin_lock_irqsave(&desc->lock, flags);
204 p = &desc->action; 224 p = &desc->action;
205 if ((old = *p) != NULL) { 225 old = *p;
206 /* Can't share interrupts unless both agree to */ 226 if (old) {
207 if (!(old->flags & new->flags & SA_SHIRQ)) 227 /*
228 * Can't share interrupts unless both agree to and are
229 * the same type (level, edge, polarity). So both flag
230 * fields must have IRQF_SHARED set and the bits which
231 * set the trigger type must match.
232 */
233 if (!((old->flags & new->flags) & IRQF_SHARED) ||
234 ((old->flags ^ new->flags) & IRQF_TRIGGER_MASK))
208 goto mismatch; 235 goto mismatch;
209 236
210#if defined(ARCH_HAS_IRQ_PER_CPU) && defined(SA_PERCPU_IRQ) 237#if defined(CONFIG_IRQ_PER_CPU)
211 /* All handlers must agree on per-cpuness */ 238 /* All handlers must agree on per-cpuness */
212 if ((old->flags & IRQ_PER_CPU) != (new->flags & IRQ_PER_CPU)) 239 if ((old->flags & IRQF_PERCPU) !=
240 (new->flags & IRQF_PERCPU))
213 goto mismatch; 241 goto mismatch;
214#endif 242#endif
215 243
@@ -222,20 +250,45 @@ int setup_irq(unsigned int irq, struct irqaction * new)
222 } 250 }
223 251
224 *p = new; 252 *p = new;
225#if defined(ARCH_HAS_IRQ_PER_CPU) && defined(SA_PERCPU_IRQ) 253#if defined(CONFIG_IRQ_PER_CPU)
226 if (new->flags & SA_PERCPU_IRQ) 254 if (new->flags & IRQF_PERCPU)
227 desc->status |= IRQ_PER_CPU; 255 desc->status |= IRQ_PER_CPU;
228#endif 256#endif
229 if (!shared) { 257 if (!shared) {
230 desc->depth = 0; 258 irq_chip_set_defaults(desc->chip);
231 desc->status &= ~(IRQ_DISABLED | IRQ_AUTODETECT | 259
232 IRQ_WAITING | IRQ_INPROGRESS); 260 /* Setup the type (level, edge polarity) if configured: */
233 if (desc->handler->startup) 261 if (new->flags & IRQF_TRIGGER_MASK) {
234 desc->handler->startup(irq); 262 if (desc->chip && desc->chip->set_type)
235 else 263 desc->chip->set_type(irq,
236 desc->handler->enable(irq); 264 new->flags & IRQF_TRIGGER_MASK);
265 else
266 /*
267 * IRQF_TRIGGER_* but the PIC does not support
268 * multiple flow-types?
269 */
270 printk(KERN_WARNING "No IRQF_TRIGGER set_type "
271 "function for IRQ %d (%s)\n", irq,
272 desc->chip ? desc->chip->name :
273 "unknown");
274 } else
275 compat_irq_chip_set_default_handler(desc);
276
277 desc->status &= ~(IRQ_AUTODETECT | IRQ_WAITING |
278 IRQ_INPROGRESS);
279
280 if (!(desc->status & IRQ_NOAUTOEN)) {
281 desc->depth = 0;
282 desc->status &= ~IRQ_DISABLED;
283 if (desc->chip->startup)
284 desc->chip->startup(irq);
285 else
286 desc->chip->enable(irq);
287 } else
288 /* Undo nested disables: */
289 desc->depth = 1;
237 } 290 }
238 spin_unlock_irqrestore(&desc->lock,flags); 291 spin_unlock_irqrestore(&desc->lock, flags);
239 292
240 new->irq = irq; 293 new->irq = irq;
241 register_irq_proc(irq); 294 register_irq_proc(irq);
@@ -246,8 +299,8 @@ int setup_irq(unsigned int irq, struct irqaction * new)
246 299
247mismatch: 300mismatch:
248 spin_unlock_irqrestore(&desc->lock, flags); 301 spin_unlock_irqrestore(&desc->lock, flags);
249 if (!(new->flags & SA_PROBEIRQ)) { 302 if (!(new->flags & IRQF_PROBE_SHARED)) {
250 printk(KERN_ERR "%s: irq handler mismatch\n", __FUNCTION__); 303 printk(KERN_ERR "IRQ handler type mismatch for IRQ %d\n", irq);
251 dump_stack(); 304 dump_stack();
252 } 305 }
253 return -EBUSY; 306 return -EBUSY;
@@ -278,10 +331,10 @@ void free_irq(unsigned int irq, void *dev_id)
278 return; 331 return;
279 332
280 desc = irq_desc + irq; 333 desc = irq_desc + irq;
281 spin_lock_irqsave(&desc->lock,flags); 334 spin_lock_irqsave(&desc->lock, flags);
282 p = &desc->action; 335 p = &desc->action;
283 for (;;) { 336 for (;;) {
284 struct irqaction * action = *p; 337 struct irqaction *action = *p;
285 338
286 if (action) { 339 if (action) {
287 struct irqaction **pp = p; 340 struct irqaction **pp = p;
@@ -295,18 +348,18 @@ void free_irq(unsigned int irq, void *dev_id)
295 348
296 /* Currently used only by UML, might disappear one day.*/ 349 /* Currently used only by UML, might disappear one day.*/
297#ifdef CONFIG_IRQ_RELEASE_METHOD 350#ifdef CONFIG_IRQ_RELEASE_METHOD
298 if (desc->handler->release) 351 if (desc->chip->release)
299 desc->handler->release(irq, dev_id); 352 desc->chip->release(irq, dev_id);
300#endif 353#endif
301 354
302 if (!desc->action) { 355 if (!desc->action) {
303 desc->status |= IRQ_DISABLED; 356 desc->status |= IRQ_DISABLED;
304 if (desc->handler->shutdown) 357 if (desc->chip->shutdown)
305 desc->handler->shutdown(irq); 358 desc->chip->shutdown(irq);
306 else 359 else
307 desc->handler->disable(irq); 360 desc->chip->disable(irq);
308 } 361 }
309 spin_unlock_irqrestore(&desc->lock,flags); 362 spin_unlock_irqrestore(&desc->lock, flags);
310 unregister_handler_proc(irq, action); 363 unregister_handler_proc(irq, action);
311 364
312 /* Make sure it's not being used on another CPU */ 365 /* Make sure it's not being used on another CPU */
@@ -314,12 +367,11 @@ void free_irq(unsigned int irq, void *dev_id)
314 kfree(action); 367 kfree(action);
315 return; 368 return;
316 } 369 }
317 printk(KERN_ERR "Trying to free free IRQ%d\n",irq); 370 printk(KERN_ERR "Trying to free already-free IRQ %d\n", irq);
318 spin_unlock_irqrestore(&desc->lock,flags); 371 spin_unlock_irqrestore(&desc->lock, flags);
319 return; 372 return;
320 } 373 }
321} 374}
322
323EXPORT_SYMBOL(free_irq); 375EXPORT_SYMBOL(free_irq);
324 376
325/** 377/**
@@ -346,28 +398,36 @@ EXPORT_SYMBOL(free_irq);
346 * 398 *
347 * Flags: 399 * Flags:
348 * 400 *
349 * SA_SHIRQ Interrupt is shared 401 * IRQF_SHARED Interrupt is shared
350 * SA_INTERRUPT Disable local interrupts while processing 402 * IRQF_DISABLED Disable local interrupts while processing
351 * SA_SAMPLE_RANDOM The interrupt can be used for entropy 403 * IRQF_SAMPLE_RANDOM The interrupt can be used for entropy
352 * 404 *
353 */ 405 */
354int request_irq(unsigned int irq, 406int request_irq(unsigned int irq,
355 irqreturn_t (*handler)(int, void *, struct pt_regs *), 407 irqreturn_t (*handler)(int, void *, struct pt_regs *),
356 unsigned long irqflags, const char * devname, void *dev_id) 408 unsigned long irqflags, const char *devname, void *dev_id)
357{ 409{
358 struct irqaction * action; 410 struct irqaction *action;
359 int retval; 411 int retval;
360 412
413#ifdef CONFIG_LOCKDEP
414 /*
415 * Lockdep wants atomic interrupt handlers:
416 */
417 irqflags |= SA_INTERRUPT;
418#endif
361 /* 419 /*
362 * Sanity-check: shared interrupts must pass in a real dev-ID, 420 * Sanity-check: shared interrupts must pass in a real dev-ID,
363 * otherwise we'll have trouble later trying to figure out 421 * otherwise we'll have trouble later trying to figure out
364 * which interrupt is which (messes up the interrupt freeing 422 * which interrupt is which (messes up the interrupt freeing
365 * logic etc). 423 * logic etc).
366 */ 424 */
367 if ((irqflags & SA_SHIRQ) && !dev_id) 425 if ((irqflags & IRQF_SHARED) && !dev_id)
368 return -EINVAL; 426 return -EINVAL;
369 if (irq >= NR_IRQS) 427 if (irq >= NR_IRQS)
370 return -EINVAL; 428 return -EINVAL;
429 if (irq_desc[irq].status & IRQ_NOREQUEST)
430 return -EINVAL;
371 if (!handler) 431 if (!handler)
372 return -EINVAL; 432 return -EINVAL;
373 433
@@ -390,6 +450,5 @@ int request_irq(unsigned int irq,
390 450
391 return retval; 451 return retval;
392} 452}
393
394EXPORT_SYMBOL(request_irq); 453EXPORT_SYMBOL(request_irq);
395 454
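Putting the renamed flags and the new set_irq_wake() together, a hedged sketch of a driver requesting a shared interrupt at this point in the tree (handlers still take a struct pt_regs argument here); my_dev, my_isr, my_device_raised_irq, my_probe and my_remove are placeholders.

#include <linux/interrupt.h>
#include <linux/errno.h>

struct my_dev {
	int irq;
	/* ... device state ... */
};

/* Device-specific: read the status register to see if this device fired. */
static int my_device_raised_irq(struct my_dev *dev)
{
	return 1;
}

static irqreturn_t my_isr(int irq, void *dev_id, struct pt_regs *regs)
{
	struct my_dev *dev = dev_id;

	/* On a shared line, check whether it was really our device. */
	if (!my_device_raised_irq(dev))
		return IRQ_NONE;
	/* ... service the device ... */
	return IRQ_HANDLED;
}

static int my_probe(struct my_dev *dev)
{
	int err;

	/* shared with other devices, and usable as an entropy source */
	err = request_irq(dev->irq, my_isr,
			  IRQF_SHARED | IRQF_SAMPLE_RANDOM, "my-dev", dev);
	if (err)
		return err;

	set_irq_wake(dev->irq, 1);	/* let the line wake the system */
	return 0;
}

static void my_remove(struct my_dev *dev)
{
	set_irq_wake(dev->irq, 0);
	free_irq(dev->irq, dev);
}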
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index 134f9f2e0e39..a57ebe9fa6f6 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -3,19 +3,19 @@
3 3
4void set_pending_irq(unsigned int irq, cpumask_t mask) 4void set_pending_irq(unsigned int irq, cpumask_t mask)
5{ 5{
6 irq_desc_t *desc = irq_desc + irq; 6 struct irq_desc *desc = irq_desc + irq;
7 unsigned long flags; 7 unsigned long flags;
8 8
9 spin_lock_irqsave(&desc->lock, flags); 9 spin_lock_irqsave(&desc->lock, flags);
10 desc->move_irq = 1; 10 desc->move_irq = 1;
11 pending_irq_cpumask[irq] = mask; 11 irq_desc[irq].pending_mask = mask;
12 spin_unlock_irqrestore(&desc->lock, flags); 12 spin_unlock_irqrestore(&desc->lock, flags);
13} 13}
14 14
15void move_native_irq(int irq) 15void move_native_irq(int irq)
16{ 16{
17 struct irq_desc *desc = irq_desc + irq;
17 cpumask_t tmp; 18 cpumask_t tmp;
18 irq_desc_t *desc = irq_descp(irq);
19 19
20 if (likely(!desc->move_irq)) 20 if (likely(!desc->move_irq))
21 return; 21 return;
@@ -30,15 +30,15 @@ void move_native_irq(int irq)
30 30
31 desc->move_irq = 0; 31 desc->move_irq = 0;
32 32
33 if (likely(cpus_empty(pending_irq_cpumask[irq]))) 33 if (unlikely(cpus_empty(irq_desc[irq].pending_mask)))
34 return; 34 return;
35 35
36 if (!desc->handler->set_affinity) 36 if (!desc->chip->set_affinity)
37 return; 37 return;
38 38
39 assert_spin_locked(&desc->lock); 39 assert_spin_locked(&desc->lock);
40 40
41 cpus_and(tmp, pending_irq_cpumask[irq], cpu_online_map); 41 cpus_and(tmp, irq_desc[irq].pending_mask, cpu_online_map);
42 42
43 /* 43 /*
44 * If there was a valid mask to work with, please 44 * If there was a valid mask to work with, please
@@ -49,14 +49,14 @@ void move_native_irq(int irq)
49 * cause some ioapics to mal-function. 49 * cause some ioapics to mal-function.
50 * Being paranoid i guess! 50 * Being paranoid i guess!
51 */ 51 */
52 if (unlikely(!cpus_empty(tmp))) { 52 if (likely(!cpus_empty(tmp))) {
53 if (likely(!(desc->status & IRQ_DISABLED))) 53 if (likely(!(desc->status & IRQ_DISABLED)))
54 desc->handler->disable(irq); 54 desc->chip->disable(irq);
55 55
56 desc->handler->set_affinity(irq,tmp); 56 desc->chip->set_affinity(irq,tmp);
57 57
58 if (likely(!(desc->status & IRQ_DISABLED))) 58 if (likely(!(desc->status & IRQ_DISABLED)))
59 desc->handler->enable(irq); 59 desc->chip->enable(irq);
60 } 60 }
61 cpus_clear(pending_irq_cpumask[irq]); 61 cpus_clear(irq_desc[irq].pending_mask);
62} 62}
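For orientation on how these two helpers fit together: architecture code parks a requested affinity change with set_pending_irq() and applies it from its ack path, where the descriptor lock is already held and no handler is running yet. A rough sketch of those call sites, not lifted from any particular arch; the my_arch_* names are invented and assume CONFIG_GENERIC_PENDING_IRQ.

#include <linux/irq.h>
#include <linux/cpumask.h>

/* Called from /proc or arch code that wants to retarget an interrupt. */
static void my_arch_set_affinity(unsigned int irq, cpumask_t mask)
{
	/* Defer the move; it is applied when the next interrupt is acked. */
	set_pending_irq(irq, mask);
}

/* The chip's ->ack() callback; runs under desc->lock in the flow handler. */
static void my_arch_ack_irq(unsigned int irq)
{
	move_native_irq(irq);	/* safe point to reprogram the routing */
	/* ... perform the real hardware ack ... */
}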
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index d03b5eef8ce0..607c7809ad01 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -12,18 +12,15 @@
12 12
13#include "internals.h" 13#include "internals.h"
14 14
15static struct proc_dir_entry *root_irq_dir, *irq_dir[NR_IRQS]; 15static struct proc_dir_entry *root_irq_dir;
16 16
17#ifdef CONFIG_SMP 17#ifdef CONFIG_SMP
18 18
19/*
20 * The /proc/irq/<irq>/smp_affinity values:
21 */
22static struct proc_dir_entry *smp_affinity_entry[NR_IRQS];
23
24#ifdef CONFIG_GENERIC_PENDING_IRQ 19#ifdef CONFIG_GENERIC_PENDING_IRQ
25void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val) 20void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val)
26{ 21{
22 set_balance_irq_affinity(irq, mask_val);
23
27 /* 24 /*
28 * Save these away for later use. Re-progam when the 25 * Save these away for later use. Re-progam when the
29 * interrupt is pending 26 * interrupt is pending
@@ -33,15 +30,16 @@ void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val)
33#else 30#else
34void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val) 31void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val)
35{ 32{
36 irq_affinity[irq] = mask_val; 33 set_balance_irq_affinity(irq, mask_val);
37 irq_desc[irq].handler->set_affinity(irq, mask_val); 34 irq_desc[irq].affinity = mask_val;
35 irq_desc[irq].chip->set_affinity(irq, mask_val);
38} 36}
39#endif 37#endif
40 38
41static int irq_affinity_read_proc(char *page, char **start, off_t off, 39static int irq_affinity_read_proc(char *page, char **start, off_t off,
42 int count, int *eof, void *data) 40 int count, int *eof, void *data)
43{ 41{
44 int len = cpumask_scnprintf(page, count, irq_affinity[(long)data]); 42 int len = cpumask_scnprintf(page, count, irq_desc[(long)data].affinity);
45 43
46 if (count - len < 2) 44 if (count - len < 2)
47 return -EINVAL; 45 return -EINVAL;
@@ -56,7 +54,7 @@ static int irq_affinity_write_proc(struct file *file, const char __user *buffer,
56 unsigned int irq = (int)(long)data, full_count = count, err; 54 unsigned int irq = (int)(long)data, full_count = count, err;
57 cpumask_t new_value, tmp; 55 cpumask_t new_value, tmp;
58 56
59 if (!irq_desc[irq].handler->set_affinity || no_irq_affinity) 57 if (!irq_desc[irq].chip->set_affinity || no_irq_affinity)
60 return -EIO; 58 return -EIO;
61 59
62 err = cpumask_parse(buffer, count, new_value); 60 err = cpumask_parse(buffer, count, new_value);
@@ -99,7 +97,7 @@ void register_handler_proc(unsigned int irq, struct irqaction *action)
99{ 97{
100 char name [MAX_NAMELEN]; 98 char name [MAX_NAMELEN];
101 99
102 if (!irq_dir[irq] || action->dir || !action->name || 100 if (!irq_desc[irq].dir || action->dir || !action->name ||
103 !name_unique(irq, action)) 101 !name_unique(irq, action))
104 return; 102 return;
105 103
@@ -107,7 +105,7 @@ void register_handler_proc(unsigned int irq, struct irqaction *action)
107 snprintf(name, MAX_NAMELEN, "%s", action->name); 105 snprintf(name, MAX_NAMELEN, "%s", action->name);
108 106
109 /* create /proc/irq/1234/handler/ */ 107 /* create /proc/irq/1234/handler/ */
110 action->dir = proc_mkdir(name, irq_dir[irq]); 108 action->dir = proc_mkdir(name, irq_desc[irq].dir);
111} 109}
112 110
113#undef MAX_NAMELEN 111#undef MAX_NAMELEN
@@ -119,22 +117,22 @@ void register_irq_proc(unsigned int irq)
119 char name [MAX_NAMELEN]; 117 char name [MAX_NAMELEN];
120 118
121 if (!root_irq_dir || 119 if (!root_irq_dir ||
122 (irq_desc[irq].handler == &no_irq_type) || 120 (irq_desc[irq].chip == &no_irq_chip) ||
123 irq_dir[irq]) 121 irq_desc[irq].dir)
124 return; 122 return;
125 123
126 memset(name, 0, MAX_NAMELEN); 124 memset(name, 0, MAX_NAMELEN);
127 sprintf(name, "%d", irq); 125 sprintf(name, "%d", irq);
128 126
129 /* create /proc/irq/1234 */ 127 /* create /proc/irq/1234 */
130 irq_dir[irq] = proc_mkdir(name, root_irq_dir); 128 irq_desc[irq].dir = proc_mkdir(name, root_irq_dir);
131 129
132#ifdef CONFIG_SMP 130#ifdef CONFIG_SMP
133 { 131 {
134 struct proc_dir_entry *entry; 132 struct proc_dir_entry *entry;
135 133
136 /* create /proc/irq/<irq>/smp_affinity */ 134 /* create /proc/irq/<irq>/smp_affinity */
137 entry = create_proc_entry("smp_affinity", 0600, irq_dir[irq]); 135 entry = create_proc_entry("smp_affinity", 0600, irq_desc[irq].dir);
138 136
139 if (entry) { 137 if (entry) {
140 entry->nlink = 1; 138 entry->nlink = 1;
@@ -142,7 +140,6 @@ void register_irq_proc(unsigned int irq)
142 entry->read_proc = irq_affinity_read_proc; 140 entry->read_proc = irq_affinity_read_proc;
143 entry->write_proc = irq_affinity_write_proc; 141 entry->write_proc = irq_affinity_write_proc;
144 } 142 }
145 smp_affinity_entry[irq] = entry;
146 } 143 }
147#endif 144#endif
148} 145}
@@ -152,7 +149,7 @@ void register_irq_proc(unsigned int irq)
152void unregister_handler_proc(unsigned int irq, struct irqaction *action) 149void unregister_handler_proc(unsigned int irq, struct irqaction *action)
153{ 150{
154 if (action->dir) 151 if (action->dir)
155 remove_proc_entry(action->dir->name, irq_dir[irq]); 152 remove_proc_entry(action->dir->name, irq_desc[irq].dir);
156} 153}
157 154
158void init_irq_proc(void) 155void init_irq_proc(void)
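
The write path above still ends in cpumask_parse() followed by desc->chip->set_affinity(), so the user-visible interface is unchanged: writing a hex CPU mask to /proc/irq/<n>/smp_affinity. A small userspace sketch (the IRQ number and mask are made up for illustration):

#include <stdio.h>

int main(void)
{
	/* Pin hypothetical IRQ 19 to CPU0; "1" is a hex cpumask (bit 0 set). */
	FILE *f = fopen("/proc/irq/19/smp_affinity", "w");

	if (!f)
		return 1;
	fputs("1\n", f);
	fclose(f);
	return 0;
}
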
diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c
new file mode 100644
index 000000000000..872f91ba2ce8
--- /dev/null
+++ b/kernel/irq/resend.c
@@ -0,0 +1,78 @@
1/*
2 * linux/kernel/irq/resend.c
3 *
4 * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar
5 * Copyright (C) 2005-2006, Thomas Gleixner
6 *
7 * This file contains the IRQ-resend code
8 *
9 * If the interrupt is waiting to be processed, we try to re-run it.
10 * We can't directly run it from here since the caller might be in an
11 * interrupt-protected region. Not all irq controller chips can
12 * retrigger interrupts at the hardware level, so in those cases
13 * we allow the resending of IRQs via a tasklet.
14 */
15
16#include <linux/irq.h>
17#include <linux/module.h>
18#include <linux/random.h>
19#include <linux/interrupt.h>
20
21#include "internals.h"
22
23#ifdef CONFIG_HARDIRQS_SW_RESEND
24
25/* Bitmap to handle software resend of interrupts: */
26static DECLARE_BITMAP(irqs_resend, NR_IRQS);
27
28/*
29 * Run software resends of IRQ's
30 */
31static void resend_irqs(unsigned long arg)
32{
33 struct irq_desc *desc;
34 int irq;
35
36 while (!bitmap_empty(irqs_resend, NR_IRQS)) {
37 irq = find_first_bit(irqs_resend, NR_IRQS);
38 clear_bit(irq, irqs_resend);
39 desc = irq_desc + irq;
40 local_irq_disable();
41 desc->handle_irq(irq, desc, NULL);
42 local_irq_enable();
43 }
44}
45
46/* Tasklet to handle resend: */
47static DECLARE_TASKLET(resend_tasklet, resend_irqs, 0);
48
49#endif
50
51/*
52 * IRQ resend
53 *
54 * Is called with interrupts disabled and desc->lock held.
55 */
56void check_irq_resend(struct irq_desc *desc, unsigned int irq)
57{
58 unsigned int status = desc->status;
59
60 /*
61 * Make sure the interrupt is enabled, before resending it:
62 */
63 desc->chip->enable(irq);
64
65 if ((status & (IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) {
66 desc->status &= ~IRQ_PENDING;
67 desc->status = status | IRQ_REPLAY;
68
69 if (!desc->chip || !desc->chip->retrigger ||
70 !desc->chip->retrigger(irq)) {
71#ifdef CONFIG_HARDIRQS_SW_RESEND
72 /* Set it pending and activate the softirq: */
73 set_bit(irq, irqs_resend);
74 tasklet_schedule(&resend_tasklet);
75#endif
76 }
77 }
78}
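
To make the fallback concrete, here is a sketch of a hypothetical irq_chip with no hardware retrigger: because .retrigger is left NULL (or returns 0), check_irq_resend() marks the IRQ in irqs_resend and schedules the resend tasklet instead. The chip and its callbacks below are illustrative only, not taken from the patch.

static void sketch_enable(unsigned int irq) { }
static void sketch_disable(unsigned int irq) { }

/* No .retrigger callback: on CONFIG_HARDIRQS_SW_RESEND kernels a pending
 * interrupt is replayed from the resend tasklet above. */
static struct irq_chip sketch_chip = {
	.name		= "sketch",
	.enable		= sketch_enable,
	.disable	= sketch_disable,
};
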
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 7df9abd5ec86..417e98092cf2 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -11,44 +11,44 @@
11#include <linux/kallsyms.h> 11#include <linux/kallsyms.h>
12#include <linux/interrupt.h> 12#include <linux/interrupt.h>
13 13
14static int irqfixup; 14static int irqfixup __read_mostly;
15 15
16/* 16/*
17 * Recovery handler for misrouted interrupts. 17 * Recovery handler for misrouted interrupts.
18 */ 18 */
19
20static int misrouted_irq(int irq, struct pt_regs *regs) 19static int misrouted_irq(int irq, struct pt_regs *regs)
21{ 20{
22 int i; 21 int i;
23 irq_desc_t *desc;
24 int ok = 0; 22 int ok = 0;
25 int work = 0; /* Did we do work for a real IRQ */ 23 int work = 0; /* Did we do work for a real IRQ */
26 24
27 for(i = 1; i < NR_IRQS; i++) { 25 for (i = 1; i < NR_IRQS; i++) {
26 struct irq_desc *desc = irq_desc + i;
28 struct irqaction *action; 27 struct irqaction *action;
29 28
30 if (i == irq) /* Already tried */ 29 if (i == irq) /* Already tried */
31 continue; 30 continue;
32 desc = &irq_desc[i]; 31
33 spin_lock(&desc->lock); 32 spin_lock(&desc->lock);
34 action = desc->action;
35 /* Already running on another processor */ 33 /* Already running on another processor */
36 if (desc->status & IRQ_INPROGRESS) { 34 if (desc->status & IRQ_INPROGRESS) {
37 /* 35 /*
38 * Already running: If it is shared get the other 36 * Already running: If it is shared get the other
39 * CPU to go looking for our mystery interrupt too 37 * CPU to go looking for our mystery interrupt too
40 */ 38 */
41 if (desc->action && (desc->action->flags & SA_SHIRQ)) 39 if (desc->action && (desc->action->flags & IRQF_SHARED))
42 desc->status |= IRQ_PENDING; 40 desc->status |= IRQ_PENDING;
43 spin_unlock(&desc->lock); 41 spin_unlock(&desc->lock);
44 continue; 42 continue;
45 } 43 }
46 /* Honour the normal IRQ locking */ 44 /* Honour the normal IRQ locking */
47 desc->status |= IRQ_INPROGRESS; 45 desc->status |= IRQ_INPROGRESS;
46 action = desc->action;
48 spin_unlock(&desc->lock); 47 spin_unlock(&desc->lock);
48
49 while (action) { 49 while (action) {
50 /* Only shared IRQ handlers are safe to call */ 50 /* Only shared IRQ handlers are safe to call */
51 if (action->flags & SA_SHIRQ) { 51 if (action->flags & IRQF_SHARED) {
52 if (action->handler(i, action->dev_id, regs) == 52 if (action->handler(i, action->dev_id, regs) ==
53 IRQ_HANDLED) 53 IRQ_HANDLED)
54 ok = 1; 54 ok = 1;
@@ -62,9 +62,8 @@ static int misrouted_irq(int irq, struct pt_regs *regs)
62 62
63 /* 63 /*
64 * While we were looking for a fixup someone queued a real 64 * While we were looking for a fixup someone queued a real
65 * IRQ clashing with our walk 65 * IRQ clashing with our walk:
66 */ 66 */
67
68 while ((desc->status & IRQ_PENDING) && action) { 67 while ((desc->status & IRQ_PENDING) && action) {
69 /* 68 /*
70 * Perform real IRQ processing for the IRQ we deferred 69 * Perform real IRQ processing for the IRQ we deferred
@@ -80,8 +79,8 @@ static int misrouted_irq(int irq, struct pt_regs *regs)
80 * If we did actual work for the real IRQ line we must let the 79 * If we did actual work for the real IRQ line we must let the
81 * IRQ controller clean up too 80 * IRQ controller clean up too
82 */ 81 */
83 if(work) 82 if (work && desc->chip && desc->chip->end)
84 desc->handler->end(i); 83 desc->chip->end(i);
85 spin_unlock(&desc->lock); 84 spin_unlock(&desc->lock);
86 } 85 }
87 /* So the caller can adjust the irq error counts */ 86 /* So the caller can adjust the irq error counts */
@@ -100,7 +99,8 @@ static int misrouted_irq(int irq, struct pt_regs *regs)
100 */ 99 */
101 100
102static void 101static void
103__report_bad_irq(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret) 102__report_bad_irq(unsigned int irq, struct irq_desc *desc,
103 irqreturn_t action_ret)
104{ 104{
105 struct irqaction *action; 105 struct irqaction *action;
106 106
@@ -113,6 +113,7 @@ __report_bad_irq(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret)
113 } 113 }
114 dump_stack(); 114 dump_stack();
115 printk(KERN_ERR "handlers:\n"); 115 printk(KERN_ERR "handlers:\n");
116
116 action = desc->action; 117 action = desc->action;
117 while (action) { 118 while (action) {
118 printk(KERN_ERR "[<%p>]", action->handler); 119 printk(KERN_ERR "[<%p>]", action->handler);
@@ -123,7 +124,8 @@ __report_bad_irq(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret)
123 } 124 }
124} 125}
125 126
126static void report_bad_irq(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret) 127static void
128report_bad_irq(unsigned int irq, struct irq_desc *desc, irqreturn_t action_ret)
127{ 129{
128 static int count = 100; 130 static int count = 100;
129 131
@@ -133,12 +135,12 @@ static void report_bad_irq(unsigned int irq, irq_desc_t *desc, irqreturn_t actio
133 } 135 }
134} 136}
135 137
136void note_interrupt(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret, 138void note_interrupt(unsigned int irq, struct irq_desc *desc,
137 struct pt_regs *regs) 139 irqreturn_t action_ret, struct pt_regs *regs)
138{ 140{
139 if (action_ret != IRQ_HANDLED) { 141 if (unlikely(action_ret != IRQ_HANDLED)) {
140 desc->irqs_unhandled++; 142 desc->irqs_unhandled++;
141 if (action_ret != IRQ_NONE) 143 if (unlikely(action_ret != IRQ_NONE))
142 report_bad_irq(irq, desc, action_ret); 144 report_bad_irq(irq, desc, action_ret);
143 } 145 }
144 146
@@ -152,11 +154,11 @@ void note_interrupt(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret,
152 } 154 }
153 155
154 desc->irq_count++; 156 desc->irq_count++;
155 if (desc->irq_count < 100000) 157 if (likely(desc->irq_count < 100000))
156 return; 158 return;
157 159
158 desc->irq_count = 0; 160 desc->irq_count = 0;
159 if (desc->irqs_unhandled > 99900) { 161 if (unlikely(desc->irqs_unhandled > 99900)) {
160 /* 162 /*
161 * The interrupt is stuck 163 * The interrupt is stuck
162 */ 164 */
@@ -166,17 +168,19 @@ void note_interrupt(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret,
166 */ 168 */
167 printk(KERN_EMERG "Disabling IRQ #%d\n", irq); 169 printk(KERN_EMERG "Disabling IRQ #%d\n", irq);
168 desc->status |= IRQ_DISABLED; 170 desc->status |= IRQ_DISABLED;
169 desc->handler->disable(irq); 171 desc->depth = 1;
172 desc->chip->disable(irq);
170 } 173 }
171 desc->irqs_unhandled = 0; 174 desc->irqs_unhandled = 0;
172} 175}
173 176
174int noirqdebug; 177int noirqdebug __read_mostly;
175 178
176int __init noirqdebug_setup(char *str) 179int __init noirqdebug_setup(char *str)
177{ 180{
178 noirqdebug = 1; 181 noirqdebug = 1;
179 printk(KERN_INFO "IRQ lockup detection disabled\n"); 182 printk(KERN_INFO "IRQ lockup detection disabled\n");
183
180 return 1; 184 return 1;
181} 185}
182 186
@@ -187,6 +191,7 @@ static int __init irqfixup_setup(char *str)
187 irqfixup = 1; 191 irqfixup = 1;
188 printk(KERN_WARNING "Misrouted IRQ fixup support enabled.\n"); 192 printk(KERN_WARNING "Misrouted IRQ fixup support enabled.\n");
189 printk(KERN_WARNING "This may impact system performance.\n"); 193 printk(KERN_WARNING "This may impact system performance.\n");
194
190 return 1; 195 return 1;
191} 196}
192 197
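
The thresholds in note_interrupt() amount to a simple ratio test, restated here as a standalone sketch (the helper name is made up): the counters are evaluated once every 100,000 interrupts, and the line is disabled only if more than 99,900 of them (99.9%) went unhandled.

/* Sketch of the spurious-IRQ heuristic; not a function in the patch. */
static int irq_looks_stuck(unsigned int irq_count, unsigned int unhandled)
{
	if (irq_count < 100000)
		return 0;			/* too few samples so far */
	return unhandled > 99900;		/* >99.9% unhandled: stuck */
}
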
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 39277dd6bf90..ab16a5a4cfe9 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -275,8 +275,8 @@ static void upcase_if_global(struct kallsym_iter *iter)
275static int get_ksymbol_mod(struct kallsym_iter *iter) 275static int get_ksymbol_mod(struct kallsym_iter *iter)
276{ 276{
277 iter->owner = module_get_kallsym(iter->pos - kallsyms_num_syms, 277 iter->owner = module_get_kallsym(iter->pos - kallsyms_num_syms,
278 &iter->value, 278 &iter->value, &iter->type,
279 &iter->type, iter->name); 279 iter->name, sizeof(iter->name));
280 if (iter->owner == NULL) 280 if (iter->owner == NULL)
281 return 0; 281 return 0;
282 282
diff --git a/kernel/kexec.c b/kernel/kexec.c
index bf39d28e4c0e..50087ecf337e 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -902,14 +902,14 @@ static int kimage_load_segment(struct kimage *image,
902 * kexec does not sync, or unmount filesystems so if you need 902 * kexec does not sync, or unmount filesystems so if you need
903 * that to happen you need to do that yourself. 903 * that to happen you need to do that yourself.
904 */ 904 */
905struct kimage *kexec_image = NULL; 905struct kimage *kexec_image;
906static struct kimage *kexec_crash_image = NULL; 906struct kimage *kexec_crash_image;
907/* 907/*
908 * A home grown binary mutex. 908 * A home grown binary mutex.
909 * Nothing can wait so this mutex is safe to use 909 * Nothing can wait so this mutex is safe to use
910 * in interrupt context :) 910 * in interrupt context :)
911 */ 911 */
912static int kexec_lock = 0; 912static int kexec_lock;
913 913
914asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments, 914asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments,
915 struct kexec_segment __user *segments, 915 struct kexec_segment __user *segments,
@@ -1042,7 +1042,6 @@ asmlinkage long compat_sys_kexec_load(unsigned long entry,
1042 1042
1043void crash_kexec(struct pt_regs *regs) 1043void crash_kexec(struct pt_regs *regs)
1044{ 1044{
1045 struct kimage *image;
1046 int locked; 1045 int locked;
1047 1046
1048 1047
@@ -1056,12 +1055,11 @@ void crash_kexec(struct pt_regs *regs)
1056 */ 1055 */
1057 locked = xchg(&kexec_lock, 1); 1056 locked = xchg(&kexec_lock, 1);
1058 if (!locked) { 1057 if (!locked) {
1059 image = xchg(&kexec_crash_image, NULL); 1058 if (kexec_crash_image) {
1060 if (image) {
1061 struct pt_regs fixed_regs; 1059 struct pt_regs fixed_regs;
1062 crash_setup_regs(&fixed_regs, regs); 1060 crash_setup_regs(&fixed_regs, regs);
1063 machine_crash_shutdown(&fixed_regs); 1061 machine_crash_shutdown(&fixed_regs);
1064 machine_kexec(image); 1062 machine_kexec(kexec_crash_image);
1065 } 1063 }
1066 xchg(&kexec_lock, 0); 1064 xchg(&kexec_lock, 0);
1067 } 1065 }
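
crash_kexec() keeps using the xchg()-based try-lock described in the comment above; a reduced sketch of that pattern follows (standalone variable names, assuming xchg() is atomic for an int on the platform):

static int crash_lock_sketch;

static void crash_path_sketch(void)
{
	int locked = xchg(&crash_lock_sketch, 1);	/* atomic test-and-set */

	if (!locked) {
		/* single owner: safe to shut down and jump to the crash kernel */
		xchg(&crash_lock_sketch, 0);		/* release */
	}
	/* if already locked, another CPU is crashing: simply return */
}
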
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 20a997c73c3d..1d32defa38ab 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -20,7 +20,6 @@
20*/ 20*/
21#define __KERNEL_SYSCALLS__ 21#define __KERNEL_SYSCALLS__
22 22
23#include <linux/config.h>
24#include <linux/module.h> 23#include <linux/module.h>
25#include <linux/sched.h> 24#include <linux/sched.h>
26#include <linux/syscalls.h> 25#include <linux/syscalls.h>
@@ -234,7 +233,7 @@ static void __call_usermodehelper(void *data)
234int call_usermodehelper_keys(char *path, char **argv, char **envp, 233int call_usermodehelper_keys(char *path, char **argv, char **envp,
235 struct key *session_keyring, int wait) 234 struct key *session_keyring, int wait)
236{ 235{
237 DECLARE_COMPLETION(done); 236 DECLARE_COMPLETION_ONSTACK(done);
238 struct subprocess_info sub_info = { 237 struct subprocess_info sub_info = {
239 .complete = &done, 238 .complete = &done,
240 .path = path, 239 .path = path,
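
The switch to DECLARE_COMPLETION_ONSTACK here ties in with the lock-debugging work elsewhere in this series: an on-stack completion is initialized at run time so each instance is treated separately by the debugging code, rather than sharing one static initializer. A minimal sketch of the idiom (the surrounding function is hypothetical):

static int onstack_completion_sketch(void)
{
	DECLARE_COMPLETION_ONSTACK(done);	/* on-stack, run-time initialized */

	/* Normally &done is handed to a helper that calls complete(&done);
	 * it is completed inline here only to keep the sketch self-contained. */
	complete(&done);
	wait_for_completion(&done);
	return 0;
}
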
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 1fbf466a29aa..64aab081153b 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -47,11 +47,17 @@
47 47
48static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE]; 48static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE];
49static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE]; 49static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE];
50static atomic_t kprobe_count;
50 51
51DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */ 52DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */
52DEFINE_SPINLOCK(kretprobe_lock); /* Protects kretprobe_inst_table */ 53DEFINE_SPINLOCK(kretprobe_lock); /* Protects kretprobe_inst_table */
53static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; 54static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL;
54 55
56static struct notifier_block kprobe_page_fault_nb = {
57 .notifier_call = kprobe_exceptions_notify,
 57 .priority = 0x7fffffff /* we need to be notified first */
59};
60
55#ifdef __ARCH_WANT_KPROBES_INSN_SLOT 61#ifdef __ARCH_WANT_KPROBES_INSN_SLOT
56/* 62/*
57 * kprobe->ainsn.insn points to the copy of the instruction to be 63 * kprobe->ainsn.insn points to the copy of the instruction to be
@@ -368,16 +374,15 @@ static inline void copy_kprobe(struct kprobe *old_p, struct kprobe *p)
368*/ 374*/
369static int __kprobes add_new_kprobe(struct kprobe *old_p, struct kprobe *p) 375static int __kprobes add_new_kprobe(struct kprobe *old_p, struct kprobe *p)
370{ 376{
371 struct kprobe *kp;
372
373 if (p->break_handler) { 377 if (p->break_handler) {
374 list_for_each_entry_rcu(kp, &old_p->list, list) { 378 if (old_p->break_handler)
375 if (kp->break_handler) 379 return -EEXIST;
376 return -EEXIST;
377 }
378 list_add_tail_rcu(&p->list, &old_p->list); 380 list_add_tail_rcu(&p->list, &old_p->list);
381 old_p->break_handler = aggr_break_handler;
379 } else 382 } else
380 list_add_rcu(&p->list, &old_p->list); 383 list_add_rcu(&p->list, &old_p->list);
384 if (p->post_handler && !old_p->post_handler)
385 old_p->post_handler = aggr_post_handler;
381 return 0; 386 return 0;
382} 387}
383 388
@@ -390,9 +395,11 @@ static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p)
390 copy_kprobe(p, ap); 395 copy_kprobe(p, ap);
391 ap->addr = p->addr; 396 ap->addr = p->addr;
392 ap->pre_handler = aggr_pre_handler; 397 ap->pre_handler = aggr_pre_handler;
393 ap->post_handler = aggr_post_handler;
394 ap->fault_handler = aggr_fault_handler; 398 ap->fault_handler = aggr_fault_handler;
395 ap->break_handler = aggr_break_handler; 399 if (p->post_handler)
400 ap->post_handler = aggr_post_handler;
401 if (p->break_handler)
402 ap->break_handler = aggr_break_handler;
396 403
397 INIT_LIST_HEAD(&ap->list); 404 INIT_LIST_HEAD(&ap->list);
398 list_add_rcu(&p->list, &ap->list); 405 list_add_rcu(&p->list, &ap->list);
@@ -464,6 +471,8 @@ static int __kprobes __register_kprobe(struct kprobe *p,
464 old_p = get_kprobe(p->addr); 471 old_p = get_kprobe(p->addr);
465 if (old_p) { 472 if (old_p) {
466 ret = register_aggr_kprobe(old_p, p); 473 ret = register_aggr_kprobe(old_p, p);
474 if (!ret)
475 atomic_inc(&kprobe_count);
467 goto out; 476 goto out;
468 } 477 }
469 478
@@ -474,6 +483,10 @@ static int __kprobes __register_kprobe(struct kprobe *p,
474 hlist_add_head_rcu(&p->hlist, 483 hlist_add_head_rcu(&p->hlist,
475 &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]); 484 &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]);
476 485
486 if (atomic_add_return(1, &kprobe_count) == \
487 (ARCH_INACTIVE_KPROBE_COUNT + 1))
488 register_page_fault_notifier(&kprobe_page_fault_nb);
489
477 arch_arm_kprobe(p); 490 arch_arm_kprobe(p);
478 491
479out: 492out:
@@ -536,14 +549,40 @@ valid_p:
536 kfree(old_p); 549 kfree(old_p);
537 } 550 }
538 arch_remove_kprobe(p); 551 arch_remove_kprobe(p);
552 } else {
553 mutex_lock(&kprobe_mutex);
554 if (p->break_handler)
555 old_p->break_handler = NULL;
556 if (p->post_handler){
557 list_for_each_entry_rcu(list_p, &old_p->list, list){
558 if (list_p->post_handler){
559 cleanup_p = 2;
560 break;
561 }
562 }
563 if (cleanup_p == 0)
564 old_p->post_handler = NULL;
565 }
566 mutex_unlock(&kprobe_mutex);
539 } 567 }
568
569 /* Call unregister_page_fault_notifier()
570 * if no probes are active
571 */
572 mutex_lock(&kprobe_mutex);
573 if (atomic_add_return(-1, &kprobe_count) == \
574 ARCH_INACTIVE_KPROBE_COUNT)
575 unregister_page_fault_notifier(&kprobe_page_fault_nb);
576 mutex_unlock(&kprobe_mutex);
577 return;
540} 578}
541 579
542static struct notifier_block kprobe_exceptions_nb = { 580static struct notifier_block kprobe_exceptions_nb = {
543 .notifier_call = kprobe_exceptions_notify, 581 .notifier_call = kprobe_exceptions_notify,
544 .priority = 0x7fffffff /* we need to notified first */ 582 .priority = 0x7fffffff /* we need to be notified first */
545}; 583};
546 584
585
547int __kprobes register_jprobe(struct jprobe *jp) 586int __kprobes register_jprobe(struct jprobe *jp)
548{ 587{
549 /* Todo: Verify probepoint is a function entry point */ 588 /* Todo: Verify probepoint is a function entry point */
@@ -652,6 +691,7 @@ static int __init init_kprobes(void)
652 INIT_HLIST_HEAD(&kprobe_table[i]); 691 INIT_HLIST_HEAD(&kprobe_table[i]);
653 INIT_HLIST_HEAD(&kretprobe_inst_table[i]); 692 INIT_HLIST_HEAD(&kretprobe_inst_table[i]);
654 } 693 }
694 atomic_set(&kprobe_count, 0);
655 695
656 err = arch_init_kprobes(); 696 err = arch_init_kprobes();
657 if (!err) 697 if (!err)
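
The handler bookkeeping above means an aggregate probe only gains aggr_post_handler or aggr_break_handler while some attached kprobe actually supplies one. A hypothetical caller illustrating the common case (pre-handler only; the probed function and all names are placeholders, not from the patch):

static void sketch_probed_function(void)
{
	/* placeholder function to probe */
}

static int sketch_pre(struct kprobe *p, struct pt_regs *regs)
{
	return 0;	/* let the original instruction execute */
}

static struct kprobe sketch_kp = {
	.addr		= (kprobe_opcode_t *)sketch_probed_function,
	.pre_handler	= sketch_pre,
	/* no post_handler/break_handler: the aggregate probe (if one exists
	 * for this address) keeps those slots NULL as well */
};

/* module init:  register_kprobe(&sketch_kp);
 * module exit:  unregister_kprobe(&sketch_kp); */
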
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index f119e098e67b..e0ffe4ab0917 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -8,12 +8,12 @@
8 * 8 *
9 */ 9 */
10 10
11#include <linux/config.h>
12#include <linux/kobject.h> 11#include <linux/kobject.h>
13#include <linux/string.h> 12#include <linux/string.h>
14#include <linux/sysfs.h> 13#include <linux/sysfs.h>
15#include <linux/module.h> 14#include <linux/module.h>
16#include <linux/init.h> 15#include <linux/init.h>
16#include <linux/kexec.h>
17 17
18#define KERNEL_ATTR_RO(_name) \ 18#define KERNEL_ATTR_RO(_name) \
19static struct subsys_attribute _name##_attr = __ATTR_RO(_name) 19static struct subsys_attribute _name##_attr = __ATTR_RO(_name)
@@ -48,6 +48,20 @@ static ssize_t uevent_helper_store(struct subsystem *subsys, const char *page, s
48KERNEL_ATTR_RW(uevent_helper); 48KERNEL_ATTR_RW(uevent_helper);
49#endif 49#endif
50 50
51#ifdef CONFIG_KEXEC
52static ssize_t kexec_loaded_show(struct subsystem *subsys, char *page)
53{
54 return sprintf(page, "%d\n", !!kexec_image);
55}
56KERNEL_ATTR_RO(kexec_loaded);
57
58static ssize_t kexec_crash_loaded_show(struct subsystem *subsys, char *page)
59{
60 return sprintf(page, "%d\n", !!kexec_crash_image);
61}
62KERNEL_ATTR_RO(kexec_crash_loaded);
63#endif /* CONFIG_KEXEC */
64
51decl_subsys(kernel, NULL, NULL); 65decl_subsys(kernel, NULL, NULL);
52EXPORT_SYMBOL_GPL(kernel_subsys); 66EXPORT_SYMBOL_GPL(kernel_subsys);
53 67
@@ -56,6 +70,10 @@ static struct attribute * kernel_attrs[] = {
56 &uevent_seqnum_attr.attr, 70 &uevent_seqnum_attr.attr,
57 &uevent_helper_attr.attr, 71 &uevent_helper_attr.attr,
58#endif 72#endif
73#ifdef CONFIG_KEXEC
74 &kexec_loaded_attr.attr,
75 &kexec_crash_loaded_attr.attr,
76#endif
59 NULL 77 NULL
60}; 78};
61 79
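
The two new attributes surface as /sys/kernel/kexec_loaded and /sys/kernel/kexec_crash_loaded and read back "0\n" or "1\n". A trivial userspace sketch that checks them (the paths come from this patch, everything else is illustrative):

#include <stdio.h>

static int read_flag(const char *path)
{
	int val = -1;
	FILE *f = fopen(path, "r");

	if (f) {
		if (fscanf(f, "%d", &val) != 1)
			val = -1;
		fclose(f);
	}
	return val;	/* 1 = image loaded, 0 = not loaded, -1 = unreadable */
}

int main(void)
{
	printf("kexec image:       %d\n", read_flag("/sys/kernel/kexec_loaded"));
	printf("crash kexec image: %d\n", read_flag("/sys/kernel/kexec_crash_loaded"));
	return 0;
}
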
diff --git a/kernel/kthread.c b/kernel/kthread.c
index c5f3c6613b6d..4f9c60ef95e8 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -45,6 +45,13 @@ struct kthread_stop_info
45static DEFINE_MUTEX(kthread_stop_lock); 45static DEFINE_MUTEX(kthread_stop_lock);
46static struct kthread_stop_info kthread_stop_info; 46static struct kthread_stop_info kthread_stop_info;
47 47
48/**
49 * kthread_should_stop - should this kthread return now?
50 *
51 * When someone calls kthread_stop on your kthread, it will be woken
52 * and this will return true. You should then return, and your return
53 * value will be passed through to kthread_stop().
54 */
48int kthread_should_stop(void) 55int kthread_should_stop(void)
49{ 56{
50 return (kthread_stop_info.k == current); 57 return (kthread_stop_info.k == current);
@@ -122,6 +129,25 @@ static void keventd_create_kthread(void *_create)
122 complete(&create->done); 129 complete(&create->done);
123} 130}
124 131
132/**
133 * kthread_create - create a kthread.
134 * @threadfn: the function to run until signal_pending(current).
135 * @data: data ptr for @threadfn.
136 * @namefmt: printf-style name for the thread.
137 *
138 * Description: This helper function creates and names a kernel
139 * thread. The thread will be stopped: use wake_up_process() to start
140 * it. See also kthread_run(), kthread_create_on_cpu().
141 *
142 * When woken, the thread will run @threadfn() with @data as its
143 * argument. @threadfn can either call do_exit() directly if it is a
 144 * standalone thread for which no one will call kthread_stop(), or
145 * return when 'kthread_should_stop()' is true (which means
146 * kthread_stop() has been called). The return value should be zero
147 * or a negative error number; it will be passed to kthread_stop().
148 *
149 * Returns a task_struct or ERR_PTR(-ENOMEM).
150 */
125struct task_struct *kthread_create(int (*threadfn)(void *data), 151struct task_struct *kthread_create(int (*threadfn)(void *data),
126 void *data, 152 void *data,
127 const char namefmt[], 153 const char namefmt[],
@@ -156,6 +182,15 @@ struct task_struct *kthread_create(int (*threadfn)(void *data),
156} 182}
157EXPORT_SYMBOL(kthread_create); 183EXPORT_SYMBOL(kthread_create);
158 184
185/**
186 * kthread_bind - bind a just-created kthread to a cpu.
187 * @k: thread created by kthread_create().
188 * @cpu: cpu (might not be online, must be possible) for @k to run on.
189 *
190 * Description: This function is equivalent to set_cpus_allowed(),
191 * except that @cpu doesn't need to be online, and the thread must be
 192 * stopped (i.e., just returned from kthread_create()).
193 */
159void kthread_bind(struct task_struct *k, unsigned int cpu) 194void kthread_bind(struct task_struct *k, unsigned int cpu)
160{ 195{
161 BUG_ON(k->state != TASK_INTERRUPTIBLE); 196 BUG_ON(k->state != TASK_INTERRUPTIBLE);
@@ -166,14 +201,21 @@ void kthread_bind(struct task_struct *k, unsigned int cpu)
166} 201}
167EXPORT_SYMBOL(kthread_bind); 202EXPORT_SYMBOL(kthread_bind);
168 203
204/**
205 * kthread_stop - stop a thread created by kthread_create().
206 * @k: thread created by kthread_create().
207 *
208 * Sets kthread_should_stop() for @k to return true, wakes it, and
209 * waits for it to exit. Your threadfn() must not call do_exit()
210 * itself if you use this function! This can also be called after
211 * kthread_create() instead of calling wake_up_process(): the thread
212 * will exit without calling threadfn().
213 *
214 * Returns the result of threadfn(), or %-EINTR if wake_up_process()
215 * was never called.
216 */
169int kthread_stop(struct task_struct *k) 217int kthread_stop(struct task_struct *k)
170{ 218{
171 return kthread_stop_sem(k, NULL);
172}
173EXPORT_SYMBOL(kthread_stop);
174
175int kthread_stop_sem(struct task_struct *k, struct semaphore *s)
176{
177 int ret; 219 int ret;
178 220
179 mutex_lock(&kthread_stop_lock); 221 mutex_lock(&kthread_stop_lock);
@@ -187,10 +229,7 @@ int kthread_stop_sem(struct task_struct *k, struct semaphore *s)
187 229
188 /* Now set kthread_should_stop() to true, and wake it up. */ 230 /* Now set kthread_should_stop() to true, and wake it up. */
189 kthread_stop_info.k = k; 231 kthread_stop_info.k = k;
190 if (s) 232 wake_up_process(k);
191 up(s);
192 else
193 wake_up_process(k);
194 put_task_struct(k); 233 put_task_struct(k);
195 234
196 /* Once it dies, reset stop ptr, gather result and we're done. */ 235 /* Once it dies, reset stop ptr, gather result and we're done. */
@@ -201,7 +240,7 @@ int kthread_stop_sem(struct task_struct *k, struct semaphore *s)
201 240
202 return ret; 241 return ret;
203} 242}
204EXPORT_SYMBOL(kthread_stop_sem); 243EXPORT_SYMBOL(kthread_stop);
205 244
206static __init int helper_init(void) 245static __init int helper_init(void)
207{ 246{
@@ -210,5 +249,5 @@ static __init int helper_init(void)
210 249
211 return 0; 250 return 0;
212} 251}
213core_initcall(helper_init);
214 252
253core_initcall(helper_init);
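
Putting the newly documented kthread API together, a minimal usage sketch (the thread function and names are hypothetical): create the thread, start it with wake_up_process(), have it poll kthread_should_stop(), and collect its return value with kthread_stop().

static struct task_struct *sketch_task;

static int sketch_thread_fn(void *data)
{
	while (!kthread_should_stop()) {
		/* ... one unit of work ... */
		schedule_timeout_interruptible(HZ);
	}
	return 0;	/* handed back to kthread_stop() */
}

static int sketch_start(void)
{
	sketch_task = kthread_create(sketch_thread_fn, NULL, "sketch-worker");
	if (IS_ERR(sketch_task))
		return PTR_ERR(sketch_task);
	wake_up_process(sketch_task);	/* kthread_create() leaves it stopped */
	return 0;
}

/* teardown:  int ret = kthread_stop(sketch_task); */
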
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
new file mode 100644
index 000000000000..9bad17884513
--- /dev/null
+++ b/kernel/lockdep.c
@@ -0,0 +1,2704 @@
1/*
2 * kernel/lockdep.c
3 *
4 * Runtime locking correctness validator
5 *
6 * Started by Ingo Molnar:
7 *
8 * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
9 *
10 * this code maps all the lock dependencies as they occur in a live kernel
11 * and will warn about the following classes of locking bugs:
12 *
13 * - lock inversion scenarios
14 * - circular lock dependencies
15 * - hardirq/softirq safe/unsafe locking bugs
16 *
17 * Bugs are reported even if the current locking scenario does not cause
18 * any deadlock at this point.
19 *
20 * I.e. if anytime in the past two locks were taken in a different order,
21 * even if it happened for another task, even if those were different
22 * locks (but of the same class as this lock), this code will detect it.
23 *
24 * Thanks to Arjan van de Ven for coming up with the initial idea of
25 * mapping lock dependencies runtime.
26 */
27#include <linux/mutex.h>
28#include <linux/sched.h>
29#include <linux/delay.h>
30#include <linux/module.h>
31#include <linux/proc_fs.h>
32#include <linux/seq_file.h>
33#include <linux/spinlock.h>
34#include <linux/kallsyms.h>
35#include <linux/interrupt.h>
36#include <linux/stacktrace.h>
37#include <linux/debug_locks.h>
38#include <linux/irqflags.h>
39
40#include <asm/sections.h>
41
42#include "lockdep_internals.h"
43
44/*
45 * hash_lock: protects the lockdep hashes and class/list/hash allocators.
46 *
47 * This is one of the rare exceptions where it's justified
 48 * to use a raw spinlock - we really don't want the spinlock
49 * code to recurse back into the lockdep code.
50 */
51static raw_spinlock_t hash_lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
52
53static int lockdep_initialized;
54
55unsigned long nr_list_entries;
56static struct lock_list list_entries[MAX_LOCKDEP_ENTRIES];
57
58/*
59 * Allocate a lockdep entry. (assumes hash_lock held, returns
60 * with NULL on failure)
61 */
62static struct lock_list *alloc_list_entry(void)
63{
64 if (nr_list_entries >= MAX_LOCKDEP_ENTRIES) {
65 __raw_spin_unlock(&hash_lock);
66 debug_locks_off();
67 printk("BUG: MAX_LOCKDEP_ENTRIES too low!\n");
68 printk("turning off the locking correctness validator.\n");
69 return NULL;
70 }
71 return list_entries + nr_list_entries++;
72}
73
74/*
75 * All data structures here are protected by the global debug_lock.
76 *
77 * Mutex key structs only get allocated, once during bootup, and never
78 * get freed - this significantly simplifies the debugging code.
79 */
80unsigned long nr_lock_classes;
81static struct lock_class lock_classes[MAX_LOCKDEP_KEYS];
82
83/*
84 * We keep a global list of all lock classes. The list only grows,
85 * never shrinks. The list is only accessed with the lockdep
86 * spinlock lock held.
87 */
88LIST_HEAD(all_lock_classes);
89
90/*
91 * The lockdep classes are in a hash-table as well, for fast lookup:
92 */
93#define CLASSHASH_BITS (MAX_LOCKDEP_KEYS_BITS - 1)
94#define CLASSHASH_SIZE (1UL << CLASSHASH_BITS)
95#define CLASSHASH_MASK (CLASSHASH_SIZE - 1)
96#define __classhashfn(key) ((((unsigned long)key >> CLASSHASH_BITS) + (unsigned long)key) & CLASSHASH_MASK)
97#define classhashentry(key) (classhash_table + __classhashfn((key)))
98
99static struct list_head classhash_table[CLASSHASH_SIZE];
100
101unsigned long nr_lock_chains;
102static struct lock_chain lock_chains[MAX_LOCKDEP_CHAINS];
103
104/*
105 * We put the lock dependency chains into a hash-table as well, to cache
106 * their existence:
107 */
108#define CHAINHASH_BITS (MAX_LOCKDEP_CHAINS_BITS-1)
109#define CHAINHASH_SIZE (1UL << CHAINHASH_BITS)
110#define CHAINHASH_MASK (CHAINHASH_SIZE - 1)
111#define __chainhashfn(chain) \
112 (((chain >> CHAINHASH_BITS) + chain) & CHAINHASH_MASK)
113#define chainhashentry(chain) (chainhash_table + __chainhashfn((chain)))
114
115static struct list_head chainhash_table[CHAINHASH_SIZE];
116
117/*
118 * The hash key of the lock dependency chains is a hash itself too:
119 * it's a hash of all locks taken up to that lock, including that lock.
120 * It's a 64-bit hash, because it's important for the keys to be
121 * unique.
122 */
123#define iterate_chain_key(key1, key2) \
124 (((key1) << MAX_LOCKDEP_KEYS_BITS/2) ^ \
125 ((key1) >> (64-MAX_LOCKDEP_KEYS_BITS/2)) ^ \
126 (key2))
127
128void lockdep_off(void)
129{
130 current->lockdep_recursion++;
131}
132
133EXPORT_SYMBOL(lockdep_off);
134
135void lockdep_on(void)
136{
137 current->lockdep_recursion--;
138}
139
140EXPORT_SYMBOL(lockdep_on);
141
142int lockdep_internal(void)
143{
144 return current->lockdep_recursion != 0;
145}
146
147EXPORT_SYMBOL(lockdep_internal);
148
149/*
150 * Debugging switches:
151 */
152
153#define VERBOSE 0
154#ifdef VERBOSE
155# define VERY_VERBOSE 0
156#endif
157
158#if VERBOSE
159# define HARDIRQ_VERBOSE 1
160# define SOFTIRQ_VERBOSE 1
161#else
162# define HARDIRQ_VERBOSE 0
163# define SOFTIRQ_VERBOSE 0
164#endif
165
166#if VERBOSE || HARDIRQ_VERBOSE || SOFTIRQ_VERBOSE
167/*
168 * Quick filtering for interesting events:
169 */
170static int class_filter(struct lock_class *class)
171{
172#if 0
173 /* Example */
174 if (class->name_version == 1 &&
175 !strcmp(class->name, "lockname"))
176 return 1;
177 if (class->name_version == 1 &&
178 !strcmp(class->name, "&struct->lockfield"))
179 return 1;
180#endif
 181 /* Allow everything else. Returning 0 would filter out everything else */
182 return 1;
183}
184#endif
185
186static int verbose(struct lock_class *class)
187{
188#if VERBOSE
189 return class_filter(class);
190#endif
191 return 0;
192}
193
194#ifdef CONFIG_TRACE_IRQFLAGS
195
196static int hardirq_verbose(struct lock_class *class)
197{
198#if HARDIRQ_VERBOSE
199 return class_filter(class);
200#endif
201 return 0;
202}
203
204static int softirq_verbose(struct lock_class *class)
205{
206#if SOFTIRQ_VERBOSE
207 return class_filter(class);
208#endif
209 return 0;
210}
211
212#endif
213
214/*
215 * Stack-trace: tightly packed array of stack backtrace
216 * addresses. Protected by the hash_lock.
217 */
218unsigned long nr_stack_trace_entries;
219static unsigned long stack_trace[MAX_STACK_TRACE_ENTRIES];
220
221static int save_trace(struct stack_trace *trace)
222{
223 trace->nr_entries = 0;
224 trace->max_entries = MAX_STACK_TRACE_ENTRIES - nr_stack_trace_entries;
225 trace->entries = stack_trace + nr_stack_trace_entries;
226
227 save_stack_trace(trace, NULL, 0, 3);
228
229 trace->max_entries = trace->nr_entries;
230
231 nr_stack_trace_entries += trace->nr_entries;
232 if (DEBUG_LOCKS_WARN_ON(nr_stack_trace_entries > MAX_STACK_TRACE_ENTRIES))
233 return 0;
234
235 if (nr_stack_trace_entries == MAX_STACK_TRACE_ENTRIES) {
236 __raw_spin_unlock(&hash_lock);
237 if (debug_locks_off()) {
238 printk("BUG: MAX_STACK_TRACE_ENTRIES too low!\n");
239 printk("turning off the locking correctness validator.\n");
240 dump_stack();
241 }
242 return 0;
243 }
244
245 return 1;
246}
247
248unsigned int nr_hardirq_chains;
249unsigned int nr_softirq_chains;
250unsigned int nr_process_chains;
251unsigned int max_lockdep_depth;
252unsigned int max_recursion_depth;
253
254#ifdef CONFIG_DEBUG_LOCKDEP
255/*
256 * We cannot printk in early bootup code. Not even early_printk()
257 * might work. So we mark any initialization errors and printk
258 * about it later on, in lockdep_info().
259 */
260static int lockdep_init_error;
261
262/*
263 * Various lockdep statistics:
264 */
265atomic_t chain_lookup_hits;
266atomic_t chain_lookup_misses;
267atomic_t hardirqs_on_events;
268atomic_t hardirqs_off_events;
269atomic_t redundant_hardirqs_on;
270atomic_t redundant_hardirqs_off;
271atomic_t softirqs_on_events;
272atomic_t softirqs_off_events;
273atomic_t redundant_softirqs_on;
274atomic_t redundant_softirqs_off;
275atomic_t nr_unused_locks;
276atomic_t nr_cyclic_checks;
277atomic_t nr_cyclic_check_recursions;
278atomic_t nr_find_usage_forwards_checks;
279atomic_t nr_find_usage_forwards_recursions;
280atomic_t nr_find_usage_backwards_checks;
281atomic_t nr_find_usage_backwards_recursions;
282# define debug_atomic_inc(ptr) atomic_inc(ptr)
283# define debug_atomic_dec(ptr) atomic_dec(ptr)
284# define debug_atomic_read(ptr) atomic_read(ptr)
285#else
286# define debug_atomic_inc(ptr) do { } while (0)
287# define debug_atomic_dec(ptr) do { } while (0)
288# define debug_atomic_read(ptr) 0
289#endif
290
291/*
292 * Locking printouts:
293 */
294
295static const char *usage_str[] =
296{
297 [LOCK_USED] = "initial-use ",
298 [LOCK_USED_IN_HARDIRQ] = "in-hardirq-W",
299 [LOCK_USED_IN_SOFTIRQ] = "in-softirq-W",
300 [LOCK_ENABLED_SOFTIRQS] = "softirq-on-W",
301 [LOCK_ENABLED_HARDIRQS] = "hardirq-on-W",
302 [LOCK_USED_IN_HARDIRQ_READ] = "in-hardirq-R",
303 [LOCK_USED_IN_SOFTIRQ_READ] = "in-softirq-R",
304 [LOCK_ENABLED_SOFTIRQS_READ] = "softirq-on-R",
305 [LOCK_ENABLED_HARDIRQS_READ] = "hardirq-on-R",
306};
307
308const char * __get_key_name(struct lockdep_subclass_key *key, char *str)
309{
310 unsigned long offs, size;
311 char *modname;
312
313 return kallsyms_lookup((unsigned long)key, &size, &offs, &modname, str);
314}
315
316void
317get_usage_chars(struct lock_class *class, char *c1, char *c2, char *c3, char *c4)
318{
319 *c1 = '.', *c2 = '.', *c3 = '.', *c4 = '.';
320
321 if (class->usage_mask & LOCKF_USED_IN_HARDIRQ)
322 *c1 = '+';
323 else
324 if (class->usage_mask & LOCKF_ENABLED_HARDIRQS)
325 *c1 = '-';
326
327 if (class->usage_mask & LOCKF_USED_IN_SOFTIRQ)
328 *c2 = '+';
329 else
330 if (class->usage_mask & LOCKF_ENABLED_SOFTIRQS)
331 *c2 = '-';
332
333 if (class->usage_mask & LOCKF_ENABLED_HARDIRQS_READ)
334 *c3 = '-';
335 if (class->usage_mask & LOCKF_USED_IN_HARDIRQ_READ) {
336 *c3 = '+';
337 if (class->usage_mask & LOCKF_ENABLED_HARDIRQS_READ)
338 *c3 = '?';
339 }
340
341 if (class->usage_mask & LOCKF_ENABLED_SOFTIRQS_READ)
342 *c4 = '-';
343 if (class->usage_mask & LOCKF_USED_IN_SOFTIRQ_READ) {
344 *c4 = '+';
345 if (class->usage_mask & LOCKF_ENABLED_SOFTIRQS_READ)
346 *c4 = '?';
347 }
348}
349
350static void print_lock_name(struct lock_class *class)
351{
352 char str[128], c1, c2, c3, c4;
353 const char *name;
354
355 get_usage_chars(class, &c1, &c2, &c3, &c4);
356
357 name = class->name;
358 if (!name) {
359 name = __get_key_name(class->key, str);
360 printk(" (%s", name);
361 } else {
362 printk(" (%s", name);
363 if (class->name_version > 1)
364 printk("#%d", class->name_version);
365 if (class->subclass)
366 printk("/%d", class->subclass);
367 }
368 printk("){%c%c%c%c}", c1, c2, c3, c4);
369}
370
371static void print_lockdep_cache(struct lockdep_map *lock)
372{
373 const char *name;
374 char str[128];
375
376 name = lock->name;
377 if (!name)
378 name = __get_key_name(lock->key->subkeys, str);
379
380 printk("%s", name);
381}
382
383static void print_lock(struct held_lock *hlock)
384{
385 print_lock_name(hlock->class);
386 printk(", at: ");
387 print_ip_sym(hlock->acquire_ip);
388}
389
390static void lockdep_print_held_locks(struct task_struct *curr)
391{
392 int i, depth = curr->lockdep_depth;
393
394 if (!depth) {
395 printk("no locks held by %s/%d.\n", curr->comm, curr->pid);
396 return;
397 }
398 printk("%d lock%s held by %s/%d:\n",
399 depth, depth > 1 ? "s" : "", curr->comm, curr->pid);
400
401 for (i = 0; i < depth; i++) {
402 printk(" #%d: ", i);
403 print_lock(curr->held_locks + i);
404 }
405}
406
407static void print_lock_class_header(struct lock_class *class, int depth)
408{
409 int bit;
410
411 printk("%*s->", depth, "");
412 print_lock_name(class);
413 printk(" ops: %lu", class->ops);
414 printk(" {\n");
415
416 for (bit = 0; bit < LOCK_USAGE_STATES; bit++) {
417 if (class->usage_mask & (1 << bit)) {
418 int len = depth;
419
420 len += printk("%*s %s", depth, "", usage_str[bit]);
421 len += printk(" at:\n");
422 print_stack_trace(class->usage_traces + bit, len);
423 }
424 }
425 printk("%*s }\n", depth, "");
426
427 printk("%*s ... key at: ",depth,"");
428 print_ip_sym((unsigned long)class->key);
429}
430
431/*
432 * printk all lock dependencies starting at <entry>:
433 */
434static void print_lock_dependencies(struct lock_class *class, int depth)
435{
436 struct lock_list *entry;
437
438 if (DEBUG_LOCKS_WARN_ON(depth >= 20))
439 return;
440
441 print_lock_class_header(class, depth);
442
443 list_for_each_entry(entry, &class->locks_after, entry) {
444 DEBUG_LOCKS_WARN_ON(!entry->class);
445 print_lock_dependencies(entry->class, depth + 1);
446
447 printk("%*s ... acquired at:\n",depth,"");
448 print_stack_trace(&entry->trace, 2);
449 printk("\n");
450 }
451}
452
453/*
454 * Add a new dependency to the head of the list:
455 */
456static int add_lock_to_list(struct lock_class *class, struct lock_class *this,
457 struct list_head *head, unsigned long ip)
458{
459 struct lock_list *entry;
460 /*
461 * Lock not present yet - get a new dependency struct and
462 * add it to the list:
463 */
464 entry = alloc_list_entry();
465 if (!entry)
466 return 0;
467
468 entry->class = this;
469 save_trace(&entry->trace);
470
471 /*
472 * Since we never remove from the dependency list, the list can
473 * be walked lockless by other CPUs, it's only allocation
474 * that must be protected by the spinlock. But this also means
475 * we must make new entries visible only once writes to the
476 * entry become visible - hence the RCU op:
477 */
478 list_add_tail_rcu(&entry->entry, head);
479
480 return 1;
481}
482
483/*
484 * Recursive, forwards-direction lock-dependency checking, used for
485 * both noncyclic checking and for hardirq-unsafe/softirq-unsafe
486 * checking.
487 *
488 * (to keep the stackframe of the recursive functions small we
489 * use these global variables, and we also mark various helper
490 * functions as noinline.)
491 */
492static struct held_lock *check_source, *check_target;
493
494/*
495 * Print a dependency chain entry (this is only done when a deadlock
496 * has been detected):
497 */
498static noinline int
499print_circular_bug_entry(struct lock_list *target, unsigned int depth)
500{
501 if (debug_locks_silent)
502 return 0;
503 printk("\n-> #%u", depth);
504 print_lock_name(target->class);
505 printk(":\n");
506 print_stack_trace(&target->trace, 6);
507
508 return 0;
509}
510
511/*
512 * When a circular dependency is detected, print the
513 * header first:
514 */
515static noinline int
516print_circular_bug_header(struct lock_list *entry, unsigned int depth)
517{
518 struct task_struct *curr = current;
519
520 __raw_spin_unlock(&hash_lock);
521 debug_locks_off();
522 if (debug_locks_silent)
523 return 0;
524
525 printk("\n=======================================================\n");
526 printk( "[ INFO: possible circular locking dependency detected ]\n");
527 printk( "-------------------------------------------------------\n");
528 printk("%s/%d is trying to acquire lock:\n",
529 curr->comm, curr->pid);
530 print_lock(check_source);
531 printk("\nbut task is already holding lock:\n");
532 print_lock(check_target);
533 printk("\nwhich lock already depends on the new lock.\n\n");
534 printk("\nthe existing dependency chain (in reverse order) is:\n");
535
536 print_circular_bug_entry(entry, depth);
537
538 return 0;
539}
540
541static noinline int print_circular_bug_tail(void)
542{
543 struct task_struct *curr = current;
544 struct lock_list this;
545
546 if (debug_locks_silent)
547 return 0;
548
549 this.class = check_source->class;
550 save_trace(&this.trace);
551 print_circular_bug_entry(&this, 0);
552
553 printk("\nother info that might help us debug this:\n\n");
554 lockdep_print_held_locks(curr);
555
556 printk("\nstack backtrace:\n");
557 dump_stack();
558
559 return 0;
560}
561
562static int noinline print_infinite_recursion_bug(void)
563{
564 __raw_spin_unlock(&hash_lock);
565 DEBUG_LOCKS_WARN_ON(1);
566
567 return 0;
568}
569
570/*
571 * Prove that the dependency graph starting at <entry> can not
572 * lead to <target>. Print an error and return 0 if it does.
573 */
574static noinline int
575check_noncircular(struct lock_class *source, unsigned int depth)
576{
577 struct lock_list *entry;
578
579 debug_atomic_inc(&nr_cyclic_check_recursions);
580 if (depth > max_recursion_depth)
581 max_recursion_depth = depth;
582 if (depth >= 20)
583 return print_infinite_recursion_bug();
584 /*
585 * Check this lock's dependency list:
586 */
587 list_for_each_entry(entry, &source->locks_after, entry) {
588 if (entry->class == check_target->class)
589 return print_circular_bug_header(entry, depth+1);
590 debug_atomic_inc(&nr_cyclic_checks);
591 if (!check_noncircular(entry->class, depth+1))
592 return print_circular_bug_entry(entry, depth+1);
593 }
594 return 1;
595}
596
597static int very_verbose(struct lock_class *class)
598{
599#if VERY_VERBOSE
600 return class_filter(class);
601#endif
602 return 0;
603}
604#ifdef CONFIG_TRACE_IRQFLAGS
605
606/*
607 * Forwards and backwards subgraph searching, for the purposes of
608 * proving that two subgraphs can be connected by a new dependency
609 * without creating any illegal irq-safe -> irq-unsafe lock dependency.
610 */
611static enum lock_usage_bit find_usage_bit;
612static struct lock_class *forwards_match, *backwards_match;
613
614/*
615 * Find a node in the forwards-direction dependency sub-graph starting
616 * at <source> that matches <find_usage_bit>.
617 *
618 * Return 2 if such a node exists in the subgraph, and put that node
619 * into <forwards_match>.
620 *
621 * Return 1 otherwise and keep <forwards_match> unchanged.
622 * Return 0 on error.
623 */
624static noinline int
625find_usage_forwards(struct lock_class *source, unsigned int depth)
626{
627 struct lock_list *entry;
628 int ret;
629
630 if (depth > max_recursion_depth)
631 max_recursion_depth = depth;
632 if (depth >= 20)
633 return print_infinite_recursion_bug();
634
635 debug_atomic_inc(&nr_find_usage_forwards_checks);
636 if (source->usage_mask & (1 << find_usage_bit)) {
637 forwards_match = source;
638 return 2;
639 }
640
641 /*
642 * Check this lock's dependency list:
643 */
644 list_for_each_entry(entry, &source->locks_after, entry) {
645 debug_atomic_inc(&nr_find_usage_forwards_recursions);
646 ret = find_usage_forwards(entry->class, depth+1);
647 if (ret == 2 || ret == 0)
648 return ret;
649 }
650 return 1;
651}
652
653/*
654 * Find a node in the backwards-direction dependency sub-graph starting
655 * at <source> that matches <find_usage_bit>.
656 *
657 * Return 2 if such a node exists in the subgraph, and put that node
658 * into <backwards_match>.
659 *
660 * Return 1 otherwise and keep <backwards_match> unchanged.
661 * Return 0 on error.
662 */
663static noinline int
664find_usage_backwards(struct lock_class *source, unsigned int depth)
665{
666 struct lock_list *entry;
667 int ret;
668
669 if (depth > max_recursion_depth)
670 max_recursion_depth = depth;
671 if (depth >= 20)
672 return print_infinite_recursion_bug();
673
674 debug_atomic_inc(&nr_find_usage_backwards_checks);
675 if (source->usage_mask & (1 << find_usage_bit)) {
676 backwards_match = source;
677 return 2;
678 }
679
680 /*
681 * Check this lock's dependency list:
682 */
683 list_for_each_entry(entry, &source->locks_before, entry) {
684 debug_atomic_inc(&nr_find_usage_backwards_recursions);
685 ret = find_usage_backwards(entry->class, depth+1);
686 if (ret == 2 || ret == 0)
687 return ret;
688 }
689 return 1;
690}
691
692static int
693print_bad_irq_dependency(struct task_struct *curr,
694 struct held_lock *prev,
695 struct held_lock *next,
696 enum lock_usage_bit bit1,
697 enum lock_usage_bit bit2,
698 const char *irqclass)
699{
700 __raw_spin_unlock(&hash_lock);
701 debug_locks_off();
702 if (debug_locks_silent)
703 return 0;
704
705 printk("\n======================================================\n");
706 printk( "[ INFO: %s-safe -> %s-unsafe lock order detected ]\n",
707 irqclass, irqclass);
708 printk( "------------------------------------------------------\n");
709 printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] is trying to acquire:\n",
710 curr->comm, curr->pid,
711 curr->hardirq_context, hardirq_count() >> HARDIRQ_SHIFT,
712 curr->softirq_context, softirq_count() >> SOFTIRQ_SHIFT,
713 curr->hardirqs_enabled,
714 curr->softirqs_enabled);
715 print_lock(next);
716
717 printk("\nand this task is already holding:\n");
718 print_lock(prev);
719 printk("which would create a new lock dependency:\n");
720 print_lock_name(prev->class);
721 printk(" ->");
722 print_lock_name(next->class);
723 printk("\n");
724
725 printk("\nbut this new dependency connects a %s-irq-safe lock:\n",
726 irqclass);
727 print_lock_name(backwards_match);
728 printk("\n... which became %s-irq-safe at:\n", irqclass);
729
730 print_stack_trace(backwards_match->usage_traces + bit1, 1);
731
732 printk("\nto a %s-irq-unsafe lock:\n", irqclass);
733 print_lock_name(forwards_match);
734 printk("\n... which became %s-irq-unsafe at:\n", irqclass);
735 printk("...");
736
737 print_stack_trace(forwards_match->usage_traces + bit2, 1);
738
739 printk("\nother info that might help us debug this:\n\n");
740 lockdep_print_held_locks(curr);
741
742 printk("\nthe %s-irq-safe lock's dependencies:\n", irqclass);
743 print_lock_dependencies(backwards_match, 0);
744
745 printk("\nthe %s-irq-unsafe lock's dependencies:\n", irqclass);
746 print_lock_dependencies(forwards_match, 0);
747
748 printk("\nstack backtrace:\n");
749 dump_stack();
750
751 return 0;
752}
753
754static int
755check_usage(struct task_struct *curr, struct held_lock *prev,
756 struct held_lock *next, enum lock_usage_bit bit_backwards,
757 enum lock_usage_bit bit_forwards, const char *irqclass)
758{
759 int ret;
760
761 find_usage_bit = bit_backwards;
762 /* fills in <backwards_match> */
763 ret = find_usage_backwards(prev->class, 0);
764 if (!ret || ret == 1)
765 return ret;
766
767 find_usage_bit = bit_forwards;
768 ret = find_usage_forwards(next->class, 0);
769 if (!ret || ret == 1)
770 return ret;
771 /* ret == 2 */
772 return print_bad_irq_dependency(curr, prev, next,
773 bit_backwards, bit_forwards, irqclass);
774}
775
776#endif
777
778static int
779print_deadlock_bug(struct task_struct *curr, struct held_lock *prev,
780 struct held_lock *next)
781{
782 debug_locks_off();
783 __raw_spin_unlock(&hash_lock);
784 if (debug_locks_silent)
785 return 0;
786
787 printk("\n=============================================\n");
788 printk( "[ INFO: possible recursive locking detected ]\n");
789 printk( "---------------------------------------------\n");
790 printk("%s/%d is trying to acquire lock:\n",
791 curr->comm, curr->pid);
792 print_lock(next);
793 printk("\nbut task is already holding lock:\n");
794 print_lock(prev);
795
796 printk("\nother info that might help us debug this:\n");
797 lockdep_print_held_locks(curr);
798
799 printk("\nstack backtrace:\n");
800 dump_stack();
801
802 return 0;
803}
804
805/*
806 * Check whether we are holding such a class already.
807 *
808 * (Note that this has to be done separately, because the graph cannot
809 * detect such classes of deadlocks.)
810 *
811 * Returns: 0 on deadlock detected, 1 on OK, 2 on recursive read
812 */
813static int
814check_deadlock(struct task_struct *curr, struct held_lock *next,
815 struct lockdep_map *next_instance, int read)
816{
817 struct held_lock *prev;
818 int i;
819
820 for (i = 0; i < curr->lockdep_depth; i++) {
821 prev = curr->held_locks + i;
822 if (prev->class != next->class)
823 continue;
824 /*
825 * Allow read-after-read recursion of the same
826 * lock class (i.e. read_lock(lock)+read_lock(lock)):
827 */
828 if ((read == 2) && prev->read)
829 return 2;
830 return print_deadlock_bug(curr, prev, next);
831 }
832 return 1;
833}
834
835/*
836 * There was a chain-cache miss, and we are about to add a new dependency
837 * to a previous lock. We recursively validate the following rules:
838 *
839 * - would the adding of the <prev> -> <next> dependency create a
840 * circular dependency in the graph? [== circular deadlock]
841 *
842 * - does the new prev->next dependency connect any hardirq-safe lock
843 * (in the full backwards-subgraph starting at <prev>) with any
844 * hardirq-unsafe lock (in the full forwards-subgraph starting at
845 * <next>)? [== illegal lock inversion with hardirq contexts]
846 *
847 * - does the new prev->next dependency connect any softirq-safe lock
848 * (in the full backwards-subgraph starting at <prev>) with any
849 * softirq-unsafe lock (in the full forwards-subgraph starting at
850 * <next>)? [== illegal lock inversion with softirq contexts]
851 *
852 * any of these scenarios could lead to a deadlock.
853 *
854 * Then if all the validations pass, we add the forwards and backwards
855 * dependency.
856 */
857static int
858check_prev_add(struct task_struct *curr, struct held_lock *prev,
859 struct held_lock *next)
860{
861 struct lock_list *entry;
862 int ret;
863
864 /*
865 * Prove that the new <prev> -> <next> dependency would not
866 * create a circular dependency in the graph. (We do this by
867 * forward-recursing into the graph starting at <next>, and
868 * checking whether we can reach <prev>.)
869 *
870 * We are using global variables to control the recursion, to
871 * keep the stackframe size of the recursive functions low:
872 */
873 check_source = next;
874 check_target = prev;
875 if (!(check_noncircular(next->class, 0)))
876 return print_circular_bug_tail();
877
878#ifdef CONFIG_TRACE_IRQFLAGS
879 /*
880 * Prove that the new dependency does not connect a hardirq-safe
881 * lock with a hardirq-unsafe lock - to achieve this we search
882 * the backwards-subgraph starting at <prev>, and the
883 * forwards-subgraph starting at <next>:
884 */
885 if (!check_usage(curr, prev, next, LOCK_USED_IN_HARDIRQ,
886 LOCK_ENABLED_HARDIRQS, "hard"))
887 return 0;
888
889 /*
890 * Prove that the new dependency does not connect a hardirq-safe-read
891 * lock with a hardirq-unsafe lock - to achieve this we search
892 * the backwards-subgraph starting at <prev>, and the
893 * forwards-subgraph starting at <next>:
894 */
895 if (!check_usage(curr, prev, next, LOCK_USED_IN_HARDIRQ_READ,
896 LOCK_ENABLED_HARDIRQS, "hard-read"))
897 return 0;
898
899 /*
900 * Prove that the new dependency does not connect a softirq-safe
901 * lock with a softirq-unsafe lock - to achieve this we search
902 * the backwards-subgraph starting at <prev>, and the
903 * forwards-subgraph starting at <next>:
904 */
905 if (!check_usage(curr, prev, next, LOCK_USED_IN_SOFTIRQ,
906 LOCK_ENABLED_SOFTIRQS, "soft"))
907 return 0;
908 /*
909 * Prove that the new dependency does not connect a softirq-safe-read
910 * lock with a softirq-unsafe lock - to achieve this we search
911 * the backwards-subgraph starting at <prev>, and the
912 * forwards-subgraph starting at <next>:
913 */
914 if (!check_usage(curr, prev, next, LOCK_USED_IN_SOFTIRQ_READ,
915 LOCK_ENABLED_SOFTIRQS, "soft"))
916 return 0;
917#endif
918 /*
919 * For recursive read-locks we do all the dependency checks,
920 * but we dont store read-triggered dependencies (only
921 * write-triggered dependencies). This ensures that only the
922 * write-side dependencies matter, and that if for example a
923 * write-lock never takes any other locks, then the reads are
924 * equivalent to a NOP.
925 */
926 if (next->read == 2 || prev->read == 2)
927 return 1;
928 /*
929 * Is the <prev> -> <next> dependency already present?
930 *
931 * (this may occur even though this is a new chain: consider
932 * e.g. the L1 -> L2 -> L3 -> L4 and the L5 -> L1 -> L2 -> L3
933 * chains - the second one will be new, but L1 already has
934 * L2 added to its dependency list, due to the first chain.)
935 */
936 list_for_each_entry(entry, &prev->class->locks_after, entry) {
937 if (entry->class == next->class)
938 return 2;
939 }
940
941 /*
942 * Ok, all validations passed, add the new lock
943 * to the previous lock's dependency list:
944 */
945 ret = add_lock_to_list(prev->class, next->class,
946 &prev->class->locks_after, next->acquire_ip);
947 if (!ret)
948 return 0;
949 /*
950 * Return value of 2 signals 'dependency already added',
951 * in that case we don't have to add the backlink either.
952 */
953 if (ret == 2)
954 return 2;
955 ret = add_lock_to_list(next->class, prev->class,
956 &next->class->locks_before, next->acquire_ip);
957
958 /*
959 * Debugging printouts:
960 */
961 if (verbose(prev->class) || verbose(next->class)) {
962 __raw_spin_unlock(&hash_lock);
963 printk("\n new dependency: ");
964 print_lock_name(prev->class);
965 printk(" => ");
966 print_lock_name(next->class);
967 printk("\n");
968 dump_stack();
969 __raw_spin_lock(&hash_lock);
970 }
971 return 1;
972}
973
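An editor's aside for readers working through check_prev_add(): the circular-dependency part boils down to a reachability query - before recording prev -> next, ask whether prev can already be reached from next. A minimal userspace sketch of that idea (a toy adjacency matrix plus a depth-first search; nothing here is the kernel's lock_list machinery):

    #include <stdio.h>

    #define NCLASSES 8                      /* toy limit on lock classes */

    static int edge[NCLASSES][NCLASSES];    /* edge[a][b] != 0: a -> b recorded */

    /* Can 'target' be reached from 'from' by following recorded edges? */
    static int reachable(int from, int target, int *visited)
    {
        int i;

        if (from == target)
            return 1;
        visited[from] = 1;
        for (i = 0; i < NCLASSES; i++)
            if (edge[from][i] && !visited[i] && reachable(i, target, visited))
                return 1;
        return 0;
    }

    /* Refuse to add prev -> next if that would close a cycle. */
    static int add_dependency(int prev, int next)
    {
        int visited[NCLASSES] = { 0 };

        if (reachable(next, prev, visited)) {
            printf("refused %d -> %d: would create a cycle\n", prev, next);
            return 0;
        }
        edge[prev][next] = 1;
        return 1;
    }

    int main(void)
    {
        add_dependency(0, 1);   /* A -> B */
        add_dependency(1, 2);   /* B -> C */
        add_dependency(2, 0);   /* C -> A: rejected, A already reaches C */
        return 0;
    }

The irq-safety validations described in the comment above reuse the same forwards/backwards graph walks, just looking for usage bits instead of a target node.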
974/*
975 * Add the dependency to all directly-previous locks that are 'relevant'.
976 * The ones that are relevant are (in increasing distance from curr):
977 * all consecutive trylock entries and the final non-trylock entry - or
978 * the end of this context's lock-chain - whichever comes first.
979 */
980static int
981check_prevs_add(struct task_struct *curr, struct held_lock *next)
982{
983 int depth = curr->lockdep_depth;
984 struct held_lock *hlock;
985
986 /*
987 * Debugging checks.
988 *
989 * Depth must not be zero for a non-head lock:
990 */
991 if (!depth)
992 goto out_bug;
993 /*
994 * At least two relevant locks must exist for this
995 * to be a head:
996 */
997 if (curr->held_locks[depth].irq_context !=
998 curr->held_locks[depth-1].irq_context)
999 goto out_bug;
1000
1001 for (;;) {
1002 hlock = curr->held_locks + depth-1;
1003 /*
1004 * Only non-recursive-read entries get new dependencies
1005 * added:
1006 */
1007 if (hlock->read != 2) {
1008 check_prev_add(curr, hlock, next);
1009 /*
1010 * Stop after the first non-trylock entry,
1011 * as non-trylock entries have added their
1012 * own direct dependencies already, so this
1013 * lock is connected to them indirectly:
1014 */
1015 if (!hlock->trylock)
1016 break;
1017 }
1018 depth--;
1019 /*
1020 * End of lock-stack?
1021 */
1022 if (!depth)
1023 break;
1024 /*
1025 * Stop the search if we cross into another context:
1026 */
1027 if (curr->held_locks[depth].irq_context !=
1028 curr->held_locks[depth-1].irq_context)
1029 break;
1030 }
1031 return 1;
1032out_bug:
1033 __raw_spin_unlock(&hash_lock);
1034 DEBUG_LOCKS_WARN_ON(1);
1035
1036 return 0;
1037}
1038
1039
1040/*
1041 * Is this the address of a static object:
1042 */
1043static int static_obj(void *obj)
1044{
1045 unsigned long start = (unsigned long) &_stext,
1046 end = (unsigned long) &_end,
1047 addr = (unsigned long) obj;
1048#ifdef CONFIG_SMP
1049 int i;
1050#endif
1051
1052 /*
1053 * static variable?
1054 */
1055 if ((addr >= start) && (addr < end))
1056 return 1;
1057
1058#ifdef CONFIG_SMP
1059 /*
1060 * percpu var?
1061 */
1062 for_each_possible_cpu(i) {
1063 start = (unsigned long) &__per_cpu_start + per_cpu_offset(i);
1064 end = (unsigned long) &__per_cpu_end + per_cpu_offset(i);
1065
1066 if ((addr >= start) && (addr < end))
1067 return 1;
1068 }
1069#endif
1070
1071 /*
1072 * module var?
1073 */
1074 return is_module_address(addr);
1075}
1076
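static_obj() above is essentially an address-range test against linker-provided boundaries (plus the per-cpu and module cases). A rough userspace analogue, assuming the traditional etext/end symbols documented in end(3) - good enough to show the idea, not a faithful port:

    #include <stdio.h>

    extern char etext, end;         /* linker-provided boundaries, see end(3) */

    /* Very rough: "static" if the address sits inside the program's data image. */
    static int looks_static(const void *obj)
    {
        const char *addr = obj;

        return addr >= &etext && addr < &end;
    }

    static int some_global;

    int main(void)
    {
        int on_stack;

        printf("global: %d, stack: %d\n",
               looks_static(&some_global), looks_static(&on_stack));
        return 0;
    }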
1077/*
1078 * To make lock name printouts unique, we calculate a unique
1079 * class->name_version generation counter:
1080 */
1081static int count_matching_names(struct lock_class *new_class)
1082{
1083 struct lock_class *class;
1084 int count = 0;
1085
1086 if (!new_class->name)
1087 return 0;
1088
1089 list_for_each_entry(class, &all_lock_classes, lock_entry) {
1090 if (new_class->key - new_class->subclass == class->key)
1091 return class->name_version;
1092 if (class->name && !strcmp(class->name, new_class->name))
1093 count = max(count, class->name_version);
1094 }
1095
1096 return count + 1;
1097}
1098
1099extern void __error_too_big_MAX_LOCKDEP_SUBCLASSES(void);
1100
1101/*
1102 * Look up a lock's class in the hash-table. If the class is not present
1103 * yet we return NULL. The result is cached in the lock object by the
1104 * caller, so the actual hash lookup happens at most once per lock object.
1105 */
1106static inline struct lock_class *
1107look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
1108{
1109 struct lockdep_subclass_key *key;
1110 struct list_head *hash_head;
1111 struct lock_class *class;
1112
1113#ifdef CONFIG_DEBUG_LOCKDEP
1114 /*
1115 * If the architecture calls into lockdep before initializing
1116 * the hashes then we'll warn about it later. (we cannot printk
1117 * right now)
1118 */
1119 if (unlikely(!lockdep_initialized)) {
1120 lockdep_init();
1121 lockdep_init_error = 1;
1122 }
1123#endif
1124
1125 /*
1126 * Static locks do not have their class-keys yet - for them the key
1127 * is the lock object itself:
1128 */
1129 if (unlikely(!lock->key))
1130 lock->key = (void *)lock;
1131
1132 /*
1133 * NOTE: the class-key must be unique. For dynamic locks, a static
1134 * lock_class_key variable is passed in through the mutex_init()
1135 * (or spin_lock_init()) call - which acts as the key. For static
1136 * locks we use the lock object itself as the key.
1137 */
1138 if (sizeof(struct lock_class_key) > sizeof(struct lock_class))
1139 __error_too_big_MAX_LOCKDEP_SUBCLASSES();
1140
1141 key = lock->key->subkeys + subclass;
1142
1143 hash_head = classhashentry(key);
1144
1145 /*
1146 * We can walk the hash lockfree, because the hash only
1147 * grows, and we are careful when adding entries to the end:
1148 */
1149 list_for_each_entry(class, hash_head, hash_entry)
1150 if (class->key == key)
1151 return class;
1152
1153 return NULL;
1154}
1155
1156/*
1157 * Register a lock's class in the hash-table, if the class is not present
1158 * yet. Otherwise we look it up. We cache the result in the lock object
1159 * itself, so actual lookup of the hash should be once per lock object.
1160 */
1161static inline struct lock_class *
1162register_lock_class(struct lockdep_map *lock, unsigned int subclass)
1163{
1164 struct lockdep_subclass_key *key;
1165 struct list_head *hash_head;
1166 struct lock_class *class;
1167
1168 class = look_up_lock_class(lock, subclass);
1169 if (likely(class))
1170 return class;
1171
1172 /*
1173 * Debug-check: all keys must be persistent!
1174 */
1175 if (!static_obj(lock->key)) {
1176 debug_locks_off();
1177 printk("INFO: trying to register non-static key.\n");
1178 printk("the code is fine but needs lockdep annotation.\n");
1179 printk("turning off the locking correctness validator.\n");
1180 dump_stack();
1181
1182 return NULL;
1183 }
1184
1185 key = lock->key->subkeys + subclass;
1186 hash_head = classhashentry(key);
1187
1188 __raw_spin_lock(&hash_lock);
1189 /*
1190 * We have to do the hash-walk again, to avoid races
1191 * with another CPU:
1192 */
1193 list_for_each_entry(class, hash_head, hash_entry)
1194 if (class->key == key)
1195 goto out_unlock_set;
1196 /*
1197 * Allocate a new key from the static array, and add it to
1198 * the hash:
1199 */
1200 if (nr_lock_classes >= MAX_LOCKDEP_KEYS) {
1201 __raw_spin_unlock(&hash_lock);
1202 debug_locks_off();
1203 printk("BUG: MAX_LOCKDEP_KEYS too low!\n");
1204 printk("turning off the locking correctness validator.\n");
1205 return NULL;
1206 }
1207 class = lock_classes + nr_lock_classes++;
1208 debug_atomic_inc(&nr_unused_locks);
1209 class->key = key;
1210 class->name = lock->name;
1211 class->subclass = subclass;
1212 INIT_LIST_HEAD(&class->lock_entry);
1213 INIT_LIST_HEAD(&class->locks_before);
1214 INIT_LIST_HEAD(&class->locks_after);
1215 class->name_version = count_matching_names(class);
1216 /*
1217 * We use RCU's safe list-add method to make
1218 * parallel walking of the hash-list safe:
1219 */
1220 list_add_tail_rcu(&class->hash_entry, hash_head);
1221
1222 if (verbose(class)) {
1223 __raw_spin_unlock(&hash_lock);
1224 printk("\nnew class %p: %s", class->key, class->name);
1225 if (class->name_version > 1)
1226 printk("#%d", class->name_version);
1227 printk("\n");
1228 dump_stack();
1229 __raw_spin_lock(&hash_lock);
1230 }
1231out_unlock_set:
1232 __raw_spin_unlock(&hash_lock);
1233
1234 if (!subclass)
1235 lock->class_cache = class;
1236
1237 DEBUG_LOCKS_WARN_ON(class->subclass != subclass);
1238
1239 return class;
1240}
1241
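One pattern in register_lock_class() deserves a callout: look up lock-free, and only if nothing is found take the lock and look up again before inserting, so two CPUs racing on the same key cannot both register it. A compact pthread/C11 sketch of that double-check pattern (a single bucket and a plain mutex stand in for the RCU-protected hash used here):

    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdlib.h>

    struct entry {
        struct entry *next;
        const void *key;
    };

    /* One bucket is enough to show the pattern; real code hashes the key. */
    static _Atomic(struct entry *) bucket;
    static pthread_mutex_t bucket_lock = PTHREAD_MUTEX_INITIALIZER;

    static struct entry *find(const void *key)
    {
        struct entry *e = atomic_load_explicit(&bucket, memory_order_acquire);

        for (; e; e = e->next)          /* lock-free walk: the list only grows */
            if (e->key == key)
                return e;
        return NULL;
    }

    static struct entry *find_or_add(const void *key)
    {
        struct entry *e = find(key);

        if (e)
            return e;                   /* common case: no lock taken */

        pthread_mutex_lock(&bucket_lock);
        e = find(key);                  /* re-check: another thread may have won */
        if (!e) {
            e = calloc(1, sizeof(*e));
            if (e) {
                e->key = key;
                e->next = atomic_load_explicit(&bucket, memory_order_relaxed);
                /* release-publish so readers see a fully initialized entry */
                atomic_store_explicit(&bucket, e, memory_order_release);
            }
        }
        pthread_mutex_unlock(&bucket_lock);
        return e;
    }

    int main(void)
    {
        static int key1;

        return find_or_add(&key1) == find_or_add(&key1) ? 0 : 1;
    }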
1242/*
1243 * Look up a dependency chain. If the key is not present yet then
1244 * add it and return 0 - in this case the new dependency chain is
1245 * validated. If the key is already hashed, return 1.
1246 */
1247static inline int lookup_chain_cache(u64 chain_key)
1248{
1249 struct list_head *hash_head = chainhashentry(chain_key);
1250 struct lock_chain *chain;
1251
1252 DEBUG_LOCKS_WARN_ON(!irqs_disabled());
1253 /*
1254 * We can walk it lock-free, because entries only get added
1255 * to the hash:
1256 */
1257 list_for_each_entry(chain, hash_head, entry) {
1258 if (chain->chain_key == chain_key) {
1259cache_hit:
1260 debug_atomic_inc(&chain_lookup_hits);
1261 /*
1262 * In the debugging case, force redundant checking
1263 * by returning 1:
1264 */
1265#ifdef CONFIG_DEBUG_LOCKDEP
1266 __raw_spin_lock(&hash_lock);
1267 return 1;
1268#endif
1269 return 0;
1270 }
1271 }
1272 /*
1273 * Allocate a new chain entry from the static array, and add
1274 * it to the hash:
1275 */
1276 __raw_spin_lock(&hash_lock);
1277 /*
1278 * We have to walk the chain again locked - to avoid duplicates:
1279 */
1280 list_for_each_entry(chain, hash_head, entry) {
1281 if (chain->chain_key == chain_key) {
1282 __raw_spin_unlock(&hash_lock);
1283 goto cache_hit;
1284 }
1285 }
1286 if (unlikely(nr_lock_chains >= MAX_LOCKDEP_CHAINS)) {
1287 __raw_spin_unlock(&hash_lock);
1288 debug_locks_off();
1289 printk("BUG: MAX_LOCKDEP_CHAINS too low!\n");
1290 printk("turning off the locking correctness validator.\n");
1291 return 0;
1292 }
1293 chain = lock_chains + nr_lock_chains++;
1294 chain->chain_key = chain_key;
1295 list_add_tail_rcu(&chain->entry, hash_head);
1296 debug_atomic_inc(&chain_lookup_misses);
1297#ifdef CONFIG_TRACE_IRQFLAGS
1298 if (current->hardirq_context)
1299 nr_hardirq_chains++;
1300 else {
1301 if (current->softirq_context)
1302 nr_softirq_chains++;
1303 else
1304 nr_process_chains++;
1305 }
1306#else
1307 nr_process_chains++;
1308#endif
1309
1310 return 1;
1311}
1312
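The chain cache is what keeps the validator cheap once the system has warmed up: a 64-bit key summarizing the stack of held classes is looked up, and the expensive graph checks only run the first time a chain is seen. A toy model of that memoization - the fold_key() combiner below is made up and is not the kernel's iterate_chain_key(), whose definition is elsewhere in the file:

    #include <stdint.h>
    #include <stdio.h>

    #define MAX_CHAINS 64

    static uint64_t seen[MAX_CHAINS];
    static unsigned int nr_seen;

    /* Hypothetical combiner: fold the next class id into the running key. */
    static uint64_t fold_key(uint64_t key, unsigned int class_id)
    {
        return (key * 1000003u) ^ class_id;
    }

    /* Returns 1 if this chain is new (caller should run the full checks). */
    static int chain_is_new(uint64_t chain_key)
    {
        unsigned int i;

        for (i = 0; i < nr_seen; i++)
            if (seen[i] == chain_key)
                return 0;               /* cache hit: skip re-validation */
        if (nr_seen < MAX_CHAINS)
            seen[nr_seen++] = chain_key;
        return 1;
    }

    int main(void)
    {
        unsigned int stack1[] = { 3, 7, 12 };   /* class ids of held locks */
        unsigned int i;
        uint64_t key = 0;

        for (i = 0; i < 3; i++)
            key = fold_key(key, stack1[i]);

        printf("first time: %d\n", chain_is_new(key));   /* 1: validate */
        printf("second time: %d\n", chain_is_new(key));  /* 0: cached   */
        return 0;
    }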
1313/*
1314 * We are building curr_chain_key incrementally, so double-check
1315 * it from scratch, to make sure that it's done correctly:
1316 */
1317static void check_chain_key(struct task_struct *curr)
1318{
1319#ifdef CONFIG_DEBUG_LOCKDEP
1320 struct held_lock *hlock, *prev_hlock = NULL;
1321 unsigned int i, id;
1322 u64 chain_key = 0;
1323
1324 for (i = 0; i < curr->lockdep_depth; i++) {
1325 hlock = curr->held_locks + i;
1326 if (chain_key != hlock->prev_chain_key) {
1327 debug_locks_off();
1328 printk("hm#1, depth: %u [%u], %016Lx != %016Lx\n",
1329 curr->lockdep_depth, i,
1330 (unsigned long long)chain_key,
1331 (unsigned long long)hlock->prev_chain_key);
1332 WARN_ON(1);
1333 return;
1334 }
1335 id = hlock->class - lock_classes;
1336 DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS);
1337 if (prev_hlock && (prev_hlock->irq_context !=
1338 hlock->irq_context))
1339 chain_key = 0;
1340 chain_key = iterate_chain_key(chain_key, id);
1341 prev_hlock = hlock;
1342 }
1343 if (chain_key != curr->curr_chain_key) {
1344 debug_locks_off();
1345 printk("hm#2, depth: %u [%u], %016Lx != %016Lx\n",
1346 curr->lockdep_depth, i,
1347 (unsigned long long)chain_key,
1348 (unsigned long long)curr->curr_chain_key);
1349 WARN_ON(1);
1350 }
1351#endif
1352}
1353
1354#ifdef CONFIG_TRACE_IRQFLAGS
1355
1356/*
1357 * print irq inversion bug:
1358 */
1359static int
1360print_irq_inversion_bug(struct task_struct *curr, struct lock_class *other,
1361 struct held_lock *this, int forwards,
1362 const char *irqclass)
1363{
1364 __raw_spin_unlock(&hash_lock);
1365 debug_locks_off();
1366 if (debug_locks_silent)
1367 return 0;
1368
1369 printk("\n=========================================================\n");
1370 printk( "[ INFO: possible irq lock inversion dependency detected ]\n");
1371 printk( "---------------------------------------------------------\n");
1372 printk("%s/%d just changed the state of lock:\n",
1373 curr->comm, curr->pid);
1374 print_lock(this);
1375 if (forwards)
1376 printk("but this lock took another, %s-irq-unsafe lock in the past:\n", irqclass);
1377 else
1378 printk("but this lock was taken by another, %s-irq-safe lock in the past:\n", irqclass);
1379 print_lock_name(other);
1380 printk("\n\nand interrupts could create inverse lock ordering between them.\n\n");
1381
1382 printk("\nother info that might help us debug this:\n");
1383 lockdep_print_held_locks(curr);
1384
1385 printk("\nthe first lock's dependencies:\n");
1386 print_lock_dependencies(this->class, 0);
1387
1388 printk("\nthe second lock's dependencies:\n");
1389 print_lock_dependencies(other, 0);
1390
1391 printk("\nstack backtrace:\n");
1392 dump_stack();
1393
1394 return 0;
1395}
1396
1397/*
1398 * Prove that in the forwards-direction subgraph starting at <this>
1399 * there is no lock matching <mask>:
1400 */
1401static int
1402check_usage_forwards(struct task_struct *curr, struct held_lock *this,
1403 enum lock_usage_bit bit, const char *irqclass)
1404{
1405 int ret;
1406
1407 find_usage_bit = bit;
1408 /* fills in <forwards_match> */
1409 ret = find_usage_forwards(this->class, 0);
1410 if (!ret || ret == 1)
1411 return ret;
1412
1413 return print_irq_inversion_bug(curr, forwards_match, this, 1, irqclass);
1414}
1415
1416/*
1417 * Prove that in the backwards-direction subgraph starting at <this>
1418 * there is no lock matching <mask>:
1419 */
1420static int
1421check_usage_backwards(struct task_struct *curr, struct held_lock *this,
1422 enum lock_usage_bit bit, const char *irqclass)
1423{
1424 int ret;
1425
1426 find_usage_bit = bit;
1427 /* fills in <backwards_match> */
1428 ret = find_usage_backwards(this->class, 0);
1429 if (!ret || ret == 1)
1430 return ret;
1431
1432 return print_irq_inversion_bug(curr, backwards_match, this, 0, irqclass);
1433}
1434
1435static inline void print_irqtrace_events(struct task_struct *curr)
1436{
1437 printk("irq event stamp: %u\n", curr->irq_events);
1438 printk("hardirqs last enabled at (%u): ", curr->hardirq_enable_event);
1439 print_ip_sym(curr->hardirq_enable_ip);
1440 printk("hardirqs last disabled at (%u): ", curr->hardirq_disable_event);
1441 print_ip_sym(curr->hardirq_disable_ip);
1442 printk("softirqs last enabled at (%u): ", curr->softirq_enable_event);
1443 print_ip_sym(curr->softirq_enable_ip);
1444 printk("softirqs last disabled at (%u): ", curr->softirq_disable_event);
1445 print_ip_sym(curr->softirq_disable_ip);
1446}
1447
1448#else
1449static inline void print_irqtrace_events(struct task_struct *curr)
1450{
1451}
1452#endif
1453
1454static int
1455print_usage_bug(struct task_struct *curr, struct held_lock *this,
1456 enum lock_usage_bit prev_bit, enum lock_usage_bit new_bit)
1457{
1458 __raw_spin_unlock(&hash_lock);
1459 debug_locks_off();
1460 if (debug_locks_silent)
1461 return 0;
1462
1463 printk("\n=================================\n");
1464 printk( "[ INFO: inconsistent lock state ]\n");
1465 printk( "---------------------------------\n");
1466
1467 printk("inconsistent {%s} -> {%s} usage.\n",
1468 usage_str[prev_bit], usage_str[new_bit]);
1469
1470 printk("%s/%d [HC%u[%lu]:SC%u[%lu]:HE%u:SE%u] takes:\n",
1471 curr->comm, curr->pid,
1472 trace_hardirq_context(curr), hardirq_count() >> HARDIRQ_SHIFT,
1473 trace_softirq_context(curr), softirq_count() >> SOFTIRQ_SHIFT,
1474 trace_hardirqs_enabled(curr),
1475 trace_softirqs_enabled(curr));
1476 print_lock(this);
1477
1478 printk("{%s} state was registered at:\n", usage_str[prev_bit]);
1479 print_stack_trace(this->class->usage_traces + prev_bit, 1);
1480
1481 print_irqtrace_events(curr);
1482 printk("\nother info that might help us debug this:\n");
1483 lockdep_print_held_locks(curr);
1484
1485 printk("\nstack backtrace:\n");
1486 dump_stack();
1487
1488 return 0;
1489}
1490
1491/*
1492 * Print out an error if an invalid bit is set:
1493 */
1494static inline int
1495valid_state(struct task_struct *curr, struct held_lock *this,
1496 enum lock_usage_bit new_bit, enum lock_usage_bit bad_bit)
1497{
1498 if (unlikely(this->class->usage_mask & (1 << bad_bit)))
1499 return print_usage_bug(curr, this, bad_bit, new_bit);
1500 return 1;
1501}
1502
1503#define STRICT_READ_CHECKS 1
1504
1505/*
1506 * Mark a lock with a usage bit, and validate the state transition:
1507 */
1508static int mark_lock(struct task_struct *curr, struct held_lock *this,
1509 enum lock_usage_bit new_bit, unsigned long ip)
1510{
1511 unsigned int new_mask = 1 << new_bit, ret = 1;
1512
1513 /*
1514 * If already set then do not dirty the cacheline,
1515 * nor do any checks:
1516 */
1517 if (likely(this->class->usage_mask & new_mask))
1518 return 1;
1519
1520 __raw_spin_lock(&hash_lock);
1521 /*
1522 * Make sure we didn't race:
1523 */
1524 if (unlikely(this->class->usage_mask & new_mask)) {
1525 __raw_spin_unlock(&hash_lock);
1526 return 1;
1527 }
1528
1529 this->class->usage_mask |= new_mask;
1530
1531#ifdef CONFIG_TRACE_IRQFLAGS
1532 if (new_bit == LOCK_ENABLED_HARDIRQS ||
1533 new_bit == LOCK_ENABLED_HARDIRQS_READ)
1534 ip = curr->hardirq_enable_ip;
1535 else if (new_bit == LOCK_ENABLED_SOFTIRQS ||
1536 new_bit == LOCK_ENABLED_SOFTIRQS_READ)
1537 ip = curr->softirq_enable_ip;
1538#endif
1539 if (!save_trace(this->class->usage_traces + new_bit))
1540 return 0;
1541
1542 switch (new_bit) {
1543#ifdef CONFIG_TRACE_IRQFLAGS
1544 case LOCK_USED_IN_HARDIRQ:
1545 if (!valid_state(curr, this, new_bit, LOCK_ENABLED_HARDIRQS))
1546 return 0;
1547 if (!valid_state(curr, this, new_bit,
1548 LOCK_ENABLED_HARDIRQS_READ))
1549 return 0;
1550 /*
1551 * just marked it hardirq-safe, check that this lock
1552 * took no hardirq-unsafe lock in the past:
1553 */
1554 if (!check_usage_forwards(curr, this,
1555 LOCK_ENABLED_HARDIRQS, "hard"))
1556 return 0;
1557#if STRICT_READ_CHECKS
1558 /*
1559 * just marked it hardirq-safe, check that this lock
1560 * took no hardirq-unsafe-read lock in the past:
1561 */
1562 if (!check_usage_forwards(curr, this,
1563 LOCK_ENABLED_HARDIRQS_READ, "hard-read"))
1564 return 0;
1565#endif
1566 if (hardirq_verbose(this->class))
1567 ret = 2;
1568 break;
1569 case LOCK_USED_IN_SOFTIRQ:
1570 if (!valid_state(curr, this, new_bit, LOCK_ENABLED_SOFTIRQS))
1571 return 0;
1572 if (!valid_state(curr, this, new_bit,
1573 LOCK_ENABLED_SOFTIRQS_READ))
1574 return 0;
1575 /*
1576 * just marked it softirq-safe, check that this lock
1577 * took no softirq-unsafe lock in the past:
1578 */
1579 if (!check_usage_forwards(curr, this,
1580 LOCK_ENABLED_SOFTIRQS, "soft"))
1581 return 0;
1582#if STRICT_READ_CHECKS
1583 /*
1584 * just marked it softirq-safe, check that this lock
1585 * took no softirq-unsafe-read lock in the past:
1586 */
1587 if (!check_usage_forwards(curr, this,
1588 LOCK_ENABLED_SOFTIRQS_READ, "soft-read"))
1589 return 0;
1590#endif
1591 if (softirq_verbose(this->class))
1592 ret = 2;
1593 break;
1594 case LOCK_USED_IN_HARDIRQ_READ:
1595 if (!valid_state(curr, this, new_bit, LOCK_ENABLED_HARDIRQS))
1596 return 0;
1597 /*
1598 * just marked it hardirq-read-safe, check that this lock
1599 * took no hardirq-unsafe lock in the past:
1600 */
1601 if (!check_usage_forwards(curr, this,
1602 LOCK_ENABLED_HARDIRQS, "hard"))
1603 return 0;
1604 if (hardirq_verbose(this->class))
1605 ret = 2;
1606 break;
1607 case LOCK_USED_IN_SOFTIRQ_READ:
1608 if (!valid_state(curr, this, new_bit, LOCK_ENABLED_SOFTIRQS))
1609 return 0;
1610 /*
1611 * just marked it softirq-read-safe, check that this lock
1612 * took no softirq-unsafe lock in the past:
1613 */
1614 if (!check_usage_forwards(curr, this,
1615 LOCK_ENABLED_SOFTIRQS, "soft"))
1616 return 0;
1617 if (softirq_verbose(this->class))
1618 ret = 2;
1619 break;
1620 case LOCK_ENABLED_HARDIRQS:
1621 if (!valid_state(curr, this, new_bit, LOCK_USED_IN_HARDIRQ))
1622 return 0;
1623 if (!valid_state(curr, this, new_bit,
1624 LOCK_USED_IN_HARDIRQ_READ))
1625 return 0;
1626 /*
1627 * just marked it hardirq-unsafe, check that no hardirq-safe
1628 * lock in the system ever took it in the past:
1629 */
1630 if (!check_usage_backwards(curr, this,
1631 LOCK_USED_IN_HARDIRQ, "hard"))
1632 return 0;
1633#if STRICT_READ_CHECKS
1634 /*
1635 * just marked it hardirq-unsafe, check that no
1636 * hardirq-safe-read lock in the system ever took
1637 * it in the past:
1638 */
1639 if (!check_usage_backwards(curr, this,
1640 LOCK_USED_IN_HARDIRQ_READ, "hard-read"))
1641 return 0;
1642#endif
1643 if (hardirq_verbose(this->class))
1644 ret = 2;
1645 break;
1646 case LOCK_ENABLED_SOFTIRQS:
1647 if (!valid_state(curr, this, new_bit, LOCK_USED_IN_SOFTIRQ))
1648 return 0;
1649 if (!valid_state(curr, this, new_bit,
1650 LOCK_USED_IN_SOFTIRQ_READ))
1651 return 0;
1652 /*
1653 * just marked it softirq-unsafe, check that no softirq-safe
1654 * lock in the system ever took it in the past:
1655 */
1656 if (!check_usage_backwards(curr, this,
1657 LOCK_USED_IN_SOFTIRQ, "soft"))
1658 return 0;
1659#if STRICT_READ_CHECKS
1660 /*
1661 * just marked it softirq-unsafe, check that no
1662 * softirq-safe-read lock in the system ever took
1663 * it in the past:
1664 */
1665 if (!check_usage_backwards(curr, this,
1666 LOCK_USED_IN_SOFTIRQ_READ, "soft-read"))
1667 return 0;
1668#endif
1669 if (softirq_verbose(this->class))
1670 ret = 2;
1671 break;
1672 case LOCK_ENABLED_HARDIRQS_READ:
1673 if (!valid_state(curr, this, new_bit, LOCK_USED_IN_HARDIRQ))
1674 return 0;
1675#if STRICT_READ_CHECKS
1676 /*
1677 * just marked it hardirq-read-unsafe, check that no
1678 * hardirq-safe lock in the system ever took it in the past:
1679 */
1680 if (!check_usage_backwards(curr, this,
1681 LOCK_USED_IN_HARDIRQ, "hard"))
1682 return 0;
1683#endif
1684 if (hardirq_verbose(this->class))
1685 ret = 2;
1686 break;
1687 case LOCK_ENABLED_SOFTIRQS_READ:
1688 if (!valid_state(curr, this, new_bit, LOCK_USED_IN_SOFTIRQ))
1689 return 0;
1690#if STRICT_READ_CHECKS
1691 /*
1692 * just marked it softirq-read-unsafe, check that no
1693 * softirq-safe lock in the system ever took it in the past:
1694 */
1695 if (!check_usage_backwards(curr, this,
1696 LOCK_USED_IN_SOFTIRQ, "soft"))
1697 return 0;
1698#endif
1699 if (softirq_verbose(this->class))
1700 ret = 2;
1701 break;
1702#endif
1703 case LOCK_USED:
1704 /*
1705 * Add it to the global list of classes:
1706 */
1707 list_add_tail_rcu(&this->class->lock_entry, &all_lock_classes);
1708 debug_atomic_dec(&nr_unused_locks);
1709 break;
1710 default:
1711 debug_locks_off();
1712 WARN_ON(1);
1713 return 0;
1714 }
1715
1716 __raw_spin_unlock(&hash_lock);
1717
1718 /*
1719 * We must printk outside of the hash_lock:
1720 */
1721 if (ret == 2) {
1722 printk("\nmarked lock as {%s}:\n", usage_str[new_bit]);
1723 print_lock(this);
1724 print_irqtrace_events(curr);
1725 dump_stack();
1726 }
1727
1728 return ret;
1729}
1730
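Stripped of the read variants and the verbose plumbing, mark_lock() is a per-class bitmask plus a table of mutually exclusive bits: a class must never be both 'used in hardirq context' and 'taken with hardirqs enabled'. A minimal model of that state machine (names invented, only four of the real bits modelled):

    #include <stdio.h>

    enum usage_bit {
        USED_IN_HARDIRQ,
        ENABLED_HARDIRQS,
        USED_IN_SOFTIRQ,
        ENABLED_SOFTIRQS,
    };

    struct toy_class {
        const char *name;
        unsigned int usage_mask;
    };

    /* Each new bit has one "exclusive" bit it must never coexist with. */
    static const enum usage_bit excludes[] = {
        [USED_IN_HARDIRQ]  = ENABLED_HARDIRQS,
        [ENABLED_HARDIRQS] = USED_IN_HARDIRQ,
        [USED_IN_SOFTIRQ]  = ENABLED_SOFTIRQS,
        [ENABLED_SOFTIRQS] = USED_IN_SOFTIRQ,
    };

    static int mark_usage(struct toy_class *class, enum usage_bit bit)
    {
        if (class->usage_mask & (1u << bit))
            return 1;                       /* already marked, nothing to do */

        if (class->usage_mask & (1u << excludes[bit])) {
            printf("inconsistent usage of %s\n", class->name);
            return 0;                       /* would mix irq-safe and irq-unsafe */
        }
        class->usage_mask |= 1u << bit;
        return 1;
    }

    int main(void)
    {
        struct toy_class c = { .name = "toy_lock" };

        mark_usage(&c, USED_IN_HARDIRQ);    /* taken inside a hardirq handler */
        mark_usage(&c, ENABLED_SOFTIRQS);   /* also taken with softirqs on: ok */
        mark_usage(&c, ENABLED_HARDIRQS);   /* taken with hardirqs on: flagged */
        return 0;
    }

The real code additionally walks the dependency graph (check_usage_forwards/backwards), so the conflict is caught even when the two usages land on different classes connected by dependencies.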
1731#ifdef CONFIG_TRACE_IRQFLAGS
1732/*
1733 * Mark all held locks with a usage bit:
1734 */
1735static int
1736mark_held_locks(struct task_struct *curr, int hardirq, unsigned long ip)
1737{
1738 enum lock_usage_bit usage_bit;
1739 struct held_lock *hlock;
1740 int i;
1741
1742 for (i = 0; i < curr->lockdep_depth; i++) {
1743 hlock = curr->held_locks + i;
1744
1745 if (hardirq) {
1746 if (hlock->read)
1747 usage_bit = LOCK_ENABLED_HARDIRQS_READ;
1748 else
1749 usage_bit = LOCK_ENABLED_HARDIRQS;
1750 } else {
1751 if (hlock->read)
1752 usage_bit = LOCK_ENABLED_SOFTIRQS_READ;
1753 else
1754 usage_bit = LOCK_ENABLED_SOFTIRQS;
1755 }
1756 if (!mark_lock(curr, hlock, usage_bit, ip))
1757 return 0;
1758 }
1759
1760 return 1;
1761}
1762
1763/*
1764 * Debugging helper: via this flag we know that we are in
1765 * 'early bootup code', and will warn about any invalid irqs-on event:
1766 */
1767static int early_boot_irqs_enabled;
1768
1769void early_boot_irqs_off(void)
1770{
1771 early_boot_irqs_enabled = 0;
1772}
1773
1774void early_boot_irqs_on(void)
1775{
1776 early_boot_irqs_enabled = 1;
1777}
1778
1779/*
1780 * Hardirqs will be enabled:
1781 */
1782void trace_hardirqs_on(void)
1783{
1784 struct task_struct *curr = current;
1785 unsigned long ip;
1786
1787 if (unlikely(!debug_locks || current->lockdep_recursion))
1788 return;
1789
1790 if (DEBUG_LOCKS_WARN_ON(unlikely(!early_boot_irqs_enabled)))
1791 return;
1792
1793 if (unlikely(curr->hardirqs_enabled)) {
1794 debug_atomic_inc(&redundant_hardirqs_on);
1795 return;
1796 }
1797 /* we'll do an OFF -> ON transition: */
1798 curr->hardirqs_enabled = 1;
1799 ip = (unsigned long) __builtin_return_address(0);
1800
1801 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
1802 return;
1803 if (DEBUG_LOCKS_WARN_ON(current->hardirq_context))
1804 return;
1805 /*
1806 * We are going to turn hardirqs on, so set the
1807 * usage bit for all held locks:
1808 */
1809 if (!mark_held_locks(curr, 1, ip))
1810 return;
1811 /*
1812 * If we have softirqs enabled, then set the usage
1813 * bit for all held locks. (disabled hardirqs prevented
1814 * this bit from being set before)
1815 */
1816 if (curr->softirqs_enabled)
1817 if (!mark_held_locks(curr, 0, ip))
1818 return;
1819
1820 curr->hardirq_enable_ip = ip;
1821 curr->hardirq_enable_event = ++curr->irq_events;
1822 debug_atomic_inc(&hardirqs_on_events);
1823}
1824
1825EXPORT_SYMBOL(trace_hardirqs_on);
1826
1827/*
1828 * Hardirqs were disabled:
1829 */
1830void trace_hardirqs_off(void)
1831{
1832 struct task_struct *curr = current;
1833
1834 if (unlikely(!debug_locks || current->lockdep_recursion))
1835 return;
1836
1837 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
1838 return;
1839
1840 if (curr->hardirqs_enabled) {
1841 /*
1842 * We have done an ON -> OFF transition:
1843 */
1844 curr->hardirqs_enabled = 0;
1845 curr->hardirq_disable_ip = _RET_IP_;
1846 curr->hardirq_disable_event = ++curr->irq_events;
1847 debug_atomic_inc(&hardirqs_off_events);
1848 } else
1849 debug_atomic_inc(&redundant_hardirqs_off);
1850}
1851
1852EXPORT_SYMBOL(trace_hardirqs_off);
1853
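The ordering constraints implied by the DEBUG_LOCKS_WARN_ON(!irqs_disabled()) checks above are easy to miss: the 'on' hook must run just before interrupts are really enabled, and the 'off' hook just after they are really disabled. The actual wiring lives in the irqflags/arch headers, not in this file; the standalone model below only demonstrates the ordering, with made-up names:

    #include <assert.h>
    #include <stdio.h>

    static int hw_irqs_enabled;        /* what the "hardware" flag says  */
    static int traced_irqs_enabled;    /* what the validator believes    */

    static void trace_hardirqs_on_model(void)
    {
        assert(!hw_irqs_enabled);      /* must still be disabled when tracing "on" */
        traced_irqs_enabled = 1;
    }

    static void trace_hardirqs_off_model(void)
    {
        assert(!hw_irqs_enabled);      /* already switched off by the caller */
        traced_irqs_enabled = 0;
    }

    static void local_irq_enable_model(void)
    {
        trace_hardirqs_on_model();     /* trace first, while still off */
        hw_irqs_enabled = 1;
    }

    static void local_irq_disable_model(void)
    {
        hw_irqs_enabled = 0;           /* switch off first, then trace */
        trace_hardirqs_off_model();
    }

    int main(void)
    {
        local_irq_disable_model();
        local_irq_enable_model();
        printf("traced state: %d\n", traced_irqs_enabled);
        return 0;
    }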
1854/*
1855 * Softirqs will be enabled:
1856 */
1857void trace_softirqs_on(unsigned long ip)
1858{
1859 struct task_struct *curr = current;
1860
1861 if (unlikely(!debug_locks))
1862 return;
1863
1864 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
1865 return;
1866
1867 if (curr->softirqs_enabled) {
1868 debug_atomic_inc(&redundant_softirqs_on);
1869 return;
1870 }
1871
1872 /*
1873 * We'll do an OFF -> ON transition:
1874 */
1875 curr->softirqs_enabled = 1;
1876 curr->softirq_enable_ip = ip;
1877 curr->softirq_enable_event = ++curr->irq_events;
1878 debug_atomic_inc(&softirqs_on_events);
1879 /*
1880 * We are going to turn softirqs on, so set the
1881 * usage bit for all held locks, if hardirqs are
1882 * enabled too:
1883 */
1884 if (curr->hardirqs_enabled)
1885 mark_held_locks(curr, 0, ip);
1886}
1887
1888/*
1889 * Softirqs were disabled:
1890 */
1891void trace_softirqs_off(unsigned long ip)
1892{
1893 struct task_struct *curr = current;
1894
1895 if (unlikely(!debug_locks))
1896 return;
1897
1898 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
1899 return;
1900
1901 if (curr->softirqs_enabled) {
1902 /*
1903 * We have done an ON -> OFF transition:
1904 */
1905 curr->softirqs_enabled = 0;
1906 curr->softirq_disable_ip = ip;
1907 curr->softirq_disable_event = ++curr->irq_events;
1908 debug_atomic_inc(&softirqs_off_events);
1909 DEBUG_LOCKS_WARN_ON(!softirq_count());
1910 } else
1911 debug_atomic_inc(&redundant_softirqs_off);
1912}
1913
1914#endif
1915
1916/*
1917 * Initialize a lock instance's lock-class mapping info:
1918 */
1919void lockdep_init_map(struct lockdep_map *lock, const char *name,
1920 struct lock_class_key *key)
1921{
1922 if (unlikely(!debug_locks))
1923 return;
1924
1925 if (DEBUG_LOCKS_WARN_ON(!key))
1926 return;
1927 if (DEBUG_LOCKS_WARN_ON(!name))
1928 return;
1929 /*
1930 * Sanity check, the lock-class key must be persistent:
1931 */
1932 if (!static_obj(key)) {
1933 printk("BUG: key %p not in .data!\n", key);
1934 DEBUG_LOCKS_WARN_ON(1);
1935 return;
1936 }
1937 lock->name = name;
1938 lock->key = key;
1939 lock->class_cache = NULL;
1940}
1941
1942EXPORT_SYMBOL_GPL(lockdep_init_map);
1943
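For orientation: lockdep_init_map() is not called by end users directly. A lock type embeds a struct lockdep_map, and its init macro passes a static struct lock_class_key, which is exactly what the static_obj() sanity check above insists on. A hypothetical lock type showing the shape of that glue (toy_lock and toy_lock_init are invented; the real spinlock/mutex wiring lives in their own headers):

    #include <linux/lockdep.h>

    /* Hypothetical lock type embedding a lockdep map. */
    struct toy_lock {
        int                  locked;
        struct lockdep_map   dep_map;
    };

    /*
     * The key must have static storage duration; declaring it inside the
     * init macro gives every init call site its own lock class.
     */
    #define toy_lock_init(l)                                        \
        do {                                                        \
            static struct lock_class_key __key;                     \
                                                                    \
            (l)->locked = 0;                                        \
            lockdep_init_map(&(l)->dep_map, #l, &__key);            \
        } while (0)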
1944/*
1945 * This gets called for every mutex_lock*()/spin_lock*() operation.
1946 * We maintain the dependency maps and validate the locking attempt:
1947 */
1948static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
1949 int trylock, int read, int check, int hardirqs_off,
1950 unsigned long ip)
1951{
1952 struct task_struct *curr = current;
1953 struct lock_class *class = NULL;
1954 struct held_lock *hlock;
1955 unsigned int depth, id;
1956 int chain_head = 0;
1957 u64 chain_key;
1958
1959 if (unlikely(!debug_locks))
1960 return 0;
1961
1962 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
1963 return 0;
1964
1965 if (unlikely(subclass >= MAX_LOCKDEP_SUBCLASSES)) {
1966 debug_locks_off();
1967 printk("BUG: MAX_LOCKDEP_SUBCLASSES too low!\n");
1968 printk("turning off the locking correctness validator.\n");
1969 return 0;
1970 }
1971
1972 if (!subclass)
1973 class = lock->class_cache;
1974 /*
1975 * Not cached yet or subclass?
1976 */
1977 if (unlikely(!class)) {
1978 class = register_lock_class(lock, subclass);
1979 if (!class)
1980 return 0;
1981 }
1982 debug_atomic_inc((atomic_t *)&class->ops);
1983 if (very_verbose(class)) {
1984 printk("\nacquire class [%p] %s", class->key, class->name);
1985 if (class->name_version > 1)
1986 printk("#%d", class->name_version);
1987 printk("\n");
1988 dump_stack();
1989 }
1990
1991 /*
1992 * Add the lock to the list of currently held locks.
1993 * (we don't increase the depth just yet, up until the
1994 * dependency checks are done)
1995 */
1996 depth = curr->lockdep_depth;
1997 if (DEBUG_LOCKS_WARN_ON(depth >= MAX_LOCK_DEPTH))
1998 return 0;
1999
2000 hlock = curr->held_locks + depth;
2001
2002 hlock->class = class;
2003 hlock->acquire_ip = ip;
2004 hlock->instance = lock;
2005 hlock->trylock = trylock;
2006 hlock->read = read;
2007 hlock->check = check;
2008 hlock->hardirqs_off = hardirqs_off;
2009
2010 if (check != 2)
2011 goto out_calc_hash;
2012#ifdef CONFIG_TRACE_IRQFLAGS
2013 /*
2014 * If non-trylock use in a hardirq or softirq context, then
2015 * mark the lock as used in these contexts:
2016 */
2017 if (!trylock) {
2018 if (read) {
2019 if (curr->hardirq_context)
2020 if (!mark_lock(curr, hlock,
2021 LOCK_USED_IN_HARDIRQ_READ, ip))
2022 return 0;
2023 if (curr->softirq_context)
2024 if (!mark_lock(curr, hlock,
2025 LOCK_USED_IN_SOFTIRQ_READ, ip))
2026 return 0;
2027 } else {
2028 if (curr->hardirq_context)
2029 if (!mark_lock(curr, hlock, LOCK_USED_IN_HARDIRQ, ip))
2030 return 0;
2031 if (curr->softirq_context)
2032 if (!mark_lock(curr, hlock, LOCK_USED_IN_SOFTIRQ, ip))
2033 return 0;
2034 }
2035 }
2036 if (!hardirqs_off) {
2037 if (read) {
2038 if (!mark_lock(curr, hlock,
2039 LOCK_ENABLED_HARDIRQS_READ, ip))
2040 return 0;
2041 if (curr->softirqs_enabled)
2042 if (!mark_lock(curr, hlock,
2043 LOCK_ENABLED_SOFTIRQS_READ, ip))
2044 return 0;
2045 } else {
2046 if (!mark_lock(curr, hlock,
2047 LOCK_ENABLED_HARDIRQS, ip))
2048 return 0;
2049 if (curr->softirqs_enabled)
2050 if (!mark_lock(curr, hlock,
2051 LOCK_ENABLED_SOFTIRQS, ip))
2052 return 0;
2053 }
2054 }
2055#endif
2056 /* mark it as used: */
2057 if (!mark_lock(curr, hlock, LOCK_USED, ip))
2058 return 0;
2059out_calc_hash:
2060 /*
2061 * Calculate the chain hash: it's the combined hash of all the
2062 * lock keys along the dependency chain. We save the hash value
2063 * at every step so that we can get the current hash easily
2064 * after unlock. The chain hash is then used to cache dependency
2065 * results.
2066 *
2067 * The 'key ID' is the most compact key value that can drive
2068 * the hash - we use it instead of class->key.
2069 */
2070 id = class - lock_classes;
2071 if (DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS))
2072 return 0;
2073
2074 chain_key = curr->curr_chain_key;
2075 if (!depth) {
2076 if (DEBUG_LOCKS_WARN_ON(chain_key != 0))
2077 return 0;
2078 chain_head = 1;
2079 }
2080
2081 hlock->prev_chain_key = chain_key;
2082
2083#ifdef CONFIG_TRACE_IRQFLAGS
2084 /*
2085 * Keep track of points where we cross into an interrupt context:
2086 */
2087 hlock->irq_context = 2*(curr->hardirq_context ? 1 : 0) +
2088 curr->softirq_context;
2089 if (depth) {
2090 struct held_lock *prev_hlock;
2091
2092 prev_hlock = curr->held_locks + depth-1;
2093 /*
2094 * If we cross into another context, reset the
2095 * hash key (this also prevents the checking and the
2096 * adding of the dependency to 'prev'):
2097 */
2098 if (prev_hlock->irq_context != hlock->irq_context) {
2099 chain_key = 0;
2100 chain_head = 1;
2101 }
2102 }
2103#endif
2104 chain_key = iterate_chain_key(chain_key, id);
2105 curr->curr_chain_key = chain_key;
2106
2107 /*
2108 * Trylock needs to maintain the stack of held locks, but it
2109 * does not add new dependencies, because trylock can be done
2110 * in any order.
2111 *
2112 * We look up the chain_key and do the O(N^2) check and update of
2113 * the dependencies only if this is a new dependency chain.
2114 * (If lookup_chain_cache() returns with 1 it acquires
2115 * hash_lock for us)
2116 */
2117 if (!trylock && (check == 2) && lookup_chain_cache(chain_key)) {
2118 /*
2119 * Check whether last held lock:
2120 *
2121 * - is irq-safe, if this lock is irq-unsafe
2122 * - is softirq-safe, if this lock is hardirq-unsafe
2123 *
2124 * And check whether the new lock's dependency graph
2125 * could lead back to the previous lock.
2126 *
2127 * any of these scenarios could lead to a deadlock. If all
2128 * validations pass we go on and add the new dependencies.
2129 */
2130 int ret = check_deadlock(curr, hlock, lock, read);
2131
2132 if (!ret)
2133 return 0;
2134 /*
2135 * Mark recursive read, as we jump over it when
2136 * building dependencies (just like we jump over
2137 * trylock entries):
2138 */
2139 if (ret == 2)
2140 hlock->read = 2;
2141 /*
2142 * Add dependency only if this lock is not the head
2143 * of the chain, and if it's not a secondary read-lock:
2144 */
2145 if (!chain_head && ret != 2)
2146 if (!check_prevs_add(curr, hlock))
2147 return 0;
2148 __raw_spin_unlock(&hash_lock);
2149 }
2150 curr->lockdep_depth++;
2151 check_chain_key(curr);
2152 if (unlikely(curr->lockdep_depth >= MAX_LOCK_DEPTH)) {
2153 debug_locks_off();
2154 printk("BUG: MAX_LOCK_DEPTH too low!\n");
2155 printk("turning off the locking correctness validator.\n");
2156 return 0;
2157 }
2158 if (unlikely(curr->lockdep_depth > max_lockdep_depth))
2159 max_lockdep_depth = curr->lockdep_depth;
2160
2161 return 1;
2162}
2163
2164static int
2165print_unlock_inbalance_bug(struct task_struct *curr, struct lockdep_map *lock,
2166 unsigned long ip)
2167{
2168 if (!debug_locks_off())
2169 return 0;
2170 if (debug_locks_silent)
2171 return 0;
2172
2173 printk("\n=====================================\n");
2174 printk( "[ BUG: bad unlock balance detected! ]\n");
2175 printk( "-------------------------------------\n");
2176 printk("%s/%d is trying to release lock (",
2177 curr->comm, curr->pid);
2178 print_lockdep_cache(lock);
2179 printk(") at:\n");
2180 print_ip_sym(ip);
2181 printk("but there are no more locks to release!\n");
2182 printk("\nother info that might help us debug this:\n");
2183 lockdep_print_held_locks(curr);
2184
2185 printk("\nstack backtrace:\n");
2186 dump_stack();
2187
2188 return 0;
2189}
2190
2191/*
2192 * Common debugging checks for both nested and non-nested unlock:
2193 */
2194static int check_unlock(struct task_struct *curr, struct lockdep_map *lock,
2195 unsigned long ip)
2196{
2197 if (unlikely(!debug_locks))
2198 return 0;
2199 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
2200 return 0;
2201
2202 if (curr->lockdep_depth <= 0)
2203 return print_unlock_inbalance_bug(curr, lock, ip);
2204
2205 return 1;
2206}
2207
2208/*
2209 * Remove the lock from the list of currently held locks in a
2210 * potentially non-nested (out of order) manner. This is a
2211 * relatively rare operation, as all the unlock APIs default
2212 * to nested mode (which uses lock_release()):
2213 */
2214static int
2215lock_release_non_nested(struct task_struct *curr,
2216 struct lockdep_map *lock, unsigned long ip)
2217{
2218 struct held_lock *hlock, *prev_hlock;
2219 unsigned int depth;
2220 int i;
2221
2222 /*
2223 * Check whether the lock exists in the current stack
2224 * of held locks:
2225 */
2226 depth = curr->lockdep_depth;
2227 if (DEBUG_LOCKS_WARN_ON(!depth))
2228 return 0;
2229
2230 prev_hlock = NULL;
2231 for (i = depth-1; i >= 0; i--) {
2232 hlock = curr->held_locks + i;
2233 /*
2234 * We must not cross into another context:
2235 */
2236 if (prev_hlock && prev_hlock->irq_context != hlock->irq_context)
2237 break;
2238 if (hlock->instance == lock)
2239 goto found_it;
2240 prev_hlock = hlock;
2241 }
2242 return print_unlock_inbalance_bug(curr, lock, ip);
2243
2244found_it:
2245 /*
2246 * We have the right lock to unlock, 'hlock' points to it.
2247 * Now we remove it from the stack, and add back the other
2248 * entries (if any), recalculating the hash along the way:
2249 */
2250 curr->lockdep_depth = i;
2251 curr->curr_chain_key = hlock->prev_chain_key;
2252
2253 for (i++; i < depth; i++) {
2254 hlock = curr->held_locks + i;
2255 if (!__lock_acquire(hlock->instance,
2256 hlock->class->subclass, hlock->trylock,
2257 hlock->read, hlock->check, hlock->hardirqs_off,
2258 hlock->acquire_ip))
2259 return 0;
2260 }
2261
2262 if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth - 1))
2263 return 0;
2264 return 1;
2265}
2266
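The interesting step in lock_release_non_nested() is the stack repair: the released entry is dropped and everything that was stacked above it is pushed again through __lock_acquire(), which also rebuilds the incremental chain key. A tiny array-based model of that 'remove from the middle, replay the rest' operation (no chain key, no validation):

    #include <stdio.h>

    #define MAX_DEPTH 8

    static int held[MAX_DEPTH];     /* toy held-lock stack of lock ids */
    static int depth;

    static void acquire(int id)
    {
        held[depth++] = id;
    }

    /* Release 'id' even if it is not the top of the stack. */
    static int release(int id)
    {
        int saved[MAX_DEPTH];
        int i, above, j;

        for (i = depth - 1; i >= 0; i--)
            if (held[i] == id)
                break;
        if (i < 0)
            return 0;                       /* unlock imbalance */

        /* Remember the entries stacked above the one being released. */
        above = depth - (i + 1);
        for (j = 0; j < above; j++)
            saved[j] = held[i + 1 + j];

        /* Truncate to just below it, then replay the rest on top. */
        depth = i;
        for (j = 0; j < above; j++)
            acquire(saved[j]);
        return 1;
    }

    int main(void)
    {
        acquire(1);
        acquire(2);
        acquire(3);
        release(2);                         /* out-of-order release */
        printf("depth %d, top %d\n", depth, held[depth - 1]);  /* depth 2, top 3 */
        return 0;
    }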
2267/*
2268 * Remove the lock from the list of currently held locks - this gets
2269 * called on mutex_unlock()/spin_unlock*() (or on a failed
2270 * mutex_lock_interruptible()). This is done for unlocks that nest
2271 * perfectly. (i.e. the current top of the lock-stack is unlocked)
2272 */
2273static int lock_release_nested(struct task_struct *curr,
2274 struct lockdep_map *lock, unsigned long ip)
2275{
2276 struct held_lock *hlock;
2277 unsigned int depth;
2278
2279 /*
2280 * Pop off the top of the lock stack:
2281 */
2282 depth = curr->lockdep_depth - 1;
2283 hlock = curr->held_locks + depth;
2284
2285 /*
2286 * Is the unlock non-nested:
2287 */
2288 if (hlock->instance != lock)
2289 return lock_release_non_nested(curr, lock, ip);
2290 curr->lockdep_depth--;
2291
2292 if (DEBUG_LOCKS_WARN_ON(!depth && (hlock->prev_chain_key != 0)))
2293 return 0;
2294
2295 curr->curr_chain_key = hlock->prev_chain_key;
2296
2297#ifdef CONFIG_DEBUG_LOCKDEP
2298 hlock->prev_chain_key = 0;
2299 hlock->class = NULL;
2300 hlock->acquire_ip = 0;
2301 hlock->irq_context = 0;
2302#endif
2303 return 1;
2304}
2305
2306/*
2307 * Remove the lock from the list of currently held locks - this gets
2308 * called on mutex_unlock()/spin_unlock*() (or on a failed
2309 * mutex_lock_interruptible()). The 'nested' argument selects between
2310 * the common nested (in-order) unlock and the rarer non-nested variant.
2311 */
2312static void
2313__lock_release(struct lockdep_map *lock, int nested, unsigned long ip)
2314{
2315 struct task_struct *curr = current;
2316
2317 if (!check_unlock(curr, lock, ip))
2318 return;
2319
2320 if (nested) {
2321 if (!lock_release_nested(curr, lock, ip))
2322 return;
2323 } else {
2324 if (!lock_release_non_nested(curr, lock, ip))
2325 return;
2326 }
2327
2328 check_chain_key(curr);
2329}
2330
2331/*
2332 * Check whether we follow the irq-flags state precisely:
2333 */
2334static void check_flags(unsigned long flags)
2335{
2336#if defined(CONFIG_DEBUG_LOCKDEP) && defined(CONFIG_TRACE_IRQFLAGS)
2337 if (!debug_locks)
2338 return;
2339
2340 if (irqs_disabled_flags(flags))
2341 DEBUG_LOCKS_WARN_ON(current->hardirqs_enabled);
2342 else
2343 DEBUG_LOCKS_WARN_ON(!current->hardirqs_enabled);
2344
2345 /*
2346 * We don't accurately track softirq state in e.g.
2347 * hardirq contexts (such as on 4KSTACKS), so only
2348 * check if not in hardirq contexts:
2349 */
2350 if (!hardirq_count()) {
2351 if (softirq_count())
2352 DEBUG_LOCKS_WARN_ON(current->softirqs_enabled);
2353 else
2354 DEBUG_LOCKS_WARN_ON(!current->softirqs_enabled);
2355 }
2356
2357 if (!debug_locks)
2358 print_irqtrace_events(current);
2359#endif
2360}
2361
2362/*
2363 * We are not always called with irqs disabled - do that here,
2364 * and also avoid lockdep recursion:
2365 */
2366void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
2367 int trylock, int read, int check, unsigned long ip)
2368{
2369 unsigned long flags;
2370
2371 if (unlikely(current->lockdep_recursion))
2372 return;
2373
2374 raw_local_irq_save(flags);
2375 check_flags(flags);
2376
2377 current->lockdep_recursion = 1;
2378 __lock_acquire(lock, subclass, trylock, read, check,
2379 irqs_disabled_flags(flags), ip);
2380 current->lockdep_recursion = 0;
2381 raw_local_irq_restore(flags);
2382}
2383
2384EXPORT_SYMBOL_GPL(lock_acquire);
2385
2386void lock_release(struct lockdep_map *lock, int nested, unsigned long ip)
2387{
2388 unsigned long flags;
2389
2390 if (unlikely(current->lockdep_recursion))
2391 return;
2392
2393 raw_local_irq_save(flags);
2394 check_flags(flags);
2395 current->lockdep_recursion = 1;
2396 __lock_release(lock, nested, ip);
2397 current->lockdep_recursion = 0;
2398 raw_local_irq_restore(flags);
2399}
2400
2401EXPORT_SYMBOL_GPL(lock_release);
2402
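To place the two exported hooks: a locking primitive calls lock_acquire() right before it really takes the lock and lock_release() when it drops it, passing its embedded lockdep_map. A hypothetical wrapper sketch, reusing the invented toy_lock from the earlier sketch and a stand-in arch_really_* pair (the real spin_acquire()/mutex_acquire() glue macros differ in detail):

    #include <linux/lockdep.h>

    static inline void toy_lock_lock(struct toy_lock *l)
    {
        /* subclass 0, not a trylock, not a read lock, full checks (check=2) */
        lock_acquire(&l->dep_map, 0, 0, 0, 2, _RET_IP_);
        arch_really_take_the_lock(l);           /* stand-in for the actual acquisition */
    }

    static inline void toy_lock_unlock(struct toy_lock *l)
    {
        lock_release(&l->dep_map, 1, _RET_IP_); /* nested, i.e. in-order, unlock */
        arch_really_drop_the_lock(l);           /* stand-in */
    }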
2403/*
2404 * Used by the testsuite, sanitize the validator state
2405 * after a simulated failure:
2406 */
2407
2408void lockdep_reset(void)
2409{
2410 unsigned long flags;
2411
2412 raw_local_irq_save(flags);
2413 current->curr_chain_key = 0;
2414 current->lockdep_depth = 0;
2415 current->lockdep_recursion = 0;
2416 memset(current->held_locks, 0, MAX_LOCK_DEPTH*sizeof(struct held_lock));
2417 nr_hardirq_chains = 0;
2418 nr_softirq_chains = 0;
2419 nr_process_chains = 0;
2420 debug_locks = 1;
2421 raw_local_irq_restore(flags);
2422}
2423
2424static void zap_class(struct lock_class *class)
2425{
2426 int i;
2427
2428 /*
2429 * Remove all dependencies this lock is
2430 * involved in:
2431 */
2432 for (i = 0; i < nr_list_entries; i++) {
2433 if (list_entries[i].class == class)
2434 list_del_rcu(&list_entries[i].entry);
2435 }
2436 /*
2437 * Unhash the class and remove it from the all_lock_classes list:
2438 */
2439 list_del_rcu(&class->hash_entry);
2440 list_del_rcu(&class->lock_entry);
2441
2442}
2443
2444static inline int within(void *addr, void *start, unsigned long size)
2445{
2446 return addr >= start && addr < start + size;
2447}
2448
2449void lockdep_free_key_range(void *start, unsigned long size)
2450{
2451 struct lock_class *class, *next;
2452 struct list_head *head;
2453 unsigned long flags;
2454 int i;
2455
2456 raw_local_irq_save(flags);
2457 __raw_spin_lock(&hash_lock);
2458
2459 /*
2460 * Unhash all classes that were created by this module:
2461 */
2462 for (i = 0; i < CLASSHASH_SIZE; i++) {
2463 head = classhash_table + i;
2464 if (list_empty(head))
2465 continue;
2466 list_for_each_entry_safe(class, next, head, hash_entry)
2467 if (within(class->key, start, size))
2468 zap_class(class);
2469 }
2470
2471 __raw_spin_unlock(&hash_lock);
2472 raw_local_irq_restore(flags);
2473}
2474
2475void lockdep_reset_lock(struct lockdep_map *lock)
2476{
2477 struct lock_class *class, *next;
2478 struct list_head *head;
2479 unsigned long flags;
2480 int i, j;
2481
2482 raw_local_irq_save(flags);
2483
2484 /*
2485 * Remove all classes this lock might have:
2486 */
2487 for (j = 0; j < MAX_LOCKDEP_SUBCLASSES; j++) {
2488 /*
2489 * If the class exists we look it up and zap it:
2490 */
2491 class = look_up_lock_class(lock, j);
2492 if (class)
2493 zap_class(class);
2494 }
2495 /*
2496 * Debug check: in the end all mapped classes should
2497 * be gone.
2498 */
2499 __raw_spin_lock(&hash_lock);
2500 for (i = 0; i < CLASSHASH_SIZE; i++) {
2501 head = classhash_table + i;
2502 if (list_empty(head))
2503 continue;
2504 list_for_each_entry_safe(class, next, head, hash_entry) {
2505 if (unlikely(class == lock->class_cache)) {
2506 __raw_spin_unlock(&hash_lock);
2507 DEBUG_LOCKS_WARN_ON(1);
2508 goto out_restore;
2509 }
2510 }
2511 }
2512 __raw_spin_unlock(&hash_lock);
2513
2514out_restore:
2515 raw_local_irq_restore(flags);
2516}
2517
2518void __init lockdep_init(void)
2519{
2520 int i;
2521
2522 /*
2523 * Some architectures have their own start_kernel()
2524 * code which calls lockdep_init(), while we also
2525 * call lockdep_init() from the start_kernel() itself,
2526 * and we want to initialize the hashes only once:
2527 */
2528 if (lockdep_initialized)
2529 return;
2530
2531 for (i = 0; i < CLASSHASH_SIZE; i++)
2532 INIT_LIST_HEAD(classhash_table + i);
2533
2534 for (i = 0; i < CHAINHASH_SIZE; i++)
2535 INIT_LIST_HEAD(chainhash_table + i);
2536
2537 lockdep_initialized = 1;
2538}
2539
2540void __init lockdep_info(void)
2541{
2542 printk("Lock dependency validator: Copyright (c) 2006 Red Hat, Inc., Ingo Molnar\n");
2543
2544 printk("... MAX_LOCKDEP_SUBCLASSES: %lu\n", MAX_LOCKDEP_SUBCLASSES);
2545 printk("... MAX_LOCK_DEPTH: %lu\n", MAX_LOCK_DEPTH);
2546 printk("... MAX_LOCKDEP_KEYS: %lu\n", MAX_LOCKDEP_KEYS);
2547 printk("... CLASSHASH_SIZE: %lu\n", CLASSHASH_SIZE);
2548 printk("... MAX_LOCKDEP_ENTRIES: %lu\n", MAX_LOCKDEP_ENTRIES);
2549 printk("... MAX_LOCKDEP_CHAINS: %lu\n", MAX_LOCKDEP_CHAINS);
2550 printk("... CHAINHASH_SIZE: %lu\n", CHAINHASH_SIZE);
2551
2552 printk(" memory used by lock dependency info: %lu kB\n",
2553 (sizeof(struct lock_class) * MAX_LOCKDEP_KEYS +
2554 sizeof(struct list_head) * CLASSHASH_SIZE +
2555 sizeof(struct lock_list) * MAX_LOCKDEP_ENTRIES +
2556 sizeof(struct lock_chain) * MAX_LOCKDEP_CHAINS +
2557 sizeof(struct list_head) * CHAINHASH_SIZE) / 1024);
2558
2559 printk(" per task-struct memory footprint: %lu bytes\n",
2560 sizeof(struct held_lock) * MAX_LOCK_DEPTH);
2561
2562#ifdef CONFIG_DEBUG_LOCKDEP
2563 if (lockdep_init_error)
2564 printk("WARNING: lockdep init error! Arch code didnt call lockdep_init() early enough?\n");
2565#endif
2566}
2567
2568static inline int in_range(const void *start, const void *addr, const void *end)
2569{
2570 return addr >= start && addr <= end;
2571}
2572
2573static void
2574print_freed_lock_bug(struct task_struct *curr, const void *mem_from,
2575 const void *mem_to, struct held_lock *hlock)
2576{
2577 if (!debug_locks_off())
2578 return;
2579 if (debug_locks_silent)
2580 return;
2581
2582 printk("\n=========================\n");
2583 printk( "[ BUG: held lock freed! ]\n");
2584 printk( "-------------------------\n");
2585 printk("%s/%d is freeing memory %p-%p, with a lock still held there!\n",
2586 curr->comm, curr->pid, mem_from, mem_to-1);
2587 print_lock(hlock);
2588 lockdep_print_held_locks(curr);
2589
2590 printk("\nstack backtrace:\n");
2591 dump_stack();
2592}
2593
2594/*
2595 * Called when kernel memory is freed (or unmapped), or if a lock
2596 * is destroyed or reinitialized - this code checks whether there is
2597 * any held lock in the memory range of <from> to <to>:
2598 */
2599void debug_check_no_locks_freed(const void *mem_from, unsigned long mem_len)
2600{
2601 const void *mem_to = mem_from + mem_len, *lock_from, *lock_to;
2602 struct task_struct *curr = current;
2603 struct held_lock *hlock;
2604 unsigned long flags;
2605 int i;
2606
2607 if (unlikely(!debug_locks))
2608 return;
2609
2610 local_irq_save(flags);
2611 for (i = 0; i < curr->lockdep_depth; i++) {
2612 hlock = curr->held_locks + i;
2613
2614 lock_from = (void *)hlock->instance;
2615 lock_to = (void *)(hlock->instance + 1);
2616
2617 if (!in_range(mem_from, lock_from, mem_to) &&
2618 !in_range(mem_from, lock_to, mem_to))
2619 continue;
2620
2621 print_freed_lock_bug(curr, mem_from, mem_to, hlock);
2622 break;
2623 }
2624 local_irq_restore(flags);
2625}
2626
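The range test above is simple enough to model in isolation: a held lock is flagged when it overlaps the memory being freed. A standalone sketch in the same spirit, written as a full interval intersection rather than the endpoint checks used above:

    #include <stdio.h>

    struct toy_mutex { int state; };

    /* Does [obj, obj + size) intersect the freed range [from, from + len)? */
    static int object_in_freed_range(const void *obj, unsigned long size,
                                     const void *from, unsigned long len)
    {
        const char *o = obj, *f = from;

        return o < f + len && f < o + size;
    }

    int main(void)
    {
        struct toy_mutex pool[4];
        struct toy_mutex *m = &pool[1];

        /* Freeing the whole pool while 'm' lives inside it should be flagged. */
        printf("%d\n", object_in_freed_range(m, sizeof(*m), pool, sizeof(pool)));
        /* Freeing only the last element does not overlap 'm'. */
        printf("%d\n", object_in_freed_range(m, sizeof(*m), &pool[3], sizeof(pool[3])));
        return 0;
    }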
2627static void print_held_locks_bug(struct task_struct *curr)
2628{
2629 if (!debug_locks_off())
2630 return;
2631 if (debug_locks_silent)
2632 return;
2633
2634 printk("\n=====================================\n");
2635 printk( "[ BUG: lock held at task exit time! ]\n");
2636 printk( "-------------------------------------\n");
2637 printk("%s/%d is exiting with locks still held!\n",
2638 curr->comm, curr->pid);
2639 lockdep_print_held_locks(curr);
2640
2641 printk("\nstack backtrace:\n");
2642 dump_stack();
2643}
2644
2645void debug_check_no_locks_held(struct task_struct *task)
2646{
2647 if (unlikely(task->lockdep_depth > 0))
2648 print_held_locks_bug(task);
2649}
2650
2651void debug_show_all_locks(void)
2652{
2653 struct task_struct *g, *p;
2654 int count = 10;
2655 int unlock = 1;
2656
2657 printk("\nShowing all locks held in the system:\n");
2658
2659 /*
2660 * Here we try to get the tasklist_lock as hard as possible; if we
2661 * are not successful after 2 seconds we ignore it (but keep
2662 * trying). This enables a debug printout even if a
2663 * tasklist_lock-holding task deadlocks or crashes.
2664 */
2665retry:
2666 if (!read_trylock(&tasklist_lock)) {
2667 if (count == 10)
2668 printk("hm, tasklist_lock locked, retrying... ");
2669 if (count) {
2670 count--;
2671 printk(" #%d", 10-count);
2672 mdelay(200);
2673 goto retry;
2674 }
2675 printk(" ignoring it.\n");
2676 unlock = 0;
2677 }
2678 if (count != 10)
2679 printk(" locked it.\n");
2680
2681 do_each_thread(g, p) {
2682 if (p->lockdep_depth)
2683 lockdep_print_held_locks(p);
2684 if (!unlock)
2685 if (read_trylock(&tasklist_lock))
2686 unlock = 1;
2687 } while_each_thread(g, p);
2688
2689 printk("\n");
2690 printk("=============================================\n\n");
2691
2692 if (unlock)
2693 read_unlock(&tasklist_lock);
2694}
2695
2696EXPORT_SYMBOL_GPL(debug_show_all_locks);
2697
2698void debug_show_held_locks(struct task_struct *task)
2699{
2700 lockdep_print_held_locks(task);
2701}
2702
2703EXPORT_SYMBOL_GPL(debug_show_held_locks);
2704
diff --git a/kernel/lockdep_internals.h b/kernel/lockdep_internals.h
new file mode 100644
index 000000000000..0d355f24fe04
--- /dev/null
+++ b/kernel/lockdep_internals.h
@@ -0,0 +1,78 @@
1/*
2 * kernel/lockdep_internals.h
3 *
4 * Runtime locking correctness validator
5 *
6 * lockdep subsystem internal functions and variables.
7 */
8
9/*
10 * MAX_LOCKDEP_ENTRIES is the maximum number of lock dependencies
11 * we track.
12 *
13 * We use the per-lock dependency maps in two ways: we grow them by adding
14 * every to-be-taken lock to each currently held lock's own dependency
15 * table (if it's not there yet), and we check them for lock-order
16 * conflicts and deadlocks.
17 */
18#define MAX_LOCKDEP_ENTRIES 8192UL
19
20#define MAX_LOCKDEP_KEYS_BITS 11
21#define MAX_LOCKDEP_KEYS (1UL << MAX_LOCKDEP_KEYS_BITS)
22
23#define MAX_LOCKDEP_CHAINS_BITS 13
24#define MAX_LOCKDEP_CHAINS (1UL << MAX_LOCKDEP_CHAINS_BITS)
25
26/*
27 * Stack-trace: tightly packed array of stack backtrace
28 * addresses. Protected by the hash_lock.
29 */
30#define MAX_STACK_TRACE_ENTRIES 131072UL
31
32extern struct list_head all_lock_classes;
33
34extern void
35get_usage_chars(struct lock_class *class, char *c1, char *c2, char *c3, char *c4);
36
37extern const char * __get_key_name(struct lockdep_subclass_key *key, char *str);
38
39extern unsigned long nr_lock_classes;
40extern unsigned long nr_list_entries;
41extern unsigned long nr_lock_chains;
42extern unsigned long nr_stack_trace_entries;
43
44extern unsigned int nr_hardirq_chains;
45extern unsigned int nr_softirq_chains;
46extern unsigned int nr_process_chains;
47extern unsigned int max_lockdep_depth;
48extern unsigned int max_recursion_depth;
49
50#ifdef CONFIG_DEBUG_LOCKDEP
51/*
52 * Various lockdep statistics:
53 */
54extern atomic_t chain_lookup_hits;
55extern atomic_t chain_lookup_misses;
56extern atomic_t hardirqs_on_events;
57extern atomic_t hardirqs_off_events;
58extern atomic_t redundant_hardirqs_on;
59extern atomic_t redundant_hardirqs_off;
60extern atomic_t softirqs_on_events;
61extern atomic_t softirqs_off_events;
62extern atomic_t redundant_softirqs_on;
63extern atomic_t redundant_softirqs_off;
64extern atomic_t nr_unused_locks;
65extern atomic_t nr_cyclic_checks;
66extern atomic_t nr_cyclic_check_recursions;
67extern atomic_t nr_find_usage_forwards_checks;
68extern atomic_t nr_find_usage_forwards_recursions;
69extern atomic_t nr_find_usage_backwards_checks;
70extern atomic_t nr_find_usage_backwards_recursions;
71# define debug_atomic_inc(ptr) atomic_inc(ptr)
72# define debug_atomic_dec(ptr) atomic_dec(ptr)
73# define debug_atomic_read(ptr) atomic_read(ptr)
74#else
75# define debug_atomic_inc(ptr) do { } while (0)
76# define debug_atomic_dec(ptr) do { } while (0)
77# define debug_atomic_read(ptr) 0
78#endif
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
new file mode 100644
index 000000000000..f6e72eaab3fa
--- /dev/null
+++ b/kernel/lockdep_proc.c
@@ -0,0 +1,345 @@
1/*
2 * kernel/lockdep_proc.c
3 *
4 * Runtime locking correctness validator
5 *
6 * Started by Ingo Molnar:
7 *
8 * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
9 *
10 * Code for /proc/lockdep and /proc/lockdep_stats:
11 *
12 */
13#include <linux/sched.h>
14#include <linux/module.h>
15#include <linux/proc_fs.h>
16#include <linux/seq_file.h>
17#include <linux/kallsyms.h>
18#include <linux/debug_locks.h>
19
20#include "lockdep_internals.h"
21
22static void *l_next(struct seq_file *m, void *v, loff_t *pos)
23{
24 struct lock_class *class = v;
25
26 (*pos)++;
27
28 if (class->lock_entry.next != &all_lock_classes)
29 class = list_entry(class->lock_entry.next, struct lock_class,
30 lock_entry);
31 else
32 class = NULL;
33 m->private = class;
34
35 return class;
36}
37
38static void *l_start(struct seq_file *m, loff_t *pos)
39{
40 struct lock_class *class = m->private;
41
42 if (&class->lock_entry == all_lock_classes.next)
43 seq_printf(m, "all lock classes:\n");
44
45 return class;
46}
47
48static void l_stop(struct seq_file *m, void *v)
49{
50}
51
52static unsigned long count_forward_deps(struct lock_class *class)
53{
54 struct lock_list *entry;
55 unsigned long ret = 1;
56
57 /*
58 * Recurse this class's dependency list:
59 */
60 list_for_each_entry(entry, &class->locks_after, entry)
61 ret += count_forward_deps(entry->class);
62
63 return ret;
64}
65
66static unsigned long count_backward_deps(struct lock_class *class)
67{
68 struct lock_list *entry;
69 unsigned long ret = 1;
70
71 /*
72 * Recurse this class's dependency list:
73 */
74 list_for_each_entry(entry, &class->locks_before, entry)
75 ret += count_backward_deps(entry->class);
76
77 return ret;
78}
79
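A quick worked example of what count_forward_deps() reports (this is the FD: column printed by l_show() below): each class counts itself once plus the count of every class it reaches, per edge followed, so shared nodes are counted once per path rather than once overall. For a chain A -> B -> C the count for A is 3; for a diamond A -> B -> D, A -> C -> D it is 5, because D is reached along two paths. A standalone re-implementation over a toy graph:

    #include <stdio.h>

    #define NCLASSES 4

    /* adjacency: deps[a][b] != 0 means a -> b (a was taken before b) */
    static const int deps[NCLASSES][NCLASSES] = {
        /* A */ { 0, 1, 1, 0 },         /* A -> B, A -> C */
        /* B */ { 0, 0, 0, 1 },         /* B -> D */
        /* C */ { 0, 0, 0, 1 },         /* C -> D */
        /* D */ { 0, 0, 0, 0 },
    };

    /* Same shape as count_forward_deps(): 1 + the count of every successor. */
    static unsigned long count_forward(int class)
    {
        unsigned long ret = 1;
        int i;

        for (i = 0; i < NCLASSES; i++)
            if (deps[class][i])
                ret += count_forward(i);
        return ret;
    }

    int main(void)
    {
        printf("FD of A: %lu\n", count_forward(0));     /* 5: D counted twice */
        return 0;
    }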
80static int l_show(struct seq_file *m, void *v)
81{
82 unsigned long nr_forward_deps, nr_backward_deps;
83 struct lock_class *class = m->private;
84 char str[128], c1, c2, c3, c4;
85 const char *name;
86
87 seq_printf(m, "%p", class->key);
88#ifdef CONFIG_DEBUG_LOCKDEP
89 seq_printf(m, " OPS:%8ld", class->ops);
90#endif
91 nr_forward_deps = count_forward_deps(class);
92 seq_printf(m, " FD:%5ld", nr_forward_deps);
93
94 nr_backward_deps = count_backward_deps(class);
95 seq_printf(m, " BD:%5ld", nr_backward_deps);
96
97 get_usage_chars(class, &c1, &c2, &c3, &c4);
98 seq_printf(m, " %c%c%c%c", c1, c2, c3, c4);
99
100 name = class->name;
101 if (!name) {
102 name = __get_key_name(class->key, str);
103 seq_printf(m, ": %s", name);
104 } else {
105 seq_printf(m, ": %s", name);
106 if (class->name_version > 1)
107 seq_printf(m, "#%d", class->name_version);
108 if (class->subclass)
109 seq_printf(m, "/%d", class->subclass);
110 }
111 seq_puts(m, "\n");
112
113 return 0;
114}
115
116static struct seq_operations lockdep_ops = {
117 .start = l_start,
118 .next = l_next,
119 .stop = l_stop,
120 .show = l_show,
121};
122
123static int lockdep_open(struct inode *inode, struct file *file)
124{
125 int res = seq_open(file, &lockdep_ops);
126 if (!res) {
127 struct seq_file *m = file->private_data;
128
129 if (!list_empty(&all_lock_classes))
130 m->private = list_entry(all_lock_classes.next,
131 struct lock_class, lock_entry);
132 else
133 m->private = NULL;
134 }
135 return res;
136}
137
138static struct file_operations proc_lockdep_operations = {
139 .open = lockdep_open,
140 .read = seq_read,
141 .llseek = seq_lseek,
142 .release = seq_release,
143};
144
145static void lockdep_stats_debug_show(struct seq_file *m)
146{
147#ifdef CONFIG_DEBUG_LOCKDEP
148 unsigned int hi1 = debug_atomic_read(&hardirqs_on_events),
149 hi2 = debug_atomic_read(&hardirqs_off_events),
150 hr1 = debug_atomic_read(&redundant_hardirqs_on),
151 hr2 = debug_atomic_read(&redundant_hardirqs_off),
152 si1 = debug_atomic_read(&softirqs_on_events),
153 si2 = debug_atomic_read(&softirqs_off_events),
154 sr1 = debug_atomic_read(&redundant_softirqs_on),
155 sr2 = debug_atomic_read(&redundant_softirqs_off);
156
157 seq_printf(m, " chain lookup misses: %11u\n",
158 debug_atomic_read(&chain_lookup_misses));
159 seq_printf(m, " chain lookup hits: %11u\n",
160 debug_atomic_read(&chain_lookup_hits));
161 seq_printf(m, " cyclic checks: %11u\n",
162 debug_atomic_read(&nr_cyclic_checks));
163 seq_printf(m, " cyclic-check recursions: %11u\n",
164 debug_atomic_read(&nr_cyclic_check_recursions));
165 seq_printf(m, " find-mask forwards checks: %11u\n",
166 debug_atomic_read(&nr_find_usage_forwards_checks));
167 seq_printf(m, " find-mask forwards recursions: %11u\n",
168 debug_atomic_read(&nr_find_usage_forwards_recursions));
169 seq_printf(m, " find-mask backwards checks: %11u\n",
170 debug_atomic_read(&nr_find_usage_backwards_checks));
171 seq_printf(m, " find-mask backwards recursions:%11u\n",
172 debug_atomic_read(&nr_find_usage_backwards_recursions));
173
174 seq_printf(m, " hardirq on events: %11u\n", hi1);
175 seq_printf(m, " hardirq off events: %11u\n", hi2);
176 seq_printf(m, " redundant hardirq ons: %11u\n", hr1);
177 seq_printf(m, " redundant hardirq offs: %11u\n", hr2);
178 seq_printf(m, " softirq on events: %11u\n", si1);
179 seq_printf(m, " softirq off events: %11u\n", si2);
180 seq_printf(m, " redundant softirq ons: %11u\n", sr1);
181 seq_printf(m, " redundant softirq offs: %11u\n", sr2);
182#endif
183}
184
185static int lockdep_stats_show(struct seq_file *m, void *v)
186{
187 struct lock_class *class;
188 unsigned long nr_unused = 0, nr_uncategorized = 0,
189 nr_irq_safe = 0, nr_irq_unsafe = 0,
190 nr_softirq_safe = 0, nr_softirq_unsafe = 0,
191 nr_hardirq_safe = 0, nr_hardirq_unsafe = 0,
192 nr_irq_read_safe = 0, nr_irq_read_unsafe = 0,
193 nr_softirq_read_safe = 0, nr_softirq_read_unsafe = 0,
194 nr_hardirq_read_safe = 0, nr_hardirq_read_unsafe = 0,
195 sum_forward_deps = 0, factor = 0;
196
197 list_for_each_entry(class, &all_lock_classes, lock_entry) {
198
199 if (class->usage_mask == 0)
200 nr_unused++;
201 if (class->usage_mask == LOCKF_USED)
202 nr_uncategorized++;
203 if (class->usage_mask & LOCKF_USED_IN_IRQ)
204 nr_irq_safe++;
205 if (class->usage_mask & LOCKF_ENABLED_IRQS)
206 nr_irq_unsafe++;
207 if (class->usage_mask & LOCKF_USED_IN_SOFTIRQ)
208 nr_softirq_safe++;
209 if (class->usage_mask & LOCKF_ENABLED_SOFTIRQS)
210 nr_softirq_unsafe++;
211 if (class->usage_mask & LOCKF_USED_IN_HARDIRQ)
212 nr_hardirq_safe++;
213 if (class->usage_mask & LOCKF_ENABLED_HARDIRQS)
214 nr_hardirq_unsafe++;
215 if (class->usage_mask & LOCKF_USED_IN_IRQ_READ)
216 nr_irq_read_safe++;
217 if (class->usage_mask & LOCKF_ENABLED_IRQS_READ)
218 nr_irq_read_unsafe++;
219 if (class->usage_mask & LOCKF_USED_IN_SOFTIRQ_READ)
220 nr_softirq_read_safe++;
221 if (class->usage_mask & LOCKF_ENABLED_SOFTIRQS_READ)
222 nr_softirq_read_unsafe++;
223 if (class->usage_mask & LOCKF_USED_IN_HARDIRQ_READ)
224 nr_hardirq_read_safe++;
225 if (class->usage_mask & LOCKF_ENABLED_HARDIRQS_READ)
226 nr_hardirq_read_unsafe++;
227
228 sum_forward_deps += count_forward_deps(class);
229 }
230#ifdef CONFIG_DEBUG_LOCKDEP
231 DEBUG_LOCKS_WARN_ON(debug_atomic_read(&nr_unused_locks) != nr_unused);
232#endif
233 seq_printf(m, " lock-classes: %11lu [max: %lu]\n",
234 nr_lock_classes, MAX_LOCKDEP_KEYS);
235 seq_printf(m, " direct dependencies: %11lu [max: %lu]\n",
236 nr_list_entries, MAX_LOCKDEP_ENTRIES);
237 seq_printf(m, " indirect dependencies: %11lu\n",
238 sum_forward_deps);
239
240 /*
241 * Total number of dependencies:
242 *
243 * All irq-safe locks may nest inside irq-unsafe locks,
244 * plus all the other known dependencies:
245 */
246 seq_printf(m, " all direct dependencies: %11lu\n",
247 nr_irq_unsafe * nr_irq_safe +
248 nr_hardirq_unsafe * nr_hardirq_safe +
249 nr_list_entries);
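
Worked example of the estimate printed above, with invented numbers: 100 irq-safe and 200 irq-unsafe classes contribute 100 * 200 = 20000 potential nestings, 50 hardirq-safe and 120 hardirq-unsafe classes contribute another 50 * 120 = 6000, and with 3000 recorded direct dependencies the line would read 20000 + 6000 + 3000 = 29000.
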
250
251 /*
252 * Estimated factor between direct and indirect
253 * dependencies:
254 */
255 if (nr_list_entries)
256 factor = sum_forward_deps / nr_list_entries;
257
258 seq_printf(m, " dependency chains: %11lu [max: %lu]\n",
259 nr_lock_chains, MAX_LOCKDEP_CHAINS);
260
261#ifdef CONFIG_TRACE_IRQFLAGS
262 seq_printf(m, " in-hardirq chains: %11u\n",
263 nr_hardirq_chains);
264 seq_printf(m, " in-softirq chains: %11u\n",
265 nr_softirq_chains);
266#endif
267 seq_printf(m, " in-process chains: %11u\n",
268 nr_process_chains);
269 seq_printf(m, " stack-trace entries: %11lu [max: %lu]\n",
270 nr_stack_trace_entries, MAX_STACK_TRACE_ENTRIES);
271 seq_printf(m, " combined max dependencies: %11u\n",
272 (nr_hardirq_chains + 1) *
273 (nr_softirq_chains + 1) *
274 (nr_process_chains + 1)
275 );
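
Worked example for the product above, again with invented numbers: 10 in-hardirq chains, 20 in-softirq chains and 300 in-process chains give (10 + 1) * (20 + 1) * (300 + 1) = 11 * 21 * 301 = 69531 combined combinations.
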
276 seq_printf(m, " hardirq-safe locks: %11lu\n",
277 nr_hardirq_safe);
278 seq_printf(m, " hardirq-unsafe locks: %11lu\n",
279 nr_hardirq_unsafe);
280 seq_printf(m, " softirq-safe locks: %11lu\n",
281 nr_softirq_safe);
282 seq_printf(m, " softirq-unsafe locks: %11lu\n",
283 nr_softirq_unsafe);
284 seq_printf(m, " irq-safe locks: %11lu\n",
285 nr_irq_safe);
286 seq_printf(m, " irq-unsafe locks: %11lu\n",
287 nr_irq_unsafe);
288
289 seq_printf(m, " hardirq-read-safe locks: %11lu\n",
290 nr_hardirq_read_safe);
291 seq_printf(m, " hardirq-read-unsafe locks: %11lu\n",
292 nr_hardirq_read_unsafe);
293 seq_printf(m, " softirq-read-safe locks: %11lu\n",
294 nr_softirq_read_safe);
295 seq_printf(m, " softirq-read-unsafe locks: %11lu\n",
296 nr_softirq_read_unsafe);
297 seq_printf(m, " irq-read-safe locks: %11lu\n",
298 nr_irq_read_safe);
299 seq_printf(m, " irq-read-unsafe locks: %11lu\n",
300 nr_irq_read_unsafe);
301
302 seq_printf(m, " uncategorized locks: %11lu\n",
303 nr_uncategorized);
304 seq_printf(m, " unused locks: %11lu\n",
305 nr_unused);
306 seq_printf(m, " max locking depth: %11u\n",
307 max_lockdep_depth);
308 seq_printf(m, " max recursion depth: %11u\n",
309 max_recursion_depth);
310 lockdep_stats_debug_show(m);
311 seq_printf(m, " debug_locks: %11u\n",
312 debug_locks);
313
314 return 0;
315}
316
317static int lockdep_stats_open(struct inode *inode, struct file *file)
318{
319 return single_open(file, lockdep_stats_show, NULL);
320}
321
322static struct file_operations proc_lockdep_stats_operations = {
323 .open = lockdep_stats_open,
324 .read = seq_read,
325 .llseek = seq_lseek,
326 .release = seq_release,
327};
328
329static int __init lockdep_proc_init(void)
330{
331 struct proc_dir_entry *entry;
332
333 entry = create_proc_entry("lockdep", S_IRUSR, NULL);
334 if (entry)
335 entry->proc_fops = &proc_lockdep_operations;
336
337 entry = create_proc_entry("lockdep_stats", S_IRUSR, NULL);
338 if (entry)
339 entry->proc_fops = &proc_lockdep_stats_operations;
340
341 return 0;
342}
343
344__initcall(lockdep_proc_init);
345
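
lockdep_proc_init() above uses the create_proc_entry() interface of this kernel generation: allocate the /proc entry, then point its proc_fops at a seq_open()-based file_operations. A hypothetical sketch of the same wiring for a read-only /proc/foo, reusing the invented foo_seq_ops from the earlier sketch (foo_open, proc_foo_operations and foo_proc_init are made-up names):

static int foo_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &foo_seq_ops);
}

static struct file_operations proc_foo_operations = {
	.open		= foo_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release,
};

static int __init foo_proc_init(void)
{
	struct proc_dir_entry *entry;

	/* S_IRUSR: readable by the owner (root) only, like the lockdep files */
	entry = create_proc_entry("foo", S_IRUSR, NULL);
	if (entry)
		entry->proc_fops = &proc_foo_operations;
	return 0;
}
__initcall(foo_proc_init);
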
diff --git a/kernel/module.c b/kernel/module.c
index bbe04862e1b0..2a19cd47c046 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1,4 +1,4 @@
1/* Rewritten by Rusty Russell, on the backs of many others... 1/*
2 Copyright (C) 2002 Richard Henderson 2 Copyright (C) 2002 Richard Henderson
3 Copyright (C) 2001 Rusty Russell, 2002 Rusty Russell IBM. 3 Copyright (C) 2001 Rusty Russell, 2002 Rusty Russell IBM.
4 4
@@ -16,7 +16,6 @@
16 along with this program; if not, write to the Free Software 16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 17 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18*/ 18*/
19#include <linux/config.h>
20#include <linux/module.h> 19#include <linux/module.h>
21#include <linux/moduleloader.h> 20#include <linux/moduleloader.h>
22#include <linux/init.h> 21#include <linux/init.h>
@@ -40,9 +39,11 @@
40#include <linux/string.h> 39#include <linux/string.h>
41#include <linux/sched.h> 40#include <linux/sched.h>
42#include <linux/mutex.h> 41#include <linux/mutex.h>
42#include <linux/unwind.h>
43#include <asm/uaccess.h> 43#include <asm/uaccess.h>
44#include <asm/semaphore.h> 44#include <asm/semaphore.h>
45#include <asm/cacheflush.h> 45#include <asm/cacheflush.h>
46#include <linux/license.h>
46 47
47#if 0 48#if 0
48#define DEBUGP printk 49#define DEBUGP printk
@@ -120,9 +121,17 @@ extern const struct kernel_symbol __start___ksymtab_gpl[];
120extern const struct kernel_symbol __stop___ksymtab_gpl[]; 121extern const struct kernel_symbol __stop___ksymtab_gpl[];
121extern const struct kernel_symbol __start___ksymtab_gpl_future[]; 122extern const struct kernel_symbol __start___ksymtab_gpl_future[];
122extern const struct kernel_symbol __stop___ksymtab_gpl_future[]; 123extern const struct kernel_symbol __stop___ksymtab_gpl_future[];
124extern const struct kernel_symbol __start___ksymtab_unused[];
125extern const struct kernel_symbol __stop___ksymtab_unused[];
126extern const struct kernel_symbol __start___ksymtab_unused_gpl[];
127extern const struct kernel_symbol __stop___ksymtab_unused_gpl[];
128extern const struct kernel_symbol __start___ksymtab_gpl_future[];
129extern const struct kernel_symbol __stop___ksymtab_gpl_future[];
123extern const unsigned long __start___kcrctab[]; 130extern const unsigned long __start___kcrctab[];
124extern const unsigned long __start___kcrctab_gpl[]; 131extern const unsigned long __start___kcrctab_gpl[];
125extern const unsigned long __start___kcrctab_gpl_future[]; 132extern const unsigned long __start___kcrctab_gpl_future[];
133extern const unsigned long __start___kcrctab_unused[];
134extern const unsigned long __start___kcrctab_unused_gpl[];
126 135
127#ifndef CONFIG_MODVERSIONS 136#ifndef CONFIG_MODVERSIONS
128#define symversion(base, idx) NULL 137#define symversion(base, idx) NULL
@@ -142,6 +151,17 @@ static const struct kernel_symbol *lookup_symbol(const char *name,
142 return NULL; 151 return NULL;
143} 152}
144 153
154static void printk_unused_warning(const char *name)
155{
156 printk(KERN_WARNING "Symbol %s is marked as UNUSED, "
157 "however this module is using it.\n", name);
158 printk(KERN_WARNING "This symbol will go away in the future.\n");
159 printk(KERN_WARNING "Please evaluate if this is the right api to use, "
160 "and if it really is, submit a report to the linux kernel "
161 "mailing list together with your code for "
162 "inclusion.\n");
163}
164
145/* Find a symbol, return value, crc and module which owns it */ 165/* Find a symbol, return value, crc and module which owns it */
146static unsigned long __find_symbol(const char *name, 166static unsigned long __find_symbol(const char *name,
147 struct module **owner, 167 struct module **owner,
@@ -184,6 +204,25 @@ static unsigned long __find_symbol(const char *name,
184 return ks->value; 204 return ks->value;
185 } 205 }
186 206
207 ks = lookup_symbol(name, __start___ksymtab_unused,
208 __stop___ksymtab_unused);
209 if (ks) {
210 printk_unused_warning(name);
211 *crc = symversion(__start___kcrctab_unused,
212 (ks - __start___ksymtab_unused));
213 return ks->value;
214 }
215
216 if (gplok)
217 ks = lookup_symbol(name, __start___ksymtab_unused_gpl,
218 __stop___ksymtab_unused_gpl);
219 if (ks) {
220 printk_unused_warning(name);
221 *crc = symversion(__start___kcrctab_unused_gpl,
222 (ks - __start___ksymtab_unused_gpl));
223 return ks->value;
224 }
225
187 /* Now try modules. */ 226 /* Now try modules. */
188 list_for_each_entry(mod, &modules, list) { 227 list_for_each_entry(mod, &modules, list) {
189 *owner = mod; 228 *owner = mod;
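
The __ksymtab_unused and __ksymtab_unused_gpl tables consulted above are presumably filled by the EXPORT_UNUSED_SYMBOL()/EXPORT_UNUSED_SYMBOL_GPL() markers that accompany this change; the macros themselves are not shown in this diff, so treat the following as a hedged illustration. my_legacy_helper is a made-up symbol:

/* in the exporting file (illustration only) */
int my_legacy_helper(int arg)
{
	return arg + 1;
}
EXPORT_UNUSED_SYMBOL(my_legacy_helper);

/*
 * A module that still references my_legacy_helper() resolves it through
 * the lookup above, but printk_unused_warning() fires at load time to
 * flag the symbol as scheduled for removal.
 */
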
@@ -202,6 +241,23 @@ static unsigned long __find_symbol(const char *name,
202 return ks->value; 241 return ks->value;
203 } 242 }
204 } 243 }
244 ks = lookup_symbol(name, mod->unused_syms, mod->unused_syms + mod->num_unused_syms);
245 if (ks) {
246 printk_unused_warning(name);
247 *crc = symversion(mod->unused_crcs, (ks - mod->unused_syms));
248 return ks->value;
249 }
250
251 if (gplok) {
252 ks = lookup_symbol(name, mod->unused_gpl_syms,
253 mod->unused_gpl_syms + mod->num_unused_gpl_syms);
254 if (ks) {
255 printk_unused_warning(name);
256 *crc = symversion(mod->unused_gpl_crcs,
257 (ks - mod->unused_gpl_syms));
258 return ks->value;
259 }
260 }
205 ks = lookup_symbol(name, mod->gpl_future_syms, 261 ks = lookup_symbol(name, mod->gpl_future_syms,
206 (mod->gpl_future_syms + 262 (mod->gpl_future_syms +
207 mod->num_gpl_future_syms)); 263 mod->num_gpl_future_syms));
@@ -1051,6 +1107,8 @@ static void free_module(struct module *mod)
1051 remove_sect_attrs(mod); 1107 remove_sect_attrs(mod);
1052 mod_kobject_remove(mod); 1108 mod_kobject_remove(mod);
1053 1109
1110 unwind_remove_table(mod->unwind_info, 0);
1111
1054 /* Arch-specific cleanup. */ 1112 /* Arch-specific cleanup. */
1055 module_arch_cleanup(mod); 1113 module_arch_cleanup(mod);
1056 1114
@@ -1063,6 +1121,9 @@ static void free_module(struct module *mod)
1063 if (mod->percpu) 1121 if (mod->percpu)
1064 percpu_modfree(mod->percpu); 1122 percpu_modfree(mod->percpu);
1065 1123
1124 /* Free lock-classes: */
1125 lockdep_free_key_range(mod->module_core, mod->core_size);
1126
1066 /* Finally, free the core (containing the module structure) */ 1127 /* Finally, free the core (containing the module structure) */
1067 module_free(mod, mod->module_core); 1128 module_free(mod, mod->module_core);
1068} 1129}
@@ -1248,16 +1309,6 @@ static void layout_sections(struct module *mod,
1248 } 1309 }
1249} 1310}
1250 1311
1251static inline int license_is_gpl_compatible(const char *license)
1252{
1253 return (strcmp(license, "GPL") == 0
1254 || strcmp(license, "GPL v2") == 0
1255 || strcmp(license, "GPL and additional rights") == 0
1256 || strcmp(license, "Dual BSD/GPL") == 0
1257 || strcmp(license, "Dual MIT/GPL") == 0
1258 || strcmp(license, "Dual MPL/GPL") == 0);
1259}
1260
1261static void set_license(struct module *mod, const char *license) 1312static void set_license(struct module *mod, const char *license)
1262{ 1313{
1263 if (!license) 1314 if (!license)
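
The inline removed above is not lost: <linux/license.h> is added to the include list earlier in this diff, and set_license() presumably keeps calling license_is_gpl_compatible() from there. A sketch of the helper in its assumed new home, with the body taken verbatim from the removed lines:

/* include/linux/license.h (assumed location) */
static inline int license_is_gpl_compatible(const char *license)
{
	return (strcmp(license, "GPL") == 0
		|| strcmp(license, "GPL v2") == 0
		|| strcmp(license, "GPL and additional rights") == 0
		|| strcmp(license, "Dual BSD/GPL") == 0
		|| strcmp(license, "Dual MIT/GPL") == 0
		|| strcmp(license, "Dual MPL/GPL") == 0);
}
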
@@ -1326,7 +1377,7 @@ int is_exported(const char *name, const struct module *mod)
1326 if (!mod && lookup_symbol(name, __start___ksymtab, __stop___ksymtab)) 1377 if (!mod && lookup_symbol(name, __start___ksymtab, __stop___ksymtab))
1327 return 1; 1378 return 1;
1328 else 1379 else
1329 if (lookup_symbol(name, mod->syms, mod->syms + mod->num_syms)) 1380 if (mod && lookup_symbol(name, mod->syms, mod->syms + mod->num_syms))
1330 return 1; 1381 return 1;
1331 else 1382 else
1332 return 0; 1383 return 0;
@@ -1409,10 +1460,27 @@ static struct module *load_module(void __user *umod,
1409 Elf_Ehdr *hdr; 1460 Elf_Ehdr *hdr;
1410 Elf_Shdr *sechdrs; 1461 Elf_Shdr *sechdrs;
1411 char *secstrings, *args, *modmagic, *strtab = NULL; 1462 char *secstrings, *args, *modmagic, *strtab = NULL;
1412 unsigned int i, symindex = 0, strindex = 0, setupindex, exindex, 1463 unsigned int i;
1413 exportindex, modindex, obsparmindex, infoindex, gplindex, 1464 unsigned int symindex = 0;
1414 crcindex, gplcrcindex, versindex, pcpuindex, gplfutureindex, 1465 unsigned int strindex = 0;
1415 gplfuturecrcindex; 1466 unsigned int setupindex;
1467 unsigned int exindex;
1468 unsigned int exportindex;
1469 unsigned int modindex;
1470 unsigned int obsparmindex;
1471 unsigned int infoindex;
1472 unsigned int gplindex;
1473 unsigned int crcindex;
1474 unsigned int gplcrcindex;
1475 unsigned int versindex;
1476 unsigned int pcpuindex;
1477 unsigned int gplfutureindex;
1478 unsigned int gplfuturecrcindex;
1479 unsigned int unwindex = 0;
1480 unsigned int unusedindex;
1481 unsigned int unusedcrcindex;
1482 unsigned int unusedgplindex;
1483 unsigned int unusedgplcrcindex;
1416 struct module *mod; 1484 struct module *mod;
1417 long err = 0; 1485 long err = 0;
1418 void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */ 1486 void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */
@@ -1493,15 +1561,22 @@ static struct module *load_module(void __user *umod,
1493 exportindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab"); 1561 exportindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab");
1494 gplindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_gpl"); 1562 gplindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_gpl");
1495 gplfutureindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_gpl_future"); 1563 gplfutureindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_gpl_future");
1564 unusedindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_unused");
1565 unusedgplindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_unused_gpl");
1496 crcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab"); 1566 crcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab");
1497 gplcrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_gpl"); 1567 gplcrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_gpl");
1498 gplfuturecrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_gpl_future"); 1568 gplfuturecrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_gpl_future");
1569 unusedcrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_unused");
1570 unusedgplcrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_unused_gpl");
1499 setupindex = find_sec(hdr, sechdrs, secstrings, "__param"); 1571 setupindex = find_sec(hdr, sechdrs, secstrings, "__param");
1500 exindex = find_sec(hdr, sechdrs, secstrings, "__ex_table"); 1572 exindex = find_sec(hdr, sechdrs, secstrings, "__ex_table");
1501 obsparmindex = find_sec(hdr, sechdrs, secstrings, "__obsparm"); 1573 obsparmindex = find_sec(hdr, sechdrs, secstrings, "__obsparm");
1502 versindex = find_sec(hdr, sechdrs, secstrings, "__versions"); 1574 versindex = find_sec(hdr, sechdrs, secstrings, "__versions");
1503 infoindex = find_sec(hdr, sechdrs, secstrings, ".modinfo"); 1575 infoindex = find_sec(hdr, sechdrs, secstrings, ".modinfo");
1504 pcpuindex = find_pcpusec(hdr, sechdrs, secstrings); 1576 pcpuindex = find_pcpusec(hdr, sechdrs, secstrings);
1577#ifdef ARCH_UNWIND_SECTION_NAME
1578 unwindex = find_sec(hdr, sechdrs, secstrings, ARCH_UNWIND_SECTION_NAME);
1579#endif
1505 1580
1506 /* Don't keep modinfo section */ 1581 /* Don't keep modinfo section */
1507 sechdrs[infoindex].sh_flags &= ~(unsigned long)SHF_ALLOC; 1582 sechdrs[infoindex].sh_flags &= ~(unsigned long)SHF_ALLOC;
@@ -1510,6 +1585,8 @@ static struct module *load_module(void __user *umod,
1510 sechdrs[symindex].sh_flags |= SHF_ALLOC; 1585 sechdrs[symindex].sh_flags |= SHF_ALLOC;
1511 sechdrs[strindex].sh_flags |= SHF_ALLOC; 1586 sechdrs[strindex].sh_flags |= SHF_ALLOC;
1512#endif 1587#endif
1588 if (unwindex)
1589 sechdrs[unwindex].sh_flags |= SHF_ALLOC;
1513 1590
1514 /* Check module struct version now, before we try to use module. */ 1591 /* Check module struct version now, before we try to use module. */
1515 if (!check_modstruct_version(sechdrs, versindex, mod)) { 1592 if (!check_modstruct_version(sechdrs, versindex, mod)) {
@@ -1639,14 +1716,27 @@ static struct module *load_module(void __user *umod,
1639 mod->gpl_crcs = (void *)sechdrs[gplcrcindex].sh_addr; 1716 mod->gpl_crcs = (void *)sechdrs[gplcrcindex].sh_addr;
1640 mod->num_gpl_future_syms = sechdrs[gplfutureindex].sh_size / 1717 mod->num_gpl_future_syms = sechdrs[gplfutureindex].sh_size /
1641 sizeof(*mod->gpl_future_syms); 1718 sizeof(*mod->gpl_future_syms);
1719 mod->num_unused_syms = sechdrs[unusedindex].sh_size /
1720 sizeof(*mod->unused_syms);
1721 mod->num_unused_gpl_syms = sechdrs[unusedgplindex].sh_size /
1722 sizeof(*mod->unused_gpl_syms);
1642 mod->gpl_future_syms = (void *)sechdrs[gplfutureindex].sh_addr; 1723 mod->gpl_future_syms = (void *)sechdrs[gplfutureindex].sh_addr;
1643 if (gplfuturecrcindex) 1724 if (gplfuturecrcindex)
1644 mod->gpl_future_crcs = (void *)sechdrs[gplfuturecrcindex].sh_addr; 1725 mod->gpl_future_crcs = (void *)sechdrs[gplfuturecrcindex].sh_addr;
1645 1726
1727 mod->unused_syms = (void *)sechdrs[unusedindex].sh_addr;
1728 if (unusedcrcindex)
1729 mod->unused_crcs = (void *)sechdrs[unusedcrcindex].sh_addr;
1730 mod->unused_gpl_syms = (void *)sechdrs[unusedgplindex].sh_addr;
1731 if (unusedgplcrcindex)
1732 mod->unused_gpl_crcs = (void *)sechdrs[unusedgplcrcindex].sh_addr;
1733
1646#ifdef CONFIG_MODVERSIONS 1734#ifdef CONFIG_MODVERSIONS
1647 if ((mod->num_syms && !crcindex) || 1735 if ((mod->num_syms && !crcindex) ||
1648 (mod->num_gpl_syms && !gplcrcindex) || 1736 (mod->num_gpl_syms && !gplcrcindex) ||
1649 (mod->num_gpl_future_syms && !gplfuturecrcindex)) { 1737 (mod->num_gpl_future_syms && !gplfuturecrcindex) ||
1738 (mod->num_unused_syms && !unusedcrcindex) ||
1739 (mod->num_unused_gpl_syms && !unusedgplcrcindex)) {
1650 printk(KERN_WARNING "%s: No versions for exported symbols." 1740 printk(KERN_WARNING "%s: No versions for exported symbols."
1651 " Tainting kernel.\n", mod->name); 1741 " Tainting kernel.\n", mod->name);
1652 add_taint(TAINT_FORCED_MODULE); 1742 add_taint(TAINT_FORCED_MODULE);
@@ -1738,6 +1828,11 @@ static struct module *load_module(void __user *umod,
1738 goto arch_cleanup; 1828 goto arch_cleanup;
1739 add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs); 1829 add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs);
1740 1830
1831 /* Size of section 0 is 0, so this works well if no unwind info. */
1832 mod->unwind_info = unwind_add_table(mod,
1833 (void *)sechdrs[unwindex].sh_addr,
1834 sechdrs[unwindex].sh_size);
1835
1741 /* Get rid of temporary copy */ 1836 /* Get rid of temporary copy */
1742 vfree(hdr); 1837 vfree(hdr);
1743 1838
@@ -1836,6 +1931,7 @@ sys_init_module(void __user *umod,
1836 mod->state = MODULE_STATE_LIVE; 1931 mod->state = MODULE_STATE_LIVE;
1837 /* Drop initial reference. */ 1932 /* Drop initial reference. */
1838 module_put(mod); 1933 module_put(mod);
1934 unwind_remove_table(mod->unwind_info, 1);
1839 module_free(mod, mod->module_init); 1935 module_free(mod, mod->module_init);
1840 mod->module_init = NULL; 1936 mod->module_init = NULL;
1841 mod->init_size = 0; 1937 mod->init_size = 0;
@@ -1923,10 +2019,8 @@ const char *module_address_lookup(unsigned long addr,
1923 return NULL; 2019 return NULL;
1924} 2020}
1925 2021
1926struct module *module_get_kallsym(unsigned int symnum, 2022struct module *module_get_kallsym(unsigned int symnum, unsigned long *value,
1927 unsigned long *value, 2023 char *type, char *name, size_t namelen)
1928 char *type,
1929 char namebuf[128])
1930{ 2024{
1931 struct module *mod; 2025 struct module *mod;
1932 2026
@@ -1935,9 +2029,8 @@ struct module *module_get_kallsym(unsigned int symnum,
1935 if (symnum < mod->num_symtab) { 2029 if (symnum < mod->num_symtab) {
1936 *value = mod->symtab[symnum].st_value; 2030 *value = mod->symtab[symnum].st_value;
1937 *type = mod->symtab[symnum].st_info; 2031 *type = mod->symtab[symnum].st_info;
1938 strncpy(namebuf, 2032 strlcpy(name, mod->strtab + mod->symtab[symnum].st_name,
1939 mod->strtab + mod->symtab[symnum].st_name, 2033 namelen);
1940 127);
1941 mutex_unlock(&module_mutex); 2034 mutex_unlock(&module_mutex);
1942 return mod; 2035 return mod;
1943 } 2036 }
@@ -2066,6 +2159,29 @@ const struct exception_table_entry *search_module_extables(unsigned long addr)
2066 return e; 2159 return e;
2067} 2160}
2068 2161
2162/*
2163 * Is this a valid module address?
2164 */
2165int is_module_address(unsigned long addr)
2166{
2167 unsigned long flags;
2168 struct module *mod;
2169
2170 spin_lock_irqsave(&modlist_lock, flags);
2171
2172 list_for_each_entry(mod, &modules, list) {
2173 if (within(addr, mod->module_core, mod->core_size)) {
2174 spin_unlock_irqrestore(&modlist_lock, flags);
2175 return 1;
2176 }
2177 }
2178
2179 spin_unlock_irqrestore(&modlist_lock, flags);
2180
2181 return 0;
2182}
2183
2184
2069/* Is this a valid kernel address? We don't grab the lock: we are oopsing. */ 2185/* Is this a valid kernel address? We don't grab the lock: we are oopsing. */
2070struct module *__module_text_address(unsigned long addr) 2186struct module *__module_text_address(unsigned long addr)
2071{ 2187{
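
is_module_address() above gives other subsystems a way to ask, under modlist_lock, whether an address falls inside any loaded module's core region. A hedged usage sketch, not taken from this diff: a lock validator deciding whether an object may serve as a static lookup key could combine the kernel image bounds with the new helper (the _stext/_end linker symbols and the static_obj() name are assumptions here).

extern char _stext[], _end[];	/* linker-script symbols, assumed available */

static int static_obj(void *obj)
{
	unsigned long addr = (unsigned long)obj;

	/* inside the statically linked kernel image? */
	if (addr >= (unsigned long)_stext && addr < (unsigned long)_end)
		return 1;

	/* otherwise accept it only if it sits inside a loaded module */
	return is_module_address(addr);
}
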
diff --git a/kernel/mutex-debug.c b/kernel/mutex-debug.c
index f4913c376950..e3203c654dda 100644
--- a/kernel/mutex-debug.c
+++ b/kernel/mutex-debug.c
@@ -16,395 +16,48 @@
16#include <linux/sched.h> 16#include <linux/sched.h>
17#include <linux/delay.h> 17#include <linux/delay.h>
18#include <linux/module.h> 18#include <linux/module.h>
19#include <linux/poison.h>
19#include <linux/spinlock.h> 20#include <linux/spinlock.h>
20#include <linux/kallsyms.h> 21#include <linux/kallsyms.h>
21#include <linux/interrupt.h> 22#include <linux/interrupt.h>
23#include <linux/debug_locks.h>
22 24
23#include "mutex-debug.h" 25#include "mutex-debug.h"
24 26
25/* 27/*
26 * We need a global lock when we walk through the multi-process
27 * lock tree. Only used in the deadlock-debugging case.
28 */
29DEFINE_SPINLOCK(debug_mutex_lock);
30
31/*
32 * All locks held by all tasks, in a single global list:
33 */
34LIST_HEAD(debug_mutex_held_locks);
35
36/*
37 * In the debug case we carry the caller's instruction pointer into
38 * other functions, but we dont want the function argument overhead
39 * in the nondebug case - hence these macros:
40 */
41#define __IP_DECL__ , unsigned long ip
42#define __IP__ , ip
43#define __RET_IP__ , (unsigned long)__builtin_return_address(0)
44
45/*
46 * "mutex debugging enabled" flag. We turn it off when we detect
47 * the first problem because we dont want to recurse back
48 * into the tracing code when doing error printk or
49 * executing a BUG():
50 */
51int debug_mutex_on = 1;
52
53static void printk_task(struct task_struct *p)
54{
55 if (p)
56 printk("%16s:%5d [%p, %3d]", p->comm, p->pid, p, p->prio);
57 else
58 printk("<none>");
59}
60
61static void printk_ti(struct thread_info *ti)
62{
63 if (ti)
64 printk_task(ti->task);
65 else
66 printk("<none>");
67}
68
69static void printk_task_short(struct task_struct *p)
70{
71 if (p)
72 printk("%s/%d [%p, %3d]", p->comm, p->pid, p, p->prio);
73 else
74 printk("<none>");
75}
76
77static void printk_lock(struct mutex *lock, int print_owner)
78{
79 printk(" [%p] {%s}\n", lock, lock->name);
80
81 if (print_owner && lock->owner) {
82 printk(".. held by: ");
83 printk_ti(lock->owner);
84 printk("\n");
85 }
86 if (lock->owner) {
87 printk("... acquired at: ");
88 print_symbol("%s\n", lock->acquire_ip);
89 }
90}
91
92/*
93 * printk locks held by a task:
94 */
95static void show_task_locks(struct task_struct *p)
96{
97 switch (p->state) {
98 case TASK_RUNNING: printk("R"); break;
99 case TASK_INTERRUPTIBLE: printk("S"); break;
100 case TASK_UNINTERRUPTIBLE: printk("D"); break;
101 case TASK_STOPPED: printk("T"); break;
102 case EXIT_ZOMBIE: printk("Z"); break;
103 case EXIT_DEAD: printk("X"); break;
104 default: printk("?"); break;
105 }
106 printk_task(p);
107 if (p->blocked_on) {
108 struct mutex *lock = p->blocked_on->lock;
109
110 printk(" blocked on mutex:");
111 printk_lock(lock, 1);
112 } else
113 printk(" (not blocked on mutex)\n");
114}
115
116/*
117 * printk all locks held in the system (if filter == NULL),
118 * or all locks belonging to a single task (if filter != NULL):
119 */
120void show_held_locks(struct task_struct *filter)
121{
122 struct list_head *curr, *cursor = NULL;
123 struct mutex *lock;
124 struct thread_info *t;
125 unsigned long flags;
126 int count = 0;
127
128 if (filter) {
129 printk("------------------------------\n");
130 printk("| showing all locks held by: | (");
131 printk_task_short(filter);
132 printk("):\n");
133 printk("------------------------------\n");
134 } else {
135 printk("---------------------------\n");
136 printk("| showing all locks held: |\n");
137 printk("---------------------------\n");
138 }
139
140 /*
141 * Play safe and acquire the global trace lock. We
142 * cannot printk with that lock held so we iterate
143 * very carefully:
144 */
145next:
146 debug_spin_lock_save(&debug_mutex_lock, flags);
147 list_for_each(curr, &debug_mutex_held_locks) {
148 if (cursor && curr != cursor)
149 continue;
150 lock = list_entry(curr, struct mutex, held_list);
151 t = lock->owner;
152 if (filter && (t != filter->thread_info))
153 continue;
154 count++;
155 cursor = curr->next;
156 debug_spin_lock_restore(&debug_mutex_lock, flags);
157
158 printk("\n#%03d: ", count);
159 printk_lock(lock, filter ? 0 : 1);
160 goto next;
161 }
162 debug_spin_lock_restore(&debug_mutex_lock, flags);
163 printk("\n");
164}
165
166void mutex_debug_show_all_locks(void)
167{
168 struct task_struct *g, *p;
169 int count = 10;
170 int unlock = 1;
171
172 printk("\nShowing all blocking locks in the system:\n");
173
174 /*
175 * Here we try to get the tasklist_lock as hard as possible,
176 * if not successful after 2 seconds we ignore it (but keep
177 * trying). This is to enable a debug printout even if a
178 * tasklist_lock-holding task deadlocks or crashes.
179 */
180retry:
181 if (!read_trylock(&tasklist_lock)) {
182 if (count == 10)
183 printk("hm, tasklist_lock locked, retrying... ");
184 if (count) {
185 count--;
186 printk(" #%d", 10-count);
187 mdelay(200);
188 goto retry;
189 }
190 printk(" ignoring it.\n");
191 unlock = 0;
192 }
193 if (count != 10)
194 printk(" locked it.\n");
195
196 do_each_thread(g, p) {
197 show_task_locks(p);
198 if (!unlock)
199 if (read_trylock(&tasklist_lock))
200 unlock = 1;
201 } while_each_thread(g, p);
202
203 printk("\n");
204 show_held_locks(NULL);
205 printk("=============================================\n\n");
206
207 if (unlock)
208 read_unlock(&tasklist_lock);
209}
210
211static void report_deadlock(struct task_struct *task, struct mutex *lock,
212 struct mutex *lockblk, unsigned long ip)
213{
214 printk("\n%s/%d is trying to acquire this lock:\n",
215 current->comm, current->pid);
216 printk_lock(lock, 1);
217 printk("... trying at: ");
218 print_symbol("%s\n", ip);
219 show_held_locks(current);
220
221 if (lockblk) {
222 printk("but %s/%d is deadlocking current task %s/%d!\n\n",
223 task->comm, task->pid, current->comm, current->pid);
224 printk("\n%s/%d is blocked on this lock:\n",
225 task->comm, task->pid);
226 printk_lock(lockblk, 1);
227
228 show_held_locks(task);
229
230 printk("\n%s/%d's [blocked] stackdump:\n\n",
231 task->comm, task->pid);
232 show_stack(task, NULL);
233 }
234
235 printk("\n%s/%d's [current] stackdump:\n\n",
236 current->comm, current->pid);
237 dump_stack();
238 mutex_debug_show_all_locks();
239 printk("[ turning off deadlock detection. Please report this. ]\n\n");
240 local_irq_disable();
241}
242
243/*
244 * Recursively check for mutex deadlocks:
245 */
246static int check_deadlock(struct mutex *lock, int depth,
247 struct thread_info *ti, unsigned long ip)
248{
249 struct mutex *lockblk;
250 struct task_struct *task;
251
252 if (!debug_mutex_on)
253 return 0;
254
255 ti = lock->owner;
256 if (!ti)
257 return 0;
258
259 task = ti->task;
260 lockblk = NULL;
261 if (task->blocked_on)
262 lockblk = task->blocked_on->lock;
263
264 /* Self-deadlock: */
265 if (current == task) {
266 DEBUG_OFF();
267 if (depth)
268 return 1;
269 printk("\n==========================================\n");
270 printk( "[ BUG: lock recursion deadlock detected! |\n");
271 printk( "------------------------------------------\n");
272 report_deadlock(task, lock, NULL, ip);
273 return 0;
274 }
275
276 /* Ugh, something corrupted the lock data structure? */
277 if (depth > 20) {
278 DEBUG_OFF();
279 printk("\n===========================================\n");
280 printk( "[ BUG: infinite lock dependency detected!? |\n");
281 printk( "-------------------------------------------\n");
282 report_deadlock(task, lock, lockblk, ip);
283 return 0;
284 }
285
286 /* Recursively check for dependencies: */
287 if (lockblk && check_deadlock(lockblk, depth+1, ti, ip)) {
288 printk("\n============================================\n");
289 printk( "[ BUG: circular locking deadlock detected! ]\n");
290 printk( "--------------------------------------------\n");
291 report_deadlock(task, lock, lockblk, ip);
292 return 0;
293 }
294 return 0;
295}
296
297/*
298 * Called when a task exits, this function checks whether the
299 * task is holding any locks, and reports the first one if so:
300 */
301void mutex_debug_check_no_locks_held(struct task_struct *task)
302{
303 struct list_head *curr, *next;
304 struct thread_info *t;
305 unsigned long flags;
306 struct mutex *lock;
307
308 if (!debug_mutex_on)
309 return;
310
311 debug_spin_lock_save(&debug_mutex_lock, flags);
312 list_for_each_safe(curr, next, &debug_mutex_held_locks) {
313 lock = list_entry(curr, struct mutex, held_list);
314 t = lock->owner;
315 if (t != task->thread_info)
316 continue;
317 list_del_init(curr);
318 DEBUG_OFF();
319 debug_spin_lock_restore(&debug_mutex_lock, flags);
320
321 printk("BUG: %s/%d, lock held at task exit time!\n",
322 task->comm, task->pid);
323 printk_lock(lock, 1);
324 if (lock->owner != task->thread_info)
325 printk("exiting task is not even the owner??\n");
326 return;
327 }
328 debug_spin_lock_restore(&debug_mutex_lock, flags);
329}
330
331/*
332 * Called when kernel memory is freed (or unmapped), or if a mutex
333 * is destroyed or reinitialized - this code checks whether there is
334 * any held lock in the memory range of <from> to <to>:
335 */
336void mutex_debug_check_no_locks_freed(const void *from, unsigned long len)
337{
338 struct list_head *curr, *next;
339 const void *to = from + len;
340 unsigned long flags;
341 struct mutex *lock;
342 void *lock_addr;
343
344 if (!debug_mutex_on)
345 return;
346
347 debug_spin_lock_save(&debug_mutex_lock, flags);
348 list_for_each_safe(curr, next, &debug_mutex_held_locks) {
349 lock = list_entry(curr, struct mutex, held_list);
350 lock_addr = lock;
351 if (lock_addr < from || lock_addr >= to)
352 continue;
353 list_del_init(curr);
354 DEBUG_OFF();
355 debug_spin_lock_restore(&debug_mutex_lock, flags);
356
357 printk("BUG: %s/%d, active lock [%p(%p-%p)] freed!\n",
358 current->comm, current->pid, lock, from, to);
359 dump_stack();
360 printk_lock(lock, 1);
361 if (lock->owner != current_thread_info())
362 printk("freeing task is not even the owner??\n");
363 return;
364 }
365 debug_spin_lock_restore(&debug_mutex_lock, flags);
366}
367
368/*
369 * Must be called with lock->wait_lock held. 28 * Must be called with lock->wait_lock held.
370 */ 29 */
371void debug_mutex_set_owner(struct mutex *lock, 30void debug_mutex_set_owner(struct mutex *lock, struct thread_info *new_owner)
372 struct thread_info *new_owner __IP_DECL__)
373{ 31{
374 lock->owner = new_owner; 32 lock->owner = new_owner;
375 DEBUG_WARN_ON(!list_empty(&lock->held_list));
376 if (debug_mutex_on) {
377 list_add_tail(&lock->held_list, &debug_mutex_held_locks);
378 lock->acquire_ip = ip;
379 }
380} 33}
381 34
382void debug_mutex_init_waiter(struct mutex_waiter *waiter) 35void debug_mutex_lock_common(struct mutex *lock, struct mutex_waiter *waiter)
383{ 36{
384 memset(waiter, 0x11, sizeof(*waiter)); 37 memset(waiter, MUTEX_DEBUG_INIT, sizeof(*waiter));
385 waiter->magic = waiter; 38 waiter->magic = waiter;
386 INIT_LIST_HEAD(&waiter->list); 39 INIT_LIST_HEAD(&waiter->list);
387} 40}
388 41
389void debug_mutex_wake_waiter(struct mutex *lock, struct mutex_waiter *waiter) 42void debug_mutex_wake_waiter(struct mutex *lock, struct mutex_waiter *waiter)
390{ 43{
391 SMP_DEBUG_WARN_ON(!spin_is_locked(&lock->wait_lock)); 44 SMP_DEBUG_LOCKS_WARN_ON(!spin_is_locked(&lock->wait_lock));
392 DEBUG_WARN_ON(list_empty(&lock->wait_list)); 45 DEBUG_LOCKS_WARN_ON(list_empty(&lock->wait_list));
393 DEBUG_WARN_ON(waiter->magic != waiter); 46 DEBUG_LOCKS_WARN_ON(waiter->magic != waiter);
394 DEBUG_WARN_ON(list_empty(&waiter->list)); 47 DEBUG_LOCKS_WARN_ON(list_empty(&waiter->list));
395} 48}
396 49
397void debug_mutex_free_waiter(struct mutex_waiter *waiter) 50void debug_mutex_free_waiter(struct mutex_waiter *waiter)
398{ 51{
399 DEBUG_WARN_ON(!list_empty(&waiter->list)); 52 DEBUG_LOCKS_WARN_ON(!list_empty(&waiter->list));
400 memset(waiter, 0x22, sizeof(*waiter)); 53 memset(waiter, MUTEX_DEBUG_FREE, sizeof(*waiter));
401} 54}
402 55
403void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter, 56void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter,
404 struct thread_info *ti __IP_DECL__) 57 struct thread_info *ti)
405{ 58{
406 SMP_DEBUG_WARN_ON(!spin_is_locked(&lock->wait_lock)); 59 SMP_DEBUG_LOCKS_WARN_ON(!spin_is_locked(&lock->wait_lock));
407 check_deadlock(lock, 0, ti, ip); 60
408 /* Mark the current thread as blocked on the lock: */ 61 /* Mark the current thread as blocked on the lock: */
409 ti->task->blocked_on = waiter; 62 ti->task->blocked_on = waiter;
410 waiter->lock = lock; 63 waiter->lock = lock;
@@ -413,9 +66,9 @@ void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter,
413void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, 66void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter,
414 struct thread_info *ti) 67 struct thread_info *ti)
415{ 68{
416 DEBUG_WARN_ON(list_empty(&waiter->list)); 69 DEBUG_LOCKS_WARN_ON(list_empty(&waiter->list));
417 DEBUG_WARN_ON(waiter->task != ti->task); 70 DEBUG_LOCKS_WARN_ON(waiter->task != ti->task);
418 DEBUG_WARN_ON(ti->task->blocked_on != waiter); 71 DEBUG_LOCKS_WARN_ON(ti->task->blocked_on != waiter);
419 ti->task->blocked_on = NULL; 72 ti->task->blocked_on = NULL;
420 73
421 list_del_init(&waiter->list); 74 list_del_init(&waiter->list);
@@ -424,24 +77,23 @@ void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter,
424 77
425void debug_mutex_unlock(struct mutex *lock) 78void debug_mutex_unlock(struct mutex *lock)
426{ 79{
427 DEBUG_WARN_ON(lock->magic != lock); 80 DEBUG_LOCKS_WARN_ON(lock->owner != current_thread_info());
428 DEBUG_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next); 81 DEBUG_LOCKS_WARN_ON(lock->magic != lock);
429 DEBUG_WARN_ON(lock->owner != current_thread_info()); 82 DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next);
430 if (debug_mutex_on) { 83 DEBUG_LOCKS_WARN_ON(lock->owner != current_thread_info());
431 DEBUG_WARN_ON(list_empty(&lock->held_list));
432 list_del_init(&lock->held_list);
433 }
434} 84}
435 85
436void debug_mutex_init(struct mutex *lock, const char *name) 86void debug_mutex_init(struct mutex *lock, const char *name,
87 struct lock_class_key *key)
437{ 88{
89#ifdef CONFIG_DEBUG_LOCK_ALLOC
438 /* 90 /*
439 * Make sure we are not reinitializing a held lock: 91 * Make sure we are not reinitializing a held lock:
440 */ 92 */
441 mutex_debug_check_no_locks_freed((void *)lock, sizeof(*lock)); 93 debug_check_no_locks_freed((void *)lock, sizeof(*lock));
94 lockdep_init_map(&lock->dep_map, name, key);
95#endif
442 lock->owner = NULL; 96 lock->owner = NULL;
443 INIT_LIST_HEAD(&lock->held_list);
444 lock->name = name;
445 lock->magic = lock; 97 lock->magic = lock;
446} 98}
447 99
@@ -455,7 +107,7 @@ void debug_mutex_init(struct mutex *lock, const char *name)
455 */ 107 */
456void fastcall mutex_destroy(struct mutex *lock) 108void fastcall mutex_destroy(struct mutex *lock)
457{ 109{
458 DEBUG_WARN_ON(mutex_is_locked(lock)); 110 DEBUG_LOCKS_WARN_ON(mutex_is_locked(lock));
459 lock->magic = NULL; 111 lock->magic = NULL;
460} 112}
461 113
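
debug_mutex_init() now receives a struct lock_class_key and feeds it to lockdep_init_map(), so each mutex initialization site can get its own lock class. The key is presumably supplied by the mutex_init() wrapper in include/linux/mutex.h, which is not part of this diff; a sketch of how such a wrapper would look:

#define mutex_init(mutex)						\
do {									\
	/* one static key per call site -> one lock class per site */	\
	static struct lock_class_key __key;				\
									\
	__mutex_init((mutex), #mutex, &__key);				\
} while (0)
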
diff --git a/kernel/mutex-debug.h b/kernel/mutex-debug.h
index fd384050acb1..babfbdfc534b 100644
--- a/kernel/mutex-debug.h
+++ b/kernel/mutex-debug.h
@@ -10,125 +10,44 @@
10 * More details are in kernel/mutex-debug.c. 10 * More details are in kernel/mutex-debug.c.
11 */ 11 */
12 12
13extern spinlock_t debug_mutex_lock;
14extern struct list_head debug_mutex_held_locks;
15extern int debug_mutex_on;
16
17/*
18 * In the debug case we carry the caller's instruction pointer into
19 * other functions, but we dont want the function argument overhead
20 * in the nondebug case - hence these macros:
21 */
22#define __IP_DECL__ , unsigned long ip
23#define __IP__ , ip
24#define __RET_IP__ , (unsigned long)__builtin_return_address(0)
25
26/* 13/*
27 * This must be called with lock->wait_lock held. 14 * This must be called with lock->wait_lock held.
28 */ 15 */
29extern void debug_mutex_set_owner(struct mutex *lock, 16extern void
30 struct thread_info *new_owner __IP_DECL__); 17debug_mutex_set_owner(struct mutex *lock, struct thread_info *new_owner);
31 18
32static inline void debug_mutex_clear_owner(struct mutex *lock) 19static inline void debug_mutex_clear_owner(struct mutex *lock)
33{ 20{
34 lock->owner = NULL; 21 lock->owner = NULL;
35} 22}
36 23
37extern void debug_mutex_init_waiter(struct mutex_waiter *waiter); 24extern void debug_mutex_lock_common(struct mutex *lock,
25 struct mutex_waiter *waiter);
38extern void debug_mutex_wake_waiter(struct mutex *lock, 26extern void debug_mutex_wake_waiter(struct mutex *lock,
39 struct mutex_waiter *waiter); 27 struct mutex_waiter *waiter);
40extern void debug_mutex_free_waiter(struct mutex_waiter *waiter); 28extern void debug_mutex_free_waiter(struct mutex_waiter *waiter);
41extern void debug_mutex_add_waiter(struct mutex *lock, 29extern void debug_mutex_add_waiter(struct mutex *lock,
42 struct mutex_waiter *waiter, 30 struct mutex_waiter *waiter,
43 struct thread_info *ti __IP_DECL__); 31 struct thread_info *ti);
44extern void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, 32extern void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter,
45 struct thread_info *ti); 33 struct thread_info *ti);
46extern void debug_mutex_unlock(struct mutex *lock); 34extern void debug_mutex_unlock(struct mutex *lock);
47extern void debug_mutex_init(struct mutex *lock, const char *name); 35extern void debug_mutex_init(struct mutex *lock, const char *name,
48 36 struct lock_class_key *key);
49#define debug_spin_lock(lock) \
50 do { \
51 local_irq_disable(); \
52 if (debug_mutex_on) \
53 spin_lock(lock); \
54 } while (0)
55 37
56#define debug_spin_unlock(lock) \ 38#define spin_lock_mutex(lock, flags) \
57 do { \
58 if (debug_mutex_on) \
59 spin_unlock(lock); \
60 local_irq_enable(); \
61 preempt_check_resched(); \
62 } while (0)
63
64#define debug_spin_lock_save(lock, flags) \
65 do { \ 39 do { \
40 struct mutex *l = container_of(lock, struct mutex, wait_lock); \
41 \
42 DEBUG_LOCKS_WARN_ON(in_interrupt()); \
66 local_irq_save(flags); \ 43 local_irq_save(flags); \
67 if (debug_mutex_on) \ 44 __raw_spin_lock(&(lock)->raw_lock); \
68 spin_lock(lock); \ 45 DEBUG_LOCKS_WARN_ON(l->magic != l); \
69 } while (0) 46 } while (0)
70 47
71#define debug_spin_lock_restore(lock, flags) \ 48#define spin_unlock_mutex(lock, flags) \
72 do { \ 49 do { \
73 if (debug_mutex_on) \ 50 __raw_spin_unlock(&(lock)->raw_lock); \
74 spin_unlock(lock); \
75 local_irq_restore(flags); \ 51 local_irq_restore(flags); \
76 preempt_check_resched(); \ 52 preempt_check_resched(); \
77 } while (0) 53 } while (0)
78
79#define spin_lock_mutex(lock) \
80 do { \
81 struct mutex *l = container_of(lock, struct mutex, wait_lock); \
82 \
83 DEBUG_WARN_ON(in_interrupt()); \
84 debug_spin_lock(&debug_mutex_lock); \
85 spin_lock(lock); \
86 DEBUG_WARN_ON(l->magic != l); \
87 } while (0)
88
89#define spin_unlock_mutex(lock) \
90 do { \
91 spin_unlock(lock); \
92 debug_spin_unlock(&debug_mutex_lock); \
93 } while (0)
94
95#define DEBUG_OFF() \
96do { \
97 if (debug_mutex_on) { \
98 debug_mutex_on = 0; \
99 console_verbose(); \
100 if (spin_is_locked(&debug_mutex_lock)) \
101 spin_unlock(&debug_mutex_lock); \
102 } \
103} while (0)
104
105#define DEBUG_BUG() \
106do { \
107 if (debug_mutex_on) { \
108 DEBUG_OFF(); \
109 BUG(); \
110 } \
111} while (0)
112
113#define DEBUG_WARN_ON(c) \
114do { \
115 if (unlikely(c && debug_mutex_on)) { \
116 DEBUG_OFF(); \
117 WARN_ON(1); \
118 } \
119} while (0)
120
121# define DEBUG_BUG_ON(c) \
122do { \
123 if (unlikely(c)) \
124 DEBUG_BUG(); \
125} while (0)
126
127#ifdef CONFIG_SMP
128# define SMP_DEBUG_WARN_ON(c) DEBUG_WARN_ON(c)
129# define SMP_DEBUG_BUG_ON(c) DEBUG_BUG_ON(c)
130#else
131# define SMP_DEBUG_WARN_ON(c) do { } while (0)
132# define SMP_DEBUG_BUG_ON(c) do { } while (0)
133#endif
134
diff --git a/kernel/mutex.c b/kernel/mutex.c
index 5449b210d9ed..8c71cf72a497 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -17,6 +17,7 @@
17#include <linux/module.h> 17#include <linux/module.h>
18#include <linux/spinlock.h> 18#include <linux/spinlock.h>
19#include <linux/interrupt.h> 19#include <linux/interrupt.h>
20#include <linux/debug_locks.h>
20 21
21/* 22/*
22 * In the DEBUG case we are using the "NULL fastpath" for mutexes, 23 * In the DEBUG case we are using the "NULL fastpath" for mutexes,
@@ -38,13 +39,14 @@
38 * 39 *
39 * It is not allowed to initialize an already locked mutex. 40 * It is not allowed to initialize an already locked mutex.
40 */ 41 */
41void fastcall __mutex_init(struct mutex *lock, const char *name) 42void
43__mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key)
42{ 44{
43 atomic_set(&lock->count, 1); 45 atomic_set(&lock->count, 1);
44 spin_lock_init(&lock->wait_lock); 46 spin_lock_init(&lock->wait_lock);
45 INIT_LIST_HEAD(&lock->wait_list); 47 INIT_LIST_HEAD(&lock->wait_list);
46 48
47 debug_mutex_init(lock, name); 49 debug_mutex_init(lock, name, key);
48} 50}
49 51
50EXPORT_SYMBOL(__mutex_init); 52EXPORT_SYMBOL(__mutex_init);
@@ -56,7 +58,7 @@ EXPORT_SYMBOL(__mutex_init);
56 * branch is predicted by the CPU as default-untaken. 58 * branch is predicted by the CPU as default-untaken.
57 */ 59 */
58static void fastcall noinline __sched 60static void fastcall noinline __sched
59__mutex_lock_slowpath(atomic_t *lock_count __IP_DECL__); 61__mutex_lock_slowpath(atomic_t *lock_count);
60 62
61/*** 63/***
62 * mutex_lock - acquire the mutex 64 * mutex_lock - acquire the mutex
@@ -79,7 +81,7 @@ __mutex_lock_slowpath(atomic_t *lock_count __IP_DECL__);
79 * 81 *
80 * This function is similar to (but not equivalent to) down(). 82 * This function is similar to (but not equivalent to) down().
81 */ 83 */
82void fastcall __sched mutex_lock(struct mutex *lock) 84void inline fastcall __sched mutex_lock(struct mutex *lock)
83{ 85{
84 might_sleep(); 86 might_sleep();
85 /* 87 /*
@@ -92,7 +94,7 @@ void fastcall __sched mutex_lock(struct mutex *lock)
92EXPORT_SYMBOL(mutex_lock); 94EXPORT_SYMBOL(mutex_lock);
93 95
94static void fastcall noinline __sched 96static void fastcall noinline __sched
95__mutex_unlock_slowpath(atomic_t *lock_count __IP_DECL__); 97__mutex_unlock_slowpath(atomic_t *lock_count);
96 98
97/*** 99/***
98 * mutex_unlock - release the mutex 100 * mutex_unlock - release the mutex
@@ -120,17 +122,18 @@ EXPORT_SYMBOL(mutex_unlock);
120 * Lock a mutex (possibly interruptible), slowpath: 122 * Lock a mutex (possibly interruptible), slowpath:
121 */ 123 */
122static inline int __sched 124static inline int __sched
123__mutex_lock_common(struct mutex *lock, long state __IP_DECL__) 125__mutex_lock_common(struct mutex *lock, long state, unsigned int subclass)
124{ 126{
125 struct task_struct *task = current; 127 struct task_struct *task = current;
126 struct mutex_waiter waiter; 128 struct mutex_waiter waiter;
127 unsigned int old_val; 129 unsigned int old_val;
130 unsigned long flags;
128 131
129 debug_mutex_init_waiter(&waiter); 132 spin_lock_mutex(&lock->wait_lock, flags);
130 133
131 spin_lock_mutex(&lock->wait_lock); 134 debug_mutex_lock_common(lock, &waiter);
132 135 mutex_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
133 debug_mutex_add_waiter(lock, &waiter, task->thread_info, ip); 136 debug_mutex_add_waiter(lock, &waiter, task->thread_info);
134 137
135 /* add waiting tasks to the end of the waitqueue (FIFO): */ 138 /* add waiting tasks to the end of the waitqueue (FIFO): */
136 list_add_tail(&waiter.list, &lock->wait_list); 139 list_add_tail(&waiter.list, &lock->wait_list);
@@ -157,7 +160,8 @@ __mutex_lock_common(struct mutex *lock, long state __IP_DECL__)
157 if (unlikely(state == TASK_INTERRUPTIBLE && 160 if (unlikely(state == TASK_INTERRUPTIBLE &&
158 signal_pending(task))) { 161 signal_pending(task))) {
159 mutex_remove_waiter(lock, &waiter, task->thread_info); 162 mutex_remove_waiter(lock, &waiter, task->thread_info);
160 spin_unlock_mutex(&lock->wait_lock); 163 mutex_release(&lock->dep_map, 1, _RET_IP_);
164 spin_unlock_mutex(&lock->wait_lock, flags);
161 165
162 debug_mutex_free_waiter(&waiter); 166 debug_mutex_free_waiter(&waiter);
163 return -EINTR; 167 return -EINTR;
@@ -165,48 +169,57 @@ __mutex_lock_common(struct mutex *lock, long state __IP_DECL__)
165 __set_task_state(task, state); 169 __set_task_state(task, state);
166 170
167 /* didn't get the lock, go to sleep: */ 171 /* didn't get the lock, go to sleep: */
168 spin_unlock_mutex(&lock->wait_lock); 172 spin_unlock_mutex(&lock->wait_lock, flags);
169 schedule(); 173 schedule();
170 spin_lock_mutex(&lock->wait_lock); 174 spin_lock_mutex(&lock->wait_lock, flags);
171 } 175 }
172 176
173 /* got the lock - rejoice! */ 177 /* got the lock - rejoice! */
174 mutex_remove_waiter(lock, &waiter, task->thread_info); 178 mutex_remove_waiter(lock, &waiter, task->thread_info);
175 debug_mutex_set_owner(lock, task->thread_info __IP__); 179 debug_mutex_set_owner(lock, task->thread_info);
176 180
177 /* set it to 0 if there are no waiters left: */ 181 /* set it to 0 if there are no waiters left: */
178 if (likely(list_empty(&lock->wait_list))) 182 if (likely(list_empty(&lock->wait_list)))
179 atomic_set(&lock->count, 0); 183 atomic_set(&lock->count, 0);
180 184
181 spin_unlock_mutex(&lock->wait_lock); 185 spin_unlock_mutex(&lock->wait_lock, flags);
182 186
183 debug_mutex_free_waiter(&waiter); 187 debug_mutex_free_waiter(&waiter);
184 188
185 DEBUG_WARN_ON(list_empty(&lock->held_list));
186 DEBUG_WARN_ON(lock->owner != task->thread_info);
187
188 return 0; 189 return 0;
189} 190}
190 191
191static void fastcall noinline __sched 192static void fastcall noinline __sched
192__mutex_lock_slowpath(atomic_t *lock_count __IP_DECL__) 193__mutex_lock_slowpath(atomic_t *lock_count)
193{ 194{
194 struct mutex *lock = container_of(lock_count, struct mutex, count); 195 struct mutex *lock = container_of(lock_count, struct mutex, count);
195 196
196 __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE __IP__); 197 __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0);
198}
199
200#ifdef CONFIG_DEBUG_LOCK_ALLOC
201void __sched
202mutex_lock_nested(struct mutex *lock, unsigned int subclass)
203{
204 might_sleep();
205 __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, subclass);
197} 206}
198 207
208EXPORT_SYMBOL_GPL(mutex_lock_nested);
209#endif
210
199/* 211/*
200 * Release the lock, slowpath: 212 * Release the lock, slowpath:
201 */ 213 */
202static fastcall noinline void 214static fastcall inline void
203__mutex_unlock_slowpath(atomic_t *lock_count __IP_DECL__) 215__mutex_unlock_common_slowpath(atomic_t *lock_count, int nested)
204{ 216{
205 struct mutex *lock = container_of(lock_count, struct mutex, count); 217 struct mutex *lock = container_of(lock_count, struct mutex, count);
218 unsigned long flags;
206 219
207 DEBUG_WARN_ON(lock->owner != current_thread_info()); 220 spin_lock_mutex(&lock->wait_lock, flags);
208 221 mutex_release(&lock->dep_map, nested, _RET_IP_);
209 spin_lock_mutex(&lock->wait_lock); 222 debug_mutex_unlock(lock);
210 223
211 /* 224 /*
212 * some architectures leave the lock unlocked in the fastpath failure 225 * some architectures leave the lock unlocked in the fastpath failure
@@ -216,8 +229,6 @@ __mutex_unlock_slowpath(atomic_t *lock_count __IP_DECL__)
216 if (__mutex_slowpath_needs_to_unlock()) 229 if (__mutex_slowpath_needs_to_unlock())
217 atomic_set(&lock->count, 1); 230 atomic_set(&lock->count, 1);
218 231
219 debug_mutex_unlock(lock);
220
221 if (!list_empty(&lock->wait_list)) { 232 if (!list_empty(&lock->wait_list)) {
222 /* get the first entry from the wait-list: */ 233 /* get the first entry from the wait-list: */
223 struct mutex_waiter *waiter = 234 struct mutex_waiter *waiter =
@@ -231,7 +242,16 @@ __mutex_unlock_slowpath(atomic_t *lock_count __IP_DECL__)
231 242
232 debug_mutex_clear_owner(lock); 243 debug_mutex_clear_owner(lock);
233 244
234 spin_unlock_mutex(&lock->wait_lock); 245 spin_unlock_mutex(&lock->wait_lock, flags);
246}
247
248/*
249 * Release the lock, slowpath:
250 */
251static fastcall noinline void
252__mutex_unlock_slowpath(atomic_t *lock_count)
253{
254 __mutex_unlock_common_slowpath(lock_count, 1);
235} 255}
236 256
237/* 257/*
@@ -239,7 +259,7 @@ __mutex_unlock_slowpath(atomic_t *lock_count __IP_DECL__)
239 * mutex_lock_interruptible() and mutex_trylock(). 259 * mutex_lock_interruptible() and mutex_trylock().
240 */ 260 */
241static int fastcall noinline __sched 261static int fastcall noinline __sched
242__mutex_lock_interruptible_slowpath(atomic_t *lock_count __IP_DECL__); 262__mutex_lock_interruptible_slowpath(atomic_t *lock_count);
243 263
244/*** 264/***
245 * mutex_lock_interruptible - acquire the mutex, interruptible 265 * mutex_lock_interruptible - acquire the mutex, interruptible
@@ -262,11 +282,11 @@ int fastcall __sched mutex_lock_interruptible(struct mutex *lock)
262EXPORT_SYMBOL(mutex_lock_interruptible); 282EXPORT_SYMBOL(mutex_lock_interruptible);
263 283
264static int fastcall noinline __sched 284static int fastcall noinline __sched
265__mutex_lock_interruptible_slowpath(atomic_t *lock_count __IP_DECL__) 285__mutex_lock_interruptible_slowpath(atomic_t *lock_count)
266{ 286{
267 struct mutex *lock = container_of(lock_count, struct mutex, count); 287 struct mutex *lock = container_of(lock_count, struct mutex, count);
268 288
269 return __mutex_lock_common(lock, TASK_INTERRUPTIBLE __IP__); 289 return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0);
270} 290}
271 291
272/* 292/*
@@ -276,18 +296,21 @@ __mutex_lock_interruptible_slowpath(atomic_t *lock_count __IP_DECL__)
276static inline int __mutex_trylock_slowpath(atomic_t *lock_count) 296static inline int __mutex_trylock_slowpath(atomic_t *lock_count)
277{ 297{
278 struct mutex *lock = container_of(lock_count, struct mutex, count); 298 struct mutex *lock = container_of(lock_count, struct mutex, count);
299 unsigned long flags;
279 int prev; 300 int prev;
280 301
281 spin_lock_mutex(&lock->wait_lock); 302 spin_lock_mutex(&lock->wait_lock, flags);
282 303
283 prev = atomic_xchg(&lock->count, -1); 304 prev = atomic_xchg(&lock->count, -1);
284 if (likely(prev == 1)) 305 if (likely(prev == 1)) {
285 debug_mutex_set_owner(lock, current_thread_info() __RET_IP__); 306 debug_mutex_set_owner(lock, current_thread_info());
307 mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_);
308 }
286 /* Set it back to 0 if there are no waiters: */ 309 /* Set it back to 0 if there are no waiters: */
287 if (likely(list_empty(&lock->wait_list))) 310 if (likely(list_empty(&lock->wait_list)))
288 atomic_set(&lock->count, 0); 311 atomic_set(&lock->count, 0);
289 312
290 spin_unlock_mutex(&lock->wait_lock); 313 spin_unlock_mutex(&lock->wait_lock, flags);
291 314
292 return prev == 1; 315 return prev == 1;
293} 316}
@@ -306,7 +329,7 @@ static inline int __mutex_trylock_slowpath(atomic_t *lock_count)
306 * This function must not be used in interrupt context. The 329 * This function must not be used in interrupt context. The
307 * mutex must be released by the same task that acquired it. 330 * mutex must be released by the same task that acquired it.
308 */ 331 */
309int fastcall mutex_trylock(struct mutex *lock) 332int fastcall __sched mutex_trylock(struct mutex *lock)
310{ 333{
311 return __mutex_fastpath_trylock(&lock->count, 334 return __mutex_fastpath_trylock(&lock->count,
312 __mutex_trylock_slowpath); 335 __mutex_trylock_slowpath);
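
mutex_lock_nested(), added above under CONFIG_DEBUG_LOCK_ALLOC, lets a caller tell the validator that acquiring a second mutex of the same class is intentional by passing a non-zero subclass. A hedged usage sketch, not taken from this diff (struct my_obj and lock_two() are invented; SINGLE_DEPTH_NESTING is assumed to be the conventional subclass value from the lockdep headers):

struct my_obj {
	struct mutex lock;
};

static void lock_two(struct my_obj *a, struct my_obj *b)
{
	struct my_obj *tmp;

	/* impose a stable order (by address) so the dependency stays acyclic */
	if (a > b) {
		tmp = a;
		a = b;
		b = tmp;
	}

	mutex_lock(&a->lock);
	mutex_lock_nested(&b->lock, SINGLE_DEPTH_NESTING);
}
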
diff --git a/kernel/mutex.h b/kernel/mutex.h
index 00fe84e7b672..a075dafbb290 100644
--- a/kernel/mutex.h
+++ b/kernel/mutex.h
@@ -9,27 +9,22 @@
9 * !CONFIG_DEBUG_MUTEXES case. Most of them are NOPs: 9 * !CONFIG_DEBUG_MUTEXES case. Most of them are NOPs:
10 */ 10 */
11 11
12#define spin_lock_mutex(lock) spin_lock(lock) 12#define spin_lock_mutex(lock, flags) \
13#define spin_unlock_mutex(lock) spin_unlock(lock) 13 do { spin_lock(lock); (void)(flags); } while (0)
14#define spin_unlock_mutex(lock, flags) \
15 do { spin_unlock(lock); (void)(flags); } while (0)
14#define mutex_remove_waiter(lock, waiter, ti) \ 16#define mutex_remove_waiter(lock, waiter, ti) \
15 __list_del((waiter)->list.prev, (waiter)->list.next) 17 __list_del((waiter)->list.prev, (waiter)->list.next)
16 18
17#define DEBUG_WARN_ON(c) do { } while (0)
18#define debug_mutex_set_owner(lock, new_owner) do { } while (0) 19#define debug_mutex_set_owner(lock, new_owner) do { } while (0)
19#define debug_mutex_clear_owner(lock) do { } while (0) 20#define debug_mutex_clear_owner(lock) do { } while (0)
20#define debug_mutex_init_waiter(waiter) do { } while (0)
21#define debug_mutex_wake_waiter(lock, waiter) do { } while (0) 21#define debug_mutex_wake_waiter(lock, waiter) do { } while (0)
22#define debug_mutex_free_waiter(waiter) do { } while (0) 22#define debug_mutex_free_waiter(waiter) do { } while (0)
23#define debug_mutex_add_waiter(lock, waiter, ti, ip) do { } while (0) 23#define debug_mutex_add_waiter(lock, waiter, ti) do { } while (0)
24#define debug_mutex_unlock(lock) do { } while (0) 24#define debug_mutex_unlock(lock) do { } while (0)
25#define debug_mutex_init(lock, name) do { } while (0) 25#define debug_mutex_init(lock, name, key) do { } while (0)
26
27/*
28 * Return-address parameters/declarations. They are very useful for
29 * debugging, but add overhead in the !DEBUG case - so we go the
30 * trouble of using this not too elegant but zero-cost solution:
31 */
32#define __IP_DECL__
33#define __IP__
34#define __RET_IP__
35 26
27static inline void
28debug_mutex_lock_common(struct mutex *lock, struct mutex_waiter *waiter)
29{
30}
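Note: the !DEBUG macros now accept a flags argument solely so call sites compile unchanged against both variants: the debug build wants an irqsave-style lock, the production build just discards flags via the (void) cast. A minimal illustration of that "same signature, one side ignores the argument" pattern, built on pthreads with invented names (DEBUG_LOCKING, lock_guarded) rather than the kernel's spin_lock_mutex():

#include <pthread.h>
#include <stdio.h>

/* One call signature for both builds: the debug variant really uses
 * 'flags', the fast variant just swallows it without a warning. */
#ifdef DEBUG_LOCKING
#define lock_guarded(l, flags) \
	do { (flags) = (unsigned long)__LINE__; pthread_mutex_lock(l); } while (0)
#define unlock_guarded(l, flags) \
	do { pthread_mutex_unlock(l); printf("was locked at line %lu\n", (flags)); } while (0)
#else
#define lock_guarded(l, flags) \
	do { pthread_mutex_lock(l); (void)(flags); } while (0)
#define unlock_guarded(l, flags) \
	do { pthread_mutex_unlock(l); (void)(flags); } while (0)
#endif

static pthread_mutex_t m = PTHREAD_MUTEX_INITIALIZER;

int main(void)
{
	unsigned long flags = 0;

	lock_guarded(&m, flags);
	printf("critical section\n");
	unlock_guarded(&m, flags);
	return 0;
}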
diff --git a/kernel/panic.c b/kernel/panic.c
index cc2a4c9c36ac..d8a0bca21233 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -8,7 +8,6 @@
8 * This function is used through-out the kernel (including mm and fs) 8 * This function is used through-out the kernel (including mm and fs)
9 * to indicate a major problem. 9 * to indicate a major problem.
10 */ 10 */
11#include <linux/config.h>
12#include <linux/module.h> 11#include <linux/module.h>
13#include <linux/sched.h> 12#include <linux/sched.h>
14#include <linux/delay.h> 13#include <linux/delay.h>
@@ -173,6 +172,7 @@ const char *print_tainted(void)
173 172
174void add_taint(unsigned flag) 173void add_taint(unsigned flag)
175{ 174{
175 debug_locks_off(); /* can't trust the integrity of the kernel anymore */
176 tainted |= flag; 176 tainted |= flag;
177} 177}
178EXPORT_SYMBOL(add_taint); 178EXPORT_SYMBOL(add_taint);
@@ -257,6 +257,7 @@ int oops_may_print(void)
257 */ 257 */
258void oops_enter(void) 258void oops_enter(void)
259{ 259{
260 debug_locks_off(); /* can't trust the integrity of the kernel anymore */
260 do_oops_enter_exit(); 261 do_oops_enter_exit();
261} 262}
262 263
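Note: add_taint() and oops_enter() now switch lock debugging off first, the idea being that once the kernel is tainted or oopsing, further lock-correctness reports are noise. The useful property is that the switch is a one-way latch that is cheap and safe to hit repeatedly. A hedged userspace sketch of that latch shape (debug_locks_off() itself lives elsewhere in the tree; this only shows the idea):

#include <stdatomic.h>
#include <stdio.h>

/* One-way latch: the first caller wins, later callers are no-ops.
 * Returning the old value lets the first caller print a single report. */
static atomic_int debugging_enabled = 1;

static int debugging_off(void)
{
	return atomic_exchange(&debugging_enabled, 0);
}

static void report_problem(const char *what)
{
	if (debugging_off())
		fprintf(stderr, "disabling self-checks: %s\n", what);
}

int main(void)
{
	report_problem("tainted");	/* prints once */
	report_problem("oops");		/* silent: checks already off */
	return 0;
}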
diff --git a/kernel/params.c b/kernel/params.c
index af43ecdc8d9b..91aea7aa532e 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -15,7 +15,6 @@
15 along with this program; if not, write to the Free Software 15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 16 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17*/ 17*/
18#include <linux/config.h>
19#include <linux/moduleparam.h> 18#include <linux/moduleparam.h>
20#include <linux/kernel.h> 19#include <linux/kernel.h>
21#include <linux/string.h> 20#include <linux/string.h>
diff --git a/kernel/pid.c b/kernel/pid.c
index eeb836b65ca4..93e212f20671 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -218,7 +218,7 @@ struct pid * fastcall find_pid(int nr)
218 return NULL; 218 return NULL;
219} 219}
220 220
221int fastcall attach_pid(task_t *task, enum pid_type type, int nr) 221int fastcall attach_pid(struct task_struct *task, enum pid_type type, int nr)
222{ 222{
223 struct pid_link *link; 223 struct pid_link *link;
224 struct pid *pid; 224 struct pid *pid;
@@ -233,7 +233,7 @@ int fastcall attach_pid(task_t *task, enum pid_type type, int nr)
233 return 0; 233 return 0;
234} 234}
235 235
236void fastcall detach_pid(task_t *task, enum pid_type type) 236void fastcall detach_pid(struct task_struct *task, enum pid_type type)
237{ 237{
238 struct pid_link *link; 238 struct pid_link *link;
239 struct pid *pid; 239 struct pid *pid;
@@ -267,7 +267,7 @@ struct task_struct * fastcall pid_task(struct pid *pid, enum pid_type type)
267/* 267/*
268 * Must be called under rcu_read_lock() or with tasklist_lock read-held. 268 * Must be called under rcu_read_lock() or with tasklist_lock read-held.
269 */ 269 */
270task_t *find_task_by_pid_type(int type, int nr) 270struct task_struct *find_task_by_pid_type(int type, int nr)
271{ 271{
272 return pid_task(find_pid(nr), type); 272 return pid_task(find_pid(nr), type);
273} 273}
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 520f6c59948d..d38d9ec3276c 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -555,9 +555,6 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now)
555 struct cpu_timer_list *next; 555 struct cpu_timer_list *next;
556 unsigned long i; 556 unsigned long i;
557 557
558 if (CPUCLOCK_PERTHREAD(timer->it_clock) && (p->flags & PF_EXITING))
559 return;
560
561 head = (CPUCLOCK_PERTHREAD(timer->it_clock) ? 558 head = (CPUCLOCK_PERTHREAD(timer->it_clock) ?
562 p->cpu_timers : p->signal->cpu_timers); 559 p->cpu_timers : p->signal->cpu_timers);
563 head += CPUCLOCK_WHICH(timer->it_clock); 560 head += CPUCLOCK_WHICH(timer->it_clock);
@@ -1173,6 +1170,9 @@ static void check_process_timers(struct task_struct *tsk,
1173 } 1170 }
1174 t = tsk; 1171 t = tsk;
1175 do { 1172 do {
1173 if (unlikely(t->flags & PF_EXITING))
1174 continue;
1175
1176 ticks = cputime_add(cputime_add(t->utime, t->stime), 1176 ticks = cputime_add(cputime_add(t->utime, t->stime),
1177 prof_left); 1177 prof_left);
1178 if (!cputime_eq(prof_expires, cputime_zero) && 1178 if (!cputime_eq(prof_expires, cputime_zero) &&
@@ -1193,11 +1193,7 @@ static void check_process_timers(struct task_struct *tsk,
1193 t->it_sched_expires > sched)) { 1193 t->it_sched_expires > sched)) {
1194 t->it_sched_expires = sched; 1194 t->it_sched_expires = sched;
1195 } 1195 }
1196 1196 } while ((t = next_thread(t)) != tsk);
1197 do {
1198 t = next_thread(t);
1199 } while (unlikely(t->flags & PF_EXITING));
1200 } while (t != tsk);
1201 } 1197 }
1202} 1198}
1203 1199
@@ -1289,30 +1285,30 @@ void run_posix_cpu_timers(struct task_struct *tsk)
1289 1285
1290#undef UNEXPIRED 1286#undef UNEXPIRED
1291 1287
1292 BUG_ON(tsk->exit_state);
1293
1294 /* 1288 /*
1295 * Double-check with locks held. 1289 * Double-check with locks held.
1296 */ 1290 */
1297 read_lock(&tasklist_lock); 1291 read_lock(&tasklist_lock);
1298 spin_lock(&tsk->sighand->siglock); 1292 if (likely(tsk->signal != NULL)) {
1293 spin_lock(&tsk->sighand->siglock);
1299 1294
1300 /* 1295 /*
1301 * Here we take off tsk->cpu_timers[N] and tsk->signal->cpu_timers[N] 1296 * Here we take off tsk->cpu_timers[N] and tsk->signal->cpu_timers[N]
1302 * all the timers that are firing, and put them on the firing list. 1297 * all the timers that are firing, and put them on the firing list.
1303 */ 1298 */
1304 check_thread_timers(tsk, &firing); 1299 check_thread_timers(tsk, &firing);
1305 check_process_timers(tsk, &firing); 1300 check_process_timers(tsk, &firing);
1306 1301
1307 /* 1302 /*
1308 * We must release these locks before taking any timer's lock. 1303 * We must release these locks before taking any timer's lock.
1309 * There is a potential race with timer deletion here, as the 1304 * There is a potential race with timer deletion here, as the
1310 * siglock now protects our private firing list. We have set 1305 * siglock now protects our private firing list. We have set
1311 * the firing flag in each timer, so that a deletion attempt 1306 * the firing flag in each timer, so that a deletion attempt
1312 * that gets the timer lock before we do will give it up and 1307 * that gets the timer lock before we do will give it up and
1313 * spin until we've taken care of that timer below. 1308 * spin until we've taken care of that timer below.
1314 */ 1309 */
1315 spin_unlock(&tsk->sighand->siglock); 1310 spin_unlock(&tsk->sighand->siglock);
1311 }
1316 read_unlock(&tasklist_lock); 1312 read_unlock(&tasklist_lock);
1317 1313
1318 /* 1314 /*
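Note: the posix-cpu-timers fix folds the "skip threads that are exiting" test into the main thread-group walk: instead of a second inner loop that advances past PF_EXITING tasks, each iteration simply continues when the current thread is on its way out, and the walk still terminates when it comes back around to the starting task. The shape of that loop, as a self-contained sketch over an illustrative circular list rather than the kernel's thread list:

#include <stdio.h>

struct toy_thread {
	int id;
	int exiting;			/* stands in for PF_EXITING */
	struct toy_thread *next;	/* circular thread-group list */
};

static void charge_group(struct toy_thread *group)
{
	struct toy_thread *t = group;

	do {
		if (t->exiting)
			continue;	/* skip it, but keep walking the group */
		printf("charging thread %d\n", t->id);
	} while ((t = t->next) != group);	/* 'continue' lands here */
}

int main(void)
{
	struct toy_thread c = { 3, 0, NULL };
	struct toy_thread b = { 2, 1, &c };	/* exiting: skipped */
	struct toy_thread a = { 1, 0, &b };

	c.next = &a;				/* close the circle */
	charge_group(&a);
	return 0;
}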
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index ce0dfb8f4a4e..ae44a70aae8a 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -36,6 +36,24 @@ config PM_DEBUG
36 code. This is helpful when debugging and reporting various PM bugs, 36 code. This is helpful when debugging and reporting various PM bugs,
37 like suspend support. 37 like suspend support.
38 38
39config PM_TRACE
40 bool "Suspend/resume event tracing"
41 depends on PM && PM_DEBUG && X86_32 && EXPERIMENTAL
42 default n
43 ---help---
44 This enables some cheesy code to save the last PM event point in the
45 RTC across reboots, so that you can debug a machine that just hangs
46 during suspend (or more commonly, during resume).
47
48 To use this debugging feature you should attempt to suspend the machine,
49 then reboot it, then run
50
51 dmesg -s 1000000 | grep 'hash matches'
52
53 CAUTION: this option will cause your machine's real-time clock to be
54 set to an invalid time after a resume.
55
56
39config SOFTWARE_SUSPEND 57config SOFTWARE_SUSPEND
40 bool "Software Suspend" 58 bool "Software Suspend"
41 depends on PM && SWAP && (X86 && (!SMP || SUSPEND_SMP)) || ((FRV || PPC32) && !SMP) 59 depends on PM && SWAP && (X86 && (!SMP || SUSPEND_SMP)) || ((FRV || PPC32) && !SMP)
@@ -82,18 +100,6 @@ config PM_STD_PARTITION
82 suspended image to. It will simply pick the first available swap 100 suspended image to. It will simply pick the first available swap
83 device. 101 device.
84 102
85config SWSUSP_ENCRYPT
86 bool "Encrypt suspend image"
87 depends on SOFTWARE_SUSPEND && CRYPTO=y && (CRYPTO_AES=y || CRYPTO_AES_586=y || CRYPTO_AES_X86_64=y)
88 default ""
89 ---help---
90 To prevent data gathering from swap after resume you can encrypt
91 the suspend image with a temporary key that is deleted on
92 resume.
93
94 Note that the temporary key is stored unencrypted on disk while the
95 system is suspended.
96
97config SUSPEND_SMP 103config SUSPEND_SMP
98 bool 104 bool
99 depends on HOTPLUG_CPU && X86 && PM 105 depends on HOTPLUG_CPU && X86 && PM
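Note: PM_TRACE works by squeezing "how far did suspend/resume get" into the RTC and, after the forced reboot, recomputing hashes until one matches, which is what the dmesg "hash matches" lines report; that is also why the help text warns the RTC ends up holding a bogus time. Purely as an illustration of the hash-and-match idea (none of this is the kernel's TRACE_RESUME code), a userspace sketch:

#include <stdio.h>

/* Fold a device name and source line into a small value that would fit in
 * a few RTC registers; the same fold is recomputed at boot to find a match. */
static unsigned int fold_trace_point(const char *dev, unsigned int line)
{
	unsigned int hash = 5381;

	while (*dev)
		hash = hash * 33 + (unsigned char)*dev++;
	return (hash ^ line) & 0xffff;		/* pretend only 16 bits survive */
}

int main(void)
{
	/* Pretend this value was recovered from the RTC after the hang. */
	unsigned int stored = fold_trace_point("0000:00:1d.7", 142);
	const char *candidates[] = { "0000:00:02.0", "0000:00:1d.7", "0000:00:1f.2" };

	for (unsigned int i = 0; i < 3; i++)
		if (fold_trace_point(candidates[i], 142) == stored)
			printf("hash matches device %s\n", candidates[i]);
	return 0;
}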
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index 81d4d982f3f0..e13e74067845 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -231,7 +231,7 @@ static int software_resume(void)
231late_initcall(software_resume); 231late_initcall(software_resume);
232 232
233 233
234static char * pm_disk_modes[] = { 234static const char * const pm_disk_modes[] = {
235 [PM_DISK_FIRMWARE] = "firmware", 235 [PM_DISK_FIRMWARE] = "firmware",
236 [PM_DISK_PLATFORM] = "platform", 236 [PM_DISK_PLATFORM] = "platform",
237 [PM_DISK_SHUTDOWN] = "shutdown", 237 [PM_DISK_SHUTDOWN] = "shutdown",
diff --git a/kernel/power/main.c b/kernel/power/main.c
index a6d9ef46009e..6d295c776794 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -15,7 +15,7 @@
15#include <linux/errno.h> 15#include <linux/errno.h>
16#include <linux/init.h> 16#include <linux/init.h>
17#include <linux/pm.h> 17#include <linux/pm.h>
18 18#include <linux/console.h>
19 19
20#include "power.h" 20#include "power.h"
21 21
@@ -86,6 +86,7 @@ static int suspend_prepare(suspend_state_t state)
86 goto Thaw; 86 goto Thaw;
87 } 87 }
88 88
89 suspend_console();
89 if ((error = device_suspend(PMSG_SUSPEND))) { 90 if ((error = device_suspend(PMSG_SUSPEND))) {
90 printk(KERN_ERR "Some devices failed to suspend\n"); 91 printk(KERN_ERR "Some devices failed to suspend\n");
91 goto Finish; 92 goto Finish;
@@ -133,6 +134,7 @@ int suspend_enter(suspend_state_t state)
133static void suspend_finish(suspend_state_t state) 134static void suspend_finish(suspend_state_t state)
134{ 135{
135 device_resume(); 136 device_resume();
137 resume_console();
136 thaw_processes(); 138 thaw_processes();
137 enable_nonboot_cpus(); 139 enable_nonboot_cpus();
138 if (pm_ops && pm_ops->finish) 140 if (pm_ops && pm_ops->finish)
@@ -143,7 +145,7 @@ static void suspend_finish(suspend_state_t state)
143 145
144 146
145 147
146static char *pm_states[PM_SUSPEND_MAX] = { 148static const char * const pm_states[PM_SUSPEND_MAX] = {
147 [PM_SUSPEND_STANDBY] = "standby", 149 [PM_SUSPEND_STANDBY] = "standby",
148 [PM_SUSPEND_MEM] = "mem", 150 [PM_SUSPEND_MEM] = "mem",
149#ifdef CONFIG_SOFTWARE_SUSPEND 151#ifdef CONFIG_SOFTWARE_SUSPEND
@@ -260,7 +262,7 @@ static ssize_t state_show(struct subsystem * subsys, char * buf)
260static ssize_t state_store(struct subsystem * subsys, const char * buf, size_t n) 262static ssize_t state_store(struct subsystem * subsys, const char * buf, size_t n)
261{ 263{
262 suspend_state_t state = PM_SUSPEND_STANDBY; 264 suspend_state_t state = PM_SUSPEND_STANDBY;
263 char ** s; 265 const char * const *s;
264 char *p; 266 char *p;
265 int error; 267 int error;
266 int len; 268 int len;
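Note: alongside the console suspend hooks, the state-name tables become const char * const, so neither the pointers nor the strings are writable, and state_store() walks them with a matching const char * const * cursor. A small standalone sketch of that table-scan style, with example entries rather than the real pm_states contents:

#include <stdio.h>
#include <string.h>

/* Neither the array slots nor the strings they point at can be modified. */
static const char * const state_names[] = { "standby", "mem", "disk" };
#define NSTATES (sizeof(state_names) / sizeof(state_names[0]))

static int lookup_state(const char *buf, size_t len)
{
	const char * const *s;

	for (s = state_names; s < state_names + NSTATES; s++)
		if (strlen(*s) == len && !strncmp(buf, *s, len))
			return (int)(s - state_names);
	return -1;	/* no such state */
}

int main(void)
{
	printf("mem   -> index %d\n", lookup_state("mem", 3));
	printf("bogus -> index %d\n", lookup_state("bogus", 5));
	return 0;
}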
diff --git a/kernel/power/pm.c b/kernel/power/pm.c
index 84063ac8fcfc..c50d15266c10 100644
--- a/kernel/power/pm.c
+++ b/kernel/power/pm.c
@@ -75,42 +75,6 @@ struct pm_dev *pm_register(pm_dev_t type,
75 return dev; 75 return dev;
76} 76}
77 77
78static void __pm_unregister(struct pm_dev *dev)
79{
80 if (dev) {
81 list_del(&dev->entry);
82 kfree(dev);
83 }
84}
85
86/**
87 * pm_unregister_all - unregister all devices with matching callback
88 * @callback: callback function pointer
89 *
90 * Unregister every device that would call the callback passed. This
91 * is primarily meant as a helper function for loadable modules. It
92 * enables a module to give up all its managed devices without keeping
93 * its own private list.
94 */
95
96void pm_unregister_all(pm_callback callback)
97{
98 struct list_head *entry;
99
100 if (!callback)
101 return;
102
103 mutex_lock(&pm_devs_lock);
104 entry = pm_devs.next;
105 while (entry != &pm_devs) {
106 struct pm_dev *dev = list_entry(entry, struct pm_dev, entry);
107 entry = entry->next;
108 if (dev->callback == callback)
109 __pm_unregister(dev);
110 }
111 mutex_unlock(&pm_devs_lock);
112}
113
114/** 78/**
115 * pm_send - send request to a single device 79 * pm_send - send request to a single device
116 * @dev: device to send to 80 * @dev: device to send to
@@ -239,7 +203,6 @@ int pm_send_all(pm_request_t rqst, void *data)
239} 203}
240 204
241EXPORT_SYMBOL(pm_register); 205EXPORT_SYMBOL(pm_register);
242EXPORT_SYMBOL(pm_unregister_all);
243EXPORT_SYMBOL(pm_send_all); 206EXPORT_SYMBOL(pm_send_all);
244EXPORT_SYMBOL(pm_active); 207EXPORT_SYMBOL(pm_active);
245 208
diff --git a/kernel/power/power.h b/kernel/power/power.h
index f06f12f21767..57a792982fb9 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -55,7 +55,7 @@ struct snapshot_handle {
55 unsigned int page; 55 unsigned int page;
56 unsigned int page_offset; 56 unsigned int page_offset;
57 unsigned int prev; 57 unsigned int prev;
58 struct pbe *pbe; 58 struct pbe *pbe, *last_pbe;
59 void *buffer; 59 void *buffer;
60 unsigned int buf_offset; 60 unsigned int buf_offset;
61}; 61};
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 3eeedbb13b78..75d4886e648e 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -150,6 +150,10 @@ int restore_highmem(void)
150 } 150 }
151 return 0; 151 return 0;
152} 152}
153#else
154static inline unsigned int count_highmem_pages(void) {return 0;}
155static inline int save_highmem(void) {return 0;}
156static inline int restore_highmem(void) {return 0;}
153#endif 157#endif
154 158
155static int pfn_is_nosave(unsigned long pfn) 159static int pfn_is_nosave(unsigned long pfn)
@@ -223,11 +227,17 @@ static void copy_data_pages(struct pbe *pblist)
223 for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) { 227 for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) {
224 if (saveable(zone, &zone_pfn)) { 228 if (saveable(zone, &zone_pfn)) {
225 struct page *page; 229 struct page *page;
230 long *src, *dst;
231 int n;
232
226 page = pfn_to_page(zone_pfn + zone->zone_start_pfn); 233 page = pfn_to_page(zone_pfn + zone->zone_start_pfn);
227 BUG_ON(!pbe); 234 BUG_ON(!pbe);
228 pbe->orig_address = (unsigned long)page_address(page); 235 pbe->orig_address = (unsigned long)page_address(page);
229 /* copy_page is not usable for copying task structs. */ 236 /* copy_page and memcpy are not usable for copying task structs. */
230 memcpy((void *)pbe->address, (void *)pbe->orig_address, PAGE_SIZE); 237 dst = (long *)pbe->address;
238 src = (long *)pbe->orig_address;
239 for (n = PAGE_SIZE / sizeof(long); n; n--)
240 *dst++ = *src++;
231 pbe = pbe->next; 241 pbe = pbe->next;
232 } 242 }
233 } 243 }
@@ -293,62 +303,29 @@ static inline void create_pbe_list(struct pbe *pblist, unsigned int nr_pages)
293 } 303 }
294} 304}
295 305
296/** 306static unsigned int unsafe_pages;
297 * On resume it is necessary to trace and eventually free the unsafe
298 * pages that have been allocated, because they are needed for I/O
299 * (on x86-64 we likely will "eat" these pages once again while
300 * creating the temporary page translation tables)
301 */
302
303struct eaten_page {
304 struct eaten_page *next;
305 char padding[PAGE_SIZE - sizeof(void *)];
306};
307
308static struct eaten_page *eaten_pages = NULL;
309
310static void release_eaten_pages(void)
311{
312 struct eaten_page *p, *q;
313
314 p = eaten_pages;
315 while (p) {
316 q = p->next;
317 /* We don't want swsusp_free() to free this page again */
318 ClearPageNosave(virt_to_page(p));
319 free_page((unsigned long)p);
320 p = q;
321 }
322 eaten_pages = NULL;
323}
324 307
325/** 308/**
326 * @safe_needed - on resume, for storing the PBE list and the image, 309 * @safe_needed - on resume, for storing the PBE list and the image,
327 * we can only use memory pages that do not conflict with the pages 310 * we can only use memory pages that do not conflict with the pages
328 * which had been used before suspend. 311 * used before suspend.
329 * 312 *
330 * The unsafe pages are marked with the PG_nosave_free flag 313 * The unsafe pages are marked with the PG_nosave_free flag
331 * 314 * and we count them using unsafe_pages
332 * Allocated but unusable (ie eaten) memory pages should be marked
333 * so that swsusp_free() can release them
334 */ 315 */
335 316
336static inline void *alloc_image_page(gfp_t gfp_mask, int safe_needed) 317static inline void *alloc_image_page(gfp_t gfp_mask, int safe_needed)
337{ 318{
338 void *res; 319 void *res;
339 320
321 res = (void *)get_zeroed_page(gfp_mask);
340 if (safe_needed) 322 if (safe_needed)
341 do { 323 while (res && PageNosaveFree(virt_to_page(res))) {
324 /* The page is unsafe, mark it for swsusp_free() */
325 SetPageNosave(virt_to_page(res));
326 unsafe_pages++;
342 res = (void *)get_zeroed_page(gfp_mask); 327 res = (void *)get_zeroed_page(gfp_mask);
343 if (res && PageNosaveFree(virt_to_page(res))) { 328 }
344 /* This is for swsusp_free() */
345 SetPageNosave(virt_to_page(res));
346 ((struct eaten_page *)res)->next = eaten_pages;
347 eaten_pages = res;
348 }
349 } while (res && PageNosaveFree(virt_to_page(res)));
350 else
351 res = (void *)get_zeroed_page(gfp_mask);
352 if (res) { 329 if (res) {
353 SetPageNosave(virt_to_page(res)); 330 SetPageNosave(virt_to_page(res));
354 SetPageNosaveFree(virt_to_page(res)); 331 SetPageNosaveFree(virt_to_page(res));
@@ -374,7 +351,8 @@ unsigned long get_safe_page(gfp_t gfp_mask)
374 * On each page we set up a list of struct_pbe elements. 351 * On each page we set up a list of struct_pbe elements.
375 */ 352 */
376 353
377struct pbe *alloc_pagedir(unsigned int nr_pages, gfp_t gfp_mask, int safe_needed) 354static struct pbe *alloc_pagedir(unsigned int nr_pages, gfp_t gfp_mask,
355 int safe_needed)
378{ 356{
379 unsigned int num; 357 unsigned int num;
380 struct pbe *pblist, *pbe; 358 struct pbe *pblist, *pbe;
@@ -642,6 +620,8 @@ static int mark_unsafe_pages(struct pbe *pblist)
642 return -EFAULT; 620 return -EFAULT;
643 } 621 }
644 622
623 unsafe_pages = 0;
624
645 return 0; 625 return 0;
646} 626}
647 627
@@ -719,42 +699,99 @@ static inline struct pbe *unpack_orig_addresses(unsigned long *buf,
719} 699}
720 700
721/** 701/**
722 * create_image - use metadata contained in the PBE list 702 * prepare_image - use metadata contained in the PBE list
723 * pointed to by pagedir_nosave to mark the pages that will 703 * pointed to by pagedir_nosave to mark the pages that will
724 * be overwritten in the process of restoring the system 704 * be overwritten in the process of restoring the system
725 * memory state from the image and allocate memory for 705 * memory state from the image ("unsafe" pages) and allocate
726 * the image avoiding these pages 706 * memory for the image
707 *
708 * The idea is to allocate the PBE list first and then
709 * allocate as many pages as it's needed for the image data,
710 * but not to assign these pages to the PBEs initially.
711 * Instead, we just mark them as allocated and create a list
712 * of "safe" which will be used later
727 */ 713 */
728 714
729static int create_image(struct snapshot_handle *handle) 715struct safe_page {
716 struct safe_page *next;
717 char padding[PAGE_SIZE - sizeof(void *)];
718};
719
720static struct safe_page *safe_pages;
721
722static int prepare_image(struct snapshot_handle *handle)
730{ 723{
731 int error = 0; 724 int error = 0;
732 struct pbe *p, *pblist; 725 unsigned int nr_pages = nr_copy_pages;
726 struct pbe *p, *pblist = NULL;
733 727
734 p = pagedir_nosave; 728 p = pagedir_nosave;
735 error = mark_unsafe_pages(p); 729 error = mark_unsafe_pages(p);
736 if (!error) { 730 if (!error) {
737 pblist = alloc_pagedir(nr_copy_pages, GFP_ATOMIC, 1); 731 pblist = alloc_pagedir(nr_pages, GFP_ATOMIC, 1);
738 if (pblist) 732 if (pblist)
739 copy_page_backup_list(pblist, p); 733 copy_page_backup_list(pblist, p);
740 free_pagedir(p, 0); 734 free_pagedir(p, 0);
741 if (!pblist) 735 if (!pblist)
742 error = -ENOMEM; 736 error = -ENOMEM;
743 } 737 }
744 if (!error) 738 safe_pages = NULL;
745 error = alloc_data_pages(pblist, GFP_ATOMIC, 1); 739 if (!error && nr_pages > unsafe_pages) {
740 nr_pages -= unsafe_pages;
741 while (nr_pages--) {
742 struct safe_page *ptr;
743
744 ptr = (struct safe_page *)get_zeroed_page(GFP_ATOMIC);
745 if (!ptr) {
746 error = -ENOMEM;
747 break;
748 }
749 if (!PageNosaveFree(virt_to_page(ptr))) {
750 /* The page is "safe", add it to the list */
751 ptr->next = safe_pages;
752 safe_pages = ptr;
753 }
754 /* Mark the page as allocated */
755 SetPageNosave(virt_to_page(ptr));
756 SetPageNosaveFree(virt_to_page(ptr));
757 }
758 }
746 if (!error) { 759 if (!error) {
747 release_eaten_pages();
748 pagedir_nosave = pblist; 760 pagedir_nosave = pblist;
749 } else { 761 } else {
750 pagedir_nosave = NULL;
751 handle->pbe = NULL; 762 handle->pbe = NULL;
752 nr_copy_pages = 0; 763 swsusp_free();
753 nr_meta_pages = 0;
754 } 764 }
755 return error; 765 return error;
756} 766}
757 767
768static void *get_buffer(struct snapshot_handle *handle)
769{
770 struct pbe *pbe = handle->pbe, *last = handle->last_pbe;
771 struct page *page = virt_to_page(pbe->orig_address);
772
773 if (PageNosave(page) && PageNosaveFree(page)) {
774 /*
775 * We have allocated the "original" page frame and we can
776 * use it directly to store the read page
777 */
778 pbe->address = 0;
779 if (last && last->next)
780 last->next = NULL;
781 return (void *)pbe->orig_address;
782 }
783 /*
784 * The "original" page frame has not been allocated and we have to
785 * use a "safe" page frame to store the read page
786 */
787 pbe->address = (unsigned long)safe_pages;
788 safe_pages = safe_pages->next;
789 if (last)
790 last->next = pbe;
791 handle->last_pbe = pbe;
792 return (void *)pbe->address;
793}
794
758/** 795/**
759 * snapshot_write_next - used for writing the system memory snapshot. 796 * snapshot_write_next - used for writing the system memory snapshot.
760 * 797 *
@@ -799,15 +836,16 @@ int snapshot_write_next(struct snapshot_handle *handle, size_t count)
799 } else if (handle->prev <= nr_meta_pages) { 836 } else if (handle->prev <= nr_meta_pages) {
800 handle->pbe = unpack_orig_addresses(buffer, handle->pbe); 837 handle->pbe = unpack_orig_addresses(buffer, handle->pbe);
801 if (!handle->pbe) { 838 if (!handle->pbe) {
802 error = create_image(handle); 839 error = prepare_image(handle);
803 if (error) 840 if (error)
804 return error; 841 return error;
805 handle->pbe = pagedir_nosave; 842 handle->pbe = pagedir_nosave;
806 handle->buffer = (void *)handle->pbe->address; 843 handle->last_pbe = NULL;
844 handle->buffer = get_buffer(handle);
807 } 845 }
808 } else { 846 } else {
809 handle->pbe = handle->pbe->next; 847 handle->pbe = handle->pbe->next;
810 handle->buffer = (void *)handle->pbe->address; 848 handle->buffer = get_buffer(handle);
811 } 849 }
812 handle->prev = handle->page; 850 handle->prev = handle->page;
813 } 851 }
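Note: prepare_image() and get_buffer() rest on one trick: pages the restored image will overwrite are "unsafe", so the code over-allocates, keeps the non-colliding pages, and threads the spares onto a singly linked list stored inside the free pages themselves (struct safe_page is a page with a next pointer at its head). A userspace sketch of that intrusive free list, with malloc standing in for get_zeroed_page() and an invented collision predicate; the "unsafe" allocations are deliberately kept (and leaked here) so nothing else can land in them, just as the kernel leaves them for swsusp_free():

#include <stdio.h>
#include <stdlib.h>

#define FAKE_PAGE_SIZE 4096

/* The list node lives inside the free page itself, like struct safe_page. */
struct safe_page {
	struct safe_page *next;
	char padding[FAKE_PAGE_SIZE - sizeof(struct safe_page *)];
};

static struct safe_page *safe_pages;

/* Stand-in for PageNosaveFree(): pretend every fourth page collides. */
static int page_is_unsafe(void *p, int seq)
{
	(void)p;
	return (seq % 4) == 0;
}

static int stock_safe_pages(int wanted)
{
	for (int seq = 0; wanted > 0; seq++) {
		struct safe_page *p = calloc(1, sizeof(*p));

		if (!p)
			return -1;
		if (page_is_unsafe(p, seq))
			continue;	/* keep it allocated, never hand it out */
		p->next = safe_pages;	/* push onto the intrusive free list */
		safe_pages = p;
		wanted--;
	}
	return 0;
}

static void *get_safe_buffer(void)
{
	struct safe_page *p = safe_pages;

	safe_pages = p->next;
	return p;
}

int main(void)
{
	if (stock_safe_pages(3) == 0)
		printf("first safe buffer at %p\n", get_safe_buffer());
	return 0;
}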
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 044b8e0c1025..f1dd146bd64d 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -263,7 +263,6 @@ int swsusp_write(void)
263 struct swap_map_handle handle; 263 struct swap_map_handle handle;
264 struct snapshot_handle snapshot; 264 struct snapshot_handle snapshot;
265 struct swsusp_info *header; 265 struct swsusp_info *header;
266 unsigned long start;
267 int error; 266 int error;
268 267
269 if ((error = swsusp_swap_check())) { 268 if ((error = swsusp_swap_check())) {
@@ -281,16 +280,17 @@ int swsusp_write(void)
281 } 280 }
282 error = get_swap_writer(&handle); 281 error = get_swap_writer(&handle);
283 if (!error) { 282 if (!error) {
284 start = handle.cur_swap; 283 unsigned long start = handle.cur_swap;
285 error = swap_write_page(&handle, header); 284 error = swap_write_page(&handle, header);
286 } 285 if (!error)
287 if (!error) 286 error = save_image(&handle, &snapshot,
288 error = save_image(&handle, &snapshot, header->pages - 1); 287 header->pages - 1);
289 if (!error) { 288 if (!error) {
290 flush_swap_writer(&handle); 289 flush_swap_writer(&handle);
291 printk("S"); 290 printk("S");
292 error = mark_swapfiles(swp_entry(root_swap, start)); 291 error = mark_swapfiles(swp_entry(root_swap, start));
293 printk("|\n"); 292 printk("|\n");
293 }
294 } 294 }
295 if (error) 295 if (error)
296 free_all_swap_pages(root_swap, handle.bitmap); 296 free_all_swap_pages(root_swap, handle.bitmap);
@@ -311,8 +311,10 @@ static atomic_t io_done = ATOMIC_INIT(0);
311 311
312static int end_io(struct bio *bio, unsigned int num, int err) 312static int end_io(struct bio *bio, unsigned int num, int err)
313{ 313{
314 if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) 314 if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
315 panic("I/O error reading memory image"); 315 printk(KERN_ERR "I/O error reading swsusp image.\n");
316 return -EIO;
317 }
316 atomic_set(&io_done, 0); 318 atomic_set(&io_done, 0);
317 return 0; 319 return 0;
318} 320}
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index c4016cbbd3e0..17f669c83012 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -67,9 +67,9 @@ unsigned int count_highmem_pages(void);
67int save_highmem(void); 67int save_highmem(void);
68int restore_highmem(void); 68int restore_highmem(void);
69#else 69#else
70static int save_highmem(void) { return 0; } 70static inline int save_highmem(void) { return 0; }
71static int restore_highmem(void) { return 0; } 71static inline int restore_highmem(void) { return 0; }
72static unsigned int count_highmem_pages(void) { return 0; } 72static inline unsigned int count_highmem_pages(void) { return 0; }
73#endif 73#endif
74 74
75/** 75/**
@@ -175,6 +175,12 @@ void free_all_swap_pages(int swap, struct bitmap_page *bitmap)
175 */ 175 */
176 176
177#define SHRINK_BITE 10000 177#define SHRINK_BITE 10000
178static inline unsigned long __shrink_memory(long tmp)
179{
180 if (tmp > SHRINK_BITE)
181 tmp = SHRINK_BITE;
182 return shrink_all_memory(tmp);
183}
178 184
179int swsusp_shrink_memory(void) 185int swsusp_shrink_memory(void)
180{ 186{
@@ -192,15 +198,17 @@ int swsusp_shrink_memory(void)
192 PAGES_FOR_IO; 198 PAGES_FOR_IO;
193 tmp = size; 199 tmp = size;
194 for_each_zone (zone) 200 for_each_zone (zone)
195 if (!is_highmem(zone)) 201 if (!is_highmem(zone) && populated_zone(zone)) {
196 tmp -= zone->free_pages; 202 tmp -= zone->free_pages;
203 tmp += zone->lowmem_reserve[ZONE_NORMAL];
204 }
197 if (tmp > 0) { 205 if (tmp > 0) {
198 tmp = shrink_all_memory(SHRINK_BITE); 206 tmp = __shrink_memory(tmp);
199 if (!tmp) 207 if (!tmp)
200 return -ENOMEM; 208 return -ENOMEM;
201 pages += tmp; 209 pages += tmp;
202 } else if (size > image_size / PAGE_SIZE) { 210 } else if (size > image_size / PAGE_SIZE) {
203 tmp = shrink_all_memory(SHRINK_BITE); 211 tmp = __shrink_memory(size - (image_size / PAGE_SIZE));
204 pages += tmp; 212 pages += tmp;
205 } 213 }
206 printk("\b%c", p[i++%4]); 214 printk("\b%c", p[i++%4]);
diff --git a/kernel/printk.c b/kernel/printk.c
index c056f3324432..65ca0688f86f 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -24,8 +24,8 @@
24#include <linux/console.h> 24#include <linux/console.h>
25#include <linux/init.h> 25#include <linux/init.h>
26#include <linux/module.h> 26#include <linux/module.h>
27#include <linux/moduleparam.h>
27#include <linux/interrupt.h> /* For in_interrupt() */ 28#include <linux/interrupt.h> /* For in_interrupt() */
28#include <linux/config.h>
29#include <linux/delay.h> 29#include <linux/delay.h>
30#include <linux/smp.h> 30#include <linux/smp.h>
31#include <linux/security.h> 31#include <linux/security.h>
@@ -52,7 +52,7 @@ int console_printk[4] = {
52 DEFAULT_CONSOLE_LOGLEVEL, /* default_console_loglevel */ 52 DEFAULT_CONSOLE_LOGLEVEL, /* default_console_loglevel */
53}; 53};
54 54
55EXPORT_SYMBOL(console_printk); 55EXPORT_UNUSED_SYMBOL(console_printk); /* June 2006 */
56 56
57/* 57/*
58 * Low lever drivers may need that to know if they can schedule in 58 * Low lever drivers may need that to know if they can schedule in
@@ -67,6 +67,7 @@ EXPORT_SYMBOL(oops_in_progress);
67 * driver system. 67 * driver system.
68 */ 68 */
69static DECLARE_MUTEX(console_sem); 69static DECLARE_MUTEX(console_sem);
70static DECLARE_MUTEX(secondary_console_sem);
70struct console *console_drivers; 71struct console *console_drivers;
71/* 72/*
72 * This is used for debugging the mess that is the VT code by 73 * This is used for debugging the mess that is the VT code by
@@ -76,7 +77,7 @@ struct console *console_drivers;
76 * path in the console code where we end up in places I want 77 * path in the console code where we end up in places I want
77 * locked without the console sempahore held 78 * locked without the console sempahore held
78 */ 79 */
79static int console_locked; 80static int console_locked, console_suspended;
80 81
81/* 82/*
82 * logbuf_lock protects log_buf, log_start, log_end, con_start and logged_chars 83 * logbuf_lock protects log_buf, log_start, log_end, con_start and logged_chars
@@ -326,7 +327,9 @@ static void __call_console_drivers(unsigned long start, unsigned long end)
326 struct console *con; 327 struct console *con;
327 328
328 for (con = console_drivers; con; con = con->next) { 329 for (con = console_drivers; con; con = con->next) {
329 if ((con->flags & CON_ENABLED) && con->write) 330 if ((con->flags & CON_ENABLED) && con->write &&
331 (cpu_online(smp_processor_id()) ||
332 (con->flags & CON_ANYTIME)))
330 con->write(con, &LOG_BUF(start), end - start); 333 con->write(con, &LOG_BUF(start), end - start);
331 } 334 }
332} 335}
@@ -436,6 +439,7 @@ static int printk_time = 1;
436#else 439#else
437static int printk_time = 0; 440static int printk_time = 0;
438#endif 441#endif
442module_param(printk_time, int, S_IRUGO | S_IWUSR);
439 443
440static int __init printk_time_setup(char *str) 444static int __init printk_time_setup(char *str)
441{ 445{
@@ -452,6 +456,18 @@ __attribute__((weak)) unsigned long long printk_clock(void)
452 return sched_clock(); 456 return sched_clock();
453} 457}
454 458
459/* Check if we have any console registered that can be called early in boot. */
460static int have_callable_console(void)
461{
462 struct console *con;
463
464 for (con = console_drivers; con; con = con->next)
465 if (con->flags & CON_ANYTIME)
466 return 1;
467
468 return 0;
469}
470
455/** 471/**
456 * printk - print a kernel message 472 * printk - print a kernel message
457 * @fmt: format string 473 * @fmt: format string
@@ -502,7 +518,9 @@ asmlinkage int vprintk(const char *fmt, va_list args)
502 zap_locks(); 518 zap_locks();
503 519
504 /* This stops the holder of console_sem just where we want him */ 520 /* This stops the holder of console_sem just where we want him */
505 spin_lock_irqsave(&logbuf_lock, flags); 521 local_irq_save(flags);
522 lockdep_off();
523 spin_lock(&logbuf_lock);
506 printk_cpu = smp_processor_id(); 524 printk_cpu = smp_processor_id();
507 525
508 /* Emit the output into the temporary buffer */ 526 /* Emit the output into the temporary buffer */
@@ -565,27 +583,31 @@ asmlinkage int vprintk(const char *fmt, va_list args)
565 log_level_unknown = 1; 583 log_level_unknown = 1;
566 } 584 }
567 585
568 if (!cpu_online(smp_processor_id())) { 586 if (!down_trylock(&console_sem)) {
569 /* 587 /*
570 * Some console drivers may assume that per-cpu resources have 588 * We own the drivers. We can drop the spinlock and
571 * been allocated. So don't allow them to be called by this 589 * let release_console_sem() print the text, maybe ...
572 * CPU until it is officially up. We shouldn't be calling into
573 * random console drivers on a CPU which doesn't exist yet..
574 */ 590 */
575 printk_cpu = UINT_MAX;
576 spin_unlock_irqrestore(&logbuf_lock, flags);
577 goto out;
578 }
579 if (!down_trylock(&console_sem)) {
580 console_locked = 1; 591 console_locked = 1;
592 printk_cpu = UINT_MAX;
593 spin_unlock(&logbuf_lock);
594
581 /* 595 /*
582 * We own the drivers. We can drop the spinlock and let 596 * Console drivers may assume that per-cpu resources have
583 * release_console_sem() print the text 597 * been allocated. So unless they're explicitly marked as
598 * being able to cope (CON_ANYTIME) don't call them until
599 * this CPU is officially up.
584 */ 600 */
585 printk_cpu = UINT_MAX; 601 if (cpu_online(smp_processor_id()) || have_callable_console()) {
586 spin_unlock_irqrestore(&logbuf_lock, flags); 602 console_may_schedule = 0;
587 console_may_schedule = 0; 603 release_console_sem();
588 release_console_sem(); 604 } else {
605 /* Release by hand to avoid flushing the buffer. */
606 console_locked = 0;
607 up(&console_sem);
608 }
609 lockdep_on();
610 local_irq_restore(flags);
589 } else { 611 } else {
590 /* 612 /*
591 * Someone else owns the drivers. We drop the spinlock, which 613 * Someone else owns the drivers. We drop the spinlock, which
@@ -593,9 +615,11 @@ asmlinkage int vprintk(const char *fmt, va_list args)
593 * console drivers with the output which we just produced. 615 * console drivers with the output which we just produced.
594 */ 616 */
595 printk_cpu = UINT_MAX; 617 printk_cpu = UINT_MAX;
596 spin_unlock_irqrestore(&logbuf_lock, flags); 618 spin_unlock(&logbuf_lock);
619 lockdep_on();
620 local_irq_restore(flags);
597 } 621 }
598out: 622
599 preempt_enable(); 623 preempt_enable();
600 return printed_len; 624 return printed_len;
601} 625}
@@ -698,6 +722,23 @@ int __init add_preferred_console(char *name, int idx, char *options)
698} 722}
699 723
700/** 724/**
725 * suspend_console - suspend the console subsystem
726 *
727 * This disables printk() while we go into suspend states
728 */
729void suspend_console(void)
730{
731 acquire_console_sem();
732 console_suspended = 1;
733}
734
735void resume_console(void)
736{
737 console_suspended = 0;
738 release_console_sem();
739}
740
741/**
701 * acquire_console_sem - lock the console system for exclusive use. 742 * acquire_console_sem - lock the console system for exclusive use.
702 * 743 *
703 * Acquires a semaphore which guarantees that the caller has 744 * Acquires a semaphore which guarantees that the caller has
@@ -708,6 +749,10 @@ int __init add_preferred_console(char *name, int idx, char *options)
708void acquire_console_sem(void) 749void acquire_console_sem(void)
709{ 750{
710 BUG_ON(in_interrupt()); 751 BUG_ON(in_interrupt());
752 if (console_suspended) {
753 down(&secondary_console_sem);
754 return;
755 }
711 down(&console_sem); 756 down(&console_sem);
712 console_locked = 1; 757 console_locked = 1;
713 console_may_schedule = 1; 758 console_may_schedule = 1;
@@ -728,7 +773,7 @@ int is_console_locked(void)
728{ 773{
729 return console_locked; 774 return console_locked;
730} 775}
731EXPORT_SYMBOL(is_console_locked); 776EXPORT_UNUSED_SYMBOL(is_console_locked); /* June 2006 */
732 777
733/** 778/**
734 * release_console_sem - unlock the console system 779 * release_console_sem - unlock the console system
@@ -750,6 +795,10 @@ void release_console_sem(void)
750 unsigned long _con_start, _log_end; 795 unsigned long _con_start, _log_end;
751 unsigned long wake_klogd = 0; 796 unsigned long wake_klogd = 0;
752 797
798 if (console_suspended) {
799 up(&secondary_console_sem);
800 return;
801 }
753 for ( ; ; ) { 802 for ( ; ; ) {
754 spin_lock_irqsave(&logbuf_lock, flags); 803 spin_lock_irqsave(&logbuf_lock, flags);
755 wake_klogd |= log_start - log_end; 804 wake_klogd |= log_start - log_end;
@@ -766,8 +815,15 @@ void release_console_sem(void)
766 console_may_schedule = 0; 815 console_may_schedule = 0;
767 up(&console_sem); 816 up(&console_sem);
768 spin_unlock_irqrestore(&logbuf_lock, flags); 817 spin_unlock_irqrestore(&logbuf_lock, flags);
769 if (wake_klogd && !oops_in_progress && waitqueue_active(&log_wait)) 818 if (wake_klogd && !oops_in_progress && waitqueue_active(&log_wait)) {
770 wake_up_interruptible(&log_wait); 819 /*
820 * If we printk from within the lock dependency code,
821 * from within the scheduler code, then do not lock
822 * up due to self-recursion:
823 */
824 if (!lockdep_internal())
825 wake_up_interruptible(&log_wait);
826 }
771} 827}
772EXPORT_SYMBOL(release_console_sem); 828EXPORT_SYMBOL(release_console_sem);
773 829
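Note: the printk() rework keeps the log buffer filling at all times but only calls console drivers when the current CPU is online or the driver advertises, via a CON_ANYTIME-style flag, that it copes without per-cpu setup; have_callable_console() exists so the flush is not skipped when such a console is registered. A toy registry showing just that filtering (struct and flag names here are stand-ins, not the kernel's struct console):

#include <stdio.h>

#define TOY_CON_ENABLED  0x1
#define TOY_CON_ANYTIME  0x2	/* safe to call before the CPU is fully up */

struct toy_console {
	const char *name;
	unsigned int flags;
	void (*write)(const char *msg);
};

static void early_write(const char *msg) { printf("[early]  %s", msg); }
static void fancy_write(const char *msg) { printf("[fancy]  %s", msg); }

static struct toy_console consoles[] = {
	{ "earlycon", TOY_CON_ENABLED | TOY_CON_ANYTIME, early_write },
	{ "fancycon", TOY_CON_ENABLED,                   fancy_write },
};
#define NCONSOLES (sizeof(consoles) / sizeof(consoles[0]))

static void call_consoles(const char *msg, int cpu_online)
{
	for (unsigned int i = 0; i < NCONSOLES; i++) {
		struct toy_console *con = &consoles[i];

		if (!(con->flags & TOY_CON_ENABLED) || !con->write)
			continue;
		/* Mirror of the new check: offline CPU => ANYTIME consoles only. */
		if (cpu_online || (con->flags & TOY_CON_ANYTIME))
			con->write(msg);
	}
}

int main(void)
{
	call_consoles("boot message\n", 0);	/* only earlycon prints */
	call_consoles("later message\n", 1);	/* both print */
	return 0;
}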
diff --git a/kernel/profile.c b/kernel/profile.c
index 68afe121e507..d5bd75e7501c 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -13,7 +13,6 @@
13 * to resolve timer interrupt livelocks, William Irwin, Oracle, 2004 13 * to resolve timer interrupt livelocks, William Irwin, Oracle, 2004
14 */ 14 */
15 15
16#include <linux/config.h>
17#include <linux/module.h> 16#include <linux/module.h>
18#include <linux/profile.h> 17#include <linux/profile.h>
19#include <linux/bootmem.h> 18#include <linux/bootmem.h>
@@ -299,7 +298,7 @@ out:
299} 298}
300 299
301#ifdef CONFIG_HOTPLUG_CPU 300#ifdef CONFIG_HOTPLUG_CPU
302static int profile_cpu_callback(struct notifier_block *info, 301static int __devinit profile_cpu_callback(struct notifier_block *info,
303 unsigned long action, void *__cpu) 302 unsigned long action, void *__cpu)
304{ 303{
305 int node, cpu = (unsigned long)__cpu; 304 int node, cpu = (unsigned long)__cpu;
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 921c22ad16e4..9a111f70145c 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -28,7 +28,7 @@
28 * 28 *
29 * Must be called with the tasklist lock write-held. 29 * Must be called with the tasklist lock write-held.
30 */ 30 */
31void __ptrace_link(task_t *child, task_t *new_parent) 31void __ptrace_link(struct task_struct *child, struct task_struct *new_parent)
32{ 32{
33 BUG_ON(!list_empty(&child->ptrace_list)); 33 BUG_ON(!list_empty(&child->ptrace_list));
34 if (child->parent == new_parent) 34 if (child->parent == new_parent)
@@ -46,7 +46,7 @@ void __ptrace_link(task_t *child, task_t *new_parent)
46 * TASK_TRACED, resume it now. 46 * TASK_TRACED, resume it now.
47 * Requires that irqs be disabled. 47 * Requires that irqs be disabled.
48 */ 48 */
49void ptrace_untrace(task_t *child) 49void ptrace_untrace(struct task_struct *child)
50{ 50{
51 spin_lock(&child->sighand->siglock); 51 spin_lock(&child->sighand->siglock);
52 if (child->state == TASK_TRACED) { 52 if (child->state == TASK_TRACED) {
@@ -65,7 +65,7 @@ void ptrace_untrace(task_t *child)
65 * 65 *
66 * Must be called with the tasklist lock write-held. 66 * Must be called with the tasklist lock write-held.
67 */ 67 */
68void __ptrace_unlink(task_t *child) 68void __ptrace_unlink(struct task_struct *child)
69{ 69{
70 BUG_ON(!child->ptrace); 70 BUG_ON(!child->ptrace);
71 71
@@ -120,8 +120,18 @@ int ptrace_check_attach(struct task_struct *child, int kill)
120 120
121static int may_attach(struct task_struct *task) 121static int may_attach(struct task_struct *task)
122{ 122{
123 if (!task->mm) 123 /* May we inspect the given task?
124 return -EPERM; 124 * This check is used both for attaching with ptrace
125 * and for allowing access to sensitive information in /proc.
126 *
127 * ptrace_attach denies several cases that /proc allows
128 * because setting up the necessary parent/child relationship
129 * or halting the specified task is impossible.
130 */
131 int dumpable = 0;
132 /* Don't let security modules deny introspection */
133 if (task == current)
134 return 0;
125 if (((current->uid != task->euid) || 135 if (((current->uid != task->euid) ||
126 (current->uid != task->suid) || 136 (current->uid != task->suid) ||
127 (current->uid != task->uid) || 137 (current->uid != task->uid) ||
@@ -130,7 +140,9 @@ static int may_attach(struct task_struct *task)
130 (current->gid != task->gid)) && !capable(CAP_SYS_PTRACE)) 140 (current->gid != task->gid)) && !capable(CAP_SYS_PTRACE))
131 return -EPERM; 141 return -EPERM;
132 smp_rmb(); 142 smp_rmb();
133 if (!task->mm->dumpable && !capable(CAP_SYS_PTRACE)) 143 if (task->mm)
144 dumpable = task->mm->dumpable;
145 if (!dumpable && !capable(CAP_SYS_PTRACE))
134 return -EPERM; 146 return -EPERM;
135 147
136 return security_ptrace(current, task); 148 return security_ptrace(current, task);
@@ -176,6 +188,8 @@ repeat:
176 goto repeat; 188 goto repeat;
177 } 189 }
178 190
191 if (!task->mm)
192 goto bad;
179 /* the same process cannot be attached many times */ 193 /* the same process cannot be attached many times */
180 if (task->ptrace & PT_PTRACED) 194 if (task->ptrace & PT_PTRACED)
181 goto bad; 195 goto bad;
@@ -200,7 +214,7 @@ out:
200 return retval; 214 return retval;
201} 215}
202 216
203void __ptrace_detach(struct task_struct *child, unsigned int data) 217static inline void __ptrace_detach(struct task_struct *child, unsigned int data)
204{ 218{
205 child->exit_code = data; 219 child->exit_code = data;
206 /* .. re-parent .. */ 220 /* .. re-parent .. */
@@ -219,6 +233,7 @@ int ptrace_detach(struct task_struct *child, unsigned int data)
219 ptrace_disable(child); 233 ptrace_disable(child);
220 234
221 write_lock_irq(&tasklist_lock); 235 write_lock_irq(&tasklist_lock);
236 /* protect against de_thread()->release_task() */
222 if (child->ptrace) 237 if (child->ptrace)
223 __ptrace_detach(child, data); 238 __ptrace_detach(child, data);
224 write_unlock_irq(&tasklist_lock); 239 write_unlock_irq(&tasklist_lock);
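Note: the reworked may_attach() serves both ptrace_attach() and /proc permission checks, so the order of tests matters: self-inspection always passes, then the uid/gid match or CAP_SYS_PTRACE, and only then dumpable, which defaults to "not dumpable" when the task has no mm so kernel threads stay off limits. A plain-C distillation of that ordering with stubbed fields (the real code finishes with a security_ptrace() LSM call, omitted here):

#include <stdio.h>

struct toy_task {
	int uid;
	int dumpable;
	int has_mm;	/* 0 for kernel threads */
};

/* Returns 0 if 'tracer' may inspect 'target', -1 otherwise. */
static int may_inspect(const struct toy_task *tracer,
		       const struct toy_task *target, int tracer_is_privileged)
{
	if (tracer == target)
		return 0;			/* introspection is always fine */
	if (tracer->uid != target->uid && !tracer_is_privileged)
		return -1;			/* not your task, no capability */

	/* No mm (kernel thread) counts as "not dumpable" by default. */
	int dumpable = target->has_mm ? target->dumpable : 0;

	if (!dumpable && !tracer_is_privileged)
		return -1;
	return 0;
}

int main(void)
{
	struct toy_task me = { 1000, 1, 1 };
	struct toy_task kthread = { 0, 0, 0 };

	printf("self:    %d\n", may_inspect(&me, &me, 0));	/* 0 */
	printf("kthread: %d\n", may_inspect(&me, &kthread, 0));	/* -1 */
	return 0;
}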
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 2058f88c7bbb..759805c9859a 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -53,13 +53,13 @@
53static struct rcu_ctrlblk rcu_ctrlblk = { 53static struct rcu_ctrlblk rcu_ctrlblk = {
54 .cur = -300, 54 .cur = -300,
55 .completed = -300, 55 .completed = -300,
56 .lock = SPIN_LOCK_UNLOCKED, 56 .lock = __SPIN_LOCK_UNLOCKED(&rcu_ctrlblk.lock),
57 .cpumask = CPU_MASK_NONE, 57 .cpumask = CPU_MASK_NONE,
58}; 58};
59static struct rcu_ctrlblk rcu_bh_ctrlblk = { 59static struct rcu_ctrlblk rcu_bh_ctrlblk = {
60 .cur = -300, 60 .cur = -300,
61 .completed = -300, 61 .completed = -300,
62 .lock = SPIN_LOCK_UNLOCKED, 62 .lock = __SPIN_LOCK_UNLOCKED(&rcu_bh_ctrlblk.lock),
63 .cpumask = CPU_MASK_NONE, 63 .cpumask = CPU_MASK_NONE,
64}; 64};
65 65
@@ -182,6 +182,15 @@ long rcu_batches_completed(void)
182 return rcu_ctrlblk.completed; 182 return rcu_ctrlblk.completed;
183} 183}
184 184
185/*
186 * Return the number of RCU batches processed thus far. Useful
187 * for debug and statistics.
188 */
189long rcu_batches_completed_bh(void)
190{
191 return rcu_bh_ctrlblk.completed;
192}
193
185static void rcu_barrier_callback(struct rcu_head *notused) 194static void rcu_barrier_callback(struct rcu_head *notused)
186{ 195{
187 if (atomic_dec_and_test(&rcu_barrier_cpu_count)) 196 if (atomic_dec_and_test(&rcu_barrier_cpu_count))
@@ -539,7 +548,7 @@ static void __devinit rcu_online_cpu(int cpu)
539 tasklet_init(&per_cpu(rcu_tasklet, cpu), rcu_process_callbacks, 0UL); 548 tasklet_init(&per_cpu(rcu_tasklet, cpu), rcu_process_callbacks, 0UL);
540} 549}
541 550
542static int rcu_cpu_notify(struct notifier_block *self, 551static int __devinit rcu_cpu_notify(struct notifier_block *self,
543 unsigned long action, void *hcpu) 552 unsigned long action, void *hcpu)
544{ 553{
545 long cpu = (long)hcpu; 554 long cpu = (long)hcpu;
@@ -556,7 +565,7 @@ static int rcu_cpu_notify(struct notifier_block *self,
556 return NOTIFY_OK; 565 return NOTIFY_OK;
557} 566}
558 567
559static struct notifier_block rcu_nb = { 568static struct notifier_block __devinitdata rcu_nb = {
560 .notifier_call = rcu_cpu_notify, 569 .notifier_call = rcu_cpu_notify,
561}; 570};
562 571
@@ -612,14 +621,6 @@ void synchronize_rcu(void)
612 wait_for_completion(&rcu.completion); 621 wait_for_completion(&rcu.completion);
613} 622}
614 623
615/*
616 * Deprecated, use synchronize_rcu() or synchronize_sched() instead.
617 */
618void synchronize_kernel(void)
619{
620 synchronize_rcu();
621}
622
623module_param(blimit, int, 0); 624module_param(blimit, int, 0);
624module_param(qhimark, int, 0); 625module_param(qhimark, int, 0);
625module_param(qlowmark, int, 0); 626module_param(qlowmark, int, 0);
@@ -627,7 +628,7 @@ module_param(qlowmark, int, 0);
627module_param(rsinterval, int, 0); 628module_param(rsinterval, int, 0);
628#endif 629#endif
629EXPORT_SYMBOL_GPL(rcu_batches_completed); 630EXPORT_SYMBOL_GPL(rcu_batches_completed);
630EXPORT_SYMBOL_GPL_FUTURE(call_rcu); /* WARNING: GPL-only in April 2006. */ 631EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
631EXPORT_SYMBOL_GPL_FUTURE(call_rcu_bh); /* WARNING: GPL-only in April 2006. */ 632EXPORT_SYMBOL_GPL(call_rcu);
633EXPORT_SYMBOL_GPL(call_rcu_bh);
632EXPORT_SYMBOL_GPL(synchronize_rcu); 634EXPORT_SYMBOL_GPL(synchronize_rcu);
633EXPORT_SYMBOL_GPL_FUTURE(synchronize_kernel); /* WARNING: GPL-only in April 2006. */
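Note: rcu_batches_completed_bh() mirrors rcu_batches_completed() for the bh flavour; both exist so test and diagnostic code can sample the counter, do some work, and then ask whether at least one grace period completed in between. A hedged sketch of that sampling pattern against a stand-in counter (not the kernel API):

#include <stdio.h>

/* Stand-in for rcu_batches_completed(): a monotonically increasing count
 * of finished grace periods. */
static long completed_batches;

static long batches_completed(void)
{
	return completed_batches;
}

static void fake_grace_period(void)
{
	completed_batches++;
}

int main(void)
{
	long before = batches_completed();

	fake_grace_period();	/* pretend the system went through one GP */

	if (batches_completed() - before > 0)
		printf("at least one grace period elapsed (%ld -> %ld)\n",
		       before, batches_completed());
	return 0;
}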
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 8154e7589d12..4d1c3d247127 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Read-Copy Update /proc-based torture test facility 2 * Read-Copy Update module-based torture test facility
3 * 3 *
4 * This program is free software; you can redistribute it and/or modify 4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by 5 * it under the terms of the GNU General Public License as published by
@@ -53,6 +53,7 @@ static int stat_interval; /* Interval between stats, in seconds. */
53static int verbose; /* Print more debug info. */ 53static int verbose; /* Print more debug info. */
54static int test_no_idle_hz; /* Test RCU's support for tickless idle CPUs. */ 54static int test_no_idle_hz; /* Test RCU's support for tickless idle CPUs. */
55static int shuffle_interval = 5; /* Interval between shuffles (in sec)*/ 55static int shuffle_interval = 5; /* Interval between shuffles (in sec)*/
56static char *torture_type = "rcu"; /* What to torture. */
56 57
57module_param(nreaders, int, 0); 58module_param(nreaders, int, 0);
58MODULE_PARM_DESC(nreaders, "Number of RCU reader threads"); 59MODULE_PARM_DESC(nreaders, "Number of RCU reader threads");
@@ -64,13 +65,16 @@ module_param(test_no_idle_hz, bool, 0);
64MODULE_PARM_DESC(test_no_idle_hz, "Test support for tickless idle CPUs"); 65MODULE_PARM_DESC(test_no_idle_hz, "Test support for tickless idle CPUs");
65module_param(shuffle_interval, int, 0); 66module_param(shuffle_interval, int, 0);
66MODULE_PARM_DESC(shuffle_interval, "Number of seconds between shuffles"); 67MODULE_PARM_DESC(shuffle_interval, "Number of seconds between shuffles");
67#define TORTURE_FLAG "rcutorture: " 68module_param(torture_type, charp, 0);
69MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh)");
70
71#define TORTURE_FLAG "-torture:"
68#define PRINTK_STRING(s) \ 72#define PRINTK_STRING(s) \
69 do { printk(KERN_ALERT TORTURE_FLAG s "\n"); } while (0) 73 do { printk(KERN_ALERT "%s" TORTURE_FLAG s "\n", torture_type); } while (0)
70#define VERBOSE_PRINTK_STRING(s) \ 74#define VERBOSE_PRINTK_STRING(s) \
71 do { if (verbose) printk(KERN_ALERT TORTURE_FLAG s "\n"); } while (0) 75 do { if (verbose) printk(KERN_ALERT "%s" TORTURE_FLAG s "\n", torture_type); } while (0)
72#define VERBOSE_PRINTK_ERRSTRING(s) \ 76#define VERBOSE_PRINTK_ERRSTRING(s) \
73 do { if (verbose) printk(KERN_ALERT TORTURE_FLAG "!!! " s "\n"); } while (0) 77 do { if (verbose) printk(KERN_ALERT "%s" TORTURE_FLAG "!!! " s "\n", torture_type); } while (0)
74 78
75static char printk_buf[4096]; 79static char printk_buf[4096];
76 80
@@ -139,28 +143,6 @@ rcu_torture_free(struct rcu_torture *p)
139 spin_unlock_bh(&rcu_torture_lock); 143 spin_unlock_bh(&rcu_torture_lock);
140} 144}
141 145
142static void
143rcu_torture_cb(struct rcu_head *p)
144{
145 int i;
146 struct rcu_torture *rp = container_of(p, struct rcu_torture, rtort_rcu);
147
148 if (fullstop) {
149 /* Test is ending, just drop callbacks on the floor. */
150 /* The next initialization will pick up the pieces. */
151 return;
152 }
153 i = rp->rtort_pipe_count;
154 if (i > RCU_TORTURE_PIPE_LEN)
155 i = RCU_TORTURE_PIPE_LEN;
156 atomic_inc(&rcu_torture_wcount[i]);
157 if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) {
158 rp->rtort_mbtest = 0;
159 rcu_torture_free(rp);
160 } else
161 call_rcu(p, rcu_torture_cb);
162}
163
164struct rcu_random_state { 146struct rcu_random_state {
165 unsigned long rrs_state; 147 unsigned long rrs_state;
166 unsigned long rrs_count; 148 unsigned long rrs_count;
@@ -191,6 +173,119 @@ rcu_random(struct rcu_random_state *rrsp)
191} 173}
192 174
193/* 175/*
176 * Operations vector for selecting different types of tests.
177 */
178
179struct rcu_torture_ops {
180 void (*init)(void);
181 void (*cleanup)(void);
182 int (*readlock)(void);
183 void (*readunlock)(int idx);
184 int (*completed)(void);
185 void (*deferredfree)(struct rcu_torture *p);
186 int (*stats)(char *page);
187 char *name;
188};
189static struct rcu_torture_ops *cur_ops = NULL;
190
191/*
192 * Definitions for rcu torture testing.
193 */
194
195static int rcu_torture_read_lock(void)
196{
197 rcu_read_lock();
198 return 0;
199}
200
201static void rcu_torture_read_unlock(int idx)
202{
203 rcu_read_unlock();
204}
205
206static int rcu_torture_completed(void)
207{
208 return rcu_batches_completed();
209}
210
211static void
212rcu_torture_cb(struct rcu_head *p)
213{
214 int i;
215 struct rcu_torture *rp = container_of(p, struct rcu_torture, rtort_rcu);
216
217 if (fullstop) {
218 /* Test is ending, just drop callbacks on the floor. */
219 /* The next initialization will pick up the pieces. */
220 return;
221 }
222 i = rp->rtort_pipe_count;
223 if (i > RCU_TORTURE_PIPE_LEN)
224 i = RCU_TORTURE_PIPE_LEN;
225 atomic_inc(&rcu_torture_wcount[i]);
226 if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) {
227 rp->rtort_mbtest = 0;
228 rcu_torture_free(rp);
229 } else
230 cur_ops->deferredfree(rp);
231}
232
233static void rcu_torture_deferred_free(struct rcu_torture *p)
234{
235 call_rcu(&p->rtort_rcu, rcu_torture_cb);
236}
237
238static struct rcu_torture_ops rcu_ops = {
239 .init = NULL,
240 .cleanup = NULL,
241 .readlock = rcu_torture_read_lock,
242 .readunlock = rcu_torture_read_unlock,
243 .completed = rcu_torture_completed,
244 .deferredfree = rcu_torture_deferred_free,
245 .stats = NULL,
246 .name = "rcu"
247};
248
249/*
250 * Definitions for rcu_bh torture testing.
251 */
252
253static int rcu_bh_torture_read_lock(void)
254{
255 rcu_read_lock_bh();
256 return 0;
257}
258
259static void rcu_bh_torture_read_unlock(int idx)
260{
261 rcu_read_unlock_bh();
262}
263
264static int rcu_bh_torture_completed(void)
265{
266 return rcu_batches_completed_bh();
267}
268
269static void rcu_bh_torture_deferred_free(struct rcu_torture *p)
270{
271 call_rcu_bh(&p->rtort_rcu, rcu_torture_cb);
272}
273
274static struct rcu_torture_ops rcu_bh_ops = {
275 .init = NULL,
276 .cleanup = NULL,
277 .readlock = rcu_bh_torture_read_lock,
278 .readunlock = rcu_bh_torture_read_unlock,
279 .completed = rcu_bh_torture_completed,
280 .deferredfree = rcu_bh_torture_deferred_free,
281 .stats = NULL,
282 .name = "rcu_bh"
283};
284
285static struct rcu_torture_ops *torture_ops[] =
286 { &rcu_ops, &rcu_bh_ops, NULL };
287
288/*
194 * RCU torture writer kthread. Repeatedly substitutes a new structure 289 * RCU torture writer kthread. Repeatedly substitutes a new structure
195 * for that pointed to by rcu_torture_current, freeing the old structure 290 * for that pointed to by rcu_torture_current, freeing the old structure
196 * after a series of grace periods (the "pipeline"). 291 * after a series of grace periods (the "pipeline").
@@ -209,8 +304,6 @@ rcu_torture_writer(void *arg)
209 304
210 do { 305 do {
211 schedule_timeout_uninterruptible(1); 306 schedule_timeout_uninterruptible(1);
212 if (rcu_batches_completed() == oldbatch)
213 continue;
214 if ((rp = rcu_torture_alloc()) == NULL) 307 if ((rp = rcu_torture_alloc()) == NULL)
215 continue; 308 continue;
216 rp->rtort_pipe_count = 0; 309 rp->rtort_pipe_count = 0;
@@ -225,10 +318,10 @@ rcu_torture_writer(void *arg)
225 i = RCU_TORTURE_PIPE_LEN; 318 i = RCU_TORTURE_PIPE_LEN;
226 atomic_inc(&rcu_torture_wcount[i]); 319 atomic_inc(&rcu_torture_wcount[i]);
227 old_rp->rtort_pipe_count++; 320 old_rp->rtort_pipe_count++;
228 call_rcu(&old_rp->rtort_rcu, rcu_torture_cb); 321 cur_ops->deferredfree(old_rp);
229 } 322 }
230 rcu_torture_current_version++; 323 rcu_torture_current_version++;
231 oldbatch = rcu_batches_completed(); 324 oldbatch = cur_ops->completed();
232 } while (!kthread_should_stop() && !fullstop); 325 } while (!kthread_should_stop() && !fullstop);
233 VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping"); 326 VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping");
234 while (!kthread_should_stop()) 327 while (!kthread_should_stop())
@@ -246,6 +339,7 @@ static int
246rcu_torture_reader(void *arg) 339rcu_torture_reader(void *arg)
247{ 340{
248 int completed; 341 int completed;
342 int idx;
249 DEFINE_RCU_RANDOM(rand); 343 DEFINE_RCU_RANDOM(rand);
250 struct rcu_torture *p; 344 struct rcu_torture *p;
251 int pipe_count; 345 int pipe_count;
@@ -254,12 +348,12 @@ rcu_torture_reader(void *arg)
254 set_user_nice(current, 19); 348 set_user_nice(current, 19);
255 349
256 do { 350 do {
257 rcu_read_lock(); 351 idx = cur_ops->readlock();
258 completed = rcu_batches_completed(); 352 completed = cur_ops->completed();
259 p = rcu_dereference(rcu_torture_current); 353 p = rcu_dereference(rcu_torture_current);
260 if (p == NULL) { 354 if (p == NULL) {
261 /* Wait for rcu_torture_writer to get underway */ 355 /* Wait for rcu_torture_writer to get underway */
262 rcu_read_unlock(); 356 cur_ops->readunlock(idx);
263 schedule_timeout_interruptible(HZ); 357 schedule_timeout_interruptible(HZ);
264 continue; 358 continue;
265 } 359 }
@@ -273,14 +367,14 @@ rcu_torture_reader(void *arg)
273 pipe_count = RCU_TORTURE_PIPE_LEN; 367 pipe_count = RCU_TORTURE_PIPE_LEN;
274 } 368 }
275 ++__get_cpu_var(rcu_torture_count)[pipe_count]; 369 ++__get_cpu_var(rcu_torture_count)[pipe_count];
276 completed = rcu_batches_completed() - completed; 370 completed = cur_ops->completed() - completed;
277 if (completed > RCU_TORTURE_PIPE_LEN) { 371 if (completed > RCU_TORTURE_PIPE_LEN) {
278 /* Should not happen, but... */ 372 /* Should not happen, but... */
279 completed = RCU_TORTURE_PIPE_LEN; 373 completed = RCU_TORTURE_PIPE_LEN;
280 } 374 }
281 ++__get_cpu_var(rcu_torture_batch)[completed]; 375 ++__get_cpu_var(rcu_torture_batch)[completed];
282 preempt_enable(); 376 preempt_enable();
283 rcu_read_unlock(); 377 cur_ops->readunlock(idx);
284 schedule(); 378 schedule();
285 } while (!kthread_should_stop() && !fullstop); 379 } while (!kthread_should_stop() && !fullstop);
286 VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping"); 380 VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping");
@@ -311,7 +405,7 @@ rcu_torture_printk(char *page)
311 if (pipesummary[i] != 0) 405 if (pipesummary[i] != 0)
312 break; 406 break;
313 } 407 }
314 cnt += sprintf(&page[cnt], "rcutorture: "); 408 cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG);
315 cnt += sprintf(&page[cnt], 409 cnt += sprintf(&page[cnt],
316 "rtc: %p ver: %ld tfle: %d rta: %d rtaf: %d rtf: %d " 410 "rtc: %p ver: %ld tfle: %d rta: %d rtaf: %d rtf: %d "
317 "rtmbe: %d", 411 "rtmbe: %d",
@@ -324,7 +418,7 @@ rcu_torture_printk(char *page)
324 atomic_read(&n_rcu_torture_mberror)); 418 atomic_read(&n_rcu_torture_mberror));
325 if (atomic_read(&n_rcu_torture_mberror) != 0) 419 if (atomic_read(&n_rcu_torture_mberror) != 0)
326 cnt += sprintf(&page[cnt], " !!!"); 420 cnt += sprintf(&page[cnt], " !!!");
327 cnt += sprintf(&page[cnt], "\nrcutorture: "); 421 cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG);
328 if (i > 1) { 422 if (i > 1) {
329 cnt += sprintf(&page[cnt], "!!! "); 423 cnt += sprintf(&page[cnt], "!!! ");
330 atomic_inc(&n_rcu_torture_error); 424 atomic_inc(&n_rcu_torture_error);
@@ -332,17 +426,19 @@ rcu_torture_printk(char *page)
332 cnt += sprintf(&page[cnt], "Reader Pipe: "); 426 cnt += sprintf(&page[cnt], "Reader Pipe: ");
333 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) 427 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++)
334 cnt += sprintf(&page[cnt], " %ld", pipesummary[i]); 428 cnt += sprintf(&page[cnt], " %ld", pipesummary[i]);
335 cnt += sprintf(&page[cnt], "\nrcutorture: "); 429 cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG);
336 cnt += sprintf(&page[cnt], "Reader Batch: "); 430 cnt += sprintf(&page[cnt], "Reader Batch: ");
337 for (i = 0; i < RCU_TORTURE_PIPE_LEN; i++) 431 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++)
338 cnt += sprintf(&page[cnt], " %ld", batchsummary[i]); 432 cnt += sprintf(&page[cnt], " %ld", batchsummary[i]);
339 cnt += sprintf(&page[cnt], "\nrcutorture: "); 433 cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG);
340 cnt += sprintf(&page[cnt], "Free-Block Circulation: "); 434 cnt += sprintf(&page[cnt], "Free-Block Circulation: ");
341 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { 435 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) {
342 cnt += sprintf(&page[cnt], " %d", 436 cnt += sprintf(&page[cnt], " %d",
343 atomic_read(&rcu_torture_wcount[i])); 437 atomic_read(&rcu_torture_wcount[i]));
344 } 438 }
345 cnt += sprintf(&page[cnt], "\n"); 439 cnt += sprintf(&page[cnt], "\n");
440 if (cur_ops->stats != NULL)
441 cnt += cur_ops->stats(&page[cnt]);
346 return cnt; 442 return cnt;
347} 443}
348 444
@@ -444,11 +540,11 @@ rcu_torture_shuffle(void *arg)
444static inline void 540static inline void
445rcu_torture_print_module_parms(char *tag) 541rcu_torture_print_module_parms(char *tag)
446{ 542{
447 printk(KERN_ALERT TORTURE_FLAG "--- %s: nreaders=%d " 543 printk(KERN_ALERT "%s" TORTURE_FLAG "--- %s: nreaders=%d "
448 "stat_interval=%d verbose=%d test_no_idle_hz=%d " 544 "stat_interval=%d verbose=%d test_no_idle_hz=%d "
449 "shuffle_interval = %d\n", 545 "shuffle_interval = %d\n",
450 tag, nrealreaders, stat_interval, verbose, test_no_idle_hz, 546 torture_type, tag, nrealreaders, stat_interval, verbose,
451 shuffle_interval); 547 test_no_idle_hz, shuffle_interval);
452} 548}
453 549
454static void 550static void
@@ -493,6 +589,9 @@ rcu_torture_cleanup(void)
493 rcu_barrier(); 589 rcu_barrier();
494 590
495 rcu_torture_stats_print(); /* -After- the stats thread is stopped! */ 591 rcu_torture_stats_print(); /* -After- the stats thread is stopped! */
592
593 if (cur_ops->cleanup != NULL)
594 cur_ops->cleanup();
496 if (atomic_read(&n_rcu_torture_error)) 595 if (atomic_read(&n_rcu_torture_error))
497 rcu_torture_print_module_parms("End of test: FAILURE"); 596 rcu_torture_print_module_parms("End of test: FAILURE");
498 else 597 else
@@ -508,6 +607,20 @@ rcu_torture_init(void)
508 607
509 /* Process args and tell the world that the torturer is on the job. */ 608 /* Process args and tell the world that the torturer is on the job. */
510 609
610 for (i = 0; cur_ops = torture_ops[i], cur_ops != NULL; i++) {
611 cur_ops = torture_ops[i];
612 if (strcmp(torture_type, cur_ops->name) == 0) {
613 break;
614 }
615 }
616 if (cur_ops == NULL) {
617 printk(KERN_ALERT "rcutorture: invalid torture type: \"%s\"\n",
618 torture_type);
619 return (-EINVAL);
620 }
621 if (cur_ops->init != NULL)
622 cur_ops->init(); /* no "goto unwind" prior to this point!!! */
623
511 if (nreaders >= 0) 624 if (nreaders >= 0)
512 nrealreaders = nreaders; 625 nrealreaders = nreaders;
513 else 626 else
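The selection loop above walks a NULL-terminated table of ops pointers and matches the torture_type module parameter against each entry's ->name. A hedged sketch of how such a table could be wired up, building on the struct sketched earlier in the rcutorture hunks (the table and the entry names here are illustrative, not the kernel's definitions):

#include <linux/string.h>

static struct rcu_torture_ops_sketch rcu_sketch_ops = {
        .readlock   = rcu_sketch_readlock,
        .readunlock = rcu_sketch_readunlock,
        .completed  = rcu_sketch_completed,
        .name       = "rcu",            /* selected with torture_type="rcu" */
};

/* NULL-terminated, as the init loop above expects. */
static struct rcu_torture_ops_sketch *torture_ops_sketch[] = {
        &rcu_sketch_ops,
        /* further flavors (e.g. a BH or SRCU variant) would be added here */
        NULL,
};

static struct rcu_torture_ops_sketch *pick_ops_sketch(const char *type)
{
        int i;

        for (i = 0; torture_ops_sketch[i] != NULL; i++)
                if (strcmp(type, torture_ops_sketch[i]->name) == 0)
                        return torture_ops_sketch[i];
        return NULL;    /* caller reports "invalid torture type" and bails out */
}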
diff --git a/kernel/resource.c b/kernel/resource.c
index e3080fcc66a3..0dd3a857579e 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -7,7 +7,6 @@
7 * Arbitrary resource management. 7 * Arbitrary resource management.
8 */ 8 */
9 9
10#include <linux/config.h>
11#include <linux/module.h> 10#include <linux/module.h>
12#include <linux/sched.h> 11#include <linux/sched.h>
13#include <linux/errno.h> 12#include <linux/errno.h>
@@ -23,20 +22,18 @@
23 22
24struct resource ioport_resource = { 23struct resource ioport_resource = {
25 .name = "PCI IO", 24 .name = "PCI IO",
26 .start = 0x0000, 25 .start = 0,
27 .end = IO_SPACE_LIMIT, 26 .end = IO_SPACE_LIMIT,
28 .flags = IORESOURCE_IO, 27 .flags = IORESOURCE_IO,
29}; 28};
30
31EXPORT_SYMBOL(ioport_resource); 29EXPORT_SYMBOL(ioport_resource);
32 30
33struct resource iomem_resource = { 31struct resource iomem_resource = {
34 .name = "PCI mem", 32 .name = "PCI mem",
35 .start = 0UL, 33 .start = 0,
36 .end = ~0UL, 34 .end = -1,
37 .flags = IORESOURCE_MEM, 35 .flags = IORESOURCE_MEM,
38}; 36};
39
40EXPORT_SYMBOL(iomem_resource); 37EXPORT_SYMBOL(iomem_resource);
41 38
42static DEFINE_RWLOCK(resource_lock); 39static DEFINE_RWLOCK(resource_lock);
@@ -83,10 +80,10 @@ static int r_show(struct seq_file *m, void *v)
83 for (depth = 0, p = r; depth < MAX_IORES_LEVEL; depth++, p = p->parent) 80 for (depth = 0, p = r; depth < MAX_IORES_LEVEL; depth++, p = p->parent)
84 if (p->parent == root) 81 if (p->parent == root)
85 break; 82 break;
86 seq_printf(m, "%*s%0*lx-%0*lx : %s\n", 83 seq_printf(m, "%*s%0*llx-%0*llx : %s\n",
87 depth * 2, "", 84 depth * 2, "",
88 width, r->start, 85 width, (unsigned long long) r->start,
89 width, r->end, 86 width, (unsigned long long) r->end,
90 r->name ? r->name : "<BAD>"); 87 r->name ? r->name : "<BAD>");
91 return 0; 88 return 0;
92} 89}
@@ -151,8 +148,8 @@ __initcall(ioresources_init);
151/* Return the conflict entry if you can't request it */ 148/* Return the conflict entry if you can't request it */
152static struct resource * __request_resource(struct resource *root, struct resource *new) 149static struct resource * __request_resource(struct resource *root, struct resource *new)
153{ 150{
154 unsigned long start = new->start; 151 resource_size_t start = new->start;
155 unsigned long end = new->end; 152 resource_size_t end = new->end;
156 struct resource *tmp, **p; 153 struct resource *tmp, **p;
157 154
158 if (end < start) 155 if (end < start)
@@ -232,15 +229,52 @@ int release_resource(struct resource *old)
232 229
233EXPORT_SYMBOL(release_resource); 230EXPORT_SYMBOL(release_resource);
234 231
232#ifdef CONFIG_MEMORY_HOTPLUG
233/*
234 * Finds the lowest memory resource that exists within [res->start, res->end).
235 * The caller must specify res->start, res->end, and res->flags.
236 * If found, returns 0 and overwrites res; if not found, returns -1.
237 */
238int find_next_system_ram(struct resource *res)
239{
240 resource_size_t start, end;
241 struct resource *p;
242
243 BUG_ON(!res);
244
245 start = res->start;
246 end = res->end;
247
248 read_lock(&resource_lock);
249 for (p = iomem_resource.child; p ; p = p->sibling) {
250 /* system ram is just marked as IORESOURCE_MEM */
251 if (p->flags != res->flags)
252 continue;
253 if (p->start > end) {
254 p = NULL;
255 break;
256 }
257 if (p->start >= start)
258 break;
259 }
260 read_unlock(&resource_lock);
261 if (!p)
262 return -1;
263 /* copy data */
264 res->start = p->start;
265 res->end = p->end;
266 return 0;
267}
268#endif
269
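find_next_system_ram() returns the first child of iomem_resource with matching flags whose start lies inside the requested window. Below is a hedged sketch of a memory-hotplug style caller that walks every such range; the callback, the prototype placement, and the exact flags a real caller needs are assumptions for illustration.

#include <linux/ioport.h>

/* Prototype for the helper added above; where it is declared in the
 * headers is an assumption here. */
extern int find_next_system_ram(struct resource *res);

/* Hand each System RAM range that begins inside [start, end) to func().
 * IORESOURCE_MEM follows the comment in find_next_system_ram(); real
 * System RAM entries may carry additional flag bits. */
static int walk_system_ram_sketch(resource_size_t start, resource_size_t end,
                                  int (*func)(resource_size_t s, resource_size_t e))
{
        struct resource res;
        int ret = 0;

        res.start = start;
        res.end = end - 1;
        res.flags = IORESOURCE_MEM;

        while (res.start < end && find_next_system_ram(&res) == 0) {
                if (res.end > end - 1)
                        res.end = end - 1;      /* clamp to the requested window */
                ret = func(res.start, res.end);
                if (ret)
                        break;
                res.start = res.end + 1;        /* resume after this range */
                res.end = end - 1;
                res.flags = IORESOURCE_MEM;
        }
        return ret;
}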
235/* 270/*
236 * Find empty slot in the resource tree given range and alignment. 271 * Find empty slot in the resource tree given range and alignment.
237 */ 272 */
238static int find_resource(struct resource *root, struct resource *new, 273static int find_resource(struct resource *root, struct resource *new,
239 unsigned long size, 274 resource_size_t size, resource_size_t min,
240 unsigned long min, unsigned long max, 275 resource_size_t max, resource_size_t align,
241 unsigned long align,
242 void (*alignf)(void *, struct resource *, 276 void (*alignf)(void *, struct resource *,
243 unsigned long, unsigned long), 277 resource_size_t, resource_size_t),
244 void *alignf_data) 278 void *alignf_data)
245{ 279{
246 struct resource *this = root->child; 280 struct resource *this = root->child;
@@ -282,11 +316,10 @@ static int find_resource(struct resource *root, struct resource *new,
282 * Allocate empty slot in the resource tree given range and alignment. 316 * Allocate empty slot in the resource tree given range and alignment.
283 */ 317 */
284int allocate_resource(struct resource *root, struct resource *new, 318int allocate_resource(struct resource *root, struct resource *new,
285 unsigned long size, 319 resource_size_t size, resource_size_t min,
286 unsigned long min, unsigned long max, 320 resource_size_t max, resource_size_t align,
287 unsigned long align,
288 void (*alignf)(void *, struct resource *, 321 void (*alignf)(void *, struct resource *,
289 unsigned long, unsigned long), 322 resource_size_t, resource_size_t),
290 void *alignf_data) 323 void *alignf_data)
291{ 324{
292 int err; 325 int err;
@@ -371,17 +404,15 @@ int insert_resource(struct resource *parent, struct resource *new)
371 return result; 404 return result;
372} 405}
373 406
374EXPORT_SYMBOL(insert_resource);
375
376/* 407/*
377 * Given an existing resource, change its start and size to match the 408 * Given an existing resource, change its start and size to match the
378 * arguments. Returns -EBUSY if it can't fit. Existing children of 409 * arguments. Returns -EBUSY if it can't fit. Existing children of
379 * the resource are assumed to be immutable. 410 * the resource are assumed to be immutable.
380 */ 411 */
381int adjust_resource(struct resource *res, unsigned long start, unsigned long size) 412int adjust_resource(struct resource *res, resource_size_t start, resource_size_t size)
382{ 413{
383 struct resource *tmp, *parent = res->parent; 414 struct resource *tmp, *parent = res->parent;
384 unsigned long end = start + size - 1; 415 resource_size_t end = start + size - 1;
385 int result = -EBUSY; 416 int result = -EBUSY;
386 417
387 write_lock(&resource_lock); 418 write_lock(&resource_lock);
@@ -428,7 +459,9 @@ EXPORT_SYMBOL(adjust_resource);
428 * 459 *
429 * Release-region releases a matching busy region. 460 * Release-region releases a matching busy region.
430 */ 461 */
431struct resource * __request_region(struct resource *parent, unsigned long start, unsigned long n, const char *name) 462struct resource * __request_region(struct resource *parent,
463 resource_size_t start, resource_size_t n,
464 const char *name)
432{ 465{
433 struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL); 466 struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL);
434 467
@@ -464,7 +497,8 @@ struct resource * __request_region(struct resource *parent, unsigned long start,
464 497
465EXPORT_SYMBOL(__request_region); 498EXPORT_SYMBOL(__request_region);
466 499
467int __check_region(struct resource *parent, unsigned long start, unsigned long n) 500int __check_region(struct resource *parent, resource_size_t start,
501 resource_size_t n)
468{ 502{
469 struct resource * res; 503 struct resource * res;
470 504
@@ -479,10 +513,11 @@ int __check_region(struct resource *parent, unsigned long start, unsigned long n
479 513
480EXPORT_SYMBOL(__check_region); 514EXPORT_SYMBOL(__check_region);
481 515
482void __release_region(struct resource *parent, unsigned long start, unsigned long n) 516void __release_region(struct resource *parent, resource_size_t start,
517 resource_size_t n)
483{ 518{
484 struct resource **p; 519 struct resource **p;
485 unsigned long end; 520 resource_size_t end;
486 521
487 p = &parent->child; 522 p = &parent->child;
488 end = start + n - 1; 523 end = start + n - 1;
@@ -511,7 +546,9 @@ void __release_region(struct resource *parent, unsigned long start, unsigned lon
511 546
512 write_unlock(&resource_lock); 547 write_unlock(&resource_lock);
513 548
514 printk(KERN_WARNING "Trying to free nonexistent resource <%08lx-%08lx>\n", start, end); 549 printk(KERN_WARNING "Trying to free nonexistent resource "
550 "<%016llx-%016llx>\n", (unsigned long long)start,
551 (unsigned long long)end);
515} 552}
516 553
517EXPORT_SYMBOL(__release_region); 554EXPORT_SYMBOL(__release_region);
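All of the resource.c interfaces above now take resource_size_t instead of unsigned long, so 32-bit kernels with 64-bit physical addressing can describe high resources. Callers should adopt the same printing convention the patch uses (cast to unsigned long long, print with %llx); a minimal sketch:

#include <linux/ioport.h>
#include <linux/kernel.h>

/* resource_size_t may be 32 or 64 bit depending on configuration, so
 * always cast before handing it to printk, as the hunks above do. */
static void print_resource_range(const struct resource *res)
{
        printk(KERN_INFO "%s: %016llx-%016llx\n",
               res->name ? res->name : "<BAD>",
               (unsigned long long)res->start,
               (unsigned long long)res->end);
}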
diff --git a/kernel/rtmutex-debug.c b/kernel/rtmutex-debug.c
new file mode 100644
index 000000000000..0c1faa950af7
--- /dev/null
+++ b/kernel/rtmutex-debug.c
@@ -0,0 +1,242 @@
1/*
2 * RT-Mutexes: blocking mutual exclusion locks with PI support
3 *
4 * started by Ingo Molnar and Thomas Gleixner:
5 *
6 * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
7 * Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
8 *
9 * This code is based on the rt.c implementation in the preempt-rt tree.
10 * Portions of said code are
11 *
12 * Copyright (C) 2004 LynuxWorks, Inc., Igor Manyilov, Bill Huey
13 * Copyright (C) 2006 Esben Nielsen
14 * Copyright (C) 2006 Kihon Technologies Inc.,
15 * Steven Rostedt <rostedt@goodmis.org>
16 *
17 * See rt.c in preempt-rt for proper credits and further information
18 */
19#include <linux/config.h>
20#include <linux/sched.h>
21#include <linux/delay.h>
22#include <linux/module.h>
23#include <linux/spinlock.h>
24#include <linux/kallsyms.h>
25#include <linux/syscalls.h>
26#include <linux/interrupt.h>
27#include <linux/plist.h>
28#include <linux/fs.h>
29#include <linux/debug_locks.h>
30
31#include "rtmutex_common.h"
32
33#ifdef CONFIG_DEBUG_RT_MUTEXES
34# include "rtmutex-debug.h"
35#else
36# include "rtmutex.h"
37#endif
38
39# define TRACE_WARN_ON(x) WARN_ON(x)
40# define TRACE_BUG_ON(x) BUG_ON(x)
41
42# define TRACE_OFF() \
43do { \
44 if (rt_trace_on) { \
45 rt_trace_on = 0; \
46 console_verbose(); \
47 if (spin_is_locked(&current->pi_lock)) \
48 spin_unlock(&current->pi_lock); \
49 } \
50} while (0)
51
52# define TRACE_OFF_NOLOCK() \
53do { \
54 if (rt_trace_on) { \
55 rt_trace_on = 0; \
56 console_verbose(); \
57 } \
58} while (0)
59
60# define TRACE_BUG_LOCKED() \
61do { \
62 TRACE_OFF(); \
63 BUG(); \
64} while (0)
65
66# define TRACE_WARN_ON_LOCKED(c) \
67do { \
68 if (unlikely(c)) { \
69 TRACE_OFF(); \
70 WARN_ON(1); \
71 } \
72} while (0)
73
74# define TRACE_BUG_ON_LOCKED(c) \
75do { \
76 if (unlikely(c)) \
77 TRACE_BUG_LOCKED(); \
78} while (0)
79
80#ifdef CONFIG_SMP
81# define SMP_TRACE_BUG_ON_LOCKED(c) TRACE_BUG_ON_LOCKED(c)
82#else
83# define SMP_TRACE_BUG_ON_LOCKED(c) do { } while (0)
84#endif
85
86/*
87 * deadlock detection flag. We turn it off when we detect
88 * the first problem because we don't want to recurse back
89 * into the tracing code when doing error printk or
90 * executing a BUG():
91 */
92int rt_trace_on = 1;
93
94void deadlock_trace_off(void)
95{
96 rt_trace_on = 0;
97}
98
99static void printk_task(struct task_struct *p)
100{
101 if (p)
102 printk("%16s:%5d [%p, %3d]", p->comm, p->pid, p, p->prio);
103 else
104 printk("<none>");
105}
106
107static void printk_lock(struct rt_mutex *lock, int print_owner)
108{
109 if (lock->name)
110 printk(" [%p] {%s}\n",
111 lock, lock->name);
112 else
113 printk(" [%p] {%s:%d}\n",
114 lock, lock->file, lock->line);
115
116 if (print_owner && rt_mutex_owner(lock)) {
117 printk(".. ->owner: %p\n", lock->owner);
118 printk(".. held by: ");
119 printk_task(rt_mutex_owner(lock));
120 printk("\n");
121 }
122}
123
124void rt_mutex_debug_task_free(struct task_struct *task)
125{
126 WARN_ON(!plist_head_empty(&task->pi_waiters));
127 WARN_ON(task->pi_blocked_on);
128}
129
130/*
131 * We fill out the fields in the waiter to store the information about
132 * the deadlock. We print when we return. act_waiter can be NULL in
133 * case of a remove waiter operation.
134 */
135void debug_rt_mutex_deadlock(int detect, struct rt_mutex_waiter *act_waiter,
136 struct rt_mutex *lock)
137{
138 struct task_struct *task;
139
140 if (!rt_trace_on || detect || !act_waiter)
141 return;
142
143 task = rt_mutex_owner(act_waiter->lock);
144 if (task && task != current) {
145 act_waiter->deadlock_task_pid = task->pid;
146 act_waiter->deadlock_lock = lock;
147 }
148}
149
150void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter)
151{
152 struct task_struct *task;
153
154 if (!waiter->deadlock_lock || !rt_trace_on)
155 return;
156
157 task = find_task_by_pid(waiter->deadlock_task_pid);
158 if (!task)
159 return;
160
161 TRACE_OFF_NOLOCK();
162
163 printk("\n============================================\n");
164 printk( "[ BUG: circular locking deadlock detected! ]\n");
165 printk( "--------------------------------------------\n");
166 printk("%s/%d is deadlocking current task %s/%d\n\n",
167 task->comm, task->pid, current->comm, current->pid);
168
169 printk("\n1) %s/%d is trying to acquire this lock:\n",
170 current->comm, current->pid);
171 printk_lock(waiter->lock, 1);
172
173 printk("\n2) %s/%d is blocked on this lock:\n", task->comm, task->pid);
174 printk_lock(waiter->deadlock_lock, 1);
175
176 debug_show_held_locks(current);
177 debug_show_held_locks(task);
178
179 printk("\n%s/%d's [blocked] stackdump:\n\n", task->comm, task->pid);
180 show_stack(task, NULL);
181 printk("\n%s/%d's [current] stackdump:\n\n",
182 current->comm, current->pid);
183 dump_stack();
184 debug_show_all_locks();
185
186 printk("[ turning off deadlock detection."
187 "Please report this trace. ]\n\n");
188 local_irq_disable();
189}
190
191void debug_rt_mutex_lock(struct rt_mutex *lock)
192{
193}
194
195void debug_rt_mutex_unlock(struct rt_mutex *lock)
196{
197 TRACE_WARN_ON_LOCKED(rt_mutex_owner(lock) != current);
198}
199
200void
201debug_rt_mutex_proxy_lock(struct rt_mutex *lock, struct task_struct *powner)
202{
203}
204
205void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock)
206{
207 TRACE_WARN_ON_LOCKED(!rt_mutex_owner(lock));
208}
209
210void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter)
211{
212 memset(waiter, 0x11, sizeof(*waiter));
213 plist_node_init(&waiter->list_entry, MAX_PRIO);
214 plist_node_init(&waiter->pi_list_entry, MAX_PRIO);
215}
216
217void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter)
218{
219 TRACE_WARN_ON(!plist_node_empty(&waiter->list_entry));
220 TRACE_WARN_ON(!plist_node_empty(&waiter->pi_list_entry));
221 TRACE_WARN_ON(waiter->task);
222 memset(waiter, 0x22, sizeof(*waiter));
223}
224
225void debug_rt_mutex_init(struct rt_mutex *lock, const char *name)
226{
227 /*
228 * Make sure we are not reinitializing a held lock:
229 */
230 debug_check_no_locks_freed((void *)lock, sizeof(*lock));
231 lock->name = name;
232}
233
234void
235rt_mutex_deadlock_account_lock(struct rt_mutex *lock, struct task_struct *task)
236{
237}
238
239void rt_mutex_deadlock_account_unlock(struct task_struct *task)
240{
241}
242
diff --git a/kernel/rtmutex-debug.h b/kernel/rtmutex-debug.h
new file mode 100644
index 000000000000..14193d596d78
--- /dev/null
+++ b/kernel/rtmutex-debug.h
@@ -0,0 +1,33 @@
1/*
2 * RT-Mutexes: blocking mutual exclusion locks with PI support
3 *
4 * started by Ingo Molnar and Thomas Gleixner:
5 *
6 * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
7 * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
8 *
9 * This file contains macros used solely by rtmutex.c. Debug version.
10 */
11
12extern void
13rt_mutex_deadlock_account_lock(struct rt_mutex *lock, struct task_struct *task);
14extern void rt_mutex_deadlock_account_unlock(struct task_struct *task);
15extern void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter);
16extern void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter);
17extern void debug_rt_mutex_init(struct rt_mutex *lock, const char *name);
18extern void debug_rt_mutex_lock(struct rt_mutex *lock);
19extern void debug_rt_mutex_unlock(struct rt_mutex *lock);
20extern void debug_rt_mutex_proxy_lock(struct rt_mutex *lock,
21 struct task_struct *powner);
22extern void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock);
23extern void debug_rt_mutex_deadlock(int detect, struct rt_mutex_waiter *waiter,
24 struct rt_mutex *lock);
25extern void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter);
26# define debug_rt_mutex_reset_waiter(w) \
27 do { (w)->deadlock_lock = NULL; } while (0)
28
29static inline int debug_rt_mutex_detect_deadlock(struct rt_mutex_waiter *waiter,
30 int detect)
31{
32 return (waiter != NULL);
33}
diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c
new file mode 100644
index 000000000000..948bd8f643e2
--- /dev/null
+++ b/kernel/rtmutex-tester.c
@@ -0,0 +1,441 @@
1/*
2 * RT-Mutex-tester: scriptable tester for rt mutexes
3 *
4 * started by Thomas Gleixner:
5 *
6 * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
7 *
8 */
9#include <linux/config.h>
10#include <linux/kthread.h>
11#include <linux/module.h>
12#include <linux/sched.h>
13#include <linux/smp_lock.h>
14#include <linux/spinlock.h>
15#include <linux/sysdev.h>
16#include <linux/timer.h>
17
18#include "rtmutex.h"
19
20#define MAX_RT_TEST_THREADS 8
21#define MAX_RT_TEST_MUTEXES 8
22
23static spinlock_t rttest_lock;
24static atomic_t rttest_event;
25
26struct test_thread_data {
27 int opcode;
28 int opdata;
29 int mutexes[MAX_RT_TEST_MUTEXES];
30 int bkl;
31 int event;
32 struct sys_device sysdev;
33};
34
35static struct test_thread_data thread_data[MAX_RT_TEST_THREADS];
36static struct task_struct *threads[MAX_RT_TEST_THREADS];
37static struct rt_mutex mutexes[MAX_RT_TEST_MUTEXES];
38
39enum test_opcodes {
40 RTTEST_NOP = 0,
41 RTTEST_SCHEDOT, /* 1 Sched other, data = nice */
42 RTTEST_SCHEDRT, /* 2 Sched fifo, data = prio */
43 RTTEST_LOCK, /* 3 Lock uninterruptible, data = lockindex */
44 RTTEST_LOCKNOWAIT, /* 4 Lock uninterruptible no wait in wakeup, data = lockindex */
45 RTTEST_LOCKINT, /* 5 Lock interruptible, data = lockindex */
46 RTTEST_LOCKINTNOWAIT, /* 6 Lock interruptible no wait in wakeup, data = lockindex */
47 RTTEST_LOCKCONT, /* 7 Continue locking after the wakeup delay */
48 RTTEST_UNLOCK, /* 8 Unlock, data = lockindex */
49 RTTEST_LOCKBKL, /* 9 Lock BKL */
50 RTTEST_UNLOCKBKL, /* 10 Unlock BKL */
51 RTTEST_SIGNAL, /* 11 Signal other test thread, data = thread id */
52 RTTEST_RESETEVENT = 98, /* 98 Reset event counter */
53 RTTEST_RESET = 99, /* 99 Reset all pending operations */
54};
55
56static int handle_op(struct test_thread_data *td, int lockwakeup)
57{
58 int i, id, ret = -EINVAL;
59
60 switch (td->opcode) {
61
62 case RTTEST_NOP:
63 return 0;
64
65 case RTTEST_LOCKCONT:
66 td->mutexes[td->opdata] = 1;
67 td->event = atomic_add_return(1, &rttest_event);
68 return 0;
69
70 case RTTEST_RESET:
71 for (i = 0; i < MAX_RT_TEST_MUTEXES; i++) {
72 if (td->mutexes[i] == 4) {
73 rt_mutex_unlock(&mutexes[i]);
74 td->mutexes[i] = 0;
75 }
76 }
77
78 if (!lockwakeup && td->bkl == 4) {
79 unlock_kernel();
80 td->bkl = 0;
81 }
82 return 0;
83
84 case RTTEST_RESETEVENT:
85 atomic_set(&rttest_event, 0);
86 return 0;
87
88 default:
89 if (lockwakeup)
90 return ret;
91 }
92
93 switch (td->opcode) {
94
95 case RTTEST_LOCK:
96 case RTTEST_LOCKNOWAIT:
97 id = td->opdata;
98 if (id < 0 || id >= MAX_RT_TEST_MUTEXES)
99 return ret;
100
101 td->mutexes[id] = 1;
102 td->event = atomic_add_return(1, &rttest_event);
103 rt_mutex_lock(&mutexes[id]);
104 td->event = atomic_add_return(1, &rttest_event);
105 td->mutexes[id] = 4;
106 return 0;
107
108 case RTTEST_LOCKINT:
109 case RTTEST_LOCKINTNOWAIT:
110 id = td->opdata;
111 if (id < 0 || id >= MAX_RT_TEST_MUTEXES)
112 return ret;
113
114 td->mutexes[id] = 1;
115 td->event = atomic_add_return(1, &rttest_event);
116 ret = rt_mutex_lock_interruptible(&mutexes[id], 0);
117 td->event = atomic_add_return(1, &rttest_event);
118 td->mutexes[id] = ret ? 0 : 4;
119 return ret ? -EINTR : 0;
120
121 case RTTEST_UNLOCK:
122 id = td->opdata;
123 if (id < 0 || id >= MAX_RT_TEST_MUTEXES || td->mutexes[id] != 4)
124 return ret;
125
126 td->event = atomic_add_return(1, &rttest_event);
127 rt_mutex_unlock(&mutexes[id]);
128 td->event = atomic_add_return(1, &rttest_event);
129 td->mutexes[id] = 0;
130 return 0;
131
132 case RTTEST_LOCKBKL:
133 if (td->bkl)
134 return 0;
135 td->bkl = 1;
136 lock_kernel();
137 td->bkl = 4;
138 return 0;
139
140 case RTTEST_UNLOCKBKL:
141 if (td->bkl != 4)
142 break;
143 unlock_kernel();
144 td->bkl = 0;
145 return 0;
146
147 default:
148 break;
149 }
150 return ret;
151}
152
153/*
154 * Schedule replacement for rtsem_down(). Only called for threads with
155 * PF_MUTEX_TESTER set.
156 *
157 * This allows us to have fine-grained control over the event flow.
158 *
159 */
160void schedule_rt_mutex_test(struct rt_mutex *mutex)
161{
162 int tid, op, dat;
163 struct test_thread_data *td;
164
165 /* We have to lookup the task */
166 for (tid = 0; tid < MAX_RT_TEST_THREADS; tid++) {
167 if (threads[tid] == current)
168 break;
169 }
170
171 BUG_ON(tid == MAX_RT_TEST_THREADS);
172
173 td = &thread_data[tid];
174
175 op = td->opcode;
176 dat = td->opdata;
177
178 switch (op) {
179 case RTTEST_LOCK:
180 case RTTEST_LOCKINT:
181 case RTTEST_LOCKNOWAIT:
182 case RTTEST_LOCKINTNOWAIT:
183 if (mutex != &mutexes[dat])
184 break;
185
186 if (td->mutexes[dat] != 1)
187 break;
188
189 td->mutexes[dat] = 2;
190 td->event = atomic_add_return(1, &rttest_event);
191 break;
192
193 case RTTEST_LOCKBKL:
194 default:
195 break;
196 }
197
198 schedule();
199
200
201 switch (op) {
202 case RTTEST_LOCK:
203 case RTTEST_LOCKINT:
204 if (mutex != &mutexes[dat])
205 return;
206
207 if (td->mutexes[dat] != 2)
208 return;
209
210 td->mutexes[dat] = 3;
211 td->event = atomic_add_return(1, &rttest_event);
212 break;
213
214 case RTTEST_LOCKNOWAIT:
215 case RTTEST_LOCKINTNOWAIT:
216 if (mutex != &mutexes[dat])
217 return;
218
219 if (td->mutexes[dat] != 2)
220 return;
221
222 td->mutexes[dat] = 1;
223 td->event = atomic_add_return(1, &rttest_event);
224 return;
225
226 case RTTEST_LOCKBKL:
227 return;
228 default:
229 return;
230 }
231
232 td->opcode = 0;
233
234 for (;;) {
235 set_current_state(TASK_INTERRUPTIBLE);
236
237 if (td->opcode > 0) {
238 int ret;
239
240 set_current_state(TASK_RUNNING);
241 ret = handle_op(td, 1);
242 set_current_state(TASK_INTERRUPTIBLE);
243 if (td->opcode == RTTEST_LOCKCONT)
244 break;
245 td->opcode = ret;
246 }
247
248 /* Wait for the next command to be executed */
249 schedule();
250 }
251
252 /* Restore previous command and data */
253 td->opcode = op;
254 td->opdata = dat;
255}
256
257static int test_func(void *data)
258{
259 struct test_thread_data *td = data;
260 int ret;
261
262 current->flags |= PF_MUTEX_TESTER;
263 allow_signal(SIGHUP);
264
265 for (;;) {
266
267 set_current_state(TASK_INTERRUPTIBLE);
268
269 if (td->opcode > 0) {
270 set_current_state(TASK_RUNNING);
271 ret = handle_op(td, 0);
272 set_current_state(TASK_INTERRUPTIBLE);
273 td->opcode = ret;
274 }
275
276 /* Wait for the next command to be executed */
277 schedule();
278 try_to_freeze();
279
280 if (signal_pending(current))
281 flush_signals(current);
282
283 if (kthread_should_stop())
284 break;
285 }
286 return 0;
287}
288
289/**
290 * sysfs_test_command - interface for test commands
291 * @dev: thread reference
292 * @buf: command for actual step
293 * @count: length of buffer
294 *
295 * command syntax:
296 *
297 * opcode:data
298 */
299static ssize_t sysfs_test_command(struct sys_device *dev, const char *buf,
300 size_t count)
301{
302 struct sched_param schedpar;
303 struct test_thread_data *td;
304 char cmdbuf[32];
305 int op, dat, tid, ret;
306
307 td = container_of(dev, struct test_thread_data, sysdev);
308 tid = td->sysdev.id;
309
310 /* strings from sysfs write are not 0 terminated! */
311 if (count >= sizeof(cmdbuf))
312 return -EINVAL;
313
314 /* strip off \n: */
315 if (buf[count-1] == '\n')
316 count--;
317 if (count < 1)
318 return -EINVAL;
319
320 memcpy(cmdbuf, buf, count);
321 cmdbuf[count] = 0;
322
323 if (sscanf(cmdbuf, "%d:%d", &op, &dat) != 2)
324 return -EINVAL;
325
326 switch (op) {
327 case RTTEST_SCHEDOT:
328 schedpar.sched_priority = 0;
329 ret = sched_setscheduler(threads[tid], SCHED_NORMAL, &schedpar);
330 if (ret)
331 return ret;
332 set_user_nice(current, 0);
333 break;
334
335 case RTTEST_SCHEDRT:
336 schedpar.sched_priority = dat;
337 ret = sched_setscheduler(threads[tid], SCHED_FIFO, &schedpar);
338 if (ret)
339 return ret;
340 break;
341
342 case RTTEST_SIGNAL:
343 send_sig(SIGHUP, threads[tid], 0);
344 break;
345
346 default:
347 if (td->opcode > 0)
348 return -EBUSY;
349 td->opdata = dat;
350 td->opcode = op;
351 wake_up_process(threads[tid]);
352 }
353
354 return count;
355}
356
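Each tester thread is driven by writing "opcode:data" strings into its per-thread sysfs command file and observed through the status file. A hedged userspace sketch follows; the /sys path is an assumption derived from the "rttest" sysdev class registered further down, and the opcodes follow the enum at the top of this file.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

/* Send one "opcode:data" command to tester thread `tid`. */
static int rttest_cmd(int tid, const char *cmd)
{
        char path[64];
        int fd;
        ssize_t n;

        snprintf(path, sizeof(path),
                 "/sys/devices/system/rttest/rttest%d/command", tid);
        fd = open(path, O_WRONLY);
        if (fd < 0)
                return -1;
        n = write(fd, cmd, strlen(cmd));
        close(fd);
        return n < 0 ? -1 : 0;
}

int main(void)
{
        rttest_cmd(0, "3:2\n");         /* RTTEST_LOCK on mutex 2 */
        rttest_cmd(0, "8:2\n");         /* RTTEST_UNLOCK on mutex 2 */
        return 0;
}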
357/**
358 * sysfs_test_status - sysfs interface for rt tester
359 * @dev: thread to query
360 * @buf: char buffer to be filled with thread status info
361 */
362static ssize_t sysfs_test_status(struct sys_device *dev, char *buf)
363{
364 struct test_thread_data *td;
365 struct task_struct *tsk;
366 char *curr = buf;
367 int i;
368
369 td = container_of(dev, struct test_thread_data, sysdev);
370 tsk = threads[td->sysdev.id];
371
372 spin_lock(&rttest_lock);
373
374 curr += sprintf(curr,
375 "O: %4d, E:%8d, S: 0x%08lx, P: %4d, N: %4d, B: %p, K: %d, M:",
376 td->opcode, td->event, tsk->state,
377 (MAX_RT_PRIO - 1) - tsk->prio,
378 (MAX_RT_PRIO - 1) - tsk->normal_prio,
379 tsk->pi_blocked_on, td->bkl);
380
381 for (i = MAX_RT_TEST_MUTEXES - 1; i >= 0; i--)
382 curr += sprintf(curr, "%d", td->mutexes[i]);
383
384 spin_unlock(&rttest_lock);
385
386 curr += sprintf(curr, ", T: %p, R: %p\n", tsk,
387 mutexes[td->sysdev.id].owner);
388
389 return curr - buf;
390}
391
392static SYSDEV_ATTR(status, 0600, sysfs_test_status, NULL);
393static SYSDEV_ATTR(command, 0600, NULL, sysfs_test_command);
394
395static struct sysdev_class rttest_sysclass = {
396 set_kset_name("rttest"),
397};
398
399static int init_test_thread(int id)
400{
401 thread_data[id].sysdev.cls = &rttest_sysclass;
402 thread_data[id].sysdev.id = id;
403
404 threads[id] = kthread_run(test_func, &thread_data[id], "rt-test-%d", id);
405 if (IS_ERR(threads[id]))
406 return PTR_ERR(threads[id]);
407
408 return sysdev_register(&thread_data[id].sysdev);
409}
410
411static int init_rttest(void)
412{
413 int ret, i;
414
415 spin_lock_init(&rttest_lock);
416
417 for (i = 0; i < MAX_RT_TEST_MUTEXES; i++)
418 rt_mutex_init(&mutexes[i]);
419
420 ret = sysdev_class_register(&rttest_sysclass);
421 if (ret)
422 return ret;
423
424 for (i = 0; i < MAX_RT_TEST_THREADS; i++) {
425 ret = init_test_thread(i);
426 if (ret)
427 break;
428 ret = sysdev_create_file(&thread_data[i].sysdev, &attr_status);
429 if (ret)
430 break;
431 ret = sysdev_create_file(&thread_data[i].sysdev, &attr_command);
432 if (ret)
433 break;
434 }
435
436 printk("Initializing RT-Tester: %s\n", ret ? "Failed" : "OK" );
437
438 return ret;
439}
440
441device_initcall(init_rttest);
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
new file mode 100644
index 000000000000..d2ef13b485e7
--- /dev/null
+++ b/kernel/rtmutex.c
@@ -0,0 +1,989 @@
1/*
2 * RT-Mutexes: simple blocking mutual exclusion locks with PI support
3 *
4 * started by Ingo Molnar and Thomas Gleixner.
5 *
6 * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
7 * Copyright (C) 2005-2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
8 * Copyright (C) 2005 Kihon Technologies Inc., Steven Rostedt
9 * Copyright (C) 2006 Esben Nielsen
10 */
11#include <linux/spinlock.h>
12#include <linux/module.h>
13#include <linux/sched.h>
14#include <linux/timer.h>
15
16#include "rtmutex_common.h"
17
18#ifdef CONFIG_DEBUG_RT_MUTEXES
19# include "rtmutex-debug.h"
20#else
21# include "rtmutex.h"
22#endif
23
24/*
25 * lock->owner state tracking:
26 *
27 * lock->owner holds the task_struct pointer of the owner. Bit 0 and 1
28 * are used to keep track of the "owner is pending" and "lock has
29 * waiters" state.
30 *
31 * owner bit1 bit0
32 * NULL 0 0 lock is free (fast acquire possible)
33 * NULL 0 1 invalid state
34 * NULL 1 0 Transitional State*
35 * NULL 1 1 invalid state
36 * taskpointer 0 0 lock is held (fast release possible)
37 * taskpointer 0 1 task is pending owner
38 * taskpointer 1 0 lock is held and has waiters
39 * taskpointer 1 1 task is pending owner and lock has more waiters
40 *
41 * Pending ownership is assigned to the top (highest priority)
42 * waiter of the lock, when the lock is released. The thread is woken
43 * up and can now take the lock. Until the lock is taken (bit 0
44 * cleared) a competing higher priority thread can steal the lock
45 * which puts the woken up thread back on the waiters list.
46 *
47 * The fast atomic compare exchange based acquire and release is only
48 * possible when bit 0 and 1 of lock->owner are 0.
49 *
50 * (*) There's a small time where the owner can be NULL and the
51 * "lock has waiters" bit is set. This can happen when grabbing the lock.
52 * To prevent a cmpxchg of the owner releasing the lock, we need to set this
53 * bit before looking at the lock, hence the reason this is a transitional
54 * state.
55 */
56
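Everything in the table above is packed into the single lock->owner word: the owner task pointer with its two low bits reused as flags. A sketch of the encoding follows; the mask names live in rtmutex_common.h, which this diff only shows in part, so treat the numeric values as assumptions.

#include <linux/rtmutex.h>

/* Assumed flag values, matching the table above:
 *   bit 0 (1UL): owner is only "pending", i.e. woken but lock not yet taken
 *   bit 1 (2UL): the wait list is non-empty (RT_MUTEX_HAS_WAITERS)
 */
static inline struct task_struct *sketch_owner(struct rt_mutex *lock)
{
        return (struct task_struct *)((unsigned long)lock->owner & ~3UL);
}

static inline int sketch_owner_pending(struct rt_mutex *lock)
{
        return ((unsigned long)lock->owner & 1UL) != 0;
}

static inline int sketch_has_waiters(struct rt_mutex *lock)
{
        return ((unsigned long)lock->owner & 2UL) != 0;
}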
57static void
58rt_mutex_set_owner(struct rt_mutex *lock, struct task_struct *owner,
59 unsigned long mask)
60{
61 unsigned long val = (unsigned long)owner | mask;
62
63 if (rt_mutex_has_waiters(lock))
64 val |= RT_MUTEX_HAS_WAITERS;
65
66 lock->owner = (struct task_struct *)val;
67}
68
69static inline void clear_rt_mutex_waiters(struct rt_mutex *lock)
70{
71 lock->owner = (struct task_struct *)
72 ((unsigned long)lock->owner & ~RT_MUTEX_HAS_WAITERS);
73}
74
75static void fixup_rt_mutex_waiters(struct rt_mutex *lock)
76{
77 if (!rt_mutex_has_waiters(lock))
78 clear_rt_mutex_waiters(lock);
79}
80
81/*
82 * We can speed up the acquire/release, if the architecture
83 * supports cmpxchg and if there's no debugging state to be set up
84 */
85#if defined(__HAVE_ARCH_CMPXCHG) && !defined(CONFIG_DEBUG_RT_MUTEXES)
86# define rt_mutex_cmpxchg(l,c,n) (cmpxchg(&l->owner, c, n) == c)
87static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
88{
89 unsigned long owner, *p = (unsigned long *) &lock->owner;
90
91 do {
92 owner = *p;
93 } while (cmpxchg(p, owner, owner | RT_MUTEX_HAS_WAITERS) != owner);
94}
95#else
96# define rt_mutex_cmpxchg(l,c,n) (0)
97static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
98{
99 lock->owner = (struct task_struct *)
100 ((unsigned long)lock->owner | RT_MUTEX_HAS_WAITERS);
101}
102#endif
103
104/*
105 * Calculate task priority from the waiter list priority
106 *
107 * Return task->normal_prio when the waiter list is empty or when
108 * the waiter is not allowed to do priority boosting
109 */
110int rt_mutex_getprio(struct task_struct *task)
111{
112 if (likely(!task_has_pi_waiters(task)))
113 return task->normal_prio;
114
115 return min(task_top_pi_waiter(task)->pi_list_entry.prio,
116 task->normal_prio);
117}
118
119/*
120 * Adjust the priority of a task, after its pi_waiters got modified.
121 *
122 * This can be both boosting and unboosting. task->pi_lock must be held.
123 */
124static void __rt_mutex_adjust_prio(struct task_struct *task)
125{
126 int prio = rt_mutex_getprio(task);
127
128 if (task->prio != prio)
129 rt_mutex_setprio(task, prio);
130}
131
132/*
133 * Adjust task priority (undo boosting). Called from the exit path of
134 * rt_mutex_slowunlock() and rt_mutex_slowlock().
135 *
136 * (Note: We do this outside of the protection of lock->wait_lock to
137 * allow the lock to be taken while or before we readjust the priority
138 * of task. We do not use the spin_xx_mutex() variants here as we are
139 * outside of the debug path.)
140 */
141static void rt_mutex_adjust_prio(struct task_struct *task)
142{
143 unsigned long flags;
144
145 spin_lock_irqsave(&task->pi_lock, flags);
146 __rt_mutex_adjust_prio(task);
147 spin_unlock_irqrestore(&task->pi_lock, flags);
148}
149
150/*
151 * Max number of times we'll walk the boosting chain:
152 */
153int max_lock_depth = 1024;
154
155/*
156 * Adjust the priority chain. Also used for deadlock detection.
157 * Decreases task's usage by one - may thus free the task.
158 * Returns 0 or -EDEADLK.
159 */
160static int rt_mutex_adjust_prio_chain(struct task_struct *task,
161 int deadlock_detect,
162 struct rt_mutex *orig_lock,
163 struct rt_mutex_waiter *orig_waiter,
164 struct task_struct *top_task)
165{
166 struct rt_mutex *lock;
167 struct rt_mutex_waiter *waiter, *top_waiter = orig_waiter;
168 int detect_deadlock, ret = 0, depth = 0;
169 unsigned long flags;
170
171 detect_deadlock = debug_rt_mutex_detect_deadlock(orig_waiter,
172 deadlock_detect);
173
174 /*
175 * The (de)boosting is a step by step approach with a lot of
176 * pitfalls. We want this to be preemptible and we want to hold a
177 * maximum of two locks per step. So we have to check
178 * carefully whether things change under us.
179 */
180 again:
181 if (++depth > max_lock_depth) {
182 static int prev_max;
183
184 /*
185 * Print this only once. If the admin changes the limit,
186 * print a new message when reaching the limit again.
187 */
188 if (prev_max != max_lock_depth) {
189 prev_max = max_lock_depth;
190 printk(KERN_WARNING "Maximum lock depth %d reached "
191 "task: %s (%d)\n", max_lock_depth,
192 top_task->comm, top_task->pid);
193 }
194 put_task_struct(task);
195
196 return deadlock_detect ? -EDEADLK : 0;
197 }
198 retry:
199 /*
200 * The task cannot go away as we did a get_task_struct() before!
201 */
202 spin_lock_irqsave(&task->pi_lock, flags);
203
204 waiter = task->pi_blocked_on;
205 /*
206 * Check whether the end of the boosting chain has been
207 * reached or the state of the chain has changed while we
208 * dropped the locks.
209 */
210 if (!waiter || !waiter->task)
211 goto out_unlock_pi;
212
213 if (top_waiter && (!task_has_pi_waiters(task) ||
214 top_waiter != task_top_pi_waiter(task)))
215 goto out_unlock_pi;
216
217 /*
217 * When deadlock detection is off, we check whether further
219 * priority adjustment is necessary.
220 */
221 if (!detect_deadlock && waiter->list_entry.prio == task->prio)
222 goto out_unlock_pi;
223
224 lock = waiter->lock;
225 if (!spin_trylock(&lock->wait_lock)) {
226 spin_unlock_irqrestore(&task->pi_lock, flags);
227 cpu_relax();
228 goto retry;
229 }
230
231 /* Deadlock detection */
232 if (lock == orig_lock || rt_mutex_owner(lock) == top_task) {
233 debug_rt_mutex_deadlock(deadlock_detect, orig_waiter, lock);
234 spin_unlock(&lock->wait_lock);
235 ret = deadlock_detect ? -EDEADLK : 0;
236 goto out_unlock_pi;
237 }
238
239 top_waiter = rt_mutex_top_waiter(lock);
240
241 /* Requeue the waiter */
242 plist_del(&waiter->list_entry, &lock->wait_list);
243 waiter->list_entry.prio = task->prio;
244 plist_add(&waiter->list_entry, &lock->wait_list);
245
246 /* Release the task */
247 spin_unlock_irqrestore(&task->pi_lock, flags);
248 put_task_struct(task);
249
250 /* Grab the next task */
251 task = rt_mutex_owner(lock);
252 spin_lock_irqsave(&task->pi_lock, flags);
253
254 if (waiter == rt_mutex_top_waiter(lock)) {
255 /* Boost the owner */
256 plist_del(&top_waiter->pi_list_entry, &task->pi_waiters);
257 waiter->pi_list_entry.prio = waiter->list_entry.prio;
258 plist_add(&waiter->pi_list_entry, &task->pi_waiters);
259 __rt_mutex_adjust_prio(task);
260
261 } else if (top_waiter == waiter) {
262 /* Deboost the owner */
263 plist_del(&waiter->pi_list_entry, &task->pi_waiters);
264 waiter = rt_mutex_top_waiter(lock);
265 waiter->pi_list_entry.prio = waiter->list_entry.prio;
266 plist_add(&waiter->pi_list_entry, &task->pi_waiters);
267 __rt_mutex_adjust_prio(task);
268 }
269
270 get_task_struct(task);
271 spin_unlock_irqrestore(&task->pi_lock, flags);
272
273 top_waiter = rt_mutex_top_waiter(lock);
274 spin_unlock(&lock->wait_lock);
275
276 if (!detect_deadlock && waiter != top_waiter)
277 goto out_put_task;
278
279 goto again;
280
281 out_unlock_pi:
282 spin_unlock_irqrestore(&task->pi_lock, flags);
283 out_put_task:
284 put_task_struct(task);
285
286 return ret;
287}
288
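A concrete example of what this chain walk achieves, as a toy userspace model (not the kernel's data structures; lower numbers mean higher priority, as in the kernel): task A at priority 10 blocks on L1 owned by B (priority 30), while B is itself blocked on L2 owned by C (priority 40). Walking the chain re-propagates the top waiter's priority to each owner, so both B and C end up at priority 10 until the locks are released.

#include <stdio.h>

struct toy_task { const char *name; int normal_prio; int prio; };
struct toy_lock { struct toy_task *owner; struct toy_task *top_waiter; };

/* One simplified step per lock: boost the owner to the minimum of its own
 * normal priority and its top waiter's (possibly already boosted) priority. */
static void walk_chain(struct toy_lock **chain, int n)
{
        int i;

        for (i = 0; i < n; i++) {
                struct toy_task *owner = chain[i]->owner;
                struct toy_task *waiter = chain[i]->top_waiter;

                owner->prio = waiter->prio < owner->normal_prio ?
                                waiter->prio : owner->normal_prio;
        }
}

int main(void)
{
        struct toy_task a = { "A", 10, 10 }, b = { "B", 30, 30 }, c = { "C", 40, 40 };
        struct toy_lock l1 = { &b, &a }, l2 = { &c, &b };
        struct toy_lock *chain[] = { &l1, &l2 };

        walk_chain(chain, 2);
        printf("B now runs at %d, C now runs at %d\n", b.prio, c.prio);  /* 10, 10 */
        return 0;
}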
289/*
290 * Optimization: check if we can steal the lock from the
291 * assigned pending owner [which might not have taken the
292 * lock yet]:
293 */
294static inline int try_to_steal_lock(struct rt_mutex *lock)
295{
296 struct task_struct *pendowner = rt_mutex_owner(lock);
297 struct rt_mutex_waiter *next;
298 unsigned long flags;
299
300 if (!rt_mutex_owner_pending(lock))
301 return 0;
302
303 if (pendowner == current)
304 return 1;
305
306 spin_lock_irqsave(&pendowner->pi_lock, flags);
307 if (current->prio >= pendowner->prio) {
308 spin_unlock_irqrestore(&pendowner->pi_lock, flags);
309 return 0;
310 }
311
312 /*
313 * Check if a waiter is enqueued on the pending owners
314 * pi_waiters list. Remove it and readjust pending owners
315 * priority.
316 */
317 if (likely(!rt_mutex_has_waiters(lock))) {
318 spin_unlock_irqrestore(&pendowner->pi_lock, flags);
319 return 1;
320 }
321
322 /* No chain handling, pending owner is not blocked on anything: */
323 next = rt_mutex_top_waiter(lock);
324 plist_del(&next->pi_list_entry, &pendowner->pi_waiters);
325 __rt_mutex_adjust_prio(pendowner);
326 spin_unlock_irqrestore(&pendowner->pi_lock, flags);
327
328 /*
329 * We are going to steal the lock and a waiter was
330 * enqueued on the pending owner's pi_waiters queue. So
331 * we have to enqueue this waiter into
332 * current->pi_waiters list. This covers the case,
333 * where current is boosted because it holds another
334 * lock and gets unboosted because the booster is
335 * interrupted, so we would delay a waiter with higher
336 * priority than current->normal_prio.
337 *
338 * Note: in the rare case of a SCHED_OTHER task changing
339 * its priority and thus stealing the lock, next->task
340 * might be current:
341 */
342 if (likely(next->task != current)) {
343 spin_lock_irqsave(&current->pi_lock, flags);
344 plist_add(&next->pi_list_entry, &current->pi_waiters);
345 __rt_mutex_adjust_prio(current);
346 spin_unlock_irqrestore(&current->pi_lock, flags);
347 }
348 return 1;
349}
350
351/*
352 * Try to take an rt-mutex
353 *
354 * This fails
355 * - when the lock has a real owner
356 * - when a different pending owner exists and has higher priority than current
357 *
358 * Must be called with lock->wait_lock held.
359 */
360static int try_to_take_rt_mutex(struct rt_mutex *lock)
361{
362 /*
363 * We have to be careful here if the atomic speedups are
364 * enabled, such that, when
365 * - no other waiter is on the lock
366 * - the lock has been released since we did the cmpxchg
367 * the lock can be released or taken while we are doing the
368 * checks and marking the lock with RT_MUTEX_HAS_WAITERS.
369 *
370 * The atomic acquire/release aware variant of
371 * mark_rt_mutex_waiters uses a cmpxchg loop. After setting
372 * the WAITERS bit, the atomic release / acquire can not
373 * happen anymore and lock->wait_lock protects us from the
374 * non-atomic case.
375 *
376 * Note, that this might set lock->owner =
377 * RT_MUTEX_HAS_WAITERS in the case the lock is not contended
378 * any more. This is fixed up when we take the ownership.
379 * This is the transitional state explained at the top of this file.
380 */
381 mark_rt_mutex_waiters(lock);
382
383 if (rt_mutex_owner(lock) && !try_to_steal_lock(lock))
384 return 0;
385
386 /* We got the lock. */
387 debug_rt_mutex_lock(lock);
388
389 rt_mutex_set_owner(lock, current, 0);
390
391 rt_mutex_deadlock_account_lock(lock, current);
392
393 return 1;
394}
395
396/*
397 * Task blocks on lock.
398 *
399 * Prepare waiter and propagate pi chain
400 *
401 * This must be called with lock->wait_lock held.
402 */
403static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
404 struct rt_mutex_waiter *waiter,
405 int detect_deadlock)
406{
407 struct task_struct *owner = rt_mutex_owner(lock);
408 struct rt_mutex_waiter *top_waiter = waiter;
409 unsigned long flags;
410 int boost = 0, res;
411
412 spin_lock_irqsave(&current->pi_lock, flags);
413 __rt_mutex_adjust_prio(current);
414 waiter->task = current;
415 waiter->lock = lock;
416 plist_node_init(&waiter->list_entry, current->prio);
417 plist_node_init(&waiter->pi_list_entry, current->prio);
418
419 /* Get the top priority waiter on the lock */
420 if (rt_mutex_has_waiters(lock))
421 top_waiter = rt_mutex_top_waiter(lock);
422 plist_add(&waiter->list_entry, &lock->wait_list);
423
424 current->pi_blocked_on = waiter;
425
426 spin_unlock_irqrestore(&current->pi_lock, flags);
427
428 if (waiter == rt_mutex_top_waiter(lock)) {
429 spin_lock_irqsave(&owner->pi_lock, flags);
430 plist_del(&top_waiter->pi_list_entry, &owner->pi_waiters);
431 plist_add(&waiter->pi_list_entry, &owner->pi_waiters);
432
433 __rt_mutex_adjust_prio(owner);
434 if (owner->pi_blocked_on) {
435 boost = 1;
436 /* gets dropped in rt_mutex_adjust_prio_chain()! */
437 get_task_struct(owner);
438 }
439 spin_unlock_irqrestore(&owner->pi_lock, flags);
440 }
441 else if (debug_rt_mutex_detect_deadlock(waiter, detect_deadlock)) {
442 spin_lock_irqsave(&owner->pi_lock, flags);
443 if (owner->pi_blocked_on) {
444 boost = 1;
445 /* gets dropped in rt_mutex_adjust_prio_chain()! */
446 get_task_struct(owner);
447 }
448 spin_unlock_irqrestore(&owner->pi_lock, flags);
449 }
450 if (!boost)
451 return 0;
452
453 spin_unlock(&lock->wait_lock);
454
455 res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock, waiter,
456 current);
457
458 spin_lock(&lock->wait_lock);
459
460 return res;
461}
462
463/*
464 * Wake up the next waiter on the lock.
465 *
466 * Remove the top waiter from the current task's waiter list and from
467 * the lock waiter list. Set it as pending owner. Then wake it up.
468 *
469 * Called with lock->wait_lock held.
470 */
471static void wakeup_next_waiter(struct rt_mutex *lock)
472{
473 struct rt_mutex_waiter *waiter;
474 struct task_struct *pendowner;
475 unsigned long flags;
476
477 spin_lock_irqsave(&current->pi_lock, flags);
478
479 waiter = rt_mutex_top_waiter(lock);
480 plist_del(&waiter->list_entry, &lock->wait_list);
481
482 /*
483 * Remove it from current->pi_waiters. We do not adjust a
484 * possible priority boost right now. We execute wakeup in the
485 * boosted mode and go back to normal after releasing
486 * lock->wait_lock.
487 */
488 plist_del(&waiter->pi_list_entry, &current->pi_waiters);
489 pendowner = waiter->task;
490 waiter->task = NULL;
491
492 rt_mutex_set_owner(lock, pendowner, RT_MUTEX_OWNER_PENDING);
493
494 spin_unlock_irqrestore(&current->pi_lock, flags);
495
496 /*
497 * Clear the pi_blocked_on variable and enqueue a possible
498 * waiter into the pi_waiters list of the pending owner. This
499 * prevents that in case the pending owner gets unboosted a
500 * waiter with higher priority than pending-owner->normal_prio
501 * is blocked on the unboosted (pending) owner.
502 */
503 spin_lock_irqsave(&pendowner->pi_lock, flags);
504
505 WARN_ON(!pendowner->pi_blocked_on);
506 WARN_ON(pendowner->pi_blocked_on != waiter);
507 WARN_ON(pendowner->pi_blocked_on->lock != lock);
508
509 pendowner->pi_blocked_on = NULL;
510
511 if (rt_mutex_has_waiters(lock)) {
512 struct rt_mutex_waiter *next;
513
514 next = rt_mutex_top_waiter(lock);
515 plist_add(&next->pi_list_entry, &pendowner->pi_waiters);
516 }
517 spin_unlock_irqrestore(&pendowner->pi_lock, flags);
518
519 wake_up_process(pendowner);
520}
521
522/*
523 * Remove a waiter from a lock
524 *
525 * Must be called with lock->wait_lock held
526 */
527static void remove_waiter(struct rt_mutex *lock,
528 struct rt_mutex_waiter *waiter)
529{
530 int first = (waiter == rt_mutex_top_waiter(lock));
531 struct task_struct *owner = rt_mutex_owner(lock);
532 unsigned long flags;
533 int boost = 0;
534
535 spin_lock_irqsave(&current->pi_lock, flags);
536 plist_del(&waiter->list_entry, &lock->wait_list);
537 waiter->task = NULL;
538 current->pi_blocked_on = NULL;
539 spin_unlock_irqrestore(&current->pi_lock, flags);
540
541 if (first && owner != current) {
542
543 spin_lock_irqsave(&owner->pi_lock, flags);
544
545 plist_del(&waiter->pi_list_entry, &owner->pi_waiters);
546
547 if (rt_mutex_has_waiters(lock)) {
548 struct rt_mutex_waiter *next;
549
550 next = rt_mutex_top_waiter(lock);
551 plist_add(&next->pi_list_entry, &owner->pi_waiters);
552 }
553 __rt_mutex_adjust_prio(owner);
554
555 if (owner->pi_blocked_on) {
556 boost = 1;
557 /* gets dropped in rt_mutex_adjust_prio_chain()! */
558 get_task_struct(owner);
559 }
560 spin_unlock_irqrestore(&owner->pi_lock, flags);
561 }
562
563 WARN_ON(!plist_node_empty(&waiter->pi_list_entry));
564
565 if (!boost)
566 return;
567
568 spin_unlock(&lock->wait_lock);
569
570 rt_mutex_adjust_prio_chain(owner, 0, lock, NULL, current);
571
572 spin_lock(&lock->wait_lock);
573}
574
575/*
576 * Recheck the pi chain, in case we got a priority setting
577 *
578 * Called from sched_setscheduler
579 */
580void rt_mutex_adjust_pi(struct task_struct *task)
581{
582 struct rt_mutex_waiter *waiter;
583 unsigned long flags;
584
585 spin_lock_irqsave(&task->pi_lock, flags);
586
587 waiter = task->pi_blocked_on;
588 if (!waiter || waiter->list_entry.prio == task->prio) {
589 spin_unlock_irqrestore(&task->pi_lock, flags);
590 return;
591 }
592
593 /* gets dropped in rt_mutex_adjust_prio_chain()! */
594 get_task_struct(task);
595 spin_unlock_irqrestore(&task->pi_lock, flags);
596
597 rt_mutex_adjust_prio_chain(task, 0, NULL, NULL, task);
598}
599
600/*
601 * Slow path lock function:
602 */
603static int __sched
604rt_mutex_slowlock(struct rt_mutex *lock, int state,
605 struct hrtimer_sleeper *timeout,
606 int detect_deadlock)
607{
608 struct rt_mutex_waiter waiter;
609 int ret = 0;
610
611 debug_rt_mutex_init_waiter(&waiter);
612 waiter.task = NULL;
613
614 spin_lock(&lock->wait_lock);
615
616 /* Try to acquire the lock again: */
617 if (try_to_take_rt_mutex(lock)) {
618 spin_unlock(&lock->wait_lock);
619 return 0;
620 }
621
622 set_current_state(state);
623
624 /* Setup the timer, when timeout != NULL */
625 if (unlikely(timeout))
626 hrtimer_start(&timeout->timer, timeout->timer.expires,
627 HRTIMER_ABS);
628
629 for (;;) {
630 /* Try to acquire the lock: */
631 if (try_to_take_rt_mutex(lock))
632 break;
633
634 /*
635 * TASK_INTERRUPTIBLE checks for signals and
636 * timeout. Ignored otherwise.
637 */
638 if (unlikely(state == TASK_INTERRUPTIBLE)) {
639 /* Signal pending? */
640 if (signal_pending(current))
641 ret = -EINTR;
642 if (timeout && !timeout->task)
643 ret = -ETIMEDOUT;
644 if (ret)
645 break;
646 }
647
648 /*
649 * waiter.task is NULL the first time we come here and
650 * when we have been woken up by the previous owner
651 * but the lock got stolen by a higher prio task.
652 */
653 if (!waiter.task) {
654 ret = task_blocks_on_rt_mutex(lock, &waiter,
655 detect_deadlock);
656 /*
657 * If we got woken up by the owner then start loop
658 * all over without going into schedule to try
659 * to get the lock now:
660 */
661 if (unlikely(!waiter.task))
662 continue;
663
664 if (unlikely(ret))
665 break;
666 }
667
668 spin_unlock(&lock->wait_lock);
669
670 debug_rt_mutex_print_deadlock(&waiter);
671
672 if (waiter.task)
673 schedule_rt_mutex(lock);
674
675 spin_lock(&lock->wait_lock);
676 set_current_state(state);
677 }
678
679 set_current_state(TASK_RUNNING);
680
681 if (unlikely(waiter.task))
682 remove_waiter(lock, &waiter);
683
684 /*
685 * try_to_take_rt_mutex() sets the waiter bit
686 * unconditionally. We might have to fix that up.
687 */
688 fixup_rt_mutex_waiters(lock);
689
690 spin_unlock(&lock->wait_lock);
691
692 /* Remove pending timer: */
693 if (unlikely(timeout))
694 hrtimer_cancel(&timeout->timer);
695
696 /*
697 * Readjust priority, when we did not get the lock. We might
698 * have been the pending owner and boosted. Since we did not
699 * take the lock, the PI boost has to go.
700 */
701 if (unlikely(ret))
702 rt_mutex_adjust_prio(current);
703
704 debug_rt_mutex_free_waiter(&waiter);
705
706 return ret;
707}
708
709/*
710 * Slow path try-lock function:
711 */
712static inline int
713rt_mutex_slowtrylock(struct rt_mutex *lock)
714{
715 int ret = 0;
716
717 spin_lock(&lock->wait_lock);
718
719 if (likely(rt_mutex_owner(lock) != current)) {
720
721 ret = try_to_take_rt_mutex(lock);
722 /*
723 * try_to_take_rt_mutex() sets the lock waiters
724 * bit unconditionally. Clean this up.
725 */
726 fixup_rt_mutex_waiters(lock);
727 }
728
729 spin_unlock(&lock->wait_lock);
730
731 return ret;
732}
733
734/*
735 * Slow path to release a rt-mutex:
736 */
737static void __sched
738rt_mutex_slowunlock(struct rt_mutex *lock)
739{
740 spin_lock(&lock->wait_lock);
741
742 debug_rt_mutex_unlock(lock);
743
744 rt_mutex_deadlock_account_unlock(current);
745
746 if (!rt_mutex_has_waiters(lock)) {
747 lock->owner = NULL;
748 spin_unlock(&lock->wait_lock);
749 return;
750 }
751
752 wakeup_next_waiter(lock);
753
754 spin_unlock(&lock->wait_lock);
755
756 /* Undo pi boosting if necessary: */
757 rt_mutex_adjust_prio(current);
758}
759
760/*
761 * debug aware fast / slowpath lock,trylock,unlock
762 *
763 * The atomic acquire/release ops are compiled away, when either the
764 * architecture does not support cmpxchg or when debugging is enabled.
765 */
766static inline int
767rt_mutex_fastlock(struct rt_mutex *lock, int state,
768 int detect_deadlock,
769 int (*slowfn)(struct rt_mutex *lock, int state,
770 struct hrtimer_sleeper *timeout,
771 int detect_deadlock))
772{
773 if (!detect_deadlock && likely(rt_mutex_cmpxchg(lock, NULL, current))) {
774 rt_mutex_deadlock_account_lock(lock, current);
775 return 0;
776 } else
777 return slowfn(lock, state, NULL, detect_deadlock);
778}
779
780static inline int
781rt_mutex_timed_fastlock(struct rt_mutex *lock, int state,
782 struct hrtimer_sleeper *timeout, int detect_deadlock,
783 int (*slowfn)(struct rt_mutex *lock, int state,
784 struct hrtimer_sleeper *timeout,
785 int detect_deadlock))
786{
787 if (!detect_deadlock && likely(rt_mutex_cmpxchg(lock, NULL, current))) {
788 rt_mutex_deadlock_account_lock(lock, current);
789 return 0;
790 } else
791 return slowfn(lock, state, timeout, detect_deadlock);
792}
793
794static inline int
795rt_mutex_fasttrylock(struct rt_mutex *lock,
796 int (*slowfn)(struct rt_mutex *lock))
797{
798 if (likely(rt_mutex_cmpxchg(lock, NULL, current))) {
799 rt_mutex_deadlock_account_lock(lock, current);
800 return 1;
801 }
802 return slowfn(lock);
803}
804
805static inline void
806rt_mutex_fastunlock(struct rt_mutex *lock,
807 void (*slowfn)(struct rt_mutex *lock))
808{
809 if (likely(rt_mutex_cmpxchg(lock, current, NULL)))
810 rt_mutex_deadlock_account_unlock(current);
811 else
812 slowfn(lock);
813}
814
815/**
816 * rt_mutex_lock - lock a rt_mutex
817 *
818 * @lock: the rt_mutex to be locked
819 */
820void __sched rt_mutex_lock(struct rt_mutex *lock)
821{
822 might_sleep();
823
824 rt_mutex_fastlock(lock, TASK_UNINTERRUPTIBLE, 0, rt_mutex_slowlock);
825}
826EXPORT_SYMBOL_GPL(rt_mutex_lock);
827
828/**
829 * rt_mutex_lock_interruptible - lock a rt_mutex interruptible
830 *
831 * @lock: the rt_mutex to be locked
832 * @detect_deadlock: deadlock detection on/off
833 *
834 * Returns:
835 * 0 on success
836 * -EINTR when interrupted by a signal
837 * -EDEADLK when the lock would deadlock (when deadlock detection is on)
838 */
839int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock,
840 int detect_deadlock)
841{
842 might_sleep();
843
844 return rt_mutex_fastlock(lock, TASK_INTERRUPTIBLE,
845 detect_deadlock, rt_mutex_slowlock);
846}
847EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible);
848
849/**
850 * rt_mutex_timed_lock - lock a rt_mutex interruptibly;
851 * the timeout structure is provided
852 * by the caller
853 *
854 * @lock: the rt_mutex to be locked
855 * @timeout: timeout structure or NULL (no timeout)
856 * @detect_deadlock: deadlock detection on/off
857 *
858 * Returns:
859 * 0 on success
860 * -EINTR when interrupted by a signal
861 * -ETIMEDOUT when the timeout expired
862 * -EDEADLK when the lock would deadlock (when deadlock detection is on)
863 */
864int
865rt_mutex_timed_lock(struct rt_mutex *lock, struct hrtimer_sleeper *timeout,
866 int detect_deadlock)
867{
868 might_sleep();
869
870 return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout,
871 detect_deadlock, rt_mutex_slowlock);
872}
873EXPORT_SYMBOL_GPL(rt_mutex_timed_lock);
874
875/**
876 * rt_mutex_trylock - try to lock a rt_mutex
877 *
878 * @lock: the rt_mutex to be locked
879 *
880 * Returns 1 on success and 0 on contention
881 */
882int __sched rt_mutex_trylock(struct rt_mutex *lock)
883{
884 return rt_mutex_fasttrylock(lock, rt_mutex_slowtrylock);
885}
886EXPORT_SYMBOL_GPL(rt_mutex_trylock);
887
888/**
889 * rt_mutex_unlock - unlock a rt_mutex
890 *
891 * @lock: the rt_mutex to be unlocked
892 */
893void __sched rt_mutex_unlock(struct rt_mutex *lock)
894{
895 rt_mutex_fastunlock(lock, rt_mutex_slowunlock);
896}
897EXPORT_SYMBOL_GPL(rt_mutex_unlock);
898
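A short usage sketch of the API exported above, written as ordinary kernel code. The rt_mutex_init()/DEFINE_RT_MUTEX() declaration helpers are assumed to come from include/linux/rtmutex.h; only the calls defined in this file are taken as given.

#include <linux/rtmutex.h>

static struct rt_mutex demo_lock;       /* or: DEFINE_RT_MUTEX(demo_lock); */

static int demo_init(void)
{
        rt_mutex_init(&demo_lock);      /* ends up in __rt_mutex_init() */
        return 0;
}

static void demo_uninterruptible(void)
{
        rt_mutex_lock(&demo_lock);      /* may sleep; boosts the owner if we block */
        /* ... critical section ... */
        rt_mutex_unlock(&demo_lock);    /* undoes any PI boost we inherited */
}

static int demo_interruptible(void)
{
        int ret;

        /* second argument enables deadlock detection; 0 leaves it off here */
        ret = rt_mutex_lock_interruptible(&demo_lock, 0);
        if (ret)
                return ret;             /* -EINTR (or -EDEADLK with detection on) */
        /* ... critical section ... */
        rt_mutex_unlock(&demo_lock);
        return 0;
}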
899/***
900 * rt_mutex_destroy - mark a mutex unusable
901 * @lock: the mutex to be destroyed
902 *
903 * This function marks the mutex uninitialized, and any subsequent
904 * use of the mutex is forbidden. The mutex must not be locked when
905 * this function is called.
906 */
907void rt_mutex_destroy(struct rt_mutex *lock)
908{
909 WARN_ON(rt_mutex_is_locked(lock));
910#ifdef CONFIG_DEBUG_RT_MUTEXES
911 lock->magic = NULL;
912#endif
913}
914
915EXPORT_SYMBOL_GPL(rt_mutex_destroy);
916
917/**
918 * __rt_mutex_init - initialize the rt lock
919 *
920 * @lock: the rt lock to be initialized
921 *
922 * Initialize the rt lock to unlocked state.
923 *
924 * Initializing a locked rt lock is not allowed
925 */
926void __rt_mutex_init(struct rt_mutex *lock, const char *name)
927{
928 lock->owner = NULL;
929 spin_lock_init(&lock->wait_lock);
930 plist_head_init(&lock->wait_list, &lock->wait_lock);
931
932 debug_rt_mutex_init(lock, name);
933}
934EXPORT_SYMBOL_GPL(__rt_mutex_init);
935
936/**
937 * rt_mutex_init_proxy_locked - initialize and lock a rt_mutex on behalf of a
938 * proxy owner
939 *
940 * @lock: the rt_mutex to be locked
941 * @proxy_owner:the task to set as owner
942 *
943 * No locking. Caller has to do serializing itself
944 * Special API call for PI-futex support
945 */
946void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
947 struct task_struct *proxy_owner)
948{
949 __rt_mutex_init(lock, NULL);
950 debug_rt_mutex_proxy_lock(lock, proxy_owner);
951 rt_mutex_set_owner(lock, proxy_owner, 0);
952 rt_mutex_deadlock_account_lock(lock, proxy_owner);
953}
954
955/**
956 * rt_mutex_proxy_unlock - release a lock on behalf of owner
957 *
958 * @lock: the rt_mutex to be unlocked
959 *
960 * No locking. The caller has to do the serializing itself.
961 * Special API call for PI-futex support
962 */
963void rt_mutex_proxy_unlock(struct rt_mutex *lock,
964 struct task_struct *proxy_owner)
965{
966 debug_rt_mutex_proxy_unlock(lock);
967 rt_mutex_set_owner(lock, NULL, 0);
968 rt_mutex_deadlock_account_unlock(proxy_owner);
969}
970
971/**
972 * rt_mutex_next_owner - return the next owner of the lock
973 *
974 * @lock: the rt lock to query
975 *
976 * Returns the next owner of the lock or NULL
977 *
978 * Caller has to serialize against other accessors to the lock
979 * itself.
980 *
981 * Special API call for PI-futex support
982 */
983struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock)
984{
985 if (!rt_mutex_has_waiters(lock))
986 return NULL;
987
988 return rt_mutex_top_waiter(lock)->task;
989}
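
The rt_mutex_lock()/rt_mutex_trylock()/rt_mutex_unlock() entry points exported above form the public API of the new PI mutex. The sketch below shows how a caller might use them; it relies only on functions visible in this hunk, and the lock and function names (my_lock, my_driver_*) are hypothetical, not part of the patch:

#include <linux/rtmutex.h>

static struct rt_mutex my_lock;		/* hypothetical example lock */

static void my_driver_setup(void)
{
	/* Initialize to the unlocked state; the name is only used for debug. */
	__rt_mutex_init(&my_lock, "my_lock");
}

static void my_driver_op(void)
{
	/* Opportunistic attempt first: returns 1 on success, 0 on contention. */
	if (!rt_mutex_trylock(&my_lock)) {
		/* May sleep; while we block, the owner inherits our priority. */
		rt_mutex_lock(&my_lock);
	}

	/* ... critical section ... */

	rt_mutex_unlock(&my_lock);
}
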
diff --git a/kernel/rtmutex.h b/kernel/rtmutex.h
new file mode 100644
index 000000000000..a1a1dd06421d
--- /dev/null
+++ b/kernel/rtmutex.h
@@ -0,0 +1,26 @@
1/*
2 * RT-Mutexes: blocking mutual exclusion locks with PI support
3 *
4 * started by Ingo Molnar and Thomas Gleixner:
5 *
6 * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
7 * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
8 *
9 * This file contains macros used solely by rtmutex.c.
10 * Non-debug version.
11 */
12
13#define rt_mutex_deadlock_check(l) (0)
14#define rt_mutex_deadlock_account_lock(m, t) do { } while (0)
15#define rt_mutex_deadlock_account_unlock(l) do { } while (0)
16#define debug_rt_mutex_init_waiter(w) do { } while (0)
17#define debug_rt_mutex_free_waiter(w) do { } while (0)
18#define debug_rt_mutex_lock(l) do { } while (0)
19#define debug_rt_mutex_proxy_lock(l,p) do { } while (0)
20#define debug_rt_mutex_proxy_unlock(l) do { } while (0)
21#define debug_rt_mutex_unlock(l) do { } while (0)
22#define debug_rt_mutex_init(m, n) do { } while (0)
23#define debug_rt_mutex_deadlock(d, a ,l) do { } while (0)
24#define debug_rt_mutex_print_deadlock(w) do { } while (0)
25#define debug_rt_mutex_detect_deadlock(w,d) (d)
26#define debug_rt_mutex_reset_waiter(w) do { } while (0)
diff --git a/kernel/rtmutex_common.h b/kernel/rtmutex_common.h
new file mode 100644
index 000000000000..9c75856e791e
--- /dev/null
+++ b/kernel/rtmutex_common.h
@@ -0,0 +1,123 @@
1/*
2 * RT Mutexes: blocking mutual exclusion locks with PI support
3 *
4 * started by Ingo Molnar and Thomas Gleixner:
5 *
6 * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
7 * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
8 *
9 * This file contains the private data structure and API definitions.
10 */
11
12#ifndef __KERNEL_RTMUTEX_COMMON_H
13#define __KERNEL_RTMUTEX_COMMON_H
14
15#include <linux/rtmutex.h>
16
17/*
18 * The in-kernel rtmutex tester is independent of rtmutex debugging. We
19 * call schedule_rt_mutex_test() instead of schedule() for the tasks which
20 * belong to the tester. That way we can delay the wakeup path of those
21 * threads to provoke lock stealing and testing of complex boosting scenarios.
22 */
23#ifdef CONFIG_RT_MUTEX_TESTER
24
25extern void schedule_rt_mutex_test(struct rt_mutex *lock);
26
27#define schedule_rt_mutex(_lock) \
28 do { \
29 if (!(current->flags & PF_MUTEX_TESTER)) \
30 schedule(); \
31 else \
32 schedule_rt_mutex_test(_lock); \
33 } while (0)
34
35#else
36# define schedule_rt_mutex(_lock) schedule()
37#endif
38
39/*
40 * This is the control structure for tasks blocked on a rt_mutex,
41 * which is allocated on the kernel stack of the blocked task.
42 *
43 * @list_entry: pi node to enqueue into the mutex waiters list
44 * @pi_list_entry: pi node to enqueue into the mutex owner waiters list
45 * @task: task reference to the blocked task
46 */
47struct rt_mutex_waiter {
48 struct plist_node list_entry;
49 struct plist_node pi_list_entry;
50 struct task_struct *task;
51 struct rt_mutex *lock;
52#ifdef CONFIG_DEBUG_RT_MUTEXES
53 unsigned long ip;
54 pid_t deadlock_task_pid;
55 struct rt_mutex *deadlock_lock;
56#endif
57};
58
59/*
60 * Various helpers to access the waiters-plist:
61 */
62static inline int rt_mutex_has_waiters(struct rt_mutex *lock)
63{
64 return !plist_head_empty(&lock->wait_list);
65}
66
67static inline struct rt_mutex_waiter *
68rt_mutex_top_waiter(struct rt_mutex *lock)
69{
70 struct rt_mutex_waiter *w;
71
72 w = plist_first_entry(&lock->wait_list, struct rt_mutex_waiter,
73 list_entry);
74 BUG_ON(w->lock != lock);
75
76 return w;
77}
78
79static inline int task_has_pi_waiters(struct task_struct *p)
80{
81 return !plist_head_empty(&p->pi_waiters);
82}
83
84static inline struct rt_mutex_waiter *
85task_top_pi_waiter(struct task_struct *p)
86{
87 return plist_first_entry(&p->pi_waiters, struct rt_mutex_waiter,
88 pi_list_entry);
89}
90
91/*
92 * lock->owner state tracking:
93 */
94#define RT_MUTEX_OWNER_PENDING 1UL
95#define RT_MUTEX_HAS_WAITERS 2UL
96#define RT_MUTEX_OWNER_MASKALL 3UL
97
98static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock)
99{
100 return (struct task_struct *)
101 ((unsigned long)lock->owner & ~RT_MUTEX_OWNER_MASKALL);
102}
103
104static inline struct task_struct *rt_mutex_real_owner(struct rt_mutex *lock)
105{
106 return (struct task_struct *)
107 ((unsigned long)lock->owner & ~RT_MUTEX_HAS_WAITERS);
108}
109
110static inline unsigned long rt_mutex_owner_pending(struct rt_mutex *lock)
111{
112 return (unsigned long)lock->owner & RT_MUTEX_OWNER_PENDING;
113}
114
115/*
116 * PI-futex support (proxy locking functions, etc.):
117 */
118extern struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock);
119extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
120 struct task_struct *proxy_owner);
121extern void rt_mutex_proxy_unlock(struct rt_mutex *lock,
122 struct task_struct *proxy_owner);
123#endif
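
The helpers above rely on state bits packed into the low bits of the owner pointer: bit 0 (RT_MUTEX_OWNER_PENDING) marks a pending ownership grant, bit 1 (RT_MUTEX_HAS_WAITERS) marks a non-empty wait list, and rt_mutex_owner()/rt_mutex_real_owner() mask them off. A standalone userspace illustration of that encoding; the 0x1000 "task address" is made up and stands in for an aligned task_struct pointer:

#include <stdio.h>

#define RT_MUTEX_OWNER_PENDING	1UL
#define RT_MUTEX_HAS_WAITERS	2UL
#define RT_MUTEX_OWNER_MASKALL	3UL

int main(void)
{
	unsigned long task = 0x1000;	/* pretend task_struct address */
	unsigned long owner_field;

	/* Owner set while a waiter is enqueued and the grant is still pending. */
	owner_field = task | RT_MUTEX_HAS_WAITERS | RT_MUTEX_OWNER_PENDING;

	printf("rt_mutex_owner:      %#lx\n", owner_field & ~RT_MUTEX_OWNER_MASKALL);
	printf("rt_mutex_real_owner: %#lx\n", owner_field & ~RT_MUTEX_HAS_WAITERS);
	printf("owner pending:       %lu\n", owner_field & RT_MUTEX_OWNER_PENDING);
	return 0;
}
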
diff --git a/kernel/rwsem.c b/kernel/rwsem.c
new file mode 100644
index 000000000000..291ded556aa0
--- /dev/null
+++ b/kernel/rwsem.c
@@ -0,0 +1,147 @@
1/* kernel/rwsem.c: R/W semaphores, public implementation
2 *
3 * Written by David Howells (dhowells@redhat.com).
4 * Derived from asm-i386/semaphore.h
5 */
6
7#include <linux/types.h>
8#include <linux/kernel.h>
9#include <linux/module.h>
10#include <linux/rwsem.h>
11
12#include <asm/system.h>
13#include <asm/atomic.h>
14
15/*
16 * lock for reading
17 */
18void down_read(struct rw_semaphore *sem)
19{
20 might_sleep();
21 rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_);
22
23 __down_read(sem);
24}
25
26EXPORT_SYMBOL(down_read);
27
28/*
29 * trylock for reading -- returns 1 if successful, 0 if contention
30 */
31int down_read_trylock(struct rw_semaphore *sem)
32{
33 int ret = __down_read_trylock(sem);
34
35 if (ret == 1)
36 rwsem_acquire_read(&sem->dep_map, 0, 1, _RET_IP_);
37 return ret;
38}
39
40EXPORT_SYMBOL(down_read_trylock);
41
42/*
43 * lock for writing
44 */
45void down_write(struct rw_semaphore *sem)
46{
47 might_sleep();
48 rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_);
49
50 __down_write(sem);
51}
52
53EXPORT_SYMBOL(down_write);
54
55/*
56 * trylock for writing -- returns 1 if successful, 0 if contention
57 */
58int down_write_trylock(struct rw_semaphore *sem)
59{
60 int ret = __down_write_trylock(sem);
61
62 if (ret == 1)
63 rwsem_acquire(&sem->dep_map, 0, 0, _RET_IP_);
64 return ret;
65}
66
67EXPORT_SYMBOL(down_write_trylock);
68
69/*
70 * release a read lock
71 */
72void up_read(struct rw_semaphore *sem)
73{
74 rwsem_release(&sem->dep_map, 1, _RET_IP_);
75
76 __up_read(sem);
77}
78
79EXPORT_SYMBOL(up_read);
80
81/*
82 * release a write lock
83 */
84void up_write(struct rw_semaphore *sem)
85{
86 rwsem_release(&sem->dep_map, 1, _RET_IP_);
87
88 __up_write(sem);
89}
90
91EXPORT_SYMBOL(up_write);
92
93/*
94 * downgrade write lock to read lock
95 */
96void downgrade_write(struct rw_semaphore *sem)
97{
98 /*
99 * lockdep: a downgraded write will live on as a write
100 * dependency.
101 */
102 __downgrade_write(sem);
103}
104
105EXPORT_SYMBOL(downgrade_write);
106
107#ifdef CONFIG_DEBUG_LOCK_ALLOC
108
109void down_read_nested(struct rw_semaphore *sem, int subclass)
110{
111 might_sleep();
112 rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_);
113
114 __down_read(sem);
115}
116
117EXPORT_SYMBOL(down_read_nested);
118
119void down_read_non_owner(struct rw_semaphore *sem)
120{
121 might_sleep();
122
123 __down_read(sem);
124}
125
126EXPORT_SYMBOL(down_read_non_owner);
127
128void down_write_nested(struct rw_semaphore *sem, int subclass)
129{
130 might_sleep();
131 rwsem_acquire(&sem->dep_map, subclass, 0, _RET_IP_);
132
133 __down_write_nested(sem, subclass);
134}
135
136EXPORT_SYMBOL(down_write_nested);
137
138void up_read_non_owner(struct rw_semaphore *sem)
139{
140 __up_read(sem);
141}
142
143EXPORT_SYMBOL(up_read_non_owner);
144
145#endif
146
147
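
These wrappers place the lockdep annotations (rwsem_acquire*/rwsem_release) around the architecture-provided __down_read()/__down_write() primitives. A minimal usage sketch, assuming only <linux/rwsem.h>; the semaphore, data and both functions are hypothetical names used for illustration:

#include <linux/rwsem.h>

static DECLARE_RWSEM(my_sem);
static int my_data;

static int my_reader(void)
{
	int val;

	down_read(&my_sem);		/* many readers may hold this at once */
	val = my_data;
	up_read(&my_sem);
	return val;
}

static void my_writer(int val)
{
	down_write(&my_sem);		/* exclusive: excludes readers and writers */
	my_data = val;
	downgrade_write(&my_sem);	/* keep reading without letting writers in */
	/* ... read-side work ... */
	up_read(&my_sem);
}
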
diff --git a/kernel/sched.c b/kernel/sched.c
index c13f1bd2df7d..b44b9a43b0fc 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -30,6 +30,7 @@
30#include <linux/capability.h> 30#include <linux/capability.h>
31#include <linux/completion.h> 31#include <linux/completion.h>
32#include <linux/kernel_stat.h> 32#include <linux/kernel_stat.h>
33#include <linux/debug_locks.h>
33#include <linux/security.h> 34#include <linux/security.h>
34#include <linux/notifier.h> 35#include <linux/notifier.h>
35#include <linux/profile.h> 36#include <linux/profile.h>
@@ -50,6 +51,7 @@
50#include <linux/times.h> 51#include <linux/times.h>
51#include <linux/acct.h> 52#include <linux/acct.h>
52#include <linux/kprobes.h> 53#include <linux/kprobes.h>
54#include <linux/delayacct.h>
53#include <asm/tlb.h> 55#include <asm/tlb.h>
54 56
55#include <asm/unistd.h> 57#include <asm/unistd.h>
@@ -168,29 +170,28 @@
168 */ 170 */
169 171
170#define SCALE_PRIO(x, prio) \ 172#define SCALE_PRIO(x, prio) \
171 max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO/2), MIN_TIMESLICE) 173 max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE)
172 174
173static unsigned int task_timeslice(task_t *p) 175static unsigned int static_prio_timeslice(int static_prio)
174{ 176{
175 if (p->static_prio < NICE_TO_PRIO(0)) 177 if (static_prio < NICE_TO_PRIO(0))
176 return SCALE_PRIO(DEF_TIMESLICE*4, p->static_prio); 178 return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio);
177 else 179 else
178 return SCALE_PRIO(DEF_TIMESLICE, p->static_prio); 180 return SCALE_PRIO(DEF_TIMESLICE, static_prio);
181}
182
183static inline unsigned int task_timeslice(struct task_struct *p)
184{
185 return static_prio_timeslice(p->static_prio);
179} 186}
180#define task_hot(p, now, sd) ((long long) ((now) - (p)->last_ran) \
181 < (long long) (sd)->cache_hot_time)
182 187
183/* 188/*
184 * These are the runqueue data structures: 189 * These are the runqueue data structures:
185 */ 190 */
186 191
187#define BITMAP_SIZE ((((MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long))
188
189typedef struct runqueue runqueue_t;
190
191struct prio_array { 192struct prio_array {
192 unsigned int nr_active; 193 unsigned int nr_active;
193 unsigned long bitmap[BITMAP_SIZE]; 194 DECLARE_BITMAP(bitmap, MAX_PRIO+1); /* include 1 bit for delimiter */
194 struct list_head queue[MAX_PRIO]; 195 struct list_head queue[MAX_PRIO];
195}; 196};
196 197
@@ -201,7 +202,7 @@ struct prio_array {
201 * (such as the load balancing or the thread migration code), lock 202 * (such as the load balancing or the thread migration code), lock
202 * acquire operations must be ordered by ascending &runqueue. 203 * acquire operations must be ordered by ascending &runqueue.
203 */ 204 */
204struct runqueue { 205struct rq {
205 spinlock_t lock; 206 spinlock_t lock;
206 207
207 /* 208 /*
@@ -209,6 +210,7 @@ struct runqueue {
209 * remote CPUs use both these fields when doing load calculation. 210 * remote CPUs use both these fields when doing load calculation.
210 */ 211 */
211 unsigned long nr_running; 212 unsigned long nr_running;
213 unsigned long raw_weighted_load;
212#ifdef CONFIG_SMP 214#ifdef CONFIG_SMP
213 unsigned long cpu_load[3]; 215 unsigned long cpu_load[3];
214#endif 216#endif
@@ -224,9 +226,9 @@ struct runqueue {
224 226
225 unsigned long expired_timestamp; 227 unsigned long expired_timestamp;
226 unsigned long long timestamp_last_tick; 228 unsigned long long timestamp_last_tick;
227 task_t *curr, *idle; 229 struct task_struct *curr, *idle;
228 struct mm_struct *prev_mm; 230 struct mm_struct *prev_mm;
229 prio_array_t *active, *expired, arrays[2]; 231 struct prio_array *active, *expired, arrays[2];
230 int best_expired_prio; 232 int best_expired_prio;
231 atomic_t nr_iowait; 233 atomic_t nr_iowait;
232 234
@@ -237,9 +239,8 @@ struct runqueue {
237 int active_balance; 239 int active_balance;
238 int push_cpu; 240 int push_cpu;
239 241
240 task_t *migration_thread; 242 struct task_struct *migration_thread;
241 struct list_head migration_queue; 243 struct list_head migration_queue;
242 int cpu;
243#endif 244#endif
244 245
245#ifdef CONFIG_SCHEDSTATS 246#ifdef CONFIG_SCHEDSTATS
@@ -261,9 +262,10 @@ struct runqueue {
261 unsigned long ttwu_cnt; 262 unsigned long ttwu_cnt;
262 unsigned long ttwu_local; 263 unsigned long ttwu_local;
263#endif 264#endif
265 struct lock_class_key rq_lock_key;
264}; 266};
265 267
266static DEFINE_PER_CPU(struct runqueue, runqueues); 268static DEFINE_PER_CPU(struct rq, runqueues);
267 269
268/* 270/*
269 * The domain tree (rq->sd) is protected by RCU's quiescent state transition. 271 * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
@@ -272,8 +274,8 @@ static DEFINE_PER_CPU(struct runqueue, runqueues);
272 * The domain tree of any CPU may only be accessed from within 274 * The domain tree of any CPU may only be accessed from within
273 * preempt-disabled sections. 275 * preempt-disabled sections.
274 */ 276 */
275#define for_each_domain(cpu, domain) \ 277#define for_each_domain(cpu, __sd) \
276for (domain = rcu_dereference(cpu_rq(cpu)->sd); domain; domain = domain->parent) 278 for (__sd = rcu_dereference(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)
277 279
278#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) 280#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
279#define this_rq() (&__get_cpu_var(runqueues)) 281#define this_rq() (&__get_cpu_var(runqueues))
@@ -288,26 +290,33 @@ for (domain = rcu_dereference(cpu_rq(cpu)->sd); domain; domain = domain->parent)
288#endif 290#endif
289 291
290#ifndef __ARCH_WANT_UNLOCKED_CTXSW 292#ifndef __ARCH_WANT_UNLOCKED_CTXSW
291static inline int task_running(runqueue_t *rq, task_t *p) 293static inline int task_running(struct rq *rq, struct task_struct *p)
292{ 294{
293 return rq->curr == p; 295 return rq->curr == p;
294} 296}
295 297
296static inline void prepare_lock_switch(runqueue_t *rq, task_t *next) 298static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
297{ 299{
298} 300}
299 301
300static inline void finish_lock_switch(runqueue_t *rq, task_t *prev) 302static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
301{ 303{
302#ifdef CONFIG_DEBUG_SPINLOCK 304#ifdef CONFIG_DEBUG_SPINLOCK
303 /* this is a valid case when another task releases the spinlock */ 305 /* this is a valid case when another task releases the spinlock */
304 rq->lock.owner = current; 306 rq->lock.owner = current;
305#endif 307#endif
308 /*
309 * If we are tracking spinlock dependencies then we have to
310 * fix up the runqueue lock - which gets 'carried over' from
311 * prev into current:
312 */
313 spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_);
314
306 spin_unlock_irq(&rq->lock); 315 spin_unlock_irq(&rq->lock);
307} 316}
308 317
309#else /* __ARCH_WANT_UNLOCKED_CTXSW */ 318#else /* __ARCH_WANT_UNLOCKED_CTXSW */
310static inline int task_running(runqueue_t *rq, task_t *p) 319static inline int task_running(struct rq *rq, struct task_struct *p)
311{ 320{
312#ifdef CONFIG_SMP 321#ifdef CONFIG_SMP
313 return p->oncpu; 322 return p->oncpu;
@@ -316,7 +325,7 @@ static inline int task_running(runqueue_t *rq, task_t *p)
316#endif 325#endif
317} 326}
318 327
319static inline void prepare_lock_switch(runqueue_t *rq, task_t *next) 328static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
320{ 329{
321#ifdef CONFIG_SMP 330#ifdef CONFIG_SMP
322 /* 331 /*
@@ -333,7 +342,7 @@ static inline void prepare_lock_switch(runqueue_t *rq, task_t *next)
333#endif 342#endif
334} 343}
335 344
336static inline void finish_lock_switch(runqueue_t *rq, task_t *prev) 345static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
337{ 346{
338#ifdef CONFIG_SMP 347#ifdef CONFIG_SMP
339 /* 348 /*
@@ -351,14 +360,33 @@ static inline void finish_lock_switch(runqueue_t *rq, task_t *prev)
351#endif /* __ARCH_WANT_UNLOCKED_CTXSW */ 360#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
352 361
353/* 362/*
363 * __task_rq_lock - lock the runqueue a given task resides on.
364 * Must be called with interrupts disabled.
365 */
366static inline struct rq *__task_rq_lock(struct task_struct *p)
367 __acquires(rq->lock)
368{
369 struct rq *rq;
370
371repeat_lock_task:
372 rq = task_rq(p);
373 spin_lock(&rq->lock);
374 if (unlikely(rq != task_rq(p))) {
375 spin_unlock(&rq->lock);
376 goto repeat_lock_task;
377 }
378 return rq;
379}
380
381/*
354 * task_rq_lock - lock the runqueue a given task resides on and disable 382 * task_rq_lock - lock the runqueue a given task resides on and disable
355 * interrupts. Note the ordering: we can safely lookup the task_rq without 383 * interrupts. Note the ordering: we can safely lookup the task_rq without
356 * explicitly disabling preemption. 384 * explicitly disabling preemption.
357 */ 385 */
358static inline runqueue_t *task_rq_lock(task_t *p, unsigned long *flags) 386static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
359 __acquires(rq->lock) 387 __acquires(rq->lock)
360{ 388{
361 struct runqueue *rq; 389 struct rq *rq;
362 390
363repeat_lock_task: 391repeat_lock_task:
364 local_irq_save(*flags); 392 local_irq_save(*flags);
@@ -371,7 +399,13 @@ repeat_lock_task:
371 return rq; 399 return rq;
372} 400}
373 401
374static inline void task_rq_unlock(runqueue_t *rq, unsigned long *flags) 402static inline void __task_rq_unlock(struct rq *rq)
403 __releases(rq->lock)
404{
405 spin_unlock(&rq->lock);
406}
407
408static inline void task_rq_unlock(struct rq *rq, unsigned long *flags)
375 __releases(rq->lock) 409 __releases(rq->lock)
376{ 410{
377 spin_unlock_irqrestore(&rq->lock, *flags); 411 spin_unlock_irqrestore(&rq->lock, *flags);
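
__task_rq_lock()/task_rq_lock() above use a lock-then-revalidate loop: task_rq(p) is sampled without any lock, the runqueue lock is taken, and if the task has meanwhile migrated to a different runqueue the lock is dropped and the lookup retried. A generic sketch of the same pattern in plain pthread code; every name in it is hypothetical, "home" plays the role of task_rq(p):

#include <pthread.h>
#include <stdatomic.h>

struct queue {
	pthread_mutex_t lock;
	/* ... */
};

struct object {
	_Atomic(struct queue *) home;	/* may be changed by other threads */
};

static struct queue *object_queue_lock(struct object *obj)
{
	struct queue *q;

	for (;;) {
		q = atomic_load(&obj->home);	/* snapshot without the lock */
		pthread_mutex_lock(&q->lock);
		if (q == atomic_load(&obj->home))
			return q;		/* still the right queue: done */
		pthread_mutex_unlock(&q->lock);	/* it moved: retry */
	}
}
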
@@ -391,7 +425,7 @@ static int show_schedstat(struct seq_file *seq, void *v)
391 seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION); 425 seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION);
392 seq_printf(seq, "timestamp %lu\n", jiffies); 426 seq_printf(seq, "timestamp %lu\n", jiffies);
393 for_each_online_cpu(cpu) { 427 for_each_online_cpu(cpu) {
394 runqueue_t *rq = cpu_rq(cpu); 428 struct rq *rq = cpu_rq(cpu);
395#ifdef CONFIG_SMP 429#ifdef CONFIG_SMP
396 struct sched_domain *sd; 430 struct sched_domain *sd;
397 int dcnt = 0; 431 int dcnt = 0;
@@ -468,9 +502,36 @@ struct file_operations proc_schedstat_operations = {
468 .release = single_release, 502 .release = single_release,
469}; 503};
470 504
505/*
506 * Expects runqueue lock to be held for atomicity of update
507 */
508static inline void
509rq_sched_info_arrive(struct rq *rq, unsigned long delta_jiffies)
510{
511 if (rq) {
512 rq->rq_sched_info.run_delay += delta_jiffies;
513 rq->rq_sched_info.pcnt++;
514 }
515}
516
517/*
518 * Expects runqueue lock to be held for atomicity of update
519 */
520static inline void
521rq_sched_info_depart(struct rq *rq, unsigned long delta_jiffies)
522{
523 if (rq)
524 rq->rq_sched_info.cpu_time += delta_jiffies;
525}
471# define schedstat_inc(rq, field) do { (rq)->field++; } while (0) 526# define schedstat_inc(rq, field) do { (rq)->field++; } while (0)
472# define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0) 527# define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0)
473#else /* !CONFIG_SCHEDSTATS */ 528#else /* !CONFIG_SCHEDSTATS */
529static inline void
530rq_sched_info_arrive(struct rq *rq, unsigned long delta_jiffies)
531{}
532static inline void
533rq_sched_info_depart(struct rq *rq, unsigned long delta_jiffies)
534{}
474# define schedstat_inc(rq, field) do { } while (0) 535# define schedstat_inc(rq, field) do { } while (0)
475# define schedstat_add(rq, field, amt) do { } while (0) 536# define schedstat_add(rq, field, amt) do { } while (0)
476#endif 537#endif
@@ -478,10 +539,10 @@ struct file_operations proc_schedstat_operations = {
478/* 539/*
479 * rq_lock - lock a given runqueue and disable interrupts. 540 * rq_lock - lock a given runqueue and disable interrupts.
480 */ 541 */
481static inline runqueue_t *this_rq_lock(void) 542static inline struct rq *this_rq_lock(void)
482 __acquires(rq->lock) 543 __acquires(rq->lock)
483{ 544{
484 runqueue_t *rq; 545 struct rq *rq;
485 546
486 local_irq_disable(); 547 local_irq_disable();
487 rq = this_rq(); 548 rq = this_rq();
@@ -490,7 +551,7 @@ static inline runqueue_t *this_rq_lock(void)
490 return rq; 551 return rq;
491} 552}
492 553
493#ifdef CONFIG_SCHEDSTATS 554#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
494/* 555/*
495 * Called when a process is dequeued from the active array and given 556 * Called when a process is dequeued from the active array and given
496 * the cpu. We should note that with the exception of interactive 557 * the cpu. We should note that with the exception of interactive
@@ -506,7 +567,7 @@ static inline runqueue_t *this_rq_lock(void)
506 * long it was from the *first* time it was queued to the time that it 567 * long it was from the *first* time it was queued to the time that it
507 * finally hit a cpu. 568 * finally hit a cpu.
508 */ 569 */
509static inline void sched_info_dequeued(task_t *t) 570static inline void sched_info_dequeued(struct task_struct *t)
510{ 571{
511 t->sched_info.last_queued = 0; 572 t->sched_info.last_queued = 0;
512} 573}
@@ -516,23 +577,18 @@ static inline void sched_info_dequeued(task_t *t)
516 * long it was waiting to run. We also note when it began so that we 577 * long it was waiting to run. We also note when it began so that we
517 * can keep stats on how long its timeslice is. 578 * can keep stats on how long its timeslice is.
518 */ 579 */
519static void sched_info_arrive(task_t *t) 580static void sched_info_arrive(struct task_struct *t)
520{ 581{
521 unsigned long now = jiffies, diff = 0; 582 unsigned long now = jiffies, delta_jiffies = 0;
522 struct runqueue *rq = task_rq(t);
523 583
524 if (t->sched_info.last_queued) 584 if (t->sched_info.last_queued)
525 diff = now - t->sched_info.last_queued; 585 delta_jiffies = now - t->sched_info.last_queued;
526 sched_info_dequeued(t); 586 sched_info_dequeued(t);
527 t->sched_info.run_delay += diff; 587 t->sched_info.run_delay += delta_jiffies;
528 t->sched_info.last_arrival = now; 588 t->sched_info.last_arrival = now;
529 t->sched_info.pcnt++; 589 t->sched_info.pcnt++;
530 590
531 if (!rq) 591 rq_sched_info_arrive(task_rq(t), delta_jiffies);
532 return;
533
534 rq->rq_sched_info.run_delay += diff;
535 rq->rq_sched_info.pcnt++;
536} 592}
537 593
538/* 594/*
@@ -550,25 +606,23 @@ static void sched_info_arrive(task_t *t)
550 * the timestamp if it is already not set. It's assumed that 606 * the timestamp if it is already not set. It's assumed that
551 * sched_info_dequeued() will clear that stamp when appropriate. 607 * sched_info_dequeued() will clear that stamp when appropriate.
552 */ 608 */
553static inline void sched_info_queued(task_t *t) 609static inline void sched_info_queued(struct task_struct *t)
554{ 610{
555 if (!t->sched_info.last_queued) 611 if (unlikely(sched_info_on()))
556 t->sched_info.last_queued = jiffies; 612 if (!t->sched_info.last_queued)
613 t->sched_info.last_queued = jiffies;
557} 614}
558 615
559/* 616/*
560 * Called when a process ceases being the active-running process, either 617 * Called when a process ceases being the active-running process, either
561 * voluntarily or involuntarily. Now we can calculate how long we ran. 618 * voluntarily or involuntarily. Now we can calculate how long we ran.
562 */ 619 */
563static inline void sched_info_depart(task_t *t) 620static inline void sched_info_depart(struct task_struct *t)
564{ 621{
565 struct runqueue *rq = task_rq(t); 622 unsigned long delta_jiffies = jiffies - t->sched_info.last_arrival;
566 unsigned long diff = jiffies - t->sched_info.last_arrival;
567
568 t->sched_info.cpu_time += diff;
569 623
570 if (rq) 624 t->sched_info.cpu_time += delta_jiffies;
571 rq->rq_sched_info.cpu_time += diff; 625 rq_sched_info_depart(task_rq(t), delta_jiffies);
572} 626}
573 627
574/* 628/*
@@ -576,9 +630,10 @@ static inline void sched_info_depart(task_t *t)
576 * their time slice. (This may also be called when switching to or from 630 * their time slice. (This may also be called when switching to or from
577 * the idle task.) We are only called when prev != next. 631 * the idle task.) We are only called when prev != next.
578 */ 632 */
579static inline void sched_info_switch(task_t *prev, task_t *next) 633static inline void
634__sched_info_switch(struct task_struct *prev, struct task_struct *next)
580{ 635{
581 struct runqueue *rq = task_rq(prev); 636 struct rq *rq = task_rq(prev);
582 637
583 /* 638 /*
584 * prev now departs the cpu. It's not interesting to record 639 * prev now departs the cpu. It's not interesting to record
@@ -591,15 +646,21 @@ static inline void sched_info_switch(task_t *prev, task_t *next)
591 if (next != rq->idle) 646 if (next != rq->idle)
592 sched_info_arrive(next); 647 sched_info_arrive(next);
593} 648}
649static inline void
650sched_info_switch(struct task_struct *prev, struct task_struct *next)
651{
652 if (unlikely(sched_info_on()))
653 __sched_info_switch(prev, next);
654}
594#else 655#else
595#define sched_info_queued(t) do { } while (0) 656#define sched_info_queued(t) do { } while (0)
596#define sched_info_switch(t, next) do { } while (0) 657#define sched_info_switch(t, next) do { } while (0)
597#endif /* CONFIG_SCHEDSTATS */ 658#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */
598 659
599/* 660/*
600 * Adding/removing a task to/from a priority array: 661 * Adding/removing a task to/from a priority array:
601 */ 662 */
602static void dequeue_task(struct task_struct *p, prio_array_t *array) 663static void dequeue_task(struct task_struct *p, struct prio_array *array)
603{ 664{
604 array->nr_active--; 665 array->nr_active--;
605 list_del(&p->run_list); 666 list_del(&p->run_list);
@@ -607,7 +668,7 @@ static void dequeue_task(struct task_struct *p, prio_array_t *array)
607 __clear_bit(p->prio, array->bitmap); 668 __clear_bit(p->prio, array->bitmap);
608} 669}
609 670
610static void enqueue_task(struct task_struct *p, prio_array_t *array) 671static void enqueue_task(struct task_struct *p, struct prio_array *array)
611{ 672{
612 sched_info_queued(p); 673 sched_info_queued(p);
613 list_add_tail(&p->run_list, array->queue + p->prio); 674 list_add_tail(&p->run_list, array->queue + p->prio);
@@ -620,12 +681,13 @@ static void enqueue_task(struct task_struct *p, prio_array_t *array)
620 * Put task to the end of the run list without the overhead of dequeue 681 * Put task to the end of the run list without the overhead of dequeue
621 * followed by enqueue. 682 * followed by enqueue.
622 */ 683 */
623static void requeue_task(struct task_struct *p, prio_array_t *array) 684static void requeue_task(struct task_struct *p, struct prio_array *array)
624{ 685{
625 list_move_tail(&p->run_list, array->queue + p->prio); 686 list_move_tail(&p->run_list, array->queue + p->prio);
626} 687}
627 688
628static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array) 689static inline void
690enqueue_task_head(struct task_struct *p, struct prio_array *array)
629{ 691{
630 list_add(&p->run_list, array->queue + p->prio); 692 list_add(&p->run_list, array->queue + p->prio);
631 __set_bit(p->prio, array->bitmap); 693 __set_bit(p->prio, array->bitmap);
@@ -634,7 +696,7 @@ static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array)
634} 696}
635 697
636/* 698/*
637 * effective_prio - return the priority that is based on the static 699 * __normal_prio - return the priority that is based on the static
638 * priority but is modified by bonuses/penalties. 700 * priority but is modified by bonuses/penalties.
639 * 701 *
640 * We scale the actual sleep average [0 .... MAX_SLEEP_AVG] 702 * We scale the actual sleep average [0 .... MAX_SLEEP_AVG]
@@ -647,13 +709,11 @@ static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array)
647 * 709 *
648 * Both properties are important to certain workloads. 710 * Both properties are important to certain workloads.
649 */ 711 */
650static int effective_prio(task_t *p) 712
713static inline int __normal_prio(struct task_struct *p)
651{ 714{
652 int bonus, prio; 715 int bonus, prio;
653 716
654 if (rt_task(p))
655 return p->prio;
656
657 bonus = CURRENT_BONUS(p) - MAX_BONUS / 2; 717 bonus = CURRENT_BONUS(p) - MAX_BONUS / 2;
658 718
659 prio = p->static_prio - bonus; 719 prio = p->static_prio - bonus;
@@ -665,57 +725,165 @@ static int effective_prio(task_t *p)
665} 725}
666 726
667/* 727/*
728 * To aid in avoiding the subversion of "niceness" due to uneven distribution
729 * of tasks with abnormal "nice" values across CPUs the contribution that
730 * each task makes to its run queue's load is weighted according to its
731 * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a
732 * scaled version of the new time slice allocation that they receive on time
733 * slice expiry etc.
734 */
735
736/*
737 * Assume: static_prio_timeslice(NICE_TO_PRIO(0)) == DEF_TIMESLICE
738 * If static_prio_timeslice() is ever changed to break this assumption then
739 * this code will need modification
740 */
741#define TIME_SLICE_NICE_ZERO DEF_TIMESLICE
742#define LOAD_WEIGHT(lp) \
743 (((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO)
744#define PRIO_TO_LOAD_WEIGHT(prio) \
745 LOAD_WEIGHT(static_prio_timeslice(prio))
746#define RTPRIO_TO_LOAD_WEIGHT(rp) \
747 (PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + LOAD_WEIGHT(rp))
748
749static void set_load_weight(struct task_struct *p)
750{
751 if (has_rt_policy(p)) {
752#ifdef CONFIG_SMP
753 if (p == task_rq(p)->migration_thread)
754 /*
755 * The migration thread does the actual balancing.
756 * Giving its load any weight will skew balancing
757 * adversely.
758 */
759 p->load_weight = 0;
760 else
761#endif
762 p->load_weight = RTPRIO_TO_LOAD_WEIGHT(p->rt_priority);
763 } else
764 p->load_weight = PRIO_TO_LOAD_WEIGHT(p->static_prio);
765}
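
set_load_weight() above turns a task's static priority into the per-runqueue load contribution used by the weighted load balancing; as the comment notes, LOAD_WEIGHT() assumes static_prio_timeslice(NICE_TO_PRIO(0)) == DEF_TIMESLICE. A rough standalone rerun of that arithmetic, using assumed constants (DEF_TIMESLICE 100 ms, SCHED_LOAD_SCALE 128, MIN_TIMESLICE 5 ms); the real values depend on HZ and the kernel configuration:

#include <stdio.h>

#define MIN_TIMESLICE		5	/* assumed, in ms */
#define DEF_TIMESLICE		100	/* assumed, in ms */
#define MAX_PRIO		140
#define MAX_USER_PRIO		40
#define NICE_TO_PRIO(nice)	(120 + (nice))
#define SCHED_LOAD_SCALE	128UL

static unsigned long scale_prio(unsigned long x, int prio)
{
	unsigned long slice = x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2);

	return slice > MIN_TIMESLICE ? slice : MIN_TIMESLICE;
}

static unsigned long static_prio_timeslice(int static_prio)
{
	if (static_prio < NICE_TO_PRIO(0))
		return scale_prio(DEF_TIMESLICE * 4, static_prio);
	return scale_prio(DEF_TIMESLICE, static_prio);
}

#define LOAD_WEIGHT(lp)	(((lp) * SCHED_LOAD_SCALE) / DEF_TIMESLICE)

int main(void)
{
	int nices[] = { -20, 0, 19 }, i;

	/* With these constants: nice -20 -> 1024, nice 0 -> 128, nice 19 -> 6 */
	for (i = 0; i < 3; i++) {
		unsigned long slice = static_prio_timeslice(NICE_TO_PRIO(nices[i]));

		printf("nice %3d: timeslice %3lu ms, load weight %4lu\n",
		       nices[i], slice, LOAD_WEIGHT(slice));
	}
	return 0;
}
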
766
767static inline void
768inc_raw_weighted_load(struct rq *rq, const struct task_struct *p)
769{
770 rq->raw_weighted_load += p->load_weight;
771}
772
773static inline void
774dec_raw_weighted_load(struct rq *rq, const struct task_struct *p)
775{
776 rq->raw_weighted_load -= p->load_weight;
777}
778
779static inline void inc_nr_running(struct task_struct *p, struct rq *rq)
780{
781 rq->nr_running++;
782 inc_raw_weighted_load(rq, p);
783}
784
785static inline void dec_nr_running(struct task_struct *p, struct rq *rq)
786{
787 rq->nr_running--;
788 dec_raw_weighted_load(rq, p);
789}
790
791/*
792 * Calculate the expected normal priority: i.e. priority
793 * without taking RT-inheritance into account. Might be
794 * boosted by interactivity modifiers. Changes upon fork,
795 * setprio syscalls, and whenever the interactivity
796 * estimator recalculates.
797 */
798static inline int normal_prio(struct task_struct *p)
799{
800 int prio;
801
802 if (has_rt_policy(p))
803 prio = MAX_RT_PRIO-1 - p->rt_priority;
804 else
805 prio = __normal_prio(p);
806 return prio;
807}
808
809/*
810 * Calculate the current priority, i.e. the priority
811 * taken into account by the scheduler. This value might
812 * be boosted by RT tasks, or might be boosted by
813 * interactivity modifiers. Will be RT if the task got
814 * RT-boosted. If not then it returns p->normal_prio.
815 */
816static int effective_prio(struct task_struct *p)
817{
818 p->normal_prio = normal_prio(p);
819 /*
820 * If we are RT tasks or we were boosted to RT priority,
821 * keep the priority unchanged. Otherwise, update priority
822 * to the normal priority:
823 */
824 if (!rt_prio(p->prio))
825 return p->normal_prio;
826 return p->prio;
827}
828
829/*
668 * __activate_task - move a task to the runqueue. 830 * __activate_task - move a task to the runqueue.
669 */ 831 */
670static void __activate_task(task_t *p, runqueue_t *rq) 832static void __activate_task(struct task_struct *p, struct rq *rq)
671{ 833{
672 prio_array_t *target = rq->active; 834 struct prio_array *target = rq->active;
673 835
674 if (batch_task(p)) 836 if (batch_task(p))
675 target = rq->expired; 837 target = rq->expired;
676 enqueue_task(p, target); 838 enqueue_task(p, target);
677 rq->nr_running++; 839 inc_nr_running(p, rq);
678} 840}
679 841
680/* 842/*
681 * __activate_idle_task - move idle task to the _front_ of runqueue. 843 * __activate_idle_task - move idle task to the _front_ of runqueue.
682 */ 844 */
683static inline void __activate_idle_task(task_t *p, runqueue_t *rq) 845static inline void __activate_idle_task(struct task_struct *p, struct rq *rq)
684{ 846{
685 enqueue_task_head(p, rq->active); 847 enqueue_task_head(p, rq->active);
686 rq->nr_running++; 848 inc_nr_running(p, rq);
687} 849}
688 850
689static int recalc_task_prio(task_t *p, unsigned long long now) 851/*
852 * Recalculate p->normal_prio and p->prio after having slept,
853 * updating the sleep-average too:
854 */
855static int recalc_task_prio(struct task_struct *p, unsigned long long now)
690{ 856{
691 /* Caller must always ensure 'now >= p->timestamp' */ 857 /* Caller must always ensure 'now >= p->timestamp' */
692 unsigned long long __sleep_time = now - p->timestamp; 858 unsigned long sleep_time = now - p->timestamp;
693 unsigned long sleep_time;
694 859
695 if (batch_task(p)) 860 if (batch_task(p))
696 sleep_time = 0; 861 sleep_time = 0;
697 else {
698 if (__sleep_time > NS_MAX_SLEEP_AVG)
699 sleep_time = NS_MAX_SLEEP_AVG;
700 else
701 sleep_time = (unsigned long)__sleep_time;
702 }
703 862
704 if (likely(sleep_time > 0)) { 863 if (likely(sleep_time > 0)) {
705 /* 864 /*
706 * User tasks that sleep a long time are categorised as 865 * This ceiling is set to the lowest priority that would allow
707 * idle. They will only have their sleep_avg increased to a 866 * a task to be reinserted into the active array on timeslice
708 * level that makes them just interactive priority to stay 867 * completion.
709 * active yet prevent them suddenly becoming cpu hogs and
710 * starving other processes.
711 */ 868 */
712 if (p->mm && sleep_time > INTERACTIVE_SLEEP(p)) { 869 unsigned long ceiling = INTERACTIVE_SLEEP(p);
713 unsigned long ceiling;
714 870
715 ceiling = JIFFIES_TO_NS(MAX_SLEEP_AVG - 871 if (p->mm && sleep_time > ceiling && p->sleep_avg < ceiling) {
716 DEF_TIMESLICE); 872 /*
717 if (p->sleep_avg < ceiling) 873 * Prevents user tasks from achieving best priority
718 p->sleep_avg = ceiling; 874 * with one single large enough sleep.
875 */
876 p->sleep_avg = ceiling;
877 /*
878 * Using INTERACTIVE_SLEEP() as a ceiling places a
879 * nice(0) task 1ms sleep away from promotion, and
880 * gives it 700ms to round-robin with no chance of
881 * being demoted. This is more than generous, so
882 * mark this sleep as non-interactive to prevent the
883 * on-runqueue bonus logic from intervening should
884 * this task not receive cpu immediately.
885 */
886 p->sleep_type = SLEEP_NONINTERACTIVE;
719 } else { 887 } else {
720 /* 888 /*
721 * Tasks waking from uninterruptible sleep are 889 * Tasks waking from uninterruptible sleep are
@@ -723,12 +891,12 @@ static int recalc_task_prio(task_t *p, unsigned long long now)
723 * are likely to be waiting on I/O 891 * are likely to be waiting on I/O
724 */ 892 */
725 if (p->sleep_type == SLEEP_NONINTERACTIVE && p->mm) { 893 if (p->sleep_type == SLEEP_NONINTERACTIVE && p->mm) {
726 if (p->sleep_avg >= INTERACTIVE_SLEEP(p)) 894 if (p->sleep_avg >= ceiling)
727 sleep_time = 0; 895 sleep_time = 0;
728 else if (p->sleep_avg + sleep_time >= 896 else if (p->sleep_avg + sleep_time >=
729 INTERACTIVE_SLEEP(p)) { 897 ceiling) {
730 p->sleep_avg = INTERACTIVE_SLEEP(p); 898 p->sleep_avg = ceiling;
731 sleep_time = 0; 899 sleep_time = 0;
732 } 900 }
733 } 901 }
734 902
@@ -742,9 +910,9 @@ static int recalc_task_prio(task_t *p, unsigned long long now)
742 */ 910 */
743 p->sleep_avg += sleep_time; 911 p->sleep_avg += sleep_time;
744 912
745 if (p->sleep_avg > NS_MAX_SLEEP_AVG)
746 p->sleep_avg = NS_MAX_SLEEP_AVG;
747 } 913 }
914 if (p->sleep_avg > NS_MAX_SLEEP_AVG)
915 p->sleep_avg = NS_MAX_SLEEP_AVG;
748 } 916 }
749 917
750 return effective_prio(p); 918 return effective_prio(p);
@@ -756,7 +924,7 @@ static int recalc_task_prio(task_t *p, unsigned long long now)
756 * Update all the scheduling statistics stuff. (sleep average 924 * Update all the scheduling statistics stuff. (sleep average
757 * calculation, priority modifiers, etc.) 925 * calculation, priority modifiers, etc.)
758 */ 926 */
759static void activate_task(task_t *p, runqueue_t *rq, int local) 927static void activate_task(struct task_struct *p, struct rq *rq, int local)
760{ 928{
761 unsigned long long now; 929 unsigned long long now;
762 930
@@ -764,7 +932,7 @@ static void activate_task(task_t *p, runqueue_t *rq, int local)
764#ifdef CONFIG_SMP 932#ifdef CONFIG_SMP
765 if (!local) { 933 if (!local) {
766 /* Compensate for drifting sched_clock */ 934 /* Compensate for drifting sched_clock */
767 runqueue_t *this_rq = this_rq(); 935 struct rq *this_rq = this_rq();
768 now = (now - this_rq->timestamp_last_tick) 936 now = (now - this_rq->timestamp_last_tick)
769 + rq->timestamp_last_tick; 937 + rq->timestamp_last_tick;
770 } 938 }
@@ -803,9 +971,9 @@ static void activate_task(task_t *p, runqueue_t *rq, int local)
803/* 971/*
804 * deactivate_task - remove a task from the runqueue. 972 * deactivate_task - remove a task from the runqueue.
805 */ 973 */
806static void deactivate_task(struct task_struct *p, runqueue_t *rq) 974static void deactivate_task(struct task_struct *p, struct rq *rq)
807{ 975{
808 rq->nr_running--; 976 dec_nr_running(p, rq);
809 dequeue_task(p, p->array); 977 dequeue_task(p, p->array);
810 p->array = NULL; 978 p->array = NULL;
811} 979}
@@ -818,7 +986,12 @@ static void deactivate_task(struct task_struct *p, runqueue_t *rq)
818 * the target CPU. 986 * the target CPU.
819 */ 987 */
820#ifdef CONFIG_SMP 988#ifdef CONFIG_SMP
821static void resched_task(task_t *p) 989
990#ifndef tsk_is_polling
991#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
992#endif
993
994static void resched_task(struct task_struct *p)
822{ 995{
823 int cpu; 996 int cpu;
824 997
@@ -833,13 +1006,13 @@ static void resched_task(task_t *p)
833 if (cpu == smp_processor_id()) 1006 if (cpu == smp_processor_id())
834 return; 1007 return;
835 1008
836 /* NEED_RESCHED must be visible before we test POLLING_NRFLAG */ 1009 /* NEED_RESCHED must be visible before we test polling */
837 smp_mb(); 1010 smp_mb();
838 if (!test_tsk_thread_flag(p, TIF_POLLING_NRFLAG)) 1011 if (!tsk_is_polling(p))
839 smp_send_reschedule(cpu); 1012 smp_send_reschedule(cpu);
840} 1013}
841#else 1014#else
842static inline void resched_task(task_t *p) 1015static inline void resched_task(struct task_struct *p)
843{ 1016{
844 assert_spin_locked(&task_rq(p)->lock); 1017 assert_spin_locked(&task_rq(p)->lock);
845 set_tsk_need_resched(p); 1018 set_tsk_need_resched(p);
@@ -850,28 +1023,35 @@ static inline void resched_task(task_t *p)
850 * task_curr - is this task currently executing on a CPU? 1023 * task_curr - is this task currently executing on a CPU?
851 * @p: the task in question. 1024 * @p: the task in question.
852 */ 1025 */
853inline int task_curr(const task_t *p) 1026inline int task_curr(const struct task_struct *p)
854{ 1027{
855 return cpu_curr(task_cpu(p)) == p; 1028 return cpu_curr(task_cpu(p)) == p;
856} 1029}
857 1030
1031/* Used instead of source_load when we know the type == 0 */
1032unsigned long weighted_cpuload(const int cpu)
1033{
1034 return cpu_rq(cpu)->raw_weighted_load;
1035}
1036
858#ifdef CONFIG_SMP 1037#ifdef CONFIG_SMP
859typedef struct { 1038struct migration_req {
860 struct list_head list; 1039 struct list_head list;
861 1040
862 task_t *task; 1041 struct task_struct *task;
863 int dest_cpu; 1042 int dest_cpu;
864 1043
865 struct completion done; 1044 struct completion done;
866} migration_req_t; 1045};
867 1046
868/* 1047/*
869 * The task's runqueue lock must be held. 1048 * The task's runqueue lock must be held.
870 * Returns true if you have to wait for migration thread. 1049 * Returns true if you have to wait for migration thread.
871 */ 1050 */
872static int migrate_task(task_t *p, int dest_cpu, migration_req_t *req) 1051static int
1052migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
873{ 1053{
874 runqueue_t *rq = task_rq(p); 1054 struct rq *rq = task_rq(p);
875 1055
876 /* 1056 /*
877 * If the task is not on a runqueue (and not running), then 1057 * If the task is not on a runqueue (and not running), then
@@ -886,6 +1066,7 @@ static int migrate_task(task_t *p, int dest_cpu, migration_req_t *req)
886 req->task = p; 1066 req->task = p;
887 req->dest_cpu = dest_cpu; 1067 req->dest_cpu = dest_cpu;
888 list_add(&req->list, &rq->migration_queue); 1068 list_add(&req->list, &rq->migration_queue);
1069
889 return 1; 1070 return 1;
890} 1071}
891 1072
@@ -898,10 +1079,10 @@ static int migrate_task(task_t *p, int dest_cpu, migration_req_t *req)
898 * smp_call_function() if an IPI is sent by the same process we are 1079 * smp_call_function() if an IPI is sent by the same process we are
899 * waiting to become inactive. 1080 * waiting to become inactive.
900 */ 1081 */
901void wait_task_inactive(task_t *p) 1082void wait_task_inactive(struct task_struct *p)
902{ 1083{
903 unsigned long flags; 1084 unsigned long flags;
904 runqueue_t *rq; 1085 struct rq *rq;
905 int preempted; 1086 int preempted;
906 1087
907repeat: 1088repeat:
@@ -932,7 +1113,7 @@ repeat:
932 * to another CPU then no harm is done and the purpose has been 1113 * to another CPU then no harm is done and the purpose has been
933 * achieved as well. 1114 * achieved as well.
934 */ 1115 */
935void kick_process(task_t *p) 1116void kick_process(struct task_struct *p)
936{ 1117{
937 int cpu; 1118 int cpu;
938 1119
@@ -944,32 +1125,45 @@ void kick_process(task_t *p)
944} 1125}
945 1126
946/* 1127/*
947 * Return a low guess at the load of a migration-source cpu. 1128 * Return a low guess at the load of a migration-source cpu weighted
1129 * according to the scheduling class and "nice" value.
948 * 1130 *
949 * We want to under-estimate the load of migration sources, to 1131 * We want to under-estimate the load of migration sources, to
950 * balance conservatively. 1132 * balance conservatively.
951 */ 1133 */
952static inline unsigned long source_load(int cpu, int type) 1134static inline unsigned long source_load(int cpu, int type)
953{ 1135{
954 runqueue_t *rq = cpu_rq(cpu); 1136 struct rq *rq = cpu_rq(cpu);
955 unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE; 1137
956 if (type == 0) 1138 if (type == 0)
957 return load_now; 1139 return rq->raw_weighted_load;
958 1140
959 return min(rq->cpu_load[type-1], load_now); 1141 return min(rq->cpu_load[type-1], rq->raw_weighted_load);
960} 1142}
961 1143
962/* 1144/*
963 * Return a high guess at the load of a migration-target cpu 1145 * Return a high guess at the load of a migration-target cpu weighted
1146 * according to the scheduling class and "nice" value.
964 */ 1147 */
965static inline unsigned long target_load(int cpu, int type) 1148static inline unsigned long target_load(int cpu, int type)
966{ 1149{
967 runqueue_t *rq = cpu_rq(cpu); 1150 struct rq *rq = cpu_rq(cpu);
968 unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE; 1151
969 if (type == 0) 1152 if (type == 0)
970 return load_now; 1153 return rq->raw_weighted_load;
971 1154
972 return max(rq->cpu_load[type-1], load_now); 1155 return max(rq->cpu_load[type-1], rq->raw_weighted_load);
1156}
1157
1158/*
1159 * Return the average load per task on the cpu's run queue
1160 */
1161static inline unsigned long cpu_avg_load_per_task(int cpu)
1162{
1163 struct rq *rq = cpu_rq(cpu);
1164 unsigned long n = rq->nr_running;
1165
1166 return n ? rq->raw_weighted_load / n : SCHED_LOAD_SCALE;
973} 1167}
974 1168
975/* 1169/*
@@ -1042,7 +1236,7 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
1042 cpus_and(tmp, group->cpumask, p->cpus_allowed); 1236 cpus_and(tmp, group->cpumask, p->cpus_allowed);
1043 1237
1044 for_each_cpu_mask(i, tmp) { 1238 for_each_cpu_mask(i, tmp) {
1045 load = source_load(i, 0); 1239 load = weighted_cpuload(i);
1046 1240
1047 if (load < min_load || (load == min_load && i == this_cpu)) { 1241 if (load < min_load || (load == min_load && i == this_cpu)) {
1048 min_load = load; 1242 min_load = load;
@@ -1069,9 +1263,15 @@ static int sched_balance_self(int cpu, int flag)
1069 struct task_struct *t = current; 1263 struct task_struct *t = current;
1070 struct sched_domain *tmp, *sd = NULL; 1264 struct sched_domain *tmp, *sd = NULL;
1071 1265
1072 for_each_domain(cpu, tmp) 1266 for_each_domain(cpu, tmp) {
1267 /*
1268 * If power savings logic is enabled for a domain, stop there.
1269 */
1270 if (tmp->flags & SD_POWERSAVINGS_BALANCE)
1271 break;
1073 if (tmp->flags & flag) 1272 if (tmp->flags & flag)
1074 sd = tmp; 1273 sd = tmp;
1274 }
1075 1275
1076 while (sd) { 1276 while (sd) {
1077 cpumask_t span; 1277 cpumask_t span;
@@ -1116,7 +1316,7 @@ nextlevel:
1116 * Returns the CPU we should wake onto. 1316 * Returns the CPU we should wake onto.
1117 */ 1317 */
1118#if defined(ARCH_HAS_SCHED_WAKE_IDLE) 1318#if defined(ARCH_HAS_SCHED_WAKE_IDLE)
1119static int wake_idle(int cpu, task_t *p) 1319static int wake_idle(int cpu, struct task_struct *p)
1120{ 1320{
1121 cpumask_t tmp; 1321 cpumask_t tmp;
1122 struct sched_domain *sd; 1322 struct sched_domain *sd;
@@ -1139,7 +1339,7 @@ static int wake_idle(int cpu, task_t *p)
1139 return cpu; 1339 return cpu;
1140} 1340}
1141#else 1341#else
1142static inline int wake_idle(int cpu, task_t *p) 1342static inline int wake_idle(int cpu, struct task_struct *p)
1143{ 1343{
1144 return cpu; 1344 return cpu;
1145} 1345}
@@ -1159,15 +1359,15 @@ static inline int wake_idle(int cpu, task_t *p)
1159 * 1359 *
1160 * returns failure only if the task is already active. 1360 * returns failure only if the task is already active.
1161 */ 1361 */
1162static int try_to_wake_up(task_t *p, unsigned int state, int sync) 1362static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
1163{ 1363{
1164 int cpu, this_cpu, success = 0; 1364 int cpu, this_cpu, success = 0;
1165 unsigned long flags; 1365 unsigned long flags;
1166 long old_state; 1366 long old_state;
1167 runqueue_t *rq; 1367 struct rq *rq;
1168#ifdef CONFIG_SMP 1368#ifdef CONFIG_SMP
1169 unsigned long load, this_load;
1170 struct sched_domain *sd, *this_sd = NULL; 1369 struct sched_domain *sd, *this_sd = NULL;
1370 unsigned long load, this_load;
1171 int new_cpu; 1371 int new_cpu;
1172#endif 1372#endif
1173 1373
@@ -1221,17 +1421,19 @@ static int try_to_wake_up(task_t *p, unsigned int state, int sync)
1221 1421
1222 if (this_sd->flags & SD_WAKE_AFFINE) { 1422 if (this_sd->flags & SD_WAKE_AFFINE) {
1223 unsigned long tl = this_load; 1423 unsigned long tl = this_load;
1424 unsigned long tl_per_task = cpu_avg_load_per_task(this_cpu);
1425
1224 /* 1426 /*
1225 * If sync wakeup then subtract the (maximum possible) 1427 * If sync wakeup then subtract the (maximum possible)
1226 * effect of the currently running task from the load 1428 * effect of the currently running task from the load
1227 * of the current CPU: 1429 * of the current CPU:
1228 */ 1430 */
1229 if (sync) 1431 if (sync)
1230 tl -= SCHED_LOAD_SCALE; 1432 tl -= current->load_weight;
1231 1433
1232 if ((tl <= load && 1434 if ((tl <= load &&
1233 tl + target_load(cpu, idx) <= SCHED_LOAD_SCALE) || 1435 tl + target_load(cpu, idx) <= tl_per_task) ||
1234 100*(tl + SCHED_LOAD_SCALE) <= imbalance*load) { 1436 100*(tl + p->load_weight) <= imbalance*load) {
1235 /* 1437 /*
1236 * This domain has SD_WAKE_AFFINE and 1438 * This domain has SD_WAKE_AFFINE and
1237 * p is cache cold in this domain, and 1439 * p is cache cold in this domain, and
@@ -1315,15 +1517,14 @@ out:
1315 return success; 1517 return success;
1316} 1518}
1317 1519
1318int fastcall wake_up_process(task_t *p) 1520int fastcall wake_up_process(struct task_struct *p)
1319{ 1521{
1320 return try_to_wake_up(p, TASK_STOPPED | TASK_TRACED | 1522 return try_to_wake_up(p, TASK_STOPPED | TASK_TRACED |
1321 TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0); 1523 TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0);
1322} 1524}
1323
1324EXPORT_SYMBOL(wake_up_process); 1525EXPORT_SYMBOL(wake_up_process);
1325 1526
1326int fastcall wake_up_state(task_t *p, unsigned int state) 1527int fastcall wake_up_state(struct task_struct *p, unsigned int state)
1327{ 1528{
1328 return try_to_wake_up(p, state, 0); 1529 return try_to_wake_up(p, state, 0);
1329} 1530}
@@ -1332,7 +1533,7 @@ int fastcall wake_up_state(task_t *p, unsigned int state)
1332 * Perform scheduler related setup for a newly forked process p. 1533 * Perform scheduler related setup for a newly forked process p.
1333 * p is forked by current. 1534 * p is forked by current.
1334 */ 1535 */
1335void fastcall sched_fork(task_t *p, int clone_flags) 1536void fastcall sched_fork(struct task_struct *p, int clone_flags)
1336{ 1537{
1337 int cpu = get_cpu(); 1538 int cpu = get_cpu();
1338 1539
@@ -1348,10 +1549,17 @@ void fastcall sched_fork(task_t *p, int clone_flags)
1348 * event cannot wake it up and insert it on the runqueue either. 1549 * event cannot wake it up and insert it on the runqueue either.
1349 */ 1550 */
1350 p->state = TASK_RUNNING; 1551 p->state = TASK_RUNNING;
1552
1553 /*
1554 * Make sure we do not leak PI boosting priority to the child:
1555 */
1556 p->prio = current->normal_prio;
1557
1351 INIT_LIST_HEAD(&p->run_list); 1558 INIT_LIST_HEAD(&p->run_list);
1352 p->array = NULL; 1559 p->array = NULL;
1353#ifdef CONFIG_SCHEDSTATS 1560#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
1354 memset(&p->sched_info, 0, sizeof(p->sched_info)); 1561 if (unlikely(sched_info_on()))
1562 memset(&p->sched_info, 0, sizeof(p->sched_info));
1355#endif 1563#endif
1356#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) 1564#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
1357 p->oncpu = 0; 1565 p->oncpu = 0;
@@ -1394,11 +1602,11 @@ void fastcall sched_fork(task_t *p, int clone_flags)
1394 * that must be done for every newly created context, then puts the task 1602 * that must be done for every newly created context, then puts the task
1395 * on the runqueue and wakes it. 1603 * on the runqueue and wakes it.
1396 */ 1604 */
1397void fastcall wake_up_new_task(task_t *p, unsigned long clone_flags) 1605void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
1398{ 1606{
1607 struct rq *rq, *this_rq;
1399 unsigned long flags; 1608 unsigned long flags;
1400 int this_cpu, cpu; 1609 int this_cpu, cpu;
1401 runqueue_t *rq, *this_rq;
1402 1610
1403 rq = task_rq_lock(p, &flags); 1611 rq = task_rq_lock(p, &flags);
1404 BUG_ON(p->state != TASK_RUNNING); 1612 BUG_ON(p->state != TASK_RUNNING);
@@ -1427,10 +1635,11 @@ void fastcall wake_up_new_task(task_t *p, unsigned long clone_flags)
1427 __activate_task(p, rq); 1635 __activate_task(p, rq);
1428 else { 1636 else {
1429 p->prio = current->prio; 1637 p->prio = current->prio;
1638 p->normal_prio = current->normal_prio;
1430 list_add_tail(&p->run_list, &current->run_list); 1639 list_add_tail(&p->run_list, &current->run_list);
1431 p->array = current->array; 1640 p->array = current->array;
1432 p->array->nr_active++; 1641 p->array->nr_active++;
1433 rq->nr_running++; 1642 inc_nr_running(p, rq);
1434 } 1643 }
1435 set_need_resched(); 1644 set_need_resched();
1436 } else 1645 } else
@@ -1477,10 +1686,10 @@ void fastcall wake_up_new_task(task_t *p, unsigned long clone_flags)
1477 * artificially, because any timeslice recovered here 1686 * artificially, because any timeslice recovered here
1478 * was given away by the parent in the first place.) 1687 * was given away by the parent in the first place.)
1479 */ 1688 */
1480void fastcall sched_exit(task_t *p) 1689void fastcall sched_exit(struct task_struct *p)
1481{ 1690{
1482 unsigned long flags; 1691 unsigned long flags;
1483 runqueue_t *rq; 1692 struct rq *rq;
1484 1693
1485 /* 1694 /*
1486 * If the child was a (relative-) CPU hog then decrease 1695 * If the child was a (relative-) CPU hog then decrease
@@ -1511,7 +1720,7 @@ void fastcall sched_exit(task_t *p)
1511 * prepare_task_switch sets up locking and calls architecture specific 1720 * prepare_task_switch sets up locking and calls architecture specific
1512 * hooks. 1721 * hooks.
1513 */ 1722 */
1514static inline void prepare_task_switch(runqueue_t *rq, task_t *next) 1723static inline void prepare_task_switch(struct rq *rq, struct task_struct *next)
1515{ 1724{
1516 prepare_lock_switch(rq, next); 1725 prepare_lock_switch(rq, next);
1517 prepare_arch_switch(next); 1726 prepare_arch_switch(next);
@@ -1532,7 +1741,7 @@ static inline void prepare_task_switch(runqueue_t *rq, task_t *next)
1532 * with the lock held can cause deadlocks; see schedule() for 1741 * with the lock held can cause deadlocks; see schedule() for
1533 * details.) 1742 * details.)
1534 */ 1743 */
1535static inline void finish_task_switch(runqueue_t *rq, task_t *prev) 1744static inline void finish_task_switch(struct rq *rq, struct task_struct *prev)
1536 __releases(rq->lock) 1745 __releases(rq->lock)
1537{ 1746{
1538 struct mm_struct *mm = rq->prev_mm; 1747 struct mm_struct *mm = rq->prev_mm;
@@ -1570,10 +1779,11 @@ static inline void finish_task_switch(runqueue_t *rq, task_t *prev)
1570 * schedule_tail - first thing a freshly forked thread must call. 1779 * schedule_tail - first thing a freshly forked thread must call.
1571 * @prev: the thread we just switched away from. 1780 * @prev: the thread we just switched away from.
1572 */ 1781 */
1573asmlinkage void schedule_tail(task_t *prev) 1782asmlinkage void schedule_tail(struct task_struct *prev)
1574 __releases(rq->lock) 1783 __releases(rq->lock)
1575{ 1784{
1576 runqueue_t *rq = this_rq(); 1785 struct rq *rq = this_rq();
1786
1577 finish_task_switch(rq, prev); 1787 finish_task_switch(rq, prev);
1578#ifdef __ARCH_WANT_UNLOCKED_CTXSW 1788#ifdef __ARCH_WANT_UNLOCKED_CTXSW
1579 /* In this case, finish_task_switch does not reenable preemption */ 1789 /* In this case, finish_task_switch does not reenable preemption */
@@ -1587,8 +1797,9 @@ asmlinkage void schedule_tail(task_t *prev)
1587 * context_switch - switch to the new MM and the new 1797 * context_switch - switch to the new MM and the new
1588 * thread's register state. 1798 * thread's register state.
1589 */ 1799 */
1590static inline 1800static inline struct task_struct *
1591task_t * context_switch(runqueue_t *rq, task_t *prev, task_t *next) 1801context_switch(struct rq *rq, struct task_struct *prev,
1802 struct task_struct *next)
1592{ 1803{
1593 struct mm_struct *mm = next->mm; 1804 struct mm_struct *mm = next->mm;
1594 struct mm_struct *oldmm = prev->active_mm; 1805 struct mm_struct *oldmm = prev->active_mm;
@@ -1605,6 +1816,15 @@ task_t * context_switch(runqueue_t *rq, task_t *prev, task_t *next)
1605 WARN_ON(rq->prev_mm); 1816 WARN_ON(rq->prev_mm);
1606 rq->prev_mm = oldmm; 1817 rq->prev_mm = oldmm;
1607 } 1818 }
1819 /*
1820 * The runqueue lock will be released by the next
1821 * task (which is an invalid locking op but in the case
1822 * of the scheduler it's an obvious special-case), so we
1823 * do an early lockdep release here:
1824 */
1825#ifndef __ARCH_WANT_UNLOCKED_CTXSW
1826 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
1827#endif
1608 1828
1609 /* Here we just switch the register state and the stack. */ 1829 /* Here we just switch the register state and the stack. */
1610 switch_to(prev, next, prev); 1830 switch_to(prev, next, prev);
@@ -1648,7 +1868,8 @@ unsigned long nr_uninterruptible(void)
1648 1868
1649unsigned long long nr_context_switches(void) 1869unsigned long long nr_context_switches(void)
1650{ 1870{
1651 unsigned long long i, sum = 0; 1871 int i;
1872 unsigned long long sum = 0;
1652 1873
1653 for_each_possible_cpu(i) 1874 for_each_possible_cpu(i)
1654 sum += cpu_rq(i)->nr_switches; 1875 sum += cpu_rq(i)->nr_switches;
@@ -1684,15 +1905,21 @@ unsigned long nr_active(void)
1684#ifdef CONFIG_SMP 1905#ifdef CONFIG_SMP
1685 1906
1686/* 1907/*
1908 * Is this task likely cache-hot:
1909 */
1910static inline int
1911task_hot(struct task_struct *p, unsigned long long now, struct sched_domain *sd)
1912{
1913 return (long long)(now - p->last_ran) < (long long)sd->cache_hot_time;
1914}
1915
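A minimal user-space sketch of the cache-hot test added above (not part of the patch; task_hot_demo() and the timing values are invented). The delta is compared as a signed value so a last_ran timestamp slightly ahead of now, from small cross-CPU clock skew, does not become a huge unsigned difference and wrongly mark a recently-run task cold:

#include <stdio.h>

static int task_hot_demo(unsigned long long now, unsigned long long last_ran,
			 unsigned long long cache_hot_time)
{
	/* "hot" if the task ran within cache_hot_time nanoseconds of now */
	return (long long)(now - last_ran) < (long long)cache_hot_time;
}

int main(void)
{
	unsigned long long hot_time = 10000000ULL;	/* 10ms, in ns */

	printf("ran 2ms ago : %s\n",
	       task_hot_demo(52000000ULL, 50000000ULL, hot_time) ? "hot" : "cold");
	printf("ran 50ms ago: %s\n",
	       task_hot_demo(100000000ULL, 50000000ULL, hot_time) ? "hot" : "cold");
	return 0;
}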
1916/*
1687 * double_rq_lock - safely lock two runqueues 1917 * double_rq_lock - safely lock two runqueues
1688 * 1918 *
1689 * We must take them in cpu order to match code in
1690 * dependent_sleeper and wake_dependent_sleeper.
1691 *
1692 * Note this does not disable interrupts like task_rq_lock, 1919 * Note this does not disable interrupts like task_rq_lock,
1693 * you need to do so manually before calling. 1920 * you need to do so manually before calling.
1694 */ 1921 */
1695static void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2) 1922static void double_rq_lock(struct rq *rq1, struct rq *rq2)
1696 __acquires(rq1->lock) 1923 __acquires(rq1->lock)
1697 __acquires(rq2->lock) 1924 __acquires(rq2->lock)
1698{ 1925{
@@ -1700,7 +1927,7 @@ static void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2)
1700 spin_lock(&rq1->lock); 1927 spin_lock(&rq1->lock);
1701 __acquire(rq2->lock); /* Fake it out ;) */ 1928 __acquire(rq2->lock); /* Fake it out ;) */
1702 } else { 1929 } else {
1703 if (rq1->cpu < rq2->cpu) { 1930 if (rq1 < rq2) {
1704 spin_lock(&rq1->lock); 1931 spin_lock(&rq1->lock);
1705 spin_lock(&rq2->lock); 1932 spin_lock(&rq2->lock);
1706 } else { 1933 } else {
@@ -1716,7 +1943,7 @@ static void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2)
1716 * Note this does not restore interrupts like task_rq_unlock, 1943 * Note this does not restore interrupts like task_rq_unlock,
1717 * you need to do so manually after calling. 1944 * you need to do so manually after calling.
1718 */ 1945 */
1719static void double_rq_unlock(runqueue_t *rq1, runqueue_t *rq2) 1946static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
1720 __releases(rq1->lock) 1947 __releases(rq1->lock)
1721 __releases(rq2->lock) 1948 __releases(rq2->lock)
1722{ 1949{
@@ -1730,13 +1957,13 @@ static void double_rq_unlock(runqueue_t *rq1, runqueue_t *rq2)
1730/* 1957/*
1731 * double_lock_balance - lock the busiest runqueue, this_rq is locked already. 1958 * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
1732 */ 1959 */
1733static void double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest) 1960static void double_lock_balance(struct rq *this_rq, struct rq *busiest)
1734 __releases(this_rq->lock) 1961 __releases(this_rq->lock)
1735 __acquires(busiest->lock) 1962 __acquires(busiest->lock)
1736 __acquires(this_rq->lock) 1963 __acquires(this_rq->lock)
1737{ 1964{
1738 if (unlikely(!spin_trylock(&busiest->lock))) { 1965 if (unlikely(!spin_trylock(&busiest->lock))) {
1739 if (busiest->cpu < this_rq->cpu) { 1966 if (busiest < this_rq) {
1740 spin_unlock(&this_rq->lock); 1967 spin_unlock(&this_rq->lock);
1741 spin_lock(&busiest->lock); 1968 spin_lock(&busiest->lock);
1742 spin_lock(&this_rq->lock); 1969 spin_lock(&this_rq->lock);
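The two hunks above switch double_rq_lock() and double_lock_balance() from ordering the pair of locks by ->cpu to ordering them by runqueue address. A small pthread sketch of the same deadlock-avoidance rule (double_lock()/double_unlock() are invented names, not kernel API); any total order works, and the pointer value is one that is always available:

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

/*
 * Take the lock with the lower address first.  Two threads calling
 * double_lock(&a, &b) and double_lock(&b, &a) then acquire the pair
 * in the same global order and cannot deadlock.
 */
static void double_lock(pthread_mutex_t *a, pthread_mutex_t *b)
{
	if (a == b) {
		pthread_mutex_lock(a);
		return;
	}
	if ((uintptr_t)a < (uintptr_t)b) {
		pthread_mutex_lock(a);
		pthread_mutex_lock(b);
	} else {
		pthread_mutex_lock(b);
		pthread_mutex_lock(a);
	}
}

static void double_unlock(pthread_mutex_t *a, pthread_mutex_t *b)
{
	pthread_mutex_unlock(a);
	if (a != b)
		pthread_mutex_unlock(b);
}

int main(void)
{
	pthread_mutex_t m1 = PTHREAD_MUTEX_INITIALIZER;
	pthread_mutex_t m2 = PTHREAD_MUTEX_INITIALIZER;

	double_lock(&m1, &m2);		/* same order as double_lock(&m2, &m1) */
	double_unlock(&m1, &m2);
	printf("locked both mutexes in address order\n");
	return 0;
}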
@@ -1751,11 +1978,11 @@ static void double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest)
1751 * allow dest_cpu, which will force the cpu onto dest_cpu. Then 1978 * allow dest_cpu, which will force the cpu onto dest_cpu. Then
1752 * the cpu_allowed mask is restored. 1979 * the cpu_allowed mask is restored.
1753 */ 1980 */
1754static void sched_migrate_task(task_t *p, int dest_cpu) 1981static void sched_migrate_task(struct task_struct *p, int dest_cpu)
1755{ 1982{
1756 migration_req_t req; 1983 struct migration_req req;
1757 runqueue_t *rq;
1758 unsigned long flags; 1984 unsigned long flags;
1985 struct rq *rq;
1759 1986
1760 rq = task_rq_lock(p, &flags); 1987 rq = task_rq_lock(p, &flags);
1761 if (!cpu_isset(dest_cpu, p->cpus_allowed) 1988 if (!cpu_isset(dest_cpu, p->cpus_allowed)
@@ -1766,11 +1993,13 @@ static void sched_migrate_task(task_t *p, int dest_cpu)
1766 if (migrate_task(p, dest_cpu, &req)) { 1993 if (migrate_task(p, dest_cpu, &req)) {
1767 /* Need to wait for migration thread (might exit: take ref). */ 1994 /* Need to wait for migration thread (might exit: take ref). */
1768 struct task_struct *mt = rq->migration_thread; 1995 struct task_struct *mt = rq->migration_thread;
1996
1769 get_task_struct(mt); 1997 get_task_struct(mt);
1770 task_rq_unlock(rq, &flags); 1998 task_rq_unlock(rq, &flags);
1771 wake_up_process(mt); 1999 wake_up_process(mt);
1772 put_task_struct(mt); 2000 put_task_struct(mt);
1773 wait_for_completion(&req.done); 2001 wait_for_completion(&req.done);
2002
1774 return; 2003 return;
1775 } 2004 }
1776out: 2005out:
@@ -1794,14 +2023,14 @@ void sched_exec(void)
1794 * pull_task - move a task from a remote runqueue to the local runqueue. 2023 * pull_task - move a task from a remote runqueue to the local runqueue.
1795 * Both runqueues must be locked. 2024 * Both runqueues must be locked.
1796 */ 2025 */
1797static 2026static void pull_task(struct rq *src_rq, struct prio_array *src_array,
1798void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p, 2027 struct task_struct *p, struct rq *this_rq,
1799 runqueue_t *this_rq, prio_array_t *this_array, int this_cpu) 2028 struct prio_array *this_array, int this_cpu)
1800{ 2029{
1801 dequeue_task(p, src_array); 2030 dequeue_task(p, src_array);
1802 src_rq->nr_running--; 2031 dec_nr_running(p, src_rq);
1803 set_task_cpu(p, this_cpu); 2032 set_task_cpu(p, this_cpu);
1804 this_rq->nr_running++; 2033 inc_nr_running(p, this_rq);
1805 enqueue_task(p, this_array); 2034 enqueue_task(p, this_array);
1806 p->timestamp = (p->timestamp - src_rq->timestamp_last_tick) 2035 p->timestamp = (p->timestamp - src_rq->timestamp_last_tick)
1807 + this_rq->timestamp_last_tick; 2036 + this_rq->timestamp_last_tick;
@@ -1817,7 +2046,7 @@ void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p,
1817 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? 2046 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
1818 */ 2047 */
1819static 2048static
1820int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu, 2049int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
1821 struct sched_domain *sd, enum idle_type idle, 2050 struct sched_domain *sd, enum idle_type idle,
1822 int *all_pinned) 2051 int *all_pinned)
1823{ 2052{
@@ -1848,26 +2077,42 @@ int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu,
1848 return 1; 2077 return 1;
1849} 2078}
1850 2079
2080#define rq_best_prio(rq) min((rq)->curr->prio, (rq)->best_expired_prio)
2081
1851/* 2082/*
1852 * move_tasks tries to move up to max_nr_move tasks from busiest to this_rq, 2083 * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted
1853 * as part of a balancing operation within "domain". Returns the number of 2084 * load from busiest to this_rq, as part of a balancing operation within
1854 * tasks moved. 2085 * "domain". Returns the number of tasks moved.
1855 * 2086 *
1856 * Called with both runqueues locked. 2087 * Called with both runqueues locked.
1857 */ 2088 */
1858static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest, 2089static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
1859 unsigned long max_nr_move, struct sched_domain *sd, 2090 unsigned long max_nr_move, unsigned long max_load_move,
1860 enum idle_type idle, int *all_pinned) 2091 struct sched_domain *sd, enum idle_type idle,
2092 int *all_pinned)
1861{ 2093{
1862 prio_array_t *array, *dst_array; 2094 int idx, pulled = 0, pinned = 0, this_best_prio, best_prio,
2095 best_prio_seen, skip_for_load;
2096 struct prio_array *array, *dst_array;
1863 struct list_head *head, *curr; 2097 struct list_head *head, *curr;
1864 int idx, pulled = 0, pinned = 0; 2098 struct task_struct *tmp;
1865 task_t *tmp; 2099 long rem_load_move;
1866 2100
1867 if (max_nr_move == 0) 2101 if (max_nr_move == 0 || max_load_move == 0)
1868 goto out; 2102 goto out;
1869 2103
2104 rem_load_move = max_load_move;
1870 pinned = 1; 2105 pinned = 1;
2106 this_best_prio = rq_best_prio(this_rq);
2107 best_prio = rq_best_prio(busiest);
2108 /*
2109 * Enable handling of the case where there is more than one task
2110 * with the best priority. If the current running task is one
2111 * of those with prio==best_prio we know it won't be moved
2112 * and therefore it's safe to override the skip (based on load) of
2113 * any task we find with that prio.
2114 */
2115 best_prio_seen = best_prio == busiest->curr->prio;
1871 2116
1872 /* 2117 /*
1873 * We first consider expired tasks. Those will likely not be 2118 * We first consider expired tasks. Those will likely not be
@@ -1903,11 +2148,22 @@ skip_bitmap:
1903 head = array->queue + idx; 2148 head = array->queue + idx;
1904 curr = head->prev; 2149 curr = head->prev;
1905skip_queue: 2150skip_queue:
1906 tmp = list_entry(curr, task_t, run_list); 2151 tmp = list_entry(curr, struct task_struct, run_list);
1907 2152
1908 curr = curr->prev; 2153 curr = curr->prev;
1909 2154
1910 if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle, &pinned)) { 2155 /*
 2156 * To help distribute high priority tasks across CPUs we don't
2157 * skip a task if it will be the highest priority task (i.e. smallest
2158 * prio value) on its new queue regardless of its load weight
2159 */
2160 skip_for_load = tmp->load_weight > rem_load_move;
2161 if (skip_for_load && idx < this_best_prio)
2162 skip_for_load = !best_prio_seen && idx == best_prio;
2163 if (skip_for_load ||
2164 !can_migrate_task(tmp, busiest, this_cpu, sd, idle, &pinned)) {
2165
2166 best_prio_seen |= idx == best_prio;
1911 if (curr != head) 2167 if (curr != head)
1912 goto skip_queue; 2168 goto skip_queue;
1913 idx++; 2169 idx++;
@@ -1921,9 +2177,15 @@ skip_queue:
1921 2177
1922 pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu); 2178 pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu);
1923 pulled++; 2179 pulled++;
2180 rem_load_move -= tmp->load_weight;
1924 2181
1925 /* We only want to steal up to the prescribed number of tasks. */ 2182 /*
1926 if (pulled < max_nr_move) { 2183 * We only want to steal up to the prescribed number of tasks
2184 * and the prescribed amount of weighted load.
2185 */
2186 if (pulled < max_nr_move && rem_load_move > 0) {
2187 if (idx < this_best_prio)
2188 this_best_prio = idx;
1927 if (curr != head) 2189 if (curr != head)
1928 goto skip_queue; 2190 goto skip_queue;
1929 idx++; 2191 idx++;
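move_tasks() now enforces two budgets at once: a task count (max_nr_move) and a weighted load total (max_load_move, tracked in rem_load_move). A toy user-space version of that bookkeeping with invented weights (struct demo_task and pull_up_to() are not kernel code, and the best_prio override that lets an over-weight top-priority task through is left out):

#include <stdio.h>

struct demo_task { const char *name; unsigned long load_weight; };

static int pull_up_to(struct demo_task *tasks, int nr,
		      int max_nr_move, long max_load_move)
{
	long rem_load_move = max_load_move;
	int i, pulled = 0;

	for (i = 0; i < nr; i++) {
		if (pulled >= max_nr_move || rem_load_move <= 0)
			break;
		if (tasks[i].load_weight > (unsigned long)rem_load_move)
			continue;	/* heavier than what's left: skip */
		printf("pulling %s (weight %lu)\n",
		       tasks[i].name, tasks[i].load_weight);
		rem_load_move -= tasks[i].load_weight;
		pulled++;
	}
	return pulled;
}

int main(void)
{
	struct demo_task t[] = {
		{ "A", 1024 }, { "B", 335 }, { "C", 3121 }, { "D", 1024 },
	};

	printf("moved %d tasks\n", pull_up_to(t, 4, 3, 2048));
	return 0;
}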
@@ -1944,8 +2206,8 @@ out:
1944 2206
1945/* 2207/*
1946 * find_busiest_group finds and returns the busiest CPU group within the 2208 * find_busiest_group finds and returns the busiest CPU group within the
1947 * domain. It calculates and returns the number of tasks which should be 2209 * domain. It calculates and returns the amount of weighted load which
1948 * moved to restore balance via the imbalance parameter. 2210 * should be moved to restore balance via the imbalance parameter.
1949 */ 2211 */
1950static struct sched_group * 2212static struct sched_group *
1951find_busiest_group(struct sched_domain *sd, int this_cpu, 2213find_busiest_group(struct sched_domain *sd, int this_cpu,
@@ -1954,9 +2216,19 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
1954 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; 2216 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
1955 unsigned long max_load, avg_load, total_load, this_load, total_pwr; 2217 unsigned long max_load, avg_load, total_load, this_load, total_pwr;
1956 unsigned long max_pull; 2218 unsigned long max_pull;
2219 unsigned long busiest_load_per_task, busiest_nr_running;
2220 unsigned long this_load_per_task, this_nr_running;
1957 int load_idx; 2221 int load_idx;
2222#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
2223 int power_savings_balance = 1;
2224 unsigned long leader_nr_running = 0, min_load_per_task = 0;
2225 unsigned long min_nr_running = ULONG_MAX;
2226 struct sched_group *group_min = NULL, *group_leader = NULL;
2227#endif
1958 2228
1959 max_load = this_load = total_load = total_pwr = 0; 2229 max_load = this_load = total_load = total_pwr = 0;
2230 busiest_load_per_task = busiest_nr_running = 0;
2231 this_load_per_task = this_nr_running = 0;
1960 if (idle == NOT_IDLE) 2232 if (idle == NOT_IDLE)
1961 load_idx = sd->busy_idx; 2233 load_idx = sd->busy_idx;
1962 else if (idle == NEWLY_IDLE) 2234 else if (idle == NEWLY_IDLE)
@@ -1965,16 +2237,19 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
1965 load_idx = sd->idle_idx; 2237 load_idx = sd->idle_idx;
1966 2238
1967 do { 2239 do {
1968 unsigned long load; 2240 unsigned long load, group_capacity;
1969 int local_group; 2241 int local_group;
1970 int i; 2242 int i;
2243 unsigned long sum_nr_running, sum_weighted_load;
1971 2244
1972 local_group = cpu_isset(this_cpu, group->cpumask); 2245 local_group = cpu_isset(this_cpu, group->cpumask);
1973 2246
1974 /* Tally up the load of all CPUs in the group */ 2247 /* Tally up the load of all CPUs in the group */
1975 avg_load = 0; 2248 sum_weighted_load = sum_nr_running = avg_load = 0;
1976 2249
1977 for_each_cpu_mask(i, group->cpumask) { 2250 for_each_cpu_mask(i, group->cpumask) {
2251 struct rq *rq = cpu_rq(i);
2252
1978 if (*sd_idle && !idle_cpu(i)) 2253 if (*sd_idle && !idle_cpu(i))
1979 *sd_idle = 0; 2254 *sd_idle = 0;
1980 2255
@@ -1985,6 +2260,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
1985 load = source_load(i, load_idx); 2260 load = source_load(i, load_idx);
1986 2261
1987 avg_load += load; 2262 avg_load += load;
2263 sum_nr_running += rq->nr_running;
2264 sum_weighted_load += rq->raw_weighted_load;
1988 } 2265 }
1989 2266
1990 total_load += avg_load; 2267 total_load += avg_load;
@@ -1993,17 +2270,80 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
1993 /* Adjust by relative CPU power of the group */ 2270 /* Adjust by relative CPU power of the group */
1994 avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power; 2271 avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
1995 2272
2273 group_capacity = group->cpu_power / SCHED_LOAD_SCALE;
2274
1996 if (local_group) { 2275 if (local_group) {
1997 this_load = avg_load; 2276 this_load = avg_load;
1998 this = group; 2277 this = group;
1999 } else if (avg_load > max_load) { 2278 this_nr_running = sum_nr_running;
2279 this_load_per_task = sum_weighted_load;
2280 } else if (avg_load > max_load &&
2281 sum_nr_running > group_capacity) {
2000 max_load = avg_load; 2282 max_load = avg_load;
2001 busiest = group; 2283 busiest = group;
2284 busiest_nr_running = sum_nr_running;
2285 busiest_load_per_task = sum_weighted_load;
2002 } 2286 }
2287
2288#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
2289 /*
2290 * Busy processors will not participate in power savings
2291 * balance.
2292 */
2293 if (idle == NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
2294 goto group_next;
2295
2296 /*
2297 * If the local group is idle or completely loaded
2298 * no need to do power savings balance at this domain
2299 */
2300 if (local_group && (this_nr_running >= group_capacity ||
2301 !this_nr_running))
2302 power_savings_balance = 0;
2303
2304 /*
2305 * If a group is already running at full capacity or idle,
2306 * don't include that group in power savings calculations
2307 */
2308 if (!power_savings_balance || sum_nr_running >= group_capacity
2309 || !sum_nr_running)
2310 goto group_next;
2311
2312 /*
2313 * Calculate the group which has the least non-idle load.
2314 * This is the group from where we need to pick up the load
2315 * for saving power
2316 */
2317 if ((sum_nr_running < min_nr_running) ||
2318 (sum_nr_running == min_nr_running &&
2319 first_cpu(group->cpumask) <
2320 first_cpu(group_min->cpumask))) {
2321 group_min = group;
2322 min_nr_running = sum_nr_running;
2323 min_load_per_task = sum_weighted_load /
2324 sum_nr_running;
2325 }
2326
2327 /*
 2328 * Calculate the group which is nearly at its
 2329 * capacity but still has some space to pick up some load
 2330 * from other groups and save more power
2331 */
2332 if (sum_nr_running <= group_capacity - 1) {
2333 if (sum_nr_running > leader_nr_running ||
2334 (sum_nr_running == leader_nr_running &&
2335 first_cpu(group->cpumask) >
2336 first_cpu(group_leader->cpumask))) {
2337 group_leader = group;
2338 leader_nr_running = sum_nr_running;
2339 }
2340 }
2341group_next:
2342#endif
2003 group = group->next; 2343 group = group->next;
2004 } while (group != sd->groups); 2344 } while (group != sd->groups);
2005 2345
2006 if (!busiest || this_load >= max_load || max_load <= SCHED_LOAD_SCALE) 2346 if (!busiest || this_load >= max_load || busiest_nr_running == 0)
2007 goto out_balanced; 2347 goto out_balanced;
2008 2348
2009 avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr; 2349 avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
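Under CONFIG_SCHED_MC/CONFIG_SCHED_SMT the loop above additionally tracks group_min (the least-loaded group that is neither idle nor full) and group_leader (the busiest group that still has spare capacity), so load can be packed onto the leader and the donor package left to idle. A much-simplified standalone sketch of that selection (struct demo_group and the numbers are invented; the real code also breaks ties with first_cpu()):

#include <stdio.h>

struct demo_group { int id; unsigned long nr_running, capacity; };

int main(void)
{
	struct demo_group g[] = {
		{ 0, 1, 4 },	/* lightly loaded: candidate donor         */
		{ 1, 3, 4 },	/* nearly full, has room: candidate target */
		{ 2, 0, 4 },	/* idle: ignored                           */
		{ 3, 4, 4 },	/* full: ignored                           */
	};
	struct demo_group *group_min = NULL, *group_leader = NULL;
	unsigned int i;

	for (i = 0; i < sizeof(g) / sizeof(g[0]); i++) {
		/* idle or fully loaded groups take no part in power balance */
		if (!g[i].nr_running || g[i].nr_running >= g[i].capacity)
			continue;
		if (!group_min || g[i].nr_running < group_min->nr_running)
			group_min = &g[i];
		if (g[i].nr_running <= g[i].capacity - 1 &&
		    (!group_leader ||
		     g[i].nr_running > group_leader->nr_running))
			group_leader = &g[i];
	}
	if (group_min && group_leader && group_min != group_leader)
		printf("pack group %d's tasks onto group %d\n",
		       group_min->id, group_leader->id);
	return 0;
}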
@@ -2012,6 +2352,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2012 100*max_load <= sd->imbalance_pct*this_load) 2352 100*max_load <= sd->imbalance_pct*this_load)
2013 goto out_balanced; 2353 goto out_balanced;
2014 2354
2355 busiest_load_per_task /= busiest_nr_running;
2015 /* 2356 /*
2016 * We're trying to get all the cpus to the average_load, so we don't 2357 * We're trying to get all the cpus to the average_load, so we don't
2017 * want to push ourselves above the average load, nor do we wish to 2358 * want to push ourselves above the average load, nor do we wish to
@@ -2023,21 +2364,49 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2023 * by pulling tasks to us. Be careful of negative numbers as they'll 2364 * by pulling tasks to us. Be careful of negative numbers as they'll
2024 * appear as very large values with unsigned longs. 2365 * appear as very large values with unsigned longs.
2025 */ 2366 */
2367 if (max_load <= busiest_load_per_task)
2368 goto out_balanced;
2369
2370 /*
2371 * In the presence of smp nice balancing, certain scenarios can have
2372 * max load less than avg load(as we skip the groups at or below
2373 * its cpu_power, while calculating max_load..)
2374 */
2375 if (max_load < avg_load) {
2376 *imbalance = 0;
2377 goto small_imbalance;
2378 }
2026 2379
2027 /* Don't want to pull so many tasks that a group would go idle */ 2380 /* Don't want to pull so many tasks that a group would go idle */
2028 max_pull = min(max_load - avg_load, max_load - SCHED_LOAD_SCALE); 2381 max_pull = min(max_load - avg_load, max_load - busiest_load_per_task);
2029 2382
2030 /* How much load to actually move to equalise the imbalance */ 2383 /* How much load to actually move to equalise the imbalance */
2031 *imbalance = min(max_pull * busiest->cpu_power, 2384 *imbalance = min(max_pull * busiest->cpu_power,
2032 (avg_load - this_load) * this->cpu_power) 2385 (avg_load - this_load) * this->cpu_power)
2033 / SCHED_LOAD_SCALE; 2386 / SCHED_LOAD_SCALE;
2034 2387
2035 if (*imbalance < SCHED_LOAD_SCALE) { 2388 /*
2036 unsigned long pwr_now = 0, pwr_move = 0; 2389 * if *imbalance is less than the average load per runnable task
2037 unsigned long tmp; 2390 * there is no gaurantee that any tasks will be moved so we'll have
2391 * a think about bumping its value to force at least one task to be
2392 * moved
2393 */
2394 if (*imbalance < busiest_load_per_task) {
2395 unsigned long tmp, pwr_now, pwr_move;
2396 unsigned int imbn;
2397
2398small_imbalance:
2399 pwr_move = pwr_now = 0;
2400 imbn = 2;
2401 if (this_nr_running) {
2402 this_load_per_task /= this_nr_running;
2403 if (busiest_load_per_task > this_load_per_task)
2404 imbn = 1;
2405 } else
2406 this_load_per_task = SCHED_LOAD_SCALE;
2038 2407
2039 if (max_load - this_load >= SCHED_LOAD_SCALE*2) { 2408 if (max_load - this_load >= busiest_load_per_task * imbn) {
2040 *imbalance = 1; 2409 *imbalance = busiest_load_per_task;
2041 return busiest; 2410 return busiest;
2042 } 2411 }
2043 2412
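A worked example of the new imbalance calculation, reduced to its common path: the weighted load to move is capped both by how far the busiest group sits above the domain average and by how far the local group sits below it, and a result smaller than one average task is bumped so that at least one task can actually migrate. DEMO_LOAD_SCALE and all loads are invented, max_pull is not additionally clamped against busiest_load_per_task as the real code does, and the pwr_now/pwr_move throughput check is left out:

#include <stdio.h>

#define DEMO_LOAD_SCALE 128UL		/* stand-in for SCHED_LOAD_SCALE */

int main(void)
{
	unsigned long max_load  = 288;	/* busiest group, scaled load */
	unsigned long this_load = 224;	/* local group, scaled load   */
	unsigned long avg_load  = 256;	/* domain-wide average        */
	unsigned long cpu_power = DEMO_LOAD_SCALE;		/* both groups */
	unsigned long busiest_load_per_task = DEMO_LOAD_SCALE;	/* ~one task   */

	/* don't pull the busiest group below the average ... */
	unsigned long by_them = (max_load - avg_load) * cpu_power;
	/* ... and don't push ourselves above it */
	unsigned long by_us   = (avg_load - this_load) * cpu_power;
	unsigned long imbalance =
		(by_them < by_us ? by_them : by_us) / DEMO_LOAD_SCALE;

	printf("raw imbalance     : %lu\n", imbalance);
	if (imbalance < busiest_load_per_task)
		imbalance = busiest_load_per_task;	/* force >= one task */
	printf("imbalance to move : %lu\n", imbalance);
	return 0;
}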
@@ -2047,39 +2416,47 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2047 * moving them. 2416 * moving them.
2048 */ 2417 */
2049 2418
2050 pwr_now += busiest->cpu_power*min(SCHED_LOAD_SCALE, max_load); 2419 pwr_now += busiest->cpu_power *
2051 pwr_now += this->cpu_power*min(SCHED_LOAD_SCALE, this_load); 2420 min(busiest_load_per_task, max_load);
2421 pwr_now += this->cpu_power *
2422 min(this_load_per_task, this_load);
2052 pwr_now /= SCHED_LOAD_SCALE; 2423 pwr_now /= SCHED_LOAD_SCALE;
2053 2424
2054 /* Amount of load we'd subtract */ 2425 /* Amount of load we'd subtract */
2055 tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/busiest->cpu_power; 2426 tmp = busiest_load_per_task*SCHED_LOAD_SCALE/busiest->cpu_power;
2056 if (max_load > tmp) 2427 if (max_load > tmp)
2057 pwr_move += busiest->cpu_power*min(SCHED_LOAD_SCALE, 2428 pwr_move += busiest->cpu_power *
2058 max_load - tmp); 2429 min(busiest_load_per_task, max_load - tmp);
2059 2430
2060 /* Amount of load we'd add */ 2431 /* Amount of load we'd add */
2061 if (max_load*busiest->cpu_power < 2432 if (max_load*busiest->cpu_power <
2062 SCHED_LOAD_SCALE*SCHED_LOAD_SCALE) 2433 busiest_load_per_task*SCHED_LOAD_SCALE)
2063 tmp = max_load*busiest->cpu_power/this->cpu_power; 2434 tmp = max_load*busiest->cpu_power/this->cpu_power;
2064 else 2435 else
2065 tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/this->cpu_power; 2436 tmp = busiest_load_per_task*SCHED_LOAD_SCALE/this->cpu_power;
2066 pwr_move += this->cpu_power*min(SCHED_LOAD_SCALE, this_load + tmp); 2437 pwr_move += this->cpu_power*min(this_load_per_task, this_load + tmp);
2067 pwr_move /= SCHED_LOAD_SCALE; 2438 pwr_move /= SCHED_LOAD_SCALE;
2068 2439
2069 /* Move if we gain throughput */ 2440 /* Move if we gain throughput */
2070 if (pwr_move <= pwr_now) 2441 if (pwr_move <= pwr_now)
2071 goto out_balanced; 2442 goto out_balanced;
2072 2443
2073 *imbalance = 1; 2444 *imbalance = busiest_load_per_task;
2074 return busiest;
2075 } 2445 }
2076 2446
2077 /* Get rid of the scaling factor, rounding down as we divide */
2078 *imbalance = *imbalance / SCHED_LOAD_SCALE;
2079 return busiest; 2447 return busiest;
2080 2448
2081out_balanced: 2449out_balanced:
2450#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
2451 if (idle == NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
2452 goto ret;
2082 2453
2454 if (this == group_leader && group_leader != group_min) {
2455 *imbalance = min_load_per_task;
2456 return group_min;
2457 }
2458ret:
2459#endif
2083 *imbalance = 0; 2460 *imbalance = 0;
2084 return NULL; 2461 return NULL;
2085} 2462}
@@ -2087,19 +2464,23 @@ out_balanced:
2087/* 2464/*
2088 * find_busiest_queue - find the busiest runqueue among the cpus in group. 2465 * find_busiest_queue - find the busiest runqueue among the cpus in group.
2089 */ 2466 */
2090static runqueue_t *find_busiest_queue(struct sched_group *group, 2467static struct rq *
2091 enum idle_type idle) 2468find_busiest_queue(struct sched_group *group, enum idle_type idle,
2469 unsigned long imbalance)
2092{ 2470{
2093 unsigned long load, max_load = 0; 2471 struct rq *busiest = NULL, *rq;
2094 runqueue_t *busiest = NULL; 2472 unsigned long max_load = 0;
2095 int i; 2473 int i;
2096 2474
2097 for_each_cpu_mask(i, group->cpumask) { 2475 for_each_cpu_mask(i, group->cpumask) {
2098 load = source_load(i, 0); 2476 rq = cpu_rq(i);
2099 2477
2100 if (load > max_load) { 2478 if (rq->nr_running == 1 && rq->raw_weighted_load > imbalance)
2101 max_load = load; 2479 continue;
2102 busiest = cpu_rq(i); 2480
2481 if (rq->raw_weighted_load > max_load) {
2482 max_load = rq->raw_weighted_load;
2483 busiest = rq;
2103 } 2484 }
2104 } 2485 }
2105 2486
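find_busiest_queue() now ranks queues by raw_weighted_load and skips any queue whose single task is heavier than the requested imbalance: one task cannot be split, and pulling it would overshoot. A standalone sketch with invented queue contents (struct demo_rq is not a kernel type):

#include <stdio.h>

struct demo_rq { int cpu; unsigned long nr_running, raw_weighted_load; };

static struct demo_rq *busiest_queue(struct demo_rq *rq, int n,
				     unsigned long imbalance)
{
	struct demo_rq *busiest = NULL;
	unsigned long max_load = 0;
	int i;

	for (i = 0; i < n; i++) {
		/* a lone task heavier than the imbalance cannot help */
		if (rq[i].nr_running == 1 &&
		    rq[i].raw_weighted_load > imbalance)
			continue;
		if (rq[i].raw_weighted_load > max_load) {
			max_load = rq[i].raw_weighted_load;
			busiest = &rq[i];
		}
	}
	return busiest;
}

int main(void)
{
	struct demo_rq rq[] = {
		{ 0, 1, 3000 },		/* one heavy task: skipped */
		{ 1, 3, 2000 },		/* several lighter tasks   */
		{ 2, 2, 1500 },
	};
	struct demo_rq *b = busiest_queue(rq, 3, 1024);

	printf("busiest is cpu %d\n", b ? b->cpu : -1);
	return 0;
}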
@@ -2112,23 +2493,27 @@ static runqueue_t *find_busiest_queue(struct sched_group *group,
2112 */ 2493 */
2113#define MAX_PINNED_INTERVAL 512 2494#define MAX_PINNED_INTERVAL 512
2114 2495
2496static inline unsigned long minus_1_or_zero(unsigned long n)
2497{
2498 return n > 0 ? n - 1 : 0;
2499}
2500
2115/* 2501/*
2116 * Check this_cpu to ensure it is balanced within domain. Attempt to move 2502 * Check this_cpu to ensure it is balanced within domain. Attempt to move
2117 * tasks if there is an imbalance. 2503 * tasks if there is an imbalance.
2118 * 2504 *
2119 * Called with this_rq unlocked. 2505 * Called with this_rq unlocked.
2120 */ 2506 */
2121static int load_balance(int this_cpu, runqueue_t *this_rq, 2507static int load_balance(int this_cpu, struct rq *this_rq,
2122 struct sched_domain *sd, enum idle_type idle) 2508 struct sched_domain *sd, enum idle_type idle)
2123{ 2509{
2510 int nr_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
2124 struct sched_group *group; 2511 struct sched_group *group;
2125 runqueue_t *busiest;
2126 unsigned long imbalance; 2512 unsigned long imbalance;
2127 int nr_moved, all_pinned = 0; 2513 struct rq *busiest;
2128 int active_balance = 0;
2129 int sd_idle = 0;
2130 2514
2131 if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER) 2515 if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
2516 !sched_smt_power_savings)
2132 sd_idle = 1; 2517 sd_idle = 1;
2133 2518
2134 schedstat_inc(sd, lb_cnt[idle]); 2519 schedstat_inc(sd, lb_cnt[idle]);
@@ -2139,7 +2524,7 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
2139 goto out_balanced; 2524 goto out_balanced;
2140 } 2525 }
2141 2526
2142 busiest = find_busiest_queue(group, idle); 2527 busiest = find_busiest_queue(group, idle, imbalance);
2143 if (!busiest) { 2528 if (!busiest) {
2144 schedstat_inc(sd, lb_nobusyq[idle]); 2529 schedstat_inc(sd, lb_nobusyq[idle]);
2145 goto out_balanced; 2530 goto out_balanced;
@@ -2159,7 +2544,8 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
2159 */ 2544 */
2160 double_rq_lock(this_rq, busiest); 2545 double_rq_lock(this_rq, busiest);
2161 nr_moved = move_tasks(this_rq, this_cpu, busiest, 2546 nr_moved = move_tasks(this_rq, this_cpu, busiest,
2162 imbalance, sd, idle, &all_pinned); 2547 minus_1_or_zero(busiest->nr_running),
2548 imbalance, sd, idle, &all_pinned);
2163 double_rq_unlock(this_rq, busiest); 2549 double_rq_unlock(this_rq, busiest);
2164 2550
2165 /* All tasks on this runqueue were pinned by CPU affinity */ 2551 /* All tasks on this runqueue were pinned by CPU affinity */
@@ -2216,7 +2602,8 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
2216 sd->balance_interval *= 2; 2602 sd->balance_interval *= 2;
2217 } 2603 }
2218 2604
2219 if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER) 2605 if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
2606 !sched_smt_power_savings)
2220 return -1; 2607 return -1;
2221 return nr_moved; 2608 return nr_moved;
2222 2609
@@ -2231,7 +2618,8 @@ out_one_pinned:
2231 (sd->balance_interval < sd->max_interval)) 2618 (sd->balance_interval < sd->max_interval))
2232 sd->balance_interval *= 2; 2619 sd->balance_interval *= 2;
2233 2620
2234 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER) 2621 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
2622 !sched_smt_power_savings)
2235 return -1; 2623 return -1;
2236 return 0; 2624 return 0;
2237} 2625}
@@ -2243,16 +2631,16 @@ out_one_pinned:
2243 * Called from schedule when this_rq is about to become idle (NEWLY_IDLE). 2631 * Called from schedule when this_rq is about to become idle (NEWLY_IDLE).
2244 * this_rq is locked. 2632 * this_rq is locked.
2245 */ 2633 */
2246static int load_balance_newidle(int this_cpu, runqueue_t *this_rq, 2634static int
2247 struct sched_domain *sd) 2635load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
2248{ 2636{
2249 struct sched_group *group; 2637 struct sched_group *group;
2250 runqueue_t *busiest = NULL; 2638 struct rq *busiest = NULL;
2251 unsigned long imbalance; 2639 unsigned long imbalance;
2252 int nr_moved = 0; 2640 int nr_moved = 0;
2253 int sd_idle = 0; 2641 int sd_idle = 0;
2254 2642
2255 if (sd->flags & SD_SHARE_CPUPOWER) 2643 if (sd->flags & SD_SHARE_CPUPOWER && !sched_smt_power_savings)
2256 sd_idle = 1; 2644 sd_idle = 1;
2257 2645
2258 schedstat_inc(sd, lb_cnt[NEWLY_IDLE]); 2646 schedstat_inc(sd, lb_cnt[NEWLY_IDLE]);
@@ -2262,7 +2650,7 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
2262 goto out_balanced; 2650 goto out_balanced;
2263 } 2651 }
2264 2652
2265 busiest = find_busiest_queue(group, NEWLY_IDLE); 2653 busiest = find_busiest_queue(group, NEWLY_IDLE, imbalance);
2266 if (!busiest) { 2654 if (!busiest) {
2267 schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]); 2655 schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]);
2268 goto out_balanced; 2656 goto out_balanced;
@@ -2277,6 +2665,7 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
2277 /* Attempt to move tasks */ 2665 /* Attempt to move tasks */
2278 double_lock_balance(this_rq, busiest); 2666 double_lock_balance(this_rq, busiest);
2279 nr_moved = move_tasks(this_rq, this_cpu, busiest, 2667 nr_moved = move_tasks(this_rq, this_cpu, busiest,
2668 minus_1_or_zero(busiest->nr_running),
2280 imbalance, sd, NEWLY_IDLE, NULL); 2669 imbalance, sd, NEWLY_IDLE, NULL);
2281 spin_unlock(&busiest->lock); 2670 spin_unlock(&busiest->lock);
2282 } 2671 }
@@ -2292,9 +2681,11 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
2292 2681
2293out_balanced: 2682out_balanced:
2294 schedstat_inc(sd, lb_balanced[NEWLY_IDLE]); 2683 schedstat_inc(sd, lb_balanced[NEWLY_IDLE]);
2295 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER) 2684 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
2685 !sched_smt_power_savings)
2296 return -1; 2686 return -1;
2297 sd->nr_balance_failed = 0; 2687 sd->nr_balance_failed = 0;
2688
2298 return 0; 2689 return 0;
2299} 2690}
2300 2691
@@ -2302,16 +2693,15 @@ out_balanced:
2302 * idle_balance is called by schedule() if this_cpu is about to become 2693 * idle_balance is called by schedule() if this_cpu is about to become
2303 * idle. Attempts to pull tasks from other CPUs. 2694 * idle. Attempts to pull tasks from other CPUs.
2304 */ 2695 */
2305static void idle_balance(int this_cpu, runqueue_t *this_rq) 2696static void idle_balance(int this_cpu, struct rq *this_rq)
2306{ 2697{
2307 struct sched_domain *sd; 2698 struct sched_domain *sd;
2308 2699
2309 for_each_domain(this_cpu, sd) { 2700 for_each_domain(this_cpu, sd) {
2310 if (sd->flags & SD_BALANCE_NEWIDLE) { 2701 if (sd->flags & SD_BALANCE_NEWIDLE) {
2311 if (load_balance_newidle(this_cpu, this_rq, sd)) { 2702 /* If we've pulled tasks over stop searching: */
2312 /* We've pulled tasks over so stop searching */ 2703 if (load_balance_newidle(this_cpu, this_rq, sd))
2313 break; 2704 break;
2314 }
2315 } 2705 }
2316 } 2706 }
2317} 2707}
@@ -2324,14 +2714,14 @@ static void idle_balance(int this_cpu, runqueue_t *this_rq)
2324 * 2714 *
2325 * Called with busiest_rq locked. 2715 * Called with busiest_rq locked.
2326 */ 2716 */
2327static void active_load_balance(runqueue_t *busiest_rq, int busiest_cpu) 2717static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
2328{ 2718{
2329 struct sched_domain *sd;
2330 runqueue_t *target_rq;
2331 int target_cpu = busiest_rq->push_cpu; 2719 int target_cpu = busiest_rq->push_cpu;
2720 struct sched_domain *sd;
2721 struct rq *target_rq;
2332 2722
2723 /* Is there any task to move? */
2333 if (busiest_rq->nr_running <= 1) 2724 if (busiest_rq->nr_running <= 1)
2334 /* no task to move */
2335 return; 2725 return;
2336 2726
2337 target_rq = cpu_rq(target_cpu); 2727 target_rq = cpu_rq(target_cpu);
@@ -2347,21 +2737,22 @@ static void active_load_balance(runqueue_t *busiest_rq, int busiest_cpu)
2347 double_lock_balance(busiest_rq, target_rq); 2737 double_lock_balance(busiest_rq, target_rq);
2348 2738
2349 /* Search for an sd spanning us and the target CPU. */ 2739 /* Search for an sd spanning us and the target CPU. */
2350 for_each_domain(target_cpu, sd) 2740 for_each_domain(target_cpu, sd) {
2351 if ((sd->flags & SD_LOAD_BALANCE) && 2741 if ((sd->flags & SD_LOAD_BALANCE) &&
2352 cpu_isset(busiest_cpu, sd->span)) 2742 cpu_isset(busiest_cpu, sd->span))
2353 break; 2743 break;
2744 }
2354 2745
2355 if (unlikely(sd == NULL)) 2746 if (likely(sd)) {
2356 goto out; 2747 schedstat_inc(sd, alb_cnt);
2357
2358 schedstat_inc(sd, alb_cnt);
2359 2748
2360 if (move_tasks(target_rq, target_cpu, busiest_rq, 1, sd, SCHED_IDLE, NULL)) 2749 if (move_tasks(target_rq, target_cpu, busiest_rq, 1,
2361 schedstat_inc(sd, alb_pushed); 2750 RTPRIO_TO_LOAD_WEIGHT(100), sd, SCHED_IDLE,
2362 else 2751 NULL))
2363 schedstat_inc(sd, alb_failed); 2752 schedstat_inc(sd, alb_pushed);
2364out: 2753 else
2754 schedstat_inc(sd, alb_failed);
2755 }
2365 spin_unlock(&target_rq->lock); 2756 spin_unlock(&target_rq->lock);
2366} 2757}
2367 2758
@@ -2374,23 +2765,27 @@ out:
2374 * Balancing parameters are set up in arch_init_sched_domains. 2765 * Balancing parameters are set up in arch_init_sched_domains.
2375 */ 2766 */
2376 2767
2377/* Don't have all balancing operations going off at once */ 2768/* Don't have all balancing operations going off at once: */
2378#define CPU_OFFSET(cpu) (HZ * cpu / NR_CPUS) 2769static inline unsigned long cpu_offset(int cpu)
2770{
2771 return jiffies + cpu * HZ / NR_CPUS;
2772}
2379 2773
2380static void rebalance_tick(int this_cpu, runqueue_t *this_rq, 2774static void
2381 enum idle_type idle) 2775rebalance_tick(int this_cpu, struct rq *this_rq, enum idle_type idle)
2382{ 2776{
2383 unsigned long old_load, this_load; 2777 unsigned long this_load, interval, j = cpu_offset(this_cpu);
2384 unsigned long j = jiffies + CPU_OFFSET(this_cpu);
2385 struct sched_domain *sd; 2778 struct sched_domain *sd;
2386 int i; 2779 int i, scale;
2780
2781 this_load = this_rq->raw_weighted_load;
2782
2783 /* Update our load: */
2784 for (i = 0, scale = 1; i < 3; i++, scale <<= 1) {
2785 unsigned long old_load, new_load;
2387 2786
2388 this_load = this_rq->nr_running * SCHED_LOAD_SCALE;
2389 /* Update our load */
2390 for (i = 0; i < 3; i++) {
2391 unsigned long new_load = this_load;
2392 int scale = 1 << i;
2393 old_load = this_rq->cpu_load[i]; 2787 old_load = this_rq->cpu_load[i];
2788 new_load = this_load;
2394 /* 2789 /*
2395 * Round up the averaging division if load is increasing. This 2790 * Round up the averaging division if load is increasing. This
2396 * prevents us from getting stuck on 9 if the load is 10, for 2791 * prevents us from getting stuck on 9 if the load is 10, for
@@ -2402,8 +2797,6 @@ static void rebalance_tick(int this_cpu, runqueue_t *this_rq,
2402 } 2797 }
2403 2798
2404 for_each_domain(this_cpu, sd) { 2799 for_each_domain(this_cpu, sd) {
2405 unsigned long interval;
2406
2407 if (!(sd->flags & SD_LOAD_BALANCE)) 2800 if (!(sd->flags & SD_LOAD_BALANCE))
2408 continue; 2801 continue;
2409 2802
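rebalance_tick() still folds the instantaneous load into the cpu_load[] history, now starting from raw_weighted_load, with a longer time constant for each index (scale = 1 << i) and the division rounded up while load is rising. A small user-space run of the decaying-average recurrence, reconstructed from the surrounding sched.c code with an invented steady load of 10; dropping the round-up leaves cpu_load[1] stuck at 9, which is exactly the case the comment above describes:

#include <stdio.h>

int main(void)
{
	unsigned long cpu_load[3] = { 0, 0, 0 };
	unsigned long this_load = 10;		/* steady weighted load */
	int tick, i;

	for (tick = 0; tick < 8; tick++) {
		for (i = 0; i < 3; i++) {
			unsigned long scale = 1UL << i;
			unsigned long old_load = cpu_load[i];
			unsigned long new_load = this_load;

			if (new_load > old_load)	/* round up when rising */
				new_load += scale - 1;
			cpu_load[i] = (old_load * (scale - 1) + new_load) / scale;
		}
		printf("tick %d: %lu %lu %lu\n",
		       tick, cpu_load[0], cpu_load[1], cpu_load[2]);
	}
	return 0;
}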
@@ -2433,17 +2826,18 @@ static void rebalance_tick(int this_cpu, runqueue_t *this_rq,
2433/* 2826/*
2434 * on UP we do not need to balance between CPUs: 2827 * on UP we do not need to balance between CPUs:
2435 */ 2828 */
2436static inline void rebalance_tick(int cpu, runqueue_t *rq, enum idle_type idle) 2829static inline void rebalance_tick(int cpu, struct rq *rq, enum idle_type idle)
2437{ 2830{
2438} 2831}
2439static inline void idle_balance(int cpu, runqueue_t *rq) 2832static inline void idle_balance(int cpu, struct rq *rq)
2440{ 2833{
2441} 2834}
2442#endif 2835#endif
2443 2836
2444static inline int wake_priority_sleeper(runqueue_t *rq) 2837static inline int wake_priority_sleeper(struct rq *rq)
2445{ 2838{
2446 int ret = 0; 2839 int ret = 0;
2840
2447#ifdef CONFIG_SCHED_SMT 2841#ifdef CONFIG_SCHED_SMT
2448 spin_lock(&rq->lock); 2842 spin_lock(&rq->lock);
2449 /* 2843 /*
@@ -2467,25 +2861,26 @@ EXPORT_PER_CPU_SYMBOL(kstat);
2467 * This is called on clock ticks and on context switches. 2861 * This is called on clock ticks and on context switches.
2468 * Bank in p->sched_time the ns elapsed since the last tick or switch. 2862 * Bank in p->sched_time the ns elapsed since the last tick or switch.
2469 */ 2863 */
2470static inline void update_cpu_clock(task_t *p, runqueue_t *rq, 2864static inline void
2471 unsigned long long now) 2865update_cpu_clock(struct task_struct *p, struct rq *rq, unsigned long long now)
2472{ 2866{
2473 unsigned long long last = max(p->timestamp, rq->timestamp_last_tick); 2867 p->sched_time += now - max(p->timestamp, rq->timestamp_last_tick);
2474 p->sched_time += now - last;
2475} 2868}
2476 2869
2477/* 2870/*
2478 * Return current->sched_time plus any more ns on the sched_clock 2871 * Return current->sched_time plus any more ns on the sched_clock
2479 * that have not yet been banked. 2872 * that have not yet been banked.
2480 */ 2873 */
2481unsigned long long current_sched_time(const task_t *tsk) 2874unsigned long long current_sched_time(const struct task_struct *p)
2482{ 2875{
2483 unsigned long long ns; 2876 unsigned long long ns;
2484 unsigned long flags; 2877 unsigned long flags;
2878
2485 local_irq_save(flags); 2879 local_irq_save(flags);
2486 ns = max(tsk->timestamp, task_rq(tsk)->timestamp_last_tick); 2880 ns = max(p->timestamp, task_rq(p)->timestamp_last_tick);
2487 ns = tsk->sched_time + (sched_clock() - ns); 2881 ns = p->sched_time + sched_clock() - ns;
2488 local_irq_restore(flags); 2882 local_irq_restore(flags);
2883
2489 return ns; 2884 return ns;
2490} 2885}
2491 2886
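current_sched_time() adds to the banked sched_time only the nanoseconds elapsed since the later of the task's own timestamp and the runqueue's last tick; everything before that point was already banked by update_cpu_clock() on a previous tick or switch, so counting from the earlier timestamp would double-count. A tiny arithmetic sketch with invented timestamps:

#include <stdio.h>

static unsigned long long max_ull(unsigned long long a, unsigned long long b)
{
	return a > b ? a : b;
}

int main(void)
{
	unsigned long long sched_time = 5000000ULL;	/* banked so far, ns */
	unsigned long long timestamp  = 1000000000ULL;	/* last switch       */
	unsigned long long last_tick  = 1000400000ULL;	/* rq's last tick    */
	unsigned long long now        = 1000750000ULL;	/* sched_clock()     */

	/* only the 350000ns since the last tick are still unbanked */
	unsigned long long ns = sched_time + (now - max_ull(timestamp, last_tick));

	printf("current_sched_time ~= %llu ns\n", ns);
	return 0;
}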
@@ -2499,11 +2894,16 @@ unsigned long long current_sched_time(const task_t *tsk)
2499 * increasing number of running tasks. We also ignore the interactivity 2894 * increasing number of running tasks. We also ignore the interactivity
2500 * if a better static_prio task has expired: 2895 * if a better static_prio task has expired:
2501 */ 2896 */
2502#define EXPIRED_STARVING(rq) \ 2897static inline int expired_starving(struct rq *rq)
2503 ((STARVATION_LIMIT && ((rq)->expired_timestamp && \ 2898{
2504 (jiffies - (rq)->expired_timestamp >= \ 2899 if (rq->curr->static_prio > rq->best_expired_prio)
2505 STARVATION_LIMIT * ((rq)->nr_running) + 1))) || \ 2900 return 1;
2506 ((rq)->curr->static_prio > (rq)->best_expired_prio)) 2901 if (!STARVATION_LIMIT || !rq->expired_timestamp)
2902 return 0;
2903 if (jiffies - rq->expired_timestamp > STARVATION_LIMIT * rq->nr_running)
2904 return 1;
2905 return 0;
2906}
2507 2907
2508/* 2908/*
2509 * Account user cpu time to a process. 2909 * Account user cpu time to a process.
@@ -2536,7 +2936,7 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
2536 cputime_t cputime) 2936 cputime_t cputime)
2537{ 2937{
2538 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 2938 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
2539 runqueue_t *rq = this_rq(); 2939 struct rq *rq = this_rq();
2540 cputime64_t tmp; 2940 cputime64_t tmp;
2541 2941
2542 p->stime = cputime_add(p->stime, cputime); 2942 p->stime = cputime_add(p->stime, cputime);
@@ -2566,7 +2966,7 @@ void account_steal_time(struct task_struct *p, cputime_t steal)
2566{ 2966{
2567 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; 2967 struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
2568 cputime64_t tmp = cputime_to_cputime64(steal); 2968 cputime64_t tmp = cputime_to_cputime64(steal);
2569 runqueue_t *rq = this_rq(); 2969 struct rq *rq = this_rq();
2570 2970
2571 if (p == rq->idle) { 2971 if (p == rq->idle) {
2572 p->stime = cputime_add(p->stime, steal); 2972 p->stime = cputime_add(p->stime, steal);
@@ -2587,10 +2987,10 @@ void account_steal_time(struct task_struct *p, cputime_t steal)
2587 */ 2987 */
2588void scheduler_tick(void) 2988void scheduler_tick(void)
2589{ 2989{
2590 int cpu = smp_processor_id();
2591 runqueue_t *rq = this_rq();
2592 task_t *p = current;
2593 unsigned long long now = sched_clock(); 2990 unsigned long long now = sched_clock();
2991 struct task_struct *p = current;
2992 int cpu = smp_processor_id();
2993 struct rq *rq = cpu_rq(cpu);
2594 2994
2595 update_cpu_clock(p, rq, now); 2995 update_cpu_clock(p, rq, now);
2596 2996
@@ -2640,7 +3040,7 @@ void scheduler_tick(void)
2640 3040
2641 if (!rq->expired_timestamp) 3041 if (!rq->expired_timestamp)
2642 rq->expired_timestamp = jiffies; 3042 rq->expired_timestamp = jiffies;
2643 if (!TASK_INTERACTIVE(p) || EXPIRED_STARVING(rq)) { 3043 if (!TASK_INTERACTIVE(p) || expired_starving(rq)) {
2644 enqueue_task(p, rq->expired); 3044 enqueue_task(p, rq->expired);
2645 if (p->static_prio < rq->best_expired_prio) 3045 if (p->static_prio < rq->best_expired_prio)
2646 rq->best_expired_prio = p->static_prio; 3046 rq->best_expired_prio = p->static_prio;
@@ -2679,55 +3079,42 @@ out:
2679} 3079}
2680 3080
2681#ifdef CONFIG_SCHED_SMT 3081#ifdef CONFIG_SCHED_SMT
2682static inline void wakeup_busy_runqueue(runqueue_t *rq) 3082static inline void wakeup_busy_runqueue(struct rq *rq)
2683{ 3083{
2684 /* If an SMT runqueue is sleeping due to priority reasons wake it up */ 3084 /* If an SMT runqueue is sleeping due to priority reasons wake it up */
2685 if (rq->curr == rq->idle && rq->nr_running) 3085 if (rq->curr == rq->idle && rq->nr_running)
2686 resched_task(rq->idle); 3086 resched_task(rq->idle);
2687} 3087}
2688 3088
2689static void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq) 3089/*
3090 * Called with interrupt disabled and this_rq's runqueue locked.
3091 */
3092static void wake_sleeping_dependent(int this_cpu)
2690{ 3093{
2691 struct sched_domain *tmp, *sd = NULL; 3094 struct sched_domain *tmp, *sd = NULL;
2692 cpumask_t sibling_map;
2693 int i; 3095 int i;
2694 3096
2695 for_each_domain(this_cpu, tmp) 3097 for_each_domain(this_cpu, tmp) {
2696 if (tmp->flags & SD_SHARE_CPUPOWER) 3098 if (tmp->flags & SD_SHARE_CPUPOWER) {
2697 sd = tmp; 3099 sd = tmp;
3100 break;
3101 }
3102 }
2698 3103
2699 if (!sd) 3104 if (!sd)
2700 return; 3105 return;
2701 3106
2702 /* 3107 for_each_cpu_mask(i, sd->span) {
2703 * Unlock the current runqueue because we have to lock in 3108 struct rq *smt_rq = cpu_rq(i);
2704 * CPU order to avoid deadlocks. Caller knows that we might
2705 * unlock. We keep IRQs disabled.
2706 */
2707 spin_unlock(&this_rq->lock);
2708
2709 sibling_map = sd->span;
2710 3109
2711 for_each_cpu_mask(i, sibling_map) 3110 if (i == this_cpu)
2712 spin_lock(&cpu_rq(i)->lock); 3111 continue;
2713 /* 3112 if (unlikely(!spin_trylock(&smt_rq->lock)))
2714 * We clear this CPU from the mask. This both simplifies the 3113 continue;
2715 * inner loop and keps this_rq locked when we exit:
2716 */
2717 cpu_clear(this_cpu, sibling_map);
2718
2719 for_each_cpu_mask(i, sibling_map) {
2720 runqueue_t *smt_rq = cpu_rq(i);
2721 3114
2722 wakeup_busy_runqueue(smt_rq); 3115 wakeup_busy_runqueue(smt_rq);
3116 spin_unlock(&smt_rq->lock);
2723 } 3117 }
2724
2725 for_each_cpu_mask(i, sibling_map)
2726 spin_unlock(&cpu_rq(i)->lock);
2727 /*
2728 * We exit with this_cpu's rq still held and IRQs
2729 * still disabled:
2730 */
2731} 3118}
2732 3119
2733/* 3120/*
@@ -2735,57 +3122,53 @@ static void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
2735 * utilize, if another task runs on a sibling. This models the 3122 * utilize, if another task runs on a sibling. This models the
2736 * slowdown effect of other tasks running on siblings: 3123 * slowdown effect of other tasks running on siblings:
2737 */ 3124 */
2738static inline unsigned long smt_slice(task_t *p, struct sched_domain *sd) 3125static inline unsigned long
3126smt_slice(struct task_struct *p, struct sched_domain *sd)
2739{ 3127{
2740 return p->time_slice * (100 - sd->per_cpu_gain) / 100; 3128 return p->time_slice * (100 - sd->per_cpu_gain) / 100;
2741} 3129}
2742 3130
2743static int dependent_sleeper(int this_cpu, runqueue_t *this_rq) 3131/*
 3132 * To minimise lock contention and not have to drop this_rq's runqueue lock we only
 3133 * trylock the sibling runqueues and bypass those runqueues if we fail to
 3134 * acquire their lock. As we only trylock, the normal locking order does not
3135 * need to be obeyed.
3136 */
3137static int
3138dependent_sleeper(int this_cpu, struct rq *this_rq, struct task_struct *p)
2744{ 3139{
2745 struct sched_domain *tmp, *sd = NULL; 3140 struct sched_domain *tmp, *sd = NULL;
2746 cpumask_t sibling_map;
2747 prio_array_t *array;
2748 int ret = 0, i; 3141 int ret = 0, i;
2749 task_t *p;
2750 3142
2751 for_each_domain(this_cpu, tmp) 3143 /* kernel/rt threads do not participate in dependent sleeping */
2752 if (tmp->flags & SD_SHARE_CPUPOWER) 3144 if (!p->mm || rt_task(p))
3145 return 0;
3146
3147 for_each_domain(this_cpu, tmp) {
3148 if (tmp->flags & SD_SHARE_CPUPOWER) {
2753 sd = tmp; 3149 sd = tmp;
3150 break;
3151 }
3152 }
2754 3153
2755 if (!sd) 3154 if (!sd)
2756 return 0; 3155 return 0;
2757 3156
2758 /* 3157 for_each_cpu_mask(i, sd->span) {
2759 * The same locking rules and details apply as for 3158 struct task_struct *smt_curr;
2760 * wake_sleeping_dependent(): 3159 struct rq *smt_rq;
2761 */
2762 spin_unlock(&this_rq->lock);
2763 sibling_map = sd->span;
2764 for_each_cpu_mask(i, sibling_map)
2765 spin_lock(&cpu_rq(i)->lock);
2766 cpu_clear(this_cpu, sibling_map);
2767 3160
2768 /* 3161 if (i == this_cpu)
2769 * Establish next task to be run - it might have gone away because 3162 continue;
2770 * we released the runqueue lock above:
2771 */
2772 if (!this_rq->nr_running)
2773 goto out_unlock;
2774 array = this_rq->active;
2775 if (!array->nr_active)
2776 array = this_rq->expired;
2777 BUG_ON(!array->nr_active);
2778 3163
2779 p = list_entry(array->queue[sched_find_first_bit(array->bitmap)].next, 3164 smt_rq = cpu_rq(i);
2780 task_t, run_list); 3165 if (unlikely(!spin_trylock(&smt_rq->lock)))
3166 continue;
2781 3167
2782 for_each_cpu_mask(i, sibling_map) { 3168 smt_curr = smt_rq->curr;
2783 runqueue_t *smt_rq = cpu_rq(i);
2784 task_t *smt_curr = smt_rq->curr;
2785 3169
2786 /* Kernel threads do not participate in dependent sleeping */ 3170 if (!smt_curr->mm)
2787 if (!p->mm || !smt_curr->mm || rt_task(p)) 3171 goto unlock;
2788 goto check_smt_task;
2789 3172
2790 /* 3173 /*
2791 * If a user task with lower static priority than the 3174 * If a user task with lower static priority than the
@@ -2803,49 +3186,23 @@ static int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
2803 if ((jiffies % DEF_TIMESLICE) > 3186 if ((jiffies % DEF_TIMESLICE) >
2804 (sd->per_cpu_gain * DEF_TIMESLICE / 100)) 3187 (sd->per_cpu_gain * DEF_TIMESLICE / 100))
2805 ret = 1; 3188 ret = 1;
2806 } else 3189 } else {
2807 if (smt_curr->static_prio < p->static_prio && 3190 if (smt_curr->static_prio < p->static_prio &&
2808 !TASK_PREEMPTS_CURR(p, smt_rq) && 3191 !TASK_PREEMPTS_CURR(p, smt_rq) &&
2809 smt_slice(smt_curr, sd) > task_timeslice(p)) 3192 smt_slice(smt_curr, sd) > task_timeslice(p))
2810 ret = 1; 3193 ret = 1;
2811
2812check_smt_task:
2813 if ((!smt_curr->mm && smt_curr != smt_rq->idle) ||
2814 rt_task(smt_curr))
2815 continue;
2816 if (!p->mm) {
2817 wakeup_busy_runqueue(smt_rq);
2818 continue;
2819 }
2820
2821 /*
2822 * Reschedule a lower priority task on the SMT sibling for
2823 * it to be put to sleep, or wake it up if it has been put to
2824 * sleep for priority reasons to see if it should run now.
2825 */
2826 if (rt_task(p)) {
2827 if ((jiffies % DEF_TIMESLICE) >
2828 (sd->per_cpu_gain * DEF_TIMESLICE / 100))
2829 resched_task(smt_curr);
2830 } else {
2831 if (TASK_PREEMPTS_CURR(p, smt_rq) &&
2832 smt_slice(p, sd) > task_timeslice(smt_curr))
2833 resched_task(smt_curr);
2834 else
2835 wakeup_busy_runqueue(smt_rq);
2836 } 3194 }
3195unlock:
3196 spin_unlock(&smt_rq->lock);
2837 } 3197 }
2838out_unlock:
2839 for_each_cpu_mask(i, sibling_map)
2840 spin_unlock(&cpu_rq(i)->lock);
2841 return ret; 3198 return ret;
2842} 3199}
2843#else 3200#else
2844static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq) 3201static inline void wake_sleeping_dependent(int this_cpu)
2845{ 3202{
2846} 3203}
2847 3204static inline int
2848static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq) 3205dependent_sleeper(int this_cpu, struct rq *this_rq, struct task_struct *p)
2849{ 3206{
2850 return 0; 3207 return 0;
2851} 3208}
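The rewritten wake_sleeping_dependent() and dependent_sleeper() no longer drop this_rq->lock and take every sibling lock in CPU order; they trylock each sibling and bypass it on contention, so no global lock order is needed while a runqueue lock is already held. A pthread sketch of that pattern (NR_SIBLINGS, sibling_lock[] and visit_siblings() are invented names); the accepted trade-off is that a contended sibling is occasionally left uninspected:

#include <pthread.h>
#include <stdio.h>

#define NR_SIBLINGS 4

static pthread_mutex_t sibling_lock[NR_SIBLINGS] = {
	PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER,
	PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER,
};

/* Visit the siblings while already holding our own lock. */
static void visit_siblings(int this_cpu)
{
	int i;

	for (i = 0; i < NR_SIBLINGS; i++) {
		if (i == this_cpu)
			continue;
		if (pthread_mutex_trylock(&sibling_lock[i]) != 0)
			continue;	/* busy: bypass rather than block */
		printf("cpu %d inspected sibling %d\n", this_cpu, i);
		pthread_mutex_unlock(&sibling_lock[i]);
	}
}

int main(void)
{
	pthread_mutex_lock(&sibling_lock[0]);	/* "our" runqueue lock */
	visit_siblings(0);
	pthread_mutex_unlock(&sibling_lock[0]);
	return 0;
}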
@@ -2858,12 +3215,13 @@ void fastcall add_preempt_count(int val)
2858 /* 3215 /*
2859 * Underflow? 3216 * Underflow?
2860 */ 3217 */
2861 BUG_ON((preempt_count() < 0)); 3218 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
3219 return;
2862 preempt_count() += val; 3220 preempt_count() += val;
2863 /* 3221 /*
2864 * Spinlock count overflowing soon? 3222 * Spinlock count overflowing soon?
2865 */ 3223 */
2866 BUG_ON((preempt_count() & PREEMPT_MASK) >= PREEMPT_MASK-10); 3224 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= PREEMPT_MASK-10);
2867} 3225}
2868EXPORT_SYMBOL(add_preempt_count); 3226EXPORT_SYMBOL(add_preempt_count);
2869 3227
@@ -2872,11 +3230,15 @@ void fastcall sub_preempt_count(int val)
2872 /* 3230 /*
2873 * Underflow? 3231 * Underflow?
2874 */ 3232 */
2875 BUG_ON(val > preempt_count()); 3233 if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
3234 return;
2876 /* 3235 /*
2877 * Is the spinlock portion underflowing? 3236 * Is the spinlock portion underflowing?
2878 */ 3237 */
2879 BUG_ON((val < PREEMPT_MASK) && !(preempt_count() & PREEMPT_MASK)); 3238 if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
3239 !(preempt_count() & PREEMPT_MASK)))
3240 return;
3241
2880 preempt_count() -= val; 3242 preempt_count() -= val;
2881} 3243}
2882EXPORT_SYMBOL(sub_preempt_count); 3244EXPORT_SYMBOL(sub_preempt_count);
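add_preempt_count() and sub_preempt_count() now report a broken preempt count through DEBUG_LOCKS_WARN_ON() and refuse the update rather than BUG()ing the machine, so a debugging kernel keeps running after the first report. A rough user-space imitation of that fail-soft style (debug_locks, demo_warn_on() and the counter are invented; like the real helper it goes quiet after the first warning):

#include <stdio.h>

static int debug_locks = 1;		/* first violation turns warnings off */
static int preempt_count_demo;

/* Report a violation once and keep running instead of aborting. */
static int demo_warn_on(int cond, const char *what)
{
	if (cond && debug_locks) {
		debug_locks = 0;
		fprintf(stderr, "WARNING: %s\n", what);
	}
	return cond;
}

static void add_preempt_count_demo(int val)
{
	if (demo_warn_on(preempt_count_demo < 0, "preempt count underflow"))
		return;			/* refuse the bogus update */
	preempt_count_demo += val;
}

static void sub_preempt_count_demo(int val)
{
	if (demo_warn_on(val > preempt_count_demo, "preempt count underflow"))
		return;
	preempt_count_demo -= val;
}

int main(void)
{
	add_preempt_count_demo(1);
	sub_preempt_count_demo(5);	/* warns, leaves the count alone */
	printf("preempt_count = %d\n", preempt_count_demo);
	return 0;
}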
@@ -2894,14 +3256,14 @@ static inline int interactive_sleep(enum sleep_type sleep_type)
2894 */ 3256 */
2895asmlinkage void __sched schedule(void) 3257asmlinkage void __sched schedule(void)
2896{ 3258{
2897 long *switch_count; 3259 struct task_struct *prev, *next;
2898 task_t *prev, *next; 3260 struct prio_array *array;
2899 runqueue_t *rq;
2900 prio_array_t *array;
2901 struct list_head *queue; 3261 struct list_head *queue;
2902 unsigned long long now; 3262 unsigned long long now;
2903 unsigned long run_time; 3263 unsigned long run_time;
2904 int cpu, idx, new_prio; 3264 int cpu, idx, new_prio;
3265 long *switch_count;
3266 struct rq *rq;
2905 3267
2906 /* 3268 /*
2907 * Test if we are atomic. Since do_exit() needs to call into 3269 * Test if we are atomic. Since do_exit() needs to call into
@@ -2967,32 +3329,13 @@ need_resched_nonpreemptible:
2967 3329
2968 cpu = smp_processor_id(); 3330 cpu = smp_processor_id();
2969 if (unlikely(!rq->nr_running)) { 3331 if (unlikely(!rq->nr_running)) {
2970go_idle:
2971 idle_balance(cpu, rq); 3332 idle_balance(cpu, rq);
2972 if (!rq->nr_running) { 3333 if (!rq->nr_running) {
2973 next = rq->idle; 3334 next = rq->idle;
2974 rq->expired_timestamp = 0; 3335 rq->expired_timestamp = 0;
2975 wake_sleeping_dependent(cpu, rq); 3336 wake_sleeping_dependent(cpu);
2976 /*
2977 * wake_sleeping_dependent() might have released
2978 * the runqueue, so break out if we got new
2979 * tasks meanwhile:
2980 */
2981 if (!rq->nr_running)
2982 goto switch_tasks;
2983 }
2984 } else {
2985 if (dependent_sleeper(cpu, rq)) {
2986 next = rq->idle;
2987 goto switch_tasks; 3337 goto switch_tasks;
2988 } 3338 }
2989 /*
2990 * dependent_sleeper() releases and reacquires the runqueue
2991 * lock, hence go into the idle loop if the rq went
2992 * empty meanwhile:
2993 */
2994 if (unlikely(!rq->nr_running))
2995 goto go_idle;
2996 } 3339 }
2997 3340
2998 array = rq->active; 3341 array = rq->active;
@@ -3010,7 +3353,7 @@ go_idle:
3010 3353
3011 idx = sched_find_first_bit(array->bitmap); 3354 idx = sched_find_first_bit(array->bitmap);
3012 queue = array->queue + idx; 3355 queue = array->queue + idx;
3013 next = list_entry(queue->next, task_t, run_list); 3356 next = list_entry(queue->next, struct task_struct, run_list);
3014 3357
3015 if (!rt_task(next) && interactive_sleep(next->sleep_type)) { 3358 if (!rt_task(next) && interactive_sleep(next->sleep_type)) {
3016 unsigned long long delta = now - next->timestamp; 3359 unsigned long long delta = now - next->timestamp;
@@ -3030,6 +3373,8 @@ go_idle:
3030 } 3373 }
3031 } 3374 }
3032 next->sleep_type = SLEEP_NORMAL; 3375 next->sleep_type = SLEEP_NORMAL;
3376 if (dependent_sleeper(cpu, rq, next))
3377 next = rq->idle;
3033switch_tasks: 3378switch_tasks:
3034 if (next == rq->idle) 3379 if (next == rq->idle)
3035 schedstat_inc(rq, sched_goidle); 3380 schedstat_inc(rq, sched_goidle);
@@ -3071,12 +3416,11 @@ switch_tasks:
3071 if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) 3416 if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
3072 goto need_resched; 3417 goto need_resched;
3073} 3418}
3074
3075EXPORT_SYMBOL(schedule); 3419EXPORT_SYMBOL(schedule);
3076 3420
3077#ifdef CONFIG_PREEMPT 3421#ifdef CONFIG_PREEMPT
3078/* 3422/*
3079 * this is is the entry point to schedule() from in-kernel preemption 3423 * this is the entry point to schedule() from in-kernel preemption
3080 * off of preempt_enable. Kernel preemptions off return from interrupt 3424 * off of preempt_enable. Kernel preemptions off return from interrupt
3081 * occur there and call schedule directly. 3425 * occur there and call schedule directly.
3082 */ 3426 */
@@ -3116,11 +3460,10 @@ need_resched:
3116 if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) 3460 if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
3117 goto need_resched; 3461 goto need_resched;
3118} 3462}
3119
3120EXPORT_SYMBOL(preempt_schedule); 3463EXPORT_SYMBOL(preempt_schedule);
3121 3464
3122/* 3465/*
3123 * this is is the entry point to schedule() from kernel preemption 3466 * this is the entry point to schedule() from kernel preemption
3124 * off of irq context. 3467 * off of irq context.
3125 * Note, that this is called and return with irqs disabled. This will 3468 * Note, that this is called and return with irqs disabled. This will
3126 * protect us against recursive calling from irq. 3469 * protect us against recursive calling from irq.
@@ -3132,7 +3475,7 @@ asmlinkage void __sched preempt_schedule_irq(void)
3132 struct task_struct *task = current; 3475 struct task_struct *task = current;
3133 int saved_lock_depth; 3476 int saved_lock_depth;
3134#endif 3477#endif
3135 /* Catch callers which need to be fixed*/ 3478 /* Catch callers which need to be fixed */
3136 BUG_ON(ti->preempt_count || !irqs_disabled()); 3479 BUG_ON(ti->preempt_count || !irqs_disabled());
3137 3480
3138need_resched: 3481need_resched:
@@ -3165,10 +3508,8 @@ need_resched:
3165int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, 3508int default_wake_function(wait_queue_t *curr, unsigned mode, int sync,
3166 void *key) 3509 void *key)
3167{ 3510{
3168 task_t *p = curr->private; 3511 return try_to_wake_up(curr->private, mode, sync);
3169 return try_to_wake_up(p, mode, sync);
3170} 3512}
3171
3172EXPORT_SYMBOL(default_wake_function); 3513EXPORT_SYMBOL(default_wake_function);
3173 3514
3174/* 3515/*
@@ -3186,13 +3527,11 @@ static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
3186 struct list_head *tmp, *next; 3527 struct list_head *tmp, *next;
3187 3528
3188 list_for_each_safe(tmp, next, &q->task_list) { 3529 list_for_each_safe(tmp, next, &q->task_list) {
3189 wait_queue_t *curr; 3530 wait_queue_t *curr = list_entry(tmp, wait_queue_t, task_list);
3190 unsigned flags; 3531 unsigned flags = curr->flags;
3191 curr = list_entry(tmp, wait_queue_t, task_list); 3532
3192 flags = curr->flags;
3193 if (curr->func(curr, mode, sync, key) && 3533 if (curr->func(curr, mode, sync, key) &&
3194 (flags & WQ_FLAG_EXCLUSIVE) && 3534 (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive)
3195 !--nr_exclusive)
3196 break; 3535 break;
3197 } 3536 }
3198} 3537}
@@ -3213,7 +3552,6 @@ void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode,
3213 __wake_up_common(q, mode, nr_exclusive, 0, key); 3552 __wake_up_common(q, mode, nr_exclusive, 0, key);
3214 spin_unlock_irqrestore(&q->lock, flags); 3553 spin_unlock_irqrestore(&q->lock, flags);
3215} 3554}
3216
3217EXPORT_SYMBOL(__wake_up); 3555EXPORT_SYMBOL(__wake_up);
3218 3556
3219/* 3557/*
@@ -3282,6 +3620,7 @@ EXPORT_SYMBOL(complete_all);
3282void fastcall __sched wait_for_completion(struct completion *x) 3620void fastcall __sched wait_for_completion(struct completion *x)
3283{ 3621{
3284 might_sleep(); 3622 might_sleep();
3623
3285 spin_lock_irq(&x->wait.lock); 3624 spin_lock_irq(&x->wait.lock);
3286 if (!x->done) { 3625 if (!x->done) {
3287 DECLARE_WAITQUEUE(wait, current); 3626 DECLARE_WAITQUEUE(wait, current);
@@ -3426,7 +3765,6 @@ void fastcall __sched interruptible_sleep_on(wait_queue_head_t *q)
3426 schedule(); 3765 schedule();
3427 SLEEP_ON_TAIL 3766 SLEEP_ON_TAIL
3428} 3767}
3429
3430EXPORT_SYMBOL(interruptible_sleep_on); 3768EXPORT_SYMBOL(interruptible_sleep_on);
3431 3769
3432long fastcall __sched 3770long fastcall __sched
@@ -3442,7 +3780,6 @@ interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout)
3442 3780
3443 return timeout; 3781 return timeout;
3444} 3782}
3445
3446EXPORT_SYMBOL(interruptible_sleep_on_timeout); 3783EXPORT_SYMBOL(interruptible_sleep_on_timeout);
3447 3784
3448void fastcall __sched sleep_on(wait_queue_head_t *q) 3785void fastcall __sched sleep_on(wait_queue_head_t *q)
@@ -3455,7 +3792,6 @@ void fastcall __sched sleep_on(wait_queue_head_t *q)
3455 schedule(); 3792 schedule();
3456 SLEEP_ON_TAIL 3793 SLEEP_ON_TAIL
3457} 3794}
3458
3459EXPORT_SYMBOL(sleep_on); 3795EXPORT_SYMBOL(sleep_on);
3460 3796
3461long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout) 3797long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
@@ -3473,12 +3809,65 @@ long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
3473 3809
3474EXPORT_SYMBOL(sleep_on_timeout); 3810EXPORT_SYMBOL(sleep_on_timeout);
3475 3811
3476void set_user_nice(task_t *p, long nice) 3812#ifdef CONFIG_RT_MUTEXES
3813
3814/*
3815 * rt_mutex_setprio - set the current priority of a task
3816 * @p: task
3817 * @prio: prio value (kernel-internal form)
3818 *
3819 * This function changes the 'effective' priority of a task. It does
3820 * not touch ->normal_prio like __setscheduler().
3821 *
3822 * Used by the rt_mutex code to implement priority inheritance logic.
3823 */
3824void rt_mutex_setprio(struct task_struct *p, int prio)
3477{ 3825{
3826 struct prio_array *array;
3478 unsigned long flags; 3827 unsigned long flags;
3479 prio_array_t *array; 3828 struct rq *rq;
3480 runqueue_t *rq; 3829 int oldprio;
3481 int old_prio, new_prio, delta; 3830
3831 BUG_ON(prio < 0 || prio > MAX_PRIO);
3832
3833 rq = task_rq_lock(p, &flags);
3834
3835 oldprio = p->prio;
3836 array = p->array;
3837 if (array)
3838 dequeue_task(p, array);
3839 p->prio = prio;
3840
3841 if (array) {
3842 /*
3843 * If changing to an RT priority then queue it
3844 * in the active array!
3845 */
3846 if (rt_task(p))
3847 array = rq->active;
3848 enqueue_task(p, array);
3849 /*
3850 * Reschedule if we are currently running on this runqueue and
3851 * our priority decreased, or if we are not currently running on
3852 * this runqueue and our priority is higher than the current's
3853 */
3854 if (task_running(rq, p)) {
3855 if (p->prio > oldprio)
3856 resched_task(rq->curr);
3857 } else if (TASK_PREEMPTS_CURR(p, rq))
3858 resched_task(rq->curr);
3859 }
3860 task_rq_unlock(rq, &flags);
3861}
3862
3863#endif
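
A minimal userspace sketch of the boost/restore idea that rt_mutex_setprio() serves: the effective priority is raised to the top waiter's priority and later dropped back to the policy-derived value, which is why the function above touches ->prio but not ->normal_prio. The struct, the function names and the min() rule below are illustrative assumptions, not the kernel's rt_mutex code; as in the kernel, a lower numeric value means higher priority.

#include <stdio.h>

/* Illustrative model only: lower value == higher priority. */
struct toy_task {
	int normal_prio;	/* priority derived from policy/nice */
	int prio;		/* effective priority, may be boosted */
};

/* Boost the lock owner to the top waiter's priority if that is higher. */
static void pi_boost(struct toy_task *owner, int top_waiter_prio)
{
	owner->prio = top_waiter_prio < owner->normal_prio ?
			top_waiter_prio : owner->normal_prio;
}

/* Drop the boost once no waiter depends on the owner any more. */
static void pi_restore(struct toy_task *owner)
{
	owner->prio = owner->normal_prio;
}

int main(void)
{
	struct toy_task owner = { .normal_prio = 120, .prio = 120 };

	pi_boost(&owner, 50);	/* an RT waiter (prio 50) blocks on the lock */
	printf("boosted:  prio=%d normal_prio=%d\n", owner.prio, owner.normal_prio);

	pi_restore(&owner);	/* lock released, boost undone */
	printf("restored: prio=%d normal_prio=%d\n", owner.prio, owner.normal_prio);
	return 0;
}
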
3864
3865void set_user_nice(struct task_struct *p, long nice)
3866{
3867 struct prio_array *array;
3868 int old_prio, delta;
3869 unsigned long flags;
3870 struct rq *rq;
3482 3871
3483 if (TASK_NICE(p) == nice || nice < -20 || nice > 19) 3872 if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
3484 return; 3873 return;
@@ -3493,22 +3882,25 @@ void set_user_nice(task_t *p, long nice)
3493 * it won't have any effect on scheduling until the task is 3882 * it won't have any effect on scheduling until the task is
3494 * not SCHED_NORMAL/SCHED_BATCH: 3883 * not SCHED_NORMAL/SCHED_BATCH:
3495 */ 3884 */
3496 if (rt_task(p)) { 3885 if (has_rt_policy(p)) {
3497 p->static_prio = NICE_TO_PRIO(nice); 3886 p->static_prio = NICE_TO_PRIO(nice);
3498 goto out_unlock; 3887 goto out_unlock;
3499 } 3888 }
3500 array = p->array; 3889 array = p->array;
3501 if (array) 3890 if (array) {
3502 dequeue_task(p, array); 3891 dequeue_task(p, array);
3892 dec_raw_weighted_load(rq, p);
3893 }
3503 3894
3504 old_prio = p->prio;
3505 new_prio = NICE_TO_PRIO(nice);
3506 delta = new_prio - old_prio;
3507 p->static_prio = NICE_TO_PRIO(nice); 3895 p->static_prio = NICE_TO_PRIO(nice);
3508 p->prio += delta; 3896 set_load_weight(p);
3897 old_prio = p->prio;
3898 p->prio = effective_prio(p);
3899 delta = p->prio - old_prio;
3509 3900
3510 if (array) { 3901 if (array) {
3511 enqueue_task(p, array); 3902 enqueue_task(p, array);
3903 inc_raw_weighted_load(rq, p);
3512 /* 3904 /*
3513 * If the task increased its priority or is running and 3905 * If the task increased its priority or is running and
3514 * lowered its priority, then reschedule its CPU: 3906 * lowered its priority, then reschedule its CPU:
@@ -3519,7 +3911,6 @@ void set_user_nice(task_t *p, long nice)
3519out_unlock: 3911out_unlock:
3520 task_rq_unlock(rq, &flags); 3912 task_rq_unlock(rq, &flags);
3521} 3913}
3522
3523EXPORT_SYMBOL(set_user_nice); 3914EXPORT_SYMBOL(set_user_nice);
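
For reference, the nice <-> static_prio arithmetic that set_user_nice() relies on before recomputing the effective priority and load weight, assuming the conventional definitions of this scheduler generation (MAX_RT_PRIO == 100, NICE_TO_PRIO(nice) == MAX_RT_PRIO + nice + 20); the constants are assumptions for illustration, not quoted from this patch.

#include <stdio.h>

/* Assumed constants/macros from this kernel era (not part of the patch). */
#define MAX_RT_PRIO		100
#define NICE_TO_PRIO(nice)	(MAX_RT_PRIO + (nice) + 20)
#define PRIO_TO_NICE(prio)	((prio) - MAX_RT_PRIO - 20)

int main(void)
{
	int nice;

	/* nice -20..19 maps onto static_prio 100..139 */
	for (nice = -20; nice <= 19; nice += 13) {
		int static_prio = NICE_TO_PRIO(nice);

		printf("nice %3d -> static_prio %3d -> nice %3d\n",
		       nice, static_prio, PRIO_TO_NICE(static_prio));
	}
	return 0;
}
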
3524 3915
3525/* 3916/*
@@ -3527,10 +3918,11 @@ EXPORT_SYMBOL(set_user_nice);
3527 * @p: task 3918 * @p: task
3528 * @nice: nice value 3919 * @nice: nice value
3529 */ 3920 */
3530int can_nice(const task_t *p, const int nice) 3921int can_nice(const struct task_struct *p, const int nice)
3531{ 3922{
3532 /* convert nice value [19,-20] to rlimit style value [1,40] */ 3923 /* convert nice value [19,-20] to rlimit style value [1,40] */
3533 int nice_rlim = 20 - nice; 3924 int nice_rlim = 20 - nice;
3925
3534 return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur || 3926 return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur ||
3535 capable(CAP_SYS_NICE)); 3927 capable(CAP_SYS_NICE));
3536} 3928}
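
can_nice() compares a converted value against the RLIMIT_NICE soft limit; a small standalone sketch of that 20 - nice conversion follows (the soft limit of 25 is an arbitrary example value, not taken from the patch):

#include <stdio.h>

/* Mirror of the conversion used by can_nice(): nice 19..-20 -> 1..40. */
static int nice_to_rlim(int nice)
{
	return 20 - nice;
}

int main(void)
{
	unsigned long rlim_cur = 25;	/* example RLIMIT_NICE soft limit */
	int nice;

	for (nice = -20; nice <= 19; nice += 13)
		printf("nice %3d -> rlim value %2d -> %s\n",
		       nice, nice_to_rlim(nice),
		       nice_to_rlim(nice) <= (int)rlim_cur ?
				"allowed" : "needs CAP_SYS_NICE");
	return 0;
}
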
@@ -3546,8 +3938,7 @@ int can_nice(const task_t *p, const int nice)
3546 */ 3938 */
3547asmlinkage long sys_nice(int increment) 3939asmlinkage long sys_nice(int increment)
3548{ 3940{
3549 int retval; 3941 long nice, retval;
3550 long nice;
3551 3942
3552 /* 3943 /*
3553 * Setpriority might change our priority at the same moment. 3944 * Setpriority might change our priority at the same moment.
@@ -3586,7 +3977,7 @@ asmlinkage long sys_nice(int increment)
3586 * RT tasks are offset by -200. Normal tasks are centered 3977 * RT tasks are offset by -200. Normal tasks are centered
3587 * around 0, value goes from -16 to +15. 3978 * around 0, value goes from -16 to +15.
3588 */ 3979 */
3589int task_prio(const task_t *p) 3980int task_prio(const struct task_struct *p)
3590{ 3981{
3591 return p->prio - MAX_RT_PRIO; 3982 return p->prio - MAX_RT_PRIO;
3592} 3983}
@@ -3595,7 +3986,7 @@ int task_prio(const task_t *p)
3595 * task_nice - return the nice value of a given task. 3986 * task_nice - return the nice value of a given task.
3596 * @p: the task in question. 3987 * @p: the task in question.
3597 */ 3988 */
3598int task_nice(const task_t *p) 3989int task_nice(const struct task_struct *p)
3599{ 3990{
3600 return TASK_NICE(p); 3991 return TASK_NICE(p);
3601} 3992}
@@ -3614,7 +4005,7 @@ int idle_cpu(int cpu)
3614 * idle_task - return the idle task for a given cpu. 4005 * idle_task - return the idle task for a given cpu.
3615 * @cpu: the processor in question. 4006 * @cpu: the processor in question.
3616 */ 4007 */
3617task_t *idle_task(int cpu) 4008struct task_struct *idle_task(int cpu)
3618{ 4009{
3619 return cpu_rq(cpu)->idle; 4010 return cpu_rq(cpu)->idle;
3620} 4011}
@@ -3623,7 +4014,7 @@ task_t *idle_task(int cpu)
3623 * find_process_by_pid - find a process with a matching PID value. 4014 * find_process_by_pid - find a process with a matching PID value.
3624 * @pid: the pid in question. 4015 * @pid: the pid in question.
3625 */ 4016 */
3626static inline task_t *find_process_by_pid(pid_t pid) 4017static inline struct task_struct *find_process_by_pid(pid_t pid)
3627{ 4018{
3628 return pid ? find_task_by_pid(pid) : current; 4019 return pid ? find_task_by_pid(pid) : current;
3629} 4020}
@@ -3632,18 +4023,18 @@ static inline task_t *find_process_by_pid(pid_t pid)
3632static void __setscheduler(struct task_struct *p, int policy, int prio) 4023static void __setscheduler(struct task_struct *p, int policy, int prio)
3633{ 4024{
3634 BUG_ON(p->array); 4025 BUG_ON(p->array);
4026
3635 p->policy = policy; 4027 p->policy = policy;
3636 p->rt_priority = prio; 4028 p->rt_priority = prio;
3637 if (policy != SCHED_NORMAL && policy != SCHED_BATCH) { 4029 p->normal_prio = normal_prio(p);
3638 p->prio = MAX_RT_PRIO-1 - p->rt_priority; 4030 /* we are holding p->pi_lock already */
3639 } else { 4031 p->prio = rt_mutex_getprio(p);
3640 p->prio = p->static_prio; 4032 /*
3641 /* 4033 * SCHED_BATCH tasks are treated as perpetual CPU hogs:
3642 * SCHED_BATCH tasks are treated as perpetual CPU hogs: 4034 */
3643 */ 4035 if (policy == SCHED_BATCH)
3644 if (policy == SCHED_BATCH) 4036 p->sleep_avg = 0;
3645 p->sleep_avg = 0; 4037 set_load_weight(p);
3646 }
3647} 4038}
3648 4039
3649/** 4040/**
@@ -3656,12 +4047,13 @@ static void __setscheduler(struct task_struct *p, int policy, int prio)
3656int sched_setscheduler(struct task_struct *p, int policy, 4047int sched_setscheduler(struct task_struct *p, int policy,
3657 struct sched_param *param) 4048 struct sched_param *param)
3658{ 4049{
3659 int retval; 4050 int retval, oldprio, oldpolicy = -1;
3660 int oldprio, oldpolicy = -1; 4051 struct prio_array *array;
3661 prio_array_t *array;
3662 unsigned long flags; 4052 unsigned long flags;
3663 runqueue_t *rq; 4053 struct rq *rq;
3664 4054
4055 /* may grab non-irq protected spin_locks */
4056 BUG_ON(in_interrupt());
3665recheck: 4057recheck:
3666 /* double check policy once rq lock held */ 4058 /* double check policy once rq lock held */
3667 if (policy < 0) 4059 if (policy < 0)
@@ -3710,14 +4102,20 @@ recheck:
3710 if (retval) 4102 if (retval)
3711 return retval; 4103 return retval;
3712 /* 4104 /*
4105 * make sure no PI-waiters arrive (or leave) while we are
4106 * changing the priority of the task:
4107 */
4108 spin_lock_irqsave(&p->pi_lock, flags);
4109 /*
3713 * To be able to change p->policy safely, the appropriate 4110
3714 * runqueue lock must be held. 4111 * runqueue lock must be held.
3715 */ 4112 */
3716 rq = task_rq_lock(p, &flags); 4113 rq = __task_rq_lock(p);
3717 /* recheck policy now with rq lock held */ 4114 /* recheck policy now with rq lock held */
3718 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { 4115 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
3719 policy = oldpolicy = -1; 4116 policy = oldpolicy = -1;
3720 task_rq_unlock(rq, &flags); 4117 __task_rq_unlock(rq);
4118 spin_unlock_irqrestore(&p->pi_lock, flags);
3721 goto recheck; 4119 goto recheck;
3722 } 4120 }
3723 array = p->array; 4121 array = p->array;
@@ -3738,7 +4136,11 @@ recheck:
3738 } else if (TASK_PREEMPTS_CURR(p, rq)) 4136 } else if (TASK_PREEMPTS_CURR(p, rq))
3739 resched_task(rq->curr); 4137 resched_task(rq->curr);
3740 } 4138 }
3741 task_rq_unlock(rq, &flags); 4139 __task_rq_unlock(rq);
4140 spin_unlock_irqrestore(&p->pi_lock, flags);
4141
4142 rt_mutex_adjust_pi(p);
4143
3742 return 0; 4144 return 0;
3743} 4145}
3744EXPORT_SYMBOL_GPL(sched_setscheduler); 4146EXPORT_SYMBOL_GPL(sched_setscheduler);
@@ -3746,9 +4148,9 @@ EXPORT_SYMBOL_GPL(sched_setscheduler);
3746static int 4148static int
3747do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) 4149do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
3748{ 4150{
3749 int retval;
3750 struct sched_param lparam; 4151 struct sched_param lparam;
3751 struct task_struct *p; 4152 struct task_struct *p;
4153 int retval;
3752 4154
3753 if (!param || pid < 0) 4155 if (!param || pid < 0)
3754 return -EINVAL; 4156 return -EINVAL;
@@ -3760,8 +4162,11 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
3760 read_unlock_irq(&tasklist_lock); 4162 read_unlock_irq(&tasklist_lock);
3761 return -ESRCH; 4163 return -ESRCH;
3762 } 4164 }
3763 retval = sched_setscheduler(p, policy, &lparam); 4165 get_task_struct(p);
3764 read_unlock_irq(&tasklist_lock); 4166 read_unlock_irq(&tasklist_lock);
4167 retval = sched_setscheduler(p, policy, &lparam);
4168 put_task_struct(p);
4169
3765 return retval; 4170 return retval;
3766} 4171}
3767 4172
@@ -3797,8 +4202,8 @@ asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param)
3797 */ 4202 */
3798asmlinkage long sys_sched_getscheduler(pid_t pid) 4203asmlinkage long sys_sched_getscheduler(pid_t pid)
3799{ 4204{
4205 struct task_struct *p;
3800 int retval = -EINVAL; 4206 int retval = -EINVAL;
3801 task_t *p;
3802 4207
3803 if (pid < 0) 4208 if (pid < 0)
3804 goto out_nounlock; 4209 goto out_nounlock;
@@ -3825,8 +4230,8 @@ out_nounlock:
3825asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param) 4230asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param)
3826{ 4231{
3827 struct sched_param lp; 4232 struct sched_param lp;
4233 struct task_struct *p;
3828 int retval = -EINVAL; 4234 int retval = -EINVAL;
3829 task_t *p;
3830 4235
3831 if (!param || pid < 0) 4236 if (!param || pid < 0)
3832 goto out_nounlock; 4237 goto out_nounlock;
@@ -3859,9 +4264,9 @@ out_unlock:
3859 4264
3860long sched_setaffinity(pid_t pid, cpumask_t new_mask) 4265long sched_setaffinity(pid_t pid, cpumask_t new_mask)
3861{ 4266{
3862 task_t *p;
3863 int retval;
3864 cpumask_t cpus_allowed; 4267 cpumask_t cpus_allowed;
4268 struct task_struct *p;
4269 int retval;
3865 4270
3866 lock_cpu_hotplug(); 4271 lock_cpu_hotplug();
3867 read_lock(&tasklist_lock); 4272 read_lock(&tasklist_lock);
@@ -3886,6 +4291,10 @@ long sched_setaffinity(pid_t pid, cpumask_t new_mask)
3886 !capable(CAP_SYS_NICE)) 4291 !capable(CAP_SYS_NICE))
3887 goto out_unlock; 4292 goto out_unlock;
3888 4293
4294 retval = security_task_setscheduler(p, 0, NULL);
4295 if (retval)
4296 goto out_unlock;
4297
3889 cpus_allowed = cpuset_cpus_allowed(p); 4298 cpus_allowed = cpuset_cpus_allowed(p);
3890 cpus_and(new_mask, new_mask, cpus_allowed); 4299 cpus_and(new_mask, new_mask, cpus_allowed);
3891 retval = set_cpus_allowed(p, new_mask); 4300 retval = set_cpus_allowed(p, new_mask);
@@ -3943,8 +4352,8 @@ cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL;
3943 4352
3944long sched_getaffinity(pid_t pid, cpumask_t *mask) 4353long sched_getaffinity(pid_t pid, cpumask_t *mask)
3945{ 4354{
4355 struct task_struct *p;
3946 int retval; 4356 int retval;
3947 task_t *p;
3948 4357
3949 lock_cpu_hotplug(); 4358 lock_cpu_hotplug();
3950 read_lock(&tasklist_lock); 4359 read_lock(&tasklist_lock);
@@ -3954,7 +4363,10 @@ long sched_getaffinity(pid_t pid, cpumask_t *mask)
3954 if (!p) 4363 if (!p)
3955 goto out_unlock; 4364 goto out_unlock;
3956 4365
3957 retval = 0; 4366 retval = security_task_getscheduler(p);
4367 if (retval)
4368 goto out_unlock;
4369
3958 cpus_and(*mask, p->cpus_allowed, cpu_online_map); 4370 cpus_and(*mask, p->cpus_allowed, cpu_online_map);
3959 4371
3960out_unlock: 4372out_unlock:
@@ -4000,9 +4412,8 @@ asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len,
4000 */ 4412 */
4001asmlinkage long sys_sched_yield(void) 4413asmlinkage long sys_sched_yield(void)
4002{ 4414{
4003 runqueue_t *rq = this_rq_lock(); 4415 struct rq *rq = this_rq_lock();
4004 prio_array_t *array = current->array; 4416 struct prio_array *array = current->array, *target = rq->expired;
4005 prio_array_t *target = rq->expired;
4006 4417
4007 schedstat_inc(rq, yld_cnt); 4418 schedstat_inc(rq, yld_cnt);
4008 /* 4419 /*
@@ -4036,6 +4447,7 @@ asmlinkage long sys_sched_yield(void)
4036 * no need to preempt or enable interrupts: 4447 * no need to preempt or enable interrupts:
4037 */ 4448 */
4038 __release(rq->lock); 4449 __release(rq->lock);
4450 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
4039 _raw_spin_unlock(&rq->lock); 4451 _raw_spin_unlock(&rq->lock);
4040 preempt_enable_no_resched(); 4452 preempt_enable_no_resched();
4041 4453
@@ -4044,17 +4456,25 @@ asmlinkage long sys_sched_yield(void)
4044 return 0; 4456 return 0;
4045} 4457}
4046 4458
4047static inline void __cond_resched(void) 4459static inline int __resched_legal(void)
4048{ 4460{
4461 if (unlikely(preempt_count()))
4462 return 0;
4463 if (unlikely(system_state != SYSTEM_RUNNING))
4464 return 0;
4465 return 1;
4466}
4467
4468static void __cond_resched(void)
4469{
4470#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
4471 __might_sleep(__FILE__, __LINE__);
4472#endif
4049 /* 4473 /*
4050 * The BKS might be reacquired before we have dropped 4474 * The BKS might be reacquired before we have dropped
4051 * PREEMPT_ACTIVE, which could trigger a second 4475 * PREEMPT_ACTIVE, which could trigger a second
4052 * cond_resched() call. 4476 * cond_resched() call.
4053 */ 4477 */
4054 if (unlikely(preempt_count()))
4055 return;
4056 if (unlikely(system_state != SYSTEM_RUNNING))
4057 return;
4058 do { 4478 do {
4059 add_preempt_count(PREEMPT_ACTIVE); 4479 add_preempt_count(PREEMPT_ACTIVE);
4060 schedule(); 4480 schedule();
@@ -4064,13 +4484,12 @@ static inline void __cond_resched(void)
4064 4484
4065int __sched cond_resched(void) 4485int __sched cond_resched(void)
4066{ 4486{
4067 if (need_resched()) { 4487 if (need_resched() && __resched_legal()) {
4068 __cond_resched(); 4488 __cond_resched();
4069 return 1; 4489 return 1;
4070 } 4490 }
4071 return 0; 4491 return 0;
4072} 4492}
4073
4074EXPORT_SYMBOL(cond_resched); 4493EXPORT_SYMBOL(cond_resched);
4075 4494
4076/* 4495/*
@@ -4091,7 +4510,8 @@ int cond_resched_lock(spinlock_t *lock)
4091 ret = 1; 4510 ret = 1;
4092 spin_lock(lock); 4511 spin_lock(lock);
4093 } 4512 }
4094 if (need_resched()) { 4513 if (need_resched() && __resched_legal()) {
4514 spin_release(&lock->dep_map, 1, _THIS_IP_);
4095 _raw_spin_unlock(lock); 4515 _raw_spin_unlock(lock);
4096 preempt_enable_no_resched(); 4516 preempt_enable_no_resched();
4097 __cond_resched(); 4517 __cond_resched();
@@ -4100,25 +4520,24 @@ int cond_resched_lock(spinlock_t *lock)
4100 } 4520 }
4101 return ret; 4521 return ret;
4102} 4522}
4103
4104EXPORT_SYMBOL(cond_resched_lock); 4523EXPORT_SYMBOL(cond_resched_lock);
4105 4524
4106int __sched cond_resched_softirq(void) 4525int __sched cond_resched_softirq(void)
4107{ 4526{
4108 BUG_ON(!in_softirq()); 4527 BUG_ON(!in_softirq());
4109 4528
4110 if (need_resched()) { 4529 if (need_resched() && __resched_legal()) {
4111 __local_bh_enable(); 4530 raw_local_irq_disable();
4531 _local_bh_enable();
4532 raw_local_irq_enable();
4112 __cond_resched(); 4533 __cond_resched();
4113 local_bh_disable(); 4534 local_bh_disable();
4114 return 1; 4535 return 1;
4115 } 4536 }
4116 return 0; 4537 return 0;
4117} 4538}
4118
4119EXPORT_SYMBOL(cond_resched_softirq); 4539EXPORT_SYMBOL(cond_resched_softirq);
4120 4540
4121
4122/** 4541/**
4123 * yield - yield the current processor to other threads. 4542 * yield - yield the current processor to other threads.
4124 * 4543 *
@@ -4130,7 +4549,6 @@ void __sched yield(void)
4130 set_current_state(TASK_RUNNING); 4549 set_current_state(TASK_RUNNING);
4131 sys_sched_yield(); 4550 sys_sched_yield();
4132} 4551}
4133
4134EXPORT_SYMBOL(yield); 4552EXPORT_SYMBOL(yield);
4135 4553
4136/* 4554/*
@@ -4142,23 +4560,26 @@ EXPORT_SYMBOL(yield);
4142 */ 4560 */
4143void __sched io_schedule(void) 4561void __sched io_schedule(void)
4144{ 4562{
4145 struct runqueue *rq = &per_cpu(runqueues, raw_smp_processor_id()); 4563 struct rq *rq = &__raw_get_cpu_var(runqueues);
4146 4564
4565 delayacct_blkio_start();
4147 atomic_inc(&rq->nr_iowait); 4566 atomic_inc(&rq->nr_iowait);
4148 schedule(); 4567 schedule();
4149 atomic_dec(&rq->nr_iowait); 4568 atomic_dec(&rq->nr_iowait);
4569 delayacct_blkio_end();
4150} 4570}
4151
4152EXPORT_SYMBOL(io_schedule); 4571EXPORT_SYMBOL(io_schedule);
4153 4572
4154long __sched io_schedule_timeout(long timeout) 4573long __sched io_schedule_timeout(long timeout)
4155{ 4574{
4156 struct runqueue *rq = &per_cpu(runqueues, raw_smp_processor_id()); 4575 struct rq *rq = &__raw_get_cpu_var(runqueues);
4157 long ret; 4576 long ret;
4158 4577
4578 delayacct_blkio_start();
4159 atomic_inc(&rq->nr_iowait); 4579 atomic_inc(&rq->nr_iowait);
4160 ret = schedule_timeout(timeout); 4580 ret = schedule_timeout(timeout);
4161 atomic_dec(&rq->nr_iowait); 4581 atomic_dec(&rq->nr_iowait);
4582 delayacct_blkio_end();
4162 return ret; 4583 return ret;
4163} 4584}
4164 4585
@@ -4220,9 +4641,9 @@ asmlinkage long sys_sched_get_priority_min(int policy)
4220asmlinkage 4641asmlinkage
4221long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval) 4642long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval)
4222{ 4643{
4644 struct task_struct *p;
4223 int retval = -EINVAL; 4645 int retval = -EINVAL;
4224 struct timespec t; 4646 struct timespec t;
4225 task_t *p;
4226 4647
4227 if (pid < 0) 4648 if (pid < 0)
4228 goto out_nounlock; 4649 goto out_nounlock;
@@ -4237,7 +4658,7 @@ long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval)
4237 if (retval) 4658 if (retval)
4238 goto out_unlock; 4659 goto out_unlock;
4239 4660
4240 jiffies_to_timespec(p->policy & SCHED_FIFO ? 4661 jiffies_to_timespec(p->policy == SCHED_FIFO ?
4241 0 : task_timeslice(p), &t); 4662 0 : task_timeslice(p), &t);
4242 read_unlock(&tasklist_lock); 4663 read_unlock(&tasklist_lock);
4243 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; 4664 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
@@ -4250,35 +4671,36 @@ out_unlock:
4250 4671
4251static inline struct task_struct *eldest_child(struct task_struct *p) 4672static inline struct task_struct *eldest_child(struct task_struct *p)
4252{ 4673{
4253 if (list_empty(&p->children)) return NULL; 4674 if (list_empty(&p->children))
4675 return NULL;
4254 return list_entry(p->children.next,struct task_struct,sibling); 4676 return list_entry(p->children.next,struct task_struct,sibling);
4255} 4677}
4256 4678
4257static inline struct task_struct *older_sibling(struct task_struct *p) 4679static inline struct task_struct *older_sibling(struct task_struct *p)
4258{ 4680{
4259 if (p->sibling.prev==&p->parent->children) return NULL; 4681 if (p->sibling.prev==&p->parent->children)
4682 return NULL;
4260 return list_entry(p->sibling.prev,struct task_struct,sibling); 4683 return list_entry(p->sibling.prev,struct task_struct,sibling);
4261} 4684}
4262 4685
4263static inline struct task_struct *younger_sibling(struct task_struct *p) 4686static inline struct task_struct *younger_sibling(struct task_struct *p)
4264{ 4687{
4265 if (p->sibling.next==&p->parent->children) return NULL; 4688 if (p->sibling.next==&p->parent->children)
4689 return NULL;
4266 return list_entry(p->sibling.next,struct task_struct,sibling); 4690 return list_entry(p->sibling.next,struct task_struct,sibling);
4267} 4691}
4268 4692
4269static void show_task(task_t *p) 4693static const char stat_nam[] = "RSDTtZX";
4694
4695static void show_task(struct task_struct *p)
4270{ 4696{
4271 task_t *relative; 4697 struct task_struct *relative;
4272 unsigned state;
4273 unsigned long free = 0; 4698 unsigned long free = 0;
4274 static const char *stat_nam[] = { "R", "S", "D", "T", "t", "Z", "X" }; 4699 unsigned state;
4275 4700
4276 printk("%-13.13s ", p->comm);
4277 state = p->state ? __ffs(p->state) + 1 : 0; 4701 state = p->state ? __ffs(p->state) + 1 : 0;
4278 if (state < ARRAY_SIZE(stat_nam)) 4702 printk("%-13.13s %c", p->comm,
4279 printk(stat_nam[state]); 4703 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
4280 else
4281 printk("?");
4282#if (BITS_PER_LONG == 32) 4704#if (BITS_PER_LONG == 32)
4283 if (state == TASK_RUNNING) 4705 if (state == TASK_RUNNING)
4284 printk(" running "); 4706 printk(" running ");
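
The new stat_nam string is indexed by the lowest set state bit, with slot 0 reserved for TASK_RUNNING. The sketch below reproduces that indexing in userspace; the state bit values used (1, 2, 4, ...) are the conventional ones of this kernel era and are an assumption here, not taken from the hunk.

#include <stdio.h>
#include <strings.h>	/* ffs() */

static const char stat_nam[] = "RSDTtZX";

/* Same indexing as show_task(): slot 0 for running, else lowest set bit + 1. */
static char state_char(unsigned long state)
{
	unsigned int idx = state ? (unsigned int)ffs((int)state) : 0;

	return idx < sizeof(stat_nam) - 1 ? stat_nam[idx] : '?';
}

int main(void)
{
	unsigned long states[] = { 0, 1, 2, 4, 8, 16, 32, 64 };
	unsigned int i;

	for (i = 0; i < sizeof(states) / sizeof(states[0]); i++)
		printf("state 0x%02lx -> %c\n", states[i], state_char(states[i]));
	return 0;
}
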
@@ -4322,7 +4744,7 @@ static void show_task(task_t *p)
4322 4744
4323void show_state(void) 4745void show_state(void)
4324{ 4746{
4325 task_t *g, *p; 4747 struct task_struct *g, *p;
4326 4748
4327#if (BITS_PER_LONG == 32) 4749#if (BITS_PER_LONG == 32)
4328 printk("\n" 4750 printk("\n"
@@ -4344,7 +4766,7 @@ void show_state(void)
4344 } while_each_thread(g, p); 4766 } while_each_thread(g, p);
4345 4767
4346 read_unlock(&tasklist_lock); 4768 read_unlock(&tasklist_lock);
4347 mutex_debug_show_all_locks(); 4769 debug_show_all_locks();
4348} 4770}
4349 4771
4350/** 4772/**
@@ -4355,15 +4777,15 @@ void show_state(void)
4355 * NOTE: this function does not set the idle thread's NEED_RESCHED 4777 * NOTE: this function does not set the idle thread's NEED_RESCHED
4356 * flag, to make booting more robust. 4778 * flag, to make booting more robust.
4357 */ 4779 */
4358void __devinit init_idle(task_t *idle, int cpu) 4780void __devinit init_idle(struct task_struct *idle, int cpu)
4359{ 4781{
4360 runqueue_t *rq = cpu_rq(cpu); 4782 struct rq *rq = cpu_rq(cpu);
4361 unsigned long flags; 4783 unsigned long flags;
4362 4784
4363 idle->timestamp = sched_clock(); 4785 idle->timestamp = sched_clock();
4364 idle->sleep_avg = 0; 4786 idle->sleep_avg = 0;
4365 idle->array = NULL; 4787 idle->array = NULL;
4366 idle->prio = MAX_PRIO; 4788 idle->prio = idle->normal_prio = MAX_PRIO;
4367 idle->state = TASK_RUNNING; 4789 idle->state = TASK_RUNNING;
4368 idle->cpus_allowed = cpumask_of_cpu(cpu); 4790 idle->cpus_allowed = cpumask_of_cpu(cpu);
4369 set_task_cpu(idle, cpu); 4791 set_task_cpu(idle, cpu);
@@ -4396,7 +4818,7 @@ cpumask_t nohz_cpu_mask = CPU_MASK_NONE;
4396/* 4818/*
4397 * This is how migration works: 4819 * This is how migration works:
4398 * 4820 *
4399 * 1) we queue a migration_req_t structure in the source CPU's 4821 * 1) we queue a struct migration_req structure in the source CPU's
4400 * runqueue and wake up that CPU's migration thread. 4822 * runqueue and wake up that CPU's migration thread.
4401 * 2) we down() the locked semaphore => thread blocks. 4823 * 2) we down() the locked semaphore => thread blocks.
4402 * 3) migration thread wakes up (implicitly it forces the migrated 4824 * 3) migration thread wakes up (implicitly it forces the migrated
@@ -4418,12 +4840,12 @@ cpumask_t nohz_cpu_mask = CPU_MASK_NONE;
4418 * task must not exit() & deallocate itself prematurely. The 4840 * task must not exit() & deallocate itself prematurely. The
4419 * call is not atomic; no spinlocks may be held. 4841 * call is not atomic; no spinlocks may be held.
4420 */ 4842 */
4421int set_cpus_allowed(task_t *p, cpumask_t new_mask) 4843int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask)
4422{ 4844{
4845 struct migration_req req;
4423 unsigned long flags; 4846 unsigned long flags;
4847 struct rq *rq;
4424 int ret = 0; 4848 int ret = 0;
4425 migration_req_t req;
4426 runqueue_t *rq;
4427 4849
4428 rq = task_rq_lock(p, &flags); 4850 rq = task_rq_lock(p, &flags);
4429 if (!cpus_intersects(new_mask, cpu_online_map)) { 4851 if (!cpus_intersects(new_mask, cpu_online_map)) {
@@ -4446,9 +4868,9 @@ int set_cpus_allowed(task_t *p, cpumask_t new_mask)
4446 } 4868 }
4447out: 4869out:
4448 task_rq_unlock(rq, &flags); 4870 task_rq_unlock(rq, &flags);
4871
4449 return ret; 4872 return ret;
4450} 4873}
4451
4452EXPORT_SYMBOL_GPL(set_cpus_allowed); 4874EXPORT_SYMBOL_GPL(set_cpus_allowed);
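
set_cpus_allowed() queues a request for the per-CPU migration thread and waits for it to be acknowledged, as the "how migration works" comment earlier in this patch describes. The pthread program below sketches that request/acknowledge handoff in userspace; the toy_req type, the worker thread and the done flag are illustrative stand-ins for struct migration_req, the migration thread and struct completion, not kernel code.

#include <pthread.h>
#include <stdio.h>

/* Stand-in for struct migration_req: a request plus its own completion. */
struct toy_req {
	int task_id;
	int dest_cpu;
	int done;
	pthread_mutex_t lock;
	pthread_cond_t cond;
};

/* Stand-in for the per-CPU migration thread: do the move, signal completion. */
static void *migration_worker(void *arg)
{
	struct toy_req *req = arg;

	printf("worker: migrating task %d to cpu %d\n",
	       req->task_id, req->dest_cpu);

	pthread_mutex_lock(&req->lock);
	req->done = 1;
	pthread_cond_signal(&req->cond);	/* complete(&req->done) */
	pthread_mutex_unlock(&req->lock);
	return NULL;
}

int main(void)
{
	struct toy_req req = { .task_id = 42, .dest_cpu = 1, .done = 0 };
	pthread_t worker;

	pthread_mutex_init(&req.lock, NULL);
	pthread_cond_init(&req.cond, NULL);

	/* "queue" the request and wake the worker */
	pthread_create(&worker, NULL, migration_worker, &req);

	/* wait_for_completion(&req.done) */
	pthread_mutex_lock(&req.lock);
	while (!req.done)
		pthread_cond_wait(&req.cond, &req.lock);
	pthread_mutex_unlock(&req.lock);

	printf("requester: migration acknowledged\n");
	pthread_join(worker, NULL);
	return 0;
}
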
4453 4875
4454/* 4876/*
@@ -4459,13 +4881,16 @@ EXPORT_SYMBOL_GPL(set_cpus_allowed);
4459 * 4881 *
4460 * So we race with normal scheduler movements, but that's OK, as long 4882 * So we race with normal scheduler movements, but that's OK, as long
4461 * as the task is no longer on this CPU. 4883 * as the task is no longer on this CPU.
4884 *
4885 * Returns non-zero if task was successfully migrated.
4462 */ 4886 */
4463static void __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) 4887static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
4464{ 4888{
4465 runqueue_t *rq_dest, *rq_src; 4889 struct rq *rq_dest, *rq_src;
4890 int ret = 0;
4466 4891
4467 if (unlikely(cpu_is_offline(dest_cpu))) 4892 if (unlikely(cpu_is_offline(dest_cpu)))
4468 return; 4893 return ret;
4469 4894
4470 rq_src = cpu_rq(src_cpu); 4895 rq_src = cpu_rq(src_cpu);
4471 rq_dest = cpu_rq(dest_cpu); 4896 rq_dest = cpu_rq(dest_cpu);
@@ -4489,13 +4914,14 @@ static void __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
4489 p->timestamp = p->timestamp - rq_src->timestamp_last_tick 4914 p->timestamp = p->timestamp - rq_src->timestamp_last_tick
4490 + rq_dest->timestamp_last_tick; 4915 + rq_dest->timestamp_last_tick;
4491 deactivate_task(p, rq_src); 4916 deactivate_task(p, rq_src);
4492 activate_task(p, rq_dest, 0); 4917 __activate_task(p, rq_dest);
4493 if (TASK_PREEMPTS_CURR(p, rq_dest)) 4918 if (TASK_PREEMPTS_CURR(p, rq_dest))
4494 resched_task(rq_dest->curr); 4919 resched_task(rq_dest->curr);
4495 } 4920 }
4496 4921 ret = 1;
4497out: 4922out:
4498 double_rq_unlock(rq_src, rq_dest); 4923 double_rq_unlock(rq_src, rq_dest);
4924 return ret;
4499} 4925}
4500 4926
4501/* 4927/*
@@ -4505,16 +4931,16 @@ out:
4505 */ 4931 */
4506static int migration_thread(void *data) 4932static int migration_thread(void *data)
4507{ 4933{
4508 runqueue_t *rq;
4509 int cpu = (long)data; 4934 int cpu = (long)data;
4935 struct rq *rq;
4510 4936
4511 rq = cpu_rq(cpu); 4937 rq = cpu_rq(cpu);
4512 BUG_ON(rq->migration_thread != current); 4938 BUG_ON(rq->migration_thread != current);
4513 4939
4514 set_current_state(TASK_INTERRUPTIBLE); 4940 set_current_state(TASK_INTERRUPTIBLE);
4515 while (!kthread_should_stop()) { 4941 while (!kthread_should_stop()) {
4942 struct migration_req *req;
4516 struct list_head *head; 4943 struct list_head *head;
4517 migration_req_t *req;
4518 4944
4519 try_to_freeze(); 4945 try_to_freeze();
4520 4946
@@ -4538,7 +4964,7 @@ static int migration_thread(void *data)
4538 set_current_state(TASK_INTERRUPTIBLE); 4964 set_current_state(TASK_INTERRUPTIBLE);
4539 continue; 4965 continue;
4540 } 4966 }
4541 req = list_entry(head->next, migration_req_t, list); 4967 req = list_entry(head->next, struct migration_req, list);
4542 list_del_init(head->next); 4968 list_del_init(head->next);
4543 4969
4544 spin_unlock(&rq->lock); 4970 spin_unlock(&rq->lock);
@@ -4563,36 +4989,42 @@ wait_to_die:
4563 4989
4564#ifdef CONFIG_HOTPLUG_CPU 4990#ifdef CONFIG_HOTPLUG_CPU
4565/* Figure out where task on dead CPU should go, use force if necessary. */ 4991
4566static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *tsk) 4992static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
4567{ 4993{
4568 int dest_cpu; 4994 unsigned long flags;
4569 cpumask_t mask; 4995 cpumask_t mask;
4996 struct rq *rq;
4997 int dest_cpu;
4570 4998
4999restart:
4571 /* On same node? */ 5000 /* On same node? */
4572 mask = node_to_cpumask(cpu_to_node(dead_cpu)); 5001 mask = node_to_cpumask(cpu_to_node(dead_cpu));
4573 cpus_and(mask, mask, tsk->cpus_allowed); 5002 cpus_and(mask, mask, p->cpus_allowed);
4574 dest_cpu = any_online_cpu(mask); 5003 dest_cpu = any_online_cpu(mask);
4575 5004
4576 /* On any allowed CPU? */ 5005 /* On any allowed CPU? */
4577 if (dest_cpu == NR_CPUS) 5006 if (dest_cpu == NR_CPUS)
4578 dest_cpu = any_online_cpu(tsk->cpus_allowed); 5007 dest_cpu = any_online_cpu(p->cpus_allowed);
4579 5008
4580 /* No more Mr. Nice Guy. */ 5009 /* No more Mr. Nice Guy. */
4581 if (dest_cpu == NR_CPUS) { 5010 if (dest_cpu == NR_CPUS) {
4582 cpus_setall(tsk->cpus_allowed); 5011 rq = task_rq_lock(p, &flags);
4583 dest_cpu = any_online_cpu(tsk->cpus_allowed); 5012 cpus_setall(p->cpus_allowed);
5013 dest_cpu = any_online_cpu(p->cpus_allowed);
5014 task_rq_unlock(rq, &flags);
4584 5015
4585 /* 5016 /*
4586 * Don't tell them about moving exiting tasks or 5017 * Don't tell them about moving exiting tasks or
4587 * kernel threads (both mm NULL), since they never 5018 * kernel threads (both mm NULL), since they never
4588 * leave kernel. 5019 * leave kernel.
4589 */ 5020 */
4590 if (tsk->mm && printk_ratelimit()) 5021 if (p->mm && printk_ratelimit())
4591 printk(KERN_INFO "process %d (%s) no " 5022 printk(KERN_INFO "process %d (%s) no "
4592 "longer affine to cpu%d\n", 5023 "longer affine to cpu%d\n",
4593 tsk->pid, tsk->comm, dead_cpu); 5024 p->pid, p->comm, dead_cpu);
4594 } 5025 }
4595 __migrate_task(tsk, dead_cpu, dest_cpu); 5026 if (!__migrate_task(p, dead_cpu, dest_cpu))
5027 goto restart;
4596} 5028}
4597 5029
4598/* 5030/*
@@ -4602,9 +5034,9 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *tsk)
4602 * their home CPUs. So we just add the counter to another CPU's counter, 5034 * their home CPUs. So we just add the counter to another CPU's counter,
4603 * to keep the global sum constant after CPU-down: 5035 * to keep the global sum constant after CPU-down:
4604 */ 5036 */
4605static void migrate_nr_uninterruptible(runqueue_t *rq_src) 5037static void migrate_nr_uninterruptible(struct rq *rq_src)
4606{ 5038{
4607 runqueue_t *rq_dest = cpu_rq(any_online_cpu(CPU_MASK_ALL)); 5039 struct rq *rq_dest = cpu_rq(any_online_cpu(CPU_MASK_ALL));
4608 unsigned long flags; 5040 unsigned long flags;
4609 5041
4610 local_irq_save(flags); 5042 local_irq_save(flags);
@@ -4618,48 +5050,51 @@ static void migrate_nr_uninterruptible(runqueue_t *rq_src)
4618/* Run through task list and migrate tasks from the dead cpu. */ 5050/* Run through task list and migrate tasks from the dead cpu. */
4619static void migrate_live_tasks(int src_cpu) 5051static void migrate_live_tasks(int src_cpu)
4620{ 5052{
4621 struct task_struct *tsk, *t; 5053 struct task_struct *p, *t;
4622 5054
4623 write_lock_irq(&tasklist_lock); 5055 write_lock_irq(&tasklist_lock);
4624 5056
4625 do_each_thread(t, tsk) { 5057 do_each_thread(t, p) {
4626 if (tsk == current) 5058 if (p == current)
4627 continue; 5059 continue;
4628 5060
4629 if (task_cpu(tsk) == src_cpu) 5061 if (task_cpu(p) == src_cpu)
4630 move_task_off_dead_cpu(src_cpu, tsk); 5062 move_task_off_dead_cpu(src_cpu, p);
4631 } while_each_thread(t, tsk); 5063 } while_each_thread(t, p);
4632 5064
4633 write_unlock_irq(&tasklist_lock); 5065 write_unlock_irq(&tasklist_lock);
4634} 5066}
4635 5067
4636/* Schedules idle task to be the next runnable task on current CPU. 5068/* Schedules idle task to be the next runnable task on current CPU.
4637 * It does so by boosting its priority to highest possible and adding it to 5069 * It does so by boosting its priority to highest possible and adding it to
4638 * the _front_ of runqueue. Used by CPU offline code. 5070 * the _front_ of the runqueue. Used by CPU offline code.
4639 */ 5071 */
4640void sched_idle_next(void) 5072void sched_idle_next(void)
4641{ 5073{
4642 int cpu = smp_processor_id(); 5074 int this_cpu = smp_processor_id();
4643 runqueue_t *rq = this_rq(); 5075 struct rq *rq = cpu_rq(this_cpu);
4644 struct task_struct *p = rq->idle; 5076 struct task_struct *p = rq->idle;
4645 unsigned long flags; 5077 unsigned long flags;
4646 5078
4647 /* cpu has to be offline */ 5079 /* cpu has to be offline */
4648 BUG_ON(cpu_online(cpu)); 5080 BUG_ON(cpu_online(this_cpu));
4649 5081
4650 /* Strictly not necessary since rest of the CPUs are stopped by now 5082 /*
4651 * and interrupts disabled on current cpu. 5083 * Strictly not necessary since rest of the CPUs are stopped by now
5084 * and interrupts disabled on the current cpu.
4652 */ 5085 */
4653 spin_lock_irqsave(&rq->lock, flags); 5086 spin_lock_irqsave(&rq->lock, flags);
4654 5087
4655 __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1); 5088 __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1);
4656 /* Add idle task to _front_ of it's priority queue */ 5089
5090 /* Add idle task to the _front_ of its priority queue: */
4657 __activate_idle_task(p, rq); 5091 __activate_idle_task(p, rq);
4658 5092
4659 spin_unlock_irqrestore(&rq->lock, flags); 5093 spin_unlock_irqrestore(&rq->lock, flags);
4660} 5094}
4661 5095
4662/* Ensures that the idle task is using init_mm right before its cpu goes 5096/*
5097 * Ensures that the idle task is using init_mm right before its cpu goes
4663 * offline. 5098 * offline.
4664 */ 5099 */
4665void idle_task_exit(void) 5100void idle_task_exit(void)
@@ -4673,17 +5108,17 @@ void idle_task_exit(void)
4673 mmdrop(mm); 5108 mmdrop(mm);
4674} 5109}
4675 5110
4676static void migrate_dead(unsigned int dead_cpu, task_t *tsk) 5111static void migrate_dead(unsigned int dead_cpu, struct task_struct *p)
4677{ 5112{
4678 struct runqueue *rq = cpu_rq(dead_cpu); 5113 struct rq *rq = cpu_rq(dead_cpu);
4679 5114
4680 /* Must be exiting, otherwise would be on tasklist. */ 5115 /* Must be exiting, otherwise would be on tasklist. */
4681 BUG_ON(tsk->exit_state != EXIT_ZOMBIE && tsk->exit_state != EXIT_DEAD); 5116 BUG_ON(p->exit_state != EXIT_ZOMBIE && p->exit_state != EXIT_DEAD);
4682 5117
4683 /* Cannot have done final schedule yet: would have vanished. */ 5118 /* Cannot have done final schedule yet: would have vanished. */
4684 BUG_ON(tsk->flags & PF_DEAD); 5119 BUG_ON(p->flags & PF_DEAD);
4685 5120
4686 get_task_struct(tsk); 5121 get_task_struct(p);
4687 5122
4688 /* 5123 /*
4689 * Drop lock around migration; if someone else moves it, 5124 * Drop lock around migration; if someone else moves it,
@@ -4691,25 +5126,25 @@ static void migrate_dead(unsigned int dead_cpu, task_t *tsk)
4691 * fine. 5126 * fine.
4692 */ 5127 */
4693 spin_unlock_irq(&rq->lock); 5128 spin_unlock_irq(&rq->lock);
4694 move_task_off_dead_cpu(dead_cpu, tsk); 5129 move_task_off_dead_cpu(dead_cpu, p);
4695 spin_lock_irq(&rq->lock); 5130 spin_lock_irq(&rq->lock);
4696 5131
4697 put_task_struct(tsk); 5132 put_task_struct(p);
4698} 5133}
4699 5134
4700/* release_task() removes task from tasklist, so we won't find dead tasks. */ 5135/* release_task() removes task from tasklist, so we won't find dead tasks. */
4701static void migrate_dead_tasks(unsigned int dead_cpu) 5136static void migrate_dead_tasks(unsigned int dead_cpu)
4702{ 5137{
4703 unsigned arr, i; 5138 struct rq *rq = cpu_rq(dead_cpu);
4704 struct runqueue *rq = cpu_rq(dead_cpu); 5139 unsigned int arr, i;
4705 5140
4706 for (arr = 0; arr < 2; arr++) { 5141 for (arr = 0; arr < 2; arr++) {
4707 for (i = 0; i < MAX_PRIO; i++) { 5142 for (i = 0; i < MAX_PRIO; i++) {
4708 struct list_head *list = &rq->arrays[arr].queue[i]; 5143 struct list_head *list = &rq->arrays[arr].queue[i];
5144
4709 while (!list_empty(list)) 5145 while (!list_empty(list))
4710 migrate_dead(dead_cpu, 5146 migrate_dead(dead_cpu, list_entry(list->next,
4711 list_entry(list->next, task_t, 5147 struct task_struct, run_list));
4712 run_list));
4713 } 5148 }
4714 } 5149 }
4715} 5150}
@@ -4719,13 +5154,13 @@ static void migrate_dead_tasks(unsigned int dead_cpu)
4719 * migration_call - callback that gets triggered when a CPU is added. 5154 * migration_call - callback that gets triggered when a CPU is added.
4720 * Here we can start up the necessary migration thread for the new CPU. 5155 * Here we can start up the necessary migration thread for the new CPU.
4721 */ 5156 */
4722static int migration_call(struct notifier_block *nfb, unsigned long action, 5157static int __cpuinit
4723 void *hcpu) 5158migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
4724{ 5159{
4725 int cpu = (long)hcpu;
4726 struct task_struct *p; 5160 struct task_struct *p;
4727 struct runqueue *rq; 5161 int cpu = (long)hcpu;
4728 unsigned long flags; 5162 unsigned long flags;
5163 struct rq *rq;
4729 5164
4730 switch (action) { 5165 switch (action) {
4731 case CPU_UP_PREPARE: 5166 case CPU_UP_PREPARE:
@@ -4740,18 +5175,23 @@ static int migration_call(struct notifier_block *nfb, unsigned long action,
4740 task_rq_unlock(rq, &flags); 5175 task_rq_unlock(rq, &flags);
4741 cpu_rq(cpu)->migration_thread = p; 5176 cpu_rq(cpu)->migration_thread = p;
4742 break; 5177 break;
5178
4743 case CPU_ONLINE: 5179 case CPU_ONLINE:
4744 /* Strictly unnecessary, as first user will wake it. */ 5180
4745 wake_up_process(cpu_rq(cpu)->migration_thread); 5181 wake_up_process(cpu_rq(cpu)->migration_thread);
4746 break; 5182 break;
5183
4747#ifdef CONFIG_HOTPLUG_CPU 5184#ifdef CONFIG_HOTPLUG_CPU
4748 case CPU_UP_CANCELED: 5185 case CPU_UP_CANCELED:
5186 if (!cpu_rq(cpu)->migration_thread)
5187 break;
4749 /* Unbind it from offline cpu so it can run. Fall thru. */ 5188 /* Unbind it from offline cpu so it can run. Fall thru. */
4750 kthread_bind(cpu_rq(cpu)->migration_thread, 5189 kthread_bind(cpu_rq(cpu)->migration_thread,
4751 any_online_cpu(cpu_online_map)); 5190 any_online_cpu(cpu_online_map));
4752 kthread_stop(cpu_rq(cpu)->migration_thread); 5191 kthread_stop(cpu_rq(cpu)->migration_thread);
4753 cpu_rq(cpu)->migration_thread = NULL; 5192 cpu_rq(cpu)->migration_thread = NULL;
4754 break; 5193 break;
5194
4755 case CPU_DEAD: 5195 case CPU_DEAD:
4756 migrate_live_tasks(cpu); 5196 migrate_live_tasks(cpu);
4757 rq = cpu_rq(cpu); 5197 rq = cpu_rq(cpu);
@@ -4772,9 +5212,10 @@ static int migration_call(struct notifier_block *nfb, unsigned long action,
4772 * the requestors. */ 5212 * the requestors. */
4773 spin_lock_irq(&rq->lock); 5213 spin_lock_irq(&rq->lock);
4774 while (!list_empty(&rq->migration_queue)) { 5214 while (!list_empty(&rq->migration_queue)) {
4775 migration_req_t *req; 5215 struct migration_req *req;
5216
4776 req = list_entry(rq->migration_queue.next, 5217 req = list_entry(rq->migration_queue.next,
4777 migration_req_t, list); 5218 struct migration_req, list);
4778 list_del_init(&req->list); 5219 list_del_init(&req->list);
4779 complete(&req->done); 5220 complete(&req->done);
4780 } 5221 }
@@ -4788,7 +5229,7 @@ static int migration_call(struct notifier_block *nfb, unsigned long action,
4788/* Register at highest priority so that task migration (migrate_all_tasks) 5229/* Register at highest priority so that task migration (migrate_all_tasks)
4789 * happens before everything else. 5230 * happens before everything else.
4790 */ 5231 */
4791static struct notifier_block migration_notifier = { 5232static struct notifier_block __cpuinitdata migration_notifier = {
4792 .notifier_call = migration_call, 5233 .notifier_call = migration_call,
4793 .priority = 10 5234 .priority = 10
4794}; 5235};
@@ -4796,10 +5237,12 @@ static struct notifier_block migration_notifier = {
4796int __init migration_init(void) 5237int __init migration_init(void)
4797{ 5238{
4798 void *cpu = (void *)(long)smp_processor_id(); 5239 void *cpu = (void *)(long)smp_processor_id();
4799 /* Start one for boot CPU. */ 5240
5241 /* Start one for the boot CPU: */
4800 migration_call(&migration_notifier, CPU_UP_PREPARE, cpu); 5242 migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
4801 migration_call(&migration_notifier, CPU_ONLINE, cpu); 5243 migration_call(&migration_notifier, CPU_ONLINE, cpu);
4802 register_cpu_notifier(&migration_notifier); 5244 register_cpu_notifier(&migration_notifier);
5245
4803 return 0; 5246 return 0;
4804} 5247}
4805#endif 5248#endif
@@ -4895,7 +5338,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
4895 } while (sd); 5338 } while (sd);
4896} 5339}
4897#else 5340#else
4898#define sched_domain_debug(sd, cpu) {} 5341# define sched_domain_debug(sd, cpu) do { } while (0)
4899#endif 5342#endif
4900 5343
4901static int sd_degenerate(struct sched_domain *sd) 5344static int sd_degenerate(struct sched_domain *sd)
@@ -4921,8 +5364,8 @@ static int sd_degenerate(struct sched_domain *sd)
4921 return 1; 5364 return 1;
4922} 5365}
4923 5366
4924static int sd_parent_degenerate(struct sched_domain *sd, 5367static int
4925 struct sched_domain *parent) 5368sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
4926{ 5369{
4927 unsigned long cflags = sd->flags, pflags = parent->flags; 5370 unsigned long cflags = sd->flags, pflags = parent->flags;
4928 5371
@@ -4955,7 +5398,7 @@ static int sd_parent_degenerate(struct sched_domain *sd,
4955 */ 5398 */
4956static void cpu_attach_domain(struct sched_domain *sd, int cpu) 5399static void cpu_attach_domain(struct sched_domain *sd, int cpu)
4957{ 5400{
4958 runqueue_t *rq = cpu_rq(cpu); 5401 struct rq *rq = cpu_rq(cpu);
4959 struct sched_domain *tmp; 5402 struct sched_domain *tmp;
4960 5403
4961 /* Remove the sched domains which do not contribute to scheduling. */ 5404 /* Remove the sched domains which do not contribute to scheduling. */
@@ -5217,8 +5660,8 @@ static void touch_cache(void *__cache, unsigned long __size)
5217/* 5660/*
5218 * Measure the cache-cost of one task migration. Returns in units of nsec. 5661 * Measure the cache-cost of one task migration. Returns in units of nsec.
5219 */ 5662 */
5220static unsigned long long measure_one(void *cache, unsigned long size, 5663static unsigned long long
5221 int source, int target) 5664measure_one(void *cache, unsigned long size, int source, int target)
5222{ 5665{
5223 cpumask_t mask, saved_mask; 5666 cpumask_t mask, saved_mask;
5224 unsigned long long t0, t1, t2, t3, cost; 5667 unsigned long long t0, t1, t2, t3, cost;
@@ -5370,7 +5813,7 @@ static unsigned long long measure_migration_cost(int cpu1, int cpu2)
5370 cache = vmalloc(max_size); 5813 cache = vmalloc(max_size);
5371 if (!cache) { 5814 if (!cache) {
5372 printk("could not vmalloc %d bytes for cache!\n", 2*max_size); 5815 printk("could not vmalloc %d bytes for cache!\n", 2*max_size);
5373 return 1000000; // return 1 msec on very small boxen 5816 return 1000000; /* return 1 msec on very small boxen */
5374 } 5817 }
5375 5818
5376 while (size <= max_size) { 5819 while (size <= max_size) {
@@ -5568,9 +6011,9 @@ static int find_next_best_node(int node, unsigned long *used_nodes)
5568 */ 6011 */
5569static cpumask_t sched_domain_node_span(int node) 6012static cpumask_t sched_domain_node_span(int node)
5570{ 6013{
5571 int i;
5572 cpumask_t span, nodemask;
5573 DECLARE_BITMAP(used_nodes, MAX_NUMNODES); 6014 DECLARE_BITMAP(used_nodes, MAX_NUMNODES);
6015 cpumask_t span, nodemask;
6016 int i;
5574 6017
5575 cpus_clear(span); 6018 cpus_clear(span);
5576 bitmap_zero(used_nodes, MAX_NUMNODES); 6019 bitmap_zero(used_nodes, MAX_NUMNODES);
@@ -5581,6 +6024,7 @@ static cpumask_t sched_domain_node_span(int node)
5581 6024
5582 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) { 6025 for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
5583 int next_node = find_next_best_node(node, used_nodes); 6026 int next_node = find_next_best_node(node, used_nodes);
6027
5584 nodemask = node_to_cpumask(next_node); 6028 nodemask = node_to_cpumask(next_node);
5585 cpus_or(span, span, nodemask); 6029 cpus_or(span, span, nodemask);
5586 } 6030 }
@@ -5589,22 +6033,27 @@ static cpumask_t sched_domain_node_span(int node)
5589} 6033}
5590#endif 6034#endif
5591 6035
6036int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
6037
5592/* 6038/*
5593 * At the moment, CONFIG_SCHED_SMT is never defined, but leave it in so we 6039 * SMT sched-domains:
5594 * can switch it on easily if needed.
5595 */ 6040 */
5596#ifdef CONFIG_SCHED_SMT 6041#ifdef CONFIG_SCHED_SMT
5597static DEFINE_PER_CPU(struct sched_domain, cpu_domains); 6042static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
5598static struct sched_group sched_group_cpus[NR_CPUS]; 6043static struct sched_group sched_group_cpus[NR_CPUS];
6044
5599static int cpu_to_cpu_group(int cpu) 6045static int cpu_to_cpu_group(int cpu)
5600{ 6046{
5601 return cpu; 6047 return cpu;
5602} 6048}
5603#endif 6049#endif
5604 6050
6051/*
6052 * multi-core sched-domains:
6053 */
5605#ifdef CONFIG_SCHED_MC 6054#ifdef CONFIG_SCHED_MC
5606static DEFINE_PER_CPU(struct sched_domain, core_domains); 6055static DEFINE_PER_CPU(struct sched_domain, core_domains);
5607static struct sched_group sched_group_core[NR_CPUS]; 6056static struct sched_group *sched_group_core_bycpu[NR_CPUS];
5608#endif 6057#endif
5609 6058
5610#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) 6059#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
@@ -5620,10 +6069,11 @@ static int cpu_to_core_group(int cpu)
5620#endif 6069#endif
5621 6070
5622static DEFINE_PER_CPU(struct sched_domain, phys_domains); 6071static DEFINE_PER_CPU(struct sched_domain, phys_domains);
5623static struct sched_group sched_group_phys[NR_CPUS]; 6072static struct sched_group *sched_group_phys_bycpu[NR_CPUS];
6073
5624static int cpu_to_phys_group(int cpu) 6074static int cpu_to_phys_group(int cpu)
5625{ 6075{
5626#if defined(CONFIG_SCHED_MC) 6076#ifdef CONFIG_SCHED_MC
5627 cpumask_t mask = cpu_coregroup_map(cpu); 6077 cpumask_t mask = cpu_coregroup_map(cpu);
5628 return first_cpu(mask); 6078 return first_cpu(mask);
5629#elif defined(CONFIG_SCHED_SMT) 6079#elif defined(CONFIG_SCHED_SMT)
@@ -5677,13 +6127,74 @@ next_sg:
5677} 6127}
5678#endif 6128#endif
5679 6129
6130/* Free memory allocated for various sched_group structures */
6131static void free_sched_groups(const cpumask_t *cpu_map)
6132{
6133 int cpu;
6134#ifdef CONFIG_NUMA
6135 int i;
6136
6137 for_each_cpu_mask(cpu, *cpu_map) {
6138 struct sched_group *sched_group_allnodes
6139 = sched_group_allnodes_bycpu[cpu];
6140 struct sched_group **sched_group_nodes
6141 = sched_group_nodes_bycpu[cpu];
6142
6143 if (sched_group_allnodes) {
6144 kfree(sched_group_allnodes);
6145 sched_group_allnodes_bycpu[cpu] = NULL;
6146 }
6147
6148 if (!sched_group_nodes)
6149 continue;
6150
6151 for (i = 0; i < MAX_NUMNODES; i++) {
6152 cpumask_t nodemask = node_to_cpumask(i);
6153 struct sched_group *oldsg, *sg = sched_group_nodes[i];
6154
6155 cpus_and(nodemask, nodemask, *cpu_map);
6156 if (cpus_empty(nodemask))
6157 continue;
6158
6159 if (sg == NULL)
6160 continue;
6161 sg = sg->next;
6162next_sg:
6163 oldsg = sg;
6164 sg = sg->next;
6165 kfree(oldsg);
6166 if (oldsg != sched_group_nodes[i])
6167 goto next_sg;
6168 }
6169 kfree(sched_group_nodes);
6170 sched_group_nodes_bycpu[cpu] = NULL;
6171 }
6172#endif
6173 for_each_cpu_mask(cpu, *cpu_map) {
6174 if (sched_group_phys_bycpu[cpu]) {
6175 kfree(sched_group_phys_bycpu[cpu]);
6176 sched_group_phys_bycpu[cpu] = NULL;
6177 }
6178#ifdef CONFIG_SCHED_MC
6179 if (sched_group_core_bycpu[cpu]) {
6180 kfree(sched_group_core_bycpu[cpu]);
6181 sched_group_core_bycpu[cpu] = NULL;
6182 }
6183#endif
6184 }
6185}
6186
5680/* 6187/*
5681 * Build sched domains for a given set of cpus and attach the sched domains 6188 * Build sched domains for a given set of cpus and attach the sched domains
5682 * to the individual cpus 6189 * to the individual cpus
5683 */ 6190 */
5684void build_sched_domains(const cpumask_t *cpu_map) 6191static int build_sched_domains(const cpumask_t *cpu_map)
5685{ 6192{
5686 int i; 6193 int i;
6194 struct sched_group *sched_group_phys = NULL;
6195#ifdef CONFIG_SCHED_MC
6196 struct sched_group *sched_group_core = NULL;
6197#endif
5687#ifdef CONFIG_NUMA 6198#ifdef CONFIG_NUMA
5688 struct sched_group **sched_group_nodes = NULL; 6199 struct sched_group **sched_group_nodes = NULL;
5689 struct sched_group *sched_group_allnodes = NULL; 6200 struct sched_group *sched_group_allnodes = NULL;
@@ -5691,11 +6202,11 @@ void build_sched_domains(const cpumask_t *cpu_map)
5691 /* 6202 /*
5692 * Allocate the per-node list of sched groups 6203 * Allocate the per-node list of sched groups
5693 */ 6204 */
5694 sched_group_nodes = kmalloc(sizeof(struct sched_group*)*MAX_NUMNODES, 6205 sched_group_nodes = kzalloc(sizeof(struct sched_group*)*MAX_NUMNODES,
5695 GFP_ATOMIC); 6206 GFP_KERNEL);
5696 if (!sched_group_nodes) { 6207 if (!sched_group_nodes) {
5697 printk(KERN_WARNING "Can not alloc sched group node list\n"); 6208 printk(KERN_WARNING "Can not alloc sched group node list\n");
5698 return; 6209 return -ENOMEM;
5699 } 6210 }
5700 sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes; 6211 sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes;
5701#endif 6212#endif
@@ -5721,7 +6232,7 @@ void build_sched_domains(const cpumask_t *cpu_map)
5721 if (!sched_group_allnodes) { 6232 if (!sched_group_allnodes) {
5722 printk(KERN_WARNING 6233 printk(KERN_WARNING
5723 "Can not alloc allnodes sched group\n"); 6234 "Can not alloc allnodes sched group\n");
5724 break; 6235 goto error;
5725 } 6236 }
5726 sched_group_allnodes_bycpu[i] 6237 sched_group_allnodes_bycpu[i]
5727 = sched_group_allnodes; 6238 = sched_group_allnodes;
@@ -5742,6 +6253,18 @@ void build_sched_domains(const cpumask_t *cpu_map)
5742 cpus_and(sd->span, sd->span, *cpu_map); 6253 cpus_and(sd->span, sd->span, *cpu_map);
5743#endif 6254#endif
5744 6255
6256 if (!sched_group_phys) {
6257 sched_group_phys
6258 = kmalloc(sizeof(struct sched_group) * NR_CPUS,
6259 GFP_KERNEL);
6260 if (!sched_group_phys) {
6261 printk (KERN_WARNING "Can not alloc phys sched"
6262 "group\n");
6263 goto error;
6264 }
6265 sched_group_phys_bycpu[i] = sched_group_phys;
6266 }
6267
5745 p = sd; 6268 p = sd;
5746 sd = &per_cpu(phys_domains, i); 6269 sd = &per_cpu(phys_domains, i);
5747 group = cpu_to_phys_group(i); 6270 group = cpu_to_phys_group(i);
@@ -5751,6 +6274,18 @@ void build_sched_domains(const cpumask_t *cpu_map)
5751 sd->groups = &sched_group_phys[group]; 6274 sd->groups = &sched_group_phys[group];
5752 6275
5753#ifdef CONFIG_SCHED_MC 6276#ifdef CONFIG_SCHED_MC
6277 if (!sched_group_core) {
6278 sched_group_core
6279 = kmalloc(sizeof(struct sched_group) * NR_CPUS,
6280 GFP_KERNEL);
6281 if (!sched_group_core) {
6282 printk (KERN_WARNING "Can not alloc core sched"
6283 "group\n");
6284 goto error;
6285 }
6286 sched_group_core_bycpu[i] = sched_group_core;
6287 }
6288
5754 p = sd; 6289 p = sd;
5755 sd = &per_cpu(core_domains, i); 6290 sd = &per_cpu(core_domains, i);
5756 group = cpu_to_core_group(i); 6291 group = cpu_to_core_group(i);
@@ -5834,24 +6369,21 @@ void build_sched_domains(const cpumask_t *cpu_map)
5834 domainspan = sched_domain_node_span(i); 6369 domainspan = sched_domain_node_span(i);
5835 cpus_and(domainspan, domainspan, *cpu_map); 6370 cpus_and(domainspan, domainspan, *cpu_map);
5836 6371
5837 sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL); 6372 sg = kmalloc_node(sizeof(struct sched_group), GFP_KERNEL, i);
6373 if (!sg) {
6374 printk(KERN_WARNING "Can not alloc domain group for "
6375 "node %d\n", i);
6376 goto error;
6377 }
5838 sched_group_nodes[i] = sg; 6378 sched_group_nodes[i] = sg;
5839 for_each_cpu_mask(j, nodemask) { 6379 for_each_cpu_mask(j, nodemask) {
5840 struct sched_domain *sd; 6380 struct sched_domain *sd;
5841 sd = &per_cpu(node_domains, j); 6381 sd = &per_cpu(node_domains, j);
5842 sd->groups = sg; 6382 sd->groups = sg;
5843 if (sd->groups == NULL) {
5844 /* Turn off balancing if we have no groups */
5845 sd->flags = 0;
5846 }
5847 }
5848 if (!sg) {
5849 printk(KERN_WARNING
5850 "Can not alloc domain group for node %d\n", i);
5851 continue;
5852 } 6383 }
5853 sg->cpu_power = 0; 6384 sg->cpu_power = 0;
5854 sg->cpumask = nodemask; 6385 sg->cpumask = nodemask;
6386 sg->next = sg;
5855 cpus_or(covered, covered, nodemask); 6387 cpus_or(covered, covered, nodemask);
5856 prev = sg; 6388 prev = sg;
5857 6389
@@ -5870,54 +6402,90 @@ void build_sched_domains(const cpumask_t *cpu_map)
5870 if (cpus_empty(tmp)) 6402 if (cpus_empty(tmp))
5871 continue; 6403 continue;
5872 6404
5873 sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL); 6405 sg = kmalloc_node(sizeof(struct sched_group),
6406 GFP_KERNEL, i);
5874 if (!sg) { 6407 if (!sg) {
5875 printk(KERN_WARNING 6408 printk(KERN_WARNING
5876 "Can not alloc domain group for node %d\n", j); 6409 "Can not alloc domain group for node %d\n", j);
5877 break; 6410 goto error;
5878 } 6411 }
5879 sg->cpu_power = 0; 6412 sg->cpu_power = 0;
5880 sg->cpumask = tmp; 6413 sg->cpumask = tmp;
6414 sg->next = prev->next;
5881 cpus_or(covered, covered, tmp); 6415 cpus_or(covered, covered, tmp);
5882 prev->next = sg; 6416 prev->next = sg;
5883 prev = sg; 6417 prev = sg;
5884 } 6418 }
5885 prev->next = sched_group_nodes[i];
5886 } 6419 }
5887#endif 6420#endif
5888 6421
5889 /* Calculate CPU power for physical packages and nodes */ 6422 /* Calculate CPU power for physical packages and nodes */
6423#ifdef CONFIG_SCHED_SMT
5890 for_each_cpu_mask(i, *cpu_map) { 6424 for_each_cpu_mask(i, *cpu_map) {
5891 int power;
5892 struct sched_domain *sd; 6425 struct sched_domain *sd;
5893#ifdef CONFIG_SCHED_SMT
5894 sd = &per_cpu(cpu_domains, i); 6426 sd = &per_cpu(cpu_domains, i);
5895 power = SCHED_LOAD_SCALE; 6427 sd->groups->cpu_power = SCHED_LOAD_SCALE;
5896 sd->groups->cpu_power = power; 6428 }
5897#endif 6429#endif
5898#ifdef CONFIG_SCHED_MC 6430#ifdef CONFIG_SCHED_MC
6431 for_each_cpu_mask(i, *cpu_map) {
6432 int power;
6433 struct sched_domain *sd;
5899 sd = &per_cpu(core_domains, i); 6434 sd = &per_cpu(core_domains, i);
5900 power = SCHED_LOAD_SCALE + (cpus_weight(sd->groups->cpumask)-1) 6435 if (sched_smt_power_savings)
6436 power = SCHED_LOAD_SCALE * cpus_weight(sd->groups->cpumask);
6437 else
6438 power = SCHED_LOAD_SCALE + (cpus_weight(sd->groups->cpumask)-1)
5901 * SCHED_LOAD_SCALE / 10; 6439 * SCHED_LOAD_SCALE / 10;
5902 sd->groups->cpu_power = power; 6440 sd->groups->cpu_power = power;
6441 }
6442#endif
5903 6443
6444 for_each_cpu_mask(i, *cpu_map) {
6445 struct sched_domain *sd;
6446#ifdef CONFIG_SCHED_MC
5904 sd = &per_cpu(phys_domains, i); 6447 sd = &per_cpu(phys_domains, i);
6448 if (i != first_cpu(sd->groups->cpumask))
6449 continue;
5905 6450
5906 /* 6451 sd->groups->cpu_power = 0;
5907 * This has to be < 2 * SCHED_LOAD_SCALE 6452 if (sched_mc_power_savings || sched_smt_power_savings) {
5908 * Lets keep it SCHED_LOAD_SCALE, so that 6453 int j;
5909 * while calculating NUMA group's cpu_power 6454
5910 * we can simply do 6455 for_each_cpu_mask(j, sd->groups->cpumask) {
5911 * numa_group->cpu_power += phys_group->cpu_power; 6456 struct sched_domain *sd1;
5912 * 6457 sd1 = &per_cpu(core_domains, j);
5913 * See "only add power once for each physical pkg" 6458 /*
5914 * comment below 6459 * for each core we will add once
5915 */ 6460 * to the group in physical domain
5916 sd->groups->cpu_power = SCHED_LOAD_SCALE; 6461 */
6462 if (j != first_cpu(sd1->groups->cpumask))
6463 continue;
6464
6465 if (sched_smt_power_savings)
6466 sd->groups->cpu_power += sd1->groups->cpu_power;
6467 else
6468 sd->groups->cpu_power += SCHED_LOAD_SCALE;
6469 }
6470 } else
6471 /*
6472 * This has to be < 2 * SCHED_LOAD_SCALE
6473 * Let's keep it SCHED_LOAD_SCALE, so that
6474 * while calculating NUMA group's cpu_power
6475 * we can simply do
6476 * numa_group->cpu_power += phys_group->cpu_power;
6477 *
6478 * See "only add power once for each physical pkg"
6479 * comment below
6480 */
6481 sd->groups->cpu_power = SCHED_LOAD_SCALE;
5917#else 6482#else
6483 int power;
5918 sd = &per_cpu(phys_domains, i); 6484 sd = &per_cpu(phys_domains, i);
5919 power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE * 6485 if (sched_smt_power_savings)
5920 (cpus_weight(sd->groups->cpumask)-1) / 10; 6486 power = SCHED_LOAD_SCALE * cpus_weight(sd->groups->cpumask);
6487 else
6488 power = SCHED_LOAD_SCALE;
5921 sd->groups->cpu_power = power; 6489 sd->groups->cpu_power = power;
5922#endif 6490#endif
5923 } 6491 }
@@ -5945,13 +6513,20 @@ void build_sched_domains(const cpumask_t *cpu_map)
5945 * Tune cache-hot values: 6513 * Tune cache-hot values:
5946 */ 6514 */
5947 calibrate_migration_costs(cpu_map); 6515 calibrate_migration_costs(cpu_map);
6516
6517 return 0;
6518
6519error:
6520 free_sched_groups(cpu_map);
6521 return -ENOMEM;
5948} 6522}
5949/* 6523/*
5950 * Set up scheduler domains and groups. Callers must hold the hotplug lock. 6524 * Set up scheduler domains and groups. Callers must hold the hotplug lock.
5951 */ 6525 */
5952static void arch_init_sched_domains(const cpumask_t *cpu_map) 6526static int arch_init_sched_domains(const cpumask_t *cpu_map)
5953{ 6527{
5954 cpumask_t cpu_default_map; 6528 cpumask_t cpu_default_map;
6529 int err;
5955 6530
5956 /* 6531 /*
5957 * Setup mask for cpus without special case scheduling requirements. 6532 * Setup mask for cpus without special case scheduling requirements.
@@ -5960,51 +6535,14 @@ static void arch_init_sched_domains(const cpumask_t *cpu_map)
5960 */ 6535 */
5961 cpus_andnot(cpu_default_map, *cpu_map, cpu_isolated_map); 6536 cpus_andnot(cpu_default_map, *cpu_map, cpu_isolated_map);
5962 6537
5963 build_sched_domains(&cpu_default_map); 6538 err = build_sched_domains(&cpu_default_map);
6539
6540 return err;
5964} 6541}
5965 6542
5966static void arch_destroy_sched_domains(const cpumask_t *cpu_map) 6543static void arch_destroy_sched_domains(const cpumask_t *cpu_map)
5967{ 6544{
5968#ifdef CONFIG_NUMA 6545 free_sched_groups(cpu_map);
5969 int i;
5970 int cpu;
5971
5972 for_each_cpu_mask(cpu, *cpu_map) {
5973 struct sched_group *sched_group_allnodes
5974 = sched_group_allnodes_bycpu[cpu];
5975 struct sched_group **sched_group_nodes
5976 = sched_group_nodes_bycpu[cpu];
5977
5978 if (sched_group_allnodes) {
5979 kfree(sched_group_allnodes);
5980 sched_group_allnodes_bycpu[cpu] = NULL;
5981 }
5982
5983 if (!sched_group_nodes)
5984 continue;
5985
5986 for (i = 0; i < MAX_NUMNODES; i++) {
5987 cpumask_t nodemask = node_to_cpumask(i);
5988 struct sched_group *oldsg, *sg = sched_group_nodes[i];
5989
5990 cpus_and(nodemask, nodemask, *cpu_map);
5991 if (cpus_empty(nodemask))
5992 continue;
5993
5994 if (sg == NULL)
5995 continue;
5996 sg = sg->next;
5997next_sg:
5998 oldsg = sg;
5999 sg = sg->next;
6000 kfree(oldsg);
6001 if (oldsg != sched_group_nodes[i])
6002 goto next_sg;
6003 }
6004 kfree(sched_group_nodes);
6005 sched_group_nodes_bycpu[cpu] = NULL;
6006 }
6007#endif
6008} 6546}
6009 6547
6010/* 6548/*
@@ -6029,9 +6567,10 @@ static void detach_destroy_domains(const cpumask_t *cpu_map)
6029 * correct sched domains 6567 * correct sched domains
6030 * Call with hotplug lock held 6568 * Call with hotplug lock held
6031 */ 6569 */
6032void partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2) 6570int partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2)
6033{ 6571{
6034 cpumask_t change_map; 6572 cpumask_t change_map;
6573 int err = 0;
6035 6574
6036 cpus_and(*partition1, *partition1, cpu_online_map); 6575 cpus_and(*partition1, *partition1, cpu_online_map);
6037 cpus_and(*partition2, *partition2, cpu_online_map); 6576 cpus_and(*partition2, *partition2, cpu_online_map);
@@ -6040,11 +6579,90 @@ void partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2)
6040 /* Detach sched domains from all of the affected cpus */ 6579 /* Detach sched domains from all of the affected cpus */
6041 detach_destroy_domains(&change_map); 6580 detach_destroy_domains(&change_map);
6042 if (!cpus_empty(*partition1)) 6581 if (!cpus_empty(*partition1))
6043 build_sched_domains(partition1); 6582 err = build_sched_domains(partition1);
6044 if (!cpus_empty(*partition2)) 6583 if (!err && !cpus_empty(*partition2))
6045 build_sched_domains(partition2); 6584 err = build_sched_domains(partition2);
6585
6586 return err;
6046} 6587}
6047 6588
6589#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
6590int arch_reinit_sched_domains(void)
6591{
6592 int err;
6593
6594 lock_cpu_hotplug();
6595 detach_destroy_domains(&cpu_online_map);
6596 err = arch_init_sched_domains(&cpu_online_map);
6597 unlock_cpu_hotplug();
6598
6599 return err;
6600}
6601
6602static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
6603{
6604 int ret;
6605
6606 if (buf[0] != '0' && buf[0] != '1')
6607 return -EINVAL;
6608
6609 if (smt)
6610 sched_smt_power_savings = (buf[0] == '1');
6611 else
6612 sched_mc_power_savings = (buf[0] == '1');
6613
6614 ret = arch_reinit_sched_domains();
6615
6616 return ret ? ret : count;
6617}
6618
6619int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
6620{
6621 int err = 0;
6622
6623#ifdef CONFIG_SCHED_SMT
6624 if (smt_capable())
6625 err = sysfs_create_file(&cls->kset.kobj,
6626 &attr_sched_smt_power_savings.attr);
6627#endif
6628#ifdef CONFIG_SCHED_MC
6629 if (!err && mc_capable())
6630 err = sysfs_create_file(&cls->kset.kobj,
6631 &attr_sched_mc_power_savings.attr);
6632#endif
6633 return err;
6634}
6635#endif
6636
6637#ifdef CONFIG_SCHED_MC
6638static ssize_t sched_mc_power_savings_show(struct sys_device *dev, char *page)
6639{
6640 return sprintf(page, "%u\n", sched_mc_power_savings);
6641}
6642static ssize_t sched_mc_power_savings_store(struct sys_device *dev,
6643 const char *buf, size_t count)
6644{
6645 return sched_power_savings_store(buf, count, 0);
6646}
6647SYSDEV_ATTR(sched_mc_power_savings, 0644, sched_mc_power_savings_show,
6648 sched_mc_power_savings_store);
6649#endif
6650
6651#ifdef CONFIG_SCHED_SMT
6652static ssize_t sched_smt_power_savings_show(struct sys_device *dev, char *page)
6653{
6654 return sprintf(page, "%u\n", sched_smt_power_savings);
6655}
6656static ssize_t sched_smt_power_savings_store(struct sys_device *dev,
6657 const char *buf, size_t count)
6658{
6659 return sched_power_savings_store(buf, count, 1);
6660}
6661SYSDEV_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_show,
6662 sched_smt_power_savings_store);
6663#endif
6664
6665
6048#ifdef CONFIG_HOTPLUG_CPU 6666#ifdef CONFIG_HOTPLUG_CPU
6049/* 6667/*
6050 * Force a reinitialization of the sched domains hierarchy. The domains 6668 * Force a reinitialization of the sched domains hierarchy. The domains
@@ -6098,6 +6716,7 @@ int in_sched_functions(unsigned long addr)
6098{ 6716{
6099 /* Linker adds these: start and end of __sched functions */ 6717 /* Linker adds these: start and end of __sched functions */
6100 extern char __sched_text_start[], __sched_text_end[]; 6718 extern char __sched_text_start[], __sched_text_end[];
6719
6101 return in_lock_functions(addr) || 6720 return in_lock_functions(addr) ||
6102 (addr >= (unsigned long)__sched_text_start 6721 (addr >= (unsigned long)__sched_text_start
6103 && addr < (unsigned long)__sched_text_end); 6722 && addr < (unsigned long)__sched_text_end);
@@ -6105,14 +6724,15 @@ int in_sched_functions(unsigned long addr)
6105 6724
6106void __init sched_init(void) 6725void __init sched_init(void)
6107{ 6726{
6108 runqueue_t *rq;
6109 int i, j, k; 6727 int i, j, k;
6110 6728
6111 for_each_possible_cpu(i) { 6729 for_each_possible_cpu(i) {
6112 prio_array_t *array; 6730 struct prio_array *array;
6731 struct rq *rq;
6113 6732
6114 rq = cpu_rq(i); 6733 rq = cpu_rq(i);
6115 spin_lock_init(&rq->lock); 6734 spin_lock_init(&rq->lock);
6735 lockdep_set_class(&rq->lock, &rq->rq_lock_key);
6116 rq->nr_running = 0; 6736 rq->nr_running = 0;
6117 rq->active = rq->arrays; 6737 rq->active = rq->arrays;
6118 rq->expired = rq->arrays + 1; 6738 rq->expired = rq->arrays + 1;
@@ -6126,7 +6746,6 @@ void __init sched_init(void)
6126 rq->push_cpu = 0; 6746 rq->push_cpu = 0;
6127 rq->migration_thread = NULL; 6747 rq->migration_thread = NULL;
6128 INIT_LIST_HEAD(&rq->migration_queue); 6748 INIT_LIST_HEAD(&rq->migration_queue);
6129 rq->cpu = i;
6130#endif 6749#endif
6131 atomic_set(&rq->nr_iowait, 0); 6750 atomic_set(&rq->nr_iowait, 0);
6132 6751
@@ -6141,6 +6760,7 @@ void __init sched_init(void)
6141 } 6760 }
6142 } 6761 }
6143 6762
6763 set_load_weight(&init_task);
6144 /* 6764 /*
6145 * The boot idle thread does lazy MMU switching as well: 6765 * The boot idle thread does lazy MMU switching as well:
6146 */ 6766 */
@@ -6159,7 +6779,7 @@ void __init sched_init(void)
6159#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP 6779#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
6160void __might_sleep(char *file, int line) 6780void __might_sleep(char *file, int line)
6161{ 6781{
6162#if defined(in_atomic) 6782#ifdef in_atomic
6163 static unsigned long prev_jiffy; /* ratelimiting */ 6783 static unsigned long prev_jiffy; /* ratelimiting */
6164 6784
6165 if ((in_atomic() || irqs_disabled()) && 6785 if ((in_atomic() || irqs_disabled()) &&
@@ -6181,17 +6801,18 @@ EXPORT_SYMBOL(__might_sleep);
6181#ifdef CONFIG_MAGIC_SYSRQ 6801#ifdef CONFIG_MAGIC_SYSRQ
6182void normalize_rt_tasks(void) 6802void normalize_rt_tasks(void)
6183{ 6803{
6804 struct prio_array *array;
6184 struct task_struct *p; 6805 struct task_struct *p;
6185 prio_array_t *array;
6186 unsigned long flags; 6806 unsigned long flags;
6187 runqueue_t *rq; 6807 struct rq *rq;
6188 6808
6189 read_lock_irq(&tasklist_lock); 6809 read_lock_irq(&tasklist_lock);
6190 for_each_process (p) { 6810 for_each_process(p) {
6191 if (!rt_task(p)) 6811 if (!rt_task(p))
6192 continue; 6812 continue;
6193 6813
6194 rq = task_rq_lock(p, &flags); 6814 spin_lock_irqsave(&p->pi_lock, flags);
6815 rq = __task_rq_lock(p);
6195 6816
6196 array = p->array; 6817 array = p->array;
6197 if (array) 6818 if (array)
@@ -6202,7 +6823,8 @@ void normalize_rt_tasks(void)
6202 resched_task(rq->curr); 6823 resched_task(rq->curr);
6203 } 6824 }
6204 6825
6205 task_rq_unlock(rq, &flags); 6826 __task_rq_unlock(rq);
6827 spin_unlock_irqrestore(&p->pi_lock, flags);
6206 } 6828 }
6207 read_unlock_irq(&tasklist_lock); 6829 read_unlock_irq(&tasklist_lock);
6208} 6830}
@@ -6226,7 +6848,7 @@ void normalize_rt_tasks(void)
6226 * 6848 *
6227 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! 6849 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
6228 */ 6850 */
6229task_t *curr_task(int cpu) 6851struct task_struct *curr_task(int cpu)
6230{ 6852{
6231 return cpu_curr(cpu); 6853 return cpu_curr(cpu);
6232} 6854}
@@ -6246,7 +6868,7 @@ task_t *curr_task(int cpu)
6246 * 6868 *
6247 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! 6869 * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
6248 */ 6870 */
6249void set_curr_task(int cpu, task_t *p) 6871void set_curr_task(int cpu, struct task_struct *p)
6250{ 6872{
6251 cpu_curr(cpu) = p; 6873 cpu_curr(cpu) = p;
6252} 6874}
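
The sched.c hunks above add two runtime knobs, sched_mc_power_savings and sched_smt_power_savings, fold them into the cpu_power calculation, and expose them as sysdev attributes created by sched_create_sysfs_power_savings_entries(); writes land in sched_power_savings_store(), which flips the flag and calls arch_reinit_sched_domains() to rebuild the domain tree. A minimal sketch of how arch setup code could hook the attributes up, assuming CONFIG_SCHED_MC or CONFIG_SCHED_SMT and the generic cpu_sysdev_class; the initcall name is illustrative, not taken from this series:

#include <linux/cpu.h>
#include <linux/init.h>
#include <linux/sched.h>

/* Illustrative only: create the sched_{mc,smt}_power_savings files at boot. */
static int __init example_sched_power_knobs_init(void)
{
	/* files appear under the cpu sysdev class, e.g. /sys/devices/system/cpu/ */
	return sched_create_sysfs_power_savings_entries(&cpu_sysdev_class);
}
late_initcall(example_sched_power_knobs_init);
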
diff --git a/kernel/signal.c b/kernel/signal.c
index e5f8aea78ffe..7fe874d12fae 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -10,7 +10,6 @@
10 * to allow signals to be sent reliably. 10 * to allow signals to be sent reliably.
11 */ 11 */
12 12
13#include <linux/config.h>
14#include <linux/slab.h> 13#include <linux/slab.h>
15#include <linux/module.h> 14#include <linux/module.h>
16#include <linux/smp_lock.h> 15#include <linux/smp_lock.h>
@@ -23,12 +22,12 @@
23#include <linux/syscalls.h> 22#include <linux/syscalls.h>
24#include <linux/ptrace.h> 23#include <linux/ptrace.h>
25#include <linux/signal.h> 24#include <linux/signal.h>
26#include <linux/audit.h>
27#include <linux/capability.h> 25#include <linux/capability.h>
28#include <asm/param.h> 26#include <asm/param.h>
29#include <asm/uaccess.h> 27#include <asm/uaccess.h>
30#include <asm/unistd.h> 28#include <asm/unistd.h>
31#include <asm/siginfo.h> 29#include <asm/siginfo.h>
30#include "audit.h" /* audit_signal_info() */
32 31
33/* 32/*
34 * SLAB caches for signal bits. 33 * SLAB caches for signal bits.
@@ -584,7 +583,7 @@ static int check_kill_permission(int sig, struct siginfo *info,
584 && !capable(CAP_KILL)) 583 && !capable(CAP_KILL))
585 return error; 584 return error;
586 585
587 error = security_task_kill(t, info, sig); 586 error = security_task_kill(t, info, sig, 0);
588 if (!error) 587 if (!error)
589 audit_signal_info(sig, t); /* Let audit system see the signal */ 588 audit_signal_info(sig, t); /* Let audit system see the signal */
590 return error; 589 return error;
@@ -1107,7 +1106,7 @@ kill_proc_info(int sig, struct siginfo *info, pid_t pid)
1107 1106
1108/* like kill_proc_info(), but doesn't use uid/euid of "current" */ 1107/* like kill_proc_info(), but doesn't use uid/euid of "current" */
1109int kill_proc_info_as_uid(int sig, struct siginfo *info, pid_t pid, 1108int kill_proc_info_as_uid(int sig, struct siginfo *info, pid_t pid,
1110 uid_t uid, uid_t euid) 1109 uid_t uid, uid_t euid, u32 secid)
1111{ 1110{
1112 int ret = -EINVAL; 1111 int ret = -EINVAL;
1113 struct task_struct *p; 1112 struct task_struct *p;
@@ -1127,6 +1126,9 @@ int kill_proc_info_as_uid(int sig, struct siginfo *info, pid_t pid,
1127 ret = -EPERM; 1126 ret = -EPERM;
1128 goto out_unlock; 1127 goto out_unlock;
1129 } 1128 }
1129 ret = security_task_kill(p, info, sig, secid);
1130 if (ret)
1131 goto out_unlock;
1130 if (sig && p->sighand) { 1132 if (sig && p->sighand) {
1131 unsigned long flags; 1133 unsigned long flags;
1132 spin_lock_irqsave(&p->sighand->siglock, flags); 1134 spin_lock_irqsave(&p->sighand->siglock, flags);
@@ -1531,6 +1533,35 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, int why)
1531 spin_unlock_irqrestore(&sighand->siglock, flags); 1533 spin_unlock_irqrestore(&sighand->siglock, flags);
1532} 1534}
1533 1535
1536static inline int may_ptrace_stop(void)
1537{
1538 if (!likely(current->ptrace & PT_PTRACED))
1539 return 0;
1540
1541 if (unlikely(current->parent == current->real_parent &&
1542 (current->ptrace & PT_ATTACHED)))
1543 return 0;
1544
1545 if (unlikely(current->signal == current->parent->signal) &&
1546 unlikely(current->signal->flags & SIGNAL_GROUP_EXIT))
1547 return 0;
1548
1549 /*
1550 * Are we in the middle of do_coredump?
1551 * If so, and our tracer is also part of the coredump, stopping
1552 * is a deadlock situation, and pointless because our tracer
1553 * is dead so don't allow us to stop.
1554 * If SIGKILL was already sent before the caller unlocked
1555 * ->siglock we must see ->core_waiters != 0. Otherwise it
1556 * is safe to enter schedule().
1557 */
1558 if (unlikely(current->mm->core_waiters) &&
1559 unlikely(current->mm == current->parent->mm))
1560 return 0;
1561
1562 return 1;
1563}
1564
1534/* 1565/*
1535 * This must be called with current->sighand->siglock held. 1566 * This must be called with current->sighand->siglock held.
1536 * 1567 *
@@ -1559,11 +1590,7 @@ static void ptrace_stop(int exit_code, int nostop_code, siginfo_t *info)
1559 spin_unlock_irq(&current->sighand->siglock); 1590 spin_unlock_irq(&current->sighand->siglock);
1560 try_to_freeze(); 1591 try_to_freeze();
1561 read_lock(&tasklist_lock); 1592 read_lock(&tasklist_lock);
1562 if (likely(current->ptrace & PT_PTRACED) && 1593 if (may_ptrace_stop()) {
1563 likely(current->parent != current->real_parent ||
1564 !(current->ptrace & PT_ATTACHED)) &&
1565 (likely(current->parent->signal != current->signal) ||
1566 !unlikely(current->signal->flags & SIGNAL_GROUP_EXIT))) {
1567 do_notify_parent_cldstop(current, CLD_TRAPPED); 1594 do_notify_parent_cldstop(current, CLD_TRAPPED);
1568 read_unlock(&tasklist_lock); 1595 read_unlock(&tasklist_lock);
1569 schedule(); 1596 schedule();
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 336f92d64e2e..0f08a84ae307 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -62,6 +62,119 @@ static inline void wakeup_softirqd(void)
62} 62}
63 63
64/* 64/*
65 * This one is for softirq.c-internal use,
66 * where hardirqs are disabled legitimately:
67 */
68static void __local_bh_disable(unsigned long ip)
69{
70 unsigned long flags;
71
72 WARN_ON_ONCE(in_irq());
73
74 raw_local_irq_save(flags);
75 add_preempt_count(SOFTIRQ_OFFSET);
76 /*
77 * Were softirqs turned off above:
78 */
79 if (softirq_count() == SOFTIRQ_OFFSET)
80 trace_softirqs_off(ip);
81 raw_local_irq_restore(flags);
82}
83
84void local_bh_disable(void)
85{
86 __local_bh_disable((unsigned long)__builtin_return_address(0));
87}
88
89EXPORT_SYMBOL(local_bh_disable);
90
91void __local_bh_enable(void)
92{
93 WARN_ON_ONCE(in_irq());
94
95 /*
96 * softirqs should never be enabled by __local_bh_enable(),
97 * it always nests inside local_bh_enable() sections:
98 */
99 WARN_ON_ONCE(softirq_count() == SOFTIRQ_OFFSET);
100
101 sub_preempt_count(SOFTIRQ_OFFSET);
102}
103EXPORT_SYMBOL_GPL(__local_bh_enable);
104
105/*
106 * Special-case - softirqs can safely be enabled in
107 * cond_resched_softirq(), or by __do_softirq(),
108 * without processing still-pending softirqs:
109 */
110void _local_bh_enable(void)
111{
112 WARN_ON_ONCE(in_irq());
113 WARN_ON_ONCE(!irqs_disabled());
114
115 if (softirq_count() == SOFTIRQ_OFFSET)
116 trace_softirqs_on((unsigned long)__builtin_return_address(0));
117 sub_preempt_count(SOFTIRQ_OFFSET);
118}
119
120EXPORT_SYMBOL(_local_bh_enable);
121
122void local_bh_enable(void)
123{
124 unsigned long flags;
125
126 WARN_ON_ONCE(in_irq());
127 WARN_ON_ONCE(irqs_disabled());
128
129 local_irq_save(flags);
130 /*
131 * Are softirqs going to be turned on now:
132 */
133 if (softirq_count() == SOFTIRQ_OFFSET)
134 trace_softirqs_on((unsigned long)__builtin_return_address(0));
135 /*
136 * Keep preemption disabled until we are done with
137 * softirq processing:
138 */
139 sub_preempt_count(SOFTIRQ_OFFSET - 1);
140
141 if (unlikely(!in_interrupt() && local_softirq_pending()))
142 do_softirq();
143
144 dec_preempt_count();
145 local_irq_restore(flags);
146 preempt_check_resched();
147}
148EXPORT_SYMBOL(local_bh_enable);
149
150void local_bh_enable_ip(unsigned long ip)
151{
152 unsigned long flags;
153
154 WARN_ON_ONCE(in_irq());
155
156 local_irq_save(flags);
157 /*
158 * Are softirqs going to be turned on now:
159 */
160 if (softirq_count() == SOFTIRQ_OFFSET)
161 trace_softirqs_on(ip);
162 /*
163 * Keep preemption disabled until we are done with
164 * softirq processing:
165 */
166 sub_preempt_count(SOFTIRQ_OFFSET - 1);
167
168 if (unlikely(!in_interrupt() && local_softirq_pending()))
169 do_softirq();
170
171 dec_preempt_count();
172 local_irq_restore(flags);
173 preempt_check_resched();
174}
175EXPORT_SYMBOL(local_bh_enable_ip);
176
177/*
65 * We restart softirq processing MAX_SOFTIRQ_RESTART times, 178 * We restart softirq processing MAX_SOFTIRQ_RESTART times,
66 * and we fall back to softirqd after that. 179 * and we fall back to softirqd after that.
67 * 180 *
@@ -80,8 +193,11 @@ asmlinkage void __do_softirq(void)
80 int cpu; 193 int cpu;
81 194
82 pending = local_softirq_pending(); 195 pending = local_softirq_pending();
196 account_system_vtime(current);
197
198 __local_bh_disable((unsigned long)__builtin_return_address(0));
199 trace_softirq_enter();
83 200
84 local_bh_disable();
85 cpu = smp_processor_id(); 201 cpu = smp_processor_id();
86restart: 202restart:
87 /* Reset the pending bitmask before enabling irqs */ 203 /* Reset the pending bitmask before enabling irqs */
@@ -109,7 +225,10 @@ restart:
109 if (pending) 225 if (pending)
110 wakeup_softirqd(); 226 wakeup_softirqd();
111 227
112 __local_bh_enable(); 228 trace_softirq_exit();
229
230 account_system_vtime(current);
231 _local_bh_enable();
113} 232}
114 233
115#ifndef __ARCH_HAS_DO_SOFTIRQ 234#ifndef __ARCH_HAS_DO_SOFTIRQ
@@ -136,23 +255,6 @@ EXPORT_SYMBOL(do_softirq);
136 255
137#endif 256#endif
138 257
139void local_bh_enable(void)
140{
141 WARN_ON(irqs_disabled());
142 /*
143 * Keep preemption disabled until we are done with
144 * softirq processing:
145 */
146 sub_preempt_count(SOFTIRQ_OFFSET - 1);
147
148 if (unlikely(!in_interrupt() && local_softirq_pending()))
149 do_softirq();
150
151 dec_preempt_count();
152 preempt_check_resched();
153}
154EXPORT_SYMBOL(local_bh_enable);
155
156#ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED 258#ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED
157# define invoke_softirq() __do_softirq() 259# define invoke_softirq() __do_softirq()
158#else 260#else
@@ -165,6 +267,7 @@ EXPORT_SYMBOL(local_bh_enable);
165void irq_exit(void) 267void irq_exit(void)
166{ 268{
167 account_system_vtime(current); 269 account_system_vtime(current);
270 trace_hardirq_exit();
168 sub_preempt_count(IRQ_EXIT_OFFSET); 271 sub_preempt_count(IRQ_EXIT_OFFSET);
169 if (!in_interrupt() && local_softirq_pending()) 272 if (!in_interrupt() && local_softirq_pending())
170 invoke_softirq(); 273 invoke_softirq();
@@ -208,8 +311,6 @@ void open_softirq(int nr, void (*action)(struct softirq_action*), void *data)
208 softirq_vec[nr].action = action; 311 softirq_vec[nr].action = action;
209} 312}
210 313
211EXPORT_SYMBOL(open_softirq);
212
213/* Tasklets */ 314/* Tasklets */
214struct tasklet_head 315struct tasklet_head
215{ 316{
@@ -446,7 +547,7 @@ static void takeover_tasklets(unsigned int cpu)
446} 547}
447#endif /* CONFIG_HOTPLUG_CPU */ 548#endif /* CONFIG_HOTPLUG_CPU */
448 549
449static int cpu_callback(struct notifier_block *nfb, 550static int __devinit cpu_callback(struct notifier_block *nfb,
450 unsigned long action, 551 unsigned long action,
451 void *hcpu) 552 void *hcpu)
452{ 553{
@@ -470,6 +571,8 @@ static int cpu_callback(struct notifier_block *nfb,
470 break; 571 break;
471#ifdef CONFIG_HOTPLUG_CPU 572#ifdef CONFIG_HOTPLUG_CPU
472 case CPU_UP_CANCELED: 573 case CPU_UP_CANCELED:
574 if (!per_cpu(ksoftirqd, hotcpu))
575 break;
473 /* Unbind so it can run. Fall thru. */ 576 /* Unbind so it can run. Fall thru. */
474 kthread_bind(per_cpu(ksoftirqd, hotcpu), 577 kthread_bind(per_cpu(ksoftirqd, hotcpu),
475 any_online_cpu(cpu_online_map)); 578 any_online_cpu(cpu_online_map));
@@ -484,7 +587,7 @@ static int cpu_callback(struct notifier_block *nfb,
484 return NOTIFY_OK; 587 return NOTIFY_OK;
485} 588}
486 589
487static struct notifier_block cpu_nfb = { 590static struct notifier_block __devinitdata cpu_nfb = {
488 .notifier_call = cpu_callback 591 .notifier_call = cpu_callback
489}; 592};
490 593
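
The softirq.c rework splits bottom-half enabling into three flavours: __local_bh_enable() does the bare preempt-count bookkeeping for softirq.c itself, _local_bh_enable() is for callers that must not process pending softirqs (notably __do_softirq() on exit), and local_bh_enable()/local_bh_enable_ip() remain the normal pair, now feeding the trace_softirqs_on/off hooks used by lockdep's irq-state tracking. Ordinary callers keep the same pairing as before; a small usage sketch with hypothetical names:

#include <linux/interrupt.h>
#include <linux/percpu.h>

static DEFINE_PER_CPU(unsigned long, example_events);

/* Hypothetical fast path that must not race with this CPU's softirqs. */
static void example_count_event(void)
{
	local_bh_disable();		/* raises softirq_count() by SOFTIRQ_OFFSET */
	__get_cpu_var(example_events)++;
	local_bh_enable();		/* may run pending softirqs before returning */
}
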
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index 14c7faf02909..6b76caa22981 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -36,7 +36,7 @@ static struct notifier_block panic_block = {
36 36
37void touch_softlockup_watchdog(void) 37void touch_softlockup_watchdog(void)
38{ 38{
39 per_cpu(touch_timestamp, raw_smp_processor_id()) = jiffies; 39 __raw_get_cpu_var(touch_timestamp) = jiffies;
40} 40}
41EXPORT_SYMBOL(touch_softlockup_watchdog); 41EXPORT_SYMBOL(touch_softlockup_watchdog);
42 42
@@ -104,7 +104,7 @@ static int watchdog(void * __bind_cpu)
104/* 104/*
105 * Create/destroy watchdog threads as CPUs come and go: 105 * Create/destroy watchdog threads as CPUs come and go:
106 */ 106 */
107static int 107static int __devinit
108cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) 108cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
109{ 109{
110 int hotcpu = (unsigned long)hcpu; 110 int hotcpu = (unsigned long)hcpu;
@@ -127,6 +127,8 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
127 break; 127 break;
128#ifdef CONFIG_HOTPLUG_CPU 128#ifdef CONFIG_HOTPLUG_CPU
129 case CPU_UP_CANCELED: 129 case CPU_UP_CANCELED:
130 if (!per_cpu(watchdog_task, hotcpu))
131 break;
130 /* Unbind so it can run. Fall thru. */ 132 /* Unbind so it can run. Fall thru. */
131 kthread_bind(per_cpu(watchdog_task, hotcpu), 133 kthread_bind(per_cpu(watchdog_task, hotcpu),
132 any_online_cpu(cpu_online_map)); 134 any_online_cpu(cpu_online_map));
@@ -140,7 +142,7 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
140 return NOTIFY_OK; 142 return NOTIFY_OK;
141} 143}
142 144
143static struct notifier_block cpu_nfb = { 145static struct notifier_block __devinitdata cpu_nfb = {
144 .notifier_call = cpu_callback 146 .notifier_call = cpu_callback
145}; 147};
146 148
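
Both hotplug callbacks touched here (ksoftirqd in softirq.c and the watchdog in softlockup.c) gain the same guard: if CPU_UP_PREPARE failed, CPU_UP_CANCELED arrives with no per-CPU kthread to unbind, so the callback has to bail out rather than dereference a NULL task pointer. The pattern, distilled with hypothetical names:

#include <linux/cpu.h>
#include <linux/cpumask.h>
#include <linux/init.h>
#include <linux/kthread.h>
#include <linux/notifier.h>
#include <linux/percpu.h>
#include <linux/sched.h>

static DEFINE_PER_CPU(struct task_struct *, example_task);

static int __devinit example_cpu_callback(struct notifier_block *nfb,
					  unsigned long action, void *hcpu)
{
	int hotcpu = (unsigned long)hcpu;

	switch (action) {
#ifdef CONFIG_HOTPLUG_CPU
	case CPU_UP_CANCELED:
		/* CPU_UP_PREPARE may have failed before the thread existed */
		if (!per_cpu(example_task, hotcpu))
			break;
		/* Unbind so the thread can still run somewhere online */
		kthread_bind(per_cpu(example_task, hotcpu),
			     any_online_cpu(cpu_online_map));
		break;
#endif
	default:
		break;
	}
	return NOTIFY_OK;
}
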
diff --git a/kernel/spinlock.c b/kernel/spinlock.c
index d1b810782bc4..bfd6ad9c0330 100644
--- a/kernel/spinlock.c
+++ b/kernel/spinlock.c
@@ -9,11 +9,11 @@
9 * SMP and the DEBUG_SPINLOCK cases. (UP-nondebug inlines them) 9 * SMP and the DEBUG_SPINLOCK cases. (UP-nondebug inlines them)
10 */ 10 */
11 11
12#include <linux/config.h>
13#include <linux/linkage.h> 12#include <linux/linkage.h>
14#include <linux/preempt.h> 13#include <linux/preempt.h>
15#include <linux/spinlock.h> 14#include <linux/spinlock.h>
16#include <linux/interrupt.h> 15#include <linux/interrupt.h>
16#include <linux/debug_locks.h>
17#include <linux/module.h> 17#include <linux/module.h>
18 18
19/* 19/*
@@ -30,8 +30,10 @@ EXPORT_SYMBOL(generic__raw_read_trylock);
30int __lockfunc _spin_trylock(spinlock_t *lock) 30int __lockfunc _spin_trylock(spinlock_t *lock)
31{ 31{
32 preempt_disable(); 32 preempt_disable();
33 if (_raw_spin_trylock(lock)) 33 if (_raw_spin_trylock(lock)) {
34 spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
34 return 1; 35 return 1;
36 }
35 37
36 preempt_enable(); 38 preempt_enable();
37 return 0; 39 return 0;
@@ -41,8 +43,10 @@ EXPORT_SYMBOL(_spin_trylock);
41int __lockfunc _read_trylock(rwlock_t *lock) 43int __lockfunc _read_trylock(rwlock_t *lock)
42{ 44{
43 preempt_disable(); 45 preempt_disable();
44 if (_raw_read_trylock(lock)) 46 if (_raw_read_trylock(lock)) {
47 rwlock_acquire_read(&lock->dep_map, 0, 1, _RET_IP_);
45 return 1; 48 return 1;
49 }
46 50
47 preempt_enable(); 51 preempt_enable();
48 return 0; 52 return 0;
@@ -52,19 +56,28 @@ EXPORT_SYMBOL(_read_trylock);
52int __lockfunc _write_trylock(rwlock_t *lock) 56int __lockfunc _write_trylock(rwlock_t *lock)
53{ 57{
54 preempt_disable(); 58 preempt_disable();
55 if (_raw_write_trylock(lock)) 59 if (_raw_write_trylock(lock)) {
60 rwlock_acquire(&lock->dep_map, 0, 1, _RET_IP_);
56 return 1; 61 return 1;
62 }
57 63
58 preempt_enable(); 64 preempt_enable();
59 return 0; 65 return 0;
60} 66}
61EXPORT_SYMBOL(_write_trylock); 67EXPORT_SYMBOL(_write_trylock);
62 68
63#if !defined(CONFIG_PREEMPT) || !defined(CONFIG_SMP) 69/*
70 * If lockdep is enabled then we use the non-preemption spin-ops
71 * even on CONFIG_PREEMPT, because lockdep assumes that interrupts are
72 * not re-enabled during lock-acquire (which the preempt-spin-ops do):
73 */
74#if !defined(CONFIG_PREEMPT) || !defined(CONFIG_SMP) || \
75 defined(CONFIG_PROVE_LOCKING)
64 76
65void __lockfunc _read_lock(rwlock_t *lock) 77void __lockfunc _read_lock(rwlock_t *lock)
66{ 78{
67 preempt_disable(); 79 preempt_disable();
80 rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_);
68 _raw_read_lock(lock); 81 _raw_read_lock(lock);
69} 82}
70EXPORT_SYMBOL(_read_lock); 83EXPORT_SYMBOL(_read_lock);
@@ -75,7 +88,17 @@ unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock)
75 88
76 local_irq_save(flags); 89 local_irq_save(flags);
77 preempt_disable(); 90 preempt_disable();
91 spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
92 /*
93 * On lockdep we dont want the hand-coded irq-enable of
94 * _raw_spin_lock_flags() code, because lockdep assumes
95 * that interrupts are not re-enabled during lock-acquire:
96 */
97#ifdef CONFIG_PROVE_LOCKING
98 _raw_spin_lock(lock);
99#else
78 _raw_spin_lock_flags(lock, &flags); 100 _raw_spin_lock_flags(lock, &flags);
101#endif
79 return flags; 102 return flags;
80} 103}
81EXPORT_SYMBOL(_spin_lock_irqsave); 104EXPORT_SYMBOL(_spin_lock_irqsave);
@@ -84,6 +107,7 @@ void __lockfunc _spin_lock_irq(spinlock_t *lock)
84{ 107{
85 local_irq_disable(); 108 local_irq_disable();
86 preempt_disable(); 109 preempt_disable();
110 spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
87 _raw_spin_lock(lock); 111 _raw_spin_lock(lock);
88} 112}
89EXPORT_SYMBOL(_spin_lock_irq); 113EXPORT_SYMBOL(_spin_lock_irq);
@@ -92,6 +116,7 @@ void __lockfunc _spin_lock_bh(spinlock_t *lock)
92{ 116{
93 local_bh_disable(); 117 local_bh_disable();
94 preempt_disable(); 118 preempt_disable();
119 spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
95 _raw_spin_lock(lock); 120 _raw_spin_lock(lock);
96} 121}
97EXPORT_SYMBOL(_spin_lock_bh); 122EXPORT_SYMBOL(_spin_lock_bh);
@@ -102,6 +127,7 @@ unsigned long __lockfunc _read_lock_irqsave(rwlock_t *lock)
102 127
103 local_irq_save(flags); 128 local_irq_save(flags);
104 preempt_disable(); 129 preempt_disable();
130 rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_);
105 _raw_read_lock(lock); 131 _raw_read_lock(lock);
106 return flags; 132 return flags;
107} 133}
@@ -111,6 +137,7 @@ void __lockfunc _read_lock_irq(rwlock_t *lock)
111{ 137{
112 local_irq_disable(); 138 local_irq_disable();
113 preempt_disable(); 139 preempt_disable();
140 rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_);
114 _raw_read_lock(lock); 141 _raw_read_lock(lock);
115} 142}
116EXPORT_SYMBOL(_read_lock_irq); 143EXPORT_SYMBOL(_read_lock_irq);
@@ -119,6 +146,7 @@ void __lockfunc _read_lock_bh(rwlock_t *lock)
119{ 146{
120 local_bh_disable(); 147 local_bh_disable();
121 preempt_disable(); 148 preempt_disable();
149 rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_);
122 _raw_read_lock(lock); 150 _raw_read_lock(lock);
123} 151}
124EXPORT_SYMBOL(_read_lock_bh); 152EXPORT_SYMBOL(_read_lock_bh);
@@ -129,6 +157,7 @@ unsigned long __lockfunc _write_lock_irqsave(rwlock_t *lock)
129 157
130 local_irq_save(flags); 158 local_irq_save(flags);
131 preempt_disable(); 159 preempt_disable();
160 rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_);
132 _raw_write_lock(lock); 161 _raw_write_lock(lock);
133 return flags; 162 return flags;
134} 163}
@@ -138,6 +167,7 @@ void __lockfunc _write_lock_irq(rwlock_t *lock)
138{ 167{
139 local_irq_disable(); 168 local_irq_disable();
140 preempt_disable(); 169 preempt_disable();
170 rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_);
141 _raw_write_lock(lock); 171 _raw_write_lock(lock);
142} 172}
143EXPORT_SYMBOL(_write_lock_irq); 173EXPORT_SYMBOL(_write_lock_irq);
@@ -146,6 +176,7 @@ void __lockfunc _write_lock_bh(rwlock_t *lock)
146{ 176{
147 local_bh_disable(); 177 local_bh_disable();
148 preempt_disable(); 178 preempt_disable();
179 rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_);
149 _raw_write_lock(lock); 180 _raw_write_lock(lock);
150} 181}
151EXPORT_SYMBOL(_write_lock_bh); 182EXPORT_SYMBOL(_write_lock_bh);
@@ -153,6 +184,7 @@ EXPORT_SYMBOL(_write_lock_bh);
153void __lockfunc _spin_lock(spinlock_t *lock) 184void __lockfunc _spin_lock(spinlock_t *lock)
154{ 185{
155 preempt_disable(); 186 preempt_disable();
187 spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
156 _raw_spin_lock(lock); 188 _raw_spin_lock(lock);
157} 189}
158 190
@@ -161,6 +193,7 @@ EXPORT_SYMBOL(_spin_lock);
161void __lockfunc _write_lock(rwlock_t *lock) 193void __lockfunc _write_lock(rwlock_t *lock)
162{ 194{
163 preempt_disable(); 195 preempt_disable();
196 rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_);
164 _raw_write_lock(lock); 197 _raw_write_lock(lock);
165} 198}
166 199
@@ -256,8 +289,22 @@ BUILD_LOCK_OPS(write, rwlock);
256 289
257#endif /* CONFIG_PREEMPT */ 290#endif /* CONFIG_PREEMPT */
258 291
292#ifdef CONFIG_DEBUG_LOCK_ALLOC
293
294void __lockfunc _spin_lock_nested(spinlock_t *lock, int subclass)
295{
296 preempt_disable();
297 spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
298 _raw_spin_lock(lock);
299}
300
301EXPORT_SYMBOL(_spin_lock_nested);
302
303#endif
304
259void __lockfunc _spin_unlock(spinlock_t *lock) 305void __lockfunc _spin_unlock(spinlock_t *lock)
260{ 306{
307 spin_release(&lock->dep_map, 1, _RET_IP_);
261 _raw_spin_unlock(lock); 308 _raw_spin_unlock(lock);
262 preempt_enable(); 309 preempt_enable();
263} 310}
@@ -265,6 +312,7 @@ EXPORT_SYMBOL(_spin_unlock);
265 312
266void __lockfunc _write_unlock(rwlock_t *lock) 313void __lockfunc _write_unlock(rwlock_t *lock)
267{ 314{
315 rwlock_release(&lock->dep_map, 1, _RET_IP_);
268 _raw_write_unlock(lock); 316 _raw_write_unlock(lock);
269 preempt_enable(); 317 preempt_enable();
270} 318}
@@ -272,6 +320,7 @@ EXPORT_SYMBOL(_write_unlock);
272 320
273void __lockfunc _read_unlock(rwlock_t *lock) 321void __lockfunc _read_unlock(rwlock_t *lock)
274{ 322{
323 rwlock_release(&lock->dep_map, 1, _RET_IP_);
275 _raw_read_unlock(lock); 324 _raw_read_unlock(lock);
276 preempt_enable(); 325 preempt_enable();
277} 326}
@@ -279,6 +328,7 @@ EXPORT_SYMBOL(_read_unlock);
279 328
280void __lockfunc _spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags) 329void __lockfunc _spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags)
281{ 330{
331 spin_release(&lock->dep_map, 1, _RET_IP_);
282 _raw_spin_unlock(lock); 332 _raw_spin_unlock(lock);
283 local_irq_restore(flags); 333 local_irq_restore(flags);
284 preempt_enable(); 334 preempt_enable();
@@ -287,6 +337,7 @@ EXPORT_SYMBOL(_spin_unlock_irqrestore);
287 337
288void __lockfunc _spin_unlock_irq(spinlock_t *lock) 338void __lockfunc _spin_unlock_irq(spinlock_t *lock)
289{ 339{
340 spin_release(&lock->dep_map, 1, _RET_IP_);
290 _raw_spin_unlock(lock); 341 _raw_spin_unlock(lock);
291 local_irq_enable(); 342 local_irq_enable();
292 preempt_enable(); 343 preempt_enable();
@@ -295,14 +346,16 @@ EXPORT_SYMBOL(_spin_unlock_irq);
295 346
296void __lockfunc _spin_unlock_bh(spinlock_t *lock) 347void __lockfunc _spin_unlock_bh(spinlock_t *lock)
297{ 348{
349 spin_release(&lock->dep_map, 1, _RET_IP_);
298 _raw_spin_unlock(lock); 350 _raw_spin_unlock(lock);
299 preempt_enable_no_resched(); 351 preempt_enable_no_resched();
300 local_bh_enable(); 352 local_bh_enable_ip((unsigned long)__builtin_return_address(0));
301} 353}
302EXPORT_SYMBOL(_spin_unlock_bh); 354EXPORT_SYMBOL(_spin_unlock_bh);
303 355
304void __lockfunc _read_unlock_irqrestore(rwlock_t *lock, unsigned long flags) 356void __lockfunc _read_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
305{ 357{
358 rwlock_release(&lock->dep_map, 1, _RET_IP_);
306 _raw_read_unlock(lock); 359 _raw_read_unlock(lock);
307 local_irq_restore(flags); 360 local_irq_restore(flags);
308 preempt_enable(); 361 preempt_enable();
@@ -311,6 +364,7 @@ EXPORT_SYMBOL(_read_unlock_irqrestore);
311 364
312void __lockfunc _read_unlock_irq(rwlock_t *lock) 365void __lockfunc _read_unlock_irq(rwlock_t *lock)
313{ 366{
367 rwlock_release(&lock->dep_map, 1, _RET_IP_);
314 _raw_read_unlock(lock); 368 _raw_read_unlock(lock);
315 local_irq_enable(); 369 local_irq_enable();
316 preempt_enable(); 370 preempt_enable();
@@ -319,14 +373,16 @@ EXPORT_SYMBOL(_read_unlock_irq);
319 373
320void __lockfunc _read_unlock_bh(rwlock_t *lock) 374void __lockfunc _read_unlock_bh(rwlock_t *lock)
321{ 375{
376 rwlock_release(&lock->dep_map, 1, _RET_IP_);
322 _raw_read_unlock(lock); 377 _raw_read_unlock(lock);
323 preempt_enable_no_resched(); 378 preempt_enable_no_resched();
324 local_bh_enable(); 379 local_bh_enable_ip((unsigned long)__builtin_return_address(0));
325} 380}
326EXPORT_SYMBOL(_read_unlock_bh); 381EXPORT_SYMBOL(_read_unlock_bh);
327 382
328void __lockfunc _write_unlock_irqrestore(rwlock_t *lock, unsigned long flags) 383void __lockfunc _write_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
329{ 384{
385 rwlock_release(&lock->dep_map, 1, _RET_IP_);
330 _raw_write_unlock(lock); 386 _raw_write_unlock(lock);
331 local_irq_restore(flags); 387 local_irq_restore(flags);
332 preempt_enable(); 388 preempt_enable();
@@ -335,6 +391,7 @@ EXPORT_SYMBOL(_write_unlock_irqrestore);
335 391
336void __lockfunc _write_unlock_irq(rwlock_t *lock) 392void __lockfunc _write_unlock_irq(rwlock_t *lock)
337{ 393{
394 rwlock_release(&lock->dep_map, 1, _RET_IP_);
338 _raw_write_unlock(lock); 395 _raw_write_unlock(lock);
339 local_irq_enable(); 396 local_irq_enable();
340 preempt_enable(); 397 preempt_enable();
@@ -343,9 +400,10 @@ EXPORT_SYMBOL(_write_unlock_irq);
343 400
344void __lockfunc _write_unlock_bh(rwlock_t *lock) 401void __lockfunc _write_unlock_bh(rwlock_t *lock)
345{ 402{
403 rwlock_release(&lock->dep_map, 1, _RET_IP_);
346 _raw_write_unlock(lock); 404 _raw_write_unlock(lock);
347 preempt_enable_no_resched(); 405 preempt_enable_no_resched();
348 local_bh_enable(); 406 local_bh_enable_ip((unsigned long)__builtin_return_address(0));
349} 407}
350EXPORT_SYMBOL(_write_unlock_bh); 408EXPORT_SYMBOL(_write_unlock_bh);
351 409
@@ -353,11 +411,13 @@ int __lockfunc _spin_trylock_bh(spinlock_t *lock)
353{ 411{
354 local_bh_disable(); 412 local_bh_disable();
355 preempt_disable(); 413 preempt_disable();
356 if (_raw_spin_trylock(lock)) 414 if (_raw_spin_trylock(lock)) {
415 spin_acquire(&lock->dep_map, 0, 1, _RET_IP_);
357 return 1; 416 return 1;
417 }
358 418
359 preempt_enable_no_resched(); 419 preempt_enable_no_resched();
360 local_bh_enable(); 420 local_bh_enable_ip((unsigned long)__builtin_return_address(0));
361 return 0; 421 return 0;
362} 422}
363EXPORT_SYMBOL(_spin_trylock_bh); 423EXPORT_SYMBOL(_spin_trylock_bh);
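
Besides annotating every lock and unlock path with spin_acquire()/spin_release() and their rwlock counterparts, spinlock.c gains _spin_lock_nested(), reached through the spin_lock_nested() wrapper under CONFIG_DEBUG_LOCK_ALLOC. It lets code that legitimately holds two locks of the same lock class tell lockdep that the second acquisition is a distinct subclass instead of a recursive deadlock. A sketch with a hypothetical structure, assuming the two objects are distinct:

#include <linux/lockdep.h>
#include <linux/spinlock.h>

struct example_node {
	spinlock_t lock;
	/* ... payload ... */
};

/* Lock two nodes of the same class in a stable (address) order. */
static void example_lock_pair(struct example_node *a, struct example_node *b)
{
	struct example_node *first  = a < b ? a : b;
	struct example_node *second = a < b ? b : a;

	spin_lock(&first->lock);
	/* same lock class again: mark it as one nesting level deeper */
	spin_lock_nested(&second->lock, SINGLE_DEPTH_NESTING);

	/* ... operate on both nodes ... */

	spin_unlock(&second->lock);
	spin_unlock(&first->lock);
}
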
diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c
new file mode 100644
index 000000000000..b71816e47a30
--- /dev/null
+++ b/kernel/stacktrace.c
@@ -0,0 +1,24 @@
1/*
2 * kernel/stacktrace.c
3 *
4 * Stack trace management functions
5 *
6 * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
7 */
8#include <linux/sched.h>
9#include <linux/kallsyms.h>
10#include <linux/stacktrace.h>
11
12void print_stack_trace(struct stack_trace *trace, int spaces)
13{
14 int i, j;
15
16 for (i = 0; i < trace->nr_entries; i++) {
17 unsigned long ip = trace->entries[i];
18
19 for (j = 0; j < spaces + 1; j++)
20 printk(" ");
21 print_ip_sym(ip);
22 }
23}
24
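
kernel/stacktrace.c is new and deliberately small: print_stack_trace() walks trace->entries and symbolizes each address with print_ip_sym(), indenting every line by spaces + 1. In-tree users fill the trace through the arch's save_stack_trace() implementation added elsewhere in this series; the fragment below, assuming CONFIG_STACKTRACE=y and using a hand-filled single-entry trace, only illustrates the output side:

#include <linux/kernel.h>
#include <linux/stacktrace.h>

/* Illustrative only: feed one return address through the new helper. */
static void example_print_caller(void)
{
	unsigned long entry = (unsigned long)__builtin_return_address(0);
	struct stack_trace trace = {
		.nr_entries	= 1,
		.max_entries	= 1,
		.entries	= &entry,
	};

	print_stack_trace(&trace, 0);	/* spaces == 0 still indents by one */
}
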
diff --git a/kernel/sys.c b/kernel/sys.c
index 0b6ec0e7936f..e236f98f7ec5 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -4,7 +4,6 @@
4 * Copyright (C) 1991, 1992 Linus Torvalds 4 * Copyright (C) 1991, 1992 Linus Torvalds
5 */ 5 */
6 6
7#include <linux/config.h>
8#include <linux/module.h> 7#include <linux/module.h>
9#include <linux/mm.h> 8#include <linux/mm.h>
10#include <linux/utsname.h> 9#include <linux/utsname.h>
@@ -13,7 +12,6 @@
13#include <linux/notifier.h> 12#include <linux/notifier.h>
14#include <linux/reboot.h> 13#include <linux/reboot.h>
15#include <linux/prctl.h> 14#include <linux/prctl.h>
16#include <linux/init.h>
17#include <linux/highuid.h> 15#include <linux/highuid.h>
18#include <linux/fs.h> 16#include <linux/fs.h>
19#include <linux/kernel.h> 17#include <linux/kernel.h>
@@ -57,6 +55,12 @@
57#ifndef GET_FPEXC_CTL 55#ifndef GET_FPEXC_CTL
58# define GET_FPEXC_CTL(a,b) (-EINVAL) 56# define GET_FPEXC_CTL(a,b) (-EINVAL)
59#endif 57#endif
58#ifndef GET_ENDIAN
59# define GET_ENDIAN(a,b) (-EINVAL)
60#endif
61#ifndef SET_ENDIAN
62# define SET_ENDIAN(a,b) (-EINVAL)
63#endif
60 64
61/* 65/*
62 * this is where the system-wide overflow UID and GID are defined, for 66 * this is where the system-wide overflow UID and GID are defined, for
@@ -132,14 +136,15 @@ static int __kprobes notifier_call_chain(struct notifier_block **nl,
132 unsigned long val, void *v) 136 unsigned long val, void *v)
133{ 137{
134 int ret = NOTIFY_DONE; 138 int ret = NOTIFY_DONE;
135 struct notifier_block *nb; 139 struct notifier_block *nb, *next_nb;
136 140
137 nb = rcu_dereference(*nl); 141 nb = rcu_dereference(*nl);
138 while (nb) { 142 while (nb) {
143 next_nb = rcu_dereference(nb->next);
139 ret = nb->notifier_call(nb, val, v); 144 ret = nb->notifier_call(nb, val, v);
140 if ((ret & NOTIFY_STOP_MASK) == NOTIFY_STOP_MASK) 145 if ((ret & NOTIFY_STOP_MASK) == NOTIFY_STOP_MASK)
141 break; 146 break;
142 nb = rcu_dereference(nb->next); 147 nb = next_nb;
143 } 148 }
144 return ret; 149 return ret;
145} 150}
@@ -583,7 +588,7 @@ void emergency_restart(void)
583} 588}
584EXPORT_SYMBOL_GPL(emergency_restart); 589EXPORT_SYMBOL_GPL(emergency_restart);
585 590
586void kernel_restart_prepare(char *cmd) 591static void kernel_restart_prepare(char *cmd)
587{ 592{
588 blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd); 593 blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd);
589 system_state = SYSTEM_RESTART; 594 system_state = SYSTEM_RESTART;
@@ -617,7 +622,7 @@ EXPORT_SYMBOL_GPL(kernel_restart);
617 * Move into place and start executing a preloaded standalone 622 * Move into place and start executing a preloaded standalone
618 * executable. If nothing was preloaded return an error. 623 * executable. If nothing was preloaded return an error.
619 */ 624 */
620void kernel_kexec(void) 625static void kernel_kexec(void)
621{ 626{
622#ifdef CONFIG_KEXEC 627#ifdef CONFIG_KEXEC
623 struct kimage *image; 628 struct kimage *image;
@@ -631,7 +636,6 @@ void kernel_kexec(void)
631 machine_kexec(image); 636 machine_kexec(image);
632#endif 637#endif
633} 638}
634EXPORT_SYMBOL_GPL(kernel_kexec);
635 639
636void kernel_shutdown_prepare(enum system_states state) 640void kernel_shutdown_prepare(enum system_states state)
637{ 641{
@@ -1860,23 +1864,20 @@ out:
1860 * fields when reaping, so a sample either gets all the additions of a 1864 * fields when reaping, so a sample either gets all the additions of a
1861 * given child after it's reaped, or none so this sample is before reaping. 1865 * given child after it's reaped, or none so this sample is before reaping.
1862 * 1866 *
1863 * tasklist_lock locking optimisation: 1867 * Locking:
1864 * If we are current and single threaded, we do not need to take the tasklist 1868 * We need to take the siglock for CHILDREN, SELF and BOTH
1865 * lock or the siglock. No one else can take our signal_struct away, 1869 * for the cases: current multithreaded, non-current single threaded,
1866 * no one else can reap the children to update signal->c* counters, and 1870 * non-current multithreaded. Thread traversal is now safe with
1867 * no one else can race with the signal-> fields. 1871 * the siglock held.
1868 * If we do not take the tasklist_lock, the signal-> fields could be read 1872 * Strictly speaking, we do not need to take the siglock if we are current and
1869 * out of order while another thread was just exiting. So we place a 1873 * single threaded, as no one else can take our signal_struct away, no one
1870 * read memory barrier when we avoid the lock. On the writer side, 1874 * else can reap the children to update signal->c* counters, and no one else
1871 * write memory barrier is implied in __exit_signal as __exit_signal releases 1875 * can race with the signal-> fields. If we do not take any lock, the
1872 * the siglock spinlock after updating the signal-> fields. 1876 * signal-> fields could be read out of order while another thread was just
1873 * 1877 * exiting. So we should place a read memory barrier when we avoid the lock.
1874 * We don't really need the siglock when we access the non c* fields 1878 * On the writer side, write memory barrier is implied in __exit_signal
1875 * of the signal_struct (for RUSAGE_SELF) even in multithreaded 1879 * as __exit_signal releases the siglock spinlock after updating the signal->
1876 * case, since we take the tasklist lock for read and the non c* signal-> 1880 * fields. But we don't do this yet to keep things simple.
1877 * fields are updated only in __exit_signal, which is called with
1878 * tasklist_lock taken for write, hence these two threads cannot execute
1879 * concurrently.
1880 * 1881 *
1881 */ 1882 */
1882 1883
@@ -1885,35 +1886,25 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1885 struct task_struct *t; 1886 struct task_struct *t;
1886 unsigned long flags; 1887 unsigned long flags;
1887 cputime_t utime, stime; 1888 cputime_t utime, stime;
1888 int need_lock = 0;
1889 1889
1890 memset((char *) r, 0, sizeof *r); 1890 memset((char *) r, 0, sizeof *r);
1891 utime = stime = cputime_zero; 1891 utime = stime = cputime_zero;
1892 1892
1893 if (p != current || !thread_group_empty(p)) 1893 rcu_read_lock();
1894 need_lock = 1; 1894 if (!lock_task_sighand(p, &flags)) {
1895 1895 rcu_read_unlock();
1896 if (need_lock) { 1896 return;
1897 read_lock(&tasklist_lock); 1897 }
1898 if (unlikely(!p->signal)) {
1899 read_unlock(&tasklist_lock);
1900 return;
1901 }
1902 } else
1903 /* See locking comments above */
1904 smp_rmb();
1905 1898
1906 switch (who) { 1899 switch (who) {
1907 case RUSAGE_BOTH: 1900 case RUSAGE_BOTH:
1908 case RUSAGE_CHILDREN: 1901 case RUSAGE_CHILDREN:
1909 spin_lock_irqsave(&p->sighand->siglock, flags);
1910 utime = p->signal->cutime; 1902 utime = p->signal->cutime;
1911 stime = p->signal->cstime; 1903 stime = p->signal->cstime;
1912 r->ru_nvcsw = p->signal->cnvcsw; 1904 r->ru_nvcsw = p->signal->cnvcsw;
1913 r->ru_nivcsw = p->signal->cnivcsw; 1905 r->ru_nivcsw = p->signal->cnivcsw;
1914 r->ru_minflt = p->signal->cmin_flt; 1906 r->ru_minflt = p->signal->cmin_flt;
1915 r->ru_majflt = p->signal->cmaj_flt; 1907 r->ru_majflt = p->signal->cmaj_flt;
1916 spin_unlock_irqrestore(&p->sighand->siglock, flags);
1917 1908
1918 if (who == RUSAGE_CHILDREN) 1909 if (who == RUSAGE_CHILDREN)
1919 break; 1910 break;
@@ -1941,8 +1932,9 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1941 BUG(); 1932 BUG();
1942 } 1933 }
1943 1934
1944 if (need_lock) 1935 unlock_task_sighand(p, &flags);
1945 read_unlock(&tasklist_lock); 1936 rcu_read_unlock();
1937
1946 cputime_to_timeval(utime, &r->ru_utime); 1938 cputime_to_timeval(utime, &r->ru_utime);
1947 cputime_to_timeval(stime, &r->ru_stime); 1939 cputime_to_timeval(stime, &r->ru_stime);
1948} 1940}
@@ -1991,7 +1983,7 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
1991 error = current->mm->dumpable; 1983 error = current->mm->dumpable;
1992 break; 1984 break;
1993 case PR_SET_DUMPABLE: 1985 case PR_SET_DUMPABLE:
1994 if (arg2 < 0 || arg2 > 2) { 1986 if (arg2 < 0 || arg2 > 1) {
1995 error = -EINVAL; 1987 error = -EINVAL;
1996 break; 1988 break;
1997 } 1989 }
@@ -2057,6 +2049,13 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
2057 return -EFAULT; 2049 return -EFAULT;
2058 return 0; 2050 return 0;
2059 } 2051 }
2052 case PR_GET_ENDIAN:
2053 error = GET_ENDIAN(current, arg2);
2054 break;
2055 case PR_SET_ENDIAN:
2056 error = SET_ENDIAN(current, arg2);
2057 break;
2058
2060 default: 2059 default:
2061 error = -EINVAL; 2060 error = -EINVAL;
2062 break; 2061 break;
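
The k_getrusage() rewrite drops the tasklist_lock/need_lock juggling in favour of rcu_read_lock() plus lock_task_sighand(), which locks p->sighand under RCU and fails cleanly once the task has gone through __exit_signal(). The same pattern, pulled out into a standalone sketch (the function name and the field being read are hypothetical):

#include <linux/errno.h>
#include <linux/rcupdate.h>
#include <linux/sched.h>

static int example_read_cnvcsw(struct task_struct *p, unsigned long *val)
{
	unsigned long flags;

	rcu_read_lock();
	if (!lock_task_sighand(p, &flags)) {
		/* the task has already released its signal/sighand structs */
		rcu_read_unlock();
		return -ESRCH;
	}

	*val = p->signal->cnvcsw;	/* stable while the siglock is held */

	unlock_task_sighand(p, &flags);
	rcu_read_unlock();
	return 0;
}
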
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 5433195040f1..6991bece67e8 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -87,6 +87,7 @@ cond_syscall(sys_inotify_init);
87cond_syscall(sys_inotify_add_watch); 87cond_syscall(sys_inotify_add_watch);
88cond_syscall(sys_inotify_rm_watch); 88cond_syscall(sys_inotify_rm_watch);
89cond_syscall(sys_migrate_pages); 89cond_syscall(sys_migrate_pages);
90cond_syscall(sys_move_pages);
90cond_syscall(sys_chown16); 91cond_syscall(sys_chown16);
91cond_syscall(sys_fchown16); 92cond_syscall(sys_fchown16);
92cond_syscall(sys_getegid16); 93cond_syscall(sys_getegid16);
@@ -132,3 +133,4 @@ cond_syscall(sys_mincore);
132cond_syscall(sys_madvise); 133cond_syscall(sys_madvise);
133cond_syscall(sys_mremap); 134cond_syscall(sys_mremap);
134cond_syscall(sys_remap_file_pages); 135cond_syscall(sys_remap_file_pages);
136cond_syscall(compat_sys_move_pages);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index e82726faeeff..362a0cc37138 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -18,7 +18,6 @@
18 * Removed it and replaced it with older style, 03/23/00, Bill Wendling 18 * Removed it and replaced it with older style, 03/23/00, Bill Wendling
19 */ 19 */
20 20
21#include <linux/config.h>
22#include <linux/module.h> 21#include <linux/module.h>
23#include <linux/mm.h> 22#include <linux/mm.h>
24#include <linux/swap.h> 23#include <linux/swap.h>
@@ -59,6 +58,7 @@ extern int proc_nr_files(ctl_table *table, int write, struct file *filp,
59extern int C_A_D; 58extern int C_A_D;
60extern int sysctl_overcommit_memory; 59extern int sysctl_overcommit_memory;
61extern int sysctl_overcommit_ratio; 60extern int sysctl_overcommit_ratio;
61extern int sysctl_panic_on_oom;
62extern int max_threads; 62extern int max_threads;
63extern int sysrq_enabled; 63extern int sysrq_enabled;
64extern int core_uses_pid; 64extern int core_uses_pid;
@@ -72,6 +72,7 @@ extern int printk_ratelimit_burst;
72extern int pid_max_min, pid_max_max; 72extern int pid_max_min, pid_max_max;
73extern int sysctl_drop_caches; 73extern int sysctl_drop_caches;
74extern int percpu_pagelist_fraction; 74extern int percpu_pagelist_fraction;
75extern int compat_log;
75 76
76#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) 77#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
77int unknown_nmi_panic; 78int unknown_nmi_panic;
@@ -131,6 +132,10 @@ extern int acct_parm[];
131extern int no_unaligned_warning; 132extern int no_unaligned_warning;
132#endif 133#endif
133 134
135#ifdef CONFIG_RT_MUTEXES
136extern int max_lock_depth;
137#endif
138
134static int parse_table(int __user *, int, void __user *, size_t __user *, void __user *, size_t, 139static int parse_table(int __user *, int, void __user *, size_t __user *, void __user *, size_t,
135 ctl_table *, void **); 140 ctl_table *, void **);
136static int proc_doutsstring(ctl_table *table, int write, struct file *filp, 141static int proc_doutsstring(ctl_table *table, int write, struct file *filp,
@@ -142,7 +147,6 @@ static struct ctl_table_header root_table_header =
142 147
143static ctl_table kern_table[]; 148static ctl_table kern_table[];
144static ctl_table vm_table[]; 149static ctl_table vm_table[];
145static ctl_table proc_table[];
146static ctl_table fs_table[]; 150static ctl_table fs_table[];
147static ctl_table debug_table[]; 151static ctl_table debug_table[];
148static ctl_table dev_table[]; 152static ctl_table dev_table[];
@@ -150,7 +154,7 @@ extern ctl_table random_table[];
150#ifdef CONFIG_UNIX98_PTYS 154#ifdef CONFIG_UNIX98_PTYS
151extern ctl_table pty_table[]; 155extern ctl_table pty_table[];
152#endif 156#endif
153#ifdef CONFIG_INOTIFY 157#ifdef CONFIG_INOTIFY_USER
154extern ctl_table inotify_table[]; 158extern ctl_table inotify_table[];
155#endif 159#endif
156 160
@@ -202,12 +206,6 @@ static ctl_table root_table[] = {
202 }, 206 },
203#endif 207#endif
204 { 208 {
205 .ctl_name = CTL_PROC,
206 .procname = "proc",
207 .mode = 0555,
208 .child = proc_table,
209 },
210 {
211 .ctl_name = CTL_FS, 209 .ctl_name = CTL_FS,
212 .procname = "fs", 210 .procname = "fs",
213 .mode = 0555, 211 .mode = 0555,
@@ -398,7 +396,7 @@ static ctl_table kern_table[] = {
398 .strategy = &sysctl_string, 396 .strategy = &sysctl_string,
399 }, 397 },
400#endif 398#endif
401#ifdef CONFIG_HOTPLUG 399#if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET)
402 { 400 {
403 .ctl_name = KERN_HOTPLUG, 401 .ctl_name = KERN_HOTPLUG,
404 .procname = "hotplug", 402 .procname = "hotplug",
@@ -683,6 +681,27 @@ static ctl_table kern_table[] = {
683 .proc_handler = &proc_dointvec, 681 .proc_handler = &proc_dointvec,
684 }, 682 },
685#endif 683#endif
684#ifdef CONFIG_COMPAT
685 {
686 .ctl_name = KERN_COMPAT_LOG,
687 .procname = "compat-log",
688 .data = &compat_log,
689 .maxlen = sizeof (int),
690 .mode = 0644,
691 .proc_handler = &proc_dointvec,
692 },
693#endif
694#ifdef CONFIG_RT_MUTEXES
695 {
696 .ctl_name = KERN_MAX_LOCK_DEPTH,
697 .procname = "max_lock_depth",
698 .data = &max_lock_depth,
699 .maxlen = sizeof(int),
700 .mode = 0644,
701 .proc_handler = &proc_dointvec,
702 },
703#endif
704
686 { .ctl_name = 0 } 705 { .ctl_name = 0 }
687}; 706};
688 707
@@ -702,6 +721,14 @@ static ctl_table vm_table[] = {
702 .proc_handler = &proc_dointvec, 721 .proc_handler = &proc_dointvec,
703 }, 722 },
704 { 723 {
724 .ctl_name = VM_PANIC_ON_OOM,
725 .procname = "panic_on_oom",
726 .data = &sysctl_panic_on_oom,
727 .maxlen = sizeof(sysctl_panic_on_oom),
728 .mode = 0644,
729 .proc_handler = &proc_dointvec,
730 },
731 {
705 .ctl_name = VM_OVERCOMMIT_RATIO, 732 .ctl_name = VM_OVERCOMMIT_RATIO,
706 .procname = "overcommit_ratio", 733 .procname = "overcommit_ratio",
707 .data = &sysctl_overcommit_ratio, 734 .data = &sysctl_overcommit_ratio,
@@ -906,19 +933,29 @@ static ctl_table vm_table[] = {
906 .extra1 = &zero, 933 .extra1 = &zero,
907 }, 934 },
908 { 935 {
909 .ctl_name = VM_ZONE_RECLAIM_INTERVAL, 936 .ctl_name = VM_MIN_UNMAPPED,
910 .procname = "zone_reclaim_interval", 937 .procname = "min_unmapped_ratio",
911 .data = &zone_reclaim_interval, 938 .data = &sysctl_min_unmapped_ratio,
912 .maxlen = sizeof(zone_reclaim_interval), 939 .maxlen = sizeof(sysctl_min_unmapped_ratio),
913 .mode = 0644, 940 .mode = 0644,
914 .proc_handler = &proc_dointvec_jiffies, 941 .proc_handler = &sysctl_min_unmapped_ratio_sysctl_handler,
915 .strategy = &sysctl_jiffies, 942 .strategy = &sysctl_intvec,
943 .extra1 = &zero,
944 .extra2 = &one_hundred,
945 },
946#endif
947#ifdef CONFIG_X86_32
948 {
949 .ctl_name = VM_VDSO_ENABLED,
950 .procname = "vdso_enabled",
951 .data = &vdso_enabled,
952 .maxlen = sizeof(vdso_enabled),
953 .mode = 0644,
954 .proc_handler = &proc_dointvec,
955 .strategy = &sysctl_intvec,
956 .extra1 = &zero,
916 }, 957 },
917#endif 958#endif
918 { .ctl_name = 0 }
919};
920
921static ctl_table proc_table[] = {
922 { .ctl_name = 0 } 959 { .ctl_name = 0 }
923}; 960};
924 961
@@ -1028,7 +1065,7 @@ static ctl_table fs_table[] = {
1028 .mode = 0644, 1065 .mode = 0644,
1029 .proc_handler = &proc_doulongvec_minmax, 1066 .proc_handler = &proc_doulongvec_minmax,
1030 }, 1067 },
1031#ifdef CONFIG_INOTIFY 1068#ifdef CONFIG_INOTIFY_USER
1032 { 1069 {
1033 .ctl_name = FS_INOTIFY, 1070 .ctl_name = FS_INOTIFY,
1034 .procname = "inotify", 1071 .procname = "inotify",
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
new file mode 100644
index 000000000000..f45179ce028e
--- /dev/null
+++ b/kernel/taskstats.c
@@ -0,0 +1,568 @@
1/*
2 * taskstats.c - Export per-task statistics to userland
3 *
4 * Copyright (C) Shailabh Nagar, IBM Corp. 2006
5 * (C) Balbir Singh, IBM Corp. 2006
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 */
18
19#include <linux/kernel.h>
20#include <linux/taskstats_kern.h>
21#include <linux/delayacct.h>
22#include <linux/cpumask.h>
23#include <linux/percpu.h>
24#include <net/genetlink.h>
25#include <asm/atomic.h>
26
27/*
28 * Maximum length of a cpumask that can be specified in
29 * the TASKSTATS_CMD_ATTR_REGISTER/DEREGISTER_CPUMASK attribute
30 */
31#define TASKSTATS_CPUMASK_MAXLEN (100+6*NR_CPUS)
32
33static DEFINE_PER_CPU(__u32, taskstats_seqnum) = { 0 };
34static int family_registered;
35kmem_cache_t *taskstats_cache;
36
37static struct genl_family family = {
38 .id = GENL_ID_GENERATE,
39 .name = TASKSTATS_GENL_NAME,
40 .version = TASKSTATS_GENL_VERSION,
41 .maxattr = TASKSTATS_CMD_ATTR_MAX,
42};
43
44static struct nla_policy taskstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1]
45__read_mostly = {
46 [TASKSTATS_CMD_ATTR_PID] = { .type = NLA_U32 },
47 [TASKSTATS_CMD_ATTR_TGID] = { .type = NLA_U32 },
48 [TASKSTATS_CMD_ATTR_REGISTER_CPUMASK] = { .type = NLA_STRING },
49 [TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK] = { .type = NLA_STRING },};
50
51struct listener {
52 struct list_head list;
53 pid_t pid;
54 char valid;
55};
56
57struct listener_list {
58 struct rw_semaphore sem;
59 struct list_head list;
60};
61static DEFINE_PER_CPU(struct listener_list, listener_array);
62
63enum actions {
64 REGISTER,
65 DEREGISTER,
66 CPU_DONT_CARE
67};
68
69static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp,
70 void **replyp, size_t size)
71{
72 struct sk_buff *skb;
73 void *reply;
74
75 /*
76 * If new attributes are added, please revisit this allocation
77 */
78 skb = nlmsg_new(size);
79 if (!skb)
80 return -ENOMEM;
81
82 if (!info) {
83 int seq = get_cpu_var(taskstats_seqnum)++;
84 put_cpu_var(taskstats_seqnum);
85
86 reply = genlmsg_put(skb, 0, seq,
87 family.id, 0, 0,
88 cmd, family.version);
89 } else
90 reply = genlmsg_put(skb, info->snd_pid, info->snd_seq,
91 family.id, 0, 0,
92 cmd, family.version);
93 if (reply == NULL) {
94 nlmsg_free(skb);
95 return -EINVAL;
96 }
97
98 *skbp = skb;
99 *replyp = reply;
100 return 0;
101}
102
103/*
104 * Send taskstats data in @skb to listener with nl_pid @pid
105 */
106static int send_reply(struct sk_buff *skb, pid_t pid)
107{
108 struct genlmsghdr *genlhdr = nlmsg_data((struct nlmsghdr *)skb->data);
109 void *reply = genlmsg_data(genlhdr);
110 int rc;
111
112 rc = genlmsg_end(skb, reply);
113 if (rc < 0) {
114 nlmsg_free(skb);
115 return rc;
116 }
117
118 return genlmsg_unicast(skb, pid);
119}
120
121/*
122 * Send taskstats data in @skb to listeners registered for @cpu's exit data
123 */
124static int send_cpu_listeners(struct sk_buff *skb, unsigned int cpu)
125{
126 struct genlmsghdr *genlhdr = nlmsg_data((struct nlmsghdr *)skb->data);
127 struct listener_list *listeners;
128 struct listener *s, *tmp;
129 struct sk_buff *skb_next, *skb_cur = skb;
130 void *reply = genlmsg_data(genlhdr);
131 int rc, ret, delcount = 0;
132
133 rc = genlmsg_end(skb, reply);
134 if (rc < 0) {
135 nlmsg_free(skb);
136 return rc;
137 }
138
139 rc = 0;
140 listeners = &per_cpu(listener_array, cpu);
141 down_read(&listeners->sem);
142 list_for_each_entry_safe(s, tmp, &listeners->list, list) {
143 skb_next = NULL;
144 if (!list_is_last(&s->list, &listeners->list)) {
145 skb_next = skb_clone(skb_cur, GFP_KERNEL);
146 if (!skb_next) {
147 nlmsg_free(skb_cur);
148 rc = -ENOMEM;
149 break;
150 }
151 }
152 ret = genlmsg_unicast(skb_cur, s->pid);
153 if (ret == -ECONNREFUSED) {
154 s->valid = 0;
155 delcount++;
156 rc = ret;
157 }
158 skb_cur = skb_next;
159 }
160 up_read(&listeners->sem);
161
162 if (!delcount)
163 return rc;
164
165 /* Delete invalidated entries */
166 down_write(&listeners->sem);
167 list_for_each_entry_safe(s, tmp, &listeners->list, list) {
168 if (!s->valid) {
169 list_del(&s->list);
170 kfree(s);
171 }
172 }
173 up_write(&listeners->sem);
174 return rc;
175}
176
177static int fill_pid(pid_t pid, struct task_struct *pidtsk,
178 struct taskstats *stats)
179{
180 int rc;
181 struct task_struct *tsk = pidtsk;
182
183 if (!pidtsk) {
184 read_lock(&tasklist_lock);
185 tsk = find_task_by_pid(pid);
186 if (!tsk) {
187 read_unlock(&tasklist_lock);
188 return -ESRCH;
189 }
190 get_task_struct(tsk);
191 read_unlock(&tasklist_lock);
192 } else
193 get_task_struct(tsk);
194
195 /*
196 * Each accounting subsystem adds calls to its functions to
197 * fill in relevant parts of struct taskstats as follows
198 *
199 * rc = per-task-foo(stats, tsk);
200 * if (rc)
201 * goto err;
202 */
203
204 rc = delayacct_add_tsk(stats, tsk);
205 stats->version = TASKSTATS_VERSION;
206
207 /* Define err: label here if needed */
208 put_task_struct(tsk);
209 return rc;
210
211}
212
213static int fill_tgid(pid_t tgid, struct task_struct *tgidtsk,
214 struct taskstats *stats)
215{
216 struct task_struct *tsk, *first;
217 unsigned long flags;
218
219 /*
220 * Add additional stats from live tasks except zombie thread group
221 * leaders who are already counted with the dead tasks
222 */
223 first = tgidtsk;
224 if (!first) {
225 read_lock(&tasklist_lock);
226 first = find_task_by_pid(tgid);
227 if (!first) {
228 read_unlock(&tasklist_lock);
229 return -ESRCH;
230 }
231 get_task_struct(first);
232 read_unlock(&tasklist_lock);
233 } else
234 get_task_struct(first);
235
236 /* Start with stats from dead tasks */
237 spin_lock_irqsave(&first->signal->stats_lock, flags);
238 if (first->signal->stats)
239 memcpy(stats, first->signal->stats, sizeof(*stats));
240 spin_unlock_irqrestore(&first->signal->stats_lock, flags);
241
242 tsk = first;
243 read_lock(&tasklist_lock);
244 do {
245 if (tsk->exit_state == EXIT_ZOMBIE && thread_group_leader(tsk))
246 continue;
247 /*
248 * Accounting subsystem can call its functions here to
249 * fill in relevant parts of struct taskstats as follows
250 *
251 * per-task-foo(stats, tsk);
252 */
253 delayacct_add_tsk(stats, tsk);
254
255 } while_each_thread(first, tsk);
256 read_unlock(&tasklist_lock);
257 stats->version = TASKSTATS_VERSION;
258
259 /*
260 * Accounting subsystems can also add calls here to modify
261 * fields of taskstats.
262 */
263
264 return 0;
265}
266
267
268static void fill_tgid_exit(struct task_struct *tsk)
269{
270 unsigned long flags;
271
272 spin_lock_irqsave(&tsk->signal->stats_lock, flags);
273 if (!tsk->signal->stats)
274 goto ret;
275
276 /*
277 * Each accounting subsystem calls its functions here to
278 * accumulate its per-task stats for tsk into the per-tgid structure
279 *
280 * per-task-foo(tsk->signal->stats, tsk);
281 */
282 delayacct_add_tsk(tsk->signal->stats, tsk);
283ret:
284 spin_unlock_irqrestore(&tsk->signal->stats_lock, flags);
285 return;
286}
287
288static int add_del_listener(pid_t pid, cpumask_t *maskp, int isadd)
289{
290 struct listener_list *listeners;
291 struct listener *s, *tmp;
292 unsigned int cpu;
293 cpumask_t mask = *maskp;
294
295 if (!cpus_subset(mask, cpu_possible_map))
296 return -EINVAL;
297
298 if (isadd == REGISTER) {
299 for_each_cpu_mask(cpu, mask) {
300 s = kmalloc_node(sizeof(struct listener), GFP_KERNEL,
301 cpu_to_node(cpu));
302 if (!s)
303 goto cleanup;
304 s->pid = pid;
305 INIT_LIST_HEAD(&s->list);
306 s->valid = 1;
307
308 listeners = &per_cpu(listener_array, cpu);
309 down_write(&listeners->sem);
310 list_add(&s->list, &listeners->list);
311 up_write(&listeners->sem);
312 }
313 return 0;
314 }
315
316 /* Deregister or cleanup */
317cleanup:
318 for_each_cpu_mask(cpu, mask) {
319 listeners = &per_cpu(listener_array, cpu);
320 down_write(&listeners->sem);
321 list_for_each_entry_safe(s, tmp, &listeners->list, list) {
322 if (s->pid == pid) {
323 list_del(&s->list);
324 kfree(s);
325 break;
326 }
327 }
328 up_write(&listeners->sem);
329 }
330 return 0;
331}
332
333static int parse(struct nlattr *na, cpumask_t *mask)
334{
335 char *data;
336 int len;
337 int ret;
338
339 if (na == NULL)
340 return 1;
341 len = nla_len(na);
342 if (len > TASKSTATS_CPUMASK_MAXLEN)
343 return -E2BIG;
344 if (len < 1)
345 return -EINVAL;
346 data = kmalloc(len, GFP_KERNEL);
347 if (!data)
348 return -ENOMEM;
349 nla_strlcpy(data, na, len);
350 ret = cpulist_parse(data, *mask);
351 kfree(data);
352 return ret;
353}
354
355static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
356{
357 int rc = 0;
358 struct sk_buff *rep_skb;
359 struct taskstats stats;
360 void *reply;
361 size_t size;
362 struct nlattr *na;
363 cpumask_t mask;
364
365 rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], &mask);
366 if (rc < 0)
367 return rc;
368 if (rc == 0)
369 return add_del_listener(info->snd_pid, &mask, REGISTER);
370
371 rc = parse(info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK], &mask);
372 if (rc < 0)
373 return rc;
374 if (rc == 0)
375 return add_del_listener(info->snd_pid, &mask, DEREGISTER);
376
377 /*
378 * Size includes space for nested attributes
379 */
380 size = nla_total_size(sizeof(u32)) +
381 nla_total_size(sizeof(struct taskstats)) + nla_total_size(0);
382
383 memset(&stats, 0, sizeof(stats));
384 rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, &reply, size);
385 if (rc < 0)
386 return rc;
387
388 if (info->attrs[TASKSTATS_CMD_ATTR_PID]) {
389 u32 pid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_PID]);
390 rc = fill_pid(pid, NULL, &stats);
391 if (rc < 0)
392 goto err;
393
394 na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_PID);
395 NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_PID, pid);
396 NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS,
397 stats);
398 } else if (info->attrs[TASKSTATS_CMD_ATTR_TGID]) {
399 u32 tgid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_TGID]);
400 rc = fill_tgid(tgid, NULL, &stats);
401 if (rc < 0)
402 goto err;
403
404 na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_TGID);
405 NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_TGID, tgid);
406 NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS,
407 stats);
408 } else {
409 rc = -EINVAL;
410 goto err;
411 }
412
413 nla_nest_end(rep_skb, na);
414
415 return send_reply(rep_skb, info->snd_pid);
416
417nla_put_failure:
418 return genlmsg_cancel(rep_skb, reply);
419err:
420 nlmsg_free(rep_skb);
421 return rc;
422}
423
424void taskstats_exit_alloc(struct taskstats **ptidstats, unsigned int *mycpu)
425{
426 struct listener_list *listeners;
427 struct taskstats *tmp;
428 /*
429 * This is the cpu on which the task is exiting currently and will
430 * be the one for which the exit event is sent, even if the cpu
431 * on which this function is running changes later.
432 */
433 *mycpu = raw_smp_processor_id();
434
435 *ptidstats = NULL;
436 tmp = kmem_cache_zalloc(taskstats_cache, SLAB_KERNEL);
437 if (!tmp)
438 return;
439
440 listeners = &per_cpu(listener_array, *mycpu);
441 down_read(&listeners->sem);
442 if (!list_empty(&listeners->list)) {
443 *ptidstats = tmp;
444 tmp = NULL;
445 }
446 up_read(&listeners->sem);
447 kfree(tmp);
448}
449
450/* Send pid data out on exit */
451void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats,
452 int group_dead, unsigned int mycpu)
453{
454 int rc;
455 struct sk_buff *rep_skb;
456 void *reply;
457 size_t size;
458 int is_thread_group;
459 struct nlattr *na;
460 unsigned long flags;
461
462 if (!family_registered || !tidstats)
463 return;
464
465 spin_lock_irqsave(&tsk->signal->stats_lock, flags);
466 is_thread_group = tsk->signal->stats ? 1 : 0;
467 spin_unlock_irqrestore(&tsk->signal->stats_lock, flags);
468
469 rc = 0;
470 /*
471 * Size includes space for nested attributes
472 */
473 size = nla_total_size(sizeof(u32)) +
474 nla_total_size(sizeof(struct taskstats)) + nla_total_size(0);
475
476 if (is_thread_group)
477 size = 2 * size; /* PID + STATS + TGID + STATS */
478
479 rc = prepare_reply(NULL, TASKSTATS_CMD_NEW, &rep_skb, &reply, size);
480 if (rc < 0)
481 goto ret;
482
483 rc = fill_pid(tsk->pid, tsk, tidstats);
484 if (rc < 0)
485 goto err_skb;
486
487 na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_PID);
488 NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_PID, (u32)tsk->pid);
489 NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS,
490 *tidstats);
491 nla_nest_end(rep_skb, na);
492
493 if (!is_thread_group)
494 goto send;
495
496 /*
497 * tsk has/had a thread group so fill the tsk->signal->stats structure
498 * Doesn't matter if tsk is the leader or the last group member leaving
499 */
500
501 fill_tgid_exit(tsk);
502 if (!group_dead)
503 goto send;
504
505 na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_TGID);
506 NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_TGID, (u32)tsk->tgid);
507 /* No locking needed for tsk->signal->stats since group is dead */
508 NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS,
509 *tsk->signal->stats);
510 nla_nest_end(rep_skb, na);
511
512send:
513 send_cpu_listeners(rep_skb, mycpu);
514 return;
515
516nla_put_failure:
517 genlmsg_cancel(rep_skb, reply);
518 goto ret;
519err_skb:
520 nlmsg_free(rep_skb);
521ret:
522 return;
523}
524
525static struct genl_ops taskstats_ops = {
526 .cmd = TASKSTATS_CMD_GET,
527 .doit = taskstats_user_cmd,
528 .policy = taskstats_cmd_get_policy,
529};
530
531/* Needed early in initialization */
532void __init taskstats_init_early(void)
533{
534 unsigned int i;
535
536 taskstats_cache = kmem_cache_create("taskstats_cache",
537 sizeof(struct taskstats),
538 0, SLAB_PANIC, NULL, NULL);
539 for_each_possible_cpu(i) {
540 INIT_LIST_HEAD(&(per_cpu(listener_array, i).list));
541 init_rwsem(&(per_cpu(listener_array, i).sem));
542 }
543}
544
545static int __init taskstats_init(void)
546{
547 int rc;
548
549 rc = genl_register_family(&family);
550 if (rc)
551 return rc;
552
553 rc = genl_register_ops(&family, &taskstats_ops);
554 if (rc < 0)
555 goto err;
556
557 family_registered = 1;
558 return 0;
559err:
560 genl_unregister_family(&family);
561 return rc;
562}
563
564/*
565 * late initcall ensures initialization of statistics collection
566 * mechanisms precedes initialization of the taskstats interface
567 */
568late_initcall(taskstats_init);
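The command path above (taskstats_user_cmd) expects a generic netlink request carrying either a TASKSTATS_CMD_ATTR_PID or TASKSTATS_CMD_ATTR_TGID attribute. The following is a minimal userspace sketch of what such a request looks like on the wire, not part of this patch: it assumes the numeric family id for "TASKSTATS" has already been resolved through the generic netlink controller (CTRL_CMD_GETFAMILY) and is simply passed on the command line, and it does not parse the reply.

/* taskstats_query.c - sketch only: send TASKSTATS_CMD_GET for one pid.
 * The family id is an assumption here (resolve it via CTRL_CMD_GETFAMILY
 * in real code); reply parsing is omitted.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <linux/types.h>
#include <linux/netlink.h>
#include <linux/genetlink.h>
#include <linux/taskstats.h>

struct ts_req {
	struct nlmsghdr n;
	struct genlmsghdr g;
	char buf[256];
};

int main(int argc, char **argv)
{
	struct sockaddr_nl local = { .nl_family = AF_NETLINK };
	struct sockaddr_nl kernel = { .nl_family = AF_NETLINK };
	struct ts_req req;
	struct nlattr *na;
	__u32 pid;
	int fd;

	if (argc != 3) {
		fprintf(stderr, "usage: %s <family-id> <pid>\n", argv[0]);
		return 1;
	}

	fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_GENERIC);
	if (fd < 0 || bind(fd, (struct sockaddr *)&local, sizeof(local)) < 0) {
		perror("netlink");
		return 1;
	}

	memset(&req, 0, sizeof(req));
	req.n.nlmsg_len = NLMSG_LENGTH(GENL_HDRLEN);
	req.n.nlmsg_type = atoi(argv[1]);	/* resolved TASKSTATS family id */
	req.n.nlmsg_flags = NLM_F_REQUEST;
	req.n.nlmsg_pid = getpid();
	req.g.cmd = TASKSTATS_CMD_GET;
	req.g.version = TASKSTATS_GENL_VERSION;

	/* append one u32 attribute: TASKSTATS_CMD_ATTR_PID */
	na = (struct nlattr *)((char *)&req + NLMSG_ALIGN(req.n.nlmsg_len));
	na->nla_type = TASKSTATS_CMD_ATTR_PID;
	na->nla_len = NLA_HDRLEN + sizeof(__u32);
	pid = (__u32)atoi(argv[2]);
	memcpy((char *)na + NLA_HDRLEN, &pid, sizeof(pid));
	req.n.nlmsg_len = NLMSG_ALIGN(req.n.nlmsg_len) + NLA_ALIGN(na->nla_len);

	if (sendto(fd, &req, req.n.nlmsg_len, 0,
		   (struct sockaddr *)&kernel, sizeof(kernel)) < 0)
		perror("sendto");

	/* The reply nests TASKSTATS_TYPE_PID and TASKSTATS_TYPE_STATS inside
	 * TASKSTATS_TYPE_AGGR_PID; recv() and attribute walking are left out. */
	close(fd);
	return 0;
}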
diff --git a/kernel/time.c b/kernel/time.c
index b00ddc71cedb..5bd489747643 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -523,6 +523,7 @@ EXPORT_SYMBOL(do_gettimeofday);
523 523
524 524
525#else 525#else
526#ifndef CONFIG_GENERIC_TIME
526/* 527/*
527 * Simulate gettimeofday using do_gettimeofday which only allows a timeval 528 * Simulate gettimeofday using do_gettimeofday which only allows a timeval
528 * and therefore only yields usec accuracy 529 * and therefore only yields usec accuracy
@@ -537,6 +538,7 @@ void getnstimeofday(struct timespec *tv)
537} 538}
538EXPORT_SYMBOL_GPL(getnstimeofday); 539EXPORT_SYMBOL_GPL(getnstimeofday);
539#endif 540#endif
541#endif
540 542
541/* Converts Gregorian date to seconds since 1970-01-01 00:00:00. 543/* Converts Gregorian date to seconds since 1970-01-01 00:00:00.
542 * Assumes input in normal date format, i.e. 1980-12-31 23:59:59 544 * Assumes input in normal date format, i.e. 1980-12-31 23:59:59
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
new file mode 100644
index 000000000000..e1dfd8e86cce
--- /dev/null
+++ b/kernel/time/Makefile
@@ -0,0 +1 @@
obj-y += clocksource.o jiffies.o
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
new file mode 100644
index 000000000000..74eca5939bd9
--- /dev/null
+++ b/kernel/time/clocksource.c
@@ -0,0 +1,349 @@
1/*
2 * linux/kernel/time/clocksource.c
3 *
4 * This file contains the functions which manage clocksource drivers.
5 *
6 * Copyright (C) 2004, 2005 IBM, John Stultz (johnstul@us.ibm.com)
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
21 *
22 * TODO WishList:
23 * o Allow clocksource drivers to be unregistered
24 * o get rid of clocksource_jiffies extern
25 */
26
27#include <linux/clocksource.h>
28#include <linux/sysdev.h>
29#include <linux/init.h>
30#include <linux/module.h>
31
32/* XXX - Would like a better way for initializing curr_clocksource */
33extern struct clocksource clocksource_jiffies;
34
35/*[Clocksource internal variables]---------
36 * curr_clocksource:
37 * currently selected clocksource. Initialized to clocksource_jiffies.
38 * next_clocksource:
39 * pending next selected clocksource.
40 * clocksource_list:
41 * linked list with the registered clocksources
42 * clocksource_lock:
43 * protects manipulations to curr_clocksource and next_clocksource
44 * and the clocksource_list
45 * override_name:
46 * Name of the user-specified clocksource.
47 */
48static struct clocksource *curr_clocksource = &clocksource_jiffies;
49static struct clocksource *next_clocksource;
50static LIST_HEAD(clocksource_list);
51static DEFINE_SPINLOCK(clocksource_lock);
52static char override_name[32];
53static int finished_booting;
54
55/* clocksource_done_booting - Called near the end of bootup
56 *
57 * Hack to avoid lots of clocksource churn at boot time
58 */
59static int __init clocksource_done_booting(void)
60{
61 finished_booting = 1;
62 return 0;
63}
64
65late_initcall(clocksource_done_booting);
66
67/**
68 * clocksource_get_next - Returns the selected clocksource
69 *
70 */
71struct clocksource *clocksource_get_next(void)
72{
73 unsigned long flags;
74
75 spin_lock_irqsave(&clocksource_lock, flags);
76 if (next_clocksource && finished_booting) {
77 curr_clocksource = next_clocksource;
78 next_clocksource = NULL;
79 }
80 spin_unlock_irqrestore(&clocksource_lock, flags);
81
82 return curr_clocksource;
83}
84
85/**
86 * select_clocksource - Finds the best registered clocksource.
87 *
88 * Private function. Must hold clocksource_lock when called.
89 *
90 * Looks through the list of registered clocksources, returning
91 * the one with the highest rating value. If there is a clocksource
92 * name that matches the override string, it returns that clocksource.
93 */
94static struct clocksource *select_clocksource(void)
95{
96 struct clocksource *best = NULL;
97 struct list_head *tmp;
98
99 list_for_each(tmp, &clocksource_list) {
100 struct clocksource *src;
101
102 src = list_entry(tmp, struct clocksource, list);
103 if (!best)
104 best = src;
105
106 /* check for override: */
107 if (strlen(src->name) == strlen(override_name) &&
108 !strcmp(src->name, override_name)) {
109 best = src;
110 break;
111 }
112 /* pick the highest rating: */
113 if (src->rating > best->rating)
114 best = src;
115 }
116
117 return best;
118}
119
120/**
121 * is_registered_source - Checks if clocksource is registered
122 * @c: pointer to a clocksource
123 *
124 * Private helper function. Must hold clocksource_lock when called.
125 *
126 * Returns one if the clocksource is already registered, zero otherwise.
127 */
128static int is_registered_source(struct clocksource *c)
129{
130 int len = strlen(c->name);
131 struct list_head *tmp;
132
133 list_for_each(tmp, &clocksource_list) {
134 struct clocksource *src;
135
136 src = list_entry(tmp, struct clocksource, list);
137 if (strlen(src->name) == len && !strcmp(src->name, c->name))
138 return 1;
139 }
140
141 return 0;
142}
143
144/**
145 * clocksource_register - Used to install new clocksources
146 * @c: clocksource to be registered
147 *
148 * Returns -EBUSY if registration fails, zero otherwise.
149 */
150int clocksource_register(struct clocksource *c)
151{
152 int ret = 0;
153 unsigned long flags;
154
155 spin_lock_irqsave(&clocksource_lock, flags);
156 /* check if clocksource is already registered */
157 if (is_registered_source(c)) {
158 printk("clocksource_register: Cannot register %s. "
159 "Already registered!", c->name);
160 ret = -EBUSY;
161 } else {
162 /* register it */
163 list_add(&c->list, &clocksource_list);
164 /* scan the registered clocksources, and pick the best one */
165 next_clocksource = select_clocksource();
166 }
167 spin_unlock_irqrestore(&clocksource_lock, flags);
168 return ret;
169}
170EXPORT_SYMBOL(clocksource_register);
171
172/**
173 * clocksource_reselect - Rescan list for next clocksource
174 *
175 * A quick helper function to be used if a clocksource changes its
176 * rating. Forces the clocksource list to be re-scanned for the best
177 * clocksource.
178 */
179void clocksource_reselect(void)
180{
181 unsigned long flags;
182
183 spin_lock_irqsave(&clocksource_lock, flags);
184 next_clocksource = select_clocksource();
185 spin_unlock_irqrestore(&clocksource_lock, flags);
186}
187EXPORT_SYMBOL(clocksource_reselect);
188
189/**
190 * sysfs_show_current_clocksources - sysfs interface for current clocksource
191 * @dev: unused
192 * @buf: char buffer to be filled with the current clocksource name
193 *
194 * Provides sysfs interface for listing current clocksource.
195 */
196static ssize_t
197sysfs_show_current_clocksources(struct sys_device *dev, char *buf)
198{
199 char *curr = buf;
200
201 spin_lock_irq(&clocksource_lock);
202 curr += sprintf(curr, "%s ", curr_clocksource->name);
203 spin_unlock_irq(&clocksource_lock);
204
205 curr += sprintf(curr, "\n");
206
207 return curr - buf;
208}
209
210/**
211 * sysfs_override_clocksource - interface for manually overriding clocksource
212 * @dev: unused
213 * @buf: name of override clocksource
214 * @count: length of buffer
215 *
216 * Takes input from sysfs interface for manually overriding the default
217 * clocksource selection.
218 */
219static ssize_t sysfs_override_clocksource(struct sys_device *dev,
220 const char *buf, size_t count)
221{
222 size_t ret = count;
223 /* strings from sysfs write are not 0 terminated! */
224 if (count >= sizeof(override_name))
225 return -EINVAL;
226
227 /* strip off \n: */
228 if (buf[count-1] == '\n')
229 count--;
230 if (count < 1)
231 return -EINVAL;
232
233 spin_lock_irq(&clocksource_lock);
234
235 /* copy the name given: */
236 memcpy(override_name, buf, count);
237 override_name[count] = 0;
238
239 /* try to select it: */
240 next_clocksource = select_clocksource();
241
242 spin_unlock_irq(&clocksource_lock);
243
244 return ret;
245}
246
247/**
248 * sysfs_show_available_clocksources - sysfs interface for listing clocksources
249 * @dev: unused
250 * @buf: char buffer to be filled with clocksource list
251 *
252 * Provides sysfs interface for listing registered clocksources
253 */
254static ssize_t
255sysfs_show_available_clocksources(struct sys_device *dev, char *buf)
256{
257 struct list_head *tmp;
258 char *curr = buf;
259
260 spin_lock_irq(&clocksource_lock);
261 list_for_each(tmp, &clocksource_list) {
262 struct clocksource *src;
263
264 src = list_entry(tmp, struct clocksource, list);
265 curr += sprintf(curr, "%s ", src->name);
266 }
267 spin_unlock_irq(&clocksource_lock);
268
269 curr += sprintf(curr, "\n");
270
271 return curr - buf;
272}
273
274/*
275 * Sysfs setup bits:
276 */
277static SYSDEV_ATTR(current_clocksource, 0600, sysfs_show_current_clocksources,
278 sysfs_override_clocksource);
279
280static SYSDEV_ATTR(available_clocksource, 0600,
281 sysfs_show_available_clocksources, NULL);
282
283static struct sysdev_class clocksource_sysclass = {
284 set_kset_name("clocksource"),
285};
286
287static struct sys_device device_clocksource = {
288 .id = 0,
289 .cls = &clocksource_sysclass,
290};
291
292static int __init init_clocksource_sysfs(void)
293{
294 int error = sysdev_class_register(&clocksource_sysclass);
295
296 if (!error)
297 error = sysdev_register(&device_clocksource);
298 if (!error)
299 error = sysdev_create_file(
300 &device_clocksource,
301 &attr_current_clocksource);
302 if (!error)
303 error = sysdev_create_file(
304 &device_clocksource,
305 &attr_available_clocksource);
306 return error;
307}
308
309device_initcall(init_clocksource_sysfs);
310
311/**
312 * boot_override_clocksource - boot clock override
313 * @str: override name
314 *
315 * Takes a clocksource= boot argument and uses it
316 * as the clocksource override name.
317 */
318static int __init boot_override_clocksource(char* str)
319{
320 unsigned long flags;
321 spin_lock_irqsave(&clocksource_lock, flags);
322 if (str)
323 strlcpy(override_name, str, sizeof(override_name));
324 spin_unlock_irqrestore(&clocksource_lock, flags);
325 return 1;
326}
327
328__setup("clocksource=", boot_override_clocksource);
329
330/**
331 * boot_override_clock - Compatibility layer for deprecated boot option
332 * @str: override name
333 *
334 * DEPRECATED! Takes a clock= boot argument and uses it
335 * as the clocksource override name
336 */
337static int __init boot_override_clock(char* str)
338{
339 if (!strcmp(str, "pmtmr")) {
340 printk("Warning: clock=pmtmr is deprecated. "
341 "Use clocksource=acpi_pm.\n");
342 return boot_override_clocksource("acpi_pm");
343 }
344 printk("Warning! clock= boot option is deprecated. "
345 "Use clocksource=xyz\n");
346 return boot_override_clocksource(str);
347}
348
349__setup("clock=", boot_override_clock);
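To give a sense of how a driver plugs into the API above, here is a hedged sketch (not from this patch) that registers a hypothetical free-running 32-bit counter as a clocksource. The counter frequency and the get_cycles()-based read routine are stand-ins for real hardware, and clocksource_hz2mult() is assumed to be the helper in <linux/clocksource.h> for deriving .mult from a frequency and .shift; the jiffies clocksource in the next file is the real in-tree example of the same structure.

/* example_clocksource.c - sketch only.  get_cycles() and EXAMPLE_FREQ_HZ
 * are placeholders for a real hardware counter and its real frequency.
 */
#include <linux/clocksource.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/timex.h>	/* get_cycles() */

#define EXAMPLE_FREQ_HZ	1000000	/* assumed 1 MHz counter */

static cycle_t example_read(void)
{
	return (cycle_t)get_cycles();	/* stand-in for a hardware counter read */
}

static struct clocksource clocksource_example = {
	.name		= "example",
	.rating		= 200,		/* above jiffies (0), below a high-quality TSC-class source */
	.read		= example_read,
	.mask		= 0xffffffff,	/* 32-bit counter, like clocksource_jiffies */
	.shift		= 20,
	.is_continuous	= 1,		/* free running, usable for fine-grained timekeeping */
};

static int __init example_clocksource_init(void)
{
	/* .mult converts counter cycles to (nanoseconds << .shift);
	 * clocksource_hz2mult() is assumed from <linux/clocksource.h>. */
	clocksource_example.mult = clocksource_hz2mult(EXAMPLE_FREQ_HZ,
						       clocksource_example.shift);
	return clocksource_register(&clocksource_example);
}
module_init(example_clocksource_init);
MODULE_LICENSE("GPL");

Once registered, select_clocksource() picks the highest-rated entry, so a source like this would win over jiffies automatically; it could also be forced with clocksource=example on the boot command line or through the current_clocksource sysfs attribute created above.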
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
new file mode 100644
index 000000000000..126bb30c4afe
--- /dev/null
+++ b/kernel/time/jiffies.c
@@ -0,0 +1,73 @@
1/***********************************************************************
2* linux/kernel/time/jiffies.c
3*
4* This file contains the jiffies based clocksource.
5*
6* Copyright (C) 2004, 2005 IBM, John Stultz (johnstul@us.ibm.com)
7*
8* This program is free software; you can redistribute it and/or modify
9* it under the terms of the GNU General Public License as published by
10* the Free Software Foundation; either version 2 of the License, or
11* (at your option) any later version.
12*
13* This program is distributed in the hope that it will be useful,
14* but WITHOUT ANY WARRANTY; without even the implied warranty of
15* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16* GNU General Public License for more details.
17*
18* You should have received a copy of the GNU General Public License
19* along with this program; if not, write to the Free Software
20* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
21*
22************************************************************************/
23#include <linux/clocksource.h>
24#include <linux/jiffies.h>
25#include <linux/init.h>
26
27/* The Jiffies based clocksource is the lowest common
28 * denominator clock source which should function on
29 * all systems. It has the same coarse resolution as
30 * the timer interrupt frequency HZ and it suffers
31 * inaccuracies caused by missed or lost timer
32 * interrupts and the inability for the timer
33 * interrupt hardware to accurately tick at the
34 * requested HZ value. It is also not recommended
35 * for "tick-less" systems.
36 */
37#define NSEC_PER_JIFFY ((u32)((((u64)NSEC_PER_SEC)<<8)/ACTHZ))
38
39/* Since jiffies uses a simple NSEC_PER_JIFFY multiplier
40 * conversion, the .shift value could be zero. However
41 * this would make NTP adjustments impossible as they are
42 * in units of 1/2^.shift. Thus we use JIFFIES_SHIFT to
43 * shift both the numerator and denominator the same
44 * amount, and give ntp adjustments in units of 1/2^8
45 *
46 * The value 8 is somewhat carefully chosen, as anything
47 * larger can result in overflows. NSEC_PER_JIFFY grows as
48 * HZ shrinks, so values greater than 8 overflow 32 bits when
49 * HZ=100.
50 */
51#define JIFFIES_SHIFT 8
52
53static cycle_t jiffies_read(void)
54{
55 return (cycle_t) jiffies;
56}
57
58struct clocksource clocksource_jiffies = {
59 .name = "jiffies",
60 .rating = 0, /* lowest rating*/
61 .read = jiffies_read,
62 .mask = 0xffffffff, /*32bits*/
63 .mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */
64 .shift = JIFFIES_SHIFT,
65 .is_continuous = 0, /* tick based, not free running */
66};
67
68static int __init init_jiffies_clocksource(void)
69{
70 return clocksource_register(&clocksource_jiffies);
71}
72
73module_init(init_jiffies_clocksource);
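The JIFFIES_SHIFT comment above can be checked with plain arithmetic: .mult works out to roughly (NSEC_PER_SEC / HZ) << shift, so at HZ=100 any shift above 8 no longer fits the 32-bit multiplier. A small userspace check of that claim, ignoring the ACTHZ fixed-point correction:

/* jiffies_shift_check.c - userspace arithmetic check of the comment above.
 * mult is approximately (NSEC_PER_SEC / HZ) << shift once the ACTHZ
 * fixed-point correction cancels; ACTHZ is approximated by HZ here.
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	const uint64_t nsec_per_sec = 1000000000ULL;
	const int hz_values[] = { 100, 250, 1000 };
	int i, shift;

	for (i = 0; i < 3; i++) {
		for (shift = 8; shift <= 10; shift++) {
			uint64_t mult = (nsec_per_sec << shift) / hz_values[i];
			printf("HZ=%4d shift=%2d mult=%11llu %s\n",
			       hz_values[i], shift, (unsigned long long)mult,
			       mult > 0xffffffffULL ? "overflows u32" : "fits");
		}
	}
	return 0;
}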
diff --git a/kernel/timer.c b/kernel/timer.c
index 9e49deed468c..05809c2e2fd6 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -146,7 +146,7 @@ static void internal_add_timer(tvec_base_t *base, struct timer_list *timer)
146void fastcall init_timer(struct timer_list *timer) 146void fastcall init_timer(struct timer_list *timer)
147{ 147{
148 timer->entry.next = NULL; 148 timer->entry.next = NULL;
149 timer->base = per_cpu(tvec_bases, raw_smp_processor_id()); 149 timer->base = __raw_get_cpu_var(tvec_bases);
150} 150}
151EXPORT_SYMBOL(init_timer); 151EXPORT_SYMBOL(init_timer);
152 152
@@ -374,6 +374,7 @@ int del_timer_sync(struct timer_list *timer)
374 int ret = try_to_del_timer_sync(timer); 374 int ret = try_to_del_timer_sync(timer);
375 if (ret >= 0) 375 if (ret >= 0)
376 return ret; 376 return ret;
377 cpu_relax();
377 } 378 }
378} 379}
379 380
@@ -383,23 +384,19 @@ EXPORT_SYMBOL(del_timer_sync);
383static int cascade(tvec_base_t *base, tvec_t *tv, int index) 384static int cascade(tvec_base_t *base, tvec_t *tv, int index)
384{ 385{
385 /* cascade all the timers from tv up one level */ 386 /* cascade all the timers from tv up one level */
386 struct list_head *head, *curr; 387 struct timer_list *timer, *tmp;
388 struct list_head tv_list;
389
390 list_replace_init(tv->vec + index, &tv_list);
387 391
388 head = tv->vec + index;
389 curr = head->next;
390 /* 392 /*
391 * We are removing _all_ timers from the list, so we don't have to 393 * We are removing _all_ timers from the list, so we
392 * detach them individually, just clear the list afterwards. 394 * don't have to detach them individually.
393 */ 395 */
394 while (curr != head) { 396 list_for_each_entry_safe(timer, tmp, &tv_list, entry) {
395 struct timer_list *tmp; 397 BUG_ON(timer->base != base);
396 398 internal_add_timer(base, timer);
397 tmp = list_entry(curr, struct timer_list, entry);
398 BUG_ON(tmp->base != base);
399 curr = curr->next;
400 internal_add_timer(base, tmp);
401 } 399 }
402 INIT_LIST_HEAD(head);
403 400
404 return index; 401 return index;
405} 402}
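The rewritten cascade() above relies on list_replace_init() to detach an entire bucket in O(1) and then walk it privately. A hedged sketch of that pattern in isolation, where struct item and drain_bucket() are made-up names:

/* Sketch of the detach pattern used above: list_replace_init() moves every
 * entry from 'bucket' onto the on-stack 'work' head and leaves 'bucket'
 * empty, so the entries can be processed without touching the shared list
 * again.
 */
#include <linux/list.h>

struct item {
	struct list_head entry;
	/* payload ... */
};

static void drain_bucket(struct list_head *bucket)
{
	struct list_head work;
	struct item *pos, *tmp;

	list_replace_init(bucket, &work);	/* bucket is now empty */
	list_for_each_entry_safe(pos, tmp, &work, entry) {
		/* process or re-queue pos, as cascade() re-adds each timer */
	}
}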
@@ -419,10 +416,10 @@ static inline void __run_timers(tvec_base_t *base)
419 416
420 spin_lock_irq(&base->lock); 417 spin_lock_irq(&base->lock);
421 while (time_after_eq(jiffies, base->timer_jiffies)) { 418 while (time_after_eq(jiffies, base->timer_jiffies)) {
422 struct list_head work_list = LIST_HEAD_INIT(work_list); 419 struct list_head work_list;
423 struct list_head *head = &work_list; 420 struct list_head *head = &work_list;
424 int index = base->timer_jiffies & TVR_MASK; 421 int index = base->timer_jiffies & TVR_MASK;
425 422
426 /* 423 /*
427 * Cascade timers: 424 * Cascade timers:
428 */ 425 */
@@ -431,8 +428,8 @@ static inline void __run_timers(tvec_base_t *base)
431 (!cascade(base, &base->tv3, INDEX(1))) && 428 (!cascade(base, &base->tv3, INDEX(1))) &&
432 !cascade(base, &base->tv4, INDEX(2))) 429 !cascade(base, &base->tv4, INDEX(2)))
433 cascade(base, &base->tv5, INDEX(3)); 430 cascade(base, &base->tv5, INDEX(3));
434 ++base->timer_jiffies; 431 ++base->timer_jiffies;
435 list_splice_init(base->tv1.vec + index, &work_list); 432 list_replace_init(base->tv1.vec + index, &work_list);
436 while (!list_empty(head)) { 433 while (!list_empty(head)) {
437 void (*fn)(unsigned long); 434 void (*fn)(unsigned long);
438 unsigned long data; 435 unsigned long data;
@@ -601,7 +598,6 @@ long time_tolerance = MAXFREQ; /* frequency tolerance (ppm) */
601long time_precision = 1; /* clock precision (us) */ 598long time_precision = 1; /* clock precision (us) */
602long time_maxerror = NTP_PHASE_LIMIT; /* maximum error (us) */ 599long time_maxerror = NTP_PHASE_LIMIT; /* maximum error (us) */
603long time_esterror = NTP_PHASE_LIMIT; /* estimated error (us) */ 600long time_esterror = NTP_PHASE_LIMIT; /* estimated error (us) */
604static long time_phase; /* phase offset (scaled us) */
605long time_freq = (((NSEC_PER_SEC + HZ/2) % HZ - HZ/2) << SHIFT_USEC) / NSEC_PER_USEC; 601long time_freq = (((NSEC_PER_SEC + HZ/2) % HZ - HZ/2) << SHIFT_USEC) / NSEC_PER_USEC;
606 /* frequency offset (scaled ppm)*/ 602 /* frequency offset (scaled ppm)*/
607static long time_adj; /* tick adjust (scaled 1 / HZ) */ 603static long time_adj; /* tick adjust (scaled 1 / HZ) */
@@ -751,27 +747,14 @@ static long adjtime_adjustment(void)
751} 747}
752 748
753/* in the NTP reference this is called "hardclock()" */ 749/* in the NTP reference this is called "hardclock()" */
754static void update_wall_time_one_tick(void) 750static void update_ntp_one_tick(void)
755{ 751{
756 long time_adjust_step, delta_nsec; 752 long time_adjust_step;
757 753
758 time_adjust_step = adjtime_adjustment(); 754 time_adjust_step = adjtime_adjustment();
759 if (time_adjust_step) 755 if (time_adjust_step)
760 /* Reduce by this step the amount of time left */ 756 /* Reduce by this step the amount of time left */
761 time_adjust -= time_adjust_step; 757 time_adjust -= time_adjust_step;
762 delta_nsec = tick_nsec + time_adjust_step * 1000;
763 /*
764 * Advance the phase, once it gets to one microsecond, then
765 * advance the tick more.
766 */
767 time_phase += time_adj;
768 if ((time_phase >= FINENSEC) || (time_phase <= -FINENSEC)) {
769 long ltemp = shift_right(time_phase, (SHIFT_SCALE - 10));
770 time_phase -= ltemp << (SHIFT_SCALE - 10);
771 delta_nsec += ltemp;
772 }
773 xtime.tv_nsec += delta_nsec;
774 time_interpolator_update(delta_nsec);
775 758
776 /* Changes by adjtime() do not take effect till next tick. */ 759 /* Changes by adjtime() do not take effect till next tick. */
777 if (time_next_adjust != 0) { 760 if (time_next_adjust != 0) {
@@ -784,36 +767,404 @@ static void update_wall_time_one_tick(void)
784 * Return how long ticks are at the moment, that is, how much time 767 * Return how long ticks are at the moment, that is, how much time
785 * update_wall_time_one_tick will add to xtime next time we call it 768 * update_wall_time_one_tick will add to xtime next time we call it
786 * (assuming no calls to do_adjtimex in the meantime). 769 * (assuming no calls to do_adjtimex in the meantime).
787 * The return value is in fixed-point nanoseconds with SHIFT_SCALE-10 770 * The return value is in fixed-point nanoseconds shifted by the
788 * bits to the right of the binary point. 771 * specified number of bits to the right of the binary point.
789 * This function has no side-effects. 772 * This function has no side-effects.
790 */ 773 */
791u64 current_tick_length(void) 774u64 current_tick_length(void)
792{ 775{
793 long delta_nsec; 776 long delta_nsec;
777 u64 ret;
794 778
779 /* calculate the finest interval NTP will allow.
780 * ie: nanosecond value shifted by (SHIFT_SCALE - 10)
781 */
795 delta_nsec = tick_nsec + adjtime_adjustment() * 1000; 782 delta_nsec = tick_nsec + adjtime_adjustment() * 1000;
796 return ((u64) delta_nsec << (SHIFT_SCALE - 10)) + time_adj; 783 ret = (u64)delta_nsec << TICK_LENGTH_SHIFT;
784 ret += (s64)time_adj << (TICK_LENGTH_SHIFT - (SHIFT_SCALE - 10));
785
786 return ret;
797} 787}
798 788
799/* 789/* XXX - all of this timekeeping code should be later moved to time.c */
800 * Using a loop looks inefficient, but "ticks" is 790#include <linux/clocksource.h>
801 * usually just one (we shouldn't be losing ticks, 791static struct clocksource *clock; /* pointer to current clocksource */
802 * we're doing this this way mainly for interrupt 792
803 * latency reasons, not because we think we'll 793#ifdef CONFIG_GENERIC_TIME
804 * have lots of lost timer ticks 794/**
795 * __get_nsec_offset - Returns nanoseconds since last call to update_wall_time
796 *
797 * private function, must hold xtime_lock lock when being
798 * called. Returns the number of nanoseconds since the
799 * last call to update_wall_time() (adjusted by NTP scaling)
800 */
801static inline s64 __get_nsec_offset(void)
802{
803 cycle_t cycle_now, cycle_delta;
804 s64 ns_offset;
805
806 /* read clocksource: */
807 cycle_now = clocksource_read(clock);
808
809 /* calculate the delta since the last update_wall_time: */
810 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
811
812 /* convert to nanoseconds: */
813 ns_offset = cyc2ns(clock, cycle_delta);
814
815 return ns_offset;
816}
817
818/**
819 * __get_realtime_clock_ts - Returns the time of day in a timespec
820 * @ts: pointer to the timespec to be set
821 *
822 * Returns the time of day in a timespec. Used by
823 * do_gettimeofday() and get_realtime_clock_ts().
805 */ 824 */
806static void update_wall_time(unsigned long ticks) 825static inline void __get_realtime_clock_ts(struct timespec *ts)
807{ 826{
827 unsigned long seq;
828 s64 nsecs;
829
808 do { 830 do {
809 ticks--; 831 seq = read_seqbegin(&xtime_lock);
810 update_wall_time_one_tick(); 832
811 if (xtime.tv_nsec >= 1000000000) { 833 *ts = xtime;
812 xtime.tv_nsec -= 1000000000; 834 nsecs = __get_nsec_offset();
835
836 } while (read_seqretry(&xtime_lock, seq));
837
838 timespec_add_ns(ts, nsecs);
839}
840
841/**
842 * getnstimeofday - Returns the time of day in a timespec
843 * @ts: pointer to the timespec to be set
844 *
845 * Returns the time of day in a timespec.
846 */
847void getnstimeofday(struct timespec *ts)
848{
849 __get_realtime_clock_ts(ts);
850}
851
852EXPORT_SYMBOL(getnstimeofday);
853
854/**
855 * do_gettimeofday - Returns the time of day in a timeval
856 * @tv: pointer to the timeval to be set
857 *
858 * NOTE: Users should be converted to using get_realtime_clock_ts()
859 */
860void do_gettimeofday(struct timeval *tv)
861{
862 struct timespec now;
863
864 __get_realtime_clock_ts(&now);
865 tv->tv_sec = now.tv_sec;
866 tv->tv_usec = now.tv_nsec/1000;
867}
868
869EXPORT_SYMBOL(do_gettimeofday);
870/**
871 * do_settimeofday - Sets the time of day
872 * @tv: pointer to the timespec variable containing the new time
873 *
874 * Sets the time of day to the new time and update NTP and notify hrtimers
875 */
876int do_settimeofday(struct timespec *tv)
877{
878 unsigned long flags;
879 time_t wtm_sec, sec = tv->tv_sec;
880 long wtm_nsec, nsec = tv->tv_nsec;
881
882 if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC)
883 return -EINVAL;
884
885 write_seqlock_irqsave(&xtime_lock, flags);
886
887 nsec -= __get_nsec_offset();
888
889 wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec);
890 wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec);
891
892 set_normalized_timespec(&xtime, sec, nsec);
893 set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec);
894
895 clock->error = 0;
896 ntp_clear();
897
898 write_sequnlock_irqrestore(&xtime_lock, flags);
899
900 /* signal hrtimers about time change */
901 clock_was_set();
902
903 return 0;
904}
905
906EXPORT_SYMBOL(do_settimeofday);
907
908/**
909 * change_clocksource - Swaps clocksources if a new one is available
910 *
911 * Accumulates current time interval and initializes new clocksource
912 */
913static int change_clocksource(void)
914{
915 struct clocksource *new;
916 cycle_t now;
917 u64 nsec;
918 new = clocksource_get_next();
919 if (clock != new) {
920 now = clocksource_read(new);
921 nsec = __get_nsec_offset();
922 timespec_add_ns(&xtime, nsec);
923
924 clock = new;
925 clock->cycle_last = now;
926 printk(KERN_INFO "Time: %s clocksource has been installed.\n",
927 clock->name);
928 return 1;
929 } else if (clock->update_callback) {
930 return clock->update_callback();
931 }
932 return 0;
933}
934#else
935#define change_clocksource() (0)
936#endif
937
938/**
939 * timekeeping_is_continuous - check to see if timekeeping is free running
940 */
941int timekeeping_is_continuous(void)
942{
943 unsigned long seq;
944 int ret;
945
946 do {
947 seq = read_seqbegin(&xtime_lock);
948
949 ret = clock->is_continuous;
950
951 } while (read_seqretry(&xtime_lock, seq));
952
953 return ret;
954}
955
956/*
957 * timekeeping_init - Initializes the clocksource and common timekeeping values
958 */
959void __init timekeeping_init(void)
960{
961 unsigned long flags;
962
963 write_seqlock_irqsave(&xtime_lock, flags);
964 clock = clocksource_get_next();
965 clocksource_calculate_interval(clock, tick_nsec);
966 clock->cycle_last = clocksource_read(clock);
967 ntp_clear();
968 write_sequnlock_irqrestore(&xtime_lock, flags);
969}
970
971
972static int timekeeping_suspended;
973/*
974 * timekeeping_resume - Resumes the generic timekeeping subsystem.
975 * @dev: unused
976 *
977 * This is for the generic clocksource timekeeping.
978 * xtime/wall_to_monotonic/jiffies/wall_jiffies/etc are
979 * still managed by arch specific suspend/resume code.
980 */
981static int timekeeping_resume(struct sys_device *dev)
982{
983 unsigned long flags;
984
985 write_seqlock_irqsave(&xtime_lock, flags);
986 /* restart the last cycle value */
987 clock->cycle_last = clocksource_read(clock);
988 clock->error = 0;
989 timekeeping_suspended = 0;
990 write_sequnlock_irqrestore(&xtime_lock, flags);
991 return 0;
992}
993
994static int timekeeping_suspend(struct sys_device *dev, pm_message_t state)
995{
996 unsigned long flags;
997
998 write_seqlock_irqsave(&xtime_lock, flags);
999 timekeeping_suspended = 1;
1000 write_sequnlock_irqrestore(&xtime_lock, flags);
1001 return 0;
1002}
1003
1004/* sysfs resume/suspend bits for timekeeping */
1005static struct sysdev_class timekeeping_sysclass = {
1006 .resume = timekeeping_resume,
1007 .suspend = timekeeping_suspend,
1008 set_kset_name("timekeeping"),
1009};
1010
1011static struct sys_device device_timer = {
1012 .id = 0,
1013 .cls = &timekeeping_sysclass,
1014};
1015
1016static int __init timekeeping_init_device(void)
1017{
1018 int error = sysdev_class_register(&timekeeping_sysclass);
1019 if (!error)
1020 error = sysdev_register(&device_timer);
1021 return error;
1022}
1023
1024device_initcall(timekeeping_init_device);
1025
1026/*
1027 * If the error is already larger, we look ahead even further
1028 * to compensate for late or lost adjustments.
1029 */
1030static __always_inline int clocksource_bigadjust(s64 error, s64 *interval, s64 *offset)
1031{
1032 s64 tick_error, i;
1033 u32 look_ahead, adj;
1034 s32 error2, mult;
1035
1036 /*
1037 * Use the current error value to determine how much to look ahead.
1038 * The larger the error the slower we adjust for it to avoid problems
1039 * with losing too many ticks, otherwise we would overadjust and
1040 * produce an even larger error. The smaller the adjustment the
1041 * faster we try to adjust for it, as lost ticks can do less harm
1042 * here. This is tuned so that an error of about 1 msec is adjusted
1043 * within about 1 sec (or 2^20 nsec in 2^SHIFT_HZ ticks).
1044 */
1045 error2 = clock->error >> (TICK_LENGTH_SHIFT + 22 - 2 * SHIFT_HZ);
1046 error2 = abs(error2);
1047 for (look_ahead = 0; error2 > 0; look_ahead++)
1048 error2 >>= 2;
1049
1050 /*
1051 * Now calculate the error in (1 << look_ahead) ticks, but first
1052 * remove the single look ahead already included in the error.
1053 */
1054 tick_error = current_tick_length() >> (TICK_LENGTH_SHIFT - clock->shift + 1);
1055 tick_error -= clock->xtime_interval >> 1;
1056 error = ((error - tick_error) >> look_ahead) + tick_error;
1057
1058 /* Finally calculate the adjustment shift value. */
1059 i = *interval;
1060 mult = 1;
1061 if (error < 0) {
1062 error = -error;
1063 *interval = -*interval;
1064 *offset = -*offset;
1065 mult = -1;
1066 }
1067 for (adj = 0; error > i; adj++)
1068 error >>= 1;
1069
1070 *interval <<= adj;
1071 *offset <<= adj;
1072 return mult << adj;
1073}
1074
1075/*
1076 * Adjust the multiplier to reduce the error value;
1077 * this is optimized for the most common adjustments of -1, 0 and 1,
1078 * for other values we can do a bit more work.
1079 */
1080static void clocksource_adjust(struct clocksource *clock, s64 offset)
1081{
1082 s64 error, interval = clock->cycle_interval;
1083 int adj;
1084
1085 error = clock->error >> (TICK_LENGTH_SHIFT - clock->shift - 1);
1086 if (error > interval) {
1087 error >>= 2;
1088 if (likely(error <= interval))
1089 adj = 1;
1090 else
1091 adj = clocksource_bigadjust(error, &interval, &offset);
1092 } else if (error < -interval) {
1093 error >>= 2;
1094 if (likely(error >= -interval)) {
1095 adj = -1;
1096 interval = -interval;
1097 offset = -offset;
1098 } else
1099 adj = clocksource_bigadjust(error, &interval, &offset);
1100 } else
1101 return;
1102
1103 clock->mult += adj;
1104 clock->xtime_interval += interval;
1105 clock->xtime_nsec -= offset;
1106 clock->error -= (interval - offset) << (TICK_LENGTH_SHIFT - clock->shift);
1107}
1108
1109/*
1110 * update_wall_time - Uses the current clocksource to increment the wall time
1111 *
1112 * Called from the timer interrupt, must hold a write on xtime_lock.
1113 */
1114static void update_wall_time(void)
1115{
1116 cycle_t offset;
1117
1118 /* Make sure we're fully resumed: */
1119 if (unlikely(timekeeping_suspended))
1120 return;
1121
1122#ifdef CONFIG_GENERIC_TIME
1123 offset = (clocksource_read(clock) - clock->cycle_last) & clock->mask;
1124#else
1125 offset = clock->cycle_interval;
1126#endif
1127 clock->xtime_nsec += (s64)xtime.tv_nsec << clock->shift;
1128
1129 /* normally this loop will run just once; however, in the
1130 * case of lost or late ticks, it will accumulate correctly.
1131 */
1132 while (offset >= clock->cycle_interval) {
1133 /* accumulate one interval */
1134 clock->xtime_nsec += clock->xtime_interval;
1135 clock->cycle_last += clock->cycle_interval;
1136 offset -= clock->cycle_interval;
1137
1138 if (clock->xtime_nsec >= (u64)NSEC_PER_SEC << clock->shift) {
1139 clock->xtime_nsec -= (u64)NSEC_PER_SEC << clock->shift;
813 xtime.tv_sec++; 1140 xtime.tv_sec++;
814 second_overflow(); 1141 second_overflow();
815 } 1142 }
816 } while (ticks); 1143
1144 /* interpolator bits */
1145 time_interpolator_update(clock->xtime_interval
1146 >> clock->shift);
1147 /* increment the NTP state machine */
1148 update_ntp_one_tick();
1149
1150 /* accumulate error between NTP and clock interval */
1151 clock->error += current_tick_length();
1152 clock->error -= clock->xtime_interval << (TICK_LENGTH_SHIFT - clock->shift);
1153 }
1154
1155 /* correct the clock when NTP error is too big */
1156 clocksource_adjust(clock, offset);
1157
1158 /* store full nanoseconds into xtime */
1159 xtime.tv_nsec = (s64)clock->xtime_nsec >> clock->shift;
1160 clock->xtime_nsec -= (s64)xtime.tv_nsec << clock->shift;
1161
1162 /* check to see if there is a new clocksource to use */
1163 if (change_clocksource()) {
1164 clock->error = 0;
1165 clock->xtime_nsec = 0;
1166 clocksource_calculate_interval(clock, tick_nsec);
1167 }
817} 1168}
818 1169
819/* 1170/*
@@ -884,7 +1235,7 @@ unsigned long wall_jiffies = INITIAL_JIFFIES;
884 * playing with xtime and avenrun. 1235 * playing with xtime and avenrun.
885 */ 1236 */
886#ifndef ARCH_HAVE_XTIME_LOCK 1237#ifndef ARCH_HAVE_XTIME_LOCK
887seqlock_t xtime_lock __cacheline_aligned_in_smp = SEQLOCK_UNLOCKED; 1238__cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock);
888 1239
889EXPORT_SYMBOL(xtime_lock); 1240EXPORT_SYMBOL(xtime_lock);
890#endif 1241#endif
@@ -919,10 +1270,8 @@ static inline void update_times(void)
919 unsigned long ticks; 1270 unsigned long ticks;
920 1271
921 ticks = jiffies - wall_jiffies; 1272 ticks = jiffies - wall_jiffies;
922 if (ticks) { 1273 wall_jiffies += ticks;
923 wall_jiffies += ticks; 1274 update_wall_time();
924 update_wall_time(ticks);
925 }
926 calc_load(ticks); 1275 calc_load(ticks);
927} 1276}
928 1277
@@ -1046,7 +1395,7 @@ asmlinkage long sys_getegid(void)
1046 1395
1047static void process_timeout(unsigned long __data) 1396static void process_timeout(unsigned long __data)
1048{ 1397{
1049 wake_up_process((task_t *)__data); 1398 wake_up_process((struct task_struct *)__data);
1050} 1399}
1051 1400
1052/** 1401/**
@@ -1237,6 +1586,13 @@ asmlinkage long sys_sysinfo(struct sysinfo __user *info)
1237 return 0; 1586 return 0;
1238} 1587}
1239 1588
1589/*
1590 * lockdep: we want to track each per-CPU base as a separate lock-class,
1591 * but timer-bases are kmalloc()-ed, so we need to attach separate
1592 * keys to them:
1593 */
1594static struct lock_class_key base_lock_keys[NR_CPUS];
1595
1240static int __devinit init_timers_cpu(int cpu) 1596static int __devinit init_timers_cpu(int cpu)
1241{ 1597{
1242 int j; 1598 int j;
@@ -1272,6 +1628,8 @@ static int __devinit init_timers_cpu(int cpu)
1272 } 1628 }
1273 1629
1274 spin_lock_init(&base->lock); 1630 spin_lock_init(&base->lock);
1631 lockdep_set_class(&base->lock, base_lock_keys + cpu);
1632
1275 for (j = 0; j < TVN_SIZE; j++) { 1633 for (j = 0; j < TVN_SIZE; j++) {
1276 INIT_LIST_HEAD(base->tv5.vec + j); 1634 INIT_LIST_HEAD(base->tv5.vec + j);
1277 INIT_LIST_HEAD(base->tv4.vec + j); 1635 INIT_LIST_HEAD(base->tv4.vec + j);
@@ -1330,7 +1688,7 @@ static void __devinit migrate_timers(int cpu)
1330} 1688}
1331#endif /* CONFIG_HOTPLUG_CPU */ 1689#endif /* CONFIG_HOTPLUG_CPU */
1332 1690
1333static int timer_cpu_notify(struct notifier_block *self, 1691static int __devinit timer_cpu_notify(struct notifier_block *self,
1334 unsigned long action, void *hcpu) 1692 unsigned long action, void *hcpu)
1335{ 1693{
1336 long cpu = (long)hcpu; 1694 long cpu = (long)hcpu;
@@ -1350,7 +1708,7 @@ static int timer_cpu_notify(struct notifier_block *self,
1350 return NOTIFY_OK; 1708 return NOTIFY_OK;
1351} 1709}
1352 1710
1353static struct notifier_block timers_nb = { 1711static struct notifier_block __devinitdata timers_nb = {
1354 .notifier_call = timer_cpu_notify, 1712 .notifier_call = timer_cpu_notify,
1355}; 1713};
1356 1714
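The heart of the new GENERIC_TIME path above is the accumulation loop in update_wall_time(): cycle deltas are converted to shifted nanoseconds with the clocksource's mult/shift pair and folded into xtime one cycle_interval at a time, with any leftover cycles carried to the next tick. The standalone sketch below mimics just that arithmetic for a hypothetical 1 MHz clocksource at HZ=250; the NTP error feedback (clock->error and clocksource_adjust()) is deliberately left out.

/* wall_time_accum.c - userspace sketch of the accumulation arithmetic in
 * update_wall_time().  The clocksource parameters are assumptions chosen
 * only to make the numbers easy to follow.
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* hypothetical 1 MHz clocksource with shift 20 */
	const uint32_t shift = 20;
	const uint64_t freq = 1000000;
	const uint32_t mult = (uint32_t)(((uint64_t)1000000000 << shift) / freq);

	/* per-tick interval for HZ=250: 4 ms worth of cycles */
	const uint64_t cycle_interval = freq / 250;
	const uint64_t xtime_interval = cycle_interval * mult;

	uint64_t xtime_sec = 0, xtime_nsec = 0;
	uint64_t offset = 300 * cycle_interval + 123;	/* pretend we are 300 ticks (1.2 s) behind */

	while (offset >= cycle_interval) {
		/* accumulate one interval, exactly as the kernel loop does */
		xtime_nsec += xtime_interval;
		offset -= cycle_interval;
		if (xtime_nsec >= ((uint64_t)1000000000 << shift)) {
			xtime_nsec -= (uint64_t)1000000000 << shift;
			xtime_sec++;		/* second_overflow() would run here */
		}
	}
	/* the sub-interval remainder stays pending for the next tick */
	printf("accumulated %llu.%09llu s, %llu cycles left for next tick\n",
	       (unsigned long long)xtime_sec,
	       (unsigned long long)(xtime_nsec >> shift),
	       (unsigned long long)offset);
	return 0;
}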
diff --git a/kernel/unwind.c b/kernel/unwind.c
new file mode 100644
index 000000000000..f69c804c8e62
--- /dev/null
+++ b/kernel/unwind.c
@@ -0,0 +1,918 @@
1/*
2 * Copyright (C) 2002-2006 Novell, Inc.
3 * Jan Beulich <jbeulich@novell.com>
4 * This code is released under version 2 of the GNU GPL.
5 *
6 * A simple API for unwinding kernel stacks. This is used for
7 * debugging and error reporting purposes. The kernel doesn't need
8 * full-blown stack unwinding with all the bells and whistles, so there
9 * is not much point in implementing the full Dwarf2 unwind API.
10 */
11
12#include <linux/unwind.h>
13#include <linux/module.h>
14#include <linux/delay.h>
15#include <linux/stop_machine.h>
16#include <asm/sections.h>
17#include <asm/uaccess.h>
18#include <asm/unaligned.h>
19
20extern char __start_unwind[], __end_unwind[];
21
22#define MAX_STACK_DEPTH 8
23
24#define EXTRA_INFO(f) { \
25 BUILD_BUG_ON_ZERO(offsetof(struct unwind_frame_info, f) \
26 % FIELD_SIZEOF(struct unwind_frame_info, f)) \
27 + offsetof(struct unwind_frame_info, f) \
28 / FIELD_SIZEOF(struct unwind_frame_info, f), \
29 FIELD_SIZEOF(struct unwind_frame_info, f) \
30 }
31#define PTREGS_INFO(f) EXTRA_INFO(regs.f)
32
33static const struct {
34 unsigned offs:BITS_PER_LONG / 2;
35 unsigned width:BITS_PER_LONG / 2;
36} reg_info[] = {
37 UNW_REGISTER_INFO
38};
39
40#undef PTREGS_INFO
41#undef EXTRA_INFO
42
43#ifndef REG_INVALID
44#define REG_INVALID(r) (reg_info[r].width == 0)
45#endif
46
47#define DW_CFA_nop 0x00
48#define DW_CFA_set_loc 0x01
49#define DW_CFA_advance_loc1 0x02
50#define DW_CFA_advance_loc2 0x03
51#define DW_CFA_advance_loc4 0x04
52#define DW_CFA_offset_extended 0x05
53#define DW_CFA_restore_extended 0x06
54#define DW_CFA_undefined 0x07
55#define DW_CFA_same_value 0x08
56#define DW_CFA_register 0x09
57#define DW_CFA_remember_state 0x0a
58#define DW_CFA_restore_state 0x0b
59#define DW_CFA_def_cfa 0x0c
60#define DW_CFA_def_cfa_register 0x0d
61#define DW_CFA_def_cfa_offset 0x0e
62#define DW_CFA_def_cfa_expression 0x0f
63#define DW_CFA_expression 0x10
64#define DW_CFA_offset_extended_sf 0x11
65#define DW_CFA_def_cfa_sf 0x12
66#define DW_CFA_def_cfa_offset_sf 0x13
67#define DW_CFA_val_offset 0x14
68#define DW_CFA_val_offset_sf 0x15
69#define DW_CFA_val_expression 0x16
70#define DW_CFA_lo_user 0x1c
71#define DW_CFA_GNU_window_save 0x2d
72#define DW_CFA_GNU_args_size 0x2e
73#define DW_CFA_GNU_negative_offset_extended 0x2f
74#define DW_CFA_hi_user 0x3f
75
76#define DW_EH_PE_FORM 0x07
77#define DW_EH_PE_native 0x00
78#define DW_EH_PE_leb128 0x01
79#define DW_EH_PE_data2 0x02
80#define DW_EH_PE_data4 0x03
81#define DW_EH_PE_data8 0x04
82#define DW_EH_PE_signed 0x08
83#define DW_EH_PE_ADJUST 0x70
84#define DW_EH_PE_abs 0x00
85#define DW_EH_PE_pcrel 0x10
86#define DW_EH_PE_textrel 0x20
87#define DW_EH_PE_datarel 0x30
88#define DW_EH_PE_funcrel 0x40
89#define DW_EH_PE_aligned 0x50
90#define DW_EH_PE_indirect 0x80
91#define DW_EH_PE_omit 0xff
92
93typedef unsigned long uleb128_t;
94typedef signed long sleb128_t;
95
96static struct unwind_table {
97 struct {
98 unsigned long pc;
99 unsigned long range;
100 } core, init;
101 const void *address;
102 unsigned long size;
103 struct unwind_table *link;
104 const char *name;
105} root_table, *last_table;
106
107struct unwind_item {
108 enum item_location {
109 Nowhere,
110 Memory,
111 Register,
112 Value
113 } where;
114 uleb128_t value;
115};
116
117struct unwind_state {
118 uleb128_t loc, org;
119 const u8 *cieStart, *cieEnd;
120 uleb128_t codeAlign;
121 sleb128_t dataAlign;
122 struct cfa {
123 uleb128_t reg, offs;
124 } cfa;
125 struct unwind_item regs[ARRAY_SIZE(reg_info)];
126 unsigned stackDepth:8;
127 unsigned version:8;
128 const u8 *label;
129 const u8 *stack[MAX_STACK_DEPTH];
130};
131
132static const struct cfa badCFA = { ARRAY_SIZE(reg_info), 1 };
133
134static struct unwind_table *find_table(unsigned long pc)
135{
136 struct unwind_table *table;
137
138 for (table = &root_table; table; table = table->link)
139 if ((pc >= table->core.pc
140 && pc < table->core.pc + table->core.range)
141 || (pc >= table->init.pc
142 && pc < table->init.pc + table->init.range))
143 break;
144
145 return table;
146}
147
148static void init_unwind_table(struct unwind_table *table,
149 const char *name,
150 const void *core_start,
151 unsigned long core_size,
152 const void *init_start,
153 unsigned long init_size,
154 const void *table_start,
155 unsigned long table_size)
156{
157 table->core.pc = (unsigned long)core_start;
158 table->core.range = core_size;
159 table->init.pc = (unsigned long)init_start;
160 table->init.range = init_size;
161 table->address = table_start;
162 table->size = table_size;
163 table->link = NULL;
164 table->name = name;
165}
166
167void __init unwind_init(void)
168{
169 init_unwind_table(&root_table, "kernel",
170 _text, _end - _text,
171 NULL, 0,
172 __start_unwind, __end_unwind - __start_unwind);
173}
174
175#ifdef CONFIG_MODULES
176
177/* Must be called with module_mutex held. */
178void *unwind_add_table(struct module *module,
179 const void *table_start,
180 unsigned long table_size)
181{
182 struct unwind_table *table;
183
184 if (table_size <= 0)
185 return NULL;
186
187 table = kmalloc(sizeof(*table), GFP_KERNEL);
188 if (!table)
189 return NULL;
190
191 init_unwind_table(table, module->name,
192 module->module_core, module->core_size,
193 module->module_init, module->init_size,
194 table_start, table_size);
195
196 if (last_table)
197 last_table->link = table;
198 else
199 root_table.link = table;
200 last_table = table;
201
202 return table;
203}
204
205struct unlink_table_info
206{
207 struct unwind_table *table;
208 int init_only;
209};
210
211static int unlink_table(void *arg)
212{
213 struct unlink_table_info *info = arg;
214 struct unwind_table *table = info->table, *prev;
215
216 for (prev = &root_table; prev->link && prev->link != table; prev = prev->link)
217 ;
218
219 if (prev->link) {
220 if (info->init_only) {
221 table->init.pc = 0;
222 table->init.range = 0;
223 info->table = NULL;
224 } else {
225 prev->link = table->link;
226 if (!prev->link)
227 last_table = prev;
228 }
229 } else
230 info->table = NULL;
231
232 return 0;
233}
234
235/* Must be called with module_mutex held. */
236void unwind_remove_table(void *handle, int init_only)
237{
238 struct unwind_table *table = handle;
239 struct unlink_table_info info;
240
241 if (!table || table == &root_table)
242 return;
243
244 if (init_only && table == last_table) {
245 table->init.pc = 0;
246 table->init.range = 0;
247 return;
248 }
249
250 info.table = table;
251 info.init_only = init_only;
252 stop_machine_run(unlink_table, &info, NR_CPUS);
253
254 if (info.table)
255 kfree(table);
256}
257
258#endif /* CONFIG_MODULES */
259
260static uleb128_t get_uleb128(const u8 **pcur, const u8 *end)
261{
262 const u8 *cur = *pcur;
263 uleb128_t value;
264 unsigned shift;
265
266 for (shift = 0, value = 0; cur < end; shift += 7) {
267 if (shift + 7 > 8 * sizeof(value)
268 && (*cur & 0x7fU) >= (1U << (8 * sizeof(value) - shift))) {
269 cur = end + 1;
270 break;
271 }
272 value |= (uleb128_t)(*cur & 0x7f) << shift;
273 if (!(*cur++ & 0x80))
274 break;
275 }
276 *pcur = cur;
277
278 return value;
279}
280
281static sleb128_t get_sleb128(const u8 **pcur, const u8 *end)
282{
283 const u8 *cur = *pcur;
284 sleb128_t value;
285 unsigned shift;
286
287 for (shift = 0, value = 0; cur < end; shift += 7) {
288 if (shift + 7 > 8 * sizeof(value)
289 && (*cur & 0x7fU) >= (1U << (8 * sizeof(value) - shift))) {
290 cur = end + 1;
291 break;
292 }
293 value |= (sleb128_t)(*cur & 0x7f) << shift;
294 if (!(*cur & 0x80)) {
295 value |= -(*cur++ & 0x40) << shift;
296 break;
297 }
298 }
299 *pcur = cur;
300
301 return value;
302}
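get_uleb128() and get_sleb128() implement standard DWARF LEB128 decoding: 7 payload bits per byte, the high bit as a continuation flag, and sign extension from bit 6 of the final byte for the signed form. The standalone sketch below restates the same rules in userspace and checks them against the worked examples from the DWARF specification (624485 encodes as e5 8e 26, -127 as 81 7f); the kernel versions above differ only in their end-of-buffer and overflow handling.

/* leb128_demo.c - userspace check of the LEB128 rules used above. */
#include <assert.h>
#include <stdio.h>
#include <stdint.h>

static unsigned long uleb128(const uint8_t *p, const uint8_t *end)
{
	unsigned long value = 0;
	unsigned shift = 0;

	while (p < end) {
		value |= (unsigned long)(*p & 0x7f) << shift;
		shift += 7;
		if (!(*p++ & 0x80))
			break;
	}
	return value;
}

static long sleb128(const uint8_t *p, const uint8_t *end)
{
	long value = 0;
	unsigned shift = 0;

	while (p < end) {
		value |= (long)(*p & 0x7f) << shift;
		shift += 7;
		if (!(*p & 0x80)) {
			if (*p & 0x40)		/* sign-extend from the last group */
				value |= -(1L << shift);
			break;
		}
		p++;
	}
	return value;		/* no overflow guard, unlike the kernel version */
}

int main(void)
{
	const uint8_t u[] = { 0xe5, 0x8e, 0x26 };	/* 624485 */
	const uint8_t s[] = { 0x81, 0x7f };		/* -127   */

	printf("uleb128(e5 8e 26) = %lu\n", uleb128(u, u + sizeof(u)));
	printf("sleb128(81 7f)    = %ld\n", sleb128(s, s + sizeof(s)));
	assert(uleb128(u, u + sizeof(u)) == 624485);
	assert(sleb128(s, s + sizeof(s)) == -127);
	return 0;
}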
303
304static unsigned long read_pointer(const u8 **pLoc,
305 const void *end,
306 signed ptrType)
307{
308 unsigned long value = 0;
309 union {
310 const u8 *p8;
311 const u16 *p16u;
312 const s16 *p16s;
313 const u32 *p32u;
314 const s32 *p32s;
315 const unsigned long *pul;
316 } ptr;
317
318 if (ptrType < 0 || ptrType == DW_EH_PE_omit)
319 return 0;
320 ptr.p8 = *pLoc;
321 switch(ptrType & DW_EH_PE_FORM) {
322 case DW_EH_PE_data2:
323 if (end < (const void *)(ptr.p16u + 1))
324 return 0;
325 if(ptrType & DW_EH_PE_signed)
326 value = get_unaligned(ptr.p16s++);
327 else
328 value = get_unaligned(ptr.p16u++);
329 break;
330 case DW_EH_PE_data4:
331#ifdef CONFIG_64BIT
332 if (end < (const void *)(ptr.p32u + 1))
333 return 0;
334 if(ptrType & DW_EH_PE_signed)
335 value = get_unaligned(ptr.p32s++);
336 else
337 value = get_unaligned(ptr.p32u++);
338 break;
339 case DW_EH_PE_data8:
340 BUILD_BUG_ON(sizeof(u64) != sizeof(value));
341#else
342 BUILD_BUG_ON(sizeof(u32) != sizeof(value));
343#endif
344 case DW_EH_PE_native:
345 if (end < (const void *)(ptr.pul + 1))
346 return 0;
347 value = get_unaligned(ptr.pul++);
348 break;
349 case DW_EH_PE_leb128:
350 BUILD_BUG_ON(sizeof(uleb128_t) > sizeof(value));
351 value = ptrType & DW_EH_PE_signed
352 ? get_sleb128(&ptr.p8, end)
353 : get_uleb128(&ptr.p8, end);
354 if ((const void *)ptr.p8 > end)
355 return 0;
356 break;
357 default:
358 return 0;
359 }
360 switch(ptrType & DW_EH_PE_ADJUST) {
361 case DW_EH_PE_abs:
362 break;
363 case DW_EH_PE_pcrel:
364 value += (unsigned long)*pLoc;
365 break;
366 default:
367 return 0;
368 }
369 if ((ptrType & DW_EH_PE_indirect)
370 && __get_user(value, (unsigned long *)value))
371 return 0;
372 *pLoc = ptr.p8;
373
374 return value;
375}
376
377static signed fde_pointer_type(const u32 *cie)
378{
379 const u8 *ptr = (const u8 *)(cie + 2);
380 unsigned version = *ptr;
381
382 if (version != 1)
383 return -1; /* unsupported */
384 if (*++ptr) {
385 const char *aug;
386 const u8 *end = (const u8 *)(cie + 1) + *cie;
387 uleb128_t len;
388
389 /* check if augmentation size is first (and thus present) */
390 if (*ptr != 'z')
391 return -1;
392 /* check if augmentation string is nul-terminated */
393 if ((ptr = memchr(aug = (const void *)ptr, 0, end - ptr)) == NULL)
394 return -1;
395 ++ptr; /* skip terminator */
396 get_uleb128(&ptr, end); /* skip code alignment */
397 get_sleb128(&ptr, end); /* skip data alignment */
398 /* skip return address column */
399 version <= 1 ? (void)++ptr : (void)get_uleb128(&ptr, end);
400 len = get_uleb128(&ptr, end); /* augmentation length */
401 if (ptr + len < ptr || ptr + len > end)
402 return -1;
403 end = ptr + len;
404 while (*++aug) {
405 if (ptr >= end)
406 return -1;
407 switch(*aug) {
408 case 'L':
409 ++ptr;
410 break;
411 case 'P': {
412 signed ptrType = *ptr++;
413
414 if (!read_pointer(&ptr, end, ptrType) || ptr > end)
415 return -1;
416 }
417 break;
418 case 'R':
419 return *ptr;
420 default:
421 return -1;
422 }
423 }
424 }
425 return DW_EH_PE_native|DW_EH_PE_abs;
426}
427
428static int advance_loc(unsigned long delta, struct unwind_state *state)
429{
430 state->loc += delta * state->codeAlign;
431
432 return delta > 0;
433}
434
435static void set_rule(uleb128_t reg,
436 enum item_location where,
437 uleb128_t value,
438 struct unwind_state *state)
439{
440 if (reg < ARRAY_SIZE(state->regs)) {
441 state->regs[reg].where = where;
442 state->regs[reg].value = value;
443 }
444}
445
446static int processCFI(const u8 *start,
447 const u8 *end,
448 unsigned long targetLoc,
449 signed ptrType,
450 struct unwind_state *state)
451{
452 union {
453 const u8 *p8;
454 const u16 *p16;
455 const u32 *p32;
456 } ptr;
457 int result = 1;
458
459 if (start != state->cieStart) {
460 state->loc = state->org;
461 result = processCFI(state->cieStart, state->cieEnd, 0, ptrType, state);
462 if (targetLoc == 0 && state->label == NULL)
463 return result;
464 }
465 for (ptr.p8 = start; result && ptr.p8 < end; ) {
466 switch(*ptr.p8 >> 6) {
467 uleb128_t value;
468
469 case 0:
470 switch(*ptr.p8++) {
471 case DW_CFA_nop:
472 break;
473 case DW_CFA_set_loc:
474 if ((state->loc = read_pointer(&ptr.p8, end, ptrType)) == 0)
475 result = 0;
476 break;
477 case DW_CFA_advance_loc1:
478 result = ptr.p8 < end && advance_loc(*ptr.p8++, state);
479 break;
480 case DW_CFA_advance_loc2:
481 result = ptr.p8 <= end + 2
482 && advance_loc(*ptr.p16++, state);
483 break;
484 case DW_CFA_advance_loc4:
485 result = ptr.p8 <= end + 4
486 && advance_loc(*ptr.p32++, state);
487 break;
488 case DW_CFA_offset_extended:
489 value = get_uleb128(&ptr.p8, end);
490 set_rule(value, Memory, get_uleb128(&ptr.p8, end), state);
491 break;
492 case DW_CFA_val_offset:
493 value = get_uleb128(&ptr.p8, end);
494 set_rule(value, Value, get_uleb128(&ptr.p8, end), state);
495 break;
496 case DW_CFA_offset_extended_sf:
497 value = get_uleb128(&ptr.p8, end);
498 set_rule(value, Memory, get_sleb128(&ptr.p8, end), state);
499 break;
500 case DW_CFA_val_offset_sf:
501 value = get_uleb128(&ptr.p8, end);
502 set_rule(value, Value, get_sleb128(&ptr.p8, end), state);
503 break;
504 case DW_CFA_restore_extended:
505 case DW_CFA_undefined:
506 case DW_CFA_same_value:
507 set_rule(get_uleb128(&ptr.p8, end), Nowhere, 0, state);
508 break;
509 case DW_CFA_register:
510 value = get_uleb128(&ptr.p8, end);
511 set_rule(value,
512 Register,
513 get_uleb128(&ptr.p8, end), state);
514 break;
515 case DW_CFA_remember_state:
516 if (ptr.p8 == state->label) {
517 state->label = NULL;
518 return 1;
519 }
520 if (state->stackDepth >= MAX_STACK_DEPTH)
521 return 0;
522 state->stack[state->stackDepth++] = ptr.p8;
523 break;
524 case DW_CFA_restore_state:
525 if (state->stackDepth) {
526 const uleb128_t loc = state->loc;
527 const u8 *label = state->label;
528
529 state->label = state->stack[state->stackDepth - 1];
530 memcpy(&state->cfa, &badCFA, sizeof(state->cfa));
531 memset(state->regs, 0, sizeof(state->regs));
532 state->stackDepth = 0;
533 result = processCFI(start, end, 0, ptrType, state);
534 state->loc = loc;
535 state->label = label;
536 } else
537 return 0;
538 break;
539 case DW_CFA_def_cfa:
540 state->cfa.reg = get_uleb128(&ptr.p8, end);
541 /*nobreak*/
542 case DW_CFA_def_cfa_offset:
543 state->cfa.offs = get_uleb128(&ptr.p8, end);
544 break;
545 case DW_CFA_def_cfa_sf:
546 state->cfa.reg = get_uleb128(&ptr.p8, end);
547 /*nobreak*/
548 case DW_CFA_def_cfa_offset_sf:
549 state->cfa.offs = get_sleb128(&ptr.p8, end)
550 * state->dataAlign;
551 break;
552 case DW_CFA_def_cfa_register:
553 state->cfa.reg = get_uleb128(&ptr.p8, end);
554 break;
555 /*todo case DW_CFA_def_cfa_expression: */
556 /*todo case DW_CFA_expression: */
557 /*todo case DW_CFA_val_expression: */
558 case DW_CFA_GNU_args_size:
559 get_uleb128(&ptr.p8, end);
560 break;
561 case DW_CFA_GNU_negative_offset_extended:
562 value = get_uleb128(&ptr.p8, end);
563 set_rule(value,
564 Memory,
565 (uleb128_t)0 - get_uleb128(&ptr.p8, end), state);
566 break;
567 case DW_CFA_GNU_window_save:
568 default:
569 result = 0;
570 break;
571 }
572 break;
573 case 1:
574 result = advance_loc(*ptr.p8++ & 0x3f, state);
575 break;
576 case 2:
577 value = *ptr.p8++ & 0x3f;
578 set_rule(value, Memory, get_uleb128(&ptr.p8, end), state);
579 break;
580 case 3:
581 set_rule(*ptr.p8++ & 0x3f, Nowhere, 0, state);
582 break;
583 }
584 if (ptr.p8 > end)
585 result = 0;
586 if (result && targetLoc != 0 && targetLoc < state->loc)
587 return 1;
588 }
589
590 return result
591 && ptr.p8 == end
592 && (targetLoc == 0
593 || (/*todo While in theory this should apply, gcc in practice omits
594 everything past the function prolog, and hence the location
595 never reaches the end of the function.
596 targetLoc < state->loc &&*/ state->label == NULL));
597}
598
599/* Unwind to the previous frame. Returns 0 if successful, negative
600 * number in case of an error. */
601int unwind(struct unwind_frame_info *frame)
602{
603#define FRAME_REG(r, t) (((t *)frame)[reg_info[r].offs])
604 const u32 *fde = NULL, *cie = NULL;
605 const u8 *ptr = NULL, *end = NULL;
606 unsigned long startLoc = 0, endLoc = 0, cfa;
607 unsigned i;
608 signed ptrType = -1;
609 uleb128_t retAddrReg = 0;
610 struct unwind_table *table;
611 struct unwind_state state;
612
613 if (UNW_PC(frame) == 0)
614 return -EINVAL;
615 if ((table = find_table(UNW_PC(frame))) != NULL
616 && !(table->size & (sizeof(*fde) - 1))) {
617 unsigned long tableSize = table->size;
618
619 for (fde = table->address;
620 tableSize > sizeof(*fde) && tableSize - sizeof(*fde) >= *fde;
621 tableSize -= sizeof(*fde) + *fde,
622 fde += 1 + *fde / sizeof(*fde)) {
623 if (!*fde || (*fde & (sizeof(*fde) - 1)))
624 break;
625 if (!fde[1])
626 continue; /* this is a CIE */
627 if ((fde[1] & (sizeof(*fde) - 1))
628 || fde[1] > (unsigned long)(fde + 1)
629 - (unsigned long)table->address)
630 continue; /* this is not a valid FDE */
631 cie = fde + 1 - fde[1] / sizeof(*fde);
632 if (*cie <= sizeof(*cie) + 4
633 || *cie >= fde[1] - sizeof(*fde)
634 || (*cie & (sizeof(*cie) - 1))
635 || cie[1]
636 || (ptrType = fde_pointer_type(cie)) < 0) {
637 cie = NULL; /* this is not a (valid) CIE */
638 continue;
639 }
640 ptr = (const u8 *)(fde + 2);
641 startLoc = read_pointer(&ptr,
642 (const u8 *)(fde + 1) + *fde,
643 ptrType);
644 endLoc = startLoc
645 + read_pointer(&ptr,
646 (const u8 *)(fde + 1) + *fde,
647 ptrType & DW_EH_PE_indirect
648 ? ptrType
649 : ptrType & (DW_EH_PE_FORM|DW_EH_PE_signed));
650 if (UNW_PC(frame) >= startLoc && UNW_PC(frame) < endLoc)
651 break;
652 cie = NULL;
653 }
654 }
655 if (cie != NULL) {
656 memset(&state, 0, sizeof(state));
657 state.cieEnd = ptr; /* keep here temporarily */
658 ptr = (const u8 *)(cie + 2);
659 end = (const u8 *)(cie + 1) + *cie;
660 if ((state.version = *ptr) != 1)
661 cie = NULL; /* unsupported version */
662 else if (*++ptr) {
663 /* check if augmentation size is first (and thus present) */
664 if (*ptr == 'z') {
665 /* check for ignorable (or already handled)
666 * nul-terminated augmentation string */
667 while (++ptr < end && *ptr)
668 if (strchr("LPR", *ptr) == NULL)
669 break;
670 }
671 if (ptr >= end || *ptr)
672 cie = NULL;
673 }
674 ++ptr;
675 }
676 if (cie != NULL) {
677		/* get code alignment factor */
678 state.codeAlign = get_uleb128(&ptr, end);
679		/* get data alignment factor */
680 state.dataAlign = get_sleb128(&ptr, end);
681 if (state.codeAlign == 0 || state.dataAlign == 0 || ptr >= end)
682 cie = NULL;
683 else {
684 retAddrReg = state.version <= 1 ? *ptr++ : get_uleb128(&ptr, end);
685 /* skip augmentation */
686 if (((const char *)(cie + 2))[1] == 'z')
687 ptr += get_uleb128(&ptr, end);
688 if (ptr > end
689 || retAddrReg >= ARRAY_SIZE(reg_info)
690 || REG_INVALID(retAddrReg)
691 || reg_info[retAddrReg].width != sizeof(unsigned long))
692 cie = NULL;
693 }
694 }
695 if (cie != NULL) {
696 state.cieStart = ptr;
697 ptr = state.cieEnd;
698 state.cieEnd = end;
699 end = (const u8 *)(fde + 1) + *fde;
700 /* skip augmentation */
701 if (((const char *)(cie + 2))[1] == 'z') {
702 uleb128_t augSize = get_uleb128(&ptr, end);
703
704 if ((ptr += augSize) > end)
705 fde = NULL;
706 }
707 }
708 if (cie == NULL || fde == NULL) {
709#ifdef CONFIG_FRAME_POINTER
710 unsigned long top, bottom;
711#endif
712
713#ifdef CONFIG_FRAME_POINTER
714 top = STACK_TOP(frame->task);
715 bottom = STACK_BOTTOM(frame->task);
716# if FRAME_RETADDR_OFFSET < 0
717 if (UNW_SP(frame) < top
718 && UNW_FP(frame) <= UNW_SP(frame)
719 && bottom < UNW_FP(frame)
720# else
721 if (UNW_SP(frame) > top
722 && UNW_FP(frame) >= UNW_SP(frame)
723 && bottom > UNW_FP(frame)
724# endif
725 && !((UNW_SP(frame) | UNW_FP(frame))
726 & (sizeof(unsigned long) - 1))) {
727 unsigned long link;
728
729 if (!__get_user(link,
730 (unsigned long *)(UNW_FP(frame)
731 + FRAME_LINK_OFFSET))
732# if FRAME_RETADDR_OFFSET < 0
733 && link > bottom && link < UNW_FP(frame)
734# else
735 && link > UNW_FP(frame) && link < bottom
736# endif
737 && !(link & (sizeof(link) - 1))
738 && !__get_user(UNW_PC(frame),
739 (unsigned long *)(UNW_FP(frame)
740 + FRAME_RETADDR_OFFSET))) {
741 UNW_SP(frame) = UNW_FP(frame) + FRAME_RETADDR_OFFSET
742# if FRAME_RETADDR_OFFSET < 0
743 -
744# else
745 +
746# endif
747 sizeof(UNW_PC(frame));
748 UNW_FP(frame) = link;
749 return 0;
750 }
751 }
752#endif
753 return -ENXIO;
754 }
755 state.org = startLoc;
756 memcpy(&state.cfa, &badCFA, sizeof(state.cfa));
757 /* process instructions */
758 if (!processCFI(ptr, end, UNW_PC(frame), ptrType, &state)
759 || state.loc > endLoc
760 || state.regs[retAddrReg].where == Nowhere
761 || state.cfa.reg >= ARRAY_SIZE(reg_info)
762 || reg_info[state.cfa.reg].width != sizeof(unsigned long)
763 || state.cfa.offs % sizeof(unsigned long))
764 return -EIO;
765 /* update frame */
766 cfa = FRAME_REG(state.cfa.reg, unsigned long) + state.cfa.offs;
767 startLoc = min((unsigned long)UNW_SP(frame), cfa);
768 endLoc = max((unsigned long)UNW_SP(frame), cfa);
769 if (STACK_LIMIT(startLoc) != STACK_LIMIT(endLoc)) {
770 startLoc = min(STACK_LIMIT(cfa), cfa);
771 endLoc = max(STACK_LIMIT(cfa), cfa);
772 }
773#ifndef CONFIG_64BIT
774# define CASES CASE(8); CASE(16); CASE(32)
775#else
776# define CASES CASE(8); CASE(16); CASE(32); CASE(64)
777#endif
778 for (i = 0; i < ARRAY_SIZE(state.regs); ++i) {
779 if (REG_INVALID(i)) {
780 if (state.regs[i].where == Nowhere)
781 continue;
782 return -EIO;
783 }
784 switch(state.regs[i].where) {
785 default:
786 break;
787 case Register:
788 if (state.regs[i].value >= ARRAY_SIZE(reg_info)
789 || REG_INVALID(state.regs[i].value)
790 || reg_info[i].width > reg_info[state.regs[i].value].width)
791 return -EIO;
792 switch(reg_info[state.regs[i].value].width) {
793#define CASE(n) \
794 case sizeof(u##n): \
795 state.regs[i].value = FRAME_REG(state.regs[i].value, \
796 const u##n); \
797 break
798 CASES;
799#undef CASE
800 default:
801 return -EIO;
802 }
803 break;
804 }
805 }
806 for (i = 0; i < ARRAY_SIZE(state.regs); ++i) {
807 if (REG_INVALID(i))
808 continue;
809 switch(state.regs[i].where) {
810 case Nowhere:
811 if (reg_info[i].width != sizeof(UNW_SP(frame))
812 || &FRAME_REG(i, __typeof__(UNW_SP(frame)))
813 != &UNW_SP(frame))
814 continue;
815 UNW_SP(frame) = cfa;
816 break;
817 case Register:
818 switch(reg_info[i].width) {
819#define CASE(n) case sizeof(u##n): \
820 FRAME_REG(i, u##n) = state.regs[i].value; \
821 break
822 CASES;
823#undef CASE
824 default:
825 return -EIO;
826 }
827 break;
828 case Value:
829 if (reg_info[i].width != sizeof(unsigned long))
830 return -EIO;
831 FRAME_REG(i, unsigned long) = cfa + state.regs[i].value
832 * state.dataAlign;
833 break;
834 case Memory: {
835 unsigned long addr = cfa + state.regs[i].value
836 * state.dataAlign;
837
838 if ((state.regs[i].value * state.dataAlign)
839 % sizeof(unsigned long)
840 || addr < startLoc
841 || addr + sizeof(unsigned long) < addr
842 || addr + sizeof(unsigned long) > endLoc)
843 return -EIO;
844 switch(reg_info[i].width) {
845#define CASE(n) case sizeof(u##n): \
846 __get_user(FRAME_REG(i, u##n), (u##n *)addr); \
847 break
848 CASES;
849#undef CASE
850 default:
851 return -EIO;
852 }
853 }
854 break;
855 }
856 }
857
858 return 0;
859#undef CASES
860#undef FRAME_REG
861}
862EXPORT_SYMBOL(unwind);
863
864int unwind_init_frame_info(struct unwind_frame_info *info,
865 struct task_struct *tsk,
866 /*const*/ struct pt_regs *regs)
867{
868 info->task = tsk;
869 arch_unw_init_frame_info(info, regs);
870
871 return 0;
872}
873EXPORT_SYMBOL(unwind_init_frame_info);
874
875/*
876 * Prepare to unwind a blocked task.
877 */
878int unwind_init_blocked(struct unwind_frame_info *info,
879 struct task_struct *tsk)
880{
881 info->task = tsk;
882 arch_unw_init_blocked(info);
883
884 return 0;
885}
886EXPORT_SYMBOL(unwind_init_blocked);
887
888/*
889 * Prepare to unwind the currently running thread.
890 */
891int unwind_init_running(struct unwind_frame_info *info,
892 asmlinkage int (*callback)(struct unwind_frame_info *,
893 void *arg),
894 void *arg)
895{
896 info->task = current;
897
898 return arch_unwind_init_running(info, callback, arg);
899}
900EXPORT_SYMBOL(unwind_init_running);
901
902/*
903 * Unwind until the return pointer is in user-land (or until an error
904 * occurs). Returns 0 if successful, negative number in case of
905 * error.
906 */
907int unwind_to_user(struct unwind_frame_info *info)
908{
909 while (!arch_unw_user_mode(info)) {
910 int err = unwind(info);
911
912 if (err < 0)
913 return err;
914 }
915
916 return 0;
917}
918EXPORT_SYMBOL(unwind_to_user);
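The exported entry points above (unwind_init_frame_info(), unwind_init_blocked(), unwind_init_running(), unwind() and unwind_to_user()) are the whole public surface of the new DWARF2 unwinder. As a rough usage sketch, dumping a sleeping task's kernel stack is just the unwind_to_user() loop with a print in the middle; show_task_trace() and its printk format are hypothetical, while UNW_PC() and arch_unw_user_mode() are the arch helpers this file already relies on:

#include <linux/unwind.h>
#include <linux/sched.h>
#include <linux/kernel.h>

/* Hypothetical helper: print the kernel stack of a blocked task. */
static void show_task_trace(struct task_struct *tsk)
{
	struct unwind_frame_info info;

	if (unwind_init_blocked(&info, tsk) < 0)
		return;
	/* Walk frames until user mode is reached or the CFI data runs out. */
	while (!arch_unw_user_mode(&info)) {
		printk(KERN_DEBUG "  [<%lx>]\n", UNW_PC(&info));
		if (unwind(&info) < 0)
			break;	/* no FDE for this PC, or malformed CFI */
	}
}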
diff --git a/kernel/user.c b/kernel/user.c
index 2116642f42c6..6408c0424291 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -140,7 +140,7 @@ struct user_struct * alloc_uid(uid_t uid)
140 atomic_set(&new->processes, 0); 140 atomic_set(&new->processes, 0);
141 atomic_set(&new->files, 0); 141 atomic_set(&new->files, 0);
142 atomic_set(&new->sigpending, 0); 142 atomic_set(&new->sigpending, 0);
143#ifdef CONFIG_INOTIFY 143#ifdef CONFIG_INOTIFY_USER
144 atomic_set(&new->inotify_watches, 0); 144 atomic_set(&new->inotify_watches, 0);
145 atomic_set(&new->inotify_devs, 0); 145 atomic_set(&new->inotify_devs, 0);
146#endif 146#endif
@@ -148,7 +148,7 @@ struct user_struct * alloc_uid(uid_t uid)
148 new->mq_bytes = 0; 148 new->mq_bytes = 0;
149 new->locked_shm = 0; 149 new->locked_shm = 0;
150 150
151 if (alloc_uid_keyring(new) < 0) { 151 if (alloc_uid_keyring(new, current) < 0) {
152 kmem_cache_free(uid_cachep, new); 152 kmem_cache_free(uid_cachep, new);
153 return NULL; 153 return NULL;
154 } 154 }
diff --git a/kernel/wait.c b/kernel/wait.c
index 791681cfea98..59a82f63275d 100644
--- a/kernel/wait.c
+++ b/kernel/wait.c
@@ -3,7 +3,6 @@
3 * 3 *
4 * (C) 2004 William Irwin, Oracle 4 * (C) 2004 William Irwin, Oracle
5 */ 5 */
6#include <linux/config.h>
7#include <linux/init.h> 6#include <linux/init.h>
8#include <linux/module.h> 7#include <linux/module.h>
9#include <linux/sched.h> 8#include <linux/sched.h>
@@ -11,6 +10,14 @@
11#include <linux/wait.h> 10#include <linux/wait.h>
12#include <linux/hash.h> 11#include <linux/hash.h>
13 12
13void init_waitqueue_head(wait_queue_head_t *q)
14{
15 spin_lock_init(&q->lock);
16 INIT_LIST_HEAD(&q->task_list);
17}
18
19EXPORT_SYMBOL(init_waitqueue_head);
20
14void fastcall add_wait_queue(wait_queue_head_t *q, wait_queue_t *wait) 21void fastcall add_wait_queue(wait_queue_head_t *q, wait_queue_t *wait)
15{ 22{
16 unsigned long flags; 23 unsigned long flags;
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 880fb415a8f6..eebb1d839235 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -51,7 +51,7 @@ struct cpu_workqueue_struct {
51 wait_queue_head_t work_done; 51 wait_queue_head_t work_done;
52 52
53 struct workqueue_struct *wq; 53 struct workqueue_struct *wq;
54 task_t *thread; 54 struct task_struct *thread;
55 55
56 int run_depth; /* Detect run_workqueue() recursion depth */ 56 int run_depth; /* Detect run_workqueue() recursion depth */
57} ____cacheline_aligned; 57} ____cacheline_aligned;
@@ -114,6 +114,7 @@ int fastcall queue_work(struct workqueue_struct *wq, struct work_struct *work)
114 put_cpu(); 114 put_cpu();
115 return ret; 115 return ret;
116} 116}
117EXPORT_SYMBOL_GPL(queue_work);
117 118
118static void delayed_work_timer_fn(unsigned long __data) 119static void delayed_work_timer_fn(unsigned long __data)
119{ 120{
@@ -147,6 +148,29 @@ int fastcall queue_delayed_work(struct workqueue_struct *wq,
147 } 148 }
148 return ret; 149 return ret;
149} 150}
151EXPORT_SYMBOL_GPL(queue_delayed_work);
152
153int queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
154 struct work_struct *work, unsigned long delay)
155{
156 int ret = 0;
157 struct timer_list *timer = &work->timer;
158
159 if (!test_and_set_bit(0, &work->pending)) {
160 BUG_ON(timer_pending(timer));
161 BUG_ON(!list_empty(&work->entry));
162
163 /* This stores wq for the moment, for the timer_fn */
164 work->wq_data = wq;
165 timer->expires = jiffies + delay;
166 timer->data = (unsigned long)work;
167 timer->function = delayed_work_timer_fn;
168 add_timer_on(timer, cpu);
169 ret = 1;
170 }
171 return ret;
172}
173EXPORT_SYMBOL_GPL(queue_delayed_work_on);
150 174
151static void run_workqueue(struct cpu_workqueue_struct *cwq) 175static void run_workqueue(struct cpu_workqueue_struct *cwq)
152{ 176{
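The new queue_delayed_work_on() is queue_delayed_work() with the timer armed through add_timer_on(), so the delayed work item fires on a caller-chosen CPU. A minimal sketch against the three-argument INIT_WORK() API of this kernel generation; my_wq, my_work and my_work_fn are invented names, not part of this patch:

#include <linux/workqueue.h>
#include <linux/jiffies.h>
#include <linux/init.h>

static struct workqueue_struct *my_wq;	/* hypothetical private queue */
static struct work_struct my_work;

static void my_work_fn(void *data)
{
	/* runs in my_wq's per-CPU worker thread on the CPU chosen below */
}

static int __init my_module_init(void)
{
	my_wq = create_workqueue("my_wq");
	if (!my_wq)
		return -ENOMEM;
	INIT_WORK(&my_work, my_work_fn, NULL);
	/* fire on CPU 0 roughly one second from now */
	queue_delayed_work_on(0, my_wq, &my_work, HZ);
	return 0;
}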
@@ -281,6 +305,7 @@ void fastcall flush_workqueue(struct workqueue_struct *wq)
281 unlock_cpu_hotplug(); 305 unlock_cpu_hotplug();
282 } 306 }
283} 307}
308EXPORT_SYMBOL_GPL(flush_workqueue);
284 309
285static struct task_struct *create_workqueue_thread(struct workqueue_struct *wq, 310static struct task_struct *create_workqueue_thread(struct workqueue_struct *wq,
286 int cpu) 311 int cpu)
@@ -358,6 +383,7 @@ struct workqueue_struct *__create_workqueue(const char *name,
358 } 383 }
359 return wq; 384 return wq;
360} 385}
386EXPORT_SYMBOL_GPL(__create_workqueue);
361 387
362static void cleanup_workqueue_thread(struct workqueue_struct *wq, int cpu) 388static void cleanup_workqueue_thread(struct workqueue_struct *wq, int cpu)
363{ 389{
@@ -395,6 +421,7 @@ void destroy_workqueue(struct workqueue_struct *wq)
395 free_percpu(wq->cpu_wq); 421 free_percpu(wq->cpu_wq);
396 kfree(wq); 422 kfree(wq);
397} 423}
424EXPORT_SYMBOL_GPL(destroy_workqueue);
398 425
399static struct workqueue_struct *keventd_wq; 426static struct workqueue_struct *keventd_wq;
400 427
@@ -402,48 +429,49 @@ int fastcall schedule_work(struct work_struct *work)
402{ 429{
403 return queue_work(keventd_wq, work); 430 return queue_work(keventd_wq, work);
404} 431}
432EXPORT_SYMBOL(schedule_work);
405 433
406int fastcall schedule_delayed_work(struct work_struct *work, unsigned long delay) 434int fastcall schedule_delayed_work(struct work_struct *work, unsigned long delay)
407{ 435{
408 return queue_delayed_work(keventd_wq, work, delay); 436 return queue_delayed_work(keventd_wq, work, delay);
409} 437}
438EXPORT_SYMBOL(schedule_delayed_work);
410 439
411int schedule_delayed_work_on(int cpu, 440int schedule_delayed_work_on(int cpu,
412 struct work_struct *work, unsigned long delay) 441 struct work_struct *work, unsigned long delay)
413{ 442{
414 int ret = 0; 443 return queue_delayed_work_on(cpu, keventd_wq, work, delay);
415 struct timer_list *timer = &work->timer;
416
417 if (!test_and_set_bit(0, &work->pending)) {
418 BUG_ON(timer_pending(timer));
419 BUG_ON(!list_empty(&work->entry));
420 /* This stores keventd_wq for the moment, for the timer_fn */
421 work->wq_data = keventd_wq;
422 timer->expires = jiffies + delay;
423 timer->data = (unsigned long)work;
424 timer->function = delayed_work_timer_fn;
425 add_timer_on(timer, cpu);
426 ret = 1;
427 }
428 return ret;
429} 444}
445EXPORT_SYMBOL(schedule_delayed_work_on);
430 446
431int schedule_on_each_cpu(void (*func) (void *info), void *info) 447/**
448 * schedule_on_each_cpu - call a function on each online CPU from keventd
449 * @func: the function to call
450 * @info: a pointer to pass to func()
451 *
452 * Returns zero on success.
453 * Returns -ve errno on failure.
454 *
455 * Appears to be racy against CPU hotplug.
456 *
457 * schedule_on_each_cpu() is very slow.
458 */
459int schedule_on_each_cpu(void (*func)(void *info), void *info)
432{ 460{
433 int cpu; 461 int cpu;
434 struct work_struct *work; 462 struct work_struct *works;
435 463
436 work = kmalloc(NR_CPUS * sizeof(struct work_struct), GFP_KERNEL); 464 works = alloc_percpu(struct work_struct);
437 465 if (!works)
438 if (!work)
439 return -ENOMEM; 466 return -ENOMEM;
467
440 for_each_online_cpu(cpu) { 468 for_each_online_cpu(cpu) {
441 INIT_WORK(work + cpu, func, info); 469 INIT_WORK(per_cpu_ptr(works, cpu), func, info);
442 __queue_work(per_cpu_ptr(keventd_wq->cpu_wq, cpu), 470 __queue_work(per_cpu_ptr(keventd_wq->cpu_wq, cpu),
443 work + cpu); 471 per_cpu_ptr(works, cpu));
444 } 472 }
445 flush_workqueue(keventd_wq); 473 flush_workqueue(keventd_wq);
446 kfree(work); 474 free_percpu(works);
447 return 0; 475 return 0;
448} 476}
449 477
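As the new kerneldoc states, schedule_on_each_cpu() queues one per-CPU work item on every online CPU's keventd thread and then flushes keventd_wq, so it returns only after every callback has run (it is slow, and noted as possibly racy against CPU hotplug). A minimal caller sketch; drain_local_caches() and drain_all_cpus() are invented names:

#include <linux/workqueue.h>
#include <linux/kernel.h>

/* Hypothetical callback; runs once on each online CPU in keventd context. */
static void drain_local_caches(void *unused)
{
	/* touch only this CPU's private state here */
}

static int drain_all_cpus(void)
{
	int err = schedule_on_each_cpu(drain_local_caches, NULL);

	if (err)
		printk(KERN_WARNING "per-CPU drain failed: %d\n", err);
	return err;
}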
@@ -451,6 +479,7 @@ void flush_scheduled_work(void)
451{ 479{
452 flush_workqueue(keventd_wq); 480 flush_workqueue(keventd_wq);
453} 481}
482EXPORT_SYMBOL(flush_scheduled_work);
454 483
455/** 484/**
456 * cancel_rearming_delayed_workqueue - reliably kill off a delayed 485 * cancel_rearming_delayed_workqueue - reliably kill off a delayed
@@ -531,11 +560,11 @@ int current_is_keventd(void)
531static void take_over_work(struct workqueue_struct *wq, unsigned int cpu) 560static void take_over_work(struct workqueue_struct *wq, unsigned int cpu)
532{ 561{
533 struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu); 562 struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu);
534 LIST_HEAD(list); 563 struct list_head list;
535 struct work_struct *work; 564 struct work_struct *work;
536 565
537 spin_lock_irq(&cwq->lock); 566 spin_lock_irq(&cwq->lock);
538 list_splice_init(&cwq->worklist, &list); 567 list_replace_init(&cwq->worklist, &list);
539 568
540 while (!list_empty(&list)) { 569 while (!list_empty(&list)) {
541 printk("Taking work for %s\n", wq->name); 570 printk("Taking work for %s\n", wq->name);
@@ -547,7 +576,7 @@ static void take_over_work(struct workqueue_struct *wq, unsigned int cpu)
547} 576}
548 577
549/* We're holding the cpucontrol mutex here */ 578/* We're holding the cpucontrol mutex here */
550static int workqueue_cpu_callback(struct notifier_block *nfb, 579static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
551 unsigned long action, 580 unsigned long action,
552 void *hcpu) 581 void *hcpu)
553{ 582{
@@ -578,6 +607,8 @@ static int workqueue_cpu_callback(struct notifier_block *nfb,
578 607
579 case CPU_UP_CANCELED: 608 case CPU_UP_CANCELED:
580 list_for_each_entry(wq, &workqueues, list) { 609 list_for_each_entry(wq, &workqueues, list) {
610 if (!per_cpu_ptr(wq->cpu_wq, hotcpu)->thread)
611 continue;
581 /* Unbind so it can run. */ 612 /* Unbind so it can run. */
582 kthread_bind(per_cpu_ptr(wq->cpu_wq, hotcpu)->thread, 613 kthread_bind(per_cpu_ptr(wq->cpu_wq, hotcpu)->thread,
583 any_online_cpu(cpu_online_map)); 614 any_online_cpu(cpu_online_map));
@@ -605,13 +636,3 @@ void init_workqueues(void)
605 BUG_ON(!keventd_wq); 636 BUG_ON(!keventd_wq);
606} 637}
607 638
608EXPORT_SYMBOL_GPL(__create_workqueue);
609EXPORT_SYMBOL_GPL(queue_work);
610EXPORT_SYMBOL_GPL(queue_delayed_work);
611EXPORT_SYMBOL_GPL(flush_workqueue);
612EXPORT_SYMBOL_GPL(destroy_workqueue);
613
614EXPORT_SYMBOL(schedule_work);
615EXPORT_SYMBOL(schedule_delayed_work);
616EXPORT_SYMBOL(schedule_delayed_work_on);
617EXPORT_SYMBOL(flush_scheduled_work);