author     Steven Whitehouse <swhiteho@redhat.com>    2006-07-03 10:25:08 -0400
committer  Steven Whitehouse <swhiteho@redhat.com>    2006-07-03 10:25:08 -0400
commit     0a1340c185734a57fbf4775927966ad4a1347b02 (patch)
tree       d9ed8f0dd809a7c542a3356601125ea5b5aaa804 /kernel
parent     af18ddb8864b096e3ed4732e2d4b21c956dcfe3a (diff)
parent     29454dde27d8e340bb1987bad9aa504af7081eba (diff)
Merge rsync://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux-2.6
Conflicts: include/linux/kernel.h
Diffstat (limited to 'kernel')
-rw-r--r--	kernel/Makefile	6
-rw-r--r--	kernel/acct.c	119
-rw-r--r--	kernel/audit.c	215
-rw-r--r--	kernel/audit.h	62
-rw-r--r--	kernel/auditfilter.c	1106
-rw-r--r--	kernel/auditsc.c	716
-rw-r--r--	kernel/compat.c	30
-rw-r--r--	kernel/configs.c	1
-rw-r--r--	kernel/cpu.c	18
-rw-r--r--	kernel/cpuset.c	45
-rw-r--r--	kernel/exec_domain.c	1
-rw-r--r--	kernel/exit.c	31
-rw-r--r--	kernel/fork.c	41
-rw-r--r--	kernel/futex.c	1077
-rw-r--r--	kernel/futex_compat.c	14
-rw-r--r--	kernel/hrtimer.c	23
-rw-r--r--	kernel/intermodule.c	184
-rw-r--r--	kernel/irq/Makefile	2
-rw-r--r--	kernel/irq/autoprobe.c	56
-rw-r--r--	kernel/irq/chip.c	534
-rw-r--r--	kernel/irq/handle.c	143
-rw-r--r--	kernel/irq/internals.h	46
-rw-r--r--	kernel/irq/manage.c	179
-rw-r--r--	kernel/irq/migration.c	22
-rw-r--r--	kernel/irq/proc.c	33
-rw-r--r--	kernel/irq/resend.c	78
-rw-r--r--	kernel/irq/spurious.c	49
-rw-r--r--	kernel/kexec.c	12
-rw-r--r--	kernel/kmod.c	1
-rw-r--r--	kernel/kprobes.c	58
-rw-r--r--	kernel/ksysfs.c	20
-rw-r--r--	kernel/kthread.c	61
-rw-r--r--	kernel/module.c	129
-rw-r--r--	kernel/mutex-debug.c	17
-rw-r--r--	kernel/mutex-debug.h	25
-rw-r--r--	kernel/mutex.c	21
-rw-r--r--	kernel/mutex.h	6
-rw-r--r--	kernel/panic.c	1
-rw-r--r--	kernel/params.c	1
-rw-r--r--	kernel/power/Kconfig	30
-rw-r--r--	kernel/power/disk.c	2
-rw-r--r--	kernel/power/main.c	6
-rw-r--r--	kernel/power/power.h	2
-rw-r--r--	kernel/power/snapshot.c	148
-rw-r--r--	kernel/power/swsusp.c	20
-rw-r--r--	kernel/printk.c	53
-rw-r--r--	kernel/profile.c	3
-rw-r--r--	kernel/ptrace.c	23
-rw-r--r--	kernel/rcupdate.c	27
-rw-r--r--	kernel/rcutorture.c	201
-rw-r--r--	kernel/resource.c	91
-rw-r--r--	kernel/rtmutex-debug.c	513
-rw-r--r--	kernel/rtmutex-debug.h	37
-rw-r--r--	kernel/rtmutex-tester.c	440
-rw-r--r--	kernel/rtmutex.c	990
-rw-r--r--	kernel/rtmutex.h	29
-rw-r--r--	kernel/rtmutex_common.h	123
-rw-r--r--	kernel/sched.c	1253
-rw-r--r--	kernel/signal.c	45
-rw-r--r--	kernel/softirq.c	6
-rw-r--r--	kernel/softlockup.c	8
-rw-r--r--	kernel/spinlock.c	1
-rw-r--r--	kernel/stop_machine.c	17
-rw-r--r--	kernel/sys.c	81
-rw-r--r--	kernel/sys_ni.c	2
-rw-r--r--	kernel/sysctl.c	68
-rw-r--r--	kernel/time.c	2
-rw-r--r--	kernel/time/Makefile	1
-rw-r--r--	kernel/time/clocksource.c	349
-rw-r--r--	kernel/time/jiffies.c	73
-rw-r--r--	kernel/timer.c	432
-rw-r--r--	kernel/unwind.c	918
-rw-r--r--	kernel/user.c	4
-rw-r--r--	kernel/wait.c	1
-rw-r--r--	kernel/workqueue.c	36
75 files changed, 9457 insertions, 1761 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 58908f9d156a..82fb182f6f61 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -10,18 +10,22 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \
 	    kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
 	    hrtimer.o
 
+obj-y += time/
 obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o
 obj-$(CONFIG_FUTEX) += futex.o
 ifeq ($(CONFIG_COMPAT),y)
 obj-$(CONFIG_FUTEX) += futex_compat.o
 endif
+obj-$(CONFIG_RT_MUTEXES) += rtmutex.o
+obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
+obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o
 obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
 obj-$(CONFIG_SMP) += cpu.o spinlock.o
 obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
 obj-$(CONFIG_UID16) += uid16.o
 obj-$(CONFIG_MODULES) += module.o
-obj-$(CONFIG_OBSOLETE_INTERMODULE) += intermodule.o
 obj-$(CONFIG_KALLSYMS) += kallsyms.o
+obj-$(CONFIG_STACK_UNWIND) += unwind.o
 obj-$(CONFIG_PM) += power/
 obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
 obj-$(CONFIG_KEXEC) += kexec.o
diff --git a/kernel/acct.c b/kernel/acct.c
index b327f4d20104..f18e0b8df3e1 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -43,7 +43,6 @@
  * a struct file opened for write. Fixed. 2/6/2000, AV.
  */
 
-#include <linux/config.h>
 #include <linux/mm.h>
 #include <linux/slab.h>
 #include <linux/acct.h>
@@ -75,7 +74,7 @@ int acct_parm[3] = {4, 2, 30};
 /*
  * External references and all of the globals.
  */
-static void do_acct_process(long, struct file *);
+static void do_acct_process(struct file *);
 
 /*
  * This structure is used so that all the data protected by lock
@@ -118,7 +117,7 @@ static int check_free_space(struct file *file)
 	spin_unlock(&acct_globals.lock);
 
 	/* May block */
-	if (vfs_statfs(file->f_dentry->d_inode->i_sb, &sbuf))
+	if (vfs_statfs(file->f_dentry, &sbuf))
 		return res;
 	suspend = sbuf.f_blocks * SUSPEND;
 	resume = sbuf.f_blocks * RESUME;
@@ -196,7 +195,7 @@ static void acct_file_reopen(struct file *file)
 	if (old_acct) {
 		mnt_unpin(old_acct->f_vfsmnt);
 		spin_unlock(&acct_globals.lock);
-		do_acct_process(0, old_acct);
+		do_acct_process(old_acct);
 		filp_close(old_acct, NULL);
 		spin_lock(&acct_globals.lock);
 	}
@@ -419,16 +418,15 @@ static u32 encode_float(u64 value)
 /*
  * do_acct_process does all actual work. Caller holds the reference to file.
  */
-static void do_acct_process(long exitcode, struct file *file)
+static void do_acct_process(struct file *file)
 {
+	struct pacct_struct *pacct = &current->signal->pacct;
 	acct_t ac;
 	mm_segment_t fs;
-	unsigned long vsize;
 	unsigned long flim;
 	u64 elapsed;
 	u64 run_time;
 	struct timespec uptime;
-	unsigned long jiffies;
 
 	/*
 	 * First check to see if there is enough free_space to continue
@@ -469,12 +467,6 @@ static void do_acct_process(long exitcode, struct file *file)
 #endif
 	do_div(elapsed, AHZ);
 	ac.ac_btime = xtime.tv_sec - elapsed;
-	jiffies = cputime_to_jiffies(cputime_add(current->utime,
-						 current->signal->utime));
-	ac.ac_utime = encode_comp_t(jiffies_to_AHZ(jiffies));
-	jiffies = cputime_to_jiffies(cputime_add(current->stime,
-						 current->signal->stime));
-	ac.ac_stime = encode_comp_t(jiffies_to_AHZ(jiffies));
 	/* we really need to bite the bullet and change layout */
 	ac.ac_uid = current->uid;
 	ac.ac_gid = current->gid;
@@ -496,37 +488,18 @@ static void do_acct_process(long exitcode, struct file *file)
 		old_encode_dev(tty_devnum(current->signal->tty)) : 0;
 	read_unlock(&tasklist_lock);
 
-	ac.ac_flag = 0;
-	if (current->flags & PF_FORKNOEXEC)
-		ac.ac_flag |= AFORK;
-	if (current->flags & PF_SUPERPRIV)
-		ac.ac_flag |= ASU;
-	if (current->flags & PF_DUMPCORE)
-		ac.ac_flag |= ACORE;
-	if (current->flags & PF_SIGNALED)
-		ac.ac_flag |= AXSIG;
-
-	vsize = 0;
-	if (current->mm) {
-		struct vm_area_struct *vma;
-		down_read(&current->mm->mmap_sem);
-		vma = current->mm->mmap;
-		while (vma) {
-			vsize += vma->vm_end - vma->vm_start;
-			vma = vma->vm_next;
-		}
-		up_read(&current->mm->mmap_sem);
-	}
-	vsize = vsize / 1024;
-	ac.ac_mem = encode_comp_t(vsize);
+	spin_lock(&current->sighand->siglock);
+	ac.ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime)));
+	ac.ac_stime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_stime)));
+	ac.ac_flag = pacct->ac_flag;
+	ac.ac_mem = encode_comp_t(pacct->ac_mem);
+	ac.ac_minflt = encode_comp_t(pacct->ac_minflt);
+	ac.ac_majflt = encode_comp_t(pacct->ac_majflt);
+	ac.ac_exitcode = pacct->ac_exitcode;
+	spin_unlock(&current->sighand->siglock);
 	ac.ac_io = encode_comp_t(0 /* current->io_usage */); /* %% */
 	ac.ac_rw = encode_comp_t(ac.ac_io / 1024);
-	ac.ac_minflt = encode_comp_t(current->signal->min_flt +
-				     current->min_flt);
-	ac.ac_majflt = encode_comp_t(current->signal->maj_flt +
-				     current->maj_flt);
 	ac.ac_swaps = encode_comp_t(0);
-	ac.ac_exitcode = exitcode;
 
 	/*
 	 * Kernel segment override to datasegment and write it
@@ -546,12 +519,64 @@ static void do_acct_process(long exitcode, struct file *file)
 }
 
 /**
+ * acct_init_pacct - initialize a new pacct_struct
+ * @pacct: per-process accounting info struct to initialize
+ */
+void acct_init_pacct(struct pacct_struct *pacct)
+{
+	memset(pacct, 0, sizeof(struct pacct_struct));
+	pacct->ac_utime = pacct->ac_stime = cputime_zero;
+}
+
+/**
+ * acct_collect - collect accounting information into pacct_struct
+ * @exitcode: task exit code
+ * @group_dead: not 0, if this thread is the last one in the process.
+ */
+void acct_collect(long exitcode, int group_dead)
+{
+	struct pacct_struct *pacct = &current->signal->pacct;
+	unsigned long vsize = 0;
+
+	if (group_dead && current->mm) {
+		struct vm_area_struct *vma;
+		down_read(&current->mm->mmap_sem);
+		vma = current->mm->mmap;
+		while (vma) {
+			vsize += vma->vm_end - vma->vm_start;
+			vma = vma->vm_next;
+		}
+		up_read(&current->mm->mmap_sem);
+	}
+
+	spin_lock_irq(&current->sighand->siglock);
+	if (group_dead)
+		pacct->ac_mem = vsize / 1024;
+	if (thread_group_leader(current)) {
+		pacct->ac_exitcode = exitcode;
+		if (current->flags & PF_FORKNOEXEC)
+			pacct->ac_flag |= AFORK;
+	}
+	if (current->flags & PF_SUPERPRIV)
+		pacct->ac_flag |= ASU;
+	if (current->flags & PF_DUMPCORE)
+		pacct->ac_flag |= ACORE;
+	if (current->flags & PF_SIGNALED)
+		pacct->ac_flag |= AXSIG;
+	pacct->ac_utime = cputime_add(pacct->ac_utime, current->utime);
+	pacct->ac_stime = cputime_add(pacct->ac_stime, current->stime);
+	pacct->ac_minflt += current->min_flt;
+	pacct->ac_majflt += current->maj_flt;
+	spin_unlock_irq(&current->sighand->siglock);
+}
+
+/**
  * acct_process - now just a wrapper around do_acct_process
  * @exitcode: task exit code
  *
  * handles process accounting for an exiting task
  */
-void acct_process(long exitcode)
+void acct_process(void)
 {
 	struct file *file = NULL;
 
@@ -570,7 +595,7 @@ void acct_process(long exitcode)
 	get_file(file);
 	spin_unlock(&acct_globals.lock);
 
-	do_acct_process(exitcode, file);
+	do_acct_process(file);
 	fput(file);
 }
 
@@ -599,9 +624,7 @@ void acct_update_integrals(struct task_struct *tsk)
  */
 void acct_clear_integrals(struct task_struct *tsk)
 {
-	if (tsk) {
-		tsk->acct_stimexpd = 0;
-		tsk->acct_rss_mem1 = 0;
-		tsk->acct_vm_mem1 = 0;
-	}
+	tsk->acct_stimexpd = 0;
+	tsk->acct_rss_mem1 = 0;
+	tsk->acct_vm_mem1 = 0;
 }
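The acct.c changes above split process accounting into two steps: acct_collect() folds the exiting task's CPU times, fault counts and flags into current->signal->pacct while the task is still live, and acct_process() later writes one record from that aggregate and no longer takes an exit code. A minimal, hypothetical sketch of a caller wiring the two together (the real call sites live in kernel/exit.c, which this merge also touches; the function name below is illustrative only, not from this diff):

/* Hypothetical caller: shows the intended ordering of the split API. */
static void example_exit_accounting(long code, int group_dead)
{
	/* Aggregate this thread's utime/stime, fault counts and the
	 * AFORK/ASU/ACORE/AXSIG flags into current->signal->pacct. */
	acct_collect(code, group_dead);

	/* Emit the BSD accounting record built from the aggregated
	 * pacct_struct; the exit code now comes from pacct->ac_exitcode. */
	acct_process();
}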
diff --git a/kernel/audit.c b/kernel/audit.c
index df57b493e1cb..d417ca1db79b 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -56,6 +56,7 @@
 #include <linux/skbuff.h>
 #include <linux/netlink.h>
 #include <linux/selinux.h>
+#include <linux/inotify.h>
 
 #include "audit.h"
 
@@ -89,6 +90,7 @@ static int audit_backlog_wait_overflow = 0;
 /* The identity of the user shutting down the audit system. */
 uid_t		audit_sig_uid = -1;
 pid_t		audit_sig_pid = -1;
+u32		audit_sig_sid = 0;
 
 /* Records can be lost in several ways:
    0) [suppressed in audit_alloc]
@@ -102,6 +104,12 @@ static atomic_t audit_lost = ATOMIC_INIT(0);
 /* The netlink socket. */
 static struct sock *audit_sock;
 
+/* Inotify handle. */
+struct inotify_handle *audit_ih;
+
+/* Hash for inode-based rules */
+struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS];
+
 /* The audit_freelist is a list of pre-allocated audit buffers (if more
  * than AUDIT_MAXFREE are in use, the audit buffer is freed instead of
  * being placed on the freelist). */
@@ -114,10 +122,8 @@ static struct task_struct *kauditd_task;
 static DECLARE_WAIT_QUEUE_HEAD(kauditd_wait);
 static DECLARE_WAIT_QUEUE_HEAD(audit_backlog_wait);
 
-/* The netlink socket is only to be read by 1 CPU, which lets us assume
- * that list additions and deletions never happen simultaneously in
- * auditsc.c */
-DEFINE_MUTEX(audit_netlink_mutex);
+/* Serialize requests from userspace. */
+static DEFINE_MUTEX(audit_cmd_mutex);
 
 /* AUDIT_BUFSIZ is the size of the temporary buffer used for formatting
  * audit records.  Since printk uses a 1024 byte buffer, this buffer
@@ -250,7 +256,7 @@ static int audit_set_rate_limit(int limit, uid_t loginuid, u32 sid)
250 "audit_rate_limit=%d old=%d by auid=%u", 256 "audit_rate_limit=%d old=%d by auid=%u",
251 limit, old, loginuid); 257 limit, old, loginuid);
252 audit_rate_limit = limit; 258 audit_rate_limit = limit;
253 return old; 259 return 0;
254} 260}
255 261
256static int audit_set_backlog_limit(int limit, uid_t loginuid, u32 sid) 262static int audit_set_backlog_limit(int limit, uid_t loginuid, u32 sid)
@@ -273,7 +279,7 @@ static int audit_set_backlog_limit(int limit, uid_t loginuid, u32 sid)
273 "audit_backlog_limit=%d old=%d by auid=%u", 279 "audit_backlog_limit=%d old=%d by auid=%u",
274 limit, old, loginuid); 280 limit, old, loginuid);
275 audit_backlog_limit = limit; 281 audit_backlog_limit = limit;
276 return old; 282 return 0;
277} 283}
278 284
279static int audit_set_enabled(int state, uid_t loginuid, u32 sid) 285static int audit_set_enabled(int state, uid_t loginuid, u32 sid)
@@ -299,7 +305,7 @@ static int audit_set_enabled(int state, uid_t loginuid, u32 sid)
299 "audit_enabled=%d old=%d by auid=%u", 305 "audit_enabled=%d old=%d by auid=%u",
300 state, old, loginuid); 306 state, old, loginuid);
301 audit_enabled = state; 307 audit_enabled = state;
302 return old; 308 return 0;
303} 309}
304 310
305static int audit_set_failure(int state, uid_t loginuid, u32 sid) 311static int audit_set_failure(int state, uid_t loginuid, u32 sid)
@@ -327,7 +333,7 @@ static int audit_set_failure(int state, uid_t loginuid, u32 sid)
327 "audit_failure=%d old=%d by auid=%u", 333 "audit_failure=%d old=%d by auid=%u",
328 state, old, loginuid); 334 state, old, loginuid);
329 audit_failure = state; 335 audit_failure = state;
330 return old; 336 return 0;
331} 337}
332 338
333static int kauditd_thread(void *dummy) 339static int kauditd_thread(void *dummy)
@@ -363,9 +369,52 @@ static int kauditd_thread(void *dummy)
 			remove_wait_queue(&kauditd_wait, &wait);
 		}
 	}
+}
+
+int audit_send_list(void *_dest)
+{
+	struct audit_netlink_list *dest = _dest;
+	int pid = dest->pid;
+	struct sk_buff *skb;
+
+	/* wait for parent to finish and send an ACK */
+	mutex_lock(&audit_cmd_mutex);
+	mutex_unlock(&audit_cmd_mutex);
+
+	while ((skb = __skb_dequeue(&dest->q)) != NULL)
+		netlink_unicast(audit_sock, skb, pid, 0);
+
+	kfree(dest);
+
 	return 0;
 }
 
+struct sk_buff *audit_make_reply(int pid, int seq, int type, int done,
+				 int multi, void *payload, int size)
+{
+	struct sk_buff *skb;
+	struct nlmsghdr *nlh;
+	int len = NLMSG_SPACE(size);
+	void *data;
+	int flags = multi ? NLM_F_MULTI : 0;
+	int t = done ? NLMSG_DONE : type;
+
+	skb = alloc_skb(len, GFP_KERNEL);
+	if (!skb)
+		return NULL;
+
+	nlh = NLMSG_PUT(skb, pid, seq, t, size);
+	nlh->nlmsg_flags = flags;
+	data = NLMSG_DATA(nlh);
+	memcpy(data, payload, size);
+	return skb;
+
+nlmsg_failure:			/* Used by NLMSG_PUT */
+	if (skb)
+		kfree_skb(skb);
+	return NULL;
+}
+
 /**
  * audit_send_reply - send an audit reply message via netlink
  * @pid: process id to send reply to
@@ -383,36 +432,20 @@ void audit_send_reply(int pid, int seq, int type, int done, int multi,
 			     void *payload, int size)
 {
 	struct sk_buff *skb;
-	struct nlmsghdr *nlh;
-	int len = NLMSG_SPACE(size);
-	void *data;
-	int flags = multi ? NLM_F_MULTI : 0;
-	int t = done ? NLMSG_DONE : type;
-
-	skb = alloc_skb(len, GFP_KERNEL);
+	skb = audit_make_reply(pid, seq, type, done, multi, payload, size);
 	if (!skb)
 		return;
-
-	nlh = NLMSG_PUT(skb, pid, seq, t, size);
-	nlh->nlmsg_flags = flags;
-	data = NLMSG_DATA(nlh);
-	memcpy(data, payload, size);
-
 	/* Ignore failure. It'll only happen if the sender goes away,
 	   because our timeout is set to infinite. */
 	netlink_unicast(audit_sock, skb, pid, 0);
 	return;
-
-nlmsg_failure:			/* Used by NLMSG_PUT */
-	if (skb)
-		kfree_skb(skb);
 }
 
 /*
  * Check for appropriate CAP_AUDIT_ capabilities on incoming audit
  * control messages.
  */
-static int audit_netlink_ok(kernel_cap_t eff_cap, u16 msg_type)
+static int audit_netlink_ok(struct sk_buff *skb, u16 msg_type)
 {
 	int err = 0;
 
@@ -426,13 +459,13 @@ static int audit_netlink_ok(kernel_cap_t eff_cap, u16 msg_type)
 	case AUDIT_DEL:
 	case AUDIT_DEL_RULE:
 	case AUDIT_SIGNAL_INFO:
-		if (!cap_raised(eff_cap, CAP_AUDIT_CONTROL))
+		if (security_netlink_recv(skb, CAP_AUDIT_CONTROL))
 			err = -EPERM;
 		break;
 	case AUDIT_USER:
 	case AUDIT_FIRST_USER_MSG...AUDIT_LAST_USER_MSG:
 	case AUDIT_FIRST_USER_MSG2...AUDIT_LAST_USER_MSG2:
-		if (!cap_raised(eff_cap, CAP_AUDIT_WRITE))
+		if (security_netlink_recv(skb, CAP_AUDIT_WRITE))
 			err = -EPERM;
 		break;
 	default:  /* bad msg */
@@ -451,9 +484,11 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 	struct audit_buffer	*ab;
 	u16			msg_type = nlh->nlmsg_type;
 	uid_t			loginuid; /* loginuid of sender */
-	struct audit_sig_info	sig_data;
+	struct audit_sig_info	*sig_data;
+	char			*ctx;
+	u32			len;
 
-	err = audit_netlink_ok(NETLINK_CB(skb).eff_cap, msg_type);
+	err = audit_netlink_ok(skb, msg_type);
 	if (err)
 		return err;
 
@@ -503,12 +538,9 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 		if (status_get->mask & AUDIT_STATUS_PID) {
 			int old   = audit_pid;
 			if (sid) {
-				char *ctx = NULL;
-				u32 len;
-				int rc;
-				if ((rc = selinux_ctxid_to_string(
+				if ((err = selinux_ctxid_to_string(
 						sid, &ctx, &len)))
-					return rc;
+					return err;
 				else
 					audit_log(NULL, GFP_KERNEL,
 						AUDIT_CONFIG_CHANGE,
@@ -523,10 +555,10 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 			audit_pid = status_get->pid;
 		}
 		if (status_get->mask & AUDIT_STATUS_RATE_LIMIT)
-			audit_set_rate_limit(status_get->rate_limit,
+			err = audit_set_rate_limit(status_get->rate_limit,
 							 loginuid, sid);
 		if (status_get->mask & AUDIT_STATUS_BACKLOG_LIMIT)
-			audit_set_backlog_limit(status_get->backlog_limit,
+			err = audit_set_backlog_limit(status_get->backlog_limit,
 							loginuid, sid);
 		break;
 	case AUDIT_USER:
@@ -544,8 +576,6 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
544 "user pid=%d uid=%u auid=%u", 576 "user pid=%d uid=%u auid=%u",
545 pid, uid, loginuid); 577 pid, uid, loginuid);
546 if (sid) { 578 if (sid) {
547 char *ctx = NULL;
548 u32 len;
549 if (selinux_ctxid_to_string( 579 if (selinux_ctxid_to_string(
550 sid, &ctx, &len)) { 580 sid, &ctx, &len)) {
551 audit_log_format(ab, 581 audit_log_format(ab,
@@ -584,10 +614,21 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 					    loginuid, sid);
 		break;
 	case AUDIT_SIGNAL_INFO:
-		sig_data.uid = audit_sig_uid;
-		sig_data.pid = audit_sig_pid;
+		err = selinux_ctxid_to_string(audit_sig_sid, &ctx, &len);
+		if (err)
+			return err;
+		sig_data = kmalloc(sizeof(*sig_data) + len, GFP_KERNEL);
+		if (!sig_data) {
+			kfree(ctx);
+			return -ENOMEM;
+		}
+		sig_data->uid = audit_sig_uid;
+		sig_data->pid = audit_sig_pid;
+		memcpy(sig_data->ctx, ctx, len);
+		kfree(ctx);
 		audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_SIGNAL_INFO,
-				0, 0, &sig_data, sizeof(sig_data));
+				0, 0, sig_data, sizeof(*sig_data) + len);
+		kfree(sig_data);
 		break;
 	default:
 		err = -EINVAL;
@@ -629,20 +670,30 @@ static void audit_receive(struct sock *sk, int length)
 	struct sk_buff  *skb;
 	unsigned int qlen;
 
-	mutex_lock(&audit_netlink_mutex);
+	mutex_lock(&audit_cmd_mutex);
 
 	for (qlen = skb_queue_len(&sk->sk_receive_queue); qlen; qlen--) {
 		skb = skb_dequeue(&sk->sk_receive_queue);
 		audit_receive_skb(skb);
 		kfree_skb(skb);
 	}
-	mutex_unlock(&audit_netlink_mutex);
+	mutex_unlock(&audit_cmd_mutex);
 }
 
+#ifdef CONFIG_AUDITSYSCALL
+static const struct inotify_operations audit_inotify_ops = {
+	.handle_event	= audit_handle_ievent,
+	.destroy_watch	= audit_free_parent,
+};
+#endif
 
 /* Initialize audit support at boot time. */
 static int __init audit_init(void)
 {
+#ifdef CONFIG_AUDITSYSCALL
+	int i;
+#endif
+
 	printk(KERN_INFO "audit: initializing netlink socket (%s)\n",
 	       audit_default ? "enabled" : "disabled");
 	audit_sock = netlink_kernel_create(NETLINK_AUDIT, 0, audit_receive,
@@ -661,6 +712,16 @@ static int __init audit_init(void)
 	selinux_audit_set_callback(&selinux_audit_rule_update);
 
 	audit_log(NULL, GFP_KERNEL, AUDIT_KERNEL, "initialized");
+
+#ifdef CONFIG_AUDITSYSCALL
+	audit_ih = inotify_init(&audit_inotify_ops);
+	if (IS_ERR(audit_ih))
+		audit_panic("cannot initialize inotify handle");
+
+	for (i = 0; i < AUDIT_INODE_BUCKETS; i++)
+		INIT_LIST_HEAD(&audit_inode_hash[i]);
+#endif
+
 	return 0;
 }
 __initcall(audit_init);
@@ -690,10 +751,12 @@ static void audit_buffer_free(struct audit_buffer *ab)
 	kfree_skb(ab->skb);
 
 	spin_lock_irqsave(&audit_freelist_lock, flags);
-	if (++audit_freelist_count > AUDIT_MAXFREE)
+	if (audit_freelist_count > AUDIT_MAXFREE)
 		kfree(ab);
-	else
+	else {
+		audit_freelist_count++;
 		list_add(&ab->list, &audit_freelist);
+	}
 	spin_unlock_irqrestore(&audit_freelist_lock, flags);
 }
 
@@ -755,7 +818,7 @@ err:
  */
 unsigned int audit_serial(void)
 {
-	static spinlock_t serial_lock = SPIN_LOCK_UNLOCKED;
+	static DEFINE_SPINLOCK(serial_lock);
 	static unsigned int serial = 0;
 
 	unsigned long flags;
@@ -988,28 +1051,76 @@ void audit_log_hex(struct audit_buffer *ab, const unsigned char *buf,
 	skb_put(skb, len << 1); /* new string is twice the old string */
 }
 
+/*
+ * Format a string of no more than slen characters into the audit buffer,
+ * enclosed in quote marks.
+ */
+static void audit_log_n_string(struct audit_buffer *ab, size_t slen,
+			       const char *string)
+{
+	int avail, new_len;
+	unsigned char *ptr;
+	struct sk_buff *skb;
+
+	BUG_ON(!ab->skb);
+	skb = ab->skb;
+	avail = skb_tailroom(skb);
+	new_len = slen + 3;	/* enclosing quotes + null terminator */
+	if (new_len > avail) {
+		avail = audit_expand(ab, new_len);
+		if (!avail)
+			return;
+	}
+	ptr = skb->tail;
+	*ptr++ = '"';
+	memcpy(ptr, string, slen);
+	ptr += slen;
+	*ptr++ = '"';
+	*ptr = 0;
+	skb_put(skb, slen + 2);	/* don't include null terminator */
+}
+
 /**
- * audit_log_unstrustedstring - log a string that may contain random characters
+ * audit_log_n_unstrustedstring - log a string that may contain random characters
  * @ab: audit_buffer
+ * @len: lenth of string (not including trailing null)
  * @string: string to be logged
  *
  * This code will escape a string that is passed to it if the string
  * contains a control character, unprintable character, double quote mark,
  * or a space. Unescaped strings will start and end with a double quote mark.
  * Strings that are escaped are printed in hex (2 digits per char).
+ *
+ * The caller specifies the number of characters in the string to log, which may
+ * or may not be the entire string.
  */
-void audit_log_untrustedstring(struct audit_buffer *ab, const char *string)
+const char *audit_log_n_untrustedstring(struct audit_buffer *ab, size_t len,
+					const char *string)
 {
 	const unsigned char *p = string;
 
 	while (*p) {
 		if (*p == '"' || *p < 0x21 || *p > 0x7f) {
-			audit_log_hex(ab, string, strlen(string));
-			return;
+			audit_log_hex(ab, string, len);
+			return string + len + 1;
 		}
 		p++;
 	}
-	audit_log_format(ab, "\"%s\"", string);
+	audit_log_n_string(ab, len, string);
+	return p + 1;
+}
+
+/**
+ * audit_log_unstrustedstring - log a string that may contain random characters
+ * @ab: audit_buffer
+ * @string: string to be logged
+ *
+ * Same as audit_log_n_unstrustedstring(), except that strlen is used to
+ * determine string length.
+ */
+const char *audit_log_untrustedstring(struct audit_buffer *ab, const char *string)
+{
+	return audit_log_n_untrustedstring(ab, strlen(string), string);
 }
 
 /* This is a helper-function to print the escaped d_path */
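audit_log_n_untrustedstring(), added above, logs at most len characters and returns a pointer just past the string it consumed (string + len + 1), so a caller holding several consecutive NUL-terminated strings in one buffer can walk them in order. A hedged usage sketch; the caller function name and buffer layout are illustrative, not from this diff:

/* Hypothetical caller: logs two NUL-terminated strings packed back to back
 * in one buffer, advancing with the pointer returned by
 * audit_log_n_untrustedstring(). */
static void example_log_two_strings(struct audit_buffer *ab,
				    const char *buf, size_t len1, size_t len2)
{
	const char *p = buf;

	p = audit_log_n_untrustedstring(ab, len1, p);	/* first string */
	audit_log_format(ab, " ");
	audit_log_n_untrustedstring(ab, len2, p);	/* second string */
}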
diff --git a/kernel/audit.h b/kernel/audit.h
index 6f733920fd32..6aa33b848cf2 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -19,9 +19,9 @@
  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  */
 
-#include <linux/mutex.h>
 #include <linux/fs.h>
 #include <linux/audit.h>
+#include <linux/skbuff.h>
 
 /* 0 = no checking
    1 = put_count checking
@@ -53,6 +53,18 @@ enum audit_state {
 };
 
 /* Rule lists */
+struct audit_parent;
+
+struct audit_watch {
+	atomic_t		count;	/* reference count */
+	char			*path;	/* insertion path */
+	dev_t			dev;	/* associated superblock device */
+	unsigned long		ino;	/* associated inode number */
+	struct audit_parent	*parent; /* associated parent */
+	struct list_head	wlist;	/* entry in parent->watches list */
+	struct list_head	rules;	/* associated rules */
+};
+
 struct audit_field {
 	u32				type;
 	u32				val;
@@ -69,7 +81,11 @@ struct audit_krule {
 	u32			mask[AUDIT_BITMASK_SIZE];
 	u32			buflen;	/* for data alloc on list rules */
 	u32			field_count;
+	char			*filterkey; /* ties events to rules */
 	struct audit_field	*fields;
+	struct audit_field	*inode_f; /* quick access to an inode field */
+	struct audit_watch	*watch;	/* associated watch */
+	struct list_head	rlist;	/* entry in audit_watch.rules list */
 };
 
 struct audit_entry {
@@ -78,15 +94,53 @@ struct audit_entry {
 	struct audit_krule	rule;
 };
 
-
 extern int audit_pid;
-extern int audit_comparator(const u32 left, const u32 op, const u32 right);
 
+#define AUDIT_INODE_BUCKETS	32
+extern struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS];
+
+static inline int audit_hash_ino(u32 ino)
+{
+	return (ino & (AUDIT_INODE_BUCKETS-1));
+}
+
+extern int audit_comparator(const u32 left, const u32 op, const u32 right);
+extern int audit_compare_dname_path(const char *dname, const char *path,
+				    int *dirlen);
+extern struct sk_buff *	    audit_make_reply(int pid, int seq, int type,
+					     int done, int multi,
+					     void *payload, int size);
 extern void		    audit_send_reply(int pid, int seq, int type,
 					     int done, int multi,
 					     void *payload, int size);
 extern void		    audit_log_lost(const char *message);
 extern void		    audit_panic(const char *message);
-extern struct mutex audit_netlink_mutex;
 
+struct audit_netlink_list {
+	int pid;
+	struct sk_buff_head q;
+};
+
+int audit_send_list(void *);
+
+struct inotify_watch;
+extern void audit_free_parent(struct inotify_watch *);
+extern void audit_handle_ievent(struct inotify_watch *, u32, u32, u32,
+				const char *, struct inode *);
 extern int selinux_audit_rule_update(void);
+
+#ifdef CONFIG_AUDITSYSCALL
+extern void __audit_signal_info(int sig, struct task_struct *t);
+static inline void audit_signal_info(int sig, struct task_struct *t)
+{
+	if (unlikely(audit_pid && t->tgid == audit_pid))
+		__audit_signal_info(sig, t);
+}
+extern enum audit_state audit_filter_inodes(struct task_struct *,
+					    struct audit_context *);
+extern void audit_set_auditable(struct audit_context *);
+#else
+#define audit_signal_info(s,t)
+#define audit_filter_inodes(t,c) AUDIT_DISABLED
+#define audit_set_auditable(c)
+#endif
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 7c134906d689..5b4e16276ca0 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -22,13 +22,59 @@
 #include <linux/kernel.h>
 #include <linux/audit.h>
 #include <linux/kthread.h>
+#include <linux/mutex.h>
+#include <linux/fs.h>
+#include <linux/namei.h>
 #include <linux/netlink.h>
+#include <linux/sched.h>
+#include <linux/inotify.h>
 #include <linux/selinux.h>
 #include "audit.h"
 
-/* There are three lists of rules -- one to search at task creation
- * time, one to search at syscall entry time, and another to search at
- * syscall exit time. */
+/*
+ * Locking model:
+ *
+ * audit_filter_mutex:
+ * 		Synchronizes writes and blocking reads of audit's filterlist
+ * 		data.  Rcu is used to traverse the filterlist and access
+ * 		contents of structs audit_entry, audit_watch and opaque
+ * 		selinux rules during filtering.  If modified, these structures
+ * 		must be copied and replace their counterparts in the filterlist.
+ * 		An audit_parent struct is not accessed during filtering, so may
+ * 		be written directly provided audit_filter_mutex is held.
+ */
+
+/*
+ * Reference counting:
+ *
+ * audit_parent: lifetime is from audit_init_parent() to receipt of an IN_IGNORED
+ * 	event.  Each audit_watch holds a reference to its associated parent.
+ *
+ * audit_watch: if added to lists, lifetime is from audit_init_watch() to
+ * 	audit_remove_watch().  Additionally, an audit_watch may exist
+ * 	temporarily to assist in searching existing filter data.  Each
+ * 	audit_krule holds a reference to its associated watch.
+ */
+
+struct audit_parent {
+	struct list_head	ilist;	/* entry in inotify registration list */
+	struct list_head	watches; /* associated watches */
+	struct inotify_watch	wdata;	/* inotify watch data */
+	unsigned		flags;	/* status flags */
+};
+
+/*
+ * audit_parent status flags:
+ *
+ * AUDIT_PARENT_INVALID - set anytime rules/watches are auto-removed due to
+ * a filesystem event to ensure we're adding audit watches to a valid parent.
+ * Technically not needed for IN_DELETE_SELF or IN_UNMOUNT events, as we cannot
+ * receive them while we have nameidata, but must be used for IN_MOVE_SELF which
+ * we can receive while holding nameidata.
+ */
+#define AUDIT_PARENT_INVALID	0x001
+
+/* Audit filter lists, defined in <linux/audit.h> */
 struct list_head audit_filter_list[AUDIT_NR_FILTERS] = {
 	LIST_HEAD_INIT(audit_filter_list[0]),
 	LIST_HEAD_INIT(audit_filter_list[1]),
@@ -41,9 +87,53 @@ struct list_head audit_filter_list[AUDIT_NR_FILTERS] = {
 #endif
 };
 
+static DEFINE_MUTEX(audit_filter_mutex);
+
+/* Inotify handle */
+extern struct inotify_handle *audit_ih;
+
+/* Inotify events we care about. */
+#define AUDIT_IN_WATCH IN_MOVE|IN_CREATE|IN_DELETE|IN_DELETE_SELF|IN_MOVE_SELF
+
+void audit_free_parent(struct inotify_watch *i_watch)
+{
+	struct audit_parent *parent;
+
+	parent = container_of(i_watch, struct audit_parent, wdata);
+	WARN_ON(!list_empty(&parent->watches));
+	kfree(parent);
+}
+
+static inline void audit_get_watch(struct audit_watch *watch)
+{
+	atomic_inc(&watch->count);
+}
+
+static void audit_put_watch(struct audit_watch *watch)
+{
+	if (atomic_dec_and_test(&watch->count)) {
+		WARN_ON(watch->parent);
+		WARN_ON(!list_empty(&watch->rules));
+		kfree(watch->path);
+		kfree(watch);
+	}
+}
+
+static void audit_remove_watch(struct audit_watch *watch)
+{
+	list_del(&watch->wlist);
+	put_inotify_watch(&watch->parent->wdata);
+	watch->parent = NULL;
+	audit_put_watch(watch); /* match initial get */
+}
+
 static inline void audit_free_rule(struct audit_entry *e)
 {
 	int i;
+
+	/* some rules don't have associated watches */
+	if (e->rule.watch)
+		audit_put_watch(e->rule.watch);
 	if (e->rule.fields)
 		for (i = 0; i < e->rule.field_count; i++) {
 			struct audit_field *f = &e->rule.fields[i];
@@ -51,6 +141,7 @@ static inline void audit_free_rule(struct audit_entry *e)
 				selinux_audit_rule_free(f->se_rule);
 		}
 	kfree(e->rule.fields);
+	kfree(e->rule.filterkey);
 	kfree(e);
 }
 
@@ -60,6 +151,50 @@ static inline void audit_free_rule_rcu(struct rcu_head *head)
 	audit_free_rule(e);
 }
 
+/* Initialize a parent watch entry. */
+static struct audit_parent *audit_init_parent(struct nameidata *ndp)
+{
+	struct audit_parent *parent;
+	s32 wd;
+
+	parent = kzalloc(sizeof(*parent), GFP_KERNEL);
+	if (unlikely(!parent))
+		return ERR_PTR(-ENOMEM);
+
+	INIT_LIST_HEAD(&parent->watches);
+	parent->flags = 0;
+
+	inotify_init_watch(&parent->wdata);
+	/* grab a ref so inotify watch hangs around until we take audit_filter_mutex */
+	get_inotify_watch(&parent->wdata);
+	wd = inotify_add_watch(audit_ih, &parent->wdata, ndp->dentry->d_inode,
+			       AUDIT_IN_WATCH);
+	if (wd < 0) {
+		audit_free_parent(&parent->wdata);
+		return ERR_PTR(wd);
+	}
+
+	return parent;
+}
+
+/* Initialize a watch entry. */
+static struct audit_watch *audit_init_watch(char *path)
+{
+	struct audit_watch *watch;
+
+	watch = kzalloc(sizeof(*watch), GFP_KERNEL);
+	if (unlikely(!watch))
+		return ERR_PTR(-ENOMEM);
+
+	INIT_LIST_HEAD(&watch->rules);
+	atomic_set(&watch->count, 1);
+	watch->path = path;
+	watch->dev = (dev_t)-1;
+	watch->ino = (unsigned long)-1;
+
+	return watch;
+}
+
 /* Initialize an audit filterlist entry. */
 static inline struct audit_entry *audit_init_entry(u32 field_count)
 {
@@ -107,6 +242,66 @@ static char *audit_unpack_string(void **bufp, size_t *remain, size_t len)
 	return str;
 }
 
+/* Translate an inode field to kernel respresentation. */
+static inline int audit_to_inode(struct audit_krule *krule,
+				 struct audit_field *f)
+{
+	if (krule->listnr != AUDIT_FILTER_EXIT ||
+	    krule->watch || krule->inode_f)
+		return -EINVAL;
+
+	krule->inode_f = f;
+	return 0;
+}
+
+/* Translate a watch string to kernel respresentation. */
+static int audit_to_watch(struct audit_krule *krule, char *path, int len,
+			  u32 op)
+{
+	struct audit_watch *watch;
+
+	if (!audit_ih)
+		return -EOPNOTSUPP;
+
+	if (path[0] != '/' || path[len-1] == '/' ||
+	    krule->listnr != AUDIT_FILTER_EXIT ||
+	    op & ~AUDIT_EQUAL ||
+	    krule->inode_f || krule->watch) /* 1 inode # per rule, for hash */
+		return -EINVAL;
+
+	watch = audit_init_watch(path);
+	if (unlikely(IS_ERR(watch)))
+		return PTR_ERR(watch);
+
+	audit_get_watch(watch);
+	krule->watch = watch;
+
+	return 0;
+}
+
+static __u32 *classes[AUDIT_SYSCALL_CLASSES];
+
+int __init audit_register_class(int class, unsigned *list)
+{
+	__u32 *p = kzalloc(AUDIT_BITMASK_SIZE * sizeof(__u32), GFP_KERNEL);
+	if (!p)
+		return -ENOMEM;
+	while (*list != ~0U) {
+		unsigned n = *list++;
+		if (n >= AUDIT_BITMASK_SIZE * 32 - AUDIT_SYSCALL_CLASSES) {
+			kfree(p);
+			return -EINVAL;
+		}
+		p[AUDIT_WORD(n)] |= AUDIT_BIT(n);
+	}
+	if (class >= AUDIT_SYSCALL_CLASSES || classes[class]) {
+		kfree(p);
+		return -EINVAL;
+	}
+	classes[class] = p;
+	return 0;
+}
+
 /* Common user-space to kernel rule translation. */
 static inline struct audit_entry *audit_to_entry_common(struct audit_rule *rule)
 {
@@ -128,8 +323,11 @@ static inline struct audit_entry *audit_to_entry_common(struct audit_rule *rule)
 #endif
 		;
 	}
-	if (rule->action != AUDIT_NEVER && rule->action != AUDIT_POSSIBLE &&
-	    rule->action != AUDIT_ALWAYS)
+	if (unlikely(rule->action == AUDIT_POSSIBLE)) {
+		printk(KERN_ERR "AUDIT_POSSIBLE is deprecated\n");
+		goto exit_err;
+	}
+	if (rule->action != AUDIT_NEVER && rule->action != AUDIT_ALWAYS)
 		goto exit_err;
 	if (rule->field_count > AUDIT_MAX_FIELDS)
 		goto exit_err;
@@ -147,6 +345,22 @@ static inline struct audit_entry *audit_to_entry_common(struct audit_rule *rule)
 	for (i = 0; i < AUDIT_BITMASK_SIZE; i++)
 		entry->rule.mask[i] = rule->mask[i];
 
+	for (i = 0; i < AUDIT_SYSCALL_CLASSES; i++) {
+		int bit = AUDIT_BITMASK_SIZE * 32 - i - 1;
+		__u32 *p = &entry->rule.mask[AUDIT_WORD(bit)];
+		__u32 *class;
+
+		if (!(*p & AUDIT_BIT(bit)))
+			continue;
+		*p &= ~AUDIT_BIT(bit);
+		class = classes[i];
+		if (class) {
+			int j;
+			for (j = 0; j < AUDIT_BITMASK_SIZE; j++)
+				entry->rule.mask[j] |= class[j];
+		}
+	}
+
 	return entry;
 
 exit_err:
@@ -158,6 +372,7 @@ exit_err:
 static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule)
 {
 	struct audit_entry *entry;
+	struct audit_field *f;
 	int err = 0;
 	int i;
 
@@ -172,14 +387,37 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule)
172 f->type = rule->fields[i] & ~(AUDIT_NEGATE|AUDIT_OPERATORS); 387 f->type = rule->fields[i] & ~(AUDIT_NEGATE|AUDIT_OPERATORS);
173 f->val = rule->values[i]; 388 f->val = rule->values[i];
174 389
175 if (f->type & AUDIT_UNUSED_BITS || 390 err = -EINVAL;
176 f->type == AUDIT_SE_USER || 391 switch(f->type) {
177 f->type == AUDIT_SE_ROLE || 392 default:
178 f->type == AUDIT_SE_TYPE ||
179 f->type == AUDIT_SE_SEN ||
180 f->type == AUDIT_SE_CLR) {
181 err = -EINVAL;
182 goto exit_free; 393 goto exit_free;
394 case AUDIT_PID:
395 case AUDIT_UID:
396 case AUDIT_EUID:
397 case AUDIT_SUID:
398 case AUDIT_FSUID:
399 case AUDIT_GID:
400 case AUDIT_EGID:
401 case AUDIT_SGID:
402 case AUDIT_FSGID:
403 case AUDIT_LOGINUID:
404 case AUDIT_PERS:
405 case AUDIT_ARCH:
406 case AUDIT_MSGTYPE:
407 case AUDIT_DEVMAJOR:
408 case AUDIT_DEVMINOR:
409 case AUDIT_EXIT:
410 case AUDIT_SUCCESS:
411 case AUDIT_ARG0:
412 case AUDIT_ARG1:
413 case AUDIT_ARG2:
414 case AUDIT_ARG3:
415 break;
416 case AUDIT_INODE:
417 err = audit_to_inode(&entry->rule, f);
418 if (err)
419 goto exit_free;
420 break;
183 } 421 }
184 422
185 entry->rule.vers_ops = (f->op & AUDIT_OPERATORS) ? 2 : 1; 423 entry->rule.vers_ops = (f->op & AUDIT_OPERATORS) ? 2 : 1;
@@ -196,6 +434,18 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule)
 		}
 	}
 
+	f = entry->rule.inode_f;
+	if (f) {
+		switch(f->op) {
+		case AUDIT_NOT_EQUAL:
+			entry->rule.inode_f = NULL;
+		case AUDIT_EQUAL:
+			break;
+		default:
+			goto exit_free;
+		}
+	}
+
 exit_nofree:
 	return entry;
 
@@ -210,6 +460,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
 {
 	int err = 0;
 	struct audit_entry *entry;
+	struct audit_field *f;
 	void *bufp;
 	size_t remain = datasz - sizeof(struct audit_rule_data);
 	int i;
@@ -235,11 +486,39 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
235 f->se_str = NULL; 486 f->se_str = NULL;
236 f->se_rule = NULL; 487 f->se_rule = NULL;
237 switch(f->type) { 488 switch(f->type) {
238 case AUDIT_SE_USER: 489 case AUDIT_PID:
239 case AUDIT_SE_ROLE: 490 case AUDIT_UID:
240 case AUDIT_SE_TYPE: 491 case AUDIT_EUID:
241 case AUDIT_SE_SEN: 492 case AUDIT_SUID:
242 case AUDIT_SE_CLR: 493 case AUDIT_FSUID:
494 case AUDIT_GID:
495 case AUDIT_EGID:
496 case AUDIT_SGID:
497 case AUDIT_FSGID:
498 case AUDIT_LOGINUID:
499 case AUDIT_PERS:
500 case AUDIT_ARCH:
501 case AUDIT_MSGTYPE:
502 case AUDIT_PPID:
503 case AUDIT_DEVMAJOR:
504 case AUDIT_DEVMINOR:
505 case AUDIT_EXIT:
506 case AUDIT_SUCCESS:
507 case AUDIT_ARG0:
508 case AUDIT_ARG1:
509 case AUDIT_ARG2:
510 case AUDIT_ARG3:
511 break;
512 case AUDIT_SUBJ_USER:
513 case AUDIT_SUBJ_ROLE:
514 case AUDIT_SUBJ_TYPE:
515 case AUDIT_SUBJ_SEN:
516 case AUDIT_SUBJ_CLR:
517 case AUDIT_OBJ_USER:
518 case AUDIT_OBJ_ROLE:
519 case AUDIT_OBJ_TYPE:
520 case AUDIT_OBJ_LEV_LOW:
521 case AUDIT_OBJ_LEV_HIGH:
243 str = audit_unpack_string(&bufp, &remain, f->val); 522 str = audit_unpack_string(&bufp, &remain, f->val);
244 if (IS_ERR(str)) 523 if (IS_ERR(str))
245 goto exit_free; 524 goto exit_free;
@@ -260,6 +539,47 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
 			} else
 				f->se_str = str;
 			break;
+		case AUDIT_WATCH:
+			str = audit_unpack_string(&bufp, &remain, f->val);
+			if (IS_ERR(str))
+				goto exit_free;
+			entry->rule.buflen += f->val;
+
+			err = audit_to_watch(&entry->rule, str, f->val, f->op);
+			if (err) {
+				kfree(str);
+				goto exit_free;
+			}
+			break;
+		case AUDIT_INODE:
+			err = audit_to_inode(&entry->rule, f);
+			if (err)
+				goto exit_free;
+			break;
+		case AUDIT_FILTERKEY:
+			err = -EINVAL;
+			if (entry->rule.filterkey || f->val > AUDIT_MAX_KEY_LEN)
+				goto exit_free;
+			str = audit_unpack_string(&bufp, &remain, f->val);
+			if (IS_ERR(str))
+				goto exit_free;
+			entry->rule.buflen += f->val;
+			entry->rule.filterkey = str;
+			break;
+		default:
+			goto exit_free;
+		}
+	}
+
+	f = entry->rule.inode_f;
+	if (f) {
+		switch(f->op) {
+		case AUDIT_NOT_EQUAL:
+			entry->rule.inode_f = NULL;
+		case AUDIT_EQUAL:
+			break;
+		default:
+			goto exit_free;
 		}
 	}
 
@@ -291,7 +611,7 @@ static struct audit_rule *audit_krule_to_rule(struct audit_krule *krule)
 
 	rule = kmalloc(sizeof(*rule), GFP_KERNEL);
 	if (unlikely(!rule))
-		return ERR_PTR(-ENOMEM);
+		return NULL;
 	memset(rule, 0, sizeof(*rule));
 
 	rule->flags = krule->flags | krule->listnr;
@@ -322,7 +642,7 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule)
 
 	data = kmalloc(sizeof(*data) + krule->buflen, GFP_KERNEL);
 	if (unlikely(!data))
-		return ERR_PTR(-ENOMEM);
+		return NULL;
 	memset(data, 0, sizeof(*data));
 
 	data->flags = krule->flags | krule->listnr;
@@ -335,14 +655,27 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule)
335 data->fields[i] = f->type; 655 data->fields[i] = f->type;
336 data->fieldflags[i] = f->op; 656 data->fieldflags[i] = f->op;
337 switch(f->type) { 657 switch(f->type) {
338 case AUDIT_SE_USER: 658 case AUDIT_SUBJ_USER:
339 case AUDIT_SE_ROLE: 659 case AUDIT_SUBJ_ROLE:
340 case AUDIT_SE_TYPE: 660 case AUDIT_SUBJ_TYPE:
341 case AUDIT_SE_SEN: 661 case AUDIT_SUBJ_SEN:
342 case AUDIT_SE_CLR: 662 case AUDIT_SUBJ_CLR:
663 case AUDIT_OBJ_USER:
664 case AUDIT_OBJ_ROLE:
665 case AUDIT_OBJ_TYPE:
666 case AUDIT_OBJ_LEV_LOW:
667 case AUDIT_OBJ_LEV_HIGH:
343 data->buflen += data->values[i] = 668 data->buflen += data->values[i] =
344 audit_pack_string(&bufp, f->se_str); 669 audit_pack_string(&bufp, f->se_str);
345 break; 670 break;
671 case AUDIT_WATCH:
672 data->buflen += data->values[i] =
673 audit_pack_string(&bufp, krule->watch->path);
674 break;
675 case AUDIT_FILTERKEY:
676 data->buflen += data->values[i] =
677 audit_pack_string(&bufp, krule->filterkey);
678 break;
346 default: 679 default:
347 data->values[i] = f->val; 680 data->values[i] = f->val;
348 } 681 }
@@ -370,14 +703,28 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b)
 			return 1;
 
 		switch(a->fields[i].type) {
-		case AUDIT_SE_USER:
-		case AUDIT_SE_ROLE:
-		case AUDIT_SE_TYPE:
-		case AUDIT_SE_SEN:
-		case AUDIT_SE_CLR:
+		case AUDIT_SUBJ_USER:
+		case AUDIT_SUBJ_ROLE:
+		case AUDIT_SUBJ_TYPE:
+		case AUDIT_SUBJ_SEN:
+		case AUDIT_SUBJ_CLR:
+		case AUDIT_OBJ_USER:
+		case AUDIT_OBJ_ROLE:
+		case AUDIT_OBJ_TYPE:
+		case AUDIT_OBJ_LEV_LOW:
+		case AUDIT_OBJ_LEV_HIGH:
 			if (strcmp(a->fields[i].se_str, b->fields[i].se_str))
 				return 1;
 			break;
+		case AUDIT_WATCH:
+			if (strcmp(a->watch->path, b->watch->path))
+				return 1;
+			break;
+		case AUDIT_FILTERKEY:
+			/* both filterkeys exist based on above type compare */
+			if (strcmp(a->filterkey, b->filterkey))
+				return 1;
+			break;
 		default:
 			if (a->fields[i].val != b->fields[i].val)
 				return 1;
@@ -391,6 +738,32 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b)
 	return 0;
 }
 
+/* Duplicate the given audit watch.  The new watch's rules list is initialized
+ * to an empty list and wlist is undefined. */
+static struct audit_watch *audit_dupe_watch(struct audit_watch *old)
+{
+	char *path;
+	struct audit_watch *new;
+
+	path = kstrdup(old->path, GFP_KERNEL);
+	if (unlikely(!path))
+		return ERR_PTR(-ENOMEM);
+
+	new = audit_init_watch(path);
+	if (unlikely(IS_ERR(new))) {
+		kfree(path);
+		goto out;
+	}
+
+	new->dev = old->dev;
+	new->ino = old->ino;
+	get_inotify_watch(&old->parent->wdata);
+	new->parent = old->parent;
+
+out:
+	return new;
+}
+
 /* Duplicate selinux field information.  The se_rule is opaque, so must be
  * re-initialized. */
 static inline int audit_dupe_selinux_field(struct audit_field *df,
@@ -422,12 +795,16 @@ static inline int audit_dupe_selinux_field(struct audit_field *df,
 /* Duplicate an audit rule.  This will be a deep copy with the exception
  * of the watch - that pointer is carried over.  The selinux specific fields
  * will be updated in the copy.  The point is to be able to replace the old
- * rule with the new rule in the filterlist, then free the old rule. */
-static struct audit_entry *audit_dupe_rule(struct audit_krule *old)
+ * rule with the new rule in the filterlist, then free the old rule.
+ * The rlist element is undefined; list manipulations are handled apart from
+ * the initial copy. */
+static struct audit_entry *audit_dupe_rule(struct audit_krule *old,
+					   struct audit_watch *watch)
 {
 	u32 fcount = old->field_count;
 	struct audit_entry *entry;
 	struct audit_krule *new;
+	char *fk;
 	int i, err = 0;
 
 	entry = audit_init_entry(fcount);
@@ -442,6 +819,8 @@ static struct audit_entry *audit_dupe_rule(struct audit_krule *old)
 	for (i = 0; i < AUDIT_BITMASK_SIZE; i++)
 		new->mask[i] = old->mask[i];
 	new->buflen = old->buflen;
+	new->inode_f = old->inode_f;
+	new->watch = NULL;
 	new->field_count = old->field_count;
 	memcpy(new->fields, old->fields, sizeof(struct audit_field) * fcount);
 
@@ -449,13 +828,25 @@ static struct audit_entry *audit_dupe_rule(struct audit_krule *old)
 	 * the originals will all be freed when the old rule is freed. */
 	for (i = 0; i < fcount; i++) {
 		switch (new->fields[i].type) {
-		case AUDIT_SE_USER:
-		case AUDIT_SE_ROLE:
-		case AUDIT_SE_TYPE:
-		case AUDIT_SE_SEN:
-		case AUDIT_SE_CLR:
+		case AUDIT_SUBJ_USER:
+		case AUDIT_SUBJ_ROLE:
+		case AUDIT_SUBJ_TYPE:
+		case AUDIT_SUBJ_SEN:
+		case AUDIT_SUBJ_CLR:
+		case AUDIT_OBJ_USER:
+		case AUDIT_OBJ_ROLE:
+		case AUDIT_OBJ_TYPE:
+		case AUDIT_OBJ_LEV_LOW:
+		case AUDIT_OBJ_LEV_HIGH:
 			err = audit_dupe_selinux_field(&new->fields[i],
 						       &old->fields[i]);
+			break;
+		case AUDIT_FILTERKEY:
+			fk = kstrdup(old->filterkey, GFP_KERNEL);
+			if (unlikely(!fk))
+				err = -ENOMEM;
+			else
+				new->filterkey = fk;
 		}
 		if (err) {
 			audit_free_rule(entry);
@@ -463,68 +854,409 @@ static struct audit_entry *audit_dupe_rule(struct audit_krule *old)
463 } 854 }
464 } 855 }
465 856
857 if (watch) {
858 audit_get_watch(watch);
859 new->watch = watch;
860 }
861
466 return entry; 862 return entry;
467} 863}
468 864
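
The copy performed by audit_dupe_rule() above is easier to see outside the kernel: owned strings (the SELinux fields, the filter key) are re-duplicated, while the watch is carried over by taking an extra reference. Below is a minimal userspace sketch of that pattern; the struct layout, names and refcounting are illustrative stand-ins, not the kernel's.

/* Userspace sketch (not kernel code) of the audit_dupe_rule() copy pattern:
 * deep-copy owned strings, share the watch by bumping its reference count. */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct watch { int refcount; char path[64]; };
struct krule { char *filterkey; struct watch *watch; };

static struct watch *watch_get(struct watch *w) { w->refcount++; return w; }

static struct krule *rule_dupe(const struct krule *old)
{
        struct krule *new = calloc(1, sizeof(*new));
        if (!new)
                return NULL;
        if (old->filterkey) {
                new->filterkey = strdup(old->filterkey);  /* deep copy */
                if (!new->filterkey) {
                        free(new);
                        return NULL;
                }
        }
        if (old->watch)
                new->watch = watch_get(old->watch);       /* shared, +1 ref */
        return new;
}

int main(void)
{
        struct watch w = { .refcount = 1, .path = "/etc/passwd" };
        struct krule old = { .filterkey = strdup("login-config"), .watch = &w };
        struct krule *copy = rule_dupe(&old);

        if (!copy)
                return 1;
        printf("key=%s watch=%s refs=%d\n",
               copy->filterkey, copy->watch->path, w.refcount);
        free(copy->filterkey);
        free(copy);
        free(old.filterkey);
        return 0;
}
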
469/* Add rule to given filterlist if not a duplicate. Protected by 865/* Update inode info in audit rules based on filesystem event. */
470 * audit_netlink_mutex. */ 866static void audit_update_watch(struct audit_parent *parent,
867 const char *dname, dev_t dev,
868 unsigned long ino, unsigned invalidating)
869{
870 struct audit_watch *owatch, *nwatch, *nextw;
871 struct audit_krule *r, *nextr;
872 struct audit_entry *oentry, *nentry;
873 struct audit_buffer *ab;
874
875 mutex_lock(&audit_filter_mutex);
876 list_for_each_entry_safe(owatch, nextw, &parent->watches, wlist) {
877 if (audit_compare_dname_path(dname, owatch->path, NULL))
878 continue;
879
880 /* If the update involves invalidating rules, do the inode-based
881 * filtering now, so we don't omit records. */
882 if (invalidating &&
883 audit_filter_inodes(current, current->audit_context) == AUDIT_RECORD_CONTEXT)
884 audit_set_auditable(current->audit_context);
885
886 nwatch = audit_dupe_watch(owatch);
887 if (unlikely(IS_ERR(nwatch))) {
888 mutex_unlock(&audit_filter_mutex);
889 audit_panic("error updating watch, skipping");
890 return;
891 }
892 nwatch->dev = dev;
893 nwatch->ino = ino;
894
895 list_for_each_entry_safe(r, nextr, &owatch->rules, rlist) {
896
897 oentry = container_of(r, struct audit_entry, rule);
898 list_del(&oentry->rule.rlist);
899 list_del_rcu(&oentry->list);
900
901 nentry = audit_dupe_rule(&oentry->rule, nwatch);
902 if (unlikely(IS_ERR(nentry)))
903 audit_panic("error updating watch, removing");
904 else {
905 int h = audit_hash_ino((u32)ino);
906 list_add(&nentry->rule.rlist, &nwatch->rules);
907 list_add_rcu(&nentry->list, &audit_inode_hash[h]);
908 }
909
910 call_rcu(&oentry->rcu, audit_free_rule_rcu);
911 }
912
913 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
914 audit_log_format(ab, "audit updated rules specifying watch=");
915 audit_log_untrustedstring(ab, owatch->path);
916 audit_log_format(ab, " with dev=%u ino=%lu\n", dev, ino);
917 audit_log_end(ab);
918
919 audit_remove_watch(owatch);
920 goto add_watch_to_parent; /* event applies to a single watch */
921 }
922 mutex_unlock(&audit_filter_mutex);
923 return;
924
925add_watch_to_parent:
926 list_add(&nwatch->wlist, &parent->watches);
927 mutex_unlock(&audit_filter_mutex);
928 return;
929}
930
931/* Remove all watches & rules associated with a parent that is going away. */
932static void audit_remove_parent_watches(struct audit_parent *parent)
933{
934 struct audit_watch *w, *nextw;
935 struct audit_krule *r, *nextr;
936 struct audit_entry *e;
937
938 mutex_lock(&audit_filter_mutex);
939 parent->flags |= AUDIT_PARENT_INVALID;
940 list_for_each_entry_safe(w, nextw, &parent->watches, wlist) {
941 list_for_each_entry_safe(r, nextr, &w->rules, rlist) {
942 e = container_of(r, struct audit_entry, rule);
943 list_del(&r->rlist);
944 list_del_rcu(&e->list);
945 call_rcu(&e->rcu, audit_free_rule_rcu);
946
947 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
948 "audit implicitly removed rule from list=%d\n",
949 AUDIT_FILTER_EXIT);
950 }
951 audit_remove_watch(w);
952 }
953 mutex_unlock(&audit_filter_mutex);
954}
955
956/* Unregister inotify watches for parents on in_list.
957 * Generates an IN_IGNORED event. */
958static void audit_inotify_unregister(struct list_head *in_list)
959{
960 struct audit_parent *p, *n;
961
962 list_for_each_entry_safe(p, n, in_list, ilist) {
963 list_del(&p->ilist);
964 inotify_rm_watch(audit_ih, &p->wdata);
965 /* the put matching the get in audit_do_del_rule() */
966 put_inotify_watch(&p->wdata);
967 }
968}
969
970/* Find an existing audit rule.
971 * Caller must hold audit_filter_mutex to prevent stale rule data. */
972static struct audit_entry *audit_find_rule(struct audit_entry *entry,
973 struct list_head *list)
974{
975 struct audit_entry *e, *found = NULL;
976 int h;
977
978 if (entry->rule.watch) {
979 /* we don't know the inode number, so must walk entire hash */
980 for (h = 0; h < AUDIT_INODE_BUCKETS; h++) {
981 list = &audit_inode_hash[h];
982 list_for_each_entry(e, list, list)
983 if (!audit_compare_rule(&entry->rule, &e->rule)) {
984 found = e;
985 goto out;
986 }
987 }
988 goto out;
989 }
990
991 list_for_each_entry(e, list, list)
992 if (!audit_compare_rule(&entry->rule, &e->rule)) {
993 found = e;
994 goto out;
995 }
996
997out:
998 return found;
999}
1000
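
audit_find_rule() above distinguishes two cases: an inode-keyed rule hashes straight to one bucket of audit_inode_hash[], while a watch-based rule (inode unknown at filter time) forces a scan of every bucket. A rough userspace sketch of that idea follows; BUCKETS and hash_ino() are assumed stand-ins and need not match the kernel's audit_hash_ino().

/* Userspace sketch of single-bucket lookup vs. full-table scan. */
#include <stdio.h>

#define BUCKETS 32                      /* assumed power of two */

static unsigned hash_ino(unsigned long ino)
{
        return (unsigned)ino & (BUCKETS - 1);
}

struct rule { unsigned long ino; const char *key; };

int main(void)
{
        static struct rule table[BUCKETS];  /* toy table: one rule per bucket */
        struct rule r = { .ino = 130994, .key = "etc-watch" };
        unsigned h, i;

        table[hash_ino(r.ino)] = r;

        /* inode known: go straight to one bucket */
        h = hash_ino(130994);
        printf("bucket %u -> %s\n", h, table[h].key);

        /* inode unknown (watch rule): walk every bucket */
        for (i = 0; i < BUCKETS; i++)
                if (table[i].key)
                        printf("found %s while scanning bucket %u\n",
                               table[i].key, i);
        return 0;
}
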
1001/* Get path information necessary for adding watches. */
1002static int audit_get_nd(char *path, struct nameidata **ndp,
1003 struct nameidata **ndw)
1004{
1005 struct nameidata *ndparent, *ndwatch;
1006 int err;
1007
1008 ndparent = kmalloc(sizeof(*ndparent), GFP_KERNEL);
1009 if (unlikely(!ndparent))
1010 return -ENOMEM;
1011
1012 ndwatch = kmalloc(sizeof(*ndwatch), GFP_KERNEL);
1013 if (unlikely(!ndwatch)) {
1014 kfree(ndparent);
1015 return -ENOMEM;
1016 }
1017
1018 err = path_lookup(path, LOOKUP_PARENT, ndparent);
1019 if (err) {
1020 kfree(ndparent);
1021 kfree(ndwatch);
1022 return err;
1023 }
1024
1025 err = path_lookup(path, 0, ndwatch);
1026 if (err) {
1027 kfree(ndwatch);
1028 ndwatch = NULL;
1029 }
1030
1031 *ndp = ndparent;
1032 *ndw = ndwatch;
1033
1034 return 0;
1035}
1036
1037/* Release resources used for watch path information. */
1038static void audit_put_nd(struct nameidata *ndp, struct nameidata *ndw)
1039{
1040 if (ndp) {
1041 path_release(ndp);
1042 kfree(ndp);
1043 }
1044 if (ndw) {
1045 path_release(ndw);
1046 kfree(ndw);
1047 }
1048}
1049
1050/* Associate the given rule with an existing parent inotify_watch.
1051 * Caller must hold audit_filter_mutex. */
1052static void audit_add_to_parent(struct audit_krule *krule,
1053 struct audit_parent *parent)
1054{
1055 struct audit_watch *w, *watch = krule->watch;
1056 int watch_found = 0;
1057
1058 list_for_each_entry(w, &parent->watches, wlist) {
1059 if (strcmp(watch->path, w->path))
1060 continue;
1061
1062 watch_found = 1;
1063
1064 /* put krule's and initial refs to temporary watch */
1065 audit_put_watch(watch);
1066 audit_put_watch(watch);
1067
1068 audit_get_watch(w);
1069 krule->watch = watch = w;
1070 break;
1071 }
1072
1073 if (!watch_found) {
1074 get_inotify_watch(&parent->wdata);
1075 watch->parent = parent;
1076
1077 list_add(&watch->wlist, &parent->watches);
1078 }
1079 list_add(&krule->rlist, &watch->rules);
1080}
1081
1082/* Find a matching watch entry, or add this one.
1083 * Caller must hold audit_filter_mutex. */
1084static int audit_add_watch(struct audit_krule *krule, struct nameidata *ndp,
1085 struct nameidata *ndw)
1086{
1087 struct audit_watch *watch = krule->watch;
1088 struct inotify_watch *i_watch;
1089 struct audit_parent *parent;
1090 int ret = 0;
1091
1092 /* update watch filter fields */
1093 if (ndw) {
1094 watch->dev = ndw->dentry->d_inode->i_sb->s_dev;
1095 watch->ino = ndw->dentry->d_inode->i_ino;
1096 }
1097
1098 /* The audit_filter_mutex must not be held during inotify calls because
1099 * we hold it during inotify event callback processing. If an existing
1100 * inotify watch is found, inotify_find_watch() grabs a reference before
1101 * returning.
1102 */
1103 mutex_unlock(&audit_filter_mutex);
1104
1105 if (inotify_find_watch(audit_ih, ndp->dentry->d_inode, &i_watch) < 0) {
1106 parent = audit_init_parent(ndp);
1107 if (IS_ERR(parent)) {
1108 /* caller expects mutex locked */
1109 mutex_lock(&audit_filter_mutex);
1110 return PTR_ERR(parent);
1111 }
1112 } else
1113 parent = container_of(i_watch, struct audit_parent, wdata);
1114
1115 mutex_lock(&audit_filter_mutex);
1116
1117 /* parent was moved before we took audit_filter_mutex */
1118 if (parent->flags & AUDIT_PARENT_INVALID)
1119 ret = -ENOENT;
1120 else
1121 audit_add_to_parent(krule, parent);
1122
1123 /* match get in audit_init_parent or inotify_find_watch */
1124 put_inotify_watch(&parent->wdata);
1125 return ret;
1126}
1127
1128/* Add rule to given filterlist if not a duplicate. */
471static inline int audit_add_rule(struct audit_entry *entry, 1129static inline int audit_add_rule(struct audit_entry *entry,
472 struct list_head *list) 1130 struct list_head *list)
473{ 1131{
474 struct audit_entry *e; 1132 struct audit_entry *e;
1133 struct audit_field *inode_f = entry->rule.inode_f;
1134 struct audit_watch *watch = entry->rule.watch;
1135 struct nameidata *ndp, *ndw;
1136 int h, err, putnd_needed = 0;
1137
1138 if (inode_f) {
1139 h = audit_hash_ino(inode_f->val);
1140 list = &audit_inode_hash[h];
1141 }
475 1142
476 /* Do not use the _rcu iterator here, since this is the only 1143 mutex_lock(&audit_filter_mutex);
477 * addition routine. */ 1144 e = audit_find_rule(entry, list);
478 list_for_each_entry(e, list, list) { 1145 mutex_unlock(&audit_filter_mutex);
479 if (!audit_compare_rule(&entry->rule, &e->rule)) 1146 if (e) {
480 return -EEXIST; 1147 err = -EEXIST;
1148 goto error;
1149 }
1150
1151 /* Avoid calling path_lookup under audit_filter_mutex. */
1152 if (watch) {
1153 err = audit_get_nd(watch->path, &ndp, &ndw);
1154 if (err)
1155 goto error;
1156 putnd_needed = 1;
1157 }
1158
1159 mutex_lock(&audit_filter_mutex);
1160 if (watch) {
1161 /* audit_filter_mutex is dropped and re-taken during this call */
1162 err = audit_add_watch(&entry->rule, ndp, ndw);
1163 if (err) {
1164 mutex_unlock(&audit_filter_mutex);
1165 goto error;
1166 }
1167 h = audit_hash_ino((u32)watch->ino);
1168 list = &audit_inode_hash[h];
481 } 1169 }
482 1170
483 if (entry->rule.flags & AUDIT_FILTER_PREPEND) { 1171 if (entry->rule.flags & AUDIT_FILTER_PREPEND) {
484 list_add_rcu(&entry->list, list); 1172 list_add_rcu(&entry->list, list);
1173 entry->rule.flags &= ~AUDIT_FILTER_PREPEND;
485 } else { 1174 } else {
486 list_add_tail_rcu(&entry->list, list); 1175 list_add_tail_rcu(&entry->list, list);
487 } 1176 }
1177 mutex_unlock(&audit_filter_mutex);
488 1178
489 return 0; 1179 if (putnd_needed)
1180 audit_put_nd(ndp, ndw);
1181
1182 return 0;
1183
1184error:
1185 if (putnd_needed)
1186 audit_put_nd(ndp, ndw);
1187 if (watch)
1188 audit_put_watch(watch); /* tmp watch, matches initial get */
1189 return err;
490} 1190}
491 1191
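
audit_add_rule() above deliberately drops audit_filter_mutex around path_lookup() and the inotify calls, then re-takes it and revalidates (the AUDIT_PARENT_INVALID check in audit_add_watch()) before linking the rule. The userspace sketch below shows that drop/retake/revalidate shape with a pthread mutex; every name in it is illustrative only.

/* Userspace sketch of "unlock, do blocking work, relock, revalidate". */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t filter_lock = PTHREAD_MUTEX_INITIALIZER;
static int parent_invalid;              /* may change while the lock is dropped */

static int slow_lookup(const char *path)
{
        /* stands in for path_lookup()/inotify_find_watch(): may block, and the
         * callback side also needs filter_lock, so it must run unlocked */
        printf("looking up %s\n", path);
        return 0;
}

static int add_rule(const char *path)
{
        pthread_mutex_lock(&filter_lock);
        /* ... duplicate check would happen here ... */
        pthread_mutex_unlock(&filter_lock);

        if (slow_lookup(path))          /* blocking work, lock not held */
                return -1;

        pthread_mutex_lock(&filter_lock);
        if (parent_invalid) {           /* revalidate: state may have moved */
                pthread_mutex_unlock(&filter_lock);
                return -1;
        }
        printf("rule for %s added\n", path);
        pthread_mutex_unlock(&filter_lock);
        return 0;
}

int main(void)
{
        return add_rule("/etc/shadow") ? 1 : 0;
}
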
492/* Remove an existing rule from filterlist. Protected by 1192/* Remove an existing rule from filterlist. */
493 * audit_netlink_mutex. */
494static inline int audit_del_rule(struct audit_entry *entry, 1193static inline int audit_del_rule(struct audit_entry *entry,
495 struct list_head *list) 1194 struct list_head *list)
496{ 1195{
497 struct audit_entry *e; 1196 struct audit_entry *e;
1197 struct audit_field *inode_f = entry->rule.inode_f;
1198 struct audit_watch *watch, *tmp_watch = entry->rule.watch;
1199 LIST_HEAD(inotify_list);
1200 int h, ret = 0;
1201
1202 if (inode_f) {
1203 h = audit_hash_ino(inode_f->val);
1204 list = &audit_inode_hash[h];
1205 }
498 1206
499 /* Do not use the _rcu iterator here, since this is the only 1207 mutex_lock(&audit_filter_mutex);
500 * deletion routine. */ 1208 e = audit_find_rule(entry, list);
501 list_for_each_entry(e, list, list) { 1209 if (!e) {
502 if (!audit_compare_rule(&entry->rule, &e->rule)) { 1210 mutex_unlock(&audit_filter_mutex);
503 list_del_rcu(&e->list); 1211 ret = -ENOENT;
504 call_rcu(&e->rcu, audit_free_rule_rcu); 1212 goto out;
505 return 0; 1213 }
1214
1215 watch = e->rule.watch;
1216 if (watch) {
1217 struct audit_parent *parent = watch->parent;
1218
1219 list_del(&e->rule.rlist);
1220
1221 if (list_empty(&watch->rules)) {
1222 audit_remove_watch(watch);
1223
1224 if (list_empty(&parent->watches)) {
1225 /* Put parent on the inotify un-registration
1226 * list. Grab a reference before releasing
1227 * audit_filter_mutex, to be released in
1228 * audit_inotify_unregister(). */
1229 list_add(&parent->ilist, &inotify_list);
1230 get_inotify_watch(&parent->wdata);
1231 }
506 } 1232 }
507 } 1233 }
508 return -ENOENT; /* No matching rule */ 1234
1235 list_del_rcu(&e->list);
1236 call_rcu(&e->rcu, audit_free_rule_rcu);
1237
1238 mutex_unlock(&audit_filter_mutex);
1239
1240 if (!list_empty(&inotify_list))
1241 audit_inotify_unregister(&inotify_list);
1242
1243out:
1244 if (tmp_watch)
1245 audit_put_watch(tmp_watch); /* match initial get */
1246
1247 return ret;
509} 1248}
510 1249
511/* List rules using struct audit_rule. Exists for backward 1250/* List rules using struct audit_rule. Exists for backward
512 * compatibility with userspace. */ 1251 * compatibility with userspace. */
513static int audit_list(void *_dest) 1252static void audit_list(int pid, int seq, struct sk_buff_head *q)
514{ 1253{
515 int pid, seq; 1254 struct sk_buff *skb;
516 int *dest = _dest;
517 struct audit_entry *entry; 1255 struct audit_entry *entry;
518 int i; 1256 int i;
519 1257
520 pid = dest[0]; 1258 /* This is a blocking read, so use audit_filter_mutex instead of rcu
521 seq = dest[1]; 1259 * iterator to sync with list writers. */
522 kfree(dest);
523
524 mutex_lock(&audit_netlink_mutex);
525
526 /* The *_rcu iterators not needed here because we are
527 always called with audit_netlink_mutex held. */
528 for (i=0; i<AUDIT_NR_FILTERS; i++) { 1260 for (i=0; i<AUDIT_NR_FILTERS; i++) {
529 list_for_each_entry(entry, &audit_filter_list[i], list) { 1261 list_for_each_entry(entry, &audit_filter_list[i], list) {
530 struct audit_rule *rule; 1262 struct audit_rule *rule;
@@ -532,33 +1264,41 @@ static int audit_list(void *_dest)
532 rule = audit_krule_to_rule(&entry->rule); 1264 rule = audit_krule_to_rule(&entry->rule);
533 if (unlikely(!rule)) 1265 if (unlikely(!rule))
534 break; 1266 break;
535 audit_send_reply(pid, seq, AUDIT_LIST, 0, 1, 1267 skb = audit_make_reply(pid, seq, AUDIT_LIST, 0, 1,
536 rule, sizeof(*rule)); 1268 rule, sizeof(*rule));
1269 if (skb)
1270 skb_queue_tail(q, skb);
537 kfree(rule); 1271 kfree(rule);
538 } 1272 }
539 } 1273 }
540 audit_send_reply(pid, seq, AUDIT_LIST, 1, 1, NULL, 0); 1274 for (i = 0; i < AUDIT_INODE_BUCKETS; i++) {
541 1275 list_for_each_entry(entry, &audit_inode_hash[i], list) {
542 mutex_unlock(&audit_netlink_mutex); 1276 struct audit_rule *rule;
543 return 0; 1277
1278 rule = audit_krule_to_rule(&entry->rule);
1279 if (unlikely(!rule))
1280 break;
1281 skb = audit_make_reply(pid, seq, AUDIT_LIST, 0, 1,
1282 rule, sizeof(*rule));
1283 if (skb)
1284 skb_queue_tail(q, skb);
1285 kfree(rule);
1286 }
1287 }
1288 skb = audit_make_reply(pid, seq, AUDIT_LIST, 1, 1, NULL, 0);
1289 if (skb)
1290 skb_queue_tail(q, skb);
544} 1291}
545 1292
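
The rewritten audit_list() above no longer sends replies inline; it queues skbs while audit_filter_mutex is held and a helper kthread (audit_send_list) drains the queue afterwards, so slow netlink delivery cannot block rule writers. The userspace sketch below shows the same batch-then-drain shape with pthreads; all names in it are illustrative.

/* Userspace sketch: build the reply batch under the lock, send it from a
 * worker thread after the lock is released. */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct msg { char text[64]; struct msg *next; };
struct batch { struct msg *head, *tail; };

static pthread_mutex_t filter_lock = PTHREAD_MUTEX_INITIALIZER;

static void batch_add(struct batch *b, const char *text)
{
        struct msg *m = calloc(1, sizeof(*m));
        if (!m)
                return;
        snprintf(m->text, sizeof(m->text), "%s", text);
        if (b->tail)
                b->tail->next = m;
        else
                b->head = m;
        b->tail = m;
}

static void *send_batch(void *arg)      /* plays the role of audit_send_list */
{
        struct batch *b = arg;
        struct msg *m = b->head;

        while (m) {
                struct msg *next = m->next;
                printf("sent: %s\n", m->text);  /* the slow part runs unlocked */
                free(m);
                m = next;
        }
        free(b);
        return NULL;
}

int main(void)
{
        struct batch *b = calloc(1, sizeof(*b));
        pthread_t tid;

        if (!b)
                return 1;
        pthread_mutex_lock(&filter_lock);       /* list the rules atomically */
        batch_add(b, "rule: watch /etc/passwd");
        batch_add(b, "rule: syscall open");
        pthread_mutex_unlock(&filter_lock);

        pthread_create(&tid, NULL, send_batch, b);
        pthread_join(tid, NULL);
        return 0;
}
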
546/* List rules using struct audit_rule_data. */ 1293/* List rules using struct audit_rule_data. */
547static int audit_list_rules(void *_dest) 1294static void audit_list_rules(int pid, int seq, struct sk_buff_head *q)
548{ 1295{
549 int pid, seq; 1296 struct sk_buff *skb;
550 int *dest = _dest;
551 struct audit_entry *e; 1297 struct audit_entry *e;
552 int i; 1298 int i;
553 1299
554 pid = dest[0]; 1300 /* This is a blocking read, so use audit_filter_mutex instead of rcu
555 seq = dest[1]; 1301 * iterator to sync with list writers. */
556 kfree(dest);
557
558 mutex_lock(&audit_netlink_mutex);
559
560 /* The *_rcu iterators not needed here because we are
561 always called with audit_netlink_mutex held. */
562 for (i=0; i<AUDIT_NR_FILTERS; i++) { 1302 for (i=0; i<AUDIT_NR_FILTERS; i++) {
563 list_for_each_entry(e, &audit_filter_list[i], list) { 1303 list_for_each_entry(e, &audit_filter_list[i], list) {
564 struct audit_rule_data *data; 1304 struct audit_rule_data *data;
@@ -566,15 +1306,58 @@ static int audit_list_rules(void *_dest)
566 data = audit_krule_to_data(&e->rule); 1306 data = audit_krule_to_data(&e->rule);
567 if (unlikely(!data)) 1307 if (unlikely(!data))
568 break; 1308 break;
569 audit_send_reply(pid, seq, AUDIT_LIST_RULES, 0, 1, 1309 skb = audit_make_reply(pid, seq, AUDIT_LIST_RULES, 0, 1,
570 data, sizeof(*data)); 1310 data, sizeof(*data) + data->buflen);
1311 if (skb)
1312 skb_queue_tail(q, skb);
571 kfree(data); 1313 kfree(data);
572 } 1314 }
573 } 1315 }
574 audit_send_reply(pid, seq, AUDIT_LIST_RULES, 1, 1, NULL, 0); 1316 for (i=0; i< AUDIT_INODE_BUCKETS; i++) {
1317 list_for_each_entry(e, &audit_inode_hash[i], list) {
1318 struct audit_rule_data *data;
575 1319
576 mutex_unlock(&audit_netlink_mutex); 1320 data = audit_krule_to_data(&e->rule);
577 return 0; 1321 if (unlikely(!data))
1322 break;
1323 skb = audit_make_reply(pid, seq, AUDIT_LIST_RULES, 0, 1,
1324 data, sizeof(*data) + data->buflen);
1325 if (skb)
1326 skb_queue_tail(q, skb);
1327 kfree(data);
1328 }
1329 }
1330 skb = audit_make_reply(pid, seq, AUDIT_LIST_RULES, 1, 1, NULL, 0);
1331 if (skb)
1332 skb_queue_tail(q, skb);
1333}
1334
1335/* Log rule additions and removals */
1336static void audit_log_rule_change(uid_t loginuid, u32 sid, char *action,
1337 struct audit_krule *rule, int res)
1338{
1339 struct audit_buffer *ab;
1340
1341 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
1342 if (!ab)
1343 return;
1344 audit_log_format(ab, "auid=%u", loginuid);
1345 if (sid) {
1346 char *ctx = NULL;
1347 u32 len;
1348 if (selinux_ctxid_to_string(sid, &ctx, &len))
1349 audit_log_format(ab, " ssid=%u", sid);
1350 else
1351 audit_log_format(ab, " subj=%s", ctx);
1352 kfree(ctx);
1353 }
1354 audit_log_format(ab, " %s rule key=", action);
1355 if (rule->filterkey)
1356 audit_log_untrustedstring(ab, rule->filterkey);
1357 else
1358 audit_log_format(ab, "(null)");
1359 audit_log_format(ab, " list=%d res=%d", rule->listnr, res);
1360 audit_log_end(ab);
578} 1361}
579 1362
580/** 1363/**
@@ -592,7 +1375,7 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
592 size_t datasz, uid_t loginuid, u32 sid) 1375 size_t datasz, uid_t loginuid, u32 sid)
593{ 1376{
594 struct task_struct *tsk; 1377 struct task_struct *tsk;
595 int *dest; 1378 struct audit_netlink_list *dest;
596 int err = 0; 1379 int err = 0;
597 struct audit_entry *entry; 1380 struct audit_entry *entry;
598 1381
@@ -605,18 +1388,22 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
605 * happen if we're actually running in the context of auditctl 1388 * happen if we're actually running in the context of auditctl
606 * trying to _send_ the stuff */ 1389 * trying to _send_ the stuff */
607 1390
608 dest = kmalloc(2 * sizeof(int), GFP_KERNEL); 1391 dest = kmalloc(sizeof(struct audit_netlink_list), GFP_KERNEL);
609 if (!dest) 1392 if (!dest)
610 return -ENOMEM; 1393 return -ENOMEM;
611 dest[0] = pid; 1394 dest->pid = pid;
612 dest[1] = seq; 1395 skb_queue_head_init(&dest->q);
613 1396
1397 mutex_lock(&audit_filter_mutex);
614 if (type == AUDIT_LIST) 1398 if (type == AUDIT_LIST)
615 tsk = kthread_run(audit_list, dest, "audit_list"); 1399 audit_list(pid, seq, &dest->q);
616 else 1400 else
617 tsk = kthread_run(audit_list_rules, dest, 1401 audit_list_rules(pid, seq, &dest->q);
618 "audit_list_rules"); 1402 mutex_unlock(&audit_filter_mutex);
1403
1404 tsk = kthread_run(audit_send_list, dest, "audit_send_list");
619 if (IS_ERR(tsk)) { 1405 if (IS_ERR(tsk)) {
1406 skb_queue_purge(&dest->q);
620 kfree(dest); 1407 kfree(dest);
621 err = PTR_ERR(tsk); 1408 err = PTR_ERR(tsk);
622 } 1409 }
@@ -632,23 +1419,7 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
632 1419
633 err = audit_add_rule(entry, 1420 err = audit_add_rule(entry,
634 &audit_filter_list[entry->rule.listnr]); 1421 &audit_filter_list[entry->rule.listnr]);
635 if (sid) { 1422 audit_log_rule_change(loginuid, sid, "add", &entry->rule, !err);
636 char *ctx = NULL;
637 u32 len;
638 if (selinux_ctxid_to_string(sid, &ctx, &len)) {
639 /* Maybe call audit_panic? */
640 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
641 "auid=%u ssid=%u add rule to list=%d res=%d",
642 loginuid, sid, entry->rule.listnr, !err);
643 } else
644 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
645 "auid=%u subj=%s add rule to list=%d res=%d",
646 loginuid, ctx, entry->rule.listnr, !err);
647 kfree(ctx);
648 } else
649 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
650 "auid=%u add rule to list=%d res=%d",
651 loginuid, entry->rule.listnr, !err);
652 1423
653 if (err) 1424 if (err)
654 audit_free_rule(entry); 1425 audit_free_rule(entry);
@@ -664,24 +1435,8 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
664 1435
665 err = audit_del_rule(entry, 1436 err = audit_del_rule(entry,
666 &audit_filter_list[entry->rule.listnr]); 1437 &audit_filter_list[entry->rule.listnr]);
667 1438 audit_log_rule_change(loginuid, sid, "remove", &entry->rule,
668 if (sid) { 1439 !err);
669 char *ctx = NULL;
670 u32 len;
671 if (selinux_ctxid_to_string(sid, &ctx, &len)) {
672 /* Maybe call audit_panic? */
673 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
674 "auid=%u ssid=%u remove rule from list=%d res=%d",
675 loginuid, sid, entry->rule.listnr, !err);
676 } else
677 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
678 "auid=%u subj=%s remove rule from list=%d res=%d",
679 loginuid, ctx, entry->rule.listnr, !err);
680 kfree(ctx);
681 } else
682 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
683 "auid=%u remove rule from list=%d res=%d",
684 loginuid, entry->rule.listnr, !err);
685 1440
686 audit_free_rule(entry); 1441 audit_free_rule(entry);
687 break; 1442 break;
@@ -712,7 +1467,43 @@ int audit_comparator(const u32 left, const u32 op, const u32 right)
712 return 0; 1467 return 0;
713} 1468}
714 1469
1470/* Compare given dentry name with last component in given path,
1471 * return of 0 indicates a match. */
1472int audit_compare_dname_path(const char *dname, const char *path,
1473 int *dirlen)
1474{
1475 int dlen, plen;
1476 const char *p;
1477
1478 if (!dname || !path)
1479 return 1;
715 1480
1481 dlen = strlen(dname);
1482 plen = strlen(path);
1483 if (plen < dlen)
1484 return 1;
1485
1486 /* disregard trailing slashes */
1487 p = path + plen - 1;
1488 while ((*p == '/') && (p > path))
1489 p--;
1490
1491 /* find last path component */
1492 p = p - dlen + 1;
1493 if (p < path)
1494 return 1;
1495 else if (p > path) {
1496 if (*--p != '/')
1497 return 1;
1498 else
1499 p++;
1500 }
1501
1502 /* return length of path's directory component */
1503 if (dirlen)
1504 *dirlen = p - path;
1505 return strncmp(p, dname, dlen);
1506}
716 1507
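
audit_compare_dname_path() above is self-contained string handling, so it can be exercised directly in userspace. The port below keeps the same logic and adds a small main() showing a match, the reported directory length, and a non-match.

/* Userspace port of audit_compare_dname_path(): 0 means dname is the last
 * component of path; *dirlen (if non-NULL) receives the length of the
 * directory part, trailing slashes ignored. */
#include <stdio.h>
#include <string.h>

static int compare_dname_path(const char *dname, const char *path, int *dirlen)
{
        int dlen, plen;
        const char *p;

        if (!dname || !path)
                return 1;

        dlen = strlen(dname);
        plen = strlen(path);
        if (plen < dlen)
                return 1;

        /* disregard trailing slashes */
        p = path + plen - 1;
        while ((*p == '/') && (p > path))
                p--;

        /* find last path component */
        p = p - dlen + 1;
        if (p < path)
                return 1;
        else if (p > path) {
                if (*--p != '/')
                        return 1;
                else
                        p++;
        }

        if (dirlen)
                *dirlen = p - path;
        return strncmp(p, dname, dlen);
}

int main(void)
{
        int dirlen = 0;

        printf("%d\n", compare_dname_path("passwd", "/etc/passwd", &dirlen));   /* 0 */
        printf("dirlen=%d\n", dirlen);                                          /* 5 */
        printf("%d\n", compare_dname_path("passwd", "/etc/passwd.bak", NULL));  /* nonzero */
        return 0;
}
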
717static int audit_filter_user_rules(struct netlink_skb_parms *cb, 1508static int audit_filter_user_rules(struct netlink_skb_parms *cb,
718 struct audit_krule *rule, 1509 struct audit_krule *rule,
@@ -744,7 +1535,6 @@ static int audit_filter_user_rules(struct netlink_skb_parms *cb,
744 } 1535 }
745 switch (rule->action) { 1536 switch (rule->action) {
746 case AUDIT_NEVER: *state = AUDIT_DISABLED; break; 1537 case AUDIT_NEVER: *state = AUDIT_DISABLED; break;
747 case AUDIT_POSSIBLE: *state = AUDIT_BUILD_CONTEXT; break;
748 case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break; 1538 case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break;
749 } 1539 }
750 return 1; 1540 return 1;
@@ -806,11 +1596,16 @@ static inline int audit_rule_has_selinux(struct audit_krule *rule)
806 for (i = 0; i < rule->field_count; i++) { 1596 for (i = 0; i < rule->field_count; i++) {
807 struct audit_field *f = &rule->fields[i]; 1597 struct audit_field *f = &rule->fields[i];
808 switch (f->type) { 1598 switch (f->type) {
809 case AUDIT_SE_USER: 1599 case AUDIT_SUBJ_USER:
810 case AUDIT_SE_ROLE: 1600 case AUDIT_SUBJ_ROLE:
811 case AUDIT_SE_TYPE: 1601 case AUDIT_SUBJ_TYPE:
812 case AUDIT_SE_SEN: 1602 case AUDIT_SUBJ_SEN:
813 case AUDIT_SE_CLR: 1603 case AUDIT_SUBJ_CLR:
1604 case AUDIT_OBJ_USER:
1605 case AUDIT_OBJ_ROLE:
1606 case AUDIT_OBJ_TYPE:
1607 case AUDIT_OBJ_LEV_LOW:
1608 case AUDIT_OBJ_LEV_HIGH:
814 return 1; 1609 return 1;
815 } 1610 }
816 } 1611 }
@@ -826,32 +1621,65 @@ static inline int audit_rule_has_selinux(struct audit_krule *rule)
826int selinux_audit_rule_update(void) 1621int selinux_audit_rule_update(void)
827{ 1622{
828 struct audit_entry *entry, *n, *nentry; 1623 struct audit_entry *entry, *n, *nentry;
1624 struct audit_watch *watch;
829 int i, err = 0; 1625 int i, err = 0;
830 1626
831 /* audit_netlink_mutex synchronizes the writers */ 1627 /* audit_filter_mutex synchronizes the writers */
832 mutex_lock(&audit_netlink_mutex); 1628 mutex_lock(&audit_filter_mutex);
833 1629
834 for (i = 0; i < AUDIT_NR_FILTERS; i++) { 1630 for (i = 0; i < AUDIT_NR_FILTERS; i++) {
835 list_for_each_entry_safe(entry, n, &audit_filter_list[i], list) { 1631 list_for_each_entry_safe(entry, n, &audit_filter_list[i], list) {
836 if (!audit_rule_has_selinux(&entry->rule)) 1632 if (!audit_rule_has_selinux(&entry->rule))
837 continue; 1633 continue;
838 1634
839 nentry = audit_dupe_rule(&entry->rule); 1635 watch = entry->rule.watch;
1636 nentry = audit_dupe_rule(&entry->rule, watch);
840 if (unlikely(IS_ERR(nentry))) { 1637 if (unlikely(IS_ERR(nentry))) {
841 /* save the first error encountered for the 1638 /* save the first error encountered for the
842 * return value */ 1639 * return value */
843 if (!err) 1640 if (!err)
844 err = PTR_ERR(nentry); 1641 err = PTR_ERR(nentry);
845 audit_panic("error updating selinux filters"); 1642 audit_panic("error updating selinux filters");
1643 if (watch)
1644 list_del(&entry->rule.rlist);
846 list_del_rcu(&entry->list); 1645 list_del_rcu(&entry->list);
847 } else { 1646 } else {
1647 if (watch) {
1648 list_add(&nentry->rule.rlist,
1649 &watch->rules);
1650 list_del(&entry->rule.rlist);
1651 }
848 list_replace_rcu(&entry->list, &nentry->list); 1652 list_replace_rcu(&entry->list, &nentry->list);
849 } 1653 }
850 call_rcu(&entry->rcu, audit_free_rule_rcu); 1654 call_rcu(&entry->rcu, audit_free_rule_rcu);
851 } 1655 }
852 } 1656 }
853 1657
854 mutex_unlock(&audit_netlink_mutex); 1658 mutex_unlock(&audit_filter_mutex);
855 1659
856 return err; 1660 return err;
857} 1661}
1662
1663/* Update watch data in audit rules based on inotify events. */
1664void audit_handle_ievent(struct inotify_watch *i_watch, u32 wd, u32 mask,
1665 u32 cookie, const char *dname, struct inode *inode)
1666{
1667 struct audit_parent *parent;
1668
1669 parent = container_of(i_watch, struct audit_parent, wdata);
1670
1671 if (mask & (IN_CREATE|IN_MOVED_TO) && inode)
1672 audit_update_watch(parent, dname, inode->i_sb->s_dev,
1673 inode->i_ino, 0);
1674 else if (mask & (IN_DELETE|IN_MOVED_FROM))
1675 audit_update_watch(parent, dname, (dev_t)-1, (unsigned long)-1, 1);
1676 /* inotify automatically removes the watch and sends IN_IGNORED */
1677 else if (mask & (IN_DELETE_SELF|IN_UNMOUNT))
1678 audit_remove_parent_watches(parent);
1679 /* inotify does not remove the watch, so remove it manually */
1680 else if(mask & IN_MOVE_SELF) {
1681 audit_remove_parent_watches(parent);
1682 inotify_remove_watch_locked(audit_ih, i_watch);
1683 } else if (mask & IN_IGNORED)
1684 put_inotify_watch(i_watch);
1685}
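
audit_handle_ievent() above dispatches purely on the inotify event mask: IN_CREATE/IN_MOVED_TO update a watch, IN_DELETE/IN_MOVED_FROM invalidate it, IN_DELETE_SELF/IN_UNMOUNT/IN_MOVE_SELF tear down the parent, and IN_IGNORED drops the final reference. The same masks can be observed from userspace with the ordinary inotify(7) API; the program below is a rough demonstration of that dispatch, not kernel code.

/* Userspace sketch: watch a directory and classify one batch of events the
 * same way audit_handle_ievent() does. */
#include <stdio.h>
#include <unistd.h>
#include <sys/inotify.h>

int main(int argc, char **argv)
{
        const char *dir = argc > 1 ? argv[1] : "/tmp";
        char buf[4096];
        ssize_t len;
        char *p;
        int fd, wd;

        fd = inotify_init();
        if (fd < 0) { perror("inotify_init"); return 1; }

        wd = inotify_add_watch(fd, dir, IN_CREATE | IN_MOVED_TO |
                               IN_DELETE | IN_MOVED_FROM |
                               IN_DELETE_SELF | IN_MOVE_SELF | IN_UNMOUNT);
        if (wd < 0) { perror("inotify_add_watch"); return 1; }

        len = read(fd, buf, sizeof(buf));       /* blocks for one batch */
        if (len <= 0) { close(fd); return 1; }

        for (p = buf; p < buf + len; ) {
                struct inotify_event *ev = (struct inotify_event *)p;

                if (ev->mask & (IN_CREATE | IN_MOVED_TO))
                        printf("update watch: %s appeared\n", ev->len ? ev->name : "");
                else if (ev->mask & (IN_DELETE | IN_MOVED_FROM))
                        printf("invalidate watch: %s went away\n", ev->len ? ev->name : "");
                else if (ev->mask & (IN_DELETE_SELF | IN_UNMOUNT | IN_MOVE_SELF))
                        printf("parent gone: drop all watches\n");
                else if (ev->mask & IN_IGNORED)
                        printf("watch removed by the kernel\n");

                p += sizeof(*ev) + ev->len;
        }
        close(fd);
        return 0;
}
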
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 1c03a4ed1b27..ae40ac8c39e7 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -3,7 +3,7 @@
3 * 3 *
4 * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina. 4 * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina.
5 * Copyright 2005 Hewlett-Packard Development Company, L.P. 5 * Copyright 2005 Hewlett-Packard Development Company, L.P.
6 * Copyright (C) 2005 IBM Corporation 6 * Copyright (C) 2005, 2006 IBM Corporation
7 * All Rights Reserved. 7 * All Rights Reserved.
8 * 8 *
9 * This program is free software; you can redistribute it and/or modify 9 * This program is free software; you can redistribute it and/or modify
@@ -29,6 +29,9 @@
29 * this file -- see entry.S) is based on a GPL'd patch written by 29 * this file -- see entry.S) is based on a GPL'd patch written by
30 * okir@suse.de and Copyright 2003 SuSE Linux AG. 30 * okir@suse.de and Copyright 2003 SuSE Linux AG.
31 * 31 *
32 * POSIX message queue support added by George Wilson <ltcgcw@us.ibm.com>,
33 * 2006.
34 *
32 * The support of additional filter rules compares (>, <, >=, <=) was 35 * The support of additional filter rules compares (>, <, >=, <=) was
33 * added by Dustin Kirkland <dustin.kirkland@us.ibm.com>, 2005. 36 * added by Dustin Kirkland <dustin.kirkland@us.ibm.com>, 2005.
34 * 37 *
@@ -49,6 +52,7 @@
49#include <linux/module.h> 52#include <linux/module.h>
50#include <linux/mount.h> 53#include <linux/mount.h>
51#include <linux/socket.h> 54#include <linux/socket.h>
55#include <linux/mqueue.h>
52#include <linux/audit.h> 56#include <linux/audit.h>
53#include <linux/personality.h> 57#include <linux/personality.h>
54#include <linux/time.h> 58#include <linux/time.h>
@@ -59,6 +63,8 @@
59#include <linux/list.h> 63#include <linux/list.h>
60#include <linux/tty.h> 64#include <linux/tty.h>
61#include <linux/selinux.h> 65#include <linux/selinux.h>
66#include <linux/binfmts.h>
67#include <linux/syscalls.h>
62 68
63#include "audit.h" 69#include "audit.h"
64 70
@@ -76,6 +82,9 @@ extern int audit_enabled;
76 * path_lookup. */ 82 * path_lookup. */
77#define AUDIT_NAMES_RESERVED 7 83#define AUDIT_NAMES_RESERVED 7
78 84
85/* Indicates that audit should log the full pathname. */
86#define AUDIT_NAME_FULL -1
87
79/* When fs/namei.c:getname() is called, we store the pointer in name and 88/* When fs/namei.c:getname() is called, we store the pointer in name and
80 * we don't let putname() free it (instead we free all of the saved 89 * we don't let putname() free it (instead we free all of the saved
81 * pointers at syscall exit time). 90 * pointers at syscall exit time).
@@ -83,8 +92,9 @@ extern int audit_enabled;
83 * Further, in fs/namei.c:path_lookup() we store the inode and device. */ 92 * Further, in fs/namei.c:path_lookup() we store the inode and device. */
84struct audit_names { 93struct audit_names {
85 const char *name; 94 const char *name;
95 int name_len; /* number of name's characters to log */
96 unsigned name_put; /* call __putname() for this name */
86 unsigned long ino; 97 unsigned long ino;
87 unsigned long pino;
88 dev_t dev; 98 dev_t dev;
89 umode_t mode; 99 umode_t mode;
90 uid_t uid; 100 uid_t uid;
@@ -100,6 +110,33 @@ struct audit_aux_data {
100 110
101#define AUDIT_AUX_IPCPERM 0 111#define AUDIT_AUX_IPCPERM 0
102 112
113struct audit_aux_data_mq_open {
114 struct audit_aux_data d;
115 int oflag;
116 mode_t mode;
117 struct mq_attr attr;
118};
119
120struct audit_aux_data_mq_sendrecv {
121 struct audit_aux_data d;
122 mqd_t mqdes;
123 size_t msg_len;
124 unsigned int msg_prio;
125 struct timespec abs_timeout;
126};
127
128struct audit_aux_data_mq_notify {
129 struct audit_aux_data d;
130 mqd_t mqdes;
131 struct sigevent notification;
132};
133
134struct audit_aux_data_mq_getsetattr {
135 struct audit_aux_data d;
136 mqd_t mqdes;
137 struct mq_attr mqstat;
138};
139
103struct audit_aux_data_ipcctl { 140struct audit_aux_data_ipcctl {
104 struct audit_aux_data d; 141 struct audit_aux_data d;
105 struct ipc_perm p; 142 struct ipc_perm p;
@@ -110,6 +147,13 @@ struct audit_aux_data_ipcctl {
110 u32 osid; 147 u32 osid;
111}; 148};
112 149
150struct audit_aux_data_execve {
151 struct audit_aux_data d;
152 int argc;
153 int envc;
154 char mem[0];
155};
156
113struct audit_aux_data_socketcall { 157struct audit_aux_data_socketcall {
114 struct audit_aux_data d; 158 struct audit_aux_data d;
115 int nargs; 159 int nargs;
@@ -142,13 +186,14 @@ struct audit_context {
142 int auditable; /* 1 if record should be written */ 186 int auditable; /* 1 if record should be written */
143 int name_count; 187 int name_count;
144 struct audit_names names[AUDIT_NAMES]; 188 struct audit_names names[AUDIT_NAMES];
189 char * filterkey; /* key for rule that triggered record */
145 struct dentry * pwd; 190 struct dentry * pwd;
146 struct vfsmount * pwdmnt; 191 struct vfsmount * pwdmnt;
147 struct audit_context *previous; /* For nested syscalls */ 192 struct audit_context *previous; /* For nested syscalls */
148 struct audit_aux_data *aux; 193 struct audit_aux_data *aux;
149 194
150 /* Save things to print about task_struct */ 195 /* Save things to print about task_struct */
151 pid_t pid; 196 pid_t pid, ppid;
152 uid_t uid, euid, suid, fsuid; 197 uid_t uid, euid, suid, fsuid;
153 gid_t gid, egid, sgid, fsgid; 198 gid_t gid, egid, sgid, fsgid;
154 unsigned long personality; 199 unsigned long personality;
@@ -160,12 +205,13 @@ struct audit_context {
160#endif 205#endif
161}; 206};
162 207
163 208/* Determine if any context name data matches a rule's watch data */
164/* Compare a task_struct with an audit_rule. Return 1 on match, 0 209/* Compare a task_struct with an audit_rule. Return 1 on match, 0
165 * otherwise. */ 210 * otherwise. */
166static int audit_filter_rules(struct task_struct *tsk, 211static int audit_filter_rules(struct task_struct *tsk,
167 struct audit_krule *rule, 212 struct audit_krule *rule,
168 struct audit_context *ctx, 213 struct audit_context *ctx,
214 struct audit_names *name,
169 enum audit_state *state) 215 enum audit_state *state)
170{ 216{
171 int i, j, need_sid = 1; 217 int i, j, need_sid = 1;
@@ -179,6 +225,10 @@ static int audit_filter_rules(struct task_struct *tsk,
179 case AUDIT_PID: 225 case AUDIT_PID:
180 result = audit_comparator(tsk->pid, f->op, f->val); 226 result = audit_comparator(tsk->pid, f->op, f->val);
181 break; 227 break;
228 case AUDIT_PPID:
229 if (ctx)
230 result = audit_comparator(ctx->ppid, f->op, f->val);
231 break;
182 case AUDIT_UID: 232 case AUDIT_UID:
183 result = audit_comparator(tsk->uid, f->op, f->val); 233 result = audit_comparator(tsk->uid, f->op, f->val);
184 break; 234 break;
@@ -224,7 +274,10 @@ static int audit_filter_rules(struct task_struct *tsk,
224 } 274 }
225 break; 275 break;
226 case AUDIT_DEVMAJOR: 276 case AUDIT_DEVMAJOR:
227 if (ctx) { 277 if (name)
278 result = audit_comparator(MAJOR(name->dev),
279 f->op, f->val);
280 else if (ctx) {
228 for (j = 0; j < ctx->name_count; j++) { 281 for (j = 0; j < ctx->name_count; j++) {
229 if (audit_comparator(MAJOR(ctx->names[j].dev), f->op, f->val)) { 282 if (audit_comparator(MAJOR(ctx->names[j].dev), f->op, f->val)) {
230 ++result; 283 ++result;
@@ -234,7 +287,10 @@ static int audit_filter_rules(struct task_struct *tsk,
234 } 287 }
235 break; 288 break;
236 case AUDIT_DEVMINOR: 289 case AUDIT_DEVMINOR:
237 if (ctx) { 290 if (name)
291 result = audit_comparator(MINOR(name->dev),
292 f->op, f->val);
293 else if (ctx) {
238 for (j = 0; j < ctx->name_count; j++) { 294 for (j = 0; j < ctx->name_count; j++) {
239 if (audit_comparator(MINOR(ctx->names[j].dev), f->op, f->val)) { 295 if (audit_comparator(MINOR(ctx->names[j].dev), f->op, f->val)) {
240 ++result; 296 ++result;
@@ -244,26 +300,32 @@ static int audit_filter_rules(struct task_struct *tsk,
244 } 300 }
245 break; 301 break;
246 case AUDIT_INODE: 302 case AUDIT_INODE:
247 if (ctx) { 303 if (name)
304 result = (name->ino == f->val);
305 else if (ctx) {
248 for (j = 0; j < ctx->name_count; j++) { 306 for (j = 0; j < ctx->name_count; j++) {
249 if (audit_comparator(ctx->names[j].ino, f->op, f->val) || 307 if (audit_comparator(ctx->names[j].ino, f->op, f->val)) {
250 audit_comparator(ctx->names[j].pino, f->op, f->val)) {
251 ++result; 308 ++result;
252 break; 309 break;
253 } 310 }
254 } 311 }
255 } 312 }
256 break; 313 break;
314 case AUDIT_WATCH:
315 if (name && rule->watch->ino != (unsigned long)-1)
316 result = (name->dev == rule->watch->dev &&
317 name->ino == rule->watch->ino);
318 break;
257 case AUDIT_LOGINUID: 319 case AUDIT_LOGINUID:
258 result = 0; 320 result = 0;
259 if (ctx) 321 if (ctx)
260 result = audit_comparator(ctx->loginuid, f->op, f->val); 322 result = audit_comparator(ctx->loginuid, f->op, f->val);
261 break; 323 break;
262 case AUDIT_SE_USER: 324 case AUDIT_SUBJ_USER:
263 case AUDIT_SE_ROLE: 325 case AUDIT_SUBJ_ROLE:
264 case AUDIT_SE_TYPE: 326 case AUDIT_SUBJ_TYPE:
265 case AUDIT_SE_SEN: 327 case AUDIT_SUBJ_SEN:
266 case AUDIT_SE_CLR: 328 case AUDIT_SUBJ_CLR:
267 /* NOTE: this may return negative values indicating 329 /* NOTE: this may return negative values indicating
268 a temporary error. We simply treat this as a 330 a temporary error. We simply treat this as a
269 match for now to avoid losing information that 331 match for now to avoid losing information that
@@ -280,6 +342,46 @@ static int audit_filter_rules(struct task_struct *tsk,
280 ctx); 342 ctx);
281 } 343 }
282 break; 344 break;
345 case AUDIT_OBJ_USER:
346 case AUDIT_OBJ_ROLE:
347 case AUDIT_OBJ_TYPE:
348 case AUDIT_OBJ_LEV_LOW:
349 case AUDIT_OBJ_LEV_HIGH:
350 /* The above note for AUDIT_SUBJ_USER...AUDIT_SUBJ_CLR
351 also applies here */
352 if (f->se_rule) {
353 /* Find files that match */
354 if (name) {
355 result = selinux_audit_rule_match(
356 name->osid, f->type, f->op,
357 f->se_rule, ctx);
358 } else if (ctx) {
359 for (j = 0; j < ctx->name_count; j++) {
360 if (selinux_audit_rule_match(
361 ctx->names[j].osid,
362 f->type, f->op,
363 f->se_rule, ctx)) {
364 ++result;
365 break;
366 }
367 }
368 }
369 /* Find ipc objects that match */
370 if (ctx) {
371 struct audit_aux_data *aux;
372 for (aux = ctx->aux; aux;
373 aux = aux->next) {
374 if (aux->type == AUDIT_IPC) {
375 struct audit_aux_data_ipcctl *axi = (void *)aux;
376 if (selinux_audit_rule_match(axi->osid, f->type, f->op, f->se_rule, ctx)) {
377 ++result;
378 break;
379 }
380 }
381 }
382 }
383 }
384 break;
283 case AUDIT_ARG0: 385 case AUDIT_ARG0:
284 case AUDIT_ARG1: 386 case AUDIT_ARG1:
285 case AUDIT_ARG2: 387 case AUDIT_ARG2:
@@ -287,14 +389,19 @@ static int audit_filter_rules(struct task_struct *tsk,
287 if (ctx) 389 if (ctx)
288 result = audit_comparator(ctx->argv[f->type-AUDIT_ARG0], f->op, f->val); 390 result = audit_comparator(ctx->argv[f->type-AUDIT_ARG0], f->op, f->val);
289 break; 391 break;
392 case AUDIT_FILTERKEY:
393 /* ignore this field for filtering */
394 result = 1;
395 break;
290 } 396 }
291 397
292 if (!result) 398 if (!result)
293 return 0; 399 return 0;
294 } 400 }
401 if (rule->filterkey)
402 ctx->filterkey = kstrdup(rule->filterkey, GFP_ATOMIC);
295 switch (rule->action) { 403 switch (rule->action) {
296 case AUDIT_NEVER: *state = AUDIT_DISABLED; break; 404 case AUDIT_NEVER: *state = AUDIT_DISABLED; break;
297 case AUDIT_POSSIBLE: *state = AUDIT_BUILD_CONTEXT; break;
298 case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break; 405 case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break;
299 } 406 }
300 return 1; 407 return 1;
@@ -311,7 +418,7 @@ static enum audit_state audit_filter_task(struct task_struct *tsk)
311 418
312 rcu_read_lock(); 419 rcu_read_lock();
313 list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_TASK], list) { 420 list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_TASK], list) {
314 if (audit_filter_rules(tsk, &e->rule, NULL, &state)) { 421 if (audit_filter_rules(tsk, &e->rule, NULL, NULL, &state)) {
315 rcu_read_unlock(); 422 rcu_read_unlock();
316 return state; 423 return state;
317 } 424 }
@@ -341,8 +448,47 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk,
341 int bit = AUDIT_BIT(ctx->major); 448 int bit = AUDIT_BIT(ctx->major);
342 449
343 list_for_each_entry_rcu(e, list, list) { 450 list_for_each_entry_rcu(e, list, list) {
344 if ((e->rule.mask[word] & bit) == bit 451 if ((e->rule.mask[word] & bit) == bit &&
345 && audit_filter_rules(tsk, &e->rule, ctx, &state)) { 452 audit_filter_rules(tsk, &e->rule, ctx, NULL,
453 &state)) {
454 rcu_read_unlock();
455 return state;
456 }
457 }
458 }
459 rcu_read_unlock();
460 return AUDIT_BUILD_CONTEXT;
461}
462
463/* At syscall exit time, this filter is called if any audit_names[] have been
464 * collected during syscall processing. We only check rules in sublists at hash
465 * buckets applicable to the inode numbers in audit_names[].
466 * Regarding audit_state, same rules apply as for audit_filter_syscall().
467 */
468enum audit_state audit_filter_inodes(struct task_struct *tsk,
469 struct audit_context *ctx)
470{
471 int i;
472 struct audit_entry *e;
473 enum audit_state state;
474
475 if (audit_pid && tsk->tgid == audit_pid)
476 return AUDIT_DISABLED;
477
478 rcu_read_lock();
479 for (i = 0; i < ctx->name_count; i++) {
480 int word = AUDIT_WORD(ctx->major);
481 int bit = AUDIT_BIT(ctx->major);
482 struct audit_names *n = &ctx->names[i];
483 int h = audit_hash_ino((u32)n->ino);
484 struct list_head *list = &audit_inode_hash[h];
485
486 if (list_empty(list))
487 continue;
488
489 list_for_each_entry_rcu(e, list, list) {
490 if ((e->rule.mask[word] & bit) == bit &&
491 audit_filter_rules(tsk, &e->rule, ctx, n, &state)) {
346 rcu_read_unlock(); 492 rcu_read_unlock();
347 return state; 493 return state;
348 } 494 }
@@ -352,6 +498,11 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk,
352 return AUDIT_BUILD_CONTEXT; 498 return AUDIT_BUILD_CONTEXT;
353} 499}
354 500
501void audit_set_auditable(struct audit_context *ctx)
502{
503 ctx->auditable = 1;
504}
505
355static inline struct audit_context *audit_get_context(struct task_struct *tsk, 506static inline struct audit_context *audit_get_context(struct task_struct *tsk,
356 int return_valid, 507 int return_valid,
357 int return_code) 508 int return_code)
@@ -365,12 +516,22 @@ static inline struct audit_context *audit_get_context(struct task_struct *tsk,
365 516
366 if (context->in_syscall && !context->auditable) { 517 if (context->in_syscall && !context->auditable) {
367 enum audit_state state; 518 enum audit_state state;
519
368 state = audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_EXIT]); 520 state = audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_EXIT]);
521 if (state == AUDIT_RECORD_CONTEXT) {
522 context->auditable = 1;
523 goto get_context;
524 }
525
526 state = audit_filter_inodes(tsk, context);
369 if (state == AUDIT_RECORD_CONTEXT) 527 if (state == AUDIT_RECORD_CONTEXT)
370 context->auditable = 1; 528 context->auditable = 1;
529
371 } 530 }
372 531
532get_context:
373 context->pid = tsk->pid; 533 context->pid = tsk->pid;
534 context->ppid = sys_getppid(); /* sic. tsk == current in all cases */
374 context->uid = tsk->uid; 535 context->uid = tsk->uid;
375 context->gid = tsk->gid; 536 context->gid = tsk->gid;
376 context->euid = tsk->euid; 537 context->euid = tsk->euid;
@@ -413,7 +574,7 @@ static inline void audit_free_names(struct audit_context *context)
413#endif 574#endif
414 575
415 for (i = 0; i < context->name_count; i++) { 576 for (i = 0; i < context->name_count; i++) {
416 if (context->names[i].name) 577 if (context->names[i].name && context->names[i].name_put)
417 __putname(context->names[i].name); 578 __putname(context->names[i].name);
418 } 579 }
419 context->name_count = 0; 580 context->name_count = 0;
@@ -513,6 +674,7 @@ static inline void audit_free_context(struct audit_context *context)
513 } 674 }
514 audit_free_names(context); 675 audit_free_names(context);
515 audit_free_aux(context); 676 audit_free_aux(context);
677 kfree(context->filterkey);
516 kfree(context); 678 kfree(context);
517 context = previous; 679 context = previous;
518 } while (context); 680 } while (context);
@@ -544,8 +706,7 @@ static void audit_log_task_context(struct audit_buffer *ab)
544 return; 706 return;
545 707
546error_path: 708error_path:
547 if (ctx) 709 kfree(ctx);
548 kfree(ctx);
549 audit_panic("error in audit_log_task_context"); 710 audit_panic("error in audit_log_task_context");
550 return; 711 return;
551} 712}
@@ -606,7 +767,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
606 tty = "(none)"; 767 tty = "(none)";
607 audit_log_format(ab, 768 audit_log_format(ab,
608 " a0=%lx a1=%lx a2=%lx a3=%lx items=%d" 769 " a0=%lx a1=%lx a2=%lx a3=%lx items=%d"
609 " pid=%d auid=%u uid=%u gid=%u" 770 " ppid=%d pid=%d auid=%u uid=%u gid=%u"
610 " euid=%u suid=%u fsuid=%u" 771 " euid=%u suid=%u fsuid=%u"
611 " egid=%u sgid=%u fsgid=%u tty=%s", 772 " egid=%u sgid=%u fsgid=%u tty=%s",
612 context->argv[0], 773 context->argv[0],
@@ -614,6 +775,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
614 context->argv[2], 775 context->argv[2],
615 context->argv[3], 776 context->argv[3],
616 context->name_count, 777 context->name_count,
778 context->ppid,
617 context->pid, 779 context->pid,
618 context->loginuid, 780 context->loginuid,
619 context->uid, 781 context->uid,
@@ -621,6 +783,11 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
621 context->euid, context->suid, context->fsuid, 783 context->euid, context->suid, context->fsuid,
622 context->egid, context->sgid, context->fsgid, tty); 784 context->egid, context->sgid, context->fsgid, tty);
623 audit_log_task_info(ab, tsk); 785 audit_log_task_info(ab, tsk);
786 if (context->filterkey) {
787 audit_log_format(ab, " key=");
788 audit_log_untrustedstring(ab, context->filterkey);
789 } else
790 audit_log_format(ab, " key=(null)");
624 audit_log_end(ab); 791 audit_log_end(ab);
625 792
626 for (aux = context->aux; aux; aux = aux->next) { 793 for (aux = context->aux; aux; aux = aux->next) {
@@ -630,11 +797,48 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
630 continue; /* audit_panic has been called */ 797 continue; /* audit_panic has been called */
631 798
632 switch (aux->type) { 799 switch (aux->type) {
800 case AUDIT_MQ_OPEN: {
801 struct audit_aux_data_mq_open *axi = (void *)aux;
802 audit_log_format(ab,
803 "oflag=0x%x mode=%#o mq_flags=0x%lx mq_maxmsg=%ld "
804 "mq_msgsize=%ld mq_curmsgs=%ld",
805 axi->oflag, axi->mode, axi->attr.mq_flags,
806 axi->attr.mq_maxmsg, axi->attr.mq_msgsize,
807 axi->attr.mq_curmsgs);
808 break; }
809
810 case AUDIT_MQ_SENDRECV: {
811 struct audit_aux_data_mq_sendrecv *axi = (void *)aux;
812 audit_log_format(ab,
813 "mqdes=%d msg_len=%zd msg_prio=%u "
814 "abs_timeout_sec=%ld abs_timeout_nsec=%ld",
815 axi->mqdes, axi->msg_len, axi->msg_prio,
816 axi->abs_timeout.tv_sec, axi->abs_timeout.tv_nsec);
817 break; }
818
819 case AUDIT_MQ_NOTIFY: {
820 struct audit_aux_data_mq_notify *axi = (void *)aux;
821 audit_log_format(ab,
822 "mqdes=%d sigev_signo=%d",
823 axi->mqdes,
824 axi->notification.sigev_signo);
825 break; }
826
827 case AUDIT_MQ_GETSETATTR: {
828 struct audit_aux_data_mq_getsetattr *axi = (void *)aux;
829 audit_log_format(ab,
830 "mqdes=%d mq_flags=0x%lx mq_maxmsg=%ld mq_msgsize=%ld "
831 "mq_curmsgs=%ld ",
832 axi->mqdes,
833 axi->mqstat.mq_flags, axi->mqstat.mq_maxmsg,
834 axi->mqstat.mq_msgsize, axi->mqstat.mq_curmsgs);
835 break; }
836
633 case AUDIT_IPC: { 837 case AUDIT_IPC: {
634 struct audit_aux_data_ipcctl *axi = (void *)aux; 838 struct audit_aux_data_ipcctl *axi = (void *)aux;
635 audit_log_format(ab, 839 audit_log_format(ab,
636 " qbytes=%lx iuid=%u igid=%u mode=%x", 840 "ouid=%u ogid=%u mode=%x",
637 axi->qbytes, axi->uid, axi->gid, axi->mode); 841 axi->uid, axi->gid, axi->mode);
638 if (axi->osid != 0) { 842 if (axi->osid != 0) {
639 char *ctx = NULL; 843 char *ctx = NULL;
640 u32 len; 844 u32 len;
@@ -652,19 +856,18 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
652 case AUDIT_IPC_SET_PERM: { 856 case AUDIT_IPC_SET_PERM: {
653 struct audit_aux_data_ipcctl *axi = (void *)aux; 857 struct audit_aux_data_ipcctl *axi = (void *)aux;
654 audit_log_format(ab, 858 audit_log_format(ab,
655 " new qbytes=%lx new iuid=%u new igid=%u new mode=%x", 859 "qbytes=%lx ouid=%u ogid=%u mode=%x",
656 axi->qbytes, axi->uid, axi->gid, axi->mode); 860 axi->qbytes, axi->uid, axi->gid, axi->mode);
657 if (axi->osid != 0) { 861 break; }
658 char *ctx = NULL; 862
659 u32 len; 863 case AUDIT_EXECVE: {
660 if (selinux_ctxid_to_string( 864 struct audit_aux_data_execve *axi = (void *)aux;
661 axi->osid, &ctx, &len)) { 865 int i;
662 audit_log_format(ab, " osid=%u", 866 const char *p;
663 axi->osid); 867 for (i = 0, p = axi->mem; i < axi->argc; i++) {
664 call_panic = 1; 868 audit_log_format(ab, "a%d=", i);
665 } else 869 p = audit_log_untrustedstring(ab, p);
666 audit_log_format(ab, " obj=%s", ctx); 870 audit_log_format(ab, "\n");
667 kfree(ctx);
668 } 871 }
669 break; } 872 break; }
670 873
@@ -700,8 +903,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
700 } 903 }
701 } 904 }
702 for (i = 0; i < context->name_count; i++) { 905 for (i = 0; i < context->name_count; i++) {
703 unsigned long ino = context->names[i].ino; 906 struct audit_names *n = &context->names[i];
704 unsigned long pino = context->names[i].pino;
705 907
706 ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH); 908 ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH);
707 if (!ab) 909 if (!ab)
@@ -709,33 +911,47 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
709 911
710 audit_log_format(ab, "item=%d", i); 912 audit_log_format(ab, "item=%d", i);
711 913
712 audit_log_format(ab, " name="); 914 if (n->name) {
713 if (context->names[i].name) 915 switch(n->name_len) {
714 audit_log_untrustedstring(ab, context->names[i].name); 916 case AUDIT_NAME_FULL:
715 else 917 /* log the full path */
716 audit_log_format(ab, "(null)"); 918 audit_log_format(ab, " name=");
717 919 audit_log_untrustedstring(ab, n->name);
718 if (pino != (unsigned long)-1) 920 break;
719 audit_log_format(ab, " parent=%lu", pino); 921 case 0:
720 if (ino != (unsigned long)-1) 922 /* name was specified as a relative path and the
721 audit_log_format(ab, " inode=%lu", ino); 923 * directory component is the cwd */
722 if ((pino != (unsigned long)-1) || (ino != (unsigned long)-1)) 924 audit_log_d_path(ab, " name=", context->pwd,
723 audit_log_format(ab, " dev=%02x:%02x mode=%#o" 925 context->pwdmnt);
724 " ouid=%u ogid=%u rdev=%02x:%02x", 926 break;
725 MAJOR(context->names[i].dev), 927 default:
726 MINOR(context->names[i].dev), 928 /* log the name's directory component */
727 context->names[i].mode, 929 audit_log_format(ab, " name=");
728 context->names[i].uid, 930 audit_log_n_untrustedstring(ab, n->name_len,
729 context->names[i].gid, 931 n->name);
730 MAJOR(context->names[i].rdev), 932 }
731 MINOR(context->names[i].rdev)); 933 } else
732 if (context->names[i].osid != 0) { 934 audit_log_format(ab, " name=(null)");
935
936 if (n->ino != (unsigned long)-1) {
937 audit_log_format(ab, " inode=%lu"
938 " dev=%02x:%02x mode=%#o"
939 " ouid=%u ogid=%u rdev=%02x:%02x",
940 n->ino,
941 MAJOR(n->dev),
942 MINOR(n->dev),
943 n->mode,
944 n->uid,
945 n->gid,
946 MAJOR(n->rdev),
947 MINOR(n->rdev));
948 }
949 if (n->osid != 0) {
733 char *ctx = NULL; 950 char *ctx = NULL;
734 u32 len; 951 u32 len;
735 if (selinux_ctxid_to_string( 952 if (selinux_ctxid_to_string(
736 context->names[i].osid, &ctx, &len)) { 953 n->osid, &ctx, &len)) {
737 audit_log_format(ab, " osid=%u", 954 audit_log_format(ab, " osid=%u", n->osid);
738 context->names[i].osid);
739 call_panic = 2; 955 call_panic = 2;
740 } else 956 } else
741 audit_log_format(ab, " obj=%s", ctx); 957 audit_log_format(ab, " obj=%s", ctx);
@@ -897,6 +1113,8 @@ void audit_syscall_exit(int valid, long return_code)
897 } else { 1113 } else {
898 audit_free_names(context); 1114 audit_free_names(context);
899 audit_free_aux(context); 1115 audit_free_aux(context);
1116 kfree(context->filterkey);
1117 context->filterkey = NULL;
900 tsk->audit_context = context; 1118 tsk->audit_context = context;
901 } 1119 }
902} 1120}
@@ -908,11 +1126,11 @@ void audit_syscall_exit(int valid, long return_code)
908 * Add a name to the list of audit names for this context. 1126 * Add a name to the list of audit names for this context.
909 * Called from fs/namei.c:getname(). 1127 * Called from fs/namei.c:getname().
910 */ 1128 */
911void audit_getname(const char *name) 1129void __audit_getname(const char *name)
912{ 1130{
913 struct audit_context *context = current->audit_context; 1131 struct audit_context *context = current->audit_context;
914 1132
915 if (!context || IS_ERR(name) || !name) 1133 if (IS_ERR(name) || !name)
916 return; 1134 return;
917 1135
918 if (!context->in_syscall) { 1136 if (!context->in_syscall) {
@@ -925,6 +1143,8 @@ void audit_getname(const char *name)
925 } 1143 }
926 BUG_ON(context->name_count >= AUDIT_NAMES); 1144 BUG_ON(context->name_count >= AUDIT_NAMES);
927 context->names[context->name_count].name = name; 1145 context->names[context->name_count].name = name;
1146 context->names[context->name_count].name_len = AUDIT_NAME_FULL;
1147 context->names[context->name_count].name_put = 1;
928 context->names[context->name_count].ino = (unsigned long)-1; 1148 context->names[context->name_count].ino = (unsigned long)-1;
929 ++context->name_count; 1149 ++context->name_count;
930 if (!context->pwd) { 1150 if (!context->pwd) {
@@ -991,11 +1211,10 @@ static void audit_inode_context(int idx, const struct inode *inode)
991 * audit_inode - store the inode and device from a lookup 1211 * audit_inode - store the inode and device from a lookup
992 * @name: name being audited 1212 * @name: name being audited
993 * @inode: inode being audited 1213 * @inode: inode being audited
994 * @flags: lookup flags (as used in path_lookup())
995 * 1214 *
996 * Called from fs/namei.c:path_lookup(). 1215 * Called from fs/namei.c:path_lookup().
997 */ 1216 */
998void __audit_inode(const char *name, const struct inode *inode, unsigned flags) 1217void __audit_inode(const char *name, const struct inode *inode)
999{ 1218{
1000 int idx; 1219 int idx;
1001 struct audit_context *context = current->audit_context; 1220 struct audit_context *context = current->audit_context;
@@ -1021,20 +1240,13 @@ void __audit_inode(const char *name, const struct inode *inode, unsigned flags)
1021 ++context->ino_count; 1240 ++context->ino_count;
1022#endif 1241#endif
1023 } 1242 }
1243 context->names[idx].ino = inode->i_ino;
1024 context->names[idx].dev = inode->i_sb->s_dev; 1244 context->names[idx].dev = inode->i_sb->s_dev;
1025 context->names[idx].mode = inode->i_mode; 1245 context->names[idx].mode = inode->i_mode;
1026 context->names[idx].uid = inode->i_uid; 1246 context->names[idx].uid = inode->i_uid;
1027 context->names[idx].gid = inode->i_gid; 1247 context->names[idx].gid = inode->i_gid;
1028 context->names[idx].rdev = inode->i_rdev; 1248 context->names[idx].rdev = inode->i_rdev;
1029 audit_inode_context(idx, inode); 1249 audit_inode_context(idx, inode);
1030 if ((flags & LOOKUP_PARENT) && (strcmp(name, "/") != 0) &&
1031 (strcmp(name, ".") != 0)) {
1032 context->names[idx].ino = (unsigned long)-1;
1033 context->names[idx].pino = inode->i_ino;
1034 } else {
1035 context->names[idx].ino = inode->i_ino;
1036 context->names[idx].pino = (unsigned long)-1;
1037 }
1038} 1250}
1039 1251
1040/** 1252/**
@@ -1056,51 +1268,40 @@ void __audit_inode_child(const char *dname, const struct inode *inode,
1056{ 1268{
1057 int idx; 1269 int idx;
1058 struct audit_context *context = current->audit_context; 1270 struct audit_context *context = current->audit_context;
1271 const char *found_name = NULL;
1272 int dirlen = 0;
1059 1273
1060 if (!context->in_syscall) 1274 if (!context->in_syscall)
1061 return; 1275 return;
1062 1276
1063 /* determine matching parent */ 1277 /* determine matching parent */
1064 if (dname) 1278 if (!dname)
1065 for (idx = 0; idx < context->name_count; idx++) 1279 goto update_context;
1066 if (context->names[idx].pino == pino) { 1280 for (idx = 0; idx < context->name_count; idx++)
1067 const char *n; 1281 if (context->names[idx].ino == pino) {
1068 const char *name = context->names[idx].name; 1282 const char *name = context->names[idx].name;
1069 int dlen = strlen(dname); 1283
1070 int nlen = name ? strlen(name) : 0; 1284 if (!name)
1071 1285 continue;
1072 if (nlen < dlen) 1286
1073 continue; 1287 if (audit_compare_dname_path(dname, name, &dirlen) == 0) {
1074 1288 context->names[idx].name_len = dirlen;
1075 /* disregard trailing slashes */ 1289 found_name = name;
1076 n = name + nlen - 1; 1290 break;
1077 while ((*n == '/') && (n > name))
1078 n--;
1079
1080 /* find last path component */
1081 n = n - dlen + 1;
1082 if (n < name)
1083 continue;
1084 else if (n > name) {
1085 if (*--n != '/')
1086 continue;
1087 else
1088 n++;
1089 }
1090
1091 if (strncmp(n, dname, dlen) == 0)
1092 goto update_context;
1093 } 1291 }
1292 }
1094 1293
1095 /* catch-all in case match not found */ 1294update_context:
1096 idx = context->name_count++; 1295 idx = context->name_count++;
1097 context->names[idx].name = NULL;
1098 context->names[idx].pino = pino;
1099#if AUDIT_DEBUG 1296#if AUDIT_DEBUG
1100 context->ino_count++; 1297 context->ino_count++;
1101#endif 1298#endif
1299 /* Re-use the name belonging to the slot for a matching parent directory.
1300 * All names for this context are relinquished in audit_free_names() */
1301 context->names[idx].name = found_name;
1302 context->names[idx].name_len = AUDIT_NAME_FULL;
1303 context->names[idx].name_put = 0; /* don't call __putname() */
1102 1304
1103update_context:
1104 if (inode) { 1305 if (inode) {
1105 context->names[idx].ino = inode->i_ino; 1306 context->names[idx].ino = inode->i_ino;
1106 context->names[idx].dev = inode->i_sb->s_dev; 1307 context->names[idx].dev = inode->i_sb->s_dev;
@@ -1109,7 +1310,8 @@ update_context:
1109 context->names[idx].gid = inode->i_gid; 1310 context->names[idx].gid = inode->i_gid;
1110 context->names[idx].rdev = inode->i_rdev; 1311 context->names[idx].rdev = inode->i_rdev;
1111 audit_inode_context(idx, inode); 1312 audit_inode_context(idx, inode);
1112 } 1313 } else
1314 context->names[idx].ino = (unsigned long)-1;
1113} 1315}
1114 1316
1115/** 1317/**
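The rewritten __audit_inode_child() above hands the open-coded suffix match (strip trailing slashes, isolate the last path component, compare it with dname) to audit_compare_dname_path(). Below is a small userspace sketch of that comparison, reconstructed from the removed lines rather than from the helper's actual source; the function and variable names are illustrative only, and the dirlen out-parameter mirrors the name_len assignment at the call site above.

#include <stdio.h>
#include <string.h>

/* Illustrative userspace version of the comparison done above: return 0
 * when the last component of 'path' equals 'dname', and store the length
 * of the parent-directory prefix in *dirlen. */
static int compare_dname_path(const char *dname, const char *path, int *dirlen)
{
	int dlen = strlen(dname);
	int plen = strlen(path);
	const char *p = path + plen - 1;

	if (!plen || plen < dlen)
		return 1;

	/* disregard trailing slashes */
	while (p > path && *p == '/')
		p--;

	/* locate the start of the last path component */
	p = p - dlen + 1;
	if (p < path)
		return 1;
	if (p > path && *(p - 1) != '/')
		return 1;

	if (strncmp(p, dname, dlen) != 0)
		return 1;

	*dirlen = p - path;
	return 0;
}

int main(void)
{
	int dirlen;

	if (compare_dname_path("passwd", "/etc/passwd", &dirlen) == 0)
		printf("matched, parent prefix is %d chars\n", dirlen);	/* 5 */
	return 0;
}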
@@ -1142,18 +1344,23 @@ void auditsc_get_stamp(struct audit_context *ctx,
1142 */ 1344 */
1143int audit_set_loginuid(struct task_struct *task, uid_t loginuid) 1345int audit_set_loginuid(struct task_struct *task, uid_t loginuid)
1144{ 1346{
1145 if (task->audit_context) { 1347 struct audit_context *context = task->audit_context;
1146 struct audit_buffer *ab; 1348
1147 1349 if (context) {
1148 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN); 1350 /* Only log if audit is enabled */
1149 if (ab) { 1351 if (context->in_syscall) {
1150 audit_log_format(ab, "login pid=%d uid=%u " 1352 struct audit_buffer *ab;
1151 "old auid=%u new auid=%u", 1353
1152 task->pid, task->uid, 1354 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN);
1153 task->audit_context->loginuid, loginuid); 1355 if (ab) {
1154 audit_log_end(ab); 1356 audit_log_format(ab, "login pid=%d uid=%u "
1357 "old auid=%u new auid=%u",
1358 task->pid, task->uid,
1359 context->loginuid, loginuid);
1360 audit_log_end(ab);
1361 }
1155 } 1362 }
1156 task->audit_context->loginuid = loginuid; 1363 context->loginuid = loginuid;
1157 } 1364 }
1158 return 0; 1365 return 0;
1159} 1366}
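audit_set_loginuid() is normally reached through procfs rather than called directly: pam_loginuid and auditd set the login uid by writing it to /proc/<pid>/loginuid. A minimal sketch of that write, assuming audit is enabled and the caller has the required privilege; the uid value is illustrative.

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

/* Set the audit login uid the way pam_loginuid does: write the decimal uid
 * to /proc/self/loginuid, which lands in audit_set_loginuid() above and,
 * from syscall context, emits the AUDIT_LOGIN record. */
static int set_loginuid(unsigned int uid)
{
	char buf[32];
	int fd, len, ret = -1;

	fd = open("/proc/self/loginuid", O_WRONLY);
	if (fd < 0)
		return -1;

	len = snprintf(buf, sizeof(buf), "%u", uid);
	if (write(fd, buf, len) == len)
		ret = 0;

	close(fd);
	return ret;
}

int main(void)
{
	if (set_loginuid(1000))		/* uid value is illustrative */
		perror("loginuid");
	return 0;
}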
@@ -1170,16 +1377,193 @@ uid_t audit_get_loginuid(struct audit_context *ctx)
1170} 1377}
1171 1378
1172/** 1379/**
1173 * audit_ipc_obj - record audit data for ipc object 1380 * __audit_mq_open - record audit data for a POSIX MQ open
1174 * @ipcp: ipc permissions 1381 * @oflag: open flag
1382 * @mode: mode bits
1383 * @u_attr: queue attributes
1175 * 1384 *
1176 * Returns 0 for success or NULL context or < 0 on error. 1385 * Returns 0 for success or NULL context or < 0 on error.
1177 */ 1386 */
1178int audit_ipc_obj(struct kern_ipc_perm *ipcp) 1387int __audit_mq_open(int oflag, mode_t mode, struct mq_attr __user *u_attr)
1179{ 1388{
1180 struct audit_aux_data_ipcctl *ax; 1389 struct audit_aux_data_mq_open *ax;
1390 struct audit_context *context = current->audit_context;
1391
1392 if (!audit_enabled)
1393 return 0;
1394
1395 if (likely(!context))
1396 return 0;
1397
1398 ax = kmalloc(sizeof(*ax), GFP_ATOMIC);
1399 if (!ax)
1400 return -ENOMEM;
1401
1402 if (u_attr != NULL) {
1403 if (copy_from_user(&ax->attr, u_attr, sizeof(ax->attr))) {
1404 kfree(ax);
1405 return -EFAULT;
1406 }
1407 } else
1408 memset(&ax->attr, 0, sizeof(ax->attr));
1409
1410 ax->oflag = oflag;
1411 ax->mode = mode;
1412
1413 ax->d.type = AUDIT_MQ_OPEN;
1414 ax->d.next = context->aux;
1415 context->aux = (void *)ax;
1416 return 0;
1417}
1418
1419/**
1420 * __audit_mq_timedsend - record audit data for a POSIX MQ timed send
1421 * @mqdes: MQ descriptor
1422 * @msg_len: Message length
1423 * @msg_prio: Message priority
1424 * @u_abs_timeout: Message timeout in absolute time
1425 *
1426 * Returns 0 for success or NULL context or < 0 on error.
1427 */
1428int __audit_mq_timedsend(mqd_t mqdes, size_t msg_len, unsigned int msg_prio,
1429 const struct timespec __user *u_abs_timeout)
1430{
1431 struct audit_aux_data_mq_sendrecv *ax;
1432 struct audit_context *context = current->audit_context;
1433
1434 if (!audit_enabled)
1435 return 0;
1436
1437 if (likely(!context))
1438 return 0;
1439
1440 ax = kmalloc(sizeof(*ax), GFP_ATOMIC);
1441 if (!ax)
1442 return -ENOMEM;
1443
1444 if (u_abs_timeout != NULL) {
1445 if (copy_from_user(&ax->abs_timeout, u_abs_timeout, sizeof(ax->abs_timeout))) {
1446 kfree(ax);
1447 return -EFAULT;
1448 }
1449 } else
1450 memset(&ax->abs_timeout, 0, sizeof(ax->abs_timeout));
1451
1452 ax->mqdes = mqdes;
1453 ax->msg_len = msg_len;
1454 ax->msg_prio = msg_prio;
1455
1456 ax->d.type = AUDIT_MQ_SENDRECV;
1457 ax->d.next = context->aux;
1458 context->aux = (void *)ax;
1459 return 0;
1460}
1461
1462/**
1463 * __audit_mq_timedreceive - record audit data for a POSIX MQ timed receive
1464 * @mqdes: MQ descriptor
1465 * @msg_len: Message length
1466 * @u_msg_prio: Message priority
1467 * @u_abs_timeout: Message timeout in absolute time
1468 *
1469 * Returns 0 for success or NULL context or < 0 on error.
1470 */
1471int __audit_mq_timedreceive(mqd_t mqdes, size_t msg_len,
1472 unsigned int __user *u_msg_prio,
1473 const struct timespec __user *u_abs_timeout)
1474{
1475 struct audit_aux_data_mq_sendrecv *ax;
1476 struct audit_context *context = current->audit_context;
1477
1478 if (!audit_enabled)
1479 return 0;
1480
1481 if (likely(!context))
1482 return 0;
1483
1484 ax = kmalloc(sizeof(*ax), GFP_ATOMIC);
1485 if (!ax)
1486 return -ENOMEM;
1487
1488 if (u_msg_prio != NULL) {
1489 if (get_user(ax->msg_prio, u_msg_prio)) {
1490 kfree(ax);
1491 return -EFAULT;
1492 }
1493 } else
1494 ax->msg_prio = 0;
1495
1496 if (u_abs_timeout != NULL) {
1497 if (copy_from_user(&ax->abs_timeout, u_abs_timeout, sizeof(ax->abs_timeout))) {
1498 kfree(ax);
1499 return -EFAULT;
1500 }
1501 } else
1502 memset(&ax->abs_timeout, 0, sizeof(ax->abs_timeout));
1503
1504 ax->mqdes = mqdes;
1505 ax->msg_len = msg_len;
1506
1507 ax->d.type = AUDIT_MQ_SENDRECV;
1508 ax->d.next = context->aux;
1509 context->aux = (void *)ax;
1510 return 0;
1511}
1512
1513/**
1514 * __audit_mq_notify - record audit data for a POSIX MQ notify
1515 * @mqdes: MQ descriptor
1516 * @u_notification: Notification event
1517 *
1518 * Returns 0 for success or NULL context or < 0 on error.
1519 */
1520
1521int __audit_mq_notify(mqd_t mqdes, const struct sigevent __user *u_notification)
1522{
1523 struct audit_aux_data_mq_notify *ax;
1524 struct audit_context *context = current->audit_context;
1525
1526 if (!audit_enabled)
1527 return 0;
1528
1529 if (likely(!context))
1530 return 0;
1531
1532 ax = kmalloc(sizeof(*ax), GFP_ATOMIC);
1533 if (!ax)
1534 return -ENOMEM;
1535
1536 if (u_notification != NULL) {
1537 if (copy_from_user(&ax->notification, u_notification, sizeof(ax->notification))) {
1538 kfree(ax);
1539 return -EFAULT;
1540 }
1541 } else
1542 memset(&ax->notification, 0, sizeof(ax->notification));
1543
1544 ax->mqdes = mqdes;
1545
1546 ax->d.type = AUDIT_MQ_NOTIFY;
1547 ax->d.next = context->aux;
1548 context->aux = (void *)ax;
1549 return 0;
1550}
1551
1552/**
1553 * __audit_mq_getsetattr - record audit data for a POSIX MQ get/set attribute
1554 * @mqdes: MQ descriptor
1555 * @mqstat: MQ flags
1556 *
1557 * Returns 0 for success or NULL context or < 0 on error.
1558 */
1559int __audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat)
1560{
1561 struct audit_aux_data_mq_getsetattr *ax;
1181 struct audit_context *context = current->audit_context; 1562 struct audit_context *context = current->audit_context;
1182 1563
1564 if (!audit_enabled)
1565 return 0;
1566
1183 if (likely(!context)) 1567 if (likely(!context))
1184 return 0; 1568 return 0;
1185 1569
@@ -1187,6 +1571,30 @@ int audit_ipc_obj(struct kern_ipc_perm *ipcp)
1187 if (!ax) 1571 if (!ax)
1188 return -ENOMEM; 1572 return -ENOMEM;
1189 1573
1574 ax->mqdes = mqdes;
1575 ax->mqstat = *mqstat;
1576
1577 ax->d.type = AUDIT_MQ_GETSETATTR;
1578 ax->d.next = context->aux;
1579 context->aux = (void *)ax;
1580 return 0;
1581}
1582
1583/**
1584 * audit_ipc_obj - record audit data for ipc object
1585 * @ipcp: ipc permissions
1586 *
1587 * Returns 0 for success or NULL context or < 0 on error.
1588 */
1589int __audit_ipc_obj(struct kern_ipc_perm *ipcp)
1590{
1591 struct audit_aux_data_ipcctl *ax;
1592 struct audit_context *context = current->audit_context;
1593
1594 ax = kmalloc(sizeof(*ax), GFP_ATOMIC);
1595 if (!ax)
1596 return -ENOMEM;
1597
1190 ax->uid = ipcp->uid; 1598 ax->uid = ipcp->uid;
1191 ax->gid = ipcp->gid; 1599 ax->gid = ipcp->gid;
1192 ax->mode = ipcp->mode; 1600 ax->mode = ipcp->mode;
@@ -1207,14 +1615,11 @@ int audit_ipc_obj(struct kern_ipc_perm *ipcp)
1207 * 1615 *
1208 * Returns 0 for success or NULL context or < 0 on error. 1616 * Returns 0 for success or NULL context or < 0 on error.
1209 */ 1617 */
1210int audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode, struct kern_ipc_perm *ipcp) 1618int __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode)
1211{ 1619{
1212 struct audit_aux_data_ipcctl *ax; 1620 struct audit_aux_data_ipcctl *ax;
1213 struct audit_context *context = current->audit_context; 1621 struct audit_context *context = current->audit_context;
1214 1622
1215 if (likely(!context))
1216 return 0;
1217
1218 ax = kmalloc(sizeof(*ax), GFP_ATOMIC); 1623 ax = kmalloc(sizeof(*ax), GFP_ATOMIC);
1219 if (!ax) 1624 if (!ax)
1220 return -ENOMEM; 1625 return -ENOMEM;
@@ -1223,7 +1628,6 @@ int audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode,
1223 ax->uid = uid; 1628 ax->uid = uid;
1224 ax->gid = gid; 1629 ax->gid = gid;
1225 ax->mode = mode; 1630 ax->mode = mode;
1226 selinux_get_ipc_sid(ipcp, &ax->osid);
1227 1631
1228 ax->d.type = AUDIT_IPC_SET_PERM; 1632 ax->d.type = AUDIT_IPC_SET_PERM;
1229 ax->d.next = context->aux; 1633 ax->d.next = context->aux;
@@ -1231,6 +1635,39 @@ int audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode,
1231 return 0; 1635 return 0;
1232} 1636}
1233 1637
1638int audit_bprm(struct linux_binprm *bprm)
1639{
1640 struct audit_aux_data_execve *ax;
1641 struct audit_context *context = current->audit_context;
1642 unsigned long p, next;
1643 void *to;
1644
1645 if (likely(!audit_enabled || !context))
1646 return 0;
1647
1648 ax = kmalloc(sizeof(*ax) + PAGE_SIZE * MAX_ARG_PAGES - bprm->p,
1649 GFP_KERNEL);
1650 if (!ax)
1651 return -ENOMEM;
1652
1653 ax->argc = bprm->argc;
1654 ax->envc = bprm->envc;
1655 for (p = bprm->p, to = ax->mem; p < MAX_ARG_PAGES*PAGE_SIZE; p = next) {
1656 struct page *page = bprm->page[p / PAGE_SIZE];
1657 void *kaddr = kmap(page);
1658 next = (p + PAGE_SIZE) & ~(PAGE_SIZE - 1);
1659 memcpy(to, kaddr + (p & (PAGE_SIZE - 1)), next - p);
1660 to += next - p;
1661 kunmap(page);
1662 }
1663
1664 ax->d.type = AUDIT_EXECVE;
1665 ax->d.next = context->aux;
1666 context->aux = (void *)ax;
1667 return 0;
1668}
1669
1670
1234/** 1671/**
1235 * audit_socketcall - record audit data for sys_socketcall 1672 * audit_socketcall - record audit data for sys_socketcall
1236 * @nargs: number of args 1673 * @nargs: number of args
@@ -1325,19 +1762,20 @@ int audit_avc_path(struct dentry *dentry, struct vfsmount *mnt)
1325 * If the audit subsystem is being terminated, record the task (pid) 1762 * If the audit subsystem is being terminated, record the task (pid)
1326 * and uid that is doing that. 1763 * and uid that is doing that.
1327 */ 1764 */
1328void audit_signal_info(int sig, struct task_struct *t) 1765void __audit_signal_info(int sig, struct task_struct *t)
1329{ 1766{
1330 extern pid_t audit_sig_pid; 1767 extern pid_t audit_sig_pid;
1331 extern uid_t audit_sig_uid; 1768 extern uid_t audit_sig_uid;
1332 1769 extern u32 audit_sig_sid;
1333 if (unlikely(audit_pid && t->tgid == audit_pid)) { 1770
1334 if (sig == SIGTERM || sig == SIGHUP) { 1771 if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1) {
1335 struct audit_context *ctx = current->audit_context; 1772 struct task_struct *tsk = current;
1336 audit_sig_pid = current->pid; 1773 struct audit_context *ctx = tsk->audit_context;
1337 if (ctx) 1774 audit_sig_pid = tsk->pid;
1338 audit_sig_uid = ctx->loginuid; 1775 if (ctx)
1339 else 1776 audit_sig_uid = ctx->loginuid;
1340 audit_sig_uid = current->uid; 1777 else
1341 } 1778 audit_sig_uid = tsk->uid;
1779 selinux_get_task_sid(tsk, &audit_sig_sid);
1342 } 1780 }
1343} 1781}
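The new __audit_mq_open(), __audit_mq_timedsend(), __audit_mq_timedreceive(), __audit_mq_notify() and __audit_mq_getsetattr() hooks above are invoked from the POSIX message-queue syscalls, so with auditing enabled the arguments of the corresponding libc calls end up in the audit log. A minimal userspace sequence that would touch each hook; the queue name and sizes are arbitrary, and glibc needs -lrt for the mq_* and clock_gettime wrappers.

#include <fcntl.h>
#include <mqueue.h>
#include <signal.h>
#include <stdio.h>
#include <time.h>

int main(void)
{
	struct mq_attr attr = { .mq_maxmsg = 8, .mq_msgsize = 64 };
	struct sigevent sev = { .sigev_notify = SIGEV_NONE };
	struct timespec ts;
	char buf[64];
	unsigned int prio;
	mqd_t q;

	/* mq_open() -> __audit_mq_open() records oflag, mode and attributes */
	q = mq_open("/audit-demo", O_CREAT | O_RDWR, 0600, &attr);
	if (q == (mqd_t)-1) {
		perror("mq_open");
		return 1;
	}

	clock_gettime(CLOCK_REALTIME, &ts);
	ts.tv_sec += 1;

	/* mq_timedsend()/mq_timedreceive() -> __audit_mq_timedsend()/__audit_mq_timedreceive() */
	mq_timedsend(q, "hello", 5, 3, &ts);
	mq_timedreceive(q, buf, sizeof(buf), &prio, &ts);

	/* mq_notify() -> __audit_mq_notify(), mq_getattr() -> __audit_mq_getsetattr() */
	mq_notify(q, &sev);
	mq_getattr(q, &attr);

	mq_close(q);
	mq_unlink("/audit-demo");
	return 0;
}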
diff --git a/kernel/compat.c b/kernel/compat.c
index c1601a84f8d8..126dee9530aa 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -21,6 +21,7 @@
21#include <linux/unistd.h> 21#include <linux/unistd.h>
22#include <linux/security.h> 22#include <linux/security.h>
23#include <linux/timex.h> 23#include <linux/timex.h>
24#include <linux/migrate.h>
24 25
25#include <asm/uaccess.h> 26#include <asm/uaccess.h>
26 27
@@ -729,17 +730,10 @@ void
729sigset_from_compat (sigset_t *set, compat_sigset_t *compat) 730sigset_from_compat (sigset_t *set, compat_sigset_t *compat)
730{ 731{
731 switch (_NSIG_WORDS) { 732 switch (_NSIG_WORDS) {
732#if defined (__COMPAT_ENDIAN_SWAP__)
733 case 4: set->sig[3] = compat->sig[7] | (((long)compat->sig[6]) << 32 );
734 case 3: set->sig[2] = compat->sig[5] | (((long)compat->sig[4]) << 32 );
735 case 2: set->sig[1] = compat->sig[3] | (((long)compat->sig[2]) << 32 );
736 case 1: set->sig[0] = compat->sig[1] | (((long)compat->sig[0]) << 32 );
737#else
738 case 4: set->sig[3] = compat->sig[6] | (((long)compat->sig[7]) << 32 ); 733 case 4: set->sig[3] = compat->sig[6] | (((long)compat->sig[7]) << 32 );
739 case 3: set->sig[2] = compat->sig[4] | (((long)compat->sig[5]) << 32 ); 734 case 3: set->sig[2] = compat->sig[4] | (((long)compat->sig[5]) << 32 );
740 case 2: set->sig[1] = compat->sig[2] | (((long)compat->sig[3]) << 32 ); 735 case 2: set->sig[1] = compat->sig[2] | (((long)compat->sig[3]) << 32 );
741 case 1: set->sig[0] = compat->sig[0] | (((long)compat->sig[1]) << 32 ); 736 case 1: set->sig[0] = compat->sig[0] | (((long)compat->sig[1]) << 32 );
742#endif
743 } 737 }
744} 738}
745 739
@@ -934,3 +928,25 @@ asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp)
934 928
935 return ret; 929 return ret;
936} 930}
931
932#ifdef CONFIG_NUMA
933asmlinkage long compat_sys_move_pages(pid_t pid, unsigned long nr_pages,
934 compat_uptr_t __user *pages32,
935 const int __user *nodes,
936 int __user *status,
937 int flags)
938{
939 const void __user * __user *pages;
940 int i;
941
942 pages = compat_alloc_user_space(nr_pages * sizeof(void *));
943 for (i = 0; i < nr_pages; i++) {
944 compat_uptr_t p;
945
946 if (get_user(p, pages32 + i) ||
947 put_user(compat_ptr(p), pages + i))
948 return -EFAULT;
949 }
950 return sys_move_pages(pid, nr_pages, pages, nodes, status, flags);
951}
952#endif
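compat_sys_move_pages() above only widens the 32-bit page-pointer array with compat_alloc_user_space() before handing off to sys_move_pages(); the semantics are those of the native call. A small sketch of the corresponding userspace call through libnuma, assuming <numaif.h> and -lnuma are available and that node 0 exists on the machine.

#include <numaif.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int main(void)
{
	long page_size = sysconf(_SC_PAGESIZE);
	void *page;
	void *pages[1];
	int nodes[1] = { 0 };		/* target node; adjust for the machine */
	int status[1];

	if (posix_memalign(&page, page_size, page_size))
		return 1;
	((char *)page)[0] = 1;		/* touch it so there is a page to migrate */

	pages[0] = page;
	if (move_pages(0 /* current process */, 1, pages, nodes, status,
		       MPOL_MF_MOVE) == 0)
		printf("page now on node %d\n", status[0]);
	else
		perror("move_pages");

	free(page);
	return 0;
}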
diff --git a/kernel/configs.c b/kernel/configs.c
index 009e1ebdcb88..f9e31974f4ad 100644
--- a/kernel/configs.c
+++ b/kernel/configs.c
@@ -23,7 +23,6 @@
23 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 23 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24 */ 24 */
25 25
26#include <linux/config.h>
27#include <linux/kernel.h> 26#include <linux/kernel.h>
28#include <linux/module.h> 27#include <linux/module.h>
29#include <linux/proc_fs.h> 28#include <linux/proc_fs.h>
diff --git a/kernel/cpu.c b/kernel/cpu.c
index fe2b8d0bfe4c..70fbf2e83766 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -13,12 +13,12 @@
13#include <linux/module.h> 13#include <linux/module.h>
14#include <linux/kthread.h> 14#include <linux/kthread.h>
15#include <linux/stop_machine.h> 15#include <linux/stop_machine.h>
16#include <asm/semaphore.h> 16#include <linux/mutex.h>
17 17
18/* This protects CPUs going up and down... */ 18/* This protects CPUs going up and down... */
19static DECLARE_MUTEX(cpucontrol); 19static DEFINE_MUTEX(cpucontrol);
20 20
21static BLOCKING_NOTIFIER_HEAD(cpu_chain); 21static __cpuinitdata BLOCKING_NOTIFIER_HEAD(cpu_chain);
22 22
23#ifdef CONFIG_HOTPLUG_CPU 23#ifdef CONFIG_HOTPLUG_CPU
24static struct task_struct *lock_cpu_hotplug_owner; 24static struct task_struct *lock_cpu_hotplug_owner;
@@ -30,9 +30,9 @@ static int __lock_cpu_hotplug(int interruptible)
30 30
31 if (lock_cpu_hotplug_owner != current) { 31 if (lock_cpu_hotplug_owner != current) {
32 if (interruptible) 32 if (interruptible)
33 ret = down_interruptible(&cpucontrol); 33 ret = mutex_lock_interruptible(&cpucontrol);
34 else 34 else
35 down(&cpucontrol); 35 mutex_lock(&cpucontrol);
36 } 36 }
37 37
38 /* 38 /*
@@ -56,7 +56,7 @@ void unlock_cpu_hotplug(void)
56{ 56{
57 if (--lock_cpu_hotplug_depth == 0) { 57 if (--lock_cpu_hotplug_depth == 0) {
58 lock_cpu_hotplug_owner = NULL; 58 lock_cpu_hotplug_owner = NULL;
59 up(&cpucontrol); 59 mutex_unlock(&cpucontrol);
60 } 60 }
61} 61}
62EXPORT_SYMBOL_GPL(unlock_cpu_hotplug); 62EXPORT_SYMBOL_GPL(unlock_cpu_hotplug);
@@ -69,10 +69,13 @@ EXPORT_SYMBOL_GPL(lock_cpu_hotplug_interruptible);
69#endif /* CONFIG_HOTPLUG_CPU */ 69#endif /* CONFIG_HOTPLUG_CPU */
70 70
71/* Need to know about CPUs going up/down? */ 71/* Need to know about CPUs going up/down? */
72int register_cpu_notifier(struct notifier_block *nb) 72int __cpuinit register_cpu_notifier(struct notifier_block *nb)
73{ 73{
74 return blocking_notifier_chain_register(&cpu_chain, nb); 74 return blocking_notifier_chain_register(&cpu_chain, nb);
75} 75}
76
77#ifdef CONFIG_HOTPLUG_CPU
78
76EXPORT_SYMBOL(register_cpu_notifier); 79EXPORT_SYMBOL(register_cpu_notifier);
77 80
78void unregister_cpu_notifier(struct notifier_block *nb) 81void unregister_cpu_notifier(struct notifier_block *nb)
@@ -81,7 +84,6 @@ void unregister_cpu_notifier(struct notifier_block *nb)
81} 84}
82EXPORT_SYMBOL(unregister_cpu_notifier); 85EXPORT_SYMBOL(unregister_cpu_notifier);
83 86
84#ifdef CONFIG_HOTPLUG_CPU
85static inline void check_for_tasks(int cpu) 87static inline void check_for_tasks(int cpu)
86{ 88{
87 struct task_struct *p; 89 struct task_struct *p;
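register_cpu_notifier() keeps its behaviour here; the hunks above only tag it __cpuinit, switch the hotplug lock to a mutex, and move the exports under CONFIG_HOTPLUG_CPU. A module-style sketch of a notifier registered on this chain, assuming CONFIG_HOTPLUG_CPU so that both the register and unregister exports are available; the callback body is illustrative.

#include <linux/cpu.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/notifier.h>

static int demo_cpu_callback(struct notifier_block *nb,
			     unsigned long action, void *hcpu)
{
	unsigned int cpu = (unsigned long)hcpu;

	switch (action) {
	case CPU_ONLINE:
		printk(KERN_INFO "demo: cpu %u is now online\n", cpu);
		break;
	case CPU_DEAD:
		printk(KERN_INFO "demo: cpu %u went offline\n", cpu);
		break;
	}
	return NOTIFY_OK;
}

static struct notifier_block demo_cpu_notifier = {
	.notifier_call = demo_cpu_callback,
};

static int __init demo_init(void)
{
	return register_cpu_notifier(&demo_cpu_notifier);
}

static void __exit demo_exit(void)
{
	unregister_cpu_notifier(&demo_cpu_notifier);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");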
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index ab81fdd4572b..c232dc077438 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -18,7 +18,6 @@
18 * distribution for more details. 18 * distribution for more details.
19 */ 19 */
20 20
21#include <linux/config.h>
22#include <linux/cpu.h> 21#include <linux/cpu.h>
23#include <linux/cpumask.h> 22#include <linux/cpumask.h>
24#include <linux/cpuset.h> 23#include <linux/cpuset.h>
@@ -41,6 +40,7 @@
41#include <linux/rcupdate.h> 40#include <linux/rcupdate.h>
42#include <linux/sched.h> 41#include <linux/sched.h>
43#include <linux/seq_file.h> 42#include <linux/seq_file.h>
43#include <linux/security.h>
44#include <linux/slab.h> 44#include <linux/slab.h>
45#include <linux/smp_lock.h> 45#include <linux/smp_lock.h>
46#include <linux/spinlock.h> 46#include <linux/spinlock.h>
@@ -392,11 +392,11 @@ static int cpuset_fill_super(struct super_block *sb, void *unused_data,
392 return 0; 392 return 0;
393} 393}
394 394
395static struct super_block *cpuset_get_sb(struct file_system_type *fs_type, 395static int cpuset_get_sb(struct file_system_type *fs_type,
396 int flags, const char *unused_dev_name, 396 int flags, const char *unused_dev_name,
397 void *data) 397 void *data, struct vfsmount *mnt)
398{ 398{
399 return get_sb_single(fs_type, flags, data, cpuset_fill_super); 399 return get_sb_single(fs_type, flags, data, cpuset_fill_super, mnt);
400} 400}
401 401
402static struct file_system_type cpuset_fs_type = { 402static struct file_system_type cpuset_fs_type = {
@@ -1063,7 +1063,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf)
1063} 1063}
1064 1064
1065/* 1065/*
1066 * Frequency meter - How fast is some event occuring? 1066 * Frequency meter - How fast is some event occurring?
1067 * 1067 *
1068 * These routines manage a digitally filtered, constant time based, 1068 * These routines manage a digitally filtered, constant time based,
1069 * event frequency meter. There are four routines: 1069 * event frequency meter. There are four routines:
@@ -1177,6 +1177,7 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
1177 cpumask_t cpus; 1177 cpumask_t cpus;
1178 nodemask_t from, to; 1178 nodemask_t from, to;
1179 struct mm_struct *mm; 1179 struct mm_struct *mm;
1180 int retval;
1180 1181
1181 if (sscanf(pidbuf, "%d", &pid) != 1) 1182 if (sscanf(pidbuf, "%d", &pid) != 1)
1182 return -EIO; 1183 return -EIO;
@@ -1205,6 +1206,12 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
1205 get_task_struct(tsk); 1206 get_task_struct(tsk);
1206 } 1207 }
1207 1208
1209 retval = security_task_setscheduler(tsk, 0, NULL);
1210 if (retval) {
1211 put_task_struct(tsk);
1212 return retval;
1213 }
1214
1208 mutex_lock(&callback_mutex); 1215 mutex_lock(&callback_mutex);
1209 1216
1210 task_lock(tsk); 1217 task_lock(tsk);
@@ -2434,31 +2441,43 @@ void __cpuset_memory_pressure_bump(void)
2434 */ 2441 */
2435static int proc_cpuset_show(struct seq_file *m, void *v) 2442static int proc_cpuset_show(struct seq_file *m, void *v)
2436{ 2443{
2444 struct pid *pid;
2437 struct task_struct *tsk; 2445 struct task_struct *tsk;
2438 char *buf; 2446 char *buf;
2439 int retval = 0; 2447 int retval;
2440 2448
2449 retval = -ENOMEM;
2441 buf = kmalloc(PAGE_SIZE, GFP_KERNEL); 2450 buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
2442 if (!buf) 2451 if (!buf)
2443 return -ENOMEM; 2452 goto out;
2444 2453
2445 tsk = m->private; 2454 retval = -ESRCH;
2455 pid = m->private;
2456 tsk = get_pid_task(pid, PIDTYPE_PID);
2457 if (!tsk)
2458 goto out_free;
2459
2460 retval = -EINVAL;
2446 mutex_lock(&manage_mutex); 2461 mutex_lock(&manage_mutex);
2462
2447 retval = cpuset_path(tsk->cpuset, buf, PAGE_SIZE); 2463 retval = cpuset_path(tsk->cpuset, buf, PAGE_SIZE);
2448 if (retval < 0) 2464 if (retval < 0)
2449 goto out; 2465 goto out_unlock;
2450 seq_puts(m, buf); 2466 seq_puts(m, buf);
2451 seq_putc(m, '\n'); 2467 seq_putc(m, '\n');
2452out: 2468out_unlock:
2453 mutex_unlock(&manage_mutex); 2469 mutex_unlock(&manage_mutex);
2470 put_task_struct(tsk);
2471out_free:
2454 kfree(buf); 2472 kfree(buf);
2473out:
2455 return retval; 2474 return retval;
2456} 2475}
2457 2476
2458static int cpuset_open(struct inode *inode, struct file *file) 2477static int cpuset_open(struct inode *inode, struct file *file)
2459{ 2478{
2460 struct task_struct *tsk = PROC_I(inode)->task; 2479 struct pid *pid = PROC_I(inode)->pid;
2461 return single_open(file, proc_cpuset_show, tsk); 2480 return single_open(file, proc_cpuset_show, pid);
2462} 2481}
2463 2482
2464struct file_operations proc_cpuset_operations = { 2483struct file_operations proc_cpuset_operations = {
diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c
index c01cead2cfd6..3c2eaea66b1e 100644
--- a/kernel/exec_domain.c
+++ b/kernel/exec_domain.c
@@ -7,7 +7,6 @@
7 * 2001-05-06 Complete rewrite, Christoph Hellwig (hch@infradead.org) 7 * 2001-05-06 Complete rewrite, Christoph Hellwig (hch@infradead.org)
8 */ 8 */
9 9
10#include <linux/config.h>
11#include <linux/init.h> 10#include <linux/init.h>
12#include <linux/kernel.h> 11#include <linux/kernel.h>
13#include <linux/kmod.h> 12#include <linux/kmod.h>
diff --git a/kernel/exit.c b/kernel/exit.c
index e06d0c10a24e..7f7ef2258553 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -4,7 +4,6 @@
4 * Copyright (C) 1991, 1992 Linus Torvalds 4 * Copyright (C) 1991, 1992 Linus Torvalds
5 */ 5 */
6 6
7#include <linux/config.h>
8#include <linux/mm.h> 7#include <linux/mm.h>
9#include <linux/slab.h> 8#include <linux/slab.h>
10#include <linux/interrupt.h> 9#include <linux/interrupt.h>
@@ -36,6 +35,7 @@
36#include <linux/compat.h> 35#include <linux/compat.h>
37#include <linux/pipe_fs_i.h> 36#include <linux/pipe_fs_i.h>
38#include <linux/audit.h> /* for audit_free() */ 37#include <linux/audit.h> /* for audit_free() */
38#include <linux/resource.h>
39 39
40#include <asm/uaccess.h> 40#include <asm/uaccess.h>
41#include <asm/unistd.h> 41#include <asm/unistd.h>
@@ -45,8 +45,6 @@
45extern void sem_exit (void); 45extern void sem_exit (void);
46extern struct task_struct *child_reaper; 46extern struct task_struct *child_reaper;
47 47
48int getrusage(struct task_struct *, int, struct rusage __user *);
49
50static void exit_mm(struct task_struct * tsk); 48static void exit_mm(struct task_struct * tsk);
51 49
52static void __unhash_process(struct task_struct *p) 50static void __unhash_process(struct task_struct *p)
@@ -138,12 +136,8 @@ void release_task(struct task_struct * p)
138{ 136{
139 int zap_leader; 137 int zap_leader;
140 task_t *leader; 138 task_t *leader;
141 struct dentry *proc_dentry;
142
143repeat: 139repeat:
144 atomic_dec(&p->user->processes); 140 atomic_dec(&p->user->processes);
145 spin_lock(&p->proc_lock);
146 proc_dentry = proc_pid_unhash(p);
147 write_lock_irq(&tasklist_lock); 141 write_lock_irq(&tasklist_lock);
148 ptrace_unlink(p); 142 ptrace_unlink(p);
149 BUG_ON(!list_empty(&p->ptrace_list) || !list_empty(&p->ptrace_children)); 143 BUG_ON(!list_empty(&p->ptrace_list) || !list_empty(&p->ptrace_children));
@@ -172,8 +166,7 @@ repeat:
172 166
173 sched_exit(p); 167 sched_exit(p);
174 write_unlock_irq(&tasklist_lock); 168 write_unlock_irq(&tasklist_lock);
175 spin_unlock(&p->proc_lock); 169 proc_flush_task(p);
176 proc_pid_flush(proc_dentry);
177 release_thread(p); 170 release_thread(p);
178 call_rcu(&p->rcu, delayed_put_task_struct); 171 call_rcu(&p->rcu, delayed_put_task_struct);
179 172
@@ -579,7 +572,7 @@ static void exit_mm(struct task_struct * tsk)
579 down_read(&mm->mmap_sem); 572 down_read(&mm->mmap_sem);
580 } 573 }
581 atomic_inc(&mm->mm_count); 574 atomic_inc(&mm->mm_count);
582 if (mm != tsk->active_mm) BUG(); 575 BUG_ON(mm != tsk->active_mm);
583 /* more a memory barrier than a real lock */ 576 /* more a memory barrier than a real lock */
584 task_lock(tsk); 577 task_lock(tsk);
585 tsk->mm = NULL; 578 tsk->mm = NULL;
@@ -895,11 +888,11 @@ fastcall NORET_TYPE void do_exit(long code)
895 if (group_dead) { 888 if (group_dead) {
896 hrtimer_cancel(&tsk->signal->real_timer); 889 hrtimer_cancel(&tsk->signal->real_timer);
897 exit_itimers(tsk->signal); 890 exit_itimers(tsk->signal);
898 acct_process(code);
899 } 891 }
892 acct_collect(code, group_dead);
900 if (unlikely(tsk->robust_list)) 893 if (unlikely(tsk->robust_list))
901 exit_robust_list(tsk); 894 exit_robust_list(tsk);
902#ifdef CONFIG_COMPAT 895#if defined(CONFIG_FUTEX) && defined(CONFIG_COMPAT)
903 if (unlikely(tsk->compat_robust_list)) 896 if (unlikely(tsk->compat_robust_list))
904 compat_exit_robust_list(tsk); 897 compat_exit_robust_list(tsk);
905#endif 898#endif
@@ -907,6 +900,8 @@ fastcall NORET_TYPE void do_exit(long code)
907 audit_free(tsk); 900 audit_free(tsk);
908 exit_mm(tsk); 901 exit_mm(tsk);
909 902
903 if (group_dead)
904 acct_process();
910 exit_sem(tsk); 905 exit_sem(tsk);
911 __exit_files(tsk); 906 __exit_files(tsk);
912 __exit_fs(tsk); 907 __exit_fs(tsk);
@@ -930,9 +925,18 @@ fastcall NORET_TYPE void do_exit(long code)
930 tsk->mempolicy = NULL; 925 tsk->mempolicy = NULL;
931#endif 926#endif
932 /* 927 /*
928 * This must happen late, after the PID is not
929 * hashed anymore:
930 */
931 if (unlikely(!list_empty(&tsk->pi_state_list)))
932 exit_pi_state_list(tsk);
933 if (unlikely(current->pi_state_cache))
934 kfree(current->pi_state_cache);
935 /*
933 * If DEBUG_MUTEXES is on, make sure we are holding no locks: 936 * If DEBUG_MUTEXES is on, make sure we are holding no locks:
934 */ 937 */
935 mutex_debug_check_no_locks_held(tsk); 938 mutex_debug_check_no_locks_held(tsk);
939 rt_mutex_debug_check_no_locks_held(tsk);
936 940
937 if (tsk->io_context) 941 if (tsk->io_context)
938 exit_io_context(); 942 exit_io_context();
@@ -1530,8 +1534,7 @@ check_continued:
1530 if (options & __WNOTHREAD) 1534 if (options & __WNOTHREAD)
1531 break; 1535 break;
1532 tsk = next_thread(tsk); 1536 tsk = next_thread(tsk);
1533 if (tsk->signal != current->signal) 1537 BUG_ON(tsk->signal != current->signal);
1534 BUG();
1535 } while (tsk != current); 1538 } while (tsk != current);
1536 1539
1537 read_unlock(&tasklist_lock); 1540 read_unlock(&tasklist_lock);
diff --git a/kernel/fork.c b/kernel/fork.c
index ac8100e3088a..9064bf9e131b 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -11,7 +11,6 @@
11 * management can be a bitch. See 'mm/memory.c': 'copy_page_range()' 11 * management can be a bitch. See 'mm/memory.c': 'copy_page_range()'
12 */ 12 */
13 13
14#include <linux/config.h>
15#include <linux/slab.h> 14#include <linux/slab.h>
16#include <linux/init.h> 15#include <linux/init.h>
17#include <linux/unistd.h> 16#include <linux/unistd.h>
@@ -104,6 +103,7 @@ static kmem_cache_t *mm_cachep;
104void free_task(struct task_struct *tsk) 103void free_task(struct task_struct *tsk)
105{ 104{
106 free_thread_info(tsk->thread_info); 105 free_thread_info(tsk->thread_info);
106 rt_mutex_debug_task_free(tsk);
107 free_task_struct(tsk); 107 free_task_struct(tsk);
108} 108}
109EXPORT_SYMBOL(free_task); 109EXPORT_SYMBOL(free_task);
@@ -368,6 +368,8 @@ void fastcall __mmdrop(struct mm_struct *mm)
368 */ 368 */
369void mmput(struct mm_struct *mm) 369void mmput(struct mm_struct *mm)
370{ 370{
371 might_sleep();
372
371 if (atomic_dec_and_test(&mm->mm_users)) { 373 if (atomic_dec_and_test(&mm->mm_users)) {
372 exit_aio(mm); 374 exit_aio(mm);
373 exit_mmap(mm); 375 exit_mmap(mm);
@@ -623,6 +625,7 @@ out:
623/* 625/*
624 * Allocate a new files structure and copy contents from the 626 * Allocate a new files structure and copy contents from the
625 * passed in files structure. 627 * passed in files structure.
628 * errorp will be valid only when the returned files_struct is NULL.
626 */ 629 */
627static struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) 630static struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
628{ 631{
@@ -631,6 +634,7 @@ static struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
631 int open_files, size, i, expand; 634 int open_files, size, i, expand;
632 struct fdtable *old_fdt, *new_fdt; 635 struct fdtable *old_fdt, *new_fdt;
633 636
637 *errorp = -ENOMEM;
634 newf = alloc_files(); 638 newf = alloc_files();
635 if (!newf) 639 if (!newf)
636 goto out; 640 goto out;
@@ -744,7 +748,6 @@ static int copy_files(unsigned long clone_flags, struct task_struct * tsk)
744 * break this. 748 * break this.
745 */ 749 */
746 tsk->files = NULL; 750 tsk->files = NULL;
747 error = -ENOMEM;
748 newf = dup_fd(oldf, &error); 751 newf = dup_fd(oldf, &error);
749 if (!newf) 752 if (!newf)
750 goto out; 753 goto out;
@@ -871,6 +874,7 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts
871 tsk->it_prof_expires = 874 tsk->it_prof_expires =
872 secs_to_cputime(sig->rlim[RLIMIT_CPU].rlim_cur); 875 secs_to_cputime(sig->rlim[RLIMIT_CPU].rlim_cur);
873 } 876 }
877 acct_init_pacct(&sig->pacct);
874 878
875 return 0; 879 return 0;
876} 880}
@@ -909,6 +913,19 @@ asmlinkage long sys_set_tid_address(int __user *tidptr)
909 return current->pid; 913 return current->pid;
910} 914}
911 915
916static inline void rt_mutex_init_task(struct task_struct *p)
917{
918#ifdef CONFIG_RT_MUTEXES
919 spin_lock_init(&p->pi_lock);
920 plist_head_init(&p->pi_waiters, &p->pi_lock);
921 p->pi_blocked_on = NULL;
922# ifdef CONFIG_DEBUG_RT_MUTEXES
923 spin_lock_init(&p->held_list_lock);
924 INIT_LIST_HEAD(&p->held_list_head);
925# endif
926#endif
927}
928
912/* 929/*
913 * This creates a new process as a copy of the old one, 930 * This creates a new process as a copy of the old one,
914 * but does not actually start it yet. 931 * but does not actually start it yet.
@@ -989,13 +1006,10 @@ static task_t *copy_process(unsigned long clone_flags,
989 if (put_user(p->pid, parent_tidptr)) 1006 if (put_user(p->pid, parent_tidptr))
990 goto bad_fork_cleanup; 1007 goto bad_fork_cleanup;
991 1008
992 p->proc_dentry = NULL;
993
994 INIT_LIST_HEAD(&p->children); 1009 INIT_LIST_HEAD(&p->children);
995 INIT_LIST_HEAD(&p->sibling); 1010 INIT_LIST_HEAD(&p->sibling);
996 p->vfork_done = NULL; 1011 p->vfork_done = NULL;
997 spin_lock_init(&p->alloc_lock); 1012 spin_lock_init(&p->alloc_lock);
998 spin_lock_init(&p->proc_lock);
999 1013
1000 clear_tsk_thread_flag(p, TIF_SIGPENDING); 1014 clear_tsk_thread_flag(p, TIF_SIGPENDING);
1001 init_sigpending(&p->pending); 1015 init_sigpending(&p->pending);
@@ -1033,6 +1047,8 @@ static task_t *copy_process(unsigned long clone_flags,
1033 mpol_fix_fork_child_flag(p); 1047 mpol_fix_fork_child_flag(p);
1034#endif 1048#endif
1035 1049
1050 rt_mutex_init_task(p);
1051
1036#ifdef CONFIG_DEBUG_MUTEXES 1052#ifdef CONFIG_DEBUG_MUTEXES
1037 p->blocked_on = NULL; /* not blocked yet */ 1053 p->blocked_on = NULL; /* not blocked yet */
1038#endif 1054#endif
@@ -1075,6 +1091,9 @@ static task_t *copy_process(unsigned long clone_flags,
1075#ifdef CONFIG_COMPAT 1091#ifdef CONFIG_COMPAT
1076 p->compat_robust_list = NULL; 1092 p->compat_robust_list = NULL;
1077#endif 1093#endif
1094 INIT_LIST_HEAD(&p->pi_state_list);
1095 p->pi_state_cache = NULL;
1096
1078 /* 1097 /*
1079 * sigaltstack should be cleared when sharing the same VM 1098 * sigaltstack should be cleared when sharing the same VM
1080 */ 1099 */
@@ -1155,18 +1174,6 @@ static task_t *copy_process(unsigned long clone_flags,
1155 } 1174 }
1156 1175
1157 if (clone_flags & CLONE_THREAD) { 1176 if (clone_flags & CLONE_THREAD) {
1158 /*
1159 * Important: if an exit-all has been started then
1160 * do not create this new thread - the whole thread
1161 * group is supposed to exit anyway.
1162 */
1163 if (current->signal->flags & SIGNAL_GROUP_EXIT) {
1164 spin_unlock(&current->sighand->siglock);
1165 write_unlock_irq(&tasklist_lock);
1166 retval = -EAGAIN;
1167 goto bad_fork_cleanup_namespace;
1168 }
1169
1170 p->group_leader = current->group_leader; 1177 p->group_leader = current->group_leader;
1171 list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group); 1178 list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group);
1172 1179
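rt_mutex_init_task() above seeds each new task's PI bookkeeping: pi_waiters is a priority-sorted list protected by pi_lock. A kernel-style sketch of the plist API that list is built on, assuming the plist helpers that ship with the rt-mutex code (plist_head_init(), plist_node_init(), plist_add(), plist_first(), plist_del()); the waiter structure is made up for illustration.

#include <linux/plist.h>
#include <linux/spinlock.h>

/* Hypothetical waiter record, priority-sorted the way rt_mutex waiters are. */
struct demo_waiter {
	struct plist_node node;
	int id;
};

static DEFINE_SPINLOCK(demo_lock);
static struct plist_head demo_waiters;

static void demo_plist_usage(void)
{
	static struct demo_waiter a, b;
	struct plist_node *first;

	plist_head_init(&demo_waiters, &demo_lock);

	plist_node_init(&a.node, 10);	/* lower value == higher priority */
	plist_node_init(&b.node, 5);
	a.id = 1;
	b.id = 2;

	spin_lock(&demo_lock);
	plist_add(&a.node, &demo_waiters);
	plist_add(&b.node, &demo_waiters);

	first = plist_first(&demo_waiters);	/* b: prio 5 sorts ahead of 10 */
	plist_del(first, &demo_waiters);
	spin_unlock(&demo_lock);
}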
diff --git a/kernel/futex.c b/kernel/futex.c
index 5699c512057b..15caf93e4a43 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -12,6 +12,10 @@
12 * (C) Copyright 2006 Red Hat Inc, All Rights Reserved 12 * (C) Copyright 2006 Red Hat Inc, All Rights Reserved
13 * Thanks to Thomas Gleixner for suggestions, analysis and fixes. 13 * Thanks to Thomas Gleixner for suggestions, analysis and fixes.
14 * 14 *
15 * PI-futex support started by Ingo Molnar and Thomas Gleixner
16 * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
17 * Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
18 *
15 * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly 19 * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
16 * enough at me, Linus for the original (flawed) idea, Matthew 20 * enough at me, Linus for the original (flawed) idea, Matthew
17 * Kirkwood for proof-of-concept implementation. 21 * Kirkwood for proof-of-concept implementation.
@@ -46,6 +50,8 @@
46#include <linux/signal.h> 50#include <linux/signal.h>
47#include <asm/futex.h> 51#include <asm/futex.h>
48 52
53#include "rtmutex_common.h"
54
49#define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8) 55#define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8)
50 56
51/* 57/*
@@ -63,7 +69,7 @@ union futex_key {
63 int offset; 69 int offset;
64 } shared; 70 } shared;
65 struct { 71 struct {
66 unsigned long uaddr; 72 unsigned long address;
67 struct mm_struct *mm; 73 struct mm_struct *mm;
68 int offset; 74 int offset;
69 } private; 75 } private;
@@ -75,6 +81,27 @@ union futex_key {
75}; 81};
76 82
77/* 83/*
84 * Priority Inheritance state:
85 */
86struct futex_pi_state {
87 /*
88 * list of 'owned' pi_state instances - these have to be
89 * cleaned up in do_exit() if the task exits prematurely:
90 */
91 struct list_head list;
92
93 /*
94 * The PI object:
95 */
96 struct rt_mutex pi_mutex;
97
98 struct task_struct *owner;
99 atomic_t refcount;
100
101 union futex_key key;
102};
103
104/*
78 * We use this hashed waitqueue instead of a normal wait_queue_t, so 105 * We use this hashed waitqueue instead of a normal wait_queue_t, so
79 * we can wake only the relevant ones (hashed queues may be shared). 106 * we can wake only the relevant ones (hashed queues may be shared).
80 * 107 *
@@ -87,15 +114,19 @@ struct futex_q {
87 struct list_head list; 114 struct list_head list;
88 wait_queue_head_t waiters; 115 wait_queue_head_t waiters;
89 116
90 /* Which hash list lock to use. */ 117 /* Which hash list lock to use: */
91 spinlock_t *lock_ptr; 118 spinlock_t *lock_ptr;
92 119
93 /* Key which the futex is hashed on. */ 120 /* Key which the futex is hashed on: */
94 union futex_key key; 121 union futex_key key;
95 122
96 /* For fd, sigio sent using these. */ 123 /* For fd, sigio sent using these: */
97 int fd; 124 int fd;
98 struct file *filp; 125 struct file *filp;
126
127 /* Optional priority inheritance state: */
128 struct futex_pi_state *pi_state;
129 struct task_struct *task;
99}; 130};
100 131
101/* 132/*
@@ -144,8 +175,9 @@ static inline int match_futex(union futex_key *key1, union futex_key *key2)
144 * 175 *
145 * Should be called with &current->mm->mmap_sem but NOT any spinlocks. 176 * Should be called with &current->mm->mmap_sem but NOT any spinlocks.
146 */ 177 */
147static int get_futex_key(unsigned long uaddr, union futex_key *key) 178static int get_futex_key(u32 __user *uaddr, union futex_key *key)
148{ 179{
180 unsigned long address = (unsigned long)uaddr;
149 struct mm_struct *mm = current->mm; 181 struct mm_struct *mm = current->mm;
150 struct vm_area_struct *vma; 182 struct vm_area_struct *vma;
151 struct page *page; 183 struct page *page;
@@ -154,16 +186,16 @@ static int get_futex_key(unsigned long uaddr, union futex_key *key)
154 /* 186 /*
155 * The futex address must be "naturally" aligned. 187 * The futex address must be "naturally" aligned.
156 */ 188 */
157 key->both.offset = uaddr % PAGE_SIZE; 189 key->both.offset = address % PAGE_SIZE;
158 if (unlikely((key->both.offset % sizeof(u32)) != 0)) 190 if (unlikely((key->both.offset % sizeof(u32)) != 0))
159 return -EINVAL; 191 return -EINVAL;
160 uaddr -= key->both.offset; 192 address -= key->both.offset;
161 193
162 /* 194 /*
163 * The futex is hashed differently depending on whether 195 * The futex is hashed differently depending on whether
164 * it's in a shared or private mapping. So check vma first. 196 * it's in a shared or private mapping. So check vma first.
165 */ 197 */
166 vma = find_extend_vma(mm, uaddr); 198 vma = find_extend_vma(mm, address);
167 if (unlikely(!vma)) 199 if (unlikely(!vma))
168 return -EFAULT; 200 return -EFAULT;
169 201
@@ -184,7 +216,7 @@ static int get_futex_key(unsigned long uaddr, union futex_key *key)
184 */ 216 */
185 if (likely(!(vma->vm_flags & VM_MAYSHARE))) { 217 if (likely(!(vma->vm_flags & VM_MAYSHARE))) {
186 key->private.mm = mm; 218 key->private.mm = mm;
187 key->private.uaddr = uaddr; 219 key->private.address = address;
188 return 0; 220 return 0;
189 } 221 }
190 222
@@ -194,7 +226,7 @@ static int get_futex_key(unsigned long uaddr, union futex_key *key)
194 key->shared.inode = vma->vm_file->f_dentry->d_inode; 226 key->shared.inode = vma->vm_file->f_dentry->d_inode;
195 key->both.offset++; /* Bit 0 of offset indicates inode-based key. */ 227 key->both.offset++; /* Bit 0 of offset indicates inode-based key. */
196 if (likely(!(vma->vm_flags & VM_NONLINEAR))) { 228 if (likely(!(vma->vm_flags & VM_NONLINEAR))) {
197 key->shared.pgoff = (((uaddr - vma->vm_start) >> PAGE_SHIFT) 229 key->shared.pgoff = (((address - vma->vm_start) >> PAGE_SHIFT)
198 + vma->vm_pgoff); 230 + vma->vm_pgoff);
199 return 0; 231 return 0;
200 } 232 }
@@ -205,7 +237,7 @@ static int get_futex_key(unsigned long uaddr, union futex_key *key)
205 * from swap. But that's a lot of code to duplicate here 237 * from swap. But that's a lot of code to duplicate here
206 * for a rare case, so we simply fetch the page. 238 * for a rare case, so we simply fetch the page.
207 */ 239 */
208 err = get_user_pages(current, mm, uaddr, 1, 0, 0, &page, NULL); 240 err = get_user_pages(current, mm, address, 1, 0, 0, &page, NULL);
209 if (err >= 0) { 241 if (err >= 0) {
210 key->shared.pgoff = 242 key->shared.pgoff =
211 page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); 243 page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
@@ -246,18 +278,244 @@ static void drop_key_refs(union futex_key *key)
246 } 278 }
247} 279}
248 280
249static inline int get_futex_value_locked(int *dest, int __user *from) 281static inline int get_futex_value_locked(u32 *dest, u32 __user *from)
250{ 282{
251 int ret; 283 int ret;
252 284
253 inc_preempt_count(); 285 inc_preempt_count();
254 ret = __copy_from_user_inatomic(dest, from, sizeof(int)); 286 ret = __copy_from_user_inatomic(dest, from, sizeof(u32));
255 dec_preempt_count(); 287 dec_preempt_count();
256 288
257 return ret ? -EFAULT : 0; 289 return ret ? -EFAULT : 0;
258} 290}
259 291
260/* 292/*
293 * Fault handling. Called with current->mm->mmap_sem held.
294 */
295static int futex_handle_fault(unsigned long address, int attempt)
296{
297 struct vm_area_struct * vma;
298 struct mm_struct *mm = current->mm;
299
300 if (attempt >= 2 || !(vma = find_vma(mm, address)) ||
301 vma->vm_start > address || !(vma->vm_flags & VM_WRITE))
302 return -EFAULT;
303
304 switch (handle_mm_fault(mm, vma, address, 1)) {
305 case VM_FAULT_MINOR:
306 current->min_flt++;
307 break;
308 case VM_FAULT_MAJOR:
309 current->maj_flt++;
310 break;
311 default:
312 return -EFAULT;
313 }
314 return 0;
315}
316
317/*
318 * PI code:
319 */
320static int refill_pi_state_cache(void)
321{
322 struct futex_pi_state *pi_state;
323
324 if (likely(current->pi_state_cache))
325 return 0;
326
327 pi_state = kmalloc(sizeof(*pi_state), GFP_KERNEL);
328
329 if (!pi_state)
330 return -ENOMEM;
331
332 memset(pi_state, 0, sizeof(*pi_state));
333 INIT_LIST_HEAD(&pi_state->list);
334 /* pi_mutex gets initialized later */
335 pi_state->owner = NULL;
336 atomic_set(&pi_state->refcount, 1);
337
338 current->pi_state_cache = pi_state;
339
340 return 0;
341}
342
343static struct futex_pi_state * alloc_pi_state(void)
344{
345 struct futex_pi_state *pi_state = current->pi_state_cache;
346
347 WARN_ON(!pi_state);
348 current->pi_state_cache = NULL;
349
350 return pi_state;
351}
352
353static void free_pi_state(struct futex_pi_state *pi_state)
354{
355 if (!atomic_dec_and_test(&pi_state->refcount))
356 return;
357
358 /*
359 * If pi_state->owner is NULL, the owner is most probably dying
360 * and has cleaned up the pi_state already
361 */
362 if (pi_state->owner) {
363 spin_lock_irq(&pi_state->owner->pi_lock);
364 list_del_init(&pi_state->list);
365 spin_unlock_irq(&pi_state->owner->pi_lock);
366
367 rt_mutex_proxy_unlock(&pi_state->pi_mutex, pi_state->owner);
368 }
369
370 if (current->pi_state_cache)
371 kfree(pi_state);
372 else {
373 /*
374 * pi_state->list is already empty.
375 * clear pi_state->owner.
376 * refcount is at 0 - put it back to 1.
377 */
378 pi_state->owner = NULL;
379 atomic_set(&pi_state->refcount, 1);
380 current->pi_state_cache = pi_state;
381 }
382}
383
384/*
385 * Look up the task based on what TID userspace gave us.
 386 * We don't trust it.
387 */
388static struct task_struct * futex_find_get_task(pid_t pid)
389{
390 struct task_struct *p;
391
392 read_lock(&tasklist_lock);
393 p = find_task_by_pid(pid);
394 if (!p)
395 goto out_unlock;
396 if ((current->euid != p->euid) && (current->euid != p->uid)) {
397 p = NULL;
398 goto out_unlock;
399 }
400 if (p->state == EXIT_ZOMBIE || p->exit_state == EXIT_ZOMBIE) {
401 p = NULL;
402 goto out_unlock;
403 }
404 get_task_struct(p);
405out_unlock:
406 read_unlock(&tasklist_lock);
407
408 return p;
409}
410
411/*
412 * This task is holding PI mutexes at exit time => bad.
413 * Kernel cleans up PI-state, but userspace is likely hosed.
414 * (Robust-futex cleanup is separate and might save the day for userspace.)
415 */
416void exit_pi_state_list(struct task_struct *curr)
417{
418 struct futex_hash_bucket *hb;
419 struct list_head *next, *head = &curr->pi_state_list;
420 struct futex_pi_state *pi_state;
421 union futex_key key;
422
423 /*
424 * We are a ZOMBIE and nobody can enqueue itself on
425 * pi_state_list anymore, but we have to be careful
 426 * about waiters unqueueing themselves
427 */
428 spin_lock_irq(&curr->pi_lock);
429 while (!list_empty(head)) {
430
431 next = head->next;
432 pi_state = list_entry(next, struct futex_pi_state, list);
433 key = pi_state->key;
434 spin_unlock_irq(&curr->pi_lock);
435
436 hb = hash_futex(&key);
437 spin_lock(&hb->lock);
438
439 spin_lock_irq(&curr->pi_lock);
440 if (head->next != next) {
441 spin_unlock(&hb->lock);
442 continue;
443 }
444
445 list_del_init(&pi_state->list);
446
447 WARN_ON(pi_state->owner != curr);
448
449 pi_state->owner = NULL;
450 spin_unlock_irq(&curr->pi_lock);
451
452 rt_mutex_unlock(&pi_state->pi_mutex);
453
454 spin_unlock(&hb->lock);
455
456 spin_lock_irq(&curr->pi_lock);
457 }
458 spin_unlock_irq(&curr->pi_lock);
459}
460
461static int
462lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me)
463{
464 struct futex_pi_state *pi_state = NULL;
465 struct futex_q *this, *next;
466 struct list_head *head;
467 struct task_struct *p;
468 pid_t pid;
469
470 head = &hb->chain;
471
472 list_for_each_entry_safe(this, next, head, list) {
473 if (match_futex (&this->key, &me->key)) {
474 /*
475 * Another waiter already exists - bump up
476 * the refcount and return its pi_state:
477 */
478 pi_state = this->pi_state;
479 atomic_inc(&pi_state->refcount);
480 me->pi_state = pi_state;
481
482 return 0;
483 }
484 }
485
486 /*
487 * We are the first waiter - try to look up the real owner and
488 * attach the new pi_state to it:
489 */
490 pid = uval & FUTEX_TID_MASK;
491 p = futex_find_get_task(pid);
492 if (!p)
493 return -ESRCH;
494
495 pi_state = alloc_pi_state();
496
497 /*
498 * Initialize the pi_mutex in locked state and make 'p'
499 * the owner of it:
500 */
501 rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p);
502
503 /* Store the key for possible exit cleanups: */
504 pi_state->key = me->key;
505
506 spin_lock_irq(&p->pi_lock);
507 list_add(&pi_state->list, &p->pi_state_list);
508 pi_state->owner = p;
509 spin_unlock_irq(&p->pi_lock);
510
511 put_task_struct(p);
512
513 me->pi_state = pi_state;
514
515 return 0;
516}
517
518/*
261 * The hash bucket lock must be held when this is called. 519 * The hash bucket lock must be held when this is called.
262 * Afterwards, the futex_q must not be accessed. 520 * Afterwards, the futex_q must not be accessed.
263 */ 521 */
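The pi_state machinery above backs the new FUTEX_LOCK_PI and FUTEX_UNLOCK_PI operations: the futex word carries the owner's TID, FUTEX_WAITERS marks contention, and the kernel moves ownership through the embedded rt_mutex. A bare-bones userspace sketch of that protocol, assuming the 2.6.18 ABI values for the opcodes and flag bits (hard-coded below in case the installed <linux/futex.h> predates them) and GCC's __sync builtins.

#include <stdint.h>
#include <sys/syscall.h>
#include <unistd.h>

/* 2.6.18 futex ABI values, normally provided by <linux/futex.h> */
#define FUTEX_LOCK_PI	6
#define FUTEX_UNLOCK_PI	7
#define FUTEX_WAITERS	0x80000000

static long futex(uint32_t *uaddr, int op)
{
	return syscall(SYS_futex, uaddr, op, 0, NULL, NULL, 0);
}

static void pi_lock(uint32_t *lock)
{
	uint32_t tid = syscall(SYS_gettid);

	/* Fast path: 0 -> TID, no kernel entry. On contention the kernel
	 * allocates the pi_state and blocks us on the rt_mutex. */
	if (__sync_val_compare_and_swap(lock, 0, tid) != 0)
		futex(lock, FUTEX_LOCK_PI);
}

static void pi_unlock(uint32_t *lock)
{
	uint32_t tid = syscall(SYS_gettid);

	/* Fast path: TID -> 0. If that fails (e.g. FUTEX_WAITERS is set),
	 * the kernel hands the lock and the rt_mutex to the next waiter. */
	if (__sync_val_compare_and_swap(lock, tid, 0) != tid)
		futex(lock, FUTEX_UNLOCK_PI);
}

int main(void)
{
	static uint32_t lock;	/* 0 == unowned */

	pi_lock(&lock);
	/* ... critical section ... */
	pi_unlock(&lock);
	return 0;
}

Only the contended paths enter the kernel; the uncontended acquire and release stay a single atomic operation on the futex word, which is what the TID-in-word layout is for.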
@@ -284,16 +542,80 @@ static void wake_futex(struct futex_q *q)
284 q->lock_ptr = NULL; 542 q->lock_ptr = NULL;
285} 543}
286 544
545static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
546{
547 struct task_struct *new_owner;
548 struct futex_pi_state *pi_state = this->pi_state;
549 u32 curval, newval;
550
551 if (!pi_state)
552 return -EINVAL;
553
554 new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
555
556 /*
557 * This happens when we have stolen the lock and the original
558 * pending owner did not enqueue itself back on the rt_mutex.
 559 * That's not a tragedy. It just tells us that a lock waiter
560 * is on the fly. We make the futex_q waiter the pending owner.
561 */
562 if (!new_owner)
563 new_owner = this->task;
564
565 /*
566 * We pass it to the next owner. (The WAITERS bit is always
567 * kept enabled while there is PI state around. We must also
568 * preserve the owner died bit.)
569 */
570 newval = (uval & FUTEX_OWNER_DIED) | FUTEX_WAITERS | new_owner->pid;
571
572 inc_preempt_count();
573 curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
574 dec_preempt_count();
575
576 if (curval == -EFAULT)
577 return -EFAULT;
578 if (curval != uval)
579 return -EINVAL;
580
581 list_del_init(&pi_state->owner->pi_state_list);
582 list_add(&pi_state->list, &new_owner->pi_state_list);
583 pi_state->owner = new_owner;
584 rt_mutex_unlock(&pi_state->pi_mutex);
585
586 return 0;
587}
588
589static int unlock_futex_pi(u32 __user *uaddr, u32 uval)
590{
591 u32 oldval;
592
593 /*
594 * There is no waiter, so we unlock the futex. The owner died
 595 * bit need not be preserved here. We are the owner:
596 */
597 inc_preempt_count();
598 oldval = futex_atomic_cmpxchg_inatomic(uaddr, uval, 0);
599 dec_preempt_count();
600
601 if (oldval == -EFAULT)
602 return oldval;
603 if (oldval != uval)
604 return -EAGAIN;
605
606 return 0;
607}
608
287/* 609/*
288 * Wake up all waiters hashed on the physical page that is mapped 610 * Wake up all waiters hashed on the physical page that is mapped
289 * to this virtual address: 611 * to this virtual address:
290 */ 612 */
291static int futex_wake(unsigned long uaddr, int nr_wake) 613static int futex_wake(u32 __user *uaddr, int nr_wake)
292{ 614{
293 union futex_key key; 615 struct futex_hash_bucket *hb;
294 struct futex_hash_bucket *bh;
295 struct list_head *head;
296 struct futex_q *this, *next; 616 struct futex_q *this, *next;
617 struct list_head *head;
618 union futex_key key;
297 int ret; 619 int ret;
298 620
299 down_read(&current->mm->mmap_sem); 621 down_read(&current->mm->mmap_sem);
@@ -302,19 +624,23 @@ static int futex_wake(unsigned long uaddr, int nr_wake)
302 if (unlikely(ret != 0)) 624 if (unlikely(ret != 0))
303 goto out; 625 goto out;
304 626
305 bh = hash_futex(&key); 627 hb = hash_futex(&key);
306 spin_lock(&bh->lock); 628 spin_lock(&hb->lock);
307 head = &bh->chain; 629 head = &hb->chain;
308 630
309 list_for_each_entry_safe(this, next, head, list) { 631 list_for_each_entry_safe(this, next, head, list) {
310 if (match_futex (&this->key, &key)) { 632 if (match_futex (&this->key, &key)) {
633 if (this->pi_state) {
634 ret = -EINVAL;
635 break;
636 }
311 wake_futex(this); 637 wake_futex(this);
312 if (++ret >= nr_wake) 638 if (++ret >= nr_wake)
313 break; 639 break;
314 } 640 }
315 } 641 }
316 642
317 spin_unlock(&bh->lock); 643 spin_unlock(&hb->lock);
318out: 644out:
319 up_read(&current->mm->mmap_sem); 645 up_read(&current->mm->mmap_sem);
320 return ret; 646 return ret;
@@ -324,10 +650,12 @@ out:
324 * Wake up all waiters hashed on the physical page that is mapped 650 * Wake up all waiters hashed on the physical page that is mapped
325 * to this virtual address: 651 * to this virtual address:
326 */ 652 */
327static int futex_wake_op(unsigned long uaddr1, unsigned long uaddr2, int nr_wake, int nr_wake2, int op) 653static int
654futex_wake_op(u32 __user *uaddr1, u32 __user *uaddr2,
655 int nr_wake, int nr_wake2, int op)
328{ 656{
329 union futex_key key1, key2; 657 union futex_key key1, key2;
330 struct futex_hash_bucket *bh1, *bh2; 658 struct futex_hash_bucket *hb1, *hb2;
331 struct list_head *head; 659 struct list_head *head;
332 struct futex_q *this, *next; 660 struct futex_q *this, *next;
333 int ret, op_ret, attempt = 0; 661 int ret, op_ret, attempt = 0;
@@ -342,27 +670,29 @@ retryfull:
342 if (unlikely(ret != 0)) 670 if (unlikely(ret != 0))
343 goto out; 671 goto out;
344 672
345 bh1 = hash_futex(&key1); 673 hb1 = hash_futex(&key1);
346 bh2 = hash_futex(&key2); 674 hb2 = hash_futex(&key2);
347 675
348retry: 676retry:
349 if (bh1 < bh2) 677 if (hb1 < hb2)
350 spin_lock(&bh1->lock); 678 spin_lock(&hb1->lock);
351 spin_lock(&bh2->lock); 679 spin_lock(&hb2->lock);
352 if (bh1 > bh2) 680 if (hb1 > hb2)
353 spin_lock(&bh1->lock); 681 spin_lock(&hb1->lock);
354 682
355 op_ret = futex_atomic_op_inuser(op, (int __user *)uaddr2); 683 op_ret = futex_atomic_op_inuser(op, uaddr2);
356 if (unlikely(op_ret < 0)) { 684 if (unlikely(op_ret < 0)) {
357 int dummy; 685 u32 dummy;
358 686
359 spin_unlock(&bh1->lock); 687 spin_unlock(&hb1->lock);
360 if (bh1 != bh2) 688 if (hb1 != hb2)
361 spin_unlock(&bh2->lock); 689 spin_unlock(&hb2->lock);
362 690
363#ifndef CONFIG_MMU 691#ifndef CONFIG_MMU
364 /* we don't get EFAULT from MMU faults if we don't have an MMU, 692 /*
365 * but we might get them from range checking */ 693 * we don't get EFAULT from MMU faults if we don't have an MMU,
694 * but we might get them from range checking
695 */
366 ret = op_ret; 696 ret = op_ret;
367 goto out; 697 goto out;
368#endif 698#endif
@@ -372,47 +702,34 @@ retry:
372 goto out; 702 goto out;
373 } 703 }
374 704
375 /* futex_atomic_op_inuser needs to both read and write 705 /*
706 * futex_atomic_op_inuser needs to both read and write
376 * *(int __user *)uaddr2, but we can't modify it 707 * *(int __user *)uaddr2, but we can't modify it
377 * non-atomically. Therefore, if get_user below is not 708 * non-atomically. Therefore, if get_user below is not
378 * enough, we need to handle the fault ourselves, while 709 * enough, we need to handle the fault ourselves, while
379 * still holding the mmap_sem. */ 710 * still holding the mmap_sem.
711 */
380 if (attempt++) { 712 if (attempt++) {
381 struct vm_area_struct * vma; 713 if (futex_handle_fault((unsigned long)uaddr2,
382 struct mm_struct *mm = current->mm; 714 attempt))
383
384 ret = -EFAULT;
385 if (attempt >= 2 ||
386 !(vma = find_vma(mm, uaddr2)) ||
387 vma->vm_start > uaddr2 ||
388 !(vma->vm_flags & VM_WRITE))
389 goto out;
390
391 switch (handle_mm_fault(mm, vma, uaddr2, 1)) {
392 case VM_FAULT_MINOR:
393 current->min_flt++;
394 break;
395 case VM_FAULT_MAJOR:
396 current->maj_flt++;
397 break;
398 default:
399 goto out; 715 goto out;
400 }
401 goto retry; 716 goto retry;
402 } 717 }
403 718
404 /* If we would have faulted, release mmap_sem, 719 /*
405 * fault it in and start all over again. */ 720 * If we would have faulted, release mmap_sem,
721 * fault it in and start all over again.
722 */
406 up_read(&current->mm->mmap_sem); 723 up_read(&current->mm->mmap_sem);
407 724
408 ret = get_user(dummy, (int __user *)uaddr2); 725 ret = get_user(dummy, uaddr2);
409 if (ret) 726 if (ret)
410 return ret; 727 return ret;
411 728
412 goto retryfull; 729 goto retryfull;
413 } 730 }
414 731
415 head = &bh1->chain; 732 head = &hb1->chain;
416 733
417 list_for_each_entry_safe(this, next, head, list) { 734 list_for_each_entry_safe(this, next, head, list) {
418 if (match_futex (&this->key, &key1)) { 735 if (match_futex (&this->key, &key1)) {
@@ -423,7 +740,7 @@ retry:
423 } 740 }
424 741
425 if (op_ret > 0) { 742 if (op_ret > 0) {
426 head = &bh2->chain; 743 head = &hb2->chain;
427 744
428 op_ret = 0; 745 op_ret = 0;
429 list_for_each_entry_safe(this, next, head, list) { 746 list_for_each_entry_safe(this, next, head, list) {
@@ -436,9 +753,9 @@ retry:
436 ret += op_ret; 753 ret += op_ret;
437 } 754 }
438 755
439 spin_unlock(&bh1->lock); 756 spin_unlock(&hb1->lock);
440 if (bh1 != bh2) 757 if (hb1 != hb2)
441 spin_unlock(&bh2->lock); 758 spin_unlock(&hb2->lock);
442out: 759out:
443 up_read(&current->mm->mmap_sem); 760 up_read(&current->mm->mmap_sem);
444 return ret; 761 return ret;
@@ -448,11 +765,11 @@ out:
448 * Requeue all waiters hashed on one physical page to another 765 * Requeue all waiters hashed on one physical page to another
449 * physical page. 766 * physical page.
450 */ 767 */
451static int futex_requeue(unsigned long uaddr1, unsigned long uaddr2, 768static int futex_requeue(u32 __user *uaddr1, u32 __user *uaddr2,
452 int nr_wake, int nr_requeue, int *valp) 769 int nr_wake, int nr_requeue, u32 *cmpval)
453{ 770{
454 union futex_key key1, key2; 771 union futex_key key1, key2;
455 struct futex_hash_bucket *bh1, *bh2; 772 struct futex_hash_bucket *hb1, *hb2;
456 struct list_head *head1; 773 struct list_head *head1;
457 struct futex_q *this, *next; 774 struct futex_q *this, *next;
458 int ret, drop_count = 0; 775 int ret, drop_count = 0;
@@ -467,68 +784,72 @@ static int futex_requeue(unsigned long uaddr1, unsigned long uaddr2,
467 if (unlikely(ret != 0)) 784 if (unlikely(ret != 0))
468 goto out; 785 goto out;
469 786
470 bh1 = hash_futex(&key1); 787 hb1 = hash_futex(&key1);
471 bh2 = hash_futex(&key2); 788 hb2 = hash_futex(&key2);
472 789
473 if (bh1 < bh2) 790 if (hb1 < hb2)
474 spin_lock(&bh1->lock); 791 spin_lock(&hb1->lock);
475 spin_lock(&bh2->lock); 792 spin_lock(&hb2->lock);
476 if (bh1 > bh2) 793 if (hb1 > hb2)
477 spin_lock(&bh1->lock); 794 spin_lock(&hb1->lock);
478 795
479 if (likely(valp != NULL)) { 796 if (likely(cmpval != NULL)) {
480 int curval; 797 u32 curval;
481 798
482 ret = get_futex_value_locked(&curval, (int __user *)uaddr1); 799 ret = get_futex_value_locked(&curval, uaddr1);
483 800
484 if (unlikely(ret)) { 801 if (unlikely(ret)) {
485 spin_unlock(&bh1->lock); 802 spin_unlock(&hb1->lock);
486 if (bh1 != bh2) 803 if (hb1 != hb2)
487 spin_unlock(&bh2->lock); 804 spin_unlock(&hb2->lock);
488 805
489 /* If we would have faulted, release mmap_sem, fault 806 /*
807 * If we would have faulted, release mmap_sem, fault
490 * it in and start all over again. 808 * it in and start all over again.
491 */ 809 */
492 up_read(&current->mm->mmap_sem); 810 up_read(&current->mm->mmap_sem);
493 811
494 ret = get_user(curval, (int __user *)uaddr1); 812 ret = get_user(curval, uaddr1);
495 813
496 if (!ret) 814 if (!ret)
497 goto retry; 815 goto retry;
498 816
499 return ret; 817 return ret;
500 } 818 }
501 if (curval != *valp) { 819 if (curval != *cmpval) {
502 ret = -EAGAIN; 820 ret = -EAGAIN;
503 goto out_unlock; 821 goto out_unlock;
504 } 822 }
505 } 823 }
506 824
507 head1 = &bh1->chain; 825 head1 = &hb1->chain;
508 list_for_each_entry_safe(this, next, head1, list) { 826 list_for_each_entry_safe(this, next, head1, list) {
509 if (!match_futex (&this->key, &key1)) 827 if (!match_futex (&this->key, &key1))
510 continue; 828 continue;
511 if (++ret <= nr_wake) { 829 if (++ret <= nr_wake) {
512 wake_futex(this); 830 wake_futex(this);
513 } else { 831 } else {
514 list_move_tail(&this->list, &bh2->chain); 832 /*
515 this->lock_ptr = &bh2->lock; 833 * If key1 and key2 hash to the same bucket, no need to
834 * requeue.
835 */
836 if (likely(head1 != &hb2->chain)) {
837 list_move_tail(&this->list, &hb2->chain);
838 this->lock_ptr = &hb2->lock;
839 }
516 this->key = key2; 840 this->key = key2;
517 get_key_refs(&key2); 841 get_key_refs(&key2);
518 drop_count++; 842 drop_count++;
519 843
520 if (ret - nr_wake >= nr_requeue) 844 if (ret - nr_wake >= nr_requeue)
521 break; 845 break;
522 /* Make sure to stop if key1 == key2 */
523 if (head1 == &bh2->chain && head1 != &next->list)
524 head1 = &this->list;
525 } 846 }
526 } 847 }
527 848
528out_unlock: 849out_unlock:
529 spin_unlock(&bh1->lock); 850 spin_unlock(&hb1->lock);
530 if (bh1 != bh2) 851 if (hb1 != hb2)
531 spin_unlock(&bh2->lock); 852 spin_unlock(&hb2->lock);
532 853
533 /* drop_key_refs() must be called outside the spinlocks. */ 854 /* drop_key_refs() must be called outside the spinlocks. */
534 while (--drop_count >= 0) 855 while (--drop_count >= 0)
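For context, here is a minimal userspace sketch of how this requeue path is driven (not part of the patch; the helper name and the condvar/mutex pairing are illustrative). The nr_requeue count travels in the syscall's timeout slot and the expected value in val3, matching the cmpval check done above under the bucket locks.

#include <linux/futex.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Wake nr_wake waiters on cond_futex and requeue up to nr_requeue of the
 * rest onto mutex_futex, but only if *cond_futex still equals expected. */
static long futex_cmp_requeue(int *cond_futex, int *mutex_futex,
                              int nr_wake, int nr_requeue, int expected)
{
        return syscall(SYS_futex, cond_futex, FUTEX_CMP_REQUEUE, nr_wake,
                       (unsigned long)nr_requeue, mutex_futex, expected);
}

A -1/EAGAIN return corresponds to the -EAGAIN exit above: the futex word changed between the caller's read and the requeue, so the caller re-reads and retries.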
@@ -543,7 +864,7 @@ out:
543static inline struct futex_hash_bucket * 864static inline struct futex_hash_bucket *
544queue_lock(struct futex_q *q, int fd, struct file *filp) 865queue_lock(struct futex_q *q, int fd, struct file *filp)
545{ 866{
546 struct futex_hash_bucket *bh; 867 struct futex_hash_bucket *hb;
547 868
548 q->fd = fd; 869 q->fd = fd;
549 q->filp = filp; 870 q->filp = filp;
@@ -551,23 +872,24 @@ queue_lock(struct futex_q *q, int fd, struct file *filp)
551 init_waitqueue_head(&q->waiters); 872 init_waitqueue_head(&q->waiters);
552 873
553 get_key_refs(&q->key); 874 get_key_refs(&q->key);
554 bh = hash_futex(&q->key); 875 hb = hash_futex(&q->key);
555 q->lock_ptr = &bh->lock; 876 q->lock_ptr = &hb->lock;
556 877
557 spin_lock(&bh->lock); 878 spin_lock(&hb->lock);
558 return bh; 879 return hb;
559} 880}
560 881
561static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *bh) 882static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
562{ 883{
563 list_add_tail(&q->list, &bh->chain); 884 list_add_tail(&q->list, &hb->chain);
564 spin_unlock(&bh->lock); 885 q->task = current;
886 spin_unlock(&hb->lock);
565} 887}
566 888
567static inline void 889static inline void
568queue_unlock(struct futex_q *q, struct futex_hash_bucket *bh) 890queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb)
569{ 891{
570 spin_unlock(&bh->lock); 892 spin_unlock(&hb->lock);
571 drop_key_refs(&q->key); 893 drop_key_refs(&q->key);
572} 894}
573 895
@@ -579,16 +901,17 @@ queue_unlock(struct futex_q *q, struct futex_hash_bucket *bh)
579/* The key must be already stored in q->key. */ 901/* The key must be already stored in q->key. */
580static void queue_me(struct futex_q *q, int fd, struct file *filp) 902static void queue_me(struct futex_q *q, int fd, struct file *filp)
581{ 903{
582 struct futex_hash_bucket *bh; 904 struct futex_hash_bucket *hb;
583 bh = queue_lock(q, fd, filp); 905
584 __queue_me(q, bh); 906 hb = queue_lock(q, fd, filp);
907 __queue_me(q, hb);
585} 908}
586 909
587/* Return 1 if we were still queued (ie. 0 means we were woken) */ 910/* Return 1 if we were still queued (ie. 0 means we were woken) */
588static int unqueue_me(struct futex_q *q) 911static int unqueue_me(struct futex_q *q)
589{ 912{
590 int ret = 0;
591 spinlock_t *lock_ptr; 913 spinlock_t *lock_ptr;
914 int ret = 0;
592 915
593 /* In the common case we don't take the spinlock, which is nice. */ 916 /* In the common case we don't take the spinlock, which is nice. */
594 retry: 917 retry:
@@ -614,6 +937,9 @@ static int unqueue_me(struct futex_q *q)
614 } 937 }
615 WARN_ON(list_empty(&q->list)); 938 WARN_ON(list_empty(&q->list));
616 list_del(&q->list); 939 list_del(&q->list);
940
941 BUG_ON(q->pi_state);
942
617 spin_unlock(lock_ptr); 943 spin_unlock(lock_ptr);
618 ret = 1; 944 ret = 1;
619 } 945 }
@@ -622,21 +948,42 @@ static int unqueue_me(struct futex_q *q)
622 return ret; 948 return ret;
623} 949}
624 950
625static int futex_wait(unsigned long uaddr, int val, unsigned long time) 951/*
952 * PI futexes cannot be requeued and must remove themselves from the
953 * hash bucket. The hash bucket lock is held on entry and dropped here.
954 */
955static void unqueue_me_pi(struct futex_q *q, struct futex_hash_bucket *hb)
626{ 956{
627 DECLARE_WAITQUEUE(wait, current); 957 WARN_ON(list_empty(&q->list));
628 int ret, curval; 958 list_del(&q->list);
959
960 BUG_ON(!q->pi_state);
961 free_pi_state(q->pi_state);
962 q->pi_state = NULL;
963
964 spin_unlock(&hb->lock);
965
966 drop_key_refs(&q->key);
967}
968
969static int futex_wait(u32 __user *uaddr, u32 val, unsigned long time)
970{
971 struct task_struct *curr = current;
972 DECLARE_WAITQUEUE(wait, curr);
973 struct futex_hash_bucket *hb;
629 struct futex_q q; 974 struct futex_q q;
630 struct futex_hash_bucket *bh; 975 u32 uval;
976 int ret;
631 977
978 q.pi_state = NULL;
632 retry: 979 retry:
633 down_read(&current->mm->mmap_sem); 980 down_read(&curr->mm->mmap_sem);
634 981
635 ret = get_futex_key(uaddr, &q.key); 982 ret = get_futex_key(uaddr, &q.key);
636 if (unlikely(ret != 0)) 983 if (unlikely(ret != 0))
637 goto out_release_sem; 984 goto out_release_sem;
638 985
639 bh = queue_lock(&q, -1, NULL); 986 hb = queue_lock(&q, -1, NULL);
640 987
641 /* 988 /*
642 * Access the page AFTER the futex is queued. 989 * Access the page AFTER the futex is queued.
@@ -658,37 +1005,35 @@ static int futex_wait(unsigned long uaddr, int val, unsigned long time)
658 * We hold the mmap semaphore, so the mapping cannot have changed 1005 * We hold the mmap semaphore, so the mapping cannot have changed
659 * since we looked it up in get_futex_key. 1006 * since we looked it up in get_futex_key.
660 */ 1007 */
661 1008 ret = get_futex_value_locked(&uval, uaddr);
662 ret = get_futex_value_locked(&curval, (int __user *)uaddr);
663 1009
664 if (unlikely(ret)) { 1010 if (unlikely(ret)) {
665 queue_unlock(&q, bh); 1011 queue_unlock(&q, hb);
666 1012
667 /* If we would have faulted, release mmap_sem, fault it in and 1013 /*
1014 * If we would have faulted, release mmap_sem, fault it in and
668 * start all over again. 1015 * start all over again.
669 */ 1016 */
670 up_read(&current->mm->mmap_sem); 1017 up_read(&curr->mm->mmap_sem);
671 1018
672 ret = get_user(curval, (int __user *)uaddr); 1019 ret = get_user(uval, uaddr);
673 1020
674 if (!ret) 1021 if (!ret)
675 goto retry; 1022 goto retry;
676 return ret; 1023 return ret;
677 } 1024 }
678 if (curval != val) { 1025 ret = -EWOULDBLOCK;
679 ret = -EWOULDBLOCK; 1026 if (uval != val)
680 queue_unlock(&q, bh); 1027 goto out_unlock_release_sem;
681 goto out_release_sem;
682 }
683 1028
684 /* Only actually queue if *uaddr contained val. */ 1029 /* Only actually queue if *uaddr contained val. */
685 __queue_me(&q, bh); 1030 __queue_me(&q, hb);
686 1031
687 /* 1032 /*
688 * Now the futex is queued and we have checked the data, we 1033 * Now the futex is queued and we have checked the data, we
689 * don't want to hold mmap_sem while we sleep. 1034 * don't want to hold mmap_sem while we sleep.
690 */ 1035 */
691 up_read(&current->mm->mmap_sem); 1036 up_read(&curr->mm->mmap_sem);
692 1037
693 /* 1038 /*
694 * There might have been scheduling since the queue_me(), as we 1039 * There might have been scheduling since the queue_me(), as we
@@ -720,12 +1065,421 @@ static int futex_wait(unsigned long uaddr, int val, unsigned long time)
720 return 0; 1065 return 0;
721 if (time == 0) 1066 if (time == 0)
722 return -ETIMEDOUT; 1067 return -ETIMEDOUT;
723 /* We expect signal_pending(current), but another thread may 1068 /*
724 * have handled it for us already. */ 1069 * We expect signal_pending(current), but another thread may
1070 * have handled it for us already.
1071 */
725 return -EINTR; 1072 return -EINTR;
726 1073
1074 out_unlock_release_sem:
1075 queue_unlock(&q, hb);
1076
727 out_release_sem: 1077 out_release_sem:
1078 up_read(&curr->mm->mmap_sem);
1079 return ret;
1080}
1081
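As a rough userspace illustration of the contract futex_wait() implements (a sketch, not from the patch; helper names are invented and the atomic load uses the GCC __atomic builtins): the kernel only sleeps if *uaddr still holds val after queueing, so the caller loops and re-checks its condition on every wakeup.

#include <linux/futex.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <errno.h>

static void wait_while_equal(int *uaddr, int val)
{
        /* The kernel re-reads *uaddr after queueing; EWOULDBLOCK/EAGAIN
         * means the value changed and we simply re-check it here. */
        while (__atomic_load_n(uaddr, __ATOMIC_ACQUIRE) == val) {
                if (syscall(SYS_futex, uaddr, FUTEX_WAIT, val, NULL, NULL, 0) == -1 &&
                    errno != EAGAIN && errno != EINTR)
                        break;
        }
}

static void wake_one(int *uaddr)
{
        syscall(SYS_futex, uaddr, FUTEX_WAKE, 1, NULL, NULL, 0);
}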
1082/*
1083 * Userspace tried a 0 -> TID atomic transition of the futex value
1084 * and failed. The kernel side here does the whole locking operation:
1085 * if there are waiters then it will block, it does PI, etc. (Due to
1086 * races the kernel might see a 0 value of the futex too.)
1087 */
1088static int do_futex_lock_pi(u32 __user *uaddr, int detect, int trylock,
1089 struct hrtimer_sleeper *to)
1090{
1091 struct task_struct *curr = current;
1092 struct futex_hash_bucket *hb;
1093 u32 uval, newval, curval;
1094 struct futex_q q;
1095 int ret, attempt = 0;
1096
1097 if (refill_pi_state_cache())
1098 return -ENOMEM;
1099
1100 q.pi_state = NULL;
1101 retry:
1102 down_read(&curr->mm->mmap_sem);
1103
1104 ret = get_futex_key(uaddr, &q.key);
1105 if (unlikely(ret != 0))
1106 goto out_release_sem;
1107
1108 hb = queue_lock(&q, -1, NULL);
1109
1110 retry_locked:
1111 /*
1112 * To avoid races, we attempt to take the lock here again
1113 * (by doing a 0 -> TID atomic cmpxchg), while holding all
1114 * the locks. It will most likely not succeed.
1115 */
1116 newval = current->pid;
1117
1118 inc_preempt_count();
1119 curval = futex_atomic_cmpxchg_inatomic(uaddr, 0, newval);
1120 dec_preempt_count();
1121
1122 if (unlikely(curval == -EFAULT))
1123 goto uaddr_faulted;
1124
1125 /* We own the lock already */
1126 if (unlikely((curval & FUTEX_TID_MASK) == current->pid)) {
1127 if (!detect && 0)
1128 force_sig(SIGKILL, current);
1129 ret = -EDEADLK;
1130 goto out_unlock_release_sem;
1131 }
1132
1133 /*
1134 * Surprise - we got the lock. Just return
1135 * to userspace:
1136 */
1137 if (unlikely(!curval))
1138 goto out_unlock_release_sem;
1139
1140 uval = curval;
1141 newval = uval | FUTEX_WAITERS;
1142
1143 inc_preempt_count();
1144 curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
1145 dec_preempt_count();
1146
1147 if (unlikely(curval == -EFAULT))
1148 goto uaddr_faulted;
1149 if (unlikely(curval != uval))
1150 goto retry_locked;
1151
1152 /*
1153 * We dont have the lock. Look up the PI state (or create it if
1154 * we are the first waiter):
1155 */
1156 ret = lookup_pi_state(uval, hb, &q);
1157
1158 if (unlikely(ret)) {
1159 /*
1160 * There were no waiters and the owner task lookup
1161 * failed. When the OWNER_DIED bit is set, then we
1162 * know that this is a robust futex and we actually
1163 * take the lock. This is safe as we are protected by
1164 * the hash bucket lock. We also set the waiters bit
1165 * unconditionally here, to simplify glibc handling of
1166 * multiple tasks racing to acquire the lock and
1167 * cleanup the problems which were left by the dead
1168 * owner.
1169 */
1170 if (curval & FUTEX_OWNER_DIED) {
1171 uval = newval;
1172 newval = current->pid |
1173 FUTEX_OWNER_DIED | FUTEX_WAITERS;
1174
1175 inc_preempt_count();
1176 curval = futex_atomic_cmpxchg_inatomic(uaddr,
1177 uval, newval);
1178 dec_preempt_count();
1179
1180 if (unlikely(curval == -EFAULT))
1181 goto uaddr_faulted;
1182 if (unlikely(curval != uval))
1183 goto retry_locked;
1184 ret = 0;
1185 }
1186 goto out_unlock_release_sem;
1187 }
1188
1189 /*
1190 * Only actually queue now that the atomic ops are done:
1191 */
1192 __queue_me(&q, hb);
1193
1194 /*
1195 * Now the futex is queued and we have checked the data, we
1196 * don't want to hold mmap_sem while we sleep.
1197 */
1198 up_read(&curr->mm->mmap_sem);
1199
1200 WARN_ON(!q.pi_state);
1201 /*
1202 * Block on the PI mutex:
1203 */
1204 if (!trylock)
1205 ret = rt_mutex_timed_lock(&q.pi_state->pi_mutex, to, 1);
1206 else {
1207 ret = rt_mutex_trylock(&q.pi_state->pi_mutex);
1208 /* Fixup the trylock return value: */
1209 ret = ret ? 0 : -EWOULDBLOCK;
1210 }
1211
1212 down_read(&curr->mm->mmap_sem);
1213 spin_lock(q.lock_ptr);
1214
1215 /*
1216 * Got the lock. We might not be the anticipated owner if we
1217 * did a lock-steal - fix up the PI-state in that case.
1218 */
1219 if (!ret && q.pi_state->owner != curr) {
1220 u32 newtid = current->pid | FUTEX_WAITERS;
1221
1222 /* Owner died? */
1223 if (q.pi_state->owner != NULL) {
1224 spin_lock_irq(&q.pi_state->owner->pi_lock);
1225 list_del_init(&q.pi_state->list);
1226 spin_unlock_irq(&q.pi_state->owner->pi_lock);
1227 } else
1228 newtid |= FUTEX_OWNER_DIED;
1229
1230 q.pi_state->owner = current;
1231
1232 spin_lock_irq(&current->pi_lock);
1233 list_add(&q.pi_state->list, &current->pi_state_list);
1234 spin_unlock_irq(&current->pi_lock);
1235
1236 /* Unqueue and drop the lock */
1237 unqueue_me_pi(&q, hb);
1238 up_read(&curr->mm->mmap_sem);
1239 /*
1240 * We own it, so we have to replace the pending owner
1241 * TID. This must be atomic as we have to preserve the
1242 * owner died bit here.
1243 */
1244 ret = get_user(uval, uaddr);
1245 while (!ret) {
1246 newval = (uval & FUTEX_OWNER_DIED) | newtid;
1247 curval = futex_atomic_cmpxchg_inatomic(uaddr,
1248 uval, newval);
1249 if (curval == -EFAULT)
1250 ret = -EFAULT;
1251 if (curval == uval)
1252 break;
1253 uval = curval;
1254 }
1255 } else {
1256 /*
1257 * Catch the rare case, where the lock was released
1258 * when we were on the way back before we locked
1259 * the hash bucket.
1260 */
1261 if (ret && q.pi_state->owner == curr) {
1262 if (rt_mutex_trylock(&q.pi_state->pi_mutex))
1263 ret = 0;
1264 }
1265 /* Unqueue and drop the lock */
1266 unqueue_me_pi(&q, hb);
1267 up_read(&curr->mm->mmap_sem);
1268 }
1269
1270 if (!detect && ret == -EDEADLK && 0)
1271 force_sig(SIGKILL, current);
1272
1273 return ret;
1274
1275 out_unlock_release_sem:
1276 queue_unlock(&q, hb);
1277
1278 out_release_sem:
1279 up_read(&curr->mm->mmap_sem);
1280 return ret;
1281
1282 uaddr_faulted:
1283 /*
1284 * We have to r/w *(int __user *)uaddr, but we can't modify it
1285 * non-atomically. Therefore, if get_user below is not
1286 * enough, we need to handle the fault ourselves, while
1287 * still holding the mmap_sem.
1288 */
1289 if (attempt++) {
1290 if (futex_handle_fault((unsigned long)uaddr, attempt))
1291 goto out_unlock_release_sem;
1292
1293 goto retry_locked;
1294 }
1295
1296 queue_unlock(&q, hb);
1297 up_read(&curr->mm->mmap_sem);
1298
1299 ret = get_user(uval, uaddr);
1300 if (!ret && (uval != -EFAULT))
1301 goto retry;
1302
1303 return ret;
1304}
1305
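For orientation, a hedged userspace sketch of the fast path whose failure lands in do_futex_lock_pi() and futex_unlock_pi() (not part of the patch; the helper names are invented, gettid is used for the TID, and FUTEX_LOCK_PI/FUTEX_UNLOCK_PI are assumed to come from <linux/futex.h> as introduced by this series):

#include <linux/futex.h>
#include <sys/syscall.h>
#include <unistd.h>

static void pi_lock(int *futex)
{
        int tid = syscall(SYS_gettid);

        /* Uncontended: 0 -> TID entirely in userspace, no syscall. */
        if (__sync_val_compare_and_swap(futex, 0, tid) != 0)
                /* Contended: the kernel queues us and boosts the owner. */
                syscall(SYS_futex, futex, FUTEX_LOCK_PI, 0, NULL, NULL, 0);
}

static void pi_unlock(int *futex)
{
        int tid = syscall(SYS_gettid);

        /* Uncontended: TID -> 0. If FUTEX_WAITERS (or OWNER_DIED) is set
         * the cmpxchg fails and the kernel hands the lock to the top waiter. */
        if (__sync_val_compare_and_swap(futex, tid, 0) != tid)
                syscall(SYS_futex, futex, FUTEX_UNLOCK_PI, 0, NULL, NULL, 0);
}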
1306/*
1307 * Restart handler
1308 */
1309static long futex_lock_pi_restart(struct restart_block *restart)
1310{
1311 struct hrtimer_sleeper timeout, *to = NULL;
1312 int ret;
1313
1314 restart->fn = do_no_restart_syscall;
1315
1316 if (restart->arg2 || restart->arg3) {
1317 to = &timeout;
1318 hrtimer_init(&to->timer, CLOCK_REALTIME, HRTIMER_ABS);
1319 hrtimer_init_sleeper(to, current);
1320 to->timer.expires.tv64 = ((u64)restart->arg1 << 32) |
1321 (u64) restart->arg0;
1322 }
1323
1324 pr_debug("lock_pi restart: %p, %d (%d)\n",
1325 (u32 __user *)restart->arg0, current->pid);
1326
1327 ret = do_futex_lock_pi((u32 __user *)restart->arg0, restart->arg1,
1328 0, to);
1329
1330 if (ret != -EINTR)
1331 return ret;
1332
1333 restart->fn = futex_lock_pi_restart;
1334
1335 /* The other values are filled in */
1336 return -ERESTART_RESTARTBLOCK;
1337}
1338
1339/*
1340 * Called from the syscall entry below.
1341 */
1342static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
1343 long nsec, int trylock)
1344{
1345 struct hrtimer_sleeper timeout, *to = NULL;
1346 struct restart_block *restart;
1347 int ret;
1348
1349 if (sec != MAX_SCHEDULE_TIMEOUT) {
1350 to = &timeout;
1351 hrtimer_init(&to->timer, CLOCK_REALTIME, HRTIMER_ABS);
1352 hrtimer_init_sleeper(to, current);
1353 to->timer.expires = ktime_set(sec, nsec);
1354 }
1355
1356 ret = do_futex_lock_pi(uaddr, detect, trylock, to);
1357
1358 if (ret != -EINTR)
1359 return ret;
1360
1361 pr_debug("lock_pi interrupted: %p, %d (%d)\n", uaddr, current->pid);
1362
1363 restart = &current_thread_info()->restart_block;
1364 restart->fn = futex_lock_pi_restart;
1365 restart->arg0 = (unsigned long) uaddr;
1366 restart->arg1 = detect;
1367 if (to) {
1368 restart->arg2 = to->timer.expires.tv64 & 0xFFFFFFFF;
1369 restart->arg3 = to->timer.expires.tv64 >> 32;
1370 } else
1371 restart->arg2 = restart->arg3 = 0;
1372
1373 return -ERESTART_RESTARTBLOCK;
1374}
1375
1376/*
1377 * Userspace attempted a TID -> 0 atomic transition, and failed.
1378 * This is the in-kernel slowpath: we look up the PI state (if any),
1379 * and do the rt-mutex unlock.
1380 */
1381static int futex_unlock_pi(u32 __user *uaddr)
1382{
1383 struct futex_hash_bucket *hb;
1384 struct futex_q *this, *next;
1385 u32 uval;
1386 struct list_head *head;
1387 union futex_key key;
1388 int ret, attempt = 0;
1389
1390retry:
1391 if (get_user(uval, uaddr))
1392 return -EFAULT;
1393 /*
1394 * We release only a lock we actually own:
1395 */
1396 if ((uval & FUTEX_TID_MASK) != current->pid)
1397 return -EPERM;
1398 /*
1399 * First take all the futex related locks:
1400 */
1401 down_read(&current->mm->mmap_sem);
1402
1403 ret = get_futex_key(uaddr, &key);
1404 if (unlikely(ret != 0))
1405 goto out;
1406
1407 hb = hash_futex(&key);
1408 spin_lock(&hb->lock);
1409
1410retry_locked:
1411 /*
1412 * To avoid races, try to do the TID -> 0 atomic transition
1413 * again. If it succeeds then we can return without waking
1414 * anyone else up:
1415 */
1416 inc_preempt_count();
1417 uval = futex_atomic_cmpxchg_inatomic(uaddr, current->pid, 0);
1418 dec_preempt_count();
1419
1420 if (unlikely(uval == -EFAULT))
1421 goto pi_faulted;
1422 /*
1423 * Rare case: we managed to release the lock atomically,
1424 * no need to wake anyone else up:
1425 */
1426 if (unlikely(uval == current->pid))
1427 goto out_unlock;
1428
1429 /*
1430 * Ok, other tasks may need to be woken up - check waiters
1431 * and do the wakeup if necessary:
1432 */
1433 head = &hb->chain;
1434
1435 list_for_each_entry_safe(this, next, head, list) {
1436 if (!match_futex (&this->key, &key))
1437 continue;
1438 ret = wake_futex_pi(uaddr, uval, this);
1439 /*
1440 * The atomic access to the futex value
1441 * generated a pagefault, so retry the
1442 * user-access and the wakeup:
1443 */
1444 if (ret == -EFAULT)
1445 goto pi_faulted;
1446 goto out_unlock;
1447 }
1448 /*
1449 * No waiters - kernel unlocks the futex:
1450 */
1451 ret = unlock_futex_pi(uaddr, uval);
1452 if (ret == -EFAULT)
1453 goto pi_faulted;
1454
1455out_unlock:
1456 spin_unlock(&hb->lock);
1457out:
728 up_read(&current->mm->mmap_sem); 1458 up_read(&current->mm->mmap_sem);
1459
1460 return ret;
1461
1462pi_faulted:
1463 /*
1464 * We have to r/w *(int __user *)uaddr, but we can't modify it
1465 * non-atomically. Therefore, if get_user below is not
1466 * enough, we need to handle the fault ourselves, while
1467 * still holding the mmap_sem.
1468 */
1469 if (attempt++) {
1470 if (futex_handle_fault((unsigned long)uaddr, attempt))
1471 goto out_unlock;
1472
1473 goto retry_locked;
1474 }
1475
1476 spin_unlock(&hb->lock);
1477 up_read(&current->mm->mmap_sem);
1478
1479 ret = get_user(uval, uaddr);
1480 if (!ret && (uval != -EFAULT))
1481 goto retry;
1482
729 return ret; 1483 return ret;
730} 1484}
731 1485
@@ -735,6 +1489,7 @@ static int futex_close(struct inode *inode, struct file *filp)
735 1489
736 unqueue_me(q); 1490 unqueue_me(q);
737 kfree(q); 1491 kfree(q);
1492
738 return 0; 1493 return 0;
739} 1494}
740 1495
@@ -766,7 +1521,7 @@ static struct file_operations futex_fops = {
766 * Signal allows caller to avoid the race which would occur if they 1521 * Signal allows caller to avoid the race which would occur if they
767 * set the sigio stuff up afterwards. 1522 * set the sigio stuff up afterwards.
768 */ 1523 */
769static int futex_fd(unsigned long uaddr, int signal) 1524static int futex_fd(u32 __user *uaddr, int signal)
770{ 1525{
771 struct futex_q *q; 1526 struct futex_q *q;
772 struct file *filp; 1527 struct file *filp;
@@ -803,6 +1558,7 @@ static int futex_fd(unsigned long uaddr, int signal)
803 err = -ENOMEM; 1558 err = -ENOMEM;
804 goto error; 1559 goto error;
805 } 1560 }
1561 q->pi_state = NULL;
806 1562
807 down_read(&current->mm->mmap_sem); 1563 down_read(&current->mm->mmap_sem);
808 err = get_futex_key(uaddr, &q->key); 1564 err = get_futex_key(uaddr, &q->key);
@@ -840,7 +1596,7 @@ error:
840 * Implementation: user-space maintains a per-thread list of locks it 1596 * Implementation: user-space maintains a per-thread list of locks it
841 * is holding. Upon do_exit(), the kernel carefully walks this list, 1597 * is holding. Upon do_exit(), the kernel carefully walks this list,
842 * and marks all locks that are owned by this thread with the 1598 * and marks all locks that are owned by this thread with the
843 * FUTEX_OWNER_DEAD bit, and wakes up a waiter (if any). The list is 1599 * FUTEX_OWNER_DIED bit, and wakes up a waiter (if any). The list is
844 * always manipulated with the lock held, so the list is private and 1600 * always manipulated with the lock held, so the list is private and
845 * per-thread. Userspace also maintains a per-thread 'list_op_pending' 1601 * per-thread. Userspace also maintains a per-thread 'list_op_pending'
846 * field, to allow the kernel to clean up if the thread dies after 1602 * field, to allow the kernel to clean up if the thread dies after
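A hedged sketch of the userspace half of this protocol (not from the patch; the my_robust_mutex layout and helper are invented, while the robust_list structures are the ones declared in <linux/futex.h>):

#include <linux/futex.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <stddef.h>

struct my_robust_mutex {
        struct robust_list list;   /* linked into the per-thread robust list */
        int                futex;  /* holds TID | FUTEX_WAITERS | FUTEX_OWNER_DIED */
};

static __thread struct robust_list_head robust_head;

static void register_robust_list(void)
{
        robust_head.list.next = &robust_head.list;   /* empty, circular list */
        robust_head.futex_offset = offsetof(struct my_robust_mutex, futex) -
                                   offsetof(struct my_robust_mutex, list);
        robust_head.list_op_pending = NULL;          /* set before lock/unlock ops */
        /* Tell the kernel where to walk this thread's list on do_exit(). */
        syscall(SYS_set_robust_list, &robust_head, sizeof(robust_head));
}

On thread death the kernel walks this list and, for each held lock, sets FUTEX_OWNER_DIED via handle_futex_death() below and wakes one waiter.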
@@ -915,7 +1671,7 @@ err_unlock:
915 */ 1671 */
916int handle_futex_death(u32 __user *uaddr, struct task_struct *curr) 1672int handle_futex_death(u32 __user *uaddr, struct task_struct *curr)
917{ 1673{
918 u32 uval; 1674 u32 uval, nval;
919 1675
920retry: 1676retry:
921 if (get_user(uval, uaddr)) 1677 if (get_user(uval, uaddr))
@@ -932,12 +1688,16 @@ retry:
932 * thread-death.) The rest of the cleanup is done in 1688 * thread-death.) The rest of the cleanup is done in
933 * userspace. 1689 * userspace.
934 */ 1690 */
935 if (futex_atomic_cmpxchg_inatomic(uaddr, uval, 1691 nval = futex_atomic_cmpxchg_inatomic(uaddr, uval,
936 uval | FUTEX_OWNER_DIED) != uval) 1692 uval | FUTEX_OWNER_DIED);
1693 if (nval == -EFAULT)
1694 return -1;
1695
1696 if (nval != uval)
937 goto retry; 1697 goto retry;
938 1698
939 if (uval & FUTEX_WAITERS) 1699 if (uval & FUTEX_WAITERS)
940 futex_wake((unsigned long)uaddr, 1); 1700 futex_wake(uaddr, 1);
941 } 1701 }
942 return 0; 1702 return 0;
943} 1703}
@@ -978,7 +1738,7 @@ void exit_robust_list(struct task_struct *curr)
978 while (entry != &head->list) { 1738 while (entry != &head->list) {
979 /* 1739 /*
980 * A pending lock might already be on the list, so 1740 * A pending lock might already be on the list, so
981 * dont process it twice: 1741 * don't process it twice:
982 */ 1742 */
983 if (entry != pending) 1743 if (entry != pending)
984 if (handle_futex_death((void *)entry + futex_offset, 1744 if (handle_futex_death((void *)entry + futex_offset,
@@ -999,8 +1759,8 @@ void exit_robust_list(struct task_struct *curr)
999 } 1759 }
1000} 1760}
1001 1761
1002long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout, 1762long do_futex(u32 __user *uaddr, int op, u32 val, unsigned long timeout,
1003 unsigned long uaddr2, int val2, int val3) 1763 u32 __user *uaddr2, u32 val2, u32 val3)
1004{ 1764{
1005 int ret; 1765 int ret;
1006 1766
@@ -1024,6 +1784,15 @@ long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout,
1024 case FUTEX_WAKE_OP: 1784 case FUTEX_WAKE_OP:
1025 ret = futex_wake_op(uaddr, uaddr2, val, val2, val3); 1785 ret = futex_wake_op(uaddr, uaddr2, val, val2, val3);
1026 break; 1786 break;
1787 case FUTEX_LOCK_PI:
1788 ret = futex_lock_pi(uaddr, val, timeout, val2, 0);
1789 break;
1790 case FUTEX_UNLOCK_PI:
1791 ret = futex_unlock_pi(uaddr);
1792 break;
1793 case FUTEX_TRYLOCK_PI:
1794 ret = futex_lock_pi(uaddr, 0, timeout, val2, 1);
1795 break;
1027 default: 1796 default:
1028 ret = -ENOSYS; 1797 ret = -ENOSYS;
1029 } 1798 }
@@ -1031,36 +1800,40 @@ long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout,
1031} 1800}
1032 1801
1033 1802
1034asmlinkage long sys_futex(u32 __user *uaddr, int op, int val, 1803asmlinkage long sys_futex(u32 __user *uaddr, int op, u32 val,
1035 struct timespec __user *utime, u32 __user *uaddr2, 1804 struct timespec __user *utime, u32 __user *uaddr2,
1036 int val3) 1805 u32 val3)
1037{ 1806{
1038 struct timespec t; 1807 struct timespec t;
1039 unsigned long timeout = MAX_SCHEDULE_TIMEOUT; 1808 unsigned long timeout = MAX_SCHEDULE_TIMEOUT;
1040 int val2 = 0; 1809 u32 val2 = 0;
1041 1810
1042 if (utime && (op == FUTEX_WAIT)) { 1811 if (utime && (op == FUTEX_WAIT || op == FUTEX_LOCK_PI)) {
1043 if (copy_from_user(&t, utime, sizeof(t)) != 0) 1812 if (copy_from_user(&t, utime, sizeof(t)) != 0)
1044 return -EFAULT; 1813 return -EFAULT;
1045 if (!timespec_valid(&t)) 1814 if (!timespec_valid(&t))
1046 return -EINVAL; 1815 return -EINVAL;
1047 timeout = timespec_to_jiffies(&t) + 1; 1816 if (op == FUTEX_WAIT)
1817 timeout = timespec_to_jiffies(&t) + 1;
1818 else {
1819 timeout = t.tv_sec;
1820 val2 = t.tv_nsec;
1821 }
1048 } 1822 }
1049 /* 1823 /*
1050 * requeue parameter in 'utime' if op == FUTEX_REQUEUE. 1824 * requeue parameter in 'utime' if op == FUTEX_REQUEUE.
1051 */ 1825 */
1052 if (op >= FUTEX_REQUEUE) 1826 if (op == FUTEX_REQUEUE || op == FUTEX_CMP_REQUEUE)
1053 val2 = (int) (unsigned long) utime; 1827 val2 = (u32) (unsigned long) utime;
1054 1828
1055 return do_futex((unsigned long)uaddr, op, val, timeout, 1829 return do_futex(uaddr, op, val, timeout, uaddr2, val2, val3);
1056 (unsigned long)uaddr2, val2, val3);
1057} 1830}
1058 1831
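Illustrating the timeout convention this hunk implements (a userspace sketch, not from the patch; helper names invented): FUTEX_WAIT takes a relative timespec that is converted to jiffies, while FUTEX_LOCK_PI passes an absolute CLOCK_REALTIME deadline through as sec/nsec.

#include <linux/futex.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <time.h>

static long wait_with_timeout(int *uaddr, int val)
{
        /* relative: give up after 100ms if nobody wakes us */
        struct timespec rel = { .tv_sec = 0, .tv_nsec = 100 * 1000 * 1000 };

        return syscall(SYS_futex, uaddr, FUTEX_WAIT, val, &rel, NULL, 0);
}

static long lock_pi_with_deadline(int *uaddr)
{
        /* absolute: deadline one second from now on CLOCK_REALTIME */
        struct timespec abs;

        clock_gettime(CLOCK_REALTIME, &abs);
        abs.tv_sec += 1;
        return syscall(SYS_futex, uaddr, FUTEX_LOCK_PI, 0, &abs, NULL, 0);
}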
1059static struct super_block * 1832static int futexfs_get_sb(struct file_system_type *fs_type,
1060futexfs_get_sb(struct file_system_type *fs_type, 1833 int flags, const char *dev_name, void *data,
1061 int flags, const char *dev_name, void *data) 1834 struct vfsmount *mnt)
1062{ 1835{
1063 return get_sb_pseudo(fs_type, "futex", NULL, 0xBAD1DEA); 1836 return get_sb_pseudo(fs_type, "futex", NULL, 0xBAD1DEA, mnt);
1064} 1837}
1065 1838
1066static struct file_system_type futex_fs_type = { 1839static struct file_system_type futex_fs_type = {
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index 1ab6a0ea3d14..d1d92b441fb7 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -129,16 +129,20 @@ asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val,
129 unsigned long timeout = MAX_SCHEDULE_TIMEOUT; 129 unsigned long timeout = MAX_SCHEDULE_TIMEOUT;
130 int val2 = 0; 130 int val2 = 0;
131 131
132 if (utime && (op == FUTEX_WAIT)) { 132 if (utime && (op == FUTEX_WAIT || op == FUTEX_LOCK_PI)) {
133 if (get_compat_timespec(&t, utime)) 133 if (get_compat_timespec(&t, utime))
134 return -EFAULT; 134 return -EFAULT;
135 if (!timespec_valid(&t)) 135 if (!timespec_valid(&t))
136 return -EINVAL; 136 return -EINVAL;
137 timeout = timespec_to_jiffies(&t) + 1; 137 if (op == FUTEX_WAIT)
138 timeout = timespec_to_jiffies(&t) + 1;
139 else {
140 timeout = t.tv_sec;
141 val2 = t.tv_nsec;
142 }
138 } 143 }
139 if (op >= FUTEX_REQUEUE) 144 if (op == FUTEX_REQUEUE || op == FUTEX_CMP_REQUEUE)
140 val2 = (int) (unsigned long) utime; 145 val2 = (int) (unsigned long) utime;
141 146
142 return do_futex((unsigned long)uaddr, op, val, timeout, 147 return do_futex(uaddr, op, val, timeout, uaddr2, val2, val3);
143 (unsigned long)uaddr2, val2, val3);
144} 148}
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 01fa2ae98a85..8d3dc29ef41a 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -98,7 +98,6 @@ static DEFINE_PER_CPU(struct hrtimer_base, hrtimer_bases[MAX_HRTIMER_BASES]) =
98 98
99/** 99/**
100 * ktime_get_ts - get the monotonic clock in timespec format 100 * ktime_get_ts - get the monotonic clock in timespec format
101 *
102 * @ts: pointer to timespec variable 101 * @ts: pointer to timespec variable
103 * 102 *
104 * The function calculates the monotonic clock from the realtime 103 * The function calculates the monotonic clock from the realtime
@@ -238,7 +237,6 @@ lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
238# ifndef CONFIG_KTIME_SCALAR 237# ifndef CONFIG_KTIME_SCALAR
239/** 238/**
240 * ktime_add_ns - Add a scalar nanoseconds value to a ktime_t variable 239 * ktime_add_ns - Add a scalar nanoseconds value to a ktime_t variable
241 *
242 * @kt: addend 240 * @kt: addend
243 * @nsec: the scalar nsec value to add 241 * @nsec: the scalar nsec value to add
244 * 242 *
@@ -299,7 +297,6 @@ void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
299 297
300/** 298/**
301 * hrtimer_forward - forward the timer expiry 299 * hrtimer_forward - forward the timer expiry
302 *
303 * @timer: hrtimer to forward 300 * @timer: hrtimer to forward
304 * @now: forward past this time 301 * @now: forward past this time
305 * @interval: the interval to forward 302 * @interval: the interval to forward
@@ -393,7 +390,7 @@ static void __remove_hrtimer(struct hrtimer *timer, struct hrtimer_base *base)
393 if (base->first == &timer->node) 390 if (base->first == &timer->node)
394 base->first = rb_next(&timer->node); 391 base->first = rb_next(&timer->node);
395 rb_erase(&timer->node, &base->active); 392 rb_erase(&timer->node, &base->active);
396 timer->node.rb_parent = HRTIMER_INACTIVE; 393 rb_set_parent(&timer->node, &timer->node);
397} 394}
398 395
399/* 396/*
@@ -411,7 +408,6 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_base *base)
411 408
412/** 409/**
413 * hrtimer_start - (re)start an relative timer on the current CPU 410 * hrtimer_start - (re)start an relative timer on the current CPU
414 *
415 * @timer: the timer to be added 411 * @timer: the timer to be added
416 * @tim: expiry time 412 * @tim: expiry time
417 * @mode: expiry mode: absolute (HRTIMER_ABS) or relative (HRTIMER_REL) 413 * @mode: expiry mode: absolute (HRTIMER_ABS) or relative (HRTIMER_REL)
@@ -460,14 +456,13 @@ EXPORT_SYMBOL_GPL(hrtimer_start);
460 456
461/** 457/**
462 * hrtimer_try_to_cancel - try to deactivate a timer 458 * hrtimer_try_to_cancel - try to deactivate a timer
463 *
464 * @timer: hrtimer to stop 459 * @timer: hrtimer to stop
465 * 460 *
466 * Returns: 461 * Returns:
467 * 0 when the timer was not active 462 * 0 when the timer was not active
468 * 1 when the timer was active 463 * 1 when the timer was active
469 * -1 when the timer is currently executing the callback function and 464 * -1 when the timer is currently executing the callback function and
470 * can not be stopped 465 * cannot be stopped
471 */ 466 */
472int hrtimer_try_to_cancel(struct hrtimer *timer) 467int hrtimer_try_to_cancel(struct hrtimer *timer)
473{ 468{
@@ -489,7 +484,6 @@ EXPORT_SYMBOL_GPL(hrtimer_try_to_cancel);
489 484
490/** 485/**
491 * hrtimer_cancel - cancel a timer and wait for the handler to finish. 486 * hrtimer_cancel - cancel a timer and wait for the handler to finish.
492 *
493 * @timer: the timer to be cancelled 487 * @timer: the timer to be cancelled
494 * 488 *
495 * Returns: 489 * Returns:
@@ -510,7 +504,6 @@ EXPORT_SYMBOL_GPL(hrtimer_cancel);
510 504
511/** 505/**
512 * hrtimer_get_remaining - get remaining time for the timer 506 * hrtimer_get_remaining - get remaining time for the timer
513 *
514 * @timer: the timer to read 507 * @timer: the timer to read
515 */ 508 */
516ktime_t hrtimer_get_remaining(const struct hrtimer *timer) 509ktime_t hrtimer_get_remaining(const struct hrtimer *timer)
@@ -564,7 +557,6 @@ ktime_t hrtimer_get_next_event(void)
564 557
565/** 558/**
566 * hrtimer_init - initialize a timer to the given clock 559 * hrtimer_init - initialize a timer to the given clock
567 *
568 * @timer: the timer to be initialized 560 * @timer: the timer to be initialized
569 * @clock_id: the clock to be used 561 * @clock_id: the clock to be used
570 * @mode: timer mode abs/rel 562 * @mode: timer mode abs/rel
@@ -576,19 +568,18 @@ void hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
576 568
577 memset(timer, 0, sizeof(struct hrtimer)); 569 memset(timer, 0, sizeof(struct hrtimer));
578 570
579 bases = per_cpu(hrtimer_bases, raw_smp_processor_id()); 571 bases = __raw_get_cpu_var(hrtimer_bases);
580 572
581 if (clock_id == CLOCK_REALTIME && mode != HRTIMER_ABS) 573 if (clock_id == CLOCK_REALTIME && mode != HRTIMER_ABS)
582 clock_id = CLOCK_MONOTONIC; 574 clock_id = CLOCK_MONOTONIC;
583 575
584 timer->base = &bases[clock_id]; 576 timer->base = &bases[clock_id];
585 timer->node.rb_parent = HRTIMER_INACTIVE; 577 rb_set_parent(&timer->node, &timer->node);
586} 578}
587EXPORT_SYMBOL_GPL(hrtimer_init); 579EXPORT_SYMBOL_GPL(hrtimer_init);
588 580
589/** 581/**
590 * hrtimer_get_res - get the timer resolution for a clock 582 * hrtimer_get_res - get the timer resolution for a clock
591 *
592 * @which_clock: which clock to query 583 * @which_clock: which clock to query
593 * @tp: pointer to timespec variable to store the resolution 584 * @tp: pointer to timespec variable to store the resolution
594 * 585 *
@@ -599,7 +590,7 @@ int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp)
599{ 590{
600 struct hrtimer_base *bases; 591 struct hrtimer_base *bases;
601 592
602 bases = per_cpu(hrtimer_bases, raw_smp_processor_id()); 593 bases = __raw_get_cpu_var(hrtimer_bases);
603 *tp = ktime_to_timespec(bases[which_clock].resolution); 594 *tp = ktime_to_timespec(bases[which_clock].resolution);
604 595
605 return 0; 596 return 0;
@@ -842,7 +833,7 @@ static void migrate_hrtimers(int cpu)
842} 833}
843#endif /* CONFIG_HOTPLUG_CPU */ 834#endif /* CONFIG_HOTPLUG_CPU */
844 835
845static int hrtimer_cpu_notify(struct notifier_block *self, 836static int __devinit hrtimer_cpu_notify(struct notifier_block *self,
846 unsigned long action, void *hcpu) 837 unsigned long action, void *hcpu)
847{ 838{
848 long cpu = (long)hcpu; 839 long cpu = (long)hcpu;
@@ -866,7 +857,7 @@ static int hrtimer_cpu_notify(struct notifier_block *self,
866 return NOTIFY_OK; 857 return NOTIFY_OK;
867} 858}
868 859
869static struct notifier_block hrtimers_nb = { 860static struct notifier_block __devinitdata hrtimers_nb = {
870 .notifier_call = hrtimer_cpu_notify, 861 .notifier_call = hrtimer_cpu_notify,
871}; 862};
872 863
diff --git a/kernel/intermodule.c b/kernel/intermodule.c
deleted file mode 100644
index 55b1e5b85db9..000000000000
--- a/kernel/intermodule.c
+++ /dev/null
@@ -1,184 +0,0 @@
1/* Deprecated, do not use. Moved from module.c to here. --RR */
2
3/* Written by Keith Owens <kaos@ocs.com.au> Oct 2000 */
4#include <linux/module.h>
5#include <linux/kmod.h>
6#include <linux/spinlock.h>
7#include <linux/list.h>
8#include <linux/slab.h>
9
10/* inter_module functions are always available, even when the kernel is
11 * compiled without modules. Consumers of inter_module_xxx routines
12 * will always work, even when both are built into the kernel, this
13 * approach removes lots of #ifdefs in mainline code.
14 */
15
16static struct list_head ime_list = LIST_HEAD_INIT(ime_list);
17static DEFINE_SPINLOCK(ime_lock);
18static int kmalloc_failed;
19
20struct inter_module_entry {
21 struct list_head list;
22 const char *im_name;
23 struct module *owner;
24 const void *userdata;
25};
26
27/**
28 * inter_module_register - register a new set of inter module data.
29 * @im_name: an arbitrary string to identify the data, must be unique
30 * @owner: module that is registering the data, always use THIS_MODULE
31 * @userdata: pointer to arbitrary userdata to be registered
32 *
33 * Description: Check that the im_name has not already been registered,
34 * complain if it has. For new data, add it to the inter_module_entry
35 * list.
36 */
37void inter_module_register(const char *im_name, struct module *owner, const void *userdata)
38{
39 struct list_head *tmp;
40 struct inter_module_entry *ime, *ime_new;
41
42 if (!(ime_new = kzalloc(sizeof(*ime), GFP_KERNEL))) {
43 /* Overloaded kernel, not fatal */
44 printk(KERN_ERR
45 "Aiee, inter_module_register: cannot kmalloc entry for '%s'\n",
46 im_name);
47 kmalloc_failed = 1;
48 return;
49 }
50 ime_new->im_name = im_name;
51 ime_new->owner = owner;
52 ime_new->userdata = userdata;
53
54 spin_lock(&ime_lock);
55 list_for_each(tmp, &ime_list) {
56 ime = list_entry(tmp, struct inter_module_entry, list);
57 if (strcmp(ime->im_name, im_name) == 0) {
58 spin_unlock(&ime_lock);
59 kfree(ime_new);
60 /* Program logic error, fatal */
61 printk(KERN_ERR "inter_module_register: duplicate im_name '%s'", im_name);
62 BUG();
63 }
64 }
65 list_add(&(ime_new->list), &ime_list);
66 spin_unlock(&ime_lock);
67}
68
69/**
70 * inter_module_unregister - unregister a set of inter module data.
71 * @im_name: an arbitrary string to identify the data, must be unique
72 *
73 * Description: Check that the im_name has been registered, complain if
74 * it has not. For existing data, remove it from the
75 * inter_module_entry list.
76 */
77void inter_module_unregister(const char *im_name)
78{
79 struct list_head *tmp;
80 struct inter_module_entry *ime;
81
82 spin_lock(&ime_lock);
83 list_for_each(tmp, &ime_list) {
84 ime = list_entry(tmp, struct inter_module_entry, list);
85 if (strcmp(ime->im_name, im_name) == 0) {
86 list_del(&(ime->list));
87 spin_unlock(&ime_lock);
88 kfree(ime);
89 return;
90 }
91 }
92 spin_unlock(&ime_lock);
93 if (kmalloc_failed) {
94 printk(KERN_ERR
95 "inter_module_unregister: no entry for '%s', "
96 "probably caused by previous kmalloc failure\n",
97 im_name);
98 return;
99 }
100 else {
101 /* Program logic error, fatal */
102 printk(KERN_ERR "inter_module_unregister: no entry for '%s'", im_name);
103 BUG();
104 }
105}
106
107/**
108 * inter_module_get - return arbitrary userdata from another module.
109 * @im_name: an arbitrary string to identify the data, must be unique
110 *
111 * Description: If the im_name has not been registered, return NULL.
112 * Try to increment the use count on the owning module, if that fails
113 * then return NULL. Otherwise return the userdata.
114 */
115static const void *inter_module_get(const char *im_name)
116{
117 struct list_head *tmp;
118 struct inter_module_entry *ime;
119 const void *result = NULL;
120
121 spin_lock(&ime_lock);
122 list_for_each(tmp, &ime_list) {
123 ime = list_entry(tmp, struct inter_module_entry, list);
124 if (strcmp(ime->im_name, im_name) == 0) {
125 if (try_module_get(ime->owner))
126 result = ime->userdata;
127 break;
128 }
129 }
130 spin_unlock(&ime_lock);
131 return(result);
132}
133
134/**
135 * inter_module_get_request - im get with automatic request_module.
136 * @im_name: an arbitrary string to identify the data, must be unique
137 * @modname: module that is expected to register im_name
138 *
139 * Description: If inter_module_get fails, do request_module then retry.
140 */
141const void *inter_module_get_request(const char *im_name, const char *modname)
142{
143 const void *result = inter_module_get(im_name);
144 if (!result) {
145 request_module("%s", modname);
146 result = inter_module_get(im_name);
147 }
148 return(result);
149}
150
151/**
152 * inter_module_put - release use of data from another module.
153 * @im_name: an arbitrary string to identify the data, must be unique
154 *
155 * Description: If the im_name has not been registered, complain,
156 * otherwise decrement the use count on the owning module.
157 */
158void inter_module_put(const char *im_name)
159{
160 struct list_head *tmp;
161 struct inter_module_entry *ime;
162
163 spin_lock(&ime_lock);
164 list_for_each(tmp, &ime_list) {
165 ime = list_entry(tmp, struct inter_module_entry, list);
166 if (strcmp(ime->im_name, im_name) == 0) {
167 if (ime->owner)
168 module_put(ime->owner);
169 spin_unlock(&ime_lock);
170 return;
171 }
172 }
173 spin_unlock(&ime_lock);
174 printk(KERN_ERR "inter_module_put: no entry for '%s'", im_name);
175 BUG();
176}
177
178EXPORT_SYMBOL(inter_module_register);
179EXPORT_SYMBOL(inter_module_unregister);
180EXPORT_SYMBOL(inter_module_get_request);
181EXPORT_SYMBOL(inter_module_put);
182
183MODULE_LICENSE("GPL");
184
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile
index 9f77f50d8143..1dab0ac3f797 100644
--- a/kernel/irq/Makefile
+++ b/kernel/irq/Makefile
@@ -1,5 +1,5 @@
1 1
2obj-y := handle.o manage.o spurious.o 2obj-y := handle.o manage.o spurious.o resend.o chip.o
3obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o 3obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o
4obj-$(CONFIG_PROC_FS) += proc.o 4obj-$(CONFIG_PROC_FS) += proc.o
5obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o 5obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o
diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c
index 3467097ca61a..533068cfb607 100644
--- a/kernel/irq/autoprobe.c
+++ b/kernel/irq/autoprobe.c
@@ -11,12 +11,14 @@
11#include <linux/interrupt.h> 11#include <linux/interrupt.h>
12#include <linux/delay.h> 12#include <linux/delay.h>
13 13
14#include "internals.h"
15
14/* 16/*
15 * Autodetection depends on the fact that any interrupt that 17 * Autodetection depends on the fact that any interrupt that
16 * comes in on to an unassigned handler will get stuck with 18 * comes in on to an unassigned handler will get stuck with
17 * "IRQ_WAITING" cleared and the interrupt disabled. 19 * "IRQ_WAITING" cleared and the interrupt disabled.
18 */ 20 */
19static DECLARE_MUTEX(probe_sem); 21static DEFINE_MUTEX(probing_active);
20 22
21/** 23/**
22 * probe_irq_on - begin an interrupt autodetect 24 * probe_irq_on - begin an interrupt autodetect
@@ -27,11 +29,11 @@ static DECLARE_MUTEX(probe_sem);
27 */ 29 */
28unsigned long probe_irq_on(void) 30unsigned long probe_irq_on(void)
29{ 31{
30 unsigned long val; 32 struct irq_desc *desc;
31 irq_desc_t *desc; 33 unsigned long mask;
32 unsigned int i; 34 unsigned int i;
33 35
34 down(&probe_sem); 36 mutex_lock(&probing_active);
35 /* 37 /*
36 * something may have generated an irq long ago and we want to 38 * something may have generated an irq long ago and we want to
37 * flush such a longstanding irq before considering it as spurious. 39 * flush such a longstanding irq before considering it as spurious.
@@ -40,8 +42,21 @@ unsigned long probe_irq_on(void)
40 desc = irq_desc + i; 42 desc = irq_desc + i;
41 43
42 spin_lock_irq(&desc->lock); 44 spin_lock_irq(&desc->lock);
43 if (!irq_desc[i].action) 45 if (!desc->action && !(desc->status & IRQ_NOPROBE)) {
44 irq_desc[i].handler->startup(i); 46 /*
47 * An old-style architecture might still have
48 * the handle_bad_irq handler there:
49 */
50 compat_irq_chip_set_default_handler(desc);
51
52 /*
53 * Some chips need to know about probing in
54 * progress:
55 */
56 if (desc->chip->set_type)
57 desc->chip->set_type(i, IRQ_TYPE_PROBE);
58 desc->chip->startup(i);
59 }
45 spin_unlock_irq(&desc->lock); 60 spin_unlock_irq(&desc->lock);
46 } 61 }
47 62
@@ -57,9 +72,9 @@ unsigned long probe_irq_on(void)
57 desc = irq_desc + i; 72 desc = irq_desc + i;
58 73
59 spin_lock_irq(&desc->lock); 74 spin_lock_irq(&desc->lock);
60 if (!desc->action) { 75 if (!desc->action && !(desc->status & IRQ_NOPROBE)) {
61 desc->status |= IRQ_AUTODETECT | IRQ_WAITING; 76 desc->status |= IRQ_AUTODETECT | IRQ_WAITING;
62 if (desc->handler->startup(i)) 77 if (desc->chip->startup(i))
63 desc->status |= IRQ_PENDING; 78 desc->status |= IRQ_PENDING;
64 } 79 }
65 spin_unlock_irq(&desc->lock); 80 spin_unlock_irq(&desc->lock);
@@ -73,11 +88,11 @@ unsigned long probe_irq_on(void)
73 /* 88 /*
74 * Now filter out any obviously spurious interrupts 89 * Now filter out any obviously spurious interrupts
75 */ 90 */
76 val = 0; 91 mask = 0;
77 for (i = 0; i < NR_IRQS; i++) { 92 for (i = 0; i < NR_IRQS; i++) {
78 irq_desc_t *desc = irq_desc + i;
79 unsigned int status; 93 unsigned int status;
80 94
95 desc = irq_desc + i;
81 spin_lock_irq(&desc->lock); 96 spin_lock_irq(&desc->lock);
82 status = desc->status; 97 status = desc->status;
83 98
@@ -85,17 +100,16 @@ unsigned long probe_irq_on(void)
85 /* It triggered already - consider it spurious. */ 100 /* It triggered already - consider it spurious. */
86 if (!(status & IRQ_WAITING)) { 101 if (!(status & IRQ_WAITING)) {
87 desc->status = status & ~IRQ_AUTODETECT; 102 desc->status = status & ~IRQ_AUTODETECT;
88 desc->handler->shutdown(i); 103 desc->chip->shutdown(i);
89 } else 104 } else
90 if (i < 32) 105 if (i < 32)
91 val |= 1 << i; 106 mask |= 1 << i;
92 } 107 }
93 spin_unlock_irq(&desc->lock); 108 spin_unlock_irq(&desc->lock);
94 } 109 }
95 110
96 return val; 111 return mask;
97} 112}
98
99EXPORT_SYMBOL(probe_irq_on); 113EXPORT_SYMBOL(probe_irq_on);
100 114
101/** 115/**
@@ -117,7 +131,7 @@ unsigned int probe_irq_mask(unsigned long val)
117 131
118 mask = 0; 132 mask = 0;
119 for (i = 0; i < NR_IRQS; i++) { 133 for (i = 0; i < NR_IRQS; i++) {
120 irq_desc_t *desc = irq_desc + i; 134 struct irq_desc *desc = irq_desc + i;
121 unsigned int status; 135 unsigned int status;
122 136
123 spin_lock_irq(&desc->lock); 137 spin_lock_irq(&desc->lock);
@@ -128,11 +142,11 @@ unsigned int probe_irq_mask(unsigned long val)
128 mask |= 1 << i; 142 mask |= 1 << i;
129 143
130 desc->status = status & ~IRQ_AUTODETECT; 144 desc->status = status & ~IRQ_AUTODETECT;
131 desc->handler->shutdown(i); 145 desc->chip->shutdown(i);
132 } 146 }
133 spin_unlock_irq(&desc->lock); 147 spin_unlock_irq(&desc->lock);
134 } 148 }
135 up(&probe_sem); 149 mutex_unlock(&probing_active);
136 150
137 return mask & val; 151 return mask & val;
138} 152}
@@ -160,7 +174,7 @@ int probe_irq_off(unsigned long val)
160 int i, irq_found = 0, nr_irqs = 0; 174 int i, irq_found = 0, nr_irqs = 0;
161 175
162 for (i = 0; i < NR_IRQS; i++) { 176 for (i = 0; i < NR_IRQS; i++) {
163 irq_desc_t *desc = irq_desc + i; 177 struct irq_desc *desc = irq_desc + i;
164 unsigned int status; 178 unsigned int status;
165 179
166 spin_lock_irq(&desc->lock); 180 spin_lock_irq(&desc->lock);
@@ -173,16 +187,16 @@ int probe_irq_off(unsigned long val)
173 nr_irqs++; 187 nr_irqs++;
174 } 188 }
175 desc->status = status & ~IRQ_AUTODETECT; 189 desc->status = status & ~IRQ_AUTODETECT;
176 desc->handler->shutdown(i); 190 desc->chip->shutdown(i);
177 } 191 }
178 spin_unlock_irq(&desc->lock); 192 spin_unlock_irq(&desc->lock);
179 } 193 }
180 up(&probe_sem); 194 mutex_unlock(&probing_active);
181 195
182 if (nr_irqs > 1) 196 if (nr_irqs > 1)
183 irq_found = -irq_found; 197 irq_found = -irq_found;
198
184 return irq_found; 199 return irq_found;
185} 200}
186
187EXPORT_SYMBOL(probe_irq_off); 201EXPORT_SYMBOL(probe_irq_off);
188 202
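For reference, the intended use of the autoprobe API touched above, as a hedged driver-side sketch (trigger_device_interrupt() is an invented placeholder for whatever makes the card raise its interrupt):

#include <linux/interrupt.h>
#include <linux/delay.h>

extern void trigger_device_interrupt(void);     /* placeholder, not a real API */

static int detect_board_irq(void)
{
        unsigned long mask;
        int irq;

        mask = probe_irq_on();          /* start autodetection, unmask free IRQs */
        trigger_device_interrupt();     /* make the hardware fire */
        mdelay(10);                     /* give the interrupt time to arrive */
        irq = probe_irq_off(mask);      /* >0: the IRQ, 0: none, <0: several hit */

        return irq;
}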
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
new file mode 100644
index 000000000000..54105bdfe20d
--- /dev/null
+++ b/kernel/irq/chip.c
@@ -0,0 +1,534 @@
1/*
2 * linux/kernel/irq/chip.c
3 *
4 * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar
5 * Copyright (C) 2005-2006, Thomas Gleixner, Russell King
6 *
7 * This file contains the core interrupt handling code, for irq-chip
8 * based architectures.
9 *
10 * Detailed information is available in Documentation/DocBook/genericirq
11 */
12
13#include <linux/irq.h>
14#include <linux/module.h>
15#include <linux/interrupt.h>
16#include <linux/kernel_stat.h>
17
18#include "internals.h"
19
20/**
21 * set_irq_chip - set the irq chip for an irq
22 * @irq: irq number
23 * @chip: pointer to irq chip description structure
24 */
25int set_irq_chip(unsigned int irq, struct irq_chip *chip)
26{
27 struct irq_desc *desc;
28 unsigned long flags;
29
30 if (irq >= NR_IRQS) {
31 printk(KERN_ERR "Trying to install chip for IRQ%d\n", irq);
32 WARN_ON(1);
33 return -EINVAL;
34 }
35
36 if (!chip)
37 chip = &no_irq_chip;
38
39 desc = irq_desc + irq;
40 spin_lock_irqsave(&desc->lock, flags);
41 irq_chip_set_defaults(chip);
42 desc->chip = chip;
43 /*
44 * For compatibility only:
45 */
46 desc->chip = chip;
47 spin_unlock_irqrestore(&desc->lock, flags);
48
49 return 0;
50}
51EXPORT_SYMBOL(set_irq_chip);
52
53/**
54 * set_irq_type - set the irq type for an irq
55 * @irq: irq number
56 * @type: interrupt type - see include/linux/interrupt.h
57 */
58int set_irq_type(unsigned int irq, unsigned int type)
59{
60 struct irq_desc *desc;
61 unsigned long flags;
62 int ret = -ENXIO;
63
64 if (irq >= NR_IRQS) {
65 printk(KERN_ERR "Trying to set irq type for IRQ%d\n", irq);
66 return -ENODEV;
67 }
68
69 desc = irq_desc + irq;
70 if (desc->chip->set_type) {
71 spin_lock_irqsave(&desc->lock, flags);
72 ret = desc->chip->set_type(irq, type);
73 spin_unlock_irqrestore(&desc->lock, flags);
74 }
75 return ret;
76}
77EXPORT_SYMBOL(set_irq_type);
78
79/**
80 * set_irq_data - set irq type data for an irq
81 * @irq: Interrupt number
82 * @data: Pointer to interrupt specific data
83 *
84 * Set the hardware irq controller data for an irq
85 */
86int set_irq_data(unsigned int irq, void *data)
87{
88 struct irq_desc *desc;
89 unsigned long flags;
90
91 if (irq >= NR_IRQS) {
92 printk(KERN_ERR
93 "Trying to install controller data for IRQ%d\n", irq);
94 return -EINVAL;
95 }
96
97 desc = irq_desc + irq;
98 spin_lock_irqsave(&desc->lock, flags);
99 desc->handler_data = data;
100 spin_unlock_irqrestore(&desc->lock, flags);
101 return 0;
102}
103EXPORT_SYMBOL(set_irq_data);
104
105/**
106 * set_irq_chip_data - set irq chip data for an irq
107 * @irq: Interrupt number
108 * @data: Pointer to chip specific data
109 *
110 * Set the hardware irq chip data for an irq
111 */
112int set_irq_chip_data(unsigned int irq, void *data)
113{
114 struct irq_desc *desc = irq_desc + irq;
115 unsigned long flags;
116
117 if (irq >= NR_IRQS || !desc->chip) {
118 printk(KERN_ERR "BUG: bad set_irq_chip_data(IRQ#%d)\n", irq);
119 return -EINVAL;
120 }
121
122 spin_lock_irqsave(&desc->lock, flags);
123 desc->chip_data = data;
124 spin_unlock_irqrestore(&desc->lock, flags);
125
126 return 0;
127}
128EXPORT_SYMBOL(set_irq_chip_data);
129
130/*
131 * default enable function
132 */
133static void default_enable(unsigned int irq)
134{
135 struct irq_desc *desc = irq_desc + irq;
136
137 desc->chip->unmask(irq);
138 desc->status &= ~IRQ_MASKED;
139}
140
141/*
142 * default disable function
143 */
144static void default_disable(unsigned int irq)
145{
146 struct irq_desc *desc = irq_desc + irq;
147
148 if (!(desc->status & IRQ_DELAYED_DISABLE))
149 irq_desc[irq].chip->mask(irq);
150}
151
152/*
153 * default startup function
154 */
155static unsigned int default_startup(unsigned int irq)
156{
157 irq_desc[irq].chip->enable(irq);
158
159 return 0;
160}
161
162/*
163 * Fixup enable/disable function pointers
164 */
165void irq_chip_set_defaults(struct irq_chip *chip)
166{
167 if (!chip->enable)
168 chip->enable = default_enable;
169 if (!chip->disable)
170 chip->disable = default_disable;
171 if (!chip->startup)
172 chip->startup = default_startup;
173 if (!chip->shutdown)
174 chip->shutdown = chip->disable;
175 if (!chip->name)
176 chip->name = chip->typename;
177}
178
179static inline void mask_ack_irq(struct irq_desc *desc, int irq)
180{
181 if (desc->chip->mask_ack)
182 desc->chip->mask_ack(irq);
183 else {
184 desc->chip->mask(irq);
185 desc->chip->ack(irq);
186 }
187}
188
189/**
190 * handle_simple_irq - Simple and software-decoded IRQs.
191 * @irq: the interrupt number
192 * @desc: the interrupt description structure for this irq
193 * @regs: pointer to a register structure
194 *
195 * Simple interrupts are either sent from a demultiplexing interrupt
196 * handler or come from hardware, where no interrupt hardware control
197 * is necessary.
198 *
199 * Note: The caller is expected to handle the ack, clear, mask and
200 * unmask issues if necessary.
201 */
202void fastcall
203handle_simple_irq(unsigned int irq, struct irq_desc *desc, struct pt_regs *regs)
204{
205 struct irqaction *action;
206 irqreturn_t action_ret;
207 const unsigned int cpu = smp_processor_id();
208
209 spin_lock(&desc->lock);
210
211 if (unlikely(desc->status & IRQ_INPROGRESS))
212 goto out_unlock;
213 desc->status &= ~(IRQ_REPLAY | IRQ_WAITING);
214 kstat_cpu(cpu).irqs[irq]++;
215
216 action = desc->action;
217 if (unlikely(!action || (desc->status & IRQ_DISABLED)))
218 goto out_unlock;
219
220 desc->status |= IRQ_INPROGRESS;
221 spin_unlock(&desc->lock);
222
223 action_ret = handle_IRQ_event(irq, regs, action);
224 if (!noirqdebug)
225 note_interrupt(irq, desc, action_ret, regs);
226
227 spin_lock(&desc->lock);
228 desc->status &= ~IRQ_INPROGRESS;
229out_unlock:
230 spin_unlock(&desc->lock);
231}
232
233/**
234 * handle_level_irq - Level type irq handler
235 * @irq: the interrupt number
236 * @desc: the interrupt description structure for this irq
237 * @regs: pointer to a register structure
238 *
239 * Level type interrupts are active as long as the hardware line has
240 * the active level. This may require masking the interrupt and unmasking
241 * it after the associated handler has acknowledged the device, so that the
242 * interrupt line goes back to inactive.
243 */
244void fastcall
245handle_level_irq(unsigned int irq, struct irq_desc *desc, struct pt_regs *regs)
246{
247 unsigned int cpu = smp_processor_id();
248 struct irqaction *action;
249 irqreturn_t action_ret;
250
251 spin_lock(&desc->lock);
252 mask_ack_irq(desc, irq);
253
254 if (unlikely(desc->status & IRQ_INPROGRESS))
255 goto out;
256 desc->status &= ~(IRQ_REPLAY | IRQ_WAITING);
257 kstat_cpu(cpu).irqs[irq]++;
258
259 /*
260 * If it's disabled or no action is available,
261 * keep it masked and get out of here
262 */
263 action = desc->action;
264 if (unlikely(!action || (desc->status & IRQ_DISABLED)))
265 goto out;
266
267 desc->status |= IRQ_INPROGRESS;
268 spin_unlock(&desc->lock);
269
270 action_ret = handle_IRQ_event(irq, regs, action);
271 if (!noirqdebug)
272 note_interrupt(irq, desc, action_ret, regs);
273
274 spin_lock(&desc->lock);
275 desc->status &= ~IRQ_INPROGRESS;
276out:
277 if (!(desc->status & IRQ_DISABLED) && desc->chip->unmask)
278 desc->chip->unmask(irq);
279 spin_unlock(&desc->lock);
280}
281
282/**
283 * handle_fasteoi_irq - irq handler for transparent controllers
284 * @irq: the interrupt number
285 * @desc: the interrupt description structure for this irq
286 * @regs: pointer to a register structure
287 *
288 * Only a single callback will be issued to the chip: an ->eoi()
289 * call when the interrupt has been serviced. This enables support
290 * for modern forms of interrupt handlers, which handle the flow
291 * details in hardware, transparently.
292 */
293void fastcall
294handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc,
295 struct pt_regs *regs)
296{
297 unsigned int cpu = smp_processor_id();
298 struct irqaction *action;
299 irqreturn_t action_ret;
300
301 spin_lock(&desc->lock);
302
303 if (unlikely(desc->status & IRQ_INPROGRESS))
304 goto out;
305
306 desc->status &= ~(IRQ_REPLAY | IRQ_WAITING);
307 kstat_cpu(cpu).irqs[irq]++;
308
309 /*
310 * If it's disabled or no action is available,
311 * keep it masked and get out of here
312 */
313 action = desc->action;
314 if (unlikely(!action || (desc->status & IRQ_DISABLED))) {
315 desc->status |= IRQ_PENDING;
316 goto out;
317 }
318
319 desc->status |= IRQ_INPROGRESS;
320 desc->status &= ~IRQ_PENDING;
321 spin_unlock(&desc->lock);
322
323 action_ret = handle_IRQ_event(irq, regs, action);
324 if (!noirqdebug)
325 note_interrupt(irq, desc, action_ret, regs);
326
327 spin_lock(&desc->lock);
328 desc->status &= ~IRQ_INPROGRESS;
329out:
330 desc->chip->eoi(irq);
331
332 spin_unlock(&desc->lock);
333}
334
335/**
336 * handle_edge_irq - edge type IRQ handler
337 * @irq: the interrupt number
338 * @desc: the interrupt description structure for this irq
339 * @regs: pointer to a register structure
340 *
341 * The interrupt occurs on the falling and/or rising edge of a hardware
342 * signal. The occurrence is latched into the irq controller hardware
343 * and must be acked in order to be re-enabled. After the ack another
344 * interrupt can happen on the same source even before the first one
345 * is handled by the associated event handler. If this happens it
346 * might be necessary to disable (mask) the interrupt depending on the
347 * controller hardware. This requires re-enabling the interrupt inside
348 * the loop which handles the interrupts that have arrived while
349 * the handler was running. Once all pending interrupts are handled,
350 * the loop is left.
351 */
352void fastcall
353handle_edge_irq(unsigned int irq, struct irq_desc *desc, struct pt_regs *regs)
354{
355 const unsigned int cpu = smp_processor_id();
356
357 spin_lock(&desc->lock);
358
359 desc->status &= ~(IRQ_REPLAY | IRQ_WAITING);
360
361 /*
362 * If we're currently running this IRQ, or it's disabled,
363 * we shouldn't process the IRQ. Mark it pending, handle
364 * the necessary masking and go out
365 */
366 if (unlikely((desc->status & (IRQ_INPROGRESS | IRQ_DISABLED)) ||
367 !desc->action)) {
368 desc->status |= (IRQ_PENDING | IRQ_MASKED);
369 mask_ack_irq(desc, irq);
370 goto out_unlock;
371 }
372
373 kstat_cpu(cpu).irqs[irq]++;
374
375 /* Start handling the irq */
376 desc->chip->ack(irq);
377
378 /* Mark the IRQ currently in progress.*/
379 desc->status |= IRQ_INPROGRESS;
380
381 do {
382 struct irqaction *action = desc->action;
383 irqreturn_t action_ret;
384
385 if (unlikely(!action)) {
386 desc->chip->mask(irq);
387 goto out_unlock;
388 }
389
390 /*
391 * When another irq arrived while we were handling
392 * one, we could have masked the irq.
393 * Re-enable it, if it was not disabled in the meantime.
394 */
395 if (unlikely((desc->status &
396 (IRQ_PENDING | IRQ_MASKED | IRQ_DISABLED)) ==
397 (IRQ_PENDING | IRQ_MASKED))) {
398 desc->chip->unmask(irq);
399 desc->status &= ~IRQ_MASKED;
400 }
401
402 desc->status &= ~IRQ_PENDING;
403 spin_unlock(&desc->lock);
404 action_ret = handle_IRQ_event(irq, regs, action);
405 if (!noirqdebug)
406 note_interrupt(irq, desc, action_ret, regs);
407 spin_lock(&desc->lock);
408
409 } while ((desc->status & (IRQ_PENDING | IRQ_DISABLED)) == IRQ_PENDING);
410
411 desc->status &= ~IRQ_INPROGRESS;
412out_unlock:
413 spin_unlock(&desc->lock);
414}
415
416#ifdef CONFIG_SMP
417/**
418 * handle_percpu_irq - Per CPU local irq handler
419 * @irq: the interrupt number
420 * @desc: the interrupt description structure for this irq
421 * @regs: pointer to a register structure
422 *
423 * Per CPU interrupts on SMP machines without locking requirements
424 */
425void fastcall
426handle_percpu_irq(unsigned int irq, struct irq_desc *desc, struct pt_regs *regs)
427{
428 irqreturn_t action_ret;
429
430 kstat_this_cpu.irqs[irq]++;
431
432 if (desc->chip->ack)
433 desc->chip->ack(irq);
434
435 action_ret = handle_IRQ_event(irq, regs, desc->action);
436 if (!noirqdebug)
437 note_interrupt(irq, desc, action_ret, regs);
438
439 if (desc->chip->eoi)
440 desc->chip->eoi(irq);
441}
442
443#endif /* CONFIG_SMP */
444
445void
446__set_irq_handler(unsigned int irq,
447 void fastcall (*handle)(unsigned int, irq_desc_t *,
448 struct pt_regs *),
449 int is_chained)
450{
451 struct irq_desc *desc;
452 unsigned long flags;
453
454 if (irq >= NR_IRQS) {
455 printk(KERN_ERR
456 "Trying to install type control for IRQ%d\n", irq);
457 return;
458 }
459
460 desc = irq_desc + irq;
461
462 if (!handle)
463 handle = handle_bad_irq;
464
465 if (desc->chip == &no_irq_chip) {
466 printk(KERN_WARNING "Trying to install %sinterrupt handler "
467 "for IRQ%d\n", is_chained ? "chained " : " ", irq);
468 /*
469 * Some ARM implementations install a handler for really dumb
470 * interrupt hardware without setting an irq_chip. This worked
471 * with the ARM no_irq_chip but the check in setup_irq would
472 * prevent us from setting up the interrupt at all. Switch it to
473 * dummy_irq_chip for easy transition.
474 */
475 desc->chip = &dummy_irq_chip;
476 }
477
478 spin_lock_irqsave(&desc->lock, flags);
479
480 /* Uninstall? */
481 if (handle == handle_bad_irq) {
482 if (desc->chip != &no_irq_chip) {
483 desc->chip->mask(irq);
484 desc->chip->ack(irq);
485 }
486 desc->status |= IRQ_DISABLED;
487 desc->depth = 1;
488 }
489 desc->handle_irq = handle;
490
491 if (handle != handle_bad_irq && is_chained) {
492 desc->status &= ~IRQ_DISABLED;
493 desc->status |= IRQ_NOREQUEST | IRQ_NOPROBE;
494 desc->depth = 0;
495 desc->chip->unmask(irq);
496 }
497 spin_unlock_irqrestore(&desc->lock, flags);
498}
499
500void
501set_irq_chip_and_handler(unsigned int irq, struct irq_chip *chip,
502 void fastcall (*handle)(unsigned int,
503 struct irq_desc *,
504 struct pt_regs *))
505{
506 set_irq_chip(irq, chip);
507 __set_irq_handler(irq, handle, 0);
508}
509
510/*
511 * Get a descriptive string for the highlevel handler, for
512 * /proc/interrupts output:
513 */
514const char *
515handle_irq_name(void fastcall (*handle)(unsigned int, struct irq_desc *,
516 struct pt_regs *))
517{
518 if (handle == handle_level_irq)
519 return "level ";
520 if (handle == handle_fasteoi_irq)
521 return "fasteoi";
522 if (handle == handle_edge_irq)
523 return "edge ";
524 if (handle == handle_simple_irq)
525 return "simple ";
526#ifdef CONFIG_SMP
527 if (handle == handle_percpu_irq)
528 return "percpu ";
529#endif
530 if (handle == handle_bad_irq)
531 return "bad ";
532
533 return NULL;
534}
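
The chip.c code above is the heart of the new flow-handler model: an
architecture registers a struct irq_chip for its controller and picks one
of the generic flow handlers (handle_level_irq, handle_edge_irq,
handle_fasteoi_irq, handle_percpu_irq) per interrupt line. A minimal
sketch of how platform code might wire this up follows; the controller
name and the my_ctrl_* register helpers are hypothetical, only
set_irq_chip_and_handler() and the handler names come from this patch.

#include <linux/irq.h>

/* Hypothetical register accessors for an imaginary level-triggered controller */
static void my_ctrl_mask(unsigned int irq)   { /* set the line's mask bit */ }
static void my_ctrl_unmask(unsigned int irq) { /* clear the line's mask bit */ }
static void my_ctrl_ack(unsigned int irq)    { /* clear the latched status */ }

static struct irq_chip my_ctrl_chip = {
	.name	= "MYCTRL",
	.ack	= my_ctrl_ack,
	.mask	= my_ctrl_mask,
	.unmask	= my_ctrl_unmask,
};

static void my_ctrl_init_irqs(unsigned int first, unsigned int nr)
{
	unsigned int irq;

	/* Level-triggered lines: handle_level_irq masks/acks around the action run */
	for (irq = first; irq < first + nr; irq++)
		set_irq_chip_and_handler(irq, &my_ctrl_chip, handle_level_irq);
}

Edge-triggered lines would pass handle_edge_irq instead, and a controller
with a single end-of-interrupt register would supply an ->eoi() callback
and use handle_fasteoi_irq.
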
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 51df337b37db..aeb6e391276c 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -1,9 +1,13 @@
1/* 1/*
2 * linux/kernel/irq/handle.c 2 * linux/kernel/irq/handle.c
3 * 3 *
4 * Copyright (C) 1992, 1998-2004 Linus Torvalds, Ingo Molnar 4 * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar
5 * Copyright (C) 2005-2006, Thomas Gleixner, Russell King
5 * 6 *
6 * This file contains the core interrupt handling code. 7 * This file contains the core interrupt handling code.
8 *
9 * Detailed information is available in Documentation/DocBook/genericirq
10 *
7 */ 11 */
8 12
9#include <linux/irq.h> 13#include <linux/irq.h>
@@ -14,11 +18,22 @@
14 18
15#include "internals.h" 19#include "internals.h"
16 20
21/**
22 * handle_bad_irq - handle spurious and unhandled irqs
23 */
24void fastcall
25handle_bad_irq(unsigned int irq, struct irq_desc *desc, struct pt_regs *regs)
26{
27 print_irq_desc(irq, desc);
28 kstat_this_cpu.irqs[irq]++;
29 ack_bad_irq(irq);
30}
31
17/* 32/*
18 * Linux has a controller-independent interrupt architecture. 33 * Linux has a controller-independent interrupt architecture.
19 * Every controller has a 'controller-template', that is used 34 * Every controller has a 'controller-template', that is used
20 * by the main code to do the right thing. Each driver-visible 35 * by the main code to do the right thing. Each driver-visible
21 * interrupt source is transparently wired to the apropriate 36 * interrupt source is transparently wired to the appropriate
22 * controller. Thus drivers need not be aware of the 37 * controller. Thus drivers need not be aware of the
23 * interrupt-controller. 38 * interrupt-controller.
24 * 39 *
@@ -28,41 +43,68 @@
28 * 43 *
29 * Controller mappings for all interrupt sources: 44 * Controller mappings for all interrupt sources:
30 */ 45 */
31irq_desc_t irq_desc[NR_IRQS] __cacheline_aligned = { 46struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned = {
32 [0 ... NR_IRQS-1] = { 47 [0 ... NR_IRQS-1] = {
33 .status = IRQ_DISABLED, 48 .status = IRQ_DISABLED,
34 .handler = &no_irq_type, 49 .chip = &no_irq_chip,
35 .lock = SPIN_LOCK_UNLOCKED 50 .handle_irq = handle_bad_irq,
51 .depth = 1,
52 .lock = SPIN_LOCK_UNLOCKED,
53#ifdef CONFIG_SMP
54 .affinity = CPU_MASK_ALL
55#endif
36 } 56 }
37}; 57};
38 58
39/* 59/*
40 * Generic 'no controller' code 60 * What should we do if we get a hw irq event on an illegal vector?
61 * Each architecture has to answer this itself.
41 */ 62 */
42static void end_none(unsigned int irq) { } 63static void ack_bad(unsigned int irq)
43static void enable_none(unsigned int irq) { }
44static void disable_none(unsigned int irq) { }
45static void shutdown_none(unsigned int irq) { }
46static unsigned int startup_none(unsigned int irq) { return 0; }
47
48static void ack_none(unsigned int irq)
49{ 64{
50 /* 65 print_irq_desc(irq, irq_desc + irq);
51 * 'what should we do if we get a hw irq event on an illegal vector'.
52 * each architecture has to answer this themself.
53 */
54 ack_bad_irq(irq); 66 ack_bad_irq(irq);
55} 67}
56 68
57struct hw_interrupt_type no_irq_type = { 69/*
58 .typename = "none", 70 * NOP functions
59 .startup = startup_none, 71 */
60 .shutdown = shutdown_none, 72static void noop(unsigned int irq)
61 .enable = enable_none, 73{
62 .disable = disable_none, 74}
63 .ack = ack_none, 75
64 .end = end_none, 76static unsigned int noop_ret(unsigned int irq)
65 .set_affinity = NULL 77{
78 return 0;
79}
80
81/*
82 * Generic no controller implementation
83 */
84struct irq_chip no_irq_chip = {
85 .name = "none",
86 .startup = noop_ret,
87 .shutdown = noop,
88 .enable = noop,
89 .disable = noop,
90 .ack = ack_bad,
91 .end = noop,
92};
93
94/*
95 * Generic dummy implementation which can be used for
96 * real dumb interrupt sources
97 */
98struct irq_chip dummy_irq_chip = {
99 .name = "dummy",
100 .startup = noop_ret,
101 .shutdown = noop,
102 .enable = noop,
103 .disable = noop,
104 .ack = noop,
105 .mask = noop,
106 .unmask = noop,
107 .end = noop,
66}; 108};
67 109
68/* 110/*
@@ -73,15 +115,23 @@ irqreturn_t no_action(int cpl, void *dev_id, struct pt_regs *regs)
73 return IRQ_NONE; 115 return IRQ_NONE;
74} 116}
75 117
76/* 118/**
77 * Have got an event to handle: 119 * handle_IRQ_event - irq action chain handler
120 * @irq: the interrupt number
121 * @regs: pointer to a register structure
122 * @action: the interrupt action chain for this irq
123 *
124 * Handles the action chain of an irq event
78 */ 125 */
79fastcall int handle_IRQ_event(unsigned int irq, struct pt_regs *regs, 126irqreturn_t handle_IRQ_event(unsigned int irq, struct pt_regs *regs,
80 struct irqaction *action) 127 struct irqaction *action)
81{ 128{
82 int ret, retval = 0, status = 0; 129 irqreturn_t ret, retval = IRQ_NONE;
130 unsigned int status = 0;
131
132 handle_dynamic_tick(action);
83 133
84 if (!(action->flags & SA_INTERRUPT)) 134 if (!(action->flags & IRQF_DISABLED))
85 local_irq_enable(); 135 local_irq_enable();
86 136
87 do { 137 do {
@@ -92,22 +142,29 @@ fastcall int handle_IRQ_event(unsigned int irq, struct pt_regs *regs,
92 action = action->next; 142 action = action->next;
93 } while (action); 143 } while (action);
94 144
95 if (status & SA_SAMPLE_RANDOM) 145 if (status & IRQF_SAMPLE_RANDOM)
96 add_interrupt_randomness(irq); 146 add_interrupt_randomness(irq);
97 local_irq_disable(); 147 local_irq_disable();
98 148
99 return retval; 149 return retval;
100} 150}
101 151
102/* 152/**
103 * do_IRQ handles all normal device IRQ's (the special 153 * __do_IRQ - original all in one highlevel IRQ handler
154 * @irq: the interrupt number
155 * @regs: pointer to a register structure
156 *
157 * __do_IRQ handles all normal device IRQ's (the special
104 * SMP cross-CPU interrupts have their own specific 158 * SMP cross-CPU interrupts have their own specific
105 * handlers). 159 * handlers).
160 *
161 * This is the original x86 implementation which is used for every
162 * interrupt type.
106 */ 163 */
107fastcall unsigned int __do_IRQ(unsigned int irq, struct pt_regs *regs) 164fastcall unsigned int __do_IRQ(unsigned int irq, struct pt_regs *regs)
108{ 165{
109 irq_desc_t *desc = irq_desc + irq; 166 struct irq_desc *desc = irq_desc + irq;
110 struct irqaction * action; 167 struct irqaction *action;
111 unsigned int status; 168 unsigned int status;
112 169
113 kstat_this_cpu.irqs[irq]++; 170 kstat_this_cpu.irqs[irq]++;
@@ -117,16 +174,16 @@ fastcall unsigned int __do_IRQ(unsigned int irq, struct pt_regs *regs)
117 /* 174 /*
118 * No locking required for CPU-local interrupts: 175 * No locking required for CPU-local interrupts:
119 */ 176 */
120 if (desc->handler->ack) 177 if (desc->chip->ack)
121 desc->handler->ack(irq); 178 desc->chip->ack(irq);
122 action_ret = handle_IRQ_event(irq, regs, desc->action); 179 action_ret = handle_IRQ_event(irq, regs, desc->action);
123 desc->handler->end(irq); 180 desc->chip->end(irq);
124 return 1; 181 return 1;
125 } 182 }
126 183
127 spin_lock(&desc->lock); 184 spin_lock(&desc->lock);
128 if (desc->handler->ack) 185 if (desc->chip->ack)
129 desc->handler->ack(irq); 186 desc->chip->ack(irq);
130 /* 187 /*
131 * REPLAY is when Linux resends an IRQ that was dropped earlier 188 * REPLAY is when Linux resends an IRQ that was dropped earlier
132 * WAITING is used by probe to mark irqs that are being tested 189 * WAITING is used by probe to mark irqs that are being tested
@@ -186,7 +243,7 @@ out:
186 * The ->end() handler has to deal with interrupts which got 243 * The ->end() handler has to deal with interrupts which got
187 * disabled while the handler was running. 244 * disabled while the handler was running.
188 */ 245 */
189 desc->handler->end(irq); 246 desc->chip->end(irq);
190 spin_unlock(&desc->lock); 247 spin_unlock(&desc->lock);
191 248
192 return 1; 249 return 1;
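
handle_IRQ_event() above ORs together the return values of every handler
on the action chain, and note_interrupt() (in spurious.c, further down)
uses that result to spot stuck or unhandled lines. A handler on a shared
line therefore has to report honestly whether its own device raised the
interrupt. A minimal sketch, assuming a hypothetical device with a status
register (my_dev, MY_DEV_STATUS and MY_DEV_IRQ_PENDING are not from this
patch):

static irqreturn_t my_dev_interrupt(int irq, void *dev_id, struct pt_regs *regs)
{
	struct my_dev *dev = dev_id;			/* hypothetical driver state */
	u32 status = readl(dev->regs + MY_DEV_STATUS);	/* assumed status register */

	if (!(status & MY_DEV_IRQ_PENDING))
		return IRQ_NONE;	/* not ours; lets the spurious detector do its job */

	writel(status, dev->regs + MY_DEV_STATUS);	/* ack at the device level */
	/* ... service the device ... */
	return IRQ_HANDLED;
}
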
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 46feba630266..08a849a22447 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -4,6 +4,12 @@
4 4
5extern int noirqdebug; 5extern int noirqdebug;
6 6
7/* Set default functions for irq_chip structures: */
8extern void irq_chip_set_defaults(struct irq_chip *chip);
9
10/* Set default handler: */
11extern void compat_irq_chip_set_default_handler(struct irq_desc *desc);
12
7#ifdef CONFIG_PROC_FS 13#ifdef CONFIG_PROC_FS
8extern void register_irq_proc(unsigned int irq); 14extern void register_irq_proc(unsigned int irq);
9extern void register_handler_proc(unsigned int irq, struct irqaction *action); 15extern void register_handler_proc(unsigned int irq, struct irqaction *action);
@@ -16,3 +22,43 @@ static inline void unregister_handler_proc(unsigned int irq,
16 struct irqaction *action) { } 22 struct irqaction *action) { }
17#endif 23#endif
18 24
25/*
26 * Debugging printout:
27 */
28
29#include <linux/kallsyms.h>
30
31#define P(f) if (desc->status & f) printk("%14s set\n", #f)
32
33static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc)
34{
35 printk("irq %d, desc: %p, depth: %d, count: %d, unhandled: %d\n",
36 irq, desc, desc->depth, desc->irq_count, desc->irqs_unhandled);
37 printk("->handle_irq(): %p, ", desc->handle_irq);
38 print_symbol("%s\n", (unsigned long)desc->handle_irq);
39 printk("->chip(): %p, ", desc->chip);
40 print_symbol("%s\n", (unsigned long)desc->chip);
41 printk("->action(): %p\n", desc->action);
42 if (desc->action) {
43 printk("->action->handler(): %p, ", desc->action->handler);
44 print_symbol("%s\n", (unsigned long)desc->action->handler);
45 }
46
47 P(IRQ_INPROGRESS);
48 P(IRQ_DISABLED);
49 P(IRQ_PENDING);
50 P(IRQ_REPLAY);
51 P(IRQ_AUTODETECT);
52 P(IRQ_WAITING);
53 P(IRQ_LEVEL);
54 P(IRQ_MASKED);
55#ifdef CONFIG_IRQ_PER_CPU
56 P(IRQ_PER_CPU);
57#endif
58 P(IRQ_NOPROBE);
59 P(IRQ_NOREQUEST);
60 P(IRQ_NOAUTOEN);
61}
62
63#undef P
64
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 1279e3499534..c911c6ec4dd6 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -1,12 +1,12 @@
1/* 1/*
2 * linux/kernel/irq/manage.c 2 * linux/kernel/irq/manage.c
3 * 3 *
4 * Copyright (C) 1992, 1998-2004 Linus Torvalds, Ingo Molnar 4 * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar
5 * Copyright (C) 2005-2006 Thomas Gleixner
5 * 6 *
6 * This file contains driver APIs to the irq subsystem. 7 * This file contains driver APIs to the irq subsystem.
7 */ 8 */
8 9
9#include <linux/config.h>
10#include <linux/irq.h> 10#include <linux/irq.h>
11#include <linux/module.h> 11#include <linux/module.h>
12#include <linux/random.h> 12#include <linux/random.h>
@@ -16,12 +16,6 @@
16 16
17#ifdef CONFIG_SMP 17#ifdef CONFIG_SMP
18 18
19cpumask_t irq_affinity[NR_IRQS] = { [0 ... NR_IRQS-1] = CPU_MASK_ALL };
20
21#if defined (CONFIG_GENERIC_PENDING_IRQ) || defined (CONFIG_IRQBALANCE)
22cpumask_t __cacheline_aligned pending_irq_cpumask[NR_IRQS];
23#endif
24
25/** 19/**
26 * synchronize_irq - wait for pending IRQ handlers (on other CPUs) 20 * synchronize_irq - wait for pending IRQ handlers (on other CPUs)
27 * @irq: interrupt number to wait for 21 * @irq: interrupt number to wait for
@@ -42,7 +36,6 @@ void synchronize_irq(unsigned int irq)
42 while (desc->status & IRQ_INPROGRESS) 36 while (desc->status & IRQ_INPROGRESS)
43 cpu_relax(); 37 cpu_relax();
44} 38}
45
46EXPORT_SYMBOL(synchronize_irq); 39EXPORT_SYMBOL(synchronize_irq);
47 40
48#endif 41#endif
@@ -60,7 +53,7 @@ EXPORT_SYMBOL(synchronize_irq);
60 */ 53 */
61void disable_irq_nosync(unsigned int irq) 54void disable_irq_nosync(unsigned int irq)
62{ 55{
63 irq_desc_t *desc = irq_desc + irq; 56 struct irq_desc *desc = irq_desc + irq;
64 unsigned long flags; 57 unsigned long flags;
65 58
66 if (irq >= NR_IRQS) 59 if (irq >= NR_IRQS)
@@ -69,11 +62,10 @@ void disable_irq_nosync(unsigned int irq)
69 spin_lock_irqsave(&desc->lock, flags); 62 spin_lock_irqsave(&desc->lock, flags);
70 if (!desc->depth++) { 63 if (!desc->depth++) {
71 desc->status |= IRQ_DISABLED; 64 desc->status |= IRQ_DISABLED;
72 desc->handler->disable(irq); 65 desc->chip->disable(irq);
73 } 66 }
74 spin_unlock_irqrestore(&desc->lock, flags); 67 spin_unlock_irqrestore(&desc->lock, flags);
75} 68}
76
77EXPORT_SYMBOL(disable_irq_nosync); 69EXPORT_SYMBOL(disable_irq_nosync);
78 70
79/** 71/**
@@ -90,7 +82,7 @@ EXPORT_SYMBOL(disable_irq_nosync);
90 */ 82 */
91void disable_irq(unsigned int irq) 83void disable_irq(unsigned int irq)
92{ 84{
93 irq_desc_t *desc = irq_desc + irq; 85 struct irq_desc *desc = irq_desc + irq;
94 86
95 if (irq >= NR_IRQS) 87 if (irq >= NR_IRQS)
96 return; 88 return;
@@ -99,7 +91,6 @@ void disable_irq(unsigned int irq)
99 if (desc->action) 91 if (desc->action)
100 synchronize_irq(irq); 92 synchronize_irq(irq);
101} 93}
102
103EXPORT_SYMBOL(disable_irq); 94EXPORT_SYMBOL(disable_irq);
104 95
105/** 96/**
@@ -114,7 +105,7 @@ EXPORT_SYMBOL(disable_irq);
114 */ 105 */
115void enable_irq(unsigned int irq) 106void enable_irq(unsigned int irq)
116{ 107{
117 irq_desc_t *desc = irq_desc + irq; 108 struct irq_desc *desc = irq_desc + irq;
118 unsigned long flags; 109 unsigned long flags;
119 110
120 if (irq >= NR_IRQS) 111 if (irq >= NR_IRQS)
@@ -123,17 +114,15 @@ void enable_irq(unsigned int irq)
123 spin_lock_irqsave(&desc->lock, flags); 114 spin_lock_irqsave(&desc->lock, flags);
124 switch (desc->depth) { 115 switch (desc->depth) {
125 case 0: 116 case 0:
117 printk(KERN_WARNING "Unbalanced enable for IRQ %d\n", irq);
126 WARN_ON(1); 118 WARN_ON(1);
127 break; 119 break;
128 case 1: { 120 case 1: {
129 unsigned int status = desc->status & ~IRQ_DISABLED; 121 unsigned int status = desc->status & ~IRQ_DISABLED;
130 122
131 desc->status = status; 123 /* Prevent probing on this irq: */
132 if ((status & (IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) { 124 desc->status = status | IRQ_NOPROBE;
133 desc->status = status | IRQ_REPLAY; 125 check_irq_resend(desc, irq);
134 hw_resend_irq(desc->handler,irq);
135 }
136 desc->handler->enable(irq);
137 /* fall-through */ 126 /* fall-through */
138 } 127 }
139 default: 128 default:
@@ -141,9 +130,29 @@ void enable_irq(unsigned int irq)
141 } 130 }
142 spin_unlock_irqrestore(&desc->lock, flags); 131 spin_unlock_irqrestore(&desc->lock, flags);
143} 132}
144
145EXPORT_SYMBOL(enable_irq); 133EXPORT_SYMBOL(enable_irq);
146 134
135/**
136 * set_irq_wake - control irq power management wakeup
137 * @irq: interrupt to control
138 * @on: enable/disable power management wakeup
139 *
140 * Enable/disable power management wakeup mode
141 */
142int set_irq_wake(unsigned int irq, unsigned int on)
143{
144 struct irq_desc *desc = irq_desc + irq;
145 unsigned long flags;
146 int ret = -ENXIO;
147
148 spin_lock_irqsave(&desc->lock, flags);
149 if (desc->chip->set_wake)
150 ret = desc->chip->set_wake(irq, on);
151 spin_unlock_irqrestore(&desc->lock, flags);
152 return ret;
153}
154EXPORT_SYMBOL(set_irq_wake);
155
147/* 156/*
148 * Internal function that tells the architecture code whether a 157 * Internal function that tells the architecture code whether a
149 * particular irq has been exclusively allocated or is available 158 * particular irq has been exclusively allocated or is available
@@ -153,22 +162,33 @@ int can_request_irq(unsigned int irq, unsigned long irqflags)
153{ 162{
154 struct irqaction *action; 163 struct irqaction *action;
155 164
156 if (irq >= NR_IRQS) 165 if (irq >= NR_IRQS || irq_desc[irq].status & IRQ_NOREQUEST)
157 return 0; 166 return 0;
158 167
159 action = irq_desc[irq].action; 168 action = irq_desc[irq].action;
160 if (action) 169 if (action)
161 if (irqflags & action->flags & SA_SHIRQ) 170 if (irqflags & action->flags & IRQF_SHARED)
162 action = NULL; 171 action = NULL;
163 172
164 return !action; 173 return !action;
165} 174}
166 175
176void compat_irq_chip_set_default_handler(struct irq_desc *desc)
177{
178 /*
179 * If the architecture still has not overridden
180 * the flow handler then zap the default. This
181 * should catch incorrect flow-type setting.
182 */
183 if (desc->handle_irq == &handle_bad_irq)
184 desc->handle_irq = NULL;
185}
186
167/* 187/*
168 * Internal function to register an irqaction - typically used to 188 * Internal function to register an irqaction - typically used to
169 * allocate special interrupts that are part of the architecture. 189 * allocate special interrupts that are part of the architecture.
170 */ 190 */
171int setup_irq(unsigned int irq, struct irqaction * new) 191int setup_irq(unsigned int irq, struct irqaction *new)
172{ 192{
173 struct irq_desc *desc = irq_desc + irq; 193 struct irq_desc *desc = irq_desc + irq;
174 struct irqaction *old, **p; 194 struct irqaction *old, **p;
@@ -178,14 +198,14 @@ int setup_irq(unsigned int irq, struct irqaction * new)
178 if (irq >= NR_IRQS) 198 if (irq >= NR_IRQS)
179 return -EINVAL; 199 return -EINVAL;
180 200
181 if (desc->handler == &no_irq_type) 201 if (desc->chip == &no_irq_chip)
182 return -ENOSYS; 202 return -ENOSYS;
183 /* 203 /*
184 * Some drivers like serial.c use request_irq() heavily, 204 * Some drivers like serial.c use request_irq() heavily,
185 * so we have to be careful not to interfere with a 205 * so we have to be careful not to interfere with a
186 * running system. 206 * running system.
187 */ 207 */
188 if (new->flags & SA_SAMPLE_RANDOM) { 208 if (new->flags & IRQF_SAMPLE_RANDOM) {
189 /* 209 /*
190 * This function might sleep, we want to call it first, 210 * This function might sleep, we want to call it first,
191 * outside of the atomic block. 211 * outside of the atomic block.
@@ -200,16 +220,24 @@ int setup_irq(unsigned int irq, struct irqaction * new)
200 /* 220 /*
201 * The following block of code has to be executed atomically 221 * The following block of code has to be executed atomically
202 */ 222 */
203 spin_lock_irqsave(&desc->lock,flags); 223 spin_lock_irqsave(&desc->lock, flags);
204 p = &desc->action; 224 p = &desc->action;
205 if ((old = *p) != NULL) { 225 old = *p;
206 /* Can't share interrupts unless both agree to */ 226 if (old) {
207 if (!(old->flags & new->flags & SA_SHIRQ)) 227 /*
228 * Can't share interrupts unless both agree to and are
229 * the same type (level, edge, polarity). So both flag
230 * fields must have IRQF_SHARED set and the bits which
231 * set the trigger type must match.
232 */
233 if (!((old->flags & new->flags) & IRQF_SHARED) ||
234 ((old->flags ^ new->flags) & IRQF_TRIGGER_MASK))
208 goto mismatch; 235 goto mismatch;
209 236
210#if defined(ARCH_HAS_IRQ_PER_CPU) && defined(SA_PERCPU_IRQ) 237#if defined(CONFIG_IRQ_PER_CPU)
211 /* All handlers must agree on per-cpuness */ 238 /* All handlers must agree on per-cpuness */
212 if ((old->flags & IRQ_PER_CPU) != (new->flags & IRQ_PER_CPU)) 239 if ((old->flags & IRQF_PERCPU) !=
240 (new->flags & IRQF_PERCPU))
213 goto mismatch; 241 goto mismatch;
214#endif 242#endif
215 243
@@ -222,20 +250,45 @@ int setup_irq(unsigned int irq, struct irqaction * new)
222 } 250 }
223 251
224 *p = new; 252 *p = new;
225#if defined(ARCH_HAS_IRQ_PER_CPU) && defined(SA_PERCPU_IRQ) 253#if defined(CONFIG_IRQ_PER_CPU)
226 if (new->flags & SA_PERCPU_IRQ) 254 if (new->flags & IRQF_PERCPU)
227 desc->status |= IRQ_PER_CPU; 255 desc->status |= IRQ_PER_CPU;
228#endif 256#endif
229 if (!shared) { 257 if (!shared) {
230 desc->depth = 0; 258 irq_chip_set_defaults(desc->chip);
231 desc->status &= ~(IRQ_DISABLED | IRQ_AUTODETECT | 259
232 IRQ_WAITING | IRQ_INPROGRESS); 260 /* Setup the type (level, edge polarity) if configured: */
233 if (desc->handler->startup) 261 if (new->flags & IRQF_TRIGGER_MASK) {
234 desc->handler->startup(irq); 262 if (desc->chip && desc->chip->set_type)
235 else 263 desc->chip->set_type(irq,
236 desc->handler->enable(irq); 264 new->flags & IRQF_TRIGGER_MASK);
265 else
266 /*
267 * IRQF_TRIGGER_* but the PIC does not support
268 * multiple flow-types?
269 */
270 printk(KERN_WARNING "No IRQF_TRIGGER set_type "
271 "function for IRQ %d (%s)\n", irq,
272 desc->chip ? desc->chip->name :
273 "unknown");
274 } else
275 compat_irq_chip_set_default_handler(desc);
276
277 desc->status &= ~(IRQ_AUTODETECT | IRQ_WAITING |
278 IRQ_INPROGRESS);
279
280 if (!(desc->status & IRQ_NOAUTOEN)) {
281 desc->depth = 0;
282 desc->status &= ~IRQ_DISABLED;
283 if (desc->chip->startup)
284 desc->chip->startup(irq);
285 else
286 desc->chip->enable(irq);
287 } else
288 /* Undo nested disables: */
289 desc->depth = 1;
237 } 290 }
238 spin_unlock_irqrestore(&desc->lock,flags); 291 spin_unlock_irqrestore(&desc->lock, flags);
239 292
240 new->irq = irq; 293 new->irq = irq;
241 register_irq_proc(irq); 294 register_irq_proc(irq);
@@ -246,8 +299,8 @@ int setup_irq(unsigned int irq, struct irqaction * new)
246 299
247mismatch: 300mismatch:
248 spin_unlock_irqrestore(&desc->lock, flags); 301 spin_unlock_irqrestore(&desc->lock, flags);
249 if (!(new->flags & SA_PROBEIRQ)) { 302 if (!(new->flags & IRQF_PROBE_SHARED)) {
250 printk(KERN_ERR "%s: irq handler mismatch\n", __FUNCTION__); 303 printk(KERN_ERR "IRQ handler type mismatch for IRQ %d\n", irq);
251 dump_stack(); 304 dump_stack();
252 } 305 }
253 return -EBUSY; 306 return -EBUSY;
@@ -278,10 +331,10 @@ void free_irq(unsigned int irq, void *dev_id)
278 return; 331 return;
279 332
280 desc = irq_desc + irq; 333 desc = irq_desc + irq;
281 spin_lock_irqsave(&desc->lock,flags); 334 spin_lock_irqsave(&desc->lock, flags);
282 p = &desc->action; 335 p = &desc->action;
283 for (;;) { 336 for (;;) {
284 struct irqaction * action = *p; 337 struct irqaction *action = *p;
285 338
286 if (action) { 339 if (action) {
287 struct irqaction **pp = p; 340 struct irqaction **pp = p;
@@ -295,18 +348,18 @@ void free_irq(unsigned int irq, void *dev_id)
295 348
296 /* Currently used only by UML, might disappear one day.*/ 349 /* Currently used only by UML, might disappear one day.*/
297#ifdef CONFIG_IRQ_RELEASE_METHOD 350#ifdef CONFIG_IRQ_RELEASE_METHOD
298 if (desc->handler->release) 351 if (desc->chip->release)
299 desc->handler->release(irq, dev_id); 352 desc->chip->release(irq, dev_id);
300#endif 353#endif
301 354
302 if (!desc->action) { 355 if (!desc->action) {
303 desc->status |= IRQ_DISABLED; 356 desc->status |= IRQ_DISABLED;
304 if (desc->handler->shutdown) 357 if (desc->chip->shutdown)
305 desc->handler->shutdown(irq); 358 desc->chip->shutdown(irq);
306 else 359 else
307 desc->handler->disable(irq); 360 desc->chip->disable(irq);
308 } 361 }
309 spin_unlock_irqrestore(&desc->lock,flags); 362 spin_unlock_irqrestore(&desc->lock, flags);
310 unregister_handler_proc(irq, action); 363 unregister_handler_proc(irq, action);
311 364
312 /* Make sure it's not being used on another CPU */ 365 /* Make sure it's not being used on another CPU */
@@ -314,12 +367,11 @@ void free_irq(unsigned int irq, void *dev_id)
314 kfree(action); 367 kfree(action);
315 return; 368 return;
316 } 369 }
317 printk(KERN_ERR "Trying to free free IRQ%d\n",irq); 370 printk(KERN_ERR "Trying to free already-free IRQ %d\n", irq);
318 spin_unlock_irqrestore(&desc->lock,flags); 371 spin_unlock_irqrestore(&desc->lock, flags);
319 return; 372 return;
320 } 373 }
321} 374}
322
323EXPORT_SYMBOL(free_irq); 375EXPORT_SYMBOL(free_irq);
324 376
325/** 377/**
@@ -346,16 +398,16 @@ EXPORT_SYMBOL(free_irq);
346 * 398 *
347 * Flags: 399 * Flags:
348 * 400 *
349 * SA_SHIRQ Interrupt is shared 401 * IRQF_SHARED Interrupt is shared
350 * SA_INTERRUPT Disable local interrupts while processing 402 * IRQF_DISABLED Disable local interrupts while processing
351 * SA_SAMPLE_RANDOM The interrupt can be used for entropy 403 * IRQF_SAMPLE_RANDOM The interrupt can be used for entropy
352 * 404 *
353 */ 405 */
354int request_irq(unsigned int irq, 406int request_irq(unsigned int irq,
355 irqreturn_t (*handler)(int, void *, struct pt_regs *), 407 irqreturn_t (*handler)(int, void *, struct pt_regs *),
356 unsigned long irqflags, const char * devname, void *dev_id) 408 unsigned long irqflags, const char *devname, void *dev_id)
357{ 409{
358 struct irqaction * action; 410 struct irqaction *action;
359 int retval; 411 int retval;
360 412
361 /* 413 /*
@@ -364,10 +416,12 @@ int request_irq(unsigned int irq,
364 * which interrupt is which (messes up the interrupt freeing 416 * which interrupt is which (messes up the interrupt freeing
365 * logic etc). 417 * logic etc).
366 */ 418 */
367 if ((irqflags & SA_SHIRQ) && !dev_id) 419 if ((irqflags & IRQF_SHARED) && !dev_id)
368 return -EINVAL; 420 return -EINVAL;
369 if (irq >= NR_IRQS) 421 if (irq >= NR_IRQS)
370 return -EINVAL; 422 return -EINVAL;
423 if (irq_desc[irq].status & IRQ_NOREQUEST)
424 return -EINVAL;
371 if (!handler) 425 if (!handler)
372 return -EINVAL; 426 return -EINVAL;
373 427
@@ -390,6 +444,5 @@ int request_irq(unsigned int irq,
390 444
391 return retval; 445 return retval;
392} 446}
393
394EXPORT_SYMBOL(request_irq); 447EXPORT_SYMBOL(request_irq);
395 448
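
The manage.c changes replace the old SA_* flags with the IRQF_* set and
pass any IRQF_TRIGGER_* bits straight through to the chip's ->set_type()
callback in setup_irq(). From a driver's point of view that looks roughly
like the sketch below, reusing the hypothetical handler sketched after the
handle.c hunks; the device structure and the choice of a shared,
rising-edge line are illustrative only.

#include <linux/interrupt.h>

static int my_dev_attach(struct my_dev *dev)	/* hypothetical probe path */
{
	int ret;

	/* IRQF_TRIGGER_RISING ends up in chip->set_type(); IRQF_SHARED requires dev_id */
	ret = request_irq(dev->irq, my_dev_interrupt,
			  IRQF_SHARED | IRQF_TRIGGER_RISING,
			  "my_dev", dev);
	if (ret)
		return ret;
	/* ... */
	return 0;
}

static void my_dev_detach(struct my_dev *dev)
{
	free_irq(dev->irq, dev);	/* dev_id must match the request_irq() call */
}
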
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index 134f9f2e0e39..a57ebe9fa6f6 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -3,19 +3,19 @@
3 3
4void set_pending_irq(unsigned int irq, cpumask_t mask) 4void set_pending_irq(unsigned int irq, cpumask_t mask)
5{ 5{
6 irq_desc_t *desc = irq_desc + irq; 6 struct irq_desc *desc = irq_desc + irq;
7 unsigned long flags; 7 unsigned long flags;
8 8
9 spin_lock_irqsave(&desc->lock, flags); 9 spin_lock_irqsave(&desc->lock, flags);
10 desc->move_irq = 1; 10 desc->move_irq = 1;
11 pending_irq_cpumask[irq] = mask; 11 irq_desc[irq].pending_mask = mask;
12 spin_unlock_irqrestore(&desc->lock, flags); 12 spin_unlock_irqrestore(&desc->lock, flags);
13} 13}
14 14
15void move_native_irq(int irq) 15void move_native_irq(int irq)
16{ 16{
17 struct irq_desc *desc = irq_desc + irq;
17 cpumask_t tmp; 18 cpumask_t tmp;
18 irq_desc_t *desc = irq_descp(irq);
19 19
20 if (likely(!desc->move_irq)) 20 if (likely(!desc->move_irq))
21 return; 21 return;
@@ -30,15 +30,15 @@ void move_native_irq(int irq)
30 30
31 desc->move_irq = 0; 31 desc->move_irq = 0;
32 32
33 if (likely(cpus_empty(pending_irq_cpumask[irq]))) 33 if (unlikely(cpus_empty(irq_desc[irq].pending_mask)))
34 return; 34 return;
35 35
36 if (!desc->handler->set_affinity) 36 if (!desc->chip->set_affinity)
37 return; 37 return;
38 38
39 assert_spin_locked(&desc->lock); 39 assert_spin_locked(&desc->lock);
40 40
41 cpus_and(tmp, pending_irq_cpumask[irq], cpu_online_map); 41 cpus_and(tmp, irq_desc[irq].pending_mask, cpu_online_map);
42 42
43 /* 43 /*
44 * If there was a valid mask to work with, please 44 * If there was a valid mask to work with, please
@@ -49,14 +49,14 @@ void move_native_irq(int irq)
49 * cause some ioapics to mal-function. 49 * cause some ioapics to mal-function.
50 * Being paranoid i guess! 50 * Being paranoid i guess!
51 */ 51 */
52 if (unlikely(!cpus_empty(tmp))) { 52 if (likely(!cpus_empty(tmp))) {
53 if (likely(!(desc->status & IRQ_DISABLED))) 53 if (likely(!(desc->status & IRQ_DISABLED)))
54 desc->handler->disable(irq); 54 desc->chip->disable(irq);
55 55
56 desc->handler->set_affinity(irq,tmp); 56 desc->chip->set_affinity(irq,tmp);
57 57
58 if (likely(!(desc->status & IRQ_DISABLED))) 58 if (likely(!(desc->status & IRQ_DISABLED)))
59 desc->handler->enable(irq); 59 desc->chip->enable(irq);
60 } 60 }
61 cpus_clear(pending_irq_cpumask[irq]); 61 cpus_clear(irq_desc[irq].pending_mask);
62} 62}
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index d03b5eef8ce0..607c7809ad01 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -12,18 +12,15 @@
12 12
13#include "internals.h" 13#include "internals.h"
14 14
15static struct proc_dir_entry *root_irq_dir, *irq_dir[NR_IRQS]; 15static struct proc_dir_entry *root_irq_dir;
16 16
17#ifdef CONFIG_SMP 17#ifdef CONFIG_SMP
18 18
19/*
20 * The /proc/irq/<irq>/smp_affinity values:
21 */
22static struct proc_dir_entry *smp_affinity_entry[NR_IRQS];
23
24#ifdef CONFIG_GENERIC_PENDING_IRQ 19#ifdef CONFIG_GENERIC_PENDING_IRQ
25void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val) 20void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val)
26{ 21{
22 set_balance_irq_affinity(irq, mask_val);
23
27 /* 24 /*
28 * Save these away for later use. Re-progam when the 25 * Save these away for later use. Re-progam when the
29 * interrupt is pending 26 * interrupt is pending
@@ -33,15 +30,16 @@ void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val)
33#else 30#else
34void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val) 31void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val)
35{ 32{
36 irq_affinity[irq] = mask_val; 33 set_balance_irq_affinity(irq, mask_val);
37 irq_desc[irq].handler->set_affinity(irq, mask_val); 34 irq_desc[irq].affinity = mask_val;
35 irq_desc[irq].chip->set_affinity(irq, mask_val);
38} 36}
39#endif 37#endif
40 38
41static int irq_affinity_read_proc(char *page, char **start, off_t off, 39static int irq_affinity_read_proc(char *page, char **start, off_t off,
42 int count, int *eof, void *data) 40 int count, int *eof, void *data)
43{ 41{
44 int len = cpumask_scnprintf(page, count, irq_affinity[(long)data]); 42 int len = cpumask_scnprintf(page, count, irq_desc[(long)data].affinity);
45 43
46 if (count - len < 2) 44 if (count - len < 2)
47 return -EINVAL; 45 return -EINVAL;
@@ -56,7 +54,7 @@ static int irq_affinity_write_proc(struct file *file, const char __user *buffer,
56 unsigned int irq = (int)(long)data, full_count = count, err; 54 unsigned int irq = (int)(long)data, full_count = count, err;
57 cpumask_t new_value, tmp; 55 cpumask_t new_value, tmp;
58 56
59 if (!irq_desc[irq].handler->set_affinity || no_irq_affinity) 57 if (!irq_desc[irq].chip->set_affinity || no_irq_affinity)
60 return -EIO; 58 return -EIO;
61 59
62 err = cpumask_parse(buffer, count, new_value); 60 err = cpumask_parse(buffer, count, new_value);
@@ -99,7 +97,7 @@ void register_handler_proc(unsigned int irq, struct irqaction *action)
99{ 97{
100 char name [MAX_NAMELEN]; 98 char name [MAX_NAMELEN];
101 99
102 if (!irq_dir[irq] || action->dir || !action->name || 100 if (!irq_desc[irq].dir || action->dir || !action->name ||
103 !name_unique(irq, action)) 101 !name_unique(irq, action))
104 return; 102 return;
105 103
@@ -107,7 +105,7 @@ void register_handler_proc(unsigned int irq, struct irqaction *action)
107 snprintf(name, MAX_NAMELEN, "%s", action->name); 105 snprintf(name, MAX_NAMELEN, "%s", action->name);
108 106
109 /* create /proc/irq/1234/handler/ */ 107 /* create /proc/irq/1234/handler/ */
110 action->dir = proc_mkdir(name, irq_dir[irq]); 108 action->dir = proc_mkdir(name, irq_desc[irq].dir);
111} 109}
112 110
113#undef MAX_NAMELEN 111#undef MAX_NAMELEN
@@ -119,22 +117,22 @@ void register_irq_proc(unsigned int irq)
119 char name [MAX_NAMELEN]; 117 char name [MAX_NAMELEN];
120 118
121 if (!root_irq_dir || 119 if (!root_irq_dir ||
122 (irq_desc[irq].handler == &no_irq_type) || 120 (irq_desc[irq].chip == &no_irq_chip) ||
123 irq_dir[irq]) 121 irq_desc[irq].dir)
124 return; 122 return;
125 123
126 memset(name, 0, MAX_NAMELEN); 124 memset(name, 0, MAX_NAMELEN);
127 sprintf(name, "%d", irq); 125 sprintf(name, "%d", irq);
128 126
129 /* create /proc/irq/1234 */ 127 /* create /proc/irq/1234 */
130 irq_dir[irq] = proc_mkdir(name, root_irq_dir); 128 irq_desc[irq].dir = proc_mkdir(name, root_irq_dir);
131 129
132#ifdef CONFIG_SMP 130#ifdef CONFIG_SMP
133 { 131 {
134 struct proc_dir_entry *entry; 132 struct proc_dir_entry *entry;
135 133
136 /* create /proc/irq/<irq>/smp_affinity */ 134 /* create /proc/irq/<irq>/smp_affinity */
137 entry = create_proc_entry("smp_affinity", 0600, irq_dir[irq]); 135 entry = create_proc_entry("smp_affinity", 0600, irq_desc[irq].dir);
138 136
139 if (entry) { 137 if (entry) {
140 entry->nlink = 1; 138 entry->nlink = 1;
@@ -142,7 +140,6 @@ void register_irq_proc(unsigned int irq)
142 entry->read_proc = irq_affinity_read_proc; 140 entry->read_proc = irq_affinity_read_proc;
143 entry->write_proc = irq_affinity_write_proc; 141 entry->write_proc = irq_affinity_write_proc;
144 } 142 }
145 smp_affinity_entry[irq] = entry;
146 } 143 }
147#endif 144#endif
148} 145}
@@ -152,7 +149,7 @@ void register_irq_proc(unsigned int irq)
152void unregister_handler_proc(unsigned int irq, struct irqaction *action) 149void unregister_handler_proc(unsigned int irq, struct irqaction *action)
153{ 150{
154 if (action->dir) 151 if (action->dir)
155 remove_proc_entry(action->dir->name, irq_dir[irq]); 152 remove_proc_entry(action->dir->name, irq_desc[irq].dir);
156} 153}
157 154
158void init_irq_proc(void) 155void init_irq_proc(void)
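
The proc.c changes only move the bookkeeping into struct irq_desc; the
userspace interface is still /proc/irq/<irq>/smp_affinity, which takes a
hexadecimal CPU bitmask and returns -EIO if the chip has no
->set_affinity. A small userspace sketch, with the IRQ number and mask
chosen purely as examples:

#include <stdio.h>

int main(void)
{
	/* Pin IRQ 19 to CPUs 0 and 1: hex mask 0x3 */
	FILE *f = fopen("/proc/irq/19/smp_affinity", "w");

	if (!f) {
		perror("smp_affinity");
		return 1;
	}
	fprintf(f, "3\n");
	return fclose(f) ? 1 : 0;
}
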
diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c
new file mode 100644
index 000000000000..872f91ba2ce8
--- /dev/null
+++ b/kernel/irq/resend.c
@@ -0,0 +1,78 @@
1/*
2 * linux/kernel/irq/resend.c
3 *
4 * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar
5 * Copyright (C) 2005-2006, Thomas Gleixner
6 *
7 * This file contains the IRQ-resend code
8 *
9 * If the interrupt is waiting to be processed, we try to re-run it.
10 * We can't directly run it from here since the caller might be in an
11 * interrupt-protected region. Not all irq controller chips can
12 * retrigger interrupts at the hardware level, so in those cases
13 * we allow the resending of IRQs via a tasklet.
14 */
15
16#include <linux/irq.h>
17#include <linux/module.h>
18#include <linux/random.h>
19#include <linux/interrupt.h>
20
21#include "internals.h"
22
23#ifdef CONFIG_HARDIRQS_SW_RESEND
24
25/* Bitmap to handle software resend of interrupts: */
26static DECLARE_BITMAP(irqs_resend, NR_IRQS);
27
28/*
29 * Run software resends of IRQ's
30 */
31static void resend_irqs(unsigned long arg)
32{
33 struct irq_desc *desc;
34 int irq;
35
36 while (!bitmap_empty(irqs_resend, NR_IRQS)) {
37 irq = find_first_bit(irqs_resend, NR_IRQS);
38 clear_bit(irq, irqs_resend);
39 desc = irq_desc + irq;
40 local_irq_disable();
41 desc->handle_irq(irq, desc, NULL);
42 local_irq_enable();
43 }
44}
45
46/* Tasklet to handle resend: */
47static DECLARE_TASKLET(resend_tasklet, resend_irqs, 0);
48
49#endif
50
51/*
52 * IRQ resend
53 *
54 * Is called with interrupts disabled and desc->lock held.
55 */
56void check_irq_resend(struct irq_desc *desc, unsigned int irq)
57{
58 unsigned int status = desc->status;
59
60 /*
61 * Make sure the interrupt is enabled, before resending it:
62 */
63 desc->chip->enable(irq);
64
65 if ((status & (IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) {
66 desc->status &= ~IRQ_PENDING;
67 desc->status = status | IRQ_REPLAY;
68
69 if (!desc->chip || !desc->chip->retrigger ||
70 !desc->chip->retrigger(irq)) {
71#ifdef CONFIG_HARDIRQS_SW_RESEND
72 /* Set it pending and activate the softirq: */
73 set_bit(irq, irqs_resend);
74 tasklet_schedule(&resend_tasklet);
75#endif
76 }
77 }
78}
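
check_irq_resend() prefers a hardware replay: if the chip provides
->retrigger() and it returns non-zero, the pending interrupt is re-raised
by the controller itself; only otherwise (and only with
CONFIG_HARDIRQS_SW_RESEND) does the tasklet above re-run the flow handler
in software. A sketch of the hardware path for the imaginary controller
used earlier; the software-set register, its base address and the IRQ base
are assumptions:

static int my_ctrl_retrigger(unsigned int irq)
{
	/* Hypothetical "software set" register that re-latches the line */
	writel(1 << (irq - MY_CTRL_IRQ_BASE), my_ctrl_base + MY_CTRL_SOFTSET);
	return 1;	/* non-zero: the hardware will replay it, no tasklet needed */
}

The chip from the earlier sketch would then set .retrigger = my_ctrl_retrigger.
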
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 7df9abd5ec86..417e98092cf2 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -11,44 +11,44 @@
11#include <linux/kallsyms.h> 11#include <linux/kallsyms.h>
12#include <linux/interrupt.h> 12#include <linux/interrupt.h>
13 13
14static int irqfixup; 14static int irqfixup __read_mostly;
15 15
16/* 16/*
17 * Recovery handler for misrouted interrupts. 17 * Recovery handler for misrouted interrupts.
18 */ 18 */
19
20static int misrouted_irq(int irq, struct pt_regs *regs) 19static int misrouted_irq(int irq, struct pt_regs *regs)
21{ 20{
22 int i; 21 int i;
23 irq_desc_t *desc;
24 int ok = 0; 22 int ok = 0;
25 int work = 0; /* Did we do work for a real IRQ */ 23 int work = 0; /* Did we do work for a real IRQ */
26 24
27 for(i = 1; i < NR_IRQS; i++) { 25 for (i = 1; i < NR_IRQS; i++) {
26 struct irq_desc *desc = irq_desc + i;
28 struct irqaction *action; 27 struct irqaction *action;
29 28
30 if (i == irq) /* Already tried */ 29 if (i == irq) /* Already tried */
31 continue; 30 continue;
32 desc = &irq_desc[i]; 31
33 spin_lock(&desc->lock); 32 spin_lock(&desc->lock);
34 action = desc->action;
35 /* Already running on another processor */ 33 /* Already running on another processor */
36 if (desc->status & IRQ_INPROGRESS) { 34 if (desc->status & IRQ_INPROGRESS) {
37 /* 35 /*
38 * Already running: If it is shared get the other 36 * Already running: If it is shared get the other
39 * CPU to go looking for our mystery interrupt too 37 * CPU to go looking for our mystery interrupt too
40 */ 38 */
41 if (desc->action && (desc->action->flags & SA_SHIRQ)) 39 if (desc->action && (desc->action->flags & IRQF_SHARED))
42 desc->status |= IRQ_PENDING; 40 desc->status |= IRQ_PENDING;
43 spin_unlock(&desc->lock); 41 spin_unlock(&desc->lock);
44 continue; 42 continue;
45 } 43 }
46 /* Honour the normal IRQ locking */ 44 /* Honour the normal IRQ locking */
47 desc->status |= IRQ_INPROGRESS; 45 desc->status |= IRQ_INPROGRESS;
46 action = desc->action;
48 spin_unlock(&desc->lock); 47 spin_unlock(&desc->lock);
48
49 while (action) { 49 while (action) {
50 /* Only shared IRQ handlers are safe to call */ 50 /* Only shared IRQ handlers are safe to call */
51 if (action->flags & SA_SHIRQ) { 51 if (action->flags & IRQF_SHARED) {
52 if (action->handler(i, action->dev_id, regs) == 52 if (action->handler(i, action->dev_id, regs) ==
53 IRQ_HANDLED) 53 IRQ_HANDLED)
54 ok = 1; 54 ok = 1;
@@ -62,9 +62,8 @@ static int misrouted_irq(int irq, struct pt_regs *regs)
62 62
63 /* 63 /*
64 * While we were looking for a fixup someone queued a real 64 * While we were looking for a fixup someone queued a real
65 * IRQ clashing with our walk 65 * IRQ clashing with our walk:
66 */ 66 */
67
68 while ((desc->status & IRQ_PENDING) && action) { 67 while ((desc->status & IRQ_PENDING) && action) {
69 /* 68 /*
70 * Perform real IRQ processing for the IRQ we deferred 69 * Perform real IRQ processing for the IRQ we deferred
@@ -80,8 +79,8 @@ static int misrouted_irq(int irq, struct pt_regs *regs)
80 * If we did actual work for the real IRQ line we must let the 79 * If we did actual work for the real IRQ line we must let the
81 * IRQ controller clean up too 80 * IRQ controller clean up too
82 */ 81 */
83 if(work) 82 if (work && desc->chip && desc->chip->end)
84 desc->handler->end(i); 83 desc->chip->end(i);
85 spin_unlock(&desc->lock); 84 spin_unlock(&desc->lock);
86 } 85 }
87 /* So the caller can adjust the irq error counts */ 86 /* So the caller can adjust the irq error counts */
@@ -100,7 +99,8 @@ static int misrouted_irq(int irq, struct pt_regs *regs)
100 */ 99 */
101 100
102static void 101static void
103__report_bad_irq(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret) 102__report_bad_irq(unsigned int irq, struct irq_desc *desc,
103 irqreturn_t action_ret)
104{ 104{
105 struct irqaction *action; 105 struct irqaction *action;
106 106
@@ -113,6 +113,7 @@ __report_bad_irq(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret)
113 } 113 }
114 dump_stack(); 114 dump_stack();
115 printk(KERN_ERR "handlers:\n"); 115 printk(KERN_ERR "handlers:\n");
116
116 action = desc->action; 117 action = desc->action;
117 while (action) { 118 while (action) {
118 printk(KERN_ERR "[<%p>]", action->handler); 119 printk(KERN_ERR "[<%p>]", action->handler);
@@ -123,7 +124,8 @@ __report_bad_irq(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret)
123 } 124 }
124} 125}
125 126
126static void report_bad_irq(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret) 127static void
128report_bad_irq(unsigned int irq, struct irq_desc *desc, irqreturn_t action_ret)
127{ 129{
128 static int count = 100; 130 static int count = 100;
129 131
@@ -133,12 +135,12 @@ static void report_bad_irq(unsigned int irq, irq_desc_t *desc, irqreturn_t actio
133 } 135 }
134} 136}
135 137
136void note_interrupt(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret, 138void note_interrupt(unsigned int irq, struct irq_desc *desc,
137 struct pt_regs *regs) 139 irqreturn_t action_ret, struct pt_regs *regs)
138{ 140{
139 if (action_ret != IRQ_HANDLED) { 141 if (unlikely(action_ret != IRQ_HANDLED)) {
140 desc->irqs_unhandled++; 142 desc->irqs_unhandled++;
141 if (action_ret != IRQ_NONE) 143 if (unlikely(action_ret != IRQ_NONE))
142 report_bad_irq(irq, desc, action_ret); 144 report_bad_irq(irq, desc, action_ret);
143 } 145 }
144 146
@@ -152,11 +154,11 @@ void note_interrupt(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret,
152 } 154 }
153 155
154 desc->irq_count++; 156 desc->irq_count++;
155 if (desc->irq_count < 100000) 157 if (likely(desc->irq_count < 100000))
156 return; 158 return;
157 159
158 desc->irq_count = 0; 160 desc->irq_count = 0;
159 if (desc->irqs_unhandled > 99900) { 161 if (unlikely(desc->irqs_unhandled > 99900)) {
160 /* 162 /*
161 * The interrupt is stuck 163 * The interrupt is stuck
162 */ 164 */
@@ -166,17 +168,19 @@ void note_interrupt(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret,
166 */ 168 */
167 printk(KERN_EMERG "Disabling IRQ #%d\n", irq); 169 printk(KERN_EMERG "Disabling IRQ #%d\n", irq);
168 desc->status |= IRQ_DISABLED; 170 desc->status |= IRQ_DISABLED;
169 desc->handler->disable(irq); 171 desc->depth = 1;
172 desc->chip->disable(irq);
170 } 173 }
171 desc->irqs_unhandled = 0; 174 desc->irqs_unhandled = 0;
172} 175}
173 176
174int noirqdebug; 177int noirqdebug __read_mostly;
175 178
176int __init noirqdebug_setup(char *str) 179int __init noirqdebug_setup(char *str)
177{ 180{
178 noirqdebug = 1; 181 noirqdebug = 1;
179 printk(KERN_INFO "IRQ lockup detection disabled\n"); 182 printk(KERN_INFO "IRQ lockup detection disabled\n");
183
180 return 1; 184 return 1;
181} 185}
182 186
@@ -187,6 +191,7 @@ static int __init irqfixup_setup(char *str)
187 irqfixup = 1; 191 irqfixup = 1;
188 printk(KERN_WARNING "Misrouted IRQ fixup support enabled.\n"); 192 printk(KERN_WARNING "Misrouted IRQ fixup support enabled.\n");
189 printk(KERN_WARNING "This may impact system performance.\n"); 193 printk(KERN_WARNING "This may impact system performance.\n");
194
190 return 1; 195 return 1;
191} 196}
192 197
diff --git a/kernel/kexec.c b/kernel/kexec.c
index bf39d28e4c0e..50087ecf337e 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -902,14 +902,14 @@ static int kimage_load_segment(struct kimage *image,
902 * kexec does not sync, or unmount filesystems so if you need 902 * kexec does not sync, or unmount filesystems so if you need
903 * that to happen you need to do that yourself. 903 * that to happen you need to do that yourself.
904 */ 904 */
905struct kimage *kexec_image = NULL; 905struct kimage *kexec_image;
906static struct kimage *kexec_crash_image = NULL; 906struct kimage *kexec_crash_image;
907/* 907/*
908 * A home grown binary mutex. 908 * A home grown binary mutex.
909 * Nothing can wait so this mutex is safe to use 909 * Nothing can wait so this mutex is safe to use
910 * in interrupt context :) 910 * in interrupt context :)
911 */ 911 */
912static int kexec_lock = 0; 912static int kexec_lock;
913 913
914asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments, 914asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments,
915 struct kexec_segment __user *segments, 915 struct kexec_segment __user *segments,
@@ -1042,7 +1042,6 @@ asmlinkage long compat_sys_kexec_load(unsigned long entry,
1042 1042
1043void crash_kexec(struct pt_regs *regs) 1043void crash_kexec(struct pt_regs *regs)
1044{ 1044{
1045 struct kimage *image;
1046 int locked; 1045 int locked;
1047 1046
1048 1047
@@ -1056,12 +1055,11 @@ void crash_kexec(struct pt_regs *regs)
1056 */ 1055 */
1057 locked = xchg(&kexec_lock, 1); 1056 locked = xchg(&kexec_lock, 1);
1058 if (!locked) { 1057 if (!locked) {
1059 image = xchg(&kexec_crash_image, NULL); 1058 if (kexec_crash_image) {
1060 if (image) {
1061 struct pt_regs fixed_regs; 1059 struct pt_regs fixed_regs;
1062 crash_setup_regs(&fixed_regs, regs); 1060 crash_setup_regs(&fixed_regs, regs);
1063 machine_crash_shutdown(&fixed_regs); 1061 machine_crash_shutdown(&fixed_regs);
1064 machine_kexec(image); 1062 machine_kexec(kexec_crash_image);
1065 } 1063 }
1066 xchg(&kexec_lock, 0); 1064 xchg(&kexec_lock, 0);
1067 } 1065 }
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 20a997c73c3d..1b7157af051c 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -20,7 +20,6 @@
20*/ 20*/
21#define __KERNEL_SYSCALLS__ 21#define __KERNEL_SYSCALLS__
22 22
23#include <linux/config.h>
24#include <linux/module.h> 23#include <linux/module.h>
25#include <linux/sched.h> 24#include <linux/sched.h>
26#include <linux/syscalls.h> 25#include <linux/syscalls.h>
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 1fbf466a29aa..64aab081153b 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -47,11 +47,17 @@
47 47
48static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE]; 48static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE];
49static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE]; 49static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE];
50static atomic_t kprobe_count;
50 51
51DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */ 52DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */
52DEFINE_SPINLOCK(kretprobe_lock); /* Protects kretprobe_inst_table */ 53DEFINE_SPINLOCK(kretprobe_lock); /* Protects kretprobe_inst_table */
53static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; 54static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL;
54 55
56static struct notifier_block kprobe_page_fault_nb = {
57 .notifier_call = kprobe_exceptions_notify,
58 .priority = 0x7fffffff /* we need to be notified first */
59};
60
55#ifdef __ARCH_WANT_KPROBES_INSN_SLOT 61#ifdef __ARCH_WANT_KPROBES_INSN_SLOT
56/* 62/*
57 * kprobe->ainsn.insn points to the copy of the instruction to be 63 * kprobe->ainsn.insn points to the copy of the instruction to be
@@ -368,16 +374,15 @@ static inline void copy_kprobe(struct kprobe *old_p, struct kprobe *p)
368*/ 374*/
369static int __kprobes add_new_kprobe(struct kprobe *old_p, struct kprobe *p) 375static int __kprobes add_new_kprobe(struct kprobe *old_p, struct kprobe *p)
370{ 376{
371 struct kprobe *kp;
372
373 if (p->break_handler) { 377 if (p->break_handler) {
374 list_for_each_entry_rcu(kp, &old_p->list, list) { 378 if (old_p->break_handler)
375 if (kp->break_handler) 379 return -EEXIST;
376 return -EEXIST;
377 }
378 list_add_tail_rcu(&p->list, &old_p->list); 380 list_add_tail_rcu(&p->list, &old_p->list);
381 old_p->break_handler = aggr_break_handler;
379 } else 382 } else
380 list_add_rcu(&p->list, &old_p->list); 383 list_add_rcu(&p->list, &old_p->list);
384 if (p->post_handler && !old_p->post_handler)
385 old_p->post_handler = aggr_post_handler;
381 return 0; 386 return 0;
382} 387}
383 388
@@ -390,9 +395,11 @@ static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p)
390 copy_kprobe(p, ap); 395 copy_kprobe(p, ap);
391 ap->addr = p->addr; 396 ap->addr = p->addr;
392 ap->pre_handler = aggr_pre_handler; 397 ap->pre_handler = aggr_pre_handler;
393 ap->post_handler = aggr_post_handler;
394 ap->fault_handler = aggr_fault_handler; 398 ap->fault_handler = aggr_fault_handler;
395 ap->break_handler = aggr_break_handler; 399 if (p->post_handler)
400 ap->post_handler = aggr_post_handler;
401 if (p->break_handler)
402 ap->break_handler = aggr_break_handler;
396 403
397 INIT_LIST_HEAD(&ap->list); 404 INIT_LIST_HEAD(&ap->list);
398 list_add_rcu(&p->list, &ap->list); 405 list_add_rcu(&p->list, &ap->list);
@@ -464,6 +471,8 @@ static int __kprobes __register_kprobe(struct kprobe *p,
464 old_p = get_kprobe(p->addr); 471 old_p = get_kprobe(p->addr);
465 if (old_p) { 472 if (old_p) {
466 ret = register_aggr_kprobe(old_p, p); 473 ret = register_aggr_kprobe(old_p, p);
474 if (!ret)
475 atomic_inc(&kprobe_count);
467 goto out; 476 goto out;
468 } 477 }
469 478
@@ -474,6 +483,10 @@ static int __kprobes __register_kprobe(struct kprobe *p,
474 hlist_add_head_rcu(&p->hlist, 483 hlist_add_head_rcu(&p->hlist,
475 &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]); 484 &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]);
476 485
486 if (atomic_add_return(1, &kprobe_count) == \
487 (ARCH_INACTIVE_KPROBE_COUNT + 1))
488 register_page_fault_notifier(&kprobe_page_fault_nb);
489
477 arch_arm_kprobe(p); 490 arch_arm_kprobe(p);
478 491
479out: 492out:
@@ -536,14 +549,40 @@ valid_p:
536 kfree(old_p); 549 kfree(old_p);
537 } 550 }
538 arch_remove_kprobe(p); 551 arch_remove_kprobe(p);
552 } else {
553 mutex_lock(&kprobe_mutex);
554 if (p->break_handler)
555 old_p->break_handler = NULL;
556 if (p->post_handler){
557 list_for_each_entry_rcu(list_p, &old_p->list, list){
558 if (list_p->post_handler){
559 cleanup_p = 2;
560 break;
561 }
562 }
563 if (cleanup_p == 0)
564 old_p->post_handler = NULL;
565 }
566 mutex_unlock(&kprobe_mutex);
539 } 567 }
568
569 /* Call unregister_page_fault_notifier()
570 * if no probes are active
571 */
572 mutex_lock(&kprobe_mutex);
573 if (atomic_add_return(-1, &kprobe_count) == \
574 ARCH_INACTIVE_KPROBE_COUNT)
575 unregister_page_fault_notifier(&kprobe_page_fault_nb);
576 mutex_unlock(&kprobe_mutex);
577 return;
540} 578}
541 579
542static struct notifier_block kprobe_exceptions_nb = { 580static struct notifier_block kprobe_exceptions_nb = {
543 .notifier_call = kprobe_exceptions_notify, 581 .notifier_call = kprobe_exceptions_notify,
544 .priority = 0x7fffffff /* we need to notified first */ 582 .priority = 0x7fffffff /* we need to be notified first */
545}; 583};
546 584
585
547int __kprobes register_jprobe(struct jprobe *jp) 586int __kprobes register_jprobe(struct jprobe *jp)
548{ 587{
549 /* Todo: Verify probepoint is a function entry point */ 588 /* Todo: Verify probepoint is a function entry point */
@@ -652,6 +691,7 @@ static int __init init_kprobes(void)
652 INIT_HLIST_HEAD(&kprobe_table[i]); 691 INIT_HLIST_HEAD(&kprobe_table[i]);
653 INIT_HLIST_HEAD(&kretprobe_inst_table[i]); 692 INIT_HLIST_HEAD(&kretprobe_inst_table[i]);
654 } 693 }
694 atomic_set(&kprobe_count, 0);
655 695
656 err = arch_init_kprobes(); 696 err = arch_init_kprobes();
657 if (!err) 697 if (!err)
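
The kprobes changes above make the aggregate probe take on a ->post_handler
or ->break_handler only when some registered probe actually supplies one,
and arm the page-fault notifier only while probes are installed. The
registration API itself is unchanged; a minimal sketch of a module using it
(the probed address is a placeholder that must be filled in, and the
handlers just log):

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/kprobes.h>

static int my_pre(struct kprobe *p, struct pt_regs *regs)
{
	printk(KERN_INFO "kprobe pre-handler at %p\n", p->addr);
	return 0;
}

/* Supplying a post_handler is what makes the aggregate probe grow one too */
static void my_post(struct kprobe *p, struct pt_regs *regs, unsigned long flags)
{
	printk(KERN_INFO "kprobe post-handler at %p\n", p->addr);
}

static struct kprobe my_kp = {
	.pre_handler	= my_pre,
	.post_handler	= my_post,
};

static int __init my_kp_init(void)
{
	my_kp.addr = (kprobe_opcode_t *)0;	/* placeholder: the instruction to probe */
	return register_kprobe(&my_kp);
}

static void __exit my_kp_exit(void)
{
	unregister_kprobe(&my_kp);
}

module_init(my_kp_init);
module_exit(my_kp_exit);
MODULE_LICENSE("GPL");
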
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index f119e098e67b..e0ffe4ab0917 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -8,12 +8,12 @@
8 * 8 *
9 */ 9 */
10 10
11#include <linux/config.h>
12#include <linux/kobject.h> 11#include <linux/kobject.h>
13#include <linux/string.h> 12#include <linux/string.h>
14#include <linux/sysfs.h> 13#include <linux/sysfs.h>
15#include <linux/module.h> 14#include <linux/module.h>
16#include <linux/init.h> 15#include <linux/init.h>
16#include <linux/kexec.h>
17 17
18#define KERNEL_ATTR_RO(_name) \ 18#define KERNEL_ATTR_RO(_name) \
19static struct subsys_attribute _name##_attr = __ATTR_RO(_name) 19static struct subsys_attribute _name##_attr = __ATTR_RO(_name)
@@ -48,6 +48,20 @@ static ssize_t uevent_helper_store(struct subsystem *subsys, const char *page, s
48KERNEL_ATTR_RW(uevent_helper); 48KERNEL_ATTR_RW(uevent_helper);
49#endif 49#endif
50 50
51#ifdef CONFIG_KEXEC
52static ssize_t kexec_loaded_show(struct subsystem *subsys, char *page)
53{
54 return sprintf(page, "%d\n", !!kexec_image);
55}
56KERNEL_ATTR_RO(kexec_loaded);
57
58static ssize_t kexec_crash_loaded_show(struct subsystem *subsys, char *page)
59{
60 return sprintf(page, "%d\n", !!kexec_crash_image);
61}
62KERNEL_ATTR_RO(kexec_crash_loaded);
63#endif /* CONFIG_KEXEC */
64
51decl_subsys(kernel, NULL, NULL); 65decl_subsys(kernel, NULL, NULL);
52EXPORT_SYMBOL_GPL(kernel_subsys); 66EXPORT_SYMBOL_GPL(kernel_subsys);
53 67
@@ -56,6 +70,10 @@ static struct attribute * kernel_attrs[] = {
56 &uevent_seqnum_attr.attr, 70 &uevent_seqnum_attr.attr,
57 &uevent_helper_attr.attr, 71 &uevent_helper_attr.attr,
58#endif 72#endif
73#ifdef CONFIG_KEXEC
74 &kexec_loaded_attr.attr,
75 &kexec_crash_loaded_attr.attr,
76#endif
59 NULL 77 NULL
60}; 78};
61 79
diff --git a/kernel/kthread.c b/kernel/kthread.c
index c5f3c6613b6d..24be714b04c7 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -45,6 +45,13 @@ struct kthread_stop_info
45static DEFINE_MUTEX(kthread_stop_lock); 45static DEFINE_MUTEX(kthread_stop_lock);
46static struct kthread_stop_info kthread_stop_info; 46static struct kthread_stop_info kthread_stop_info;
47 47
48/**
49 * kthread_should_stop - should this kthread return now?
50 *
51 * When someone calls kthread_stop on your kthread, it will be woken
52 * and this will return true. You should then return, and your return
53 * value will be passed through to kthread_stop().
54 */
48int kthread_should_stop(void) 55int kthread_should_stop(void)
49{ 56{
50 return (kthread_stop_info.k == current); 57 return (kthread_stop_info.k == current);
@@ -122,6 +129,25 @@ static void keventd_create_kthread(void *_create)
122 complete(&create->done); 129 complete(&create->done);
123} 130}
124 131
132/**
133 * kthread_create - create a kthread.
134 * @threadfn: the function to run until signal_pending(current).
135 * @data: data ptr for @threadfn.
136 * @namefmt: printf-style name for the thread.
137 *
138 * Description: This helper function creates and names a kernel
139 * thread. The thread will be stopped: use wake_up_process() to start
140 * it. See also kthread_run(), kthread_create_on_cpu().
141 *
142 * When woken, the thread will run @threadfn() with @data as its
143 * argument. @threadfn can either call do_exit() directly if it is a
144 * standalone thread for which no one will call kthread_stop(), or
145 * return when 'kthread_should_stop()' is true (which means
146 * kthread_stop() has been called). The return value should be zero
147 * or a negative error number; it will be passed to kthread_stop().
148 *
149 * Returns a task_struct or ERR_PTR(-ENOMEM).
150 */
125struct task_struct *kthread_create(int (*threadfn)(void *data), 151struct task_struct *kthread_create(int (*threadfn)(void *data),
126 void *data, 152 void *data,
127 const char namefmt[], 153 const char namefmt[],
@@ -156,6 +182,15 @@ struct task_struct *kthread_create(int (*threadfn)(void *data),
156} 182}
157EXPORT_SYMBOL(kthread_create); 183EXPORT_SYMBOL(kthread_create);
158 184
185/**
186 * kthread_bind - bind a just-created kthread to a cpu.
187 * @k: thread created by kthread_create().
188 * @cpu: cpu (might not be online, must be possible) for @k to run on.
189 *
190 * Description: This function is equivalent to set_cpus_allowed(),
191 * except that @cpu doesn't need to be online, and the thread must be
192 * stopped (i.e., just returned from kthread_create()).
193 */
159void kthread_bind(struct task_struct *k, unsigned int cpu) 194void kthread_bind(struct task_struct *k, unsigned int cpu)
160{ 195{
161 BUG_ON(k->state != TASK_INTERRUPTIBLE); 196 BUG_ON(k->state != TASK_INTERRUPTIBLE);
@@ -166,12 +201,36 @@ void kthread_bind(struct task_struct *k, unsigned int cpu)
166} 201}
167EXPORT_SYMBOL(kthread_bind); 202EXPORT_SYMBOL(kthread_bind);
168 203
204/**
205 * kthread_stop - stop a thread created by kthread_create().
206 * @k: thread created by kthread_create().
207 *
208 * Sets kthread_should_stop() for @k to return true, wakes it, and
209 * waits for it to exit. Your threadfn() must not call do_exit()
210 * itself if you use this function! This can also be called after
211 * kthread_create() instead of calling wake_up_process(): the thread
212 * will exit without calling threadfn().
213 *
214 * Returns the result of threadfn(), or %-EINTR if wake_up_process()
215 * was never called.
216 */
169int kthread_stop(struct task_struct *k) 217int kthread_stop(struct task_struct *k)
170{ 218{
171 return kthread_stop_sem(k, NULL); 219 return kthread_stop_sem(k, NULL);
172} 220}
173EXPORT_SYMBOL(kthread_stop); 221EXPORT_SYMBOL(kthread_stop);
174 222
223/**
224 * kthread_stop_sem - stop a thread created by kthread_create().
225 * @k: thread created by kthread_create().
226 * @s: semaphore that @k waits on while idle.
227 *
228 * Does essentially the same thing as kthread_stop() above, but wakes
229 * @k by calling up(@s).
230 *
231 * Returns the result of threadfn(), or %-EINTR if wake_up_process()
232 * was never called.
233 */
175int kthread_stop_sem(struct task_struct *k, struct semaphore *s) 234int kthread_stop_sem(struct task_struct *k, struct semaphore *s)
176{ 235{
177 int ret; 236 int ret;
@@ -210,5 +269,5 @@ static __init int helper_init(void)
210 269
211 return 0; 270 return 0;
212} 271}
213core_initcall(helper_init);
214 272
273core_initcall(helper_init);
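
The kernel-doc added above spells out the create/wake/stop contract. Below is a minimal sketch of a caller that follows it; the thread function, the task pointer and the 100 ms poll interval are invented for illustration and are not part of the patch.

#include <linux/kthread.h>
#include <linux/sched.h>
#include <linux/delay.h>
#include <linux/err.h>

static struct task_struct *my_task;	/* illustrative name */

static int my_thread_fn(void *data)
{
	while (!kthread_should_stop()) {
		/* periodic work goes here */
		msleep(100);
	}
	return 0;			/* handed back by kthread_stop() */
}

static int my_start(void)
{
	my_task = kthread_create(my_thread_fn, NULL, "my_thread");
	if (IS_ERR(my_task))
		return PTR_ERR(my_task);
	wake_up_process(my_task);	/* kthread_create() leaves it stopped */
	return 0;
}

static void my_stop(void)
{
	kthread_stop(my_task);		/* wakes the thread, waits for its return value */
}

Note the thread returns instead of calling do_exit(), precisely because something will later call kthread_stop() on it.
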
diff --git a/kernel/module.c b/kernel/module.c
index bbe04862e1b0..281172f01e9a 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1,4 +1,4 @@
1/* Rewritten by Rusty Russell, on the backs of many others... 1/*
2 Copyright (C) 2002 Richard Henderson 2 Copyright (C) 2002 Richard Henderson
3 Copyright (C) 2001 Rusty Russell, 2002 Rusty Russell IBM. 3 Copyright (C) 2001 Rusty Russell, 2002 Rusty Russell IBM.
4 4
@@ -16,7 +16,6 @@
16 along with this program; if not, write to the Free Software 16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 17 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
18*/ 18*/
19#include <linux/config.h>
20#include <linux/module.h> 19#include <linux/module.h>
21#include <linux/moduleloader.h> 20#include <linux/moduleloader.h>
22#include <linux/init.h> 21#include <linux/init.h>
@@ -40,9 +39,11 @@
40#include <linux/string.h> 39#include <linux/string.h>
41#include <linux/sched.h> 40#include <linux/sched.h>
42#include <linux/mutex.h> 41#include <linux/mutex.h>
42#include <linux/unwind.h>
43#include <asm/uaccess.h> 43#include <asm/uaccess.h>
44#include <asm/semaphore.h> 44#include <asm/semaphore.h>
45#include <asm/cacheflush.h> 45#include <asm/cacheflush.h>
46#include <linux/license.h>
46 47
47#if 0 48#if 0
48#define DEBUGP printk 49#define DEBUGP printk
@@ -120,9 +121,17 @@ extern const struct kernel_symbol __start___ksymtab_gpl[];
120extern const struct kernel_symbol __stop___ksymtab_gpl[]; 121extern const struct kernel_symbol __stop___ksymtab_gpl[];
121extern const struct kernel_symbol __start___ksymtab_gpl_future[]; 122extern const struct kernel_symbol __start___ksymtab_gpl_future[];
122extern const struct kernel_symbol __stop___ksymtab_gpl_future[]; 123extern const struct kernel_symbol __stop___ksymtab_gpl_future[];
124extern const struct kernel_symbol __start___ksymtab_unused[];
125extern const struct kernel_symbol __stop___ksymtab_unused[];
126extern const struct kernel_symbol __start___ksymtab_unused_gpl[];
127extern const struct kernel_symbol __stop___ksymtab_unused_gpl[];
128extern const struct kernel_symbol __start___ksymtab_gpl_future[];
129extern const struct kernel_symbol __stop___ksymtab_gpl_future[];
123extern const unsigned long __start___kcrctab[]; 130extern const unsigned long __start___kcrctab[];
124extern const unsigned long __start___kcrctab_gpl[]; 131extern const unsigned long __start___kcrctab_gpl[];
125extern const unsigned long __start___kcrctab_gpl_future[]; 132extern const unsigned long __start___kcrctab_gpl_future[];
133extern const unsigned long __start___kcrctab_unused[];
134extern const unsigned long __start___kcrctab_unused_gpl[];
126 135
127#ifndef CONFIG_MODVERSIONS 136#ifndef CONFIG_MODVERSIONS
128#define symversion(base, idx) NULL 137#define symversion(base, idx) NULL
@@ -142,6 +151,17 @@ static const struct kernel_symbol *lookup_symbol(const char *name,
142 return NULL; 151 return NULL;
143} 152}
144 153
154static void printk_unused_warning(const char *name)
155{
156 printk(KERN_WARNING "Symbol %s is marked as UNUSED, "
157 "however this module is using it.\n", name);
158 printk(KERN_WARNING "This symbol will go away in the future.\n");
159 printk(KERN_WARNING "Please evalute if this is the right api to use, "
160 "and if it really is, submit a report the linux kernel "
161 "mailinglist together with submitting your code for "
162 "inclusion.\n");
163}
164
145/* Find a symbol, return value, crc and module which owns it */ 165/* Find a symbol, return value, crc and module which owns it */
146static unsigned long __find_symbol(const char *name, 166static unsigned long __find_symbol(const char *name,
147 struct module **owner, 167 struct module **owner,
@@ -184,6 +204,25 @@ static unsigned long __find_symbol(const char *name,
184 return ks->value; 204 return ks->value;
185 } 205 }
186 206
207 ks = lookup_symbol(name, __start___ksymtab_unused,
208 __stop___ksymtab_unused);
209 if (ks) {
210 printk_unused_warning(name);
211 *crc = symversion(__start___kcrctab_unused,
212 (ks - __start___ksymtab_unused));
213 return ks->value;
214 }
215
216 if (gplok)
217 ks = lookup_symbol(name, __start___ksymtab_unused_gpl,
218 __stop___ksymtab_unused_gpl);
219 if (ks) {
220 printk_unused_warning(name);
221 *crc = symversion(__start___kcrctab_unused_gpl,
222 (ks - __start___ksymtab_unused_gpl));
223 return ks->value;
224 }
225
187 /* Now try modules. */ 226 /* Now try modules. */
188 list_for_each_entry(mod, &modules, list) { 227 list_for_each_entry(mod, &modules, list) {
189 *owner = mod; 228 *owner = mod;
@@ -202,6 +241,23 @@ static unsigned long __find_symbol(const char *name,
202 return ks->value; 241 return ks->value;
203 } 242 }
204 } 243 }
244 ks = lookup_symbol(name, mod->unused_syms, mod->unused_syms + mod->num_unused_syms);
245 if (ks) {
246 printk_unused_warning(name);
247 *crc = symversion(mod->unused_crcs, (ks - mod->unused_syms));
248 return ks->value;
249 }
250
251 if (gplok) {
252 ks = lookup_symbol(name, mod->unused_gpl_syms,
253 mod->unused_gpl_syms + mod->num_unused_gpl_syms);
254 if (ks) {
255 printk_unused_warning(name);
256 *crc = symversion(mod->unused_gpl_crcs,
257 (ks - mod->unused_gpl_syms));
258 return ks->value;
259 }
260 }
205 ks = lookup_symbol(name, mod->gpl_future_syms, 261 ks = lookup_symbol(name, mod->gpl_future_syms,
206 (mod->gpl_future_syms + 262 (mod->gpl_future_syms +
207 mod->num_gpl_future_syms)); 263 mod->num_gpl_future_syms));
@@ -1051,6 +1107,8 @@ static void free_module(struct module *mod)
1051 remove_sect_attrs(mod); 1107 remove_sect_attrs(mod);
1052 mod_kobject_remove(mod); 1108 mod_kobject_remove(mod);
1053 1109
1110 unwind_remove_table(mod->unwind_info, 0);
1111
1054 /* Arch-specific cleanup. */ 1112 /* Arch-specific cleanup. */
1055 module_arch_cleanup(mod); 1113 module_arch_cleanup(mod);
1056 1114
@@ -1248,16 +1306,6 @@ static void layout_sections(struct module *mod,
1248 } 1306 }
1249} 1307}
1250 1308
1251static inline int license_is_gpl_compatible(const char *license)
1252{
1253 return (strcmp(license, "GPL") == 0
1254 || strcmp(license, "GPL v2") == 0
1255 || strcmp(license, "GPL and additional rights") == 0
1256 || strcmp(license, "Dual BSD/GPL") == 0
1257 || strcmp(license, "Dual MIT/GPL") == 0
1258 || strcmp(license, "Dual MPL/GPL") == 0);
1259}
1260
1261static void set_license(struct module *mod, const char *license) 1309static void set_license(struct module *mod, const char *license)
1262{ 1310{
1263 if (!license) 1311 if (!license)
@@ -1326,7 +1374,7 @@ int is_exported(const char *name, const struct module *mod)
1326 if (!mod && lookup_symbol(name, __start___ksymtab, __stop___ksymtab)) 1374 if (!mod && lookup_symbol(name, __start___ksymtab, __stop___ksymtab))
1327 return 1; 1375 return 1;
1328 else 1376 else
1329 if (lookup_symbol(name, mod->syms, mod->syms + mod->num_syms)) 1377 if (mod && lookup_symbol(name, mod->syms, mod->syms + mod->num_syms))
1330 return 1; 1378 return 1;
1331 else 1379 else
1332 return 0; 1380 return 0;
@@ -1409,10 +1457,27 @@ static struct module *load_module(void __user *umod,
1409 Elf_Ehdr *hdr; 1457 Elf_Ehdr *hdr;
1410 Elf_Shdr *sechdrs; 1458 Elf_Shdr *sechdrs;
1411 char *secstrings, *args, *modmagic, *strtab = NULL; 1459 char *secstrings, *args, *modmagic, *strtab = NULL;
1412 unsigned int i, symindex = 0, strindex = 0, setupindex, exindex, 1460 unsigned int i;
1413 exportindex, modindex, obsparmindex, infoindex, gplindex, 1461 unsigned int symindex = 0;
1414 crcindex, gplcrcindex, versindex, pcpuindex, gplfutureindex, 1462 unsigned int strindex = 0;
1415 gplfuturecrcindex; 1463 unsigned int setupindex;
1464 unsigned int exindex;
1465 unsigned int exportindex;
1466 unsigned int modindex;
1467 unsigned int obsparmindex;
1468 unsigned int infoindex;
1469 unsigned int gplindex;
1470 unsigned int crcindex;
1471 unsigned int gplcrcindex;
1472 unsigned int versindex;
1473 unsigned int pcpuindex;
1474 unsigned int gplfutureindex;
1475 unsigned int gplfuturecrcindex;
1476 unsigned int unwindex = 0;
1477 unsigned int unusedindex;
1478 unsigned int unusedcrcindex;
1479 unsigned int unusedgplindex;
1480 unsigned int unusedgplcrcindex;
1416 struct module *mod; 1481 struct module *mod;
1417 long err = 0; 1482 long err = 0;
1418 void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */ 1483 void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */
@@ -1493,15 +1558,22 @@ static struct module *load_module(void __user *umod,
1493 exportindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab"); 1558 exportindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab");
1494 gplindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_gpl"); 1559 gplindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_gpl");
1495 gplfutureindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_gpl_future"); 1560 gplfutureindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_gpl_future");
1561 unusedindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_unused");
1562 unusedgplindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_unused_gpl");
1496 crcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab"); 1563 crcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab");
1497 gplcrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_gpl"); 1564 gplcrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_gpl");
1498 gplfuturecrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_gpl_future"); 1565 gplfuturecrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_gpl_future");
1566 unusedcrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_unused");
1567 unusedgplcrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_unused_gpl");
1499 setupindex = find_sec(hdr, sechdrs, secstrings, "__param"); 1568 setupindex = find_sec(hdr, sechdrs, secstrings, "__param");
1500 exindex = find_sec(hdr, sechdrs, secstrings, "__ex_table"); 1569 exindex = find_sec(hdr, sechdrs, secstrings, "__ex_table");
1501 obsparmindex = find_sec(hdr, sechdrs, secstrings, "__obsparm"); 1570 obsparmindex = find_sec(hdr, sechdrs, secstrings, "__obsparm");
1502 versindex = find_sec(hdr, sechdrs, secstrings, "__versions"); 1571 versindex = find_sec(hdr, sechdrs, secstrings, "__versions");
1503 infoindex = find_sec(hdr, sechdrs, secstrings, ".modinfo"); 1572 infoindex = find_sec(hdr, sechdrs, secstrings, ".modinfo");
1504 pcpuindex = find_pcpusec(hdr, sechdrs, secstrings); 1573 pcpuindex = find_pcpusec(hdr, sechdrs, secstrings);
1574#ifdef ARCH_UNWIND_SECTION_NAME
1575 unwindex = find_sec(hdr, sechdrs, secstrings, ARCH_UNWIND_SECTION_NAME);
1576#endif
1505 1577
1506 /* Don't keep modinfo section */ 1578 /* Don't keep modinfo section */
1507 sechdrs[infoindex].sh_flags &= ~(unsigned long)SHF_ALLOC; 1579 sechdrs[infoindex].sh_flags &= ~(unsigned long)SHF_ALLOC;
@@ -1510,6 +1582,8 @@ static struct module *load_module(void __user *umod,
1510 sechdrs[symindex].sh_flags |= SHF_ALLOC; 1582 sechdrs[symindex].sh_flags |= SHF_ALLOC;
1511 sechdrs[strindex].sh_flags |= SHF_ALLOC; 1583 sechdrs[strindex].sh_flags |= SHF_ALLOC;
1512#endif 1584#endif
1585 if (unwindex)
1586 sechdrs[unwindex].sh_flags |= SHF_ALLOC;
1513 1587
1514 /* Check module struct version now, before we try to use module. */ 1588 /* Check module struct version now, before we try to use module. */
1515 if (!check_modstruct_version(sechdrs, versindex, mod)) { 1589 if (!check_modstruct_version(sechdrs, versindex, mod)) {
@@ -1639,14 +1713,27 @@ static struct module *load_module(void __user *umod,
1639 mod->gpl_crcs = (void *)sechdrs[gplcrcindex].sh_addr; 1713 mod->gpl_crcs = (void *)sechdrs[gplcrcindex].sh_addr;
1640 mod->num_gpl_future_syms = sechdrs[gplfutureindex].sh_size / 1714 mod->num_gpl_future_syms = sechdrs[gplfutureindex].sh_size /
1641 sizeof(*mod->gpl_future_syms); 1715 sizeof(*mod->gpl_future_syms);
1716 mod->num_unused_syms = sechdrs[unusedindex].sh_size /
1717 sizeof(*mod->unused_syms);
1718 mod->num_unused_gpl_syms = sechdrs[unusedgplindex].sh_size /
1719 sizeof(*mod->unused_gpl_syms);
1642 mod->gpl_future_syms = (void *)sechdrs[gplfutureindex].sh_addr; 1720 mod->gpl_future_syms = (void *)sechdrs[gplfutureindex].sh_addr;
1643 if (gplfuturecrcindex) 1721 if (gplfuturecrcindex)
1644 mod->gpl_future_crcs = (void *)sechdrs[gplfuturecrcindex].sh_addr; 1722 mod->gpl_future_crcs = (void *)sechdrs[gplfuturecrcindex].sh_addr;
1645 1723
1724 mod->unused_syms = (void *)sechdrs[unusedindex].sh_addr;
1725 if (unusedcrcindex)
1726 mod->unused_crcs = (void *)sechdrs[unusedcrcindex].sh_addr;
1727 mod->unused_gpl_syms = (void *)sechdrs[unusedgplindex].sh_addr;
1728 if (unusedgplcrcindex)
1729 mod->unused_crcs = (void *)sechdrs[unusedgplcrcindex].sh_addr;
1730
1646#ifdef CONFIG_MODVERSIONS 1731#ifdef CONFIG_MODVERSIONS
1647 if ((mod->num_syms && !crcindex) || 1732 if ((mod->num_syms && !crcindex) ||
1648 (mod->num_gpl_syms && !gplcrcindex) || 1733 (mod->num_gpl_syms && !gplcrcindex) ||
1649 (mod->num_gpl_future_syms && !gplfuturecrcindex)) { 1734 (mod->num_gpl_future_syms && !gplfuturecrcindex) ||
1735 (mod->num_unused_syms && !unusedcrcindex) ||
1736 (mod->num_unused_gpl_syms && !unusedgplcrcindex)) {
1650 printk(KERN_WARNING "%s: No versions for exported symbols." 1737 printk(KERN_WARNING "%s: No versions for exported symbols."
1651 " Tainting kernel.\n", mod->name); 1738 " Tainting kernel.\n", mod->name);
1652 add_taint(TAINT_FORCED_MODULE); 1739 add_taint(TAINT_FORCED_MODULE);
@@ -1738,6 +1825,11 @@ static struct module *load_module(void __user *umod,
1738 goto arch_cleanup; 1825 goto arch_cleanup;
1739 add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs); 1826 add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs);
1740 1827
1828 /* Size of section 0 is 0, so this works well if no unwind info. */
1829 mod->unwind_info = unwind_add_table(mod,
1830 (void *)sechdrs[unwindex].sh_addr,
1831 sechdrs[unwindex].sh_size);
1832
1741 /* Get rid of temporary copy */ 1833 /* Get rid of temporary copy */
1742 vfree(hdr); 1834 vfree(hdr);
1743 1835
@@ -1836,6 +1928,7 @@ sys_init_module(void __user *umod,
1836 mod->state = MODULE_STATE_LIVE; 1928 mod->state = MODULE_STATE_LIVE;
1837 /* Drop initial reference. */ 1929 /* Drop initial reference. */
1838 module_put(mod); 1930 module_put(mod);
1931 unwind_remove_table(mod->unwind_info, 1);
1839 module_free(mod, mod->module_init); 1932 module_free(mod, mod->module_init);
1840 mod->module_init = NULL; 1933 mod->module_init = NULL;
1841 mod->init_size = 0; 1934 mod->init_size = 0;
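
The new __ksymtab_unused and __ksymtab_unused_gpl lookups above reuse the same linear range scan as the other export tables (presumably populated by matching EXPORT_UNUSED_SYMBOL* macros on the producer side). A self-contained sketch of that scan with simplified types, purely to illustrate the shape of lookup_symbol() over a start/stop pair:

#include <string.h>

struct sym {				/* simplified stand-in for struct kernel_symbol */
	unsigned long value;
	const char *name;
};

static unsigned long find_in_range(const char *name,
				   const struct sym *start,
				   const struct sym *stop)
{
	const struct sym *s;

	for (s = start; s < stop; s++)
		if (strcmp(s->name, name) == 0)
			return s->value;
	return 0;			/* not found */
}

Each export table keeps its CRC array parallel to the symbol array, which is why the hunk indexes the crc tables with (ks - __start___ksymtab_unused) and friends.
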
diff --git a/kernel/mutex-debug.c b/kernel/mutex-debug.c
index f4913c376950..e38e4bac97ca 100644
--- a/kernel/mutex-debug.c
+++ b/kernel/mutex-debug.c
@@ -16,6 +16,7 @@
16#include <linux/sched.h> 16#include <linux/sched.h>
17#include <linux/delay.h> 17#include <linux/delay.h>
18#include <linux/module.h> 18#include <linux/module.h>
19#include <linux/poison.h>
19#include <linux/spinlock.h> 20#include <linux/spinlock.h>
20#include <linux/kallsyms.h> 21#include <linux/kallsyms.h>
21#include <linux/interrupt.h> 22#include <linux/interrupt.h>
@@ -153,13 +154,13 @@ next:
153 continue; 154 continue;
154 count++; 155 count++;
155 cursor = curr->next; 156 cursor = curr->next;
156 debug_spin_lock_restore(&debug_mutex_lock, flags); 157 debug_spin_unlock_restore(&debug_mutex_lock, flags);
157 158
158 printk("\n#%03d: ", count); 159 printk("\n#%03d: ", count);
159 printk_lock(lock, filter ? 0 : 1); 160 printk_lock(lock, filter ? 0 : 1);
160 goto next; 161 goto next;
161 } 162 }
162 debug_spin_lock_restore(&debug_mutex_lock, flags); 163 debug_spin_unlock_restore(&debug_mutex_lock, flags);
163 printk("\n"); 164 printk("\n");
164} 165}
165 166
@@ -316,7 +317,7 @@ void mutex_debug_check_no_locks_held(struct task_struct *task)
316 continue; 317 continue;
317 list_del_init(curr); 318 list_del_init(curr);
318 DEBUG_OFF(); 319 DEBUG_OFF();
319 debug_spin_lock_restore(&debug_mutex_lock, flags); 320 debug_spin_unlock_restore(&debug_mutex_lock, flags);
320 321
321 printk("BUG: %s/%d, lock held at task exit time!\n", 322 printk("BUG: %s/%d, lock held at task exit time!\n",
322 task->comm, task->pid); 323 task->comm, task->pid);
@@ -325,7 +326,7 @@ void mutex_debug_check_no_locks_held(struct task_struct *task)
325 printk("exiting task is not even the owner??\n"); 326 printk("exiting task is not even the owner??\n");
326 return; 327 return;
327 } 328 }
328 debug_spin_lock_restore(&debug_mutex_lock, flags); 329 debug_spin_unlock_restore(&debug_mutex_lock, flags);
329} 330}
330 331
331/* 332/*
@@ -352,7 +353,7 @@ void mutex_debug_check_no_locks_freed(const void *from, unsigned long len)
352 continue; 353 continue;
353 list_del_init(curr); 354 list_del_init(curr);
354 DEBUG_OFF(); 355 DEBUG_OFF();
355 debug_spin_lock_restore(&debug_mutex_lock, flags); 356 debug_spin_unlock_restore(&debug_mutex_lock, flags);
356 357
357 printk("BUG: %s/%d, active lock [%p(%p-%p)] freed!\n", 358 printk("BUG: %s/%d, active lock [%p(%p-%p)] freed!\n",
358 current->comm, current->pid, lock, from, to); 359 current->comm, current->pid, lock, from, to);
@@ -362,7 +363,7 @@ void mutex_debug_check_no_locks_freed(const void *from, unsigned long len)
362 printk("freeing task is not even the owner??\n"); 363 printk("freeing task is not even the owner??\n");
363 return; 364 return;
364 } 365 }
365 debug_spin_lock_restore(&debug_mutex_lock, flags); 366 debug_spin_unlock_restore(&debug_mutex_lock, flags);
366} 367}
367 368
368/* 369/*
@@ -381,7 +382,7 @@ void debug_mutex_set_owner(struct mutex *lock,
381 382
382void debug_mutex_init_waiter(struct mutex_waiter *waiter) 383void debug_mutex_init_waiter(struct mutex_waiter *waiter)
383{ 384{
384 memset(waiter, 0x11, sizeof(*waiter)); 385 memset(waiter, MUTEX_DEBUG_INIT, sizeof(*waiter));
385 waiter->magic = waiter; 386 waiter->magic = waiter;
386 INIT_LIST_HEAD(&waiter->list); 387 INIT_LIST_HEAD(&waiter->list);
387} 388}
@@ -397,7 +398,7 @@ void debug_mutex_wake_waiter(struct mutex *lock, struct mutex_waiter *waiter)
397void debug_mutex_free_waiter(struct mutex_waiter *waiter) 398void debug_mutex_free_waiter(struct mutex_waiter *waiter)
398{ 399{
399 DEBUG_WARN_ON(!list_empty(&waiter->list)); 400 DEBUG_WARN_ON(!list_empty(&waiter->list));
400 memset(waiter, 0x22, sizeof(*waiter)); 401 memset(waiter, MUTEX_DEBUG_FREE, sizeof(*waiter));
401} 402}
402 403
403void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter, 404void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter,
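
The memset() changes above replace open-coded 0x11/0x22 bytes with named constants from the new <linux/poison.h>. A tiny sketch of the poisoning idea, with the constants defined locally to the values the old code used (the real definitions live in poison.h and the structure below is invented):

#include <string.h>

#define MUTEX_DEBUG_INIT 0x11		/* mirrors the old open-coded memset values */
#define MUTEX_DEBUG_FREE 0x22

struct waiter {				/* illustrative structure */
	void *magic;
	int pad;
};

static void init_waiter(struct waiter *w)
{
	memset(w, MUTEX_DEBUG_INIT, sizeof(*w));
	w->magic = w;			/* self-pointer marks a live, initialised waiter */
}

static void free_waiter(struct waiter *w)
{
	memset(w, MUTEX_DEBUG_FREE, sizeof(*w));	/* stale users now see a known pattern */
}

Centralising the bytes in one header keeps crash-dump heuristics consistent across subsystems.
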
diff --git a/kernel/mutex-debug.h b/kernel/mutex-debug.h
index fd384050acb1..a5196c36a5fd 100644
--- a/kernel/mutex-debug.h
+++ b/kernel/mutex-debug.h
@@ -46,21 +46,6 @@ extern void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter,
46extern void debug_mutex_unlock(struct mutex *lock); 46extern void debug_mutex_unlock(struct mutex *lock);
47extern void debug_mutex_init(struct mutex *lock, const char *name); 47extern void debug_mutex_init(struct mutex *lock, const char *name);
48 48
49#define debug_spin_lock(lock) \
50 do { \
51 local_irq_disable(); \
52 if (debug_mutex_on) \
53 spin_lock(lock); \
54 } while (0)
55
56#define debug_spin_unlock(lock) \
57 do { \
58 if (debug_mutex_on) \
59 spin_unlock(lock); \
60 local_irq_enable(); \
61 preempt_check_resched(); \
62 } while (0)
63
64#define debug_spin_lock_save(lock, flags) \ 49#define debug_spin_lock_save(lock, flags) \
65 do { \ 50 do { \
66 local_irq_save(flags); \ 51 local_irq_save(flags); \
@@ -68,7 +53,7 @@ extern void debug_mutex_init(struct mutex *lock, const char *name);
68 spin_lock(lock); \ 53 spin_lock(lock); \
69 } while (0) 54 } while (0)
70 55
71#define debug_spin_lock_restore(lock, flags) \ 56#define debug_spin_unlock_restore(lock, flags) \
72 do { \ 57 do { \
73 if (debug_mutex_on) \ 58 if (debug_mutex_on) \
74 spin_unlock(lock); \ 59 spin_unlock(lock); \
@@ -76,20 +61,20 @@ extern void debug_mutex_init(struct mutex *lock, const char *name);
76 preempt_check_resched(); \ 61 preempt_check_resched(); \
77 } while (0) 62 } while (0)
78 63
79#define spin_lock_mutex(lock) \ 64#define spin_lock_mutex(lock, flags) \
80 do { \ 65 do { \
81 struct mutex *l = container_of(lock, struct mutex, wait_lock); \ 66 struct mutex *l = container_of(lock, struct mutex, wait_lock); \
82 \ 67 \
83 DEBUG_WARN_ON(in_interrupt()); \ 68 DEBUG_WARN_ON(in_interrupt()); \
84 debug_spin_lock(&debug_mutex_lock); \ 69 debug_spin_lock_save(&debug_mutex_lock, flags); \
85 spin_lock(lock); \ 70 spin_lock(lock); \
86 DEBUG_WARN_ON(l->magic != l); \ 71 DEBUG_WARN_ON(l->magic != l); \
87 } while (0) 72 } while (0)
88 73
89#define spin_unlock_mutex(lock) \ 74#define spin_unlock_mutex(lock, flags) \
90 do { \ 75 do { \
91 spin_unlock(lock); \ 76 spin_unlock(lock); \
92 debug_spin_unlock(&debug_mutex_lock); \ 77 debug_spin_unlock_restore(&debug_mutex_lock, flags); \
93 } while (0) 78 } while (0)
94 79
95#define DEBUG_OFF() \ 80#define DEBUG_OFF() \
diff --git a/kernel/mutex.c b/kernel/mutex.c
index 5449b210d9ed..7043db21bbce 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -125,10 +125,11 @@ __mutex_lock_common(struct mutex *lock, long state __IP_DECL__)
125 struct task_struct *task = current; 125 struct task_struct *task = current;
126 struct mutex_waiter waiter; 126 struct mutex_waiter waiter;
127 unsigned int old_val; 127 unsigned int old_val;
128 unsigned long flags;
128 129
129 debug_mutex_init_waiter(&waiter); 130 debug_mutex_init_waiter(&waiter);
130 131
131 spin_lock_mutex(&lock->wait_lock); 132 spin_lock_mutex(&lock->wait_lock, flags);
132 133
133 debug_mutex_add_waiter(lock, &waiter, task->thread_info, ip); 134 debug_mutex_add_waiter(lock, &waiter, task->thread_info, ip);
134 135
@@ -157,7 +158,7 @@ __mutex_lock_common(struct mutex *lock, long state __IP_DECL__)
157 if (unlikely(state == TASK_INTERRUPTIBLE && 158 if (unlikely(state == TASK_INTERRUPTIBLE &&
158 signal_pending(task))) { 159 signal_pending(task))) {
159 mutex_remove_waiter(lock, &waiter, task->thread_info); 160 mutex_remove_waiter(lock, &waiter, task->thread_info);
160 spin_unlock_mutex(&lock->wait_lock); 161 spin_unlock_mutex(&lock->wait_lock, flags);
161 162
162 debug_mutex_free_waiter(&waiter); 163 debug_mutex_free_waiter(&waiter);
163 return -EINTR; 164 return -EINTR;
@@ -165,9 +166,9 @@ __mutex_lock_common(struct mutex *lock, long state __IP_DECL__)
165 __set_task_state(task, state); 166 __set_task_state(task, state);
166 167
167 /* didnt get the lock, go to sleep: */ 168 /* didnt get the lock, go to sleep: */
168 spin_unlock_mutex(&lock->wait_lock); 169 spin_unlock_mutex(&lock->wait_lock, flags);
169 schedule(); 170 schedule();
170 spin_lock_mutex(&lock->wait_lock); 171 spin_lock_mutex(&lock->wait_lock, flags);
171 } 172 }
172 173
173 /* got the lock - rejoice! */ 174 /* got the lock - rejoice! */
@@ -178,7 +179,7 @@ __mutex_lock_common(struct mutex *lock, long state __IP_DECL__)
178 if (likely(list_empty(&lock->wait_list))) 179 if (likely(list_empty(&lock->wait_list)))
179 atomic_set(&lock->count, 0); 180 atomic_set(&lock->count, 0);
180 181
181 spin_unlock_mutex(&lock->wait_lock); 182 spin_unlock_mutex(&lock->wait_lock, flags);
182 183
183 debug_mutex_free_waiter(&waiter); 184 debug_mutex_free_waiter(&waiter);
184 185
@@ -203,10 +204,11 @@ static fastcall noinline void
203__mutex_unlock_slowpath(atomic_t *lock_count __IP_DECL__) 204__mutex_unlock_slowpath(atomic_t *lock_count __IP_DECL__)
204{ 205{
205 struct mutex *lock = container_of(lock_count, struct mutex, count); 206 struct mutex *lock = container_of(lock_count, struct mutex, count);
207 unsigned long flags;
206 208
207 DEBUG_WARN_ON(lock->owner != current_thread_info()); 209 DEBUG_WARN_ON(lock->owner != current_thread_info());
208 210
209 spin_lock_mutex(&lock->wait_lock); 211 spin_lock_mutex(&lock->wait_lock, flags);
210 212
211 /* 213 /*
212 * some architectures leave the lock unlocked in the fastpath failure 214 * some architectures leave the lock unlocked in the fastpath failure
@@ -231,7 +233,7 @@ __mutex_unlock_slowpath(atomic_t *lock_count __IP_DECL__)
231 233
232 debug_mutex_clear_owner(lock); 234 debug_mutex_clear_owner(lock);
233 235
234 spin_unlock_mutex(&lock->wait_lock); 236 spin_unlock_mutex(&lock->wait_lock, flags);
235} 237}
236 238
237/* 239/*
@@ -276,9 +278,10 @@ __mutex_lock_interruptible_slowpath(atomic_t *lock_count __IP_DECL__)
276static inline int __mutex_trylock_slowpath(atomic_t *lock_count) 278static inline int __mutex_trylock_slowpath(atomic_t *lock_count)
277{ 279{
278 struct mutex *lock = container_of(lock_count, struct mutex, count); 280 struct mutex *lock = container_of(lock_count, struct mutex, count);
281 unsigned long flags;
279 int prev; 282 int prev;
280 283
281 spin_lock_mutex(&lock->wait_lock); 284 spin_lock_mutex(&lock->wait_lock, flags);
282 285
283 prev = atomic_xchg(&lock->count, -1); 286 prev = atomic_xchg(&lock->count, -1);
284 if (likely(prev == 1)) 287 if (likely(prev == 1))
@@ -287,7 +290,7 @@ static inline int __mutex_trylock_slowpath(atomic_t *lock_count)
287 if (likely(list_empty(&lock->wait_list))) 290 if (likely(list_empty(&lock->wait_list)))
288 atomic_set(&lock->count, 0); 291 atomic_set(&lock->count, 0);
289 292
290 spin_unlock_mutex(&lock->wait_lock); 293 spin_unlock_mutex(&lock->wait_lock, flags);
291 294
292 return prev == 1; 295 return prev == 1;
293} 296}
diff --git a/kernel/mutex.h b/kernel/mutex.h
index 00fe84e7b672..069189947257 100644
--- a/kernel/mutex.h
+++ b/kernel/mutex.h
@@ -9,8 +9,10 @@
9 * !CONFIG_DEBUG_MUTEXES case. Most of them are NOPs: 9 * !CONFIG_DEBUG_MUTEXES case. Most of them are NOPs:
10 */ 10 */
11 11
12#define spin_lock_mutex(lock) spin_lock(lock) 12#define spin_lock_mutex(lock, flags) \
13#define spin_unlock_mutex(lock) spin_unlock(lock) 13 do { spin_lock(lock); (void)(flags); } while (0)
14#define spin_unlock_mutex(lock, flags) \
15 do { spin_unlock(lock); (void)(flags); } while (0)
14#define mutex_remove_waiter(lock, waiter, ti) \ 16#define mutex_remove_waiter(lock, waiter, ti) \
15 __list_del((waiter)->list.prev, (waiter)->list.next) 17 __list_del((waiter)->list.prev, (waiter)->list.next)
16 18
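
The mutex changes above thread an explicit flags argument through spin_lock_mutex()/spin_unlock_mutex() because the debug variant now saves and restores the interrupt state instead of unconditionally enabling IRQs on unlock. The saved state has to live in the caller's frame, which is the usual spin_lock_irqsave() pattern; a minimal standalone illustration (the lock and function below are made up):

#include <linux/spinlock.h>

static DEFINE_SPINLOCK(my_lock);	/* illustrative lock, not from the patch */

static void touch_shared_state(void)
{
	unsigned long flags;

	spin_lock_irqsave(&my_lock, flags);	/* disable IRQs, remember prior state */
	/* ... critical section ... */
	spin_unlock_irqrestore(&my_lock, flags);	/* restore, rather than force-enable */
}

Restoring rather than enabling matters when the caller itself was already running with interrupts disabled.
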
diff --git a/kernel/panic.c b/kernel/panic.c
index cc2a4c9c36ac..ab13f0f668b5 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -8,7 +8,6 @@
8 * This function is used through-out the kernel (including mm and fs) 8 * This function is used through-out the kernel (including mm and fs)
9 * to indicate a major problem. 9 * to indicate a major problem.
10 */ 10 */
11#include <linux/config.h>
12#include <linux/module.h> 11#include <linux/module.h>
13#include <linux/sched.h> 12#include <linux/sched.h>
14#include <linux/delay.h> 13#include <linux/delay.h>
diff --git a/kernel/params.c b/kernel/params.c
index af43ecdc8d9b..91aea7aa532e 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -15,7 +15,6 @@
15 along with this program; if not, write to the Free Software 15 along with this program; if not, write to the Free Software
16 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 16 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17*/ 17*/
18#include <linux/config.h>
19#include <linux/moduleparam.h> 18#include <linux/moduleparam.h>
20#include <linux/kernel.h> 19#include <linux/kernel.h>
21#include <linux/string.h> 20#include <linux/string.h>
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index ce0dfb8f4a4e..ae44a70aae8a 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -36,6 +36,24 @@ config PM_DEBUG
36 code. This is helpful when debugging and reporting various PM bugs, 36 code. This is helpful when debugging and reporting various PM bugs,
37 like suspend support. 37 like suspend support.
38 38
39config PM_TRACE
40 bool "Suspend/resume event tracing"
41 depends on PM && PM_DEBUG && X86_32 && EXPERIMENTAL
42 default n
43 ---help---
44 This enables some cheesy code to save the last PM event point in the
45 RTC across reboots, so that you can debug a machine that just hangs
46 during suspend (or more commonly, during resume).
47
48 To use this debugging feature you should attempt to suspend the machine,
49 then reboot it, then run
50
51 dmesg -s 1000000 | grep 'hash matches'
52
53 CAUTION: this option will cause your machine's real-time clock to be
54 set to an invalid time after a resume.
55
56
39config SOFTWARE_SUSPEND 57config SOFTWARE_SUSPEND
40 bool "Software Suspend" 58 bool "Software Suspend"
41 depends on PM && SWAP && (X86 && (!SMP || SUSPEND_SMP)) || ((FRV || PPC32) && !SMP) 59 depends on PM && SWAP && (X86 && (!SMP || SUSPEND_SMP)) || ((FRV || PPC32) && !SMP)
@@ -82,18 +100,6 @@ config PM_STD_PARTITION
82 suspended image to. It will simply pick the first available swap 100 suspended image to. It will simply pick the first available swap
83 device. 101 device.
84 102
85config SWSUSP_ENCRYPT
86 bool "Encrypt suspend image"
87 depends on SOFTWARE_SUSPEND && CRYPTO=y && (CRYPTO_AES=y || CRYPTO_AES_586=y || CRYPTO_AES_X86_64=y)
88 default ""
89 ---help---
90 To prevent data gathering from swap after resume you can encrypt
91 the suspend image with a temporary key that is deleted on
92 resume.
93
94 Note that the temporary key is stored unencrypted on disk while the
95 system is suspended.
96
97config SUSPEND_SMP 103config SUSPEND_SMP
98 bool 104 bool
99 depends on HOTPLUG_CPU && X86 && PM 105 depends on HOTPLUG_CPU && X86 && PM
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index 81d4d982f3f0..e13e74067845 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -231,7 +231,7 @@ static int software_resume(void)
231late_initcall(software_resume); 231late_initcall(software_resume);
232 232
233 233
234static char * pm_disk_modes[] = { 234static const char * const pm_disk_modes[] = {
235 [PM_DISK_FIRMWARE] = "firmware", 235 [PM_DISK_FIRMWARE] = "firmware",
236 [PM_DISK_PLATFORM] = "platform", 236 [PM_DISK_PLATFORM] = "platform",
237 [PM_DISK_SHUTDOWN] = "shutdown", 237 [PM_DISK_SHUTDOWN] = "shutdown",
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 0a907f0dc56b..6d295c776794 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -15,7 +15,7 @@
15#include <linux/errno.h> 15#include <linux/errno.h>
16#include <linux/init.h> 16#include <linux/init.h>
17#include <linux/pm.h> 17#include <linux/pm.h>
18 18#include <linux/console.h>
19 19
20#include "power.h" 20#include "power.h"
21 21
@@ -145,7 +145,7 @@ static void suspend_finish(suspend_state_t state)
145 145
146 146
147 147
148static char *pm_states[PM_SUSPEND_MAX] = { 148static const char * const pm_states[PM_SUSPEND_MAX] = {
149 [PM_SUSPEND_STANDBY] = "standby", 149 [PM_SUSPEND_STANDBY] = "standby",
150 [PM_SUSPEND_MEM] = "mem", 150 [PM_SUSPEND_MEM] = "mem",
151#ifdef CONFIG_SOFTWARE_SUSPEND 151#ifdef CONFIG_SOFTWARE_SUSPEND
@@ -262,7 +262,7 @@ static ssize_t state_show(struct subsystem * subsys, char * buf)
262static ssize_t state_store(struct subsystem * subsys, const char * buf, size_t n) 262static ssize_t state_store(struct subsystem * subsys, const char * buf, size_t n)
263{ 263{
264 suspend_state_t state = PM_SUSPEND_STANDBY; 264 suspend_state_t state = PM_SUSPEND_STANDBY;
265 char ** s; 265 const char * const *s;
266 char *p; 266 char *p;
267 int error; 267 int error;
268 int len; 268 int len;
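
Constifying pm_disk_modes[] and pm_states[] lets the tables live in read-only data and makes the matching loop in state_store() walk them through a const char * const * cursor. A small sketch of that matching idiom with a made-up table:

#include <string.h>

static const char * const modes[] = { "standby", "mem", "disk" };	/* illustrative */

static int mode_index(const char *buf, size_t len)
{
	const char * const *s;

	for (s = modes; s < modes + sizeof(modes) / sizeof(modes[0]); s++)
		if (*s && strlen(*s) == len && !strncmp(buf, *s, len))
			return (int)(s - modes);
	return -1;			/* no match */
}

The NULL check mirrors the kernel loop, since sparsely initialised tables may contain empty slots.
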
diff --git a/kernel/power/power.h b/kernel/power/power.h
index f06f12f21767..57a792982fb9 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -55,7 +55,7 @@ struct snapshot_handle {
55 unsigned int page; 55 unsigned int page;
56 unsigned int page_offset; 56 unsigned int page_offset;
57 unsigned int prev; 57 unsigned int prev;
58 struct pbe *pbe; 58 struct pbe *pbe, *last_pbe;
59 void *buffer; 59 void *buffer;
60 unsigned int buf_offset; 60 unsigned int buf_offset;
61}; 61};
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 3eeedbb13b78..24c96f354231 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -150,6 +150,10 @@ int restore_highmem(void)
150 } 150 }
151 return 0; 151 return 0;
152} 152}
153#else
154static inline unsigned int count_highmem_pages(void) {return 0;}
155static inline int save_highmem(void) {return 0;}
156static inline int restore_highmem(void) {return 0;}
153#endif 157#endif
154 158
155static int pfn_is_nosave(unsigned long pfn) 159static int pfn_is_nosave(unsigned long pfn)
@@ -293,62 +297,29 @@ static inline void create_pbe_list(struct pbe *pblist, unsigned int nr_pages)
293 } 297 }
294} 298}
295 299
296/** 300static unsigned int unsafe_pages;
297 * On resume it is necessary to trace and eventually free the unsafe
298 * pages that have been allocated, because they are needed for I/O
299 * (on x86-64 we likely will "eat" these pages once again while
300 * creating the temporary page translation tables)
301 */
302
303struct eaten_page {
304 struct eaten_page *next;
305 char padding[PAGE_SIZE - sizeof(void *)];
306};
307
308static struct eaten_page *eaten_pages = NULL;
309
310static void release_eaten_pages(void)
311{
312 struct eaten_page *p, *q;
313
314 p = eaten_pages;
315 while (p) {
316 q = p->next;
317 /* We don't want swsusp_free() to free this page again */
318 ClearPageNosave(virt_to_page(p));
319 free_page((unsigned long)p);
320 p = q;
321 }
322 eaten_pages = NULL;
323}
324 301
325/** 302/**
326 * @safe_needed - on resume, for storing the PBE list and the image, 303 * @safe_needed - on resume, for storing the PBE list and the image,
327 * we can only use memory pages that do not conflict with the pages 304 * we can only use memory pages that do not conflict with the pages
328 * which had been used before suspend. 305 * used before suspend.
329 * 306 *
330 * The unsafe pages are marked with the PG_nosave_free flag 307 * The unsafe pages are marked with the PG_nosave_free flag
331 * 308 * and we count them using unsafe_pages
332 * Allocated but unusable (ie eaten) memory pages should be marked
333 * so that swsusp_free() can release them
334 */ 309 */
335 310
336static inline void *alloc_image_page(gfp_t gfp_mask, int safe_needed) 311static inline void *alloc_image_page(gfp_t gfp_mask, int safe_needed)
337{ 312{
338 void *res; 313 void *res;
339 314
315 res = (void *)get_zeroed_page(gfp_mask);
340 if (safe_needed) 316 if (safe_needed)
341 do { 317 while (res && PageNosaveFree(virt_to_page(res))) {
318 /* The page is unsafe, mark it for swsusp_free() */
319 SetPageNosave(virt_to_page(res));
320 unsafe_pages++;
342 res = (void *)get_zeroed_page(gfp_mask); 321 res = (void *)get_zeroed_page(gfp_mask);
343 if (res && PageNosaveFree(virt_to_page(res))) { 322 }
344 /* This is for swsusp_free() */
345 SetPageNosave(virt_to_page(res));
346 ((struct eaten_page *)res)->next = eaten_pages;
347 eaten_pages = res;
348 }
349 } while (res && PageNosaveFree(virt_to_page(res)));
350 else
351 res = (void *)get_zeroed_page(gfp_mask);
352 if (res) { 323 if (res) {
353 SetPageNosave(virt_to_page(res)); 324 SetPageNosave(virt_to_page(res));
354 SetPageNosaveFree(virt_to_page(res)); 325 SetPageNosaveFree(virt_to_page(res));
@@ -374,7 +345,8 @@ unsigned long get_safe_page(gfp_t gfp_mask)
374 * On each page we set up a list of struct_pbe elements. 345 * On each page we set up a list of struct_pbe elements.
375 */ 346 */
376 347
377struct pbe *alloc_pagedir(unsigned int nr_pages, gfp_t gfp_mask, int safe_needed) 348static struct pbe *alloc_pagedir(unsigned int nr_pages, gfp_t gfp_mask,
349 int safe_needed)
378{ 350{
379 unsigned int num; 351 unsigned int num;
380 struct pbe *pblist, *pbe; 352 struct pbe *pblist, *pbe;
@@ -642,6 +614,8 @@ static int mark_unsafe_pages(struct pbe *pblist)
642 return -EFAULT; 614 return -EFAULT;
643 } 615 }
644 616
617 unsafe_pages = 0;
618
645 return 0; 619 return 0;
646} 620}
647 621
@@ -719,42 +693,99 @@ static inline struct pbe *unpack_orig_addresses(unsigned long *buf,
719} 693}
720 694
721/** 695/**
722 * create_image - use metadata contained in the PBE list 696 * prepare_image - use metadata contained in the PBE list
723 * pointed to by pagedir_nosave to mark the pages that will 697 * pointed to by pagedir_nosave to mark the pages that will
724 * be overwritten in the process of restoring the system 698 * be overwritten in the process of restoring the system
725 * memory state from the image and allocate memory for 699 * memory state from the image ("unsafe" pages) and allocate
726 * the image avoiding these pages 700 * memory for the image
701 *
702 * The idea is to allocate the PBE list first and then
703 * allocate as many pages as are needed for the image data,
704 * but not to assign these pages to the PBEs initially.
705 * Instead, we just mark them as allocated and create a list
706 * of "safe" pages which will be used later
727 */ 707 */
728 708
729static int create_image(struct snapshot_handle *handle) 709struct safe_page {
710 struct safe_page *next;
711 char padding[PAGE_SIZE - sizeof(void *)];
712};
713
714static struct safe_page *safe_pages;
715
716static int prepare_image(struct snapshot_handle *handle)
730{ 717{
731 int error = 0; 718 int error = 0;
732 struct pbe *p, *pblist; 719 unsigned int nr_pages = nr_copy_pages;
720 struct pbe *p, *pblist = NULL;
733 721
734 p = pagedir_nosave; 722 p = pagedir_nosave;
735 error = mark_unsafe_pages(p); 723 error = mark_unsafe_pages(p);
736 if (!error) { 724 if (!error) {
737 pblist = alloc_pagedir(nr_copy_pages, GFP_ATOMIC, 1); 725 pblist = alloc_pagedir(nr_pages, GFP_ATOMIC, 1);
738 if (pblist) 726 if (pblist)
739 copy_page_backup_list(pblist, p); 727 copy_page_backup_list(pblist, p);
740 free_pagedir(p, 0); 728 free_pagedir(p, 0);
741 if (!pblist) 729 if (!pblist)
742 error = -ENOMEM; 730 error = -ENOMEM;
743 } 731 }
744 if (!error) 732 safe_pages = NULL;
745 error = alloc_data_pages(pblist, GFP_ATOMIC, 1); 733 if (!error && nr_pages > unsafe_pages) {
734 nr_pages -= unsafe_pages;
735 while (nr_pages--) {
736 struct safe_page *ptr;
737
738 ptr = (struct safe_page *)get_zeroed_page(GFP_ATOMIC);
739 if (!ptr) {
740 error = -ENOMEM;
741 break;
742 }
743 if (!PageNosaveFree(virt_to_page(ptr))) {
744 /* The page is "safe", add it to the list */
745 ptr->next = safe_pages;
746 safe_pages = ptr;
747 }
748 /* Mark the page as allocated */
749 SetPageNosave(virt_to_page(ptr));
750 SetPageNosaveFree(virt_to_page(ptr));
751 }
752 }
746 if (!error) { 753 if (!error) {
747 release_eaten_pages();
748 pagedir_nosave = pblist; 754 pagedir_nosave = pblist;
749 } else { 755 } else {
750 pagedir_nosave = NULL;
751 handle->pbe = NULL; 756 handle->pbe = NULL;
752 nr_copy_pages = 0; 757 swsusp_free();
753 nr_meta_pages = 0;
754 } 758 }
755 return error; 759 return error;
756} 760}
757 761
762static void *get_buffer(struct snapshot_handle *handle)
763{
764 struct pbe *pbe = handle->pbe, *last = handle->last_pbe;
765 struct page *page = virt_to_page(pbe->orig_address);
766
767 if (PageNosave(page) && PageNosaveFree(page)) {
768 /*
769 * We have allocated the "original" page frame and we can
770 * use it directly to store the read page
771 */
772 pbe->address = 0;
773 if (last && last->next)
774 last->next = NULL;
775 return (void *)pbe->orig_address;
776 }
777 /*
778 * The "original" page frame has not been allocated and we have to
779 * use a "safe" page frame to store the read page
780 */
781 pbe->address = (unsigned long)safe_pages;
782 safe_pages = safe_pages->next;
783 if (last)
784 last->next = pbe;
785 handle->last_pbe = pbe;
786 return (void *)pbe->address;
787}
788
758/** 789/**
759 * snapshot_write_next - used for writing the system memory snapshot. 790 * snapshot_write_next - used for writing the system memory snapshot.
760 * 791 *
@@ -799,15 +830,16 @@ int snapshot_write_next(struct snapshot_handle *handle, size_t count)
799 } else if (handle->prev <= nr_meta_pages) { 830 } else if (handle->prev <= nr_meta_pages) {
800 handle->pbe = unpack_orig_addresses(buffer, handle->pbe); 831 handle->pbe = unpack_orig_addresses(buffer, handle->pbe);
801 if (!handle->pbe) { 832 if (!handle->pbe) {
802 error = create_image(handle); 833 error = prepare_image(handle);
803 if (error) 834 if (error)
804 return error; 835 return error;
805 handle->pbe = pagedir_nosave; 836 handle->pbe = pagedir_nosave;
806 handle->buffer = (void *)handle->pbe->address; 837 handle->last_pbe = NULL;
838 handle->buffer = get_buffer(handle);
807 } 839 }
808 } else { 840 } else {
809 handle->pbe = handle->pbe->next; 841 handle->pbe = handle->pbe->next;
810 handle->buffer = (void *)handle->pbe->address; 842 handle->buffer = get_buffer(handle);
811 } 843 }
812 handle->prev = handle->page; 844 handle->prev = handle->page;
813 } 845 }
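
prepare_image() above pre-allocates page-sized buffers, threads the "safe" ones through their own first word, and get_buffer() later pops one whenever the original page frame is not yet available. A self-contained userspace sketch of that free-list bookkeeping, with malloc() standing in for get_zeroed_page() and 4096 assumed for PAGE_SIZE:

#include <stdlib.h>

#define PAGE_SIZE 4096			/* assumption for the sketch */

struct safe_page {
	struct safe_page *next;
	char padding[PAGE_SIZE - sizeof(void *)];
};

static struct safe_page *safe_pages;

static int prealloc_safe_pages(unsigned int n)
{
	while (n--) {
		struct safe_page *p = malloc(sizeof(*p));

		if (!p)
			return -1;
		p->next = safe_pages;	/* push onto the free list */
		safe_pages = p;
	}
	return 0;
}

static void *pop_safe_page(void)
{
	struct safe_page *p = safe_pages;

	if (!p)
		return NULL;
	safe_pages = p->next;		/* pop */
	return p;
}

In the kernel version the pages are additionally flagged PG_nosave/PG_nosave_free so that swsusp_free() can find and release them again.
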
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index c4016cbbd3e0..17f669c83012 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -67,9 +67,9 @@ unsigned int count_highmem_pages(void);
67int save_highmem(void); 67int save_highmem(void);
68int restore_highmem(void); 68int restore_highmem(void);
69#else 69#else
70static int save_highmem(void) { return 0; } 70static inline int save_highmem(void) { return 0; }
71static int restore_highmem(void) { return 0; } 71static inline int restore_highmem(void) { return 0; }
72static unsigned int count_highmem_pages(void) { return 0; } 72static inline unsigned int count_highmem_pages(void) { return 0; }
73#endif 73#endif
74 74
75/** 75/**
@@ -175,6 +175,12 @@ void free_all_swap_pages(int swap, struct bitmap_page *bitmap)
175 */ 175 */
176 176
177#define SHRINK_BITE 10000 177#define SHRINK_BITE 10000
178static inline unsigned long __shrink_memory(long tmp)
179{
180 if (tmp > SHRINK_BITE)
181 tmp = SHRINK_BITE;
182 return shrink_all_memory(tmp);
183}
178 184
179int swsusp_shrink_memory(void) 185int swsusp_shrink_memory(void)
180{ 186{
@@ -192,15 +198,17 @@ int swsusp_shrink_memory(void)
192 PAGES_FOR_IO; 198 PAGES_FOR_IO;
193 tmp = size; 199 tmp = size;
194 for_each_zone (zone) 200 for_each_zone (zone)
195 if (!is_highmem(zone)) 201 if (!is_highmem(zone) && populated_zone(zone)) {
196 tmp -= zone->free_pages; 202 tmp -= zone->free_pages;
203 tmp += zone->lowmem_reserve[ZONE_NORMAL];
204 }
197 if (tmp > 0) { 205 if (tmp > 0) {
198 tmp = shrink_all_memory(SHRINK_BITE); 206 tmp = __shrink_memory(tmp);
199 if (!tmp) 207 if (!tmp)
200 return -ENOMEM; 208 return -ENOMEM;
201 pages += tmp; 209 pages += tmp;
202 } else if (size > image_size / PAGE_SIZE) { 210 } else if (size > image_size / PAGE_SIZE) {
203 tmp = shrink_all_memory(SHRINK_BITE); 211 tmp = __shrink_memory(size - (image_size / PAGE_SIZE));
204 pages += tmp; 212 pages += tmp;
205 } 213 }
206 printk("\b%c", p[i++%4]); 214 printk("\b%c", p[i++%4]);
diff --git a/kernel/printk.c b/kernel/printk.c
index 416b8f3fb265..9772b9e8feee 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -24,8 +24,8 @@
24#include <linux/console.h> 24#include <linux/console.h>
25#include <linux/init.h> 25#include <linux/init.h>
26#include <linux/module.h> 26#include <linux/module.h>
27#include <linux/moduleparam.h>
27#include <linux/interrupt.h> /* For in_interrupt() */ 28#include <linux/interrupt.h> /* For in_interrupt() */
28#include <linux/config.h>
29#include <linux/delay.h> 29#include <linux/delay.h>
30#include <linux/smp.h> 30#include <linux/smp.h>
31#include <linux/security.h> 31#include <linux/security.h>
@@ -327,7 +327,9 @@ static void __call_console_drivers(unsigned long start, unsigned long end)
327 struct console *con; 327 struct console *con;
328 328
329 for (con = console_drivers; con; con = con->next) { 329 for (con = console_drivers; con; con = con->next) {
330 if ((con->flags & CON_ENABLED) && con->write) 330 if ((con->flags & CON_ENABLED) && con->write &&
331 (cpu_online(smp_processor_id()) ||
332 (con->flags & CON_ANYTIME)))
331 con->write(con, &LOG_BUF(start), end - start); 333 con->write(con, &LOG_BUF(start), end - start);
332 } 334 }
333} 335}
@@ -437,6 +439,7 @@ static int printk_time = 1;
437#else 439#else
438static int printk_time = 0; 440static int printk_time = 0;
439#endif 441#endif
442module_param(printk_time, int, S_IRUGO | S_IWUSR);
440 443
441static int __init printk_time_setup(char *str) 444static int __init printk_time_setup(char *str)
442{ 445{
@@ -453,6 +456,18 @@ __attribute__((weak)) unsigned long long printk_clock(void)
453 return sched_clock(); 456 return sched_clock();
454} 457}
455 458
459/* Check if we have any console registered that can be called early in boot. */
460static int have_callable_console(void)
461{
462 struct console *con;
463
464 for (con = console_drivers; con; con = con->next)
465 if (con->flags & CON_ANYTIME)
466 return 1;
467
468 return 0;
469}
470
456/** 471/**
457 * printk - print a kernel message 472 * printk - print a kernel message
458 * @fmt: format string 473 * @fmt: format string
@@ -566,27 +581,29 @@ asmlinkage int vprintk(const char *fmt, va_list args)
566 log_level_unknown = 1; 581 log_level_unknown = 1;
567 } 582 }
568 583
569 if (!cpu_online(smp_processor_id())) { 584 if (!down_trylock(&console_sem)) {
570 /* 585 /*
571 * Some console drivers may assume that per-cpu resources have 586 * We own the drivers. We can drop the spinlock and
572 * been allocated. So don't allow them to be called by this 587 * let release_console_sem() print the text, maybe ...
573 * CPU until it is officially up. We shouldn't be calling into
574 * random console drivers on a CPU which doesn't exist yet..
575 */ 588 */
589 console_locked = 1;
576 printk_cpu = UINT_MAX; 590 printk_cpu = UINT_MAX;
577 spin_unlock_irqrestore(&logbuf_lock, flags); 591 spin_unlock_irqrestore(&logbuf_lock, flags);
578 goto out; 592
579 }
580 if (!down_trylock(&console_sem)) {
581 console_locked = 1;
582 /* 593 /*
583 * We own the drivers. We can drop the spinlock and let 594 * Console drivers may assume that per-cpu resources have
584 * release_console_sem() print the text 595 * been allocated. So unless they're explicitly marked as
596 * being able to cope (CON_ANYTIME) don't call them until
597 * this CPU is officially up.
585 */ 598 */
586 printk_cpu = UINT_MAX; 599 if (cpu_online(smp_processor_id()) || have_callable_console()) {
587 spin_unlock_irqrestore(&logbuf_lock, flags); 600 console_may_schedule = 0;
588 console_may_schedule = 0; 601 release_console_sem();
589 release_console_sem(); 602 } else {
603 /* Release by hand to avoid flushing the buffer. */
604 console_locked = 0;
605 up(&console_sem);
606 }
590 } else { 607 } else {
591 /* 608 /*
592 * Someone else owns the drivers. We drop the spinlock, which 609 * Someone else owns the drivers. We drop the spinlock, which
@@ -596,7 +613,7 @@ asmlinkage int vprintk(const char *fmt, va_list args)
596 printk_cpu = UINT_MAX; 613 printk_cpu = UINT_MAX;
597 spin_unlock_irqrestore(&logbuf_lock, flags); 614 spin_unlock_irqrestore(&logbuf_lock, flags);
598 } 615 }
599out: 616
600 preempt_enable(); 617 preempt_enable();
601 return printed_len; 618 return printed_len;
602} 619}
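
With the printk() rework above, output is no longer dropped merely because the current CPU is not yet online: consoles that set CON_ANYTIME are still called. A hedged sketch of a console driver opting in (the driver name and the polled-write body are invented):

#include <linux/console.h>
#include <linux/init.h>

static void my_early_write(struct console *con, const char *s, unsigned int n)
{
	/* push characters to a polled device register; no per-cpu state used */
}

static struct console my_early_console = {
	.name	= "myearly",
	.write	= my_early_write,
	.flags	= CON_PRINTBUFFER | CON_ANYTIME,	/* safe before the CPU is online */
	.index	= -1,
};

static int __init my_early_console_init(void)
{
	register_console(&my_early_console);
	return 0;
}
console_initcall(my_early_console_init);

Drivers that do rely on per-cpu resources simply leave CON_ANYTIME clear and keep the old behaviour of being skipped until the CPU is officially up.
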
diff --git a/kernel/profile.c b/kernel/profile.c
index 68afe121e507..d5bd75e7501c 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -13,7 +13,6 @@
13 * to resolve timer interrupt livelocks, William Irwin, Oracle, 2004 13 * to resolve timer interrupt livelocks, William Irwin, Oracle, 2004
14 */ 14 */
15 15
16#include <linux/config.h>
17#include <linux/module.h> 16#include <linux/module.h>
18#include <linux/profile.h> 17#include <linux/profile.h>
19#include <linux/bootmem.h> 18#include <linux/bootmem.h>
@@ -299,7 +298,7 @@ out:
299} 298}
300 299
301#ifdef CONFIG_HOTPLUG_CPU 300#ifdef CONFIG_HOTPLUG_CPU
302static int profile_cpu_callback(struct notifier_block *info, 301static int __devinit profile_cpu_callback(struct notifier_block *info,
303 unsigned long action, void *__cpu) 302 unsigned long action, void *__cpu)
304{ 303{
305 int node, cpu = (unsigned long)__cpu; 304 int node, cpu = (unsigned long)__cpu;
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 921c22ad16e4..335c5b932e14 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -120,8 +120,18 @@ int ptrace_check_attach(struct task_struct *child, int kill)
120 120
121static int may_attach(struct task_struct *task) 121static int may_attach(struct task_struct *task)
122{ 122{
123 if (!task->mm) 123 /* May we inspect the given task?
124 return -EPERM; 124 * This check is used both for attaching with ptrace
125 * and for allowing access to sensitive information in /proc.
126 *
127 * ptrace_attach denies several cases that /proc allows
128 * because setting up the necessary parent/child relationship
129 * or halting the specified task is impossible.
130 */
131 int dumpable = 0;
132 /* Don't let security modules deny introspection */
133 if (task == current)
134 return 0;
125 if (((current->uid != task->euid) || 135 if (((current->uid != task->euid) ||
126 (current->uid != task->suid) || 136 (current->uid != task->suid) ||
127 (current->uid != task->uid) || 137 (current->uid != task->uid) ||
@@ -130,7 +140,9 @@ static int may_attach(struct task_struct *task)
130 (current->gid != task->gid)) && !capable(CAP_SYS_PTRACE)) 140 (current->gid != task->gid)) && !capable(CAP_SYS_PTRACE))
131 return -EPERM; 141 return -EPERM;
132 smp_rmb(); 142 smp_rmb();
133 if (!task->mm->dumpable && !capable(CAP_SYS_PTRACE)) 143 if (task->mm)
144 dumpable = task->mm->dumpable;
145 if (!dumpable && !capable(CAP_SYS_PTRACE))
134 return -EPERM; 146 return -EPERM;
135 147
136 return security_ptrace(current, task); 148 return security_ptrace(current, task);
@@ -176,6 +188,8 @@ repeat:
176 goto repeat; 188 goto repeat;
177 } 189 }
178 190
191 if (!task->mm)
192 goto bad;
179 /* the same process cannot be attached many times */ 193 /* the same process cannot be attached many times */
180 if (task->ptrace & PT_PTRACED) 194 if (task->ptrace & PT_PTRACED)
181 goto bad; 195 goto bad;
@@ -200,7 +214,7 @@ out:
200 return retval; 214 return retval;
201} 215}
202 216
203void __ptrace_detach(struct task_struct *child, unsigned int data) 217static inline void __ptrace_detach(struct task_struct *child, unsigned int data)
204{ 218{
205 child->exit_code = data; 219 child->exit_code = data;
206 /* .. re-parent .. */ 220 /* .. re-parent .. */
@@ -219,6 +233,7 @@ int ptrace_detach(struct task_struct *child, unsigned int data)
219 ptrace_disable(child); 233 ptrace_disable(child);
220 234
221 write_lock_irq(&tasklist_lock); 235 write_lock_irq(&tasklist_lock);
236 /* protect against de_thread()->release_task() */
222 if (child->ptrace) 237 if (child->ptrace)
223 __ptrace_detach(child, data); 238 __ptrace_detach(child, data);
224 write_unlock_irq(&tasklist_lock); 239 write_unlock_irq(&tasklist_lock);
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 2058f88c7bbb..f464f5ae3f11 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -182,6 +182,15 @@ long rcu_batches_completed(void)
182 return rcu_ctrlblk.completed; 182 return rcu_ctrlblk.completed;
183} 183}
184 184
185/*
186 * Return the number of RCU batches processed thus far. Useful
187 * for debug and statistics.
188 */
189long rcu_batches_completed_bh(void)
190{
191 return rcu_bh_ctrlblk.completed;
192}
193
185static void rcu_barrier_callback(struct rcu_head *notused) 194static void rcu_barrier_callback(struct rcu_head *notused)
186{ 195{
187 if (atomic_dec_and_test(&rcu_barrier_cpu_count)) 196 if (atomic_dec_and_test(&rcu_barrier_cpu_count))
@@ -539,7 +548,7 @@ static void __devinit rcu_online_cpu(int cpu)
539 tasklet_init(&per_cpu(rcu_tasklet, cpu), rcu_process_callbacks, 0UL); 548 tasklet_init(&per_cpu(rcu_tasklet, cpu), rcu_process_callbacks, 0UL);
540} 549}
541 550
542static int rcu_cpu_notify(struct notifier_block *self, 551static int __devinit rcu_cpu_notify(struct notifier_block *self,
543 unsigned long action, void *hcpu) 552 unsigned long action, void *hcpu)
544{ 553{
545 long cpu = (long)hcpu; 554 long cpu = (long)hcpu;
@@ -556,7 +565,7 @@ static int rcu_cpu_notify(struct notifier_block *self,
556 return NOTIFY_OK; 565 return NOTIFY_OK;
557} 566}
558 567
559static struct notifier_block rcu_nb = { 568static struct notifier_block __devinitdata rcu_nb = {
560 .notifier_call = rcu_cpu_notify, 569 .notifier_call = rcu_cpu_notify,
561}; 570};
562 571
@@ -612,14 +621,6 @@ void synchronize_rcu(void)
612 wait_for_completion(&rcu.completion); 621 wait_for_completion(&rcu.completion);
613} 622}
614 623
615/*
616 * Deprecated, use synchronize_rcu() or synchronize_sched() instead.
617 */
618void synchronize_kernel(void)
619{
620 synchronize_rcu();
621}
622
623module_param(blimit, int, 0); 624module_param(blimit, int, 0);
624module_param(qhimark, int, 0); 625module_param(qhimark, int, 0);
625module_param(qlowmark, int, 0); 626module_param(qlowmark, int, 0);
@@ -627,7 +628,7 @@ module_param(qlowmark, int, 0);
627module_param(rsinterval, int, 0); 628module_param(rsinterval, int, 0);
628#endif 629#endif
629EXPORT_SYMBOL_GPL(rcu_batches_completed); 630EXPORT_SYMBOL_GPL(rcu_batches_completed);
630EXPORT_SYMBOL_GPL_FUTURE(call_rcu); /* WARNING: GPL-only in April 2006. */ 631EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
631EXPORT_SYMBOL_GPL_FUTURE(call_rcu_bh); /* WARNING: GPL-only in April 2006. */ 632EXPORT_SYMBOL_GPL(call_rcu);
633EXPORT_SYMBOL_GPL(call_rcu_bh);
632EXPORT_SYMBOL_GPL(synchronize_rcu); 634EXPORT_SYMBOL_GPL(synchronize_rcu);
633EXPORT_SYMBOL_GPL_FUTURE(synchronize_kernel); /* WARNING: GPL-only in April 2006. */
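
Besides tightening the exports to GPL-only and dropping synchronize_kernel(), the hunk above exports rcu_batches_completed_bh() for the softirq flavour. For context, a small sketch of the call_rcu_bh() deferred-free pattern that flavour serves; the structure and function names are invented:

#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/kernel.h>

struct item {
	int key;
	struct rcu_head rcu;
};

static void item_free_rcu(struct rcu_head *head)
{
	kfree(container_of(head, struct item, rcu));
}

static void item_release(struct item *it)
{
	/* call after unlinking "it" so rcu_read_lock_bh() readers can no longer find it */
	call_rcu_bh(&it->rcu, item_free_rcu);
}

rcu_batches_completed_bh() gives torture tests and statistics code a way to observe grace-period progress for that flavour, mirroring rcu_batches_completed().
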
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 8154e7589d12..4d1c3d247127 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Read-Copy Update /proc-based torture test facility 2 * Read-Copy Update module-based torture test facility
3 * 3 *
4 * This program is free software; you can redistribute it and/or modify 4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by 5 * it under the terms of the GNU General Public License as published by
@@ -53,6 +53,7 @@ static int stat_interval; /* Interval between stats, in seconds. */
53static int verbose; /* Print more debug info. */ 53static int verbose; /* Print more debug info. */
54static int test_no_idle_hz; /* Test RCU's support for tickless idle CPUs. */ 54static int test_no_idle_hz; /* Test RCU's support for tickless idle CPUs. */
55static int shuffle_interval = 5; /* Interval between shuffles (in sec)*/ 55static int shuffle_interval = 5; /* Interval between shuffles (in sec)*/
56static char *torture_type = "rcu"; /* What to torture. */
56 57
57module_param(nreaders, int, 0); 58module_param(nreaders, int, 0);
58MODULE_PARM_DESC(nreaders, "Number of RCU reader threads"); 59MODULE_PARM_DESC(nreaders, "Number of RCU reader threads");
@@ -64,13 +65,16 @@ module_param(test_no_idle_hz, bool, 0);
64MODULE_PARM_DESC(test_no_idle_hz, "Test support for tickless idle CPUs"); 65MODULE_PARM_DESC(test_no_idle_hz, "Test support for tickless idle CPUs");
65module_param(shuffle_interval, int, 0); 66module_param(shuffle_interval, int, 0);
66MODULE_PARM_DESC(shuffle_interval, "Number of seconds between shuffles"); 67MODULE_PARM_DESC(shuffle_interval, "Number of seconds between shuffles");
67#define TORTURE_FLAG "rcutorture: " 68module_param(torture_type, charp, 0);
69MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh)");
70
71#define TORTURE_FLAG "-torture:"
68#define PRINTK_STRING(s) \ 72#define PRINTK_STRING(s) \
69 do { printk(KERN_ALERT TORTURE_FLAG s "\n"); } while (0) 73 do { printk(KERN_ALERT "%s" TORTURE_FLAG s "\n", torture_type); } while (0)
70#define VERBOSE_PRINTK_STRING(s) \ 74#define VERBOSE_PRINTK_STRING(s) \
71 do { if (verbose) printk(KERN_ALERT TORTURE_FLAG s "\n"); } while (0) 75 do { if (verbose) printk(KERN_ALERT "%s" TORTURE_FLAG s "\n", torture_type); } while (0)
72#define VERBOSE_PRINTK_ERRSTRING(s) \ 76#define VERBOSE_PRINTK_ERRSTRING(s) \
73 do { if (verbose) printk(KERN_ALERT TORTURE_FLAG "!!! " s "\n"); } while (0) 77 do { if (verbose) printk(KERN_ALERT "%s" TORTURE_FLAG "!!! " s "\n", torture_type); } while (0)
74 78
75static char printk_buf[4096]; 79static char printk_buf[4096];
76 80
@@ -139,28 +143,6 @@ rcu_torture_free(struct rcu_torture *p)
139 spin_unlock_bh(&rcu_torture_lock); 143 spin_unlock_bh(&rcu_torture_lock);
140} 144}
141 145
142static void
143rcu_torture_cb(struct rcu_head *p)
144{
145 int i;
146 struct rcu_torture *rp = container_of(p, struct rcu_torture, rtort_rcu);
147
148 if (fullstop) {
149 /* Test is ending, just drop callbacks on the floor. */
150 /* The next initialization will pick up the pieces. */
151 return;
152 }
153 i = rp->rtort_pipe_count;
154 if (i > RCU_TORTURE_PIPE_LEN)
155 i = RCU_TORTURE_PIPE_LEN;
156 atomic_inc(&rcu_torture_wcount[i]);
157 if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) {
158 rp->rtort_mbtest = 0;
159 rcu_torture_free(rp);
160 } else
161 call_rcu(p, rcu_torture_cb);
162}
163
164struct rcu_random_state { 146struct rcu_random_state {
165 unsigned long rrs_state; 147 unsigned long rrs_state;
166 unsigned long rrs_count; 148 unsigned long rrs_count;
@@ -191,6 +173,119 @@ rcu_random(struct rcu_random_state *rrsp)
191} 173}
192 174
193/* 175/*
176 * Operations vector for selecting different types of tests.
177 */
178
179struct rcu_torture_ops {
180 void (*init)(void);
181 void (*cleanup)(void);
182 int (*readlock)(void);
183 void (*readunlock)(int idx);
184 int (*completed)(void);
185 void (*deferredfree)(struct rcu_torture *p);
186 int (*stats)(char *page);
187 char *name;
188};
189static struct rcu_torture_ops *cur_ops = NULL;
190
191/*
192 * Definitions for rcu torture testing.
193 */
194
195static int rcu_torture_read_lock(void)
196{
197 rcu_read_lock();
198 return 0;
199}
200
201static void rcu_torture_read_unlock(int idx)
202{
203 rcu_read_unlock();
204}
205
206static int rcu_torture_completed(void)
207{
208 return rcu_batches_completed();
209}
210
211static void
212rcu_torture_cb(struct rcu_head *p)
213{
214 int i;
215 struct rcu_torture *rp = container_of(p, struct rcu_torture, rtort_rcu);
216
217 if (fullstop) {
218 /* Test is ending, just drop callbacks on the floor. */
219 /* The next initialization will pick up the pieces. */
220 return;
221 }
222 i = rp->rtort_pipe_count;
223 if (i > RCU_TORTURE_PIPE_LEN)
224 i = RCU_TORTURE_PIPE_LEN;
225 atomic_inc(&rcu_torture_wcount[i]);
226 if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) {
227 rp->rtort_mbtest = 0;
228 rcu_torture_free(rp);
229 } else
230 cur_ops->deferredfree(rp);
231}
232
233static void rcu_torture_deferred_free(struct rcu_torture *p)
234{
235 call_rcu(&p->rtort_rcu, rcu_torture_cb);
236}
237
238static struct rcu_torture_ops rcu_ops = {
239 .init = NULL,
240 .cleanup = NULL,
241 .readlock = rcu_torture_read_lock,
242 .readunlock = rcu_torture_read_unlock,
243 .completed = rcu_torture_completed,
244 .deferredfree = rcu_torture_deferred_free,
245 .stats = NULL,
246 .name = "rcu"
247};
248
249/*
250 * Definitions for rcu_bh torture testing.
251 */
252
253static int rcu_bh_torture_read_lock(void)
254{
255 rcu_read_lock_bh();
256 return 0;
257}
258
259static void rcu_bh_torture_read_unlock(int idx)
260{
261 rcu_read_unlock_bh();
262}
263
264static int rcu_bh_torture_completed(void)
265{
266 return rcu_batches_completed_bh();
267}
268
269static void rcu_bh_torture_deferred_free(struct rcu_torture *p)
270{
271 call_rcu_bh(&p->rtort_rcu, rcu_torture_cb);
272}
273
274static struct rcu_torture_ops rcu_bh_ops = {
275 .init = NULL,
276 .cleanup = NULL,
277 .readlock = rcu_bh_torture_read_lock,
278 .readunlock = rcu_bh_torture_read_unlock,
279 .completed = rcu_bh_torture_completed,
280 .deferredfree = rcu_bh_torture_deferred_free,
281 .stats = NULL,
282 .name = "rcu_bh"
283};
284
285static struct rcu_torture_ops *torture_ops[] =
286 { &rcu_ops, &rcu_bh_ops, NULL };
287
288/*
194 * RCU torture writer kthread. Repeatedly substitutes a new structure 289 * RCU torture writer kthread. Repeatedly substitutes a new structure
195 * for that pointed to by rcu_torture_current, freeing the old structure 290 * for that pointed to by rcu_torture_current, freeing the old structure
196 * after a series of grace periods (the "pipeline"). 291 * after a series of grace periods (the "pipeline").
@@ -209,8 +304,6 @@ rcu_torture_writer(void *arg)
209 304
210 do { 305 do {
211 schedule_timeout_uninterruptible(1); 306 schedule_timeout_uninterruptible(1);
212 if (rcu_batches_completed() == oldbatch)
213 continue;
214 if ((rp = rcu_torture_alloc()) == NULL) 307 if ((rp = rcu_torture_alloc()) == NULL)
215 continue; 308 continue;
216 rp->rtort_pipe_count = 0; 309 rp->rtort_pipe_count = 0;
@@ -225,10 +318,10 @@ rcu_torture_writer(void *arg)
225 i = RCU_TORTURE_PIPE_LEN; 318 i = RCU_TORTURE_PIPE_LEN;
226 atomic_inc(&rcu_torture_wcount[i]); 319 atomic_inc(&rcu_torture_wcount[i]);
227 old_rp->rtort_pipe_count++; 320 old_rp->rtort_pipe_count++;
228 call_rcu(&old_rp->rtort_rcu, rcu_torture_cb); 321 cur_ops->deferredfree(old_rp);
229 } 322 }
230 rcu_torture_current_version++; 323 rcu_torture_current_version++;
231 oldbatch = rcu_batches_completed(); 324 oldbatch = cur_ops->completed();
232 } while (!kthread_should_stop() && !fullstop); 325 } while (!kthread_should_stop() && !fullstop);
233 VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping"); 326 VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping");
234 while (!kthread_should_stop()) 327 while (!kthread_should_stop())
@@ -246,6 +339,7 @@ static int
246rcu_torture_reader(void *arg) 339rcu_torture_reader(void *arg)
247{ 340{
248 int completed; 341 int completed;
342 int idx;
249 DEFINE_RCU_RANDOM(rand); 343 DEFINE_RCU_RANDOM(rand);
250 struct rcu_torture *p; 344 struct rcu_torture *p;
251 int pipe_count; 345 int pipe_count;
@@ -254,12 +348,12 @@ rcu_torture_reader(void *arg)
254 set_user_nice(current, 19); 348 set_user_nice(current, 19);
255 349
256 do { 350 do {
257 rcu_read_lock(); 351 idx = cur_ops->readlock();
258 completed = rcu_batches_completed(); 352 completed = cur_ops->completed();
259 p = rcu_dereference(rcu_torture_current); 353 p = rcu_dereference(rcu_torture_current);
260 if (p == NULL) { 354 if (p == NULL) {
261 /* Wait for rcu_torture_writer to get underway */ 355 /* Wait for rcu_torture_writer to get underway */
262 rcu_read_unlock(); 356 cur_ops->readunlock(idx);
263 schedule_timeout_interruptible(HZ); 357 schedule_timeout_interruptible(HZ);
264 continue; 358 continue;
265 } 359 }
@@ -273,14 +367,14 @@ rcu_torture_reader(void *arg)
273 pipe_count = RCU_TORTURE_PIPE_LEN; 367 pipe_count = RCU_TORTURE_PIPE_LEN;
274 } 368 }
275 ++__get_cpu_var(rcu_torture_count)[pipe_count]; 369 ++__get_cpu_var(rcu_torture_count)[pipe_count];
276 completed = rcu_batches_completed() - completed; 370 completed = cur_ops->completed() - completed;
277 if (completed > RCU_TORTURE_PIPE_LEN) { 371 if (completed > RCU_TORTURE_PIPE_LEN) {
278 /* Should not happen, but... */ 372 /* Should not happen, but... */
279 completed = RCU_TORTURE_PIPE_LEN; 373 completed = RCU_TORTURE_PIPE_LEN;
280 } 374 }
281 ++__get_cpu_var(rcu_torture_batch)[completed]; 375 ++__get_cpu_var(rcu_torture_batch)[completed];
282 preempt_enable(); 376 preempt_enable();
283 rcu_read_unlock(); 377 cur_ops->readunlock(idx);
284 schedule(); 378 schedule();
285 } while (!kthread_should_stop() && !fullstop); 379 } while (!kthread_should_stop() && !fullstop);
286 VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping"); 380 VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping");
@@ -311,7 +405,7 @@ rcu_torture_printk(char *page)
311 if (pipesummary[i] != 0) 405 if (pipesummary[i] != 0)
312 break; 406 break;
313 } 407 }
314 cnt += sprintf(&page[cnt], "rcutorture: "); 408 cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG);
315 cnt += sprintf(&page[cnt], 409 cnt += sprintf(&page[cnt],
316 "rtc: %p ver: %ld tfle: %d rta: %d rtaf: %d rtf: %d " 410 "rtc: %p ver: %ld tfle: %d rta: %d rtaf: %d rtf: %d "
317 "rtmbe: %d", 411 "rtmbe: %d",
@@ -324,7 +418,7 @@ rcu_torture_printk(char *page)
324 atomic_read(&n_rcu_torture_mberror)); 418 atomic_read(&n_rcu_torture_mberror));
325 if (atomic_read(&n_rcu_torture_mberror) != 0) 419 if (atomic_read(&n_rcu_torture_mberror) != 0)
326 cnt += sprintf(&page[cnt], " !!!"); 420 cnt += sprintf(&page[cnt], " !!!");
327 cnt += sprintf(&page[cnt], "\nrcutorture: "); 421 cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG);
328 if (i > 1) { 422 if (i > 1) {
329 cnt += sprintf(&page[cnt], "!!! "); 423 cnt += sprintf(&page[cnt], "!!! ");
330 atomic_inc(&n_rcu_torture_error); 424 atomic_inc(&n_rcu_torture_error);
@@ -332,17 +426,19 @@ rcu_torture_printk(char *page)
332 cnt += sprintf(&page[cnt], "Reader Pipe: "); 426 cnt += sprintf(&page[cnt], "Reader Pipe: ");
333 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) 427 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++)
334 cnt += sprintf(&page[cnt], " %ld", pipesummary[i]); 428 cnt += sprintf(&page[cnt], " %ld", pipesummary[i]);
335 cnt += sprintf(&page[cnt], "\nrcutorture: "); 429 cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG);
336 cnt += sprintf(&page[cnt], "Reader Batch: "); 430 cnt += sprintf(&page[cnt], "Reader Batch: ");
337 for (i = 0; i < RCU_TORTURE_PIPE_LEN; i++) 431 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++)
338 cnt += sprintf(&page[cnt], " %ld", batchsummary[i]); 432 cnt += sprintf(&page[cnt], " %ld", batchsummary[i]);
339 cnt += sprintf(&page[cnt], "\nrcutorture: "); 433 cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG);
340 cnt += sprintf(&page[cnt], "Free-Block Circulation: "); 434 cnt += sprintf(&page[cnt], "Free-Block Circulation: ");
341 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { 435 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) {
342 cnt += sprintf(&page[cnt], " %d", 436 cnt += sprintf(&page[cnt], " %d",
343 atomic_read(&rcu_torture_wcount[i])); 437 atomic_read(&rcu_torture_wcount[i]));
344 } 438 }
345 cnt += sprintf(&page[cnt], "\n"); 439 cnt += sprintf(&page[cnt], "\n");
440 if (cur_ops->stats != NULL)
441 cnt += cur_ops->stats(&page[cnt]);
346 return cnt; 442 return cnt;
347} 443}
348 444
@@ -444,11 +540,11 @@ rcu_torture_shuffle(void *arg)
444static inline void 540static inline void
445rcu_torture_print_module_parms(char *tag) 541rcu_torture_print_module_parms(char *tag)
446{ 542{
447 printk(KERN_ALERT TORTURE_FLAG "--- %s: nreaders=%d " 543 printk(KERN_ALERT "%s" TORTURE_FLAG "--- %s: nreaders=%d "
448 "stat_interval=%d verbose=%d test_no_idle_hz=%d " 544 "stat_interval=%d verbose=%d test_no_idle_hz=%d "
449 "shuffle_interval = %d\n", 545 "shuffle_interval = %d\n",
450 tag, nrealreaders, stat_interval, verbose, test_no_idle_hz, 546 torture_type, tag, nrealreaders, stat_interval, verbose,
451 shuffle_interval); 547 test_no_idle_hz, shuffle_interval);
452} 548}
453 549
454static void 550static void
@@ -493,6 +589,9 @@ rcu_torture_cleanup(void)
493 rcu_barrier(); 589 rcu_barrier();
494 590
495 rcu_torture_stats_print(); /* -After- the stats thread is stopped! */ 591 rcu_torture_stats_print(); /* -After- the stats thread is stopped! */
592
593 if (cur_ops->cleanup != NULL)
594 cur_ops->cleanup();
496 if (atomic_read(&n_rcu_torture_error)) 595 if (atomic_read(&n_rcu_torture_error))
497 rcu_torture_print_module_parms("End of test: FAILURE"); 596 rcu_torture_print_module_parms("End of test: FAILURE");
498 else 597 else
@@ -508,6 +607,20 @@ rcu_torture_init(void)
508 607
509 /* Process args and tell the world that the torturer is on the job. */ 608 /* Process args and tell the world that the torturer is on the job. */
510 609
610 for (i = 0; cur_ops = torture_ops[i], cur_ops != NULL; i++) {
611 cur_ops = torture_ops[i];
612 if (strcmp(torture_type, cur_ops->name) == 0) {
613 break;
614 }
615 }
616 if (cur_ops == NULL) {
617 printk(KERN_ALERT "rcutorture: invalid torture type: \"%s\"\n",
618 torture_type);
619 return (-EINVAL);
620 }
621 if (cur_ops->init != NULL)
622 cur_ops->init(); /* no "goto unwind" prior to this point!!! */
623
511 if (nreaders >= 0) 624 if (nreaders >= 0)
512 nrealreaders = nreaders; 625 nrealreaders = nreaders;
513 else 626 else
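
The ops-vector rework above is what makes the new torture_type parameter work: each RCU flavor is described by one rcu_torture_ops, and rcu_torture_init() walks torture_ops[] until a name matches. Adding another flavor is therefore one more ops structure plus one more array slot; a minimal sketch as it would sit inside rcutorture.c, where the "my_flavor" names are hypothetical and simply reuse the classic RCU primitives defined above:

/* Hypothetical extra flavor, shown only to illustrate the hook-up. */
static void my_flavor_deferred_free(struct rcu_torture *p)
{
        call_rcu(&p->rtort_rcu, rcu_torture_cb);
}

static struct rcu_torture_ops my_flavor_ops = {
        .init           = NULL,
        .cleanup        = NULL,
        .readlock       = rcu_torture_read_lock,
        .readunlock     = rcu_torture_read_unlock,
        .completed      = rcu_torture_completed,
        .deferredfree   = my_flavor_deferred_free,
        .stats          = NULL,
        .name           = "my_flavor"
};

static struct rcu_torture_ops *torture_ops[] =
        { &rcu_ops, &rcu_bh_ops, &my_flavor_ops, NULL };

With that in place, "modprobe rcutorture torture_type=my_flavor" selects the new entry; an unknown name still fails rcu_torture_init() with -EINVAL, as the lookup loop above shows.
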
diff --git a/kernel/resource.c b/kernel/resource.c
index e3080fcc66a3..129cf046e561 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -7,7 +7,6 @@
7 * Arbitrary resource management. 7 * Arbitrary resource management.
8 */ 8 */
9 9
10#include <linux/config.h>
11#include <linux/module.h> 10#include <linux/module.h>
12#include <linux/sched.h> 11#include <linux/sched.h>
13#include <linux/errno.h> 12#include <linux/errno.h>
@@ -23,20 +22,18 @@
23 22
24struct resource ioport_resource = { 23struct resource ioport_resource = {
25 .name = "PCI IO", 24 .name = "PCI IO",
26 .start = 0x0000, 25 .start = 0,
27 .end = IO_SPACE_LIMIT, 26 .end = IO_SPACE_LIMIT,
28 .flags = IORESOURCE_IO, 27 .flags = IORESOURCE_IO,
29}; 28};
30
31EXPORT_SYMBOL(ioport_resource); 29EXPORT_SYMBOL(ioport_resource);
32 30
33struct resource iomem_resource = { 31struct resource iomem_resource = {
34 .name = "PCI mem", 32 .name = "PCI mem",
35 .start = 0UL, 33 .start = 0,
36 .end = ~0UL, 34 .end = -1,
37 .flags = IORESOURCE_MEM, 35 .flags = IORESOURCE_MEM,
38}; 36};
39
40EXPORT_SYMBOL(iomem_resource); 37EXPORT_SYMBOL(iomem_resource);
41 38
42static DEFINE_RWLOCK(resource_lock); 39static DEFINE_RWLOCK(resource_lock);
@@ -83,10 +80,10 @@ static int r_show(struct seq_file *m, void *v)
83 for (depth = 0, p = r; depth < MAX_IORES_LEVEL; depth++, p = p->parent) 80 for (depth = 0, p = r; depth < MAX_IORES_LEVEL; depth++, p = p->parent)
84 if (p->parent == root) 81 if (p->parent == root)
85 break; 82 break;
86 seq_printf(m, "%*s%0*lx-%0*lx : %s\n", 83 seq_printf(m, "%*s%0*llx-%0*llx : %s\n",
87 depth * 2, "", 84 depth * 2, "",
88 width, r->start, 85 width, (unsigned long long) r->start,
89 width, r->end, 86 width, (unsigned long long) r->end,
90 r->name ? r->name : "<BAD>"); 87 r->name ? r->name : "<BAD>");
91 return 0; 88 return 0;
92} 89}
@@ -151,8 +148,8 @@ __initcall(ioresources_init);
151/* Return the conflict entry if you can't request it */ 148/* Return the conflict entry if you can't request it */
152static struct resource * __request_resource(struct resource *root, struct resource *new) 149static struct resource * __request_resource(struct resource *root, struct resource *new)
153{ 150{
154 unsigned long start = new->start; 151 resource_size_t start = new->start;
155 unsigned long end = new->end; 152 resource_size_t end = new->end;
156 struct resource *tmp, **p; 153 struct resource *tmp, **p;
157 154
158 if (end < start) 155 if (end < start)
@@ -232,15 +229,52 @@ int release_resource(struct resource *old)
232 229
233EXPORT_SYMBOL(release_resource); 230EXPORT_SYMBOL(release_resource);
234 231
232#ifdef CONFIG_MEMORY_HOTPLUG
233/*
234 * Finds the lowest memory resource existing within [res->start, res->end).
235 * The caller must specify res->start, res->end, and res->flags.
236 * If found, returns 0 and res is overwritten; if not found, returns -1.
237 */
238int find_next_system_ram(struct resource *res)
239{
240 resource_size_t start, end;
241 struct resource *p;
242
243 BUG_ON(!res);
244
245 start = res->start;
246 end = res->end;
247
248 read_lock(&resource_lock);
249 for (p = iomem_resource.child; p ; p = p->sibling) {
250 /* system ram is just marked as IORESOURCE_MEM */
251 if (p->flags != res->flags)
252 continue;
253 if (p->start > end) {
254 p = NULL;
255 break;
256 }
257 if (p->start >= start)
258 break;
259 }
260 read_unlock(&resource_lock);
261 if (!p)
262 return -1;
263 /* copy data */
264 res->start = p->start;
265 res->end = p->end;
266 return 0;
267}
268#endif
269
235/* 270/*
236 * Find empty slot in the resource tree given range and alignment. 271 * Find empty slot in the resource tree given range and alignment.
237 */ 272 */
238static int find_resource(struct resource *root, struct resource *new, 273static int find_resource(struct resource *root, struct resource *new,
239 unsigned long size, 274 resource_size_t size, resource_size_t min,
240 unsigned long min, unsigned long max, 275 resource_size_t max, resource_size_t align,
241 unsigned long align,
242 void (*alignf)(void *, struct resource *, 276 void (*alignf)(void *, struct resource *,
243 unsigned long, unsigned long), 277 resource_size_t, resource_size_t),
244 void *alignf_data) 278 void *alignf_data)
245{ 279{
246 struct resource *this = root->child; 280 struct resource *this = root->child;
@@ -282,11 +316,10 @@ static int find_resource(struct resource *root, struct resource *new,
282 * Allocate empty slot in the resource tree given range and alignment. 316 * Allocate empty slot in the resource tree given range and alignment.
283 */ 317 */
284int allocate_resource(struct resource *root, struct resource *new, 318int allocate_resource(struct resource *root, struct resource *new,
285 unsigned long size, 319 resource_size_t size, resource_size_t min,
286 unsigned long min, unsigned long max, 320 resource_size_t max, resource_size_t align,
287 unsigned long align,
288 void (*alignf)(void *, struct resource *, 321 void (*alignf)(void *, struct resource *,
289 unsigned long, unsigned long), 322 resource_size_t, resource_size_t),
290 void *alignf_data) 323 void *alignf_data)
291{ 324{
292 int err; 325 int err;
@@ -378,10 +411,10 @@ EXPORT_SYMBOL(insert_resource);
378 * arguments. Returns -EBUSY if it can't fit. Existing children of 411 * arguments. Returns -EBUSY if it can't fit. Existing children of
379 * the resource are assumed to be immutable. 412 * the resource are assumed to be immutable.
380 */ 413 */
381int adjust_resource(struct resource *res, unsigned long start, unsigned long size) 414int adjust_resource(struct resource *res, resource_size_t start, resource_size_t size)
382{ 415{
383 struct resource *tmp, *parent = res->parent; 416 struct resource *tmp, *parent = res->parent;
384 unsigned long end = start + size - 1; 417 resource_size_t end = start + size - 1;
385 int result = -EBUSY; 418 int result = -EBUSY;
386 419
387 write_lock(&resource_lock); 420 write_lock(&resource_lock);
@@ -428,7 +461,9 @@ EXPORT_SYMBOL(adjust_resource);
428 * 461 *
429 * Release-region releases a matching busy region. 462 * Release-region releases a matching busy region.
430 */ 463 */
431struct resource * __request_region(struct resource *parent, unsigned long start, unsigned long n, const char *name) 464struct resource * __request_region(struct resource *parent,
465 resource_size_t start, resource_size_t n,
466 const char *name)
432{ 467{
433 struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL); 468 struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL);
434 469
@@ -464,7 +499,8 @@ struct resource * __request_region(struct resource *parent, unsigned long start,
464 499
465EXPORT_SYMBOL(__request_region); 500EXPORT_SYMBOL(__request_region);
466 501
467int __check_region(struct resource *parent, unsigned long start, unsigned long n) 502int __check_region(struct resource *parent, resource_size_t start,
503 resource_size_t n)
468{ 504{
469 struct resource * res; 505 struct resource * res;
470 506
@@ -479,10 +515,11 @@ int __check_region(struct resource *parent, unsigned long start, unsigned long n
479 515
480EXPORT_SYMBOL(__check_region); 516EXPORT_SYMBOL(__check_region);
481 517
482void __release_region(struct resource *parent, unsigned long start, unsigned long n) 518void __release_region(struct resource *parent, resource_size_t start,
519 resource_size_t n)
483{ 520{
484 struct resource **p; 521 struct resource **p;
485 unsigned long end; 522 resource_size_t end;
486 523
487 p = &parent->child; 524 p = &parent->child;
488 end = start + n - 1; 525 end = start + n - 1;
@@ -511,7 +548,9 @@ void __release_region(struct resource *parent, unsigned long start, unsigned lon
511 548
512 write_unlock(&resource_lock); 549 write_unlock(&resource_lock);
513 550
514 printk(KERN_WARNING "Trying to free nonexistent resource <%08lx-%08lx>\n", start, end); 551 printk(KERN_WARNING "Trying to free nonexistent resource "
552 "<%016llx-%016llx>\n", (unsigned long long)start,
553 (unsigned long long)end);
515} 554}
516 555
517EXPORT_SYMBOL(__release_region); 556EXPORT_SYMBOL(__release_region);
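
The new find_next_system_ram() above gives memory hotplug a locked walk over the plain-IORESOURCE_MEM children of iomem_resource: the caller seeds a struct resource with a search window and flags and gets back the first matching "System RAM" range. A minimal sketch of that calling pattern, usable only under CONFIG_MEMORY_HOTPLUG where the helper is built and assuming its prototype is visible to the caller; the walk_system_ram() wrapper and its printout are illustrative, not part of the patch:

static void walk_system_ram(resource_size_t start, resource_size_t end)
{
        struct resource res;

        res.start = start;
        res.end   = end;
        res.flags = IORESOURCE_MEM;     /* plain RAM carries only this flag */

        while (res.start < end && find_next_system_ram(&res) == 0) {
                /* res now holds the next System RAM chunk in the window */
                printk(KERN_DEBUG "System RAM: %llx-%llx\n",
                       (unsigned long long)res.start,
                       (unsigned long long)res.end);

                /* restart the search just past this chunk */
                res.start = res.end + 1;
                res.end   = end;
                res.flags = IORESOURCE_MEM;
        }
}
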
diff --git a/kernel/rtmutex-debug.c b/kernel/rtmutex-debug.c
new file mode 100644
index 000000000000..4aa8a2c9f453
--- /dev/null
+++ b/kernel/rtmutex-debug.c
@@ -0,0 +1,513 @@
1/*
2 * RT-Mutexes: blocking mutual exclusion locks with PI support
3 *
4 * started by Ingo Molnar and Thomas Gleixner:
5 *
6 * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
7 * Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
8 *
9 * This code is based on the rt.c implementation in the preempt-rt tree.
10 * Portions of said code are
11 *
12 * Copyright (C) 2004 LynuxWorks, Inc., Igor Manyilov, Bill Huey
13 * Copyright (C) 2006 Esben Nielsen
14 * Copyright (C) 2006 Kihon Technologies Inc.,
15 * Steven Rostedt <rostedt@goodmis.org>
16 *
17 * See rt.c in preempt-rt for proper credits and further information
18 */
19#include <linux/config.h>
20#include <linux/sched.h>
21#include <linux/delay.h>
22#include <linux/module.h>
23#include <linux/spinlock.h>
24#include <linux/kallsyms.h>
25#include <linux/syscalls.h>
26#include <linux/interrupt.h>
27#include <linux/plist.h>
28#include <linux/fs.h>
29
30#include "rtmutex_common.h"
31
32#ifdef CONFIG_DEBUG_RT_MUTEXES
33# include "rtmutex-debug.h"
34#else
35# include "rtmutex.h"
36#endif
37
38# define TRACE_WARN_ON(x) WARN_ON(x)
39# define TRACE_BUG_ON(x) BUG_ON(x)
40
41# define TRACE_OFF() \
42do { \
43 if (rt_trace_on) { \
44 rt_trace_on = 0; \
45 console_verbose(); \
46 if (spin_is_locked(&current->pi_lock)) \
47 spin_unlock(&current->pi_lock); \
48 if (spin_is_locked(&current->held_list_lock)) \
49 spin_unlock(&current->held_list_lock); \
50 } \
51} while (0)
52
53# define TRACE_OFF_NOLOCK() \
54do { \
55 if (rt_trace_on) { \
56 rt_trace_on = 0; \
57 console_verbose(); \
58 } \
59} while (0)
60
61# define TRACE_BUG_LOCKED() \
62do { \
63 TRACE_OFF(); \
64 BUG(); \
65} while (0)
66
67# define TRACE_WARN_ON_LOCKED(c) \
68do { \
69 if (unlikely(c)) { \
70 TRACE_OFF(); \
71 WARN_ON(1); \
72 } \
73} while (0)
74
75# define TRACE_BUG_ON_LOCKED(c) \
76do { \
77 if (unlikely(c)) \
78 TRACE_BUG_LOCKED(); \
79} while (0)
80
81#ifdef CONFIG_SMP
82# define SMP_TRACE_BUG_ON_LOCKED(c) TRACE_BUG_ON_LOCKED(c)
83#else
84# define SMP_TRACE_BUG_ON_LOCKED(c) do { } while (0)
85#endif
86
87/*
88 * deadlock detection flag. We turn it off when we detect
 89 * the first problem because we don't want to recurse back
90 * into the tracing code when doing error printk or
91 * executing a BUG():
92 */
93int rt_trace_on = 1;
94
95void deadlock_trace_off(void)
96{
97 rt_trace_on = 0;
98}
99
100static void printk_task(task_t *p)
101{
102 if (p)
103 printk("%16s:%5d [%p, %3d]", p->comm, p->pid, p, p->prio);
104 else
105 printk("<none>");
106}
107
108static void printk_task_short(task_t *p)
109{
110 if (p)
111 printk("%s/%d [%p, %3d]", p->comm, p->pid, p, p->prio);
112 else
113 printk("<none>");
114}
115
116static void printk_lock(struct rt_mutex *lock, int print_owner)
117{
118 if (lock->name)
119 printk(" [%p] {%s}\n",
120 lock, lock->name);
121 else
122 printk(" [%p] {%s:%d}\n",
123 lock, lock->file, lock->line);
124
125 if (print_owner && rt_mutex_owner(lock)) {
126 printk(".. ->owner: %p\n", lock->owner);
127 printk(".. held by: ");
128 printk_task(rt_mutex_owner(lock));
129 printk("\n");
130 }
131 if (rt_mutex_owner(lock)) {
132 printk("... acquired at: ");
133 print_symbol("%s\n", lock->acquire_ip);
134 }
135}
136
137static void printk_waiter(struct rt_mutex_waiter *w)
138{
139 printk("-------------------------\n");
140 printk("| waiter struct %p:\n", w);
141 printk("| w->list_entry: [DP:%p/%p|SP:%p/%p|PRI:%d]\n",
142 w->list_entry.plist.prio_list.prev, w->list_entry.plist.prio_list.next,
143 w->list_entry.plist.node_list.prev, w->list_entry.plist.node_list.next,
144 w->list_entry.prio);
145 printk("| w->pi_list_entry: [DP:%p/%p|SP:%p/%p|PRI:%d]\n",
146 w->pi_list_entry.plist.prio_list.prev, w->pi_list_entry.plist.prio_list.next,
147 w->pi_list_entry.plist.node_list.prev, w->pi_list_entry.plist.node_list.next,
148 w->pi_list_entry.prio);
149 printk("\n| lock:\n");
150 printk_lock(w->lock, 1);
151 printk("| w->ti->task:\n");
152 printk_task(w->task);
153 printk("| blocked at: ");
154 print_symbol("%s\n", w->ip);
155 printk("-------------------------\n");
156}
157
158static void show_task_locks(task_t *p)
159{
160 switch (p->state) {
161 case TASK_RUNNING: printk("R"); break;
162 case TASK_INTERRUPTIBLE: printk("S"); break;
163 case TASK_UNINTERRUPTIBLE: printk("D"); break;
164 case TASK_STOPPED: printk("T"); break;
165 case EXIT_ZOMBIE: printk("Z"); break;
166 case EXIT_DEAD: printk("X"); break;
167 default: printk("?"); break;
168 }
169 printk_task(p);
170 if (p->pi_blocked_on) {
171 struct rt_mutex *lock = p->pi_blocked_on->lock;
172
173 printk(" blocked on:");
174 printk_lock(lock, 1);
175 } else
176 printk(" (not blocked)\n");
177}
178
179void rt_mutex_show_held_locks(task_t *task, int verbose)
180{
181 struct list_head *curr, *cursor = NULL;
182 struct rt_mutex *lock;
183 task_t *t;
184 unsigned long flags;
185 int count = 0;
186
187 if (!rt_trace_on)
188 return;
189
190 if (verbose) {
191 printk("------------------------------\n");
192 printk("| showing all locks held by: | (");
193 printk_task_short(task);
194 printk("):\n");
195 printk("------------------------------\n");
196 }
197
198next:
199 spin_lock_irqsave(&task->held_list_lock, flags);
200 list_for_each(curr, &task->held_list_head) {
201 if (cursor && curr != cursor)
202 continue;
203 lock = list_entry(curr, struct rt_mutex, held_list_entry);
204 t = rt_mutex_owner(lock);
205 WARN_ON(t != task);
206 count++;
207 cursor = curr->next;
208 spin_unlock_irqrestore(&task->held_list_lock, flags);
209
210 printk("\n#%03d: ", count);
211 printk_lock(lock, 0);
212 goto next;
213 }
214 spin_unlock_irqrestore(&task->held_list_lock, flags);
215
216 printk("\n");
217}
218
219void rt_mutex_show_all_locks(void)
220{
221 task_t *g, *p;
222 int count = 10;
223 int unlock = 1;
224
225 printk("\n");
226 printk("----------------------\n");
227 printk("| showing all tasks: |\n");
228 printk("----------------------\n");
229
230 /*
231 * Here we try to get the tasklist_lock as hard as possible,
232 * if not successful after 2 seconds we ignore it (but keep
233 * trying). This is to enable a debug printout even if a
234 * tasklist_lock-holding task deadlocks or crashes.
235 */
236retry:
237 if (!read_trylock(&tasklist_lock)) {
238 if (count == 10)
239 printk("hm, tasklist_lock locked, retrying... ");
240 if (count) {
241 count--;
242 printk(" #%d", 10-count);
243 mdelay(200);
244 goto retry;
245 }
246 printk(" ignoring it.\n");
247 unlock = 0;
248 }
249 if (count != 10)
250 printk(" locked it.\n");
251
252 do_each_thread(g, p) {
253 show_task_locks(p);
254 if (!unlock)
255 if (read_trylock(&tasklist_lock))
256 unlock = 1;
257 } while_each_thread(g, p);
258
259 printk("\n");
260
261 printk("-----------------------------------------\n");
262 printk("| showing all locks held in the system: |\n");
263 printk("-----------------------------------------\n");
264
265 do_each_thread(g, p) {
266 rt_mutex_show_held_locks(p, 0);
267 if (!unlock)
268 if (read_trylock(&tasklist_lock))
269 unlock = 1;
270 } while_each_thread(g, p);
271
272
273 printk("=============================================\n\n");
274
275 if (unlock)
276 read_unlock(&tasklist_lock);
277}
278
279void rt_mutex_debug_check_no_locks_held(task_t *task)
280{
281 struct rt_mutex_waiter *w;
282 struct list_head *curr;
283 struct rt_mutex *lock;
284
285 if (!rt_trace_on)
286 return;
287 if (!rt_prio(task->normal_prio) && rt_prio(task->prio)) {
288 printk("BUG: PI priority boost leaked!\n");
289 printk_task(task);
290 printk("\n");
291 }
292 if (list_empty(&task->held_list_head))
293 return;
294
295 spin_lock(&task->pi_lock);
296 plist_for_each_entry(w, &task->pi_waiters, pi_list_entry) {
297 TRACE_OFF();
298
299 printk("hm, PI interest held at exit time? Task:\n");
300 printk_task(task);
301 printk_waiter(w);
302 return;
303 }
304 spin_unlock(&task->pi_lock);
305
306 list_for_each(curr, &task->held_list_head) {
307 lock = list_entry(curr, struct rt_mutex, held_list_entry);
308
309 printk("BUG: %s/%d, lock held at task exit time!\n",
310 task->comm, task->pid);
311 printk_lock(lock, 1);
312 if (rt_mutex_owner(lock) != task)
313 printk("exiting task is not even the owner??\n");
314 }
315}
316
317int rt_mutex_debug_check_no_locks_freed(const void *from, unsigned long len)
318{
319 const void *to = from + len;
320 struct list_head *curr;
321 struct rt_mutex *lock;
322 unsigned long flags;
323 void *lock_addr;
324
325 if (!rt_trace_on)
326 return 0;
327
328 spin_lock_irqsave(&current->held_list_lock, flags);
329 list_for_each(curr, &current->held_list_head) {
330 lock = list_entry(curr, struct rt_mutex, held_list_entry);
331 lock_addr = lock;
332 if (lock_addr < from || lock_addr >= to)
333 continue;
334 TRACE_OFF();
335
336 printk("BUG: %s/%d, active lock [%p(%p-%p)] freed!\n",
337 current->comm, current->pid, lock, from, to);
338 dump_stack();
339 printk_lock(lock, 1);
340 if (rt_mutex_owner(lock) != current)
341 printk("freeing task is not even the owner??\n");
342 return 1;
343 }
344 spin_unlock_irqrestore(&current->held_list_lock, flags);
345
346 return 0;
347}
348
349void rt_mutex_debug_task_free(struct task_struct *task)
350{
351 WARN_ON(!plist_head_empty(&task->pi_waiters));
352 WARN_ON(task->pi_blocked_on);
353}
354
355/*
356 * We fill out the fields in the waiter to store the information about
357 * the deadlock. We print when we return. act_waiter can be NULL in
358 * case of a remove waiter operation.
359 */
360void debug_rt_mutex_deadlock(int detect, struct rt_mutex_waiter *act_waiter,
361 struct rt_mutex *lock)
362{
363 struct task_struct *task;
364
365 if (!rt_trace_on || detect || !act_waiter)
366 return;
367
368 task = rt_mutex_owner(act_waiter->lock);
369 if (task && task != current) {
370 act_waiter->deadlock_task_pid = task->pid;
371 act_waiter->deadlock_lock = lock;
372 }
373}
374
375void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter)
376{
377 struct task_struct *task;
378
379 if (!waiter->deadlock_lock || !rt_trace_on)
380 return;
381
382 task = find_task_by_pid(waiter->deadlock_task_pid);
383 if (!task)
384 return;
385
386 TRACE_OFF_NOLOCK();
387
388 printk("\n============================================\n");
389 printk( "[ BUG: circular locking deadlock detected! ]\n");
390 printk( "--------------------------------------------\n");
391 printk("%s/%d is deadlocking current task %s/%d\n\n",
392 task->comm, task->pid, current->comm, current->pid);
393
394 printk("\n1) %s/%d is trying to acquire this lock:\n",
395 current->comm, current->pid);
396 printk_lock(waiter->lock, 1);
397
398 printk("... trying at: ");
399 print_symbol("%s\n", waiter->ip);
400
401 printk("\n2) %s/%d is blocked on this lock:\n", task->comm, task->pid);
402 printk_lock(waiter->deadlock_lock, 1);
403
404 rt_mutex_show_held_locks(current, 1);
405 rt_mutex_show_held_locks(task, 1);
406
407 printk("\n%s/%d's [blocked] stackdump:\n\n", task->comm, task->pid);
408 show_stack(task, NULL);
409 printk("\n%s/%d's [current] stackdump:\n\n",
410 current->comm, current->pid);
411 dump_stack();
412 rt_mutex_show_all_locks();
413 printk("[ turning off deadlock detection."
414 "Please report this trace. ]\n\n");
415 local_irq_disable();
416}
417
418void debug_rt_mutex_lock(struct rt_mutex *lock __IP_DECL__)
419{
420 unsigned long flags;
421
422 if (rt_trace_on) {
423 TRACE_WARN_ON_LOCKED(!list_empty(&lock->held_list_entry));
424
425 spin_lock_irqsave(&current->held_list_lock, flags);
426 list_add_tail(&lock->held_list_entry, &current->held_list_head);
427 spin_unlock_irqrestore(&current->held_list_lock, flags);
428
429 lock->acquire_ip = ip;
430 }
431}
432
433void debug_rt_mutex_unlock(struct rt_mutex *lock)
434{
435 unsigned long flags;
436
437 if (rt_trace_on) {
438 TRACE_WARN_ON_LOCKED(rt_mutex_owner(lock) != current);
439 TRACE_WARN_ON_LOCKED(list_empty(&lock->held_list_entry));
440
441 spin_lock_irqsave(&current->held_list_lock, flags);
442 list_del_init(&lock->held_list_entry);
443 spin_unlock_irqrestore(&current->held_list_lock, flags);
444 }
445}
446
447void debug_rt_mutex_proxy_lock(struct rt_mutex *lock,
448 struct task_struct *powner __IP_DECL__)
449{
450 unsigned long flags;
451
452 if (rt_trace_on) {
453 TRACE_WARN_ON_LOCKED(!list_empty(&lock->held_list_entry));
454
455 spin_lock_irqsave(&powner->held_list_lock, flags);
456 list_add_tail(&lock->held_list_entry, &powner->held_list_head);
457 spin_unlock_irqrestore(&powner->held_list_lock, flags);
458
459 lock->acquire_ip = ip;
460 }
461}
462
463void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock)
464{
465 unsigned long flags;
466
467 if (rt_trace_on) {
468 struct task_struct *owner = rt_mutex_owner(lock);
469
470 TRACE_WARN_ON_LOCKED(!owner);
471 TRACE_WARN_ON_LOCKED(list_empty(&lock->held_list_entry));
472
473 spin_lock_irqsave(&owner->held_list_lock, flags);
474 list_del_init(&lock->held_list_entry);
475 spin_unlock_irqrestore(&owner->held_list_lock, flags);
476 }
477}
478
479void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter)
480{
481 memset(waiter, 0x11, sizeof(*waiter));
482 plist_node_init(&waiter->list_entry, MAX_PRIO);
483 plist_node_init(&waiter->pi_list_entry, MAX_PRIO);
484}
485
486void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter)
487{
488 TRACE_WARN_ON(!plist_node_empty(&waiter->list_entry));
489 TRACE_WARN_ON(!plist_node_empty(&waiter->pi_list_entry));
490 TRACE_WARN_ON(waiter->task);
491 memset(waiter, 0x22, sizeof(*waiter));
492}
493
494void debug_rt_mutex_init(struct rt_mutex *lock, const char *name)
495{
496 void *addr = lock;
497
498 if (rt_trace_on) {
499 rt_mutex_debug_check_no_locks_freed(addr,
500 sizeof(struct rt_mutex));
501 INIT_LIST_HEAD(&lock->held_list_entry);
502 lock->name = name;
503 }
504}
505
506void rt_mutex_deadlock_account_lock(struct rt_mutex *lock, task_t *task)
507{
508}
509
510void rt_mutex_deadlock_account_unlock(struct task_struct *task)
511{
512}
513
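
The debug variant above tracks every acquired rt_mutex on the owner's held_list_head, which is what lets rt_mutex_debug_check_no_locks_held() and rt_mutex_debug_check_no_locks_freed() complain when a lock leaks past task exit or is freed while still held. A deliberately buggy sketch of the second case; struct my_object and buggy_teardown() are illustrative, and the check's declaration is kernel-internal (rtmutex-debug.h), so this is a model of the misuse rather than a public API:

#include <linux/rtmutex.h>
#include <linux/slab.h>

struct my_object {
        struct rt_mutex lock;   /* assumed set up with rt_mutex_init() at allocation */
        int payload;
};

static void buggy_teardown(struct my_object *obj)
{
        rt_mutex_lock(&obj->lock);      /* lock is now on current->held_list_head */
        /*
         * obj->lock is still held, so this check finds it inside
         * [obj, obj + sizeof(*obj)), prints "BUG: ... active lock ... freed!"
         * with a stack dump, and returns 1.
         */
        rt_mutex_debug_check_no_locks_freed(obj, sizeof(*obj));
        kfree(obj);
}
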
diff --git a/kernel/rtmutex-debug.h b/kernel/rtmutex-debug.h
new file mode 100644
index 000000000000..7612fbc62d70
--- /dev/null
+++ b/kernel/rtmutex-debug.h
@@ -0,0 +1,37 @@
1/*
2 * RT-Mutexes: blocking mutual exclusion locks with PI support
3 *
4 * started by Ingo Molnar and Thomas Gleixner:
5 *
6 * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
7 * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
8 *
9 * This file contains macros used solely by rtmutex.c. Debug version.
10 */
11
12#define __IP_DECL__ , unsigned long ip
13#define __IP__ , ip
14#define __RET_IP__ , (unsigned long)__builtin_return_address(0)
15
16extern void
17rt_mutex_deadlock_account_lock(struct rt_mutex *lock, struct task_struct *task);
18extern void rt_mutex_deadlock_account_unlock(struct task_struct *task);
19extern void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter);
20extern void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter);
21extern void debug_rt_mutex_init(struct rt_mutex *lock, const char *name);
22extern void debug_rt_mutex_lock(struct rt_mutex *lock __IP_DECL__);
23extern void debug_rt_mutex_unlock(struct rt_mutex *lock);
24extern void debug_rt_mutex_proxy_lock(struct rt_mutex *lock,
25 struct task_struct *powner __IP_DECL__);
26extern void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock);
27extern void debug_rt_mutex_deadlock(int detect, struct rt_mutex_waiter *waiter,
28 struct rt_mutex *lock);
29extern void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter);
30# define debug_rt_mutex_reset_waiter(w) \
31 do { (w)->deadlock_lock = NULL; } while (0)
32
33static inline int debug_rt_mutex_detect_deadlock(struct rt_mutex_waiter *waiter,
34 int detect)
35{
36 return (waiter != NULL);
37}
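
The __IP_DECL__/__IP__/__RET_IP__ trio above is how rtmutex.c threads the caller's return address into the debug code at no cost to the non-debug build, where the non-debug kernel/rtmutex.h is expected to define the same macros as empty. A sketch of the expansion with CONFIG_DEBUG_RT_MUTEXES set, as if compiled inside kernel/rtmutex.c with this header included; my_lock()/my_slowlock() are simplified stand-ins, not the real entry points:

/* __IP_DECL__ appends ", unsigned long ip" to the parameter list ... */
static int my_slowlock(struct rt_mutex *lock __IP_DECL__)
{
        debug_rt_mutex_lock(lock __IP__);       /* ... and __IP__ forwards ", ip" */
        /* real slowpath elided */
        return 0;
}

void my_lock(struct rt_mutex *lock)
{
        /*
         * __RET_IP__ expands to ", (unsigned long)__builtin_return_address(0)",
         * so the recorded acquire_ip points at my_lock()'s caller.
         */
        my_slowlock(lock __RET_IP__);
}
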
diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c
new file mode 100644
index 000000000000..e82c2f848249
--- /dev/null
+++ b/kernel/rtmutex-tester.c
@@ -0,0 +1,440 @@
1/*
2 * RT-Mutex-tester: scriptable tester for rt mutexes
3 *
4 * started by Thomas Gleixner:
5 *
6 * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
7 *
8 */
9#include <linux/config.h>
10#include <linux/kthread.h>
11#include <linux/module.h>
12#include <linux/sched.h>
13#include <linux/smp_lock.h>
14#include <linux/spinlock.h>
15#include <linux/sysdev.h>
16#include <linux/timer.h>
17
18#include "rtmutex.h"
19
20#define MAX_RT_TEST_THREADS 8
21#define MAX_RT_TEST_MUTEXES 8
22
23static spinlock_t rttest_lock;
24static atomic_t rttest_event;
25
26struct test_thread_data {
27 int opcode;
28 int opdata;
29 int mutexes[MAX_RT_TEST_MUTEXES];
30 int bkl;
31 int event;
32 struct sys_device sysdev;
33};
34
35static struct test_thread_data thread_data[MAX_RT_TEST_THREADS];
36static task_t *threads[MAX_RT_TEST_THREADS];
37static struct rt_mutex mutexes[MAX_RT_TEST_MUTEXES];
38
39enum test_opcodes {
40 RTTEST_NOP = 0,
41 RTTEST_SCHEDOT, /* 1 Sched other, data = nice */
42 RTTEST_SCHEDRT, /* 2 Sched fifo, data = prio */
43 RTTEST_LOCK, /* 3 Lock uninterruptible, data = lockindex */
44 RTTEST_LOCKNOWAIT, /* 4 Lock uninterruptible no wait in wakeup, data = lockindex */
45 RTTEST_LOCKINT, /* 5 Lock interruptible, data = lockindex */
46 RTTEST_LOCKINTNOWAIT, /* 6 Lock interruptible no wait in wakeup, data = lockindex */
47 RTTEST_LOCKCONT, /* 7 Continue locking after the wakeup delay */
48 RTTEST_UNLOCK, /* 8 Unlock, data = lockindex */
49 RTTEST_LOCKBKL, /* 9 Lock BKL */
50 RTTEST_UNLOCKBKL, /* 10 Unlock BKL */
51 RTTEST_SIGNAL, /* 11 Signal other test thread, data = thread id */
52 RTTEST_RESETEVENT = 98, /* 98 Reset event counter */
53 RTTEST_RESET = 99, /* 99 Reset all pending operations */
54};
55
56static int handle_op(struct test_thread_data *td, int lockwakeup)
57{
58 int i, id, ret = -EINVAL;
59
60 switch(td->opcode) {
61
62 case RTTEST_NOP:
63 return 0;
64
65 case RTTEST_LOCKCONT:
66 td->mutexes[td->opdata] = 1;
67 td->event = atomic_add_return(1, &rttest_event);
68 return 0;
69
70 case RTTEST_RESET:
71 for (i = 0; i < MAX_RT_TEST_MUTEXES; i++) {
72 if (td->mutexes[i] == 4) {
73 rt_mutex_unlock(&mutexes[i]);
74 td->mutexes[i] = 0;
75 }
76 }
77
78 if (!lockwakeup && td->bkl == 4) {
79 unlock_kernel();
80 td->bkl = 0;
81 }
82 return 0;
83
84 case RTTEST_RESETEVENT:
85 atomic_set(&rttest_event, 0);
86 return 0;
87
88 default:
89 if (lockwakeup)
90 return ret;
91 }
92
93 switch(td->opcode) {
94
95 case RTTEST_LOCK:
96 case RTTEST_LOCKNOWAIT:
97 id = td->opdata;
98 if (id < 0 || id >= MAX_RT_TEST_MUTEXES)
99 return ret;
100
101 td->mutexes[id] = 1;
102 td->event = atomic_add_return(1, &rttest_event);
103 rt_mutex_lock(&mutexes[id]);
104 td->event = atomic_add_return(1, &rttest_event);
105 td->mutexes[id] = 4;
106 return 0;
107
108 case RTTEST_LOCKINT:
109 case RTTEST_LOCKINTNOWAIT:
110 id = td->opdata;
111 if (id < 0 || id >= MAX_RT_TEST_MUTEXES)
112 return ret;
113
114 td->mutexes[id] = 1;
115 td->event = atomic_add_return(1, &rttest_event);
116 ret = rt_mutex_lock_interruptible(&mutexes[id], 0);
117 td->event = atomic_add_return(1, &rttest_event);
118 td->mutexes[id] = ret ? 0 : 4;
119 return ret ? -EINTR : 0;
120
121 case RTTEST_UNLOCK:
122 id = td->opdata;
123 if (id < 0 || id >= MAX_RT_TEST_MUTEXES || td->mutexes[id] != 4)
124 return ret;
125
126 td->event = atomic_add_return(1, &rttest_event);
127 rt_mutex_unlock(&mutexes[id]);
128 td->event = atomic_add_return(1, &rttest_event);
129 td->mutexes[id] = 0;
130 return 0;
131
132 case RTTEST_LOCKBKL:
133 if (td->bkl)
134 return 0;
135 td->bkl = 1;
136 lock_kernel();
137 td->bkl = 4;
138 return 0;
139
140 case RTTEST_UNLOCKBKL:
141 if (td->bkl != 4)
142 break;
143 unlock_kernel();
144 td->bkl = 0;
145 return 0;
146
147 default:
148 break;
149 }
150 return ret;
151}
152
153/*
154 * Schedule replacement for rtsem_down(). Only called for threads with
155 * PF_MUTEX_TESTER set.
156 *
157 * This allows us to have finegrained control over the event flow.
158 *
159 */
160void schedule_rt_mutex_test(struct rt_mutex *mutex)
161{
162 int tid, op, dat;
163 struct test_thread_data *td;
164
165 /* We have to lookup the task */
166 for (tid = 0; tid < MAX_RT_TEST_THREADS; tid++) {
167 if (threads[tid] == current)
168 break;
169 }
170
171 BUG_ON(tid == MAX_RT_TEST_THREADS);
172
173 td = &thread_data[tid];
174
175 op = td->opcode;
176 dat = td->opdata;
177
178 switch (op) {
179 case RTTEST_LOCK:
180 case RTTEST_LOCKINT:
181 case RTTEST_LOCKNOWAIT:
182 case RTTEST_LOCKINTNOWAIT:
183 if (mutex != &mutexes[dat])
184 break;
185
186 if (td->mutexes[dat] != 1)
187 break;
188
189 td->mutexes[dat] = 2;
190 td->event = atomic_add_return(1, &rttest_event);
191 break;
192
193 case RTTEST_LOCKBKL:
194 default:
195 break;
196 }
197
198 schedule();
199
200
201 switch (op) {
202 case RTTEST_LOCK:
203 case RTTEST_LOCKINT:
204 if (mutex != &mutexes[dat])
205 return;
206
207 if (td->mutexes[dat] != 2)
208 return;
209
210 td->mutexes[dat] = 3;
211 td->event = atomic_add_return(1, &rttest_event);
212 break;
213
214 case RTTEST_LOCKNOWAIT:
215 case RTTEST_LOCKINTNOWAIT:
216 if (mutex != &mutexes[dat])
217 return;
218
219 if (td->mutexes[dat] != 2)
220 return;
221
222 td->mutexes[dat] = 1;
223 td->event = atomic_add_return(1, &rttest_event);
224 return;
225
226 case RTTEST_LOCKBKL:
227 return;
228 default:
229 return;
230 }
231
232 td->opcode = 0;
233
234 for (;;) {
235 set_current_state(TASK_INTERRUPTIBLE);
236
237 if (td->opcode > 0) {
238 int ret;
239
240 set_current_state(TASK_RUNNING);
241 ret = handle_op(td, 1);
242 set_current_state(TASK_INTERRUPTIBLE);
243 if (td->opcode == RTTEST_LOCKCONT)
244 break;
245 td->opcode = ret;
246 }
247
248 /* Wait for the next command to be executed */
249 schedule();
250 }
251
252 /* Restore previous command and data */
253 td->opcode = op;
254 td->opdata = dat;
255}
256
257static int test_func(void *data)
258{
259 struct test_thread_data *td = data;
260 int ret;
261
262 current->flags |= PF_MUTEX_TESTER;
263 allow_signal(SIGHUP);
264
265 for(;;) {
266
267 set_current_state(TASK_INTERRUPTIBLE);
268
269 if (td->opcode > 0) {
270 set_current_state(TASK_RUNNING);
271 ret = handle_op(td, 0);
272 set_current_state(TASK_INTERRUPTIBLE);
273 td->opcode = ret;
274 }
275
276 /* Wait for the next command to be executed */
277 schedule();
278
279 if (signal_pending(current))
280 flush_signals(current);
281
282 if(kthread_should_stop())
283 break;
284 }
285 return 0;
286}
287
288/**
289 * sysfs_test_command - interface for test commands
290 * @dev: thread reference
291 * @buf: command for actual step
292 * @count: length of buffer
293 *
294 * command syntax:
295 *
296 * opcode:data
297 */
298static ssize_t sysfs_test_command(struct sys_device *dev, const char *buf,
299 size_t count)
300{
301 struct sched_param schedpar;
302 struct test_thread_data *td;
303 char cmdbuf[32];
304 int op, dat, tid, ret;
305
306 td = container_of(dev, struct test_thread_data, sysdev);
307 tid = td->sysdev.id;
308
309 /* strings from sysfs write are not 0 terminated! */
310 if (count >= sizeof(cmdbuf))
311 return -EINVAL;
312
313 /* strip off the trailing \n: */
314 if (buf[count-1] == '\n')
315 count--;
316 if (count < 1)
317 return -EINVAL;
318
319 memcpy(cmdbuf, buf, count);
320 cmdbuf[count] = 0;
321
322 if (sscanf(cmdbuf, "%d:%d", &op, &dat) != 2)
323 return -EINVAL;
324
325 switch (op) {
326 case RTTEST_SCHEDOT:
327 schedpar.sched_priority = 0;
328 ret = sched_setscheduler(threads[tid], SCHED_NORMAL, &schedpar);
329 if (ret)
330 return ret;
331 set_user_nice(current, 0);
332 break;
333
334 case RTTEST_SCHEDRT:
335 schedpar.sched_priority = dat;
336 ret = sched_setscheduler(threads[tid], SCHED_FIFO, &schedpar);
337 if (ret)
338 return ret;
339 break;
340
341 case RTTEST_SIGNAL:
342 send_sig(SIGHUP, threads[tid], 0);
343 break;
344
345 default:
346 if (td->opcode > 0)
347 return -EBUSY;
348 td->opdata = dat;
349 td->opcode = op;
350 wake_up_process(threads[tid]);
351 }
352
353 return count;
354}
355
356/**
357 * sysfs_test_status - sysfs interface for rt tester
358 * @dev: thread to query
359 * @buf: char buffer to be filled with thread status info
360 */
361static ssize_t sysfs_test_status(struct sys_device *dev, char *buf)
362{
363 struct test_thread_data *td;
364 char *curr = buf;
365 task_t *tsk;
366 int i;
367
368 td = container_of(dev, struct test_thread_data, sysdev);
369 tsk = threads[td->sysdev.id];
370
371 spin_lock(&rttest_lock);
372
373 curr += sprintf(curr,
374 "O: %4d, E:%8d, S: 0x%08lx, P: %4d, N: %4d, B: %p, K: %d, M:",
375 td->opcode, td->event, tsk->state,
376 (MAX_RT_PRIO - 1) - tsk->prio,
377 (MAX_RT_PRIO - 1) - tsk->normal_prio,
378 tsk->pi_blocked_on, td->bkl);
379
380 for (i = MAX_RT_TEST_MUTEXES - 1; i >=0 ; i--)
381 curr += sprintf(curr, "%d", td->mutexes[i]);
382
383 spin_unlock(&rttest_lock);
384
385 curr += sprintf(curr, ", T: %p, R: %p\n", tsk,
386 mutexes[td->sysdev.id].owner);
387
388 return curr - buf;
389}
390
391static SYSDEV_ATTR(status, 0600, sysfs_test_status, NULL);
392static SYSDEV_ATTR(command, 0600, NULL, sysfs_test_command);
393
394static struct sysdev_class rttest_sysclass = {
395 set_kset_name("rttest"),
396};
397
398static int init_test_thread(int id)
399{
400 thread_data[id].sysdev.cls = &rttest_sysclass;
401 thread_data[id].sysdev.id = id;
402
403 threads[id] = kthread_run(test_func, &thread_data[id], "rt-test-%d", id);
404 if (IS_ERR(threads[id]))
405 return PTR_ERR(threads[id]);
406
407 return sysdev_register(&thread_data[id].sysdev);
408}
409
410static int init_rttest(void)
411{
412 int ret, i;
413
414 spin_lock_init(&rttest_lock);
415
416 for (i = 0; i < MAX_RT_TEST_MUTEXES; i++)
417 rt_mutex_init(&mutexes[i]);
418
419 ret = sysdev_class_register(&rttest_sysclass);
420 if (ret)
421 return ret;
422
423 for (i = 0; i < MAX_RT_TEST_THREADS; i++) {
424 ret = init_test_thread(i);
425 if (ret)
426 break;
427 ret = sysdev_create_file(&thread_data[i].sysdev, &attr_status);
428 if (ret)
429 break;
430 ret = sysdev_create_file(&thread_data[i].sysdev, &attr_command);
431 if (ret)
432 break;
433 }
434
435 printk("Initializing RT-Tester: %s\n", ret ? "Failed" : "OK" );
436
437 return ret;
438}
439
440device_initcall(init_rttest);
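
Each tester thread above exports a sysdev with a write-only command file and a read-only status file, so the whole test is driven by writing "opcode:data" strings from userspace and reading the state string back. A minimal userspace driver sketch in C; the /sys/devices/system/rttest/rttest0/ path follows the usual sysdev layout for class "rttest", id 0, and is an assumption here, as is the choice of opcode 3 (RTTEST_LOCK) on mutex 0:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        const char *cmd = "/sys/devices/system/rttest/rttest0/command";
        const char *sta = "/sys/devices/system/rttest/rttest0/status";
        char buf[256];
        ssize_t n;
        int fd;

        fd = open(cmd, O_WRONLY);
        if (fd < 0) { perror("open command"); return 1; }
        if (write(fd, "3:0\n", 4) < 0)          /* RTTEST_LOCK on mutex index 0 */
                perror("write command");
        close(fd);

        fd = open(sta, O_RDONLY);
        if (fd < 0) { perror("open status"); return 1; }
        n = read(fd, buf, sizeof(buf) - 1);
        if (n > 0) {
                buf[n] = '\0';
                fputs(buf, stdout);     /* "O: ..., E: ..., S: ..., M:..." line */
        }
        close(fd);
        return 0;
}
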
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
new file mode 100644
index 000000000000..45d61016da57
--- /dev/null
+++ b/kernel/rtmutex.c
@@ -0,0 +1,990 @@
1/*
2 * RT-Mutexes: simple blocking mutual exclusion locks with PI support
3 *
4 * started by Ingo Molnar and Thomas Gleixner.
5 *
6 * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
7 * Copyright (C) 2005-2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
8 * Copyright (C) 2005 Kihon Technologies Inc., Steven Rostedt
9 * Copyright (C) 2006 Esben Nielsen
10 */
11#include <linux/spinlock.h>
12#include <linux/module.h>
13#include <linux/sched.h>
14#include <linux/timer.h>
15
16#include "rtmutex_common.h"
17
18#ifdef CONFIG_DEBUG_RT_MUTEXES
19# include "rtmutex-debug.h"
20#else
21# include "rtmutex.h"
22#endif
23
24/*
25 * lock->owner state tracking:
26 *
27 * lock->owner holds the task_struct pointer of the owner. Bit 0 and 1
28 * are used to keep track of the "owner is pending" and "lock has
29 * waiters" state.
30 *
31 * owner bit1 bit0
32 * NULL 0 0 lock is free (fast acquire possible)
33 * NULL 0 1 invalid state
34 * NULL 1 0 Transitional State*
35 * NULL 1 1 invalid state
36 * taskpointer 0 0 lock is held (fast release possible)
37 * taskpointer 0 1 task is pending owner
38 * taskpointer 1 0 lock is held and has waiters
39 * taskpointer 1 1 task is pending owner and lock has more waiters
40 *
41 * Pending ownership is assigned to the top (highest priority)
42 * waiter of the lock, when the lock is released. The thread is woken
43 * up and can now take the lock. Until the lock is taken (bit 0
44 * cleared) a competing higher priority thread can steal the lock
45 * which puts the woken up thread back on the waiters list.
46 *
47 * The fast atomic compare exchange based acquire and release is only
48 * possible when bit 0 and 1 of lock->owner are 0.
49 *
50 * (*) There's a small time where the owner can be NULL and the
51 * "lock has waiters" bit is set. This can happen when grabbing the lock.
52 * To prevent a cmpxchg of the owner releasing the lock, we need to set this
53 * bit before looking at the lock, hence the reason this is a transitional
54 * state.
55 */
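
The table above is the whole locking state machine; everything else in this file exists to keep those two low bits and the waiter lists consistent. A small helper that restates the encoding in code, as if placed inside rtmutex.c (illustrative only, with local mask names standing in for the definitions in rtmutex_common.h):

#define SKETCH_OWNER_PENDING    1UL     /* bit 0: woken waiter is pending owner */
#define SKETCH_HAS_WAITERS      2UL     /* bit 1: wait list is non-empty */

static void sketch_describe_owner(struct rt_mutex *lock)
{
        unsigned long val = (unsigned long)lock->owner;
        struct task_struct *owner = (struct task_struct *)
                (val & ~(SKETCH_OWNER_PENDING | SKETCH_HAS_WAITERS));

        if (!owner)
                printk("free%s\n", (val & SKETCH_HAS_WAITERS) ?
                       " (transitional: waiters bit set while being taken)" : "");
        else
                printk("held by %s/%d%s%s\n", owner->comm, owner->pid,
                       (val & SKETCH_OWNER_PENDING) ? ", pending owner" : "",
                       (val & SKETCH_HAS_WAITERS) ? ", has waiters" : "");
}
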
56
57static void
58rt_mutex_set_owner(struct rt_mutex *lock, struct task_struct *owner,
59 unsigned long mask)
60{
61 unsigned long val = (unsigned long)owner | mask;
62
63 if (rt_mutex_has_waiters(lock))
64 val |= RT_MUTEX_HAS_WAITERS;
65
66 lock->owner = (struct task_struct *)val;
67}
68
69static inline void clear_rt_mutex_waiters(struct rt_mutex *lock)
70{
71 lock->owner = (struct task_struct *)
72 ((unsigned long)lock->owner & ~RT_MUTEX_HAS_WAITERS);
73}
74
75static void fixup_rt_mutex_waiters(struct rt_mutex *lock)
76{
77 if (!rt_mutex_has_waiters(lock))
78 clear_rt_mutex_waiters(lock);
79}
80
81/*
82 * We can speed up the acquire/release, if the architecture
83 * supports cmpxchg and if there's no debugging state to be set up
84 */
85#if defined(__HAVE_ARCH_CMPXCHG) && !defined(CONFIG_DEBUG_RT_MUTEXES)
86# define rt_mutex_cmpxchg(l,c,n) (cmpxchg(&l->owner, c, n) == c)
87static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
88{
89 unsigned long owner, *p = (unsigned long *) &lock->owner;
90
91 do {
92 owner = *p;
93 } while (cmpxchg(p, owner, owner | RT_MUTEX_HAS_WAITERS) != owner);
94}
95#else
96# define rt_mutex_cmpxchg(l,c,n) (0)
97static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
98{
99 lock->owner = (struct task_struct *)
100 ((unsigned long)lock->owner | RT_MUTEX_HAS_WAITERS);
101}
102#endif
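
With __HAVE_ARCH_CMPXCHG and debugging off, rt_mutex_cmpxchg() above turns both acquire and release into a single atomic exchange on lock->owner, falling back to the wait_lock slowpath only when that fails. A sketch of how the fast paths use it, as if placed right after the #endif above; sketch_fast_lock()/sketch_fast_unlock() are simplified stand-ins for the fastpath helpers this file builds on top of the macro:

static inline void sketch_fast_lock(struct rt_mutex *lock,
                                    void (*slowfn)(struct rt_mutex *lock))
{
        /* free, no waiters: owner goes from NULL to current in one cmpxchg */
        if (likely(rt_mutex_cmpxchg(lock, NULL, current)))
                return;
        slowfn(lock);   /* contended: queue up under lock->wait_lock */
}

static inline void sketch_fast_unlock(struct rt_mutex *lock,
                                      void (*slowfn)(struct rt_mutex *lock))
{
        /* held by us, no waiters: owner goes back to NULL in one cmpxchg */
        if (likely(rt_mutex_cmpxchg(lock, current, NULL)))
                return;
        slowfn(lock);   /* waiters bit set: hand off under lock->wait_lock */
}
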
103
104/*
105 * Calculate task priority from the waiter list priority
106 *
107 * Return task->normal_prio when the waiter list is empty or when
108 * the waiter is not allowed to do priority boosting
109 */
110int rt_mutex_getprio(struct task_struct *task)
111{
112 if (likely(!task_has_pi_waiters(task)))
113 return task->normal_prio;
114
115 return min(task_top_pi_waiter(task)->pi_list_entry.prio,
116 task->normal_prio);
117}
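
In concrete terms (lower numbers mean higher priority): a task with normal_prio 120 whose highest-priority PI waiter sits at prio 95 reports min(95, 120) = 95 from rt_mutex_getprio(), and falls back to 120 as soon as that waiter leaves pi_waiters.
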
118
119/*
120 * Adjust the priority of a task, after its pi_waiters got modified.
121 *
122 * This can be both boosting and unboosting. task->pi_lock must be held.
123 */
124static void __rt_mutex_adjust_prio(struct task_struct *task)
125{
126 int prio = rt_mutex_getprio(task);
127
128 if (task->prio != prio)
129 rt_mutex_setprio(task, prio);
130}
131
132/*
133 * Adjust task priority (undo boosting). Called from the exit path of
134 * rt_mutex_slowunlock() and rt_mutex_slowlock().
135 *
136 * (Note: We do this outside of the protection of lock->wait_lock to
137 * allow the lock to be taken while or before we readjust the priority
138 * of task. We do not use the spin_xx_mutex() variants here as we are
139 * outside of the debug path.)
140 */
141static void rt_mutex_adjust_prio(struct task_struct *task)
142{
143 unsigned long flags;
144
145 spin_lock_irqsave(&task->pi_lock, flags);
146 __rt_mutex_adjust_prio(task);
147 spin_unlock_irqrestore(&task->pi_lock, flags);
148}
149
150/*
151 * Max number of times we'll walk the boosting chain:
152 */
153int max_lock_depth = 1024;
154
155/*
156 * Adjust the priority chain. Also used for deadlock detection.
157 * Decreases task's usage by one - may thus free the task.
158 * Returns 0 or -EDEADLK.
159 */
160static int rt_mutex_adjust_prio_chain(task_t *task,
161 int deadlock_detect,
162 struct rt_mutex *orig_lock,
163 struct rt_mutex_waiter *orig_waiter,
164 struct task_struct *top_task
165 __IP_DECL__)
166{
167 struct rt_mutex *lock;
168 struct rt_mutex_waiter *waiter, *top_waiter = orig_waiter;
169 int detect_deadlock, ret = 0, depth = 0;
170 unsigned long flags;
171
172 detect_deadlock = debug_rt_mutex_detect_deadlock(orig_waiter,
173 deadlock_detect);
174
175 /*
176 * The (de)boosting is a step by step approach with a lot of
 177 * pitfalls. We want this to be preemptible and we want to hold a
178 * maximum of two locks per step. So we have to check
179 * carefully whether things change under us.
180 */
181 again:
182 if (++depth > max_lock_depth) {
183 static int prev_max;
184
185 /*
186 * Print this only once. If the admin changes the limit,
187 * print a new message when reaching the limit again.
188 */
189 if (prev_max != max_lock_depth) {
190 prev_max = max_lock_depth;
191 printk(KERN_WARNING "Maximum lock depth %d reached "
192 "task: %s (%d)\n", max_lock_depth,
193 top_task->comm, top_task->pid);
194 }
195 put_task_struct(task);
196
197 return deadlock_detect ? -EDEADLK : 0;
198 }
199 retry:
200 /*
 201 * The task cannot go away, as we did a get_task() before!
202 */
203 spin_lock_irqsave(&task->pi_lock, flags);
204
205 waiter = task->pi_blocked_on;
206 /*
207 * Check whether the end of the boosting chain has been
208 * reached or the state of the chain has changed while we
209 * dropped the locks.
210 */
211 if (!waiter || !waiter->task)
212 goto out_unlock_pi;
213
214 if (top_waiter && (!task_has_pi_waiters(task) ||
215 top_waiter != task_top_pi_waiter(task)))
216 goto out_unlock_pi;
217
218 /*
 219 * When deadlock detection is off, we check whether further
 220 * priority adjustment is necessary.
221 */
222 if (!detect_deadlock && waiter->list_entry.prio == task->prio)
223 goto out_unlock_pi;
224
225 lock = waiter->lock;
226 if (!spin_trylock(&lock->wait_lock)) {
227 spin_unlock_irqrestore(&task->pi_lock, flags);
228 cpu_relax();
229 goto retry;
230 }
231
232 /* Deadlock detection */
233 if (lock == orig_lock || rt_mutex_owner(lock) == top_task) {
234 debug_rt_mutex_deadlock(deadlock_detect, orig_waiter, lock);
235 spin_unlock(&lock->wait_lock);
236 ret = deadlock_detect ? -EDEADLK : 0;
237 goto out_unlock_pi;
238 }
239
240 top_waiter = rt_mutex_top_waiter(lock);
241
242 /* Requeue the waiter */
243 plist_del(&waiter->list_entry, &lock->wait_list);
244 waiter->list_entry.prio = task->prio;
245 plist_add(&waiter->list_entry, &lock->wait_list);
246
247 /* Release the task */
248 spin_unlock_irqrestore(&task->pi_lock, flags);
249 put_task_struct(task);
250
251 /* Grab the next task */
252 task = rt_mutex_owner(lock);
253 spin_lock_irqsave(&task->pi_lock, flags);
254
255 if (waiter == rt_mutex_top_waiter(lock)) {
256 /* Boost the owner */
257 plist_del(&top_waiter->pi_list_entry, &task->pi_waiters);
258 waiter->pi_list_entry.prio = waiter->list_entry.prio;
259 plist_add(&waiter->pi_list_entry, &task->pi_waiters);
260 __rt_mutex_adjust_prio(task);
261
262 } else if (top_waiter == waiter) {
263 /* Deboost the owner */
264 plist_del(&waiter->pi_list_entry, &task->pi_waiters);
265 waiter = rt_mutex_top_waiter(lock);
266 waiter->pi_list_entry.prio = waiter->list_entry.prio;
267 plist_add(&waiter->pi_list_entry, &task->pi_waiters);
268 __rt_mutex_adjust_prio(task);
269 }
270
271 get_task_struct(task);
272 spin_unlock_irqrestore(&task->pi_lock, flags);
273
274 top_waiter = rt_mutex_top_waiter(lock);
275 spin_unlock(&lock->wait_lock);
276
277 if (!detect_deadlock && waiter != top_waiter)
278 goto out_put_task;
279
280 goto again;
281
282 out_unlock_pi:
283 spin_unlock_irqrestore(&task->pi_lock, flags);
284 out_put_task:
285 put_task_struct(task);
286 return ret;
287}
288
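For readers new to priority inheritance, the chain walk above is the mechanism that will back POSIX PI mutexes once user space goes through the PI-futex path. A minimal user-space sketch of the same idea - illustrative only, not part of this patch - using the standard pthread PI protocol:

#include <pthread.h>

int main(void)
{
	pthread_mutexattr_t attr;
	pthread_mutex_t m;

	pthread_mutexattr_init(&attr);
	/* Waiters boost the owner's priority while it holds the mutex. */
	pthread_mutexattr_setprotocol(&attr, PTHREAD_PRIO_INHERIT);
	pthread_mutex_init(&m, &attr);

	pthread_mutex_lock(&m);
	/* While a higher-priority thread blocks on m, the owner runs boosted. */
	pthread_mutex_unlock(&m);

	pthread_mutex_destroy(&m);
	pthread_mutexattr_destroy(&attr);
	return 0;
}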
289/*
290 * Optimization: check if we can steal the lock from the
291 * assigned pending owner [which might not have taken the
292 * lock yet]:
293 */
294static inline int try_to_steal_lock(struct rt_mutex *lock)
295{
296 struct task_struct *pendowner = rt_mutex_owner(lock);
297 struct rt_mutex_waiter *next;
298 unsigned long flags;
299
300 if (!rt_mutex_owner_pending(lock))
301 return 0;
302
303 if (pendowner == current)
304 return 1;
305
306 spin_lock_irqsave(&pendowner->pi_lock, flags);
307 if (current->prio >= pendowner->prio) {
308 spin_unlock_irqrestore(&pendowner->pi_lock, flags);
309 return 0;
310 }
311
312 /*
313 * Check if a waiter is enqueued on the pending owner's
314 * pi_waiters list. Remove it and readjust the pending owner's
315 * priority.
316 */
317 if (likely(!rt_mutex_has_waiters(lock))) {
318 spin_unlock_irqrestore(&pendowner->pi_lock, flags);
319 return 1;
320 }
321
322 /* No chain handling, pending owner is not blocked on anything: */
323 next = rt_mutex_top_waiter(lock);
324 plist_del(&next->pi_list_entry, &pendowner->pi_waiters);
325 __rt_mutex_adjust_prio(pendowner);
326 spin_unlock_irqrestore(&pendowner->pi_lock, flags);
327
328 /*
329 * We are going to steal the lock and a waiter was
330 * enqueued on the pending owner's pi_waiters queue. So
331 * we have to enqueue this waiter into the
332 * current->pi_waiters list. This covers the case
333 * where current is boosted because it holds another
334 * lock and gets unboosted because the booster is
335 * interrupted, so we would otherwise delay a waiter with
336 * higher priority than current->normal_prio.
337 *
338 * Note: in the rare case of a SCHED_OTHER task changing
339 * its priority and thus stealing the lock, next->task
340 * might be current:
341 */
342 if (likely(next->task != current)) {
343 spin_lock_irqsave(&current->pi_lock, flags);
344 plist_add(&next->pi_list_entry, &current->pi_waiters);
345 __rt_mutex_adjust_prio(current);
346 spin_unlock_irqrestore(&current->pi_lock, flags);
347 }
348 return 1;
349}
350
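The comparison in try_to_steal_lock() is easy to misread: kernel priorities are "lower value means higher priority". A tiny standalone restatement of the steal condition - an illustrative sketch, not kernel code:

/* The steal succeeds only if current is strictly higher priority,
 * i.e. has a strictly smaller ->prio value, than the pending owner. */
static inline int would_steal(int current_prio, int pendowner_prio)
{
	return current_prio < pendowner_prio;
}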
351/*
352 * Try to take an rt-mutex
353 *
354 * This fails
355 * - when the lock has a real owner
356 * - when a different pending owner exists and has higher priority than current
357 *
358 * Must be called with lock->wait_lock held.
359 */
360static int try_to_take_rt_mutex(struct rt_mutex *lock __IP_DECL__)
361{
362 /*
363 * We have to be careful here if the atomic speedups are
364 * enabled: when
365 * - no other waiter is on the lock
366 * - the lock has been released since we did the cmpxchg
367 * then the lock can be released or taken while we are doing the
368 * checks and marking the lock with RT_MUTEX_HAS_WAITERS.
369 *
370 * The atomic acquire/release aware variant of
371 * mark_rt_mutex_waiters uses a cmpxchg loop. After setting
372 * the WAITERS bit, the atomic release / acquire can not
373 * happen anymore and lock->wait_lock protects us from the
374 * non-atomic case.
375 *
376 * Note that this might set lock->owner =
377 * RT_MUTEX_HAS_WAITERS in the case the lock is not contended
378 * any more. This is fixed up when we take the ownership.
379 * This is the transitional state explained at the top of this file.
380 */
381 mark_rt_mutex_waiters(lock);
382
383 if (rt_mutex_owner(lock) && !try_to_steal_lock(lock))
384 return 0;
385
386 /* We got the lock. */
387 debug_rt_mutex_lock(lock __IP__);
388
389 rt_mutex_set_owner(lock, current, 0);
390
391 rt_mutex_deadlock_account_lock(lock, current);
392
393 return 1;
394}
395
396/*
397 * Task blocks on lock.
398 *
399 * Prepare waiter and propagate pi chain
400 *
401 * This must be called with lock->wait_lock held.
402 */
403static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
404 struct rt_mutex_waiter *waiter,
405 int detect_deadlock
406 __IP_DECL__)
407{
408 struct rt_mutex_waiter *top_waiter = waiter;
409 task_t *owner = rt_mutex_owner(lock);
410 int boost = 0, res;
411 unsigned long flags;
412
413 spin_lock_irqsave(&current->pi_lock, flags);
414 __rt_mutex_adjust_prio(current);
415 waiter->task = current;
416 waiter->lock = lock;
417 plist_node_init(&waiter->list_entry, current->prio);
418 plist_node_init(&waiter->pi_list_entry, current->prio);
419
420 /* Get the top priority waiter on the lock */
421 if (rt_mutex_has_waiters(lock))
422 top_waiter = rt_mutex_top_waiter(lock);
423 plist_add(&waiter->list_entry, &lock->wait_list);
424
425 current->pi_blocked_on = waiter;
426
427 spin_unlock_irqrestore(&current->pi_lock, flags);
428
429 if (waiter == rt_mutex_top_waiter(lock)) {
430 spin_lock_irqsave(&owner->pi_lock, flags);
431 plist_del(&top_waiter->pi_list_entry, &owner->pi_waiters);
432 plist_add(&waiter->pi_list_entry, &owner->pi_waiters);
433
434 __rt_mutex_adjust_prio(owner);
435 if (owner->pi_blocked_on) {
436 boost = 1;
437 /* gets dropped in rt_mutex_adjust_prio_chain()! */
438 get_task_struct(owner);
439 }
440 spin_unlock_irqrestore(&owner->pi_lock, flags);
441 }
442 else if (debug_rt_mutex_detect_deadlock(waiter, detect_deadlock)) {
443 spin_lock_irqsave(&owner->pi_lock, flags);
444 if (owner->pi_blocked_on) {
445 boost = 1;
446 /* gets dropped in rt_mutex_adjust_prio_chain()! */
447 get_task_struct(owner);
448 }
449 spin_unlock_irqrestore(&owner->pi_lock, flags);
450 }
451 if (!boost)
452 return 0;
453
454 spin_unlock(&lock->wait_lock);
455
456 res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock, waiter,
457 current __IP__);
458
459 spin_lock(&lock->wait_lock);
460
461 return res;
462}
463
464/*
465 * Wake up the next waiter on the lock.
466 *
467 * Remove the top waiter from the current task's waiter list and from
468 * the lock waiter list. Set it as pending owner. Then wake it up.
469 *
470 * Called with lock->wait_lock held.
471 */
472static void wakeup_next_waiter(struct rt_mutex *lock)
473{
474 struct rt_mutex_waiter *waiter;
475 struct task_struct *pendowner;
476 unsigned long flags;
477
478 spin_lock_irqsave(&current->pi_lock, flags);
479
480 waiter = rt_mutex_top_waiter(lock);
481 plist_del(&waiter->list_entry, &lock->wait_list);
482
483 /*
484 * Remove it from current->pi_waiters. We do not adjust a
485 * possible priority boost right now. We execute wakeup in the
486 * boosted mode and go back to normal after releasing
487 * lock->wait_lock.
488 */
489 plist_del(&waiter->pi_list_entry, &current->pi_waiters);
490 pendowner = waiter->task;
491 waiter->task = NULL;
492
493 rt_mutex_set_owner(lock, pendowner, RT_MUTEX_OWNER_PENDING);
494
495 spin_unlock_irqrestore(&current->pi_lock, flags);
496
497 /*
498 * Clear the pi_blocked_on variable and enqueue a possible
499 * waiter into the pi_waiters list of the pending owner. This
500 * prevents a situation where, if the pending owner gets unboosted,
501 * a waiter with higher priority than pending-owner->normal_prio
502 * stays blocked on the unboosted (pending) owner.
503 */
504 spin_lock_irqsave(&pendowner->pi_lock, flags);
505
506 WARN_ON(!pendowner->pi_blocked_on);
507 WARN_ON(pendowner->pi_blocked_on != waiter);
508 WARN_ON(pendowner->pi_blocked_on->lock != lock);
509
510 pendowner->pi_blocked_on = NULL;
511
512 if (rt_mutex_has_waiters(lock)) {
513 struct rt_mutex_waiter *next;
514
515 next = rt_mutex_top_waiter(lock);
516 plist_add(&next->pi_list_entry, &pendowner->pi_waiters);
517 }
518 spin_unlock_irqrestore(&pendowner->pi_lock, flags);
519
520 wake_up_process(pendowner);
521}
522
523/*
524 * Remove a waiter from a lock
525 *
526 * Must be called with lock->wait_lock held
527 */
528static void remove_waiter(struct rt_mutex *lock,
529 struct rt_mutex_waiter *waiter __IP_DECL__)
530{
531 int first = (waiter == rt_mutex_top_waiter(lock));
532 int boost = 0;
533 task_t *owner = rt_mutex_owner(lock);
534 unsigned long flags;
535
536 spin_lock_irqsave(&current->pi_lock, flags);
537 plist_del(&waiter->list_entry, &lock->wait_list);
538 waiter->task = NULL;
539 current->pi_blocked_on = NULL;
540 spin_unlock_irqrestore(&current->pi_lock, flags);
541
542 if (first && owner != current) {
543
544 spin_lock_irqsave(&owner->pi_lock, flags);
545
546 plist_del(&waiter->pi_list_entry, &owner->pi_waiters);
547
548 if (rt_mutex_has_waiters(lock)) {
549 struct rt_mutex_waiter *next;
550
551 next = rt_mutex_top_waiter(lock);
552 plist_add(&next->pi_list_entry, &owner->pi_waiters);
553 }
554 __rt_mutex_adjust_prio(owner);
555
556 if (owner->pi_blocked_on) {
557 boost = 1;
558 /* gets dropped in rt_mutex_adjust_prio_chain()! */
559 get_task_struct(owner);
560 }
561 spin_unlock_irqrestore(&owner->pi_lock, flags);
562 }
563
564 WARN_ON(!plist_node_empty(&waiter->pi_list_entry));
565
566 if (!boost)
567 return;
568
569 spin_unlock(&lock->wait_lock);
570
571 rt_mutex_adjust_prio_chain(owner, 0, lock, NULL, current __IP__);
572
573 spin_lock(&lock->wait_lock);
574}
575
576/*
577 * Recheck the pi chain, in case the task's priority was changed
578 *
579 * Called from sched_setscheduler
580 */
581void rt_mutex_adjust_pi(struct task_struct *task)
582{
583 struct rt_mutex_waiter *waiter;
584 unsigned long flags;
585
586 spin_lock_irqsave(&task->pi_lock, flags);
587
588 waiter = task->pi_blocked_on;
589 if (!waiter || waiter->list_entry.prio == task->prio) {
590 spin_unlock_irqrestore(&task->pi_lock, flags);
591 return;
592 }
593
594 /* gets dropped in rt_mutex_adjust_prio_chain()! */
595 get_task_struct(task);
596 spin_unlock_irqrestore(&task->pi_lock, flags);
597
598 rt_mutex_adjust_prio_chain(task, 0, NULL, NULL, task __RET_IP__);
599}
600
601/*
602 * Slow path lock function:
603 */
604static int __sched
605rt_mutex_slowlock(struct rt_mutex *lock, int state,
606 struct hrtimer_sleeper *timeout,
607 int detect_deadlock __IP_DECL__)
608{
609 struct rt_mutex_waiter waiter;
610 int ret = 0;
611
612 debug_rt_mutex_init_waiter(&waiter);
613 waiter.task = NULL;
614
615 spin_lock(&lock->wait_lock);
616
617 /* Try to acquire the lock again: */
618 if (try_to_take_rt_mutex(lock __IP__)) {
619 spin_unlock(&lock->wait_lock);
620 return 0;
621 }
622
623 set_current_state(state);
624
625 /* Setup the timer, when timeout != NULL */
626 if (unlikely(timeout))
627 hrtimer_start(&timeout->timer, timeout->timer.expires,
628 HRTIMER_ABS);
629
630 for (;;) {
631 /* Try to acquire the lock: */
632 if (try_to_take_rt_mutex(lock __IP__))
633 break;
634
635 /*
636 * TASK_INTERRUPTIBLE checks for signals and
637 * timeout. Ignored otherwise.
638 */
639 if (unlikely(state == TASK_INTERRUPTIBLE)) {
640 /* Signal pending? */
641 if (signal_pending(current))
642 ret = -EINTR;
643 if (timeout && !timeout->task)
644 ret = -ETIMEDOUT;
645 if (ret)
646 break;
647 }
648
649 /*
650 * waiter.task is NULL the first time we come here and
651 * when we have been woken up by the previous owner
652 * but the lock got stolen by a higher prio task.
653 */
654 if (!waiter.task) {
655 ret = task_blocks_on_rt_mutex(lock, &waiter,
656 detect_deadlock __IP__);
657 /*
658 * If we got woken up by the owner then start loop
659 * all over without going into schedule to try
660 * to get the lock now:
661 */
662 if (unlikely(!waiter.task))
663 continue;
664
665 if (unlikely(ret))
666 break;
667 }
668
669 spin_unlock(&lock->wait_lock);
670
671 debug_rt_mutex_print_deadlock(&waiter);
672
673 if (waiter.task)
674 schedule_rt_mutex(lock);
675
676 spin_lock(&lock->wait_lock);
677 set_current_state(state);
678 }
679
680 set_current_state(TASK_RUNNING);
681
682 if (unlikely(waiter.task))
683 remove_waiter(lock, &waiter __IP__);
684
685 /*
686 * try_to_take_rt_mutex() sets the waiter bit
687 * unconditionally. We might have to fix that up.
688 */
689 fixup_rt_mutex_waiters(lock);
690
691 spin_unlock(&lock->wait_lock);
692
693 /* Remove pending timer: */
694 if (unlikely(timeout))
695 hrtimer_cancel(&timeout->timer);
696
697 /*
698 * Readjust priority when we did not get the lock. We might
699 * have been the pending owner and boosted. Since we did not
700 * take the lock, the PI boost has to go.
701 */
702 if (unlikely(ret))
703 rt_mutex_adjust_prio(current);
704
705 debug_rt_mutex_free_waiter(&waiter);
706
707 return ret;
708}
709
710/*
711 * Slow path try-lock function:
712 */
713static inline int
714rt_mutex_slowtrylock(struct rt_mutex *lock __IP_DECL__)
715{
716 int ret = 0;
717
718 spin_lock(&lock->wait_lock);
719
720 if (likely(rt_mutex_owner(lock) != current)) {
721
722 ret = try_to_take_rt_mutex(lock __IP__);
723 /*
724 * try_to_take_rt_mutex() sets the lock waiters
725 * bit unconditionally. Clean this up.
726 */
727 fixup_rt_mutex_waiters(lock);
728 }
729
730 spin_unlock(&lock->wait_lock);
731
732 return ret;
733}
734
735/*
736 * Slow path to release a rt-mutex:
737 */
738static void __sched
739rt_mutex_slowunlock(struct rt_mutex *lock)
740{
741 spin_lock(&lock->wait_lock);
742
743 debug_rt_mutex_unlock(lock);
744
745 rt_mutex_deadlock_account_unlock(current);
746
747 if (!rt_mutex_has_waiters(lock)) {
748 lock->owner = NULL;
749 spin_unlock(&lock->wait_lock);
750 return;
751 }
752
753 wakeup_next_waiter(lock);
754
755 spin_unlock(&lock->wait_lock);
756
757 /* Undo pi boosting if necessary: */
758 rt_mutex_adjust_prio(current);
759}
760
761/*
762 * Debug-aware fast / slow path lock, trylock and unlock functions.
763 *
764 * The atomic acquire/release ops are compiled away when either the
765 * architecture does not support cmpxchg or when debugging is enabled.
766 */
767static inline int
768rt_mutex_fastlock(struct rt_mutex *lock, int state,
769 int detect_deadlock,
770 int (*slowfn)(struct rt_mutex *lock, int state,
771 struct hrtimer_sleeper *timeout,
772 int detect_deadlock __IP_DECL__))
773{
774 if (!detect_deadlock && likely(rt_mutex_cmpxchg(lock, NULL, current))) {
775 rt_mutex_deadlock_account_lock(lock, current);
776 return 0;
777 } else
778 return slowfn(lock, state, NULL, detect_deadlock __RET_IP__);
779}
780
781static inline int
782rt_mutex_timed_fastlock(struct rt_mutex *lock, int state,
783 struct hrtimer_sleeper *timeout, int detect_deadlock,
784 int (*slowfn)(struct rt_mutex *lock, int state,
785 struct hrtimer_sleeper *timeout,
786 int detect_deadlock __IP_DECL__))
787{
788 if (!detect_deadlock && likely(rt_mutex_cmpxchg(lock, NULL, current))) {
789 rt_mutex_deadlock_account_lock(lock, current);
790 return 0;
791 } else
792 return slowfn(lock, state, timeout, detect_deadlock __RET_IP__);
793}
794
795static inline int
796rt_mutex_fasttrylock(struct rt_mutex *lock,
797 int (*slowfn)(struct rt_mutex *lock __IP_DECL__))
798{
799 if (likely(rt_mutex_cmpxchg(lock, NULL, current))) {
800 rt_mutex_deadlock_account_lock(lock, current);
801 return 1;
802 }
803 return slowfn(lock __RET_IP__);
804}
805
806static inline void
807rt_mutex_fastunlock(struct rt_mutex *lock,
808 void (*slowfn)(struct rt_mutex *lock))
809{
810 if (likely(rt_mutex_cmpxchg(lock, current, NULL)))
811 rt_mutex_deadlock_account_unlock(current);
812 else
813 slowfn(lock);
814}
815
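The fast paths above all hinge on one atomic exchange of lock->owner: NULL -> current for lock, current -> NULL for unlock, falling back to the slow path otherwise (or whenever deadlock detection is requested, since that always needs the slow path). A rough user-space analogue of that pattern using C11 atomics - a sketch of the idea, not the kernel implementation:

#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>

struct fastlock {
	_Atomic(void *) owner;	/* NULL == unlocked */
};

/* Uncontended fast path: try to swing owner from NULL to me. */
static bool fastlock_trylock(struct fastlock *l, void *me)
{
	void *expected = NULL;
	return atomic_compare_exchange_strong(&l->owner, &expected, me);
}

/* Fast-path unlock: owner me -> NULL; if this fails there are waiters
 * and a slow path would have to hand the lock over. */
static bool fastlock_fastunlock(struct fastlock *l, void *me)
{
	void *expected = me;
	return atomic_compare_exchange_strong(&l->owner, &expected, NULL);
}

int main(void)
{
	struct fastlock l = { ATOMIC_VAR_INIT(NULL) };
	int me;

	if (fastlock_trylock(&l, &me))
		fastlock_fastunlock(&l, &me);
	return 0;
}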
816/**
817 * rt_mutex_lock - lock a rt_mutex
818 *
819 * @lock: the rt_mutex to be locked
820 */
821void __sched rt_mutex_lock(struct rt_mutex *lock)
822{
823 might_sleep();
824
825 rt_mutex_fastlock(lock, TASK_UNINTERRUPTIBLE, 0, rt_mutex_slowlock);
826}
827EXPORT_SYMBOL_GPL(rt_mutex_lock);
828
829/**
830 * rt_mutex_lock_interruptible - lock a rt_mutex interruptibly
831 *
832 * @lock: the rt_mutex to be locked
833 * @detect_deadlock: deadlock detection on/off
834 *
835 * Returns:
836 * 0 on success
837 * -EINTR when interrupted by a signal
838 * -EDEADLK when the lock would deadlock (when deadlock detection is on)
839 */
840int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock,
841 int detect_deadlock)
842{
843 might_sleep();
844
845 return rt_mutex_fastlock(lock, TASK_INTERRUPTIBLE,
846 detect_deadlock, rt_mutex_slowlock);
847}
848EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible);
849
850/**
851 * rt_mutex_timed_lock - lock a rt_mutex interruptibly, with a timeout;
852 * the timeout structure is provided
853 * by the caller
854 *
855 * @lock: the rt_mutex to be locked
856 * @timeout: timeout structure or NULL (no timeout)
857 * @detect_deadlock: deadlock detection on/off
858 *
859 * Returns:
860 * 0 on success
861 * -EINTR when interrupted by a signal
862 * -ETIMEDOUT when the timeout expired
863 * -EDEADLK when the lock would deadlock (when deadlock detection is on)
864 */
865int
866rt_mutex_timed_lock(struct rt_mutex *lock, struct hrtimer_sleeper *timeout,
867 int detect_deadlock)
868{
869 might_sleep();
870
871 return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout,
872 detect_deadlock, rt_mutex_slowlock);
873}
874EXPORT_SYMBOL_GPL(rt_mutex_timed_lock);
875
876/**
877 * rt_mutex_trylock - try to lock a rt_mutex
878 *
879 * @lock: the rt_mutex to be locked
880 *
881 * Returns 1 on success and 0 on contention
882 */
883int __sched rt_mutex_trylock(struct rt_mutex *lock)
884{
885 return rt_mutex_fasttrylock(lock, rt_mutex_slowtrylock);
886}
887EXPORT_SYMBOL_GPL(rt_mutex_trylock);
888
889/**
890 * rt_mutex_unlock - unlock a rt_mutex
891 *
892 * @lock: the rt_mutex to be unlocked
893 */
894void __sched rt_mutex_unlock(struct rt_mutex *lock)
895{
896 rt_mutex_fastunlock(lock, rt_mutex_slowunlock);
897}
898EXPORT_SYMBOL_GPL(rt_mutex_unlock);
899
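Putting the exported API together, kernel code using these locks looks roughly like the sketch below. The module names (my_lock, my_init) are hypothetical and error handling is elided; the functions are the ones added above, and <linux/rtmutex.h> is the public header the rt-mutex code provides:

#include <linux/init.h>
#include <linux/module.h>
#include <linux/rtmutex.h>

static struct rt_mutex my_lock;

static int __init my_init(void)
{
	__rt_mutex_init(&my_lock, "my_lock");

	rt_mutex_lock(&my_lock);
	/* Critical section: higher-priority waiters boost us here. */
	rt_mutex_unlock(&my_lock);

	/* Interruptible variant, deadlock detection disabled: */
	if (rt_mutex_lock_interruptible(&my_lock, 0) == 0)
		rt_mutex_unlock(&my_lock);

	return 0;
}
module_init(my_init);
MODULE_LICENSE("GPL");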
900/***
901 * rt_mutex_destroy - mark a mutex unusable
902 * @lock: the mutex to be destroyed
903 *
904 * This function marks the mutex uninitialized, and any subsequent
905 * use of the mutex is forbidden. The mutex must not be locked when
906 * this function is called.
907 */
908void rt_mutex_destroy(struct rt_mutex *lock)
909{
910 WARN_ON(rt_mutex_is_locked(lock));
911#ifdef CONFIG_DEBUG_RT_MUTEXES
912 lock->magic = NULL;
913#endif
914}
915
916EXPORT_SYMBOL_GPL(rt_mutex_destroy);
917
918/**
919 * __rt_mutex_init - initialize the rt lock
920 *
921 * @lock: the rt lock to be initialized
922 *
923 * Initialize the rt lock to unlocked state.
924 *
925 * Initializing a locked rt lock is not allowed.
926 */
927void __rt_mutex_init(struct rt_mutex *lock, const char *name)
928{
929 lock->owner = NULL;
930 spin_lock_init(&lock->wait_lock);
931 plist_head_init(&lock->wait_list, &lock->wait_lock);
932
933 debug_rt_mutex_init(lock, name);
934}
935EXPORT_SYMBOL_GPL(__rt_mutex_init);
936
937/**
938 * rt_mutex_init_proxy_locked - initialize and lock a rt_mutex on behalf of a
939 * proxy owner
940 *
941 * @lock: the rt_mutex to be locked
942 * @proxy_owner: the task to set as owner
943 *
944 * No locking. The caller has to serialize access itself.
945 * Special API call for PI-futex support
946 */
947void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
948 struct task_struct *proxy_owner)
949{
950 __rt_mutex_init(lock, NULL);
951 debug_rt_mutex_proxy_lock(lock, proxy_owner __RET_IP__);
952 rt_mutex_set_owner(lock, proxy_owner, 0);
953 rt_mutex_deadlock_account_lock(lock, proxy_owner);
954}
955
956/**
957 * rt_mutex_proxy_unlock - release a lock on behalf of owner
958 *
959 * @lock: the rt_mutex to be unlocked
960 *
961 * No locking. The caller has to serialize access itself.
962 * Special API call for PI-futex support
963 */
964void rt_mutex_proxy_unlock(struct rt_mutex *lock,
965 struct task_struct *proxy_owner)
966{
967 debug_rt_mutex_proxy_unlock(lock);
968 rt_mutex_set_owner(lock, NULL, 0);
969 rt_mutex_deadlock_account_unlock(proxy_owner);
970}
971
972/**
973 * rt_mutex_next_owner - return the next owner of the lock
974 *
975 * @lock: the rt lock to query
976 *
977 * Returns the next owner of the lock or NULL
978 *
979 * Caller has to serialize against other accessors to the lock
980 * itself.
981 *
982 * Special API call for PI-futex support
983 */
984struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock)
985{
986 if (!rt_mutex_has_waiters(lock))
987 return NULL;
988
989 return rt_mutex_top_waiter(lock)->task;
990}
diff --git a/kernel/rtmutex.h b/kernel/rtmutex.h
new file mode 100644
index 000000000000..1e0fca13ff72
--- /dev/null
+++ b/kernel/rtmutex.h
@@ -0,0 +1,29 @@
1/*
2 * RT-Mutexes: blocking mutual exclusion locks with PI support
3 *
4 * started by Ingo Molnar and Thomas Gleixner:
5 *
6 * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
7 * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
8 *
9 * This file contains macros used solely by rtmutex.c.
10 * Non-debug version.
11 */
12
13#define __IP_DECL__
14#define __IP__
15#define __RET_IP__
16#define rt_mutex_deadlock_check(l) (0)
17#define rt_mutex_deadlock_account_lock(m, t) do { } while (0)
18#define rt_mutex_deadlock_account_unlock(l) do { } while (0)
19#define debug_rt_mutex_init_waiter(w) do { } while (0)
20#define debug_rt_mutex_free_waiter(w) do { } while (0)
21#define debug_rt_mutex_lock(l) do { } while (0)
22#define debug_rt_mutex_proxy_lock(l,p) do { } while (0)
23#define debug_rt_mutex_proxy_unlock(l) do { } while (0)
24#define debug_rt_mutex_unlock(l) do { } while (0)
25#define debug_rt_mutex_init(m, n) do { } while (0)
26#define debug_rt_mutex_deadlock(d, a ,l) do { } while (0)
27#define debug_rt_mutex_print_deadlock(w) do { } while (0)
28#define debug_rt_mutex_detect_deadlock(w,d) (d)
29#define debug_rt_mutex_reset_waiter(w) do { } while (0)
diff --git a/kernel/rtmutex_common.h b/kernel/rtmutex_common.h
new file mode 100644
index 000000000000..9c75856e791e
--- /dev/null
+++ b/kernel/rtmutex_common.h
@@ -0,0 +1,123 @@
1/*
2 * RT Mutexes: blocking mutual exclusion locks with PI support
3 *
4 * started by Ingo Molnar and Thomas Gleixner:
5 *
6 * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
7 * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
8 *
9 * This file contains the private data structure and API definitions.
10 */
11
12#ifndef __KERNEL_RTMUTEX_COMMON_H
13#define __KERNEL_RTMUTEX_COMMON_H
14
15#include <linux/rtmutex.h>
16
17/*
18 * The in-kernel rtmutex tester is independent of rtmutex debugging. We
19 * call schedule_rt_mutex_test() instead of schedule() for the tasks which
20 * belong to the tester. That way we can delay the wakeup path of those
21 * threads to provoke lock stealing and testing of complex boosting scenarios.
22 */
23#ifdef CONFIG_RT_MUTEX_TESTER
24
25extern void schedule_rt_mutex_test(struct rt_mutex *lock);
26
27#define schedule_rt_mutex(_lock) \
28 do { \
29 if (!(current->flags & PF_MUTEX_TESTER)) \
30 schedule(); \
31 else \
32 schedule_rt_mutex_test(_lock); \
33 } while (0)
34
35#else
36# define schedule_rt_mutex(_lock) schedule()
37#endif
38
39/*
40 * This is the control structure for tasks blocked on a rt_mutex,
41 * which is allocated on the kernel stack of the blocked task.
42 *
43 * @list_entry: pi node to enqueue into the mutex waiters list
44 * @pi_list_entry: pi node to enqueue into the mutex owner waiters list
45 * @task: task reference to the blocked task
46 */
47struct rt_mutex_waiter {
48 struct plist_node list_entry;
49 struct plist_node pi_list_entry;
50 struct task_struct *task;
51 struct rt_mutex *lock;
52#ifdef CONFIG_DEBUG_RT_MUTEXES
53 unsigned long ip;
54 pid_t deadlock_task_pid;
55 struct rt_mutex *deadlock_lock;
56#endif
57};
58
59/*
60 * Various helpers to access the waiters-plist:
61 */
62static inline int rt_mutex_has_waiters(struct rt_mutex *lock)
63{
64 return !plist_head_empty(&lock->wait_list);
65}
66
67static inline struct rt_mutex_waiter *
68rt_mutex_top_waiter(struct rt_mutex *lock)
69{
70 struct rt_mutex_waiter *w;
71
72 w = plist_first_entry(&lock->wait_list, struct rt_mutex_waiter,
73 list_entry);
74 BUG_ON(w->lock != lock);
75
76 return w;
77}
78
79static inline int task_has_pi_waiters(struct task_struct *p)
80{
81 return !plist_head_empty(&p->pi_waiters);
82}
83
84static inline struct rt_mutex_waiter *
85task_top_pi_waiter(struct task_struct *p)
86{
87 return plist_first_entry(&p->pi_waiters, struct rt_mutex_waiter,
88 pi_list_entry);
89}
90
91/*
92 * lock->owner state tracking:
93 */
94#define RT_MUTEX_OWNER_PENDING 1UL
95#define RT_MUTEX_HAS_WAITERS 2UL
96#define RT_MUTEX_OWNER_MASKALL 3UL
97
98static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock)
99{
100 return (struct task_struct *)
101 ((unsigned long)lock->owner & ~RT_MUTEX_OWNER_MASKALL);
102}
103
104static inline struct task_struct *rt_mutex_real_owner(struct rt_mutex *lock)
105{
106 return (struct task_struct *)
107 ((unsigned long)lock->owner & ~RT_MUTEX_HAS_WAITERS);
108}
109
110static inline unsigned long rt_mutex_owner_pending(struct rt_mutex *lock)
111{
112 return (unsigned long)lock->owner & RT_MUTEX_OWNER_PENDING;
113}
114
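The owner-state encoding above works because task_struct pointers are at least word aligned, which leaves the two low bits of lock->owner free for the PENDING and HAS_WAITERS flags. A standalone sketch of the packing and unpacking - illustrative only, with a fake task type and the flag values restated from above:

#include <stdio.h>

#define RT_MUTEX_OWNER_PENDING	1UL
#define RT_MUTEX_HAS_WAITERS	2UL
#define RT_MUTEX_OWNER_MASKALL	3UL

struct fake_task { int dummy; } __attribute__((aligned(4)));

int main(void)
{
	static struct fake_task t;
	/* Pack: owner pointer plus state bits in the low two bits. */
	unsigned long owner = (unsigned long)&t
			      | RT_MUTEX_HAS_WAITERS | RT_MUTEX_OWNER_PENDING;

	/* Unpack, as rt_mutex_owner() does: mask the state bits off again. */
	struct fake_task *p =
		(struct fake_task *)(owner & ~RT_MUTEX_OWNER_MASKALL);

	printf("owner=%p pending=%lu waiters=%lu\n", (void *)p,
	       owner & RT_MUTEX_OWNER_PENDING,
	       (owner & RT_MUTEX_HAS_WAITERS) >> 1);
	return 0;
}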
115/*
116 * PI-futex support (proxy locking functions, etc.):
117 */
118extern struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock);
119extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
120 struct task_struct *proxy_owner);
121extern void rt_mutex_proxy_unlock(struct rt_mutex *lock,
122 struct task_struct *proxy_owner);
123#endif
diff --git a/kernel/sched.c b/kernel/sched.c
index c13f1bd2df7d..d5e37072ea54 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -168,15 +168,21 @@
168 */ 168 */
169 169
170#define SCALE_PRIO(x, prio) \ 170#define SCALE_PRIO(x, prio) \
171 max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO/2), MIN_TIMESLICE) 171 max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE)
172 172
173static unsigned int task_timeslice(task_t *p) 173static unsigned int static_prio_timeslice(int static_prio)
174{ 174{
175 if (p->static_prio < NICE_TO_PRIO(0)) 175 if (static_prio < NICE_TO_PRIO(0))
176 return SCALE_PRIO(DEF_TIMESLICE*4, p->static_prio); 176 return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio);
177 else 177 else
178 return SCALE_PRIO(DEF_TIMESLICE, p->static_prio); 178 return SCALE_PRIO(DEF_TIMESLICE, static_prio);
179} 179}
180
181static inline unsigned int task_timeslice(task_t *p)
182{
183 return static_prio_timeslice(p->static_prio);
184}
185
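To get a feel for the numbers, here is a standalone sketch of the timeslice arithmetic. The constants (DEF_TIMESLICE = 100 ms, MIN_TIMESLICE = 5 ms, MAX_PRIO = 140, MAX_USER_PRIO = 40, NICE_TO_PRIO(0) = 120) are assumed values restated here only for illustration; the real definitions live elsewhere in sched.c/sched.h:

#include <stdio.h>

#define MIN_TIMESLICE_MS	  5
#define DEF_TIMESLICE_MS	100
#define MAX_PRIO		140
#define MAX_USER_PRIO		 40
#define PRIO_NICE0		120	/* NICE_TO_PRIO(0) */

static int scale_prio(int x, int prio)
{
	int t = x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2);
	return t > MIN_TIMESLICE_MS ? t : MIN_TIMESLICE_MS;
}

static int static_prio_timeslice_ms(int static_prio)
{
	if (static_prio < PRIO_NICE0)
		return scale_prio(DEF_TIMESLICE_MS * 4, static_prio);
	return scale_prio(DEF_TIMESLICE_MS, static_prio);
}

int main(void)
{
	printf("nice -20 (prio 100): %d ms\n", static_prio_timeslice_ms(100)); /* 800 */
	printf("nice   0 (prio 120): %d ms\n", static_prio_timeslice_ms(120)); /* 100 */
	printf("nice  19 (prio 139): %d ms\n", static_prio_timeslice_ms(139)); /*   5 */
	return 0;
}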
180#define task_hot(p, now, sd) ((long long) ((now) - (p)->last_ran) \ 186#define task_hot(p, now, sd) ((long long) ((now) - (p)->last_ran) \
181 < (long long) (sd)->cache_hot_time) 187 < (long long) (sd)->cache_hot_time)
182 188
@@ -184,13 +190,11 @@ static unsigned int task_timeslice(task_t *p)
184 * These are the runqueue data structures: 190 * These are the runqueue data structures:
185 */ 191 */
186 192
187#define BITMAP_SIZE ((((MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long))
188
189typedef struct runqueue runqueue_t; 193typedef struct runqueue runqueue_t;
190 194
191struct prio_array { 195struct prio_array {
192 unsigned int nr_active; 196 unsigned int nr_active;
193 unsigned long bitmap[BITMAP_SIZE]; 197 DECLARE_BITMAP(bitmap, MAX_PRIO+1); /* include 1 bit for delimiter */
194 struct list_head queue[MAX_PRIO]; 198 struct list_head queue[MAX_PRIO];
195}; 199};
196 200
@@ -209,6 +213,7 @@ struct runqueue {
209 * remote CPUs use both these fields when doing load calculation. 213 * remote CPUs use both these fields when doing load calculation.
210 */ 214 */
211 unsigned long nr_running; 215 unsigned long nr_running;
216 unsigned long raw_weighted_load;
212#ifdef CONFIG_SMP 217#ifdef CONFIG_SMP
213 unsigned long cpu_load[3]; 218 unsigned long cpu_load[3];
214#endif 219#endif
@@ -239,7 +244,6 @@ struct runqueue {
239 244
240 task_t *migration_thread; 245 task_t *migration_thread;
241 struct list_head migration_queue; 246 struct list_head migration_queue;
242 int cpu;
243#endif 247#endif
244 248
245#ifdef CONFIG_SCHEDSTATS 249#ifdef CONFIG_SCHEDSTATS
@@ -351,11 +355,30 @@ static inline void finish_lock_switch(runqueue_t *rq, task_t *prev)
351#endif /* __ARCH_WANT_UNLOCKED_CTXSW */ 355#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
352 356
353/* 357/*
358 * __task_rq_lock - lock the runqueue a given task resides on.
359 * Must be called interrupts disabled.
360 */
361static inline runqueue_t *__task_rq_lock(task_t *p)
362 __acquires(rq->lock)
363{
364 struct runqueue *rq;
365
366repeat_lock_task:
367 rq = task_rq(p);
368 spin_lock(&rq->lock);
369 if (unlikely(rq != task_rq(p))) {
370 spin_unlock(&rq->lock);
371 goto repeat_lock_task;
372 }
373 return rq;
374}
375
376/*
354 * task_rq_lock - lock the runqueue a given task resides on and disable 377 * task_rq_lock - lock the runqueue a given task resides on and disable
355 * interrupts. Note the ordering: we can safely lookup the task_rq without 378 * interrupts. Note the ordering: we can safely lookup the task_rq without
356 * explicitly disabling preemption. 379 * explicitly disabling preemption.
357 */ 380 */
358static inline runqueue_t *task_rq_lock(task_t *p, unsigned long *flags) 381static runqueue_t *task_rq_lock(task_t *p, unsigned long *flags)
359 __acquires(rq->lock) 382 __acquires(rq->lock)
360{ 383{
361 struct runqueue *rq; 384 struct runqueue *rq;
@@ -371,6 +394,12 @@ repeat_lock_task:
371 return rq; 394 return rq;
372} 395}
373 396
397static inline void __task_rq_unlock(runqueue_t *rq)
398 __releases(rq->lock)
399{
400 spin_unlock(&rq->lock);
401}
402
374static inline void task_rq_unlock(runqueue_t *rq, unsigned long *flags) 403static inline void task_rq_unlock(runqueue_t *rq, unsigned long *flags)
375 __releases(rq->lock) 404 __releases(rq->lock)
376{ 405{
@@ -634,7 +663,7 @@ static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array)
634} 663}
635 664
636/* 665/*
637 * effective_prio - return the priority that is based on the static 666 * __normal_prio - return the priority that is based on the static
638 * priority but is modified by bonuses/penalties. 667 * priority but is modified by bonuses/penalties.
639 * 668 *
640 * We scale the actual sleep average [0 .... MAX_SLEEP_AVG] 669 * We scale the actual sleep average [0 .... MAX_SLEEP_AVG]
@@ -647,13 +676,11 @@ static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array)
647 * 676 *
648 * Both properties are important to certain workloads. 677 * Both properties are important to certain workloads.
649 */ 678 */
650static int effective_prio(task_t *p) 679
680static inline int __normal_prio(task_t *p)
651{ 681{
652 int bonus, prio; 682 int bonus, prio;
653 683
654 if (rt_task(p))
655 return p->prio;
656
657 bonus = CURRENT_BONUS(p) - MAX_BONUS / 2; 684 bonus = CURRENT_BONUS(p) - MAX_BONUS / 2;
658 685
659 prio = p->static_prio - bonus; 686 prio = p->static_prio - bonus;
@@ -665,6 +692,106 @@ static int effective_prio(task_t *p)
665} 692}
666 693
667/* 694/*
695 * To aid in avoiding the subversion of "niceness" due to uneven distribution
696 * of tasks with abnormal "nice" values across CPUs the contribution that
697 * each task makes to its run queue's load is weighted according to its
698 * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a
699 * scaled version of the new time slice allocation that they receive on time
700 * slice expiry etc.
701 */
702
703/*
704 * Assume: static_prio_timeslice(NICE_TO_PRIO(0)) == DEF_TIMESLICE
705 * If static_prio_timeslice() is ever changed to break this assumption then
706 * this code will need modification
707 */
708#define TIME_SLICE_NICE_ZERO DEF_TIMESLICE
709#define LOAD_WEIGHT(lp) \
710 (((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO)
711#define PRIO_TO_LOAD_WEIGHT(prio) \
712 LOAD_WEIGHT(static_prio_timeslice(prio))
713#define RTPRIO_TO_LOAD_WEIGHT(rp) \
714 (PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + LOAD_WEIGHT(rp))
715
716static void set_load_weight(task_t *p)
717{
718 if (has_rt_policy(p)) {
719#ifdef CONFIG_SMP
720 if (p == task_rq(p)->migration_thread)
721 /*
722 * The migration thread does the actual balancing.
723 * Giving its load any weight will skew balancing
724 * adversely.
725 */
726 p->load_weight = 0;
727 else
728#endif
729 p->load_weight = RTPRIO_TO_LOAD_WEIGHT(p->rt_priority);
730 } else
731 p->load_weight = PRIO_TO_LOAD_WEIGHT(p->static_prio);
732}
733
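Worked example of the weights, in units of SCHED_LOAD_SCALE and using the timeslice values assumed in the sketch above: a nice 0 task has static_prio_timeslice() == DEF_TIMESLICE == TIME_SLICE_NICE_ZERO, so LOAD_WEIGHT() yields exactly 1 * SCHED_LOAD_SCALE; a nice -20 task, whose slice works out to 8 * DEF_TIMESLICE, weighs 8 * SCHED_LOAD_SCALE; a nice 19 task, clamped to the minimum slice, weighs roughly SCHED_LOAD_SCALE / 20. Real-time tasks sit above all of these via RTPRIO_TO_LOAD_WEIGHT(), except for the migration thread, which is deliberately given weight 0 so it does not skew balancing.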
734static inline void inc_raw_weighted_load(runqueue_t *rq, const task_t *p)
735{
736 rq->raw_weighted_load += p->load_weight;
737}
738
739static inline void dec_raw_weighted_load(runqueue_t *rq, const task_t *p)
740{
741 rq->raw_weighted_load -= p->load_weight;
742}
743
744static inline void inc_nr_running(task_t *p, runqueue_t *rq)
745{
746 rq->nr_running++;
747 inc_raw_weighted_load(rq, p);
748}
749
750static inline void dec_nr_running(task_t *p, runqueue_t *rq)
751{
752 rq->nr_running--;
753 dec_raw_weighted_load(rq, p);
754}
755
756/*
757 * Calculate the expected normal priority: i.e. priority
758 * without taking RT-inheritance into account. Might be
759 * boosted by interactivity modifiers. Changes upon fork,
760 * setprio syscalls, and whenever the interactivity
761 * estimator recalculates.
762 */
763static inline int normal_prio(task_t *p)
764{
765 int prio;
766
767 if (has_rt_policy(p))
768 prio = MAX_RT_PRIO-1 - p->rt_priority;
769 else
770 prio = __normal_prio(p);
771 return prio;
772}
773
774/*
775 * Calculate the current priority, i.e. the priority
776 * taken into account by the scheduler. This value might
777 * be boosted by RT tasks, or might be boosted by
778 * interactivity modifiers. Will be RT if the task got
779 * RT-boosted. If not then it returns p->normal_prio.
780 */
781static int effective_prio(task_t *p)
782{
783 p->normal_prio = normal_prio(p);
784 /*
785 * If we are RT tasks or we were boosted to RT priority,
786 * keep the priority unchanged. Otherwise, update priority
787 * to the normal priority:
788 */
789 if (!rt_prio(p->prio))
790 return p->normal_prio;
791 return p->prio;
792}
793
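As a concrete example, assuming the usual MAX_RT_PRIO = 100 and MAX_PRIO = 140: a SCHED_FIFO task with rt_priority 50 gets normal_prio = MAX_RT_PRIO - 1 - 50 = 49, firmly in the real-time range, while SCHED_NORMAL tasks fall through to __normal_prio() and land in the 100..139 range. effective_prio() then only deviates from normal_prio when the task is currently RT-boosted by a PI chain, in which case the boosted p->prio is preserved.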
794/*
668 * __activate_task - move a task to the runqueue. 795 * __activate_task - move a task to the runqueue.
669 */ 796 */
670static void __activate_task(task_t *p, runqueue_t *rq) 797static void __activate_task(task_t *p, runqueue_t *rq)
@@ -674,7 +801,7 @@ static void __activate_task(task_t *p, runqueue_t *rq)
674 if (batch_task(p)) 801 if (batch_task(p))
675 target = rq->expired; 802 target = rq->expired;
676 enqueue_task(p, target); 803 enqueue_task(p, target);
677 rq->nr_running++; 804 inc_nr_running(p, rq);
678} 805}
679 806
680/* 807/*
@@ -683,39 +810,45 @@ static void __activate_task(task_t *p, runqueue_t *rq)
683static inline void __activate_idle_task(task_t *p, runqueue_t *rq) 810static inline void __activate_idle_task(task_t *p, runqueue_t *rq)
684{ 811{
685 enqueue_task_head(p, rq->active); 812 enqueue_task_head(p, rq->active);
686 rq->nr_running++; 813 inc_nr_running(p, rq);
687} 814}
688 815
816/*
817 * Recalculate p->normal_prio and p->prio after having slept,
818 * updating the sleep-average too:
819 */
689static int recalc_task_prio(task_t *p, unsigned long long now) 820static int recalc_task_prio(task_t *p, unsigned long long now)
690{ 821{
691 /* Caller must always ensure 'now >= p->timestamp' */ 822 /* Caller must always ensure 'now >= p->timestamp' */
692 unsigned long long __sleep_time = now - p->timestamp; 823 unsigned long sleep_time = now - p->timestamp;
693 unsigned long sleep_time;
694 824
695 if (batch_task(p)) 825 if (batch_task(p))
696 sleep_time = 0; 826 sleep_time = 0;
697 else {
698 if (__sleep_time > NS_MAX_SLEEP_AVG)
699 sleep_time = NS_MAX_SLEEP_AVG;
700 else
701 sleep_time = (unsigned long)__sleep_time;
702 }
703 827
704 if (likely(sleep_time > 0)) { 828 if (likely(sleep_time > 0)) {
705 /* 829 /*
706 * User tasks that sleep a long time are categorised as 830 * This ceiling is set to the lowest priority that would allow
707 * idle. They will only have their sleep_avg increased to a 831 * a task to be reinserted into the active array on timeslice
708 * level that makes them just interactive priority to stay 832 * completion.
709 * active yet prevent them suddenly becoming cpu hogs and
710 * starving other processes.
711 */ 833 */
712 if (p->mm && sleep_time > INTERACTIVE_SLEEP(p)) { 834 unsigned long ceiling = INTERACTIVE_SLEEP(p);
713 unsigned long ceiling;
714 835
715 ceiling = JIFFIES_TO_NS(MAX_SLEEP_AVG - 836 if (p->mm && sleep_time > ceiling && p->sleep_avg < ceiling) {
716 DEF_TIMESLICE); 837 /*
717 if (p->sleep_avg < ceiling) 838 * Prevents user tasks from achieving best priority
718 p->sleep_avg = ceiling; 839 * with one single large enough sleep.
840 */
841 p->sleep_avg = ceiling;
842 /*
843 * Using INTERACTIVE_SLEEP() as a ceiling places a
844 * nice(0) task 1ms sleep away from promotion, and
845 * gives it 700ms to round-robin with no chance of
846 * being demoted. This is more than generous, so
847 * mark this sleep as non-interactive to prevent the
848 * on-runqueue bonus logic from intervening should
849 * this task not receive cpu immediately.
850 */
851 p->sleep_type = SLEEP_NONINTERACTIVE;
719 } else { 852 } else {
720 /* 853 /*
721 * Tasks waking from uninterruptible sleep are 854 * Tasks waking from uninterruptible sleep are
@@ -723,12 +856,12 @@ static int recalc_task_prio(task_t *p, unsigned long long now)
723 * are likely to be waiting on I/O 856 * are likely to be waiting on I/O
724 */ 857 */
725 if (p->sleep_type == SLEEP_NONINTERACTIVE && p->mm) { 858 if (p->sleep_type == SLEEP_NONINTERACTIVE && p->mm) {
726 if (p->sleep_avg >= INTERACTIVE_SLEEP(p)) 859 if (p->sleep_avg >= ceiling)
727 sleep_time = 0; 860 sleep_time = 0;
728 else if (p->sleep_avg + sleep_time >= 861 else if (p->sleep_avg + sleep_time >=
729 INTERACTIVE_SLEEP(p)) { 862 ceiling) {
730 p->sleep_avg = INTERACTIVE_SLEEP(p); 863 p->sleep_avg = ceiling;
731 sleep_time = 0; 864 sleep_time = 0;
732 } 865 }
733 } 866 }
734 867
@@ -742,9 +875,9 @@ static int recalc_task_prio(task_t *p, unsigned long long now)
742 */ 875 */
743 p->sleep_avg += sleep_time; 876 p->sleep_avg += sleep_time;
744 877
745 if (p->sleep_avg > NS_MAX_SLEEP_AVG)
746 p->sleep_avg = NS_MAX_SLEEP_AVG;
747 } 878 }
879 if (p->sleep_avg > NS_MAX_SLEEP_AVG)
880 p->sleep_avg = NS_MAX_SLEEP_AVG;
748 } 881 }
749 882
750 return effective_prio(p); 883 return effective_prio(p);
@@ -805,7 +938,7 @@ static void activate_task(task_t *p, runqueue_t *rq, int local)
805 */ 938 */
806static void deactivate_task(struct task_struct *p, runqueue_t *rq) 939static void deactivate_task(struct task_struct *p, runqueue_t *rq)
807{ 940{
808 rq->nr_running--; 941 dec_nr_running(p, rq);
809 dequeue_task(p, p->array); 942 dequeue_task(p, p->array);
810 p->array = NULL; 943 p->array = NULL;
811} 944}
@@ -818,6 +951,11 @@ static void deactivate_task(struct task_struct *p, runqueue_t *rq)
818 * the target CPU. 951 * the target CPU.
819 */ 952 */
820#ifdef CONFIG_SMP 953#ifdef CONFIG_SMP
954
955#ifndef tsk_is_polling
956#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
957#endif
958
821static void resched_task(task_t *p) 959static void resched_task(task_t *p)
822{ 960{
823 int cpu; 961 int cpu;
@@ -833,9 +971,9 @@ static void resched_task(task_t *p)
833 if (cpu == smp_processor_id()) 971 if (cpu == smp_processor_id())
834 return; 972 return;
835 973
836 /* NEED_RESCHED must be visible before we test POLLING_NRFLAG */ 974 /* NEED_RESCHED must be visible before we test polling */
837 smp_mb(); 975 smp_mb();
838 if (!test_tsk_thread_flag(p, TIF_POLLING_NRFLAG)) 976 if (!tsk_is_polling(p))
839 smp_send_reschedule(cpu); 977 smp_send_reschedule(cpu);
840} 978}
841#else 979#else
@@ -855,6 +993,12 @@ inline int task_curr(const task_t *p)
855 return cpu_curr(task_cpu(p)) == p; 993 return cpu_curr(task_cpu(p)) == p;
856} 994}
857 995
996/* Used instead of source_load when we know the type == 0 */
997unsigned long weighted_cpuload(const int cpu)
998{
999 return cpu_rq(cpu)->raw_weighted_load;
1000}
1001
858#ifdef CONFIG_SMP 1002#ifdef CONFIG_SMP
859typedef struct { 1003typedef struct {
860 struct list_head list; 1004 struct list_head list;
@@ -944,7 +1088,8 @@ void kick_process(task_t *p)
944} 1088}
945 1089
946/* 1090/*
947 * Return a low guess at the load of a migration-source cpu. 1091 * Return a low guess at the load of a migration-source cpu weighted
1092 * according to the scheduling class and "nice" value.
948 * 1093 *
949 * We want to under-estimate the load of migration sources, to 1094 * We want to under-estimate the load of migration sources, to
950 * balance conservatively. 1095 * balance conservatively.
@@ -952,24 +1097,36 @@ void kick_process(task_t *p)
952static inline unsigned long source_load(int cpu, int type) 1097static inline unsigned long source_load(int cpu, int type)
953{ 1098{
954 runqueue_t *rq = cpu_rq(cpu); 1099 runqueue_t *rq = cpu_rq(cpu);
955 unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE; 1100
956 if (type == 0) 1101 if (type == 0)
957 return load_now; 1102 return rq->raw_weighted_load;
958 1103
959 return min(rq->cpu_load[type-1], load_now); 1104 return min(rq->cpu_load[type-1], rq->raw_weighted_load);
960} 1105}
961 1106
962/* 1107/*
963 * Return a high guess at the load of a migration-target cpu 1108 * Return a high guess at the load of a migration-target cpu weighted
1109 * according to the scheduling class and "nice" value.
964 */ 1110 */
965static inline unsigned long target_load(int cpu, int type) 1111static inline unsigned long target_load(int cpu, int type)
966{ 1112{
967 runqueue_t *rq = cpu_rq(cpu); 1113 runqueue_t *rq = cpu_rq(cpu);
968 unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE; 1114
969 if (type == 0) 1115 if (type == 0)
970 return load_now; 1116 return rq->raw_weighted_load;
971 1117
972 return max(rq->cpu_load[type-1], load_now); 1118 return max(rq->cpu_load[type-1], rq->raw_weighted_load);
1119}
1120
1121/*
1122 * Return the average load per task on the cpu's run queue
1123 */
1124static inline unsigned long cpu_avg_load_per_task(int cpu)
1125{
1126 runqueue_t *rq = cpu_rq(cpu);
1127 unsigned long n = rq->nr_running;
1128
1129 return n ? rq->raw_weighted_load / n : SCHED_LOAD_SCALE;
973} 1130}
974 1131
975/* 1132/*
@@ -1042,7 +1199,7 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
1042 cpus_and(tmp, group->cpumask, p->cpus_allowed); 1199 cpus_and(tmp, group->cpumask, p->cpus_allowed);
1043 1200
1044 for_each_cpu_mask(i, tmp) { 1201 for_each_cpu_mask(i, tmp) {
1045 load = source_load(i, 0); 1202 load = weighted_cpuload(i);
1046 1203
1047 if (load < min_load || (load == min_load && i == this_cpu)) { 1204 if (load < min_load || (load == min_load && i == this_cpu)) {
1048 min_load = load; 1205 min_load = load;
@@ -1069,9 +1226,15 @@ static int sched_balance_self(int cpu, int flag)
1069 struct task_struct *t = current; 1226 struct task_struct *t = current;
1070 struct sched_domain *tmp, *sd = NULL; 1227 struct sched_domain *tmp, *sd = NULL;
1071 1228
1072 for_each_domain(cpu, tmp) 1229 for_each_domain(cpu, tmp) {
1230 /*
1231 * If power savings logic is enabled for a domain, stop there.
1232 */
1233 if (tmp->flags & SD_POWERSAVINGS_BALANCE)
1234 break;
1073 if (tmp->flags & flag) 1235 if (tmp->flags & flag)
1074 sd = tmp; 1236 sd = tmp;
1237 }
1075 1238
1076 while (sd) { 1239 while (sd) {
1077 cpumask_t span; 1240 cpumask_t span;
@@ -1221,17 +1384,19 @@ static int try_to_wake_up(task_t *p, unsigned int state, int sync)
1221 1384
1222 if (this_sd->flags & SD_WAKE_AFFINE) { 1385 if (this_sd->flags & SD_WAKE_AFFINE) {
1223 unsigned long tl = this_load; 1386 unsigned long tl = this_load;
1387 unsigned long tl_per_task = cpu_avg_load_per_task(this_cpu);
1388
1224 /* 1389 /*
1225 * If sync wakeup then subtract the (maximum possible) 1390 * If sync wakeup then subtract the (maximum possible)
1226 * effect of the currently running task from the load 1391 * effect of the currently running task from the load
1227 * of the current CPU: 1392 * of the current CPU:
1228 */ 1393 */
1229 if (sync) 1394 if (sync)
1230 tl -= SCHED_LOAD_SCALE; 1395 tl -= current->load_weight;
1231 1396
1232 if ((tl <= load && 1397 if ((tl <= load &&
1233 tl + target_load(cpu, idx) <= SCHED_LOAD_SCALE) || 1398 tl + target_load(cpu, idx) <= tl_per_task) ||
1234 100*(tl + SCHED_LOAD_SCALE) <= imbalance*load) { 1399 100*(tl + p->load_weight) <= imbalance*load) {
1235 /* 1400 /*
1236 * This domain has SD_WAKE_AFFINE and 1401 * This domain has SD_WAKE_AFFINE and
1237 * p is cache cold in this domain, and 1402 * p is cache cold in this domain, and
@@ -1348,6 +1513,12 @@ void fastcall sched_fork(task_t *p, int clone_flags)
1348 * event cannot wake it up and insert it on the runqueue either. 1513 * event cannot wake it up and insert it on the runqueue either.
1349 */ 1514 */
1350 p->state = TASK_RUNNING; 1515 p->state = TASK_RUNNING;
1516
1517 /*
1518 * Make sure we do not leak PI boosting priority to the child:
1519 */
1520 p->prio = current->normal_prio;
1521
1351 INIT_LIST_HEAD(&p->run_list); 1522 INIT_LIST_HEAD(&p->run_list);
1352 p->array = NULL; 1523 p->array = NULL;
1353#ifdef CONFIG_SCHEDSTATS 1524#ifdef CONFIG_SCHEDSTATS
@@ -1427,10 +1598,11 @@ void fastcall wake_up_new_task(task_t *p, unsigned long clone_flags)
1427 __activate_task(p, rq); 1598 __activate_task(p, rq);
1428 else { 1599 else {
1429 p->prio = current->prio; 1600 p->prio = current->prio;
1601 p->normal_prio = current->normal_prio;
1430 list_add_tail(&p->run_list, &current->run_list); 1602 list_add_tail(&p->run_list, &current->run_list);
1431 p->array = current->array; 1603 p->array = current->array;
1432 p->array->nr_active++; 1604 p->array->nr_active++;
1433 rq->nr_running++; 1605 inc_nr_running(p, rq);
1434 } 1606 }
1435 set_need_resched(); 1607 set_need_resched();
1436 } else 1608 } else
@@ -1648,7 +1820,8 @@ unsigned long nr_uninterruptible(void)
1648 1820
1649unsigned long long nr_context_switches(void) 1821unsigned long long nr_context_switches(void)
1650{ 1822{
1651 unsigned long long i, sum = 0; 1823 int i;
1824 unsigned long long sum = 0;
1652 1825
1653 for_each_possible_cpu(i) 1826 for_each_possible_cpu(i)
1654 sum += cpu_rq(i)->nr_switches; 1827 sum += cpu_rq(i)->nr_switches;
@@ -1686,9 +1859,6 @@ unsigned long nr_active(void)
1686/* 1859/*
1687 * double_rq_lock - safely lock two runqueues 1860 * double_rq_lock - safely lock two runqueues
1688 * 1861 *
1689 * We must take them in cpu order to match code in
1690 * dependent_sleeper and wake_dependent_sleeper.
1691 *
1692 * Note this does not disable interrupts like task_rq_lock, 1862 * Note this does not disable interrupts like task_rq_lock,
1693 * you need to do so manually before calling. 1863 * you need to do so manually before calling.
1694 */ 1864 */
@@ -1700,7 +1870,7 @@ static void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2)
1700 spin_lock(&rq1->lock); 1870 spin_lock(&rq1->lock);
1701 __acquire(rq2->lock); /* Fake it out ;) */ 1871 __acquire(rq2->lock); /* Fake it out ;) */
1702 } else { 1872 } else {
1703 if (rq1->cpu < rq2->cpu) { 1873 if (rq1 < rq2) {
1704 spin_lock(&rq1->lock); 1874 spin_lock(&rq1->lock);
1705 spin_lock(&rq2->lock); 1875 spin_lock(&rq2->lock);
1706 } else { 1876 } else {
@@ -1736,7 +1906,7 @@ static void double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest)
1736 __acquires(this_rq->lock) 1906 __acquires(this_rq->lock)
1737{ 1907{
1738 if (unlikely(!spin_trylock(&busiest->lock))) { 1908 if (unlikely(!spin_trylock(&busiest->lock))) {
1739 if (busiest->cpu < this_rq->cpu) { 1909 if (busiest < this_rq) {
1740 spin_unlock(&this_rq->lock); 1910 spin_unlock(&this_rq->lock);
1741 spin_lock(&busiest->lock); 1911 spin_lock(&busiest->lock);
1742 spin_lock(&this_rq->lock); 1912 spin_lock(&this_rq->lock);
@@ -1799,9 +1969,9 @@ void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p,
1799 runqueue_t *this_rq, prio_array_t *this_array, int this_cpu) 1969 runqueue_t *this_rq, prio_array_t *this_array, int this_cpu)
1800{ 1970{
1801 dequeue_task(p, src_array); 1971 dequeue_task(p, src_array);
1802 src_rq->nr_running--; 1972 dec_nr_running(p, src_rq);
1803 set_task_cpu(p, this_cpu); 1973 set_task_cpu(p, this_cpu);
1804 this_rq->nr_running++; 1974 inc_nr_running(p, this_rq);
1805 enqueue_task(p, this_array); 1975 enqueue_task(p, this_array);
1806 p->timestamp = (p->timestamp - src_rq->timestamp_last_tick) 1976 p->timestamp = (p->timestamp - src_rq->timestamp_last_tick)
1807 + this_rq->timestamp_last_tick; 1977 + this_rq->timestamp_last_tick;
@@ -1848,26 +2018,42 @@ int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu,
1848 return 1; 2018 return 1;
1849} 2019}
1850 2020
2021#define rq_best_prio(rq) min((rq)->curr->prio, (rq)->best_expired_prio)
1851/* 2022/*
1852 * move_tasks tries to move up to max_nr_move tasks from busiest to this_rq, 2023 * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted
1853 * as part of a balancing operation within "domain". Returns the number of 2024 * load from busiest to this_rq, as part of a balancing operation within
1854 * tasks moved. 2025 * "domain". Returns the number of tasks moved.
1855 * 2026 *
1856 * Called with both runqueues locked. 2027 * Called with both runqueues locked.
1857 */ 2028 */
1858static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest, 2029static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest,
1859 unsigned long max_nr_move, struct sched_domain *sd, 2030 unsigned long max_nr_move, unsigned long max_load_move,
1860 enum idle_type idle, int *all_pinned) 2031 struct sched_domain *sd, enum idle_type idle,
2032 int *all_pinned)
1861{ 2033{
1862 prio_array_t *array, *dst_array; 2034 prio_array_t *array, *dst_array;
1863 struct list_head *head, *curr; 2035 struct list_head *head, *curr;
1864 int idx, pulled = 0, pinned = 0; 2036 int idx, pulled = 0, pinned = 0, this_best_prio, busiest_best_prio;
2037 int busiest_best_prio_seen;
2038 int skip_for_load; /* skip the task based on weighted load issues */
2039 long rem_load_move;
1865 task_t *tmp; 2040 task_t *tmp;
1866 2041
1867 if (max_nr_move == 0) 2042 if (max_nr_move == 0 || max_load_move == 0)
1868 goto out; 2043 goto out;
1869 2044
2045 rem_load_move = max_load_move;
1870 pinned = 1; 2046 pinned = 1;
2047 this_best_prio = rq_best_prio(this_rq);
2048 busiest_best_prio = rq_best_prio(busiest);
2049 /*
2050 * Enable handling of the case where there is more than one task
2051 * with the best priority. If the current running task is one
2052 * of those with prio==busiest_best_prio we know it won't be moved
2053 * and therefore it's safe to override the skip (based on load) of
2054 * any task we find with that prio.
2055 */
2056 busiest_best_prio_seen = busiest_best_prio == busiest->curr->prio;
1871 2057
1872 /* 2058 /*
1873 * We first consider expired tasks. Those will likely not be 2059 * We first consider expired tasks. Those will likely not be
@@ -1907,7 +2093,17 @@ skip_queue:
1907 2093
1908 curr = curr->prev; 2094 curr = curr->prev;
1909 2095
1910 if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle, &pinned)) { 2096 /*
 2097 * To help distribute high priority tasks across CPUs we don't
2098 * skip a task if it will be the highest priority task (i.e. smallest
2099 * prio value) on its new queue regardless of its load weight
2100 */
2101 skip_for_load = tmp->load_weight > rem_load_move;
2102 if (skip_for_load && idx < this_best_prio)
2103 skip_for_load = !busiest_best_prio_seen && idx == busiest_best_prio;
2104 if (skip_for_load ||
2105 !can_migrate_task(tmp, busiest, this_cpu, sd, idle, &pinned)) {
2106 busiest_best_prio_seen |= idx == busiest_best_prio;
1911 if (curr != head) 2107 if (curr != head)
1912 goto skip_queue; 2108 goto skip_queue;
1913 idx++; 2109 idx++;
@@ -1921,9 +2117,15 @@ skip_queue:
1921 2117
1922 pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu); 2118 pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu);
1923 pulled++; 2119 pulled++;
2120 rem_load_move -= tmp->load_weight;
1924 2121
1925 /* We only want to steal up to the prescribed number of tasks. */ 2122 /*
1926 if (pulled < max_nr_move) { 2123 * We only want to steal up to the prescribed number of tasks
2124 * and the prescribed amount of weighted load.
2125 */
2126 if (pulled < max_nr_move && rem_load_move > 0) {
2127 if (idx < this_best_prio)
2128 this_best_prio = idx;
1927 if (curr != head) 2129 if (curr != head)
1928 goto skip_queue; 2130 goto skip_queue;
1929 idx++; 2131 idx++;
@@ -1944,7 +2146,7 @@ out:
1944 2146
1945/* 2147/*
1946 * find_busiest_group finds and returns the busiest CPU group within the 2148 * find_busiest_group finds and returns the busiest CPU group within the
1947 * domain. It calculates and returns the number of tasks which should be 2149 * domain. It calculates and returns the amount of weighted load which should be
1948 * moved to restore balance via the imbalance parameter. 2150 * moved to restore balance via the imbalance parameter.
1949 */ 2151 */
1950static struct sched_group * 2152static struct sched_group *
@@ -1954,9 +2156,19 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
1954 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; 2156 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
1955 unsigned long max_load, avg_load, total_load, this_load, total_pwr; 2157 unsigned long max_load, avg_load, total_load, this_load, total_pwr;
1956 unsigned long max_pull; 2158 unsigned long max_pull;
2159 unsigned long busiest_load_per_task, busiest_nr_running;
2160 unsigned long this_load_per_task, this_nr_running;
1957 int load_idx; 2161 int load_idx;
2162#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
2163 int power_savings_balance = 1;
2164 unsigned long leader_nr_running = 0, min_load_per_task = 0;
2165 unsigned long min_nr_running = ULONG_MAX;
2166 struct sched_group *group_min = NULL, *group_leader = NULL;
2167#endif
1958 2168
1959 max_load = this_load = total_load = total_pwr = 0; 2169 max_load = this_load = total_load = total_pwr = 0;
2170 busiest_load_per_task = busiest_nr_running = 0;
2171 this_load_per_task = this_nr_running = 0;
1960 if (idle == NOT_IDLE) 2172 if (idle == NOT_IDLE)
1961 load_idx = sd->busy_idx; 2173 load_idx = sd->busy_idx;
1962 else if (idle == NEWLY_IDLE) 2174 else if (idle == NEWLY_IDLE)
@@ -1965,16 +2177,19 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
1965 load_idx = sd->idle_idx; 2177 load_idx = sd->idle_idx;
1966 2178
1967 do { 2179 do {
1968 unsigned long load; 2180 unsigned long load, group_capacity;
1969 int local_group; 2181 int local_group;
1970 int i; 2182 int i;
2183 unsigned long sum_nr_running, sum_weighted_load;
1971 2184
1972 local_group = cpu_isset(this_cpu, group->cpumask); 2185 local_group = cpu_isset(this_cpu, group->cpumask);
1973 2186
1974 /* Tally up the load of all CPUs in the group */ 2187 /* Tally up the load of all CPUs in the group */
1975 avg_load = 0; 2188 sum_weighted_load = sum_nr_running = avg_load = 0;
1976 2189
1977 for_each_cpu_mask(i, group->cpumask) { 2190 for_each_cpu_mask(i, group->cpumask) {
2191 runqueue_t *rq = cpu_rq(i);
2192
1978 if (*sd_idle && !idle_cpu(i)) 2193 if (*sd_idle && !idle_cpu(i))
1979 *sd_idle = 0; 2194 *sd_idle = 0;
1980 2195
@@ -1985,6 +2200,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
1985 load = source_load(i, load_idx); 2200 load = source_load(i, load_idx);
1986 2201
1987 avg_load += load; 2202 avg_load += load;
2203 sum_nr_running += rq->nr_running;
2204 sum_weighted_load += rq->raw_weighted_load;
1988 } 2205 }
1989 2206
1990 total_load += avg_load; 2207 total_load += avg_load;
@@ -1993,17 +2210,80 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
1993 /* Adjust by relative CPU power of the group */ 2210 /* Adjust by relative CPU power of the group */
1994 avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power; 2211 avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
1995 2212
2213 group_capacity = group->cpu_power / SCHED_LOAD_SCALE;
2214
1996 if (local_group) { 2215 if (local_group) {
1997 this_load = avg_load; 2216 this_load = avg_load;
1998 this = group; 2217 this = group;
1999 } else if (avg_load > max_load) { 2218 this_nr_running = sum_nr_running;
2219 this_load_per_task = sum_weighted_load;
2220 } else if (avg_load > max_load &&
2221 sum_nr_running > group_capacity) {
2000 max_load = avg_load; 2222 max_load = avg_load;
2001 busiest = group; 2223 busiest = group;
2224 busiest_nr_running = sum_nr_running;
2225 busiest_load_per_task = sum_weighted_load;
2002 } 2226 }
2227
2228#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
2229 /*
2230 * Busy processors will not participate in power savings
2231 * balance.
2232 */
2233 if (idle == NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
2234 goto group_next;
2235
2236 /*
2237 * If the local group is idle or completely loaded
2238 * no need to do power savings balance at this domain
2239 */
2240 if (local_group && (this_nr_running >= group_capacity ||
2241 !this_nr_running))
2242 power_savings_balance = 0;
2243
2244 /*
2245 * If a group is already running at full capacity or idle,
2246 * don't include that group in power savings calculations
2247 */
2248 if (!power_savings_balance || sum_nr_running >= group_capacity
2249 || !sum_nr_running)
2250 goto group_next;
2251
2252 /*
2253 * Calculate the group which has the least non-idle load.
2254 * This is the group from where we need to pick up the load
2255 * for saving power
2256 */
2257 if ((sum_nr_running < min_nr_running) ||
2258 (sum_nr_running == min_nr_running &&
2259 first_cpu(group->cpumask) <
2260 first_cpu(group_min->cpumask))) {
2261 group_min = group;
2262 min_nr_running = sum_nr_running;
2263 min_load_per_task = sum_weighted_load /
2264 sum_nr_running;
2265 }
2266
2267 /*
2268 * Calculate the group which is almost near its
2269 * capacity but still has some space to pick up some load
2270 * from other group and save more power
2271 */
2272 if (sum_nr_running <= group_capacity - 1)
2273 if (sum_nr_running > leader_nr_running ||
2274 (sum_nr_running == leader_nr_running &&
2275 first_cpu(group->cpumask) >
2276 first_cpu(group_leader->cpumask))) {
2277 group_leader = group;
2278 leader_nr_running = sum_nr_running;
2279 }
2280
2281group_next:
2282#endif
2003 group = group->next; 2283 group = group->next;
2004 } while (group != sd->groups); 2284 } while (group != sd->groups);
2005 2285
2006 if (!busiest || this_load >= max_load || max_load <= SCHED_LOAD_SCALE) 2286 if (!busiest || this_load >= max_load || busiest_nr_running == 0)
2007 goto out_balanced; 2287 goto out_balanced;
2008 2288
2009 avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr; 2289 avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
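
The rewritten find_busiest_group() no longer compares scaled loads alone: for every group it also tallies how many tasks are runnable and their combined nice-weighted load, and a non-local group only becomes a candidate for "busiest" when it holds more tasks than its capacity (group->cpu_power / SCHED_LOAD_SCALE). A minimal userspace sketch of that bookkeeping follows; the struct names, weight values and SCHED_LOAD_SCALE constant are illustrative stand-ins, not the kernel's definitions.

    #include <stdio.h>

    #define SCHED_LOAD_SCALE 128UL              /* illustrative scale only */

    struct cpu_stat   { unsigned long load, nr_running, weighted_load; };
    struct group_stat {
        unsigned long avg_load, sum_nr_running, sum_weighted_load, capacity;
    };

    /* Tally one group the way the patched find_busiest_group() does. */
    static struct group_stat tally_group(const struct cpu_stat *cpus, int n,
                                         unsigned long cpu_power)
    {
        struct group_stat g = { 0, 0, 0, 0 };
        int i;

        for (i = 0; i < n; i++) {
            g.avg_load          += cpus[i].load;
            g.sum_nr_running    += cpus[i].nr_running;
            g.sum_weighted_load += cpus[i].weighted_load;
        }
        g.avg_load = g.avg_load * SCHED_LOAD_SCALE / cpu_power; /* relative power */
        g.capacity = cpu_power / SCHED_LOAD_SCALE;
        return g;
    }

    int main(void)
    {
        struct cpu_stat cpus[2] = { { 300, 3, 300 }, { 100, 1, 100 } };
        struct group_stat g = tally_group(cpus, 2, 2 * SCHED_LOAD_SCALE);

        /* only an over-capacity group can be picked as "busiest" */
        printf("avg_load=%lu tasks=%lu capacity=%lu over=%d\n",
               g.avg_load, g.sum_nr_running, g.capacity,
               g.sum_nr_running > g.capacity);
        return 0;
    }

The per-task averages derived from these sums (sum_weighted_load / sum_nr_running) are what the later imbalance calculation works in.
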
@@ -2012,6 +2292,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2012 100*max_load <= sd->imbalance_pct*this_load) 2292 100*max_load <= sd->imbalance_pct*this_load)
2013 goto out_balanced; 2293 goto out_balanced;
2014 2294
2295 busiest_load_per_task /= busiest_nr_running;
2015 /* 2296 /*
2016 * We're trying to get all the cpus to the average_load, so we don't 2297 * We're trying to get all the cpus to the average_load, so we don't
2017 * want to push ourselves above the average load, nor do we wish to 2298 * want to push ourselves above the average load, nor do we wish to
@@ -2023,21 +2304,50 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2023 * by pulling tasks to us. Be careful of negative numbers as they'll 2304 * by pulling tasks to us. Be careful of negative numbers as they'll
2024 * appear as very large values with unsigned longs. 2305 * appear as very large values with unsigned longs.
2025 */ 2306 */
2307 if (max_load <= busiest_load_per_task)
2308 goto out_balanced;
2309
2310 /*
2311 * In the presence of smp nice balancing, certain scenarios can have
2312 * max load less than avg load(as we skip the groups at or below
2313 * its cpu_power, while calculating max_load..)
2314 */
2315 if (max_load < avg_load) {
2316 *imbalance = 0;
2317 goto small_imbalance;
2318 }
2026 2319
2027 /* Don't want to pull so many tasks that a group would go idle */ 2320 /* Don't want to pull so many tasks that a group would go idle */
2028 max_pull = min(max_load - avg_load, max_load - SCHED_LOAD_SCALE); 2321 max_pull = min(max_load - avg_load, max_load - busiest_load_per_task);
2029 2322
2030 /* How much load to actually move to equalise the imbalance */ 2323 /* How much load to actually move to equalise the imbalance */
2031 *imbalance = min(max_pull * busiest->cpu_power, 2324 *imbalance = min(max_pull * busiest->cpu_power,
2032 (avg_load - this_load) * this->cpu_power) 2325 (avg_load - this_load) * this->cpu_power)
2033 / SCHED_LOAD_SCALE; 2326 / SCHED_LOAD_SCALE;
2034 2327
2035 if (*imbalance < SCHED_LOAD_SCALE) { 2328 /*
2036 unsigned long pwr_now = 0, pwr_move = 0; 2329 * if *imbalance is less than the average load per runnable task
 2330 * there is no guarantee that any tasks will be moved so we'll have
2331 * a think about bumping its value to force at least one task to be
2332 * moved
2333 */
2334 if (*imbalance < busiest_load_per_task) {
2335 unsigned long pwr_now, pwr_move;
2037 unsigned long tmp; 2336 unsigned long tmp;
2337 unsigned int imbn;
2338
2339small_imbalance:
2340 pwr_move = pwr_now = 0;
2341 imbn = 2;
2342 if (this_nr_running) {
2343 this_load_per_task /= this_nr_running;
2344 if (busiest_load_per_task > this_load_per_task)
2345 imbn = 1;
2346 } else
2347 this_load_per_task = SCHED_LOAD_SCALE;
2038 2348
2039 if (max_load - this_load >= SCHED_LOAD_SCALE*2) { 2349 if (max_load - this_load >= busiest_load_per_task * imbn) {
2040 *imbalance = 1; 2350 *imbalance = busiest_load_per_task;
2041 return busiest; 2351 return busiest;
2042 } 2352 }
2043 2353
@@ -2047,39 +2357,47 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2047 * moving them. 2357 * moving them.
2048 */ 2358 */
2049 2359
2050 pwr_now += busiest->cpu_power*min(SCHED_LOAD_SCALE, max_load); 2360 pwr_now += busiest->cpu_power *
2051 pwr_now += this->cpu_power*min(SCHED_LOAD_SCALE, this_load); 2361 min(busiest_load_per_task, max_load);
2362 pwr_now += this->cpu_power *
2363 min(this_load_per_task, this_load);
2052 pwr_now /= SCHED_LOAD_SCALE; 2364 pwr_now /= SCHED_LOAD_SCALE;
2053 2365
2054 /* Amount of load we'd subtract */ 2366 /* Amount of load we'd subtract */
2055 tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/busiest->cpu_power; 2367 tmp = busiest_load_per_task*SCHED_LOAD_SCALE/busiest->cpu_power;
2056 if (max_load > tmp) 2368 if (max_load > tmp)
2057 pwr_move += busiest->cpu_power*min(SCHED_LOAD_SCALE, 2369 pwr_move += busiest->cpu_power *
2058 max_load - tmp); 2370 min(busiest_load_per_task, max_load - tmp);
2059 2371
2060 /* Amount of load we'd add */ 2372 /* Amount of load we'd add */
2061 if (max_load*busiest->cpu_power < 2373 if (max_load*busiest->cpu_power <
2062 SCHED_LOAD_SCALE*SCHED_LOAD_SCALE) 2374 busiest_load_per_task*SCHED_LOAD_SCALE)
2063 tmp = max_load*busiest->cpu_power/this->cpu_power; 2375 tmp = max_load*busiest->cpu_power/this->cpu_power;
2064 else 2376 else
2065 tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/this->cpu_power; 2377 tmp = busiest_load_per_task*SCHED_LOAD_SCALE/this->cpu_power;
2066 pwr_move += this->cpu_power*min(SCHED_LOAD_SCALE, this_load + tmp); 2378 pwr_move += this->cpu_power*min(this_load_per_task, this_load + tmp);
2067 pwr_move /= SCHED_LOAD_SCALE; 2379 pwr_move /= SCHED_LOAD_SCALE;
2068 2380
2069 /* Move if we gain throughput */ 2381 /* Move if we gain throughput */
2070 if (pwr_move <= pwr_now) 2382 if (pwr_move <= pwr_now)
2071 goto out_balanced; 2383 goto out_balanced;
2072 2384
2073 *imbalance = 1; 2385 *imbalance = busiest_load_per_task;
2074 return busiest;
2075 } 2386 }
2076 2387
2077 /* Get rid of the scaling factor, rounding down as we divide */
2078 *imbalance = *imbalance / SCHED_LOAD_SCALE;
2079 return busiest; 2388 return busiest;
2080 2389
2081out_balanced: 2390out_balanced:
2391#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
2392 if (idle == NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
2393 goto ret;
2082 2394
2395 if (this == group_leader && group_leader != group_min) {
2396 *imbalance = min_load_per_task;
2397 return group_min;
2398 }
2399ret:
2400#endif
2083 *imbalance = 0; 2401 *imbalance = 0;
2084 return NULL; 2402 return NULL;
2085} 2403}
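
With the group statistics above, the imbalance is sized in units of weighted load rather than whole SCHED_LOAD_SCALE steps: the pull is capped so the busiest group is not dragged below its own per-task average, and an imbalance smaller than one average task is bumped (after the pwr_now/pwr_move throughput estimate, elided here) so that at least one task can move. A simplified sketch of that sizing, assuming this_load < avg_load <= max_load; all values are arbitrary examples, not kernel constants.

    #include <stdio.h>

    #define SCHED_LOAD_SCALE 128UL              /* illustrative scale only */

    static unsigned long min_ul(unsigned long a, unsigned long b)
    {
        return a < b ? a : b;
    }

    /*
     * Pull at most enough load to reach the domain average, never so much
     * that the busiest group drops below one average task, and never less
     * than one task's worth (assumes this_load < avg_load <= max_load).
     */
    static unsigned long size_imbalance(unsigned long max_load,
                                        unsigned long avg_load,
                                        unsigned long this_load,
                                        unsigned long busiest_load_per_task,
                                        unsigned long busiest_power,
                                        unsigned long this_power)
    {
        unsigned long max_pull, imbalance;

        if (max_load <= busiest_load_per_task)
            return 0;                           /* nothing worth moving */

        max_pull  = min_ul(max_load - avg_load,
                           max_load - busiest_load_per_task);
        imbalance = min_ul(max_pull * busiest_power,
                           (avg_load - this_load) * this_power)
                        / SCHED_LOAD_SCALE;

        if (imbalance < busiest_load_per_task)
            imbalance = busiest_load_per_task;  /* force one task across */
        return imbalance;
    }

    int main(void)
    {
        /* arbitrary example loads, already scaled by group cpu_power */
        printf("imbalance = %lu\n",
               size_imbalance(400, 250, 100, 130,
                              SCHED_LOAD_SCALE, SCHED_LOAD_SCALE));
        return 0;
    }
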
@@ -2088,18 +2406,21 @@ out_balanced:
2088 * find_busiest_queue - find the busiest runqueue among the cpus in group. 2406 * find_busiest_queue - find the busiest runqueue among the cpus in group.
2089 */ 2407 */
2090static runqueue_t *find_busiest_queue(struct sched_group *group, 2408static runqueue_t *find_busiest_queue(struct sched_group *group,
2091 enum idle_type idle) 2409 enum idle_type idle, unsigned long imbalance)
2092{ 2410{
2093 unsigned long load, max_load = 0; 2411 unsigned long max_load = 0;
2094 runqueue_t *busiest = NULL; 2412 runqueue_t *busiest = NULL, *rqi;
2095 int i; 2413 int i;
2096 2414
2097 for_each_cpu_mask(i, group->cpumask) { 2415 for_each_cpu_mask(i, group->cpumask) {
2098 load = source_load(i, 0); 2416 rqi = cpu_rq(i);
2099 2417
2100 if (load > max_load) { 2418 if (rqi->nr_running == 1 && rqi->raw_weighted_load > imbalance)
2101 max_load = load; 2419 continue;
2102 busiest = cpu_rq(i); 2420
2421 if (rqi->raw_weighted_load > max_load) {
2422 max_load = rqi->raw_weighted_load;
2423 busiest = rqi;
2103 } 2424 }
2104 } 2425 }
2105 2426
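
find_busiest_queue() now ranks runqueues by raw_weighted_load and skips any queue whose single task is heavier than the requested imbalance, since pulling that lone task cannot reduce the imbalance. A standalone sketch of the selection loop, with a plain struct standing in for runqueue_t and made-up loads:

    #include <stddef.h>
    #include <stdio.h>

    struct rq_sketch { unsigned long nr_running, raw_weighted_load; };

    static struct rq_sketch *pick_busiest(struct rq_sketch *rqs, int n,
                                          unsigned long imbalance)
    {
        unsigned long max_load = 0;
        struct rq_sketch *busiest = NULL;
        int i;

        for (i = 0; i < n; i++) {
            /* a single task bigger than the imbalance cannot help us */
            if (rqs[i].nr_running == 1 &&
                rqs[i].raw_weighted_load > imbalance)
                continue;
            if (rqs[i].raw_weighted_load > max_load) {
                max_load = rqs[i].raw_weighted_load;
                busiest = &rqs[i];
            }
        }
        return busiest;
    }

    int main(void)
    {
        struct rq_sketch rqs[3] = { { 1, 500 }, { 3, 300 }, { 2, 200 } };
        struct rq_sketch *b = pick_busiest(rqs, 3, 100);

        printf("busiest has %lu tasks\n", b ? b->nr_running : 0);
        return 0;
    }
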
@@ -2112,6 +2433,7 @@ static runqueue_t *find_busiest_queue(struct sched_group *group,
2112 */ 2433 */
2113#define MAX_PINNED_INTERVAL 512 2434#define MAX_PINNED_INTERVAL 512
2114 2435
2436#define minus_1_or_zero(n) ((n) > 0 ? (n) - 1 : 0)
2115/* 2437/*
2116 * Check this_cpu to ensure it is balanced within domain. Attempt to move 2438 * Check this_cpu to ensure it is balanced within domain. Attempt to move
2117 * tasks if there is an imbalance. 2439 * tasks if there is an imbalance.
@@ -2128,7 +2450,8 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
2128 int active_balance = 0; 2450 int active_balance = 0;
2129 int sd_idle = 0; 2451 int sd_idle = 0;
2130 2452
2131 if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER) 2453 if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
2454 !sched_smt_power_savings)
2132 sd_idle = 1; 2455 sd_idle = 1;
2133 2456
2134 schedstat_inc(sd, lb_cnt[idle]); 2457 schedstat_inc(sd, lb_cnt[idle]);
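
Because the imbalance is a weighted load, move_tasks() is also given an explicit cap on the number of tasks to pull; minus_1_or_zero(busiest->nr_running) means the busiest queue is asked for at most all but one of its tasks, so balancing can never leave it empty. A two-line check of the macro's edge cases:

    #include <assert.h>

    #define minus_1_or_zero(n) ((n) > 0 ? (n) - 1 : 0)

    int main(void)
    {
        assert(minus_1_or_zero(3) == 2);   /* always leave one task behind */
        assert(minus_1_or_zero(0) == 0);   /* never wraps below zero */
        return 0;
    }
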
@@ -2139,7 +2462,7 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
2139 goto out_balanced; 2462 goto out_balanced;
2140 } 2463 }
2141 2464
2142 busiest = find_busiest_queue(group, idle); 2465 busiest = find_busiest_queue(group, idle, imbalance);
2143 if (!busiest) { 2466 if (!busiest) {
2144 schedstat_inc(sd, lb_nobusyq[idle]); 2467 schedstat_inc(sd, lb_nobusyq[idle]);
2145 goto out_balanced; 2468 goto out_balanced;
@@ -2159,6 +2482,7 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
2159 */ 2482 */
2160 double_rq_lock(this_rq, busiest); 2483 double_rq_lock(this_rq, busiest);
2161 nr_moved = move_tasks(this_rq, this_cpu, busiest, 2484 nr_moved = move_tasks(this_rq, this_cpu, busiest,
2485 minus_1_or_zero(busiest->nr_running),
2162 imbalance, sd, idle, &all_pinned); 2486 imbalance, sd, idle, &all_pinned);
2163 double_rq_unlock(this_rq, busiest); 2487 double_rq_unlock(this_rq, busiest);
2164 2488
@@ -2216,7 +2540,8 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
2216 sd->balance_interval *= 2; 2540 sd->balance_interval *= 2;
2217 } 2541 }
2218 2542
2219 if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER) 2543 if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
2544 !sched_smt_power_savings)
2220 return -1; 2545 return -1;
2221 return nr_moved; 2546 return nr_moved;
2222 2547
@@ -2231,7 +2556,7 @@ out_one_pinned:
2231 (sd->balance_interval < sd->max_interval)) 2556 (sd->balance_interval < sd->max_interval))
2232 sd->balance_interval *= 2; 2557 sd->balance_interval *= 2;
2233 2558
2234 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER) 2559 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && !sched_smt_power_savings)
2235 return -1; 2560 return -1;
2236 return 0; 2561 return 0;
2237} 2562}
@@ -2252,7 +2577,7 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
2252 int nr_moved = 0; 2577 int nr_moved = 0;
2253 int sd_idle = 0; 2578 int sd_idle = 0;
2254 2579
2255 if (sd->flags & SD_SHARE_CPUPOWER) 2580 if (sd->flags & SD_SHARE_CPUPOWER && !sched_smt_power_savings)
2256 sd_idle = 1; 2581 sd_idle = 1;
2257 2582
2258 schedstat_inc(sd, lb_cnt[NEWLY_IDLE]); 2583 schedstat_inc(sd, lb_cnt[NEWLY_IDLE]);
@@ -2262,7 +2587,7 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
2262 goto out_balanced; 2587 goto out_balanced;
2263 } 2588 }
2264 2589
2265 busiest = find_busiest_queue(group, NEWLY_IDLE); 2590 busiest = find_busiest_queue(group, NEWLY_IDLE, imbalance);
2266 if (!busiest) { 2591 if (!busiest) {
2267 schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]); 2592 schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]);
2268 goto out_balanced; 2593 goto out_balanced;
@@ -2277,6 +2602,7 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
2277 /* Attempt to move tasks */ 2602 /* Attempt to move tasks */
2278 double_lock_balance(this_rq, busiest); 2603 double_lock_balance(this_rq, busiest);
2279 nr_moved = move_tasks(this_rq, this_cpu, busiest, 2604 nr_moved = move_tasks(this_rq, this_cpu, busiest,
2605 minus_1_or_zero(busiest->nr_running),
2280 imbalance, sd, NEWLY_IDLE, NULL); 2606 imbalance, sd, NEWLY_IDLE, NULL);
2281 spin_unlock(&busiest->lock); 2607 spin_unlock(&busiest->lock);
2282 } 2608 }
@@ -2292,7 +2618,7 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
2292 2618
2293out_balanced: 2619out_balanced:
2294 schedstat_inc(sd, lb_balanced[NEWLY_IDLE]); 2620 schedstat_inc(sd, lb_balanced[NEWLY_IDLE]);
2295 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER) 2621 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && !sched_smt_power_savings)
2296 return -1; 2622 return -1;
2297 sd->nr_balance_failed = 0; 2623 sd->nr_balance_failed = 0;
2298 return 0; 2624 return 0;
@@ -2347,17 +2673,19 @@ static void active_load_balance(runqueue_t *busiest_rq, int busiest_cpu)
2347 double_lock_balance(busiest_rq, target_rq); 2673 double_lock_balance(busiest_rq, target_rq);
2348 2674
2349 /* Search for an sd spanning us and the target CPU. */ 2675 /* Search for an sd spanning us and the target CPU. */
2350 for_each_domain(target_cpu, sd) 2676 for_each_domain(target_cpu, sd) {
2351 if ((sd->flags & SD_LOAD_BALANCE) && 2677 if ((sd->flags & SD_LOAD_BALANCE) &&
2352 cpu_isset(busiest_cpu, sd->span)) 2678 cpu_isset(busiest_cpu, sd->span))
2353 break; 2679 break;
2680 }
2354 2681
2355 if (unlikely(sd == NULL)) 2682 if (unlikely(sd == NULL))
2356 goto out; 2683 goto out;
2357 2684
2358 schedstat_inc(sd, alb_cnt); 2685 schedstat_inc(sd, alb_cnt);
2359 2686
2360 if (move_tasks(target_rq, target_cpu, busiest_rq, 1, sd, SCHED_IDLE, NULL)) 2687 if (move_tasks(target_rq, target_cpu, busiest_rq, 1,
2688 RTPRIO_TO_LOAD_WEIGHT(100), sd, SCHED_IDLE, NULL))
2361 schedstat_inc(sd, alb_pushed); 2689 schedstat_inc(sd, alb_pushed);
2362 else 2690 else
2363 schedstat_inc(sd, alb_failed); 2691 schedstat_inc(sd, alb_failed);
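
move_tasks() therefore takes two limits, a task count and a weighted load, and stops at whichever is reached first; the active_load_balance() call above passes a count of one together with a deliberately huge load cap (RTPRIO_TO_LOAD_WEIGHT(100) in the patch) so that only the count limit bites. A userspace sketch of the dual-limit loop, with an array of made-up per-task weights in place of the priority arrays:

    #include <stdio.h>

    /*
     * Pull tasks until either limit is reached.  task_loads[] stands in for
     * the per-task weights the kernel reads from each task.
     */
    static unsigned long move_tasks_sketch(const unsigned long *task_loads,
                                           int nr_tasks,
                                           unsigned long max_nr_move,
                                           unsigned long max_load_move)
    {
        unsigned long pulled = 0, load = 0;
        int i;

        for (i = 0; i < nr_tasks; i++) {
            if (pulled >= max_nr_move || load >= max_load_move)
                break;
            load += task_loads[i];
            pulled++;
        }
        printf("moved %lu tasks carrying %lu load\n", pulled, load);
        return pulled;
    }

    int main(void)
    {
        unsigned long loads[4] = { 100, 100, 100, 100 };

        move_tasks_sketch(loads, 4, 10, 250);  /* the load cap stops it */
        move_tasks_sketch(loads, 4, 1, ~0UL);  /* active balance: one task */
        return 0;
    }
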
@@ -2385,7 +2713,7 @@ static void rebalance_tick(int this_cpu, runqueue_t *this_rq,
2385 struct sched_domain *sd; 2713 struct sched_domain *sd;
2386 int i; 2714 int i;
2387 2715
2388 this_load = this_rq->nr_running * SCHED_LOAD_SCALE; 2716 this_load = this_rq->raw_weighted_load;
2389 /* Update our load */ 2717 /* Update our load */
2390 for (i = 0; i < 3; i++) { 2718 for (i = 0; i < 3; i++) {
2391 unsigned long new_load = this_load; 2719 unsigned long new_load = this_load;
@@ -2686,48 +3014,35 @@ static inline void wakeup_busy_runqueue(runqueue_t *rq)
2686 resched_task(rq->idle); 3014 resched_task(rq->idle);
2687} 3015}
2688 3016
2689static void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq) 3017/*
3018 * Called with interrupt disabled and this_rq's runqueue locked.
3019 */
3020static void wake_sleeping_dependent(int this_cpu)
2690{ 3021{
2691 struct sched_domain *tmp, *sd = NULL; 3022 struct sched_domain *tmp, *sd = NULL;
2692 cpumask_t sibling_map;
2693 int i; 3023 int i;
2694 3024
2695 for_each_domain(this_cpu, tmp) 3025 for_each_domain(this_cpu, tmp) {
2696 if (tmp->flags & SD_SHARE_CPUPOWER) 3026 if (tmp->flags & SD_SHARE_CPUPOWER) {
2697 sd = tmp; 3027 sd = tmp;
3028 break;
3029 }
3030 }
2698 3031
2699 if (!sd) 3032 if (!sd)
2700 return; 3033 return;
2701 3034
2702 /* 3035 for_each_cpu_mask(i, sd->span) {
2703 * Unlock the current runqueue because we have to lock in
2704 * CPU order to avoid deadlocks. Caller knows that we might
2705 * unlock. We keep IRQs disabled.
2706 */
2707 spin_unlock(&this_rq->lock);
2708
2709 sibling_map = sd->span;
2710
2711 for_each_cpu_mask(i, sibling_map)
2712 spin_lock(&cpu_rq(i)->lock);
2713 /*
2714 * We clear this CPU from the mask. This both simplifies the
2715 * inner loop and keps this_rq locked when we exit:
2716 */
2717 cpu_clear(this_cpu, sibling_map);
2718
2719 for_each_cpu_mask(i, sibling_map) {
2720 runqueue_t *smt_rq = cpu_rq(i); 3036 runqueue_t *smt_rq = cpu_rq(i);
2721 3037
3038 if (i == this_cpu)
3039 continue;
3040 if (unlikely(!spin_trylock(&smt_rq->lock)))
3041 continue;
3042
2722 wakeup_busy_runqueue(smt_rq); 3043 wakeup_busy_runqueue(smt_rq);
3044 spin_unlock(&smt_rq->lock);
2723 } 3045 }
2724
2725 for_each_cpu_mask(i, sibling_map)
2726 spin_unlock(&cpu_rq(i)->lock);
2727 /*
2728 * We exit with this_cpu's rq still held and IRQs
2729 * still disabled:
2730 */
2731} 3046}
2732 3047
2733/* 3048/*
@@ -2740,52 +3055,46 @@ static inline unsigned long smt_slice(task_t *p, struct sched_domain *sd)
2740 return p->time_slice * (100 - sd->per_cpu_gain) / 100; 3055 return p->time_slice * (100 - sd->per_cpu_gain) / 100;
2741} 3056}
2742 3057
2743static int dependent_sleeper(int this_cpu, runqueue_t *this_rq) 3058/*
3059 * To minimise lock contention and not have to drop this_rq's runlock we only
3060 * trylock the sibling runqueues and bypass those runqueues if we fail to
3061 * acquire their lock. As we only trylock the normal locking order does not
3062 * need to be obeyed.
3063 */
3064static int dependent_sleeper(int this_cpu, runqueue_t *this_rq, task_t *p)
2744{ 3065{
2745 struct sched_domain *tmp, *sd = NULL; 3066 struct sched_domain *tmp, *sd = NULL;
2746 cpumask_t sibling_map;
2747 prio_array_t *array;
2748 int ret = 0, i; 3067 int ret = 0, i;
2749 task_t *p;
2750 3068
2751 for_each_domain(this_cpu, tmp) 3069 /* kernel/rt threads do not participate in dependent sleeping */
2752 if (tmp->flags & SD_SHARE_CPUPOWER) 3070 if (!p->mm || rt_task(p))
3071 return 0;
3072
3073 for_each_domain(this_cpu, tmp) {
3074 if (tmp->flags & SD_SHARE_CPUPOWER) {
2753 sd = tmp; 3075 sd = tmp;
3076 break;
3077 }
3078 }
2754 3079
2755 if (!sd) 3080 if (!sd)
2756 return 0; 3081 return 0;
2757 3082
2758 /* 3083 for_each_cpu_mask(i, sd->span) {
2759 * The same locking rules and details apply as for 3084 runqueue_t *smt_rq;
2760 * wake_sleeping_dependent(): 3085 task_t *smt_curr;
2761 */
2762 spin_unlock(&this_rq->lock);
2763 sibling_map = sd->span;
2764 for_each_cpu_mask(i, sibling_map)
2765 spin_lock(&cpu_rq(i)->lock);
2766 cpu_clear(this_cpu, sibling_map);
2767 3086
2768 /* 3087 if (i == this_cpu)
2769 * Establish next task to be run - it might have gone away because 3088 continue;
2770 * we released the runqueue lock above:
2771 */
2772 if (!this_rq->nr_running)
2773 goto out_unlock;
2774 array = this_rq->active;
2775 if (!array->nr_active)
2776 array = this_rq->expired;
2777 BUG_ON(!array->nr_active);
2778 3089
2779 p = list_entry(array->queue[sched_find_first_bit(array->bitmap)].next, 3090 smt_rq = cpu_rq(i);
2780 task_t, run_list); 3091 if (unlikely(!spin_trylock(&smt_rq->lock)))
3092 continue;
2781 3093
2782 for_each_cpu_mask(i, sibling_map) { 3094 smt_curr = smt_rq->curr;
2783 runqueue_t *smt_rq = cpu_rq(i);
2784 task_t *smt_curr = smt_rq->curr;
2785 3095
2786 /* Kernel threads do not participate in dependent sleeping */ 3096 if (!smt_curr->mm)
2787 if (!p->mm || !smt_curr->mm || rt_task(p)) 3097 goto unlock;
2788 goto check_smt_task;
2789 3098
2790 /* 3099 /*
2791 * If a user task with lower static priority than the 3100 * If a user task with lower static priority than the
@@ -2803,49 +3112,24 @@ static int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
2803 if ((jiffies % DEF_TIMESLICE) > 3112 if ((jiffies % DEF_TIMESLICE) >
2804 (sd->per_cpu_gain * DEF_TIMESLICE / 100)) 3113 (sd->per_cpu_gain * DEF_TIMESLICE / 100))
2805 ret = 1; 3114 ret = 1;
2806 } else 3115 } else {
2807 if (smt_curr->static_prio < p->static_prio && 3116 if (smt_curr->static_prio < p->static_prio &&
2808 !TASK_PREEMPTS_CURR(p, smt_rq) && 3117 !TASK_PREEMPTS_CURR(p, smt_rq) &&
2809 smt_slice(smt_curr, sd) > task_timeslice(p)) 3118 smt_slice(smt_curr, sd) > task_timeslice(p))
2810 ret = 1; 3119 ret = 1;
2811
2812check_smt_task:
2813 if ((!smt_curr->mm && smt_curr != smt_rq->idle) ||
2814 rt_task(smt_curr))
2815 continue;
2816 if (!p->mm) {
2817 wakeup_busy_runqueue(smt_rq);
2818 continue;
2819 }
2820
2821 /*
2822 * Reschedule a lower priority task on the SMT sibling for
2823 * it to be put to sleep, or wake it up if it has been put to
2824 * sleep for priority reasons to see if it should run now.
2825 */
2826 if (rt_task(p)) {
2827 if ((jiffies % DEF_TIMESLICE) >
2828 (sd->per_cpu_gain * DEF_TIMESLICE / 100))
2829 resched_task(smt_curr);
2830 } else {
2831 if (TASK_PREEMPTS_CURR(p, smt_rq) &&
2832 smt_slice(p, sd) > task_timeslice(smt_curr))
2833 resched_task(smt_curr);
2834 else
2835 wakeup_busy_runqueue(smt_rq);
2836 } 3120 }
3121unlock:
3122 spin_unlock(&smt_rq->lock);
2837 } 3123 }
2838out_unlock:
2839 for_each_cpu_mask(i, sibling_map)
2840 spin_unlock(&cpu_rq(i)->lock);
2841 return ret; 3124 return ret;
2842} 3125}
2843#else 3126#else
2844static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq) 3127static inline void wake_sleeping_dependent(int this_cpu)
2845{ 3128{
2846} 3129}
2847 3130
2848static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq) 3131static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq,
3132 task_t *p)
2849{ 3133{
2850 return 0; 3134 return 0;
2851} 3135}
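
Both SMT helpers used to drop this_rq->lock and re-acquire every sibling runqueue lock in CPU order; the rewrite keeps this_rq locked and merely spin_trylock()s each sibling, skipping any that is contended, which is why the usual lock ordering no longer has to be observed. A pthread sketch of the same best-effort pattern, with pthread_mutex_trylock() standing in for spin_trylock() and the sibling count chosen arbitrarily:

    #include <pthread.h>
    #include <stdio.h>

    #define NR_SIBLINGS 4

    static pthread_mutex_t sibling_lock[NR_SIBLINGS] = {
        PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER,
        PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER,
    };

    /* Visit every sibling we can lock without blocking; skip the rest. */
    static void poke_siblings(int this_cpu)
    {
        int i;

        for (i = 0; i < NR_SIBLINGS; i++) {
            if (i == this_cpu)
                continue;
            if (pthread_mutex_trylock(&sibling_lock[i]) != 0)
                continue;              /* contended: just skip it */
            printf("poked sibling %d\n", i);
            pthread_mutex_unlock(&sibling_lock[i]);
        }
    }

    int main(void)
    {
        pthread_mutex_lock(&sibling_lock[2]);  /* simulate a busy sibling */
        poke_siblings(0);                      /* 1 and 3 poked, 2 skipped */
        pthread_mutex_unlock(&sibling_lock[2]);
        return 0;
    }
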
@@ -2967,32 +3251,13 @@ need_resched_nonpreemptible:
2967 3251
2968 cpu = smp_processor_id(); 3252 cpu = smp_processor_id();
2969 if (unlikely(!rq->nr_running)) { 3253 if (unlikely(!rq->nr_running)) {
2970go_idle:
2971 idle_balance(cpu, rq); 3254 idle_balance(cpu, rq);
2972 if (!rq->nr_running) { 3255 if (!rq->nr_running) {
2973 next = rq->idle; 3256 next = rq->idle;
2974 rq->expired_timestamp = 0; 3257 rq->expired_timestamp = 0;
2975 wake_sleeping_dependent(cpu, rq); 3258 wake_sleeping_dependent(cpu);
2976 /*
2977 * wake_sleeping_dependent() might have released
2978 * the runqueue, so break out if we got new
2979 * tasks meanwhile:
2980 */
2981 if (!rq->nr_running)
2982 goto switch_tasks;
2983 }
2984 } else {
2985 if (dependent_sleeper(cpu, rq)) {
2986 next = rq->idle;
2987 goto switch_tasks; 3259 goto switch_tasks;
2988 } 3260 }
2989 /*
2990 * dependent_sleeper() releases and reacquires the runqueue
2991 * lock, hence go into the idle loop if the rq went
2992 * empty meanwhile:
2993 */
2994 if (unlikely(!rq->nr_running))
2995 goto go_idle;
2996 } 3261 }
2997 3262
2998 array = rq->active; 3263 array = rq->active;
@@ -3030,6 +3295,8 @@ go_idle:
3030 } 3295 }
3031 } 3296 }
3032 next->sleep_type = SLEEP_NORMAL; 3297 next->sleep_type = SLEEP_NORMAL;
3298 if (dependent_sleeper(cpu, rq, next))
3299 next = rq->idle;
3033switch_tasks: 3300switch_tasks:
3034 if (next == rq->idle) 3301 if (next == rq->idle)
3035 schedstat_inc(rq, sched_goidle); 3302 schedstat_inc(rq, sched_goidle);
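
Since dependent_sleeper() no longer releases the runqueue lock, schedule() can select the next task first and only afterwards ask whether an SMT sibling's higher-priority work means this CPU should idle instead, which is what removes the old go_idle retry logic. A compressed sketch of the new decision order; task_t here is a local stand-in and the stub dependent_sleeper() always defers to the sibling:

    #include <stdio.h>

    typedef struct { const char *comm; } task_t;

    static task_t idle_task = { "idle" };
    static task_t next_task = { "worker" };

    /* Pretend the sibling runs something we must not disturb. */
    static int dependent_sleeper(int cpu, task_t *next)
    {
        (void)cpu;
        (void)next;
        return 1;
    }

    static task_t *pick_next(int cpu, int nr_running)
    {
        task_t *next;

        if (!nr_running)
            return &idle_task;              /* nothing runnable at all */
        next = &next_task;                  /* normal array scan would be here */
        if (dependent_sleeper(cpu, next))   /* defer to the SMT sibling */
            next = &idle_task;
        return next;
    }

    int main(void)
    {
        printf("next: %s\n", pick_next(0, 1)->comm);
        return 0;
    }
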
@@ -3473,12 +3740,65 @@ long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
3473 3740
3474EXPORT_SYMBOL(sleep_on_timeout); 3741EXPORT_SYMBOL(sleep_on_timeout);
3475 3742
3743#ifdef CONFIG_RT_MUTEXES
3744
3745/*
3746 * rt_mutex_setprio - set the current priority of a task
3747 * @p: task
3748 * @prio: prio value (kernel-internal form)
3749 *
3750 * This function changes the 'effective' priority of a task. It does
3751 * not touch ->normal_prio like __setscheduler().
3752 *
3753 * Used by the rt_mutex code to implement priority inheritance logic.
3754 */
3755void rt_mutex_setprio(task_t *p, int prio)
3756{
3757 unsigned long flags;
3758 prio_array_t *array;
3759 runqueue_t *rq;
3760 int oldprio;
3761
3762 BUG_ON(prio < 0 || prio > MAX_PRIO);
3763
3764 rq = task_rq_lock(p, &flags);
3765
3766 oldprio = p->prio;
3767 array = p->array;
3768 if (array)
3769 dequeue_task(p, array);
3770 p->prio = prio;
3771
3772 if (array) {
3773 /*
3774 * If changing to an RT priority then queue it
3775 * in the active array!
3776 */
3777 if (rt_task(p))
3778 array = rq->active;
3779 enqueue_task(p, array);
3780 /*
3781 * Reschedule if we are currently running on this runqueue and
3782 * our priority decreased, or if we are not currently running on
3783 * this runqueue and our priority is higher than the current's
3784 */
3785 if (task_running(rq, p)) {
3786 if (p->prio > oldprio)
3787 resched_task(rq->curr);
3788 } else if (TASK_PREEMPTS_CURR(p, rq))
3789 resched_task(rq->curr);
3790 }
3791 task_rq_unlock(rq, &flags);
3792}
3793
3794#endif
3795
3476void set_user_nice(task_t *p, long nice) 3796void set_user_nice(task_t *p, long nice)
3477{ 3797{
3478 unsigned long flags; 3798 unsigned long flags;
3479 prio_array_t *array; 3799 prio_array_t *array;
3480 runqueue_t *rq; 3800 runqueue_t *rq;
3481 int old_prio, new_prio, delta; 3801 int old_prio, delta;
3482 3802
3483 if (TASK_NICE(p) == nice || nice < -20 || nice > 19) 3803 if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
3484 return; 3804 return;
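
rt_mutex_setprio() above is the scheduler side of priority inheritance: dequeue the task if it is queued, rewrite only the effective ->prio (normal_prio is left alone), requeue it — into the active array when it has just become realtime — and reschedule whichever CPU is now running the wrong task. A reduced sketch of that dequeue/reprioritise/requeue shape, with a toy queued flag instead of prio_array_t:

    #include <stdio.h>

    struct toy_task { int prio; int queued; };

    static void dequeue(struct toy_task *p) { p->queued = 0; }
    static void enqueue(struct toy_task *p) { p->queued = 1; }

    /* Change only the effective priority, preserving the queued state. */
    static void setprio_sketch(struct toy_task *p, int prio)
    {
        int was_queued = p->queued;
        int oldprio = p->prio;

        if (was_queued)
            dequeue(p);
        p->prio = prio;
        if (was_queued) {
            enqueue(p);
            if (prio < oldprio)
                printf("boosted: may need to preempt the current task\n");
        }
    }

    int main(void)
    {
        struct toy_task t = { .prio = 120, .queued = 1 };

        setprio_sketch(&t, 95);   /* PI boost into the RT priority range */
        setprio_sketch(&t, 120);  /* restore when the lock is released */
        return 0;
    }
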
@@ -3493,22 +3813,25 @@ void set_user_nice(task_t *p, long nice)
3493 * it wont have any effect on scheduling until the task is 3813 * it wont have any effect on scheduling until the task is
3494 * not SCHED_NORMAL/SCHED_BATCH: 3814 * not SCHED_NORMAL/SCHED_BATCH:
3495 */ 3815 */
3496 if (rt_task(p)) { 3816 if (has_rt_policy(p)) {
3497 p->static_prio = NICE_TO_PRIO(nice); 3817 p->static_prio = NICE_TO_PRIO(nice);
3498 goto out_unlock; 3818 goto out_unlock;
3499 } 3819 }
3500 array = p->array; 3820 array = p->array;
3501 if (array) 3821 if (array) {
3502 dequeue_task(p, array); 3822 dequeue_task(p, array);
3823 dec_raw_weighted_load(rq, p);
3824 }
3503 3825
3504 old_prio = p->prio;
3505 new_prio = NICE_TO_PRIO(nice);
3506 delta = new_prio - old_prio;
3507 p->static_prio = NICE_TO_PRIO(nice); 3826 p->static_prio = NICE_TO_PRIO(nice);
3508 p->prio += delta; 3827 set_load_weight(p);
3828 old_prio = p->prio;
3829 p->prio = effective_prio(p);
3830 delta = p->prio - old_prio;
3509 3831
3510 if (array) { 3832 if (array) {
3511 enqueue_task(p, array); 3833 enqueue_task(p, array);
3834 inc_raw_weighted_load(rq, p);
3512 /* 3835 /*
3513 * If the task increased its priority or is running and 3836 * If the task increased its priority or is running and
3514 * lowered its priority, then reschedule its CPU: 3837 * lowered its priority, then reschedule its CPU:
@@ -3519,7 +3842,6 @@ void set_user_nice(task_t *p, long nice)
3519out_unlock: 3842out_unlock:
3520 task_rq_unlock(rq, &flags); 3843 task_rq_unlock(rq, &flags);
3521} 3844}
3522
3523EXPORT_SYMBOL(set_user_nice); 3845EXPORT_SYMBOL(set_user_nice);
3524 3846
3525/* 3847/*
@@ -3634,16 +3956,15 @@ static void __setscheduler(struct task_struct *p, int policy, int prio)
3634 BUG_ON(p->array); 3956 BUG_ON(p->array);
3635 p->policy = policy; 3957 p->policy = policy;
3636 p->rt_priority = prio; 3958 p->rt_priority = prio;
3637 if (policy != SCHED_NORMAL && policy != SCHED_BATCH) { 3959 p->normal_prio = normal_prio(p);
3638 p->prio = MAX_RT_PRIO-1 - p->rt_priority; 3960 /* we are holding p->pi_lock already */
3639 } else { 3961 p->prio = rt_mutex_getprio(p);
3640 p->prio = p->static_prio; 3962 /*
3641 /* 3963 * SCHED_BATCH tasks are treated as perpetual CPU hogs:
3642 * SCHED_BATCH tasks are treated as perpetual CPU hogs: 3964 */
3643 */ 3965 if (policy == SCHED_BATCH)
3644 if (policy == SCHED_BATCH) 3966 p->sleep_avg = 0;
3645 p->sleep_avg = 0; 3967 set_load_weight(p);
3646 }
3647} 3968}
3648 3969
3649/** 3970/**
@@ -3662,6 +3983,8 @@ int sched_setscheduler(struct task_struct *p, int policy,
3662 unsigned long flags; 3983 unsigned long flags;
3663 runqueue_t *rq; 3984 runqueue_t *rq;
3664 3985
3986 /* may grab non-irq protected spin_locks */
3987 BUG_ON(in_interrupt());
3665recheck: 3988recheck:
3666 /* double check policy once rq lock held */ 3989 /* double check policy once rq lock held */
3667 if (policy < 0) 3990 if (policy < 0)
@@ -3710,14 +4033,20 @@ recheck:
3710 if (retval) 4033 if (retval)
3711 return retval; 4034 return retval;
3712 /* 4035 /*
4036 * make sure no PI-waiters arrive (or leave) while we are
4037 * changing the priority of the task:
4038 */
4039 spin_lock_irqsave(&p->pi_lock, flags);
4040 /*
 3713 * To be able to change p->policy safely, the appropriate 4041 * To be able to change p->policy safely, the appropriate
3714 * runqueue lock must be held. 4042 * runqueue lock must be held.
3715 */ 4043 */
3716 rq = task_rq_lock(p, &flags); 4044 rq = __task_rq_lock(p);
3717 /* recheck policy now with rq lock held */ 4045 /* recheck policy now with rq lock held */
3718 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { 4046 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
3719 policy = oldpolicy = -1; 4047 policy = oldpolicy = -1;
3720 task_rq_unlock(rq, &flags); 4048 __task_rq_unlock(rq);
4049 spin_unlock_irqrestore(&p->pi_lock, flags);
3721 goto recheck; 4050 goto recheck;
3722 } 4051 }
3723 array = p->array; 4052 array = p->array;
@@ -3738,7 +4067,11 @@ recheck:
3738 } else if (TASK_PREEMPTS_CURR(p, rq)) 4067 } else if (TASK_PREEMPTS_CURR(p, rq))
3739 resched_task(rq->curr); 4068 resched_task(rq->curr);
3740 } 4069 }
3741 task_rq_unlock(rq, &flags); 4070 __task_rq_unlock(rq);
4071 spin_unlock_irqrestore(&p->pi_lock, flags);
4072
4073 rt_mutex_adjust_pi(p);
4074
3742 return 0; 4075 return 0;
3743} 4076}
3744EXPORT_SYMBOL_GPL(sched_setscheduler); 4077EXPORT_SYMBOL_GPL(sched_setscheduler);
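
sched_setscheduler() now nests the runqueue lock inside p->pi_lock so PI waiters cannot arrive or leave while the policy changes, and rt_mutex_adjust_pi() runs only after both locks are dropped. A pthread sketch of that outer-lock/inner-lock/side-effects-after-unlock shape; the lock names and the adjust step are illustrative only:

    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t pi_lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_mutex_t rq_lock = PTHREAD_MUTEX_INITIALIZER;

    static void adjust_pi_after_unlock(void)
    {
        printf("re-walk the PI chain outside both locks\n");
    }

    static void set_policy_sketch(int *policy, int new_policy)
    {
        pthread_mutex_lock(&pi_lock);   /* keep PI waiters out */
        pthread_mutex_lock(&rq_lock);   /* then take the runqueue */

        *policy = new_policy;

        pthread_mutex_unlock(&rq_lock);
        pthread_mutex_unlock(&pi_lock);

        adjust_pi_after_unlock();       /* may itself take these locks */
    }

    int main(void)
    {
        int policy = 0;

        set_policy_sketch(&policy, 1);
        return 0;
    }
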
@@ -3760,8 +4093,10 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
3760 read_unlock_irq(&tasklist_lock); 4093 read_unlock_irq(&tasklist_lock);
3761 return -ESRCH; 4094 return -ESRCH;
3762 } 4095 }
3763 retval = sched_setscheduler(p, policy, &lparam); 4096 get_task_struct(p);
3764 read_unlock_irq(&tasklist_lock); 4097 read_unlock_irq(&tasklist_lock);
4098 retval = sched_setscheduler(p, policy, &lparam);
4099 put_task_struct(p);
3765 return retval; 4100 return retval;
3766} 4101}
3767 4102
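
do_sched_setscheduler() no longer calls sched_setscheduler() under tasklist_lock: the task is pinned with get_task_struct(), the read lock is dropped, the policy change (which now touches rt-mutex PI state) runs, and the reference is released. A generic sketch of that pin/drop pattern with a toy reference count standing in for the task_struct refcount:

    #include <assert.h>

    struct toy_task { int refcount; };

    static void get_task(struct toy_task *p) { p->refcount++; }
    static void put_task(struct toy_task *p)
    {
        p->refcount--;
        assert(p->refcount >= 0);
    }

    static void change_policy(struct toy_task *p) { (void)p; /* may block */ }

    int main(void)
    {
        struct toy_task t = { .refcount = 1 };

        /* under the lookup lock: */
        get_task(&t);        /* pin so the task cannot go away */
        /* ...drop the lookup lock here... */
        change_policy(&t);   /* safe to do the heavier work now */
        put_task(&t);        /* release our pin */
        return 0;
    }
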
@@ -3886,6 +4221,10 @@ long sched_setaffinity(pid_t pid, cpumask_t new_mask)
3886 !capable(CAP_SYS_NICE)) 4221 !capable(CAP_SYS_NICE))
3887 goto out_unlock; 4222 goto out_unlock;
3888 4223
4224 retval = security_task_setscheduler(p, 0, NULL);
4225 if (retval)
4226 goto out_unlock;
4227
3889 cpus_allowed = cpuset_cpus_allowed(p); 4228 cpus_allowed = cpuset_cpus_allowed(p);
3890 cpus_and(new_mask, new_mask, cpus_allowed); 4229 cpus_and(new_mask, new_mask, cpus_allowed);
3891 retval = set_cpus_allowed(p, new_mask); 4230 retval = set_cpus_allowed(p, new_mask);
@@ -3954,7 +4293,10 @@ long sched_getaffinity(pid_t pid, cpumask_t *mask)
3954 if (!p) 4293 if (!p)
3955 goto out_unlock; 4294 goto out_unlock;
3956 4295
3957 retval = 0; 4296 retval = security_task_getscheduler(p);
4297 if (retval)
4298 goto out_unlock;
4299
3958 cpus_and(*mask, p->cpus_allowed, cpu_online_map); 4300 cpus_and(*mask, p->cpus_allowed, cpu_online_map);
3959 4301
3960out_unlock: 4302out_unlock:
@@ -4044,17 +4386,25 @@ asmlinkage long sys_sched_yield(void)
4044 return 0; 4386 return 0;
4045} 4387}
4046 4388
4047static inline void __cond_resched(void) 4389static inline int __resched_legal(void)
4390{
4391 if (unlikely(preempt_count()))
4392 return 0;
4393 if (unlikely(system_state != SYSTEM_RUNNING))
4394 return 0;
4395 return 1;
4396}
4397
4398static void __cond_resched(void)
4048{ 4399{
4400#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
4401 __might_sleep(__FILE__, __LINE__);
4402#endif
4049 /* 4403 /*
4050 * The BKS might be reacquired before we have dropped 4404 * The BKS might be reacquired before we have dropped
4051 * PREEMPT_ACTIVE, which could trigger a second 4405 * PREEMPT_ACTIVE, which could trigger a second
4052 * cond_resched() call. 4406 * cond_resched() call.
4053 */ 4407 */
4054 if (unlikely(preempt_count()))
4055 return;
4056 if (unlikely(system_state != SYSTEM_RUNNING))
4057 return;
4058 do { 4408 do {
4059 add_preempt_count(PREEMPT_ACTIVE); 4409 add_preempt_count(PREEMPT_ACTIVE);
4060 schedule(); 4410 schedule();
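
The preempt-count and system-state tests that each cond_resched() variant used to open-code are factored into __resched_legal(), so every caller now checks need_resched() && __resched_legal() before paying for a reschedule. A userspace analogue with stubbed predicates standing in for preempt_count() and system_state:

    #include <stdio.h>

    static int preempt_count_stub;      /* stands in for preempt_count() */
    static int system_running = 1;      /* stands in for SYSTEM_RUNNING  */

    static int resched_legal(void)
    {
        if (preempt_count_stub)
            return 0;                   /* inside a non-preemptible section */
        if (!system_running)
            return 0;                   /* too early or late in boot/shutdown */
        return 1;
    }

    static int cond_resched_sketch(int need_resched)
    {
        if (need_resched && resched_legal()) {
            printf("reschedule\n");
            return 1;
        }
        return 0;
    }

    int main(void)
    {
        cond_resched_sketch(1);         /* reschedules */
        preempt_count_stub = 1;
        cond_resched_sketch(1);         /* refused */
        return 0;
    }
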
@@ -4064,13 +4414,12 @@ static inline void __cond_resched(void)
4064 4414
4065int __sched cond_resched(void) 4415int __sched cond_resched(void)
4066{ 4416{
4067 if (need_resched()) { 4417 if (need_resched() && __resched_legal()) {
4068 __cond_resched(); 4418 __cond_resched();
4069 return 1; 4419 return 1;
4070 } 4420 }
4071 return 0; 4421 return 0;
4072} 4422}
4073
4074EXPORT_SYMBOL(cond_resched); 4423EXPORT_SYMBOL(cond_resched);
4075 4424
4076/* 4425/*
@@ -4091,7 +4440,7 @@ int cond_resched_lock(spinlock_t *lock)
4091 ret = 1; 4440 ret = 1;
4092 spin_lock(lock); 4441 spin_lock(lock);
4093 } 4442 }
4094 if (need_resched()) { 4443 if (need_resched() && __resched_legal()) {
4095 _raw_spin_unlock(lock); 4444 _raw_spin_unlock(lock);
4096 preempt_enable_no_resched(); 4445 preempt_enable_no_resched();
4097 __cond_resched(); 4446 __cond_resched();
@@ -4100,14 +4449,13 @@ int cond_resched_lock(spinlock_t *lock)
4100 } 4449 }
4101 return ret; 4450 return ret;
4102} 4451}
4103
4104EXPORT_SYMBOL(cond_resched_lock); 4452EXPORT_SYMBOL(cond_resched_lock);
4105 4453
4106int __sched cond_resched_softirq(void) 4454int __sched cond_resched_softirq(void)
4107{ 4455{
4108 BUG_ON(!in_softirq()); 4456 BUG_ON(!in_softirq());
4109 4457
4110 if (need_resched()) { 4458 if (need_resched() && __resched_legal()) {
4111 __local_bh_enable(); 4459 __local_bh_enable();
4112 __cond_resched(); 4460 __cond_resched();
4113 local_bh_disable(); 4461 local_bh_disable();
@@ -4115,10 +4463,8 @@ int __sched cond_resched_softirq(void)
4115 } 4463 }
4116 return 0; 4464 return 0;
4117} 4465}
4118
4119EXPORT_SYMBOL(cond_resched_softirq); 4466EXPORT_SYMBOL(cond_resched_softirq);
4120 4467
4121
4122/** 4468/**
4123 * yield - yield the current processor to other threads. 4469 * yield - yield the current processor to other threads.
4124 * 4470 *
@@ -4142,7 +4488,7 @@ EXPORT_SYMBOL(yield);
4142 */ 4488 */
4143void __sched io_schedule(void) 4489void __sched io_schedule(void)
4144{ 4490{
4145 struct runqueue *rq = &per_cpu(runqueues, raw_smp_processor_id()); 4491 struct runqueue *rq = &__raw_get_cpu_var(runqueues);
4146 4492
4147 atomic_inc(&rq->nr_iowait); 4493 atomic_inc(&rq->nr_iowait);
4148 schedule(); 4494 schedule();
@@ -4153,7 +4499,7 @@ EXPORT_SYMBOL(io_schedule);
4153 4499
4154long __sched io_schedule_timeout(long timeout) 4500long __sched io_schedule_timeout(long timeout)
4155{ 4501{
4156 struct runqueue *rq = &per_cpu(runqueues, raw_smp_processor_id()); 4502 struct runqueue *rq = &__raw_get_cpu_var(runqueues);
4157 long ret; 4503 long ret;
4158 4504
4159 atomic_inc(&rq->nr_iowait); 4505 atomic_inc(&rq->nr_iowait);
@@ -4237,7 +4583,7 @@ long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval)
4237 if (retval) 4583 if (retval)
4238 goto out_unlock; 4584 goto out_unlock;
4239 4585
4240 jiffies_to_timespec(p->policy & SCHED_FIFO ? 4586 jiffies_to_timespec(p->policy == SCHED_FIFO ?
4241 0 : task_timeslice(p), &t); 4587 0 : task_timeslice(p), &t);
4242 read_unlock(&tasklist_lock); 4588 read_unlock(&tasklist_lock);
4243 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; 4589 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
@@ -4363,7 +4709,7 @@ void __devinit init_idle(task_t *idle, int cpu)
4363 idle->timestamp = sched_clock(); 4709 idle->timestamp = sched_clock();
4364 idle->sleep_avg = 0; 4710 idle->sleep_avg = 0;
4365 idle->array = NULL; 4711 idle->array = NULL;
4366 idle->prio = MAX_PRIO; 4712 idle->prio = idle->normal_prio = MAX_PRIO;
4367 idle->state = TASK_RUNNING; 4713 idle->state = TASK_RUNNING;
4368 idle->cpus_allowed = cpumask_of_cpu(cpu); 4714 idle->cpus_allowed = cpumask_of_cpu(cpu);
4369 set_task_cpu(idle, cpu); 4715 set_task_cpu(idle, cpu);
@@ -4459,13 +4805,16 @@ EXPORT_SYMBOL_GPL(set_cpus_allowed);
4459 * 4805 *
4460 * So we race with normal scheduler movements, but that's OK, as long 4806 * So we race with normal scheduler movements, but that's OK, as long
4461 * as the task is no longer on this CPU. 4807 * as the task is no longer on this CPU.
4808 *
4809 * Returns non-zero if task was successfully migrated.
4462 */ 4810 */
4463static void __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) 4811static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
4464{ 4812{
4465 runqueue_t *rq_dest, *rq_src; 4813 runqueue_t *rq_dest, *rq_src;
4814 int ret = 0;
4466 4815
4467 if (unlikely(cpu_is_offline(dest_cpu))) 4816 if (unlikely(cpu_is_offline(dest_cpu)))
4468 return; 4817 return ret;
4469 4818
4470 rq_src = cpu_rq(src_cpu); 4819 rq_src = cpu_rq(src_cpu);
4471 rq_dest = cpu_rq(dest_cpu); 4820 rq_dest = cpu_rq(dest_cpu);
@@ -4493,9 +4842,10 @@ static void __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
4493 if (TASK_PREEMPTS_CURR(p, rq_dest)) 4842 if (TASK_PREEMPTS_CURR(p, rq_dest))
4494 resched_task(rq_dest->curr); 4843 resched_task(rq_dest->curr);
4495 } 4844 }
4496 4845 ret = 1;
4497out: 4846out:
4498 double_rq_unlock(rq_src, rq_dest); 4847 double_rq_unlock(rq_src, rq_dest);
4848 return ret;
4499} 4849}
4500 4850
4501/* 4851/*
@@ -4565,9 +4915,12 @@ wait_to_die:
4565/* Figure out where task on dead CPU should go, use force if necessary. */ 4915/* Figure out where task on dead CPU should go, use force if necessary. */
4566static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *tsk) 4916static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *tsk)
4567{ 4917{
4918 runqueue_t *rq;
4919 unsigned long flags;
4568 int dest_cpu; 4920 int dest_cpu;
4569 cpumask_t mask; 4921 cpumask_t mask;
4570 4922
4923restart:
4571 /* On same node? */ 4924 /* On same node? */
4572 mask = node_to_cpumask(cpu_to_node(dead_cpu)); 4925 mask = node_to_cpumask(cpu_to_node(dead_cpu));
4573 cpus_and(mask, mask, tsk->cpus_allowed); 4926 cpus_and(mask, mask, tsk->cpus_allowed);
@@ -4579,8 +4932,10 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *tsk)
4579 4932
4580 /* No more Mr. Nice Guy. */ 4933 /* No more Mr. Nice Guy. */
4581 if (dest_cpu == NR_CPUS) { 4934 if (dest_cpu == NR_CPUS) {
4935 rq = task_rq_lock(tsk, &flags);
4582 cpus_setall(tsk->cpus_allowed); 4936 cpus_setall(tsk->cpus_allowed);
4583 dest_cpu = any_online_cpu(tsk->cpus_allowed); 4937 dest_cpu = any_online_cpu(tsk->cpus_allowed);
4938 task_rq_unlock(rq, &flags);
4584 4939
4585 /* 4940 /*
4586 * Don't tell them about moving exiting tasks or 4941 * Don't tell them about moving exiting tasks or
@@ -4592,7 +4947,8 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *tsk)
4592 "longer affine to cpu%d\n", 4947 "longer affine to cpu%d\n",
4593 tsk->pid, tsk->comm, dead_cpu); 4948 tsk->pid, tsk->comm, dead_cpu);
4594 } 4949 }
4595 __migrate_task(tsk, dead_cpu, dest_cpu); 4950 if (!__migrate_task(tsk, dead_cpu, dest_cpu))
4951 goto restart;
4596} 4952}
4597 4953
4598/* 4954/*
@@ -4719,8 +5075,9 @@ static void migrate_dead_tasks(unsigned int dead_cpu)
4719 * migration_call - callback that gets triggered when a CPU is added. 5075 * migration_call - callback that gets triggered when a CPU is added.
4720 * Here we can start up the necessary migration thread for the new CPU. 5076 * Here we can start up the necessary migration thread for the new CPU.
4721 */ 5077 */
4722static int migration_call(struct notifier_block *nfb, unsigned long action, 5078static int __cpuinit migration_call(struct notifier_block *nfb,
4723 void *hcpu) 5079 unsigned long action,
5080 void *hcpu)
4724{ 5081{
4725 int cpu = (long)hcpu; 5082 int cpu = (long)hcpu;
4726 struct task_struct *p; 5083 struct task_struct *p;
@@ -4746,6 +5103,8 @@ static int migration_call(struct notifier_block *nfb, unsigned long action,
4746 break; 5103 break;
4747#ifdef CONFIG_HOTPLUG_CPU 5104#ifdef CONFIG_HOTPLUG_CPU
4748 case CPU_UP_CANCELED: 5105 case CPU_UP_CANCELED:
5106 if (!cpu_rq(cpu)->migration_thread)
5107 break;
4749 /* Unbind it from offline cpu so it can run. Fall thru. */ 5108 /* Unbind it from offline cpu so it can run. Fall thru. */
4750 kthread_bind(cpu_rq(cpu)->migration_thread, 5109 kthread_bind(cpu_rq(cpu)->migration_thread,
4751 any_online_cpu(cpu_online_map)); 5110 any_online_cpu(cpu_online_map));
@@ -4788,7 +5147,7 @@ static int migration_call(struct notifier_block *nfb, unsigned long action,
4788/* Register at highest priority so that task migration (migrate_all_tasks) 5147/* Register at highest priority so that task migration (migrate_all_tasks)
4789 * happens before everything else. 5148 * happens before everything else.
4790 */ 5149 */
4791static struct notifier_block migration_notifier = { 5150static struct notifier_block __cpuinitdata migration_notifier = {
4792 .notifier_call = migration_call, 5151 .notifier_call = migration_call,
4793 .priority = 10 5152 .priority = 10
4794}; 5153};
@@ -5589,6 +5948,7 @@ static cpumask_t sched_domain_node_span(int node)
5589} 5948}
5590#endif 5949#endif
5591 5950
5951int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
5592/* 5952/*
5593 * At the moment, CONFIG_SCHED_SMT is never defined, but leave it in so we 5953 * At the moment, CONFIG_SCHED_SMT is never defined, but leave it in so we
5594 * can switch it on easily if needed. 5954 * can switch it on easily if needed.
@@ -5604,7 +5964,7 @@ static int cpu_to_cpu_group(int cpu)
5604 5964
5605#ifdef CONFIG_SCHED_MC 5965#ifdef CONFIG_SCHED_MC
5606static DEFINE_PER_CPU(struct sched_domain, core_domains); 5966static DEFINE_PER_CPU(struct sched_domain, core_domains);
5607static struct sched_group sched_group_core[NR_CPUS]; 5967static struct sched_group *sched_group_core_bycpu[NR_CPUS];
5608#endif 5968#endif
5609 5969
5610#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) 5970#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
@@ -5620,7 +5980,7 @@ static int cpu_to_core_group(int cpu)
5620#endif 5980#endif
5621 5981
5622static DEFINE_PER_CPU(struct sched_domain, phys_domains); 5982static DEFINE_PER_CPU(struct sched_domain, phys_domains);
5623static struct sched_group sched_group_phys[NR_CPUS]; 5983static struct sched_group *sched_group_phys_bycpu[NR_CPUS];
5624static int cpu_to_phys_group(int cpu) 5984static int cpu_to_phys_group(int cpu)
5625{ 5985{
5626#if defined(CONFIG_SCHED_MC) 5986#if defined(CONFIG_SCHED_MC)
@@ -5677,13 +6037,74 @@ next_sg:
5677} 6037}
5678#endif 6038#endif
5679 6039
6040/* Free memory allocated for various sched_group structures */
6041static void free_sched_groups(const cpumask_t *cpu_map)
6042{
6043 int cpu;
6044#ifdef CONFIG_NUMA
6045 int i;
6046
6047 for_each_cpu_mask(cpu, *cpu_map) {
6048 struct sched_group *sched_group_allnodes
6049 = sched_group_allnodes_bycpu[cpu];
6050 struct sched_group **sched_group_nodes
6051 = sched_group_nodes_bycpu[cpu];
6052
6053 if (sched_group_allnodes) {
6054 kfree(sched_group_allnodes);
6055 sched_group_allnodes_bycpu[cpu] = NULL;
6056 }
6057
6058 if (!sched_group_nodes)
6059 continue;
6060
6061 for (i = 0; i < MAX_NUMNODES; i++) {
6062 cpumask_t nodemask = node_to_cpumask(i);
6063 struct sched_group *oldsg, *sg = sched_group_nodes[i];
6064
6065 cpus_and(nodemask, nodemask, *cpu_map);
6066 if (cpus_empty(nodemask))
6067 continue;
6068
6069 if (sg == NULL)
6070 continue;
6071 sg = sg->next;
6072next_sg:
6073 oldsg = sg;
6074 sg = sg->next;
6075 kfree(oldsg);
6076 if (oldsg != sched_group_nodes[i])
6077 goto next_sg;
6078 }
6079 kfree(sched_group_nodes);
6080 sched_group_nodes_bycpu[cpu] = NULL;
6081 }
6082#endif
6083 for_each_cpu_mask(cpu, *cpu_map) {
6084 if (sched_group_phys_bycpu[cpu]) {
6085 kfree(sched_group_phys_bycpu[cpu]);
6086 sched_group_phys_bycpu[cpu] = NULL;
6087 }
6088#ifdef CONFIG_SCHED_MC
6089 if (sched_group_core_bycpu[cpu]) {
6090 kfree(sched_group_core_bycpu[cpu]);
6091 sched_group_core_bycpu[cpu] = NULL;
6092 }
6093#endif
6094 }
6095}
6096
5680/* 6097/*
5681 * Build sched domains for a given set of cpus and attach the sched domains 6098 * Build sched domains for a given set of cpus and attach the sched domains
5682 * to the individual cpus 6099 * to the individual cpus
5683 */ 6100 */
5684void build_sched_domains(const cpumask_t *cpu_map) 6101static int build_sched_domains(const cpumask_t *cpu_map)
5685{ 6102{
5686 int i; 6103 int i;
6104 struct sched_group *sched_group_phys = NULL;
6105#ifdef CONFIG_SCHED_MC
6106 struct sched_group *sched_group_core = NULL;
6107#endif
5687#ifdef CONFIG_NUMA 6108#ifdef CONFIG_NUMA
5688 struct sched_group **sched_group_nodes = NULL; 6109 struct sched_group **sched_group_nodes = NULL;
5689 struct sched_group *sched_group_allnodes = NULL; 6110 struct sched_group *sched_group_allnodes = NULL;
@@ -5691,11 +6112,11 @@ void build_sched_domains(const cpumask_t *cpu_map)
5691 /* 6112 /*
5692 * Allocate the per-node list of sched groups 6113 * Allocate the per-node list of sched groups
5693 */ 6114 */
5694 sched_group_nodes = kmalloc(sizeof(struct sched_group*)*MAX_NUMNODES, 6115 sched_group_nodes = kzalloc(sizeof(struct sched_group*)*MAX_NUMNODES,
5695 GFP_ATOMIC); 6116 GFP_KERNEL);
5696 if (!sched_group_nodes) { 6117 if (!sched_group_nodes) {
5697 printk(KERN_WARNING "Can not alloc sched group node list\n"); 6118 printk(KERN_WARNING "Can not alloc sched group node list\n");
5698 return; 6119 return -ENOMEM;
5699 } 6120 }
5700 sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes; 6121 sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes;
5701#endif 6122#endif
@@ -5721,7 +6142,7 @@ void build_sched_domains(const cpumask_t *cpu_map)
5721 if (!sched_group_allnodes) { 6142 if (!sched_group_allnodes) {
5722 printk(KERN_WARNING 6143 printk(KERN_WARNING
5723 "Can not alloc allnodes sched group\n"); 6144 "Can not alloc allnodes sched group\n");
5724 break; 6145 goto error;
5725 } 6146 }
5726 sched_group_allnodes_bycpu[i] 6147 sched_group_allnodes_bycpu[i]
5727 = sched_group_allnodes; 6148 = sched_group_allnodes;
@@ -5742,6 +6163,18 @@ void build_sched_domains(const cpumask_t *cpu_map)
5742 cpus_and(sd->span, sd->span, *cpu_map); 6163 cpus_and(sd->span, sd->span, *cpu_map);
5743#endif 6164#endif
5744 6165
6166 if (!sched_group_phys) {
6167 sched_group_phys
6168 = kmalloc(sizeof(struct sched_group) * NR_CPUS,
6169 GFP_KERNEL);
6170 if (!sched_group_phys) {
6171 printk (KERN_WARNING "Can not alloc phys sched"
6172 "group\n");
6173 goto error;
6174 }
6175 sched_group_phys_bycpu[i] = sched_group_phys;
6176 }
6177
5745 p = sd; 6178 p = sd;
5746 sd = &per_cpu(phys_domains, i); 6179 sd = &per_cpu(phys_domains, i);
5747 group = cpu_to_phys_group(i); 6180 group = cpu_to_phys_group(i);
@@ -5751,6 +6184,18 @@ void build_sched_domains(const cpumask_t *cpu_map)
5751 sd->groups = &sched_group_phys[group]; 6184 sd->groups = &sched_group_phys[group];
5752 6185
5753#ifdef CONFIG_SCHED_MC 6186#ifdef CONFIG_SCHED_MC
6187 if (!sched_group_core) {
6188 sched_group_core
6189 = kmalloc(sizeof(struct sched_group) * NR_CPUS,
6190 GFP_KERNEL);
6191 if (!sched_group_core) {
6192 printk (KERN_WARNING "Can not alloc core sched"
6193 "group\n");
6194 goto error;
6195 }
6196 sched_group_core_bycpu[i] = sched_group_core;
6197 }
6198
5754 p = sd; 6199 p = sd;
5755 sd = &per_cpu(core_domains, i); 6200 sd = &per_cpu(core_domains, i);
5756 group = cpu_to_core_group(i); 6201 group = cpu_to_core_group(i);
@@ -5834,24 +6279,21 @@ void build_sched_domains(const cpumask_t *cpu_map)
5834 domainspan = sched_domain_node_span(i); 6279 domainspan = sched_domain_node_span(i);
5835 cpus_and(domainspan, domainspan, *cpu_map); 6280 cpus_and(domainspan, domainspan, *cpu_map);
5836 6281
5837 sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL); 6282 sg = kmalloc_node(sizeof(struct sched_group), GFP_KERNEL, i);
6283 if (!sg) {
6284 printk(KERN_WARNING "Can not alloc domain group for "
6285 "node %d\n", i);
6286 goto error;
6287 }
5838 sched_group_nodes[i] = sg; 6288 sched_group_nodes[i] = sg;
5839 for_each_cpu_mask(j, nodemask) { 6289 for_each_cpu_mask(j, nodemask) {
5840 struct sched_domain *sd; 6290 struct sched_domain *sd;
5841 sd = &per_cpu(node_domains, j); 6291 sd = &per_cpu(node_domains, j);
5842 sd->groups = sg; 6292 sd->groups = sg;
5843 if (sd->groups == NULL) {
5844 /* Turn off balancing if we have no groups */
5845 sd->flags = 0;
5846 }
5847 }
5848 if (!sg) {
5849 printk(KERN_WARNING
5850 "Can not alloc domain group for node %d\n", i);
5851 continue;
5852 } 6293 }
5853 sg->cpu_power = 0; 6294 sg->cpu_power = 0;
5854 sg->cpumask = nodemask; 6295 sg->cpumask = nodemask;
6296 sg->next = sg;
5855 cpus_or(covered, covered, nodemask); 6297 cpus_or(covered, covered, nodemask);
5856 prev = sg; 6298 prev = sg;
5857 6299
@@ -5870,54 +6312,90 @@ void build_sched_domains(const cpumask_t *cpu_map)
5870 if (cpus_empty(tmp)) 6312 if (cpus_empty(tmp))
5871 continue; 6313 continue;
5872 6314
5873 sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL); 6315 sg = kmalloc_node(sizeof(struct sched_group),
6316 GFP_KERNEL, i);
5874 if (!sg) { 6317 if (!sg) {
5875 printk(KERN_WARNING 6318 printk(KERN_WARNING
5876 "Can not alloc domain group for node %d\n", j); 6319 "Can not alloc domain group for node %d\n", j);
5877 break; 6320 goto error;
5878 } 6321 }
5879 sg->cpu_power = 0; 6322 sg->cpu_power = 0;
5880 sg->cpumask = tmp; 6323 sg->cpumask = tmp;
6324 sg->next = prev->next;
5881 cpus_or(covered, covered, tmp); 6325 cpus_or(covered, covered, tmp);
5882 prev->next = sg; 6326 prev->next = sg;
5883 prev = sg; 6327 prev = sg;
5884 } 6328 }
5885 prev->next = sched_group_nodes[i];
5886 } 6329 }
5887#endif 6330#endif
5888 6331
5889 /* Calculate CPU power for physical packages and nodes */ 6332 /* Calculate CPU power for physical packages and nodes */
6333#ifdef CONFIG_SCHED_SMT
5890 for_each_cpu_mask(i, *cpu_map) { 6334 for_each_cpu_mask(i, *cpu_map) {
5891 int power;
5892 struct sched_domain *sd; 6335 struct sched_domain *sd;
5893#ifdef CONFIG_SCHED_SMT
5894 sd = &per_cpu(cpu_domains, i); 6336 sd = &per_cpu(cpu_domains, i);
5895 power = SCHED_LOAD_SCALE; 6337 sd->groups->cpu_power = SCHED_LOAD_SCALE;
5896 sd->groups->cpu_power = power; 6338 }
5897#endif 6339#endif
5898#ifdef CONFIG_SCHED_MC 6340#ifdef CONFIG_SCHED_MC
6341 for_each_cpu_mask(i, *cpu_map) {
6342 int power;
6343 struct sched_domain *sd;
5899 sd = &per_cpu(core_domains, i); 6344 sd = &per_cpu(core_domains, i);
5900 power = SCHED_LOAD_SCALE + (cpus_weight(sd->groups->cpumask)-1) 6345 if (sched_smt_power_savings)
6346 power = SCHED_LOAD_SCALE * cpus_weight(sd->groups->cpumask);
6347 else
6348 power = SCHED_LOAD_SCALE + (cpus_weight(sd->groups->cpumask)-1)
5901 * SCHED_LOAD_SCALE / 10; 6349 * SCHED_LOAD_SCALE / 10;
5902 sd->groups->cpu_power = power; 6350 sd->groups->cpu_power = power;
6351 }
6352#endif
5903 6353
6354 for_each_cpu_mask(i, *cpu_map) {
6355 struct sched_domain *sd;
6356#ifdef CONFIG_SCHED_MC
5904 sd = &per_cpu(phys_domains, i); 6357 sd = &per_cpu(phys_domains, i);
6358 if (i != first_cpu(sd->groups->cpumask))
6359 continue;
5905 6360
5906 /* 6361 sd->groups->cpu_power = 0;
5907 * This has to be < 2 * SCHED_LOAD_SCALE 6362 if (sched_mc_power_savings || sched_smt_power_savings) {
5908 * Lets keep it SCHED_LOAD_SCALE, so that 6363 int j;
5909 * while calculating NUMA group's cpu_power 6364
5910 * we can simply do 6365 for_each_cpu_mask(j, sd->groups->cpumask) {
5911 * numa_group->cpu_power += phys_group->cpu_power; 6366 struct sched_domain *sd1;
5912 * 6367 sd1 = &per_cpu(core_domains, j);
5913 * See "only add power once for each physical pkg" 6368 /*
5914 * comment below 6369 * for each core we will add once
5915 */ 6370 * to the group in physical domain
5916 sd->groups->cpu_power = SCHED_LOAD_SCALE; 6371 */
6372 if (j != first_cpu(sd1->groups->cpumask))
6373 continue;
6374
6375 if (sched_smt_power_savings)
6376 sd->groups->cpu_power += sd1->groups->cpu_power;
6377 else
6378 sd->groups->cpu_power += SCHED_LOAD_SCALE;
6379 }
6380 } else
6381 /*
6382 * This has to be < 2 * SCHED_LOAD_SCALE
6383 * Lets keep it SCHED_LOAD_SCALE, so that
6384 * while calculating NUMA group's cpu_power
6385 * we can simply do
6386 * numa_group->cpu_power += phys_group->cpu_power;
6387 *
6388 * See "only add power once for each physical pkg"
6389 * comment below
6390 */
6391 sd->groups->cpu_power = SCHED_LOAD_SCALE;
5917#else 6392#else
6393 int power;
5918 sd = &per_cpu(phys_domains, i); 6394 sd = &per_cpu(phys_domains, i);
5919 power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE * 6395 if (sched_smt_power_savings)
5920 (cpus_weight(sd->groups->cpumask)-1) / 10; 6396 power = SCHED_LOAD_SCALE * cpus_weight(sd->groups->cpumask);
6397 else
6398 power = SCHED_LOAD_SCALE;
5921 sd->groups->cpu_power = power; 6399 sd->groups->cpu_power = power;
5922#endif 6400#endif
5923 } 6401 }
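
With either power-savings knob set, a physical package's cpu_power becomes the sum of its cores' power (or of the full SMT thread count) instead of a flat SCHED_LOAD_SCALE, which biases balancing toward filling one package before waking another. A toy aggregation mirroring that choice; the scale constant and core counts are illustrative, not the kernel's values:

    #include <stdio.h>

    #define SCHED_LOAD_SCALE 128UL      /* illustrative scale only */

    static unsigned long package_power(int nr_cores,
                                       unsigned long core_power,
                                       int power_savings)
    {
        unsigned long power = 0;
        int i;

        if (!power_savings)
            return SCHED_LOAD_SCALE;    /* default: one "unit" per package */

        for (i = 0; i < nr_cores; i++)
            power += core_power;        /* count every core once */
        return power;
    }

    int main(void)
    {
        printf("default: %lu, power-savings: %lu\n",
               package_power(2, SCHED_LOAD_SCALE, 0),
               package_power(2, SCHED_LOAD_SCALE, 1));
        return 0;
    }
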
@@ -5945,13 +6423,20 @@ void build_sched_domains(const cpumask_t *cpu_map)
5945 * Tune cache-hot values: 6423 * Tune cache-hot values:
5946 */ 6424 */
5947 calibrate_migration_costs(cpu_map); 6425 calibrate_migration_costs(cpu_map);
6426
6427 return 0;
6428
6429error:
6430 free_sched_groups(cpu_map);
6431 return -ENOMEM;
5948} 6432}
5949/* 6433/*
5950 * Set up scheduler domains and groups. Callers must hold the hotplug lock. 6434 * Set up scheduler domains and groups. Callers must hold the hotplug lock.
5951 */ 6435 */
5952static void arch_init_sched_domains(const cpumask_t *cpu_map) 6436static int arch_init_sched_domains(const cpumask_t *cpu_map)
5953{ 6437{
5954 cpumask_t cpu_default_map; 6438 cpumask_t cpu_default_map;
6439 int err;
5955 6440
5956 /* 6441 /*
5957 * Setup mask for cpus without special case scheduling requirements. 6442 * Setup mask for cpus without special case scheduling requirements.
@@ -5960,51 +6445,14 @@ static void arch_init_sched_domains(const cpumask_t *cpu_map)
5960 */ 6445 */
5961 cpus_andnot(cpu_default_map, *cpu_map, cpu_isolated_map); 6446 cpus_andnot(cpu_default_map, *cpu_map, cpu_isolated_map);
5962 6447
5963 build_sched_domains(&cpu_default_map); 6448 err = build_sched_domains(&cpu_default_map);
6449
6450 return err;
5964} 6451}
5965 6452
5966static void arch_destroy_sched_domains(const cpumask_t *cpu_map) 6453static void arch_destroy_sched_domains(const cpumask_t *cpu_map)
5967{ 6454{
5968#ifdef CONFIG_NUMA 6455 free_sched_groups(cpu_map);
5969 int i;
5970 int cpu;
5971
5972 for_each_cpu_mask(cpu, *cpu_map) {
5973 struct sched_group *sched_group_allnodes
5974 = sched_group_allnodes_bycpu[cpu];
5975 struct sched_group **sched_group_nodes
5976 = sched_group_nodes_bycpu[cpu];
5977
5978 if (sched_group_allnodes) {
5979 kfree(sched_group_allnodes);
5980 sched_group_allnodes_bycpu[cpu] = NULL;
5981 }
5982
5983 if (!sched_group_nodes)
5984 continue;
5985
5986 for (i = 0; i < MAX_NUMNODES; i++) {
5987 cpumask_t nodemask = node_to_cpumask(i);
5988 struct sched_group *oldsg, *sg = sched_group_nodes[i];
5989
5990 cpus_and(nodemask, nodemask, *cpu_map);
5991 if (cpus_empty(nodemask))
5992 continue;
5993
5994 if (sg == NULL)
5995 continue;
5996 sg = sg->next;
5997next_sg:
5998 oldsg = sg;
5999 sg = sg->next;
6000 kfree(oldsg);
6001 if (oldsg != sched_group_nodes[i])
6002 goto next_sg;
6003 }
6004 kfree(sched_group_nodes);
6005 sched_group_nodes_bycpu[cpu] = NULL;
6006 }
6007#endif
6008} 6456}
6009 6457
6010/* 6458/*
@@ -6029,9 +6477,10 @@ static void detach_destroy_domains(const cpumask_t *cpu_map)
6029 * correct sched domains 6477 * correct sched domains
6030 * Call with hotplug lock held 6478 * Call with hotplug lock held
6031 */ 6479 */
6032void partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2) 6480int partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2)
6033{ 6481{
6034 cpumask_t change_map; 6482 cpumask_t change_map;
6483 int err = 0;
6035 6484
6036 cpus_and(*partition1, *partition1, cpu_online_map); 6485 cpus_and(*partition1, *partition1, cpu_online_map);
6037 cpus_and(*partition2, *partition2, cpu_online_map); 6486 cpus_and(*partition2, *partition2, cpu_online_map);
@@ -6040,10 +6489,86 @@ void partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2)
6040 /* Detach sched domains from all of the affected cpus */ 6489 /* Detach sched domains from all of the affected cpus */
6041 detach_destroy_domains(&change_map); 6490 detach_destroy_domains(&change_map);
6042 if (!cpus_empty(*partition1)) 6491 if (!cpus_empty(*partition1))
6043 build_sched_domains(partition1); 6492 err = build_sched_domains(partition1);
6044 if (!cpus_empty(*partition2)) 6493 if (!err && !cpus_empty(*partition2))
6045 build_sched_domains(partition2); 6494 err = build_sched_domains(partition2);
6495
6496 return err;
6497}
6498
6499#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
6500int arch_reinit_sched_domains(void)
6501{
6502 int err;
6503
6504 lock_cpu_hotplug();
6505 detach_destroy_domains(&cpu_online_map);
6506 err = arch_init_sched_domains(&cpu_online_map);
6507 unlock_cpu_hotplug();
6508
6509 return err;
6510}
6511
6512static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
6513{
6514 int ret;
6515
6516 if (buf[0] != '0' && buf[0] != '1')
6517 return -EINVAL;
6518
6519 if (smt)
6520 sched_smt_power_savings = (buf[0] == '1');
6521 else
6522 sched_mc_power_savings = (buf[0] == '1');
6523
6524 ret = arch_reinit_sched_domains();
6525
6526 return ret ? ret : count;
6527}
6528
6529int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
6530{
6531 int err = 0;
6532#ifdef CONFIG_SCHED_SMT
6533 if (smt_capable())
6534 err = sysfs_create_file(&cls->kset.kobj,
6535 &attr_sched_smt_power_savings.attr);
6536#endif
6537#ifdef CONFIG_SCHED_MC
6538 if (!err && mc_capable())
6539 err = sysfs_create_file(&cls->kset.kobj,
6540 &attr_sched_mc_power_savings.attr);
6541#endif
6542 return err;
6543}
6544#endif
6545
6546#ifdef CONFIG_SCHED_MC
6547static ssize_t sched_mc_power_savings_show(struct sys_device *dev, char *page)
6548{
6549 return sprintf(page, "%u\n", sched_mc_power_savings);
6550}
6551static ssize_t sched_mc_power_savings_store(struct sys_device *dev, const char *buf, size_t count)
6552{
6553 return sched_power_savings_store(buf, count, 0);
6046} 6554}
6555SYSDEV_ATTR(sched_mc_power_savings, 0644, sched_mc_power_savings_show,
6556 sched_mc_power_savings_store);
6557#endif
6558
6559#ifdef CONFIG_SCHED_SMT
6560static ssize_t sched_smt_power_savings_show(struct sys_device *dev, char *page)
6561{
6562 return sprintf(page, "%u\n", sched_smt_power_savings);
6563}
6564static ssize_t sched_smt_power_savings_store(struct sys_device *dev, const char *buf, size_t count)
6565{
6566 return sched_power_savings_store(buf, count, 1);
6567}
6568SYSDEV_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_show,
6569 sched_smt_power_savings_store);
6570#endif
6571
6047 6572
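The sched_{mc,smt}_power_savings attributes added above are plain sysdev files, so they can be driven from userspace without new tooling. A minimal sketch, assuming the conventional /sys/devices/system/cpu location of the cpu sysdev class (not part of the patch):

#include <stdio.h>

int main(void)
{
	/* Assumed path of the attribute created by
	 * sched_create_sysfs_power_savings_entries(); adjust if the
	 * cpu sysdev class lives elsewhere on the running kernel. */
	FILE *f = fopen("/sys/devices/system/cpu/sched_mc_power_savings", "w");

	if (!f) {
		perror("sched_mc_power_savings");
		return 1;
	}
	/* sched_power_savings_store() accepts only '0' or '1'. */
	fputc('1', f);
	return fclose(f) ? 1 : 0;
}

Writing '1' triggers arch_reinit_sched_domains(), which tears down and rebuilds the sched domains for the online map under the hotplug lock, as shown in the hunk above.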
6048#ifdef CONFIG_HOTPLUG_CPU 6573#ifdef CONFIG_HOTPLUG_CPU
6049/* 6574/*
@@ -6126,7 +6651,6 @@ void __init sched_init(void)
6126 rq->push_cpu = 0; 6651 rq->push_cpu = 0;
6127 rq->migration_thread = NULL; 6652 rq->migration_thread = NULL;
6128 INIT_LIST_HEAD(&rq->migration_queue); 6653 INIT_LIST_HEAD(&rq->migration_queue);
6129 rq->cpu = i;
6130#endif 6654#endif
6131 atomic_set(&rq->nr_iowait, 0); 6655 atomic_set(&rq->nr_iowait, 0);
6132 6656
@@ -6141,6 +6665,7 @@ void __init sched_init(void)
6141 } 6665 }
6142 } 6666 }
6143 6667
6668 set_load_weight(&init_task);
6144 /* 6669 /*
6145 * The boot idle thread does lazy MMU switching as well: 6670 * The boot idle thread does lazy MMU switching as well:
6146 */ 6671 */
@@ -6187,11 +6712,12 @@ void normalize_rt_tasks(void)
6187 runqueue_t *rq; 6712 runqueue_t *rq;
6188 6713
6189 read_lock_irq(&tasklist_lock); 6714 read_lock_irq(&tasklist_lock);
6190 for_each_process (p) { 6715 for_each_process(p) {
6191 if (!rt_task(p)) 6716 if (!rt_task(p))
6192 continue; 6717 continue;
6193 6718
6194 rq = task_rq_lock(p, &flags); 6719 spin_lock_irqsave(&p->pi_lock, flags);
6720 rq = __task_rq_lock(p);
6195 6721
6196 array = p->array; 6722 array = p->array;
6197 if (array) 6723 if (array)
@@ -6202,7 +6728,8 @@ void normalize_rt_tasks(void)
6202 resched_task(rq->curr); 6728 resched_task(rq->curr);
6203 } 6729 }
6204 6730
6205 task_rq_unlock(rq, &flags); 6731 __task_rq_unlock(rq);
6732 spin_unlock_irqrestore(&p->pi_lock, flags);
6206 } 6733 }
6207 read_unlock_irq(&tasklist_lock); 6734 read_unlock_irq(&tasklist_lock);
6208} 6735}
diff --git a/kernel/signal.c b/kernel/signal.c
index e5f8aea78ffe..7fe874d12fae 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -10,7 +10,6 @@
10 * to allow signals to be sent reliably. 10 * to allow signals to be sent reliably.
11 */ 11 */
12 12
13#include <linux/config.h>
14#include <linux/slab.h> 13#include <linux/slab.h>
15#include <linux/module.h> 14#include <linux/module.h>
16#include <linux/smp_lock.h> 15#include <linux/smp_lock.h>
@@ -23,12 +22,12 @@
23#include <linux/syscalls.h> 22#include <linux/syscalls.h>
24#include <linux/ptrace.h> 23#include <linux/ptrace.h>
25#include <linux/signal.h> 24#include <linux/signal.h>
26#include <linux/audit.h>
27#include <linux/capability.h> 25#include <linux/capability.h>
28#include <asm/param.h> 26#include <asm/param.h>
29#include <asm/uaccess.h> 27#include <asm/uaccess.h>
30#include <asm/unistd.h> 28#include <asm/unistd.h>
31#include <asm/siginfo.h> 29#include <asm/siginfo.h>
30#include "audit.h" /* audit_signal_info() */
32 31
33/* 32/*
34 * SLAB caches for signal bits. 33 * SLAB caches for signal bits.
@@ -584,7 +583,7 @@ static int check_kill_permission(int sig, struct siginfo *info,
584 && !capable(CAP_KILL)) 583 && !capable(CAP_KILL))
585 return error; 584 return error;
586 585
587 error = security_task_kill(t, info, sig); 586 error = security_task_kill(t, info, sig, 0);
588 if (!error) 587 if (!error)
589 audit_signal_info(sig, t); /* Let audit system see the signal */ 588 audit_signal_info(sig, t); /* Let audit system see the signal */
590 return error; 589 return error;
@@ -1107,7 +1106,7 @@ kill_proc_info(int sig, struct siginfo *info, pid_t pid)
1107 1106
1108/* like kill_proc_info(), but doesn't use uid/euid of "current" */ 1107/* like kill_proc_info(), but doesn't use uid/euid of "current" */
1109int kill_proc_info_as_uid(int sig, struct siginfo *info, pid_t pid, 1108int kill_proc_info_as_uid(int sig, struct siginfo *info, pid_t pid,
1110 uid_t uid, uid_t euid) 1109 uid_t uid, uid_t euid, u32 secid)
1111{ 1110{
1112 int ret = -EINVAL; 1111 int ret = -EINVAL;
1113 struct task_struct *p; 1112 struct task_struct *p;
@@ -1127,6 +1126,9 @@ int kill_proc_info_as_uid(int sig, struct siginfo *info, pid_t pid,
1127 ret = -EPERM; 1126 ret = -EPERM;
1128 goto out_unlock; 1127 goto out_unlock;
1129 } 1128 }
1129 ret = security_task_kill(p, info, sig, secid);
1130 if (ret)
1131 goto out_unlock;
1130 if (sig && p->sighand) { 1132 if (sig && p->sighand) {
1131 unsigned long flags; 1133 unsigned long flags;
1132 spin_lock_irqsave(&p->sighand->siglock, flags); 1134 spin_lock_irqsave(&p->sighand->siglock, flags);
@@ -1531,6 +1533,35 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, int why)
1531 spin_unlock_irqrestore(&sighand->siglock, flags); 1533 spin_unlock_irqrestore(&sighand->siglock, flags);
1532} 1534}
1533 1535
1536static inline int may_ptrace_stop(void)
1537{
1538 if (!likely(current->ptrace & PT_PTRACED))
1539 return 0;
1540
1541 if (unlikely(current->parent == current->real_parent &&
1542 (current->ptrace & PT_ATTACHED)))
1543 return 0;
1544
1545 if (unlikely(current->signal == current->parent->signal) &&
1546 unlikely(current->signal->flags & SIGNAL_GROUP_EXIT))
1547 return 0;
1548
1549 /*
1550 * Are we in the middle of do_coredump?
1551 * If so and our tracer is also part of the coredump stopping
1552 * is a deadlock situation, and pointless because our tracer
1553 * is dead so don't allow us to stop.
1554 * If SIGKILL was already sent before the caller unlocked
1555 * ->siglock we must see ->core_waiters != 0. Otherwise it
1556 * is safe to enter schedule().
1557 */
1558 if (unlikely(current->mm->core_waiters) &&
1559 unlikely(current->mm == current->parent->mm))
1560 return 0;
1561
1562 return 1;
1563}
1564
1534/* 1565/*
1535 * This must be called with current->sighand->siglock held. 1566 * This must be called with current->sighand->siglock held.
1536 * 1567 *
@@ -1559,11 +1590,7 @@ static void ptrace_stop(int exit_code, int nostop_code, siginfo_t *info)
1559 spin_unlock_irq(&current->sighand->siglock); 1590 spin_unlock_irq(&current->sighand->siglock);
1560 try_to_freeze(); 1591 try_to_freeze();
1561 read_lock(&tasklist_lock); 1592 read_lock(&tasklist_lock);
1562 if (likely(current->ptrace & PT_PTRACED) && 1593 if (may_ptrace_stop()) {
1563 likely(current->parent != current->real_parent ||
1564 !(current->ptrace & PT_ATTACHED)) &&
1565 (likely(current->parent->signal != current->signal) ||
1566 !unlikely(current->signal->flags & SIGNAL_GROUP_EXIT))) {
1567 do_notify_parent_cldstop(current, CLD_TRAPPED); 1594 do_notify_parent_cldstop(current, CLD_TRAPPED);
1568 read_unlock(&tasklist_lock); 1595 read_unlock(&tasklist_lock);
1569 schedule(); 1596 schedule();
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 336f92d64e2e..8f03e3b89b55 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -446,7 +446,7 @@ static void takeover_tasklets(unsigned int cpu)
446} 446}
447#endif /* CONFIG_HOTPLUG_CPU */ 447#endif /* CONFIG_HOTPLUG_CPU */
448 448
449static int cpu_callback(struct notifier_block *nfb, 449static int __devinit cpu_callback(struct notifier_block *nfb,
450 unsigned long action, 450 unsigned long action,
451 void *hcpu) 451 void *hcpu)
452{ 452{
@@ -470,6 +470,8 @@ static int cpu_callback(struct notifier_block *nfb,
470 break; 470 break;
471#ifdef CONFIG_HOTPLUG_CPU 471#ifdef CONFIG_HOTPLUG_CPU
472 case CPU_UP_CANCELED: 472 case CPU_UP_CANCELED:
473 if (!per_cpu(ksoftirqd, hotcpu))
474 break;
473 /* Unbind so it can run. Fall thru. */ 475 /* Unbind so it can run. Fall thru. */
474 kthread_bind(per_cpu(ksoftirqd, hotcpu), 476 kthread_bind(per_cpu(ksoftirqd, hotcpu),
475 any_online_cpu(cpu_online_map)); 477 any_online_cpu(cpu_online_map));
@@ -484,7 +486,7 @@ static int cpu_callback(struct notifier_block *nfb,
484 return NOTIFY_OK; 486 return NOTIFY_OK;
485} 487}
486 488
487static struct notifier_block cpu_nfb = { 489static struct notifier_block __devinitdata cpu_nfb = {
488 .notifier_call = cpu_callback 490 .notifier_call = cpu_callback
489}; 491};
490 492
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index 14c7faf02909..6b76caa22981 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -36,7 +36,7 @@ static struct notifier_block panic_block = {
36 36
37void touch_softlockup_watchdog(void) 37void touch_softlockup_watchdog(void)
38{ 38{
39 per_cpu(touch_timestamp, raw_smp_processor_id()) = jiffies; 39 __raw_get_cpu_var(touch_timestamp) = jiffies;
40} 40}
41EXPORT_SYMBOL(touch_softlockup_watchdog); 41EXPORT_SYMBOL(touch_softlockup_watchdog);
42 42
@@ -104,7 +104,7 @@ static int watchdog(void * __bind_cpu)
104/* 104/*
105 * Create/destroy watchdog threads as CPUs come and go: 105 * Create/destroy watchdog threads as CPUs come and go:
106 */ 106 */
107static int 107static int __devinit
108cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) 108cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
109{ 109{
110 int hotcpu = (unsigned long)hcpu; 110 int hotcpu = (unsigned long)hcpu;
@@ -127,6 +127,8 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
127 break; 127 break;
128#ifdef CONFIG_HOTPLUG_CPU 128#ifdef CONFIG_HOTPLUG_CPU
129 case CPU_UP_CANCELED: 129 case CPU_UP_CANCELED:
130 if (!per_cpu(watchdog_task, hotcpu))
131 break;
130 /* Unbind so it can run. Fall thru. */ 132 /* Unbind so it can run. Fall thru. */
131 kthread_bind(per_cpu(watchdog_task, hotcpu), 133 kthread_bind(per_cpu(watchdog_task, hotcpu),
132 any_online_cpu(cpu_online_map)); 134 any_online_cpu(cpu_online_map));
@@ -140,7 +142,7 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
140 return NOTIFY_OK; 142 return NOTIFY_OK;
141} 143}
142 144
143static struct notifier_block cpu_nfb = { 145static struct notifier_block __devinitdata cpu_nfb = {
144 .notifier_call = cpu_callback 146 .notifier_call = cpu_callback
145}; 147};
146 148
diff --git a/kernel/spinlock.c b/kernel/spinlock.c
index d1b810782bc4..b31e54eadf56 100644
--- a/kernel/spinlock.c
+++ b/kernel/spinlock.c
@@ -9,7 +9,6 @@
9 * SMP and the DEBUG_SPINLOCK cases. (UP-nondebug inlines them) 9 * SMP and the DEBUG_SPINLOCK cases. (UP-nondebug inlines them)
10 */ 10 */
11 11
12#include <linux/config.h>
13#include <linux/linkage.h> 12#include <linux/linkage.h>
14#include <linux/preempt.h> 13#include <linux/preempt.h>
15#include <linux/spinlock.h> 14#include <linux/spinlock.h>
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index dcfb5d731466..2c0aacc37c55 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -4,6 +4,7 @@
4#include <linux/cpu.h> 4#include <linux/cpu.h>
5#include <linux/err.h> 5#include <linux/err.h>
6#include <linux/syscalls.h> 6#include <linux/syscalls.h>
7#include <linux/kthread.h>
7#include <asm/atomic.h> 8#include <asm/atomic.h>
8#include <asm/semaphore.h> 9#include <asm/semaphore.h>
9#include <asm/uaccess.h> 10#include <asm/uaccess.h>
@@ -25,13 +26,11 @@ static unsigned int stopmachine_num_threads;
25static atomic_t stopmachine_thread_ack; 26static atomic_t stopmachine_thread_ack;
26static DECLARE_MUTEX(stopmachine_mutex); 27static DECLARE_MUTEX(stopmachine_mutex);
27 28
28static int stopmachine(void *cpu) 29static int stopmachine(void *unused)
29{ 30{
30 int irqs_disabled = 0; 31 int irqs_disabled = 0;
31 int prepared = 0; 32 int prepared = 0;
32 33
33 set_cpus_allowed(current, cpumask_of_cpu((int)(long)cpu));
34
35 /* Ack: we are alive */ 34 /* Ack: we are alive */
36 smp_mb(); /* Theoretically the ack = 0 might not be on this CPU yet. */ 35 smp_mb(); /* Theoretically the ack = 0 might not be on this CPU yet. */
37 atomic_inc(&stopmachine_thread_ack); 36 atomic_inc(&stopmachine_thread_ack);
@@ -85,7 +84,8 @@ static void stopmachine_set_state(enum stopmachine_state state)
85 84
86static int stop_machine(void) 85static int stop_machine(void)
87{ 86{
88 int i, ret = 0; 87 int ret = 0;
88 unsigned int i;
89 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; 89 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
90 90
91 /* One high-prio thread per cpu. We'll do this one. */ 91 /* One high-prio thread per cpu. We'll do this one. */
@@ -96,11 +96,16 @@ static int stop_machine(void)
96 stopmachine_state = STOPMACHINE_WAIT; 96 stopmachine_state = STOPMACHINE_WAIT;
97 97
98 for_each_online_cpu(i) { 98 for_each_online_cpu(i) {
99 struct task_struct *tsk;
99 if (i == raw_smp_processor_id()) 100 if (i == raw_smp_processor_id())
100 continue; 101 continue;
101 ret = kernel_thread(stopmachine, (void *)(long)i,CLONE_KERNEL); 102 tsk = kthread_create(stopmachine, NULL, "stopmachine");
102 if (ret < 0) 103 if (IS_ERR(tsk)) {
104 ret = PTR_ERR(tsk);
103 break; 105 break;
106 }
107 kthread_bind(tsk, i);
108 wake_up_process(tsk);
104 stopmachine_num_threads++; 109 stopmachine_num_threads++;
105 } 110 }
106 111
diff --git a/kernel/sys.c b/kernel/sys.c
index 0b6ec0e7936f..dbb3b9c7ea64 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -4,7 +4,6 @@
4 * Copyright (C) 1991, 1992 Linus Torvalds 4 * Copyright (C) 1991, 1992 Linus Torvalds
5 */ 5 */
6 6
7#include <linux/config.h>
8#include <linux/module.h> 7#include <linux/module.h>
9#include <linux/mm.h> 8#include <linux/mm.h>
10#include <linux/utsname.h> 9#include <linux/utsname.h>
@@ -13,7 +12,6 @@
13#include <linux/notifier.h> 12#include <linux/notifier.h>
14#include <linux/reboot.h> 13#include <linux/reboot.h>
15#include <linux/prctl.h> 14#include <linux/prctl.h>
16#include <linux/init.h>
17#include <linux/highuid.h> 15#include <linux/highuid.h>
18#include <linux/fs.h> 16#include <linux/fs.h>
19#include <linux/kernel.h> 17#include <linux/kernel.h>
@@ -57,6 +55,12 @@
57#ifndef GET_FPEXC_CTL 55#ifndef GET_FPEXC_CTL
58# define GET_FPEXC_CTL(a,b) (-EINVAL) 56# define GET_FPEXC_CTL(a,b) (-EINVAL)
59#endif 57#endif
58#ifndef GET_ENDIAN
59# define GET_ENDIAN(a,b) (-EINVAL)
60#endif
61#ifndef SET_ENDIAN
62# define SET_ENDIAN(a,b) (-EINVAL)
63#endif
60 64
61/* 65/*
62 * this is where the system-wide overflow UID and GID are defined, for 66 * this is where the system-wide overflow UID and GID are defined, for
@@ -132,14 +136,15 @@ static int __kprobes notifier_call_chain(struct notifier_block **nl,
132 unsigned long val, void *v) 136 unsigned long val, void *v)
133{ 137{
134 int ret = NOTIFY_DONE; 138 int ret = NOTIFY_DONE;
135 struct notifier_block *nb; 139 struct notifier_block *nb, *next_nb;
136 140
137 nb = rcu_dereference(*nl); 141 nb = rcu_dereference(*nl);
138 while (nb) { 142 while (nb) {
143 next_nb = rcu_dereference(nb->next);
139 ret = nb->notifier_call(nb, val, v); 144 ret = nb->notifier_call(nb, val, v);
140 if ((ret & NOTIFY_STOP_MASK) == NOTIFY_STOP_MASK) 145 if ((ret & NOTIFY_STOP_MASK) == NOTIFY_STOP_MASK)
141 break; 146 break;
142 nb = rcu_dereference(nb->next); 147 nb = next_nb;
143 } 148 }
144 return ret; 149 return ret;
145} 150}
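The notifier_call_chain() hunk above caches the next pointer before invoking the callback, so the walk no longer touches nb->next after the callback has run and possibly unlinked its own entry. A standalone sketch of the same traversal pattern, with purely illustrative names (not kernel code):

#include <stdio.h>

struct nb {
	int (*call)(struct nb *self);
	struct nb *next;
};

static int say(struct nb *self)
{
	printf("callback %p\n", (void *)self);
	return 0;
}

static int walk(struct nb *head)
{
	struct nb *nb = head, *next_nb;
	int ret = 0;

	while (nb) {
		next_nb = nb->next;	/* read before the callback runs */
		ret = nb->call(nb);	/* callback may unlink itself */
		nb = next_nb;
	}
	return ret;
}

int main(void)
{
	struct nb second = { say, NULL };
	struct nb first = { say, &second };

	return walk(&first);
}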
@@ -583,7 +588,7 @@ void emergency_restart(void)
583} 588}
584EXPORT_SYMBOL_GPL(emergency_restart); 589EXPORT_SYMBOL_GPL(emergency_restart);
585 590
586void kernel_restart_prepare(char *cmd) 591static void kernel_restart_prepare(char *cmd)
587{ 592{
588 blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd); 593 blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd);
589 system_state = SYSTEM_RESTART; 594 system_state = SYSTEM_RESTART;
@@ -617,7 +622,7 @@ EXPORT_SYMBOL_GPL(kernel_restart);
617 * Move into place and start executing a preloaded standalone 622 * Move into place and start executing a preloaded standalone
618 * executable. If nothing was preloaded return an error. 623 * executable. If nothing was preloaded return an error.
619 */ 624 */
620void kernel_kexec(void) 625static void kernel_kexec(void)
621{ 626{
622#ifdef CONFIG_KEXEC 627#ifdef CONFIG_KEXEC
623 struct kimage *image; 628 struct kimage *image;
@@ -631,7 +636,6 @@ void kernel_kexec(void)
631 machine_kexec(image); 636 machine_kexec(image);
632#endif 637#endif
633} 638}
634EXPORT_SYMBOL_GPL(kernel_kexec);
635 639
636void kernel_shutdown_prepare(enum system_states state) 640void kernel_shutdown_prepare(enum system_states state)
637{ 641{
@@ -1860,23 +1864,20 @@ out:
1860 * fields when reaping, so a sample either gets all the additions of a 1864 * fields when reaping, so a sample either gets all the additions of a
1861 * given child after it's reaped, or none so this sample is before reaping. 1865 * given child after it's reaped, or none so this sample is before reaping.
1862 * 1866 *
1863 * tasklist_lock locking optimisation: 1867 * Locking:
1864 * If we are current and single threaded, we do not need to take the tasklist 1868 * We need to take the siglock for CHILDREN, SELF and BOTH
1865 * lock or the siglock. No one else can take our signal_struct away, 1869 * for the cases current multithreaded, non-current single threaded
1866 * no one else can reap the children to update signal->c* counters, and 1870 * non-current multithreaded. Thread traversal is now safe with
1867 * no one else can race with the signal-> fields. 1871 * the siglock held.
1868 * If we do not take the tasklist_lock, the signal-> fields could be read 1872 * Strictly speaking, we do not need to take the siglock if we are current and
1869 * out of order while another thread was just exiting. So we place a 1873 * single threaded, as no one else can take our signal_struct away, no one
1870 * read memory barrier when we avoid the lock. On the writer side, 1874 * else can reap the children to update signal->c* counters, and no one else
1871 * write memory barrier is implied in __exit_signal as __exit_signal releases 1875 * can race with the signal-> fields. If we do not take any lock, the
1872 * the siglock spinlock after updating the signal-> fields. 1876 * signal-> fields could be read out of order while another thread was just
1873 * 1877 * exiting. So we should place a read memory barrier when we avoid the lock.
1874 * We don't really need the siglock when we access the non c* fields 1878 * On the writer side, write memory barrier is implied in __exit_signal
1875 * of the signal_struct (for RUSAGE_SELF) even in multithreaded 1879 * as __exit_signal releases the siglock spinlock after updating the signal->
1876 * case, since we take the tasklist lock for read and the non c* signal-> 1880 * fields. But we don't do this yet to keep things simple.
1877 * fields are updated only in __exit_signal, which is called with
1878 * tasklist_lock taken for write, hence these two threads cannot execute
1879 * concurrently.
1880 * 1881 *
1881 */ 1882 */
1882 1883
@@ -1885,35 +1886,25 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1885 struct task_struct *t; 1886 struct task_struct *t;
1886 unsigned long flags; 1887 unsigned long flags;
1887 cputime_t utime, stime; 1888 cputime_t utime, stime;
1888 int need_lock = 0;
1889 1889
1890 memset((char *) r, 0, sizeof *r); 1890 memset((char *) r, 0, sizeof *r);
1891 utime = stime = cputime_zero; 1891 utime = stime = cputime_zero;
1892 1892
1893 if (p != current || !thread_group_empty(p)) 1893 rcu_read_lock();
1894 need_lock = 1; 1894 if (!lock_task_sighand(p, &flags)) {
1895 1895 rcu_read_unlock();
1896 if (need_lock) { 1896 return;
1897 read_lock(&tasklist_lock); 1897 }
1898 if (unlikely(!p->signal)) {
1899 read_unlock(&tasklist_lock);
1900 return;
1901 }
1902 } else
1903 /* See locking comments above */
1904 smp_rmb();
1905 1898
1906 switch (who) { 1899 switch (who) {
1907 case RUSAGE_BOTH: 1900 case RUSAGE_BOTH:
1908 case RUSAGE_CHILDREN: 1901 case RUSAGE_CHILDREN:
1909 spin_lock_irqsave(&p->sighand->siglock, flags);
1910 utime = p->signal->cutime; 1902 utime = p->signal->cutime;
1911 stime = p->signal->cstime; 1903 stime = p->signal->cstime;
1912 r->ru_nvcsw = p->signal->cnvcsw; 1904 r->ru_nvcsw = p->signal->cnvcsw;
1913 r->ru_nivcsw = p->signal->cnivcsw; 1905 r->ru_nivcsw = p->signal->cnivcsw;
1914 r->ru_minflt = p->signal->cmin_flt; 1906 r->ru_minflt = p->signal->cmin_flt;
1915 r->ru_majflt = p->signal->cmaj_flt; 1907 r->ru_majflt = p->signal->cmaj_flt;
1916 spin_unlock_irqrestore(&p->sighand->siglock, flags);
1917 1908
1918 if (who == RUSAGE_CHILDREN) 1909 if (who == RUSAGE_CHILDREN)
1919 break; 1910 break;
@@ -1941,8 +1932,9 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1941 BUG(); 1932 BUG();
1942 } 1933 }
1943 1934
1944 if (need_lock) 1935 unlock_task_sighand(p, &flags);
1945 read_unlock(&tasklist_lock); 1936 rcu_read_unlock();
1937
1946 cputime_to_timeval(utime, &r->ru_utime); 1938 cputime_to_timeval(utime, &r->ru_utime);
1947 cputime_to_timeval(stime, &r->ru_stime); 1939 cputime_to_timeval(stime, &r->ru_stime);
1948} 1940}
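The k_getrusage() rework above drops the tasklist_lock/need_lock logic in favour of rcu_read_lock() plus lock_task_sighand(). The userspace entry point is unchanged; an ordinary getrusage() call, as in this small example (not part of the patch), still ends up in this function:

#include <stdio.h>
#include <sys/resource.h>

int main(void)
{
	struct rusage self, children;

	if (getrusage(RUSAGE_SELF, &self) == 0)
		printf("self user time: %ld.%06ld s\n",
		       (long)self.ru_utime.tv_sec,
		       (long)self.ru_utime.tv_usec);
	if (getrusage(RUSAGE_CHILDREN, &children) == 0)
		printf("children minor faults: %ld\n", children.ru_minflt);
	return 0;
}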
@@ -2057,6 +2049,13 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
2057 return -EFAULT; 2049 return -EFAULT;
2058 return 0; 2050 return 0;
2059 } 2051 }
2052 case PR_GET_ENDIAN:
2053 error = GET_ENDIAN(current, arg2);
2054 break;
2055 case PR_SET_ENDIAN:
2056 error = SET_ENDIAN(current, arg2);
2057 break;
2058
2060 default: 2059 default:
2061 error = -EINVAL; 2060 error = -EINVAL;
2062 break; 2061 break;
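The PR_GET_ENDIAN/PR_SET_ENDIAN cases added above are reachable with a plain prctl() call. A hedged userspace sketch follows: on architectures that do not provide GET_ENDIAN the call just returns the -EINVAL default introduced in this hunk, and the PR_GET_ENDIAN constant must already be present in the installed sys/prctl.h:

#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <sys/prctl.h>

int main(void)
{
#ifdef PR_GET_ENDIAN
	unsigned int endian;

	/* the architecture handler stores the current endian mode
	 * through the pointer passed as arg2 */
	if (prctl(PR_GET_ENDIAN, (unsigned long)&endian, 0, 0, 0) == 0)
		printf("endian mode: %u\n", endian);
	else
		printf("PR_GET_ENDIAN not supported here: %s\n",
		       strerror(errno));
#else
	puts("PR_GET_ENDIAN is not defined by these headers");
#endif
	return 0;
}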
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 5433195040f1..6991bece67e8 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -87,6 +87,7 @@ cond_syscall(sys_inotify_init);
87cond_syscall(sys_inotify_add_watch); 87cond_syscall(sys_inotify_add_watch);
88cond_syscall(sys_inotify_rm_watch); 88cond_syscall(sys_inotify_rm_watch);
89cond_syscall(sys_migrate_pages); 89cond_syscall(sys_migrate_pages);
90cond_syscall(sys_move_pages);
90cond_syscall(sys_chown16); 91cond_syscall(sys_chown16);
91cond_syscall(sys_fchown16); 92cond_syscall(sys_fchown16);
92cond_syscall(sys_getegid16); 93cond_syscall(sys_getegid16);
@@ -132,3 +133,4 @@ cond_syscall(sys_mincore);
132cond_syscall(sys_madvise); 133cond_syscall(sys_madvise);
133cond_syscall(sys_mremap); 134cond_syscall(sys_mremap);
134cond_syscall(sys_remap_file_pages); 135cond_syscall(sys_remap_file_pages);
136cond_syscall(compat_sys_move_pages);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index e82726faeeff..99a58f279077 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -18,7 +18,6 @@
18 * Removed it and replaced it with older style, 03/23/00, Bill Wendling 18 * Removed it and replaced it with older style, 03/23/00, Bill Wendling
19 */ 19 */
20 20
21#include <linux/config.h>
22#include <linux/module.h> 21#include <linux/module.h>
23#include <linux/mm.h> 22#include <linux/mm.h>
24#include <linux/swap.h> 23#include <linux/swap.h>
@@ -59,6 +58,7 @@ extern int proc_nr_files(ctl_table *table, int write, struct file *filp,
59extern int C_A_D; 58extern int C_A_D;
60extern int sysctl_overcommit_memory; 59extern int sysctl_overcommit_memory;
61extern int sysctl_overcommit_ratio; 60extern int sysctl_overcommit_ratio;
61extern int sysctl_panic_on_oom;
62extern int max_threads; 62extern int max_threads;
63extern int sysrq_enabled; 63extern int sysrq_enabled;
64extern int core_uses_pid; 64extern int core_uses_pid;
@@ -72,6 +72,7 @@ extern int printk_ratelimit_burst;
72extern int pid_max_min, pid_max_max; 72extern int pid_max_min, pid_max_max;
73extern int sysctl_drop_caches; 73extern int sysctl_drop_caches;
74extern int percpu_pagelist_fraction; 74extern int percpu_pagelist_fraction;
75extern int compat_log;
75 76
76#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) 77#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
77int unknown_nmi_panic; 78int unknown_nmi_panic;
@@ -131,6 +132,10 @@ extern int acct_parm[];
131extern int no_unaligned_warning; 132extern int no_unaligned_warning;
132#endif 133#endif
133 134
135#ifdef CONFIG_RT_MUTEXES
136extern int max_lock_depth;
137#endif
138
134static int parse_table(int __user *, int, void __user *, size_t __user *, void __user *, size_t, 139static int parse_table(int __user *, int, void __user *, size_t __user *, void __user *, size_t,
135 ctl_table *, void **); 140 ctl_table *, void **);
136static int proc_doutsstring(ctl_table *table, int write, struct file *filp, 141static int proc_doutsstring(ctl_table *table, int write, struct file *filp,
@@ -142,7 +147,6 @@ static struct ctl_table_header root_table_header =
142 147
143static ctl_table kern_table[]; 148static ctl_table kern_table[];
144static ctl_table vm_table[]; 149static ctl_table vm_table[];
145static ctl_table proc_table[];
146static ctl_table fs_table[]; 150static ctl_table fs_table[];
147static ctl_table debug_table[]; 151static ctl_table debug_table[];
148static ctl_table dev_table[]; 152static ctl_table dev_table[];
@@ -150,7 +154,7 @@ extern ctl_table random_table[];
150#ifdef CONFIG_UNIX98_PTYS 154#ifdef CONFIG_UNIX98_PTYS
151extern ctl_table pty_table[]; 155extern ctl_table pty_table[];
152#endif 156#endif
153#ifdef CONFIG_INOTIFY 157#ifdef CONFIG_INOTIFY_USER
154extern ctl_table inotify_table[]; 158extern ctl_table inotify_table[];
155#endif 159#endif
156 160
@@ -202,12 +206,6 @@ static ctl_table root_table[] = {
202 }, 206 },
203#endif 207#endif
204 { 208 {
205 .ctl_name = CTL_PROC,
206 .procname = "proc",
207 .mode = 0555,
208 .child = proc_table,
209 },
210 {
211 .ctl_name = CTL_FS, 209 .ctl_name = CTL_FS,
212 .procname = "fs", 210 .procname = "fs",
213 .mode = 0555, 211 .mode = 0555,
@@ -398,7 +396,7 @@ static ctl_table kern_table[] = {
398 .strategy = &sysctl_string, 396 .strategy = &sysctl_string,
399 }, 397 },
400#endif 398#endif
401#ifdef CONFIG_HOTPLUG 399#if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET)
402 { 400 {
403 .ctl_name = KERN_HOTPLUG, 401 .ctl_name = KERN_HOTPLUG,
404 .procname = "hotplug", 402 .procname = "hotplug",
@@ -683,6 +681,27 @@ static ctl_table kern_table[] = {
683 .proc_handler = &proc_dointvec, 681 .proc_handler = &proc_dointvec,
684 }, 682 },
685#endif 683#endif
684#ifdef CONFIG_COMPAT
685 {
686 .ctl_name = KERN_COMPAT_LOG,
687 .procname = "compat-log",
688 .data = &compat_log,
689 .maxlen = sizeof (int),
690 .mode = 0644,
691 .proc_handler = &proc_dointvec,
692 },
693#endif
694#ifdef CONFIG_RT_MUTEXES
695 {
696 .ctl_name = KERN_MAX_LOCK_DEPTH,
697 .procname = "max_lock_depth",
698 .data = &max_lock_depth,
699 .maxlen = sizeof(int),
700 .mode = 0644,
701 .proc_handler = &proc_dointvec,
702 },
703#endif
704
686 { .ctl_name = 0 } 705 { .ctl_name = 0 }
687}; 706};
688 707
@@ -702,6 +721,14 @@ static ctl_table vm_table[] = {
702 .proc_handler = &proc_dointvec, 721 .proc_handler = &proc_dointvec,
703 }, 722 },
704 { 723 {
724 .ctl_name = VM_PANIC_ON_OOM,
725 .procname = "panic_on_oom",
726 .data = &sysctl_panic_on_oom,
727 .maxlen = sizeof(sysctl_panic_on_oom),
728 .mode = 0644,
729 .proc_handler = &proc_dointvec,
730 },
731 {
705 .ctl_name = VM_OVERCOMMIT_RATIO, 732 .ctl_name = VM_OVERCOMMIT_RATIO,
706 .procname = "overcommit_ratio", 733 .procname = "overcommit_ratio",
707 .data = &sysctl_overcommit_ratio, 734 .data = &sysctl_overcommit_ratio,
@@ -905,23 +932,22 @@ static ctl_table vm_table[] = {
905 .strategy = &sysctl_intvec, 932 .strategy = &sysctl_intvec,
906 .extra1 = &zero, 933 .extra1 = &zero,
907 }, 934 },
935#endif
936#ifdef CONFIG_X86_32
908 { 937 {
909 .ctl_name = VM_ZONE_RECLAIM_INTERVAL, 938 .ctl_name = VM_VDSO_ENABLED,
910 .procname = "zone_reclaim_interval", 939 .procname = "vdso_enabled",
911 .data = &zone_reclaim_interval, 940 .data = &vdso_enabled,
912 .maxlen = sizeof(zone_reclaim_interval), 941 .maxlen = sizeof(vdso_enabled),
913 .mode = 0644, 942 .mode = 0644,
914 .proc_handler = &proc_dointvec_jiffies, 943 .proc_handler = &proc_dointvec,
915 .strategy = &sysctl_jiffies, 944 .strategy = &sysctl_intvec,
945 .extra1 = &zero,
916 }, 946 },
917#endif 947#endif
918 { .ctl_name = 0 } 948 { .ctl_name = 0 }
919}; 949};
920 950
921static ctl_table proc_table[] = {
922 { .ctl_name = 0 }
923};
924
925static ctl_table fs_table[] = { 951static ctl_table fs_table[] = {
926 { 952 {
927 .ctl_name = FS_NRINODE, 953 .ctl_name = FS_NRINODE,
@@ -1028,7 +1054,7 @@ static ctl_table fs_table[] = {
1028 .mode = 0644, 1054 .mode = 0644,
1029 .proc_handler = &proc_doulongvec_minmax, 1055 .proc_handler = &proc_doulongvec_minmax,
1030 }, 1056 },
1031#ifdef CONFIG_INOTIFY 1057#ifdef CONFIG_INOTIFY_USER
1032 { 1058 {
1033 .ctl_name = FS_INOTIFY, 1059 .ctl_name = FS_INOTIFY,
1034 .procname = "inotify", 1060 .procname = "inotify",
diff --git a/kernel/time.c b/kernel/time.c
index b00ddc71cedb..5bd489747643 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -523,6 +523,7 @@ EXPORT_SYMBOL(do_gettimeofday);
523 523
524 524
525#else 525#else
526#ifndef CONFIG_GENERIC_TIME
526/* 527/*
527 * Simulate gettimeofday using do_gettimeofday which only allows a timeval 528 * Simulate gettimeofday using do_gettimeofday which only allows a timeval
528 * and therefore only yields usec accuracy 529 * and therefore only yields usec accuracy
@@ -537,6 +538,7 @@ void getnstimeofday(struct timespec *tv)
537} 538}
538EXPORT_SYMBOL_GPL(getnstimeofday); 539EXPORT_SYMBOL_GPL(getnstimeofday);
539#endif 540#endif
541#endif
540 542
541/* Converts Gregorian date to seconds since 1970-01-01 00:00:00. 543/* Converts Gregorian date to seconds since 1970-01-01 00:00:00.
542 * Assumes input in normal date format, i.e. 1980-12-31 23:59:59 544 * Assumes input in normal date format, i.e. 1980-12-31 23:59:59
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
new file mode 100644
index 000000000000..e1dfd8e86cce
--- /dev/null
+++ b/kernel/time/Makefile
@@ -0,0 +1 @@
obj-y += clocksource.o jiffies.o
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
new file mode 100644
index 000000000000..74eca5939bd9
--- /dev/null
+++ b/kernel/time/clocksource.c
@@ -0,0 +1,349 @@
1/*
2 * linux/kernel/time/clocksource.c
3 *
4 * This file contains the functions which manage clocksource drivers.
5 *
6 * Copyright (C) 2004, 2005 IBM, John Stultz (johnstul@us.ibm.com)
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
21 *
22 * TODO WishList:
23 * o Allow clocksource drivers to be unregistered
24 * o get rid of clocksource_jiffies extern
25 */
26
27#include <linux/clocksource.h>
28#include <linux/sysdev.h>
29#include <linux/init.h>
30#include <linux/module.h>
31
32/* XXX - Would like a better way for initializing curr_clocksource */
33extern struct clocksource clocksource_jiffies;
34
35/*[Clocksource internal variables]---------
36 * curr_clocksource:
37 * currently selected clocksource. Initialized to clocksource_jiffies.
38 * next_clocksource:
39 * pending next selected clocksource.
40 * clocksource_list:
41 * linked list with the registered clocksources
42 * clocksource_lock:
43 * protects manipulations to curr_clocksource and next_clocksource
44 * and the clocksource_list
45 * override_name:
46 * Name of the user-specified clocksource.
47 */
48static struct clocksource *curr_clocksource = &clocksource_jiffies;
49static struct clocksource *next_clocksource;
50static LIST_HEAD(clocksource_list);
51static DEFINE_SPINLOCK(clocksource_lock);
52static char override_name[32];
53static int finished_booting;
54
55/* clocksource_done_booting - Called near the end of bootup
56 *
57 * Hack to avoid lots of clocksource churn at boot time
58 */
59static int __init clocksource_done_booting(void)
60{
61 finished_booting = 1;
62 return 0;
63}
64
65late_initcall(clocksource_done_booting);
66
67/**
68 * clocksource_get_next - Returns the selected clocksource
69 *
70 */
71struct clocksource *clocksource_get_next(void)
72{
73 unsigned long flags;
74
75 spin_lock_irqsave(&clocksource_lock, flags);
76 if (next_clocksource && finished_booting) {
77 curr_clocksource = next_clocksource;
78 next_clocksource = NULL;
79 }
80 spin_unlock_irqrestore(&clocksource_lock, flags);
81
82 return curr_clocksource;
83}
84
85/**
86 * select_clocksource - Finds the best registered clocksource.
87 *
88 * Private function. Must hold clocksource_lock when called.
89 *
90 * Looks through the list of registered clocksources, returning
91 * the one with the highest rating value. If there is a clocksource
92 * name that matches the override string, it returns that clocksource.
93 */
94static struct clocksource *select_clocksource(void)
95{
96 struct clocksource *best = NULL;
97 struct list_head *tmp;
98
99 list_for_each(tmp, &clocksource_list) {
100 struct clocksource *src;
101
102 src = list_entry(tmp, struct clocksource, list);
103 if (!best)
104 best = src;
105
106 /* check for override: */
107 if (strlen(src->name) == strlen(override_name) &&
108 !strcmp(src->name, override_name)) {
109 best = src;
110 break;
111 }
112 /* pick the highest rating: */
113 if (src->rating > best->rating)
114 best = src;
115 }
116
117 return best;
118}
119
120/**
121 * is_registered_source - Checks if clocksource is registered
122 * @c: pointer to a clocksource
123 *
124 * Private helper function. Must hold clocksource_lock when called.
125 *
126 * Returns one if the clocksource is already registered, zero otherwise.
127 */
128static int is_registered_source(struct clocksource *c)
129{
130 int len = strlen(c->name);
131 struct list_head *tmp;
132
133 list_for_each(tmp, &clocksource_list) {
134 struct clocksource *src;
135
136 src = list_entry(tmp, struct clocksource, list);
137 if (strlen(src->name) == len && !strcmp(src->name, c->name))
138 return 1;
139 }
140
141 return 0;
142}
143
144/**
145 * clocksource_register - Used to install new clocksources
146 * @t: clocksource to be registered
147 *
148 * Returns -EBUSY if registration fails, zero otherwise.
149 */
150int clocksource_register(struct clocksource *c)
151{
152 int ret = 0;
153 unsigned long flags;
154
155 spin_lock_irqsave(&clocksource_lock, flags);
156 /* check if clocksource is already registered */
157 if (is_registered_source(c)) {
158 printk("register_clocksource: Cannot register %s. "
159 "Already registered!", c->name);
160 ret = -EBUSY;
161 } else {
162 /* register it */
163 list_add(&c->list, &clocksource_list);
164 /* scan the registered clocksources, and pick the best one */
165 next_clocksource = select_clocksource();
166 }
167 spin_unlock_irqrestore(&clocksource_lock, flags);
168 return ret;
169}
170EXPORT_SYMBOL(clocksource_register);
171
172/**
173 * clocksource_reselect - Rescan list for next clocksource
174 *
175 * A quick helper function to be used if a clocksource changes its
176 * rating. Forces the clocksource list to be re-scanned for the best
177 * clocksource.
178 */
179void clocksource_reselect(void)
180{
181 unsigned long flags;
182
183 spin_lock_irqsave(&clocksource_lock, flags);
184 next_clocksource = select_clocksource();
185 spin_unlock_irqrestore(&clocksource_lock, flags);
186}
187EXPORT_SYMBOL(clocksource_reselect);
188
189/**
190 * sysfs_show_current_clocksources - sysfs interface for current clocksource
191 * @dev: unused
192 * @buf: char buffer to be filled with clocksource list
193 *
194 * Provides sysfs interface for listing current clocksource.
195 */
196static ssize_t
197sysfs_show_current_clocksources(struct sys_device *dev, char *buf)
198{
199 char *curr = buf;
200
201 spin_lock_irq(&clocksource_lock);
202 curr += sprintf(curr, "%s ", curr_clocksource->name);
203 spin_unlock_irq(&clocksource_lock);
204
205 curr += sprintf(curr, "\n");
206
207 return curr - buf;
208}
209
210/**
211 * sysfs_override_clocksource - interface for manually overriding clocksource
212 * @dev: unused
213 * @buf: name of override clocksource
214 * @count: length of buffer
215 *
216 * Takes input from sysfs interface for manually overriding the default
217 * clocksource selection.
218 */
219static ssize_t sysfs_override_clocksource(struct sys_device *dev,
220 const char *buf, size_t count)
221{
222 size_t ret = count;
223 /* strings from sysfs write are not 0 terminated! */
224 if (count >= sizeof(override_name))
225 return -EINVAL;
226
227 /* strip off \n: */
228 if (buf[count-1] == '\n')
229 count--;
230 if (count < 1)
231 return -EINVAL;
232
233 spin_lock_irq(&clocksource_lock);
234
235 /* copy the name given: */
236 memcpy(override_name, buf, count);
237 override_name[count] = 0;
238
239 /* try to select it: */
240 next_clocksource = select_clocksource();
241
242 spin_unlock_irq(&clocksource_lock);
243
244 return ret;
245}
246
247/**
248 * sysfs_show_available_clocksources - sysfs interface for listing clocksource
249 * @dev: unused
250 * @buf: char buffer to be filled with clocksource list
251 *
252 * Provides sysfs interface for listing registered clocksources
253 */
254static ssize_t
255sysfs_show_available_clocksources(struct sys_device *dev, char *buf)
256{
257 struct list_head *tmp;
258 char *curr = buf;
259
260 spin_lock_irq(&clocksource_lock);
261 list_for_each(tmp, &clocksource_list) {
262 struct clocksource *src;
263
264 src = list_entry(tmp, struct clocksource, list);
265 curr += sprintf(curr, "%s ", src->name);
266 }
267 spin_unlock_irq(&clocksource_lock);
268
269 curr += sprintf(curr, "\n");
270
271 return curr - buf;
272}
273
274/*
275 * Sysfs setup bits:
276 */
277static SYSDEV_ATTR(current_clocksource, 0600, sysfs_show_current_clocksources,
278 sysfs_override_clocksource);
279
280static SYSDEV_ATTR(available_clocksource, 0600,
281 sysfs_show_available_clocksources, NULL);
282
283static struct sysdev_class clocksource_sysclass = {
284 set_kset_name("clocksource"),
285};
286
287static struct sys_device device_clocksource = {
288 .id = 0,
289 .cls = &clocksource_sysclass,
290};
291
292static int __init init_clocksource_sysfs(void)
293{
294 int error = sysdev_class_register(&clocksource_sysclass);
295
296 if (!error)
297 error = sysdev_register(&device_clocksource);
298 if (!error)
299 error = sysdev_create_file(
300 &device_clocksource,
301 &attr_current_clocksource);
302 if (!error)
303 error = sysdev_create_file(
304 &device_clocksource,
305 &attr_available_clocksource);
306 return error;
307}
308
309device_initcall(init_clocksource_sysfs);
310
311/**
312 * boot_override_clocksource - boot clock override
313 * @str: override name
314 *
315 * Takes a clocksource= boot argument and uses it
316 * as the clocksource override name.
317 */
318static int __init boot_override_clocksource(char* str)
319{
320 unsigned long flags;
321 spin_lock_irqsave(&clocksource_lock, flags);
322 if (str)
323 strlcpy(override_name, str, sizeof(override_name));
324 spin_unlock_irqrestore(&clocksource_lock, flags);
325 return 1;
326}
327
328__setup("clocksource=", boot_override_clocksource);
329
330/**
331 * boot_override_clock - Compatibility layer for deprecated boot option
332 * @str: override name
333 *
334 * DEPRECATED! Takes a clock= boot argument and uses it
335 * as the clocksource override name
336 */
337static int __init boot_override_clock(char* str)
338{
339 if (!strcmp(str, "pmtmr")) {
340 printk("Warning: clock=pmtmr is deprecated. "
341 "Use clocksource=acpi_pm.\n");
342 return boot_override_clocksource("acpi_pm");
343 }
344 printk("Warning! clock= boot option is deprecated. "
345 "Use clocksource=xyz\n");
346 return boot_override_clocksource(str);
347}
348
349__setup("clock=", boot_override_clock);
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
new file mode 100644
index 000000000000..126bb30c4afe
--- /dev/null
+++ b/kernel/time/jiffies.c
@@ -0,0 +1,73 @@
1/***********************************************************************
2* linux/kernel/time/jiffies.c
3*
4* This file contains the jiffies based clocksource.
5*
6* Copyright (C) 2004, 2005 IBM, John Stultz (johnstul@us.ibm.com)
7*
8* This program is free software; you can redistribute it and/or modify
9* it under the terms of the GNU General Public License as published by
10* the Free Software Foundation; either version 2 of the License, or
11* (at your option) any later version.
12*
13* This program is distributed in the hope that it will be useful,
14* but WITHOUT ANY WARRANTY; without even the implied warranty of
15* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16* GNU General Public License for more details.
17*
18* You should have received a copy of the GNU General Public License
19* along with this program; if not, write to the Free Software
20* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
21*
22************************************************************************/
23#include <linux/clocksource.h>
24#include <linux/jiffies.h>
25#include <linux/init.h>
26
27/* The Jiffies based clocksource is the lowest common
28 * denominator clock source which should function on
29 * all systems. It has the same coarse resolution as
30 * the timer interrupt frequency HZ and it suffers
31 * inaccuracies caused by missed or lost timer
32 * interrupts and the inability for the timer
33 * interrupt hardware to accurately tick at the
34 * requested HZ value. It is also not recommended
35 * for "tick-less" systems.
36 */
37#define NSEC_PER_JIFFY ((u32)((((u64)NSEC_PER_SEC)<<8)/ACTHZ))
38
39/* Since jiffies uses a simple NSEC_PER_JIFFY multiplier
40 * conversion, the .shift value could be zero. However
41 * this would make NTP adjustments impossible as they are
42 * in units of 1/2^.shift. Thus we use JIFFIES_SHIFT to
43 * shift both the numerator and denominator the same
44 * amount, and give ntp adjustments in units of 1/2^8
45 *
46 * The value 8 is somewhat carefully chosen, as anything
47 * larger can result in overflows. NSEC_PER_JIFFY grows as
48 * HZ shrinks, so values greater than 8 overflow 32 bits when
49 * HZ=100.
50 */
51#define JIFFIES_SHIFT 8
52
53static cycle_t jiffies_read(void)
54{
55 return (cycle_t) jiffies;
56}
57
58struct clocksource clocksource_jiffies = {
59 .name = "jiffies",
60 .rating = 0, /* lowest rating*/
61 .read = jiffies_read,
62 .mask = 0xffffffff, /*32bits*/
63 .mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */
64 .shift = JIFFIES_SHIFT,
65 .is_continuous = 0, /* tick based, not free running */
66};
67
68static int __init init_jiffies_clocksource(void)
69{
70 return clocksource_register(&clocksource_jiffies);
71}
72
73module_init(init_jiffies_clocksource);
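To make the mult/shift fixed-point scheme behind the jiffies clocksource concrete: a cycle delta is converted with ns = (cycles * mult) >> shift, and the values chosen above guarantee that one jiffy converts back to exactly NSEC_PER_JIFFY nanoseconds. The standalone check below reproduces that arithmetic; the HZ value is illustrative only and ACTHZ is approximated by HZ << 8:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	const uint64_t NSEC_PER_SEC = 1000000000ULL;
	const unsigned int HZ = 250;	/* example tick rate */
	const unsigned int SHIFT = 8;	/* JIFFIES_SHIFT */

	/* same arithmetic as the NSEC_PER_JIFFY macro above */
	uint32_t nsec_per_jiffy =
		(uint32_t)((NSEC_PER_SEC << 8) / ((uint64_t)HZ << 8));
	uint32_t mult = nsec_per_jiffy << SHIFT;

	/* convert a delta of 3 jiffies into nanoseconds */
	uint64_t cycles = 3;
	uint64_t ns = (cycles * mult) >> SHIFT;

	printf("nsec_per_jiffy=%u, 3 jiffies=%llu ns\n",
	       nsec_per_jiffy, (unsigned long long)ns);
	return 0;	/* prints 4000000 and 12000000 for HZ=250 */
}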
diff --git a/kernel/timer.c b/kernel/timer.c
index 9e49deed468c..5a8960253063 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -146,7 +146,7 @@ static void internal_add_timer(tvec_base_t *base, struct timer_list *timer)
146void fastcall init_timer(struct timer_list *timer) 146void fastcall init_timer(struct timer_list *timer)
147{ 147{
148 timer->entry.next = NULL; 148 timer->entry.next = NULL;
149 timer->base = per_cpu(tvec_bases, raw_smp_processor_id()); 149 timer->base = __raw_get_cpu_var(tvec_bases);
150} 150}
151EXPORT_SYMBOL(init_timer); 151EXPORT_SYMBOL(init_timer);
152 152
@@ -383,23 +383,19 @@ EXPORT_SYMBOL(del_timer_sync);
383static int cascade(tvec_base_t *base, tvec_t *tv, int index) 383static int cascade(tvec_base_t *base, tvec_t *tv, int index)
384{ 384{
385 /* cascade all the timers from tv up one level */ 385 /* cascade all the timers from tv up one level */
386 struct list_head *head, *curr; 386 struct timer_list *timer, *tmp;
387 struct list_head tv_list;
388
389 list_replace_init(tv->vec + index, &tv_list);
387 390
388 head = tv->vec + index;
389 curr = head->next;
390 /* 391 /*
391 * We are removing _all_ timers from the list, so we don't have to 392 * We are removing _all_ timers from the list, so we
392 * detach them individually, just clear the list afterwards. 393 * don't have to detach them individually.
393 */ 394 */
394 while (curr != head) { 395 list_for_each_entry_safe(timer, tmp, &tv_list, entry) {
395 struct timer_list *tmp; 396 BUG_ON(timer->base != base);
396 397 internal_add_timer(base, timer);
397 tmp = list_entry(curr, struct timer_list, entry);
398 BUG_ON(tmp->base != base);
399 curr = curr->next;
400 internal_add_timer(base, tmp);
401 } 398 }
402 INIT_LIST_HEAD(head);
403 399
404 return index; 400 return index;
405} 401}
@@ -419,10 +415,10 @@ static inline void __run_timers(tvec_base_t *base)
419 415
420 spin_lock_irq(&base->lock); 416 spin_lock_irq(&base->lock);
421 while (time_after_eq(jiffies, base->timer_jiffies)) { 417 while (time_after_eq(jiffies, base->timer_jiffies)) {
422 struct list_head work_list = LIST_HEAD_INIT(work_list); 418 struct list_head work_list;
423 struct list_head *head = &work_list; 419 struct list_head *head = &work_list;
424 int index = base->timer_jiffies & TVR_MASK; 420 int index = base->timer_jiffies & TVR_MASK;
425 421
426 /* 422 /*
427 * Cascade timers: 423 * Cascade timers:
428 */ 424 */
@@ -431,8 +427,8 @@ static inline void __run_timers(tvec_base_t *base)
431 (!cascade(base, &base->tv3, INDEX(1))) && 427 (!cascade(base, &base->tv3, INDEX(1))) &&
432 !cascade(base, &base->tv4, INDEX(2))) 428 !cascade(base, &base->tv4, INDEX(2)))
433 cascade(base, &base->tv5, INDEX(3)); 429 cascade(base, &base->tv5, INDEX(3));
434 ++base->timer_jiffies; 430 ++base->timer_jiffies;
435 list_splice_init(base->tv1.vec + index, &work_list); 431 list_replace_init(base->tv1.vec + index, &work_list);
436 while (!list_empty(head)) { 432 while (!list_empty(head)) {
437 void (*fn)(unsigned long); 433 void (*fn)(unsigned long);
438 unsigned long data; 434 unsigned long data;
@@ -601,7 +597,6 @@ long time_tolerance = MAXFREQ; /* frequency tolerance (ppm) */
601long time_precision = 1; /* clock precision (us) */ 597long time_precision = 1; /* clock precision (us) */
602long time_maxerror = NTP_PHASE_LIMIT; /* maximum error (us) */ 598long time_maxerror = NTP_PHASE_LIMIT; /* maximum error (us) */
603long time_esterror = NTP_PHASE_LIMIT; /* estimated error (us) */ 599long time_esterror = NTP_PHASE_LIMIT; /* estimated error (us) */
604static long time_phase; /* phase offset (scaled us) */
605long time_freq = (((NSEC_PER_SEC + HZ/2) % HZ - HZ/2) << SHIFT_USEC) / NSEC_PER_USEC; 600long time_freq = (((NSEC_PER_SEC + HZ/2) % HZ - HZ/2) << SHIFT_USEC) / NSEC_PER_USEC;
606 /* frequency offset (scaled ppm)*/ 601 /* frequency offset (scaled ppm)*/
607static long time_adj; /* tick adjust (scaled 1 / HZ) */ 602static long time_adj; /* tick adjust (scaled 1 / HZ) */
@@ -751,27 +746,14 @@ static long adjtime_adjustment(void)
751} 746}
752 747
753/* in the NTP reference this is called "hardclock()" */ 748/* in the NTP reference this is called "hardclock()" */
754static void update_wall_time_one_tick(void) 749static void update_ntp_one_tick(void)
755{ 750{
756 long time_adjust_step, delta_nsec; 751 long time_adjust_step;
757 752
758 time_adjust_step = adjtime_adjustment(); 753 time_adjust_step = adjtime_adjustment();
759 if (time_adjust_step) 754 if (time_adjust_step)
760 /* Reduce by this step the amount of time left */ 755 /* Reduce by this step the amount of time left */
761 time_adjust -= time_adjust_step; 756 time_adjust -= time_adjust_step;
762 delta_nsec = tick_nsec + time_adjust_step * 1000;
763 /*
764 * Advance the phase, once it gets to one microsecond, then
765 * advance the tick more.
766 */
767 time_phase += time_adj;
768 if ((time_phase >= FINENSEC) || (time_phase <= -FINENSEC)) {
769 long ltemp = shift_right(time_phase, (SHIFT_SCALE - 10));
770 time_phase -= ltemp << (SHIFT_SCALE - 10);
771 delta_nsec += ltemp;
772 }
773 xtime.tv_nsec += delta_nsec;
774 time_interpolator_update(delta_nsec);
775 757
776 /* Changes by adjtime() do not take effect till next tick. */ 758 /* Changes by adjtime() do not take effect till next tick. */
777 if (time_next_adjust != 0) { 759 if (time_next_adjust != 0) {
@@ -784,36 +766,378 @@ static void update_wall_time_one_tick(void)
784 * Return how long ticks are at the moment, that is, how much time 766 * Return how long ticks are at the moment, that is, how much time
785 * update_wall_time_one_tick will add to xtime next time we call it 767 * update_wall_time_one_tick will add to xtime next time we call it
786 * (assuming no calls to do_adjtimex in the meantime). 768 * (assuming no calls to do_adjtimex in the meantime).
787 * The return value is in fixed-point nanoseconds with SHIFT_SCALE-10 769 * The return value is in fixed-point nanoseconds shifted by the
788 * bits to the right of the binary point. 770 * specified number of bits to the right of the binary point.
789 * This function has no side-effects. 771 * This function has no side-effects.
790 */ 772 */
791u64 current_tick_length(void) 773u64 current_tick_length(void)
792{ 774{
793 long delta_nsec; 775 long delta_nsec;
776 u64 ret;
794 777
778 /* calculate the finest interval NTP will allow.
779 * ie: nanosecond value shifted by (SHIFT_SCALE - 10)
780 */
795 delta_nsec = tick_nsec + adjtime_adjustment() * 1000; 781 delta_nsec = tick_nsec + adjtime_adjustment() * 1000;
796 return ((u64) delta_nsec << (SHIFT_SCALE - 10)) + time_adj; 782 ret = (u64)delta_nsec << TICK_LENGTH_SHIFT;
783 ret += (s64)time_adj << (TICK_LENGTH_SHIFT - (SHIFT_SCALE - 10));
784
785 return ret;
797} 786}
798 787
799/* 788/* XXX - all of this timekeeping code should be later moved to time.c */
800 * Using a loop looks inefficient, but "ticks" is 789#include <linux/clocksource.h>
801 * usually just one (we shouldn't be losing ticks, 790static struct clocksource *clock; /* pointer to current clocksource */
802 * we're doing this this way mainly for interrupt 791
803 * latency reasons, not because we think we'll 792#ifdef CONFIG_GENERIC_TIME
804 * have lots of lost timer ticks 793/**
794 * __get_nsec_offset - Returns nanoseconds since last call to periodic_hook
795 *
796 * private function, must hold xtime_lock lock when being
797 * called. Returns the number of nanoseconds since the
798 * last call to update_wall_time() (adjusted by NTP scaling)
799 */
800static inline s64 __get_nsec_offset(void)
801{
802 cycle_t cycle_now, cycle_delta;
803 s64 ns_offset;
804
805 /* read clocksource: */
806 cycle_now = clocksource_read(clock);
807
808 /* calculate the delta since the last update_wall_time: */
809 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
810
811 /* convert to nanoseconds: */
812 ns_offset = cyc2ns(clock, cycle_delta);
813
814 return ns_offset;
815}
816
817/**
818 * __get_realtime_clock_ts - Returns the time of day in a timespec
819 * @ts: pointer to the timespec to be set
820 *
821 * Returns the time of day in a timespec. Used by
822 * do_gettimeofday() and get_realtime_clock_ts().
823 */
824static inline void __get_realtime_clock_ts(struct timespec *ts)
825{
826 unsigned long seq;
827 s64 nsecs;
828
829 do {
830 seq = read_seqbegin(&xtime_lock);
831
832 *ts = xtime;
833 nsecs = __get_nsec_offset();
834
835 } while (read_seqretry(&xtime_lock, seq));
836
837 timespec_add_ns(ts, nsecs);
838}
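__get_realtime_clock_ts() above is the seqlock read side of the new timekeeping code: it keeps re-reading xtime and the clocksource offset until no writer interfered. From userspace the same nanosecond path is reachable through clock_gettime(CLOCK_REALTIME), as in this small example (link with -lrt on older glibc; not part of the patch):

#include <stdio.h>
#include <time.h>

int main(void)
{
	struct timespec ts;

	if (clock_gettime(CLOCK_REALTIME, &ts) != 0) {
		perror("clock_gettime");
		return 1;
	}
	printf("%ld.%09ld\n", (long)ts.tv_sec, ts.tv_nsec);
	return 0;
}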
839
840/**
841 * getnstimeofday - Returns the time of day in a timespec
842 * @ts: pointer to the timespec to be set
843 *
844 * Returns the time of day in a timespec.
845 */
846void getnstimeofday(struct timespec *ts)
847{
848 __get_realtime_clock_ts(ts);
849}
850
851EXPORT_SYMBOL(getnstimeofday);
852
853/**
854 * do_gettimeofday - Returns the time of day in a timeval
855 * @tv: pointer to the timeval to be set
856 *
857 * NOTE: Users should be converted to using get_realtime_clock_ts()
858 */
859void do_gettimeofday(struct timeval *tv)
860{
861 struct timespec now;
862
863 __get_realtime_clock_ts(&now);
864 tv->tv_sec = now.tv_sec;
865 tv->tv_usec = now.tv_nsec/1000;
866}
867
868EXPORT_SYMBOL(do_gettimeofday);
869/**
870 * do_settimeofday - Sets the time of day
871 * @tv: pointer to the timespec variable containing the new time
872 *
873 * Sets the time of day to the new time, updates NTP and notifies hrtimers
874 */
875int do_settimeofday(struct timespec *tv)
876{
877 unsigned long flags;
878 time_t wtm_sec, sec = tv->tv_sec;
879 long wtm_nsec, nsec = tv->tv_nsec;
880
881 if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC)
882 return -EINVAL;
883
884 write_seqlock_irqsave(&xtime_lock, flags);
885
886 nsec -= __get_nsec_offset();
887
888 wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec);
889 wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec);
890
891 set_normalized_timespec(&xtime, sec, nsec);
892 set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec);
893
894 ntp_clear();
895
896 write_sequnlock_irqrestore(&xtime_lock, flags);
897
898 /* signal hrtimers about time change */
899 clock_was_set();
900
901 return 0;
902}
903
904EXPORT_SYMBOL(do_settimeofday);
905
906/**
907 * change_clocksource - Swaps clocksources if a new one is available
908 *
909 * Accumulates current time interval and initializes new clocksource
910 */
911static int change_clocksource(void)
912{
913 struct clocksource *new;
914 cycle_t now;
915 u64 nsec;
916 new = clocksource_get_next();
917 if (clock != new) {
918 now = clocksource_read(new);
919 nsec = __get_nsec_offset();
920 timespec_add_ns(&xtime, nsec);
921
922 clock = new;
923 clock->cycle_last = now;
924 printk(KERN_INFO "Time: %s clocksource has been installed.\n",
925 clock->name);
926 return 1;
927 } else if (clock->update_callback) {
928 return clock->update_callback();
929 }
930 return 0;
931}
932#else
933#define change_clocksource() (0)
934#endif
935
936/**
937 * timeofday_is_continuous - check to see if timekeeping is free running
805 */ 938 */
806static void update_wall_time(unsigned long ticks) 939int timekeeping_is_continuous(void)
807{ 940{
941 unsigned long seq;
942 int ret;
943
808 do { 944 do {
809 ticks--; 945 seq = read_seqbegin(&xtime_lock);
810 update_wall_time_one_tick(); 946
811 if (xtime.tv_nsec >= 1000000000) { 947 ret = clock->is_continuous;
812 xtime.tv_nsec -= 1000000000; 948
949 } while (read_seqretry(&xtime_lock, seq));
950
951 return ret;
952}
953
954/*
955 * timekeeping_init - Initializes the clocksource and common timekeeping values
956 */
957void __init timekeeping_init(void)
958{
959 unsigned long flags;
960
961 write_seqlock_irqsave(&xtime_lock, flags);
962 clock = clocksource_get_next();
963 clocksource_calculate_interval(clock, tick_nsec);
964 clock->cycle_last = clocksource_read(clock);
965 ntp_clear();
966 write_sequnlock_irqrestore(&xtime_lock, flags);
967}
968
969
970/*
971 * timekeeping_resume - Resumes the generic timekeeping subsystem.
972 * @dev: unused
973 *
974 * This is for the generic clocksource timekeeping.
975 * xtime/wall_to_monotonic/jiffies/wall_jiffies/etc are
976 * still managed by arch specific suspend/resume code.
977 */
978static int timekeeping_resume(struct sys_device *dev)
979{
980 unsigned long flags;
981
982 write_seqlock_irqsave(&xtime_lock, flags);
983 /* restart the last cycle value */
984 clock->cycle_last = clocksource_read(clock);
985 write_sequnlock_irqrestore(&xtime_lock, flags);
986 return 0;
987}
988
989/* sysfs resume/suspend bits for timekeeping */
990static struct sysdev_class timekeeping_sysclass = {
991 .resume = timekeeping_resume,
992 set_kset_name("timekeeping"),
993};
994
995static struct sys_device device_timer = {
996 .id = 0,
997 .cls = &timekeeping_sysclass,
998};
999
1000static int __init timekeeping_init_device(void)
1001{
1002 int error = sysdev_class_register(&timekeeping_sysclass);
1003 if (!error)
1004 error = sysdev_register(&device_timer);
1005 return error;
1006}
1007
1008device_initcall(timekeeping_init_device);
1009
1010/*
1011 * If the error is already larger, we look ahead another tick,
1012 * to compensate for late or lost adjustments.
1013 */
1014static __always_inline int clocksource_bigadjust(int sign, s64 error, s64 *interval, s64 *offset)
1015{
1016 int adj;
1017
1018 /*
1019 * As soon as the machine is synchronized to the external time
1020 * source this should be the common case.
1021 */
1022 error >>= 2;
1023 if (likely(sign > 0 ? error <= *interval : error >= *interval))
1024 return sign;
1025
1026 /*
1027 * An extra look ahead dampens the effect of the current error,
1028 * which can grow quite large with continuously late updates, as
1029 * it would dominate the adjustment value and can lead to
1030 * oscillation.
1031 */
1032 error += current_tick_length() >> (TICK_LENGTH_SHIFT - clock->shift + 1);
1033 error -= clock->xtime_interval >> 1;
1034
1035 adj = 0;
1036 while (1) {
1037 error >>= 1;
1038 if (sign > 0 ? error <= *interval : error >= *interval)
1039 break;
1040 adj++;
1041 }
1042
1043 /*
1044 * Add the current adjustments to the error and take the offset
1045 * into account; the latter can leave the error barely
1046 * reduced at the next tick. Check the error again if there's
1047 * room for another adjustment, thus further reducing the error
1048 * that would otherwise have to be corrected at the next update.
1049 */
1050 error = (error << 1) - *interval + *offset;
1051 if (sign > 0 ? error > *interval : error < *interval)
1052 adj++;
1053
1054 *interval <<= adj;
1055 *offset <<= adj;
1056 return sign << adj;
1057}
1058
1059/*
1060 * Adjust the multiplier to reduce the error value. This is optimized
1061 * for the most common adjustments of -1, 0 and 1; for other values
1062 * we can do a bit more work.
1063 */
1064static void clocksource_adjust(struct clocksource *clock, s64 offset)
1065{
1066 s64 error, interval = clock->cycle_interval;
1067 int adj;
1068
1069 error = clock->error >> (TICK_LENGTH_SHIFT - clock->shift - 1);
1070 if (error > interval) {
1071 adj = clocksource_bigadjust(1, error, &interval, &offset);
1072 } else if (error < -interval) {
1073 interval = -interval;
1074 offset = -offset;
1075 adj = clocksource_bigadjust(-1, error, &interval, &offset);
1076 } else
1077 return;
1078
1079 clock->mult += adj;
1080 clock->xtime_interval += interval;
1081 clock->xtime_nsec -= offset;
1082 clock->error -= (interval - offset) << (TICK_LENGTH_SHIFT - clock->shift);
1083}
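
The quantity being steered here is the mult term of the usual fixed-point conversion ns = (cycles * mult) >> shift. A small standalone illustration of that conversion, with made-up numbers (plain userspace C99, compiles and runs on its own):

/* Standalone illustration of the mult/shift conversion that
 * clocksource_adjust() steers; the 10 MHz rate and cycle count are
 * arbitrary. Prints mult=104857600, 12345 cycles = 1234500 ns. */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
        uint32_t shift = 20;
        uint64_t hz = 10000000;                 /* made-up 10 MHz counter */
        /* mult chosen so (cycles * mult) >> shift yields nanoseconds */
        uint32_t mult = (uint32_t)(((uint64_t)1000000000 << shift) / hz);
        uint64_t cycles = 12345;

        printf("mult=%u, %llu cycles = %llu ns\n", mult,
               (unsigned long long)cycles,
               (unsigned long long)((cycles * mult) >> shift));
        return 0;
}
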
1084
1085/*
1086 * update_wall_time - Uses the current clocksource to increment the wall time
1087 *
1088 * Called from the timer interrupt, must hold a write on xtime_lock.
1089 */
1090static void update_wall_time(void)
1091{
1092 cycle_t offset;
1093
1094 clock->xtime_nsec += (s64)xtime.tv_nsec << clock->shift;
1095
1096#ifdef CONFIG_GENERIC_TIME
1097 offset = (clocksource_read(clock) - clock->cycle_last) & clock->mask;
1098#else
1099 offset = clock->cycle_interval;
1100#endif
1101
1102	/* normally this loop will run just once; however, in the
1103	 * case of lost or late ticks, it will accumulate correctly.
1104 */
1105 while (offset >= clock->cycle_interval) {
1106 /* accumulate one interval */
1107 clock->xtime_nsec += clock->xtime_interval;
1108 clock->cycle_last += clock->cycle_interval;
1109 offset -= clock->cycle_interval;
1110
1111 if (clock->xtime_nsec >= (u64)NSEC_PER_SEC << clock->shift) {
1112 clock->xtime_nsec -= (u64)NSEC_PER_SEC << clock->shift;
813 xtime.tv_sec++; 1113 xtime.tv_sec++;
814 second_overflow(); 1114 second_overflow();
815 } 1115 }
816 } while (ticks); 1116
1117 /* interpolator bits */
1118 time_interpolator_update(clock->xtime_interval
1119 >> clock->shift);
1120 /* increment the NTP state machine */
1121 update_ntp_one_tick();
1122
1123 /* accumulate error between NTP and clock interval */
1124 clock->error += current_tick_length();
1125 clock->error -= clock->xtime_interval << (TICK_LENGTH_SHIFT - clock->shift);
1126 }
1127
1128 /* correct the clock when NTP error is too big */
1129 clocksource_adjust(clock, offset);
1130
1131 /* store full nanoseconds into xtime */
1132 xtime.tv_nsec = clock->xtime_nsec >> clock->shift;
1133 clock->xtime_nsec -= (s64)xtime.tv_nsec << clock->shift;
1134
1135 /* check to see if there is a new clocksource to use */
1136 if (change_clocksource()) {
1137 clock->error = 0;
1138 clock->xtime_nsec = 0;
1139 clocksource_calculate_interval(clock, tick_nsec);
1140 }
817} 1141}
818 1142
819/* 1143/*
@@ -919,10 +1243,8 @@ static inline void update_times(void)
919 unsigned long ticks; 1243 unsigned long ticks;
920 1244
921 ticks = jiffies - wall_jiffies; 1245 ticks = jiffies - wall_jiffies;
922 if (ticks) { 1246 wall_jiffies += ticks;
923 wall_jiffies += ticks; 1247 update_wall_time();
924 update_wall_time(ticks);
925 }
926 calc_load(ticks); 1248 calc_load(ticks);
927} 1249}
928 1250
@@ -1330,7 +1652,7 @@ static void __devinit migrate_timers(int cpu)
1330} 1652}
1331#endif /* CONFIG_HOTPLUG_CPU */ 1653#endif /* CONFIG_HOTPLUG_CPU */
1332 1654
1333static int timer_cpu_notify(struct notifier_block *self, 1655static int __devinit timer_cpu_notify(struct notifier_block *self,
1334 unsigned long action, void *hcpu) 1656 unsigned long action, void *hcpu)
1335{ 1657{
1336 long cpu = (long)hcpu; 1658 long cpu = (long)hcpu;
@@ -1350,7 +1672,7 @@ static int timer_cpu_notify(struct notifier_block *self,
1350 return NOTIFY_OK; 1672 return NOTIFY_OK;
1351} 1673}
1352 1674
1353static struct notifier_block timers_nb = { 1675static struct notifier_block __devinitdata timers_nb = {
1354 .notifier_call = timer_cpu_notify, 1676 .notifier_call = timer_cpu_notify,
1355}; 1677};
1356 1678
diff --git a/kernel/unwind.c b/kernel/unwind.c
new file mode 100644
index 000000000000..f69c804c8e62
--- /dev/null
+++ b/kernel/unwind.c
@@ -0,0 +1,918 @@
1/*
2 * Copyright (C) 2002-2006 Novell, Inc.
3 * Jan Beulich <jbeulich@novell.com>
4 * This code is released under version 2 of the GNU GPL.
5 *
6 * A simple API for unwinding kernel stacks. This is used for
7 * debugging and error reporting purposes. The kernel doesn't need
8 * full-blown stack unwinding with all the bells and whistles, so there
9 * is not much point in implementing the full Dwarf2 unwind API.
10 */
11
12#include <linux/unwind.h>
13#include <linux/module.h>
14#include <linux/delay.h>
15#include <linux/stop_machine.h>
16#include <asm/sections.h>
17#include <asm/uaccess.h>
18#include <asm/unaligned.h>
19
20extern char __start_unwind[], __end_unwind[];
21
22#define MAX_STACK_DEPTH 8
23
24#define EXTRA_INFO(f) { \
25 BUILD_BUG_ON_ZERO(offsetof(struct unwind_frame_info, f) \
26 % FIELD_SIZEOF(struct unwind_frame_info, f)) \
27 + offsetof(struct unwind_frame_info, f) \
28 / FIELD_SIZEOF(struct unwind_frame_info, f), \
29 FIELD_SIZEOF(struct unwind_frame_info, f) \
30 }
31#define PTREGS_INFO(f) EXTRA_INFO(regs.f)
32
33static const struct {
34 unsigned offs:BITS_PER_LONG / 2;
35 unsigned width:BITS_PER_LONG / 2;
36} reg_info[] = {
37 UNW_REGISTER_INFO
38};
39
40#undef PTREGS_INFO
41#undef EXTRA_INFO
42
43#ifndef REG_INVALID
44#define REG_INVALID(r) (reg_info[r].width == 0)
45#endif
46
47#define DW_CFA_nop 0x00
48#define DW_CFA_set_loc 0x01
49#define DW_CFA_advance_loc1 0x02
50#define DW_CFA_advance_loc2 0x03
51#define DW_CFA_advance_loc4 0x04
52#define DW_CFA_offset_extended 0x05
53#define DW_CFA_restore_extended 0x06
54#define DW_CFA_undefined 0x07
55#define DW_CFA_same_value 0x08
56#define DW_CFA_register 0x09
57#define DW_CFA_remember_state 0x0a
58#define DW_CFA_restore_state 0x0b
59#define DW_CFA_def_cfa 0x0c
60#define DW_CFA_def_cfa_register 0x0d
61#define DW_CFA_def_cfa_offset 0x0e
62#define DW_CFA_def_cfa_expression 0x0f
63#define DW_CFA_expression 0x10
64#define DW_CFA_offset_extended_sf 0x11
65#define DW_CFA_def_cfa_sf 0x12
66#define DW_CFA_def_cfa_offset_sf 0x13
67#define DW_CFA_val_offset 0x14
68#define DW_CFA_val_offset_sf 0x15
69#define DW_CFA_val_expression 0x16
70#define DW_CFA_lo_user 0x1c
71#define DW_CFA_GNU_window_save 0x2d
72#define DW_CFA_GNU_args_size 0x2e
73#define DW_CFA_GNU_negative_offset_extended 0x2f
74#define DW_CFA_hi_user 0x3f
75
76#define DW_EH_PE_FORM 0x07
77#define DW_EH_PE_native 0x00
78#define DW_EH_PE_leb128 0x01
79#define DW_EH_PE_data2 0x02
80#define DW_EH_PE_data4 0x03
81#define DW_EH_PE_data8 0x04
82#define DW_EH_PE_signed 0x08
83#define DW_EH_PE_ADJUST 0x70
84#define DW_EH_PE_abs 0x00
85#define DW_EH_PE_pcrel 0x10
86#define DW_EH_PE_textrel 0x20
87#define DW_EH_PE_datarel 0x30
88#define DW_EH_PE_funcrel 0x40
89#define DW_EH_PE_aligned 0x50
90#define DW_EH_PE_indirect 0x80
91#define DW_EH_PE_omit 0xff
92
93typedef unsigned long uleb128_t;
94typedef signed long sleb128_t;
95
96static struct unwind_table {
97 struct {
98 unsigned long pc;
99 unsigned long range;
100 } core, init;
101 const void *address;
102 unsigned long size;
103 struct unwind_table *link;
104 const char *name;
105} root_table, *last_table;
106
107struct unwind_item {
108 enum item_location {
109 Nowhere,
110 Memory,
111 Register,
112 Value
113 } where;
114 uleb128_t value;
115};
116
117struct unwind_state {
118 uleb128_t loc, org;
119 const u8 *cieStart, *cieEnd;
120 uleb128_t codeAlign;
121 sleb128_t dataAlign;
122 struct cfa {
123 uleb128_t reg, offs;
124 } cfa;
125 struct unwind_item regs[ARRAY_SIZE(reg_info)];
126 unsigned stackDepth:8;
127 unsigned version:8;
128 const u8 *label;
129 const u8 *stack[MAX_STACK_DEPTH];
130};
131
132static const struct cfa badCFA = { ARRAY_SIZE(reg_info), 1 };
133
134static struct unwind_table *find_table(unsigned long pc)
135{
136 struct unwind_table *table;
137
138 for (table = &root_table; table; table = table->link)
139 if ((pc >= table->core.pc
140 && pc < table->core.pc + table->core.range)
141 || (pc >= table->init.pc
142 && pc < table->init.pc + table->init.range))
143 break;
144
145 return table;
146}
147
148static void init_unwind_table(struct unwind_table *table,
149 const char *name,
150 const void *core_start,
151 unsigned long core_size,
152 const void *init_start,
153 unsigned long init_size,
154 const void *table_start,
155 unsigned long table_size)
156{
157 table->core.pc = (unsigned long)core_start;
158 table->core.range = core_size;
159 table->init.pc = (unsigned long)init_start;
160 table->init.range = init_size;
161 table->address = table_start;
162 table->size = table_size;
163 table->link = NULL;
164 table->name = name;
165}
166
167void __init unwind_init(void)
168{
169 init_unwind_table(&root_table, "kernel",
170 _text, _end - _text,
171 NULL, 0,
172 __start_unwind, __end_unwind - __start_unwind);
173}
174
175#ifdef CONFIG_MODULES
176
177/* Must be called with module_mutex held. */
178void *unwind_add_table(struct module *module,
179 const void *table_start,
180 unsigned long table_size)
181{
182 struct unwind_table *table;
183
184 if (table_size <= 0)
185 return NULL;
186
187 table = kmalloc(sizeof(*table), GFP_KERNEL);
188 if (!table)
189 return NULL;
190
191 init_unwind_table(table, module->name,
192 module->module_core, module->core_size,
193 module->module_init, module->init_size,
194 table_start, table_size);
195
196 if (last_table)
197 last_table->link = table;
198 else
199 root_table.link = table;
200 last_table = table;
201
202 return table;
203}
204
205struct unlink_table_info
206{
207 struct unwind_table *table;
208 int init_only;
209};
210
211static int unlink_table(void *arg)
212{
213 struct unlink_table_info *info = arg;
214 struct unwind_table *table = info->table, *prev;
215
216 for (prev = &root_table; prev->link && prev->link != table; prev = prev->link)
217 ;
218
219 if (prev->link) {
220 if (info->init_only) {
221 table->init.pc = 0;
222 table->init.range = 0;
223 info->table = NULL;
224 } else {
225 prev->link = table->link;
226 if (!prev->link)
227 last_table = prev;
228 }
229 } else
230 info->table = NULL;
231
232 return 0;
233}
234
235/* Must be called with module_mutex held. */
236void unwind_remove_table(void *handle, int init_only)
237{
238 struct unwind_table *table = handle;
239 struct unlink_table_info info;
240
241 if (!table || table == &root_table)
242 return;
243
244 if (init_only && table == last_table) {
245 table->init.pc = 0;
246 table->init.range = 0;
247 return;
248 }
249
250 info.table = table;
251 info.init_only = init_only;
252 stop_machine_run(unlink_table, &info, NR_CPUS);
253
254 if (info.table)
255 kfree(table);
256}
257
258#endif /* CONFIG_MODULES */
259
260static uleb128_t get_uleb128(const u8 **pcur, const u8 *end)
261{
262 const u8 *cur = *pcur;
263 uleb128_t value;
264 unsigned shift;
265
266 for (shift = 0, value = 0; cur < end; shift += 7) {
267 if (shift + 7 > 8 * sizeof(value)
268 && (*cur & 0x7fU) >= (1U << (8 * sizeof(value) - shift))) {
269 cur = end + 1;
270 break;
271 }
272 value |= (uleb128_t)(*cur & 0x7f) << shift;
273 if (!(*cur++ & 0x80))
274 break;
275 }
276 *pcur = cur;
277
278 return value;
279}
280
281static sleb128_t get_sleb128(const u8 **pcur, const u8 *end)
282{
283 const u8 *cur = *pcur;
284 sleb128_t value;
285 unsigned shift;
286
287 for (shift = 0, value = 0; cur < end; shift += 7) {
288 if (shift + 7 > 8 * sizeof(value)
289 && (*cur & 0x7fU) >= (1U << (8 * sizeof(value) - shift))) {
290 cur = end + 1;
291 break;
292 }
293 value |= (sleb128_t)(*cur & 0x7f) << shift;
294 if (!(*cur & 0x80)) {
295 value |= -(*cur++ & 0x40) << shift;
296 break;
297 }
298 }
299 *pcur = cur;
300
301 return value;
302}
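
DWARF encodes most CFI integers as LEB128, the base-128 variable-length format the two decoders above parse. A standalone sketch using the classic example from the DWARF specification, where the bytes 0xe5 0x8e 0x26 encode 624485:

/* Standalone ULEB128 decode; same loop as get_uleb128() above,
 * minus the kernel bounds handling. Prints "decoded 624485". */
#include <stdio.h>

int main(void)
{
        const unsigned char buf[] = { 0xe5, 0x8e, 0x26 };
        unsigned long value = 0;
        unsigned i, shift = 0;

        for (i = 0; i < sizeof(buf); i++, shift += 7) {
                value |= (unsigned long)(buf[i] & 0x7f) << shift;
                if (!(buf[i] & 0x80))           /* clear high bit ends it */
                        break;
        }
        printf("decoded %lu\n", value);
        return 0;
}
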
303
304static unsigned long read_pointer(const u8 **pLoc,
305 const void *end,
306 signed ptrType)
307{
308 unsigned long value = 0;
309 union {
310 const u8 *p8;
311 const u16 *p16u;
312 const s16 *p16s;
313 const u32 *p32u;
314 const s32 *p32s;
315 const unsigned long *pul;
316 } ptr;
317
318 if (ptrType < 0 || ptrType == DW_EH_PE_omit)
319 return 0;
320 ptr.p8 = *pLoc;
321 switch(ptrType & DW_EH_PE_FORM) {
322 case DW_EH_PE_data2:
323 if (end < (const void *)(ptr.p16u + 1))
324 return 0;
325 if(ptrType & DW_EH_PE_signed)
326 value = get_unaligned(ptr.p16s++);
327 else
328 value = get_unaligned(ptr.p16u++);
329 break;
330 case DW_EH_PE_data4:
331#ifdef CONFIG_64BIT
332 if (end < (const void *)(ptr.p32u + 1))
333 return 0;
334 if(ptrType & DW_EH_PE_signed)
335 value = get_unaligned(ptr.p32s++);
336 else
337 value = get_unaligned(ptr.p32u++);
338 break;
339 case DW_EH_PE_data8:
340 BUILD_BUG_ON(sizeof(u64) != sizeof(value));
341#else
342 BUILD_BUG_ON(sizeof(u32) != sizeof(value));
343#endif
344 case DW_EH_PE_native:
345 if (end < (const void *)(ptr.pul + 1))
346 return 0;
347 value = get_unaligned(ptr.pul++);
348 break;
349 case DW_EH_PE_leb128:
350 BUILD_BUG_ON(sizeof(uleb128_t) > sizeof(value));
351 value = ptrType & DW_EH_PE_signed
352 ? get_sleb128(&ptr.p8, end)
353 : get_uleb128(&ptr.p8, end);
354 if ((const void *)ptr.p8 > end)
355 return 0;
356 break;
357 default:
358 return 0;
359 }
360 switch(ptrType & DW_EH_PE_ADJUST) {
361 case DW_EH_PE_abs:
362 break;
363 case DW_EH_PE_pcrel:
364 value += (unsigned long)*pLoc;
365 break;
366 default:
367 return 0;
368 }
369 if ((ptrType & DW_EH_PE_indirect)
370 && __get_user(value, (unsigned long *)value))
371 return 0;
372 *pLoc = ptr.p8;
373
374 return value;
375}
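
The ptrType byte that read_pointer() interprets packs a data form in the low bits, a signedness flag, and an adjustment in bits 4-6. A standalone sketch decomposing 0x1b, the pcrel + signed + data4 combination compilers commonly emit for FDE addresses:

/* Standalone decomposition of an encoding byte, mirroring the masks
 * read_pointer() applies. Prints: encoding 0x1b: form 0x3, signed 1,
 * adjust 0x10. */
#include <stdio.h>

#define DW_EH_PE_FORM   0x07
#define DW_EH_PE_data4  0x03
#define DW_EH_PE_signed 0x08
#define DW_EH_PE_ADJUST 0x70
#define DW_EH_PE_pcrel  0x10

int main(void)
{
        unsigned ptrType = DW_EH_PE_pcrel | DW_EH_PE_signed | DW_EH_PE_data4;

        printf("encoding %#x: form %#x, signed %d, adjust %#x\n", ptrType,
               ptrType & DW_EH_PE_FORM,
               !!(ptrType & DW_EH_PE_signed),
               ptrType & DW_EH_PE_ADJUST);
        return 0;
}
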
376
377static signed fde_pointer_type(const u32 *cie)
378{
379 const u8 *ptr = (const u8 *)(cie + 2);
380 unsigned version = *ptr;
381
382 if (version != 1)
383 return -1; /* unsupported */
384 if (*++ptr) {
385 const char *aug;
386 const u8 *end = (const u8 *)(cie + 1) + *cie;
387 uleb128_t len;
388
389 /* check if augmentation size is first (and thus present) */
390 if (*ptr != 'z')
391 return -1;
392 /* check if augmentation string is nul-terminated */
393 if ((ptr = memchr(aug = (const void *)ptr, 0, end - ptr)) == NULL)
394 return -1;
395 ++ptr; /* skip terminator */
396 get_uleb128(&ptr, end); /* skip code alignment */
397 get_sleb128(&ptr, end); /* skip data alignment */
398 /* skip return address column */
399 version <= 1 ? (void)++ptr : (void)get_uleb128(&ptr, end);
400 len = get_uleb128(&ptr, end); /* augmentation length */
401 if (ptr + len < ptr || ptr + len > end)
402 return -1;
403 end = ptr + len;
404 while (*++aug) {
405 if (ptr >= end)
406 return -1;
407 switch(*aug) {
408 case 'L':
409 ++ptr;
410 break;
411 case 'P': {
412 signed ptrType = *ptr++;
413
414 if (!read_pointer(&ptr, end, ptrType) || ptr > end)
415 return -1;
416 }
417 break;
418 case 'R':
419 return *ptr;
420 default:
421 return -1;
422 }
423 }
424 }
425 return DW_EH_PE_native|DW_EH_PE_abs;
426}
427
428static int advance_loc(unsigned long delta, struct unwind_state *state)
429{
430 state->loc += delta * state->codeAlign;
431
432 return delta > 0;
433}
434
435static void set_rule(uleb128_t reg,
436 enum item_location where,
437 uleb128_t value,
438 struct unwind_state *state)
439{
440 if (reg < ARRAY_SIZE(state->regs)) {
441 state->regs[reg].where = where;
442 state->regs[reg].value = value;
443 }
444}
445
446static int processCFI(const u8 *start,
447 const u8 *end,
448 unsigned long targetLoc,
449 signed ptrType,
450 struct unwind_state *state)
451{
452 union {
453 const u8 *p8;
454 const u16 *p16;
455 const u32 *p32;
456 } ptr;
457 int result = 1;
458
459 if (start != state->cieStart) {
460 state->loc = state->org;
461 result = processCFI(state->cieStart, state->cieEnd, 0, ptrType, state);
462 if (targetLoc == 0 && state->label == NULL)
463 return result;
464 }
465 for (ptr.p8 = start; result && ptr.p8 < end; ) {
466 switch(*ptr.p8 >> 6) {
467 uleb128_t value;
468
469 case 0:
470 switch(*ptr.p8++) {
471 case DW_CFA_nop:
472 break;
473 case DW_CFA_set_loc:
474 if ((state->loc = read_pointer(&ptr.p8, end, ptrType)) == 0)
475 result = 0;
476 break;
477 case DW_CFA_advance_loc1:
478 result = ptr.p8 < end && advance_loc(*ptr.p8++, state);
479 break;
480 case DW_CFA_advance_loc2:
481 result = ptr.p8 <= end + 2
482 && advance_loc(*ptr.p16++, state);
483 break;
484 case DW_CFA_advance_loc4:
485 result = ptr.p8 <= end + 4
486 && advance_loc(*ptr.p32++, state);
487 break;
488 case DW_CFA_offset_extended:
489 value = get_uleb128(&ptr.p8, end);
490 set_rule(value, Memory, get_uleb128(&ptr.p8, end), state);
491 break;
492 case DW_CFA_val_offset:
493 value = get_uleb128(&ptr.p8, end);
494 set_rule(value, Value, get_uleb128(&ptr.p8, end), state);
495 break;
496 case DW_CFA_offset_extended_sf:
497 value = get_uleb128(&ptr.p8, end);
498 set_rule(value, Memory, get_sleb128(&ptr.p8, end), state);
499 break;
500 case DW_CFA_val_offset_sf:
501 value = get_uleb128(&ptr.p8, end);
502 set_rule(value, Value, get_sleb128(&ptr.p8, end), state);
503 break;
504 case DW_CFA_restore_extended:
505 case DW_CFA_undefined:
506 case DW_CFA_same_value:
507 set_rule(get_uleb128(&ptr.p8, end), Nowhere, 0, state);
508 break;
509 case DW_CFA_register:
510 value = get_uleb128(&ptr.p8, end);
511 set_rule(value,
512 Register,
513 get_uleb128(&ptr.p8, end), state);
514 break;
515 case DW_CFA_remember_state:
516 if (ptr.p8 == state->label) {
517 state->label = NULL;
518 return 1;
519 }
520 if (state->stackDepth >= MAX_STACK_DEPTH)
521 return 0;
522 state->stack[state->stackDepth++] = ptr.p8;
523 break;
524 case DW_CFA_restore_state:
525 if (state->stackDepth) {
526 const uleb128_t loc = state->loc;
527 const u8 *label = state->label;
528
529 state->label = state->stack[state->stackDepth - 1];
530 memcpy(&state->cfa, &badCFA, sizeof(state->cfa));
531 memset(state->regs, 0, sizeof(state->regs));
532 state->stackDepth = 0;
533 result = processCFI(start, end, 0, ptrType, state);
534 state->loc = loc;
535 state->label = label;
536 } else
537 return 0;
538 break;
539 case DW_CFA_def_cfa:
540 state->cfa.reg = get_uleb128(&ptr.p8, end);
541 /*nobreak*/
542 case DW_CFA_def_cfa_offset:
543 state->cfa.offs = get_uleb128(&ptr.p8, end);
544 break;
545 case DW_CFA_def_cfa_sf:
546 state->cfa.reg = get_uleb128(&ptr.p8, end);
547 /*nobreak*/
548 case DW_CFA_def_cfa_offset_sf:
549 state->cfa.offs = get_sleb128(&ptr.p8, end)
550 * state->dataAlign;
551 break;
552 case DW_CFA_def_cfa_register:
553 state->cfa.reg = get_uleb128(&ptr.p8, end);
554 break;
555 /*todo case DW_CFA_def_cfa_expression: */
556 /*todo case DW_CFA_expression: */
557 /*todo case DW_CFA_val_expression: */
558 case DW_CFA_GNU_args_size:
559 get_uleb128(&ptr.p8, end);
560 break;
561 case DW_CFA_GNU_negative_offset_extended:
562 value = get_uleb128(&ptr.p8, end);
563 set_rule(value,
564 Memory,
565 (uleb128_t)0 - get_uleb128(&ptr.p8, end), state);
566 break;
567 case DW_CFA_GNU_window_save:
568 default:
569 result = 0;
570 break;
571 }
572 break;
573 case 1:
574 result = advance_loc(*ptr.p8++ & 0x3f, state);
575 break;
576 case 2:
577 value = *ptr.p8++ & 0x3f;
578 set_rule(value, Memory, get_uleb128(&ptr.p8, end), state);
579 break;
580 case 3:
581 set_rule(*ptr.p8++ & 0x3f, Nowhere, 0, state);
582 break;
583 }
584 if (ptr.p8 > end)
585 result = 0;
586 if (result && targetLoc != 0 && targetLoc < state->loc)
587 return 1;
588 }
589
590 return result
591 && ptr.p8 == end
592 && (targetLoc == 0
593 || (/*todo While in theory this should apply, gcc in practice omits
594 everything past the function prolog, and hence the location
595 never reaches the end of the function.
596 targetLoc < state->loc &&*/ state->label == NULL));
597}
598
599/* Unwind to the previous frame. Returns 0 if successful, negative
600 * number in case of an error. */
601int unwind(struct unwind_frame_info *frame)
602{
603#define FRAME_REG(r, t) (((t *)frame)[reg_info[r].offs])
604 const u32 *fde = NULL, *cie = NULL;
605 const u8 *ptr = NULL, *end = NULL;
606 unsigned long startLoc = 0, endLoc = 0, cfa;
607 unsigned i;
608 signed ptrType = -1;
609 uleb128_t retAddrReg = 0;
610 struct unwind_table *table;
611 struct unwind_state state;
612
613 if (UNW_PC(frame) == 0)
614 return -EINVAL;
615 if ((table = find_table(UNW_PC(frame))) != NULL
616 && !(table->size & (sizeof(*fde) - 1))) {
617 unsigned long tableSize = table->size;
618
619 for (fde = table->address;
620 tableSize > sizeof(*fde) && tableSize - sizeof(*fde) >= *fde;
621 tableSize -= sizeof(*fde) + *fde,
622 fde += 1 + *fde / sizeof(*fde)) {
623 if (!*fde || (*fde & (sizeof(*fde) - 1)))
624 break;
625 if (!fde[1])
626 continue; /* this is a CIE */
627 if ((fde[1] & (sizeof(*fde) - 1))
628 || fde[1] > (unsigned long)(fde + 1)
629 - (unsigned long)table->address)
630 continue; /* this is not a valid FDE */
631 cie = fde + 1 - fde[1] / sizeof(*fde);
632 if (*cie <= sizeof(*cie) + 4
633 || *cie >= fde[1] - sizeof(*fde)
634 || (*cie & (sizeof(*cie) - 1))
635 || cie[1]
636 || (ptrType = fde_pointer_type(cie)) < 0) {
637 cie = NULL; /* this is not a (valid) CIE */
638 continue;
639 }
640 ptr = (const u8 *)(fde + 2);
641 startLoc = read_pointer(&ptr,
642 (const u8 *)(fde + 1) + *fde,
643 ptrType);
644 endLoc = startLoc
645 + read_pointer(&ptr,
646 (const u8 *)(fde + 1) + *fde,
647 ptrType & DW_EH_PE_indirect
648 ? ptrType
649 : ptrType & (DW_EH_PE_FORM|DW_EH_PE_signed));
650 if (UNW_PC(frame) >= startLoc && UNW_PC(frame) < endLoc)
651 break;
652 cie = NULL;
653 }
654 }
655 if (cie != NULL) {
656 memset(&state, 0, sizeof(state));
657 state.cieEnd = ptr; /* keep here temporarily */
658 ptr = (const u8 *)(cie + 2);
659 end = (const u8 *)(cie + 1) + *cie;
660 if ((state.version = *ptr) != 1)
661 cie = NULL; /* unsupported version */
662 else if (*++ptr) {
663 /* check if augmentation size is first (and thus present) */
664 if (*ptr == 'z') {
665 /* check for ignorable (or already handled)
666 * nul-terminated augmentation string */
667 while (++ptr < end && *ptr)
668 if (strchr("LPR", *ptr) == NULL)
669 break;
670 }
671 if (ptr >= end || *ptr)
672 cie = NULL;
673 }
674 ++ptr;
675 }
676 if (cie != NULL) {
677		/* get code alignment factor */
678 state.codeAlign = get_uleb128(&ptr, end);
679		/* get data alignment factor */
680 state.dataAlign = get_sleb128(&ptr, end);
681 if (state.codeAlign == 0 || state.dataAlign == 0 || ptr >= end)
682 cie = NULL;
683 else {
684 retAddrReg = state.version <= 1 ? *ptr++ : get_uleb128(&ptr, end);
685 /* skip augmentation */
686 if (((const char *)(cie + 2))[1] == 'z')
687 ptr += get_uleb128(&ptr, end);
688 if (ptr > end
689 || retAddrReg >= ARRAY_SIZE(reg_info)
690 || REG_INVALID(retAddrReg)
691 || reg_info[retAddrReg].width != sizeof(unsigned long))
692 cie = NULL;
693 }
694 }
695 if (cie != NULL) {
696 state.cieStart = ptr;
697 ptr = state.cieEnd;
698 state.cieEnd = end;
699 end = (const u8 *)(fde + 1) + *fde;
700 /* skip augmentation */
701 if (((const char *)(cie + 2))[1] == 'z') {
702 uleb128_t augSize = get_uleb128(&ptr, end);
703
704 if ((ptr += augSize) > end)
705 fde = NULL;
706 }
707 }
708 if (cie == NULL || fde == NULL) {
709#ifdef CONFIG_FRAME_POINTER
710 unsigned long top, bottom;
711#endif
712
713#ifdef CONFIG_FRAME_POINTER
714 top = STACK_TOP(frame->task);
715 bottom = STACK_BOTTOM(frame->task);
716# if FRAME_RETADDR_OFFSET < 0
717 if (UNW_SP(frame) < top
718 && UNW_FP(frame) <= UNW_SP(frame)
719 && bottom < UNW_FP(frame)
720# else
721 if (UNW_SP(frame) > top
722 && UNW_FP(frame) >= UNW_SP(frame)
723 && bottom > UNW_FP(frame)
724# endif
725 && !((UNW_SP(frame) | UNW_FP(frame))
726 & (sizeof(unsigned long) - 1))) {
727 unsigned long link;
728
729 if (!__get_user(link,
730 (unsigned long *)(UNW_FP(frame)
731 + FRAME_LINK_OFFSET))
732# if FRAME_RETADDR_OFFSET < 0
733 && link > bottom && link < UNW_FP(frame)
734# else
735 && link > UNW_FP(frame) && link < bottom
736# endif
737 && !(link & (sizeof(link) - 1))
738 && !__get_user(UNW_PC(frame),
739 (unsigned long *)(UNW_FP(frame)
740 + FRAME_RETADDR_OFFSET))) {
741 UNW_SP(frame) = UNW_FP(frame) + FRAME_RETADDR_OFFSET
742# if FRAME_RETADDR_OFFSET < 0
743 -
744# else
745 +
746# endif
747 sizeof(UNW_PC(frame));
748 UNW_FP(frame) = link;
749 return 0;
750 }
751 }
752#endif
753 return -ENXIO;
754 }
755 state.org = startLoc;
756 memcpy(&state.cfa, &badCFA, sizeof(state.cfa));
757 /* process instructions */
758 if (!processCFI(ptr, end, UNW_PC(frame), ptrType, &state)
759 || state.loc > endLoc
760 || state.regs[retAddrReg].where == Nowhere
761 || state.cfa.reg >= ARRAY_SIZE(reg_info)
762 || reg_info[state.cfa.reg].width != sizeof(unsigned long)
763 || state.cfa.offs % sizeof(unsigned long))
764 return -EIO;
765 /* update frame */
766 cfa = FRAME_REG(state.cfa.reg, unsigned long) + state.cfa.offs;
767 startLoc = min((unsigned long)UNW_SP(frame), cfa);
768 endLoc = max((unsigned long)UNW_SP(frame), cfa);
769 if (STACK_LIMIT(startLoc) != STACK_LIMIT(endLoc)) {
770 startLoc = min(STACK_LIMIT(cfa), cfa);
771 endLoc = max(STACK_LIMIT(cfa), cfa);
772 }
773#ifndef CONFIG_64BIT
774# define CASES CASE(8); CASE(16); CASE(32)
775#else
776# define CASES CASE(8); CASE(16); CASE(32); CASE(64)
777#endif
778 for (i = 0; i < ARRAY_SIZE(state.regs); ++i) {
779 if (REG_INVALID(i)) {
780 if (state.regs[i].where == Nowhere)
781 continue;
782 return -EIO;
783 }
784 switch(state.regs[i].where) {
785 default:
786 break;
787 case Register:
788 if (state.regs[i].value >= ARRAY_SIZE(reg_info)
789 || REG_INVALID(state.regs[i].value)
790 || reg_info[i].width > reg_info[state.regs[i].value].width)
791 return -EIO;
792 switch(reg_info[state.regs[i].value].width) {
793#define CASE(n) \
794 case sizeof(u##n): \
795 state.regs[i].value = FRAME_REG(state.regs[i].value, \
796 const u##n); \
797 break
798 CASES;
799#undef CASE
800 default:
801 return -EIO;
802 }
803 break;
804 }
805 }
806 for (i = 0; i < ARRAY_SIZE(state.regs); ++i) {
807 if (REG_INVALID(i))
808 continue;
809 switch(state.regs[i].where) {
810 case Nowhere:
811 if (reg_info[i].width != sizeof(UNW_SP(frame))
812 || &FRAME_REG(i, __typeof__(UNW_SP(frame)))
813 != &UNW_SP(frame))
814 continue;
815 UNW_SP(frame) = cfa;
816 break;
817 case Register:
818 switch(reg_info[i].width) {
819#define CASE(n) case sizeof(u##n): \
820 FRAME_REG(i, u##n) = state.regs[i].value; \
821 break
822 CASES;
823#undef CASE
824 default:
825 return -EIO;
826 }
827 break;
828 case Value:
829 if (reg_info[i].width != sizeof(unsigned long))
830 return -EIO;
831 FRAME_REG(i, unsigned long) = cfa + state.regs[i].value
832 * state.dataAlign;
833 break;
834 case Memory: {
835 unsigned long addr = cfa + state.regs[i].value
836 * state.dataAlign;
837
838 if ((state.regs[i].value * state.dataAlign)
839 % sizeof(unsigned long)
840 || addr < startLoc
841 || addr + sizeof(unsigned long) < addr
842 || addr + sizeof(unsigned long) > endLoc)
843 return -EIO;
844 switch(reg_info[i].width) {
845#define CASE(n) case sizeof(u##n): \
846 __get_user(FRAME_REG(i, u##n), (u##n *)addr); \
847 break
848 CASES;
849#undef CASE
850 default:
851 return -EIO;
852 }
853 }
854 break;
855 }
856 }
857
858 return 0;
859#undef CASES
860#undef FRAME_REG
861}
862EXPORT_SYMBOL(unwind);
863
864int unwind_init_frame_info(struct unwind_frame_info *info,
865 struct task_struct *tsk,
866 /*const*/ struct pt_regs *regs)
867{
868 info->task = tsk;
869 arch_unw_init_frame_info(info, regs);
870
871 return 0;
872}
873EXPORT_SYMBOL(unwind_init_frame_info);
874
875/*
876 * Prepare to unwind a blocked task.
877 */
878int unwind_init_blocked(struct unwind_frame_info *info,
879 struct task_struct *tsk)
880{
881 info->task = tsk;
882 arch_unw_init_blocked(info);
883
884 return 0;
885}
886EXPORT_SYMBOL(unwind_init_blocked);
887
888/*
889 * Prepare to unwind the currently running thread.
890 */
891int unwind_init_running(struct unwind_frame_info *info,
892 asmlinkage int (*callback)(struct unwind_frame_info *,
893 void *arg),
894 void *arg)
895{
896 info->task = current;
897
898 return arch_unwind_init_running(info, callback, arg);
899}
900EXPORT_SYMBOL(unwind_init_running);
901
902/*
903 * Unwind until the return pointer is in user-land (or until an error
904 * occurs). Returns 0 if successful, negative number in case of
905 * error.
906 */
907int unwind_to_user(struct unwind_frame_info *info)
908{
909 while (!arch_unw_user_mode(info)) {
910 int err = unwind(info);
911
912 if (err < 0)
913 return err;
914 }
915
916 return 0;
917}
918EXPORT_SYMBOL(unwind_to_user);
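
A hedged sketch of how an arch backtrace routine might drive this API. UNW_PC() and arch_unw_user_mode() come from the arch's <asm/unwind.h>, and the printing loop is illustrative rather than copied from any in-tree caller:

/* Illustrative caller of the unwind API above; error handling is
 * deliberately minimal. Assumes <linux/unwind.h> and the arch macros. */
#include <linux/unwind.h>
#include <linux/kernel.h>
#include <linux/sched.h>

static void example_show_trace(struct task_struct *tsk, struct pt_regs *regs)
{
        struct unwind_frame_info info;

        if (unwind_init_frame_info(&info, tsk, regs) < 0)
                return;

        /* walk frames until an error or until user space is reached */
        while (!arch_unw_user_mode(&info)) {
                printk(" [<%p>]\n", (void *)UNW_PC(&info));
                if (unwind(&info) < 0)
                        break;
        }
}
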
diff --git a/kernel/user.c b/kernel/user.c
index 2116642f42c6..6408c0424291 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -140,7 +140,7 @@ struct user_struct * alloc_uid(uid_t uid)
140 atomic_set(&new->processes, 0); 140 atomic_set(&new->processes, 0);
141 atomic_set(&new->files, 0); 141 atomic_set(&new->files, 0);
142 atomic_set(&new->sigpending, 0); 142 atomic_set(&new->sigpending, 0);
143#ifdef CONFIG_INOTIFY 143#ifdef CONFIG_INOTIFY_USER
144 atomic_set(&new->inotify_watches, 0); 144 atomic_set(&new->inotify_watches, 0);
145 atomic_set(&new->inotify_devs, 0); 145 atomic_set(&new->inotify_devs, 0);
146#endif 146#endif
@@ -148,7 +148,7 @@ struct user_struct * alloc_uid(uid_t uid)
148 new->mq_bytes = 0; 148 new->mq_bytes = 0;
149 new->locked_shm = 0; 149 new->locked_shm = 0;
150 150
151 if (alloc_uid_keyring(new) < 0) { 151 if (alloc_uid_keyring(new, current) < 0) {
152 kmem_cache_free(uid_cachep, new); 152 kmem_cache_free(uid_cachep, new);
153 return NULL; 153 return NULL;
154 } 154 }
diff --git a/kernel/wait.c b/kernel/wait.c
index 791681cfea98..5985d866531f 100644
--- a/kernel/wait.c
+++ b/kernel/wait.c
@@ -3,7 +3,6 @@
3 * 3 *
4 * (C) 2004 William Irwin, Oracle 4 * (C) 2004 William Irwin, Oracle
5 */ 5 */
6#include <linux/config.h>
7#include <linux/init.h> 6#include <linux/init.h>
8#include <linux/module.h> 7#include <linux/module.h>
9#include <linux/sched.h> 8#include <linux/sched.h>
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 880fb415a8f6..59f0b42bd89e 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -428,22 +428,34 @@ int schedule_delayed_work_on(int cpu,
428 return ret; 428 return ret;
429} 429}
430 430
431int schedule_on_each_cpu(void (*func) (void *info), void *info) 431/**
432 * schedule_on_each_cpu - call a function on each online CPU from keventd
433 * @func: the function to call
434 * @info: a pointer to pass to func()
435 *
436 * Returns zero on success.
437 * Returns -ve errno on failure.
438 *
439 * Appears to be racy against CPU hotplug.
440 *
441 * schedule_on_each_cpu() is very slow.
442 */
443int schedule_on_each_cpu(void (*func)(void *info), void *info)
432{ 444{
433 int cpu; 445 int cpu;
434 struct work_struct *work; 446 struct work_struct *works;
435 447
436 work = kmalloc(NR_CPUS * sizeof(struct work_struct), GFP_KERNEL); 448 works = alloc_percpu(struct work_struct);
437 449 if (!works)
438 if (!work)
439 return -ENOMEM; 450 return -ENOMEM;
451
440 for_each_online_cpu(cpu) { 452 for_each_online_cpu(cpu) {
441 INIT_WORK(work + cpu, func, info); 453 INIT_WORK(per_cpu_ptr(works, cpu), func, info);
442 __queue_work(per_cpu_ptr(keventd_wq->cpu_wq, cpu), 454 __queue_work(per_cpu_ptr(keventd_wq->cpu_wq, cpu),
443 work + cpu); 455 per_cpu_ptr(works, cpu));
444 } 456 }
445 flush_workqueue(keventd_wq); 457 flush_workqueue(keventd_wq);
446 kfree(work); 458 free_percpu(works);
447 return 0; 459 return 0;
448} 460}
449 461
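
A usage sketch for the reworked schedule_on_each_cpu(); the callback and counter below are invented. Because the function flushes keventd before returning, the counter is stable by the time the caller reads it.

/* Illustrative caller of schedule_on_each_cpu(); runs bump_count()
 * once on each online CPU and reports how many ran. */
#include <linux/workqueue.h>
#include <linux/kernel.h>
#include <asm/atomic.h>

static atomic_t example_count = ATOMIC_INIT(0);

static void bump_count(void *info)
{
        atomic_inc((atomic_t *)info);   /* runs once on each online CPU */
}

static int run_everywhere(void)
{
        int ret = schedule_on_each_cpu(bump_count, &example_count);

        if (!ret)
                printk(KERN_INFO "ran on %d cpus\n",
                       atomic_read(&example_count));
        return ret;
}
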
@@ -531,11 +543,11 @@ int current_is_keventd(void)
531static void take_over_work(struct workqueue_struct *wq, unsigned int cpu) 543static void take_over_work(struct workqueue_struct *wq, unsigned int cpu)
532{ 544{
533 struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu); 545 struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu);
534 LIST_HEAD(list); 546 struct list_head list;
535 struct work_struct *work; 547 struct work_struct *work;
536 548
537 spin_lock_irq(&cwq->lock); 549 spin_lock_irq(&cwq->lock);
538 list_splice_init(&cwq->worklist, &list); 550 list_replace_init(&cwq->worklist, &list);
539 551
540 while (!list_empty(&list)) { 552 while (!list_empty(&list)) {
541 printk("Taking work for %s\n", wq->name); 553 printk("Taking work for %s\n", wq->name);
@@ -547,7 +559,7 @@ static void take_over_work(struct workqueue_struct *wq, unsigned int cpu)
547} 559}
548 560
549/* We're holding the cpucontrol mutex here */ 561/* We're holding the cpucontrol mutex here */
550static int workqueue_cpu_callback(struct notifier_block *nfb, 562static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
551 unsigned long action, 563 unsigned long action,
552 void *hcpu) 564 void *hcpu)
553{ 565{
@@ -578,6 +590,8 @@ static int workqueue_cpu_callback(struct notifier_block *nfb,
578 590
579 case CPU_UP_CANCELED: 591 case CPU_UP_CANCELED:
580 list_for_each_entry(wq, &workqueues, list) { 592 list_for_each_entry(wq, &workqueues, list) {
593 if (!per_cpu_ptr(wq->cpu_wq, hotcpu)->thread)
594 continue;
581 /* Unbind so it can run. */ 595 /* Unbind so it can run. */
582 kthread_bind(per_cpu_ptr(wq->cpu_wq, hotcpu)->thread, 596 kthread_bind(per_cpu_ptr(wq->cpu_wq, hotcpu)->thread,
583 any_online_cpu(cpu_online_map)); 597 any_online_cpu(cpu_online_map));