Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile | 6
-rw-r--r--  kernel/acct.c | 118
-rw-r--r--  kernel/audit.c | 207
-rw-r--r--  kernel/audit.h | 61
-rw-r--r--  kernel/auditfilter.c | 899
-rw-r--r--  kernel/auditsc.c | 651
-rw-r--r--  kernel/compat.c | 30
-rw-r--r--  kernel/cpu.c | 18
-rw-r--r--  kernel/cpuset.c | 42
-rw-r--r--  kernel/exit.c | 30
-rw-r--r--  kernel/fork.c | 40
-rw-r--r--  kernel/futex.c | 1075
-rw-r--r--  kernel/futex_compat.c | 14
-rw-r--r--  kernel/hrtimer.c | 23
-rw-r--r--  kernel/intermodule.c | 184
-rw-r--r--  kernel/irq/Makefile | 2
-rw-r--r--  kernel/irq/autoprobe.c | 56
-rw-r--r--  kernel/irq/chip.c | 525
-rw-r--r--  kernel/irq/handle.c | 121
-rw-r--r--  kernel/irq/internals.h | 46
-rw-r--r--  kernel/irq/manage.c | 156
-rw-r--r--  kernel/irq/migration.c | 22
-rw-r--r--  kernel/irq/proc.c | 33
-rw-r--r--  kernel/irq/resend.c | 78
-rw-r--r--  kernel/irq/spurious.c | 45
-rw-r--r--  kernel/kexec.c | 12
-rw-r--r--  kernel/kprobes.c | 58
-rw-r--r--  kernel/ksysfs.c | 19
-rw-r--r--  kernel/kthread.c | 61
-rw-r--r--  kernel/module.c | 128
-rw-r--r--  kernel/mutex-debug.c | 17
-rw-r--r--  kernel/mutex-debug.h | 25
-rw-r--r--  kernel/mutex.c | 21
-rw-r--r--  kernel/mutex.h | 6
-rw-r--r--  kernel/power/Kconfig | 18
-rw-r--r--  kernel/power/disk.c | 2
-rw-r--r--  kernel/power/main.c | 6
-rw-r--r--  kernel/power/power.h | 2
-rw-r--r--  kernel/power/snapshot.c | 148
-rw-r--r--  kernel/power/swsusp.c | 20
-rw-r--r--  kernel/printk.c | 52
-rw-r--r--  kernel/profile.c | 2
-rw-r--r--  kernel/ptrace.c | 23
-rw-r--r--  kernel/rcupdate.c | 27
-rw-r--r--  kernel/rcutorture.c | 201
-rw-r--r--  kernel/resource.c | 90
-rw-r--r--  kernel/rtmutex-debug.c | 513
-rw-r--r--  kernel/rtmutex-debug.h | 37
-rw-r--r--  kernel/rtmutex-tester.c | 440
-rw-r--r--  kernel/rtmutex.c | 990
-rw-r--r--  kernel/rtmutex.h | 29
-rw-r--r--  kernel/rtmutex_common.h | 123
-rw-r--r--  kernel/sched.c | 1228
-rw-r--r--  kernel/signal.c | 37
-rw-r--r--  kernel/softirq.c | 6
-rw-r--r--  kernel/softlockup.c | 8
-rw-r--r--  kernel/stop_machine.c | 17
-rw-r--r--  kernel/sys.c | 80
-rw-r--r--  kernel/sys_ni.c | 2
-rw-r--r--  kernel/sysctl.c | 64
-rw-r--r--  kernel/time.c | 2
-rw-r--r--  kernel/time/Makefile | 1
-rw-r--r--  kernel/time/clocksource.c | 349
-rw-r--r--  kernel/time/jiffies.c | 73
-rw-r--r--  kernel/timer.c | 432
-rw-r--r--  kernel/unwind.c | 918
-rw-r--r--  kernel/user.c | 4
-rw-r--r--  kernel/workqueue.c | 36
68 files changed, 9184 insertions(+), 1625 deletions(-)
diff --git a/kernel/Makefile b/kernel/Makefile
index 58908f9d156a..82fb182f6f61 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -10,18 +10,22 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \
 	    kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
 	    hrtimer.o
 
+obj-y += time/
 obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o
 obj-$(CONFIG_FUTEX) += futex.o
 ifeq ($(CONFIG_COMPAT),y)
 obj-$(CONFIG_FUTEX) += futex_compat.o
 endif
+obj-$(CONFIG_RT_MUTEXES) += rtmutex.o
+obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
+obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o
 obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
 obj-$(CONFIG_SMP) += cpu.o spinlock.o
 obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
 obj-$(CONFIG_UID16) += uid16.o
 obj-$(CONFIG_MODULES) += module.o
-obj-$(CONFIG_OBSOLETE_INTERMODULE) += intermodule.o
 obj-$(CONFIG_KALLSYMS) += kallsyms.o
+obj-$(CONFIG_STACK_UNWIND) += unwind.o
 obj-$(CONFIG_PM) += power/
 obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
 obj-$(CONFIG_KEXEC) += kexec.o
diff --git a/kernel/acct.c b/kernel/acct.c
index b327f4d20104..126ca43d5d2b 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -75,7 +75,7 @@ int acct_parm[3] = {4, 2, 30};
 /*
  * External references and all of the globals.
  */
-static void do_acct_process(long, struct file *);
+static void do_acct_process(struct file *);
 
 /*
  * This structure is used so that all the data protected by lock
@@ -118,7 +118,7 @@ static int check_free_space(struct file *file)
 	spin_unlock(&acct_globals.lock);
 
 	/* May block */
-	if (vfs_statfs(file->f_dentry->d_inode->i_sb, &sbuf))
+	if (vfs_statfs(file->f_dentry, &sbuf))
 		return res;
 	suspend = sbuf.f_blocks * SUSPEND;
 	resume = sbuf.f_blocks * RESUME;
@@ -196,7 +196,7 @@ static void acct_file_reopen(struct file *file)
 	if (old_acct) {
 		mnt_unpin(old_acct->f_vfsmnt);
 		spin_unlock(&acct_globals.lock);
-		do_acct_process(0, old_acct);
+		do_acct_process(old_acct);
 		filp_close(old_acct, NULL);
 		spin_lock(&acct_globals.lock);
 	}
@@ -419,16 +419,15 @@ static u32 encode_float(u64 value)
 /*
  * do_acct_process does all actual work. Caller holds the reference to file.
  */
-static void do_acct_process(long exitcode, struct file *file)
+static void do_acct_process(struct file *file)
 {
+	struct pacct_struct *pacct = &current->signal->pacct;
 	acct_t ac;
 	mm_segment_t fs;
-	unsigned long vsize;
 	unsigned long flim;
 	u64 elapsed;
 	u64 run_time;
 	struct timespec uptime;
-	unsigned long jiffies;
 
 	/*
 	 * First check to see if there is enough free_space to continue
@@ -469,12 +468,6 @@ static void do_acct_process(long exitcode, struct file *file)
 #endif
 	do_div(elapsed, AHZ);
 	ac.ac_btime = xtime.tv_sec - elapsed;
-	jiffies = cputime_to_jiffies(cputime_add(current->utime,
-						 current->signal->utime));
-	ac.ac_utime = encode_comp_t(jiffies_to_AHZ(jiffies));
-	jiffies = cputime_to_jiffies(cputime_add(current->stime,
-						 current->signal->stime));
-	ac.ac_stime = encode_comp_t(jiffies_to_AHZ(jiffies));
 	/* we really need to bite the bullet and change layout */
 	ac.ac_uid = current->uid;
 	ac.ac_gid = current->gid;
@@ -496,37 +489,18 @@ static void do_acct_process(long exitcode, struct file *file)
 		old_encode_dev(tty_devnum(current->signal->tty)) : 0;
 	read_unlock(&tasklist_lock);
 
-	ac.ac_flag = 0;
-	if (current->flags & PF_FORKNOEXEC)
-		ac.ac_flag |= AFORK;
-	if (current->flags & PF_SUPERPRIV)
-		ac.ac_flag |= ASU;
-	if (current->flags & PF_DUMPCORE)
-		ac.ac_flag |= ACORE;
-	if (current->flags & PF_SIGNALED)
-		ac.ac_flag |= AXSIG;
-
-	vsize = 0;
-	if (current->mm) {
-		struct vm_area_struct *vma;
-		down_read(&current->mm->mmap_sem);
-		vma = current->mm->mmap;
-		while (vma) {
-			vsize += vma->vm_end - vma->vm_start;
-			vma = vma->vm_next;
-		}
-		up_read(&current->mm->mmap_sem);
-	}
-	vsize = vsize / 1024;
-	ac.ac_mem = encode_comp_t(vsize);
+	spin_lock(&current->sighand->siglock);
+	ac.ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime)));
+	ac.ac_stime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_stime)));
+	ac.ac_flag = pacct->ac_flag;
+	ac.ac_mem = encode_comp_t(pacct->ac_mem);
+	ac.ac_minflt = encode_comp_t(pacct->ac_minflt);
+	ac.ac_majflt = encode_comp_t(pacct->ac_majflt);
+	ac.ac_exitcode = pacct->ac_exitcode;
+	spin_unlock(&current->sighand->siglock);
 	ac.ac_io = encode_comp_t(0 /* current->io_usage */); /* %% */
 	ac.ac_rw = encode_comp_t(ac.ac_io / 1024);
-	ac.ac_minflt = encode_comp_t(current->signal->min_flt +
-				     current->min_flt);
-	ac.ac_majflt = encode_comp_t(current->signal->maj_flt +
-				     current->maj_flt);
 	ac.ac_swaps = encode_comp_t(0);
-	ac.ac_exitcode = exitcode;
 
 	/*
 	 * Kernel segment override to datasegment and write it
@@ -546,12 +520,64 @@ static void do_acct_process(long exitcode, struct file *file)
 }
 
 /**
+ * acct_init_pacct - initialize a new pacct_struct
+ * @pacct: per-process accounting info struct to initialize
+ */
+void acct_init_pacct(struct pacct_struct *pacct)
+{
+	memset(pacct, 0, sizeof(struct pacct_struct));
+	pacct->ac_utime = pacct->ac_stime = cputime_zero;
+}
+
+/**
+ * acct_collect - collect accounting information into pacct_struct
+ * @exitcode: task exit code
+ * @group_dead: not 0, if this thread is the last one in the process.
+ */
+void acct_collect(long exitcode, int group_dead)
+{
+	struct pacct_struct *pacct = &current->signal->pacct;
+	unsigned long vsize = 0;
+
+	if (group_dead && current->mm) {
+		struct vm_area_struct *vma;
+		down_read(&current->mm->mmap_sem);
+		vma = current->mm->mmap;
+		while (vma) {
+			vsize += vma->vm_end - vma->vm_start;
+			vma = vma->vm_next;
+		}
+		up_read(&current->mm->mmap_sem);
+	}
+
+	spin_lock_irq(&current->sighand->siglock);
+	if (group_dead)
+		pacct->ac_mem = vsize / 1024;
+	if (thread_group_leader(current)) {
+		pacct->ac_exitcode = exitcode;
+		if (current->flags & PF_FORKNOEXEC)
+			pacct->ac_flag |= AFORK;
+	}
+	if (current->flags & PF_SUPERPRIV)
+		pacct->ac_flag |= ASU;
+	if (current->flags & PF_DUMPCORE)
+		pacct->ac_flag |= ACORE;
+	if (current->flags & PF_SIGNALED)
+		pacct->ac_flag |= AXSIG;
+	pacct->ac_utime = cputime_add(pacct->ac_utime, current->utime);
+	pacct->ac_stime = cputime_add(pacct->ac_stime, current->stime);
+	pacct->ac_minflt += current->min_flt;
+	pacct->ac_majflt += current->maj_flt;
+	spin_unlock_irq(&current->sighand->siglock);
+}
+
+/**
  * acct_process - now just a wrapper around do_acct_process
  * @exitcode: task exit code
  *
  * handles process accounting for an exiting task
  */
-void acct_process(long exitcode)
+void acct_process(void)
 {
 	struct file *file = NULL;
 
@@ -570,7 +596,7 @@ void acct_process(long exitcode)
 	get_file(file);
 	spin_unlock(&acct_globals.lock);
 
-	do_acct_process(exitcode, file);
+	do_acct_process(file);
 	fput(file);
 }
 
@@ -599,9 +625,7 @@ void acct_update_integrals(struct task_struct *tsk)
  */
 void acct_clear_integrals(struct task_struct *tsk)
 {
-	if (tsk) {
-		tsk->acct_stimexpd = 0;
-		tsk->acct_rss_mem1 = 0;
-		tsk->acct_vm_mem1 = 0;
-	}
+	tsk->acct_stimexpd = 0;
+	tsk->acct_rss_mem1 = 0;
+	tsk->acct_vm_mem1 = 0;
 }
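The acct.c changes above move per-task bookkeeping out of do_acct_process() and into the new acct_collect(), which each exiting thread calls to fold its own counters into the shared signal->pacct structure under the siglock; the final writer then only reads the aggregate. A minimal user-space sketch of that accumulation pattern, with illustrative (non-kernel) names and a pthread mutex standing in for the siglock:

/*
 * Illustrative sketch only: each "exiting thread" folds its counters
 * into one shared per-process accumulator under a lock, mirroring
 * acct_collect() feeding do_acct_process().  Not kernel code.
 */
#include <pthread.h>
#include <stdio.h>

struct pacct_like {
	pthread_mutex_t lock;
	unsigned long long utime;	/* accumulated user time    */
	unsigned long long stime;	/* accumulated system time  */
	unsigned long minflt;		/* accumulated minor faults */
	unsigned long majflt;		/* accumulated major faults */
};

static struct pacct_like pacct = { PTHREAD_MUTEX_INITIALIZER, 0, 0, 0, 0 };

/* Called once per exiting thread. */
static void collect(unsigned long long ut, unsigned long long st,
		    unsigned long minf, unsigned long majf)
{
	pthread_mutex_lock(&pacct.lock);
	pacct.utime  += ut;
	pacct.stime  += st;
	pacct.minflt += minf;
	pacct.majflt += majf;
	pthread_mutex_unlock(&pacct.lock);
}

int main(void)
{
	collect(10, 4, 120, 1);
	collect(7, 2, 80, 0);
	printf("utime=%llu stime=%llu minflt=%lu majflt=%lu\n",
	       pacct.utime, pacct.stime, pacct.minflt, pacct.majflt);
	return 0;
}

The benefit of the pattern is visible in the diff: the writer no longer has to walk the thread group or the VMA list itself, it just encodes whatever the exiting threads already accumulated.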
diff --git a/kernel/audit.c b/kernel/audit.c
index df57b493e1cb..82443fb433ef 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -56,6 +56,7 @@
 #include <linux/skbuff.h>
 #include <linux/netlink.h>
 #include <linux/selinux.h>
+#include <linux/inotify.h>
 
 #include "audit.h"
 
@@ -89,6 +90,7 @@ static int audit_backlog_wait_overflow = 0;
 /* The identity of the user shutting down the audit system. */
 uid_t audit_sig_uid = -1;
 pid_t audit_sig_pid = -1;
+u32 audit_sig_sid = 0;
 
 /* Records can be lost in several ways:
    0) [suppressed in audit_alloc]
@@ -102,6 +104,12 @@ static atomic_t audit_lost = ATOMIC_INIT(0);
 /* The netlink socket. */
 static struct sock *audit_sock;
 
+/* Inotify handle. */
+struct inotify_handle *audit_ih;
+
+/* Hash for inode-based rules */
+struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS];
+
 /* The audit_freelist is a list of pre-allocated audit buffers (if more
  * than AUDIT_MAXFREE are in use, the audit buffer is freed instead of
  * being placed on the freelist). */
@@ -114,10 +122,8 @@ static struct task_struct *kauditd_task;
 static DECLARE_WAIT_QUEUE_HEAD(kauditd_wait);
 static DECLARE_WAIT_QUEUE_HEAD(audit_backlog_wait);
 
-/* The netlink socket is only to be read by 1 CPU, which lets us assume
- * that list additions and deletions never happen simultaneously in
- * auditsc.c */
-DEFINE_MUTEX(audit_netlink_mutex);
+/* Serialize requests from userspace. */
+static DEFINE_MUTEX(audit_cmd_mutex);
 
 /* AUDIT_BUFSIZ is the size of the temporary buffer used for formatting
  * audit records. Since printk uses a 1024 byte buffer, this buffer
@@ -250,7 +256,7 @@ static int audit_set_rate_limit(int limit, uid_t loginuid, u32 sid)
 			  "audit_rate_limit=%d old=%d by auid=%u",
 			  limit, old, loginuid);
 	audit_rate_limit = limit;
-	return old;
+	return 0;
 }
 
 static int audit_set_backlog_limit(int limit, uid_t loginuid, u32 sid)
@@ -273,7 +279,7 @@ static int audit_set_backlog_limit(int limit, uid_t loginuid, u32 sid)
 			  "audit_backlog_limit=%d old=%d by auid=%u",
 			  limit, old, loginuid);
 	audit_backlog_limit = limit;
-	return old;
+	return 0;
 }
 
 static int audit_set_enabled(int state, uid_t loginuid, u32 sid)
@@ -299,7 +305,7 @@ static int audit_set_enabled(int state, uid_t loginuid, u32 sid)
 			  "audit_enabled=%d old=%d by auid=%u",
 			  state, old, loginuid);
 	audit_enabled = state;
-	return old;
+	return 0;
 }
 
 static int audit_set_failure(int state, uid_t loginuid, u32 sid)
@@ -327,7 +333,7 @@ static int audit_set_failure(int state, uid_t loginuid, u32 sid)
 			  "audit_failure=%d old=%d by auid=%u",
 			  state, old, loginuid);
 	audit_failure = state;
-	return old;
+	return 0;
 }
 
 static int kauditd_thread(void *dummy)
@@ -363,9 +369,52 @@ static int kauditd_thread(void *dummy)
 			remove_wait_queue(&kauditd_wait, &wait);
 		}
 	}
+}
+
+int audit_send_list(void *_dest)
+{
+	struct audit_netlink_list *dest = _dest;
+	int pid = dest->pid;
+	struct sk_buff *skb;
+
+	/* wait for parent to finish and send an ACK */
+	mutex_lock(&audit_cmd_mutex);
+	mutex_unlock(&audit_cmd_mutex);
+
+	while ((skb = __skb_dequeue(&dest->q)) != NULL)
+		netlink_unicast(audit_sock, skb, pid, 0);
+
+	kfree(dest);
+
 	return 0;
 }
 
+struct sk_buff *audit_make_reply(int pid, int seq, int type, int done,
+				 int multi, void *payload, int size)
+{
+	struct sk_buff *skb;
+	struct nlmsghdr *nlh;
+	int len = NLMSG_SPACE(size);
+	void *data;
+	int flags = multi ? NLM_F_MULTI : 0;
+	int t = done ? NLMSG_DONE : type;
+
+	skb = alloc_skb(len, GFP_KERNEL);
+	if (!skb)
+		return NULL;
+
+	nlh = NLMSG_PUT(skb, pid, seq, t, size);
+	nlh->nlmsg_flags = flags;
+	data = NLMSG_DATA(nlh);
+	memcpy(data, payload, size);
+	return skb;
+
+nlmsg_failure:			/* Used by NLMSG_PUT */
+	if (skb)
+		kfree_skb(skb);
+	return NULL;
+}
+
 /**
  * audit_send_reply - send an audit reply message via netlink
  * @pid: process id to send reply to
@@ -383,29 +432,13 @@ void audit_send_reply(int pid, int seq, int type, int done, int multi,
 		      void *payload, int size)
 {
 	struct sk_buff *skb;
-	struct nlmsghdr *nlh;
-	int len = NLMSG_SPACE(size);
-	void *data;
-	int flags = multi ? NLM_F_MULTI : 0;
-	int t = done ? NLMSG_DONE : type;
-
-	skb = alloc_skb(len, GFP_KERNEL);
+	skb = audit_make_reply(pid, seq, type, done, multi, payload, size);
 	if (!skb)
 		return;
-
-	nlh = NLMSG_PUT(skb, pid, seq, t, size);
-	nlh->nlmsg_flags = flags;
-	data = NLMSG_DATA(nlh);
-	memcpy(data, payload, size);
-
 	/* Ignore failure. It'll only happen if the sender goes away,
 	   because our timeout is set to infinite. */
 	netlink_unicast(audit_sock, skb, pid, 0);
 	return;
-
-nlmsg_failure:			/* Used by NLMSG_PUT */
-	if (skb)
-		kfree_skb(skb);
 }
 
 /*
@@ -451,7 +484,9 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 	struct audit_buffer	*ab;
 	u16			msg_type = nlh->nlmsg_type;
 	uid_t			loginuid; /* loginuid of sender */
-	struct audit_sig_info	sig_data;
+	struct audit_sig_info	*sig_data;
+	char			*ctx;
+	u32			len;
 
 	err = audit_netlink_ok(NETLINK_CB(skb).eff_cap, msg_type);
 	if (err)
@@ -503,12 +538,9 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 		if (status_get->mask & AUDIT_STATUS_PID) {
 			int old = audit_pid;
 			if (sid) {
-				char *ctx = NULL;
-				u32 len;
-				int rc;
-				if ((rc = selinux_ctxid_to_string(
+				if ((err = selinux_ctxid_to_string(
 						sid, &ctx, &len)))
-					return rc;
+					return err;
 				else
 					audit_log(NULL, GFP_KERNEL,
 						AUDIT_CONFIG_CHANGE,
@@ -523,10 +555,10 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 			audit_pid = status_get->pid;
 		}
 		if (status_get->mask & AUDIT_STATUS_RATE_LIMIT)
-			audit_set_rate_limit(status_get->rate_limit,
+			err = audit_set_rate_limit(status_get->rate_limit,
					loginuid, sid);
 		if (status_get->mask & AUDIT_STATUS_BACKLOG_LIMIT)
-			audit_set_backlog_limit(status_get->backlog_limit,
+			err = audit_set_backlog_limit(status_get->backlog_limit,
					loginuid, sid);
 		break;
 	case AUDIT_USER:
@@ -544,8 +576,6 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 				"user pid=%d uid=%u auid=%u",
 				pid, uid, loginuid);
 			if (sid) {
-				char *ctx = NULL;
-				u32 len;
 				if (selinux_ctxid_to_string(
 						sid, &ctx, &len)) {
 					audit_log_format(ab,
@@ -584,10 +614,21 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
					loginuid, sid);
 		break;
 	case AUDIT_SIGNAL_INFO:
-		sig_data.uid = audit_sig_uid;
-		sig_data.pid = audit_sig_pid;
+		err = selinux_ctxid_to_string(audit_sig_sid, &ctx, &len);
+		if (err)
+			return err;
+		sig_data = kmalloc(sizeof(*sig_data) + len, GFP_KERNEL);
+		if (!sig_data) {
+			kfree(ctx);
+			return -ENOMEM;
+		}
+		sig_data->uid = audit_sig_uid;
+		sig_data->pid = audit_sig_pid;
+		memcpy(sig_data->ctx, ctx, len);
+		kfree(ctx);
 		audit_send_reply(NETLINK_CB(skb).pid, seq, AUDIT_SIGNAL_INFO,
-				0, 0, &sig_data, sizeof(sig_data));
+				0, 0, sig_data, sizeof(*sig_data) + len);
+		kfree(sig_data);
 		break;
 	default:
 		err = -EINVAL;
@@ -629,20 +670,30 @@ static void audit_receive(struct sock *sk, int length)
 	struct sk_buff *skb;
 	unsigned int qlen;
 
-	mutex_lock(&audit_netlink_mutex);
+	mutex_lock(&audit_cmd_mutex);
 
 	for (qlen = skb_queue_len(&sk->sk_receive_queue); qlen; qlen--) {
 		skb = skb_dequeue(&sk->sk_receive_queue);
 		audit_receive_skb(skb);
 		kfree_skb(skb);
 	}
-	mutex_unlock(&audit_netlink_mutex);
+	mutex_unlock(&audit_cmd_mutex);
 }
 
+#ifdef CONFIG_AUDITSYSCALL
+static const struct inotify_operations audit_inotify_ops = {
+	.handle_event	= audit_handle_ievent,
+	.destroy_watch	= audit_free_parent,
+};
+#endif
 
 /* Initialize audit support at boot time. */
 static int __init audit_init(void)
 {
+#ifdef CONFIG_AUDITSYSCALL
+	int i;
+#endif
+
 	printk(KERN_INFO "audit: initializing netlink socket (%s)\n",
 	       audit_default ? "enabled" : "disabled");
 	audit_sock = netlink_kernel_create(NETLINK_AUDIT, 0, audit_receive,
@@ -661,6 +712,16 @@ static int __init audit_init(void)
 	selinux_audit_set_callback(&selinux_audit_rule_update);
 
 	audit_log(NULL, GFP_KERNEL, AUDIT_KERNEL, "initialized");
+
+#ifdef CONFIG_AUDITSYSCALL
+	audit_ih = inotify_init(&audit_inotify_ops);
+	if (IS_ERR(audit_ih))
+		audit_panic("cannot initialize inotify handle");
+
+	for (i = 0; i < AUDIT_INODE_BUCKETS; i++)
+		INIT_LIST_HEAD(&audit_inode_hash[i]);
+#endif
+
 	return 0;
 }
 __initcall(audit_init);
@@ -690,10 +751,12 @@ static void audit_buffer_free(struct audit_buffer *ab)
 	kfree_skb(ab->skb);
 
 	spin_lock_irqsave(&audit_freelist_lock, flags);
-	if (++audit_freelist_count > AUDIT_MAXFREE)
+	if (audit_freelist_count > AUDIT_MAXFREE)
 		kfree(ab);
-	else
+	else {
+		audit_freelist_count++;
 		list_add(&ab->list, &audit_freelist);
+	}
 	spin_unlock_irqrestore(&audit_freelist_lock, flags);
 }
 
@@ -755,7 +818,7 @@ err:
  */
 unsigned int audit_serial(void)
 {
-	static spinlock_t serial_lock = SPIN_LOCK_UNLOCKED;
+	static DEFINE_SPINLOCK(serial_lock);
 	static unsigned int serial = 0;
 
 	unsigned long flags;
@@ -988,28 +1051,76 @@ void audit_log_hex(struct audit_buffer *ab, const unsigned char *buf,
 	skb_put(skb, len << 1); /* new string is twice the old string */
 }
 
+/*
+ * Format a string of no more than slen characters into the audit buffer,
+ * enclosed in quote marks.
+ */
+static void audit_log_n_string(struct audit_buffer *ab, size_t slen,
+			       const char *string)
+{
+	int avail, new_len;
+	unsigned char *ptr;
+	struct sk_buff *skb;
+
+	BUG_ON(!ab->skb);
+	skb = ab->skb;
+	avail = skb_tailroom(skb);
+	new_len = slen + 3;	/* enclosing quotes + null terminator */
+	if (new_len > avail) {
+		avail = audit_expand(ab, new_len);
+		if (!avail)
+			return;
+	}
+	ptr = skb->tail;
+	*ptr++ = '"';
+	memcpy(ptr, string, slen);
+	ptr += slen;
+	*ptr++ = '"';
+	*ptr = 0;
+	skb_put(skb, slen + 2);	/* don't include null terminator */
+}
+
 /**
- * audit_log_unstrustedstring - log a string that may contain random characters
+ * audit_log_n_unstrustedstring - log a string that may contain random characters
  * @ab: audit_buffer
+ * @len: lenth of string (not including trailing null)
 * @string: string to be logged
 *
 * This code will escape a string that is passed to it if the string
 * contains a control character, unprintable character, double quote mark,
 * or a space. Unescaped strings will start and end with a double quote mark.
 * Strings that are escaped are printed in hex (2 digits per char).
+ *
+ * The caller specifies the number of characters in the string to log, which may
+ * or may not be the entire string.
 */
-void audit_log_untrustedstring(struct audit_buffer *ab, const char *string)
+const char *audit_log_n_untrustedstring(struct audit_buffer *ab, size_t len,
+					const char *string)
 {
 	const unsigned char *p = string;
 
 	while (*p) {
 		if (*p == '"' || *p < 0x21 || *p > 0x7f) {
-			audit_log_hex(ab, string, strlen(string));
-			return;
+			audit_log_hex(ab, string, len);
+			return string + len + 1;
 		}
 		p++;
 	}
-	audit_log_format(ab, "\"%s\"", string);
+	audit_log_n_string(ab, len, string);
+	return p + 1;
+}
+
+/**
+ * audit_log_unstrustedstring - log a string that may contain random characters
+ * @ab: audit_buffer
+ * @string: string to be logged
+ *
+ * Same as audit_log_n_unstrustedstring(), except that strlen is used to
+ * determine string length.
+ */
+const char *audit_log_untrustedstring(struct audit_buffer *ab, const char *string)
+{
+	return audit_log_n_untrustedstring(ab, strlen(string), string);
 }
 
 /* This is a helper-function to print the escaped d_path */
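The new audit_log_n_untrustedstring() above decides between two encodings: if the string contains a double quote, a space, a control character or a non-ASCII byte, the whole string is emitted as hex; otherwise it is emitted verbatim inside double quotes by audit_log_n_string(). A stand-alone sketch of that decision logic (illustrative only, not the kernel implementation):

/*
 * Illustrative sketch of the escaping decision: either the whole
 * string is quoted, or the whole string is hex-escaped.
 */
#include <stdio.h>
#include <string.h>

static void log_untrusted(const char *s, size_t len)
{
	for (size_t i = 0; i < len; i++) {
		unsigned char c = s[i];
		if (c == '"' || c < 0x21 || c > 0x7f) {
			/* escape the entire string as hex, 2 digits per byte */
			for (size_t j = 0; j < len; j++)
				printf("%02X", (unsigned char)s[j]);
			putchar('\n');
			return;
		}
	}
	printf("\"%.*s\"\n", (int)len, s);	/* safe: quote it verbatim */
}

int main(void)
{
	log_untrusted("passwd", strlen("passwd"));	/* prints "passwd"       */
	log_untrusted("a b", strlen("a b"));		/* space => hex: 612062 */
	return 0;
}

Escaping the entire field rather than individual bytes keeps audit records unambiguous for parsers: a value is either fully quoted or fully hex.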
diff --git a/kernel/audit.h b/kernel/audit.h
index 6f733920fd32..8323e4132a33 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -19,9 +19,9 @@
  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  */
 
-#include <linux/mutex.h>
 #include <linux/fs.h>
 #include <linux/audit.h>
+#include <linux/skbuff.h>
 
 /* 0 = no checking
    1 = put_count checking
@@ -53,6 +53,18 @@ enum audit_state {
 };
 
 /* Rule lists */
+struct audit_parent;
+
+struct audit_watch {
+	atomic_t		count;	/* reference count */
+	char			*path;	/* insertion path */
+	dev_t			dev;	/* associated superblock device */
+	unsigned long		ino;	/* associated inode number */
+	struct audit_parent	*parent; /* associated parent */
+	struct list_head	wlist;	/* entry in parent->watches list */
+	struct list_head	rules;	/* associated rules */
+};
+
 struct audit_field {
 	u32			type;
 	u32			val;
@@ -70,6 +82,9 @@ struct audit_krule {
 	u32			buflen;	/* for data alloc on list rules */
 	u32			field_count;
 	struct audit_field	*fields;
+	struct audit_field	*inode_f; /* quick access to an inode field */
+	struct audit_watch	*watch;	/* associated watch */
+	struct list_head	rlist;	/* entry in audit_watch.rules list */
 };
 
 struct audit_entry {
@@ -78,15 +93,53 @@ struct audit_entry {
 	struct audit_krule	rule;
 };
 
-
 extern int audit_pid;
-extern int audit_comparator(const u32 left, const u32 op, const u32 right);
 
+#define AUDIT_INODE_BUCKETS	32
+extern struct list_head audit_inode_hash[AUDIT_INODE_BUCKETS];
+
+static inline int audit_hash_ino(u32 ino)
+{
+	return (ino & (AUDIT_INODE_BUCKETS-1));
+}
+
+extern int audit_comparator(const u32 left, const u32 op, const u32 right);
+extern int audit_compare_dname_path(const char *dname, const char *path,
+				    int *dirlen);
+extern struct sk_buff *audit_make_reply(int pid, int seq, int type,
+					int done, int multi,
+					void *payload, int size);
 extern void audit_send_reply(int pid, int seq, int type,
			     int done, int multi,
			     void *payload, int size);
 extern void audit_log_lost(const char *message);
 extern void audit_panic(const char *message);
-extern struct mutex audit_netlink_mutex;
 
+struct audit_netlink_list {
+	int pid;
+	struct sk_buff_head q;
+};
+
+int audit_send_list(void *);
+
+struct inotify_watch;
+extern void audit_free_parent(struct inotify_watch *);
+extern void audit_handle_ievent(struct inotify_watch *, u32, u32, u32,
+				const char *, struct inode *);
 extern int selinux_audit_rule_update(void);
+
+#ifdef CONFIG_AUDITSYSCALL
+extern void __audit_signal_info(int sig, struct task_struct *t);
+static inline void audit_signal_info(int sig, struct task_struct *t)
+{
+	if (unlikely(audit_pid && t->tgid == audit_pid))
+		__audit_signal_info(sig, t);
+}
+extern enum audit_state audit_filter_inodes(struct task_struct *,
+					    struct audit_context *);
+extern void audit_set_auditable(struct audit_context *);
+#else
+#define audit_signal_info(s,t)
+#define audit_filter_inodes(t,c) AUDIT_DISABLED
+#define audit_set_auditable(c)
+#endif
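The new audit_hash_ino() above selects one of AUDIT_INODE_BUCKETS hash chains by masking the inode number with (buckets - 1), which is equivalent to a modulo only because the bucket count is a power of two. A small illustrative check of that relationship (not kernel code):

#include <stdio.h>

#define BUCKETS 32	/* must be a power of two for the mask to equal a modulo */

static unsigned int hash_ino(unsigned int ino)
{
	return ino & (BUCKETS - 1);
}

int main(void)
{
	/* 12 and 44 land in bucket 12; 32 wraps around to bucket 0 */
	printf("%u %u %u\n", hash_ino(12), hash_ino(44), hash_ino(32));
	return 0;
}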
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 7c134906d689..4c99d2c586ed 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -22,13 +22,59 @@
 #include <linux/kernel.h>
 #include <linux/audit.h>
 #include <linux/kthread.h>
+#include <linux/mutex.h>
+#include <linux/fs.h>
+#include <linux/namei.h>
 #include <linux/netlink.h>
+#include <linux/sched.h>
+#include <linux/inotify.h>
 #include <linux/selinux.h>
 #include "audit.h"
 
-/* There are three lists of rules -- one to search at task creation
- * time, one to search at syscall entry time, and another to search at
- * syscall exit time. */
+/*
+ * Locking model:
+ *
+ * audit_filter_mutex:
+ *	Synchronizes writes and blocking reads of audit's filterlist
+ *	data.  Rcu is used to traverse the filterlist and access
+ *	contents of structs audit_entry, audit_watch and opaque
+ *	selinux rules during filtering.  If modified, these structures
+ *	must be copied and replace their counterparts in the filterlist.
+ *	An audit_parent struct is not accessed during filtering, so may
+ *	be written directly provided audit_filter_mutex is held.
+ */
+
+/*
+ * Reference counting:
+ *
+ * audit_parent: lifetime is from audit_init_parent() to receipt of an IN_IGNORED
+ *	event.  Each audit_watch holds a reference to its associated parent.
+ *
+ * audit_watch: if added to lists, lifetime is from audit_init_watch() to
+ *	audit_remove_watch().  Additionally, an audit_watch may exist
+ *	temporarily to assist in searching existing filter data.  Each
+ *	audit_krule holds a reference to its associated watch.
+ */
+
+struct audit_parent {
+	struct list_head	ilist;	/* entry in inotify registration list */
+	struct list_head	watches; /* associated watches */
+	struct inotify_watch	wdata;	/* inotify watch data */
+	unsigned		flags;	/* status flags */
+};
+
+/*
+ * audit_parent status flags:
+ *
+ * AUDIT_PARENT_INVALID - set anytime rules/watches are auto-removed due to
+ * a filesystem event to ensure we're adding audit watches to a valid parent.
+ * Technically not needed for IN_DELETE_SELF or IN_UNMOUNT events, as we cannot
+ * receive them while we have nameidata, but must be used for IN_MOVE_SELF which
+ * we can receive while holding nameidata.
+ */
+#define AUDIT_PARENT_INVALID	0x001
+
+/* Audit filter lists, defined in <linux/audit.h> */
 struct list_head audit_filter_list[AUDIT_NR_FILTERS] = {
 	LIST_HEAD_INIT(audit_filter_list[0]),
 	LIST_HEAD_INIT(audit_filter_list[1]),
@@ -41,9 +87,53 @@ struct list_head audit_filter_list[AUDIT_NR_FILTERS] = {
 #endif
 };
 
+static DEFINE_MUTEX(audit_filter_mutex);
+
+/* Inotify handle */
+extern struct inotify_handle *audit_ih;
+
+/* Inotify events we care about. */
+#define AUDIT_IN_WATCH IN_MOVE|IN_CREATE|IN_DELETE|IN_DELETE_SELF|IN_MOVE_SELF
+
+void audit_free_parent(struct inotify_watch *i_watch)
+{
+	struct audit_parent *parent;
+
+	parent = container_of(i_watch, struct audit_parent, wdata);
+	WARN_ON(!list_empty(&parent->watches));
+	kfree(parent);
+}
+
+static inline void audit_get_watch(struct audit_watch *watch)
+{
+	atomic_inc(&watch->count);
+}
+
+static void audit_put_watch(struct audit_watch *watch)
+{
+	if (atomic_dec_and_test(&watch->count)) {
+		WARN_ON(watch->parent);
+		WARN_ON(!list_empty(&watch->rules));
+		kfree(watch->path);
+		kfree(watch);
+	}
+}
+
+static void audit_remove_watch(struct audit_watch *watch)
+{
+	list_del(&watch->wlist);
+	put_inotify_watch(&watch->parent->wdata);
+	watch->parent = NULL;
+	audit_put_watch(watch); /* match initial get */
+}
+
 static inline void audit_free_rule(struct audit_entry *e)
 {
 	int i;
+
+	/* some rules don't have associated watches */
+	if (e->rule.watch)
+		audit_put_watch(e->rule.watch);
 	if (e->rule.fields)
 		for (i = 0; i < e->rule.field_count; i++) {
 			struct audit_field *f = &e->rule.fields[i];
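The audit_get_watch()/audit_put_watch() pair above is plain reference counting: every rule that points at a watch takes a reference, and the watch (with its path string) is freed only when the last reference is dropped. A user-space sketch of the same pattern, using C11 atomics and hypothetical names:

/*
 * Illustrative refcount sketch: free the object when the count drops
 * from 1 to 0.  The struct and helpers below are not the kernel's.
 */
#include <stdatomic.h>
#include <stdlib.h>
#include <string.h>

struct watch_like {
	atomic_int count;
	char *path;
};

static struct watch_like *watch_new(const char *path)
{
	struct watch_like *w = calloc(1, sizeof(*w));
	if (!w)
		return NULL;
	atomic_init(&w->count, 1);		/* initial reference */
	w->path = malloc(strlen(path) + 1);
	if (w->path)
		memcpy(w->path, path, strlen(path) + 1);
	return w;
}

static void watch_get(struct watch_like *w)
{
	atomic_fetch_add(&w->count, 1);
}

static void watch_put(struct watch_like *w)
{
	/* previous value 1 means we just released the last reference */
	if (atomic_fetch_sub(&w->count, 1) == 1) {
		free(w->path);
		free(w);
	}
}

int main(void)
{
	struct watch_like *w = watch_new("/etc/passwd");
	watch_get(w);	/* a rule takes a reference      */
	watch_put(w);	/* the rule drops it             */
	watch_put(w);	/* initial reference; frees here */
	return 0;
}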
@@ -60,6 +150,50 @@ static inline void audit_free_rule_rcu(struct rcu_head *head)
60 audit_free_rule(e); 150 audit_free_rule(e);
61} 151}
62 152
153/* Initialize a parent watch entry. */
154static struct audit_parent *audit_init_parent(struct nameidata *ndp)
155{
156 struct audit_parent *parent;
157 s32 wd;
158
159 parent = kzalloc(sizeof(*parent), GFP_KERNEL);
160 if (unlikely(!parent))
161 return ERR_PTR(-ENOMEM);
162
163 INIT_LIST_HEAD(&parent->watches);
164 parent->flags = 0;
165
166 inotify_init_watch(&parent->wdata);
167 /* grab a ref so inotify watch hangs around until we take audit_filter_mutex */
168 get_inotify_watch(&parent->wdata);
169 wd = inotify_add_watch(audit_ih, &parent->wdata, ndp->dentry->d_inode,
170 AUDIT_IN_WATCH);
171 if (wd < 0) {
172 audit_free_parent(&parent->wdata);
173 return ERR_PTR(wd);
174 }
175
176 return parent;
177}
178
179/* Initialize a watch entry. */
180static struct audit_watch *audit_init_watch(char *path)
181{
182 struct audit_watch *watch;
183
184 watch = kzalloc(sizeof(*watch), GFP_KERNEL);
185 if (unlikely(!watch))
186 return ERR_PTR(-ENOMEM);
187
188 INIT_LIST_HEAD(&watch->rules);
189 atomic_set(&watch->count, 1);
190 watch->path = path;
191 watch->dev = (dev_t)-1;
192 watch->ino = (unsigned long)-1;
193
194 return watch;
195}
196
63/* Initialize an audit filterlist entry. */ 197/* Initialize an audit filterlist entry. */
64static inline struct audit_entry *audit_init_entry(u32 field_count) 198static inline struct audit_entry *audit_init_entry(u32 field_count)
65{ 199{
@@ -107,6 +241,43 @@ static char *audit_unpack_string(void **bufp, size_t *remain, size_t len)
107 return str; 241 return str;
108} 242}
109 243
244/* Translate an inode field to kernel respresentation. */
245static inline int audit_to_inode(struct audit_krule *krule,
246 struct audit_field *f)
247{
248 if (krule->listnr != AUDIT_FILTER_EXIT ||
249 krule->watch || krule->inode_f)
250 return -EINVAL;
251
252 krule->inode_f = f;
253 return 0;
254}
255
256/* Translate a watch string to kernel respresentation. */
257static int audit_to_watch(struct audit_krule *krule, char *path, int len,
258 u32 op)
259{
260 struct audit_watch *watch;
261
262 if (!audit_ih)
263 return -EOPNOTSUPP;
264
265 if (path[0] != '/' || path[len-1] == '/' ||
266 krule->listnr != AUDIT_FILTER_EXIT ||
267 op & ~AUDIT_EQUAL ||
268 krule->inode_f || krule->watch) /* 1 inode # per rule, for hash */
269 return -EINVAL;
270
271 watch = audit_init_watch(path);
272 if (unlikely(IS_ERR(watch)))
273 return PTR_ERR(watch);
274
275 audit_get_watch(watch);
276 krule->watch = watch;
277
278 return 0;
279}
280
110/* Common user-space to kernel rule translation. */ 281/* Common user-space to kernel rule translation. */
111static inline struct audit_entry *audit_to_entry_common(struct audit_rule *rule) 282static inline struct audit_entry *audit_to_entry_common(struct audit_rule *rule)
112{ 283{
@@ -128,8 +299,11 @@ static inline struct audit_entry *audit_to_entry_common(struct audit_rule *rule)
128#endif 299#endif
129 ; 300 ;
130 } 301 }
131 if (rule->action != AUDIT_NEVER && rule->action != AUDIT_POSSIBLE && 302 if (unlikely(rule->action == AUDIT_POSSIBLE)) {
132 rule->action != AUDIT_ALWAYS) 303 printk(KERN_ERR "AUDIT_POSSIBLE is deprecated\n");
304 goto exit_err;
305 }
306 if (rule->action != AUDIT_NEVER && rule->action != AUDIT_ALWAYS)
133 goto exit_err; 307 goto exit_err;
134 if (rule->field_count > AUDIT_MAX_FIELDS) 308 if (rule->field_count > AUDIT_MAX_FIELDS)
135 goto exit_err; 309 goto exit_err;
@@ -158,6 +332,7 @@ exit_err:
158static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule) 332static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule)
159{ 333{
160 struct audit_entry *entry; 334 struct audit_entry *entry;
335 struct audit_field *f;
161 int err = 0; 336 int err = 0;
162 int i; 337 int i;
163 338
@@ -172,14 +347,37 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule)
172 f->type = rule->fields[i] & ~(AUDIT_NEGATE|AUDIT_OPERATORS); 347 f->type = rule->fields[i] & ~(AUDIT_NEGATE|AUDIT_OPERATORS);
173 f->val = rule->values[i]; 348 f->val = rule->values[i];
174 349
175 if (f->type & AUDIT_UNUSED_BITS || 350 err = -EINVAL;
176 f->type == AUDIT_SE_USER || 351 switch(f->type) {
177 f->type == AUDIT_SE_ROLE || 352 default:
178 f->type == AUDIT_SE_TYPE ||
179 f->type == AUDIT_SE_SEN ||
180 f->type == AUDIT_SE_CLR) {
181 err = -EINVAL;
182 goto exit_free; 353 goto exit_free;
354 case AUDIT_PID:
355 case AUDIT_UID:
356 case AUDIT_EUID:
357 case AUDIT_SUID:
358 case AUDIT_FSUID:
359 case AUDIT_GID:
360 case AUDIT_EGID:
361 case AUDIT_SGID:
362 case AUDIT_FSGID:
363 case AUDIT_LOGINUID:
364 case AUDIT_PERS:
365 case AUDIT_ARCH:
366 case AUDIT_MSGTYPE:
367 case AUDIT_DEVMAJOR:
368 case AUDIT_DEVMINOR:
369 case AUDIT_EXIT:
370 case AUDIT_SUCCESS:
371 case AUDIT_ARG0:
372 case AUDIT_ARG1:
373 case AUDIT_ARG2:
374 case AUDIT_ARG3:
375 break;
376 case AUDIT_INODE:
377 err = audit_to_inode(&entry->rule, f);
378 if (err)
379 goto exit_free;
380 break;
183 } 381 }
184 382
185 entry->rule.vers_ops = (f->op & AUDIT_OPERATORS) ? 2 : 1; 383 entry->rule.vers_ops = (f->op & AUDIT_OPERATORS) ? 2 : 1;
@@ -196,6 +394,18 @@ static struct audit_entry *audit_rule_to_entry(struct audit_rule *rule)
196 } 394 }
197 } 395 }
198 396
397 f = entry->rule.inode_f;
398 if (f) {
399 switch(f->op) {
400 case AUDIT_NOT_EQUAL:
401 entry->rule.inode_f = NULL;
402 case AUDIT_EQUAL:
403 break;
404 default:
405 goto exit_free;
406 }
407 }
408
199exit_nofree: 409exit_nofree:
200 return entry; 410 return entry;
201 411
@@ -210,6 +420,7 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
210{ 420{
211 int err = 0; 421 int err = 0;
212 struct audit_entry *entry; 422 struct audit_entry *entry;
423 struct audit_field *f;
213 void *bufp; 424 void *bufp;
214 size_t remain = datasz - sizeof(struct audit_rule_data); 425 size_t remain = datasz - sizeof(struct audit_rule_data);
215 int i; 426 int i;
@@ -235,6 +446,29 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
235 f->se_str = NULL; 446 f->se_str = NULL;
236 f->se_rule = NULL; 447 f->se_rule = NULL;
237 switch(f->type) { 448 switch(f->type) {
449 case AUDIT_PID:
450 case AUDIT_UID:
451 case AUDIT_EUID:
452 case AUDIT_SUID:
453 case AUDIT_FSUID:
454 case AUDIT_GID:
455 case AUDIT_EGID:
456 case AUDIT_SGID:
457 case AUDIT_FSGID:
458 case AUDIT_LOGINUID:
459 case AUDIT_PERS:
460 case AUDIT_ARCH:
461 case AUDIT_MSGTYPE:
462 case AUDIT_PPID:
463 case AUDIT_DEVMAJOR:
464 case AUDIT_DEVMINOR:
465 case AUDIT_EXIT:
466 case AUDIT_SUCCESS:
467 case AUDIT_ARG0:
468 case AUDIT_ARG1:
469 case AUDIT_ARG2:
470 case AUDIT_ARG3:
471 break;
238 case AUDIT_SE_USER: 472 case AUDIT_SE_USER:
239 case AUDIT_SE_ROLE: 473 case AUDIT_SE_ROLE:
240 case AUDIT_SE_TYPE: 474 case AUDIT_SE_TYPE:
@@ -260,6 +494,37 @@ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data,
260 } else 494 } else
261 f->se_str = str; 495 f->se_str = str;
262 break; 496 break;
497 case AUDIT_WATCH:
498 str = audit_unpack_string(&bufp, &remain, f->val);
499 if (IS_ERR(str))
500 goto exit_free;
501 entry->rule.buflen += f->val;
502
503 err = audit_to_watch(&entry->rule, str, f->val, f->op);
504 if (err) {
505 kfree(str);
506 goto exit_free;
507 }
508 break;
509 case AUDIT_INODE:
510 err = audit_to_inode(&entry->rule, f);
511 if (err)
512 goto exit_free;
513 break;
514 default:
515 goto exit_free;
516 }
517 }
518
519 f = entry->rule.inode_f;
520 if (f) {
521 switch(f->op) {
522 case AUDIT_NOT_EQUAL:
523 entry->rule.inode_f = NULL;
524 case AUDIT_EQUAL:
525 break;
526 default:
527 goto exit_free;
263 } 528 }
264 } 529 }
265 530
@@ -291,7 +556,7 @@ static struct audit_rule *audit_krule_to_rule(struct audit_krule *krule)
291 556
292 rule = kmalloc(sizeof(*rule), GFP_KERNEL); 557 rule = kmalloc(sizeof(*rule), GFP_KERNEL);
293 if (unlikely(!rule)) 558 if (unlikely(!rule))
294 return ERR_PTR(-ENOMEM); 559 return NULL;
295 memset(rule, 0, sizeof(*rule)); 560 memset(rule, 0, sizeof(*rule));
296 561
297 rule->flags = krule->flags | krule->listnr; 562 rule->flags = krule->flags | krule->listnr;
@@ -322,7 +587,7 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule)
322 587
323 data = kmalloc(sizeof(*data) + krule->buflen, GFP_KERNEL); 588 data = kmalloc(sizeof(*data) + krule->buflen, GFP_KERNEL);
324 if (unlikely(!data)) 589 if (unlikely(!data))
325 return ERR_PTR(-ENOMEM); 590 return NULL;
326 memset(data, 0, sizeof(*data)); 591 memset(data, 0, sizeof(*data));
327 592
328 data->flags = krule->flags | krule->listnr; 593 data->flags = krule->flags | krule->listnr;
@@ -343,6 +608,10 @@ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule)
343 data->buflen += data->values[i] = 608 data->buflen += data->values[i] =
344 audit_pack_string(&bufp, f->se_str); 609 audit_pack_string(&bufp, f->se_str);
345 break; 610 break;
611 case AUDIT_WATCH:
612 data->buflen += data->values[i] =
613 audit_pack_string(&bufp, krule->watch->path);
614 break;
346 default: 615 default:
347 data->values[i] = f->val; 616 data->values[i] = f->val;
348 } 617 }
@@ -378,6 +647,10 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b)
378 if (strcmp(a->fields[i].se_str, b->fields[i].se_str)) 647 if (strcmp(a->fields[i].se_str, b->fields[i].se_str))
379 return 1; 648 return 1;
380 break; 649 break;
650 case AUDIT_WATCH:
651 if (strcmp(a->watch->path, b->watch->path))
652 return 1;
653 break;
381 default: 654 default:
382 if (a->fields[i].val != b->fields[i].val) 655 if (a->fields[i].val != b->fields[i].val)
383 return 1; 656 return 1;
@@ -391,6 +664,32 @@ static int audit_compare_rule(struct audit_krule *a, struct audit_krule *b)
391 return 0; 664 return 0;
392} 665}
393 666
667/* Duplicate the given audit watch. The new watch's rules list is initialized
668 * to an empty list and wlist is undefined. */
669static struct audit_watch *audit_dupe_watch(struct audit_watch *old)
670{
671 char *path;
672 struct audit_watch *new;
673
674 path = kstrdup(old->path, GFP_KERNEL);
675 if (unlikely(!path))
676 return ERR_PTR(-ENOMEM);
677
678 new = audit_init_watch(path);
679 if (unlikely(IS_ERR(new))) {
680 kfree(path);
681 goto out;
682 }
683
684 new->dev = old->dev;
685 new->ino = old->ino;
686 get_inotify_watch(&old->parent->wdata);
687 new->parent = old->parent;
688
689out:
690 return new;
691}
692
394/* Duplicate selinux field information. The se_rule is opaque, so must be 693/* Duplicate selinux field information. The se_rule is opaque, so must be
395 * re-initialized. */ 694 * re-initialized. */
396static inline int audit_dupe_selinux_field(struct audit_field *df, 695static inline int audit_dupe_selinux_field(struct audit_field *df,
@@ -422,8 +721,11 @@ static inline int audit_dupe_selinux_field(struct audit_field *df,
422/* Duplicate an audit rule. This will be a deep copy with the exception 721/* Duplicate an audit rule. This will be a deep copy with the exception
423 * of the watch - that pointer is carried over. The selinux specific fields 722 * of the watch - that pointer is carried over. The selinux specific fields
424 * will be updated in the copy. The point is to be able to replace the old 723 * will be updated in the copy. The point is to be able to replace the old
425 * rule with the new rule in the filterlist, then free the old rule. */ 724 * rule with the new rule in the filterlist, then free the old rule.
426static struct audit_entry *audit_dupe_rule(struct audit_krule *old) 725 * The rlist element is undefined; list manipulations are handled apart from
726 * the initial copy. */
727static struct audit_entry *audit_dupe_rule(struct audit_krule *old,
728 struct audit_watch *watch)
427{ 729{
428 u32 fcount = old->field_count; 730 u32 fcount = old->field_count;
429 struct audit_entry *entry; 731 struct audit_entry *entry;
@@ -442,6 +744,8 @@ static struct audit_entry *audit_dupe_rule(struct audit_krule *old)
442 for (i = 0; i < AUDIT_BITMASK_SIZE; i++) 744 for (i = 0; i < AUDIT_BITMASK_SIZE; i++)
443 new->mask[i] = old->mask[i]; 745 new->mask[i] = old->mask[i];
444 new->buflen = old->buflen; 746 new->buflen = old->buflen;
747 new->inode_f = old->inode_f;
748 new->watch = NULL;
445 new->field_count = old->field_count; 749 new->field_count = old->field_count;
446 memcpy(new->fields, old->fields, sizeof(struct audit_field) * fcount); 750 memcpy(new->fields, old->fields, sizeof(struct audit_field) * fcount);
447 751
@@ -463,68 +767,409 @@ static struct audit_entry *audit_dupe_rule(struct audit_krule *old)
463 } 767 }
464 } 768 }
465 769
770 if (watch) {
771 audit_get_watch(watch);
772 new->watch = watch;
773 }
774
466 return entry; 775 return entry;
467} 776}
468 777
469/* Add rule to given filterlist if not a duplicate. Protected by 778/* Update inode info in audit rules based on filesystem event. */
470 * audit_netlink_mutex. */ 779static void audit_update_watch(struct audit_parent *parent,
780 const char *dname, dev_t dev,
781 unsigned long ino, unsigned invalidating)
782{
783 struct audit_watch *owatch, *nwatch, *nextw;
784 struct audit_krule *r, *nextr;
785 struct audit_entry *oentry, *nentry;
786 struct audit_buffer *ab;
787
788 mutex_lock(&audit_filter_mutex);
789 list_for_each_entry_safe(owatch, nextw, &parent->watches, wlist) {
790 if (audit_compare_dname_path(dname, owatch->path, NULL))
791 continue;
792
793 /* If the update involves invalidating rules, do the inode-based
794 * filtering now, so we don't omit records. */
795 if (invalidating &&
796 audit_filter_inodes(current, current->audit_context) == AUDIT_RECORD_CONTEXT)
797 audit_set_auditable(current->audit_context);
798
799 nwatch = audit_dupe_watch(owatch);
800 if (unlikely(IS_ERR(nwatch))) {
801 mutex_unlock(&audit_filter_mutex);
802 audit_panic("error updating watch, skipping");
803 return;
804 }
805 nwatch->dev = dev;
806 nwatch->ino = ino;
807
808 list_for_each_entry_safe(r, nextr, &owatch->rules, rlist) {
809
810 oentry = container_of(r, struct audit_entry, rule);
811 list_del(&oentry->rule.rlist);
812 list_del_rcu(&oentry->list);
813
814 nentry = audit_dupe_rule(&oentry->rule, nwatch);
815 if (unlikely(IS_ERR(nentry)))
816 audit_panic("error updating watch, removing");
817 else {
818 int h = audit_hash_ino((u32)ino);
819 list_add(&nentry->rule.rlist, &nwatch->rules);
820 list_add_rcu(&nentry->list, &audit_inode_hash[h]);
821 }
822
823 call_rcu(&oentry->rcu, audit_free_rule_rcu);
824 }
825
826 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
827 audit_log_format(ab, "audit updated rules specifying watch=");
828 audit_log_untrustedstring(ab, owatch->path);
829 audit_log_format(ab, " with dev=%u ino=%lu\n", dev, ino);
830 audit_log_end(ab);
831
832 audit_remove_watch(owatch);
833 goto add_watch_to_parent; /* event applies to a single watch */
834 }
835 mutex_unlock(&audit_filter_mutex);
836 return;
837
838add_watch_to_parent:
839 list_add(&nwatch->wlist, &parent->watches);
840 mutex_unlock(&audit_filter_mutex);
841 return;
842}
843
844/* Remove all watches & rules associated with a parent that is going away. */
845static void audit_remove_parent_watches(struct audit_parent *parent)
846{
847 struct audit_watch *w, *nextw;
848 struct audit_krule *r, *nextr;
849 struct audit_entry *e;
850
851 mutex_lock(&audit_filter_mutex);
852 parent->flags |= AUDIT_PARENT_INVALID;
853 list_for_each_entry_safe(w, nextw, &parent->watches, wlist) {
854 list_for_each_entry_safe(r, nextr, &w->rules, rlist) {
855 e = container_of(r, struct audit_entry, rule);
856 list_del(&r->rlist);
857 list_del_rcu(&e->list);
858 call_rcu(&e->rcu, audit_free_rule_rcu);
859
860 audit_log(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE,
861 "audit implicitly removed rule from list=%d\n",
862 AUDIT_FILTER_EXIT);
863 }
864 audit_remove_watch(w);
865 }
866 mutex_unlock(&audit_filter_mutex);
867}
868
869/* Unregister inotify watches for parents on in_list.
870 * Generates an IN_IGNORED event. */
871static void audit_inotify_unregister(struct list_head *in_list)
872{
873 struct audit_parent *p, *n;
874
875 list_for_each_entry_safe(p, n, in_list, ilist) {
876 list_del(&p->ilist);
877 inotify_rm_watch(audit_ih, &p->wdata);
878 /* the put matching the get in audit_do_del_rule() */
879 put_inotify_watch(&p->wdata);
880 }
881}
882
883/* Find an existing audit rule.
884 * Caller must hold audit_filter_mutex to prevent stale rule data. */
885static struct audit_entry *audit_find_rule(struct audit_entry *entry,
886 struct list_head *list)
887{
888 struct audit_entry *e, *found = NULL;
889 int h;
890
891 if (entry->rule.watch) {
892 /* we don't know the inode number, so must walk entire hash */
893 for (h = 0; h < AUDIT_INODE_BUCKETS; h++) {
894 list = &audit_inode_hash[h];
895 list_for_each_entry(e, list, list)
896 if (!audit_compare_rule(&entry->rule, &e->rule)) {
897 found = e;
898 goto out;
899 }
900 }
901 goto out;
902 }
903
904 list_for_each_entry(e, list, list)
905 if (!audit_compare_rule(&entry->rule, &e->rule)) {
906 found = e;
907 goto out;
908 }
909
910out:
911 return found;
912}
913
914/* Get path information necessary for adding watches. */
915static int audit_get_nd(char *path, struct nameidata **ndp,
916 struct nameidata **ndw)
917{
918 struct nameidata *ndparent, *ndwatch;
919 int err;
920
921 ndparent = kmalloc(sizeof(*ndparent), GFP_KERNEL);
922 if (unlikely(!ndparent))
923 return -ENOMEM;
924
925 ndwatch = kmalloc(sizeof(*ndwatch), GFP_KERNEL);
926 if (unlikely(!ndwatch)) {
927 kfree(ndparent);
928 return -ENOMEM;
929 }
930
931 err = path_lookup(path, LOOKUP_PARENT, ndparent);
932 if (err) {
933 kfree(ndparent);
934 kfree(ndwatch);
935 return err;
936 }
937
938 err = path_lookup(path, 0, ndwatch);
939 if (err) {
940 kfree(ndwatch);
941 ndwatch = NULL;
942 }
943
944 *ndp = ndparent;
945 *ndw = ndwatch;
946
947 return 0;
948}
949
950/* Release resources used for watch path information. */
951static void audit_put_nd(struct nameidata *ndp, struct nameidata *ndw)
952{
953 if (ndp) {
954 path_release(ndp);
955 kfree(ndp);
956 }
957 if (ndw) {
958 path_release(ndw);
959 kfree(ndw);
960 }
961}
962
963/* Associate the given rule with an existing parent inotify_watch.
964 * Caller must hold audit_filter_mutex. */
965static void audit_add_to_parent(struct audit_krule *krule,
966 struct audit_parent *parent)
967{
968 struct audit_watch *w, *watch = krule->watch;
969 int watch_found = 0;
970
971 list_for_each_entry(w, &parent->watches, wlist) {
972 if (strcmp(watch->path, w->path))
973 continue;
974
975 watch_found = 1;
976
977 /* put krule's and initial refs to temporary watch */
978 audit_put_watch(watch);
979 audit_put_watch(watch);
980
981 audit_get_watch(w);
982 krule->watch = watch = w;
983 break;
984 }
985
986 if (!watch_found) {
987 get_inotify_watch(&parent->wdata);
988 watch->parent = parent;
989
990 list_add(&watch->wlist, &parent->watches);
991 }
992 list_add(&krule->rlist, &watch->rules);
993}
994
995/* Find a matching watch entry, or add this one.
996 * Caller must hold audit_filter_mutex. */
997static int audit_add_watch(struct audit_krule *krule, struct nameidata *ndp,
998 struct nameidata *ndw)
999{
1000 struct audit_watch *watch = krule->watch;
1001 struct inotify_watch *i_watch;
1002 struct audit_parent *parent;
1003 int ret = 0;
1004
1005 /* update watch filter fields */
1006 if (ndw) {
1007 watch->dev = ndw->dentry->d_inode->i_sb->s_dev;
1008 watch->ino = ndw->dentry->d_inode->i_ino;
1009 }
1010
1011 /* The audit_filter_mutex must not be held during inotify calls because
1012 * we hold it during inotify event callback processing. If an existing
1013 * inotify watch is found, inotify_find_watch() grabs a reference before
1014 * returning.
1015 */
1016 mutex_unlock(&audit_filter_mutex);
1017
1018 if (inotify_find_watch(audit_ih, ndp->dentry->d_inode, &i_watch) < 0) {
1019 parent = audit_init_parent(ndp);
1020 if (IS_ERR(parent)) {
1021 /* caller expects mutex locked */
1022 mutex_lock(&audit_filter_mutex);
1023 return PTR_ERR(parent);
1024 }
1025 } else
1026 parent = container_of(i_watch, struct audit_parent, wdata);
1027
1028 mutex_lock(&audit_filter_mutex);
1029
1030 /* parent was moved before we took audit_filter_mutex */
1031 if (parent->flags & AUDIT_PARENT_INVALID)
1032 ret = -ENOENT;
1033 else
1034 audit_add_to_parent(krule, parent);
1035
1036 /* match get in audit_init_parent or inotify_find_watch */
1037 put_inotify_watch(&parent->wdata);
1038 return ret;
1039}
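
The unlock/re-lock window above is the point of the whole function: inotify delivers events through audit_handle_ievent(), which takes audit_filter_mutex, so calling into inotify while holding that mutex risks deadlock. A deliberately simplified userspace rendering (pthreads, invented names) of why the lock is dropped and why the AUDIT_PARENT_INVALID state must be re-checked afterwards:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t filter_lock = PTHREAD_MUTEX_INITIALIZER;	/* stands in for audit_filter_mutex */
static int parent_invalid;		/* stands in for AUDIT_PARENT_INVALID */

static void event_callback(void)	/* analogue of audit_handle_ievent(): needs filter_lock */
{
	pthread_mutex_lock(&filter_lock);
	parent_invalid = 1;		/* e.g. the watched directory was moved away */
	pthread_mutex_unlock(&filter_lock);
}

static void subsystem_lookup(void)	/* analogue of inotify_find_watch(): may run callbacks */
{
	event_callback();
}

int main(void)
{
	pthread_mutex_lock(&filter_lock);
	/* calling subsystem_lookup() here would block forever: the callback
	 * wants the lock we already hold (in the kernel, a lock-ordering
	 * deadlock between audit_filter_mutex and inotify's own locking). */
	pthread_mutex_unlock(&filter_lock);
	subsystem_lookup();
	pthread_mutex_lock(&filter_lock);
	if (parent_invalid)	/* state may have changed while the lock was dropped */
		printf("parent moved: return -ENOENT, as audit_add_watch() does\n");
	pthread_mutex_unlock(&filter_lock);
	return 0;
}
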
1040
1041/* Add rule to given filterlist if not a duplicate. */
471static inline int audit_add_rule(struct audit_entry *entry, 1042static inline int audit_add_rule(struct audit_entry *entry,
472 struct list_head *list) 1043 struct list_head *list)
473{ 1044{
474 struct audit_entry *e; 1045 struct audit_entry *e;
1046 struct audit_field *inode_f = entry->rule.inode_f;
1047 struct audit_watch *watch = entry->rule.watch;
1048 struct nameidata *ndp, *ndw;
1049 int h, err, putnd_needed = 0;
1050
1051 if (inode_f) {
1052 h = audit_hash_ino(inode_f->val);
1053 list = &audit_inode_hash[h];
1054 }
475 1055
476 /* Do not use the _rcu iterator here, since this is the only 1056 mutex_lock(&audit_filter_mutex);
477 * addition routine. */ 1057 e = audit_find_rule(entry, list);
478 list_for_each_entry(e, list, list) { 1058 mutex_unlock(&audit_filter_mutex);
479 if (!audit_compare_rule(&entry->rule, &e->rule)) 1059 if (e) {
480 return -EEXIST; 1060 err = -EEXIST;
1061 goto error;
1062 }
1063
1064 /* Avoid calling path_lookup under audit_filter_mutex. */
1065 if (watch) {
1066 err = audit_get_nd(watch->path, &ndp, &ndw);
1067 if (err)
1068 goto error;
1069 putnd_needed = 1;
1070 }
1071
1072 mutex_lock(&audit_filter_mutex);
1073 if (watch) {
1074 /* audit_filter_mutex is dropped and re-taken during this call */
1075 err = audit_add_watch(&entry->rule, ndp, ndw);
1076 if (err) {
1077 mutex_unlock(&audit_filter_mutex);
1078 goto error;
1079 }
1080 h = audit_hash_ino((u32)watch->ino);
1081 list = &audit_inode_hash[h];
481 } 1082 }
482 1083
483 if (entry->rule.flags & AUDIT_FILTER_PREPEND) { 1084 if (entry->rule.flags & AUDIT_FILTER_PREPEND) {
484 list_add_rcu(&entry->list, list); 1085 list_add_rcu(&entry->list, list);
1086 entry->rule.flags &= ~AUDIT_FILTER_PREPEND;
485 } else { 1087 } else {
486 list_add_tail_rcu(&entry->list, list); 1088 list_add_tail_rcu(&entry->list, list);
487 } 1089 }
1090 mutex_unlock(&audit_filter_mutex);
488 1091
489 return 0; 1092 if (putnd_needed)
1093 audit_put_nd(ndp, ndw);
1094
1095 return 0;
1096
1097error:
1098 if (putnd_needed)
1099 audit_put_nd(ndp, ndw);
1100 if (watch)
1101 audit_put_watch(watch); /* tmp watch, matches initial get */
1102 return err;
490} 1103}
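
Insertion stays list_add_rcu()/list_add_tail_rcu() even though it already runs under audit_filter_mutex: the mutex only serializes writers, while per-syscall filtering walks the same lists with no lock at all. The matching read side, as used by audit_filter_syscall() and the new audit_filter_inodes() in auditsc.c further down:

	struct audit_entry *e;

	rcu_read_lock();
	list_for_each_entry_rcu(e, list, list) {
		/* e stays valid for the whole read-side section even if a writer
		 * unlinks it concurrently with list_del_rcu(); the actual free is
		 * deferred through call_rcu(&e->rcu, audit_free_rule_rcu). */
		/* ... evaluate e->rule against the current task/context ... */
	}
	rcu_read_unlock();
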
491 1104
492/* Remove an existing rule from filterlist. Protected by 1105/* Remove an existing rule from filterlist. */
493 * audit_netlink_mutex. */
494static inline int audit_del_rule(struct audit_entry *entry, 1106static inline int audit_del_rule(struct audit_entry *entry,
495 struct list_head *list) 1107 struct list_head *list)
496{ 1108{
497 struct audit_entry *e; 1109 struct audit_entry *e;
1110 struct audit_field *inode_f = entry->rule.inode_f;
1111 struct audit_watch *watch, *tmp_watch = entry->rule.watch;
1112 LIST_HEAD(inotify_list);
1113 int h, ret = 0;
1114
1115 if (inode_f) {
1116 h = audit_hash_ino(inode_f->val);
1117 list = &audit_inode_hash[h];
1118 }
498 1119
499 /* Do not use the _rcu iterator here, since this is the only 1120 mutex_lock(&audit_filter_mutex);
500 * deletion routine. */ 1121 e = audit_find_rule(entry, list);
501 list_for_each_entry(e, list, list) { 1122 if (!e) {
502 if (!audit_compare_rule(&entry->rule, &e->rule)) { 1123 mutex_unlock(&audit_filter_mutex);
503 list_del_rcu(&e->list); 1124 ret = -ENOENT;
504 call_rcu(&e->rcu, audit_free_rule_rcu); 1125 goto out;
505 return 0; 1126 }
1127
1128 watch = e->rule.watch;
1129 if (watch) {
1130 struct audit_parent *parent = watch->parent;
1131
1132 list_del(&e->rule.rlist);
1133
1134 if (list_empty(&watch->rules)) {
1135 audit_remove_watch(watch);
1136
1137 if (list_empty(&parent->watches)) {
1138 /* Put parent on the inotify un-registration
1139 * list. Grab a reference before releasing
1140 * audit_filter_mutex, to be released in
1141 * audit_inotify_unregister(). */
1142 list_add(&parent->ilist, &inotify_list);
1143 get_inotify_watch(&parent->wdata);
1144 }
506 } 1145 }
507 } 1146 }
508 return -ENOENT; /* No matching rule */ 1147
1148 list_del_rcu(&e->list);
1149 call_rcu(&e->rcu, audit_free_rule_rcu);
1150
1151 mutex_unlock(&audit_filter_mutex);
1152
1153 if (!list_empty(&inotify_list))
1154 audit_inotify_unregister(&inotify_list);
1155
1156out:
1157 if (tmp_watch)
1158 audit_put_watch(tmp_watch); /* match initial get */
1159
1160 return ret;
509} 1161}
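
The parents queued on inotify_list are handed to audit_inotify_unregister() only after audit_filter_mutex is released, because inotify_rm_watch() can sleep and can trigger further callbacks. Only the tail of that helper is visible at the very top of this hunk; a reconstruction for context (loop structure inferred, identifiers taken from the visible fragment):

/* Unregister inotify watches for the parents queued on in_list. */
static void audit_inotify_unregister(struct list_head *in_list)
{
	struct audit_parent *p, *n;

	list_for_each_entry_safe(p, n, in_list, ilist) {
		list_del(&p->ilist);
		inotify_rm_watch(audit_ih, &p->wdata);
		/* the put matching the get taken under audit_filter_mutex above */
		put_inotify_watch(&p->wdata);
	}
}
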
510 1162
511/* List rules using struct audit_rule. Exists for backward 1163/* List rules using struct audit_rule. Exists for backward
512 * compatibility with userspace. */ 1164 * compatibility with userspace. */
513static int audit_list(void *_dest) 1165static void audit_list(int pid, int seq, struct sk_buff_head *q)
514{ 1166{
515 int pid, seq; 1167 struct sk_buff *skb;
516 int *dest = _dest;
517 struct audit_entry *entry; 1168 struct audit_entry *entry;
518 int i; 1169 int i;
519 1170
520 pid = dest[0]; 1171 /* This is a blocking read, so use audit_filter_mutex instead of rcu
521 seq = dest[1]; 1172 * iterator to sync with list writers. */
522 kfree(dest);
523
524 mutex_lock(&audit_netlink_mutex);
525
526 /* The *_rcu iterators not needed here because we are
527 always called with audit_netlink_mutex held. */
528 for (i=0; i<AUDIT_NR_FILTERS; i++) { 1173 for (i=0; i<AUDIT_NR_FILTERS; i++) {
529 list_for_each_entry(entry, &audit_filter_list[i], list) { 1174 list_for_each_entry(entry, &audit_filter_list[i], list) {
530 struct audit_rule *rule; 1175 struct audit_rule *rule;
@@ -532,33 +1177,41 @@ static int audit_list(void *_dest)
532 rule = audit_krule_to_rule(&entry->rule); 1177 rule = audit_krule_to_rule(&entry->rule);
533 if (unlikely(!rule)) 1178 if (unlikely(!rule))
534 break; 1179 break;
535 audit_send_reply(pid, seq, AUDIT_LIST, 0, 1, 1180 skb = audit_make_reply(pid, seq, AUDIT_LIST, 0, 1,
536 rule, sizeof(*rule)); 1181 rule, sizeof(*rule));
1182 if (skb)
1183 skb_queue_tail(q, skb);
537 kfree(rule); 1184 kfree(rule);
538 } 1185 }
539 } 1186 }
540 audit_send_reply(pid, seq, AUDIT_LIST, 1, 1, NULL, 0); 1187 for (i = 0; i < AUDIT_INODE_BUCKETS; i++) {
541 1188 list_for_each_entry(entry, &audit_inode_hash[i], list) {
542 mutex_unlock(&audit_netlink_mutex); 1189 struct audit_rule *rule;
543 return 0; 1190
1191 rule = audit_krule_to_rule(&entry->rule);
1192 if (unlikely(!rule))
1193 break;
1194 skb = audit_make_reply(pid, seq, AUDIT_LIST, 0, 1,
1195 rule, sizeof(*rule));
1196 if (skb)
1197 skb_queue_tail(q, skb);
1198 kfree(rule);
1199 }
1200 }
1201 skb = audit_make_reply(pid, seq, AUDIT_LIST, 1, 1, NULL, 0);
1202 if (skb)
1203 skb_queue_tail(q, skb);
544} 1204}
545 1205
546/* List rules using struct audit_rule_data. */ 1206/* List rules using struct audit_rule_data. */
547static int audit_list_rules(void *_dest) 1207static void audit_list_rules(int pid, int seq, struct sk_buff_head *q)
548{ 1208{
549 int pid, seq; 1209 struct sk_buff *skb;
550 int *dest = _dest;
551 struct audit_entry *e; 1210 struct audit_entry *e;
552 int i; 1211 int i;
553 1212
554 pid = dest[0]; 1213 /* This is a blocking read, so use audit_filter_mutex instead of rcu
555 seq = dest[1]; 1214 * iterator to sync with list writers. */
556 kfree(dest);
557
558 mutex_lock(&audit_netlink_mutex);
559
560 /* The *_rcu iterators not needed here because we are
561 always called with audit_netlink_mutex held. */
562 for (i=0; i<AUDIT_NR_FILTERS; i++) { 1215 for (i=0; i<AUDIT_NR_FILTERS; i++) {
563 list_for_each_entry(e, &audit_filter_list[i], list) { 1216 list_for_each_entry(e, &audit_filter_list[i], list) {
564 struct audit_rule_data *data; 1217 struct audit_rule_data *data;
@@ -566,15 +1219,30 @@ static int audit_list_rules(void *_dest)
566 data = audit_krule_to_data(&e->rule); 1219 data = audit_krule_to_data(&e->rule);
567 if (unlikely(!data)) 1220 if (unlikely(!data))
568 break; 1221 break;
569 audit_send_reply(pid, seq, AUDIT_LIST_RULES, 0, 1, 1222 skb = audit_make_reply(pid, seq, AUDIT_LIST_RULES, 0, 1,
570 data, sizeof(*data)); 1223 data, sizeof(*data) + data->buflen);
1224 if (skb)
1225 skb_queue_tail(q, skb);
571 kfree(data); 1226 kfree(data);
572 } 1227 }
573 } 1228 }
574 audit_send_reply(pid, seq, AUDIT_LIST_RULES, 1, 1, NULL, 0); 1229 for (i=0; i< AUDIT_INODE_BUCKETS; i++) {
1230 list_for_each_entry(e, &audit_inode_hash[i], list) {
1231 struct audit_rule_data *data;
575 1232
576 mutex_unlock(&audit_netlink_mutex); 1233 data = audit_krule_to_data(&e->rule);
577 return 0; 1234 if (unlikely(!data))
1235 break;
1236 skb = audit_make_reply(pid, seq, AUDIT_LIST_RULES, 0, 1,
1237 data, sizeof(*data) + data->buflen);
1238 if (skb)
1239 skb_queue_tail(q, skb);
1240 kfree(data);
1241 }
1242 }
1243 skb = audit_make_reply(pid, seq, AUDIT_LIST_RULES, 1, 1, NULL, 0);
1244 if (skb)
1245 skb_queue_tail(q, skb);
578} 1246}
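
The reply size grows from sizeof(*data) to sizeof(*data) + data->buflen because struct audit_rule_data ends in a variable-length buffer holding the rule's string fields (watch paths, SELinux contexts); sending only sizeof(*data) would truncate them. For reference, the layout is approximately the following (per include/linux/audit.h of this era, comments added here):

struct audit_rule_data {
	__u32	flags;				/* filter list number + AUDIT_FILTER_PREPEND */
	__u32	action;				/* AUDIT_NEVER, AUDIT_ALWAYS, ... */
	__u32	field_count;
	__u32	mask[AUDIT_BITMASK_SIZE];	/* syscall number bitmap */
	__u32	fields[AUDIT_MAX_FIELDS];
	__u32	values[AUDIT_MAX_FIELDS];
	__u32	fieldflags[AUDIT_MAX_FIELDS];
	__u32	buflen;				/* total length of the string data below */
	char	buf[0];				/* string fields, e.g. the watch path */
};
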
579 1247
580/** 1248/**
@@ -592,7 +1260,7 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
592 size_t datasz, uid_t loginuid, u32 sid) 1260 size_t datasz, uid_t loginuid, u32 sid)
593{ 1261{
594 struct task_struct *tsk; 1262 struct task_struct *tsk;
595 int *dest; 1263 struct audit_netlink_list *dest;
596 int err = 0; 1264 int err = 0;
597 struct audit_entry *entry; 1265 struct audit_entry *entry;
598 1266
@@ -605,18 +1273,22 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
605 * happen if we're actually running in the context of auditctl 1273 * happen if we're actually running in the context of auditctl
606 * trying to _send_ the stuff */ 1274 * trying to _send_ the stuff */
607 1275
608 dest = kmalloc(2 * sizeof(int), GFP_KERNEL); 1276 dest = kmalloc(sizeof(struct audit_netlink_list), GFP_KERNEL);
609 if (!dest) 1277 if (!dest)
610 return -ENOMEM; 1278 return -ENOMEM;
611 dest[0] = pid; 1279 dest->pid = pid;
612 dest[1] = seq; 1280 skb_queue_head_init(&dest->q);
613 1281
1282 mutex_lock(&audit_filter_mutex);
614 if (type == AUDIT_LIST) 1283 if (type == AUDIT_LIST)
615 tsk = kthread_run(audit_list, dest, "audit_list"); 1284 audit_list(pid, seq, &dest->q);
616 else 1285 else
617 tsk = kthread_run(audit_list_rules, dest, 1286 audit_list_rules(pid, seq, &dest->q);
618 "audit_list_rules"); 1287 mutex_unlock(&audit_filter_mutex);
1288
1289 tsk = kthread_run(audit_send_list, dest, "audit_send_list");
619 if (IS_ERR(tsk)) { 1290 if (IS_ERR(tsk)) {
1291 skb_queue_purge(&dest->q);
620 kfree(dest); 1292 kfree(dest);
621 err = PTR_ERR(tsk); 1293 err = PTR_ERR(tsk);
622 } 1294 }
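
audit_receive_filter() now builds the complete reply queue synchronously under audit_filter_mutex and hands it to a short-lived kthread, so no netlink traffic is generated while an audit lock is held. The queue handle and its consumer live in kernel/audit.{h,c} rather than in this hunk; a plausible shape, with the kthread body inferred (any synchronization against the netlink command mutex is omitted, and symbol names outside this hunk are assumptions):

/* kernel/audit.h */
struct audit_netlink_list {
	int			pid;	/* requester to unicast the replies to */
	struct sk_buff_head	q;	/* pre-built AUDIT_LIST* reply skbs */
};

/* kernel/audit.c, started via kthread_run(audit_send_list, dest, ...) above */
int audit_send_list(void *_dest)
{
	struct audit_netlink_list *dest = _dest;
	struct sk_buff *skb;

	/* drain the queue filled by audit_list()/audit_list_rules() */
	while ((skb = __skb_dequeue(&dest->q)) != NULL)
		netlink_unicast(audit_sock, skb, dest->pid, 0);

	kfree(dest);
	return 0;
}
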
@@ -632,6 +1304,7 @@ int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
632 1304
633 err = audit_add_rule(entry, 1305 err = audit_add_rule(entry,
634 &audit_filter_list[entry->rule.listnr]); 1306 &audit_filter_list[entry->rule.listnr]);
1307
635 if (sid) { 1308 if (sid) {
636 char *ctx = NULL; 1309 char *ctx = NULL;
637 u32 len; 1310 u32 len;
@@ -712,7 +1385,43 @@ int audit_comparator(const u32 left, const u32 op, const u32 right)
712 return 0; 1385 return 0;
713} 1386}
714 1387
1388/* Compare given dentry name with last component in given path,
1389 * return of 0 indicates a match. */
1390int audit_compare_dname_path(const char *dname, const char *path,
1391 int *dirlen)
1392{
1393 int dlen, plen;
1394 const char *p;
715 1395
1396 if (!dname || !path)
1397 return 1;
1398
1399 dlen = strlen(dname);
1400 plen = strlen(path);
1401 if (plen < dlen)
1402 return 1;
1403
1404 /* disregard trailing slashes */
1405 p = path + plen - 1;
1406 while ((*p == '/') && (p > path))
1407 p--;
1408
1409 /* find last path component */
1410 p = p - dlen + 1;
1411 if (p < path)
1412 return 1;
1413 else if (p > path) {
1414 if (*--p != '/')
1415 return 1;
1416 else
1417 p++;
1418 }
1419
1420 /* return length of path's directory component */
1421 if (dirlen)
1422 *dirlen = p - path;
1423 return strncmp(p, dname, dlen);
1424}
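
A quick userspace check of the matching rules above (trailing slashes are ignored, the component must either start the string or be preceded by '/', and dirlen reports the length of the directory prefix):

#include <stdio.h>
#include <string.h>

/* same logic as audit_compare_dname_path() above, userspace types only */
static int compare_dname_path(const char *dname, const char *path, int *dirlen)
{
	int dlen, plen;
	const char *p;

	if (!dname || !path)
		return 1;
	dlen = strlen(dname);
	plen = strlen(path);
	if (plen < dlen)
		return 1;
	p = path + plen - 1;
	while (*p == '/' && p > path)		/* disregard trailing slashes */
		p--;
	p = p - dlen + 1;			/* candidate last component */
	if (p < path)
		return 1;
	if (p > path) {
		if (*--p != '/')
			return 1;
		p++;
	}
	if (dirlen)
		*dirlen = p - path;
	return strncmp(p, dname, dlen);
}

int main(void)
{
	int dirlen = 0;

	/* matches: prints 0 and dirlen=5 (the "/etc/" prefix) */
	printf("%d\n", compare_dname_path("passwd", "/etc/passwd", &dirlen));
	printf("dirlen=%d\n", dirlen);
	/* no match: "passwd" is not the last component of "/etc/passwd-" */
	printf("%d\n", compare_dname_path("passwd", "/etc/passwd-", NULL));
	return 0;
}
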
716 1425
717static int audit_filter_user_rules(struct netlink_skb_parms *cb, 1426static int audit_filter_user_rules(struct netlink_skb_parms *cb,
718 struct audit_krule *rule, 1427 struct audit_krule *rule,
@@ -744,7 +1453,6 @@ static int audit_filter_user_rules(struct netlink_skb_parms *cb,
744 } 1453 }
745 switch (rule->action) { 1454 switch (rule->action) {
746 case AUDIT_NEVER: *state = AUDIT_DISABLED; break; 1455 case AUDIT_NEVER: *state = AUDIT_DISABLED; break;
747 case AUDIT_POSSIBLE: *state = AUDIT_BUILD_CONTEXT; break;
748 case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break; 1456 case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break;
749 } 1457 }
750 return 1; 1458 return 1;
@@ -826,32 +1534,65 @@ static inline int audit_rule_has_selinux(struct audit_krule *rule)
826int selinux_audit_rule_update(void) 1534int selinux_audit_rule_update(void)
827{ 1535{
828 struct audit_entry *entry, *n, *nentry; 1536 struct audit_entry *entry, *n, *nentry;
1537 struct audit_watch *watch;
829 int i, err = 0; 1538 int i, err = 0;
830 1539
831 /* audit_netlink_mutex synchronizes the writers */ 1540 /* audit_filter_mutex synchronizes the writers */
832 mutex_lock(&audit_netlink_mutex); 1541 mutex_lock(&audit_filter_mutex);
833 1542
834 for (i = 0; i < AUDIT_NR_FILTERS; i++) { 1543 for (i = 0; i < AUDIT_NR_FILTERS; i++) {
835 list_for_each_entry_safe(entry, n, &audit_filter_list[i], list) { 1544 list_for_each_entry_safe(entry, n, &audit_filter_list[i], list) {
836 if (!audit_rule_has_selinux(&entry->rule)) 1545 if (!audit_rule_has_selinux(&entry->rule))
837 continue; 1546 continue;
838 1547
839 nentry = audit_dupe_rule(&entry->rule); 1548 watch = entry->rule.watch;
1549 nentry = audit_dupe_rule(&entry->rule, watch);
840 if (unlikely(IS_ERR(nentry))) { 1550 if (unlikely(IS_ERR(nentry))) {
841 /* save the first error encountered for the 1551 /* save the first error encountered for the
842 * return value */ 1552 * return value */
843 if (!err) 1553 if (!err)
844 err = PTR_ERR(nentry); 1554 err = PTR_ERR(nentry);
845 audit_panic("error updating selinux filters"); 1555 audit_panic("error updating selinux filters");
1556 if (watch)
1557 list_del(&entry->rule.rlist);
846 list_del_rcu(&entry->list); 1558 list_del_rcu(&entry->list);
847 } else { 1559 } else {
1560 if (watch) {
1561 list_add(&nentry->rule.rlist,
1562 &watch->rules);
1563 list_del(&entry->rule.rlist);
1564 }
848 list_replace_rcu(&entry->list, &nentry->list); 1565 list_replace_rcu(&entry->list, &nentry->list);
849 } 1566 }
850 call_rcu(&entry->rcu, audit_free_rule_rcu); 1567 call_rcu(&entry->rcu, audit_free_rule_rcu);
851 } 1568 }
852 } 1569 }
853 1570
854 mutex_unlock(&audit_netlink_mutex); 1571 mutex_unlock(&audit_filter_mutex);
855 1572
856 return err; 1573 return err;
857} 1574}
1575
1576/* Update watch data in audit rules based on inotify events. */
1577void audit_handle_ievent(struct inotify_watch *i_watch, u32 wd, u32 mask,
1578 u32 cookie, const char *dname, struct inode *inode)
1579{
1580 struct audit_parent *parent;
1581
1582 parent = container_of(i_watch, struct audit_parent, wdata);
1583
1584 if (mask & (IN_CREATE|IN_MOVED_TO) && inode)
1585 audit_update_watch(parent, dname, inode->i_sb->s_dev,
1586 inode->i_ino, 0);
1587 else if (mask & (IN_DELETE|IN_MOVED_FROM))
1588 audit_update_watch(parent, dname, (dev_t)-1, (unsigned long)-1, 1);
1589 /* inotify automatically removes the watch and sends IN_IGNORED */
1590 else if (mask & (IN_DELETE_SELF|IN_UNMOUNT))
1591 audit_remove_parent_watches(parent);
1592 /* inotify does not remove the watch, so remove it manually */
1593 else if(mask & IN_MOVE_SELF) {
1594 audit_remove_parent_watches(parent);
1595 inotify_remove_watch_locked(audit_ih, i_watch);
1596 } else if (mask & IN_IGNORED)
1597 put_inotify_watch(i_watch);
1598}
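
audit_handle_ievent() is not called directly; it is the handle_event hook of the inotify handle audit_ih that the rest of this patch series creates in kernel/audit.c. The registration looks approximately like the sketch below; the destroy_watch callback name is an assumption, included only to show where the handler plugs in:

/* kernel/audit.c (sketch, not part of this hunk) */
static const struct inotify_operations audit_inotify_ops = {
	.handle_event	= audit_handle_ievent,
	.destroy_watch	= audit_free_parent,	/* assumed name: frees the struct audit_parent */
};

/* during audit initialization */
audit_ih = inotify_init(&audit_inotify_ops);
if (IS_ERR(audit_ih))
	audit_panic("cannot initialize inotify handle");
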
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 1c03a4ed1b27..dc5e3f01efe7 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -3,7 +3,7 @@
3 * 3 *
4 * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina. 4 * Copyright 2003-2004 Red Hat Inc., Durham, North Carolina.
5 * Copyright 2005 Hewlett-Packard Development Company, L.P. 5 * Copyright 2005 Hewlett-Packard Development Company, L.P.
6 * Copyright (C) 2005 IBM Corporation 6 * Copyright (C) 2005, 2006 IBM Corporation
7 * All Rights Reserved. 7 * All Rights Reserved.
8 * 8 *
9 * This program is free software; you can redistribute it and/or modify 9 * This program is free software; you can redistribute it and/or modify
@@ -29,6 +29,9 @@
29 * this file -- see entry.S) is based on a GPL'd patch written by 29 * this file -- see entry.S) is based on a GPL'd patch written by
30 * okir@suse.de and Copyright 2003 SuSE Linux AG. 30 * okir@suse.de and Copyright 2003 SuSE Linux AG.
31 * 31 *
32 * POSIX message queue support added by George Wilson <ltcgcw@us.ibm.com>,
33 * 2006.
34 *
32 * The support of additional filter rules compares (>, <, >=, <=) was 35 * The support of additional filter rules compares (>, <, >=, <=) was
33 * added by Dustin Kirkland <dustin.kirkland@us.ibm.com>, 2005. 36 * added by Dustin Kirkland <dustin.kirkland@us.ibm.com>, 2005.
34 * 37 *
@@ -49,6 +52,7 @@
49#include <linux/module.h> 52#include <linux/module.h>
50#include <linux/mount.h> 53#include <linux/mount.h>
51#include <linux/socket.h> 54#include <linux/socket.h>
55#include <linux/mqueue.h>
52#include <linux/audit.h> 56#include <linux/audit.h>
53#include <linux/personality.h> 57#include <linux/personality.h>
54#include <linux/time.h> 58#include <linux/time.h>
@@ -59,6 +63,8 @@
59#include <linux/list.h> 63#include <linux/list.h>
60#include <linux/tty.h> 64#include <linux/tty.h>
61#include <linux/selinux.h> 65#include <linux/selinux.h>
66#include <linux/binfmts.h>
67#include <linux/syscalls.h>
62 68
63#include "audit.h" 69#include "audit.h"
64 70
@@ -76,6 +82,9 @@ extern int audit_enabled;
76 * path_lookup. */ 82 * path_lookup. */
77#define AUDIT_NAMES_RESERVED 7 83#define AUDIT_NAMES_RESERVED 7
78 84
85/* Indicates that audit should log the full pathname. */
86#define AUDIT_NAME_FULL -1
87
79/* When fs/namei.c:getname() is called, we store the pointer in name and 88/* When fs/namei.c:getname() is called, we store the pointer in name and
80 * we don't let putname() free it (instead we free all of the saved 89 * we don't let putname() free it (instead we free all of the saved
81 * pointers at syscall exit time). 90 * pointers at syscall exit time).
@@ -83,8 +92,9 @@ extern int audit_enabled;
83 * Further, in fs/namei.c:path_lookup() we store the inode and device. */ 92 * Further, in fs/namei.c:path_lookup() we store the inode and device. */
84struct audit_names { 93struct audit_names {
85 const char *name; 94 const char *name;
95 int name_len; /* number of name's characters to log */
96 unsigned name_put; /* call __putname() for this name */
86 unsigned long ino; 97 unsigned long ino;
87 unsigned long pino;
88 dev_t dev; 98 dev_t dev;
89 umode_t mode; 99 umode_t mode;
90 uid_t uid; 100 uid_t uid;
@@ -100,6 +110,33 @@ struct audit_aux_data {
100 110
101#define AUDIT_AUX_IPCPERM 0 111#define AUDIT_AUX_IPCPERM 0
102 112
113struct audit_aux_data_mq_open {
114 struct audit_aux_data d;
115 int oflag;
116 mode_t mode;
117 struct mq_attr attr;
118};
119
120struct audit_aux_data_mq_sendrecv {
121 struct audit_aux_data d;
122 mqd_t mqdes;
123 size_t msg_len;
124 unsigned int msg_prio;
125 struct timespec abs_timeout;
126};
127
128struct audit_aux_data_mq_notify {
129 struct audit_aux_data d;
130 mqd_t mqdes;
131 struct sigevent notification;
132};
133
134struct audit_aux_data_mq_getsetattr {
135 struct audit_aux_data d;
136 mqd_t mqdes;
137 struct mq_attr mqstat;
138};
139
103struct audit_aux_data_ipcctl { 140struct audit_aux_data_ipcctl {
104 struct audit_aux_data d; 141 struct audit_aux_data d;
105 struct ipc_perm p; 142 struct ipc_perm p;
@@ -110,6 +147,13 @@ struct audit_aux_data_ipcctl {
110 u32 osid; 147 u32 osid;
111}; 148};
112 149
150struct audit_aux_data_execve {
151 struct audit_aux_data d;
152 int argc;
153 int envc;
154 char mem[0];
155};
156
113struct audit_aux_data_socketcall { 157struct audit_aux_data_socketcall {
114 struct audit_aux_data d; 158 struct audit_aux_data d;
115 int nargs; 159 int nargs;
@@ -148,7 +192,7 @@ struct audit_context {
148 struct audit_aux_data *aux; 192 struct audit_aux_data *aux;
149 193
150 /* Save things to print about task_struct */ 194 /* Save things to print about task_struct */
151 pid_t pid; 195 pid_t pid, ppid;
152 uid_t uid, euid, suid, fsuid; 196 uid_t uid, euid, suid, fsuid;
153 gid_t gid, egid, sgid, fsgid; 197 gid_t gid, egid, sgid, fsgid;
154 unsigned long personality; 198 unsigned long personality;
@@ -160,12 +204,13 @@ struct audit_context {
160#endif 204#endif
161}; 205};
162 206
163 207/* Determine if any context name data matches a rule's watch data */
164/* Compare a task_struct with an audit_rule. Return 1 on match, 0 208/* Compare a task_struct with an audit_rule. Return 1 on match, 0
165 * otherwise. */ 209 * otherwise. */
166static int audit_filter_rules(struct task_struct *tsk, 210static int audit_filter_rules(struct task_struct *tsk,
167 struct audit_krule *rule, 211 struct audit_krule *rule,
168 struct audit_context *ctx, 212 struct audit_context *ctx,
213 struct audit_names *name,
169 enum audit_state *state) 214 enum audit_state *state)
170{ 215{
171 int i, j, need_sid = 1; 216 int i, j, need_sid = 1;
@@ -179,6 +224,10 @@ static int audit_filter_rules(struct task_struct *tsk,
179 case AUDIT_PID: 224 case AUDIT_PID:
180 result = audit_comparator(tsk->pid, f->op, f->val); 225 result = audit_comparator(tsk->pid, f->op, f->val);
181 break; 226 break;
227 case AUDIT_PPID:
228 if (ctx)
229 result = audit_comparator(ctx->ppid, f->op, f->val);
230 break;
182 case AUDIT_UID: 231 case AUDIT_UID:
183 result = audit_comparator(tsk->uid, f->op, f->val); 232 result = audit_comparator(tsk->uid, f->op, f->val);
184 break; 233 break;
@@ -224,7 +273,10 @@ static int audit_filter_rules(struct task_struct *tsk,
224 } 273 }
225 break; 274 break;
226 case AUDIT_DEVMAJOR: 275 case AUDIT_DEVMAJOR:
227 if (ctx) { 276 if (name)
277 result = audit_comparator(MAJOR(name->dev),
278 f->op, f->val);
279 else if (ctx) {
228 for (j = 0; j < ctx->name_count; j++) { 280 for (j = 0; j < ctx->name_count; j++) {
229 if (audit_comparator(MAJOR(ctx->names[j].dev), f->op, f->val)) { 281 if (audit_comparator(MAJOR(ctx->names[j].dev), f->op, f->val)) {
230 ++result; 282 ++result;
@@ -234,7 +286,10 @@ static int audit_filter_rules(struct task_struct *tsk,
234 } 286 }
235 break; 287 break;
236 case AUDIT_DEVMINOR: 288 case AUDIT_DEVMINOR:
237 if (ctx) { 289 if (name)
290 result = audit_comparator(MINOR(name->dev),
291 f->op, f->val);
292 else if (ctx) {
238 for (j = 0; j < ctx->name_count; j++) { 293 for (j = 0; j < ctx->name_count; j++) {
239 if (audit_comparator(MINOR(ctx->names[j].dev), f->op, f->val)) { 294 if (audit_comparator(MINOR(ctx->names[j].dev), f->op, f->val)) {
240 ++result; 295 ++result;
@@ -244,16 +299,22 @@ static int audit_filter_rules(struct task_struct *tsk,
244 } 299 }
245 break; 300 break;
246 case AUDIT_INODE: 301 case AUDIT_INODE:
247 if (ctx) { 302 if (name)
303 result = (name->ino == f->val);
304 else if (ctx) {
248 for (j = 0; j < ctx->name_count; j++) { 305 for (j = 0; j < ctx->name_count; j++) {
249 if (audit_comparator(ctx->names[j].ino, f->op, f->val) || 306 if (audit_comparator(ctx->names[j].ino, f->op, f->val)) {
250 audit_comparator(ctx->names[j].pino, f->op, f->val)) {
251 ++result; 307 ++result;
252 break; 308 break;
253 } 309 }
254 } 310 }
255 } 311 }
256 break; 312 break;
313 case AUDIT_WATCH:
314 if (name && rule->watch->ino != (unsigned long)-1)
315 result = (name->dev == rule->watch->dev &&
316 name->ino == rule->watch->ino);
317 break;
257 case AUDIT_LOGINUID: 318 case AUDIT_LOGINUID:
258 result = 0; 319 result = 0;
259 if (ctx) 320 if (ctx)
@@ -294,7 +355,6 @@ static int audit_filter_rules(struct task_struct *tsk,
294 } 355 }
295 switch (rule->action) { 356 switch (rule->action) {
296 case AUDIT_NEVER: *state = AUDIT_DISABLED; break; 357 case AUDIT_NEVER: *state = AUDIT_DISABLED; break;
297 case AUDIT_POSSIBLE: *state = AUDIT_BUILD_CONTEXT; break;
298 case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break; 358 case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break;
299 } 359 }
300 return 1; 360 return 1;
@@ -311,7 +371,7 @@ static enum audit_state audit_filter_task(struct task_struct *tsk)
311 371
312 rcu_read_lock(); 372 rcu_read_lock();
313 list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_TASK], list) { 373 list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_TASK], list) {
314 if (audit_filter_rules(tsk, &e->rule, NULL, &state)) { 374 if (audit_filter_rules(tsk, &e->rule, NULL, NULL, &state)) {
315 rcu_read_unlock(); 375 rcu_read_unlock();
316 return state; 376 return state;
317 } 377 }
@@ -341,8 +401,47 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk,
341 int bit = AUDIT_BIT(ctx->major); 401 int bit = AUDIT_BIT(ctx->major);
342 402
343 list_for_each_entry_rcu(e, list, list) { 403 list_for_each_entry_rcu(e, list, list) {
344 if ((e->rule.mask[word] & bit) == bit 404 if ((e->rule.mask[word] & bit) == bit &&
345 && audit_filter_rules(tsk, &e->rule, ctx, &state)) { 405 audit_filter_rules(tsk, &e->rule, ctx, NULL,
406 &state)) {
407 rcu_read_unlock();
408 return state;
409 }
410 }
411 }
412 rcu_read_unlock();
413 return AUDIT_BUILD_CONTEXT;
414}
415
416/* At syscall exit time, this filter is called if any audit_names[] have been
417 * collected during syscall processing. We only check rules in sublists at hash
418 * buckets applicable to the inode numbers in audit_names[].
419 * Regarding audit_state, same rules apply as for audit_filter_syscall().
420 */
421enum audit_state audit_filter_inodes(struct task_struct *tsk,
422 struct audit_context *ctx)
423{
424 int i;
425 struct audit_entry *e;
426 enum audit_state state;
427
428 if (audit_pid && tsk->tgid == audit_pid)
429 return AUDIT_DISABLED;
430
431 rcu_read_lock();
432 for (i = 0; i < ctx->name_count; i++) {
433 int word = AUDIT_WORD(ctx->major);
434 int bit = AUDIT_BIT(ctx->major);
435 struct audit_names *n = &ctx->names[i];
436 int h = audit_hash_ino((u32)n->ino);
437 struct list_head *list = &audit_inode_hash[h];
438
439 if (list_empty(list))
440 continue;
441
442 list_for_each_entry_rcu(e, list, list) {
443 if ((e->rule.mask[word] & bit) == bit &&
444 audit_filter_rules(tsk, &e->rule, ctx, n, &state)) {
346 rcu_read_unlock(); 445 rcu_read_unlock();
347 return state; 446 return state;
348 } 447 }
@@ -352,6 +451,11 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk,
352 return AUDIT_BUILD_CONTEXT; 451 return AUDIT_BUILD_CONTEXT;
353} 452}
354 453
454void audit_set_auditable(struct audit_context *ctx)
455{
456 ctx->auditable = 1;
457}
458
355static inline struct audit_context *audit_get_context(struct task_struct *tsk, 459static inline struct audit_context *audit_get_context(struct task_struct *tsk,
356 int return_valid, 460 int return_valid,
357 int return_code) 461 int return_code)
@@ -365,12 +469,22 @@ static inline struct audit_context *audit_get_context(struct task_struct *tsk,
365 469
366 if (context->in_syscall && !context->auditable) { 470 if (context->in_syscall && !context->auditable) {
367 enum audit_state state; 471 enum audit_state state;
472
368 state = audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_EXIT]); 473 state = audit_filter_syscall(tsk, context, &audit_filter_list[AUDIT_FILTER_EXIT]);
474 if (state == AUDIT_RECORD_CONTEXT) {
475 context->auditable = 1;
476 goto get_context;
477 }
478
479 state = audit_filter_inodes(tsk, context);
369 if (state == AUDIT_RECORD_CONTEXT) 480 if (state == AUDIT_RECORD_CONTEXT)
370 context->auditable = 1; 481 context->auditable = 1;
482
371 } 483 }
372 484
485get_context:
373 context->pid = tsk->pid; 486 context->pid = tsk->pid;
487 context->ppid = sys_getppid(); /* sic. tsk == current in all cases */
374 context->uid = tsk->uid; 488 context->uid = tsk->uid;
375 context->gid = tsk->gid; 489 context->gid = tsk->gid;
376 context->euid = tsk->euid; 490 context->euid = tsk->euid;
@@ -413,7 +527,7 @@ static inline void audit_free_names(struct audit_context *context)
413#endif 527#endif
414 528
415 for (i = 0; i < context->name_count; i++) { 529 for (i = 0; i < context->name_count; i++) {
416 if (context->names[i].name) 530 if (context->names[i].name && context->names[i].name_put)
417 __putname(context->names[i].name); 531 __putname(context->names[i].name);
418 } 532 }
419 context->name_count = 0; 533 context->name_count = 0;
@@ -544,8 +658,7 @@ static void audit_log_task_context(struct audit_buffer *ab)
544 return; 658 return;
545 659
546error_path: 660error_path:
547 if (ctx) 661 kfree(ctx);
548 kfree(ctx);
549 audit_panic("error in audit_log_task_context"); 662 audit_panic("error in audit_log_task_context");
550 return; 663 return;
551} 664}
@@ -606,7 +719,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
606 tty = "(none)"; 719 tty = "(none)";
607 audit_log_format(ab, 720 audit_log_format(ab,
608 " a0=%lx a1=%lx a2=%lx a3=%lx items=%d" 721 " a0=%lx a1=%lx a2=%lx a3=%lx items=%d"
609 " pid=%d auid=%u uid=%u gid=%u" 722 " ppid=%d pid=%d auid=%u uid=%u gid=%u"
610 " euid=%u suid=%u fsuid=%u" 723 " euid=%u suid=%u fsuid=%u"
611 " egid=%u sgid=%u fsgid=%u tty=%s", 724 " egid=%u sgid=%u fsgid=%u tty=%s",
612 context->argv[0], 725 context->argv[0],
@@ -614,6 +727,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
614 context->argv[2], 727 context->argv[2],
615 context->argv[3], 728 context->argv[3],
616 context->name_count, 729 context->name_count,
730 context->ppid,
617 context->pid, 731 context->pid,
618 context->loginuid, 732 context->loginuid,
619 context->uid, 733 context->uid,
@@ -630,11 +744,48 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
630 continue; /* audit_panic has been called */ 744 continue; /* audit_panic has been called */
631 745
632 switch (aux->type) { 746 switch (aux->type) {
747 case AUDIT_MQ_OPEN: {
748 struct audit_aux_data_mq_open *axi = (void *)aux;
749 audit_log_format(ab,
750 "oflag=0x%x mode=%#o mq_flags=0x%lx mq_maxmsg=%ld "
751 "mq_msgsize=%ld mq_curmsgs=%ld",
752 axi->oflag, axi->mode, axi->attr.mq_flags,
753 axi->attr.mq_maxmsg, axi->attr.mq_msgsize,
754 axi->attr.mq_curmsgs);
755 break; }
756
757 case AUDIT_MQ_SENDRECV: {
758 struct audit_aux_data_mq_sendrecv *axi = (void *)aux;
759 audit_log_format(ab,
760 "mqdes=%d msg_len=%zd msg_prio=%u "
761 "abs_timeout_sec=%ld abs_timeout_nsec=%ld",
762 axi->mqdes, axi->msg_len, axi->msg_prio,
763 axi->abs_timeout.tv_sec, axi->abs_timeout.tv_nsec);
764 break; }
765
766 case AUDIT_MQ_NOTIFY: {
767 struct audit_aux_data_mq_notify *axi = (void *)aux;
768 audit_log_format(ab,
769 "mqdes=%d sigev_signo=%d",
770 axi->mqdes,
771 axi->notification.sigev_signo);
772 break; }
773
774 case AUDIT_MQ_GETSETATTR: {
775 struct audit_aux_data_mq_getsetattr *axi = (void *)aux;
776 audit_log_format(ab,
777 "mqdes=%d mq_flags=0x%lx mq_maxmsg=%ld mq_msgsize=%ld "
778 "mq_curmsgs=%ld ",
779 axi->mqdes,
780 axi->mqstat.mq_flags, axi->mqstat.mq_maxmsg,
781 axi->mqstat.mq_msgsize, axi->mqstat.mq_curmsgs);
782 break; }
783
633 case AUDIT_IPC: { 784 case AUDIT_IPC: {
634 struct audit_aux_data_ipcctl *axi = (void *)aux; 785 struct audit_aux_data_ipcctl *axi = (void *)aux;
635 audit_log_format(ab, 786 audit_log_format(ab,
636 " qbytes=%lx iuid=%u igid=%u mode=%x", 787 "ouid=%u ogid=%u mode=%x",
637 axi->qbytes, axi->uid, axi->gid, axi->mode); 788 axi->uid, axi->gid, axi->mode);
638 if (axi->osid != 0) { 789 if (axi->osid != 0) {
639 char *ctx = NULL; 790 char *ctx = NULL;
640 u32 len; 791 u32 len;
@@ -652,19 +803,18 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
652 case AUDIT_IPC_SET_PERM: { 803 case AUDIT_IPC_SET_PERM: {
653 struct audit_aux_data_ipcctl *axi = (void *)aux; 804 struct audit_aux_data_ipcctl *axi = (void *)aux;
654 audit_log_format(ab, 805 audit_log_format(ab,
655 " new qbytes=%lx new iuid=%u new igid=%u new mode=%x", 806 "qbytes=%lx ouid=%u ogid=%u mode=%x",
656 axi->qbytes, axi->uid, axi->gid, axi->mode); 807 axi->qbytes, axi->uid, axi->gid, axi->mode);
657 if (axi->osid != 0) { 808 break; }
658 char *ctx = NULL; 809
659 u32 len; 810 case AUDIT_EXECVE: {
660 if (selinux_ctxid_to_string( 811 struct audit_aux_data_execve *axi = (void *)aux;
661 axi->osid, &ctx, &len)) { 812 int i;
662 audit_log_format(ab, " osid=%u", 813 const char *p;
663 axi->osid); 814 for (i = 0, p = axi->mem; i < axi->argc; i++) {
664 call_panic = 1; 815 audit_log_format(ab, "a%d=", i);
665 } else 816 p = audit_log_untrustedstring(ab, p);
666 audit_log_format(ab, " obj=%s", ctx); 817 audit_log_format(ab, "\n");
667 kfree(ctx);
668 } 818 }
669 break; } 819 break; }
670 820
@@ -700,8 +850,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
700 } 850 }
701 } 851 }
702 for (i = 0; i < context->name_count; i++) { 852 for (i = 0; i < context->name_count; i++) {
703 unsigned long ino = context->names[i].ino; 853 struct audit_names *n = &context->names[i];
704 unsigned long pino = context->names[i].pino;
705 854
706 ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH); 855 ab = audit_log_start(context, GFP_KERNEL, AUDIT_PATH);
707 if (!ab) 856 if (!ab)
@@ -709,33 +858,47 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
709 858
710 audit_log_format(ab, "item=%d", i); 859 audit_log_format(ab, "item=%d", i);
711 860
712 audit_log_format(ab, " name="); 861 if (n->name) {
713 if (context->names[i].name) 862 switch(n->name_len) {
714 audit_log_untrustedstring(ab, context->names[i].name); 863 case AUDIT_NAME_FULL:
715 else 864 /* log the full path */
716 audit_log_format(ab, "(null)"); 865 audit_log_format(ab, " name=");
717 866 audit_log_untrustedstring(ab, n->name);
718 if (pino != (unsigned long)-1) 867 break;
719 audit_log_format(ab, " parent=%lu", pino); 868 case 0:
720 if (ino != (unsigned long)-1) 869 /* name was specified as a relative path and the
721 audit_log_format(ab, " inode=%lu", ino); 870 * directory component is the cwd */
722 if ((pino != (unsigned long)-1) || (ino != (unsigned long)-1)) 871 audit_log_d_path(ab, " name=", context->pwd,
723 audit_log_format(ab, " dev=%02x:%02x mode=%#o" 872 context->pwdmnt);
724 " ouid=%u ogid=%u rdev=%02x:%02x", 873 break;
725 MAJOR(context->names[i].dev), 874 default:
726 MINOR(context->names[i].dev), 875 /* log the name's directory component */
727 context->names[i].mode, 876 audit_log_format(ab, " name=");
728 context->names[i].uid, 877 audit_log_n_untrustedstring(ab, n->name_len,
729 context->names[i].gid, 878 n->name);
730 MAJOR(context->names[i].rdev), 879 }
731 MINOR(context->names[i].rdev)); 880 } else
732 if (context->names[i].osid != 0) { 881 audit_log_format(ab, " name=(null)");
882
883 if (n->ino != (unsigned long)-1) {
884 audit_log_format(ab, " inode=%lu"
885 " dev=%02x:%02x mode=%#o"
886 " ouid=%u ogid=%u rdev=%02x:%02x",
887 n->ino,
888 MAJOR(n->dev),
889 MINOR(n->dev),
890 n->mode,
891 n->uid,
892 n->gid,
893 MAJOR(n->rdev),
894 MINOR(n->rdev));
895 }
896 if (n->osid != 0) {
733 char *ctx = NULL; 897 char *ctx = NULL;
734 u32 len; 898 u32 len;
735 if (selinux_ctxid_to_string( 899 if (selinux_ctxid_to_string(
736 context->names[i].osid, &ctx, &len)) { 900 n->osid, &ctx, &len)) {
737 audit_log_format(ab, " osid=%u", 901 audit_log_format(ab, " osid=%u", n->osid);
738 context->names[i].osid);
739 call_panic = 2; 902 call_panic = 2;
740 } else 903 } else
741 audit_log_format(ab, " obj=%s", ctx); 904 audit_log_format(ab, " obj=%s", ctx);
@@ -908,11 +1071,11 @@ void audit_syscall_exit(int valid, long return_code)
908 * Add a name to the list of audit names for this context. 1071 * Add a name to the list of audit names for this context.
909 * Called from fs/namei.c:getname(). 1072 * Called from fs/namei.c:getname().
910 */ 1073 */
911void audit_getname(const char *name) 1074void __audit_getname(const char *name)
912{ 1075{
913 struct audit_context *context = current->audit_context; 1076 struct audit_context *context = current->audit_context;
914 1077
915 if (!context || IS_ERR(name) || !name) 1078 if (IS_ERR(name) || !name)
916 return; 1079 return;
917 1080
918 if (!context->in_syscall) { 1081 if (!context->in_syscall) {
@@ -925,6 +1088,8 @@ void audit_getname(const char *name)
925 } 1088 }
926 BUG_ON(context->name_count >= AUDIT_NAMES); 1089 BUG_ON(context->name_count >= AUDIT_NAMES);
927 context->names[context->name_count].name = name; 1090 context->names[context->name_count].name = name;
1091 context->names[context->name_count].name_len = AUDIT_NAME_FULL;
1092 context->names[context->name_count].name_put = 1;
928 context->names[context->name_count].ino = (unsigned long)-1; 1093 context->names[context->name_count].ino = (unsigned long)-1;
929 ++context->name_count; 1094 ++context->name_count;
930 if (!context->pwd) { 1095 if (!context->pwd) {
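
The rename audit_getname() to __audit_getname() moves the per-call test for an active audit context out of auditsc.c and into an inline wrapper in include/linux/audit.h, so tasks without an audit context pay only one branch in getname(). The wrapper is presumably of this shape (assumption, the header change is not part of this hunk):

/* include/linux/audit.h (sketch) */
extern void __audit_getname(const char *name);

static inline void audit_getname(const char *name)
{
	if (unlikely(current->audit_context))
		__audit_getname(name);
}
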
@@ -991,11 +1156,10 @@ static void audit_inode_context(int idx, const struct inode *inode)
991 * audit_inode - store the inode and device from a lookup 1156 * audit_inode - store the inode and device from a lookup
992 * @name: name being audited 1157 * @name: name being audited
993 * @inode: inode being audited 1158 * @inode: inode being audited
994 * @flags: lookup flags (as used in path_lookup())
995 * 1159 *
996 * Called from fs/namei.c:path_lookup(). 1160 * Called from fs/namei.c:path_lookup().
997 */ 1161 */
998void __audit_inode(const char *name, const struct inode *inode, unsigned flags) 1162void __audit_inode(const char *name, const struct inode *inode)
999{ 1163{
1000 int idx; 1164 int idx;
1001 struct audit_context *context = current->audit_context; 1165 struct audit_context *context = current->audit_context;
@@ -1021,20 +1185,13 @@ void __audit_inode(const char *name, const struct inode *inode, unsigned flags)
1021 ++context->ino_count; 1185 ++context->ino_count;
1022#endif 1186#endif
1023 } 1187 }
1188 context->names[idx].ino = inode->i_ino;
1024 context->names[idx].dev = inode->i_sb->s_dev; 1189 context->names[idx].dev = inode->i_sb->s_dev;
1025 context->names[idx].mode = inode->i_mode; 1190 context->names[idx].mode = inode->i_mode;
1026 context->names[idx].uid = inode->i_uid; 1191 context->names[idx].uid = inode->i_uid;
1027 context->names[idx].gid = inode->i_gid; 1192 context->names[idx].gid = inode->i_gid;
1028 context->names[idx].rdev = inode->i_rdev; 1193 context->names[idx].rdev = inode->i_rdev;
1029 audit_inode_context(idx, inode); 1194 audit_inode_context(idx, inode);
1030 if ((flags & LOOKUP_PARENT) && (strcmp(name, "/") != 0) &&
1031 (strcmp(name, ".") != 0)) {
1032 context->names[idx].ino = (unsigned long)-1;
1033 context->names[idx].pino = inode->i_ino;
1034 } else {
1035 context->names[idx].ino = inode->i_ino;
1036 context->names[idx].pino = (unsigned long)-1;
1037 }
1038} 1195}
1039 1196
1040/** 1197/**
@@ -1056,51 +1213,40 @@ void __audit_inode_child(const char *dname, const struct inode *inode,
1056{ 1213{
1057 int idx; 1214 int idx;
1058 struct audit_context *context = current->audit_context; 1215 struct audit_context *context = current->audit_context;
1216 const char *found_name = NULL;
1217 int dirlen = 0;
1059 1218
1060 if (!context->in_syscall) 1219 if (!context->in_syscall)
1061 return; 1220 return;
1062 1221
1063 /* determine matching parent */ 1222 /* determine matching parent */
1064 if (dname) 1223 if (!dname)
1065 for (idx = 0; idx < context->name_count; idx++) 1224 goto update_context;
1066 if (context->names[idx].pino == pino) { 1225 for (idx = 0; idx < context->name_count; idx++)
1067 const char *n; 1226 if (context->names[idx].ino == pino) {
1068 const char *name = context->names[idx].name; 1227 const char *name = context->names[idx].name;
1069 int dlen = strlen(dname); 1228
1070 int nlen = name ? strlen(name) : 0; 1229 if (!name)
1071 1230 continue;
1072 if (nlen < dlen) 1231
1073 continue; 1232 if (audit_compare_dname_path(dname, name, &dirlen) == 0) {
1074 1233 context->names[idx].name_len = dirlen;
1075 /* disregard trailing slashes */ 1234 found_name = name;
1076 n = name + nlen - 1; 1235 break;
1077 while ((*n == '/') && (n > name))
1078 n--;
1079
1080 /* find last path component */
1081 n = n - dlen + 1;
1082 if (n < name)
1083 continue;
1084 else if (n > name) {
1085 if (*--n != '/')
1086 continue;
1087 else
1088 n++;
1089 }
1090
1091 if (strncmp(n, dname, dlen) == 0)
1092 goto update_context;
1093 } 1236 }
1237 }
1094 1238
1095 /* catch-all in case match not found */ 1239update_context:
1096 idx = context->name_count++; 1240 idx = context->name_count++;
1097 context->names[idx].name = NULL;
1098 context->names[idx].pino = pino;
1099#if AUDIT_DEBUG 1241#if AUDIT_DEBUG
1100 context->ino_count++; 1242 context->ino_count++;
1101#endif 1243#endif
1244 /* Re-use the name belonging to the slot for a matching parent directory.
1245 * All names for this context are relinquished in audit_free_names() */
1246 context->names[idx].name = found_name;
1247 context->names[idx].name_len = AUDIT_NAME_FULL;
1248 context->names[idx].name_put = 0; /* don't call __putname() */
1102 1249
1103update_context:
1104 if (inode) { 1250 if (inode) {
1105 context->names[idx].ino = inode->i_ino; 1251 context->names[idx].ino = inode->i_ino;
1106 context->names[idx].dev = inode->i_sb->s_dev; 1252 context->names[idx].dev = inode->i_sb->s_dev;
@@ -1109,7 +1255,8 @@ update_context:
1109 context->names[idx].gid = inode->i_gid; 1255 context->names[idx].gid = inode->i_gid;
1110 context->names[idx].rdev = inode->i_rdev; 1256 context->names[idx].rdev = inode->i_rdev;
1111 audit_inode_context(idx, inode); 1257 audit_inode_context(idx, inode);
1112 } 1258 } else
1259 context->names[idx].ino = (unsigned long)-1;
1113} 1260}
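
__audit_inode_child() is fed from the fsnotify helpers that fire when a child entry of a possibly watched directory is created, linked or removed, and it follows the same fast-path split as __audit_getname() above. Both the wrapper and a representative call site are sketched here as assumptions (they live in include/linux/audit.h and include/linux/fsnotify.h, outside this diff):

/* include/linux/audit.h (sketch) */
static inline void audit_inode_child(const char *dname,
				     const struct inode *inode,
				     unsigned long pino)
{
	if (unlikely(current->audit_context))
		__audit_inode_child(dname, inode, pino);
}

/* include/linux/fsnotify.h (sketch): directory entry creation */
static inline void fsnotify_create(struct inode *dir, struct dentry *dentry)
{
	/* ... inotify/dnotify queueing ... */
	audit_inode_child(dentry->d_name.name, dentry->d_inode, dir->i_ino);
}
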
1114 1261
1115/** 1262/**
@@ -1142,18 +1289,23 @@ void auditsc_get_stamp(struct audit_context *ctx,
1142 */ 1289 */
1143int audit_set_loginuid(struct task_struct *task, uid_t loginuid) 1290int audit_set_loginuid(struct task_struct *task, uid_t loginuid)
1144{ 1291{
1145 if (task->audit_context) { 1292 struct audit_context *context = task->audit_context;
1146 struct audit_buffer *ab; 1293
1147 1294 if (context) {
1148 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN); 1295 /* Only log if audit is enabled */
1149 if (ab) { 1296 if (context->in_syscall) {
1150 audit_log_format(ab, "login pid=%d uid=%u " 1297 struct audit_buffer *ab;
1151 "old auid=%u new auid=%u", 1298
1152 task->pid, task->uid, 1299 ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_LOGIN);
1153 task->audit_context->loginuid, loginuid); 1300 if (ab) {
1154 audit_log_end(ab); 1301 audit_log_format(ab, "login pid=%d uid=%u "
1302 "old auid=%u new auid=%u",
1303 task->pid, task->uid,
1304 context->loginuid, loginuid);
1305 audit_log_end(ab);
1306 }
1155 } 1307 }
1156 task->audit_context->loginuid = loginuid; 1308 context->loginuid = loginuid;
1157 } 1309 }
1158 return 0; 1310 return 0;
1159} 1311}
@@ -1170,16 +1322,193 @@ uid_t audit_get_loginuid(struct audit_context *ctx)
1170} 1322}
1171 1323
1172/** 1324/**
1173 * audit_ipc_obj - record audit data for ipc object 1325 * __audit_mq_open - record audit data for a POSIX MQ open
1174 * @ipcp: ipc permissions 1326 * @oflag: open flag
1327 * @mode: mode bits
1328 * @u_attr: queue attributes
1175 * 1329 *
1176 * Returns 0 for success or NULL context or < 0 on error. 1330 * Returns 0 for success or NULL context or < 0 on error.
1177 */ 1331 */
1178int audit_ipc_obj(struct kern_ipc_perm *ipcp) 1332int __audit_mq_open(int oflag, mode_t mode, struct mq_attr __user *u_attr)
1179{ 1333{
1180 struct audit_aux_data_ipcctl *ax; 1334 struct audit_aux_data_mq_open *ax;
1335 struct audit_context *context = current->audit_context;
1336
1337 if (!audit_enabled)
1338 return 0;
1339
1340 if (likely(!context))
1341 return 0;
1342
1343 ax = kmalloc(sizeof(*ax), GFP_ATOMIC);
1344 if (!ax)
1345 return -ENOMEM;
1346
1347 if (u_attr != NULL) {
1348 if (copy_from_user(&ax->attr, u_attr, sizeof(ax->attr))) {
1349 kfree(ax);
1350 return -EFAULT;
1351 }
1352 } else
1353 memset(&ax->attr, 0, sizeof(ax->attr));
1354
1355 ax->oflag = oflag;
1356 ax->mode = mode;
1357
1358 ax->d.type = AUDIT_MQ_OPEN;
1359 ax->d.next = context->aux;
1360 context->aux = (void *)ax;
1361 return 0;
1362}
1363
1364/**
1365 * __audit_mq_timedsend - record audit data for a POSIX MQ timed send
1366 * @mqdes: MQ descriptor
1367 * @msg_len: Message length
1368 * @msg_prio: Message priority
1369 * @u_abs_timeout: Message timeout in absolute time
1370 *
1371 * Returns 0 for success or NULL context or < 0 on error.
1372 */
1373int __audit_mq_timedsend(mqd_t mqdes, size_t msg_len, unsigned int msg_prio,
1374 const struct timespec __user *u_abs_timeout)
1375{
1376 struct audit_aux_data_mq_sendrecv *ax;
1377 struct audit_context *context = current->audit_context;
1378
1379 if (!audit_enabled)
1380 return 0;
1381
1382 if (likely(!context))
1383 return 0;
1384
1385 ax = kmalloc(sizeof(*ax), GFP_ATOMIC);
1386 if (!ax)
1387 return -ENOMEM;
1388
1389 if (u_abs_timeout != NULL) {
1390 if (copy_from_user(&ax->abs_timeout, u_abs_timeout, sizeof(ax->abs_timeout))) {
1391 kfree(ax);
1392 return -EFAULT;
1393 }
1394 } else
1395 memset(&ax->abs_timeout, 0, sizeof(ax->abs_timeout));
1396
1397 ax->mqdes = mqdes;
1398 ax->msg_len = msg_len;
1399 ax->msg_prio = msg_prio;
1400
1401 ax->d.type = AUDIT_MQ_SENDRECV;
1402 ax->d.next = context->aux;
1403 context->aux = (void *)ax;
1404 return 0;
1405}
1406
1407/**
1408 * __audit_mq_timedreceive - record audit data for a POSIX MQ timed receive
1409 * @mqdes: MQ descriptor
1410 * @msg_len: Message length
1411 * @u_msg_prio: Message priority
1412 * @u_abs_timeout: Message timeout in absolute time
1413 *
1414 * Returns 0 for success or NULL context or < 0 on error.
1415 */
1416int __audit_mq_timedreceive(mqd_t mqdes, size_t msg_len,
1417 unsigned int __user *u_msg_prio,
1418 const struct timespec __user *u_abs_timeout)
1419{
1420 struct audit_aux_data_mq_sendrecv *ax;
1421 struct audit_context *context = current->audit_context;
1422
1423 if (!audit_enabled)
1424 return 0;
1425
1426 if (likely(!context))
1427 return 0;
1428
1429 ax = kmalloc(sizeof(*ax), GFP_ATOMIC);
1430 if (!ax)
1431 return -ENOMEM;
1432
1433 if (u_msg_prio != NULL) {
1434 if (get_user(ax->msg_prio, u_msg_prio)) {
1435 kfree(ax);
1436 return -EFAULT;
1437 }
1438 } else
1439 ax->msg_prio = 0;
1440
1441 if (u_abs_timeout != NULL) {
1442 if (copy_from_user(&ax->abs_timeout, u_abs_timeout, sizeof(ax->abs_timeout))) {
1443 kfree(ax);
1444 return -EFAULT;
1445 }
1446 } else
1447 memset(&ax->abs_timeout, 0, sizeof(ax->abs_timeout));
1448
1449 ax->mqdes = mqdes;
1450 ax->msg_len = msg_len;
1451
1452 ax->d.type = AUDIT_MQ_SENDRECV;
1453 ax->d.next = context->aux;
1454 context->aux = (void *)ax;
1455 return 0;
1456}
1457
1458/**
1459 * __audit_mq_notify - record audit data for a POSIX MQ notify
1460 * @mqdes: MQ descriptor
1461 * @u_notification: Notification event
1462 *
1463 * Returns 0 for success or NULL context or < 0 on error.
1464 */
1465
1466int __audit_mq_notify(mqd_t mqdes, const struct sigevent __user *u_notification)
1467{
1468 struct audit_aux_data_mq_notify *ax;
1469 struct audit_context *context = current->audit_context;
1470
1471 if (!audit_enabled)
1472 return 0;
1473
1474 if (likely(!context))
1475 return 0;
1476
1477 ax = kmalloc(sizeof(*ax), GFP_ATOMIC);
1478 if (!ax)
1479 return -ENOMEM;
1480
1481 if (u_notification != NULL) {
1482 if (copy_from_user(&ax->notification, u_notification, sizeof(ax->notification))) {
1483 kfree(ax);
1484 return -EFAULT;
1485 }
1486 } else
1487 memset(&ax->notification, 0, sizeof(ax->notification));
1488
1489 ax->mqdes = mqdes;
1490
1491 ax->d.type = AUDIT_MQ_NOTIFY;
1492 ax->d.next = context->aux;
1493 context->aux = (void *)ax;
1494 return 0;
1495}
1496
1497/**
1498 * __audit_mq_getsetattr - record audit data for a POSIX MQ get/set attribute
1499 * @mqdes: MQ descriptor
1500 * @mqstat: MQ flags
1501 *
1502 * Returns 0 for success or NULL context or < 0 on error.
1503 */
1504int __audit_mq_getsetattr(mqd_t mqdes, struct mq_attr *mqstat)
1505{
1506 struct audit_aux_data_mq_getsetattr *ax;
1181 struct audit_context *context = current->audit_context; 1507 struct audit_context *context = current->audit_context;
1182 1508
1509 if (!audit_enabled)
1510 return 0;
1511
1183 if (likely(!context)) 1512 if (likely(!context))
1184 return 0; 1513 return 0;
1185 1514
@@ -1187,6 +1516,30 @@ int audit_ipc_obj(struct kern_ipc_perm *ipcp)
1187 if (!ax) 1516 if (!ax)
1188 return -ENOMEM; 1517 return -ENOMEM;
1189 1518
1519 ax->mqdes = mqdes;
1520 ax->mqstat = *mqstat;
1521
1522 ax->d.type = AUDIT_MQ_GETSETATTR;
1523 ax->d.next = context->aux;
1524 context->aux = (void *)ax;
1525 return 0;
1526}
1527
1528/**
1529 * audit_ipc_obj - record audit data for ipc object
1530 * @ipcp: ipc permissions
1531 *
1532 * Returns 0 for success or NULL context or < 0 on error.
1533 */
1534int __audit_ipc_obj(struct kern_ipc_perm *ipcp)
1535{
1536 struct audit_aux_data_ipcctl *ax;
1537 struct audit_context *context = current->audit_context;
1538
1539 ax = kmalloc(sizeof(*ax), GFP_ATOMIC);
1540 if (!ax)
1541 return -ENOMEM;
1542
1190 ax->uid = ipcp->uid; 1543 ax->uid = ipcp->uid;
1191 ax->gid = ipcp->gid; 1544 ax->gid = ipcp->gid;
1192 ax->mode = ipcp->mode; 1545 ax->mode = ipcp->mode;
@@ -1207,14 +1560,11 @@ int audit_ipc_obj(struct kern_ipc_perm *ipcp)
1207 * 1560 *
1208 * Returns 0 for success or NULL context or < 0 on error. 1561 * Returns 0 for success or NULL context or < 0 on error.
1209 */ 1562 */
1210int audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode, struct kern_ipc_perm *ipcp) 1563int __audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode)
1211{ 1564{
1212 struct audit_aux_data_ipcctl *ax; 1565 struct audit_aux_data_ipcctl *ax;
1213 struct audit_context *context = current->audit_context; 1566 struct audit_context *context = current->audit_context;
1214 1567
1215 if (likely(!context))
1216 return 0;
1217
1218 ax = kmalloc(sizeof(*ax), GFP_ATOMIC); 1568 ax = kmalloc(sizeof(*ax), GFP_ATOMIC);
1219 if (!ax) 1569 if (!ax)
1220 return -ENOMEM; 1570 return -ENOMEM;
@@ -1223,7 +1573,6 @@ int audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode,
1223 ax->uid = uid; 1573 ax->uid = uid;
1224 ax->gid = gid; 1574 ax->gid = gid;
1225 ax->mode = mode; 1575 ax->mode = mode;
1226 selinux_get_ipc_sid(ipcp, &ax->osid);
1227 1576
1228 ax->d.type = AUDIT_IPC_SET_PERM; 1577 ax->d.type = AUDIT_IPC_SET_PERM;
1229 ax->d.next = context->aux; 1578 ax->d.next = context->aux;
@@ -1231,6 +1580,39 @@ int audit_ipc_set_perm(unsigned long qbytes, uid_t uid, gid_t gid, mode_t mode,
1231 return 0; 1580 return 0;
1232} 1581}
1233 1582
1583int audit_bprm(struct linux_binprm *bprm)
1584{
1585 struct audit_aux_data_execve *ax;
1586 struct audit_context *context = current->audit_context;
1587 unsigned long p, next;
1588 void *to;
1589
1590 if (likely(!audit_enabled || !context))
1591 return 0;
1592
1593 ax = kmalloc(sizeof(*ax) + PAGE_SIZE * MAX_ARG_PAGES - bprm->p,
1594 GFP_KERNEL);
1595 if (!ax)
1596 return -ENOMEM;
1597
1598 ax->argc = bprm->argc;
1599 ax->envc = bprm->envc;
1600 for (p = bprm->p, to = ax->mem; p < MAX_ARG_PAGES*PAGE_SIZE; p = next) {
1601 struct page *page = bprm->page[p / PAGE_SIZE];
1602 void *kaddr = kmap(page);
1603 next = (p + PAGE_SIZE) & ~(PAGE_SIZE - 1);
1604 memcpy(to, kaddr + (p & (PAGE_SIZE - 1)), next - p);
1605 to += next - p;
1606 kunmap(page);
1607 }
1608
1609 ax->d.type = AUDIT_EXECVE;
1610 ax->d.next = context->aux;
1611 context->aux = (void *)ax;
1612 return 0;
1613}
1614
1615
1234/** 1616/**
1235 * audit_socketcall - record audit data for sys_socketcall 1617 * audit_socketcall - record audit data for sys_socketcall
1236 * @nargs: number of args 1618 * @nargs: number of args
@@ -1325,19 +1707,20 @@ int audit_avc_path(struct dentry *dentry, struct vfsmount *mnt)
1325 * If the audit subsystem is being terminated, record the task (pid) 1707 * If the audit subsystem is being terminated, record the task (pid)
1326 * and uid that is doing that. 1708 * and uid that is doing that.
1327 */ 1709 */
1328void audit_signal_info(int sig, struct task_struct *t) 1710void __audit_signal_info(int sig, struct task_struct *t)
1329{ 1711{
1330 extern pid_t audit_sig_pid; 1712 extern pid_t audit_sig_pid;
1331 extern uid_t audit_sig_uid; 1713 extern uid_t audit_sig_uid;
1332 1714 extern u32 audit_sig_sid;
1333 if (unlikely(audit_pid && t->tgid == audit_pid)) { 1715
1334 if (sig == SIGTERM || sig == SIGHUP) { 1716 if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1) {
1335 struct audit_context *ctx = current->audit_context; 1717 struct task_struct *tsk = current;
1336 audit_sig_pid = current->pid; 1718 struct audit_context *ctx = tsk->audit_context;
1337 if (ctx) 1719 audit_sig_pid = tsk->pid;
1338 audit_sig_uid = ctx->loginuid; 1720 if (ctx)
1339 else 1721 audit_sig_uid = ctx->loginuid;
1340 audit_sig_uid = current->uid; 1722 else
1341 } 1723 audit_sig_uid = tsk->uid;
1724 selinux_get_task_sid(tsk, &audit_sig_sid);
1342 } 1725 }
1343} 1726}
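
The audit_pid/tgid check removed from the function body above does not disappear: with the __ prefix it moves into an inline wrapper, so signals that are not aimed at the audit daemon never enter auditsc.c at all. Roughly (assumption, the header change is outside this hunk):

/* include/linux/audit.h (sketch) */
extern int audit_pid;
extern void __audit_signal_info(int sig, struct task_struct *t);

static inline void audit_signal_info(int sig, struct task_struct *t)
{
	if (unlikely(audit_pid && t->tgid == audit_pid))
		__audit_signal_info(sig, t);
}
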
diff --git a/kernel/compat.c b/kernel/compat.c
index c1601a84f8d8..126dee9530aa 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -21,6 +21,7 @@
21#include <linux/unistd.h> 21#include <linux/unistd.h>
22#include <linux/security.h> 22#include <linux/security.h>
23#include <linux/timex.h> 23#include <linux/timex.h>
24#include <linux/migrate.h>
24 25
25#include <asm/uaccess.h> 26#include <asm/uaccess.h>
26 27
@@ -729,17 +730,10 @@ void
729sigset_from_compat (sigset_t *set, compat_sigset_t *compat) 730sigset_from_compat (sigset_t *set, compat_sigset_t *compat)
730{ 731{
731 switch (_NSIG_WORDS) { 732 switch (_NSIG_WORDS) {
732#if defined (__COMPAT_ENDIAN_SWAP__)
733 case 4: set->sig[3] = compat->sig[7] | (((long)compat->sig[6]) << 32 );
734 case 3: set->sig[2] = compat->sig[5] | (((long)compat->sig[4]) << 32 );
735 case 2: set->sig[1] = compat->sig[3] | (((long)compat->sig[2]) << 32 );
736 case 1: set->sig[0] = compat->sig[1] | (((long)compat->sig[0]) << 32 );
737#else
738 case 4: set->sig[3] = compat->sig[6] | (((long)compat->sig[7]) << 32 ); 733 case 4: set->sig[3] = compat->sig[6] | (((long)compat->sig[7]) << 32 );
739 case 3: set->sig[2] = compat->sig[4] | (((long)compat->sig[5]) << 32 ); 734 case 3: set->sig[2] = compat->sig[4] | (((long)compat->sig[5]) << 32 );
740 case 2: set->sig[1] = compat->sig[2] | (((long)compat->sig[3]) << 32 ); 735 case 2: set->sig[1] = compat->sig[2] | (((long)compat->sig[3]) << 32 );
741 case 1: set->sig[0] = compat->sig[0] | (((long)compat->sig[1]) << 32 ); 736 case 1: set->sig[0] = compat->sig[0] | (((long)compat->sig[1]) << 32 );
742#endif
743 } 737 }
744} 738}
745 739
@@ -934,3 +928,25 @@ asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp)
934 928
935 return ret; 929 return ret;
936} 930}
931
932#ifdef CONFIG_NUMA
933asmlinkage long compat_sys_move_pages(pid_t pid, unsigned long nr_pages,
934 compat_uptr_t __user *pages32,
935 const int __user *nodes,
936 int __user *status,
937 int flags)
938{
939 const void __user * __user *pages;
940 int i;
941
942 pages = compat_alloc_user_space(nr_pages * sizeof(void *));
943 for (i = 0; i < nr_pages; i++) {
944 compat_uptr_t p;
945
946 if (get_user(p, pages32 + i) ||
947 put_user(compat_ptr(p), pages + i))
948 return -EFAULT;
949 }
950 return sys_move_pages(pid, nr_pages, pages, nodes, status, flags);
951}
952#endif
diff --git a/kernel/cpu.c b/kernel/cpu.c
index fe2b8d0bfe4c..70fbf2e83766 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -13,12 +13,12 @@
13#include <linux/module.h> 13#include <linux/module.h>
14#include <linux/kthread.h> 14#include <linux/kthread.h>
15#include <linux/stop_machine.h> 15#include <linux/stop_machine.h>
16#include <asm/semaphore.h> 16#include <linux/mutex.h>
17 17
18/* This protects CPUs going up and down... */ 18/* This protects CPUs going up and down... */
19static DECLARE_MUTEX(cpucontrol); 19static DEFINE_MUTEX(cpucontrol);
20 20
21static BLOCKING_NOTIFIER_HEAD(cpu_chain); 21static __cpuinitdata BLOCKING_NOTIFIER_HEAD(cpu_chain);
22 22
23#ifdef CONFIG_HOTPLUG_CPU 23#ifdef CONFIG_HOTPLUG_CPU
24static struct task_struct *lock_cpu_hotplug_owner; 24static struct task_struct *lock_cpu_hotplug_owner;
@@ -30,9 +30,9 @@ static int __lock_cpu_hotplug(int interruptible)
30 30
31 if (lock_cpu_hotplug_owner != current) { 31 if (lock_cpu_hotplug_owner != current) {
32 if (interruptible) 32 if (interruptible)
33 ret = down_interruptible(&cpucontrol); 33 ret = mutex_lock_interruptible(&cpucontrol);
34 else 34 else
35 down(&cpucontrol); 35 mutex_lock(&cpucontrol);
36 } 36 }
37 37
38 /* 38 /*
@@ -56,7 +56,7 @@ void unlock_cpu_hotplug(void)
56{ 56{
57 if (--lock_cpu_hotplug_depth == 0) { 57 if (--lock_cpu_hotplug_depth == 0) {
58 lock_cpu_hotplug_owner = NULL; 58 lock_cpu_hotplug_owner = NULL;
59 up(&cpucontrol); 59 mutex_unlock(&cpucontrol);
60 } 60 }
61} 61}
62EXPORT_SYMBOL_GPL(unlock_cpu_hotplug); 62EXPORT_SYMBOL_GPL(unlock_cpu_hotplug);
@@ -69,10 +69,13 @@ EXPORT_SYMBOL_GPL(lock_cpu_hotplug_interruptible);
69#endif /* CONFIG_HOTPLUG_CPU */ 69#endif /* CONFIG_HOTPLUG_CPU */
70 70
71/* Need to know about CPUs going up/down? */ 71/* Need to know about CPUs going up/down? */
72int register_cpu_notifier(struct notifier_block *nb) 72int __cpuinit register_cpu_notifier(struct notifier_block *nb)
73{ 73{
74 return blocking_notifier_chain_register(&cpu_chain, nb); 74 return blocking_notifier_chain_register(&cpu_chain, nb);
75} 75}
76
77#ifdef CONFIG_HOTPLUG_CPU
78
76EXPORT_SYMBOL(register_cpu_notifier); 79EXPORT_SYMBOL(register_cpu_notifier);
77 80
78void unregister_cpu_notifier(struct notifier_block *nb) 81void unregister_cpu_notifier(struct notifier_block *nb)
@@ -81,7 +84,6 @@ void unregister_cpu_notifier(struct notifier_block *nb)
81} 84}
82EXPORT_SYMBOL(unregister_cpu_notifier); 85EXPORT_SYMBOL(unregister_cpu_notifier);
83 86
84#ifdef CONFIG_HOTPLUG_CPU
85static inline void check_for_tasks(int cpu) 87static inline void check_for_tasks(int cpu)
86{ 88{
87 struct task_struct *p; 89 struct task_struct *p;
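
The conversion above swaps the old cpucontrol semaphore for a mutex one call at a time: DECLARE_MUTEX/down/down_interruptible/up become DEFINE_MUTEX/mutex_lock/mutex_lock_interruptible/mutex_unlock. A hedged kernel-module sketch of the resulting pattern (module and symbol names are made up for illustration):

#include <linux/module.h>
#include <linux/mutex.h>

static DEFINE_MUTEX(demo_lock);		/* replaces DECLARE_MUTEX() */

static int __init demo_init(void)
{
	/* Interruptible variant, as in __lock_cpu_hotplug(): returns
	 * -EINTR if a signal arrives while we sleep on the lock. */
	if (mutex_lock_interruptible(&demo_lock))
		return -EINTR;

	/* ... critical section ... */

	mutex_unlock(&demo_lock);	/* replaces up() */
	return 0;
}

static void __exit demo_exit(void)
{
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");
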
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index ab81fdd4572b..1535af3a912d 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -41,6 +41,7 @@
41#include <linux/rcupdate.h> 41#include <linux/rcupdate.h>
42#include <linux/sched.h> 42#include <linux/sched.h>
43#include <linux/seq_file.h> 43#include <linux/seq_file.h>
44#include <linux/security.h>
44#include <linux/slab.h> 45#include <linux/slab.h>
45#include <linux/smp_lock.h> 46#include <linux/smp_lock.h>
46#include <linux/spinlock.h> 47#include <linux/spinlock.h>
@@ -392,11 +393,11 @@ static int cpuset_fill_super(struct super_block *sb, void *unused_data,
392 return 0; 393 return 0;
393} 394}
394 395
395static struct super_block *cpuset_get_sb(struct file_system_type *fs_type, 396static int cpuset_get_sb(struct file_system_type *fs_type,
396 int flags, const char *unused_dev_name, 397 int flags, const char *unused_dev_name,
397 void *data) 398 void *data, struct vfsmount *mnt)
398{ 399{
399 return get_sb_single(fs_type, flags, data, cpuset_fill_super); 400 return get_sb_single(fs_type, flags, data, cpuset_fill_super, mnt);
400} 401}
401 402
402static struct file_system_type cpuset_fs_type = { 403static struct file_system_type cpuset_fs_type = {
@@ -1177,6 +1178,7 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
1177 cpumask_t cpus; 1178 cpumask_t cpus;
1178 nodemask_t from, to; 1179 nodemask_t from, to;
1179 struct mm_struct *mm; 1180 struct mm_struct *mm;
1181 int retval;
1180 1182
1181 if (sscanf(pidbuf, "%d", &pid) != 1) 1183 if (sscanf(pidbuf, "%d", &pid) != 1)
1182 return -EIO; 1184 return -EIO;
@@ -1205,6 +1207,12 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
1205 get_task_struct(tsk); 1207 get_task_struct(tsk);
1206 } 1208 }
1207 1209
1210 retval = security_task_setscheduler(tsk, 0, NULL);
1211 if (retval) {
1212 put_task_struct(tsk);
1213 return retval;
1214 }
1215
1208 mutex_lock(&callback_mutex); 1216 mutex_lock(&callback_mutex);
1209 1217
1210 task_lock(tsk); 1218 task_lock(tsk);
@@ -2434,31 +2442,43 @@ void __cpuset_memory_pressure_bump(void)
2434 */ 2442 */
2435static int proc_cpuset_show(struct seq_file *m, void *v) 2443static int proc_cpuset_show(struct seq_file *m, void *v)
2436{ 2444{
2445 struct pid *pid;
2437 struct task_struct *tsk; 2446 struct task_struct *tsk;
2438 char *buf; 2447 char *buf;
2439 int retval = 0; 2448 int retval;
2440 2449
2450 retval = -ENOMEM;
2441 buf = kmalloc(PAGE_SIZE, GFP_KERNEL); 2451 buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
2442 if (!buf) 2452 if (!buf)
2443 return -ENOMEM; 2453 goto out;
2444 2454
2445 tsk = m->private; 2455 retval = -ESRCH;
2456 pid = m->private;
2457 tsk = get_pid_task(pid, PIDTYPE_PID);
2458 if (!tsk)
2459 goto out_free;
2460
2461 retval = -EINVAL;
2446 mutex_lock(&manage_mutex); 2462 mutex_lock(&manage_mutex);
2463
2447 retval = cpuset_path(tsk->cpuset, buf, PAGE_SIZE); 2464 retval = cpuset_path(tsk->cpuset, buf, PAGE_SIZE);
2448 if (retval < 0) 2465 if (retval < 0)
2449 goto out; 2466 goto out_unlock;
2450 seq_puts(m, buf); 2467 seq_puts(m, buf);
2451 seq_putc(m, '\n'); 2468 seq_putc(m, '\n');
2452out: 2469out_unlock:
2453 mutex_unlock(&manage_mutex); 2470 mutex_unlock(&manage_mutex);
2471 put_task_struct(tsk);
2472out_free:
2454 kfree(buf); 2473 kfree(buf);
2474out:
2455 return retval; 2475 return retval;
2456} 2476}
2457 2477
2458static int cpuset_open(struct inode *inode, struct file *file) 2478static int cpuset_open(struct inode *inode, struct file *file)
2459{ 2479{
2460 struct task_struct *tsk = PROC_I(inode)->task; 2480 struct pid *pid = PROC_I(inode)->pid;
2461 return single_open(file, proc_cpuset_show, tsk); 2481 return single_open(file, proc_cpuset_show, pid);
2462} 2482}
2463 2483
2464struct file_operations proc_cpuset_operations = { 2484struct file_operations proc_cpuset_operations = {
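
proc_cpuset_show() now resolves the task from a struct pid at read time instead of caching a task_struct at open, which is what backs /proc/<pid>/cpuset. A small user-space check of that file (only present on kernels with cpusets configured):

#include <stdio.h>

int main(void)
{
	char line[4096];
	FILE *f = fopen("/proc/self/cpuset", "r");

	if (!f) {
		perror("/proc/self/cpuset");	/* needs CONFIG_CPUSETS */
		return 1;
	}
	if (fgets(line, sizeof(line), f))
		printf("cpuset path: %s", line);	/* e.g. "/" for the root cpuset */
	fclose(f);
	return 0;
}
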
diff --git a/kernel/exit.c b/kernel/exit.c
index e06d0c10a24e..ab06b9f88f64 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -36,6 +36,7 @@
36#include <linux/compat.h> 36#include <linux/compat.h>
37#include <linux/pipe_fs_i.h> 37#include <linux/pipe_fs_i.h>
38#include <linux/audit.h> /* for audit_free() */ 38#include <linux/audit.h> /* for audit_free() */
39#include <linux/resource.h>
39 40
40#include <asm/uaccess.h> 41#include <asm/uaccess.h>
41#include <asm/unistd.h> 42#include <asm/unistd.h>
@@ -45,8 +46,6 @@
45extern void sem_exit (void); 46extern void sem_exit (void);
46extern struct task_struct *child_reaper; 47extern struct task_struct *child_reaper;
47 48
48int getrusage(struct task_struct *, int, struct rusage __user *);
49
50static void exit_mm(struct task_struct * tsk); 49static void exit_mm(struct task_struct * tsk);
51 50
52static void __unhash_process(struct task_struct *p) 51static void __unhash_process(struct task_struct *p)
@@ -138,12 +137,8 @@ void release_task(struct task_struct * p)
138{ 137{
139 int zap_leader; 138 int zap_leader;
140 task_t *leader; 139 task_t *leader;
141 struct dentry *proc_dentry;
142
143repeat: 140repeat:
144 atomic_dec(&p->user->processes); 141 atomic_dec(&p->user->processes);
145 spin_lock(&p->proc_lock);
146 proc_dentry = proc_pid_unhash(p);
147 write_lock_irq(&tasklist_lock); 142 write_lock_irq(&tasklist_lock);
148 ptrace_unlink(p); 143 ptrace_unlink(p);
149 BUG_ON(!list_empty(&p->ptrace_list) || !list_empty(&p->ptrace_children)); 144 BUG_ON(!list_empty(&p->ptrace_list) || !list_empty(&p->ptrace_children));
@@ -172,8 +167,7 @@ repeat:
172 167
173 sched_exit(p); 168 sched_exit(p);
174 write_unlock_irq(&tasklist_lock); 169 write_unlock_irq(&tasklist_lock);
175 spin_unlock(&p->proc_lock); 170 proc_flush_task(p);
176 proc_pid_flush(proc_dentry);
177 release_thread(p); 171 release_thread(p);
178 call_rcu(&p->rcu, delayed_put_task_struct); 172 call_rcu(&p->rcu, delayed_put_task_struct);
179 173
@@ -579,7 +573,7 @@ static void exit_mm(struct task_struct * tsk)
579 down_read(&mm->mmap_sem); 573 down_read(&mm->mmap_sem);
580 } 574 }
581 atomic_inc(&mm->mm_count); 575 atomic_inc(&mm->mm_count);
582 if (mm != tsk->active_mm) BUG(); 576 BUG_ON(mm != tsk->active_mm);
583 /* more a memory barrier than a real lock */ 577 /* more a memory barrier than a real lock */
584 task_lock(tsk); 578 task_lock(tsk);
585 tsk->mm = NULL; 579 tsk->mm = NULL;
@@ -895,11 +889,11 @@ fastcall NORET_TYPE void do_exit(long code)
895 if (group_dead) { 889 if (group_dead) {
896 hrtimer_cancel(&tsk->signal->real_timer); 890 hrtimer_cancel(&tsk->signal->real_timer);
897 exit_itimers(tsk->signal); 891 exit_itimers(tsk->signal);
898 acct_process(code);
899 } 892 }
893 acct_collect(code, group_dead);
900 if (unlikely(tsk->robust_list)) 894 if (unlikely(tsk->robust_list))
901 exit_robust_list(tsk); 895 exit_robust_list(tsk);
902#ifdef CONFIG_COMPAT 896#if defined(CONFIG_FUTEX) && defined(CONFIG_COMPAT)
903 if (unlikely(tsk->compat_robust_list)) 897 if (unlikely(tsk->compat_robust_list))
904 compat_exit_robust_list(tsk); 898 compat_exit_robust_list(tsk);
905#endif 899#endif
@@ -907,6 +901,8 @@ fastcall NORET_TYPE void do_exit(long code)
907 audit_free(tsk); 901 audit_free(tsk);
908 exit_mm(tsk); 902 exit_mm(tsk);
909 903
904 if (group_dead)
905 acct_process();
910 exit_sem(tsk); 906 exit_sem(tsk);
911 __exit_files(tsk); 907 __exit_files(tsk);
912 __exit_fs(tsk); 908 __exit_fs(tsk);
@@ -930,9 +926,18 @@ fastcall NORET_TYPE void do_exit(long code)
930 tsk->mempolicy = NULL; 926 tsk->mempolicy = NULL;
931#endif 927#endif
932 /* 928 /*
929 * This must happen late, after the PID is not
930 * hashed anymore:
931 */
932 if (unlikely(!list_empty(&tsk->pi_state_list)))
933 exit_pi_state_list(tsk);
934 if (unlikely(current->pi_state_cache))
935 kfree(current->pi_state_cache);
936 /*
933 * If DEBUG_MUTEXES is on, make sure we are holding no locks: 937 * If DEBUG_MUTEXES is on, make sure we are holding no locks:
934 */ 938 */
935 mutex_debug_check_no_locks_held(tsk); 939 mutex_debug_check_no_locks_held(tsk);
940 rt_mutex_debug_check_no_locks_held(tsk);
936 941
937 if (tsk->io_context) 942 if (tsk->io_context)
938 exit_io_context(); 943 exit_io_context();
@@ -1530,8 +1535,7 @@ check_continued:
1530 if (options & __WNOTHREAD) 1535 if (options & __WNOTHREAD)
1531 break; 1536 break;
1532 tsk = next_thread(tsk); 1537 tsk = next_thread(tsk);
1533 if (tsk->signal != current->signal) 1538 BUG_ON(tsk->signal != current->signal);
1534 BUG();
1535 } while (tsk != current); 1539 } while (tsk != current);
1536 1540
1537 read_unlock(&tasklist_lock); 1541 read_unlock(&tasklist_lock);
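
do_exit() now runs both the robust-list walk and the new PI-state cleanup, so a lock holder that dies no longer strands its waiters. From user space this surfaces through robust mutexes; a hedged sketch, assuming a glibc with pthread_mutexattr_setrobust() and pthread_mutex_consistent() (build with -pthread):

#define _GNU_SOURCE
#include <errno.h>
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t m;

static void *die_holding_lock(void *arg)
{
	pthread_mutex_lock(&m);
	/* Exit without unlocking: the kernel's robust-list walk at thread
	 * exit marks the futex word with the owner-died bit. */
	return NULL;
}

int main(void)
{
	pthread_mutexattr_t attr;
	pthread_t t;

	pthread_mutexattr_init(&attr);
	pthread_mutexattr_setrobust(&attr, PTHREAD_MUTEX_ROBUST);
	pthread_mutex_init(&m, &attr);

	pthread_create(&t, NULL, die_holding_lock, NULL);
	pthread_join(t, NULL);

	if (pthread_mutex_lock(&m) == EOWNERDEAD) {
		puts("previous owner died, recovering");
		pthread_mutex_consistent(&m);	/* mark the mutex usable again */
	}
	pthread_mutex_unlock(&m);
	return 0;
}
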
diff --git a/kernel/fork.c b/kernel/fork.c
index ac8100e3088a..628198a4f28a 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -104,6 +104,7 @@ static kmem_cache_t *mm_cachep;
104void free_task(struct task_struct *tsk) 104void free_task(struct task_struct *tsk)
105{ 105{
106 free_thread_info(tsk->thread_info); 106 free_thread_info(tsk->thread_info);
107 rt_mutex_debug_task_free(tsk);
107 free_task_struct(tsk); 108 free_task_struct(tsk);
108} 109}
109EXPORT_SYMBOL(free_task); 110EXPORT_SYMBOL(free_task);
@@ -368,6 +369,8 @@ void fastcall __mmdrop(struct mm_struct *mm)
368 */ 369 */
369void mmput(struct mm_struct *mm) 370void mmput(struct mm_struct *mm)
370{ 371{
372 might_sleep();
373
371 if (atomic_dec_and_test(&mm->mm_users)) { 374 if (atomic_dec_and_test(&mm->mm_users)) {
372 exit_aio(mm); 375 exit_aio(mm);
373 exit_mmap(mm); 376 exit_mmap(mm);
@@ -623,6 +626,7 @@ out:
623/* 626/*
624 * Allocate a new files structure and copy contents from the 627 * Allocate a new files structure and copy contents from the
625 * passed in files structure. 628 * passed in files structure.
629 * errorp will be valid only when the returned files_struct is NULL.
626 */ 630 */
627static struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) 631static struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
628{ 632{
@@ -631,6 +635,7 @@ static struct files_struct *dup_fd(struct files_struct *oldf, int *errorp)
631 int open_files, size, i, expand; 635 int open_files, size, i, expand;
632 struct fdtable *old_fdt, *new_fdt; 636 struct fdtable *old_fdt, *new_fdt;
633 637
638 *errorp = -ENOMEM;
634 newf = alloc_files(); 639 newf = alloc_files();
635 if (!newf) 640 if (!newf)
636 goto out; 641 goto out;
@@ -744,7 +749,6 @@ static int copy_files(unsigned long clone_flags, struct task_struct * tsk)
744 * break this. 749 * break this.
745 */ 750 */
746 tsk->files = NULL; 751 tsk->files = NULL;
747 error = -ENOMEM;
748 newf = dup_fd(oldf, &error); 752 newf = dup_fd(oldf, &error);
749 if (!newf) 753 if (!newf)
750 goto out; 754 goto out;
@@ -871,6 +875,7 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts
871 tsk->it_prof_expires = 875 tsk->it_prof_expires =
872 secs_to_cputime(sig->rlim[RLIMIT_CPU].rlim_cur); 876 secs_to_cputime(sig->rlim[RLIMIT_CPU].rlim_cur);
873 } 877 }
878 acct_init_pacct(&sig->pacct);
874 879
875 return 0; 880 return 0;
876} 881}
@@ -909,6 +914,19 @@ asmlinkage long sys_set_tid_address(int __user *tidptr)
909 return current->pid; 914 return current->pid;
910} 915}
911 916
917static inline void rt_mutex_init_task(struct task_struct *p)
918{
919#ifdef CONFIG_RT_MUTEXES
920 spin_lock_init(&p->pi_lock);
921 plist_head_init(&p->pi_waiters, &p->pi_lock);
922 p->pi_blocked_on = NULL;
923# ifdef CONFIG_DEBUG_RT_MUTEXES
924 spin_lock_init(&p->held_list_lock);
925 INIT_LIST_HEAD(&p->held_list_head);
926# endif
927#endif
928}
929
912/* 930/*
913 * This creates a new process as a copy of the old one, 931 * This creates a new process as a copy of the old one,
914 * but does not actually start it yet. 932 * but does not actually start it yet.
@@ -989,13 +1007,10 @@ static task_t *copy_process(unsigned long clone_flags,
989 if (put_user(p->pid, parent_tidptr)) 1007 if (put_user(p->pid, parent_tidptr))
990 goto bad_fork_cleanup; 1008 goto bad_fork_cleanup;
991 1009
992 p->proc_dentry = NULL;
993
994 INIT_LIST_HEAD(&p->children); 1010 INIT_LIST_HEAD(&p->children);
995 INIT_LIST_HEAD(&p->sibling); 1011 INIT_LIST_HEAD(&p->sibling);
996 p->vfork_done = NULL; 1012 p->vfork_done = NULL;
997 spin_lock_init(&p->alloc_lock); 1013 spin_lock_init(&p->alloc_lock);
998 spin_lock_init(&p->proc_lock);
999 1014
1000 clear_tsk_thread_flag(p, TIF_SIGPENDING); 1015 clear_tsk_thread_flag(p, TIF_SIGPENDING);
1001 init_sigpending(&p->pending); 1016 init_sigpending(&p->pending);
@@ -1033,6 +1048,8 @@ static task_t *copy_process(unsigned long clone_flags,
1033 mpol_fix_fork_child_flag(p); 1048 mpol_fix_fork_child_flag(p);
1034#endif 1049#endif
1035 1050
1051 rt_mutex_init_task(p);
1052
1036#ifdef CONFIG_DEBUG_MUTEXES 1053#ifdef CONFIG_DEBUG_MUTEXES
1037 p->blocked_on = NULL; /* not blocked yet */ 1054 p->blocked_on = NULL; /* not blocked yet */
1038#endif 1055#endif
@@ -1075,6 +1092,9 @@ static task_t *copy_process(unsigned long clone_flags,
1075#ifdef CONFIG_COMPAT 1092#ifdef CONFIG_COMPAT
1076 p->compat_robust_list = NULL; 1093 p->compat_robust_list = NULL;
1077#endif 1094#endif
1095 INIT_LIST_HEAD(&p->pi_state_list);
1096 p->pi_state_cache = NULL;
1097
1078 /* 1098 /*
1079 * sigaltstack should be cleared when sharing the same VM 1099 * sigaltstack should be cleared when sharing the same VM
1080 */ 1100 */
@@ -1155,18 +1175,6 @@ static task_t *copy_process(unsigned long clone_flags,
1155 } 1175 }
1156 1176
1157 if (clone_flags & CLONE_THREAD) { 1177 if (clone_flags & CLONE_THREAD) {
1158 /*
1159 * Important: if an exit-all has been started then
1160 * do not create this new thread - the whole thread
1161 * group is supposed to exit anyway.
1162 */
1163 if (current->signal->flags & SIGNAL_GROUP_EXIT) {
1164 spin_unlock(&current->sighand->siglock);
1165 write_unlock_irq(&tasklist_lock);
1166 retval = -EAGAIN;
1167 goto bad_fork_cleanup_namespace;
1168 }
1169
1170 p->group_leader = current->group_leader; 1178 p->group_leader = current->group_leader;
1171 list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group); 1179 list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group);
1172 1180
diff --git a/kernel/futex.c b/kernel/futex.c
index 5699c512057b..6c91f938005d 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -12,6 +12,10 @@
12 * (C) Copyright 2006 Red Hat Inc, All Rights Reserved 12 * (C) Copyright 2006 Red Hat Inc, All Rights Reserved
13 * Thanks to Thomas Gleixner for suggestions, analysis and fixes. 13 * Thanks to Thomas Gleixner for suggestions, analysis and fixes.
14 * 14 *
15 * PI-futex support started by Ingo Molnar and Thomas Gleixner
16 * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
17 * Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
18 *
15 * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly 19 * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
16 * enough at me, Linus for the original (flawed) idea, Matthew 20 * enough at me, Linus for the original (flawed) idea, Matthew
17 * Kirkwood for proof-of-concept implementation. 21 * Kirkwood for proof-of-concept implementation.
@@ -46,6 +50,8 @@
46#include <linux/signal.h> 50#include <linux/signal.h>
47#include <asm/futex.h> 51#include <asm/futex.h>
48 52
53#include "rtmutex_common.h"
54
49#define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8) 55#define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8)
50 56
51/* 57/*
@@ -63,7 +69,7 @@ union futex_key {
63 int offset; 69 int offset;
64 } shared; 70 } shared;
65 struct { 71 struct {
66 unsigned long uaddr; 72 unsigned long address;
67 struct mm_struct *mm; 73 struct mm_struct *mm;
68 int offset; 74 int offset;
69 } private; 75 } private;
@@ -75,6 +81,27 @@ union futex_key {
75}; 81};
76 82
77/* 83/*
84 * Priority Inheritance state:
85 */
86struct futex_pi_state {
87 /*
88 * list of 'owned' pi_state instances - these have to be
89 * cleaned up in do_exit() if the task exits prematurely:
90 */
91 struct list_head list;
92
93 /*
94 * The PI object:
95 */
96 struct rt_mutex pi_mutex;
97
98 struct task_struct *owner;
99 atomic_t refcount;
100
101 union futex_key key;
102};
103
104/*
78 * We use this hashed waitqueue instead of a normal wait_queue_t, so 105 * We use this hashed waitqueue instead of a normal wait_queue_t, so
79 * we can wake only the relevant ones (hashed queues may be shared). 106 * we can wake only the relevant ones (hashed queues may be shared).
80 * 107 *
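
struct futex_pi_state is the kernel-side anchor between a user-space PI futex and an rt_mutex. The usual way user space opts into this machinery is a PTHREAD_PRIO_INHERIT mutex; a minimal sketch, assuming a glibc built with PI-futex support (build with -pthread):

#define _GNU_SOURCE
#include <pthread.h>
#include <stdio.h>

int main(void)
{
	pthread_mutexattr_t attr;
	pthread_mutex_t m;

	pthread_mutexattr_init(&attr);
	/* Contended lock/unlock on a PRIO_INHERIT mutex goes through
	 * FUTEX_LOCK_PI/FUTEX_UNLOCK_PI and ends up in the futex_pi_state
	 * plus rt_mutex machinery added in this patch. */
	pthread_mutexattr_setprotocol(&attr, PTHREAD_PRIO_INHERIT);
	pthread_mutex_init(&m, &attr);

	pthread_mutex_lock(&m);
	puts("locked with priority inheritance");
	pthread_mutex_unlock(&m);

	pthread_mutex_destroy(&m);
	pthread_mutexattr_destroy(&attr);
	return 0;
}
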
@@ -87,15 +114,19 @@ struct futex_q {
87 struct list_head list; 114 struct list_head list;
88 wait_queue_head_t waiters; 115 wait_queue_head_t waiters;
89 116
90 /* Which hash list lock to use. */ 117 /* Which hash list lock to use: */
91 spinlock_t *lock_ptr; 118 spinlock_t *lock_ptr;
92 119
93 /* Key which the futex is hashed on. */ 120 /* Key which the futex is hashed on: */
94 union futex_key key; 121 union futex_key key;
95 122
96 /* For fd, sigio sent using these. */ 123 /* For fd, sigio sent using these: */
97 int fd; 124 int fd;
98 struct file *filp; 125 struct file *filp;
126
127 /* Optional priority inheritance state: */
128 struct futex_pi_state *pi_state;
129 struct task_struct *task;
99}; 130};
100 131
101/* 132/*
@@ -144,8 +175,9 @@ static inline int match_futex(union futex_key *key1, union futex_key *key2)
144 * 175 *
145 * Should be called with &current->mm->mmap_sem but NOT any spinlocks. 176 * Should be called with &current->mm->mmap_sem but NOT any spinlocks.
146 */ 177 */
147static int get_futex_key(unsigned long uaddr, union futex_key *key) 178static int get_futex_key(u32 __user *uaddr, union futex_key *key)
148{ 179{
180 unsigned long address = (unsigned long)uaddr;
149 struct mm_struct *mm = current->mm; 181 struct mm_struct *mm = current->mm;
150 struct vm_area_struct *vma; 182 struct vm_area_struct *vma;
151 struct page *page; 183 struct page *page;
@@ -154,16 +186,16 @@ static int get_futex_key(unsigned long uaddr, union futex_key *key)
154 /* 186 /*
155 * The futex address must be "naturally" aligned. 187 * The futex address must be "naturally" aligned.
156 */ 188 */
157 key->both.offset = uaddr % PAGE_SIZE; 189 key->both.offset = address % PAGE_SIZE;
158 if (unlikely((key->both.offset % sizeof(u32)) != 0)) 190 if (unlikely((key->both.offset % sizeof(u32)) != 0))
159 return -EINVAL; 191 return -EINVAL;
160 uaddr -= key->both.offset; 192 address -= key->both.offset;
161 193
162 /* 194 /*
163 * The futex is hashed differently depending on whether 195 * The futex is hashed differently depending on whether
164 * it's in a shared or private mapping. So check vma first. 196 * it's in a shared or private mapping. So check vma first.
165 */ 197 */
166 vma = find_extend_vma(mm, uaddr); 198 vma = find_extend_vma(mm, address);
167 if (unlikely(!vma)) 199 if (unlikely(!vma))
168 return -EFAULT; 200 return -EFAULT;
169 201
@@ -184,7 +216,7 @@ static int get_futex_key(unsigned long uaddr, union futex_key *key)
184 */ 216 */
185 if (likely(!(vma->vm_flags & VM_MAYSHARE))) { 217 if (likely(!(vma->vm_flags & VM_MAYSHARE))) {
186 key->private.mm = mm; 218 key->private.mm = mm;
187 key->private.uaddr = uaddr; 219 key->private.address = address;
188 return 0; 220 return 0;
189 } 221 }
190 222
@@ -194,7 +226,7 @@ static int get_futex_key(unsigned long uaddr, union futex_key *key)
194 key->shared.inode = vma->vm_file->f_dentry->d_inode; 226 key->shared.inode = vma->vm_file->f_dentry->d_inode;
195 key->both.offset++; /* Bit 0 of offset indicates inode-based key. */ 227 key->both.offset++; /* Bit 0 of offset indicates inode-based key. */
196 if (likely(!(vma->vm_flags & VM_NONLINEAR))) { 228 if (likely(!(vma->vm_flags & VM_NONLINEAR))) {
197 key->shared.pgoff = (((uaddr - vma->vm_start) >> PAGE_SHIFT) 229 key->shared.pgoff = (((address - vma->vm_start) >> PAGE_SHIFT)
198 + vma->vm_pgoff); 230 + vma->vm_pgoff);
199 return 0; 231 return 0;
200 } 232 }
@@ -205,7 +237,7 @@ static int get_futex_key(unsigned long uaddr, union futex_key *key)
205 * from swap. But that's a lot of code to duplicate here 237 * from swap. But that's a lot of code to duplicate here
206 * for a rare case, so we simply fetch the page. 238 * for a rare case, so we simply fetch the page.
207 */ 239 */
208 err = get_user_pages(current, mm, uaddr, 1, 0, 0, &page, NULL); 240 err = get_user_pages(current, mm, address, 1, 0, 0, &page, NULL);
209 if (err >= 0) { 241 if (err >= 0) {
210 key->shared.pgoff = 242 key->shared.pgoff =
211 page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); 243 page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
@@ -246,18 +278,244 @@ static void drop_key_refs(union futex_key *key)
246 } 278 }
247} 279}
248 280
249static inline int get_futex_value_locked(int *dest, int __user *from) 281static inline int get_futex_value_locked(u32 *dest, u32 __user *from)
250{ 282{
251 int ret; 283 int ret;
252 284
253 inc_preempt_count(); 285 inc_preempt_count();
254 ret = __copy_from_user_inatomic(dest, from, sizeof(int)); 286 ret = __copy_from_user_inatomic(dest, from, sizeof(u32));
255 dec_preempt_count(); 287 dec_preempt_count();
256 288
257 return ret ? -EFAULT : 0; 289 return ret ? -EFAULT : 0;
258} 290}
259 291
260/* 292/*
293 * Fault handling. Called with current->mm->mmap_sem held.
294 */
295static int futex_handle_fault(unsigned long address, int attempt)
296{
297 struct vm_area_struct * vma;
298 struct mm_struct *mm = current->mm;
299
300 if (attempt >= 2 || !(vma = find_vma(mm, address)) ||
301 vma->vm_start > address || !(vma->vm_flags & VM_WRITE))
302 return -EFAULT;
303
304 switch (handle_mm_fault(mm, vma, address, 1)) {
305 case VM_FAULT_MINOR:
306 current->min_flt++;
307 break;
308 case VM_FAULT_MAJOR:
309 current->maj_flt++;
310 break;
311 default:
312 return -EFAULT;
313 }
314 return 0;
315}
316
317/*
318 * PI code:
319 */
320static int refill_pi_state_cache(void)
321{
322 struct futex_pi_state *pi_state;
323
324 if (likely(current->pi_state_cache))
325 return 0;
326
327 pi_state = kmalloc(sizeof(*pi_state), GFP_KERNEL);
328
329 if (!pi_state)
330 return -ENOMEM;
331
332 memset(pi_state, 0, sizeof(*pi_state));
333 INIT_LIST_HEAD(&pi_state->list);
334 /* pi_mutex gets initialized later */
335 pi_state->owner = NULL;
336 atomic_set(&pi_state->refcount, 1);
337
338 current->pi_state_cache = pi_state;
339
340 return 0;
341}
342
343static struct futex_pi_state * alloc_pi_state(void)
344{
345 struct futex_pi_state *pi_state = current->pi_state_cache;
346
347 WARN_ON(!pi_state);
348 current->pi_state_cache = NULL;
349
350 return pi_state;
351}
352
353static void free_pi_state(struct futex_pi_state *pi_state)
354{
355 if (!atomic_dec_and_test(&pi_state->refcount))
356 return;
357
358 /*
359 * If pi_state->owner is NULL, the owner is most probably dying
360 * and has cleaned up the pi_state already
361 */
362 if (pi_state->owner) {
363 spin_lock_irq(&pi_state->owner->pi_lock);
364 list_del_init(&pi_state->list);
365 spin_unlock_irq(&pi_state->owner->pi_lock);
366
367 rt_mutex_proxy_unlock(&pi_state->pi_mutex, pi_state->owner);
368 }
369
370 if (current->pi_state_cache)
371 kfree(pi_state);
372 else {
373 /*
374 * pi_state->list is already empty.
375 * clear pi_state->owner.
376 * refcount is at 0 - put it back to 1.
377 */
378 pi_state->owner = NULL;
379 atomic_set(&pi_state->refcount, 1);
380 current->pi_state_cache = pi_state;
381 }
382}
383
384/*
385 * Look up the task based on what TID userspace gave us.
386 * We don't trust it.
387 */
388static struct task_struct * futex_find_get_task(pid_t pid)
389{
390 struct task_struct *p;
391
392 read_lock(&tasklist_lock);
393 p = find_task_by_pid(pid);
394 if (!p)
395 goto out_unlock;
396 if ((current->euid != p->euid) && (current->euid != p->uid)) {
397 p = NULL;
398 goto out_unlock;
399 }
400 if (p->state == EXIT_ZOMBIE || p->exit_state == EXIT_ZOMBIE) {
401 p = NULL;
402 goto out_unlock;
403 }
404 get_task_struct(p);
405out_unlock:
406 read_unlock(&tasklist_lock);
407
408 return p;
409}
410
411/*
412 * This task is holding PI mutexes at exit time => bad.
413 * Kernel cleans up PI-state, but userspace is likely hosed.
414 * (Robust-futex cleanup is separate and might save the day for userspace.)
415 */
416void exit_pi_state_list(struct task_struct *curr)
417{
418 struct futex_hash_bucket *hb;
419 struct list_head *next, *head = &curr->pi_state_list;
420 struct futex_pi_state *pi_state;
421 union futex_key key;
422
423 /*
424 * We are a ZOMBIE and nobody can enqueue itself on
425 * pi_state_list anymore, but we have to be careful
426 * versus waiters unqueueing themselves
427 */
428 spin_lock_irq(&curr->pi_lock);
429 while (!list_empty(head)) {
430
431 next = head->next;
432 pi_state = list_entry(next, struct futex_pi_state, list);
433 key = pi_state->key;
434 spin_unlock_irq(&curr->pi_lock);
435
436 hb = hash_futex(&key);
437 spin_lock(&hb->lock);
438
439 spin_lock_irq(&curr->pi_lock);
440 if (head->next != next) {
441 spin_unlock(&hb->lock);
442 continue;
443 }
444
445 list_del_init(&pi_state->list);
446
447 WARN_ON(pi_state->owner != curr);
448
449 pi_state->owner = NULL;
450 spin_unlock_irq(&curr->pi_lock);
451
452 rt_mutex_unlock(&pi_state->pi_mutex);
453
454 spin_unlock(&hb->lock);
455
456 spin_lock_irq(&curr->pi_lock);
457 }
458 spin_unlock_irq(&curr->pi_lock);
459}
460
461static int
462lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me)
463{
464 struct futex_pi_state *pi_state = NULL;
465 struct futex_q *this, *next;
466 struct list_head *head;
467 struct task_struct *p;
468 pid_t pid;
469
470 head = &hb->chain;
471
472 list_for_each_entry_safe(this, next, head, list) {
473 if (match_futex (&this->key, &me->key)) {
474 /*
475 * Another waiter already exists - bump up
476 * the refcount and return its pi_state:
477 */
478 pi_state = this->pi_state;
479 atomic_inc(&pi_state->refcount);
480 me->pi_state = pi_state;
481
482 return 0;
483 }
484 }
485
486 /*
487 * We are the first waiter - try to look up the real owner and
488 * attach the new pi_state to it:
489 */
490 pid = uval & FUTEX_TID_MASK;
491 p = futex_find_get_task(pid);
492 if (!p)
493 return -ESRCH;
494
495 pi_state = alloc_pi_state();
496
497 /*
498 * Initialize the pi_mutex in locked state and make 'p'
499 * the owner of it:
500 */
501 rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p);
502
503 /* Store the key for possible exit cleanups: */
504 pi_state->key = me->key;
505
506 spin_lock_irq(&p->pi_lock);
507 list_add(&pi_state->list, &p->pi_state_list);
508 pi_state->owner = p;
509 spin_unlock_irq(&p->pi_lock);
510
511 put_task_struct(p);
512
513 me->pi_state = pi_state;
514
515 return 0;
516}
517
518/*
261 * The hash bucket lock must be held when this is called. 519 * The hash bucket lock must be held when this is called.
262 * Afterwards, the futex_q must not be accessed. 520 * Afterwards, the futex_q must not be accessed.
263 */ 521 */
@@ -284,16 +542,80 @@ static void wake_futex(struct futex_q *q)
284 q->lock_ptr = NULL; 542 q->lock_ptr = NULL;
285} 543}
286 544
545static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
546{
547 struct task_struct *new_owner;
548 struct futex_pi_state *pi_state = this->pi_state;
549 u32 curval, newval;
550
551 if (!pi_state)
552 return -EINVAL;
553
554 new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
555
556 /*
557 * This happens when we have stolen the lock and the original
558 * pending owner did not enqueue itself back on the rt_mutex.
559 * That's not a tragedy. It just means that a lock waiter
560 * is in flight; we make the futex_q waiter the pending owner.
561 */
562 if (!new_owner)
563 new_owner = this->task;
564
565 /*
566 * We pass it to the next owner. (The WAITERS bit is always
567 * kept enabled while there is PI state around. We must also
568 * preserve the owner died bit.)
569 */
570 newval = (uval & FUTEX_OWNER_DIED) | FUTEX_WAITERS | new_owner->pid;
571
572 inc_preempt_count();
573 curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
574 dec_preempt_count();
575
576 if (curval == -EFAULT)
577 return -EFAULT;
578 if (curval != uval)
579 return -EINVAL;
580
581 list_del_init(&pi_state->owner->pi_state_list);
582 list_add(&pi_state->list, &new_owner->pi_state_list);
583 pi_state->owner = new_owner;
584 rt_mutex_unlock(&pi_state->pi_mutex);
585
586 return 0;
587}
588
589static int unlock_futex_pi(u32 __user *uaddr, u32 uval)
590{
591 u32 oldval;
592
593 /*
594 * There is no waiter, so we unlock the futex. The owner died
595 * bit does not have to be preserved here. We are the owner:
596 */
597 inc_preempt_count();
598 oldval = futex_atomic_cmpxchg_inatomic(uaddr, uval, 0);
599 dec_preempt_count();
600
601 if (oldval == -EFAULT)
602 return oldval;
603 if (oldval != uval)
604 return -EAGAIN;
605
606 return 0;
607}
608
287/* 609/*
288 * Wake up all waiters hashed on the physical page that is mapped 610 * Wake up all waiters hashed on the physical page that is mapped
289 * to this virtual address: 611 * to this virtual address:
290 */ 612 */
291static int futex_wake(unsigned long uaddr, int nr_wake) 613static int futex_wake(u32 __user *uaddr, int nr_wake)
292{ 614{
293 union futex_key key; 615 struct futex_hash_bucket *hb;
294 struct futex_hash_bucket *bh;
295 struct list_head *head;
296 struct futex_q *this, *next; 616 struct futex_q *this, *next;
617 struct list_head *head;
618 union futex_key key;
297 int ret; 619 int ret;
298 620
299 down_read(&current->mm->mmap_sem); 621 down_read(&current->mm->mmap_sem);
@@ -302,19 +624,21 @@ static int futex_wake(unsigned long uaddr, int nr_wake)
302 if (unlikely(ret != 0)) 624 if (unlikely(ret != 0))
303 goto out; 625 goto out;
304 626
305 bh = hash_futex(&key); 627 hb = hash_futex(&key);
306 spin_lock(&bh->lock); 628 spin_lock(&hb->lock);
307 head = &bh->chain; 629 head = &hb->chain;
308 630
309 list_for_each_entry_safe(this, next, head, list) { 631 list_for_each_entry_safe(this, next, head, list) {
310 if (match_futex (&this->key, &key)) { 632 if (match_futex (&this->key, &key)) {
633 if (this->pi_state)
634 return -EINVAL;
311 wake_futex(this); 635 wake_futex(this);
312 if (++ret >= nr_wake) 636 if (++ret >= nr_wake)
313 break; 637 break;
314 } 638 }
315 } 639 }
316 640
317 spin_unlock(&bh->lock); 641 spin_unlock(&hb->lock);
318out: 642out:
319 up_read(&current->mm->mmap_sem); 643 up_read(&current->mm->mmap_sem);
320 return ret; 644 return ret;
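
futex_wake() only walks the hash bucket that get_futex_key() selected, so a wakeup costs time proportional to the waiters on that key rather than a global scan. The matching user-space pattern is FUTEX_WAIT/FUTEX_WAKE around an atomically updated word; a hedged raw-syscall sketch (build with -pthread):

#include <linux/futex.h>
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

static uint32_t flag;			/* the futex word, 0 = "not ready" */

static long futex(uint32_t *uaddr, int op, uint32_t val)
{
	return syscall(SYS_futex, uaddr, op, val, NULL, NULL, 0);
}

static void *waiter(void *arg)
{
	/* Sleeps only if flag is still 0 at call time -- the same val check
	 * futex_wait() performs under the hash-bucket lock (EWOULDBLOCK
	 * otherwise), so a wakeup between the load and the call is not lost. */
	while (__atomic_load_n(&flag, __ATOMIC_ACQUIRE) == 0)
		futex(&flag, FUTEX_WAIT, 0);
	puts("woken");
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, waiter, NULL);
	sleep(1);
	__atomic_store_n(&flag, 1, __ATOMIC_RELEASE);
	futex(&flag, FUTEX_WAKE, 1);	/* wake at most one waiter on this word */
	pthread_join(t, NULL);
	return 0;
}
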
@@ -324,10 +648,12 @@ out:
324 * Wake up all waiters hashed on the physical page that is mapped 648 * Wake up all waiters hashed on the physical page that is mapped
325 * to this virtual address: 649 * to this virtual address:
326 */ 650 */
327static int futex_wake_op(unsigned long uaddr1, unsigned long uaddr2, int nr_wake, int nr_wake2, int op) 651static int
652futex_wake_op(u32 __user *uaddr1, u32 __user *uaddr2,
653 int nr_wake, int nr_wake2, int op)
328{ 654{
329 union futex_key key1, key2; 655 union futex_key key1, key2;
330 struct futex_hash_bucket *bh1, *bh2; 656 struct futex_hash_bucket *hb1, *hb2;
331 struct list_head *head; 657 struct list_head *head;
332 struct futex_q *this, *next; 658 struct futex_q *this, *next;
333 int ret, op_ret, attempt = 0; 659 int ret, op_ret, attempt = 0;
@@ -342,27 +668,29 @@ retryfull:
342 if (unlikely(ret != 0)) 668 if (unlikely(ret != 0))
343 goto out; 669 goto out;
344 670
345 bh1 = hash_futex(&key1); 671 hb1 = hash_futex(&key1);
346 bh2 = hash_futex(&key2); 672 hb2 = hash_futex(&key2);
347 673
348retry: 674retry:
349 if (bh1 < bh2) 675 if (hb1 < hb2)
350 spin_lock(&bh1->lock); 676 spin_lock(&hb1->lock);
351 spin_lock(&bh2->lock); 677 spin_lock(&hb2->lock);
352 if (bh1 > bh2) 678 if (hb1 > hb2)
353 spin_lock(&bh1->lock); 679 spin_lock(&hb1->lock);
354 680
355 op_ret = futex_atomic_op_inuser(op, (int __user *)uaddr2); 681 op_ret = futex_atomic_op_inuser(op, uaddr2);
356 if (unlikely(op_ret < 0)) { 682 if (unlikely(op_ret < 0)) {
357 int dummy; 683 u32 dummy;
358 684
359 spin_unlock(&bh1->lock); 685 spin_unlock(&hb1->lock);
360 if (bh1 != bh2) 686 if (hb1 != hb2)
361 spin_unlock(&bh2->lock); 687 spin_unlock(&hb2->lock);
362 688
363#ifndef CONFIG_MMU 689#ifndef CONFIG_MMU
364 /* we don't get EFAULT from MMU faults if we don't have an MMU, 690 /*
365 * but we might get them from range checking */ 691 * we don't get EFAULT from MMU faults if we don't have an MMU,
692 * but we might get them from range checking
693 */
366 ret = op_ret; 694 ret = op_ret;
367 goto out; 695 goto out;
368#endif 696#endif
@@ -372,47 +700,34 @@ retry:
372 goto out; 700 goto out;
373 } 701 }
374 702
375 /* futex_atomic_op_inuser needs to both read and write 703 /*
704 * futex_atomic_op_inuser needs to both read and write
376 * *(int __user *)uaddr2, but we can't modify it 705 * *(int __user *)uaddr2, but we can't modify it
377 * non-atomically. Therefore, if get_user below is not 706 * non-atomically. Therefore, if get_user below is not
378 * enough, we need to handle the fault ourselves, while 707 * enough, we need to handle the fault ourselves, while
379 * still holding the mmap_sem. */ 708 * still holding the mmap_sem.
709 */
380 if (attempt++) { 710 if (attempt++) {
381 struct vm_area_struct * vma; 711 if (futex_handle_fault((unsigned long)uaddr2,
382 struct mm_struct *mm = current->mm; 712 attempt))
383
384 ret = -EFAULT;
385 if (attempt >= 2 ||
386 !(vma = find_vma(mm, uaddr2)) ||
387 vma->vm_start > uaddr2 ||
388 !(vma->vm_flags & VM_WRITE))
389 goto out;
390
391 switch (handle_mm_fault(mm, vma, uaddr2, 1)) {
392 case VM_FAULT_MINOR:
393 current->min_flt++;
394 break;
395 case VM_FAULT_MAJOR:
396 current->maj_flt++;
397 break;
398 default:
399 goto out; 713 goto out;
400 }
401 goto retry; 714 goto retry;
402 } 715 }
403 716
404 /* If we would have faulted, release mmap_sem, 717 /*
405 * fault it in and start all over again. */ 718 * If we would have faulted, release mmap_sem,
719 * fault it in and start all over again.
720 */
406 up_read(&current->mm->mmap_sem); 721 up_read(&current->mm->mmap_sem);
407 722
408 ret = get_user(dummy, (int __user *)uaddr2); 723 ret = get_user(dummy, uaddr2);
409 if (ret) 724 if (ret)
410 return ret; 725 return ret;
411 726
412 goto retryfull; 727 goto retryfull;
413 } 728 }
414 729
415 head = &bh1->chain; 730 head = &hb1->chain;
416 731
417 list_for_each_entry_safe(this, next, head, list) { 732 list_for_each_entry_safe(this, next, head, list) {
418 if (match_futex (&this->key, &key1)) { 733 if (match_futex (&this->key, &key1)) {
@@ -423,7 +738,7 @@ retry:
423 } 738 }
424 739
425 if (op_ret > 0) { 740 if (op_ret > 0) {
426 head = &bh2->chain; 741 head = &hb2->chain;
427 742
428 op_ret = 0; 743 op_ret = 0;
429 list_for_each_entry_safe(this, next, head, list) { 744 list_for_each_entry_safe(this, next, head, list) {
@@ -436,9 +751,9 @@ retry:
436 ret += op_ret; 751 ret += op_ret;
437 } 752 }
438 753
439 spin_unlock(&bh1->lock); 754 spin_unlock(&hb1->lock);
440 if (bh1 != bh2) 755 if (hb1 != hb2)
441 spin_unlock(&bh2->lock); 756 spin_unlock(&hb2->lock);
442out: 757out:
443 up_read(&current->mm->mmap_sem); 758 up_read(&current->mm->mmap_sem);
444 return ret; 759 return ret;
@@ -448,11 +763,11 @@ out:
448 * Requeue all waiters hashed on one physical page to another 763 * Requeue all waiters hashed on one physical page to another
449 * physical page. 764 * physical page.
450 */ 765 */
451static int futex_requeue(unsigned long uaddr1, unsigned long uaddr2, 766static int futex_requeue(u32 __user *uaddr1, u32 __user *uaddr2,
452 int nr_wake, int nr_requeue, int *valp) 767 int nr_wake, int nr_requeue, u32 *cmpval)
453{ 768{
454 union futex_key key1, key2; 769 union futex_key key1, key2;
455 struct futex_hash_bucket *bh1, *bh2; 770 struct futex_hash_bucket *hb1, *hb2;
456 struct list_head *head1; 771 struct list_head *head1;
457 struct futex_q *this, *next; 772 struct futex_q *this, *next;
458 int ret, drop_count = 0; 773 int ret, drop_count = 0;
@@ -467,68 +782,72 @@ static int futex_requeue(unsigned long uaddr1, unsigned long uaddr2,
467 if (unlikely(ret != 0)) 782 if (unlikely(ret != 0))
468 goto out; 783 goto out;
469 784
470 bh1 = hash_futex(&key1); 785 hb1 = hash_futex(&key1);
471 bh2 = hash_futex(&key2); 786 hb2 = hash_futex(&key2);
472 787
473 if (bh1 < bh2) 788 if (hb1 < hb2)
474 spin_lock(&bh1->lock); 789 spin_lock(&hb1->lock);
475 spin_lock(&bh2->lock); 790 spin_lock(&hb2->lock);
476 if (bh1 > bh2) 791 if (hb1 > hb2)
477 spin_lock(&bh1->lock); 792 spin_lock(&hb1->lock);
478 793
479 if (likely(valp != NULL)) { 794 if (likely(cmpval != NULL)) {
480 int curval; 795 u32 curval;
481 796
482 ret = get_futex_value_locked(&curval, (int __user *)uaddr1); 797 ret = get_futex_value_locked(&curval, uaddr1);
483 798
484 if (unlikely(ret)) { 799 if (unlikely(ret)) {
485 spin_unlock(&bh1->lock); 800 spin_unlock(&hb1->lock);
486 if (bh1 != bh2) 801 if (hb1 != hb2)
487 spin_unlock(&bh2->lock); 802 spin_unlock(&hb2->lock);
488 803
489 /* If we would have faulted, release mmap_sem, fault 804 /*
805 * If we would have faulted, release mmap_sem, fault
490 * it in and start all over again. 806 * it in and start all over again.
491 */ 807 */
492 up_read(&current->mm->mmap_sem); 808 up_read(&current->mm->mmap_sem);
493 809
494 ret = get_user(curval, (int __user *)uaddr1); 810 ret = get_user(curval, uaddr1);
495 811
496 if (!ret) 812 if (!ret)
497 goto retry; 813 goto retry;
498 814
499 return ret; 815 return ret;
500 } 816 }
501 if (curval != *valp) { 817 if (curval != *cmpval) {
502 ret = -EAGAIN; 818 ret = -EAGAIN;
503 goto out_unlock; 819 goto out_unlock;
504 } 820 }
505 } 821 }
506 822
507 head1 = &bh1->chain; 823 head1 = &hb1->chain;
508 list_for_each_entry_safe(this, next, head1, list) { 824 list_for_each_entry_safe(this, next, head1, list) {
509 if (!match_futex (&this->key, &key1)) 825 if (!match_futex (&this->key, &key1))
510 continue; 826 continue;
511 if (++ret <= nr_wake) { 827 if (++ret <= nr_wake) {
512 wake_futex(this); 828 wake_futex(this);
513 } else { 829 } else {
514 list_move_tail(&this->list, &bh2->chain); 830 /*
515 this->lock_ptr = &bh2->lock; 831 * If key1 and key2 hash to the same bucket, no need to
832 * requeue.
833 */
834 if (likely(head1 != &hb2->chain)) {
835 list_move_tail(&this->list, &hb2->chain);
836 this->lock_ptr = &hb2->lock;
837 }
516 this->key = key2; 838 this->key = key2;
517 get_key_refs(&key2); 839 get_key_refs(&key2);
518 drop_count++; 840 drop_count++;
519 841
520 if (ret - nr_wake >= nr_requeue) 842 if (ret - nr_wake >= nr_requeue)
521 break; 843 break;
522 /* Make sure to stop if key1 == key2 */
523 if (head1 == &bh2->chain && head1 != &next->list)
524 head1 = &this->list;
525 } 844 }
526 } 845 }
527 846
528out_unlock: 847out_unlock:
529 spin_unlock(&bh1->lock); 848 spin_unlock(&hb1->lock);
530 if (bh1 != bh2) 849 if (hb1 != hb2)
531 spin_unlock(&bh2->lock); 850 spin_unlock(&hb2->lock);
532 851
533 /* drop_key_refs() must be called outside the spinlocks. */ 852 /* drop_key_refs() must be called outside the spinlocks. */
534 while (--drop_count >= 0) 853 while (--drop_count >= 0)
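
futex_requeue() (the kernel side of FUTEX_CMP_REQUEUE) now also skips the list move when both keys hash to the same bucket. A hedged user-space sketch of requeueing one waiter from one word to another (raw syscall, crude sleep-based synchronization, build with -pthread):

#include <limits.h>
#include <linux/futex.h>
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

static uint32_t f1, f2;			/* two futex words, both 0 */

static long futex(uint32_t *uaddr, int op, uint32_t val,
		  unsigned long val2, uint32_t *uaddr2, uint32_t val3)
{
	return syscall(SYS_futex, uaddr, op, val, val2, uaddr2, val3);
}

static void *waiter(void *arg)
{
	futex(&f1, FUTEX_WAIT, 0, 0, NULL, 0);	/* sleep while f1 == 0 */
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, waiter, NULL);
	sleep(1);			/* crude: let the waiter block on f1 */

	/* Wake 0, requeue up to INT_MAX waiters from f1 to f2, but only if
	 * f1 still contains 0 -- the cmpval check in futex_requeue(). */
	if (futex(&f1, FUTEX_CMP_REQUEUE, 0, INT_MAX, &f2, 0) < 0)
		perror("FUTEX_CMP_REQUEUE");

	__atomic_store_n(&f1, 1, __ATOMIC_RELEASE);	/* unblock a late waiter */
	futex(&f2, FUTEX_WAKE, 1, 0, NULL, 0);	/* wake it on its new key */
	futex(&f1, FUTEX_WAKE, 1, 0, NULL, 0);	/* fallback if it never moved */
	pthread_join(t, NULL);
	return 0;
}
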
@@ -543,7 +862,7 @@ out:
543static inline struct futex_hash_bucket * 862static inline struct futex_hash_bucket *
544queue_lock(struct futex_q *q, int fd, struct file *filp) 863queue_lock(struct futex_q *q, int fd, struct file *filp)
545{ 864{
546 struct futex_hash_bucket *bh; 865 struct futex_hash_bucket *hb;
547 866
548 q->fd = fd; 867 q->fd = fd;
549 q->filp = filp; 868 q->filp = filp;
@@ -551,23 +870,24 @@ queue_lock(struct futex_q *q, int fd, struct file *filp)
551 init_waitqueue_head(&q->waiters); 870 init_waitqueue_head(&q->waiters);
552 871
553 get_key_refs(&q->key); 872 get_key_refs(&q->key);
554 bh = hash_futex(&q->key); 873 hb = hash_futex(&q->key);
555 q->lock_ptr = &bh->lock; 874 q->lock_ptr = &hb->lock;
556 875
557 spin_lock(&bh->lock); 876 spin_lock(&hb->lock);
558 return bh; 877 return hb;
559} 878}
560 879
561static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *bh) 880static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
562{ 881{
563 list_add_tail(&q->list, &bh->chain); 882 list_add_tail(&q->list, &hb->chain);
564 spin_unlock(&bh->lock); 883 q->task = current;
884 spin_unlock(&hb->lock);
565} 885}
566 886
567static inline void 887static inline void
568queue_unlock(struct futex_q *q, struct futex_hash_bucket *bh) 888queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb)
569{ 889{
570 spin_unlock(&bh->lock); 890 spin_unlock(&hb->lock);
571 drop_key_refs(&q->key); 891 drop_key_refs(&q->key);
572} 892}
573 893
@@ -579,16 +899,17 @@ queue_unlock(struct futex_q *q, struct futex_hash_bucket *bh)
579/* The key must be already stored in q->key. */ 899/* The key must be already stored in q->key. */
580static void queue_me(struct futex_q *q, int fd, struct file *filp) 900static void queue_me(struct futex_q *q, int fd, struct file *filp)
581{ 901{
582 struct futex_hash_bucket *bh; 902 struct futex_hash_bucket *hb;
583 bh = queue_lock(q, fd, filp); 903
584 __queue_me(q, bh); 904 hb = queue_lock(q, fd, filp);
905 __queue_me(q, hb);
585} 906}
586 907
587/* Return 1 if we were still queued (ie. 0 means we were woken) */ 908/* Return 1 if we were still queued (ie. 0 means we were woken) */
588static int unqueue_me(struct futex_q *q) 909static int unqueue_me(struct futex_q *q)
589{ 910{
590 int ret = 0;
591 spinlock_t *lock_ptr; 911 spinlock_t *lock_ptr;
912 int ret = 0;
592 913
593 /* In the common case we don't take the spinlock, which is nice. */ 914 /* In the common case we don't take the spinlock, which is nice. */
594 retry: 915 retry:
@@ -614,6 +935,9 @@ static int unqueue_me(struct futex_q *q)
614 } 935 }
615 WARN_ON(list_empty(&q->list)); 936 WARN_ON(list_empty(&q->list));
616 list_del(&q->list); 937 list_del(&q->list);
938
939 BUG_ON(q->pi_state);
940
617 spin_unlock(lock_ptr); 941 spin_unlock(lock_ptr);
618 ret = 1; 942 ret = 1;
619 } 943 }
@@ -622,21 +946,42 @@ static int unqueue_me(struct futex_q *q)
622 return ret; 946 return ret;
623} 947}
624 948
625static int futex_wait(unsigned long uaddr, int val, unsigned long time) 949/*
950 * PI futexes cannot be requeued and must remove themselves from the
951 * hash bucket. The hash bucket lock is held on entry and dropped here.
952 */
953static void unqueue_me_pi(struct futex_q *q, struct futex_hash_bucket *hb)
626{ 954{
627 DECLARE_WAITQUEUE(wait, current); 955 WARN_ON(list_empty(&q->list));
628 int ret, curval; 956 list_del(&q->list);
957
958 BUG_ON(!q->pi_state);
959 free_pi_state(q->pi_state);
960 q->pi_state = NULL;
961
962 spin_unlock(&hb->lock);
963
964 drop_key_refs(&q->key);
965}
966
967static int futex_wait(u32 __user *uaddr, u32 val, unsigned long time)
968{
969 struct task_struct *curr = current;
970 DECLARE_WAITQUEUE(wait, curr);
971 struct futex_hash_bucket *hb;
629 struct futex_q q; 972 struct futex_q q;
630 struct futex_hash_bucket *bh; 973 u32 uval;
974 int ret;
631 975
976 q.pi_state = NULL;
632 retry: 977 retry:
633 down_read(&current->mm->mmap_sem); 978 down_read(&curr->mm->mmap_sem);
634 979
635 ret = get_futex_key(uaddr, &q.key); 980 ret = get_futex_key(uaddr, &q.key);
636 if (unlikely(ret != 0)) 981 if (unlikely(ret != 0))
637 goto out_release_sem; 982 goto out_release_sem;
638 983
639 bh = queue_lock(&q, -1, NULL); 984 hb = queue_lock(&q, -1, NULL);
640 985
641 /* 986 /*
642 * Access the page AFTER the futex is queued. 987 * Access the page AFTER the futex is queued.
@@ -658,37 +1003,35 @@ static int futex_wait(unsigned long uaddr, int val, unsigned long time)
658 * We hold the mmap semaphore, so the mapping cannot have changed 1003 * We hold the mmap semaphore, so the mapping cannot have changed
659 * since we looked it up in get_futex_key. 1004 * since we looked it up in get_futex_key.
660 */ 1005 */
661 1006 ret = get_futex_value_locked(&uval, uaddr);
662 ret = get_futex_value_locked(&curval, (int __user *)uaddr);
663 1007
664 if (unlikely(ret)) { 1008 if (unlikely(ret)) {
665 queue_unlock(&q, bh); 1009 queue_unlock(&q, hb);
666 1010
667 /* If we would have faulted, release mmap_sem, fault it in and 1011 /*
1012 * If we would have faulted, release mmap_sem, fault it in and
668 * start all over again. 1013 * start all over again.
669 */ 1014 */
670 up_read(&current->mm->mmap_sem); 1015 up_read(&curr->mm->mmap_sem);
671 1016
672 ret = get_user(curval, (int __user *)uaddr); 1017 ret = get_user(uval, uaddr);
673 1018
674 if (!ret) 1019 if (!ret)
675 goto retry; 1020 goto retry;
676 return ret; 1021 return ret;
677 } 1022 }
678 if (curval != val) { 1023 ret = -EWOULDBLOCK;
679 ret = -EWOULDBLOCK; 1024 if (uval != val)
680 queue_unlock(&q, bh); 1025 goto out_unlock_release_sem;
681 goto out_release_sem;
682 }
683 1026
684 /* Only actually queue if *uaddr contained val. */ 1027 /* Only actually queue if *uaddr contained val. */
685 __queue_me(&q, bh); 1028 __queue_me(&q, hb);
686 1029
687 /* 1030 /*
688 * Now the futex is queued and we have checked the data, we 1031 * Now the futex is queued and we have checked the data, we
689 * don't want to hold mmap_sem while we sleep. 1032 * don't want to hold mmap_sem while we sleep.
690 */ 1033 */
691 up_read(&current->mm->mmap_sem); 1034 up_read(&curr->mm->mmap_sem);
692 1035
693 /* 1036 /*
694 * There might have been scheduling since the queue_me(), as we 1037 * There might have been scheduling since the queue_me(), as we
@@ -720,12 +1063,421 @@ static int futex_wait(unsigned long uaddr, int val, unsigned long time)
720 return 0; 1063 return 0;
721 if (time == 0) 1064 if (time == 0)
722 return -ETIMEDOUT; 1065 return -ETIMEDOUT;
723 /* We expect signal_pending(current), but another thread may 1066 /*
724 * have handled it for us already. */ 1067 * We expect signal_pending(current), but another thread may
1068 * have handled it for us already.
1069 */
725 return -EINTR; 1070 return -EINTR;
726 1071
1072 out_unlock_release_sem:
1073 queue_unlock(&q, hb);
1074
727 out_release_sem: 1075 out_release_sem:
1076 up_read(&curr->mm->mmap_sem);
1077 return ret;
1078}
1079
1080/*
1081 * Userspace tried a 0 -> TID atomic transition of the futex value
1082 * and failed. The kernel side here does the whole locking operation:
1083 * if there are waiters then it will block, it does PI, etc. (Due to
1084 * races the kernel might see a 0 value of the futex too.)
1085 */
1086static int do_futex_lock_pi(u32 __user *uaddr, int detect, int trylock,
1087 struct hrtimer_sleeper *to)
1088{
1089 struct task_struct *curr = current;
1090 struct futex_hash_bucket *hb;
1091 u32 uval, newval, curval;
1092 struct futex_q q;
1093 int ret, attempt = 0;
1094
1095 if (refill_pi_state_cache())
1096 return -ENOMEM;
1097
1098 q.pi_state = NULL;
1099 retry:
1100 down_read(&curr->mm->mmap_sem);
1101
1102 ret = get_futex_key(uaddr, &q.key);
1103 if (unlikely(ret != 0))
1104 goto out_release_sem;
1105
1106 hb = queue_lock(&q, -1, NULL);
1107
1108 retry_locked:
1109 /*
1110 * To avoid races, we attempt to take the lock here again
1111 * (by doing a 0 -> TID atomic cmpxchg), while holding all
1112 * the locks. It will most likely not succeed.
1113 */
1114 newval = current->pid;
1115
1116 inc_preempt_count();
1117 curval = futex_atomic_cmpxchg_inatomic(uaddr, 0, newval);
1118 dec_preempt_count();
1119
1120 if (unlikely(curval == -EFAULT))
1121 goto uaddr_faulted;
1122
1123 /* We own the lock already */
1124 if (unlikely((curval & FUTEX_TID_MASK) == current->pid)) {
1125 if (!detect && 0)
1126 force_sig(SIGKILL, current);
1127 ret = -EDEADLK;
1128 goto out_unlock_release_sem;
1129 }
1130
1131 /*
1132 * Surprise - we got the lock. Just return
1133 * to userspace:
1134 */
1135 if (unlikely(!curval))
1136 goto out_unlock_release_sem;
1137
1138 uval = curval;
1139 newval = uval | FUTEX_WAITERS;
1140
1141 inc_preempt_count();
1142 curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
1143 dec_preempt_count();
1144
1145 if (unlikely(curval == -EFAULT))
1146 goto uaddr_faulted;
1147 if (unlikely(curval != uval))
1148 goto retry_locked;
1149
1150 /*
1150 * We don't have the lock. Look up the PI state (or create it if
1152 * we are the first waiter):
1153 */
1154 ret = lookup_pi_state(uval, hb, &q);
1155
1156 if (unlikely(ret)) {
1157 /*
1158 * There were no waiters and the owner task lookup
1159 * failed. When the OWNER_DIED bit is set, then we
1160 * know that this is a robust futex and we actually
1161 * take the lock. This is safe as we are protected by
1162 * the hash bucket lock. We also set the waiters bit
1163 * unconditionally here, to simplify glibc handling of
1164 * multiple tasks racing to acquire the lock and
1165 * cleanup the problems which were left by the dead
1166 * owner.
1167 */
1168 if (curval & FUTEX_OWNER_DIED) {
1169 uval = newval;
1170 newval = current->pid |
1171 FUTEX_OWNER_DIED | FUTEX_WAITERS;
1172
1173 inc_preempt_count();
1174 curval = futex_atomic_cmpxchg_inatomic(uaddr,
1175 uval, newval);
1176 dec_preempt_count();
1177
1178 if (unlikely(curval == -EFAULT))
1179 goto uaddr_faulted;
1180 if (unlikely(curval != uval))
1181 goto retry_locked;
1182 ret = 0;
1183 }
1184 goto out_unlock_release_sem;
1185 }
1186
1187 /*
1188 * Only actually queue now that the atomic ops are done:
1189 */
1190 __queue_me(&q, hb);
1191
1192 /*
1193 * Now the futex is queued and we have checked the data, we
1194 * don't want to hold mmap_sem while we sleep.
1195 */
1196 up_read(&curr->mm->mmap_sem);
1197
1198 WARN_ON(!q.pi_state);
1199 /*
1200 * Block on the PI mutex:
1201 */
1202 if (!trylock)
1203 ret = rt_mutex_timed_lock(&q.pi_state->pi_mutex, to, 1);
1204 else {
1205 ret = rt_mutex_trylock(&q.pi_state->pi_mutex);
1206 /* Fixup the trylock return value: */
1207 ret = ret ? 0 : -EWOULDBLOCK;
1208 }
1209
1210 down_read(&curr->mm->mmap_sem);
1211 hb = queue_lock(&q, -1, NULL);
1212
1213 /*
1214 * Got the lock. We might not be the anticipated owner if we
1215 * did a lock-steal - fix up the PI-state in that case.
1216 */
1217 if (!ret && q.pi_state->owner != curr) {
1218 u32 newtid = current->pid | FUTEX_WAITERS;
1219
1220 /* Owner died? */
1221 if (q.pi_state->owner != NULL) {
1222 spin_lock_irq(&q.pi_state->owner->pi_lock);
1223 list_del_init(&q.pi_state->list);
1224 spin_unlock_irq(&q.pi_state->owner->pi_lock);
1225 } else
1226 newtid |= FUTEX_OWNER_DIED;
1227
1228 q.pi_state->owner = current;
1229
1230 spin_lock_irq(&current->pi_lock);
1231 list_add(&q.pi_state->list, &current->pi_state_list);
1232 spin_unlock_irq(&current->pi_lock);
1233
1234 /* Unqueue and drop the lock */
1235 unqueue_me_pi(&q, hb);
1236 up_read(&curr->mm->mmap_sem);
1237 /*
1238 * We own it, so we have to replace the pending owner
1239 * TID. This must be atomic as we have to preserve the
1240 * owner died bit here.
1241 */
1242 ret = get_user(uval, uaddr);
1243 while (!ret) {
1244 newval = (uval & FUTEX_OWNER_DIED) | newtid;
1245 curval = futex_atomic_cmpxchg_inatomic(uaddr,
1246 uval, newval);
1247 if (curval == -EFAULT)
1248 ret = -EFAULT;
1249 if (curval == uval)
1250 break;
1251 uval = curval;
1252 }
1253 } else {
1254 /*
1255 * Catch the rare case, where the lock was released
1256 * when we were on the way back before we locked
1257 * the hash bucket.
1258 */
1259 if (ret && q.pi_state->owner == curr) {
1260 if (rt_mutex_trylock(&q.pi_state->pi_mutex))
1261 ret = 0;
1262 }
1263 /* Unqueue and drop the lock */
1264 unqueue_me_pi(&q, hb);
1265 up_read(&curr->mm->mmap_sem);
1266 }
1267
1268 if (!detect && ret == -EDEADLK && 0)
1269 force_sig(SIGKILL, current);
1270
1271 return ret;
1272
1273 out_unlock_release_sem:
1274 queue_unlock(&q, hb);
1275
1276 out_release_sem:
1277 up_read(&curr->mm->mmap_sem);
1278 return ret;
1279
1280 uaddr_faulted:
1281 /*
1282 * We have to r/w *(int __user *)uaddr, but we can't modify it
1283 * non-atomically. Therefore, if get_user below is not
1284 * enough, we need to handle the fault ourselves, while
1285 * still holding the mmap_sem.
1286 */
1287 if (attempt++) {
1288 if (futex_handle_fault((unsigned long)uaddr, attempt))
1289 goto out_unlock_release_sem;
1290
1291 goto retry_locked;
1292 }
1293
1294 queue_unlock(&q, hb);
1295 up_read(&curr->mm->mmap_sem);
1296
1297 ret = get_user(uval, uaddr);
1298 if (!ret && (uval != -EFAULT))
1299 goto retry;
1300
1301 return ret;
1302}
1303
1304/*
1305 * Restart handler
1306 */
1307static long futex_lock_pi_restart(struct restart_block *restart)
1308{
1309 struct hrtimer_sleeper timeout, *to = NULL;
1310 int ret;
1311
1312 restart->fn = do_no_restart_syscall;
1313
1314 if (restart->arg2 || restart->arg3) {
1315 to = &timeout;
1316 hrtimer_init(&to->timer, CLOCK_REALTIME, HRTIMER_ABS);
1317 hrtimer_init_sleeper(to, current);
1318 to->timer.expires.tv64 = ((u64)restart->arg1 << 32) |
1319 (u64) restart->arg0;
1320 }
1321
1322 pr_debug("lock_pi restart: %p, %d\n",
1323 (u32 __user *)restart->arg0, current->pid);
1324
1325 ret = do_futex_lock_pi((u32 __user *)restart->arg0, restart->arg1,
1326 0, to);
1327
1328 if (ret != -EINTR)
1329 return ret;
1330
1331 restart->fn = futex_lock_pi_restart;
1332
1333 /* The other values are filled in */
1334 return -ERESTART_RESTARTBLOCK;
1335}
1336
1337/*
1338 * Called from the syscall entry below.
1339 */
1340static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
1341 long nsec, int trylock)
1342{
1343 struct hrtimer_sleeper timeout, *to = NULL;
1344 struct restart_block *restart;
1345 int ret;
1346
1347 if (sec != MAX_SCHEDULE_TIMEOUT) {
1348 to = &timeout;
1349 hrtimer_init(&to->timer, CLOCK_REALTIME, HRTIMER_ABS);
1350 hrtimer_init_sleeper(to, current);
1351 to->timer.expires = ktime_set(sec, nsec);
1352 }
1353
1354 ret = do_futex_lock_pi(uaddr, detect, trylock, to);
1355
1356 if (ret != -EINTR)
1357 return ret;
1358
1359 pr_debug("lock_pi interrupted: %p, %d\n", uaddr, current->pid);
1360
1361 restart = &current_thread_info()->restart_block;
1362 restart->fn = futex_lock_pi_restart;
1363 restart->arg0 = (unsigned long) uaddr;
1364 restart->arg1 = detect;
1365 if (to) {
1366 restart->arg2 = to->timer.expires.tv64 & 0xFFFFFFFF;
1367 restart->arg3 = to->timer.expires.tv64 >> 32;
1368 } else
1369 restart->arg2 = restart->arg3 = 0;
1370
1371 return -ERESTART_RESTARTBLOCK;
1372}
1373
1374/*
1375 * Userspace attempted a TID -> 0 atomic transition, and failed.
1376 * This is the in-kernel slowpath: we look up the PI state (if any),
1377 * and do the rt-mutex unlock.
1378 */
1379static int futex_unlock_pi(u32 __user *uaddr)
1380{
1381 struct futex_hash_bucket *hb;
1382 struct futex_q *this, *next;
1383 u32 uval;
1384 struct list_head *head;
1385 union futex_key key;
1386 int ret, attempt = 0;
1387
1388retry:
1389 if (get_user(uval, uaddr))
1390 return -EFAULT;
1391 /*
1392 * We release only a lock we actually own:
1393 */
1394 if ((uval & FUTEX_TID_MASK) != current->pid)
1395 return -EPERM;
1396 /*
1397 * First take all the futex related locks:
1398 */
1399 down_read(&current->mm->mmap_sem);
1400
1401 ret = get_futex_key(uaddr, &key);
1402 if (unlikely(ret != 0))
1403 goto out;
1404
1405 hb = hash_futex(&key);
1406 spin_lock(&hb->lock);
1407
1408retry_locked:
1409 /*
1410 * To avoid races, try to do the TID -> 0 atomic transition
1411 * again. If it succeeds then we can return without waking
1412 * anyone else up:
1413 */
1414 inc_preempt_count();
1415 uval = futex_atomic_cmpxchg_inatomic(uaddr, current->pid, 0);
1416 dec_preempt_count();
1417
1418 if (unlikely(uval == -EFAULT))
1419 goto pi_faulted;
1420 /*
1421 * Rare case: we managed to release the lock atomically,
1422 * no need to wake anyone else up:
1423 */
1424 if (unlikely(uval == current->pid))
1425 goto out_unlock;
1426
1427 /*
1428 * Ok, other tasks may need to be woken up - check waiters
1429 * and do the wakeup if necessary:
1430 */
1431 head = &hb->chain;
1432
1433 list_for_each_entry_safe(this, next, head, list) {
1434 if (!match_futex (&this->key, &key))
1435 continue;
1436 ret = wake_futex_pi(uaddr, uval, this);
1437 /*
1438 * The atomic access to the futex value
1439 * generated a pagefault, so retry the
1440 * user-access and the wakeup:
1441 */
1442 if (ret == -EFAULT)
1443 goto pi_faulted;
1444 goto out_unlock;
1445 }
1446 /*
1447 * No waiters - kernel unlocks the futex:
1448 */
1449 ret = unlock_futex_pi(uaddr, uval);
1450 if (ret == -EFAULT)
1451 goto pi_faulted;
1452
1453out_unlock:
1454 spin_unlock(&hb->lock);
1455out:
728 up_read(&current->mm->mmap_sem); 1456 up_read(&current->mm->mmap_sem);
1457
1458 return ret;
1459
1460pi_faulted:
1461 /*
1462 * We have to r/w *(int __user *)uaddr, but we can't modify it
1463 * non-atomically. Therefore, if get_user below is not
1464 * enough, we need to handle the fault ourselves, while
1465 * still holding the mmap_sem.
1466 */
1467 if (attempt++) {
1468 if (futex_handle_fault((unsigned long)uaddr, attempt))
1469 goto out_unlock;
1470
1471 goto retry_locked;
1472 }
1473
1474 spin_unlock(&hb->lock);
1475 up_read(&current->mm->mmap_sem);
1476
1477 ret = get_user(uval, uaddr);
1478 if (!ret && (uval != -EFAULT))
1479 goto retry;
1480
729 return ret; 1481 return ret;
730} 1482}
731 1483
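futex_unlock_pi() above is only the slowpath: in the uncontended case userspace is expected to release a PI futex with a single atomic TID -> 0 compare-and-swap and to enter the kernel only when that fails because waiter bits are set. A hedged userspace-side sketch of that protocol, assuming the updated linux/futex.h that defines FUTEX_UNLOCK_PI and a GCC __sync builtin for the cmpxchg:

#include <stdint.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/futex.h>

/* Userspace unlock protocol matched by futex_unlock_pi():
 * fast path is an atomic TID -> 0 transition, slow path asks the
 * kernel to hand the lock to the highest-priority waiter. */
static void pi_mutex_unlock(uint32_t *futex_word, uint32_t my_tid)
{
	if (__sync_bool_compare_and_swap(futex_word, my_tid, 0))
		return;		/* uncontended: no waiters recorded */

	syscall(SYS_futex, futex_word, FUTEX_UNLOCK_PI, 0, NULL, NULL, 0);
}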
@@ -735,6 +1487,7 @@ static int futex_close(struct inode *inode, struct file *filp)
735 1487
736 unqueue_me(q); 1488 unqueue_me(q);
737 kfree(q); 1489 kfree(q);
1490
738 return 0; 1491 return 0;
739} 1492}
740 1493
@@ -766,7 +1519,7 @@ static struct file_operations futex_fops = {
766 * Signal allows caller to avoid the race which would occur if they 1519 * Signal allows caller to avoid the race which would occur if they
767 * set the sigio stuff up afterwards. 1520 * set the sigio stuff up afterwards.
768 */ 1521 */
769static int futex_fd(unsigned long uaddr, int signal) 1522static int futex_fd(u32 __user *uaddr, int signal)
770{ 1523{
771 struct futex_q *q; 1524 struct futex_q *q;
772 struct file *filp; 1525 struct file *filp;
@@ -803,6 +1556,7 @@ static int futex_fd(unsigned long uaddr, int signal)
803 err = -ENOMEM; 1556 err = -ENOMEM;
804 goto error; 1557 goto error;
805 } 1558 }
1559 q->pi_state = NULL;
806 1560
807 down_read(&current->mm->mmap_sem); 1561 down_read(&current->mm->mmap_sem);
808 err = get_futex_key(uaddr, &q->key); 1562 err = get_futex_key(uaddr, &q->key);
@@ -840,7 +1594,7 @@ error:
840 * Implementation: user-space maintains a per-thread list of locks it 1594 * Implementation: user-space maintains a per-thread list of locks it
841 * is holding. Upon do_exit(), the kernel carefully walks this list, 1595 * is holding. Upon do_exit(), the kernel carefully walks this list,
842 * and marks all locks that are owned by this thread with the 1596 * and marks all locks that are owned by this thread with the
843 * FUTEX_OWNER_DEAD bit, and wakes up a waiter (if any). The list is 1597 * FUTEX_OWNER_DIED bit, and wakes up a waiter (if any). The list is
844 * always manipulated with the lock held, so the list is private and 1598 * always manipulated with the lock held, so the list is private and
845 * per-thread. Userspace also maintains a per-thread 'list_op_pending' 1599 * per-thread. Userspace also maintains a per-thread 'list_op_pending'
846 * field, to allow the kernel to clean up if the thread dies after 1600 * field, to allow the kernel to clean up if the thread dies after
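The walk described above only works if each thread has told the kernel where its list head lives. A hedged sketch of the userspace registration step, assuming a libc that exposes SYS_set_robust_list and the struct robust_list_head layout from linux/futex.h; the futex_offset value is application-specific:

#include <stddef.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/futex.h>

/* Per-thread robust list head: the kernel walks it at do_exit() and
 * sets FUTEX_OWNER_DIED on every lock still owned by the thread.
 * Must be registered once per thread. */
static __thread struct robust_list_head robust_head;

static int register_robust_list(void)
{
	robust_head.list.next = &robust_head.list;  /* empty circular list */
	robust_head.futex_offset = 0;               /* lock-word offset, app-specific */
	robust_head.list_op_pending = NULL;

	return syscall(SYS_set_robust_list, &robust_head, sizeof(robust_head));
}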
@@ -915,7 +1669,7 @@ err_unlock:
915 */ 1669 */
916int handle_futex_death(u32 __user *uaddr, struct task_struct *curr) 1670int handle_futex_death(u32 __user *uaddr, struct task_struct *curr)
917{ 1671{
918 u32 uval; 1672 u32 uval, nval;
919 1673
920retry: 1674retry:
921 if (get_user(uval, uaddr)) 1675 if (get_user(uval, uaddr))
@@ -932,12 +1686,16 @@ retry:
932 * thread-death.) The rest of the cleanup is done in 1686 * thread-death.) The rest of the cleanup is done in
933 * userspace. 1687 * userspace.
934 */ 1688 */
935 if (futex_atomic_cmpxchg_inatomic(uaddr, uval, 1689 nval = futex_atomic_cmpxchg_inatomic(uaddr, uval,
936 uval | FUTEX_OWNER_DIED) != uval) 1690 uval | FUTEX_OWNER_DIED);
1691 if (nval == -EFAULT)
1692 return -1;
1693
1694 if (nval != uval)
937 goto retry; 1695 goto retry;
938 1696
939 if (uval & FUTEX_WAITERS) 1697 if (uval & FUTEX_WAITERS)
940 futex_wake((unsigned long)uaddr, 1); 1698 futex_wake(uaddr, 1);
941 } 1699 }
942 return 0; 1700 return 0;
943} 1701}
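The change above turns the blind cmpxchg into the usual compare/exchange/re-check loop: the OWNER_DIED bit only counts as set once the cmpxchg returns the value it started from, and a fault aborts instead of spinning. A minimal sketch of that retry pattern in plain C, with GCC's __sync_val_compare_and_swap standing in for futex_atomic_cmpxchg_inatomic() and the FUTEX_OWNER_DIED value taken from linux/futex.h:

#include <stdint.h>

#define FUTEX_OWNER_DIED	0x40000000

/* Retry until the OWNER_DIED bit lands on a stable snapshot of the
 * word; a concurrent update between the read and the cmpxchg simply
 * restarts the loop. */
static uint32_t mark_owner_died(uint32_t *word)
{
	uint32_t uval, nval;

	do {
		uval = *word;
		nval = __sync_val_compare_and_swap(word, uval,
						   uval | FUTEX_OWNER_DIED);
	} while (nval != uval);

	return uval | FUTEX_OWNER_DIED;
}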
@@ -978,7 +1736,7 @@ void exit_robust_list(struct task_struct *curr)
978 while (entry != &head->list) { 1736 while (entry != &head->list) {
979 /* 1737 /*
980 * A pending lock might already be on the list, so 1738 * A pending lock might already be on the list, so
981 * dont process it twice: 1739 * don't process it twice:
982 */ 1740 */
983 if (entry != pending) 1741 if (entry != pending)
984 if (handle_futex_death((void *)entry + futex_offset, 1742 if (handle_futex_death((void *)entry + futex_offset,
@@ -999,8 +1757,8 @@ void exit_robust_list(struct task_struct *curr)
999 } 1757 }
1000} 1758}
1001 1759
1002long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout, 1760long do_futex(u32 __user *uaddr, int op, u32 val, unsigned long timeout,
1003 unsigned long uaddr2, int val2, int val3) 1761 u32 __user *uaddr2, u32 val2, u32 val3)
1004{ 1762{
1005 int ret; 1763 int ret;
1006 1764
@@ -1024,6 +1782,15 @@ long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout,
1024 case FUTEX_WAKE_OP: 1782 case FUTEX_WAKE_OP:
1025 ret = futex_wake_op(uaddr, uaddr2, val, val2, val3); 1783 ret = futex_wake_op(uaddr, uaddr2, val, val2, val3);
1026 break; 1784 break;
1785 case FUTEX_LOCK_PI:
1786 ret = futex_lock_pi(uaddr, val, timeout, val2, 0);
1787 break;
1788 case FUTEX_UNLOCK_PI:
1789 ret = futex_unlock_pi(uaddr);
1790 break;
1791 case FUTEX_TRYLOCK_PI:
1792 ret = futex_lock_pi(uaddr, 0, timeout, val2, 1);
1793 break;
1027 default: 1794 default:
1028 ret = -ENOSYS; 1795 ret = -ENOSYS;
1029 } 1796 }
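The three new opcodes give userspace a complete lock/trylock/unlock set for PI futexes. A lock tries the atomic 0 -> TID transition first and only falls into FUTEX_LOCK_PI on contention; a hedged sketch, assuming the updated linux/futex.h constants and a raw syscall() wrapper:

#include <stdint.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/futex.h>

/* Userspace lock protocol matched by FUTEX_LOCK_PI: the kernel is
 * only entered when the word is already owned, and then boosts the
 * owner through the rt-mutex attached to the futex's pi_state. */
static int pi_mutex_lock(uint32_t *futex_word, uint32_t my_tid)
{
	if (__sync_bool_compare_and_swap(futex_word, 0, my_tid))
		return 0;	/* fast path: lock was free */

	return syscall(SYS_futex, futex_word, FUTEX_LOCK_PI, 0,
		       NULL /* no timeout */, NULL, 0);
}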
@@ -1031,36 +1798,40 @@ long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout,
1031} 1798}
1032 1799
1033 1800
1034asmlinkage long sys_futex(u32 __user *uaddr, int op, int val, 1801asmlinkage long sys_futex(u32 __user *uaddr, int op, u32 val,
1035 struct timespec __user *utime, u32 __user *uaddr2, 1802 struct timespec __user *utime, u32 __user *uaddr2,
1036 int val3) 1803 u32 val3)
1037{ 1804{
1038 struct timespec t; 1805 struct timespec t;
1039 unsigned long timeout = MAX_SCHEDULE_TIMEOUT; 1806 unsigned long timeout = MAX_SCHEDULE_TIMEOUT;
1040 int val2 = 0; 1807 u32 val2 = 0;
1041 1808
1042 if (utime && (op == FUTEX_WAIT)) { 1809 if (utime && (op == FUTEX_WAIT || op == FUTEX_LOCK_PI)) {
1043 if (copy_from_user(&t, utime, sizeof(t)) != 0) 1810 if (copy_from_user(&t, utime, sizeof(t)) != 0)
1044 return -EFAULT; 1811 return -EFAULT;
1045 if (!timespec_valid(&t)) 1812 if (!timespec_valid(&t))
1046 return -EINVAL; 1813 return -EINVAL;
1047 timeout = timespec_to_jiffies(&t) + 1; 1814 if (op == FUTEX_WAIT)
1815 timeout = timespec_to_jiffies(&t) + 1;
1816 else {
1817 timeout = t.tv_sec;
1818 val2 = t.tv_nsec;
1819 }
1048 } 1820 }
1049 /* 1821 /*
1050 * requeue parameter in 'utime' if op == FUTEX_REQUEUE. 1822 * requeue parameter in 'utime' if op == FUTEX_REQUEUE.
1051 */ 1823 */
1052 if (op >= FUTEX_REQUEUE) 1824 if (op == FUTEX_REQUEUE || op == FUTEX_CMP_REQUEUE)
1053 val2 = (int) (unsigned long) utime; 1825 val2 = (u32) (unsigned long) utime;
1054 1826
1055 return do_futex((unsigned long)uaddr, op, val, timeout, 1827 return do_futex(uaddr, op, val, timeout, uaddr2, val2, val3);
1056 (unsigned long)uaddr2, val2, val3);
1057} 1828}
1058 1829
1059static struct super_block * 1830static int futexfs_get_sb(struct file_system_type *fs_type,
1060futexfs_get_sb(struct file_system_type *fs_type, 1831 int flags, const char *dev_name, void *data,
1061 int flags, const char *dev_name, void *data) 1832 struct vfsmount *mnt)
1062{ 1833{
1063 return get_sb_pseudo(fs_type, "futex", NULL, 0xBAD1DEA); 1834 return get_sb_pseudo(fs_type, "futex", NULL, 0xBAD1DEA, mnt);
1064} 1835}
1065 1836
1066static struct file_system_type futex_fs_type = { 1837static struct file_system_type futex_fs_type = {
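Note how the timeout argument is now interpreted per opcode: FUTEX_WAIT keeps a relative timespec rounded up to jiffies, while FUTEX_LOCK_PI treats tv_sec/tv_nsec as an absolute CLOCK_REALTIME deadline fed into an HRTIMER_ABS sleeper. A hedged sketch of how a caller would build each form; the clock_gettime()-based deadline is a caller convention assumed here, not something the patch mandates:

#include <stdint.h>
#include <time.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/futex.h>

/* FUTEX_WAIT: relative timeout, converted to jiffies by the kernel. */
static int wait_half_second(uint32_t *word, uint32_t expected)
{
	struct timespec rel = { .tv_sec = 0, .tv_nsec = 500 * 1000 * 1000 };

	return syscall(SYS_futex, word, FUTEX_WAIT, expected, &rel, NULL, 0);
}

/* FUTEX_LOCK_PI: absolute CLOCK_REALTIME deadline; tv_sec becomes the
 * 'timeout' argument and tv_nsec is passed through val2. */
static int lock_pi_one_second(uint32_t *word)
{
	struct timespec abs;

	clock_gettime(CLOCK_REALTIME, &abs);
	abs.tv_sec += 1;

	return syscall(SYS_futex, word, FUTEX_LOCK_PI, 0, &abs, NULL, 0);
}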
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index 1ab6a0ea3d14..d1d92b441fb7 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -129,16 +129,20 @@ asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val,
129 unsigned long timeout = MAX_SCHEDULE_TIMEOUT; 129 unsigned long timeout = MAX_SCHEDULE_TIMEOUT;
130 int val2 = 0; 130 int val2 = 0;
131 131
132 if (utime && (op == FUTEX_WAIT)) { 132 if (utime && (op == FUTEX_WAIT || op == FUTEX_LOCK_PI)) {
133 if (get_compat_timespec(&t, utime)) 133 if (get_compat_timespec(&t, utime))
134 return -EFAULT; 134 return -EFAULT;
135 if (!timespec_valid(&t)) 135 if (!timespec_valid(&t))
136 return -EINVAL; 136 return -EINVAL;
137 timeout = timespec_to_jiffies(&t) + 1; 137 if (op == FUTEX_WAIT)
138 timeout = timespec_to_jiffies(&t) + 1;
139 else {
140 timeout = t.tv_sec;
141 val2 = t.tv_nsec;
142 }
138 } 143 }
139 if (op >= FUTEX_REQUEUE) 144 if (op == FUTEX_REQUEUE || op == FUTEX_CMP_REQUEUE)
140 val2 = (int) (unsigned long) utime; 145 val2 = (int) (unsigned long) utime;
141 146
142 return do_futex((unsigned long)uaddr, op, val, timeout, 147 return do_futex(uaddr, op, val, timeout, uaddr2, val2, val3);
143 (unsigned long)uaddr2, val2, val3);
144} 148}
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 01fa2ae98a85..8d3dc29ef41a 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -98,7 +98,6 @@ static DEFINE_PER_CPU(struct hrtimer_base, hrtimer_bases[MAX_HRTIMER_BASES]) =
98 98
99/** 99/**
100 * ktime_get_ts - get the monotonic clock in timespec format 100 * ktime_get_ts - get the monotonic clock in timespec format
101 *
102 * @ts: pointer to timespec variable 101 * @ts: pointer to timespec variable
103 * 102 *
104 * The function calculates the monotonic clock from the realtime 103 * The function calculates the monotonic clock from the realtime
@@ -238,7 +237,6 @@ lock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
238# ifndef CONFIG_KTIME_SCALAR 237# ifndef CONFIG_KTIME_SCALAR
239/** 238/**
240 * ktime_add_ns - Add a scalar nanoseconds value to a ktime_t variable 239 * ktime_add_ns - Add a scalar nanoseconds value to a ktime_t variable
241 *
242 * @kt: addend 240 * @kt: addend
243 * @nsec: the scalar nsec value to add 241 * @nsec: the scalar nsec value to add
244 * 242 *
@@ -299,7 +297,6 @@ void unlock_hrtimer_base(const struct hrtimer *timer, unsigned long *flags)
299 297
300/** 298/**
301 * hrtimer_forward - forward the timer expiry 299 * hrtimer_forward - forward the timer expiry
302 *
303 * @timer: hrtimer to forward 300 * @timer: hrtimer to forward
304 * @now: forward past this time 301 * @now: forward past this time
305 * @interval: the interval to forward 302 * @interval: the interval to forward
@@ -393,7 +390,7 @@ static void __remove_hrtimer(struct hrtimer *timer, struct hrtimer_base *base)
393 if (base->first == &timer->node) 390 if (base->first == &timer->node)
394 base->first = rb_next(&timer->node); 391 base->first = rb_next(&timer->node);
395 rb_erase(&timer->node, &base->active); 392 rb_erase(&timer->node, &base->active);
396 timer->node.rb_parent = HRTIMER_INACTIVE; 393 rb_set_parent(&timer->node, &timer->node);
397} 394}
398 395
399/* 396/*
@@ -411,7 +408,6 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_base *base)
411 408
412/** 409/**
413 * hrtimer_start - (re)start an relative timer on the current CPU 410 * hrtimer_start - (re)start an relative timer on the current CPU
414 *
415 * @timer: the timer to be added 411 * @timer: the timer to be added
416 * @tim: expiry time 412 * @tim: expiry time
417 * @mode: expiry mode: absolute (HRTIMER_ABS) or relative (HRTIMER_REL) 413 * @mode: expiry mode: absolute (HRTIMER_ABS) or relative (HRTIMER_REL)
@@ -460,14 +456,13 @@ EXPORT_SYMBOL_GPL(hrtimer_start);
460 456
461/** 457/**
462 * hrtimer_try_to_cancel - try to deactivate a timer 458 * hrtimer_try_to_cancel - try to deactivate a timer
463 *
464 * @timer: hrtimer to stop 459 * @timer: hrtimer to stop
465 * 460 *
466 * Returns: 461 * Returns:
467 * 0 when the timer was not active 462 * 0 when the timer was not active
468 * 1 when the timer was active 463 * 1 when the timer was active
469 * -1 when the timer is currently executing the callback function and 464
470 * can not be stopped 465 * cannot be stopped
471 */ 466 */
472int hrtimer_try_to_cancel(struct hrtimer *timer) 467int hrtimer_try_to_cancel(struct hrtimer *timer)
473{ 468{
@@ -489,7 +484,6 @@ EXPORT_SYMBOL_GPL(hrtimer_try_to_cancel);
489 484
490/** 485/**
491 * hrtimer_cancel - cancel a timer and wait for the handler to finish. 486 * hrtimer_cancel - cancel a timer and wait for the handler to finish.
492 *
493 * @timer: the timer to be cancelled 487 * @timer: the timer to be cancelled
494 * 488 *
495 * Returns: 489 * Returns:
@@ -510,7 +504,6 @@ EXPORT_SYMBOL_GPL(hrtimer_cancel);
510 504
511/** 505/**
512 * hrtimer_get_remaining - get remaining time for the timer 506 * hrtimer_get_remaining - get remaining time for the timer
513 *
514 * @timer: the timer to read 507 * @timer: the timer to read
515 */ 508 */
516ktime_t hrtimer_get_remaining(const struct hrtimer *timer) 509ktime_t hrtimer_get_remaining(const struct hrtimer *timer)
@@ -564,7 +557,6 @@ ktime_t hrtimer_get_next_event(void)
564 557
565/** 558/**
566 * hrtimer_init - initialize a timer to the given clock 559 * hrtimer_init - initialize a timer to the given clock
567 *
568 * @timer: the timer to be initialized 560 * @timer: the timer to be initialized
569 * @clock_id: the clock to be used 561 * @clock_id: the clock to be used
570 * @mode: timer mode abs/rel 562 * @mode: timer mode abs/rel
@@ -576,19 +568,18 @@ void hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
576 568
577 memset(timer, 0, sizeof(struct hrtimer)); 569 memset(timer, 0, sizeof(struct hrtimer));
578 570
579 bases = per_cpu(hrtimer_bases, raw_smp_processor_id()); 571 bases = __raw_get_cpu_var(hrtimer_bases);
580 572
581 if (clock_id == CLOCK_REALTIME && mode != HRTIMER_ABS) 573 if (clock_id == CLOCK_REALTIME && mode != HRTIMER_ABS)
582 clock_id = CLOCK_MONOTONIC; 574 clock_id = CLOCK_MONOTONIC;
583 575
584 timer->base = &bases[clock_id]; 576 timer->base = &bases[clock_id];
585 timer->node.rb_parent = HRTIMER_INACTIVE; 577 rb_set_parent(&timer->node, &timer->node);
586} 578}
587EXPORT_SYMBOL_GPL(hrtimer_init); 579EXPORT_SYMBOL_GPL(hrtimer_init);
588 580
589/** 581/**
590 * hrtimer_get_res - get the timer resolution for a clock 582 * hrtimer_get_res - get the timer resolution for a clock
591 *
592 * @which_clock: which clock to query 583 * @which_clock: which clock to query
593 * @tp: pointer to timespec variable to store the resolution 584 * @tp: pointer to timespec variable to store the resolution
594 * 585 *
@@ -599,7 +590,7 @@ int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp)
599{ 590{
600 struct hrtimer_base *bases; 591 struct hrtimer_base *bases;
601 592
602 bases = per_cpu(hrtimer_bases, raw_smp_processor_id()); 593 bases = __raw_get_cpu_var(hrtimer_bases);
603 *tp = ktime_to_timespec(bases[which_clock].resolution); 594 *tp = ktime_to_timespec(bases[which_clock].resolution);
604 595
605 return 0; 596 return 0;
@@ -842,7 +833,7 @@ static void migrate_hrtimers(int cpu)
842} 833}
843#endif /* CONFIG_HOTPLUG_CPU */ 834#endif /* CONFIG_HOTPLUG_CPU */
844 835
845static int hrtimer_cpu_notify(struct notifier_block *self, 836static int __devinit hrtimer_cpu_notify(struct notifier_block *self,
846 unsigned long action, void *hcpu) 837 unsigned long action, void *hcpu)
847{ 838{
848 long cpu = (long)hcpu; 839 long cpu = (long)hcpu;
@@ -866,7 +857,7 @@ static int hrtimer_cpu_notify(struct notifier_block *self,
866 return NOTIFY_OK; 857 return NOTIFY_OK;
867} 858}
868 859
869static struct notifier_block hrtimers_nb = { 860static struct notifier_block __devinitdata hrtimers_nb = {
870 .notifier_call = hrtimer_cpu_notify, 861 .notifier_call = hrtimer_cpu_notify,
871}; 862};
872 863
diff --git a/kernel/intermodule.c b/kernel/intermodule.c
deleted file mode 100644
index 55b1e5b85db9..000000000000
--- a/kernel/intermodule.c
+++ /dev/null
@@ -1,184 +0,0 @@
1/* Deprecated, do not use. Moved from module.c to here. --RR */
2
3/* Written by Keith Owens <kaos@ocs.com.au> Oct 2000 */
4#include <linux/module.h>
5#include <linux/kmod.h>
6#include <linux/spinlock.h>
7#include <linux/list.h>
8#include <linux/slab.h>
9
10/* inter_module functions are always available, even when the kernel is
11 * compiled without modules. Consumers of inter_module_xxx routines
12 * will always work, even when both are built into the kernel, this
13 * approach removes lots of #ifdefs in mainline code.
14 */
15
16static struct list_head ime_list = LIST_HEAD_INIT(ime_list);
17static DEFINE_SPINLOCK(ime_lock);
18static int kmalloc_failed;
19
20struct inter_module_entry {
21 struct list_head list;
22 const char *im_name;
23 struct module *owner;
24 const void *userdata;
25};
26
27/**
28 * inter_module_register - register a new set of inter module data.
29 * @im_name: an arbitrary string to identify the data, must be unique
30 * @owner: module that is registering the data, always use THIS_MODULE
31 * @userdata: pointer to arbitrary userdata to be registered
32 *
33 * Description: Check that the im_name has not already been registered,
34 * complain if it has. For new data, add it to the inter_module_entry
35 * list.
36 */
37void inter_module_register(const char *im_name, struct module *owner, const void *userdata)
38{
39 struct list_head *tmp;
40 struct inter_module_entry *ime, *ime_new;
41
42 if (!(ime_new = kzalloc(sizeof(*ime), GFP_KERNEL))) {
43 /* Overloaded kernel, not fatal */
44 printk(KERN_ERR
45 "Aiee, inter_module_register: cannot kmalloc entry for '%s'\n",
46 im_name);
47 kmalloc_failed = 1;
48 return;
49 }
50 ime_new->im_name = im_name;
51 ime_new->owner = owner;
52 ime_new->userdata = userdata;
53
54 spin_lock(&ime_lock);
55 list_for_each(tmp, &ime_list) {
56 ime = list_entry(tmp, struct inter_module_entry, list);
57 if (strcmp(ime->im_name, im_name) == 0) {
58 spin_unlock(&ime_lock);
59 kfree(ime_new);
60 /* Program logic error, fatal */
61 printk(KERN_ERR "inter_module_register: duplicate im_name '%s'", im_name);
62 BUG();
63 }
64 }
65 list_add(&(ime_new->list), &ime_list);
66 spin_unlock(&ime_lock);
67}
68
69/**
70 * inter_module_unregister - unregister a set of inter module data.
71 * @im_name: an arbitrary string to identify the data, must be unique
72 *
73 * Description: Check that the im_name has been registered, complain if
74 * it has not. For existing data, remove it from the
75 * inter_module_entry list.
76 */
77void inter_module_unregister(const char *im_name)
78{
79 struct list_head *tmp;
80 struct inter_module_entry *ime;
81
82 spin_lock(&ime_lock);
83 list_for_each(tmp, &ime_list) {
84 ime = list_entry(tmp, struct inter_module_entry, list);
85 if (strcmp(ime->im_name, im_name) == 0) {
86 list_del(&(ime->list));
87 spin_unlock(&ime_lock);
88 kfree(ime);
89 return;
90 }
91 }
92 spin_unlock(&ime_lock);
93 if (kmalloc_failed) {
94 printk(KERN_ERR
95 "inter_module_unregister: no entry for '%s', "
96 "probably caused by previous kmalloc failure\n",
97 im_name);
98 return;
99 }
100 else {
101 /* Program logic error, fatal */
102 printk(KERN_ERR "inter_module_unregister: no entry for '%s'", im_name);
103 BUG();
104 }
105}
106
107/**
108 * inter_module_get - return arbitrary userdata from another module.
109 * @im_name: an arbitrary string to identify the data, must be unique
110 *
111 * Description: If the im_name has not been registered, return NULL.
112 * Try to increment the use count on the owning module, if that fails
113 * then return NULL. Otherwise return the userdata.
114 */
115static const void *inter_module_get(const char *im_name)
116{
117 struct list_head *tmp;
118 struct inter_module_entry *ime;
119 const void *result = NULL;
120
121 spin_lock(&ime_lock);
122 list_for_each(tmp, &ime_list) {
123 ime = list_entry(tmp, struct inter_module_entry, list);
124 if (strcmp(ime->im_name, im_name) == 0) {
125 if (try_module_get(ime->owner))
126 result = ime->userdata;
127 break;
128 }
129 }
130 spin_unlock(&ime_lock);
131 return(result);
132}
133
134/**
135 * inter_module_get_request - im get with automatic request_module.
136 * @im_name: an arbitrary string to identify the data, must be unique
137 * @modname: module that is expected to register im_name
138 *
139 * Description: If inter_module_get fails, do request_module then retry.
140 */
141const void *inter_module_get_request(const char *im_name, const char *modname)
142{
143 const void *result = inter_module_get(im_name);
144 if (!result) {
145 request_module("%s", modname);
146 result = inter_module_get(im_name);
147 }
148 return(result);
149}
150
151/**
152 * inter_module_put - release use of data from another module.
153 * @im_name: an arbitrary string to identify the data, must be unique
154 *
155 * Description: If the im_name has not been registered, complain,
156 * otherwise decrement the use count on the owning module.
157 */
158void inter_module_put(const char *im_name)
159{
160 struct list_head *tmp;
161 struct inter_module_entry *ime;
162
163 spin_lock(&ime_lock);
164 list_for_each(tmp, &ime_list) {
165 ime = list_entry(tmp, struct inter_module_entry, list);
166 if (strcmp(ime->im_name, im_name) == 0) {
167 if (ime->owner)
168 module_put(ime->owner);
169 spin_unlock(&ime_lock);
170 return;
171 }
172 }
173 spin_unlock(&ime_lock);
174 printk(KERN_ERR "inter_module_put: no entry for '%s'", im_name);
175 BUG();
176}
177
178EXPORT_SYMBOL(inter_module_register);
179EXPORT_SYMBOL(inter_module_unregister);
180EXPORT_SYMBOL(inter_module_get_request);
181EXPORT_SYMBOL(inter_module_put);
182
183MODULE_LICENSE("GPL");
184
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile
index 9f77f50d8143..1dab0ac3f797 100644
--- a/kernel/irq/Makefile
+++ b/kernel/irq/Makefile
@@ -1,5 +1,5 @@
1 1
2obj-y := handle.o manage.o spurious.o 2obj-y := handle.o manage.o spurious.o resend.o chip.o
3obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o 3obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o
4obj-$(CONFIG_PROC_FS) += proc.o 4obj-$(CONFIG_PROC_FS) += proc.o
5obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o 5obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o
diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c
index 3467097ca61a..533068cfb607 100644
--- a/kernel/irq/autoprobe.c
+++ b/kernel/irq/autoprobe.c
@@ -11,12 +11,14 @@
11#include <linux/interrupt.h> 11#include <linux/interrupt.h>
12#include <linux/delay.h> 12#include <linux/delay.h>
13 13
14#include "internals.h"
15
14/* 16/*
15 * Autodetection depends on the fact that any interrupt that 17 * Autodetection depends on the fact that any interrupt that
16 * comes in on to an unassigned handler will get stuck with 18 * comes in on to an unassigned handler will get stuck with
17 * "IRQ_WAITING" cleared and the interrupt disabled. 19 * "IRQ_WAITING" cleared and the interrupt disabled.
18 */ 20 */
19static DECLARE_MUTEX(probe_sem); 21static DEFINE_MUTEX(probing_active);
20 22
21/** 23/**
22 * probe_irq_on - begin an interrupt autodetect 24 * probe_irq_on - begin an interrupt autodetect
@@ -27,11 +29,11 @@ static DECLARE_MUTEX(probe_sem);
27 */ 29 */
28unsigned long probe_irq_on(void) 30unsigned long probe_irq_on(void)
29{ 31{
30 unsigned long val; 32 struct irq_desc *desc;
31 irq_desc_t *desc; 33 unsigned long mask;
32 unsigned int i; 34 unsigned int i;
33 35
34 down(&probe_sem); 36 mutex_lock(&probing_active);
35 /* 37 /*
36 * something may have generated an irq long ago and we want to 38 * something may have generated an irq long ago and we want to
37 * flush such a longstanding irq before considering it as spurious. 39 * flush such a longstanding irq before considering it as spurious.
@@ -40,8 +42,21 @@ unsigned long probe_irq_on(void)
40 desc = irq_desc + i; 42 desc = irq_desc + i;
41 43
42 spin_lock_irq(&desc->lock); 44 spin_lock_irq(&desc->lock);
43 if (!irq_desc[i].action) 45 if (!desc->action && !(desc->status & IRQ_NOPROBE)) {
44 irq_desc[i].handler->startup(i); 46 /*
47 * An old-style architecture might still have
48 * the handle_bad_irq handler there:
49 */
50 compat_irq_chip_set_default_handler(desc);
51
52 /*
53 * Some chips need to know about probing in
54 * progress:
55 */
56 if (desc->chip->set_type)
57 desc->chip->set_type(i, IRQ_TYPE_PROBE);
58 desc->chip->startup(i);
59 }
45 spin_unlock_irq(&desc->lock); 60 spin_unlock_irq(&desc->lock);
46 } 61 }
47 62
@@ -57,9 +72,9 @@ unsigned long probe_irq_on(void)
57 desc = irq_desc + i; 72 desc = irq_desc + i;
58 73
59 spin_lock_irq(&desc->lock); 74 spin_lock_irq(&desc->lock);
60 if (!desc->action) { 75 if (!desc->action && !(desc->status & IRQ_NOPROBE)) {
61 desc->status |= IRQ_AUTODETECT | IRQ_WAITING; 76 desc->status |= IRQ_AUTODETECT | IRQ_WAITING;
62 if (desc->handler->startup(i)) 77 if (desc->chip->startup(i))
63 desc->status |= IRQ_PENDING; 78 desc->status |= IRQ_PENDING;
64 } 79 }
65 spin_unlock_irq(&desc->lock); 80 spin_unlock_irq(&desc->lock);
@@ -73,11 +88,11 @@ unsigned long probe_irq_on(void)
73 /* 88 /*
74 * Now filter out any obviously spurious interrupts 89 * Now filter out any obviously spurious interrupts
75 */ 90 */
76 val = 0; 91 mask = 0;
77 for (i = 0; i < NR_IRQS; i++) { 92 for (i = 0; i < NR_IRQS; i++) {
78 irq_desc_t *desc = irq_desc + i;
79 unsigned int status; 93 unsigned int status;
80 94
95 desc = irq_desc + i;
81 spin_lock_irq(&desc->lock); 96 spin_lock_irq(&desc->lock);
82 status = desc->status; 97 status = desc->status;
83 98
@@ -85,17 +100,16 @@ unsigned long probe_irq_on(void)
85 /* It triggered already - consider it spurious. */ 100 /* It triggered already - consider it spurious. */
86 if (!(status & IRQ_WAITING)) { 101 if (!(status & IRQ_WAITING)) {
87 desc->status = status & ~IRQ_AUTODETECT; 102 desc->status = status & ~IRQ_AUTODETECT;
88 desc->handler->shutdown(i); 103 desc->chip->shutdown(i);
89 } else 104 } else
90 if (i < 32) 105 if (i < 32)
91 val |= 1 << i; 106 mask |= 1 << i;
92 } 107 }
93 spin_unlock_irq(&desc->lock); 108 spin_unlock_irq(&desc->lock);
94 } 109 }
95 110
96 return val; 111 return mask;
97} 112}
98
99EXPORT_SYMBOL(probe_irq_on); 113EXPORT_SYMBOL(probe_irq_on);
100 114
101/** 115/**
@@ -117,7 +131,7 @@ unsigned int probe_irq_mask(unsigned long val)
117 131
118 mask = 0; 132 mask = 0;
119 for (i = 0; i < NR_IRQS; i++) { 133 for (i = 0; i < NR_IRQS; i++) {
120 irq_desc_t *desc = irq_desc + i; 134 struct irq_desc *desc = irq_desc + i;
121 unsigned int status; 135 unsigned int status;
122 136
123 spin_lock_irq(&desc->lock); 137 spin_lock_irq(&desc->lock);
@@ -128,11 +142,11 @@ unsigned int probe_irq_mask(unsigned long val)
128 mask |= 1 << i; 142 mask |= 1 << i;
129 143
130 desc->status = status & ~IRQ_AUTODETECT; 144 desc->status = status & ~IRQ_AUTODETECT;
131 desc->handler->shutdown(i); 145 desc->chip->shutdown(i);
132 } 146 }
133 spin_unlock_irq(&desc->lock); 147 spin_unlock_irq(&desc->lock);
134 } 148 }
135 up(&probe_sem); 149 mutex_unlock(&probing_active);
136 150
137 return mask & val; 151 return mask & val;
138} 152}
@@ -160,7 +174,7 @@ int probe_irq_off(unsigned long val)
160 int i, irq_found = 0, nr_irqs = 0; 174 int i, irq_found = 0, nr_irqs = 0;
161 175
162 for (i = 0; i < NR_IRQS; i++) { 176 for (i = 0; i < NR_IRQS; i++) {
163 irq_desc_t *desc = irq_desc + i; 177 struct irq_desc *desc = irq_desc + i;
164 unsigned int status; 178 unsigned int status;
165 179
166 spin_lock_irq(&desc->lock); 180 spin_lock_irq(&desc->lock);
@@ -173,16 +187,16 @@ int probe_irq_off(unsigned long val)
173 nr_irqs++; 187 nr_irqs++;
174 } 188 }
175 desc->status = status & ~IRQ_AUTODETECT; 189 desc->status = status & ~IRQ_AUTODETECT;
176 desc->handler->shutdown(i); 190 desc->chip->shutdown(i);
177 } 191 }
178 spin_unlock_irq(&desc->lock); 192 spin_unlock_irq(&desc->lock);
179 } 193 }
180 up(&probe_sem); 194 mutex_unlock(&probing_active);
181 195
182 if (nr_irqs > 1) 196 if (nr_irqs > 1)
183 irq_found = -irq_found; 197 irq_found = -irq_found;
198
184 return irq_found; 199 return irq_found;
185} 200}
186
187EXPORT_SYMBOL(probe_irq_off); 201EXPORT_SYMBOL(probe_irq_off);
188 202
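The mutex rename above does not change the driver-visible probing protocol: a driver still brackets a forced device interrupt between probe_irq_on() and probe_irq_off(). A hedged sketch of that sequence; trigger_board_interrupt() is a stand-in for whatever device-specific poke raises the line:

#include <linux/interrupt.h>
#include <linux/delay.h>

/* Device-specific and purely illustrative: */
extern void trigger_board_interrupt(void);

/* Classic autoprobe sequence, serialized kernel-wide by the
 * probing_active mutex taken in probe_irq_on(). */
static int mydrv_probe_irq(void)
{
	unsigned long mask;
	int irq;

	mask = probe_irq_on();		/* start autodetection */
	trigger_board_interrupt();	/* make the device raise its line */
	mdelay(10);			/* give the interrupt time to arrive */
	irq = probe_irq_off(mask);	/* >0: found, 0: none, <0: several */

	return irq;
}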
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
new file mode 100644
index 000000000000..4a0952d9458b
--- /dev/null
+++ b/kernel/irq/chip.c
@@ -0,0 +1,525 @@
1/*
2 * linux/kernel/irq/chip.c
3 *
4 * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar
5 * Copyright (C) 2005-2006, Thomas Gleixner, Russell King
6 *
7 * This file contains the core interrupt handling code, for irq-chip
8 * based architectures.
9 *
10 * Detailed information is available in Documentation/DocBook/genericirq
11 */
12
13#include <linux/irq.h>
14#include <linux/module.h>
15#include <linux/interrupt.h>
16#include <linux/kernel_stat.h>
17
18#include "internals.h"
19
20/**
21 * set_irq_chip - set the irq chip for an irq
22 * @irq: irq number
23 * @chip: pointer to irq chip description structure
24 */
25int set_irq_chip(unsigned int irq, struct irq_chip *chip)
26{
27 struct irq_desc *desc;
28 unsigned long flags;
29
30 if (irq >= NR_IRQS) {
31 printk(KERN_ERR "Trying to install chip for IRQ%d\n", irq);
32 WARN_ON(1);
33 return -EINVAL;
34 }
35
36 if (!chip)
37 chip = &no_irq_chip;
38
39 desc = irq_desc + irq;
40 spin_lock_irqsave(&desc->lock, flags);
41 irq_chip_set_defaults(chip);
42 desc->chip = chip;
43 /*
44 * For compatibility only:
45 */
46 desc->chip = chip;
47 spin_unlock_irqrestore(&desc->lock, flags);
48
49 return 0;
50}
51EXPORT_SYMBOL(set_irq_chip);
52
53/**
54 * set_irq_type - set the irq type for an irq
55 * @irq: irq number
56 * @type: interrupt type - see include/linux/interrupt.h
57 */
58int set_irq_type(unsigned int irq, unsigned int type)
59{
60 struct irq_desc *desc;
61 unsigned long flags;
62 int ret = -ENXIO;
63
64 if (irq >= NR_IRQS) {
65 printk(KERN_ERR "Trying to set irq type for IRQ%d\n", irq);
66 return -ENODEV;
67 }
68
69 desc = irq_desc + irq;
70 if (desc->chip->set_type) {
71 spin_lock_irqsave(&desc->lock, flags);
72 ret = desc->chip->set_type(irq, type);
73 spin_unlock_irqrestore(&desc->lock, flags);
74 }
75 return ret;
76}
77EXPORT_SYMBOL(set_irq_type);
78
79/**
80 * set_irq_data - set irq type data for an irq
81 * @irq: Interrupt number
82 * @data: Pointer to interrupt specific data
83 *
84 * Set the hardware irq controller data for an irq
85 */
86int set_irq_data(unsigned int irq, void *data)
87{
88 struct irq_desc *desc;
89 unsigned long flags;
90
91 if (irq >= NR_IRQS) {
92 printk(KERN_ERR
93 "Trying to install controller data for IRQ%d\n", irq);
94 return -EINVAL;
95 }
96
97 desc = irq_desc + irq;
98 spin_lock_irqsave(&desc->lock, flags);
99 desc->handler_data = data;
100 spin_unlock_irqrestore(&desc->lock, flags);
101 return 0;
102}
103EXPORT_SYMBOL(set_irq_data);
104
105/**
106 * set_irq_chip_data - set irq chip data for an irq
107 * @irq: Interrupt number
108 * @data: Pointer to chip specific data
109 *
110 * Set the hardware irq chip data for an irq
111 */
112int set_irq_chip_data(unsigned int irq, void *data)
113{
114 struct irq_desc *desc = irq_desc + irq;
115 unsigned long flags;
116
117 if (irq >= NR_IRQS || !desc->chip) {
118 printk(KERN_ERR "BUG: bad set_irq_chip_data(IRQ#%d)\n", irq);
119 return -EINVAL;
120 }
121
122 spin_lock_irqsave(&desc->lock, flags);
123 desc->chip_data = data;
124 spin_unlock_irqrestore(&desc->lock, flags);
125
126 return 0;
127}
128EXPORT_SYMBOL(set_irq_chip_data);
129
130/*
131 * default enable function
132 */
133static void default_enable(unsigned int irq)
134{
135 struct irq_desc *desc = irq_desc + irq;
136
137 desc->chip->unmask(irq);
138 desc->status &= ~IRQ_MASKED;
139}
140
141/*
142 * default disable function
143 */
144static void default_disable(unsigned int irq)
145{
146 struct irq_desc *desc = irq_desc + irq;
147
148 if (!(desc->status & IRQ_DELAYED_DISABLE))
149 irq_desc[irq].chip->mask(irq);
150}
151
152/*
153 * default startup function
154 */
155static unsigned int default_startup(unsigned int irq)
156{
157 irq_desc[irq].chip->enable(irq);
158
159 return 0;
160}
161
162/*
163 * Fixup enable/disable function pointers
164 */
165void irq_chip_set_defaults(struct irq_chip *chip)
166{
167 if (!chip->enable)
168 chip->enable = default_enable;
169 if (!chip->disable)
170 chip->disable = default_disable;
171 if (!chip->startup)
172 chip->startup = default_startup;
173 if (!chip->shutdown)
174 chip->shutdown = chip->disable;
175 if (!chip->name)
176 chip->name = chip->typename;
177}
178
179static inline void mask_ack_irq(struct irq_desc *desc, int irq)
180{
181 if (desc->chip->mask_ack)
182 desc->chip->mask_ack(irq);
183 else {
184 desc->chip->mask(irq);
185 desc->chip->ack(irq);
186 }
187}
188
189/**
190 * handle_simple_irq - Simple and software-decoded IRQs.
191 * @irq: the interrupt number
192 * @desc: the interrupt description structure for this irq
193 * @regs: pointer to a register structure
194 *
195 * Simple interrupts are either sent from a demultiplexing interrupt
196 * handler or come from hardware, where no interrupt hardware control
197 * is necessary.
198 *
199 * Note: The caller is expected to handle the ack, clear, mask and
200 * unmask issues if necessary.
201 */
202void fastcall
203handle_simple_irq(unsigned int irq, struct irq_desc *desc, struct pt_regs *regs)
204{
205 struct irqaction *action;
206 irqreturn_t action_ret;
207 const unsigned int cpu = smp_processor_id();
208
209 spin_lock(&desc->lock);
210
211 if (unlikely(desc->status & IRQ_INPROGRESS))
212 goto out_unlock;
213 desc->status &= ~(IRQ_REPLAY | IRQ_WAITING);
214 kstat_cpu(cpu).irqs[irq]++;
215
216 action = desc->action;
217 if (unlikely(!action || (desc->status & IRQ_DISABLED)))
218 goto out_unlock;
219
220 desc->status |= IRQ_INPROGRESS;
221 spin_unlock(&desc->lock);
222
223 action_ret = handle_IRQ_event(irq, regs, action);
224 if (!noirqdebug)
225 note_interrupt(irq, desc, action_ret, regs);
226
227 spin_lock(&desc->lock);
228 desc->status &= ~IRQ_INPROGRESS;
229out_unlock:
230 spin_unlock(&desc->lock);
231}
232
233/**
234 * handle_level_irq - Level type irq handler
235 * @irq: the interrupt number
236 * @desc: the interrupt description structure for this irq
237 * @regs: pointer to a register structure
238 *
239 * Level type interrupts are active as long as the hardware line has
240 * the active level. This may require masking the interrupt and unmasking
241 * it after the associated handler has acknowledged the device, so that
242 * the interrupt line goes back to inactive.
243 */
244void fastcall
245handle_level_irq(unsigned int irq, struct irq_desc *desc, struct pt_regs *regs)
246{
247 unsigned int cpu = smp_processor_id();
248 struct irqaction *action;
249 irqreturn_t action_ret;
250
251 spin_lock(&desc->lock);
252 mask_ack_irq(desc, irq);
253
254 if (unlikely(desc->status & IRQ_INPROGRESS))
255 goto out;
256 desc->status &= ~(IRQ_REPLAY | IRQ_WAITING);
257 kstat_cpu(cpu).irqs[irq]++;
258
259 /*
260 * If it's disabled or no action is available,
261 * keep it masked and get out of here
262 */
263 action = desc->action;
264 if (unlikely(!action || (desc->status & IRQ_DISABLED)))
265 goto out;
266
267 desc->status |= IRQ_INPROGRESS;
268 spin_unlock(&desc->lock);
269
270 action_ret = handle_IRQ_event(irq, regs, action);
271 if (!noirqdebug)
272 note_interrupt(irq, desc, action_ret, regs);
273
274 spin_lock(&desc->lock);
275 desc->status &= ~IRQ_INPROGRESS;
276out:
277 if (!(desc->status & IRQ_DISABLED) && desc->chip->unmask)
278 desc->chip->unmask(irq);
279 spin_unlock(&desc->lock);
280}
281
282/**
283 * handle_fasteoi_irq - irq handler for transparent controllers
284 * @irq: the interrupt number
285 * @desc: the interrupt description structure for this irq
286 * @regs: pointer to a register structure
287 *
288 * Only a single callback will be issued to the chip: an ->eoi()
289 * call when the interrupt has been serviced. This enables support
290 * for modern forms of interrupt handlers, which handle the flow
291 * details in hardware, transparently.
292 */
293void fastcall
294handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc,
295 struct pt_regs *regs)
296{
297 unsigned int cpu = smp_processor_id();
298 struct irqaction *action;
299 irqreturn_t action_ret;
300
301 spin_lock(&desc->lock);
302
303 if (unlikely(desc->status & IRQ_INPROGRESS))
304 goto out;
305
306 desc->status &= ~(IRQ_REPLAY | IRQ_WAITING);
307 kstat_cpu(cpu).irqs[irq]++;
308
309 /*
310 * If it's disabled or no action is available,
311 * keep it masked and get out of here
312 */
313 action = desc->action;
314 if (unlikely(!action || (desc->status & IRQ_DISABLED))) {
315 desc->status |= IRQ_PENDING;
316 goto out;
317 }
318
319 desc->status |= IRQ_INPROGRESS;
320 desc->status &= ~IRQ_PENDING;
321 spin_unlock(&desc->lock);
322
323 action_ret = handle_IRQ_event(irq, regs, action);
324 if (!noirqdebug)
325 note_interrupt(irq, desc, action_ret, regs);
326
327 spin_lock(&desc->lock);
328 desc->status &= ~IRQ_INPROGRESS;
329out:
330 desc->chip->eoi(irq);
331
332 spin_unlock(&desc->lock);
333}
334
335/**
336 * handle_edge_irq - edge type IRQ handler
337 * @irq: the interrupt number
338 * @desc: the interrupt description structure for this irq
339 * @regs: pointer to a register structure
340 *
341 * Interrupts occur on the falling and/or rising edge of a hardware
342 * signal. The occurrence is latched into the irq controller hardware
343 * and must be acked in order to be re-enabled. After the ack another
344 * interrupt can happen on the same source even before the first one
345 * is handled by the associated event handler. If this happens it
346 * might be necessary to disable (mask) the interrupt, depending on the
347 * controller hardware. This requires re-enabling the interrupt inside
348 * the loop which handles the interrupts that have arrived while
349 * the handler was running. If all pending interrupts are handled, the
350 * loop is left.
351 */
352void fastcall
353handle_edge_irq(unsigned int irq, struct irq_desc *desc, struct pt_regs *regs)
354{
355 const unsigned int cpu = smp_processor_id();
356
357 spin_lock(&desc->lock);
358
359 desc->status &= ~(IRQ_REPLAY | IRQ_WAITING);
360
361 /*
362 * If we're currently running this IRQ, or it's disabled,
363 * we shouldn't process the IRQ. Mark it pending, handle
364 * the necessary masking and go out
365 */
366 if (unlikely((desc->status & (IRQ_INPROGRESS | IRQ_DISABLED)) ||
367 !desc->action)) {
368 desc->status |= (IRQ_PENDING | IRQ_MASKED);
369 mask_ack_irq(desc, irq);
370 goto out_unlock;
371 }
372
373 kstat_cpu(cpu).irqs[irq]++;
374
375 /* Start handling the irq */
376 desc->chip->ack(irq);
377
378 /* Mark the IRQ currently in progress.*/
379 desc->status |= IRQ_INPROGRESS;
380
381 do {
382 struct irqaction *action = desc->action;
383 irqreturn_t action_ret;
384
385 if (unlikely(!action)) {
386 desc->chip->mask(irq);
387 goto out_unlock;
388 }
389
390 /*
391 * When another irq arrived while we were handling
392 * one, we could have masked the irq.
393 * Re-enable it, if it was not disabled in the meantime.
394 */
395 if (unlikely((desc->status &
396 (IRQ_PENDING | IRQ_MASKED | IRQ_DISABLED)) ==
397 (IRQ_PENDING | IRQ_MASKED))) {
398 desc->chip->unmask(irq);
399 desc->status &= ~IRQ_MASKED;
400 }
401
402 desc->status &= ~IRQ_PENDING;
403 spin_unlock(&desc->lock);
404 action_ret = handle_IRQ_event(irq, regs, action);
405 if (!noirqdebug)
406 note_interrupt(irq, desc, action_ret, regs);
407 spin_lock(&desc->lock);
408
409 } while ((desc->status & (IRQ_PENDING | IRQ_DISABLED)) == IRQ_PENDING);
410
411 desc->status &= ~IRQ_INPROGRESS;
412out_unlock:
413 spin_unlock(&desc->lock);
414}
415
416#ifdef CONFIG_SMP
417/**
418 * handle_percpu_irq - Per CPU local irq handler
419 * @irq: the interrupt number
420 * @desc: the interrupt description structure for this irq
421 * @regs: pointer to a register structure
422 *
423 * Per CPU interrupts on SMP machines without locking requirements
424 */
425void fastcall
426handle_percpu_irq(unsigned int irq, struct irq_desc *desc, struct pt_regs *regs)
427{
428 irqreturn_t action_ret;
429
430 kstat_this_cpu.irqs[irq]++;
431
432 if (desc->chip->ack)
433 desc->chip->ack(irq);
434
435 action_ret = handle_IRQ_event(irq, regs, desc->action);
436 if (!noirqdebug)
437 note_interrupt(irq, desc, action_ret, regs);
438
439 if (desc->chip->eoi)
440 desc->chip->eoi(irq);
441}
442
443#endif /* CONFIG_SMP */
444
445void
446__set_irq_handler(unsigned int irq,
447 void fastcall (*handle)(unsigned int, irq_desc_t *,
448 struct pt_regs *),
449 int is_chained)
450{
451 struct irq_desc *desc;
452 unsigned long flags;
453
454 if (irq >= NR_IRQS) {
455 printk(KERN_ERR
456 "Trying to install type control for IRQ%d\n", irq);
457 return;
458 }
459
460 desc = irq_desc + irq;
461
462 if (!handle)
463 handle = handle_bad_irq;
464
465 if (is_chained && desc->chip == &no_irq_chip)
466 printk(KERN_WARNING "Trying to install "
467 "chained interrupt type for IRQ%d\n", irq);
468
469 spin_lock_irqsave(&desc->lock, flags);
470
471 /* Uninstall? */
472 if (handle == handle_bad_irq) {
473 if (desc->chip != &no_irq_chip) {
474 desc->chip->mask(irq);
475 desc->chip->ack(irq);
476 }
477 desc->status |= IRQ_DISABLED;
478 desc->depth = 1;
479 }
480 desc->handle_irq = handle;
481
482 if (handle != handle_bad_irq && is_chained) {
483 desc->status &= ~IRQ_DISABLED;
484 desc->status |= IRQ_NOREQUEST | IRQ_NOPROBE;
485 desc->depth = 0;
486 desc->chip->unmask(irq);
487 }
488 spin_unlock_irqrestore(&desc->lock, flags);
489}
490
491void
492set_irq_chip_and_handler(unsigned int irq, struct irq_chip *chip,
493 void fastcall (*handle)(unsigned int,
494 struct irq_desc *,
495 struct pt_regs *))
496{
497 set_irq_chip(irq, chip);
498 __set_irq_handler(irq, handle, 0);
499}
500
501/*
502 * Get a descriptive string for the highlevel handler, for
503 * /proc/interrupts output:
504 */
505const char *
506handle_irq_name(void fastcall (*handle)(unsigned int, struct irq_desc *,
507 struct pt_regs *))
508{
509 if (handle == handle_level_irq)
510 return "level ";
511 if (handle == handle_fasteoi_irq)
512 return "fasteoi";
513 if (handle == handle_edge_irq)
514 return "edge ";
515 if (handle == handle_simple_irq)
516 return "simple ";
517#ifdef CONFIG_SMP
518 if (handle == handle_percpu_irq)
519 return "percpu ";
520#endif
521 if (handle == handle_bad_irq)
522 return "bad ";
523
524 return NULL;
525}
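The point of the new file is that an architecture no longer fills in a hw_interrupt_type with ack/end hooks; it describes the hardware once as an irq_chip and picks a flow handler per line. A hedged sketch of wiring one level-triggered line with the new API; my_mask()/my_unmask()/my_ack() stand in for register accesses on an imaginary controller:

#include <linux/irq.h>

/* Only the low-level register operations are chip-specific; the flow
 * logic (mask+ack before, conditional unmask after) comes from
 * handle_level_irq(). Missing startup/enable/disable callbacks are
 * filled in by irq_chip_set_defaults(). */
static void my_mask(unsigned int irq)   { /* set mask bit in controller */ }
static void my_unmask(unsigned int irq) { /* clear mask bit */ }
static void my_ack(unsigned int irq)    { /* clear latch/status bit */ }

static struct irq_chip my_chip = {
	.name   = "mychip",
	.ack    = my_ack,
	.mask   = my_mask,
	.unmask = my_unmask,
};

static void my_init_irq(unsigned int irq)
{
	set_irq_chip_and_handler(irq, &my_chip, handle_level_irq);
}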
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 51df337b37db..5a360dd4331b 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -1,9 +1,13 @@
1/* 1/*
2 * linux/kernel/irq/handle.c 2 * linux/kernel/irq/handle.c
3 * 3 *
4 * Copyright (C) 1992, 1998-2004 Linus Torvalds, Ingo Molnar 4 * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar
5 * Copyright (C) 2005-2006, Thomas Gleixner, Russell King
5 * 6 *
6 * This file contains the core interrupt handling code. 7 * This file contains the core interrupt handling code.
8 *
9 * Detailed information is available in Documentation/DocBook/genericirq
10 *
7 */ 11 */
8 12
9#include <linux/irq.h> 13#include <linux/irq.h>
@@ -14,11 +18,22 @@
14 18
15#include "internals.h" 19#include "internals.h"
16 20
21/**
22 * handle_bad_irq - handle spurious and unhandled irqs
23 */
24void fastcall
25handle_bad_irq(unsigned int irq, struct irq_desc *desc, struct pt_regs *regs)
26{
27 print_irq_desc(irq, desc);
28 kstat_this_cpu.irqs[irq]++;
29 ack_bad_irq(irq);
30}
31
17/* 32/*
18 * Linux has a controller-independent interrupt architecture. 33 * Linux has a controller-independent interrupt architecture.
19 * Every controller has a 'controller-template', that is used 34 * Every controller has a 'controller-template', that is used
20 * by the main code to do the right thing. Each driver-visible 35 * by the main code to do the right thing. Each driver-visible
21 * interrupt source is transparently wired to the apropriate 36 * interrupt source is transparently wired to the appropriate
22 * controller. Thus drivers need not be aware of the 37 * controller. Thus drivers need not be aware of the
23 * interrupt-controller. 38 * interrupt-controller.
24 * 39 *
@@ -28,41 +43,52 @@
28 * 43 *
29 * Controller mappings for all interrupt sources: 44 * Controller mappings for all interrupt sources:
30 */ 45 */
31irq_desc_t irq_desc[NR_IRQS] __cacheline_aligned = { 46struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned = {
32 [0 ... NR_IRQS-1] = { 47 [0 ... NR_IRQS-1] = {
33 .status = IRQ_DISABLED, 48 .status = IRQ_DISABLED,
34 .handler = &no_irq_type, 49 .chip = &no_irq_chip,
35 .lock = SPIN_LOCK_UNLOCKED 50 .handle_irq = handle_bad_irq,
51 .depth = 1,
52 .lock = SPIN_LOCK_UNLOCKED,
53#ifdef CONFIG_SMP
54 .affinity = CPU_MASK_ALL
55#endif
36 } 56 }
37}; 57};
38 58
39/* 59/*
40 * Generic 'no controller' code 60 * What should we do if we get a hw irq event on an illegal vector?
61 * Each architecture has to answer this itself.
41 */ 62 */
42static void end_none(unsigned int irq) { } 63static void ack_bad(unsigned int irq)
43static void enable_none(unsigned int irq) { }
44static void disable_none(unsigned int irq) { }
45static void shutdown_none(unsigned int irq) { }
46static unsigned int startup_none(unsigned int irq) { return 0; }
47
48static void ack_none(unsigned int irq)
49{ 64{
50 /* 65 print_irq_desc(irq, irq_desc + irq);
51 * 'what should we do if we get a hw irq event on an illegal vector'.
52 * each architecture has to answer this themself.
53 */
54 ack_bad_irq(irq); 66 ack_bad_irq(irq);
55} 67}
56 68
57struct hw_interrupt_type no_irq_type = { 69/*
58 .typename = "none", 70 * NOP functions
59 .startup = startup_none, 71 */
60 .shutdown = shutdown_none, 72static void noop(unsigned int irq)
61 .enable = enable_none, 73{
62 .disable = disable_none, 74}
63 .ack = ack_none, 75
64 .end = end_none, 76static unsigned int noop_ret(unsigned int irq)
65 .set_affinity = NULL 77{
78 return 0;
79}
80
81/*
82 * Generic no controller implementation
83 */
84struct irq_chip no_irq_chip = {
85 .name = "none",
86 .startup = noop_ret,
87 .shutdown = noop,
88 .enable = noop,
89 .disable = noop,
90 .ack = ack_bad,
91 .end = noop,
66}; 92};
67 93
68/* 94/*
@@ -73,13 +99,19 @@ irqreturn_t no_action(int cpl, void *dev_id, struct pt_regs *regs)
73 return IRQ_NONE; 99 return IRQ_NONE;
74} 100}
75 101
76/* 102/**
77 * Have got an event to handle: 103 * handle_IRQ_event - irq action chain handler
104 * @irq: the interrupt number
105 * @regs: pointer to a register structure
106 * @action: the interrupt action chain for this irq
107 *
108 * Handles the action chain of an irq event
78 */ 109 */
79fastcall int handle_IRQ_event(unsigned int irq, struct pt_regs *regs, 110irqreturn_t handle_IRQ_event(unsigned int irq, struct pt_regs *regs,
80 struct irqaction *action) 111 struct irqaction *action)
81{ 112{
82 int ret, retval = 0, status = 0; 113 irqreturn_t ret, retval = IRQ_NONE;
114 unsigned int status = 0;
83 115
84 if (!(action->flags & SA_INTERRUPT)) 116 if (!(action->flags & SA_INTERRUPT))
85 local_irq_enable(); 117 local_irq_enable();
@@ -99,15 +131,22 @@ fastcall int handle_IRQ_event(unsigned int irq, struct pt_regs *regs,
99 return retval; 131 return retval;
100} 132}
101 133
102/* 134/**
103 * do_IRQ handles all normal device IRQ's (the special 135 * __do_IRQ - original all in one highlevel IRQ handler
136 * @irq: the interrupt number
137 * @regs: pointer to a register structure
138 *
139 * __do_IRQ handles all normal device IRQ's (the special
104 * SMP cross-CPU interrupts have their own specific 140 * SMP cross-CPU interrupts have their own specific
105 * handlers). 141 * handlers).
142 *
143 * This is the original x86 implementation which is used for every
144 * interrupt type.
106 */ 145 */
107fastcall unsigned int __do_IRQ(unsigned int irq, struct pt_regs *regs) 146fastcall unsigned int __do_IRQ(unsigned int irq, struct pt_regs *regs)
108{ 147{
109 irq_desc_t *desc = irq_desc + irq; 148 struct irq_desc *desc = irq_desc + irq;
110 struct irqaction * action; 149 struct irqaction *action;
111 unsigned int status; 150 unsigned int status;
112 151
113 kstat_this_cpu.irqs[irq]++; 152 kstat_this_cpu.irqs[irq]++;
@@ -117,16 +156,16 @@ fastcall unsigned int __do_IRQ(unsigned int irq, struct pt_regs *regs)
117 /* 156 /*
118 * No locking required for CPU-local interrupts: 157 * No locking required for CPU-local interrupts:
119 */ 158 */
120 if (desc->handler->ack) 159 if (desc->chip->ack)
121 desc->handler->ack(irq); 160 desc->chip->ack(irq);
122 action_ret = handle_IRQ_event(irq, regs, desc->action); 161 action_ret = handle_IRQ_event(irq, regs, desc->action);
123 desc->handler->end(irq); 162 desc->chip->end(irq);
124 return 1; 163 return 1;
125 } 164 }
126 165
127 spin_lock(&desc->lock); 166 spin_lock(&desc->lock);
128 if (desc->handler->ack) 167 if (desc->chip->ack)
129 desc->handler->ack(irq); 168 desc->chip->ack(irq);
130 /* 169 /*
131 * REPLAY is when Linux resends an IRQ that was dropped earlier 170 * REPLAY is when Linux resends an IRQ that was dropped earlier
132 * WAITING is used by probe to mark irqs that are being tested 171 * WAITING is used by probe to mark irqs that are being tested
@@ -186,7 +225,7 @@ out:
186 * The ->end() handler has to deal with interrupts which got 225 * The ->end() handler has to deal with interrupts which got
187 * disabled while the handler was running. 226 * disabled while the handler was running.
188 */ 227 */
189 desc->handler->end(irq); 228 desc->chip->end(irq);
190 spin_unlock(&desc->lock); 229 spin_unlock(&desc->lock);
191 230
192 return 1; 231 return 1;
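handle_IRQ_event() just walks the irqaction chain, so what a driver registers through request_irq() is unchanged by the flow-handler split: a handler that returns IRQ_HANDLED or IRQ_NONE, possibly on a shared line. A hedged sketch with placeholder device bits (MYDEV_IRQ, struct mydev and mydev_pending() are invented for the example):

#include <linux/interrupt.h>

#define MYDEV_IRQ 5			/* placeholder line number */

struct mydev { int dummy; };

static int mydev_pending(struct mydev *dev)
{
	return 1;			/* placeholder: read device status */
}

/* Runs from handle_IRQ_event(); must report whether the device really
 * asserted the (possibly shared) line, which feeds note_interrupt()'s
 * spurious-IRQ accounting. */
static irqreturn_t mydev_interrupt(int irq, void *dev_id, struct pt_regs *regs)
{
	struct mydev *dev = dev_id;

	if (!mydev_pending(dev))
		return IRQ_NONE;

	/* acknowledge and service the device here ... */
	return IRQ_HANDLED;
}

static int mydev_setup(struct mydev *dev)
{
	return request_irq(MYDEV_IRQ, mydev_interrupt, SA_SHIRQ,
			   "mydev", dev);
}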
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 46feba630266..08a849a22447 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -4,6 +4,12 @@
4 4
5extern int noirqdebug; 5extern int noirqdebug;
6 6
7/* Set default functions for irq_chip structures: */
8extern void irq_chip_set_defaults(struct irq_chip *chip);
9
10/* Set default handler: */
11extern void compat_irq_chip_set_default_handler(struct irq_desc *desc);
12
7#ifdef CONFIG_PROC_FS 13#ifdef CONFIG_PROC_FS
8extern void register_irq_proc(unsigned int irq); 14extern void register_irq_proc(unsigned int irq);
9extern void register_handler_proc(unsigned int irq, struct irqaction *action); 15extern void register_handler_proc(unsigned int irq, struct irqaction *action);
@@ -16,3 +22,43 @@ static inline void unregister_handler_proc(unsigned int irq,
16 struct irqaction *action) { } 22 struct irqaction *action) { }
17#endif 23#endif
18 24
25/*
26 * Debugging printout:
27 */
28
29#include <linux/kallsyms.h>
30
31#define P(f) if (desc->status & f) printk("%14s set\n", #f)
32
33static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc)
34{
35 printk("irq %d, desc: %p, depth: %d, count: %d, unhandled: %d\n",
36 irq, desc, desc->depth, desc->irq_count, desc->irqs_unhandled);
37 printk("->handle_irq(): %p, ", desc->handle_irq);
38 print_symbol("%s\n", (unsigned long)desc->handle_irq);
39 printk("->chip(): %p, ", desc->chip);
40 print_symbol("%s\n", (unsigned long)desc->chip);
41 printk("->action(): %p\n", desc->action);
42 if (desc->action) {
43 printk("->action->handler(): %p, ", desc->action->handler);
44 print_symbol("%s\n", (unsigned long)desc->action->handler);
45 }
46
47 P(IRQ_INPROGRESS);
48 P(IRQ_DISABLED);
49 P(IRQ_PENDING);
50 P(IRQ_REPLAY);
51 P(IRQ_AUTODETECT);
52 P(IRQ_WAITING);
53 P(IRQ_LEVEL);
54 P(IRQ_MASKED);
55#ifdef CONFIG_IRQ_PER_CPU
56 P(IRQ_PER_CPU);
57#endif
58 P(IRQ_NOPROBE);
59 P(IRQ_NOREQUEST);
60 P(IRQ_NOAUTOEN);
61}
62
63#undef P
64
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 1279e3499534..9eb1d518ee1c 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -1,7 +1,8 @@
1/* 1/*
2 * linux/kernel/irq/manage.c 2 * linux/kernel/irq/manage.c
3 * 3 *
4 * Copyright (C) 1992, 1998-2004 Linus Torvalds, Ingo Molnar 4 * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar
5 * Copyright (C) 2005-2006 Thomas Gleixner
5 * 6 *
6 * This file contains driver APIs to the irq subsystem. 7 * This file contains driver APIs to the irq subsystem.
7 */ 8 */
@@ -16,12 +17,6 @@
16 17
17#ifdef CONFIG_SMP 18#ifdef CONFIG_SMP
18 19
19cpumask_t irq_affinity[NR_IRQS] = { [0 ... NR_IRQS-1] = CPU_MASK_ALL };
20
21#if defined (CONFIG_GENERIC_PENDING_IRQ) || defined (CONFIG_IRQBALANCE)
22cpumask_t __cacheline_aligned pending_irq_cpumask[NR_IRQS];
23#endif
24
25/** 20/**
26 * synchronize_irq - wait for pending IRQ handlers (on other CPUs) 21 * synchronize_irq - wait for pending IRQ handlers (on other CPUs)
27 * @irq: interrupt number to wait for 22 * @irq: interrupt number to wait for
@@ -42,7 +37,6 @@ void synchronize_irq(unsigned int irq)
42 while (desc->status & IRQ_INPROGRESS) 37 while (desc->status & IRQ_INPROGRESS)
43 cpu_relax(); 38 cpu_relax();
44} 39}
45
46EXPORT_SYMBOL(synchronize_irq); 40EXPORT_SYMBOL(synchronize_irq);
47 41
48#endif 42#endif
@@ -60,7 +54,7 @@ EXPORT_SYMBOL(synchronize_irq);
60 */ 54 */
61void disable_irq_nosync(unsigned int irq) 55void disable_irq_nosync(unsigned int irq)
62{ 56{
63 irq_desc_t *desc = irq_desc + irq; 57 struct irq_desc *desc = irq_desc + irq;
64 unsigned long flags; 58 unsigned long flags;
65 59
66 if (irq >= NR_IRQS) 60 if (irq >= NR_IRQS)
@@ -69,11 +63,10 @@ void disable_irq_nosync(unsigned int irq)
69 spin_lock_irqsave(&desc->lock, flags); 63 spin_lock_irqsave(&desc->lock, flags);
70 if (!desc->depth++) { 64 if (!desc->depth++) {
71 desc->status |= IRQ_DISABLED; 65 desc->status |= IRQ_DISABLED;
72 desc->handler->disable(irq); 66 desc->chip->disable(irq);
73 } 67 }
74 spin_unlock_irqrestore(&desc->lock, flags); 68 spin_unlock_irqrestore(&desc->lock, flags);
75} 69}
76
77EXPORT_SYMBOL(disable_irq_nosync); 70EXPORT_SYMBOL(disable_irq_nosync);
78 71
79/** 72/**
@@ -90,7 +83,7 @@ EXPORT_SYMBOL(disable_irq_nosync);
90 */ 83 */
91void disable_irq(unsigned int irq) 84void disable_irq(unsigned int irq)
92{ 85{
93 irq_desc_t *desc = irq_desc + irq; 86 struct irq_desc *desc = irq_desc + irq;
94 87
95 if (irq >= NR_IRQS) 88 if (irq >= NR_IRQS)
96 return; 89 return;
@@ -99,7 +92,6 @@ void disable_irq(unsigned int irq)
99 if (desc->action) 92 if (desc->action)
100 synchronize_irq(irq); 93 synchronize_irq(irq);
101} 94}
102
103EXPORT_SYMBOL(disable_irq); 95EXPORT_SYMBOL(disable_irq);
104 96
105/** 97/**
@@ -114,7 +106,7 @@ EXPORT_SYMBOL(disable_irq);
114 */ 106 */
115void enable_irq(unsigned int irq) 107void enable_irq(unsigned int irq)
116{ 108{
117 irq_desc_t *desc = irq_desc + irq; 109 struct irq_desc *desc = irq_desc + irq;
118 unsigned long flags; 110 unsigned long flags;
119 111
120 if (irq >= NR_IRQS) 112 if (irq >= NR_IRQS)
@@ -123,17 +115,15 @@ void enable_irq(unsigned int irq)
123 spin_lock_irqsave(&desc->lock, flags); 115 spin_lock_irqsave(&desc->lock, flags);
124 switch (desc->depth) { 116 switch (desc->depth) {
125 case 0: 117 case 0:
118		printk(KERN_WARNING "Unbalanced enable_irq(%d)\n", irq);
126 WARN_ON(1); 119 WARN_ON(1);
127 break; 120 break;
128 case 1: { 121 case 1: {
129 unsigned int status = desc->status & ~IRQ_DISABLED; 122 unsigned int status = desc->status & ~IRQ_DISABLED;
130 123
131 desc->status = status; 124 /* Prevent probing on this irq: */
132 if ((status & (IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) { 125 desc->status = status | IRQ_NOPROBE;
133 desc->status = status | IRQ_REPLAY; 126 check_irq_resend(desc, irq);
134 hw_resend_irq(desc->handler,irq);
135 }
136 desc->handler->enable(irq);
137 /* fall-through */ 127 /* fall-through */
138 } 128 }
139 default: 129 default:
@@ -141,9 +131,29 @@ void enable_irq(unsigned int irq)
141 } 131 }
142 spin_unlock_irqrestore(&desc->lock, flags); 132 spin_unlock_irqrestore(&desc->lock, flags);
143} 133}
144
145EXPORT_SYMBOL(enable_irq); 134EXPORT_SYMBOL(enable_irq);
146 135
136/**
137 * set_irq_wake - control irq power management wakeup
138 * @irq: interrupt to control
139 * @on: enable/disable power management wakeup
140 *
141 * Enable/disable power management wakeup mode
142 */
143int set_irq_wake(unsigned int irq, unsigned int on)
144{
145 struct irq_desc *desc = irq_desc + irq;
146 unsigned long flags;
147 int ret = -ENXIO;
148
149 spin_lock_irqsave(&desc->lock, flags);
150 if (desc->chip->set_wake)
151 ret = desc->chip->set_wake(irq, on);
152 spin_unlock_irqrestore(&desc->lock, flags);
153 return ret;
154}
155EXPORT_SYMBOL(set_irq_wake);
156
147/* 157/*
148 * Internal function that tells the architecture code whether a 158 * Internal function that tells the architecture code whether a
149 * particular irq has been exclusively allocated or is available 159 * particular irq has been exclusively allocated or is available
@@ -153,7 +163,7 @@ int can_request_irq(unsigned int irq, unsigned long irqflags)
153{ 163{
154 struct irqaction *action; 164 struct irqaction *action;
155 165
156 if (irq >= NR_IRQS) 166 if (irq >= NR_IRQS || irq_desc[irq].status & IRQ_NOREQUEST)
157 return 0; 167 return 0;
158 168
159 action = irq_desc[irq].action; 169 action = irq_desc[irq].action;
@@ -164,11 +174,22 @@ int can_request_irq(unsigned int irq, unsigned long irqflags)
164 return !action; 174 return !action;
165} 175}
166 176
177void compat_irq_chip_set_default_handler(struct irq_desc *desc)
178{
179 /*
 180	 * If the architecture still has not overridden
181 * the flow handler then zap the default. This
182 * should catch incorrect flow-type setting.
183 */
184 if (desc->handle_irq == &handle_bad_irq)
185 desc->handle_irq = NULL;
186}
187
167/* 188/*
168 * Internal function to register an irqaction - typically used to 189 * Internal function to register an irqaction - typically used to
169 * allocate special interrupts that are part of the architecture. 190 * allocate special interrupts that are part of the architecture.
170 */ 191 */
171int setup_irq(unsigned int irq, struct irqaction * new) 192int setup_irq(unsigned int irq, struct irqaction *new)
172{ 193{
173 struct irq_desc *desc = irq_desc + irq; 194 struct irq_desc *desc = irq_desc + irq;
174 struct irqaction *old, **p; 195 struct irqaction *old, **p;
@@ -178,7 +199,7 @@ int setup_irq(unsigned int irq, struct irqaction * new)
178 if (irq >= NR_IRQS) 199 if (irq >= NR_IRQS)
179 return -EINVAL; 200 return -EINVAL;
180 201
181 if (desc->handler == &no_irq_type) 202 if (desc->chip == &no_irq_chip)
182 return -ENOSYS; 203 return -ENOSYS;
183 /* 204 /*
184 * Some drivers like serial.c use request_irq() heavily, 205 * Some drivers like serial.c use request_irq() heavily,
@@ -200,14 +221,21 @@ int setup_irq(unsigned int irq, struct irqaction * new)
200 /* 221 /*
201 * The following block of code has to be executed atomically 222 * The following block of code has to be executed atomically
202 */ 223 */
203 spin_lock_irqsave(&desc->lock,flags); 224 spin_lock_irqsave(&desc->lock, flags);
204 p = &desc->action; 225 p = &desc->action;
205 if ((old = *p) != NULL) { 226 old = *p;
206 /* Can't share interrupts unless both agree to */ 227 if (old) {
207 if (!(old->flags & new->flags & SA_SHIRQ)) 228 /*
229 * Can't share interrupts unless both agree to and are
230 * the same type (level, edge, polarity). So both flag
231 * fields must have SA_SHIRQ set and the bits which
232 * set the trigger type must match.
233 */
234 if (!((old->flags & new->flags) & SA_SHIRQ) ||
235 ((old->flags ^ new->flags) & SA_TRIGGER_MASK))
208 goto mismatch; 236 goto mismatch;
209 237
210#if defined(ARCH_HAS_IRQ_PER_CPU) && defined(SA_PERCPU_IRQ) 238#if defined(CONFIG_IRQ_PER_CPU) && defined(SA_PERCPU_IRQ)
211 /* All handlers must agree on per-cpuness */ 239 /* All handlers must agree on per-cpuness */
212 if ((old->flags & IRQ_PER_CPU) != (new->flags & IRQ_PER_CPU)) 240 if ((old->flags & IRQ_PER_CPU) != (new->flags & IRQ_PER_CPU))
213 goto mismatch; 241 goto mismatch;
@@ -222,20 +250,44 @@ int setup_irq(unsigned int irq, struct irqaction * new)
222 } 250 }
223 251
224 *p = new; 252 *p = new;
225#if defined(ARCH_HAS_IRQ_PER_CPU) && defined(SA_PERCPU_IRQ) 253#if defined(CONFIG_IRQ_PER_CPU) && defined(SA_PERCPU_IRQ)
226 if (new->flags & SA_PERCPU_IRQ) 254 if (new->flags & SA_PERCPU_IRQ)
227 desc->status |= IRQ_PER_CPU; 255 desc->status |= IRQ_PER_CPU;
228#endif 256#endif
229 if (!shared) { 257 if (!shared) {
230 desc->depth = 0; 258 irq_chip_set_defaults(desc->chip);
231 desc->status &= ~(IRQ_DISABLED | IRQ_AUTODETECT | 259
232 IRQ_WAITING | IRQ_INPROGRESS); 260 /* Setup the type (level, edge polarity) if configured: */
233 if (desc->handler->startup) 261 if (new->flags & SA_TRIGGER_MASK) {
234 desc->handler->startup(irq); 262 if (desc->chip && desc->chip->set_type)
235 else 263 desc->chip->set_type(irq,
236 desc->handler->enable(irq); 264 new->flags & SA_TRIGGER_MASK);
265 else
266 /*
267 * SA_TRIGGER_* but the PIC does not support
268 * multiple flow-types?
269 */
 270				printk(KERN_WARNING "setup_irq(%d) SA_TRIGGER "
 271					"set. No set_type function available\n",
 272					irq);
273 } else
274 compat_irq_chip_set_default_handler(desc);
275
276 desc->status &= ~(IRQ_AUTODETECT | IRQ_WAITING |
277 IRQ_INPROGRESS);
278
279 if (!(desc->status & IRQ_NOAUTOEN)) {
280 desc->depth = 0;
281 desc->status &= ~IRQ_DISABLED;
282 if (desc->chip->startup)
283 desc->chip->startup(irq);
284 else
285 desc->chip->enable(irq);
286 } else
287 /* Undo nested disables: */
288 desc->depth = 1;
237 } 289 }
238 spin_unlock_irqrestore(&desc->lock,flags); 290 spin_unlock_irqrestore(&desc->lock, flags);
239 291
240 new->irq = irq; 292 new->irq = irq;
241 register_irq_proc(irq); 293 register_irq_proc(irq);
@@ -278,10 +330,10 @@ void free_irq(unsigned int irq, void *dev_id)
278 return; 330 return;
279 331
280 desc = irq_desc + irq; 332 desc = irq_desc + irq;
281 spin_lock_irqsave(&desc->lock,flags); 333 spin_lock_irqsave(&desc->lock, flags);
282 p = &desc->action; 334 p = &desc->action;
283 for (;;) { 335 for (;;) {
284 struct irqaction * action = *p; 336 struct irqaction *action = *p;
285 337
286 if (action) { 338 if (action) {
287 struct irqaction **pp = p; 339 struct irqaction **pp = p;
@@ -295,18 +347,18 @@ void free_irq(unsigned int irq, void *dev_id)
295 347
296 /* Currently used only by UML, might disappear one day.*/ 348 /* Currently used only by UML, might disappear one day.*/
297#ifdef CONFIG_IRQ_RELEASE_METHOD 349#ifdef CONFIG_IRQ_RELEASE_METHOD
298 if (desc->handler->release) 350 if (desc->chip->release)
299 desc->handler->release(irq, dev_id); 351 desc->chip->release(irq, dev_id);
300#endif 352#endif
301 353
302 if (!desc->action) { 354 if (!desc->action) {
303 desc->status |= IRQ_DISABLED; 355 desc->status |= IRQ_DISABLED;
304 if (desc->handler->shutdown) 356 if (desc->chip->shutdown)
305 desc->handler->shutdown(irq); 357 desc->chip->shutdown(irq);
306 else 358 else
307 desc->handler->disable(irq); 359 desc->chip->disable(irq);
308 } 360 }
309 spin_unlock_irqrestore(&desc->lock,flags); 361 spin_unlock_irqrestore(&desc->lock, flags);
310 unregister_handler_proc(irq, action); 362 unregister_handler_proc(irq, action);
311 363
312 /* Make sure it's not being used on another CPU */ 364 /* Make sure it's not being used on another CPU */
@@ -314,12 +366,11 @@ void free_irq(unsigned int irq, void *dev_id)
314 kfree(action); 366 kfree(action);
315 return; 367 return;
316 } 368 }
317 printk(KERN_ERR "Trying to free free IRQ%d\n",irq); 369 printk(KERN_ERR "Trying to free free IRQ%d\n", irq);
318 spin_unlock_irqrestore(&desc->lock,flags); 370 spin_unlock_irqrestore(&desc->lock, flags);
319 return; 371 return;
320 } 372 }
321} 373}
322
323EXPORT_SYMBOL(free_irq); 374EXPORT_SYMBOL(free_irq);
324 375
325/** 376/**
@@ -353,9 +404,9 @@ EXPORT_SYMBOL(free_irq);
353 */ 404 */
354int request_irq(unsigned int irq, 405int request_irq(unsigned int irq,
355 irqreturn_t (*handler)(int, void *, struct pt_regs *), 406 irqreturn_t (*handler)(int, void *, struct pt_regs *),
356 unsigned long irqflags, const char * devname, void *dev_id) 407 unsigned long irqflags, const char *devname, void *dev_id)
357{ 408{
358 struct irqaction * action; 409 struct irqaction *action;
359 int retval; 410 int retval;
360 411
361 /* 412 /*
@@ -368,6 +419,8 @@ int request_irq(unsigned int irq,
368 return -EINVAL; 419 return -EINVAL;
369 if (irq >= NR_IRQS) 420 if (irq >= NR_IRQS)
370 return -EINVAL; 421 return -EINVAL;
422 if (irq_desc[irq].status & IRQ_NOREQUEST)
423 return -EINVAL;
371 if (!handler) 424 if (!handler)
372 return -EINVAL; 425 return -EINVAL;
373 426
@@ -390,6 +443,5 @@ int request_irq(unsigned int irq,
390 443
391 return retval; 444 return retval;
392} 445}
393
394EXPORT_SYMBOL(request_irq); 446EXPORT_SYMBOL(request_irq);
395 447
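For illustration, a minimal driver-side use of the interface above might look as follows. This is a sketch only: the IRQ number, device name, handler body and cookie are made-up placeholders, not anything from the patch. It uses the era-specific request_irq() prototype (the handler still takes a pt_regs argument) together with SA_SHIRQ sharing and an SA_TRIGGER_* flag, which setup_irq() now hands to chip->set_type() and checks for consistency between sharers.

#include <linux/interrupt.h>

#define EXAMPLE_IRQ	19		/* hypothetical interrupt line */

static int example_cookie;		/* dev_id cookie for the shared line */

static irqreturn_t example_handler(int irq, void *dev_id, struct pt_regs *regs)
{
	/* A shared handler must check whether its device really fired. */
	return IRQ_HANDLED;
}

static int example_setup(void)
{
	/*
	 * SA_SHIRQ asks for sharing; SA_TRIGGER_RISING is part of
	 * SA_TRIGGER_MASK, so a second request on this line must use the
	 * same trigger type or setup_irq() now fails with a mismatch.
	 */
	return request_irq(EXAMPLE_IRQ, example_handler,
			   SA_SHIRQ | SA_TRIGGER_RISING,
			   "example-dev", &example_cookie);
}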
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index 134f9f2e0e39..a57ebe9fa6f6 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -3,19 +3,19 @@
3 3
4void set_pending_irq(unsigned int irq, cpumask_t mask) 4void set_pending_irq(unsigned int irq, cpumask_t mask)
5{ 5{
6 irq_desc_t *desc = irq_desc + irq; 6 struct irq_desc *desc = irq_desc + irq;
7 unsigned long flags; 7 unsigned long flags;
8 8
9 spin_lock_irqsave(&desc->lock, flags); 9 spin_lock_irqsave(&desc->lock, flags);
10 desc->move_irq = 1; 10 desc->move_irq = 1;
11 pending_irq_cpumask[irq] = mask; 11 irq_desc[irq].pending_mask = mask;
12 spin_unlock_irqrestore(&desc->lock, flags); 12 spin_unlock_irqrestore(&desc->lock, flags);
13} 13}
14 14
15void move_native_irq(int irq) 15void move_native_irq(int irq)
16{ 16{
17 struct irq_desc *desc = irq_desc + irq;
17 cpumask_t tmp; 18 cpumask_t tmp;
18 irq_desc_t *desc = irq_descp(irq);
19 19
20 if (likely(!desc->move_irq)) 20 if (likely(!desc->move_irq))
21 return; 21 return;
@@ -30,15 +30,15 @@ void move_native_irq(int irq)
30 30
31 desc->move_irq = 0; 31 desc->move_irq = 0;
32 32
33 if (likely(cpus_empty(pending_irq_cpumask[irq]))) 33 if (unlikely(cpus_empty(irq_desc[irq].pending_mask)))
34 return; 34 return;
35 35
36 if (!desc->handler->set_affinity) 36 if (!desc->chip->set_affinity)
37 return; 37 return;
38 38
39 assert_spin_locked(&desc->lock); 39 assert_spin_locked(&desc->lock);
40 40
41 cpus_and(tmp, pending_irq_cpumask[irq], cpu_online_map); 41 cpus_and(tmp, irq_desc[irq].pending_mask, cpu_online_map);
42 42
43 /* 43 /*
44 * If there was a valid mask to work with, please 44 * If there was a valid mask to work with, please
@@ -49,14 +49,14 @@ void move_native_irq(int irq)
49 * cause some ioapics to mal-function. 49 * cause some ioapics to mal-function.
 50	 * Being paranoid I guess! 50	 * Being paranoid I guess!
51 */ 51 */
52 if (unlikely(!cpus_empty(tmp))) { 52 if (likely(!cpus_empty(tmp))) {
53 if (likely(!(desc->status & IRQ_DISABLED))) 53 if (likely(!(desc->status & IRQ_DISABLED)))
54 desc->handler->disable(irq); 54 desc->chip->disable(irq);
55 55
56 desc->handler->set_affinity(irq,tmp); 56 desc->chip->set_affinity(irq,tmp);
57 57
58 if (likely(!(desc->status & IRQ_DISABLED))) 58 if (likely(!(desc->status & IRQ_DISABLED)))
59 desc->handler->enable(irq); 59 desc->chip->enable(irq);
60 } 60 }
61 cpus_clear(pending_irq_cpumask[irq]); 61 cpus_clear(irq_desc[irq].pending_mask);
62} 62}
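The two helpers above split an affinity change into a request and a deferred apply. Below is a hedged sketch of how an architecture interrupt path is expected to drive them; the ack routine and its name are assumptions, not code from this patch.

#include <linux/irq.h>

/* Sketch: the flow an arch ack path follows for deferred IRQ moves. */
static void example_arch_ack(unsigned int irq)
{
	/*
	 * Applying irq_desc[irq].pending_mask here is safe because the
	 * line is masked while being serviced; move_native_irq() does
	 * nothing unless set_pending_irq() was called earlier.
	 */
	move_native_irq(irq);

	/* ... the real chip acknowledge would follow here ... */
}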
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index d03b5eef8ce0..607c7809ad01 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -12,18 +12,15 @@
12 12
13#include "internals.h" 13#include "internals.h"
14 14
15static struct proc_dir_entry *root_irq_dir, *irq_dir[NR_IRQS]; 15static struct proc_dir_entry *root_irq_dir;
16 16
17#ifdef CONFIG_SMP 17#ifdef CONFIG_SMP
18 18
19/*
20 * The /proc/irq/<irq>/smp_affinity values:
21 */
22static struct proc_dir_entry *smp_affinity_entry[NR_IRQS];
23
24#ifdef CONFIG_GENERIC_PENDING_IRQ 19#ifdef CONFIG_GENERIC_PENDING_IRQ
25void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val) 20void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val)
26{ 21{
22 set_balance_irq_affinity(irq, mask_val);
23
27 /* 24 /*
 28	 * Save these away for later use. Re-program when the 25	 * Save these away for later use. Re-program when the
29 * interrupt is pending 26 * interrupt is pending
@@ -33,15 +30,16 @@ void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val)
33#else 30#else
34void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val) 31void proc_set_irq_affinity(unsigned int irq, cpumask_t mask_val)
35{ 32{
36 irq_affinity[irq] = mask_val; 33 set_balance_irq_affinity(irq, mask_val);
37 irq_desc[irq].handler->set_affinity(irq, mask_val); 34 irq_desc[irq].affinity = mask_val;
35 irq_desc[irq].chip->set_affinity(irq, mask_val);
38} 36}
39#endif 37#endif
40 38
41static int irq_affinity_read_proc(char *page, char **start, off_t off, 39static int irq_affinity_read_proc(char *page, char **start, off_t off,
42 int count, int *eof, void *data) 40 int count, int *eof, void *data)
43{ 41{
44 int len = cpumask_scnprintf(page, count, irq_affinity[(long)data]); 42 int len = cpumask_scnprintf(page, count, irq_desc[(long)data].affinity);
45 43
46 if (count - len < 2) 44 if (count - len < 2)
47 return -EINVAL; 45 return -EINVAL;
@@ -56,7 +54,7 @@ static int irq_affinity_write_proc(struct file *file, const char __user *buffer,
56 unsigned int irq = (int)(long)data, full_count = count, err; 54 unsigned int irq = (int)(long)data, full_count = count, err;
57 cpumask_t new_value, tmp; 55 cpumask_t new_value, tmp;
58 56
59 if (!irq_desc[irq].handler->set_affinity || no_irq_affinity) 57 if (!irq_desc[irq].chip->set_affinity || no_irq_affinity)
60 return -EIO; 58 return -EIO;
61 59
62 err = cpumask_parse(buffer, count, new_value); 60 err = cpumask_parse(buffer, count, new_value);
@@ -99,7 +97,7 @@ void register_handler_proc(unsigned int irq, struct irqaction *action)
99{ 97{
100 char name [MAX_NAMELEN]; 98 char name [MAX_NAMELEN];
101 99
102 if (!irq_dir[irq] || action->dir || !action->name || 100 if (!irq_desc[irq].dir || action->dir || !action->name ||
103 !name_unique(irq, action)) 101 !name_unique(irq, action))
104 return; 102 return;
105 103
@@ -107,7 +105,7 @@ void register_handler_proc(unsigned int irq, struct irqaction *action)
107 snprintf(name, MAX_NAMELEN, "%s", action->name); 105 snprintf(name, MAX_NAMELEN, "%s", action->name);
108 106
109 /* create /proc/irq/1234/handler/ */ 107 /* create /proc/irq/1234/handler/ */
110 action->dir = proc_mkdir(name, irq_dir[irq]); 108 action->dir = proc_mkdir(name, irq_desc[irq].dir);
111} 109}
112 110
113#undef MAX_NAMELEN 111#undef MAX_NAMELEN
@@ -119,22 +117,22 @@ void register_irq_proc(unsigned int irq)
119 char name [MAX_NAMELEN]; 117 char name [MAX_NAMELEN];
120 118
121 if (!root_irq_dir || 119 if (!root_irq_dir ||
122 (irq_desc[irq].handler == &no_irq_type) || 120 (irq_desc[irq].chip == &no_irq_chip) ||
123 irq_dir[irq]) 121 irq_desc[irq].dir)
124 return; 122 return;
125 123
126 memset(name, 0, MAX_NAMELEN); 124 memset(name, 0, MAX_NAMELEN);
127 sprintf(name, "%d", irq); 125 sprintf(name, "%d", irq);
128 126
129 /* create /proc/irq/1234 */ 127 /* create /proc/irq/1234 */
130 irq_dir[irq] = proc_mkdir(name, root_irq_dir); 128 irq_desc[irq].dir = proc_mkdir(name, root_irq_dir);
131 129
132#ifdef CONFIG_SMP 130#ifdef CONFIG_SMP
133 { 131 {
134 struct proc_dir_entry *entry; 132 struct proc_dir_entry *entry;
135 133
136 /* create /proc/irq/<irq>/smp_affinity */ 134 /* create /proc/irq/<irq>/smp_affinity */
137 entry = create_proc_entry("smp_affinity", 0600, irq_dir[irq]); 135 entry = create_proc_entry("smp_affinity", 0600, irq_desc[irq].dir);
138 136
139 if (entry) { 137 if (entry) {
140 entry->nlink = 1; 138 entry->nlink = 1;
@@ -142,7 +140,6 @@ void register_irq_proc(unsigned int irq)
142 entry->read_proc = irq_affinity_read_proc; 140 entry->read_proc = irq_affinity_read_proc;
143 entry->write_proc = irq_affinity_write_proc; 141 entry->write_proc = irq_affinity_write_proc;
144 } 142 }
145 smp_affinity_entry[irq] = entry;
146 } 143 }
147#endif 144#endif
148} 145}
@@ -152,7 +149,7 @@ void register_irq_proc(unsigned int irq)
152void unregister_handler_proc(unsigned int irq, struct irqaction *action) 149void unregister_handler_proc(unsigned int irq, struct irqaction *action)
153{ 150{
154 if (action->dir) 151 if (action->dir)
155 remove_proc_entry(action->dir->name, irq_dir[irq]); 152 remove_proc_entry(action->dir->name, irq_desc[irq].dir);
156} 153}
157 154
158void init_irq_proc(void) 155void init_irq_proc(void)
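Since the /proc entries now hang off irq_desc[], the user-visible interface is unchanged; a small userspace sketch of writing the affinity mask follows. The IRQ number and the single-CPU mask are arbitrary example values.

#include <stdio.h>

int main(void)
{
	/* Pin hypothetical IRQ 19 to CPU0 via /proc/irq/<irq>/smp_affinity. */
	FILE *f = fopen("/proc/irq/19/smp_affinity", "w");

	if (!f) {
		perror("smp_affinity");
		return 1;
	}
	fputs("1\n", f);	/* hex CPU bitmask: CPU0 only */
	fclose(f);
	return 0;
}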
diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c
new file mode 100644
index 000000000000..872f91ba2ce8
--- /dev/null
+++ b/kernel/irq/resend.c
@@ -0,0 +1,78 @@
1/*
2 * linux/kernel/irq/resend.c
3 *
4 * Copyright (C) 1992, 1998-2006 Linus Torvalds, Ingo Molnar
5 * Copyright (C) 2005-2006, Thomas Gleixner
6 *
7 * This file contains the IRQ-resend code
8 *
9 * If the interrupt is waiting to be processed, we try to re-run it.
10 * We can't directly run it from here since the caller might be in an
11 * interrupt-protected region. Not all irq controller chips can
12 * retrigger interrupts at the hardware level, so in those cases
13 * we allow the resending of IRQs via a tasklet.
14 */
15
16#include <linux/irq.h>
17#include <linux/module.h>
18#include <linux/random.h>
19#include <linux/interrupt.h>
20
21#include "internals.h"
22
23#ifdef CONFIG_HARDIRQS_SW_RESEND
24
25/* Bitmap to handle software resend of interrupts: */
26static DECLARE_BITMAP(irqs_resend, NR_IRQS);
27
28/*
 29 * Run software resends of IRQs
30 */
31static void resend_irqs(unsigned long arg)
32{
33 struct irq_desc *desc;
34 int irq;
35
36 while (!bitmap_empty(irqs_resend, NR_IRQS)) {
37 irq = find_first_bit(irqs_resend, NR_IRQS);
38 clear_bit(irq, irqs_resend);
39 desc = irq_desc + irq;
40 local_irq_disable();
41 desc->handle_irq(irq, desc, NULL);
42 local_irq_enable();
43 }
44}
45
46/* Tasklet to handle resend: */
47static DECLARE_TASKLET(resend_tasklet, resend_irqs, 0);
48
49#endif
50
51/*
52 * IRQ resend
53 *
54 * Is called with interrupts disabled and desc->lock held.
55 */
56void check_irq_resend(struct irq_desc *desc, unsigned int irq)
57{
58 unsigned int status = desc->status;
59
60 /*
61 * Make sure the interrupt is enabled, before resending it:
62 */
63 desc->chip->enable(irq);
64
65 if ((status & (IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) {
66 desc->status &= ~IRQ_PENDING;
67 desc->status = status | IRQ_REPLAY;
68
69 if (!desc->chip || !desc->chip->retrigger ||
70 !desc->chip->retrigger(irq)) {
71#ifdef CONFIG_HARDIRQS_SW_RESEND
72 /* Set it pending and activate the softirq: */
73 set_bit(irq, irqs_resend);
74 tasklet_schedule(&resend_tasklet);
75#endif
76 }
77 }
78}
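check_irq_resend() prefers a hardware replay and only falls back to the tasklet when the chip cannot retrigger. A sketch of the chip-side contract is shown below; the chip, its callback body and the names are assumptions, not part of this patch.

#include <linux/irq.h>

/* Returns nonzero if the interrupt was replayed in hardware. */
static int example_retrigger(unsigned int irq)
{
	/* e.g. poke a "software set pending" register for this line here */
	return 1;	/* hardware handled it, no software resend needed */
}

static struct irq_chip example_chip = {
	.name		= "EXAMPLE",
	.retrigger	= example_retrigger,
	/* .startup/.enable/.disable/.ack/.end omitted from this sketch */
};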
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 7df9abd5ec86..b483deed311c 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -11,27 +11,25 @@
11#include <linux/kallsyms.h> 11#include <linux/kallsyms.h>
12#include <linux/interrupt.h> 12#include <linux/interrupt.h>
13 13
14static int irqfixup; 14static int irqfixup __read_mostly;
15 15
16/* 16/*
17 * Recovery handler for misrouted interrupts. 17 * Recovery handler for misrouted interrupts.
18 */ 18 */
19
20static int misrouted_irq(int irq, struct pt_regs *regs) 19static int misrouted_irq(int irq, struct pt_regs *regs)
21{ 20{
22 int i; 21 int i;
23 irq_desc_t *desc;
24 int ok = 0; 22 int ok = 0;
25 int work = 0; /* Did we do work for a real IRQ */ 23 int work = 0; /* Did we do work for a real IRQ */
26 24
27 for(i = 1; i < NR_IRQS; i++) { 25 for (i = 1; i < NR_IRQS; i++) {
26 struct irq_desc *desc = irq_desc + i;
28 struct irqaction *action; 27 struct irqaction *action;
29 28
30 if (i == irq) /* Already tried */ 29 if (i == irq) /* Already tried */
31 continue; 30 continue;
32 desc = &irq_desc[i]; 31
33 spin_lock(&desc->lock); 32 spin_lock(&desc->lock);
34 action = desc->action;
35 /* Already running on another processor */ 33 /* Already running on another processor */
36 if (desc->status & IRQ_INPROGRESS) { 34 if (desc->status & IRQ_INPROGRESS) {
37 /* 35 /*
@@ -45,7 +43,9 @@ static int misrouted_irq(int irq, struct pt_regs *regs)
45 } 43 }
46 /* Honour the normal IRQ locking */ 44 /* Honour the normal IRQ locking */
47 desc->status |= IRQ_INPROGRESS; 45 desc->status |= IRQ_INPROGRESS;
46 action = desc->action;
48 spin_unlock(&desc->lock); 47 spin_unlock(&desc->lock);
48
49 while (action) { 49 while (action) {
50 /* Only shared IRQ handlers are safe to call */ 50 /* Only shared IRQ handlers are safe to call */
51 if (action->flags & SA_SHIRQ) { 51 if (action->flags & SA_SHIRQ) {
@@ -62,9 +62,8 @@ static int misrouted_irq(int irq, struct pt_regs *regs)
62 62
63 /* 63 /*
64 * While we were looking for a fixup someone queued a real 64 * While we were looking for a fixup someone queued a real
65 * IRQ clashing with our walk 65 * IRQ clashing with our walk:
66 */ 66 */
67
68 while ((desc->status & IRQ_PENDING) && action) { 67 while ((desc->status & IRQ_PENDING) && action) {
69 /* 68 /*
70 * Perform real IRQ processing for the IRQ we deferred 69 * Perform real IRQ processing for the IRQ we deferred
@@ -80,8 +79,8 @@ static int misrouted_irq(int irq, struct pt_regs *regs)
80 * If we did actual work for the real IRQ line we must let the 79 * If we did actual work for the real IRQ line we must let the
81 * IRQ controller clean up too 80 * IRQ controller clean up too
82 */ 81 */
83 if(work) 82 if (work && desc->chip && desc->chip->end)
84 desc->handler->end(i); 83 desc->chip->end(i);
85 spin_unlock(&desc->lock); 84 spin_unlock(&desc->lock);
86 } 85 }
87 /* So the caller can adjust the irq error counts */ 86 /* So the caller can adjust the irq error counts */
@@ -100,7 +99,8 @@ static int misrouted_irq(int irq, struct pt_regs *regs)
100 */ 99 */
101 100
102static void 101static void
103__report_bad_irq(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret) 102__report_bad_irq(unsigned int irq, struct irq_desc *desc,
103 irqreturn_t action_ret)
104{ 104{
105 struct irqaction *action; 105 struct irqaction *action;
106 106
@@ -113,6 +113,7 @@ __report_bad_irq(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret)
113 } 113 }
114 dump_stack(); 114 dump_stack();
115 printk(KERN_ERR "handlers:\n"); 115 printk(KERN_ERR "handlers:\n");
116
116 action = desc->action; 117 action = desc->action;
117 while (action) { 118 while (action) {
118 printk(KERN_ERR "[<%p>]", action->handler); 119 printk(KERN_ERR "[<%p>]", action->handler);
@@ -123,7 +124,8 @@ __report_bad_irq(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret)
123 } 124 }
124} 125}
125 126
126static void report_bad_irq(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret) 127static void
128report_bad_irq(unsigned int irq, struct irq_desc *desc, irqreturn_t action_ret)
127{ 129{
128 static int count = 100; 130 static int count = 100;
129 131
@@ -133,12 +135,12 @@ static void report_bad_irq(unsigned int irq, irq_desc_t *desc, irqreturn_t actio
133 } 135 }
134} 136}
135 137
136void note_interrupt(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret, 138void note_interrupt(unsigned int irq, struct irq_desc *desc,
137 struct pt_regs *regs) 139 irqreturn_t action_ret, struct pt_regs *regs)
138{ 140{
139 if (action_ret != IRQ_HANDLED) { 141 if (unlikely(action_ret != IRQ_HANDLED)) {
140 desc->irqs_unhandled++; 142 desc->irqs_unhandled++;
141 if (action_ret != IRQ_NONE) 143 if (unlikely(action_ret != IRQ_NONE))
142 report_bad_irq(irq, desc, action_ret); 144 report_bad_irq(irq, desc, action_ret);
143 } 145 }
144 146
@@ -152,11 +154,11 @@ void note_interrupt(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret,
152 } 154 }
153 155
154 desc->irq_count++; 156 desc->irq_count++;
155 if (desc->irq_count < 100000) 157 if (likely(desc->irq_count < 100000))
156 return; 158 return;
157 159
158 desc->irq_count = 0; 160 desc->irq_count = 0;
159 if (desc->irqs_unhandled > 99900) { 161 if (unlikely(desc->irqs_unhandled > 99900)) {
160 /* 162 /*
161 * The interrupt is stuck 163 * The interrupt is stuck
162 */ 164 */
@@ -166,17 +168,19 @@ void note_interrupt(unsigned int irq, irq_desc_t *desc, irqreturn_t action_ret,
166 */ 168 */
167 printk(KERN_EMERG "Disabling IRQ #%d\n", irq); 169 printk(KERN_EMERG "Disabling IRQ #%d\n", irq);
168 desc->status |= IRQ_DISABLED; 170 desc->status |= IRQ_DISABLED;
169 desc->handler->disable(irq); 171 desc->depth = 1;
172 desc->chip->disable(irq);
170 } 173 }
171 desc->irqs_unhandled = 0; 174 desc->irqs_unhandled = 0;
172} 175}
173 176
174int noirqdebug; 177int noirqdebug __read_mostly;
175 178
176int __init noirqdebug_setup(char *str) 179int __init noirqdebug_setup(char *str)
177{ 180{
178 noirqdebug = 1; 181 noirqdebug = 1;
179 printk(KERN_INFO "IRQ lockup detection disabled\n"); 182 printk(KERN_INFO "IRQ lockup detection disabled\n");
183
180 return 1; 184 return 1;
181} 185}
182 186
@@ -187,6 +191,7 @@ static int __init irqfixup_setup(char *str)
187 irqfixup = 1; 191 irqfixup = 1;
188 printk(KERN_WARNING "Misrouted IRQ fixup support enabled.\n"); 192 printk(KERN_WARNING "Misrouted IRQ fixup support enabled.\n");
189 printk(KERN_WARNING "This may impact system performance.\n"); 193 printk(KERN_WARNING "This may impact system performance.\n");
194
190 return 1; 195 return 1;
191} 196}
192 197
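The accounting in note_interrupt() only works if shared handlers report honestly whether the interrupt was theirs. A hedged driver-side sketch follows; the device structure, register offset and status bit are invented for the example.

#include <linux/interrupt.h>
#include <asm/io.h>

#define EXAMPLE_STATUS	0x04	/* hypothetical status register offset */
#define EXAMPLE_PENDING	0x01	/* hypothetical "IRQ pending" bit */

struct example_dev {
	void __iomem *regs;
};

static irqreturn_t example_shared_handler(int irq, void *dev_id,
					  struct pt_regs *regs)
{
	struct example_dev *dev = dev_id;

	if (!(readl(dev->regs + EXAMPLE_STATUS) & EXAMPLE_PENDING))
		return IRQ_NONE;	/* not ours; feeds the unhandled count */

	/* ... acknowledge and service the device ... */
	return IRQ_HANDLED;
}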
diff --git a/kernel/kexec.c b/kernel/kexec.c
index bf39d28e4c0e..50087ecf337e 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -902,14 +902,14 @@ static int kimage_load_segment(struct kimage *image,
902 * kexec does not sync, or unmount filesystems so if you need 902 * kexec does not sync, or unmount filesystems so if you need
903 * that to happen you need to do that yourself. 903 * that to happen you need to do that yourself.
904 */ 904 */
905struct kimage *kexec_image = NULL; 905struct kimage *kexec_image;
906static struct kimage *kexec_crash_image = NULL; 906struct kimage *kexec_crash_image;
907/* 907/*
908 * A home grown binary mutex. 908 * A home grown binary mutex.
909 * Nothing can wait so this mutex is safe to use 909 * Nothing can wait so this mutex is safe to use
910 * in interrupt context :) 910 * in interrupt context :)
911 */ 911 */
912static int kexec_lock = 0; 912static int kexec_lock;
913 913
914asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments, 914asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments,
915 struct kexec_segment __user *segments, 915 struct kexec_segment __user *segments,
@@ -1042,7 +1042,6 @@ asmlinkage long compat_sys_kexec_load(unsigned long entry,
1042 1042
1043void crash_kexec(struct pt_regs *regs) 1043void crash_kexec(struct pt_regs *regs)
1044{ 1044{
1045 struct kimage *image;
1046 int locked; 1045 int locked;
1047 1046
1048 1047
@@ -1056,12 +1055,11 @@ void crash_kexec(struct pt_regs *regs)
1056 */ 1055 */
1057 locked = xchg(&kexec_lock, 1); 1056 locked = xchg(&kexec_lock, 1);
1058 if (!locked) { 1057 if (!locked) {
1059 image = xchg(&kexec_crash_image, NULL); 1058 if (kexec_crash_image) {
1060 if (image) {
1061 struct pt_regs fixed_regs; 1059 struct pt_regs fixed_regs;
1062 crash_setup_regs(&fixed_regs, regs); 1060 crash_setup_regs(&fixed_regs, regs);
1063 machine_crash_shutdown(&fixed_regs); 1061 machine_crash_shutdown(&fixed_regs);
1064 machine_kexec(image); 1062 machine_kexec(kexec_crash_image);
1065 } 1063 }
1066 xchg(&kexec_lock, 0); 1064 xchg(&kexec_lock, 0);
1067 } 1065 }
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 1fbf466a29aa..64aab081153b 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -47,11 +47,17 @@
47 47
48static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE]; 48static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE];
49static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE]; 49static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE];
50static atomic_t kprobe_count;
50 51
51DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */ 52DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */
52DEFINE_SPINLOCK(kretprobe_lock); /* Protects kretprobe_inst_table */ 53DEFINE_SPINLOCK(kretprobe_lock); /* Protects kretprobe_inst_table */
53static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; 54static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL;
54 55
56static struct notifier_block kprobe_page_fault_nb = {
57 .notifier_call = kprobe_exceptions_notify,
 58	.priority = 0x7fffffff /* we need to be notified first */
59};
60
55#ifdef __ARCH_WANT_KPROBES_INSN_SLOT 61#ifdef __ARCH_WANT_KPROBES_INSN_SLOT
56/* 62/*
57 * kprobe->ainsn.insn points to the copy of the instruction to be 63 * kprobe->ainsn.insn points to the copy of the instruction to be
@@ -368,16 +374,15 @@ static inline void copy_kprobe(struct kprobe *old_p, struct kprobe *p)
368*/ 374*/
369static int __kprobes add_new_kprobe(struct kprobe *old_p, struct kprobe *p) 375static int __kprobes add_new_kprobe(struct kprobe *old_p, struct kprobe *p)
370{ 376{
371 struct kprobe *kp;
372
373 if (p->break_handler) { 377 if (p->break_handler) {
374 list_for_each_entry_rcu(kp, &old_p->list, list) { 378 if (old_p->break_handler)
375 if (kp->break_handler) 379 return -EEXIST;
376 return -EEXIST;
377 }
378 list_add_tail_rcu(&p->list, &old_p->list); 380 list_add_tail_rcu(&p->list, &old_p->list);
381 old_p->break_handler = aggr_break_handler;
379 } else 382 } else
380 list_add_rcu(&p->list, &old_p->list); 383 list_add_rcu(&p->list, &old_p->list);
384 if (p->post_handler && !old_p->post_handler)
385 old_p->post_handler = aggr_post_handler;
381 return 0; 386 return 0;
382} 387}
383 388
@@ -390,9 +395,11 @@ static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p)
390 copy_kprobe(p, ap); 395 copy_kprobe(p, ap);
391 ap->addr = p->addr; 396 ap->addr = p->addr;
392 ap->pre_handler = aggr_pre_handler; 397 ap->pre_handler = aggr_pre_handler;
393 ap->post_handler = aggr_post_handler;
394 ap->fault_handler = aggr_fault_handler; 398 ap->fault_handler = aggr_fault_handler;
395 ap->break_handler = aggr_break_handler; 399 if (p->post_handler)
400 ap->post_handler = aggr_post_handler;
401 if (p->break_handler)
402 ap->break_handler = aggr_break_handler;
396 403
397 INIT_LIST_HEAD(&ap->list); 404 INIT_LIST_HEAD(&ap->list);
398 list_add_rcu(&p->list, &ap->list); 405 list_add_rcu(&p->list, &ap->list);
@@ -464,6 +471,8 @@ static int __kprobes __register_kprobe(struct kprobe *p,
464 old_p = get_kprobe(p->addr); 471 old_p = get_kprobe(p->addr);
465 if (old_p) { 472 if (old_p) {
466 ret = register_aggr_kprobe(old_p, p); 473 ret = register_aggr_kprobe(old_p, p);
474 if (!ret)
475 atomic_inc(&kprobe_count);
467 goto out; 476 goto out;
468 } 477 }
469 478
@@ -474,6 +483,10 @@ static int __kprobes __register_kprobe(struct kprobe *p,
474 hlist_add_head_rcu(&p->hlist, 483 hlist_add_head_rcu(&p->hlist,
475 &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]); 484 &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]);
476 485
486 if (atomic_add_return(1, &kprobe_count) == \
487 (ARCH_INACTIVE_KPROBE_COUNT + 1))
488 register_page_fault_notifier(&kprobe_page_fault_nb);
489
477 arch_arm_kprobe(p); 490 arch_arm_kprobe(p);
478 491
479out: 492out:
@@ -536,14 +549,40 @@ valid_p:
536 kfree(old_p); 549 kfree(old_p);
537 } 550 }
538 arch_remove_kprobe(p); 551 arch_remove_kprobe(p);
552 } else {
553 mutex_lock(&kprobe_mutex);
554 if (p->break_handler)
555 old_p->break_handler = NULL;
 556		if (p->post_handler) {
 557			list_for_each_entry_rcu(list_p, &old_p->list, list) {
 558				if (list_p->post_handler) {
559 cleanup_p = 2;
560 break;
561 }
562 }
563 if (cleanup_p == 0)
564 old_p->post_handler = NULL;
565 }
566 mutex_unlock(&kprobe_mutex);
539 } 567 }
568
569 /* Call unregister_page_fault_notifier()
570 * if no probes are active
571 */
572 mutex_lock(&kprobe_mutex);
573 if (atomic_add_return(-1, &kprobe_count) == \
574 ARCH_INACTIVE_KPROBE_COUNT)
575 unregister_page_fault_notifier(&kprobe_page_fault_nb);
576 mutex_unlock(&kprobe_mutex);
577 return;
540} 578}
541 579
542static struct notifier_block kprobe_exceptions_nb = { 580static struct notifier_block kprobe_exceptions_nb = {
543 .notifier_call = kprobe_exceptions_notify, 581 .notifier_call = kprobe_exceptions_notify,
544 .priority = 0x7fffffff /* we need to notified first */ 582 .priority = 0x7fffffff /* we need to be notified first */
545}; 583};
546 584
585
547int __kprobes register_jprobe(struct jprobe *jp) 586int __kprobes register_jprobe(struct jprobe *jp)
548{ 587{
549 /* Todo: Verify probepoint is a function entry point */ 588 /* Todo: Verify probepoint is a function entry point */
@@ -652,6 +691,7 @@ static int __init init_kprobes(void)
652 INIT_HLIST_HEAD(&kprobe_table[i]); 691 INIT_HLIST_HEAD(&kprobe_table[i]);
653 INIT_HLIST_HEAD(&kretprobe_inst_table[i]); 692 INIT_HLIST_HEAD(&kretprobe_inst_table[i]);
654 } 693 }
694 atomic_set(&kprobe_count, 0);
655 695
656 err = arch_init_kprobes(); 696 err = arch_init_kprobes();
657 if (!err) 697 if (!err)
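With the change above, an aggregate probe only gains a post_handler or break_handler when some registered kprobe actually provides one, so a pre_handler-only probe no longer drags the heavier single-step path onto its siblings. A minimal sketch of such a probe follows; the probed symbol, the use of kallsyms_lookup_name() and its availability to modules are assumptions that depend on the kernel configuration.

#include <linux/module.h>
#include <linux/kprobes.h>
#include <linux/kallsyms.h>

static int example_pre(struct kprobe *p, struct pt_regs *regs)
{
	printk(KERN_INFO "kprobe hit at %p\n", p->addr);
	return 0;
}

static struct kprobe example_kp = {
	.pre_handler = example_pre,	/* deliberately no post_handler */
};

static int __init example_init(void)
{
	/* Hypothetical target; resolve the address however your setup allows. */
	example_kp.addr = (kprobe_opcode_t *)kallsyms_lookup_name("do_fork");
	if (!example_kp.addr)
		return -ENOENT;
	return register_kprobe(&example_kp);
}

static void __exit example_exit(void)
{
	unregister_kprobe(&example_kp);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");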
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index f119e098e67b..9e28478a17a5 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -14,6 +14,7 @@
14#include <linux/sysfs.h> 14#include <linux/sysfs.h>
15#include <linux/module.h> 15#include <linux/module.h>
16#include <linux/init.h> 16#include <linux/init.h>
17#include <linux/kexec.h>
17 18
18#define KERNEL_ATTR_RO(_name) \ 19#define KERNEL_ATTR_RO(_name) \
19static struct subsys_attribute _name##_attr = __ATTR_RO(_name) 20static struct subsys_attribute _name##_attr = __ATTR_RO(_name)
@@ -48,6 +49,20 @@ static ssize_t uevent_helper_store(struct subsystem *subsys, const char *page, s
48KERNEL_ATTR_RW(uevent_helper); 49KERNEL_ATTR_RW(uevent_helper);
49#endif 50#endif
50 51
52#ifdef CONFIG_KEXEC
53static ssize_t kexec_loaded_show(struct subsystem *subsys, char *page)
54{
55 return sprintf(page, "%d\n", !!kexec_image);
56}
57KERNEL_ATTR_RO(kexec_loaded);
58
59static ssize_t kexec_crash_loaded_show(struct subsystem *subsys, char *page)
60{
61 return sprintf(page, "%d\n", !!kexec_crash_image);
62}
63KERNEL_ATTR_RO(kexec_crash_loaded);
64#endif /* CONFIG_KEXEC */
65
51decl_subsys(kernel, NULL, NULL); 66decl_subsys(kernel, NULL, NULL);
52EXPORT_SYMBOL_GPL(kernel_subsys); 67EXPORT_SYMBOL_GPL(kernel_subsys);
53 68
@@ -56,6 +71,10 @@ static struct attribute * kernel_attrs[] = {
56 &uevent_seqnum_attr.attr, 71 &uevent_seqnum_attr.attr,
57 &uevent_helper_attr.attr, 72 &uevent_helper_attr.attr,
58#endif 73#endif
74#ifdef CONFIG_KEXEC
75 &kexec_loaded_attr.attr,
76 &kexec_crash_loaded_attr.attr,
77#endif
59 NULL 78 NULL
60}; 79};
61 80
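The two attributes are plain boolean flags, so checking them from userspace is a one-line read; a small sketch follows (the paths match the attributes added above, everything else is generic).

#include <stdio.h>

int main(void)
{
	char buf[8] = "";
	FILE *f = fopen("/sys/kernel/kexec_crash_loaded", "r");

	if (!f) {
		perror("kexec_crash_loaded");
		return 1;
	}
	if (fgets(buf, sizeof(buf), f))
		printf("crash kernel loaded: %s", buf);
	fclose(f);
	return 0;
}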
diff --git a/kernel/kthread.c b/kernel/kthread.c
index c5f3c6613b6d..24be714b04c7 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -45,6 +45,13 @@ struct kthread_stop_info
45static DEFINE_MUTEX(kthread_stop_lock); 45static DEFINE_MUTEX(kthread_stop_lock);
46static struct kthread_stop_info kthread_stop_info; 46static struct kthread_stop_info kthread_stop_info;
47 47
48/**
49 * kthread_should_stop - should this kthread return now?
50 *
51 * When someone calls kthread_stop on your kthread, it will be woken
52 * and this will return true. You should then return, and your return
53 * value will be passed through to kthread_stop().
54 */
48int kthread_should_stop(void) 55int kthread_should_stop(void)
49{ 56{
50 return (kthread_stop_info.k == current); 57 return (kthread_stop_info.k == current);
@@ -122,6 +129,25 @@ static void keventd_create_kthread(void *_create)
122 complete(&create->done); 129 complete(&create->done);
123} 130}
124 131
132/**
133 * kthread_create - create a kthread.
134 * @threadfn: the function to run until signal_pending(current).
135 * @data: data ptr for @threadfn.
136 * @namefmt: printf-style name for the thread.
137 *
138 * Description: This helper function creates and names a kernel
139 * thread. The thread will be stopped: use wake_up_process() to start
140 * it. See also kthread_run(), kthread_create_on_cpu().
141 *
142 * When woken, the thread will run @threadfn() with @data as its
143 * argument. @threadfn can either call do_exit() directly if it is a
144 * standalone thread for which noone will call kthread_stop(), or
145 * return when 'kthread_should_stop()' is true (which means
146 * kthread_stop() has been called). The return value should be zero
147 * or a negative error number; it will be passed to kthread_stop().
148 *
149 * Returns a task_struct or ERR_PTR(-ENOMEM).
150 */
125struct task_struct *kthread_create(int (*threadfn)(void *data), 151struct task_struct *kthread_create(int (*threadfn)(void *data),
126 void *data, 152 void *data,
127 const char namefmt[], 153 const char namefmt[],
@@ -156,6 +182,15 @@ struct task_struct *kthread_create(int (*threadfn)(void *data),
156} 182}
157EXPORT_SYMBOL(kthread_create); 183EXPORT_SYMBOL(kthread_create);
158 184
185/**
186 * kthread_bind - bind a just-created kthread to a cpu.
187 * @k: thread created by kthread_create().
188 * @cpu: cpu (might not be online, must be possible) for @k to run on.
189 *
190 * Description: This function is equivalent to set_cpus_allowed(),
191 * except that @cpu doesn't need to be online, and the thread must be
 192 * stopped (i.e., just returned from kthread_create()).
193 */
159void kthread_bind(struct task_struct *k, unsigned int cpu) 194void kthread_bind(struct task_struct *k, unsigned int cpu)
160{ 195{
161 BUG_ON(k->state != TASK_INTERRUPTIBLE); 196 BUG_ON(k->state != TASK_INTERRUPTIBLE);
@@ -166,12 +201,36 @@ void kthread_bind(struct task_struct *k, unsigned int cpu)
166} 201}
167EXPORT_SYMBOL(kthread_bind); 202EXPORT_SYMBOL(kthread_bind);
168 203
204/**
205 * kthread_stop - stop a thread created by kthread_create().
206 * @k: thread created by kthread_create().
207 *
208 * Sets kthread_should_stop() for @k to return true, wakes it, and
209 * waits for it to exit. Your threadfn() must not call do_exit()
210 * itself if you use this function! This can also be called after
211 * kthread_create() instead of calling wake_up_process(): the thread
212 * will exit without calling threadfn().
213 *
214 * Returns the result of threadfn(), or %-EINTR if wake_up_process()
215 * was never called.
216 */
169int kthread_stop(struct task_struct *k) 217int kthread_stop(struct task_struct *k)
170{ 218{
171 return kthread_stop_sem(k, NULL); 219 return kthread_stop_sem(k, NULL);
172} 220}
173EXPORT_SYMBOL(kthread_stop); 221EXPORT_SYMBOL(kthread_stop);
174 222
223/**
224 * kthread_stop_sem - stop a thread created by kthread_create().
225 * @k: thread created by kthread_create().
226 * @s: semaphore that @k waits on while idle.
227 *
228 * Does essentially the same thing as kthread_stop() above, but wakes
229 * @k by calling up(@s).
230 *
231 * Returns the result of threadfn(), or %-EINTR if wake_up_process()
232 * was never called.
233 */
175int kthread_stop_sem(struct task_struct *k, struct semaphore *s) 234int kthread_stop_sem(struct task_struct *k, struct semaphore *s)
176{ 235{
177 int ret; 236 int ret;
@@ -210,5 +269,5 @@ static __init int helper_init(void)
210 269
211 return 0; 270 return 0;
212} 271}
213core_initcall(helper_init);
214 272
273core_initcall(helper_init);
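As a companion to the kerneldoc added above, here is a minimal sketch of the intended lifecycle: create, wake (or bind) explicitly, then stop from module exit. The thread body and all names are placeholders, not code from the patch.

#include <linux/module.h>
#include <linux/kthread.h>
#include <linux/sched.h>
#include <linux/err.h>

static struct task_struct *example_task;

static int example_thread(void *data)
{
	while (!kthread_should_stop()) {
		/* ... one unit of work ... */
		schedule_timeout_interruptible(HZ);
	}
	return 0;	/* handed back to the kthread_stop() caller */
}

static int __init example_init(void)
{
	example_task = kthread_create(example_thread, NULL, "example/%d", 0);
	if (IS_ERR(example_task))
		return PTR_ERR(example_task);
	wake_up_process(example_task);	/* kthread_create() leaves it stopped */
	return 0;
}

static void __exit example_exit(void)
{
	kthread_stop(example_task);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");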
diff --git a/kernel/module.c b/kernel/module.c
index bbe04862e1b0..99c022ac3d21 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1,4 +1,4 @@
1/* Rewritten by Rusty Russell, on the backs of many others... 1/*
2 Copyright (C) 2002 Richard Henderson 2 Copyright (C) 2002 Richard Henderson
3 Copyright (C) 2001 Rusty Russell, 2002 Rusty Russell IBM. 3 Copyright (C) 2001 Rusty Russell, 2002 Rusty Russell IBM.
4 4
@@ -40,9 +40,11 @@
40#include <linux/string.h> 40#include <linux/string.h>
41#include <linux/sched.h> 41#include <linux/sched.h>
42#include <linux/mutex.h> 42#include <linux/mutex.h>
43#include <linux/unwind.h>
43#include <asm/uaccess.h> 44#include <asm/uaccess.h>
44#include <asm/semaphore.h> 45#include <asm/semaphore.h>
45#include <asm/cacheflush.h> 46#include <asm/cacheflush.h>
47#include <linux/license.h>
46 48
47#if 0 49#if 0
48#define DEBUGP printk 50#define DEBUGP printk
@@ -120,9 +122,17 @@ extern const struct kernel_symbol __start___ksymtab_gpl[];
120extern const struct kernel_symbol __stop___ksymtab_gpl[]; 122extern const struct kernel_symbol __stop___ksymtab_gpl[];
121extern const struct kernel_symbol __start___ksymtab_gpl_future[]; 123extern const struct kernel_symbol __start___ksymtab_gpl_future[];
122extern const struct kernel_symbol __stop___ksymtab_gpl_future[]; 124extern const struct kernel_symbol __stop___ksymtab_gpl_future[];
125extern const struct kernel_symbol __start___ksymtab_unused[];
126extern const struct kernel_symbol __stop___ksymtab_unused[];
127extern const struct kernel_symbol __start___ksymtab_unused_gpl[];
128extern const struct kernel_symbol __stop___ksymtab_unused_gpl[];
129extern const struct kernel_symbol __start___ksymtab_gpl_future[];
130extern const struct kernel_symbol __stop___ksymtab_gpl_future[];
123extern const unsigned long __start___kcrctab[]; 131extern const unsigned long __start___kcrctab[];
124extern const unsigned long __start___kcrctab_gpl[]; 132extern const unsigned long __start___kcrctab_gpl[];
125extern const unsigned long __start___kcrctab_gpl_future[]; 133extern const unsigned long __start___kcrctab_gpl_future[];
134extern const unsigned long __start___kcrctab_unused[];
135extern const unsigned long __start___kcrctab_unused_gpl[];
126 136
127#ifndef CONFIG_MODVERSIONS 137#ifndef CONFIG_MODVERSIONS
128#define symversion(base, idx) NULL 138#define symversion(base, idx) NULL
@@ -142,6 +152,17 @@ static const struct kernel_symbol *lookup_symbol(const char *name,
142 return NULL; 152 return NULL;
143} 153}
144 154
155static void printk_unused_warning(const char *name)
156{
157 printk(KERN_WARNING "Symbol %s is marked as UNUSED, "
158 "however this module is using it.\n", name);
159 printk(KERN_WARNING "This symbol will go away in the future.\n");
 160	printk(KERN_WARNING "Please evaluate if this is the right API to use, "
 161		"and if it really is, submit a report to the linux kernel "
 162		"mailing list together with submitting your code for "
 163		"inclusion.\n");
164}
165
145/* Find a symbol, return value, crc and module which owns it */ 166/* Find a symbol, return value, crc and module which owns it */
146static unsigned long __find_symbol(const char *name, 167static unsigned long __find_symbol(const char *name,
147 struct module **owner, 168 struct module **owner,
@@ -184,6 +205,25 @@ static unsigned long __find_symbol(const char *name,
184 return ks->value; 205 return ks->value;
185 } 206 }
186 207
208 ks = lookup_symbol(name, __start___ksymtab_unused,
209 __stop___ksymtab_unused);
210 if (ks) {
211 printk_unused_warning(name);
212 *crc = symversion(__start___kcrctab_unused,
213 (ks - __start___ksymtab_unused));
214 return ks->value;
215 }
216
217 if (gplok)
218 ks = lookup_symbol(name, __start___ksymtab_unused_gpl,
219 __stop___ksymtab_unused_gpl);
220 if (ks) {
221 printk_unused_warning(name);
222 *crc = symversion(__start___kcrctab_unused_gpl,
223 (ks - __start___ksymtab_unused_gpl));
224 return ks->value;
225 }
226
187 /* Now try modules. */ 227 /* Now try modules. */
188 list_for_each_entry(mod, &modules, list) { 228 list_for_each_entry(mod, &modules, list) {
189 *owner = mod; 229 *owner = mod;
@@ -202,6 +242,23 @@ static unsigned long __find_symbol(const char *name,
202 return ks->value; 242 return ks->value;
203 } 243 }
204 } 244 }
245 ks = lookup_symbol(name, mod->unused_syms, mod->unused_syms + mod->num_unused_syms);
246 if (ks) {
247 printk_unused_warning(name);
248 *crc = symversion(mod->unused_crcs, (ks - mod->unused_syms));
249 return ks->value;
250 }
251
252 if (gplok) {
253 ks = lookup_symbol(name, mod->unused_gpl_syms,
254 mod->unused_gpl_syms + mod->num_unused_gpl_syms);
255 if (ks) {
256 printk_unused_warning(name);
257 *crc = symversion(mod->unused_gpl_crcs,
258 (ks - mod->unused_gpl_syms));
259 return ks->value;
260 }
261 }
205 ks = lookup_symbol(name, mod->gpl_future_syms, 262 ks = lookup_symbol(name, mod->gpl_future_syms,
206 (mod->gpl_future_syms + 263 (mod->gpl_future_syms +
207 mod->num_gpl_future_syms)); 264 mod->num_gpl_future_syms));
@@ -1051,6 +1108,8 @@ static void free_module(struct module *mod)
1051 remove_sect_attrs(mod); 1108 remove_sect_attrs(mod);
1052 mod_kobject_remove(mod); 1109 mod_kobject_remove(mod);
1053 1110
1111 unwind_remove_table(mod->unwind_info, 0);
1112
1054 /* Arch-specific cleanup. */ 1113 /* Arch-specific cleanup. */
1055 module_arch_cleanup(mod); 1114 module_arch_cleanup(mod);
1056 1115
@@ -1248,16 +1307,6 @@ static void layout_sections(struct module *mod,
1248 } 1307 }
1249} 1308}
1250 1309
1251static inline int license_is_gpl_compatible(const char *license)
1252{
1253 return (strcmp(license, "GPL") == 0
1254 || strcmp(license, "GPL v2") == 0
1255 || strcmp(license, "GPL and additional rights") == 0
1256 || strcmp(license, "Dual BSD/GPL") == 0
1257 || strcmp(license, "Dual MIT/GPL") == 0
1258 || strcmp(license, "Dual MPL/GPL") == 0);
1259}
1260
1261static void set_license(struct module *mod, const char *license) 1310static void set_license(struct module *mod, const char *license)
1262{ 1311{
1263 if (!license) 1312 if (!license)
@@ -1326,7 +1375,7 @@ int is_exported(const char *name, const struct module *mod)
1326 if (!mod && lookup_symbol(name, __start___ksymtab, __stop___ksymtab)) 1375 if (!mod && lookup_symbol(name, __start___ksymtab, __stop___ksymtab))
1327 return 1; 1376 return 1;
1328 else 1377 else
1329 if (lookup_symbol(name, mod->syms, mod->syms + mod->num_syms)) 1378 if (mod && lookup_symbol(name, mod->syms, mod->syms + mod->num_syms))
1330 return 1; 1379 return 1;
1331 else 1380 else
1332 return 0; 1381 return 0;
@@ -1409,10 +1458,27 @@ static struct module *load_module(void __user *umod,
1409 Elf_Ehdr *hdr; 1458 Elf_Ehdr *hdr;
1410 Elf_Shdr *sechdrs; 1459 Elf_Shdr *sechdrs;
1411 char *secstrings, *args, *modmagic, *strtab = NULL; 1460 char *secstrings, *args, *modmagic, *strtab = NULL;
1412 unsigned int i, symindex = 0, strindex = 0, setupindex, exindex, 1461 unsigned int i;
1413 exportindex, modindex, obsparmindex, infoindex, gplindex, 1462 unsigned int symindex = 0;
1414 crcindex, gplcrcindex, versindex, pcpuindex, gplfutureindex, 1463 unsigned int strindex = 0;
1415 gplfuturecrcindex; 1464 unsigned int setupindex;
1465 unsigned int exindex;
1466 unsigned int exportindex;
1467 unsigned int modindex;
1468 unsigned int obsparmindex;
1469 unsigned int infoindex;
1470 unsigned int gplindex;
1471 unsigned int crcindex;
1472 unsigned int gplcrcindex;
1473 unsigned int versindex;
1474 unsigned int pcpuindex;
1475 unsigned int gplfutureindex;
1476 unsigned int gplfuturecrcindex;
1477 unsigned int unwindex = 0;
1478 unsigned int unusedindex;
1479 unsigned int unusedcrcindex;
1480 unsigned int unusedgplindex;
1481 unsigned int unusedgplcrcindex;
1416 struct module *mod; 1482 struct module *mod;
1417 long err = 0; 1483 long err = 0;
1418 void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */ 1484 void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */
@@ -1493,15 +1559,22 @@ static struct module *load_module(void __user *umod,
1493 exportindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab"); 1559 exportindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab");
1494 gplindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_gpl"); 1560 gplindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_gpl");
1495 gplfutureindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_gpl_future"); 1561 gplfutureindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_gpl_future");
1562 unusedindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_unused");
1563 unusedgplindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_unused_gpl");
1496 crcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab"); 1564 crcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab");
1497 gplcrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_gpl"); 1565 gplcrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_gpl");
1498 gplfuturecrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_gpl_future"); 1566 gplfuturecrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_gpl_future");
1567 unusedcrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_unused");
1568 unusedgplcrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_unused_gpl");
1499 setupindex = find_sec(hdr, sechdrs, secstrings, "__param"); 1569 setupindex = find_sec(hdr, sechdrs, secstrings, "__param");
1500 exindex = find_sec(hdr, sechdrs, secstrings, "__ex_table"); 1570 exindex = find_sec(hdr, sechdrs, secstrings, "__ex_table");
1501 obsparmindex = find_sec(hdr, sechdrs, secstrings, "__obsparm"); 1571 obsparmindex = find_sec(hdr, sechdrs, secstrings, "__obsparm");
1502 versindex = find_sec(hdr, sechdrs, secstrings, "__versions"); 1572 versindex = find_sec(hdr, sechdrs, secstrings, "__versions");
1503 infoindex = find_sec(hdr, sechdrs, secstrings, ".modinfo"); 1573 infoindex = find_sec(hdr, sechdrs, secstrings, ".modinfo");
1504 pcpuindex = find_pcpusec(hdr, sechdrs, secstrings); 1574 pcpuindex = find_pcpusec(hdr, sechdrs, secstrings);
1575#ifdef ARCH_UNWIND_SECTION_NAME
1576 unwindex = find_sec(hdr, sechdrs, secstrings, ARCH_UNWIND_SECTION_NAME);
1577#endif
1505 1578
1506 /* Don't keep modinfo section */ 1579 /* Don't keep modinfo section */
1507 sechdrs[infoindex].sh_flags &= ~(unsigned long)SHF_ALLOC; 1580 sechdrs[infoindex].sh_flags &= ~(unsigned long)SHF_ALLOC;
@@ -1510,6 +1583,8 @@ static struct module *load_module(void __user *umod,
1510 sechdrs[symindex].sh_flags |= SHF_ALLOC; 1583 sechdrs[symindex].sh_flags |= SHF_ALLOC;
1511 sechdrs[strindex].sh_flags |= SHF_ALLOC; 1584 sechdrs[strindex].sh_flags |= SHF_ALLOC;
1512#endif 1585#endif
1586 if (unwindex)
1587 sechdrs[unwindex].sh_flags |= SHF_ALLOC;
1513 1588
1514 /* Check module struct version now, before we try to use module. */ 1589 /* Check module struct version now, before we try to use module. */
1515 if (!check_modstruct_version(sechdrs, versindex, mod)) { 1590 if (!check_modstruct_version(sechdrs, versindex, mod)) {
@@ -1639,14 +1714,27 @@ static struct module *load_module(void __user *umod,
1639 mod->gpl_crcs = (void *)sechdrs[gplcrcindex].sh_addr; 1714 mod->gpl_crcs = (void *)sechdrs[gplcrcindex].sh_addr;
1640 mod->num_gpl_future_syms = sechdrs[gplfutureindex].sh_size / 1715 mod->num_gpl_future_syms = sechdrs[gplfutureindex].sh_size /
1641 sizeof(*mod->gpl_future_syms); 1716 sizeof(*mod->gpl_future_syms);
1717 mod->num_unused_syms = sechdrs[unusedindex].sh_size /
1718 sizeof(*mod->unused_syms);
1719 mod->num_unused_gpl_syms = sechdrs[unusedgplindex].sh_size /
1720 sizeof(*mod->unused_gpl_syms);
1642 mod->gpl_future_syms = (void *)sechdrs[gplfutureindex].sh_addr; 1721 mod->gpl_future_syms = (void *)sechdrs[gplfutureindex].sh_addr;
1643 if (gplfuturecrcindex) 1722 if (gplfuturecrcindex)
1644 mod->gpl_future_crcs = (void *)sechdrs[gplfuturecrcindex].sh_addr; 1723 mod->gpl_future_crcs = (void *)sechdrs[gplfuturecrcindex].sh_addr;
1645 1724
1725 mod->unused_syms = (void *)sechdrs[unusedindex].sh_addr;
1726 if (unusedcrcindex)
1727 mod->unused_crcs = (void *)sechdrs[unusedcrcindex].sh_addr;
1728 mod->unused_gpl_syms = (void *)sechdrs[unusedgplindex].sh_addr;
1729 if (unusedgplcrcindex)
 1730	mod->unused_gpl_crcs = (void *)sechdrs[unusedgplcrcindex].sh_addr;
1731
1646#ifdef CONFIG_MODVERSIONS 1732#ifdef CONFIG_MODVERSIONS
1647 if ((mod->num_syms && !crcindex) || 1733 if ((mod->num_syms && !crcindex) ||
1648 (mod->num_gpl_syms && !gplcrcindex) || 1734 (mod->num_gpl_syms && !gplcrcindex) ||
1649 (mod->num_gpl_future_syms && !gplfuturecrcindex)) { 1735 (mod->num_gpl_future_syms && !gplfuturecrcindex) ||
1736 (mod->num_unused_syms && !unusedcrcindex) ||
1737 (mod->num_unused_gpl_syms && !unusedgplcrcindex)) {
1650 printk(KERN_WARNING "%s: No versions for exported symbols." 1738 printk(KERN_WARNING "%s: No versions for exported symbols."
1651 " Tainting kernel.\n", mod->name); 1739 " Tainting kernel.\n", mod->name);
1652 add_taint(TAINT_FORCED_MODULE); 1740 add_taint(TAINT_FORCED_MODULE);
@@ -1738,6 +1826,11 @@ static struct module *load_module(void __user *umod,
1738 goto arch_cleanup; 1826 goto arch_cleanup;
1739 add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs); 1827 add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs);
1740 1828
1829 /* Size of section 0 is 0, so this works well if no unwind info. */
1830 mod->unwind_info = unwind_add_table(mod,
1831 (void *)sechdrs[unwindex].sh_addr,
1832 sechdrs[unwindex].sh_size);
1833
1741 /* Get rid of temporary copy */ 1834 /* Get rid of temporary copy */
1742 vfree(hdr); 1835 vfree(hdr);
1743 1836
@@ -1836,6 +1929,7 @@ sys_init_module(void __user *umod,
1836 mod->state = MODULE_STATE_LIVE; 1929 mod->state = MODULE_STATE_LIVE;
1837 /* Drop initial reference. */ 1930 /* Drop initial reference. */
1838 module_put(mod); 1931 module_put(mod);
1932 unwind_remove_table(mod->unwind_info, 1);
1839 module_free(mod, mod->module_init); 1933 module_free(mod, mod->module_init);
1840 mod->module_init = NULL; 1934 mod->module_init = NULL;
1841 mod->init_size = 0; 1935 mod->init_size = 0;
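The new __ksymtab_unused handling pairs with an EXPORT_UNUSED_SYMBOL()/EXPORT_UNUSED_SYMBOL_GPL() macro on the exporting side (added in the same series; treat the exact macro name as an assumption if your tree differs). A sketch of an exporter follows; any module that then links against the symbol trips printk_unused_warning() at load time.

#include <linux/module.h>

int example_legacy_helper(int arg)
{
	return arg * 2;	/* placeholder body */
}
EXPORT_UNUSED_SYMBOL(example_legacy_helper);

MODULE_LICENSE("GPL");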
diff --git a/kernel/mutex-debug.c b/kernel/mutex-debug.c
index f4913c376950..e38e4bac97ca 100644
--- a/kernel/mutex-debug.c
+++ b/kernel/mutex-debug.c
@@ -16,6 +16,7 @@
16#include <linux/sched.h> 16#include <linux/sched.h>
17#include <linux/delay.h> 17#include <linux/delay.h>
18#include <linux/module.h> 18#include <linux/module.h>
19#include <linux/poison.h>
19#include <linux/spinlock.h> 20#include <linux/spinlock.h>
20#include <linux/kallsyms.h> 21#include <linux/kallsyms.h>
21#include <linux/interrupt.h> 22#include <linux/interrupt.h>
@@ -153,13 +154,13 @@ next:
153 continue; 154 continue;
154 count++; 155 count++;
155 cursor = curr->next; 156 cursor = curr->next;
156 debug_spin_lock_restore(&debug_mutex_lock, flags); 157 debug_spin_unlock_restore(&debug_mutex_lock, flags);
157 158
158 printk("\n#%03d: ", count); 159 printk("\n#%03d: ", count);
159 printk_lock(lock, filter ? 0 : 1); 160 printk_lock(lock, filter ? 0 : 1);
160 goto next; 161 goto next;
161 } 162 }
162 debug_spin_lock_restore(&debug_mutex_lock, flags); 163 debug_spin_unlock_restore(&debug_mutex_lock, flags);
163 printk("\n"); 164 printk("\n");
164} 165}
165 166
@@ -316,7 +317,7 @@ void mutex_debug_check_no_locks_held(struct task_struct *task)
316 continue; 317 continue;
317 list_del_init(curr); 318 list_del_init(curr);
318 DEBUG_OFF(); 319 DEBUG_OFF();
319 debug_spin_lock_restore(&debug_mutex_lock, flags); 320 debug_spin_unlock_restore(&debug_mutex_lock, flags);
320 321
321 printk("BUG: %s/%d, lock held at task exit time!\n", 322 printk("BUG: %s/%d, lock held at task exit time!\n",
322 task->comm, task->pid); 323 task->comm, task->pid);
@@ -325,7 +326,7 @@ void mutex_debug_check_no_locks_held(struct task_struct *task)
325 printk("exiting task is not even the owner??\n"); 326 printk("exiting task is not even the owner??\n");
326 return; 327 return;
327 } 328 }
328 debug_spin_lock_restore(&debug_mutex_lock, flags); 329 debug_spin_unlock_restore(&debug_mutex_lock, flags);
329} 330}
330 331
331/* 332/*
@@ -352,7 +353,7 @@ void mutex_debug_check_no_locks_freed(const void *from, unsigned long len)
352 continue; 353 continue;
353 list_del_init(curr); 354 list_del_init(curr);
354 DEBUG_OFF(); 355 DEBUG_OFF();
355 debug_spin_lock_restore(&debug_mutex_lock, flags); 356 debug_spin_unlock_restore(&debug_mutex_lock, flags);
356 357
357 printk("BUG: %s/%d, active lock [%p(%p-%p)] freed!\n", 358 printk("BUG: %s/%d, active lock [%p(%p-%p)] freed!\n",
358 current->comm, current->pid, lock, from, to); 359 current->comm, current->pid, lock, from, to);
@@ -362,7 +363,7 @@ void mutex_debug_check_no_locks_freed(const void *from, unsigned long len)
362 printk("freeing task is not even the owner??\n"); 363 printk("freeing task is not even the owner??\n");
363 return; 364 return;
364 } 365 }
365 debug_spin_lock_restore(&debug_mutex_lock, flags); 366 debug_spin_unlock_restore(&debug_mutex_lock, flags);
366} 367}
367 368
368/* 369/*
@@ -381,7 +382,7 @@ void debug_mutex_set_owner(struct mutex *lock,
381 382
382void debug_mutex_init_waiter(struct mutex_waiter *waiter) 383void debug_mutex_init_waiter(struct mutex_waiter *waiter)
383{ 384{
384 memset(waiter, 0x11, sizeof(*waiter)); 385 memset(waiter, MUTEX_DEBUG_INIT, sizeof(*waiter));
385 waiter->magic = waiter; 386 waiter->magic = waiter;
386 INIT_LIST_HEAD(&waiter->list); 387 INIT_LIST_HEAD(&waiter->list);
387} 388}
@@ -397,7 +398,7 @@ void debug_mutex_wake_waiter(struct mutex *lock, struct mutex_waiter *waiter)
397void debug_mutex_free_waiter(struct mutex_waiter *waiter) 398void debug_mutex_free_waiter(struct mutex_waiter *waiter)
398{ 399{
399 DEBUG_WARN_ON(!list_empty(&waiter->list)); 400 DEBUG_WARN_ON(!list_empty(&waiter->list));
400 memset(waiter, 0x22, sizeof(*waiter)); 401 memset(waiter, MUTEX_DEBUG_FREE, sizeof(*waiter));
401} 402}
402 403
403void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter, 404void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter,
diff --git a/kernel/mutex-debug.h b/kernel/mutex-debug.h
index fd384050acb1..a5196c36a5fd 100644
--- a/kernel/mutex-debug.h
+++ b/kernel/mutex-debug.h
@@ -46,21 +46,6 @@ extern void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter,
46extern void debug_mutex_unlock(struct mutex *lock); 46extern void debug_mutex_unlock(struct mutex *lock);
47extern void debug_mutex_init(struct mutex *lock, const char *name); 47extern void debug_mutex_init(struct mutex *lock, const char *name);
48 48
49#define debug_spin_lock(lock) \
50 do { \
51 local_irq_disable(); \
52 if (debug_mutex_on) \
53 spin_lock(lock); \
54 } while (0)
55
56#define debug_spin_unlock(lock) \
57 do { \
58 if (debug_mutex_on) \
59 spin_unlock(lock); \
60 local_irq_enable(); \
61 preempt_check_resched(); \
62 } while (0)
63
64#define debug_spin_lock_save(lock, flags) \ 49#define debug_spin_lock_save(lock, flags) \
65 do { \ 50 do { \
66 local_irq_save(flags); \ 51 local_irq_save(flags); \
@@ -68,7 +53,7 @@ extern void debug_mutex_init(struct mutex *lock, const char *name);
68 spin_lock(lock); \ 53 spin_lock(lock); \
69 } while (0) 54 } while (0)
70 55
71#define debug_spin_lock_restore(lock, flags) \ 56#define debug_spin_unlock_restore(lock, flags) \
72 do { \ 57 do { \
73 if (debug_mutex_on) \ 58 if (debug_mutex_on) \
74 spin_unlock(lock); \ 59 spin_unlock(lock); \
@@ -76,20 +61,20 @@ extern void debug_mutex_init(struct mutex *lock, const char *name);
76 preempt_check_resched(); \ 61 preempt_check_resched(); \
77 } while (0) 62 } while (0)
78 63
79#define spin_lock_mutex(lock) \ 64#define spin_lock_mutex(lock, flags) \
80 do { \ 65 do { \
81 struct mutex *l = container_of(lock, struct mutex, wait_lock); \ 66 struct mutex *l = container_of(lock, struct mutex, wait_lock); \
82 \ 67 \
83 DEBUG_WARN_ON(in_interrupt()); \ 68 DEBUG_WARN_ON(in_interrupt()); \
84 debug_spin_lock(&debug_mutex_lock); \ 69 debug_spin_lock_save(&debug_mutex_lock, flags); \
85 spin_lock(lock); \ 70 spin_lock(lock); \
86 DEBUG_WARN_ON(l->magic != l); \ 71 DEBUG_WARN_ON(l->magic != l); \
87 } while (0) 72 } while (0)
88 73
89#define spin_unlock_mutex(lock) \ 74#define spin_unlock_mutex(lock, flags) \
90 do { \ 75 do { \
91 spin_unlock(lock); \ 76 spin_unlock(lock); \
92 debug_spin_unlock(&debug_mutex_lock); \ 77 debug_spin_unlock_restore(&debug_mutex_lock, flags); \
93 } while (0) 78 } while (0)
94 79
95#define DEBUG_OFF() \ 80#define DEBUG_OFF() \
diff --git a/kernel/mutex.c b/kernel/mutex.c
index 5449b210d9ed..7043db21bbce 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -125,10 +125,11 @@ __mutex_lock_common(struct mutex *lock, long state __IP_DECL__)
125 struct task_struct *task = current; 125 struct task_struct *task = current;
126 struct mutex_waiter waiter; 126 struct mutex_waiter waiter;
127 unsigned int old_val; 127 unsigned int old_val;
128 unsigned long flags;
128 129
129 debug_mutex_init_waiter(&waiter); 130 debug_mutex_init_waiter(&waiter);
130 131
131 spin_lock_mutex(&lock->wait_lock); 132 spin_lock_mutex(&lock->wait_lock, flags);
132 133
133 debug_mutex_add_waiter(lock, &waiter, task->thread_info, ip); 134 debug_mutex_add_waiter(lock, &waiter, task->thread_info, ip);
134 135
@@ -157,7 +158,7 @@ __mutex_lock_common(struct mutex *lock, long state __IP_DECL__)
157 if (unlikely(state == TASK_INTERRUPTIBLE && 158 if (unlikely(state == TASK_INTERRUPTIBLE &&
158 signal_pending(task))) { 159 signal_pending(task))) {
159 mutex_remove_waiter(lock, &waiter, task->thread_info); 160 mutex_remove_waiter(lock, &waiter, task->thread_info);
160 spin_unlock_mutex(&lock->wait_lock); 161 spin_unlock_mutex(&lock->wait_lock, flags);
161 162
162 debug_mutex_free_waiter(&waiter); 163 debug_mutex_free_waiter(&waiter);
163 return -EINTR; 164 return -EINTR;
@@ -165,9 +166,9 @@ __mutex_lock_common(struct mutex *lock, long state __IP_DECL__)
165 __set_task_state(task, state); 166 __set_task_state(task, state);
166 167
167 /* didn't get the lock, go to sleep: */ 168 /* didn't get the lock, go to sleep: */
168 spin_unlock_mutex(&lock->wait_lock); 169 spin_unlock_mutex(&lock->wait_lock, flags);
169 schedule(); 170 schedule();
170 spin_lock_mutex(&lock->wait_lock); 171 spin_lock_mutex(&lock->wait_lock, flags);
171 } 172 }
172 173
173 /* got the lock - rejoice! */ 174 /* got the lock - rejoice! */
@@ -178,7 +179,7 @@ __mutex_lock_common(struct mutex *lock, long state __IP_DECL__)
178 if (likely(list_empty(&lock->wait_list))) 179 if (likely(list_empty(&lock->wait_list)))
179 atomic_set(&lock->count, 0); 180 atomic_set(&lock->count, 0);
180 181
181 spin_unlock_mutex(&lock->wait_lock); 182 spin_unlock_mutex(&lock->wait_lock, flags);
182 183
183 debug_mutex_free_waiter(&waiter); 184 debug_mutex_free_waiter(&waiter);
184 185
@@ -203,10 +204,11 @@ static fastcall noinline void
203__mutex_unlock_slowpath(atomic_t *lock_count __IP_DECL__) 204__mutex_unlock_slowpath(atomic_t *lock_count __IP_DECL__)
204{ 205{
205 struct mutex *lock = container_of(lock_count, struct mutex, count); 206 struct mutex *lock = container_of(lock_count, struct mutex, count);
207 unsigned long flags;
206 208
207 DEBUG_WARN_ON(lock->owner != current_thread_info()); 209 DEBUG_WARN_ON(lock->owner != current_thread_info());
208 210
209 spin_lock_mutex(&lock->wait_lock); 211 spin_lock_mutex(&lock->wait_lock, flags);
210 212
211 /* 213 /*
212 * some architectures leave the lock unlocked in the fastpath failure 214 * some architectures leave the lock unlocked in the fastpath failure
@@ -231,7 +233,7 @@ __mutex_unlock_slowpath(atomic_t *lock_count __IP_DECL__)
231 233
232 debug_mutex_clear_owner(lock); 234 debug_mutex_clear_owner(lock);
233 235
234 spin_unlock_mutex(&lock->wait_lock); 236 spin_unlock_mutex(&lock->wait_lock, flags);
235} 237}
236 238
237/* 239/*
@@ -276,9 +278,10 @@ __mutex_lock_interruptible_slowpath(atomic_t *lock_count __IP_DECL__)
276static inline int __mutex_trylock_slowpath(atomic_t *lock_count) 278static inline int __mutex_trylock_slowpath(atomic_t *lock_count)
277{ 279{
278 struct mutex *lock = container_of(lock_count, struct mutex, count); 280 struct mutex *lock = container_of(lock_count, struct mutex, count);
281 unsigned long flags;
279 int prev; 282 int prev;
280 283
281 spin_lock_mutex(&lock->wait_lock); 284 spin_lock_mutex(&lock->wait_lock, flags);
282 285
283 prev = atomic_xchg(&lock->count, -1); 286 prev = atomic_xchg(&lock->count, -1);
284 if (likely(prev == 1)) 287 if (likely(prev == 1))
@@ -287,7 +290,7 @@ static inline int __mutex_trylock_slowpath(atomic_t *lock_count)
287 if (likely(list_empty(&lock->wait_list))) 290 if (likely(list_empty(&lock->wait_list)))
288 atomic_set(&lock->count, 0); 291 atomic_set(&lock->count, 0);
289 292
290 spin_unlock_mutex(&lock->wait_lock); 293 spin_unlock_mutex(&lock->wait_lock, flags);
291 294
292 return prev == 1; 295 return prev == 1;
293} 296}
diff --git a/kernel/mutex.h b/kernel/mutex.h
index 00fe84e7b672..069189947257 100644
--- a/kernel/mutex.h
+++ b/kernel/mutex.h
@@ -9,8 +9,10 @@
9 * !CONFIG_DEBUG_MUTEXES case. Most of them are NOPs: 9 * !CONFIG_DEBUG_MUTEXES case. Most of them are NOPs:
10 */ 10 */
11 11
12#define spin_lock_mutex(lock) spin_lock(lock) 12#define spin_lock_mutex(lock, flags) \
13#define spin_unlock_mutex(lock) spin_unlock(lock) 13 do { spin_lock(lock); (void)(flags); } while (0)
14#define spin_unlock_mutex(lock, flags) \
15 do { spin_unlock(lock); (void)(flags); } while (0)
14#define mutex_remove_waiter(lock, waiter, ti) \ 16#define mutex_remove_waiter(lock, waiter, ti) \
15 __list_del((waiter)->list.prev, (waiter)->list.next) 17 __list_del((waiter)->list.prev, (waiter)->list.next)
16 18
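The mutex hunks above all implement one change: the wait_lock is now taken with the caller's interrupt state carried in an explicit flags variable, so the debug build can fold its debug_mutex_lock handling into a single irqsave/irqrestore pair. A minimal sketch of the calling convention the reworked macros expect (illustrative only, not code from the patch; in the !CONFIG_DEBUG_MUTEXES build the flags argument is simply swallowed by the (void)(flags) cast):

	static void example_slowpath(struct mutex *lock)
	{
		unsigned long flags;	/* must be a local, passed to both macros */

		spin_lock_mutex(&lock->wait_lock, flags);
		/* ... walk or modify lock->wait_list with interrupts off ... */
		spin_unlock_mutex(&lock->wait_lock, flags);
	}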
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index ce0dfb8f4a4e..857b4fa09124 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -36,6 +36,24 @@ config PM_DEBUG
36 code. This is helpful when debugging and reporting various PM bugs, 36 code. This is helpful when debugging and reporting various PM bugs,
37 like suspend support. 37 like suspend support.
38 38
39config PM_TRACE
40 bool "Suspend/resume event tracing"
41 depends on PM && PM_DEBUG && X86_32 && EXPERIMENTAL
42 default n
43 ---help---
44 This enables some cheesy code to save the last PM event point in the
45 RTC across reboots, so that you can debug a machine that just hangs
46 during suspend (or more commonly, during resume).
47
48 To use this debugging feature you should attempt to suspend the machine,
49 then reboot it, then run
50
51 dmesg -s 1000000 | grep 'hash matches'
52
53 CAUTION: this option will cause your machine's real-time clock to be
54 set to an invalid time after a resume.
55
56
39config SOFTWARE_SUSPEND 57config SOFTWARE_SUSPEND
40 bool "Software Suspend" 58 bool "Software Suspend"
41 depends on PM && SWAP && (X86 && (!SMP || SUSPEND_SMP)) || ((FRV || PPC32) && !SMP) 59 depends on PM && SWAP && (X86 && (!SMP || SUSPEND_SMP)) || ((FRV || PPC32) && !SMP)
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index 81d4d982f3f0..e13e74067845 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -231,7 +231,7 @@ static int software_resume(void)
231late_initcall(software_resume); 231late_initcall(software_resume);
232 232
233 233
234static char * pm_disk_modes[] = { 234static const char * const pm_disk_modes[] = {
235 [PM_DISK_FIRMWARE] = "firmware", 235 [PM_DISK_FIRMWARE] = "firmware",
236 [PM_DISK_PLATFORM] = "platform", 236 [PM_DISK_PLATFORM] = "platform",
237 [PM_DISK_SHUTDOWN] = "shutdown", 237 [PM_DISK_SHUTDOWN] = "shutdown",
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 0a907f0dc56b..6d295c776794 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -15,7 +15,7 @@
15#include <linux/errno.h> 15#include <linux/errno.h>
16#include <linux/init.h> 16#include <linux/init.h>
17#include <linux/pm.h> 17#include <linux/pm.h>
18 18#include <linux/console.h>
19 19
20#include "power.h" 20#include "power.h"
21 21
@@ -145,7 +145,7 @@ static void suspend_finish(suspend_state_t state)
145 145
146 146
147 147
148static char *pm_states[PM_SUSPEND_MAX] = { 148static const char * const pm_states[PM_SUSPEND_MAX] = {
149 [PM_SUSPEND_STANDBY] = "standby", 149 [PM_SUSPEND_STANDBY] = "standby",
150 [PM_SUSPEND_MEM] = "mem", 150 [PM_SUSPEND_MEM] = "mem",
151#ifdef CONFIG_SOFTWARE_SUSPEND 151#ifdef CONFIG_SOFTWARE_SUSPEND
@@ -262,7 +262,7 @@ static ssize_t state_show(struct subsystem * subsys, char * buf)
262static ssize_t state_store(struct subsystem * subsys, const char * buf, size_t n) 262static ssize_t state_store(struct subsystem * subsys, const char * buf, size_t n)
263{ 263{
264 suspend_state_t state = PM_SUSPEND_STANDBY; 264 suspend_state_t state = PM_SUSPEND_STANDBY;
265 char ** s; 265 const char * const *s;
266 char *p; 266 char *p;
267 int error; 267 int error;
268 int len; 268 int len;
diff --git a/kernel/power/power.h b/kernel/power/power.h
index f06f12f21767..57a792982fb9 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -55,7 +55,7 @@ struct snapshot_handle {
55 unsigned int page; 55 unsigned int page;
56 unsigned int page_offset; 56 unsigned int page_offset;
57 unsigned int prev; 57 unsigned int prev;
58 struct pbe *pbe; 58 struct pbe *pbe, *last_pbe;
59 void *buffer; 59 void *buffer;
60 unsigned int buf_offset; 60 unsigned int buf_offset;
61}; 61};
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 3eeedbb13b78..24c96f354231 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -150,6 +150,10 @@ int restore_highmem(void)
150 } 150 }
151 return 0; 151 return 0;
152} 152}
153#else
154static inline unsigned int count_highmem_pages(void) {return 0;}
155static inline int save_highmem(void) {return 0;}
156static inline int restore_highmem(void) {return 0;}
153#endif 157#endif
154 158
155static int pfn_is_nosave(unsigned long pfn) 159static int pfn_is_nosave(unsigned long pfn)
@@ -293,62 +297,29 @@ static inline void create_pbe_list(struct pbe *pblist, unsigned int nr_pages)
293 } 297 }
294} 298}
295 299
296/** 300static unsigned int unsafe_pages;
297 * On resume it is necessary to trace and eventually free the unsafe
298 * pages that have been allocated, because they are needed for I/O
299 * (on x86-64 we likely will "eat" these pages once again while
300 * creating the temporary page translation tables)
301 */
302
303struct eaten_page {
304 struct eaten_page *next;
305 char padding[PAGE_SIZE - sizeof(void *)];
306};
307
308static struct eaten_page *eaten_pages = NULL;
309
310static void release_eaten_pages(void)
311{
312 struct eaten_page *p, *q;
313
314 p = eaten_pages;
315 while (p) {
316 q = p->next;
317 /* We don't want swsusp_free() to free this page again */
318 ClearPageNosave(virt_to_page(p));
319 free_page((unsigned long)p);
320 p = q;
321 }
322 eaten_pages = NULL;
323}
324 301
325/** 302/**
326 * @safe_needed - on resume, for storing the PBE list and the image, 303 * @safe_needed - on resume, for storing the PBE list and the image,
327 * we can only use memory pages that do not conflict with the pages 304 * we can only use memory pages that do not conflict with the pages
328 * which had been used before suspend. 305 * used before suspend.
329 * 306 *
330 * The unsafe pages are marked with the PG_nosave_free flag 307 * The unsafe pages are marked with the PG_nosave_free flag
331 * 308 * and we count them using unsafe_pages
332 * Allocated but unusable (ie eaten) memory pages should be marked
333 * so that swsusp_free() can release them
334 */ 309 */
335 310
336static inline void *alloc_image_page(gfp_t gfp_mask, int safe_needed) 311static inline void *alloc_image_page(gfp_t gfp_mask, int safe_needed)
337{ 312{
338 void *res; 313 void *res;
339 314
315 res = (void *)get_zeroed_page(gfp_mask);
340 if (safe_needed) 316 if (safe_needed)
341 do { 317 while (res && PageNosaveFree(virt_to_page(res))) {
318 /* The page is unsafe, mark it for swsusp_free() */
319 SetPageNosave(virt_to_page(res));
320 unsafe_pages++;
342 res = (void *)get_zeroed_page(gfp_mask); 321 res = (void *)get_zeroed_page(gfp_mask);
343 if (res && PageNosaveFree(virt_to_page(res))) { 322 }
344 /* This is for swsusp_free() */
345 SetPageNosave(virt_to_page(res));
346 ((struct eaten_page *)res)->next = eaten_pages;
347 eaten_pages = res;
348 }
349 } while (res && PageNosaveFree(virt_to_page(res)));
350 else
351 res = (void *)get_zeroed_page(gfp_mask);
352 if (res) { 323 if (res) {
353 SetPageNosave(virt_to_page(res)); 324 SetPageNosave(virt_to_page(res));
354 SetPageNosaveFree(virt_to_page(res)); 325 SetPageNosaveFree(virt_to_page(res));
@@ -374,7 +345,8 @@ unsigned long get_safe_page(gfp_t gfp_mask)
374 * On each page we set up a list of struct_pbe elements. 345 * On each page we set up a list of struct_pbe elements.
375 */ 346 */
376 347
377struct pbe *alloc_pagedir(unsigned int nr_pages, gfp_t gfp_mask, int safe_needed) 348static struct pbe *alloc_pagedir(unsigned int nr_pages, gfp_t gfp_mask,
349 int safe_needed)
378{ 350{
379 unsigned int num; 351 unsigned int num;
380 struct pbe *pblist, *pbe; 352 struct pbe *pblist, *pbe;
@@ -642,6 +614,8 @@ static int mark_unsafe_pages(struct pbe *pblist)
642 return -EFAULT; 614 return -EFAULT;
643 } 615 }
644 616
617 unsafe_pages = 0;
618
645 return 0; 619 return 0;
646} 620}
647 621
@@ -719,42 +693,99 @@ static inline struct pbe *unpack_orig_addresses(unsigned long *buf,
719} 693}
720 694
721/** 695/**
722 * create_image - use metadata contained in the PBE list 696 * prepare_image - use metadata contained in the PBE list
723 * pointed to by pagedir_nosave to mark the pages that will 697 * pointed to by pagedir_nosave to mark the pages that will
724 * be overwritten in the process of restoring the system 698 * be overwritten in the process of restoring the system
725 * memory state from the image and allocate memory for 699 * memory state from the image ("unsafe" pages) and allocate
726 * the image avoiding these pages 700 * memory for the image
701 *
702 * The idea is to allocate the PBE list first and then
 703 * allocate as many pages as are needed for the image data,
704 * but not to assign these pages to the PBEs initially.
705 * Instead, we just mark them as allocated and create a list
 706 * of "safe" pages which will be used later
727 */ 707 */
728 708
729static int create_image(struct snapshot_handle *handle) 709struct safe_page {
710 struct safe_page *next;
711 char padding[PAGE_SIZE - sizeof(void *)];
712};
713
714static struct safe_page *safe_pages;
715
716static int prepare_image(struct snapshot_handle *handle)
730{ 717{
731 int error = 0; 718 int error = 0;
732 struct pbe *p, *pblist; 719 unsigned int nr_pages = nr_copy_pages;
720 struct pbe *p, *pblist = NULL;
733 721
734 p = pagedir_nosave; 722 p = pagedir_nosave;
735 error = mark_unsafe_pages(p); 723 error = mark_unsafe_pages(p);
736 if (!error) { 724 if (!error) {
737 pblist = alloc_pagedir(nr_copy_pages, GFP_ATOMIC, 1); 725 pblist = alloc_pagedir(nr_pages, GFP_ATOMIC, 1);
738 if (pblist) 726 if (pblist)
739 copy_page_backup_list(pblist, p); 727 copy_page_backup_list(pblist, p);
740 free_pagedir(p, 0); 728 free_pagedir(p, 0);
741 if (!pblist) 729 if (!pblist)
742 error = -ENOMEM; 730 error = -ENOMEM;
743 } 731 }
744 if (!error) 732 safe_pages = NULL;
745 error = alloc_data_pages(pblist, GFP_ATOMIC, 1); 733 if (!error && nr_pages > unsafe_pages) {
734 nr_pages -= unsafe_pages;
735 while (nr_pages--) {
736 struct safe_page *ptr;
737
738 ptr = (struct safe_page *)get_zeroed_page(GFP_ATOMIC);
739 if (!ptr) {
740 error = -ENOMEM;
741 break;
742 }
743 if (!PageNosaveFree(virt_to_page(ptr))) {
744 /* The page is "safe", add it to the list */
745 ptr->next = safe_pages;
746 safe_pages = ptr;
747 }
748 /* Mark the page as allocated */
749 SetPageNosave(virt_to_page(ptr));
750 SetPageNosaveFree(virt_to_page(ptr));
751 }
752 }
746 if (!error) { 753 if (!error) {
747 release_eaten_pages();
748 pagedir_nosave = pblist; 754 pagedir_nosave = pblist;
749 } else { 755 } else {
750 pagedir_nosave = NULL;
751 handle->pbe = NULL; 756 handle->pbe = NULL;
752 nr_copy_pages = 0; 757 swsusp_free();
753 nr_meta_pages = 0;
754 } 758 }
755 return error; 759 return error;
756} 760}
757 761
762static void *get_buffer(struct snapshot_handle *handle)
763{
764 struct pbe *pbe = handle->pbe, *last = handle->last_pbe;
765 struct page *page = virt_to_page(pbe->orig_address);
766
767 if (PageNosave(page) && PageNosaveFree(page)) {
768 /*
769 * We have allocated the "original" page frame and we can
770 * use it directly to store the read page
771 */
772 pbe->address = 0;
773 if (last && last->next)
774 last->next = NULL;
775 return (void *)pbe->orig_address;
776 }
777 /*
778 * The "original" page frame has not been allocated and we have to
779 * use a "safe" page frame to store the read page
780 */
781 pbe->address = (unsigned long)safe_pages;
782 safe_pages = safe_pages->next;
783 if (last)
784 last->next = pbe;
785 handle->last_pbe = pbe;
786 return (void *)pbe->address;
787}
788
758/** 789/**
759 * snapshot_write_next - used for writing the system memory snapshot. 790 * snapshot_write_next - used for writing the system memory snapshot.
760 * 791 *
@@ -799,15 +830,16 @@ int snapshot_write_next(struct snapshot_handle *handle, size_t count)
799 } else if (handle->prev <= nr_meta_pages) { 830 } else if (handle->prev <= nr_meta_pages) {
800 handle->pbe = unpack_orig_addresses(buffer, handle->pbe); 831 handle->pbe = unpack_orig_addresses(buffer, handle->pbe);
801 if (!handle->pbe) { 832 if (!handle->pbe) {
802 error = create_image(handle); 833 error = prepare_image(handle);
803 if (error) 834 if (error)
804 return error; 835 return error;
805 handle->pbe = pagedir_nosave; 836 handle->pbe = pagedir_nosave;
806 handle->buffer = (void *)handle->pbe->address; 837 handle->last_pbe = NULL;
838 handle->buffer = get_buffer(handle);
807 } 839 }
808 } else { 840 } else {
809 handle->pbe = handle->pbe->next; 841 handle->pbe = handle->pbe->next;
810 handle->buffer = (void *)handle->pbe->address; 842 handle->buffer = get_buffer(handle);
811 } 843 }
812 handle->prev = handle->page; 844 handle->prev = handle->page;
813 } 845 }
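The prepare_image()/get_buffer() comments above describe the new resume-time bookkeeping: page frames that would be overwritten by the image are "unsafe", every frame allocated up front that is not one of them goes onto the safe_pages list, and a PBE is only bound to a destination frame when its data page is actually read in. A condensed sketch of that per-page decision (an illustrative restatement of get_buffer() with the last_pbe list handling left out):

	/* Sketch only: pick the destination frame for the next image page. */
	static void *choose_destination(struct pbe *pbe)
	{
		struct page *page = virt_to_page(pbe->orig_address);

		if (PageNosave(page) && PageNosaveFree(page))
			/* the original frame was allocated during prepare_image(),
			   so the data can be restored in place */
			return (void *)pbe->orig_address;

		/* otherwise borrow the next frame from the safe_pages list */
		pbe->address = (unsigned long)safe_pages;
		safe_pages = safe_pages->next;
		return (void *)pbe->address;
	}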
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index c4016cbbd3e0..17f669c83012 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -67,9 +67,9 @@ unsigned int count_highmem_pages(void);
67int save_highmem(void); 67int save_highmem(void);
68int restore_highmem(void); 68int restore_highmem(void);
69#else 69#else
70static int save_highmem(void) { return 0; } 70static inline int save_highmem(void) { return 0; }
71static int restore_highmem(void) { return 0; } 71static inline int restore_highmem(void) { return 0; }
72static unsigned int count_highmem_pages(void) { return 0; } 72static inline unsigned int count_highmem_pages(void) { return 0; }
73#endif 73#endif
74 74
75/** 75/**
@@ -175,6 +175,12 @@ void free_all_swap_pages(int swap, struct bitmap_page *bitmap)
175 */ 175 */
176 176
177#define SHRINK_BITE 10000 177#define SHRINK_BITE 10000
178static inline unsigned long __shrink_memory(long tmp)
179{
180 if (tmp > SHRINK_BITE)
181 tmp = SHRINK_BITE;
182 return shrink_all_memory(tmp);
183}
178 184
179int swsusp_shrink_memory(void) 185int swsusp_shrink_memory(void)
180{ 186{
@@ -192,15 +198,17 @@ int swsusp_shrink_memory(void)
192 PAGES_FOR_IO; 198 PAGES_FOR_IO;
193 tmp = size; 199 tmp = size;
194 for_each_zone (zone) 200 for_each_zone (zone)
195 if (!is_highmem(zone)) 201 if (!is_highmem(zone) && populated_zone(zone)) {
196 tmp -= zone->free_pages; 202 tmp -= zone->free_pages;
203 tmp += zone->lowmem_reserve[ZONE_NORMAL];
204 }
197 if (tmp > 0) { 205 if (tmp > 0) {
198 tmp = shrink_all_memory(SHRINK_BITE); 206 tmp = __shrink_memory(tmp);
199 if (!tmp) 207 if (!tmp)
200 return -ENOMEM; 208 return -ENOMEM;
201 pages += tmp; 209 pages += tmp;
202 } else if (size > image_size / PAGE_SIZE) { 210 } else if (size > image_size / PAGE_SIZE) {
203 tmp = shrink_all_memory(SHRINK_BITE); 211 tmp = __shrink_memory(size - (image_size / PAGE_SIZE));
204 pages += tmp; 212 pages += tmp;
205 } 213 }
206 printk("\b%c", p[i++%4]); 214 printk("\b%c", p[i++%4]);
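The new __shrink_memory() helper above caps every individual reclaim request at SHRINK_BITE pages, and swsusp_shrink_memory() keeps looping until either the deficit is gone or reclaim returns nothing. The overall shape, heavily simplified from the body of swsusp_shrink_memory() (compute_deficit() is an invented stand-in for the zone accounting shown in the hunk):

	long to_free = compute_deficit();		/* hypothetical helper */

	while (to_free > 0) {
		long freed = __shrink_memory(to_free);	/* at most SHRINK_BITE per call */
		if (!freed)
			return -ENOMEM;
		to_free -= freed;
	}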
diff --git a/kernel/printk.c b/kernel/printk.c
index 19a955619294..95b7fe17f124 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -24,6 +24,7 @@
24#include <linux/console.h> 24#include <linux/console.h>
25#include <linux/init.h> 25#include <linux/init.h>
26#include <linux/module.h> 26#include <linux/module.h>
27#include <linux/moduleparam.h>
27#include <linux/interrupt.h> /* For in_interrupt() */ 28#include <linux/interrupt.h> /* For in_interrupt() */
28#include <linux/config.h> 29#include <linux/config.h>
29#include <linux/delay.h> 30#include <linux/delay.h>
@@ -327,7 +328,9 @@ static void __call_console_drivers(unsigned long start, unsigned long end)
327 struct console *con; 328 struct console *con;
328 329
329 for (con = console_drivers; con; con = con->next) { 330 for (con = console_drivers; con; con = con->next) {
330 if ((con->flags & CON_ENABLED) && con->write) 331 if ((con->flags & CON_ENABLED) && con->write &&
332 (cpu_online(smp_processor_id()) ||
333 (con->flags & CON_ANYTIME)))
331 con->write(con, &LOG_BUF(start), end - start); 334 con->write(con, &LOG_BUF(start), end - start);
332 } 335 }
333} 336}
@@ -437,6 +440,7 @@ static int printk_time = 1;
437#else 440#else
438static int printk_time = 0; 441static int printk_time = 0;
439#endif 442#endif
443module_param(printk_time, int, S_IRUGO | S_IWUSR);
440 444
441static int __init printk_time_setup(char *str) 445static int __init printk_time_setup(char *str)
442{ 446{
@@ -453,6 +457,18 @@ __attribute__((weak)) unsigned long long printk_clock(void)
453 return sched_clock(); 457 return sched_clock();
454} 458}
455 459
460/* Check if we have any console registered that can be called early in boot. */
461static int have_callable_console(void)
462{
463 struct console *con;
464
465 for (con = console_drivers; con; con = con->next)
466 if (con->flags & CON_ANYTIME)
467 return 1;
468
469 return 0;
470}
471
456/** 472/**
457 * printk - print a kernel message 473 * printk - print a kernel message
458 * @fmt: format string 474 * @fmt: format string
@@ -566,27 +582,29 @@ asmlinkage int vprintk(const char *fmt, va_list args)
566 log_level_unknown = 1; 582 log_level_unknown = 1;
567 } 583 }
568 584
569 if (!cpu_online(smp_processor_id())) { 585 if (!down_trylock(&console_sem)) {
570 /* 586 /*
571 * Some console drivers may assume that per-cpu resources have 587 * We own the drivers. We can drop the spinlock and
572 * been allocated. So don't allow them to be called by this 588 * let release_console_sem() print the text, maybe ...
573 * CPU until it is officially up. We shouldn't be calling into
574 * random console drivers on a CPU which doesn't exist yet..
575 */ 589 */
590 console_locked = 1;
576 printk_cpu = UINT_MAX; 591 printk_cpu = UINT_MAX;
577 spin_unlock_irqrestore(&logbuf_lock, flags); 592 spin_unlock_irqrestore(&logbuf_lock, flags);
578 goto out; 593
579 }
580 if (!down_trylock(&console_sem)) {
581 console_locked = 1;
582 /* 594 /*
583 * We own the drivers. We can drop the spinlock and let 595 * Console drivers may assume that per-cpu resources have
584 * release_console_sem() print the text 596 * been allocated. So unless they're explicitly marked as
597 * being able to cope (CON_ANYTIME) don't call them until
598 * this CPU is officially up.
585 */ 599 */
586 printk_cpu = UINT_MAX; 600 if (cpu_online(smp_processor_id()) || have_callable_console()) {
587 spin_unlock_irqrestore(&logbuf_lock, flags); 601 console_may_schedule = 0;
588 console_may_schedule = 0; 602 release_console_sem();
589 release_console_sem(); 603 } else {
604 /* Release by hand to avoid flushing the buffer. */
605 console_locked = 0;
606 up(&console_sem);
607 }
590 } else { 608 } else {
591 /* 609 /*
592 * Someone else owns the drivers. We drop the spinlock, which 610 * Someone else owns the drivers. We drop the spinlock, which
@@ -596,7 +614,7 @@ asmlinkage int vprintk(const char *fmt, va_list args)
596 printk_cpu = UINT_MAX; 614 printk_cpu = UINT_MAX;
597 spin_unlock_irqrestore(&logbuf_lock, flags); 615 spin_unlock_irqrestore(&logbuf_lock, flags);
598 } 616 }
599out: 617
600 preempt_enable(); 618 preempt_enable();
601 return printed_len; 619 return printed_len;
602} 620}
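The printk() rework above replaces the blanket "CPU not online, skip consoles" bail-out with a per-console opt-in: only drivers flagged CON_ANYTIME are called before their CPU is marked online, and have_callable_console() tells vprintk() whether flushing early is worthwhile at all. A console driver that keeps no per-cpu state would advertise the capability roughly like this (hypothetical driver, not part of the patch):

	static struct console early_dummy_console = {
		.name	= "edum",
		.write	= early_dummy_write,	/* assumed not to touch per-cpu data */
		.flags	= CON_PRINTBUFFER | CON_ENABLED | CON_ANYTIME,
		.index	= -1,
	};

	static int __init early_dummy_console_init(void)
	{
		register_console(&early_dummy_console);
		return 0;
	}
	console_initcall(early_dummy_console_init);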
diff --git a/kernel/profile.c b/kernel/profile.c
index 68afe121e507..5a730fdb1a2c 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -299,7 +299,7 @@ out:
299} 299}
300 300
301#ifdef CONFIG_HOTPLUG_CPU 301#ifdef CONFIG_HOTPLUG_CPU
302static int profile_cpu_callback(struct notifier_block *info, 302static int __devinit profile_cpu_callback(struct notifier_block *info,
303 unsigned long action, void *__cpu) 303 unsigned long action, void *__cpu)
304{ 304{
305 int node, cpu = (unsigned long)__cpu; 305 int node, cpu = (unsigned long)__cpu;
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 921c22ad16e4..335c5b932e14 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -120,8 +120,18 @@ int ptrace_check_attach(struct task_struct *child, int kill)
120 120
121static int may_attach(struct task_struct *task) 121static int may_attach(struct task_struct *task)
122{ 122{
123 if (!task->mm) 123 /* May we inspect the given task?
124 return -EPERM; 124 * This check is used both for attaching with ptrace
125 * and for allowing access to sensitive information in /proc.
126 *
127 * ptrace_attach denies several cases that /proc allows
128 * because setting up the necessary parent/child relationship
129 * or halting the specified task is impossible.
130 */
131 int dumpable = 0;
132 /* Don't let security modules deny introspection */
133 if (task == current)
134 return 0;
125 if (((current->uid != task->euid) || 135 if (((current->uid != task->euid) ||
126 (current->uid != task->suid) || 136 (current->uid != task->suid) ||
127 (current->uid != task->uid) || 137 (current->uid != task->uid) ||
@@ -130,7 +140,9 @@ static int may_attach(struct task_struct *task)
130 (current->gid != task->gid)) && !capable(CAP_SYS_PTRACE)) 140 (current->gid != task->gid)) && !capable(CAP_SYS_PTRACE))
131 return -EPERM; 141 return -EPERM;
132 smp_rmb(); 142 smp_rmb();
133 if (!task->mm->dumpable && !capable(CAP_SYS_PTRACE)) 143 if (task->mm)
144 dumpable = task->mm->dumpable;
145 if (!dumpable && !capable(CAP_SYS_PTRACE))
134 return -EPERM; 146 return -EPERM;
135 147
136 return security_ptrace(current, task); 148 return security_ptrace(current, task);
@@ -176,6 +188,8 @@ repeat:
176 goto repeat; 188 goto repeat;
177 } 189 }
178 190
191 if (!task->mm)
192 goto bad;
179 /* the same process cannot be attached many times */ 193 /* the same process cannot be attached many times */
180 if (task->ptrace & PT_PTRACED) 194 if (task->ptrace & PT_PTRACED)
181 goto bad; 195 goto bad;
@@ -200,7 +214,7 @@ out:
200 return retval; 214 return retval;
201} 215}
202 216
203void __ptrace_detach(struct task_struct *child, unsigned int data) 217static inline void __ptrace_detach(struct task_struct *child, unsigned int data)
204{ 218{
205 child->exit_code = data; 219 child->exit_code = data;
206 /* .. re-parent .. */ 220 /* .. re-parent .. */
@@ -219,6 +233,7 @@ int ptrace_detach(struct task_struct *child, unsigned int data)
219 ptrace_disable(child); 233 ptrace_disable(child);
220 234
221 write_lock_irq(&tasklist_lock); 235 write_lock_irq(&tasklist_lock);
236 /* protect against de_thread()->release_task() */
222 if (child->ptrace) 237 if (child->ptrace)
223 __ptrace_detach(child, data); 238 __ptrace_detach(child, data);
224 write_unlock_irq(&tasklist_lock); 239 write_unlock_irq(&tasklist_lock);
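The may_attach() changes above let the same permission check serve both ptrace and /proc: a missing mm no longer means an automatic -EPERM, it just leaves dumpable at 0, while ptrace_attach() itself still refuses mm-less kernel threads via the new !task->mm test. A simplified restatement of the resulting logic (not the kernel function itself; same_ids() is an invented shorthand for the uid/gid comparison block):

	static int may_inspect(struct task_struct *task)
	{
		int dumpable = 0;

		if (task == current)
			return 0;			/* introspection is always allowed */
		if (!same_ids(current, task) && !capable(CAP_SYS_PTRACE))
			return -EPERM;
		smp_rmb();
		if (task->mm)
			dumpable = task->mm->dumpable;	/* kernel threads stay at 0 */
		if (!dumpable && !capable(CAP_SYS_PTRACE))
			return -EPERM;
		return security_ptrace(current, task);
	}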
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 2058f88c7bbb..f464f5ae3f11 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -182,6 +182,15 @@ long rcu_batches_completed(void)
182 return rcu_ctrlblk.completed; 182 return rcu_ctrlblk.completed;
183} 183}
184 184
185/*
186 * Return the number of RCU batches processed thus far. Useful
187 * for debug and statistics.
188 */
189long rcu_batches_completed_bh(void)
190{
191 return rcu_bh_ctrlblk.completed;
192}
193
185static void rcu_barrier_callback(struct rcu_head *notused) 194static void rcu_barrier_callback(struct rcu_head *notused)
186{ 195{
187 if (atomic_dec_and_test(&rcu_barrier_cpu_count)) 196 if (atomic_dec_and_test(&rcu_barrier_cpu_count))
@@ -539,7 +548,7 @@ static void __devinit rcu_online_cpu(int cpu)
539 tasklet_init(&per_cpu(rcu_tasklet, cpu), rcu_process_callbacks, 0UL); 548 tasklet_init(&per_cpu(rcu_tasklet, cpu), rcu_process_callbacks, 0UL);
540} 549}
541 550
542static int rcu_cpu_notify(struct notifier_block *self, 551static int __devinit rcu_cpu_notify(struct notifier_block *self,
543 unsigned long action, void *hcpu) 552 unsigned long action, void *hcpu)
544{ 553{
545 long cpu = (long)hcpu; 554 long cpu = (long)hcpu;
@@ -556,7 +565,7 @@ static int rcu_cpu_notify(struct notifier_block *self,
556 return NOTIFY_OK; 565 return NOTIFY_OK;
557} 566}
558 567
559static struct notifier_block rcu_nb = { 568static struct notifier_block __devinitdata rcu_nb = {
560 .notifier_call = rcu_cpu_notify, 569 .notifier_call = rcu_cpu_notify,
561}; 570};
562 571
@@ -612,14 +621,6 @@ void synchronize_rcu(void)
612 wait_for_completion(&rcu.completion); 621 wait_for_completion(&rcu.completion);
613} 622}
614 623
615/*
616 * Deprecated, use synchronize_rcu() or synchronize_sched() instead.
617 */
618void synchronize_kernel(void)
619{
620 synchronize_rcu();
621}
622
623module_param(blimit, int, 0); 624module_param(blimit, int, 0);
624module_param(qhimark, int, 0); 625module_param(qhimark, int, 0);
625module_param(qlowmark, int, 0); 626module_param(qlowmark, int, 0);
@@ -627,7 +628,7 @@ module_param(qlowmark, int, 0);
627module_param(rsinterval, int, 0); 628module_param(rsinterval, int, 0);
628#endif 629#endif
629EXPORT_SYMBOL_GPL(rcu_batches_completed); 630EXPORT_SYMBOL_GPL(rcu_batches_completed);
630EXPORT_SYMBOL_GPL_FUTURE(call_rcu); /* WARNING: GPL-only in April 2006. */ 631EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
631EXPORT_SYMBOL_GPL_FUTURE(call_rcu_bh); /* WARNING: GPL-only in April 2006. */ 632EXPORT_SYMBOL_GPL(call_rcu);
633EXPORT_SYMBOL_GPL(call_rcu_bh);
632EXPORT_SYMBOL_GPL(synchronize_rcu); 634EXPORT_SYMBOL_GPL(synchronize_rcu);
633EXPORT_SYMBOL_GPL_FUTURE(synchronize_kernel); /* WARNING: GPL-only in April 2006. */
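rcu_batches_completed_bh() above gives rcu_bh the same grace-period counter that classic RCU already exported; the rcutorture changes below rely on it to estimate how many grace periods elapsed around a read-side critical section. A sketch of that usage pattern (condensed from what the torture reader does):

	long before, elapsed;

	before = rcu_batches_completed_bh();
	rcu_read_lock_bh();
	/* ... dereference and use an rcu_bh-protected pointer ... */
	rcu_read_unlock_bh();
	elapsed = rcu_batches_completed_bh() - before;	/* roughly, completed grace periods */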
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 8154e7589d12..4d1c3d247127 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Read-Copy Update /proc-based torture test facility 2 * Read-Copy Update module-based torture test facility
3 * 3 *
4 * This program is free software; you can redistribute it and/or modify 4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by 5 * it under the terms of the GNU General Public License as published by
@@ -53,6 +53,7 @@ static int stat_interval; /* Interval between stats, in seconds. */
53static int verbose; /* Print more debug info. */ 53static int verbose; /* Print more debug info. */
54static int test_no_idle_hz; /* Test RCU's support for tickless idle CPUs. */ 54static int test_no_idle_hz; /* Test RCU's support for tickless idle CPUs. */
55static int shuffle_interval = 5; /* Interval between shuffles (in sec)*/ 55static int shuffle_interval = 5; /* Interval between shuffles (in sec)*/
56static char *torture_type = "rcu"; /* What to torture. */
56 57
57module_param(nreaders, int, 0); 58module_param(nreaders, int, 0);
58MODULE_PARM_DESC(nreaders, "Number of RCU reader threads"); 59MODULE_PARM_DESC(nreaders, "Number of RCU reader threads");
@@ -64,13 +65,16 @@ module_param(test_no_idle_hz, bool, 0);
64MODULE_PARM_DESC(test_no_idle_hz, "Test support for tickless idle CPUs"); 65MODULE_PARM_DESC(test_no_idle_hz, "Test support for tickless idle CPUs");
65module_param(shuffle_interval, int, 0); 66module_param(shuffle_interval, int, 0);
66MODULE_PARM_DESC(shuffle_interval, "Number of seconds between shuffles"); 67MODULE_PARM_DESC(shuffle_interval, "Number of seconds between shuffles");
67#define TORTURE_FLAG "rcutorture: " 68module_param(torture_type, charp, 0);
69MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh)");
70
71#define TORTURE_FLAG "-torture:"
68#define PRINTK_STRING(s) \ 72#define PRINTK_STRING(s) \
69 do { printk(KERN_ALERT TORTURE_FLAG s "\n"); } while (0) 73 do { printk(KERN_ALERT "%s" TORTURE_FLAG s "\n", torture_type); } while (0)
70#define VERBOSE_PRINTK_STRING(s) \ 74#define VERBOSE_PRINTK_STRING(s) \
71 do { if (verbose) printk(KERN_ALERT TORTURE_FLAG s "\n"); } while (0) 75 do { if (verbose) printk(KERN_ALERT "%s" TORTURE_FLAG s "\n", torture_type); } while (0)
72#define VERBOSE_PRINTK_ERRSTRING(s) \ 76#define VERBOSE_PRINTK_ERRSTRING(s) \
73 do { if (verbose) printk(KERN_ALERT TORTURE_FLAG "!!! " s "\n"); } while (0) 77 do { if (verbose) printk(KERN_ALERT "%s" TORTURE_FLAG "!!! " s "\n", torture_type); } while (0)
74 78
75static char printk_buf[4096]; 79static char printk_buf[4096];
76 80
@@ -139,28 +143,6 @@ rcu_torture_free(struct rcu_torture *p)
139 spin_unlock_bh(&rcu_torture_lock); 143 spin_unlock_bh(&rcu_torture_lock);
140} 144}
141 145
142static void
143rcu_torture_cb(struct rcu_head *p)
144{
145 int i;
146 struct rcu_torture *rp = container_of(p, struct rcu_torture, rtort_rcu);
147
148 if (fullstop) {
149 /* Test is ending, just drop callbacks on the floor. */
150 /* The next initialization will pick up the pieces. */
151 return;
152 }
153 i = rp->rtort_pipe_count;
154 if (i > RCU_TORTURE_PIPE_LEN)
155 i = RCU_TORTURE_PIPE_LEN;
156 atomic_inc(&rcu_torture_wcount[i]);
157 if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) {
158 rp->rtort_mbtest = 0;
159 rcu_torture_free(rp);
160 } else
161 call_rcu(p, rcu_torture_cb);
162}
163
164struct rcu_random_state { 146struct rcu_random_state {
165 unsigned long rrs_state; 147 unsigned long rrs_state;
166 unsigned long rrs_count; 148 unsigned long rrs_count;
@@ -191,6 +173,119 @@ rcu_random(struct rcu_random_state *rrsp)
191} 173}
192 174
193/* 175/*
176 * Operations vector for selecting different types of tests.
177 */
178
179struct rcu_torture_ops {
180 void (*init)(void);
181 void (*cleanup)(void);
182 int (*readlock)(void);
183 void (*readunlock)(int idx);
184 int (*completed)(void);
185 void (*deferredfree)(struct rcu_torture *p);
186 int (*stats)(char *page);
187 char *name;
188};
189static struct rcu_torture_ops *cur_ops = NULL;
190
191/*
192 * Definitions for rcu torture testing.
193 */
194
195static int rcu_torture_read_lock(void)
196{
197 rcu_read_lock();
198 return 0;
199}
200
201static void rcu_torture_read_unlock(int idx)
202{
203 rcu_read_unlock();
204}
205
206static int rcu_torture_completed(void)
207{
208 return rcu_batches_completed();
209}
210
211static void
212rcu_torture_cb(struct rcu_head *p)
213{
214 int i;
215 struct rcu_torture *rp = container_of(p, struct rcu_torture, rtort_rcu);
216
217 if (fullstop) {
218 /* Test is ending, just drop callbacks on the floor. */
219 /* The next initialization will pick up the pieces. */
220 return;
221 }
222 i = rp->rtort_pipe_count;
223 if (i > RCU_TORTURE_PIPE_LEN)
224 i = RCU_TORTURE_PIPE_LEN;
225 atomic_inc(&rcu_torture_wcount[i]);
226 if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) {
227 rp->rtort_mbtest = 0;
228 rcu_torture_free(rp);
229 } else
230 cur_ops->deferredfree(rp);
231}
232
233static void rcu_torture_deferred_free(struct rcu_torture *p)
234{
235 call_rcu(&p->rtort_rcu, rcu_torture_cb);
236}
237
238static struct rcu_torture_ops rcu_ops = {
239 .init = NULL,
240 .cleanup = NULL,
241 .readlock = rcu_torture_read_lock,
242 .readunlock = rcu_torture_read_unlock,
243 .completed = rcu_torture_completed,
244 .deferredfree = rcu_torture_deferred_free,
245 .stats = NULL,
246 .name = "rcu"
247};
248
249/*
250 * Definitions for rcu_bh torture testing.
251 */
252
253static int rcu_bh_torture_read_lock(void)
254{
255 rcu_read_lock_bh();
256 return 0;
257}
258
259static void rcu_bh_torture_read_unlock(int idx)
260{
261 rcu_read_unlock_bh();
262}
263
264static int rcu_bh_torture_completed(void)
265{
266 return rcu_batches_completed_bh();
267}
268
269static void rcu_bh_torture_deferred_free(struct rcu_torture *p)
270{
271 call_rcu_bh(&p->rtort_rcu, rcu_torture_cb);
272}
273
274static struct rcu_torture_ops rcu_bh_ops = {
275 .init = NULL,
276 .cleanup = NULL,
277 .readlock = rcu_bh_torture_read_lock,
278 .readunlock = rcu_bh_torture_read_unlock,
279 .completed = rcu_bh_torture_completed,
280 .deferredfree = rcu_bh_torture_deferred_free,
281 .stats = NULL,
282 .name = "rcu_bh"
283};
284
285static struct rcu_torture_ops *torture_ops[] =
286 { &rcu_ops, &rcu_bh_ops, NULL };
287
288/*
194 * RCU torture writer kthread. Repeatedly substitutes a new structure 289 * RCU torture writer kthread. Repeatedly substitutes a new structure
195 * for that pointed to by rcu_torture_current, freeing the old structure 290 * for that pointed to by rcu_torture_current, freeing the old structure
196 * after a series of grace periods (the "pipeline"). 291 * after a series of grace periods (the "pipeline").
@@ -209,8 +304,6 @@ rcu_torture_writer(void *arg)
209 304
210 do { 305 do {
211 schedule_timeout_uninterruptible(1); 306 schedule_timeout_uninterruptible(1);
212 if (rcu_batches_completed() == oldbatch)
213 continue;
214 if ((rp = rcu_torture_alloc()) == NULL) 307 if ((rp = rcu_torture_alloc()) == NULL)
215 continue; 308 continue;
216 rp->rtort_pipe_count = 0; 309 rp->rtort_pipe_count = 0;
@@ -225,10 +318,10 @@ rcu_torture_writer(void *arg)
225 i = RCU_TORTURE_PIPE_LEN; 318 i = RCU_TORTURE_PIPE_LEN;
226 atomic_inc(&rcu_torture_wcount[i]); 319 atomic_inc(&rcu_torture_wcount[i]);
227 old_rp->rtort_pipe_count++; 320 old_rp->rtort_pipe_count++;
228 call_rcu(&old_rp->rtort_rcu, rcu_torture_cb); 321 cur_ops->deferredfree(old_rp);
229 } 322 }
230 rcu_torture_current_version++; 323 rcu_torture_current_version++;
231 oldbatch = rcu_batches_completed(); 324 oldbatch = cur_ops->completed();
232 } while (!kthread_should_stop() && !fullstop); 325 } while (!kthread_should_stop() && !fullstop);
233 VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping"); 326 VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping");
234 while (!kthread_should_stop()) 327 while (!kthread_should_stop())
@@ -246,6 +339,7 @@ static int
246rcu_torture_reader(void *arg) 339rcu_torture_reader(void *arg)
247{ 340{
248 int completed; 341 int completed;
342 int idx;
249 DEFINE_RCU_RANDOM(rand); 343 DEFINE_RCU_RANDOM(rand);
250 struct rcu_torture *p; 344 struct rcu_torture *p;
251 int pipe_count; 345 int pipe_count;
@@ -254,12 +348,12 @@ rcu_torture_reader(void *arg)
254 set_user_nice(current, 19); 348 set_user_nice(current, 19);
255 349
256 do { 350 do {
257 rcu_read_lock(); 351 idx = cur_ops->readlock();
258 completed = rcu_batches_completed(); 352 completed = cur_ops->completed();
259 p = rcu_dereference(rcu_torture_current); 353 p = rcu_dereference(rcu_torture_current);
260 if (p == NULL) { 354 if (p == NULL) {
261 /* Wait for rcu_torture_writer to get underway */ 355 /* Wait for rcu_torture_writer to get underway */
262 rcu_read_unlock(); 356 cur_ops->readunlock(idx);
263 schedule_timeout_interruptible(HZ); 357 schedule_timeout_interruptible(HZ);
264 continue; 358 continue;
265 } 359 }
@@ -273,14 +367,14 @@ rcu_torture_reader(void *arg)
273 pipe_count = RCU_TORTURE_PIPE_LEN; 367 pipe_count = RCU_TORTURE_PIPE_LEN;
274 } 368 }
275 ++__get_cpu_var(rcu_torture_count)[pipe_count]; 369 ++__get_cpu_var(rcu_torture_count)[pipe_count];
276 completed = rcu_batches_completed() - completed; 370 completed = cur_ops->completed() - completed;
277 if (completed > RCU_TORTURE_PIPE_LEN) { 371 if (completed > RCU_TORTURE_PIPE_LEN) {
278 /* Should not happen, but... */ 372 /* Should not happen, but... */
279 completed = RCU_TORTURE_PIPE_LEN; 373 completed = RCU_TORTURE_PIPE_LEN;
280 } 374 }
281 ++__get_cpu_var(rcu_torture_batch)[completed]; 375 ++__get_cpu_var(rcu_torture_batch)[completed];
282 preempt_enable(); 376 preempt_enable();
283 rcu_read_unlock(); 377 cur_ops->readunlock(idx);
284 schedule(); 378 schedule();
285 } while (!kthread_should_stop() && !fullstop); 379 } while (!kthread_should_stop() && !fullstop);
286 VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping"); 380 VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping");
@@ -311,7 +405,7 @@ rcu_torture_printk(char *page)
311 if (pipesummary[i] != 0) 405 if (pipesummary[i] != 0)
312 break; 406 break;
313 } 407 }
314 cnt += sprintf(&page[cnt], "rcutorture: "); 408 cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG);
315 cnt += sprintf(&page[cnt], 409 cnt += sprintf(&page[cnt],
316 "rtc: %p ver: %ld tfle: %d rta: %d rtaf: %d rtf: %d " 410 "rtc: %p ver: %ld tfle: %d rta: %d rtaf: %d rtf: %d "
317 "rtmbe: %d", 411 "rtmbe: %d",
@@ -324,7 +418,7 @@ rcu_torture_printk(char *page)
324 atomic_read(&n_rcu_torture_mberror)); 418 atomic_read(&n_rcu_torture_mberror));
325 if (atomic_read(&n_rcu_torture_mberror) != 0) 419 if (atomic_read(&n_rcu_torture_mberror) != 0)
326 cnt += sprintf(&page[cnt], " !!!"); 420 cnt += sprintf(&page[cnt], " !!!");
327 cnt += sprintf(&page[cnt], "\nrcutorture: "); 421 cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG);
328 if (i > 1) { 422 if (i > 1) {
329 cnt += sprintf(&page[cnt], "!!! "); 423 cnt += sprintf(&page[cnt], "!!! ");
330 atomic_inc(&n_rcu_torture_error); 424 atomic_inc(&n_rcu_torture_error);
@@ -332,17 +426,19 @@ rcu_torture_printk(char *page)
332 cnt += sprintf(&page[cnt], "Reader Pipe: "); 426 cnt += sprintf(&page[cnt], "Reader Pipe: ");
333 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) 427 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++)
334 cnt += sprintf(&page[cnt], " %ld", pipesummary[i]); 428 cnt += sprintf(&page[cnt], " %ld", pipesummary[i]);
335 cnt += sprintf(&page[cnt], "\nrcutorture: "); 429 cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG);
336 cnt += sprintf(&page[cnt], "Reader Batch: "); 430 cnt += sprintf(&page[cnt], "Reader Batch: ");
337 for (i = 0; i < RCU_TORTURE_PIPE_LEN; i++) 431 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++)
338 cnt += sprintf(&page[cnt], " %ld", batchsummary[i]); 432 cnt += sprintf(&page[cnt], " %ld", batchsummary[i]);
339 cnt += sprintf(&page[cnt], "\nrcutorture: "); 433 cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG);
340 cnt += sprintf(&page[cnt], "Free-Block Circulation: "); 434 cnt += sprintf(&page[cnt], "Free-Block Circulation: ");
341 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { 435 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) {
342 cnt += sprintf(&page[cnt], " %d", 436 cnt += sprintf(&page[cnt], " %d",
343 atomic_read(&rcu_torture_wcount[i])); 437 atomic_read(&rcu_torture_wcount[i]));
344 } 438 }
345 cnt += sprintf(&page[cnt], "\n"); 439 cnt += sprintf(&page[cnt], "\n");
440 if (cur_ops->stats != NULL)
441 cnt += cur_ops->stats(&page[cnt]);
346 return cnt; 442 return cnt;
347} 443}
348 444
@@ -444,11 +540,11 @@ rcu_torture_shuffle(void *arg)
444static inline void 540static inline void
445rcu_torture_print_module_parms(char *tag) 541rcu_torture_print_module_parms(char *tag)
446{ 542{
447 printk(KERN_ALERT TORTURE_FLAG "--- %s: nreaders=%d " 543 printk(KERN_ALERT "%s" TORTURE_FLAG "--- %s: nreaders=%d "
448 "stat_interval=%d verbose=%d test_no_idle_hz=%d " 544 "stat_interval=%d verbose=%d test_no_idle_hz=%d "
449 "shuffle_interval = %d\n", 545 "shuffle_interval = %d\n",
450 tag, nrealreaders, stat_interval, verbose, test_no_idle_hz, 546 torture_type, tag, nrealreaders, stat_interval, verbose,
451 shuffle_interval); 547 test_no_idle_hz, shuffle_interval);
452} 548}
453 549
454static void 550static void
@@ -493,6 +589,9 @@ rcu_torture_cleanup(void)
493 rcu_barrier(); 589 rcu_barrier();
494 590
495 rcu_torture_stats_print(); /* -After- the stats thread is stopped! */ 591 rcu_torture_stats_print(); /* -After- the stats thread is stopped! */
592
593 if (cur_ops->cleanup != NULL)
594 cur_ops->cleanup();
496 if (atomic_read(&n_rcu_torture_error)) 595 if (atomic_read(&n_rcu_torture_error))
497 rcu_torture_print_module_parms("End of test: FAILURE"); 596 rcu_torture_print_module_parms("End of test: FAILURE");
498 else 597 else
@@ -508,6 +607,20 @@ rcu_torture_init(void)
508 607
509 /* Process args and tell the world that the torturer is on the job. */ 608 /* Process args and tell the world that the torturer is on the job. */
510 609
610 for (i = 0; cur_ops = torture_ops[i], cur_ops != NULL; i++) {
611 cur_ops = torture_ops[i];
612 if (strcmp(torture_type, cur_ops->name) == 0) {
613 break;
614 }
615 }
616 if (cur_ops == NULL) {
617 printk(KERN_ALERT "rcutorture: invalid torture type: \"%s\"\n",
618 torture_type);
619 return (-EINVAL);
620 }
621 if (cur_ops->init != NULL)
622 cur_ops->init(); /* no "goto unwind" prior to this point!!! */
623
511 if (nreaders >= 0) 624 if (nreaders >= 0)
512 nrealreaders = nreaders; 625 nrealreaders = nreaders;
513 else 626 else
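The rcu_torture_ops vector above is what lets one module body drive several RCU flavours: rcu_torture_init() walks torture_ops[] and picks the entry whose ->name matches the torture_type module parameter. A hypothetical third flavour would plug in like this (names invented to show the shape, not part of the patch):

	static struct rcu_torture_ops my_flavour_ops = {
		.init		= NULL,
		.cleanup	= NULL,
		.readlock	= my_read_lock,		/* returns an index for readunlock */
		.readunlock	= my_read_unlock,
		.completed	= my_batches_completed,
		.deferredfree	= my_deferred_free,	/* must eventually queue rcu_torture_cb */
		.stats		= NULL,
		.name		= "my_flavour"
	};

	/* ... listed in torture_ops[] so that loading the module with
	   torture_type=my_flavour selects it in rcu_torture_init(). */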
diff --git a/kernel/resource.c b/kernel/resource.c
index e3080fcc66a3..bf1130d81b7f 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -23,20 +23,18 @@
23 23
24struct resource ioport_resource = { 24struct resource ioport_resource = {
25 .name = "PCI IO", 25 .name = "PCI IO",
26 .start = 0x0000, 26 .start = 0,
27 .end = IO_SPACE_LIMIT, 27 .end = IO_SPACE_LIMIT,
28 .flags = IORESOURCE_IO, 28 .flags = IORESOURCE_IO,
29}; 29};
30
31EXPORT_SYMBOL(ioport_resource); 30EXPORT_SYMBOL(ioport_resource);
32 31
33struct resource iomem_resource = { 32struct resource iomem_resource = {
34 .name = "PCI mem", 33 .name = "PCI mem",
35 .start = 0UL, 34 .start = 0,
36 .end = ~0UL, 35 .end = -1,
37 .flags = IORESOURCE_MEM, 36 .flags = IORESOURCE_MEM,
38}; 37};
39
40EXPORT_SYMBOL(iomem_resource); 38EXPORT_SYMBOL(iomem_resource);
41 39
42static DEFINE_RWLOCK(resource_lock); 40static DEFINE_RWLOCK(resource_lock);
@@ -83,10 +81,10 @@ static int r_show(struct seq_file *m, void *v)
83 for (depth = 0, p = r; depth < MAX_IORES_LEVEL; depth++, p = p->parent) 81 for (depth = 0, p = r; depth < MAX_IORES_LEVEL; depth++, p = p->parent)
84 if (p->parent == root) 82 if (p->parent == root)
85 break; 83 break;
86 seq_printf(m, "%*s%0*lx-%0*lx : %s\n", 84 seq_printf(m, "%*s%0*llx-%0*llx : %s\n",
87 depth * 2, "", 85 depth * 2, "",
88 width, r->start, 86 width, (unsigned long long) r->start,
89 width, r->end, 87 width, (unsigned long long) r->end,
90 r->name ? r->name : "<BAD>"); 88 r->name ? r->name : "<BAD>");
91 return 0; 89 return 0;
92} 90}
@@ -151,8 +149,8 @@ __initcall(ioresources_init);
151/* Return the conflict entry if you can't request it */ 149/* Return the conflict entry if you can't request it */
152static struct resource * __request_resource(struct resource *root, struct resource *new) 150static struct resource * __request_resource(struct resource *root, struct resource *new)
153{ 151{
154 unsigned long start = new->start; 152 resource_size_t start = new->start;
155 unsigned long end = new->end; 153 resource_size_t end = new->end;
156 struct resource *tmp, **p; 154 struct resource *tmp, **p;
157 155
158 if (end < start) 156 if (end < start)
@@ -232,15 +230,52 @@ int release_resource(struct resource *old)
232 230
233EXPORT_SYMBOL(release_resource); 231EXPORT_SYMBOL(release_resource);
234 232
233#ifdef CONFIG_MEMORY_HOTPLUG
234/*
 235 * Finds the lowest memory resource that exists within [res->start, res->end)
236 * the caller must specify res->start, res->end, res->flags.
237 * If found, returns 0, res is overwritten, if not found, returns -1.
238 */
239int find_next_system_ram(struct resource *res)
240{
241 resource_size_t start, end;
242 struct resource *p;
243
244 BUG_ON(!res);
245
246 start = res->start;
247 end = res->end;
248
249 read_lock(&resource_lock);
250 for (p = iomem_resource.child; p ; p = p->sibling) {
251 /* system ram is just marked as IORESOURCE_MEM */
252 if (p->flags != res->flags)
253 continue;
254 if (p->start > end) {
255 p = NULL;
256 break;
257 }
258 if (p->start >= start)
259 break;
260 }
261 read_unlock(&resource_lock);
262 if (!p)
263 return -1;
264 /* copy data */
265 res->start = p->start;
266 res->end = p->end;
267 return 0;
268}
269#endif
270
235/* 271/*
236 * Find empty slot in the resource tree given range and alignment. 272 * Find empty slot in the resource tree given range and alignment.
237 */ 273 */
238static int find_resource(struct resource *root, struct resource *new, 274static int find_resource(struct resource *root, struct resource *new,
239 unsigned long size, 275 resource_size_t size, resource_size_t min,
240 unsigned long min, unsigned long max, 276 resource_size_t max, resource_size_t align,
241 unsigned long align,
242 void (*alignf)(void *, struct resource *, 277 void (*alignf)(void *, struct resource *,
243 unsigned long, unsigned long), 278 resource_size_t, resource_size_t),
244 void *alignf_data) 279 void *alignf_data)
245{ 280{
246 struct resource *this = root->child; 281 struct resource *this = root->child;
@@ -282,11 +317,10 @@ static int find_resource(struct resource *root, struct resource *new,
282 * Allocate empty slot in the resource tree given range and alignment. 317 * Allocate empty slot in the resource tree given range and alignment.
283 */ 318 */
284int allocate_resource(struct resource *root, struct resource *new, 319int allocate_resource(struct resource *root, struct resource *new,
285 unsigned long size, 320 resource_size_t size, resource_size_t min,
286 unsigned long min, unsigned long max, 321 resource_size_t max, resource_size_t align,
287 unsigned long align,
288 void (*alignf)(void *, struct resource *, 322 void (*alignf)(void *, struct resource *,
289 unsigned long, unsigned long), 323 resource_size_t, resource_size_t),
290 void *alignf_data) 324 void *alignf_data)
291{ 325{
292 int err; 326 int err;
@@ -378,10 +412,10 @@ EXPORT_SYMBOL(insert_resource);
378 * arguments. Returns -EBUSY if it can't fit. Existing children of 412 * arguments. Returns -EBUSY if it can't fit. Existing children of
379 * the resource are assumed to be immutable. 413 * the resource are assumed to be immutable.
380 */ 414 */
381int adjust_resource(struct resource *res, unsigned long start, unsigned long size) 415int adjust_resource(struct resource *res, resource_size_t start, resource_size_t size)
382{ 416{
383 struct resource *tmp, *parent = res->parent; 417 struct resource *tmp, *parent = res->parent;
384 unsigned long end = start + size - 1; 418 resource_size_t end = start + size - 1;
385 int result = -EBUSY; 419 int result = -EBUSY;
386 420
387 write_lock(&resource_lock); 421 write_lock(&resource_lock);
@@ -428,7 +462,9 @@ EXPORT_SYMBOL(adjust_resource);
428 * 462 *
429 * Release-region releases a matching busy region. 463 * Release-region releases a matching busy region.
430 */ 464 */
431struct resource * __request_region(struct resource *parent, unsigned long start, unsigned long n, const char *name) 465struct resource * __request_region(struct resource *parent,
466 resource_size_t start, resource_size_t n,
467 const char *name)
432{ 468{
433 struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL); 469 struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL);
434 470
@@ -464,7 +500,8 @@ struct resource * __request_region(struct resource *parent, unsigned long start,
464 500
465EXPORT_SYMBOL(__request_region); 501EXPORT_SYMBOL(__request_region);
466 502
467int __check_region(struct resource *parent, unsigned long start, unsigned long n) 503int __check_region(struct resource *parent, resource_size_t start,
504 resource_size_t n)
468{ 505{
469 struct resource * res; 506 struct resource * res;
470 507
@@ -479,10 +516,11 @@ int __check_region(struct resource *parent, unsigned long start, unsigned long n
479 516
480EXPORT_SYMBOL(__check_region); 517EXPORT_SYMBOL(__check_region);
481 518
482void __release_region(struct resource *parent, unsigned long start, unsigned long n) 519void __release_region(struct resource *parent, resource_size_t start,
520 resource_size_t n)
483{ 521{
484 struct resource **p; 522 struct resource **p;
485 unsigned long end; 523 resource_size_t end;
486 524
487 p = &parent->child; 525 p = &parent->child;
488 end = start + n - 1; 526 end = start + n - 1;
@@ -511,7 +549,9 @@ void __release_region(struct resource *parent, unsigned long start, unsigned lon
511 549
512 write_unlock(&resource_lock); 550 write_unlock(&resource_lock);
513 551
514 printk(KERN_WARNING "Trying to free nonexistent resource <%08lx-%08lx>\n", start, end); 552 printk(KERN_WARNING "Trying to free nonexistent resource "
553 "<%016llx-%016llx>\n", (unsigned long long)start,
554 (unsigned long long)end);
515} 555}
516 556
517EXPORT_SYMBOL(__release_region); 557EXPORT_SYMBOL(__release_region);
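find_next_system_ram() above is intended for memory hotplug: the caller seeds a struct resource with the window to scan and IORESOURCE_MEM flags, and on success the struct is overwritten with the lowest matching RAM chunk. A sketch of the intended walking pattern (base, limit and process_ram_chunk() are invented placeholders, not from the patch):

	struct resource res;

	res.start = base;
	res.end   = limit;
	res.flags = IORESOURCE_MEM;	/* System RAM is just marked IORESOURCE_MEM */

	while (res.start < limit && find_next_system_ram(&res) == 0) {
		/* res.start/res.end now describe the lowest RAM chunk found */
		process_ram_chunk(res.start, res.end);
		res.start = res.end + 1;	/* continue above this chunk */
		res.end   = limit;
	}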
diff --git a/kernel/rtmutex-debug.c b/kernel/rtmutex-debug.c
new file mode 100644
index 000000000000..4aa8a2c9f453
--- /dev/null
+++ b/kernel/rtmutex-debug.c
@@ -0,0 +1,513 @@
1/*
2 * RT-Mutexes: blocking mutual exclusion locks with PI support
3 *
4 * started by Ingo Molnar and Thomas Gleixner:
5 *
6 * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
7 * Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
8 *
9 * This code is based on the rt.c implementation in the preempt-rt tree.
10 * Portions of said code are
11 *
12 * Copyright (C) 2004 LynuxWorks, Inc., Igor Manyilov, Bill Huey
13 * Copyright (C) 2006 Esben Nielsen
14 * Copyright (C) 2006 Kihon Technologies Inc.,
15 * Steven Rostedt <rostedt@goodmis.org>
16 *
17 * See rt.c in preempt-rt for proper credits and further information
18 */
19#include <linux/config.h>
20#include <linux/sched.h>
21#include <linux/delay.h>
22#include <linux/module.h>
23#include <linux/spinlock.h>
24#include <linux/kallsyms.h>
25#include <linux/syscalls.h>
26#include <linux/interrupt.h>
27#include <linux/plist.h>
28#include <linux/fs.h>
29
30#include "rtmutex_common.h"
31
32#ifdef CONFIG_DEBUG_RT_MUTEXES
33# include "rtmutex-debug.h"
34#else
35# include "rtmutex.h"
36#endif
37
38# define TRACE_WARN_ON(x) WARN_ON(x)
39# define TRACE_BUG_ON(x) BUG_ON(x)
40
41# define TRACE_OFF() \
42do { \
43 if (rt_trace_on) { \
44 rt_trace_on = 0; \
45 console_verbose(); \
46 if (spin_is_locked(&current->pi_lock)) \
47 spin_unlock(&current->pi_lock); \
48 if (spin_is_locked(&current->held_list_lock)) \
49 spin_unlock(&current->held_list_lock); \
50 } \
51} while (0)
52
53# define TRACE_OFF_NOLOCK() \
54do { \
55 if (rt_trace_on) { \
56 rt_trace_on = 0; \
57 console_verbose(); \
58 } \
59} while (0)
60
61# define TRACE_BUG_LOCKED() \
62do { \
63 TRACE_OFF(); \
64 BUG(); \
65} while (0)
66
67# define TRACE_WARN_ON_LOCKED(c) \
68do { \
69 if (unlikely(c)) { \
70 TRACE_OFF(); \
71 WARN_ON(1); \
72 } \
73} while (0)
74
75# define TRACE_BUG_ON_LOCKED(c) \
76do { \
77 if (unlikely(c)) \
78 TRACE_BUG_LOCKED(); \
79} while (0)
80
81#ifdef CONFIG_SMP
82# define SMP_TRACE_BUG_ON_LOCKED(c) TRACE_BUG_ON_LOCKED(c)
83#else
84# define SMP_TRACE_BUG_ON_LOCKED(c) do { } while (0)
85#endif
86
87/*
88 * deadlock detection flag. We turn it off when we detect
 89 * the first problem because we don't want to recurse back
90 * into the tracing code when doing error printk or
91 * executing a BUG():
92 */
93int rt_trace_on = 1;
94
95void deadlock_trace_off(void)
96{
97 rt_trace_on = 0;
98}
99
100static void printk_task(task_t *p)
101{
102 if (p)
103 printk("%16s:%5d [%p, %3d]", p->comm, p->pid, p, p->prio);
104 else
105 printk("<none>");
106}
107
108static void printk_task_short(task_t *p)
109{
110 if (p)
111 printk("%s/%d [%p, %3d]", p->comm, p->pid, p, p->prio);
112 else
113 printk("<none>");
114}
115
116static void printk_lock(struct rt_mutex *lock, int print_owner)
117{
118 if (lock->name)
119 printk(" [%p] {%s}\n",
120 lock, lock->name);
121 else
122 printk(" [%p] {%s:%d}\n",
123 lock, lock->file, lock->line);
124
125 if (print_owner && rt_mutex_owner(lock)) {
126 printk(".. ->owner: %p\n", lock->owner);
127 printk(".. held by: ");
128 printk_task(rt_mutex_owner(lock));
129 printk("\n");
130 }
131 if (rt_mutex_owner(lock)) {
132 printk("... acquired at: ");
133 print_symbol("%s\n", lock->acquire_ip);
134 }
135}
136
137static void printk_waiter(struct rt_mutex_waiter *w)
138{
139 printk("-------------------------\n");
140 printk("| waiter struct %p:\n", w);
141 printk("| w->list_entry: [DP:%p/%p|SP:%p/%p|PRI:%d]\n",
142 w->list_entry.plist.prio_list.prev, w->list_entry.plist.prio_list.next,
143 w->list_entry.plist.node_list.prev, w->list_entry.plist.node_list.next,
144 w->list_entry.prio);
145 printk("| w->pi_list_entry: [DP:%p/%p|SP:%p/%p|PRI:%d]\n",
146 w->pi_list_entry.plist.prio_list.prev, w->pi_list_entry.plist.prio_list.next,
147 w->pi_list_entry.plist.node_list.prev, w->pi_list_entry.plist.node_list.next,
148 w->pi_list_entry.prio);
149 printk("\n| lock:\n");
150 printk_lock(w->lock, 1);
151 printk("| w->ti->task:\n");
152 printk_task(w->task);
153 printk("| blocked at: ");
154 print_symbol("%s\n", w->ip);
155 printk("-------------------------\n");
156}
157
158static void show_task_locks(task_t *p)
159{
160 switch (p->state) {
161 case TASK_RUNNING: printk("R"); break;
162 case TASK_INTERRUPTIBLE: printk("S"); break;
163 case TASK_UNINTERRUPTIBLE: printk("D"); break;
164 case TASK_STOPPED: printk("T"); break;
165 case EXIT_ZOMBIE: printk("Z"); break;
166 case EXIT_DEAD: printk("X"); break;
167 default: printk("?"); break;
168 }
169 printk_task(p);
170 if (p->pi_blocked_on) {
171 struct rt_mutex *lock = p->pi_blocked_on->lock;
172
173 printk(" blocked on:");
174 printk_lock(lock, 1);
175 } else
176 printk(" (not blocked)\n");
177}
178
179void rt_mutex_show_held_locks(task_t *task, int verbose)
180{
181 struct list_head *curr, *cursor = NULL;
182 struct rt_mutex *lock;
183 task_t *t;
184 unsigned long flags;
185 int count = 0;
186
187 if (!rt_trace_on)
188 return;
189
190 if (verbose) {
191 printk("------------------------------\n");
192 printk("| showing all locks held by: | (");
193 printk_task_short(task);
194 printk("):\n");
195 printk("------------------------------\n");
196 }
197
198next:
199 spin_lock_irqsave(&task->held_list_lock, flags);
200 list_for_each(curr, &task->held_list_head) {
201 if (cursor && curr != cursor)
202 continue;
203 lock = list_entry(curr, struct rt_mutex, held_list_entry);
204 t = rt_mutex_owner(lock);
205 WARN_ON(t != task);
206 count++;
207 cursor = curr->next;
208 spin_unlock_irqrestore(&task->held_list_lock, flags);
209
210 printk("\n#%03d: ", count);
211 printk_lock(lock, 0);
212 goto next;
213 }
214 spin_unlock_irqrestore(&task->held_list_lock, flags);
215
216 printk("\n");
217}
218
219void rt_mutex_show_all_locks(void)
220{
221 task_t *g, *p;
222 int count = 10;
223 int unlock = 1;
224
225 printk("\n");
226 printk("----------------------\n");
227 printk("| showing all tasks: |\n");
228 printk("----------------------\n");
229
230 /*
231 * Here we try to get the tasklist_lock as hard as possible,
232 * if not successful after 2 seconds we ignore it (but keep
233 * trying). This is to enable a debug printout even if a
234 * tasklist_lock-holding task deadlocks or crashes.
235 */
236retry:
237 if (!read_trylock(&tasklist_lock)) {
238 if (count == 10)
239 printk("hm, tasklist_lock locked, retrying... ");
240 if (count) {
241 count--;
242 printk(" #%d", 10-count);
243 mdelay(200);
244 goto retry;
245 }
246 printk(" ignoring it.\n");
247 unlock = 0;
248 }
249 if (count != 10)
250 printk(" locked it.\n");
251
252 do_each_thread(g, p) {
253 show_task_locks(p);
254 if (!unlock)
255 if (read_trylock(&tasklist_lock))
256 unlock = 1;
257 } while_each_thread(g, p);
258
259 printk("\n");
260
261 printk("-----------------------------------------\n");
262 printk("| showing all locks held in the system: |\n");
263 printk("-----------------------------------------\n");
264
265 do_each_thread(g, p) {
266 rt_mutex_show_held_locks(p, 0);
267 if (!unlock)
268 if (read_trylock(&tasklist_lock))
269 unlock = 1;
270 } while_each_thread(g, p);
271
272
273 printk("=============================================\n\n");
274
275 if (unlock)
276 read_unlock(&tasklist_lock);
277}
278
279void rt_mutex_debug_check_no_locks_held(task_t *task)
280{
281 struct rt_mutex_waiter *w;
282 struct list_head *curr;
283 struct rt_mutex *lock;
284
285 if (!rt_trace_on)
286 return;
287 if (!rt_prio(task->normal_prio) && rt_prio(task->prio)) {
288 printk("BUG: PI priority boost leaked!\n");
289 printk_task(task);
290 printk("\n");
291 }
292 if (list_empty(&task->held_list_head))
293 return;
294
295 spin_lock(&task->pi_lock);
296 plist_for_each_entry(w, &task->pi_waiters, pi_list_entry) {
297 TRACE_OFF();
298
299 printk("hm, PI interest held at exit time? Task:\n");
300 printk_task(task);
301 printk_waiter(w);
302 return;
303 }
304 spin_unlock(&task->pi_lock);
305
306 list_for_each(curr, &task->held_list_head) {
307 lock = list_entry(curr, struct rt_mutex, held_list_entry);
308
309 printk("BUG: %s/%d, lock held at task exit time!\n",
310 task->comm, task->pid);
311 printk_lock(lock, 1);
312 if (rt_mutex_owner(lock) != task)
313 printk("exiting task is not even the owner??\n");
314 }
315}
316
317int rt_mutex_debug_check_no_locks_freed(const void *from, unsigned long len)
318{
319 const void *to = from + len;
320 struct list_head *curr;
321 struct rt_mutex *lock;
322 unsigned long flags;
323 void *lock_addr;
324
325 if (!rt_trace_on)
326 return 0;
327
328 spin_lock_irqsave(&current->held_list_lock, flags);
329 list_for_each(curr, &current->held_list_head) {
330 lock = list_entry(curr, struct rt_mutex, held_list_entry);
331 lock_addr = lock;
332 if (lock_addr < from || lock_addr >= to)
333 continue;
334 TRACE_OFF();
335
336 printk("BUG: %s/%d, active lock [%p(%p-%p)] freed!\n",
337 current->comm, current->pid, lock, from, to);
338 dump_stack();
339 printk_lock(lock, 1);
340 if (rt_mutex_owner(lock) != current)
341 printk("freeing task is not even the owner??\n");
342 return 1;
343 }
344 spin_unlock_irqrestore(&current->held_list_lock, flags);
345
346 return 0;
347}
348
349void rt_mutex_debug_task_free(struct task_struct *task)
350{
351 WARN_ON(!plist_head_empty(&task->pi_waiters));
352 WARN_ON(task->pi_blocked_on);
353}
354
355/*
356 * We fill out the fields in the waiter to store the information about
357 * the deadlock. We print when we return. act_waiter can be NULL in
358 * case of a remove waiter operation.
359 */
360void debug_rt_mutex_deadlock(int detect, struct rt_mutex_waiter *act_waiter,
361 struct rt_mutex *lock)
362{
363 struct task_struct *task;
364
365 if (!rt_trace_on || detect || !act_waiter)
366 return;
367
368 task = rt_mutex_owner(act_waiter->lock);
369 if (task && task != current) {
370 act_waiter->deadlock_task_pid = task->pid;
371 act_waiter->deadlock_lock = lock;
372 }
373}
374
375void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter)
376{
377 struct task_struct *task;
378
379 if (!waiter->deadlock_lock || !rt_trace_on)
380 return;
381
382 task = find_task_by_pid(waiter->deadlock_task_pid);
383 if (!task)
384 return;
385
386 TRACE_OFF_NOLOCK();
387
388 printk("\n============================================\n");
389 printk( "[ BUG: circular locking deadlock detected! ]\n");
390 printk( "--------------------------------------------\n");
391 printk("%s/%d is deadlocking current task %s/%d\n\n",
392 task->comm, task->pid, current->comm, current->pid);
393
394 printk("\n1) %s/%d is trying to acquire this lock:\n",
395 current->comm, current->pid);
396 printk_lock(waiter->lock, 1);
397
398 printk("... trying at: ");
399 print_symbol("%s\n", waiter->ip);
400
401 printk("\n2) %s/%d is blocked on this lock:\n", task->comm, task->pid);
402 printk_lock(waiter->deadlock_lock, 1);
403
404 rt_mutex_show_held_locks(current, 1);
405 rt_mutex_show_held_locks(task, 1);
406
407 printk("\n%s/%d's [blocked] stackdump:\n\n", task->comm, task->pid);
408 show_stack(task, NULL);
409 printk("\n%s/%d's [current] stackdump:\n\n",
410 current->comm, current->pid);
411 dump_stack();
412 rt_mutex_show_all_locks();
413 printk("[ turning off deadlock detection."
414 "Please report this trace. ]\n\n");
415 local_irq_disable();
416}
417
418void debug_rt_mutex_lock(struct rt_mutex *lock __IP_DECL__)
419{
420 unsigned long flags;
421
422 if (rt_trace_on) {
423 TRACE_WARN_ON_LOCKED(!list_empty(&lock->held_list_entry));
424
425 spin_lock_irqsave(&current->held_list_lock, flags);
426 list_add_tail(&lock->held_list_entry, &current->held_list_head);
427 spin_unlock_irqrestore(&current->held_list_lock, flags);
428
429 lock->acquire_ip = ip;
430 }
431}
432
433void debug_rt_mutex_unlock(struct rt_mutex *lock)
434{
435 unsigned long flags;
436
437 if (rt_trace_on) {
438 TRACE_WARN_ON_LOCKED(rt_mutex_owner(lock) != current);
439 TRACE_WARN_ON_LOCKED(list_empty(&lock->held_list_entry));
440
441 spin_lock_irqsave(&current->held_list_lock, flags);
442 list_del_init(&lock->held_list_entry);
443 spin_unlock_irqrestore(&current->held_list_lock, flags);
444 }
445}
446
447void debug_rt_mutex_proxy_lock(struct rt_mutex *lock,
448 struct task_struct *powner __IP_DECL__)
449{
450 unsigned long flags;
451
452 if (rt_trace_on) {
453 TRACE_WARN_ON_LOCKED(!list_empty(&lock->held_list_entry));
454
455 spin_lock_irqsave(&powner->held_list_lock, flags);
456 list_add_tail(&lock->held_list_entry, &powner->held_list_head);
457 spin_unlock_irqrestore(&powner->held_list_lock, flags);
458
459 lock->acquire_ip = ip;
460 }
461}
462
463void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock)
464{
465 unsigned long flags;
466
467 if (rt_trace_on) {
468 struct task_struct *owner = rt_mutex_owner(lock);
469
470 TRACE_WARN_ON_LOCKED(!owner);
471 TRACE_WARN_ON_LOCKED(list_empty(&lock->held_list_entry));
472
473 spin_lock_irqsave(&owner->held_list_lock, flags);
474 list_del_init(&lock->held_list_entry);
475 spin_unlock_irqrestore(&owner->held_list_lock, flags);
476 }
477}
478
479void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter)
480{
481 memset(waiter, 0x11, sizeof(*waiter));
482 plist_node_init(&waiter->list_entry, MAX_PRIO);
483 plist_node_init(&waiter->pi_list_entry, MAX_PRIO);
484}
485
486void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter)
487{
488 TRACE_WARN_ON(!plist_node_empty(&waiter->list_entry));
489 TRACE_WARN_ON(!plist_node_empty(&waiter->pi_list_entry));
490 TRACE_WARN_ON(waiter->task);
491 memset(waiter, 0x22, sizeof(*waiter));
492}
493
494void debug_rt_mutex_init(struct rt_mutex *lock, const char *name)
495{
496 void *addr = lock;
497
498 if (rt_trace_on) {
499 rt_mutex_debug_check_no_locks_freed(addr,
500 sizeof(struct rt_mutex));
501 INIT_LIST_HEAD(&lock->held_list_entry);
502 lock->name = name;
503 }
504}
505
506void rt_mutex_deadlock_account_lock(struct rt_mutex *lock, task_t *task)
507{
508}
509
510void rt_mutex_deadlock_account_unlock(struct task_struct *task)
511{
512}
513
diff --git a/kernel/rtmutex-debug.h b/kernel/rtmutex-debug.h
new file mode 100644
index 000000000000..7612fbc62d70
--- /dev/null
+++ b/kernel/rtmutex-debug.h
@@ -0,0 +1,37 @@
1/*
2 * RT-Mutexes: blocking mutual exclusion locks with PI support
3 *
4 * started by Ingo Molnar and Thomas Gleixner:
5 *
6 * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
7 * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
8 *
9 * This file contains macros used solely by rtmutex.c. Debug version.
10 */
11
12#define __IP_DECL__ , unsigned long ip
13#define __IP__ , ip
14#define __RET_IP__ , (unsigned long)__builtin_return_address(0)
15
16extern void
17rt_mutex_deadlock_account_lock(struct rt_mutex *lock, struct task_struct *task);
18extern void rt_mutex_deadlock_account_unlock(struct task_struct *task);
19extern void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter);
20extern void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter);
21extern void debug_rt_mutex_init(struct rt_mutex *lock, const char *name);
22extern void debug_rt_mutex_lock(struct rt_mutex *lock __IP_DECL__);
23extern void debug_rt_mutex_unlock(struct rt_mutex *lock);
24extern void debug_rt_mutex_proxy_lock(struct rt_mutex *lock,
25 struct task_struct *powner __IP_DECL__);
26extern void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock);
27extern void debug_rt_mutex_deadlock(int detect, struct rt_mutex_waiter *waiter,
28 struct rt_mutex *lock);
29extern void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter);
30# define debug_rt_mutex_reset_waiter(w) \
31 do { (w)->deadlock_lock = NULL; } while (0)
32
33static inline int debug_rt_mutex_detect_deadlock(struct rt_mutex_waiter *waiter,
34 int detect)
35{
36 return (waiter != NULL);
37}
diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c
new file mode 100644
index 000000000000..e82c2f848249
--- /dev/null
+++ b/kernel/rtmutex-tester.c
@@ -0,0 +1,440 @@
1/*
2 * RT-Mutex-tester: scriptable tester for rt mutexes
3 *
4 * started by Thomas Gleixner:
5 *
6 * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
7 *
8 */
9#include <linux/config.h>
10#include <linux/kthread.h>
11#include <linux/module.h>
12#include <linux/sched.h>
13#include <linux/smp_lock.h>
14#include <linux/spinlock.h>
15#include <linux/sysdev.h>
16#include <linux/timer.h>
17
18#include "rtmutex.h"
19
20#define MAX_RT_TEST_THREADS 8
21#define MAX_RT_TEST_MUTEXES 8
22
23static spinlock_t rttest_lock;
24static atomic_t rttest_event;
25
26struct test_thread_data {
27 int opcode;
28 int opdata;
29 int mutexes[MAX_RT_TEST_MUTEXES];
30 int bkl;
31 int event;
32 struct sys_device sysdev;
33};
34
35static struct test_thread_data thread_data[MAX_RT_TEST_THREADS];
36static task_t *threads[MAX_RT_TEST_THREADS];
37static struct rt_mutex mutexes[MAX_RT_TEST_MUTEXES];
38
39enum test_opcodes {
40 RTTEST_NOP = 0,
41 RTTEST_SCHEDOT, /* 1 Sched other, data = nice */
42 RTTEST_SCHEDRT, /* 2 Sched fifo, data = prio */
43 RTTEST_LOCK, /* 3 Lock uninterruptible, data = lockindex */
44 RTTEST_LOCKNOWAIT, /* 4 Lock uninterruptible no wait in wakeup, data = lockindex */
45 RTTEST_LOCKINT, /* 5 Lock interruptible, data = lockindex */
46 RTTEST_LOCKINTNOWAIT, /* 6 Lock interruptible no wait in wakeup, data = lockindex */
47 RTTEST_LOCKCONT, /* 7 Continue locking after the wakeup delay */
48 RTTEST_UNLOCK, /* 8 Unlock, data = lockindex */
49 RTTEST_LOCKBKL, /* 9 Lock BKL */
50 RTTEST_UNLOCKBKL, /* 10 Unlock BKL */
51 RTTEST_SIGNAL, /* 11 Signal other test thread, data = thread id */
52 RTTEST_RESETEVENT = 98, /* 98 Reset event counter */
53 RTTEST_RESET = 99, /* 99 Reset all pending operations */
54};
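/*
 * Aside (not part of the patch): a minimal user-space driver for the
 * opcode interface above.  It assumes the sysdev class registered below
 * shows up as /sys/devices/system/rttest/rttest<N>/ with "command" and
 * "status" attributes; adjust the path if your tree differs.  Commands
 * are written as "opcode:data", e.g. "5:2" = RTTEST_LOCKINT on mutex 2.
 */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char buf[256];
	ssize_t n;
	int fd;

	/* tell tester thread 0 to take mutex 2 interruptibly */
	fd = open("/sys/devices/system/rttest/rttest0/command", O_WRONLY);
	if (fd < 0) {
		perror("command");
		return 1;
	}
	n = write(fd, "5:2", 3);
	close(fd);

	/* read back the status line produced by sysfs_test_status() */
	fd = open("/sys/devices/system/rttest/rttest0/status", O_RDONLY);
	if (fd < 0) {
		perror("status");
		return 1;
	}
	n = read(fd, buf, sizeof(buf) - 1);
	if (n > 0) {
		buf[n] = '\0';
		fputs(buf, stdout);
	}
	close(fd);
	return 0;
}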
55
56static int handle_op(struct test_thread_data *td, int lockwakeup)
57{
58 int i, id, ret = -EINVAL;
59
60 switch(td->opcode) {
61
62 case RTTEST_NOP:
63 return 0;
64
65 case RTTEST_LOCKCONT:
66 td->mutexes[td->opdata] = 1;
67 td->event = atomic_add_return(1, &rttest_event);
68 return 0;
69
70 case RTTEST_RESET:
71 for (i = 0; i < MAX_RT_TEST_MUTEXES; i++) {
72 if (td->mutexes[i] == 4) {
73 rt_mutex_unlock(&mutexes[i]);
74 td->mutexes[i] = 0;
75 }
76 }
77
78 if (!lockwakeup && td->bkl == 4) {
79 unlock_kernel();
80 td->bkl = 0;
81 }
82 return 0;
83
84 case RTTEST_RESETEVENT:
85 atomic_set(&rttest_event, 0);
86 return 0;
87
88 default:
89 if (lockwakeup)
90 return ret;
91 }
92
93 switch(td->opcode) {
94
95 case RTTEST_LOCK:
96 case RTTEST_LOCKNOWAIT:
97 id = td->opdata;
98 if (id < 0 || id >= MAX_RT_TEST_MUTEXES)
99 return ret;
100
101 td->mutexes[id] = 1;
102 td->event = atomic_add_return(1, &rttest_event);
103 rt_mutex_lock(&mutexes[id]);
104 td->event = atomic_add_return(1, &rttest_event);
105 td->mutexes[id] = 4;
106 return 0;
107
108 case RTTEST_LOCKINT:
109 case RTTEST_LOCKINTNOWAIT:
110 id = td->opdata;
111 if (id < 0 || id >= MAX_RT_TEST_MUTEXES)
112 return ret;
113
114 td->mutexes[id] = 1;
115 td->event = atomic_add_return(1, &rttest_event);
116 ret = rt_mutex_lock_interruptible(&mutexes[id], 0);
117 td->event = atomic_add_return(1, &rttest_event);
118 td->mutexes[id] = ret ? 0 : 4;
119 return ret ? -EINTR : 0;
120
121 case RTTEST_UNLOCK:
122 id = td->opdata;
123 if (id < 0 || id >= MAX_RT_TEST_MUTEXES || td->mutexes[id] != 4)
124 return ret;
125
126 td->event = atomic_add_return(1, &rttest_event);
127 rt_mutex_unlock(&mutexes[id]);
128 td->event = atomic_add_return(1, &rttest_event);
129 td->mutexes[id] = 0;
130 return 0;
131
132 case RTTEST_LOCKBKL:
133 if (td->bkl)
134 return 0;
135 td->bkl = 1;
136 lock_kernel();
137 td->bkl = 4;
138 return 0;
139
140 case RTTEST_UNLOCKBKL:
141 if (td->bkl != 4)
142 break;
143 unlock_kernel();
144 td->bkl = 0;
145 return 0;
146
147 default:
148 break;
149 }
150 return ret;
151}
152
153/*
154 * Schedule replacement for rtsem_down(). Only called for threads with
155 * PF_MUTEX_TESTER set.
156 *
157 * This allows us to have fine-grained control over the event flow.
158 *
159 */
160void schedule_rt_mutex_test(struct rt_mutex *mutex)
161{
162 int tid, op, dat;
163 struct test_thread_data *td;
164
165 /* We have to look up the task */
166 for (tid = 0; tid < MAX_RT_TEST_THREADS; tid++) {
167 if (threads[tid] == current)
168 break;
169 }
170
171 BUG_ON(tid == MAX_RT_TEST_THREADS);
172
173 td = &thread_data[tid];
174
175 op = td->opcode;
176 dat = td->opdata;
177
178 switch (op) {
179 case RTTEST_LOCK:
180 case RTTEST_LOCKINT:
181 case RTTEST_LOCKNOWAIT:
182 case RTTEST_LOCKINTNOWAIT:
183 if (mutex != &mutexes[dat])
184 break;
185
186 if (td->mutexes[dat] != 1)
187 break;
188
189 td->mutexes[dat] = 2;
190 td->event = atomic_add_return(1, &rttest_event);
191 break;
192
193 case RTTEST_LOCKBKL:
194 default:
195 break;
196 }
197
198 schedule();
199
200
201 switch (op) {
202 case RTTEST_LOCK:
203 case RTTEST_LOCKINT:
204 if (mutex != &mutexes[dat])
205 return;
206
207 if (td->mutexes[dat] != 2)
208 return;
209
210 td->mutexes[dat] = 3;
211 td->event = atomic_add_return(1, &rttest_event);
212 break;
213
214 case RTTEST_LOCKNOWAIT:
215 case RTTEST_LOCKINTNOWAIT:
216 if (mutex != &mutexes[dat])
217 return;
218
219 if (td->mutexes[dat] != 2)
220 return;
221
222 td->mutexes[dat] = 1;
223 td->event = atomic_add_return(1, &rttest_event);
224 return;
225
226 case RTTEST_LOCKBKL:
227 return;
228 default:
229 return;
230 }
231
232 td->opcode = 0;
233
234 for (;;) {
235 set_current_state(TASK_INTERRUPTIBLE);
236
237 if (td->opcode > 0) {
238 int ret;
239
240 set_current_state(TASK_RUNNING);
241 ret = handle_op(td, 1);
242 set_current_state(TASK_INTERRUPTIBLE);
243 if (td->opcode == RTTEST_LOCKCONT)
244 break;
245 td->opcode = ret;
246 }
247
248 /* Wait for the next command to be executed */
249 schedule();
250 }
251
252 /* Restore previous command and data */
253 td->opcode = op;
254 td->opdata = dat;
255}
256
257static int test_func(void *data)
258{
259 struct test_thread_data *td = data;
260 int ret;
261
262 current->flags |= PF_MUTEX_TESTER;
263 allow_signal(SIGHUP);
264
265 for(;;) {
266
267 set_current_state(TASK_INTERRUPTIBLE);
268
269 if (td->opcode > 0) {
270 set_current_state(TASK_RUNNING);
271 ret = handle_op(td, 0);
272 set_current_state(TASK_INTERRUPTIBLE);
273 td->opcode = ret;
274 }
275
276 /* Wait for the next command to be executed */
277 schedule();
278
279 if (signal_pending(current))
280 flush_signals(current);
281
282 if(kthread_should_stop())
283 break;
284 }
285 return 0;
286}
287
288/**
289 * sysfs_test_command - interface for test commands
290 * @dev: thread reference
291 * @buf: command for actual step
292 * @count: length of buffer
293 *
294 * command syntax:
295 *
296 * opcode:data
297 */
298static ssize_t sysfs_test_command(struct sys_device *dev, const char *buf,
299 size_t count)
300{
301 struct sched_param schedpar;
302 struct test_thread_data *td;
303 char cmdbuf[32];
304 int op, dat, tid, ret;
305
306 td = container_of(dev, struct test_thread_data, sysdev);
307 tid = td->sysdev.id;
308
309 /* strings from sysfs write are not 0 terminated! */
310 if (count >= sizeof(cmdbuf))
311 return -EINVAL;
312
313 /* strip off the trailing \n: */
314 if (buf[count-1] == '\n')
315 count--;
316 if (count < 1)
317 return -EINVAL;
318
319 memcpy(cmdbuf, buf, count);
320 cmdbuf[count] = 0;
321
322 if (sscanf(cmdbuf, "%d:%d", &op, &dat) != 2)
323 return -EINVAL;
324
325 switch (op) {
326 case RTTEST_SCHEDOT:
327 schedpar.sched_priority = 0;
328 ret = sched_setscheduler(threads[tid], SCHED_NORMAL, &schedpar);
329 if (ret)
330 return ret;
331 set_user_nice(current, 0);
332 break;
333
334 case RTTEST_SCHEDRT:
335 schedpar.sched_priority = dat;
336 ret = sched_setscheduler(threads[tid], SCHED_FIFO, &schedpar);
337 if (ret)
338 return ret;
339 break;
340
341 case RTTEST_SIGNAL:
342 send_sig(SIGHUP, threads[tid], 0);
343 break;
344
345 default:
346 if (td->opcode > 0)
347 return -EBUSY;
348 td->opdata = dat;
349 td->opcode = op;
350 wake_up_process(threads[tid]);
351 }
352
353 return count;
354}
355
356/**
357 * sysfs_test_status - sysfs interface for rt tester
358 * @dev: thread to query
359 * @buf: char buffer to be filled with thread status info
360 */
361static ssize_t sysfs_test_status(struct sys_device *dev, char *buf)
362{
363 struct test_thread_data *td;
364 char *curr = buf;
365 task_t *tsk;
366 int i;
367
368 td = container_of(dev, struct test_thread_data, sysdev);
369 tsk = threads[td->sysdev.id];
370
371 spin_lock(&rttest_lock);
372
373 curr += sprintf(curr,
374 "O: %4d, E:%8d, S: 0x%08lx, P: %4d, N: %4d, B: %p, K: %d, M:",
375 td->opcode, td->event, tsk->state,
376 (MAX_RT_PRIO - 1) - tsk->prio,
377 (MAX_RT_PRIO - 1) - tsk->normal_prio,
378 tsk->pi_blocked_on, td->bkl);
379
380 for (i = MAX_RT_TEST_MUTEXES - 1; i >=0 ; i--)
381 curr += sprintf(curr, "%d", td->mutexes[i]);
382
383 spin_unlock(&rttest_lock);
384
385 curr += sprintf(curr, ", T: %p, R: %p\n", tsk,
386 mutexes[td->sysdev.id].owner);
387
388 return curr - buf;
389}
390
391static SYSDEV_ATTR(status, 0600, sysfs_test_status, NULL);
392static SYSDEV_ATTR(command, 0600, NULL, sysfs_test_command);
393
394static struct sysdev_class rttest_sysclass = {
395 set_kset_name("rttest"),
396};
397
398static int init_test_thread(int id)
399{
400 thread_data[id].sysdev.cls = &rttest_sysclass;
401 thread_data[id].sysdev.id = id;
402
403 threads[id] = kthread_run(test_func, &thread_data[id], "rt-test-%d", id);
404 if (IS_ERR(threads[id]))
405 return PTR_ERR(threads[id]);
406
407 return sysdev_register(&thread_data[id].sysdev);
408}
409
410static int init_rttest(void)
411{
412 int ret, i;
413
414 spin_lock_init(&rttest_lock);
415
416 for (i = 0; i < MAX_RT_TEST_MUTEXES; i++)
417 rt_mutex_init(&mutexes[i]);
418
419 ret = sysdev_class_register(&rttest_sysclass);
420 if (ret)
421 return ret;
422
423 for (i = 0; i < MAX_RT_TEST_THREADS; i++) {
424 ret = init_test_thread(i);
425 if (ret)
426 break;
427 ret = sysdev_create_file(&thread_data[i].sysdev, &attr_status);
428 if (ret)
429 break;
430 ret = sysdev_create_file(&thread_data[i].sysdev, &attr_command);
431 if (ret)
432 break;
433 }
434
435 printk("Initializing RT-Tester: %s\n", ret ? "Failed" : "OK" );
436
437 return ret;
438}
439
440device_initcall(init_rttest);
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
new file mode 100644
index 000000000000..45d61016da57
--- /dev/null
+++ b/kernel/rtmutex.c
@@ -0,0 +1,990 @@
1/*
2 * RT-Mutexes: simple blocking mutual exclusion locks with PI support
3 *
4 * started by Ingo Molnar and Thomas Gleixner.
5 *
6 * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
7 * Copyright (C) 2005-2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
8 * Copyright (C) 2005 Kihon Technologies Inc., Steven Rostedt
9 * Copyright (C) 2006 Esben Nielsen
10 */
11#include <linux/spinlock.h>
12#include <linux/module.h>
13#include <linux/sched.h>
14#include <linux/timer.h>
15
16#include "rtmutex_common.h"
17
18#ifdef CONFIG_DEBUG_RT_MUTEXES
19# include "rtmutex-debug.h"
20#else
21# include "rtmutex.h"
22#endif
23
24/*
25 * lock->owner state tracking:
26 *
27 * lock->owner holds the task_struct pointer of the owner. Bit 0 and 1
28 * are used to keep track of the "owner is pending" and "lock has
29 * waiters" state.
30 *
31 * owner bit1 bit0
32 * NULL 0 0 lock is free (fast acquire possible)
33 * NULL 0 1 invalid state
34 * NULL 1 0 Transitional State*
35 * NULL 1 1 invalid state
36 * taskpointer 0 0 lock is held (fast release possible)
37 * taskpointer 0 1 task is pending owner
38 * taskpointer 1 0 lock is held and has waiters
39 * taskpointer 1 1 task is pending owner and lock has more waiters
40 *
41 * Pending ownership is assigned to the top (highest priority)
42 * waiter of the lock, when the lock is released. The thread is woken
43 * up and can now take the lock. Until the lock is taken (bit 0
44 * cleared) a competing higher priority thread can steal the lock
45 * which puts the woken up thread back on the waiters list.
46 *
47 * The fast atomic compare exchange based acquire and release is only
48 * possible when bit 0 and 1 of lock->owner are 0.
49 *
50 * (*) There is a small window where the owner can be NULL and the
51 * "lock has waiters" bit is set. This can happen when grabbing the lock.
52 * To prevent a cmpxchg of the owner releasing the lock, we need to set this
53 * bit before looking at the lock, hence the reason this is a transitional
54 * state.
55 */
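/*
 * Aside (not part of the patch): the state table above in miniature.
 * This user-space sketch packs a task pointer together with the two low
 * state bits and unpacks it the way rt_mutex_owner() does; the mask
 * values mirror RT_MUTEX_OWNER_PENDING (1UL), RT_MUTEX_HAS_WAITERS (2UL)
 * and RT_MUTEX_OWNER_MASKALL (3UL) from rtmutex_common.h, and it assumes
 * task pointers are at least 4-byte aligned so those bits are free.
 */
#include <stdio.h>
#include <stdint.h>

#define TOY_OWNER_PENDING	1UL
#define TOY_HAS_WAITERS		2UL
#define TOY_OWNER_MASKALL	3UL

struct toy_task { int pid; };

int main(void)
{
	static struct toy_task t = { .pid = 42 };

	/* encode: owner pointer plus "pending owner" and "has waiters" bits */
	uintptr_t word = (uintptr_t)&t | TOY_OWNER_PENDING | TOY_HAS_WAITERS;

	/* decode, as rt_mutex_owner() does: mask off both state bits */
	struct toy_task *owner =
		(struct toy_task *)(word & ~(uintptr_t)TOY_OWNER_MASKALL);

	printf("owner pid=%d pending=%lu waiters=%lu\n", owner->pid,
	       (unsigned long)(word & TOY_OWNER_PENDING),
	       (unsigned long)((word & TOY_HAS_WAITERS) >> 1));
	return 0;
}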
56
57static void
58rt_mutex_set_owner(struct rt_mutex *lock, struct task_struct *owner,
59 unsigned long mask)
60{
61 unsigned long val = (unsigned long)owner | mask;
62
63 if (rt_mutex_has_waiters(lock))
64 val |= RT_MUTEX_HAS_WAITERS;
65
66 lock->owner = (struct task_struct *)val;
67}
68
69static inline void clear_rt_mutex_waiters(struct rt_mutex *lock)
70{
71 lock->owner = (struct task_struct *)
72 ((unsigned long)lock->owner & ~RT_MUTEX_HAS_WAITERS);
73}
74
75static void fixup_rt_mutex_waiters(struct rt_mutex *lock)
76{
77 if (!rt_mutex_has_waiters(lock))
78 clear_rt_mutex_waiters(lock);
79}
80
81/*
82 * We can speed up the acquire/release, if the architecture
83 * supports cmpxchg and if there's no debugging state to be set up
84 */
85#if defined(__HAVE_ARCH_CMPXCHG) && !defined(CONFIG_DEBUG_RT_MUTEXES)
86# define rt_mutex_cmpxchg(l,c,n) (cmpxchg(&l->owner, c, n) == c)
87static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
88{
89 unsigned long owner, *p = (unsigned long *) &lock->owner;
90
91 do {
92 owner = *p;
93 } while (cmpxchg(p, owner, owner | RT_MUTEX_HAS_WAITERS) != owner);
94}
95#else
96# define rt_mutex_cmpxchg(l,c,n) (0)
97static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
98{
99 lock->owner = (struct task_struct *)
100 ((unsigned long)lock->owner | RT_MUTEX_HAS_WAITERS);
101}
102#endif
103
104/*
105 * Calculate task priority from the waiter list priority
106 *
107 * Return task->normal_prio when the waiter list is empty or when
108 * the waiter is not allowed to do priority boosting
109 */
110int rt_mutex_getprio(struct task_struct *task)
111{
112 if (likely(!task_has_pi_waiters(task)))
113 return task->normal_prio;
114
115 return min(task_top_pi_waiter(task)->pi_list_entry.prio,
116 task->normal_prio);
117}
118
119/*
120 * Adjust the priority of a task, after its pi_waiters got modified.
121 *
122 * This can be both boosting and unboosting. task->pi_lock must be held.
123 */
124static void __rt_mutex_adjust_prio(struct task_struct *task)
125{
126 int prio = rt_mutex_getprio(task);
127
128 if (task->prio != prio)
129 rt_mutex_setprio(task, prio);
130}
131
132/*
133 * Adjust task priority (undo boosting). Called from the exit path of
134 * rt_mutex_slowunlock() and rt_mutex_slowlock().
135 *
136 * (Note: We do this outside of the protection of lock->wait_lock to
137 * allow the lock to be taken while or before we readjust the priority
138 * of task. We do not use the spin_xx_mutex() variants here as we are
139 * outside of the debug path.)
140 */
141static void rt_mutex_adjust_prio(struct task_struct *task)
142{
143 unsigned long flags;
144
145 spin_lock_irqsave(&task->pi_lock, flags);
146 __rt_mutex_adjust_prio(task);
147 spin_unlock_irqrestore(&task->pi_lock, flags);
148}
149
150/*
151 * Max number of times we'll walk the boosting chain:
152 */
153int max_lock_depth = 1024;
154
155/*
156 * Adjust the priority chain. Also used for deadlock detection.
157 * Decreases the task's usage count by one - may thus free the task.
158 * Returns 0 or -EDEADLK.
159 */
160static int rt_mutex_adjust_prio_chain(task_t *task,
161 int deadlock_detect,
162 struct rt_mutex *orig_lock,
163 struct rt_mutex_waiter *orig_waiter,
164 struct task_struct *top_task
165 __IP_DECL__)
166{
167 struct rt_mutex *lock;
168 struct rt_mutex_waiter *waiter, *top_waiter = orig_waiter;
169 int detect_deadlock, ret = 0, depth = 0;
170 unsigned long flags;
171
172 detect_deadlock = debug_rt_mutex_detect_deadlock(orig_waiter,
173 deadlock_detect);
174
175 /*
176 * The (de)boosting is a step by step approach with a lot of
177 * pitfalls. We want this to be preemptible and we want to hold a
178 * maximum of two locks per step. So we have to check
179 * carefully whether things change under us.
180 */
181 again:
182 if (++depth > max_lock_depth) {
183 static int prev_max;
184
185 /*
186 * Print this only once. If the admin changes the limit,
187 * print a new message when reaching the limit again.
188 */
189 if (prev_max != max_lock_depth) {
190 prev_max = max_lock_depth;
191 printk(KERN_WARNING "Maximum lock depth %d reached "
192 "task: %s (%d)\n", max_lock_depth,
193 top_task->comm, top_task->pid);
194 }
195 put_task_struct(task);
196
197 return deadlock_detect ? -EDEADLK : 0;
198 }
199 retry:
200 /*
201 * The task cannot go away as we did a get_task_struct() before!
202 */
203 spin_lock_irqsave(&task->pi_lock, flags);
204
205 waiter = task->pi_blocked_on;
206 /*
207 * Check whether the end of the boosting chain has been
208 * reached or the state of the chain has changed while we
209 * dropped the locks.
210 */
211 if (!waiter || !waiter->task)
212 goto out_unlock_pi;
213
214 if (top_waiter && (!task_has_pi_waiters(task) ||
215 top_waiter != task_top_pi_waiter(task)))
216 goto out_unlock_pi;
217
218 /*
219 * When deadlock detection is off, we check whether further
220 * priority adjustment is necessary.
221 */
222 if (!detect_deadlock && waiter->list_entry.prio == task->prio)
223 goto out_unlock_pi;
224
225 lock = waiter->lock;
226 if (!spin_trylock(&lock->wait_lock)) {
227 spin_unlock_irqrestore(&task->pi_lock, flags);
228 cpu_relax();
229 goto retry;
230 }
231
232 /* Deadlock detection */
233 if (lock == orig_lock || rt_mutex_owner(lock) == top_task) {
234 debug_rt_mutex_deadlock(deadlock_detect, orig_waiter, lock);
235 spin_unlock(&lock->wait_lock);
236 ret = deadlock_detect ? -EDEADLK : 0;
237 goto out_unlock_pi;
238 }
239
240 top_waiter = rt_mutex_top_waiter(lock);
241
242 /* Requeue the waiter */
243 plist_del(&waiter->list_entry, &lock->wait_list);
244 waiter->list_entry.prio = task->prio;
245 plist_add(&waiter->list_entry, &lock->wait_list);
246
247 /* Release the task */
248 spin_unlock_irqrestore(&task->pi_lock, flags);
249 put_task_struct(task);
250
251 /* Grab the next task */
252 task = rt_mutex_owner(lock);
253 spin_lock_irqsave(&task->pi_lock, flags);
254
255 if (waiter == rt_mutex_top_waiter(lock)) {
256 /* Boost the owner */
257 plist_del(&top_waiter->pi_list_entry, &task->pi_waiters);
258 waiter->pi_list_entry.prio = waiter->list_entry.prio;
259 plist_add(&waiter->pi_list_entry, &task->pi_waiters);
260 __rt_mutex_adjust_prio(task);
261
262 } else if (top_waiter == waiter) {
263 /* Deboost the owner */
264 plist_del(&waiter->pi_list_entry, &task->pi_waiters);
265 waiter = rt_mutex_top_waiter(lock);
266 waiter->pi_list_entry.prio = waiter->list_entry.prio;
267 plist_add(&waiter->pi_list_entry, &task->pi_waiters);
268 __rt_mutex_adjust_prio(task);
269 }
270
271 get_task_struct(task);
272 spin_unlock_irqrestore(&task->pi_lock, flags);
273
274 top_waiter = rt_mutex_top_waiter(lock);
275 spin_unlock(&lock->wait_lock);
276
277 if (!detect_deadlock && waiter != top_waiter)
278 goto out_put_task;
279
280 goto again;
281
282 out_unlock_pi:
283 spin_unlock_irqrestore(&task->pi_lock, flags);
284 out_put_task:
285 put_task_struct(task);
286 return ret;
287}
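/*
 * Aside (not part of the patch): the chain walk above, stripped of the
 * plists, pi_lock/wait_lock juggling and deadlock detection.  Names are
 * made up.  Task A (prio 10) blocks on l1 owned by B (prio 20), which is
 * already blocked on l2 owned by C (prio 30); lower numbers mean higher
 * priority, so A's priority is propagated first to B and then to C.
 */
#include <stdio.h>

struct toy_lock;

struct toy_task {
	const char *name;
	int prio;
	struct toy_lock *blocked_on;
};

struct toy_lock {
	struct toy_task *owner;
	int top_waiter_prio;		/* best priority among the waiters */
};

static void toy_chain_walk(struct toy_task *waiter, struct toy_lock *lock)
{
	while (lock) {
		struct toy_task *owner = lock->owner;

		if (waiter->prio < lock->top_waiter_prio)
			lock->top_waiter_prio = waiter->prio;
		if (lock->top_waiter_prio < owner->prio)
			owner->prio = lock->top_waiter_prio;	/* boost */
		printf("%s boosted to %d\n", owner->name, owner->prio);

		waiter = owner;			/* continue with the owner ... */
		lock = owner->blocked_on;	/* ... and the lock it waits on */
	}
}

int main(void)
{
	struct toy_lock l1 = { .top_waiter_prio = 99 };
	struct toy_lock l2 = { .top_waiter_prio = 99 };
	struct toy_task c = { "C", 30, NULL };
	struct toy_task b = { "B", 20, &l2 };
	struct toy_task a = { "A", 10, &l1 };

	l1.owner = &b;
	l2.owner = &c;

	toy_chain_walk(&a, a.blocked_on);	/* B boosted to 10, C boosted to 10 */
	return 0;
}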
288
289/*
290 * Optimization: check if we can steal the lock from the
291 * assigned pending owner [which might not have taken the
292 * lock yet]:
293 */
294static inline int try_to_steal_lock(struct rt_mutex *lock)
295{
296 struct task_struct *pendowner = rt_mutex_owner(lock);
297 struct rt_mutex_waiter *next;
298 unsigned long flags;
299
300 if (!rt_mutex_owner_pending(lock))
301 return 0;
302
303 if (pendowner == current)
304 return 1;
305
306 spin_lock_irqsave(&pendowner->pi_lock, flags);
307 if (current->prio >= pendowner->prio) {
308 spin_unlock_irqrestore(&pendowner->pi_lock, flags);
309 return 0;
310 }
311
312 /*
313 * Check if a waiter is enqueued on the pending owner's
314 * pi_waiters list. Remove it and readjust the pending owner's
315 * priority.
316 */
317 if (likely(!rt_mutex_has_waiters(lock))) {
318 spin_unlock_irqrestore(&pendowner->pi_lock, flags);
319 return 1;
320 }
321
322 /* No chain handling, pending owner is not blocked on anything: */
323 next = rt_mutex_top_waiter(lock);
324 plist_del(&next->pi_list_entry, &pendowner->pi_waiters);
325 __rt_mutex_adjust_prio(pendowner);
326 spin_unlock_irqrestore(&pendowner->pi_lock, flags);
327
328 /*
329 * We are going to steal the lock and a waiter was
330 * enqueued on the pending owner's pi_waiters queue, so
331 * we have to enqueue this waiter into the
332 * current->pi_waiters list. This covers the case
333 * where current is boosted because it holds another
334 * lock and gets unboosted because the booster is
335 * interrupted; otherwise we would delay a waiter with a
336 * priority higher than current->normal_prio.
337 *
338 * Note: in the rare case of a SCHED_OTHER task changing
339 * its priority and thus stealing the lock, next->task
340 * might be current:
341 */
342 if (likely(next->task != current)) {
343 spin_lock_irqsave(&current->pi_lock, flags);
344 plist_add(&next->pi_list_entry, &current->pi_waiters);
345 __rt_mutex_adjust_prio(current);
346 spin_unlock_irqrestore(&current->pi_lock, flags);
347 }
348 return 1;
349}
350
351/*
352 * Try to take an rt-mutex
353 *
354 * This fails
355 * - when the lock has a real owner
356 * - when a different pending owner exists and has higher priority than current
357 *
358 * Must be called with lock->wait_lock held.
359 */
360static int try_to_take_rt_mutex(struct rt_mutex *lock __IP_DECL__)
361{
362 /*
363 * We have to be careful here if the atomic speedups are
364 * enabled, such that, when
365 * - no other waiter is on the lock
366 * - the lock has been released since we did the cmpxchg
367 * the lock can be released or taken while we are doing the
368 * checks and marking the lock with RT_MUTEX_HAS_WAITERS.
369 *
370 * The atomic acquire/release aware variant of
371 * mark_rt_mutex_waiters uses a cmpxchg loop. After setting
372 * the WAITERS bit, the atomic release / acquire can not
373 * happen anymore and lock->wait_lock protects us from the
374 * non-atomic case.
375 *
376 * Note that this might set lock->owner =
377 * RT_MUTEX_HAS_WAITERS even though the lock is not contended
378 * any more. This is fixed up when we take ownership.
379 * This is the transitional state explained at the top of this file.
380 */
381 mark_rt_mutex_waiters(lock);
382
383 if (rt_mutex_owner(lock) && !try_to_steal_lock(lock))
384 return 0;
385
386 /* We got the lock. */
387 debug_rt_mutex_lock(lock __IP__);
388
389 rt_mutex_set_owner(lock, current, 0);
390
391 rt_mutex_deadlock_account_lock(lock, current);
392
393 return 1;
394}
395
396/*
397 * Task blocks on lock.
398 *
399 * Prepare waiter and propagate pi chain
400 *
401 * This must be called with lock->wait_lock held.
402 */
403static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
404 struct rt_mutex_waiter *waiter,
405 int detect_deadlock
406 __IP_DECL__)
407{
408 struct rt_mutex_waiter *top_waiter = waiter;
409 task_t *owner = rt_mutex_owner(lock);
410 int boost = 0, res;
411 unsigned long flags;
412
413 spin_lock_irqsave(&current->pi_lock, flags);
414 __rt_mutex_adjust_prio(current);
415 waiter->task = current;
416 waiter->lock = lock;
417 plist_node_init(&waiter->list_entry, current->prio);
418 plist_node_init(&waiter->pi_list_entry, current->prio);
419
420 /* Get the top priority waiter on the lock */
421 if (rt_mutex_has_waiters(lock))
422 top_waiter = rt_mutex_top_waiter(lock);
423 plist_add(&waiter->list_entry, &lock->wait_list);
424
425 current->pi_blocked_on = waiter;
426
427 spin_unlock_irqrestore(&current->pi_lock, flags);
428
429 if (waiter == rt_mutex_top_waiter(lock)) {
430 spin_lock_irqsave(&owner->pi_lock, flags);
431 plist_del(&top_waiter->pi_list_entry, &owner->pi_waiters);
432 plist_add(&waiter->pi_list_entry, &owner->pi_waiters);
433
434 __rt_mutex_adjust_prio(owner);
435 if (owner->pi_blocked_on) {
436 boost = 1;
437 /* gets dropped in rt_mutex_adjust_prio_chain()! */
438 get_task_struct(owner);
439 }
440 spin_unlock_irqrestore(&owner->pi_lock, flags);
441 }
442 else if (debug_rt_mutex_detect_deadlock(waiter, detect_deadlock)) {
443 spin_lock_irqsave(&owner->pi_lock, flags);
444 if (owner->pi_blocked_on) {
445 boost = 1;
446 /* gets dropped in rt_mutex_adjust_prio_chain()! */
447 get_task_struct(owner);
448 }
449 spin_unlock_irqrestore(&owner->pi_lock, flags);
450 }
451 if (!boost)
452 return 0;
453
454 spin_unlock(&lock->wait_lock);
455
456 res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock, waiter,
457 current __IP__);
458
459 spin_lock(&lock->wait_lock);
460
461 return res;
462}
463
464/*
465 * Wake up the next waiter on the lock.
466 *
467 * Remove the top waiter from the current task's waiter list and from
468 * the lock waiter list. Set it as pending owner. Then wake it up.
469 *
470 * Called with lock->wait_lock held.
471 */
472static void wakeup_next_waiter(struct rt_mutex *lock)
473{
474 struct rt_mutex_waiter *waiter;
475 struct task_struct *pendowner;
476 unsigned long flags;
477
478 spin_lock_irqsave(&current->pi_lock, flags);
479
480 waiter = rt_mutex_top_waiter(lock);
481 plist_del(&waiter->list_entry, &lock->wait_list);
482
483 /*
484 * Remove it from current->pi_waiters. We do not adjust a
485 * possible priority boost right now. We execute wakeup in the
486 * boosted mode and go back to normal after releasing
487 * lock->wait_lock.
488 */
489 plist_del(&waiter->pi_list_entry, &current->pi_waiters);
490 pendowner = waiter->task;
491 waiter->task = NULL;
492
493 rt_mutex_set_owner(lock, pendowner, RT_MUTEX_OWNER_PENDING);
494
495 spin_unlock_irqrestore(&current->pi_lock, flags);
496
497 /*
498 * Clear the pi_blocked_on variable and enqueue a possible
499 * waiter into the pi_waiters list of the pending owner. This
500 * prevents a waiter with a priority higher than
501 * pending-owner->normal_prio from being left blocked on the
502 * unboosted (pending) owner in case the pending owner gets unboosted.
503 */
504 spin_lock_irqsave(&pendowner->pi_lock, flags);
505
506 WARN_ON(!pendowner->pi_blocked_on);
507 WARN_ON(pendowner->pi_blocked_on != waiter);
508 WARN_ON(pendowner->pi_blocked_on->lock != lock);
509
510 pendowner->pi_blocked_on = NULL;
511
512 if (rt_mutex_has_waiters(lock)) {
513 struct rt_mutex_waiter *next;
514
515 next = rt_mutex_top_waiter(lock);
516 plist_add(&next->pi_list_entry, &pendowner->pi_waiters);
517 }
518 spin_unlock_irqrestore(&pendowner->pi_lock, flags);
519
520 wake_up_process(pendowner);
521}
522
523/*
524 * Remove a waiter from a lock
525 *
526 * Must be called with lock->wait_lock held
527 */
528static void remove_waiter(struct rt_mutex *lock,
529 struct rt_mutex_waiter *waiter __IP_DECL__)
530{
531 int first = (waiter == rt_mutex_top_waiter(lock));
532 int boost = 0;
533 task_t *owner = rt_mutex_owner(lock);
534 unsigned long flags;
535
536 spin_lock_irqsave(&current->pi_lock, flags);
537 plist_del(&waiter->list_entry, &lock->wait_list);
538 waiter->task = NULL;
539 current->pi_blocked_on = NULL;
540 spin_unlock_irqrestore(&current->pi_lock, flags);
541
542 if (first && owner != current) {
543
544 spin_lock_irqsave(&owner->pi_lock, flags);
545
546 plist_del(&waiter->pi_list_entry, &owner->pi_waiters);
547
548 if (rt_mutex_has_waiters(lock)) {
549 struct rt_mutex_waiter *next;
550
551 next = rt_mutex_top_waiter(lock);
552 plist_add(&next->pi_list_entry, &owner->pi_waiters);
553 }
554 __rt_mutex_adjust_prio(owner);
555
556 if (owner->pi_blocked_on) {
557 boost = 1;
558 /* gets dropped in rt_mutex_adjust_prio_chain()! */
559 get_task_struct(owner);
560 }
561 spin_unlock_irqrestore(&owner->pi_lock, flags);
562 }
563
564 WARN_ON(!plist_node_empty(&waiter->pi_list_entry));
565
566 if (!boost)
567 return;
568
569 spin_unlock(&lock->wait_lock);
570
571 rt_mutex_adjust_prio_chain(owner, 0, lock, NULL, current __IP__);
572
573 spin_lock(&lock->wait_lock);
574}
575
576/*
577 * Recheck the pi chain, in case we got a priority setting
578 *
579 * Called from sched_setscheduler
580 */
581void rt_mutex_adjust_pi(struct task_struct *task)
582{
583 struct rt_mutex_waiter *waiter;
584 unsigned long flags;
585
586 spin_lock_irqsave(&task->pi_lock, flags);
587
588 waiter = task->pi_blocked_on;
589 if (!waiter || waiter->list_entry.prio == task->prio) {
590 spin_unlock_irqrestore(&task->pi_lock, flags);
591 return;
592 }
593
594 /* gets dropped in rt_mutex_adjust_prio_chain()! */
595 get_task_struct(task);
596 spin_unlock_irqrestore(&task->pi_lock, flags);
597
598 rt_mutex_adjust_prio_chain(task, 0, NULL, NULL, task __RET_IP__);
599}
600
601/*
602 * Slow path lock function:
603 */
604static int __sched
605rt_mutex_slowlock(struct rt_mutex *lock, int state,
606 struct hrtimer_sleeper *timeout,
607 int detect_deadlock __IP_DECL__)
608{
609 struct rt_mutex_waiter waiter;
610 int ret = 0;
611
612 debug_rt_mutex_init_waiter(&waiter);
613 waiter.task = NULL;
614
615 spin_lock(&lock->wait_lock);
616
617 /* Try to acquire the lock again: */
618 if (try_to_take_rt_mutex(lock __IP__)) {
619 spin_unlock(&lock->wait_lock);
620 return 0;
621 }
622
623 set_current_state(state);
624
625 /* Setup the timer, when timeout != NULL */
626 if (unlikely(timeout))
627 hrtimer_start(&timeout->timer, timeout->timer.expires,
628 HRTIMER_ABS);
629
630 for (;;) {
631 /* Try to acquire the lock: */
632 if (try_to_take_rt_mutex(lock __IP__))
633 break;
634
635 /*
636 * TASK_INTERRUPTIBLE checks for signals and
637 * timeout. Ignored otherwise.
638 */
639 if (unlikely(state == TASK_INTERRUPTIBLE)) {
640 /* Signal pending? */
641 if (signal_pending(current))
642 ret = -EINTR;
643 if (timeout && !timeout->task)
644 ret = -ETIMEDOUT;
645 if (ret)
646 break;
647 }
648
649 /*
650 * waiter.task is NULL the first time we come here and
651 * when we have been woken up by the previous owner
652 * but the lock got stolen by a higher prio task.
653 */
654 if (!waiter.task) {
655 ret = task_blocks_on_rt_mutex(lock, &waiter,
656 detect_deadlock __IP__);
657 /*
658 * If we got woken up by the owner then start loop
659 * all over without going into schedule to try
660 * to get the lock now:
661 */
662 if (unlikely(!waiter.task))
663 continue;
664
665 if (unlikely(ret))
666 break;
667 }
668
669 spin_unlock(&lock->wait_lock);
670
671 debug_rt_mutex_print_deadlock(&waiter);
672
673 if (waiter.task)
674 schedule_rt_mutex(lock);
675
676 spin_lock(&lock->wait_lock);
677 set_current_state(state);
678 }
679
680 set_current_state(TASK_RUNNING);
681
682 if (unlikely(waiter.task))
683 remove_waiter(lock, &waiter __IP__);
684
685 /*
686 * try_to_take_rt_mutex() sets the waiter bit
687 * unconditionally. We might have to fix that up.
688 */
689 fixup_rt_mutex_waiters(lock);
690
691 spin_unlock(&lock->wait_lock);
692
693 /* Remove pending timer: */
694 if (unlikely(timeout))
695 hrtimer_cancel(&timeout->timer);
696
697 /*
698 * Readjust the priority if we did not get the lock. We might
699 * have been the pending owner and boosted. Since we did not
700 * take the lock, the PI boost has to go.
701 */
702 if (unlikely(ret))
703 rt_mutex_adjust_prio(current);
704
705 debug_rt_mutex_free_waiter(&waiter);
706
707 return ret;
708}
709
710/*
711 * Slow path try-lock function:
712 */
713static inline int
714rt_mutex_slowtrylock(struct rt_mutex *lock __IP_DECL__)
715{
716 int ret = 0;
717
718 spin_lock(&lock->wait_lock);
719
720 if (likely(rt_mutex_owner(lock) != current)) {
721
722 ret = try_to_take_rt_mutex(lock __IP__);
723 /*
724 * try_to_take_rt_mutex() sets the lock waiters
725 * bit unconditionally. Clean this up.
726 */
727 fixup_rt_mutex_waiters(lock);
728 }
729
730 spin_unlock(&lock->wait_lock);
731
732 return ret;
733}
734
735/*
736 * Slow path to release a rt-mutex:
737 */
738static void __sched
739rt_mutex_slowunlock(struct rt_mutex *lock)
740{
741 spin_lock(&lock->wait_lock);
742
743 debug_rt_mutex_unlock(lock);
744
745 rt_mutex_deadlock_account_unlock(current);
746
747 if (!rt_mutex_has_waiters(lock)) {
748 lock->owner = NULL;
749 spin_unlock(&lock->wait_lock);
750 return;
751 }
752
753 wakeup_next_waiter(lock);
754
755 spin_unlock(&lock->wait_lock);
756
757 /* Undo pi boosting if necessary: */
758 rt_mutex_adjust_prio(current);
759}
760
761/*
762 * debug-aware fast / slow path lock, trylock and unlock
763 *
764 * The atomic acquire/release ops are compiled away when either the
765 * architecture does not support cmpxchg or debugging is enabled.
766 */
767static inline int
768rt_mutex_fastlock(struct rt_mutex *lock, int state,
769 int detect_deadlock,
770 int (*slowfn)(struct rt_mutex *lock, int state,
771 struct hrtimer_sleeper *timeout,
772 int detect_deadlock __IP_DECL__))
773{
774 if (!detect_deadlock && likely(rt_mutex_cmpxchg(lock, NULL, current))) {
775 rt_mutex_deadlock_account_lock(lock, current);
776 return 0;
777 } else
778 return slowfn(lock, state, NULL, detect_deadlock __RET_IP__);
779}
780
781static inline int
782rt_mutex_timed_fastlock(struct rt_mutex *lock, int state,
783 struct hrtimer_sleeper *timeout, int detect_deadlock,
784 int (*slowfn)(struct rt_mutex *lock, int state,
785 struct hrtimer_sleeper *timeout,
786 int detect_deadlock __IP_DECL__))
787{
788 if (!detect_deadlock && likely(rt_mutex_cmpxchg(lock, NULL, current))) {
789 rt_mutex_deadlock_account_lock(lock, current);
790 return 0;
791 } else
792 return slowfn(lock, state, timeout, detect_deadlock __RET_IP__);
793}
794
795static inline int
796rt_mutex_fasttrylock(struct rt_mutex *lock,
797 int (*slowfn)(struct rt_mutex *lock __IP_DECL__))
798{
799 if (likely(rt_mutex_cmpxchg(lock, NULL, current))) {
800 rt_mutex_deadlock_account_lock(lock, current);
801 return 1;
802 }
803 return slowfn(lock __RET_IP__);
804}
805
806static inline void
807rt_mutex_fastunlock(struct rt_mutex *lock,
808 void (*slowfn)(struct rt_mutex *lock))
809{
810 if (likely(rt_mutex_cmpxchg(lock, current, NULL)))
811 rt_mutex_deadlock_account_unlock(current);
812 else
813 slowfn(lock);
814}
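/*
 * Aside (not part of the patch): the fast/slow split above, boiled down
 * to a user-space sketch with C11 atomics.  The uncontended case is one
 * compare-and-swap of the owner word from NULL to the caller; everything
 * else drops into a slow path, which here just spins instead of queueing,
 * boosting and sleeping like rt_mutex_slowlock() does.
 */
#include <stdatomic.h>
#include <stdio.h>

static _Atomic(void *) toy_owner;	/* NULL means the lock is free */

static void toy_slowlock(void *me)
{
	void *expected = NULL;

	/* stand-in for enqueueing a waiter, walking the PI chain, sleeping */
	while (!atomic_compare_exchange_weak(&toy_owner, &expected, me))
		expected = NULL;
}

static void toy_lock(void *me)
{
	void *expected = NULL;

	if (atomic_compare_exchange_strong(&toy_owner, &expected, me))
		return;			/* fast path: got it uncontended */
	toy_slowlock(me);		/* contended: slow path */
}

static void toy_unlock(void *me)
{
	void *expected = me;

	/* fast path: no waiters recorded, just clear the owner word */
	if (atomic_compare_exchange_strong(&toy_owner, &expected, NULL))
		return;
	/* the real slow path would wake the top waiter here */
}

int main(void)
{
	int me;

	toy_lock(&me);
	printf("owner=%p me=%p\n", atomic_load(&toy_owner), (void *)&me);
	toy_unlock(&me);
	return 0;
}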
815
816/**
817 * rt_mutex_lock - lock a rt_mutex
818 *
819 * @lock: the rt_mutex to be locked
820 */
821void __sched rt_mutex_lock(struct rt_mutex *lock)
822{
823 might_sleep();
824
825 rt_mutex_fastlock(lock, TASK_UNINTERRUPTIBLE, 0, rt_mutex_slowlock);
826}
827EXPORT_SYMBOL_GPL(rt_mutex_lock);
828
829/**
830 * rt_mutex_lock_interruptible - lock a rt_mutex interruptible
831 *
832 * @lock: the rt_mutex to be locked
833 * @detect_deadlock: deadlock detection on/off
834 *
835 * Returns:
836 * 0 on success
837 * -EINTR when interrupted by a signal
838 * -EDEADLK when the lock would deadlock (when deadlock detection is on)
839 */
840int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock,
841 int detect_deadlock)
842{
843 might_sleep();
844
845 return rt_mutex_fastlock(lock, TASK_INTERRUPTIBLE,
846 detect_deadlock, rt_mutex_slowlock);
847}
848EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible);
849
850/**
851 * rt_mutex_timed_lock - lock a rt_mutex interruptibly, with a timeout;
852 * the timeout structure is provided
853 * by the caller
854 *
855 * @lock: the rt_mutex to be locked
856 * @timeout: timeout structure or NULL (no timeout)
857 * @detect_deadlock: deadlock detection on/off
858 *
859 * Returns:
860 * 0 on success
861 * -EINTR when interrupted by a signal
862 * -ETIMEDOUT when the timeout expired
863 * -EDEADLK when the lock would deadlock (when deadlock detection is on)
864 */
865int
866rt_mutex_timed_lock(struct rt_mutex *lock, struct hrtimer_sleeper *timeout,
867 int detect_deadlock)
868{
869 might_sleep();
870
871 return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout,
872 detect_deadlock, rt_mutex_slowlock);
873}
874EXPORT_SYMBOL_GPL(rt_mutex_timed_lock);
875
876/**
877 * rt_mutex_trylock - try to lock a rt_mutex
878 *
879 * @lock: the rt_mutex to be locked
880 *
881 * Returns 1 on success and 0 on contention
882 */
883int __sched rt_mutex_trylock(struct rt_mutex *lock)
884{
885 return rt_mutex_fasttrylock(lock, rt_mutex_slowtrylock);
886}
887EXPORT_SYMBOL_GPL(rt_mutex_trylock);
888
889/**
890 * rt_mutex_unlock - unlock a rt_mutex
891 *
892 * @lock: the rt_mutex to be unlocked
893 */
894void __sched rt_mutex_unlock(struct rt_mutex *lock)
895{
896 rt_mutex_fastunlock(lock, rt_mutex_slowunlock);
897}
898EXPORT_SYMBOL_GPL(rt_mutex_unlock);
899
900/***
901 * rt_mutex_destroy - mark a mutex unusable
902 * @lock: the mutex to be destroyed
903 *
904 * This function marks the mutex uninitialized, and any subsequent
905 * use of the mutex is forbidden. The mutex must not be locked when
906 * this function is called.
907 */
908void rt_mutex_destroy(struct rt_mutex *lock)
909{
910 WARN_ON(rt_mutex_is_locked(lock));
911#ifdef CONFIG_DEBUG_RT_MUTEXES
912 lock->magic = NULL;
913#endif
914}
915
916EXPORT_SYMBOL_GPL(rt_mutex_destroy);
917
918/**
919 * __rt_mutex_init - initialize the rt lock
920 *
921 * @lock: the rt lock to be initialized
922 *
923 * Initialize the rt lock to unlocked state.
924 *
925 * Initializing a locked rt lock is not allowed
926 */
927void __rt_mutex_init(struct rt_mutex *lock, const char *name)
928{
929 lock->owner = NULL;
930 spin_lock_init(&lock->wait_lock);
931 plist_head_init(&lock->wait_list, &lock->wait_lock);
932
933 debug_rt_mutex_init(lock, name);
934}
935EXPORT_SYMBOL_GPL(__rt_mutex_init);
936
937/**
938 * rt_mutex_init_proxy_locked - initialize and lock a rt_mutex on behalf of a
939 * proxy owner
940 *
941 * @lock: the rt_mutex to be locked
942 * @proxy_owner: the task to set as owner
943 *
944 * No locking. The caller has to do the serializing itself
945 * Special API call for PI-futex support
946 */
947void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
948 struct task_struct *proxy_owner)
949{
950 __rt_mutex_init(lock, NULL);
951 debug_rt_mutex_proxy_lock(lock, proxy_owner __RET_IP__);
952 rt_mutex_set_owner(lock, proxy_owner, 0);
953 rt_mutex_deadlock_account_lock(lock, proxy_owner);
954}
955
956/**
957 * rt_mutex_proxy_unlock - release a lock on behalf of owner
958 *
959 * @lock: the rt_mutex to be unlocked
960 *
961 * No locking. The caller has to do the serializing itself
962 * Special API call for PI-futex support
963 */
964void rt_mutex_proxy_unlock(struct rt_mutex *lock,
965 struct task_struct *proxy_owner)
966{
967 debug_rt_mutex_proxy_unlock(lock);
968 rt_mutex_set_owner(lock, NULL, 0);
969 rt_mutex_deadlock_account_unlock(proxy_owner);
970}
971
972/**
973 * rt_mutex_next_owner - return the next owner of the lock
974 *
975 * @lock: the rt lock to query
976 *
977 * Returns the next owner of the lock or NULL
978 *
979 * Caller has to serialize against other accessors to the lock
980 * itself.
981 *
982 * Special API call for PI-futex support
983 */
984struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock)
985{
986 if (!rt_mutex_has_waiters(lock))
987 return NULL;
988
989 return rt_mutex_top_waiter(lock)->task;
990}
diff --git a/kernel/rtmutex.h b/kernel/rtmutex.h
new file mode 100644
index 000000000000..1e0fca13ff72
--- /dev/null
+++ b/kernel/rtmutex.h
@@ -0,0 +1,29 @@
1/*
2 * RT-Mutexes: blocking mutual exclusion locks with PI support
3 *
4 * started by Ingo Molnar and Thomas Gleixner:
5 *
6 * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
7 * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
8 *
9 * This file contains macros used solely by rtmutex.c.
10 * Non-debug version.
11 */
12
13#define __IP_DECL__
14#define __IP__
15#define __RET_IP__
16#define rt_mutex_deadlock_check(l) (0)
17#define rt_mutex_deadlock_account_lock(m, t) do { } while (0)
18#define rt_mutex_deadlock_account_unlock(l) do { } while (0)
19#define debug_rt_mutex_init_waiter(w) do { } while (0)
20#define debug_rt_mutex_free_waiter(w) do { } while (0)
21#define debug_rt_mutex_lock(l) do { } while (0)
22#define debug_rt_mutex_proxy_lock(l,p) do { } while (0)
23#define debug_rt_mutex_proxy_unlock(l) do { } while (0)
24#define debug_rt_mutex_unlock(l) do { } while (0)
25#define debug_rt_mutex_init(m, n) do { } while (0)
26#define debug_rt_mutex_deadlock(d, a ,l) do { } while (0)
27#define debug_rt_mutex_print_deadlock(w) do { } while (0)
28#define debug_rt_mutex_detect_deadlock(w,d) (d)
29#define debug_rt_mutex_reset_waiter(w) do { } while (0)
diff --git a/kernel/rtmutex_common.h b/kernel/rtmutex_common.h
new file mode 100644
index 000000000000..9c75856e791e
--- /dev/null
+++ b/kernel/rtmutex_common.h
@@ -0,0 +1,123 @@
1/*
2 * RT Mutexes: blocking mutual exclusion locks with PI support
3 *
4 * started by Ingo Molnar and Thomas Gleixner:
5 *
6 * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
7 * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
8 *
9 * This file contains the private data structure and API definitions.
10 */
11
12#ifndef __KERNEL_RTMUTEX_COMMON_H
13#define __KERNEL_RTMUTEX_COMMON_H
14
15#include <linux/rtmutex.h>
16
17/*
18 * The in-kernel rtmutex tester is independent of rtmutex debugging. We
19 * call schedule_rt_mutex_test() instead of schedule() for the tasks which
20 * belong to the tester. That way we can delay the wakeup path of those
21 * threads to provoke lock stealing and testing of complex boosting scenarios.
22 */
23#ifdef CONFIG_RT_MUTEX_TESTER
24
25extern void schedule_rt_mutex_test(struct rt_mutex *lock);
26
27#define schedule_rt_mutex(_lock) \
28 do { \
29 if (!(current->flags & PF_MUTEX_TESTER)) \
30 schedule(); \
31 else \
32 schedule_rt_mutex_test(_lock); \
33 } while (0)
34
35#else
36# define schedule_rt_mutex(_lock) schedule()
37#endif
38
39/*
40 * This is the control structure for tasks blocked on a rt_mutex,
41 * which is allocated on the kernel stack of the blocked task.
42 *
43 * @list_entry: pi node to enqueue into the mutex waiters list
44 * @pi_list_entry: pi node to enqueue into the mutex owner waiters list
45 * @task: task reference to the blocked task
46 */
47struct rt_mutex_waiter {
48 struct plist_node list_entry;
49 struct plist_node pi_list_entry;
50 struct task_struct *task;
51 struct rt_mutex *lock;
52#ifdef CONFIG_DEBUG_RT_MUTEXES
53 unsigned long ip;
54 pid_t deadlock_task_pid;
55 struct rt_mutex *deadlock_lock;
56#endif
57};
58
59/*
60 * Various helpers to access the waiters-plist:
61 */
62static inline int rt_mutex_has_waiters(struct rt_mutex *lock)
63{
64 return !plist_head_empty(&lock->wait_list);
65}
66
67static inline struct rt_mutex_waiter *
68rt_mutex_top_waiter(struct rt_mutex *lock)
69{
70 struct rt_mutex_waiter *w;
71
72 w = plist_first_entry(&lock->wait_list, struct rt_mutex_waiter,
73 list_entry);
74 BUG_ON(w->lock != lock);
75
76 return w;
77}
78
79static inline int task_has_pi_waiters(struct task_struct *p)
80{
81 return !plist_head_empty(&p->pi_waiters);
82}
83
84static inline struct rt_mutex_waiter *
85task_top_pi_waiter(struct task_struct *p)
86{
87 return plist_first_entry(&p->pi_waiters, struct rt_mutex_waiter,
88 pi_list_entry);
89}
90
91/*
92 * lock->owner state tracking:
93 */
94#define RT_MUTEX_OWNER_PENDING 1UL
95#define RT_MUTEX_HAS_WAITERS 2UL
96#define RT_MUTEX_OWNER_MASKALL 3UL
97
98static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock)
99{
100 return (struct task_struct *)
101 ((unsigned long)lock->owner & ~RT_MUTEX_OWNER_MASKALL);
102}
103
104static inline struct task_struct *rt_mutex_real_owner(struct rt_mutex *lock)
105{
106 return (struct task_struct *)
107 ((unsigned long)lock->owner & ~RT_MUTEX_HAS_WAITERS);
108}
109
110static inline unsigned long rt_mutex_owner_pending(struct rt_mutex *lock)
111{
112 return (unsigned long)lock->owner & RT_MUTEX_OWNER_PENDING;
113}
114
115/*
116 * PI-futex support (proxy locking functions, etc.):
117 */
118extern struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock);
119extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
120 struct task_struct *proxy_owner);
121extern void rt_mutex_proxy_unlock(struct rt_mutex *lock,
122 struct task_struct *proxy_owner);
123#endif
diff --git a/kernel/sched.c b/kernel/sched.c
index c13f1bd2df7d..2629c1711fd6 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -168,15 +168,21 @@
168 */ 168 */
169 169
170#define SCALE_PRIO(x, prio) \ 170#define SCALE_PRIO(x, prio) \
171 max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO/2), MIN_TIMESLICE) 171 max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE)
172 172
173static unsigned int task_timeslice(task_t *p) 173static unsigned int static_prio_timeslice(int static_prio)
174{ 174{
175 if (p->static_prio < NICE_TO_PRIO(0)) 175 if (static_prio < NICE_TO_PRIO(0))
176 return SCALE_PRIO(DEF_TIMESLICE*4, p->static_prio); 176 return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio);
177 else 177 else
178 return SCALE_PRIO(DEF_TIMESLICE, p->static_prio); 178 return SCALE_PRIO(DEF_TIMESLICE, static_prio);
179} 179}
180
181static inline unsigned int task_timeslice(task_t *p)
182{
183 return static_prio_timeslice(p->static_prio);
184}
185
180#define task_hot(p, now, sd) ((long long) ((now) - (p)->last_ran) \ 186#define task_hot(p, now, sd) ((long long) ((now) - (p)->last_ran) \
181 < (long long) (sd)->cache_hot_time) 187 < (long long) (sd)->cache_hot_time)
182 188
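
As a rough illustration of what static_prio_timeslice() yields, the sketch below replays the arithmetic with assumed constants (HZ of 1000, so DEF_TIMESLICE is 100 ticks and MIN_TIMESLICE 5, MAX_RT_PRIO 100); the values are assumptions for orientation, not read from this tree's headers.

#include <stdio.h>

#define MAX_RT_PRIO     100
#define MAX_PRIO        (MAX_RT_PRIO + 40)
#define MAX_USER_PRIO   40
#define MIN_TIMESLICE   5      /* assumed */
#define DEF_TIMESLICE   100    /* assumed */
#define NICE_TO_PRIO(n) (MAX_RT_PRIO + (n) + 20)

#define MAX_OF(a, b)    ((a) > (b) ? (a) : (b))
#define SCALE_PRIO(x, prio) \
	MAX_OF((x) * (MAX_PRIO - (prio)) / (MAX_USER_PRIO / 2), MIN_TIMESLICE)

static unsigned int static_prio_timeslice(int static_prio)
{
	if (static_prio < NICE_TO_PRIO(0))
		return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio);
	return SCALE_PRIO(DEF_TIMESLICE, static_prio);
}

int main(void)
{
	/* nice -20 -> 800 ticks, nice 0 -> 100 ticks, nice 19 -> 5 ticks */
	printf("%u %u %u\n",
	       static_prio_timeslice(NICE_TO_PRIO(-20)),
	       static_prio_timeslice(NICE_TO_PRIO(0)),
	       static_prio_timeslice(NICE_TO_PRIO(19)));
	return 0;
}
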
@@ -184,13 +190,11 @@ static unsigned int task_timeslice(task_t *p)
184 * These are the runqueue data structures: 190 * These are the runqueue data structures:
185 */ 191 */
186 192
187#define BITMAP_SIZE ((((MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long))
188
189typedef struct runqueue runqueue_t; 193typedef struct runqueue runqueue_t;
190 194
191struct prio_array { 195struct prio_array {
192 unsigned int nr_active; 196 unsigned int nr_active;
193 unsigned long bitmap[BITMAP_SIZE]; 197 DECLARE_BITMAP(bitmap, MAX_PRIO+1); /* include 1 bit for delimiter */
194 struct list_head queue[MAX_PRIO]; 198 struct list_head queue[MAX_PRIO];
195}; 199};
196 200
@@ -209,6 +213,7 @@ struct runqueue {
209 * remote CPUs use both these fields when doing load calculation. 213 * remote CPUs use both these fields when doing load calculation.
210 */ 214 */
211 unsigned long nr_running; 215 unsigned long nr_running;
216 unsigned long raw_weighted_load;
212#ifdef CONFIG_SMP 217#ifdef CONFIG_SMP
213 unsigned long cpu_load[3]; 218 unsigned long cpu_load[3];
214#endif 219#endif
@@ -239,7 +244,6 @@ struct runqueue {
239 244
240 task_t *migration_thread; 245 task_t *migration_thread;
241 struct list_head migration_queue; 246 struct list_head migration_queue;
242 int cpu;
243#endif 247#endif
244 248
245#ifdef CONFIG_SCHEDSTATS 249#ifdef CONFIG_SCHEDSTATS
@@ -351,11 +355,30 @@ static inline void finish_lock_switch(runqueue_t *rq, task_t *prev)
351#endif /* __ARCH_WANT_UNLOCKED_CTXSW */ 355#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
352 356
353/* 357/*
358 * __task_rq_lock - lock the runqueue a given task resides on.
 359 * Must be called with interrupts disabled.
360 */
361static inline runqueue_t *__task_rq_lock(task_t *p)
362 __acquires(rq->lock)
363{
364 struct runqueue *rq;
365
366repeat_lock_task:
367 rq = task_rq(p);
368 spin_lock(&rq->lock);
369 if (unlikely(rq != task_rq(p))) {
370 spin_unlock(&rq->lock);
371 goto repeat_lock_task;
372 }
373 return rq;
374}
375
376/*
354 * task_rq_lock - lock the runqueue a given task resides on and disable 377 * task_rq_lock - lock the runqueue a given task resides on and disable
355 * interrupts. Note the ordering: we can safely lookup the task_rq without 378 * interrupts. Note the ordering: we can safely lookup the task_rq without
356 * explicitly disabling preemption. 379 * explicitly disabling preemption.
357 */ 380 */
358static inline runqueue_t *task_rq_lock(task_t *p, unsigned long *flags) 381static runqueue_t *task_rq_lock(task_t *p, unsigned long *flags)
359 __acquires(rq->lock) 382 __acquires(rq->lock)
360{ 383{
361 struct runqueue *rq; 384 struct runqueue *rq;
@@ -371,6 +394,12 @@ repeat_lock_task:
371 return rq; 394 return rq;
372} 395}
373 396
397static inline void __task_rq_unlock(runqueue_t *rq)
398 __releases(rq->lock)
399{
400 spin_unlock(&rq->lock);
401}
402
374static inline void task_rq_unlock(runqueue_t *rq, unsigned long *flags) 403static inline void task_rq_unlock(runqueue_t *rq, unsigned long *flags)
375 __releases(rq->lock) 404 __releases(rq->lock)
376{ 405{
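
The new __task_rq_lock() uses the familiar snapshot-lock-recheck pattern: lock whatever runqueue the task currently points at, then verify it has not migrated before returning. A simplified pthread sketch of the same idea follows; the names are local, and the plain pointer read stands in for the ordering the kernel gets implicitly.

#include <pthread.h>

struct queue { pthread_mutex_t lock; };
struct task  { struct queue *q; };   /* q can change under us (migration)   */

static struct queue *task_queue_lock(struct task *t)
{
	struct queue *q;

	for (;;) {
		q = t->q;                       /* snapshot the current queue  */
		pthread_mutex_lock(&q->lock);
		if (q == t->q)                  /* still on the same queue?    */
			return q;               /* yes: return with it locked  */
		pthread_mutex_unlock(&q->lock); /* no: it migrated, retry      */
	}
}

int main(void)
{
	struct queue q = { PTHREAD_MUTEX_INITIALIZER };
	struct task  t = { &q };
	struct queue *locked = task_queue_lock(&t);

	pthread_mutex_unlock(&locked->lock);
	return 0;
}
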
@@ -634,7 +663,7 @@ static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array)
634} 663}
635 664
636/* 665/*
637 * effective_prio - return the priority that is based on the static 666 * __normal_prio - return the priority that is based on the static
638 * priority but is modified by bonuses/penalties. 667 * priority but is modified by bonuses/penalties.
639 * 668 *
640 * We scale the actual sleep average [0 .... MAX_SLEEP_AVG] 669 * We scale the actual sleep average [0 .... MAX_SLEEP_AVG]
@@ -647,13 +676,11 @@ static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array)
647 * 676 *
648 * Both properties are important to certain workloads. 677 * Both properties are important to certain workloads.
649 */ 678 */
650static int effective_prio(task_t *p) 679
680static inline int __normal_prio(task_t *p)
651{ 681{
652 int bonus, prio; 682 int bonus, prio;
653 683
654 if (rt_task(p))
655 return p->prio;
656
657 bonus = CURRENT_BONUS(p) - MAX_BONUS / 2; 684 bonus = CURRENT_BONUS(p) - MAX_BONUS / 2;
658 685
659 prio = p->static_prio - bonus; 686 prio = p->static_prio - bonus;
@@ -665,6 +692,106 @@ static int effective_prio(task_t *p)
665} 692}
666 693
667/* 694/*
695 * To aid in avoiding the subversion of "niceness" due to uneven distribution
 696 * of tasks with abnormal "nice" values across CPUs, the contribution that
697 * each task makes to its run queue's load is weighted according to its
698 * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a
699 * scaled version of the new time slice allocation that they receive on time
700 * slice expiry etc.
701 */
702
703/*
704 * Assume: static_prio_timeslice(NICE_TO_PRIO(0)) == DEF_TIMESLICE
705 * If static_prio_timeslice() is ever changed to break this assumption then
706 * this code will need modification
707 */
708#define TIME_SLICE_NICE_ZERO DEF_TIMESLICE
709#define LOAD_WEIGHT(lp) \
710 (((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO)
711#define PRIO_TO_LOAD_WEIGHT(prio) \
712 LOAD_WEIGHT(static_prio_timeslice(prio))
713#define RTPRIO_TO_LOAD_WEIGHT(rp) \
714 (PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + LOAD_WEIGHT(rp))
715
716static void set_load_weight(task_t *p)
717{
718 if (has_rt_policy(p)) {
719#ifdef CONFIG_SMP
720 if (p == task_rq(p)->migration_thread)
721 /*
722 * The migration thread does the actual balancing.
723 * Giving its load any weight will skew balancing
724 * adversely.
725 */
726 p->load_weight = 0;
727 else
728#endif
729 p->load_weight = RTPRIO_TO_LOAD_WEIGHT(p->rt_priority);
730 } else
731 p->load_weight = PRIO_TO_LOAD_WEIGHT(p->static_prio);
732}
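
Plugging assumed numbers into the macros above (DEF_TIMESLICE of 100 ticks and SCHED_LOAD_SCALE of 128, both assumptions of this sketch rather than values read from the headers) gives a feel for the weights: a nice-0 task contributes exactly SCHED_LOAD_SCALE, a nice -20 task roughly eight times that, and a nice 19 task almost nothing.

#include <stdio.h>

/* Assumed constants, for illustration only */
#define SCHED_LOAD_SCALE      128UL
#define TIME_SLICE_NICE_ZERO  100UL   /* == assumed DEF_TIMESLICE */

#define LOAD_WEIGHT(lp) (((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO)

int main(void)
{
	/* timeslices from the earlier sketch: nice -20 -> 800, 0 -> 100, 19 -> 5 */
	printf("nice -20: %lu\n", LOAD_WEIGHT(800UL));  /* 1024 */
	printf("nice   0: %lu\n", LOAD_WEIGHT(100UL));  /*  128 */
	printf("nice  19: %lu\n", LOAD_WEIGHT(5UL));    /*    6 */
	return 0;
}
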
733
734static inline void inc_raw_weighted_load(runqueue_t *rq, const task_t *p)
735{
736 rq->raw_weighted_load += p->load_weight;
737}
738
739static inline void dec_raw_weighted_load(runqueue_t *rq, const task_t *p)
740{
741 rq->raw_weighted_load -= p->load_weight;
742}
743
744static inline void inc_nr_running(task_t *p, runqueue_t *rq)
745{
746 rq->nr_running++;
747 inc_raw_weighted_load(rq, p);
748}
749
750static inline void dec_nr_running(task_t *p, runqueue_t *rq)
751{
752 rq->nr_running--;
753 dec_raw_weighted_load(rq, p);
754}
755
756/*
757 * Calculate the expected normal priority: i.e. priority
758 * without taking RT-inheritance into account. Might be
759 * boosted by interactivity modifiers. Changes upon fork,
760 * setprio syscalls, and whenever the interactivity
761 * estimator recalculates.
762 */
763static inline int normal_prio(task_t *p)
764{
765 int prio;
766
767 if (has_rt_policy(p))
768 prio = MAX_RT_PRIO-1 - p->rt_priority;
769 else
770 prio = __normal_prio(p);
771 return prio;
772}
773
774/*
775 * Calculate the current priority, i.e. the priority
776 * taken into account by the scheduler. This value might
777 * be boosted by RT tasks, or might be boosted by
778 * interactivity modifiers. Will be RT if the task got
779 * RT-boosted. If not then it returns p->normal_prio.
780 */
781static int effective_prio(task_t *p)
782{
783 p->normal_prio = normal_prio(p);
784 /*
785 * If we are RT tasks or we were boosted to RT priority,
786 * keep the priority unchanged. Otherwise, update priority
787 * to the normal priority:
788 */
789 if (!rt_prio(p->prio))
790 return p->normal_prio;
791 return p->prio;
792}
793
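
For orientation, the kernel-internal prio scale these helpers work on runs from 0 (highest) to MAX_PRIO-1 (lowest): real-time priorities are folded in, inverted, below MAX_RT_PRIO, and nice values land above it. The sketch below mirrors normal_prio()'s two branches while ignoring the interactivity bonus and assuming MAX_RT_PRIO is 100; it is illustrative only.

#include <stdio.h>

#define MAX_RT_PRIO 100

static int normal_prio_rt(int rt_priority)   /* has_rt_policy() case */
{
	return MAX_RT_PRIO - 1 - rt_priority;
}

static int normal_prio_nice(int nice)        /* SCHED_NORMAL, no bonus */
{
	return MAX_RT_PRIO + nice + 20;          /* NICE_TO_PRIO() */
}

int main(void)
{
	printf("rt_priority 50 -> prio %d\n", normal_prio_rt(50));   /* 49  */
	printf("nice 0       -> prio %d\n", normal_prio_nice(0));    /* 120 */
	return 0;
}
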
794/*
668 * __activate_task - move a task to the runqueue. 795 * __activate_task - move a task to the runqueue.
669 */ 796 */
670static void __activate_task(task_t *p, runqueue_t *rq) 797static void __activate_task(task_t *p, runqueue_t *rq)
@@ -674,7 +801,7 @@ static void __activate_task(task_t *p, runqueue_t *rq)
674 if (batch_task(p)) 801 if (batch_task(p))
675 target = rq->expired; 802 target = rq->expired;
676 enqueue_task(p, target); 803 enqueue_task(p, target);
677 rq->nr_running++; 804 inc_nr_running(p, rq);
678} 805}
679 806
680/* 807/*
@@ -683,39 +810,45 @@ static void __activate_task(task_t *p, runqueue_t *rq)
683static inline void __activate_idle_task(task_t *p, runqueue_t *rq) 810static inline void __activate_idle_task(task_t *p, runqueue_t *rq)
684{ 811{
685 enqueue_task_head(p, rq->active); 812 enqueue_task_head(p, rq->active);
686 rq->nr_running++; 813 inc_nr_running(p, rq);
687} 814}
688 815
816/*
817 * Recalculate p->normal_prio and p->prio after having slept,
818 * updating the sleep-average too:
819 */
689static int recalc_task_prio(task_t *p, unsigned long long now) 820static int recalc_task_prio(task_t *p, unsigned long long now)
690{ 821{
691 /* Caller must always ensure 'now >= p->timestamp' */ 822 /* Caller must always ensure 'now >= p->timestamp' */
692 unsigned long long __sleep_time = now - p->timestamp; 823 unsigned long sleep_time = now - p->timestamp;
693 unsigned long sleep_time;
694 824
695 if (batch_task(p)) 825 if (batch_task(p))
696 sleep_time = 0; 826 sleep_time = 0;
697 else {
698 if (__sleep_time > NS_MAX_SLEEP_AVG)
699 sleep_time = NS_MAX_SLEEP_AVG;
700 else
701 sleep_time = (unsigned long)__sleep_time;
702 }
703 827
704 if (likely(sleep_time > 0)) { 828 if (likely(sleep_time > 0)) {
705 /* 829 /*
706 * User tasks that sleep a long time are categorised as 830 * This ceiling is set to the lowest priority that would allow
707 * idle. They will only have their sleep_avg increased to a 831 * a task to be reinserted into the active array on timeslice
708 * level that makes them just interactive priority to stay 832 * completion.
709 * active yet prevent them suddenly becoming cpu hogs and
710 * starving other processes.
711 */ 833 */
712 if (p->mm && sleep_time > INTERACTIVE_SLEEP(p)) { 834 unsigned long ceiling = INTERACTIVE_SLEEP(p);
713 unsigned long ceiling;
714 835
715 ceiling = JIFFIES_TO_NS(MAX_SLEEP_AVG - 836 if (p->mm && sleep_time > ceiling && p->sleep_avg < ceiling) {
716 DEF_TIMESLICE); 837 /*
717 if (p->sleep_avg < ceiling) 838 * Prevents user tasks from achieving best priority
718 p->sleep_avg = ceiling; 839 * with one single large enough sleep.
840 */
841 p->sleep_avg = ceiling;
842 /*
843 * Using INTERACTIVE_SLEEP() as a ceiling places a
844 * nice(0) task 1ms sleep away from promotion, and
845 * gives it 700ms to round-robin with no chance of
846 * being demoted. This is more than generous, so
847 * mark this sleep as non-interactive to prevent the
848 * on-runqueue bonus logic from intervening should
849 * this task not receive cpu immediately.
850 */
851 p->sleep_type = SLEEP_NONINTERACTIVE;
719 } else { 852 } else {
720 /* 853 /*
721 * Tasks waking from uninterruptible sleep are 854 * Tasks waking from uninterruptible sleep are
@@ -723,12 +856,12 @@ static int recalc_task_prio(task_t *p, unsigned long long now)
723 * are likely to be waiting on I/O 856 * are likely to be waiting on I/O
724 */ 857 */
725 if (p->sleep_type == SLEEP_NONINTERACTIVE && p->mm) { 858 if (p->sleep_type == SLEEP_NONINTERACTIVE && p->mm) {
726 if (p->sleep_avg >= INTERACTIVE_SLEEP(p)) 859 if (p->sleep_avg >= ceiling)
727 sleep_time = 0; 860 sleep_time = 0;
728 else if (p->sleep_avg + sleep_time >= 861 else if (p->sleep_avg + sleep_time >=
729 INTERACTIVE_SLEEP(p)) { 862 ceiling) {
730 p->sleep_avg = INTERACTIVE_SLEEP(p); 863 p->sleep_avg = ceiling;
731 sleep_time = 0; 864 sleep_time = 0;
732 } 865 }
733 } 866 }
734 867
@@ -742,9 +875,9 @@ static int recalc_task_prio(task_t *p, unsigned long long now)
742 */ 875 */
743 p->sleep_avg += sleep_time; 876 p->sleep_avg += sleep_time;
744 877
745 if (p->sleep_avg > NS_MAX_SLEEP_AVG)
746 p->sleep_avg = NS_MAX_SLEEP_AVG;
747 } 878 }
879 if (p->sleep_avg > NS_MAX_SLEEP_AVG)
880 p->sleep_avg = NS_MAX_SLEEP_AVG;
748 } 881 }
749 882
750 return effective_prio(p); 883 return effective_prio(p);
@@ -805,7 +938,7 @@ static void activate_task(task_t *p, runqueue_t *rq, int local)
805 */ 938 */
806static void deactivate_task(struct task_struct *p, runqueue_t *rq) 939static void deactivate_task(struct task_struct *p, runqueue_t *rq)
807{ 940{
808 rq->nr_running--; 941 dec_nr_running(p, rq);
809 dequeue_task(p, p->array); 942 dequeue_task(p, p->array);
810 p->array = NULL; 943 p->array = NULL;
811} 944}
@@ -818,6 +951,11 @@ static void deactivate_task(struct task_struct *p, runqueue_t *rq)
818 * the target CPU. 951 * the target CPU.
819 */ 952 */
820#ifdef CONFIG_SMP 953#ifdef CONFIG_SMP
954
955#ifndef tsk_is_polling
956#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
957#endif
958
821static void resched_task(task_t *p) 959static void resched_task(task_t *p)
822{ 960{
823 int cpu; 961 int cpu;
@@ -833,9 +971,9 @@ static void resched_task(task_t *p)
833 if (cpu == smp_processor_id()) 971 if (cpu == smp_processor_id())
834 return; 972 return;
835 973
836 /* NEED_RESCHED must be visible before we test POLLING_NRFLAG */ 974 /* NEED_RESCHED must be visible before we test polling */
837 smp_mb(); 975 smp_mb();
838 if (!test_tsk_thread_flag(p, TIF_POLLING_NRFLAG)) 976 if (!tsk_is_polling(p))
839 smp_send_reschedule(cpu); 977 smp_send_reschedule(cpu);
840} 978}
841#else 979#else
@@ -855,6 +993,12 @@ inline int task_curr(const task_t *p)
855 return cpu_curr(task_cpu(p)) == p; 993 return cpu_curr(task_cpu(p)) == p;
856} 994}
857 995
996/* Used instead of source_load when we know the type == 0 */
997unsigned long weighted_cpuload(const int cpu)
998{
999 return cpu_rq(cpu)->raw_weighted_load;
1000}
1001
858#ifdef CONFIG_SMP 1002#ifdef CONFIG_SMP
859typedef struct { 1003typedef struct {
860 struct list_head list; 1004 struct list_head list;
@@ -944,7 +1088,8 @@ void kick_process(task_t *p)
944} 1088}
945 1089
946/* 1090/*
947 * Return a low guess at the load of a migration-source cpu. 1091 * Return a low guess at the load of a migration-source cpu weighted
1092 * according to the scheduling class and "nice" value.
948 * 1093 *
949 * We want to under-estimate the load of migration sources, to 1094 * We want to under-estimate the load of migration sources, to
950 * balance conservatively. 1095 * balance conservatively.
@@ -952,24 +1097,36 @@ void kick_process(task_t *p)
952static inline unsigned long source_load(int cpu, int type) 1097static inline unsigned long source_load(int cpu, int type)
953{ 1098{
954 runqueue_t *rq = cpu_rq(cpu); 1099 runqueue_t *rq = cpu_rq(cpu);
955 unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE; 1100
956 if (type == 0) 1101 if (type == 0)
957 return load_now; 1102 return rq->raw_weighted_load;
958 1103
959 return min(rq->cpu_load[type-1], load_now); 1104 return min(rq->cpu_load[type-1], rq->raw_weighted_load);
960} 1105}
961 1106
962/* 1107/*
963 * Return a high guess at the load of a migration-target cpu 1108 * Return a high guess at the load of a migration-target cpu weighted
1109 * according to the scheduling class and "nice" value.
964 */ 1110 */
965static inline unsigned long target_load(int cpu, int type) 1111static inline unsigned long target_load(int cpu, int type)
966{ 1112{
967 runqueue_t *rq = cpu_rq(cpu); 1113 runqueue_t *rq = cpu_rq(cpu);
968 unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE; 1114
969 if (type == 0) 1115 if (type == 0)
970 return load_now; 1116 return rq->raw_weighted_load;
971 1117
972 return max(rq->cpu_load[type-1], load_now); 1118 return max(rq->cpu_load[type-1], rq->raw_weighted_load);
1119}
1120
1121/*
1122 * Return the average load per task on the cpu's run queue
1123 */
1124static inline unsigned long cpu_avg_load_per_task(int cpu)
1125{
1126 runqueue_t *rq = cpu_rq(cpu);
1127 unsigned long n = rq->nr_running;
1128
1129 return n ? rq->raw_weighted_load / n : SCHED_LOAD_SCALE;
973} 1130}
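
A quick worked example of why raw_weighted_load replaces nr_running * SCHED_LOAD_SCALE in these estimates: with the assumed weights from the earlier sketch (128 for nice 0, 6 for nice 19), a runqueue holding one nice-0 and one nice-19 task no longer appears as busy as one holding two nice-0 tasks.

#include <stdio.h>

int main(void)
{
	unsigned long scale = 128;                   /* assumed SCHED_LOAD_SCALE */
	unsigned long w_nice0 = 128, w_nice19 = 6;   /* assumed per-task weights */
	unsigned long nr_running = 2;

	unsigned long old_estimate = nr_running * scale;        /* 256 either way */
	unsigned long raw_weighted = w_nice0 + w_nice19;        /* 134            */
	unsigned long avg_per_task = raw_weighted / nr_running; /* 67             */

	printf("old: %lu  weighted: %lu  avg/task: %lu\n",
	       old_estimate, raw_weighted, avg_per_task);
	return 0;
}
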
974 1131
975/* 1132/*
@@ -1042,7 +1199,7 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
1042 cpus_and(tmp, group->cpumask, p->cpus_allowed); 1199 cpus_and(tmp, group->cpumask, p->cpus_allowed);
1043 1200
1044 for_each_cpu_mask(i, tmp) { 1201 for_each_cpu_mask(i, tmp) {
1045 load = source_load(i, 0); 1202 load = weighted_cpuload(i);
1046 1203
1047 if (load < min_load || (load == min_load && i == this_cpu)) { 1204 if (load < min_load || (load == min_load && i == this_cpu)) {
1048 min_load = load; 1205 min_load = load;
@@ -1069,9 +1226,15 @@ static int sched_balance_self(int cpu, int flag)
1069 struct task_struct *t = current; 1226 struct task_struct *t = current;
1070 struct sched_domain *tmp, *sd = NULL; 1227 struct sched_domain *tmp, *sd = NULL;
1071 1228
1072 for_each_domain(cpu, tmp) 1229 for_each_domain(cpu, tmp) {
1230 /*
1231 * If power savings logic is enabled for a domain, stop there.
1232 */
1233 if (tmp->flags & SD_POWERSAVINGS_BALANCE)
1234 break;
1073 if (tmp->flags & flag) 1235 if (tmp->flags & flag)
1074 sd = tmp; 1236 sd = tmp;
1237 }
1075 1238
1076 while (sd) { 1239 while (sd) {
1077 cpumask_t span; 1240 cpumask_t span;
@@ -1221,17 +1384,19 @@ static int try_to_wake_up(task_t *p, unsigned int state, int sync)
1221 1384
1222 if (this_sd->flags & SD_WAKE_AFFINE) { 1385 if (this_sd->flags & SD_WAKE_AFFINE) {
1223 unsigned long tl = this_load; 1386 unsigned long tl = this_load;
1387 unsigned long tl_per_task = cpu_avg_load_per_task(this_cpu);
1388
1224 /* 1389 /*
1225 * If sync wakeup then subtract the (maximum possible) 1390 * If sync wakeup then subtract the (maximum possible)
1226 * effect of the currently running task from the load 1391 * effect of the currently running task from the load
1227 * of the current CPU: 1392 * of the current CPU:
1228 */ 1393 */
1229 if (sync) 1394 if (sync)
1230 tl -= SCHED_LOAD_SCALE; 1395 tl -= current->load_weight;
1231 1396
1232 if ((tl <= load && 1397 if ((tl <= load &&
1233 tl + target_load(cpu, idx) <= SCHED_LOAD_SCALE) || 1398 tl + target_load(cpu, idx) <= tl_per_task) ||
1234 100*(tl + SCHED_LOAD_SCALE) <= imbalance*load) { 1399 100*(tl + p->load_weight) <= imbalance*load) {
1235 /* 1400 /*
1236 * This domain has SD_WAKE_AFFINE and 1401 * This domain has SD_WAKE_AFFINE and
1237 * p is cache cold in this domain, and 1402 * p is cache cold in this domain, and
@@ -1348,6 +1513,12 @@ void fastcall sched_fork(task_t *p, int clone_flags)
1348 * event cannot wake it up and insert it on the runqueue either. 1513 * event cannot wake it up and insert it on the runqueue either.
1349 */ 1514 */
1350 p->state = TASK_RUNNING; 1515 p->state = TASK_RUNNING;
1516
1517 /*
1518 * Make sure we do not leak PI boosting priority to the child:
1519 */
1520 p->prio = current->normal_prio;
1521
1351 INIT_LIST_HEAD(&p->run_list); 1522 INIT_LIST_HEAD(&p->run_list);
1352 p->array = NULL; 1523 p->array = NULL;
1353#ifdef CONFIG_SCHEDSTATS 1524#ifdef CONFIG_SCHEDSTATS
@@ -1427,10 +1598,11 @@ void fastcall wake_up_new_task(task_t *p, unsigned long clone_flags)
1427 __activate_task(p, rq); 1598 __activate_task(p, rq);
1428 else { 1599 else {
1429 p->prio = current->prio; 1600 p->prio = current->prio;
1601 p->normal_prio = current->normal_prio;
1430 list_add_tail(&p->run_list, &current->run_list); 1602 list_add_tail(&p->run_list, &current->run_list);
1431 p->array = current->array; 1603 p->array = current->array;
1432 p->array->nr_active++; 1604 p->array->nr_active++;
1433 rq->nr_running++; 1605 inc_nr_running(p, rq);
1434 } 1606 }
1435 set_need_resched(); 1607 set_need_resched();
1436 } else 1608 } else
@@ -1648,7 +1820,8 @@ unsigned long nr_uninterruptible(void)
1648 1820
1649unsigned long long nr_context_switches(void) 1821unsigned long long nr_context_switches(void)
1650{ 1822{
1651 unsigned long long i, sum = 0; 1823 int i;
1824 unsigned long long sum = 0;
1652 1825
1653 for_each_possible_cpu(i) 1826 for_each_possible_cpu(i)
1654 sum += cpu_rq(i)->nr_switches; 1827 sum += cpu_rq(i)->nr_switches;
@@ -1686,9 +1859,6 @@ unsigned long nr_active(void)
1686/* 1859/*
1687 * double_rq_lock - safely lock two runqueues 1860 * double_rq_lock - safely lock two runqueues
1688 * 1861 *
1689 * We must take them in cpu order to match code in
1690 * dependent_sleeper and wake_dependent_sleeper.
1691 *
1692 * Note this does not disable interrupts like task_rq_lock, 1862 * Note this does not disable interrupts like task_rq_lock,
1693 * you need to do so manually before calling. 1863 * you need to do so manually before calling.
1694 */ 1864 */
@@ -1700,7 +1870,7 @@ static void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2)
1700 spin_lock(&rq1->lock); 1870 spin_lock(&rq1->lock);
1701 __acquire(rq2->lock); /* Fake it out ;) */ 1871 __acquire(rq2->lock); /* Fake it out ;) */
1702 } else { 1872 } else {
1703 if (rq1->cpu < rq2->cpu) { 1873 if (rq1 < rq2) {
1704 spin_lock(&rq1->lock); 1874 spin_lock(&rq1->lock);
1705 spin_lock(&rq2->lock); 1875 spin_lock(&rq2->lock);
1706 } else { 1876 } else {
@@ -1736,7 +1906,7 @@ static void double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest)
1736 __acquires(this_rq->lock) 1906 __acquires(this_rq->lock)
1737{ 1907{
1738 if (unlikely(!spin_trylock(&busiest->lock))) { 1908 if (unlikely(!spin_trylock(&busiest->lock))) {
1739 if (busiest->cpu < this_rq->cpu) { 1909 if (busiest < this_rq) {
1740 spin_unlock(&this_rq->lock); 1910 spin_unlock(&this_rq->lock);
1741 spin_lock(&busiest->lock); 1911 spin_lock(&busiest->lock);
1742 spin_lock(&this_rq->lock); 1912 spin_lock(&this_rq->lock);
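
With the runqueue's cpu field removed, double_rq_lock() and double_lock_balance() now order the two locks by runqueue address rather than by CPU number; any fixed global order is enough to prevent an ABBA deadlock. A pthread sketch of the same rule (illustrative; it relies on pointer comparison across objects just as the kernel does):

#include <pthread.h>

struct rq { pthread_mutex_t lock; };

static void double_lock(struct rq *a, struct rq *b)
{
	if (a == b) {                       /* same queue: take it once        */
		pthread_mutex_lock(&a->lock);
		return;
	}
	if (a < b) {                        /* always lock the lower address first */
		pthread_mutex_lock(&a->lock);
		pthread_mutex_lock(&b->lock);
	} else {
		pthread_mutex_lock(&b->lock);
		pthread_mutex_lock(&a->lock);
	}
}

int main(void)
{
	struct rq a = { PTHREAD_MUTEX_INITIALIZER };
	struct rq b = { PTHREAD_MUTEX_INITIALIZER };

	double_lock(&a, &b);                /* same order no matter who calls  */
	pthread_mutex_unlock(&a.lock);
	pthread_mutex_unlock(&b.lock);
	return 0;
}
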
@@ -1799,9 +1969,9 @@ void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p,
1799 runqueue_t *this_rq, prio_array_t *this_array, int this_cpu) 1969 runqueue_t *this_rq, prio_array_t *this_array, int this_cpu)
1800{ 1970{
1801 dequeue_task(p, src_array); 1971 dequeue_task(p, src_array);
1802 src_rq->nr_running--; 1972 dec_nr_running(p, src_rq);
1803 set_task_cpu(p, this_cpu); 1973 set_task_cpu(p, this_cpu);
1804 this_rq->nr_running++; 1974 inc_nr_running(p, this_rq);
1805 enqueue_task(p, this_array); 1975 enqueue_task(p, this_array);
1806 p->timestamp = (p->timestamp - src_rq->timestamp_last_tick) 1976 p->timestamp = (p->timestamp - src_rq->timestamp_last_tick)
1807 + this_rq->timestamp_last_tick; 1977 + this_rq->timestamp_last_tick;
@@ -1848,26 +2018,42 @@ int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu,
1848 return 1; 2018 return 1;
1849} 2019}
1850 2020
2021#define rq_best_prio(rq) min((rq)->curr->prio, (rq)->best_expired_prio)
1851/* 2022/*
1852 * move_tasks tries to move up to max_nr_move tasks from busiest to this_rq, 2023 * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted
1853 * as part of a balancing operation within "domain". Returns the number of 2024 * load from busiest to this_rq, as part of a balancing operation within
1854 * tasks moved. 2025 * "domain". Returns the number of tasks moved.
1855 * 2026 *
1856 * Called with both runqueues locked. 2027 * Called with both runqueues locked.
1857 */ 2028 */
1858static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest, 2029static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest,
1859 unsigned long max_nr_move, struct sched_domain *sd, 2030 unsigned long max_nr_move, unsigned long max_load_move,
1860 enum idle_type idle, int *all_pinned) 2031 struct sched_domain *sd, enum idle_type idle,
2032 int *all_pinned)
1861{ 2033{
1862 prio_array_t *array, *dst_array; 2034 prio_array_t *array, *dst_array;
1863 struct list_head *head, *curr; 2035 struct list_head *head, *curr;
1864 int idx, pulled = 0, pinned = 0; 2036 int idx, pulled = 0, pinned = 0, this_best_prio, busiest_best_prio;
2037 int busiest_best_prio_seen;
2038 int skip_for_load; /* skip the task based on weighted load issues */
2039 long rem_load_move;
1865 task_t *tmp; 2040 task_t *tmp;
1866 2041
1867 if (max_nr_move == 0) 2042 if (max_nr_move == 0 || max_load_move == 0)
1868 goto out; 2043 goto out;
1869 2044
2045 rem_load_move = max_load_move;
1870 pinned = 1; 2046 pinned = 1;
2047 this_best_prio = rq_best_prio(this_rq);
2048 busiest_best_prio = rq_best_prio(busiest);
2049 /*
2050 * Enable handling of the case where there is more than one task
2051 * with the best priority. If the current running task is one
2052 * of those with prio==busiest_best_prio we know it won't be moved
2053 * and therefore it's safe to override the skip (based on load) of
2054 * any task we find with that prio.
2055 */
2056 busiest_best_prio_seen = busiest_best_prio == busiest->curr->prio;
1871 2057
1872 /* 2058 /*
1873 * We first consider expired tasks. Those will likely not be 2059 * We first consider expired tasks. Those will likely not be
@@ -1907,7 +2093,17 @@ skip_queue:
1907 2093
1908 curr = curr->prev; 2094 curr = curr->prev;
1909 2095
1910 if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle, &pinned)) { 2096 /*
 2097 * To help distribute high priority tasks across CPUs we don't
 2098 * skip a task if it will be the highest priority task (i.e. smallest
 2099 * prio value) on its new queue, regardless of its load weight.
2100 */
2101 skip_for_load = tmp->load_weight > rem_load_move;
2102 if (skip_for_load && idx < this_best_prio)
2103 skip_for_load = !busiest_best_prio_seen && idx == busiest_best_prio;
2104 if (skip_for_load ||
2105 !can_migrate_task(tmp, busiest, this_cpu, sd, idle, &pinned)) {
2106 busiest_best_prio_seen |= idx == busiest_best_prio;
1911 if (curr != head) 2107 if (curr != head)
1912 goto skip_queue; 2108 goto skip_queue;
1913 idx++; 2109 idx++;
@@ -1921,9 +2117,15 @@ skip_queue:
1921 2117
1922 pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu); 2118 pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu);
1923 pulled++; 2119 pulled++;
2120 rem_load_move -= tmp->load_weight;
1924 2121
1925 /* We only want to steal up to the prescribed number of tasks. */ 2122 /*
1926 if (pulled < max_nr_move) { 2123 * We only want to steal up to the prescribed number of tasks
2124 * and the prescribed amount of weighted load.
2125 */
2126 if (pulled < max_nr_move && rem_load_move > 0) {
2127 if (idx < this_best_prio)
2128 this_best_prio = idx;
1927 if (curr != head) 2129 if (curr != head)
1928 goto skip_queue; 2130 goto skip_queue;
1929 idx++; 2131 idx++;
@@ -1944,7 +2146,7 @@ out:
1944 2146
1945/* 2147/*
1946 * find_busiest_group finds and returns the busiest CPU group within the 2148 * find_busiest_group finds and returns the busiest CPU group within the
1947 * domain. It calculates and returns the number of tasks which should be 2149 * domain. It calculates and returns the amount of weighted load which should be
1948 * moved to restore balance via the imbalance parameter. 2150 * moved to restore balance via the imbalance parameter.
1949 */ 2151 */
1950static struct sched_group * 2152static struct sched_group *
@@ -1954,9 +2156,19 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
1954 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; 2156 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
1955 unsigned long max_load, avg_load, total_load, this_load, total_pwr; 2157 unsigned long max_load, avg_load, total_load, this_load, total_pwr;
1956 unsigned long max_pull; 2158 unsigned long max_pull;
2159 unsigned long busiest_load_per_task, busiest_nr_running;
2160 unsigned long this_load_per_task, this_nr_running;
1957 int load_idx; 2161 int load_idx;
2162#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
2163 int power_savings_balance = 1;
2164 unsigned long leader_nr_running = 0, min_load_per_task = 0;
2165 unsigned long min_nr_running = ULONG_MAX;
2166 struct sched_group *group_min = NULL, *group_leader = NULL;
2167#endif
1958 2168
1959 max_load = this_load = total_load = total_pwr = 0; 2169 max_load = this_load = total_load = total_pwr = 0;
2170 busiest_load_per_task = busiest_nr_running = 0;
2171 this_load_per_task = this_nr_running = 0;
1960 if (idle == NOT_IDLE) 2172 if (idle == NOT_IDLE)
1961 load_idx = sd->busy_idx; 2173 load_idx = sd->busy_idx;
1962 else if (idle == NEWLY_IDLE) 2174 else if (idle == NEWLY_IDLE)
@@ -1965,16 +2177,19 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
1965 load_idx = sd->idle_idx; 2177 load_idx = sd->idle_idx;
1966 2178
1967 do { 2179 do {
1968 unsigned long load; 2180 unsigned long load, group_capacity;
1969 int local_group; 2181 int local_group;
1970 int i; 2182 int i;
2183 unsigned long sum_nr_running, sum_weighted_load;
1971 2184
1972 local_group = cpu_isset(this_cpu, group->cpumask); 2185 local_group = cpu_isset(this_cpu, group->cpumask);
1973 2186
1974 /* Tally up the load of all CPUs in the group */ 2187 /* Tally up the load of all CPUs in the group */
1975 avg_load = 0; 2188 sum_weighted_load = sum_nr_running = avg_load = 0;
1976 2189
1977 for_each_cpu_mask(i, group->cpumask) { 2190 for_each_cpu_mask(i, group->cpumask) {
2191 runqueue_t *rq = cpu_rq(i);
2192
1978 if (*sd_idle && !idle_cpu(i)) 2193 if (*sd_idle && !idle_cpu(i))
1979 *sd_idle = 0; 2194 *sd_idle = 0;
1980 2195
@@ -1985,6 +2200,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
1985 load = source_load(i, load_idx); 2200 load = source_load(i, load_idx);
1986 2201
1987 avg_load += load; 2202 avg_load += load;
2203 sum_nr_running += rq->nr_running;
2204 sum_weighted_load += rq->raw_weighted_load;
1988 } 2205 }
1989 2206
1990 total_load += avg_load; 2207 total_load += avg_load;
@@ -1993,17 +2210,80 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
1993 /* Adjust by relative CPU power of the group */ 2210 /* Adjust by relative CPU power of the group */
1994 avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power; 2211 avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
1995 2212
2213 group_capacity = group->cpu_power / SCHED_LOAD_SCALE;
2214
1996 if (local_group) { 2215 if (local_group) {
1997 this_load = avg_load; 2216 this_load = avg_load;
1998 this = group; 2217 this = group;
1999 } else if (avg_load > max_load) { 2218 this_nr_running = sum_nr_running;
2219 this_load_per_task = sum_weighted_load;
2220 } else if (avg_load > max_load &&
2221 sum_nr_running > group_capacity) {
2000 max_load = avg_load; 2222 max_load = avg_load;
2001 busiest = group; 2223 busiest = group;
2224 busiest_nr_running = sum_nr_running;
2225 busiest_load_per_task = sum_weighted_load;
2002 } 2226 }
2227
2228#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
2229 /*
2230 * Busy processors will not participate in power savings
2231 * balance.
2232 */
2233 if (idle == NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
2234 goto group_next;
2235
2236 /*
 2237 * If the local group is idle or completely loaded,
 2238 * there is no need to do power savings balance at this domain
2239 */
2240 if (local_group && (this_nr_running >= group_capacity ||
2241 !this_nr_running))
2242 power_savings_balance = 0;
2243
2244 /*
2245 * If a group is already running at full capacity or idle,
2246 * don't include that group in power savings calculations
2247 */
2248 if (!power_savings_balance || sum_nr_running >= group_capacity
2249 || !sum_nr_running)
2250 goto group_next;
2251
2252 /*
2253 * Calculate the group which has the least non-idle load.
 2254 * This is the group from which we need to pick up the load
2255 * for saving power
2256 */
2257 if ((sum_nr_running < min_nr_running) ||
2258 (sum_nr_running == min_nr_running &&
2259 first_cpu(group->cpumask) <
2260 first_cpu(group_min->cpumask))) {
2261 group_min = group;
2262 min_nr_running = sum_nr_running;
2263 min_load_per_task = sum_weighted_load /
2264 sum_nr_running;
2265 }
2266
2267 /*
 2268 * Calculate the group which is running close to its
 2269 * capacity but still has some room to pick up load
 2270 * from other groups and save more power
2271 */
2272 if (sum_nr_running <= group_capacity - 1)
2273 if (sum_nr_running > leader_nr_running ||
2274 (sum_nr_running == leader_nr_running &&
2275 first_cpu(group->cpumask) >
2276 first_cpu(group_leader->cpumask))) {
2277 group_leader = group;
2278 leader_nr_running = sum_nr_running;
2279 }
2280
2281group_next:
2282#endif
2003 group = group->next; 2283 group = group->next;
2004 } while (group != sd->groups); 2284 } while (group != sd->groups);
2005 2285
2006 if (!busiest || this_load >= max_load || max_load <= SCHED_LOAD_SCALE) 2286 if (!busiest || this_load >= max_load || busiest_nr_running == 0)
2007 goto out_balanced; 2287 goto out_balanced;
2008 2288
2009 avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr; 2289 avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
@@ -2012,6 +2292,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2012 100*max_load <= sd->imbalance_pct*this_load) 2292 100*max_load <= sd->imbalance_pct*this_load)
2013 goto out_balanced; 2293 goto out_balanced;
2014 2294
2295 busiest_load_per_task /= busiest_nr_running;
2015 /* 2296 /*
2016 * We're trying to get all the cpus to the average_load, so we don't 2297 * We're trying to get all the cpus to the average_load, so we don't
2017 * want to push ourselves above the average load, nor do we wish to 2298 * want to push ourselves above the average load, nor do we wish to
@@ -2023,21 +2304,50 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2023 * by pulling tasks to us. Be careful of negative numbers as they'll 2304 * by pulling tasks to us. Be careful of negative numbers as they'll
2024 * appear as very large values with unsigned longs. 2305 * appear as very large values with unsigned longs.
2025 */ 2306 */
2307 if (max_load <= busiest_load_per_task)
2308 goto out_balanced;
2309
2310 /*
2311 * In the presence of smp nice balancing, certain scenarios can have
2312 * max load less than avg load(as we skip the groups at or below
2313 * its cpu_power, while calculating max_load..)
2314 */
2315 if (max_load < avg_load) {
2316 *imbalance = 0;
2317 goto small_imbalance;
2318 }
2026 2319
2027 /* Don't want to pull so many tasks that a group would go idle */ 2320 /* Don't want to pull so many tasks that a group would go idle */
2028 max_pull = min(max_load - avg_load, max_load - SCHED_LOAD_SCALE); 2321 max_pull = min(max_load - avg_load, max_load - busiest_load_per_task);
2029 2322
2030 /* How much load to actually move to equalise the imbalance */ 2323 /* How much load to actually move to equalise the imbalance */
2031 *imbalance = min(max_pull * busiest->cpu_power, 2324 *imbalance = min(max_pull * busiest->cpu_power,
2032 (avg_load - this_load) * this->cpu_power) 2325 (avg_load - this_load) * this->cpu_power)
2033 / SCHED_LOAD_SCALE; 2326 / SCHED_LOAD_SCALE;
2034 2327
2035 if (*imbalance < SCHED_LOAD_SCALE) { 2328 /*
2036 unsigned long pwr_now = 0, pwr_move = 0; 2329 * if *imbalance is less than the average load per runnable task
 2330 * there is no guarantee that any tasks will be moved, so we may need
 2331 * to bump its value to force at least one task to be
2332 * moved
2333 */
2334 if (*imbalance < busiest_load_per_task) {
2335 unsigned long pwr_now, pwr_move;
2037 unsigned long tmp; 2336 unsigned long tmp;
2337 unsigned int imbn;
2338
2339small_imbalance:
2340 pwr_move = pwr_now = 0;
2341 imbn = 2;
2342 if (this_nr_running) {
2343 this_load_per_task /= this_nr_running;
2344 if (busiest_load_per_task > this_load_per_task)
2345 imbn = 1;
2346 } else
2347 this_load_per_task = SCHED_LOAD_SCALE;
2038 2348
2039 if (max_load - this_load >= SCHED_LOAD_SCALE*2) { 2349 if (max_load - this_load >= busiest_load_per_task * imbn) {
2040 *imbalance = 1; 2350 *imbalance = busiest_load_per_task;
2041 return busiest; 2351 return busiest;
2042 } 2352 }
2043 2353
@@ -2047,39 +2357,47 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2047 * moving them. 2357 * moving them.
2048 */ 2358 */
2049 2359
2050 pwr_now += busiest->cpu_power*min(SCHED_LOAD_SCALE, max_load); 2360 pwr_now += busiest->cpu_power *
2051 pwr_now += this->cpu_power*min(SCHED_LOAD_SCALE, this_load); 2361 min(busiest_load_per_task, max_load);
2362 pwr_now += this->cpu_power *
2363 min(this_load_per_task, this_load);
2052 pwr_now /= SCHED_LOAD_SCALE; 2364 pwr_now /= SCHED_LOAD_SCALE;
2053 2365
2054 /* Amount of load we'd subtract */ 2366 /* Amount of load we'd subtract */
2055 tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/busiest->cpu_power; 2367 tmp = busiest_load_per_task*SCHED_LOAD_SCALE/busiest->cpu_power;
2056 if (max_load > tmp) 2368 if (max_load > tmp)
2057 pwr_move += busiest->cpu_power*min(SCHED_LOAD_SCALE, 2369 pwr_move += busiest->cpu_power *
2058 max_load - tmp); 2370 min(busiest_load_per_task, max_load - tmp);
2059 2371
2060 /* Amount of load we'd add */ 2372 /* Amount of load we'd add */
2061 if (max_load*busiest->cpu_power < 2373 if (max_load*busiest->cpu_power <
2062 SCHED_LOAD_SCALE*SCHED_LOAD_SCALE) 2374 busiest_load_per_task*SCHED_LOAD_SCALE)
2063 tmp = max_load*busiest->cpu_power/this->cpu_power; 2375 tmp = max_load*busiest->cpu_power/this->cpu_power;
2064 else 2376 else
2065 tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/this->cpu_power; 2377 tmp = busiest_load_per_task*SCHED_LOAD_SCALE/this->cpu_power;
2066 pwr_move += this->cpu_power*min(SCHED_LOAD_SCALE, this_load + tmp); 2378 pwr_move += this->cpu_power*min(this_load_per_task, this_load + tmp);
2067 pwr_move /= SCHED_LOAD_SCALE; 2379 pwr_move /= SCHED_LOAD_SCALE;
2068 2380
2069 /* Move if we gain throughput */ 2381 /* Move if we gain throughput */
2070 if (pwr_move <= pwr_now) 2382 if (pwr_move <= pwr_now)
2071 goto out_balanced; 2383 goto out_balanced;
2072 2384
2073 *imbalance = 1; 2385 *imbalance = busiest_load_per_task;
2074 return busiest;
2075 } 2386 }
2076 2387
2077 /* Get rid of the scaling factor, rounding down as we divide */
2078 *imbalance = *imbalance / SCHED_LOAD_SCALE;
2079 return busiest; 2388 return busiest;
2080 2389
2081out_balanced: 2390out_balanced:
2391#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
2392 if (idle == NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
2393 goto ret;
2082 2394
2395 if (this == group_leader && group_leader != group_min) {
2396 *imbalance = min_load_per_task;
2397 return group_min;
2398 }
2399ret:
2400#endif
2083 *imbalance = 0; 2401 *imbalance = 0;
2084 return NULL; 2402 return NULL;
2085} 2403}
@@ -2088,18 +2406,21 @@ out_balanced:
2088 * find_busiest_queue - find the busiest runqueue among the cpus in group. 2406 * find_busiest_queue - find the busiest runqueue among the cpus in group.
2089 */ 2407 */
2090static runqueue_t *find_busiest_queue(struct sched_group *group, 2408static runqueue_t *find_busiest_queue(struct sched_group *group,
2091 enum idle_type idle) 2409 enum idle_type idle, unsigned long imbalance)
2092{ 2410{
2093 unsigned long load, max_load = 0; 2411 unsigned long max_load = 0;
2094 runqueue_t *busiest = NULL; 2412 runqueue_t *busiest = NULL, *rqi;
2095 int i; 2413 int i;
2096 2414
2097 for_each_cpu_mask(i, group->cpumask) { 2415 for_each_cpu_mask(i, group->cpumask) {
2098 load = source_load(i, 0); 2416 rqi = cpu_rq(i);
2417
2418 if (rqi->nr_running == 1 && rqi->raw_weighted_load > imbalance)
2419 continue;
2099 2420
2100 if (load > max_load) { 2421 if (rqi->raw_weighted_load > max_load) {
2101 max_load = load; 2422 max_load = rqi->raw_weighted_load;
2102 busiest = cpu_rq(i); 2423 busiest = rqi;
2103 } 2424 }
2104 } 2425 }
2105 2426
@@ -2112,6 +2433,7 @@ static runqueue_t *find_busiest_queue(struct sched_group *group,
2112 */ 2433 */
2113#define MAX_PINNED_INTERVAL 512 2434#define MAX_PINNED_INTERVAL 512
2114 2435
2436#define minus_1_or_zero(n) ((n) > 0 ? (n) - 1 : 0)
2115/* 2437/*
2116 * Check this_cpu to ensure it is balanced within domain. Attempt to move 2438 * Check this_cpu to ensure it is balanced within domain. Attempt to move
2117 * tasks if there is an imbalance. 2439 * tasks if there is an imbalance.
@@ -2128,7 +2450,8 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
2128 int active_balance = 0; 2450 int active_balance = 0;
2129 int sd_idle = 0; 2451 int sd_idle = 0;
2130 2452
2131 if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER) 2453 if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
2454 !sched_smt_power_savings)
2132 sd_idle = 1; 2455 sd_idle = 1;
2133 2456
2134 schedstat_inc(sd, lb_cnt[idle]); 2457 schedstat_inc(sd, lb_cnt[idle]);
@@ -2139,7 +2462,7 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
2139 goto out_balanced; 2462 goto out_balanced;
2140 } 2463 }
2141 2464
2142 busiest = find_busiest_queue(group, idle); 2465 busiest = find_busiest_queue(group, idle, imbalance);
2143 if (!busiest) { 2466 if (!busiest) {
2144 schedstat_inc(sd, lb_nobusyq[idle]); 2467 schedstat_inc(sd, lb_nobusyq[idle]);
2145 goto out_balanced; 2468 goto out_balanced;
@@ -2159,6 +2482,7 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
2159 */ 2482 */
2160 double_rq_lock(this_rq, busiest); 2483 double_rq_lock(this_rq, busiest);
2161 nr_moved = move_tasks(this_rq, this_cpu, busiest, 2484 nr_moved = move_tasks(this_rq, this_cpu, busiest,
2485 minus_1_or_zero(busiest->nr_running),
2162 imbalance, sd, idle, &all_pinned); 2486 imbalance, sd, idle, &all_pinned);
2163 double_rq_unlock(this_rq, busiest); 2487 double_rq_unlock(this_rq, busiest);
2164 2488
@@ -2216,7 +2540,8 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
2216 sd->balance_interval *= 2; 2540 sd->balance_interval *= 2;
2217 } 2541 }
2218 2542
2219 if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER) 2543 if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
2544 !sched_smt_power_savings)
2220 return -1; 2545 return -1;
2221 return nr_moved; 2546 return nr_moved;
2222 2547
@@ -2231,7 +2556,7 @@ out_one_pinned:
2231 (sd->balance_interval < sd->max_interval)) 2556 (sd->balance_interval < sd->max_interval))
2232 sd->balance_interval *= 2; 2557 sd->balance_interval *= 2;
2233 2558
2234 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER) 2559 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && !sched_smt_power_savings)
2235 return -1; 2560 return -1;
2236 return 0; 2561 return 0;
2237} 2562}
@@ -2252,7 +2577,7 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
2252 int nr_moved = 0; 2577 int nr_moved = 0;
2253 int sd_idle = 0; 2578 int sd_idle = 0;
2254 2579
2255 if (sd->flags & SD_SHARE_CPUPOWER) 2580 if (sd->flags & SD_SHARE_CPUPOWER && !sched_smt_power_savings)
2256 sd_idle = 1; 2581 sd_idle = 1;
2257 2582
2258 schedstat_inc(sd, lb_cnt[NEWLY_IDLE]); 2583 schedstat_inc(sd, lb_cnt[NEWLY_IDLE]);
@@ -2262,7 +2587,7 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
2262 goto out_balanced; 2587 goto out_balanced;
2263 } 2588 }
2264 2589
2265 busiest = find_busiest_queue(group, NEWLY_IDLE); 2590 busiest = find_busiest_queue(group, NEWLY_IDLE, imbalance);
2266 if (!busiest) { 2591 if (!busiest) {
2267 schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]); 2592 schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]);
2268 goto out_balanced; 2593 goto out_balanced;
@@ -2277,6 +2602,7 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
2277 /* Attempt to move tasks */ 2602 /* Attempt to move tasks */
2278 double_lock_balance(this_rq, busiest); 2603 double_lock_balance(this_rq, busiest);
2279 nr_moved = move_tasks(this_rq, this_cpu, busiest, 2604 nr_moved = move_tasks(this_rq, this_cpu, busiest,
2605 minus_1_or_zero(busiest->nr_running),
2280 imbalance, sd, NEWLY_IDLE, NULL); 2606 imbalance, sd, NEWLY_IDLE, NULL);
2281 spin_unlock(&busiest->lock); 2607 spin_unlock(&busiest->lock);
2282 } 2608 }
@@ -2292,7 +2618,7 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
2292 2618
2293out_balanced: 2619out_balanced:
2294 schedstat_inc(sd, lb_balanced[NEWLY_IDLE]); 2620 schedstat_inc(sd, lb_balanced[NEWLY_IDLE]);
2295 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER) 2621 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && !sched_smt_power_savings)
2296 return -1; 2622 return -1;
2297 sd->nr_balance_failed = 0; 2623 sd->nr_balance_failed = 0;
2298 return 0; 2624 return 0;
@@ -2347,17 +2673,19 @@ static void active_load_balance(runqueue_t *busiest_rq, int busiest_cpu)
2347 double_lock_balance(busiest_rq, target_rq); 2673 double_lock_balance(busiest_rq, target_rq);
2348 2674
2349 /* Search for an sd spanning us and the target CPU. */ 2675 /* Search for an sd spanning us and the target CPU. */
2350 for_each_domain(target_cpu, sd) 2676 for_each_domain(target_cpu, sd) {
2351 if ((sd->flags & SD_LOAD_BALANCE) && 2677 if ((sd->flags & SD_LOAD_BALANCE) &&
2352 cpu_isset(busiest_cpu, sd->span)) 2678 cpu_isset(busiest_cpu, sd->span))
2353 break; 2679 break;
2680 }
2354 2681
2355 if (unlikely(sd == NULL)) 2682 if (unlikely(sd == NULL))
2356 goto out; 2683 goto out;
2357 2684
2358 schedstat_inc(sd, alb_cnt); 2685 schedstat_inc(sd, alb_cnt);
2359 2686
2360 if (move_tasks(target_rq, target_cpu, busiest_rq, 1, sd, SCHED_IDLE, NULL)) 2687 if (move_tasks(target_rq, target_cpu, busiest_rq, 1,
2688 RTPRIO_TO_LOAD_WEIGHT(100), sd, SCHED_IDLE, NULL))
2361 schedstat_inc(sd, alb_pushed); 2689 schedstat_inc(sd, alb_pushed);
2362 else 2690 else
2363 schedstat_inc(sd, alb_failed); 2691 schedstat_inc(sd, alb_failed);
@@ -2385,7 +2713,7 @@ static void rebalance_tick(int this_cpu, runqueue_t *this_rq,
2385 struct sched_domain *sd; 2713 struct sched_domain *sd;
2386 int i; 2714 int i;
2387 2715
2388 this_load = this_rq->nr_running * SCHED_LOAD_SCALE; 2716 this_load = this_rq->raw_weighted_load;
2389 /* Update our load */ 2717 /* Update our load */
2390 for (i = 0; i < 3; i++) { 2718 for (i = 0; i < 3; i++) {
2391 unsigned long new_load = this_load; 2719 unsigned long new_load = this_load;
@@ -2686,48 +3014,35 @@ static inline void wakeup_busy_runqueue(runqueue_t *rq)
2686 resched_task(rq->idle); 3014 resched_task(rq->idle);
2687} 3015}
2688 3016
2689static void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq) 3017/*
3018 * Called with interrupt disabled and this_rq's runqueue locked.
3019 */
3020static void wake_sleeping_dependent(int this_cpu)
2690{ 3021{
2691 struct sched_domain *tmp, *sd = NULL; 3022 struct sched_domain *tmp, *sd = NULL;
2692 cpumask_t sibling_map;
2693 int i; 3023 int i;
2694 3024
2695 for_each_domain(this_cpu, tmp) 3025 for_each_domain(this_cpu, tmp) {
2696 if (tmp->flags & SD_SHARE_CPUPOWER) 3026 if (tmp->flags & SD_SHARE_CPUPOWER) {
2697 sd = tmp; 3027 sd = tmp;
3028 break;
3029 }
3030 }
2698 3031
2699 if (!sd) 3032 if (!sd)
2700 return; 3033 return;
2701 3034
2702 /* 3035 for_each_cpu_mask(i, sd->span) {
2703 * Unlock the current runqueue because we have to lock in
2704 * CPU order to avoid deadlocks. Caller knows that we might
2705 * unlock. We keep IRQs disabled.
2706 */
2707 spin_unlock(&this_rq->lock);
2708
2709 sibling_map = sd->span;
2710
2711 for_each_cpu_mask(i, sibling_map)
2712 spin_lock(&cpu_rq(i)->lock);
2713 /*
2714 * We clear this CPU from the mask. This both simplifies the
2715 * inner loop and keps this_rq locked when we exit:
2716 */
2717 cpu_clear(this_cpu, sibling_map);
2718
2719 for_each_cpu_mask(i, sibling_map) {
2720 runqueue_t *smt_rq = cpu_rq(i); 3036 runqueue_t *smt_rq = cpu_rq(i);
2721 3037
3038 if (i == this_cpu)
3039 continue;
3040 if (unlikely(!spin_trylock(&smt_rq->lock)))
3041 continue;
3042
2722 wakeup_busy_runqueue(smt_rq); 3043 wakeup_busy_runqueue(smt_rq);
3044 spin_unlock(&smt_rq->lock);
2723 } 3045 }
2724
2725 for_each_cpu_mask(i, sibling_map)
2726 spin_unlock(&cpu_rq(i)->lock);
2727 /*
2728 * We exit with this_cpu's rq still held and IRQs
2729 * still disabled:
2730 */
2731} 3046}
2732 3047
2733/* 3048/*
@@ -2740,52 +3055,46 @@ static inline unsigned long smt_slice(task_t *p, struct sched_domain *sd)
2740 return p->time_slice * (100 - sd->per_cpu_gain) / 100; 3055 return p->time_slice * (100 - sd->per_cpu_gain) / 100;
2741} 3056}
2742 3057
2743static int dependent_sleeper(int this_cpu, runqueue_t *this_rq) 3058/*
 3059 * To minimise lock contention and avoid having to drop this_rq's runqueue
 3060 * lock, we only trylock the sibling runqueues and bypass those runqueues
 3061 * whose lock we fail to acquire. As we only trylock, the normal locking
 3062 * order does not need to be obeyed.
3063 */
3064static int dependent_sleeper(int this_cpu, runqueue_t *this_rq, task_t *p)
2744{ 3065{
2745 struct sched_domain *tmp, *sd = NULL; 3066 struct sched_domain *tmp, *sd = NULL;
2746 cpumask_t sibling_map;
2747 prio_array_t *array;
2748 int ret = 0, i; 3067 int ret = 0, i;
2749 task_t *p;
2750 3068
2751 for_each_domain(this_cpu, tmp) 3069 /* kernel/rt threads do not participate in dependent sleeping */
2752 if (tmp->flags & SD_SHARE_CPUPOWER) 3070 if (!p->mm || rt_task(p))
3071 return 0;
3072
3073 for_each_domain(this_cpu, tmp) {
3074 if (tmp->flags & SD_SHARE_CPUPOWER) {
2753 sd = tmp; 3075 sd = tmp;
3076 break;
3077 }
3078 }
2754 3079
2755 if (!sd) 3080 if (!sd)
2756 return 0; 3081 return 0;
2757 3082
2758 /* 3083 for_each_cpu_mask(i, sd->span) {
2759 * The same locking rules and details apply as for 3084 runqueue_t *smt_rq;
2760 * wake_sleeping_dependent(): 3085 task_t *smt_curr;
2761 */
2762 spin_unlock(&this_rq->lock);
2763 sibling_map = sd->span;
2764 for_each_cpu_mask(i, sibling_map)
2765 spin_lock(&cpu_rq(i)->lock);
2766 cpu_clear(this_cpu, sibling_map);
2767 3086
2768 /* 3087 if (i == this_cpu)
2769 * Establish next task to be run - it might have gone away because 3088 continue;
2770 * we released the runqueue lock above:
2771 */
2772 if (!this_rq->nr_running)
2773 goto out_unlock;
2774 array = this_rq->active;
2775 if (!array->nr_active)
2776 array = this_rq->expired;
2777 BUG_ON(!array->nr_active);
2778 3089
2779 p = list_entry(array->queue[sched_find_first_bit(array->bitmap)].next, 3090 smt_rq = cpu_rq(i);
2780 task_t, run_list); 3091 if (unlikely(!spin_trylock(&smt_rq->lock)))
3092 continue;
2781 3093
2782 for_each_cpu_mask(i, sibling_map) { 3094 smt_curr = smt_rq->curr;
2783 runqueue_t *smt_rq = cpu_rq(i);
2784 task_t *smt_curr = smt_rq->curr;
2785 3095
2786 /* Kernel threads do not participate in dependent sleeping */ 3096 if (!smt_curr->mm)
2787 if (!p->mm || !smt_curr->mm || rt_task(p)) 3097 goto unlock;
2788 goto check_smt_task;
2789 3098
2790 /* 3099 /*
2791 * If a user task with lower static priority than the 3100 * If a user task with lower static priority than the
@@ -2803,49 +3112,24 @@ static int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
2803 if ((jiffies % DEF_TIMESLICE) > 3112 if ((jiffies % DEF_TIMESLICE) >
2804 (sd->per_cpu_gain * DEF_TIMESLICE / 100)) 3113 (sd->per_cpu_gain * DEF_TIMESLICE / 100))
2805 ret = 1; 3114 ret = 1;
2806 } else 3115 } else {
2807 if (smt_curr->static_prio < p->static_prio && 3116 if (smt_curr->static_prio < p->static_prio &&
2808 !TASK_PREEMPTS_CURR(p, smt_rq) && 3117 !TASK_PREEMPTS_CURR(p, smt_rq) &&
2809 smt_slice(smt_curr, sd) > task_timeslice(p)) 3118 smt_slice(smt_curr, sd) > task_timeslice(p))
2810 ret = 1; 3119 ret = 1;
2811
2812check_smt_task:
2813 if ((!smt_curr->mm && smt_curr != smt_rq->idle) ||
2814 rt_task(smt_curr))
2815 continue;
2816 if (!p->mm) {
2817 wakeup_busy_runqueue(smt_rq);
2818 continue;
2819 }
2820
2821 /*
2822 * Reschedule a lower priority task on the SMT sibling for
2823 * it to be put to sleep, or wake it up if it has been put to
2824 * sleep for priority reasons to see if it should run now.
2825 */
2826 if (rt_task(p)) {
2827 if ((jiffies % DEF_TIMESLICE) >
2828 (sd->per_cpu_gain * DEF_TIMESLICE / 100))
2829 resched_task(smt_curr);
2830 } else {
2831 if (TASK_PREEMPTS_CURR(p, smt_rq) &&
2832 smt_slice(p, sd) > task_timeslice(smt_curr))
2833 resched_task(smt_curr);
2834 else
2835 wakeup_busy_runqueue(smt_rq);
2836 } 3120 }
3121unlock:
3122 spin_unlock(&smt_rq->lock);
2837 } 3123 }
2838out_unlock:
2839 for_each_cpu_mask(i, sibling_map)
2840 spin_unlock(&cpu_rq(i)->lock);
2841 return ret; 3124 return ret;
2842} 3125}
2843#else 3126#else
2844static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq) 3127static inline void wake_sleeping_dependent(int this_cpu)
2845{ 3128{
2846} 3129}
2847 3130
2848static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq) 3131static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq,
3132 task_t *p)
2849{ 3133{
2850 return 0; 3134 return 0;
2851} 3135}
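
Both rewritten SMT helpers above follow the same trylock-and-bypass scheme: keep this_rq's lock held throughout, attempt each sibling's lock, and simply skip any sibling that is busy. A rough pthread sketch of that scheme (local names, not the kernel's):

#include <pthread.h>

struct rq { pthread_mutex_t lock; int needs_wakeup; };

/* Visit every sibling runqueue; skip any we cannot lock immediately. */
static void visit_siblings(struct rq *rqs, int nr, int this_cpu)
{
	for (int i = 0; i < nr; i++) {
		if (i == this_cpu)
			continue;
		if (pthread_mutex_trylock(&rqs[i].lock) != 0)
			continue;                /* contended: bypass this sibling */
		rqs[i].needs_wakeup = 1;         /* stands in for wakeup_busy_runqueue() */
		pthread_mutex_unlock(&rqs[i].lock);
	}
}

int main(void)
{
	struct rq rqs[2] = {
		{ PTHREAD_MUTEX_INITIALIZER, 0 },
		{ PTHREAD_MUTEX_INITIALIZER, 0 },
	};

	visit_siblings(rqs, 2, 0);
	return rqs[1].needs_wakeup ? 0 : 1;
}
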
@@ -2967,32 +3251,13 @@ need_resched_nonpreemptible:
2967 3251
2968 cpu = smp_processor_id(); 3252 cpu = smp_processor_id();
2969 if (unlikely(!rq->nr_running)) { 3253 if (unlikely(!rq->nr_running)) {
2970go_idle:
2971 idle_balance(cpu, rq); 3254 idle_balance(cpu, rq);
2972 if (!rq->nr_running) { 3255 if (!rq->nr_running) {
2973 next = rq->idle; 3256 next = rq->idle;
2974 rq->expired_timestamp = 0; 3257 rq->expired_timestamp = 0;
2975 wake_sleeping_dependent(cpu, rq); 3258 wake_sleeping_dependent(cpu);
2976 /*
2977 * wake_sleeping_dependent() might have released
2978 * the runqueue, so break out if we got new
2979 * tasks meanwhile:
2980 */
2981 if (!rq->nr_running)
2982 goto switch_tasks;
2983 }
2984 } else {
2985 if (dependent_sleeper(cpu, rq)) {
2986 next = rq->idle;
2987 goto switch_tasks; 3259 goto switch_tasks;
2988 } 3260 }
2989 /*
2990 * dependent_sleeper() releases and reacquires the runqueue
2991 * lock, hence go into the idle loop if the rq went
2992 * empty meanwhile:
2993 */
2994 if (unlikely(!rq->nr_running))
2995 goto go_idle;
2996 } 3261 }
2997 3262
2998 array = rq->active; 3263 array = rq->active;
@@ -3030,6 +3295,8 @@ go_idle:
3030 } 3295 }
3031 } 3296 }
3032 next->sleep_type = SLEEP_NORMAL; 3297 next->sleep_type = SLEEP_NORMAL;
3298 if (dependent_sleeper(cpu, rq, next))
3299 next = rq->idle;
3033switch_tasks: 3300switch_tasks:
3034 if (next == rq->idle) 3301 if (next == rq->idle)
3035 schedstat_inc(rq, sched_goidle); 3302 schedstat_inc(rq, sched_goidle);
@@ -3473,12 +3740,65 @@ long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
3473 3740
3474EXPORT_SYMBOL(sleep_on_timeout); 3741EXPORT_SYMBOL(sleep_on_timeout);
3475 3742
3743#ifdef CONFIG_RT_MUTEXES
3744
3745/*
3746 * rt_mutex_setprio - set the current priority of a task
3747 * @p: task
3748 * @prio: prio value (kernel-internal form)
3749 *
3750 * This function changes the 'effective' priority of a task. It does
3751 * not touch ->normal_prio like __setscheduler().
3752 *
3753 * Used by the rt_mutex code to implement priority inheritance logic.
3754 */
3755void rt_mutex_setprio(task_t *p, int prio)
3756{
3757 unsigned long flags;
3758 prio_array_t *array;
3759 runqueue_t *rq;
3760 int oldprio;
3761
3762 BUG_ON(prio < 0 || prio > MAX_PRIO);
3763
3764 rq = task_rq_lock(p, &flags);
3765
3766 oldprio = p->prio;
3767 array = p->array;
3768 if (array)
3769 dequeue_task(p, array);
3770 p->prio = prio;
3771
3772 if (array) {
3773 /*
3774 * If changing to an RT priority then queue it
3775 * in the active array!
3776 */
3777 if (rt_task(p))
3778 array = rq->active;
3779 enqueue_task(p, array);
3780 /*
3781 * Reschedule if we are currently running on this runqueue and
3782 * our priority decreased, or if we are not currently running on
3783 * this runqueue and our priority is higher than the current's
3784 */
3785 if (task_running(rq, p)) {
3786 if (p->prio > oldprio)
3787 resched_task(rq->curr);
3788 } else if (TASK_PREEMPTS_CURR(p, rq))
3789 resched_task(rq->curr);
3790 }
3791 task_rq_unlock(rq, &flags);
3792}
3793
3794#endif
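
rt_mutex_setprio() is the hook through which a lock owner is temporarily boosted to its highest waiter's priority. The userspace-visible counterpart of this kind of machinery is a priority-inheriting POSIX mutex; the hedged example below only shows the standard request for that protocol and makes no claim about how this particular tree wires it up.

#define _GNU_SOURCE
#include <pthread.h>
#include <stdio.h>

int main(void)
{
	pthread_mutexattr_t attr;
	pthread_mutex_t lock;
	int err;

	pthread_mutexattr_init(&attr);
	/* Request priority inheritance: a low-priority owner is boosted to the
	 * priority of the highest-priority waiter while it holds the lock. */
	err = pthread_mutexattr_setprotocol(&attr, PTHREAD_PRIO_INHERIT);
	if (err != 0)
		fprintf(stderr, "PTHREAD_PRIO_INHERIT unsupported: %d\n", err);

	pthread_mutex_init(&lock, &attr);
	pthread_mutex_lock(&lock);
	/* ... critical section ... */
	pthread_mutex_unlock(&lock);

	pthread_mutex_destroy(&lock);
	pthread_mutexattr_destroy(&attr);
	return 0;
}
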
3795
3476void set_user_nice(task_t *p, long nice) 3796void set_user_nice(task_t *p, long nice)
3477{ 3797{
3478 unsigned long flags; 3798 unsigned long flags;
3479 prio_array_t *array; 3799 prio_array_t *array;
3480 runqueue_t *rq; 3800 runqueue_t *rq;
3481 int old_prio, new_prio, delta; 3801 int old_prio, delta;
3482 3802
3483 if (TASK_NICE(p) == nice || nice < -20 || nice > 19) 3803 if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
3484 return; 3804 return;
@@ -3493,22 +3813,25 @@ void set_user_nice(task_t *p, long nice)
3493 * it wont have any effect on scheduling until the task is 3813 * it wont have any effect on scheduling until the task is
3494 * not SCHED_NORMAL/SCHED_BATCH: 3814 * not SCHED_NORMAL/SCHED_BATCH:
3495 */ 3815 */
3496 if (rt_task(p)) { 3816 if (has_rt_policy(p)) {
3497 p->static_prio = NICE_TO_PRIO(nice); 3817 p->static_prio = NICE_TO_PRIO(nice);
3498 goto out_unlock; 3818 goto out_unlock;
3499 } 3819 }
3500 array = p->array; 3820 array = p->array;
3501 if (array) 3821 if (array) {
3502 dequeue_task(p, array); 3822 dequeue_task(p, array);
3823 dec_raw_weighted_load(rq, p);
3824 }
3503 3825
3504 old_prio = p->prio;
3505 new_prio = NICE_TO_PRIO(nice);
3506 delta = new_prio - old_prio;
3507 p->static_prio = NICE_TO_PRIO(nice); 3826 p->static_prio = NICE_TO_PRIO(nice);
3508 p->prio += delta; 3827 set_load_weight(p);
3828 old_prio = p->prio;
3829 p->prio = effective_prio(p);
3830 delta = p->prio - old_prio;
3509 3831
3510 if (array) { 3832 if (array) {
3511 enqueue_task(p, array); 3833 enqueue_task(p, array);
3834 inc_raw_weighted_load(rq, p);
3512 /* 3835 /*
3513 * If the task increased its priority or is running and 3836 * If the task increased its priority or is running and
3514 * lowered its priority, then reschedule its CPU: 3837 * lowered its priority, then reschedule its CPU:
@@ -3519,7 +3842,6 @@ void set_user_nice(task_t *p, long nice)
3519out_unlock: 3842out_unlock:
3520 task_rq_unlock(rq, &flags); 3843 task_rq_unlock(rq, &flags);
3521} 3844}
3522
3523EXPORT_SYMBOL(set_user_nice); 3845EXPORT_SYMBOL(set_user_nice);
3524 3846
3525/* 3847/*
@@ -3634,16 +3956,15 @@ static void __setscheduler(struct task_struct *p, int policy, int prio)
3634 BUG_ON(p->array); 3956 BUG_ON(p->array);
3635 p->policy = policy; 3957 p->policy = policy;
3636 p->rt_priority = prio; 3958 p->rt_priority = prio;
3637 if (policy != SCHED_NORMAL && policy != SCHED_BATCH) { 3959 p->normal_prio = normal_prio(p);
3638 p->prio = MAX_RT_PRIO-1 - p->rt_priority; 3960 /* we are holding p->pi_lock already */
3639 } else { 3961 p->prio = rt_mutex_getprio(p);
3640 p->prio = p->static_prio; 3962 /*
3641 /* 3963 * SCHED_BATCH tasks are treated as perpetual CPU hogs:
3642 * SCHED_BATCH tasks are treated as perpetual CPU hogs: 3964 */
3643 */ 3965 if (policy == SCHED_BATCH)
3644 if (policy == SCHED_BATCH) 3966 p->sleep_avg = 0;
3645 p->sleep_avg = 0; 3967 set_load_weight(p);
3646 }
3647} 3968}
3648 3969
3649/** 3970/**
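__setscheduler() now derives p->prio from rt_mutex_getprio() so that a boost inherited from an rt-mutex waiter survives a policy change. The helper itself lives in kernel/rtmutex.c, outside this hunk, so treat the body below as a sketch of its expected shape rather than the definitive implementation; task_has_pi_waiters() and task_top_pi_waiter() come from the new rtmutex_common.h.

	/*
	 * Sketch: return the priority the task should run at -- its normal
	 * priority unless a PI waiter demands something higher.
	 */
	int rt_mutex_getprio(task_t *task)
	{
		if (likely(!task_has_pi_waiters(task)))
			return task->normal_prio;

		return min(task_top_pi_waiter(task)->pi_list_entry.prio,
			   task->normal_prio);
	}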
@@ -3662,6 +3983,8 @@ int sched_setscheduler(struct task_struct *p, int policy,
3662 unsigned long flags; 3983 unsigned long flags;
3663 runqueue_t *rq; 3984 runqueue_t *rq;
3664 3985
3986 /* may grab non-irq protected spin_locks */
3987 BUG_ON(in_interrupt());
3665recheck: 3988recheck:
3666 /* double check policy once rq lock held */ 3989 /* double check policy once rq lock held */
3667 if (policy < 0) 3990 if (policy < 0)
@@ -3710,14 +4033,20 @@ recheck:
3710 if (retval) 4033 if (retval)
3711 return retval; 4034 return retval;
3712 /* 4035 /*
4036 * make sure no PI-waiters arrive (or leave) while we are
4037 * changing the priority of the task:
4038 */
4039 spin_lock_irqsave(&p->pi_lock, flags);
4040 /*
 3713 * To be able to change p->policy safely, the appropriate 4041 * To be able to change p->policy safely, the appropriate
3714 * runqueue lock must be held. 4042 * runqueue lock must be held.
3715 */ 4043 */
3716 rq = task_rq_lock(p, &flags); 4044 rq = __task_rq_lock(p);
3717 /* recheck policy now with rq lock held */ 4045 /* recheck policy now with rq lock held */
3718 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { 4046 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
3719 policy = oldpolicy = -1; 4047 policy = oldpolicy = -1;
3720 task_rq_unlock(rq, &flags); 4048 __task_rq_unlock(rq);
4049 spin_unlock_irqrestore(&p->pi_lock, flags);
3721 goto recheck; 4050 goto recheck;
3722 } 4051 }
3723 array = p->array; 4052 array = p->array;
@@ -3738,7 +4067,11 @@ recheck:
3738 } else if (TASK_PREEMPTS_CURR(p, rq)) 4067 } else if (TASK_PREEMPTS_CURR(p, rq))
3739 resched_task(rq->curr); 4068 resched_task(rq->curr);
3740 } 4069 }
3741 task_rq_unlock(rq, &flags); 4070 __task_rq_unlock(rq);
4071 spin_unlock_irqrestore(&p->pi_lock, flags);
4072
4073 rt_mutex_adjust_pi(p);
4074
3742 return 0; 4075 return 0;
3743} 4076}
3744EXPORT_SYMBOL_GPL(sched_setscheduler); 4077EXPORT_SYMBOL_GPL(sched_setscheduler);
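The hunk above establishes the lock ordering for PI-aware policy changes: p->pi_lock is taken first, the runqueue lock nests inside it, and rt_mutex_adjust_pi() runs only after both are dropped because it may need to take other tasks' locks. Condensed into one place (dequeue/requeue and error handling stripped, so this is a sketch of the control flow, not the full function):

	spin_lock_irqsave(&p->pi_lock, flags);	/* keep PI waiters from arriving or leaving */
	rq = __task_rq_lock(p);			/* runqueue lock nests inside pi_lock */
	/* ... dequeue, __setscheduler(), requeue, resched ... */
	__task_rq_unlock(rq);
	spin_unlock_irqrestore(&p->pi_lock, flags);

	rt_mutex_adjust_pi(p);			/* may take other locks, so only after unlocking */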
@@ -3760,8 +4093,10 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
3760 read_unlock_irq(&tasklist_lock); 4093 read_unlock_irq(&tasklist_lock);
3761 return -ESRCH; 4094 return -ESRCH;
3762 } 4095 }
3763 retval = sched_setscheduler(p, policy, &lparam); 4096 get_task_struct(p);
3764 read_unlock_irq(&tasklist_lock); 4097 read_unlock_irq(&tasklist_lock);
4098 retval = sched_setscheduler(p, policy, &lparam);
4099 put_task_struct(p);
3765 return retval; 4100 return retval;
3766} 4101}
3767 4102
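do_sched_setscheduler() can no longer call sched_setscheduler() under tasklist_lock, since the latter now takes p->pi_lock and (per the new BUG_ON) may grab non-irq-protected spinlocks; instead it pins the task with a reference before dropping the lock. The same idiom, written out as a generic pattern (find_process_by_pid() is the static helper already used in this file):

	read_lock_irq(&tasklist_lock);
	p = find_process_by_pid(pid);
	if (p)
		get_task_struct(p);	/* reference keeps *p alive after unlock */
	read_unlock_irq(&tasklist_lock);
	if (!p)
		return -ESRCH;

	retval = sched_setscheduler(p, policy, &lparam);
	put_task_struct(p);		/* drop the reference */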
@@ -3886,6 +4221,10 @@ long sched_setaffinity(pid_t pid, cpumask_t new_mask)
3886 !capable(CAP_SYS_NICE)) 4221 !capable(CAP_SYS_NICE))
3887 goto out_unlock; 4222 goto out_unlock;
3888 4223
4224 retval = security_task_setscheduler(p, 0, NULL);
4225 if (retval)
4226 goto out_unlock;
4227
3889 cpus_allowed = cpuset_cpus_allowed(p); 4228 cpus_allowed = cpuset_cpus_allowed(p);
3890 cpus_and(new_mask, new_mask, cpus_allowed); 4229 cpus_and(new_mask, new_mask, cpus_allowed);
3891 retval = set_cpus_allowed(p, new_mask); 4230 retval = set_cpus_allowed(p, new_mask);
@@ -3954,7 +4293,10 @@ long sched_getaffinity(pid_t pid, cpumask_t *mask)
3954 if (!p) 4293 if (!p)
3955 goto out_unlock; 4294 goto out_unlock;
3956 4295
3957 retval = 0; 4296 retval = security_task_getscheduler(p);
4297 if (retval)
4298 goto out_unlock;
4299
3958 cpus_and(*mask, p->cpus_allowed, cpu_online_map); 4300 cpus_and(*mask, p->cpus_allowed, cpu_online_map);
3959 4301
3960out_unlock: 4302out_unlock:
@@ -4046,6 +4388,9 @@ asmlinkage long sys_sched_yield(void)
4046 4388
4047static inline void __cond_resched(void) 4389static inline void __cond_resched(void)
4048{ 4390{
4391#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
4392 __might_sleep(__FILE__, __LINE__);
4393#endif
4049 /* 4394 /*
4050 * The BKS might be reacquired before we have dropped 4395 * The BKS might be reacquired before we have dropped
4051 * PREEMPT_ACTIVE, which could trigger a second 4396 * PREEMPT_ACTIVE, which could trigger a second
@@ -4142,7 +4487,7 @@ EXPORT_SYMBOL(yield);
4142 */ 4487 */
4143void __sched io_schedule(void) 4488void __sched io_schedule(void)
4144{ 4489{
4145 struct runqueue *rq = &per_cpu(runqueues, raw_smp_processor_id()); 4490 struct runqueue *rq = &__raw_get_cpu_var(runqueues);
4146 4491
4147 atomic_inc(&rq->nr_iowait); 4492 atomic_inc(&rq->nr_iowait);
4148 schedule(); 4493 schedule();
@@ -4153,7 +4498,7 @@ EXPORT_SYMBOL(io_schedule);
4153 4498
4154long __sched io_schedule_timeout(long timeout) 4499long __sched io_schedule_timeout(long timeout)
4155{ 4500{
4156 struct runqueue *rq = &per_cpu(runqueues, raw_smp_processor_id()); 4501 struct runqueue *rq = &__raw_get_cpu_var(runqueues);
4157 long ret; 4502 long ret;
4158 4503
4159 atomic_inc(&rq->nr_iowait); 4504 atomic_inc(&rq->nr_iowait);
@@ -4237,7 +4582,7 @@ long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval)
4237 if (retval) 4582 if (retval)
4238 goto out_unlock; 4583 goto out_unlock;
4239 4584
4240 jiffies_to_timespec(p->policy & SCHED_FIFO ? 4585 jiffies_to_timespec(p->policy == SCHED_FIFO ?
4241 0 : task_timeslice(p), &t); 4586 0 : task_timeslice(p), &t);
4242 read_unlock(&tasklist_lock); 4587 read_unlock(&tasklist_lock);
4243 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; 4588 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
@@ -4363,7 +4708,7 @@ void __devinit init_idle(task_t *idle, int cpu)
4363 idle->timestamp = sched_clock(); 4708 idle->timestamp = sched_clock();
4364 idle->sleep_avg = 0; 4709 idle->sleep_avg = 0;
4365 idle->array = NULL; 4710 idle->array = NULL;
4366 idle->prio = MAX_PRIO; 4711 idle->prio = idle->normal_prio = MAX_PRIO;
4367 idle->state = TASK_RUNNING; 4712 idle->state = TASK_RUNNING;
4368 idle->cpus_allowed = cpumask_of_cpu(cpu); 4713 idle->cpus_allowed = cpumask_of_cpu(cpu);
4369 set_task_cpu(idle, cpu); 4714 set_task_cpu(idle, cpu);
@@ -4459,13 +4804,16 @@ EXPORT_SYMBOL_GPL(set_cpus_allowed);
4459 * 4804 *
4460 * So we race with normal scheduler movements, but that's OK, as long 4805 * So we race with normal scheduler movements, but that's OK, as long
4461 * as the task is no longer on this CPU. 4806 * as the task is no longer on this CPU.
4807 *
4808 * Returns non-zero if task was successfully migrated.
4462 */ 4809 */
4463static void __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) 4810static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
4464{ 4811{
4465 runqueue_t *rq_dest, *rq_src; 4812 runqueue_t *rq_dest, *rq_src;
4813 int ret = 0;
4466 4814
4467 if (unlikely(cpu_is_offline(dest_cpu))) 4815 if (unlikely(cpu_is_offline(dest_cpu)))
4468 return; 4816 return ret;
4469 4817
4470 rq_src = cpu_rq(src_cpu); 4818 rq_src = cpu_rq(src_cpu);
4471 rq_dest = cpu_rq(dest_cpu); 4819 rq_dest = cpu_rq(dest_cpu);
@@ -4493,9 +4841,10 @@ static void __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
4493 if (TASK_PREEMPTS_CURR(p, rq_dest)) 4841 if (TASK_PREEMPTS_CURR(p, rq_dest))
4494 resched_task(rq_dest->curr); 4842 resched_task(rq_dest->curr);
4495 } 4843 }
4496 4844 ret = 1;
4497out: 4845out:
4498 double_rq_unlock(rq_src, rq_dest); 4846 double_rq_unlock(rq_src, rq_dest);
4847 return ret;
4499} 4848}
4500 4849
4501/* 4850/*
@@ -4565,9 +4914,12 @@ wait_to_die:
 4565/* Figure out where task on dead CPU should go, use force if necessary. */ 4914/* Figure out where task on dead CPU should go, use force if necessary. */
4566static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *tsk) 4915static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *tsk)
4567{ 4916{
4917 runqueue_t *rq;
4918 unsigned long flags;
4568 int dest_cpu; 4919 int dest_cpu;
4569 cpumask_t mask; 4920 cpumask_t mask;
4570 4921
4922restart:
4571 /* On same node? */ 4923 /* On same node? */
4572 mask = node_to_cpumask(cpu_to_node(dead_cpu)); 4924 mask = node_to_cpumask(cpu_to_node(dead_cpu));
4573 cpus_and(mask, mask, tsk->cpus_allowed); 4925 cpus_and(mask, mask, tsk->cpus_allowed);
@@ -4579,8 +4931,10 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *tsk)
4579 4931
4580 /* No more Mr. Nice Guy. */ 4932 /* No more Mr. Nice Guy. */
4581 if (dest_cpu == NR_CPUS) { 4933 if (dest_cpu == NR_CPUS) {
4934 rq = task_rq_lock(tsk, &flags);
4582 cpus_setall(tsk->cpus_allowed); 4935 cpus_setall(tsk->cpus_allowed);
4583 dest_cpu = any_online_cpu(tsk->cpus_allowed); 4936 dest_cpu = any_online_cpu(tsk->cpus_allowed);
4937 task_rq_unlock(rq, &flags);
4584 4938
4585 /* 4939 /*
4586 * Don't tell them about moving exiting tasks or 4940 * Don't tell them about moving exiting tasks or
@@ -4592,7 +4946,8 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *tsk)
4592 "longer affine to cpu%d\n", 4946 "longer affine to cpu%d\n",
4593 tsk->pid, tsk->comm, dead_cpu); 4947 tsk->pid, tsk->comm, dead_cpu);
4594 } 4948 }
4595 __migrate_task(tsk, dead_cpu, dest_cpu); 4949 if (!__migrate_task(tsk, dead_cpu, dest_cpu))
4950 goto restart;
4596} 4951}
4597 4952
4598/* 4953/*
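__migrate_task() now reports whether the migration actually happened, and move_task_off_dead_cpu() restarts from the top when it did not, because the chosen destination CPU can itself go offline in the meantime. Reduced to its essence (pick_dest_cpu() is a made-up placeholder for the nodemask/any_online_cpu() selection above, not a real helper):

	do {
		dest_cpu = pick_dest_cpu(tsk, dead_cpu);	/* placeholder */
	} while (!__migrate_task(tsk, dead_cpu, dest_cpu));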
@@ -4719,8 +5074,9 @@ static void migrate_dead_tasks(unsigned int dead_cpu)
4719 * migration_call - callback that gets triggered when a CPU is added. 5074 * migration_call - callback that gets triggered when a CPU is added.
4720 * Here we can start up the necessary migration thread for the new CPU. 5075 * Here we can start up the necessary migration thread for the new CPU.
4721 */ 5076 */
4722static int migration_call(struct notifier_block *nfb, unsigned long action, 5077static int __cpuinit migration_call(struct notifier_block *nfb,
4723 void *hcpu) 5078 unsigned long action,
5079 void *hcpu)
4724{ 5080{
4725 int cpu = (long)hcpu; 5081 int cpu = (long)hcpu;
4726 struct task_struct *p; 5082 struct task_struct *p;
@@ -4746,6 +5102,8 @@ static int migration_call(struct notifier_block *nfb, unsigned long action,
4746 break; 5102 break;
4747#ifdef CONFIG_HOTPLUG_CPU 5103#ifdef CONFIG_HOTPLUG_CPU
4748 case CPU_UP_CANCELED: 5104 case CPU_UP_CANCELED:
5105 if (!cpu_rq(cpu)->migration_thread)
5106 break;
4749 /* Unbind it from offline cpu so it can run. Fall thru. */ 5107 /* Unbind it from offline cpu so it can run. Fall thru. */
4750 kthread_bind(cpu_rq(cpu)->migration_thread, 5108 kthread_bind(cpu_rq(cpu)->migration_thread,
4751 any_online_cpu(cpu_online_map)); 5109 any_online_cpu(cpu_online_map));
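CPU_UP_CANCELED can be delivered even though CPU_UP_PREPARE failed before the per-CPU thread was created, so the notifier must tolerate a NULL thread pointer. The same guard is added to the softirq and softlockup notifiers later in this patch; as a generic skeleton (the per-CPU variable name is illustrative only):

	case CPU_UP_CANCELED:
		if (!per_cpu(helper_task, hotcpu))	/* UP_PREPARE never ran or failed */
			break;
		/* unbind from the never-onlined cpu so it can run and be reaped */
		kthread_bind(per_cpu(helper_task, hotcpu),
			     any_online_cpu(cpu_online_map));
		/* fall through to the CPU_DEAD cleanup */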
@@ -4788,7 +5146,7 @@ static int migration_call(struct notifier_block *nfb, unsigned long action,
4788/* Register at highest priority so that task migration (migrate_all_tasks) 5146/* Register at highest priority so that task migration (migrate_all_tasks)
4789 * happens before everything else. 5147 * happens before everything else.
4790 */ 5148 */
4791static struct notifier_block migration_notifier = { 5149static struct notifier_block __cpuinitdata migration_notifier = {
4792 .notifier_call = migration_call, 5150 .notifier_call = migration_call,
4793 .priority = 10 5151 .priority = 10
4794}; 5152};
@@ -5589,6 +5947,7 @@ static cpumask_t sched_domain_node_span(int node)
5589} 5947}
5590#endif 5948#endif
5591 5949
5950int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
5592/* 5951/*
5593 * At the moment, CONFIG_SCHED_SMT is never defined, but leave it in so we 5952 * At the moment, CONFIG_SCHED_SMT is never defined, but leave it in so we
5594 * can switch it on easily if needed. 5953 * can switch it on easily if needed.
@@ -5604,7 +5963,7 @@ static int cpu_to_cpu_group(int cpu)
5604 5963
5605#ifdef CONFIG_SCHED_MC 5964#ifdef CONFIG_SCHED_MC
5606static DEFINE_PER_CPU(struct sched_domain, core_domains); 5965static DEFINE_PER_CPU(struct sched_domain, core_domains);
5607static struct sched_group sched_group_core[NR_CPUS]; 5966static struct sched_group *sched_group_core_bycpu[NR_CPUS];
5608#endif 5967#endif
5609 5968
5610#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) 5969#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
@@ -5620,7 +5979,7 @@ static int cpu_to_core_group(int cpu)
5620#endif 5979#endif
5621 5980
5622static DEFINE_PER_CPU(struct sched_domain, phys_domains); 5981static DEFINE_PER_CPU(struct sched_domain, phys_domains);
5623static struct sched_group sched_group_phys[NR_CPUS]; 5982static struct sched_group *sched_group_phys_bycpu[NR_CPUS];
5624static int cpu_to_phys_group(int cpu) 5983static int cpu_to_phys_group(int cpu)
5625{ 5984{
5626#if defined(CONFIG_SCHED_MC) 5985#if defined(CONFIG_SCHED_MC)
@@ -5677,13 +6036,74 @@ next_sg:
5677} 6036}
5678#endif 6037#endif
5679 6038
6039/* Free memory allocated for various sched_group structures */
6040static void free_sched_groups(const cpumask_t *cpu_map)
6041{
6042 int cpu;
6043#ifdef CONFIG_NUMA
6044 int i;
6045
6046 for_each_cpu_mask(cpu, *cpu_map) {
6047 struct sched_group *sched_group_allnodes
6048 = sched_group_allnodes_bycpu[cpu];
6049 struct sched_group **sched_group_nodes
6050 = sched_group_nodes_bycpu[cpu];
6051
6052 if (sched_group_allnodes) {
6053 kfree(sched_group_allnodes);
6054 sched_group_allnodes_bycpu[cpu] = NULL;
6055 }
6056
6057 if (!sched_group_nodes)
6058 continue;
6059
6060 for (i = 0; i < MAX_NUMNODES; i++) {
6061 cpumask_t nodemask = node_to_cpumask(i);
6062 struct sched_group *oldsg, *sg = sched_group_nodes[i];
6063
6064 cpus_and(nodemask, nodemask, *cpu_map);
6065 if (cpus_empty(nodemask))
6066 continue;
6067
6068 if (sg == NULL)
6069 continue;
6070 sg = sg->next;
6071next_sg:
6072 oldsg = sg;
6073 sg = sg->next;
6074 kfree(oldsg);
6075 if (oldsg != sched_group_nodes[i])
6076 goto next_sg;
6077 }
6078 kfree(sched_group_nodes);
6079 sched_group_nodes_bycpu[cpu] = NULL;
6080 }
6081#endif
6082 for_each_cpu_mask(cpu, *cpu_map) {
6083 if (sched_group_phys_bycpu[cpu]) {
6084 kfree(sched_group_phys_bycpu[cpu]);
6085 sched_group_phys_bycpu[cpu] = NULL;
6086 }
6087#ifdef CONFIG_SCHED_MC
6088 if (sched_group_core_bycpu[cpu]) {
6089 kfree(sched_group_core_bycpu[cpu]);
6090 sched_group_core_bycpu[cpu] = NULL;
6091 }
6092#endif
6093 }
6094}
6095
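free_sched_groups() relies on the per-node group lists being circular: it starts at head->next and frees entries until it comes back around, so the head itself is freed last. Assuming a non-empty list, the goto loop above is equivalent to this sketch:

	struct sched_group *sg = sched_group_nodes[i]->next, *oldsg;

	do {
		oldsg = sg;
		sg = sg->next;
		kfree(oldsg);
	} while (oldsg != sched_group_nodes[i]);	/* pointer compare only; slot freed later */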
5680/* 6096/*
5681 * Build sched domains for a given set of cpus and attach the sched domains 6097 * Build sched domains for a given set of cpus and attach the sched domains
5682 * to the individual cpus 6098 * to the individual cpus
5683 */ 6099 */
5684void build_sched_domains(const cpumask_t *cpu_map) 6100static int build_sched_domains(const cpumask_t *cpu_map)
5685{ 6101{
5686 int i; 6102 int i;
6103 struct sched_group *sched_group_phys = NULL;
6104#ifdef CONFIG_SCHED_MC
6105 struct sched_group *sched_group_core = NULL;
6106#endif
5687#ifdef CONFIG_NUMA 6107#ifdef CONFIG_NUMA
5688 struct sched_group **sched_group_nodes = NULL; 6108 struct sched_group **sched_group_nodes = NULL;
5689 struct sched_group *sched_group_allnodes = NULL; 6109 struct sched_group *sched_group_allnodes = NULL;
@@ -5691,11 +6111,11 @@ void build_sched_domains(const cpumask_t *cpu_map)
5691 /* 6111 /*
5692 * Allocate the per-node list of sched groups 6112 * Allocate the per-node list of sched groups
5693 */ 6113 */
5694 sched_group_nodes = kmalloc(sizeof(struct sched_group*)*MAX_NUMNODES, 6114 sched_group_nodes = kzalloc(sizeof(struct sched_group*)*MAX_NUMNODES,
5695 GFP_ATOMIC); 6115 GFP_KERNEL);
5696 if (!sched_group_nodes) { 6116 if (!sched_group_nodes) {
5697 printk(KERN_WARNING "Can not alloc sched group node list\n"); 6117 printk(KERN_WARNING "Can not alloc sched group node list\n");
5698 return; 6118 return -ENOMEM;
5699 } 6119 }
5700 sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes; 6120 sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes;
5701#endif 6121#endif
@@ -5721,7 +6141,7 @@ void build_sched_domains(const cpumask_t *cpu_map)
5721 if (!sched_group_allnodes) { 6141 if (!sched_group_allnodes) {
5722 printk(KERN_WARNING 6142 printk(KERN_WARNING
5723 "Can not alloc allnodes sched group\n"); 6143 "Can not alloc allnodes sched group\n");
5724 break; 6144 goto error;
5725 } 6145 }
5726 sched_group_allnodes_bycpu[i] 6146 sched_group_allnodes_bycpu[i]
5727 = sched_group_allnodes; 6147 = sched_group_allnodes;
@@ -5742,6 +6162,18 @@ void build_sched_domains(const cpumask_t *cpu_map)
5742 cpus_and(sd->span, sd->span, *cpu_map); 6162 cpus_and(sd->span, sd->span, *cpu_map);
5743#endif 6163#endif
5744 6164
6165 if (!sched_group_phys) {
6166 sched_group_phys
6167 = kmalloc(sizeof(struct sched_group) * NR_CPUS,
6168 GFP_KERNEL);
6169 if (!sched_group_phys) {
 6170 printk(KERN_WARNING "Can not alloc phys sched "
 6171 "group\n");
6172 goto error;
6173 }
6174 sched_group_phys_bycpu[i] = sched_group_phys;
6175 }
6176
5745 p = sd; 6177 p = sd;
5746 sd = &per_cpu(phys_domains, i); 6178 sd = &per_cpu(phys_domains, i);
5747 group = cpu_to_phys_group(i); 6179 group = cpu_to_phys_group(i);
@@ -5751,6 +6183,18 @@ void build_sched_domains(const cpumask_t *cpu_map)
5751 sd->groups = &sched_group_phys[group]; 6183 sd->groups = &sched_group_phys[group];
5752 6184
5753#ifdef CONFIG_SCHED_MC 6185#ifdef CONFIG_SCHED_MC
6186 if (!sched_group_core) {
6187 sched_group_core
6188 = kmalloc(sizeof(struct sched_group) * NR_CPUS,
6189 GFP_KERNEL);
6190 if (!sched_group_core) {
 6191 printk(KERN_WARNING "Can not alloc core sched "
 6192 "group\n");
6193 goto error;
6194 }
6195 sched_group_core_bycpu[i] = sched_group_core;
6196 }
6197
5754 p = sd; 6198 p = sd;
5755 sd = &per_cpu(core_domains, i); 6199 sd = &per_cpu(core_domains, i);
5756 group = cpu_to_core_group(i); 6200 group = cpu_to_core_group(i);
@@ -5834,24 +6278,21 @@ void build_sched_domains(const cpumask_t *cpu_map)
5834 domainspan = sched_domain_node_span(i); 6278 domainspan = sched_domain_node_span(i);
5835 cpus_and(domainspan, domainspan, *cpu_map); 6279 cpus_and(domainspan, domainspan, *cpu_map);
5836 6280
5837 sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL); 6281 sg = kmalloc_node(sizeof(struct sched_group), GFP_KERNEL, i);
6282 if (!sg) {
6283 printk(KERN_WARNING "Can not alloc domain group for "
6284 "node %d\n", i);
6285 goto error;
6286 }
5838 sched_group_nodes[i] = sg; 6287 sched_group_nodes[i] = sg;
5839 for_each_cpu_mask(j, nodemask) { 6288 for_each_cpu_mask(j, nodemask) {
5840 struct sched_domain *sd; 6289 struct sched_domain *sd;
5841 sd = &per_cpu(node_domains, j); 6290 sd = &per_cpu(node_domains, j);
5842 sd->groups = sg; 6291 sd->groups = sg;
5843 if (sd->groups == NULL) {
5844 /* Turn off balancing if we have no groups */
5845 sd->flags = 0;
5846 }
5847 }
5848 if (!sg) {
5849 printk(KERN_WARNING
5850 "Can not alloc domain group for node %d\n", i);
5851 continue;
5852 } 6292 }
5853 sg->cpu_power = 0; 6293 sg->cpu_power = 0;
5854 sg->cpumask = nodemask; 6294 sg->cpumask = nodemask;
6295 sg->next = sg;
5855 cpus_or(covered, covered, nodemask); 6296 cpus_or(covered, covered, nodemask);
5856 prev = sg; 6297 prev = sg;
5857 6298
@@ -5870,54 +6311,90 @@ void build_sched_domains(const cpumask_t *cpu_map)
5870 if (cpus_empty(tmp)) 6311 if (cpus_empty(tmp))
5871 continue; 6312 continue;
5872 6313
5873 sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL); 6314 sg = kmalloc_node(sizeof(struct sched_group),
6315 GFP_KERNEL, i);
5874 if (!sg) { 6316 if (!sg) {
5875 printk(KERN_WARNING 6317 printk(KERN_WARNING
5876 "Can not alloc domain group for node %d\n", j); 6318 "Can not alloc domain group for node %d\n", j);
5877 break; 6319 goto error;
5878 } 6320 }
5879 sg->cpu_power = 0; 6321 sg->cpu_power = 0;
5880 sg->cpumask = tmp; 6322 sg->cpumask = tmp;
6323 sg->next = prev->next;
5881 cpus_or(covered, covered, tmp); 6324 cpus_or(covered, covered, tmp);
5882 prev->next = sg; 6325 prev->next = sg;
5883 prev = sg; 6326 prev = sg;
5884 } 6327 }
5885 prev->next = sched_group_nodes[i];
5886 } 6328 }
5887#endif 6329#endif
5888 6330
5889 /* Calculate CPU power for physical packages and nodes */ 6331 /* Calculate CPU power for physical packages and nodes */
6332#ifdef CONFIG_SCHED_SMT
5890 for_each_cpu_mask(i, *cpu_map) { 6333 for_each_cpu_mask(i, *cpu_map) {
5891 int power;
5892 struct sched_domain *sd; 6334 struct sched_domain *sd;
5893#ifdef CONFIG_SCHED_SMT
5894 sd = &per_cpu(cpu_domains, i); 6335 sd = &per_cpu(cpu_domains, i);
5895 power = SCHED_LOAD_SCALE; 6336 sd->groups->cpu_power = SCHED_LOAD_SCALE;
5896 sd->groups->cpu_power = power; 6337 }
5897#endif 6338#endif
5898#ifdef CONFIG_SCHED_MC 6339#ifdef CONFIG_SCHED_MC
6340 for_each_cpu_mask(i, *cpu_map) {
6341 int power;
6342 struct sched_domain *sd;
5899 sd = &per_cpu(core_domains, i); 6343 sd = &per_cpu(core_domains, i);
5900 power = SCHED_LOAD_SCALE + (cpus_weight(sd->groups->cpumask)-1) 6344 if (sched_smt_power_savings)
6345 power = SCHED_LOAD_SCALE * cpus_weight(sd->groups->cpumask);
6346 else
6347 power = SCHED_LOAD_SCALE + (cpus_weight(sd->groups->cpumask)-1)
5901 * SCHED_LOAD_SCALE / 10; 6348 * SCHED_LOAD_SCALE / 10;
5902 sd->groups->cpu_power = power; 6349 sd->groups->cpu_power = power;
6350 }
6351#endif
5903 6352
6353 for_each_cpu_mask(i, *cpu_map) {
6354 struct sched_domain *sd;
6355#ifdef CONFIG_SCHED_MC
5904 sd = &per_cpu(phys_domains, i); 6356 sd = &per_cpu(phys_domains, i);
6357 if (i != first_cpu(sd->groups->cpumask))
6358 continue;
5905 6359
5906 /* 6360 sd->groups->cpu_power = 0;
5907 * This has to be < 2 * SCHED_LOAD_SCALE 6361 if (sched_mc_power_savings || sched_smt_power_savings) {
5908 * Lets keep it SCHED_LOAD_SCALE, so that 6362 int j;
5909 * while calculating NUMA group's cpu_power 6363
5910 * we can simply do 6364 for_each_cpu_mask(j, sd->groups->cpumask) {
5911 * numa_group->cpu_power += phys_group->cpu_power; 6365 struct sched_domain *sd1;
5912 * 6366 sd1 = &per_cpu(core_domains, j);
5913 * See "only add power once for each physical pkg" 6367 /*
5914 * comment below 6368 * for each core we will add once
5915 */ 6369 * to the group in physical domain
5916 sd->groups->cpu_power = SCHED_LOAD_SCALE; 6370 */
6371 if (j != first_cpu(sd1->groups->cpumask))
6372 continue;
6373
6374 if (sched_smt_power_savings)
6375 sd->groups->cpu_power += sd1->groups->cpu_power;
6376 else
6377 sd->groups->cpu_power += SCHED_LOAD_SCALE;
6378 }
6379 } else
6380 /*
6381 * This has to be < 2 * SCHED_LOAD_SCALE
6382 * Lets keep it SCHED_LOAD_SCALE, so that
6383 * while calculating NUMA group's cpu_power
6384 * we can simply do
6385 * numa_group->cpu_power += phys_group->cpu_power;
6386 *
6387 * See "only add power once for each physical pkg"
6388 * comment below
6389 */
6390 sd->groups->cpu_power = SCHED_LOAD_SCALE;
5917#else 6391#else
6392 int power;
5918 sd = &per_cpu(phys_domains, i); 6393 sd = &per_cpu(phys_domains, i);
5919 power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE * 6394 if (sched_smt_power_savings)
5920 (cpus_weight(sd->groups->cpumask)-1) / 10; 6395 power = SCHED_LOAD_SCALE * cpus_weight(sd->groups->cpumask);
6396 else
6397 power = SCHED_LOAD_SCALE;
5921 sd->groups->cpu_power = power; 6398 sd->groups->cpu_power = power;
5922#endif 6399#endif
5923 } 6400 }
@@ -5945,13 +6422,20 @@ void build_sched_domains(const cpumask_t *cpu_map)
5945 * Tune cache-hot values: 6422 * Tune cache-hot values:
5946 */ 6423 */
5947 calibrate_migration_costs(cpu_map); 6424 calibrate_migration_costs(cpu_map);
6425
6426 return 0;
6427
6428error:
6429 free_sched_groups(cpu_map);
6430 return -ENOMEM;
5948} 6431}
5949/* 6432/*
5950 * Set up scheduler domains and groups. Callers must hold the hotplug lock. 6433 * Set up scheduler domains and groups. Callers must hold the hotplug lock.
5951 */ 6434 */
5952static void arch_init_sched_domains(const cpumask_t *cpu_map) 6435static int arch_init_sched_domains(const cpumask_t *cpu_map)
5953{ 6436{
5954 cpumask_t cpu_default_map; 6437 cpumask_t cpu_default_map;
6438 int err;
5955 6439
5956 /* 6440 /*
5957 * Setup mask for cpus without special case scheduling requirements. 6441 * Setup mask for cpus without special case scheduling requirements.
@@ -5960,51 +6444,14 @@ static void arch_init_sched_domains(const cpumask_t *cpu_map)
5960 */ 6444 */
5961 cpus_andnot(cpu_default_map, *cpu_map, cpu_isolated_map); 6445 cpus_andnot(cpu_default_map, *cpu_map, cpu_isolated_map);
5962 6446
5963 build_sched_domains(&cpu_default_map); 6447 err = build_sched_domains(&cpu_default_map);
6448
6449 return err;
5964} 6450}
5965 6451
5966static void arch_destroy_sched_domains(const cpumask_t *cpu_map) 6452static void arch_destroy_sched_domains(const cpumask_t *cpu_map)
5967{ 6453{
5968#ifdef CONFIG_NUMA 6454 free_sched_groups(cpu_map);
5969 int i;
5970 int cpu;
5971
5972 for_each_cpu_mask(cpu, *cpu_map) {
5973 struct sched_group *sched_group_allnodes
5974 = sched_group_allnodes_bycpu[cpu];
5975 struct sched_group **sched_group_nodes
5976 = sched_group_nodes_bycpu[cpu];
5977
5978 if (sched_group_allnodes) {
5979 kfree(sched_group_allnodes);
5980 sched_group_allnodes_bycpu[cpu] = NULL;
5981 }
5982
5983 if (!sched_group_nodes)
5984 continue;
5985
5986 for (i = 0; i < MAX_NUMNODES; i++) {
5987 cpumask_t nodemask = node_to_cpumask(i);
5988 struct sched_group *oldsg, *sg = sched_group_nodes[i];
5989
5990 cpus_and(nodemask, nodemask, *cpu_map);
5991 if (cpus_empty(nodemask))
5992 continue;
5993
5994 if (sg == NULL)
5995 continue;
5996 sg = sg->next;
5997next_sg:
5998 oldsg = sg;
5999 sg = sg->next;
6000 kfree(oldsg);
6001 if (oldsg != sched_group_nodes[i])
6002 goto next_sg;
6003 }
6004 kfree(sched_group_nodes);
6005 sched_group_nodes_bycpu[cpu] = NULL;
6006 }
6007#endif
6008} 6455}
6009 6456
6010/* 6457/*
@@ -6029,9 +6476,10 @@ static void detach_destroy_domains(const cpumask_t *cpu_map)
6029 * correct sched domains 6476 * correct sched domains
6030 * Call with hotplug lock held 6477 * Call with hotplug lock held
6031 */ 6478 */
6032void partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2) 6479int partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2)
6033{ 6480{
6034 cpumask_t change_map; 6481 cpumask_t change_map;
6482 int err = 0;
6035 6483
6036 cpus_and(*partition1, *partition1, cpu_online_map); 6484 cpus_and(*partition1, *partition1, cpu_online_map);
6037 cpus_and(*partition2, *partition2, cpu_online_map); 6485 cpus_and(*partition2, *partition2, cpu_online_map);
@@ -6040,11 +6488,87 @@ void partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2)
6040 /* Detach sched domains from all of the affected cpus */ 6488 /* Detach sched domains from all of the affected cpus */
6041 detach_destroy_domains(&change_map); 6489 detach_destroy_domains(&change_map);
6042 if (!cpus_empty(*partition1)) 6490 if (!cpus_empty(*partition1))
6043 build_sched_domains(partition1); 6491 err = build_sched_domains(partition1);
6044 if (!cpus_empty(*partition2)) 6492 if (!err && !cpus_empty(*partition2))
6045 build_sched_domains(partition2); 6493 err = build_sched_domains(partition2);
6494
6495 return err;
6496}
6497
6498#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
6499int arch_reinit_sched_domains(void)
6500{
6501 int err;
6502
6503 lock_cpu_hotplug();
6504 detach_destroy_domains(&cpu_online_map);
6505 err = arch_init_sched_domains(&cpu_online_map);
6506 unlock_cpu_hotplug();
6507
6508 return err;
6509}
6510
6511static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
6512{
6513 int ret;
6514
6515 if (buf[0] != '0' && buf[0] != '1')
6516 return -EINVAL;
6517
6518 if (smt)
6519 sched_smt_power_savings = (buf[0] == '1');
6520 else
6521 sched_mc_power_savings = (buf[0] == '1');
6522
6523 ret = arch_reinit_sched_domains();
6524
6525 return ret ? ret : count;
6046} 6526}
6047 6527
6528int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
6529{
6530 int err = 0;
6531#ifdef CONFIG_SCHED_SMT
6532 if (smt_capable())
6533 err = sysfs_create_file(&cls->kset.kobj,
6534 &attr_sched_smt_power_savings.attr);
6535#endif
6536#ifdef CONFIG_SCHED_MC
6537 if (!err && mc_capable())
6538 err = sysfs_create_file(&cls->kset.kobj,
6539 &attr_sched_mc_power_savings.attr);
6540#endif
6541 return err;
6542}
6543#endif
6544
6545#ifdef CONFIG_SCHED_MC
6546static ssize_t sched_mc_power_savings_show(struct sys_device *dev, char *page)
6547{
6548 return sprintf(page, "%u\n", sched_mc_power_savings);
6549}
6550static ssize_t sched_mc_power_savings_store(struct sys_device *dev, const char *buf, size_t count)
6551{
6552 return sched_power_savings_store(buf, count, 0);
6553}
6554SYSDEV_ATTR(sched_mc_power_savings, 0644, sched_mc_power_savings_show,
6555 sched_mc_power_savings_store);
6556#endif
6557
6558#ifdef CONFIG_SCHED_SMT
6559static ssize_t sched_smt_power_savings_show(struct sys_device *dev, char *page)
6560{
6561 return sprintf(page, "%u\n", sched_smt_power_savings);
6562}
6563static ssize_t sched_smt_power_savings_store(struct sys_device *dev, const char *buf, size_t count)
6564{
6565 return sched_power_savings_store(buf, count, 1);
6566}
6567SYSDEV_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_show,
6568 sched_smt_power_savings_store);
6569#endif
6570
6571
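sched_create_sysfs_power_savings_entries() leaves it to architecture code to pass in the cpu sysdev class so that sched_mc_power_savings and sched_smt_power_savings appear under /sys/devices/system/cpu/. A plausible call site, shown here only as an assumption (it is not part of this hunk):

	static int __init sched_power_savings_sysfs_init(void)
	{
		/* cpu_sysdev_class backs /sys/devices/system/cpu */
		return sched_create_sysfs_power_savings_entries(&cpu_sysdev_class);
	}
	late_initcall(sched_power_savings_sysfs_init);

Writing '1' to either attribute then goes through arch_reinit_sched_domains(), which tears down and rebuilds the whole domain hierarchy under the hotplug lock.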
6048#ifdef CONFIG_HOTPLUG_CPU 6572#ifdef CONFIG_HOTPLUG_CPU
6049/* 6573/*
6050 * Force a reinitialization of the sched domains hierarchy. The domains 6574 * Force a reinitialization of the sched domains hierarchy. The domains
@@ -6126,7 +6650,6 @@ void __init sched_init(void)
6126 rq->push_cpu = 0; 6650 rq->push_cpu = 0;
6127 rq->migration_thread = NULL; 6651 rq->migration_thread = NULL;
6128 INIT_LIST_HEAD(&rq->migration_queue); 6652 INIT_LIST_HEAD(&rq->migration_queue);
6129 rq->cpu = i;
6130#endif 6653#endif
6131 atomic_set(&rq->nr_iowait, 0); 6654 atomic_set(&rq->nr_iowait, 0);
6132 6655
@@ -6141,6 +6664,7 @@ void __init sched_init(void)
6141 } 6664 }
6142 } 6665 }
6143 6666
6667 set_load_weight(&init_task);
6144 /* 6668 /*
6145 * The boot idle thread does lazy MMU switching as well: 6669 * The boot idle thread does lazy MMU switching as well:
6146 */ 6670 */
@@ -6187,11 +6711,12 @@ void normalize_rt_tasks(void)
6187 runqueue_t *rq; 6711 runqueue_t *rq;
6188 6712
6189 read_lock_irq(&tasklist_lock); 6713 read_lock_irq(&tasklist_lock);
6190 for_each_process (p) { 6714 for_each_process(p) {
6191 if (!rt_task(p)) 6715 if (!rt_task(p))
6192 continue; 6716 continue;
6193 6717
6194 rq = task_rq_lock(p, &flags); 6718 spin_lock_irqsave(&p->pi_lock, flags);
6719 rq = __task_rq_lock(p);
6195 6720
6196 array = p->array; 6721 array = p->array;
6197 if (array) 6722 if (array)
@@ -6202,7 +6727,8 @@ void normalize_rt_tasks(void)
6202 resched_task(rq->curr); 6727 resched_task(rq->curr);
6203 } 6728 }
6204 6729
6205 task_rq_unlock(rq, &flags); 6730 __task_rq_unlock(rq);
6731 spin_unlock_irqrestore(&p->pi_lock, flags);
6206 } 6732 }
6207 read_unlock_irq(&tasklist_lock); 6733 read_unlock_irq(&tasklist_lock);
6208} 6734}
diff --git a/kernel/signal.c b/kernel/signal.c
index e5f8aea78ffe..52adf53929f6 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -23,12 +23,12 @@
23#include <linux/syscalls.h> 23#include <linux/syscalls.h>
24#include <linux/ptrace.h> 24#include <linux/ptrace.h>
25#include <linux/signal.h> 25#include <linux/signal.h>
26#include <linux/audit.h>
27#include <linux/capability.h> 26#include <linux/capability.h>
28#include <asm/param.h> 27#include <asm/param.h>
29#include <asm/uaccess.h> 28#include <asm/uaccess.h>
30#include <asm/unistd.h> 29#include <asm/unistd.h>
31#include <asm/siginfo.h> 30#include <asm/siginfo.h>
31#include "audit.h" /* audit_signal_info() */
32 32
33/* 33/*
34 * SLAB caches for signal bits. 34 * SLAB caches for signal bits.
@@ -1531,6 +1531,35 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, int why)
1531 spin_unlock_irqrestore(&sighand->siglock, flags); 1531 spin_unlock_irqrestore(&sighand->siglock, flags);
1532} 1532}
1533 1533
1534static inline int may_ptrace_stop(void)
1535{
1536 if (!likely(current->ptrace & PT_PTRACED))
1537 return 0;
1538
1539 if (unlikely(current->parent == current->real_parent &&
1540 (current->ptrace & PT_ATTACHED)))
1541 return 0;
1542
1543 if (unlikely(current->signal == current->parent->signal) &&
1544 unlikely(current->signal->flags & SIGNAL_GROUP_EXIT))
1545 return 0;
1546
1547 /*
1548 * Are we in the middle of do_coredump?
 1549 * If so and our tracer is also part of the coredump, stopping
 1550 * is a deadlock situation, and pointless because our tracer
 1551 * is dead, so don't allow us to stop.
1552 * If SIGKILL was already sent before the caller unlocked
1553 * ->siglock we must see ->core_waiters != 0. Otherwise it
1554 * is safe to enter schedule().
1555 */
1556 if (unlikely(current->mm->core_waiters) &&
1557 unlikely(current->mm == current->parent->mm))
1558 return 0;
1559
1560 return 1;
1561}
1562
1534/* 1563/*
1535 * This must be called with current->sighand->siglock held. 1564 * This must be called with current->sighand->siglock held.
1536 * 1565 *
@@ -1559,11 +1588,7 @@ static void ptrace_stop(int exit_code, int nostop_code, siginfo_t *info)
1559 spin_unlock_irq(&current->sighand->siglock); 1588 spin_unlock_irq(&current->sighand->siglock);
1560 try_to_freeze(); 1589 try_to_freeze();
1561 read_lock(&tasklist_lock); 1590 read_lock(&tasklist_lock);
1562 if (likely(current->ptrace & PT_PTRACED) && 1591 if (may_ptrace_stop()) {
1563 likely(current->parent != current->real_parent ||
1564 !(current->ptrace & PT_ATTACHED)) &&
1565 (likely(current->parent->signal != current->signal) ||
1566 !unlikely(current->signal->flags & SIGNAL_GROUP_EXIT))) {
1567 do_notify_parent_cldstop(current, CLD_TRAPPED); 1592 do_notify_parent_cldstop(current, CLD_TRAPPED);
1568 read_unlock(&tasklist_lock); 1593 read_unlock(&tasklist_lock);
1569 schedule(); 1594 schedule();
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 336f92d64e2e..8f03e3b89b55 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -446,7 +446,7 @@ static void takeover_tasklets(unsigned int cpu)
446} 446}
447#endif /* CONFIG_HOTPLUG_CPU */ 447#endif /* CONFIG_HOTPLUG_CPU */
448 448
449static int cpu_callback(struct notifier_block *nfb, 449static int __devinit cpu_callback(struct notifier_block *nfb,
450 unsigned long action, 450 unsigned long action,
451 void *hcpu) 451 void *hcpu)
452{ 452{
@@ -470,6 +470,8 @@ static int cpu_callback(struct notifier_block *nfb,
470 break; 470 break;
471#ifdef CONFIG_HOTPLUG_CPU 471#ifdef CONFIG_HOTPLUG_CPU
472 case CPU_UP_CANCELED: 472 case CPU_UP_CANCELED:
473 if (!per_cpu(ksoftirqd, hotcpu))
474 break;
473 /* Unbind so it can run. Fall thru. */ 475 /* Unbind so it can run. Fall thru. */
474 kthread_bind(per_cpu(ksoftirqd, hotcpu), 476 kthread_bind(per_cpu(ksoftirqd, hotcpu),
475 any_online_cpu(cpu_online_map)); 477 any_online_cpu(cpu_online_map));
@@ -484,7 +486,7 @@ static int cpu_callback(struct notifier_block *nfb,
484 return NOTIFY_OK; 486 return NOTIFY_OK;
485} 487}
486 488
487static struct notifier_block cpu_nfb = { 489static struct notifier_block __devinitdata cpu_nfb = {
488 .notifier_call = cpu_callback 490 .notifier_call = cpu_callback
489}; 491};
490 492
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index 14c7faf02909..6b76caa22981 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -36,7 +36,7 @@ static struct notifier_block panic_block = {
36 36
37void touch_softlockup_watchdog(void) 37void touch_softlockup_watchdog(void)
38{ 38{
39 per_cpu(touch_timestamp, raw_smp_processor_id()) = jiffies; 39 __raw_get_cpu_var(touch_timestamp) = jiffies;
40} 40}
41EXPORT_SYMBOL(touch_softlockup_watchdog); 41EXPORT_SYMBOL(touch_softlockup_watchdog);
42 42
@@ -104,7 +104,7 @@ static int watchdog(void * __bind_cpu)
104/* 104/*
105 * Create/destroy watchdog threads as CPUs come and go: 105 * Create/destroy watchdog threads as CPUs come and go:
106 */ 106 */
107static int 107static int __devinit
108cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) 108cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
109{ 109{
110 int hotcpu = (unsigned long)hcpu; 110 int hotcpu = (unsigned long)hcpu;
@@ -127,6 +127,8 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
127 break; 127 break;
128#ifdef CONFIG_HOTPLUG_CPU 128#ifdef CONFIG_HOTPLUG_CPU
129 case CPU_UP_CANCELED: 129 case CPU_UP_CANCELED:
130 if (!per_cpu(watchdog_task, hotcpu))
131 break;
130 /* Unbind so it can run. Fall thru. */ 132 /* Unbind so it can run. Fall thru. */
131 kthread_bind(per_cpu(watchdog_task, hotcpu), 133 kthread_bind(per_cpu(watchdog_task, hotcpu),
132 any_online_cpu(cpu_online_map)); 134 any_online_cpu(cpu_online_map));
@@ -140,7 +142,7 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
140 return NOTIFY_OK; 142 return NOTIFY_OK;
141} 143}
142 144
143static struct notifier_block cpu_nfb = { 145static struct notifier_block __devinitdata cpu_nfb = {
144 .notifier_call = cpu_callback 146 .notifier_call = cpu_callback
145}; 147};
146 148
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index dcfb5d731466..2c0aacc37c55 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -4,6 +4,7 @@
4#include <linux/cpu.h> 4#include <linux/cpu.h>
5#include <linux/err.h> 5#include <linux/err.h>
6#include <linux/syscalls.h> 6#include <linux/syscalls.h>
7#include <linux/kthread.h>
7#include <asm/atomic.h> 8#include <asm/atomic.h>
8#include <asm/semaphore.h> 9#include <asm/semaphore.h>
9#include <asm/uaccess.h> 10#include <asm/uaccess.h>
@@ -25,13 +26,11 @@ static unsigned int stopmachine_num_threads;
25static atomic_t stopmachine_thread_ack; 26static atomic_t stopmachine_thread_ack;
26static DECLARE_MUTEX(stopmachine_mutex); 27static DECLARE_MUTEX(stopmachine_mutex);
27 28
28static int stopmachine(void *cpu) 29static int stopmachine(void *unused)
29{ 30{
30 int irqs_disabled = 0; 31 int irqs_disabled = 0;
31 int prepared = 0; 32 int prepared = 0;
32 33
33 set_cpus_allowed(current, cpumask_of_cpu((int)(long)cpu));
34
35 /* Ack: we are alive */ 34 /* Ack: we are alive */
36 smp_mb(); /* Theoretically the ack = 0 might not be on this CPU yet. */ 35 smp_mb(); /* Theoretically the ack = 0 might not be on this CPU yet. */
37 atomic_inc(&stopmachine_thread_ack); 36 atomic_inc(&stopmachine_thread_ack);
@@ -85,7 +84,8 @@ static void stopmachine_set_state(enum stopmachine_state state)
85 84
86static int stop_machine(void) 85static int stop_machine(void)
87{ 86{
88 int i, ret = 0; 87 int ret = 0;
88 unsigned int i;
89 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; 89 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
90 90
91 /* One high-prio thread per cpu. We'll do this one. */ 91 /* One high-prio thread per cpu. We'll do this one. */
@@ -96,11 +96,16 @@ static int stop_machine(void)
96 stopmachine_state = STOPMACHINE_WAIT; 96 stopmachine_state = STOPMACHINE_WAIT;
97 97
98 for_each_online_cpu(i) { 98 for_each_online_cpu(i) {
99 struct task_struct *tsk;
99 if (i == raw_smp_processor_id()) 100 if (i == raw_smp_processor_id())
100 continue; 101 continue;
101 ret = kernel_thread(stopmachine, (void *)(long)i,CLONE_KERNEL); 102 tsk = kthread_create(stopmachine, NULL, "stopmachine");
102 if (ret < 0) 103 if (IS_ERR(tsk)) {
104 ret = PTR_ERR(tsk);
103 break; 105 break;
106 }
107 kthread_bind(tsk, i);
108 wake_up_process(tsk);
104 stopmachine_num_threads++; 109 stopmachine_num_threads++;
105 } 110 }
106 111
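stop_machine() now uses the kthread API instead of kernel_thread() plus set_cpus_allowed(): the thread is created stopped, bound to the target CPU before it ever runs, and only then woken. The same steps as a reusable pattern (the helper name is illustrative, not part of the patch):

	static struct task_struct *spawn_bound_thread(int (*fn)(void *),
						      unsigned int cpu,
						      const char *name)
	{
		struct task_struct *tsk = kthread_create(fn, NULL, name);

		if (!IS_ERR(tsk)) {
			kthread_bind(tsk, cpu);		/* pin it before its first run */
			wake_up_process(tsk);
		}
		return tsk;
	}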
diff --git a/kernel/sys.c b/kernel/sys.c
index 0b6ec0e7936f..2d5179c67cec 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -13,7 +13,6 @@
13#include <linux/notifier.h> 13#include <linux/notifier.h>
14#include <linux/reboot.h> 14#include <linux/reboot.h>
15#include <linux/prctl.h> 15#include <linux/prctl.h>
16#include <linux/init.h>
17#include <linux/highuid.h> 16#include <linux/highuid.h>
18#include <linux/fs.h> 17#include <linux/fs.h>
19#include <linux/kernel.h> 18#include <linux/kernel.h>
@@ -57,6 +56,12 @@
57#ifndef GET_FPEXC_CTL 56#ifndef GET_FPEXC_CTL
58# define GET_FPEXC_CTL(a,b) (-EINVAL) 57# define GET_FPEXC_CTL(a,b) (-EINVAL)
59#endif 58#endif
59#ifndef GET_ENDIAN
60# define GET_ENDIAN(a,b) (-EINVAL)
61#endif
62#ifndef SET_ENDIAN
63# define SET_ENDIAN(a,b) (-EINVAL)
64#endif
60 65
61/* 66/*
62 * this is where the system-wide overflow UID and GID are defined, for 67 * this is where the system-wide overflow UID and GID are defined, for
@@ -132,14 +137,15 @@ static int __kprobes notifier_call_chain(struct notifier_block **nl,
132 unsigned long val, void *v) 137 unsigned long val, void *v)
133{ 138{
134 int ret = NOTIFY_DONE; 139 int ret = NOTIFY_DONE;
135 struct notifier_block *nb; 140 struct notifier_block *nb, *next_nb;
136 141
137 nb = rcu_dereference(*nl); 142 nb = rcu_dereference(*nl);
138 while (nb) { 143 while (nb) {
144 next_nb = rcu_dereference(nb->next);
139 ret = nb->notifier_call(nb, val, v); 145 ret = nb->notifier_call(nb, val, v);
140 if ((ret & NOTIFY_STOP_MASK) == NOTIFY_STOP_MASK) 146 if ((ret & NOTIFY_STOP_MASK) == NOTIFY_STOP_MASK)
141 break; 147 break;
142 nb = rcu_dereference(nb->next); 148 nb = next_nb;
143 } 149 }
144 return ret; 150 return ret;
145} 151}
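The point of loading nb->next before invoking the callback is presumably that a callback may modify the chain, in particular unregister its own notifier_block; caching the next pointer keeps the walk independent of whatever the callback does to the list. A hypothetical one-shot notifier on a raw chain (chain and callback names are made up) illustrates the case this guards against:

	static RAW_NOTIFIER_HEAD(demo_chain);		/* hypothetical chain */

	static int demo_one_shot(struct notifier_block *nb, unsigned long val, void *v)
	{
		/* remove ourselves; the walker above already cached our ->next */
		raw_notifier_chain_unregister(&demo_chain, nb);
		return NOTIFY_DONE;
	}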
@@ -583,7 +589,7 @@ void emergency_restart(void)
583} 589}
584EXPORT_SYMBOL_GPL(emergency_restart); 590EXPORT_SYMBOL_GPL(emergency_restart);
585 591
586void kernel_restart_prepare(char *cmd) 592static void kernel_restart_prepare(char *cmd)
587{ 593{
588 blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd); 594 blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd);
589 system_state = SYSTEM_RESTART; 595 system_state = SYSTEM_RESTART;
@@ -617,7 +623,7 @@ EXPORT_SYMBOL_GPL(kernel_restart);
617 * Move into place and start executing a preloaded standalone 623 * Move into place and start executing a preloaded standalone
618 * executable. If nothing was preloaded return an error. 624 * executable. If nothing was preloaded return an error.
619 */ 625 */
620void kernel_kexec(void) 626static void kernel_kexec(void)
621{ 627{
622#ifdef CONFIG_KEXEC 628#ifdef CONFIG_KEXEC
623 struct kimage *image; 629 struct kimage *image;
@@ -631,7 +637,6 @@ void kernel_kexec(void)
631 machine_kexec(image); 637 machine_kexec(image);
632#endif 638#endif
633} 639}
634EXPORT_SYMBOL_GPL(kernel_kexec);
635 640
636void kernel_shutdown_prepare(enum system_states state) 641void kernel_shutdown_prepare(enum system_states state)
637{ 642{
@@ -1860,23 +1865,20 @@ out:
1860 * fields when reaping, so a sample either gets all the additions of a 1865 * fields when reaping, so a sample either gets all the additions of a
1861 * given child after it's reaped, or none so this sample is before reaping. 1866 * given child after it's reaped, or none so this sample is before reaping.
1862 * 1867 *
1863 * tasklist_lock locking optimisation: 1868 * Locking:
 1864 * If we are current and single threaded, we do not need to take the tasklist 1869 * We need to take the siglock for CHILDREN, SELF and BOTH
 1865 * lock or the siglock. No one else can take our signal_struct away, 1870 * for the cases: current multithreaded, non-current single threaded,
 1866 * no one else can reap the children to update signal->c* counters, and 1871 * and non-current multithreaded. Thread traversal is now safe with
 1867 * no one else can race with the signal-> fields. 1872 * the siglock held.
 1868 * If we do not take the tasklist_lock, the signal-> fields could be read 1873 * Strictly speaking, we do not need to take the siglock if we are current and
1869 * out of order while another thread was just exiting. So we place a 1874 * single threaded, as no one else can take our signal_struct away, no one
1870 * read memory barrier when we avoid the lock. On the writer side, 1875 * else can reap the children to update signal->c* counters, and no one else
1871 * write memory barrier is implied in __exit_signal as __exit_signal releases 1876 * can race with the signal-> fields. If we do not take any lock, the
1872 * the siglock spinlock after updating the signal-> fields. 1877 * signal-> fields could be read out of order while another thread was just
1873 * 1878 * exiting. So we should place a read memory barrier when we avoid the lock.
1874 * We don't really need the siglock when we access the non c* fields 1879 * On the writer side, write memory barrier is implied in __exit_signal
1875 * of the signal_struct (for RUSAGE_SELF) even in multithreaded 1880 * as __exit_signal releases the siglock spinlock after updating the signal->
1876 * case, since we take the tasklist lock for read and the non c* signal-> 1881 * fields. But we don't do this yet to keep things simple.
1877 * fields are updated only in __exit_signal, which is called with
1878 * tasklist_lock taken for write, hence these two threads cannot execute
1879 * concurrently.
1880 * 1882 *
1881 */ 1883 */
1882 1884
@@ -1885,35 +1887,25 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1885 struct task_struct *t; 1887 struct task_struct *t;
1886 unsigned long flags; 1888 unsigned long flags;
1887 cputime_t utime, stime; 1889 cputime_t utime, stime;
1888 int need_lock = 0;
1889 1890
1890 memset((char *) r, 0, sizeof *r); 1891 memset((char *) r, 0, sizeof *r);
1891 utime = stime = cputime_zero; 1892 utime = stime = cputime_zero;
1892 1893
1893 if (p != current || !thread_group_empty(p)) 1894 rcu_read_lock();
1894 need_lock = 1; 1895 if (!lock_task_sighand(p, &flags)) {
1895 1896 rcu_read_unlock();
1896 if (need_lock) { 1897 return;
1897 read_lock(&tasklist_lock); 1898 }
1898 if (unlikely(!p->signal)) {
1899 read_unlock(&tasklist_lock);
1900 return;
1901 }
1902 } else
1903 /* See locking comments above */
1904 smp_rmb();
1905 1899
1906 switch (who) { 1900 switch (who) {
1907 case RUSAGE_BOTH: 1901 case RUSAGE_BOTH:
1908 case RUSAGE_CHILDREN: 1902 case RUSAGE_CHILDREN:
1909 spin_lock_irqsave(&p->sighand->siglock, flags);
1910 utime = p->signal->cutime; 1903 utime = p->signal->cutime;
1911 stime = p->signal->cstime; 1904 stime = p->signal->cstime;
1912 r->ru_nvcsw = p->signal->cnvcsw; 1905 r->ru_nvcsw = p->signal->cnvcsw;
1913 r->ru_nivcsw = p->signal->cnivcsw; 1906 r->ru_nivcsw = p->signal->cnivcsw;
1914 r->ru_minflt = p->signal->cmin_flt; 1907 r->ru_minflt = p->signal->cmin_flt;
1915 r->ru_majflt = p->signal->cmaj_flt; 1908 r->ru_majflt = p->signal->cmaj_flt;
1916 spin_unlock_irqrestore(&p->sighand->siglock, flags);
1917 1909
1918 if (who == RUSAGE_CHILDREN) 1910 if (who == RUSAGE_CHILDREN)
1919 break; 1911 break;
@@ -1941,8 +1933,9 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1941 BUG(); 1933 BUG();
1942 } 1934 }
1943 1935
1944 if (need_lock) 1936 unlock_task_sighand(p, &flags);
1945 read_unlock(&tasklist_lock); 1937 rcu_read_unlock();
1938
1946 cputime_to_timeval(utime, &r->ru_utime); 1939 cputime_to_timeval(utime, &r->ru_utime);
1947 cputime_to_timeval(stime, &r->ru_stime); 1940 cputime_to_timeval(stime, &r->ru_stime);
1948} 1941}
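k_getrusage() now uses the lock_task_sighand() idiom instead of juggling tasklist_lock and a conditional siglock: RCU keeps the sighand structure from disappearing while the lock is acquired, and the siglock then serializes against __exit_signal() updating the signal->c* counters. The pattern in isolation (the field read is just an example):

	unsigned long flags;

	rcu_read_lock();
	if (lock_task_sighand(p, &flags)) {
		/* p->signal and its c* counters are stable here */
		utime = p->signal->cutime;
		unlock_task_sighand(p, &flags);
	}
	rcu_read_unlock();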
@@ -2057,6 +2050,13 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
2057 return -EFAULT; 2050 return -EFAULT;
2058 return 0; 2051 return 0;
2059 } 2052 }
2053 case PR_GET_ENDIAN:
2054 error = GET_ENDIAN(current, arg2);
2055 break;
2056 case PR_SET_ENDIAN:
2057 error = SET_ENDIAN(current, arg2);
2058 break;
2059
2060 default: 2060 default:
2061 error = -EINVAL; 2061 error = -EINVAL;
2062 break; 2062 break;
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 5433195040f1..6991bece67e8 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -87,6 +87,7 @@ cond_syscall(sys_inotify_init);
87cond_syscall(sys_inotify_add_watch); 87cond_syscall(sys_inotify_add_watch);
88cond_syscall(sys_inotify_rm_watch); 88cond_syscall(sys_inotify_rm_watch);
89cond_syscall(sys_migrate_pages); 89cond_syscall(sys_migrate_pages);
90cond_syscall(sys_move_pages);
90cond_syscall(sys_chown16); 91cond_syscall(sys_chown16);
91cond_syscall(sys_fchown16); 92cond_syscall(sys_fchown16);
92cond_syscall(sys_getegid16); 93cond_syscall(sys_getegid16);
@@ -132,3 +133,4 @@ cond_syscall(sys_mincore);
132cond_syscall(sys_madvise); 133cond_syscall(sys_madvise);
133cond_syscall(sys_mremap); 134cond_syscall(sys_mremap);
134cond_syscall(sys_remap_file_pages); 135cond_syscall(sys_remap_file_pages);
136cond_syscall(compat_sys_move_pages);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index e82726faeeff..93a2c5398648 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -59,6 +59,7 @@ extern int proc_nr_files(ctl_table *table, int write, struct file *filp,
59extern int C_A_D; 59extern int C_A_D;
60extern int sysctl_overcommit_memory; 60extern int sysctl_overcommit_memory;
61extern int sysctl_overcommit_ratio; 61extern int sysctl_overcommit_ratio;
62extern int sysctl_panic_on_oom;
62extern int max_threads; 63extern int max_threads;
63extern int sysrq_enabled; 64extern int sysrq_enabled;
64extern int core_uses_pid; 65extern int core_uses_pid;
@@ -72,6 +73,7 @@ extern int printk_ratelimit_burst;
72extern int pid_max_min, pid_max_max; 73extern int pid_max_min, pid_max_max;
73extern int sysctl_drop_caches; 74extern int sysctl_drop_caches;
74extern int percpu_pagelist_fraction; 75extern int percpu_pagelist_fraction;
76extern int compat_log;
75 77
76#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) 78#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
77int unknown_nmi_panic; 79int unknown_nmi_panic;
@@ -131,6 +133,10 @@ extern int acct_parm[];
131extern int no_unaligned_warning; 133extern int no_unaligned_warning;
132#endif 134#endif
133 135
136#ifdef CONFIG_RT_MUTEXES
137extern int max_lock_depth;
138#endif
139
134static int parse_table(int __user *, int, void __user *, size_t __user *, void __user *, size_t, 140static int parse_table(int __user *, int, void __user *, size_t __user *, void __user *, size_t,
135 ctl_table *, void **); 141 ctl_table *, void **);
136static int proc_doutsstring(ctl_table *table, int write, struct file *filp, 142static int proc_doutsstring(ctl_table *table, int write, struct file *filp,
@@ -142,7 +148,6 @@ static struct ctl_table_header root_table_header =
142 148
143static ctl_table kern_table[]; 149static ctl_table kern_table[];
144static ctl_table vm_table[]; 150static ctl_table vm_table[];
145static ctl_table proc_table[];
146static ctl_table fs_table[]; 151static ctl_table fs_table[];
147static ctl_table debug_table[]; 152static ctl_table debug_table[];
148static ctl_table dev_table[]; 153static ctl_table dev_table[];
@@ -150,7 +155,7 @@ extern ctl_table random_table[];
150#ifdef CONFIG_UNIX98_PTYS 155#ifdef CONFIG_UNIX98_PTYS
151extern ctl_table pty_table[]; 156extern ctl_table pty_table[];
152#endif 157#endif
153#ifdef CONFIG_INOTIFY 158#ifdef CONFIG_INOTIFY_USER
154extern ctl_table inotify_table[]; 159extern ctl_table inotify_table[];
155#endif 160#endif
156 161
@@ -202,12 +207,6 @@ static ctl_table root_table[] = {
202 }, 207 },
203#endif 208#endif
204 { 209 {
205 .ctl_name = CTL_PROC,
206 .procname = "proc",
207 .mode = 0555,
208 .child = proc_table,
209 },
210 {
211 .ctl_name = CTL_FS, 210 .ctl_name = CTL_FS,
212 .procname = "fs", 211 .procname = "fs",
213 .mode = 0555, 212 .mode = 0555,
@@ -398,7 +397,7 @@ static ctl_table kern_table[] = {
398 .strategy = &sysctl_string, 397 .strategy = &sysctl_string,
399 }, 398 },
400#endif 399#endif
401#ifdef CONFIG_HOTPLUG 400#if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET)
402 { 401 {
403 .ctl_name = KERN_HOTPLUG, 402 .ctl_name = KERN_HOTPLUG,
404 .procname = "hotplug", 403 .procname = "hotplug",
@@ -683,6 +682,27 @@ static ctl_table kern_table[] = {
683 .proc_handler = &proc_dointvec, 682 .proc_handler = &proc_dointvec,
684 }, 683 },
685#endif 684#endif
685#ifdef CONFIG_COMPAT
686 {
687 .ctl_name = KERN_COMPAT_LOG,
688 .procname = "compat-log",
689 .data = &compat_log,
690 .maxlen = sizeof (int),
691 .mode = 0644,
692 .proc_handler = &proc_dointvec,
693 },
694#endif
695#ifdef CONFIG_RT_MUTEXES
696 {
697 .ctl_name = KERN_MAX_LOCK_DEPTH,
698 .procname = "max_lock_depth",
699 .data = &max_lock_depth,
700 .maxlen = sizeof(int),
701 .mode = 0644,
702 .proc_handler = &proc_dointvec,
703 },
704#endif
705
686 { .ctl_name = 0 } 706 { .ctl_name = 0 }
687}; 707};
688 708
@@ -702,6 +722,14 @@ static ctl_table vm_table[] = {
702 .proc_handler = &proc_dointvec, 722 .proc_handler = &proc_dointvec,
703 }, 723 },
704 { 724 {
725 .ctl_name = VM_PANIC_ON_OOM,
726 .procname = "panic_on_oom",
727 .data = &sysctl_panic_on_oom,
728 .maxlen = sizeof(sysctl_panic_on_oom),
729 .mode = 0644,
730 .proc_handler = &proc_dointvec,
731 },
732 {
705 .ctl_name = VM_OVERCOMMIT_RATIO, 733 .ctl_name = VM_OVERCOMMIT_RATIO,
706 .procname = "overcommit_ratio", 734 .procname = "overcommit_ratio",
707 .data = &sysctl_overcommit_ratio, 735 .data = &sysctl_overcommit_ratio,
@@ -915,10 +943,18 @@ static ctl_table vm_table[] = {
915 .strategy = &sysctl_jiffies, 943 .strategy = &sysctl_jiffies,
916 }, 944 },
917#endif 945#endif
918 { .ctl_name = 0 } 946#ifdef CONFIG_X86_32
919}; 947 {
920 948 .ctl_name = VM_VDSO_ENABLED,
921static ctl_table proc_table[] = { 949 .procname = "vdso_enabled",
950 .data = &vdso_enabled,
951 .maxlen = sizeof(vdso_enabled),
952 .mode = 0644,
953 .proc_handler = &proc_dointvec,
954 .strategy = &sysctl_intvec,
955 .extra1 = &zero,
956 },
957#endif
922 { .ctl_name = 0 } 958 { .ctl_name = 0 }
923}; 959};
924 960
@@ -1028,7 +1064,7 @@ static ctl_table fs_table[] = {
1028 .mode = 0644, 1064 .mode = 0644,
1029 .proc_handler = &proc_doulongvec_minmax, 1065 .proc_handler = &proc_doulongvec_minmax,
1030 }, 1066 },
1031#ifdef CONFIG_INOTIFY 1067#ifdef CONFIG_INOTIFY_USER
1032 { 1068 {
1033 .ctl_name = FS_INOTIFY, 1069 .ctl_name = FS_INOTIFY,
1034 .procname = "inotify", 1070 .procname = "inotify",
diff --git a/kernel/time.c b/kernel/time.c
index b00ddc71cedb..5bd489747643 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -523,6 +523,7 @@ EXPORT_SYMBOL(do_gettimeofday);
523 523
524 524
525#else 525#else
526#ifndef CONFIG_GENERIC_TIME
526/* 527/*
527 * Simulate gettimeofday using do_gettimeofday which only allows a timeval 528 * Simulate gettimeofday using do_gettimeofday which only allows a timeval
528 * and therefore only yields usec accuracy 529 * and therefore only yields usec accuracy
@@ -537,6 +538,7 @@ void getnstimeofday(struct timespec *tv)
537} 538}
538EXPORT_SYMBOL_GPL(getnstimeofday); 539EXPORT_SYMBOL_GPL(getnstimeofday);
539#endif 540#endif
541#endif
540 542
541/* Converts Gregorian date to seconds since 1970-01-01 00:00:00. 543/* Converts Gregorian date to seconds since 1970-01-01 00:00:00.
542 * Assumes input in normal date format, i.e. 1980-12-31 23:59:59 544 * Assumes input in normal date format, i.e. 1980-12-31 23:59:59
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
new file mode 100644
index 000000000000..e1dfd8e86cce
--- /dev/null
+++ b/kernel/time/Makefile
@@ -0,0 +1 @@
obj-y += clocksource.o jiffies.o
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
new file mode 100644
index 000000000000..74eca5939bd9
--- /dev/null
+++ b/kernel/time/clocksource.c
@@ -0,0 +1,349 @@
1/*
2 * linux/kernel/time/clocksource.c
3 *
4 * This file contains the functions which manage clocksource drivers.
5 *
6 * Copyright (C) 2004, 2005 IBM, John Stultz (johnstul@us.ibm.com)
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
21 *
22 * TODO WishList:
23 * o Allow clocksource drivers to be unregistered
24 * o get rid of clocksource_jiffies extern
25 */
26
27#include <linux/clocksource.h>
28#include <linux/sysdev.h>
29#include <linux/init.h>
30#include <linux/module.h>
31
32/* XXX - Would like a better way for initializing curr_clocksource */
33extern struct clocksource clocksource_jiffies;
34
35/*[Clocksource internal variables]---------
36 * curr_clocksource:
37 * currently selected clocksource. Initialized to clocksource_jiffies.
38 * next_clocksource:
39 * pending next selected clocksource.
40 * clocksource_list:
41 * linked list with the registered clocksources
42 * clocksource_lock:
43 * protects manipulations to curr_clocksource and next_clocksource
44 * and the clocksource_list
45 * override_name:
46 * Name of the user-specified clocksource.
47 */
48static struct clocksource *curr_clocksource = &clocksource_jiffies;
49static struct clocksource *next_clocksource;
50static LIST_HEAD(clocksource_list);
51static DEFINE_SPINLOCK(clocksource_lock);
52static char override_name[32];
53static int finished_booting;
54
55/* clocksource_done_booting - Called near the end of bootup
56 *
57 * Hack to avoid lots of clocksource churn at boot time
58 */
59static int __init clocksource_done_booting(void)
60{
61 finished_booting = 1;
62 return 0;
63}
64
65late_initcall(clocksource_done_booting);
66
67/**
68 * clocksource_get_next - Returns the selected clocksource
69 *
70 */
71struct clocksource *clocksource_get_next(void)
72{
73 unsigned long flags;
74
75 spin_lock_irqsave(&clocksource_lock, flags);
76 if (next_clocksource && finished_booting) {
77 curr_clocksource = next_clocksource;
78 next_clocksource = NULL;
79 }
80 spin_unlock_irqrestore(&clocksource_lock, flags);
81
82 return curr_clocksource;
83}
84
85/**
86 * select_clocksource - Finds the best registered clocksource.
87 *
88 * Private function. Must hold clocksource_lock when called.
89 *
90 * Looks through the list of registered clocksources, returning
91 * the one with the highest rating value. If there is a clocksource
92 * name that matches the override string, it returns that clocksource.
93 */
94static struct clocksource *select_clocksource(void)
95{
96 struct clocksource *best = NULL;
97 struct list_head *tmp;
98
99 list_for_each(tmp, &clocksource_list) {
100 struct clocksource *src;
101
102 src = list_entry(tmp, struct clocksource, list);
103 if (!best)
104 best = src;
105
106 /* check for override: */
107 if (strlen(src->name) == strlen(override_name) &&
108 !strcmp(src->name, override_name)) {
109 best = src;
110 break;
111 }
112 /* pick the highest rating: */
113 if (src->rating > best->rating)
114 best = src;
115 }
116
117 return best;
118}
119
120/**
121 * is_registered_source - Checks if clocksource is registered
122 * @c: pointer to a clocksource
123 *
124 * Private helper function. Must hold clocksource_lock when called.
125 *
126 * Returns one if the clocksource is already registered, zero otherwise.
127 */
128static int is_registered_source(struct clocksource *c)
129{
130 int len = strlen(c->name);
131 struct list_head *tmp;
132
133 list_for_each(tmp, &clocksource_list) {
134 struct clocksource *src;
135
136 src = list_entry(tmp, struct clocksource, list);
137 if (strlen(src->name) == len && !strcmp(src->name, c->name))
138 return 1;
139 }
140
141 return 0;
142}
143
144/**
145 * clocksource_register - Used to install new clocksources
 146 * @c: clocksource to be registered
147 *
148 * Returns -EBUSY if registration fails, zero otherwise.
149 */
150int clocksource_register(struct clocksource *c)
151{
152 int ret = 0;
153 unsigned long flags;
154
155 spin_lock_irqsave(&clocksource_lock, flags);
156 /* check if clocksource is already registered */
157 if (is_registered_source(c)) {
158 printk("register_clocksource: Cannot register %s. "
159 "Already registered!", c->name);
160 ret = -EBUSY;
161 } else {
162 /* register it */
163 list_add(&c->list, &clocksource_list);
164 /* scan the registered clocksources, and pick the best one */
165 next_clocksource = select_clocksource();
166 }
167 spin_unlock_irqrestore(&clocksource_lock, flags);
168 return ret;
169}
170EXPORT_SYMBOL(clocksource_register);
171
172/**
173 * clocksource_reselect - Rescan list for next clocksource
174 *
175 * A quick helper function to be used if a clocksource changes its
176 * rating. Forces the clocksource list to be re-scanned for the best
177 * clocksource.
178 */
179void clocksource_reselect(void)
180{
181 unsigned long flags;
182
183 spin_lock_irqsave(&clocksource_lock, flags);
184 next_clocksource = select_clocksource();
185 spin_unlock_irqrestore(&clocksource_lock, flags);
186}
187EXPORT_SYMBOL(clocksource_reselect);
188
189/**
190 * sysfs_show_current_clocksources - sysfs interface for current clocksource
191 * @dev: unused
192 * @buf: char buffer to be filled with clocksource list
193 *
 194 * Provides sysfs interface for showing the current clocksource.
195 */
196static ssize_t
197sysfs_show_current_clocksources(struct sys_device *dev, char *buf)
198{
199 char *curr = buf;
200
201 spin_lock_irq(&clocksource_lock);
202 curr += sprintf(curr, "%s ", curr_clocksource->name);
203 spin_unlock_irq(&clocksource_lock);
204
205 curr += sprintf(curr, "\n");
206
207 return curr - buf;
208}
209
210/**
211 * sysfs_override_clocksource - interface for manually overriding clocksource
212 * @dev: unused
213 * @buf: name of override clocksource
214 * @count: length of buffer
215 *
216 * Takes input from sysfs interface for manually overriding the default
 217 * clocksource selection.
218 */
219static ssize_t sysfs_override_clocksource(struct sys_device *dev,
220 const char *buf, size_t count)
221{
222 size_t ret = count;
223 /* strings from sysfs write are not 0 terminated! */
224 if (count >= sizeof(override_name))
225 return -EINVAL;
226
 227 /* strip off \n: */
228 if (buf[count-1] == '\n')
229 count--;
230 if (count < 1)
231 return -EINVAL;
232
233 spin_lock_irq(&clocksource_lock);
234
235 /* copy the name given: */
236 memcpy(override_name, buf, count);
237 override_name[count] = 0;
238
239 /* try to select it: */
240 next_clocksource = select_clocksource();
241
242 spin_unlock_irq(&clocksource_lock);
243
244 return ret;
245}
246
247/**
 248 * sysfs_show_available_clocksources - sysfs interface for listing clocksources
249 * @dev: unused
250 * @buf: char buffer to be filled with clocksource list
251 *
252 * Provides sysfs interface for listing registered clocksources
253 */
254static ssize_t
255sysfs_show_available_clocksources(struct sys_device *dev, char *buf)
256{
257 struct list_head *tmp;
258 char *curr = buf;
259
260 spin_lock_irq(&clocksource_lock);
261 list_for_each(tmp, &clocksource_list) {
262 struct clocksource *src;
263
264 src = list_entry(tmp, struct clocksource, list);
265 curr += sprintf(curr, "%s ", src->name);
266 }
267 spin_unlock_irq(&clocksource_lock);
268
269 curr += sprintf(curr, "\n");
270
271 return curr - buf;
272}
273
274/*
275 * Sysfs setup bits:
276 */
277static SYSDEV_ATTR(current_clocksource, 0600, sysfs_show_current_clocksources,
278 sysfs_override_clocksource);
279
280static SYSDEV_ATTR(available_clocksource, 0600,
281 sysfs_show_available_clocksources, NULL);
282
283static struct sysdev_class clocksource_sysclass = {
284 set_kset_name("clocksource"),
285};
286
287static struct sys_device device_clocksource = {
288 .id = 0,
289 .cls = &clocksource_sysclass,
290};
291
292static int __init init_clocksource_sysfs(void)
293{
294 int error = sysdev_class_register(&clocksource_sysclass);
295
296 if (!error)
297 error = sysdev_register(&device_clocksource);
298 if (!error)
299 error = sysdev_create_file(
300 &device_clocksource,
301 &attr_current_clocksource);
302 if (!error)
303 error = sysdev_create_file(
304 &device_clocksource,
305 &attr_available_clocksource);
306 return error;
307}
308
309device_initcall(init_clocksource_sysfs);
310
311/**
312 * boot_override_clocksource - boot clock override
313 * @str: override name
314 *
315 * Takes a clocksource= boot argument and uses it
316 * as the clocksource override name.
317 */
318static int __init boot_override_clocksource(char* str)
319{
320 unsigned long flags;
321 spin_lock_irqsave(&clocksource_lock, flags);
322 if (str)
323 strlcpy(override_name, str, sizeof(override_name));
324 spin_unlock_irqrestore(&clocksource_lock, flags);
325 return 1;
326}
327
328__setup("clocksource=", boot_override_clocksource);
329
330/**
331 * boot_override_clock - Compatibility layer for deprecated boot option
332 * @str: override name
333 *
334 * DEPRECATED! Takes a clock= boot argument and uses it
335 * as the clocksource override name
336 */
337static int __init boot_override_clock(char* str)
338{
339 if (!strcmp(str, "pmtmr")) {
340 printk("Warning: clock=pmtmr is deprecated. "
341 "Use clocksource=acpi_pm.\n");
342 return boot_override_clocksource("acpi_pm");
343 }
344 printk("Warning! clock= boot option is deprecated. "
345 "Use clocksource=xyz\n");
346 return boot_override_clocksource(str);
347}
348
349__setup("clock=", boot_override_clock);
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
new file mode 100644
index 000000000000..126bb30c4afe
--- /dev/null
+++ b/kernel/time/jiffies.c
@@ -0,0 +1,73 @@
1/***********************************************************************
2* linux/kernel/time/jiffies.c
3*
4* This file contains the jiffies based clocksource.
5*
6* Copyright (C) 2004, 2005 IBM, John Stultz (johnstul@us.ibm.com)
7*
8* This program is free software; you can redistribute it and/or modify
9* it under the terms of the GNU General Public License as published by
10* the Free Software Foundation; either version 2 of the License, or
11* (at your option) any later version.
12*
13* This program is distributed in the hope that it will be useful,
14* but WITHOUT ANY WARRANTY; without even the implied warranty of
15* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16* GNU General Public License for more details.
17*
18* You should have received a copy of the GNU General Public License
19* along with this program; if not, write to the Free Software
20* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
21*
22************************************************************************/
23#include <linux/clocksource.h>
24#include <linux/jiffies.h>
25#include <linux/init.h>
26
27/* The Jiffies based clocksource is the lowest common
28 * denominator clock source which should function on
29 * all systems. It has the same coarse resolution as
30 * the timer interrupt frequency HZ and it suffers
31 * inaccuracies caused by missed or lost timer
32 * interrupts and the inability for the timer
 33 * interrupt hardware to accurately tick at the
 34 * requested HZ value. It is also not recommended
35 * for "tick-less" systems.
36 */
37#define NSEC_PER_JIFFY ((u32)((((u64)NSEC_PER_SEC)<<8)/ACTHZ))
38
39/* Since jiffies uses a simple NSEC_PER_JIFFY multiplier
40 * conversion, the .shift value could be zero. However
41 * this would make NTP adjustments impossible as they are
42 * in units of 1/2^.shift. Thus we use JIFFIES_SHIFT to
 43 * shift both the numerator and denominator the same
44 * amount, and give ntp adjustments in units of 1/2^8
45 *
46 * The value 8 is somewhat carefully chosen, as anything
47 * larger can result in overflows. NSEC_PER_JIFFY grows as
 48 * HZ shrinks, so values greater than 8 overflow 32 bits when
49 * HZ=100.
50 */
51#define JIFFIES_SHIFT 8
52
53static cycle_t jiffies_read(void)
54{
55 return (cycle_t) jiffies;
56}
57
58struct clocksource clocksource_jiffies = {
59 .name = "jiffies",
60 .rating = 0, /* lowest rating*/
61 .read = jiffies_read,
62 .mask = 0xffffffff, /*32bits*/
63 .mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */
64 .shift = JIFFIES_SHIFT,
65 .is_continuous = 0, /* tick based, not free running */
66};
67
68static int __init init_jiffies_clocksource(void)
69{
70 return clocksource_register(&clocksource_jiffies);
71}
72
73module_init(init_jiffies_clocksource);
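
To make the mult/shift choice above concrete: with HZ=100 and ACTHZ roughly HZ in 8-bit fixed point (HZ << 8), NSEC_PER_JIFFY works out to 10,000,000 and mult to 2,560,000,000, which still fits in 32 bits; one more bit of shift would overflow, which is why JIFFIES_SHIFT stops at 8. A standalone userspace check of that arithmetic (an illustration of the math, not kernel code):

/* Userspace illustration of the jiffies clocksource arithmetic,
 * approximating ACTHZ as HZ << 8 (HZ in 8-bit fixed point). */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	const uint64_t NSEC_PER_SEC = 1000000000ULL;
	const unsigned int HZ = 100;
	const uint64_t ACTHZ = (uint64_t)HZ << 8;
	const unsigned int JIFFIES_SHIFT = 8;

	uint32_t nsec_per_jiffy = (uint32_t)((NSEC_PER_SEC << 8) / ACTHZ);
	uint32_t mult = nsec_per_jiffy << JIFFIES_SHIFT;

	/* generic clocksource conversion: ns = (cycles * mult) >> shift */
	uint64_t ns_per_jiffy = ((uint64_t)1 * mult) >> JIFFIES_SHIFT;

	printf("NSEC_PER_JIFFY=%u mult=%u -> %llu ns per jiffy\n",
	       (unsigned)nsec_per_jiffy, (unsigned)mult,
	       (unsigned long long)ns_per_jiffy);
	return 0;
}
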
diff --git a/kernel/timer.c b/kernel/timer.c
index 9e49deed468c..5a8960253063 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -146,7 +146,7 @@ static void internal_add_timer(tvec_base_t *base, struct timer_list *timer)
146void fastcall init_timer(struct timer_list *timer) 146void fastcall init_timer(struct timer_list *timer)
147{ 147{
148 timer->entry.next = NULL; 148 timer->entry.next = NULL;
149 timer->base = per_cpu(tvec_bases, raw_smp_processor_id()); 149 timer->base = __raw_get_cpu_var(tvec_bases);
150} 150}
151EXPORT_SYMBOL(init_timer); 151EXPORT_SYMBOL(init_timer);
152 152
@@ -383,23 +383,19 @@ EXPORT_SYMBOL(del_timer_sync);
383static int cascade(tvec_base_t *base, tvec_t *tv, int index) 383static int cascade(tvec_base_t *base, tvec_t *tv, int index)
384{ 384{
385 /* cascade all the timers from tv up one level */ 385 /* cascade all the timers from tv up one level */
386 struct list_head *head, *curr; 386 struct timer_list *timer, *tmp;
387 struct list_head tv_list;
388
389 list_replace_init(tv->vec + index, &tv_list);
387 390
388 head = tv->vec + index;
389 curr = head->next;
390 /* 391 /*
391 * We are removing _all_ timers from the list, so we don't have to 392 * We are removing _all_ timers from the list, so we
392 * detach them individually, just clear the list afterwards. 393 * don't have to detach them individually.
393 */ 394 */
394 while (curr != head) { 395 list_for_each_entry_safe(timer, tmp, &tv_list, entry) {
395 struct timer_list *tmp; 396 BUG_ON(timer->base != base);
396 397 internal_add_timer(base, timer);
397 tmp = list_entry(curr, struct timer_list, entry);
398 BUG_ON(tmp->base != base);
399 curr = curr->next;
400 internal_add_timer(base, tmp);
401 } 398 }
402 INIT_LIST_HEAD(head);
403 399
404 return index; 400 return index;
405} 401}
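
The rewritten cascade() above uses a detach-then-walk idiom: list_replace_init() moves the whole bucket onto an on-stack list head and reinitializes the original in one step, and the entries can then be walked with list_for_each_entry_safe() even though internal_add_timer() relinks each of them as it goes. A small sketch of the same pattern with hypothetical names, outside the timer code:

/* Sketch of the detach-then-walk pattern used by the new cascade(). */
#include <linux/list.h>

struct item {
	struct list_head entry;
	/* payload ... */
};

static void drain_bucket(struct list_head *bucket)
{
	struct list_head local;
	struct item *it, *tmp;

	/* move every entry onto a private list and reinitialize the bucket */
	list_replace_init(bucket, &local);

	/* safe iteration: each entry may be relinked elsewhere as we go */
	list_for_each_entry_safe(it, tmp, &local, entry) {
		/* requeue or process 'it' here */
	}
}
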
@@ -419,10 +415,10 @@ static inline void __run_timers(tvec_base_t *base)
419 415
420 spin_lock_irq(&base->lock); 416 spin_lock_irq(&base->lock);
421 while (time_after_eq(jiffies, base->timer_jiffies)) { 417 while (time_after_eq(jiffies, base->timer_jiffies)) {
422 struct list_head work_list = LIST_HEAD_INIT(work_list); 418 struct list_head work_list;
423 struct list_head *head = &work_list; 419 struct list_head *head = &work_list;
424 int index = base->timer_jiffies & TVR_MASK; 420 int index = base->timer_jiffies & TVR_MASK;
425 421
426 /* 422 /*
427 * Cascade timers: 423 * Cascade timers:
428 */ 424 */
@@ -431,8 +427,8 @@ static inline void __run_timers(tvec_base_t *base)
431 (!cascade(base, &base->tv3, INDEX(1))) && 427 (!cascade(base, &base->tv3, INDEX(1))) &&
432 !cascade(base, &base->tv4, INDEX(2))) 428 !cascade(base, &base->tv4, INDEX(2)))
433 cascade(base, &base->tv5, INDEX(3)); 429 cascade(base, &base->tv5, INDEX(3));
434 ++base->timer_jiffies; 430 ++base->timer_jiffies;
435 list_splice_init(base->tv1.vec + index, &work_list); 431 list_replace_init(base->tv1.vec + index, &work_list);
436 while (!list_empty(head)) { 432 while (!list_empty(head)) {
437 void (*fn)(unsigned long); 433 void (*fn)(unsigned long);
438 unsigned long data; 434 unsigned long data;
@@ -601,7 +597,6 @@ long time_tolerance = MAXFREQ; /* frequency tolerance (ppm) */
601long time_precision = 1; /* clock precision (us) */ 597long time_precision = 1; /* clock precision (us) */
602long time_maxerror = NTP_PHASE_LIMIT; /* maximum error (us) */ 598long time_maxerror = NTP_PHASE_LIMIT; /* maximum error (us) */
603long time_esterror = NTP_PHASE_LIMIT; /* estimated error (us) */ 599long time_esterror = NTP_PHASE_LIMIT; /* estimated error (us) */
604static long time_phase; /* phase offset (scaled us) */
605long time_freq = (((NSEC_PER_SEC + HZ/2) % HZ - HZ/2) << SHIFT_USEC) / NSEC_PER_USEC; 600long time_freq = (((NSEC_PER_SEC + HZ/2) % HZ - HZ/2) << SHIFT_USEC) / NSEC_PER_USEC;
606 /* frequency offset (scaled ppm)*/ 601 /* frequency offset (scaled ppm)*/
607static long time_adj; /* tick adjust (scaled 1 / HZ) */ 602static long time_adj; /* tick adjust (scaled 1 / HZ) */
@@ -751,27 +746,14 @@ static long adjtime_adjustment(void)
751} 746}
752 747
753/* in the NTP reference this is called "hardclock()" */ 748/* in the NTP reference this is called "hardclock()" */
754static void update_wall_time_one_tick(void) 749static void update_ntp_one_tick(void)
755{ 750{
756 long time_adjust_step, delta_nsec; 751 long time_adjust_step;
757 752
758 time_adjust_step = adjtime_adjustment(); 753 time_adjust_step = adjtime_adjustment();
759 if (time_adjust_step) 754 if (time_adjust_step)
760 /* Reduce by this step the amount of time left */ 755 /* Reduce by this step the amount of time left */
761 time_adjust -= time_adjust_step; 756 time_adjust -= time_adjust_step;
762 delta_nsec = tick_nsec + time_adjust_step * 1000;
763 /*
764 * Advance the phase, once it gets to one microsecond, then
765 * advance the tick more.
766 */
767 time_phase += time_adj;
768 if ((time_phase >= FINENSEC) || (time_phase <= -FINENSEC)) {
769 long ltemp = shift_right(time_phase, (SHIFT_SCALE - 10));
770 time_phase -= ltemp << (SHIFT_SCALE - 10);
771 delta_nsec += ltemp;
772 }
773 xtime.tv_nsec += delta_nsec;
774 time_interpolator_update(delta_nsec);
775 757
776 /* Changes by adjtime() do not take effect till next tick. */ 758 /* Changes by adjtime() do not take effect till next tick. */
777 if (time_next_adjust != 0) { 759 if (time_next_adjust != 0) {
@@ -784,36 +766,378 @@ static void update_wall_time_one_tick(void)
784 * Return how long ticks are at the moment, that is, how much time 766 * Return how long ticks are at the moment, that is, how much time
785 * update_wall_time_one_tick will add to xtime next time we call it 767 * update_wall_time_one_tick will add to xtime next time we call it
786 * (assuming no calls to do_adjtimex in the meantime). 768 * (assuming no calls to do_adjtimex in the meantime).
787 * The return value is in fixed-point nanoseconds with SHIFT_SCALE-10 769 * The return value is in fixed-point nanoseconds shifted by the
788 * bits to the right of the binary point. 770 * specified number of bits to the right of the binary point.
789 * This function has no side-effects. 771 * This function has no side-effects.
790 */ 772 */
791u64 current_tick_length(void) 773u64 current_tick_length(void)
792{ 774{
793 long delta_nsec; 775 long delta_nsec;
776 u64 ret;
794 777
778 /* calculate the finest interval NTP will allow.
779 * ie: nanosecond value shifted by (SHIFT_SCALE - 10)
780 */
795 delta_nsec = tick_nsec + adjtime_adjustment() * 1000; 781 delta_nsec = tick_nsec + adjtime_adjustment() * 1000;
796 return ((u64) delta_nsec << (SHIFT_SCALE - 10)) + time_adj; 782 ret = (u64)delta_nsec << TICK_LENGTH_SHIFT;
783 ret += (s64)time_adj << (TICK_LENGTH_SHIFT - (SHIFT_SCALE - 10));
784
785 return ret;
797} 786}
798 787
799/* 788/* XXX - all of this timekeeping code should be later moved to time.c */
800 * Using a loop looks inefficient, but "ticks" is 789#include <linux/clocksource.h>
801 * usually just one (we shouldn't be losing ticks, 790static struct clocksource *clock; /* pointer to current clocksource */
802 * we're doing this this way mainly for interrupt 791
803 * latency reasons, not because we think we'll 792#ifdef CONFIG_GENERIC_TIME
804 * have lots of lost timer ticks 793/**
 794 * __get_nsec_offset - Returns nanoseconds since last call to update_wall_time()
795 *
796 * private function, must hold xtime_lock lock when being
797 * called. Returns the number of nanoseconds since the
798 * last call to update_wall_time() (adjusted by NTP scaling)
799 */
800static inline s64 __get_nsec_offset(void)
801{
802 cycle_t cycle_now, cycle_delta;
803 s64 ns_offset;
804
805 /* read clocksource: */
806 cycle_now = clocksource_read(clock);
807
808 /* calculate the delta since the last update_wall_time: */
809 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
810
811 /* convert to nanoseconds: */
812 ns_offset = cyc2ns(clock, cycle_delta);
813
814 return ns_offset;
815}
816
817/**
818 * __get_realtime_clock_ts - Returns the time of day in a timespec
819 * @ts: pointer to the timespec to be set
820 *
821 * Returns the time of day in a timespec. Used by
 822 * do_gettimeofday() and getnstimeofday().
823 */
824static inline void __get_realtime_clock_ts(struct timespec *ts)
825{
826 unsigned long seq;
827 s64 nsecs;
828
829 do {
830 seq = read_seqbegin(&xtime_lock);
831
832 *ts = xtime;
833 nsecs = __get_nsec_offset();
834
835 } while (read_seqretry(&xtime_lock, seq));
836
837 timespec_add_ns(ts, nsecs);
838}
839
840/**
841 * getnstimeofday - Returns the time of day in a timespec
842 * @ts: pointer to the timespec to be set
843 *
844 * Returns the time of day in a timespec.
845 */
846void getnstimeofday(struct timespec *ts)
847{
848 __get_realtime_clock_ts(ts);
849}
850
851EXPORT_SYMBOL(getnstimeofday);
852
853/**
854 * do_gettimeofday - Returns the time of day in a timeval
855 * @tv: pointer to the timeval to be set
856 *
857 * NOTE: Users should be converted to using get_realtime_clock_ts()
858 */
859void do_gettimeofday(struct timeval *tv)
860{
861 struct timespec now;
862
863 __get_realtime_clock_ts(&now);
864 tv->tv_sec = now.tv_sec;
865 tv->tv_usec = now.tv_nsec/1000;
866}
867
868EXPORT_SYMBOL(do_gettimeofday);
869/**
870 * do_settimeofday - Sets the time of day
871 * @tv: pointer to the timespec variable containing the new time
872 *
 873 * Sets the time of day to the new time and updates NTP and notifies hrtimers
874 */
875int do_settimeofday(struct timespec *tv)
876{
877 unsigned long flags;
878 time_t wtm_sec, sec = tv->tv_sec;
879 long wtm_nsec, nsec = tv->tv_nsec;
880
881 if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC)
882 return -EINVAL;
883
884 write_seqlock_irqsave(&xtime_lock, flags);
885
886 nsec -= __get_nsec_offset();
887
888 wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec);
889 wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec);
890
891 set_normalized_timespec(&xtime, sec, nsec);
892 set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec);
893
894 ntp_clear();
895
896 write_sequnlock_irqrestore(&xtime_lock, flags);
897
898 /* signal hrtimers about time change */
899 clock_was_set();
900
901 return 0;
902}
903
904EXPORT_SYMBOL(do_settimeofday);
905
906/**
907 * change_clocksource - Swaps clocksources if a new one is available
908 *
909 * Accumulates current time interval and initializes new clocksource
910 */
911static int change_clocksource(void)
912{
913 struct clocksource *new;
914 cycle_t now;
915 u64 nsec;
916 new = clocksource_get_next();
917 if (clock != new) {
918 now = clocksource_read(new);
919 nsec = __get_nsec_offset();
920 timespec_add_ns(&xtime, nsec);
921
922 clock = new;
923 clock->cycle_last = now;
924 printk(KERN_INFO "Time: %s clocksource has been installed.\n",
925 clock->name);
926 return 1;
927 } else if (clock->update_callback) {
928 return clock->update_callback();
929 }
930 return 0;
931}
932#else
933#define change_clocksource() (0)
934#endif
935
936/**
 937 * timekeeping_is_continuous - check to see if timekeeping is free running
805 */ 938 */
806static void update_wall_time(unsigned long ticks) 939int timekeeping_is_continuous(void)
807{ 940{
941 unsigned long seq;
942 int ret;
943
808 do { 944 do {
809 ticks--; 945 seq = read_seqbegin(&xtime_lock);
810 update_wall_time_one_tick(); 946
811 if (xtime.tv_nsec >= 1000000000) { 947 ret = clock->is_continuous;
812 xtime.tv_nsec -= 1000000000; 948
949 } while (read_seqretry(&xtime_lock, seq));
950
951 return ret;
952}
953
954/*
955 * timekeeping_init - Initializes the clocksource and common timekeeping values
956 */
957void __init timekeeping_init(void)
958{
959 unsigned long flags;
960
961 write_seqlock_irqsave(&xtime_lock, flags);
962 clock = clocksource_get_next();
963 clocksource_calculate_interval(clock, tick_nsec);
964 clock->cycle_last = clocksource_read(clock);
965 ntp_clear();
966 write_sequnlock_irqrestore(&xtime_lock, flags);
967}
968
969
970/*
971 * timekeeping_resume - Resumes the generic timekeeping subsystem.
972 * @dev: unused
973 *
974 * This is for the generic clocksource timekeeping.
975 * xtime/wall_to_monotonic/jiffies/wall_jiffies/etc are
976 * still managed by arch specific suspend/resume code.
977 */
978static int timekeeping_resume(struct sys_device *dev)
979{
980 unsigned long flags;
981
982 write_seqlock_irqsave(&xtime_lock, flags);
983 /* restart the last cycle value */
984 clock->cycle_last = clocksource_read(clock);
985 write_sequnlock_irqrestore(&xtime_lock, flags);
986 return 0;
987}
988
989/* sysfs resume/suspend bits for timekeeping */
990static struct sysdev_class timekeeping_sysclass = {
991 .resume = timekeeping_resume,
992 set_kset_name("timekeeping"),
993};
994
995static struct sys_device device_timer = {
996 .id = 0,
997 .cls = &timekeeping_sysclass,
998};
999
1000static int __init timekeeping_init_device(void)
1001{
1002 int error = sysdev_class_register(&timekeeping_sysclass);
1003 if (!error)
1004 error = sysdev_register(&device_timer);
1005 return error;
1006}
1007
1008device_initcall(timekeeping_init_device);
1009
1010/*
 1011 * If the error is already larger than one adjustment interval, we look
 1012 * ahead another tick, to compensate for late or lost adjustments.
1013 */
1014static __always_inline int clocksource_bigadjust(int sign, s64 error, s64 *interval, s64 *offset)
1015{
1016 int adj;
1017
1018 /*
1019 * As soon as the machine is synchronized to the external time
1020 * source this should be the common case.
1021 */
1022 error >>= 2;
1023 if (likely(sign > 0 ? error <= *interval : error >= *interval))
1024 return sign;
1025
1026 /*
1027 * An extra look ahead dampens the effect of the current error,
 1028 * which can grow quite large with continuously late updates, as
1029 * it would dominate the adjustment value and can lead to
1030 * oscillation.
1031 */
1032 error += current_tick_length() >> (TICK_LENGTH_SHIFT - clock->shift + 1);
1033 error -= clock->xtime_interval >> 1;
1034
1035 adj = 0;
1036 while (1) {
1037 error >>= 1;
1038 if (sign > 0 ? error <= *interval : error >= *interval)
1039 break;
1040 adj++;
1041 }
1042
1043 /*
1044 * Add the current adjustments to the error and take the offset
1045 * into account, the latter can cause the error to be hardly
1046 * reduced at the next tick. Check the error again if there's
1047 * room for another adjustment, thus further reducing the error
1048 * which otherwise had to be corrected at the next update.
1049 */
1050 error = (error << 1) - *interval + *offset;
1051 if (sign > 0 ? error > *interval : error < *interval)
1052 adj++;
1053
1054 *interval <<= adj;
1055 *offset <<= adj;
1056 return sign << adj;
1057}
1058
1059/*
1060 * Adjust the multiplier to reduce the error value,
1061 * this is optimized for the most common adjustments of -1,0,1,
1062 * for other values we can do a bit more work.
1063 */
1064static void clocksource_adjust(struct clocksource *clock, s64 offset)
1065{
1066 s64 error, interval = clock->cycle_interval;
1067 int adj;
1068
1069 error = clock->error >> (TICK_LENGTH_SHIFT - clock->shift - 1);
1070 if (error > interval) {
1071 adj = clocksource_bigadjust(1, error, &interval, &offset);
1072 } else if (error < -interval) {
1073 interval = -interval;
1074 offset = -offset;
1075 adj = clocksource_bigadjust(-1, error, &interval, &offset);
1076 } else
1077 return;
1078
1079 clock->mult += adj;
1080 clock->xtime_interval += interval;
1081 clock->xtime_nsec -= offset;
1082 clock->error -= (interval - offset) << (TICK_LENGTH_SHIFT - clock->shift);
1083}
1084
1085/*
1086 * update_wall_time - Uses the current clocksource to increment the wall time
1087 *
1088 * Called from the timer interrupt, must hold a write on xtime_lock.
1089 */
1090static void update_wall_time(void)
1091{
1092 cycle_t offset;
1093
1094 clock->xtime_nsec += (s64)xtime.tv_nsec << clock->shift;
1095
1096#ifdef CONFIG_GENERIC_TIME
1097 offset = (clocksource_read(clock) - clock->cycle_last) & clock->mask;
1098#else
1099 offset = clock->cycle_interval;
1100#endif
1101
1102 /* normally this loop will run just once, however in the
1103 * case of lost or late ticks, it will accumulate correctly.
1104 */
1105 while (offset >= clock->cycle_interval) {
1106 /* accumulate one interval */
1107 clock->xtime_nsec += clock->xtime_interval;
1108 clock->cycle_last += clock->cycle_interval;
1109 offset -= clock->cycle_interval;
1110
1111 if (clock->xtime_nsec >= (u64)NSEC_PER_SEC << clock->shift) {
1112 clock->xtime_nsec -= (u64)NSEC_PER_SEC << clock->shift;
813 xtime.tv_sec++; 1113 xtime.tv_sec++;
814 second_overflow(); 1114 second_overflow();
815 } 1115 }
816 } while (ticks); 1116
1117 /* interpolator bits */
1118 time_interpolator_update(clock->xtime_interval
1119 >> clock->shift);
1120 /* increment the NTP state machine */
1121 update_ntp_one_tick();
1122
1123 /* accumulate error between NTP and clock interval */
1124 clock->error += current_tick_length();
1125 clock->error -= clock->xtime_interval << (TICK_LENGTH_SHIFT - clock->shift);
1126 }
1127
1128 /* correct the clock when NTP error is too big */
1129 clocksource_adjust(clock, offset);
1130
1131 /* store full nanoseconds into xtime */
1132 xtime.tv_nsec = clock->xtime_nsec >> clock->shift;
1133 clock->xtime_nsec -= (s64)xtime.tv_nsec << clock->shift;
1134
1135 /* check to see if there is a new clocksource to use */
1136 if (change_clocksource()) {
1137 clock->error = 0;
1138 clock->xtime_nsec = 0;
1139 clocksource_calculate_interval(clock, tick_nsec);
1140 }
817} 1141}
818 1142
819/* 1143/*
@@ -919,10 +1243,8 @@ static inline void update_times(void)
919 unsigned long ticks; 1243 unsigned long ticks;
920 1244
921 ticks = jiffies - wall_jiffies; 1245 ticks = jiffies - wall_jiffies;
922 if (ticks) { 1246 wall_jiffies += ticks;
923 wall_jiffies += ticks; 1247 update_wall_time();
924 update_wall_time(ticks);
925 }
926 calc_load(ticks); 1248 calc_load(ticks);
927} 1249}
928 1250
@@ -1330,7 +1652,7 @@ static void __devinit migrate_timers(int cpu)
1330} 1652}
1331#endif /* CONFIG_HOTPLUG_CPU */ 1653#endif /* CONFIG_HOTPLUG_CPU */
1332 1654
1333static int timer_cpu_notify(struct notifier_block *self, 1655static int __devinit timer_cpu_notify(struct notifier_block *self,
1334 unsigned long action, void *hcpu) 1656 unsigned long action, void *hcpu)
1335{ 1657{
1336 long cpu = (long)hcpu; 1658 long cpu = (long)hcpu;
@@ -1350,7 +1672,7 @@ static int timer_cpu_notify(struct notifier_block *self,
1350 return NOTIFY_OK; 1672 return NOTIFY_OK;
1351} 1673}
1352 1674
1353static struct notifier_block timers_nb = { 1675static struct notifier_block __devinitdata timers_nb = {
1354 .notifier_call = timer_cpu_notify, 1676 .notifier_call = timer_cpu_notify,
1355}; 1677};
1356 1678
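
The core of the new CONFIG_GENERIC_TIME path is the accumulation loop in update_wall_time() above: nanoseconds are carried shifted left by clock->shift, whole cycle_intervals are folded into xtime until the unaccounted cycle offset drops below one interval, and the per-interval difference against current_tick_length() feeds clocksource_adjust(). A stripped-down userspace model of just the accumulation step, assuming a hypothetical 1 MHz counter and a 10 ms tick (no NTP error handling, no locking):

/* Userspace model of the accumulation loop in update_wall_time(). */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* assume a 1 MHz counter: 1000 ns per cycle, shift of 10 */
	const unsigned int shift = 10;
	const uint32_t mult = 1000u << shift;		/* ns per cycle << shift */
	const uint64_t cycle_interval = 10000;		/* cycles per 10 ms tick */
	const uint64_t xtime_interval = cycle_interval * mult;

	uint64_t cycle_last = 0, xtime_nsec = 0, xtime_sec = 0;
	uint64_t now = 25000;				/* pretend 2.5 ticks elapsed */
	uint64_t offset = now - cycle_last;

	while (offset >= cycle_interval) {
		xtime_nsec += xtime_interval;		/* accumulate one interval */
		cycle_last += cycle_interval;
		offset -= cycle_interval;
		if (xtime_nsec >= ((uint64_t)1000000000 << shift)) {
			xtime_nsec -= (uint64_t)1000000000 << shift;
			xtime_sec++;
		}
	}
	printf("sec=%llu nsec=%llu leftover_cycles=%llu\n",
	       (unsigned long long)xtime_sec,
	       (unsigned long long)(xtime_nsec >> shift),
	       (unsigned long long)offset);
	return 0;
}
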
diff --git a/kernel/unwind.c b/kernel/unwind.c
new file mode 100644
index 000000000000..f69c804c8e62
--- /dev/null
+++ b/kernel/unwind.c
@@ -0,0 +1,918 @@
1/*
2 * Copyright (C) 2002-2006 Novell, Inc.
3 * Jan Beulich <jbeulich@novell.com>
4 * This code is released under version 2 of the GNU GPL.
5 *
6 * A simple API for unwinding kernel stacks. This is used for
7 * debugging and error reporting purposes. The kernel doesn't need
8 * full-blown stack unwinding with all the bells and whistles, so there
9 * is not much point in implementing the full Dwarf2 unwind API.
10 */
11
12#include <linux/unwind.h>
13#include <linux/module.h>
14#include <linux/delay.h>
15#include <linux/stop_machine.h>
16#include <asm/sections.h>
17#include <asm/uaccess.h>
18#include <asm/unaligned.h>
19
20extern char __start_unwind[], __end_unwind[];
21
22#define MAX_STACK_DEPTH 8
23
24#define EXTRA_INFO(f) { \
25 BUILD_BUG_ON_ZERO(offsetof(struct unwind_frame_info, f) \
26 % FIELD_SIZEOF(struct unwind_frame_info, f)) \
27 + offsetof(struct unwind_frame_info, f) \
28 / FIELD_SIZEOF(struct unwind_frame_info, f), \
29 FIELD_SIZEOF(struct unwind_frame_info, f) \
30 }
31#define PTREGS_INFO(f) EXTRA_INFO(regs.f)
32
33static const struct {
34 unsigned offs:BITS_PER_LONG / 2;
35 unsigned width:BITS_PER_LONG / 2;
36} reg_info[] = {
37 UNW_REGISTER_INFO
38};
39
40#undef PTREGS_INFO
41#undef EXTRA_INFO
42
43#ifndef REG_INVALID
44#define REG_INVALID(r) (reg_info[r].width == 0)
45#endif
46
47#define DW_CFA_nop 0x00
48#define DW_CFA_set_loc 0x01
49#define DW_CFA_advance_loc1 0x02
50#define DW_CFA_advance_loc2 0x03
51#define DW_CFA_advance_loc4 0x04
52#define DW_CFA_offset_extended 0x05
53#define DW_CFA_restore_extended 0x06
54#define DW_CFA_undefined 0x07
55#define DW_CFA_same_value 0x08
56#define DW_CFA_register 0x09
57#define DW_CFA_remember_state 0x0a
58#define DW_CFA_restore_state 0x0b
59#define DW_CFA_def_cfa 0x0c
60#define DW_CFA_def_cfa_register 0x0d
61#define DW_CFA_def_cfa_offset 0x0e
62#define DW_CFA_def_cfa_expression 0x0f
63#define DW_CFA_expression 0x10
64#define DW_CFA_offset_extended_sf 0x11
65#define DW_CFA_def_cfa_sf 0x12
66#define DW_CFA_def_cfa_offset_sf 0x13
67#define DW_CFA_val_offset 0x14
68#define DW_CFA_val_offset_sf 0x15
69#define DW_CFA_val_expression 0x16
70#define DW_CFA_lo_user 0x1c
71#define DW_CFA_GNU_window_save 0x2d
72#define DW_CFA_GNU_args_size 0x2e
73#define DW_CFA_GNU_negative_offset_extended 0x2f
74#define DW_CFA_hi_user 0x3f
75
76#define DW_EH_PE_FORM 0x07
77#define DW_EH_PE_native 0x00
78#define DW_EH_PE_leb128 0x01
79#define DW_EH_PE_data2 0x02
80#define DW_EH_PE_data4 0x03
81#define DW_EH_PE_data8 0x04
82#define DW_EH_PE_signed 0x08
83#define DW_EH_PE_ADJUST 0x70
84#define DW_EH_PE_abs 0x00
85#define DW_EH_PE_pcrel 0x10
86#define DW_EH_PE_textrel 0x20
87#define DW_EH_PE_datarel 0x30
88#define DW_EH_PE_funcrel 0x40
89#define DW_EH_PE_aligned 0x50
90#define DW_EH_PE_indirect 0x80
91#define DW_EH_PE_omit 0xff
92
93typedef unsigned long uleb128_t;
94typedef signed long sleb128_t;
95
96static struct unwind_table {
97 struct {
98 unsigned long pc;
99 unsigned long range;
100 } core, init;
101 const void *address;
102 unsigned long size;
103 struct unwind_table *link;
104 const char *name;
105} root_table, *last_table;
106
107struct unwind_item {
108 enum item_location {
109 Nowhere,
110 Memory,
111 Register,
112 Value
113 } where;
114 uleb128_t value;
115};
116
117struct unwind_state {
118 uleb128_t loc, org;
119 const u8 *cieStart, *cieEnd;
120 uleb128_t codeAlign;
121 sleb128_t dataAlign;
122 struct cfa {
123 uleb128_t reg, offs;
124 } cfa;
125 struct unwind_item regs[ARRAY_SIZE(reg_info)];
126 unsigned stackDepth:8;
127 unsigned version:8;
128 const u8 *label;
129 const u8 *stack[MAX_STACK_DEPTH];
130};
131
132static const struct cfa badCFA = { ARRAY_SIZE(reg_info), 1 };
133
134static struct unwind_table *find_table(unsigned long pc)
135{
136 struct unwind_table *table;
137
138 for (table = &root_table; table; table = table->link)
139 if ((pc >= table->core.pc
140 && pc < table->core.pc + table->core.range)
141 || (pc >= table->init.pc
142 && pc < table->init.pc + table->init.range))
143 break;
144
145 return table;
146}
147
148static void init_unwind_table(struct unwind_table *table,
149 const char *name,
150 const void *core_start,
151 unsigned long core_size,
152 const void *init_start,
153 unsigned long init_size,
154 const void *table_start,
155 unsigned long table_size)
156{
157 table->core.pc = (unsigned long)core_start;
158 table->core.range = core_size;
159 table->init.pc = (unsigned long)init_start;
160 table->init.range = init_size;
161 table->address = table_start;
162 table->size = table_size;
163 table->link = NULL;
164 table->name = name;
165}
166
167void __init unwind_init(void)
168{
169 init_unwind_table(&root_table, "kernel",
170 _text, _end - _text,
171 NULL, 0,
172 __start_unwind, __end_unwind - __start_unwind);
173}
174
175#ifdef CONFIG_MODULES
176
177/* Must be called with module_mutex held. */
178void *unwind_add_table(struct module *module,
179 const void *table_start,
180 unsigned long table_size)
181{
182 struct unwind_table *table;
183
184 if (table_size <= 0)
185 return NULL;
186
187 table = kmalloc(sizeof(*table), GFP_KERNEL);
188 if (!table)
189 return NULL;
190
191 init_unwind_table(table, module->name,
192 module->module_core, module->core_size,
193 module->module_init, module->init_size,
194 table_start, table_size);
195
196 if (last_table)
197 last_table->link = table;
198 else
199 root_table.link = table;
200 last_table = table;
201
202 return table;
203}
204
205struct unlink_table_info
206{
207 struct unwind_table *table;
208 int init_only;
209};
210
211static int unlink_table(void *arg)
212{
213 struct unlink_table_info *info = arg;
214 struct unwind_table *table = info->table, *prev;
215
216 for (prev = &root_table; prev->link && prev->link != table; prev = prev->link)
217 ;
218
219 if (prev->link) {
220 if (info->init_only) {
221 table->init.pc = 0;
222 table->init.range = 0;
223 info->table = NULL;
224 } else {
225 prev->link = table->link;
226 if (!prev->link)
227 last_table = prev;
228 }
229 } else
230 info->table = NULL;
231
232 return 0;
233}
234
235/* Must be called with module_mutex held. */
236void unwind_remove_table(void *handle, int init_only)
237{
238 struct unwind_table *table = handle;
239 struct unlink_table_info info;
240
241 if (!table || table == &root_table)
242 return;
243
244 if (init_only && table == last_table) {
245 table->init.pc = 0;
246 table->init.range = 0;
247 return;
248 }
249
250 info.table = table;
251 info.init_only = init_only;
252 stop_machine_run(unlink_table, &info, NR_CPUS);
253
254 if (info.table)
255 kfree(table);
256}
257
258#endif /* CONFIG_MODULES */
259
260static uleb128_t get_uleb128(const u8 **pcur, const u8 *end)
261{
262 const u8 *cur = *pcur;
263 uleb128_t value;
264 unsigned shift;
265
266 for (shift = 0, value = 0; cur < end; shift += 7) {
267 if (shift + 7 > 8 * sizeof(value)
268 && (*cur & 0x7fU) >= (1U << (8 * sizeof(value) - shift))) {
269 cur = end + 1;
270 break;
271 }
272 value |= (uleb128_t)(*cur & 0x7f) << shift;
273 if (!(*cur++ & 0x80))
274 break;
275 }
276 *pcur = cur;
277
278 return value;
279}
280
281static sleb128_t get_sleb128(const u8 **pcur, const u8 *end)
282{
283 const u8 *cur = *pcur;
284 sleb128_t value;
285 unsigned shift;
286
287 for (shift = 0, value = 0; cur < end; shift += 7) {
288 if (shift + 7 > 8 * sizeof(value)
289 && (*cur & 0x7fU) >= (1U << (8 * sizeof(value) - shift))) {
290 cur = end + 1;
291 break;
292 }
293 value |= (sleb128_t)(*cur & 0x7f) << shift;
294 if (!(*cur & 0x80)) {
295 value |= -(*cur++ & 0x40) << shift;
296 break;
297 }
298 }
299 *pcur = cur;
300
301 return value;
302}
303
304static unsigned long read_pointer(const u8 **pLoc,
305 const void *end,
306 signed ptrType)
307{
308 unsigned long value = 0;
309 union {
310 const u8 *p8;
311 const u16 *p16u;
312 const s16 *p16s;
313 const u32 *p32u;
314 const s32 *p32s;
315 const unsigned long *pul;
316 } ptr;
317
318 if (ptrType < 0 || ptrType == DW_EH_PE_omit)
319 return 0;
320 ptr.p8 = *pLoc;
321 switch(ptrType & DW_EH_PE_FORM) {
322 case DW_EH_PE_data2:
323 if (end < (const void *)(ptr.p16u + 1))
324 return 0;
325 if(ptrType & DW_EH_PE_signed)
326 value = get_unaligned(ptr.p16s++);
327 else
328 value = get_unaligned(ptr.p16u++);
329 break;
330 case DW_EH_PE_data4:
331#ifdef CONFIG_64BIT
332 if (end < (const void *)(ptr.p32u + 1))
333 return 0;
334 if(ptrType & DW_EH_PE_signed)
335 value = get_unaligned(ptr.p32s++);
336 else
337 value = get_unaligned(ptr.p32u++);
338 break;
339 case DW_EH_PE_data8:
340 BUILD_BUG_ON(sizeof(u64) != sizeof(value));
341#else
342 BUILD_BUG_ON(sizeof(u32) != sizeof(value));
343#endif
344 case DW_EH_PE_native:
345 if (end < (const void *)(ptr.pul + 1))
346 return 0;
347 value = get_unaligned(ptr.pul++);
348 break;
349 case DW_EH_PE_leb128:
350 BUILD_BUG_ON(sizeof(uleb128_t) > sizeof(value));
351 value = ptrType & DW_EH_PE_signed
352 ? get_sleb128(&ptr.p8, end)
353 : get_uleb128(&ptr.p8, end);
354 if ((const void *)ptr.p8 > end)
355 return 0;
356 break;
357 default:
358 return 0;
359 }
360 switch(ptrType & DW_EH_PE_ADJUST) {
361 case DW_EH_PE_abs:
362 break;
363 case DW_EH_PE_pcrel:
364 value += (unsigned long)*pLoc;
365 break;
366 default:
367 return 0;
368 }
369 if ((ptrType & DW_EH_PE_indirect)
370 && __get_user(value, (unsigned long *)value))
371 return 0;
372 *pLoc = ptr.p8;
373
374 return value;
375}
376
377static signed fde_pointer_type(const u32 *cie)
378{
379 const u8 *ptr = (const u8 *)(cie + 2);
380 unsigned version = *ptr;
381
382 if (version != 1)
383 return -1; /* unsupported */
384 if (*++ptr) {
385 const char *aug;
386 const u8 *end = (const u8 *)(cie + 1) + *cie;
387 uleb128_t len;
388
389 /* check if augmentation size is first (and thus present) */
390 if (*ptr != 'z')
391 return -1;
392 /* check if augmentation string is nul-terminated */
393 if ((ptr = memchr(aug = (const void *)ptr, 0, end - ptr)) == NULL)
394 return -1;
395 ++ptr; /* skip terminator */
396 get_uleb128(&ptr, end); /* skip code alignment */
397 get_sleb128(&ptr, end); /* skip data alignment */
398 /* skip return address column */
399 version <= 1 ? (void)++ptr : (void)get_uleb128(&ptr, end);
400 len = get_uleb128(&ptr, end); /* augmentation length */
401 if (ptr + len < ptr || ptr + len > end)
402 return -1;
403 end = ptr + len;
404 while (*++aug) {
405 if (ptr >= end)
406 return -1;
407 switch(*aug) {
408 case 'L':
409 ++ptr;
410 break;
411 case 'P': {
412 signed ptrType = *ptr++;
413
414 if (!read_pointer(&ptr, end, ptrType) || ptr > end)
415 return -1;
416 }
417 break;
418 case 'R':
419 return *ptr;
420 default:
421 return -1;
422 }
423 }
424 }
425 return DW_EH_PE_native|DW_EH_PE_abs;
426}
427
428static int advance_loc(unsigned long delta, struct unwind_state *state)
429{
430 state->loc += delta * state->codeAlign;
431
432 return delta > 0;
433}
434
435static void set_rule(uleb128_t reg,
436 enum item_location where,
437 uleb128_t value,
438 struct unwind_state *state)
439{
440 if (reg < ARRAY_SIZE(state->regs)) {
441 state->regs[reg].where = where;
442 state->regs[reg].value = value;
443 }
444}
445
446static int processCFI(const u8 *start,
447 const u8 *end,
448 unsigned long targetLoc,
449 signed ptrType,
450 struct unwind_state *state)
451{
452 union {
453 const u8 *p8;
454 const u16 *p16;
455 const u32 *p32;
456 } ptr;
457 int result = 1;
458
459 if (start != state->cieStart) {
460 state->loc = state->org;
461 result = processCFI(state->cieStart, state->cieEnd, 0, ptrType, state);
462 if (targetLoc == 0 && state->label == NULL)
463 return result;
464 }
465 for (ptr.p8 = start; result && ptr.p8 < end; ) {
466 switch(*ptr.p8 >> 6) {
467 uleb128_t value;
468
469 case 0:
470 switch(*ptr.p8++) {
471 case DW_CFA_nop:
472 break;
473 case DW_CFA_set_loc:
474 if ((state->loc = read_pointer(&ptr.p8, end, ptrType)) == 0)
475 result = 0;
476 break;
477 case DW_CFA_advance_loc1:
478 result = ptr.p8 < end && advance_loc(*ptr.p8++, state);
479 break;
480 case DW_CFA_advance_loc2:
481 result = ptr.p8 <= end + 2
482 && advance_loc(*ptr.p16++, state);
483 break;
484 case DW_CFA_advance_loc4:
485 result = ptr.p8 <= end + 4
486 && advance_loc(*ptr.p32++, state);
487 break;
488 case DW_CFA_offset_extended:
489 value = get_uleb128(&ptr.p8, end);
490 set_rule(value, Memory, get_uleb128(&ptr.p8, end), state);
491 break;
492 case DW_CFA_val_offset:
493 value = get_uleb128(&ptr.p8, end);
494 set_rule(value, Value, get_uleb128(&ptr.p8, end), state);
495 break;
496 case DW_CFA_offset_extended_sf:
497 value = get_uleb128(&ptr.p8, end);
498 set_rule(value, Memory, get_sleb128(&ptr.p8, end), state);
499 break;
500 case DW_CFA_val_offset_sf:
501 value = get_uleb128(&ptr.p8, end);
502 set_rule(value, Value, get_sleb128(&ptr.p8, end), state);
503 break;
504 case DW_CFA_restore_extended:
505 case DW_CFA_undefined:
506 case DW_CFA_same_value:
507 set_rule(get_uleb128(&ptr.p8, end), Nowhere, 0, state);
508 break;
509 case DW_CFA_register:
510 value = get_uleb128(&ptr.p8, end);
511 set_rule(value,
512 Register,
513 get_uleb128(&ptr.p8, end), state);
514 break;
515 case DW_CFA_remember_state:
516 if (ptr.p8 == state->label) {
517 state->label = NULL;
518 return 1;
519 }
520 if (state->stackDepth >= MAX_STACK_DEPTH)
521 return 0;
522 state->stack[state->stackDepth++] = ptr.p8;
523 break;
524 case DW_CFA_restore_state:
525 if (state->stackDepth) {
526 const uleb128_t loc = state->loc;
527 const u8 *label = state->label;
528
529 state->label = state->stack[state->stackDepth - 1];
530 memcpy(&state->cfa, &badCFA, sizeof(state->cfa));
531 memset(state->regs, 0, sizeof(state->regs));
532 state->stackDepth = 0;
533 result = processCFI(start, end, 0, ptrType, state);
534 state->loc = loc;
535 state->label = label;
536 } else
537 return 0;
538 break;
539 case DW_CFA_def_cfa:
540 state->cfa.reg = get_uleb128(&ptr.p8, end);
541 /*nobreak*/
542 case DW_CFA_def_cfa_offset:
543 state->cfa.offs = get_uleb128(&ptr.p8, end);
544 break;
545 case DW_CFA_def_cfa_sf:
546 state->cfa.reg = get_uleb128(&ptr.p8, end);
547 /*nobreak*/
548 case DW_CFA_def_cfa_offset_sf:
549 state->cfa.offs = get_sleb128(&ptr.p8, end)
550 * state->dataAlign;
551 break;
552 case DW_CFA_def_cfa_register:
553 state->cfa.reg = get_uleb128(&ptr.p8, end);
554 break;
555 /*todo case DW_CFA_def_cfa_expression: */
556 /*todo case DW_CFA_expression: */
557 /*todo case DW_CFA_val_expression: */
558 case DW_CFA_GNU_args_size:
559 get_uleb128(&ptr.p8, end);
560 break;
561 case DW_CFA_GNU_negative_offset_extended:
562 value = get_uleb128(&ptr.p8, end);
563 set_rule(value,
564 Memory,
565 (uleb128_t)0 - get_uleb128(&ptr.p8, end), state);
566 break;
567 case DW_CFA_GNU_window_save:
568 default:
569 result = 0;
570 break;
571 }
572 break;
573 case 1:
574 result = advance_loc(*ptr.p8++ & 0x3f, state);
575 break;
576 case 2:
577 value = *ptr.p8++ & 0x3f;
578 set_rule(value, Memory, get_uleb128(&ptr.p8, end), state);
579 break;
580 case 3:
581 set_rule(*ptr.p8++ & 0x3f, Nowhere, 0, state);
582 break;
583 }
584 if (ptr.p8 > end)
585 result = 0;
586 if (result && targetLoc != 0 && targetLoc < state->loc)
587 return 1;
588 }
589
590 return result
591 && ptr.p8 == end
592 && (targetLoc == 0
593 || (/*todo While in theory this should apply, gcc in practice omits
594 everything past the function prolog, and hence the location
595 never reaches the end of the function.
596 targetLoc < state->loc &&*/ state->label == NULL));
597}
598
 599/* Unwind to the previous frame. Returns 0 if successful, negative
600 * number in case of an error. */
601int unwind(struct unwind_frame_info *frame)
602{
603#define FRAME_REG(r, t) (((t *)frame)[reg_info[r].offs])
604 const u32 *fde = NULL, *cie = NULL;
605 const u8 *ptr = NULL, *end = NULL;
606 unsigned long startLoc = 0, endLoc = 0, cfa;
607 unsigned i;
608 signed ptrType = -1;
609 uleb128_t retAddrReg = 0;
610 struct unwind_table *table;
611 struct unwind_state state;
612
613 if (UNW_PC(frame) == 0)
614 return -EINVAL;
615 if ((table = find_table(UNW_PC(frame))) != NULL
616 && !(table->size & (sizeof(*fde) - 1))) {
617 unsigned long tableSize = table->size;
618
619 for (fde = table->address;
620 tableSize > sizeof(*fde) && tableSize - sizeof(*fde) >= *fde;
621 tableSize -= sizeof(*fde) + *fde,
622 fde += 1 + *fde / sizeof(*fde)) {
623 if (!*fde || (*fde & (sizeof(*fde) - 1)))
624 break;
625 if (!fde[1])
626 continue; /* this is a CIE */
627 if ((fde[1] & (sizeof(*fde) - 1))
628 || fde[1] > (unsigned long)(fde + 1)
629 - (unsigned long)table->address)
630 continue; /* this is not a valid FDE */
631 cie = fde + 1 - fde[1] / sizeof(*fde);
632 if (*cie <= sizeof(*cie) + 4
633 || *cie >= fde[1] - sizeof(*fde)
634 || (*cie & (sizeof(*cie) - 1))
635 || cie[1]
636 || (ptrType = fde_pointer_type(cie)) < 0) {
637 cie = NULL; /* this is not a (valid) CIE */
638 continue;
639 }
640 ptr = (const u8 *)(fde + 2);
641 startLoc = read_pointer(&ptr,
642 (const u8 *)(fde + 1) + *fde,
643 ptrType);
644 endLoc = startLoc
645 + read_pointer(&ptr,
646 (const u8 *)(fde + 1) + *fde,
647 ptrType & DW_EH_PE_indirect
648 ? ptrType
649 : ptrType & (DW_EH_PE_FORM|DW_EH_PE_signed));
650 if (UNW_PC(frame) >= startLoc && UNW_PC(frame) < endLoc)
651 break;
652 cie = NULL;
653 }
654 }
655 if (cie != NULL) {
656 memset(&state, 0, sizeof(state));
657 state.cieEnd = ptr; /* keep here temporarily */
658 ptr = (const u8 *)(cie + 2);
659 end = (const u8 *)(cie + 1) + *cie;
660 if ((state.version = *ptr) != 1)
661 cie = NULL; /* unsupported version */
662 else if (*++ptr) {
663 /* check if augmentation size is first (and thus present) */
664 if (*ptr == 'z') {
665 /* check for ignorable (or already handled)
666 * nul-terminated augmentation string */
667 while (++ptr < end && *ptr)
668 if (strchr("LPR", *ptr) == NULL)
669 break;
670 }
671 if (ptr >= end || *ptr)
672 cie = NULL;
673 }
674 ++ptr;
675 }
676 if (cie != NULL) {
 677 /* get code alignment factor */
678 state.codeAlign = get_uleb128(&ptr, end);
 679 /* get data alignment factor */
680 state.dataAlign = get_sleb128(&ptr, end);
681 if (state.codeAlign == 0 || state.dataAlign == 0 || ptr >= end)
682 cie = NULL;
683 else {
684 retAddrReg = state.version <= 1 ? *ptr++ : get_uleb128(&ptr, end);
685 /* skip augmentation */
686 if (((const char *)(cie + 2))[1] == 'z')
687 ptr += get_uleb128(&ptr, end);
688 if (ptr > end
689 || retAddrReg >= ARRAY_SIZE(reg_info)
690 || REG_INVALID(retAddrReg)
691 || reg_info[retAddrReg].width != sizeof(unsigned long))
692 cie = NULL;
693 }
694 }
695 if (cie != NULL) {
696 state.cieStart = ptr;
697 ptr = state.cieEnd;
698 state.cieEnd = end;
699 end = (const u8 *)(fde + 1) + *fde;
700 /* skip augmentation */
701 if (((const char *)(cie + 2))[1] == 'z') {
702 uleb128_t augSize = get_uleb128(&ptr, end);
703
704 if ((ptr += augSize) > end)
705 fde = NULL;
706 }
707 }
708 if (cie == NULL || fde == NULL) {
709#ifdef CONFIG_FRAME_POINTER
710 unsigned long top, bottom;
711#endif
712
713#ifdef CONFIG_FRAME_POINTER
714 top = STACK_TOP(frame->task);
715 bottom = STACK_BOTTOM(frame->task);
716# if FRAME_RETADDR_OFFSET < 0
717 if (UNW_SP(frame) < top
718 && UNW_FP(frame) <= UNW_SP(frame)
719 && bottom < UNW_FP(frame)
720# else
721 if (UNW_SP(frame) > top
722 && UNW_FP(frame) >= UNW_SP(frame)
723 && bottom > UNW_FP(frame)
724# endif
725 && !((UNW_SP(frame) | UNW_FP(frame))
726 & (sizeof(unsigned long) - 1))) {
727 unsigned long link;
728
729 if (!__get_user(link,
730 (unsigned long *)(UNW_FP(frame)
731 + FRAME_LINK_OFFSET))
732# if FRAME_RETADDR_OFFSET < 0
733 && link > bottom && link < UNW_FP(frame)
734# else
735 && link > UNW_FP(frame) && link < bottom
736# endif
737 && !(link & (sizeof(link) - 1))
738 && !__get_user(UNW_PC(frame),
739 (unsigned long *)(UNW_FP(frame)
740 + FRAME_RETADDR_OFFSET))) {
741 UNW_SP(frame) = UNW_FP(frame) + FRAME_RETADDR_OFFSET
742# if FRAME_RETADDR_OFFSET < 0
743 -
744# else
745 +
746# endif
747 sizeof(UNW_PC(frame));
748 UNW_FP(frame) = link;
749 return 0;
750 }
751 }
752#endif
753 return -ENXIO;
754 }
755 state.org = startLoc;
756 memcpy(&state.cfa, &badCFA, sizeof(state.cfa));
757 /* process instructions */
758 if (!processCFI(ptr, end, UNW_PC(frame), ptrType, &state)
759 || state.loc > endLoc
760 || state.regs[retAddrReg].where == Nowhere
761 || state.cfa.reg >= ARRAY_SIZE(reg_info)
762 || reg_info[state.cfa.reg].width != sizeof(unsigned long)
763 || state.cfa.offs % sizeof(unsigned long))
764 return -EIO;
765 /* update frame */
766 cfa = FRAME_REG(state.cfa.reg, unsigned long) + state.cfa.offs;
767 startLoc = min((unsigned long)UNW_SP(frame), cfa);
768 endLoc = max((unsigned long)UNW_SP(frame), cfa);
769 if (STACK_LIMIT(startLoc) != STACK_LIMIT(endLoc)) {
770 startLoc = min(STACK_LIMIT(cfa), cfa);
771 endLoc = max(STACK_LIMIT(cfa), cfa);
772 }
773#ifndef CONFIG_64BIT
774# define CASES CASE(8); CASE(16); CASE(32)
775#else
776# define CASES CASE(8); CASE(16); CASE(32); CASE(64)
777#endif
778 for (i = 0; i < ARRAY_SIZE(state.regs); ++i) {
779 if (REG_INVALID(i)) {
780 if (state.regs[i].where == Nowhere)
781 continue;
782 return -EIO;
783 }
784 switch(state.regs[i].where) {
785 default:
786 break;
787 case Register:
788 if (state.regs[i].value >= ARRAY_SIZE(reg_info)
789 || REG_INVALID(state.regs[i].value)
790 || reg_info[i].width > reg_info[state.regs[i].value].width)
791 return -EIO;
792 switch(reg_info[state.regs[i].value].width) {
793#define CASE(n) \
794 case sizeof(u##n): \
795 state.regs[i].value = FRAME_REG(state.regs[i].value, \
796 const u##n); \
797 break
798 CASES;
799#undef CASE
800 default:
801 return -EIO;
802 }
803 break;
804 }
805 }
806 for (i = 0; i < ARRAY_SIZE(state.regs); ++i) {
807 if (REG_INVALID(i))
808 continue;
809 switch(state.regs[i].where) {
810 case Nowhere:
811 if (reg_info[i].width != sizeof(UNW_SP(frame))
812 || &FRAME_REG(i, __typeof__(UNW_SP(frame)))
813 != &UNW_SP(frame))
814 continue;
815 UNW_SP(frame) = cfa;
816 break;
817 case Register:
818 switch(reg_info[i].width) {
819#define CASE(n) case sizeof(u##n): \
820 FRAME_REG(i, u##n) = state.regs[i].value; \
821 break
822 CASES;
823#undef CASE
824 default:
825 return -EIO;
826 }
827 break;
828 case Value:
829 if (reg_info[i].width != sizeof(unsigned long))
830 return -EIO;
831 FRAME_REG(i, unsigned long) = cfa + state.regs[i].value
832 * state.dataAlign;
833 break;
834 case Memory: {
835 unsigned long addr = cfa + state.regs[i].value
836 * state.dataAlign;
837
838 if ((state.regs[i].value * state.dataAlign)
839 % sizeof(unsigned long)
840 || addr < startLoc
841 || addr + sizeof(unsigned long) < addr
842 || addr + sizeof(unsigned long) > endLoc)
843 return -EIO;
844 switch(reg_info[i].width) {
845#define CASE(n) case sizeof(u##n): \
846 __get_user(FRAME_REG(i, u##n), (u##n *)addr); \
847 break
848 CASES;
849#undef CASE
850 default:
851 return -EIO;
852 }
853 }
854 break;
855 }
856 }
857
858 return 0;
859#undef CASES
860#undef FRAME_REG
861}
862EXPORT_SYMBOL(unwind);
863
864int unwind_init_frame_info(struct unwind_frame_info *info,
865 struct task_struct *tsk,
866 /*const*/ struct pt_regs *regs)
867{
868 info->task = tsk;
869 arch_unw_init_frame_info(info, regs);
870
871 return 0;
872}
873EXPORT_SYMBOL(unwind_init_frame_info);
874
875/*
876 * Prepare to unwind a blocked task.
877 */
878int unwind_init_blocked(struct unwind_frame_info *info,
879 struct task_struct *tsk)
880{
881 info->task = tsk;
882 arch_unw_init_blocked(info);
883
884 return 0;
885}
886EXPORT_SYMBOL(unwind_init_blocked);
887
888/*
889 * Prepare to unwind the currently running thread.
890 */
891int unwind_init_running(struct unwind_frame_info *info,
892 asmlinkage int (*callback)(struct unwind_frame_info *,
893 void *arg),
894 void *arg)
895{
896 info->task = current;
897
898 return arch_unwind_init_running(info, callback, arg);
899}
900EXPORT_SYMBOL(unwind_init_running);
901
902/*
903 * Unwind until the return pointer is in user-land (or until an error
904 * occurs). Returns 0 if successful, negative number in case of
905 * error.
906 */
907int unwind_to_user(struct unwind_frame_info *info)
908{
909 while (!arch_unw_user_mode(info)) {
910 int err = unwind(info);
911
912 if (err < 0)
913 return err;
914 }
915
916 return 0;
917}
918EXPORT_SYMBOL(unwind_to_user);
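
The exported surface of the unwinder is small: initialize an unwind_frame_info for a blocked task, the current thread or a pt_regs snapshot, then call unwind() repeatedly until it fails or the frame reaches user mode. A hedged sketch of a caller, relying only on the arch-provided UNW_PC() and arch_unw_user_mode() helpers already used above; a real user would also resolve the addresses to symbols:

/* Illustrative caller of the unwind API, not part of the patch. */
#include <linux/unwind.h>
#include <linux/sched.h>
#include <linux/kernel.h>

static void dump_blocked_task(struct task_struct *tsk)
{
	struct unwind_frame_info info;

	if (unwind_init_blocked(&info, tsk) != 0)
		return;

	/* walk frames until an error or until we reach user space */
	while (!arch_unw_user_mode(&info)) {
		if (unwind(&info) < 0)
			break;
		printk(KERN_DEBUG "  [<%08lx>]\n", (unsigned long)UNW_PC(&info));
	}
}
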
diff --git a/kernel/user.c b/kernel/user.c
index 2116642f42c6..6408c0424291 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -140,7 +140,7 @@ struct user_struct * alloc_uid(uid_t uid)
140 atomic_set(&new->processes, 0); 140 atomic_set(&new->processes, 0);
141 atomic_set(&new->files, 0); 141 atomic_set(&new->files, 0);
142 atomic_set(&new->sigpending, 0); 142 atomic_set(&new->sigpending, 0);
143#ifdef CONFIG_INOTIFY 143#ifdef CONFIG_INOTIFY_USER
144 atomic_set(&new->inotify_watches, 0); 144 atomic_set(&new->inotify_watches, 0);
145 atomic_set(&new->inotify_devs, 0); 145 atomic_set(&new->inotify_devs, 0);
146#endif 146#endif
@@ -148,7 +148,7 @@ struct user_struct * alloc_uid(uid_t uid)
148 new->mq_bytes = 0; 148 new->mq_bytes = 0;
149 new->locked_shm = 0; 149 new->locked_shm = 0;
150 150
151 if (alloc_uid_keyring(new) < 0) { 151 if (alloc_uid_keyring(new, current) < 0) {
152 kmem_cache_free(uid_cachep, new); 152 kmem_cache_free(uid_cachep, new);
153 return NULL; 153 return NULL;
154 } 154 }
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 880fb415a8f6..59f0b42bd89e 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -428,22 +428,34 @@ int schedule_delayed_work_on(int cpu,
428 return ret; 428 return ret;
429} 429}
430 430
431int schedule_on_each_cpu(void (*func) (void *info), void *info) 431/**
432 * schedule_on_each_cpu - call a function on each online CPU from keventd
433 * @func: the function to call
434 * @info: a pointer to pass to func()
435 *
436 * Returns zero on success.
437 * Returns -ve errno on failure.
438 *
439 * Appears to be racy against CPU hotplug.
440 *
441 * schedule_on_each_cpu() is very slow.
442 */
443int schedule_on_each_cpu(void (*func)(void *info), void *info)
432{ 444{
433 int cpu; 445 int cpu;
434 struct work_struct *work; 446 struct work_struct *works;
435 447
436 work = kmalloc(NR_CPUS * sizeof(struct work_struct), GFP_KERNEL); 448 works = alloc_percpu(struct work_struct);
437 449 if (!works)
438 if (!work)
439 return -ENOMEM; 450 return -ENOMEM;
451
440 for_each_online_cpu(cpu) { 452 for_each_online_cpu(cpu) {
441 INIT_WORK(work + cpu, func, info); 453 INIT_WORK(per_cpu_ptr(works, cpu), func, info);
442 __queue_work(per_cpu_ptr(keventd_wq->cpu_wq, cpu), 454 __queue_work(per_cpu_ptr(keventd_wq->cpu_wq, cpu),
443 work + cpu); 455 per_cpu_ptr(works, cpu));
444 } 456 }
445 flush_workqueue(keventd_wq); 457 flush_workqueue(keventd_wq);
446 kfree(work); 458 free_percpu(works);
447 return 0; 459 return 0;
448} 460}
449 461
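
With the switch to alloc_percpu(), schedule_on_each_cpu() no longer sizes its scratch space by NR_CPUS, but the calling convention is unchanged: a function plus an opaque pointer that keventd runs once on every online CPU, sleeping until all of them have finished. A hypothetical caller, for illustration only:

/* Hypothetical caller of schedule_on_each_cpu(). */
#include <linux/workqueue.h>
#include <linux/smp.h>
#include <linux/kernel.h>

static void flush_local_stats(void *info)
{
	/* runs in keventd context, bound to one CPU at a time */
	printk(KERN_DEBUG "flushing stats on cpu %d\n", raw_smp_processor_id());
}

static int flush_all_stats(void)
{
	/* may sleep: flushes keventd's workqueue on every online CPU */
	return schedule_on_each_cpu(flush_local_stats, NULL);
}
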
@@ -531,11 +543,11 @@ int current_is_keventd(void)
531static void take_over_work(struct workqueue_struct *wq, unsigned int cpu) 543static void take_over_work(struct workqueue_struct *wq, unsigned int cpu)
532{ 544{
533 struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu); 545 struct cpu_workqueue_struct *cwq = per_cpu_ptr(wq->cpu_wq, cpu);
534 LIST_HEAD(list); 546 struct list_head list;
535 struct work_struct *work; 547 struct work_struct *work;
536 548
537 spin_lock_irq(&cwq->lock); 549 spin_lock_irq(&cwq->lock);
538 list_splice_init(&cwq->worklist, &list); 550 list_replace_init(&cwq->worklist, &list);
539 551
540 while (!list_empty(&list)) { 552 while (!list_empty(&list)) {
541 printk("Taking work for %s\n", wq->name); 553 printk("Taking work for %s\n", wq->name);
@@ -547,7 +559,7 @@ static void take_over_work(struct workqueue_struct *wq, unsigned int cpu)
547} 559}
548 560
549/* We're holding the cpucontrol mutex here */ 561/* We're holding the cpucontrol mutex here */
550static int workqueue_cpu_callback(struct notifier_block *nfb, 562static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
551 unsigned long action, 563 unsigned long action,
552 void *hcpu) 564 void *hcpu)
553{ 565{
@@ -578,6 +590,8 @@ static int workqueue_cpu_callback(struct notifier_block *nfb,
578 590
579 case CPU_UP_CANCELED: 591 case CPU_UP_CANCELED:
580 list_for_each_entry(wq, &workqueues, list) { 592 list_for_each_entry(wq, &workqueues, list) {
593 if (!per_cpu_ptr(wq->cpu_wq, hotcpu)->thread)
594 continue;
581 /* Unbind so it can run. */ 595 /* Unbind so it can run. */
582 kthread_bind(per_cpu_ptr(wq->cpu_wq, hotcpu)->thread, 596 kthread_bind(per_cpu_ptr(wq->cpu_wq, hotcpu)->thread,
583 any_online_cpu(cpu_online_map)); 597 any_online_cpu(cpu_online_map));