about summary refs log tree commit diff stats
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r-- kernel/Makefile 5
-rw-r--r-- kernel/acct.c 3
-rw-r--r-- kernel/audit.c 2
-rw-r--r-- kernel/auditsc.c 10
-rw-r--r-- kernel/cpu.c 18
-rw-r--r-- kernel/cpuset.c 26
-rw-r--r-- kernel/exit.c 16
-rw-r--r-- kernel/fork.c 34
-rw-r--r-- kernel/futex.c 1067
-rw-r--r-- kernel/futex_compat.c 14
-rw-r--r-- kernel/hrtimer.c 4
-rw-r--r-- kernel/kprobes.c 58
-rw-r--r-- kernel/module.c 27
-rw-r--r-- kernel/mutex-debug.c 17
-rw-r--r-- kernel/mutex-debug.h 25
-rw-r--r-- kernel/mutex.c 21
-rw-r--r-- kernel/mutex.h 6
-rw-r--r-- kernel/power/Kconfig 13
-rw-r--r-- kernel/profile.c 2
-rw-r--r-- kernel/ptrace.c 23
-rw-r--r-- kernel/rcupdate.c 14
-rw-r--r-- kernel/rcutorture.c 201
-rw-r--r-- kernel/resource.c 38
-rw-r--r-- kernel/rtmutex-debug.c 513
-rw-r--r-- kernel/rtmutex-debug.h 37
-rw-r--r-- kernel/rtmutex-tester.c 440
-rw-r--r-- kernel/rtmutex.c 990
-rw-r--r-- kernel/rtmutex.h 29
-rw-r--r-- kernel/rtmutex_common.h 123
-rw-r--r-- kernel/sched.c 1210
-rw-r--r-- kernel/signal.c 35
-rw-r--r-- kernel/softirq.c 4
-rw-r--r-- kernel/softlockup.c 4
-rw-r--r-- kernel/sysctl.c 38
-rw-r--r-- kernel/time.c 2
-rw-r--r-- kernel/time/Makefile 1
-rw-r--r-- kernel/time/clocksource.c 349
-rw-r--r-- kernel/time/jiffies.c 73
-rw-r--r-- kernel/timer.c 400
-rw-r--r-- kernel/unwind.c 918
-rw-r--r-- kernel/workqueue.c 2
41 files changed, 6106 insertions, 706 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index f6ef00f4f90f..82fb182f6f61 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -10,17 +10,22 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \
10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ 10 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
11 hrtimer.o 11 hrtimer.o
12 12
13obj-y += time/
13obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o 14obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o
14obj-$(CONFIG_FUTEX) += futex.o 15obj-$(CONFIG_FUTEX) += futex.o
15ifeq ($(CONFIG_COMPAT),y) 16ifeq ($(CONFIG_COMPAT),y)
16obj-$(CONFIG_FUTEX) += futex_compat.o 17obj-$(CONFIG_FUTEX) += futex_compat.o
17endif 18endif
19obj-$(CONFIG_RT_MUTEXES) += rtmutex.o
20obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
21obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o
18obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o 22obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
19obj-$(CONFIG_SMP) += cpu.o spinlock.o 23obj-$(CONFIG_SMP) += cpu.o spinlock.o
20obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o 24obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
21obj-$(CONFIG_UID16) += uid16.o 25obj-$(CONFIG_UID16) += uid16.o
22obj-$(CONFIG_MODULES) += module.o 26obj-$(CONFIG_MODULES) += module.o
23obj-$(CONFIG_KALLSYMS) += kallsyms.o 27obj-$(CONFIG_KALLSYMS) += kallsyms.o
28obj-$(CONFIG_STACK_UNWIND) += unwind.o
24obj-$(CONFIG_PM) += power/ 29obj-$(CONFIG_PM) += power/
25obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o 30obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
26obj-$(CONFIG_KEXEC) += kexec.o 31obj-$(CONFIG_KEXEC) += kexec.o
diff --git a/kernel/acct.c b/kernel/acct.c
index 368c4f03fe0e..126ca43d5d2b 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -521,6 +521,7 @@ static void do_acct_process(struct file *file)
521 521
522/** 522/**
523 * acct_init_pacct - initialize a new pacct_struct 523 * acct_init_pacct - initialize a new pacct_struct
524 * @pacct: per-process accounting info struct to initialize
524 */ 525 */
525void acct_init_pacct(struct pacct_struct *pacct) 526void acct_init_pacct(struct pacct_struct *pacct)
526{ 527{
@@ -576,7 +577,7 @@ void acct_collect(long exitcode, int group_dead)
576 * 577 *
577 * handles process accounting for an exiting task 578 * handles process accounting for an exiting task
578 */ 579 */
579void acct_process() 580void acct_process(void)
580{ 581{
581 struct file *file = NULL; 582 struct file *file = NULL;
582 583
diff --git a/kernel/audit.c b/kernel/audit.c
index 7dfac7031bd7..82443fb433ef 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -818,7 +818,7 @@ err:
818 */ 818 */
819unsigned int audit_serial(void) 819unsigned int audit_serial(void)
820{ 820{
821 static spinlock_t serial_lock = SPIN_LOCK_UNLOCKED; 821 static DEFINE_SPINLOCK(serial_lock);
822 static unsigned int serial = 0; 822 static unsigned int serial = 0;
823 823
824 unsigned long flags; 824 unsigned long flags;
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 9ebd96fda295..dc5e3f01efe7 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -658,8 +658,7 @@ static void audit_log_task_context(struct audit_buffer *ab)
658 return; 658 return;
659 659
660error_path: 660error_path:
661 if (ctx) 661 kfree(ctx);
662 kfree(ctx);
663 audit_panic("error in audit_log_task_context"); 662 audit_panic("error in audit_log_task_context");
664 return; 663 return;
665} 664}
@@ -1367,7 +1366,7 @@ int __audit_mq_open(int oflag, mode_t mode, struct mq_attr __user *u_attr)
1367 * @mqdes: MQ descriptor 1366 * @mqdes: MQ descriptor
1368 * @msg_len: Message length 1367 * @msg_len: Message length
1369 * @msg_prio: Message priority 1368 * @msg_prio: Message priority
1370 * @abs_timeout: Message timeout in absolute time 1369 * @u_abs_timeout: Message timeout in absolute time
1371 * 1370 *
1372 * Returns 0 for success or NULL context or < 0 on error. 1371 * Returns 0 for success or NULL context or < 0 on error.
1373 */ 1372 */
@@ -1409,8 +1408,8 @@ int __audit_mq_timedsend(mqd_t mqdes, size_t msg_len, unsigned int msg_prio,
1409 * __audit_mq_timedreceive - record audit data for a POSIX MQ timed receive 1408 * __audit_mq_timedreceive - record audit data for a POSIX MQ timed receive
1410 * @mqdes: MQ descriptor 1409 * @mqdes: MQ descriptor
1411 * @msg_len: Message length 1410 * @msg_len: Message length
1412 * @msg_prio: Message priority 1411 * @u_msg_prio: Message priority
1413 * @abs_timeout: Message timeout in absolute time 1412 * @u_abs_timeout: Message timeout in absolute time
1414 * 1413 *
1415 * Returns 0 for success or NULL context or < 0 on error. 1414 * Returns 0 for success or NULL context or < 0 on error.
1416 */ 1415 */
@@ -1558,7 +1557,6 @@ int __audit_ipc_obj(struct kern_ipc_perm *ipcp)
1558 * @uid: msgq user id 1557 * @uid: msgq user id
1559 * @gid: msgq group id 1558 * @gid: msgq group id
1560 * @mode: msgq mode (permissions) 1559 * @mode: msgq mode (permissions)
1561 * @ipcp: in-kernel IPC permissions
1562 * 1560 *
1563 * Returns 0 for success or NULL context or < 0 on error. 1561 * Returns 0 for success or NULL context or < 0 on error.
1564 */ 1562 */
diff --git a/kernel/cpu.c b/kernel/cpu.c
index fe2b8d0bfe4c..70fbf2e83766 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -13,12 +13,12 @@
13#include <linux/module.h> 13#include <linux/module.h>
14#include <linux/kthread.h> 14#include <linux/kthread.h>
15#include <linux/stop_machine.h> 15#include <linux/stop_machine.h>
16#include <asm/semaphore.h> 16#include <linux/mutex.h>
17 17
18/* This protects CPUs going up and down... */ 18/* This protects CPUs going up and down... */
19static DECLARE_MUTEX(cpucontrol); 19static DEFINE_MUTEX(cpucontrol);
20 20
21static BLOCKING_NOTIFIER_HEAD(cpu_chain); 21static __cpuinitdata BLOCKING_NOTIFIER_HEAD(cpu_chain);
22 22
23#ifdef CONFIG_HOTPLUG_CPU 23#ifdef CONFIG_HOTPLUG_CPU
24static struct task_struct *lock_cpu_hotplug_owner; 24static struct task_struct *lock_cpu_hotplug_owner;
@@ -30,9 +30,9 @@ static int __lock_cpu_hotplug(int interruptible)
30 30
31 if (lock_cpu_hotplug_owner != current) { 31 if (lock_cpu_hotplug_owner != current) {
32 if (interruptible) 32 if (interruptible)
33 ret = down_interruptible(&cpucontrol); 33 ret = mutex_lock_interruptible(&cpucontrol);
34 else 34 else
35 down(&cpucontrol); 35 mutex_lock(&cpucontrol);
36 } 36 }
37 37
38 /* 38 /*
@@ -56,7 +56,7 @@ void unlock_cpu_hotplug(void)
56{ 56{
57 if (--lock_cpu_hotplug_depth == 0) { 57 if (--lock_cpu_hotplug_depth == 0) {
58 lock_cpu_hotplug_owner = NULL; 58 lock_cpu_hotplug_owner = NULL;
59 up(&cpucontrol); 59 mutex_unlock(&cpucontrol);
60 } 60 }
61} 61}
62EXPORT_SYMBOL_GPL(unlock_cpu_hotplug); 62EXPORT_SYMBOL_GPL(unlock_cpu_hotplug);
@@ -69,10 +69,13 @@ EXPORT_SYMBOL_GPL(lock_cpu_hotplug_interruptible);
69#endif /* CONFIG_HOTPLUG_CPU */ 69#endif /* CONFIG_HOTPLUG_CPU */
70 70
71/* Need to know about CPUs going up/down? */ 71/* Need to know about CPUs going up/down? */
72int register_cpu_notifier(struct notifier_block *nb) 72int __cpuinit register_cpu_notifier(struct notifier_block *nb)
73{ 73{
74 return blocking_notifier_chain_register(&cpu_chain, nb); 74 return blocking_notifier_chain_register(&cpu_chain, nb);
75} 75}
76
77#ifdef CONFIG_HOTPLUG_CPU
78
76EXPORT_SYMBOL(register_cpu_notifier); 79EXPORT_SYMBOL(register_cpu_notifier);
77 80
78void unregister_cpu_notifier(struct notifier_block *nb) 81void unregister_cpu_notifier(struct notifier_block *nb)
@@ -81,7 +84,6 @@ void unregister_cpu_notifier(struct notifier_block *nb)
81} 84}
82EXPORT_SYMBOL(unregister_cpu_notifier); 85EXPORT_SYMBOL(unregister_cpu_notifier);
83 86
84#ifdef CONFIG_HOTPLUG_CPU
85static inline void check_for_tasks(int cpu) 87static inline void check_for_tasks(int cpu)
86{ 88{
87 struct task_struct *p; 89 struct task_struct *p;
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index b602f73fb38d..1535af3a912d 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -2442,31 +2442,43 @@ void __cpuset_memory_pressure_bump(void)
2442 */ 2442 */
2443static int proc_cpuset_show(struct seq_file *m, void *v) 2443static int proc_cpuset_show(struct seq_file *m, void *v)
2444{ 2444{
2445 struct pid *pid;
2445 struct task_struct *tsk; 2446 struct task_struct *tsk;
2446 char *buf; 2447 char *buf;
2447 int retval = 0; 2448 int retval;
2448 2449
2450 retval = -ENOMEM;
2449 buf = kmalloc(PAGE_SIZE, GFP_KERNEL); 2451 buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
2450 if (!buf) 2452 if (!buf)
2451 return -ENOMEM; 2453 goto out;
2454
2455 retval = -ESRCH;
2456 pid = m->private;
2457 tsk = get_pid_task(pid, PIDTYPE_PID);
2458 if (!tsk)
2459 goto out_free;
2452 2460
2453 tsk = m->private; 2461 retval = -EINVAL;
2454 mutex_lock(&manage_mutex); 2462 mutex_lock(&manage_mutex);
2463
2455 retval = cpuset_path(tsk->cpuset, buf, PAGE_SIZE); 2464 retval = cpuset_path(tsk->cpuset, buf, PAGE_SIZE);
2456 if (retval < 0) 2465 if (retval < 0)
2457 goto out; 2466 goto out_unlock;
2458 seq_puts(m, buf); 2467 seq_puts(m, buf);
2459 seq_putc(m, '\n'); 2468 seq_putc(m, '\n');
2460out: 2469out_unlock:
2461 mutex_unlock(&manage_mutex); 2470 mutex_unlock(&manage_mutex);
2471 put_task_struct(tsk);
2472out_free:
2462 kfree(buf); 2473 kfree(buf);
2474out:
2463 return retval; 2475 return retval;
2464} 2476}
2465 2477
2466static int cpuset_open(struct inode *inode, struct file *file) 2478static int cpuset_open(struct inode *inode, struct file *file)
2467{ 2479{
2468 struct task_struct *tsk = PROC_I(inode)->task; 2480 struct pid *pid = PROC_I(inode)->pid;
2469 return single_open(file, proc_cpuset_show, tsk); 2481 return single_open(file, proc_cpuset_show, pid);
2470} 2482}
2471 2483
2472struct file_operations proc_cpuset_operations = { 2484struct file_operations proc_cpuset_operations = {
diff --git a/kernel/exit.c b/kernel/exit.c
index e76bd02e930e..ab06b9f88f64 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -137,12 +137,8 @@ void release_task(struct task_struct * p)
137{ 137{
138 int zap_leader; 138 int zap_leader;
139 task_t *leader; 139 task_t *leader;
140 struct dentry *proc_dentry;
141
142repeat: 140repeat:
143 atomic_dec(&p->user->processes); 141 atomic_dec(&p->user->processes);
144 spin_lock(&p->proc_lock);
145 proc_dentry = proc_pid_unhash(p);
146 write_lock_irq(&tasklist_lock); 142 write_lock_irq(&tasklist_lock);
147 ptrace_unlink(p); 143 ptrace_unlink(p);
148 BUG_ON(!list_empty(&p->ptrace_list) || !list_empty(&p->ptrace_children)); 144 BUG_ON(!list_empty(&p->ptrace_list) || !list_empty(&p->ptrace_children));
@@ -171,8 +167,7 @@ repeat:
171 167
172 sched_exit(p); 168 sched_exit(p);
173 write_unlock_irq(&tasklist_lock); 169 write_unlock_irq(&tasklist_lock);
174 spin_unlock(&p->proc_lock); 170 proc_flush_task(p);
175 proc_pid_flush(proc_dentry);
176 release_thread(p); 171 release_thread(p);
177 call_rcu(&p->rcu, delayed_put_task_struct); 172 call_rcu(&p->rcu, delayed_put_task_struct);
178 173
@@ -931,9 +926,18 @@ fastcall NORET_TYPE void do_exit(long code)
931 tsk->mempolicy = NULL; 926 tsk->mempolicy = NULL;
932#endif 927#endif
933 /* 928 /*
929 * This must happen late, after the PID is not
930 * hashed anymore:
931 */
932 if (unlikely(!list_empty(&tsk->pi_state_list)))
933 exit_pi_state_list(tsk);
934 if (unlikely(current->pi_state_cache))
935 kfree(current->pi_state_cache);
936 /*
934 * If DEBUG_MUTEXES is on, make sure we are holding no locks: 937 * If DEBUG_MUTEXES is on, make sure we are holding no locks:
935 */ 938 */
936 mutex_debug_check_no_locks_held(tsk); 939 mutex_debug_check_no_locks_held(tsk);
940 rt_mutex_debug_check_no_locks_held(tsk);
937 941
938 if (tsk->io_context) 942 if (tsk->io_context)
939 exit_io_context(); 943 exit_io_context();
diff --git a/kernel/fork.c b/kernel/fork.c
index dfd10cb370c3..628198a4f28a 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -104,6 +104,7 @@ static kmem_cache_t *mm_cachep;
104void free_task(struct task_struct *tsk) 104void free_task(struct task_struct *tsk)
105{ 105{
106 free_thread_info(tsk->thread_info); 106 free_thread_info(tsk->thread_info);
107 rt_mutex_debug_task_free(tsk);
107 free_task_struct(tsk); 108 free_task_struct(tsk);
108} 109}
109EXPORT_SYMBOL(free_task); 110EXPORT_SYMBOL(free_task);
@@ -913,6 +914,19 @@ asmlinkage long sys_set_tid_address(int __user *tidptr)
913 return current->pid; 914 return current->pid;
914} 915}
915 916
917static inline void rt_mutex_init_task(struct task_struct *p)
918{
919#ifdef CONFIG_RT_MUTEXES
920 spin_lock_init(&p->pi_lock);
921 plist_head_init(&p->pi_waiters, &p->pi_lock);
922 p->pi_blocked_on = NULL;
923# ifdef CONFIG_DEBUG_RT_MUTEXES
924 spin_lock_init(&p->held_list_lock);
925 INIT_LIST_HEAD(&p->held_list_head);
926# endif
927#endif
928}
929
916/* 930/*
917 * This creates a new process as a copy of the old one, 931 * This creates a new process as a copy of the old one,
918 * but does not actually start it yet. 932 * but does not actually start it yet.
@@ -993,13 +1007,10 @@ static task_t *copy_process(unsigned long clone_flags,
993 if (put_user(p->pid, parent_tidptr)) 1007 if (put_user(p->pid, parent_tidptr))
994 goto bad_fork_cleanup; 1008 goto bad_fork_cleanup;
995 1009
996 p->proc_dentry = NULL;
997
998 INIT_LIST_HEAD(&p->children); 1010 INIT_LIST_HEAD(&p->children);
999 INIT_LIST_HEAD(&p->sibling); 1011 INIT_LIST_HEAD(&p->sibling);
1000 p->vfork_done = NULL; 1012 p->vfork_done = NULL;
1001 spin_lock_init(&p->alloc_lock); 1013 spin_lock_init(&p->alloc_lock);
1002 spin_lock_init(&p->proc_lock);
1003 1014
1004 clear_tsk_thread_flag(p, TIF_SIGPENDING); 1015 clear_tsk_thread_flag(p, TIF_SIGPENDING);
1005 init_sigpending(&p->pending); 1016 init_sigpending(&p->pending);
@@ -1037,6 +1048,8 @@ static task_t *copy_process(unsigned long clone_flags,
1037 mpol_fix_fork_child_flag(p); 1048 mpol_fix_fork_child_flag(p);
1038#endif 1049#endif
1039 1050
1051 rt_mutex_init_task(p);
1052
1040#ifdef CONFIG_DEBUG_MUTEXES 1053#ifdef CONFIG_DEBUG_MUTEXES
1041 p->blocked_on = NULL; /* not blocked yet */ 1054 p->blocked_on = NULL; /* not blocked yet */
1042#endif 1055#endif
@@ -1079,6 +1092,9 @@ static task_t *copy_process(unsigned long clone_flags,
1079#ifdef CONFIG_COMPAT 1092#ifdef CONFIG_COMPAT
1080 p->compat_robust_list = NULL; 1093 p->compat_robust_list = NULL;
1081#endif 1094#endif
1095 INIT_LIST_HEAD(&p->pi_state_list);
1096 p->pi_state_cache = NULL;
1097
1082 /* 1098 /*
1083 * sigaltstack should be cleared when sharing the same VM 1099 * sigaltstack should be cleared when sharing the same VM
1084 */ 1100 */
@@ -1159,18 +1175,6 @@ static task_t *copy_process(unsigned long clone_flags,
1159 } 1175 }
1160 1176
1161 if (clone_flags & CLONE_THREAD) { 1177 if (clone_flags & CLONE_THREAD) {
1162 /*
1163 * Important: if an exit-all has been started then
1164 * do not create this new thread - the whole thread
1165 * group is supposed to exit anyway.
1166 */
1167 if (current->signal->flags & SIGNAL_GROUP_EXIT) {
1168 spin_unlock(&current->sighand->siglock);
1169 write_unlock_irq(&tasklist_lock);
1170 retval = -EAGAIN;
1171 goto bad_fork_cleanup_namespace;
1172 }
1173
1174 p->group_leader = current->group_leader; 1178 p->group_leader = current->group_leader;
1175 list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group); 1179 list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group);
1176 1180
diff --git a/kernel/futex.c b/kernel/futex.c
index e1a380c77a5a..6c91f938005d 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -12,6 +12,10 @@
12 * (C) Copyright 2006 Red Hat Inc, All Rights Reserved 12 * (C) Copyright 2006 Red Hat Inc, All Rights Reserved
13 * Thanks to Thomas Gleixner for suggestions, analysis and fixes. 13 * Thanks to Thomas Gleixner for suggestions, analysis and fixes.
14 * 14 *
15 * PI-futex support started by Ingo Molnar and Thomas Gleixner
16 * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
17 * Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
18 *
15 * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly 19 * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
16 * enough at me, Linus for the original (flawed) idea, Matthew 20 * enough at me, Linus for the original (flawed) idea, Matthew
17 * Kirkwood for proof-of-concept implementation. 21 * Kirkwood for proof-of-concept implementation.
@@ -46,6 +50,8 @@
46#include <linux/signal.h> 50#include <linux/signal.h>
47#include <asm/futex.h> 51#include <asm/futex.h>
48 52
53#include "rtmutex_common.h"
54
49#define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8) 55#define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8)
50 56
51/* 57/*
@@ -63,7 +69,7 @@ union futex_key {
63 int offset; 69 int offset;
64 } shared; 70 } shared;
65 struct { 71 struct {
66 unsigned long uaddr; 72 unsigned long address;
67 struct mm_struct *mm; 73 struct mm_struct *mm;
68 int offset; 74 int offset;
69 } private; 75 } private;
@@ -75,6 +81,27 @@ union futex_key {
75}; 81};
76 82
77/* 83/*
84 * Priority Inheritance state:
85 */
86struct futex_pi_state {
87 /*
88 * list of 'owned' pi_state instances - these have to be
89 * cleaned up in do_exit() if the task exits prematurely:
90 */
91 struct list_head list;
92
93 /*
94 * The PI object:
95 */
96 struct rt_mutex pi_mutex;
97
98 struct task_struct *owner;
99 atomic_t refcount;
100
101 union futex_key key;
102};
103
104/*
78 * We use this hashed waitqueue instead of a normal wait_queue_t, so 105 * We use this hashed waitqueue instead of a normal wait_queue_t, so
79 * we can wake only the relevant ones (hashed queues may be shared). 106 * we can wake only the relevant ones (hashed queues may be shared).
80 * 107 *
@@ -87,15 +114,19 @@ struct futex_q {
87 struct list_head list; 114 struct list_head list;
88 wait_queue_head_t waiters; 115 wait_queue_head_t waiters;
89 116
90 /* Which hash list lock to use. */ 117 /* Which hash list lock to use: */
91 spinlock_t *lock_ptr; 118 spinlock_t *lock_ptr;
92 119
93 /* Key which the futex is hashed on. */ 120 /* Key which the futex is hashed on: */
94 union futex_key key; 121 union futex_key key;
95 122
96 /* For fd, sigio sent using these. */ 123 /* For fd, sigio sent using these: */
97 int fd; 124 int fd;
98 struct file *filp; 125 struct file *filp;
126
127 /* Optional priority inheritance state: */
128 struct futex_pi_state *pi_state;
129 struct task_struct *task;
99}; 130};
100 131
101/* 132/*
@@ -144,8 +175,9 @@ static inline int match_futex(union futex_key *key1, union futex_key *key2)
144 * 175 *
145 * Should be called with &current->mm->mmap_sem but NOT any spinlocks. 176 * Should be called with &current->mm->mmap_sem but NOT any spinlocks.
146 */ 177 */
147static int get_futex_key(unsigned long uaddr, union futex_key *key) 178static int get_futex_key(u32 __user *uaddr, union futex_key *key)
148{ 179{
180 unsigned long address = (unsigned long)uaddr;
149 struct mm_struct *mm = current->mm; 181 struct mm_struct *mm = current->mm;
150 struct vm_area_struct *vma; 182 struct vm_area_struct *vma;
151 struct page *page; 183 struct page *page;
@@ -154,16 +186,16 @@ static int get_futex_key(unsigned long uaddr, union futex_key *key)
154 /* 186 /*
155 * The futex address must be "naturally" aligned. 187 * The futex address must be "naturally" aligned.
156 */ 188 */
157 key->both.offset = uaddr % PAGE_SIZE; 189 key->both.offset = address % PAGE_SIZE;
158 if (unlikely((key->both.offset % sizeof(u32)) != 0)) 190 if (unlikely((key->both.offset % sizeof(u32)) != 0))
159 return -EINVAL; 191 return -EINVAL;
160 uaddr -= key->both.offset; 192 address -= key->both.offset;
161 193
162 /* 194 /*
163 * The futex is hashed differently depending on whether 195 * The futex is hashed differently depending on whether
164 * it's in a shared or private mapping. So check vma first. 196 * it's in a shared or private mapping. So check vma first.
165 */ 197 */
166 vma = find_extend_vma(mm, uaddr); 198 vma = find_extend_vma(mm, address);
167 if (unlikely(!vma)) 199 if (unlikely(!vma))
168 return -EFAULT; 200 return -EFAULT;
169 201
@@ -184,7 +216,7 @@ static int get_futex_key(unsigned long uaddr, union futex_key *key)
184 */ 216 */
185 if (likely(!(vma->vm_flags & VM_MAYSHARE))) { 217 if (likely(!(vma->vm_flags & VM_MAYSHARE))) {
186 key->private.mm = mm; 218 key->private.mm = mm;
187 key->private.uaddr = uaddr; 219 key->private.address = address;
188 return 0; 220 return 0;
189 } 221 }
190 222
@@ -194,7 +226,7 @@ static int get_futex_key(unsigned long uaddr, union futex_key *key)
194 key->shared.inode = vma->vm_file->f_dentry->d_inode; 226 key->shared.inode = vma->vm_file->f_dentry->d_inode;
195 key->both.offset++; /* Bit 0 of offset indicates inode-based key. */ 227 key->both.offset++; /* Bit 0 of offset indicates inode-based key. */
196 if (likely(!(vma->vm_flags & VM_NONLINEAR))) { 228 if (likely(!(vma->vm_flags & VM_NONLINEAR))) {
197 key->shared.pgoff = (((uaddr - vma->vm_start) >> PAGE_SHIFT) 229 key->shared.pgoff = (((address - vma->vm_start) >> PAGE_SHIFT)
198 + vma->vm_pgoff); 230 + vma->vm_pgoff);
199 return 0; 231 return 0;
200 } 232 }
@@ -205,7 +237,7 @@ static int get_futex_key(unsigned long uaddr, union futex_key *key)
205 * from swap. But that's a lot of code to duplicate here 237 * from swap. But that's a lot of code to duplicate here
206 * for a rare case, so we simply fetch the page. 238 * for a rare case, so we simply fetch the page.
207 */ 239 */
208 err = get_user_pages(current, mm, uaddr, 1, 0, 0, &page, NULL); 240 err = get_user_pages(current, mm, address, 1, 0, 0, &page, NULL);
209 if (err >= 0) { 241 if (err >= 0) {
210 key->shared.pgoff = 242 key->shared.pgoff =
211 page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); 243 page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
@@ -246,18 +278,244 @@ static void drop_key_refs(union futex_key *key)
246 } 278 }
247} 279}
248 280
249static inline int get_futex_value_locked(int *dest, int __user *from) 281static inline int get_futex_value_locked(u32 *dest, u32 __user *from)
250{ 282{
251 int ret; 283 int ret;
252 284
253 inc_preempt_count(); 285 inc_preempt_count();
254 ret = __copy_from_user_inatomic(dest, from, sizeof(int)); 286 ret = __copy_from_user_inatomic(dest, from, sizeof(u32));
255 dec_preempt_count(); 287 dec_preempt_count();
256 288
257 return ret ? -EFAULT : 0; 289 return ret ? -EFAULT : 0;
258} 290}
259 291
260/* 292/*
293 * Fault handling. Called with current->mm->mmap_sem held.
294 */
295static int futex_handle_fault(unsigned long address, int attempt)
296{
297 struct vm_area_struct * vma;
298 struct mm_struct *mm = current->mm;
299
300 if (attempt >= 2 || !(vma = find_vma(mm, address)) ||
301 vma->vm_start > address || !(vma->vm_flags & VM_WRITE))
302 return -EFAULT;
303
304 switch (handle_mm_fault(mm, vma, address, 1)) {
305 case VM_FAULT_MINOR:
306 current->min_flt++;
307 break;
308 case VM_FAULT_MAJOR:
309 current->maj_flt++;
310 break;
311 default:
312 return -EFAULT;
313 }
314 return 0;
315}
316
317/*
318 * PI code:
319 */
320static int refill_pi_state_cache(void)
321{
322 struct futex_pi_state *pi_state;
323
324 if (likely(current->pi_state_cache))
325 return 0;
326
327 pi_state = kmalloc(sizeof(*pi_state), GFP_KERNEL);
328
329 if (!pi_state)
330 return -ENOMEM;
331
332 memset(pi_state, 0, sizeof(*pi_state));
333 INIT_LIST_HEAD(&pi_state->list);
334 /* pi_mutex gets initialized later */
335 pi_state->owner = NULL;
336 atomic_set(&pi_state->refcount, 1);
337
338 current->pi_state_cache = pi_state;
339
340 return 0;
341}
342
343static struct futex_pi_state * alloc_pi_state(void)
344{
345 struct futex_pi_state *pi_state = current->pi_state_cache;
346
347 WARN_ON(!pi_state);
348 current->pi_state_cache = NULL;
349
350 return pi_state;
351}
352
353static void free_pi_state(struct futex_pi_state *pi_state)
354{
355 if (!atomic_dec_and_test(&pi_state->refcount))
356 return;
357
358 /*
359 * If pi_state->owner is NULL, the owner is most probably dying
360 * and has cleaned up the pi_state already
361 */
362 if (pi_state->owner) {
363 spin_lock_irq(&pi_state->owner->pi_lock);
364 list_del_init(&pi_state->list);
365 spin_unlock_irq(&pi_state->owner->pi_lock);
366
367 rt_mutex_proxy_unlock(&pi_state->pi_mutex, pi_state->owner);
368 }
369
370 if (current->pi_state_cache)
371 kfree(pi_state);
372 else {
373 /*
374 * pi_state->list is already empty.
375 * clear pi_state->owner.
376 * refcount is at 0 - put it back to 1.
377 */
378 pi_state->owner = NULL;
379 atomic_set(&pi_state->refcount, 1);
380 current->pi_state_cache = pi_state;
381 }
382}
383
384/*
385 * Look up the task based on what TID userspace gave us.
386 * We dont trust it.
387 */
388static struct task_struct * futex_find_get_task(pid_t pid)
389{
390 struct task_struct *p;
391
392 read_lock(&tasklist_lock);
393 p = find_task_by_pid(pid);
394 if (!p)
395 goto out_unlock;
396 if ((current->euid != p->euid) && (current->euid != p->uid)) {
397 p = NULL;
398 goto out_unlock;
399 }
400 if (p->state == EXIT_ZOMBIE || p->exit_state == EXIT_ZOMBIE) {
401 p = NULL;
402 goto out_unlock;
403 }
404 get_task_struct(p);
405out_unlock:
406 read_unlock(&tasklist_lock);
407
408 return p;
409}
410
411/*
412 * This task is holding PI mutexes at exit time => bad.
413 * Kernel cleans up PI-state, but userspace is likely hosed.
414 * (Robust-futex cleanup is separate and might save the day for userspace.)
415 */
416void exit_pi_state_list(struct task_struct *curr)
417{
418 struct futex_hash_bucket *hb;
419 struct list_head *next, *head = &curr->pi_state_list;
420 struct futex_pi_state *pi_state;
421 union futex_key key;
422
423 /*
424 * We are a ZOMBIE and nobody can enqueue itself on
425 * pi_state_list anymore, but we have to be careful
426 * versus waiters unqueueing themselfs
427 */
428 spin_lock_irq(&curr->pi_lock);
429 while (!list_empty(head)) {
430
431 next = head->next;
432 pi_state = list_entry(next, struct futex_pi_state, list);
433 key = pi_state->key;
434 spin_unlock_irq(&curr->pi_lock);
435
436 hb = hash_futex(&key);
437 spin_lock(&hb->lock);
438
439 spin_lock_irq(&curr->pi_lock);
440 if (head->next != next) {
441 spin_unlock(&hb->lock);
442 continue;
443 }
444
445 list_del_init(&pi_state->list);
446
447 WARN_ON(pi_state->owner != curr);
448
449 pi_state->owner = NULL;
450 spin_unlock_irq(&curr->pi_lock);
451
452 rt_mutex_unlock(&pi_state->pi_mutex);
453
454 spin_unlock(&hb->lock);
455
456 spin_lock_irq(&curr->pi_lock);
457 }
458 spin_unlock_irq(&curr->pi_lock);
459}
460
461static int
462lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me)
463{
464 struct futex_pi_state *pi_state = NULL;
465 struct futex_q *this, *next;
466 struct list_head *head;
467 struct task_struct *p;
468 pid_t pid;
469
470 head = &hb->chain;
471
472 list_for_each_entry_safe(this, next, head, list) {
473 if (match_futex (&this->key, &me->key)) {
474 /*
475 * Another waiter already exists - bump up
476 * the refcount and return its pi_state:
477 */
478 pi_state = this->pi_state;
479 atomic_inc(&pi_state->refcount);
480 me->pi_state = pi_state;
481
482 return 0;
483 }
484 }
485
486 /*
487 * We are the first waiter - try to look up the real owner and
488 * attach the new pi_state to it:
489 */
490 pid = uval & FUTEX_TID_MASK;
491 p = futex_find_get_task(pid);
492 if (!p)
493 return -ESRCH;
494
495 pi_state = alloc_pi_state();
496
497 /*
498 * Initialize the pi_mutex in locked state and make 'p'
499 * the owner of it:
500 */
501 rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p);
502
503 /* Store the key for possible exit cleanups: */
504 pi_state->key = me->key;
505
506 spin_lock_irq(&p->pi_lock);
507 list_add(&pi_state->list, &p->pi_state_list);
508 pi_state->owner = p;
509 spin_unlock_irq(&p->pi_lock);
510
511 put_task_struct(p);
512
513 me->pi_state = pi_state;
514
515 return 0;
516}
517
518/*
261 * The hash bucket lock must be held when this is called. 519 * The hash bucket lock must be held when this is called.
262 * Afterwards, the futex_q must not be accessed. 520 * Afterwards, the futex_q must not be accessed.
263 */ 521 */
@@ -284,16 +542,80 @@ static void wake_futex(struct futex_q *q)
284 q->lock_ptr = NULL; 542 q->lock_ptr = NULL;
285} 543}
286 544
545static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
546{
547 struct task_struct *new_owner;
548 struct futex_pi_state *pi_state = this->pi_state;
549 u32 curval, newval;
550
551 if (!pi_state)
552 return -EINVAL;
553
554 new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
555
556 /*
557 * This happens when we have stolen the lock and the original
558 * pending owner did not enqueue itself back on the rt_mutex.
559 * That's not a tragedy. That way we know that a lock waiter
560 * is on the fly. We make the futex_q waiter the pending owner.
561 */
562 if (!new_owner)
563 new_owner = this->task;
564
565 /*
566 * We pass it to the next owner. (The WAITERS bit is always
567 * kept enabled while there is PI state around. We must also
568 * preserve the owner died bit.)
569 */
570 newval = (uval & FUTEX_OWNER_DIED) | FUTEX_WAITERS | new_owner->pid;
571
572 inc_preempt_count();
573 curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
574 dec_preempt_count();
575
576 if (curval == -EFAULT)
577 return -EFAULT;
578 if (curval != uval)
579 return -EINVAL;
580
581 list_del_init(&pi_state->owner->pi_state_list);
582 list_add(&pi_state->list, &new_owner->pi_state_list);
583 pi_state->owner = new_owner;
584 rt_mutex_unlock(&pi_state->pi_mutex);
585
586 return 0;
587}
588
589static int unlock_futex_pi(u32 __user *uaddr, u32 uval)
590{
591 u32 oldval;
592
593 /*
594 * There is no waiter, so we unlock the futex. The owner-died
595 * bit does not have to be preserved here. We are the owner:
596 */
597 inc_preempt_count();
598 oldval = futex_atomic_cmpxchg_inatomic(uaddr, uval, 0);
599 dec_preempt_count();
600
601 if (oldval == -EFAULT)
602 return oldval;
603 if (oldval != uval)
604 return -EAGAIN;
605
606 return 0;
607}
608
287/* 609/*
288 * Wake up all waiters hashed on the physical page that is mapped 610 * Wake up all waiters hashed on the physical page that is mapped
289 * to this virtual address: 611 * to this virtual address:
290 */ 612 */
291static int futex_wake(unsigned long uaddr, int nr_wake) 613static int futex_wake(u32 __user *uaddr, int nr_wake)
292{ 614{
293 union futex_key key; 615 struct futex_hash_bucket *hb;
294 struct futex_hash_bucket *bh;
295 struct list_head *head;
296 struct futex_q *this, *next; 616 struct futex_q *this, *next;
617 struct list_head *head;
618 union futex_key key;
297 int ret; 619 int ret;
298 620
299 down_read(&current->mm->mmap_sem); 621 down_read(&current->mm->mmap_sem);
@@ -302,19 +624,21 @@ static int futex_wake(unsigned long uaddr, int nr_wake)
302 if (unlikely(ret != 0)) 624 if (unlikely(ret != 0))
303 goto out; 625 goto out;
304 626
305 bh = hash_futex(&key); 627 hb = hash_futex(&key);
306 spin_lock(&bh->lock); 628 spin_lock(&hb->lock);
307 head = &bh->chain; 629 head = &hb->chain;
308 630
309 list_for_each_entry_safe(this, next, head, list) { 631 list_for_each_entry_safe(this, next, head, list) {
310 if (match_futex (&this->key, &key)) { 632 if (match_futex (&this->key, &key)) {
633 if (this->pi_state)
634 return -EINVAL;
311 wake_futex(this); 635 wake_futex(this);
312 if (++ret >= nr_wake) 636 if (++ret >= nr_wake)
313 break; 637 break;
314 } 638 }
315 } 639 }
316 640
317 spin_unlock(&bh->lock); 641 spin_unlock(&hb->lock);
318out: 642out:
319 up_read(&current->mm->mmap_sem); 643 up_read(&current->mm->mmap_sem);
320 return ret; 644 return ret;
@@ -324,10 +648,12 @@ out:
324 * Wake up all waiters hashed on the physical page that is mapped 648 * Wake up all waiters hashed on the physical page that is mapped
325 * to this virtual address: 649 * to this virtual address:
326 */ 650 */
327static int futex_wake_op(unsigned long uaddr1, unsigned long uaddr2, int nr_wake, int nr_wake2, int op) 651static int
652futex_wake_op(u32 __user *uaddr1, u32 __user *uaddr2,
653 int nr_wake, int nr_wake2, int op)
328{ 654{
329 union futex_key key1, key2; 655 union futex_key key1, key2;
330 struct futex_hash_bucket *bh1, *bh2; 656 struct futex_hash_bucket *hb1, *hb2;
331 struct list_head *head; 657 struct list_head *head;
332 struct futex_q *this, *next; 658 struct futex_q *this, *next;
333 int ret, op_ret, attempt = 0; 659 int ret, op_ret, attempt = 0;
@@ -342,27 +668,29 @@ retryfull:
342 if (unlikely(ret != 0)) 668 if (unlikely(ret != 0))
343 goto out; 669 goto out;
344 670
345 bh1 = hash_futex(&key1); 671 hb1 = hash_futex(&key1);
346 bh2 = hash_futex(&key2); 672 hb2 = hash_futex(&key2);
347 673
348retry: 674retry:
349 if (bh1 < bh2) 675 if (hb1 < hb2)
350 spin_lock(&bh1->lock); 676 spin_lock(&hb1->lock);
351 spin_lock(&bh2->lock); 677 spin_lock(&hb2->lock);
352 if (bh1 > bh2) 678 if (hb1 > hb2)
353 spin_lock(&bh1->lock); 679 spin_lock(&hb1->lock);
354 680
355 op_ret = futex_atomic_op_inuser(op, (int __user *)uaddr2); 681 op_ret = futex_atomic_op_inuser(op, uaddr2);
356 if (unlikely(op_ret < 0)) { 682 if (unlikely(op_ret < 0)) {
357 int dummy; 683 u32 dummy;
358 684
359 spin_unlock(&bh1->lock); 685 spin_unlock(&hb1->lock);
360 if (bh1 != bh2) 686 if (hb1 != hb2)
361 spin_unlock(&bh2->lock); 687 spin_unlock(&hb2->lock);
362 688
363#ifndef CONFIG_MMU 689#ifndef CONFIG_MMU
364 /* we don't get EFAULT from MMU faults if we don't have an MMU, 690 /*
365 * but we might get them from range checking */ 691 * we don't get EFAULT from MMU faults if we don't have an MMU,
692 * but we might get them from range checking
693 */
366 ret = op_ret; 694 ret = op_ret;
367 goto out; 695 goto out;
368#endif 696#endif
@@ -372,47 +700,34 @@ retry:
372 goto out; 700 goto out;
373 } 701 }
374 702
375 /* futex_atomic_op_inuser needs to both read and write 703 /*
704 * futex_atomic_op_inuser needs to both read and write
376 * *(int __user *)uaddr2, but we can't modify it 705 * *(int __user *)uaddr2, but we can't modify it
377 * non-atomically. Therefore, if get_user below is not 706 * non-atomically. Therefore, if get_user below is not
378 * enough, we need to handle the fault ourselves, while 707 * enough, we need to handle the fault ourselves, while
379 * still holding the mmap_sem. */ 708 * still holding the mmap_sem.
709 */
380 if (attempt++) { 710 if (attempt++) {
381 struct vm_area_struct * vma; 711 if (futex_handle_fault((unsigned long)uaddr2,
382 struct mm_struct *mm = current->mm; 712 attempt))
383
384 ret = -EFAULT;
385 if (attempt >= 2 ||
386 !(vma = find_vma(mm, uaddr2)) ||
387 vma->vm_start > uaddr2 ||
388 !(vma->vm_flags & VM_WRITE))
389 goto out;
390
391 switch (handle_mm_fault(mm, vma, uaddr2, 1)) {
392 case VM_FAULT_MINOR:
393 current->min_flt++;
394 break;
395 case VM_FAULT_MAJOR:
396 current->maj_flt++;
397 break;
398 default:
399 goto out; 713 goto out;
400 }
401 goto retry; 714 goto retry;
402 } 715 }
403 716
404 /* If we would have faulted, release mmap_sem, 717 /*
405 * fault it in and start all over again. */ 718 * If we would have faulted, release mmap_sem,
719 * fault it in and start all over again.
720 */
406 up_read(&current->mm->mmap_sem); 721 up_read(&current->mm->mmap_sem);
407 722
408 ret = get_user(dummy, (int __user *)uaddr2); 723 ret = get_user(dummy, uaddr2);
409 if (ret) 724 if (ret)
410 return ret; 725 return ret;
411 726
412 goto retryfull; 727 goto retryfull;
413 } 728 }
414 729
415 head = &bh1->chain; 730 head = &hb1->chain;
416 731
417 list_for_each_entry_safe(this, next, head, list) { 732 list_for_each_entry_safe(this, next, head, list) {
418 if (match_futex (&this->key, &key1)) { 733 if (match_futex (&this->key, &key1)) {
@@ -423,7 +738,7 @@ retry:
423 } 738 }
424 739
425 if (op_ret > 0) { 740 if (op_ret > 0) {
426 head = &bh2->chain; 741 head = &hb2->chain;
427 742
428 op_ret = 0; 743 op_ret = 0;
429 list_for_each_entry_safe(this, next, head, list) { 744 list_for_each_entry_safe(this, next, head, list) {
@@ -436,9 +751,9 @@ retry:
436 ret += op_ret; 751 ret += op_ret;
437 } 752 }
438 753
439 spin_unlock(&bh1->lock); 754 spin_unlock(&hb1->lock);
440 if (bh1 != bh2) 755 if (hb1 != hb2)
441 spin_unlock(&bh2->lock); 756 spin_unlock(&hb2->lock);
442out: 757out:
443 up_read(&current->mm->mmap_sem); 758 up_read(&current->mm->mmap_sem);
444 return ret; 759 return ret;
@@ -448,11 +763,11 @@ out:
448 * Requeue all waiters hashed on one physical page to another 763 * Requeue all waiters hashed on one physical page to another
449 * physical page. 764 * physical page.
450 */ 765 */
451static int futex_requeue(unsigned long uaddr1, unsigned long uaddr2, 766static int futex_requeue(u32 __user *uaddr1, u32 __user *uaddr2,
452 int nr_wake, int nr_requeue, int *valp) 767 int nr_wake, int nr_requeue, u32 *cmpval)
453{ 768{
454 union futex_key key1, key2; 769 union futex_key key1, key2;
455 struct futex_hash_bucket *bh1, *bh2; 770 struct futex_hash_bucket *hb1, *hb2;
456 struct list_head *head1; 771 struct list_head *head1;
457 struct futex_q *this, *next; 772 struct futex_q *this, *next;
458 int ret, drop_count = 0; 773 int ret, drop_count = 0;
@@ -467,68 +782,72 @@ static int futex_requeue(unsigned long uaddr1, unsigned long uaddr2,
467 if (unlikely(ret != 0)) 782 if (unlikely(ret != 0))
468 goto out; 783 goto out;
469 784
470 bh1 = hash_futex(&key1); 785 hb1 = hash_futex(&key1);
471 bh2 = hash_futex(&key2); 786 hb2 = hash_futex(&key2);
472 787
473 if (bh1 < bh2) 788 if (hb1 < hb2)
474 spin_lock(&bh1->lock); 789 spin_lock(&hb1->lock);
475 spin_lock(&bh2->lock); 790 spin_lock(&hb2->lock);
476 if (bh1 > bh2) 791 if (hb1 > hb2)
477 spin_lock(&bh1->lock); 792 spin_lock(&hb1->lock);
478 793
479 if (likely(valp != NULL)) { 794 if (likely(cmpval != NULL)) {
480 int curval; 795 u32 curval;
481 796
482 ret = get_futex_value_locked(&curval, (int __user *)uaddr1); 797 ret = get_futex_value_locked(&curval, uaddr1);
483 798
484 if (unlikely(ret)) { 799 if (unlikely(ret)) {
485 spin_unlock(&bh1->lock); 800 spin_unlock(&hb1->lock);
486 if (bh1 != bh2) 801 if (hb1 != hb2)
487 spin_unlock(&bh2->lock); 802 spin_unlock(&hb2->lock);
488 803
489 /* If we would have faulted, release mmap_sem, fault 804 /*
805 * If we would have faulted, release mmap_sem, fault
490 * it in and start all over again. 806 * it in and start all over again.
491 */ 807 */
492 up_read(&current->mm->mmap_sem); 808 up_read(&current->mm->mmap_sem);
493 809
494 ret = get_user(curval, (int __user *)uaddr1); 810 ret = get_user(curval, uaddr1);
495 811
496 if (!ret) 812 if (!ret)
497 goto retry; 813 goto retry;
498 814
499 return ret; 815 return ret;
500 } 816 }
501 if (curval != *valp) { 817 if (curval != *cmpval) {
502 ret = -EAGAIN; 818 ret = -EAGAIN;
503 goto out_unlock; 819 goto out_unlock;
504 } 820 }
505 } 821 }
506 822
507 head1 = &bh1->chain; 823 head1 = &hb1->chain;
508 list_for_each_entry_safe(this, next, head1, list) { 824 list_for_each_entry_safe(this, next, head1, list) {
509 if (!match_futex (&this->key, &key1)) 825 if (!match_futex (&this->key, &key1))
510 continue; 826 continue;
511 if (++ret <= nr_wake) { 827 if (++ret <= nr_wake) {
512 wake_futex(this); 828 wake_futex(this);
513 } else { 829 } else {
514 list_move_tail(&this->list, &bh2->chain); 830 /*
515 this->lock_ptr = &bh2->lock; 831 * If key1 and key2 hash to the same bucket, no need to
832 * requeue.
833 */
834 if (likely(head1 != &hb2->chain)) {
835 list_move_tail(&this->list, &hb2->chain);
836 this->lock_ptr = &hb2->lock;
837 }
516 this->key = key2; 838 this->key = key2;
517 get_key_refs(&key2); 839 get_key_refs(&key2);
518 drop_count++; 840 drop_count++;
519 841
520 if (ret - nr_wake >= nr_requeue) 842 if (ret - nr_wake >= nr_requeue)
521 break; 843 break;
522 /* Make sure to stop if key1 == key2 */
523 if (head1 == &bh2->chain && head1 != &next->list)
524 head1 = &this->list;
525 } 844 }
526 } 845 }
527 846
528out_unlock: 847out_unlock:
529 spin_unlock(&bh1->lock); 848 spin_unlock(&hb1->lock);
530 if (bh1 != bh2) 849 if (hb1 != hb2)
531 spin_unlock(&bh2->lock); 850 spin_unlock(&hb2->lock);
532 851
533 /* drop_key_refs() must be called outside the spinlocks. */ 852 /* drop_key_refs() must be called outside the spinlocks. */
534 while (--drop_count >= 0) 853 while (--drop_count >= 0)
@@ -543,7 +862,7 @@ out:
543static inline struct futex_hash_bucket * 862static inline struct futex_hash_bucket *
544queue_lock(struct futex_q *q, int fd, struct file *filp) 863queue_lock(struct futex_q *q, int fd, struct file *filp)
545{ 864{
546 struct futex_hash_bucket *bh; 865 struct futex_hash_bucket *hb;
547 866
548 q->fd = fd; 867 q->fd = fd;
549 q->filp = filp; 868 q->filp = filp;
@@ -551,23 +870,24 @@ queue_lock(struct futex_q *q, int fd, struct file *filp)
551 init_waitqueue_head(&q->waiters); 870 init_waitqueue_head(&q->waiters);
552 871
553 get_key_refs(&q->key); 872 get_key_refs(&q->key);
554 bh = hash_futex(&q->key); 873 hb = hash_futex(&q->key);
555 q->lock_ptr = &bh->lock; 874 q->lock_ptr = &hb->lock;
556 875
557 spin_lock(&bh->lock); 876 spin_lock(&hb->lock);
558 return bh; 877 return hb;
559} 878}
560 879
561static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *bh) 880static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
562{ 881{
563 list_add_tail(&q->list, &bh->chain); 882 list_add_tail(&q->list, &hb->chain);
564 spin_unlock(&bh->lock); 883 q->task = current;
884 spin_unlock(&hb->lock);
565} 885}
566 886
567static inline void 887static inline void
568queue_unlock(struct futex_q *q, struct futex_hash_bucket *bh) 888queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb)
569{ 889{
570 spin_unlock(&bh->lock); 890 spin_unlock(&hb->lock);
571 drop_key_refs(&q->key); 891 drop_key_refs(&q->key);
572} 892}
573 893
@@ -579,16 +899,17 @@ queue_unlock(struct futex_q *q, struct futex_hash_bucket *bh)
579/* The key must be already stored in q->key. */ 899/* The key must be already stored in q->key. */
580static void queue_me(struct futex_q *q, int fd, struct file *filp) 900static void queue_me(struct futex_q *q, int fd, struct file *filp)
581{ 901{
582 struct futex_hash_bucket *bh; 902 struct futex_hash_bucket *hb;
583 bh = queue_lock(q, fd, filp); 903
584 __queue_me(q, bh); 904 hb = queue_lock(q, fd, filp);
905 __queue_me(q, hb);
585} 906}
586 907
587/* Return 1 if we were still queued (ie. 0 means we were woken) */ 908/* Return 1 if we were still queued (ie. 0 means we were woken) */
588static int unqueue_me(struct futex_q *q) 909static int unqueue_me(struct futex_q *q)
589{ 910{
590 int ret = 0;
591 spinlock_t *lock_ptr; 911 spinlock_t *lock_ptr;
912 int ret = 0;
592 913
593 /* In the common case we don't take the spinlock, which is nice. */ 914 /* In the common case we don't take the spinlock, which is nice. */
594 retry: 915 retry:
@@ -614,6 +935,9 @@ static int unqueue_me(struct futex_q *q)
614 } 935 }
615 WARN_ON(list_empty(&q->list)); 936 WARN_ON(list_empty(&q->list));
616 list_del(&q->list); 937 list_del(&q->list);
938
939 BUG_ON(q->pi_state);
940
617 spin_unlock(lock_ptr); 941 spin_unlock(lock_ptr);
618 ret = 1; 942 ret = 1;
619 } 943 }
@@ -622,21 +946,42 @@ static int unqueue_me(struct futex_q *q)
622 return ret; 946 return ret;
623} 947}
624 948
625static int futex_wait(unsigned long uaddr, int val, unsigned long time) 949/*
950 * PI futexes can not be requeued and must remove themself from the
951 * hash bucket. The hash bucket lock is held on entry and dropped here.
952 */
953static void unqueue_me_pi(struct futex_q *q, struct futex_hash_bucket *hb)
626{ 954{
627 DECLARE_WAITQUEUE(wait, current); 955 WARN_ON(list_empty(&q->list));
628 int ret, curval; 956 list_del(&q->list);
957
958 BUG_ON(!q->pi_state);
959 free_pi_state(q->pi_state);
960 q->pi_state = NULL;
961
962 spin_unlock(&hb->lock);
963
964 drop_key_refs(&q->key);
965}
966
967static int futex_wait(u32 __user *uaddr, u32 val, unsigned long time)
968{
969 struct task_struct *curr = current;
970 DECLARE_WAITQUEUE(wait, curr);
971 struct futex_hash_bucket *hb;
629 struct futex_q q; 972 struct futex_q q;
630 struct futex_hash_bucket *bh; 973 u32 uval;
974 int ret;
631 975
976 q.pi_state = NULL;
632 retry: 977 retry:
633 down_read(&current->mm->mmap_sem); 978 down_read(&curr->mm->mmap_sem);
634 979
635 ret = get_futex_key(uaddr, &q.key); 980 ret = get_futex_key(uaddr, &q.key);
636 if (unlikely(ret != 0)) 981 if (unlikely(ret != 0))
637 goto out_release_sem; 982 goto out_release_sem;
638 983
639 bh = queue_lock(&q, -1, NULL); 984 hb = queue_lock(&q, -1, NULL);
640 985
641 /* 986 /*
642 * Access the page AFTER the futex is queued. 987 * Access the page AFTER the futex is queued.
@@ -658,37 +1003,35 @@ static int futex_wait(unsigned long uaddr, int val, unsigned long time)
658 * We hold the mmap semaphore, so the mapping cannot have changed 1003 * We hold the mmap semaphore, so the mapping cannot have changed
659 * since we looked it up in get_futex_key. 1004 * since we looked it up in get_futex_key.
660 */ 1005 */
661 1006 ret = get_futex_value_locked(&uval, uaddr);
662 ret = get_futex_value_locked(&curval, (int __user *)uaddr);
663 1007
664 if (unlikely(ret)) { 1008 if (unlikely(ret)) {
665 queue_unlock(&q, bh); 1009 queue_unlock(&q, hb);
666 1010
667 /* If we would have faulted, release mmap_sem, fault it in and 1011 /*
1012 * If we would have faulted, release mmap_sem, fault it in and
668 * start all over again. 1013 * start all over again.
669 */ 1014 */
670 up_read(&current->mm->mmap_sem); 1015 up_read(&curr->mm->mmap_sem);
671 1016
672 ret = get_user(curval, (int __user *)uaddr); 1017 ret = get_user(uval, uaddr);
673 1018
674 if (!ret) 1019 if (!ret)
675 goto retry; 1020 goto retry;
676 return ret; 1021 return ret;
677 } 1022 }
678 if (curval != val) { 1023 ret = -EWOULDBLOCK;
679 ret = -EWOULDBLOCK; 1024 if (uval != val)
680 queue_unlock(&q, bh); 1025 goto out_unlock_release_sem;
681 goto out_release_sem;
682 }
683 1026
684 /* Only actually queue if *uaddr contained val. */ 1027 /* Only actually queue if *uaddr contained val. */
685 __queue_me(&q, bh); 1028 __queue_me(&q, hb);
686 1029
687 /* 1030 /*
688 * Now the futex is queued and we have checked the data, we 1031 * Now the futex is queued and we have checked the data, we
689 * don't want to hold mmap_sem while we sleep. 1032 * don't want to hold mmap_sem while we sleep.
690 */ 1033 */
691 up_read(&current->mm->mmap_sem); 1034 up_read(&curr->mm->mmap_sem);
692 1035
693 /* 1036 /*
694 * There might have been scheduling since the queue_me(), as we 1037 * There might have been scheduling since the queue_me(), as we
@@ -720,12 +1063,421 @@ static int futex_wait(unsigned long uaddr, int val, unsigned long time)
720 return 0; 1063 return 0;
721 if (time == 0) 1064 if (time == 0)
722 return -ETIMEDOUT; 1065 return -ETIMEDOUT;
723 /* We expect signal_pending(current), but another thread may 1066 /*
724 * have handled it for us already. */ 1067 * We expect signal_pending(current), but another thread may
1068 * have handled it for us already.
1069 */
725 return -EINTR; 1070 return -EINTR;
726 1071
1072 out_unlock_release_sem:
1073 queue_unlock(&q, hb);
1074
727 out_release_sem: 1075 out_release_sem:
1076 up_read(&curr->mm->mmap_sem);
1077 return ret;
1078}
1079
1080/*
1081 * Userspace tried a 0 -> TID atomic transition of the futex value
1082 * and failed. The kernel side here does the whole locking operation:
1083 * if there are waiters then it will block, it does PI, etc. (Due to
1084 * races the kernel might see a 0 value of the futex too.)
1085 */
1086static int do_futex_lock_pi(u32 __user *uaddr, int detect, int trylock,
1087 struct hrtimer_sleeper *to)
1088{
1089 struct task_struct *curr = current;
1090 struct futex_hash_bucket *hb;
1091 u32 uval, newval, curval;
1092 struct futex_q q;
1093 int ret, attempt = 0;
1094
1095 if (refill_pi_state_cache())
1096 return -ENOMEM;
1097
1098 q.pi_state = NULL;
1099 retry:
1100 down_read(&curr->mm->mmap_sem);
1101
1102 ret = get_futex_key(uaddr, &q.key);
1103 if (unlikely(ret != 0))
1104 goto out_release_sem;
1105
1106 hb = queue_lock(&q, -1, NULL);
1107
1108 retry_locked:
1109 /*
1110 * To avoid races, we attempt to take the lock here again
1111 * (by doing a 0 -> TID atomic cmpxchg), while holding all
1112 * the locks. It will most likely not succeed.
1113 */
1114 newval = current->pid;
1115
1116 inc_preempt_count();
1117 curval = futex_atomic_cmpxchg_inatomic(uaddr, 0, newval);
1118 dec_preempt_count();
1119
1120 if (unlikely(curval == -EFAULT))
1121 goto uaddr_faulted;
1122
1123 /* We own the lock already */
1124 if (unlikely((curval & FUTEX_TID_MASK) == current->pid)) {
1125 if (!detect && 0)
1126 force_sig(SIGKILL, current);
1127 ret = -EDEADLK;
1128 goto out_unlock_release_sem;
1129 }
1130
1131 /*
1132 * Surprise - we got the lock. Just return
1133 * to userspace:
1134 */
1135 if (unlikely(!curval))
1136 goto out_unlock_release_sem;
1137
1138 uval = curval;
1139 newval = uval | FUTEX_WAITERS;
1140
1141 inc_preempt_count();
1142 curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
1143 dec_preempt_count();
1144
1145 if (unlikely(curval == -EFAULT))
1146 goto uaddr_faulted;
1147 if (unlikely(curval != uval))
1148 goto retry_locked;
1149
1150 /*
1151 * We dont have the lock. Look up the PI state (or create it if
1152 * we are the first waiter):
1153 */
1154 ret = lookup_pi_state(uval, hb, &q);
1155
1156 if (unlikely(ret)) {
1157 /*
1158 * There were no waiters and the owner task lookup
1159 * failed. When the OWNER_DIED bit is set, then we
1160 * know that this is a robust futex and we actually
1161 * take the lock. This is safe as we are protected by
1162 * the hash bucket lock. We also set the waiters bit
1163 * unconditionally here, to simplify glibc handling of
1164 * multiple tasks racing to acquire the lock and
1165 * cleanup the problems which were left by the dead
1166 * owner.
1167 */
1168 if (curval & FUTEX_OWNER_DIED) {
1169 uval = newval;
1170 newval = current->pid |
1171 FUTEX_OWNER_DIED | FUTEX_WAITERS;
1172
1173 inc_preempt_count();
1174 curval = futex_atomic_cmpxchg_inatomic(uaddr,
1175 uval, newval);
1176 dec_preempt_count();
1177
1178 if (unlikely(curval == -EFAULT))
1179 goto uaddr_faulted;
1180 if (unlikely(curval != uval))
1181 goto retry_locked;
1182 ret = 0;
1183 }
1184 goto out_unlock_release_sem;
1185 }
1186
1187 /*
1188 * Only actually queue now that the atomic ops are done:
1189 */
1190 __queue_me(&q, hb);
1191
1192 /*
1193 * Now the futex is queued and we have checked the data, we
1194 * don't want to hold mmap_sem while we sleep.
1195 */
1196 up_read(&curr->mm->mmap_sem);
1197
1198 WARN_ON(!q.pi_state);
1199 /*
1200 * Block on the PI mutex:
1201 */
1202 if (!trylock)
1203 ret = rt_mutex_timed_lock(&q.pi_state->pi_mutex, to, 1);
1204 else {
1205 ret = rt_mutex_trylock(&q.pi_state->pi_mutex);
1206 /* Fixup the trylock return value: */
1207 ret = ret ? 0 : -EWOULDBLOCK;
1208 }
1209
1210 down_read(&curr->mm->mmap_sem);
1211 hb = queue_lock(&q, -1, NULL);
1212
1213 /*
1214 * Got the lock. We might not be the anticipated owner if we
1215 * did a lock-steal - fix up the PI-state in that case.
1216 */
1217 if (!ret && q.pi_state->owner != curr) {
1218 u32 newtid = current->pid | FUTEX_WAITERS;
1219
1220 /* Owner died? */
1221 if (q.pi_state->owner != NULL) {
1222 spin_lock_irq(&q.pi_state->owner->pi_lock);
1223 list_del_init(&q.pi_state->list);
1224 spin_unlock_irq(&q.pi_state->owner->pi_lock);
1225 } else
1226 newtid |= FUTEX_OWNER_DIED;
1227
1228 q.pi_state->owner = current;
1229
1230 spin_lock_irq(&current->pi_lock);
1231 list_add(&q.pi_state->list, &current->pi_state_list);
1232 spin_unlock_irq(&current->pi_lock);
1233
1234 /* Unqueue and drop the lock */
1235 unqueue_me_pi(&q, hb);
1236 up_read(&curr->mm->mmap_sem);
1237 /*
1238 * We own it, so we have to replace the pending owner
1239 * TID. This must be atomic as we have preserve the
1240 * owner died bit here.
1241 */
1242 ret = get_user(uval, uaddr);
1243 while (!ret) {
1244 newval = (uval & FUTEX_OWNER_DIED) | newtid;
1245 curval = futex_atomic_cmpxchg_inatomic(uaddr,
1246 uval, newval);
1247 if (curval == -EFAULT)
1248 ret = -EFAULT;
1249 if (curval == uval)
1250 break;
1251 uval = curval;
1252 }
1253 } else {
1254 /*
1255 * Catch the rare case, where the lock was released
1256 * when we were on the way back before we locked
1257 * the hash bucket.
1258 */
1259 if (ret && q.pi_state->owner == curr) {
1260 if (rt_mutex_trylock(&q.pi_state->pi_mutex))
1261 ret = 0;
1262 }
1263 /* Unqueue and drop the lock */
1264 unqueue_me_pi(&q, hb);
1265 up_read(&curr->mm->mmap_sem);
1266 }
1267
1268 if (!detect && ret == -EDEADLK && 0)
1269 force_sig(SIGKILL, current);
1270
1271 return ret;
1272
1273 out_unlock_release_sem:
1274 queue_unlock(&q, hb);
1275
1276 out_release_sem:
1277 up_read(&curr->mm->mmap_sem);
1278 return ret;
1279
1280 uaddr_faulted:
1281 /*
1282 * We have to r/w *(int __user *)uaddr, but we can't modify it
1283 * non-atomically. Therefore, if get_user below is not
1284 * enough, we need to handle the fault ourselves, while
1285 * still holding the mmap_sem.
1286 */
1287 if (attempt++) {
1288 if (futex_handle_fault((unsigned long)uaddr, attempt))
1289 goto out_unlock_release_sem;
1290
1291 goto retry_locked;
1292 }
1293
1294 queue_unlock(&q, hb);
1295 up_read(&curr->mm->mmap_sem);
1296
1297 ret = get_user(uval, uaddr);
1298 if (!ret && (uval != -EFAULT))
1299 goto retry;
1300
1301 return ret;
1302}
1303
1304/*
1305 * Restart handler
1306 */
1307static long futex_lock_pi_restart(struct restart_block *restart)
1308{
1309 struct hrtimer_sleeper timeout, *to = NULL;
1310 int ret;
1311
1312 restart->fn = do_no_restart_syscall;
1313
1314 if (restart->arg2 || restart->arg3) {
1315 to = &timeout;
1316 hrtimer_init(&to->timer, CLOCK_REALTIME, HRTIMER_ABS);
1317 hrtimer_init_sleeper(to, current);
1318 to->timer.expires.tv64 = ((u64)restart->arg1 << 32) |
1319 (u64) restart->arg0;
1320 }
1321
1322 pr_debug("lock_pi restart: %p, %d (%d)\n",
1323 (u32 __user *)restart->arg0, current->pid);
1324
1325 ret = do_futex_lock_pi((u32 __user *)restart->arg0, restart->arg1,
1326 0, to);
1327
1328 if (ret != -EINTR)
1329 return ret;
1330
1331 restart->fn = futex_lock_pi_restart;
1332
1333 /* The other values are filled in */
1334 return -ERESTART_RESTARTBLOCK;
1335}
1336
1337/*
1338 * Called from the syscall entry below.
1339 */
1340static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
1341 long nsec, int trylock)
1342{
1343 struct hrtimer_sleeper timeout, *to = NULL;
1344 struct restart_block *restart;
1345 int ret;
1346
1347 if (sec != MAX_SCHEDULE_TIMEOUT) {
1348 to = &timeout;
1349 hrtimer_init(&to->timer, CLOCK_REALTIME, HRTIMER_ABS);
1350 hrtimer_init_sleeper(to, current);
1351 to->timer.expires = ktime_set(sec, nsec);
1352 }
1353
1354 ret = do_futex_lock_pi(uaddr, detect, trylock, to);
1355
1356 if (ret != -EINTR)
1357 return ret;
1358
1359 pr_debug("lock_pi interrupted: %p, %d (%d)\n", uaddr, current->pid);
1360
1361 restart = &current_thread_info()->restart_block;
1362 restart->fn = futex_lock_pi_restart;
1363 restart->arg0 = (unsigned long) uaddr;
1364 restart->arg1 = detect;
1365 if (to) {
1366 restart->arg2 = to->timer.expires.tv64 & 0xFFFFFFFF;
1367 restart->arg3 = to->timer.expires.tv64 >> 32;
1368 } else
1369 restart->arg2 = restart->arg3 = 0;
1370
1371 return -ERESTART_RESTARTBLOCK;
1372}
1373
1374/*
1375 * Userspace attempted a TID -> 0 atomic transition, and failed.
1376 * This is the in-kernel slowpath: we look up the PI state (if any),
1377 * and do the rt-mutex unlock.
1378 */
1379static int futex_unlock_pi(u32 __user *uaddr)
1380{
1381 struct futex_hash_bucket *hb;
1382 struct futex_q *this, *next;
1383 u32 uval;
1384 struct list_head *head;
1385 union futex_key key;
1386 int ret, attempt = 0;
1387
1388retry:
1389 if (get_user(uval, uaddr))
1390 return -EFAULT;
1391 /*
1392 * We release only a lock we actually own:
1393 */
1394 if ((uval & FUTEX_TID_MASK) != current->pid)
1395 return -EPERM;
1396 /*
1397 * First take all the futex related locks:
1398 */
1399 down_read(&current->mm->mmap_sem);
1400
1401 ret = get_futex_key(uaddr, &key);
1402 if (unlikely(ret != 0))
1403 goto out;
1404
1405 hb = hash_futex(&key);
1406 spin_lock(&hb->lock);
1407
1408retry_locked:
1409 /*
1410 * To avoid races, try to do the TID -> 0 atomic transition
1411 * again. If it succeeds then we can return without waking
1412 * anyone else up:
1413 */
1414 inc_preempt_count();
1415 uval = futex_atomic_cmpxchg_inatomic(uaddr, current->pid, 0);
1416 dec_preempt_count();
1417
1418 if (unlikely(uval == -EFAULT))
1419 goto pi_faulted;
1420 /*
1421 * Rare case: we managed to release the lock atomically,
1422 * no need to wake anyone else up:
1423 */
1424 if (unlikely(uval == current->pid))
1425 goto out_unlock;
1426
1427 /*
1428 * Ok, other tasks may need to be woken up - check waiters
1429 * and do the wakeup if necessary:
1430 */
1431 head = &hb->chain;
1432
1433 list_for_each_entry_safe(this, next, head, list) {
1434 if (!match_futex (&this->key, &key))
1435 continue;
1436 ret = wake_futex_pi(uaddr, uval, this);
1437 /*
1438 * The atomic access to the futex value
1439 * generated a pagefault, so retry the
1440 * user-access and the wakeup:
1441 */
1442 if (ret == -EFAULT)
1443 goto pi_faulted;
1444 goto out_unlock;
1445 }
1446 /*
1447 * No waiters - kernel unlocks the futex:
1448 */
1449 ret = unlock_futex_pi(uaddr, uval);
1450 if (ret == -EFAULT)
1451 goto pi_faulted;
1452
1453out_unlock:
1454 spin_unlock(&hb->lock);
1455out:
728 up_read(&current->mm->mmap_sem); 1456 up_read(&current->mm->mmap_sem);
1457
1458 return ret;
1459
1460pi_faulted:
1461 /*
1462 * We have to r/w *(int __user *)uaddr, but we can't modify it
1463 * non-atomically. Therefore, if get_user below is not
1464 * enough, we need to handle the fault ourselves, while
1465 * still holding the mmap_sem.
1466 */
1467 if (attempt++) {
1468 if (futex_handle_fault((unsigned long)uaddr, attempt))
1469 goto out_unlock;
1470
1471 goto retry_locked;
1472 }
1473
1474 spin_unlock(&hb->lock);
1475 up_read(&current->mm->mmap_sem);
1476
1477 ret = get_user(uval, uaddr);
1478 if (!ret && (uval != -EFAULT))
1479 goto retry;
1480
729 return ret; 1481 return ret;
730} 1482}
731 1483
@@ -735,6 +1487,7 @@ static int futex_close(struct inode *inode, struct file *filp)
735 1487
736 unqueue_me(q); 1488 unqueue_me(q);
737 kfree(q); 1489 kfree(q);
1490
738 return 0; 1491 return 0;
739} 1492}
740 1493
@@ -766,7 +1519,7 @@ static struct file_operations futex_fops = {
766 * Signal allows caller to avoid the race which would occur if they 1519 * Signal allows caller to avoid the race which would occur if they
767 * set the sigio stuff up afterwards. 1520 * set the sigio stuff up afterwards.
768 */ 1521 */
769static int futex_fd(unsigned long uaddr, int signal) 1522static int futex_fd(u32 __user *uaddr, int signal)
770{ 1523{
771 struct futex_q *q; 1524 struct futex_q *q;
772 struct file *filp; 1525 struct file *filp;
@@ -803,6 +1556,7 @@ static int futex_fd(unsigned long uaddr, int signal)
803 err = -ENOMEM; 1556 err = -ENOMEM;
804 goto error; 1557 goto error;
805 } 1558 }
1559 q->pi_state = NULL;
806 1560
807 down_read(&current->mm->mmap_sem); 1561 down_read(&current->mm->mmap_sem);
808 err = get_futex_key(uaddr, &q->key); 1562 err = get_futex_key(uaddr, &q->key);
@@ -840,7 +1594,7 @@ error:
840 * Implementation: user-space maintains a per-thread list of locks it 1594 * Implementation: user-space maintains a per-thread list of locks it
841 * is holding. Upon do_exit(), the kernel carefully walks this list, 1595 * is holding. Upon do_exit(), the kernel carefully walks this list,
842 * and marks all locks that are owned by this thread with the 1596 * and marks all locks that are owned by this thread with the
843 * FUTEX_OWNER_DEAD bit, and wakes up a waiter (if any). The list is 1597 * FUTEX_OWNER_DIED bit, and wakes up a waiter (if any). The list is
844 * always manipulated with the lock held, so the list is private and 1598 * always manipulated with the lock held, so the list is private and
845 * per-thread. Userspace also maintains a per-thread 'list_op_pending' 1599 * per-thread. Userspace also maintains a per-thread 'list_op_pending'
846 * field, to allow the kernel to clean up if the thread dies after 1600 * field, to allow the kernel to clean up if the thread dies after
@@ -915,7 +1669,7 @@ err_unlock:
915 */ 1669 */
916int handle_futex_death(u32 __user *uaddr, struct task_struct *curr) 1670int handle_futex_death(u32 __user *uaddr, struct task_struct *curr)
917{ 1671{
918 u32 uval; 1672 u32 uval, nval;
919 1673
920retry: 1674retry:
921 if (get_user(uval, uaddr)) 1675 if (get_user(uval, uaddr))
@@ -932,12 +1686,16 @@ retry:
932 * thread-death.) The rest of the cleanup is done in 1686 * thread-death.) The rest of the cleanup is done in
933 * userspace. 1687 * userspace.
934 */ 1688 */
935 if (futex_atomic_cmpxchg_inatomic(uaddr, uval, 1689 nval = futex_atomic_cmpxchg_inatomic(uaddr, uval,
936 uval | FUTEX_OWNER_DIED) != uval) 1690 uval | FUTEX_OWNER_DIED);
1691 if (nval == -EFAULT)
1692 return -1;
1693
1694 if (nval != uval)
937 goto retry; 1695 goto retry;
938 1696
939 if (uval & FUTEX_WAITERS) 1697 if (uval & FUTEX_WAITERS)
940 futex_wake((unsigned long)uaddr, 1); 1698 futex_wake(uaddr, 1);
941 } 1699 }
942 return 0; 1700 return 0;
943} 1701}
@@ -978,7 +1736,7 @@ void exit_robust_list(struct task_struct *curr)
978 while (entry != &head->list) { 1736 while (entry != &head->list) {
979 /* 1737 /*
980 * A pending lock might already be on the list, so 1738 * A pending lock might already be on the list, so
981 * dont process it twice: 1739 * don't process it twice:
982 */ 1740 */
983 if (entry != pending) 1741 if (entry != pending)
984 if (handle_futex_death((void *)entry + futex_offset, 1742 if (handle_futex_death((void *)entry + futex_offset,
@@ -999,8 +1757,8 @@ void exit_robust_list(struct task_struct *curr)
999 } 1757 }
1000} 1758}
1001 1759
1002long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout, 1760long do_futex(u32 __user *uaddr, int op, u32 val, unsigned long timeout,
1003 unsigned long uaddr2, int val2, int val3) 1761 u32 __user *uaddr2, u32 val2, u32 val3)
1004{ 1762{
1005 int ret; 1763 int ret;
1006 1764
@@ -1024,6 +1782,15 @@ long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout,
1024 case FUTEX_WAKE_OP: 1782 case FUTEX_WAKE_OP:
1025 ret = futex_wake_op(uaddr, uaddr2, val, val2, val3); 1783 ret = futex_wake_op(uaddr, uaddr2, val, val2, val3);
1026 break; 1784 break;
1785 case FUTEX_LOCK_PI:
1786 ret = futex_lock_pi(uaddr, val, timeout, val2, 0);
1787 break;
1788 case FUTEX_UNLOCK_PI:
1789 ret = futex_unlock_pi(uaddr);
1790 break;
1791 case FUTEX_TRYLOCK_PI:
1792 ret = futex_lock_pi(uaddr, 0, timeout, val2, 1);
1793 break;
1027 default: 1794 default:
1028 ret = -ENOSYS; 1795 ret = -ENOSYS;
1029 } 1796 }
@@ -1031,29 +1798,33 @@ long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout,
1031} 1798}
1032 1799
1033 1800
1034asmlinkage long sys_futex(u32 __user *uaddr, int op, int val, 1801asmlinkage long sys_futex(u32 __user *uaddr, int op, u32 val,
1035 struct timespec __user *utime, u32 __user *uaddr2, 1802 struct timespec __user *utime, u32 __user *uaddr2,
1036 int val3) 1803 u32 val3)
1037{ 1804{
1038 struct timespec t; 1805 struct timespec t;
1039 unsigned long timeout = MAX_SCHEDULE_TIMEOUT; 1806 unsigned long timeout = MAX_SCHEDULE_TIMEOUT;
1040 int val2 = 0; 1807 u32 val2 = 0;
1041 1808
1042 if (utime && (op == FUTEX_WAIT)) { 1809 if (utime && (op == FUTEX_WAIT || op == FUTEX_LOCK_PI)) {
1043 if (copy_from_user(&t, utime, sizeof(t)) != 0) 1810 if (copy_from_user(&t, utime, sizeof(t)) != 0)
1044 return -EFAULT; 1811 return -EFAULT;
1045 if (!timespec_valid(&t)) 1812 if (!timespec_valid(&t))
1046 return -EINVAL; 1813 return -EINVAL;
1047 timeout = timespec_to_jiffies(&t) + 1; 1814 if (op == FUTEX_WAIT)
1815 timeout = timespec_to_jiffies(&t) + 1;
1816 else {
1817 timeout = t.tv_sec;
1818 val2 = t.tv_nsec;
1819 }
1048 } 1820 }
1049 /* 1821 /*
1050 * requeue parameter in 'utime' if op == FUTEX_REQUEUE. 1822 * requeue parameter in 'utime' if op == FUTEX_REQUEUE.
1051 */ 1823 */
1052 if (op >= FUTEX_REQUEUE) 1824 if (op == FUTEX_REQUEUE || op == FUTEX_CMP_REQUEUE)
1053 val2 = (int) (unsigned long) utime; 1825 val2 = (u32) (unsigned long) utime;
1054 1826
1055 return do_futex((unsigned long)uaddr, op, val, timeout, 1827 return do_futex(uaddr, op, val, timeout, uaddr2, val2, val3);
1056 (unsigned long)uaddr2, val2, val3);
1057} 1828}
1058 1829
1059static int futexfs_get_sb(struct file_system_type *fs_type, 1830static int futexfs_get_sb(struct file_system_type *fs_type,
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index 1ab6a0ea3d14..d1d92b441fb7 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -129,16 +129,20 @@ asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val,
129 unsigned long timeout = MAX_SCHEDULE_TIMEOUT; 129 unsigned long timeout = MAX_SCHEDULE_TIMEOUT;
130 int val2 = 0; 130 int val2 = 0;
131 131
132 if (utime && (op == FUTEX_WAIT)) { 132 if (utime && (op == FUTEX_WAIT || op == FUTEX_LOCK_PI)) {
133 if (get_compat_timespec(&t, utime)) 133 if (get_compat_timespec(&t, utime))
134 return -EFAULT; 134 return -EFAULT;
135 if (!timespec_valid(&t)) 135 if (!timespec_valid(&t))
136 return -EINVAL; 136 return -EINVAL;
137 timeout = timespec_to_jiffies(&t) + 1; 137 if (op == FUTEX_WAIT)
138 timeout = timespec_to_jiffies(&t) + 1;
139 else {
140 timeout = t.tv_sec;
141 val2 = t.tv_nsec;
142 }
138 } 143 }
139 if (op >= FUTEX_REQUEUE) 144 if (op == FUTEX_REQUEUE || op == FUTEX_CMP_REQUEUE)
140 val2 = (int) (unsigned long) utime; 145 val2 = (int) (unsigned long) utime;
141 146
142 return do_futex((unsigned long)uaddr, op, val, timeout, 147 return do_futex(uaddr, op, val, timeout, uaddr2, val2, val3);
143 (unsigned long)uaddr2, val2, val3);
144} 148}
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 55601b3ce60e..8d3dc29ef41a 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -833,7 +833,7 @@ static void migrate_hrtimers(int cpu)
833} 833}
834#endif /* CONFIG_HOTPLUG_CPU */ 834#endif /* CONFIG_HOTPLUG_CPU */
835 835
836static int hrtimer_cpu_notify(struct notifier_block *self, 836static int __devinit hrtimer_cpu_notify(struct notifier_block *self,
837 unsigned long action, void *hcpu) 837 unsigned long action, void *hcpu)
838{ 838{
839 long cpu = (long)hcpu; 839 long cpu = (long)hcpu;
@@ -857,7 +857,7 @@ static int hrtimer_cpu_notify(struct notifier_block *self,
857 return NOTIFY_OK; 857 return NOTIFY_OK;
858} 858}
859 859
860static struct notifier_block hrtimers_nb = { 860static struct notifier_block __devinitdata hrtimers_nb = {
861 .notifier_call = hrtimer_cpu_notify, 861 .notifier_call = hrtimer_cpu_notify,
862}; 862};
863 863
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 1fbf466a29aa..64aab081153b 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -47,11 +47,17 @@
47 47
48static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE]; 48static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE];
49static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE]; 49static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE];
50static atomic_t kprobe_count;
50 51
51DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */ 52DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */
52DEFINE_SPINLOCK(kretprobe_lock); /* Protects kretprobe_inst_table */ 53DEFINE_SPINLOCK(kretprobe_lock); /* Protects kretprobe_inst_table */
53static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; 54static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL;
54 55
56static struct notifier_block kprobe_page_fault_nb = {
57 .notifier_call = kprobe_exceptions_notify,
58 .priority = 0x7fffffff /* we need to notified first */
59};
60
55#ifdef __ARCH_WANT_KPROBES_INSN_SLOT 61#ifdef __ARCH_WANT_KPROBES_INSN_SLOT
56/* 62/*
57 * kprobe->ainsn.insn points to the copy of the instruction to be 63 * kprobe->ainsn.insn points to the copy of the instruction to be
@@ -368,16 +374,15 @@ static inline void copy_kprobe(struct kprobe *old_p, struct kprobe *p)
368*/ 374*/
369static int __kprobes add_new_kprobe(struct kprobe *old_p, struct kprobe *p) 375static int __kprobes add_new_kprobe(struct kprobe *old_p, struct kprobe *p)
370{ 376{
371 struct kprobe *kp;
372
373 if (p->break_handler) { 377 if (p->break_handler) {
374 list_for_each_entry_rcu(kp, &old_p->list, list) { 378 if (old_p->break_handler)
375 if (kp->break_handler) 379 return -EEXIST;
376 return -EEXIST;
377 }
378 list_add_tail_rcu(&p->list, &old_p->list); 380 list_add_tail_rcu(&p->list, &old_p->list);
381 old_p->break_handler = aggr_break_handler;
379 } else 382 } else
380 list_add_rcu(&p->list, &old_p->list); 383 list_add_rcu(&p->list, &old_p->list);
384 if (p->post_handler && !old_p->post_handler)
385 old_p->post_handler = aggr_post_handler;
381 return 0; 386 return 0;
382} 387}
383 388
@@ -390,9 +395,11 @@ static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p)
390 copy_kprobe(p, ap); 395 copy_kprobe(p, ap);
391 ap->addr = p->addr; 396 ap->addr = p->addr;
392 ap->pre_handler = aggr_pre_handler; 397 ap->pre_handler = aggr_pre_handler;
393 ap->post_handler = aggr_post_handler;
394 ap->fault_handler = aggr_fault_handler; 398 ap->fault_handler = aggr_fault_handler;
395 ap->break_handler = aggr_break_handler; 399 if (p->post_handler)
400 ap->post_handler = aggr_post_handler;
401 if (p->break_handler)
402 ap->break_handler = aggr_break_handler;
396 403
397 INIT_LIST_HEAD(&ap->list); 404 INIT_LIST_HEAD(&ap->list);
398 list_add_rcu(&p->list, &ap->list); 405 list_add_rcu(&p->list, &ap->list);
@@ -464,6 +471,8 @@ static int __kprobes __register_kprobe(struct kprobe *p,
464 old_p = get_kprobe(p->addr); 471 old_p = get_kprobe(p->addr);
465 if (old_p) { 472 if (old_p) {
466 ret = register_aggr_kprobe(old_p, p); 473 ret = register_aggr_kprobe(old_p, p);
474 if (!ret)
475 atomic_inc(&kprobe_count);
467 goto out; 476 goto out;
468 } 477 }
469 478
@@ -474,6 +483,10 @@ static int __kprobes __register_kprobe(struct kprobe *p,
474 hlist_add_head_rcu(&p->hlist, 483 hlist_add_head_rcu(&p->hlist,
475 &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]); 484 &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]);
476 485
486 if (atomic_add_return(1, &kprobe_count) == \
487 (ARCH_INACTIVE_KPROBE_COUNT + 1))
488 register_page_fault_notifier(&kprobe_page_fault_nb);
489
477 arch_arm_kprobe(p); 490 arch_arm_kprobe(p);
478 491
479out: 492out:
@@ -536,14 +549,40 @@ valid_p:
536 kfree(old_p); 549 kfree(old_p);
537 } 550 }
538 arch_remove_kprobe(p); 551 arch_remove_kprobe(p);
552 } else {
553 mutex_lock(&kprobe_mutex);
554 if (p->break_handler)
555 old_p->break_handler = NULL;
556 if (p->post_handler){
557 list_for_each_entry_rcu(list_p, &old_p->list, list){
558 if (list_p->post_handler){
559 cleanup_p = 2;
560 break;
561 }
562 }
563 if (cleanup_p == 0)
564 old_p->post_handler = NULL;
565 }
566 mutex_unlock(&kprobe_mutex);
539 } 567 }
568
569 /* Call unregister_page_fault_notifier()
570 * if no probes are active
571 */
572 mutex_lock(&kprobe_mutex);
573 if (atomic_add_return(-1, &kprobe_count) == \
574 ARCH_INACTIVE_KPROBE_COUNT)
575 unregister_page_fault_notifier(&kprobe_page_fault_nb);
576 mutex_unlock(&kprobe_mutex);
577 return;
540} 578}
541 579
542static struct notifier_block kprobe_exceptions_nb = { 580static struct notifier_block kprobe_exceptions_nb = {
543 .notifier_call = kprobe_exceptions_notify, 581 .notifier_call = kprobe_exceptions_notify,
544 .priority = 0x7fffffff /* we need to notified first */ 582 .priority = 0x7fffffff /* we need to be notified first */
545}; 583};
546 584
585
547int __kprobes register_jprobe(struct jprobe *jp) 586int __kprobes register_jprobe(struct jprobe *jp)
548{ 587{
549 /* Todo: Verify probepoint is a function entry point */ 588 /* Todo: Verify probepoint is a function entry point */
@@ -652,6 +691,7 @@ static int __init init_kprobes(void)
652 INIT_HLIST_HEAD(&kprobe_table[i]); 691 INIT_HLIST_HEAD(&kprobe_table[i]);
653 INIT_HLIST_HEAD(&kretprobe_inst_table[i]); 692 INIT_HLIST_HEAD(&kretprobe_inst_table[i]);
654 } 693 }
694 atomic_set(&kprobe_count, 0);
655 695
656 err = arch_init_kprobes(); 696 err = arch_init_kprobes();
657 if (!err) 697 if (!err)
diff --git a/kernel/module.c b/kernel/module.c
index d75275de1c28..10e5b872adf6 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -40,9 +40,11 @@
40#include <linux/string.h> 40#include <linux/string.h>
41#include <linux/sched.h> 41#include <linux/sched.h>
42#include <linux/mutex.h> 42#include <linux/mutex.h>
43#include <linux/unwind.h>
43#include <asm/uaccess.h> 44#include <asm/uaccess.h>
44#include <asm/semaphore.h> 45#include <asm/semaphore.h>
45#include <asm/cacheflush.h> 46#include <asm/cacheflush.h>
47#include <linux/license.h>
46 48
47#if 0 49#if 0
48#define DEBUGP printk 50#define DEBUGP printk
@@ -1051,6 +1053,8 @@ static void free_module(struct module *mod)
1051 remove_sect_attrs(mod); 1053 remove_sect_attrs(mod);
1052 mod_kobject_remove(mod); 1054 mod_kobject_remove(mod);
1053 1055
1056 unwind_remove_table(mod->unwind_info, 0);
1057
1054 /* Arch-specific cleanup. */ 1058 /* Arch-specific cleanup. */
1055 module_arch_cleanup(mod); 1059 module_arch_cleanup(mod);
1056 1060
@@ -1248,16 +1252,6 @@ static void layout_sections(struct module *mod,
1248 } 1252 }
1249} 1253}
1250 1254
1251static inline int license_is_gpl_compatible(const char *license)
1252{
1253 return (strcmp(license, "GPL") == 0
1254 || strcmp(license, "GPL v2") == 0
1255 || strcmp(license, "GPL and additional rights") == 0
1256 || strcmp(license, "Dual BSD/GPL") == 0
1257 || strcmp(license, "Dual MIT/GPL") == 0
1258 || strcmp(license, "Dual MPL/GPL") == 0);
1259}
1260
1261static void set_license(struct module *mod, const char *license) 1255static void set_license(struct module *mod, const char *license)
1262{ 1256{
1263 if (!license) 1257 if (!license)
@@ -1412,7 +1406,7 @@ static struct module *load_module(void __user *umod,
1412 unsigned int i, symindex = 0, strindex = 0, setupindex, exindex, 1406 unsigned int i, symindex = 0, strindex = 0, setupindex, exindex,
1413 exportindex, modindex, obsparmindex, infoindex, gplindex, 1407 exportindex, modindex, obsparmindex, infoindex, gplindex,
1414 crcindex, gplcrcindex, versindex, pcpuindex, gplfutureindex, 1408 crcindex, gplcrcindex, versindex, pcpuindex, gplfutureindex,
1415 gplfuturecrcindex; 1409 gplfuturecrcindex, unwindex = 0;
1416 struct module *mod; 1410 struct module *mod;
1417 long err = 0; 1411 long err = 0;
1418 void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */ 1412 void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */
@@ -1502,6 +1496,9 @@ static struct module *load_module(void __user *umod,
1502 versindex = find_sec(hdr, sechdrs, secstrings, "__versions"); 1496 versindex = find_sec(hdr, sechdrs, secstrings, "__versions");
1503 infoindex = find_sec(hdr, sechdrs, secstrings, ".modinfo"); 1497 infoindex = find_sec(hdr, sechdrs, secstrings, ".modinfo");
1504 pcpuindex = find_pcpusec(hdr, sechdrs, secstrings); 1498 pcpuindex = find_pcpusec(hdr, sechdrs, secstrings);
1499#ifdef ARCH_UNWIND_SECTION_NAME
1500 unwindex = find_sec(hdr, sechdrs, secstrings, ARCH_UNWIND_SECTION_NAME);
1501#endif
1505 1502
1506 /* Don't keep modinfo section */ 1503 /* Don't keep modinfo section */
1507 sechdrs[infoindex].sh_flags &= ~(unsigned long)SHF_ALLOC; 1504 sechdrs[infoindex].sh_flags &= ~(unsigned long)SHF_ALLOC;
@@ -1510,6 +1507,8 @@ static struct module *load_module(void __user *umod,
1510 sechdrs[symindex].sh_flags |= SHF_ALLOC; 1507 sechdrs[symindex].sh_flags |= SHF_ALLOC;
1511 sechdrs[strindex].sh_flags |= SHF_ALLOC; 1508 sechdrs[strindex].sh_flags |= SHF_ALLOC;
1512#endif 1509#endif
1510 if (unwindex)
1511 sechdrs[unwindex].sh_flags |= SHF_ALLOC;
1513 1512
1514 /* Check module struct version now, before we try to use module. */ 1513 /* Check module struct version now, before we try to use module. */
1515 if (!check_modstruct_version(sechdrs, versindex, mod)) { 1514 if (!check_modstruct_version(sechdrs, versindex, mod)) {
@@ -1738,6 +1737,11 @@ static struct module *load_module(void __user *umod,
1738 goto arch_cleanup; 1737 goto arch_cleanup;
1739 add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs); 1738 add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs);
1740 1739
1740 /* Size of section 0 is 0, so this works well if no unwind info. */
1741 mod->unwind_info = unwind_add_table(mod,
1742 (void *)sechdrs[unwindex].sh_addr,
1743 sechdrs[unwindex].sh_size);
1744
1741 /* Get rid of temporary copy */ 1745 /* Get rid of temporary copy */
1742 vfree(hdr); 1746 vfree(hdr);
1743 1747
@@ -1836,6 +1840,7 @@ sys_init_module(void __user *umod,
1836 mod->state = MODULE_STATE_LIVE; 1840 mod->state = MODULE_STATE_LIVE;
1837 /* Drop initial reference. */ 1841 /* Drop initial reference. */
1838 module_put(mod); 1842 module_put(mod);
1843 unwind_remove_table(mod->unwind_info, 1);
1839 module_free(mod, mod->module_init); 1844 module_free(mod, mod->module_init);
1840 mod->module_init = NULL; 1845 mod->module_init = NULL;
1841 mod->init_size = 0; 1846 mod->init_size = 0;
diff --git a/kernel/mutex-debug.c b/kernel/mutex-debug.c
index f4913c376950..e38e4bac97ca 100644
--- a/kernel/mutex-debug.c
+++ b/kernel/mutex-debug.c
@@ -16,6 +16,7 @@
16#include <linux/sched.h> 16#include <linux/sched.h>
17#include <linux/delay.h> 17#include <linux/delay.h>
18#include <linux/module.h> 18#include <linux/module.h>
19#include <linux/poison.h>
19#include <linux/spinlock.h> 20#include <linux/spinlock.h>
20#include <linux/kallsyms.h> 21#include <linux/kallsyms.h>
21#include <linux/interrupt.h> 22#include <linux/interrupt.h>
@@ -153,13 +154,13 @@ next:
153 continue; 154 continue;
154 count++; 155 count++;
155 cursor = curr->next; 156 cursor = curr->next;
156 debug_spin_lock_restore(&debug_mutex_lock, flags); 157 debug_spin_unlock_restore(&debug_mutex_lock, flags);
157 158
158 printk("\n#%03d: ", count); 159 printk("\n#%03d: ", count);
159 printk_lock(lock, filter ? 0 : 1); 160 printk_lock(lock, filter ? 0 : 1);
160 goto next; 161 goto next;
161 } 162 }
162 debug_spin_lock_restore(&debug_mutex_lock, flags); 163 debug_spin_unlock_restore(&debug_mutex_lock, flags);
163 printk("\n"); 164 printk("\n");
164} 165}
165 166
@@ -316,7 +317,7 @@ void mutex_debug_check_no_locks_held(struct task_struct *task)
316 continue; 317 continue;
317 list_del_init(curr); 318 list_del_init(curr);
318 DEBUG_OFF(); 319 DEBUG_OFF();
319 debug_spin_lock_restore(&debug_mutex_lock, flags); 320 debug_spin_unlock_restore(&debug_mutex_lock, flags);
320 321
321 printk("BUG: %s/%d, lock held at task exit time!\n", 322 printk("BUG: %s/%d, lock held at task exit time!\n",
322 task->comm, task->pid); 323 task->comm, task->pid);
@@ -325,7 +326,7 @@ void mutex_debug_check_no_locks_held(struct task_struct *task)
325 printk("exiting task is not even the owner??\n"); 326 printk("exiting task is not even the owner??\n");
326 return; 327 return;
327 } 328 }
328 debug_spin_lock_restore(&debug_mutex_lock, flags); 329 debug_spin_unlock_restore(&debug_mutex_lock, flags);
329} 330}
330 331
331/* 332/*
@@ -352,7 +353,7 @@ void mutex_debug_check_no_locks_freed(const void *from, unsigned long len)
352 continue; 353 continue;
353 list_del_init(curr); 354 list_del_init(curr);
354 DEBUG_OFF(); 355 DEBUG_OFF();
355 debug_spin_lock_restore(&debug_mutex_lock, flags); 356 debug_spin_unlock_restore(&debug_mutex_lock, flags);
356 357
357 printk("BUG: %s/%d, active lock [%p(%p-%p)] freed!\n", 358 printk("BUG: %s/%d, active lock [%p(%p-%p)] freed!\n",
358 current->comm, current->pid, lock, from, to); 359 current->comm, current->pid, lock, from, to);
@@ -362,7 +363,7 @@ void mutex_debug_check_no_locks_freed(const void *from, unsigned long len)
362 printk("freeing task is not even the owner??\n"); 363 printk("freeing task is not even the owner??\n");
363 return; 364 return;
364 } 365 }
365 debug_spin_lock_restore(&debug_mutex_lock, flags); 366 debug_spin_unlock_restore(&debug_mutex_lock, flags);
366} 367}
367 368
368/* 369/*
@@ -381,7 +382,7 @@ void debug_mutex_set_owner(struct mutex *lock,
381 382
382void debug_mutex_init_waiter(struct mutex_waiter *waiter) 383void debug_mutex_init_waiter(struct mutex_waiter *waiter)
383{ 384{
384 memset(waiter, 0x11, sizeof(*waiter)); 385 memset(waiter, MUTEX_DEBUG_INIT, sizeof(*waiter));
385 waiter->magic = waiter; 386 waiter->magic = waiter;
386 INIT_LIST_HEAD(&waiter->list); 387 INIT_LIST_HEAD(&waiter->list);
387} 388}
@@ -397,7 +398,7 @@ void debug_mutex_wake_waiter(struct mutex *lock, struct mutex_waiter *waiter)
397void debug_mutex_free_waiter(struct mutex_waiter *waiter) 398void debug_mutex_free_waiter(struct mutex_waiter *waiter)
398{ 399{
399 DEBUG_WARN_ON(!list_empty(&waiter->list)); 400 DEBUG_WARN_ON(!list_empty(&waiter->list));
400 memset(waiter, 0x22, sizeof(*waiter)); 401 memset(waiter, MUTEX_DEBUG_FREE, sizeof(*waiter));
401} 402}
402 403
403void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter, 404void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter,
diff --git a/kernel/mutex-debug.h b/kernel/mutex-debug.h
index fd384050acb1..a5196c36a5fd 100644
--- a/kernel/mutex-debug.h
+++ b/kernel/mutex-debug.h
@@ -46,21 +46,6 @@ extern void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter,
46extern void debug_mutex_unlock(struct mutex *lock); 46extern void debug_mutex_unlock(struct mutex *lock);
47extern void debug_mutex_init(struct mutex *lock, const char *name); 47extern void debug_mutex_init(struct mutex *lock, const char *name);
48 48
49#define debug_spin_lock(lock) \
50 do { \
51 local_irq_disable(); \
52 if (debug_mutex_on) \
53 spin_lock(lock); \
54 } while (0)
55
56#define debug_spin_unlock(lock) \
57 do { \
58 if (debug_mutex_on) \
59 spin_unlock(lock); \
60 local_irq_enable(); \
61 preempt_check_resched(); \
62 } while (0)
63
64#define debug_spin_lock_save(lock, flags) \ 49#define debug_spin_lock_save(lock, flags) \
65 do { \ 50 do { \
66 local_irq_save(flags); \ 51 local_irq_save(flags); \
@@ -68,7 +53,7 @@ extern void debug_mutex_init(struct mutex *lock, const char *name);
68 spin_lock(lock); \ 53 spin_lock(lock); \
69 } while (0) 54 } while (0)
70 55
71#define debug_spin_lock_restore(lock, flags) \ 56#define debug_spin_unlock_restore(lock, flags) \
72 do { \ 57 do { \
73 if (debug_mutex_on) \ 58 if (debug_mutex_on) \
74 spin_unlock(lock); \ 59 spin_unlock(lock); \
@@ -76,20 +61,20 @@ extern void debug_mutex_init(struct mutex *lock, const char *name);
76 preempt_check_resched(); \ 61 preempt_check_resched(); \
77 } while (0) 62 } while (0)
78 63
79#define spin_lock_mutex(lock) \ 64#define spin_lock_mutex(lock, flags) \
80 do { \ 65 do { \
81 struct mutex *l = container_of(lock, struct mutex, wait_lock); \ 66 struct mutex *l = container_of(lock, struct mutex, wait_lock); \
82 \ 67 \
83 DEBUG_WARN_ON(in_interrupt()); \ 68 DEBUG_WARN_ON(in_interrupt()); \
84 debug_spin_lock(&debug_mutex_lock); \ 69 debug_spin_lock_save(&debug_mutex_lock, flags); \
85 spin_lock(lock); \ 70 spin_lock(lock); \
86 DEBUG_WARN_ON(l->magic != l); \ 71 DEBUG_WARN_ON(l->magic != l); \
87 } while (0) 72 } while (0)
88 73
89#define spin_unlock_mutex(lock) \ 74#define spin_unlock_mutex(lock, flags) \
90 do { \ 75 do { \
91 spin_unlock(lock); \ 76 spin_unlock(lock); \
92 debug_spin_unlock(&debug_mutex_lock); \ 77 debug_spin_unlock_restore(&debug_mutex_lock, flags); \
93 } while (0) 78 } while (0)
94 79
95#define DEBUG_OFF() \ 80#define DEBUG_OFF() \
diff --git a/kernel/mutex.c b/kernel/mutex.c
index 5449b210d9ed..7043db21bbce 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -125,10 +125,11 @@ __mutex_lock_common(struct mutex *lock, long state __IP_DECL__)
125 struct task_struct *task = current; 125 struct task_struct *task = current;
126 struct mutex_waiter waiter; 126 struct mutex_waiter waiter;
127 unsigned int old_val; 127 unsigned int old_val;
128 unsigned long flags;
128 129
129 debug_mutex_init_waiter(&waiter); 130 debug_mutex_init_waiter(&waiter);
130 131
131 spin_lock_mutex(&lock->wait_lock); 132 spin_lock_mutex(&lock->wait_lock, flags);
132 133
133 debug_mutex_add_waiter(lock, &waiter, task->thread_info, ip); 134 debug_mutex_add_waiter(lock, &waiter, task->thread_info, ip);
134 135
@@ -157,7 +158,7 @@ __mutex_lock_common(struct mutex *lock, long state __IP_DECL__)
157 if (unlikely(state == TASK_INTERRUPTIBLE && 158 if (unlikely(state == TASK_INTERRUPTIBLE &&
158 signal_pending(task))) { 159 signal_pending(task))) {
159 mutex_remove_waiter(lock, &waiter, task->thread_info); 160 mutex_remove_waiter(lock, &waiter, task->thread_info);
160 spin_unlock_mutex(&lock->wait_lock); 161 spin_unlock_mutex(&lock->wait_lock, flags);
161 162
162 debug_mutex_free_waiter(&waiter); 163 debug_mutex_free_waiter(&waiter);
163 return -EINTR; 164 return -EINTR;
@@ -165,9 +166,9 @@ __mutex_lock_common(struct mutex *lock, long state __IP_DECL__)
165 __set_task_state(task, state); 166 __set_task_state(task, state);
166 167
167 /* didnt get the lock, go to sleep: */ 168 /* didnt get the lock, go to sleep: */
168 spin_unlock_mutex(&lock->wait_lock); 169 spin_unlock_mutex(&lock->wait_lock, flags);
169 schedule(); 170 schedule();
170 spin_lock_mutex(&lock->wait_lock); 171 spin_lock_mutex(&lock->wait_lock, flags);
171 } 172 }
172 173
173 /* got the lock - rejoice! */ 174 /* got the lock - rejoice! */
@@ -178,7 +179,7 @@ __mutex_lock_common(struct mutex *lock, long state __IP_DECL__)
178 if (likely(list_empty(&lock->wait_list))) 179 if (likely(list_empty(&lock->wait_list)))
179 atomic_set(&lock->count, 0); 180 atomic_set(&lock->count, 0);
180 181
181 spin_unlock_mutex(&lock->wait_lock); 182 spin_unlock_mutex(&lock->wait_lock, flags);
182 183
183 debug_mutex_free_waiter(&waiter); 184 debug_mutex_free_waiter(&waiter);
184 185
@@ -203,10 +204,11 @@ static fastcall noinline void
203__mutex_unlock_slowpath(atomic_t *lock_count __IP_DECL__) 204__mutex_unlock_slowpath(atomic_t *lock_count __IP_DECL__)
204{ 205{
205 struct mutex *lock = container_of(lock_count, struct mutex, count); 206 struct mutex *lock = container_of(lock_count, struct mutex, count);
207 unsigned long flags;
206 208
207 DEBUG_WARN_ON(lock->owner != current_thread_info()); 209 DEBUG_WARN_ON(lock->owner != current_thread_info());
208 210
209 spin_lock_mutex(&lock->wait_lock); 211 spin_lock_mutex(&lock->wait_lock, flags);
210 212
211 /* 213 /*
212 * some architectures leave the lock unlocked in the fastpath failure 214 * some architectures leave the lock unlocked in the fastpath failure
@@ -231,7 +233,7 @@ __mutex_unlock_slowpath(atomic_t *lock_count __IP_DECL__)
231 233
232 debug_mutex_clear_owner(lock); 234 debug_mutex_clear_owner(lock);
233 235
234 spin_unlock_mutex(&lock->wait_lock); 236 spin_unlock_mutex(&lock->wait_lock, flags);
235} 237}
236 238
237/* 239/*
@@ -276,9 +278,10 @@ __mutex_lock_interruptible_slowpath(atomic_t *lock_count __IP_DECL__)
276static inline int __mutex_trylock_slowpath(atomic_t *lock_count) 278static inline int __mutex_trylock_slowpath(atomic_t *lock_count)
277{ 279{
278 struct mutex *lock = container_of(lock_count, struct mutex, count); 280 struct mutex *lock = container_of(lock_count, struct mutex, count);
281 unsigned long flags;
279 int prev; 282 int prev;
280 283
281 spin_lock_mutex(&lock->wait_lock); 284 spin_lock_mutex(&lock->wait_lock, flags);
282 285
283 prev = atomic_xchg(&lock->count, -1); 286 prev = atomic_xchg(&lock->count, -1);
284 if (likely(prev == 1)) 287 if (likely(prev == 1))
@@ -287,7 +290,7 @@ static inline int __mutex_trylock_slowpath(atomic_t *lock_count)
287 if (likely(list_empty(&lock->wait_list))) 290 if (likely(list_empty(&lock->wait_list)))
288 atomic_set(&lock->count, 0); 291 atomic_set(&lock->count, 0);
289 292
290 spin_unlock_mutex(&lock->wait_lock); 293 spin_unlock_mutex(&lock->wait_lock, flags);
291 294
292 return prev == 1; 295 return prev == 1;
293} 296}
diff --git a/kernel/mutex.h b/kernel/mutex.h
index 00fe84e7b672..069189947257 100644
--- a/kernel/mutex.h
+++ b/kernel/mutex.h
@@ -9,8 +9,10 @@
9 * !CONFIG_DEBUG_MUTEXES case. Most of them are NOPs: 9 * !CONFIG_DEBUG_MUTEXES case. Most of them are NOPs:
10 */ 10 */
11 11
12#define spin_lock_mutex(lock) spin_lock(lock) 12#define spin_lock_mutex(lock, flags) \
13#define spin_unlock_mutex(lock) spin_unlock(lock) 13 do { spin_lock(lock); (void)(flags); } while (0)
14#define spin_unlock_mutex(lock, flags) \
15 do { spin_unlock(lock); (void)(flags); } while (0)
14#define mutex_remove_waiter(lock, waiter, ti) \ 16#define mutex_remove_waiter(lock, waiter, ti) \
15 __list_del((waiter)->list.prev, (waiter)->list.next) 17 __list_del((waiter)->list.prev, (waiter)->list.next)
16 18
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index fc311a4673a2..857b4fa09124 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -38,13 +38,22 @@ config PM_DEBUG
38 38
39config PM_TRACE 39config PM_TRACE
40 bool "Suspend/resume event tracing" 40 bool "Suspend/resume event tracing"
41 depends on PM && PM_DEBUG && X86_32 41 depends on PM && PM_DEBUG && X86_32 && EXPERIMENTAL
42 default y 42 default n
43 ---help--- 43 ---help---
44 This enables some cheesy code to save the last PM event point in the 44 This enables some cheesy code to save the last PM event point in the
45 RTC across reboots, so that you can debug a machine that just hangs 45 RTC across reboots, so that you can debug a machine that just hangs
46 during suspend (or more commonly, during resume). 46 during suspend (or more commonly, during resume).
47 47
48 To use this debugging feature you should attempt to suspend the machine,
49 then reboot it, then run
50
51 dmesg -s 1000000 | grep 'hash matches'
52
53 CAUTION: this option will cause your machine's real-time clock to be
54 set to an invalid time after a resume.
55
56
48config SOFTWARE_SUSPEND 57config SOFTWARE_SUSPEND
49 bool "Software Suspend" 58 bool "Software Suspend"
50 depends on PM && SWAP && (X86 && (!SMP || SUSPEND_SMP)) || ((FRV || PPC32) && !SMP) 59 depends on PM && SWAP && (X86 && (!SMP || SUSPEND_SMP)) || ((FRV || PPC32) && !SMP)
diff --git a/kernel/profile.c b/kernel/profile.c
index 68afe121e507..5a730fdb1a2c 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -299,7 +299,7 @@ out:
299} 299}
300 300
301#ifdef CONFIG_HOTPLUG_CPU 301#ifdef CONFIG_HOTPLUG_CPU
302static int profile_cpu_callback(struct notifier_block *info, 302static int __devinit profile_cpu_callback(struct notifier_block *info,
303 unsigned long action, void *__cpu) 303 unsigned long action, void *__cpu)
304{ 304{
305 int node, cpu = (unsigned long)__cpu; 305 int node, cpu = (unsigned long)__cpu;
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 921c22ad16e4..335c5b932e14 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -120,8 +120,18 @@ int ptrace_check_attach(struct task_struct *child, int kill)
120 120
121static int may_attach(struct task_struct *task) 121static int may_attach(struct task_struct *task)
122{ 122{
123 if (!task->mm) 123 /* May we inspect the given task?
124 return -EPERM; 124 * This check is used both for attaching with ptrace
125 * and for allowing access to sensitive information in /proc.
126 *
127 * ptrace_attach denies several cases that /proc allows
128 * because setting up the necessary parent/child relationship
129 * or halting the specified task is impossible.
130 */
131 int dumpable = 0;
132 /* Don't let security modules deny introspection */
133 if (task == current)
134 return 0;
125 if (((current->uid != task->euid) || 135 if (((current->uid != task->euid) ||
126 (current->uid != task->suid) || 136 (current->uid != task->suid) ||
127 (current->uid != task->uid) || 137 (current->uid != task->uid) ||
@@ -130,7 +140,9 @@ static int may_attach(struct task_struct *task)
130 (current->gid != task->gid)) && !capable(CAP_SYS_PTRACE)) 140 (current->gid != task->gid)) && !capable(CAP_SYS_PTRACE))
131 return -EPERM; 141 return -EPERM;
132 smp_rmb(); 142 smp_rmb();
133 if (!task->mm->dumpable && !capable(CAP_SYS_PTRACE)) 143 if (task->mm)
144 dumpable = task->mm->dumpable;
145 if (!dumpable && !capable(CAP_SYS_PTRACE))
134 return -EPERM; 146 return -EPERM;
135 147
136 return security_ptrace(current, task); 148 return security_ptrace(current, task);
@@ -176,6 +188,8 @@ repeat:
176 goto repeat; 188 goto repeat;
177 } 189 }
178 190
191 if (!task->mm)
192 goto bad;
179 /* the same process cannot be attached many times */ 193 /* the same process cannot be attached many times */
180 if (task->ptrace & PT_PTRACED) 194 if (task->ptrace & PT_PTRACED)
181 goto bad; 195 goto bad;
@@ -200,7 +214,7 @@ out:
200 return retval; 214 return retval;
201} 215}
202 216
203void __ptrace_detach(struct task_struct *child, unsigned int data) 217static inline void __ptrace_detach(struct task_struct *child, unsigned int data)
204{ 218{
205 child->exit_code = data; 219 child->exit_code = data;
206 /* .. re-parent .. */ 220 /* .. re-parent .. */
@@ -219,6 +233,7 @@ int ptrace_detach(struct task_struct *child, unsigned int data)
219 ptrace_disable(child); 233 ptrace_disable(child);
220 234
221 write_lock_irq(&tasklist_lock); 235 write_lock_irq(&tasklist_lock);
236 /* protect against de_thread()->release_task() */
222 if (child->ptrace) 237 if (child->ptrace)
223 __ptrace_detach(child, data); 238 __ptrace_detach(child, data);
224 write_unlock_irq(&tasklist_lock); 239 write_unlock_irq(&tasklist_lock);
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 20e9710fc21c..f464f5ae3f11 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -182,6 +182,15 @@ long rcu_batches_completed(void)
182 return rcu_ctrlblk.completed; 182 return rcu_ctrlblk.completed;
183} 183}
184 184
185/*
186 * Return the number of RCU batches processed thus far. Useful
187 * for debug and statistics.
188 */
189long rcu_batches_completed_bh(void)
190{
191 return rcu_bh_ctrlblk.completed;
192}
193
185static void rcu_barrier_callback(struct rcu_head *notused) 194static void rcu_barrier_callback(struct rcu_head *notused)
186{ 195{
187 if (atomic_dec_and_test(&rcu_barrier_cpu_count)) 196 if (atomic_dec_and_test(&rcu_barrier_cpu_count))
@@ -539,7 +548,7 @@ static void __devinit rcu_online_cpu(int cpu)
539 tasklet_init(&per_cpu(rcu_tasklet, cpu), rcu_process_callbacks, 0UL); 548 tasklet_init(&per_cpu(rcu_tasklet, cpu), rcu_process_callbacks, 0UL);
540} 549}
541 550
542static int rcu_cpu_notify(struct notifier_block *self, 551static int __devinit rcu_cpu_notify(struct notifier_block *self,
543 unsigned long action, void *hcpu) 552 unsigned long action, void *hcpu)
544{ 553{
545 long cpu = (long)hcpu; 554 long cpu = (long)hcpu;
@@ -556,7 +565,7 @@ static int rcu_cpu_notify(struct notifier_block *self,
556 return NOTIFY_OK; 565 return NOTIFY_OK;
557} 566}
558 567
559static struct notifier_block rcu_nb = { 568static struct notifier_block __devinitdata rcu_nb = {
560 .notifier_call = rcu_cpu_notify, 569 .notifier_call = rcu_cpu_notify,
561}; 570};
562 571
@@ -619,6 +628,7 @@ module_param(qlowmark, int, 0);
619module_param(rsinterval, int, 0); 628module_param(rsinterval, int, 0);
620#endif 629#endif
621EXPORT_SYMBOL_GPL(rcu_batches_completed); 630EXPORT_SYMBOL_GPL(rcu_batches_completed);
631EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
622EXPORT_SYMBOL_GPL(call_rcu); 632EXPORT_SYMBOL_GPL(call_rcu);
623EXPORT_SYMBOL_GPL(call_rcu_bh); 633EXPORT_SYMBOL_GPL(call_rcu_bh);
624EXPORT_SYMBOL_GPL(synchronize_rcu); 634EXPORT_SYMBOL_GPL(synchronize_rcu);
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 8154e7589d12..4d1c3d247127 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Read-Copy Update /proc-based torture test facility 2 * Read-Copy Update module-based torture test facility
3 * 3 *
4 * This program is free software; you can redistribute it and/or modify 4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by 5 * it under the terms of the GNU General Public License as published by
@@ -53,6 +53,7 @@ static int stat_interval; /* Interval between stats, in seconds. */
53static int verbose; /* Print more debug info. */ 53static int verbose; /* Print more debug info. */
54static int test_no_idle_hz; /* Test RCU's support for tickless idle CPUs. */ 54static int test_no_idle_hz; /* Test RCU's support for tickless idle CPUs. */
55static int shuffle_interval = 5; /* Interval between shuffles (in sec)*/ 55static int shuffle_interval = 5; /* Interval between shuffles (in sec)*/
56static char *torture_type = "rcu"; /* What to torture. */
56 57
57module_param(nreaders, int, 0); 58module_param(nreaders, int, 0);
58MODULE_PARM_DESC(nreaders, "Number of RCU reader threads"); 59MODULE_PARM_DESC(nreaders, "Number of RCU reader threads");
@@ -64,13 +65,16 @@ module_param(test_no_idle_hz, bool, 0);
64MODULE_PARM_DESC(test_no_idle_hz, "Test support for tickless idle CPUs"); 65MODULE_PARM_DESC(test_no_idle_hz, "Test support for tickless idle CPUs");
65module_param(shuffle_interval, int, 0); 66module_param(shuffle_interval, int, 0);
66MODULE_PARM_DESC(shuffle_interval, "Number of seconds between shuffles"); 67MODULE_PARM_DESC(shuffle_interval, "Number of seconds between shuffles");
67#define TORTURE_FLAG "rcutorture: " 68module_param(torture_type, charp, 0);
69MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh)");
70
71#define TORTURE_FLAG "-torture:"
68#define PRINTK_STRING(s) \ 72#define PRINTK_STRING(s) \
69 do { printk(KERN_ALERT TORTURE_FLAG s "\n"); } while (0) 73 do { printk(KERN_ALERT "%s" TORTURE_FLAG s "\n", torture_type); } while (0)
70#define VERBOSE_PRINTK_STRING(s) \ 74#define VERBOSE_PRINTK_STRING(s) \
71 do { if (verbose) printk(KERN_ALERT TORTURE_FLAG s "\n"); } while (0) 75 do { if (verbose) printk(KERN_ALERT "%s" TORTURE_FLAG s "\n", torture_type); } while (0)
72#define VERBOSE_PRINTK_ERRSTRING(s) \ 76#define VERBOSE_PRINTK_ERRSTRING(s) \
73 do { if (verbose) printk(KERN_ALERT TORTURE_FLAG "!!! " s "\n"); } while (0) 77 do { if (verbose) printk(KERN_ALERT "%s" TORTURE_FLAG "!!! " s "\n", torture_type); } while (0)
74 78
75static char printk_buf[4096]; 79static char printk_buf[4096];
76 80
@@ -139,28 +143,6 @@ rcu_torture_free(struct rcu_torture *p)
139 spin_unlock_bh(&rcu_torture_lock); 143 spin_unlock_bh(&rcu_torture_lock);
140} 144}
141 145
142static void
143rcu_torture_cb(struct rcu_head *p)
144{
145 int i;
146 struct rcu_torture *rp = container_of(p, struct rcu_torture, rtort_rcu);
147
148 if (fullstop) {
149 /* Test is ending, just drop callbacks on the floor. */
150 /* The next initialization will pick up the pieces. */
151 return;
152 }
153 i = rp->rtort_pipe_count;
154 if (i > RCU_TORTURE_PIPE_LEN)
155 i = RCU_TORTURE_PIPE_LEN;
156 atomic_inc(&rcu_torture_wcount[i]);
157 if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) {
158 rp->rtort_mbtest = 0;
159 rcu_torture_free(rp);
160 } else
161 call_rcu(p, rcu_torture_cb);
162}
163
164struct rcu_random_state { 146struct rcu_random_state {
165 unsigned long rrs_state; 147 unsigned long rrs_state;
166 unsigned long rrs_count; 148 unsigned long rrs_count;
@@ -191,6 +173,119 @@ rcu_random(struct rcu_random_state *rrsp)
191} 173}
192 174
193/* 175/*
176 * Operations vector for selecting different types of tests.
177 */
178
179struct rcu_torture_ops {
180 void (*init)(void);
181 void (*cleanup)(void);
182 int (*readlock)(void);
183 void (*readunlock)(int idx);
184 int (*completed)(void);
185 void (*deferredfree)(struct rcu_torture *p);
186 int (*stats)(char *page);
187 char *name;
188};
189static struct rcu_torture_ops *cur_ops = NULL;
190
191/*
192 * Definitions for rcu torture testing.
193 */
194
195static int rcu_torture_read_lock(void)
196{
197 rcu_read_lock();
198 return 0;
199}
200
201static void rcu_torture_read_unlock(int idx)
202{
203 rcu_read_unlock();
204}
205
206static int rcu_torture_completed(void)
207{
208 return rcu_batches_completed();
209}
210
211static void
212rcu_torture_cb(struct rcu_head *p)
213{
214 int i;
215 struct rcu_torture *rp = container_of(p, struct rcu_torture, rtort_rcu);
216
217 if (fullstop) {
218 /* Test is ending, just drop callbacks on the floor. */
219 /* The next initialization will pick up the pieces. */
220 return;
221 }
222 i = rp->rtort_pipe_count;
223 if (i > RCU_TORTURE_PIPE_LEN)
224 i = RCU_TORTURE_PIPE_LEN;
225 atomic_inc(&rcu_torture_wcount[i]);
226 if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) {
227 rp->rtort_mbtest = 0;
228 rcu_torture_free(rp);
229 } else
230 cur_ops->deferredfree(rp);
231}
232
233static void rcu_torture_deferred_free(struct rcu_torture *p)
234{
235 call_rcu(&p->rtort_rcu, rcu_torture_cb);
236}
237
238static struct rcu_torture_ops rcu_ops = {
239 .init = NULL,
240 .cleanup = NULL,
241 .readlock = rcu_torture_read_lock,
242 .readunlock = rcu_torture_read_unlock,
243 .completed = rcu_torture_completed,
244 .deferredfree = rcu_torture_deferred_free,
245 .stats = NULL,
246 .name = "rcu"
247};
248
249/*
250 * Definitions for rcu_bh torture testing.
251 */
252
253static int rcu_bh_torture_read_lock(void)
254{
255 rcu_read_lock_bh();
256 return 0;
257}
258
259static void rcu_bh_torture_read_unlock(int idx)
260{
261 rcu_read_unlock_bh();
262}
263
264static int rcu_bh_torture_completed(void)
265{
266 return rcu_batches_completed_bh();
267}
268
269static void rcu_bh_torture_deferred_free(struct rcu_torture *p)
270{
271 call_rcu_bh(&p->rtort_rcu, rcu_torture_cb);
272}
273
274static struct rcu_torture_ops rcu_bh_ops = {
275 .init = NULL,
276 .cleanup = NULL,
277 .readlock = rcu_bh_torture_read_lock,
278 .readunlock = rcu_bh_torture_read_unlock,
279 .completed = rcu_bh_torture_completed,
280 .deferredfree = rcu_bh_torture_deferred_free,
281 .stats = NULL,
282 .name = "rcu_bh"
283};
284
285static struct rcu_torture_ops *torture_ops[] =
286 { &rcu_ops, &rcu_bh_ops, NULL };
287
288/*
194 * RCU torture writer kthread. Repeatedly substitutes a new structure 289 * RCU torture writer kthread. Repeatedly substitutes a new structure
195 * for that pointed to by rcu_torture_current, freeing the old structure 290 * for that pointed to by rcu_torture_current, freeing the old structure
196 * after a series of grace periods (the "pipeline"). 291 * after a series of grace periods (the "pipeline").
@@ -209,8 +304,6 @@ rcu_torture_writer(void *arg)
209 304
210 do { 305 do {
211 schedule_timeout_uninterruptible(1); 306 schedule_timeout_uninterruptible(1);
212 if (rcu_batches_completed() == oldbatch)
213 continue;
214 if ((rp = rcu_torture_alloc()) == NULL) 307 if ((rp = rcu_torture_alloc()) == NULL)
215 continue; 308 continue;
216 rp->rtort_pipe_count = 0; 309 rp->rtort_pipe_count = 0;
@@ -225,10 +318,10 @@ rcu_torture_writer(void *arg)
225 i = RCU_TORTURE_PIPE_LEN; 318 i = RCU_TORTURE_PIPE_LEN;
226 atomic_inc(&rcu_torture_wcount[i]); 319 atomic_inc(&rcu_torture_wcount[i]);
227 old_rp->rtort_pipe_count++; 320 old_rp->rtort_pipe_count++;
228 call_rcu(&old_rp->rtort_rcu, rcu_torture_cb); 321 cur_ops->deferredfree(old_rp);
229 } 322 }
230 rcu_torture_current_version++; 323 rcu_torture_current_version++;
231 oldbatch = rcu_batches_completed(); 324 oldbatch = cur_ops->completed();
232 } while (!kthread_should_stop() && !fullstop); 325 } while (!kthread_should_stop() && !fullstop);
233 VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping"); 326 VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping");
234 while (!kthread_should_stop()) 327 while (!kthread_should_stop())
@@ -246,6 +339,7 @@ static int
246rcu_torture_reader(void *arg) 339rcu_torture_reader(void *arg)
247{ 340{
248 int completed; 341 int completed;
342 int idx;
249 DEFINE_RCU_RANDOM(rand); 343 DEFINE_RCU_RANDOM(rand);
250 struct rcu_torture *p; 344 struct rcu_torture *p;
251 int pipe_count; 345 int pipe_count;
@@ -254,12 +348,12 @@ rcu_torture_reader(void *arg)
254 set_user_nice(current, 19); 348 set_user_nice(current, 19);
255 349
256 do { 350 do {
257 rcu_read_lock(); 351 idx = cur_ops->readlock();
258 completed = rcu_batches_completed(); 352 completed = cur_ops->completed();
259 p = rcu_dereference(rcu_torture_current); 353 p = rcu_dereference(rcu_torture_current);
260 if (p == NULL) { 354 if (p == NULL) {
261 /* Wait for rcu_torture_writer to get underway */ 355 /* Wait for rcu_torture_writer to get underway */
262 rcu_read_unlock(); 356 cur_ops->readunlock(idx);
263 schedule_timeout_interruptible(HZ); 357 schedule_timeout_interruptible(HZ);
264 continue; 358 continue;
265 } 359 }
@@ -273,14 +367,14 @@ rcu_torture_reader(void *arg)
273 pipe_count = RCU_TORTURE_PIPE_LEN; 367 pipe_count = RCU_TORTURE_PIPE_LEN;
274 } 368 }
275 ++__get_cpu_var(rcu_torture_count)[pipe_count]; 369 ++__get_cpu_var(rcu_torture_count)[pipe_count];
276 completed = rcu_batches_completed() - completed; 370 completed = cur_ops->completed() - completed;
277 if (completed > RCU_TORTURE_PIPE_LEN) { 371 if (completed > RCU_TORTURE_PIPE_LEN) {
278 /* Should not happen, but... */ 372 /* Should not happen, but... */
279 completed = RCU_TORTURE_PIPE_LEN; 373 completed = RCU_TORTURE_PIPE_LEN;
280 } 374 }
281 ++__get_cpu_var(rcu_torture_batch)[completed]; 375 ++__get_cpu_var(rcu_torture_batch)[completed];
282 preempt_enable(); 376 preempt_enable();
283 rcu_read_unlock(); 377 cur_ops->readunlock(idx);
284 schedule(); 378 schedule();
285 } while (!kthread_should_stop() && !fullstop); 379 } while (!kthread_should_stop() && !fullstop);
286 VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping"); 380 VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping");
@@ -311,7 +405,7 @@ rcu_torture_printk(char *page)
311 if (pipesummary[i] != 0) 405 if (pipesummary[i] != 0)
312 break; 406 break;
313 } 407 }
314 cnt += sprintf(&page[cnt], "rcutorture: "); 408 cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG);
315 cnt += sprintf(&page[cnt], 409 cnt += sprintf(&page[cnt],
316 "rtc: %p ver: %ld tfle: %d rta: %d rtaf: %d rtf: %d " 410 "rtc: %p ver: %ld tfle: %d rta: %d rtaf: %d rtf: %d "
317 "rtmbe: %d", 411 "rtmbe: %d",
@@ -324,7 +418,7 @@ rcu_torture_printk(char *page)
324 atomic_read(&n_rcu_torture_mberror)); 418 atomic_read(&n_rcu_torture_mberror));
325 if (atomic_read(&n_rcu_torture_mberror) != 0) 419 if (atomic_read(&n_rcu_torture_mberror) != 0)
326 cnt += sprintf(&page[cnt], " !!!"); 420 cnt += sprintf(&page[cnt], " !!!");
327 cnt += sprintf(&page[cnt], "\nrcutorture: "); 421 cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG);
328 if (i > 1) { 422 if (i > 1) {
329 cnt += sprintf(&page[cnt], "!!! "); 423 cnt += sprintf(&page[cnt], "!!! ");
330 atomic_inc(&n_rcu_torture_error); 424 atomic_inc(&n_rcu_torture_error);
@@ -332,17 +426,19 @@ rcu_torture_printk(char *page)
332 cnt += sprintf(&page[cnt], "Reader Pipe: "); 426 cnt += sprintf(&page[cnt], "Reader Pipe: ");
333 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) 427 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++)
334 cnt += sprintf(&page[cnt], " %ld", pipesummary[i]); 428 cnt += sprintf(&page[cnt], " %ld", pipesummary[i]);
335 cnt += sprintf(&page[cnt], "\nrcutorture: "); 429 cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG);
336 cnt += sprintf(&page[cnt], "Reader Batch: "); 430 cnt += sprintf(&page[cnt], "Reader Batch: ");
337 for (i = 0; i < RCU_TORTURE_PIPE_LEN; i++) 431 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++)
338 cnt += sprintf(&page[cnt], " %ld", batchsummary[i]); 432 cnt += sprintf(&page[cnt], " %ld", batchsummary[i]);
339 cnt += sprintf(&page[cnt], "\nrcutorture: "); 433 cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG);
340 cnt += sprintf(&page[cnt], "Free-Block Circulation: "); 434 cnt += sprintf(&page[cnt], "Free-Block Circulation: ");
341 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { 435 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) {
342 cnt += sprintf(&page[cnt], " %d", 436 cnt += sprintf(&page[cnt], " %d",
343 atomic_read(&rcu_torture_wcount[i])); 437 atomic_read(&rcu_torture_wcount[i]));
344 } 438 }
345 cnt += sprintf(&page[cnt], "\n"); 439 cnt += sprintf(&page[cnt], "\n");
440 if (cur_ops->stats != NULL)
441 cnt += cur_ops->stats(&page[cnt]);
346 return cnt; 442 return cnt;
347} 443}
348 444
@@ -444,11 +540,11 @@ rcu_torture_shuffle(void *arg)
444static inline void 540static inline void
445rcu_torture_print_module_parms(char *tag) 541rcu_torture_print_module_parms(char *tag)
446{ 542{
447 printk(KERN_ALERT TORTURE_FLAG "--- %s: nreaders=%d " 543 printk(KERN_ALERT "%s" TORTURE_FLAG "--- %s: nreaders=%d "
448 "stat_interval=%d verbose=%d test_no_idle_hz=%d " 544 "stat_interval=%d verbose=%d test_no_idle_hz=%d "
449 "shuffle_interval = %d\n", 545 "shuffle_interval = %d\n",
450 tag, nrealreaders, stat_interval, verbose, test_no_idle_hz, 546 torture_type, tag, nrealreaders, stat_interval, verbose,
451 shuffle_interval); 547 test_no_idle_hz, shuffle_interval);
452} 548}
453 549
454static void 550static void
@@ -493,6 +589,9 @@ rcu_torture_cleanup(void)
493 rcu_barrier(); 589 rcu_barrier();
494 590
495 rcu_torture_stats_print(); /* -After- the stats thread is stopped! */ 591 rcu_torture_stats_print(); /* -After- the stats thread is stopped! */
592
593 if (cur_ops->cleanup != NULL)
594 cur_ops->cleanup();
496 if (atomic_read(&n_rcu_torture_error)) 595 if (atomic_read(&n_rcu_torture_error))
497 rcu_torture_print_module_parms("End of test: FAILURE"); 596 rcu_torture_print_module_parms("End of test: FAILURE");
498 else 597 else
@@ -508,6 +607,20 @@ rcu_torture_init(void)
508 607
509 /* Process args and tell the world that the torturer is on the job. */ 608 /* Process args and tell the world that the torturer is on the job. */
510 609
610 for (i = 0; cur_ops = torture_ops[i], cur_ops != NULL; i++) {
611 cur_ops = torture_ops[i];
612 if (strcmp(torture_type, cur_ops->name) == 0) {
613 break;
614 }
615 }
616 if (cur_ops == NULL) {
617 printk(KERN_ALERT "rcutorture: invalid torture type: \"%s\"\n",
618 torture_type);
619 return (-EINVAL);
620 }
621 if (cur_ops->init != NULL)
622 cur_ops->init(); /* no "goto unwind" prior to this point!!! */
623
511 if (nreaders >= 0) 624 if (nreaders >= 0)
512 nrealreaders = nreaders; 625 nrealreaders = nreaders;
513 else 626 else
diff --git a/kernel/resource.c b/kernel/resource.c
index e3080fcc66a3..2404f9b0bc47 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -232,6 +232,44 @@ int release_resource(struct resource *old)
232 232
233EXPORT_SYMBOL(release_resource); 233EXPORT_SYMBOL(release_resource);
234 234
235#ifdef CONFIG_MEMORY_HOTPLUG
236/*
237 * Finds the lowest memory resource that exists within [res->start, res->end).
238 * The caller must specify res->start, res->end, res->flags.
239 * If found, returns 0 and res is overwritten; if not found, returns -1.
240 */
241int find_next_system_ram(struct resource *res)
242{
243 resource_size_t start, end;
244 struct resource *p;
245
246 BUG_ON(!res);
247
248 start = res->start;
249 end = res->end;
250
251 read_lock(&resource_lock);
252 for (p = iomem_resource.child; p ; p = p->sibling) {
253 /* system ram is just marked as IORESOURCE_MEM */
254 if (p->flags != res->flags)
255 continue;
256 if (p->start > end) {
257 p = NULL;
258 break;
259 }
260 if (p->start >= start)
261 break;
262 }
263 read_unlock(&resource_lock);
264 if (!p)
265 return -1;
266 /* copy data */
267 res->start = p->start;
268 res->end = p->end;
269 return 0;
270}
271#endif
272
235/* 273/*
236 * Find empty slot in the resource tree given range and alignment. 274 * Find empty slot in the resource tree given range and alignment.
237 */ 275 */
diff --git a/kernel/rtmutex-debug.c b/kernel/rtmutex-debug.c
new file mode 100644
index 000000000000..4aa8a2c9f453
--- /dev/null
+++ b/kernel/rtmutex-debug.c
@@ -0,0 +1,513 @@
1/*
2 * RT-Mutexes: blocking mutual exclusion locks with PI support
3 *
4 * started by Ingo Molnar and Thomas Gleixner:
5 *
6 * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
7 * Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
8 *
9 * This code is based on the rt.c implementation in the preempt-rt tree.
10 * Portions of said code are
11 *
12 * Copyright (C) 2004 LynuxWorks, Inc., Igor Manyilov, Bill Huey
13 * Copyright (C) 2006 Esben Nielsen
14 * Copyright (C) 2006 Kihon Technologies Inc.,
15 * Steven Rostedt <rostedt@goodmis.org>
16 *
17 * See rt.c in preempt-rt for proper credits and further information
18 */
19#include <linux/config.h>
20#include <linux/sched.h>
21#include <linux/delay.h>
22#include <linux/module.h>
23#include <linux/spinlock.h>
24#include <linux/kallsyms.h>
25#include <linux/syscalls.h>
26#include <linux/interrupt.h>
27#include <linux/plist.h>
28#include <linux/fs.h>
29
30#include "rtmutex_common.h"
31
32#ifdef CONFIG_DEBUG_RT_MUTEXES
33# include "rtmutex-debug.h"
34#else
35# include "rtmutex.h"
36#endif
37
38# define TRACE_WARN_ON(x) WARN_ON(x)
39# define TRACE_BUG_ON(x) BUG_ON(x)
40
41# define TRACE_OFF() \
42do { \
43 if (rt_trace_on) { \
44 rt_trace_on = 0; \
45 console_verbose(); \
46 if (spin_is_locked(&current->pi_lock)) \
47 spin_unlock(&current->pi_lock); \
48 if (spin_is_locked(&current->held_list_lock)) \
49 spin_unlock(&current->held_list_lock); \
50 } \
51} while (0)
52
53# define TRACE_OFF_NOLOCK() \
54do { \
55 if (rt_trace_on) { \
56 rt_trace_on = 0; \
57 console_verbose(); \
58 } \
59} while (0)
60
61# define TRACE_BUG_LOCKED() \
62do { \
63 TRACE_OFF(); \
64 BUG(); \
65} while (0)
66
67# define TRACE_WARN_ON_LOCKED(c) \
68do { \
69 if (unlikely(c)) { \
70 TRACE_OFF(); \
71 WARN_ON(1); \
72 } \
73} while (0)
74
75# define TRACE_BUG_ON_LOCKED(c) \
76do { \
77 if (unlikely(c)) \
78 TRACE_BUG_LOCKED(); \
79} while (0)
80
81#ifdef CONFIG_SMP
82# define SMP_TRACE_BUG_ON_LOCKED(c) TRACE_BUG_ON_LOCKED(c)
83#else
84# define SMP_TRACE_BUG_ON_LOCKED(c) do { } while (0)
85#endif
86
87/*
88 * deadlock detection flag. We turn it off when we detect
89 * the first problem because we dont want to recurse back
90 * into the tracing code when doing error printk or
91 * executing a BUG():
92 */
93int rt_trace_on = 1;
94
95void deadlock_trace_off(void)
96{
97 rt_trace_on = 0;
98}
99
100static void printk_task(task_t *p)
101{
102 if (p)
103 printk("%16s:%5d [%p, %3d]", p->comm, p->pid, p, p->prio);
104 else
105 printk("<none>");
106}
107
108static void printk_task_short(task_t *p)
109{
110 if (p)
111 printk("%s/%d [%p, %3d]", p->comm, p->pid, p, p->prio);
112 else
113 printk("<none>");
114}
115
116static void printk_lock(struct rt_mutex *lock, int print_owner)
117{
118 if (lock->name)
119 printk(" [%p] {%s}\n",
120 lock, lock->name);
121 else
122 printk(" [%p] {%s:%d}\n",
123 lock, lock->file, lock->line);
124
125 if (print_owner && rt_mutex_owner(lock)) {
126 printk(".. ->owner: %p\n", lock->owner);
127 printk(".. held by: ");
128 printk_task(rt_mutex_owner(lock));
129 printk("\n");
130 }
131 if (rt_mutex_owner(lock)) {
132 printk("... acquired at: ");
133 print_symbol("%s\n", lock->acquire_ip);
134 }
135}
136
137static void printk_waiter(struct rt_mutex_waiter *w)
138{
139 printk("-------------------------\n");
140 printk("| waiter struct %p:\n", w);
141 printk("| w->list_entry: [DP:%p/%p|SP:%p/%p|PRI:%d]\n",
142 w->list_entry.plist.prio_list.prev, w->list_entry.plist.prio_list.next,
143 w->list_entry.plist.node_list.prev, w->list_entry.plist.node_list.next,
144 w->list_entry.prio);
145 printk("| w->pi_list_entry: [DP:%p/%p|SP:%p/%p|PRI:%d]\n",
146 w->pi_list_entry.plist.prio_list.prev, w->pi_list_entry.plist.prio_list.next,
147 w->pi_list_entry.plist.node_list.prev, w->pi_list_entry.plist.node_list.next,
148 w->pi_list_entry.prio);
149 printk("\n| lock:\n");
150 printk_lock(w->lock, 1);
151 printk("| w->ti->task:\n");
152 printk_task(w->task);
153 printk("| blocked at: ");
154 print_symbol("%s\n", w->ip);
155 printk("-------------------------\n");
156}
157
158static void show_task_locks(task_t *p)
159{
160 switch (p->state) {
161 case TASK_RUNNING: printk("R"); break;
162 case TASK_INTERRUPTIBLE: printk("S"); break;
163 case TASK_UNINTERRUPTIBLE: printk("D"); break;
164 case TASK_STOPPED: printk("T"); break;
165 case EXIT_ZOMBIE: printk("Z"); break;
166 case EXIT_DEAD: printk("X"); break;
167 default: printk("?"); break;
168 }
169 printk_task(p);
170 if (p->pi_blocked_on) {
171 struct rt_mutex *lock = p->pi_blocked_on->lock;
172
173 printk(" blocked on:");
174 printk_lock(lock, 1);
175 } else
176 printk(" (not blocked)\n");
177}
178
179void rt_mutex_show_held_locks(task_t *task, int verbose)
180{
181 struct list_head *curr, *cursor = NULL;
182 struct rt_mutex *lock;
183 task_t *t;
184 unsigned long flags;
185 int count = 0;
186
187 if (!rt_trace_on)
188 return;
189
190 if (verbose) {
191 printk("------------------------------\n");
192 printk("| showing all locks held by: | (");
193 printk_task_short(task);
194 printk("):\n");
195 printk("------------------------------\n");
196 }
197
198next:
199 spin_lock_irqsave(&task->held_list_lock, flags);
200 list_for_each(curr, &task->held_list_head) {
201 if (cursor && curr != cursor)
202 continue;
203 lock = list_entry(curr, struct rt_mutex, held_list_entry);
204 t = rt_mutex_owner(lock);
205 WARN_ON(t != task);
206 count++;
207 cursor = curr->next;
208 spin_unlock_irqrestore(&task->held_list_lock, flags);
209
210 printk("\n#%03d: ", count);
211 printk_lock(lock, 0);
212 goto next;
213 }
214 spin_unlock_irqrestore(&task->held_list_lock, flags);
215
216 printk("\n");
217}
218
219void rt_mutex_show_all_locks(void)
220{
221 task_t *g, *p;
222 int count = 10;
223 int unlock = 1;
224
225 printk("\n");
226 printk("----------------------\n");
227 printk("| showing all tasks: |\n");
228 printk("----------------------\n");
229
230 /*
231 * Here we try to get the tasklist_lock as hard as possible,
232 * if not successful after 2 seconds we ignore it (but keep
233 * trying). This is to enable a debug printout even if a
234 * tasklist_lock-holding task deadlocks or crashes.
235 */
236retry:
237 if (!read_trylock(&tasklist_lock)) {
238 if (count == 10)
239 printk("hm, tasklist_lock locked, retrying... ");
240 if (count) {
241 count--;
242 printk(" #%d", 10-count);
243 mdelay(200);
244 goto retry;
245 }
246 printk(" ignoring it.\n");
247 unlock = 0;
248 }
249 if (count != 10)
250 printk(" locked it.\n");
251
252 do_each_thread(g, p) {
253 show_task_locks(p);
254 if (!unlock)
255 if (read_trylock(&tasklist_lock))
256 unlock = 1;
257 } while_each_thread(g, p);
258
259 printk("\n");
260
261 printk("-----------------------------------------\n");
262 printk("| showing all locks held in the system: |\n");
263 printk("-----------------------------------------\n");
264
265 do_each_thread(g, p) {
266 rt_mutex_show_held_locks(p, 0);
267 if (!unlock)
268 if (read_trylock(&tasklist_lock))
269 unlock = 1;
270 } while_each_thread(g, p);
271
272
273 printk("=============================================\n\n");
274
275 if (unlock)
276 read_unlock(&tasklist_lock);
277}
278
279void rt_mutex_debug_check_no_locks_held(task_t *task)
280{
281 struct rt_mutex_waiter *w;
282 struct list_head *curr;
283 struct rt_mutex *lock;
284
285 if (!rt_trace_on)
286 return;
287 if (!rt_prio(task->normal_prio) && rt_prio(task->prio)) {
288 printk("BUG: PI priority boost leaked!\n");
289 printk_task(task);
290 printk("\n");
291 }
292 if (list_empty(&task->held_list_head))
293 return;
294
295 spin_lock(&task->pi_lock);
296 plist_for_each_entry(w, &task->pi_waiters, pi_list_entry) {
297 TRACE_OFF();
298
299 printk("hm, PI interest held at exit time? Task:\n");
300 printk_task(task);
301 printk_waiter(w);
302 return;
303 }
304 spin_unlock(&task->pi_lock);
305
306 list_for_each(curr, &task->held_list_head) {
307 lock = list_entry(curr, struct rt_mutex, held_list_entry);
308
309 printk("BUG: %s/%d, lock held at task exit time!\n",
310 task->comm, task->pid);
311 printk_lock(lock, 1);
312 if (rt_mutex_owner(lock) != task)
313 printk("exiting task is not even the owner??\n");
314 }
315}
316
317int rt_mutex_debug_check_no_locks_freed(const void *from, unsigned long len)
318{
319 const void *to = from + len;
320 struct list_head *curr;
321 struct rt_mutex *lock;
322 unsigned long flags;
323 void *lock_addr;
324
325 if (!rt_trace_on)
326 return 0;
327
328 spin_lock_irqsave(&current->held_list_lock, flags);
329 list_for_each(curr, &current->held_list_head) {
330 lock = list_entry(curr, struct rt_mutex, held_list_entry);
331 lock_addr = lock;
332 if (lock_addr < from || lock_addr >= to)
333 continue;
334 TRACE_OFF();
335
336 printk("BUG: %s/%d, active lock [%p(%p-%p)] freed!\n",
337 current->comm, current->pid, lock, from, to);
338 dump_stack();
339 printk_lock(lock, 1);
340 if (rt_mutex_owner(lock) != current)
341 printk("freeing task is not even the owner??\n");
342 return 1;
343 }
344 spin_unlock_irqrestore(&current->held_list_lock, flags);
345
346 return 0;
347}
348
349void rt_mutex_debug_task_free(struct task_struct *task)
350{
351 WARN_ON(!plist_head_empty(&task->pi_waiters));
352 WARN_ON(task->pi_blocked_on);
353}
354
355/*
356 * We fill out the fields in the waiter to store the information about
357 * the deadlock. We print when we return. act_waiter can be NULL in
358 * case of a remove waiter operation.
359 */
360void debug_rt_mutex_deadlock(int detect, struct rt_mutex_waiter *act_waiter,
361 struct rt_mutex *lock)
362{
363 struct task_struct *task;
364
365 if (!rt_trace_on || detect || !act_waiter)
366 return;
367
368 task = rt_mutex_owner(act_waiter->lock);
369 if (task && task != current) {
370 act_waiter->deadlock_task_pid = task->pid;
371 act_waiter->deadlock_lock = lock;
372 }
373}
374
375void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter)
376{
377 struct task_struct *task;
378
379 if (!waiter->deadlock_lock || !rt_trace_on)
380 return;
381
382 task = find_task_by_pid(waiter->deadlock_task_pid);
383 if (!task)
384 return;
385
386 TRACE_OFF_NOLOCK();
387
388 printk("\n============================================\n");
389 printk( "[ BUG: circular locking deadlock detected! ]\n");
390 printk( "--------------------------------------------\n");
391 printk("%s/%d is deadlocking current task %s/%d\n\n",
392 task->comm, task->pid, current->comm, current->pid);
393
394 printk("\n1) %s/%d is trying to acquire this lock:\n",
395 current->comm, current->pid);
396 printk_lock(waiter->lock, 1);
397
398 printk("... trying at: ");
399 print_symbol("%s\n", waiter->ip);
400
401 printk("\n2) %s/%d is blocked on this lock:\n", task->comm, task->pid);
402 printk_lock(waiter->deadlock_lock, 1);
403
404 rt_mutex_show_held_locks(current, 1);
405 rt_mutex_show_held_locks(task, 1);
406
407 printk("\n%s/%d's [blocked] stackdump:\n\n", task->comm, task->pid);
408 show_stack(task, NULL);
409 printk("\n%s/%d's [current] stackdump:\n\n",
410 current->comm, current->pid);
411 dump_stack();
412 rt_mutex_show_all_locks();
413 printk("[ turning off deadlock detection."
414 "Please report this trace. ]\n\n");
415 local_irq_disable();
416}
417
418void debug_rt_mutex_lock(struct rt_mutex *lock __IP_DECL__)
419{
420 unsigned long flags;
421
422 if (rt_trace_on) {
423 TRACE_WARN_ON_LOCKED(!list_empty(&lock->held_list_entry));
424
425 spin_lock_irqsave(&current->held_list_lock, flags);
426 list_add_tail(&lock->held_list_entry, &current->held_list_head);
427 spin_unlock_irqrestore(&current->held_list_lock, flags);
428
429 lock->acquire_ip = ip;
430 }
431}
432
433void debug_rt_mutex_unlock(struct rt_mutex *lock)
434{
435 unsigned long flags;
436
437 if (rt_trace_on) {
438 TRACE_WARN_ON_LOCKED(rt_mutex_owner(lock) != current);
439 TRACE_WARN_ON_LOCKED(list_empty(&lock->held_list_entry));
440
441 spin_lock_irqsave(&current->held_list_lock, flags);
442 list_del_init(&lock->held_list_entry);
443 spin_unlock_irqrestore(&current->held_list_lock, flags);
444 }
445}
446
447void debug_rt_mutex_proxy_lock(struct rt_mutex *lock,
448 struct task_struct *powner __IP_DECL__)
449{
450 unsigned long flags;
451
452 if (rt_trace_on) {
453 TRACE_WARN_ON_LOCKED(!list_empty(&lock->held_list_entry));
454
455 spin_lock_irqsave(&powner->held_list_lock, flags);
456 list_add_tail(&lock->held_list_entry, &powner->held_list_head);
457 spin_unlock_irqrestore(&powner->held_list_lock, flags);
458
459 lock->acquire_ip = ip;
460 }
461}
462
463void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock)
464{
465 unsigned long flags;
466
467 if (rt_trace_on) {
468 struct task_struct *owner = rt_mutex_owner(lock);
469
470 TRACE_WARN_ON_LOCKED(!owner);
471 TRACE_WARN_ON_LOCKED(list_empty(&lock->held_list_entry));
472
473 spin_lock_irqsave(&owner->held_list_lock, flags);
474 list_del_init(&lock->held_list_entry);
475 spin_unlock_irqrestore(&owner->held_list_lock, flags);
476 }
477}
478
479void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter)
480{
481 memset(waiter, 0x11, sizeof(*waiter));
482 plist_node_init(&waiter->list_entry, MAX_PRIO);
483 plist_node_init(&waiter->pi_list_entry, MAX_PRIO);
484}
485
486void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter)
487{
488 TRACE_WARN_ON(!plist_node_empty(&waiter->list_entry));
489 TRACE_WARN_ON(!plist_node_empty(&waiter->pi_list_entry));
490 TRACE_WARN_ON(waiter->task);
491 memset(waiter, 0x22, sizeof(*waiter));
492}
493
494void debug_rt_mutex_init(struct rt_mutex *lock, const char *name)
495{
496 void *addr = lock;
497
498 if (rt_trace_on) {
499 rt_mutex_debug_check_no_locks_freed(addr,
500 sizeof(struct rt_mutex));
501 INIT_LIST_HEAD(&lock->held_list_entry);
502 lock->name = name;
503 }
504}
505
506void rt_mutex_deadlock_account_lock(struct rt_mutex *lock, task_t *task)
507{
508}
509
510void rt_mutex_deadlock_account_unlock(struct task_struct *task)
511{
512}
513
diff --git a/kernel/rtmutex-debug.h b/kernel/rtmutex-debug.h
new file mode 100644
index 000000000000..7612fbc62d70
--- /dev/null
+++ b/kernel/rtmutex-debug.h
@@ -0,0 +1,37 @@
1/*
2 * RT-Mutexes: blocking mutual exclusion locks with PI support
3 *
4 * started by Ingo Molnar and Thomas Gleixner:
5 *
6 * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
7 * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
8 *
9 * This file contains macros used solely by rtmutex.c. Debug version.
10 */
11
12#define __IP_DECL__ , unsigned long ip
13#define __IP__ , ip
14#define __RET_IP__ , (unsigned long)__builtin_return_address(0)
15
16extern void
17rt_mutex_deadlock_account_lock(struct rt_mutex *lock, struct task_struct *task);
18extern void rt_mutex_deadlock_account_unlock(struct task_struct *task);
19extern void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter);
20extern void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter);
21extern void debug_rt_mutex_init(struct rt_mutex *lock, const char *name);
22extern void debug_rt_mutex_lock(struct rt_mutex *lock __IP_DECL__);
23extern void debug_rt_mutex_unlock(struct rt_mutex *lock);
24extern void debug_rt_mutex_proxy_lock(struct rt_mutex *lock,
25 struct task_struct *powner __IP_DECL__);
26extern void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock);
27extern void debug_rt_mutex_deadlock(int detect, struct rt_mutex_waiter *waiter,
28 struct rt_mutex *lock);
29extern void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter);
30# define debug_rt_mutex_reset_waiter(w) \
31 do { (w)->deadlock_lock = NULL; } while (0)
32
33static inline int debug_rt_mutex_detect_deadlock(struct rt_mutex_waiter *waiter,
34 int detect)
35{
36 return (waiter != NULL);
37}
diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c
new file mode 100644
index 000000000000..e82c2f848249
--- /dev/null
+++ b/kernel/rtmutex-tester.c
@@ -0,0 +1,440 @@
1/*
2 * RT-Mutex-tester: scriptable tester for rt mutexes
3 *
4 * started by Thomas Gleixner:
5 *
6 * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
7 *
8 */
9#include <linux/config.h>
10#include <linux/kthread.h>
11#include <linux/module.h>
12#include <linux/sched.h>
13#include <linux/smp_lock.h>
14#include <linux/spinlock.h>
15#include <linux/sysdev.h>
16#include <linux/timer.h>
17
18#include "rtmutex.h"
19
20#define MAX_RT_TEST_THREADS 8
21#define MAX_RT_TEST_MUTEXES 8
22
23static spinlock_t rttest_lock;
24static atomic_t rttest_event;
25
/* State of one tester thread; one instance per entry in threads[]. */
struct test_thread_data {
	/* Pending command (> 0); overwritten with handle_op()'s result. */
	int opcode;
	/* Argument of the pending command. */
	int opdata;
	/*
	 * Per-mutex progress marker as written by handle_op() and
	 * schedule_rt_mutex_test(): 0 = not held, 1 = lock requested,
	 * 2 = about to schedule while blocked, 3 = back from schedule,
	 * 4 = lock held.
	 */
	int mutexes[MAX_RT_TEST_MUTEXES];
	/* BKL progress marker: 0 = not held, 1 = requested, 4 = held. */
	int bkl;
	/* Value of rttest_event at this thread's latest recorded step. */
	int event;
	/* sysfs device; its id is used as the index into threads[]. */
	struct sys_device sysdev;
};
34
35static struct test_thread_data thread_data[MAX_RT_TEST_THREADS];
36static task_t *threads[MAX_RT_TEST_THREADS];
37static struct rt_mutex mutexes[MAX_RT_TEST_MUTEXES];
38
/*
 * Commands accepted through the per-thread "command" sysfs file, written
 * as "opcode:data".
 *
 * NOTE(review): sysfs_test_command() applies set_user_nice(current, 0) for
 * RTTEST_SCHEDOT and signals threads[tid] (the thread owning the sysfs
 * file) for RTTEST_SIGNAL; neither uses the data value as the comments
 * below suggest - confirm which is intended.
 */
enum test_opcodes {
	RTTEST_NOP = 0,
	RTTEST_SCHEDOT,		/* 1 Sched other, data = nice */
	RTTEST_SCHEDRT,		/* 2 Sched fifo, data = prio */
	RTTEST_LOCK,		/* 3 Lock uninterruptible, data = lockindex */
	RTTEST_LOCKNOWAIT,	/* 4 Lock uninterruptible no wait in wakeup, data = lockindex */
	RTTEST_LOCKINT,		/* 5 Lock interruptible, data = lockindex */
	RTTEST_LOCKINTNOWAIT,	/* 6 Lock interruptible no wait in wakeup, data = lockindex */
	RTTEST_LOCKCONT,	/* 7 Continue locking after the wakeup delay */
	RTTEST_UNLOCK,		/* 8 Unlock, data = lockindex */
	RTTEST_LOCKBKL,		/* 9 Lock BKL */
	RTTEST_UNLOCKBKL,	/* 10 Unlock BKL */
	RTTEST_SIGNAL,		/* 11 Signal other test thread, data = thread id */
	RTTEST_RESETEVENT = 98,	/* 98 Reset event counter */
	RTTEST_RESET = 99,	/* 99 Reset all pending operations */
};
55
56static int handle_op(struct test_thread_data *td, int lockwakeup)
57{
58 int i, id, ret = -EINVAL;
59
60 switch(td->opcode) {
61
62 case RTTEST_NOP:
63 return 0;
64
65 case RTTEST_LOCKCONT:
66 td->mutexes[td->opdata] = 1;
67 td->event = atomic_add_return(1, &rttest_event);
68 return 0;
69
70 case RTTEST_RESET:
71 for (i = 0; i < MAX_RT_TEST_MUTEXES; i++) {
72 if (td->mutexes[i] == 4) {
73 rt_mutex_unlock(&mutexes[i]);
74 td->mutexes[i] = 0;
75 }
76 }
77
78 if (!lockwakeup && td->bkl == 4) {
79 unlock_kernel();
80 td->bkl = 0;
81 }
82 return 0;
83
84 case RTTEST_RESETEVENT:
85 atomic_set(&rttest_event, 0);
86 return 0;
87
88 default:
89 if (lockwakeup)
90 return ret;
91 }
92
93 switch(td->opcode) {
94
95 case RTTEST_LOCK:
96 case RTTEST_LOCKNOWAIT:
97 id = td->opdata;
98 if (id < 0 || id >= MAX_RT_TEST_MUTEXES)
99 return ret;
100
101 td->mutexes[id] = 1;
102 td->event = atomic_add_return(1, &rttest_event);
103 rt_mutex_lock(&mutexes[id]);
104 td->event = atomic_add_return(1, &rttest_event);
105 td->mutexes[id] = 4;
106 return 0;
107
108 case RTTEST_LOCKINT:
109 case RTTEST_LOCKINTNOWAIT:
110 id = td->opdata;
111 if (id < 0 || id >= MAX_RT_TEST_MUTEXES)
112 return ret;
113
114 td->mutexes[id] = 1;
115 td->event = atomic_add_return(1, &rttest_event);
116 ret = rt_mutex_lock_interruptible(&mutexes[id], 0);
117 td->event = atomic_add_return(1, &rttest_event);
118 td->mutexes[id] = ret ? 0 : 4;
119 return ret ? -EINTR : 0;
120
121 case RTTEST_UNLOCK:
122 id = td->opdata;
123 if (id < 0 || id >= MAX_RT_TEST_MUTEXES || td->mutexes[id] != 4)
124 return ret;
125
126 td->event = atomic_add_return(1, &rttest_event);
127 rt_mutex_unlock(&mutexes[id]);
128 td->event = atomic_add_return(1, &rttest_event);
129 td->mutexes[id] = 0;
130 return 0;
131
132 case RTTEST_LOCKBKL:
133 if (td->bkl)
134 return 0;
135 td->bkl = 1;
136 lock_kernel();
137 td->bkl = 4;
138 return 0;
139
140 case RTTEST_UNLOCKBKL:
141 if (td->bkl != 4)
142 break;
143 unlock_kernel();
144 td->bkl = 0;
145 return 0;
146
147 default:
148 break;
149 }
150 return ret;
151}
152
/*
 * Schedule replacement for rtsem_down(). Only called for threads with
 * PF_MUTEX_TESTER set.
 *
 * This allows us to have finegrained control over the event flow.
 *
 * @mutex: the rt_mutex the calling thread is about to sleep on.
 *
 * The function records the progress of the running lock command in
 * td->mutexes[] (1 -> 2 before schedule(), 2 -> 3 or back to 1 afterwards)
 * and, for the RTTEST_LOCK / RTTEST_LOCKINT opcodes, then parks the thread
 * until an RTTEST_LOCKCONT command arrives. Other commands received while
 * parked are run through handle_op() in "lockwakeup" mode.
 */
void schedule_rt_mutex_test(struct rt_mutex *mutex)
{
	int tid, op, dat;
	struct test_thread_data *td;

	/* We have to lookup the task */
	for (tid = 0; tid < MAX_RT_TEST_THREADS; tid++) {
		if (threads[tid] == current)
			break;
	}

	/* The caller must be one of the tester threads. */
	BUG_ON(tid == MAX_RT_TEST_THREADS);

	td = &thread_data[tid];

	/* Snapshot the running command; it is restored before returning. */
	op = td->opcode;
	dat = td->opdata;

	switch (op) {
	case RTTEST_LOCK:
	case RTTEST_LOCKINT:
	case RTTEST_LOCKNOWAIT:
	case RTTEST_LOCKINTNOWAIT:
		/* Only track the mutex named by the command itself. */
		if (mutex != &mutexes[dat])
			break;

		if (td->mutexes[dat] != 1)
			break;

		/* Lock requested -> about to block. */
		td->mutexes[dat] = 2;
		td->event = atomic_add_return(1, &rttest_event);
		break;

	case RTTEST_LOCKBKL:
	default:
		break;
	}

	schedule();


	switch (op) {
	case RTTEST_LOCK:
	case RTTEST_LOCKINT:
		if (mutex != &mutexes[dat])
			return;

		if (td->mutexes[dat] != 2)
			return;

		/* Woken up: hold here until RTTEST_LOCKCONT is issued. */
		td->mutexes[dat] = 3;
		td->event = atomic_add_return(1, &rttest_event);
		break;

	case RTTEST_LOCKNOWAIT:
	case RTTEST_LOCKINTNOWAIT:
		if (mutex != &mutexes[dat])
			return;

		if (td->mutexes[dat] != 2)
			return;

		/* "No wait" variants go straight back to the requested state. */
		td->mutexes[dat] = 1;
		td->event = atomic_add_return(1, &rttest_event);
		return;

	case RTTEST_LOCKBKL:
		return;
	default:
		return;
	}

	/* Mark the thread as idle so sysfs_test_command() accepts commands. */
	td->opcode = 0;

	for (;;) {
		set_current_state(TASK_INTERRUPTIBLE);

		if (td->opcode > 0) {
			int ret;

			set_current_state(TASK_RUNNING);
			ret = handle_op(td, 1);
			set_current_state(TASK_INTERRUPTIBLE);
			/* LOCKCONT ends the wait; its result is not stored. */
			if (td->opcode == RTTEST_LOCKCONT)
				break;
			td->opcode = ret;
		}

		/* Wait for the next command to be executed */
		schedule();
	}

	/* Restore previous command and data */
	td->opcode = op;
	td->opdata = dat;
}
256
257static int test_func(void *data)
258{
259 struct test_thread_data *td = data;
260 int ret;
261
262 current->flags |= PF_MUTEX_TESTER;
263 allow_signal(SIGHUP);
264
265 for(;;) {
266
267 set_current_state(TASK_INTERRUPTIBLE);
268
269 if (td->opcode > 0) {
270 set_current_state(TASK_RUNNING);
271 ret = handle_op(td, 0);
272 set_current_state(TASK_INTERRUPTIBLE);
273 td->opcode = ret;
274 }
275
276 /* Wait for the next command to be executed */
277 schedule();
278
279 if (signal_pending(current))
280 flush_signals(current);
281
282 if(kthread_should_stop())
283 break;
284 }
285 return 0;
286}
287
288/**
289 * sysfs_test_command - interface for test commands
290 * @dev: thread reference
291 * @buf: command for actual step
292 * @count: length of buffer
293 *
294 * command syntax:
295 *
296 * opcode:data
297 */
298static ssize_t sysfs_test_command(struct sys_device *dev, const char *buf,
299 size_t count)
300{
301 struct sched_param schedpar;
302 struct test_thread_data *td;
303 char cmdbuf[32];
304 int op, dat, tid, ret;
305
306 td = container_of(dev, struct test_thread_data, sysdev);
307 tid = td->sysdev.id;
308
309 /* strings from sysfs write are not 0 terminated! */
310 if (count >= sizeof(cmdbuf))
311 return -EINVAL;
312
313 /* strip of \n: */
314 if (buf[count-1] == '\n')
315 count--;
316 if (count < 1)
317 return -EINVAL;
318
319 memcpy(cmdbuf, buf, count);
320 cmdbuf[count] = 0;
321
322 if (sscanf(cmdbuf, "%d:%d", &op, &dat) != 2)
323 return -EINVAL;
324
325 switch (op) {
326 case RTTEST_SCHEDOT:
327 schedpar.sched_priority = 0;
328 ret = sched_setscheduler(threads[tid], SCHED_NORMAL, &schedpar);
329 if (ret)
330 return ret;
331 set_user_nice(current, 0);
332 break;
333
334 case RTTEST_SCHEDRT:
335 schedpar.sched_priority = dat;
336 ret = sched_setscheduler(threads[tid], SCHED_FIFO, &schedpar);
337 if (ret)
338 return ret;
339 break;
340
341 case RTTEST_SIGNAL:
342 send_sig(SIGHUP, threads[tid], 0);
343 break;
344
345 default:
346 if (td->opcode > 0)
347 return -EBUSY;
348 td->opdata = dat;
349 td->opcode = op;
350 wake_up_process(threads[tid]);
351 }
352
353 return count;
354}
355
356/**
357 * sysfs_test_status - sysfs interface for rt tester
358 * @dev: thread to query
359 * @buf: char buffer to be filled with thread status info
360 */
361static ssize_t sysfs_test_status(struct sys_device *dev, char *buf)
362{
363 struct test_thread_data *td;
364 char *curr = buf;
365 task_t *tsk;
366 int i;
367
368 td = container_of(dev, struct test_thread_data, sysdev);
369 tsk = threads[td->sysdev.id];
370
371 spin_lock(&rttest_lock);
372
373 curr += sprintf(curr,
374 "O: %4d, E:%8d, S: 0x%08lx, P: %4d, N: %4d, B: %p, K: %d, M:",
375 td->opcode, td->event, tsk->state,
376 (MAX_RT_PRIO - 1) - tsk->prio,
377 (MAX_RT_PRIO - 1) - tsk->normal_prio,
378 tsk->pi_blocked_on, td->bkl);
379
380 for (i = MAX_RT_TEST_MUTEXES - 1; i >=0 ; i--)
381 curr += sprintf(curr, "%d", td->mutexes[i]);
382
383 spin_unlock(&rttest_lock);
384
385 curr += sprintf(curr, ", T: %p, R: %p\n", tsk,
386 mutexes[td->sysdev.id].owner);
387
388 return curr - buf;
389}
390
/* Read-only "status" and write-only "command" attributes, mode 0600. */
static SYSDEV_ATTR(status, 0600, sysfs_test_status, NULL);
static SYSDEV_ATTR(command, 0600, NULL, sysfs_test_command);

/* sysdev class named "rttest" that all tester thread devices belong to. */
static struct sysdev_class rttest_sysclass = {
	set_kset_name("rttest"),
};
397
398static int init_test_thread(int id)
399{
400 thread_data[id].sysdev.cls = &rttest_sysclass;
401 thread_data[id].sysdev.id = id;
402
403 threads[id] = kthread_run(test_func, &thread_data[id], "rt-test-%d", id);
404 if (IS_ERR(threads[id]))
405 return PTR_ERR(threads[id]);
406
407 return sysdev_register(&thread_data[id].sysdev);
408}
409
410static int init_rttest(void)
411{
412 int ret, i;
413
414 spin_lock_init(&rttest_lock);
415
416 for (i = 0; i < MAX_RT_TEST_MUTEXES; i++)
417 rt_mutex_init(&mutexes[i]);
418
419 ret = sysdev_class_register(&rttest_sysclass);
420 if (ret)
421 return ret;
422
423 for (i = 0; i < MAX_RT_TEST_THREADS; i++) {
424 ret = init_test_thread(i);
425 if (ret)
426 break;
427 ret = sysdev_create_file(&thread_data[i].sysdev, &attr_status);
428 if (ret)
429 break;
430 ret = sysdev_create_file(&thread_data[i].sysdev, &attr_command);
431 if (ret)
432 break;
433 }
434
435 printk("Initializing RT-Tester: %s\n", ret ? "Failed" : "OK" );
436
437 return ret;
438}
439
440device_initcall(init_rttest);
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
new file mode 100644
index 000000000000..45d61016da57
--- /dev/null
+++ b/kernel/rtmutex.c
@@ -0,0 +1,990 @@
1/*
2 * RT-Mutexes: simple blocking mutual exclusion locks with PI support
3 *
4 * started by Ingo Molnar and Thomas Gleixner.
5 *
6 * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
7 * Copyright (C) 2005-2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
8 * Copyright (C) 2005 Kihon Technologies Inc., Steven Rostedt
9 * Copyright (C) 2006 Esben Nielsen
10 */
11#include <linux/spinlock.h>
12#include <linux/module.h>
13#include <linux/sched.h>
14#include <linux/timer.h>
15
16#include "rtmutex_common.h"
17
18#ifdef CONFIG_DEBUG_RT_MUTEXES
19# include "rtmutex-debug.h"
20#else
21# include "rtmutex.h"
22#endif
23
24/*
25 * lock->owner state tracking:
26 *
27 * lock->owner holds the task_struct pointer of the owner. Bit 0 and 1
28 * are used to keep track of the "owner is pending" and "lock has
29 * waiters" state.
30 *
31 * owner bit1 bit0
32 * NULL 0 0 lock is free (fast acquire possible)
33 * NULL 0 1 invalid state
34 * NULL 1 0 Transitional State*
35 * NULL 1 1 invalid state
36 * taskpointer 0 0 lock is held (fast release possible)
37 * taskpointer 0 1 task is pending owner
38 * taskpointer 1 0 lock is held and has waiters
39 * taskpointer 1 1 task is pending owner and lock has more waiters
40 *
41 * Pending ownership is assigned to the top (highest priority)
42 * waiter of the lock, when the lock is released. The thread is woken
43 * up and can now take the lock. Until the lock is taken (bit 0
44 * cleared) a competing higher priority thread can steal the lock
45 * which puts the woken up thread back on the waiters list.
46 *
47 * The fast atomic compare exchange based acquire and release is only
48 * possible when bit 0 and 1 of lock->owner are 0.
49 *
50 * (*) There's a small time where the owner can be NULL and the
51 * "lock has waiters" bit is set. This can happen when grabbing the lock.
52 * To prevent a cmpxchg of the owner releasing the lock, we need to set this
53 * bit before looking at the lock, hence the reason this is a transitional
54 * state.
55 */
56
57static void
58rt_mutex_set_owner(struct rt_mutex *lock, struct task_struct *owner,
59 unsigned long mask)
60{
61 unsigned long val = (unsigned long)owner | mask;
62
63 if (rt_mutex_has_waiters(lock))
64 val |= RT_MUTEX_HAS_WAITERS;
65
66 lock->owner = (struct task_struct *)val;
67}
68
69static inline void clear_rt_mutex_waiters(struct rt_mutex *lock)
70{
71 lock->owner = (struct task_struct *)
72 ((unsigned long)lock->owner & ~RT_MUTEX_HAS_WAITERS);
73}
74
/* Drop the "has waiters" bit again if the lock has no waiters queued. */
static void fixup_rt_mutex_waiters(struct rt_mutex *lock)
{
	if (rt_mutex_has_waiters(lock))
		return;

	clear_rt_mutex_waiters(lock);
}
80
81/*
82 * We can speed up the acquire/release, if the architecture
83 * supports cmpxchg and if there's no debugging state to be set up
84 */
85#if defined(__HAVE_ARCH_CMPXCHG) && !defined(CONFIG_DEBUG_RT_MUTEXES)
86# define rt_mutex_cmpxchg(l,c,n) (cmpxchg(&l->owner, c, n) == c)
87static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
88{
89 unsigned long owner, *p = (unsigned long *) &lock->owner;
90
91 do {
92 owner = *p;
93 } while (cmpxchg(p, owner, owner | RT_MUTEX_HAS_WAITERS) != owner);
94}
95#else
96# define rt_mutex_cmpxchg(l,c,n) (0)
97static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
98{
99 lock->owner = (struct task_struct *)
100 ((unsigned long)lock->owner | RT_MUTEX_HAS_WAITERS);
101}
102#endif
103
104/*
105 * Calculate task priority from the waiter list priority
106 *
107 * Return task->normal_prio when the waiter list is empty or when
108 * the waiter is not allowed to do priority boosting
109 */
110int rt_mutex_getprio(struct task_struct *task)
111{
112 if (likely(!task_has_pi_waiters(task)))
113 return task->normal_prio;
114
115 return min(task_top_pi_waiter(task)->pi_list_entry.prio,
116 task->normal_prio);
117}
118
119/*
120 * Adjust the priority of a task, after its pi_waiters got modified.
121 *
122 * This can be both boosting and unboosting. task->pi_lock must be held.
123 */
124static void __rt_mutex_adjust_prio(struct task_struct *task)
125{
126 int prio = rt_mutex_getprio(task);
127
128 if (task->prio != prio)
129 rt_mutex_setprio(task, prio);
130}
131
132/*
133 * Adjust task priority (undo boosting). Called from the exit path of
134 * rt_mutex_slowunlock() and rt_mutex_slowlock().
135 *
136 * (Note: We do this outside of the protection of lock->wait_lock to
137 * allow the lock to be taken while or before we readjust the priority
138 * of task. We do not use the spin_xx_mutex() variants here as we are
139 * outside of the debug path.)
140 */
141static void rt_mutex_adjust_prio(struct task_struct *task)
142{
143 unsigned long flags;
144
145 spin_lock_irqsave(&task->pi_lock, flags);
146 __rt_mutex_adjust_prio(task);
147 spin_unlock_irqrestore(&task->pi_lock, flags);
148}
149
150/*
151 * Max number of times we'll walk the boosting chain:
152 */
153int max_lock_depth = 1024;
154
/*
 * Adjust the priority chain. Also used for deadlock detection.
 * Decreases task's usage by one - may thus free the task.
 * Returns 0 or -EDEADLK.
 *
 * @task:		task to start the walk at; the caller must hold a
 *			reference, which is dropped here
 * @deadlock_detect:	non-zero if the caller wants -EDEADLK reported
 * @orig_lock:		lock the walk was started for (may be NULL)
 * @orig_waiter:	waiter the walk was started for (may be NULL)
 * @top_task:		task on whose behalf the chain is walked
 *
 * Each iteration looks at the lock @task is blocked on, requeues @task's
 * waiter there with @task's current priority, fixes up the pi_waiters
 * list and priority of that lock's owner, and then continues with the
 * owner.
 */
static int rt_mutex_adjust_prio_chain(task_t *task,
				      int deadlock_detect,
				      struct rt_mutex *orig_lock,
				      struct rt_mutex_waiter *orig_waiter,
				      struct task_struct *top_task
				      __IP_DECL__)
{
	struct rt_mutex *lock;
	struct rt_mutex_waiter *waiter, *top_waiter = orig_waiter;
	int detect_deadlock, ret = 0, depth = 0;
	unsigned long flags;

	detect_deadlock = debug_rt_mutex_detect_deadlock(orig_waiter,
							 deadlock_detect);

	/*
	 * The (de)boosting is a step by step approach with a lot of
	 * pitfalls. We want this to be preemptible and we want to hold a
	 * maximum of two locks per step. So we have to check
	 * carefully whether things change under us.
	 */
 again:
	if (++depth > max_lock_depth) {
		static int prev_max;

		/*
		 * Print this only once. If the admin changes the limit,
		 * print a new message when reaching the limit again.
		 */
		if (prev_max != max_lock_depth) {
			prev_max = max_lock_depth;
			printk(KERN_WARNING "Maximum lock depth %d reached "
			       "task: %s (%d)\n", max_lock_depth,
			       top_task->comm, top_task->pid);
		}
		put_task_struct(task);

		return deadlock_detect ? -EDEADLK : 0;
	}
 retry:
	/*
	 * Task can not go away as we did a get_task() before !
	 */
	spin_lock_irqsave(&task->pi_lock, flags);

	waiter = task->pi_blocked_on;
	/*
	 * Check whether the end of the boosting chain has been
	 * reached or the state of the chain has changed while we
	 * dropped the locks.
	 */
	if (!waiter || !waiter->task)
		goto out_unlock_pi;

	if (top_waiter && (!task_has_pi_waiters(task) ||
			   top_waiter != task_top_pi_waiter(task)))
		goto out_unlock_pi;

	/*
	 * When deadlock detection is off then we check, if further
	 * priority adjustment is necessary.
	 */
	if (!detect_deadlock && waiter->list_entry.prio == task->prio)
		goto out_unlock_pi;

	lock = waiter->lock;
	/*
	 * pi_lock is already held here, while other paths in this file
	 * take pi_lock with wait_lock held. So only try-lock wait_lock and
	 * start the step over (with pi_lock dropped) if that fails.
	 */
	if (!spin_trylock(&lock->wait_lock)) {
		spin_unlock_irqrestore(&task->pi_lock, flags);
		cpu_relax();
		goto retry;
	}

	/* Deadlock detection */
	if (lock == orig_lock || rt_mutex_owner(lock) == top_task) {
		debug_rt_mutex_deadlock(deadlock_detect, orig_waiter, lock);
		spin_unlock(&lock->wait_lock);
		ret = deadlock_detect ? -EDEADLK : 0;
		goto out_unlock_pi;
	}

	/* Remember the top waiter before the requeue changes the order. */
	top_waiter = rt_mutex_top_waiter(lock);

	/* Requeue the waiter */
	plist_del(&waiter->list_entry, &lock->wait_list);
	waiter->list_entry.prio = task->prio;
	plist_add(&waiter->list_entry, &lock->wait_list);

	/* Release the task */
	spin_unlock_irqrestore(&task->pi_lock, flags);
	put_task_struct(task);

	/* Grab the next task */
	task = rt_mutex_owner(lock);
	spin_lock_irqsave(&task->pi_lock, flags);

	if (waiter == rt_mutex_top_waiter(lock)) {
		/* Boost the owner */
		plist_del(&top_waiter->pi_list_entry, &task->pi_waiters);
		waiter->pi_list_entry.prio = waiter->list_entry.prio;
		plist_add(&waiter->pi_list_entry, &task->pi_waiters);
		__rt_mutex_adjust_prio(task);

	} else if (top_waiter == waiter) {
		/* Deboost the owner */
		plist_del(&waiter->pi_list_entry, &task->pi_waiters);
		waiter = rt_mutex_top_waiter(lock);
		waiter->pi_list_entry.prio = waiter->list_entry.prio;
		plist_add(&waiter->pi_list_entry, &task->pi_waiters);
		__rt_mutex_adjust_prio(task);
	}

	/* Reference for the next iteration; dropped on every exit path. */
	get_task_struct(task);
	spin_unlock_irqrestore(&task->pi_lock, flags);

	top_waiter = rt_mutex_top_waiter(lock);
	spin_unlock(&lock->wait_lock);

	/* Without deadlock detection, stop once the top waiter is unchanged. */
	if (!detect_deadlock && waiter != top_waiter)
		goto out_put_task;

	goto again;

 out_unlock_pi:
	spin_unlock_irqrestore(&task->pi_lock, flags);
 out_put_task:
	put_task_struct(task);
	return ret;
}
288
/*
 * Optimization: check if we can steal the lock from the
 * assigned pending owner [which might not have taken the
 * lock yet]:
 *
 * Returns 1 when current may take the lock (it is the pending owner
 * itself, or its prio value is lower than the pending owner's), and 0
 * otherwise.
 */
static inline int try_to_steal_lock(struct rt_mutex *lock)
{
	struct task_struct *pendowner = rt_mutex_owner(lock);
	struct rt_mutex_waiter *next;
	unsigned long flags;

	/* Only a pending (not yet real) owner can be overtaken. */
	if (!rt_mutex_owner_pending(lock))
		return 0;

	if (pendowner == current)
		return 1;

	spin_lock_irqsave(&pendowner->pi_lock, flags);
	/* A numerically lower prio value is required to steal. */
	if (current->prio >= pendowner->prio) {
		spin_unlock_irqrestore(&pendowner->pi_lock, flags);
		return 0;
	}

	/*
	 * Check if a waiter is enqueued on the pending owners
	 * pi_waiters list. Remove it and readjust pending owners
	 * priority.
	 */
	if (likely(!rt_mutex_has_waiters(lock))) {
		spin_unlock_irqrestore(&pendowner->pi_lock, flags);
		return 1;
	}

	/* No chain handling, pending owner is not blocked on anything: */
	next = rt_mutex_top_waiter(lock);
	plist_del(&next->pi_list_entry, &pendowner->pi_waiters);
	__rt_mutex_adjust_prio(pendowner);
	spin_unlock_irqrestore(&pendowner->pi_lock, flags);

	/*
	 * We are going to steal the lock and a waiter was
	 * enqueued on the pending owners pi_waiters queue. So
	 * we have to enqueue this waiter into
	 * current->pi_waiters list. This covers the case,
	 * where current is boosted because it holds another
	 * lock and gets unboosted because the booster is
	 * interrupted, so we would delay a waiter with higher
	 * priority than current->normal_prio.
	 *
	 * Note: in the rare case of a SCHED_OTHER task changing
	 * its priority and thus stealing the lock, next->task
	 * might be current:
	 */
	if (likely(next->task != current)) {
		spin_lock_irqsave(&current->pi_lock, flags);
		plist_add(&next->pi_list_entry, &current->pi_waiters);
		__rt_mutex_adjust_prio(current);
		spin_unlock_irqrestore(&current->pi_lock, flags);
	}
	return 1;
}
350
/*
 * Try to take an rt-mutex
 *
 * This fails
 * - when the lock has a real owner
 * - when a different pending owner exists and has higher priority than current
 *
 * Must be called with lock->wait_lock held.
 *
 * Returns 1 when current has become the owner of @lock, 0 otherwise.
 */
static int try_to_take_rt_mutex(struct rt_mutex *lock __IP_DECL__)
{
	/*
	 * We have to be careful here if the atomic speedups are
	 * enabled, such that, when
	 *  - no other waiter is on the lock
	 *  - the lock has been released since we did the cmpxchg
	 * the lock can be released or taken while we are doing the
	 * checks and marking the lock with RT_MUTEX_HAS_WAITERS.
	 *
	 * The atomic acquire/release aware variant of
	 * mark_rt_mutex_waiters uses a cmpxchg loop. After setting
	 * the WAITERS bit, the atomic release / acquire can not
	 * happen anymore and lock->wait_lock protects us from the
	 * non-atomic case.
	 *
	 * Note, that this might set lock->owner =
	 * RT_MUTEX_HAS_WAITERS in the case the lock is not contended
	 * any more. This is fixed up when we take the ownership.
	 * This is the transitional state explained at the top of this file.
	 */
	mark_rt_mutex_waiters(lock);

	/* An owner exists and it cannot be overtaken: give up. */
	if (rt_mutex_owner(lock) && !try_to_steal_lock(lock))
		return 0;

	/* We got the lock. */
	debug_rt_mutex_lock(lock __IP__);

	/* Mask 0: a real owner, no "pending" bit. */
	rt_mutex_set_owner(lock, current, 0);

	rt_mutex_deadlock_account_lock(lock, current);

	return 1;
}
395
/*
 * Task blocks on lock.
 *
 * Prepare waiter and propagate pi chain
 *
 * This must be called with lock->wait_lock held.
 *
 * lock->wait_lock is dropped and re-taken around the chain walk, so the
 * lock state may have changed by the time this returns. Returns 0, or the
 * result of rt_mutex_adjust_prio_chain() when a chain walk was needed.
 */
static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
				   struct rt_mutex_waiter *waiter,
				   int detect_deadlock
				   __IP_DECL__)
{
	struct rt_mutex_waiter *top_waiter = waiter;
	task_t *owner = rt_mutex_owner(lock);
	int boost = 0, res;
	unsigned long flags;

	spin_lock_irqsave(&current->pi_lock, flags);
	/* Make sure current->prio is up to date before it is recorded. */
	__rt_mutex_adjust_prio(current);
	waiter->task = current;
	waiter->lock = lock;
	plist_node_init(&waiter->list_entry, current->prio);
	plist_node_init(&waiter->pi_list_entry, current->prio);

	/* Get the top priority waiter on the lock */
	if (rt_mutex_has_waiters(lock))
		top_waiter = rt_mutex_top_waiter(lock);
	plist_add(&waiter->list_entry, &lock->wait_list);

	current->pi_blocked_on = waiter;

	spin_unlock_irqrestore(&current->pi_lock, flags);

	if (waiter == rt_mutex_top_waiter(lock)) {
		/* We are the new top waiter: replace the old one on the owner. */
		spin_lock_irqsave(&owner->pi_lock, flags);
		plist_del(&top_waiter->pi_list_entry, &owner->pi_waiters);
		plist_add(&waiter->pi_list_entry, &owner->pi_waiters);

		__rt_mutex_adjust_prio(owner);
		if (owner->pi_blocked_on) {
			boost = 1;
			/* gets dropped in rt_mutex_adjust_prio_chain()! */
			get_task_struct(owner);
		}
		spin_unlock_irqrestore(&owner->pi_lock, flags);
	}
	else if (debug_rt_mutex_detect_deadlock(waiter, detect_deadlock)) {
		/* No priority change, but the chain is still walked for detection. */
		spin_lock_irqsave(&owner->pi_lock, flags);
		if (owner->pi_blocked_on) {
			boost = 1;
			/* gets dropped in rt_mutex_adjust_prio_chain()! */
			get_task_struct(owner);
		}
		spin_unlock_irqrestore(&owner->pi_lock, flags);
	}
	if (!boost)
		return 0;

	spin_unlock(&lock->wait_lock);

	res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock, waiter,
					 current __IP__);

	spin_lock(&lock->wait_lock);

	return res;
}
463
/*
 * Wake up the next waiter on the lock.
 *
 * Remove the top waiter from the current tasks waiter list and from
 * the lock waiter list. Set it as pending owner. Then wake it up.
 *
 * Called with lock->wait_lock held.
 */
static void wakeup_next_waiter(struct rt_mutex *lock)
{
	struct rt_mutex_waiter *waiter;
	struct task_struct *pendowner;
	unsigned long flags;

	spin_lock_irqsave(&current->pi_lock, flags);

	waiter = rt_mutex_top_waiter(lock);
	plist_del(&waiter->list_entry, &lock->wait_list);

	/*
	 * Remove it from current->pi_waiters. We do not adjust a
	 * possible priority boost right now. We execute wakeup in the
	 * boosted mode and go back to normal after releasing
	 * lock->wait_lock.
	 */
	plist_del(&waiter->pi_list_entry, &current->pi_waiters);
	pendowner = waiter->task;
	/* A NULL task tells the woken waiter it is no longer enqueued. */
	waiter->task = NULL;

	rt_mutex_set_owner(lock, pendowner, RT_MUTEX_OWNER_PENDING);

	spin_unlock_irqrestore(&current->pi_lock, flags);

	/*
	 * Clear the pi_blocked_on variable and enqueue a possible
	 * waiter into the pi_waiters list of the pending owner. This
	 * prevents that in case the pending owner gets unboosted a
	 * waiter with higher priority than pending-owner->normal_prio
	 * is blocked on the unboosted (pending) owner.
	 */
	spin_lock_irqsave(&pendowner->pi_lock, flags);

	/* Sanity checks: the pending owner must be blocked on this waiter. */
	WARN_ON(!pendowner->pi_blocked_on);
	WARN_ON(pendowner->pi_blocked_on != waiter);
	WARN_ON(pendowner->pi_blocked_on->lock != lock);

	pendowner->pi_blocked_on = NULL;

	if (rt_mutex_has_waiters(lock)) {
		struct rt_mutex_waiter *next;

		next = rt_mutex_top_waiter(lock);
		plist_add(&next->pi_list_entry, &pendowner->pi_waiters);
	}
	spin_unlock_irqrestore(&pendowner->pi_lock, flags);

	wake_up_process(pendowner);
}
522
/*
 * Remove a waiter from a lock
 *
 * Must be called with lock->wait_lock held
 *
 * Dequeues current's @waiter from @lock. If it was the top waiter and the
 * owner is another task, the owner's pi_waiters list and priority are
 * updated, and the chain is walked further when the owner is itself
 * blocked. lock->wait_lock is dropped and re-taken around that walk.
 */
static void remove_waiter(struct rt_mutex *lock,
			  struct rt_mutex_waiter *waiter __IP_DECL__)
{
	/* Must be sampled before the waiter is unlinked below. */
	int first = (waiter == rt_mutex_top_waiter(lock));
	int boost = 0;
	task_t *owner = rt_mutex_owner(lock);
	unsigned long flags;

	spin_lock_irqsave(&current->pi_lock, flags);
	plist_del(&waiter->list_entry, &lock->wait_list);
	waiter->task = NULL;
	current->pi_blocked_on = NULL;
	spin_unlock_irqrestore(&current->pi_lock, flags);

	if (first && owner != current) {

		spin_lock_irqsave(&owner->pi_lock, flags);

		plist_del(&waiter->pi_list_entry, &owner->pi_waiters);

		/* The new top waiter (if any) takes our place on the owner. */
		if (rt_mutex_has_waiters(lock)) {
			struct rt_mutex_waiter *next;

			next = rt_mutex_top_waiter(lock);
			plist_add(&next->pi_list_entry, &owner->pi_waiters);
		}
		__rt_mutex_adjust_prio(owner);

		if (owner->pi_blocked_on) {
			boost = 1;
			/* gets dropped in rt_mutex_adjust_prio_chain()! */
			get_task_struct(owner);
		}
		spin_unlock_irqrestore(&owner->pi_lock, flags);
	}

	WARN_ON(!plist_node_empty(&waiter->pi_list_entry));

	if (!boost)
		return;

	spin_unlock(&lock->wait_lock);

	rt_mutex_adjust_prio_chain(owner, 0, lock, NULL, current __IP__);

	spin_lock(&lock->wait_lock);
}
575
576/*
577 * Recheck the pi chain, in case we got a priority setting
578 *
579 * Called from sched_setscheduler
580 */
581void rt_mutex_adjust_pi(struct task_struct *task)
582{
583 struct rt_mutex_waiter *waiter;
584 unsigned long flags;
585
586 spin_lock_irqsave(&task->pi_lock, flags);
587
588 waiter = task->pi_blocked_on;
589 if (!waiter || waiter->list_entry.prio == task->prio) {
590 spin_unlock_irqrestore(&task->pi_lock, flags);
591 return;
592 }
593
594 /* gets dropped in rt_mutex_adjust_prio_chain()! */
595 get_task_struct(task);
596 spin_unlock_irqrestore(&task->pi_lock, flags);
597
598 rt_mutex_adjust_prio_chain(task, 0, NULL, NULL, task __RET_IP__);
599}
600
601/*
602 * Slow path lock function:
603 */
604static int __sched
605rt_mutex_slowlock(struct rt_mutex *lock, int state,
606 struct hrtimer_sleeper *timeout,
607 int detect_deadlock __IP_DECL__)
608{
609 struct rt_mutex_waiter waiter;
610 int ret = 0;
611
612 debug_rt_mutex_init_waiter(&waiter);
613 waiter.task = NULL;
614
615 spin_lock(&lock->wait_lock);
616
617 /* Try to acquire the lock again: */
618 if (try_to_take_rt_mutex(lock __IP__)) {
619 spin_unlock(&lock->wait_lock);
620 return 0;
621 }
622
623 set_current_state(state);
624
625 /* Setup the timer, when timeout != NULL */
626 if (unlikely(timeout))
627 hrtimer_start(&timeout->timer, timeout->timer.expires,
628 HRTIMER_ABS);
629
630 for (;;) {
631 /* Try to acquire the lock: */
632 if (try_to_take_rt_mutex(lock __IP__))
633 break;
634
635 /*
636 * TASK_INTERRUPTIBLE checks for signals and
637 * timeout. Ignored otherwise.
638 */
639 if (unlikely(state == TASK_INTERRUPTIBLE)) {
640 /* Signal pending? */
641 if (signal_pending(current))
642 ret = -EINTR;
643 if (timeout && !timeout->task)
644 ret = -ETIMEDOUT;
645 if (ret)
646 break;
647 }
648
649 /*
650 * waiter.task is NULL the first time we come here and
651 * when we have been woken up by the previous owner
652 * but the lock got stolen by a higher prio task.
653 */
654 if (!waiter.task) {
655 ret = task_blocks_on_rt_mutex(lock, &waiter,
656 detect_deadlock __IP__);
657 /*
658 * If we got woken up by the owner then start loop
659 * all over without going into schedule to try
660 * to get the lock now:
661 */
662 if (unlikely(!waiter.task))
663 continue;
664
665 if (unlikely(ret))
666 break;
667 }
668
669 spin_unlock(&lock->wait_lock);
670
671 debug_rt_mutex_print_deadlock(&waiter);
672
673 if (waiter.task)
674 schedule_rt_mutex(lock);
675
676 spin_lock(&lock->wait_lock);
677 set_current_state(state);
678 }
679
680 set_current_state(TASK_RUNNING);
681
682 if (unlikely(waiter.task))
683 remove_waiter(lock, &waiter __IP__);
684
685 /*
686 * try_to_take_rt_mutex() sets the waiter bit
687 * unconditionally. We might have to fix that up.
688 */
689 fixup_rt_mutex_waiters(lock);
690
691 spin_unlock(&lock->wait_lock);
692
693 /* Remove pending timer: */
694 if (unlikely(timeout))
695 hrtimer_cancel(&timeout->timer);
696
697 /*
698 * Readjust priority, when we did not get the lock. We might
699 * have been the pending owner and boosted. Since we did not
700 * take the lock, the PI boost has to go.
701 */
702 if (unlikely(ret))
703 rt_mutex_adjust_prio(current);
704
705 debug_rt_mutex_free_waiter(&waiter);
706
707 return ret;
708}
709
710/*
711 * Slow path try-lock function:
712 */
713static inline int
714rt_mutex_slowtrylock(struct rt_mutex *lock __IP_DECL__)
715{
716 int ret = 0;
717
718 spin_lock(&lock->wait_lock);
719
720 if (likely(rt_mutex_owner(lock) != current)) {
721
722 ret = try_to_take_rt_mutex(lock __IP__);
723 /*
724 * try_to_take_rt_mutex() sets the lock waiters
725 * bit unconditionally. Clean this up.
726 */
727 fixup_rt_mutex_waiters(lock);
728 }
729
730 spin_unlock(&lock->wait_lock);
731
732 return ret;
733}
734
735/*
736 * Slow path to release a rt-mutex:
 *
 * Runs under lock->wait_lock. If no waiter is queued, lock->owner is
 * cleared and the function returns. Otherwise wakeup_next_waiter() is
 * called and, once wait_lock has been dropped, the priority of current
 * is readjusted.
737 */
738static void __sched
739rt_mutex_slowunlock(struct rt_mutex *lock)
740{
741 spin_lock(&lock->wait_lock);
742
743 debug_rt_mutex_unlock(lock);
744
745 rt_mutex_deadlock_account_unlock(current);
746
747 if (!rt_mutex_has_waiters(lock)) {
748 lock->owner = NULL;
749 spin_unlock(&lock->wait_lock);
750 return;
751 }
752
753 wakeup_next_waiter(lock);
754
755 spin_unlock(&lock->wait_lock);
756
757 /* Undo pi boosting if necessary: */
758 rt_mutex_adjust_prio(current);
759}
760
761/*
762 * debug aware fast / slowpath lock,trylock,unlock
763 *
764 * The atomic acquire/release ops are compiled away, when either the
765 * architecture does not support cmpxchg or when debugging is enabled.
 *
 * rt_mutex_fastlock(): with @detect_deadlock off, returns 0 as soon as
 * rt_mutex_cmpxchg(lock, NULL, current) succeeds. In every other case
 * the result of @slowfn is returned; @slowfn is called with a NULL
 * timeout.
766 */
767static inline int
768rt_mutex_fastlock(struct rt_mutex *lock, int state,
769 int detect_deadlock,
770 int (*slowfn)(struct rt_mutex *lock, int state,
771 struct hrtimer_sleeper *timeout,
772 int detect_deadlock __IP_DECL__))
773{
774 if (!detect_deadlock && likely(rt_mutex_cmpxchg(lock, NULL, current))) {
775 rt_mutex_deadlock_account_lock(lock, current);
776 return 0;
777 } else
778 return slowfn(lock, state, NULL, detect_deadlock __RET_IP__);
779}
780
/*
 * Timed lock fast path: with @detect_deadlock off, returns 0 as soon as
 * rt_mutex_cmpxchg(lock, NULL, current) succeeds. Otherwise returns the
 * result of @slowfn, which receives the caller's @timeout unchanged.
 */
781static inline int
782rt_mutex_timed_fastlock(struct rt_mutex *lock, int state,
783 struct hrtimer_sleeper *timeout, int detect_deadlock,
784 int (*slowfn)(struct rt_mutex *lock, int state,
785 struct hrtimer_sleeper *timeout,
786 int detect_deadlock __IP_DECL__))
787{
788 if (!detect_deadlock && likely(rt_mutex_cmpxchg(lock, NULL, current))) {
789 rt_mutex_deadlock_account_lock(lock, current);
790 return 0;
791 } else
792 return slowfn(lock, state, timeout, detect_deadlock __RET_IP__);
793}
794
/*
 * Try-lock fast path: returns 1 when rt_mutex_cmpxchg(lock, NULL,
 * current) succeeds, otherwise returns the result of @slowfn.
 */
795static inline int
796rt_mutex_fasttrylock(struct rt_mutex *lock,
797 int (*slowfn)(struct rt_mutex *lock __IP_DECL__))
798{
799 if (likely(rt_mutex_cmpxchg(lock, NULL, current))) {
800 rt_mutex_deadlock_account_lock(lock, current);
801 return 1;
802 }
803 return slowfn(lock __RET_IP__);
804}
805
/*
 * Unlock fast path: if rt_mutex_cmpxchg(lock, current, NULL) succeeds
 * only the deadlock accounting is updated; otherwise the release is
 * left to @slowfn.
 */
806static inline void
807rt_mutex_fastunlock(struct rt_mutex *lock,
808 void (*slowfn)(struct rt_mutex *lock))
809{
810 if (likely(rt_mutex_cmpxchg(lock, current, NULL)))
811 rt_mutex_deadlock_account_unlock(current);
812 else
813 slowfn(lock);
814}
815
816/**
817 * rt_mutex_lock - lock a rt_mutex
818 *
819 * @lock: the rt_mutex to be locked
 *
 * Uses TASK_UNINTERRUPTIBLE as the wait state and keeps deadlock
 * detection off. The return value of the lock path is not propagated.
820 */
821void __sched rt_mutex_lock(struct rt_mutex *lock)
822{
823 might_sleep();
824
825 rt_mutex_fastlock(lock, TASK_UNINTERRUPTIBLE, 0, rt_mutex_slowlock);
826}
827EXPORT_SYMBOL_GPL(rt_mutex_lock);
828
829/**
830 * rt_mutex_lock_interruptible - lock a rt_mutex interruptible
831 *
832 * @lock: the rt_mutex to be locked
833 * @detect_deadlock: deadlock detection on/off
834 *
 * Uses TASK_INTERRUPTIBLE as the wait state.
 *
835 * Returns:
836 * 0 on success
837 * -EINTR when interrupted by a signal
838 * -EDEADLK when the lock would deadlock (when deadlock detection is on)
839 */
840int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock,
841 int detect_deadlock)
842{
843 might_sleep();
844
845 return rt_mutex_fastlock(lock, TASK_INTERRUPTIBLE,
846 detect_deadlock, rt_mutex_slowlock);
847}
848EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible);
849
850/**
851 * rt_mutex_timed_lock - lock a rt_mutex interruptible
852 * the timeout structure is provided
853 * by the caller
854 *
855 * @lock: the rt_mutex to be locked
856 * @timeout: timeout structure or NULL (no timeout)
857 * @detect_deadlock: deadlock detection on/off
858 *
859 * Returns:
860 * 0 on success
861 * -EINTR when interrupted by a signal
862 * -ETIMEDOUT when the timeout expired
863 * -EDEADLK when the lock would deadlock (when deadlock detection is on)
864 */
865int
866rt_mutex_timed_lock(struct rt_mutex *lock, struct hrtimer_sleeper *timeout,
867 int detect_deadlock)
868{
869 might_sleep();
870
871 return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout,
872 detect_deadlock, rt_mutex_slowlock);
873}
874EXPORT_SYMBOL_GPL(rt_mutex_timed_lock);
875
876/**
877 * rt_mutex_trylock - try to lock a rt_mutex
878 *
879 * @lock: the rt_mutex to be locked
880 *
881 * Returns 1 on success and 0 on contention
 *
 * 0 is also what rt_mutex_slowtrylock() yields when current already
 * owns the lock.
882 */
883int __sched rt_mutex_trylock(struct rt_mutex *lock)
884{
885 return rt_mutex_fasttrylock(lock, rt_mutex_slowtrylock);
886}
887EXPORT_SYMBOL_GPL(rt_mutex_trylock);
888
889/**
890 * rt_mutex_unlock - unlock a rt_mutex
891 *
892 * @lock: the rt_mutex to be unlocked
 *
 * Tries the fast path first; rt_mutex_slowunlock() is the fallback.
893 */
894void __sched rt_mutex_unlock(struct rt_mutex *lock)
895{
896 rt_mutex_fastunlock(lock, rt_mutex_slowunlock);
897}
898EXPORT_SYMBOL_GPL(rt_mutex_unlock);
899
900/**
901 * rt_mutex_destroy - mark a mutex unusable
902 * @lock: the mutex to be destroyed
903 *
904 * This function marks the mutex uninitialized, and any subsequent
905 * use of the mutex is forbidden. The mutex must not be locked when
906 * this function is called.
 *
 * A mutex that is still locked triggers WARN_ON(). The marking itself
 * (clearing lock->magic) is only compiled in with
 * CONFIG_DEBUG_RT_MUTEXES.
907 */
908void rt_mutex_destroy(struct rt_mutex *lock)
909{
910 WARN_ON(rt_mutex_is_locked(lock));
911#ifdef CONFIG_DEBUG_RT_MUTEXES
912 lock->magic = NULL;
913#endif
914}
915
916EXPORT_SYMBOL_GPL(rt_mutex_destroy);
917
918/**
919 * __rt_mutex_init - initialize the rt lock
920 *
921 * @lock: the rt lock to be initialized
 * @name: the lock name, handed on to debug_rt_mutex_init()
922 *
923 * Initialize the rt lock to unlocked state.
924 *
925 * Initializing of a locked rt lock is not allowed
926 */
927void __rt_mutex_init(struct rt_mutex *lock, const char *name)
928{
929 lock->owner = NULL;
930 spin_lock_init(&lock->wait_lock);
931 plist_head_init(&lock->wait_list, &lock->wait_lock);
932
933 debug_rt_mutex_init(lock, name);
934}
935EXPORT_SYMBOL_GPL(__rt_mutex_init);
936
937/**
938 * rt_mutex_init_proxy_locked - initialize and lock a rt_mutex on behalf of a
939 * proxy owner
940 *
941 * @lock: the rt_mutex to be locked
942 * @proxy_owner: the task to set as owner
943 *
944 * No locking. Caller has to do serializing itself
945 * Special API call for PI-futex support
 *
 * The lock is (re)initialized with a NULL name before @proxy_owner is
 * installed through rt_mutex_set_owner().
946 */
947void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
948 struct task_struct *proxy_owner)
949{
950 __rt_mutex_init(lock, NULL);
951 debug_rt_mutex_proxy_lock(lock, proxy_owner __RET_IP__);
952 rt_mutex_set_owner(lock, proxy_owner, 0);
953 rt_mutex_deadlock_account_lock(lock, proxy_owner);
954}
955
956/**
957 * rt_mutex_proxy_unlock - release a lock on behalf of owner
958 *
959 * @lock: the rt_mutex to be unlocked
 * @proxy_owner: the task on whose behalf the lock is released
960 *
961 * No locking. Caller has to do serializing itself
962 * Special API call for PI-futex support
963 */
964void rt_mutex_proxy_unlock(struct rt_mutex *lock,
965 struct task_struct *proxy_owner)
966{
967 debug_rt_mutex_proxy_unlock(lock);
968 rt_mutex_set_owner(lock, NULL, 0);
969 rt_mutex_deadlock_account_unlock(proxy_owner);
970}
971
972/**
973 * rt_mutex_next_owner - return the next owner of the lock
974 *
975 * @lock: the rt lock to query
976 *
977 * Returns the next owner of the lock (the task of the top waiter) or
 * NULL when the lock has no waiters
978 *
979 * Caller has to serialize against other accessors to the lock
980 * itself.
981 *
982 * Special API call for PI-futex support
983 */
984struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock)
985{
986 if (!rt_mutex_has_waiters(lock))
987 return NULL;
988
989 return rt_mutex_top_waiter(lock)->task;
990}
diff --git a/kernel/rtmutex.h b/kernel/rtmutex.h
new file mode 100644
index 000000000000..1e0fca13ff72
--- /dev/null
+++ b/kernel/rtmutex.h
@@ -0,0 +1,29 @@
1/*
2 * RT-Mutexes: blocking mutual exclusion locks with PI support
3 *
4 * started by Ingo Molnar and Thomas Gleixner:
5 *
6 * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
7 * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
8 *
9 * This file contains macros used solely by rtmutex.c.
10 * Non-debug version.
11 */
12
/*
 * __IP_DECL__, __IP__ and __RET_IP__ expand to nothing here, so the
 * extra argument they append to prototypes and calls in rtmutex.c
 * disappears (presumably a caller address in the debug build - confirm
 * against rtmutex-debug.h).
 *
 * The remaining hooks become empty statements, except
 * rt_mutex_deadlock_check(), which is a constant 0, and
 * debug_rt_mutex_detect_deadlock(), which just yields its second
 * argument.
 */
13#define __IP_DECL__
14#define __IP__
15#define __RET_IP__
16#define rt_mutex_deadlock_check(l) (0)
17#define rt_mutex_deadlock_account_lock(m, t) do { } while (0)
18#define rt_mutex_deadlock_account_unlock(l) do { } while (0)
19#define debug_rt_mutex_init_waiter(w) do { } while (0)
20#define debug_rt_mutex_free_waiter(w) do { } while (0)
21#define debug_rt_mutex_lock(l) do { } while (0)
22#define debug_rt_mutex_proxy_lock(l,p) do { } while (0)
23#define debug_rt_mutex_proxy_unlock(l) do { } while (0)
24#define debug_rt_mutex_unlock(l) do { } while (0)
25#define debug_rt_mutex_init(m, n) do { } while (0)
26#define debug_rt_mutex_deadlock(d, a ,l) do { } while (0)
27#define debug_rt_mutex_print_deadlock(w) do { } while (0)
28#define debug_rt_mutex_detect_deadlock(w,d) (d)
29#define debug_rt_mutex_reset_waiter(w) do { } while (0)
diff --git a/kernel/rtmutex_common.h b/kernel/rtmutex_common.h
new file mode 100644
index 000000000000..9c75856e791e
--- /dev/null
+++ b/kernel/rtmutex_common.h
@@ -0,0 +1,123 @@
1/*
2 * RT Mutexes: blocking mutual exclusion locks with PI support
3 *
4 * started by Ingo Molnar and Thomas Gleixner:
5 *
6 * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
7 * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com>
8 *
9 * This file contains the private data structure and API definitions.
10 */
11
12#ifndef __KERNEL_RTMUTEX_COMMON_H
13#define __KERNEL_RTMUTEX_COMMON_H
14
15#include <linux/rtmutex.h>
16
17/*
18 * The rtmutex in kernel tester is independent of rtmutex debugging. We
19 * call schedule_rt_mutex_test() instead of schedule() for the tasks which
20 * belong to the tester. That way we can delay the wakeup path of those
21 * threads to provoke lock stealing and testing of complex boosting scenarios.
 *
 * Tester tasks are recognized by PF_MUTEX_TESTER in current->flags.
22 */
23#ifdef CONFIG_RT_MUTEX_TESTER
24
25extern void schedule_rt_mutex_test(struct rt_mutex *lock);
26
27#define schedule_rt_mutex(_lock) \
28 do { \
29 if (!(current->flags & PF_MUTEX_TESTER)) \
30 schedule(); \
31 else \
32 schedule_rt_mutex_test(_lock); \
33 } while (0)
34
35#else /* !CONFIG_RT_MUTEX_TESTER: plain schedule(), _lock is unused */
36# define schedule_rt_mutex(_lock) schedule()
37#endif /* CONFIG_RT_MUTEX_TESTER */
38
39/*
40 * This is the control structure for tasks blocked on a rt_mutex,
41 * which is allocated on the kernel stack of the blocked task.
42 *
43 * @list_entry: pi node to enqueue into the mutex waiters list
44 * @pi_list_entry: pi node to enqueue into the mutex owner waiters list
45 * @task: task reference to the blocked task
 * @lock: the rt_mutex this waiter belongs to; rt_mutex_top_waiter()
 *        BUG()s when the top waiter's lock is not the queried lock
 *
 * ip, deadlock_task_pid and deadlock_lock exist only with
 * CONFIG_DEBUG_RT_MUTEXES.
46 */
47struct rt_mutex_waiter {
48 struct plist_node list_entry;
49 struct plist_node pi_list_entry;
50 struct task_struct *task;
51 struct rt_mutex *lock;
52#ifdef CONFIG_DEBUG_RT_MUTEXES
53 unsigned long ip;
54 pid_t deadlock_task_pid;
55 struct rt_mutex *deadlock_lock;
56#endif
57};
58
59/*
60 * Various helpers to access the waiters-plist:
 *
 * rt_mutex_has_waiters() is nonzero while lock->wait_list is not empty.
61 */
62static inline int rt_mutex_has_waiters(struct rt_mutex *lock)
63{
64 return !plist_head_empty(&lock->wait_list);
65}
66
/*
 * Returns the first entry of lock->wait_list. The list is not checked
 * for emptiness here. BUG()s when that waiter's ->lock is not @lock.
 */
67static inline struct rt_mutex_waiter *
68rt_mutex_top_waiter(struct rt_mutex *lock)
69{
70 struct rt_mutex_waiter *w;
71
72 w = plist_first_entry(&lock->wait_list, struct rt_mutex_waiter,
73 list_entry);
74 BUG_ON(w->lock != lock);
75
76 return w;
77}
78
/* Nonzero while the task's pi_waiters plist is not empty. */
79static inline int task_has_pi_waiters(struct task_struct *p)
80{
81 return !plist_head_empty(&p->pi_waiters);
82}
83
/*
 * Returns the first entry of p->pi_waiters (linked through
 * pi_list_entry). The list is not checked for emptiness here.
 */
84static inline struct rt_mutex_waiter *
85task_top_pi_waiter(struct task_struct *p)
86{
87 return plist_first_entry(&p->pi_waiters, struct rt_mutex_waiter,
88 pi_list_entry);
89}
90
91/*
92 * lock->owner state tracking:
 *
 * The two low bits of the lock->owner value carry state flags;
 * RT_MUTEX_OWNER_MASKALL covers both of them.
93 */
94#define RT_MUTEX_OWNER_PENDING 1UL
95#define RT_MUTEX_HAS_WAITERS 2UL
96#define RT_MUTEX_OWNER_MASKALL 3UL
97
/* Returns lock->owner with both state bits masked off. */
98static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock)
99{
100 return (struct task_struct *)
101 ((unsigned long)lock->owner & ~RT_MUTEX_OWNER_MASKALL);
102}
103
/*
 * Returns lock->owner with only RT_MUTEX_HAS_WAITERS masked off; the
 * RT_MUTEX_OWNER_PENDING bit, if set, stays in the returned value.
 */
104static inline struct task_struct *rt_mutex_real_owner(struct rt_mutex *lock)
105{
106 return (struct task_struct *)
107 ((unsigned long)lock->owner & ~RT_MUTEX_HAS_WAITERS);
108}
109
/* Nonzero when RT_MUTEX_OWNER_PENDING is set in lock->owner. */
110static inline unsigned long rt_mutex_owner_pending(struct rt_mutex *lock)
111{
112 return (unsigned long)lock->owner & RT_MUTEX_OWNER_PENDING;
113}
114
115/*
116 * PI-futex support (proxy locking functions, etc.):
117 */
118extern struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock);
119extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
120 struct task_struct *proxy_owner);
121extern void rt_mutex_proxy_unlock(struct rt_mutex *lock,
122 struct task_struct *proxy_owner);
123#endif /* __KERNEL_RTMUTEX_COMMON_H */
diff --git a/kernel/sched.c b/kernel/sched.c
index f06d059edef5..2629c1711fd6 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -168,15 +168,21 @@
168 */ 168 */
169 169
170#define SCALE_PRIO(x, prio) \ 170#define SCALE_PRIO(x, prio) \
171 max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO/2), MIN_TIMESLICE) 171 max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE)
172 172
173static unsigned int task_timeslice(task_t *p) 173static unsigned int static_prio_timeslice(int static_prio)
174{ 174{
175 if (p->static_prio < NICE_TO_PRIO(0)) 175 if (static_prio < NICE_TO_PRIO(0))
176 return SCALE_PRIO(DEF_TIMESLICE*4, p->static_prio); 176 return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio);
177 else 177 else
178 return SCALE_PRIO(DEF_TIMESLICE, p->static_prio); 178 return SCALE_PRIO(DEF_TIMESLICE, static_prio);
179} 179}
180
181static inline unsigned int task_timeslice(task_t *p)
182{
183 return static_prio_timeslice(p->static_prio);
184}
185
180#define task_hot(p, now, sd) ((long long) ((now) - (p)->last_ran) \ 186#define task_hot(p, now, sd) ((long long) ((now) - (p)->last_ran) \
181 < (long long) (sd)->cache_hot_time) 187 < (long long) (sd)->cache_hot_time)
182 188
@@ -184,13 +190,11 @@ static unsigned int task_timeslice(task_t *p)
184 * These are the runqueue data structures: 190 * These are the runqueue data structures:
185 */ 191 */
186 192
187#define BITMAP_SIZE ((((MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long))
188
189typedef struct runqueue runqueue_t; 193typedef struct runqueue runqueue_t;
190 194
191struct prio_array { 195struct prio_array {
192 unsigned int nr_active; 196 unsigned int nr_active;
193 unsigned long bitmap[BITMAP_SIZE]; 197 DECLARE_BITMAP(bitmap, MAX_PRIO+1); /* include 1 bit for delimiter */
194 struct list_head queue[MAX_PRIO]; 198 struct list_head queue[MAX_PRIO];
195}; 199};
196 200
@@ -209,6 +213,7 @@ struct runqueue {
209 * remote CPUs use both these fields when doing load calculation. 213 * remote CPUs use both these fields when doing load calculation.
210 */ 214 */
211 unsigned long nr_running; 215 unsigned long nr_running;
216 unsigned long raw_weighted_load;
212#ifdef CONFIG_SMP 217#ifdef CONFIG_SMP
213 unsigned long cpu_load[3]; 218 unsigned long cpu_load[3];
214#endif 219#endif
@@ -239,7 +244,6 @@ struct runqueue {
239 244
240 task_t *migration_thread; 245 task_t *migration_thread;
241 struct list_head migration_queue; 246 struct list_head migration_queue;
242 int cpu;
243#endif 247#endif
244 248
245#ifdef CONFIG_SCHEDSTATS 249#ifdef CONFIG_SCHEDSTATS
@@ -351,11 +355,30 @@ static inline void finish_lock_switch(runqueue_t *rq, task_t *prev)
351#endif /* __ARCH_WANT_UNLOCKED_CTXSW */ 355#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
352 356
353/* 357/*
358 * __task_rq_lock - lock the runqueue a given task resides on.
359 * Must be called interrupts disabled.
360 */
361static inline runqueue_t *__task_rq_lock(task_t *p)
362 __acquires(rq->lock)
363{
364 struct runqueue *rq;
365
366repeat_lock_task:
367 rq = task_rq(p);
368 spin_lock(&rq->lock);
369 if (unlikely(rq != task_rq(p))) {
370 spin_unlock(&rq->lock);
371 goto repeat_lock_task;
372 }
373 return rq;
374}
375
376/*
354 * task_rq_lock - lock the runqueue a given task resides on and disable 377 * task_rq_lock - lock the runqueue a given task resides on and disable
355 * interrupts. Note the ordering: we can safely lookup the task_rq without 378 * interrupts. Note the ordering: we can safely lookup the task_rq without
356 * explicitly disabling preemption. 379 * explicitly disabling preemption.
357 */ 380 */
358static inline runqueue_t *task_rq_lock(task_t *p, unsigned long *flags) 381static runqueue_t *task_rq_lock(task_t *p, unsigned long *flags)
359 __acquires(rq->lock) 382 __acquires(rq->lock)
360{ 383{
361 struct runqueue *rq; 384 struct runqueue *rq;
@@ -371,6 +394,12 @@ repeat_lock_task:
371 return rq; 394 return rq;
372} 395}
373 396
397static inline void __task_rq_unlock(runqueue_t *rq)
398 __releases(rq->lock)
399{
400 spin_unlock(&rq->lock);
401}
402
374static inline void task_rq_unlock(runqueue_t *rq, unsigned long *flags) 403static inline void task_rq_unlock(runqueue_t *rq, unsigned long *flags)
375 __releases(rq->lock) 404 __releases(rq->lock)
376{ 405{
@@ -634,7 +663,7 @@ static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array)
634} 663}
635 664
636/* 665/*
637 * effective_prio - return the priority that is based on the static 666 * __normal_prio - return the priority that is based on the static
638 * priority but is modified by bonuses/penalties. 667 * priority but is modified by bonuses/penalties.
639 * 668 *
640 * We scale the actual sleep average [0 .... MAX_SLEEP_AVG] 669 * We scale the actual sleep average [0 .... MAX_SLEEP_AVG]
@@ -647,13 +676,11 @@ static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array)
647 * 676 *
648 * Both properties are important to certain workloads. 677 * Both properties are important to certain workloads.
649 */ 678 */
650static int effective_prio(task_t *p) 679
680static inline int __normal_prio(task_t *p)
651{ 681{
652 int bonus, prio; 682 int bonus, prio;
653 683
654 if (rt_task(p))
655 return p->prio;
656
657 bonus = CURRENT_BONUS(p) - MAX_BONUS / 2; 684 bonus = CURRENT_BONUS(p) - MAX_BONUS / 2;
658 685
659 prio = p->static_prio - bonus; 686 prio = p->static_prio - bonus;
@@ -665,6 +692,106 @@ static int effective_prio(task_t *p)
665} 692}
666 693
667/* 694/*
695 * To aid in avoiding the subversion of "niceness" due to uneven distribution
696 * of tasks with abnormal "nice" values across CPUs the contribution that
697 * each task makes to its run queue's load is weighted according to its
698 * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a
699 * scaled version of the new time slice allocation that they receive on time
700 * slice expiry etc.
701 */
702
703/*
704 * Assume: static_prio_timeslice(NICE_TO_PRIO(0)) == DEF_TIMESLICE
705 * If static_prio_timeslice() is ever changed to break this assumption then
706 * this code will need modification
707 */
708#define TIME_SLICE_NICE_ZERO DEF_TIMESLICE
709#define LOAD_WEIGHT(lp) \
710 (((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO)
711#define PRIO_TO_LOAD_WEIGHT(prio) \
712 LOAD_WEIGHT(static_prio_timeslice(prio))
713#define RTPRIO_TO_LOAD_WEIGHT(rp) \
714 (PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + LOAD_WEIGHT(rp))
715
716static void set_load_weight(task_t *p)
717{
718 if (has_rt_policy(p)) {
719#ifdef CONFIG_SMP
720 if (p == task_rq(p)->migration_thread)
721 /*
722 * The migration thread does the actual balancing.
723 * Giving its load any weight will skew balancing
724 * adversely.
725 */
726 p->load_weight = 0;
727 else
728#endif
729 p->load_weight = RTPRIO_TO_LOAD_WEIGHT(p->rt_priority);
730 } else
731 p->load_weight = PRIO_TO_LOAD_WEIGHT(p->static_prio);
732}
733
734static inline void inc_raw_weighted_load(runqueue_t *rq, const task_t *p)
735{
736 rq->raw_weighted_load += p->load_weight;
737}
738
739static inline void dec_raw_weighted_load(runqueue_t *rq, const task_t *p)
740{
741 rq->raw_weighted_load -= p->load_weight;
742}
743
744static inline void inc_nr_running(task_t *p, runqueue_t *rq)
745{
746 rq->nr_running++;
747 inc_raw_weighted_load(rq, p);
748}
749
750static inline void dec_nr_running(task_t *p, runqueue_t *rq)
751{
752 rq->nr_running--;
753 dec_raw_weighted_load(rq, p);
754}
755
756/*
757 * Calculate the expected normal priority: i.e. priority
758 * without taking RT-inheritance into account. Might be
759 * boosted by interactivity modifiers. Changes upon fork,
760 * setprio syscalls, and whenever the interactivity
761 * estimator recalculates.
762 */
763static inline int normal_prio(task_t *p)
764{
765 int prio;
766
767 if (has_rt_policy(p))
768 prio = MAX_RT_PRIO-1 - p->rt_priority;
769 else
770 prio = __normal_prio(p);
771 return prio;
772}
773
774/*
775 * Calculate the current priority, i.e. the priority
776 * taken into account by the scheduler. This value might
777 * be boosted by RT tasks, or might be boosted by
778 * interactivity modifiers. Will be RT if the task got
779 * RT-boosted. If not then it returns p->normal_prio.
780 */
781static int effective_prio(task_t *p)
782{
783 p->normal_prio = normal_prio(p);
784 /*
785 * If we are RT tasks or we were boosted to RT priority,
786 * keep the priority unchanged. Otherwise, update priority
787 * to the normal priority:
788 */
789 if (!rt_prio(p->prio))
790 return p->normal_prio;
791 return p->prio;
792}
793
794/*
668 * __activate_task - move a task to the runqueue. 795 * __activate_task - move a task to the runqueue.
669 */ 796 */
670static void __activate_task(task_t *p, runqueue_t *rq) 797static void __activate_task(task_t *p, runqueue_t *rq)
@@ -674,7 +801,7 @@ static void __activate_task(task_t *p, runqueue_t *rq)
674 if (batch_task(p)) 801 if (batch_task(p))
675 target = rq->expired; 802 target = rq->expired;
676 enqueue_task(p, target); 803 enqueue_task(p, target);
677 rq->nr_running++; 804 inc_nr_running(p, rq);
678} 805}
679 806
680/* 807/*
@@ -683,39 +810,45 @@ static void __activate_task(task_t *p, runqueue_t *rq)
683static inline void __activate_idle_task(task_t *p, runqueue_t *rq) 810static inline void __activate_idle_task(task_t *p, runqueue_t *rq)
684{ 811{
685 enqueue_task_head(p, rq->active); 812 enqueue_task_head(p, rq->active);
686 rq->nr_running++; 813 inc_nr_running(p, rq);
687} 814}
688 815
816/*
817 * Recalculate p->normal_prio and p->prio after having slept,
818 * updating the sleep-average too:
819 */
689static int recalc_task_prio(task_t *p, unsigned long long now) 820static int recalc_task_prio(task_t *p, unsigned long long now)
690{ 821{
691 /* Caller must always ensure 'now >= p->timestamp' */ 822 /* Caller must always ensure 'now >= p->timestamp' */
692 unsigned long long __sleep_time = now - p->timestamp; 823 unsigned long sleep_time = now - p->timestamp;
693 unsigned long sleep_time;
694 824
695 if (batch_task(p)) 825 if (batch_task(p))
696 sleep_time = 0; 826 sleep_time = 0;
697 else {
698 if (__sleep_time > NS_MAX_SLEEP_AVG)
699 sleep_time = NS_MAX_SLEEP_AVG;
700 else
701 sleep_time = (unsigned long)__sleep_time;
702 }
703 827
704 if (likely(sleep_time > 0)) { 828 if (likely(sleep_time > 0)) {
705 /* 829 /*
706 * User tasks that sleep a long time are categorised as 830 * This ceiling is set to the lowest priority that would allow
707 * idle. They will only have their sleep_avg increased to a 831 * a task to be reinserted into the active array on timeslice
708 * level that makes them just interactive priority to stay 832 * completion.
709 * active yet prevent them suddenly becoming cpu hogs and
710 * starving other processes.
711 */ 833 */
712 if (p->mm && sleep_time > INTERACTIVE_SLEEP(p)) { 834 unsigned long ceiling = INTERACTIVE_SLEEP(p);
713 unsigned long ceiling;
714 835
715 ceiling = JIFFIES_TO_NS(MAX_SLEEP_AVG - 836 if (p->mm && sleep_time > ceiling && p->sleep_avg < ceiling) {
716 DEF_TIMESLICE); 837 /*
717 if (p->sleep_avg < ceiling) 838 * Prevents user tasks from achieving best priority
718 p->sleep_avg = ceiling; 839 * with one single large enough sleep.
840 */
841 p->sleep_avg = ceiling;
842 /*
843 * Using INTERACTIVE_SLEEP() as a ceiling places a
844 * nice(0) task 1ms sleep away from promotion, and
845 * gives it 700ms to round-robin with no chance of
846 * being demoted. This is more than generous, so
847 * mark this sleep as non-interactive to prevent the
848 * on-runqueue bonus logic from intervening should
849 * this task not receive cpu immediately.
850 */
851 p->sleep_type = SLEEP_NONINTERACTIVE;
719 } else { 852 } else {
720 /* 853 /*
721 * Tasks waking from uninterruptible sleep are 854 * Tasks waking from uninterruptible sleep are
@@ -723,12 +856,12 @@ static int recalc_task_prio(task_t *p, unsigned long long now)
723 * are likely to be waiting on I/O 856 * are likely to be waiting on I/O
724 */ 857 */
725 if (p->sleep_type == SLEEP_NONINTERACTIVE && p->mm) { 858 if (p->sleep_type == SLEEP_NONINTERACTIVE && p->mm) {
726 if (p->sleep_avg >= INTERACTIVE_SLEEP(p)) 859 if (p->sleep_avg >= ceiling)
727 sleep_time = 0; 860 sleep_time = 0;
728 else if (p->sleep_avg + sleep_time >= 861 else if (p->sleep_avg + sleep_time >=
729 INTERACTIVE_SLEEP(p)) { 862 ceiling) {
730 p->sleep_avg = INTERACTIVE_SLEEP(p); 863 p->sleep_avg = ceiling;
731 sleep_time = 0; 864 sleep_time = 0;
732 } 865 }
733 } 866 }
734 867
@@ -742,9 +875,9 @@ static int recalc_task_prio(task_t *p, unsigned long long now)
742 */ 875 */
743 p->sleep_avg += sleep_time; 876 p->sleep_avg += sleep_time;
744 877
745 if (p->sleep_avg > NS_MAX_SLEEP_AVG)
746 p->sleep_avg = NS_MAX_SLEEP_AVG;
747 } 878 }
879 if (p->sleep_avg > NS_MAX_SLEEP_AVG)
880 p->sleep_avg = NS_MAX_SLEEP_AVG;
748 } 881 }
749 882
750 return effective_prio(p); 883 return effective_prio(p);
@@ -805,7 +938,7 @@ static void activate_task(task_t *p, runqueue_t *rq, int local)
805 */ 938 */
806static void deactivate_task(struct task_struct *p, runqueue_t *rq) 939static void deactivate_task(struct task_struct *p, runqueue_t *rq)
807{ 940{
808 rq->nr_running--; 941 dec_nr_running(p, rq);
809 dequeue_task(p, p->array); 942 dequeue_task(p, p->array);
810 p->array = NULL; 943 p->array = NULL;
811} 944}
@@ -818,6 +951,11 @@ static void deactivate_task(struct task_struct *p, runqueue_t *rq)
818 * the target CPU. 951 * the target CPU.
819 */ 952 */
820#ifdef CONFIG_SMP 953#ifdef CONFIG_SMP
954
955#ifndef tsk_is_polling
956#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
957#endif
958
821static void resched_task(task_t *p) 959static void resched_task(task_t *p)
822{ 960{
823 int cpu; 961 int cpu;
@@ -833,9 +971,9 @@ static void resched_task(task_t *p)
833 if (cpu == smp_processor_id()) 971 if (cpu == smp_processor_id())
834 return; 972 return;
835 973
836 /* NEED_RESCHED must be visible before we test POLLING_NRFLAG */ 974 /* NEED_RESCHED must be visible before we test polling */
837 smp_mb(); 975 smp_mb();
838 if (!test_tsk_thread_flag(p, TIF_POLLING_NRFLAG)) 976 if (!tsk_is_polling(p))
839 smp_send_reschedule(cpu); 977 smp_send_reschedule(cpu);
840} 978}
841#else 979#else
@@ -855,6 +993,12 @@ inline int task_curr(const task_t *p)
855 return cpu_curr(task_cpu(p)) == p; 993 return cpu_curr(task_cpu(p)) == p;
856} 994}
857 995
996/* Used instead of source_load when we know the type == 0 */
997unsigned long weighted_cpuload(const int cpu)
998{
999 return cpu_rq(cpu)->raw_weighted_load;
1000}
1001
858#ifdef CONFIG_SMP 1002#ifdef CONFIG_SMP
859typedef struct { 1003typedef struct {
860 struct list_head list; 1004 struct list_head list;
@@ -944,7 +1088,8 @@ void kick_process(task_t *p)
944} 1088}
945 1089
946/* 1090/*
947 * Return a low guess at the load of a migration-source cpu. 1091 * Return a low guess at the load of a migration-source cpu weighted
1092 * according to the scheduling class and "nice" value.
948 * 1093 *
949 * We want to under-estimate the load of migration sources, to 1094 * We want to under-estimate the load of migration sources, to
950 * balance conservatively. 1095 * balance conservatively.
@@ -952,24 +1097,36 @@ void kick_process(task_t *p)
952static inline unsigned long source_load(int cpu, int type) 1097static inline unsigned long source_load(int cpu, int type)
953{ 1098{
954 runqueue_t *rq = cpu_rq(cpu); 1099 runqueue_t *rq = cpu_rq(cpu);
955 unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE; 1100
956 if (type == 0) 1101 if (type == 0)
957 return load_now; 1102 return rq->raw_weighted_load;
958 1103
959 return min(rq->cpu_load[type-1], load_now); 1104 return min(rq->cpu_load[type-1], rq->raw_weighted_load);
960} 1105}
961 1106
962/* 1107/*
963 * Return a high guess at the load of a migration-target cpu 1108 * Return a high guess at the load of a migration-target cpu weighted
1109 * according to the scheduling class and "nice" value.
964 */ 1110 */
965static inline unsigned long target_load(int cpu, int type) 1111static inline unsigned long target_load(int cpu, int type)
966{ 1112{
967 runqueue_t *rq = cpu_rq(cpu); 1113 runqueue_t *rq = cpu_rq(cpu);
968 unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE; 1114
969 if (type == 0) 1115 if (type == 0)
970 return load_now; 1116 return rq->raw_weighted_load;
1117
1118 return max(rq->cpu_load[type-1], rq->raw_weighted_load);
1119}
1120
1121/*
1122 * Return the average load per task on the cpu's run queue
1123 */
1124static inline unsigned long cpu_avg_load_per_task(int cpu)
1125{
1126 runqueue_t *rq = cpu_rq(cpu);
1127 unsigned long n = rq->nr_running;
971 1128
972 return max(rq->cpu_load[type-1], load_now); 1129 return n ? rq->raw_weighted_load / n : SCHED_LOAD_SCALE;
973} 1130}
974 1131
975/* 1132/*
@@ -1042,7 +1199,7 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
1042 cpus_and(tmp, group->cpumask, p->cpus_allowed); 1199 cpus_and(tmp, group->cpumask, p->cpus_allowed);
1043 1200
1044 for_each_cpu_mask(i, tmp) { 1201 for_each_cpu_mask(i, tmp) {
1045 load = source_load(i, 0); 1202 load = weighted_cpuload(i);
1046 1203
1047 if (load < min_load || (load == min_load && i == this_cpu)) { 1204 if (load < min_load || (load == min_load && i == this_cpu)) {
1048 min_load = load; 1205 min_load = load;
@@ -1069,9 +1226,15 @@ static int sched_balance_self(int cpu, int flag)
1069 struct task_struct *t = current; 1226 struct task_struct *t = current;
1070 struct sched_domain *tmp, *sd = NULL; 1227 struct sched_domain *tmp, *sd = NULL;
1071 1228
1072 for_each_domain(cpu, tmp) 1229 for_each_domain(cpu, tmp) {
1230 /*
1231 * If power savings logic is enabled for a domain, stop there.
1232 */
1233 if (tmp->flags & SD_POWERSAVINGS_BALANCE)
1234 break;
1073 if (tmp->flags & flag) 1235 if (tmp->flags & flag)
1074 sd = tmp; 1236 sd = tmp;
1237 }
1075 1238
1076 while (sd) { 1239 while (sd) {
1077 cpumask_t span; 1240 cpumask_t span;
@@ -1221,17 +1384,19 @@ static int try_to_wake_up(task_t *p, unsigned int state, int sync)
1221 1384
1222 if (this_sd->flags & SD_WAKE_AFFINE) { 1385 if (this_sd->flags & SD_WAKE_AFFINE) {
1223 unsigned long tl = this_load; 1386 unsigned long tl = this_load;
1387 unsigned long tl_per_task = cpu_avg_load_per_task(this_cpu);
1388
1224 /* 1389 /*
1225 * If sync wakeup then subtract the (maximum possible) 1390 * If sync wakeup then subtract the (maximum possible)
1226 * effect of the currently running task from the load 1391 * effect of the currently running task from the load
1227 * of the current CPU: 1392 * of the current CPU:
1228 */ 1393 */
1229 if (sync) 1394 if (sync)
1230 tl -= SCHED_LOAD_SCALE; 1395 tl -= current->load_weight;
1231 1396
1232 if ((tl <= load && 1397 if ((tl <= load &&
1233 tl + target_load(cpu, idx) <= SCHED_LOAD_SCALE) || 1398 tl + target_load(cpu, idx) <= tl_per_task) ||
1234 100*(tl + SCHED_LOAD_SCALE) <= imbalance*load) { 1399 100*(tl + p->load_weight) <= imbalance*load) {
1235 /* 1400 /*
1236 * This domain has SD_WAKE_AFFINE and 1401 * This domain has SD_WAKE_AFFINE and
1237 * p is cache cold in this domain, and 1402 * p is cache cold in this domain, and
@@ -1348,6 +1513,12 @@ void fastcall sched_fork(task_t *p, int clone_flags)
1348 * event cannot wake it up and insert it on the runqueue either. 1513 * event cannot wake it up and insert it on the runqueue either.
1349 */ 1514 */
1350 p->state = TASK_RUNNING; 1515 p->state = TASK_RUNNING;
1516
1517 /*
1518 * Make sure we do not leak PI boosting priority to the child:
1519 */
1520 p->prio = current->normal_prio;
1521
1351 INIT_LIST_HEAD(&p->run_list); 1522 INIT_LIST_HEAD(&p->run_list);
1352 p->array = NULL; 1523 p->array = NULL;
1353#ifdef CONFIG_SCHEDSTATS 1524#ifdef CONFIG_SCHEDSTATS
@@ -1427,10 +1598,11 @@ void fastcall wake_up_new_task(task_t *p, unsigned long clone_flags)
1427 __activate_task(p, rq); 1598 __activate_task(p, rq);
1428 else { 1599 else {
1429 p->prio = current->prio; 1600 p->prio = current->prio;
1601 p->normal_prio = current->normal_prio;
1430 list_add_tail(&p->run_list, &current->run_list); 1602 list_add_tail(&p->run_list, &current->run_list);
1431 p->array = current->array; 1603 p->array = current->array;
1432 p->array->nr_active++; 1604 p->array->nr_active++;
1433 rq->nr_running++; 1605 inc_nr_running(p, rq);
1434 } 1606 }
1435 set_need_resched(); 1607 set_need_resched();
1436 } else 1608 } else
@@ -1648,7 +1820,8 @@ unsigned long nr_uninterruptible(void)
1648 1820
1649unsigned long long nr_context_switches(void) 1821unsigned long long nr_context_switches(void)
1650{ 1822{
1651 unsigned long long i, sum = 0; 1823 int i;
1824 unsigned long long sum = 0;
1652 1825
1653 for_each_possible_cpu(i) 1826 for_each_possible_cpu(i)
1654 sum += cpu_rq(i)->nr_switches; 1827 sum += cpu_rq(i)->nr_switches;
@@ -1686,9 +1859,6 @@ unsigned long nr_active(void)
1686/* 1859/*
1687 * double_rq_lock - safely lock two runqueues 1860 * double_rq_lock - safely lock two runqueues
1688 * 1861 *
1689 * We must take them in cpu order to match code in
1690 * dependent_sleeper and wake_dependent_sleeper.
1691 *
1692 * Note this does not disable interrupts like task_rq_lock, 1862 * Note this does not disable interrupts like task_rq_lock,
1693 * you need to do so manually before calling. 1863 * you need to do so manually before calling.
1694 */ 1864 */
@@ -1700,7 +1870,7 @@ static void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2)
1700 spin_lock(&rq1->lock); 1870 spin_lock(&rq1->lock);
1701 __acquire(rq2->lock); /* Fake it out ;) */ 1871 __acquire(rq2->lock); /* Fake it out ;) */
1702 } else { 1872 } else {
1703 if (rq1->cpu < rq2->cpu) { 1873 if (rq1 < rq2) {
1704 spin_lock(&rq1->lock); 1874 spin_lock(&rq1->lock);
1705 spin_lock(&rq2->lock); 1875 spin_lock(&rq2->lock);
1706 } else { 1876 } else {
@@ -1736,7 +1906,7 @@ static void double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest)
1736 __acquires(this_rq->lock) 1906 __acquires(this_rq->lock)
1737{ 1907{
1738 if (unlikely(!spin_trylock(&busiest->lock))) { 1908 if (unlikely(!spin_trylock(&busiest->lock))) {
1739 if (busiest->cpu < this_rq->cpu) { 1909 if (busiest < this_rq) {
1740 spin_unlock(&this_rq->lock); 1910 spin_unlock(&this_rq->lock);
1741 spin_lock(&busiest->lock); 1911 spin_lock(&busiest->lock);
1742 spin_lock(&this_rq->lock); 1912 spin_lock(&this_rq->lock);
@@ -1799,9 +1969,9 @@ void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p,
1799 runqueue_t *this_rq, prio_array_t *this_array, int this_cpu) 1969 runqueue_t *this_rq, prio_array_t *this_array, int this_cpu)
1800{ 1970{
1801 dequeue_task(p, src_array); 1971 dequeue_task(p, src_array);
1802 src_rq->nr_running--; 1972 dec_nr_running(p, src_rq);
1803 set_task_cpu(p, this_cpu); 1973 set_task_cpu(p, this_cpu);
1804 this_rq->nr_running++; 1974 inc_nr_running(p, this_rq);
1805 enqueue_task(p, this_array); 1975 enqueue_task(p, this_array);
1806 p->timestamp = (p->timestamp - src_rq->timestamp_last_tick) 1976 p->timestamp = (p->timestamp - src_rq->timestamp_last_tick)
1807 + this_rq->timestamp_last_tick; 1977 + this_rq->timestamp_last_tick;
@@ -1848,26 +2018,42 @@ int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu,
1848 return 1; 2018 return 1;
1849} 2019}
1850 2020
2021#define rq_best_prio(rq) min((rq)->curr->prio, (rq)->best_expired_prio)
1851/* 2022/*
1852 * move_tasks tries to move up to max_nr_move tasks from busiest to this_rq, 2023 * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted
1853 * as part of a balancing operation within "domain". Returns the number of 2024 * load from busiest to this_rq, as part of a balancing operation within
1854 * tasks moved. 2025 * "domain". Returns the number of tasks moved.
1855 * 2026 *
1856 * Called with both runqueues locked. 2027 * Called with both runqueues locked.
1857 */ 2028 */
1858static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest, 2029static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest,
1859 unsigned long max_nr_move, struct sched_domain *sd, 2030 unsigned long max_nr_move, unsigned long max_load_move,
1860 enum idle_type idle, int *all_pinned) 2031 struct sched_domain *sd, enum idle_type idle,
2032 int *all_pinned)
1861{ 2033{
1862 prio_array_t *array, *dst_array; 2034 prio_array_t *array, *dst_array;
1863 struct list_head *head, *curr; 2035 struct list_head *head, *curr;
1864 int idx, pulled = 0, pinned = 0; 2036 int idx, pulled = 0, pinned = 0, this_best_prio, busiest_best_prio;
2037 int busiest_best_prio_seen;
2038 int skip_for_load; /* skip the task based on weighted load issues */
2039 long rem_load_move;
1865 task_t *tmp; 2040 task_t *tmp;
1866 2041
1867 if (max_nr_move == 0) 2042 if (max_nr_move == 0 || max_load_move == 0)
1868 goto out; 2043 goto out;
1869 2044
2045 rem_load_move = max_load_move;
1870 pinned = 1; 2046 pinned = 1;
2047 this_best_prio = rq_best_prio(this_rq);
2048 busiest_best_prio = rq_best_prio(busiest);
2049 /*
2050 * Enable handling of the case where there is more than one task
2051 * with the best priority. If the current running task is one
2052 * of those with prio==busiest_best_prio we know it won't be moved
2053 * and therefore it's safe to override the skip (based on load) of
2054 * any task we find with that prio.
2055 */
2056 busiest_best_prio_seen = busiest_best_prio == busiest->curr->prio;
1871 2057
1872 /* 2058 /*
1873 * We first consider expired tasks. Those will likely not be 2059 * We first consider expired tasks. Those will likely not be
@@ -1907,7 +2093,17 @@ skip_queue:
1907 2093
1908 curr = curr->prev; 2094 curr = curr->prev;
1909 2095
1910 if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle, &pinned)) { 2096 /*
2097 * To help distribute high priority tasks across CPUs we don't
2098 * skip a task if it will be the highest priority task (i.e. smallest
2099 * prio value) on its new queue regardless of its load weight
2100 */
2101 skip_for_load = tmp->load_weight > rem_load_move;
2102 if (skip_for_load && idx < this_best_prio)
2103 skip_for_load = !busiest_best_prio_seen && idx == busiest_best_prio;
2104 if (skip_for_load ||
2105 !can_migrate_task(tmp, busiest, this_cpu, sd, idle, &pinned)) {
2106 busiest_best_prio_seen |= idx == busiest_best_prio;
1911 if (curr != head) 2107 if (curr != head)
1912 goto skip_queue; 2108 goto skip_queue;
1913 idx++; 2109 idx++;
@@ -1921,9 +2117,15 @@ skip_queue:
1921 2117
1922 pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu); 2118 pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu);
1923 pulled++; 2119 pulled++;
2120 rem_load_move -= tmp->load_weight;
1924 2121
1925 /* We only want to steal up to the prescribed number of tasks. */ 2122 /*
1926 if (pulled < max_nr_move) { 2123 * We only want to steal up to the prescribed number of tasks
2124 * and the prescribed amount of weighted load.
2125 */
2126 if (pulled < max_nr_move && rem_load_move > 0) {
2127 if (idx < this_best_prio)
2128 this_best_prio = idx;
1927 if (curr != head) 2129 if (curr != head)
1928 goto skip_queue; 2130 goto skip_queue;
1929 idx++; 2131 idx++;
@@ -1944,7 +2146,7 @@ out:
1944 2146
1945/* 2147/*
1946 * find_busiest_group finds and returns the busiest CPU group within the 2148 * find_busiest_group finds and returns the busiest CPU group within the
1947 * domain. It calculates and returns the number of tasks which should be 2149 * domain. It calculates and returns the amount of weighted load which should be
1948 * moved to restore balance via the imbalance parameter. 2150 * moved to restore balance via the imbalance parameter.
1949 */ 2151 */
1950static struct sched_group * 2152static struct sched_group *
@@ -1954,9 +2156,19 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
1954 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; 2156 struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
1955 unsigned long max_load, avg_load, total_load, this_load, total_pwr; 2157 unsigned long max_load, avg_load, total_load, this_load, total_pwr;
1956 unsigned long max_pull; 2158 unsigned long max_pull;
2159 unsigned long busiest_load_per_task, busiest_nr_running;
2160 unsigned long this_load_per_task, this_nr_running;
1957 int load_idx; 2161 int load_idx;
2162#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
2163 int power_savings_balance = 1;
2164 unsigned long leader_nr_running = 0, min_load_per_task = 0;
2165 unsigned long min_nr_running = ULONG_MAX;
2166 struct sched_group *group_min = NULL, *group_leader = NULL;
2167#endif
1958 2168
1959 max_load = this_load = total_load = total_pwr = 0; 2169 max_load = this_load = total_load = total_pwr = 0;
2170 busiest_load_per_task = busiest_nr_running = 0;
2171 this_load_per_task = this_nr_running = 0;
1960 if (idle == NOT_IDLE) 2172 if (idle == NOT_IDLE)
1961 load_idx = sd->busy_idx; 2173 load_idx = sd->busy_idx;
1962 else if (idle == NEWLY_IDLE) 2174 else if (idle == NEWLY_IDLE)
@@ -1965,16 +2177,19 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
1965 load_idx = sd->idle_idx; 2177 load_idx = sd->idle_idx;
1966 2178
1967 do { 2179 do {
1968 unsigned long load; 2180 unsigned long load, group_capacity;
1969 int local_group; 2181 int local_group;
1970 int i; 2182 int i;
2183 unsigned long sum_nr_running, sum_weighted_load;
1971 2184
1972 local_group = cpu_isset(this_cpu, group->cpumask); 2185 local_group = cpu_isset(this_cpu, group->cpumask);
1973 2186
1974 /* Tally up the load of all CPUs in the group */ 2187 /* Tally up the load of all CPUs in the group */
1975 avg_load = 0; 2188 sum_weighted_load = sum_nr_running = avg_load = 0;
1976 2189
1977 for_each_cpu_mask(i, group->cpumask) { 2190 for_each_cpu_mask(i, group->cpumask) {
2191 runqueue_t *rq = cpu_rq(i);
2192
1978 if (*sd_idle && !idle_cpu(i)) 2193 if (*sd_idle && !idle_cpu(i))
1979 *sd_idle = 0; 2194 *sd_idle = 0;
1980 2195
@@ -1985,6 +2200,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
1985 load = source_load(i, load_idx); 2200 load = source_load(i, load_idx);
1986 2201
1987 avg_load += load; 2202 avg_load += load;
2203 sum_nr_running += rq->nr_running;
2204 sum_weighted_load += rq->raw_weighted_load;
1988 } 2205 }
1989 2206
1990 total_load += avg_load; 2207 total_load += avg_load;
@@ -1993,17 +2210,80 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
1993 /* Adjust by relative CPU power of the group */ 2210 /* Adjust by relative CPU power of the group */
1994 avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power; 2211 avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
1995 2212
2213 group_capacity = group->cpu_power / SCHED_LOAD_SCALE;
2214
1996 if (local_group) { 2215 if (local_group) {
1997 this_load = avg_load; 2216 this_load = avg_load;
1998 this = group; 2217 this = group;
1999 } else if (avg_load > max_load) { 2218 this_nr_running = sum_nr_running;
2219 this_load_per_task = sum_weighted_load;
2220 } else if (avg_load > max_load &&
2221 sum_nr_running > group_capacity) {
2000 max_load = avg_load; 2222 max_load = avg_load;
2001 busiest = group; 2223 busiest = group;
2224 busiest_nr_running = sum_nr_running;
2225 busiest_load_per_task = sum_weighted_load;
2002 } 2226 }
2227
2228#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
2229 /*
2230 * Busy processors will not participate in power savings
2231 * balance.
2232 */
2233 if (idle == NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
2234 goto group_next;
2235
2236 /*
2237 * If the local group is idle or completely loaded
2238 * no need to do power savings balance at this domain
2239 */
2240 if (local_group && (this_nr_running >= group_capacity ||
2241 !this_nr_running))
2242 power_savings_balance = 0;
2243
2244 /*
2245 * If a group is already running at full capacity or idle,
2246 * don't include that group in power savings calculations
2247 */
2248 if (!power_savings_balance || sum_nr_running >= group_capacity
2249 || !sum_nr_running)
2250 goto group_next;
2251
2252 /*
2253 * Calculate the group which has the least non-idle load.
2254 * This is the group from where we need to pick up the load
2255 * for saving power
2256 */
2257 if ((sum_nr_running < min_nr_running) ||
2258 (sum_nr_running == min_nr_running &&
2259 first_cpu(group->cpumask) <
2260 first_cpu(group_min->cpumask))) {
2261 group_min = group;
2262 min_nr_running = sum_nr_running;
2263 min_load_per_task = sum_weighted_load /
2264 sum_nr_running;
2265 }
2266
2267 /*
2268 * Calculate the group which is almost near its
2269 * capacity but still has some space to pick up some load
2270 * from other group and save more power
2271 */
2272 if (sum_nr_running <= group_capacity - 1)
2273 if (sum_nr_running > leader_nr_running ||
2274 (sum_nr_running == leader_nr_running &&
2275 first_cpu(group->cpumask) >
2276 first_cpu(group_leader->cpumask))) {
2277 group_leader = group;
2278 leader_nr_running = sum_nr_running;
2279 }
2280
2281group_next:
2282#endif
2003 group = group->next; 2283 group = group->next;
2004 } while (group != sd->groups); 2284 } while (group != sd->groups);
2005 2285
2006 if (!busiest || this_load >= max_load || max_load <= SCHED_LOAD_SCALE) 2286 if (!busiest || this_load >= max_load || busiest_nr_running == 0)
2007 goto out_balanced; 2287 goto out_balanced;
2008 2288
2009 avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr; 2289 avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
@@ -2012,6 +2292,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2012 100*max_load <= sd->imbalance_pct*this_load) 2292 100*max_load <= sd->imbalance_pct*this_load)
2013 goto out_balanced; 2293 goto out_balanced;
2014 2294
2295 busiest_load_per_task /= busiest_nr_running;
2015 /* 2296 /*
2016 * We're trying to get all the cpus to the average_load, so we don't 2297 * We're trying to get all the cpus to the average_load, so we don't
2017 * want to push ourselves above the average load, nor do we wish to 2298 * want to push ourselves above the average load, nor do we wish to
@@ -2023,21 +2304,50 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2023 * by pulling tasks to us. Be careful of negative numbers as they'll 2304 * by pulling tasks to us. Be careful of negative numbers as they'll
2024 * appear as very large values with unsigned longs. 2305 * appear as very large values with unsigned longs.
2025 */ 2306 */
2307 if (max_load <= busiest_load_per_task)
2308 goto out_balanced;
2309
2310 /*
2311 * In the presence of smp nice balancing, certain scenarios can have
2312 * max load less than avg load(as we skip the groups at or below
2313 * its cpu_power, while calculating max_load..)
2314 */
2315 if (max_load < avg_load) {
2316 *imbalance = 0;
2317 goto small_imbalance;
2318 }
2026 2319
2027 /* Don't want to pull so many tasks that a group would go idle */ 2320 /* Don't want to pull so many tasks that a group would go idle */
2028 max_pull = min(max_load - avg_load, max_load - SCHED_LOAD_SCALE); 2321 max_pull = min(max_load - avg_load, max_load - busiest_load_per_task);
2029 2322
2030 /* How much load to actually move to equalise the imbalance */ 2323 /* How much load to actually move to equalise the imbalance */
2031 *imbalance = min(max_pull * busiest->cpu_power, 2324 *imbalance = min(max_pull * busiest->cpu_power,
2032 (avg_load - this_load) * this->cpu_power) 2325 (avg_load - this_load) * this->cpu_power)
2033 / SCHED_LOAD_SCALE; 2326 / SCHED_LOAD_SCALE;
2034 2327
2035 if (*imbalance < SCHED_LOAD_SCALE) { 2328 /*
2036 unsigned long pwr_now = 0, pwr_move = 0; 2329 * if *imbalance is less than the average load per runnable task
2330 * there is no guarantee that any tasks will be moved so we'll have
2331 * a think about bumping its value to force at least one task to be
2332 * moved
2333 */
2334 if (*imbalance < busiest_load_per_task) {
2335 unsigned long pwr_now, pwr_move;
2037 unsigned long tmp; 2336 unsigned long tmp;
2337 unsigned int imbn;
2338
2339small_imbalance:
2340 pwr_move = pwr_now = 0;
2341 imbn = 2;
2342 if (this_nr_running) {
2343 this_load_per_task /= this_nr_running;
2344 if (busiest_load_per_task > this_load_per_task)
2345 imbn = 1;
2346 } else
2347 this_load_per_task = SCHED_LOAD_SCALE;
2038 2348
2039 if (max_load - this_load >= SCHED_LOAD_SCALE*2) { 2349 if (max_load - this_load >= busiest_load_per_task * imbn) {
2040 *imbalance = 1; 2350 *imbalance = busiest_load_per_task;
2041 return busiest; 2351 return busiest;
2042 } 2352 }
2043 2353
@@ -2047,39 +2357,47 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2047 * moving them. 2357 * moving them.
2048 */ 2358 */
2049 2359
2050 pwr_now += busiest->cpu_power*min(SCHED_LOAD_SCALE, max_load); 2360 pwr_now += busiest->cpu_power *
2051 pwr_now += this->cpu_power*min(SCHED_LOAD_SCALE, this_load); 2361 min(busiest_load_per_task, max_load);
2362 pwr_now += this->cpu_power *
2363 min(this_load_per_task, this_load);
2052 pwr_now /= SCHED_LOAD_SCALE; 2364 pwr_now /= SCHED_LOAD_SCALE;
2053 2365
2054 /* Amount of load we'd subtract */ 2366 /* Amount of load we'd subtract */
2055 tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/busiest->cpu_power; 2367 tmp = busiest_load_per_task*SCHED_LOAD_SCALE/busiest->cpu_power;
2056 if (max_load > tmp) 2368 if (max_load > tmp)
2057 pwr_move += busiest->cpu_power*min(SCHED_LOAD_SCALE, 2369 pwr_move += busiest->cpu_power *
2058 max_load - tmp); 2370 min(busiest_load_per_task, max_load - tmp);
2059 2371
2060 /* Amount of load we'd add */ 2372 /* Amount of load we'd add */
2061 if (max_load*busiest->cpu_power < 2373 if (max_load*busiest->cpu_power <
2062 SCHED_LOAD_SCALE*SCHED_LOAD_SCALE) 2374 busiest_load_per_task*SCHED_LOAD_SCALE)
2063 tmp = max_load*busiest->cpu_power/this->cpu_power; 2375 tmp = max_load*busiest->cpu_power/this->cpu_power;
2064 else 2376 else
2065 tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/this->cpu_power; 2377 tmp = busiest_load_per_task*SCHED_LOAD_SCALE/this->cpu_power;
2066 pwr_move += this->cpu_power*min(SCHED_LOAD_SCALE, this_load + tmp); 2378 pwr_move += this->cpu_power*min(this_load_per_task, this_load + tmp);
2067 pwr_move /= SCHED_LOAD_SCALE; 2379 pwr_move /= SCHED_LOAD_SCALE;
2068 2380
2069 /* Move if we gain throughput */ 2381 /* Move if we gain throughput */
2070 if (pwr_move <= pwr_now) 2382 if (pwr_move <= pwr_now)
2071 goto out_balanced; 2383 goto out_balanced;
2072 2384
2073 *imbalance = 1; 2385 *imbalance = busiest_load_per_task;
2074 return busiest;
2075 } 2386 }
2076 2387
2077 /* Get rid of the scaling factor, rounding down as we divide */
2078 *imbalance = *imbalance / SCHED_LOAD_SCALE;
2079 return busiest; 2388 return busiest;
2080 2389
2081out_balanced: 2390out_balanced:
2391#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
2392 if (idle == NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
2393 goto ret;
2082 2394
2395 if (this == group_leader && group_leader != group_min) {
2396 *imbalance = min_load_per_task;
2397 return group_min;
2398 }
2399ret:
2400#endif
2083 *imbalance = 0; 2401 *imbalance = 0;
2084 return NULL; 2402 return NULL;
2085} 2403}
@@ -2088,18 +2406,21 @@ out_balanced:
2088 * find_busiest_queue - find the busiest runqueue among the cpus in group. 2406 * find_busiest_queue - find the busiest runqueue among the cpus in group.
2089 */ 2407 */
2090static runqueue_t *find_busiest_queue(struct sched_group *group, 2408static runqueue_t *find_busiest_queue(struct sched_group *group,
2091 enum idle_type idle) 2409 enum idle_type idle, unsigned long imbalance)
2092{ 2410{
2093 unsigned long load, max_load = 0; 2411 unsigned long max_load = 0;
2094 runqueue_t *busiest = NULL; 2412 runqueue_t *busiest = NULL, *rqi;
2095 int i; 2413 int i;
2096 2414
2097 for_each_cpu_mask(i, group->cpumask) { 2415 for_each_cpu_mask(i, group->cpumask) {
2098 load = source_load(i, 0); 2416 rqi = cpu_rq(i);
2099 2417
2100 if (load > max_load) { 2418 if (rqi->nr_running == 1 && rqi->raw_weighted_load > imbalance)
2101 max_load = load; 2419 continue;
2102 busiest = cpu_rq(i); 2420
2421 if (rqi->raw_weighted_load > max_load) {
2422 max_load = rqi->raw_weighted_load;
2423 busiest = rqi;
2103 } 2424 }
2104 } 2425 }
2105 2426
@@ -2112,6 +2433,7 @@ static runqueue_t *find_busiest_queue(struct sched_group *group,
2112 */ 2433 */
2113#define MAX_PINNED_INTERVAL 512 2434#define MAX_PINNED_INTERVAL 512
2114 2435
2436#define minus_1_or_zero(n) ((n) > 0 ? (n) - 1 : 0)
2115/* 2437/*
2116 * Check this_cpu to ensure it is balanced within domain. Attempt to move 2438 * Check this_cpu to ensure it is balanced within domain. Attempt to move
2117 * tasks if there is an imbalance. 2439 * tasks if there is an imbalance.
@@ -2128,7 +2450,8 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
2128 int active_balance = 0; 2450 int active_balance = 0;
2129 int sd_idle = 0; 2451 int sd_idle = 0;
2130 2452
2131 if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER) 2453 if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
2454 !sched_smt_power_savings)
2132 sd_idle = 1; 2455 sd_idle = 1;
2133 2456
2134 schedstat_inc(sd, lb_cnt[idle]); 2457 schedstat_inc(sd, lb_cnt[idle]);
@@ -2139,7 +2462,7 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
2139 goto out_balanced; 2462 goto out_balanced;
2140 } 2463 }
2141 2464
2142 busiest = find_busiest_queue(group, idle); 2465 busiest = find_busiest_queue(group, idle, imbalance);
2143 if (!busiest) { 2466 if (!busiest) {
2144 schedstat_inc(sd, lb_nobusyq[idle]); 2467 schedstat_inc(sd, lb_nobusyq[idle]);
2145 goto out_balanced; 2468 goto out_balanced;
@@ -2159,6 +2482,7 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
2159 */ 2482 */
2160 double_rq_lock(this_rq, busiest); 2483 double_rq_lock(this_rq, busiest);
2161 nr_moved = move_tasks(this_rq, this_cpu, busiest, 2484 nr_moved = move_tasks(this_rq, this_cpu, busiest,
2485 minus_1_or_zero(busiest->nr_running),
2162 imbalance, sd, idle, &all_pinned); 2486 imbalance, sd, idle, &all_pinned);
2163 double_rq_unlock(this_rq, busiest); 2487 double_rq_unlock(this_rq, busiest);
2164 2488
@@ -2216,7 +2540,8 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
2216 sd->balance_interval *= 2; 2540 sd->balance_interval *= 2;
2217 } 2541 }
2218 2542
2219 if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER) 2543 if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
2544 !sched_smt_power_savings)
2220 return -1; 2545 return -1;
2221 return nr_moved; 2546 return nr_moved;
2222 2547
@@ -2231,7 +2556,7 @@ out_one_pinned:
2231 (sd->balance_interval < sd->max_interval)) 2556 (sd->balance_interval < sd->max_interval))
2232 sd->balance_interval *= 2; 2557 sd->balance_interval *= 2;
2233 2558
2234 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER) 2559 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && !sched_smt_power_savings)
2235 return -1; 2560 return -1;
2236 return 0; 2561 return 0;
2237} 2562}
@@ -2252,7 +2577,7 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
2252 int nr_moved = 0; 2577 int nr_moved = 0;
2253 int sd_idle = 0; 2578 int sd_idle = 0;
2254 2579
2255 if (sd->flags & SD_SHARE_CPUPOWER) 2580 if (sd->flags & SD_SHARE_CPUPOWER && !sched_smt_power_savings)
2256 sd_idle = 1; 2581 sd_idle = 1;
2257 2582
2258 schedstat_inc(sd, lb_cnt[NEWLY_IDLE]); 2583 schedstat_inc(sd, lb_cnt[NEWLY_IDLE]);
@@ -2262,7 +2587,7 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
2262 goto out_balanced; 2587 goto out_balanced;
2263 } 2588 }
2264 2589
2265 busiest = find_busiest_queue(group, NEWLY_IDLE); 2590 busiest = find_busiest_queue(group, NEWLY_IDLE, imbalance);
2266 if (!busiest) { 2591 if (!busiest) {
2267 schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]); 2592 schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]);
2268 goto out_balanced; 2593 goto out_balanced;
@@ -2277,6 +2602,7 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
2277 /* Attempt to move tasks */ 2602 /* Attempt to move tasks */
2278 double_lock_balance(this_rq, busiest); 2603 double_lock_balance(this_rq, busiest);
2279 nr_moved = move_tasks(this_rq, this_cpu, busiest, 2604 nr_moved = move_tasks(this_rq, this_cpu, busiest,
2605 minus_1_or_zero(busiest->nr_running),
2280 imbalance, sd, NEWLY_IDLE, NULL); 2606 imbalance, sd, NEWLY_IDLE, NULL);
2281 spin_unlock(&busiest->lock); 2607 spin_unlock(&busiest->lock);
2282 } 2608 }
@@ -2292,7 +2618,7 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
2292 2618
2293out_balanced: 2619out_balanced:
2294 schedstat_inc(sd, lb_balanced[NEWLY_IDLE]); 2620 schedstat_inc(sd, lb_balanced[NEWLY_IDLE]);
2295 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER) 2621 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && !sched_smt_power_savings)
2296 return -1; 2622 return -1;
2297 sd->nr_balance_failed = 0; 2623 sd->nr_balance_failed = 0;
2298 return 0; 2624 return 0;
@@ -2347,17 +2673,19 @@ static void active_load_balance(runqueue_t *busiest_rq, int busiest_cpu)
2347 double_lock_balance(busiest_rq, target_rq); 2673 double_lock_balance(busiest_rq, target_rq);
2348 2674
2349 /* Search for an sd spanning us and the target CPU. */ 2675 /* Search for an sd spanning us and the target CPU. */
2350 for_each_domain(target_cpu, sd) 2676 for_each_domain(target_cpu, sd) {
2351 if ((sd->flags & SD_LOAD_BALANCE) && 2677 if ((sd->flags & SD_LOAD_BALANCE) &&
2352 cpu_isset(busiest_cpu, sd->span)) 2678 cpu_isset(busiest_cpu, sd->span))
2353 break; 2679 break;
2680 }
2354 2681
2355 if (unlikely(sd == NULL)) 2682 if (unlikely(sd == NULL))
2356 goto out; 2683 goto out;
2357 2684
2358 schedstat_inc(sd, alb_cnt); 2685 schedstat_inc(sd, alb_cnt);
2359 2686
2360 if (move_tasks(target_rq, target_cpu, busiest_rq, 1, sd, SCHED_IDLE, NULL)) 2687 if (move_tasks(target_rq, target_cpu, busiest_rq, 1,
2688 RTPRIO_TO_LOAD_WEIGHT(100), sd, SCHED_IDLE, NULL))
2361 schedstat_inc(sd, alb_pushed); 2689 schedstat_inc(sd, alb_pushed);
2362 else 2690 else
2363 schedstat_inc(sd, alb_failed); 2691 schedstat_inc(sd, alb_failed);
@@ -2385,7 +2713,7 @@ static void rebalance_tick(int this_cpu, runqueue_t *this_rq,
2385 struct sched_domain *sd; 2713 struct sched_domain *sd;
2386 int i; 2714 int i;
2387 2715
2388 this_load = this_rq->nr_running * SCHED_LOAD_SCALE; 2716 this_load = this_rq->raw_weighted_load;
2389 /* Update our load */ 2717 /* Update our load */
2390 for (i = 0; i < 3; i++) { 2718 for (i = 0; i < 3; i++) {
2391 unsigned long new_load = this_load; 2719 unsigned long new_load = this_load;
@@ -2686,48 +3014,35 @@ static inline void wakeup_busy_runqueue(runqueue_t *rq)
2686 resched_task(rq->idle); 3014 resched_task(rq->idle);
2687} 3015}
2688 3016
2689static void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq) 3017/*
3018 * Called with interrupt disabled and this_rq's runqueue locked.
3019 */
3020static void wake_sleeping_dependent(int this_cpu)
2690{ 3021{
2691 struct sched_domain *tmp, *sd = NULL; 3022 struct sched_domain *tmp, *sd = NULL;
2692 cpumask_t sibling_map;
2693 int i; 3023 int i;
2694 3024
2695 for_each_domain(this_cpu, tmp) 3025 for_each_domain(this_cpu, tmp) {
2696 if (tmp->flags & SD_SHARE_CPUPOWER) 3026 if (tmp->flags & SD_SHARE_CPUPOWER) {
2697 sd = tmp; 3027 sd = tmp;
3028 break;
3029 }
3030 }
2698 3031
2699 if (!sd) 3032 if (!sd)
2700 return; 3033 return;
2701 3034
2702 /* 3035 for_each_cpu_mask(i, sd->span) {
2703 * Unlock the current runqueue because we have to lock in
2704 * CPU order to avoid deadlocks. Caller knows that we might
2705 * unlock. We keep IRQs disabled.
2706 */
2707 spin_unlock(&this_rq->lock);
2708
2709 sibling_map = sd->span;
2710
2711 for_each_cpu_mask(i, sibling_map)
2712 spin_lock(&cpu_rq(i)->lock);
2713 /*
2714 * We clear this CPU from the mask. This both simplifies the
2715 * inner loop and keeps this_rq locked when we exit:
2716 */
2717 cpu_clear(this_cpu, sibling_map);
2718
2719 for_each_cpu_mask(i, sibling_map) {
2720 runqueue_t *smt_rq = cpu_rq(i); 3036 runqueue_t *smt_rq = cpu_rq(i);
2721 3037
3038 if (i == this_cpu)
3039 continue;
3040 if (unlikely(!spin_trylock(&smt_rq->lock)))
3041 continue;
3042
2722 wakeup_busy_runqueue(smt_rq); 3043 wakeup_busy_runqueue(smt_rq);
3044 spin_unlock(&smt_rq->lock);
2723 } 3045 }
2724
2725 for_each_cpu_mask(i, sibling_map)
2726 spin_unlock(&cpu_rq(i)->lock);
2727 /*
2728 * We exit with this_cpu's rq still held and IRQs
2729 * still disabled:
2730 */
2731} 3046}
2732 3047
2733/* 3048/*
@@ -2740,52 +3055,46 @@ static inline unsigned long smt_slice(task_t *p, struct sched_domain *sd)
2740 return p->time_slice * (100 - sd->per_cpu_gain) / 100; 3055 return p->time_slice * (100 - sd->per_cpu_gain) / 100;
2741} 3056}
2742 3057
2743static int dependent_sleeper(int this_cpu, runqueue_t *this_rq) 3058/*
3059 * To minimise lock contention and not have to drop this_rq's runlock we only
3060 * trylock the sibling runqueues and bypass those runqueues if we fail to
3061 * acquire their lock. As we only trylock the normal locking order does not
3062 * need to be obeyed.
3063 */
3064static int dependent_sleeper(int this_cpu, runqueue_t *this_rq, task_t *p)
2744{ 3065{
2745 struct sched_domain *tmp, *sd = NULL; 3066 struct sched_domain *tmp, *sd = NULL;
2746 cpumask_t sibling_map;
2747 prio_array_t *array;
2748 int ret = 0, i; 3067 int ret = 0, i;
2749 task_t *p;
2750 3068
2751 for_each_domain(this_cpu, tmp) 3069 /* kernel/rt threads do not participate in dependent sleeping */
2752 if (tmp->flags & SD_SHARE_CPUPOWER) 3070 if (!p->mm || rt_task(p))
3071 return 0;
3072
3073 for_each_domain(this_cpu, tmp) {
3074 if (tmp->flags & SD_SHARE_CPUPOWER) {
2753 sd = tmp; 3075 sd = tmp;
3076 break;
3077 }
3078 }
2754 3079
2755 if (!sd) 3080 if (!sd)
2756 return 0; 3081 return 0;
2757 3082
2758 /* 3083 for_each_cpu_mask(i, sd->span) {
2759 * The same locking rules and details apply as for 3084 runqueue_t *smt_rq;
2760 * wake_sleeping_dependent(): 3085 task_t *smt_curr;
2761 */
2762 spin_unlock(&this_rq->lock);
2763 sibling_map = sd->span;
2764 for_each_cpu_mask(i, sibling_map)
2765 spin_lock(&cpu_rq(i)->lock);
2766 cpu_clear(this_cpu, sibling_map);
2767 3086
2768 /* 3087 if (i == this_cpu)
2769 * Establish next task to be run - it might have gone away because 3088 continue;
2770 * we released the runqueue lock above:
2771 */
2772 if (!this_rq->nr_running)
2773 goto out_unlock;
2774 array = this_rq->active;
2775 if (!array->nr_active)
2776 array = this_rq->expired;
2777 BUG_ON(!array->nr_active);
2778 3089
2779 p = list_entry(array->queue[sched_find_first_bit(array->bitmap)].next, 3090 smt_rq = cpu_rq(i);
2780 task_t, run_list); 3091 if (unlikely(!spin_trylock(&smt_rq->lock)))
3092 continue;
2781 3093
2782 for_each_cpu_mask(i, sibling_map) { 3094 smt_curr = smt_rq->curr;
2783 runqueue_t *smt_rq = cpu_rq(i);
2784 task_t *smt_curr = smt_rq->curr;
2785 3095
2786 /* Kernel threads do not participate in dependent sleeping */ 3096 if (!smt_curr->mm)
2787 if (!p->mm || !smt_curr->mm || rt_task(p)) 3097 goto unlock;
2788 goto check_smt_task;
2789 3098
2790 /* 3099 /*
2791 * If a user task with lower static priority than the 3100 * If a user task with lower static priority than the
@@ -2803,49 +3112,24 @@ static int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
2803 if ((jiffies % DEF_TIMESLICE) > 3112 if ((jiffies % DEF_TIMESLICE) >
2804 (sd->per_cpu_gain * DEF_TIMESLICE / 100)) 3113 (sd->per_cpu_gain * DEF_TIMESLICE / 100))
2805 ret = 1; 3114 ret = 1;
2806 } else 3115 } else {
2807 if (smt_curr->static_prio < p->static_prio && 3116 if (smt_curr->static_prio < p->static_prio &&
2808 !TASK_PREEMPTS_CURR(p, smt_rq) && 3117 !TASK_PREEMPTS_CURR(p, smt_rq) &&
2809 smt_slice(smt_curr, sd) > task_timeslice(p)) 3118 smt_slice(smt_curr, sd) > task_timeslice(p))
2810 ret = 1; 3119 ret = 1;
2811
2812check_smt_task:
2813 if ((!smt_curr->mm && smt_curr != smt_rq->idle) ||
2814 rt_task(smt_curr))
2815 continue;
2816 if (!p->mm) {
2817 wakeup_busy_runqueue(smt_rq);
2818 continue;
2819 }
2820
2821 /*
2822 * Reschedule a lower priority task on the SMT sibling for
2823 * it to be put to sleep, or wake it up if it has been put to
2824 * sleep for priority reasons to see if it should run now.
2825 */
2826 if (rt_task(p)) {
2827 if ((jiffies % DEF_TIMESLICE) >
2828 (sd->per_cpu_gain * DEF_TIMESLICE / 100))
2829 resched_task(smt_curr);
2830 } else {
2831 if (TASK_PREEMPTS_CURR(p, smt_rq) &&
2832 smt_slice(p, sd) > task_timeslice(smt_curr))
2833 resched_task(smt_curr);
2834 else
2835 wakeup_busy_runqueue(smt_rq);
2836 } 3120 }
3121unlock:
3122 spin_unlock(&smt_rq->lock);
2837 } 3123 }
2838out_unlock:
2839 for_each_cpu_mask(i, sibling_map)
2840 spin_unlock(&cpu_rq(i)->lock);
2841 return ret; 3124 return ret;
2842} 3125}
2843#else 3126#else
2844static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq) 3127static inline void wake_sleeping_dependent(int this_cpu)
2845{ 3128{
2846} 3129}
2847 3130
2848static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq) 3131static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq,
3132 task_t *p)
2849{ 3133{
2850 return 0; 3134 return 0;
2851} 3135}
@@ -2967,32 +3251,13 @@ need_resched_nonpreemptible:
2967 3251
2968 cpu = smp_processor_id(); 3252 cpu = smp_processor_id();
2969 if (unlikely(!rq->nr_running)) { 3253 if (unlikely(!rq->nr_running)) {
2970go_idle:
2971 idle_balance(cpu, rq); 3254 idle_balance(cpu, rq);
2972 if (!rq->nr_running) { 3255 if (!rq->nr_running) {
2973 next = rq->idle; 3256 next = rq->idle;
2974 rq->expired_timestamp = 0; 3257 rq->expired_timestamp = 0;
2975 wake_sleeping_dependent(cpu, rq); 3258 wake_sleeping_dependent(cpu);
2976 /*
2977 * wake_sleeping_dependent() might have released
2978 * the runqueue, so break out if we got new
2979 * tasks meanwhile:
2980 */
2981 if (!rq->nr_running)
2982 goto switch_tasks;
2983 }
2984 } else {
2985 if (dependent_sleeper(cpu, rq)) {
2986 next = rq->idle;
2987 goto switch_tasks; 3259 goto switch_tasks;
2988 } 3260 }
2989 /*
2990 * dependent_sleeper() releases and reacquires the runqueue
2991 * lock, hence go into the idle loop if the rq went
2992 * empty meanwhile:
2993 */
2994 if (unlikely(!rq->nr_running))
2995 goto go_idle;
2996 } 3261 }
2997 3262
2998 array = rq->active; 3263 array = rq->active;
@@ -3030,6 +3295,8 @@ go_idle:
3030 } 3295 }
3031 } 3296 }
3032 next->sleep_type = SLEEP_NORMAL; 3297 next->sleep_type = SLEEP_NORMAL;
3298 if (dependent_sleeper(cpu, rq, next))
3299 next = rq->idle;
3033switch_tasks: 3300switch_tasks:
3034 if (next == rq->idle) 3301 if (next == rq->idle)
3035 schedstat_inc(rq, sched_goidle); 3302 schedstat_inc(rq, sched_goidle);
@@ -3473,12 +3740,65 @@ long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout)
3473 3740
3474EXPORT_SYMBOL(sleep_on_timeout); 3741EXPORT_SYMBOL(sleep_on_timeout);
3475 3742
3743#ifdef CONFIG_RT_MUTEXES
3744
3745/*
3746 * rt_mutex_setprio - set the current priority of a task
3747 * @p: task
3748 * @prio: prio value (kernel-internal form)
3749 *
3750 * This function changes the 'effective' priority of a task. It does
3751 * not touch ->normal_prio like __setscheduler().
3752 *
3753 * Used by the rt_mutex code to implement priority inheritance logic.
3754 */
3755void rt_mutex_setprio(task_t *p, int prio)
3756{
3757 unsigned long flags;
3758 prio_array_t *array;
3759 runqueue_t *rq;
3760 int oldprio;
3761
3762 BUG_ON(prio < 0 || prio > MAX_PRIO);
3763
3764 rq = task_rq_lock(p, &flags);
3765
3766 oldprio = p->prio;
3767 array = p->array;
3768 if (array)
3769 dequeue_task(p, array);
3770 p->prio = prio;
3771
3772 if (array) {
3773 /*
3774 * If changing to an RT priority then queue it
3775 * in the active array!
3776 */
3777 if (rt_task(p))
3778 array = rq->active;
3779 enqueue_task(p, array);
3780 /*
3781 * Reschedule if we are currently running on this runqueue and
3782 * our priority decreased, or if we are not currently running on
3783 * this runqueue and our priority is higher than the current's
3784 */
3785 if (task_running(rq, p)) {
3786 if (p->prio > oldprio)
3787 resched_task(rq->curr);
3788 } else if (TASK_PREEMPTS_CURR(p, rq))
3789 resched_task(rq->curr);
3790 }
3791 task_rq_unlock(rq, &flags);
3792}
3793
3794#endif
3795
3476void set_user_nice(task_t *p, long nice) 3796void set_user_nice(task_t *p, long nice)
3477{ 3797{
3478 unsigned long flags; 3798 unsigned long flags;
3479 prio_array_t *array; 3799 prio_array_t *array;
3480 runqueue_t *rq; 3800 runqueue_t *rq;
3481 int old_prio, new_prio, delta; 3801 int old_prio, delta;
3482 3802
3483 if (TASK_NICE(p) == nice || nice < -20 || nice > 19) 3803 if (TASK_NICE(p) == nice || nice < -20 || nice > 19)
3484 return; 3804 return;
@@ -3493,22 +3813,25 @@ void set_user_nice(task_t *p, long nice)
3493 * it wont have any effect on scheduling until the task is 3813 * it wont have any effect on scheduling until the task is
3494 * not SCHED_NORMAL/SCHED_BATCH: 3814 * not SCHED_NORMAL/SCHED_BATCH:
3495 */ 3815 */
3496 if (rt_task(p)) { 3816 if (has_rt_policy(p)) {
3497 p->static_prio = NICE_TO_PRIO(nice); 3817 p->static_prio = NICE_TO_PRIO(nice);
3498 goto out_unlock; 3818 goto out_unlock;
3499 } 3819 }
3500 array = p->array; 3820 array = p->array;
3501 if (array) 3821 if (array) {
3502 dequeue_task(p, array); 3822 dequeue_task(p, array);
3823 dec_raw_weighted_load(rq, p);
3824 }
3503 3825
3504 old_prio = p->prio;
3505 new_prio = NICE_TO_PRIO(nice);
3506 delta = new_prio - old_prio;
3507 p->static_prio = NICE_TO_PRIO(nice); 3826 p->static_prio = NICE_TO_PRIO(nice);
3508 p->prio += delta; 3827 set_load_weight(p);
3828 old_prio = p->prio;
3829 p->prio = effective_prio(p);
3830 delta = p->prio - old_prio;
3509 3831
3510 if (array) { 3832 if (array) {
3511 enqueue_task(p, array); 3833 enqueue_task(p, array);
3834 inc_raw_weighted_load(rq, p);
3512 /* 3835 /*
3513 * If the task increased its priority or is running and 3836 * If the task increased its priority or is running and
3514 * lowered its priority, then reschedule its CPU: 3837 * lowered its priority, then reschedule its CPU:
@@ -3519,7 +3842,6 @@ void set_user_nice(task_t *p, long nice)
3519out_unlock: 3842out_unlock:
3520 task_rq_unlock(rq, &flags); 3843 task_rq_unlock(rq, &flags);
3521} 3844}
3522
3523EXPORT_SYMBOL(set_user_nice); 3845EXPORT_SYMBOL(set_user_nice);
3524 3846
3525/* 3847/*
@@ -3634,16 +3956,15 @@ static void __setscheduler(struct task_struct *p, int policy, int prio)
3634 BUG_ON(p->array); 3956 BUG_ON(p->array);
3635 p->policy = policy; 3957 p->policy = policy;
3636 p->rt_priority = prio; 3958 p->rt_priority = prio;
3637 if (policy != SCHED_NORMAL && policy != SCHED_BATCH) { 3959 p->normal_prio = normal_prio(p);
3638 p->prio = MAX_RT_PRIO-1 - p->rt_priority; 3960 /* we are holding p->pi_lock already */
3639 } else { 3961 p->prio = rt_mutex_getprio(p);
3640 p->prio = p->static_prio; 3962 /*
3641 /* 3963 * SCHED_BATCH tasks are treated as perpetual CPU hogs:
3642 * SCHED_BATCH tasks are treated as perpetual CPU hogs: 3964 */
3643 */ 3965 if (policy == SCHED_BATCH)
3644 if (policy == SCHED_BATCH) 3966 p->sleep_avg = 0;
3645 p->sleep_avg = 0; 3967 set_load_weight(p);
3646 }
3647} 3968}
3648 3969
3649/** 3970/**
@@ -3662,6 +3983,8 @@ int sched_setscheduler(struct task_struct *p, int policy,
3662 unsigned long flags; 3983 unsigned long flags;
3663 runqueue_t *rq; 3984 runqueue_t *rq;
3664 3985
3986 /* may grab non-irq protected spin_locks */
3987 BUG_ON(in_interrupt());
3665recheck: 3988recheck:
3666 /* double check policy once rq lock held */ 3989 /* double check policy once rq lock held */
3667 if (policy < 0) 3990 if (policy < 0)
@@ -3710,14 +4033,20 @@ recheck:
3710 if (retval) 4033 if (retval)
3711 return retval; 4034 return retval;
3712 /* 4035 /*
4036 * make sure no PI-waiters arrive (or leave) while we are
4037 * changing the priority of the task:
4038 */
4039 spin_lock_irqsave(&p->pi_lock, flags);
4040 /*
3713 * To be able to change p->policy safely, the apropriate 4041 * To be able to change p->policy safely, the apropriate
3714 * runqueue lock must be held. 4042 * runqueue lock must be held.
3715 */ 4043 */
3716 rq = task_rq_lock(p, &flags); 4044 rq = __task_rq_lock(p);
3717 /* recheck policy now with rq lock held */ 4045 /* recheck policy now with rq lock held */
3718 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { 4046 if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
3719 policy = oldpolicy = -1; 4047 policy = oldpolicy = -1;
3720 task_rq_unlock(rq, &flags); 4048 __task_rq_unlock(rq);
4049 spin_unlock_irqrestore(&p->pi_lock, flags);
3721 goto recheck; 4050 goto recheck;
3722 } 4051 }
3723 array = p->array; 4052 array = p->array;
@@ -3738,7 +4067,11 @@ recheck:
3738 } else if (TASK_PREEMPTS_CURR(p, rq)) 4067 } else if (TASK_PREEMPTS_CURR(p, rq))
3739 resched_task(rq->curr); 4068 resched_task(rq->curr);
3740 } 4069 }
3741 task_rq_unlock(rq, &flags); 4070 __task_rq_unlock(rq);
4071 spin_unlock_irqrestore(&p->pi_lock, flags);
4072
4073 rt_mutex_adjust_pi(p);
4074
3742 return 0; 4075 return 0;
3743} 4076}
3744EXPORT_SYMBOL_GPL(sched_setscheduler); 4077EXPORT_SYMBOL_GPL(sched_setscheduler);
@@ -3760,8 +4093,10 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
3760 read_unlock_irq(&tasklist_lock); 4093 read_unlock_irq(&tasklist_lock);
3761 return -ESRCH; 4094 return -ESRCH;
3762 } 4095 }
3763 retval = sched_setscheduler(p, policy, &lparam); 4096 get_task_struct(p);
3764 read_unlock_irq(&tasklist_lock); 4097 read_unlock_irq(&tasklist_lock);
4098 retval = sched_setscheduler(p, policy, &lparam);
4099 put_task_struct(p);
3765 return retval; 4100 return retval;
3766} 4101}
3767 4102
@@ -4247,7 +4582,7 @@ long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval)
4247 if (retval) 4582 if (retval)
4248 goto out_unlock; 4583 goto out_unlock;
4249 4584
4250 jiffies_to_timespec(p->policy & SCHED_FIFO ? 4585 jiffies_to_timespec(p->policy == SCHED_FIFO ?
4251 0 : task_timeslice(p), &t); 4586 0 : task_timeslice(p), &t);
4252 read_unlock(&tasklist_lock); 4587 read_unlock(&tasklist_lock);
4253 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; 4588 retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
@@ -4373,7 +4708,7 @@ void __devinit init_idle(task_t *idle, int cpu)
4373 idle->timestamp = sched_clock(); 4708 idle->timestamp = sched_clock();
4374 idle->sleep_avg = 0; 4709 idle->sleep_avg = 0;
4375 idle->array = NULL; 4710 idle->array = NULL;
4376 idle->prio = MAX_PRIO; 4711 idle->prio = idle->normal_prio = MAX_PRIO;
4377 idle->state = TASK_RUNNING; 4712 idle->state = TASK_RUNNING;
4378 idle->cpus_allowed = cpumask_of_cpu(cpu); 4713 idle->cpus_allowed = cpumask_of_cpu(cpu);
4379 set_task_cpu(idle, cpu); 4714 set_task_cpu(idle, cpu);
@@ -4469,13 +4804,16 @@ EXPORT_SYMBOL_GPL(set_cpus_allowed);
4469 * 4804 *
4470 * So we race with normal scheduler movements, but that's OK, as long 4805 * So we race with normal scheduler movements, but that's OK, as long
4471 * as the task is no longer on this CPU. 4806 * as the task is no longer on this CPU.
4807 *
4808 * Returns non-zero if task was successfully migrated.
4472 */ 4809 */
4473static void __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) 4810static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
4474{ 4811{
4475 runqueue_t *rq_dest, *rq_src; 4812 runqueue_t *rq_dest, *rq_src;
4813 int ret = 0;
4476 4814
4477 if (unlikely(cpu_is_offline(dest_cpu))) 4815 if (unlikely(cpu_is_offline(dest_cpu)))
4478 return; 4816 return ret;
4479 4817
4480 rq_src = cpu_rq(src_cpu); 4818 rq_src = cpu_rq(src_cpu);
4481 rq_dest = cpu_rq(dest_cpu); 4819 rq_dest = cpu_rq(dest_cpu);
@@ -4503,9 +4841,10 @@ static void __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
4503 if (TASK_PREEMPTS_CURR(p, rq_dest)) 4841 if (TASK_PREEMPTS_CURR(p, rq_dest))
4504 resched_task(rq_dest->curr); 4842 resched_task(rq_dest->curr);
4505 } 4843 }
4506 4844 ret = 1;
4507out: 4845out:
4508 double_rq_unlock(rq_src, rq_dest); 4846 double_rq_unlock(rq_src, rq_dest);
4847 return ret;
4509} 4848}
4510 4849
4511/* 4850/*
@@ -4575,9 +4914,12 @@ wait_to_die:
4575/* Figure out where task on dead CPU should go, use force if neccessary. */ 4914/* Figure out where task on dead CPU should go, use force if neccessary. */
4576static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *tsk) 4915static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *tsk)
4577{ 4916{
4917 runqueue_t *rq;
4918 unsigned long flags;
4578 int dest_cpu; 4919 int dest_cpu;
4579 cpumask_t mask; 4920 cpumask_t mask;
4580 4921
4922restart:
4581 /* On same node? */ 4923 /* On same node? */
4582 mask = node_to_cpumask(cpu_to_node(dead_cpu)); 4924 mask = node_to_cpumask(cpu_to_node(dead_cpu));
4583 cpus_and(mask, mask, tsk->cpus_allowed); 4925 cpus_and(mask, mask, tsk->cpus_allowed);
@@ -4589,8 +4931,10 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *tsk)
4589 4931
4590 /* No more Mr. Nice Guy. */ 4932 /* No more Mr. Nice Guy. */
4591 if (dest_cpu == NR_CPUS) { 4933 if (dest_cpu == NR_CPUS) {
4934 rq = task_rq_lock(tsk, &flags);
4592 cpus_setall(tsk->cpus_allowed); 4935 cpus_setall(tsk->cpus_allowed);
4593 dest_cpu = any_online_cpu(tsk->cpus_allowed); 4936 dest_cpu = any_online_cpu(tsk->cpus_allowed);
4937 task_rq_unlock(rq, &flags);
4594 4938
4595 /* 4939 /*
4596 * Don't tell them about moving exiting tasks or 4940 * Don't tell them about moving exiting tasks or
@@ -4602,7 +4946,8 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *tsk)
4602 "longer affine to cpu%d\n", 4946 "longer affine to cpu%d\n",
4603 tsk->pid, tsk->comm, dead_cpu); 4947 tsk->pid, tsk->comm, dead_cpu);
4604 } 4948 }
4605 __migrate_task(tsk, dead_cpu, dest_cpu); 4949 if (!__migrate_task(tsk, dead_cpu, dest_cpu))
4950 goto restart;
4606} 4951}
4607 4952
4608/* 4953/*
@@ -4729,8 +5074,9 @@ static void migrate_dead_tasks(unsigned int dead_cpu)
4729 * migration_call - callback that gets triggered when a CPU is added. 5074 * migration_call - callback that gets triggered when a CPU is added.
4730 * Here we can start up the necessary migration thread for the new CPU. 5075 * Here we can start up the necessary migration thread for the new CPU.
4731 */ 5076 */
4732static int migration_call(struct notifier_block *nfb, unsigned long action, 5077static int __cpuinit migration_call(struct notifier_block *nfb,
4733 void *hcpu) 5078 unsigned long action,
5079 void *hcpu)
4734{ 5080{
4735 int cpu = (long)hcpu; 5081 int cpu = (long)hcpu;
4736 struct task_struct *p; 5082 struct task_struct *p;
@@ -4800,7 +5146,7 @@ static int migration_call(struct notifier_block *nfb, unsigned long action,
4800/* Register at highest priority so that task migration (migrate_all_tasks) 5146/* Register at highest priority so that task migration (migrate_all_tasks)
4801 * happens before everything else. 5147 * happens before everything else.
4802 */ 5148 */
4803static struct notifier_block migration_notifier = { 5149static struct notifier_block __cpuinitdata migration_notifier = {
4804 .notifier_call = migration_call, 5150 .notifier_call = migration_call,
4805 .priority = 10 5151 .priority = 10
4806}; 5152};
@@ -5601,6 +5947,7 @@ static cpumask_t sched_domain_node_span(int node)
5601} 5947}
5602#endif 5948#endif
5603 5949
5950int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
5604/* 5951/*
5605 * At the moment, CONFIG_SCHED_SMT is never defined, but leave it in so we 5952 * At the moment, CONFIG_SCHED_SMT is never defined, but leave it in so we
5606 * can switch it on easily if needed. 5953 * can switch it on easily if needed.
@@ -5616,7 +5963,7 @@ static int cpu_to_cpu_group(int cpu)
5616 5963
5617#ifdef CONFIG_SCHED_MC 5964#ifdef CONFIG_SCHED_MC
5618static DEFINE_PER_CPU(struct sched_domain, core_domains); 5965static DEFINE_PER_CPU(struct sched_domain, core_domains);
5619static struct sched_group sched_group_core[NR_CPUS]; 5966static struct sched_group *sched_group_core_bycpu[NR_CPUS];
5620#endif 5967#endif
5621 5968
5622#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) 5969#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
@@ -5632,7 +5979,7 @@ static int cpu_to_core_group(int cpu)
5632#endif 5979#endif
5633 5980
5634static DEFINE_PER_CPU(struct sched_domain, phys_domains); 5981static DEFINE_PER_CPU(struct sched_domain, phys_domains);
5635static struct sched_group sched_group_phys[NR_CPUS]; 5982static struct sched_group *sched_group_phys_bycpu[NR_CPUS];
5636static int cpu_to_phys_group(int cpu) 5983static int cpu_to_phys_group(int cpu)
5637{ 5984{
5638#if defined(CONFIG_SCHED_MC) 5985#if defined(CONFIG_SCHED_MC)
@@ -5689,13 +6036,74 @@ next_sg:
5689} 6036}
5690#endif 6037#endif
5691 6038
6039/* Free memory allocated for various sched_group structures */
6040static void free_sched_groups(const cpumask_t *cpu_map)
6041{
6042 int cpu;
6043#ifdef CONFIG_NUMA
6044 int i;
6045
6046 for_each_cpu_mask(cpu, *cpu_map) {
6047 struct sched_group *sched_group_allnodes
6048 = sched_group_allnodes_bycpu[cpu];
6049 struct sched_group **sched_group_nodes
6050 = sched_group_nodes_bycpu[cpu];
6051
6052 if (sched_group_allnodes) {
6053 kfree(sched_group_allnodes);
6054 sched_group_allnodes_bycpu[cpu] = NULL;
6055 }
6056
6057 if (!sched_group_nodes)
6058 continue;
6059
6060 for (i = 0; i < MAX_NUMNODES; i++) {
6061 cpumask_t nodemask = node_to_cpumask(i);
6062 struct sched_group *oldsg, *sg = sched_group_nodes[i];
6063
6064 cpus_and(nodemask, nodemask, *cpu_map);
6065 if (cpus_empty(nodemask))
6066 continue;
6067
6068 if (sg == NULL)
6069 continue;
6070 sg = sg->next;
6071next_sg:
6072 oldsg = sg;
6073 sg = sg->next;
6074 kfree(oldsg);
6075 if (oldsg != sched_group_nodes[i])
6076 goto next_sg;
6077 }
6078 kfree(sched_group_nodes);
6079 sched_group_nodes_bycpu[cpu] = NULL;
6080 }
6081#endif
6082 for_each_cpu_mask(cpu, *cpu_map) {
6083 if (sched_group_phys_bycpu[cpu]) {
6084 kfree(sched_group_phys_bycpu[cpu]);
6085 sched_group_phys_bycpu[cpu] = NULL;
6086 }
6087#ifdef CONFIG_SCHED_MC
6088 if (sched_group_core_bycpu[cpu]) {
6089 kfree(sched_group_core_bycpu[cpu]);
6090 sched_group_core_bycpu[cpu] = NULL;
6091 }
6092#endif
6093 }
6094}
6095
5692/* 6096/*
5693 * Build sched domains for a given set of cpus and attach the sched domains 6097 * Build sched domains for a given set of cpus and attach the sched domains
5694 * to the individual cpus 6098 * to the individual cpus
5695 */ 6099 */
5696void build_sched_domains(const cpumask_t *cpu_map) 6100static int build_sched_domains(const cpumask_t *cpu_map)
5697{ 6101{
5698 int i; 6102 int i;
6103 struct sched_group *sched_group_phys = NULL;
6104#ifdef CONFIG_SCHED_MC
6105 struct sched_group *sched_group_core = NULL;
6106#endif
5699#ifdef CONFIG_NUMA 6107#ifdef CONFIG_NUMA
5700 struct sched_group **sched_group_nodes = NULL; 6108 struct sched_group **sched_group_nodes = NULL;
5701 struct sched_group *sched_group_allnodes = NULL; 6109 struct sched_group *sched_group_allnodes = NULL;
@@ -5703,11 +6111,11 @@ void build_sched_domains(const cpumask_t *cpu_map)
5703 /* 6111 /*
5704 * Allocate the per-node list of sched groups 6112 * Allocate the per-node list of sched groups
5705 */ 6113 */
5706 sched_group_nodes = kmalloc(sizeof(struct sched_group*)*MAX_NUMNODES, 6114 sched_group_nodes = kzalloc(sizeof(struct sched_group*)*MAX_NUMNODES,
5707 GFP_ATOMIC); 6115 GFP_KERNEL);
5708 if (!sched_group_nodes) { 6116 if (!sched_group_nodes) {
5709 printk(KERN_WARNING "Can not alloc sched group node list\n"); 6117 printk(KERN_WARNING "Can not alloc sched group node list\n");
5710 return; 6118 return -ENOMEM;
5711 } 6119 }
5712 sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes; 6120 sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes;
5713#endif 6121#endif
@@ -5733,7 +6141,7 @@ void build_sched_domains(const cpumask_t *cpu_map)
5733 if (!sched_group_allnodes) { 6141 if (!sched_group_allnodes) {
5734 printk(KERN_WARNING 6142 printk(KERN_WARNING
5735 "Can not alloc allnodes sched group\n"); 6143 "Can not alloc allnodes sched group\n");
5736 break; 6144 goto error;
5737 } 6145 }
5738 sched_group_allnodes_bycpu[i] 6146 sched_group_allnodes_bycpu[i]
5739 = sched_group_allnodes; 6147 = sched_group_allnodes;
@@ -5754,6 +6162,18 @@ void build_sched_domains(const cpumask_t *cpu_map)
5754 cpus_and(sd->span, sd->span, *cpu_map); 6162 cpus_and(sd->span, sd->span, *cpu_map);
5755#endif 6163#endif
5756 6164
6165 if (!sched_group_phys) {
6166 sched_group_phys
6167 = kmalloc(sizeof(struct sched_group) * NR_CPUS,
6168 GFP_KERNEL);
6169 if (!sched_group_phys) {
6170 printk (KERN_WARNING "Can not alloc phys sched"
6171 "group\n");
6172 goto error;
6173 }
6174 sched_group_phys_bycpu[i] = sched_group_phys;
6175 }
6176
5757 p = sd; 6177 p = sd;
5758 sd = &per_cpu(phys_domains, i); 6178 sd = &per_cpu(phys_domains, i);
5759 group = cpu_to_phys_group(i); 6179 group = cpu_to_phys_group(i);
@@ -5763,6 +6183,18 @@ void build_sched_domains(const cpumask_t *cpu_map)
5763 sd->groups = &sched_group_phys[group]; 6183 sd->groups = &sched_group_phys[group];
5764 6184
5765#ifdef CONFIG_SCHED_MC 6185#ifdef CONFIG_SCHED_MC
6186 if (!sched_group_core) {
6187 sched_group_core
6188 = kmalloc(sizeof(struct sched_group) * NR_CPUS,
6189 GFP_KERNEL);
6190 if (!sched_group_core) {
6191 printk (KERN_WARNING "Can not alloc core sched"
6192 "group\n");
6193 goto error;
6194 }
6195 sched_group_core_bycpu[i] = sched_group_core;
6196 }
6197
5766 p = sd; 6198 p = sd;
5767 sd = &per_cpu(core_domains, i); 6199 sd = &per_cpu(core_domains, i);
5768 group = cpu_to_core_group(i); 6200 group = cpu_to_core_group(i);
@@ -5846,24 +6278,21 @@ void build_sched_domains(const cpumask_t *cpu_map)
5846 domainspan = sched_domain_node_span(i); 6278 domainspan = sched_domain_node_span(i);
5847 cpus_and(domainspan, domainspan, *cpu_map); 6279 cpus_and(domainspan, domainspan, *cpu_map);
5848 6280
5849 sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL); 6281 sg = kmalloc_node(sizeof(struct sched_group), GFP_KERNEL, i);
6282 if (!sg) {
6283 printk(KERN_WARNING "Can not alloc domain group for "
6284 "node %d\n", i);
6285 goto error;
6286 }
5850 sched_group_nodes[i] = sg; 6287 sched_group_nodes[i] = sg;
5851 for_each_cpu_mask(j, nodemask) { 6288 for_each_cpu_mask(j, nodemask) {
5852 struct sched_domain *sd; 6289 struct sched_domain *sd;
5853 sd = &per_cpu(node_domains, j); 6290 sd = &per_cpu(node_domains, j);
5854 sd->groups = sg; 6291 sd->groups = sg;
5855 if (sd->groups == NULL) {
5856 /* Turn off balancing if we have no groups */
5857 sd->flags = 0;
5858 }
5859 }
5860 if (!sg) {
5861 printk(KERN_WARNING
5862 "Can not alloc domain group for node %d\n", i);
5863 continue;
5864 } 6292 }
5865 sg->cpu_power = 0; 6293 sg->cpu_power = 0;
5866 sg->cpumask = nodemask; 6294 sg->cpumask = nodemask;
6295 sg->next = sg;
5867 cpus_or(covered, covered, nodemask); 6296 cpus_or(covered, covered, nodemask);
5868 prev = sg; 6297 prev = sg;
5869 6298
@@ -5882,54 +6311,90 @@ void build_sched_domains(const cpumask_t *cpu_map)
5882 if (cpus_empty(tmp)) 6311 if (cpus_empty(tmp))
5883 continue; 6312 continue;
5884 6313
5885 sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL); 6314 sg = kmalloc_node(sizeof(struct sched_group),
6315 GFP_KERNEL, i);
5886 if (!sg) { 6316 if (!sg) {
5887 printk(KERN_WARNING 6317 printk(KERN_WARNING
5888 "Can not alloc domain group for node %d\n", j); 6318 "Can not alloc domain group for node %d\n", j);
5889 break; 6319 goto error;
5890 } 6320 }
5891 sg->cpu_power = 0; 6321 sg->cpu_power = 0;
5892 sg->cpumask = tmp; 6322 sg->cpumask = tmp;
6323 sg->next = prev->next;
5893 cpus_or(covered, covered, tmp); 6324 cpus_or(covered, covered, tmp);
5894 prev->next = sg; 6325 prev->next = sg;
5895 prev = sg; 6326 prev = sg;
5896 } 6327 }
5897 prev->next = sched_group_nodes[i];
5898 } 6328 }
5899#endif 6329#endif
5900 6330
5901 /* Calculate CPU power for physical packages and nodes */ 6331 /* Calculate CPU power for physical packages and nodes */
6332#ifdef CONFIG_SCHED_SMT
5902 for_each_cpu_mask(i, *cpu_map) { 6333 for_each_cpu_mask(i, *cpu_map) {
5903 int power;
5904 struct sched_domain *sd; 6334 struct sched_domain *sd;
5905#ifdef CONFIG_SCHED_SMT
5906 sd = &per_cpu(cpu_domains, i); 6335 sd = &per_cpu(cpu_domains, i);
5907 power = SCHED_LOAD_SCALE; 6336 sd->groups->cpu_power = SCHED_LOAD_SCALE;
5908 sd->groups->cpu_power = power; 6337 }
5909#endif 6338#endif
5910#ifdef CONFIG_SCHED_MC 6339#ifdef CONFIG_SCHED_MC
6340 for_each_cpu_mask(i, *cpu_map) {
6341 int power;
6342 struct sched_domain *sd;
5911 sd = &per_cpu(core_domains, i); 6343 sd = &per_cpu(core_domains, i);
5912 power = SCHED_LOAD_SCALE + (cpus_weight(sd->groups->cpumask)-1) 6344 if (sched_smt_power_savings)
6345 power = SCHED_LOAD_SCALE * cpus_weight(sd->groups->cpumask);
6346 else
6347 power = SCHED_LOAD_SCALE + (cpus_weight(sd->groups->cpumask)-1)
5913 * SCHED_LOAD_SCALE / 10; 6348 * SCHED_LOAD_SCALE / 10;
5914 sd->groups->cpu_power = power; 6349 sd->groups->cpu_power = power;
6350 }
6351#endif
5915 6352
6353 for_each_cpu_mask(i, *cpu_map) {
6354 struct sched_domain *sd;
6355#ifdef CONFIG_SCHED_MC
5916 sd = &per_cpu(phys_domains, i); 6356 sd = &per_cpu(phys_domains, i);
6357 if (i != first_cpu(sd->groups->cpumask))
6358 continue;
5917 6359
5918 /* 6360 sd->groups->cpu_power = 0;
5919 * This has to be < 2 * SCHED_LOAD_SCALE 6361 if (sched_mc_power_savings || sched_smt_power_savings) {
5920 * Lets keep it SCHED_LOAD_SCALE, so that 6362 int j;
5921 * while calculating NUMA group's cpu_power 6363
5922 * we can simply do 6364 for_each_cpu_mask(j, sd->groups->cpumask) {
5923 * numa_group->cpu_power += phys_group->cpu_power; 6365 struct sched_domain *sd1;
5924 * 6366 sd1 = &per_cpu(core_domains, j);
5925 * See "only add power once for each physical pkg" 6367 /*
5926 * comment below 6368 * for each core we will add once
5927 */ 6369 * to the group in physical domain
5928 sd->groups->cpu_power = SCHED_LOAD_SCALE; 6370 */
6371 if (j != first_cpu(sd1->groups->cpumask))
6372 continue;
6373
6374 if (sched_smt_power_savings)
6375 sd->groups->cpu_power += sd1->groups->cpu_power;
6376 else
6377 sd->groups->cpu_power += SCHED_LOAD_SCALE;
6378 }
6379 } else
6380 /*
6381 * This has to be < 2 * SCHED_LOAD_SCALE
6382 * Lets keep it SCHED_LOAD_SCALE, so that
6383 * while calculating NUMA group's cpu_power
6384 * we can simply do
6385 * numa_group->cpu_power += phys_group->cpu_power;
6386 *
6387 * See "only add power once for each physical pkg"
6388 * comment below
6389 */
6390 sd->groups->cpu_power = SCHED_LOAD_SCALE;
5929#else 6391#else
6392 int power;
5930 sd = &per_cpu(phys_domains, i); 6393 sd = &per_cpu(phys_domains, i);
5931 power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE * 6394 if (sched_smt_power_savings)
5932 (cpus_weight(sd->groups->cpumask)-1) / 10; 6395 power = SCHED_LOAD_SCALE * cpus_weight(sd->groups->cpumask);
6396 else
6397 power = SCHED_LOAD_SCALE;
5933 sd->groups->cpu_power = power; 6398 sd->groups->cpu_power = power;
5934#endif 6399#endif
5935 } 6400 }
@@ -5957,13 +6422,20 @@ void build_sched_domains(const cpumask_t *cpu_map)
5957 * Tune cache-hot values: 6422 * Tune cache-hot values:
5958 */ 6423 */
5959 calibrate_migration_costs(cpu_map); 6424 calibrate_migration_costs(cpu_map);
6425
6426 return 0;
6427
6428error:
6429 free_sched_groups(cpu_map);
6430 return -ENOMEM;
5960} 6431}
5961/* 6432/*
5962 * Set up scheduler domains and groups. Callers must hold the hotplug lock. 6433 * Set up scheduler domains and groups. Callers must hold the hotplug lock.
5963 */ 6434 */
5964static void arch_init_sched_domains(const cpumask_t *cpu_map) 6435static int arch_init_sched_domains(const cpumask_t *cpu_map)
5965{ 6436{
5966 cpumask_t cpu_default_map; 6437 cpumask_t cpu_default_map;
6438 int err;
5967 6439
5968 /* 6440 /*
5969 * Setup mask for cpus without special case scheduling requirements. 6441 * Setup mask for cpus without special case scheduling requirements.
@@ -5972,51 +6444,14 @@ static void arch_init_sched_domains(const cpumask_t *cpu_map)
5972 */ 6444 */
5973 cpus_andnot(cpu_default_map, *cpu_map, cpu_isolated_map); 6445 cpus_andnot(cpu_default_map, *cpu_map, cpu_isolated_map);
5974 6446
5975 build_sched_domains(&cpu_default_map); 6447 err = build_sched_domains(&cpu_default_map);
6448
6449 return err;
5976} 6450}
5977 6451
5978static void arch_destroy_sched_domains(const cpumask_t *cpu_map) 6452static void arch_destroy_sched_domains(const cpumask_t *cpu_map)
5979{ 6453{
5980#ifdef CONFIG_NUMA 6454 free_sched_groups(cpu_map);
5981 int i;
5982 int cpu;
5983
5984 for_each_cpu_mask(cpu, *cpu_map) {
5985 struct sched_group *sched_group_allnodes
5986 = sched_group_allnodes_bycpu[cpu];
5987 struct sched_group **sched_group_nodes
5988 = sched_group_nodes_bycpu[cpu];
5989
5990 if (sched_group_allnodes) {
5991 kfree(sched_group_allnodes);
5992 sched_group_allnodes_bycpu[cpu] = NULL;
5993 }
5994
5995 if (!sched_group_nodes)
5996 continue;
5997
5998 for (i = 0; i < MAX_NUMNODES; i++) {
5999 cpumask_t nodemask = node_to_cpumask(i);
6000 struct sched_group *oldsg, *sg = sched_group_nodes[i];
6001
6002 cpus_and(nodemask, nodemask, *cpu_map);
6003 if (cpus_empty(nodemask))
6004 continue;
6005
6006 if (sg == NULL)
6007 continue;
6008 sg = sg->next;
6009next_sg:
6010 oldsg = sg;
6011 sg = sg->next;
6012 kfree(oldsg);
6013 if (oldsg != sched_group_nodes[i])
6014 goto next_sg;
6015 }
6016 kfree(sched_group_nodes);
6017 sched_group_nodes_bycpu[cpu] = NULL;
6018 }
6019#endif
6020} 6455}
6021 6456
6022/* 6457/*
@@ -6041,9 +6476,10 @@ static void detach_destroy_domains(const cpumask_t *cpu_map)
6041 * correct sched domains 6476 * correct sched domains
6042 * Call with hotplug lock held 6477 * Call with hotplug lock held
6043 */ 6478 */
6044void partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2) 6479int partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2)
6045{ 6480{
6046 cpumask_t change_map; 6481 cpumask_t change_map;
6482 int err = 0;
6047 6483
6048 cpus_and(*partition1, *partition1, cpu_online_map); 6484 cpus_and(*partition1, *partition1, cpu_online_map);
6049 cpus_and(*partition2, *partition2, cpu_online_map); 6485 cpus_and(*partition2, *partition2, cpu_online_map);
@@ -6052,10 +6488,86 @@ void partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2)
6052 /* Detach sched domains from all of the affected cpus */ 6488 /* Detach sched domains from all of the affected cpus */
6053 detach_destroy_domains(&change_map); 6489 detach_destroy_domains(&change_map);
6054 if (!cpus_empty(*partition1)) 6490 if (!cpus_empty(*partition1))
6055 build_sched_domains(partition1); 6491 err = build_sched_domains(partition1);
6056 if (!cpus_empty(*partition2)) 6492 if (!err && !cpus_empty(*partition2))
6057 build_sched_domains(partition2); 6493 err = build_sched_domains(partition2);
6494
6495 return err;
6496}
6497
6498#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
6499int arch_reinit_sched_domains(void)
6500{
6501 int err;
6502
6503 lock_cpu_hotplug();
6504 detach_destroy_domains(&cpu_online_map);
6505 err = arch_init_sched_domains(&cpu_online_map);
6506 unlock_cpu_hotplug();
6507
6508 return err;
6509}
6510
6511static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
6512{
6513 int ret;
6514
6515 if (buf[0] != '0' && buf[0] != '1')
6516 return -EINVAL;
6517
6518 if (smt)
6519 sched_smt_power_savings = (buf[0] == '1');
6520 else
6521 sched_mc_power_savings = (buf[0] == '1');
6522
6523 ret = arch_reinit_sched_domains();
6524
6525 return ret ? ret : count;
6526}
6527
6528int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
6529{
6530 int err = 0;
6531#ifdef CONFIG_SCHED_SMT
6532 if (smt_capable())
6533 err = sysfs_create_file(&cls->kset.kobj,
6534 &attr_sched_smt_power_savings.attr);
6535#endif
6536#ifdef CONFIG_SCHED_MC
6537 if (!err && mc_capable())
6538 err = sysfs_create_file(&cls->kset.kobj,
6539 &attr_sched_mc_power_savings.attr);
6540#endif
6541 return err;
6542}
6543#endif
6544
6545#ifdef CONFIG_SCHED_MC
6546static ssize_t sched_mc_power_savings_show(struct sys_device *dev, char *page)
6547{
6548 return sprintf(page, "%u\n", sched_mc_power_savings);
6549}
6550static ssize_t sched_mc_power_savings_store(struct sys_device *dev, const char *buf, size_t count)
6551{
6552 return sched_power_savings_store(buf, count, 0);
6553}
6554SYSDEV_ATTR(sched_mc_power_savings, 0644, sched_mc_power_savings_show,
6555 sched_mc_power_savings_store);
6556#endif
6557
6558#ifdef CONFIG_SCHED_SMT
6559static ssize_t sched_smt_power_savings_show(struct sys_device *dev, char *page)
6560{
6561 return sprintf(page, "%u\n", sched_smt_power_savings);
6562}
6563static ssize_t sched_smt_power_savings_store(struct sys_device *dev, const char *buf, size_t count)
6564{
6565 return sched_power_savings_store(buf, count, 1);
6058} 6566}
6567SYSDEV_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_show,
6568 sched_smt_power_savings_store);
6569#endif
6570
6059 6571
6060#ifdef CONFIG_HOTPLUG_CPU 6572#ifdef CONFIG_HOTPLUG_CPU
6061/* 6573/*
@@ -6138,7 +6650,6 @@ void __init sched_init(void)
6138 rq->push_cpu = 0; 6650 rq->push_cpu = 0;
6139 rq->migration_thread = NULL; 6651 rq->migration_thread = NULL;
6140 INIT_LIST_HEAD(&rq->migration_queue); 6652 INIT_LIST_HEAD(&rq->migration_queue);
6141 rq->cpu = i;
6142#endif 6653#endif
6143 atomic_set(&rq->nr_iowait, 0); 6654 atomic_set(&rq->nr_iowait, 0);
6144 6655
@@ -6153,6 +6664,7 @@ void __init sched_init(void)
6153 } 6664 }
6154 } 6665 }
6155 6666
6667 set_load_weight(&init_task);
6156 /* 6668 /*
6157 * The boot idle thread does lazy MMU switching as well: 6669 * The boot idle thread does lazy MMU switching as well:
6158 */ 6670 */
@@ -6199,11 +6711,12 @@ void normalize_rt_tasks(void)
6199 runqueue_t *rq; 6711 runqueue_t *rq;
6200 6712
6201 read_lock_irq(&tasklist_lock); 6713 read_lock_irq(&tasklist_lock);
6202 for_each_process (p) { 6714 for_each_process(p) {
6203 if (!rt_task(p)) 6715 if (!rt_task(p))
6204 continue; 6716 continue;
6205 6717
6206 rq = task_rq_lock(p, &flags); 6718 spin_lock_irqsave(&p->pi_lock, flags);
6719 rq = __task_rq_lock(p);
6207 6720
6208 array = p->array; 6721 array = p->array;
6209 if (array) 6722 if (array)
@@ -6214,7 +6727,8 @@ void normalize_rt_tasks(void)
6214 resched_task(rq->curr); 6727 resched_task(rq->curr);
6215 } 6728 }
6216 6729
6217 task_rq_unlock(rq, &flags); 6730 __task_rq_unlock(rq);
6731 spin_unlock_irqrestore(&p->pi_lock, flags);
6218 } 6732 }
6219 read_unlock_irq(&tasklist_lock); 6733 read_unlock_irq(&tasklist_lock);
6220} 6734}
diff --git a/kernel/signal.c b/kernel/signal.c
index 1b3c921737e2..52adf53929f6 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1531,6 +1531,35 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, int why)
1531 spin_unlock_irqrestore(&sighand->siglock, flags); 1531 spin_unlock_irqrestore(&sighand->siglock, flags);
1532} 1532}
1533 1533
1534static inline int may_ptrace_stop(void)
1535{
1536 if (!likely(current->ptrace & PT_PTRACED))
1537 return 0;
1538
1539 if (unlikely(current->parent == current->real_parent &&
1540 (current->ptrace & PT_ATTACHED)))
1541 return 0;
1542
1543 if (unlikely(current->signal == current->parent->signal) &&
1544 unlikely(current->signal->flags & SIGNAL_GROUP_EXIT))
1545 return 0;
1546
1547 /*
1548 * Are we in the middle of do_coredump?
1549 * If so, and our tracer is also part of the coredump, stopping
1550 * is a deadlock situation, and pointless because our tracer
1551 * is dead, so don't allow us to stop.
1552 * If SIGKILL was already sent before the caller unlocked
1553 * ->siglock we must see ->core_waiters != 0. Otherwise it
1554 * is safe to enter schedule().
1555 */
1556 if (unlikely(current->mm->core_waiters) &&
1557 unlikely(current->mm == current->parent->mm))
1558 return 0;
1559
1560 return 1;
1561}
1562
1534/* 1563/*
1535 * This must be called with current->sighand->siglock held. 1564 * This must be called with current->sighand->siglock held.
1536 * 1565 *
@@ -1559,11 +1588,7 @@ static void ptrace_stop(int exit_code, int nostop_code, siginfo_t *info)
1559 spin_unlock_irq(&current->sighand->siglock); 1588 spin_unlock_irq(&current->sighand->siglock);
1560 try_to_freeze(); 1589 try_to_freeze();
1561 read_lock(&tasklist_lock); 1590 read_lock(&tasklist_lock);
1562 if (likely(current->ptrace & PT_PTRACED) && 1591 if (may_ptrace_stop()) {
1563 likely(current->parent != current->real_parent ||
1564 !(current->ptrace & PT_ATTACHED)) &&
1565 (likely(current->parent->signal != current->signal) ||
1566 !unlikely(current->signal->flags & SIGNAL_GROUP_EXIT))) {
1567 do_notify_parent_cldstop(current, CLD_TRAPPED); 1592 do_notify_parent_cldstop(current, CLD_TRAPPED);
1568 read_unlock(&tasklist_lock); 1593 read_unlock(&tasklist_lock);
1569 schedule(); 1594 schedule();
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 9e2f1c6e73d7..8f03e3b89b55 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -446,7 +446,7 @@ static void takeover_tasklets(unsigned int cpu)
446} 446}
447#endif /* CONFIG_HOTPLUG_CPU */ 447#endif /* CONFIG_HOTPLUG_CPU */
448 448
449static int cpu_callback(struct notifier_block *nfb, 449static int __devinit cpu_callback(struct notifier_block *nfb,
450 unsigned long action, 450 unsigned long action,
451 void *hcpu) 451 void *hcpu)
452{ 452{
@@ -486,7 +486,7 @@ static int cpu_callback(struct notifier_block *nfb,
486 return NOTIFY_OK; 486 return NOTIFY_OK;
487} 487}
488 488
489static struct notifier_block cpu_nfb = { 489static struct notifier_block __devinitdata cpu_nfb = {
490 .notifier_call = cpu_callback 490 .notifier_call = cpu_callback
491}; 491};
492 492
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index b5c3b94e01ce..6b76caa22981 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -104,7 +104,7 @@ static int watchdog(void * __bind_cpu)
104/* 104/*
105 * Create/destroy watchdog threads as CPUs come and go: 105 * Create/destroy watchdog threads as CPUs come and go:
106 */ 106 */
107static int 107static int __devinit
108cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) 108cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
109{ 109{
110 int hotcpu = (unsigned long)hcpu; 110 int hotcpu = (unsigned long)hcpu;
@@ -142,7 +142,7 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
142 return NOTIFY_OK; 142 return NOTIFY_OK;
143} 143}
144 144
145static struct notifier_block cpu_nfb = { 145static struct notifier_block __devinitdata cpu_nfb = {
146 .notifier_call = cpu_callback 146 .notifier_call = cpu_callback
147}; 147};
148 148
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 2c0e65819448..93a2c5398648 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -73,6 +73,7 @@ extern int printk_ratelimit_burst;
73extern int pid_max_min, pid_max_max; 73extern int pid_max_min, pid_max_max;
74extern int sysctl_drop_caches; 74extern int sysctl_drop_caches;
75extern int percpu_pagelist_fraction; 75extern int percpu_pagelist_fraction;
76extern int compat_log;
76 77
77#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) 78#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
78int unknown_nmi_panic; 79int unknown_nmi_panic;
@@ -132,6 +133,10 @@ extern int acct_parm[];
132extern int no_unaligned_warning; 133extern int no_unaligned_warning;
133#endif 134#endif
134 135
136#ifdef CONFIG_RT_MUTEXES
137extern int max_lock_depth;
138#endif
139
135static int parse_table(int __user *, int, void __user *, size_t __user *, void __user *, size_t, 140static int parse_table(int __user *, int, void __user *, size_t __user *, void __user *, size_t,
136 ctl_table *, void **); 141 ctl_table *, void **);
137static int proc_doutsstring(ctl_table *table, int write, struct file *filp, 142static int proc_doutsstring(ctl_table *table, int write, struct file *filp,
@@ -677,6 +682,27 @@ static ctl_table kern_table[] = {
677 .proc_handler = &proc_dointvec, 682 .proc_handler = &proc_dointvec,
678 }, 683 },
679#endif 684#endif
685#ifdef CONFIG_COMPAT
686 {
687 .ctl_name = KERN_COMPAT_LOG,
688 .procname = "compat-log",
689 .data = &compat_log,
690 .maxlen = sizeof (int),
691 .mode = 0644,
692 .proc_handler = &proc_dointvec,
693 },
694#endif
695#ifdef CONFIG_RT_MUTEXES
696 {
697 .ctl_name = KERN_MAX_LOCK_DEPTH,
698 .procname = "max_lock_depth",
699 .data = &max_lock_depth,
700 .maxlen = sizeof(int),
701 .mode = 0644,
702 .proc_handler = &proc_dointvec,
703 },
704#endif
705
680 { .ctl_name = 0 } 706 { .ctl_name = 0 }
681}; 707};
682 708
@@ -917,6 +943,18 @@ static ctl_table vm_table[] = {
917 .strategy = &sysctl_jiffies, 943 .strategy = &sysctl_jiffies,
918 }, 944 },
919#endif 945#endif
946#ifdef CONFIG_X86_32
947 {
948 .ctl_name = VM_VDSO_ENABLED,
949 .procname = "vdso_enabled",
950 .data = &vdso_enabled,
951 .maxlen = sizeof(vdso_enabled),
952 .mode = 0644,
953 .proc_handler = &proc_dointvec,
954 .strategy = &sysctl_intvec,
955 .extra1 = &zero,
956 },
957#endif
920 { .ctl_name = 0 } 958 { .ctl_name = 0 }
921}; 959};
922 960
diff --git a/kernel/time.c b/kernel/time.c
index b00ddc71cedb..5bd489747643 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -523,6 +523,7 @@ EXPORT_SYMBOL(do_gettimeofday);
523 523
524 524
525#else 525#else
526#ifndef CONFIG_GENERIC_TIME
526/* 527/*
527 * Simulate gettimeofday using do_gettimeofday which only allows a timeval 528 * Simulate gettimeofday using do_gettimeofday which only allows a timeval
528 * and therefore only yields usec accuracy 529 * and therefore only yields usec accuracy
@@ -537,6 +538,7 @@ void getnstimeofday(struct timespec *tv)
537} 538}
538EXPORT_SYMBOL_GPL(getnstimeofday); 539EXPORT_SYMBOL_GPL(getnstimeofday);
539#endif 540#endif
541#endif
540 542
541/* Converts Gregorian date to seconds since 1970-01-01 00:00:00. 543/* Converts Gregorian date to seconds since 1970-01-01 00:00:00.
542 * Assumes input in normal date format, i.e. 1980-12-31 23:59:59 544 * Assumes input in normal date format, i.e. 1980-12-31 23:59:59
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
new file mode 100644
index 000000000000..e1dfd8e86cce
--- /dev/null
+++ b/kernel/time/Makefile
@@ -0,0 +1 @@
obj-y += clocksource.o jiffies.o
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
new file mode 100644
index 000000000000..74eca5939bd9
--- /dev/null
+++ b/kernel/time/clocksource.c
@@ -0,0 +1,349 @@
1/*
2 * linux/kernel/time/clocksource.c
3 *
4 * This file contains the functions which manage clocksource drivers.
5 *
6 * Copyright (C) 2004, 2005 IBM, John Stultz (johnstul@us.ibm.com)
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
21 *
22 * TODO WishList:
23 * o Allow clocksource drivers to be unregistered
24 * o get rid of clocksource_jiffies extern
25 */
26
27#include <linux/clocksource.h>
28#include <linux/sysdev.h>
29#include <linux/init.h>
30#include <linux/module.h>
31
32/* XXX - Would like a better way for initializing curr_clocksource */
33extern struct clocksource clocksource_jiffies;
34
35/*[Clocksource internal variables]---------
36 * curr_clocksource:
37 * currently selected clocksource. Initialized to clocksource_jiffies.
38 * next_clocksource:
39 * pending next selected clocksource.
40 * clocksource_list:
41 * linked list with the registered clocksources
42 * clocksource_lock:
43 * protects manipulations to curr_clocksource and next_clocksource
44 * and the clocksource_list
45 * override_name:
46 * Name of the user-specified clocksource.
47 */
48static struct clocksource *curr_clocksource = &clocksource_jiffies;
49static struct clocksource *next_clocksource;
50static LIST_HEAD(clocksource_list);
51static DEFINE_SPINLOCK(clocksource_lock);
52static char override_name[32];
53static int finished_booting;
54
55/* clocksource_done_booting - Called near the end of bootup
56 *
57 * Hack to avoid lots of clocksource churn at boot time
58 */
59static int __init clocksource_done_booting(void)
60{
61 finished_booting = 1;
62 return 0;
63}
64
65late_initcall(clocksource_done_booting);
66
67/**
68 * clocksource_get_next - Returns the selected clocksource
69 *
70 */
71struct clocksource *clocksource_get_next(void)
72{
73 unsigned long flags;
74
75 spin_lock_irqsave(&clocksource_lock, flags);
76 if (next_clocksource && finished_booting) {
77 curr_clocksource = next_clocksource;
78 next_clocksource = NULL;
79 }
80 spin_unlock_irqrestore(&clocksource_lock, flags);
81
82 return curr_clocksource;
83}
84
85/**
86 * select_clocksource - Finds the best registered clocksource.
87 *
88 * Private function. Must hold clocksource_lock when called.
89 *
90 * Looks through the list of registered clocksources, returning
91 * the one with the highest rating value. If there is a clocksource
92 * name that matches the override string, it returns that clocksource.
93 */
94static struct clocksource *select_clocksource(void)
95{
96 struct clocksource *best = NULL;
97 struct list_head *tmp;
98
99 list_for_each(tmp, &clocksource_list) {
100 struct clocksource *src;
101
102 src = list_entry(tmp, struct clocksource, list);
103 if (!best)
104 best = src;
105
106 /* check for override: */
107 if (strlen(src->name) == strlen(override_name) &&
108 !strcmp(src->name, override_name)) {
109 best = src;
110 break;
111 }
112 /* pick the highest rating: */
113 if (src->rating > best->rating)
114 best = src;
115 }
116
117 return best;
118}
119
120/**
121 * is_registered_source - Checks if clocksource is registered
122 * @c: pointer to a clocksource
123 *
124 * Private helper function. Must hold clocksource_lock when called.
125 *
126 * Returns one if the clocksource is already registered, zero otherwise.
127 */
128static int is_registered_source(struct clocksource *c)
129{
130 int len = strlen(c->name);
131 struct list_head *tmp;
132
133 list_for_each(tmp, &clocksource_list) {
134 struct clocksource *src;
135
136 src = list_entry(tmp, struct clocksource, list);
137 if (strlen(src->name) == len && !strcmp(src->name, c->name))
138 return 1;
139 }
140
141 return 0;
142}
143
144/**
145 * clocksource_register - Used to install new clocksources
146 * @c: clocksource to be registered
147 *
148 * Returns -EBUSY if registration fails, zero otherwise.
149 */
150int clocksource_register(struct clocksource *c)
151{
152 int ret = 0;
153 unsigned long flags;
154
155 spin_lock_irqsave(&clocksource_lock, flags);
156 /* check if clocksource is already registered */
157 if (is_registered_source(c)) {
158 printk("register_clocksource: Cannot register %s. "
159 "Already registered!", c->name);
160 ret = -EBUSY;
161 } else {
162 /* register it */
163 list_add(&c->list, &clocksource_list);
164 /* scan the registered clocksources, and pick the best one */
165 next_clocksource = select_clocksource();
166 }
167 spin_unlock_irqrestore(&clocksource_lock, flags);
168 return ret;
169}
170EXPORT_SYMBOL(clocksource_register);
171
172/**
173 * clocksource_reselect - Rescan list for next clocksource
174 *
175 * A quick helper function to be used if a clocksource changes its
176 * rating. Forces the clocksource list to be re-scanned for the best
177 * clocksource.
178 */
179void clocksource_reselect(void)
180{
181 unsigned long flags;
182
183 spin_lock_irqsave(&clocksource_lock, flags);
184 next_clocksource = select_clocksource();
185 spin_unlock_irqrestore(&clocksource_lock, flags);
186}
187EXPORT_SYMBOL(clocksource_reselect);
188
189/**
190 * sysfs_show_current_clocksources - sysfs interface for current clocksource
191 * @dev: unused
192 * @buf: char buffer to be filled with clocksource list
193 *
194 * Provides sysfs interface for listing current clocksource.
195 */
196static ssize_t
197sysfs_show_current_clocksources(struct sys_device *dev, char *buf)
198{
199 char *curr = buf;
200
201 spin_lock_irq(&clocksource_lock);
202 curr += sprintf(curr, "%s ", curr_clocksource->name);
203 spin_unlock_irq(&clocksource_lock);
204
205 curr += sprintf(curr, "\n");
206
207 return curr - buf;
208}
209
210/**
211 * sysfs_override_clocksource - interface for manually overriding clocksource
212 * @dev: unused
213 * @buf: name of override clocksource
214 * @count: length of buffer
215 *
216 * Takes input from sysfs interface for manually overriding the default
217 * clocksource selection.
218 */
219static ssize_t sysfs_override_clocksource(struct sys_device *dev,
220 const char *buf, size_t count)
221{
222 size_t ret = count;
223 /* strings from sysfs write are not 0 terminated! */
224 if (count >= sizeof(override_name))
225 return -EINVAL;
226
227 /* strip off \n: */
228 if (buf[count-1] == '\n')
229 count--;
230 if (count < 1)
231 return -EINVAL;
232
233 spin_lock_irq(&clocksource_lock);
234
235 /* copy the name given: */
236 memcpy(override_name, buf, count);
237 override_name[count] = 0;
238
239 /* try to select it: */
240 next_clocksource = select_clocksource();
241
242 spin_unlock_irq(&clocksource_lock);
243
244 return ret;
245}
246
247/**
248 * sysfs_show_available_clocksources - sysfs interface for listing clocksource
249 * @dev: unused
250 * @buf: char buffer to be filled with clocksource list
251 *
252 * Provides sysfs interface for listing registered clocksources
253 */
254static ssize_t
255sysfs_show_available_clocksources(struct sys_device *dev, char *buf)
256{
257 struct list_head *tmp;
258 char *curr = buf;
259
260 spin_lock_irq(&clocksource_lock);
261 list_for_each(tmp, &clocksource_list) {
262 struct clocksource *src;
263
264 src = list_entry(tmp, struct clocksource, list);
265 curr += sprintf(curr, "%s ", src->name);
266 }
267 spin_unlock_irq(&clocksource_lock);
268
269 curr += sprintf(curr, "\n");
270
271 return curr - buf;
272}
273
274/*
275 * Sysfs setup bits:
276 */
277static SYSDEV_ATTR(current_clocksource, 0600, sysfs_show_current_clocksources,
278 sysfs_override_clocksource);
279
280static SYSDEV_ATTR(available_clocksource, 0600,
281 sysfs_show_available_clocksources, NULL);
282
283static struct sysdev_class clocksource_sysclass = {
284 set_kset_name("clocksource"),
285};
286
287static struct sys_device device_clocksource = {
288 .id = 0,
289 .cls = &clocksource_sysclass,
290};
291
292static int __init init_clocksource_sysfs(void)
293{
294 int error = sysdev_class_register(&clocksource_sysclass);
295
296 if (!error)
297 error = sysdev_register(&device_clocksource);
298 if (!error)
299 error = sysdev_create_file(
300 &device_clocksource,
301 &attr_current_clocksource);
302 if (!error)
303 error = sysdev_create_file(
304 &device_clocksource,
305 &attr_available_clocksource);
306 return error;
307}
308
309device_initcall(init_clocksource_sysfs);
310
311/**
312 * boot_override_clocksource - boot clock override
313 * @str: override name
314 *
315 * Takes a clocksource= boot argument and uses it
316 * as the clocksource override name.
317 */
318static int __init boot_override_clocksource(char* str)
319{
320 unsigned long flags;
321 spin_lock_irqsave(&clocksource_lock, flags);
322 if (str)
323 strlcpy(override_name, str, sizeof(override_name));
324 spin_unlock_irqrestore(&clocksource_lock, flags);
325 return 1;
326}
327
328__setup("clocksource=", boot_override_clocksource);
329
330/**
331 * boot_override_clock - Compatibility layer for deprecated boot option
332 * @str: override name
333 *
334 * DEPRECATED! Takes a clock= boot argument and uses it
335 * as the clocksource override name
336 */
337static int __init boot_override_clock(char* str)
338{
339 if (!strcmp(str, "pmtmr")) {
340 printk("Warning: clock=pmtmr is deprecated. "
341 "Use clocksource=acpi_pm.\n");
342 return boot_override_clocksource("acpi_pm");
343 }
344 printk("Warning! clock= boot option is deprecated. "
345 "Use clocksource=xyz\n");
346 return boot_override_clocksource(str);
347}
348
349__setup("clock=", boot_override_clock);
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
new file mode 100644
index 000000000000..126bb30c4afe
--- /dev/null
+++ b/kernel/time/jiffies.c
@@ -0,0 +1,73 @@
1/***********************************************************************
2* linux/kernel/time/jiffies.c
3*
4* This file contains the jiffies based clocksource.
5*
6* Copyright (C) 2004, 2005 IBM, John Stultz (johnstul@us.ibm.com)
7*
8* This program is free software; you can redistribute it and/or modify
9* it under the terms of the GNU General Public License as published by
10* the Free Software Foundation; either version 2 of the License, or
11* (at your option) any later version.
12*
13* This program is distributed in the hope that it will be useful,
14* but WITHOUT ANY WARRANTY; without even the implied warranty of
15* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16* GNU General Public License for more details.
17*
18* You should have received a copy of the GNU General Public License
19* along with this program; if not, write to the Free Software
20* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
21*
22************************************************************************/
23#include <linux/clocksource.h>
24#include <linux/jiffies.h>
25#include <linux/init.h>
26
27/* The Jiffies based clocksource is the lowest common
28 * denominator clock source which should function on
29 * all systems. It has the same coarse resolution as
30 * the timer interrupt frequency HZ and it suffers
31 * inaccuracies caused by missed or lost timer
32 * interrupts and the inability for the timer
33 * interrupt hardware to accurately tick at the
34 * requested HZ value. It is also not recommended
35 * for "tick-less" systems.
36 */
37#define NSEC_PER_JIFFY ((u32)((((u64)NSEC_PER_SEC)<<8)/ACTHZ))
38
39/* Since jiffies uses a simple NSEC_PER_JIFFY multiplier
40 * conversion, the .shift value could be zero. However
41 * this would make NTP adjustments impossible as they are
42 * in units of 1/2^.shift. Thus we use JIFFIES_SHIFT to
43 * shift both the numerator and denominator the same
44 * amount, and give ntp adjustments in units of 1/2^8
45 *
46 * The value 8 is somewhat carefully chosen, as anything
47 * larger can result in overflows. NSEC_PER_JIFFY grows as
48 * HZ shrinks, so values greater than 8 overflow 32bits when
49 * HZ=100.
50 */
51#define JIFFIES_SHIFT 8
52
53static cycle_t jiffies_read(void)
54{
55 return (cycle_t) jiffies;
56}
57
58struct clocksource clocksource_jiffies = {
59 .name = "jiffies",
60 .rating = 0, /* lowest rating*/
61 .read = jiffies_read,
62 .mask = 0xffffffff, /*32bits*/
63 .mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */
64 .shift = JIFFIES_SHIFT,
65 .is_continuous = 0, /* tick based, not free running */
66};
67
68static int __init init_jiffies_clocksource(void)
69{
70 return clocksource_register(&clocksource_jiffies);
71}
72
73module_init(init_jiffies_clocksource);
diff --git a/kernel/timer.c b/kernel/timer.c
index eb97371b87d8..5a8960253063 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -597,7 +597,6 @@ long time_tolerance = MAXFREQ; /* frequency tolerance (ppm) */
597long time_precision = 1; /* clock precision (us) */ 597long time_precision = 1; /* clock precision (us) */
598long time_maxerror = NTP_PHASE_LIMIT; /* maximum error (us) */ 598long time_maxerror = NTP_PHASE_LIMIT; /* maximum error (us) */
599long time_esterror = NTP_PHASE_LIMIT; /* estimated error (us) */ 599long time_esterror = NTP_PHASE_LIMIT; /* estimated error (us) */
600static long time_phase; /* phase offset (scaled us) */
601long time_freq = (((NSEC_PER_SEC + HZ/2) % HZ - HZ/2) << SHIFT_USEC) / NSEC_PER_USEC; 600long time_freq = (((NSEC_PER_SEC + HZ/2) % HZ - HZ/2) << SHIFT_USEC) / NSEC_PER_USEC;
602 /* frequency offset (scaled ppm)*/ 601 /* frequency offset (scaled ppm)*/
603static long time_adj; /* tick adjust (scaled 1 / HZ) */ 602static long time_adj; /* tick adjust (scaled 1 / HZ) */
@@ -747,27 +746,14 @@ static long adjtime_adjustment(void)
747} 746}
748 747
749/* in the NTP reference this is called "hardclock()" */ 748/* in the NTP reference this is called "hardclock()" */
750static void update_wall_time_one_tick(void) 749static void update_ntp_one_tick(void)
751{ 750{
752 long time_adjust_step, delta_nsec; 751 long time_adjust_step;
753 752
754 time_adjust_step = adjtime_adjustment(); 753 time_adjust_step = adjtime_adjustment();
755 if (time_adjust_step) 754 if (time_adjust_step)
756 /* Reduce by this step the amount of time left */ 755 /* Reduce by this step the amount of time left */
757 time_adjust -= time_adjust_step; 756 time_adjust -= time_adjust_step;
758 delta_nsec = tick_nsec + time_adjust_step * 1000;
759 /*
760 * Advance the phase, once it gets to one microsecond, then
761 * advance the tick more.
762 */
763 time_phase += time_adj;
764 if ((time_phase >= FINENSEC) || (time_phase <= -FINENSEC)) {
765 long ltemp = shift_right(time_phase, (SHIFT_SCALE - 10));
766 time_phase -= ltemp << (SHIFT_SCALE - 10);
767 delta_nsec += ltemp;
768 }
769 xtime.tv_nsec += delta_nsec;
770 time_interpolator_update(delta_nsec);
771 757
772 /* Changes by adjtime() do not take effect till next tick. */ 758 /* Changes by adjtime() do not take effect till next tick. */
773 if (time_next_adjust != 0) { 759 if (time_next_adjust != 0) {
@@ -780,36 +766,378 @@ static void update_wall_time_one_tick(void)
780 * Return how long ticks are at the moment, that is, how much time 766 * Return how long ticks are at the moment, that is, how much time
781 * update_wall_time_one_tick will add to xtime next time we call it 767 * update_wall_time_one_tick will add to xtime next time we call it
782 * (assuming no calls to do_adjtimex in the meantime). 768 * (assuming no calls to do_adjtimex in the meantime).
783 * The return value is in fixed-point nanoseconds with SHIFT_SCALE-10 769 * The return value is in fixed-point nanoseconds shifted by the
784 * bits to the right of the binary point. 770 * specified number of bits to the right of the binary point.
785 * This function has no side-effects. 771 * This function has no side-effects.
786 */ 772 */
787u64 current_tick_length(void) 773u64 current_tick_length(void)
788{ 774{
789 long delta_nsec; 775 long delta_nsec;
776 u64 ret;
790 777
778 /* calculate the finest interval NTP will allow.
779 * ie: nanosecond value shifted by (SHIFT_SCALE - 10)
780 */
791 delta_nsec = tick_nsec + adjtime_adjustment() * 1000; 781 delta_nsec = tick_nsec + adjtime_adjustment() * 1000;
792 return ((u64) delta_nsec << (SHIFT_SCALE - 10)) + time_adj; 782 ret = (u64)delta_nsec << TICK_LENGTH_SHIFT;
783 ret += (s64)time_adj << (TICK_LENGTH_SHIFT - (SHIFT_SCALE - 10));
784
785 return ret;
793} 786}
794 787
795/* 788/* XXX - all of this timekeeping code should be later moved to time.c */
796 * Using a loop looks inefficient, but "ticks" is 789#include <linux/clocksource.h>
797 * usually just one (we shouldn't be losing ticks, 790static struct clocksource *clock; /* pointer to current clocksource */
798 * we're doing this this way mainly for interrupt 791
799 * latency reasons, not because we think we'll 792#ifdef CONFIG_GENERIC_TIME
800 * have lots of lost timer ticks 793/**
794 * __get_nsec_offset - Returns nanoseconds since last call to update_wall_time
795 *
796 * private function, must hold xtime_lock lock when being
797 * called. Returns the number of nanoseconds since the
798 * last call to update_wall_time() (adjusted by NTP scaling)
799 */
800static inline s64 __get_nsec_offset(void)
801{
802 cycle_t cycle_now, cycle_delta;
803 s64 ns_offset;
804
805 /* read clocksource: */
806 cycle_now = clocksource_read(clock);
807
808 /* calculate the delta since the last update_wall_time: */
809 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
810
811 /* convert to nanoseconds: */
812 ns_offset = cyc2ns(clock, cycle_delta);
813
814 return ns_offset;
815}
816
817/**
818 * __get_realtime_clock_ts - Returns the time of day in a timespec
819 * @ts: pointer to the timespec to be set
820 *
821 * Returns the time of day in a timespec. Used by
822 * do_gettimeofday() and get_realtime_clock_ts().
801 */ 823 */
802static void update_wall_time(unsigned long ticks) 824static inline void __get_realtime_clock_ts(struct timespec *ts)
803{ 825{
826 unsigned long seq;
827 s64 nsecs;
828
829 do {
830 seq = read_seqbegin(&xtime_lock);
831
832 *ts = xtime;
833 nsecs = __get_nsec_offset();
834
835 } while (read_seqretry(&xtime_lock, seq));
836
837 timespec_add_ns(ts, nsecs);
838}
839
840/**
841 * getnstimeofday - Returns the time of day in a timespec
842 * @ts: pointer to the timespec to be set
843 *
844 * Returns the time of day in a timespec.
845 */
846void getnstimeofday(struct timespec *ts)
847{
848 __get_realtime_clock_ts(ts);
849}
850
851EXPORT_SYMBOL(getnstimeofday);
852
853/**
854 * do_gettimeofday - Returns the time of day in a timeval
855 * @tv: pointer to the timeval to be set
856 *
857 * NOTE: Users should be converted to using get_realtime_clock_ts()
858 */
859void do_gettimeofday(struct timeval *tv)
860{
861 struct timespec now;
862
863 __get_realtime_clock_ts(&now);
864 tv->tv_sec = now.tv_sec;
865 tv->tv_usec = now.tv_nsec/1000;
866}
867
868EXPORT_SYMBOL(do_gettimeofday);
869/**
870 * do_settimeofday - Sets the time of day
871 * @tv: pointer to the timespec variable containing the new time
872 *
873 * Sets the time of day to the new time, updates NTP and notifies hrtimers
874 */
875int do_settimeofday(struct timespec *tv)
876{
877 unsigned long flags;
878 time_t wtm_sec, sec = tv->tv_sec;
879 long wtm_nsec, nsec = tv->tv_nsec;
880
881 if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC)
882 return -EINVAL;
883
884 write_seqlock_irqsave(&xtime_lock, flags);
885
886 nsec -= __get_nsec_offset();
887
888 wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec);
889 wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec);
890
891 set_normalized_timespec(&xtime, sec, nsec);
892 set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec);
893
894 ntp_clear();
895
896 write_sequnlock_irqrestore(&xtime_lock, flags);
897
898 /* signal hrtimers about time change */
899 clock_was_set();
900
901 return 0;
902}
903
904EXPORT_SYMBOL(do_settimeofday);
905
906/**
907 * change_clocksource - Swaps clocksources if a new one is available
908 *
909 * Accumulates current time interval and initializes new clocksource
910 */
911static int change_clocksource(void)
912{
913 struct clocksource *new;
914 cycle_t now;
915 u64 nsec;
916 new = clocksource_get_next();
917 if (clock != new) {
918 now = clocksource_read(new);
919 nsec = __get_nsec_offset();
920 timespec_add_ns(&xtime, nsec);
921
922 clock = new;
923 clock->cycle_last = now;
924 printk(KERN_INFO "Time: %s clocksource has been installed.\n",
925 clock->name);
926 return 1;
927 } else if (clock->update_callback) {
928 return clock->update_callback();
929 }
930 return 0;
931}
932#else
933#define change_clocksource() (0)
934#endif
935
936/**
937 * timekeeping_is_continuous - check to see if timekeeping is free running
938 */
939int timekeeping_is_continuous(void)
940{
941 unsigned long seq;
942 int ret;
943
804 do { 944 do {
805 ticks--; 945 seq = read_seqbegin(&xtime_lock);
806 update_wall_time_one_tick(); 946
807 if (xtime.tv_nsec >= 1000000000) { 947 ret = clock->is_continuous;
808 xtime.tv_nsec -= 1000000000; 948
949 } while (read_seqretry(&xtime_lock, seq));
950
951 return ret;
952}
953
954/*
955 * timekeeping_init - Initializes the clocksource and common timekeeping values
956 */
957void __init timekeeping_init(void)
958{
959 unsigned long flags;
960
961 write_seqlock_irqsave(&xtime_lock, flags);
962 clock = clocksource_get_next();
963 clocksource_calculate_interval(clock, tick_nsec);
964 clock->cycle_last = clocksource_read(clock);
965 ntp_clear();
966 write_sequnlock_irqrestore(&xtime_lock, flags);
967}
968
969
970/*
971 * timekeeping_resume - Resumes the generic timekeeping subsystem.
972 * @dev: unused
973 *
974 * This is for the generic clocksource timekeeping.
975 * xtime/wall_to_monotonic/jiffies/wall_jiffies/etc are
976 * still managed by arch specific suspend/resume code.
977 */
978static int timekeeping_resume(struct sys_device *dev)
979{
980 unsigned long flags;
981
982 write_seqlock_irqsave(&xtime_lock, flags);
983 /* restart the last cycle value */
984 clock->cycle_last = clocksource_read(clock);
985 write_sequnlock_irqrestore(&xtime_lock, flags);
986 return 0;
987}
988
989/* sysfs resume/suspend bits for timekeeping */
990static struct sysdev_class timekeeping_sysclass = {
991 .resume = timekeeping_resume,
992 set_kset_name("timekeeping"),
993};
994
995static struct sys_device device_timer = {
996 .id = 0,
997 .cls = &timekeeping_sysclass,
998};
999
1000static int __init timekeeping_init_device(void)
1001{
1002 int error = sysdev_class_register(&timekeeping_sysclass);
1003 if (!error)
1004 error = sysdev_register(&device_timer);
1005 return error;
1006}
1007
1008device_initcall(timekeeping_init_device);
1009
1010/*
1011 * If the error is already larger, we look ahead another tick,
1012 * to compensate for late or lost adjustments.
1013 */
1014static __always_inline int clocksource_bigadjust(int sign, s64 error, s64 *interval, s64 *offset)
1015{
1016 int adj;
1017
1018 /*
1019 * As soon as the machine is synchronized to the external time
1020 * source this should be the common case.
1021 */
1022 error >>= 2;
1023 if (likely(sign > 0 ? error <= *interval : error >= *interval))
1024 return sign;
1025
1026 /*
1027 * An extra look ahead dampens the effect of the current error,
1028 * which can grow quite large with continously late updates, as
1029 * it would dominate the adjustment value and can lead to
1030 * oscillation.
1031 */
1032 error += current_tick_length() >> (TICK_LENGTH_SHIFT - clock->shift + 1);
1033 error -= clock->xtime_interval >> 1;
1034
1035 adj = 0;
1036 while (1) {
1037 error >>= 1;
1038 if (sign > 0 ? error <= *interval : error >= *interval)
1039 break;
1040 adj++;
1041 }
1042
1043 /*
1044 * Add the current adjustments to the error and take the offset
1045 * into account, the latter can cause the error to be hardly
1046 * reduced at the next tick. Check the error again if there's
1047 * room for another adjustment, thus further reducing the error
1048 * which otherwise had to be corrected at the next update.
1049 */
1050 error = (error << 1) - *interval + *offset;
1051 if (sign > 0 ? error > *interval : error < *interval)
1052 adj++;
1053
1054 *interval <<= adj;
1055 *offset <<= adj;
1056 return sign << adj;
1057}
1058
1059/*
1060 * Adjust the multiplier to reduce the error value,
1061 * this is optimized for the most common adjustments of -1,0,1,
1062 * for other values we can do a bit more work.
1063 */
1064static void clocksource_adjust(struct clocksource *clock, s64 offset)
1065{
1066 s64 error, interval = clock->cycle_interval;
1067 int adj;
1068
1069 error = clock->error >> (TICK_LENGTH_SHIFT - clock->shift - 1);
1070 if (error > interval) {
1071 adj = clocksource_bigadjust(1, error, &interval, &offset);
1072 } else if (error < -interval) {
1073 interval = -interval;
1074 offset = -offset;
1075 adj = clocksource_bigadjust(-1, error, &interval, &offset);
1076 } else
1077 return;
1078
1079 clock->mult += adj;
1080 clock->xtime_interval += interval;
1081 clock->xtime_nsec -= offset;
1082 clock->error -= (interval - offset) << (TICK_LENGTH_SHIFT - clock->shift);
1083}
1084
1085/*
1086 * update_wall_time - Uses the current clocksource to increment the wall time
1087 *
1088 * Called from the timer interrupt, must hold a write on xtime_lock.
1089 */
1090static void update_wall_time(void)
1091{
1092 cycle_t offset;
1093
1094 clock->xtime_nsec += (s64)xtime.tv_nsec << clock->shift;
1095
1096#ifdef CONFIG_GENERIC_TIME
1097 offset = (clocksource_read(clock) - clock->cycle_last) & clock->mask;
1098#else
1099 offset = clock->cycle_interval;
1100#endif
1101
1102 /* normally this loop will run just once, however in the
1103 * case of lost or late ticks, it will accumulate correctly.
1104 */
1105 while (offset >= clock->cycle_interval) {
1106 /* accumulate one interval */
1107 clock->xtime_nsec += clock->xtime_interval;
1108 clock->cycle_last += clock->cycle_interval;
1109 offset -= clock->cycle_interval;
1110
1111 if (clock->xtime_nsec >= (u64)NSEC_PER_SEC << clock->shift) {
1112 clock->xtime_nsec -= (u64)NSEC_PER_SEC << clock->shift;
809 xtime.tv_sec++; 1113 xtime.tv_sec++;
810 second_overflow(); 1114 second_overflow();
811 } 1115 }
812 } while (ticks); 1116
1117 /* interpolator bits */
1118 time_interpolator_update(clock->xtime_interval
1119 >> clock->shift);
1120 /* increment the NTP state machine */
1121 update_ntp_one_tick();
1122
1123 /* accumulate error between NTP and clock interval */
1124 clock->error += current_tick_length();
1125 clock->error -= clock->xtime_interval << (TICK_LENGTH_SHIFT - clock->shift);
1126 }
1127
1128 /* correct the clock when NTP error is too big */
1129 clocksource_adjust(clock, offset);
1130
1131 /* store full nanoseconds into xtime */
1132 xtime.tv_nsec = clock->xtime_nsec >> clock->shift;
1133 clock->xtime_nsec -= (s64)xtime.tv_nsec << clock->shift;
1134
1135 /* check to see if there is a new clocksource to use */
1136 if (change_clocksource()) {
1137 clock->error = 0;
1138 clock->xtime_nsec = 0;
1139 clocksource_calculate_interval(clock, tick_nsec);
1140 }
813} 1141}
814 1142
815/* 1143/*
@@ -915,10 +1243,8 @@ static inline void update_times(void)
915 unsigned long ticks; 1243 unsigned long ticks;
916 1244
917 ticks = jiffies - wall_jiffies; 1245 ticks = jiffies - wall_jiffies;
918 if (ticks) { 1246 wall_jiffies += ticks;
919 wall_jiffies += ticks; 1247 update_wall_time();
920 update_wall_time(ticks);
921 }
922 calc_load(ticks); 1248 calc_load(ticks);
923} 1249}
924 1250
@@ -1326,7 +1652,7 @@ static void __devinit migrate_timers(int cpu)
1326} 1652}
1327#endif /* CONFIG_HOTPLUG_CPU */ 1653#endif /* CONFIG_HOTPLUG_CPU */
1328 1654
1329static int timer_cpu_notify(struct notifier_block *self, 1655static int __devinit timer_cpu_notify(struct notifier_block *self,
1330 unsigned long action, void *hcpu) 1656 unsigned long action, void *hcpu)
1331{ 1657{
1332 long cpu = (long)hcpu; 1658 long cpu = (long)hcpu;
@@ -1346,7 +1672,7 @@ static int timer_cpu_notify(struct notifier_block *self,
1346 return NOTIFY_OK; 1672 return NOTIFY_OK;
1347} 1673}
1348 1674
1349static struct notifier_block timers_nb = { 1675static struct notifier_block __devinitdata timers_nb = {
1350 .notifier_call = timer_cpu_notify, 1676 .notifier_call = timer_cpu_notify,
1351}; 1677};
1352 1678
diff --git a/kernel/unwind.c b/kernel/unwind.c
new file mode 100644
index 000000000000..f69c804c8e62
--- /dev/null
+++ b/kernel/unwind.c
@@ -0,0 +1,918 @@
1/*
2 * Copyright (C) 2002-2006 Novell, Inc.
3 * Jan Beulich <jbeulich@novell.com>
4 * This code is released under version 2 of the GNU GPL.
5 *
6 * A simple API for unwinding kernel stacks. This is used for
7 * debugging and error reporting purposes. The kernel doesn't need
8 * full-blown stack unwinding with all the bells and whistles, so there
9 * is not much point in implementing the full Dwarf2 unwind API.
10 */
11
12#include <linux/unwind.h>
13#include <linux/module.h>
14#include <linux/delay.h>
15#include <linux/stop_machine.h>
16#include <asm/sections.h>
17#include <asm/uaccess.h>
18#include <asm/unaligned.h>
19
20extern char __start_unwind[], __end_unwind[];
21
22#define MAX_STACK_DEPTH 8
23
/*
 * EXTRA_INFO(f) expands to one reg_info[] initializer for member f of
 * struct unwind_frame_info: { offset of f in units of f's own size,
 * size of f }.  The BUILD_BUG_ON_ZERO term rejects, at build time, a
 * member whose offset is not a multiple of its size.
 * PTREGS_INFO(f) is the same for a member of the embedded regs structure.
 */
24#define EXTRA_INFO(f) { \
25 BUILD_BUG_ON_ZERO(offsetof(struct unwind_frame_info, f) \
26 % FIELD_SIZEOF(struct unwind_frame_info, f)) \
27 + offsetof(struct unwind_frame_info, f) \
28 / FIELD_SIZEOF(struct unwind_frame_info, f), \
29 FIELD_SIZEOF(struct unwind_frame_info, f) \
30 }
31#define PTREGS_INFO(f) EXTRA_INFO(regs.f)
32
/*
 * Per-register description table, filled in by UNW_REGISTER_INFO (defined
 * outside this file).  It is indexed by register number elsewhere in this
 * file; .offs is an index into the frame when viewed as an array of
 * elements of the register's width (see FRAME_REG() in unwind()).
 */
33static const struct {
34 unsigned offs:BITS_PER_LONG / 2;
35 unsigned width:BITS_PER_LONG / 2;
36} reg_info[] = {
37 UNW_REGISTER_INFO
38};
39
/* the helper macros are only needed to build the table above */
40#undef PTREGS_INFO
41#undef EXTRA_INFO
42
/* default test for an unusable register slot: a zero width entry */
43#ifndef REG_INVALID
44#define REG_INVALID(r) (reg_info[r].width == 0)
45#endif
46
/*
 * Call frame instruction opcodes as decoded by processCFI().  These are
 * the values of the low six bits when the top two bits of the opcode
 * byte are zero.
 */
47#define DW_CFA_nop 0x00
48#define DW_CFA_set_loc 0x01
49#define DW_CFA_advance_loc1 0x02
50#define DW_CFA_advance_loc2 0x03
51#define DW_CFA_advance_loc4 0x04
52#define DW_CFA_offset_extended 0x05
53#define DW_CFA_restore_extended 0x06
54#define DW_CFA_undefined 0x07
55#define DW_CFA_same_value 0x08
56#define DW_CFA_register 0x09
57#define DW_CFA_remember_state 0x0a
58#define DW_CFA_restore_state 0x0b
59#define DW_CFA_def_cfa 0x0c
60#define DW_CFA_def_cfa_register 0x0d
61#define DW_CFA_def_cfa_offset 0x0e
62#define DW_CFA_def_cfa_expression 0x0f
63#define DW_CFA_expression 0x10
64#define DW_CFA_offset_extended_sf 0x11
65#define DW_CFA_def_cfa_sf 0x12
66#define DW_CFA_def_cfa_offset_sf 0x13
67#define DW_CFA_val_offset 0x14
68#define DW_CFA_val_offset_sf 0x15
69#define DW_CFA_val_expression 0x16
70#define DW_CFA_lo_user 0x1c
71#define DW_CFA_GNU_window_save 0x2d
72#define DW_CFA_GNU_args_size 0x2e
73#define DW_CFA_GNU_negative_offset_extended 0x2f
74#define DW_CFA_hi_user 0x3f
75
/*
 * Pointer encoding byte as interpreted by read_pointer():
 * DW_EH_PE_FORM masks the storage format (native, leb128, data2/4/8),
 * DW_EH_PE_signed marks a signed value, DW_EH_PE_ADJUST masks the base the
 * value is relative to, DW_EH_PE_indirect requests one extra dereference
 * and DW_EH_PE_omit means that no value is present.
 */
76#define DW_EH_PE_FORM 0x07
77#define DW_EH_PE_native 0x00
78#define DW_EH_PE_leb128 0x01
79#define DW_EH_PE_data2 0x02
80#define DW_EH_PE_data4 0x03
81#define DW_EH_PE_data8 0x04
82#define DW_EH_PE_signed 0x08
83#define DW_EH_PE_ADJUST 0x70
84#define DW_EH_PE_abs 0x00
85#define DW_EH_PE_pcrel 0x10
86#define DW_EH_PE_textrel 0x20
87#define DW_EH_PE_datarel 0x30
88#define DW_EH_PE_funcrel 0x40
89#define DW_EH_PE_aligned 0x50
90#define DW_EH_PE_indirect 0x80
91#define DW_EH_PE_omit 0xff
92
/* result types of get_uleb128() / get_sleb128(): one machine word each */
93typedef unsigned long uleb128_t;
94typedef signed long sleb128_t;
95
/*
 * One registered block of unwind information together with the code
 * address ranges it describes.
 */
struct unwind_table {
	/* start address and length of the covered core and init code */
	struct {
		unsigned long pc;
		unsigned long range;
	} core, init;
	/* location and size in bytes of the unwind data itself */
	const void *address;
	unsigned long size;
	/* next table in the singly linked list, NULL at the end */
	struct unwind_table *link;
	const char *name;
};

/* list head; set up for the kernel image by unwind_init() */
static struct unwind_table root_table;
/* most recently appended table, NULL while only root_table exists */
static struct unwind_table *last_table;
106
107struct unwind_item {
108 enum item_location {
109 Nowhere,
110 Memory,
111 Register,
112 Value
113 } where;
114 uleb128_t value;
115};
116
117struct unwind_state {
118 uleb128_t loc, org;
119 const u8 *cieStart, *cieEnd;
120 uleb128_t codeAlign;
121 sleb128_t dataAlign;
122 struct cfa {
123 uleb128_t reg, offs;
124 } cfa;
125 struct unwind_item regs[ARRAY_SIZE(reg_info)];
126 unsigned stackDepth:8;
127 unsigned version:8;
128 const u8 *label;
129 const u8 *stack[MAX_STACK_DEPTH];
130};
131
132static const struct cfa badCFA = { ARRAY_SIZE(reg_info), 1 };
133
134static struct unwind_table *find_table(unsigned long pc)
135{
136 struct unwind_table *table;
137
138 for (table = &root_table; table; table = table->link)
139 if ((pc >= table->core.pc
140 && pc < table->core.pc + table->core.range)
141 || (pc >= table->init.pc
142 && pc < table->init.pc + table->init.range))
143 break;
144
145 return table;
146}
147
148static void init_unwind_table(struct unwind_table *table,
149 const char *name,
150 const void *core_start,
151 unsigned long core_size,
152 const void *init_start,
153 unsigned long init_size,
154 const void *table_start,
155 unsigned long table_size)
156{
157 table->core.pc = (unsigned long)core_start;
158 table->core.range = core_size;
159 table->init.pc = (unsigned long)init_start;
160 table->init.range = init_size;
161 table->address = table_start;
162 table->size = table_size;
163 table->link = NULL;
164 table->name = name;
165}
166
167void __init unwind_init(void)
168{
169 init_unwind_table(&root_table, "kernel",
170 _text, _end - _text,
171 NULL, 0,
172 __start_unwind, __end_unwind - __start_unwind);
173}
174
175#ifdef CONFIG_MODULES
176
177/* Must be called with module_mutex held. */
178void *unwind_add_table(struct module *module,
179 const void *table_start,
180 unsigned long table_size)
181{
182 struct unwind_table *table;
183
184 if (table_size <= 0)
185 return NULL;
186
187 table = kmalloc(sizeof(*table), GFP_KERNEL);
188 if (!table)
189 return NULL;
190
191 init_unwind_table(table, module->name,
192 module->module_core, module->core_size,
193 module->module_init, module->init_size,
194 table_start, table_size);
195
196 if (last_table)
197 last_table->link = table;
198 else
199 root_table.link = table;
200 last_table = table;
201
202 return table;
203}
204
/* argument block handed to unlink_table() through stop_machine_run() */
struct unlink_table_info {
	/* in: table to act on; out: NULL when the caller must not free it */
	struct unwind_table *table;
	/* non-zero: only drop the table's init range, keep it linked */
	int init_only;
};
210
211static int unlink_table(void *arg)
212{
213 struct unlink_table_info *info = arg;
214 struct unwind_table *table = info->table, *prev;
215
216 for (prev = &root_table; prev->link && prev->link != table; prev = prev->link)
217 ;
218
219 if (prev->link) {
220 if (info->init_only) {
221 table->init.pc = 0;
222 table->init.range = 0;
223 info->table = NULL;
224 } else {
225 prev->link = table->link;
226 if (!prev->link)
227 last_table = prev;
228 }
229 } else
230 info->table = NULL;
231
232 return 0;
233}
234
235/* Must be called with module_mutex held. */
236void unwind_remove_table(void *handle, int init_only)
237{
238 struct unwind_table *table = handle;
239 struct unlink_table_info info;
240
241 if (!table || table == &root_table)
242 return;
243
244 if (init_only && table == last_table) {
245 table->init.pc = 0;
246 table->init.range = 0;
247 return;
248 }
249
250 info.table = table;
251 info.init_only = init_only;
252 stop_machine_run(unlink_table, &info, NR_CPUS);
253
254 if (info.table)
255 kfree(table);
256}
257
258#endif /* CONFIG_MODULES */
259
/*
 * Decode one unsigned LEB128 value.
 *
 * @pcur:	in: first byte of the encoding; out: first byte behind it
 * @end:	first byte that must not be read
 *
 * Returns the decoded value.  If the value does not fit into uleb128_t,
 * *pcur is set to @end + 1 so that callers can detect the failure with a
 * "ptr > end" test.  If the input ends before the last byte of the
 * encoding, *pcur is left equal to @end and the bits collected so far
 * are returned.
 */
static uleb128_t get_uleb128(const u8 **pcur, const u8 *end)
{
	const unsigned bits = 8 * sizeof(uleb128_t);
	const u8 *cur = *pcur;
	uleb128_t value = 0;
	unsigned shift = 0;

	while (cur < end) {
		const u8 byte = *cur;
		const uleb128_t payload = byte & 0x7fU;

		if (shift >= bits) {
			/*
			 * Already past the top of the result: only zero
			 * padding is acceptable, and nothing may be shifted
			 * (a shift by >= the type width is undefined).
			 */
			if (payload) {
				cur = end + 1;
				break;
			}
		} else {
			/* last partial group: the excess bits must be zero */
			if (shift + 7 > bits && (payload >> (bits - shift))) {
				cur = end + 1;
				break;
			}
			value |= payload << shift;
			shift += 7;
		}
		++cur;
		if (!(byte & 0x80))
			break;
	}
	*pcur = cur;

	return value;
}
280
/*
 * Decode one signed LEB128 value.
 *
 * @pcur:	in: first byte of the encoding; out: first byte behind it
 * @end:	first byte that must not be read
 *
 * Returns the decoded, sign-extended value.  If the value does not fit
 * into sleb128_t, *pcur is set to @end + 1 so that callers can detect the
 * failure with a "ptr > end" test.  If the input ends before the last
 * byte of the encoding, *pcur is left equal to @end and the bits
 * collected so far are returned.
 */
static sleb128_t get_sleb128(const u8 **pcur, const u8 *end)
{
	const unsigned bits = 8 * sizeof(sleb128_t);
	const u8 *cur = *pcur;
	/* accumulate unsigned: shifting negative values is undefined */
	uleb128_t value = 0;
	unsigned shift = 0;

	while (cur < end) {
		const u8 byte = *cur;
		const unsigned payload = byte & 0x7fU;

		if (shift >= bits) {
			/* beyond the result: only pure sign padding is valid */
			const unsigned fill = (value >> (bits - 1)) ? 0x7fU : 0;

			if (payload != fill) {
				cur = end + 1;
				break;
			}
		} else {
			if (shift + 7 > bits) {
				/*
				 * Last partial group: the bit landing in the
				 * sign position and all bits above it must
				 * agree, otherwise the value overflows.
				 */
				const unsigned drop = bits - shift - 1;
				const unsigned top = payload >> drop;

				if (top != 0 && top != (0x7fU >> drop)) {
					cur = end + 1;
					break;
				}
			}
			value |= (uleb128_t)payload << shift;
			shift += 7;
		}
		/* consume the byte, continuation bytes included */
		++cur;
		if (!(byte & 0x80)) {
			/* sign-extend from bit 6 of the final byte */
			if ((byte & 0x40) && shift < bits)
				value |= ~(uleb128_t)0 << shift;
			break;
		}
	}
	*pcur = cur;

	return (sleb128_t)value;
}
303
304static unsigned long read_pointer(const u8 **pLoc,
305 const void *end,
306 signed ptrType)
307{
308 unsigned long value = 0;
309 union {
310 const u8 *p8;
311 const u16 *p16u;
312 const s16 *p16s;
313 const u32 *p32u;
314 const s32 *p32s;
315 const unsigned long *pul;
316 } ptr;
317
318 if (ptrType < 0 || ptrType == DW_EH_PE_omit)
319 return 0;
320 ptr.p8 = *pLoc;
321 switch(ptrType & DW_EH_PE_FORM) {
322 case DW_EH_PE_data2:
323 if (end < (const void *)(ptr.p16u + 1))
324 return 0;
325 if(ptrType & DW_EH_PE_signed)
326 value = get_unaligned(ptr.p16s++);
327 else
328 value = get_unaligned(ptr.p16u++);
329 break;
330 case DW_EH_PE_data4:
331#ifdef CONFIG_64BIT
332 if (end < (const void *)(ptr.p32u + 1))
333 return 0;
334 if(ptrType & DW_EH_PE_signed)
335 value = get_unaligned(ptr.p32s++);
336 else
337 value = get_unaligned(ptr.p32u++);
338 break;
339 case DW_EH_PE_data8:
340 BUILD_BUG_ON(sizeof(u64) != sizeof(value));
341#else
342 BUILD_BUG_ON(sizeof(u32) != sizeof(value));
343#endif
344 case DW_EH_PE_native:
345 if (end < (const void *)(ptr.pul + 1))
346 return 0;
347 value = get_unaligned(ptr.pul++);
348 break;
349 case DW_EH_PE_leb128:
350 BUILD_BUG_ON(sizeof(uleb128_t) > sizeof(value));
351 value = ptrType & DW_EH_PE_signed
352 ? get_sleb128(&ptr.p8, end)
353 : get_uleb128(&ptr.p8, end);
354 if ((const void *)ptr.p8 > end)
355 return 0;
356 break;
357 default:
358 return 0;
359 }
360 switch(ptrType & DW_EH_PE_ADJUST) {
361 case DW_EH_PE_abs:
362 break;
363 case DW_EH_PE_pcrel:
364 value += (unsigned long)*pLoc;
365 break;
366 default:
367 return 0;
368 }
369 if ((ptrType & DW_EH_PE_indirect)
370 && __get_user(value, (unsigned long *)value))
371 return 0;
372 *pLoc = ptr.p8;
373
374 return value;
375}
376
377static signed fde_pointer_type(const u32 *cie)
378{
379 const u8 *ptr = (const u8 *)(cie + 2);
380 unsigned version = *ptr;
381
382 if (version != 1)
383 return -1; /* unsupported */
384 if (*++ptr) {
385 const char *aug;
386 const u8 *end = (const u8 *)(cie + 1) + *cie;
387 uleb128_t len;
388
389 /* check if augmentation size is first (and thus present) */
390 if (*ptr != 'z')
391 return -1;
392 /* check if augmentation string is nul-terminated */
393 if ((ptr = memchr(aug = (const void *)ptr, 0, end - ptr)) == NULL)
394 return -1;
395 ++ptr; /* skip terminator */
396 get_uleb128(&ptr, end); /* skip code alignment */
397 get_sleb128(&ptr, end); /* skip data alignment */
398 /* skip return address column */
399 version <= 1 ? (void)++ptr : (void)get_uleb128(&ptr, end);
400 len = get_uleb128(&ptr, end); /* augmentation length */
401 if (ptr + len < ptr || ptr + len > end)
402 return -1;
403 end = ptr + len;
404 while (*++aug) {
405 if (ptr >= end)
406 return -1;
407 switch(*aug) {
408 case 'L':
409 ++ptr;
410 break;
411 case 'P': {
412 signed ptrType = *ptr++;
413
414 if (!read_pointer(&ptr, end, ptrType) || ptr > end)
415 return -1;
416 }
417 break;
418 case 'R':
419 return *ptr;
420 default:
421 return -1;
422 }
423 }
424 }
425 return DW_EH_PE_native|DW_EH_PE_abs;
426}
427
428static int advance_loc(unsigned long delta, struct unwind_state *state)
429{
430 state->loc += delta * state->codeAlign;
431
432 return delta > 0;
433}
434
435static void set_rule(uleb128_t reg,
436 enum item_location where,
437 uleb128_t value,
438 struct unwind_state *state)
439{
440 if (reg < ARRAY_SIZE(state->regs)) {
441 state->regs[reg].where = where;
442 state->regs[reg].value = value;
443 }
444}
445
446static int processCFI(const u8 *start,
447 const u8 *end,
448 unsigned long targetLoc,
449 signed ptrType,
450 struct unwind_state *state)
451{
452 union {
453 const u8 *p8;
454 const u16 *p16;
455 const u32 *p32;
456 } ptr;
457 int result = 1;
458
459 if (start != state->cieStart) {
460 state->loc = state->org;
461 result = processCFI(state->cieStart, state->cieEnd, 0, ptrType, state);
462 if (targetLoc == 0 && state->label == NULL)
463 return result;
464 }
465 for (ptr.p8 = start; result && ptr.p8 < end; ) {
466 switch(*ptr.p8 >> 6) {
467 uleb128_t value;
468
469 case 0:
470 switch(*ptr.p8++) {
471 case DW_CFA_nop:
472 break;
473 case DW_CFA_set_loc:
474 if ((state->loc = read_pointer(&ptr.p8, end, ptrType)) == 0)
475 result = 0;
476 break;
477 case DW_CFA_advance_loc1:
478 result = ptr.p8 < end && advance_loc(*ptr.p8++, state);
479 break;
480 case DW_CFA_advance_loc2:
481 result = ptr.p8 <= end + 2
482 && advance_loc(*ptr.p16++, state);
483 break;
484 case DW_CFA_advance_loc4:
485 result = ptr.p8 <= end + 4
486 && advance_loc(*ptr.p32++, state);
487 break;
488 case DW_CFA_offset_extended:
489 value = get_uleb128(&ptr.p8, end);
490 set_rule(value, Memory, get_uleb128(&ptr.p8, end), state);
491 break;
492 case DW_CFA_val_offset:
493 value = get_uleb128(&ptr.p8, end);
494 set_rule(value, Value, get_uleb128(&ptr.p8, end), state);
495 break;
496 case DW_CFA_offset_extended_sf:
497 value = get_uleb128(&ptr.p8, end);
498 set_rule(value, Memory, get_sleb128(&ptr.p8, end), state);
499 break;
500 case DW_CFA_val_offset_sf:
501 value = get_uleb128(&ptr.p8, end);
502 set_rule(value, Value, get_sleb128(&ptr.p8, end), state);
503 break;
504 case DW_CFA_restore_extended:
505 case DW_CFA_undefined:
506 case DW_CFA_same_value:
507 set_rule(get_uleb128(&ptr.p8, end), Nowhere, 0, state);
508 break;
509 case DW_CFA_register:
510 value = get_uleb128(&ptr.p8, end);
511 set_rule(value,
512 Register,
513 get_uleb128(&ptr.p8, end), state);
514 break;
515 case DW_CFA_remember_state:
516 if (ptr.p8 == state->label) {
517 state->label = NULL;
518 return 1;
519 }
520 if (state->stackDepth >= MAX_STACK_DEPTH)
521 return 0;
522 state->stack[state->stackDepth++] = ptr.p8;
523 break;
524 case DW_CFA_restore_state:
525 if (state->stackDepth) {
526 const uleb128_t loc = state->loc;
527 const u8 *label = state->label;
528
529 state->label = state->stack[state->stackDepth - 1];
530 memcpy(&state->cfa, &badCFA, sizeof(state->cfa));
531 memset(state->regs, 0, sizeof(state->regs));
532 state->stackDepth = 0;
533 result = processCFI(start, end, 0, ptrType, state);
534 state->loc = loc;
535 state->label = label;
536 } else
537 return 0;
538 break;
539 case DW_CFA_def_cfa:
540 state->cfa.reg = get_uleb128(&ptr.p8, end);
541 /*nobreak*/
542 case DW_CFA_def_cfa_offset:
543 state->cfa.offs = get_uleb128(&ptr.p8, end);
544 break;
545 case DW_CFA_def_cfa_sf:
546 state->cfa.reg = get_uleb128(&ptr.p8, end);
547 /*nobreak*/
548 case DW_CFA_def_cfa_offset_sf:
549 state->cfa.offs = get_sleb128(&ptr.p8, end)
550 * state->dataAlign;
551 break;
552 case DW_CFA_def_cfa_register:
553 state->cfa.reg = get_uleb128(&ptr.p8, end);
554 break;
555 /*todo case DW_CFA_def_cfa_expression: */
556 /*todo case DW_CFA_expression: */
557 /*todo case DW_CFA_val_expression: */
558 case DW_CFA_GNU_args_size:
559 get_uleb128(&ptr.p8, end);
560 break;
561 case DW_CFA_GNU_negative_offset_extended:
562 value = get_uleb128(&ptr.p8, end);
563 set_rule(value,
564 Memory,
565 (uleb128_t)0 - get_uleb128(&ptr.p8, end), state);
566 break;
567 case DW_CFA_GNU_window_save:
568 default:
569 result = 0;
570 break;
571 }
572 break;
573 case 1:
574 result = advance_loc(*ptr.p8++ & 0x3f, state);
575 break;
576 case 2:
577 value = *ptr.p8++ & 0x3f;
578 set_rule(value, Memory, get_uleb128(&ptr.p8, end), state);
579 break;
580 case 3:
581 set_rule(*ptr.p8++ & 0x3f, Nowhere, 0, state);
582 break;
583 }
584 if (ptr.p8 > end)
585 result = 0;
586 if (result && targetLoc != 0 && targetLoc < state->loc)
587 return 1;
588 }
589
590 return result
591 && ptr.p8 == end
592 && (targetLoc == 0
593 || (/*todo While in theory this should apply, gcc in practice omits
594 everything past the function prolog, and hence the location
595 never reaches the end of the function.
596 targetLoc < state->loc &&*/ state->label == NULL));
597}
598
599/* Unwind to previous to frame. Returns 0 if successful, negative
600 * number in case of an error. */
601int unwind(struct unwind_frame_info *frame)
602{
603#define FRAME_REG(r, t) (((t *)frame)[reg_info[r].offs])
604 const u32 *fde = NULL, *cie = NULL;
605 const u8 *ptr = NULL, *end = NULL;
606 unsigned long startLoc = 0, endLoc = 0, cfa;
607 unsigned i;
608 signed ptrType = -1;
609 uleb128_t retAddrReg = 0;
610 struct unwind_table *table;
611 struct unwind_state state;
612
613 if (UNW_PC(frame) == 0)
614 return -EINVAL;
615 if ((table = find_table(UNW_PC(frame))) != NULL
616 && !(table->size & (sizeof(*fde) - 1))) {
617 unsigned long tableSize = table->size;
618
619 for (fde = table->address;
620 tableSize > sizeof(*fde) && tableSize - sizeof(*fde) >= *fde;
621 tableSize -= sizeof(*fde) + *fde,
622 fde += 1 + *fde / sizeof(*fde)) {
623 if (!*fde || (*fde & (sizeof(*fde) - 1)))
624 break;
625 if (!fde[1])
626 continue; /* this is a CIE */
627 if ((fde[1] & (sizeof(*fde) - 1))
628 || fde[1] > (unsigned long)(fde + 1)
629 - (unsigned long)table->address)
630 continue; /* this is not a valid FDE */
631 cie = fde + 1 - fde[1] / sizeof(*fde);
632 if (*cie <= sizeof(*cie) + 4
633 || *cie >= fde[1] - sizeof(*fde)
634 || (*cie & (sizeof(*cie) - 1))
635 || cie[1]
636 || (ptrType = fde_pointer_type(cie)) < 0) {
637 cie = NULL; /* this is not a (valid) CIE */
638 continue;
639 }
640 ptr = (const u8 *)(fde + 2);
641 startLoc = read_pointer(&ptr,
642 (const u8 *)(fde + 1) + *fde,
643 ptrType);
644 endLoc = startLoc
645 + read_pointer(&ptr,
646 (const u8 *)(fde + 1) + *fde,
647 ptrType & DW_EH_PE_indirect
648 ? ptrType
649 : ptrType & (DW_EH_PE_FORM|DW_EH_PE_signed));
650 if (UNW_PC(frame) >= startLoc && UNW_PC(frame) < endLoc)
651 break;
652 cie = NULL;
653 }
654 }
655 if (cie != NULL) {
656 memset(&state, 0, sizeof(state));
657 state.cieEnd = ptr; /* keep here temporarily */
658 ptr = (const u8 *)(cie + 2);
659 end = (const u8 *)(cie + 1) + *cie;
660 if ((state.version = *ptr) != 1)
661 cie = NULL; /* unsupported version */
662 else if (*++ptr) {
663 /* check if augmentation size is first (and thus present) */
664 if (*ptr == 'z') {
665 /* check for ignorable (or already handled)
666 * nul-terminated augmentation string */
667 while (++ptr < end && *ptr)
668 if (strchr("LPR", *ptr) == NULL)
669 break;
670 }
671 if (ptr >= end || *ptr)
672 cie = NULL;
673 }
674 ++ptr;
675 }
676 if (cie != NULL) {
677 /* get code aligment factor */
678 state.codeAlign = get_uleb128(&ptr, end);
679 /* get data aligment factor */
680 state.dataAlign = get_sleb128(&ptr, end);
681 if (state.codeAlign == 0 || state.dataAlign == 0 || ptr >= end)
682 cie = NULL;
683 else {
684 retAddrReg = state.version <= 1 ? *ptr++ : get_uleb128(&ptr, end);
685 /* skip augmentation */
686 if (((const char *)(cie + 2))[1] == 'z')
687 ptr += get_uleb128(&ptr, end);
688 if (ptr > end
689 || retAddrReg >= ARRAY_SIZE(reg_info)
690 || REG_INVALID(retAddrReg)
691 || reg_info[retAddrReg].width != sizeof(unsigned long))
692 cie = NULL;
693 }
694 }
695 if (cie != NULL) {
696 state.cieStart = ptr;
697 ptr = state.cieEnd;
698 state.cieEnd = end;
699 end = (const u8 *)(fde + 1) + *fde;
700 /* skip augmentation */
701 if (((const char *)(cie + 2))[1] == 'z') {
702 uleb128_t augSize = get_uleb128(&ptr, end);
703
704 if ((ptr += augSize) > end)
705 fde = NULL;
706 }
707 }
708 if (cie == NULL || fde == NULL) {
709#ifdef CONFIG_FRAME_POINTER
710 unsigned long top, bottom;
711#endif
712
713#ifdef CONFIG_FRAME_POINTER
714 top = STACK_TOP(frame->task);
715 bottom = STACK_BOTTOM(frame->task);
716# if FRAME_RETADDR_OFFSET < 0
717 if (UNW_SP(frame) < top
718 && UNW_FP(frame) <= UNW_SP(frame)
719 && bottom < UNW_FP(frame)
720# else
721 if (UNW_SP(frame) > top
722 && UNW_FP(frame) >= UNW_SP(frame)
723 && bottom > UNW_FP(frame)
724# endif
725 && !((UNW_SP(frame) | UNW_FP(frame))
726 & (sizeof(unsigned long) - 1))) {
727 unsigned long link;
728
729 if (!__get_user(link,
730 (unsigned long *)(UNW_FP(frame)
731 + FRAME_LINK_OFFSET))
732# if FRAME_RETADDR_OFFSET < 0
733 && link > bottom && link < UNW_FP(frame)
734# else
735 && link > UNW_FP(frame) && link < bottom
736# endif
737 && !(link & (sizeof(link) - 1))
738 && !__get_user(UNW_PC(frame),
739 (unsigned long *)(UNW_FP(frame)
740 + FRAME_RETADDR_OFFSET))) {
741 UNW_SP(frame) = UNW_FP(frame) + FRAME_RETADDR_OFFSET
742# if FRAME_RETADDR_OFFSET < 0
743 -
744# else
745 +
746# endif
747 sizeof(UNW_PC(frame));
748 UNW_FP(frame) = link;
749 return 0;
750 }
751 }
752#endif
753 return -ENXIO;
754 }
755 state.org = startLoc;
756 memcpy(&state.cfa, &badCFA, sizeof(state.cfa));
757 /* process instructions */
758 if (!processCFI(ptr, end, UNW_PC(frame), ptrType, &state)
759 || state.loc > endLoc
760 || state.regs[retAddrReg].where == Nowhere
761 || state.cfa.reg >= ARRAY_SIZE(reg_info)
762 || reg_info[state.cfa.reg].width != sizeof(unsigned long)
763 || state.cfa.offs % sizeof(unsigned long))
764 return -EIO;
765 /* update frame */
766 cfa = FRAME_REG(state.cfa.reg, unsigned long) + state.cfa.offs;
767 startLoc = min((unsigned long)UNW_SP(frame), cfa);
768 endLoc = max((unsigned long)UNW_SP(frame), cfa);
769 if (STACK_LIMIT(startLoc) != STACK_LIMIT(endLoc)) {
770 startLoc = min(STACK_LIMIT(cfa), cfa);
771 endLoc = max(STACK_LIMIT(cfa), cfa);
772 }
773#ifndef CONFIG_64BIT
774# define CASES CASE(8); CASE(16); CASE(32)
775#else
776# define CASES CASE(8); CASE(16); CASE(32); CASE(64)
777#endif
778 for (i = 0; i < ARRAY_SIZE(state.regs); ++i) {
779 if (REG_INVALID(i)) {
780 if (state.regs[i].where == Nowhere)
781 continue;
782 return -EIO;
783 }
784 switch(state.regs[i].where) {
785 default:
786 break;
787 case Register:
788 if (state.regs[i].value >= ARRAY_SIZE(reg_info)
789 || REG_INVALID(state.regs[i].value)
790 || reg_info[i].width > reg_info[state.regs[i].value].width)
791 return -EIO;
792 switch(reg_info[state.regs[i].value].width) {
793#define CASE(n) \
794 case sizeof(u##n): \
795 state.regs[i].value = FRAME_REG(state.regs[i].value, \
796 const u##n); \
797 break
798 CASES;
799#undef CASE
800 default:
801 return -EIO;
802 }
803 break;
804 }
805 }
806 for (i = 0; i < ARRAY_SIZE(state.regs); ++i) {
807 if (REG_INVALID(i))
808 continue;
809 switch(state.regs[i].where) {
810 case Nowhere:
811 if (reg_info[i].width != sizeof(UNW_SP(frame))
812 || &FRAME_REG(i, __typeof__(UNW_SP(frame)))
813 != &UNW_SP(frame))
814 continue;
815 UNW_SP(frame) = cfa;
816 break;
817 case Register:
818 switch(reg_info[i].width) {
819#define CASE(n) case sizeof(u##n): \
820 FRAME_REG(i, u##n) = state.regs[i].value; \
821 break
822 CASES;
823#undef CASE
824 default:
825 return -EIO;
826 }
827 break;
828 case Value:
829 if (reg_info[i].width != sizeof(unsigned long))
830 return -EIO;
831 FRAME_REG(i, unsigned long) = cfa + state.regs[i].value
832 * state.dataAlign;
833 break;
834 case Memory: {
835 unsigned long addr = cfa + state.regs[i].value
836 * state.dataAlign;
837
838 if ((state.regs[i].value * state.dataAlign)
839 % sizeof(unsigned long)
840 || addr < startLoc
841 || addr + sizeof(unsigned long) < addr
842 || addr + sizeof(unsigned long) > endLoc)
843 return -EIO;
844 switch(reg_info[i].width) {
845#define CASE(n) case sizeof(u##n): \
846 __get_user(FRAME_REG(i, u##n), (u##n *)addr); \
847 break
848 CASES;
849#undef CASE
850 default:
851 return -EIO;
852 }
853 }
854 break;
855 }
856 }
857
858 return 0;
859#undef CASES
860#undef FRAME_REG
861}
862EXPORT_SYMBOL(unwind);
863
/*
 * Prepare @info for unwinding task @tsk starting from the register set
 * @regs: records the task and lets arch_unw_init_frame_info() fill in the
 * rest.  Always returns 0.
 */
864int unwind_init_frame_info(struct unwind_frame_info *info,
865 struct task_struct *tsk,
866 /*const*/ struct pt_regs *regs)
867{
868 info->task = tsk;
869 arch_unw_init_frame_info(info, regs);
870
871 return 0;
872}
873EXPORT_SYMBOL(unwind_init_frame_info);
874
875/*
876 * Prepare to unwind a blocked task.
877 */
/*
 * Records @tsk in @info and lets arch_unw_init_blocked() set up the
 * remaining state.  Always returns 0.
 */
878int unwind_init_blocked(struct unwind_frame_info *info,
879 struct task_struct *tsk)
880{
881 info->task = tsk;
882 arch_unw_init_blocked(info);
883
884 return 0;
885}
886EXPORT_SYMBOL(unwind_init_blocked);
887
888/*
889 * Prepare to unwind the currently running thread.
890 */
/*
 * Records the current task in @info and hands @info, @callback and @arg
 * to arch_unwind_init_running(); that function's return value is passed
 * back to the caller.
 */
891int unwind_init_running(struct unwind_frame_info *info,
892 asmlinkage int (*callback)(struct unwind_frame_info *,
893 void *arg),
894 void *arg)
895{
896 info->task = current;
897
898 return arch_unwind_init_running(info, callback, arg);
899}
900EXPORT_SYMBOL(unwind_init_running);
901
/*
 * Unwind until the return pointer is in user-land (or until an error
 * occurs).  Returns 0 if successful, negative number in case of
 * error.
 */
int unwind_to_user(struct unwind_frame_info *info)
{
	int err;

	for (;;) {
		if (arch_unw_user_mode(info))
			return 0;

		err = unwind(info);
		if (err < 0)
			return err;
	}
}
EXPORT_SYMBOL(unwind_to_user);
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 565cf7a1febd..59f0b42bd89e 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -559,7 +559,7 @@ static void take_over_work(struct workqueue_struct *wq, unsigned int cpu)
559} 559}
560 560
561/* We're holding the cpucontrol mutex here */ 561/* We're holding the cpucontrol mutex here */
562static int workqueue_cpu_callback(struct notifier_block *nfb, 562static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
563 unsigned long action, 563 unsigned long action,
564 void *hcpu) 564 void *hcpu)
565{ 565{