Diffstat (limited to 'kernel')
41 files changed, 6106 insertions, 706 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index f6ef00f4f90f..82fb182f6f61 100644 --- a/kernel/Makefile +++ b/kernel/Makefile | |||
@@ -10,17 +10,22 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \ | |||
10 | kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ | 10 | kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ |
11 | hrtimer.o | 11 | hrtimer.o |
12 | 12 | ||
13 | obj-y += time/ | ||
13 | obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o | 14 | obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o |
14 | obj-$(CONFIG_FUTEX) += futex.o | 15 | obj-$(CONFIG_FUTEX) += futex.o |
15 | ifeq ($(CONFIG_COMPAT),y) | 16 | ifeq ($(CONFIG_COMPAT),y) |
16 | obj-$(CONFIG_FUTEX) += futex_compat.o | 17 | obj-$(CONFIG_FUTEX) += futex_compat.o |
17 | endif | 18 | endif |
19 | obj-$(CONFIG_RT_MUTEXES) += rtmutex.o | ||
20 | obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o | ||
21 | obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o | ||
18 | obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o | 22 | obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o |
19 | obj-$(CONFIG_SMP) += cpu.o spinlock.o | 23 | obj-$(CONFIG_SMP) += cpu.o spinlock.o |
20 | obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o | 24 | obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o |
21 | obj-$(CONFIG_UID16) += uid16.o | 25 | obj-$(CONFIG_UID16) += uid16.o |
22 | obj-$(CONFIG_MODULES) += module.o | 26 | obj-$(CONFIG_MODULES) += module.o |
23 | obj-$(CONFIG_KALLSYMS) += kallsyms.o | 27 | obj-$(CONFIG_KALLSYMS) += kallsyms.o |
28 | obj-$(CONFIG_STACK_UNWIND) += unwind.o | ||
24 | obj-$(CONFIG_PM) += power/ | 29 | obj-$(CONFIG_PM) += power/ |
25 | obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o | 30 | obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o |
26 | obj-$(CONFIG_KEXEC) += kexec.o | 31 | obj-$(CONFIG_KEXEC) += kexec.o |
diff --git a/kernel/acct.c b/kernel/acct.c index 368c4f03fe0e..126ca43d5d2b 100644 --- a/kernel/acct.c +++ b/kernel/acct.c | |||
@@ -521,6 +521,7 @@ static void do_acct_process(struct file *file) | |||
521 | 521 | ||
522 | /** | 522 | /** |
523 | * acct_init_pacct - initialize a new pacct_struct | 523 | * acct_init_pacct - initialize a new pacct_struct |
524 | * @pacct: per-process accounting info struct to initialize | ||
524 | */ | 525 | */ |
525 | void acct_init_pacct(struct pacct_struct *pacct) | 526 | void acct_init_pacct(struct pacct_struct *pacct) |
526 | { | 527 | { |
@@ -576,7 +577,7 @@ void acct_collect(long exitcode, int group_dead) | |||
576 | * | 577 | * |
577 | * handles process accounting for an exiting task | 578 | * handles process accounting for an exiting task |
578 | */ | 579 | */ |
579 | void acct_process() | 580 | void acct_process(void) |
580 | { | 581 | { |
581 | struct file *file = NULL; | 582 | struct file *file = NULL; |
582 | 583 | ||
diff --git a/kernel/audit.c b/kernel/audit.c index 7dfac7031bd7..82443fb433ef 100644 --- a/kernel/audit.c +++ b/kernel/audit.c | |||
@@ -818,7 +818,7 @@ err: | |||
818 | */ | 818 | */ |
819 | unsigned int audit_serial(void) | 819 | unsigned int audit_serial(void) |
820 | { | 820 | { |
821 | static spinlock_t serial_lock = SPIN_LOCK_UNLOCKED; | 821 | static DEFINE_SPINLOCK(serial_lock); |
822 | static unsigned int serial = 0; | 822 | static unsigned int serial = 0; |
823 | 823 | ||
824 | unsigned long flags; | 824 | unsigned long flags; |
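The audit.c hunk above replaces the open-coded SPIN_LOCK_UNLOCKED initializer with DEFINE_SPINLOCK(). For reference, a minimal sketch of the same pattern outside this diff (the next_serial() name is hypothetical, not part of the patch):

#include <linux/spinlock.h>

static DEFINE_SPINLOCK(serial_lock);	/* preferred over: spinlock_t serial_lock = SPIN_LOCK_UNLOCKED; */
static unsigned int serial;

static unsigned int next_serial(void)
{
	unsigned long flags;
	unsigned int ret;

	/* IRQ-safe critical section, same shape as audit_serial() above */
	spin_lock_irqsave(&serial_lock, flags);
	ret = ++serial;
	spin_unlock_irqrestore(&serial_lock, flags);

	return ret;
}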
diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 9ebd96fda295..dc5e3f01efe7 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c | |||
@@ -658,8 +658,7 @@ static void audit_log_task_context(struct audit_buffer *ab) | |||
658 | return; | 658 | return; |
659 | 659 | ||
660 | error_path: | 660 | error_path: |
661 | if (ctx) | 661 | kfree(ctx); |
662 | kfree(ctx); | ||
663 | audit_panic("error in audit_log_task_context"); | 662 | audit_panic("error in audit_log_task_context"); |
664 | return; | 663 | return; |
665 | } | 664 | } |
@@ -1367,7 +1366,7 @@ int __audit_mq_open(int oflag, mode_t mode, struct mq_attr __user *u_attr) | |||
1367 | * @mqdes: MQ descriptor | 1366 | * @mqdes: MQ descriptor |
1368 | * @msg_len: Message length | 1367 | * @msg_len: Message length |
1369 | * @msg_prio: Message priority | 1368 | * @msg_prio: Message priority |
1370 | * @abs_timeout: Message timeout in absolute time | 1369 | * @u_abs_timeout: Message timeout in absolute time |
1371 | * | 1370 | * |
1372 | * Returns 0 for success or NULL context or < 0 on error. | 1371 | * Returns 0 for success or NULL context or < 0 on error. |
1373 | */ | 1372 | */ |
@@ -1409,8 +1408,8 @@ int __audit_mq_timedsend(mqd_t mqdes, size_t msg_len, unsigned int msg_prio, | |||
1409 | * __audit_mq_timedreceive - record audit data for a POSIX MQ timed receive | 1408 | * __audit_mq_timedreceive - record audit data for a POSIX MQ timed receive |
1410 | * @mqdes: MQ descriptor | 1409 | * @mqdes: MQ descriptor |
1411 | * @msg_len: Message length | 1410 | * @msg_len: Message length |
1412 | * @msg_prio: Message priority | 1411 | * @u_msg_prio: Message priority |
1413 | * @abs_timeout: Message timeout in absolute time | 1412 | * @u_abs_timeout: Message timeout in absolute time |
1414 | * | 1413 | * |
1415 | * Returns 0 for success or NULL context or < 0 on error. | 1414 | * Returns 0 for success or NULL context or < 0 on error. |
1416 | */ | 1415 | */ |
@@ -1558,7 +1557,6 @@ int __audit_ipc_obj(struct kern_ipc_perm *ipcp) | |||
1558 | * @uid: msgq user id | 1557 | * @uid: msgq user id |
1559 | * @gid: msgq group id | 1558 | * @gid: msgq group id |
1560 | * @mode: msgq mode (permissions) | 1559 | * @mode: msgq mode (permissions) |
1561 | * @ipcp: in-kernel IPC permissions | ||
1562 | * | 1560 | * |
1563 | * Returns 0 for success or NULL context or < 0 on error. | 1561 | * Returns 0 for success or NULL context or < 0 on error. |
1564 | */ | 1562 | */ |
diff --git a/kernel/cpu.c b/kernel/cpu.c index fe2b8d0bfe4c..70fbf2e83766 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c | |||
@@ -13,12 +13,12 @@ | |||
13 | #include <linux/module.h> | 13 | #include <linux/module.h> |
14 | #include <linux/kthread.h> | 14 | #include <linux/kthread.h> |
15 | #include <linux/stop_machine.h> | 15 | #include <linux/stop_machine.h> |
16 | #include <asm/semaphore.h> | 16 | #include <linux/mutex.h> |
17 | 17 | ||
18 | /* This protects CPUs going up and down... */ | 18 | /* This protects CPUs going up and down... */ |
19 | static DECLARE_MUTEX(cpucontrol); | 19 | static DEFINE_MUTEX(cpucontrol); |
20 | 20 | ||
21 | static BLOCKING_NOTIFIER_HEAD(cpu_chain); | 21 | static __cpuinitdata BLOCKING_NOTIFIER_HEAD(cpu_chain); |
22 | 22 | ||
23 | #ifdef CONFIG_HOTPLUG_CPU | 23 | #ifdef CONFIG_HOTPLUG_CPU |
24 | static struct task_struct *lock_cpu_hotplug_owner; | 24 | static struct task_struct *lock_cpu_hotplug_owner; |
@@ -30,9 +30,9 @@ static int __lock_cpu_hotplug(int interruptible) | |||
30 | 30 | ||
31 | if (lock_cpu_hotplug_owner != current) { | 31 | if (lock_cpu_hotplug_owner != current) { |
32 | if (interruptible) | 32 | if (interruptible) |
33 | ret = down_interruptible(&cpucontrol); | 33 | ret = mutex_lock_interruptible(&cpucontrol); |
34 | else | 34 | else |
35 | down(&cpucontrol); | 35 | mutex_lock(&cpucontrol); |
36 | } | 36 | } |
37 | 37 | ||
38 | /* | 38 | /* |
@@ -56,7 +56,7 @@ void unlock_cpu_hotplug(void) | |||
56 | { | 56 | { |
57 | if (--lock_cpu_hotplug_depth == 0) { | 57 | if (--lock_cpu_hotplug_depth == 0) { |
58 | lock_cpu_hotplug_owner = NULL; | 58 | lock_cpu_hotplug_owner = NULL; |
59 | up(&cpucontrol); | 59 | mutex_unlock(&cpucontrol); |
60 | } | 60 | } |
61 | } | 61 | } |
62 | EXPORT_SYMBOL_GPL(unlock_cpu_hotplug); | 62 | EXPORT_SYMBOL_GPL(unlock_cpu_hotplug); |
@@ -69,10 +69,13 @@ EXPORT_SYMBOL_GPL(lock_cpu_hotplug_interruptible); | |||
69 | #endif /* CONFIG_HOTPLUG_CPU */ | 69 | #endif /* CONFIG_HOTPLUG_CPU */ |
70 | 70 | ||
71 | /* Need to know about CPUs going up/down? */ | 71 | /* Need to know about CPUs going up/down? */ |
72 | int register_cpu_notifier(struct notifier_block *nb) | 72 | int __cpuinit register_cpu_notifier(struct notifier_block *nb) |
73 | { | 73 | { |
74 | return blocking_notifier_chain_register(&cpu_chain, nb); | 74 | return blocking_notifier_chain_register(&cpu_chain, nb); |
75 | } | 75 | } |
76 | |||
77 | #ifdef CONFIG_HOTPLUG_CPU | ||
78 | |||
76 | EXPORT_SYMBOL(register_cpu_notifier); | 79 | EXPORT_SYMBOL(register_cpu_notifier); |
77 | 80 | ||
78 | void unregister_cpu_notifier(struct notifier_block *nb) | 81 | void unregister_cpu_notifier(struct notifier_block *nb) |
@@ -81,7 +84,6 @@ void unregister_cpu_notifier(struct notifier_block *nb) | |||
81 | } | 84 | } |
82 | EXPORT_SYMBOL(unregister_cpu_notifier); | 85 | EXPORT_SYMBOL(unregister_cpu_notifier); |
83 | 86 | ||
84 | #ifdef CONFIG_HOTPLUG_CPU | ||
85 | static inline void check_for_tasks(int cpu) | 87 | static inline void check_for_tasks(int cpu) |
86 | { | 88 | { |
87 | struct task_struct *p; | 89 | struct task_struct *p; |
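The cpucontrol change above is the standard semaphore-to-mutex conversion: DECLARE_MUTEX (really a counting semaphore initialized to 1) becomes DEFINE_MUTEX, and down()/down_interruptible()/up() become mutex_lock()/mutex_lock_interruptible()/mutex_unlock(). A hedged sketch of the resulting shape, with a hypothetical lock and function name:

#include <linux/mutex.h>

static DEFINE_MUTEX(hotplug_lock);	/* was: static DECLARE_MUTEX(hotplug_lock); */

static int do_hotplug_op(int interruptible)
{
	int ret = 0;

	if (interruptible)
		ret = mutex_lock_interruptible(&hotplug_lock);	/* was: down_interruptible() */
	else
		mutex_lock(&hotplug_lock);			/* was: down() */
	if (ret)
		return ret;	/* interrupted by a signal: -EINTR */

	/* ... critical section protected against concurrent hotplug ... */

	mutex_unlock(&hotplug_lock);				/* was: up() */
	return 0;
}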
diff --git a/kernel/cpuset.c b/kernel/cpuset.c index b602f73fb38d..1535af3a912d 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c | |||
@@ -2442,31 +2442,43 @@ void __cpuset_memory_pressure_bump(void) | |||
2442 | */ | 2442 | */ |
2443 | static int proc_cpuset_show(struct seq_file *m, void *v) | 2443 | static int proc_cpuset_show(struct seq_file *m, void *v) |
2444 | { | 2444 | { |
2445 | struct pid *pid; | ||
2445 | struct task_struct *tsk; | 2446 | struct task_struct *tsk; |
2446 | char *buf; | 2447 | char *buf; |
2447 | int retval = 0; | 2448 | int retval; |
2448 | 2449 | ||
2450 | retval = -ENOMEM; | ||
2449 | buf = kmalloc(PAGE_SIZE, GFP_KERNEL); | 2451 | buf = kmalloc(PAGE_SIZE, GFP_KERNEL); |
2450 | if (!buf) | 2452 | if (!buf) |
2451 | return -ENOMEM; | 2453 | goto out; |
2454 | |||
2455 | retval = -ESRCH; | ||
2456 | pid = m->private; | ||
2457 | tsk = get_pid_task(pid, PIDTYPE_PID); | ||
2458 | if (!tsk) | ||
2459 | goto out_free; | ||
2452 | 2460 | ||
2453 | tsk = m->private; | 2461 | retval = -EINVAL; |
2454 | mutex_lock(&manage_mutex); | 2462 | mutex_lock(&manage_mutex); |
2463 | |||
2455 | retval = cpuset_path(tsk->cpuset, buf, PAGE_SIZE); | 2464 | retval = cpuset_path(tsk->cpuset, buf, PAGE_SIZE); |
2456 | if (retval < 0) | 2465 | if (retval < 0) |
2457 | goto out; | 2466 | goto out_unlock; |
2458 | seq_puts(m, buf); | 2467 | seq_puts(m, buf); |
2459 | seq_putc(m, '\n'); | 2468 | seq_putc(m, '\n'); |
2460 | out: | 2469 | out_unlock: |
2461 | mutex_unlock(&manage_mutex); | 2470 | mutex_unlock(&manage_mutex); |
2471 | put_task_struct(tsk); | ||
2472 | out_free: | ||
2462 | kfree(buf); | 2473 | kfree(buf); |
2474 | out: | ||
2463 | return retval; | 2475 | return retval; |
2464 | } | 2476 | } |
2465 | 2477 | ||
2466 | static int cpuset_open(struct inode *inode, struct file *file) | 2478 | static int cpuset_open(struct inode *inode, struct file *file) |
2467 | { | 2479 | { |
2468 | struct task_struct *tsk = PROC_I(inode)->task; | 2480 | struct pid *pid = PROC_I(inode)->pid; |
2469 | return single_open(file, proc_cpuset_show, tsk); | 2481 | return single_open(file, proc_cpuset_show, pid); |
2470 | } | 2482 | } |
2471 | 2483 | ||
2472 | struct file_operations proc_cpuset_operations = { | 2484 | struct file_operations proc_cpuset_operations = { |
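The cpuset /proc change above stops caching a task_struct pointer across file opens; it stores the stable struct pid instead and resolves it to a task only while the seq_file show routine runs. A hedged sketch of that open/show pairing (the example_* names are hypothetical; PROC_I() is the internal procfs inode helper used in the diff):

static int example_show(struct seq_file *m, void *v)
{
	struct pid *pid = m->private;
	struct task_struct *tsk;
	int retval = -ESRCH;

	/* Pin the task only for the duration of this read. */
	tsk = get_pid_task(pid, PIDTYPE_PID);
	if (!tsk)
		goto out;

	seq_printf(m, "%d\n", tsk->pid);
	retval = 0;

	put_task_struct(tsk);
out:
	return retval;
}

static int example_open(struct inode *inode, struct file *file)
{
	/* Stash the struct pid, not the task_struct itself. */
	return single_open(file, example_show, PROC_I(inode)->pid);
}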
diff --git a/kernel/exit.c b/kernel/exit.c index e76bd02e930e..ab06b9f88f64 100644 --- a/kernel/exit.c +++ b/kernel/exit.c | |||
@@ -137,12 +137,8 @@ void release_task(struct task_struct * p) | |||
137 | { | 137 | { |
138 | int zap_leader; | 138 | int zap_leader; |
139 | task_t *leader; | 139 | task_t *leader; |
140 | struct dentry *proc_dentry; | ||
141 | |||
142 | repeat: | 140 | repeat: |
143 | atomic_dec(&p->user->processes); | 141 | atomic_dec(&p->user->processes); |
144 | spin_lock(&p->proc_lock); | ||
145 | proc_dentry = proc_pid_unhash(p); | ||
146 | write_lock_irq(&tasklist_lock); | 142 | write_lock_irq(&tasklist_lock); |
147 | ptrace_unlink(p); | 143 | ptrace_unlink(p); |
148 | BUG_ON(!list_empty(&p->ptrace_list) || !list_empty(&p->ptrace_children)); | 144 | BUG_ON(!list_empty(&p->ptrace_list) || !list_empty(&p->ptrace_children)); |
@@ -171,8 +167,7 @@ repeat: | |||
171 | 167 | ||
172 | sched_exit(p); | 168 | sched_exit(p); |
173 | write_unlock_irq(&tasklist_lock); | 169 | write_unlock_irq(&tasklist_lock); |
174 | spin_unlock(&p->proc_lock); | 170 | proc_flush_task(p); |
175 | proc_pid_flush(proc_dentry); | ||
176 | release_thread(p); | 171 | release_thread(p); |
177 | call_rcu(&p->rcu, delayed_put_task_struct); | 172 | call_rcu(&p->rcu, delayed_put_task_struct); |
178 | 173 | ||
@@ -931,9 +926,18 @@ fastcall NORET_TYPE void do_exit(long code) | |||
931 | tsk->mempolicy = NULL; | 926 | tsk->mempolicy = NULL; |
932 | #endif | 927 | #endif |
933 | /* | 928 | /* |
929 | * This must happen late, after the PID is not | ||
930 | * hashed anymore: | ||
931 | */ | ||
932 | if (unlikely(!list_empty(&tsk->pi_state_list))) | ||
933 | exit_pi_state_list(tsk); | ||
934 | if (unlikely(current->pi_state_cache)) | ||
935 | kfree(current->pi_state_cache); | ||
936 | /* | ||
934 | * If DEBUG_MUTEXES is on, make sure we are holding no locks: | 937 | * If DEBUG_MUTEXES is on, make sure we are holding no locks: |
935 | */ | 938 | */ |
936 | mutex_debug_check_no_locks_held(tsk); | 939 | mutex_debug_check_no_locks_held(tsk); |
940 | rt_mutex_debug_check_no_locks_held(tsk); | ||
937 | 941 | ||
938 | if (tsk->io_context) | 942 | if (tsk->io_context) |
939 | exit_io_context(); | 943 | exit_io_context(); |
diff --git a/kernel/fork.c b/kernel/fork.c index dfd10cb370c3..628198a4f28a 100644 --- a/kernel/fork.c +++ b/kernel/fork.c | |||
@@ -104,6 +104,7 @@ static kmem_cache_t *mm_cachep; | |||
104 | void free_task(struct task_struct *tsk) | 104 | void free_task(struct task_struct *tsk) |
105 | { | 105 | { |
106 | free_thread_info(tsk->thread_info); | 106 | free_thread_info(tsk->thread_info); |
107 | rt_mutex_debug_task_free(tsk); | ||
107 | free_task_struct(tsk); | 108 | free_task_struct(tsk); |
108 | } | 109 | } |
109 | EXPORT_SYMBOL(free_task); | 110 | EXPORT_SYMBOL(free_task); |
@@ -913,6 +914,19 @@ asmlinkage long sys_set_tid_address(int __user *tidptr) | |||
913 | return current->pid; | 914 | return current->pid; |
914 | } | 915 | } |
915 | 916 | ||
917 | static inline void rt_mutex_init_task(struct task_struct *p) | ||
918 | { | ||
919 | #ifdef CONFIG_RT_MUTEXES | ||
920 | spin_lock_init(&p->pi_lock); | ||
921 | plist_head_init(&p->pi_waiters, &p->pi_lock); | ||
922 | p->pi_blocked_on = NULL; | ||
923 | # ifdef CONFIG_DEBUG_RT_MUTEXES | ||
924 | spin_lock_init(&p->held_list_lock); | ||
925 | INIT_LIST_HEAD(&p->held_list_head); | ||
926 | # endif | ||
927 | #endif | ||
928 | } | ||
929 | |||
916 | /* | 930 | /* |
917 | * This creates a new process as a copy of the old one, | 931 | * This creates a new process as a copy of the old one, |
918 | * but does not actually start it yet. | 932 | * but does not actually start it yet. |
@@ -993,13 +1007,10 @@ static task_t *copy_process(unsigned long clone_flags, | |||
993 | if (put_user(p->pid, parent_tidptr)) | 1007 | if (put_user(p->pid, parent_tidptr)) |
994 | goto bad_fork_cleanup; | 1008 | goto bad_fork_cleanup; |
995 | 1009 | ||
996 | p->proc_dentry = NULL; | ||
997 | |||
998 | INIT_LIST_HEAD(&p->children); | 1010 | INIT_LIST_HEAD(&p->children); |
999 | INIT_LIST_HEAD(&p->sibling); | 1011 | INIT_LIST_HEAD(&p->sibling); |
1000 | p->vfork_done = NULL; | 1012 | p->vfork_done = NULL; |
1001 | spin_lock_init(&p->alloc_lock); | 1013 | spin_lock_init(&p->alloc_lock); |
1002 | spin_lock_init(&p->proc_lock); | ||
1003 | 1014 | ||
1004 | clear_tsk_thread_flag(p, TIF_SIGPENDING); | 1015 | clear_tsk_thread_flag(p, TIF_SIGPENDING); |
1005 | init_sigpending(&p->pending); | 1016 | init_sigpending(&p->pending); |
@@ -1037,6 +1048,8 @@ static task_t *copy_process(unsigned long clone_flags, | |||
1037 | mpol_fix_fork_child_flag(p); | 1048 | mpol_fix_fork_child_flag(p); |
1038 | #endif | 1049 | #endif |
1039 | 1050 | ||
1051 | rt_mutex_init_task(p); | ||
1052 | |||
1040 | #ifdef CONFIG_DEBUG_MUTEXES | 1053 | #ifdef CONFIG_DEBUG_MUTEXES |
1041 | p->blocked_on = NULL; /* not blocked yet */ | 1054 | p->blocked_on = NULL; /* not blocked yet */ |
1042 | #endif | 1055 | #endif |
@@ -1079,6 +1092,9 @@ static task_t *copy_process(unsigned long clone_flags, | |||
1079 | #ifdef CONFIG_COMPAT | 1092 | #ifdef CONFIG_COMPAT |
1080 | p->compat_robust_list = NULL; | 1093 | p->compat_robust_list = NULL; |
1081 | #endif | 1094 | #endif |
1095 | INIT_LIST_HEAD(&p->pi_state_list); | ||
1096 | p->pi_state_cache = NULL; | ||
1097 | |||
1082 | /* | 1098 | /* |
1083 | * sigaltstack should be cleared when sharing the same VM | 1099 | * sigaltstack should be cleared when sharing the same VM |
1084 | */ | 1100 | */ |
@@ -1159,18 +1175,6 @@ static task_t *copy_process(unsigned long clone_flags, | |||
1159 | } | 1175 | } |
1160 | 1176 | ||
1161 | if (clone_flags & CLONE_THREAD) { | 1177 | if (clone_flags & CLONE_THREAD) { |
1162 | /* | ||
1163 | * Important: if an exit-all has been started then | ||
1164 | * do not create this new thread - the whole thread | ||
1165 | * group is supposed to exit anyway. | ||
1166 | */ | ||
1167 | if (current->signal->flags & SIGNAL_GROUP_EXIT) { | ||
1168 | spin_unlock(&current->sighand->siglock); | ||
1169 | write_unlock_irq(&tasklist_lock); | ||
1170 | retval = -EAGAIN; | ||
1171 | goto bad_fork_cleanup_namespace; | ||
1172 | } | ||
1173 | |||
1174 | p->group_leader = current->group_leader; | 1178 | p->group_leader = current->group_leader; |
1175 | list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group); | 1179 | list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group); |
1176 | 1180 | ||
diff --git a/kernel/futex.c b/kernel/futex.c index e1a380c77a5a..6c91f938005d 100644 --- a/kernel/futex.c +++ b/kernel/futex.c | |||
@@ -12,6 +12,10 @@ | |||
12 | * (C) Copyright 2006 Red Hat Inc, All Rights Reserved | 12 | * (C) Copyright 2006 Red Hat Inc, All Rights Reserved |
13 | * Thanks to Thomas Gleixner for suggestions, analysis and fixes. | 13 | * Thanks to Thomas Gleixner for suggestions, analysis and fixes. |
14 | * | 14 | * |
15 | * PI-futex support started by Ingo Molnar and Thomas Gleixner | ||
16 | * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> | ||
17 | * Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com> | ||
18 | * | ||
15 | * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly | 19 | * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly |
16 | * enough at me, Linus for the original (flawed) idea, Matthew | 20 | * enough at me, Linus for the original (flawed) idea, Matthew |
17 | * Kirkwood for proof-of-concept implementation. | 21 | * Kirkwood for proof-of-concept implementation. |
@@ -46,6 +50,8 @@ | |||
46 | #include <linux/signal.h> | 50 | #include <linux/signal.h> |
47 | #include <asm/futex.h> | 51 | #include <asm/futex.h> |
48 | 52 | ||
53 | #include "rtmutex_common.h" | ||
54 | |||
49 | #define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8) | 55 | #define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8) |
50 | 56 | ||
51 | /* | 57 | /* |
@@ -63,7 +69,7 @@ union futex_key { | |||
63 | int offset; | 69 | int offset; |
64 | } shared; | 70 | } shared; |
65 | struct { | 71 | struct { |
66 | unsigned long uaddr; | 72 | unsigned long address; |
67 | struct mm_struct *mm; | 73 | struct mm_struct *mm; |
68 | int offset; | 74 | int offset; |
69 | } private; | 75 | } private; |
@@ -75,6 +81,27 @@ union futex_key { | |||
75 | }; | 81 | }; |
76 | 82 | ||
77 | /* | 83 | /* |
84 | * Priority Inheritance state: | ||
85 | */ | ||
86 | struct futex_pi_state { | ||
87 | /* | ||
88 | * list of 'owned' pi_state instances - these have to be | ||
89 | * cleaned up in do_exit() if the task exits prematurely: | ||
90 | */ | ||
91 | struct list_head list; | ||
92 | |||
93 | /* | ||
94 | * The PI object: | ||
95 | */ | ||
96 | struct rt_mutex pi_mutex; | ||
97 | |||
98 | struct task_struct *owner; | ||
99 | atomic_t refcount; | ||
100 | |||
101 | union futex_key key; | ||
102 | }; | ||
103 | |||
104 | /* | ||
78 | * We use this hashed waitqueue instead of a normal wait_queue_t, so | 105 | * We use this hashed waitqueue instead of a normal wait_queue_t, so |
79 | * we can wake only the relevant ones (hashed queues may be shared). | 106 | * we can wake only the relevant ones (hashed queues may be shared). |
80 | * | 107 | * |
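For context, the futex_pi_state added above backs the FUTEX_LOCK_PI/FUTEX_UNLOCK_PI operations introduced by this series: userspace attempts an atomic 0 -> TID transition in the fast path and only enters the kernel on contention. A hedged userspace sketch of that protocol (illustrative only; assumes a <linux/futex.h> that defines the new PI opcodes, and uses raw syscalls for gettid/futex):

#include <linux/futex.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <stdint.h>

static uint32_t futex_word;	/* 0 = unlocked, otherwise owner TID plus flag bits */

static void pi_lock(void)
{
	uint32_t tid = syscall(SYS_gettid);

	/* Fast path: atomically take the lock if it is free (0 -> TID). */
	if (__sync_val_compare_and_swap(&futex_word, 0, tid) == 0)
		return;

	/* Contended: let the kernel queue us and apply priority inheritance. */
	syscall(SYS_futex, &futex_word, FUTEX_LOCK_PI, 0, NULL, NULL, 0);
}

static void pi_unlock(void)
{
	uint32_t tid = syscall(SYS_gettid);

	/* Fast path: release if nobody set FUTEX_WAITERS while we held it. */
	if (__sync_val_compare_and_swap(&futex_word, tid, 0) == tid)
		return;

	/* Waiters (or owner-died state) present: the kernel hands the lock on. */
	syscall(SYS_futex, &futex_word, FUTEX_UNLOCK_PI, 0, NULL, NULL, 0);
}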
@@ -87,15 +114,19 @@ struct futex_q { | |||
87 | struct list_head list; | 114 | struct list_head list; |
88 | wait_queue_head_t waiters; | 115 | wait_queue_head_t waiters; |
89 | 116 | ||
90 | /* Which hash list lock to use. */ | 117 | /* Which hash list lock to use: */ |
91 | spinlock_t *lock_ptr; | 118 | spinlock_t *lock_ptr; |
92 | 119 | ||
93 | /* Key which the futex is hashed on. */ | 120 | /* Key which the futex is hashed on: */ |
94 | union futex_key key; | 121 | union futex_key key; |
95 | 122 | ||
96 | /* For fd, sigio sent using these. */ | 123 | /* For fd, sigio sent using these: */ |
97 | int fd; | 124 | int fd; |
98 | struct file *filp; | 125 | struct file *filp; |
126 | |||
127 | /* Optional priority inheritance state: */ | ||
128 | struct futex_pi_state *pi_state; | ||
129 | struct task_struct *task; | ||
99 | }; | 130 | }; |
100 | 131 | ||
101 | /* | 132 | /* |
@@ -144,8 +175,9 @@ static inline int match_futex(union futex_key *key1, union futex_key *key2) | |||
144 | * | 175 | * |
145 | * Should be called with &current->mm->mmap_sem but NOT any spinlocks. | 176 | * Should be called with &current->mm->mmap_sem but NOT any spinlocks. |
146 | */ | 177 | */ |
147 | static int get_futex_key(unsigned long uaddr, union futex_key *key) | 178 | static int get_futex_key(u32 __user *uaddr, union futex_key *key) |
148 | { | 179 | { |
180 | unsigned long address = (unsigned long)uaddr; | ||
149 | struct mm_struct *mm = current->mm; | 181 | struct mm_struct *mm = current->mm; |
150 | struct vm_area_struct *vma; | 182 | struct vm_area_struct *vma; |
151 | struct page *page; | 183 | struct page *page; |
@@ -154,16 +186,16 @@ static int get_futex_key(unsigned long uaddr, union futex_key *key) | |||
154 | /* | 186 | /* |
155 | * The futex address must be "naturally" aligned. | 187 | * The futex address must be "naturally" aligned. |
156 | */ | 188 | */ |
157 | key->both.offset = uaddr % PAGE_SIZE; | 189 | key->both.offset = address % PAGE_SIZE; |
158 | if (unlikely((key->both.offset % sizeof(u32)) != 0)) | 190 | if (unlikely((key->both.offset % sizeof(u32)) != 0)) |
159 | return -EINVAL; | 191 | return -EINVAL; |
160 | uaddr -= key->both.offset; | 192 | address -= key->both.offset; |
161 | 193 | ||
162 | /* | 194 | /* |
163 | * The futex is hashed differently depending on whether | 195 | * The futex is hashed differently depending on whether |
164 | * it's in a shared or private mapping. So check vma first. | 196 | * it's in a shared or private mapping. So check vma first. |
165 | */ | 197 | */ |
166 | vma = find_extend_vma(mm, uaddr); | 198 | vma = find_extend_vma(mm, address); |
167 | if (unlikely(!vma)) | 199 | if (unlikely(!vma)) |
168 | return -EFAULT; | 200 | return -EFAULT; |
169 | 201 | ||
@@ -184,7 +216,7 @@ static int get_futex_key(unsigned long uaddr, union futex_key *key) | |||
184 | */ | 216 | */ |
185 | if (likely(!(vma->vm_flags & VM_MAYSHARE))) { | 217 | if (likely(!(vma->vm_flags & VM_MAYSHARE))) { |
186 | key->private.mm = mm; | 218 | key->private.mm = mm; |
187 | key->private.uaddr = uaddr; | 219 | key->private.address = address; |
188 | return 0; | 220 | return 0; |
189 | } | 221 | } |
190 | 222 | ||
@@ -194,7 +226,7 @@ static int get_futex_key(unsigned long uaddr, union futex_key *key) | |||
194 | key->shared.inode = vma->vm_file->f_dentry->d_inode; | 226 | key->shared.inode = vma->vm_file->f_dentry->d_inode; |
195 | key->both.offset++; /* Bit 0 of offset indicates inode-based key. */ | 227 | key->both.offset++; /* Bit 0 of offset indicates inode-based key. */ |
196 | if (likely(!(vma->vm_flags & VM_NONLINEAR))) { | 228 | if (likely(!(vma->vm_flags & VM_NONLINEAR))) { |
197 | key->shared.pgoff = (((uaddr - vma->vm_start) >> PAGE_SHIFT) | 229 | key->shared.pgoff = (((address - vma->vm_start) >> PAGE_SHIFT) |
198 | + vma->vm_pgoff); | 230 | + vma->vm_pgoff); |
199 | return 0; | 231 | return 0; |
200 | } | 232 | } |
@@ -205,7 +237,7 @@ static int get_futex_key(unsigned long uaddr, union futex_key *key) | |||
205 | * from swap. But that's a lot of code to duplicate here | 237 | * from swap. But that's a lot of code to duplicate here |
206 | * for a rare case, so we simply fetch the page. | 238 | * for a rare case, so we simply fetch the page. |
207 | */ | 239 | */ |
208 | err = get_user_pages(current, mm, uaddr, 1, 0, 0, &page, NULL); | 240 | err = get_user_pages(current, mm, address, 1, 0, 0, &page, NULL); |
209 | if (err >= 0) { | 241 | if (err >= 0) { |
210 | key->shared.pgoff = | 242 | key->shared.pgoff = |
211 | page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); | 243 | page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); |
@@ -246,18 +278,244 @@ static void drop_key_refs(union futex_key *key) | |||
246 | } | 278 | } |
247 | } | 279 | } |
248 | 280 | ||
249 | static inline int get_futex_value_locked(int *dest, int __user *from) | 281 | static inline int get_futex_value_locked(u32 *dest, u32 __user *from) |
250 | { | 282 | { |
251 | int ret; | 283 | int ret; |
252 | 284 | ||
253 | inc_preempt_count(); | 285 | inc_preempt_count(); |
254 | ret = __copy_from_user_inatomic(dest, from, sizeof(int)); | 286 | ret = __copy_from_user_inatomic(dest, from, sizeof(u32)); |
255 | dec_preempt_count(); | 287 | dec_preempt_count(); |
256 | 288 | ||
257 | return ret ? -EFAULT : 0; | 289 | return ret ? -EFAULT : 0; |
258 | } | 290 | } |
259 | 291 | ||
260 | /* | 292 | /* |
293 | * Fault handling. Called with current->mm->mmap_sem held. | ||
294 | */ | ||
295 | static int futex_handle_fault(unsigned long address, int attempt) | ||
296 | { | ||
297 | struct vm_area_struct * vma; | ||
298 | struct mm_struct *mm = current->mm; | ||
299 | |||
300 | if (attempt >= 2 || !(vma = find_vma(mm, address)) || | ||
301 | vma->vm_start > address || !(vma->vm_flags & VM_WRITE)) | ||
302 | return -EFAULT; | ||
303 | |||
304 | switch (handle_mm_fault(mm, vma, address, 1)) { | ||
305 | case VM_FAULT_MINOR: | ||
306 | current->min_flt++; | ||
307 | break; | ||
308 | case VM_FAULT_MAJOR: | ||
309 | current->maj_flt++; | ||
310 | break; | ||
311 | default: | ||
312 | return -EFAULT; | ||
313 | } | ||
314 | return 0; | ||
315 | } | ||
316 | |||
317 | /* | ||
318 | * PI code: | ||
319 | */ | ||
320 | static int refill_pi_state_cache(void) | ||
321 | { | ||
322 | struct futex_pi_state *pi_state; | ||
323 | |||
324 | if (likely(current->pi_state_cache)) | ||
325 | return 0; | ||
326 | |||
327 | pi_state = kmalloc(sizeof(*pi_state), GFP_KERNEL); | ||
328 | |||
329 | if (!pi_state) | ||
330 | return -ENOMEM; | ||
331 | |||
332 | memset(pi_state, 0, sizeof(*pi_state)); | ||
333 | INIT_LIST_HEAD(&pi_state->list); | ||
334 | /* pi_mutex gets initialized later */ | ||
335 | pi_state->owner = NULL; | ||
336 | atomic_set(&pi_state->refcount, 1); | ||
337 | |||
338 | current->pi_state_cache = pi_state; | ||
339 | |||
340 | return 0; | ||
341 | } | ||
342 | |||
343 | static struct futex_pi_state * alloc_pi_state(void) | ||
344 | { | ||
345 | struct futex_pi_state *pi_state = current->pi_state_cache; | ||
346 | |||
347 | WARN_ON(!pi_state); | ||
348 | current->pi_state_cache = NULL; | ||
349 | |||
350 | return pi_state; | ||
351 | } | ||
352 | |||
353 | static void free_pi_state(struct futex_pi_state *pi_state) | ||
354 | { | ||
355 | if (!atomic_dec_and_test(&pi_state->refcount)) | ||
356 | return; | ||
357 | |||
358 | /* | ||
359 | * If pi_state->owner is NULL, the owner is most probably dying | ||
360 | * and has cleaned up the pi_state already | ||
361 | */ | ||
362 | if (pi_state->owner) { | ||
363 | spin_lock_irq(&pi_state->owner->pi_lock); | ||
364 | list_del_init(&pi_state->list); | ||
365 | spin_unlock_irq(&pi_state->owner->pi_lock); | ||
366 | |||
367 | rt_mutex_proxy_unlock(&pi_state->pi_mutex, pi_state->owner); | ||
368 | } | ||
369 | |||
370 | if (current->pi_state_cache) | ||
371 | kfree(pi_state); | ||
372 | else { | ||
373 | /* | ||
374 | * pi_state->list is already empty. | ||
375 | * clear pi_state->owner. | ||
376 | * refcount is at 0 - put it back to 1. | ||
377 | */ | ||
378 | pi_state->owner = NULL; | ||
379 | atomic_set(&pi_state->refcount, 1); | ||
380 | current->pi_state_cache = pi_state; | ||
381 | } | ||
382 | } | ||
383 | |||
384 | /* | ||
385 | * Look up the task based on what TID userspace gave us. | ||
386 | * We don't trust it. | ||
387 | */ | ||
388 | static struct task_struct * futex_find_get_task(pid_t pid) | ||
389 | { | ||
390 | struct task_struct *p; | ||
391 | |||
392 | read_lock(&tasklist_lock); | ||
393 | p = find_task_by_pid(pid); | ||
394 | if (!p) | ||
395 | goto out_unlock; | ||
396 | if ((current->euid != p->euid) && (current->euid != p->uid)) { | ||
397 | p = NULL; | ||
398 | goto out_unlock; | ||
399 | } | ||
400 | if (p->state == EXIT_ZOMBIE || p->exit_state == EXIT_ZOMBIE) { | ||
401 | p = NULL; | ||
402 | goto out_unlock; | ||
403 | } | ||
404 | get_task_struct(p); | ||
405 | out_unlock: | ||
406 | read_unlock(&tasklist_lock); | ||
407 | |||
408 | return p; | ||
409 | } | ||
410 | |||
411 | /* | ||
412 | * This task is holding PI mutexes at exit time => bad. | ||
413 | * Kernel cleans up PI-state, but userspace is likely hosed. | ||
414 | * (Robust-futex cleanup is separate and might save the day for userspace.) | ||
415 | */ | ||
416 | void exit_pi_state_list(struct task_struct *curr) | ||
417 | { | ||
418 | struct futex_hash_bucket *hb; | ||
419 | struct list_head *next, *head = &curr->pi_state_list; | ||
420 | struct futex_pi_state *pi_state; | ||
421 | union futex_key key; | ||
422 | |||
423 | /* | ||
424 | * We are a ZOMBIE and nobody can enqueue itself on | ||
425 | * pi_state_list anymore, but we have to be careful | ||
426 | * versus waiters unqueueing themselves. | ||
427 | */ | ||
428 | spin_lock_irq(&curr->pi_lock); | ||
429 | while (!list_empty(head)) { | ||
430 | |||
431 | next = head->next; | ||
432 | pi_state = list_entry(next, struct futex_pi_state, list); | ||
433 | key = pi_state->key; | ||
434 | spin_unlock_irq(&curr->pi_lock); | ||
435 | |||
436 | hb = hash_futex(&key); | ||
437 | spin_lock(&hb->lock); | ||
438 | |||
439 | spin_lock_irq(&curr->pi_lock); | ||
440 | if (head->next != next) { | ||
441 | spin_unlock(&hb->lock); | ||
442 | continue; | ||
443 | } | ||
444 | |||
445 | list_del_init(&pi_state->list); | ||
446 | |||
447 | WARN_ON(pi_state->owner != curr); | ||
448 | |||
449 | pi_state->owner = NULL; | ||
450 | spin_unlock_irq(&curr->pi_lock); | ||
451 | |||
452 | rt_mutex_unlock(&pi_state->pi_mutex); | ||
453 | |||
454 | spin_unlock(&hb->lock); | ||
455 | |||
456 | spin_lock_irq(&curr->pi_lock); | ||
457 | } | ||
458 | spin_unlock_irq(&curr->pi_lock); | ||
459 | } | ||
460 | |||
461 | static int | ||
462 | lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me) | ||
463 | { | ||
464 | struct futex_pi_state *pi_state = NULL; | ||
465 | struct futex_q *this, *next; | ||
466 | struct list_head *head; | ||
467 | struct task_struct *p; | ||
468 | pid_t pid; | ||
469 | |||
470 | head = &hb->chain; | ||
471 | |||
472 | list_for_each_entry_safe(this, next, head, list) { | ||
473 | if (match_futex (&this->key, &me->key)) { | ||
474 | /* | ||
475 | * Another waiter already exists - bump up | ||
476 | * the refcount and return its pi_state: | ||
477 | */ | ||
478 | pi_state = this->pi_state; | ||
479 | atomic_inc(&pi_state->refcount); | ||
480 | me->pi_state = pi_state; | ||
481 | |||
482 | return 0; | ||
483 | } | ||
484 | } | ||
485 | |||
486 | /* | ||
487 | * We are the first waiter - try to look up the real owner and | ||
488 | * attach the new pi_state to it: | ||
489 | */ | ||
490 | pid = uval & FUTEX_TID_MASK; | ||
491 | p = futex_find_get_task(pid); | ||
492 | if (!p) | ||
493 | return -ESRCH; | ||
494 | |||
495 | pi_state = alloc_pi_state(); | ||
496 | |||
497 | /* | ||
498 | * Initialize the pi_mutex in locked state and make 'p' | ||
499 | * the owner of it: | ||
500 | */ | ||
501 | rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p); | ||
502 | |||
503 | /* Store the key for possible exit cleanups: */ | ||
504 | pi_state->key = me->key; | ||
505 | |||
506 | spin_lock_irq(&p->pi_lock); | ||
507 | list_add(&pi_state->list, &p->pi_state_list); | ||
508 | pi_state->owner = p; | ||
509 | spin_unlock_irq(&p->pi_lock); | ||
510 | |||
511 | put_task_struct(p); | ||
512 | |||
513 | me->pi_state = pi_state; | ||
514 | |||
515 | return 0; | ||
516 | } | ||
517 | |||
518 | /* | ||
261 | * The hash bucket lock must be held when this is called. | 519 | * The hash bucket lock must be held when this is called. |
262 | * Afterwards, the futex_q must not be accessed. | 520 | * Afterwards, the futex_q must not be accessed. |
263 | */ | 521 | */ |
@@ -284,16 +542,80 @@ static void wake_futex(struct futex_q *q) | |||
284 | q->lock_ptr = NULL; | 542 | q->lock_ptr = NULL; |
285 | } | 543 | } |
286 | 544 | ||
545 | static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) | ||
546 | { | ||
547 | struct task_struct *new_owner; | ||
548 | struct futex_pi_state *pi_state = this->pi_state; | ||
549 | u32 curval, newval; | ||
550 | |||
551 | if (!pi_state) | ||
552 | return -EINVAL; | ||
553 | |||
554 | new_owner = rt_mutex_next_owner(&pi_state->pi_mutex); | ||
555 | |||
556 | /* | ||
557 | * This happens when we have stolen the lock and the original | ||
558 | * pending owner did not enqueue itself back on the rt_mutex. | ||
559 | * That's not a tragedy; that way we know that a lock waiter | ||
560 | * is in flight. We make the futex_q waiter the pending owner. | ||
561 | */ | ||
562 | if (!new_owner) | ||
563 | new_owner = this->task; | ||
564 | |||
565 | /* | ||
566 | * We pass it to the next owner. (The WAITERS bit is always | ||
567 | * kept enabled while there is PI state around. We must also | ||
568 | * preserve the owner died bit.) | ||
569 | */ | ||
570 | newval = (uval & FUTEX_OWNER_DIED) | FUTEX_WAITERS | new_owner->pid; | ||
571 | |||
572 | inc_preempt_count(); | ||
573 | curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval); | ||
574 | dec_preempt_count(); | ||
575 | |||
576 | if (curval == -EFAULT) | ||
577 | return -EFAULT; | ||
578 | if (curval != uval) | ||
579 | return -EINVAL; | ||
580 | |||
581 | list_del_init(&pi_state->owner->pi_state_list); | ||
582 | list_add(&pi_state->list, &new_owner->pi_state_list); | ||
583 | pi_state->owner = new_owner; | ||
584 | rt_mutex_unlock(&pi_state->pi_mutex); | ||
585 | |||
586 | return 0; | ||
587 | } | ||
588 | |||
589 | static int unlock_futex_pi(u32 __user *uaddr, u32 uval) | ||
590 | { | ||
591 | u32 oldval; | ||
592 | |||
593 | /* | ||
594 | * There is no waiter, so we unlock the futex. The owner-died | ||
595 | * bit does not need to be preserved here. We are the owner: | ||
596 | */ | ||
597 | inc_preempt_count(); | ||
598 | oldval = futex_atomic_cmpxchg_inatomic(uaddr, uval, 0); | ||
599 | dec_preempt_count(); | ||
600 | |||
601 | if (oldval == -EFAULT) | ||
602 | return oldval; | ||
603 | if (oldval != uval) | ||
604 | return -EAGAIN; | ||
605 | |||
606 | return 0; | ||
607 | } | ||
608 | |||
287 | /* | 609 | /* |
288 | * Wake up all waiters hashed on the physical page that is mapped | 610 | * Wake up all waiters hashed on the physical page that is mapped |
289 | * to this virtual address: | 611 | * to this virtual address: |
290 | */ | 612 | */ |
291 | static int futex_wake(unsigned long uaddr, int nr_wake) | 613 | static int futex_wake(u32 __user *uaddr, int nr_wake) |
292 | { | 614 | { |
293 | union futex_key key; | 615 | struct futex_hash_bucket *hb; |
294 | struct futex_hash_bucket *bh; | ||
295 | struct list_head *head; | ||
296 | struct futex_q *this, *next; | 616 | struct futex_q *this, *next; |
617 | struct list_head *head; | ||
618 | union futex_key key; | ||
297 | int ret; | 619 | int ret; |
298 | 620 | ||
299 | down_read(&current->mm->mmap_sem); | 621 | down_read(&current->mm->mmap_sem); |
@@ -302,19 +624,21 @@ static int futex_wake(unsigned long uaddr, int nr_wake) | |||
302 | if (unlikely(ret != 0)) | 624 | if (unlikely(ret != 0)) |
303 | goto out; | 625 | goto out; |
304 | 626 | ||
305 | bh = hash_futex(&key); | 627 | hb = hash_futex(&key); |
306 | spin_lock(&bh->lock); | 628 | spin_lock(&hb->lock); |
307 | head = &bh->chain; | 629 | head = &hb->chain; |
308 | 630 | ||
309 | list_for_each_entry_safe(this, next, head, list) { | 631 | list_for_each_entry_safe(this, next, head, list) { |
310 | if (match_futex (&this->key, &key)) { | 632 | if (match_futex (&this->key, &key)) { |
633 | if (this->pi_state) | ||
634 | return -EINVAL; | ||
311 | wake_futex(this); | 635 | wake_futex(this); |
312 | if (++ret >= nr_wake) | 636 | if (++ret >= nr_wake) |
313 | break; | 637 | break; |
314 | } | 638 | } |
315 | } | 639 | } |
316 | 640 | ||
317 | spin_unlock(&bh->lock); | 641 | spin_unlock(&hb->lock); |
318 | out: | 642 | out: |
319 | up_read(&current->mm->mmap_sem); | 643 | up_read(&current->mm->mmap_sem); |
320 | return ret; | 644 | return ret; |
@@ -324,10 +648,12 @@ out: | |||
324 | * Wake up all waiters hashed on the physical page that is mapped | 648 | * Wake up all waiters hashed on the physical page that is mapped |
325 | * to this virtual address: | 649 | * to this virtual address: |
326 | */ | 650 | */ |
327 | static int futex_wake_op(unsigned long uaddr1, unsigned long uaddr2, int nr_wake, int nr_wake2, int op) | 651 | static int |
652 | futex_wake_op(u32 __user *uaddr1, u32 __user *uaddr2, | ||
653 | int nr_wake, int nr_wake2, int op) | ||
328 | { | 654 | { |
329 | union futex_key key1, key2; | 655 | union futex_key key1, key2; |
330 | struct futex_hash_bucket *bh1, *bh2; | 656 | struct futex_hash_bucket *hb1, *hb2; |
331 | struct list_head *head; | 657 | struct list_head *head; |
332 | struct futex_q *this, *next; | 658 | struct futex_q *this, *next; |
333 | int ret, op_ret, attempt = 0; | 659 | int ret, op_ret, attempt = 0; |
@@ -342,27 +668,29 @@ retryfull: | |||
342 | if (unlikely(ret != 0)) | 668 | if (unlikely(ret != 0)) |
343 | goto out; | 669 | goto out; |
344 | 670 | ||
345 | bh1 = hash_futex(&key1); | 671 | hb1 = hash_futex(&key1); |
346 | bh2 = hash_futex(&key2); | 672 | hb2 = hash_futex(&key2); |
347 | 673 | ||
348 | retry: | 674 | retry: |
349 | if (bh1 < bh2) | 675 | if (hb1 < hb2) |
350 | spin_lock(&bh1->lock); | 676 | spin_lock(&hb1->lock); |
351 | spin_lock(&bh2->lock); | 677 | spin_lock(&hb2->lock); |
352 | if (bh1 > bh2) | 678 | if (hb1 > hb2) |
353 | spin_lock(&bh1->lock); | 679 | spin_lock(&hb1->lock); |
354 | 680 | ||
355 | op_ret = futex_atomic_op_inuser(op, (int __user *)uaddr2); | 681 | op_ret = futex_atomic_op_inuser(op, uaddr2); |
356 | if (unlikely(op_ret < 0)) { | 682 | if (unlikely(op_ret < 0)) { |
357 | int dummy; | 683 | u32 dummy; |
358 | 684 | ||
359 | spin_unlock(&bh1->lock); | 685 | spin_unlock(&hb1->lock); |
360 | if (bh1 != bh2) | 686 | if (hb1 != hb2) |
361 | spin_unlock(&bh2->lock); | 687 | spin_unlock(&hb2->lock); |
362 | 688 | ||
363 | #ifndef CONFIG_MMU | 689 | #ifndef CONFIG_MMU |
364 | /* we don't get EFAULT from MMU faults if we don't have an MMU, | 690 | /* |
365 | * but we might get them from range checking */ | 691 | * we don't get EFAULT from MMU faults if we don't have an MMU, |
692 | * but we might get them from range checking | ||
693 | */ | ||
366 | ret = op_ret; | 694 | ret = op_ret; |
367 | goto out; | 695 | goto out; |
368 | #endif | 696 | #endif |
@@ -372,47 +700,34 @@ retry: | |||
372 | goto out; | 700 | goto out; |
373 | } | 701 | } |
374 | 702 | ||
375 | /* futex_atomic_op_inuser needs to both read and write | 703 | /* |
704 | * futex_atomic_op_inuser needs to both read and write | ||
376 | * *(int __user *)uaddr2, but we can't modify it | 705 | * *(int __user *)uaddr2, but we can't modify it |
377 | * non-atomically. Therefore, if get_user below is not | 706 | * non-atomically. Therefore, if get_user below is not |
378 | * enough, we need to handle the fault ourselves, while | 707 | * enough, we need to handle the fault ourselves, while |
379 | * still holding the mmap_sem. */ | 708 | * still holding the mmap_sem. |
709 | */ | ||
380 | if (attempt++) { | 710 | if (attempt++) { |
381 | struct vm_area_struct * vma; | 711 | if (futex_handle_fault((unsigned long)uaddr2, |
382 | struct mm_struct *mm = current->mm; | 712 | attempt)) |
383 | |||
384 | ret = -EFAULT; | ||
385 | if (attempt >= 2 || | ||
386 | !(vma = find_vma(mm, uaddr2)) || | ||
387 | vma->vm_start > uaddr2 || | ||
388 | !(vma->vm_flags & VM_WRITE)) | ||
389 | goto out; | ||
390 | |||
391 | switch (handle_mm_fault(mm, vma, uaddr2, 1)) { | ||
392 | case VM_FAULT_MINOR: | ||
393 | current->min_flt++; | ||
394 | break; | ||
395 | case VM_FAULT_MAJOR: | ||
396 | current->maj_flt++; | ||
397 | break; | ||
398 | default: | ||
399 | goto out; | 713 | goto out; |
400 | } | ||
401 | goto retry; | 714 | goto retry; |
402 | } | 715 | } |
403 | 716 | ||
404 | /* If we would have faulted, release mmap_sem, | 717 | /* |
405 | * fault it in and start all over again. */ | 718 | * If we would have faulted, release mmap_sem, |
719 | * fault it in and start all over again. | ||
720 | */ | ||
406 | up_read(&current->mm->mmap_sem); | 721 | up_read(&current->mm->mmap_sem); |
407 | 722 | ||
408 | ret = get_user(dummy, (int __user *)uaddr2); | 723 | ret = get_user(dummy, uaddr2); |
409 | if (ret) | 724 | if (ret) |
410 | return ret; | 725 | return ret; |
411 | 726 | ||
412 | goto retryfull; | 727 | goto retryfull; |
413 | } | 728 | } |
414 | 729 | ||
415 | head = &bh1->chain; | 730 | head = &hb1->chain; |
416 | 731 | ||
417 | list_for_each_entry_safe(this, next, head, list) { | 732 | list_for_each_entry_safe(this, next, head, list) { |
418 | if (match_futex (&this->key, &key1)) { | 733 | if (match_futex (&this->key, &key1)) { |
@@ -423,7 +738,7 @@ retry: | |||
423 | } | 738 | } |
424 | 739 | ||
425 | if (op_ret > 0) { | 740 | if (op_ret > 0) { |
426 | head = &bh2->chain; | 741 | head = &hb2->chain; |
427 | 742 | ||
428 | op_ret = 0; | 743 | op_ret = 0; |
429 | list_for_each_entry_safe(this, next, head, list) { | 744 | list_for_each_entry_safe(this, next, head, list) { |
@@ -436,9 +751,9 @@ retry: | |||
436 | ret += op_ret; | 751 | ret += op_ret; |
437 | } | 752 | } |
438 | 753 | ||
439 | spin_unlock(&bh1->lock); | 754 | spin_unlock(&hb1->lock); |
440 | if (bh1 != bh2) | 755 | if (hb1 != hb2) |
441 | spin_unlock(&bh2->lock); | 756 | spin_unlock(&hb2->lock); |
442 | out: | 757 | out: |
443 | up_read(&current->mm->mmap_sem); | 758 | up_read(&current->mm->mmap_sem); |
444 | return ret; | 759 | return ret; |
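Both futex_wake_op() above and futex_requeue() below handle -EFAULT the same way: drop the hash-bucket lock(s) and mmap_sem, fault the page in from process context with get_user(), then restart the whole operation. A condensed sketch of that retry shape, written as if it were another helper in this file (the function name and elided steps are hypothetical):

static int example_futex_op(u32 __user *uaddr)
{
	u32 curval;
	int ret;

retry:
	down_read(&current->mm->mmap_sem);

	/* ... look up the key, take the hash-bucket lock ... */

	ret = get_futex_value_locked(&curval, uaddr);
	if (unlikely(ret)) {
		/* ... drop the hash-bucket lock ... */

		/*
		 * If we would have faulted, release mmap_sem,
		 * fault the page in and start all over again.
		 */
		up_read(&current->mm->mmap_sem);

		ret = get_user(curval, uaddr);
		if (!ret)
			goto retry;
		return ret;
	}

	/* ... normal path: operate on curval, then unlock ... */

	up_read(&current->mm->mmap_sem);
	return ret;
}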
@@ -448,11 +763,11 @@ out: | |||
448 | * Requeue all waiters hashed on one physical page to another | 763 | * Requeue all waiters hashed on one physical page to another |
449 | * physical page. | 764 | * physical page. |
450 | */ | 765 | */ |
451 | static int futex_requeue(unsigned long uaddr1, unsigned long uaddr2, | 766 | static int futex_requeue(u32 __user *uaddr1, u32 __user *uaddr2, |
452 | int nr_wake, int nr_requeue, int *valp) | 767 | int nr_wake, int nr_requeue, u32 *cmpval) |
453 | { | 768 | { |
454 | union futex_key key1, key2; | 769 | union futex_key key1, key2; |
455 | struct futex_hash_bucket *bh1, *bh2; | 770 | struct futex_hash_bucket *hb1, *hb2; |
456 | struct list_head *head1; | 771 | struct list_head *head1; |
457 | struct futex_q *this, *next; | 772 | struct futex_q *this, *next; |
458 | int ret, drop_count = 0; | 773 | int ret, drop_count = 0; |
@@ -467,68 +782,72 @@ static int futex_requeue(unsigned long uaddr1, unsigned long uaddr2, | |||
467 | if (unlikely(ret != 0)) | 782 | if (unlikely(ret != 0)) |
468 | goto out; | 783 | goto out; |
469 | 784 | ||
470 | bh1 = hash_futex(&key1); | 785 | hb1 = hash_futex(&key1); |
471 | bh2 = hash_futex(&key2); | 786 | hb2 = hash_futex(&key2); |
472 | 787 | ||
473 | if (bh1 < bh2) | 788 | if (hb1 < hb2) |
474 | spin_lock(&bh1->lock); | 789 | spin_lock(&hb1->lock); |
475 | spin_lock(&bh2->lock); | 790 | spin_lock(&hb2->lock); |
476 | if (bh1 > bh2) | 791 | if (hb1 > hb2) |
477 | spin_lock(&bh1->lock); | 792 | spin_lock(&hb1->lock); |
478 | 793 | ||
479 | if (likely(valp != NULL)) { | 794 | if (likely(cmpval != NULL)) { |
480 | int curval; | 795 | u32 curval; |
481 | 796 | ||
482 | ret = get_futex_value_locked(&curval, (int __user *)uaddr1); | 797 | ret = get_futex_value_locked(&curval, uaddr1); |
483 | 798 | ||
484 | if (unlikely(ret)) { | 799 | if (unlikely(ret)) { |
485 | spin_unlock(&bh1->lock); | 800 | spin_unlock(&hb1->lock); |
486 | if (bh1 != bh2) | 801 | if (hb1 != hb2) |
487 | spin_unlock(&bh2->lock); | 802 | spin_unlock(&hb2->lock); |
488 | 803 | ||
489 | /* If we would have faulted, release mmap_sem, fault | 804 | /* |
805 | * If we would have faulted, release mmap_sem, fault | ||
490 | * it in and start all over again. | 806 | * it in and start all over again. |
491 | */ | 807 | */ |
492 | up_read(&current->mm->mmap_sem); | 808 | up_read(&current->mm->mmap_sem); |
493 | 809 | ||
494 | ret = get_user(curval, (int __user *)uaddr1); | 810 | ret = get_user(curval, uaddr1); |
495 | 811 | ||
496 | if (!ret) | 812 | if (!ret) |
497 | goto retry; | 813 | goto retry; |
498 | 814 | ||
499 | return ret; | 815 | return ret; |
500 | } | 816 | } |
501 | if (curval != *valp) { | 817 | if (curval != *cmpval) { |
502 | ret = -EAGAIN; | 818 | ret = -EAGAIN; |
503 | goto out_unlock; | 819 | goto out_unlock; |
504 | } | 820 | } |
505 | } | 821 | } |
506 | 822 | ||
507 | head1 = &bh1->chain; | 823 | head1 = &hb1->chain; |
508 | list_for_each_entry_safe(this, next, head1, list) { | 824 | list_for_each_entry_safe(this, next, head1, list) { |
509 | if (!match_futex (&this->key, &key1)) | 825 | if (!match_futex (&this->key, &key1)) |
510 | continue; | 826 | continue; |
511 | if (++ret <= nr_wake) { | 827 | if (++ret <= nr_wake) { |
512 | wake_futex(this); | 828 | wake_futex(this); |
513 | } else { | 829 | } else { |
514 | list_move_tail(&this->list, &bh2->chain); | 830 | /* |
515 | this->lock_ptr = &bh2->lock; | 831 | * If key1 and key2 hash to the same bucket, no need to |
832 | * requeue. | ||
833 | */ | ||
834 | if (likely(head1 != &hb2->chain)) { | ||
835 | list_move_tail(&this->list, &hb2->chain); | ||
836 | this->lock_ptr = &hb2->lock; | ||
837 | } | ||
516 | this->key = key2; | 838 | this->key = key2; |
517 | get_key_refs(&key2); | 839 | get_key_refs(&key2); |
518 | drop_count++; | 840 | drop_count++; |
519 | 841 | ||
520 | if (ret - nr_wake >= nr_requeue) | 842 | if (ret - nr_wake >= nr_requeue) |
521 | break; | 843 | break; |
522 | /* Make sure to stop if key1 == key2 */ | ||
523 | if (head1 == &bh2->chain && head1 != &next->list) | ||
524 | head1 = &this->list; | ||
525 | } | 844 | } |
526 | } | 845 | } |
527 | 846 | ||
528 | out_unlock: | 847 | out_unlock: |
529 | spin_unlock(&bh1->lock); | 848 | spin_unlock(&hb1->lock); |
530 | if (bh1 != bh2) | 849 | if (hb1 != hb2) |
531 | spin_unlock(&bh2->lock); | 850 | spin_unlock(&hb2->lock); |
532 | 851 | ||
533 | /* drop_key_refs() must be called outside the spinlocks. */ | 852 | /* drop_key_refs() must be called outside the spinlocks. */ |
534 | while (--drop_count >= 0) | 853 | while (--drop_count >= 0) |
@@ -543,7 +862,7 @@ out: | |||
543 | static inline struct futex_hash_bucket * | 862 | static inline struct futex_hash_bucket * |
544 | queue_lock(struct futex_q *q, int fd, struct file *filp) | 863 | queue_lock(struct futex_q *q, int fd, struct file *filp) |
545 | { | 864 | { |
546 | struct futex_hash_bucket *bh; | 865 | struct futex_hash_bucket *hb; |
547 | 866 | ||
548 | q->fd = fd; | 867 | q->fd = fd; |
549 | q->filp = filp; | 868 | q->filp = filp; |
@@ -551,23 +870,24 @@ queue_lock(struct futex_q *q, int fd, struct file *filp) | |||
551 | init_waitqueue_head(&q->waiters); | 870 | init_waitqueue_head(&q->waiters); |
552 | 871 | ||
553 | get_key_refs(&q->key); | 872 | get_key_refs(&q->key); |
554 | bh = hash_futex(&q->key); | 873 | hb = hash_futex(&q->key); |
555 | q->lock_ptr = &bh->lock; | 874 | q->lock_ptr = &hb->lock; |
556 | 875 | ||
557 | spin_lock(&bh->lock); | 876 | spin_lock(&hb->lock); |
558 | return bh; | 877 | return hb; |
559 | } | 878 | } |
560 | 879 | ||
561 | static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *bh) | 880 | static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *hb) |
562 | { | 881 | { |
563 | list_add_tail(&q->list, &bh->chain); | 882 | list_add_tail(&q->list, &hb->chain); |
564 | spin_unlock(&bh->lock); | 883 | q->task = current; |
884 | spin_unlock(&hb->lock); | ||
565 | } | 885 | } |
566 | 886 | ||
567 | static inline void | 887 | static inline void |
568 | queue_unlock(struct futex_q *q, struct futex_hash_bucket *bh) | 888 | queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb) |
569 | { | 889 | { |
570 | spin_unlock(&bh->lock); | 890 | spin_unlock(&hb->lock); |
571 | drop_key_refs(&q->key); | 891 | drop_key_refs(&q->key); |
572 | } | 892 | } |
573 | 893 | ||
@@ -579,16 +899,17 @@ queue_unlock(struct futex_q *q, struct futex_hash_bucket *bh) | |||
579 | /* The key must be already stored in q->key. */ | 899 | /* The key must be already stored in q->key. */ |
580 | static void queue_me(struct futex_q *q, int fd, struct file *filp) | 900 | static void queue_me(struct futex_q *q, int fd, struct file *filp) |
581 | { | 901 | { |
582 | struct futex_hash_bucket *bh; | 902 | struct futex_hash_bucket *hb; |
583 | bh = queue_lock(q, fd, filp); | 903 | |
584 | __queue_me(q, bh); | 904 | hb = queue_lock(q, fd, filp); |
905 | __queue_me(q, hb); | ||
585 | } | 906 | } |
586 | 907 | ||
587 | /* Return 1 if we were still queued (ie. 0 means we were woken) */ | 908 | /* Return 1 if we were still queued (ie. 0 means we were woken) */ |
588 | static int unqueue_me(struct futex_q *q) | 909 | static int unqueue_me(struct futex_q *q) |
589 | { | 910 | { |
590 | int ret = 0; | ||
591 | spinlock_t *lock_ptr; | 911 | spinlock_t *lock_ptr; |
912 | int ret = 0; | ||
592 | 913 | ||
593 | /* In the common case we don't take the spinlock, which is nice. */ | 914 | /* In the common case we don't take the spinlock, which is nice. */ |
594 | retry: | 915 | retry: |
@@ -614,6 +935,9 @@ static int unqueue_me(struct futex_q *q) | |||
614 | } | 935 | } |
615 | WARN_ON(list_empty(&q->list)); | 936 | WARN_ON(list_empty(&q->list)); |
616 | list_del(&q->list); | 937 | list_del(&q->list); |
938 | |||
939 | BUG_ON(q->pi_state); | ||
940 | |||
617 | spin_unlock(lock_ptr); | 941 | spin_unlock(lock_ptr); |
618 | ret = 1; | 942 | ret = 1; |
619 | } | 943 | } |
@@ -622,21 +946,42 @@ static int unqueue_me(struct futex_q *q) | |||
622 | return ret; | 946 | return ret; |
623 | } | 947 | } |
624 | 948 | ||
625 | static int futex_wait(unsigned long uaddr, int val, unsigned long time) | 949 | /* |
950 | * PI futexes cannot be requeued and must remove themselves from the | ||
951 | * hash bucket. The hash bucket lock is held on entry and dropped here. | ||
952 | */ | ||
953 | static void unqueue_me_pi(struct futex_q *q, struct futex_hash_bucket *hb) | ||
626 | { | 954 | { |
627 | DECLARE_WAITQUEUE(wait, current); | 955 | WARN_ON(list_empty(&q->list)); |
628 | int ret, curval; | 956 | list_del(&q->list); |
957 | |||
958 | BUG_ON(!q->pi_state); | ||
959 | free_pi_state(q->pi_state); | ||
960 | q->pi_state = NULL; | ||
961 | |||
962 | spin_unlock(&hb->lock); | ||
963 | |||
964 | drop_key_refs(&q->key); | ||
965 | } | ||
966 | |||
967 | static int futex_wait(u32 __user *uaddr, u32 val, unsigned long time) | ||
968 | { | ||
969 | struct task_struct *curr = current; | ||
970 | DECLARE_WAITQUEUE(wait, curr); | ||
971 | struct futex_hash_bucket *hb; | ||
629 | struct futex_q q; | 972 | struct futex_q q; |
630 | struct futex_hash_bucket *bh; | 973 | u32 uval; |
974 | int ret; | ||
631 | 975 | ||
976 | q.pi_state = NULL; | ||
632 | retry: | 977 | retry: |
633 | down_read(&current->mm->mmap_sem); | 978 | down_read(&curr->mm->mmap_sem); |
634 | 979 | ||
635 | ret = get_futex_key(uaddr, &q.key); | 980 | ret = get_futex_key(uaddr, &q.key); |
636 | if (unlikely(ret != 0)) | 981 | if (unlikely(ret != 0)) |
637 | goto out_release_sem; | 982 | goto out_release_sem; |
638 | 983 | ||
639 | bh = queue_lock(&q, -1, NULL); | 984 | hb = queue_lock(&q, -1, NULL); |
640 | 985 | ||
641 | /* | 986 | /* |
642 | * Access the page AFTER the futex is queued. | 987 | * Access the page AFTER the futex is queued. |
@@ -658,37 +1003,35 @@ static int futex_wait(unsigned long uaddr, int val, unsigned long time) | |||
658 | * We hold the mmap semaphore, so the mapping cannot have changed | 1003 | * We hold the mmap semaphore, so the mapping cannot have changed |
659 | * since we looked it up in get_futex_key. | 1004 | * since we looked it up in get_futex_key. |
660 | */ | 1005 | */ |
661 | 1006 | ret = get_futex_value_locked(&uval, uaddr); | |
662 | ret = get_futex_value_locked(&curval, (int __user *)uaddr); | ||
663 | 1007 | ||
664 | if (unlikely(ret)) { | 1008 | if (unlikely(ret)) { |
665 | queue_unlock(&q, bh); | 1009 | queue_unlock(&q, hb); |
666 | 1010 | ||
667 | /* If we would have faulted, release mmap_sem, fault it in and | 1011 | /* |
1012 | * If we would have faulted, release mmap_sem, fault it in and | ||
668 | * start all over again. | 1013 | * start all over again. |
669 | */ | 1014 | */ |
670 | up_read(&current->mm->mmap_sem); | 1015 | up_read(&curr->mm->mmap_sem); |
671 | 1016 | ||
672 | ret = get_user(curval, (int __user *)uaddr); | 1017 | ret = get_user(uval, uaddr); |
673 | 1018 | ||
674 | if (!ret) | 1019 | if (!ret) |
675 | goto retry; | 1020 | goto retry; |
676 | return ret; | 1021 | return ret; |
677 | } | 1022 | } |
678 | if (curval != val) { | 1023 | ret = -EWOULDBLOCK; |
679 | ret = -EWOULDBLOCK; | 1024 | if (uval != val) |
680 | queue_unlock(&q, bh); | 1025 | goto out_unlock_release_sem; |
681 | goto out_release_sem; | ||
682 | } | ||
683 | 1026 | ||
684 | /* Only actually queue if *uaddr contained val. */ | 1027 | /* Only actually queue if *uaddr contained val. */ |
685 | __queue_me(&q, bh); | 1028 | __queue_me(&q, hb); |
686 | 1029 | ||
687 | /* | 1030 | /* |
688 | * Now the futex is queued and we have checked the data, we | 1031 | * Now the futex is queued and we have checked the data, we |
689 | * don't want to hold mmap_sem while we sleep. | 1032 | * don't want to hold mmap_sem while we sleep. |
690 | */ | 1033 | */ |
691 | up_read(&current->mm->mmap_sem); | 1034 | up_read(&curr->mm->mmap_sem); |
692 | 1035 | ||
693 | /* | 1036 | /* |
694 | * There might have been scheduling since the queue_me(), as we | 1037 | * There might have been scheduling since the queue_me(), as we |
@@ -720,12 +1063,421 @@ static int futex_wait(unsigned long uaddr, int val, unsigned long time) | |||
720 | return 0; | 1063 | return 0; |
721 | if (time == 0) | 1064 | if (time == 0) |
722 | return -ETIMEDOUT; | 1065 | return -ETIMEDOUT; |
723 | /* We expect signal_pending(current), but another thread may | 1066 | /* |
724 | * have handled it for us already. */ | 1067 | * We expect signal_pending(current), but another thread may |
1068 | * have handled it for us already. | ||
1069 | */ | ||
725 | return -EINTR; | 1070 | return -EINTR; |
726 | 1071 | ||
1072 | out_unlock_release_sem: | ||
1073 | queue_unlock(&q, hb); | ||
1074 | |||
727 | out_release_sem: | 1075 | out_release_sem: |
1076 | up_read(&curr->mm->mmap_sem); | ||
1077 | return ret; | ||
1078 | } | ||
1079 | |||
1080 | /* | ||
1081 | * Userspace tried a 0 -> TID atomic transition of the futex value | ||
1082 | * and failed. The kernel side here does the whole locking operation: | ||
1083 | * if there are waiters then it will block, it does PI, etc. (Due to | ||
1084 | * races the kernel might see a 0 value of the futex too.) | ||
1085 | */ | ||
1086 | static int do_futex_lock_pi(u32 __user *uaddr, int detect, int trylock, | ||
1087 | struct hrtimer_sleeper *to) | ||
1088 | { | ||
1089 | struct task_struct *curr = current; | ||
1090 | struct futex_hash_bucket *hb; | ||
1091 | u32 uval, newval, curval; | ||
1092 | struct futex_q q; | ||
1093 | int ret, attempt = 0; | ||
1094 | |||
1095 | if (refill_pi_state_cache()) | ||
1096 | return -ENOMEM; | ||
1097 | |||
1098 | q.pi_state = NULL; | ||
1099 | retry: | ||
1100 | down_read(&curr->mm->mmap_sem); | ||
1101 | |||
1102 | ret = get_futex_key(uaddr, &q.key); | ||
1103 | if (unlikely(ret != 0)) | ||
1104 | goto out_release_sem; | ||
1105 | |||
1106 | hb = queue_lock(&q, -1, NULL); | ||
1107 | |||
1108 | retry_locked: | ||
1109 | /* | ||
1110 | * To avoid races, we attempt to take the lock here again | ||
1111 | * (by doing a 0 -> TID atomic cmpxchg), while holding all | ||
1112 | * the locks. It will most likely not succeed. | ||
1113 | */ | ||
1114 | newval = current->pid; | ||
1115 | |||
1116 | inc_preempt_count(); | ||
1117 | curval = futex_atomic_cmpxchg_inatomic(uaddr, 0, newval); | ||
1118 | dec_preempt_count(); | ||
1119 | |||
1120 | if (unlikely(curval == -EFAULT)) | ||
1121 | goto uaddr_faulted; | ||
1122 | |||
1123 | /* We own the lock already */ | ||
1124 | if (unlikely((curval & FUTEX_TID_MASK) == current->pid)) { | ||
1125 | if (!detect && 0) | ||
1126 | force_sig(SIGKILL, current); | ||
1127 | ret = -EDEADLK; | ||
1128 | goto out_unlock_release_sem; | ||
1129 | } | ||
1130 | |||
1131 | /* | ||
1132 | * Surprise - we got the lock. Just return | ||
1133 | * to userspace: | ||
1134 | */ | ||
1135 | if (unlikely(!curval)) | ||
1136 | goto out_unlock_release_sem; | ||
1137 | |||
1138 | uval = curval; | ||
1139 | newval = uval | FUTEX_WAITERS; | ||
1140 | |||
1141 | inc_preempt_count(); | ||
1142 | curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval); | ||
1143 | dec_preempt_count(); | ||
1144 | |||
1145 | if (unlikely(curval == -EFAULT)) | ||
1146 | goto uaddr_faulted; | ||
1147 | if (unlikely(curval != uval)) | ||
1148 | goto retry_locked; | ||
1149 | |||
1150 | /* | ||
1151 | * We don't have the lock. Look up the PI state (or create it if | ||
1152 | * we are the first waiter): | ||
1153 | */ | ||
1154 | ret = lookup_pi_state(uval, hb, &q); | ||
1155 | |||
1156 | if (unlikely(ret)) { | ||
1157 | /* | ||
1158 | * There were no waiters and the owner task lookup | ||
1159 | * failed. When the OWNER_DIED bit is set, then we | ||
1160 | * know that this is a robust futex and we actually | ||
1161 | * take the lock. This is safe as we are protected by | ||
1162 | * the hash bucket lock. We also set the waiters bit | ||
1163 | * unconditionally here, to simplify glibc handling of | ||
1164 | * multiple tasks racing to acquire the lock and | ||
1165 | * cleanup the problems which were left by the dead | ||
1166 | * owner. | ||
1167 | */ | ||
1168 | if (curval & FUTEX_OWNER_DIED) { | ||
1169 | uval = newval; | ||
1170 | newval = current->pid | | ||
1171 | FUTEX_OWNER_DIED | FUTEX_WAITERS; | ||
1172 | |||
1173 | inc_preempt_count(); | ||
1174 | curval = futex_atomic_cmpxchg_inatomic(uaddr, | ||
1175 | uval, newval); | ||
1176 | dec_preempt_count(); | ||
1177 | |||
1178 | if (unlikely(curval == -EFAULT)) | ||
1179 | goto uaddr_faulted; | ||
1180 | if (unlikely(curval != uval)) | ||
1181 | goto retry_locked; | ||
1182 | ret = 0; | ||
1183 | } | ||
1184 | goto out_unlock_release_sem; | ||
1185 | } | ||
1186 | |||
1187 | /* | ||
1188 | * Only actually queue now that the atomic ops are done: | ||
1189 | */ | ||
1190 | __queue_me(&q, hb); | ||
1191 | |||
1192 | /* | ||
1193 | * Now the futex is queued and we have checked the data, we | ||
1194 | * don't want to hold mmap_sem while we sleep. | ||
1195 | */ | ||
1196 | up_read(&curr->mm->mmap_sem); | ||
1197 | |||
1198 | WARN_ON(!q.pi_state); | ||
1199 | /* | ||
1200 | * Block on the PI mutex: | ||
1201 | */ | ||
1202 | if (!trylock) | ||
1203 | ret = rt_mutex_timed_lock(&q.pi_state->pi_mutex, to, 1); | ||
1204 | else { | ||
1205 | ret = rt_mutex_trylock(&q.pi_state->pi_mutex); | ||
1206 | /* Fixup the trylock return value: */ | ||
1207 | ret = ret ? 0 : -EWOULDBLOCK; | ||
1208 | } | ||
1209 | |||
1210 | down_read(&curr->mm->mmap_sem); | ||
1211 | hb = queue_lock(&q, -1, NULL); | ||
1212 | |||
1213 | /* | ||
1214 | * Got the lock. We might not be the anticipated owner if we | ||
1215 | * did a lock-steal - fix up the PI-state in that case. | ||
1216 | */ | ||
1217 | if (!ret && q.pi_state->owner != curr) { | ||
1218 | u32 newtid = current->pid | FUTEX_WAITERS; | ||
1219 | |||
1220 | /* Owner died? */ | ||
1221 | if (q.pi_state->owner != NULL) { | ||
1222 | spin_lock_irq(&q.pi_state->owner->pi_lock); | ||
1223 | list_del_init(&q.pi_state->list); | ||
1224 | spin_unlock_irq(&q.pi_state->owner->pi_lock); | ||
1225 | } else | ||
1226 | newtid |= FUTEX_OWNER_DIED; | ||
1227 | |||
1228 | q.pi_state->owner = current; | ||
1229 | |||
1230 | spin_lock_irq(¤t->pi_lock); | ||
1231 | list_add(&q.pi_state->list, ¤t->pi_state_list); | ||
1232 | spin_unlock_irq(¤t->pi_lock); | ||
1233 | |||
1234 | /* Unqueue and drop the lock */ | ||
1235 | unqueue_me_pi(&q, hb); | ||
1236 | up_read(&curr->mm->mmap_sem); | ||
1237 | /* | ||
1238 | * We own it, so we have to replace the pending owner | ||
1239 | * TID. This must be atomic as we have to preserve the | ||
1240 | * owner died bit here. | ||
1241 | */ | ||
1242 | ret = get_user(uval, uaddr); | ||
1243 | while (!ret) { | ||
1244 | newval = (uval & FUTEX_OWNER_DIED) | newtid; | ||
1245 | curval = futex_atomic_cmpxchg_inatomic(uaddr, | ||
1246 | uval, newval); | ||
1247 | if (curval == -EFAULT) | ||
1248 | ret = -EFAULT; | ||
1249 | if (curval == uval) | ||
1250 | break; | ||
1251 | uval = curval; | ||
1252 | } | ||
1253 | } else { | ||
1254 | /* | ||
1255 | * Catch the rare case, where the lock was released | ||
1256 | * when we were on the way back before we locked | ||
1257 | * the hash bucket. | ||
1258 | */ | ||
1259 | if (ret && q.pi_state->owner == curr) { | ||
1260 | if (rt_mutex_trylock(&q.pi_state->pi_mutex)) | ||
1261 | ret = 0; | ||
1262 | } | ||
1263 | /* Unqueue and drop the lock */ | ||
1264 | unqueue_me_pi(&q, hb); | ||
1265 | up_read(&curr->mm->mmap_sem); | ||
1266 | } | ||
1267 | |||
1268 | if (!detect && ret == -EDEADLK && 0) | ||
1269 | force_sig(SIGKILL, current); | ||
1270 | |||
1271 | return ret; | ||
1272 | |||
1273 | out_unlock_release_sem: | ||
1274 | queue_unlock(&q, hb); | ||
1275 | |||
1276 | out_release_sem: | ||
1277 | up_read(&curr->mm->mmap_sem); | ||
1278 | return ret; | ||
1279 | |||
1280 | uaddr_faulted: | ||
1281 | /* | ||
1282 | * We have to r/w *(int __user *)uaddr, but we can't modify it | ||
1283 | * non-atomically. Therefore, if get_user below is not | ||
1284 | * enough, we need to handle the fault ourselves, while | ||
1285 | * still holding the mmap_sem. | ||
1286 | */ | ||
1287 | if (attempt++) { | ||
1288 | if (futex_handle_fault((unsigned long)uaddr, attempt)) | ||
1289 | goto out_unlock_release_sem; | ||
1290 | |||
1291 | goto retry_locked; | ||
1292 | } | ||
1293 | |||
1294 | queue_unlock(&q, hb); | ||
1295 | up_read(&curr->mm->mmap_sem); | ||
1296 | |||
1297 | ret = get_user(uval, uaddr); | ||
1298 | if (!ret && (uval != -EFAULT)) | ||
1299 | goto retry; | ||
1300 | |||
1301 | return ret; | ||
1302 | } | ||
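As the comment at the top of do_futex_lock_pi() says, the kernel is only entered after user space fails a 0 -> TID compare-and-swap on the futex word; the uncontended fast path never leaves user space. A hedged sketch of that acquire side (the helper name is illustrative; FUTEX_LOCK_PI and the TID-in-the-word convention come from the futex ABI this series introduces):

    #include <stdint.h>
    #include <unistd.h>
    #include <sys/syscall.h>
    #include <linux/futex.h>

    /* Illustrative PI-futex acquire: the uncontended case stays in user
     * space, the contended case lets the kernel queue us on the rt-mutex
     * and apply priority inheritance. */
    static void pi_lock(uint32_t *futex_word)
    {
            uint32_t expected = 0;
            uint32_t tid = (uint32_t)syscall(SYS_gettid);

            /* Fast path: 0 -> TID transition. */
            if (__atomic_compare_exchange_n(futex_word, &expected, tid, 0,
                                            __ATOMIC_ACQUIRE, __ATOMIC_RELAXED))
                    return;

            /* Slow path: block in the kernel (NULL timeout = forever). */
            syscall(SYS_futex, futex_word, FUTEX_LOCK_PI, 0, NULL, NULL, 0);
    }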
1303 | |||
1304 | /* | ||
1305 | * Restart handler | ||
1306 | */ | ||
1307 | static long futex_lock_pi_restart(struct restart_block *restart) | ||
1308 | { | ||
1309 | struct hrtimer_sleeper timeout, *to = NULL; | ||
1310 | int ret; | ||
1311 | |||
1312 | restart->fn = do_no_restart_syscall; | ||
1313 | |||
1314 | if (restart->arg2 || restart->arg3) { | ||
1315 | to = &timeout; | ||
1316 | hrtimer_init(&to->timer, CLOCK_REALTIME, HRTIMER_ABS); | ||
1317 | hrtimer_init_sleeper(to, current); | ||
1318 | to->timer.expires.tv64 = ((u64)restart->arg3 << 32) | | ||
1319 | (u64) restart->arg2; | ||
1320 | } | ||
1321 | |||
1322 | pr_debug("lock_pi restart: %p, %d (%d)\n", | ||
1323 | (u32 __user *)restart->arg0, current->pid); | ||
1324 | |||
1325 | ret = do_futex_lock_pi((u32 __user *)restart->arg0, restart->arg1, | ||
1326 | 0, to); | ||
1327 | |||
1328 | if (ret != -EINTR) | ||
1329 | return ret; | ||
1330 | |||
1331 | restart->fn = futex_lock_pi_restart; | ||
1332 | |||
1333 | /* The other values are filled in */ | ||
1334 | return -ERESTART_RESTARTBLOCK; | ||
1335 | } | ||
1336 | |||
1337 | /* | ||
1338 | * Called from the syscall entry below. | ||
1339 | */ | ||
1340 | static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec, | ||
1341 | long nsec, int trylock) | ||
1342 | { | ||
1343 | struct hrtimer_sleeper timeout, *to = NULL; | ||
1344 | struct restart_block *restart; | ||
1345 | int ret; | ||
1346 | |||
1347 | if (sec != MAX_SCHEDULE_TIMEOUT) { | ||
1348 | to = &timeout; | ||
1349 | hrtimer_init(&to->timer, CLOCK_REALTIME, HRTIMER_ABS); | ||
1350 | hrtimer_init_sleeper(to, current); | ||
1351 | to->timer.expires = ktime_set(sec, nsec); | ||
1352 | } | ||
1353 | |||
1354 | ret = do_futex_lock_pi(uaddr, detect, trylock, to); | ||
1355 | |||
1356 | if (ret != -EINTR) | ||
1357 | return ret; | ||
1358 | |||
1359 | pr_debug("lock_pi interrupted: %p, %d (%d)\n", uaddr, current->pid); | ||
1360 | |||
1361 | restart = ¤t_thread_info()->restart_block; | ||
1362 | restart->fn = futex_lock_pi_restart; | ||
1363 | restart->arg0 = (unsigned long) uaddr; | ||
1364 | restart->arg1 = detect; | ||
1365 | if (to) { | ||
1366 | restart->arg2 = to->timer.expires.tv64 & 0xFFFFFFFF; | ||
1367 | restart->arg3 = to->timer.expires.tv64 >> 32; | ||
1368 | } else | ||
1369 | restart->arg2 = restart->arg3 = 0; | ||
1370 | |||
1371 | return -ERESTART_RESTARTBLOCK; | ||
1372 | } | ||
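Because restart_block args are unsigned long and may be only 32 bits wide, futex_lock_pi() above parks the 64-bit ktime expiry as two halves in arg2/arg3, and the restart handler reassembles it. The packing step in isolation (plain C; the function names are made up for illustration):

    #include <stdint.h>

    /* Split a 64-bit expiry into the two 32-bit restart arguments ... */
    static void pack_expiry(uint64_t expires, unsigned long *arg2, unsigned long *arg3)
    {
            *arg2 = (unsigned long)(expires & 0xffffffffULL);  /* low half  */
            *arg3 = (unsigned long)(expires >> 32);            /* high half */
    }

    /* ... and rebuild it on the restart path. */
    static uint64_t unpack_expiry(unsigned long arg2, unsigned long arg3)
    {
            return ((uint64_t)arg3 << 32) | (uint64_t)(uint32_t)arg2;
    }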
1373 | |||
1374 | /* | ||
1375 | * Userspace attempted a TID -> 0 atomic transition, and failed. | ||
1376 | * This is the in-kernel slowpath: we look up the PI state (if any), | ||
1377 | * and do the rt-mutex unlock. | ||
1378 | */ | ||
1379 | static int futex_unlock_pi(u32 __user *uaddr) | ||
1380 | { | ||
1381 | struct futex_hash_bucket *hb; | ||
1382 | struct futex_q *this, *next; | ||
1383 | u32 uval; | ||
1384 | struct list_head *head; | ||
1385 | union futex_key key; | ||
1386 | int ret, attempt = 0; | ||
1387 | |||
1388 | retry: | ||
1389 | if (get_user(uval, uaddr)) | ||
1390 | return -EFAULT; | ||
1391 | /* | ||
1392 | * We release only a lock we actually own: | ||
1393 | */ | ||
1394 | if ((uval & FUTEX_TID_MASK) != current->pid) | ||
1395 | return -EPERM; | ||
1396 | /* | ||
1397 | * First take all the futex related locks: | ||
1398 | */ | ||
1399 | down_read(¤t->mm->mmap_sem); | ||
1400 | |||
1401 | ret = get_futex_key(uaddr, &key); | ||
1402 | if (unlikely(ret != 0)) | ||
1403 | goto out; | ||
1404 | |||
1405 | hb = hash_futex(&key); | ||
1406 | spin_lock(&hb->lock); | ||
1407 | |||
1408 | retry_locked: | ||
1409 | /* | ||
1410 | * To avoid races, try to do the TID -> 0 atomic transition | ||
1411 | * again. If it succeeds then we can return without waking | ||
1412 | * anyone else up: | ||
1413 | */ | ||
1414 | inc_preempt_count(); | ||
1415 | uval = futex_atomic_cmpxchg_inatomic(uaddr, current->pid, 0); | ||
1416 | dec_preempt_count(); | ||
1417 | |||
1418 | if (unlikely(uval == -EFAULT)) | ||
1419 | goto pi_faulted; | ||
1420 | /* | ||
1421 | * Rare case: we managed to release the lock atomically, | ||
1422 | * no need to wake anyone else up: | ||
1423 | */ | ||
1424 | if (unlikely(uval == current->pid)) | ||
1425 | goto out_unlock; | ||
1426 | |||
1427 | /* | ||
1428 | * Ok, other tasks may need to be woken up - check waiters | ||
1429 | * and do the wakeup if necessary: | ||
1430 | */ | ||
1431 | head = &hb->chain; | ||
1432 | |||
1433 | list_for_each_entry_safe(this, next, head, list) { | ||
1434 | if (!match_futex(&this->key, &key)) | ||
1435 | continue; | ||
1436 | ret = wake_futex_pi(uaddr, uval, this); | ||
1437 | /* | ||
1438 | * The atomic access to the futex value | ||
1439 | * generated a pagefault, so retry the | ||
1440 | * user-access and the wakeup: | ||
1441 | */ | ||
1442 | if (ret == -EFAULT) | ||
1443 | goto pi_faulted; | ||
1444 | goto out_unlock; | ||
1445 | } | ||
1446 | /* | ||
1447 | * No waiters - kernel unlocks the futex: | ||
1448 | */ | ||
1449 | ret = unlock_futex_pi(uaddr, uval); | ||
1450 | if (ret == -EFAULT) | ||
1451 | goto pi_faulted; | ||
1452 | |||
1453 | out_unlock: | ||
1454 | spin_unlock(&hb->lock); | ||
1455 | out: | ||
728 | up_read(¤t->mm->mmap_sem); | 1456 | up_read(¤t->mm->mmap_sem); |
1457 | |||
1458 | return ret; | ||
1459 | |||
1460 | pi_faulted: | ||
1461 | /* | ||
1462 | * We have to r/w *(int __user *)uaddr, but we can't modify it | ||
1463 | * non-atomically. Therefore, if get_user below is not | ||
1464 | * enough, we need to handle the fault ourselves, while | ||
1465 | * still holding the mmap_sem. | ||
1466 | */ | ||
1467 | if (attempt++) { | ||
1468 | if (futex_handle_fault((unsigned long)uaddr, attempt)) | ||
1469 | goto out_unlock; | ||
1470 | |||
1471 | goto retry_locked; | ||
1472 | } | ||
1473 | |||
1474 | spin_unlock(&hb->lock); | ||
1475 | up_read(¤t->mm->mmap_sem); | ||
1476 | |||
1477 | ret = get_user(uval, uaddr); | ||
1478 | if (!ret && (uval != -EFAULT)) | ||
1479 | goto retry; | ||
1480 | |||
729 | return ret; | 1481 | return ret; |
730 | } | 1482 | } |
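The release side is symmetric, as the comment above futex_unlock_pi() notes: user space attempts the TID -> 0 transition and only enters the kernel when other bits (waiters, owner-died) are set in the word. A sketch under the same assumptions as the acquire example:

    #include <stdint.h>
    #include <unistd.h>
    #include <sys/syscall.h>
    #include <linux/futex.h>

    /* Illustrative PI-futex release: if the word is exactly our TID there
     * are no waiters; otherwise FUTEX_UNLOCK_PI hands the rt-mutex to the
     * highest-priority waiter and fixes up the user-space value. */
    static void pi_unlock(uint32_t *futex_word)
    {
            uint32_t expected = (uint32_t)syscall(SYS_gettid);

            if (__atomic_compare_exchange_n(futex_word, &expected, 0, 0,
                                            __ATOMIC_RELEASE, __ATOMIC_RELAXED))
                    return;

            syscall(SYS_futex, futex_word, FUTEX_UNLOCK_PI, 0, NULL, NULL, 0);
    }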
731 | 1483 | ||
@@ -735,6 +1487,7 @@ static int futex_close(struct inode *inode, struct file *filp) | |||
735 | 1487 | ||
736 | unqueue_me(q); | 1488 | unqueue_me(q); |
737 | kfree(q); | 1489 | kfree(q); |
1490 | |||
738 | return 0; | 1491 | return 0; |
739 | } | 1492 | } |
740 | 1493 | ||
@@ -766,7 +1519,7 @@ static struct file_operations futex_fops = { | |||
766 | * Signal allows caller to avoid the race which would occur if they | 1519 | * Signal allows caller to avoid the race which would occur if they |
767 | * set the sigio stuff up afterwards. | 1520 | * set the sigio stuff up afterwards. |
768 | */ | 1521 | */ |
769 | static int futex_fd(unsigned long uaddr, int signal) | 1522 | static int futex_fd(u32 __user *uaddr, int signal) |
770 | { | 1523 | { |
771 | struct futex_q *q; | 1524 | struct futex_q *q; |
772 | struct file *filp; | 1525 | struct file *filp; |
@@ -803,6 +1556,7 @@ static int futex_fd(unsigned long uaddr, int signal) | |||
803 | err = -ENOMEM; | 1556 | err = -ENOMEM; |
804 | goto error; | 1557 | goto error; |
805 | } | 1558 | } |
1559 | q->pi_state = NULL; | ||
806 | 1560 | ||
807 | down_read(¤t->mm->mmap_sem); | 1561 | down_read(¤t->mm->mmap_sem); |
808 | err = get_futex_key(uaddr, &q->key); | 1562 | err = get_futex_key(uaddr, &q->key); |
@@ -840,7 +1594,7 @@ error: | |||
840 | * Implementation: user-space maintains a per-thread list of locks it | 1594 | * Implementation: user-space maintains a per-thread list of locks it |
841 | * is holding. Upon do_exit(), the kernel carefully walks this list, | 1595 | * is holding. Upon do_exit(), the kernel carefully walks this list, |
842 | * and marks all locks that are owned by this thread with the | 1596 | * and marks all locks that are owned by this thread with the |
843 | * FUTEX_OWNER_DEAD bit, and wakes up a waiter (if any). The list is | 1597 | * FUTEX_OWNER_DIED bit, and wakes up a waiter (if any). The list is |
844 | * always manipulated with the lock held, so the list is private and | 1598 | * always manipulated with the lock held, so the list is private and |
845 | * per-thread. Userspace also maintains a per-thread 'list_op_pending' | 1599 | * per-thread. Userspace also maintains a per-thread 'list_op_pending' |
846 | * field, to allow the kernel to clean up if the thread dies after | 1600 | * field, to allow the kernel to clean up if the thread dies after |
@@ -915,7 +1669,7 @@ err_unlock: | |||
915 | */ | 1669 | */ |
916 | int handle_futex_death(u32 __user *uaddr, struct task_struct *curr) | 1670 | int handle_futex_death(u32 __user *uaddr, struct task_struct *curr) |
917 | { | 1671 | { |
918 | u32 uval; | 1672 | u32 uval, nval; |
919 | 1673 | ||
920 | retry: | 1674 | retry: |
921 | if (get_user(uval, uaddr)) | 1675 | if (get_user(uval, uaddr)) |
@@ -932,12 +1686,16 @@ retry: | |||
932 | * thread-death.) The rest of the cleanup is done in | 1686 | * thread-death.) The rest of the cleanup is done in |
933 | * userspace. | 1687 | * userspace. |
934 | */ | 1688 | */ |
935 | if (futex_atomic_cmpxchg_inatomic(uaddr, uval, | 1689 | nval = futex_atomic_cmpxchg_inatomic(uaddr, uval, |
936 | uval | FUTEX_OWNER_DIED) != uval) | 1690 | uval | FUTEX_OWNER_DIED); |
1691 | if (nval == -EFAULT) | ||
1692 | return -1; | ||
1693 | |||
1694 | if (nval != uval) | ||
937 | goto retry; | 1695 | goto retry; |
938 | 1696 | ||
939 | if (uval & FUTEX_WAITERS) | 1697 | if (uval & FUTEX_WAITERS) |
940 | futex_wake((unsigned long)uaddr, 1); | 1698 | futex_wake(uaddr, 1); |
941 | } | 1699 | } |
942 | return 0; | 1700 | return 0; |
943 | } | 1701 | } |
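handle_futex_death() only tags the word with FUTEX_OWNER_DIED and wakes one waiter; as the comment says, the rest of the cleanup is left to user space. A hedged sketch of the woken side of that contract (after_robust_acquire() and recover_state() are hypothetical; only the bit layout comes from the futex ABI):

    #include <stdint.h>
    #include <linux/futex.h>        /* FUTEX_OWNER_DIED */

    /* After taking a robust lock, check whether the previous owner died
     * while holding it; if so the protected data may be inconsistent and
     * must be repaired before normal use. */
    static void after_robust_acquire(uint32_t *futex_word, void (*recover_state)(void))
    {
            uint32_t val = __atomic_load_n(futex_word, __ATOMIC_ACQUIRE);

            if (val & FUTEX_OWNER_DIED) {
                    recover_state();
                    /* Clear just the owner-died flag once the state is repaired,
                     * leaving the TID and any waiters bit untouched. */
                    __atomic_and_fetch(futex_word, ~(uint32_t)FUTEX_OWNER_DIED,
                                       __ATOMIC_RELEASE);
            }
    }

In glibc's robust mutexes the same situation surfaces as pthread_mutex_lock() returning EOWNERDEAD, with pthread_mutex_consistent() playing the recover_state() role.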
@@ -978,7 +1736,7 @@ void exit_robust_list(struct task_struct *curr) | |||
978 | while (entry != &head->list) { | 1736 | while (entry != &head->list) { |
979 | /* | 1737 | /* |
980 | * A pending lock might already be on the list, so | 1738 | * A pending lock might already be on the list, so |
981 | * dont process it twice: | 1739 | * don't process it twice: |
982 | */ | 1740 | */ |
983 | if (entry != pending) | 1741 | if (entry != pending) |
984 | if (handle_futex_death((void *)entry + futex_offset, | 1742 | if (handle_futex_death((void *)entry + futex_offset, |
@@ -999,8 +1757,8 @@ void exit_robust_list(struct task_struct *curr) | |||
999 | } | 1757 | } |
1000 | } | 1758 | } |
1001 | 1759 | ||
1002 | long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout, | 1760 | long do_futex(u32 __user *uaddr, int op, u32 val, unsigned long timeout, |
1003 | unsigned long uaddr2, int val2, int val3) | 1761 | u32 __user *uaddr2, u32 val2, u32 val3) |
1004 | { | 1762 | { |
1005 | int ret; | 1763 | int ret; |
1006 | 1764 | ||
@@ -1024,6 +1782,15 @@ long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout, | |||
1024 | case FUTEX_WAKE_OP: | 1782 | case FUTEX_WAKE_OP: |
1025 | ret = futex_wake_op(uaddr, uaddr2, val, val2, val3); | 1783 | ret = futex_wake_op(uaddr, uaddr2, val, val2, val3); |
1026 | break; | 1784 | break; |
1785 | case FUTEX_LOCK_PI: | ||
1786 | ret = futex_lock_pi(uaddr, val, timeout, val2, 0); | ||
1787 | break; | ||
1788 | case FUTEX_UNLOCK_PI: | ||
1789 | ret = futex_unlock_pi(uaddr); | ||
1790 | break; | ||
1791 | case FUTEX_TRYLOCK_PI: | ||
1792 | ret = futex_lock_pi(uaddr, 0, timeout, val2, 1); | ||
1793 | break; | ||
1027 | default: | 1794 | default: |
1028 | ret = -ENOSYS; | 1795 | ret = -ENOSYS; |
1029 | } | 1796 | } |
@@ -1031,29 +1798,33 @@ long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout, | |||
1031 | } | 1798 | } |
1032 | 1799 | ||
1033 | 1800 | ||
1034 | asmlinkage long sys_futex(u32 __user *uaddr, int op, int val, | 1801 | asmlinkage long sys_futex(u32 __user *uaddr, int op, u32 val, |
1035 | struct timespec __user *utime, u32 __user *uaddr2, | 1802 | struct timespec __user *utime, u32 __user *uaddr2, |
1036 | int val3) | 1803 | u32 val3) |
1037 | { | 1804 | { |
1038 | struct timespec t; | 1805 | struct timespec t; |
1039 | unsigned long timeout = MAX_SCHEDULE_TIMEOUT; | 1806 | unsigned long timeout = MAX_SCHEDULE_TIMEOUT; |
1040 | int val2 = 0; | 1807 | u32 val2 = 0; |
1041 | 1808 | ||
1042 | if (utime && (op == FUTEX_WAIT)) { | 1809 | if (utime && (op == FUTEX_WAIT || op == FUTEX_LOCK_PI)) { |
1043 | if (copy_from_user(&t, utime, sizeof(t)) != 0) | 1810 | if (copy_from_user(&t, utime, sizeof(t)) != 0) |
1044 | return -EFAULT; | 1811 | return -EFAULT; |
1045 | if (!timespec_valid(&t)) | 1812 | if (!timespec_valid(&t)) |
1046 | return -EINVAL; | 1813 | return -EINVAL; |
1047 | timeout = timespec_to_jiffies(&t) + 1; | 1814 | if (op == FUTEX_WAIT) |
1815 | timeout = timespec_to_jiffies(&t) + 1; | ||
1816 | else { | ||
1817 | timeout = t.tv_sec; | ||
1818 | val2 = t.tv_nsec; | ||
1819 | } | ||
1048 | } | 1820 | } |
1049 | /* | 1821 | /* |
1050 | * requeue parameter in 'utime' if op == FUTEX_REQUEUE. | 1822 | * requeue parameter in 'utime' if op == FUTEX_REQUEUE. |
1051 | */ | 1823 | */ |
1052 | if (op >= FUTEX_REQUEUE) | 1824 | if (op == FUTEX_REQUEUE || op == FUTEX_CMP_REQUEUE) |
1053 | val2 = (int) (unsigned long) utime; | 1825 | val2 = (u32) (unsigned long) utime; |
1054 | 1826 | ||
1055 | return do_futex((unsigned long)uaddr, op, val, timeout, | 1827 | return do_futex(uaddr, op, val, timeout, uaddr2, val2, val3); |
1056 | (unsigned long)uaddr2, val2, val3); | ||
1057 | } | 1828 | } |
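Note the new utime handling just above: for FUTEX_WAIT the timespec is still converted to a relative jiffies timeout, while for FUTEX_LOCK_PI tv_sec/tv_nsec are passed straight through and, per the CLOCK_REALTIME/HRTIMER_ABS hrtimer setup in futex_lock_pi(), treated as an absolute wall-clock deadline. A user-space sketch of building such a deadline (illustrative helper, same raw-syscall setup as the earlier examples):

    #include <time.h>
    #include <stdint.h>
    #include <unistd.h>
    #include <sys/syscall.h>
    #include <linux/futex.h>

    /* Try to take a PI futex, giving up 500 ms from now.  The kernel
     * interprets the timespec as an absolute CLOCK_REALTIME time. */
    static int pi_lock_with_deadline(uint32_t *futex_word)
    {
            struct timespec deadline;

            clock_gettime(CLOCK_REALTIME, &deadline);
            deadline.tv_nsec += 500 * 1000 * 1000;
            if (deadline.tv_nsec >= 1000000000L) {
                    deadline.tv_nsec -= 1000000000L;
                    deadline.tv_sec += 1;
            }
            return syscall(SYS_futex, futex_word, FUTEX_LOCK_PI, 0,
                           &deadline, NULL, 0);
    }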
1058 | 1829 | ||
1059 | static int futexfs_get_sb(struct file_system_type *fs_type, | 1830 | static int futexfs_get_sb(struct file_system_type *fs_type, |
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c index 1ab6a0ea3d14..d1d92b441fb7 100644 --- a/kernel/futex_compat.c +++ b/kernel/futex_compat.c | |||
@@ -129,16 +129,20 @@ asmlinkage long compat_sys_futex(u32 __user *uaddr, int op, u32 val, | |||
129 | unsigned long timeout = MAX_SCHEDULE_TIMEOUT; | 129 | unsigned long timeout = MAX_SCHEDULE_TIMEOUT; |
130 | int val2 = 0; | 130 | int val2 = 0; |
131 | 131 | ||
132 | if (utime && (op == FUTEX_WAIT)) { | 132 | if (utime && (op == FUTEX_WAIT || op == FUTEX_LOCK_PI)) { |
133 | if (get_compat_timespec(&t, utime)) | 133 | if (get_compat_timespec(&t, utime)) |
134 | return -EFAULT; | 134 | return -EFAULT; |
135 | if (!timespec_valid(&t)) | 135 | if (!timespec_valid(&t)) |
136 | return -EINVAL; | 136 | return -EINVAL; |
137 | timeout = timespec_to_jiffies(&t) + 1; | 137 | if (op == FUTEX_WAIT) |
138 | timeout = timespec_to_jiffies(&t) + 1; | ||
139 | else { | ||
140 | timeout = t.tv_sec; | ||
141 | val2 = t.tv_nsec; | ||
142 | } | ||
138 | } | 143 | } |
139 | if (op >= FUTEX_REQUEUE) | 144 | if (op == FUTEX_REQUEUE || op == FUTEX_CMP_REQUEUE) |
140 | val2 = (int) (unsigned long) utime; | 145 | val2 = (int) (unsigned long) utime; |
141 | 146 | ||
142 | return do_futex((unsigned long)uaddr, op, val, timeout, | 147 | return do_futex(uaddr, op, val, timeout, uaddr2, val2, val3); |
143 | (unsigned long)uaddr2, val2, val3); | ||
144 | } | 148 | } |
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 55601b3ce60e..8d3dc29ef41a 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c | |||
@@ -833,7 +833,7 @@ static void migrate_hrtimers(int cpu) | |||
833 | } | 833 | } |
834 | #endif /* CONFIG_HOTPLUG_CPU */ | 834 | #endif /* CONFIG_HOTPLUG_CPU */ |
835 | 835 | ||
836 | static int hrtimer_cpu_notify(struct notifier_block *self, | 836 | static int __devinit hrtimer_cpu_notify(struct notifier_block *self, |
837 | unsigned long action, void *hcpu) | 837 | unsigned long action, void *hcpu) |
838 | { | 838 | { |
839 | long cpu = (long)hcpu; | 839 | long cpu = (long)hcpu; |
@@ -857,7 +857,7 @@ static int hrtimer_cpu_notify(struct notifier_block *self, | |||
857 | return NOTIFY_OK; | 857 | return NOTIFY_OK; |
858 | } | 858 | } |
859 | 859 | ||
860 | static struct notifier_block hrtimers_nb = { | 860 | static struct notifier_block __devinitdata hrtimers_nb = { |
861 | .notifier_call = hrtimer_cpu_notify, | 861 | .notifier_call = hrtimer_cpu_notify, |
862 | }; | 862 | }; |
863 | 863 | ||
diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 1fbf466a29aa..64aab081153b 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c | |||
@@ -47,11 +47,17 @@ | |||
47 | 47 | ||
48 | static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE]; | 48 | static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE]; |
49 | static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE]; | 49 | static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE]; |
50 | static atomic_t kprobe_count; | ||
50 | 51 | ||
51 | DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */ | 52 | DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */ |
52 | DEFINE_SPINLOCK(kretprobe_lock); /* Protects kretprobe_inst_table */ | 53 | DEFINE_SPINLOCK(kretprobe_lock); /* Protects kretprobe_inst_table */ |
53 | static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; | 54 | static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; |
54 | 55 | ||
56 | static struct notifier_block kprobe_page_fault_nb = { | ||
57 | .notifier_call = kprobe_exceptions_notify, | ||
58 | .priority = 0x7fffffff /* we need to be notified first */ | ||
59 | }; | ||
60 | |||
55 | #ifdef __ARCH_WANT_KPROBES_INSN_SLOT | 61 | #ifdef __ARCH_WANT_KPROBES_INSN_SLOT |
56 | /* | 62 | /* |
57 | * kprobe->ainsn.insn points to the copy of the instruction to be | 63 | * kprobe->ainsn.insn points to the copy of the instruction to be |
@@ -368,16 +374,15 @@ static inline void copy_kprobe(struct kprobe *old_p, struct kprobe *p) | |||
368 | */ | 374 | */ |
369 | static int __kprobes add_new_kprobe(struct kprobe *old_p, struct kprobe *p) | 375 | static int __kprobes add_new_kprobe(struct kprobe *old_p, struct kprobe *p) |
370 | { | 376 | { |
371 | struct kprobe *kp; | ||
372 | |||
373 | if (p->break_handler) { | 377 | if (p->break_handler) { |
374 | list_for_each_entry_rcu(kp, &old_p->list, list) { | 378 | if (old_p->break_handler) |
375 | if (kp->break_handler) | 379 | return -EEXIST; |
376 | return -EEXIST; | ||
377 | } | ||
378 | list_add_tail_rcu(&p->list, &old_p->list); | 380 | list_add_tail_rcu(&p->list, &old_p->list); |
381 | old_p->break_handler = aggr_break_handler; | ||
379 | } else | 382 | } else |
380 | list_add_rcu(&p->list, &old_p->list); | 383 | list_add_rcu(&p->list, &old_p->list); |
384 | if (p->post_handler && !old_p->post_handler) | ||
385 | old_p->post_handler = aggr_post_handler; | ||
381 | return 0; | 386 | return 0; |
382 | } | 387 | } |
383 | 388 | ||
@@ -390,9 +395,11 @@ static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p) | |||
390 | copy_kprobe(p, ap); | 395 | copy_kprobe(p, ap); |
391 | ap->addr = p->addr; | 396 | ap->addr = p->addr; |
392 | ap->pre_handler = aggr_pre_handler; | 397 | ap->pre_handler = aggr_pre_handler; |
393 | ap->post_handler = aggr_post_handler; | ||
394 | ap->fault_handler = aggr_fault_handler; | 398 | ap->fault_handler = aggr_fault_handler; |
395 | ap->break_handler = aggr_break_handler; | 399 | if (p->post_handler) |
400 | ap->post_handler = aggr_post_handler; | ||
401 | if (p->break_handler) | ||
402 | ap->break_handler = aggr_break_handler; | ||
396 | 403 | ||
397 | INIT_LIST_HEAD(&ap->list); | 404 | INIT_LIST_HEAD(&ap->list); |
398 | list_add_rcu(&p->list, &ap->list); | 405 | list_add_rcu(&p->list, &ap->list); |
@@ -464,6 +471,8 @@ static int __kprobes __register_kprobe(struct kprobe *p, | |||
464 | old_p = get_kprobe(p->addr); | 471 | old_p = get_kprobe(p->addr); |
465 | if (old_p) { | 472 | if (old_p) { |
466 | ret = register_aggr_kprobe(old_p, p); | 473 | ret = register_aggr_kprobe(old_p, p); |
474 | if (!ret) | ||
475 | atomic_inc(&kprobe_count); | ||
467 | goto out; | 476 | goto out; |
468 | } | 477 | } |
469 | 478 | ||
@@ -474,6 +483,10 @@ static int __kprobes __register_kprobe(struct kprobe *p, | |||
474 | hlist_add_head_rcu(&p->hlist, | 483 | hlist_add_head_rcu(&p->hlist, |
475 | &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]); | 484 | &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]); |
476 | 485 | ||
486 | if (atomic_add_return(1, &kprobe_count) == \ | ||
487 | (ARCH_INACTIVE_KPROBE_COUNT + 1)) | ||
488 | register_page_fault_notifier(&kprobe_page_fault_nb); | ||
489 | |||
477 | arch_arm_kprobe(p); | 490 | arch_arm_kprobe(p); |
478 | 491 | ||
479 | out: | 492 | out: |
@@ -536,14 +549,40 @@ valid_p: | |||
536 | kfree(old_p); | 549 | kfree(old_p); |
537 | } | 550 | } |
538 | arch_remove_kprobe(p); | 551 | arch_remove_kprobe(p); |
552 | } else { | ||
553 | mutex_lock(&kprobe_mutex); | ||
554 | if (p->break_handler) | ||
555 | old_p->break_handler = NULL; | ||
556 | if (p->post_handler) { | ||
557 | list_for_each_entry_rcu(list_p, &old_p->list, list) { | ||
558 | if (list_p->post_handler) { | ||
559 | cleanup_p = 2; | ||
560 | break; | ||
561 | } | ||
562 | } | ||
563 | if (cleanup_p == 0) | ||
564 | old_p->post_handler = NULL; | ||
565 | } | ||
566 | mutex_unlock(&kprobe_mutex); | ||
539 | } | 567 | } |
568 | |||
569 | /* Call unregister_page_fault_notifier() | ||
570 | * if no probes are active | ||
571 | */ | ||
572 | mutex_lock(&kprobe_mutex); | ||
573 | if (atomic_add_return(-1, &kprobe_count) == \ | ||
574 | ARCH_INACTIVE_KPROBE_COUNT) | ||
575 | unregister_page_fault_notifier(&kprobe_page_fault_nb); | ||
576 | mutex_unlock(&kprobe_mutex); | ||
577 | return; | ||
540 | } | 578 | } |
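The kprobe_count changes above follow a "register on first real user, unregister on last" pattern: the page-fault notifier is only installed while the number of armed probes exceeds ARCH_INACTIVE_KPROBE_COUNT, so an idle kprobes subsystem adds nothing to the fault path. The bare pattern, as a stand-alone user-space analogue with C11 atomics (enable_hook()/disable_hook() stand in for the notifier calls; the kernel version additionally serializes teardown under kprobe_mutex):

    #include <stdio.h>
    #include <stdatomic.h>

    static atomic_int user_count;

    static void enable_hook(void)  { puts("hook registered");   }  /* stand-in */
    static void disable_hook(void) { puts("hook unregistered"); }  /* stand-in */

    /* First user installs the hook ... */
    static void probe_ref_get(void)
    {
            if (atomic_fetch_add(&user_count, 1) == 0)
                    enable_hook();
    }

    /* ... last user tears it down again. */
    static void probe_ref_put(void)
    {
            if (atomic_fetch_sub(&user_count, 1) == 1)
                    disable_hook();
    }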
541 | 579 | ||
542 | static struct notifier_block kprobe_exceptions_nb = { | 580 | static struct notifier_block kprobe_exceptions_nb = { |
543 | .notifier_call = kprobe_exceptions_notify, | 581 | .notifier_call = kprobe_exceptions_notify, |
544 | .priority = 0x7fffffff /* we need to notified first */ | 582 | .priority = 0x7fffffff /* we need to be notified first */ |
545 | }; | 583 | }; |
546 | 584 | ||
585 | |||
547 | int __kprobes register_jprobe(struct jprobe *jp) | 586 | int __kprobes register_jprobe(struct jprobe *jp) |
548 | { | 587 | { |
549 | /* Todo: Verify probepoint is a function entry point */ | 588 | /* Todo: Verify probepoint is a function entry point */ |
@@ -652,6 +691,7 @@ static int __init init_kprobes(void) | |||
652 | INIT_HLIST_HEAD(&kprobe_table[i]); | 691 | INIT_HLIST_HEAD(&kprobe_table[i]); |
653 | INIT_HLIST_HEAD(&kretprobe_inst_table[i]); | 692 | INIT_HLIST_HEAD(&kretprobe_inst_table[i]); |
654 | } | 693 | } |
694 | atomic_set(&kprobe_count, 0); | ||
655 | 695 | ||
656 | err = arch_init_kprobes(); | 696 | err = arch_init_kprobes(); |
657 | if (!err) | 697 | if (!err) |
diff --git a/kernel/module.c b/kernel/module.c index d75275de1c28..10e5b872adf6 100644 --- a/kernel/module.c +++ b/kernel/module.c | |||
@@ -40,9 +40,11 @@ | |||
40 | #include <linux/string.h> | 40 | #include <linux/string.h> |
41 | #include <linux/sched.h> | 41 | #include <linux/sched.h> |
42 | #include <linux/mutex.h> | 42 | #include <linux/mutex.h> |
43 | #include <linux/unwind.h> | ||
43 | #include <asm/uaccess.h> | 44 | #include <asm/uaccess.h> |
44 | #include <asm/semaphore.h> | 45 | #include <asm/semaphore.h> |
45 | #include <asm/cacheflush.h> | 46 | #include <asm/cacheflush.h> |
47 | #include <linux/license.h> | ||
46 | 48 | ||
47 | #if 0 | 49 | #if 0 |
48 | #define DEBUGP printk | 50 | #define DEBUGP printk |
@@ -1051,6 +1053,8 @@ static void free_module(struct module *mod) | |||
1051 | remove_sect_attrs(mod); | 1053 | remove_sect_attrs(mod); |
1052 | mod_kobject_remove(mod); | 1054 | mod_kobject_remove(mod); |
1053 | 1055 | ||
1056 | unwind_remove_table(mod->unwind_info, 0); | ||
1057 | |||
1054 | /* Arch-specific cleanup. */ | 1058 | /* Arch-specific cleanup. */ |
1055 | module_arch_cleanup(mod); | 1059 | module_arch_cleanup(mod); |
1056 | 1060 | ||
@@ -1248,16 +1252,6 @@ static void layout_sections(struct module *mod, | |||
1248 | } | 1252 | } |
1249 | } | 1253 | } |
1250 | 1254 | ||
1251 | static inline int license_is_gpl_compatible(const char *license) | ||
1252 | { | ||
1253 | return (strcmp(license, "GPL") == 0 | ||
1254 | || strcmp(license, "GPL v2") == 0 | ||
1255 | || strcmp(license, "GPL and additional rights") == 0 | ||
1256 | || strcmp(license, "Dual BSD/GPL") == 0 | ||
1257 | || strcmp(license, "Dual MIT/GPL") == 0 | ||
1258 | || strcmp(license, "Dual MPL/GPL") == 0); | ||
1259 | } | ||
1260 | |||
1261 | static void set_license(struct module *mod, const char *license) | 1255 | static void set_license(struct module *mod, const char *license) |
1262 | { | 1256 | { |
1263 | if (!license) | 1257 | if (!license) |
@@ -1412,7 +1406,7 @@ static struct module *load_module(void __user *umod, | |||
1412 | unsigned int i, symindex = 0, strindex = 0, setupindex, exindex, | 1406 | unsigned int i, symindex = 0, strindex = 0, setupindex, exindex, |
1413 | exportindex, modindex, obsparmindex, infoindex, gplindex, | 1407 | exportindex, modindex, obsparmindex, infoindex, gplindex, |
1414 | crcindex, gplcrcindex, versindex, pcpuindex, gplfutureindex, | 1408 | crcindex, gplcrcindex, versindex, pcpuindex, gplfutureindex, |
1415 | gplfuturecrcindex; | 1409 | gplfuturecrcindex, unwindex = 0; |
1416 | struct module *mod; | 1410 | struct module *mod; |
1417 | long err = 0; | 1411 | long err = 0; |
1418 | void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */ | 1412 | void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */ |
@@ -1502,6 +1496,9 @@ static struct module *load_module(void __user *umod, | |||
1502 | versindex = find_sec(hdr, sechdrs, secstrings, "__versions"); | 1496 | versindex = find_sec(hdr, sechdrs, secstrings, "__versions"); |
1503 | infoindex = find_sec(hdr, sechdrs, secstrings, ".modinfo"); | 1497 | infoindex = find_sec(hdr, sechdrs, secstrings, ".modinfo"); |
1504 | pcpuindex = find_pcpusec(hdr, sechdrs, secstrings); | 1498 | pcpuindex = find_pcpusec(hdr, sechdrs, secstrings); |
1499 | #ifdef ARCH_UNWIND_SECTION_NAME | ||
1500 | unwindex = find_sec(hdr, sechdrs, secstrings, ARCH_UNWIND_SECTION_NAME); | ||
1501 | #endif | ||
1505 | 1502 | ||
1506 | /* Don't keep modinfo section */ | 1503 | /* Don't keep modinfo section */ |
1507 | sechdrs[infoindex].sh_flags &= ~(unsigned long)SHF_ALLOC; | 1504 | sechdrs[infoindex].sh_flags &= ~(unsigned long)SHF_ALLOC; |
@@ -1510,6 +1507,8 @@ static struct module *load_module(void __user *umod, | |||
1510 | sechdrs[symindex].sh_flags |= SHF_ALLOC; | 1507 | sechdrs[symindex].sh_flags |= SHF_ALLOC; |
1511 | sechdrs[strindex].sh_flags |= SHF_ALLOC; | 1508 | sechdrs[strindex].sh_flags |= SHF_ALLOC; |
1512 | #endif | 1509 | #endif |
1510 | if (unwindex) | ||
1511 | sechdrs[unwindex].sh_flags |= SHF_ALLOC; | ||
1513 | 1512 | ||
1514 | /* Check module struct version now, before we try to use module. */ | 1513 | /* Check module struct version now, before we try to use module. */ |
1515 | if (!check_modstruct_version(sechdrs, versindex, mod)) { | 1514 | if (!check_modstruct_version(sechdrs, versindex, mod)) { |
@@ -1738,6 +1737,11 @@ static struct module *load_module(void __user *umod, | |||
1738 | goto arch_cleanup; | 1737 | goto arch_cleanup; |
1739 | add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs); | 1738 | add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs); |
1740 | 1739 | ||
1740 | /* Size of section 0 is 0, so this works well if no unwind info. */ | ||
1741 | mod->unwind_info = unwind_add_table(mod, | ||
1742 | (void *)sechdrs[unwindex].sh_addr, | ||
1743 | sechdrs[unwindex].sh_size); | ||
1744 | |||
1741 | /* Get rid of temporary copy */ | 1745 | /* Get rid of temporary copy */ |
1742 | vfree(hdr); | 1746 | vfree(hdr); |
1743 | 1747 | ||
@@ -1836,6 +1840,7 @@ sys_init_module(void __user *umod, | |||
1836 | mod->state = MODULE_STATE_LIVE; | 1840 | mod->state = MODULE_STATE_LIVE; |
1837 | /* Drop initial reference. */ | 1841 | /* Drop initial reference. */ |
1838 | module_put(mod); | 1842 | module_put(mod); |
1843 | unwind_remove_table(mod->unwind_info, 1); | ||
1839 | module_free(mod, mod->module_init); | 1844 | module_free(mod, mod->module_init); |
1840 | mod->module_init = NULL; | 1845 | mod->module_init = NULL; |
1841 | mod->init_size = 0; | 1846 | mod->init_size = 0; |
diff --git a/kernel/mutex-debug.c b/kernel/mutex-debug.c index f4913c376950..e38e4bac97ca 100644 --- a/kernel/mutex-debug.c +++ b/kernel/mutex-debug.c | |||
@@ -16,6 +16,7 @@ | |||
16 | #include <linux/sched.h> | 16 | #include <linux/sched.h> |
17 | #include <linux/delay.h> | 17 | #include <linux/delay.h> |
18 | #include <linux/module.h> | 18 | #include <linux/module.h> |
19 | #include <linux/poison.h> | ||
19 | #include <linux/spinlock.h> | 20 | #include <linux/spinlock.h> |
20 | #include <linux/kallsyms.h> | 21 | #include <linux/kallsyms.h> |
21 | #include <linux/interrupt.h> | 22 | #include <linux/interrupt.h> |
@@ -153,13 +154,13 @@ next: | |||
153 | continue; | 154 | continue; |
154 | count++; | 155 | count++; |
155 | cursor = curr->next; | 156 | cursor = curr->next; |
156 | debug_spin_lock_restore(&debug_mutex_lock, flags); | 157 | debug_spin_unlock_restore(&debug_mutex_lock, flags); |
157 | 158 | ||
158 | printk("\n#%03d: ", count); | 159 | printk("\n#%03d: ", count); |
159 | printk_lock(lock, filter ? 0 : 1); | 160 | printk_lock(lock, filter ? 0 : 1); |
160 | goto next; | 161 | goto next; |
161 | } | 162 | } |
162 | debug_spin_lock_restore(&debug_mutex_lock, flags); | 163 | debug_spin_unlock_restore(&debug_mutex_lock, flags); |
163 | printk("\n"); | 164 | printk("\n"); |
164 | } | 165 | } |
165 | 166 | ||
@@ -316,7 +317,7 @@ void mutex_debug_check_no_locks_held(struct task_struct *task) | |||
316 | continue; | 317 | continue; |
317 | list_del_init(curr); | 318 | list_del_init(curr); |
318 | DEBUG_OFF(); | 319 | DEBUG_OFF(); |
319 | debug_spin_lock_restore(&debug_mutex_lock, flags); | 320 | debug_spin_unlock_restore(&debug_mutex_lock, flags); |
320 | 321 | ||
321 | printk("BUG: %s/%d, lock held at task exit time!\n", | 322 | printk("BUG: %s/%d, lock held at task exit time!\n", |
322 | task->comm, task->pid); | 323 | task->comm, task->pid); |
@@ -325,7 +326,7 @@ void mutex_debug_check_no_locks_held(struct task_struct *task) | |||
325 | printk("exiting task is not even the owner??\n"); | 326 | printk("exiting task is not even the owner??\n"); |
326 | return; | 327 | return; |
327 | } | 328 | } |
328 | debug_spin_lock_restore(&debug_mutex_lock, flags); | 329 | debug_spin_unlock_restore(&debug_mutex_lock, flags); |
329 | } | 330 | } |
330 | 331 | ||
331 | /* | 332 | /* |
@@ -352,7 +353,7 @@ void mutex_debug_check_no_locks_freed(const void *from, unsigned long len) | |||
352 | continue; | 353 | continue; |
353 | list_del_init(curr); | 354 | list_del_init(curr); |
354 | DEBUG_OFF(); | 355 | DEBUG_OFF(); |
355 | debug_spin_lock_restore(&debug_mutex_lock, flags); | 356 | debug_spin_unlock_restore(&debug_mutex_lock, flags); |
356 | 357 | ||
357 | printk("BUG: %s/%d, active lock [%p(%p-%p)] freed!\n", | 358 | printk("BUG: %s/%d, active lock [%p(%p-%p)] freed!\n", |
358 | current->comm, current->pid, lock, from, to); | 359 | current->comm, current->pid, lock, from, to); |
@@ -362,7 +363,7 @@ void mutex_debug_check_no_locks_freed(const void *from, unsigned long len) | |||
362 | printk("freeing task is not even the owner??\n"); | 363 | printk("freeing task is not even the owner??\n"); |
363 | return; | 364 | return; |
364 | } | 365 | } |
365 | debug_spin_lock_restore(&debug_mutex_lock, flags); | 366 | debug_spin_unlock_restore(&debug_mutex_lock, flags); |
366 | } | 367 | } |
367 | 368 | ||
368 | /* | 369 | /* |
@@ -381,7 +382,7 @@ void debug_mutex_set_owner(struct mutex *lock, | |||
381 | 382 | ||
382 | void debug_mutex_init_waiter(struct mutex_waiter *waiter) | 383 | void debug_mutex_init_waiter(struct mutex_waiter *waiter) |
383 | { | 384 | { |
384 | memset(waiter, 0x11, sizeof(*waiter)); | 385 | memset(waiter, MUTEX_DEBUG_INIT, sizeof(*waiter)); |
385 | waiter->magic = waiter; | 386 | waiter->magic = waiter; |
386 | INIT_LIST_HEAD(&waiter->list); | 387 | INIT_LIST_HEAD(&waiter->list); |
387 | } | 388 | } |
@@ -397,7 +398,7 @@ void debug_mutex_wake_waiter(struct mutex *lock, struct mutex_waiter *waiter) | |||
397 | void debug_mutex_free_waiter(struct mutex_waiter *waiter) | 398 | void debug_mutex_free_waiter(struct mutex_waiter *waiter) |
398 | { | 399 | { |
399 | DEBUG_WARN_ON(!list_empty(&waiter->list)); | 400 | DEBUG_WARN_ON(!list_empty(&waiter->list)); |
400 | memset(waiter, 0x22, sizeof(*waiter)); | 401 | memset(waiter, MUTEX_DEBUG_FREE, sizeof(*waiter)); |
401 | } | 402 | } |
402 | 403 | ||
403 | void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter, | 404 | void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter, |
diff --git a/kernel/mutex-debug.h b/kernel/mutex-debug.h index fd384050acb1..a5196c36a5fd 100644 --- a/kernel/mutex-debug.h +++ b/kernel/mutex-debug.h | |||
@@ -46,21 +46,6 @@ extern void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter, | |||
46 | extern void debug_mutex_unlock(struct mutex *lock); | 46 | extern void debug_mutex_unlock(struct mutex *lock); |
47 | extern void debug_mutex_init(struct mutex *lock, const char *name); | 47 | extern void debug_mutex_init(struct mutex *lock, const char *name); |
48 | 48 | ||
49 | #define debug_spin_lock(lock) \ | ||
50 | do { \ | ||
51 | local_irq_disable(); \ | ||
52 | if (debug_mutex_on) \ | ||
53 | spin_lock(lock); \ | ||
54 | } while (0) | ||
55 | |||
56 | #define debug_spin_unlock(lock) \ | ||
57 | do { \ | ||
58 | if (debug_mutex_on) \ | ||
59 | spin_unlock(lock); \ | ||
60 | local_irq_enable(); \ | ||
61 | preempt_check_resched(); \ | ||
62 | } while (0) | ||
63 | |||
64 | #define debug_spin_lock_save(lock, flags) \ | 49 | #define debug_spin_lock_save(lock, flags) \ |
65 | do { \ | 50 | do { \ |
66 | local_irq_save(flags); \ | 51 | local_irq_save(flags); \ |
@@ -68,7 +53,7 @@ extern void debug_mutex_init(struct mutex *lock, const char *name); | |||
68 | spin_lock(lock); \ | 53 | spin_lock(lock); \ |
69 | } while (0) | 54 | } while (0) |
70 | 55 | ||
71 | #define debug_spin_lock_restore(lock, flags) \ | 56 | #define debug_spin_unlock_restore(lock, flags) \ |
72 | do { \ | 57 | do { \ |
73 | if (debug_mutex_on) \ | 58 | if (debug_mutex_on) \ |
74 | spin_unlock(lock); \ | 59 | spin_unlock(lock); \ |
@@ -76,20 +61,20 @@ extern void debug_mutex_init(struct mutex *lock, const char *name); | |||
76 | preempt_check_resched(); \ | 61 | preempt_check_resched(); \ |
77 | } while (0) | 62 | } while (0) |
78 | 63 | ||
79 | #define spin_lock_mutex(lock) \ | 64 | #define spin_lock_mutex(lock, flags) \ |
80 | do { \ | 65 | do { \ |
81 | struct mutex *l = container_of(lock, struct mutex, wait_lock); \ | 66 | struct mutex *l = container_of(lock, struct mutex, wait_lock); \ |
82 | \ | 67 | \ |
83 | DEBUG_WARN_ON(in_interrupt()); \ | 68 | DEBUG_WARN_ON(in_interrupt()); \ |
84 | debug_spin_lock(&debug_mutex_lock); \ | 69 | debug_spin_lock_save(&debug_mutex_lock, flags); \ |
85 | spin_lock(lock); \ | 70 | spin_lock(lock); \ |
86 | DEBUG_WARN_ON(l->magic != l); \ | 71 | DEBUG_WARN_ON(l->magic != l); \ |
87 | } while (0) | 72 | } while (0) |
88 | 73 | ||
89 | #define spin_unlock_mutex(lock) \ | 74 | #define spin_unlock_mutex(lock, flags) \ |
90 | do { \ | 75 | do { \ |
91 | spin_unlock(lock); \ | 76 | spin_unlock(lock); \ |
92 | debug_spin_unlock(&debug_mutex_lock); \ | 77 | debug_spin_unlock_restore(&debug_mutex_lock, flags); \ |
93 | } while (0) | 78 | } while (0) |
94 | 79 | ||
95 | #define DEBUG_OFF() \ | 80 | #define DEBUG_OFF() \ |
diff --git a/kernel/mutex.c b/kernel/mutex.c index 5449b210d9ed..7043db21bbce 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c | |||
@@ -125,10 +125,11 @@ __mutex_lock_common(struct mutex *lock, long state __IP_DECL__) | |||
125 | struct task_struct *task = current; | 125 | struct task_struct *task = current; |
126 | struct mutex_waiter waiter; | 126 | struct mutex_waiter waiter; |
127 | unsigned int old_val; | 127 | unsigned int old_val; |
128 | unsigned long flags; | ||
128 | 129 | ||
129 | debug_mutex_init_waiter(&waiter); | 130 | debug_mutex_init_waiter(&waiter); |
130 | 131 | ||
131 | spin_lock_mutex(&lock->wait_lock); | 132 | spin_lock_mutex(&lock->wait_lock, flags); |
132 | 133 | ||
133 | debug_mutex_add_waiter(lock, &waiter, task->thread_info, ip); | 134 | debug_mutex_add_waiter(lock, &waiter, task->thread_info, ip); |
134 | 135 | ||
@@ -157,7 +158,7 @@ __mutex_lock_common(struct mutex *lock, long state __IP_DECL__) | |||
157 | if (unlikely(state == TASK_INTERRUPTIBLE && | 158 | if (unlikely(state == TASK_INTERRUPTIBLE && |
158 | signal_pending(task))) { | 159 | signal_pending(task))) { |
159 | mutex_remove_waiter(lock, &waiter, task->thread_info); | 160 | mutex_remove_waiter(lock, &waiter, task->thread_info); |
160 | spin_unlock_mutex(&lock->wait_lock); | 161 | spin_unlock_mutex(&lock->wait_lock, flags); |
161 | 162 | ||
162 | debug_mutex_free_waiter(&waiter); | 163 | debug_mutex_free_waiter(&waiter); |
163 | return -EINTR; | 164 | return -EINTR; |
@@ -165,9 +166,9 @@ __mutex_lock_common(struct mutex *lock, long state __IP_DECL__) | |||
165 | __set_task_state(task, state); | 166 | __set_task_state(task, state); |
166 | 167 | ||
167 | /* didnt get the lock, go to sleep: */ | 168 | /* didnt get the lock, go to sleep: */ |
168 | spin_unlock_mutex(&lock->wait_lock); | 169 | spin_unlock_mutex(&lock->wait_lock, flags); |
169 | schedule(); | 170 | schedule(); |
170 | spin_lock_mutex(&lock->wait_lock); | 171 | spin_lock_mutex(&lock->wait_lock, flags); |
171 | } | 172 | } |
172 | 173 | ||
173 | /* got the lock - rejoice! */ | 174 | /* got the lock - rejoice! */ |
@@ -178,7 +179,7 @@ __mutex_lock_common(struct mutex *lock, long state __IP_DECL__) | |||
178 | if (likely(list_empty(&lock->wait_list))) | 179 | if (likely(list_empty(&lock->wait_list))) |
179 | atomic_set(&lock->count, 0); | 180 | atomic_set(&lock->count, 0); |
180 | 181 | ||
181 | spin_unlock_mutex(&lock->wait_lock); | 182 | spin_unlock_mutex(&lock->wait_lock, flags); |
182 | 183 | ||
183 | debug_mutex_free_waiter(&waiter); | 184 | debug_mutex_free_waiter(&waiter); |
184 | 185 | ||
@@ -203,10 +204,11 @@ static fastcall noinline void | |||
203 | __mutex_unlock_slowpath(atomic_t *lock_count __IP_DECL__) | 204 | __mutex_unlock_slowpath(atomic_t *lock_count __IP_DECL__) |
204 | { | 205 | { |
205 | struct mutex *lock = container_of(lock_count, struct mutex, count); | 206 | struct mutex *lock = container_of(lock_count, struct mutex, count); |
207 | unsigned long flags; | ||
206 | 208 | ||
207 | DEBUG_WARN_ON(lock->owner != current_thread_info()); | 209 | DEBUG_WARN_ON(lock->owner != current_thread_info()); |
208 | 210 | ||
209 | spin_lock_mutex(&lock->wait_lock); | 211 | spin_lock_mutex(&lock->wait_lock, flags); |
210 | 212 | ||
211 | /* | 213 | /* |
212 | * some architectures leave the lock unlocked in the fastpath failure | 214 | * some architectures leave the lock unlocked in the fastpath failure |
@@ -231,7 +233,7 @@ __mutex_unlock_slowpath(atomic_t *lock_count __IP_DECL__) | |||
231 | 233 | ||
232 | debug_mutex_clear_owner(lock); | 234 | debug_mutex_clear_owner(lock); |
233 | 235 | ||
234 | spin_unlock_mutex(&lock->wait_lock); | 236 | spin_unlock_mutex(&lock->wait_lock, flags); |
235 | } | 237 | } |
236 | 238 | ||
237 | /* | 239 | /* |
@@ -276,9 +278,10 @@ __mutex_lock_interruptible_slowpath(atomic_t *lock_count __IP_DECL__) | |||
276 | static inline int __mutex_trylock_slowpath(atomic_t *lock_count) | 278 | static inline int __mutex_trylock_slowpath(atomic_t *lock_count) |
277 | { | 279 | { |
278 | struct mutex *lock = container_of(lock_count, struct mutex, count); | 280 | struct mutex *lock = container_of(lock_count, struct mutex, count); |
281 | unsigned long flags; | ||
279 | int prev; | 282 | int prev; |
280 | 283 | ||
281 | spin_lock_mutex(&lock->wait_lock); | 284 | spin_lock_mutex(&lock->wait_lock, flags); |
282 | 285 | ||
283 | prev = atomic_xchg(&lock->count, -1); | 286 | prev = atomic_xchg(&lock->count, -1); |
284 | if (likely(prev == 1)) | 287 | if (likely(prev == 1)) |
@@ -287,7 +290,7 @@ static inline int __mutex_trylock_slowpath(atomic_t *lock_count) | |||
287 | if (likely(list_empty(&lock->wait_list))) | 290 | if (likely(list_empty(&lock->wait_list))) |
288 | atomic_set(&lock->count, 0); | 291 | atomic_set(&lock->count, 0); |
289 | 292 | ||
290 | spin_unlock_mutex(&lock->wait_lock); | 293 | spin_unlock_mutex(&lock->wait_lock, flags); |
291 | 294 | ||
292 | return prev == 1; | 295 | return prev == 1; |
293 | } | 296 | } |
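These mutex hunks thread an unsigned long flags value through spin_lock_mutex()/spin_unlock_mutex() because the debug variants now use the save/restore form of interrupt disabling instead of a bare local_irq_disable()/local_irq_enable() pair. The underlying idiom, as a short kernel-style sketch (example_lock and touch_shared_state() are hypothetical):

    #include <linux/spinlock.h>

    static DEFINE_SPINLOCK(example_lock);           /* hypothetical lock */

    static void touch_shared_state(void)
    {
            unsigned long flags;

            /* Save the caller's IRQ state and disable interrupts ... */
            spin_lock_irqsave(&example_lock, flags);
            /* ... touch the protected data ... */
            /* ... then restore exactly what was saved, so the sequence
             * nests correctly inside already-IRQ-disabled regions. */
            spin_unlock_irqrestore(&example_lock, flags);
    }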
diff --git a/kernel/mutex.h b/kernel/mutex.h index 00fe84e7b672..069189947257 100644 --- a/kernel/mutex.h +++ b/kernel/mutex.h | |||
@@ -9,8 +9,10 @@ | |||
9 | * !CONFIG_DEBUG_MUTEXES case. Most of them are NOPs: | 9 | * !CONFIG_DEBUG_MUTEXES case. Most of them are NOPs: |
10 | */ | 10 | */ |
11 | 11 | ||
12 | #define spin_lock_mutex(lock) spin_lock(lock) | 12 | #define spin_lock_mutex(lock, flags) \ |
13 | #define spin_unlock_mutex(lock) spin_unlock(lock) | 13 | do { spin_lock(lock); (void)(flags); } while (0) |
14 | #define spin_unlock_mutex(lock, flags) \ | ||
15 | do { spin_unlock(lock); (void)(flags); } while (0) | ||
14 | #define mutex_remove_waiter(lock, waiter, ti) \ | 16 | #define mutex_remove_waiter(lock, waiter, ti) \ |
15 | __list_del((waiter)->list.prev, (waiter)->list.next) | 17 | __list_del((waiter)->list.prev, (waiter)->list.next) |
16 | 18 | ||
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index fc311a4673a2..857b4fa09124 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig | |||
@@ -38,13 +38,22 @@ config PM_DEBUG | |||
38 | 38 | ||
39 | config PM_TRACE | 39 | config PM_TRACE |
40 | bool "Suspend/resume event tracing" | 40 | bool "Suspend/resume event tracing" |
41 | depends on PM && PM_DEBUG && X86_32 | 41 | depends on PM && PM_DEBUG && X86_32 && EXPERIMENTAL |
42 | default y | 42 | default n |
43 | ---help--- | 43 | ---help--- |
44 | This enables some cheesy code to save the last PM event point in the | 44 | This enables some cheesy code to save the last PM event point in the |
45 | RTC across reboots, so that you can debug a machine that just hangs | 45 | RTC across reboots, so that you can debug a machine that just hangs |
46 | during suspend (or more commonly, during resume). | 46 | during suspend (or more commonly, during resume). |
47 | 47 | ||
48 | To use this debugging feature you should attempt to suspend the machine, | ||
49 | then reboot it, then run | ||
50 | |||
51 | dmesg -s 1000000 | grep 'hash matches' | ||
52 | |||
53 | CAUTION: this option will cause your machine's real-time clock to be | ||
54 | set to an invalid time after a resume. | ||
55 | |||
56 | |||
48 | config SOFTWARE_SUSPEND | 57 | config SOFTWARE_SUSPEND |
49 | bool "Software Suspend" | 58 | bool "Software Suspend" |
50 | depends on PM && SWAP && (X86 && (!SMP || SUSPEND_SMP)) || ((FRV || PPC32) && !SMP) | 59 | depends on PM && SWAP && (X86 && (!SMP || SUSPEND_SMP)) || ((FRV || PPC32) && !SMP) |
diff --git a/kernel/profile.c b/kernel/profile.c index 68afe121e507..5a730fdb1a2c 100644 --- a/kernel/profile.c +++ b/kernel/profile.c | |||
@@ -299,7 +299,7 @@ out: | |||
299 | } | 299 | } |
300 | 300 | ||
301 | #ifdef CONFIG_HOTPLUG_CPU | 301 | #ifdef CONFIG_HOTPLUG_CPU |
302 | static int profile_cpu_callback(struct notifier_block *info, | 302 | static int __devinit profile_cpu_callback(struct notifier_block *info, |
303 | unsigned long action, void *__cpu) | 303 | unsigned long action, void *__cpu) |
304 | { | 304 | { |
305 | int node, cpu = (unsigned long)__cpu; | 305 | int node, cpu = (unsigned long)__cpu; |
diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 921c22ad16e4..335c5b932e14 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c | |||
@@ -120,8 +120,18 @@ int ptrace_check_attach(struct task_struct *child, int kill) | |||
120 | 120 | ||
121 | static int may_attach(struct task_struct *task) | 121 | static int may_attach(struct task_struct *task) |
122 | { | 122 | { |
123 | if (!task->mm) | 123 | /* May we inspect the given task? |
124 | return -EPERM; | 124 | * This check is used both for attaching with ptrace |
125 | * and for allowing access to sensitive information in /proc. | ||
126 | * | ||
127 | * ptrace_attach denies several cases that /proc allows | ||
128 | * because setting up the necessary parent/child relationship | ||
129 | * or halting the specified task is impossible. | ||
130 | */ | ||
131 | int dumpable = 0; | ||
132 | /* Don't let security modules deny introspection */ | ||
133 | if (task == current) | ||
134 | return 0; | ||
125 | if (((current->uid != task->euid) || | 135 | if (((current->uid != task->euid) || |
126 | (current->uid != task->suid) || | 136 | (current->uid != task->suid) || |
127 | (current->uid != task->uid) || | 137 | (current->uid != task->uid) || |
@@ -130,7 +140,9 @@ static int may_attach(struct task_struct *task) | |||
130 | (current->gid != task->gid)) && !capable(CAP_SYS_PTRACE)) | 140 | (current->gid != task->gid)) && !capable(CAP_SYS_PTRACE)) |
131 | return -EPERM; | 141 | return -EPERM; |
132 | smp_rmb(); | 142 | smp_rmb(); |
133 | if (!task->mm->dumpable && !capable(CAP_SYS_PTRACE)) | 143 | if (task->mm) |
144 | dumpable = task->mm->dumpable; | ||
145 | if (!dumpable && !capable(CAP_SYS_PTRACE)) | ||
134 | return -EPERM; | 146 | return -EPERM; |
135 | 147 | ||
136 | return security_ptrace(current, task); | 148 | return security_ptrace(current, task); |
@@ -176,6 +188,8 @@ repeat: | |||
176 | goto repeat; | 188 | goto repeat; |
177 | } | 189 | } |
178 | 190 | ||
191 | if (!task->mm) | ||
192 | goto bad; | ||
179 | /* the same process cannot be attached many times */ | 193 | /* the same process cannot be attached many times */ |
180 | if (task->ptrace & PT_PTRACED) | 194 | if (task->ptrace & PT_PTRACED) |
181 | goto bad; | 195 | goto bad; |
@@ -200,7 +214,7 @@ out: | |||
200 | return retval; | 214 | return retval; |
201 | } | 215 | } |
202 | 216 | ||
203 | void __ptrace_detach(struct task_struct *child, unsigned int data) | 217 | static inline void __ptrace_detach(struct task_struct *child, unsigned int data) |
204 | { | 218 | { |
205 | child->exit_code = data; | 219 | child->exit_code = data; |
206 | /* .. re-parent .. */ | 220 | /* .. re-parent .. */ |
@@ -219,6 +233,7 @@ int ptrace_detach(struct task_struct *child, unsigned int data) | |||
219 | ptrace_disable(child); | 233 | ptrace_disable(child); |
220 | 234 | ||
221 | write_lock_irq(&tasklist_lock); | 235 | write_lock_irq(&tasklist_lock); |
236 | /* protect against de_thread()->release_task() */ | ||
222 | if (child->ptrace) | 237 | if (child->ptrace) |
223 | __ptrace_detach(child, data); | 238 | __ptrace_detach(child, data); |
224 | write_unlock_irq(&tasklist_lock); | 239 | write_unlock_irq(&tasklist_lock); |
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 20e9710fc21c..f464f5ae3f11 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c | |||
@@ -182,6 +182,15 @@ long rcu_batches_completed(void) | |||
182 | return rcu_ctrlblk.completed; | 182 | return rcu_ctrlblk.completed; |
183 | } | 183 | } |
184 | 184 | ||
185 | /* | ||
186 | * Return the number of RCU batches processed thus far. Useful | ||
187 | * for debug and statistics. | ||
188 | */ | ||
189 | long rcu_batches_completed_bh(void) | ||
190 | { | ||
191 | return rcu_bh_ctrlblk.completed; | ||
192 | } | ||
193 | |||
185 | static void rcu_barrier_callback(struct rcu_head *notused) | 194 | static void rcu_barrier_callback(struct rcu_head *notused) |
186 | { | 195 | { |
187 | if (atomic_dec_and_test(&rcu_barrier_cpu_count)) | 196 | if (atomic_dec_and_test(&rcu_barrier_cpu_count)) |
@@ -539,7 +548,7 @@ static void __devinit rcu_online_cpu(int cpu) | |||
539 | tasklet_init(&per_cpu(rcu_tasklet, cpu), rcu_process_callbacks, 0UL); | 548 | tasklet_init(&per_cpu(rcu_tasklet, cpu), rcu_process_callbacks, 0UL); |
540 | } | 549 | } |
541 | 550 | ||
542 | static int rcu_cpu_notify(struct notifier_block *self, | 551 | static int __devinit rcu_cpu_notify(struct notifier_block *self, |
543 | unsigned long action, void *hcpu) | 552 | unsigned long action, void *hcpu) |
544 | { | 553 | { |
545 | long cpu = (long)hcpu; | 554 | long cpu = (long)hcpu; |
@@ -556,7 +565,7 @@ static int rcu_cpu_notify(struct notifier_block *self, | |||
556 | return NOTIFY_OK; | 565 | return NOTIFY_OK; |
557 | } | 566 | } |
558 | 567 | ||
559 | static struct notifier_block rcu_nb = { | 568 | static struct notifier_block __devinitdata rcu_nb = { |
560 | .notifier_call = rcu_cpu_notify, | 569 | .notifier_call = rcu_cpu_notify, |
561 | }; | 570 | }; |
562 | 571 | ||
@@ -619,6 +628,7 @@ module_param(qlowmark, int, 0); | |||
619 | module_param(rsinterval, int, 0); | 628 | module_param(rsinterval, int, 0); |
620 | #endif | 629 | #endif |
621 | EXPORT_SYMBOL_GPL(rcu_batches_completed); | 630 | EXPORT_SYMBOL_GPL(rcu_batches_completed); |
631 | EXPORT_SYMBOL_GPL(rcu_batches_completed_bh); | ||
622 | EXPORT_SYMBOL_GPL(call_rcu); | 632 | EXPORT_SYMBOL_GPL(call_rcu); |
623 | EXPORT_SYMBOL_GPL(call_rcu_bh); | 633 | EXPORT_SYMBOL_GPL(call_rcu_bh); |
624 | EXPORT_SYMBOL_GPL(synchronize_rcu); | 634 | EXPORT_SYMBOL_GPL(synchronize_rcu); |
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 8154e7589d12..4d1c3d247127 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c | |||
@@ -1,5 +1,5 @@ | |||
1 | /* | 1 | /* |
2 | * Read-Copy Update /proc-based torture test facility | 2 | * Read-Copy Update module-based torture test facility |
3 | * | 3 | * |
4 | * This program is free software; you can redistribute it and/or modify | 4 | * This program is free software; you can redistribute it and/or modify |
5 | * it under the terms of the GNU General Public License as published by | 5 | * it under the terms of the GNU General Public License as published by |
@@ -53,6 +53,7 @@ static int stat_interval; /* Interval between stats, in seconds. */ | |||
53 | static int verbose; /* Print more debug info. */ | 53 | static int verbose; /* Print more debug info. */ |
54 | static int test_no_idle_hz; /* Test RCU's support for tickless idle CPUs. */ | 54 | static int test_no_idle_hz; /* Test RCU's support for tickless idle CPUs. */ |
55 | static int shuffle_interval = 5; /* Interval between shuffles (in sec)*/ | 55 | static int shuffle_interval = 5; /* Interval between shuffles (in sec)*/ |
56 | static char *torture_type = "rcu"; /* What to torture. */ | ||
56 | 57 | ||
57 | module_param(nreaders, int, 0); | 58 | module_param(nreaders, int, 0); |
58 | MODULE_PARM_DESC(nreaders, "Number of RCU reader threads"); | 59 | MODULE_PARM_DESC(nreaders, "Number of RCU reader threads"); |
@@ -64,13 +65,16 @@ module_param(test_no_idle_hz, bool, 0); | |||
64 | MODULE_PARM_DESC(test_no_idle_hz, "Test support for tickless idle CPUs"); | 65 | MODULE_PARM_DESC(test_no_idle_hz, "Test support for tickless idle CPUs"); |
65 | module_param(shuffle_interval, int, 0); | 66 | module_param(shuffle_interval, int, 0); |
66 | MODULE_PARM_DESC(shuffle_interval, "Number of seconds between shuffles"); | 67 | MODULE_PARM_DESC(shuffle_interval, "Number of seconds between shuffles"); |
67 | #define TORTURE_FLAG "rcutorture: " | 68 | module_param(torture_type, charp, 0); |
69 | MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh)"); | ||
70 | |||
71 | #define TORTURE_FLAG "-torture:" | ||
68 | #define PRINTK_STRING(s) \ | 72 | #define PRINTK_STRING(s) \ |
69 | do { printk(KERN_ALERT TORTURE_FLAG s "\n"); } while (0) | 73 | do { printk(KERN_ALERT "%s" TORTURE_FLAG s "\n", torture_type); } while (0) |
70 | #define VERBOSE_PRINTK_STRING(s) \ | 74 | #define VERBOSE_PRINTK_STRING(s) \ |
71 | do { if (verbose) printk(KERN_ALERT TORTURE_FLAG s "\n"); } while (0) | 75 | do { if (verbose) printk(KERN_ALERT "%s" TORTURE_FLAG s "\n", torture_type); } while (0) |
72 | #define VERBOSE_PRINTK_ERRSTRING(s) \ | 76 | #define VERBOSE_PRINTK_ERRSTRING(s) \ |
73 | do { if (verbose) printk(KERN_ALERT TORTURE_FLAG "!!! " s "\n"); } while (0) | 77 | do { if (verbose) printk(KERN_ALERT "%s" TORTURE_FLAG "!!! " s "\n", torture_type); } while (0) |
74 | 78 | ||
75 | static char printk_buf[4096]; | 79 | static char printk_buf[4096]; |
76 | 80 | ||
@@ -139,28 +143,6 @@ rcu_torture_free(struct rcu_torture *p) | |||
139 | spin_unlock_bh(&rcu_torture_lock); | 143 | spin_unlock_bh(&rcu_torture_lock); |
140 | } | 144 | } |
141 | 145 | ||
142 | static void | ||
143 | rcu_torture_cb(struct rcu_head *p) | ||
144 | { | ||
145 | int i; | ||
146 | struct rcu_torture *rp = container_of(p, struct rcu_torture, rtort_rcu); | ||
147 | |||
148 | if (fullstop) { | ||
149 | /* Test is ending, just drop callbacks on the floor. */ | ||
150 | /* The next initialization will pick up the pieces. */ | ||
151 | return; | ||
152 | } | ||
153 | i = rp->rtort_pipe_count; | ||
154 | if (i > RCU_TORTURE_PIPE_LEN) | ||
155 | i = RCU_TORTURE_PIPE_LEN; | ||
156 | atomic_inc(&rcu_torture_wcount[i]); | ||
157 | if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) { | ||
158 | rp->rtort_mbtest = 0; | ||
159 | rcu_torture_free(rp); | ||
160 | } else | ||
161 | call_rcu(p, rcu_torture_cb); | ||
162 | } | ||
163 | |||
164 | struct rcu_random_state { | 146 | struct rcu_random_state { |
165 | unsigned long rrs_state; | 147 | unsigned long rrs_state; |
166 | unsigned long rrs_count; | 148 | unsigned long rrs_count; |
@@ -191,6 +173,119 @@ rcu_random(struct rcu_random_state *rrsp) | |||
191 | } | 173 | } |
192 | 174 | ||
193 | /* | 175 | /* |
176 | * Operations vector for selecting different types of tests. | ||
177 | */ | ||
178 | |||
179 | struct rcu_torture_ops { | ||
180 | void (*init)(void); | ||
181 | void (*cleanup)(void); | ||
182 | int (*readlock)(void); | ||
183 | void (*readunlock)(int idx); | ||
184 | int (*completed)(void); | ||
185 | void (*deferredfree)(struct rcu_torture *p); | ||
186 | int (*stats)(char *page); | ||
187 | char *name; | ||
188 | }; | ||
189 | static struct rcu_torture_ops *cur_ops = NULL; | ||
190 | |||
191 | /* | ||
192 | * Definitions for rcu torture testing. | ||
193 | */ | ||
194 | |||
195 | static int rcu_torture_read_lock(void) | ||
196 | { | ||
197 | rcu_read_lock(); | ||
198 | return 0; | ||
199 | } | ||
200 | |||
201 | static void rcu_torture_read_unlock(int idx) | ||
202 | { | ||
203 | rcu_read_unlock(); | ||
204 | } | ||
205 | |||
206 | static int rcu_torture_completed(void) | ||
207 | { | ||
208 | return rcu_batches_completed(); | ||
209 | } | ||
210 | |||
211 | static void | ||
212 | rcu_torture_cb(struct rcu_head *p) | ||
213 | { | ||
214 | int i; | ||
215 | struct rcu_torture *rp = container_of(p, struct rcu_torture, rtort_rcu); | ||
216 | |||
217 | if (fullstop) { | ||
218 | /* Test is ending, just drop callbacks on the floor. */ | ||
219 | /* The next initialization will pick up the pieces. */ | ||
220 | return; | ||
221 | } | ||
222 | i = rp->rtort_pipe_count; | ||
223 | if (i > RCU_TORTURE_PIPE_LEN) | ||
224 | i = RCU_TORTURE_PIPE_LEN; | ||
225 | atomic_inc(&rcu_torture_wcount[i]); | ||
226 | if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) { | ||
227 | rp->rtort_mbtest = 0; | ||
228 | rcu_torture_free(rp); | ||
229 | } else | ||
230 | cur_ops->deferredfree(rp); | ||
231 | } | ||
232 | |||
233 | static void rcu_torture_deferred_free(struct rcu_torture *p) | ||
234 | { | ||
235 | call_rcu(&p->rtort_rcu, rcu_torture_cb); | ||
236 | } | ||
237 | |||
238 | static struct rcu_torture_ops rcu_ops = { | ||
239 | .init = NULL, | ||
240 | .cleanup = NULL, | ||
241 | .readlock = rcu_torture_read_lock, | ||
242 | .readunlock = rcu_torture_read_unlock, | ||
243 | .completed = rcu_torture_completed, | ||
244 | .deferredfree = rcu_torture_deferred_free, | ||
245 | .stats = NULL, | ||
246 | .name = "rcu" | ||
247 | }; | ||
248 | |||
249 | /* | ||
250 | * Definitions for rcu_bh torture testing. | ||
251 | */ | ||
252 | |||
253 | static int rcu_bh_torture_read_lock(void) | ||
254 | { | ||
255 | rcu_read_lock_bh(); | ||
256 | return 0; | ||
257 | } | ||
258 | |||
259 | static void rcu_bh_torture_read_unlock(int idx) | ||
260 | { | ||
261 | rcu_read_unlock_bh(); | ||
262 | } | ||
263 | |||
264 | static int rcu_bh_torture_completed(void) | ||
265 | { | ||
266 | return rcu_batches_completed_bh(); | ||
267 | } | ||
268 | |||
269 | static void rcu_bh_torture_deferred_free(struct rcu_torture *p) | ||
270 | { | ||
271 | call_rcu_bh(&p->rtort_rcu, rcu_torture_cb); | ||
272 | } | ||
273 | |||
274 | static struct rcu_torture_ops rcu_bh_ops = { | ||
275 | .init = NULL, | ||
276 | .cleanup = NULL, | ||
277 | .readlock = rcu_bh_torture_read_lock, | ||
278 | .readunlock = rcu_bh_torture_read_unlock, | ||
279 | .completed = rcu_bh_torture_completed, | ||
280 | .deferredfree = rcu_bh_torture_deferred_free, | ||
281 | .stats = NULL, | ||
282 | .name = "rcu_bh" | ||
283 | }; | ||
284 | |||
285 | static struct rcu_torture_ops *torture_ops[] = | ||
286 | { &rcu_ops, &rcu_bh_ops, NULL }; | ||
287 | |||
288 | /* | ||
194 | * RCU torture writer kthread. Repeatedly substitutes a new structure | 289 | * RCU torture writer kthread. Repeatedly substitutes a new structure |
195 | * for that pointed to by rcu_torture_current, freeing the old structure | 290 | * for that pointed to by rcu_torture_current, freeing the old structure |
196 | * after a series of grace periods (the "pipeline"). | 291 | * after a series of grace periods (the "pipeline"). |
@@ -209,8 +304,6 @@ rcu_torture_writer(void *arg) | |||
209 | 304 | ||
210 | do { | 305 | do { |
211 | schedule_timeout_uninterruptible(1); | 306 | schedule_timeout_uninterruptible(1); |
212 | if (rcu_batches_completed() == oldbatch) | ||
213 | continue; | ||
214 | if ((rp = rcu_torture_alloc()) == NULL) | 307 | if ((rp = rcu_torture_alloc()) == NULL) |
215 | continue; | 308 | continue; |
216 | rp->rtort_pipe_count = 0; | 309 | rp->rtort_pipe_count = 0; |
@@ -225,10 +318,10 @@ rcu_torture_writer(void *arg) | |||
225 | i = RCU_TORTURE_PIPE_LEN; | 318 | i = RCU_TORTURE_PIPE_LEN; |
226 | atomic_inc(&rcu_torture_wcount[i]); | 319 | atomic_inc(&rcu_torture_wcount[i]); |
227 | old_rp->rtort_pipe_count++; | 320 | old_rp->rtort_pipe_count++; |
228 | call_rcu(&old_rp->rtort_rcu, rcu_torture_cb); | 321 | cur_ops->deferredfree(old_rp); |
229 | } | 322 | } |
230 | rcu_torture_current_version++; | 323 | rcu_torture_current_version++; |
231 | oldbatch = rcu_batches_completed(); | 324 | oldbatch = cur_ops->completed(); |
232 | } while (!kthread_should_stop() && !fullstop); | 325 | } while (!kthread_should_stop() && !fullstop); |
233 | VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping"); | 326 | VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping"); |
234 | while (!kthread_should_stop()) | 327 | while (!kthread_should_stop()) |
@@ -246,6 +339,7 @@ static int | |||
246 | rcu_torture_reader(void *arg) | 339 | rcu_torture_reader(void *arg) |
247 | { | 340 | { |
248 | int completed; | 341 | int completed; |
342 | int idx; | ||
249 | DEFINE_RCU_RANDOM(rand); | 343 | DEFINE_RCU_RANDOM(rand); |
250 | struct rcu_torture *p; | 344 | struct rcu_torture *p; |
251 | int pipe_count; | 345 | int pipe_count; |
@@ -254,12 +348,12 @@ rcu_torture_reader(void *arg) | |||
254 | set_user_nice(current, 19); | 348 | set_user_nice(current, 19); |
255 | 349 | ||
256 | do { | 350 | do { |
257 | rcu_read_lock(); | 351 | idx = cur_ops->readlock(); |
258 | completed = rcu_batches_completed(); | 352 | completed = cur_ops->completed(); |
259 | p = rcu_dereference(rcu_torture_current); | 353 | p = rcu_dereference(rcu_torture_current); |
260 | if (p == NULL) { | 354 | if (p == NULL) { |
261 | /* Wait for rcu_torture_writer to get underway */ | 355 | /* Wait for rcu_torture_writer to get underway */ |
262 | rcu_read_unlock(); | 356 | cur_ops->readunlock(idx); |
263 | schedule_timeout_interruptible(HZ); | 357 | schedule_timeout_interruptible(HZ); |
264 | continue; | 358 | continue; |
265 | } | 359 | } |
@@ -273,14 +367,14 @@ rcu_torture_reader(void *arg) | |||
273 | pipe_count = RCU_TORTURE_PIPE_LEN; | 367 | pipe_count = RCU_TORTURE_PIPE_LEN; |
274 | } | 368 | } |
275 | ++__get_cpu_var(rcu_torture_count)[pipe_count]; | 369 | ++__get_cpu_var(rcu_torture_count)[pipe_count]; |
276 | completed = rcu_batches_completed() - completed; | 370 | completed = cur_ops->completed() - completed; |
277 | if (completed > RCU_TORTURE_PIPE_LEN) { | 371 | if (completed > RCU_TORTURE_PIPE_LEN) { |
278 | /* Should not happen, but... */ | 372 | /* Should not happen, but... */ |
279 | completed = RCU_TORTURE_PIPE_LEN; | 373 | completed = RCU_TORTURE_PIPE_LEN; |
280 | } | 374 | } |
281 | ++__get_cpu_var(rcu_torture_batch)[completed]; | 375 | ++__get_cpu_var(rcu_torture_batch)[completed]; |
282 | preempt_enable(); | 376 | preempt_enable(); |
283 | rcu_read_unlock(); | 377 | cur_ops->readunlock(idx); |
284 | schedule(); | 378 | schedule(); |
285 | } while (!kthread_should_stop() && !fullstop); | 379 | } while (!kthread_should_stop() && !fullstop); |
286 | VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping"); | 380 | VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping"); |
@@ -311,7 +405,7 @@ rcu_torture_printk(char *page) | |||
311 | if (pipesummary[i] != 0) | 405 | if (pipesummary[i] != 0) |
312 | break; | 406 | break; |
313 | } | 407 | } |
314 | cnt += sprintf(&page[cnt], "rcutorture: "); | 408 | cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG); |
315 | cnt += sprintf(&page[cnt], | 409 | cnt += sprintf(&page[cnt], |
316 | "rtc: %p ver: %ld tfle: %d rta: %d rtaf: %d rtf: %d " | 410 | "rtc: %p ver: %ld tfle: %d rta: %d rtaf: %d rtf: %d " |
317 | "rtmbe: %d", | 411 | "rtmbe: %d", |
@@ -324,7 +418,7 @@ rcu_torture_printk(char *page) | |||
324 | atomic_read(&n_rcu_torture_mberror)); | 418 | atomic_read(&n_rcu_torture_mberror)); |
325 | if (atomic_read(&n_rcu_torture_mberror) != 0) | 419 | if (atomic_read(&n_rcu_torture_mberror) != 0) |
326 | cnt += sprintf(&page[cnt], " !!!"); | 420 | cnt += sprintf(&page[cnt], " !!!"); |
327 | cnt += sprintf(&page[cnt], "\nrcutorture: "); | 421 | cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); |
328 | if (i > 1) { | 422 | if (i > 1) { |
329 | cnt += sprintf(&page[cnt], "!!! "); | 423 | cnt += sprintf(&page[cnt], "!!! "); |
330 | atomic_inc(&n_rcu_torture_error); | 424 | atomic_inc(&n_rcu_torture_error); |
@@ -332,17 +426,19 @@ rcu_torture_printk(char *page) | |||
332 | cnt += sprintf(&page[cnt], "Reader Pipe: "); | 426 | cnt += sprintf(&page[cnt], "Reader Pipe: "); |
333 | for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) | 427 | for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) |
334 | cnt += sprintf(&page[cnt], " %ld", pipesummary[i]); | 428 | cnt += sprintf(&page[cnt], " %ld", pipesummary[i]); |
335 | cnt += sprintf(&page[cnt], "\nrcutorture: "); | 429 | cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); |
336 | cnt += sprintf(&page[cnt], "Reader Batch: "); | 430 | cnt += sprintf(&page[cnt], "Reader Batch: "); |
337 | for (i = 0; i < RCU_TORTURE_PIPE_LEN; i++) | 431 | for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) |
338 | cnt += sprintf(&page[cnt], " %ld", batchsummary[i]); | 432 | cnt += sprintf(&page[cnt], " %ld", batchsummary[i]); |
339 | cnt += sprintf(&page[cnt], "\nrcutorture: "); | 433 | cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); |
340 | cnt += sprintf(&page[cnt], "Free-Block Circulation: "); | 434 | cnt += sprintf(&page[cnt], "Free-Block Circulation: "); |
341 | for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { | 435 | for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { |
342 | cnt += sprintf(&page[cnt], " %d", | 436 | cnt += sprintf(&page[cnt], " %d", |
343 | atomic_read(&rcu_torture_wcount[i])); | 437 | atomic_read(&rcu_torture_wcount[i])); |
344 | } | 438 | } |
345 | cnt += sprintf(&page[cnt], "\n"); | 439 | cnt += sprintf(&page[cnt], "\n"); |
440 | if (cur_ops->stats != NULL) | ||
441 | cnt += cur_ops->stats(&page[cnt]); | ||
346 | return cnt; | 442 | return cnt; |
347 | } | 443 | } |
348 | 444 | ||
@@ -444,11 +540,11 @@ rcu_torture_shuffle(void *arg) | |||
444 | static inline void | 540 | static inline void |
445 | rcu_torture_print_module_parms(char *tag) | 541 | rcu_torture_print_module_parms(char *tag) |
446 | { | 542 | { |
447 | printk(KERN_ALERT TORTURE_FLAG "--- %s: nreaders=%d " | 543 | printk(KERN_ALERT "%s" TORTURE_FLAG "--- %s: nreaders=%d " |
448 | "stat_interval=%d verbose=%d test_no_idle_hz=%d " | 544 | "stat_interval=%d verbose=%d test_no_idle_hz=%d " |
449 | "shuffle_interval = %d\n", | 545 | "shuffle_interval = %d\n", |
450 | tag, nrealreaders, stat_interval, verbose, test_no_idle_hz, | 546 | torture_type, tag, nrealreaders, stat_interval, verbose, |
451 | shuffle_interval); | 547 | test_no_idle_hz, shuffle_interval); |
452 | } | 548 | } |
453 | 549 | ||
454 | static void | 550 | static void |
@@ -493,6 +589,9 @@ rcu_torture_cleanup(void) | |||
493 | rcu_barrier(); | 589 | rcu_barrier(); |
494 | 590 | ||
495 | rcu_torture_stats_print(); /* -After- the stats thread is stopped! */ | 591 | rcu_torture_stats_print(); /* -After- the stats thread is stopped! */ |
592 | |||
593 | if (cur_ops->cleanup != NULL) | ||
594 | cur_ops->cleanup(); | ||
496 | if (atomic_read(&n_rcu_torture_error)) | 595 | if (atomic_read(&n_rcu_torture_error)) |
497 | rcu_torture_print_module_parms("End of test: FAILURE"); | 596 | rcu_torture_print_module_parms("End of test: FAILURE"); |
498 | else | 597 | else |
@@ -508,6 +607,20 @@ rcu_torture_init(void) | |||
508 | 607 | ||
509 | /* Process args and tell the world that the torturer is on the job. */ | 608 | /* Process args and tell the world that the torturer is on the job. */ |
510 | 609 | ||
610 | for (i = 0; cur_ops = torture_ops[i], cur_ops != NULL; i++) { | ||
611 | cur_ops = torture_ops[i]; | ||
612 | if (strcmp(torture_type, cur_ops->name) == 0) { | ||
613 | break; | ||
614 | } | ||
615 | } | ||
616 | if (cur_ops == NULL) { | ||
617 | printk(KERN_ALERT "rcutorture: invalid torture type: \"%s\"\n", | ||
618 | torture_type); | ||
619 | return (-EINVAL); | ||
620 | } | ||
621 | if (cur_ops->init != NULL) | ||
622 | cur_ops->init(); /* no "goto unwind" prior to this point!!! */ | ||
623 | |||
511 | if (nreaders >= 0) | 624 | if (nreaders >= 0) |
512 | nrealreaders = nreaders; | 625 | nrealreaders = nreaders; |
513 | else | 626 | else |
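The point of the rcutorture rework above is the rcu_torture_ops vector: readers and writers no longer call rcu_read_lock()/call_rcu() directly but go through cur_ops, which rcu_torture_init() picks by comparing the torture_type module parameter against each entry's name (an unknown string fails with -EINVAL). A minimal sketch of that selection pattern, reduced from the init loop above (the struct and table names here are hypothetical):

#include <linux/string.h>

struct flavor_ops {
	const char *name;
	int (*readlock)(void);
	void (*readunlock)(int idx);
};

/* hypothetical table, playing the role of torture_ops[] */
static struct flavor_ops *flavor_table[] = { /* &rcu_flavor, &rcu_bh_flavor, */ NULL };

static struct flavor_ops *select_flavor(const char *type)
{
	int i;

	for (i = 0; flavor_table[i] != NULL; i++)
		if (strcmp(type, flavor_table[i]->name) == 0)
			return flavor_table[i];
	return NULL;	/* caller reports the invalid type and bails out */
}

In use, something like "modprobe rcutorture torture_type=rcu_bh" (module name assumed) exercises the rcu_bh flavour through rcu_bh_ops, while the default string "rcu" keeps the classic behaviour.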
diff --git a/kernel/resource.c b/kernel/resource.c index e3080fcc66a3..2404f9b0bc47 100644 --- a/kernel/resource.c +++ b/kernel/resource.c | |||
@@ -232,6 +232,44 @@ int release_resource(struct resource *old) | |||
232 | 232 | ||
233 | EXPORT_SYMBOL(release_resource); | 233 | EXPORT_SYMBOL(release_resource); |
234 | 234 | ||
235 | #ifdef CONFIG_MEMORY_HOTPLUG | ||
236 | /* | ||
237 | * Finds the lowest memory resource that exists within [res->start, res->end). | ||
238 | * The caller must specify res->start, res->end and res->flags. | ||
239 | * If found, returns 0 and res is overwritten; if not found, returns -1. | ||
240 | */ | ||
241 | int find_next_system_ram(struct resource *res) | ||
242 | { | ||
243 | resource_size_t start, end; | ||
244 | struct resource *p; | ||
245 | |||
246 | BUG_ON(!res); | ||
247 | |||
248 | start = res->start; | ||
249 | end = res->end; | ||
250 | |||
251 | read_lock(&resource_lock); | ||
252 | for (p = iomem_resource.child; p ; p = p->sibling) { | ||
253 | /* system ram is just marked as IORESOURCE_MEM */ | ||
254 | if (p->flags != res->flags) | ||
255 | continue; | ||
256 | if (p->start > end) { | ||
257 | p = NULL; | ||
258 | break; | ||
259 | } | ||
260 | if (p->start >= start) | ||
261 | break; | ||
262 | } | ||
263 | read_unlock(&resource_lock); | ||
264 | if (!p) | ||
265 | return -1; | ||
266 | /* copy data */ | ||
267 | res->start = p->start; | ||
268 | res->end = p->end; | ||
269 | return 0; | ||
270 | } | ||
271 | #endif | ||
272 | |||
235 | /* | 273 | /* |
236 | * Find empty slot in the resource tree given range and alignment. | 274 | * Find empty slot in the resource tree given range and alignment. |
237 | */ | 275 | */ |
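find_next_system_ram() is meant to be called iteratively by the memory-hotplug code: seed res with a search window and the flags the System RAM resources were registered with, and each successful call shrinks res to the next matching child of iomem_resource. A hedged caller sketch (the walker function is hypothetical, a prototype for find_next_system_ram() is assumed to be visible, and plain IORESOURCE_MEM follows the comment above; real System RAM entries may carry extra flag bits):

#include <linux/ioport.h>
#include <linux/kernel.h>

static void walk_system_ram_example(void)
{
	struct resource res;

	res.start = 0;
	res.end   = (resource_size_t)-1;	/* whole physical address space */
	res.flags = IORESOURCE_MEM;		/* must match the registered flags */

	while (find_next_system_ram(&res) == 0) {
		printk(KERN_DEBUG "System RAM: 0x%llx-0x%llx\n",
		       (unsigned long long)res.start,
		       (unsigned long long)res.end);
		if (res.end == (resource_size_t)-1)
			break;			/* reached the end of the address space */
		res.start = res.end + 1;	/* continue after this range */
		res.end   = (resource_size_t)-1;
	}
}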
diff --git a/kernel/rtmutex-debug.c b/kernel/rtmutex-debug.c new file mode 100644 index 000000000000..4aa8a2c9f453 --- /dev/null +++ b/kernel/rtmutex-debug.c | |||
@@ -0,0 +1,513 @@ | |||
1 | /* | ||
2 | * RT-Mutexes: blocking mutual exclusion locks with PI support | ||
3 | * | ||
4 | * started by Ingo Molnar and Thomas Gleixner: | ||
5 | * | ||
6 | * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> | ||
7 | * Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com> | ||
8 | * | ||
9 | * This code is based on the rt.c implementation in the preempt-rt tree. | ||
10 | * Portions of said code are | ||
11 | * | ||
12 | * Copyright (C) 2004 LynuxWorks, Inc., Igor Manyilov, Bill Huey | ||
13 | * Copyright (C) 2006 Esben Nielsen | ||
14 | * Copyright (C) 2006 Kihon Technologies Inc., | ||
15 | * Steven Rostedt <rostedt@goodmis.org> | ||
16 | * | ||
17 | * See rt.c in preempt-rt for proper credits and further information | ||
18 | */ | ||
19 | #include <linux/config.h> | ||
20 | #include <linux/sched.h> | ||
21 | #include <linux/delay.h> | ||
22 | #include <linux/module.h> | ||
23 | #include <linux/spinlock.h> | ||
24 | #include <linux/kallsyms.h> | ||
25 | #include <linux/syscalls.h> | ||
26 | #include <linux/interrupt.h> | ||
27 | #include <linux/plist.h> | ||
28 | #include <linux/fs.h> | ||
29 | |||
30 | #include "rtmutex_common.h" | ||
31 | |||
32 | #ifdef CONFIG_DEBUG_RT_MUTEXES | ||
33 | # include "rtmutex-debug.h" | ||
34 | #else | ||
35 | # include "rtmutex.h" | ||
36 | #endif | ||
37 | |||
38 | # define TRACE_WARN_ON(x) WARN_ON(x) | ||
39 | # define TRACE_BUG_ON(x) BUG_ON(x) | ||
40 | |||
41 | # define TRACE_OFF() \ | ||
42 | do { \ | ||
43 | if (rt_trace_on) { \ | ||
44 | rt_trace_on = 0; \ | ||
45 | console_verbose(); \ | ||
46 | if (spin_is_locked(¤t->pi_lock)) \ | ||
47 | spin_unlock(¤t->pi_lock); \ | ||
48 | if (spin_is_locked(¤t->held_list_lock)) \ | ||
49 | spin_unlock(¤t->held_list_lock); \ | ||
50 | } \ | ||
51 | } while (0) | ||
52 | |||
53 | # define TRACE_OFF_NOLOCK() \ | ||
54 | do { \ | ||
55 | if (rt_trace_on) { \ | ||
56 | rt_trace_on = 0; \ | ||
57 | console_verbose(); \ | ||
58 | } \ | ||
59 | } while (0) | ||
60 | |||
61 | # define TRACE_BUG_LOCKED() \ | ||
62 | do { \ | ||
63 | TRACE_OFF(); \ | ||
64 | BUG(); \ | ||
65 | } while (0) | ||
66 | |||
67 | # define TRACE_WARN_ON_LOCKED(c) \ | ||
68 | do { \ | ||
69 | if (unlikely(c)) { \ | ||
70 | TRACE_OFF(); \ | ||
71 | WARN_ON(1); \ | ||
72 | } \ | ||
73 | } while (0) | ||
74 | |||
75 | # define TRACE_BUG_ON_LOCKED(c) \ | ||
76 | do { \ | ||
77 | if (unlikely(c)) \ | ||
78 | TRACE_BUG_LOCKED(); \ | ||
79 | } while (0) | ||
80 | |||
81 | #ifdef CONFIG_SMP | ||
82 | # define SMP_TRACE_BUG_ON_LOCKED(c) TRACE_BUG_ON_LOCKED(c) | ||
83 | #else | ||
84 | # define SMP_TRACE_BUG_ON_LOCKED(c) do { } while (0) | ||
85 | #endif | ||
86 | |||
87 | /* | ||
88 | * deadlock detection flag. We turn it off when we detect | ||
89 | * the first problem because we dont want to recurse back | ||
90 | * into the tracing code when doing error printk or | ||
91 | * executing a BUG(): | ||
92 | */ | ||
93 | int rt_trace_on = 1; | ||
94 | |||
95 | void deadlock_trace_off(void) | ||
96 | { | ||
97 | rt_trace_on = 0; | ||
98 | } | ||
99 | |||
100 | static void printk_task(task_t *p) | ||
101 | { | ||
102 | if (p) | ||
103 | printk("%16s:%5d [%p, %3d]", p->comm, p->pid, p, p->prio); | ||
104 | else | ||
105 | printk("<none>"); | ||
106 | } | ||
107 | |||
108 | static void printk_task_short(task_t *p) | ||
109 | { | ||
110 | if (p) | ||
111 | printk("%s/%d [%p, %3d]", p->comm, p->pid, p, p->prio); | ||
112 | else | ||
113 | printk("<none>"); | ||
114 | } | ||
115 | |||
116 | static void printk_lock(struct rt_mutex *lock, int print_owner) | ||
117 | { | ||
118 | if (lock->name) | ||
119 | printk(" [%p] {%s}\n", | ||
120 | lock, lock->name); | ||
121 | else | ||
122 | printk(" [%p] {%s:%d}\n", | ||
123 | lock, lock->file, lock->line); | ||
124 | |||
125 | if (print_owner && rt_mutex_owner(lock)) { | ||
126 | printk(".. ->owner: %p\n", lock->owner); | ||
127 | printk(".. held by: "); | ||
128 | printk_task(rt_mutex_owner(lock)); | ||
129 | printk("\n"); | ||
130 | } | ||
131 | if (rt_mutex_owner(lock)) { | ||
132 | printk("... acquired at: "); | ||
133 | print_symbol("%s\n", lock->acquire_ip); | ||
134 | } | ||
135 | } | ||
136 | |||
137 | static void printk_waiter(struct rt_mutex_waiter *w) | ||
138 | { | ||
139 | printk("-------------------------\n"); | ||
140 | printk("| waiter struct %p:\n", w); | ||
141 | printk("| w->list_entry: [DP:%p/%p|SP:%p/%p|PRI:%d]\n", | ||
142 | w->list_entry.plist.prio_list.prev, w->list_entry.plist.prio_list.next, | ||
143 | w->list_entry.plist.node_list.prev, w->list_entry.plist.node_list.next, | ||
144 | w->list_entry.prio); | ||
145 | printk("| w->pi_list_entry: [DP:%p/%p|SP:%p/%p|PRI:%d]\n", | ||
146 | w->pi_list_entry.plist.prio_list.prev, w->pi_list_entry.plist.prio_list.next, | ||
147 | w->pi_list_entry.plist.node_list.prev, w->pi_list_entry.plist.node_list.next, | ||
148 | w->pi_list_entry.prio); | ||
149 | printk("\n| lock:\n"); | ||
150 | printk_lock(w->lock, 1); | ||
151 | printk("| w->ti->task:\n"); | ||
152 | printk_task(w->task); | ||
153 | printk("| blocked at: "); | ||
154 | print_symbol("%s\n", w->ip); | ||
155 | printk("-------------------------\n"); | ||
156 | } | ||
157 | |||
158 | static void show_task_locks(task_t *p) | ||
159 | { | ||
160 | switch (p->state) { | ||
161 | case TASK_RUNNING: printk("R"); break; | ||
162 | case TASK_INTERRUPTIBLE: printk("S"); break; | ||
163 | case TASK_UNINTERRUPTIBLE: printk("D"); break; | ||
164 | case TASK_STOPPED: printk("T"); break; | ||
165 | case EXIT_ZOMBIE: printk("Z"); break; | ||
166 | case EXIT_DEAD: printk("X"); break; | ||
167 | default: printk("?"); break; | ||
168 | } | ||
169 | printk_task(p); | ||
170 | if (p->pi_blocked_on) { | ||
171 | struct rt_mutex *lock = p->pi_blocked_on->lock; | ||
172 | |||
173 | printk(" blocked on:"); | ||
174 | printk_lock(lock, 1); | ||
175 | } else | ||
176 | printk(" (not blocked)\n"); | ||
177 | } | ||
178 | |||
179 | void rt_mutex_show_held_locks(task_t *task, int verbose) | ||
180 | { | ||
181 | struct list_head *curr, *cursor = NULL; | ||
182 | struct rt_mutex *lock; | ||
183 | task_t *t; | ||
184 | unsigned long flags; | ||
185 | int count = 0; | ||
186 | |||
187 | if (!rt_trace_on) | ||
188 | return; | ||
189 | |||
190 | if (verbose) { | ||
191 | printk("------------------------------\n"); | ||
192 | printk("| showing all locks held by: | ("); | ||
193 | printk_task_short(task); | ||
194 | printk("):\n"); | ||
195 | printk("------------------------------\n"); | ||
196 | } | ||
197 | |||
198 | next: | ||
199 | spin_lock_irqsave(&task->held_list_lock, flags); | ||
200 | list_for_each(curr, &task->held_list_head) { | ||
201 | if (cursor && curr != cursor) | ||
202 | continue; | ||
203 | lock = list_entry(curr, struct rt_mutex, held_list_entry); | ||
204 | t = rt_mutex_owner(lock); | ||
205 | WARN_ON(t != task); | ||
206 | count++; | ||
207 | cursor = curr->next; | ||
208 | spin_unlock_irqrestore(&task->held_list_lock, flags); | ||
209 | |||
210 | printk("\n#%03d: ", count); | ||
211 | printk_lock(lock, 0); | ||
212 | goto next; | ||
213 | } | ||
214 | spin_unlock_irqrestore(&task->held_list_lock, flags); | ||
215 | |||
216 | printk("\n"); | ||
217 | } | ||
218 | |||
219 | void rt_mutex_show_all_locks(void) | ||
220 | { | ||
221 | task_t *g, *p; | ||
222 | int count = 10; | ||
223 | int unlock = 1; | ||
224 | |||
225 | printk("\n"); | ||
226 | printk("----------------------\n"); | ||
227 | printk("| showing all tasks: |\n"); | ||
228 | printk("----------------------\n"); | ||
229 | |||
230 | /* | ||
231 | * Here we try to get the tasklist_lock as hard as possible; | ||
232 | * if not successful after 2 seconds we ignore it (but keep | ||
233 | * trying). This is to enable a debug printout even if a | ||
234 | * tasklist_lock-holding task deadlocks or crashes. | ||
235 | */ | ||
236 | retry: | ||
237 | if (!read_trylock(&tasklist_lock)) { | ||
238 | if (count == 10) | ||
239 | printk("hm, tasklist_lock locked, retrying... "); | ||
240 | if (count) { | ||
241 | count--; | ||
242 | printk(" #%d", 10-count); | ||
243 | mdelay(200); | ||
244 | goto retry; | ||
245 | } | ||
246 | printk(" ignoring it.\n"); | ||
247 | unlock = 0; | ||
248 | } | ||
249 | if (count != 10) | ||
250 | printk(" locked it.\n"); | ||
251 | |||
252 | do_each_thread(g, p) { | ||
253 | show_task_locks(p); | ||
254 | if (!unlock) | ||
255 | if (read_trylock(&tasklist_lock)) | ||
256 | unlock = 1; | ||
257 | } while_each_thread(g, p); | ||
258 | |||
259 | printk("\n"); | ||
260 | |||
261 | printk("-----------------------------------------\n"); | ||
262 | printk("| showing all locks held in the system: |\n"); | ||
263 | printk("-----------------------------------------\n"); | ||
264 | |||
265 | do_each_thread(g, p) { | ||
266 | rt_mutex_show_held_locks(p, 0); | ||
267 | if (!unlock) | ||
268 | if (read_trylock(&tasklist_lock)) | ||
269 | unlock = 1; | ||
270 | } while_each_thread(g, p); | ||
271 | |||
272 | |||
273 | printk("=============================================\n\n"); | ||
274 | |||
275 | if (unlock) | ||
276 | read_unlock(&tasklist_lock); | ||
277 | } | ||
278 | |||
279 | void rt_mutex_debug_check_no_locks_held(task_t *task) | ||
280 | { | ||
281 | struct rt_mutex_waiter *w; | ||
282 | struct list_head *curr; | ||
283 | struct rt_mutex *lock; | ||
284 | |||
285 | if (!rt_trace_on) | ||
286 | return; | ||
287 | if (!rt_prio(task->normal_prio) && rt_prio(task->prio)) { | ||
288 | printk("BUG: PI priority boost leaked!\n"); | ||
289 | printk_task(task); | ||
290 | printk("\n"); | ||
291 | } | ||
292 | if (list_empty(&task->held_list_head)) | ||
293 | return; | ||
294 | |||
295 | spin_lock(&task->pi_lock); | ||
296 | plist_for_each_entry(w, &task->pi_waiters, pi_list_entry) { | ||
297 | TRACE_OFF(); | ||
298 | |||
299 | printk("hm, PI interest held at exit time? Task:\n"); | ||
300 | printk_task(task); | ||
301 | printk_waiter(w); | ||
302 | return; | ||
303 | } | ||
304 | spin_unlock(&task->pi_lock); | ||
305 | |||
306 | list_for_each(curr, &task->held_list_head) { | ||
307 | lock = list_entry(curr, struct rt_mutex, held_list_entry); | ||
308 | |||
309 | printk("BUG: %s/%d, lock held at task exit time!\n", | ||
310 | task->comm, task->pid); | ||
311 | printk_lock(lock, 1); | ||
312 | if (rt_mutex_owner(lock) != task) | ||
313 | printk("exiting task is not even the owner??\n"); | ||
314 | } | ||
315 | } | ||
316 | |||
317 | int rt_mutex_debug_check_no_locks_freed(const void *from, unsigned long len) | ||
318 | { | ||
319 | const void *to = from + len; | ||
320 | struct list_head *curr; | ||
321 | struct rt_mutex *lock; | ||
322 | unsigned long flags; | ||
323 | void *lock_addr; | ||
324 | |||
325 | if (!rt_trace_on) | ||
326 | return 0; | ||
327 | |||
328 | spin_lock_irqsave(¤t->held_list_lock, flags); | ||
329 | list_for_each(curr, ¤t->held_list_head) { | ||
330 | lock = list_entry(curr, struct rt_mutex, held_list_entry); | ||
331 | lock_addr = lock; | ||
332 | if (lock_addr < from || lock_addr >= to) | ||
333 | continue; | ||
334 | TRACE_OFF(); | ||
335 | |||
336 | printk("BUG: %s/%d, active lock [%p(%p-%p)] freed!\n", | ||
337 | current->comm, current->pid, lock, from, to); | ||
338 | dump_stack(); | ||
339 | printk_lock(lock, 1); | ||
340 | if (rt_mutex_owner(lock) != current) | ||
341 | printk("freeing task is not even the owner??\n"); | ||
342 | return 1; | ||
343 | } | ||
344 | spin_unlock_irqrestore(¤t->held_list_lock, flags); | ||
345 | |||
346 | return 0; | ||
347 | } | ||
348 | |||
349 | void rt_mutex_debug_task_free(struct task_struct *task) | ||
350 | { | ||
351 | WARN_ON(!plist_head_empty(&task->pi_waiters)); | ||
352 | WARN_ON(task->pi_blocked_on); | ||
353 | } | ||
354 | |||
355 | /* | ||
356 | * We fill out the fields in the waiter to store the information about | ||
357 | * the deadlock. We print when we return. act_waiter can be NULL in | ||
358 | * case of a remove waiter operation. | ||
359 | */ | ||
360 | void debug_rt_mutex_deadlock(int detect, struct rt_mutex_waiter *act_waiter, | ||
361 | struct rt_mutex *lock) | ||
362 | { | ||
363 | struct task_struct *task; | ||
364 | |||
365 | if (!rt_trace_on || detect || !act_waiter) | ||
366 | return; | ||
367 | |||
368 | task = rt_mutex_owner(act_waiter->lock); | ||
369 | if (task && task != current) { | ||
370 | act_waiter->deadlock_task_pid = task->pid; | ||
371 | act_waiter->deadlock_lock = lock; | ||
372 | } | ||
373 | } | ||
374 | |||
375 | void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter) | ||
376 | { | ||
377 | struct task_struct *task; | ||
378 | |||
379 | if (!waiter->deadlock_lock || !rt_trace_on) | ||
380 | return; | ||
381 | |||
382 | task = find_task_by_pid(waiter->deadlock_task_pid); | ||
383 | if (!task) | ||
384 | return; | ||
385 | |||
386 | TRACE_OFF_NOLOCK(); | ||
387 | |||
388 | printk("\n============================================\n"); | ||
389 | printk( "[ BUG: circular locking deadlock detected! ]\n"); | ||
390 | printk( "--------------------------------------------\n"); | ||
391 | printk("%s/%d is deadlocking current task %s/%d\n\n", | ||
392 | task->comm, task->pid, current->comm, current->pid); | ||
393 | |||
394 | printk("\n1) %s/%d is trying to acquire this lock:\n", | ||
395 | current->comm, current->pid); | ||
396 | printk_lock(waiter->lock, 1); | ||
397 | |||
398 | printk("... trying at: "); | ||
399 | print_symbol("%s\n", waiter->ip); | ||
400 | |||
401 | printk("\n2) %s/%d is blocked on this lock:\n", task->comm, task->pid); | ||
402 | printk_lock(waiter->deadlock_lock, 1); | ||
403 | |||
404 | rt_mutex_show_held_locks(current, 1); | ||
405 | rt_mutex_show_held_locks(task, 1); | ||
406 | |||
407 | printk("\n%s/%d's [blocked] stackdump:\n\n", task->comm, task->pid); | ||
408 | show_stack(task, NULL); | ||
409 | printk("\n%s/%d's [current] stackdump:\n\n", | ||
410 | current->comm, current->pid); | ||
411 | dump_stack(); | ||
412 | rt_mutex_show_all_locks(); | ||
413 | printk("[ turning off deadlock detection. " | ||
414 | "Please report this trace. ]\n\n"); | ||
415 | local_irq_disable(); | ||
416 | } | ||
417 | |||
418 | void debug_rt_mutex_lock(struct rt_mutex *lock __IP_DECL__) | ||
419 | { | ||
420 | unsigned long flags; | ||
421 | |||
422 | if (rt_trace_on) { | ||
423 | TRACE_WARN_ON_LOCKED(!list_empty(&lock->held_list_entry)); | ||
424 | |||
425 | spin_lock_irqsave(¤t->held_list_lock, flags); | ||
426 | list_add_tail(&lock->held_list_entry, ¤t->held_list_head); | ||
427 | spin_unlock_irqrestore(¤t->held_list_lock, flags); | ||
428 | |||
429 | lock->acquire_ip = ip; | ||
430 | } | ||
431 | } | ||
432 | |||
433 | void debug_rt_mutex_unlock(struct rt_mutex *lock) | ||
434 | { | ||
435 | unsigned long flags; | ||
436 | |||
437 | if (rt_trace_on) { | ||
438 | TRACE_WARN_ON_LOCKED(rt_mutex_owner(lock) != current); | ||
439 | TRACE_WARN_ON_LOCKED(list_empty(&lock->held_list_entry)); | ||
440 | |||
441 | spin_lock_irqsave(¤t->held_list_lock, flags); | ||
442 | list_del_init(&lock->held_list_entry); | ||
443 | spin_unlock_irqrestore(¤t->held_list_lock, flags); | ||
444 | } | ||
445 | } | ||
446 | |||
447 | void debug_rt_mutex_proxy_lock(struct rt_mutex *lock, | ||
448 | struct task_struct *powner __IP_DECL__) | ||
449 | { | ||
450 | unsigned long flags; | ||
451 | |||
452 | if (rt_trace_on) { | ||
453 | TRACE_WARN_ON_LOCKED(!list_empty(&lock->held_list_entry)); | ||
454 | |||
455 | spin_lock_irqsave(&powner->held_list_lock, flags); | ||
456 | list_add_tail(&lock->held_list_entry, &powner->held_list_head); | ||
457 | spin_unlock_irqrestore(&powner->held_list_lock, flags); | ||
458 | |||
459 | lock->acquire_ip = ip; | ||
460 | } | ||
461 | } | ||
462 | |||
463 | void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock) | ||
464 | { | ||
465 | unsigned long flags; | ||
466 | |||
467 | if (rt_trace_on) { | ||
468 | struct task_struct *owner = rt_mutex_owner(lock); | ||
469 | |||
470 | TRACE_WARN_ON_LOCKED(!owner); | ||
471 | TRACE_WARN_ON_LOCKED(list_empty(&lock->held_list_entry)); | ||
472 | |||
473 | spin_lock_irqsave(&owner->held_list_lock, flags); | ||
474 | list_del_init(&lock->held_list_entry); | ||
475 | spin_unlock_irqrestore(&owner->held_list_lock, flags); | ||
476 | } | ||
477 | } | ||
478 | |||
479 | void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter) | ||
480 | { | ||
481 | memset(waiter, 0x11, sizeof(*waiter)); | ||
482 | plist_node_init(&waiter->list_entry, MAX_PRIO); | ||
483 | plist_node_init(&waiter->pi_list_entry, MAX_PRIO); | ||
484 | } | ||
485 | |||
486 | void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter) | ||
487 | { | ||
488 | TRACE_WARN_ON(!plist_node_empty(&waiter->list_entry)); | ||
489 | TRACE_WARN_ON(!plist_node_empty(&waiter->pi_list_entry)); | ||
490 | TRACE_WARN_ON(waiter->task); | ||
491 | memset(waiter, 0x22, sizeof(*waiter)); | ||
492 | } | ||
493 | |||
494 | void debug_rt_mutex_init(struct rt_mutex *lock, const char *name) | ||
495 | { | ||
496 | void *addr = lock; | ||
497 | |||
498 | if (rt_trace_on) { | ||
499 | rt_mutex_debug_check_no_locks_freed(addr, | ||
500 | sizeof(struct rt_mutex)); | ||
501 | INIT_LIST_HEAD(&lock->held_list_entry); | ||
502 | lock->name = name; | ||
503 | } | ||
504 | } | ||
505 | |||
506 | void rt_mutex_deadlock_account_lock(struct rt_mutex *lock, task_t *task) | ||
507 | { | ||
508 | } | ||
509 | |||
510 | void rt_mutex_deadlock_account_unlock(struct task_struct *task) | ||
511 | { | ||
512 | } | ||
513 | |||
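The debug hooks above exist to keep current->held_list_head in sync with reality, so that rt_mutex_show_held_locks() and the exit/free-time checks have something to walk: debug_rt_mutex_lock() links the lock in and records the acquire IP once current has become the owner, and debug_rt_mutex_unlock() verifies ownership and unlinks it. A hedged sketch of that calling convention with CONFIG_DEBUG_RT_MUTEXES enabled (the wrapper names are hypothetical; only the debug_* hooks come from this file):

static void sketch_after_acquire(struct rt_mutex *lock)
{
	/* ...current has just become rt_mutex_owner(lock)... */
	debug_rt_mutex_lock(lock, (unsigned long)__builtin_return_address(0));
}

static void sketch_before_release(struct rt_mutex *lock)
{
	debug_rt_mutex_unlock(lock);	/* warns unless current owns the lock */
	/* ...core code now hands the lock to the top waiter or clears the owner... */
}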
diff --git a/kernel/rtmutex-debug.h b/kernel/rtmutex-debug.h new file mode 100644 index 000000000000..7612fbc62d70 --- /dev/null +++ b/kernel/rtmutex-debug.h | |||
@@ -0,0 +1,37 @@ | |||
1 | /* | ||
2 | * RT-Mutexes: blocking mutual exclusion locks with PI support | ||
3 | * | ||
4 | * started by Ingo Molnar and Thomas Gleixner: | ||
5 | * | ||
6 | * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> | ||
7 | * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com> | ||
8 | * | ||
9 | * This file contains macros used solely by rtmutex.c. Debug version. | ||
10 | */ | ||
11 | |||
12 | #define __IP_DECL__ , unsigned long ip | ||
13 | #define __IP__ , ip | ||
14 | #define __RET_IP__ , (unsigned long)__builtin_return_address(0) | ||
15 | |||
16 | extern void | ||
17 | rt_mutex_deadlock_account_lock(struct rt_mutex *lock, struct task_struct *task); | ||
18 | extern void rt_mutex_deadlock_account_unlock(struct task_struct *task); | ||
19 | extern void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter); | ||
20 | extern void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter); | ||
21 | extern void debug_rt_mutex_init(struct rt_mutex *lock, const char *name); | ||
22 | extern void debug_rt_mutex_lock(struct rt_mutex *lock __IP_DECL__); | ||
23 | extern void debug_rt_mutex_unlock(struct rt_mutex *lock); | ||
24 | extern void debug_rt_mutex_proxy_lock(struct rt_mutex *lock, | ||
25 | struct task_struct *powner __IP_DECL__); | ||
26 | extern void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock); | ||
27 | extern void debug_rt_mutex_deadlock(int detect, struct rt_mutex_waiter *waiter, | ||
28 | struct rt_mutex *lock); | ||
29 | extern void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter); | ||
30 | # define debug_rt_mutex_reset_waiter(w) \ | ||
31 | do { (w)->deadlock_lock = NULL; } while (0) | ||
32 | |||
33 | static inline int debug_rt_mutex_detect_deadlock(struct rt_mutex_waiter *waiter, | ||
34 | int detect) | ||
35 | { | ||
36 | return (waiter != NULL); | ||
37 | } | ||
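The __IP_DECL__/__IP__/__RET_IP__ trio is how the public entry points capture the caller's instruction pointer and thread it down to the debug hooks; the non-debug rtmutex.h presumably defines all three empty so the extra parameter disappears entirely. A minimal sketch of the convention (both function names are hypothetical):

static void example_slowpath(struct rt_mutex *lock __IP_DECL__)
{
	/* forward the recorded acquire site to the debug code */
	debug_rt_mutex_lock(lock __IP__);
}

void example_lock(struct rt_mutex *lock)
{
	/* __RET_IP__ appends ", (unsigned long)__builtin_return_address(0)" */
	example_slowpath(lock __RET_IP__);
}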
diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c new file mode 100644 index 000000000000..e82c2f848249 --- /dev/null +++ b/kernel/rtmutex-tester.c | |||
@@ -0,0 +1,440 @@ | |||
1 | /* | ||
2 | * RT-Mutex-tester: scriptable tester for rt mutexes | ||
3 | * | ||
4 | * started by Thomas Gleixner: | ||
5 | * | ||
6 | * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com> | ||
7 | * | ||
8 | */ | ||
9 | #include <linux/config.h> | ||
10 | #include <linux/kthread.h> | ||
11 | #include <linux/module.h> | ||
12 | #include <linux/sched.h> | ||
13 | #include <linux/smp_lock.h> | ||
14 | #include <linux/spinlock.h> | ||
15 | #include <linux/sysdev.h> | ||
16 | #include <linux/timer.h> | ||
17 | |||
18 | #include "rtmutex.h" | ||
19 | |||
20 | #define MAX_RT_TEST_THREADS 8 | ||
21 | #define MAX_RT_TEST_MUTEXES 8 | ||
22 | |||
23 | static spinlock_t rttest_lock; | ||
24 | static atomic_t rttest_event; | ||
25 | |||
26 | struct test_thread_data { | ||
27 | int opcode; | ||
28 | int opdata; | ||
29 | int mutexes[MAX_RT_TEST_MUTEXES]; | ||
30 | int bkl; | ||
31 | int event; | ||
32 | struct sys_device sysdev; | ||
33 | }; | ||
34 | |||
35 | static struct test_thread_data thread_data[MAX_RT_TEST_THREADS]; | ||
36 | static task_t *threads[MAX_RT_TEST_THREADS]; | ||
37 | static struct rt_mutex mutexes[MAX_RT_TEST_MUTEXES]; | ||
38 | |||
39 | enum test_opcodes { | ||
40 | RTTEST_NOP = 0, | ||
41 | RTTEST_SCHEDOT, /* 1 Sched other, data = nice */ | ||
42 | RTTEST_SCHEDRT, /* 2 Sched fifo, data = prio */ | ||
43 | RTTEST_LOCK, /* 3 Lock uninterruptible, data = lockindex */ | ||
44 | RTTEST_LOCKNOWAIT, /* 4 Lock uninterruptible no wait in wakeup, data = lockindex */ | ||
45 | RTTEST_LOCKINT, /* 5 Lock interruptible, data = lockindex */ | ||
46 | RTTEST_LOCKINTNOWAIT, /* 6 Lock interruptible no wait in wakeup, data = lockindex */ | ||
47 | RTTEST_LOCKCONT, /* 7 Continue locking after the wakeup delay */ | ||
48 | RTTEST_UNLOCK, /* 8 Unlock, data = lockindex */ | ||
49 | RTTEST_LOCKBKL, /* 9 Lock BKL */ | ||
50 | RTTEST_UNLOCKBKL, /* 10 Unlock BKL */ | ||
51 | RTTEST_SIGNAL, /* 11 Signal other test thread, data = thread id */ | ||
52 | RTTEST_RESETEVENT = 98, /* 98 Reset event counter */ | ||
53 | RTTEST_RESET = 99, /* 99 Reset all pending operations */ | ||
54 | }; | ||
55 | |||
56 | static int handle_op(struct test_thread_data *td, int lockwakeup) | ||
57 | { | ||
58 | int i, id, ret = -EINVAL; | ||
59 | |||
60 | switch(td->opcode) { | ||
61 | |||
62 | case RTTEST_NOP: | ||
63 | return 0; | ||
64 | |||
65 | case RTTEST_LOCKCONT: | ||
66 | td->mutexes[td->opdata] = 1; | ||
67 | td->event = atomic_add_return(1, &rttest_event); | ||
68 | return 0; | ||
69 | |||
70 | case RTTEST_RESET: | ||
71 | for (i = 0; i < MAX_RT_TEST_MUTEXES; i++) { | ||
72 | if (td->mutexes[i] == 4) { | ||
73 | rt_mutex_unlock(&mutexes[i]); | ||
74 | td->mutexes[i] = 0; | ||
75 | } | ||
76 | } | ||
77 | |||
78 | if (!lockwakeup && td->bkl == 4) { | ||
79 | unlock_kernel(); | ||
80 | td->bkl = 0; | ||
81 | } | ||
82 | return 0; | ||
83 | |||
84 | case RTTEST_RESETEVENT: | ||
85 | atomic_set(&rttest_event, 0); | ||
86 | return 0; | ||
87 | |||
88 | default: | ||
89 | if (lockwakeup) | ||
90 | return ret; | ||
91 | } | ||
92 | |||
93 | switch(td->opcode) { | ||
94 | |||
95 | case RTTEST_LOCK: | ||
96 | case RTTEST_LOCKNOWAIT: | ||
97 | id = td->opdata; | ||
98 | if (id < 0 || id >= MAX_RT_TEST_MUTEXES) | ||
99 | return ret; | ||
100 | |||
101 | td->mutexes[id] = 1; | ||
102 | td->event = atomic_add_return(1, &rttest_event); | ||
103 | rt_mutex_lock(&mutexes[id]); | ||
104 | td->event = atomic_add_return(1, &rttest_event); | ||
105 | td->mutexes[id] = 4; | ||
106 | return 0; | ||
107 | |||
108 | case RTTEST_LOCKINT: | ||
109 | case RTTEST_LOCKINTNOWAIT: | ||
110 | id = td->opdata; | ||
111 | if (id < 0 || id >= MAX_RT_TEST_MUTEXES) | ||
112 | return ret; | ||
113 | |||
114 | td->mutexes[id] = 1; | ||
115 | td->event = atomic_add_return(1, &rttest_event); | ||
116 | ret = rt_mutex_lock_interruptible(&mutexes[id], 0); | ||
117 | td->event = atomic_add_return(1, &rttest_event); | ||
118 | td->mutexes[id] = ret ? 0 : 4; | ||
119 | return ret ? -EINTR : 0; | ||
120 | |||
121 | case RTTEST_UNLOCK: | ||
122 | id = td->opdata; | ||
123 | if (id < 0 || id >= MAX_RT_TEST_MUTEXES || td->mutexes[id] != 4) | ||
124 | return ret; | ||
125 | |||
126 | td->event = atomic_add_return(1, &rttest_event); | ||
127 | rt_mutex_unlock(&mutexes[id]); | ||
128 | td->event = atomic_add_return(1, &rttest_event); | ||
129 | td->mutexes[id] = 0; | ||
130 | return 0; | ||
131 | |||
132 | case RTTEST_LOCKBKL: | ||
133 | if (td->bkl) | ||
134 | return 0; | ||
135 | td->bkl = 1; | ||
136 | lock_kernel(); | ||
137 | td->bkl = 4; | ||
138 | return 0; | ||
139 | |||
140 | case RTTEST_UNLOCKBKL: | ||
141 | if (td->bkl != 4) | ||
142 | break; | ||
143 | unlock_kernel(); | ||
144 | td->bkl = 0; | ||
145 | return 0; | ||
146 | |||
147 | default: | ||
148 | break; | ||
149 | } | ||
150 | return ret; | ||
151 | } | ||
152 | |||
153 | /* | ||
154 | * Schedule replacement for rtsem_down(). Only called for threads with | ||
155 | * PF_MUTEX_TESTER set. | ||
156 | * | ||
157 | * This allows us to have fine-grained control over the event flow. | ||
158 | * | ||
159 | */ | ||
160 | void schedule_rt_mutex_test(struct rt_mutex *mutex) | ||
161 | { | ||
162 | int tid, op, dat; | ||
163 | struct test_thread_data *td; | ||
164 | |||
165 | /* We have to look up the task */ | ||
166 | for (tid = 0; tid < MAX_RT_TEST_THREADS; tid++) { | ||
167 | if (threads[tid] == current) | ||
168 | break; | ||
169 | } | ||
170 | |||
171 | BUG_ON(tid == MAX_RT_TEST_THREADS); | ||
172 | |||
173 | td = &thread_data[tid]; | ||
174 | |||
175 | op = td->opcode; | ||
176 | dat = td->opdata; | ||
177 | |||
178 | switch (op) { | ||
179 | case RTTEST_LOCK: | ||
180 | case RTTEST_LOCKINT: | ||
181 | case RTTEST_LOCKNOWAIT: | ||
182 | case RTTEST_LOCKINTNOWAIT: | ||
183 | if (mutex != &mutexes[dat]) | ||
184 | break; | ||
185 | |||
186 | if (td->mutexes[dat] != 1) | ||
187 | break; | ||
188 | |||
189 | td->mutexes[dat] = 2; | ||
190 | td->event = atomic_add_return(1, &rttest_event); | ||
191 | break; | ||
192 | |||
193 | case RTTEST_LOCKBKL: | ||
194 | default: | ||
195 | break; | ||
196 | } | ||
197 | |||
198 | schedule(); | ||
199 | |||
200 | |||
201 | switch (op) { | ||
202 | case RTTEST_LOCK: | ||
203 | case RTTEST_LOCKINT: | ||
204 | if (mutex != &mutexes[dat]) | ||
205 | return; | ||
206 | |||
207 | if (td->mutexes[dat] != 2) | ||
208 | return; | ||
209 | |||
210 | td->mutexes[dat] = 3; | ||
211 | td->event = atomic_add_return(1, &rttest_event); | ||
212 | break; | ||
213 | |||
214 | case RTTEST_LOCKNOWAIT: | ||
215 | case RTTEST_LOCKINTNOWAIT: | ||
216 | if (mutex != &mutexes[dat]) | ||
217 | return; | ||
218 | |||
219 | if (td->mutexes[dat] != 2) | ||
220 | return; | ||
221 | |||
222 | td->mutexes[dat] = 1; | ||
223 | td->event = atomic_add_return(1, &rttest_event); | ||
224 | return; | ||
225 | |||
226 | case RTTEST_LOCKBKL: | ||
227 | return; | ||
228 | default: | ||
229 | return; | ||
230 | } | ||
231 | |||
232 | td->opcode = 0; | ||
233 | |||
234 | for (;;) { | ||
235 | set_current_state(TASK_INTERRUPTIBLE); | ||
236 | |||
237 | if (td->opcode > 0) { | ||
238 | int ret; | ||
239 | |||
240 | set_current_state(TASK_RUNNING); | ||
241 | ret = handle_op(td, 1); | ||
242 | set_current_state(TASK_INTERRUPTIBLE); | ||
243 | if (td->opcode == RTTEST_LOCKCONT) | ||
244 | break; | ||
245 | td->opcode = ret; | ||
246 | } | ||
247 | |||
248 | /* Wait for the next command to be executed */ | ||
249 | schedule(); | ||
250 | } | ||
251 | |||
252 | /* Restore previous command and data */ | ||
253 | td->opcode = op; | ||
254 | td->opdata = dat; | ||
255 | } | ||
256 | |||
257 | static int test_func(void *data) | ||
258 | { | ||
259 | struct test_thread_data *td = data; | ||
260 | int ret; | ||
261 | |||
262 | current->flags |= PF_MUTEX_TESTER; | ||
263 | allow_signal(SIGHUP); | ||
264 | |||
265 | for(;;) { | ||
266 | |||
267 | set_current_state(TASK_INTERRUPTIBLE); | ||
268 | |||
269 | if (td->opcode > 0) { | ||
270 | set_current_state(TASK_RUNNING); | ||
271 | ret = handle_op(td, 0); | ||
272 | set_current_state(TASK_INTERRUPTIBLE); | ||
273 | td->opcode = ret; | ||
274 | } | ||
275 | |||
276 | /* Wait for the next command to be executed */ | ||
277 | schedule(); | ||
278 | |||
279 | if (signal_pending(current)) | ||
280 | flush_signals(current); | ||
281 | |||
282 | if(kthread_should_stop()) | ||
283 | break; | ||
284 | } | ||
285 | return 0; | ||
286 | } | ||
287 | |||
288 | /** | ||
289 | * sysfs_test_command - interface for test commands | ||
290 | * @dev: thread reference | ||
291 | * @buf: command for actual step | ||
292 | * @count: length of buffer | ||
293 | * | ||
294 | * command syntax: | ||
295 | * | ||
296 | * opcode:data | ||
297 | */ | ||
298 | static ssize_t sysfs_test_command(struct sys_device *dev, const char *buf, | ||
299 | size_t count) | ||
300 | { | ||
301 | struct sched_param schedpar; | ||
302 | struct test_thread_data *td; | ||
303 | char cmdbuf[32]; | ||
304 | int op, dat, tid, ret; | ||
305 | |||
306 | td = container_of(dev, struct test_thread_data, sysdev); | ||
307 | tid = td->sysdev.id; | ||
308 | |||
309 | /* strings from sysfs write are not 0 terminated! */ | ||
310 | if (count >= sizeof(cmdbuf)) | ||
311 | return -EINVAL; | ||
312 | |||
313 | /* strip off the \n: */ | ||
314 | if (buf[count-1] == '\n') | ||
315 | count--; | ||
316 | if (count < 1) | ||
317 | return -EINVAL; | ||
318 | |||
319 | memcpy(cmdbuf, buf, count); | ||
320 | cmdbuf[count] = 0; | ||
321 | |||
322 | if (sscanf(cmdbuf, "%d:%d", &op, &dat) != 2) | ||
323 | return -EINVAL; | ||
324 | |||
325 | switch (op) { | ||
326 | case RTTEST_SCHEDOT: | ||
327 | schedpar.sched_priority = 0; | ||
328 | ret = sched_setscheduler(threads[tid], SCHED_NORMAL, &schedpar); | ||
329 | if (ret) | ||
330 | return ret; | ||
331 | set_user_nice(current, 0); | ||
332 | break; | ||
333 | |||
334 | case RTTEST_SCHEDRT: | ||
335 | schedpar.sched_priority = dat; | ||
336 | ret = sched_setscheduler(threads[tid], SCHED_FIFO, &schedpar); | ||
337 | if (ret) | ||
338 | return ret; | ||
339 | break; | ||
340 | |||
341 | case RTTEST_SIGNAL: | ||
342 | send_sig(SIGHUP, threads[tid], 0); | ||
343 | break; | ||
344 | |||
345 | default: | ||
346 | if (td->opcode > 0) | ||
347 | return -EBUSY; | ||
348 | td->opdata = dat; | ||
349 | td->opcode = op; | ||
350 | wake_up_process(threads[tid]); | ||
351 | } | ||
352 | |||
353 | return count; | ||
354 | } | ||
355 | |||
356 | /** | ||
357 | * sysfs_test_status - sysfs interface for rt tester | ||
358 | * @dev: thread to query | ||
359 | * @buf: char buffer to be filled with thread status info | ||
360 | */ | ||
361 | static ssize_t sysfs_test_status(struct sys_device *dev, char *buf) | ||
362 | { | ||
363 | struct test_thread_data *td; | ||
364 | char *curr = buf; | ||
365 | task_t *tsk; | ||
366 | int i; | ||
367 | |||
368 | td = container_of(dev, struct test_thread_data, sysdev); | ||
369 | tsk = threads[td->sysdev.id]; | ||
370 | |||
371 | spin_lock(&rttest_lock); | ||
372 | |||
373 | curr += sprintf(curr, | ||
374 | "O: %4d, E:%8d, S: 0x%08lx, P: %4d, N: %4d, B: %p, K: %d, M:", | ||
375 | td->opcode, td->event, tsk->state, | ||
376 | (MAX_RT_PRIO - 1) - tsk->prio, | ||
377 | (MAX_RT_PRIO - 1) - tsk->normal_prio, | ||
378 | tsk->pi_blocked_on, td->bkl); | ||
379 | |||
380 | for (i = MAX_RT_TEST_MUTEXES - 1; i >=0 ; i--) | ||
381 | curr += sprintf(curr, "%d", td->mutexes[i]); | ||
382 | |||
383 | spin_unlock(&rttest_lock); | ||
384 | |||
385 | curr += sprintf(curr, ", T: %p, R: %p\n", tsk, | ||
386 | mutexes[td->sysdev.id].owner); | ||
387 | |||
388 | return curr - buf; | ||
389 | } | ||
390 | |||
391 | static SYSDEV_ATTR(status, 0600, sysfs_test_status, NULL); | ||
392 | static SYSDEV_ATTR(command, 0600, NULL, sysfs_test_command); | ||
393 | |||
394 | static struct sysdev_class rttest_sysclass = { | ||
395 | set_kset_name("rttest"), | ||
396 | }; | ||
397 | |||
398 | static int init_test_thread(int id) | ||
399 | { | ||
400 | thread_data[id].sysdev.cls = &rttest_sysclass; | ||
401 | thread_data[id].sysdev.id = id; | ||
402 | |||
403 | threads[id] = kthread_run(test_func, &thread_data[id], "rt-test-%d", id); | ||
404 | if (IS_ERR(threads[id])) | ||
405 | return PTR_ERR(threads[id]); | ||
406 | |||
407 | return sysdev_register(&thread_data[id].sysdev); | ||
408 | } | ||
409 | |||
410 | static int init_rttest(void) | ||
411 | { | ||
412 | int ret, i; | ||
413 | |||
414 | spin_lock_init(&rttest_lock); | ||
415 | |||
416 | for (i = 0; i < MAX_RT_TEST_MUTEXES; i++) | ||
417 | rt_mutex_init(&mutexes[i]); | ||
418 | |||
419 | ret = sysdev_class_register(&rttest_sysclass); | ||
420 | if (ret) | ||
421 | return ret; | ||
422 | |||
423 | for (i = 0; i < MAX_RT_TEST_THREADS; i++) { | ||
424 | ret = init_test_thread(i); | ||
425 | if (ret) | ||
426 | break; | ||
427 | ret = sysdev_create_file(&thread_data[i].sysdev, &attr_status); | ||
428 | if (ret) | ||
429 | break; | ||
430 | ret = sysdev_create_file(&thread_data[i].sysdev, &attr_command); | ||
431 | if (ret) | ||
432 | break; | ||
433 | } | ||
434 | |||
435 | printk("Initializing RT-Tester: %s\n", ret ? "Failed" : "OK" ); | ||
436 | |||
437 | return ret; | ||
438 | } | ||
439 | |||
440 | device_initcall(init_rttest); | ||
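Each tester thread registers as a sysdev of class "rttest" with a write-only command attribute and a read-only status attribute, so a test script can drive the kernel threads entirely from user space using the opcode:data syntax documented above. A hedged example session, assuming the usual sysdev layout of /sys/devices/system/<class>/<class><id>/:

	echo "2:80" > /sys/devices/system/rttest/rttest0/command	# RTTEST_SCHEDRT: thread 0 becomes SCHED_FIFO prio 80
	echo "3:0"  > /sys/devices/system/rttest/rttest0/command	# RTTEST_LOCK: take mutex 0
	cat /sys/devices/system/rttest/rttest0/status			# O/E/S/P/N/B/K/M fields as formatted by sysfs_test_status()
	echo "8:0"  > /sys/devices/system/rttest/rttest0/command	# RTTEST_UNLOCK: drop mutex 0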
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c new file mode 100644 index 000000000000..45d61016da57 --- /dev/null +++ b/kernel/rtmutex.c | |||
@@ -0,0 +1,990 @@ | |||
1 | /* | ||
2 | * RT-Mutexes: simple blocking mutual exclusion locks with PI support | ||
3 | * | ||
4 | * started by Ingo Molnar and Thomas Gleixner. | ||
5 | * | ||
6 | * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> | ||
7 | * Copyright (C) 2005-2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com> | ||
8 | * Copyright (C) 2005 Kihon Technologies Inc., Steven Rostedt | ||
9 | * Copyright (C) 2006 Esben Nielsen | ||
10 | */ | ||
11 | #include <linux/spinlock.h> | ||
12 | #include <linux/module.h> | ||
13 | #include <linux/sched.h> | ||
14 | #include <linux/timer.h> | ||
15 | |||
16 | #include "rtmutex_common.h" | ||
17 | |||
18 | #ifdef CONFIG_DEBUG_RT_MUTEXES | ||
19 | # include "rtmutex-debug.h" | ||
20 | #else | ||
21 | # include "rtmutex.h" | ||
22 | #endif | ||
23 | |||
24 | /* | ||
25 | * lock->owner state tracking: | ||
26 | * | ||
27 | * lock->owner holds the task_struct pointer of the owner. Bit 0 and 1 | ||
28 | * are used to keep track of the "owner is pending" and "lock has | ||
29 | * waiters" state. | ||
30 | * | ||
31 | * owner bit1 bit0 | ||
32 | * NULL 0 0 lock is free (fast acquire possible) | ||
33 | * NULL 0 1 invalid state | ||
34 | * NULL 1 0 Transitional State* | ||
35 | * NULL 1 1 invalid state | ||
36 | * taskpointer 0 0 lock is held (fast release possible) | ||
37 | * taskpointer 0 1 task is pending owner | ||
38 | * taskpointer 1 0 lock is held and has waiters | ||
39 | * taskpointer 1 1 task is pending owner and lock has more waiters | ||
40 | * | ||
41 | * Pending ownership is assigned to the top (highest priority) | ||
42 | * waiter of the lock, when the lock is released. The thread is woken | ||
43 | * up and can now take the lock. Until the lock is taken (bit 0 | ||
44 | * cleared) a competing higher priority thread can steal the lock | ||
45 | * which puts the woken up thread back on the waiters list. | ||
46 | * | ||
47 | * The fast atomic compare exchange based acquire and release is only | ||
48 | * possible when bit 0 and 1 of lock->owner are 0. | ||
49 | * | ||
50 | * (*) There's a small window where the owner can be NULL and the | ||
51 | * "lock has waiters" bit is set. This can happen when grabbing the lock. | ||
52 | * To prevent a cmpxchg of the owner releasing the lock, we need to set this | ||
53 | * bit before looking at the lock, hence the reason this is a transitional | ||
54 | * state. | ||
55 | */ | ||
56 | |||
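The table above is the entire encoding: the two low bits of lock->owner are flag bits and the remaining bits are the owner's task_struct pointer. A hedged sketch of decoding the word (the helper is hypothetical; rt_mutex_owner() and RT_MUTEX_HAS_WAITERS are assumed to come from rtmutex_common.h, with rt_mutex_owner() masking the low flag bits, and bit 0, the pending-owner bit, is tested directly):

static void rt_mutex_describe_owner(struct rt_mutex *lock)
{
	unsigned long val = (unsigned long)lock->owner;
	struct task_struct *owner = rt_mutex_owner(lock);	/* flag bits masked off */

	if (!owner) {
		printk("lock %p: free%s\n", lock,
		       (val & RT_MUTEX_HAS_WAITERS) ?
		       " (transitional: has-waiters bit set)" : "");
		return;
	}
	printk("lock %p: held by %s/%d%s%s\n", lock, owner->comm, owner->pid,
	       (val & 1UL) ? " (pending owner)" : "",
	       (val & RT_MUTEX_HAS_WAITERS) ? " (has waiters)" : "");
}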
57 | static void | ||
58 | rt_mutex_set_owner(struct rt_mutex *lock, struct task_struct *owner, | ||
59 | unsigned long mask) | ||
60 | { | ||
61 | unsigned long val = (unsigned long)owner | mask; | ||
62 | |||
63 | if (rt_mutex_has_waiters(lock)) | ||
64 | val |= RT_MUTEX_HAS_WAITERS; | ||
65 | |||
66 | lock->owner = (struct task_struct *)val; | ||
67 | } | ||
68 | |||
69 | static inline void clear_rt_mutex_waiters(struct rt_mutex *lock) | ||
70 | { | ||
71 | lock->owner = (struct task_struct *) | ||
72 | ((unsigned long)lock->owner & ~RT_MUTEX_HAS_WAITERS); | ||
73 | } | ||
74 | |||
75 | static void fixup_rt_mutex_waiters(struct rt_mutex *lock) | ||
76 | { | ||
77 | if (!rt_mutex_has_waiters(lock)) | ||
78 | clear_rt_mutex_waiters(lock); | ||
79 | } | ||
80 | |||
81 | /* | ||
82 | * We can speed up the acquire/release, if the architecture | ||
83 | * supports cmpxchg and if there's no debugging state to be set up | ||
84 | */ | ||
85 | #if defined(__HAVE_ARCH_CMPXCHG) && !defined(CONFIG_DEBUG_RT_MUTEXES) | ||
86 | # define rt_mutex_cmpxchg(l,c,n) (cmpxchg(&l->owner, c, n) == c) | ||
87 | static inline void mark_rt_mutex_waiters(struct rt_mutex *lock) | ||
88 | { | ||
89 | unsigned long owner, *p = (unsigned long *) &lock->owner; | ||
90 | |||
91 | do { | ||
92 | owner = *p; | ||
93 | } while (cmpxchg(p, owner, owner | RT_MUTEX_HAS_WAITERS) != owner); | ||
94 | } | ||
95 | #else | ||
96 | # define rt_mutex_cmpxchg(l,c,n) (0) | ||
97 | static inline void mark_rt_mutex_waiters(struct rt_mutex *lock) | ||
98 | { | ||
99 | lock->owner = (struct task_struct *) | ||
100 | ((unsigned long)lock->owner | RT_MUTEX_HAS_WAITERS); | ||
101 | } | ||
102 | #endif | ||
103 | |||
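Because a free lock is exactly owner == NULL with both flag bits clear, the uncontended acquire and release can each be a single cmpxchg on architectures that provide it, with everything else falling back to the slow path. A sketch of the intended fast paths built on the rt_mutex_cmpxchg() macro above (the helper names are hypothetical):

static inline int example_fast_trylock(struct rt_mutex *lock)
{
	/* succeeds only if the lock was free: owner NULL, no flag bits set */
	return rt_mutex_cmpxchg(lock, NULL, current);
}

static inline int example_fast_unlock(struct rt_mutex *lock)
{
	/* fails, forcing the slow path, once the has-waiters bit is set */
	return rt_mutex_cmpxchg(lock, current, NULL);
}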
104 | /* | ||
105 | * Calculate task priority from the waiter list priority | ||
106 | * | ||
107 | * Return task->normal_prio when the waiter list is empty or when | ||
108 | * the waiter is not allowed to do priority boosting | ||
109 | */ | ||
110 | int rt_mutex_getprio(struct task_struct *task) | ||
111 | { | ||
112 | if (likely(!task_has_pi_waiters(task))) | ||
113 | return task->normal_prio; | ||
114 | |||
115 | return min(task_top_pi_waiter(task)->pi_list_entry.prio, | ||
116 | task->normal_prio); | ||
117 | } | ||
118 | |||
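A quick worked example of the min() above, using the usual kernel priority scale (0..99 realtime, 100..139 normal, lower value means higher priority): a SCHED_NORMAL lock owner at nice 0 has normal_prio 120; if its top pi_waiter is a SCHED_FIFO task of rtprio 80, that waiter's kernel prio is 99 - 80 = 19, so rt_mutex_getprio() returns min(19, 120) = 19 and the owner runs boosted at priority 19 until its pi_waiters change again.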
119 | /* | ||
120 | * Adjust the priority of a task, after its pi_waiters got modified. | ||
121 | * | ||
122 | * This can be both boosting and unboosting. task->pi_lock must be held. | ||
123 | */ | ||
124 | static void __rt_mutex_adjust_prio(struct task_struct *task) | ||
125 | { | ||
126 | int prio = rt_mutex_getprio(task); | ||
127 | |||
128 | if (task->prio != prio) | ||
129 | rt_mutex_setprio(task, prio); | ||
130 | } | ||
131 | |||
132 | /* | ||
133 | * Adjust task priority (undo boosting). Called from the exit path of | ||
134 | * rt_mutex_slowunlock() and rt_mutex_slowlock(). | ||
135 | * | ||
136 | * (Note: We do this outside of the protection of lock->wait_lock to | ||
137 | * allow the lock to be taken while or before we readjust the priority | ||
138 | * of task. We do not use the spin_xx_mutex() variants here as we are | ||
139 | * outside of the debug path.) | ||
140 | */ | ||
141 | static void rt_mutex_adjust_prio(struct task_struct *task) | ||
142 | { | ||
143 | unsigned long flags; | ||
144 | |||
145 | spin_lock_irqsave(&task->pi_lock, flags); | ||
146 | __rt_mutex_adjust_prio(task); | ||
147 | spin_unlock_irqrestore(&task->pi_lock, flags); | ||
148 | } | ||
149 | |||
150 | /* | ||
151 | * Max number of times we'll walk the boosting chain: | ||
152 | */ | ||
153 | int max_lock_depth = 1024; | ||
154 | |||
155 | /* | ||
156 | * Adjust the priority chain. Also used for deadlock detection. | ||
157 | * Decreases task's usage by one - may thus free the task. | ||
158 | * Returns 0 or -EDEADLK. | ||
159 | */ | ||
160 | static int rt_mutex_adjust_prio_chain(task_t *task, | ||
161 | int deadlock_detect, | ||
162 | struct rt_mutex *orig_lock, | ||
163 | struct rt_mutex_waiter *orig_waiter, | ||
164 | struct task_struct *top_task | ||
165 | __IP_DECL__) | ||
166 | { | ||
167 | struct rt_mutex *lock; | ||
168 | struct rt_mutex_waiter *waiter, *top_waiter = orig_waiter; | ||
169 | int detect_deadlock, ret = 0, depth = 0; | ||
170 | unsigned long flags; | ||
171 | |||
172 | detect_deadlock = debug_rt_mutex_detect_deadlock(orig_waiter, | ||
173 | deadlock_detect); | ||
174 | |||
175 | /* | ||
176 | * The (de)boosting is a step by step approach with a lot of | ||
177 | * pitfalls. We want this to be preemptible and we want to hold a | ||
178 | * maximum of two locks per step. So we have to check | ||
179 | * carefully whether things change under us. | ||
180 | */ | ||
181 | again: | ||
182 | if (++depth > max_lock_depth) { | ||
183 | static int prev_max; | ||
184 | |||
185 | /* | ||
186 | * Print this only once. If the admin changes the limit, | ||
187 | * print a new message when reaching the limit again. | ||
188 | */ | ||
189 | if (prev_max != max_lock_depth) { | ||
190 | prev_max = max_lock_depth; | ||
191 | printk(KERN_WARNING "Maximum lock depth %d reached " | ||
192 | "task: %s (%d)\n", max_lock_depth, | ||
193 | top_task->comm, top_task->pid); | ||
194 | } | ||
195 | put_task_struct(task); | ||
196 | |||
197 | return deadlock_detect ? -EDEADLK : 0; | ||
198 | } | ||
199 | retry: | ||
200 | /* | ||
202 | * Task can not go away as we did a get_task_struct() before! | ||
202 | */ | ||
203 | spin_lock_irqsave(&task->pi_lock, flags); | ||
204 | |||
205 | waiter = task->pi_blocked_on; | ||
206 | /* | ||
207 | * Check whether the end of the boosting chain has been | ||
208 | * reached or the state of the chain has changed while we | ||
209 | * dropped the locks. | ||
210 | */ | ||
211 | if (!waiter || !waiter->task) | ||
212 | goto out_unlock_pi; | ||
213 | |||
214 | if (top_waiter && (!task_has_pi_waiters(task) || | ||
215 | top_waiter != task_top_pi_waiter(task))) | ||
216 | goto out_unlock_pi; | ||
217 | |||
218 | /* | ||
219 | * When deadlock detection is off we check whether further | ||
220 | * priority adjustment is necessary. | ||
221 | */ | ||
222 | if (!detect_deadlock && waiter->list_entry.prio == task->prio) | ||
223 | goto out_unlock_pi; | ||
224 | |||
225 | lock = waiter->lock; | ||
226 | if (!spin_trylock(&lock->wait_lock)) { | ||
227 | spin_unlock_irqrestore(&task->pi_lock, flags); | ||
228 | cpu_relax(); | ||
229 | goto retry; | ||
230 | } | ||
231 | |||
232 | /* Deadlock detection */ | ||
233 | if (lock == orig_lock || rt_mutex_owner(lock) == top_task) { | ||
234 | debug_rt_mutex_deadlock(deadlock_detect, orig_waiter, lock); | ||
235 | spin_unlock(&lock->wait_lock); | ||
236 | ret = deadlock_detect ? -EDEADLK : 0; | ||
237 | goto out_unlock_pi; | ||
238 | } | ||
239 | |||
240 | top_waiter = rt_mutex_top_waiter(lock); | ||
241 | |||
242 | /* Requeue the waiter */ | ||
243 | plist_del(&waiter->list_entry, &lock->wait_list); | ||
244 | waiter->list_entry.prio = task->prio; | ||
245 | plist_add(&waiter->list_entry, &lock->wait_list); | ||
246 | |||
247 | /* Release the task */ | ||
248 | spin_unlock_irqrestore(&task->pi_lock, flags); | ||
249 | put_task_struct(task); | ||
250 | |||
251 | /* Grab the next task */ | ||
252 | task = rt_mutex_owner(lock); | ||
253 | spin_lock_irqsave(&task->pi_lock, flags); | ||
254 | |||
255 | if (waiter == rt_mutex_top_waiter(lock)) { | ||
256 | /* Boost the owner */ | ||
257 | plist_del(&top_waiter->pi_list_entry, &task->pi_waiters); | ||
258 | waiter->pi_list_entry.prio = waiter->list_entry.prio; | ||
259 | plist_add(&waiter->pi_list_entry, &task->pi_waiters); | ||
260 | __rt_mutex_adjust_prio(task); | ||
261 | |||
262 | } else if (top_waiter == waiter) { | ||
263 | /* Deboost the owner */ | ||
264 | plist_del(&waiter->pi_list_entry, &task->pi_waiters); | ||
265 | waiter = rt_mutex_top_waiter(lock); | ||
266 | waiter->pi_list_entry.prio = waiter->list_entry.prio; | ||
267 | plist_add(&waiter->pi_list_entry, &task->pi_waiters); | ||
268 | __rt_mutex_adjust_prio(task); | ||
269 | } | ||
270 | |||
271 | get_task_struct(task); | ||
272 | spin_unlock_irqrestore(&task->pi_lock, flags); | ||
273 | |||
274 | top_waiter = rt_mutex_top_waiter(lock); | ||
275 | spin_unlock(&lock->wait_lock); | ||
276 | |||
277 | if (!detect_deadlock && waiter != top_waiter) | ||
278 | goto out_put_task; | ||
279 | |||
280 | goto again; | ||
281 | |||
282 | out_unlock_pi: | ||
283 | spin_unlock_irqrestore(&task->pi_lock, flags); | ||
284 | out_put_task: | ||
285 | put_task_struct(task); | ||
286 | return ret; | ||
287 | } | ||
288 | |||
289 | /* | ||
290 | * Optimization: check if we can steal the lock from the | ||
291 | * assigned pending owner [which might not have taken the | ||
292 | * lock yet]: | ||
293 | */ | ||
294 | static inline int try_to_steal_lock(struct rt_mutex *lock) | ||
295 | { | ||
296 | struct task_struct *pendowner = rt_mutex_owner(lock); | ||
297 | struct rt_mutex_waiter *next; | ||
298 | unsigned long flags; | ||
299 | |||
300 | if (!rt_mutex_owner_pending(lock)) | ||
301 | return 0; | ||
302 | |||
303 | if (pendowner == current) | ||
304 | return 1; | ||
305 | |||
306 | spin_lock_irqsave(&pendowner->pi_lock, flags); | ||
307 | if (current->prio >= pendowner->prio) { | ||
308 | spin_unlock_irqrestore(&pendowner->pi_lock, flags); | ||
309 | return 0; | ||
310 | } | ||
311 | |||
312 | /* | ||
313 | * Check if a waiter is enqueued on the pending owner's | ||
314 | * pi_waiters list. Remove it and readjust the pending owner's | ||
315 | * priority. | ||
316 | */ | ||
317 | if (likely(!rt_mutex_has_waiters(lock))) { | ||
318 | spin_unlock_irqrestore(&pendowner->pi_lock, flags); | ||
319 | return 1; | ||
320 | } | ||
321 | |||
322 | /* No chain handling, pending owner is not blocked on anything: */ | ||
323 | next = rt_mutex_top_waiter(lock); | ||
324 | plist_del(&next->pi_list_entry, &pendowner->pi_waiters); | ||
325 | __rt_mutex_adjust_prio(pendowner); | ||
326 | spin_unlock_irqrestore(&pendowner->pi_lock, flags); | ||
327 | |||
328 | /* | ||
329 | * We are going to steal the lock and a waiter was | ||
330 | * enqueued on the pending owner's pi_waiters queue. So | ||
331 | * we have to enqueue this waiter into | ||
332 | * current->pi_waiters list. This covers the case | ||
333 | * where current is boosted because it holds another | ||
334 | * lock and gets unboosted because the booster is | ||
335 | * interrupted; otherwise we would delay a waiter with | ||
336 | * higher priority than current->normal_prio. | ||
337 | * | ||
338 | * Note: in the rare case of a SCHED_OTHER task changing | ||
339 | * its priority and thus stealing the lock, next->task | ||
340 | * might be current: | ||
341 | */ | ||
342 | if (likely(next->task != current)) { | ||
343 | spin_lock_irqsave(¤t->pi_lock, flags); | ||
344 | plist_add(&next->pi_list_entry, ¤t->pi_waiters); | ||
345 | __rt_mutex_adjust_prio(current); | ||
346 | spin_unlock_irqrestore(¤t->pi_lock, flags); | ||
347 | } | ||
348 | return 1; | ||
349 | } | ||
350 | |||
351 | /* | ||
352 | * Try to take an rt-mutex | ||
353 | * | ||
354 | * This fails | ||
355 | * - when the lock has a real owner | ||
356 | * - when a different pending owner exists and has higher priority than current | ||
357 | * | ||
358 | * Must be called with lock->wait_lock held. | ||
359 | */ | ||
360 | static int try_to_take_rt_mutex(struct rt_mutex *lock __IP_DECL__) | ||
361 | { | ||
362 | /* | ||
363 | * We have to be careful here if the atomic speedups are | ||
364 | * enabled: when | ||
365 | * - no other waiter is on the lock | ||
366 | * - the lock has been released since we did the cmpxchg | ||
367 | * then the lock can be released or taken while we are doing the | ||
368 | * checks and marking the lock with RT_MUTEX_HAS_WAITERS. | ||
369 | * | ||
370 | * The atomic acquire/release aware variant of | ||
371 | * mark_rt_mutex_waiters uses a cmpxchg loop. After setting | ||
372 | * the WAITERS bit, the atomic release / acquire can not | ||
373 | * happen anymore and lock->wait_lock protects us from the | ||
374 | * non-atomic case. | ||
375 | * | ||
376 | * Note that this might set lock->owner = | ||
377 | * RT_MUTEX_HAS_WAITERS in the case the lock is not contended | ||
378 | * any more. This is fixed up when we take the ownership. | ||
379 | * This is the transitional state explained at the top of this file. | ||
380 | */ | ||
381 | mark_rt_mutex_waiters(lock); | ||
382 | |||
383 | if (rt_mutex_owner(lock) && !try_to_steal_lock(lock)) | ||
384 | return 0; | ||
385 | |||
386 | /* We got the lock. */ | ||
387 | debug_rt_mutex_lock(lock __IP__); | ||
388 | |||
389 | rt_mutex_set_owner(lock, current, 0); | ||
390 | |||
391 | rt_mutex_deadlock_account_lock(lock, current); | ||
392 | |||
393 | return 1; | ||
394 | } | ||
395 | |||
396 | /* | ||
397 | * Task blocks on lock. | ||
398 | * | ||
399 | * Prepare waiter and propagate pi chain | ||
400 | * | ||
401 | * This must be called with lock->wait_lock held. | ||
402 | */ | ||
403 | static int task_blocks_on_rt_mutex(struct rt_mutex *lock, | ||
404 | struct rt_mutex_waiter *waiter, | ||
405 | int detect_deadlock | ||
406 | __IP_DECL__) | ||
407 | { | ||
408 | struct rt_mutex_waiter *top_waiter = waiter; | ||
409 | task_t *owner = rt_mutex_owner(lock); | ||
410 | int boost = 0, res; | ||
411 | unsigned long flags; | ||
412 | |||
413 | spin_lock_irqsave(¤t->pi_lock, flags); | ||
414 | __rt_mutex_adjust_prio(current); | ||
415 | waiter->task = current; | ||
416 | waiter->lock = lock; | ||
417 | plist_node_init(&waiter->list_entry, current->prio); | ||
418 | plist_node_init(&waiter->pi_list_entry, current->prio); | ||
419 | |||
420 | /* Get the top priority waiter on the lock */ | ||
421 | if (rt_mutex_has_waiters(lock)) | ||
422 | top_waiter = rt_mutex_top_waiter(lock); | ||
423 | plist_add(&waiter->list_entry, &lock->wait_list); | ||
424 | |||
425 | current->pi_blocked_on = waiter; | ||
426 | |||
427 | spin_unlock_irqrestore(¤t->pi_lock, flags); | ||
428 | |||
429 | if (waiter == rt_mutex_top_waiter(lock)) { | ||
430 | spin_lock_irqsave(&owner->pi_lock, flags); | ||
431 | plist_del(&top_waiter->pi_list_entry, &owner->pi_waiters); | ||
432 | plist_add(&waiter->pi_list_entry, &owner->pi_waiters); | ||
433 | |||
434 | __rt_mutex_adjust_prio(owner); | ||
435 | if (owner->pi_blocked_on) { | ||
436 | boost = 1; | ||
437 | /* gets dropped in rt_mutex_adjust_prio_chain()! */ | ||
438 | get_task_struct(owner); | ||
439 | } | ||
440 | spin_unlock_irqrestore(&owner->pi_lock, flags); | ||
441 | } | ||
442 | else if (debug_rt_mutex_detect_deadlock(waiter, detect_deadlock)) { | ||
443 | spin_lock_irqsave(&owner->pi_lock, flags); | ||
444 | if (owner->pi_blocked_on) { | ||
445 | boost = 1; | ||
446 | /* gets dropped in rt_mutex_adjust_prio_chain()! */ | ||
447 | get_task_struct(owner); | ||
448 | } | ||
449 | spin_unlock_irqrestore(&owner->pi_lock, flags); | ||
450 | } | ||
451 | if (!boost) | ||
452 | return 0; | ||
453 | |||
454 | spin_unlock(&lock->wait_lock); | ||
455 | |||
456 | res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock, waiter, | ||
457 | current __IP__); | ||
458 | |||
459 | spin_lock(&lock->wait_lock); | ||
460 | |||
461 | return res; | ||
462 | } | ||
463 | |||
464 | /* | ||
465 | * Wake up the next waiter on the lock. | ||
466 | * | ||
467 | * Remove the top waiter from the current task's waiter list and from | ||
468 | * the lock waiter list. Set it as pending owner. Then wake it up. | ||
469 | * | ||
470 | * Called with lock->wait_lock held. | ||
471 | */ | ||
472 | static void wakeup_next_waiter(struct rt_mutex *lock) | ||
473 | { | ||
474 | struct rt_mutex_waiter *waiter; | ||
475 | struct task_struct *pendowner; | ||
476 | unsigned long flags; | ||
477 | |||
478 | spin_lock_irqsave(¤t->pi_lock, flags); | ||
479 | |||
480 | waiter = rt_mutex_top_waiter(lock); | ||
481 | plist_del(&waiter->list_entry, &lock->wait_list); | ||
482 | |||
483 | /* | ||
484 | * Remove it from current->pi_waiters. We do not adjust a | ||
485 | * possible priority boost right now. We execute wakeup in the | ||
486 | * boosted mode and go back to normal after releasing | ||
487 | * lock->wait_lock. | ||
488 | */ | ||
489 | plist_del(&waiter->pi_list_entry, ¤t->pi_waiters); | ||
490 | pendowner = waiter->task; | ||
491 | waiter->task = NULL; | ||
492 | |||
493 | rt_mutex_set_owner(lock, pendowner, RT_MUTEX_OWNER_PENDING); | ||
494 | |||
495 | spin_unlock_irqrestore(¤t->pi_lock, flags); | ||
496 | |||
497 | /* | ||
498 | * Clear the pi_blocked_on variable and enqueue a possible | ||
499 | * waiter into the pi_waiters list of the pending owner. This | ||
500 | * prevents a waiter with higher priority than | ||
501 | * pending-owner->normal_prio from blocking on the (pending) | ||
502 | * owner after the pending owner gets unboosted. | ||
503 | */ | ||
504 | spin_lock_irqsave(&pendowner->pi_lock, flags); | ||
505 | |||
506 | WARN_ON(!pendowner->pi_blocked_on); | ||
507 | WARN_ON(pendowner->pi_blocked_on != waiter); | ||
508 | WARN_ON(pendowner->pi_blocked_on->lock != lock); | ||
509 | |||
510 | pendowner->pi_blocked_on = NULL; | ||
511 | |||
512 | if (rt_mutex_has_waiters(lock)) { | ||
513 | struct rt_mutex_waiter *next; | ||
514 | |||
515 | next = rt_mutex_top_waiter(lock); | ||
516 | plist_add(&next->pi_list_entry, &pendowner->pi_waiters); | ||
517 | } | ||
518 | spin_unlock_irqrestore(&pendowner->pi_lock, flags); | ||
519 | |||
520 | wake_up_process(pendowner); | ||
521 | } | ||
522 | |||
523 | /* | ||
524 | * Remove a waiter from a lock | ||
525 | * | ||
526 | * Must be called with lock->wait_lock held | ||
527 | */ | ||
528 | static void remove_waiter(struct rt_mutex *lock, | ||
529 | struct rt_mutex_waiter *waiter __IP_DECL__) | ||
530 | { | ||
531 | int first = (waiter == rt_mutex_top_waiter(lock)); | ||
532 | int boost = 0; | ||
533 | task_t *owner = rt_mutex_owner(lock); | ||
534 | unsigned long flags; | ||
535 | |||
536 | spin_lock_irqsave(¤t->pi_lock, flags); | ||
537 | plist_del(&waiter->list_entry, &lock->wait_list); | ||
538 | waiter->task = NULL; | ||
539 | current->pi_blocked_on = NULL; | ||
540 | spin_unlock_irqrestore(¤t->pi_lock, flags); | ||
541 | |||
542 | if (first && owner != current) { | ||
543 | |||
544 | spin_lock_irqsave(&owner->pi_lock, flags); | ||
545 | |||
546 | plist_del(&waiter->pi_list_entry, &owner->pi_waiters); | ||
547 | |||
548 | if (rt_mutex_has_waiters(lock)) { | ||
549 | struct rt_mutex_waiter *next; | ||
550 | |||
551 | next = rt_mutex_top_waiter(lock); | ||
552 | plist_add(&next->pi_list_entry, &owner->pi_waiters); | ||
553 | } | ||
554 | __rt_mutex_adjust_prio(owner); | ||
555 | |||
556 | if (owner->pi_blocked_on) { | ||
557 | boost = 1; | ||
558 | /* gets dropped in rt_mutex_adjust_prio_chain()! */ | ||
559 | get_task_struct(owner); | ||
560 | } | ||
561 | spin_unlock_irqrestore(&owner->pi_lock, flags); | ||
562 | } | ||
563 | |||
564 | WARN_ON(!plist_node_empty(&waiter->pi_list_entry)); | ||
565 | |||
566 | if (!boost) | ||
567 | return; | ||
568 | |||
569 | spin_unlock(&lock->wait_lock); | ||
570 | |||
571 | rt_mutex_adjust_prio_chain(owner, 0, lock, NULL, current __IP__); | ||
572 | |||
573 | spin_lock(&lock->wait_lock); | ||
574 | } | ||
575 | |||
576 | /* | ||
577 | * Recheck the pi chain, in case we got a priority setting | ||
578 | * | ||
579 | * Called from sched_setscheduler | ||
580 | */ | ||
581 | void rt_mutex_adjust_pi(struct task_struct *task) | ||
582 | { | ||
583 | struct rt_mutex_waiter *waiter; | ||
584 | unsigned long flags; | ||
585 | |||
586 | spin_lock_irqsave(&task->pi_lock, flags); | ||
587 | |||
588 | waiter = task->pi_blocked_on; | ||
589 | if (!waiter || waiter->list_entry.prio == task->prio) { | ||
590 | spin_unlock_irqrestore(&task->pi_lock, flags); | ||
591 | return; | ||
592 | } | ||
593 | |||
594 | /* gets dropped in rt_mutex_adjust_prio_chain()! */ | ||
595 | get_task_struct(task); | ||
596 | spin_unlock_irqrestore(&task->pi_lock, flags); | ||
597 | |||
598 | rt_mutex_adjust_prio_chain(task, 0, NULL, NULL, task __RET_IP__); | ||
599 | } | ||
600 | |||
601 | /* | ||
602 | * Slow path lock function: | ||
603 | */ | ||
604 | static int __sched | ||
605 | rt_mutex_slowlock(struct rt_mutex *lock, int state, | ||
606 | struct hrtimer_sleeper *timeout, | ||
607 | int detect_deadlock __IP_DECL__) | ||
608 | { | ||
609 | struct rt_mutex_waiter waiter; | ||
610 | int ret = 0; | ||
611 | |||
612 | debug_rt_mutex_init_waiter(&waiter); | ||
613 | waiter.task = NULL; | ||
614 | |||
615 | spin_lock(&lock->wait_lock); | ||
616 | |||
617 | /* Try to acquire the lock again: */ | ||
618 | if (try_to_take_rt_mutex(lock __IP__)) { | ||
619 | spin_unlock(&lock->wait_lock); | ||
620 | return 0; | ||
621 | } | ||
622 | |||
623 | set_current_state(state); | ||
624 | |||
625 | /* Set up the timer when timeout != NULL */ | ||
626 | if (unlikely(timeout)) | ||
627 | hrtimer_start(&timeout->timer, timeout->timer.expires, | ||
628 | HRTIMER_ABS); | ||
629 | |||
630 | for (;;) { | ||
631 | /* Try to acquire the lock: */ | ||
632 | if (try_to_take_rt_mutex(lock __IP__)) | ||
633 | break; | ||
634 | |||
635 | /* | ||
636 | * TASK_INTERRUPTIBLE checks for signals and | ||
637 | * timeout. Ignored otherwise. | ||
638 | */ | ||
639 | if (unlikely(state == TASK_INTERRUPTIBLE)) { | ||
640 | /* Signal pending? */ | ||
641 | if (signal_pending(current)) | ||
642 | ret = -EINTR; | ||
643 | if (timeout && !timeout->task) | ||
644 | ret = -ETIMEDOUT; | ||
645 | if (ret) | ||
646 | break; | ||
647 | } | ||
648 | |||
649 | /* | ||
650 | * waiter.task is NULL the first time we come here and | ||
651 | * when we have been woken up by the previous owner | ||
652 | * but the lock got stolen by a higher prio task. | ||
653 | */ | ||
654 | if (!waiter.task) { | ||
655 | ret = task_blocks_on_rt_mutex(lock, &waiter, | ||
656 | detect_deadlock __IP__); | ||
657 | /* | ||
658 | * If we got woken up by the owner then start the loop | ||
659 | * all over again without going into schedule() to try | ||
660 | * to get the lock now: | ||
661 | */ | ||
662 | if (unlikely(!waiter.task)) | ||
663 | continue; | ||
664 | |||
665 | if (unlikely(ret)) | ||
666 | break; | ||
667 | } | ||
668 | |||
669 | spin_unlock(&lock->wait_lock); | ||
670 | |||
671 | debug_rt_mutex_print_deadlock(&waiter); | ||
672 | |||
673 | if (waiter.task) | ||
674 | schedule_rt_mutex(lock); | ||
675 | |||
676 | spin_lock(&lock->wait_lock); | ||
677 | set_current_state(state); | ||
678 | } | ||
679 | |||
680 | set_current_state(TASK_RUNNING); | ||
681 | |||
682 | if (unlikely(waiter.task)) | ||
683 | remove_waiter(lock, &waiter __IP__); | ||
684 | |||
685 | /* | ||
686 | * try_to_take_rt_mutex() sets the waiter bit | ||
687 | * unconditionally. We might have to fix that up. | ||
688 | */ | ||
689 | fixup_rt_mutex_waiters(lock); | ||
690 | |||
691 | spin_unlock(&lock->wait_lock); | ||
692 | |||
693 | /* Remove pending timer: */ | ||
694 | if (unlikely(timeout)) | ||
695 | hrtimer_cancel(&timeout->timer); | ||
696 | |||
697 | /* | ||
698 | * Readjust priority when we did not get the lock. We might | ||
699 | * have been the pending owner and boosted. Since we did not | ||
700 | * take the lock, the PI boost has to go. | ||
701 | */ | ||
702 | if (unlikely(ret)) | ||
703 | rt_mutex_adjust_prio(current); | ||
704 | |||
705 | debug_rt_mutex_free_waiter(&waiter); | ||
706 | |||
707 | return ret; | ||
708 | } | ||
709 | |||
710 | /* | ||
711 | * Slow path try-lock function: | ||
712 | */ | ||
713 | static inline int | ||
714 | rt_mutex_slowtrylock(struct rt_mutex *lock __IP_DECL__) | ||
715 | { | ||
716 | int ret = 0; | ||
717 | |||
718 | spin_lock(&lock->wait_lock); | ||
719 | |||
720 | if (likely(rt_mutex_owner(lock) != current)) { | ||
721 | |||
722 | ret = try_to_take_rt_mutex(lock __IP__); | ||
723 | /* | ||
724 | * try_to_take_rt_mutex() sets the lock waiters | ||
725 | * bit unconditionally. Clean this up. | ||
726 | */ | ||
727 | fixup_rt_mutex_waiters(lock); | ||
728 | } | ||
729 | |||
730 | spin_unlock(&lock->wait_lock); | ||
731 | |||
732 | return ret; | ||
733 | } | ||
734 | |||
735 | /* | ||
736 | * Slow path to release a rt-mutex: | ||
737 | */ | ||
738 | static void __sched | ||
739 | rt_mutex_slowunlock(struct rt_mutex *lock) | ||
740 | { | ||
741 | spin_lock(&lock->wait_lock); | ||
742 | |||
743 | debug_rt_mutex_unlock(lock); | ||
744 | |||
745 | rt_mutex_deadlock_account_unlock(current); | ||
746 | |||
747 | if (!rt_mutex_has_waiters(lock)) { | ||
748 | lock->owner = NULL; | ||
749 | spin_unlock(&lock->wait_lock); | ||
750 | return; | ||
751 | } | ||
752 | |||
753 | wakeup_next_waiter(lock); | ||
754 | |||
755 | spin_unlock(&lock->wait_lock); | ||
756 | |||
757 | /* Undo pi boosting if necessary: */ | ||
758 | rt_mutex_adjust_prio(current); | ||
759 | } | ||
760 | |||
761 | /* | ||
762 | * debug-aware fast/slowpath lock, trylock and unlock functions | ||
763 | * | ||
764 | * The atomic acquire/release ops are compiled away, when either the | ||
765 | * architecture does not support cmpxchg or when debugging is enabled. | ||
766 | */ | ||
767 | static inline int | ||
768 | rt_mutex_fastlock(struct rt_mutex *lock, int state, | ||
769 | int detect_deadlock, | ||
770 | int (*slowfn)(struct rt_mutex *lock, int state, | ||
771 | struct hrtimer_sleeper *timeout, | ||
772 | int detect_deadlock __IP_DECL__)) | ||
773 | { | ||
774 | if (!detect_deadlock && likely(rt_mutex_cmpxchg(lock, NULL, current))) { | ||
775 | rt_mutex_deadlock_account_lock(lock, current); | ||
776 | return 0; | ||
777 | } else | ||
778 | return slowfn(lock, state, NULL, detect_deadlock __RET_IP__); | ||
779 | } | ||
780 | |||
781 | static inline int | ||
782 | rt_mutex_timed_fastlock(struct rt_mutex *lock, int state, | ||
783 | struct hrtimer_sleeper *timeout, int detect_deadlock, | ||
784 | int (*slowfn)(struct rt_mutex *lock, int state, | ||
785 | struct hrtimer_sleeper *timeout, | ||
786 | int detect_deadlock __IP_DECL__)) | ||
787 | { | ||
788 | if (!detect_deadlock && likely(rt_mutex_cmpxchg(lock, NULL, current))) { | ||
789 | rt_mutex_deadlock_account_lock(lock, current); | ||
790 | return 0; | ||
791 | } else | ||
792 | return slowfn(lock, state, timeout, detect_deadlock __RET_IP__); | ||
793 | } | ||
794 | |||
795 | static inline int | ||
796 | rt_mutex_fasttrylock(struct rt_mutex *lock, | ||
797 | int (*slowfn)(struct rt_mutex *lock __IP_DECL__)) | ||
798 | { | ||
799 | if (likely(rt_mutex_cmpxchg(lock, NULL, current))) { | ||
800 | rt_mutex_deadlock_account_lock(lock, current); | ||
801 | return 1; | ||
802 | } | ||
803 | return slowfn(lock __RET_IP__); | ||
804 | } | ||
805 | |||
806 | static inline void | ||
807 | rt_mutex_fastunlock(struct rt_mutex *lock, | ||
808 | void (*slowfn)(struct rt_mutex *lock)) | ||
809 | { | ||
810 | if (likely(rt_mutex_cmpxchg(lock, current, NULL))) | ||
811 | rt_mutex_deadlock_account_unlock(current); | ||
812 | else | ||
813 | slowfn(lock); | ||
814 | } | ||
815 | |||
816 | /** | ||
817 | * rt_mutex_lock - lock a rt_mutex | ||
818 | * | ||
819 | * @lock: the rt_mutex to be locked | ||
820 | */ | ||
821 | void __sched rt_mutex_lock(struct rt_mutex *lock) | ||
822 | { | ||
823 | might_sleep(); | ||
824 | |||
825 | rt_mutex_fastlock(lock, TASK_UNINTERRUPTIBLE, 0, rt_mutex_slowlock); | ||
826 | } | ||
827 | EXPORT_SYMBOL_GPL(rt_mutex_lock); | ||
828 | |||
829 | /** | ||
830 | * rt_mutex_lock_interruptible - lock a rt_mutex interruptible | ||
831 | * | ||
832 | * @lock: the rt_mutex to be locked | ||
833 | * @detect_deadlock: deadlock detection on/off | ||
834 | * | ||
835 | * Returns: | ||
836 | * 0 on success | ||
837 | * -EINTR when interrupted by a signal | ||
838 | * -EDEADLK when the lock would deadlock (when deadlock detection is on) | ||
839 | */ | ||
840 | int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock, | ||
841 | int detect_deadlock) | ||
842 | { | ||
843 | might_sleep(); | ||
844 | |||
845 | return rt_mutex_fastlock(lock, TASK_INTERRUPTIBLE, | ||
846 | detect_deadlock, rt_mutex_slowlock); | ||
847 | } | ||
848 | EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible); | ||
849 | |||
850 | /** | ||
851 | * rt_mutex_timed_lock - lock a rt_mutex interruptible | ||
852 | * the timeout structure is provided | ||
853 | * by the caller | ||
854 | * | ||
855 | * @lock: the rt_mutex to be locked | ||
856 | * @timeout: timeout structure or NULL (no timeout) | ||
857 | * @detect_deadlock: deadlock detection on/off | ||
858 | * | ||
859 | * Returns: | ||
860 | * 0 on success | ||
861 | * -EINTR when interrupted by a signal | ||
862 | * -ETIMEDOUT when the timeout expired | ||
863 | * -EDEADLK when the lock would deadlock (when deadlock detection is on) | ||
864 | */ | ||
865 | int | ||
866 | rt_mutex_timed_lock(struct rt_mutex *lock, struct hrtimer_sleeper *timeout, | ||
867 | int detect_deadlock) | ||
868 | { | ||
869 | might_sleep(); | ||
870 | |||
871 | return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout, | ||
872 | detect_deadlock, rt_mutex_slowlock); | ||
873 | } | ||
874 | EXPORT_SYMBOL_GPL(rt_mutex_timed_lock); | ||
875 | |||
876 | /** | ||
877 | * rt_mutex_trylock - try to lock a rt_mutex | ||
878 | * | ||
879 | * @lock: the rt_mutex to be locked | ||
880 | * | ||
881 | * Returns 1 on success and 0 on contention | ||
882 | */ | ||
883 | int __sched rt_mutex_trylock(struct rt_mutex *lock) | ||
884 | { | ||
885 | return rt_mutex_fasttrylock(lock, rt_mutex_slowtrylock); | ||
886 | } | ||
887 | EXPORT_SYMBOL_GPL(rt_mutex_trylock); | ||
888 | |||
889 | /** | ||
890 | * rt_mutex_unlock - unlock a rt_mutex | ||
891 | * | ||
892 | * @lock: the rt_mutex to be unlocked | ||
893 | */ | ||
894 | void __sched rt_mutex_unlock(struct rt_mutex *lock) | ||
895 | { | ||
896 | rt_mutex_fastunlock(lock, rt_mutex_slowunlock); | ||
897 | } | ||
898 | EXPORT_SYMBOL_GPL(rt_mutex_unlock); | ||
899 | |||
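For reference, a minimal usage sketch of the API exported above. It assumes the DEFINE_RT_MUTEX() static initializer from <linux/rtmutex.h>, which belongs to the same patch series but is not shown in this diff; the lock and function names are made up, and callers must be in sleepable context:

	#include <linux/rtmutex.h>

	static DEFINE_RT_MUTEX(example_lock);		/* assumed helper from <linux/rtmutex.h> */

	static int example_critical_section(void)
	{
		int ret;

		rt_mutex_lock(&example_lock);		/* uninterruptible, PI-boosts the owner */
		/* ... protected, possibly sleeping work ... */
		rt_mutex_unlock(&example_lock);

		ret = rt_mutex_lock_interruptible(&example_lock, 0);
		if (ret)
			return ret;			/* -EINTR on a signal */
		rt_mutex_unlock(&example_lock);

		if (rt_mutex_trylock(&example_lock)) {	/* never sleeps */
			/* ... */
			rt_mutex_unlock(&example_lock);
		}
		return 0;
	}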
900 | /** | ||
901 | * rt_mutex_destroy - mark a mutex unusable | ||
902 | * @lock: the mutex to be destroyed | ||
903 | * | ||
904 | * This function marks the mutex uninitialized, and any subsequent | ||
905 | * use of the mutex is forbidden. The mutex must not be locked when | ||
906 | * this function is called. | ||
907 | */ | ||
908 | void rt_mutex_destroy(struct rt_mutex *lock) | ||
909 | { | ||
910 | WARN_ON(rt_mutex_is_locked(lock)); | ||
911 | #ifdef CONFIG_DEBUG_RT_MUTEXES | ||
912 | lock->magic = NULL; | ||
913 | #endif | ||
914 | } | ||
915 | |||
916 | EXPORT_SYMBOL_GPL(rt_mutex_destroy); | ||
917 | |||
918 | /** | ||
919 | * __rt_mutex_init - initialize the rt lock | ||
920 | * | ||
921 | * @lock: the rt lock to be initialized | ||
922 | * | ||
923 | * Initialize the rt lock to unlocked state. | ||
924 | * | ||
925 | * Initializing a locked rt lock is not allowed | ||
926 | */ | ||
927 | void __rt_mutex_init(struct rt_mutex *lock, const char *name) | ||
928 | { | ||
929 | lock->owner = NULL; | ||
930 | spin_lock_init(&lock->wait_lock); | ||
931 | plist_head_init(&lock->wait_list, &lock->wait_lock); | ||
932 | |||
933 | debug_rt_mutex_init(lock, name); | ||
934 | } | ||
935 | EXPORT_SYMBOL_GPL(__rt_mutex_init); | ||
936 | |||
937 | /** | ||
938 | * rt_mutex_init_proxy_locked - initialize and lock a rt_mutex on behalf of a | ||
939 | * proxy owner | ||
940 | * | ||
941 | * @lock: the rt_mutex to be locked | ||
942 | * @proxy_owner: the task to set as owner | ||
943 | * | ||
944 | * No locking. Caller has to do serializing itself | ||
945 | * Special API call for PI-futex support | ||
946 | */ | ||
947 | void rt_mutex_init_proxy_locked(struct rt_mutex *lock, | ||
948 | struct task_struct *proxy_owner) | ||
949 | { | ||
950 | __rt_mutex_init(lock, NULL); | ||
951 | debug_rt_mutex_proxy_lock(lock, proxy_owner __RET_IP__); | ||
952 | rt_mutex_set_owner(lock, proxy_owner, 0); | ||
953 | rt_mutex_deadlock_account_lock(lock, proxy_owner); | ||
954 | } | ||
955 | |||
956 | /** | ||
957 | * rt_mutex_proxy_unlock - release a lock on behalf of owner | ||
958 | * | ||
959 | * @lock: the rt_mutex to be unlocked | ||
960 | * | ||
961 | * No locking. Caller has to do serializing itself | ||
962 | * Special API call for PI-futex support | ||
963 | */ | ||
964 | void rt_mutex_proxy_unlock(struct rt_mutex *lock, | ||
965 | struct task_struct *proxy_owner) | ||
966 | { | ||
967 | debug_rt_mutex_proxy_unlock(lock); | ||
968 | rt_mutex_set_owner(lock, NULL, 0); | ||
969 | rt_mutex_deadlock_account_unlock(proxy_owner); | ||
970 | } | ||
971 | |||
972 | /** | ||
973 | * rt_mutex_next_owner - return the next owner of the lock | ||
974 | * | ||
975 | * @lock: the rt lock to query | ||
976 | * | ||
977 | * Returns the next owner of the lock or NULL | ||
978 | * | ||
979 | * Caller has to serialize against other accessors to the lock | ||
980 | * itself. | ||
981 | * | ||
982 | * Special API call for PI-futex support | ||
983 | */ | ||
984 | struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock) | ||
985 | { | ||
986 | if (!rt_mutex_has_waiters(lock)) | ||
987 | return NULL; | ||
988 | |||
989 | return rt_mutex_top_waiter(lock)->task; | ||
990 | } | ||
diff --git a/kernel/rtmutex.h b/kernel/rtmutex.h new file mode 100644 index 000000000000..1e0fca13ff72 --- /dev/null +++ b/kernel/rtmutex.h | |||
@@ -0,0 +1,29 @@ | |||
1 | /* | ||
2 | * RT-Mutexes: blocking mutual exclusion locks with PI support | ||
3 | * | ||
4 | * started by Ingo Molnar and Thomas Gleixner: | ||
5 | * | ||
6 | * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> | ||
7 | * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com> | ||
8 | * | ||
9 | * This file contains macros used solely by rtmutex.c. | ||
10 | * Non-debug version. | ||
11 | */ | ||
12 | |||
13 | #define __IP_DECL__ | ||
14 | #define __IP__ | ||
15 | #define __RET_IP__ | ||
16 | #define rt_mutex_deadlock_check(l) (0) | ||
17 | #define rt_mutex_deadlock_account_lock(m, t) do { } while (0) | ||
18 | #define rt_mutex_deadlock_account_unlock(l) do { } while (0) | ||
19 | #define debug_rt_mutex_init_waiter(w) do { } while (0) | ||
20 | #define debug_rt_mutex_free_waiter(w) do { } while (0) | ||
21 | #define debug_rt_mutex_lock(l) do { } while (0) | ||
22 | #define debug_rt_mutex_proxy_lock(l,p) do { } while (0) | ||
23 | #define debug_rt_mutex_proxy_unlock(l) do { } while (0) | ||
24 | #define debug_rt_mutex_unlock(l) do { } while (0) | ||
25 | #define debug_rt_mutex_init(m, n) do { } while (0) | ||
26 | #define debug_rt_mutex_deadlock(d, a ,l) do { } while (0) | ||
27 | #define debug_rt_mutex_print_deadlock(w) do { } while (0) | ||
28 | #define debug_rt_mutex_detect_deadlock(w,d) (d) | ||
29 | #define debug_rt_mutex_reset_waiter(w) do { } while (0) | ||
diff --git a/kernel/rtmutex_common.h b/kernel/rtmutex_common.h new file mode 100644 index 000000000000..9c75856e791e --- /dev/null +++ b/kernel/rtmutex_common.h | |||
@@ -0,0 +1,123 @@ | |||
1 | /* | ||
2 | * RT Mutexes: blocking mutual exclusion locks with PI support | ||
3 | * | ||
4 | * started by Ingo Molnar and Thomas Gleixner: | ||
5 | * | ||
6 | * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> | ||
7 | * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com> | ||
8 | * | ||
9 | * This file contains the private data structure and API definitions. | ||
10 | */ | ||
11 | |||
12 | #ifndef __KERNEL_RTMUTEX_COMMON_H | ||
13 | #define __KERNEL_RTMUTEX_COMMON_H | ||
14 | |||
15 | #include <linux/rtmutex.h> | ||
16 | |||
17 | /* | ||
18 | * The rtmutex in-kernel tester is independent of rtmutex debugging. We | ||
19 | * call schedule_rt_mutex_test() instead of schedule() for the tasks which | ||
20 | * belong to the tester. That way we can delay the wakeup path of those | ||
21 | * threads to provoke lock stealing and testing of complex boosting scenarios. | ||
22 | */ | ||
23 | #ifdef CONFIG_RT_MUTEX_TESTER | ||
24 | |||
25 | extern void schedule_rt_mutex_test(struct rt_mutex *lock); | ||
26 | |||
27 | #define schedule_rt_mutex(_lock) \ | ||
28 | do { \ | ||
29 | if (!(current->flags & PF_MUTEX_TESTER)) \ | ||
30 | schedule(); \ | ||
31 | else \ | ||
32 | schedule_rt_mutex_test(_lock); \ | ||
33 | } while (0) | ||
34 | |||
35 | #else | ||
36 | # define schedule_rt_mutex(_lock) schedule() | ||
37 | #endif | ||
38 | |||
39 | /* | ||
40 | * This is the control structure for tasks blocked on a rt_mutex, | ||
41 | * which is allocated on the kernel stack of the blocked task. | ||
42 | * | ||
43 | * @list_entry: pi node to enqueue into the mutex waiters list | ||
44 | * @pi_list_entry: pi node to enqueue into the mutex owner waiters list | ||
45 | * @task: task reference to the blocked task | ||
46 | */ | ||
47 | struct rt_mutex_waiter { | ||
48 | struct plist_node list_entry; | ||
49 | struct plist_node pi_list_entry; | ||
50 | struct task_struct *task; | ||
51 | struct rt_mutex *lock; | ||
52 | #ifdef CONFIG_DEBUG_RT_MUTEXES | ||
53 | unsigned long ip; | ||
54 | pid_t deadlock_task_pid; | ||
55 | struct rt_mutex *deadlock_lock; | ||
56 | #endif | ||
57 | }; | ||
58 | |||
59 | /* | ||
60 | * Various helpers to access the waiters-plist: | ||
61 | */ | ||
62 | static inline int rt_mutex_has_waiters(struct rt_mutex *lock) | ||
63 | { | ||
64 | return !plist_head_empty(&lock->wait_list); | ||
65 | } | ||
66 | |||
67 | static inline struct rt_mutex_waiter * | ||
68 | rt_mutex_top_waiter(struct rt_mutex *lock) | ||
69 | { | ||
70 | struct rt_mutex_waiter *w; | ||
71 | |||
72 | w = plist_first_entry(&lock->wait_list, struct rt_mutex_waiter, | ||
73 | list_entry); | ||
74 | BUG_ON(w->lock != lock); | ||
75 | |||
76 | return w; | ||
77 | } | ||
78 | |||
79 | static inline int task_has_pi_waiters(struct task_struct *p) | ||
80 | { | ||
81 | return !plist_head_empty(&p->pi_waiters); | ||
82 | } | ||
83 | |||
84 | static inline struct rt_mutex_waiter * | ||
85 | task_top_pi_waiter(struct task_struct *p) | ||
86 | { | ||
87 | return plist_first_entry(&p->pi_waiters, struct rt_mutex_waiter, | ||
88 | pi_list_entry); | ||
89 | } | ||
90 | |||
91 | /* | ||
92 | * lock->owner state tracking: | ||
93 | */ | ||
94 | #define RT_MUTEX_OWNER_PENDING 1UL | ||
95 | #define RT_MUTEX_HAS_WAITERS 2UL | ||
96 | #define RT_MUTEX_OWNER_MASKALL 3UL | ||
97 | |||
98 | static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock) | ||
99 | { | ||
100 | return (struct task_struct *) | ||
101 | ((unsigned long)lock->owner & ~RT_MUTEX_OWNER_MASKALL); | ||
102 | } | ||
103 | |||
104 | static inline struct task_struct *rt_mutex_real_owner(struct rt_mutex *lock) | ||
105 | { | ||
106 | return (struct task_struct *) | ||
107 | ((unsigned long)lock->owner & ~RT_MUTEX_HAS_WAITERS); | ||
108 | } | ||
109 | |||
110 | static inline unsigned long rt_mutex_owner_pending(struct rt_mutex *lock) | ||
111 | { | ||
112 | return (unsigned long)lock->owner & RT_MUTEX_OWNER_PENDING; | ||
113 | } | ||
114 | |||
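These accessors decode a small state machine packed into the low bits of lock->owner, which works because task_struct pointers are at least four-byte aligned. Below is a short illustration of how the bits combine and come back out; the function is hypothetical and not part of the patch:

	/* Sketch: a pending owner with waiters queued. */
	static inline void rt_mutex_owner_bits_example(struct rt_mutex *lock,
						       struct task_struct *task)
	{
		lock->owner = (struct task_struct *)
			((unsigned long)task | RT_MUTEX_OWNER_PENDING
					     | RT_MUTEX_HAS_WAITERS);

		BUG_ON(rt_mutex_owner(lock) != task);	/* both flag bits masked off */
		BUG_ON(!rt_mutex_owner_pending(lock));	/* pending bit still visible */
	}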
115 | /* | ||
116 | * PI-futex support (proxy locking functions, etc.): | ||
117 | */ | ||
118 | extern struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock); | ||
119 | extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock, | ||
120 | struct task_struct *proxy_owner); | ||
121 | extern void rt_mutex_proxy_unlock(struct rt_mutex *lock, | ||
122 | struct task_struct *proxy_owner); | ||
123 | #endif | ||
diff --git a/kernel/sched.c b/kernel/sched.c index f06d059edef5..2629c1711fd6 100644 --- a/kernel/sched.c +++ b/kernel/sched.c | |||
@@ -168,15 +168,21 @@ | |||
168 | */ | 168 | */ |
169 | 169 | ||
170 | #define SCALE_PRIO(x, prio) \ | 170 | #define SCALE_PRIO(x, prio) \ |
171 | max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO/2), MIN_TIMESLICE) | 171 | max(x * (MAX_PRIO - prio) / (MAX_USER_PRIO / 2), MIN_TIMESLICE) |
172 | 172 | ||
173 | static unsigned int task_timeslice(task_t *p) | 173 | static unsigned int static_prio_timeslice(int static_prio) |
174 | { | 174 | { |
175 | if (p->static_prio < NICE_TO_PRIO(0)) | 175 | if (static_prio < NICE_TO_PRIO(0)) |
176 | return SCALE_PRIO(DEF_TIMESLICE*4, p->static_prio); | 176 | return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio); |
177 | else | 177 | else |
178 | return SCALE_PRIO(DEF_TIMESLICE, p->static_prio); | 178 | return SCALE_PRIO(DEF_TIMESLICE, static_prio); |
179 | } | 179 | } |
180 | |||
181 | static inline unsigned int task_timeslice(task_t *p) | ||
182 | { | ||
183 | return static_prio_timeslice(p->static_prio); | ||
184 | } | ||
185 | |||
180 | #define task_hot(p, now, sd) ((long long) ((now) - (p)->last_ran) \ | 186 | #define task_hot(p, now, sd) ((long long) ((now) - (p)->last_ran) \ |
181 | < (long long) (sd)->cache_hot_time) | 187 | < (long long) (sd)->cache_hot_time) |
182 | 188 | ||
@@ -184,13 +190,11 @@ static unsigned int task_timeslice(task_t *p) | |||
184 | * These are the runqueue data structures: | 190 | * These are the runqueue data structures: |
185 | */ | 191 | */ |
186 | 192 | ||
187 | #define BITMAP_SIZE ((((MAX_PRIO+1+7)/8)+sizeof(long)-1)/sizeof(long)) | ||
188 | |||
189 | typedef struct runqueue runqueue_t; | 193 | typedef struct runqueue runqueue_t; |
190 | 194 | ||
191 | struct prio_array { | 195 | struct prio_array { |
192 | unsigned int nr_active; | 196 | unsigned int nr_active; |
193 | unsigned long bitmap[BITMAP_SIZE]; | 197 | DECLARE_BITMAP(bitmap, MAX_PRIO+1); /* include 1 bit for delimiter */ |
194 | struct list_head queue[MAX_PRIO]; | 198 | struct list_head queue[MAX_PRIO]; |
195 | }; | 199 | }; |
196 | 200 | ||
@@ -209,6 +213,7 @@ struct runqueue { | |||
209 | * remote CPUs use both these fields when doing load calculation. | 213 | * remote CPUs use both these fields when doing load calculation. |
210 | */ | 214 | */ |
211 | unsigned long nr_running; | 215 | unsigned long nr_running; |
216 | unsigned long raw_weighted_load; | ||
212 | #ifdef CONFIG_SMP | 217 | #ifdef CONFIG_SMP |
213 | unsigned long cpu_load[3]; | 218 | unsigned long cpu_load[3]; |
214 | #endif | 219 | #endif |
@@ -239,7 +244,6 @@ struct runqueue { | |||
239 | 244 | ||
240 | task_t *migration_thread; | 245 | task_t *migration_thread; |
241 | struct list_head migration_queue; | 246 | struct list_head migration_queue; |
242 | int cpu; | ||
243 | #endif | 247 | #endif |
244 | 248 | ||
245 | #ifdef CONFIG_SCHEDSTATS | 249 | #ifdef CONFIG_SCHEDSTATS |
@@ -351,11 +355,30 @@ static inline void finish_lock_switch(runqueue_t *rq, task_t *prev) | |||
351 | #endif /* __ARCH_WANT_UNLOCKED_CTXSW */ | 355 | #endif /* __ARCH_WANT_UNLOCKED_CTXSW */ |
352 | 356 | ||
353 | /* | 357 | /* |
358 | * __task_rq_lock - lock the runqueue a given task resides on. | ||
359 | * Must be called with interrupts disabled. | ||
360 | */ | ||
361 | static inline runqueue_t *__task_rq_lock(task_t *p) | ||
362 | __acquires(rq->lock) | ||
363 | { | ||
364 | struct runqueue *rq; | ||
365 | |||
366 | repeat_lock_task: | ||
367 | rq = task_rq(p); | ||
368 | spin_lock(&rq->lock); | ||
369 | if (unlikely(rq != task_rq(p))) { | ||
370 | spin_unlock(&rq->lock); | ||
371 | goto repeat_lock_task; | ||
372 | } | ||
373 | return rq; | ||
374 | } | ||
375 | |||
376 | /* | ||
354 | * task_rq_lock - lock the runqueue a given task resides on and disable | 377 | * task_rq_lock - lock the runqueue a given task resides on and disable |
355 | * interrupts. Note the ordering: we can safely lookup the task_rq without | 378 | * interrupts. Note the ordering: we can safely lookup the task_rq without |
356 | * explicitly disabling preemption. | 379 | * explicitly disabling preemption. |
357 | */ | 380 | */ |
358 | static inline runqueue_t *task_rq_lock(task_t *p, unsigned long *flags) | 381 | static runqueue_t *task_rq_lock(task_t *p, unsigned long *flags) |
359 | __acquires(rq->lock) | 382 | __acquires(rq->lock) |
360 | { | 383 | { |
361 | struct runqueue *rq; | 384 | struct runqueue *rq; |
@@ -371,6 +394,12 @@ repeat_lock_task: | |||
371 | return rq; | 394 | return rq; |
372 | } | 395 | } |
373 | 396 | ||
397 | static inline void __task_rq_unlock(runqueue_t *rq) | ||
398 | __releases(rq->lock) | ||
399 | { | ||
400 | spin_unlock(&rq->lock); | ||
401 | } | ||
402 | |||
374 | static inline void task_rq_unlock(runqueue_t *rq, unsigned long *flags) | 403 | static inline void task_rq_unlock(runqueue_t *rq, unsigned long *flags) |
375 | __releases(rq->lock) | 404 | __releases(rq->lock) |
376 | { | 405 | { |
@@ -634,7 +663,7 @@ static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array) | |||
634 | } | 663 | } |
635 | 664 | ||
636 | /* | 665 | /* |
637 | * effective_prio - return the priority that is based on the static | 666 | * __normal_prio - return the priority that is based on the static |
638 | * priority but is modified by bonuses/penalties. | 667 | * priority but is modified by bonuses/penalties. |
639 | * | 668 | * |
640 | * We scale the actual sleep average [0 .... MAX_SLEEP_AVG] | 669 | * We scale the actual sleep average [0 .... MAX_SLEEP_AVG] |
@@ -647,13 +676,11 @@ static inline void enqueue_task_head(struct task_struct *p, prio_array_t *array) | |||
647 | * | 676 | * |
648 | * Both properties are important to certain workloads. | 677 | * Both properties are important to certain workloads. |
649 | */ | 678 | */ |
650 | static int effective_prio(task_t *p) | 679 | |
680 | static inline int __normal_prio(task_t *p) | ||
651 | { | 681 | { |
652 | int bonus, prio; | 682 | int bonus, prio; |
653 | 683 | ||
654 | if (rt_task(p)) | ||
655 | return p->prio; | ||
656 | |||
657 | bonus = CURRENT_BONUS(p) - MAX_BONUS / 2; | 684 | bonus = CURRENT_BONUS(p) - MAX_BONUS / 2; |
658 | 685 | ||
659 | prio = p->static_prio - bonus; | 686 | prio = p->static_prio - bonus; |
@@ -665,6 +692,106 @@ static int effective_prio(task_t *p) | |||
665 | } | 692 | } |
666 | 693 | ||
667 | /* | 694 | /* |
695 | * To aid in avoiding the subversion of "niceness" due to uneven distribution | ||
696 | * of tasks with abnormal "nice" values across CPUs, the contribution that | ||
697 | * each task makes to its run queue's load is weighted according to its | ||
698 | * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a | ||
699 | * scaled version of the new time slice allocation that they receive on time | ||
700 | * slice expiry etc. | ||
701 | */ | ||
702 | |||
703 | /* | ||
704 | * Assume: static_prio_timeslice(NICE_TO_PRIO(0)) == DEF_TIMESLICE | ||
705 | * If static_prio_timeslice() is ever changed to break this assumption then | ||
706 | * this code will need modification | ||
707 | */ | ||
708 | #define TIME_SLICE_NICE_ZERO DEF_TIMESLICE | ||
709 | #define LOAD_WEIGHT(lp) \ | ||
710 | (((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO) | ||
711 | #define PRIO_TO_LOAD_WEIGHT(prio) \ | ||
712 | LOAD_WEIGHT(static_prio_timeslice(prio)) | ||
713 | #define RTPRIO_TO_LOAD_WEIGHT(rp) \ | ||
714 | (PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + LOAD_WEIGHT(rp)) | ||
715 | |||
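As a quick sanity check of what these macros produce, here is a stand-alone userspace program. The constants are assumed typical values of this era (HZ=1000, DEF_TIMESLICE=100 jiffies, SCHED_LOAD_SCALE=128, MAX_PRIO=140, MAX_USER_PRIO=40, MIN_TIMESLICE=5) rather than values taken from this diff; with them a nice-0 task weighs exactly SCHED_LOAD_SCALE, nice -20 weighs eight times that, and nice +19 weighs almost nothing:

	#include <stdio.h>

	#define MIN_TIMESLICE		5
	#define DEF_TIMESLICE		100
	#define SCHED_LOAD_SCALE	128
	#define MAX_PRIO		140
	#define MAX_USER_PRIO		40
	#define NICE_TO_PRIO(nice)	(120 + (nice))

	/* same math as the kernel's SCALE_PRIO(), with max() open-coded */
	#define SCALE_PRIO(x, prio) \
		((x) * (MAX_PRIO - (prio)) / (MAX_USER_PRIO / 2) > MIN_TIMESLICE ? \
		 (x) * (MAX_PRIO - (prio)) / (MAX_USER_PRIO / 2) : MIN_TIMESLICE)

	static int static_prio_timeslice(int static_prio)
	{
		if (static_prio < NICE_TO_PRIO(0))
			return SCALE_PRIO(DEF_TIMESLICE * 4, static_prio);
		return SCALE_PRIO(DEF_TIMESLICE, static_prio);
	}

	#define LOAD_WEIGHT(lp)	((lp) * SCHED_LOAD_SCALE / DEF_TIMESLICE)

	int main(void)
	{
		/* nice -20, 0 and +19 map to static_prio 100, 120 and 139 */
		printf("nice -20: %d\n", LOAD_WEIGHT(static_prio_timeslice(100)));	/* 1024 */
		printf("nice   0: %d\n", LOAD_WEIGHT(static_prio_timeslice(120)));	/*  128 */
		printf("nice +19: %d\n", LOAD_WEIGHT(static_prio_timeslice(139)));	/*    6 */
		return 0;
	}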
716 | static void set_load_weight(task_t *p) | ||
717 | { | ||
718 | if (has_rt_policy(p)) { | ||
719 | #ifdef CONFIG_SMP | ||
720 | if (p == task_rq(p)->migration_thread) | ||
721 | /* | ||
722 | * The migration thread does the actual balancing. | ||
723 | * Giving its load any weight will skew balancing | ||
724 | * adversely. | ||
725 | */ | ||
726 | p->load_weight = 0; | ||
727 | else | ||
728 | #endif | ||
729 | p->load_weight = RTPRIO_TO_LOAD_WEIGHT(p->rt_priority); | ||
730 | } else | ||
731 | p->load_weight = PRIO_TO_LOAD_WEIGHT(p->static_prio); | ||
732 | } | ||
733 | |||
734 | static inline void inc_raw_weighted_load(runqueue_t *rq, const task_t *p) | ||
735 | { | ||
736 | rq->raw_weighted_load += p->load_weight; | ||
737 | } | ||
738 | |||
739 | static inline void dec_raw_weighted_load(runqueue_t *rq, const task_t *p) | ||
740 | { | ||
741 | rq->raw_weighted_load -= p->load_weight; | ||
742 | } | ||
743 | |||
744 | static inline void inc_nr_running(task_t *p, runqueue_t *rq) | ||
745 | { | ||
746 | rq->nr_running++; | ||
747 | inc_raw_weighted_load(rq, p); | ||
748 | } | ||
749 | |||
750 | static inline void dec_nr_running(task_t *p, runqueue_t *rq) | ||
751 | { | ||
752 | rq->nr_running--; | ||
753 | dec_raw_weighted_load(rq, p); | ||
754 | } | ||
755 | |||
756 | /* | ||
757 | * Calculate the expected normal priority: i.e. priority | ||
758 | * without taking RT-inheritance into account. Might be | ||
759 | * boosted by interactivity modifiers. Changes upon fork, | ||
760 | * setprio syscalls, and whenever the interactivity | ||
761 | * estimator recalculates. | ||
762 | */ | ||
763 | static inline int normal_prio(task_t *p) | ||
764 | { | ||
765 | int prio; | ||
766 | |||
767 | if (has_rt_policy(p)) | ||
768 | prio = MAX_RT_PRIO-1 - p->rt_priority; | ||
769 | else | ||
770 | prio = __normal_prio(p); | ||
771 | return prio; | ||
772 | } | ||
773 | |||
774 | /* | ||
775 | * Calculate the current priority, i.e. the priority | ||
776 | * taken into account by the scheduler. This value might | ||
777 | * be boosted by RT tasks, or might be boosted by | ||
778 | * interactivity modifiers. Will be RT if the task got | ||
779 | * RT-boosted. If not then it returns p->normal_prio. | ||
780 | */ | ||
781 | static int effective_prio(task_t *p) | ||
782 | { | ||
783 | p->normal_prio = normal_prio(p); | ||
784 | /* | ||
785 | * If we are RT tasks or we were boosted to RT priority, | ||
786 | * keep the priority unchanged. Otherwise, update priority | ||
787 | * to the normal priority: | ||
788 | */ | ||
789 | if (!rt_prio(p->prio)) | ||
790 | return p->normal_prio; | ||
791 | return p->prio; | ||
792 | } | ||
793 | |||
794 | /* | ||
668 | * __activate_task - move a task to the runqueue. | 795 | * __activate_task - move a task to the runqueue. |
669 | */ | 796 | */ |
670 | static void __activate_task(task_t *p, runqueue_t *rq) | 797 | static void __activate_task(task_t *p, runqueue_t *rq) |
@@ -674,7 +801,7 @@ static void __activate_task(task_t *p, runqueue_t *rq) | |||
674 | if (batch_task(p)) | 801 | if (batch_task(p)) |
675 | target = rq->expired; | 802 | target = rq->expired; |
676 | enqueue_task(p, target); | 803 | enqueue_task(p, target); |
677 | rq->nr_running++; | 804 | inc_nr_running(p, rq); |
678 | } | 805 | } |
679 | 806 | ||
680 | /* | 807 | /* |
@@ -683,39 +810,45 @@ static void __activate_task(task_t *p, runqueue_t *rq) | |||
683 | static inline void __activate_idle_task(task_t *p, runqueue_t *rq) | 810 | static inline void __activate_idle_task(task_t *p, runqueue_t *rq) |
684 | { | 811 | { |
685 | enqueue_task_head(p, rq->active); | 812 | enqueue_task_head(p, rq->active); |
686 | rq->nr_running++; | 813 | inc_nr_running(p, rq); |
687 | } | 814 | } |
688 | 815 | ||
816 | /* | ||
817 | * Recalculate p->normal_prio and p->prio after having slept, | ||
818 | * updating the sleep-average too: | ||
819 | */ | ||
689 | static int recalc_task_prio(task_t *p, unsigned long long now) | 820 | static int recalc_task_prio(task_t *p, unsigned long long now) |
690 | { | 821 | { |
691 | /* Caller must always ensure 'now >= p->timestamp' */ | 822 | /* Caller must always ensure 'now >= p->timestamp' */ |
692 | unsigned long long __sleep_time = now - p->timestamp; | 823 | unsigned long sleep_time = now - p->timestamp; |
693 | unsigned long sleep_time; | ||
694 | 824 | ||
695 | if (batch_task(p)) | 825 | if (batch_task(p)) |
696 | sleep_time = 0; | 826 | sleep_time = 0; |
697 | else { | ||
698 | if (__sleep_time > NS_MAX_SLEEP_AVG) | ||
699 | sleep_time = NS_MAX_SLEEP_AVG; | ||
700 | else | ||
701 | sleep_time = (unsigned long)__sleep_time; | ||
702 | } | ||
703 | 827 | ||
704 | if (likely(sleep_time > 0)) { | 828 | if (likely(sleep_time > 0)) { |
705 | /* | 829 | /* |
706 | * User tasks that sleep a long time are categorised as | 830 | * This ceiling is set to the lowest priority that would allow |
707 | * idle. They will only have their sleep_avg increased to a | 831 | * a task to be reinserted into the active array on timeslice |
708 | * level that makes them just interactive priority to stay | 832 | * completion. |
709 | * active yet prevent them suddenly becoming cpu hogs and | ||
710 | * starving other processes. | ||
711 | */ | 833 | */ |
712 | if (p->mm && sleep_time > INTERACTIVE_SLEEP(p)) { | 834 | unsigned long ceiling = INTERACTIVE_SLEEP(p); |
713 | unsigned long ceiling; | ||
714 | 835 | ||
715 | ceiling = JIFFIES_TO_NS(MAX_SLEEP_AVG - | 836 | if (p->mm && sleep_time > ceiling && p->sleep_avg < ceiling) { |
716 | DEF_TIMESLICE); | 837 | /* |
717 | if (p->sleep_avg < ceiling) | 838 | * Prevents user tasks from achieving best priority |
718 | p->sleep_avg = ceiling; | 839 | * with one single large enough sleep. |
840 | */ | ||
841 | p->sleep_avg = ceiling; | ||
842 | /* | ||
843 | * Using INTERACTIVE_SLEEP() as a ceiling places a | ||
844 | * nice(0) task 1ms sleep away from promotion, and | ||
845 | * gives it 700ms to round-robin with no chance of | ||
846 | * being demoted. This is more than generous, so | ||
847 | * mark this sleep as non-interactive to prevent the | ||
848 | * on-runqueue bonus logic from intervening should | ||
849 | * this task not receive cpu immediately. | ||
850 | */ | ||
851 | p->sleep_type = SLEEP_NONINTERACTIVE; | ||
719 | } else { | 852 | } else { |
720 | /* | 853 | /* |
721 | * Tasks waking from uninterruptible sleep are | 854 | * Tasks waking from uninterruptible sleep are |
@@ -723,12 +856,12 @@ static int recalc_task_prio(task_t *p, unsigned long long now) | |||
723 | * are likely to be waiting on I/O | 856 | * are likely to be waiting on I/O |
724 | */ | 857 | */ |
725 | if (p->sleep_type == SLEEP_NONINTERACTIVE && p->mm) { | 858 | if (p->sleep_type == SLEEP_NONINTERACTIVE && p->mm) { |
726 | if (p->sleep_avg >= INTERACTIVE_SLEEP(p)) | 859 | if (p->sleep_avg >= ceiling) |
727 | sleep_time = 0; | 860 | sleep_time = 0; |
728 | else if (p->sleep_avg + sleep_time >= | 861 | else if (p->sleep_avg + sleep_time >= |
729 | INTERACTIVE_SLEEP(p)) { | 862 | ceiling) { |
730 | p->sleep_avg = INTERACTIVE_SLEEP(p); | 863 | p->sleep_avg = ceiling; |
731 | sleep_time = 0; | 864 | sleep_time = 0; |
732 | } | 865 | } |
733 | } | 866 | } |
734 | 867 | ||
@@ -742,9 +875,9 @@ static int recalc_task_prio(task_t *p, unsigned long long now) | |||
742 | */ | 875 | */ |
743 | p->sleep_avg += sleep_time; | 876 | p->sleep_avg += sleep_time; |
744 | 877 | ||
745 | if (p->sleep_avg > NS_MAX_SLEEP_AVG) | ||
746 | p->sleep_avg = NS_MAX_SLEEP_AVG; | ||
747 | } | 878 | } |
879 | if (p->sleep_avg > NS_MAX_SLEEP_AVG) | ||
880 | p->sleep_avg = NS_MAX_SLEEP_AVG; | ||
748 | } | 881 | } |
749 | 882 | ||
750 | return effective_prio(p); | 883 | return effective_prio(p); |
@@ -805,7 +938,7 @@ static void activate_task(task_t *p, runqueue_t *rq, int local) | |||
805 | */ | 938 | */ |
806 | static void deactivate_task(struct task_struct *p, runqueue_t *rq) | 939 | static void deactivate_task(struct task_struct *p, runqueue_t *rq) |
807 | { | 940 | { |
808 | rq->nr_running--; | 941 | dec_nr_running(p, rq); |
809 | dequeue_task(p, p->array); | 942 | dequeue_task(p, p->array); |
810 | p->array = NULL; | 943 | p->array = NULL; |
811 | } | 944 | } |
@@ -818,6 +951,11 @@ static void deactivate_task(struct task_struct *p, runqueue_t *rq) | |||
818 | * the target CPU. | 951 | * the target CPU. |
819 | */ | 952 | */ |
820 | #ifdef CONFIG_SMP | 953 | #ifdef CONFIG_SMP |
954 | |||
955 | #ifndef tsk_is_polling | ||
956 | #define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) | ||
957 | #endif | ||
958 | |||
821 | static void resched_task(task_t *p) | 959 | static void resched_task(task_t *p) |
822 | { | 960 | { |
823 | int cpu; | 961 | int cpu; |
@@ -833,9 +971,9 @@ static void resched_task(task_t *p) | |||
833 | if (cpu == smp_processor_id()) | 971 | if (cpu == smp_processor_id()) |
834 | return; | 972 | return; |
835 | 973 | ||
836 | /* NEED_RESCHED must be visible before we test POLLING_NRFLAG */ | 974 | /* NEED_RESCHED must be visible before we test polling */ |
837 | smp_mb(); | 975 | smp_mb(); |
838 | if (!test_tsk_thread_flag(p, TIF_POLLING_NRFLAG)) | 976 | if (!tsk_is_polling(p)) |
839 | smp_send_reschedule(cpu); | 977 | smp_send_reschedule(cpu); |
840 | } | 978 | } |
841 | #else | 979 | #else |
@@ -855,6 +993,12 @@ inline int task_curr(const task_t *p) | |||
855 | return cpu_curr(task_cpu(p)) == p; | 993 | return cpu_curr(task_cpu(p)) == p; |
856 | } | 994 | } |
857 | 995 | ||
996 | /* Used instead of source_load when we know the type == 0 */ | ||
997 | unsigned long weighted_cpuload(const int cpu) | ||
998 | { | ||
999 | return cpu_rq(cpu)->raw_weighted_load; | ||
1000 | } | ||
1001 | |||
858 | #ifdef CONFIG_SMP | 1002 | #ifdef CONFIG_SMP |
859 | typedef struct { | 1003 | typedef struct { |
860 | struct list_head list; | 1004 | struct list_head list; |
@@ -944,7 +1088,8 @@ void kick_process(task_t *p) | |||
944 | } | 1088 | } |
945 | 1089 | ||
946 | /* | 1090 | /* |
947 | * Return a low guess at the load of a migration-source cpu. | 1091 | * Return a low guess at the load of a migration-source cpu weighted |
1092 | * according to the scheduling class and "nice" value. | ||
948 | * | 1093 | * |
949 | * We want to under-estimate the load of migration sources, to | 1094 | * We want to under-estimate the load of migration sources, to |
950 | * balance conservatively. | 1095 | * balance conservatively. |
@@ -952,24 +1097,36 @@ void kick_process(task_t *p) | |||
952 | static inline unsigned long source_load(int cpu, int type) | 1097 | static inline unsigned long source_load(int cpu, int type) |
953 | { | 1098 | { |
954 | runqueue_t *rq = cpu_rq(cpu); | 1099 | runqueue_t *rq = cpu_rq(cpu); |
955 | unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE; | 1100 | |
956 | if (type == 0) | 1101 | if (type == 0) |
957 | return load_now; | 1102 | return rq->raw_weighted_load; |
958 | 1103 | ||
959 | return min(rq->cpu_load[type-1], load_now); | 1104 | return min(rq->cpu_load[type-1], rq->raw_weighted_load); |
960 | } | 1105 | } |
961 | 1106 | ||
962 | /* | 1107 | /* |
963 | * Return a high guess at the load of a migration-target cpu | 1108 | * Return a high guess at the load of a migration-target cpu weighted |
1109 | * according to the scheduling class and "nice" value. | ||
964 | */ | 1110 | */ |
965 | static inline unsigned long target_load(int cpu, int type) | 1111 | static inline unsigned long target_load(int cpu, int type) |
966 | { | 1112 | { |
967 | runqueue_t *rq = cpu_rq(cpu); | 1113 | runqueue_t *rq = cpu_rq(cpu); |
968 | unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE; | 1114 | |
969 | if (type == 0) | 1115 | if (type == 0) |
970 | return load_now; | 1116 | return rq->raw_weighted_load; |
1117 | |||
1118 | return max(rq->cpu_load[type-1], rq->raw_weighted_load); | ||
1119 | } | ||
1120 | |||
1121 | /* | ||
1122 | * Return the average load per task on the cpu's run queue | ||
1123 | */ | ||
1124 | static inline unsigned long cpu_avg_load_per_task(int cpu) | ||
1125 | { | ||
1126 | runqueue_t *rq = cpu_rq(cpu); | ||
1127 | unsigned long n = rq->nr_running; | ||
971 | 1128 | ||
972 | return max(rq->cpu_load[type-1], load_now); | 1129 | return n ? rq->raw_weighted_load / n : SCHED_LOAD_SCALE; |
973 | } | 1130 | } |
974 | 1131 | ||
975 | /* | 1132 | /* |
@@ -1042,7 +1199,7 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) | |||
1042 | cpus_and(tmp, group->cpumask, p->cpus_allowed); | 1199 | cpus_and(tmp, group->cpumask, p->cpus_allowed); |
1043 | 1200 | ||
1044 | for_each_cpu_mask(i, tmp) { | 1201 | for_each_cpu_mask(i, tmp) { |
1045 | load = source_load(i, 0); | 1202 | load = weighted_cpuload(i); |
1046 | 1203 | ||
1047 | if (load < min_load || (load == min_load && i == this_cpu)) { | 1204 | if (load < min_load || (load == min_load && i == this_cpu)) { |
1048 | min_load = load; | 1205 | min_load = load; |
@@ -1069,9 +1226,15 @@ static int sched_balance_self(int cpu, int flag) | |||
1069 | struct task_struct *t = current; | 1226 | struct task_struct *t = current; |
1070 | struct sched_domain *tmp, *sd = NULL; | 1227 | struct sched_domain *tmp, *sd = NULL; |
1071 | 1228 | ||
1072 | for_each_domain(cpu, tmp) | 1229 | for_each_domain(cpu, tmp) { |
1230 | /* | ||
1231 | * If power savings logic is enabled for a domain, stop there. | ||
1232 | */ | ||
1233 | if (tmp->flags & SD_POWERSAVINGS_BALANCE) | ||
1234 | break; | ||
1073 | if (tmp->flags & flag) | 1235 | if (tmp->flags & flag) |
1074 | sd = tmp; | 1236 | sd = tmp; |
1237 | } | ||
1075 | 1238 | ||
1076 | while (sd) { | 1239 | while (sd) { |
1077 | cpumask_t span; | 1240 | cpumask_t span; |
@@ -1221,17 +1384,19 @@ static int try_to_wake_up(task_t *p, unsigned int state, int sync) | |||
1221 | 1384 | ||
1222 | if (this_sd->flags & SD_WAKE_AFFINE) { | 1385 | if (this_sd->flags & SD_WAKE_AFFINE) { |
1223 | unsigned long tl = this_load; | 1386 | unsigned long tl = this_load; |
1387 | unsigned long tl_per_task = cpu_avg_load_per_task(this_cpu); | ||
1388 | |||
1224 | /* | 1389 | /* |
1225 | * If sync wakeup then subtract the (maximum possible) | 1390 | * If sync wakeup then subtract the (maximum possible) |
1226 | * effect of the currently running task from the load | 1391 | * effect of the currently running task from the load |
1227 | * of the current CPU: | 1392 | * of the current CPU: |
1228 | */ | 1393 | */ |
1229 | if (sync) | 1394 | if (sync) |
1230 | tl -= SCHED_LOAD_SCALE; | 1395 | tl -= current->load_weight; |
1231 | 1396 | ||
1232 | if ((tl <= load && | 1397 | if ((tl <= load && |
1233 | tl + target_load(cpu, idx) <= SCHED_LOAD_SCALE) || | 1398 | tl + target_load(cpu, idx) <= tl_per_task) || |
1234 | 100*(tl + SCHED_LOAD_SCALE) <= imbalance*load) { | 1399 | 100*(tl + p->load_weight) <= imbalance*load) { |
1235 | /* | 1400 | /* |
1236 | * This domain has SD_WAKE_AFFINE and | 1401 | * This domain has SD_WAKE_AFFINE and |
1237 | * p is cache cold in this domain, and | 1402 | * p is cache cold in this domain, and |
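The affine-wakeup test above now reasons in load_weight units: a sync wakeup discounts the waker's own weight rather than a whole SCHED_LOAD_SCALE, and the thresholds become the destination's average load per task and the wakee's weight. Below is a pure-predicate sketch of that decision; the parameter names follow the hunk, but every value, the underflow guard, and the percentage-style imbalance factor are assumptions of the sketch.

/* Sketch of the SD_WAKE_AFFINE decision as a stand-alone predicate. */
#include <stdbool.h>
#include <stdio.h>

static bool wake_affine(unsigned long this_load,        /* tl */
                        unsigned long target_cpu_load,  /* target_load(cpu, idx) */
                        unsigned long waker_weight,     /* current->load_weight */
                        unsigned long wakee_weight,     /* p->load_weight */
                        unsigned long tl_per_task,      /* cpu_avg_load_per_task() */
                        unsigned long prev_cpu_load,    /* load on p's previous cpu */
                        unsigned int imbalance_pct,     /* percentage-style factor */
                        bool sync)
{
        unsigned long tl = this_load;

        /* A sync waker is about to sleep: discount its own weight. */
        if (sync && tl >= waker_weight)
                tl -= waker_weight;

        return (tl <= prev_cpu_load &&
                tl + target_cpu_load <= tl_per_task) ||
               100 * (tl + wakee_weight) <= imbalance_pct * prev_cpu_load;
}

int main(void)
{
        /* invented numbers: two nice-0 tasks here, a heavily loaded prev cpu */
        printf("pull to this cpu? %d\n",
               wake_affine(256, 128, 128, 128, 128, 512, 125, true));
        return 0;
}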
@@ -1348,6 +1513,12 @@ void fastcall sched_fork(task_t *p, int clone_flags) | |||
1348 | * event cannot wake it up and insert it on the runqueue either. | 1513 | * event cannot wake it up and insert it on the runqueue either. |
1349 | */ | 1514 | */ |
1350 | p->state = TASK_RUNNING; | 1515 | p->state = TASK_RUNNING; |
1516 | |||
1517 | /* | ||
1518 | * Make sure we do not leak PI boosting priority to the child: | ||
1519 | */ | ||
1520 | p->prio = current->normal_prio; | ||
1521 | |||
1351 | INIT_LIST_HEAD(&p->run_list); | 1522 | INIT_LIST_HEAD(&p->run_list); |
1352 | p->array = NULL; | 1523 | p->array = NULL; |
1353 | #ifdef CONFIG_SCHEDSTATS | 1524 | #ifdef CONFIG_SCHEDSTATS |
@@ -1427,10 +1598,11 @@ void fastcall wake_up_new_task(task_t *p, unsigned long clone_flags) | |||
1427 | __activate_task(p, rq); | 1598 | __activate_task(p, rq); |
1428 | else { | 1599 | else { |
1429 | p->prio = current->prio; | 1600 | p->prio = current->prio; |
1601 | p->normal_prio = current->normal_prio; | ||
1430 | list_add_tail(&p->run_list, ¤t->run_list); | 1602 | list_add_tail(&p->run_list, ¤t->run_list); |
1431 | p->array = current->array; | 1603 | p->array = current->array; |
1432 | p->array->nr_active++; | 1604 | p->array->nr_active++; |
1433 | rq->nr_running++; | 1605 | inc_nr_running(p, rq); |
1434 | } | 1606 | } |
1435 | set_need_resched(); | 1607 | set_need_resched(); |
1436 | } else | 1608 | } else |
@@ -1648,7 +1820,8 @@ unsigned long nr_uninterruptible(void) | |||
1648 | 1820 | ||
1649 | unsigned long long nr_context_switches(void) | 1821 | unsigned long long nr_context_switches(void) |
1650 | { | 1822 | { |
1651 | unsigned long long i, sum = 0; | 1823 | int i; |
1824 | unsigned long long sum = 0; | ||
1652 | 1825 | ||
1653 | for_each_possible_cpu(i) | 1826 | for_each_possible_cpu(i) |
1654 | sum += cpu_rq(i)->nr_switches; | 1827 | sum += cpu_rq(i)->nr_switches; |
@@ -1686,9 +1859,6 @@ unsigned long nr_active(void) | |||
1686 | /* | 1859 | /* |
1687 | * double_rq_lock - safely lock two runqueues | 1860 | * double_rq_lock - safely lock two runqueues |
1688 | * | 1861 | * |
1689 | * We must take them in cpu order to match code in | ||
1690 | * dependent_sleeper and wake_dependent_sleeper. | ||
1691 | * | ||
1692 | * Note this does not disable interrupts like task_rq_lock, | 1862 | * Note this does not disable interrupts like task_rq_lock, |
1693 | * you need to do so manually before calling. | 1863 | * you need to do so manually before calling. |
1694 | */ | 1864 | */ |
@@ -1700,7 +1870,7 @@ static void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2) | |||
1700 | spin_lock(&rq1->lock); | 1870 | spin_lock(&rq1->lock); |
1701 | __acquire(rq2->lock); /* Fake it out ;) */ | 1871 | __acquire(rq2->lock); /* Fake it out ;) */ |
1702 | } else { | 1872 | } else { |
1703 | if (rq1->cpu < rq2->cpu) { | 1873 | if (rq1 < rq2) { |
1704 | spin_lock(&rq1->lock); | 1874 | spin_lock(&rq1->lock); |
1705 | spin_lock(&rq2->lock); | 1875 | spin_lock(&rq2->lock); |
1706 | } else { | 1876 | } else { |
@@ -1736,7 +1906,7 @@ static void double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest) | |||
1736 | __acquires(this_rq->lock) | 1906 | __acquires(this_rq->lock) |
1737 | { | 1907 | { |
1738 | if (unlikely(!spin_trylock(&busiest->lock))) { | 1908 | if (unlikely(!spin_trylock(&busiest->lock))) { |
1739 | if (busiest->cpu < this_rq->cpu) { | 1909 | if (busiest < this_rq) { |
1740 | spin_unlock(&this_rq->lock); | 1910 | spin_unlock(&this_rq->lock); |
1741 | spin_lock(&busiest->lock); | 1911 | spin_lock(&busiest->lock); |
1742 | spin_lock(&this_rq->lock); | 1912 | spin_lock(&this_rq->lock); |
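Both hunks above switch the double-locking helpers from ordering by rq->cpu to ordering by runqueue address, so the deadlock-avoidance rule no longer needs a cpu field. A user-space sketch of address-ordered double locking, with pthread mutexes standing in for the runqueue spinlocks (link with -lpthread):

/* Always take the lock at the lower address first, so two threads locking
 * the same pair of queues can never deadlock.  Not kernel code. */
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

struct runqueue_sketch {
        pthread_mutex_t lock;
        unsigned long nr_running;
};

static void double_rq_lock(struct runqueue_sketch *rq1, struct runqueue_sketch *rq2)
{
        if (rq1 == rq2) {
                pthread_mutex_lock(&rq1->lock);          /* same queue: one lock */
        } else if ((uintptr_t)rq1 < (uintptr_t)rq2) {    /* order by address */
                pthread_mutex_lock(&rq1->lock);
                pthread_mutex_lock(&rq2->lock);
        } else {
                pthread_mutex_lock(&rq2->lock);
                pthread_mutex_lock(&rq1->lock);
        }
}

static void double_rq_unlock(struct runqueue_sketch *rq1, struct runqueue_sketch *rq2)
{
        pthread_mutex_unlock(&rq1->lock);
        if (rq1 != rq2)
                pthread_mutex_unlock(&rq2->lock);
}

int main(void)
{
        struct runqueue_sketch a = { PTHREAD_MUTEX_INITIALIZER, 0 };
        struct runqueue_sketch b = { PTHREAD_MUTEX_INITIALIZER, 0 };

        double_rq_lock(&a, &b);
        a.nr_running++;                                  /* both queues held */
        double_rq_unlock(&a, &b);
        printf("ok\n");
        return 0;
}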
@@ -1799,9 +1969,9 @@ void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p, | |||
1799 | runqueue_t *this_rq, prio_array_t *this_array, int this_cpu) | 1969 | runqueue_t *this_rq, prio_array_t *this_array, int this_cpu) |
1800 | { | 1970 | { |
1801 | dequeue_task(p, src_array); | 1971 | dequeue_task(p, src_array); |
1802 | src_rq->nr_running--; | 1972 | dec_nr_running(p, src_rq); |
1803 | set_task_cpu(p, this_cpu); | 1973 | set_task_cpu(p, this_cpu); |
1804 | this_rq->nr_running++; | 1974 | inc_nr_running(p, this_rq); |
1805 | enqueue_task(p, this_array); | 1975 | enqueue_task(p, this_array); |
1806 | p->timestamp = (p->timestamp - src_rq->timestamp_last_tick) | 1976 | p->timestamp = (p->timestamp - src_rq->timestamp_last_tick) |
1807 | + this_rq->timestamp_last_tick; | 1977 | + this_rq->timestamp_last_tick; |
@@ -1848,26 +2018,42 @@ int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu, | |||
1848 | return 1; | 2018 | return 1; |
1849 | } | 2019 | } |
1850 | 2020 | ||
2021 | #define rq_best_prio(rq) min((rq)->curr->prio, (rq)->best_expired_prio) | ||
1851 | /* | 2022 | /* |
1852 | * move_tasks tries to move up to max_nr_move tasks from busiest to this_rq, | 2023 | * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted |
1853 | * as part of a balancing operation within "domain". Returns the number of | 2024 | * load from busiest to this_rq, as part of a balancing operation within |
1854 | * tasks moved. | 2025 | * "domain". Returns the number of tasks moved. |
1855 | * | 2026 | * |
1856 | * Called with both runqueues locked. | 2027 | * Called with both runqueues locked. |
1857 | */ | 2028 | */ |
1858 | static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest, | 2029 | static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest, |
1859 | unsigned long max_nr_move, struct sched_domain *sd, | 2030 | unsigned long max_nr_move, unsigned long max_load_move, |
1860 | enum idle_type idle, int *all_pinned) | 2031 | struct sched_domain *sd, enum idle_type idle, |
2032 | int *all_pinned) | ||
1861 | { | 2033 | { |
1862 | prio_array_t *array, *dst_array; | 2034 | prio_array_t *array, *dst_array; |
1863 | struct list_head *head, *curr; | 2035 | struct list_head *head, *curr; |
1864 | int idx, pulled = 0, pinned = 0; | 2036 | int idx, pulled = 0, pinned = 0, this_best_prio, busiest_best_prio; |
2037 | int busiest_best_prio_seen; | ||
2038 | int skip_for_load; /* skip the task based on weighted load issues */ | ||
2039 | long rem_load_move; | ||
1865 | task_t *tmp; | 2040 | task_t *tmp; |
1866 | 2041 | ||
1867 | if (max_nr_move == 0) | 2042 | if (max_nr_move == 0 || max_load_move == 0) |
1868 | goto out; | 2043 | goto out; |
1869 | 2044 | ||
2045 | rem_load_move = max_load_move; | ||
1870 | pinned = 1; | 2046 | pinned = 1; |
2047 | this_best_prio = rq_best_prio(this_rq); | ||
2048 | busiest_best_prio = rq_best_prio(busiest); | ||
2049 | /* | ||
2050 | * Enable handling of the case where there is more than one task | ||
2051 | * with the best priority. If the current running task is one | ||
2052 | * of those with prio==busiest_best_prio we know it won't be moved | ||
2053 | * and therefore it's safe to override the skip (based on load) of | ||
2054 | * any task we find with that prio. | ||
2055 | */ | ||
2056 | busiest_best_prio_seen = busiest_best_prio == busiest->curr->prio; | ||
1871 | 2057 | ||
1872 | /* | 2058 | /* |
1873 | * We first consider expired tasks. Those will likely not be | 2059 | * We first consider expired tasks. Those will likely not be |
@@ -1907,7 +2093,17 @@ skip_queue: | |||
1907 | 2093 | ||
1908 | curr = curr->prev; | 2094 | curr = curr->prev; |
1909 | 2095 | ||
1910 | if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle, &pinned)) { | 2096 | /* |
2097 | * To help distribute high priority tasks across CPUs we don't | ||

2098 | * skip a task if it will be the highest priority task (i.e. smallest | ||
2099 | * prio value) on its new queue regardless of its load weight | ||
2100 | */ | ||
2101 | skip_for_load = tmp->load_weight > rem_load_move; | ||
2102 | if (skip_for_load && idx < this_best_prio) | ||
2103 | skip_for_load = !busiest_best_prio_seen && idx == busiest_best_prio; | ||
2104 | if (skip_for_load || | ||
2105 | !can_migrate_task(tmp, busiest, this_cpu, sd, idle, &pinned)) { | ||
2106 | busiest_best_prio_seen |= idx == busiest_best_prio; | ||
1911 | if (curr != head) | 2107 | if (curr != head) |
1912 | goto skip_queue; | 2108 | goto skip_queue; |
1913 | idx++; | 2109 | idx++; |
@@ -1921,9 +2117,15 @@ skip_queue: | |||
1921 | 2117 | ||
1922 | pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu); | 2118 | pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu); |
1923 | pulled++; | 2119 | pulled++; |
2120 | rem_load_move -= tmp->load_weight; | ||
1924 | 2121 | ||
1925 | /* We only want to steal up to the prescribed number of tasks. */ | 2122 | /* |
1926 | if (pulled < max_nr_move) { | 2123 | * We only want to steal up to the prescribed number of tasks |
2124 | * and the prescribed amount of weighted load. | ||
2125 | */ | ||
2126 | if (pulled < max_nr_move && rem_load_move > 0) { | ||
2127 | if (idx < this_best_prio) | ||
2128 | this_best_prio = idx; | ||
1927 | if (curr != head) | 2129 | if (curr != head) |
1928 | goto skip_queue; | 2130 | goto skip_queue; |
1929 | idx++; | 2131 | idx++; |
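move_tasks() is now bounded both by a task count and by a weighted-load budget, and a candidate heavier than the remaining budget is normally skipped (unless it would become the top-priority task on the destination, as the comment in the hunk explains). The loop below shows only the two-budget bookkeeping; the candidate list, the weights and the omission of the priority override are all simplifications of the sketch.

/* Dual-cap sketch: stop when either the task budget or the weighted-load
 * budget runs out, and skip tasks heavier than what is left. */
#include <stdio.h>

struct task_sketch {
        const char *name;
        unsigned long load_weight;
};

static int move_tasks_sketch(struct task_sketch *cand, int ncand,
                             unsigned long max_nr_move,
                             unsigned long max_load_move)
{
        long rem_load_move = (long)max_load_move;
        unsigned long pulled = 0;
        int i;

        for (i = 0; i < ncand && pulled < max_nr_move && rem_load_move > 0; i++) {
                /* skip_for_load: this task alone exceeds the remaining budget */
                if (cand[i].load_weight > (unsigned long)rem_load_move)
                        continue;
                rem_load_move -= cand[i].load_weight;
                pulled++;
                printf("pulled %s (weight %lu), load budget left %ld\n",
                       cand[i].name, cand[i].load_weight, rem_load_move);
        }
        return (int)pulled;
}

int main(void)
{
        struct task_sketch cand[] = {
                { "nice-19-ish", 16 }, { "nice0-a", 128 },
                { "nice0-b", 128 },    { "mid-weight", 48 },
        };

        /* allow at most 3 tasks and 200 units of weighted load */
        printf("moved %d tasks\n", move_tasks_sketch(cand, 4, 3, 200));
        return 0;
}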
@@ -1944,7 +2146,7 @@ out: | |||
1944 | 2146 | ||
1945 | /* | 2147 | /* |
1946 | * find_busiest_group finds and returns the busiest CPU group within the | 2148 | * find_busiest_group finds and returns the busiest CPU group within the |
1947 | * domain. It calculates and returns the number of tasks which should be | 2149 | * domain. It calculates and returns the amount of weighted load which should be |
1948 | * moved to restore balance via the imbalance parameter. | 2150 | * moved to restore balance via the imbalance parameter. |
1949 | */ | 2151 | */ |
1950 | static struct sched_group * | 2152 | static struct sched_group * |
@@ -1954,9 +2156,19 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
1954 | struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; | 2156 | struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; |
1955 | unsigned long max_load, avg_load, total_load, this_load, total_pwr; | 2157 | unsigned long max_load, avg_load, total_load, this_load, total_pwr; |
1956 | unsigned long max_pull; | 2158 | unsigned long max_pull; |
2159 | unsigned long busiest_load_per_task, busiest_nr_running; | ||
2160 | unsigned long this_load_per_task, this_nr_running; | ||
1957 | int load_idx; | 2161 | int load_idx; |
2162 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) | ||
2163 | int power_savings_balance = 1; | ||
2164 | unsigned long leader_nr_running = 0, min_load_per_task = 0; | ||
2165 | unsigned long min_nr_running = ULONG_MAX; | ||
2166 | struct sched_group *group_min = NULL, *group_leader = NULL; | ||
2167 | #endif | ||
1958 | 2168 | ||
1959 | max_load = this_load = total_load = total_pwr = 0; | 2169 | max_load = this_load = total_load = total_pwr = 0; |
2170 | busiest_load_per_task = busiest_nr_running = 0; | ||
2171 | this_load_per_task = this_nr_running = 0; | ||
1960 | if (idle == NOT_IDLE) | 2172 | if (idle == NOT_IDLE) |
1961 | load_idx = sd->busy_idx; | 2173 | load_idx = sd->busy_idx; |
1962 | else if (idle == NEWLY_IDLE) | 2174 | else if (idle == NEWLY_IDLE) |
@@ -1965,16 +2177,19 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
1965 | load_idx = sd->idle_idx; | 2177 | load_idx = sd->idle_idx; |
1966 | 2178 | ||
1967 | do { | 2179 | do { |
1968 | unsigned long load; | 2180 | unsigned long load, group_capacity; |
1969 | int local_group; | 2181 | int local_group; |
1970 | int i; | 2182 | int i; |
2183 | unsigned long sum_nr_running, sum_weighted_load; | ||
1971 | 2184 | ||
1972 | local_group = cpu_isset(this_cpu, group->cpumask); | 2185 | local_group = cpu_isset(this_cpu, group->cpumask); |
1973 | 2186 | ||
1974 | /* Tally up the load of all CPUs in the group */ | 2187 | /* Tally up the load of all CPUs in the group */ |
1975 | avg_load = 0; | 2188 | sum_weighted_load = sum_nr_running = avg_load = 0; |
1976 | 2189 | ||
1977 | for_each_cpu_mask(i, group->cpumask) { | 2190 | for_each_cpu_mask(i, group->cpumask) { |
2191 | runqueue_t *rq = cpu_rq(i); | ||
2192 | |||
1978 | if (*sd_idle && !idle_cpu(i)) | 2193 | if (*sd_idle && !idle_cpu(i)) |
1979 | *sd_idle = 0; | 2194 | *sd_idle = 0; |
1980 | 2195 | ||
@@ -1985,6 +2200,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
1985 | load = source_load(i, load_idx); | 2200 | load = source_load(i, load_idx); |
1986 | 2201 | ||
1987 | avg_load += load; | 2202 | avg_load += load; |
2203 | sum_nr_running += rq->nr_running; | ||
2204 | sum_weighted_load += rq->raw_weighted_load; | ||
1988 | } | 2205 | } |
1989 | 2206 | ||
1990 | total_load += avg_load; | 2207 | total_load += avg_load; |
@@ -1993,17 +2210,80 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
1993 | /* Adjust by relative CPU power of the group */ | 2210 | /* Adjust by relative CPU power of the group */ |
1994 | avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power; | 2211 | avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power; |
1995 | 2212 | ||
2213 | group_capacity = group->cpu_power / SCHED_LOAD_SCALE; | ||
2214 | |||
1996 | if (local_group) { | 2215 | if (local_group) { |
1997 | this_load = avg_load; | 2216 | this_load = avg_load; |
1998 | this = group; | 2217 | this = group; |
1999 | } else if (avg_load > max_load) { | 2218 | this_nr_running = sum_nr_running; |
2219 | this_load_per_task = sum_weighted_load; | ||
2220 | } else if (avg_load > max_load && | ||
2221 | sum_nr_running > group_capacity) { | ||
2000 | max_load = avg_load; | 2222 | max_load = avg_load; |
2001 | busiest = group; | 2223 | busiest = group; |
2224 | busiest_nr_running = sum_nr_running; | ||
2225 | busiest_load_per_task = sum_weighted_load; | ||
2002 | } | 2226 | } |
2227 | |||
2228 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) | ||
2229 | /* | ||
2230 | * Busy processors will not participate in power savings | ||
2231 | * balance. | ||
2232 | */ | ||
2233 | if (idle == NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) | ||
2234 | goto group_next; | ||
2235 | |||
2236 | /* | ||
2237 | * If the local group is idle or completely loaded | ||
2238 | * no need to do power savings balance at this domain | ||
2239 | */ | ||
2240 | if (local_group && (this_nr_running >= group_capacity || | ||
2241 | !this_nr_running)) | ||
2242 | power_savings_balance = 0; | ||
2243 | |||
2244 | /* | ||
2245 | * If a group is already running at full capacity or idle, | ||
2246 | * don't include that group in power savings calculations | ||
2247 | */ | ||
2248 | if (!power_savings_balance || sum_nr_running >= group_capacity | ||
2249 | || !sum_nr_running) | ||
2250 | goto group_next; | ||
2251 | |||
2252 | /* | ||
2253 | * Calculate the group which has the least non-idle load. | ||
2254 | * This is the group from which we need to pick up the load | ||
2255 | * for saving power | ||
2256 | */ | ||
2257 | if ((sum_nr_running < min_nr_running) || | ||
2258 | (sum_nr_running == min_nr_running && | ||
2259 | first_cpu(group->cpumask) < | ||
2260 | first_cpu(group_min->cpumask))) { | ||
2261 | group_min = group; | ||
2262 | min_nr_running = sum_nr_running; | ||
2263 | min_load_per_task = sum_weighted_load / | ||
2264 | sum_nr_running; | ||
2265 | } | ||
2266 | |||
2267 | /* | ||
2268 | * Calculate the group which is nearly at its | ||
2269 | * capacity but still has some space to pick up some load | ||
2270 | * from another group and save more power | ||
2271 | */ | ||
2272 | if (sum_nr_running <= group_capacity - 1) | ||
2273 | if (sum_nr_running > leader_nr_running || | ||
2274 | (sum_nr_running == leader_nr_running && | ||
2275 | first_cpu(group->cpumask) > | ||
2276 | first_cpu(group_leader->cpumask))) { | ||
2277 | group_leader = group; | ||
2278 | leader_nr_running = sum_nr_running; | ||
2279 | } | ||
2280 | |||
2281 | group_next: | ||
2282 | #endif | ||
2003 | group = group->next; | 2283 | group = group->next; |
2004 | } while (group != sd->groups); | 2284 | } while (group != sd->groups); |
2005 | 2285 | ||
2006 | if (!busiest || this_load >= max_load || max_load <= SCHED_LOAD_SCALE) | 2286 | if (!busiest || this_load >= max_load || busiest_nr_running == 0) |
2007 | goto out_balanced; | 2287 | goto out_balanced; |
2008 | 2288 | ||
2009 | avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr; | 2289 | avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr; |
@@ -2012,6 +2292,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
2012 | 100*max_load <= sd->imbalance_pct*this_load) | 2292 | 100*max_load <= sd->imbalance_pct*this_load) |
2013 | goto out_balanced; | 2293 | goto out_balanced; |
2014 | 2294 | ||
2295 | busiest_load_per_task /= busiest_nr_running; | ||
2015 | /* | 2296 | /* |
2016 | * We're trying to get all the cpus to the average_load, so we don't | 2297 | * We're trying to get all the cpus to the average_load, so we don't |
2017 | * want to push ourselves above the average load, nor do we wish to | 2298 | * want to push ourselves above the average load, nor do we wish to |
@@ -2023,21 +2304,50 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
2023 | * by pulling tasks to us. Be careful of negative numbers as they'll | 2304 | * by pulling tasks to us. Be careful of negative numbers as they'll |
2024 | * appear as very large values with unsigned longs. | 2305 | * appear as very large values with unsigned longs. |
2025 | */ | 2306 | */ |
2307 | if (max_load <= busiest_load_per_task) | ||
2308 | goto out_balanced; | ||
2309 | |||
2310 | /* | ||
2311 | * In the presence of smp nice balancing, certain scenarios can have | ||
2312 | * max load less than avg load(as we skip the groups at or below | ||
2313 | * its cpu_power, while calculating max_load..) | ||
2314 | */ | ||
2315 | if (max_load < avg_load) { | ||
2316 | *imbalance = 0; | ||
2317 | goto small_imbalance; | ||
2318 | } | ||
2026 | 2319 | ||
2027 | /* Don't want to pull so many tasks that a group would go idle */ | 2320 | /* Don't want to pull so many tasks that a group would go idle */ |
2028 | max_pull = min(max_load - avg_load, max_load - SCHED_LOAD_SCALE); | 2321 | max_pull = min(max_load - avg_load, max_load - busiest_load_per_task); |
2029 | 2322 | ||
2030 | /* How much load to actually move to equalise the imbalance */ | 2323 | /* How much load to actually move to equalise the imbalance */ |
2031 | *imbalance = min(max_pull * busiest->cpu_power, | 2324 | *imbalance = min(max_pull * busiest->cpu_power, |
2032 | (avg_load - this_load) * this->cpu_power) | 2325 | (avg_load - this_load) * this->cpu_power) |
2033 | / SCHED_LOAD_SCALE; | 2326 | / SCHED_LOAD_SCALE; |
2034 | 2327 | ||
2035 | if (*imbalance < SCHED_LOAD_SCALE) { | 2328 | /* |
2036 | unsigned long pwr_now = 0, pwr_move = 0; | 2329 | * if *imbalance is less than the average load per runnable task |
2330 | * there is no guarantee that any tasks will be moved so we'll have | ||
2331 | * to think about bumping its value to force at least one task to be | ||
2332 | * moved | ||
2333 | */ | ||
2334 | if (*imbalance < busiest_load_per_task) { | ||
2335 | unsigned long pwr_now, pwr_move; | ||
2037 | unsigned long tmp; | 2336 | unsigned long tmp; |
2337 | unsigned int imbn; | ||
2338 | |||
2339 | small_imbalance: | ||
2340 | pwr_move = pwr_now = 0; | ||
2341 | imbn = 2; | ||
2342 | if (this_nr_running) { | ||
2343 | this_load_per_task /= this_nr_running; | ||
2344 | if (busiest_load_per_task > this_load_per_task) | ||
2345 | imbn = 1; | ||
2346 | } else | ||
2347 | this_load_per_task = SCHED_LOAD_SCALE; | ||
2038 | 2348 | ||
2039 | if (max_load - this_load >= SCHED_LOAD_SCALE*2) { | 2349 | if (max_load - this_load >= busiest_load_per_task * imbn) { |
2040 | *imbalance = 1; | 2350 | *imbalance = busiest_load_per_task; |
2041 | return busiest; | 2351 | return busiest; |
2042 | } | 2352 | } |
2043 | 2353 | ||
@@ -2047,39 +2357,47 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, | |||
2047 | * moving them. | 2357 | * moving them. |
2048 | */ | 2358 | */ |
2049 | 2359 | ||
2050 | pwr_now += busiest->cpu_power*min(SCHED_LOAD_SCALE, max_load); | 2360 | pwr_now += busiest->cpu_power * |
2051 | pwr_now += this->cpu_power*min(SCHED_LOAD_SCALE, this_load); | 2361 | min(busiest_load_per_task, max_load); |
2362 | pwr_now += this->cpu_power * | ||
2363 | min(this_load_per_task, this_load); | ||
2052 | pwr_now /= SCHED_LOAD_SCALE; | 2364 | pwr_now /= SCHED_LOAD_SCALE; |
2053 | 2365 | ||
2054 | /* Amount of load we'd subtract */ | 2366 | /* Amount of load we'd subtract */ |
2055 | tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/busiest->cpu_power; | 2367 | tmp = busiest_load_per_task*SCHED_LOAD_SCALE/busiest->cpu_power; |
2056 | if (max_load > tmp) | 2368 | if (max_load > tmp) |
2057 | pwr_move += busiest->cpu_power*min(SCHED_LOAD_SCALE, | 2369 | pwr_move += busiest->cpu_power * |
2058 | max_load - tmp); | 2370 | min(busiest_load_per_task, max_load - tmp); |
2059 | 2371 | ||
2060 | /* Amount of load we'd add */ | 2372 | /* Amount of load we'd add */ |
2061 | if (max_load*busiest->cpu_power < | 2373 | if (max_load*busiest->cpu_power < |
2062 | SCHED_LOAD_SCALE*SCHED_LOAD_SCALE) | 2374 | busiest_load_per_task*SCHED_LOAD_SCALE) |
2063 | tmp = max_load*busiest->cpu_power/this->cpu_power; | 2375 | tmp = max_load*busiest->cpu_power/this->cpu_power; |
2064 | else | 2376 | else |
2065 | tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/this->cpu_power; | 2377 | tmp = busiest_load_per_task*SCHED_LOAD_SCALE/this->cpu_power; |
2066 | pwr_move += this->cpu_power*min(SCHED_LOAD_SCALE, this_load + tmp); | 2378 | pwr_move += this->cpu_power*min(this_load_per_task, this_load + tmp); |
2067 | pwr_move /= SCHED_LOAD_SCALE; | 2379 | pwr_move /= SCHED_LOAD_SCALE; |
2068 | 2380 | ||
2069 | /* Move if we gain throughput */ | 2381 | /* Move if we gain throughput */ |
2070 | if (pwr_move <= pwr_now) | 2382 | if (pwr_move <= pwr_now) |
2071 | goto out_balanced; | 2383 | goto out_balanced; |
2072 | 2384 | ||
2073 | *imbalance = 1; | 2385 | *imbalance = busiest_load_per_task; |
2074 | return busiest; | ||
2075 | } | 2386 | } |
2076 | 2387 | ||
2077 | /* Get rid of the scaling factor, rounding down as we divide */ | ||
2078 | *imbalance = *imbalance / SCHED_LOAD_SCALE; | ||
2079 | return busiest; | 2388 | return busiest; |
2080 | 2389 | ||
2081 | out_balanced: | 2390 | out_balanced: |
2391 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) | ||
2392 | if (idle == NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) | ||
2393 | goto ret; | ||
2082 | 2394 | ||
2395 | if (this == group_leader && group_leader != group_min) { | ||
2396 | *imbalance = min_load_per_task; | ||
2397 | return group_min; | ||
2398 | } | ||
2399 | ret: | ||
2400 | #endif | ||
2083 | *imbalance = 0; | 2401 | *imbalance = 0; |
2084 | return NULL; | 2402 | return NULL; |
2085 | } | 2403 | } |
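find_busiest_group() now expresses the imbalance as weighted load: pull at most the busiest group's excess, push no further than the local deficit, and if the result is smaller than the busiest group's load per task fall into the small_imbalance path so at least one task can still move. The arithmetic sketch below condenses that calculation; the inputs are invented, and the real small_imbalance path additionally compares pwr_now against pwr_move before deciding.

/* Weighted-imbalance sketch; assumes max_load > avg_load >= this_load and
 * max_load > busiest_load_per_task so the subtractions cannot underflow. */
#include <stdio.h>

#define SCHED_LOAD_SCALE 128UL   /* illustrative scale factor */

static unsigned long min_ul(unsigned long a, unsigned long b) { return a < b ? a : b; }

static unsigned long compute_imbalance(unsigned long max_load,   /* busiest group */
                                       unsigned long this_load,  /* local group */
                                       unsigned long avg_load,   /* domain average */
                                       unsigned long busiest_power,
                                       unsigned long this_power,
                                       unsigned long busiest_load_per_task)
{
        unsigned long max_pull, imbalance;

        /* don't pull so much that the busiest group drops below one task */
        max_pull = min_ul(max_load - avg_load, max_load - busiest_load_per_task);

        imbalance = min_ul(max_pull * busiest_power,
                           (avg_load - this_load) * this_power) / SCHED_LOAD_SCALE;

        /* small_imbalance: make sure at least one task is worth moving */
        if (imbalance < busiest_load_per_task)
                imbalance = busiest_load_per_task;

        return imbalance;
}

int main(void)
{
        printf("imbalance = %lu\n",
               compute_imbalance(3 * SCHED_LOAD_SCALE, SCHED_LOAD_SCALE,
                                 2 * SCHED_LOAD_SCALE, SCHED_LOAD_SCALE,
                                 SCHED_LOAD_SCALE, SCHED_LOAD_SCALE));
        return 0;
}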
@@ -2088,18 +2406,21 @@ out_balanced: | |||
2088 | * find_busiest_queue - find the busiest runqueue among the cpus in group. | 2406 | * find_busiest_queue - find the busiest runqueue among the cpus in group. |
2089 | */ | 2407 | */ |
2090 | static runqueue_t *find_busiest_queue(struct sched_group *group, | 2408 | static runqueue_t *find_busiest_queue(struct sched_group *group, |
2091 | enum idle_type idle) | 2409 | enum idle_type idle, unsigned long imbalance) |
2092 | { | 2410 | { |
2093 | unsigned long load, max_load = 0; | 2411 | unsigned long max_load = 0; |
2094 | runqueue_t *busiest = NULL; | 2412 | runqueue_t *busiest = NULL, *rqi; |
2095 | int i; | 2413 | int i; |
2096 | 2414 | ||
2097 | for_each_cpu_mask(i, group->cpumask) { | 2415 | for_each_cpu_mask(i, group->cpumask) { |
2098 | load = source_load(i, 0); | 2416 | rqi = cpu_rq(i); |
2099 | 2417 | ||
2100 | if (load > max_load) { | 2418 | if (rqi->nr_running == 1 && rqi->raw_weighted_load > imbalance) |
2101 | max_load = load; | 2419 | continue; |
2102 | busiest = cpu_rq(i); | 2420 | |
2421 | if (rqi->raw_weighted_load > max_load) { | ||
2422 | max_load = rqi->raw_weighted_load; | ||
2423 | busiest = rqi; | ||
2103 | } | 2424 | } |
2104 | } | 2425 | } |
2105 | 2426 | ||
@@ -2112,6 +2433,7 @@ static runqueue_t *find_busiest_queue(struct sched_group *group, | |||
2112 | */ | 2433 | */ |
2113 | #define MAX_PINNED_INTERVAL 512 | 2434 | #define MAX_PINNED_INTERVAL 512 |
2114 | 2435 | ||
2436 | #define minus_1_or_zero(n) ((n) > 0 ? (n) - 1 : 0) | ||
2115 | /* | 2437 | /* |
2116 | * Check this_cpu to ensure it is balanced within domain. Attempt to move | 2438 | * Check this_cpu to ensure it is balanced within domain. Attempt to move |
2117 | * tasks if there is an imbalance. | 2439 | * tasks if there is an imbalance. |
@@ -2128,7 +2450,8 @@ static int load_balance(int this_cpu, runqueue_t *this_rq, | |||
2128 | int active_balance = 0; | 2450 | int active_balance = 0; |
2129 | int sd_idle = 0; | 2451 | int sd_idle = 0; |
2130 | 2452 | ||
2131 | if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER) | 2453 | if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER && |
2454 | !sched_smt_power_savings) | ||
2132 | sd_idle = 1; | 2455 | sd_idle = 1; |
2133 | 2456 | ||
2134 | schedstat_inc(sd, lb_cnt[idle]); | 2457 | schedstat_inc(sd, lb_cnt[idle]); |
@@ -2139,7 +2462,7 @@ static int load_balance(int this_cpu, runqueue_t *this_rq, | |||
2139 | goto out_balanced; | 2462 | goto out_balanced; |
2140 | } | 2463 | } |
2141 | 2464 | ||
2142 | busiest = find_busiest_queue(group, idle); | 2465 | busiest = find_busiest_queue(group, idle, imbalance); |
2143 | if (!busiest) { | 2466 | if (!busiest) { |
2144 | schedstat_inc(sd, lb_nobusyq[idle]); | 2467 | schedstat_inc(sd, lb_nobusyq[idle]); |
2145 | goto out_balanced; | 2468 | goto out_balanced; |
@@ -2159,6 +2482,7 @@ static int load_balance(int this_cpu, runqueue_t *this_rq, | |||
2159 | */ | 2482 | */ |
2160 | double_rq_lock(this_rq, busiest); | 2483 | double_rq_lock(this_rq, busiest); |
2161 | nr_moved = move_tasks(this_rq, this_cpu, busiest, | 2484 | nr_moved = move_tasks(this_rq, this_cpu, busiest, |
2485 | minus_1_or_zero(busiest->nr_running), | ||
2162 | imbalance, sd, idle, &all_pinned); | 2486 | imbalance, sd, idle, &all_pinned); |
2163 | double_rq_unlock(this_rq, busiest); | 2487 | double_rq_unlock(this_rq, busiest); |
2164 | 2488 | ||
@@ -2216,7 +2540,8 @@ static int load_balance(int this_cpu, runqueue_t *this_rq, | |||
2216 | sd->balance_interval *= 2; | 2540 | sd->balance_interval *= 2; |
2217 | } | 2541 | } |
2218 | 2542 | ||
2219 | if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER) | 2543 | if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER && |
2544 | !sched_smt_power_savings) | ||
2220 | return -1; | 2545 | return -1; |
2221 | return nr_moved; | 2546 | return nr_moved; |
2222 | 2547 | ||
@@ -2231,7 +2556,7 @@ out_one_pinned: | |||
2231 | (sd->balance_interval < sd->max_interval)) | 2556 | (sd->balance_interval < sd->max_interval)) |
2232 | sd->balance_interval *= 2; | 2557 | sd->balance_interval *= 2; |
2233 | 2558 | ||
2234 | if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER) | 2559 | if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && !sched_smt_power_savings) |
2235 | return -1; | 2560 | return -1; |
2236 | return 0; | 2561 | return 0; |
2237 | } | 2562 | } |
@@ -2252,7 +2577,7 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq, | |||
2252 | int nr_moved = 0; | 2577 | int nr_moved = 0; |
2253 | int sd_idle = 0; | 2578 | int sd_idle = 0; |
2254 | 2579 | ||
2255 | if (sd->flags & SD_SHARE_CPUPOWER) | 2580 | if (sd->flags & SD_SHARE_CPUPOWER && !sched_smt_power_savings) |
2256 | sd_idle = 1; | 2581 | sd_idle = 1; |
2257 | 2582 | ||
2258 | schedstat_inc(sd, lb_cnt[NEWLY_IDLE]); | 2583 | schedstat_inc(sd, lb_cnt[NEWLY_IDLE]); |
@@ -2262,7 +2587,7 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq, | |||
2262 | goto out_balanced; | 2587 | goto out_balanced; |
2263 | } | 2588 | } |
2264 | 2589 | ||
2265 | busiest = find_busiest_queue(group, NEWLY_IDLE); | 2590 | busiest = find_busiest_queue(group, NEWLY_IDLE, imbalance); |
2266 | if (!busiest) { | 2591 | if (!busiest) { |
2267 | schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]); | 2592 | schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]); |
2268 | goto out_balanced; | 2593 | goto out_balanced; |
@@ -2277,6 +2602,7 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq, | |||
2277 | /* Attempt to move tasks */ | 2602 | /* Attempt to move tasks */ |
2278 | double_lock_balance(this_rq, busiest); | 2603 | double_lock_balance(this_rq, busiest); |
2279 | nr_moved = move_tasks(this_rq, this_cpu, busiest, | 2604 | nr_moved = move_tasks(this_rq, this_cpu, busiest, |
2605 | minus_1_or_zero(busiest->nr_running), | ||
2280 | imbalance, sd, NEWLY_IDLE, NULL); | 2606 | imbalance, sd, NEWLY_IDLE, NULL); |
2281 | spin_unlock(&busiest->lock); | 2607 | spin_unlock(&busiest->lock); |
2282 | } | 2608 | } |
@@ -2292,7 +2618,7 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq, | |||
2292 | 2618 | ||
2293 | out_balanced: | 2619 | out_balanced: |
2294 | schedstat_inc(sd, lb_balanced[NEWLY_IDLE]); | 2620 | schedstat_inc(sd, lb_balanced[NEWLY_IDLE]); |
2295 | if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER) | 2621 | if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && !sched_smt_power_savings) |
2296 | return -1; | 2622 | return -1; |
2297 | sd->nr_balance_failed = 0; | 2623 | sd->nr_balance_failed = 0; |
2298 | return 0; | 2624 | return 0; |
@@ -2347,17 +2673,19 @@ static void active_load_balance(runqueue_t *busiest_rq, int busiest_cpu) | |||
2347 | double_lock_balance(busiest_rq, target_rq); | 2673 | double_lock_balance(busiest_rq, target_rq); |
2348 | 2674 | ||
2349 | /* Search for an sd spanning us and the target CPU. */ | 2675 | /* Search for an sd spanning us and the target CPU. */ |
2350 | for_each_domain(target_cpu, sd) | 2676 | for_each_domain(target_cpu, sd) { |
2351 | if ((sd->flags & SD_LOAD_BALANCE) && | 2677 | if ((sd->flags & SD_LOAD_BALANCE) && |
2352 | cpu_isset(busiest_cpu, sd->span)) | 2678 | cpu_isset(busiest_cpu, sd->span)) |
2353 | break; | 2679 | break; |
2680 | } | ||
2354 | 2681 | ||
2355 | if (unlikely(sd == NULL)) | 2682 | if (unlikely(sd == NULL)) |
2356 | goto out; | 2683 | goto out; |
2357 | 2684 | ||
2358 | schedstat_inc(sd, alb_cnt); | 2685 | schedstat_inc(sd, alb_cnt); |
2359 | 2686 | ||
2360 | if (move_tasks(target_rq, target_cpu, busiest_rq, 1, sd, SCHED_IDLE, NULL)) | 2687 | if (move_tasks(target_rq, target_cpu, busiest_rq, 1, |
2688 | RTPRIO_TO_LOAD_WEIGHT(100), sd, SCHED_IDLE, NULL)) | ||
2361 | schedstat_inc(sd, alb_pushed); | 2689 | schedstat_inc(sd, alb_pushed); |
2362 | else | 2690 | else |
2363 | schedstat_inc(sd, alb_failed); | 2691 | schedstat_inc(sd, alb_failed); |
@@ -2385,7 +2713,7 @@ static void rebalance_tick(int this_cpu, runqueue_t *this_rq, | |||
2385 | struct sched_domain *sd; | 2713 | struct sched_domain *sd; |
2386 | int i; | 2714 | int i; |
2387 | 2715 | ||
2388 | this_load = this_rq->nr_running * SCHED_LOAD_SCALE; | 2716 | this_load = this_rq->raw_weighted_load; |
2389 | /* Update our load */ | 2717 | /* Update our load */ |
2390 | for (i = 0; i < 3; i++) { | 2718 | for (i = 0; i < 3; i++) { |
2391 | unsigned long new_load = this_load; | 2719 | unsigned long new_load = this_load; |
@@ -2686,48 +3014,35 @@ static inline void wakeup_busy_runqueue(runqueue_t *rq) | |||
2686 | resched_task(rq->idle); | 3014 | resched_task(rq->idle); |
2687 | } | 3015 | } |
2688 | 3016 | ||
2689 | static void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq) | 3017 | /* |
3018 | * Called with interrupts disabled and this_rq's runqueue locked. | ||
3019 | */ | ||
3020 | static void wake_sleeping_dependent(int this_cpu) | ||
2690 | { | 3021 | { |
2691 | struct sched_domain *tmp, *sd = NULL; | 3022 | struct sched_domain *tmp, *sd = NULL; |
2692 | cpumask_t sibling_map; | ||
2693 | int i; | 3023 | int i; |
2694 | 3024 | ||
2695 | for_each_domain(this_cpu, tmp) | 3025 | for_each_domain(this_cpu, tmp) { |
2696 | if (tmp->flags & SD_SHARE_CPUPOWER) | 3026 | if (tmp->flags & SD_SHARE_CPUPOWER) { |
2697 | sd = tmp; | 3027 | sd = tmp; |
3028 | break; | ||
3029 | } | ||
3030 | } | ||
2698 | 3031 | ||
2699 | if (!sd) | 3032 | if (!sd) |
2700 | return; | 3033 | return; |
2701 | 3034 | ||
2702 | /* | 3035 | for_each_cpu_mask(i, sd->span) { |
2703 | * Unlock the current runqueue because we have to lock in | ||
2704 | * CPU order to avoid deadlocks. Caller knows that we might | ||
2705 | * unlock. We keep IRQs disabled. | ||
2706 | */ | ||
2707 | spin_unlock(&this_rq->lock); | ||
2708 | |||
2709 | sibling_map = sd->span; | ||
2710 | |||
2711 | for_each_cpu_mask(i, sibling_map) | ||
2712 | spin_lock(&cpu_rq(i)->lock); | ||
2713 | /* | ||
2714 | * We clear this CPU from the mask. This both simplifies the | ||
2715 | * inner loop and keps this_rq locked when we exit: | ||
2716 | */ | ||
2717 | cpu_clear(this_cpu, sibling_map); | ||
2718 | |||
2719 | for_each_cpu_mask(i, sibling_map) { | ||
2720 | runqueue_t *smt_rq = cpu_rq(i); | 3036 | runqueue_t *smt_rq = cpu_rq(i); |
2721 | 3037 | ||
3038 | if (i == this_cpu) | ||
3039 | continue; | ||
3040 | if (unlikely(!spin_trylock(&smt_rq->lock))) | ||
3041 | continue; | ||
3042 | |||
2722 | wakeup_busy_runqueue(smt_rq); | 3043 | wakeup_busy_runqueue(smt_rq); |
3044 | spin_unlock(&smt_rq->lock); | ||
2723 | } | 3045 | } |
2724 | |||
2725 | for_each_cpu_mask(i, sibling_map) | ||
2726 | spin_unlock(&cpu_rq(i)->lock); | ||
2727 | /* | ||
2728 | * We exit with this_cpu's rq still held and IRQs | ||
2729 | * still disabled: | ||
2730 | */ | ||
2731 | } | 3046 | } |
2732 | 3047 | ||
2733 | /* | 3048 | /* |
@@ -2740,52 +3055,46 @@ static inline unsigned long smt_slice(task_t *p, struct sched_domain *sd) | |||
2740 | return p->time_slice * (100 - sd->per_cpu_gain) / 100; | 3055 | return p->time_slice * (100 - sd->per_cpu_gain) / 100; |
2741 | } | 3056 | } |
2742 | 3057 | ||
2743 | static int dependent_sleeper(int this_cpu, runqueue_t *this_rq) | 3058 | /* |
3059 | * To minimise lock contention and not have to drop this_rq's runqueue lock we only | ||
3060 | * trylock the sibling runqueues and bypass those runqueues if we fail to | ||
3061 | * acquire their lock. As we only trylock, the normal locking order does not | ||
3062 | * need to be obeyed. | ||
3063 | */ | ||
3064 | static int dependent_sleeper(int this_cpu, runqueue_t *this_rq, task_t *p) | ||
2744 | { | 3065 | { |
2745 | struct sched_domain *tmp, *sd = NULL; | 3066 | struct sched_domain *tmp, *sd = NULL; |
2746 | cpumask_t sibling_map; | ||
2747 | prio_array_t *array; | ||
2748 | int ret = 0, i; | 3067 | int ret = 0, i; |
2749 | task_t *p; | ||
2750 | 3068 | ||
2751 | for_each_domain(this_cpu, tmp) | 3069 | /* kernel/rt threads do not participate in dependent sleeping */ |
2752 | if (tmp->flags & SD_SHARE_CPUPOWER) | 3070 | if (!p->mm || rt_task(p)) |
3071 | return 0; | ||
3072 | |||
3073 | for_each_domain(this_cpu, tmp) { | ||
3074 | if (tmp->flags & SD_SHARE_CPUPOWER) { | ||
2753 | sd = tmp; | 3075 | sd = tmp; |
3076 | break; | ||
3077 | } | ||
3078 | } | ||
2754 | 3079 | ||
2755 | if (!sd) | 3080 | if (!sd) |
2756 | return 0; | 3081 | return 0; |
2757 | 3082 | ||
2758 | /* | 3083 | for_each_cpu_mask(i, sd->span) { |
2759 | * The same locking rules and details apply as for | 3084 | runqueue_t *smt_rq; |
2760 | * wake_sleeping_dependent(): | 3085 | task_t *smt_curr; |
2761 | */ | ||
2762 | spin_unlock(&this_rq->lock); | ||
2763 | sibling_map = sd->span; | ||
2764 | for_each_cpu_mask(i, sibling_map) | ||
2765 | spin_lock(&cpu_rq(i)->lock); | ||
2766 | cpu_clear(this_cpu, sibling_map); | ||
2767 | 3086 | ||
2768 | /* | 3087 | if (i == this_cpu) |
2769 | * Establish next task to be run - it might have gone away because | 3088 | continue; |
2770 | * we released the runqueue lock above: | ||
2771 | */ | ||
2772 | if (!this_rq->nr_running) | ||
2773 | goto out_unlock; | ||
2774 | array = this_rq->active; | ||
2775 | if (!array->nr_active) | ||
2776 | array = this_rq->expired; | ||
2777 | BUG_ON(!array->nr_active); | ||
2778 | 3089 | ||
2779 | p = list_entry(array->queue[sched_find_first_bit(array->bitmap)].next, | 3090 | smt_rq = cpu_rq(i); |
2780 | task_t, run_list); | 3091 | if (unlikely(!spin_trylock(&smt_rq->lock))) |
3092 | continue; | ||
2781 | 3093 | ||
2782 | for_each_cpu_mask(i, sibling_map) { | 3094 | smt_curr = smt_rq->curr; |
2783 | runqueue_t *smt_rq = cpu_rq(i); | ||
2784 | task_t *smt_curr = smt_rq->curr; | ||
2785 | 3095 | ||
2786 | /* Kernel threads do not participate in dependent sleeping */ | 3096 | if (!smt_curr->mm) |
2787 | if (!p->mm || !smt_curr->mm || rt_task(p)) | 3097 | goto unlock; |
2788 | goto check_smt_task; | ||
2789 | 3098 | ||
2790 | /* | 3099 | /* |
2791 | * If a user task with lower static priority than the | 3100 | * If a user task with lower static priority than the |
@@ -2803,49 +3112,24 @@ static int dependent_sleeper(int this_cpu, runqueue_t *this_rq) | |||
2803 | if ((jiffies % DEF_TIMESLICE) > | 3112 | if ((jiffies % DEF_TIMESLICE) > |
2804 | (sd->per_cpu_gain * DEF_TIMESLICE / 100)) | 3113 | (sd->per_cpu_gain * DEF_TIMESLICE / 100)) |
2805 | ret = 1; | 3114 | ret = 1; |
2806 | } else | 3115 | } else { |
2807 | if (smt_curr->static_prio < p->static_prio && | 3116 | if (smt_curr->static_prio < p->static_prio && |
2808 | !TASK_PREEMPTS_CURR(p, smt_rq) && | 3117 | !TASK_PREEMPTS_CURR(p, smt_rq) && |
2809 | smt_slice(smt_curr, sd) > task_timeslice(p)) | 3118 | smt_slice(smt_curr, sd) > task_timeslice(p)) |
2810 | ret = 1; | 3119 | ret = 1; |
2811 | |||
2812 | check_smt_task: | ||
2813 | if ((!smt_curr->mm && smt_curr != smt_rq->idle) || | ||
2814 | rt_task(smt_curr)) | ||
2815 | continue; | ||
2816 | if (!p->mm) { | ||
2817 | wakeup_busy_runqueue(smt_rq); | ||
2818 | continue; | ||
2819 | } | ||
2820 | |||
2821 | /* | ||
2822 | * Reschedule a lower priority task on the SMT sibling for | ||
2823 | * it to be put to sleep, or wake it up if it has been put to | ||
2824 | * sleep for priority reasons to see if it should run now. | ||
2825 | */ | ||
2826 | if (rt_task(p)) { | ||
2827 | if ((jiffies % DEF_TIMESLICE) > | ||
2828 | (sd->per_cpu_gain * DEF_TIMESLICE / 100)) | ||
2829 | resched_task(smt_curr); | ||
2830 | } else { | ||
2831 | if (TASK_PREEMPTS_CURR(p, smt_rq) && | ||
2832 | smt_slice(p, sd) > task_timeslice(smt_curr)) | ||
2833 | resched_task(smt_curr); | ||
2834 | else | ||
2835 | wakeup_busy_runqueue(smt_rq); | ||
2836 | } | 3120 | } |
3121 | unlock: | ||
3122 | spin_unlock(&smt_rq->lock); | ||
2837 | } | 3123 | } |
2838 | out_unlock: | ||
2839 | for_each_cpu_mask(i, sibling_map) | ||
2840 | spin_unlock(&cpu_rq(i)->lock); | ||
2841 | return ret; | 3124 | return ret; |
2842 | } | 3125 | } |
2843 | #else | 3126 | #else |
2844 | static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq) | 3127 | static inline void wake_sleeping_dependent(int this_cpu) |
2845 | { | 3128 | { |
2846 | } | 3129 | } |
2847 | 3130 | ||
2848 | static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq) | 3131 | static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq, |
3132 | task_t *p) | ||
2849 | { | 3133 | { |
2850 | return 0; | 3134 | return 0; |
2851 | } | 3135 | } |
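Both SMT helpers now keep this_rq locked and merely trylock each sibling runqueue, skipping any sibling whose lock is contended, which removes the old drop-and-relock-in-CPU-order dance. A pthread sketch of that trylock-and-skip pattern; the sibling count and busy flags are invented (link with -lpthread):

/* Keep our own state, opportunistically peek at siblings, skip the ones we
 * cannot lock right now - correctness does not depend on visiting them all. */
#include <pthread.h>
#include <stdio.h>

#define NR_SIBLINGS 4

static pthread_mutex_t sibling_lock[NR_SIBLINGS] = {
        PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER,
        PTHREAD_MUTEX_INITIALIZER, PTHREAD_MUTEX_INITIALIZER,
};
static int sibling_busy[NR_SIBLINGS] = { 0, 1, 0, 1 };

static void wake_sleeping_dependent_sketch(int this_cpu)
{
        int i;

        for (i = 0; i < NR_SIBLINGS; i++) {
                if (i == this_cpu)
                        continue;
                /* contended?  just bypass this sibling on this pass */
                if (pthread_mutex_trylock(&sibling_lock[i]) != 0)
                        continue;
                if (sibling_busy[i])
                        printf("would wake sibling %d\n", i);
                pthread_mutex_unlock(&sibling_lock[i]);
        }
}

int main(void)
{
        wake_sleeping_dependent_sketch(0);
        return 0;
}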
@@ -2967,32 +3251,13 @@ need_resched_nonpreemptible: | |||
2967 | 3251 | ||
2968 | cpu = smp_processor_id(); | 3252 | cpu = smp_processor_id(); |
2969 | if (unlikely(!rq->nr_running)) { | 3253 | if (unlikely(!rq->nr_running)) { |
2970 | go_idle: | ||
2971 | idle_balance(cpu, rq); | 3254 | idle_balance(cpu, rq); |
2972 | if (!rq->nr_running) { | 3255 | if (!rq->nr_running) { |
2973 | next = rq->idle; | 3256 | next = rq->idle; |
2974 | rq->expired_timestamp = 0; | 3257 | rq->expired_timestamp = 0; |
2975 | wake_sleeping_dependent(cpu, rq); | 3258 | wake_sleeping_dependent(cpu); |
2976 | /* | ||
2977 | * wake_sleeping_dependent() might have released | ||
2978 | * the runqueue, so break out if we got new | ||
2979 | * tasks meanwhile: | ||
2980 | */ | ||
2981 | if (!rq->nr_running) | ||
2982 | goto switch_tasks; | ||
2983 | } | ||
2984 | } else { | ||
2985 | if (dependent_sleeper(cpu, rq)) { | ||
2986 | next = rq->idle; | ||
2987 | goto switch_tasks; | 3259 | goto switch_tasks; |
2988 | } | 3260 | } |
2989 | /* | ||
2990 | * dependent_sleeper() releases and reacquires the runqueue | ||
2991 | * lock, hence go into the idle loop if the rq went | ||
2992 | * empty meanwhile: | ||
2993 | */ | ||
2994 | if (unlikely(!rq->nr_running)) | ||
2995 | goto go_idle; | ||
2996 | } | 3261 | } |
2997 | 3262 | ||
2998 | array = rq->active; | 3263 | array = rq->active; |
@@ -3030,6 +3295,8 @@ go_idle: | |||
3030 | } | 3295 | } |
3031 | } | 3296 | } |
3032 | next->sleep_type = SLEEP_NORMAL; | 3297 | next->sleep_type = SLEEP_NORMAL; |
3298 | if (dependent_sleeper(cpu, rq, next)) | ||
3299 | next = rq->idle; | ||
3033 | switch_tasks: | 3300 | switch_tasks: |
3034 | if (next == rq->idle) | 3301 | if (next == rq->idle) |
3035 | schedstat_inc(rq, sched_goidle); | 3302 | schedstat_inc(rq, sched_goidle); |
@@ -3473,12 +3740,65 @@ long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout) | |||
3473 | 3740 | ||
3474 | EXPORT_SYMBOL(sleep_on_timeout); | 3741 | EXPORT_SYMBOL(sleep_on_timeout); |
3475 | 3742 | ||
3743 | #ifdef CONFIG_RT_MUTEXES | ||
3744 | |||
3745 | /* | ||
3746 | * rt_mutex_setprio - set the current priority of a task | ||
3747 | * @p: task | ||
3748 | * @prio: prio value (kernel-internal form) | ||
3749 | * | ||
3750 | * This function changes the 'effective' priority of a task. It does | ||
3751 | * not touch ->normal_prio like __setscheduler(). | ||
3752 | * | ||
3753 | * Used by the rt_mutex code to implement priority inheritance logic. | ||
3754 | */ | ||
3755 | void rt_mutex_setprio(task_t *p, int prio) | ||
3756 | { | ||
3757 | unsigned long flags; | ||
3758 | prio_array_t *array; | ||
3759 | runqueue_t *rq; | ||
3760 | int oldprio; | ||
3761 | |||
3762 | BUG_ON(prio < 0 || prio > MAX_PRIO); | ||
3763 | |||
3764 | rq = task_rq_lock(p, &flags); | ||
3765 | |||
3766 | oldprio = p->prio; | ||
3767 | array = p->array; | ||
3768 | if (array) | ||
3769 | dequeue_task(p, array); | ||
3770 | p->prio = prio; | ||
3771 | |||
3772 | if (array) { | ||
3773 | /* | ||
3774 | * If changing to an RT priority then queue it | ||
3775 | * in the active array! | ||
3776 | */ | ||
3777 | if (rt_task(p)) | ||
3778 | array = rq->active; | ||
3779 | enqueue_task(p, array); | ||
3780 | /* | ||
3781 | * Reschedule if we are currently running on this runqueue and | ||
3782 | * our priority decreased, or if we are not currently running on | ||
3783 | * this runqueue and our priority is higher than the current's | ||
3784 | */ | ||
3785 | if (task_running(rq, p)) { | ||
3786 | if (p->prio > oldprio) | ||
3787 | resched_task(rq->curr); | ||
3788 | } else if (TASK_PREEMPTS_CURR(p, rq)) | ||
3789 | resched_task(rq->curr); | ||
3790 | } | ||
3791 | task_rq_unlock(rq, &flags); | ||
3792 | } | ||
3793 | |||
3794 | #endif | ||
3795 | |||
3476 | void set_user_nice(task_t *p, long nice) | 3796 | void set_user_nice(task_t *p, long nice) |
3477 | { | 3797 | { |
3478 | unsigned long flags; | 3798 | unsigned long flags; |
3479 | prio_array_t *array; | 3799 | prio_array_t *array; |
3480 | runqueue_t *rq; | 3800 | runqueue_t *rq; |
3481 | int old_prio, new_prio, delta; | 3801 | int old_prio, delta; |
3482 | 3802 | ||
3483 | if (TASK_NICE(p) == nice || nice < -20 || nice > 19) | 3803 | if (TASK_NICE(p) == nice || nice < -20 || nice > 19) |
3484 | return; | 3804 | return; |
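rt_mutex_setprio() above changes only the effective priority: the task is dequeued if queued, given the boosted prio while normal_prio is left untouched, requeued (into the active array when it becomes RT), and rescheduled when the boost matters. Below is a minimal user-space model of boosting and un-boosting; the prio numbers follow the usual smaller-is-more-urgent convention, but the data model is the sketch's, not the kernel's.

/* Priority-inheritance boost in miniature: effective prio is the more
 * urgent of the task's own normal_prio and its top waiter's prio. */
#include <stdio.h>

struct task_sketch {
        int normal_prio;   /* prio from policy + nice, never touched by PI */
        int prio;          /* effective prio, may be boosted by a waiter */
};

/* rt_mutex_getprio() analogue */
static int effective_prio(const struct task_sketch *p, int top_waiter_prio)
{
        return top_waiter_prio < p->normal_prio ? top_waiter_prio
                                                : p->normal_prio;
}

static void setprio(struct task_sketch *p, int prio)
{
        /* in the kernel this is where dequeue/enqueue and resched happen */
        p->prio = prio;
}

int main(void)
{
        struct task_sketch owner = { .normal_prio = 120, .prio = 120 };

        setprio(&owner, effective_prio(&owner, 98));   /* RT waiter boosts us */
        printf("boosted prio %d, normal_prio %d\n", owner.prio, owner.normal_prio);

        setprio(&owner, effective_prio(&owner, 139));  /* waiter gone: restore */
        printf("restored prio %d\n", owner.prio);
        return 0;
}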
@@ -3493,22 +3813,25 @@ void set_user_nice(task_t *p, long nice) | |||
3493 | * it won't have any effect on scheduling until the task is | 3813 | * it won't have any effect on scheduling until the task is |
3494 | * not SCHED_NORMAL/SCHED_BATCH: | 3814 | * not SCHED_NORMAL/SCHED_BATCH: |
3495 | */ | 3815 | */ |
3496 | if (rt_task(p)) { | 3816 | if (has_rt_policy(p)) { |
3497 | p->static_prio = NICE_TO_PRIO(nice); | 3817 | p->static_prio = NICE_TO_PRIO(nice); |
3498 | goto out_unlock; | 3818 | goto out_unlock; |
3499 | } | 3819 | } |
3500 | array = p->array; | 3820 | array = p->array; |
3501 | if (array) | 3821 | if (array) { |
3502 | dequeue_task(p, array); | 3822 | dequeue_task(p, array); |
3823 | dec_raw_weighted_load(rq, p); | ||
3824 | } | ||
3503 | 3825 | ||
3504 | old_prio = p->prio; | ||
3505 | new_prio = NICE_TO_PRIO(nice); | ||
3506 | delta = new_prio - old_prio; | ||
3507 | p->static_prio = NICE_TO_PRIO(nice); | 3826 | p->static_prio = NICE_TO_PRIO(nice); |
3508 | p->prio += delta; | 3827 | set_load_weight(p); |
3828 | old_prio = p->prio; | ||
3829 | p->prio = effective_prio(p); | ||
3830 | delta = p->prio - old_prio; | ||
3509 | 3831 | ||
3510 | if (array) { | 3832 | if (array) { |
3511 | enqueue_task(p, array); | 3833 | enqueue_task(p, array); |
3834 | inc_raw_weighted_load(rq, p); | ||
3512 | /* | 3835 | /* |
3513 | * If the task increased its priority or is running and | 3836 | * If the task increased its priority or is running and |
3514 | * lowered its priority, then reschedule its CPU: | 3837 | * lowered its priority, then reschedule its CPU: |
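set_user_nice() now brackets the reweighting with dec_raw_weighted_load()/inc_raw_weighted_load() so the runqueue aggregate always reflects the task's current load_weight. A sketch of that invariant with invented weight values:

/* Whenever a queued task's weight changes, the runqueue aggregate must
 * lose the old weight and gain the new one. */
#include <assert.h>
#include <stdio.h>

struct rq_sketch   { unsigned long raw_weighted_load; };
struct task_sketch { unsigned long load_weight; int queued; };

static void dec_raw_weighted_load(struct rq_sketch *rq, const struct task_sketch *p)
{
        rq->raw_weighted_load -= p->load_weight;
}

static void inc_raw_weighted_load(struct rq_sketch *rq, const struct task_sketch *p)
{
        rq->raw_weighted_load += p->load_weight;
}

/* reweight a (possibly queued) task without corrupting the aggregate */
static void set_weight(struct rq_sketch *rq, struct task_sketch *p,
                       unsigned long new_weight)
{
        if (p->queued)
                dec_raw_weighted_load(rq, p);   /* drop the stale weight */
        p->load_weight = new_weight;            /* set_load_weight() analogue */
        if (p->queued)
                inc_raw_weighted_load(rq, p);   /* add the fresh weight */
}

int main(void)
{
        struct rq_sketch rq = { 128 + 16 };
        struct task_sketch p = { .load_weight = 16, .queued = 1 };

        set_weight(&rq, &p, 128);               /* e.g. renice from +19 to 0 */
        assert(rq.raw_weighted_load == 128 + 128);
        printf("rq weight now %lu\n", rq.raw_weighted_load);
        return 0;
}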
@@ -3519,7 +3842,6 @@ void set_user_nice(task_t *p, long nice) | |||
3519 | out_unlock: | 3842 | out_unlock: |
3520 | task_rq_unlock(rq, &flags); | 3843 | task_rq_unlock(rq, &flags); |
3521 | } | 3844 | } |
3522 | |||
3523 | EXPORT_SYMBOL(set_user_nice); | 3845 | EXPORT_SYMBOL(set_user_nice); |
3524 | 3846 | ||
3525 | /* | 3847 | /* |
@@ -3634,16 +3956,15 @@ static void __setscheduler(struct task_struct *p, int policy, int prio) | |||
3634 | BUG_ON(p->array); | 3956 | BUG_ON(p->array); |
3635 | p->policy = policy; | 3957 | p->policy = policy; |
3636 | p->rt_priority = prio; | 3958 | p->rt_priority = prio; |
3637 | if (policy != SCHED_NORMAL && policy != SCHED_BATCH) { | 3959 | p->normal_prio = normal_prio(p); |
3638 | p->prio = MAX_RT_PRIO-1 - p->rt_priority; | 3960 | /* we are holding p->pi_lock already */ |
3639 | } else { | 3961 | p->prio = rt_mutex_getprio(p); |
3640 | p->prio = p->static_prio; | 3962 | /* |
3641 | /* | 3963 | * SCHED_BATCH tasks are treated as perpetual CPU hogs: |
3642 | * SCHED_BATCH tasks are treated as perpetual CPU hogs: | 3964 | */ |
3643 | */ | 3965 | if (policy == SCHED_BATCH) |
3644 | if (policy == SCHED_BATCH) | 3966 | p->sleep_avg = 0; |
3645 | p->sleep_avg = 0; | 3967 | set_load_weight(p); |
3646 | } | ||
3647 | } | 3968 | } |
3648 | 3969 | ||
3649 | /** | 3970 | /** |
@@ -3662,6 +3983,8 @@ int sched_setscheduler(struct task_struct *p, int policy, | |||
3662 | unsigned long flags; | 3983 | unsigned long flags; |
3663 | runqueue_t *rq; | 3984 | runqueue_t *rq; |
3664 | 3985 | ||
3986 | /* may grab non-irq protected spin_locks */ | ||
3987 | BUG_ON(in_interrupt()); | ||
3665 | recheck: | 3988 | recheck: |
3666 | /* double check policy once rq lock held */ | 3989 | /* double check policy once rq lock held */ |
3667 | if (policy < 0) | 3990 | if (policy < 0) |
@@ -3710,14 +4033,20 @@ recheck: | |||
3710 | if (retval) | 4033 | if (retval) |
3711 | return retval; | 4034 | return retval; |
3712 | /* | 4035 | /* |
4036 | * make sure no PI-waiters arrive (or leave) while we are | ||
4037 | * changing the priority of the task: | ||
4038 | */ | ||
4039 | spin_lock_irqsave(&p->pi_lock, flags); | ||
4040 | /* | ||
3713 | * To be able to change p->policy safely, the appropriate | 4041 | * To be able to change p->policy safely, the appropriate |
3714 | * runqueue lock must be held. | 4042 | * runqueue lock must be held. |
3715 | */ | 4043 | */ |
3716 | rq = task_rq_lock(p, &flags); | 4044 | rq = __task_rq_lock(p); |
3717 | /* recheck policy now with rq lock held */ | 4045 | /* recheck policy now with rq lock held */ |
3718 | if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { | 4046 | if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { |
3719 | policy = oldpolicy = -1; | 4047 | policy = oldpolicy = -1; |
3720 | task_rq_unlock(rq, &flags); | 4048 | __task_rq_unlock(rq); |
4049 | spin_unlock_irqrestore(&p->pi_lock, flags); | ||
3721 | goto recheck; | 4050 | goto recheck; |
3722 | } | 4051 | } |
3723 | array = p->array; | 4052 | array = p->array; |
@@ -3738,7 +4067,11 @@ recheck: | |||
3738 | } else if (TASK_PREEMPTS_CURR(p, rq)) | 4067 | } else if (TASK_PREEMPTS_CURR(p, rq)) |
3739 | resched_task(rq->curr); | 4068 | resched_task(rq->curr); |
3740 | } | 4069 | } |
3741 | task_rq_unlock(rq, &flags); | 4070 | __task_rq_unlock(rq); |
4071 | spin_unlock_irqrestore(&p->pi_lock, flags); | ||
4072 | |||
4073 | rt_mutex_adjust_pi(p); | ||
4074 | |||
3742 | return 0; | 4075 | return 0; |
3743 | } | 4076 | } |
3744 | EXPORT_SYMBOL_GPL(sched_setscheduler); | 4077 | EXPORT_SYMBOL_GPL(sched_setscheduler); |
@@ -3760,8 +4093,10 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) | |||
3760 | read_unlock_irq(&tasklist_lock); | 4093 | read_unlock_irq(&tasklist_lock); |
3761 | return -ESRCH; | 4094 | return -ESRCH; |
3762 | } | 4095 | } |
3763 | retval = sched_setscheduler(p, policy, &lparam); | 4096 | get_task_struct(p); |
3764 | read_unlock_irq(&tasklist_lock); | 4097 | read_unlock_irq(&tasklist_lock); |
4098 | retval = sched_setscheduler(p, policy, &lparam); | ||
4099 | put_task_struct(p); | ||
3765 | return retval; | 4100 | return retval; |
3766 | } | 4101 | } |
3767 | 4102 | ||
@@ -4247,7 +4582,7 @@ long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval) | |||
4247 | if (retval) | 4582 | if (retval) |
4248 | goto out_unlock; | 4583 | goto out_unlock; |
4249 | 4584 | ||
4250 | jiffies_to_timespec(p->policy & SCHED_FIFO ? | 4585 | jiffies_to_timespec(p->policy == SCHED_FIFO ? |
4251 | 0 : task_timeslice(p), &t); | 4586 | 0 : task_timeslice(p), &t); |
4252 | read_unlock(&tasklist_lock); | 4587 | read_unlock(&tasklist_lock); |
4253 | retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; | 4588 | retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; |
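The one-line fix above replaces a bitwise test of p->policy with an equality test. With the conventional policy values (SCHED_NORMAL 0, SCHED_FIFO 1, SCHED_RR 2, SCHED_BATCH 3, quoted here from memory), "policy & SCHED_FIFO" is also true for SCHED_BATCH, so sys_sched_rr_get_interval() would have reported a zero timeslice for batch tasks. A tiny demonstration:

/* Scheduling policies are small enumerated values, not flag bits, so a
 * bitwise test matches unrelated policies that happen to share bit 0. */
#include <stdio.h>

#define SCHED_NORMAL 0
#define SCHED_FIFO   1
#define SCHED_RR     2
#define SCHED_BATCH  3

int main(void)
{
        int policy;

        for (policy = SCHED_NORMAL; policy <= SCHED_BATCH; policy++)
                printf("policy %d: '& SCHED_FIFO' says %d, '== SCHED_FIFO' says %d\n",
                       policy, (policy & SCHED_FIFO) != 0, policy == SCHED_FIFO);
        /* policy 3 (SCHED_BATCH) is the case the '&' test got wrong */
        return 0;
}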
@@ -4373,7 +4708,7 @@ void __devinit init_idle(task_t *idle, int cpu) | |||
4373 | idle->timestamp = sched_clock(); | 4708 | idle->timestamp = sched_clock(); |
4374 | idle->sleep_avg = 0; | 4709 | idle->sleep_avg = 0; |
4375 | idle->array = NULL; | 4710 | idle->array = NULL; |
4376 | idle->prio = MAX_PRIO; | 4711 | idle->prio = idle->normal_prio = MAX_PRIO; |
4377 | idle->state = TASK_RUNNING; | 4712 | idle->state = TASK_RUNNING; |
4378 | idle->cpus_allowed = cpumask_of_cpu(cpu); | 4713 | idle->cpus_allowed = cpumask_of_cpu(cpu); |
4379 | set_task_cpu(idle, cpu); | 4714 | set_task_cpu(idle, cpu); |
@@ -4469,13 +4804,16 @@ EXPORT_SYMBOL_GPL(set_cpus_allowed); | |||
4469 | * | 4804 | * |
4470 | * So we race with normal scheduler movements, but that's OK, as long | 4805 | * So we race with normal scheduler movements, but that's OK, as long |
4471 | * as the task is no longer on this CPU. | 4806 | * as the task is no longer on this CPU. |
4807 | * | ||
4808 | * Returns non-zero if task was successfully migrated. | ||
4472 | */ | 4809 | */ |
4473 | static void __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) | 4810 | static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) |
4474 | { | 4811 | { |
4475 | runqueue_t *rq_dest, *rq_src; | 4812 | runqueue_t *rq_dest, *rq_src; |
4813 | int ret = 0; | ||
4476 | 4814 | ||
4477 | if (unlikely(cpu_is_offline(dest_cpu))) | 4815 | if (unlikely(cpu_is_offline(dest_cpu))) |
4478 | return; | 4816 | return ret; |
4479 | 4817 | ||
4480 | rq_src = cpu_rq(src_cpu); | 4818 | rq_src = cpu_rq(src_cpu); |
4481 | rq_dest = cpu_rq(dest_cpu); | 4819 | rq_dest = cpu_rq(dest_cpu); |
@@ -4503,9 +4841,10 @@ static void __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) | |||
4503 | if (TASK_PREEMPTS_CURR(p, rq_dest)) | 4841 | if (TASK_PREEMPTS_CURR(p, rq_dest)) |
4504 | resched_task(rq_dest->curr); | 4842 | resched_task(rq_dest->curr); |
4505 | } | 4843 | } |
4506 | 4844 | ret = 1; | |
4507 | out: | 4845 | out: |
4508 | double_rq_unlock(rq_src, rq_dest); | 4846 | double_rq_unlock(rq_src, rq_dest); |
4847 | return ret; | ||
4509 | } | 4848 | } |
4510 | 4849 | ||
4511 | /* | 4850 | /* |
@@ -4575,9 +4914,12 @@ wait_to_die: | |||
4575 | /* Figure out where task on dead CPU should go, use force if necessary. */ | 4914 | /* Figure out where task on dead CPU should go, use force if necessary. */ |
4576 | static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *tsk) | 4915 | static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *tsk) |
4577 | { | 4916 | { |
4917 | runqueue_t *rq; | ||
4918 | unsigned long flags; | ||
4578 | int dest_cpu; | 4919 | int dest_cpu; |
4579 | cpumask_t mask; | 4920 | cpumask_t mask; |
4580 | 4921 | ||
4922 | restart: | ||
4581 | /* On same node? */ | 4923 | /* On same node? */ |
4582 | mask = node_to_cpumask(cpu_to_node(dead_cpu)); | 4924 | mask = node_to_cpumask(cpu_to_node(dead_cpu)); |
4583 | cpus_and(mask, mask, tsk->cpus_allowed); | 4925 | cpus_and(mask, mask, tsk->cpus_allowed); |
@@ -4589,8 +4931,10 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *tsk) | |||
4589 | 4931 | ||
4590 | /* No more Mr. Nice Guy. */ | 4932 | /* No more Mr. Nice Guy. */ |
4591 | if (dest_cpu == NR_CPUS) { | 4933 | if (dest_cpu == NR_CPUS) { |
4934 | rq = task_rq_lock(tsk, &flags); | ||
4592 | cpus_setall(tsk->cpus_allowed); | 4935 | cpus_setall(tsk->cpus_allowed); |
4593 | dest_cpu = any_online_cpu(tsk->cpus_allowed); | 4936 | dest_cpu = any_online_cpu(tsk->cpus_allowed); |
4937 | task_rq_unlock(rq, &flags); | ||
4594 | 4938 | ||
4595 | /* | 4939 | /* |
4596 | * Don't tell them about moving exiting tasks or | 4940 | * Don't tell them about moving exiting tasks or |
@@ -4602,7 +4946,8 @@ static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *tsk) | |||
4602 | "longer affine to cpu%d\n", | 4946 | "longer affine to cpu%d\n", |
4603 | tsk->pid, tsk->comm, dead_cpu); | 4947 | tsk->pid, tsk->comm, dead_cpu); |
4604 | } | 4948 | } |
4605 | __migrate_task(tsk, dead_cpu, dest_cpu); | 4949 | if (!__migrate_task(tsk, dead_cpu, dest_cpu)) |
4950 | goto restart; | ||
4606 | } | 4951 | } |
4607 | 4952 | ||
4608 | /* | 4953 | /* |
@@ -4729,8 +5074,9 @@ static void migrate_dead_tasks(unsigned int dead_cpu) | |||
4729 | * migration_call - callback that gets triggered when a CPU is added. | 5074 | * migration_call - callback that gets triggered when a CPU is added. |
4730 | * Here we can start up the necessary migration thread for the new CPU. | 5075 | * Here we can start up the necessary migration thread for the new CPU. |
4731 | */ | 5076 | */ |
4732 | static int migration_call(struct notifier_block *nfb, unsigned long action, | 5077 | static int __cpuinit migration_call(struct notifier_block *nfb, |
4733 | void *hcpu) | 5078 | unsigned long action, |
5079 | void *hcpu) | ||
4734 | { | 5080 | { |
4735 | int cpu = (long)hcpu; | 5081 | int cpu = (long)hcpu; |
4736 | struct task_struct *p; | 5082 | struct task_struct *p; |
@@ -4800,7 +5146,7 @@ static int migration_call(struct notifier_block *nfb, unsigned long action, | |||
4800 | /* Register at highest priority so that task migration (migrate_all_tasks) | 5146 | /* Register at highest priority so that task migration (migrate_all_tasks) |
4801 | * happens before everything else. | 5147 | * happens before everything else. |
4802 | */ | 5148 | */ |
4803 | static struct notifier_block migration_notifier = { | 5149 | static struct notifier_block __cpuinitdata migration_notifier = { |
4804 | .notifier_call = migration_call, | 5150 | .notifier_call = migration_call, |
4805 | .priority = 10 | 5151 | .priority = 10 |
4806 | }; | 5152 | }; |
@@ -5601,6 +5947,7 @@ static cpumask_t sched_domain_node_span(int node) | |||
5601 | } | 5947 | } |
5602 | #endif | 5948 | #endif |
5603 | 5949 | ||
5950 | int sched_smt_power_savings = 0, sched_mc_power_savings = 0; | ||
5604 | /* | 5951 | /* |
5605 | * At the moment, CONFIG_SCHED_SMT is never defined, but leave it in so we | 5952 | * At the moment, CONFIG_SCHED_SMT is never defined, but leave it in so we |
5606 | * can switch it on easily if needed. | 5953 | * can switch it on easily if needed. |
@@ -5616,7 +5963,7 @@ static int cpu_to_cpu_group(int cpu) | |||
5616 | 5963 | ||
5617 | #ifdef CONFIG_SCHED_MC | 5964 | #ifdef CONFIG_SCHED_MC |
5618 | static DEFINE_PER_CPU(struct sched_domain, core_domains); | 5965 | static DEFINE_PER_CPU(struct sched_domain, core_domains); |
5619 | static struct sched_group sched_group_core[NR_CPUS]; | 5966 | static struct sched_group *sched_group_core_bycpu[NR_CPUS]; |
5620 | #endif | 5967 | #endif |
5621 | 5968 | ||
5622 | #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) | 5969 | #if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) |
@@ -5632,7 +5979,7 @@ static int cpu_to_core_group(int cpu) | |||
5632 | #endif | 5979 | #endif |
5633 | 5980 | ||
5634 | static DEFINE_PER_CPU(struct sched_domain, phys_domains); | 5981 | static DEFINE_PER_CPU(struct sched_domain, phys_domains); |
5635 | static struct sched_group sched_group_phys[NR_CPUS]; | 5982 | static struct sched_group *sched_group_phys_bycpu[NR_CPUS]; |
5636 | static int cpu_to_phys_group(int cpu) | 5983 | static int cpu_to_phys_group(int cpu) |
5637 | { | 5984 | { |
5638 | #if defined(CONFIG_SCHED_MC) | 5985 | #if defined(CONFIG_SCHED_MC) |
@@ -5689,13 +6036,74 @@ next_sg: | |||
5689 | } | 6036 | } |
5690 | #endif | 6037 | #endif |
5691 | 6038 | ||
6039 | /* Free memory allocated for various sched_group structures */ | ||
6040 | static void free_sched_groups(const cpumask_t *cpu_map) | ||
6041 | { | ||
6042 | int cpu; | ||
6043 | #ifdef CONFIG_NUMA | ||
6044 | int i; | ||
6045 | |||
6046 | for_each_cpu_mask(cpu, *cpu_map) { | ||
6047 | struct sched_group *sched_group_allnodes | ||
6048 | = sched_group_allnodes_bycpu[cpu]; | ||
6049 | struct sched_group **sched_group_nodes | ||
6050 | = sched_group_nodes_bycpu[cpu]; | ||
6051 | |||
6052 | if (sched_group_allnodes) { | ||
6053 | kfree(sched_group_allnodes); | ||
6054 | sched_group_allnodes_bycpu[cpu] = NULL; | ||
6055 | } | ||
6056 | |||
6057 | if (!sched_group_nodes) | ||
6058 | continue; | ||
6059 | |||
6060 | for (i = 0; i < MAX_NUMNODES; i++) { | ||
6061 | cpumask_t nodemask = node_to_cpumask(i); | ||
6062 | struct sched_group *oldsg, *sg = sched_group_nodes[i]; | ||
6063 | |||
6064 | cpus_and(nodemask, nodemask, *cpu_map); | ||
6065 | if (cpus_empty(nodemask)) | ||
6066 | continue; | ||
6067 | |||
6068 | if (sg == NULL) | ||
6069 | continue; | ||
6070 | sg = sg->next; | ||
6071 | next_sg: | ||
6072 | oldsg = sg; | ||
6073 | sg = sg->next; | ||
6074 | kfree(oldsg); | ||
6075 | if (oldsg != sched_group_nodes[i]) | ||
6076 | goto next_sg; | ||
6077 | } | ||
6078 | kfree(sched_group_nodes); | ||
6079 | sched_group_nodes_bycpu[cpu] = NULL; | ||
6080 | } | ||
6081 | #endif | ||
6082 | for_each_cpu_mask(cpu, *cpu_map) { | ||
6083 | if (sched_group_phys_bycpu[cpu]) { | ||
6084 | kfree(sched_group_phys_bycpu[cpu]); | ||
6085 | sched_group_phys_bycpu[cpu] = NULL; | ||
6086 | } | ||
6087 | #ifdef CONFIG_SCHED_MC | ||
6088 | if (sched_group_core_bycpu[cpu]) { | ||
6089 | kfree(sched_group_core_bycpu[cpu]); | ||
6090 | sched_group_core_bycpu[cpu] = NULL; | ||
6091 | } | ||
6092 | #endif | ||
6093 | } | ||
6094 | } | ||
6095 | |||
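The per-node lists freed above are circular and singly linked (each group's ->next eventually wraps back to sched_group_nodes[i]), which is why the goto-based walk stops only once it has freed the head element itself. A minimal userspace sketch of the same pattern, using a hypothetical struct rather than struct sched_group:

#include <stdlib.h>

struct group {
	struct group *next;	/* circular: the last element points back at the head */
};

/* Free every element of a circular, singly linked list, head included;
 * this is the same walk free_sched_groups() does with its next_sg loop. */
static void free_circular(struct group *head)
{
	struct group *cur, *tmp;

	if (!head)
		return;

	cur = head->next;	/* start one past the head */
	do {
		tmp = cur;
		cur = cur->next;
		free(tmp);	/* the last element freed is the head itself */
	} while (tmp != head);
}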
5692 | /* | 6096 | /* |
5693 | * Build sched domains for a given set of cpus and attach the sched domains | 6097 | * Build sched domains for a given set of cpus and attach the sched domains |
5694 | * to the individual cpus | 6098 | * to the individual cpus |
5695 | */ | 6099 | */ |
5696 | void build_sched_domains(const cpumask_t *cpu_map) | 6100 | static int build_sched_domains(const cpumask_t *cpu_map) |
5697 | { | 6101 | { |
5698 | int i; | 6102 | int i; |
6103 | struct sched_group *sched_group_phys = NULL; | ||
6104 | #ifdef CONFIG_SCHED_MC | ||
6105 | struct sched_group *sched_group_core = NULL; | ||
6106 | #endif | ||
5699 | #ifdef CONFIG_NUMA | 6107 | #ifdef CONFIG_NUMA |
5700 | struct sched_group **sched_group_nodes = NULL; | 6108 | struct sched_group **sched_group_nodes = NULL; |
5701 | struct sched_group *sched_group_allnodes = NULL; | 6109 | struct sched_group *sched_group_allnodes = NULL; |
@@ -5703,11 +6111,11 @@ void build_sched_domains(const cpumask_t *cpu_map) | |||
5703 | /* | 6111 | /* |
5704 | * Allocate the per-node list of sched groups | 6112 | * Allocate the per-node list of sched groups |
5705 | */ | 6113 | */ |
5706 | sched_group_nodes = kmalloc(sizeof(struct sched_group*)*MAX_NUMNODES, | 6114 | sched_group_nodes = kzalloc(sizeof(struct sched_group*)*MAX_NUMNODES, |
5707 | GFP_ATOMIC); | 6115 | GFP_KERNEL); |
5708 | if (!sched_group_nodes) { | 6116 | if (!sched_group_nodes) { |
5709 | printk(KERN_WARNING "Can not alloc sched group node list\n"); | 6117 | printk(KERN_WARNING "Can not alloc sched group node list\n"); |
5710 | return; | 6118 | return -ENOMEM; |
5711 | } | 6119 | } |
5712 | sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes; | 6120 | sched_group_nodes_bycpu[first_cpu(*cpu_map)] = sched_group_nodes; |
5713 | #endif | 6121 | #endif |
@@ -5733,7 +6141,7 @@ void build_sched_domains(const cpumask_t *cpu_map) | |||
5733 | if (!sched_group_allnodes) { | 6141 | if (!sched_group_allnodes) { |
5734 | printk(KERN_WARNING | 6142 | printk(KERN_WARNING |
5735 | "Can not alloc allnodes sched group\n"); | 6143 | "Can not alloc allnodes sched group\n"); |
5736 | break; | 6144 | goto error; |
5737 | } | 6145 | } |
5738 | sched_group_allnodes_bycpu[i] | 6146 | sched_group_allnodes_bycpu[i] |
5739 | = sched_group_allnodes; | 6147 | = sched_group_allnodes; |
@@ -5754,6 +6162,18 @@ void build_sched_domains(const cpumask_t *cpu_map) | |||
5754 | cpus_and(sd->span, sd->span, *cpu_map); | 6162 | cpus_and(sd->span, sd->span, *cpu_map); |
5755 | #endif | 6163 | #endif |
5756 | 6164 | ||
6165 | if (!sched_group_phys) { | ||
6166 | sched_group_phys | ||
6167 | = kmalloc(sizeof(struct sched_group) * NR_CPUS, | ||
6168 | GFP_KERNEL); | ||
6169 | if (!sched_group_phys) { | ||
6170 | printk(KERN_WARNING "Can not alloc phys sched " | ||
6171 | "group\n"); | ||
6172 | goto error; | ||
6173 | } | ||
6174 | sched_group_phys_bycpu[i] = sched_group_phys; | ||
6175 | } | ||
6176 | |||
5757 | p = sd; | 6177 | p = sd; |
5758 | sd = &per_cpu(phys_domains, i); | 6178 | sd = &per_cpu(phys_domains, i); |
5759 | group = cpu_to_phys_group(i); | 6179 | group = cpu_to_phys_group(i); |
@@ -5763,6 +6183,18 @@ void build_sched_domains(const cpumask_t *cpu_map) | |||
5763 | sd->groups = &sched_group_phys[group]; | 6183 | sd->groups = &sched_group_phys[group]; |
5764 | 6184 | ||
5765 | #ifdef CONFIG_SCHED_MC | 6185 | #ifdef CONFIG_SCHED_MC |
6186 | if (!sched_group_core) { | ||
6187 | sched_group_core | ||
6188 | = kmalloc(sizeof(struct sched_group) * NR_CPUS, | ||
6189 | GFP_KERNEL); | ||
6190 | if (!sched_group_core) { | ||
6191 | printk(KERN_WARNING "Can not alloc core sched " | ||
6192 | "group\n"); | ||
6193 | goto error; | ||
6194 | } | ||
6195 | sched_group_core_bycpu[i] = sched_group_core; | ||
6196 | } | ||
6197 | |||
5766 | p = sd; | 6198 | p = sd; |
5767 | sd = &per_cpu(core_domains, i); | 6199 | sd = &per_cpu(core_domains, i); |
5768 | group = cpu_to_core_group(i); | 6200 | group = cpu_to_core_group(i); |
@@ -5846,24 +6278,21 @@ void build_sched_domains(const cpumask_t *cpu_map) | |||
5846 | domainspan = sched_domain_node_span(i); | 6278 | domainspan = sched_domain_node_span(i); |
5847 | cpus_and(domainspan, domainspan, *cpu_map); | 6279 | cpus_and(domainspan, domainspan, *cpu_map); |
5848 | 6280 | ||
5849 | sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL); | 6281 | sg = kmalloc_node(sizeof(struct sched_group), GFP_KERNEL, i); |
6282 | if (!sg) { | ||
6283 | printk(KERN_WARNING "Can not alloc domain group for " | ||
6284 | "node %d\n", i); | ||
6285 | goto error; | ||
6286 | } | ||
5850 | sched_group_nodes[i] = sg; | 6287 | sched_group_nodes[i] = sg; |
5851 | for_each_cpu_mask(j, nodemask) { | 6288 | for_each_cpu_mask(j, nodemask) { |
5852 | struct sched_domain *sd; | 6289 | struct sched_domain *sd; |
5853 | sd = &per_cpu(node_domains, j); | 6290 | sd = &per_cpu(node_domains, j); |
5854 | sd->groups = sg; | 6291 | sd->groups = sg; |
5855 | if (sd->groups == NULL) { | ||
5856 | /* Turn off balancing if we have no groups */ | ||
5857 | sd->flags = 0; | ||
5858 | } | ||
5859 | } | ||
5860 | if (!sg) { | ||
5861 | printk(KERN_WARNING | ||
5862 | "Can not alloc domain group for node %d\n", i); | ||
5863 | continue; | ||
5864 | } | 6292 | } |
5865 | sg->cpu_power = 0; | 6293 | sg->cpu_power = 0; |
5866 | sg->cpumask = nodemask; | 6294 | sg->cpumask = nodemask; |
6295 | sg->next = sg; | ||
5867 | cpus_or(covered, covered, nodemask); | 6296 | cpus_or(covered, covered, nodemask); |
5868 | prev = sg; | 6297 | prev = sg; |
5869 | 6298 | ||
@@ -5882,54 +6311,90 @@ void build_sched_domains(const cpumask_t *cpu_map) | |||
5882 | if (cpus_empty(tmp)) | 6311 | if (cpus_empty(tmp)) |
5883 | continue; | 6312 | continue; |
5884 | 6313 | ||
5885 | sg = kmalloc(sizeof(struct sched_group), GFP_KERNEL); | 6314 | sg = kmalloc_node(sizeof(struct sched_group), |
6315 | GFP_KERNEL, i); | ||
5886 | if (!sg) { | 6316 | if (!sg) { |
5887 | printk(KERN_WARNING | 6317 | printk(KERN_WARNING |
5888 | "Can not alloc domain group for node %d\n", j); | 6318 | "Can not alloc domain group for node %d\n", j); |
5889 | break; | 6319 | goto error; |
5890 | } | 6320 | } |
5891 | sg->cpu_power = 0; | 6321 | sg->cpu_power = 0; |
5892 | sg->cpumask = tmp; | 6322 | sg->cpumask = tmp; |
6323 | sg->next = prev->next; | ||
5893 | cpus_or(covered, covered, tmp); | 6324 | cpus_or(covered, covered, tmp); |
5894 | prev->next = sg; | 6325 | prev->next = sg; |
5895 | prev = sg; | 6326 | prev = sg; |
5896 | } | 6327 | } |
5897 | prev->next = sched_group_nodes[i]; | ||
5898 | } | 6328 | } |
5899 | #endif | 6329 | #endif |
5900 | 6330 | ||
5901 | /* Calculate CPU power for physical packages and nodes */ | 6331 | /* Calculate CPU power for physical packages and nodes */ |
6332 | #ifdef CONFIG_SCHED_SMT | ||
5902 | for_each_cpu_mask(i, *cpu_map) { | 6333 | for_each_cpu_mask(i, *cpu_map) { |
5903 | int power; | ||
5904 | struct sched_domain *sd; | 6334 | struct sched_domain *sd; |
5905 | #ifdef CONFIG_SCHED_SMT | ||
5906 | sd = &per_cpu(cpu_domains, i); | 6335 | sd = &per_cpu(cpu_domains, i); |
5907 | power = SCHED_LOAD_SCALE; | 6336 | sd->groups->cpu_power = SCHED_LOAD_SCALE; |
5908 | sd->groups->cpu_power = power; | 6337 | } |
5909 | #endif | 6338 | #endif |
5910 | #ifdef CONFIG_SCHED_MC | 6339 | #ifdef CONFIG_SCHED_MC |
6340 | for_each_cpu_mask(i, *cpu_map) { | ||
6341 | int power; | ||
6342 | struct sched_domain *sd; | ||
5911 | sd = &per_cpu(core_domains, i); | 6343 | sd = &per_cpu(core_domains, i); |
5912 | power = SCHED_LOAD_SCALE + (cpus_weight(sd->groups->cpumask)-1) | 6344 | if (sched_smt_power_savings) |
6345 | power = SCHED_LOAD_SCALE * cpus_weight(sd->groups->cpumask); | ||
6346 | else | ||
6347 | power = SCHED_LOAD_SCALE + (cpus_weight(sd->groups->cpumask)-1) | ||
5913 | * SCHED_LOAD_SCALE / 10; | 6348 | * SCHED_LOAD_SCALE / 10; |
5914 | sd->groups->cpu_power = power; | 6349 | sd->groups->cpu_power = power; |
6350 | } | ||
6351 | #endif | ||
5915 | 6352 | ||
6353 | for_each_cpu_mask(i, *cpu_map) { | ||
6354 | struct sched_domain *sd; | ||
6355 | #ifdef CONFIG_SCHED_MC | ||
5916 | sd = &per_cpu(phys_domains, i); | 6356 | sd = &per_cpu(phys_domains, i); |
6357 | if (i != first_cpu(sd->groups->cpumask)) | ||
6358 | continue; | ||
5917 | 6359 | ||
5918 | /* | 6360 | sd->groups->cpu_power = 0; |
5919 | * This has to be < 2 * SCHED_LOAD_SCALE | 6361 | if (sched_mc_power_savings || sched_smt_power_savings) { |
5920 | * Lets keep it SCHED_LOAD_SCALE, so that | 6362 | int j; |
5921 | * while calculating NUMA group's cpu_power | 6363 | |
5922 | * we can simply do | 6364 | for_each_cpu_mask(j, sd->groups->cpumask) { |
5923 | * numa_group->cpu_power += phys_group->cpu_power; | 6365 | struct sched_domain *sd1; |
5924 | * | 6366 | sd1 = &per_cpu(core_domains, j); |
5925 | * See "only add power once for each physical pkg" | 6367 | /* |
5926 | * comment below | 6368 | * for each core we will add once |
5927 | */ | 6369 | * to the group in physical domain |
5928 | sd->groups->cpu_power = SCHED_LOAD_SCALE; | 6370 | */ |
6371 | if (j != first_cpu(sd1->groups->cpumask)) | ||
6372 | continue; | ||
6373 | |||
6374 | if (sched_smt_power_savings) | ||
6375 | sd->groups->cpu_power += sd1->groups->cpu_power; | ||
6376 | else | ||
6377 | sd->groups->cpu_power += SCHED_LOAD_SCALE; | ||
6378 | } | ||
6379 | } else | ||
6380 | /* | ||
6381 | * This has to be < 2 * SCHED_LOAD_SCALE | ||
6382 | * Lets keep it SCHED_LOAD_SCALE, so that | ||
6383 | * while calculating NUMA group's cpu_power | ||
6384 | * we can simply do | ||
6385 | * numa_group->cpu_power += phys_group->cpu_power; | ||
6386 | * | ||
6387 | * See "only add power once for each physical pkg" | ||
6388 | * comment below | ||
6389 | */ | ||
6390 | sd->groups->cpu_power = SCHED_LOAD_SCALE; | ||
5929 | #else | 6391 | #else |
6392 | int power; | ||
5930 | sd = &per_cpu(phys_domains, i); | 6393 | sd = &per_cpu(phys_domains, i); |
5931 | power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE * | 6394 | if (sched_smt_power_savings) |
5932 | (cpus_weight(sd->groups->cpumask)-1) / 10; | 6395 | power = SCHED_LOAD_SCALE * cpus_weight(sd->groups->cpumask); |
6396 | else | ||
6397 | power = SCHED_LOAD_SCALE; | ||
5933 | sd->groups->cpu_power = power; | 6398 | sd->groups->cpu_power = power; |
5934 | #endif | 6399 | #endif |
5935 | } | 6400 | } |
@@ -5957,13 +6422,20 @@ void build_sched_domains(const cpumask_t *cpu_map) | |||
5957 | * Tune cache-hot values: | 6422 | * Tune cache-hot values: |
5958 | */ | 6423 | */ |
5959 | calibrate_migration_costs(cpu_map); | 6424 | calibrate_migration_costs(cpu_map); |
6425 | |||
6426 | return 0; | ||
6427 | |||
6428 | error: | ||
6429 | free_sched_groups(cpu_map); | ||
6430 | return -ENOMEM; | ||
5960 | } | 6431 | } |
5961 | /* | 6432 | /* |
5962 | * Set up scheduler domains and groups. Callers must hold the hotplug lock. | 6433 | * Set up scheduler domains and groups. Callers must hold the hotplug lock. |
5963 | */ | 6434 | */ |
5964 | static void arch_init_sched_domains(const cpumask_t *cpu_map) | 6435 | static int arch_init_sched_domains(const cpumask_t *cpu_map) |
5965 | { | 6436 | { |
5966 | cpumask_t cpu_default_map; | 6437 | cpumask_t cpu_default_map; |
6438 | int err; | ||
5967 | 6439 | ||
5968 | /* | 6440 | /* |
5969 | * Setup mask for cpus without special case scheduling requirements. | 6441 | * Setup mask for cpus without special case scheduling requirements. |
@@ -5972,51 +6444,14 @@ static void arch_init_sched_domains(const cpumask_t *cpu_map) | |||
5972 | */ | 6444 | */ |
5973 | cpus_andnot(cpu_default_map, *cpu_map, cpu_isolated_map); | 6445 | cpus_andnot(cpu_default_map, *cpu_map, cpu_isolated_map); |
5974 | 6446 | ||
5975 | build_sched_domains(&cpu_default_map); | 6447 | err = build_sched_domains(&cpu_default_map); |
6448 | |||
6449 | return err; | ||
5976 | } | 6450 | } |
5977 | 6451 | ||
5978 | static void arch_destroy_sched_domains(const cpumask_t *cpu_map) | 6452 | static void arch_destroy_sched_domains(const cpumask_t *cpu_map) |
5979 | { | 6453 | { |
5980 | #ifdef CONFIG_NUMA | 6454 | free_sched_groups(cpu_map); |
5981 | int i; | ||
5982 | int cpu; | ||
5983 | |||
5984 | for_each_cpu_mask(cpu, *cpu_map) { | ||
5985 | struct sched_group *sched_group_allnodes | ||
5986 | = sched_group_allnodes_bycpu[cpu]; | ||
5987 | struct sched_group **sched_group_nodes | ||
5988 | = sched_group_nodes_bycpu[cpu]; | ||
5989 | |||
5990 | if (sched_group_allnodes) { | ||
5991 | kfree(sched_group_allnodes); | ||
5992 | sched_group_allnodes_bycpu[cpu] = NULL; | ||
5993 | } | ||
5994 | |||
5995 | if (!sched_group_nodes) | ||
5996 | continue; | ||
5997 | |||
5998 | for (i = 0; i < MAX_NUMNODES; i++) { | ||
5999 | cpumask_t nodemask = node_to_cpumask(i); | ||
6000 | struct sched_group *oldsg, *sg = sched_group_nodes[i]; | ||
6001 | |||
6002 | cpus_and(nodemask, nodemask, *cpu_map); | ||
6003 | if (cpus_empty(nodemask)) | ||
6004 | continue; | ||
6005 | |||
6006 | if (sg == NULL) | ||
6007 | continue; | ||
6008 | sg = sg->next; | ||
6009 | next_sg: | ||
6010 | oldsg = sg; | ||
6011 | sg = sg->next; | ||
6012 | kfree(oldsg); | ||
6013 | if (oldsg != sched_group_nodes[i]) | ||
6014 | goto next_sg; | ||
6015 | } | ||
6016 | kfree(sched_group_nodes); | ||
6017 | sched_group_nodes_bycpu[cpu] = NULL; | ||
6018 | } | ||
6019 | #endif | ||
6020 | } | 6455 | } |
6021 | 6456 | ||
6022 | /* | 6457 | /* |
@@ -6041,9 +6476,10 @@ static void detach_destroy_domains(const cpumask_t *cpu_map) | |||
6041 | * correct sched domains | 6476 | * correct sched domains |
6042 | * Call with hotplug lock held | 6477 | * Call with hotplug lock held |
6043 | */ | 6478 | */ |
6044 | void partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2) | 6479 | int partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2) |
6045 | { | 6480 | { |
6046 | cpumask_t change_map; | 6481 | cpumask_t change_map; |
6482 | int err = 0; | ||
6047 | 6483 | ||
6048 | cpus_and(*partition1, *partition1, cpu_online_map); | 6484 | cpus_and(*partition1, *partition1, cpu_online_map); |
6049 | cpus_and(*partition2, *partition2, cpu_online_map); | 6485 | cpus_and(*partition2, *partition2, cpu_online_map); |
@@ -6052,10 +6488,86 @@ void partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2) | |||
6052 | /* Detach sched domains from all of the affected cpus */ | 6488 | /* Detach sched domains from all of the affected cpus */ |
6053 | detach_destroy_domains(&change_map); | 6489 | detach_destroy_domains(&change_map); |
6054 | if (!cpus_empty(*partition1)) | 6490 | if (!cpus_empty(*partition1)) |
6055 | build_sched_domains(partition1); | 6491 | err = build_sched_domains(partition1); |
6056 | if (!cpus_empty(*partition2)) | 6492 | if (!err && !cpus_empty(*partition2)) |
6057 | build_sched_domains(partition2); | 6493 | err = build_sched_domains(partition2); |
6494 | |||
6495 | return err; | ||
6496 | } | ||
6497 | |||
6498 | #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) | ||
6499 | int arch_reinit_sched_domains(void) | ||
6500 | { | ||
6501 | int err; | ||
6502 | |||
6503 | lock_cpu_hotplug(); | ||
6504 | detach_destroy_domains(&cpu_online_map); | ||
6505 | err = arch_init_sched_domains(&cpu_online_map); | ||
6506 | unlock_cpu_hotplug(); | ||
6507 | |||
6508 | return err; | ||
6509 | } | ||
6510 | |||
6511 | static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) | ||
6512 | { | ||
6513 | int ret; | ||
6514 | |||
6515 | if (buf[0] != '0' && buf[0] != '1') | ||
6516 | return -EINVAL; | ||
6517 | |||
6518 | if (smt) | ||
6519 | sched_smt_power_savings = (buf[0] == '1'); | ||
6520 | else | ||
6521 | sched_mc_power_savings = (buf[0] == '1'); | ||
6522 | |||
6523 | ret = arch_reinit_sched_domains(); | ||
6524 | |||
6525 | return ret ? ret : count; | ||
6526 | } | ||
6527 | |||
6528 | int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls) | ||
6529 | { | ||
6530 | int err = 0; | ||
6531 | #ifdef CONFIG_SCHED_SMT | ||
6532 | if (smt_capable()) | ||
6533 | err = sysfs_create_file(&cls->kset.kobj, | ||
6534 | &attr_sched_smt_power_savings.attr); | ||
6535 | #endif | ||
6536 | #ifdef CONFIG_SCHED_MC | ||
6537 | if (!err && mc_capable()) | ||
6538 | err = sysfs_create_file(&cls->kset.kobj, | ||
6539 | &attr_sched_mc_power_savings.attr); | ||
6540 | #endif | ||
6541 | return err; | ||
6542 | } | ||
6543 | #endif | ||
6544 | |||
6545 | #ifdef CONFIG_SCHED_MC | ||
6546 | static ssize_t sched_mc_power_savings_show(struct sys_device *dev, char *page) | ||
6547 | { | ||
6548 | return sprintf(page, "%u\n", sched_mc_power_savings); | ||
6549 | } | ||
6550 | static ssize_t sched_mc_power_savings_store(struct sys_device *dev, const char *buf, size_t count) | ||
6551 | { | ||
6552 | return sched_power_savings_store(buf, count, 0); | ||
6553 | } | ||
6554 | SYSDEV_ATTR(sched_mc_power_savings, 0644, sched_mc_power_savings_show, | ||
6555 | sched_mc_power_savings_store); | ||
6556 | #endif | ||
6557 | |||
6558 | #ifdef CONFIG_SCHED_SMT | ||
6559 | static ssize_t sched_smt_power_savings_show(struct sys_device *dev, char *page) | ||
6560 | { | ||
6561 | return sprintf(page, "%u\n", sched_smt_power_savings); | ||
6562 | } | ||
6563 | static ssize_t sched_smt_power_savings_store(struct sys_device *dev, const char *buf, size_t count) | ||
6564 | { | ||
6565 | return sched_power_savings_store(buf, count, 1); | ||
6058 | } | 6566 | } |
6567 | SYSDEV_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_show, | ||
6568 | sched_smt_power_savings_store); | ||
6569 | #endif | ||
6570 | |||
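These attributes are created on the sysdev class kobject the architecture passes to sched_create_sysfs_power_savings_entries(); for the cpu class that should be /sys/devices/system/cpu/, though the exact path is an assumption here. A small userspace sketch that enables multi-core power savings (the store handler only accepts a leading '0' or '1', and a successful write rebuilds the domains via arch_reinit_sched_domains()):

#include <stdio.h>

int main(void)
{
	/* assumed location of the attribute registered above */
	const char *path = "/sys/devices/system/cpu/sched_mc_power_savings";
	FILE *f = fopen(path, "w");

	if (!f) {
		perror(path);
		return 1;
	}
	fputs("1\n", f);	/* '1' enables, '0' disables */
	fclose(f);
	return 0;
}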
6059 | 6571 | ||
6060 | #ifdef CONFIG_HOTPLUG_CPU | 6572 | #ifdef CONFIG_HOTPLUG_CPU |
6061 | /* | 6573 | /* |
@@ -6138,7 +6650,6 @@ void __init sched_init(void) | |||
6138 | rq->push_cpu = 0; | 6650 | rq->push_cpu = 0; |
6139 | rq->migration_thread = NULL; | 6651 | rq->migration_thread = NULL; |
6140 | INIT_LIST_HEAD(&rq->migration_queue); | 6652 | INIT_LIST_HEAD(&rq->migration_queue); |
6141 | rq->cpu = i; | ||
6142 | #endif | 6653 | #endif |
6143 | atomic_set(&rq->nr_iowait, 0); | 6654 | atomic_set(&rq->nr_iowait, 0); |
6144 | 6655 | ||
@@ -6153,6 +6664,7 @@ void __init sched_init(void) | |||
6153 | } | 6664 | } |
6154 | } | 6665 | } |
6155 | 6666 | ||
6667 | set_load_weight(&init_task); | ||
6156 | /* | 6668 | /* |
6157 | * The boot idle thread does lazy MMU switching as well: | 6669 | * The boot idle thread does lazy MMU switching as well: |
6158 | */ | 6670 | */ |
@@ -6199,11 +6711,12 @@ void normalize_rt_tasks(void) | |||
6199 | runqueue_t *rq; | 6711 | runqueue_t *rq; |
6200 | 6712 | ||
6201 | read_lock_irq(&tasklist_lock); | 6713 | read_lock_irq(&tasklist_lock); |
6202 | for_each_process (p) { | 6714 | for_each_process(p) { |
6203 | if (!rt_task(p)) | 6715 | if (!rt_task(p)) |
6204 | continue; | 6716 | continue; |
6205 | 6717 | ||
6206 | rq = task_rq_lock(p, &flags); | 6718 | spin_lock_irqsave(&p->pi_lock, flags); |
6719 | rq = __task_rq_lock(p); | ||
6207 | 6720 | ||
6208 | array = p->array; | 6721 | array = p->array; |
6209 | if (array) | 6722 | if (array) |
@@ -6214,7 +6727,8 @@ void normalize_rt_tasks(void) | |||
6214 | resched_task(rq->curr); | 6727 | resched_task(rq->curr); |
6215 | } | 6728 | } |
6216 | 6729 | ||
6217 | task_rq_unlock(rq, &flags); | 6730 | __task_rq_unlock(rq); |
6731 | spin_unlock_irqrestore(&p->pi_lock, flags); | ||
6218 | } | 6732 | } |
6219 | read_unlock_irq(&tasklist_lock); | 6733 | read_unlock_irq(&tasklist_lock); |
6220 | } | 6734 | } |
diff --git a/kernel/signal.c b/kernel/signal.c index 1b3c921737e2..52adf53929f6 100644 --- a/kernel/signal.c +++ b/kernel/signal.c | |||
@@ -1531,6 +1531,35 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, int why) | |||
1531 | spin_unlock_irqrestore(&sighand->siglock, flags); | 1531 | spin_unlock_irqrestore(&sighand->siglock, flags); |
1532 | } | 1532 | } |
1533 | 1533 | ||
1534 | static inline int may_ptrace_stop(void) | ||
1535 | { | ||
1536 | if (!likely(current->ptrace & PT_PTRACED)) | ||
1537 | return 0; | ||
1538 | |||
1539 | if (unlikely(current->parent == current->real_parent && | ||
1540 | (current->ptrace & PT_ATTACHED))) | ||
1541 | return 0; | ||
1542 | |||
1543 | if (unlikely(current->signal == current->parent->signal) && | ||
1544 | unlikely(current->signal->flags & SIGNAL_GROUP_EXIT)) | ||
1545 | return 0; | ||
1546 | |||
1547 | /* | ||
1548 | * Are we in the middle of do_coredump? | ||
1549 | * If so, and our tracer is also part of the coredump, stopping | ||
1550 | * is a deadlock situation and pointless because our tracer | ||
1551 | * is dead, so don't allow us to stop. | ||
1552 | * If SIGKILL was already sent before the caller unlocked | ||
1553 | * ->siglock we must see ->core_waiters != 0. Otherwise it | ||
1554 | * is safe to enter schedule(). | ||
1555 | */ | ||
1556 | if (unlikely(current->mm->core_waiters) && | ||
1557 | unlikely(current->mm == current->parent->mm)) | ||
1558 | return 0; | ||
1559 | |||
1560 | return 1; | ||
1561 | } | ||
1562 | |||
1534 | /* | 1563 | /* |
1535 | * This must be called with current->sighand->siglock held. | 1564 | * This must be called with current->sighand->siglock held. |
1536 | * | 1565 | * |
@@ -1559,11 +1588,7 @@ static void ptrace_stop(int exit_code, int nostop_code, siginfo_t *info) | |||
1559 | spin_unlock_irq(¤t->sighand->siglock); | 1588 | spin_unlock_irq(¤t->sighand->siglock); |
1560 | try_to_freeze(); | 1589 | try_to_freeze(); |
1561 | read_lock(&tasklist_lock); | 1590 | read_lock(&tasklist_lock); |
1562 | if (likely(current->ptrace & PT_PTRACED) && | 1591 | if (may_ptrace_stop()) { |
1563 | likely(current->parent != current->real_parent || | ||
1564 | !(current->ptrace & PT_ATTACHED)) && | ||
1565 | (likely(current->parent->signal != current->signal) || | ||
1566 | !unlikely(current->signal->flags & SIGNAL_GROUP_EXIT))) { | ||
1567 | do_notify_parent_cldstop(current, CLD_TRAPPED); | 1592 | do_notify_parent_cldstop(current, CLD_TRAPPED); |
1568 | read_unlock(&tasklist_lock); | 1593 | read_unlock(&tasklist_lock); |
1569 | schedule(); | 1594 | schedule(); |
diff --git a/kernel/softirq.c b/kernel/softirq.c index 9e2f1c6e73d7..8f03e3b89b55 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c | |||
@@ -446,7 +446,7 @@ static void takeover_tasklets(unsigned int cpu) | |||
446 | } | 446 | } |
447 | #endif /* CONFIG_HOTPLUG_CPU */ | 447 | #endif /* CONFIG_HOTPLUG_CPU */ |
448 | 448 | ||
449 | static int cpu_callback(struct notifier_block *nfb, | 449 | static int __devinit cpu_callback(struct notifier_block *nfb, |
450 | unsigned long action, | 450 | unsigned long action, |
451 | void *hcpu) | 451 | void *hcpu) |
452 | { | 452 | { |
@@ -486,7 +486,7 @@ static int cpu_callback(struct notifier_block *nfb, | |||
486 | return NOTIFY_OK; | 486 | return NOTIFY_OK; |
487 | } | 487 | } |
488 | 488 | ||
489 | static struct notifier_block cpu_nfb = { | 489 | static struct notifier_block __devinitdata cpu_nfb = { |
490 | .notifier_call = cpu_callback | 490 | .notifier_call = cpu_callback |
491 | }; | 491 | }; |
492 | 492 | ||
diff --git a/kernel/softlockup.c b/kernel/softlockup.c index b5c3b94e01ce..6b76caa22981 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c | |||
@@ -104,7 +104,7 @@ static int watchdog(void * __bind_cpu) | |||
104 | /* | 104 | /* |
105 | * Create/destroy watchdog threads as CPUs come and go: | 105 | * Create/destroy watchdog threads as CPUs come and go: |
106 | */ | 106 | */ |
107 | static int | 107 | static int __devinit |
108 | cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) | 108 | cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) |
109 | { | 109 | { |
110 | int hotcpu = (unsigned long)hcpu; | 110 | int hotcpu = (unsigned long)hcpu; |
@@ -142,7 +142,7 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) | |||
142 | return NOTIFY_OK; | 142 | return NOTIFY_OK; |
143 | } | 143 | } |
144 | 144 | ||
145 | static struct notifier_block cpu_nfb = { | 145 | static struct notifier_block __devinitdata cpu_nfb = { |
146 | .notifier_call = cpu_callback | 146 | .notifier_call = cpu_callback |
147 | }; | 147 | }; |
148 | 148 | ||
diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 2c0e65819448..93a2c5398648 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c | |||
@@ -73,6 +73,7 @@ extern int printk_ratelimit_burst; | |||
73 | extern int pid_max_min, pid_max_max; | 73 | extern int pid_max_min, pid_max_max; |
74 | extern int sysctl_drop_caches; | 74 | extern int sysctl_drop_caches; |
75 | extern int percpu_pagelist_fraction; | 75 | extern int percpu_pagelist_fraction; |
76 | extern int compat_log; | ||
76 | 77 | ||
77 | #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) | 78 | #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86) |
78 | int unknown_nmi_panic; | 79 | int unknown_nmi_panic; |
@@ -132,6 +133,10 @@ extern int acct_parm[]; | |||
132 | extern int no_unaligned_warning; | 133 | extern int no_unaligned_warning; |
133 | #endif | 134 | #endif |
134 | 135 | ||
136 | #ifdef CONFIG_RT_MUTEXES | ||
137 | extern int max_lock_depth; | ||
138 | #endif | ||
139 | |||
135 | static int parse_table(int __user *, int, void __user *, size_t __user *, void __user *, size_t, | 140 | static int parse_table(int __user *, int, void __user *, size_t __user *, void __user *, size_t, |
136 | ctl_table *, void **); | 141 | ctl_table *, void **); |
137 | static int proc_doutsstring(ctl_table *table, int write, struct file *filp, | 142 | static int proc_doutsstring(ctl_table *table, int write, struct file *filp, |
@@ -677,6 +682,27 @@ static ctl_table kern_table[] = { | |||
677 | .proc_handler = &proc_dointvec, | 682 | .proc_handler = &proc_dointvec, |
678 | }, | 683 | }, |
679 | #endif | 684 | #endif |
685 | #ifdef CONFIG_COMPAT | ||
686 | { | ||
687 | .ctl_name = KERN_COMPAT_LOG, | ||
688 | .procname = "compat-log", | ||
689 | .data = &compat_log, | ||
690 | .maxlen = sizeof (int), | ||
691 | .mode = 0644, | ||
692 | .proc_handler = &proc_dointvec, | ||
693 | }, | ||
694 | #endif | ||
695 | #ifdef CONFIG_RT_MUTEXES | ||
696 | { | ||
697 | .ctl_name = KERN_MAX_LOCK_DEPTH, | ||
698 | .procname = "max_lock_depth", | ||
699 | .data = &max_lock_depth, | ||
700 | .maxlen = sizeof(int), | ||
701 | .mode = 0644, | ||
702 | .proc_handler = &proc_dointvec, | ||
703 | }, | ||
704 | #endif | ||
705 | |||
680 | { .ctl_name = 0 } | 706 | { .ctl_name = 0 } |
681 | }; | 707 | }; |
682 | 708 | ||
@@ -917,6 +943,18 @@ static ctl_table vm_table[] = { | |||
917 | .strategy = &sysctl_jiffies, | 943 | .strategy = &sysctl_jiffies, |
918 | }, | 944 | }, |
919 | #endif | 945 | #endif |
946 | #ifdef CONFIG_X86_32 | ||
947 | { | ||
948 | .ctl_name = VM_VDSO_ENABLED, | ||
949 | .procname = "vdso_enabled", | ||
950 | .data = &vdso_enabled, | ||
951 | .maxlen = sizeof(vdso_enabled), | ||
952 | .mode = 0644, | ||
953 | .proc_handler = &proc_dointvec, | ||
954 | .strategy = &sysctl_intvec, | ||
955 | .extra1 = &zero, | ||
956 | }, | ||
957 | #endif | ||
920 | { .ctl_name = 0 } | 958 | { .ctl_name = 0 } |
921 | }; | 959 | }; |
922 | 960 | ||
diff --git a/kernel/time.c b/kernel/time.c index b00ddc71cedb..5bd489747643 100644 --- a/kernel/time.c +++ b/kernel/time.c | |||
@@ -523,6 +523,7 @@ EXPORT_SYMBOL(do_gettimeofday); | |||
523 | 523 | ||
524 | 524 | ||
525 | #else | 525 | #else |
526 | #ifndef CONFIG_GENERIC_TIME | ||
526 | /* | 527 | /* |
527 | * Simulate gettimeofday using do_gettimeofday which only allows a timeval | 528 | * Simulate gettimeofday using do_gettimeofday which only allows a timeval |
528 | * and therefore only yields usec accuracy | 529 | * and therefore only yields usec accuracy |
@@ -537,6 +538,7 @@ void getnstimeofday(struct timespec *tv) | |||
537 | } | 538 | } |
538 | EXPORT_SYMBOL_GPL(getnstimeofday); | 539 | EXPORT_SYMBOL_GPL(getnstimeofday); |
539 | #endif | 540 | #endif |
541 | #endif | ||
540 | 542 | ||
541 | /* Converts Gregorian date to seconds since 1970-01-01 00:00:00. | 543 | /* Converts Gregorian date to seconds since 1970-01-01 00:00:00. |
542 | * Assumes input in normal date format, i.e. 1980-12-31 23:59:59 | 544 | * Assumes input in normal date format, i.e. 1980-12-31 23:59:59 |
diff --git a/kernel/time/Makefile b/kernel/time/Makefile new file mode 100644 index 000000000000..e1dfd8e86cce --- /dev/null +++ b/kernel/time/Makefile | |||
@@ -0,0 +1 @@ | |||
1 | obj-y += clocksource.o jiffies.o | ||
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c new file mode 100644 index 000000000000..74eca5939bd9 --- /dev/null +++ b/kernel/time/clocksource.c | |||
@@ -0,0 +1,349 @@ | |||
1 | /* | ||
2 | * linux/kernel/time/clocksource.c | ||
3 | * | ||
4 | * This file contains the functions which manage clocksource drivers. | ||
5 | * | ||
6 | * Copyright (C) 2004, 2005 IBM, John Stultz (johnstul@us.ibm.com) | ||
7 | * | ||
8 | * This program is free software; you can redistribute it and/or modify | ||
9 | * it under the terms of the GNU General Public License as published by | ||
10 | * the Free Software Foundation; either version 2 of the License, or | ||
11 | * (at your option) any later version. | ||
12 | * | ||
13 | * This program is distributed in the hope that it will be useful, | ||
14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
16 | * GNU General Public License for more details. | ||
17 | * | ||
18 | * You should have received a copy of the GNU General Public License | ||
19 | * along with this program; if not, write to the Free Software | ||
20 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | ||
21 | * | ||
22 | * TODO WishList: | ||
23 | * o Allow clocksource drivers to be unregistered | ||
24 | * o get rid of clocksource_jiffies extern | ||
25 | */ | ||
26 | |||
27 | #include <linux/clocksource.h> | ||
28 | #include <linux/sysdev.h> | ||
29 | #include <linux/init.h> | ||
30 | #include <linux/module.h> | ||
31 | |||
32 | /* XXX - Would like a better way for initializing curr_clocksource */ | ||
33 | extern struct clocksource clocksource_jiffies; | ||
34 | |||
35 | /*[Clocksource internal variables]--------- | ||
36 | * curr_clocksource: | ||
37 | * currently selected clocksource. Initialized to clocksource_jiffies. | ||
38 | * next_clocksource: | ||
39 | * pending next selected clocksource. | ||
40 | * clocksource_list: | ||
41 | * linked list with the registered clocksources | ||
42 | * clocksource_lock: | ||
43 | * protects manipulations to curr_clocksource and next_clocksource | ||
44 | * and the clocksource_list | ||
45 | * override_name: | ||
46 | * Name of the user-specified clocksource. | ||
47 | */ | ||
48 | static struct clocksource *curr_clocksource = &clocksource_jiffies; | ||
49 | static struct clocksource *next_clocksource; | ||
50 | static LIST_HEAD(clocksource_list); | ||
51 | static DEFINE_SPINLOCK(clocksource_lock); | ||
52 | static char override_name[32]; | ||
53 | static int finished_booting; | ||
54 | |||
55 | /* clocksource_done_booting - Called near the end of bootup | ||
56 | * | ||
57 | * Hack to avoid lots of clocksource churn at boot time | ||
58 | */ | ||
59 | static int __init clocksource_done_booting(void) | ||
60 | { | ||
61 | finished_booting = 1; | ||
62 | return 0; | ||
63 | } | ||
64 | |||
65 | late_initcall(clocksource_done_booting); | ||
66 | |||
67 | /** | ||
68 | * clocksource_get_next - Returns the selected clocksource | ||
69 | * | ||
70 | */ | ||
71 | struct clocksource *clocksource_get_next(void) | ||
72 | { | ||
73 | unsigned long flags; | ||
74 | |||
75 | spin_lock_irqsave(&clocksource_lock, flags); | ||
76 | if (next_clocksource && finished_booting) { | ||
77 | curr_clocksource = next_clocksource; | ||
78 | next_clocksource = NULL; | ||
79 | } | ||
80 | spin_unlock_irqrestore(&clocksource_lock, flags); | ||
81 | |||
82 | return curr_clocksource; | ||
83 | } | ||
84 | |||
85 | /** | ||
86 | * select_clocksource - Finds the best registered clocksource. | ||
87 | * | ||
88 | * Private function. Must hold clocksource_lock when called. | ||
89 | * | ||
90 | * Looks through the list of registered clocksources, returning | ||
91 | * the one with the highest rating value. If there is a clocksource | ||
92 | * name that matches the override string, it returns that clocksource. | ||
93 | */ | ||
94 | static struct clocksource *select_clocksource(void) | ||
95 | { | ||
96 | struct clocksource *best = NULL; | ||
97 | struct list_head *tmp; | ||
98 | |||
99 | list_for_each(tmp, &clocksource_list) { | ||
100 | struct clocksource *src; | ||
101 | |||
102 | src = list_entry(tmp, struct clocksource, list); | ||
103 | if (!best) | ||
104 | best = src; | ||
105 | |||
106 | /* check for override: */ | ||
107 | if (strlen(src->name) == strlen(override_name) && | ||
108 | !strcmp(src->name, override_name)) { | ||
109 | best = src; | ||
110 | break; | ||
111 | } | ||
112 | /* pick the highest rating: */ | ||
113 | if (src->rating > best->rating) | ||
114 | best = src; | ||
115 | } | ||
116 | |||
117 | return best; | ||
118 | } | ||
119 | |||
120 | /** | ||
121 | * is_registered_source - Checks if clocksource is registered | ||
122 | * @c: pointer to a clocksource | ||
123 | * | ||
124 | * Private helper function. Must hold clocksource_lock when called. | ||
125 | * | ||
126 | * Returns one if the clocksource is already registered, zero otherwise. | ||
127 | */ | ||
128 | static int is_registered_source(struct clocksource *c) | ||
129 | { | ||
130 | int len = strlen(c->name); | ||
131 | struct list_head *tmp; | ||
132 | |||
133 | list_for_each(tmp, &clocksource_list) { | ||
134 | struct clocksource *src; | ||
135 | |||
136 | src = list_entry(tmp, struct clocksource, list); | ||
137 | if (strlen(src->name) == len && !strcmp(src->name, c->name)) | ||
138 | return 1; | ||
139 | } | ||
140 | |||
141 | return 0; | ||
142 | } | ||
143 | |||
144 | /** | ||
145 | * clocksource_register - Used to install new clocksources | ||
146 | * @c: clocksource to be registered | ||
147 | * | ||
148 | * Returns -EBUSY if registration fails, zero otherwise. | ||
149 | */ | ||
150 | int clocksource_register(struct clocksource *c) | ||
151 | { | ||
152 | int ret = 0; | ||
153 | unsigned long flags; | ||
154 | |||
155 | spin_lock_irqsave(&clocksource_lock, flags); | ||
156 | /* check if clocksource is already registered */ | ||
157 | if (is_registered_source(c)) { | ||
158 | printk("register_clocksource: Cannot register %s. " | ||
159 | "Already registered!", c->name); | ||
160 | ret = -EBUSY; | ||
161 | } else { | ||
162 | /* register it */ | ||
163 | list_add(&c->list, &clocksource_list); | ||
164 | /* scan the registered clocksources, and pick the best one */ | ||
165 | next_clocksource = select_clocksource(); | ||
166 | } | ||
167 | spin_unlock_irqrestore(&clocksource_lock, flags); | ||
168 | return ret; | ||
169 | } | ||
170 | EXPORT_SYMBOL(clocksource_register); | ||
171 | |||
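A hardware clocksource driver would register itself much like the jiffies clocksource added later in this patch. The sketch below is hypothetical (the counter-read helper, the 10 MHz rate, and the rating of 200 are made up), and it assumes the clocksource_hz2mult() helper from linux/clocksource.h for computing the fixed-point mult:

#include <linux/clocksource.h>
#include <linux/init.h>

extern u32 example_read_hw_counter(void);	/* hypothetical free-running 32-bit counter */

static cycle_t example_read(void)
{
	return (cycle_t)example_read_hw_counter();
}

static struct clocksource clocksource_example = {
	.name		= "example",
	.rating		= 200,		/* better than jiffies (0), below TSC-class sources */
	.read		= example_read,
	.mask		= 0xffffffff,	/* 32-bit counter */
	.shift		= 20,
	.is_continuous	= 1,		/* free running */
};

static int __init example_clocksource_init(void)
{
	/* assumed helper: convert a 10 MHz counter rate into .mult for .shift */
	clocksource_example.mult = clocksource_hz2mult(10000000, 20);
	return clocksource_register(&clocksource_example);
}
module_init(example_clocksource_init);

Once registered, select_clocksource() picks it over jiffies by rating, or regardless of rating if the clocksource= override names it.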
172 | /** | ||
173 | * clocksource_reselect - Rescan list for next clocksource | ||
174 | * | ||
175 | * A quick helper function to be used if a clocksource changes its | ||
176 | * rating. Forces the clocksource list to be re-scanned for the best | ||
177 | * clocksource. | ||
178 | */ | ||
179 | void clocksource_reselect(void) | ||
180 | { | ||
181 | unsigned long flags; | ||
182 | |||
183 | spin_lock_irqsave(&clocksource_lock, flags); | ||
184 | next_clocksource = select_clocksource(); | ||
185 | spin_unlock_irqrestore(&clocksource_lock, flags); | ||
186 | } | ||
187 | EXPORT_SYMBOL(clocksource_reselect); | ||
188 | |||
189 | /** | ||
190 | * sysfs_show_current_clocksources - sysfs interface for current clocksource | ||
191 | * @dev: unused | ||
192 | * @buf: char buffer to be filled with clocksource list | ||
193 | * | ||
194 | * Provides sysfs interface for listing current clocksource. | ||
195 | */ | ||
196 | static ssize_t | ||
197 | sysfs_show_current_clocksources(struct sys_device *dev, char *buf) | ||
198 | { | ||
199 | char *curr = buf; | ||
200 | |||
201 | spin_lock_irq(&clocksource_lock); | ||
202 | curr += sprintf(curr, "%s ", curr_clocksource->name); | ||
203 | spin_unlock_irq(&clocksource_lock); | ||
204 | |||
205 | curr += sprintf(curr, "\n"); | ||
206 | |||
207 | return curr - buf; | ||
208 | } | ||
209 | |||
210 | /** | ||
211 | * sysfs_override_clocksource - interface for manually overriding clocksource | ||
212 | * @dev: unused | ||
213 | * @buf: name of override clocksource | ||
214 | * @count: length of buffer | ||
215 | * | ||
216 | * Takes input from sysfs interface for manually overriding the default | ||
217 | * clocksource selection. | ||
218 | */ | ||
219 | static ssize_t sysfs_override_clocksource(struct sys_device *dev, | ||
220 | const char *buf, size_t count) | ||
221 | { | ||
222 | size_t ret = count; | ||
223 | /* strings from sysfs write are not 0 terminated! */ | ||
224 | if (count >= sizeof(override_name)) | ||
225 | return -EINVAL; | ||
226 | |||
227 | /* strip off \n: */ | ||
228 | if (buf[count-1] == '\n') | ||
229 | count--; | ||
230 | if (count < 1) | ||
231 | return -EINVAL; | ||
232 | |||
233 | spin_lock_irq(&clocksource_lock); | ||
234 | |||
235 | /* copy the name given: */ | ||
236 | memcpy(override_name, buf, count); | ||
237 | override_name[count] = 0; | ||
238 | |||
239 | /* try to select it: */ | ||
240 | next_clocksource = select_clocksource(); | ||
241 | |||
242 | spin_unlock_irq(&clocksource_lock); | ||
243 | |||
244 | return ret; | ||
245 | } | ||
246 | |||
247 | /** | ||
248 | * sysfs_show_available_clocksources - sysfs interface for listing clocksources | ||
249 | * @dev: unused | ||
250 | * @buf: char buffer to be filled with clocksource list | ||
251 | * | ||
252 | * Provides sysfs interface for listing registered clocksources | ||
253 | */ | ||
254 | static ssize_t | ||
255 | sysfs_show_available_clocksources(struct sys_device *dev, char *buf) | ||
256 | { | ||
257 | struct list_head *tmp; | ||
258 | char *curr = buf; | ||
259 | |||
260 | spin_lock_irq(&clocksource_lock); | ||
261 | list_for_each(tmp, &clocksource_list) { | ||
262 | struct clocksource *src; | ||
263 | |||
264 | src = list_entry(tmp, struct clocksource, list); | ||
265 | curr += sprintf(curr, "%s ", src->name); | ||
266 | } | ||
267 | spin_unlock_irq(&clocksource_lock); | ||
268 | |||
269 | curr += sprintf(curr, "\n"); | ||
270 | |||
271 | return curr - buf; | ||
272 | } | ||
273 | |||
274 | /* | ||
275 | * Sysfs setup bits: | ||
276 | */ | ||
277 | static SYSDEV_ATTR(current_clocksource, 0600, sysfs_show_current_clocksources, | ||
278 | sysfs_override_clocksource); | ||
279 | |||
280 | static SYSDEV_ATTR(available_clocksource, 0600, | ||
281 | sysfs_show_available_clocksources, NULL); | ||
282 | |||
283 | static struct sysdev_class clocksource_sysclass = { | ||
284 | set_kset_name("clocksource"), | ||
285 | }; | ||
286 | |||
287 | static struct sys_device device_clocksource = { | ||
288 | .id = 0, | ||
289 | .cls = &clocksource_sysclass, | ||
290 | }; | ||
291 | |||
292 | static int __init init_clocksource_sysfs(void) | ||
293 | { | ||
294 | int error = sysdev_class_register(&clocksource_sysclass); | ||
295 | |||
296 | if (!error) | ||
297 | error = sysdev_register(&device_clocksource); | ||
298 | if (!error) | ||
299 | error = sysdev_create_file( | ||
300 | &device_clocksource, | ||
301 | &attr_current_clocksource); | ||
302 | if (!error) | ||
303 | error = sysdev_create_file( | ||
304 | &device_clocksource, | ||
305 | &attr_available_clocksource); | ||
306 | return error; | ||
307 | } | ||
308 | |||
309 | device_initcall(init_clocksource_sysfs); | ||
310 | |||
311 | /** | ||
312 | * boot_override_clocksource - boot clock override | ||
313 | * @str: override name | ||
314 | * | ||
315 | * Takes a clocksource= boot argument and uses it | ||
316 | * as the clocksource override name. | ||
317 | */ | ||
318 | static int __init boot_override_clocksource(char* str) | ||
319 | { | ||
320 | unsigned long flags; | ||
321 | spin_lock_irqsave(&clocksource_lock, flags); | ||
322 | if (str) | ||
323 | strlcpy(override_name, str, sizeof(override_name)); | ||
324 | spin_unlock_irqrestore(&clocksource_lock, flags); | ||
325 | return 1; | ||
326 | } | ||
327 | |||
328 | __setup("clocksource=", boot_override_clocksource); | ||
329 | |||
330 | /** | ||
331 | * boot_override_clock - Compatibility layer for deprecated boot option | ||
332 | * @str: override name | ||
333 | * | ||
334 | * DEPRECATED! Takes a clock= boot argument and uses it | ||
335 | * as the clocksource override name | ||
336 | */ | ||
337 | static int __init boot_override_clock(char* str) | ||
338 | { | ||
339 | if (!strcmp(str, "pmtmr")) { | ||
340 | printk("Warning: clock=pmtmr is deprecated. " | ||
341 | "Use clocksource=acpi_pm.\n"); | ||
342 | return boot_override_clocksource("acpi_pm"); | ||
343 | } | ||
344 | printk("Warning! clock= boot option is deprecated. " | ||
345 | "Use clocksource=xyz\n"); | ||
346 | return boot_override_clocksource(str); | ||
347 | } | ||
348 | |||
349 | __setup("clock=", boot_override_clock); | ||
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c new file mode 100644 index 000000000000..126bb30c4afe --- /dev/null +++ b/kernel/time/jiffies.c | |||
@@ -0,0 +1,73 @@ | |||
1 | /*********************************************************************** | ||
2 | * linux/kernel/time/jiffies.c | ||
3 | * | ||
4 | * This file contains the jiffies based clocksource. | ||
5 | * | ||
6 | * Copyright (C) 2004, 2005 IBM, John Stultz (johnstul@us.ibm.com) | ||
7 | * | ||
8 | * This program is free software; you can redistribute it and/or modify | ||
9 | * it under the terms of the GNU General Public License as published by | ||
10 | * the Free Software Foundation; either version 2 of the License, or | ||
11 | * (at your option) any later version. | ||
12 | * | ||
13 | * This program is distributed in the hope that it will be useful, | ||
14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
16 | * GNU General Public License for more details. | ||
17 | * | ||
18 | * You should have received a copy of the GNU General Public License | ||
19 | * along with this program; if not, write to the Free Software | ||
20 | * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | ||
21 | * | ||
22 | ************************************************************************/ | ||
23 | #include <linux/clocksource.h> | ||
24 | #include <linux/jiffies.h> | ||
25 | #include <linux/init.h> | ||
26 | |||
27 | /* The Jiffies based clocksource is the lowest common | ||
28 | * denominator clock source which should function on | ||
29 | * all systems. It has the same coarse resolution as | ||
30 | * the timer interrupt frequency HZ and it suffers | ||
31 | * inaccuracies caused by missed or lost timer | ||
32 | * interrupts and the inability for the timer | ||
33 | * interrupt hardware to accurately tick at the | ||
34 | * requested HZ value. It is also not recommended | ||
35 | * for "tick-less" systems. | ||
36 | */ | ||
37 | #define NSEC_PER_JIFFY ((u32)((((u64)NSEC_PER_SEC)<<8)/ACTHZ)) | ||
38 | |||
39 | /* Since jiffies uses a simple NSEC_PER_JIFFY multiplier | ||
40 | * conversion, the .shift value could be zero. However | ||
41 | * this would make NTP adjustments impossible as they are | ||
42 | * in units of 1/2^.shift. Thus we use JIFFIES_SHIFT to | ||
43 | * shift both the numerator and denominator the same | ||
44 | * amount, and give ntp adjustments in units of 1/2^8 | ||
45 | * | ||
46 | * The value 8 is somewhat carefully chosen, as anything | ||
47 | * larger can result in overflows. NSEC_PER_JIFFY grows as | ||
48 | * HZ shrinks, so values greater than 8 overflow 32 bits when | ||
49 | * HZ=100. | ||
50 | */ | ||
51 | #define JIFFIES_SHIFT 8 | ||
52 | |||
53 | static cycle_t jiffies_read(void) | ||
54 | { | ||
55 | return (cycle_t) jiffies; | ||
56 | } | ||
57 | |||
58 | struct clocksource clocksource_jiffies = { | ||
59 | .name = "jiffies", | ||
60 | .rating = 0, /* lowest rating*/ | ||
61 | .read = jiffies_read, | ||
62 | .mask = 0xffffffff, /*32bits*/ | ||
63 | .mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */ | ||
64 | .shift = JIFFIES_SHIFT, | ||
65 | .is_continuous = 0, /* tick based, not free running */ | ||
66 | }; | ||
67 | |||
68 | static int __init init_jiffies_clocksource(void) | ||
69 | { | ||
70 | return clocksource_register(&clocksource_jiffies); | ||
71 | } | ||
72 | |||
73 | module_init(init_jiffies_clocksource); | ||
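The mult/shift pair is plain fixed-point arithmetic: nanoseconds = (cycles * mult) >> shift, which is what the cyc2ns() helper in linux/clocksource.h evaluates for the timekeeping code below. A standalone check of the jiffies values, assuming HZ=250 so one jiffy is roughly 4,000,000 ns:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* assumption: HZ=250, so NSEC_PER_JIFFY is about 4,000,000 */
	uint32_t nsec_per_jiffy = 4000000;
	uint32_t shift = 8;				/* JIFFIES_SHIFT */
	uint32_t mult = nsec_per_jiffy << shift;	/* .mult as set above */

	uint64_t cycles = 3;				/* three ticks since cycle_last */
	uint64_t ns = (cycles * mult) >> shift;		/* cyc2ns() arithmetic */

	printf("%llu ns\n", (unsigned long long)ns);	/* prints 12000000 */
	return 0;
}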
diff --git a/kernel/timer.c b/kernel/timer.c index eb97371b87d8..5a8960253063 100644 --- a/kernel/timer.c +++ b/kernel/timer.c | |||
@@ -597,7 +597,6 @@ long time_tolerance = MAXFREQ; /* frequency tolerance (ppm) */ | |||
597 | long time_precision = 1; /* clock precision (us) */ | 597 | long time_precision = 1; /* clock precision (us) */ |
598 | long time_maxerror = NTP_PHASE_LIMIT; /* maximum error (us) */ | 598 | long time_maxerror = NTP_PHASE_LIMIT; /* maximum error (us) */ |
599 | long time_esterror = NTP_PHASE_LIMIT; /* estimated error (us) */ | 599 | long time_esterror = NTP_PHASE_LIMIT; /* estimated error (us) */ |
600 | static long time_phase; /* phase offset (scaled us) */ | ||
601 | long time_freq = (((NSEC_PER_SEC + HZ/2) % HZ - HZ/2) << SHIFT_USEC) / NSEC_PER_USEC; | 600 | long time_freq = (((NSEC_PER_SEC + HZ/2) % HZ - HZ/2) << SHIFT_USEC) / NSEC_PER_USEC; |
602 | /* frequency offset (scaled ppm)*/ | 601 | /* frequency offset (scaled ppm)*/ |
603 | static long time_adj; /* tick adjust (scaled 1 / HZ) */ | 602 | static long time_adj; /* tick adjust (scaled 1 / HZ) */ |
@@ -747,27 +746,14 @@ static long adjtime_adjustment(void) | |||
747 | } | 746 | } |
748 | 747 | ||
749 | /* in the NTP reference this is called "hardclock()" */ | 748 | /* in the NTP reference this is called "hardclock()" */ |
750 | static void update_wall_time_one_tick(void) | 749 | static void update_ntp_one_tick(void) |
751 | { | 750 | { |
752 | long time_adjust_step, delta_nsec; | 751 | long time_adjust_step; |
753 | 752 | ||
754 | time_adjust_step = adjtime_adjustment(); | 753 | time_adjust_step = adjtime_adjustment(); |
755 | if (time_adjust_step) | 754 | if (time_adjust_step) |
756 | /* Reduce by this step the amount of time left */ | 755 | /* Reduce by this step the amount of time left */ |
757 | time_adjust -= time_adjust_step; | 756 | time_adjust -= time_adjust_step; |
758 | delta_nsec = tick_nsec + time_adjust_step * 1000; | ||
759 | /* | ||
760 | * Advance the phase, once it gets to one microsecond, then | ||
761 | * advance the tick more. | ||
762 | */ | ||
763 | time_phase += time_adj; | ||
764 | if ((time_phase >= FINENSEC) || (time_phase <= -FINENSEC)) { | ||
765 | long ltemp = shift_right(time_phase, (SHIFT_SCALE - 10)); | ||
766 | time_phase -= ltemp << (SHIFT_SCALE - 10); | ||
767 | delta_nsec += ltemp; | ||
768 | } | ||
769 | xtime.tv_nsec += delta_nsec; | ||
770 | time_interpolator_update(delta_nsec); | ||
771 | 757 | ||
772 | /* Changes by adjtime() do not take effect till next tick. */ | 758 | /* Changes by adjtime() do not take effect till next tick. */ |
773 | if (time_next_adjust != 0) { | 759 | if (time_next_adjust != 0) { |
@@ -780,36 +766,378 @@ static void update_wall_time_one_tick(void) | |||
780 | * Return how long ticks are at the moment, that is, how much time | 766 | * Return how long ticks are at the moment, that is, how much time |
781 | * update_wall_time_one_tick will add to xtime next time we call it | 767 | * update_wall_time_one_tick will add to xtime next time we call it |
782 | * (assuming no calls to do_adjtimex in the meantime). | 768 | * (assuming no calls to do_adjtimex in the meantime). |
783 | * The return value is in fixed-point nanoseconds with SHIFT_SCALE-10 | 769 | * The return value is in fixed-point nanoseconds shifted by the |
784 | * bits to the right of the binary point. | 770 | * specified number of bits to the right of the binary point. |
785 | * This function has no side-effects. | 771 | * This function has no side-effects. |
786 | */ | 772 | */ |
787 | u64 current_tick_length(void) | 773 | u64 current_tick_length(void) |
788 | { | 774 | { |
789 | long delta_nsec; | 775 | long delta_nsec; |
776 | u64 ret; | ||
790 | 777 | ||
778 | /* calculate the finest interval NTP will allow. | ||
779 | * ie: nanosecond value shifted by (SHIFT_SCALE - 10) | ||
780 | */ | ||
791 | delta_nsec = tick_nsec + adjtime_adjustment() * 1000; | 781 | delta_nsec = tick_nsec + adjtime_adjustment() * 1000; |
792 | return ((u64) delta_nsec << (SHIFT_SCALE - 10)) + time_adj; | 782 | ret = (u64)delta_nsec << TICK_LENGTH_SHIFT; |
783 | ret += (s64)time_adj << (TICK_LENGTH_SHIFT - (SHIFT_SCALE - 10)); | ||
784 | |||
785 | return ret; | ||
793 | } | 786 | } |
794 | 787 | ||
795 | /* | 788 | /* XXX - all of this timekeeping code should be later moved to time.c */ |
796 | * Using a loop looks inefficient, but "ticks" is | 789 | #include <linux/clocksource.h> |
797 | * usually just one (we shouldn't be losing ticks, | 790 | static struct clocksource *clock; /* pointer to current clocksource */ |
798 | * we're doing this this way mainly for interrupt | 791 | |
799 | * latency reasons, not because we think we'll | 792 | #ifdef CONFIG_GENERIC_TIME |
800 | * have lots of lost timer ticks | 793 | /** |
794 | * __get_nsec_offset - Returns nanoseconds since the last update_wall_time() | ||
795 | * | ||
796 | * private function, must hold xtime_lock when being | ||
797 | * called. Returns the number of nanoseconds since the | ||
798 | * last call to update_wall_time() (adjusted by NTP scaling) | ||
799 | */ | ||
800 | static inline s64 __get_nsec_offset(void) | ||
801 | { | ||
802 | cycle_t cycle_now, cycle_delta; | ||
803 | s64 ns_offset; | ||
804 | |||
805 | /* read clocksource: */ | ||
806 | cycle_now = clocksource_read(clock); | ||
807 | |||
808 | /* calculate the delta since the last update_wall_time: */ | ||
809 | cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; | ||
810 | |||
811 | /* convert to nanoseconds: */ | ||
812 | ns_offset = cyc2ns(clock, cycle_delta); | ||
813 | |||
814 | return ns_offset; | ||
815 | } | ||
816 | |||
817 | /** | ||
818 | * __get_realtime_clock_ts - Returns the time of day in a timespec | ||
819 | * @ts: pointer to the timespec to be set | ||
820 | * | ||
821 | * Returns the time of day in a timespec. Used by | ||
822 | * do_gettimeofday() and getnstimeofday(). | ||
801 | */ | 823 | */ |
802 | static void update_wall_time(unsigned long ticks) | 824 | static inline void __get_realtime_clock_ts(struct timespec *ts) |
803 | { | 825 | { |
826 | unsigned long seq; | ||
827 | s64 nsecs; | ||
828 | |||
829 | do { | ||
830 | seq = read_seqbegin(&xtime_lock); | ||
831 | |||
832 | *ts = xtime; | ||
833 | nsecs = __get_nsec_offset(); | ||
834 | |||
835 | } while (read_seqretry(&xtime_lock, seq)); | ||
836 | |||
837 | timespec_add_ns(ts, nsecs); | ||
838 | } | ||
839 | |||
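The do/while above is the ordinary seqlock read side: the reader copies xtime and the clocksource offset without taking a lock and simply retries if a writer (update_wall_time() or do_settimeofday(), both under xtime_lock) overlapped the copy. The generic shape of the pattern, with a hypothetical shared value:

#include <linux/seqlock.h>
#include <linux/types.h>

static seqlock_t sample_lock = SEQLOCK_UNLOCKED;
static u64 sample_value;		/* hypothetical shared data */

static u64 sample_read(void)
{
	unsigned long seq;
	u64 val;

	do {
		seq = read_seqbegin(&sample_lock);	/* snapshot the sequence count */
		val = sample_value;			/* lock-free copy of the data */
	} while (read_seqretry(&sample_lock, seq));	/* retry if a writer ran meanwhile */

	return val;
}

static void sample_write(u64 val)
{
	write_seqlock(&sample_lock);	/* writers still exclude one another */
	sample_value = val;
	write_sequnlock(&sample_lock);
}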
840 | /** | ||
841 | * getnstimeofday - Returns the time of day in a timespec | ||
842 | * @ts: pointer to the timespec to be set | ||
843 | * | ||
844 | * Returns the time of day in a timespec. | ||
845 | */ | ||
846 | void getnstimeofday(struct timespec *ts) | ||
847 | { | ||
848 | __get_realtime_clock_ts(ts); | ||
849 | } | ||
850 | |||
851 | EXPORT_SYMBOL(getnstimeofday); | ||
852 | |||
853 | /** | ||
854 | * do_gettimeofday - Returns the time of day in a timeval | ||
855 | * @tv: pointer to the timeval to be set | ||
856 | * | ||
857 | * NOTE: Users should be converted to using getnstimeofday() | ||
858 | */ | ||
859 | void do_gettimeofday(struct timeval *tv) | ||
860 | { | ||
861 | struct timespec now; | ||
862 | |||
863 | __get_realtime_clock_ts(&now); | ||
864 | tv->tv_sec = now.tv_sec; | ||
865 | tv->tv_usec = now.tv_nsec/1000; | ||
866 | } | ||
867 | |||
868 | EXPORT_SYMBOL(do_gettimeofday); | ||
869 | /** | ||
870 | * do_settimeofday - Sets the time of day | ||
871 | * @tv: pointer to the timespec variable containing the new time | ||
872 | * | ||
873 | * Sets the time of day to the new time, updates NTP and notifies hrtimers | ||
874 | */ | ||
875 | int do_settimeofday(struct timespec *tv) | ||
876 | { | ||
877 | unsigned long flags; | ||
878 | time_t wtm_sec, sec = tv->tv_sec; | ||
879 | long wtm_nsec, nsec = tv->tv_nsec; | ||
880 | |||
881 | if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) | ||
882 | return -EINVAL; | ||
883 | |||
884 | write_seqlock_irqsave(&xtime_lock, flags); | ||
885 | |||
886 | nsec -= __get_nsec_offset(); | ||
887 | |||
888 | wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec); | ||
889 | wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec); | ||
890 | |||
891 | set_normalized_timespec(&xtime, sec, nsec); | ||
892 | set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec); | ||
893 | |||
894 | ntp_clear(); | ||
895 | |||
896 | write_sequnlock_irqrestore(&xtime_lock, flags); | ||
897 | |||
898 | /* signal hrtimers about time change */ | ||
899 | clock_was_set(); | ||
900 | |||
901 | return 0; | ||
902 | } | ||
903 | |||
904 | EXPORT_SYMBOL(do_settimeofday); | ||
905 | |||
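The wtm_sec/wtm_nsec arithmetic above keeps the sum xtime + wall_to_monotonic constant, which is what prevents CLOCK_MONOTONIC from jumping when wall time is set. A quick userspace check of that invariant with made-up second values:

#include <stdio.h>

int main(void)
{
	/* stand-ins for xtime and wall_to_monotonic (seconds only) */
	long xtime = 1000000, wtm = -999000;
	long sum_before = xtime + wtm;

	long new_time = 1000500;		/* settimeofday() jumps wall time forward */
	wtm = wtm + (xtime - new_time);		/* same adjustment as in do_settimeofday() */
	xtime = new_time;

	printf("before=%ld after=%ld\n", sum_before, xtime + wtm);	/* equal */
	return 0;
}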
906 | /** | ||
907 | * change_clocksource - Swaps clocksources if a new one is available | ||
908 | * | ||
909 | * Accumulates current time interval and initializes new clocksource | ||
910 | */ | ||
911 | static int change_clocksource(void) | ||
912 | { | ||
913 | struct clocksource *new; | ||
914 | cycle_t now; | ||
915 | u64 nsec; | ||
916 | new = clocksource_get_next(); | ||
917 | if (clock != new) { | ||
918 | now = clocksource_read(new); | ||
919 | nsec = __get_nsec_offset(); | ||
920 | timespec_add_ns(&xtime, nsec); | ||
921 | |||
922 | clock = new; | ||
923 | clock->cycle_last = now; | ||
924 | printk(KERN_INFO "Time: %s clocksource has been installed.\n", | ||
925 | clock->name); | ||
926 | return 1; | ||
927 | } else if (clock->update_callback) { | ||
928 | return clock->update_callback(); | ||
929 | } | ||
930 | return 0; | ||
931 | } | ||
932 | #else | ||
933 | #define change_clocksource() (0) | ||
934 | #endif | ||
935 | |||
936 | /** | ||
937 | * timekeeping_is_continuous - check to see if timekeeping is free running | ||
938 | */ | ||
939 | int timekeeping_is_continuous(void) | ||
940 | { | ||
941 | unsigned long seq; | ||
942 | int ret; | ||
943 | |||
804 | do { | 944 | do { |
805 | ticks--; | 945 | seq = read_seqbegin(&xtime_lock); |
806 | update_wall_time_one_tick(); | 946 | |
807 | if (xtime.tv_nsec >= 1000000000) { | 947 | ret = clock->is_continuous; |
808 | xtime.tv_nsec -= 1000000000; | 948 | |
949 | } while (read_seqretry(&xtime_lock, seq)); | ||
950 | |||
951 | return ret; | ||
952 | } | ||
953 | |||
954 | /* | ||
955 | * timekeeping_init - Initializes the clocksource and common timekeeping values | ||
956 | */ | ||
957 | void __init timekeeping_init(void) | ||
958 | { | ||
959 | unsigned long flags; | ||
960 | |||
961 | write_seqlock_irqsave(&xtime_lock, flags); | ||
962 | clock = clocksource_get_next(); | ||
963 | clocksource_calculate_interval(clock, tick_nsec); | ||
964 | clock->cycle_last = clocksource_read(clock); | ||
965 | ntp_clear(); | ||
966 | write_sequnlock_irqrestore(&xtime_lock, flags); | ||
967 | } | ||
968 | |||
969 | |||
970 | /* | ||
971 | * timekeeping_resume - Resumes the generic timekeeping subsystem. | ||
972 | * @dev: unused | ||
973 | * | ||
974 | * This is for the generic clocksource timekeeping. | ||
975 | * xtime/wall_to_monotonic/jiffies/wall_jiffies/etc are | ||
976 | * still managed by arch specific suspend/resume code. | ||
977 | */ | ||
978 | static int timekeeping_resume(struct sys_device *dev) | ||
979 | { | ||
980 | unsigned long flags; | ||
981 | |||
982 | write_seqlock_irqsave(&xtime_lock, flags); | ||
983 | /* restart the last cycle value */ | ||
984 | clock->cycle_last = clocksource_read(clock); | ||
985 | write_sequnlock_irqrestore(&xtime_lock, flags); | ||
986 | return 0; | ||
987 | } | ||
988 | |||
989 | /* sysfs resume/suspend bits for timekeeping */ | ||
990 | static struct sysdev_class timekeeping_sysclass = { | ||
991 | .resume = timekeeping_resume, | ||
992 | set_kset_name("timekeeping"), | ||
993 | }; | ||
994 | |||
995 | static struct sys_device device_timer = { | ||
996 | .id = 0, | ||
997 | .cls = &timekeeping_sysclass, | ||
998 | }; | ||
999 | |||
1000 | static int __init timekeeping_init_device(void) | ||
1001 | { | ||
1002 | int error = sysdev_class_register(&timekeeping_sysclass); | ||
1003 | if (!error) | ||
1004 | error = sysdev_register(&device_timer); | ||
1005 | return error; | ||
1006 | } | ||
1007 | |||
1008 | device_initcall(timekeeping_init_device); | ||
1009 | |||
1010 | /* | ||
1011 | * If the error is already larger than a clock interval, we look | ||
1012 | * ahead another tick to compensate for late or lost adjustments. | ||
1013 | */ | ||
1014 | static __always_inline int clocksource_bigadjust(int sign, s64 error, s64 *interval, s64 *offset) | ||
1015 | { | ||
1016 | int adj; | ||
1017 | |||
1018 | /* | ||
1019 | * As soon as the machine is synchronized to the external time | ||
1020 | * source this should be the common case. | ||
1021 | */ | ||
1022 | error >>= 2; | ||
1023 | if (likely(sign > 0 ? error <= *interval : error >= *interval)) | ||
1024 | return sign; | ||
1025 | |||
1026 | /* | ||
1027 | * An extra look ahead dampens the effect of the current error, | ||
1028 | * which can grow quite large with continuously late updates, as | ||
1029 | * it would dominate the adjustment value and can lead to | ||
1030 | * oscillation. | ||
1031 | */ | ||
1032 | error += current_tick_length() >> (TICK_LENGTH_SHIFT - clock->shift + 1); | ||
1033 | error -= clock->xtime_interval >> 1; | ||
1034 | |||
1035 | adj = 0; | ||
1036 | while (1) { | ||
1037 | error >>= 1; | ||
1038 | if (sign > 0 ? error <= *interval : error >= *interval) | ||
1039 | break; | ||
1040 | adj++; | ||
1041 | } | ||
1042 | |||
1043 | /* | ||
1044 | * Add the current adjustments to the error and take the offset | ||
1045 | * into account; the latter can cause the error to be hardly | ||
1046 | * reduced at the next tick. Check the error again if there's | ||
1047 | * room for another adjustment, thus further reducing the error | ||
1048 | * which otherwise had to be corrected at the next update. | ||
1049 | */ | ||
1050 | error = (error << 1) - *interval + *offset; | ||
1051 | if (sign > 0 ? error > *interval : error < *interval) | ||
1052 | adj++; | ||
1053 | |||
1054 | *interval <<= adj; | ||
1055 | *offset <<= adj; | ||
1056 | return sign << adj; | ||
1057 | } | ||
1058 | |||
1059 | /* | ||
1060 | * Adjust the multiplier to reduce the error value, | ||
1061 | * this is optimized for the most common adjustments of -1,0,1, | ||
1062 | * for other values we can do a bit more work. | ||
1063 | */ | ||
1064 | static void clocksource_adjust(struct clocksource *clock, s64 offset) | ||
1065 | { | ||
1066 | s64 error, interval = clock->cycle_interval; | ||
1067 | int adj; | ||
1068 | |||
1069 | error = clock->error >> (TICK_LENGTH_SHIFT - clock->shift - 1); | ||
1070 | if (error > interval) { | ||
1071 | adj = clocksource_bigadjust(1, error, &interval, &offset); | ||
1072 | } else if (error < -interval) { | ||
1073 | interval = -interval; | ||
1074 | offset = -offset; | ||
1075 | adj = clocksource_bigadjust(-1, error, &interval, &offset); | ||
1076 | } else | ||
1077 | return; | ||
1078 | |||
1079 | clock->mult += adj; | ||
1080 | clock->xtime_interval += interval; | ||
1081 | clock->xtime_nsec -= offset; | ||
1082 | clock->error -= (interval - offset) << (TICK_LENGTH_SHIFT - clock->shift); | ||
1083 | } | ||
1084 | |||
1085 | /* | ||
1086 | * update_wall_time - Uses the current clocksource to increment the wall time | ||
1087 | * | ||
1088 | * Called from the timer interrupt, must hold a write on xtime_lock. | ||
1089 | */ | ||
1090 | static void update_wall_time(void) | ||
1091 | { | ||
1092 | cycle_t offset; | ||
1093 | |||
1094 | clock->xtime_nsec += (s64)xtime.tv_nsec << clock->shift; | ||
1095 | |||
1096 | #ifdef CONFIG_GENERIC_TIME | ||
1097 | offset = (clocksource_read(clock) - clock->cycle_last) & clock->mask; | ||
1098 | #else | ||
1099 | offset = clock->cycle_interval; | ||
1100 | #endif | ||
1101 | |||
1102 | /* normally this loop will run just once, however in the | ||
1103 | * case of lost or late ticks, it will accumulate correctly. | ||
1104 | */ | ||
1105 | while (offset >= clock->cycle_interval) { | ||
1106 | /* accumulate one interval */ | ||
1107 | clock->xtime_nsec += clock->xtime_interval; | ||
1108 | clock->cycle_last += clock->cycle_interval; | ||
1109 | offset -= clock->cycle_interval; | ||
1110 | |||
1111 | if (clock->xtime_nsec >= (u64)NSEC_PER_SEC << clock->shift) { | ||
1112 | clock->xtime_nsec -= (u64)NSEC_PER_SEC << clock->shift; | ||
809 | xtime.tv_sec++; | 1113 | xtime.tv_sec++; |
810 | second_overflow(); | 1114 | second_overflow(); |
811 | } | 1115 | } |
812 | } while (ticks); | 1116 | |
1117 | /* interpolator bits */ | ||
1118 | time_interpolator_update(clock->xtime_interval | ||
1119 | >> clock->shift); | ||
1120 | /* increment the NTP state machine */ | ||
1121 | update_ntp_one_tick(); | ||
1122 | |||
1123 | /* accumulate error between NTP and clock interval */ | ||
1124 | clock->error += current_tick_length(); | ||
1125 | clock->error -= clock->xtime_interval << (TICK_LENGTH_SHIFT - clock->shift); | ||
1126 | } | ||
1127 | |||
1128 | /* correct the clock when NTP error is too big */ | ||
1129 | clocksource_adjust(clock, offset); | ||
1130 | |||
1131 | /* store full nanoseconds into xtime */ | ||
1132 | xtime.tv_nsec = clock->xtime_nsec >> clock->shift; | ||
1133 | clock->xtime_nsec -= (s64)xtime.tv_nsec << clock->shift; | ||
1134 | |||
1135 | /* check to see if there is a new clocksource to use */ | ||
1136 | if (change_clocksource()) { | ||
1137 | clock->error = 0; | ||
1138 | clock->xtime_nsec = 0; | ||
1139 | clocksource_calculate_interval(clock, tick_nsec); | ||
1140 | } | ||
813 | } | 1141 | } |
814 | 1142 | ||
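[Editor's note] The accumulation loop in update_wall_time() works in (nanoseconds << clock->shift) fixed point so the clocksource multiplication loses no precision, carrying whole seconds out of the accumulator as it goes. A standalone sketch of just that accumulate-and-carry arithmetic, with illustrative numbers rather than a real clocksource:

        /* Sketch of the shifted fixed-point accumulation in update_wall_time().
         * A hypothetical 4ms tick is accumulated 1000 times; whole seconds are
         * carried out exactly as in the kernel loop above. */
        #include <stdio.h>

        #define NSEC_PER_SEC 1000000000ULL

        int main(void)
        {
                unsigned shift = 10;                            /* like clock->shift */
                unsigned long long xtime_interval = (NSEC_PER_SEC / 250) << shift;
                unsigned long long xtime_nsec = 0;
                unsigned long long sec = 0;
                unsigned long long ticks;

                for (ticks = 0; ticks < 1000; ticks++) {        /* 1000 * 4ms = 4s */
                        xtime_nsec += xtime_interval;
                        while (xtime_nsec >= (NSEC_PER_SEC << shift)) {
                                xtime_nsec -= NSEC_PER_SEC << shift;
                                sec++;          /* second_overflow() would run here */
                        }
                }
                /* prints: 4 sec + 0 nsec */
                printf("%llu sec + %llu nsec\n", sec, xtime_nsec >> shift);
                return 0;
        }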
815 | /* | 1143 | /* |
@@ -915,10 +1243,8 @@ static inline void update_times(void) | |||
915 | unsigned long ticks; | 1243 | unsigned long ticks; |
916 | 1244 | ||
917 | ticks = jiffies - wall_jiffies; | 1245 | ticks = jiffies - wall_jiffies; |
918 | if (ticks) { | 1246 | wall_jiffies += ticks; |
919 | wall_jiffies += ticks; | 1247 | update_wall_time(); |
920 | update_wall_time(ticks); | ||
921 | } | ||
922 | calc_load(ticks); | 1248 | calc_load(ticks); |
923 | } | 1249 | } |
924 | 1250 | ||
@@ -1326,7 +1652,7 @@ static void __devinit migrate_timers(int cpu) | |||
1326 | } | 1652 | } |
1327 | #endif /* CONFIG_HOTPLUG_CPU */ | 1653 | #endif /* CONFIG_HOTPLUG_CPU */ |
1328 | 1654 | ||
1329 | static int timer_cpu_notify(struct notifier_block *self, | 1655 | static int __devinit timer_cpu_notify(struct notifier_block *self, |
1330 | unsigned long action, void *hcpu) | 1656 | unsigned long action, void *hcpu) |
1331 | { | 1657 | { |
1332 | long cpu = (long)hcpu; | 1658 | long cpu = (long)hcpu; |
@@ -1346,7 +1672,7 @@ static int timer_cpu_notify(struct notifier_block *self, | |||
1346 | return NOTIFY_OK; | 1672 | return NOTIFY_OK; |
1347 | } | 1673 | } |
1348 | 1674 | ||
1349 | static struct notifier_block timers_nb = { | 1675 | static struct notifier_block __devinitdata timers_nb = { |
1350 | .notifier_call = timer_cpu_notify, | 1676 | .notifier_call = timer_cpu_notify, |
1351 | }; | 1677 | }; |
1352 | 1678 | ||
diff --git a/kernel/unwind.c b/kernel/unwind.c new file mode 100644 index 000000000000..f69c804c8e62 --- /dev/null +++ b/kernel/unwind.c | |||
@@ -0,0 +1,918 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2002-2006 Novell, Inc. | ||
3 | * Jan Beulich <jbeulich@novell.com> | ||
4 | * This code is released under version 2 of the GNU GPL. | ||
5 | * | ||
6 | * A simple API for unwinding kernel stacks. This is used for | ||
7 | * debugging and error reporting purposes. The kernel doesn't need | ||
8 | * full-blown stack unwinding with all the bells and whistles, so there | ||
9 | * is not much point in implementing the full Dwarf2 unwind API. | ||
10 | */ | ||
11 | |||
12 | #include <linux/unwind.h> | ||
13 | #include <linux/module.h> | ||
14 | #include <linux/delay.h> | ||
15 | #include <linux/stop_machine.h> | ||
16 | #include <asm/sections.h> | ||
17 | #include <asm/uaccess.h> | ||
18 | #include <asm/unaligned.h> | ||
19 | |||
20 | extern char __start_unwind[], __end_unwind[]; | ||
21 | |||
22 | #define MAX_STACK_DEPTH 8 | ||
23 | |||
24 | #define EXTRA_INFO(f) { \ | ||
25 | BUILD_BUG_ON_ZERO(offsetof(struct unwind_frame_info, f) \ | ||
26 | % FIELD_SIZEOF(struct unwind_frame_info, f)) \ | ||
27 | + offsetof(struct unwind_frame_info, f) \ | ||
28 | / FIELD_SIZEOF(struct unwind_frame_info, f), \ | ||
29 | FIELD_SIZEOF(struct unwind_frame_info, f) \ | ||
30 | } | ||
31 | #define PTREGS_INFO(f) EXTRA_INFO(regs.f) | ||
32 | |||
33 | static const struct { | ||
34 | unsigned offs:BITS_PER_LONG / 2; | ||
35 | unsigned width:BITS_PER_LONG / 2; | ||
36 | } reg_info[] = { | ||
37 | UNW_REGISTER_INFO | ||
38 | }; | ||
39 | |||
40 | #undef PTREGS_INFO | ||
41 | #undef EXTRA_INFO | ||
42 | |||
43 | #ifndef REG_INVALID | ||
44 | #define REG_INVALID(r) (reg_info[r].width == 0) | ||
45 | #endif | ||
46 | |||
47 | #define DW_CFA_nop 0x00 | ||
48 | #define DW_CFA_set_loc 0x01 | ||
49 | #define DW_CFA_advance_loc1 0x02 | ||
50 | #define DW_CFA_advance_loc2 0x03 | ||
51 | #define DW_CFA_advance_loc4 0x04 | ||
52 | #define DW_CFA_offset_extended 0x05 | ||
53 | #define DW_CFA_restore_extended 0x06 | ||
54 | #define DW_CFA_undefined 0x07 | ||
55 | #define DW_CFA_same_value 0x08 | ||
56 | #define DW_CFA_register 0x09 | ||
57 | #define DW_CFA_remember_state 0x0a | ||
58 | #define DW_CFA_restore_state 0x0b | ||
59 | #define DW_CFA_def_cfa 0x0c | ||
60 | #define DW_CFA_def_cfa_register 0x0d | ||
61 | #define DW_CFA_def_cfa_offset 0x0e | ||
62 | #define DW_CFA_def_cfa_expression 0x0f | ||
63 | #define DW_CFA_expression 0x10 | ||
64 | #define DW_CFA_offset_extended_sf 0x11 | ||
65 | #define DW_CFA_def_cfa_sf 0x12 | ||
66 | #define DW_CFA_def_cfa_offset_sf 0x13 | ||
67 | #define DW_CFA_val_offset 0x14 | ||
68 | #define DW_CFA_val_offset_sf 0x15 | ||
69 | #define DW_CFA_val_expression 0x16 | ||
70 | #define DW_CFA_lo_user 0x1c | ||
71 | #define DW_CFA_GNU_window_save 0x2d | ||
72 | #define DW_CFA_GNU_args_size 0x2e | ||
73 | #define DW_CFA_GNU_negative_offset_extended 0x2f | ||
74 | #define DW_CFA_hi_user 0x3f | ||
75 | |||
76 | #define DW_EH_PE_FORM 0x07 | ||
77 | #define DW_EH_PE_native 0x00 | ||
78 | #define DW_EH_PE_leb128 0x01 | ||
79 | #define DW_EH_PE_data2 0x02 | ||
80 | #define DW_EH_PE_data4 0x03 | ||
81 | #define DW_EH_PE_data8 0x04 | ||
82 | #define DW_EH_PE_signed 0x08 | ||
83 | #define DW_EH_PE_ADJUST 0x70 | ||
84 | #define DW_EH_PE_abs 0x00 | ||
85 | #define DW_EH_PE_pcrel 0x10 | ||
86 | #define DW_EH_PE_textrel 0x20 | ||
87 | #define DW_EH_PE_datarel 0x30 | ||
88 | #define DW_EH_PE_funcrel 0x40 | ||
89 | #define DW_EH_PE_aligned 0x50 | ||
90 | #define DW_EH_PE_indirect 0x80 | ||
91 | #define DW_EH_PE_omit 0xff | ||
92 | |||
93 | typedef unsigned long uleb128_t; | ||
94 | typedef signed long sleb128_t; | ||
95 | |||
96 | static struct unwind_table { | ||
97 | struct { | ||
98 | unsigned long pc; | ||
99 | unsigned long range; | ||
100 | } core, init; | ||
101 | const void *address; | ||
102 | unsigned long size; | ||
103 | struct unwind_table *link; | ||
104 | const char *name; | ||
105 | } root_table, *last_table; | ||
106 | |||
107 | struct unwind_item { | ||
108 | enum item_location { | ||
109 | Nowhere, | ||
110 | Memory, | ||
111 | Register, | ||
112 | Value | ||
113 | } where; | ||
114 | uleb128_t value; | ||
115 | }; | ||
116 | |||
117 | struct unwind_state { | ||
118 | uleb128_t loc, org; | ||
119 | const u8 *cieStart, *cieEnd; | ||
120 | uleb128_t codeAlign; | ||
121 | sleb128_t dataAlign; | ||
122 | struct cfa { | ||
123 | uleb128_t reg, offs; | ||
124 | } cfa; | ||
125 | struct unwind_item regs[ARRAY_SIZE(reg_info)]; | ||
126 | unsigned stackDepth:8; | ||
127 | unsigned version:8; | ||
128 | const u8 *label; | ||
129 | const u8 *stack[MAX_STACK_DEPTH]; | ||
130 | }; | ||
131 | |||
132 | static const struct cfa badCFA = { ARRAY_SIZE(reg_info), 1 }; | ||
133 | |||
134 | static struct unwind_table *find_table(unsigned long pc) | ||
135 | { | ||
136 | struct unwind_table *table; | ||
137 | |||
138 | for (table = &root_table; table; table = table->link) | ||
139 | if ((pc >= table->core.pc | ||
140 | && pc < table->core.pc + table->core.range) | ||
141 | || (pc >= table->init.pc | ||
142 | && pc < table->init.pc + table->init.range)) | ||
143 | break; | ||
144 | |||
145 | return table; | ||
146 | } | ||
147 | |||
148 | static void init_unwind_table(struct unwind_table *table, | ||
149 | const char *name, | ||
150 | const void *core_start, | ||
151 | unsigned long core_size, | ||
152 | const void *init_start, | ||
153 | unsigned long init_size, | ||
154 | const void *table_start, | ||
155 | unsigned long table_size) | ||
156 | { | ||
157 | table->core.pc = (unsigned long)core_start; | ||
158 | table->core.range = core_size; | ||
159 | table->init.pc = (unsigned long)init_start; | ||
160 | table->init.range = init_size; | ||
161 | table->address = table_start; | ||
162 | table->size = table_size; | ||
163 | table->link = NULL; | ||
164 | table->name = name; | ||
165 | } | ||
166 | |||
167 | void __init unwind_init(void) | ||
168 | { | ||
169 | init_unwind_table(&root_table, "kernel", | ||
170 | _text, _end - _text, | ||
171 | NULL, 0, | ||
172 | __start_unwind, __end_unwind - __start_unwind); | ||
173 | } | ||
174 | |||
175 | #ifdef CONFIG_MODULES | ||
176 | |||
177 | /* Must be called with module_mutex held. */ | ||
178 | void *unwind_add_table(struct module *module, | ||
179 | const void *table_start, | ||
180 | unsigned long table_size) | ||
181 | { | ||
182 | struct unwind_table *table; | ||
183 | |||
184 | if (table_size <= 0) | ||
185 | return NULL; | ||
186 | |||
187 | table = kmalloc(sizeof(*table), GFP_KERNEL); | ||
188 | if (!table) | ||
189 | return NULL; | ||
190 | |||
191 | init_unwind_table(table, module->name, | ||
192 | module->module_core, module->core_size, | ||
193 | module->module_init, module->init_size, | ||
194 | table_start, table_size); | ||
195 | |||
196 | if (last_table) | ||
197 | last_table->link = table; | ||
198 | else | ||
199 | root_table.link = table; | ||
200 | last_table = table; | ||
201 | |||
202 | return table; | ||
203 | } | ||
204 | |||
205 | struct unlink_table_info | ||
206 | { | ||
207 | struct unwind_table *table; | ||
208 | int init_only; | ||
209 | }; | ||
210 | |||
211 | static int unlink_table(void *arg) | ||
212 | { | ||
213 | struct unlink_table_info *info = arg; | ||
214 | struct unwind_table *table = info->table, *prev; | ||
215 | |||
216 | for (prev = &root_table; prev->link && prev->link != table; prev = prev->link) | ||
217 | ; | ||
218 | |||
219 | if (prev->link) { | ||
220 | if (info->init_only) { | ||
221 | table->init.pc = 0; | ||
222 | table->init.range = 0; | ||
223 | info->table = NULL; | ||
224 | } else { | ||
225 | prev->link = table->link; | ||
226 | if (!prev->link) | ||
227 | last_table = prev; | ||
228 | } | ||
229 | } else | ||
230 | info->table = NULL; | ||
231 | |||
232 | return 0; | ||
233 | } | ||
234 | |||
235 | /* Must be called with module_mutex held. */ | ||
236 | void unwind_remove_table(void *handle, int init_only) | ||
237 | { | ||
238 | struct unwind_table *table = handle; | ||
239 | struct unlink_table_info info; | ||
240 | |||
241 | if (!table || table == &root_table) | ||
242 | return; | ||
243 | |||
244 | if (init_only && table == last_table) { | ||
245 | table->init.pc = 0; | ||
246 | table->init.range = 0; | ||
247 | return; | ||
248 | } | ||
249 | |||
250 | info.table = table; | ||
251 | info.init_only = init_only; | ||
252 | stop_machine_run(unlink_table, &info, NR_CPUS); | ||
253 | |||
254 | if (info.table) | ||
255 | kfree(table); | ||
256 | } | ||
257 | |||
258 | #endif /* CONFIG_MODULES */ | ||
259 | |||
260 | static uleb128_t get_uleb128(const u8 **pcur, const u8 *end) | ||
261 | { | ||
262 | const u8 *cur = *pcur; | ||
263 | uleb128_t value; | ||
264 | unsigned shift; | ||
265 | |||
266 | for (shift = 0, value = 0; cur < end; shift += 7) { | ||
267 | if (shift + 7 > 8 * sizeof(value) | ||
268 | && (*cur & 0x7fU) >= (1U << (8 * sizeof(value) - shift))) { | ||
269 | cur = end + 1; | ||
270 | break; | ||
271 | } | ||
272 | value |= (uleb128_t)(*cur & 0x7f) << shift; | ||
273 | if (!(*cur++ & 0x80)) | ||
274 | break; | ||
275 | } | ||
276 | *pcur = cur; | ||
277 | |||
278 | return value; | ||
279 | } | ||
280 | |||
281 | static sleb128_t get_sleb128(const u8 **pcur, const u8 *end) | ||
282 | { | ||
283 | const u8 *cur = *pcur; | ||
284 | sleb128_t value; | ||
285 | unsigned shift; | ||
286 | |||
287 | for (shift = 0, value = 0; cur < end; shift += 7) { | ||
288 | if (shift + 7 > 8 * sizeof(value) | ||
289 | && (*cur & 0x7fU) >= (1U << (8 * sizeof(value) - shift))) { | ||
290 | cur = end + 1; | ||
291 | break; | ||
292 | } | ||
293 | value |= (sleb128_t)(*cur & 0x7f) << shift; | ||
294 | if (!(*cur & 0x80)) { | ||
295 | value |= -(*cur++ & 0x40) << shift; | ||
296 | break; | ||
297 | } | ||
298 | } | ||
299 | *pcur = cur; | ||
300 | |||
301 | return value; | ||
302 | } | ||
303 | |||
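[Editor's note] get_uleb128()/get_sleb128() above decode the DWARF LEB128 variable-length integers used throughout the CFI data: seven payload bits per byte, least-significant group first, high bit set on every byte except the last. A standalone decoder without the bounds/overflow guards, fed the worked example from the DWARF specification:

        /* Standalone ULEB128 decoder; same algorithm as get_uleb128() above,
         * minus the end/overflow checks.  The sample bytes are the DWARF
         * spec example. */
        #include <stdio.h>

        static unsigned long uleb128(const unsigned char **p)
        {
                unsigned long value = 0;
                unsigned shift = 0;

                do {
                        value |= (unsigned long)(**p & 0x7f) << shift;
                        shift += 7;
                } while (*(*p)++ & 0x80);       /* high bit set: more bytes follow */

                return value;
        }

        int main(void)
        {
                const unsigned char enc[] = { 0xe5, 0x8e, 0x26 };
                const unsigned char *p = enc;

                printf("%lu\n", uleb128(&p));   /* prints 624485 */
                return 0;
        }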
304 | static unsigned long read_pointer(const u8 **pLoc, | ||
305 | const void *end, | ||
306 | signed ptrType) | ||
307 | { | ||
308 | unsigned long value = 0; | ||
309 | union { | ||
310 | const u8 *p8; | ||
311 | const u16 *p16u; | ||
312 | const s16 *p16s; | ||
313 | const u32 *p32u; | ||
314 | const s32 *p32s; | ||
315 | const unsigned long *pul; | ||
316 | } ptr; | ||
317 | |||
318 | if (ptrType < 0 || ptrType == DW_EH_PE_omit) | ||
319 | return 0; | ||
320 | ptr.p8 = *pLoc; | ||
321 | switch(ptrType & DW_EH_PE_FORM) { | ||
322 | case DW_EH_PE_data2: | ||
323 | if (end < (const void *)(ptr.p16u + 1)) | ||
324 | return 0; | ||
325 | if(ptrType & DW_EH_PE_signed) | ||
326 | value = get_unaligned(ptr.p16s++); | ||
327 | else | ||
328 | value = get_unaligned(ptr.p16u++); | ||
329 | break; | ||
330 | case DW_EH_PE_data4: | ||
331 | #ifdef CONFIG_64BIT | ||
332 | if (end < (const void *)(ptr.p32u + 1)) | ||
333 | return 0; | ||
334 | if(ptrType & DW_EH_PE_signed) | ||
335 | value = get_unaligned(ptr.p32s++); | ||
336 | else | ||
337 | value = get_unaligned(ptr.p32u++); | ||
338 | break; | ||
339 | case DW_EH_PE_data8: | ||
340 | BUILD_BUG_ON(sizeof(u64) != sizeof(value)); | ||
341 | #else | ||
342 | BUILD_BUG_ON(sizeof(u32) != sizeof(value)); | ||
343 | #endif | ||
344 | case DW_EH_PE_native: | ||
345 | if (end < (const void *)(ptr.pul + 1)) | ||
346 | return 0; | ||
347 | value = get_unaligned(ptr.pul++); | ||
348 | break; | ||
349 | case DW_EH_PE_leb128: | ||
350 | BUILD_BUG_ON(sizeof(uleb128_t) > sizeof(value)); | ||
351 | value = ptrType & DW_EH_PE_signed | ||
352 | ? get_sleb128(&ptr.p8, end) | ||
353 | : get_uleb128(&ptr.p8, end); | ||
354 | if ((const void *)ptr.p8 > end) | ||
355 | return 0; | ||
356 | break; | ||
357 | default: | ||
358 | return 0; | ||
359 | } | ||
360 | switch(ptrType & DW_EH_PE_ADJUST) { | ||
361 | case DW_EH_PE_abs: | ||
362 | break; | ||
363 | case DW_EH_PE_pcrel: | ||
364 | value += (unsigned long)*pLoc; | ||
365 | break; | ||
366 | default: | ||
367 | return 0; | ||
368 | } | ||
369 | if ((ptrType & DW_EH_PE_indirect) | ||
370 | && __get_user(value, (unsigned long *)value)) | ||
371 | return 0; | ||
372 | *pLoc = ptr.p8; | ||
373 | |||
374 | return value; | ||
375 | } | ||
376 | |||
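[Editor's note] read_pointer() above dispatches on the DW_EH_PE encoding byte: the low three bits pick the data form, bit 3 signedness, bits 4-6 the base the value is relative to, and bit 7 an extra indirection. A standalone sketch that just unpacks those fields for two common encodings:

        /* Sketch of how read_pointer() interprets a DW_EH_PE encoding byte.
         * Decoding only; no pointer is actually read. */
        #include <stdio.h>

        #define DW_EH_PE_FORM     0x07
        #define DW_EH_PE_signed   0x08
        #define DW_EH_PE_ADJUST   0x70
        #define DW_EH_PE_indirect 0x80

        static void explain(unsigned enc)
        {
                printf("0x%02x: form=%u signed=%d adjust=0x%02x indirect=%d\n",
                       enc,
                       enc & DW_EH_PE_FORM,
                       !!(enc & DW_EH_PE_signed),
                       enc & DW_EH_PE_ADJUST,
                       !!(enc & DW_EH_PE_indirect));
        }

        int main(void)
        {
                explain(0x1b);  /* pcrel sdata4: form=3 signed=1 adjust=0x10 */
                explain(0x9b);  /* the same, read through one indirection */
                return 0;
        }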
377 | static signed fde_pointer_type(const u32 *cie) | ||
378 | { | ||
379 | const u8 *ptr = (const u8 *)(cie + 2); | ||
380 | unsigned version = *ptr; | ||
381 | |||
382 | if (version != 1) | ||
383 | return -1; /* unsupported */ | ||
384 | if (*++ptr) { | ||
385 | const char *aug; | ||
386 | const u8 *end = (const u8 *)(cie + 1) + *cie; | ||
387 | uleb128_t len; | ||
388 | |||
389 | /* check if augmentation size is first (and thus present) */ | ||
390 | if (*ptr != 'z') | ||
391 | return -1; | ||
392 | /* check if augmentation string is nul-terminated */ | ||
393 | if ((ptr = memchr(aug = (const void *)ptr, 0, end - ptr)) == NULL) | ||
394 | return -1; | ||
395 | ++ptr; /* skip terminator */ | ||
396 | get_uleb128(&ptr, end); /* skip code alignment */ | ||
397 | get_sleb128(&ptr, end); /* skip data alignment */ | ||
398 | /* skip return address column */ | ||
399 | version <= 1 ? (void)++ptr : (void)get_uleb128(&ptr, end); | ||
400 | len = get_uleb128(&ptr, end); /* augmentation length */ | ||
401 | if (ptr + len < ptr || ptr + len > end) | ||
402 | return -1; | ||
403 | end = ptr + len; | ||
404 | while (*++aug) { | ||
405 | if (ptr >= end) | ||
406 | return -1; | ||
407 | switch(*aug) { | ||
408 | case 'L': | ||
409 | ++ptr; | ||
410 | break; | ||
411 | case 'P': { | ||
412 | signed ptrType = *ptr++; | ||
413 | |||
414 | if (!read_pointer(&ptr, end, ptrType) || ptr > end) | ||
415 | return -1; | ||
416 | } | ||
417 | break; | ||
418 | case 'R': | ||
419 | return *ptr; | ||
420 | default: | ||
421 | return -1; | ||
422 | } | ||
423 | } | ||
424 | } | ||
425 | return DW_EH_PE_native|DW_EH_PE_abs; | ||
426 | } | ||
427 | |||
428 | static int advance_loc(unsigned long delta, struct unwind_state *state) | ||
429 | { | ||
430 | state->loc += delta * state->codeAlign; | ||
431 | |||
432 | return delta > 0; | ||
433 | } | ||
434 | |||
435 | static void set_rule(uleb128_t reg, | ||
436 | enum item_location where, | ||
437 | uleb128_t value, | ||
438 | struct unwind_state *state) | ||
439 | { | ||
440 | if (reg < ARRAY_SIZE(state->regs)) { | ||
441 | state->regs[reg].where = where; | ||
442 | state->regs[reg].value = value; | ||
443 | } | ||
444 | } | ||
445 | |||
446 | static int processCFI(const u8 *start, | ||
447 | const u8 *end, | ||
448 | unsigned long targetLoc, | ||
449 | signed ptrType, | ||
450 | struct unwind_state *state) | ||
451 | { | ||
452 | union { | ||
453 | const u8 *p8; | ||
454 | const u16 *p16; | ||
455 | const u32 *p32; | ||
456 | } ptr; | ||
457 | int result = 1; | ||
458 | |||
459 | if (start != state->cieStart) { | ||
460 | state->loc = state->org; | ||
461 | result = processCFI(state->cieStart, state->cieEnd, 0, ptrType, state); | ||
462 | if (targetLoc == 0 && state->label == NULL) | ||
463 | return result; | ||
464 | } | ||
465 | for (ptr.p8 = start; result && ptr.p8 < end; ) { | ||
466 | switch(*ptr.p8 >> 6) { | ||
467 | uleb128_t value; | ||
468 | |||
469 | case 0: | ||
470 | switch(*ptr.p8++) { | ||
471 | case DW_CFA_nop: | ||
472 | break; | ||
473 | case DW_CFA_set_loc: | ||
474 | if ((state->loc = read_pointer(&ptr.p8, end, ptrType)) == 0) | ||
475 | result = 0; | ||
476 | break; | ||
477 | case DW_CFA_advance_loc1: | ||
478 | result = ptr.p8 < end && advance_loc(*ptr.p8++, state); | ||
479 | break; | ||
480 | case DW_CFA_advance_loc2: | ||
481 | result = ptr.p8 <= end + 2 | ||
482 | && advance_loc(*ptr.p16++, state); | ||
483 | break; | ||
484 | case DW_CFA_advance_loc4: | ||
485 | result = ptr.p8 <= end + 4 | ||
486 | && advance_loc(*ptr.p32++, state); | ||
487 | break; | ||
488 | case DW_CFA_offset_extended: | ||
489 | value = get_uleb128(&ptr.p8, end); | ||
490 | set_rule(value, Memory, get_uleb128(&ptr.p8, end), state); | ||
491 | break; | ||
492 | case DW_CFA_val_offset: | ||
493 | value = get_uleb128(&ptr.p8, end); | ||
494 | set_rule(value, Value, get_uleb128(&ptr.p8, end), state); | ||
495 | break; | ||
496 | case DW_CFA_offset_extended_sf: | ||
497 | value = get_uleb128(&ptr.p8, end); | ||
498 | set_rule(value, Memory, get_sleb128(&ptr.p8, end), state); | ||
499 | break; | ||
500 | case DW_CFA_val_offset_sf: | ||
501 | value = get_uleb128(&ptr.p8, end); | ||
502 | set_rule(value, Value, get_sleb128(&ptr.p8, end), state); | ||
503 | break; | ||
504 | case DW_CFA_restore_extended: | ||
505 | case DW_CFA_undefined: | ||
506 | case DW_CFA_same_value: | ||
507 | set_rule(get_uleb128(&ptr.p8, end), Nowhere, 0, state); | ||
508 | break; | ||
509 | case DW_CFA_register: | ||
510 | value = get_uleb128(&ptr.p8, end); | ||
511 | set_rule(value, | ||
512 | Register, | ||
513 | get_uleb128(&ptr.p8, end), state); | ||
514 | break; | ||
515 | case DW_CFA_remember_state: | ||
516 | if (ptr.p8 == state->label) { | ||
517 | state->label = NULL; | ||
518 | return 1; | ||
519 | } | ||
520 | if (state->stackDepth >= MAX_STACK_DEPTH) | ||
521 | return 0; | ||
522 | state->stack[state->stackDepth++] = ptr.p8; | ||
523 | break; | ||
524 | case DW_CFA_restore_state: | ||
525 | if (state->stackDepth) { | ||
526 | const uleb128_t loc = state->loc; | ||
527 | const u8 *label = state->label; | ||
528 | |||
529 | state->label = state->stack[state->stackDepth - 1]; | ||
530 | memcpy(&state->cfa, &badCFA, sizeof(state->cfa)); | ||
531 | memset(state->regs, 0, sizeof(state->regs)); | ||
532 | state->stackDepth = 0; | ||
533 | result = processCFI(start, end, 0, ptrType, state); | ||
534 | state->loc = loc; | ||
535 | state->label = label; | ||
536 | } else | ||
537 | return 0; | ||
538 | break; | ||
539 | case DW_CFA_def_cfa: | ||
540 | state->cfa.reg = get_uleb128(&ptr.p8, end); | ||
541 | /*nobreak*/ | ||
542 | case DW_CFA_def_cfa_offset: | ||
543 | state->cfa.offs = get_uleb128(&ptr.p8, end); | ||
544 | break; | ||
545 | case DW_CFA_def_cfa_sf: | ||
546 | state->cfa.reg = get_uleb128(&ptr.p8, end); | ||
547 | /*nobreak*/ | ||
548 | case DW_CFA_def_cfa_offset_sf: | ||
549 | state->cfa.offs = get_sleb128(&ptr.p8, end) | ||
550 | * state->dataAlign; | ||
551 | break; | ||
552 | case DW_CFA_def_cfa_register: | ||
553 | state->cfa.reg = get_uleb128(&ptr.p8, end); | ||
554 | break; | ||
555 | /*todo case DW_CFA_def_cfa_expression: */ | ||
556 | /*todo case DW_CFA_expression: */ | ||
557 | /*todo case DW_CFA_val_expression: */ | ||
558 | case DW_CFA_GNU_args_size: | ||
559 | get_uleb128(&ptr.p8, end); | ||
560 | break; | ||
561 | case DW_CFA_GNU_negative_offset_extended: | ||
562 | value = get_uleb128(&ptr.p8, end); | ||
563 | set_rule(value, | ||
564 | Memory, | ||
565 | (uleb128_t)0 - get_uleb128(&ptr.p8, end), state); | ||
566 | break; | ||
567 | case DW_CFA_GNU_window_save: | ||
568 | default: | ||
569 | result = 0; | ||
570 | break; | ||
571 | } | ||
572 | break; | ||
573 | case 1: | ||
574 | result = advance_loc(*ptr.p8++ & 0x3f, state); | ||
575 | break; | ||
576 | case 2: | ||
577 | value = *ptr.p8++ & 0x3f; | ||
578 | set_rule(value, Memory, get_uleb128(&ptr.p8, end), state); | ||
579 | break; | ||
580 | case 3: | ||
581 | set_rule(*ptr.p8++ & 0x3f, Nowhere, 0, state); | ||
582 | break; | ||
583 | } | ||
584 | if (ptr.p8 > end) | ||
585 | result = 0; | ||
586 | if (result && targetLoc != 0 && targetLoc < state->loc) | ||
587 | return 1; | ||
588 | } | ||
589 | |||
590 | return result | ||
591 | && ptr.p8 == end | ||
592 | && (targetLoc == 0 | ||
593 | || (/*todo While in theory this should apply, gcc in practice omits | ||
594 | everything past the function prolog, and hence the location | ||
595 | never reaches the end of the function. | ||
596 | targetLoc < state->loc &&*/ state->label == NULL)); | ||
597 | } | ||
598 | |||
599 | /* Unwind to previous frame. Returns 0 if successful, negative | ||
600 | * number in case of an error. */ | ||
601 | int unwind(struct unwind_frame_info *frame) | ||
602 | { | ||
603 | #define FRAME_REG(r, t) (((t *)frame)[reg_info[r].offs]) | ||
604 | const u32 *fde = NULL, *cie = NULL; | ||
605 | const u8 *ptr = NULL, *end = NULL; | ||
606 | unsigned long startLoc = 0, endLoc = 0, cfa; | ||
607 | unsigned i; | ||
608 | signed ptrType = -1; | ||
609 | uleb128_t retAddrReg = 0; | ||
610 | struct unwind_table *table; | ||
611 | struct unwind_state state; | ||
612 | |||
613 | if (UNW_PC(frame) == 0) | ||
614 | return -EINVAL; | ||
615 | if ((table = find_table(UNW_PC(frame))) != NULL | ||
616 | && !(table->size & (sizeof(*fde) - 1))) { | ||
617 | unsigned long tableSize = table->size; | ||
618 | |||
619 | for (fde = table->address; | ||
620 | tableSize > sizeof(*fde) && tableSize - sizeof(*fde) >= *fde; | ||
621 | tableSize -= sizeof(*fde) + *fde, | ||
622 | fde += 1 + *fde / sizeof(*fde)) { | ||
623 | if (!*fde || (*fde & (sizeof(*fde) - 1))) | ||
624 | break; | ||
625 | if (!fde[1]) | ||
626 | continue; /* this is a CIE */ | ||
627 | if ((fde[1] & (sizeof(*fde) - 1)) | ||
628 | || fde[1] > (unsigned long)(fde + 1) | ||
629 | - (unsigned long)table->address) | ||
630 | continue; /* this is not a valid FDE */ | ||
631 | cie = fde + 1 - fde[1] / sizeof(*fde); | ||
632 | if (*cie <= sizeof(*cie) + 4 | ||
633 | || *cie >= fde[1] - sizeof(*fde) | ||
634 | || (*cie & (sizeof(*cie) - 1)) | ||
635 | || cie[1] | ||
636 | || (ptrType = fde_pointer_type(cie)) < 0) { | ||
637 | cie = NULL; /* this is not a (valid) CIE */ | ||
638 | continue; | ||
639 | } | ||
640 | ptr = (const u8 *)(fde + 2); | ||
641 | startLoc = read_pointer(&ptr, | ||
642 | (const u8 *)(fde + 1) + *fde, | ||
643 | ptrType); | ||
644 | endLoc = startLoc | ||
645 | + read_pointer(&ptr, | ||
646 | (const u8 *)(fde + 1) + *fde, | ||
647 | ptrType & DW_EH_PE_indirect | ||
648 | ? ptrType | ||
649 | : ptrType & (DW_EH_PE_FORM|DW_EH_PE_signed)); | ||
650 | if (UNW_PC(frame) >= startLoc && UNW_PC(frame) < endLoc) | ||
651 | break; | ||
652 | cie = NULL; | ||
653 | } | ||
654 | } | ||
655 | if (cie != NULL) { | ||
656 | memset(&state, 0, sizeof(state)); | ||
657 | state.cieEnd = ptr; /* keep here temporarily */ | ||
658 | ptr = (const u8 *)(cie + 2); | ||
659 | end = (const u8 *)(cie + 1) + *cie; | ||
660 | if ((state.version = *ptr) != 1) | ||
661 | cie = NULL; /* unsupported version */ | ||
662 | else if (*++ptr) { | ||
663 | /* check if augmentation size is first (and thus present) */ | ||
664 | if (*ptr == 'z') { | ||
665 | /* check for ignorable (or already handled) | ||
666 | * nul-terminated augmentation string */ | ||
667 | while (++ptr < end && *ptr) | ||
668 | if (strchr("LPR", *ptr) == NULL) | ||
669 | break; | ||
670 | } | ||
671 | if (ptr >= end || *ptr) | ||
672 | cie = NULL; | ||
673 | } | ||
674 | ++ptr; | ||
675 | } | ||
676 | if (cie != NULL) { | ||
677 | /* get code alignment factor */ | ||
678 | state.codeAlign = get_uleb128(&ptr, end); | ||
680 | /* get data alignment factor */ | ||
680 | state.dataAlign = get_sleb128(&ptr, end); | ||
681 | if (state.codeAlign == 0 || state.dataAlign == 0 || ptr >= end) | ||
682 | cie = NULL; | ||
683 | else { | ||
684 | retAddrReg = state.version <= 1 ? *ptr++ : get_uleb128(&ptr, end); | ||
685 | /* skip augmentation */ | ||
686 | if (((const char *)(cie + 2))[1] == 'z') | ||
687 | ptr += get_uleb128(&ptr, end); | ||
688 | if (ptr > end | ||
689 | || retAddrReg >= ARRAY_SIZE(reg_info) | ||
690 | || REG_INVALID(retAddrReg) | ||
691 | || reg_info[retAddrReg].width != sizeof(unsigned long)) | ||
692 | cie = NULL; | ||
693 | } | ||
694 | } | ||
695 | if (cie != NULL) { | ||
696 | state.cieStart = ptr; | ||
697 | ptr = state.cieEnd; | ||
698 | state.cieEnd = end; | ||
699 | end = (const u8 *)(fde + 1) + *fde; | ||
700 | /* skip augmentation */ | ||
701 | if (((const char *)(cie + 2))[1] == 'z') { | ||
702 | uleb128_t augSize = get_uleb128(&ptr, end); | ||
703 | |||
704 | if ((ptr += augSize) > end) | ||
705 | fde = NULL; | ||
706 | } | ||
707 | } | ||
708 | if (cie == NULL || fde == NULL) { | ||
709 | #ifdef CONFIG_FRAME_POINTER | ||
710 | unsigned long top, bottom; | ||
711 | #endif | ||
712 | |||
713 | #ifdef CONFIG_FRAME_POINTER | ||
714 | top = STACK_TOP(frame->task); | ||
715 | bottom = STACK_BOTTOM(frame->task); | ||
716 | # if FRAME_RETADDR_OFFSET < 0 | ||
717 | if (UNW_SP(frame) < top | ||
718 | && UNW_FP(frame) <= UNW_SP(frame) | ||
719 | && bottom < UNW_FP(frame) | ||
720 | # else | ||
721 | if (UNW_SP(frame) > top | ||
722 | && UNW_FP(frame) >= UNW_SP(frame) | ||
723 | && bottom > UNW_FP(frame) | ||
724 | # endif | ||
725 | && !((UNW_SP(frame) | UNW_FP(frame)) | ||
726 | & (sizeof(unsigned long) - 1))) { | ||
727 | unsigned long link; | ||
728 | |||
729 | if (!__get_user(link, | ||
730 | (unsigned long *)(UNW_FP(frame) | ||
731 | + FRAME_LINK_OFFSET)) | ||
732 | # if FRAME_RETADDR_OFFSET < 0 | ||
733 | && link > bottom && link < UNW_FP(frame) | ||
734 | # else | ||
735 | && link > UNW_FP(frame) && link < bottom | ||
736 | # endif | ||
737 | && !(link & (sizeof(link) - 1)) | ||
738 | && !__get_user(UNW_PC(frame), | ||
739 | (unsigned long *)(UNW_FP(frame) | ||
740 | + FRAME_RETADDR_OFFSET))) { | ||
741 | UNW_SP(frame) = UNW_FP(frame) + FRAME_RETADDR_OFFSET | ||
742 | # if FRAME_RETADDR_OFFSET < 0 | ||
743 | - | ||
744 | # else | ||
745 | + | ||
746 | # endif | ||
747 | sizeof(UNW_PC(frame)); | ||
748 | UNW_FP(frame) = link; | ||
749 | return 0; | ||
750 | } | ||
751 | } | ||
752 | #endif | ||
753 | return -ENXIO; | ||
754 | } | ||
755 | state.org = startLoc; | ||
756 | memcpy(&state.cfa, &badCFA, sizeof(state.cfa)); | ||
757 | /* process instructions */ | ||
758 | if (!processCFI(ptr, end, UNW_PC(frame), ptrType, &state) | ||
759 | || state.loc > endLoc | ||
760 | || state.regs[retAddrReg].where == Nowhere | ||
761 | || state.cfa.reg >= ARRAY_SIZE(reg_info) | ||
762 | || reg_info[state.cfa.reg].width != sizeof(unsigned long) | ||
763 | || state.cfa.offs % sizeof(unsigned long)) | ||
764 | return -EIO; | ||
765 | /* update frame */ | ||
766 | cfa = FRAME_REG(state.cfa.reg, unsigned long) + state.cfa.offs; | ||
767 | startLoc = min((unsigned long)UNW_SP(frame), cfa); | ||
768 | endLoc = max((unsigned long)UNW_SP(frame), cfa); | ||
769 | if (STACK_LIMIT(startLoc) != STACK_LIMIT(endLoc)) { | ||
770 | startLoc = min(STACK_LIMIT(cfa), cfa); | ||
771 | endLoc = max(STACK_LIMIT(cfa), cfa); | ||
772 | } | ||
773 | #ifndef CONFIG_64BIT | ||
774 | # define CASES CASE(8); CASE(16); CASE(32) | ||
775 | #else | ||
776 | # define CASES CASE(8); CASE(16); CASE(32); CASE(64) | ||
777 | #endif | ||
778 | for (i = 0; i < ARRAY_SIZE(state.regs); ++i) { | ||
779 | if (REG_INVALID(i)) { | ||
780 | if (state.regs[i].where == Nowhere) | ||
781 | continue; | ||
782 | return -EIO; | ||
783 | } | ||
784 | switch(state.regs[i].where) { | ||
785 | default: | ||
786 | break; | ||
787 | case Register: | ||
788 | if (state.regs[i].value >= ARRAY_SIZE(reg_info) | ||
789 | || REG_INVALID(state.regs[i].value) | ||
790 | || reg_info[i].width > reg_info[state.regs[i].value].width) | ||
791 | return -EIO; | ||
792 | switch(reg_info[state.regs[i].value].width) { | ||
793 | #define CASE(n) \ | ||
794 | case sizeof(u##n): \ | ||
795 | state.regs[i].value = FRAME_REG(state.regs[i].value, \ | ||
796 | const u##n); \ | ||
797 | break | ||
798 | CASES; | ||
799 | #undef CASE | ||
800 | default: | ||
801 | return -EIO; | ||
802 | } | ||
803 | break; | ||
804 | } | ||
805 | } | ||
806 | for (i = 0; i < ARRAY_SIZE(state.regs); ++i) { | ||
807 | if (REG_INVALID(i)) | ||
808 | continue; | ||
809 | switch(state.regs[i].where) { | ||
810 | case Nowhere: | ||
811 | if (reg_info[i].width != sizeof(UNW_SP(frame)) | ||
812 | || &FRAME_REG(i, __typeof__(UNW_SP(frame))) | ||
813 | != &UNW_SP(frame)) | ||
814 | continue; | ||
815 | UNW_SP(frame) = cfa; | ||
816 | break; | ||
817 | case Register: | ||
818 | switch(reg_info[i].width) { | ||
819 | #define CASE(n) case sizeof(u##n): \ | ||
820 | FRAME_REG(i, u##n) = state.regs[i].value; \ | ||
821 | break | ||
822 | CASES; | ||
823 | #undef CASE | ||
824 | default: | ||
825 | return -EIO; | ||
826 | } | ||
827 | break; | ||
828 | case Value: | ||
829 | if (reg_info[i].width != sizeof(unsigned long)) | ||
830 | return -EIO; | ||
831 | FRAME_REG(i, unsigned long) = cfa + state.regs[i].value | ||
832 | * state.dataAlign; | ||
833 | break; | ||
834 | case Memory: { | ||
835 | unsigned long addr = cfa + state.regs[i].value | ||
836 | * state.dataAlign; | ||
837 | |||
838 | if ((state.regs[i].value * state.dataAlign) | ||
839 | % sizeof(unsigned long) | ||
840 | || addr < startLoc | ||
841 | || addr + sizeof(unsigned long) < addr | ||
842 | || addr + sizeof(unsigned long) > endLoc) | ||
843 | return -EIO; | ||
844 | switch(reg_info[i].width) { | ||
845 | #define CASE(n) case sizeof(u##n): \ | ||
846 | __get_user(FRAME_REG(i, u##n), (u##n *)addr); \ | ||
847 | break | ||
848 | CASES; | ||
849 | #undef CASE | ||
850 | default: | ||
851 | return -EIO; | ||
852 | } | ||
853 | } | ||
854 | break; | ||
855 | } | ||
856 | } | ||
857 | |||
858 | return 0; | ||
859 | #undef CASES | ||
860 | #undef FRAME_REG | ||
861 | } | ||
862 | EXPORT_SYMBOL(unwind); | ||
863 | |||
864 | int unwind_init_frame_info(struct unwind_frame_info *info, | ||
865 | struct task_struct *tsk, | ||
866 | /*const*/ struct pt_regs *regs) | ||
867 | { | ||
868 | info->task = tsk; | ||
869 | arch_unw_init_frame_info(info, regs); | ||
870 | |||
871 | return 0; | ||
872 | } | ||
873 | EXPORT_SYMBOL(unwind_init_frame_info); | ||
874 | |||
875 | /* | ||
876 | * Prepare to unwind a blocked task. | ||
877 | */ | ||
878 | int unwind_init_blocked(struct unwind_frame_info *info, | ||
879 | struct task_struct *tsk) | ||
880 | { | ||
881 | info->task = tsk; | ||
882 | arch_unw_init_blocked(info); | ||
883 | |||
884 | return 0; | ||
885 | } | ||
886 | EXPORT_SYMBOL(unwind_init_blocked); | ||
887 | |||
888 | /* | ||
889 | * Prepare to unwind the currently running thread. | ||
890 | */ | ||
891 | int unwind_init_running(struct unwind_frame_info *info, | ||
892 | asmlinkage int (*callback)(struct unwind_frame_info *, | ||
893 | void *arg), | ||
894 | void *arg) | ||
895 | { | ||
896 | info->task = current; | ||
897 | |||
898 | return arch_unwind_init_running(info, callback, arg); | ||
899 | } | ||
900 | EXPORT_SYMBOL(unwind_init_running); | ||
901 | |||
902 | /* | ||
903 | * Unwind until the return pointer is in user-land (or until an error | ||
904 | * occurs). Returns 0 if successful, negative number in case of | ||
905 | * error. | ||
906 | */ | ||
907 | int unwind_to_user(struct unwind_frame_info *info) | ||
908 | { | ||
909 | while (!arch_unw_user_mode(info)) { | ||
910 | int err = unwind(info); | ||
911 | |||
912 | if (err < 0) | ||
913 | return err; | ||
914 | } | ||
915 | |||
916 | return 0; | ||
917 | } | ||
918 | EXPORT_SYMBOL(unwind_to_user); | ||
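[Editor's note] For context, a hedged sketch of how a caller might drive this API to dump a blocked task's stack. It uses only symbols defined or referenced in this file (unwind_init_blocked(), unwind(), arch_unw_user_mode(), UNW_PC()); the arch macros come from the per-arch unwind header, and this is not buildable outside a kernel tree with CONFIG_STACK_UNWIND.

        /* Hypothetical caller: walk a blocked task's frames until the unwinder
         * fails or reaches user mode, printing each return address. */
        #include <linux/kernel.h>
        #include <linux/sched.h>
        #include <linux/unwind.h>

        static void dump_blocked_task(struct task_struct *tsk)
        {
                struct unwind_frame_info info;

                unwind_init_blocked(&info, tsk);

                while (!arch_unw_user_mode(&info)) {
                        printk(KERN_DEBUG "  [<%p>]\n", (void *)UNW_PC(&info));
                        if (unwind(&info) < 0)
                                break;
                }
        }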
diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 565cf7a1febd..59f0b42bd89e 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c | |||
@@ -559,7 +559,7 @@ static void take_over_work(struct workqueue_struct *wq, unsigned int cpu) | |||
559 | } | 559 | } |
560 | 560 | ||
561 | /* We're holding the cpucontrol mutex here */ | 561 | /* We're holding the cpucontrol mutex here */ |
562 | static int workqueue_cpu_callback(struct notifier_block *nfb, | 562 | static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, |
563 | unsigned long action, | 563 | unsigned long action, |
564 | void *hcpu) | 564 | void *hcpu) |
565 | { | 565 | { |