path: root/kernel
author    Paul Mackerras <paulus@samba.org>    2006-07-31 20:37:25 -0400
committer Paul Mackerras <paulus@samba.org>    2006-07-31 20:37:25 -0400
commit    57cad8084e0837e0f2c97da789ec9b3f36809be9 (patch)
tree      e9c790afb4286f78cb08d9664f58baa7e876fe55 /kernel
parent    cb18bd40030c879cd93fef02fd579f74dbab473d (diff)
parent    49b1e3ea19b1c95c2f012b8331ffb3b169e4c042 (diff)
Merge branch 'merge'
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile           |    2
-rw-r--r--  kernel/acct.c             |    4
-rw-r--r--  kernel/cpu.c              |   75
-rw-r--r--  kernel/cpuset.c           |   24
-rw-r--r--  kernel/delayacct.c        |  178
-rw-r--r--  kernel/exit.c             |   10
-rw-r--r--  kernel/fork.c             |   10
-rw-r--r--  kernel/futex.c            |  127
-rw-r--r--  kernel/futex_compat.c     |   34
-rw-r--r--  kernel/hrtimer.c          |    4
-rw-r--r--  kernel/irq/manage.c       |   28
-rw-r--r--  kernel/kallsyms.c         |    4
-rw-r--r--  kernel/kprobes.c          |    1
-rw-r--r--  kernel/kthread.c          |   24
-rw-r--r--  kernel/lockdep.c          |  136
-rw-r--r--  kernel/module.c           |   11
-rw-r--r--  kernel/panic.c            |    2
-rw-r--r--  kernel/power/pm.c         |   37
-rw-r--r--  kernel/power/snapshot.c   |   10
-rw-r--r--  kernel/power/swap.c       |   26
-rw-r--r--  kernel/printk.c           |    4
-rw-r--r--  kernel/rcupdate.c         |    4
-rw-r--r--  kernel/resource.c         |    2
-rw-r--r--  kernel/rtmutex-tester.c   |    1
-rw-r--r--  kernel/rtmutex.c          |    2
-rw-r--r--  kernel/sched.c            |  125
-rw-r--r--  kernel/softirq.c          |   24
-rw-r--r--  kernel/softlockup.c       |    4
-rw-r--r--  kernel/sys.c              |    2
-rw-r--r--  kernel/taskstats.c        |  564
-rw-r--r--  kernel/timer.c            |  113
-rw-r--r--  kernel/wait.c             |    8
-rw-r--r--  kernel/workqueue.c        |   58
33 files changed, 1310 insertions, 348 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 47dbcd570cd8..d62ec66c1af2 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -48,6 +48,8 @@ obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
 obj-$(CONFIG_SECCOMP) += seccomp.o
 obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
 obj-$(CONFIG_RELAY) += relay.o
+obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
+obj-$(CONFIG_TASKSTATS) += taskstats.o
 
 ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
 # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/acct.c b/kernel/acct.c
index f18e0b8df3e1..2a7c933651c7 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -488,7 +488,7 @@ static void do_acct_process(struct file *file)
         old_encode_dev(tty_devnum(current->signal->tty)) : 0;
         read_unlock(&tasklist_lock);
 
-        spin_lock(&current->sighand->siglock);
+        spin_lock_irq(&current->sighand->siglock);
         ac.ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime)));
         ac.ac_stime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_stime)));
         ac.ac_flag = pacct->ac_flag;
@@ -496,7 +496,7 @@ static void do_acct_process(struct file *file)
         ac.ac_minflt = encode_comp_t(pacct->ac_minflt);
         ac.ac_majflt = encode_comp_t(pacct->ac_majflt);
         ac.ac_exitcode = pacct->ac_exitcode;
-        spin_unlock(&current->sighand->siglock);
+        spin_unlock_irq(&current->sighand->siglock);
         ac.ac_io = encode_comp_t(0 /* current->io_usage */); /* %% */
         ac.ac_rw = encode_comp_t(ac.ac_io / 1024);
         ac.ac_swaps = encode_comp_t(0);
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 70fbf2e83766..f230f9ae01c2 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -16,56 +16,48 @@
16#include <linux/mutex.h> 16#include <linux/mutex.h>
17 17
18/* This protects CPUs going up and down... */ 18/* This protects CPUs going up and down... */
19static DEFINE_MUTEX(cpucontrol); 19static DEFINE_MUTEX(cpu_add_remove_lock);
20static DEFINE_MUTEX(cpu_bitmask_lock);
20 21
21static __cpuinitdata BLOCKING_NOTIFIER_HEAD(cpu_chain); 22static __cpuinitdata BLOCKING_NOTIFIER_HEAD(cpu_chain);
22 23
23#ifdef CONFIG_HOTPLUG_CPU 24#ifdef CONFIG_HOTPLUG_CPU
24static struct task_struct *lock_cpu_hotplug_owner;
25static int lock_cpu_hotplug_depth;
26 25
27static int __lock_cpu_hotplug(int interruptible) 26/* Crappy recursive lock-takers in cpufreq! Complain loudly about idiots */
28{ 27static struct task_struct *recursive;
29 int ret = 0; 28static int recursive_depth;
30
31 if (lock_cpu_hotplug_owner != current) {
32 if (interruptible)
33 ret = mutex_lock_interruptible(&cpucontrol);
34 else
35 mutex_lock(&cpucontrol);
36 }
37
38 /*
39 * Set only if we succeed in locking
40 */
41 if (!ret) {
42 lock_cpu_hotplug_depth++;
43 lock_cpu_hotplug_owner = current;
44 }
45
46 return ret;
47}
48 29
49void lock_cpu_hotplug(void) 30void lock_cpu_hotplug(void)
50{ 31{
51 __lock_cpu_hotplug(0); 32 struct task_struct *tsk = current;
33
34 if (tsk == recursive) {
35 static int warnings = 10;
36 if (warnings) {
37 printk(KERN_ERR "Lukewarm IQ detected in hotplug locking\n");
38 WARN_ON(1);
39 warnings--;
40 }
41 recursive_depth++;
42 return;
43 }
44 mutex_lock(&cpu_bitmask_lock);
45 recursive = tsk;
52} 46}
53EXPORT_SYMBOL_GPL(lock_cpu_hotplug); 47EXPORT_SYMBOL_GPL(lock_cpu_hotplug);
54 48
55void unlock_cpu_hotplug(void) 49void unlock_cpu_hotplug(void)
56{ 50{
57 if (--lock_cpu_hotplug_depth == 0) { 51 WARN_ON(recursive != current);
58 lock_cpu_hotplug_owner = NULL; 52 if (recursive_depth) {
59 mutex_unlock(&cpucontrol); 53 recursive_depth--;
54 return;
60 } 55 }
56 mutex_unlock(&cpu_bitmask_lock);
57 recursive = NULL;
61} 58}
62EXPORT_SYMBOL_GPL(unlock_cpu_hotplug); 59EXPORT_SYMBOL_GPL(unlock_cpu_hotplug);
63 60
64int lock_cpu_hotplug_interruptible(void)
65{
66 return __lock_cpu_hotplug(1);
67}
68EXPORT_SYMBOL_GPL(lock_cpu_hotplug_interruptible);
69#endif /* CONFIG_HOTPLUG_CPU */ 61#endif /* CONFIG_HOTPLUG_CPU */
70 62
71/* Need to know about CPUs going up/down? */ 63/* Need to know about CPUs going up/down? */
@@ -122,9 +114,7 @@ int cpu_down(unsigned int cpu)
122 struct task_struct *p; 114 struct task_struct *p;
123 cpumask_t old_allowed, tmp; 115 cpumask_t old_allowed, tmp;
124 116
125 if ((err = lock_cpu_hotplug_interruptible()) != 0) 117 mutex_lock(&cpu_add_remove_lock);
126 return err;
127
128 if (num_online_cpus() == 1) { 118 if (num_online_cpus() == 1) {
129 err = -EBUSY; 119 err = -EBUSY;
130 goto out; 120 goto out;
@@ -150,7 +140,10 @@ int cpu_down(unsigned int cpu)
150 cpu_clear(cpu, tmp); 140 cpu_clear(cpu, tmp);
151 set_cpus_allowed(current, tmp); 141 set_cpus_allowed(current, tmp);
152 142
143 mutex_lock(&cpu_bitmask_lock);
153 p = __stop_machine_run(take_cpu_down, NULL, cpu); 144 p = __stop_machine_run(take_cpu_down, NULL, cpu);
145 mutex_unlock(&cpu_bitmask_lock);
146
154 if (IS_ERR(p)) { 147 if (IS_ERR(p)) {
155 /* CPU didn't die: tell everyone. Can't complain. */ 148 /* CPU didn't die: tell everyone. Can't complain. */
156 if (blocking_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED, 149 if (blocking_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED,
@@ -187,7 +180,7 @@ out_thread:
187out_allowed: 180out_allowed:
188 set_cpus_allowed(current, old_allowed); 181 set_cpus_allowed(current, old_allowed);
189out: 182out:
190 unlock_cpu_hotplug(); 183 mutex_unlock(&cpu_add_remove_lock);
191 return err; 184 return err;
192} 185}
193#endif /*CONFIG_HOTPLUG_CPU*/ 186#endif /*CONFIG_HOTPLUG_CPU*/
@@ -197,9 +190,7 @@ int __devinit cpu_up(unsigned int cpu)
197 int ret; 190 int ret;
198 void *hcpu = (void *)(long)cpu; 191 void *hcpu = (void *)(long)cpu;
199 192
200 if ((ret = lock_cpu_hotplug_interruptible()) != 0) 193 mutex_lock(&cpu_add_remove_lock);
201 return ret;
202
203 if (cpu_online(cpu) || !cpu_present(cpu)) { 194 if (cpu_online(cpu) || !cpu_present(cpu)) {
204 ret = -EINVAL; 195 ret = -EINVAL;
205 goto out; 196 goto out;
@@ -214,7 +205,9 @@ int __devinit cpu_up(unsigned int cpu)
214 } 205 }
215 206
216 /* Arch-specific enabling code. */ 207 /* Arch-specific enabling code. */
208 mutex_lock(&cpu_bitmask_lock);
217 ret = __cpu_up(cpu); 209 ret = __cpu_up(cpu);
210 mutex_unlock(&cpu_bitmask_lock);
218 if (ret != 0) 211 if (ret != 0)
219 goto out_notify; 212 goto out_notify;
220 BUG_ON(!cpu_online(cpu)); 213 BUG_ON(!cpu_online(cpu));
@@ -227,6 +220,6 @@ out_notify:
227 blocking_notifier_call_chain(&cpu_chain, 220 blocking_notifier_call_chain(&cpu_chain,
228 CPU_UP_CANCELED, hcpu); 221 CPU_UP_CANCELED, hcpu);
229out: 222out:
230 unlock_cpu_hotplug(); 223 mutex_unlock(&cpu_add_remove_lock);
231 return ret; 224 return ret;
232} 225}
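The lock_cpu_hotplug()/unlock_cpu_hotplug() rewrite above tolerates the recursive callers it complains about by remembering which task currently holds cpu_bitmask_lock and counting re-entries instead of deadlocking. A minimal stand-alone sketch of that owner-tracking recursion guard, using pthreads and invented names (hotplug_lock/hotplug_unlock); this is an illustration of the pattern, not the kernel code:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t bitmask_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_t owner;          /* thread currently holding the lock */
static int owner_valid;          /* owner is only meaningful while held */
static int recursion_depth;      /* re-entries by the owning thread */

static void hotplug_lock(void)
{
        /* Only the owning thread can match here, mirroring the kernel check. */
        if (owner_valid && pthread_equal(owner, pthread_self())) {
                fprintf(stderr, "warning: recursive hotplug lock\n");
                recursion_depth++;
                return;
        }
        pthread_mutex_lock(&bitmask_lock);
        owner = pthread_self();
        owner_valid = 1;
}

static void hotplug_unlock(void)
{
        if (recursion_depth) {          /* undo one level of nesting only */
                recursion_depth--;
                return;
        }
        owner_valid = 0;
        pthread_mutex_unlock(&bitmask_lock);
}

int main(void)
{
        hotplug_lock();
        hotplug_lock();      /* nested call is absorbed, not deadlocked */
        hotplug_unlock();
        hotplug_unlock();    /* outermost unlock releases the mutex */
        return 0;
}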
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index c232dc077438..1a649f2bb9bb 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -762,6 +762,8 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
  *
  * Call with manage_mutex held. May nest a call to the
  * lock_cpu_hotplug()/unlock_cpu_hotplug() pair.
+ * Must not be called holding callback_mutex, because we must
+ * not call lock_cpu_hotplug() while holding callback_mutex.
  */
 
 static void update_cpu_domains(struct cpuset *cur)
@@ -781,7 +783,7 @@ static void update_cpu_domains(struct cpuset *cur)
                 if (is_cpu_exclusive(c))
                         cpus_andnot(pspan, pspan, c->cpus_allowed);
         }
-        if (is_removed(cur) || !is_cpu_exclusive(cur)) {
+        if (!is_cpu_exclusive(cur)) {
                 cpus_or(pspan, pspan, cur->cpus_allowed);
                 if (cpus_equal(pspan, cur->cpus_allowed))
                         return;
@@ -1917,6 +1919,17 @@ static int cpuset_mkdir(struct inode *dir, struct dentry *dentry, int mode)
         return cpuset_create(c_parent, dentry->d_name.name, mode | S_IFDIR);
 }
 
+/*
+ * Locking note on the strange update_flag() call below:
+ *
+ * If the cpuset being removed is marked cpu_exclusive, then simulate
+ * turning cpu_exclusive off, which will call update_cpu_domains().
+ * The lock_cpu_hotplug() call in update_cpu_domains() must not be
+ * made while holding callback_mutex. Elsewhere the kernel nests
+ * callback_mutex inside lock_cpu_hotplug() calls. So the reverse
+ * nesting would risk an ABBA deadlock.
+ */
+
 static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry)
 {
         struct cpuset *cs = dentry->d_fsdata;
@@ -1936,11 +1949,16 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry)
                 mutex_unlock(&manage_mutex);
                 return -EBUSY;
         }
+        if (is_cpu_exclusive(cs)) {
+                int retval = update_flag(CS_CPU_EXCLUSIVE, cs, "0");
+                if (retval < 0) {
+                        mutex_unlock(&manage_mutex);
+                        return retval;
+                }
+        }
         parent = cs->parent;
         mutex_lock(&callback_mutex);
         set_bit(CS_REMOVED, &cs->flags);
-        if (is_cpu_exclusive(cs))
-                update_cpu_domains(cs);
         list_del(&cs->sibling);  /* delete my sibling from parent->children */
         spin_lock(&cs->dentry->d_lock);
         d = dget(cs->dentry);
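The locking note in the cpuset hunk above is about classic ABBA deadlock avoidance: elsewhere the kernel takes callback_mutex inside lock_cpu_hotplug(), so cpuset_rmdir() must never take them in the opposite order. A small pthread sketch of why a fixed A-then-B acquisition order removes that deadlock; the lock names are hypothetical stand-ins, not the kernel's locks:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock_a = PTHREAD_MUTEX_INITIALIZER;  /* e.g. the hotplug lock */
static pthread_mutex_t lock_b = PTHREAD_MUTEX_INITIALIZER;  /* e.g. callback_mutex */

/* Every path takes A before B, so two threads can never wait on each other. */
static void *worker(void *arg)
{
        pthread_mutex_lock(&lock_a);
        pthread_mutex_lock(&lock_b);
        /* ... critical section that nests B inside A ... */
        pthread_mutex_unlock(&lock_b);
        pthread_mutex_unlock(&lock_a);
        return arg;
}

int main(void)
{
        pthread_t t1, t2;

        pthread_create(&t1, NULL, worker, NULL);
        pthread_create(&t2, NULL, worker, NULL);
        pthread_join(t1, NULL);
        pthread_join(t2, NULL);
        puts("done: consistent A->B ordering, no ABBA deadlock possible");
        return 0;
}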
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
new file mode 100644
index 000000000000..57ca3730205d
--- /dev/null
+++ b/kernel/delayacct.c
@@ -0,0 +1,178 @@
1/* delayacct.c - per-task delay accounting
2 *
3 * Copyright (C) Shailabh Nagar, IBM Corp. 2006
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it would be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
13 * the GNU General Public License for more details.
14 */
15
16#include <linux/sched.h>
17#include <linux/slab.h>
18#include <linux/time.h>
19#include <linux/sysctl.h>
20#include <linux/delayacct.h>
21
22int delayacct_on __read_mostly = 1; /* Delay accounting turned on/off */
23kmem_cache_t *delayacct_cache;
24
25static int __init delayacct_setup_disable(char *str)
26{
27 delayacct_on = 0;
28 return 1;
29}
30__setup("nodelayacct", delayacct_setup_disable);
31
32void delayacct_init(void)
33{
34 delayacct_cache = kmem_cache_create("delayacct_cache",
35 sizeof(struct task_delay_info),
36 0,
37 SLAB_PANIC,
38 NULL, NULL);
39 delayacct_tsk_init(&init_task);
40}
41
42void __delayacct_tsk_init(struct task_struct *tsk)
43{
44 spin_lock_init(&tsk->delays_lock);
45 /* No need to acquire tsk->delays_lock for allocation here unless
46 __delayacct_tsk_init called after tsk is attached to tasklist
47 */
48 tsk->delays = kmem_cache_zalloc(delayacct_cache, SLAB_KERNEL);
49 if (tsk->delays)
50 spin_lock_init(&tsk->delays->lock);
51}
52
53void __delayacct_tsk_exit(struct task_struct *tsk)
54{
55 struct task_delay_info *delays = tsk->delays;
56 spin_lock(&tsk->delays_lock);
57 tsk->delays = NULL;
58 spin_unlock(&tsk->delays_lock);
59 kmem_cache_free(delayacct_cache, delays);
60}
61
62/*
63 * Start accounting for a delay statistic using
64 * its starting timestamp (@start)
65 */
66
67static inline void delayacct_start(struct timespec *start)
68{
69 do_posix_clock_monotonic_gettime(start);
70}
71
72/*
73 * Finish delay accounting for a statistic using
74 * its timestamps (@start, @end), accumalator (@total) and @count
75 */
76
77static void delayacct_end(struct timespec *start, struct timespec *end,
78 u64 *total, u32 *count)
79{
80 struct timespec ts;
81 s64 ns;
82
83 do_posix_clock_monotonic_gettime(end);
84 ts = timespec_sub(*end, *start);
85 ns = timespec_to_ns(&ts);
86 if (ns < 0)
87 return;
88
89 spin_lock(&current->delays->lock);
90 *total += ns;
91 (*count)++;
92 spin_unlock(&current->delays->lock);
93}
94
95void __delayacct_blkio_start(void)
96{
97 delayacct_start(&current->delays->blkio_start);
98}
99
100void __delayacct_blkio_end(void)
101{
102 if (current->delays->flags & DELAYACCT_PF_SWAPIN)
103 /* Swapin block I/O */
104 delayacct_end(&current->delays->blkio_start,
105 &current->delays->blkio_end,
106 &current->delays->swapin_delay,
107 &current->delays->swapin_count);
108 else /* Other block I/O */
109 delayacct_end(&current->delays->blkio_start,
110 &current->delays->blkio_end,
111 &current->delays->blkio_delay,
112 &current->delays->blkio_count);
113}
114
115int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
116{
117 s64 tmp;
118 struct timespec ts;
119 unsigned long t1,t2,t3;
120
121 spin_lock(&tsk->delays_lock);
122
123 /* Though tsk->delays accessed later, early exit avoids
124 * unnecessary returning of other data
125 */
126 if (!tsk->delays)
127 goto done;
128
129 tmp = (s64)d->cpu_run_real_total;
130 cputime_to_timespec(tsk->utime + tsk->stime, &ts);
131 tmp += timespec_to_ns(&ts);
132 d->cpu_run_real_total = (tmp < (s64)d->cpu_run_real_total) ? 0 : tmp;
133
134 /*
135 * No locking available for sched_info (and too expensive to add one)
136 * Mitigate by taking snapshot of values
137 */
138 t1 = tsk->sched_info.pcnt;
139 t2 = tsk->sched_info.run_delay;
140 t3 = tsk->sched_info.cpu_time;
141
142 d->cpu_count += t1;
143
144 jiffies_to_timespec(t2, &ts);
145 tmp = (s64)d->cpu_delay_total + timespec_to_ns(&ts);
146 d->cpu_delay_total = (tmp < (s64)d->cpu_delay_total) ? 0 : tmp;
147
148 tmp = (s64)d->cpu_run_virtual_total + (s64)jiffies_to_usecs(t3) * 1000;
149 d->cpu_run_virtual_total =
150 (tmp < (s64)d->cpu_run_virtual_total) ? 0 : tmp;
151
152 /* zero XXX_total, non-zero XXX_count implies XXX stat overflowed */
153
154 spin_lock(&tsk->delays->lock);
155 tmp = d->blkio_delay_total + tsk->delays->blkio_delay;
156 d->blkio_delay_total = (tmp < d->blkio_delay_total) ? 0 : tmp;
157 tmp = d->swapin_delay_total + tsk->delays->swapin_delay;
158 d->swapin_delay_total = (tmp < d->swapin_delay_total) ? 0 : tmp;
159 d->blkio_count += tsk->delays->blkio_count;
160 d->swapin_count += tsk->delays->swapin_count;
161 spin_unlock(&tsk->delays->lock);
162
163done:
164 spin_unlock(&tsk->delays_lock);
165 return 0;
166}
167
168__u64 __delayacct_blkio_ticks(struct task_struct *tsk)
169{
170 __u64 ret;
171
172 spin_lock(&tsk->delays->lock);
173 ret = nsec_to_clock_t(tsk->delays->blkio_delay +
174 tsk->delays->swapin_delay);
175 spin_unlock(&tsk->delays->lock);
176 return ret;
177}
178
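The new delayacct.c above follows one simple pattern: stamp a monotonic start time, stamp an end time, and fold the nanosecond difference into a running total plus a sample count. A user-space sketch of that accumulate-delay idiom with clock_gettime(); the names delay_begin/delay_end and struct delay_stats are made up for illustration and are not the kernel interfaces:

#include <stdint.h>
#include <stdio.h>
#include <time.h>

struct delay_stats {
        uint64_t total_ns;   /* accumulated delay */
        uint32_t count;      /* number of samples folded in */
};

static void delay_begin(struct timespec *start)
{
        clock_gettime(CLOCK_MONOTONIC, start);
}

static void delay_end(const struct timespec *start, struct delay_stats *st)
{
        struct timespec end;
        int64_t ns;

        clock_gettime(CLOCK_MONOTONIC, &end);
        ns = (int64_t)(end.tv_sec - start->tv_sec) * 1000000000LL
             + (end.tv_nsec - start->tv_nsec);
        if (ns < 0)             /* ignore a nonsensical (negative) sample */
                return;
        st->total_ns += (uint64_t)ns;
        st->count++;
}

int main(void)
{
        struct delay_stats io = { 0, 0 };
        struct timespec t;

        delay_begin(&t);
        /* ... the delay being measured would happen here ... */
        delay_end(&t, &io);
        printf("%u samples, %llu ns total\n", io.count,
               (unsigned long long)io.total_ns);
        return 0;
}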
diff --git a/kernel/exit.c b/kernel/exit.c
index 6664c084783d..dba194a8d416 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -25,6 +25,8 @@
 #include <linux/mount.h>
 #include <linux/proc_fs.h>
 #include <linux/mempolicy.h>
+#include <linux/taskstats_kern.h>
+#include <linux/delayacct.h>
 #include <linux/cpuset.h>
 #include <linux/syscalls.h>
 #include <linux/signal.h>
@@ -843,7 +845,9 @@ static void exit_notify(struct task_struct *tsk)
 fastcall NORET_TYPE void do_exit(long code)
 {
         struct task_struct *tsk = current;
+        struct taskstats *tidstats;
         int group_dead;
+        unsigned int mycpu;
 
         profile_task_exit(tsk);
 
@@ -881,6 +885,8 @@ fastcall NORET_TYPE void do_exit(long code)
                         current->comm, current->pid,
                         preempt_count());
 
+        taskstats_exit_alloc(&tidstats, &mycpu);
+
         acct_update_integrals(tsk);
         if (tsk->mm) {
                 update_hiwater_rss(tsk->mm);
@@ -900,6 +906,10 @@ fastcall NORET_TYPE void do_exit(long code)
 #endif
         if (unlikely(tsk->audit_context))
                 audit_free(tsk);
+        taskstats_exit_send(tsk, tidstats, group_dead, mycpu);
+        taskstats_exit_free(tidstats);
+        delayacct_tsk_exit(tsk);
+
         exit_mm(tsk);
 
         if (group_dead)
diff --git a/kernel/fork.c b/kernel/fork.c
index 56e4e07e45f7..1b0f7b1e0881 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -43,6 +43,8 @@
 #include <linux/rmap.h>
 #include <linux/acct.h>
 #include <linux/cn_proc.h>
+#include <linux/delayacct.h>
+#include <linux/taskstats_kern.h>
 
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
@@ -61,9 +63,7 @@ int max_threads; /* tunable limit on nr_threads */
 
 DEFINE_PER_CPU(unsigned long, process_counts) = 0;
 
- __cacheline_aligned DEFINE_RWLOCK(tasklist_lock);  /* outer */
-
-EXPORT_SYMBOL(tasklist_lock);
+__cacheline_aligned DEFINE_RWLOCK(tasklist_lock);  /* outer */
 
 int nr_processes(void)
 {
@@ -820,6 +820,7 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts
         if (clone_flags & CLONE_THREAD) {
                 atomic_inc(&current->signal->count);
                 atomic_inc(&current->signal->live);
+                taskstats_tgid_alloc(current->signal);
                 return 0;
         }
         sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL);
@@ -864,6 +865,7 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts
         INIT_LIST_HEAD(&sig->cpu_timers[0]);
         INIT_LIST_HEAD(&sig->cpu_timers[1]);
         INIT_LIST_HEAD(&sig->cpu_timers[2]);
+        taskstats_tgid_init(sig);
 
         task_lock(current->group_leader);
         memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim);
@@ -885,6 +887,7 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts
 void __cleanup_signal(struct signal_struct *sig)
 {
         exit_thread_group_keys(sig);
+        taskstats_tgid_free(sig);
         kmem_cache_free(signal_cachep, sig);
 }
 
@@ -1002,6 +1005,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
                 goto bad_fork_cleanup_put_domain;
 
         p->did_exec = 0;
+        delayacct_tsk_init(p);  /* Must remain after dup_task_struct() */
         copy_flags(clone_flags, p);
         p->pid = pid;
         retval = -EFAULT;
diff --git a/kernel/futex.c b/kernel/futex.c
index 1dc98e4dd287..dda2049692a2 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -415,15 +415,15 @@ out_unlock:
415 */ 415 */
416void exit_pi_state_list(struct task_struct *curr) 416void exit_pi_state_list(struct task_struct *curr)
417{ 417{
418 struct futex_hash_bucket *hb;
419 struct list_head *next, *head = &curr->pi_state_list; 418 struct list_head *next, *head = &curr->pi_state_list;
420 struct futex_pi_state *pi_state; 419 struct futex_pi_state *pi_state;
420 struct futex_hash_bucket *hb;
421 union futex_key key; 421 union futex_key key;
422 422
423 /* 423 /*
424 * We are a ZOMBIE and nobody can enqueue itself on 424 * We are a ZOMBIE and nobody can enqueue itself on
425 * pi_state_list anymore, but we have to be careful 425 * pi_state_list anymore, but we have to be careful
426 * versus waiters unqueueing themselfs 426 * versus waiters unqueueing themselves:
427 */ 427 */
428 spin_lock_irq(&curr->pi_lock); 428 spin_lock_irq(&curr->pi_lock);
429 while (!list_empty(head)) { 429 while (!list_empty(head)) {
@@ -431,21 +431,24 @@ void exit_pi_state_list(struct task_struct *curr)
431 next = head->next; 431 next = head->next;
432 pi_state = list_entry(next, struct futex_pi_state, list); 432 pi_state = list_entry(next, struct futex_pi_state, list);
433 key = pi_state->key; 433 key = pi_state->key;
434 hb = hash_futex(&key);
434 spin_unlock_irq(&curr->pi_lock); 435 spin_unlock_irq(&curr->pi_lock);
435 436
436 hb = hash_futex(&key);
437 spin_lock(&hb->lock); 437 spin_lock(&hb->lock);
438 438
439 spin_lock_irq(&curr->pi_lock); 439 spin_lock_irq(&curr->pi_lock);
440 /*
441 * We dropped the pi-lock, so re-check whether this
442 * task still owns the PI-state:
443 */
440 if (head->next != next) { 444 if (head->next != next) {
441 spin_unlock(&hb->lock); 445 spin_unlock(&hb->lock);
442 continue; 446 continue;
443 } 447 }
444 448
445 list_del_init(&pi_state->list);
446
447 WARN_ON(pi_state->owner != curr); 449 WARN_ON(pi_state->owner != curr);
448 450 WARN_ON(list_empty(&pi_state->list));
451 list_del_init(&pi_state->list);
449 pi_state->owner = NULL; 452 pi_state->owner = NULL;
450 spin_unlock_irq(&curr->pi_lock); 453 spin_unlock_irq(&curr->pi_lock);
451 454
@@ -470,12 +473,20 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me)
470 head = &hb->chain; 473 head = &hb->chain;
471 474
472 list_for_each_entry_safe(this, next, head, list) { 475 list_for_each_entry_safe(this, next, head, list) {
473 if (match_futex (&this->key, &me->key)) { 476 if (match_futex(&this->key, &me->key)) {
474 /* 477 /*
475 * Another waiter already exists - bump up 478 * Another waiter already exists - bump up
476 * the refcount and return its pi_state: 479 * the refcount and return its pi_state:
477 */ 480 */
478 pi_state = this->pi_state; 481 pi_state = this->pi_state;
482 /*
483 * Userspace might have messed up non PI and PI futexes
484 */
485 if (unlikely(!pi_state))
486 return -EINVAL;
487
488 WARN_ON(!atomic_read(&pi_state->refcount));
489
479 atomic_inc(&pi_state->refcount); 490 atomic_inc(&pi_state->refcount);
480 me->pi_state = pi_state; 491 me->pi_state = pi_state;
481 492
@@ -484,10 +495,13 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me)
484 } 495 }
485 496
486 /* 497 /*
487 * We are the first waiter - try to look up the real owner and 498 * We are the first waiter - try to look up the real owner and attach
488 * attach the new pi_state to it: 499 * the new pi_state to it, but bail out when the owner died bit is set
500 * and TID = 0:
489 */ 501 */
490 pid = uval & FUTEX_TID_MASK; 502 pid = uval & FUTEX_TID_MASK;
503 if (!pid && (uval & FUTEX_OWNER_DIED))
504 return -ESRCH;
491 p = futex_find_get_task(pid); 505 p = futex_find_get_task(pid);
492 if (!p) 506 if (!p)
493 return -ESRCH; 507 return -ESRCH;
@@ -504,6 +518,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me)
504 pi_state->key = me->key; 518 pi_state->key = me->key;
505 519
506 spin_lock_irq(&p->pi_lock); 520 spin_lock_irq(&p->pi_lock);
521 WARN_ON(!list_empty(&pi_state->list));
507 list_add(&pi_state->list, &p->pi_state_list); 522 list_add(&pi_state->list, &p->pi_state_list);
508 pi_state->owner = p; 523 pi_state->owner = p;
509 spin_unlock_irq(&p->pi_lock); 524 spin_unlock_irq(&p->pi_lock);
@@ -567,20 +582,29 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
567 * kept enabled while there is PI state around. We must also 582 * kept enabled while there is PI state around. We must also
568 * preserve the owner died bit.) 583 * preserve the owner died bit.)
569 */ 584 */
570 newval = (uval & FUTEX_OWNER_DIED) | FUTEX_WAITERS | new_owner->pid; 585 if (!(uval & FUTEX_OWNER_DIED)) {
586 newval = FUTEX_WAITERS | new_owner->pid;
571 587
572 inc_preempt_count(); 588 inc_preempt_count();
573 curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval); 589 curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
574 dec_preempt_count(); 590 dec_preempt_count();
591 if (curval == -EFAULT)
592 return -EFAULT;
593 if (curval != uval)
594 return -EINVAL;
595 }
575 596
576 if (curval == -EFAULT) 597 spin_lock_irq(&pi_state->owner->pi_lock);
577 return -EFAULT; 598 WARN_ON(list_empty(&pi_state->list));
578 if (curval != uval) 599 list_del_init(&pi_state->list);
579 return -EINVAL; 600 spin_unlock_irq(&pi_state->owner->pi_lock);
580 601
581 list_del_init(&pi_state->owner->pi_state_list); 602 spin_lock_irq(&new_owner->pi_lock);
603 WARN_ON(!list_empty(&pi_state->list));
582 list_add(&pi_state->list, &new_owner->pi_state_list); 604 list_add(&pi_state->list, &new_owner->pi_state_list);
583 pi_state->owner = new_owner; 605 pi_state->owner = new_owner;
606 spin_unlock_irq(&new_owner->pi_lock);
607
584 rt_mutex_unlock(&pi_state->pi_mutex); 608 rt_mutex_unlock(&pi_state->pi_mutex);
585 609
586 return 0; 610 return 0;
@@ -1230,6 +1254,7 @@ static int do_futex_lock_pi(u32 __user *uaddr, int detect, int trylock,
1230 /* Owner died? */ 1254 /* Owner died? */
1231 if (q.pi_state->owner != NULL) { 1255 if (q.pi_state->owner != NULL) {
1232 spin_lock_irq(&q.pi_state->owner->pi_lock); 1256 spin_lock_irq(&q.pi_state->owner->pi_lock);
1257 WARN_ON(list_empty(&q.pi_state->list));
1233 list_del_init(&q.pi_state->list); 1258 list_del_init(&q.pi_state->list);
1234 spin_unlock_irq(&q.pi_state->owner->pi_lock); 1259 spin_unlock_irq(&q.pi_state->owner->pi_lock);
1235 } else 1260 } else
@@ -1238,6 +1263,7 @@ static int do_futex_lock_pi(u32 __user *uaddr, int detect, int trylock,
1238 q.pi_state->owner = current; 1263 q.pi_state->owner = current;
1239 1264
1240 spin_lock_irq(&current->pi_lock); 1265 spin_lock_irq(&current->pi_lock);
1266 WARN_ON(!list_empty(&q.pi_state->list));
1241 list_add(&q.pi_state->list, &current->pi_state_list); 1267 list_add(&q.pi_state->list, &current->pi_state_list);
1242 spin_unlock_irq(&current->pi_lock); 1268 spin_unlock_irq(&current->pi_lock);
1243 1269
@@ -1421,9 +1447,11 @@ retry_locked:
1421 * again. If it succeeds then we can return without waking 1447 * again. If it succeeds then we can return without waking
1422 * anyone else up: 1448 * anyone else up:
1423 */ 1449 */
1424 inc_preempt_count(); 1450 if (!(uval & FUTEX_OWNER_DIED)) {
1425 uval = futex_atomic_cmpxchg_inatomic(uaddr, current->pid, 0); 1451 inc_preempt_count();
1426 dec_preempt_count(); 1452 uval = futex_atomic_cmpxchg_inatomic(uaddr, current->pid, 0);
1453 dec_preempt_count();
1454 }
1427 1455
1428 if (unlikely(uval == -EFAULT)) 1456 if (unlikely(uval == -EFAULT))
1429 goto pi_faulted; 1457 goto pi_faulted;
@@ -1456,9 +1484,11 @@ retry_locked:
1456 /* 1484 /*
1457 * No waiters - kernel unlocks the futex: 1485 * No waiters - kernel unlocks the futex:
1458 */ 1486 */
1459 ret = unlock_futex_pi(uaddr, uval); 1487 if (!(uval & FUTEX_OWNER_DIED)) {
1460 if (ret == -EFAULT) 1488 ret = unlock_futex_pi(uaddr, uval);
1461 goto pi_faulted; 1489 if (ret == -EFAULT)
1490 goto pi_faulted;
1491 }
1462 1492
1463out_unlock: 1493out_unlock:
1464 spin_unlock(&hb->lock); 1494 spin_unlock(&hb->lock);
@@ -1677,9 +1707,9 @@ err_unlock:
1677 * Process a futex-list entry, check whether it's owned by the 1707 * Process a futex-list entry, check whether it's owned by the
1678 * dying task, and do notification if so: 1708 * dying task, and do notification if so:
1679 */ 1709 */
1680int handle_futex_death(u32 __user *uaddr, struct task_struct *curr) 1710int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi)
1681{ 1711{
1682 u32 uval, nval; 1712 u32 uval, nval, mval;
1683 1713
1684retry: 1714retry:
1685 if (get_user(uval, uaddr)) 1715 if (get_user(uval, uaddr))
@@ -1696,21 +1726,45 @@ retry:
1696 * thread-death.) The rest of the cleanup is done in 1726 * thread-death.) The rest of the cleanup is done in
1697 * userspace. 1727 * userspace.
1698 */ 1728 */
1699 nval = futex_atomic_cmpxchg_inatomic(uaddr, uval, 1729 mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;
1700 uval | FUTEX_OWNER_DIED); 1730 nval = futex_atomic_cmpxchg_inatomic(uaddr, uval, mval);
1731
1701 if (nval == -EFAULT) 1732 if (nval == -EFAULT)
1702 return -1; 1733 return -1;
1703 1734
1704 if (nval != uval) 1735 if (nval != uval)
1705 goto retry; 1736 goto retry;
1706 1737
1707 if (uval & FUTEX_WAITERS) 1738 /*
1708 futex_wake(uaddr, 1); 1739 * Wake robust non-PI futexes here. The wakeup of
1740 * PI futexes happens in exit_pi_state():
1741 */
1742 if (!pi) {
1743 if (uval & FUTEX_WAITERS)
1744 futex_wake(uaddr, 1);
1745 }
1709 } 1746 }
1710 return 0; 1747 return 0;
1711} 1748}
1712 1749
1713/* 1750/*
1751 * Fetch a robust-list pointer. Bit 0 signals PI futexes:
1752 */
1753static inline int fetch_robust_entry(struct robust_list __user **entry,
1754 struct robust_list __user **head, int *pi)
1755{
1756 unsigned long uentry;
1757
1758 if (get_user(uentry, (unsigned long *)head))
1759 return -EFAULT;
1760
1761 *entry = (void *)(uentry & ~1UL);
1762 *pi = uentry & 1;
1763
1764 return 0;
1765}
1766
1767/*
1714 * Walk curr->robust_list (very carefully, it's a userspace list!) 1768 * Walk curr->robust_list (very carefully, it's a userspace list!)
1715 * and mark any locks found there dead, and notify any waiters. 1769 * and mark any locks found there dead, and notify any waiters.
1716 * 1770 *
@@ -1720,14 +1774,14 @@ void exit_robust_list(struct task_struct *curr)
1720{ 1774{
1721 struct robust_list_head __user *head = curr->robust_list; 1775 struct robust_list_head __user *head = curr->robust_list;
1722 struct robust_list __user *entry, *pending; 1776 struct robust_list __user *entry, *pending;
1723 unsigned int limit = ROBUST_LIST_LIMIT; 1777 unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
1724 unsigned long futex_offset; 1778 unsigned long futex_offset;
1725 1779
1726 /* 1780 /*
1727 * Fetch the list head (which was registered earlier, via 1781 * Fetch the list head (which was registered earlier, via
1728 * sys_set_robust_list()): 1782 * sys_set_robust_list()):
1729 */ 1783 */
1730 if (get_user(entry, &head->list.next)) 1784 if (fetch_robust_entry(&entry, &head->list.next, &pi))
1731 return; 1785 return;
1732 /* 1786 /*
1733 * Fetch the relative futex offset: 1787 * Fetch the relative futex offset:
@@ -1738,10 +1792,11 @@ void exit_robust_list(struct task_struct *curr)
1738 * Fetch any possibly pending lock-add first, and handle it 1792 * Fetch any possibly pending lock-add first, and handle it
1739 * if it exists: 1793 * if it exists:
1740 */ 1794 */
1741 if (get_user(pending, &head->list_op_pending)) 1795 if (fetch_robust_entry(&pending, &head->list_op_pending, &pip))
1742 return; 1796 return;
1797
1743 if (pending) 1798 if (pending)
1744 handle_futex_death((void *)pending + futex_offset, curr); 1799 handle_futex_death((void *)pending + futex_offset, curr, pip);
1745 1800
1746 while (entry != &head->list) { 1801 while (entry != &head->list) {
1747 /* 1802 /*
@@ -1750,12 +1805,12 @@ void exit_robust_list(struct task_struct *curr)
1750 */ 1805 */
1751 if (entry != pending) 1806 if (entry != pending)
1752 if (handle_futex_death((void *)entry + futex_offset, 1807 if (handle_futex_death((void *)entry + futex_offset,
1753 curr)) 1808 curr, pi))
1754 return; 1809 return;
1755 /* 1810 /*
1756 * Fetch the next entry in the list: 1811 * Fetch the next entry in the list:
1757 */ 1812 */
1758 if (get_user(entry, &entry->next)) 1813 if (fetch_robust_entry(&entry, &entry->next, &pi))
1759 return; 1814 return;
1760 /* 1815 /*
1761 * Avoid excessively long or circular lists: 1816 * Avoid excessively long or circular lists:
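The futex.c changes above introduce fetch_robust_entry(), which decodes a user-space pointer whose low bit doubles as a "this is a PI futex" flag; that works because the list entries are word-aligned, so bit 0 is otherwise always zero. A stand-alone sketch of that tagged-pointer encoding, with hypothetical pack/unpack helpers rather than the kernel's functions:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

struct robust_entry { struct robust_entry *next; };

/* Pack an aligned pointer and a 1-bit flag into one word. */
static uintptr_t pack(struct robust_entry *e, int pi)
{
        assert(((uintptr_t)e & 1) == 0);   /* alignment frees up bit 0 */
        return (uintptr_t)e | (pi ? 1 : 0);
}

/* Unpack: bit 0 is the flag, the rest is the pointer. */
static struct robust_entry *unpack(uintptr_t word, int *pi)
{
        *pi = (int)(word & 1);
        return (struct robust_entry *)(word & ~(uintptr_t)1);
}

int main(void)
{
        struct robust_entry e;
        int pi;
        uintptr_t word = pack(&e, 1);
        struct robust_entry *p = unpack(word, &pi);

        printf("pointer ok: %d, pi flag: %d\n", p == &e, pi);
        return 0;
}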
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index d1d92b441fb7..d1aab1a452cc 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -12,6 +12,23 @@
12 12
13#include <asm/uaccess.h> 13#include <asm/uaccess.h>
14 14
15
16/*
17 * Fetch a robust-list pointer. Bit 0 signals PI futexes:
18 */
19static inline int
20fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry,
21 compat_uptr_t *head, int *pi)
22{
23 if (get_user(*uentry, head))
24 return -EFAULT;
25
26 *entry = compat_ptr((*uentry) & ~1);
27 *pi = (unsigned int)(*uentry) & 1;
28
29 return 0;
30}
31
15/* 32/*
16 * Walk curr->robust_list (very carefully, it's a userspace list!) 33 * Walk curr->robust_list (very carefully, it's a userspace list!)
17 * and mark any locks found there dead, and notify any waiters. 34 * and mark any locks found there dead, and notify any waiters.
@@ -22,17 +39,16 @@ void compat_exit_robust_list(struct task_struct *curr)
22{ 39{
23 struct compat_robust_list_head __user *head = curr->compat_robust_list; 40 struct compat_robust_list_head __user *head = curr->compat_robust_list;
24 struct robust_list __user *entry, *pending; 41 struct robust_list __user *entry, *pending;
42 unsigned int limit = ROBUST_LIST_LIMIT, pi;
25 compat_uptr_t uentry, upending; 43 compat_uptr_t uentry, upending;
26 unsigned int limit = ROBUST_LIST_LIMIT;
27 compat_long_t futex_offset; 44 compat_long_t futex_offset;
28 45
29 /* 46 /*
30 * Fetch the list head (which was registered earlier, via 47 * Fetch the list head (which was registered earlier, via
31 * sys_set_robust_list()): 48 * sys_set_robust_list()):
32 */ 49 */
33 if (get_user(uentry, &head->list.next)) 50 if (fetch_robust_entry(&uentry, &entry, &head->list.next, &pi))
34 return; 51 return;
35 entry = compat_ptr(uentry);
36 /* 52 /*
37 * Fetch the relative futex offset: 53 * Fetch the relative futex offset:
38 */ 54 */
@@ -42,11 +58,11 @@ void compat_exit_robust_list(struct task_struct *curr)
42 * Fetch any possibly pending lock-add first, and handle it 58 * Fetch any possibly pending lock-add first, and handle it
43 * if it exists: 59 * if it exists:
44 */ 60 */
45 if (get_user(upending, &head->list_op_pending)) 61 if (fetch_robust_entry(&upending, &pending,
62 &head->list_op_pending, &pi))
46 return; 63 return;
47 pending = compat_ptr(upending);
48 if (upending) 64 if (upending)
49 handle_futex_death((void *)pending + futex_offset, curr); 65 handle_futex_death((void *)pending + futex_offset, curr, pi);
50 66
51 while (compat_ptr(uentry) != &head->list) { 67 while (compat_ptr(uentry) != &head->list) {
52 /* 68 /*
@@ -55,15 +71,15 @@ void compat_exit_robust_list(struct task_struct *curr)
55 */ 71 */
56 if (entry != pending) 72 if (entry != pending)
57 if (handle_futex_death((void *)entry + futex_offset, 73 if (handle_futex_death((void *)entry + futex_offset,
58 curr)) 74 curr, pi))
59 return; 75 return;
60 76
61 /* 77 /*
62 * Fetch the next entry in the list: 78 * Fetch the next entry in the list:
63 */ 79 */
64 if (get_user(uentry, (compat_uptr_t *)&entry->next)) 80 if (fetch_robust_entry(&uentry, &entry,
81 (compat_uptr_t *)&entry->next, &pi))
65 return; 82 return;
66 entry = compat_ptr(uentry);
67 /* 83 /*
68 * Avoid excessively long or circular lists: 84 * Avoid excessively long or circular lists:
69 */ 85 */
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index d17766d40dab..be989efc7856 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -835,7 +835,7 @@ static void migrate_hrtimers(int cpu)
 }
 #endif /* CONFIG_HOTPLUG_CPU */
 
-static int __devinit hrtimer_cpu_notify(struct notifier_block *self,
+static int __cpuinit hrtimer_cpu_notify(struct notifier_block *self,
                                         unsigned long action, void *hcpu)
 {
         long cpu = (long)hcpu;
@@ -859,7 +859,7 @@ static int __devinit hrtimer_cpu_notify(struct notifier_block *self,
         return NOTIFY_OK;
 }
 
-static struct notifier_block __devinitdata hrtimers_nb = {
+static struct notifier_block __cpuinitdata hrtimers_nb = {
         .notifier_call = hrtimer_cpu_notify,
 };
 
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 4e461438e48b..92be519eff26 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -137,16 +137,40 @@ EXPORT_SYMBOL(enable_irq);
  * @irq: interrupt to control
  * @on: enable/disable power management wakeup
  *
- * Enable/disable power management wakeup mode
+ * Enable/disable power management wakeup mode, which is
+ * disabled by default. Enables and disables must match,
+ * just as they match for non-wakeup mode support.
+ *
+ * Wakeup mode lets this IRQ wake the system from sleep
+ * states like "suspend to RAM".
  */
 int set_irq_wake(unsigned int irq, unsigned int on)
 {
         struct irq_desc *desc = irq_desc + irq;
         unsigned long flags;
         int ret = -ENXIO;
+        int (*set_wake)(unsigned, unsigned) = desc->chip->set_wake;
 
+        /* wakeup-capable irqs can be shared between drivers that
+         * don't need to have the same sleep mode behaviors.
+         */
         spin_lock_irqsave(&desc->lock, flags);
-        if (desc->chip->set_wake)
+        if (on) {
+                if (desc->wake_depth++ == 0)
+                        desc->status |= IRQ_WAKEUP;
+                else
+                        set_wake = NULL;
+        } else {
+                if (desc->wake_depth == 0) {
+                        printk(KERN_WARNING "Unbalanced IRQ %d "
+                                        "wake disable\n", irq);
+                        WARN_ON(1);
+                } else if (--desc->wake_depth == 0)
+                        desc->status &= ~IRQ_WAKEUP;
+                else
+                        set_wake = NULL;
+        }
+        if (set_wake)
                 ret = desc->chip->set_wake(irq, on);
         spin_unlock_irqrestore(&desc->lock, flags);
         return ret;
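The new set_irq_wake() logic above is a depth counter: the first enable flips the status flag and calls into the chip, later enables only bump the count, and disables must balance the enables, with a loud warning for an unbalanced disable. A small sketch of that balanced enable/disable pattern with invented names (wake_enable/wake_disable), not the kernel API:

#include <stdio.h>

static unsigned int wake_depth;   /* outstanding enables */
static int wake_active;           /* plays the role of the IRQ_WAKEUP status bit */

static void wake_enable(void)
{
        if (wake_depth++ == 0)
                wake_active = 1;          /* only the first enable does real work */
}

static void wake_disable(void)
{
        if (wake_depth == 0) {
                fprintf(stderr, "unbalanced wake disable\n");
                return;
        }
        if (--wake_depth == 0)
                wake_active = 0;          /* only the last disable does real work */
}

int main(void)
{
        wake_enable();
        wake_enable();
        wake_disable();
        printf("still active: %d\n", wake_active);  /* 1: one enable remains */
        wake_disable();
        printf("still active: %d\n", wake_active);  /* 0: fully disabled */
        wake_disable();                              /* warns: unbalanced */
        return 0;
}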
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 39277dd6bf90..ab16a5a4cfe9 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -275,8 +275,8 @@ static void upcase_if_global(struct kallsym_iter *iter)
 static int get_ksymbol_mod(struct kallsym_iter *iter)
 {
         iter->owner = module_get_kallsym(iter->pos - kallsyms_num_syms,
-                                         &iter->value,
-                                         &iter->type, iter->name);
+                                         &iter->value, &iter->type,
+                                         iter->name, sizeof(iter->name));
         if (iter->owner == NULL)
                 return 0;
 
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 64aab081153b..3f57dfdc8f92 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -393,6 +393,7 @@ static int __kprobes add_new_kprobe(struct kprobe *old_p, struct kprobe *p)
 static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p)
 {
         copy_kprobe(p, ap);
+        flush_insn_slot(ap);
         ap->addr = p->addr;
         ap->pre_handler = aggr_pre_handler;
         ap->fault_handler = aggr_fault_handler;
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 24be714b04c7..4f9c60ef95e8 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -216,23 +216,6 @@ EXPORT_SYMBOL(kthread_bind);
  */
 int kthread_stop(struct task_struct *k)
 {
-        return kthread_stop_sem(k, NULL);
-}
-EXPORT_SYMBOL(kthread_stop);
-
-/**
- * kthread_stop_sem - stop a thread created by kthread_create().
- * @k: thread created by kthread_create().
- * @s: semaphore that @k waits on while idle.
- *
- * Does essentially the same thing as kthread_stop() above, but wakes
- * @k by calling up(@s).
- *
- * Returns the result of threadfn(), or %-EINTR if wake_up_process()
- * was never called.
- */
-int kthread_stop_sem(struct task_struct *k, struct semaphore *s)
-{
         int ret;
 
         mutex_lock(&kthread_stop_lock);
@@ -246,10 +229,7 @@ int kthread_stop_sem(struct task_struct *k, struct semaphore *s)
 
         /* Now set kthread_should_stop() to true, and wake it up. */
         kthread_stop_info.k = k;
-        if (s)
-                up(s);
-        else
-                wake_up_process(k);
+        wake_up_process(k);
         put_task_struct(k);
 
         /* Once it dies, reset stop ptr, gather result and we're done. */
@@ -260,7 +240,7 @@ int kthread_stop_sem(struct task_struct *k, struct semaphore *s)
 
         return ret;
 }
-EXPORT_SYMBOL(kthread_stop_sem);
+EXPORT_SYMBOL(kthread_stop);
 
 static __init int helper_init(void)
 {
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index f32ca78c198d..9bad17884513 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -169,22 +169,17 @@ EXPORT_SYMBOL(lockdep_internal);
169 */ 169 */
170static int class_filter(struct lock_class *class) 170static int class_filter(struct lock_class *class)
171{ 171{
172#if 0
173 /* Example */
172 if (class->name_version == 1 && 174 if (class->name_version == 1 &&
173 !strcmp(class->name, "&rl->lock")) 175 !strcmp(class->name, "lockname"))
174 return 1; 176 return 1;
175 if (class->name_version == 1 && 177 if (class->name_version == 1 &&
176 !strcmp(class->name, "&ni->mrec_lock")) 178 !strcmp(class->name, "&struct->lockfield"))
177 return 1; 179 return 1;
178 if (class->name_version == 1 && 180#endif
179 !strcmp(class->name, "mft_ni_runlist_lock")) 181 /* Allow everything else. 0 would be filter everything else */
180 return 1; 182 return 1;
181 if (class->name_version == 1 &&
182 !strcmp(class->name, "mft_ni_mrec_lock"))
183 return 1;
184 if (class->name_version == 1 &&
185 !strcmp(class->name, "&vol->lcnbmp_lock"))
186 return 1;
187 return 0;
188} 183}
189#endif 184#endif
190 185
@@ -408,23 +403,12 @@ static void lockdep_print_held_locks(struct task_struct *curr)
408 print_lock(curr->held_locks + i); 403 print_lock(curr->held_locks + i);
409 } 404 }
410} 405}
411/*
412 * Helper to print a nice hierarchy of lock dependencies:
413 */
414static void print_spaces(int nr)
415{
416 int i;
417
418 for (i = 0; i < nr; i++)
419 printk(" ");
420}
421 406
422static void print_lock_class_header(struct lock_class *class, int depth) 407static void print_lock_class_header(struct lock_class *class, int depth)
423{ 408{
424 int bit; 409 int bit;
425 410
426 print_spaces(depth); 411 printk("%*s->", depth, "");
427 printk("->");
428 print_lock_name(class); 412 print_lock_name(class);
429 printk(" ops: %lu", class->ops); 413 printk(" ops: %lu", class->ops);
430 printk(" {\n"); 414 printk(" {\n");
@@ -433,17 +417,14 @@ static void print_lock_class_header(struct lock_class *class, int depth)
433 if (class->usage_mask & (1 << bit)) { 417 if (class->usage_mask & (1 << bit)) {
434 int len = depth; 418 int len = depth;
435 419
436 print_spaces(depth); 420 len += printk("%*s %s", depth, "", usage_str[bit]);
437 len += printk(" %s", usage_str[bit]);
438 len += printk(" at:\n"); 421 len += printk(" at:\n");
439 print_stack_trace(class->usage_traces + bit, len); 422 print_stack_trace(class->usage_traces + bit, len);
440 } 423 }
441 } 424 }
442 print_spaces(depth); 425 printk("%*s }\n", depth, "");
443 printk(" }\n");
444 426
445 print_spaces(depth); 427 printk("%*s ... key at: ",depth,"");
446 printk(" ... key at: ");
447 print_ip_sym((unsigned long)class->key); 428 print_ip_sym((unsigned long)class->key);
448} 429}
449 430
@@ -463,8 +444,7 @@ static void print_lock_dependencies(struct lock_class *class, int depth)
463 DEBUG_LOCKS_WARN_ON(!entry->class); 444 DEBUG_LOCKS_WARN_ON(!entry->class);
464 print_lock_dependencies(entry->class, depth + 1); 445 print_lock_dependencies(entry->class, depth + 1);
465 446
466 print_spaces(depth); 447 printk("%*s ... acquired at:\n",depth,"");
467 printk(" ... acquired at:\n");
468 print_stack_trace(&entry->trace, 2); 448 print_stack_trace(&entry->trace, 2);
469 printk("\n"); 449 printk("\n");
470 } 450 }
@@ -1124,7 +1104,7 @@ extern void __error_too_big_MAX_LOCKDEP_SUBCLASSES(void);
1124 * itself, so actual lookup of the hash should be once per lock object. 1104 * itself, so actual lookup of the hash should be once per lock object.
1125 */ 1105 */
1126static inline struct lock_class * 1106static inline struct lock_class *
1127register_lock_class(struct lockdep_map *lock, unsigned int subclass) 1107look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
1128{ 1108{
1129 struct lockdep_subclass_key *key; 1109 struct lockdep_subclass_key *key;
1130 struct list_head *hash_head; 1110 struct list_head *hash_head;
@@ -1168,7 +1148,26 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass)
1168 */ 1148 */
1169 list_for_each_entry(class, hash_head, hash_entry) 1149 list_for_each_entry(class, hash_head, hash_entry)
1170 if (class->key == key) 1150 if (class->key == key)
1171 goto out_set; 1151 return class;
1152
1153 return NULL;
1154}
1155
1156/*
1157 * Register a lock's class in the hash-table, if the class is not present
1158 * yet. Otherwise we look it up. We cache the result in the lock object
1159 * itself, so actual lookup of the hash should be once per lock object.
1160 */
1161static inline struct lock_class *
1162register_lock_class(struct lockdep_map *lock, unsigned int subclass)
1163{
1164 struct lockdep_subclass_key *key;
1165 struct list_head *hash_head;
1166 struct lock_class *class;
1167
1168 class = look_up_lock_class(lock, subclass);
1169 if (likely(class))
1170 return class;
1172 1171
1173 /* 1172 /*
1174 * Debug-check: all keys must be persistent! 1173 * Debug-check: all keys must be persistent!
@@ -1183,6 +1182,9 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass)
1183 return NULL; 1182 return NULL;
1184 } 1183 }
1185 1184
1185 key = lock->key->subkeys + subclass;
1186 hash_head = classhashentry(key);
1187
1186 __raw_spin_lock(&hash_lock); 1188 __raw_spin_lock(&hash_lock);
1187 /* 1189 /*
1188 * We have to do the hash-walk again, to avoid races 1190 * We have to do the hash-walk again, to avoid races
@@ -1229,8 +1231,8 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass)
1229out_unlock_set: 1231out_unlock_set:
1230 __raw_spin_unlock(&hash_lock); 1232 __raw_spin_unlock(&hash_lock);
1231 1233
1232out_set: 1234 if (!subclass)
1233 lock->class[subclass] = class; 1235 lock->class_cache = class;
1234 1236
1235 DEBUG_LOCKS_WARN_ON(class->subclass != subclass); 1237 DEBUG_LOCKS_WARN_ON(class->subclass != subclass);
1236 1238
@@ -1934,7 +1936,7 @@ void lockdep_init_map(struct lockdep_map *lock, const char *name,
1934 } 1936 }
1935 lock->name = name; 1937 lock->name = name;
1936 lock->key = key; 1938 lock->key = key;
1937 memset(lock->class, 0, sizeof(lock->class[0])*MAX_LOCKDEP_SUBCLASSES); 1939 lock->class_cache = NULL;
1938} 1940}
1939 1941
1940EXPORT_SYMBOL_GPL(lockdep_init_map); 1942EXPORT_SYMBOL_GPL(lockdep_init_map);
@@ -1948,8 +1950,8 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
1948 unsigned long ip) 1950 unsigned long ip)
1949{ 1951{
1950 struct task_struct *curr = current; 1952 struct task_struct *curr = current;
1953 struct lock_class *class = NULL;
1951 struct held_lock *hlock; 1954 struct held_lock *hlock;
1952 struct lock_class *class;
1953 unsigned int depth, id; 1955 unsigned int depth, id;
1954 int chain_head = 0; 1956 int chain_head = 0;
1955 u64 chain_key; 1957 u64 chain_key;
@@ -1967,8 +1969,11 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
1967 return 0; 1969 return 0;
1968 } 1970 }
1969 1971
1970 class = lock->class[subclass]; 1972 if (!subclass)
1971 /* not cached yet? */ 1973 class = lock->class_cache;
1974 /*
1975 * Not cached yet or subclass?
1976 */
1972 if (unlikely(!class)) { 1977 if (unlikely(!class)) {
1973 class = register_lock_class(lock, subclass); 1978 class = register_lock_class(lock, subclass);
1974 if (!class) 1979 if (!class)
@@ -2469,48 +2474,44 @@ void lockdep_free_key_range(void *start, unsigned long size)
2469 2474
2470void lockdep_reset_lock(struct lockdep_map *lock) 2475void lockdep_reset_lock(struct lockdep_map *lock)
2471{ 2476{
2472 struct lock_class *class, *next, *entry; 2477 struct lock_class *class, *next;
2473 struct list_head *head; 2478 struct list_head *head;
2474 unsigned long flags; 2479 unsigned long flags;
2475 int i, j; 2480 int i, j;
2476 2481
2477 raw_local_irq_save(flags); 2482 raw_local_irq_save(flags);
2478 __raw_spin_lock(&hash_lock);
2479 2483
2480 /* 2484 /*
2481 * Remove all classes this lock has: 2485 * Remove all classes this lock might have:
2486 */
2487 for (j = 0; j < MAX_LOCKDEP_SUBCLASSES; j++) {
2488 /*
2489 * If the class exists we look it up and zap it:
2490 */
2491 class = look_up_lock_class(lock, j);
2492 if (class)
2493 zap_class(class);
2494 }
2495 /*
2496 * Debug check: in the end all mapped classes should
2497 * be gone.
2482 */ 2498 */
2499 __raw_spin_lock(&hash_lock);
2483 for (i = 0; i < CLASSHASH_SIZE; i++) { 2500 for (i = 0; i < CLASSHASH_SIZE; i++) {
2484 head = classhash_table + i; 2501 head = classhash_table + i;
2485 if (list_empty(head)) 2502 if (list_empty(head))
2486 continue; 2503 continue;
2487 list_for_each_entry_safe(class, next, head, hash_entry) { 2504 list_for_each_entry_safe(class, next, head, hash_entry) {
2488 for (j = 0; j < MAX_LOCKDEP_SUBCLASSES; j++) { 2505 if (unlikely(class == lock->class_cache)) {
2489 entry = lock->class[j]; 2506 __raw_spin_unlock(&hash_lock);
2490 if (class == entry) { 2507 DEBUG_LOCKS_WARN_ON(1);
2491 zap_class(class); 2508 goto out_restore;
2492 lock->class[j] = NULL;
2493 break;
2494 }
2495 } 2509 }
2496 } 2510 }
2497 } 2511 }
2498
2499 /*
2500 * Debug check: in the end all mapped classes should
2501 * be gone.
2502 */
2503 for (j = 0; j < MAX_LOCKDEP_SUBCLASSES; j++) {
2504 entry = lock->class[j];
2505 if (!entry)
2506 continue;
2507 __raw_spin_unlock(&hash_lock);
2508 DEBUG_LOCKS_WARN_ON(1);
2509 raw_local_irq_restore(flags);
2510 return;
2511 }
2512
2513 __raw_spin_unlock(&hash_lock); 2512 __raw_spin_unlock(&hash_lock);
2513
2514out_restore:
2514 raw_local_irq_restore(flags); 2515 raw_local_irq_restore(flags);
2515} 2516}
2516 2517
@@ -2571,7 +2572,7 @@ static inline int in_range(const void *start, const void *addr, const void *end)
2571 2572
2572static void 2573static void
2573print_freed_lock_bug(struct task_struct *curr, const void *mem_from, 2574print_freed_lock_bug(struct task_struct *curr, const void *mem_from,
2574 const void *mem_to) 2575 const void *mem_to, struct held_lock *hlock)
2575{ 2576{
2576 if (!debug_locks_off()) 2577 if (!debug_locks_off())
2577 return; 2578 return;
@@ -2583,6 +2584,7 @@ print_freed_lock_bug(struct task_struct *curr, const void *mem_from,
2583 printk( "-------------------------\n"); 2584 printk( "-------------------------\n");
2584 printk("%s/%d is freeing memory %p-%p, with a lock still held there!\n", 2585 printk("%s/%d is freeing memory %p-%p, with a lock still held there!\n",
2585 curr->comm, curr->pid, mem_from, mem_to-1); 2586 curr->comm, curr->pid, mem_from, mem_to-1);
2587 print_lock(hlock);
2586 lockdep_print_held_locks(curr); 2588 lockdep_print_held_locks(curr);
2587 2589
2588 printk("\nstack backtrace:\n"); 2590 printk("\nstack backtrace:\n");
@@ -2616,7 +2618,7 @@ void debug_check_no_locks_freed(const void *mem_from, unsigned long mem_len)
2616 !in_range(mem_from, lock_to, mem_to)) 2618 !in_range(mem_from, lock_to, mem_to))
2617 continue; 2619 continue;
2618 2620
2619 print_freed_lock_bug(curr, mem_from, mem_to); 2621 print_freed_lock_bug(curr, mem_from, mem_to, hlock);
2620 break; 2622 break;
2621 } 2623 }
2622 local_irq_restore(flags); 2624 local_irq_restore(flags);
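Several lockdep hunks above drop the print_spaces() helper in favour of printk's "%*s" field-width idiom, which pads an empty string out to the requested depth. A tiny illustration with printf, which uses the same format semantics; the function name print_header is made up for the example:

#include <stdio.h>

static void print_header(const char *name, int depth)
{
        /* "%*s" prints "" in a field 'depth' wide, i.e. depth spaces of indent. */
        printf("%*s-> %s {\n", depth, "", name);
        printf("%*s }\n", depth, "");
}

int main(void)
{
        print_header("outer_lock", 0);
        print_header("inner_lock", 4);
        return 0;
}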
diff --git a/kernel/module.c b/kernel/module.c
index 35e1b1f859d7..2a19cd47c046 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2019,10 +2019,8 @@ const char *module_address_lookup(unsigned long addr,
         return NULL;
 }
 
-struct module *module_get_kallsym(unsigned int symnum,
-                                  unsigned long *value,
-                                  char *type,
-                                  char namebuf[128])
+struct module *module_get_kallsym(unsigned int symnum, unsigned long *value,
+                                  char *type, char *name, size_t namelen)
 {
         struct module *mod;
 
@@ -2031,9 +2029,8 @@ struct module *module_get_kallsym(unsigned int symnum,
                 if (symnum < mod->num_symtab) {
                         *value = mod->symtab[symnum].st_value;
                         *type = mod->symtab[symnum].st_info;
-                        strncpy(namebuf,
-                                mod->strtab + mod->symtab[symnum].st_name,
-                                127);
+                        strlcpy(name, mod->strtab + mod->symtab[symnum].st_name,
+                                namelen);
                         mutex_unlock(&module_mutex);
                         return mod;
                 }
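module_get_kallsym() now takes the destination size and uses strlcpy(), which, unlike the strncpy() it replaces, always NUL-terminates and never writes past the given bound. A sketch of that bounded-copy behaviour using a local helper, since strlcpy itself is not in ISO C; bounded_copy is an invented name:

#include <stdio.h>
#include <string.h>

/* Minimal strlcpy-style copy: may truncate, but always NUL-terminates. */
static size_t bounded_copy(char *dst, const char *src, size_t size)
{
        size_t len = strlen(src);

        if (size) {
                size_t n = len < size - 1 ? len : size - 1;
                memcpy(dst, src, n);
                dst[n] = '\0';
        }
        return len;     /* callers can detect truncation: len >= size */
}

int main(void)
{
        char name[8];

        bounded_copy(name, "a_rather_long_symbol_name", sizeof(name));
        printf("'%s'\n", name);   /* 'a_rathe' -- truncated, still terminated */
        return 0;
}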
diff --git a/kernel/panic.c b/kernel/panic.c
index ab13f0f668b5..d8a0bca21233 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -172,6 +172,7 @@ const char *print_tainted(void)
 
 void add_taint(unsigned flag)
 {
+        debug_locks_off(); /* can't trust the integrity of the kernel anymore */
         tainted |= flag;
 }
 EXPORT_SYMBOL(add_taint);
@@ -256,6 +257,7 @@ int oops_may_print(void)
  */
 void oops_enter(void)
 {
+        debug_locks_off(); /* can't trust the integrity of the kernel anymore */
         do_oops_enter_exit();
 }
 
diff --git a/kernel/power/pm.c b/kernel/power/pm.c
index 84063ac8fcfc..c50d15266c10 100644
--- a/kernel/power/pm.c
+++ b/kernel/power/pm.c
@@ -75,42 +75,6 @@ struct pm_dev *pm_register(pm_dev_t type,
75 return dev; 75 return dev;
76} 76}
77 77
78static void __pm_unregister(struct pm_dev *dev)
79{
80 if (dev) {
81 list_del(&dev->entry);
82 kfree(dev);
83 }
84}
85
86/**
87 * pm_unregister_all - unregister all devices with matching callback
88 * @callback: callback function pointer
89 *
90 * Unregister every device that would call the callback passed. This
91 * is primarily meant as a helper function for loadable modules. It
92 * enables a module to give up all its managed devices without keeping
93 * its own private list.
94 */
95
96void pm_unregister_all(pm_callback callback)
97{
98 struct list_head *entry;
99
100 if (!callback)
101 return;
102
103 mutex_lock(&pm_devs_lock);
104 entry = pm_devs.next;
105 while (entry != &pm_devs) {
106 struct pm_dev *dev = list_entry(entry, struct pm_dev, entry);
107 entry = entry->next;
108 if (dev->callback == callback)
109 __pm_unregister(dev);
110 }
111 mutex_unlock(&pm_devs_lock);
112}
113
114/** 78/**
115 * pm_send - send request to a single device 79 * pm_send - send request to a single device
116 * @dev: device to send to 80 * @dev: device to send to
@@ -239,7 +203,6 @@ int pm_send_all(pm_request_t rqst, void *data)
239} 203}
240 204
241EXPORT_SYMBOL(pm_register); 205EXPORT_SYMBOL(pm_register);
242EXPORT_SYMBOL(pm_unregister_all);
243EXPORT_SYMBOL(pm_send_all); 206EXPORT_SYMBOL(pm_send_all);
244EXPORT_SYMBOL(pm_active); 207EXPORT_SYMBOL(pm_active);
245 208
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 24c96f354231..75d4886e648e 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -227,11 +227,17 @@ static void copy_data_pages(struct pbe *pblist)
227 for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) { 227 for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn) {
228 if (saveable(zone, &zone_pfn)) { 228 if (saveable(zone, &zone_pfn)) {
229 struct page *page; 229 struct page *page;
230 long *src, *dst;
231 int n;
232
230 page = pfn_to_page(zone_pfn + zone->zone_start_pfn); 233 page = pfn_to_page(zone_pfn + zone->zone_start_pfn);
231 BUG_ON(!pbe); 234 BUG_ON(!pbe);
232 pbe->orig_address = (unsigned long)page_address(page); 235 pbe->orig_address = (unsigned long)page_address(page);
233 /* copy_page is not usable for copying task structs. */ 236 /* copy_page and memcpy are not usable for copying task structs. */
234 memcpy((void *)pbe->address, (void *)pbe->orig_address, PAGE_SIZE); 237 dst = (long *)pbe->address;
238 src = (long *)pbe->orig_address;
239 for (n = PAGE_SIZE / sizeof(long); n; n--)
240 *dst++ = *src++;
235 pbe = pbe->next; 241 pbe = pbe->next;
236 } 242 }
237 } 243 }
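
The copy_data_pages() hunk replaces memcpy() with a long-at-a-time loop because, per the in-line comment, neither copy_page() nor memcpy() is safe for pages holding task structs. A userspace sketch of the loop shape; PAGE_SIZE is hard-coded to 4096 here purely for illustration:

#include <string.h>
#include <assert.h>

#define PAGE_SIZE 4096          /* assumed here; the kernel gets it per-arch */

/* Same long-at-a-time copy as the hunk above (shown only for its shape). */
static void copy_page_by_longs(void *dst_page, const void *src_page)
{
        long *dst = dst_page;
        const long *src = src_page;
        int n;

        for (n = PAGE_SIZE / sizeof(long); n; n--)
                *dst++ = *src++;
}

int main(void)
{
        static long src[PAGE_SIZE / sizeof(long)], dst[PAGE_SIZE / sizeof(long)];

        src[0] = 42;
        copy_page_by_longs(dst, src);
        assert(memcmp(dst, src, PAGE_SIZE) == 0);
        return 0;
}
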
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 044b8e0c1025..f1dd146bd64d 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -263,7 +263,6 @@ int swsusp_write(void)
263 struct swap_map_handle handle; 263 struct swap_map_handle handle;
264 struct snapshot_handle snapshot; 264 struct snapshot_handle snapshot;
265 struct swsusp_info *header; 265 struct swsusp_info *header;
266 unsigned long start;
267 int error; 266 int error;
268 267
269 if ((error = swsusp_swap_check())) { 268 if ((error = swsusp_swap_check())) {
@@ -281,16 +280,17 @@ int swsusp_write(void)
281 } 280 }
282 error = get_swap_writer(&handle); 281 error = get_swap_writer(&handle);
283 if (!error) { 282 if (!error) {
284 start = handle.cur_swap; 283 unsigned long start = handle.cur_swap;
285 error = swap_write_page(&handle, header); 284 error = swap_write_page(&handle, header);
286 } 285 if (!error)
287 if (!error) 286 error = save_image(&handle, &snapshot,
288 error = save_image(&handle, &snapshot, header->pages - 1); 287 header->pages - 1);
289 if (!error) { 288 if (!error) {
290 flush_swap_writer(&handle); 289 flush_swap_writer(&handle);
291 printk("S"); 290 printk("S");
292 error = mark_swapfiles(swp_entry(root_swap, start)); 291 error = mark_swapfiles(swp_entry(root_swap, start));
293 printk("|\n"); 292 printk("|\n");
293 }
294 } 294 }
295 if (error) 295 if (error)
296 free_all_swap_pages(root_swap, handle.bitmap); 296 free_all_swap_pages(root_swap, handle.bitmap);
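
The swsusp_write() reshuffle scopes start into the branch where the header was actually written and chains each remaining step on the absence of an earlier error. The idiom, reduced to userspace with placeholder step names (write_header/write_image/commit_swap are analogues, not kernel functions):

#include <stdio.h>

static int write_header(void)  { return 0; }    /* swap_write_page() analogue */
static int write_image(void)   { return 0; }    /* save_image() analogue      */
static int commit_swap(int start)
{
        printf("committed, start=%d\n", start);
        return 0;                               /* mark_swapfiles() analogue  */
}

int main(void)
{
        int error = 0;                          /* get_swap_writer() assumed OK    */

        if (!error) {
                int start = 42;                 /* only the success path needs it  */

                error = write_header();
                if (!error)
                        error = write_image();
                if (!error)
                        error = commit_swap(start);
        }
        if (error)
                fprintf(stderr, "writing the image failed: %d\n", error);
        return error;
}
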
@@ -311,8 +311,10 @@ static atomic_t io_done = ATOMIC_INIT(0);
311 311
312static int end_io(struct bio *bio, unsigned int num, int err) 312static int end_io(struct bio *bio, unsigned int num, int err)
313{ 313{
314 if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) 314 if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
315 panic("I/O error reading memory image"); 315 printk(KERN_ERR "I/O error reading swsusp image.\n");
316 return -EIO;
317 }
316 atomic_set(&io_done, 0); 318 atomic_set(&io_done, 0);
317 return 0; 319 return 0;
318} 320}
diff --git a/kernel/printk.c b/kernel/printk.c
index bdba5d80496c..65ca0688f86f 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -52,7 +52,7 @@ int console_printk[4] = {
52 DEFAULT_CONSOLE_LOGLEVEL, /* default_console_loglevel */ 52 DEFAULT_CONSOLE_LOGLEVEL, /* default_console_loglevel */
53}; 53};
54 54
55EXPORT_SYMBOL(console_printk); 55EXPORT_UNUSED_SYMBOL(console_printk); /* June 2006 */
56 56
57/* 57/*
58 * Low level drivers may need that to know if they can schedule in 58 * Low level drivers may need that to know if they can schedule in
@@ -773,7 +773,7 @@ int is_console_locked(void)
773{ 773{
774 return console_locked; 774 return console_locked;
775} 775}
776EXPORT_SYMBOL(is_console_locked); 776EXPORT_UNUSED_SYMBOL(is_console_locked); /* June 2006 */
777 777
778/** 778/**
779 * release_console_sem - unlock the console system 779 * release_console_sem - unlock the console system
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 759805c9859a..436ab35f6fa7 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -548,7 +548,7 @@ static void __devinit rcu_online_cpu(int cpu)
548 tasklet_init(&per_cpu(rcu_tasklet, cpu), rcu_process_callbacks, 0UL); 548 tasklet_init(&per_cpu(rcu_tasklet, cpu), rcu_process_callbacks, 0UL);
549} 549}
550 550
551static int __devinit rcu_cpu_notify(struct notifier_block *self, 551static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
552 unsigned long action, void *hcpu) 552 unsigned long action, void *hcpu)
553{ 553{
554 long cpu = (long)hcpu; 554 long cpu = (long)hcpu;
@@ -565,7 +565,7 @@ static int __devinit rcu_cpu_notify(struct notifier_block *self,
565 return NOTIFY_OK; 565 return NOTIFY_OK;
566} 566}
567 567
568static struct notifier_block __devinitdata rcu_nb = { 568static struct notifier_block __cpuinitdata rcu_nb = {
569 .notifier_call = rcu_cpu_notify, 569 .notifier_call = rcu_cpu_notify,
570}; 570};
571 571
diff --git a/kernel/resource.c b/kernel/resource.c
index 129cf046e561..0dd3a857579e 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -404,8 +404,6 @@ int insert_resource(struct resource *parent, struct resource *new)
404 return result; 404 return result;
405} 405}
406 406
407EXPORT_SYMBOL(insert_resource);
408
409/* 407/*
410 * Given an existing resource, change its start and size to match the 408 * Given an existing resource, change its start and size to match the
411 * arguments. Returns -EBUSY if it can't fit. Existing children of 409 * arguments. Returns -EBUSY if it can't fit. Existing children of
diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c
index 494dac872a13..948bd8f643e2 100644
--- a/kernel/rtmutex-tester.c
+++ b/kernel/rtmutex-tester.c
@@ -275,6 +275,7 @@ static int test_func(void *data)
275 275
276 /* Wait for the next command to be executed */ 276 /* Wait for the next command to be executed */
277 schedule(); 277 schedule();
278 try_to_freeze();
278 279
279 if (signal_pending(current)) 280 if (signal_pending(current))
280 flush_signals(current); 281 flush_signals(current);
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
index d2ef13b485e7..3e13a1e5856f 100644
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -7,6 +7,8 @@
7 * Copyright (C) 2005-2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com> 7 * Copyright (C) 2005-2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
8 * Copyright (C) 2005 Kihon Technologies Inc., Steven Rostedt 8 * Copyright (C) 2005 Kihon Technologies Inc., Steven Rostedt
9 * Copyright (C) 2006 Esben Nielsen 9 * Copyright (C) 2006 Esben Nielsen
10 *
11 * See Documentation/rt-mutex-design.txt for details.
10 */ 12 */
11#include <linux/spinlock.h> 13#include <linux/spinlock.h>
12#include <linux/module.h> 14#include <linux/module.h>
diff --git a/kernel/sched.c b/kernel/sched.c
index 4ee400f9d56b..a2be2d055299 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -51,6 +51,7 @@
51#include <linux/times.h> 51#include <linux/times.h>
52#include <linux/acct.h> 52#include <linux/acct.h>
53#include <linux/kprobes.h> 53#include <linux/kprobes.h>
54#include <linux/delayacct.h>
54#include <asm/tlb.h> 55#include <asm/tlb.h>
55 56
56#include <asm/unistd.h> 57#include <asm/unistd.h>
@@ -501,9 +502,36 @@ struct file_operations proc_schedstat_operations = {
501 .release = single_release, 502 .release = single_release,
502}; 503};
503 504
505/*
506 * Expects runqueue lock to be held for atomicity of update
507 */
508static inline void
509rq_sched_info_arrive(struct rq *rq, unsigned long delta_jiffies)
510{
511 if (rq) {
512 rq->rq_sched_info.run_delay += delta_jiffies;
513 rq->rq_sched_info.pcnt++;
514 }
515}
516
517/*
518 * Expects runqueue lock to be held for atomicity of update
519 */
520static inline void
521rq_sched_info_depart(struct rq *rq, unsigned long delta_jiffies)
522{
523 if (rq)
524 rq->rq_sched_info.cpu_time += delta_jiffies;
525}
504# define schedstat_inc(rq, field) do { (rq)->field++; } while (0) 526# define schedstat_inc(rq, field) do { (rq)->field++; } while (0)
505# define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0) 527# define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0)
506#else /* !CONFIG_SCHEDSTATS */ 528#else /* !CONFIG_SCHEDSTATS */
529static inline void
530rq_sched_info_arrive(struct rq *rq, unsigned long delta_jiffies)
531{}
532static inline void
533rq_sched_info_depart(struct rq *rq, unsigned long delta_jiffies)
534{}
507# define schedstat_inc(rq, field) do { } while (0) 535# define schedstat_inc(rq, field) do { } while (0)
508# define schedstat_add(rq, field, amt) do { } while (0) 536# define schedstat_add(rq, field, amt) do { } while (0)
509#endif 537#endif
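
The new rq_sched_info_arrive()/rq_sched_info_depart() helpers follow the usual real-function-or-empty-stub pattern, so sched_info_arrive() and sched_info_depart() can call them unconditionally with no #ifdefs at the call sites. A userspace reduction of that pattern; the CONFIG knob and rq_stats struct are stand-ins:

#include <stdio.h>

#define CONFIG_SCHEDSTATS 1     /* flip to 0 to compile the empty stub instead */

struct rq_stats { unsigned long run_delay, pcnt; };

#if CONFIG_SCHEDSTATS
static inline void rq_sched_info_arrive(struct rq_stats *rq, unsigned long delta)
{
        if (rq) {
                rq->run_delay += delta;
                rq->pcnt++;
        }
}
#else
static inline void rq_sched_info_arrive(struct rq_stats *rq, unsigned long delta) {}
#endif

int main(void)
{
        struct rq_stats rq = { 0, 0 };

        rq_sched_info_arrive(&rq, 5);   /* the caller is identical either way */
        printf("run_delay=%lu pcnt=%lu\n", rq.run_delay, rq.pcnt);
        return 0;
}
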
@@ -523,7 +551,7 @@ static inline struct rq *this_rq_lock(void)
523 return rq; 551 return rq;
524} 552}
525 553
526#ifdef CONFIG_SCHEDSTATS 554#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
527/* 555/*
528 * Called when a process is dequeued from the active array and given 556 * Called when a process is dequeued from the active array and given
529 * the cpu. We should note that with the exception of interactive 557 * the cpu. We should note that with the exception of interactive
@@ -551,21 +579,16 @@ static inline void sched_info_dequeued(struct task_struct *t)
551 */ 579 */
552static void sched_info_arrive(struct task_struct *t) 580static void sched_info_arrive(struct task_struct *t)
553{ 581{
554 unsigned long now = jiffies, diff = 0; 582 unsigned long now = jiffies, delta_jiffies = 0;
555 struct rq *rq = task_rq(t);
556 583
557 if (t->sched_info.last_queued) 584 if (t->sched_info.last_queued)
558 diff = now - t->sched_info.last_queued; 585 delta_jiffies = now - t->sched_info.last_queued;
559 sched_info_dequeued(t); 586 sched_info_dequeued(t);
560 t->sched_info.run_delay += diff; 587 t->sched_info.run_delay += delta_jiffies;
561 t->sched_info.last_arrival = now; 588 t->sched_info.last_arrival = now;
562 t->sched_info.pcnt++; 589 t->sched_info.pcnt++;
563 590
564 if (!rq) 591 rq_sched_info_arrive(task_rq(t), delta_jiffies);
565 return;
566
567 rq->rq_sched_info.run_delay += diff;
568 rq->rq_sched_info.pcnt++;
569} 592}
570 593
571/* 594/*
@@ -585,8 +608,9 @@ static void sched_info_arrive(struct task_struct *t)
585 */ 608 */
586static inline void sched_info_queued(struct task_struct *t) 609static inline void sched_info_queued(struct task_struct *t)
587{ 610{
588 if (!t->sched_info.last_queued) 611 if (unlikely(sched_info_on()))
589 t->sched_info.last_queued = jiffies; 612 if (!t->sched_info.last_queued)
613 t->sched_info.last_queued = jiffies;
590} 614}
591 615
592/* 616/*
@@ -595,13 +619,10 @@ static inline void sched_info_queued(struct task_struct *t)
595 */ 619 */
596static inline void sched_info_depart(struct task_struct *t) 620static inline void sched_info_depart(struct task_struct *t)
597{ 621{
598 struct rq *rq = task_rq(t); 622 unsigned long delta_jiffies = jiffies - t->sched_info.last_arrival;
599 unsigned long diff = jiffies - t->sched_info.last_arrival;
600
601 t->sched_info.cpu_time += diff;
602 623
603 if (rq) 624 t->sched_info.cpu_time += delta_jiffies;
604 rq->rq_sched_info.cpu_time += diff; 625 rq_sched_info_depart(task_rq(t), delta_jiffies);
605} 626}
606 627
607/* 628/*
@@ -610,7 +631,7 @@ static inline void sched_info_depart(struct task_struct *t)
610 * the idle task.) We are only called when prev != next. 631 * the idle task.) We are only called when prev != next.
611 */ 632 */
612static inline void 633static inline void
613sched_info_switch(struct task_struct *prev, struct task_struct *next) 634__sched_info_switch(struct task_struct *prev, struct task_struct *next)
614{ 635{
615 struct rq *rq = task_rq(prev); 636 struct rq *rq = task_rq(prev);
616 637
@@ -625,10 +646,16 @@ sched_info_switch(struct task_struct *prev, struct task_struct *next)
625 if (next != rq->idle) 646 if (next != rq->idle)
626 sched_info_arrive(next); 647 sched_info_arrive(next);
627} 648}
649static inline void
650sched_info_switch(struct task_struct *prev, struct task_struct *next)
651{
652 if (unlikely(sched_info_on()))
653 __sched_info_switch(prev, next);
654}
628#else 655#else
629#define sched_info_queued(t) do { } while (0) 656#define sched_info_queued(t) do { } while (0)
630#define sched_info_switch(t, next) do { } while (0) 657#define sched_info_switch(t, next) do { } while (0)
631#endif /* CONFIG_SCHEDSTATS */ 658#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */
632 659
633/* 660/*
634 * Adding/removing a task to/from a priority array: 661 * Adding/removing a task to/from a priority array:
@@ -1530,8 +1557,9 @@ void fastcall sched_fork(struct task_struct *p, int clone_flags)
1530 1557
1531 INIT_LIST_HEAD(&p->run_list); 1558 INIT_LIST_HEAD(&p->run_list);
1532 p->array = NULL; 1559 p->array = NULL;
1533#ifdef CONFIG_SCHEDSTATS 1560#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
1534 memset(&p->sched_info, 0, sizeof(p->sched_info)); 1561 if (unlikely(sched_info_on()))
1562 memset(&p->sched_info, 0, sizeof(p->sched_info));
1535#endif 1563#endif
1536#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) 1564#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
1537 p->oncpu = 0; 1565 p->oncpu = 0;
@@ -1788,7 +1816,15 @@ context_switch(struct rq *rq, struct task_struct *prev,
1788 WARN_ON(rq->prev_mm); 1816 WARN_ON(rq->prev_mm);
1789 rq->prev_mm = oldmm; 1817 rq->prev_mm = oldmm;
1790 } 1818 }
1819 /*
1820 * Since the runqueue lock will be released by the next
1821 * task (which is an invalid locking op but in the case
 1822 * of the scheduler it's an obvious special-case), we
1823 * do an early lockdep release here:
1824 */
1825#ifndef __ARCH_WANT_UNLOCKED_CTXSW
1791 spin_release(&rq->lock.dep_map, 1, _THIS_IP_); 1826 spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
1827#endif
1792 1828
1793 /* Here we just switch the register state and the stack. */ 1829 /* Here we just switch the register state and the stack. */
1794 switch_to(prev, next, prev); 1830 switch_to(prev, next, prev);
@@ -3384,7 +3420,7 @@ EXPORT_SYMBOL(schedule);
3384 3420
3385#ifdef CONFIG_PREEMPT 3421#ifdef CONFIG_PREEMPT
3386/* 3422/*
3387 * this is is the entry point to schedule() from in-kernel preemption 3423 * this is the entry point to schedule() from in-kernel preemption
3388 * off of preempt_enable. Kernel preemptions off return from interrupt 3424 * off of preempt_enable. Kernel preemptions off return from interrupt
3389 * occur there and call schedule directly. 3425 * occur there and call schedule directly.
3390 */ 3426 */
@@ -3427,7 +3463,7 @@ need_resched:
3427EXPORT_SYMBOL(preempt_schedule); 3463EXPORT_SYMBOL(preempt_schedule);
3428 3464
3429/* 3465/*
3430 * this is is the entry point to schedule() from kernel preemption 3466 * this is the entry point to schedule() from kernel preemption
3431 * off of irq context. 3467 * off of irq context.
3432 * Note, that this is called and return with irqs disabled. This will 3468 * Note, that this is called and return with irqs disabled. This will
3433 * protect us against recursive calling from irq. 3469 * protect us against recursive calling from irq.
@@ -3439,7 +3475,7 @@ asmlinkage void __sched preempt_schedule_irq(void)
3439 struct task_struct *task = current; 3475 struct task_struct *task = current;
3440 int saved_lock_depth; 3476 int saved_lock_depth;
3441#endif 3477#endif
3442 /* Catch callers which need to be fixed*/ 3478 /* Catch callers which need to be fixed */
3443 BUG_ON(ti->preempt_count || !irqs_disabled()); 3479 BUG_ON(ti->preempt_count || !irqs_disabled());
3444 3480
3445need_resched: 3481need_resched:
@@ -4420,9 +4456,9 @@ asmlinkage long sys_sched_yield(void)
4420 return 0; 4456 return 0;
4421} 4457}
4422 4458
4423static inline int __resched_legal(void) 4459static inline int __resched_legal(int expected_preempt_count)
4424{ 4460{
4425 if (unlikely(preempt_count())) 4461 if (unlikely(preempt_count() != expected_preempt_count))
4426 return 0; 4462 return 0;
4427 if (unlikely(system_state != SYSTEM_RUNNING)) 4463 if (unlikely(system_state != SYSTEM_RUNNING))
4428 return 0; 4464 return 0;
@@ -4448,7 +4484,7 @@ static void __cond_resched(void)
4448 4484
4449int __sched cond_resched(void) 4485int __sched cond_resched(void)
4450{ 4486{
4451 if (need_resched() && __resched_legal()) { 4487 if (need_resched() && __resched_legal(0)) {
4452 __cond_resched(); 4488 __cond_resched();
4453 return 1; 4489 return 1;
4454 } 4490 }
@@ -4474,7 +4510,7 @@ int cond_resched_lock(spinlock_t *lock)
4474 ret = 1; 4510 ret = 1;
4475 spin_lock(lock); 4511 spin_lock(lock);
4476 } 4512 }
4477 if (need_resched() && __resched_legal()) { 4513 if (need_resched() && __resched_legal(1)) {
4478 spin_release(&lock->dep_map, 1, _THIS_IP_); 4514 spin_release(&lock->dep_map, 1, _THIS_IP_);
4479 _raw_spin_unlock(lock); 4515 _raw_spin_unlock(lock);
4480 preempt_enable_no_resched(); 4516 preempt_enable_no_resched();
@@ -4490,7 +4526,7 @@ int __sched cond_resched_softirq(void)
4490{ 4526{
4491 BUG_ON(!in_softirq()); 4527 BUG_ON(!in_softirq());
4492 4528
4493 if (need_resched() && __resched_legal()) { 4529 if (need_resched() && __resched_legal(0)) {
4494 raw_local_irq_disable(); 4530 raw_local_irq_disable();
4495 _local_bh_enable(); 4531 _local_bh_enable();
4496 raw_local_irq_enable(); 4532 raw_local_irq_enable();
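
The hunks above give __resched_legal() the preempt count its caller expects to be holding: cond_resched() and cond_resched_softirq() pass 0, while cond_resched_lock() passes 1 because the spinlock it still holds contributes one preemption level. Reduced to userspace arithmetic:

#include <stdio.h>

/* expected_preempt_count: 0 from cond_resched(), 1 from cond_resched_lock(),
 * where the still-held spinlock accounts for one level. */
static int resched_legal(int preempt_count, int expected_preempt_count)
{
        return preempt_count == expected_preempt_count;
}

int main(void)
{
        printf("cond_resched():      %d\n", resched_legal(0, 0));
        printf("cond_resched_lock(): %d\n", resched_legal(1, 1));
        printf("nested lock held:    %d\n", resched_legal(2, 1));
        return 0;
}
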
@@ -4526,9 +4562,11 @@ void __sched io_schedule(void)
4526{ 4562{
4527 struct rq *rq = &__raw_get_cpu_var(runqueues); 4563 struct rq *rq = &__raw_get_cpu_var(runqueues);
4528 4564
4565 delayacct_blkio_start();
4529 atomic_inc(&rq->nr_iowait); 4566 atomic_inc(&rq->nr_iowait);
4530 schedule(); 4567 schedule();
4531 atomic_dec(&rq->nr_iowait); 4568 atomic_dec(&rq->nr_iowait);
4569 delayacct_blkio_end();
4532} 4570}
4533EXPORT_SYMBOL(io_schedule); 4571EXPORT_SYMBOL(io_schedule);
4534 4572
@@ -4537,9 +4575,11 @@ long __sched io_schedule_timeout(long timeout)
4537 struct rq *rq = &__raw_get_cpu_var(runqueues); 4575 struct rq *rq = &__raw_get_cpu_var(runqueues);
4538 long ret; 4576 long ret;
4539 4577
4578 delayacct_blkio_start();
4540 atomic_inc(&rq->nr_iowait); 4579 atomic_inc(&rq->nr_iowait);
4541 ret = schedule_timeout(timeout); 4580 ret = schedule_timeout(timeout);
4542 atomic_dec(&rq->nr_iowait); 4581 atomic_dec(&rq->nr_iowait);
4582 delayacct_blkio_end();
4543 return ret; 4583 return ret;
4544} 4584}
4545 4585
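
The two hunks above bracket io_schedule() and io_schedule_timeout() with delayacct_blkio_start()/end(), so the time a task spends sleeping on block I/O can be accumulated and later reported through taskstats. The bracketing pattern, reduced to a userspace timer with clock_gettime() standing in for the kernel's timestamping:

#include <stdio.h>
#include <time.h>

static struct timespec blkio_start;
static long long blkio_delay_ns;

static void delay_start(void)
{
        clock_gettime(CLOCK_MONOTONIC, &blkio_start);
}

static void delay_end(void)
{
        struct timespec now;

        clock_gettime(CLOCK_MONOTONIC, &now);
        blkio_delay_ns += (now.tv_sec - blkio_start.tv_sec) * 1000000000LL +
                          (now.tv_nsec - blkio_start.tv_nsec);
}

int main(void)
{
        struct timespec nap = { 0, 10 * 1000 * 1000 };  /* ~10 ms of "I/O wait" */

        delay_start();
        nanosleep(&nap, NULL);          /* stands in for schedule() during I/O */
        delay_end();
        printf("blkio delay: %lld ns\n", blkio_delay_ns);
        return 0;
}
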
@@ -4650,7 +4690,7 @@ static inline struct task_struct *younger_sibling(struct task_struct *p)
4650 return list_entry(p->sibling.next,struct task_struct,sibling); 4690 return list_entry(p->sibling.next,struct task_struct,sibling);
4651} 4691}
4652 4692
4653static const char *stat_nam[] = { "R", "S", "D", "T", "t", "Z", "X" }; 4693static const char stat_nam[] = "RSDTtZX";
4654 4694
4655static void show_task(struct task_struct *p) 4695static void show_task(struct task_struct *p)
4656{ 4696{
@@ -4658,12 +4698,9 @@ static void show_task(struct task_struct *p)
4658 unsigned long free = 0; 4698 unsigned long free = 0;
4659 unsigned state; 4699 unsigned state;
4660 4700
4661 printk("%-13.13s ", p->comm);
4662 state = p->state ? __ffs(p->state) + 1 : 0; 4701 state = p->state ? __ffs(p->state) + 1 : 0;
4663 if (state < ARRAY_SIZE(stat_nam)) 4702 printk("%-13.13s %c", p->comm,
4664 printk(stat_nam[state]); 4703 state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
4665 else
4666 printk("?");
4667#if (BITS_PER_LONG == 32) 4704#if (BITS_PER_LONG == 32)
4668 if (state == TASK_RUNNING) 4705 if (state == TASK_RUNNING)
4669 printk(" running "); 4706 printk(" running ");
@@ -4877,7 +4914,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
4877 p->timestamp = p->timestamp - rq_src->timestamp_last_tick 4914 p->timestamp = p->timestamp - rq_src->timestamp_last_tick
4878 + rq_dest->timestamp_last_tick; 4915 + rq_dest->timestamp_last_tick;
4879 deactivate_task(p, rq_src); 4916 deactivate_task(p, rq_src);
4880 activate_task(p, rq_dest, 0); 4917 __activate_task(p, rq_dest);
4881 if (TASK_PREEMPTS_CURR(p, rq_dest)) 4918 if (TASK_PREEMPTS_CURR(p, rq_dest))
4882 resched_task(rq_dest->curr); 4919 resched_task(rq_dest->curr);
4883 } 4920 }
@@ -5776,7 +5813,7 @@ static unsigned long long measure_migration_cost(int cpu1, int cpu2)
5776 cache = vmalloc(max_size); 5813 cache = vmalloc(max_size);
5777 if (!cache) { 5814 if (!cache) {
5778 printk("could not vmalloc %d bytes for cache!\n", 2*max_size); 5815 printk("could not vmalloc %d bytes for cache!\n", 2*max_size);
5779 return 1000000; // return 1 msec on very small boxen 5816 return 1000000; /* return 1 msec on very small boxen */
5780 } 5817 }
5781 5818
5782 while (size <= max_size) { 5819 while (size <= max_size) {
@@ -6457,7 +6494,12 @@ static int build_sched_domains(const cpumask_t *cpu_map)
6457 for (i = 0; i < MAX_NUMNODES; i++) 6494 for (i = 0; i < MAX_NUMNODES; i++)
6458 init_numa_sched_groups_power(sched_group_nodes[i]); 6495 init_numa_sched_groups_power(sched_group_nodes[i]);
6459 6496
6460 init_numa_sched_groups_power(sched_group_allnodes); 6497 if (sched_group_allnodes) {
6498 int group = cpu_to_allnodes_group(first_cpu(*cpu_map));
6499 struct sched_group *sg = &sched_group_allnodes[group];
6500
6501 init_numa_sched_groups_power(sg);
6502 }
6461#endif 6503#endif
6462 6504
6463 /* Attach the domains */ 6505 /* Attach the domains */
@@ -6724,6 +6766,11 @@ void __init sched_init(void)
6724 } 6766 }
6725 6767
6726 set_load_weight(&init_task); 6768 set_load_weight(&init_task);
6769
6770#ifdef CONFIG_RT_MUTEXES
6771 plist_head_init(&init_task.pi_waiters, &init_task.pi_lock);
6772#endif
6773
6727 /* 6774 /*
6728 * The boot idle thread does lazy MMU switching as well: 6775 * The boot idle thread does lazy MMU switching as well:
6729 */ 6776 */
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 215541e26c1a..3789ca98197c 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -65,6 +65,7 @@ static inline void wakeup_softirqd(void)
65 * This one is for softirq.c-internal use, 65 * This one is for softirq.c-internal use,
66 * where hardirqs are disabled legitimately: 66 * where hardirqs are disabled legitimately:
67 */ 67 */
68#ifdef CONFIG_TRACE_IRQFLAGS
68static void __local_bh_disable(unsigned long ip) 69static void __local_bh_disable(unsigned long ip)
69{ 70{
70 unsigned long flags; 71 unsigned long flags;
@@ -80,6 +81,13 @@ static void __local_bh_disable(unsigned long ip)
80 trace_softirqs_off(ip); 81 trace_softirqs_off(ip);
81 raw_local_irq_restore(flags); 82 raw_local_irq_restore(flags);
82} 83}
84#else /* !CONFIG_TRACE_IRQFLAGS */
85static inline void __local_bh_disable(unsigned long ip)
86{
87 add_preempt_count(SOFTIRQ_OFFSET);
88 barrier();
89}
90#endif /* CONFIG_TRACE_IRQFLAGS */
83 91
84void local_bh_disable(void) 92void local_bh_disable(void)
85{ 93{
@@ -121,12 +129,16 @@ EXPORT_SYMBOL(_local_bh_enable);
121 129
122void local_bh_enable(void) 130void local_bh_enable(void)
123{ 131{
132#ifdef CONFIG_TRACE_IRQFLAGS
124 unsigned long flags; 133 unsigned long flags;
125 134
126 WARN_ON_ONCE(in_irq()); 135 WARN_ON_ONCE(in_irq());
136#endif
127 WARN_ON_ONCE(irqs_disabled()); 137 WARN_ON_ONCE(irqs_disabled());
128 138
139#ifdef CONFIG_TRACE_IRQFLAGS
129 local_irq_save(flags); 140 local_irq_save(flags);
141#endif
130 /* 142 /*
131 * Are softirqs going to be turned on now: 143 * Are softirqs going to be turned on now:
132 */ 144 */
@@ -142,18 +154,22 @@ void local_bh_enable(void)
142 do_softirq(); 154 do_softirq();
143 155
144 dec_preempt_count(); 156 dec_preempt_count();
157#ifdef CONFIG_TRACE_IRQFLAGS
145 local_irq_restore(flags); 158 local_irq_restore(flags);
159#endif
146 preempt_check_resched(); 160 preempt_check_resched();
147} 161}
148EXPORT_SYMBOL(local_bh_enable); 162EXPORT_SYMBOL(local_bh_enable);
149 163
150void local_bh_enable_ip(unsigned long ip) 164void local_bh_enable_ip(unsigned long ip)
151{ 165{
166#ifdef CONFIG_TRACE_IRQFLAGS
152 unsigned long flags; 167 unsigned long flags;
153 168
154 WARN_ON_ONCE(in_irq()); 169 WARN_ON_ONCE(in_irq());
155 170
156 local_irq_save(flags); 171 local_irq_save(flags);
172#endif
157 /* 173 /*
158 * Are softirqs going to be turned on now: 174 * Are softirqs going to be turned on now:
159 */ 175 */
@@ -169,7 +185,9 @@ void local_bh_enable_ip(unsigned long ip)
169 do_softirq(); 185 do_softirq();
170 186
171 dec_preempt_count(); 187 dec_preempt_count();
188#ifdef CONFIG_TRACE_IRQFLAGS
172 local_irq_restore(flags); 189 local_irq_restore(flags);
190#endif
173 preempt_check_resched(); 191 preempt_check_resched();
174} 192}
175EXPORT_SYMBOL(local_bh_enable_ip); 193EXPORT_SYMBOL(local_bh_enable_ip);
@@ -311,8 +329,6 @@ void open_softirq(int nr, void (*action)(struct softirq_action*), void *data)
311 softirq_vec[nr].action = action; 329 softirq_vec[nr].action = action;
312} 330}
313 331
314EXPORT_SYMBOL(open_softirq);
315
316/* Tasklets */ 332/* Tasklets */
317struct tasklet_head 333struct tasklet_head
318{ 334{
@@ -549,7 +565,7 @@ static void takeover_tasklets(unsigned int cpu)
549} 565}
550#endif /* CONFIG_HOTPLUG_CPU */ 566#endif /* CONFIG_HOTPLUG_CPU */
551 567
552static int __devinit cpu_callback(struct notifier_block *nfb, 568static int __cpuinit cpu_callback(struct notifier_block *nfb,
553 unsigned long action, 569 unsigned long action,
554 void *hcpu) 570 void *hcpu)
555{ 571{
@@ -589,7 +605,7 @@ static int __devinit cpu_callback(struct notifier_block *nfb,
589 return NOTIFY_OK; 605 return NOTIFY_OK;
590} 606}
591 607
592static struct notifier_block __devinitdata cpu_nfb = { 608static struct notifier_block __cpuinitdata cpu_nfb = {
593 .notifier_call = cpu_callback 609 .notifier_call = cpu_callback
594}; 610};
595 611
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index 6b76caa22981..03e6a2b0b787 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -104,7 +104,7 @@ static int watchdog(void * __bind_cpu)
104/* 104/*
105 * Create/destroy watchdog threads as CPUs come and go: 105 * Create/destroy watchdog threads as CPUs come and go:
106 */ 106 */
107static int __devinit 107static int __cpuinit
108cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) 108cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
109{ 109{
110 int hotcpu = (unsigned long)hcpu; 110 int hotcpu = (unsigned long)hcpu;
@@ -142,7 +142,7 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
142 return NOTIFY_OK; 142 return NOTIFY_OK;
143} 143}
144 144
145static struct notifier_block __devinitdata cpu_nfb = { 145static struct notifier_block __cpuinitdata cpu_nfb = {
146 .notifier_call = cpu_callback 146 .notifier_call = cpu_callback
147}; 147};
148 148
diff --git a/kernel/sys.c b/kernel/sys.c
index dbb3b9c7ea64..e236f98f7ec5 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1983,7 +1983,7 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
1983 error = current->mm->dumpable; 1983 error = current->mm->dumpable;
1984 break; 1984 break;
1985 case PR_SET_DUMPABLE: 1985 case PR_SET_DUMPABLE:
1986 if (arg2 < 0 || arg2 > 2) { 1986 if (arg2 < 0 || arg2 > 1) {
1987 error = -EINVAL; 1987 error = -EINVAL;
1988 break; 1988 break;
1989 } 1989 }
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
new file mode 100644
index 000000000000..e78187657330
--- /dev/null
+++ b/kernel/taskstats.c
@@ -0,0 +1,564 @@
1/*
2 * taskstats.c - Export per-task statistics to userland
3 *
4 * Copyright (C) Shailabh Nagar, IBM Corp. 2006
5 * (C) Balbir Singh, IBM Corp. 2006
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 */
18
19#include <linux/kernel.h>
20#include <linux/taskstats_kern.h>
21#include <linux/delayacct.h>
22#include <linux/cpumask.h>
23#include <linux/percpu.h>
24#include <net/genetlink.h>
25#include <asm/atomic.h>
26
27/*
28 * Maximum length of a cpumask that can be specified in
29 * the TASKSTATS_CMD_ATTR_REGISTER/DEREGISTER_CPUMASK attribute
30 */
31#define TASKSTATS_CPUMASK_MAXLEN (100+6*NR_CPUS)
32
33static DEFINE_PER_CPU(__u32, taskstats_seqnum) = { 0 };
34static int family_registered;
35kmem_cache_t *taskstats_cache;
36
37static struct genl_family family = {
38 .id = GENL_ID_GENERATE,
39 .name = TASKSTATS_GENL_NAME,
40 .version = TASKSTATS_GENL_VERSION,
41 .maxattr = TASKSTATS_CMD_ATTR_MAX,
42};
43
44static struct nla_policy taskstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1]
45__read_mostly = {
46 [TASKSTATS_CMD_ATTR_PID] = { .type = NLA_U32 },
47 [TASKSTATS_CMD_ATTR_TGID] = { .type = NLA_U32 },
48 [TASKSTATS_CMD_ATTR_REGISTER_CPUMASK] = { .type = NLA_STRING },
49 [TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK] = { .type = NLA_STRING },};
50
51struct listener {
52 struct list_head list;
53 pid_t pid;
54 char valid;
55};
56
57struct listener_list {
58 struct rw_semaphore sem;
59 struct list_head list;
60};
61static DEFINE_PER_CPU(struct listener_list, listener_array);
62
63enum actions {
64 REGISTER,
65 DEREGISTER,
66 CPU_DONT_CARE
67};
68
69static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp,
70 void **replyp, size_t size)
71{
72 struct sk_buff *skb;
73 void *reply;
74
75 /*
76 * If new attributes are added, please revisit this allocation
77 */
78 skb = nlmsg_new(size);
79 if (!skb)
80 return -ENOMEM;
81
82 if (!info) {
83 int seq = get_cpu_var(taskstats_seqnum)++;
84 put_cpu_var(taskstats_seqnum);
85
86 reply = genlmsg_put(skb, 0, seq,
87 family.id, 0, 0,
88 cmd, family.version);
89 } else
90 reply = genlmsg_put(skb, info->snd_pid, info->snd_seq,
91 family.id, 0, 0,
92 cmd, family.version);
93 if (reply == NULL) {
94 nlmsg_free(skb);
95 return -EINVAL;
96 }
97
98 *skbp = skb;
99 *replyp = reply;
100 return 0;
101}
102
103/*
104 * Send taskstats data in @skb to listener with nl_pid @pid
105 */
106static int send_reply(struct sk_buff *skb, pid_t pid)
107{
108 struct genlmsghdr *genlhdr = nlmsg_data((struct nlmsghdr *)skb->data);
109 void *reply = genlmsg_data(genlhdr);
110 int rc;
111
112 rc = genlmsg_end(skb, reply);
113 if (rc < 0) {
114 nlmsg_free(skb);
115 return rc;
116 }
117
118 return genlmsg_unicast(skb, pid);
119}
120
121/*
122 * Send taskstats data in @skb to listeners registered for @cpu's exit data
123 */
124static void send_cpu_listeners(struct sk_buff *skb, unsigned int cpu)
125{
126 struct genlmsghdr *genlhdr = nlmsg_data((struct nlmsghdr *)skb->data);
127 struct listener_list *listeners;
128 struct listener *s, *tmp;
129 struct sk_buff *skb_next, *skb_cur = skb;
130 void *reply = genlmsg_data(genlhdr);
131 int rc, delcount = 0;
132
133 rc = genlmsg_end(skb, reply);
134 if (rc < 0) {
135 nlmsg_free(skb);
136 return;
137 }
138
139 rc = 0;
140 listeners = &per_cpu(listener_array, cpu);
141 down_read(&listeners->sem);
142 list_for_each_entry(s, &listeners->list, list) {
143 skb_next = NULL;
144 if (!list_is_last(&s->list, &listeners->list)) {
145 skb_next = skb_clone(skb_cur, GFP_KERNEL);
146 if (!skb_next)
147 break;
148 }
149 rc = genlmsg_unicast(skb_cur, s->pid);
150 if (rc == -ECONNREFUSED) {
151 s->valid = 0;
152 delcount++;
153 }
154 skb_cur = skb_next;
155 }
156 up_read(&listeners->sem);
157
158 if (skb_cur)
159 nlmsg_free(skb_cur);
160
161 if (!delcount)
162 return;
163
164 /* Delete invalidated entries */
165 down_write(&listeners->sem);
166 list_for_each_entry_safe(s, tmp, &listeners->list, list) {
167 if (!s->valid) {
168 list_del(&s->list);
169 kfree(s);
170 }
171 }
172 up_write(&listeners->sem);
173}
174
175static int fill_pid(pid_t pid, struct task_struct *pidtsk,
176 struct taskstats *stats)
177{
178 int rc = 0;
179 struct task_struct *tsk = pidtsk;
180
181 if (!pidtsk) {
182 read_lock(&tasklist_lock);
183 tsk = find_task_by_pid(pid);
184 if (!tsk) {
185 read_unlock(&tasklist_lock);
186 return -ESRCH;
187 }
188 get_task_struct(tsk);
189 read_unlock(&tasklist_lock);
190 } else
191 get_task_struct(tsk);
192
193 /*
194 * Each accounting subsystem adds calls to its functions to
 195 * fill in relevant parts of struct taskstats as follows
196 *
197 * per-task-foo(stats, tsk);
198 */
199
200 delayacct_add_tsk(stats, tsk);
201 stats->version = TASKSTATS_VERSION;
202
203 /* Define err: label here if needed */
204 put_task_struct(tsk);
205 return rc;
206
207}
208
209static int fill_tgid(pid_t tgid, struct task_struct *tgidtsk,
210 struct taskstats *stats)
211{
212 struct task_struct *tsk, *first;
213 unsigned long flags;
214
215 /*
216 * Add additional stats from live tasks except zombie thread group
217 * leaders who are already counted with the dead tasks
218 */
219 first = tgidtsk;
220 if (!first) {
221 read_lock(&tasklist_lock);
222 first = find_task_by_pid(tgid);
223 if (!first) {
224 read_unlock(&tasklist_lock);
225 return -ESRCH;
226 }
227 get_task_struct(first);
228 read_unlock(&tasklist_lock);
229 } else
230 get_task_struct(first);
231
232 /* Start with stats from dead tasks */
233 spin_lock_irqsave(&first->signal->stats_lock, flags);
234 if (first->signal->stats)
235 memcpy(stats, first->signal->stats, sizeof(*stats));
236 spin_unlock_irqrestore(&first->signal->stats_lock, flags);
237
238 tsk = first;
239 read_lock(&tasklist_lock);
240 do {
241 if (tsk->exit_state == EXIT_ZOMBIE && thread_group_leader(tsk))
242 continue;
243 /*
244 * Accounting subsystem can call its functions here to
 245 * fill in relevant parts of struct taskstats as follows
246 *
247 * per-task-foo(stats, tsk);
248 */
249 delayacct_add_tsk(stats, tsk);
250
251 } while_each_thread(first, tsk);
252 read_unlock(&tasklist_lock);
253 stats->version = TASKSTATS_VERSION;
254
255 /*
 256 * Accounting subsystems can also add calls here to modify
257 * fields of taskstats.
258 */
259
260 return 0;
261}
262
263
264static void fill_tgid_exit(struct task_struct *tsk)
265{
266 unsigned long flags;
267
268 spin_lock_irqsave(&tsk->signal->stats_lock, flags);
269 if (!tsk->signal->stats)
270 goto ret;
271
272 /*
273 * Each accounting subsystem calls its functions here to
 274 * accumulate its per-task stats for tsk, into the per-tgid structure
275 *
276 * per-task-foo(tsk->signal->stats, tsk);
277 */
278 delayacct_add_tsk(tsk->signal->stats, tsk);
279ret:
280 spin_unlock_irqrestore(&tsk->signal->stats_lock, flags);
281 return;
282}
283
284static int add_del_listener(pid_t pid, cpumask_t *maskp, int isadd)
285{
286 struct listener_list *listeners;
287 struct listener *s, *tmp;
288 unsigned int cpu;
289 cpumask_t mask = *maskp;
290
291 if (!cpus_subset(mask, cpu_possible_map))
292 return -EINVAL;
293
294 if (isadd == REGISTER) {
295 for_each_cpu_mask(cpu, mask) {
296 s = kmalloc_node(sizeof(struct listener), GFP_KERNEL,
297 cpu_to_node(cpu));
298 if (!s)
299 goto cleanup;
300 s->pid = pid;
301 INIT_LIST_HEAD(&s->list);
302 s->valid = 1;
303
304 listeners = &per_cpu(listener_array, cpu);
305 down_write(&listeners->sem);
306 list_add(&s->list, &listeners->list);
307 up_write(&listeners->sem);
308 }
309 return 0;
310 }
311
312 /* Deregister or cleanup */
313cleanup:
314 for_each_cpu_mask(cpu, mask) {
315 listeners = &per_cpu(listener_array, cpu);
316 down_write(&listeners->sem);
317 list_for_each_entry_safe(s, tmp, &listeners->list, list) {
318 if (s->pid == pid) {
319 list_del(&s->list);
320 kfree(s);
321 break;
322 }
323 }
324 up_write(&listeners->sem);
325 }
326 return 0;
327}
328
329static int parse(struct nlattr *na, cpumask_t *mask)
330{
331 char *data;
332 int len;
333 int ret;
334
335 if (na == NULL)
336 return 1;
337 len = nla_len(na);
338 if (len > TASKSTATS_CPUMASK_MAXLEN)
339 return -E2BIG;
340 if (len < 1)
341 return -EINVAL;
342 data = kmalloc(len, GFP_KERNEL);
343 if (!data)
344 return -ENOMEM;
345 nla_strlcpy(data, na, len);
346 ret = cpulist_parse(data, *mask);
347 kfree(data);
348 return ret;
349}
350
351static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
352{
353 int rc = 0;
354 struct sk_buff *rep_skb;
355 struct taskstats stats;
356 void *reply;
357 size_t size;
358 struct nlattr *na;
359 cpumask_t mask;
360
361 rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], &mask);
362 if (rc < 0)
363 return rc;
364 if (rc == 0)
365 return add_del_listener(info->snd_pid, &mask, REGISTER);
366
367 rc = parse(info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK], &mask);
368 if (rc < 0)
369 return rc;
370 if (rc == 0)
371 return add_del_listener(info->snd_pid, &mask, DEREGISTER);
372
373 /*
374 * Size includes space for nested attributes
375 */
376 size = nla_total_size(sizeof(u32)) +
377 nla_total_size(sizeof(struct taskstats)) + nla_total_size(0);
378
379 memset(&stats, 0, sizeof(stats));
380 rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, &reply, size);
381 if (rc < 0)
382 return rc;
383
384 if (info->attrs[TASKSTATS_CMD_ATTR_PID]) {
385 u32 pid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_PID]);
386 rc = fill_pid(pid, NULL, &stats);
387 if (rc < 0)
388 goto err;
389
390 na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_PID);
391 NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_PID, pid);
392 NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS,
393 stats);
394 } else if (info->attrs[TASKSTATS_CMD_ATTR_TGID]) {
395 u32 tgid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_TGID]);
396 rc = fill_tgid(tgid, NULL, &stats);
397 if (rc < 0)
398 goto err;
399
400 na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_TGID);
401 NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_TGID, tgid);
402 NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS,
403 stats);
404 } else {
405 rc = -EINVAL;
406 goto err;
407 }
408
409 nla_nest_end(rep_skb, na);
410
411 return send_reply(rep_skb, info->snd_pid);
412
413nla_put_failure:
414 return genlmsg_cancel(rep_skb, reply);
415err:
416 nlmsg_free(rep_skb);
417 return rc;
418}
419
420void taskstats_exit_alloc(struct taskstats **ptidstats, unsigned int *mycpu)
421{
422 struct listener_list *listeners;
423 struct taskstats *tmp;
424 /*
425 * This is the cpu on which the task is exiting currently and will
426 * be the one for which the exit event is sent, even if the cpu
427 * on which this function is running changes later.
428 */
429 *mycpu = raw_smp_processor_id();
430
431 *ptidstats = NULL;
432 tmp = kmem_cache_zalloc(taskstats_cache, SLAB_KERNEL);
433 if (!tmp)
434 return;
435
436 listeners = &per_cpu(listener_array, *mycpu);
437 down_read(&listeners->sem);
438 if (!list_empty(&listeners->list)) {
439 *ptidstats = tmp;
440 tmp = NULL;
441 }
442 up_read(&listeners->sem);
443 kfree(tmp);
444}
445
446/* Send pid data out on exit */
447void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats,
448 int group_dead, unsigned int mycpu)
449{
450 int rc;
451 struct sk_buff *rep_skb;
452 void *reply;
453 size_t size;
454 int is_thread_group;
455 struct nlattr *na;
456 unsigned long flags;
457
458 if (!family_registered || !tidstats)
459 return;
460
461 spin_lock_irqsave(&tsk->signal->stats_lock, flags);
462 is_thread_group = tsk->signal->stats ? 1 : 0;
463 spin_unlock_irqrestore(&tsk->signal->stats_lock, flags);
464
465 rc = 0;
466 /*
467 * Size includes space for nested attributes
468 */
469 size = nla_total_size(sizeof(u32)) +
470 nla_total_size(sizeof(struct taskstats)) + nla_total_size(0);
471
472 if (is_thread_group)
473 size = 2 * size; /* PID + STATS + TGID + STATS */
474
475 rc = prepare_reply(NULL, TASKSTATS_CMD_NEW, &rep_skb, &reply, size);
476 if (rc < 0)
477 goto ret;
478
479 rc = fill_pid(tsk->pid, tsk, tidstats);
480 if (rc < 0)
481 goto err_skb;
482
483 na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_PID);
484 NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_PID, (u32)tsk->pid);
485 NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS,
486 *tidstats);
487 nla_nest_end(rep_skb, na);
488
489 if (!is_thread_group)
490 goto send;
491
492 /*
 493 * tsk has/had a thread group so fill the tsk->signal->stats structure.
494 * Doesn't matter if tsk is the leader or the last group member leaving
495 */
496
497 fill_tgid_exit(tsk);
498 if (!group_dead)
499 goto send;
500
501 na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_TGID);
502 NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_TGID, (u32)tsk->tgid);
503 /* No locking needed for tsk->signal->stats since group is dead */
504 NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS,
505 *tsk->signal->stats);
506 nla_nest_end(rep_skb, na);
507
508send:
509 send_cpu_listeners(rep_skb, mycpu);
510 return;
511
512nla_put_failure:
513 genlmsg_cancel(rep_skb, reply);
514 goto ret;
515err_skb:
516 nlmsg_free(rep_skb);
517ret:
518 return;
519}
520
521static struct genl_ops taskstats_ops = {
522 .cmd = TASKSTATS_CMD_GET,
523 .doit = taskstats_user_cmd,
524 .policy = taskstats_cmd_get_policy,
525};
526
527/* Needed early in initialization */
528void __init taskstats_init_early(void)
529{
530 unsigned int i;
531
532 taskstats_cache = kmem_cache_create("taskstats_cache",
533 sizeof(struct taskstats),
534 0, SLAB_PANIC, NULL, NULL);
535 for_each_possible_cpu(i) {
536 INIT_LIST_HEAD(&(per_cpu(listener_array, i).list));
537 init_rwsem(&(per_cpu(listener_array, i).sem));
538 }
539}
540
541static int __init taskstats_init(void)
542{
543 int rc;
544
545 rc = genl_register_family(&family);
546 if (rc)
547 return rc;
548
549 rc = genl_register_ops(&family, &taskstats_ops);
550 if (rc < 0)
551 goto err;
552
553 family_registered = 1;
554 return 0;
555err:
556 genl_unregister_family(&family);
557 return rc;
558}
559
560/*
561 * late initcall ensures initialization of statistics collection
562 * mechanisms precedes initialization of the taskstats interface
563 */
564late_initcall(taskstats_init);
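
In the new file above, send_cpu_listeners() hands the original skb to the final listener and clones it for every earlier one, so delivering to N listeners costs N-1 clones rather than N copies, and whatever buffer is left after a clone failure is freed at the end. A userspace reduction of that hand-off, with strdup()/free() standing in for skb_clone()/nlmsg_free():

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static void deliver(const char *msg, int listener)
{
        printf("listener %d got: %s\n", listener, msg);
}

int main(void)
{
        int nr_listeners = 3;
        char *cur = strdup("exit record");

        for (int i = 0; cur && i < nr_listeners; i++) {
                char *next = NULL;

                if (i != nr_listeners - 1) {
                        next = strdup(cur);     /* clone for later listeners    */
                        if (!next)
                                break;          /* mirrors the skb_clone check  */
                }
                deliver(cur, i);                /* genlmsg_unicast() analogue   */
                free(cur);                      /* ...which consumes the buffer */
                cur = next;
        }
        free(cur);                              /* leftover clone, if any       */
        return 0;
}
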
diff --git a/kernel/timer.c b/kernel/timer.c
index 396a3c024c2c..b650f04888ed 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -84,7 +84,7 @@ typedef struct tvec_t_base_s tvec_base_t;
84 84
85tvec_base_t boot_tvec_bases; 85tvec_base_t boot_tvec_bases;
86EXPORT_SYMBOL(boot_tvec_bases); 86EXPORT_SYMBOL(boot_tvec_bases);
87static DEFINE_PER_CPU(tvec_base_t *, tvec_bases) = { &boot_tvec_bases }; 87static DEFINE_PER_CPU(tvec_base_t *, tvec_bases) = &boot_tvec_bases;
88 88
89static inline void set_running_timer(tvec_base_t *base, 89static inline void set_running_timer(tvec_base_t *base,
90 struct timer_list *timer) 90 struct timer_list *timer)
@@ -374,6 +374,7 @@ int del_timer_sync(struct timer_list *timer)
374 int ret = try_to_del_timer_sync(timer); 374 int ret = try_to_del_timer_sync(timer);
375 if (ret >= 0) 375 if (ret >= 0)
376 return ret; 376 return ret;
377 cpu_relax();
377 } 378 }
378} 379}
379 380
@@ -407,7 +408,7 @@ static int cascade(tvec_base_t *base, tvec_t *tv, int index)
407 * This function cascades all vectors and executes all expired timer 408 * This function cascades all vectors and executes all expired timer
408 * vectors. 409 * vectors.
409 */ 410 */
410#define INDEX(N) (base->timer_jiffies >> (TVR_BITS + N * TVN_BITS)) & TVN_MASK 411#define INDEX(N) ((base->timer_jiffies >> (TVR_BITS + (N) * TVN_BITS)) & TVN_MASK)
411 412
412static inline void __run_timers(tvec_base_t *base) 413static inline void __run_timers(tvec_base_t *base)
413{ 414{
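
The INDEX() change is a macro-hygiene fix: both the parameter and the whole expression gain parentheses so that expression arguments and surrounding operators cannot re-associate the shift and mask. A self-contained illustration using the same default constants; timer_jiffies here is just a local stand-in for base->timer_jiffies:

#include <stdio.h>

#define TVR_BITS 8
#define TVN_BITS 6
#define TVN_MASK 63

/* Old form: neither N nor the whole expression is parenthesized. */
#define INDEX_OLD(N) (timer_jiffies >> (TVR_BITS + N * TVN_BITS)) & TVN_MASK
/* New form from the hunk above. */
#define INDEX_NEW(N) ((timer_jiffies >> (TVR_BITS + (N) * TVN_BITS)) & TVN_MASK)

int main(void)
{
        unsigned long timer_jiffies = 0x12345678UL;

        /* With the argument "1 + 1", the old macro shifts by
         * TVR_BITS + 1 + 1 * TVN_BITS = 15 instead of TVR_BITS + 2 * TVN_BITS = 20. */
        printf("old: %lu  new: %lu\n", (INDEX_OLD(1 + 1)), (INDEX_NEW(1 + 1)));
        return 0;
}
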
@@ -891,6 +892,7 @@ int do_settimeofday(struct timespec *tv)
891 set_normalized_timespec(&xtime, sec, nsec); 892 set_normalized_timespec(&xtime, sec, nsec);
892 set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec); 893 set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec);
893 894
895 clock->error = 0;
894 ntp_clear(); 896 ntp_clear();
895 897
896 write_sequnlock_irqrestore(&xtime_lock, flags); 898 write_sequnlock_irqrestore(&xtime_lock, flags);
@@ -967,6 +969,7 @@ void __init timekeeping_init(void)
967} 969}
968 970
969 971
972static int timekeeping_suspended;
970/* 973/*
971 * timekeeping_resume - Resumes the generic timekeeping subsystem. 974 * timekeeping_resume - Resumes the generic timekeeping subsystem.
972 * @dev: unused 975 * @dev: unused
@@ -982,6 +985,18 @@ static int timekeeping_resume(struct sys_device *dev)
982 write_seqlock_irqsave(&xtime_lock, flags); 985 write_seqlock_irqsave(&xtime_lock, flags);
983 /* restart the last cycle value */ 986 /* restart the last cycle value */
984 clock->cycle_last = clocksource_read(clock); 987 clock->cycle_last = clocksource_read(clock);
988 clock->error = 0;
989 timekeeping_suspended = 0;
990 write_sequnlock_irqrestore(&xtime_lock, flags);
991 return 0;
992}
993
994static int timekeeping_suspend(struct sys_device *dev, pm_message_t state)
995{
996 unsigned long flags;
997
998 write_seqlock_irqsave(&xtime_lock, flags);
999 timekeeping_suspended = 1;
985 write_sequnlock_irqrestore(&xtime_lock, flags); 1000 write_sequnlock_irqrestore(&xtime_lock, flags);
986 return 0; 1001 return 0;
987} 1002}
@@ -989,6 +1004,7 @@ static int timekeeping_resume(struct sys_device *dev)
989/* sysfs resume/suspend bits for timekeeping */ 1004/* sysfs resume/suspend bits for timekeeping */
990static struct sysdev_class timekeeping_sysclass = { 1005static struct sysdev_class timekeeping_sysclass = {
991 .resume = timekeeping_resume, 1006 .resume = timekeeping_resume,
1007 .suspend = timekeeping_suspend,
992 set_kset_name("timekeeping"), 1008 set_kset_name("timekeeping"),
993}; 1009};
994 1010
@@ -1008,52 +1024,52 @@ static int __init timekeeping_init_device(void)
1008device_initcall(timekeeping_init_device); 1024device_initcall(timekeeping_init_device);
1009 1025
1010/* 1026/*
1011 * If the error is already larger, we look ahead another tick, 1027 * If the error is already larger, we look ahead even further
1012 * to compensate for late or lost adjustments. 1028 * to compensate for late or lost adjustments.
1013 */ 1029 */
1014static __always_inline int clocksource_bigadjust(int sign, s64 error, s64 *interval, s64 *offset) 1030static __always_inline int clocksource_bigadjust(s64 error, s64 *interval, s64 *offset)
1015{ 1031{
1016 int adj; 1032 s64 tick_error, i;
1033 u32 look_ahead, adj;
1034 s32 error2, mult;
1017 1035
1018 /* 1036 /*
1019 * As soon as the machine is synchronized to the external time 1037 * Use the current error value to determine how much to look ahead.
1020 * source this should be the common case. 1038 * The larger the error the slower we adjust for it to avoid problems
1039 * with losing too many ticks, otherwise we would overadjust and
1040 * produce an even larger error. The smaller the adjustment the
1041 * faster we try to adjust for it, as lost ticks can do less harm
 1042 * here. This is tuned so that an error of about 1 msec is adjusted
1043 * within about 1 sec (or 2^20 nsec in 2^SHIFT_HZ ticks).
1021 */ 1044 */
1022 error >>= 2; 1045 error2 = clock->error >> (TICK_LENGTH_SHIFT + 22 - 2 * SHIFT_HZ);
1023 if (likely(sign > 0 ? error <= *interval : error >= *interval)) 1046 error2 = abs(error2);
1024 return sign; 1047 for (look_ahead = 0; error2 > 0; look_ahead++)
1048 error2 >>= 2;
1025 1049
1026 /* 1050 /*
1027 * An extra look ahead dampens the effect of the current error, 1051 * Now calculate the error in (1 << look_ahead) ticks, but first
1028 * which can grow quite large with continously late updates, as 1052 * remove the single look ahead already included in the error.
1029 * it would dominate the adjustment value and can lead to
1030 * oscillation.
1031 */ 1053 */
1032 error += current_tick_length() >> (TICK_LENGTH_SHIFT - clock->shift + 1); 1054 tick_error = current_tick_length() >> (TICK_LENGTH_SHIFT - clock->shift + 1);
1033 error -= clock->xtime_interval >> 1; 1055 tick_error -= clock->xtime_interval >> 1;
1034 1056 error = ((error - tick_error) >> look_ahead) + tick_error;
1035 adj = 0; 1057
1036 while (1) { 1058 /* Finally calculate the adjustment shift value. */
1037 error >>= 1; 1059 i = *interval;
1038 if (sign > 0 ? error <= *interval : error >= *interval) 1060 mult = 1;
1039 break; 1061 if (error < 0) {
1040 adj++; 1062 error = -error;
1063 *interval = -*interval;
1064 *offset = -*offset;
1065 mult = -1;
1041 } 1066 }
1042 1067 for (adj = 0; error > i; adj++)
1043 /* 1068 error >>= 1;
1044 * Add the current adjustments to the error and take the offset
1045 * into account, the latter can cause the error to be hardly
1046 * reduced at the next tick. Check the error again if there's
1047 * room for another adjustment, thus further reducing the error
1048 * which otherwise had to be corrected at the next update.
1049 */
1050 error = (error << 1) - *interval + *offset;
1051 if (sign > 0 ? error > *interval : error < *interval)
1052 adj++;
1053 1069
1054 *interval <<= adj; 1070 *interval <<= adj;
1055 *offset <<= adj; 1071 *offset <<= adj;
1056 return sign << adj; 1072 return mult << adj;
1057} 1073}
1058 1074
1059/* 1075/*
@@ -1068,11 +1084,19 @@ static void clocksource_adjust(struct clocksource *clock, s64 offset)
1068 1084
1069 error = clock->error >> (TICK_LENGTH_SHIFT - clock->shift - 1); 1085 error = clock->error >> (TICK_LENGTH_SHIFT - clock->shift - 1);
1070 if (error > interval) { 1086 if (error > interval) {
1071 adj = clocksource_bigadjust(1, error, &interval, &offset); 1087 error >>= 2;
1088 if (likely(error <= interval))
1089 adj = 1;
1090 else
1091 adj = clocksource_bigadjust(error, &interval, &offset);
1072 } else if (error < -interval) { 1092 } else if (error < -interval) {
1073 interval = -interval; 1093 error >>= 2;
1074 offset = -offset; 1094 if (likely(error >= -interval)) {
1075 adj = clocksource_bigadjust(-1, error, &interval, &offset); 1095 adj = -1;
1096 interval = -interval;
1097 offset = -offset;
1098 } else
1099 adj = clocksource_bigadjust(error, &interval, &offset);
1076 } else 1100 } else
1077 return; 1101 return;
1078 1102
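
Both branches of clocksource_adjust() now fast-path the common case where a single shift keeps the error within one interval and only fall back to the rewritten clocksource_bigadjust() for large errors. The final step of that fallback is finding the power-of-two shift that brings the (look-ahead-scaled) error back under one interval; reduced to userspace arithmetic with made-up numbers:

#include <stdio.h>

/* Smallest shift "adj" such that error >> adj no longer exceeds interval,
 * same loop shape as "for (adj = 0; error > i; adj++) error >>= 1;" above. */
static unsigned int bigadjust_shift(long long error, long long interval)
{
        unsigned int adj = 0;

        while (error > interval) {
                error >>= 1;
                adj++;
        }
        return adj;
}

int main(void)
{
        printf("shift = %u\n", bigadjust_shift(9000000LL, 1000000LL));  /* 4 */
        return 0;
}
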
@@ -1091,13 +1115,16 @@ static void update_wall_time(void)
1091{ 1115{
1092 cycle_t offset; 1116 cycle_t offset;
1093 1117
1094 clock->xtime_nsec += (s64)xtime.tv_nsec << clock->shift; 1118 /* Make sure we're fully resumed: */
1119 if (unlikely(timekeeping_suspended))
1120 return;
1095 1121
1096#ifdef CONFIG_GENERIC_TIME 1122#ifdef CONFIG_GENERIC_TIME
1097 offset = (clocksource_read(clock) - clock->cycle_last) & clock->mask; 1123 offset = (clocksource_read(clock) - clock->cycle_last) & clock->mask;
1098#else 1124#else
1099 offset = clock->cycle_interval; 1125 offset = clock->cycle_interval;
1100#endif 1126#endif
1127 clock->xtime_nsec += (s64)xtime.tv_nsec << clock->shift;
1101 1128
1102 /* normally this loop will run just once, however in the 1129 /* normally this loop will run just once, however in the
1103 * case of lost or late ticks, it will accumulate correctly. 1130 * case of lost or late ticks, it will accumulate correctly.
@@ -1129,7 +1156,7 @@ static void update_wall_time(void)
1129 clocksource_adjust(clock, offset); 1156 clocksource_adjust(clock, offset);
1130 1157
1131 /* store full nanoseconds into xtime */ 1158 /* store full nanoseconds into xtime */
1132 xtime.tv_nsec = clock->xtime_nsec >> clock->shift; 1159 xtime.tv_nsec = (s64)clock->xtime_nsec >> clock->shift;
1133 clock->xtime_nsec -= (s64)xtime.tv_nsec << clock->shift; 1160 clock->xtime_nsec -= (s64)xtime.tv_nsec << clock->shift;
1134 1161
1135 /* check to see if there is a new clocksource to use */ 1162 /* check to see if there is a new clocksource to use */
@@ -1661,7 +1688,7 @@ static void __devinit migrate_timers(int cpu)
1661} 1688}
1662#endif /* CONFIG_HOTPLUG_CPU */ 1689#endif /* CONFIG_HOTPLUG_CPU */
1663 1690
1664static int __devinit timer_cpu_notify(struct notifier_block *self, 1691static int __cpuinit timer_cpu_notify(struct notifier_block *self,
1665 unsigned long action, void *hcpu) 1692 unsigned long action, void *hcpu)
1666{ 1693{
1667 long cpu = (long)hcpu; 1694 long cpu = (long)hcpu;
@@ -1681,7 +1708,7 @@ static int __devinit timer_cpu_notify(struct notifier_block *self,
1681 return NOTIFY_OK; 1708 return NOTIFY_OK;
1682} 1709}
1683 1710
1684static struct notifier_block __devinitdata timers_nb = { 1711static struct notifier_block __cpuinitdata timers_nb = {
1685 .notifier_call = timer_cpu_notify, 1712 .notifier_call = timer_cpu_notify,
1686}; 1713};
1687 1714
diff --git a/kernel/wait.c b/kernel/wait.c
index a1d57aeb7f75..59a82f63275d 100644
--- a/kernel/wait.c
+++ b/kernel/wait.c
@@ -10,9 +10,13 @@
10#include <linux/wait.h> 10#include <linux/wait.h>
11#include <linux/hash.h> 11#include <linux/hash.h>
12 12
13struct lock_class_key waitqueue_lock_key; 13void init_waitqueue_head(wait_queue_head_t *q)
14{
15 spin_lock_init(&q->lock);
16 INIT_LIST_HEAD(&q->task_list);
17}
14 18
15EXPORT_SYMBOL(waitqueue_lock_key); 19EXPORT_SYMBOL(init_waitqueue_head);
16 20
17void fastcall add_wait_queue(wait_queue_head_t *q, wait_queue_t *wait) 21void fastcall add_wait_queue(wait_queue_head_t *q, wait_queue_t *wait)
18{ 22{
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index eebb1d839235..448e8f7b342d 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -93,9 +93,12 @@ static void __queue_work(struct cpu_workqueue_struct *cwq,
93 spin_unlock_irqrestore(&cwq->lock, flags); 93 spin_unlock_irqrestore(&cwq->lock, flags);
94} 94}
95 95
96/* 96/**
97 * Queue work on a workqueue. Return non-zero if it was successfully 97 * queue_work - queue work on a workqueue
98 * added. 98 * @wq: workqueue to use
99 * @work: work to queue
100 *
101 * Returns non-zero if it was successfully added.
99 * 102 *
100 * We queue the work to the CPU it was submitted, but there is no 103 * We queue the work to the CPU it was submitted, but there is no
101 * guarantee that it will be processed by that CPU. 104 * guarantee that it will be processed by that CPU.
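
The hunks in this file convert plain block comments into kernel-doc, which scripts/kernel-doc extracts into the generated API documentation. The canonical layout the patch adds looks like this; my_helper and @arg are placeholder names, not functions added by the patch:

/**
 * my_helper - one-line summary in kernel-doc form
 * @arg: what the parameter means
 *
 * Longer free-form description of behaviour and return value.
 */
static int my_helper(int arg)
{
        return arg;
}

int main(void)
{
        return my_helper(0);
}
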
@@ -128,6 +131,14 @@ static void delayed_work_timer_fn(unsigned long __data)
128 __queue_work(per_cpu_ptr(wq->cpu_wq, cpu), work); 131 __queue_work(per_cpu_ptr(wq->cpu_wq, cpu), work);
129} 132}
130 133
134/**
135 * queue_delayed_work - queue work on a workqueue after delay
136 * @wq: workqueue to use
137 * @work: work to queue
138 * @delay: number of jiffies to wait before queueing
139 *
140 * Returns non-zero if it was successfully added.
141 */
131int fastcall queue_delayed_work(struct workqueue_struct *wq, 142int fastcall queue_delayed_work(struct workqueue_struct *wq,
132 struct work_struct *work, unsigned long delay) 143 struct work_struct *work, unsigned long delay)
133{ 144{
@@ -150,6 +161,15 @@ int fastcall queue_delayed_work(struct workqueue_struct *wq,
150} 161}
151EXPORT_SYMBOL_GPL(queue_delayed_work); 162EXPORT_SYMBOL_GPL(queue_delayed_work);
152 163
164/**
165 * queue_delayed_work_on - queue work on specific CPU after delay
166 * @cpu: CPU number to execute work on
167 * @wq: workqueue to use
168 * @work: work to queue
169 * @delay: number of jiffies to wait before queueing
170 *
171 * Returns non-zero if it was successfully added.
172 */
153int queue_delayed_work_on(int cpu, struct workqueue_struct *wq, 173int queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
154 struct work_struct *work, unsigned long delay) 174 struct work_struct *work, unsigned long delay)
155{ 175{
@@ -275,8 +295,9 @@ static void flush_cpu_workqueue(struct cpu_workqueue_struct *cwq)
275 } 295 }
276} 296}
277 297
278/* 298/**
279 * flush_workqueue - ensure that any scheduled work has run to completion. 299 * flush_workqueue - ensure that any scheduled work has run to completion.
300 * @wq: workqueue to flush
280 * 301 *
281 * Forces execution of the workqueue and blocks until its completion. 302 * Forces execution of the workqueue and blocks until its completion.
282 * This is typically used in driver shutdown handlers. 303 * This is typically used in driver shutdown handlers.
@@ -400,6 +421,12 @@ static void cleanup_workqueue_thread(struct workqueue_struct *wq, int cpu)
400 kthread_stop(p); 421 kthread_stop(p);
401} 422}
402 423
424/**
425 * destroy_workqueue - safely terminate a workqueue
426 * @wq: target workqueue
427 *
428 * Safely destroy a workqueue. All work currently pending will be done first.
429 */
403void destroy_workqueue(struct workqueue_struct *wq) 430void destroy_workqueue(struct workqueue_struct *wq)
404{ 431{
405 int cpu; 432 int cpu;
@@ -425,18 +452,41 @@ EXPORT_SYMBOL_GPL(destroy_workqueue);
425 452
426static struct workqueue_struct *keventd_wq; 453static struct workqueue_struct *keventd_wq;
427 454
455/**
456 * schedule_work - put work task in global workqueue
457 * @work: job to be done
458 *
459 * This puts a job in the kernel-global workqueue.
460 */
428int fastcall schedule_work(struct work_struct *work) 461int fastcall schedule_work(struct work_struct *work)
429{ 462{
430 return queue_work(keventd_wq, work); 463 return queue_work(keventd_wq, work);
431} 464}
432EXPORT_SYMBOL(schedule_work); 465EXPORT_SYMBOL(schedule_work);
433 466
467/**
468 * schedule_delayed_work - put work task in global workqueue after delay
469 * @work: job to be done
470 * @delay: number of jiffies to wait
471 *
472 * After waiting for a given time this puts a job in the kernel-global
473 * workqueue.
474 */
434int fastcall schedule_delayed_work(struct work_struct *work, unsigned long delay) 475int fastcall schedule_delayed_work(struct work_struct *work, unsigned long delay)
435{ 476{
436 return queue_delayed_work(keventd_wq, work, delay); 477 return queue_delayed_work(keventd_wq, work, delay);
437} 478}
438EXPORT_SYMBOL(schedule_delayed_work); 479EXPORT_SYMBOL(schedule_delayed_work);
439 480
481/**
482 * schedule_delayed_work_on - queue work in global workqueue on CPU after delay
483 * @cpu: cpu to use
484 * @work: job to be done
485 * @delay: number of jiffies to wait
486 *
487 * After waiting for a given time this puts a job in the kernel-global
488 * workqueue on the specified CPU.
489 */
440int schedule_delayed_work_on(int cpu, 490int schedule_delayed_work_on(int cpu,
441 struct work_struct *work, unsigned long delay) 491 struct work_struct *work, unsigned long delay)
442{ 492{