author     Ingo Molnar <mingo@elte.hu>  2008-10-15 07:46:29 -0400
committer  Ingo Molnar <mingo@elte.hu>  2008-10-15 07:46:29 -0400
commit     b2aaf8f74cdc84a9182f6cabf198b7763bcb9d40 (patch)
tree       53ccb1c2c14751fe69cf93102e76e97021f6df07 /kernel
parent     4f962d4d65923d7b722192e729840cfb79af0a5a (diff)
parent     278429cff8809958d25415ba0ed32b59866ab1a8 (diff)
Merge branch 'linus' into stackprotector
Conflicts:
	arch/x86/kernel/Makefile
	include/asm-x86/pda.h
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Kconfig.hz | 2
-rw-r--r--  kernel/Makefile | 25
-rw-r--r--  kernel/acct.c | 224
-rw-r--r--  kernel/audit.c | 19
-rw-r--r--  kernel/auditfilter.c | 13
-rw-r--r--  kernel/auditsc.c | 24
-rw-r--r--  kernel/backtracetest.c | 65
-rw-r--r--  kernel/capability.c | 380
-rw-r--r--  kernel/cgroup.c | 336
-rw-r--r--  kernel/cpu.c | 138
-rw-r--r--  kernel/cpuset.c | 705
-rw-r--r--  kernel/delayacct.c | 16
-rw-r--r--  kernel/dma-coherent.c | 155
-rw-r--r--  kernel/exec_domain.c | 3
-rw-r--r--  kernel/exit.c | 614
-rw-r--r--  kernel/fork.c | 143
-rw-r--r--  kernel/hrtimer.c | 116
-rw-r--r--  kernel/irq/chip.c | 12
-rw-r--r--  kernel/irq/manage.c | 149
-rw-r--r--  kernel/irq/proc.c | 113
-rw-r--r--  kernel/kallsyms.c | 2
-rw-r--r--  kernel/kexec.c | 140
-rw-r--r--  kernel/kgdb.c | 107
-rw-r--r--  kernel/kmod.c | 15
-rw-r--r--  kernel/kprobes.c | 134
-rw-r--r--  kernel/kthread.c | 9
-rw-r--r--  kernel/lockdep.c | 383
-rw-r--r--  kernel/lockdep_internals.h | 25
-rw-r--r--  kernel/lockdep_proc.c | 145
-rw-r--r--  kernel/marker.c | 67
-rw-r--r--  kernel/module.c | 347
-rw-r--r--  kernel/mutex-debug.c | 2
-rw-r--r--  kernel/mutex.c | 6
-rw-r--r--  kernel/ns_cgroup.c | 8
-rw-r--r--  kernel/nsproxy.c | 9
-rw-r--r--  kernel/panic.c | 22
-rw-r--r--  kernel/pid.c | 11
-rw-r--r--  kernel/pid_namespace.c | 13
-rw-r--r--  kernel/pm_qos_params.c | 48
-rw-r--r--  kernel/posix-cpu-timers.c | 3
-rw-r--r--  kernel/posix-timers.c | 42
-rw-r--r--  kernel/power/Kconfig | 13
-rw-r--r--  kernel/power/disk.c | 63
-rw-r--r--  kernel/power/main.c | 222
-rw-r--r--  kernel/power/power.h | 2
-rw-r--r--  kernel/power/poweroff.c | 4
-rw-r--r--  kernel/power/process.c | 99
-rw-r--r--  kernel/power/snapshot.c | 88
-rw-r--r--  kernel/power/swap.c | 1
-rw-r--r--  kernel/power/user.c | 71
-rw-r--r--  kernel/printk.c | 151
-rw-r--r--  kernel/profile.c | 10
-rw-r--r--  kernel/ptrace.c | 53
-rw-r--r--  kernel/rcuclassic.c | 387
-rw-r--r--  kernel/rcupdate.c | 72
-rw-r--r--  kernel/rcupreempt.c | 456
-rw-r--r--  kernel/rcupreempt_trace.c | 8
-rw-r--r--  kernel/rcutorture.c | 174
-rw-r--r--  kernel/relay.c | 182
-rw-r--r--  kernel/res_counter.c | 48
-rw-r--r--  kernel/resource.c | 158
-rw-r--r--  kernel/rtmutex-tester.c | 7
-rw-r--r--  kernel/sched.c | 1684
-rw-r--r--  kernel/sched_clock.c | 175
-rw-r--r--  kernel/sched_cpupri.c | 174
-rw-r--r--  kernel/sched_cpupri.h | 36
-rw-r--r--  kernel/sched_debug.c | 66
-rw-r--r--  kernel/sched_fair.c | 494
-rw-r--r--  kernel/sched_features.h | 8
-rw-r--r--  kernel/sched_idletask.c | 6
-rw-r--r--  kernel/sched_rt.c | 515
-rw-r--r--  kernel/sched_stats.h | 42
-rw-r--r--  kernel/semaphore.c | 5
-rw-r--r--  kernel/signal.c | 183
-rw-r--r--  kernel/smp.c | 431
-rw-r--r--  kernel/softirq.c | 72
-rw-r--r--  kernel/softlockup.c | 74
-rw-r--r--  kernel/spinlock.c | 14
-rw-r--r--  kernel/stacktrace.c | 14
-rw-r--r--  kernel/stop_machine.c | 286
-rw-r--r--  kernel/sys.c | 49
-rw-r--r--  kernel/sys_ni.c | 8
-rw-r--r--  kernel/sysctl.c | 276
-rw-r--r--  kernel/sysctl_check.c | 2
-rw-r--r--  kernel/taskstats.c | 6
-rw-r--r--  kernel/time/clockevents.c | 15
-rw-r--r--  kernel/time/clocksource.c | 12
-rw-r--r--  kernel/time/ntp.c | 2
-rw-r--r--  kernel/time/tick-broadcast.c | 110
-rw-r--r--  kernel/time/tick-common.c | 29
-rw-r--r--  kernel/time/tick-internal.h | 11
-rw-r--r--  kernel/time/tick-oneshot.c | 44
-rw-r--r--  kernel/time/tick-sched.c | 57
-rw-r--r--  kernel/timer.c | 12
-rw-r--r--  kernel/trace/Kconfig | 135
-rw-r--r--  kernel/trace/Makefile | 24
-rw-r--r--  kernel/trace/ftrace.c | 1727
-rw-r--r--  kernel/trace/trace.c | 3157
-rw-r--r--  kernel/trace/trace.h | 339
-rw-r--r--  kernel/trace/trace_functions.c | 81
-rw-r--r--  kernel/trace/trace_irqsoff.c | 490
-rw-r--r--  kernel/trace/trace_mmiotrace.c | 295
-rw-r--r--  kernel/trace/trace_sched_switch.c | 286
-rw-r--r--  kernel/trace/trace_sched_wakeup.c | 453
-rw-r--r--  kernel/trace/trace_selftest.c | 563
-rw-r--r--  kernel/trace/trace_selftest_dynamic.c | 7
-rw-r--r--  kernel/trace/trace_sysprof.c | 363
-rw-r--r--  kernel/tsacct.c | 33
-rw-r--r--  kernel/user.c | 4
-rw-r--r--  kernel/user_namespace.c | 1
-rw-r--r--  kernel/utsname.c | 1
-rw-r--r--  kernel/utsname_sysctl.c | 1
-rw-r--r--  kernel/workqueue.c | 184
113 files changed, 16267 insertions, 3848 deletions
diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz
index 526128a2e622..94fabd534b03 100644
--- a/kernel/Kconfig.hz
+++ b/kernel/Kconfig.hz
@@ -55,4 +55,4 @@ config HZ
 	default 1000 if HZ_1000
 
 config SCHED_HRTICK
-	def_bool HIGH_RES_TIMERS && X86
+	def_bool HIGH_RES_TIMERS && (!SMP || USE_GENERIC_SMP_HELPERS)
diff --git a/kernel/Makefile b/kernel/Makefile
index 1c9938addb9d..4e1d7df7c3e2 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -2,8 +2,8 @@
 # Makefile for the linux kernel.
 #
 
-obj-y     = sched.o fork.o exec_domain.o panic.o printk.o profile.o \
-	    exit.o itimer.o time.o softirq.o resource.o \
+obj-y     = sched.o fork.o exec_domain.o panic.o printk.o \
+	    cpu.o exit.o itimer.o time.o softirq.o resource.o \
 	    sysctl.o capability.o ptrace.o timer.o user.o \
 	    signal.o sys.o kmod.o workqueue.o pid.o \
 	    rcupdate.o extable.o params.o posix-timers.o \
@@ -11,6 +11,20 @@ obj-y     = sched.o fork.o exec_domain.o panic.o printk.o profile.o \
 	    hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
 	    notifier.o ksysfs.o pm_qos_params.o sched_clock.o
 
+CFLAGS_REMOVE_sched.o = -mno-spe
+
+ifdef CONFIG_FTRACE
+# Do not trace debug files and internal ftrace files
+CFLAGS_REMOVE_lockdep.o = -pg
+CFLAGS_REMOVE_lockdep_proc.o = -pg
+CFLAGS_REMOVE_mutex-debug.o = -pg
+CFLAGS_REMOVE_rtmutex-debug.o = -pg
+CFLAGS_REMOVE_cgroup-debug.o = -pg
+CFLAGS_REMOVE_sched_clock.o = -pg
+CFLAGS_REMOVE_sched.o = -mno-spe -pg
+endif
+
+obj-$(CONFIG_PROFILING) += profile.o
 obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o
 obj-$(CONFIG_STACKTRACE) += stacktrace.o
 obj-y += time/
@@ -27,7 +41,8 @@ obj-$(CONFIG_RT_MUTEXES) += rtmutex.o
 obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o
 obj-$(CONFIG_RT_MUTEX_TESTER) += rtmutex-tester.o
 obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
-obj-$(CONFIG_SMP) += cpu.o spinlock.o
+obj-$(CONFIG_USE_GENERIC_SMP_HELPERS) += smp.o
+obj-$(CONFIG_SMP) += spinlock.o
 obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o
 obj-$(CONFIG_PROVE_LOCKING) += spinlock.o
 obj-$(CONFIG_UID16) += uid16.o
@@ -69,6 +84,10 @@ obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
 obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o
 obj-$(CONFIG_MARKERS) += marker.o
 obj-$(CONFIG_LATENCYTOP) += latencytop.o
+obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += dma-coherent.o
+obj-$(CONFIG_FTRACE) += trace/
+obj-$(CONFIG_TRACING) += trace/
+obj-$(CONFIG_SMP) += sched_cpupri.o
 
 ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
 # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/acct.c b/kernel/acct.c
index 91e1cfd734d2..f6006a60df5d 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -75,37 +75,39 @@ int acct_parm[3] = {4, 2, 30};
75/* 75/*
76 * External references and all of the globals. 76 * External references and all of the globals.
77 */ 77 */
78static void do_acct_process(struct pid_namespace *ns, struct file *); 78static void do_acct_process(struct bsd_acct_struct *acct,
79 struct pid_namespace *ns, struct file *);
79 80
80/* 81/*
81 * This structure is used so that all the data protected by lock 82 * This structure is used so that all the data protected by lock
82 * can be placed in the same cache line as the lock. This primes 83 * can be placed in the same cache line as the lock. This primes
83 * the cache line to have the data after getting the lock. 84 * the cache line to have the data after getting the lock.
84 */ 85 */
85struct acct_glbs { 86struct bsd_acct_struct {
86 spinlock_t lock;
87 volatile int active; 87 volatile int active;
88 volatile int needcheck; 88 volatile int needcheck;
89 struct file *file; 89 struct file *file;
90 struct pid_namespace *ns; 90 struct pid_namespace *ns;
91 struct timer_list timer; 91 struct timer_list timer;
92 struct list_head list;
92}; 93};
93 94
94static struct acct_glbs acct_globals __cacheline_aligned = 95static DEFINE_SPINLOCK(acct_lock);
95 {__SPIN_LOCK_UNLOCKED(acct_globals.lock)}; 96static LIST_HEAD(acct_list);
96 97
97/* 98/*
98 * Called whenever the timer says to check the free space. 99 * Called whenever the timer says to check the free space.
99 */ 100 */
100static void acct_timeout(unsigned long unused) 101static void acct_timeout(unsigned long x)
101{ 102{
102 acct_globals.needcheck = 1; 103 struct bsd_acct_struct *acct = (struct bsd_acct_struct *)x;
104 acct->needcheck = 1;
103} 105}
104 106
105/* 107/*
106 * Check the amount of free space and suspend/resume accordingly. 108 * Check the amount of free space and suspend/resume accordingly.
107 */ 109 */
108static int check_free_space(struct file *file) 110static int check_free_space(struct bsd_acct_struct *acct, struct file *file)
109{ 111{
110 struct kstatfs sbuf; 112 struct kstatfs sbuf;
111 int res; 113 int res;
@@ -113,11 +115,11 @@ static int check_free_space(struct file *file)
113 sector_t resume; 115 sector_t resume;
114 sector_t suspend; 116 sector_t suspend;
115 117
116 spin_lock(&acct_globals.lock); 118 spin_lock(&acct_lock);
117 res = acct_globals.active; 119 res = acct->active;
118 if (!file || !acct_globals.needcheck) 120 if (!file || !acct->needcheck)
119 goto out; 121 goto out;
120 spin_unlock(&acct_globals.lock); 122 spin_unlock(&acct_lock);
121 123
122 /* May block */ 124 /* May block */
123 if (vfs_statfs(file->f_path.dentry, &sbuf)) 125 if (vfs_statfs(file->f_path.dentry, &sbuf))
@@ -136,35 +138,35 @@ static int check_free_space(struct file *file)
136 act = 0; 138 act = 0;
137 139
138 /* 140 /*
139 * If some joker switched acct_globals.file under us we'ld better be 141 * If some joker switched acct->file under us we'ld better be
140 * silent and _not_ touch anything. 142 * silent and _not_ touch anything.
141 */ 143 */
142 spin_lock(&acct_globals.lock); 144 spin_lock(&acct_lock);
143 if (file != acct_globals.file) { 145 if (file != acct->file) {
144 if (act) 146 if (act)
145 res = act>0; 147 res = act>0;
146 goto out; 148 goto out;
147 } 149 }
148 150
149 if (acct_globals.active) { 151 if (acct->active) {
150 if (act < 0) { 152 if (act < 0) {
151 acct_globals.active = 0; 153 acct->active = 0;
152 printk(KERN_INFO "Process accounting paused\n"); 154 printk(KERN_INFO "Process accounting paused\n");
153 } 155 }
154 } else { 156 } else {
155 if (act > 0) { 157 if (act > 0) {
156 acct_globals.active = 1; 158 acct->active = 1;
157 printk(KERN_INFO "Process accounting resumed\n"); 159 printk(KERN_INFO "Process accounting resumed\n");
158 } 160 }
159 } 161 }
160 162
161 del_timer(&acct_globals.timer); 163 del_timer(&acct->timer);
162 acct_globals.needcheck = 0; 164 acct->needcheck = 0;
163 acct_globals.timer.expires = jiffies + ACCT_TIMEOUT*HZ; 165 acct->timer.expires = jiffies + ACCT_TIMEOUT*HZ;
164 add_timer(&acct_globals.timer); 166 add_timer(&acct->timer);
165 res = acct_globals.active; 167 res = acct->active;
166out: 168out:
167 spin_unlock(&acct_globals.lock); 169 spin_unlock(&acct_lock);
168 return res; 170 return res;
169} 171}
170 172
@@ -172,39 +174,41 @@ out:
172 * Close the old accounting file (if currently open) and then replace 174 * Close the old accounting file (if currently open) and then replace
173 * it with file (if non-NULL). 175 * it with file (if non-NULL).
174 * 176 *
175 * NOTE: acct_globals.lock MUST be held on entry and exit. 177 * NOTE: acct_lock MUST be held on entry and exit.
176 */ 178 */
177static void acct_file_reopen(struct file *file) 179static void acct_file_reopen(struct bsd_acct_struct *acct, struct file *file,
180 struct pid_namespace *ns)
178{ 181{
179 struct file *old_acct = NULL; 182 struct file *old_acct = NULL;
180 struct pid_namespace *old_ns = NULL; 183 struct pid_namespace *old_ns = NULL;
181 184
182 if (acct_globals.file) { 185 if (acct->file) {
183 old_acct = acct_globals.file; 186 old_acct = acct->file;
184 old_ns = acct_globals.ns; 187 old_ns = acct->ns;
185 del_timer(&acct_globals.timer); 188 del_timer(&acct->timer);
186 acct_globals.active = 0; 189 acct->active = 0;
187 acct_globals.needcheck = 0; 190 acct->needcheck = 0;
188 acct_globals.file = NULL; 191 acct->file = NULL;
192 acct->ns = NULL;
193 list_del(&acct->list);
189 } 194 }
190 if (file) { 195 if (file) {
191 acct_globals.file = file; 196 acct->file = file;
192 acct_globals.ns = get_pid_ns(task_active_pid_ns(current)); 197 acct->ns = ns;
193 acct_globals.needcheck = 0; 198 acct->needcheck = 0;
194 acct_globals.active = 1; 199 acct->active = 1;
200 list_add(&acct->list, &acct_list);
195 /* It's been deleted if it was used before so this is safe */ 201 /* It's been deleted if it was used before so this is safe */
196 init_timer(&acct_globals.timer); 202 setup_timer(&acct->timer, acct_timeout, (unsigned long)acct);
197 acct_globals.timer.function = acct_timeout; 203 acct->timer.expires = jiffies + ACCT_TIMEOUT*HZ;
198 acct_globals.timer.expires = jiffies + ACCT_TIMEOUT*HZ; 204 add_timer(&acct->timer);
199 add_timer(&acct_globals.timer);
200 } 205 }
201 if (old_acct) { 206 if (old_acct) {
202 mnt_unpin(old_acct->f_path.mnt); 207 mnt_unpin(old_acct->f_path.mnt);
203 spin_unlock(&acct_globals.lock); 208 spin_unlock(&acct_lock);
204 do_acct_process(old_ns, old_acct); 209 do_acct_process(acct, old_ns, old_acct);
205 filp_close(old_acct, NULL); 210 filp_close(old_acct, NULL);
206 put_pid_ns(old_ns); 211 spin_lock(&acct_lock);
207 spin_lock(&acct_globals.lock);
208 } 212 }
209} 213}
210 214
@@ -212,6 +216,8 @@ static int acct_on(char *name)
212{ 216{
213 struct file *file; 217 struct file *file;
214 int error; 218 int error;
219 struct pid_namespace *ns;
220 struct bsd_acct_struct *acct = NULL;
215 221
216 /* Difference from BSD - they don't do O_APPEND */ 222 /* Difference from BSD - they don't do O_APPEND */
217 file = filp_open(name, O_WRONLY|O_APPEND|O_LARGEFILE, 0); 223 file = filp_open(name, O_WRONLY|O_APPEND|O_LARGEFILE, 0);
@@ -228,18 +234,34 @@ static int acct_on(char *name)
228 return -EIO; 234 return -EIO;
229 } 235 }
230 236
237 ns = task_active_pid_ns(current);
238 if (ns->bacct == NULL) {
239 acct = kzalloc(sizeof(struct bsd_acct_struct), GFP_KERNEL);
240 if (acct == NULL) {
241 filp_close(file, NULL);
242 return -ENOMEM;
243 }
244 }
245
231 error = security_acct(file); 246 error = security_acct(file);
232 if (error) { 247 if (error) {
248 kfree(acct);
233 filp_close(file, NULL); 249 filp_close(file, NULL);
234 return error; 250 return error;
235 } 251 }
236 252
237 spin_lock(&acct_globals.lock); 253 spin_lock(&acct_lock);
254 if (ns->bacct == NULL) {
255 ns->bacct = acct;
256 acct = NULL;
257 }
258
238 mnt_pin(file->f_path.mnt); 259 mnt_pin(file->f_path.mnt);
239 acct_file_reopen(file); 260 acct_file_reopen(ns->bacct, file, ns);
240 spin_unlock(&acct_globals.lock); 261 spin_unlock(&acct_lock);
241 262
242 mntput(file->f_path.mnt); /* it's pinned, now give up active reference */ 263 mntput(file->f_path.mnt); /* it's pinned, now give up active reference */
264 kfree(acct);
243 265
244 return 0; 266 return 0;
245} 267}
@@ -269,11 +291,17 @@ asmlinkage long sys_acct(const char __user *name)
269 error = acct_on(tmp); 291 error = acct_on(tmp);
270 putname(tmp); 292 putname(tmp);
271 } else { 293 } else {
294 struct bsd_acct_struct *acct;
295
296 acct = task_active_pid_ns(current)->bacct;
297 if (acct == NULL)
298 return 0;
299
272 error = security_acct(NULL); 300 error = security_acct(NULL);
273 if (!error) { 301 if (!error) {
274 spin_lock(&acct_globals.lock); 302 spin_lock(&acct_lock);
275 acct_file_reopen(NULL); 303 acct_file_reopen(acct, NULL, NULL);
276 spin_unlock(&acct_globals.lock); 304 spin_unlock(&acct_lock);
277 } 305 }
278 } 306 }
279 return error; 307 return error;
@@ -288,10 +316,16 @@ asmlinkage long sys_acct(const char __user *name)
288 */ 316 */
289void acct_auto_close_mnt(struct vfsmount *m) 317void acct_auto_close_mnt(struct vfsmount *m)
290{ 318{
291 spin_lock(&acct_globals.lock); 319 struct bsd_acct_struct *acct;
292 if (acct_globals.file && acct_globals.file->f_path.mnt == m) 320
293 acct_file_reopen(NULL); 321 spin_lock(&acct_lock);
294 spin_unlock(&acct_globals.lock); 322restart:
323 list_for_each_entry(acct, &acct_list, list)
324 if (acct->file && acct->file->f_path.mnt == m) {
325 acct_file_reopen(acct, NULL, NULL);
326 goto restart;
327 }
328 spin_unlock(&acct_lock);
295} 329}
296 330
297/** 331/**
@@ -303,12 +337,31 @@ void acct_auto_close_mnt(struct vfsmount *m)
303 */ 337 */
304void acct_auto_close(struct super_block *sb) 338void acct_auto_close(struct super_block *sb)
305{ 339{
306 spin_lock(&acct_globals.lock); 340 struct bsd_acct_struct *acct;
307 if (acct_globals.file && 341
308 acct_globals.file->f_path.mnt->mnt_sb == sb) { 342 spin_lock(&acct_lock);
309 acct_file_reopen(NULL); 343restart:
344 list_for_each_entry(acct, &acct_list, list)
345 if (acct->file && acct->file->f_path.mnt->mnt_sb == sb) {
346 acct_file_reopen(acct, NULL, NULL);
347 goto restart;
348 }
349 spin_unlock(&acct_lock);
350}
351
352void acct_exit_ns(struct pid_namespace *ns)
353{
354 struct bsd_acct_struct *acct;
355
356 spin_lock(&acct_lock);
357 acct = ns->bacct;
358 if (acct != NULL) {
359 if (acct->file != NULL)
360 acct_file_reopen(acct, NULL, NULL);
361
362 kfree(acct);
310 } 363 }
311 spin_unlock(&acct_globals.lock); 364 spin_unlock(&acct_lock);
312} 365}
313 366
314/* 367/*
@@ -425,7 +478,8 @@ static u32 encode_float(u64 value)
425/* 478/*
426 * do_acct_process does all actual work. Caller holds the reference to file. 479 * do_acct_process does all actual work. Caller holds the reference to file.
427 */ 480 */
428static void do_acct_process(struct pid_namespace *ns, struct file *file) 481static void do_acct_process(struct bsd_acct_struct *acct,
482 struct pid_namespace *ns, struct file *file)
429{ 483{
430 struct pacct_struct *pacct = &current->signal->pacct; 484 struct pacct_struct *pacct = &current->signal->pacct;
431 acct_t ac; 485 acct_t ac;
@@ -440,7 +494,7 @@ static void do_acct_process(struct pid_namespace *ns, struct file *file)
440 * First check to see if there is enough free_space to continue 494 * First check to see if there is enough free_space to continue
441 * the process accounting system. 495 * the process accounting system.
442 */ 496 */
443 if (!check_free_space(file)) 497 if (!check_free_space(acct, file))
444 return; 498 return;
445 499
446 /* 500 /*
@@ -494,7 +548,7 @@ static void do_acct_process(struct pid_namespace *ns, struct file *file)
494#endif 548#endif
495 549
496 spin_lock_irq(&current->sighand->siglock); 550 spin_lock_irq(&current->sighand->siglock);
497 tty = current->signal->tty; 551 tty = current->signal->tty; /* Safe as we hold the siglock */
498 ac.ac_tty = tty ? old_encode_dev(tty_devnum(tty)) : 0; 552 ac.ac_tty = tty ? old_encode_dev(tty_devnum(tty)) : 0;
499 ac.ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime))); 553 ac.ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime)));
500 ac.ac_stime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_stime))); 554 ac.ac_stime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_stime)));
@@ -577,34 +631,46 @@ void acct_collect(long exitcode, int group_dead)
577 spin_unlock_irq(&current->sighand->siglock); 631 spin_unlock_irq(&current->sighand->siglock);
578} 632}
579 633
580/** 634static void acct_process_in_ns(struct pid_namespace *ns)
581 * acct_process - now just a wrapper around do_acct_process
582 * @exitcode: task exit code
583 *
584 * handles process accounting for an exiting task
585 */
586void acct_process(void)
587{ 635{
588 struct file *file = NULL; 636 struct file *file = NULL;
589 struct pid_namespace *ns; 637 struct bsd_acct_struct *acct;
590 638
639 acct = ns->bacct;
591 /* 640 /*
592 * accelerate the common fastpath: 641 * accelerate the common fastpath:
593 */ 642 */
594 if (!acct_globals.file) 643 if (!acct || !acct->file)
595 return; 644 return;
596 645
597 spin_lock(&acct_globals.lock); 646 spin_lock(&acct_lock);
598 file = acct_globals.file; 647 file = acct->file;
599 if (unlikely(!file)) { 648 if (unlikely(!file)) {
600 spin_unlock(&acct_globals.lock); 649 spin_unlock(&acct_lock);
601 return; 650 return;
602 } 651 }
603 get_file(file); 652 get_file(file);
604 ns = get_pid_ns(acct_globals.ns); 653 spin_unlock(&acct_lock);
605 spin_unlock(&acct_globals.lock);
606 654
607 do_acct_process(ns, file); 655 do_acct_process(acct, ns, file);
608 fput(file); 656 fput(file);
609 put_pid_ns(ns); 657}
658
659/**
660 * acct_process - now just a wrapper around acct_process_in_ns,
661 * which in turn is a wrapper around do_acct_process.
662 *
663 * handles process accounting for an exiting task
664 */
665void acct_process(void)
666{
667 struct pid_namespace *ns;
668
669 /*
670 * This loop is safe lockless, since current is still
671 * alive and holds its namespace, which in turn holds
672 * its parent.
673 */
674 for (ns = task_active_pid_ns(current); ns != NULL; ns = ns->parent)
675 acct_process_in_ns(ns);
610} 676}
diff --git a/kernel/audit.c b/kernel/audit.c
index e8692a5748c2..4414e93d8750 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -707,12 +707,14 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 		if (status_get->mask & AUDIT_STATUS_ENABLED) {
 			err = audit_set_enabled(status_get->enabled,
 						loginuid, sessionid, sid);
-			if (err < 0) return err;
+			if (err < 0)
+				return err;
 		}
 		if (status_get->mask & AUDIT_STATUS_FAILURE) {
 			err = audit_set_failure(status_get->failure,
 						loginuid, sessionid, sid);
-			if (err < 0) return err;
+			if (err < 0)
+				return err;
 		}
 		if (status_get->mask & AUDIT_STATUS_PID) {
 			int new_pid = status_get->pid;
@@ -725,9 +727,12 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 				audit_pid = new_pid;
 			audit_nlk_pid = NETLINK_CB(skb).pid;
 		}
-		if (status_get->mask & AUDIT_STATUS_RATE_LIMIT)
+		if (status_get->mask & AUDIT_STATUS_RATE_LIMIT) {
 			err = audit_set_rate_limit(status_get->rate_limit,
 						   loginuid, sessionid, sid);
+			if (err < 0)
+				return err;
+		}
 		if (status_get->mask & AUDIT_STATUS_BACKLOG_LIMIT)
 			err = audit_set_backlog_limit(status_get->backlog_limit,
 						      loginuid, sessionid, sid);
@@ -738,7 +743,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 		if (!audit_enabled && msg_type != AUDIT_USER_AVC)
 			return 0;
 
-		err = audit_filter_user(&NETLINK_CB(skb), msg_type);
+		err = audit_filter_user(&NETLINK_CB(skb));
 		if (err == 1) {
 			err = 0;
 			if (msg_type == AUDIT_USER_TTY) {
@@ -779,7 +784,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 		}
 		/* fallthrough */
 	case AUDIT_LIST:
-		err = audit_receive_filter(nlh->nlmsg_type, NETLINK_CB(skb).pid,
+		err = audit_receive_filter(msg_type, NETLINK_CB(skb).pid,
 					   uid, seq, data, nlmsg_len(nlh),
 					   loginuid, sessionid, sid);
 		break;
@@ -798,7 +803,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 		}
 		/* fallthrough */
 	case AUDIT_LIST_RULES:
-		err = audit_receive_filter(nlh->nlmsg_type, NETLINK_CB(skb).pid,
+		err = audit_receive_filter(msg_type, NETLINK_CB(skb).pid,
 					   uid, seq, data, nlmsg_len(nlh),
 					   loginuid, sessionid, sid);
 		break;
@@ -1366,7 +1371,7 @@ int audit_string_contains_control(const char *string, size_t len)
 {
 	const unsigned char *p;
 	for (p = string; p < (const unsigned char *)string + len && *p; p++) {
-		if (*p == '"' || *p < 0x21 || *p > 0x7f)
+		if (*p == '"' || *p < 0x21 || *p > 0x7e)
 			return 1;
 	}
 	return 0;
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 0e0bd27e6512..b7d354e2b0ef 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -1022,8 +1022,11 @@ static void audit_update_watch(struct audit_parent *parent,
 			struct audit_buffer *ab;
 			ab = audit_log_start(NULL, GFP_KERNEL,
 				AUDIT_CONFIG_CHANGE);
+			audit_log_format(ab, "auid=%u ses=%u",
+				audit_get_loginuid(current),
+				audit_get_sessionid(current));
 			audit_log_format(ab,
-				"op=updated rules specifying path=");
+				" op=updated rules specifying path=");
 			audit_log_untrustedstring(ab, owatch->path);
 			audit_log_format(ab, " with dev=%u ino=%lu\n",
 				 dev, ino);
@@ -1058,7 +1061,10 @@ static void audit_remove_parent_watches(struct audit_parent *parent)
 			struct audit_buffer *ab;
 			ab = audit_log_start(NULL, GFP_KERNEL,
 				AUDIT_CONFIG_CHANGE);
-			audit_log_format(ab, "op=remove rule path=");
+			audit_log_format(ab, "auid=%u ses=%u",
+				audit_get_loginuid(current),
+				audit_get_sessionid(current));
+			audit_log_format(ab, " op=remove rule path=");
 			audit_log_untrustedstring(ab, w->path);
 			if (r->filterkey) {
 				audit_log_format(ab, " key=");
@@ -1544,6 +1550,7 @@ static void audit_log_rule_change(uid_t loginuid, u32 sessionid, u32 sid,
  * @data: payload data
  * @datasz: size of payload data
  * @loginuid: loginuid of sender
+ * @sessionid: sessionid for netlink audit message
  * @sid: SE Linux Security ID of sender
  */
 int audit_receive_filter(int type, int pid, int uid, int seq, void *data,
@@ -1720,7 +1727,7 @@ static int audit_filter_user_rules(struct netlink_skb_parms *cb,
 	return 1;
 }
 
-int audit_filter_user(struct netlink_skb_parms *cb, int type)
+int audit_filter_user(struct netlink_skb_parms *cb)
 {
 	enum audit_state state = AUDIT_DISABLED;
 	struct audit_entry *e;
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index c10e7aae04d7..cf5bc2f5f9c3 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -243,7 +243,11 @@ static inline int open_arg(int flags, int mask)
 
 static int audit_match_perm(struct audit_context *ctx, int mask)
 {
-	unsigned n = ctx->major;
+	unsigned n;
+	if (unlikely(!ctx))
+		return 0;
+	n = ctx->major;
+
 	switch (audit_classify_syscall(ctx->arch, n)) {
 	case 0:	/* native */
 		if ((mask & AUDIT_PERM_WRITE) &&
@@ -284,6 +288,10 @@ static int audit_match_filetype(struct audit_context *ctx, int which)
 {
 	unsigned index = which & ~S_IFMT;
 	mode_t mode = which & S_IFMT;
+
+	if (unlikely(!ctx))
+		return 0;
+
 	if (index >= ctx->name_count)
 		return 0;
 	if (ctx->names[index].ino == -1)
@@ -610,7 +618,7 @@ static int audit_filter_rules(struct task_struct *tsk,
 		if (!result)
 			return 0;
 	}
-	if (rule->filterkey)
+	if (rule->filterkey && ctx)
 		ctx->filterkey = kstrdup(rule->filterkey, GFP_ATOMIC);
 	switch (rule->action) {
 	case AUDIT_NEVER:    *state = AUDIT_DISABLED;	    break;
@@ -1196,13 +1204,13 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
 		 (context->return_valid==AUDITSC_SUCCESS)?"yes":"no",
 		 context->return_code);
 
-	mutex_lock(&tty_mutex);
-	read_lock(&tasklist_lock);
+	spin_lock_irq(&tsk->sighand->siglock);
 	if (tsk->signal && tsk->signal->tty && tsk->signal->tty->name)
 		tty = tsk->signal->tty->name;
 	else
 		tty = "(none)";
-	read_unlock(&tasklist_lock);
+	spin_unlock_irq(&tsk->sighand->siglock);
+
 	audit_log_format(ab,
 		  " a0=%lx a1=%lx a2=%lx a3=%lx items=%d"
 		  " ppid=%d pid=%d auid=%u uid=%u gid=%u"
@@ -1222,7 +1230,6 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
 		  context->egid, context->sgid, context->fsgid, tty,
 		  tsk->sessionid);
 
-	mutex_unlock(&tty_mutex);
 
 	audit_log_task_info(ab, tsk);
 	if (context->filterkey) {
@@ -1476,7 +1483,8 @@ void audit_syscall_entry(int arch, int major,
 	struct audit_context *context = tsk->audit_context;
 	enum audit_state state;
 
-	BUG_ON(!context);
+	if (unlikely(!context))
+		return;
 
 	/*
 	 * This happens only on certain architectures that make system
@@ -2374,7 +2382,7 @@ int __audit_signal_info(int sig, struct task_struct *t)
 	struct audit_context *ctx = tsk->audit_context;
 
 	if (audit_pid && t->tgid == audit_pid) {
-		if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1) {
+		if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1 || sig == SIGUSR2) {
 			audit_sig_pid = tsk->pid;
 			if (tsk->loginuid != -1)
 				audit_sig_uid = tsk->loginuid;
diff --git a/kernel/backtracetest.c b/kernel/backtracetest.c
index d1a7605c5b8f..a5e026bc45c4 100644
--- a/kernel/backtracetest.c
+++ b/kernel/backtracetest.c
@@ -10,30 +10,73 @@
  * of the License.
  */
 
+#include <linux/completion.h>
+#include <linux/delay.h>
+#include <linux/interrupt.h>
 #include <linux/module.h>
 #include <linux/sched.h>
-#include <linux/delay.h>
+#include <linux/stacktrace.h>
+
+static void backtrace_test_normal(void)
+{
+	printk("Testing a backtrace from process context.\n");
+	printk("The following trace is a kernel self test and not a bug!\n");
 
-static struct timer_list backtrace_timer;
+	dump_stack();
+}
 
-static void backtrace_test_timer(unsigned long data)
+static DECLARE_COMPLETION(backtrace_work);
+
+static void backtrace_test_irq_callback(unsigned long data)
+{
+	dump_stack();
+	complete(&backtrace_work);
+}
+
+static DECLARE_TASKLET(backtrace_tasklet, &backtrace_test_irq_callback, 0);
+
+static void backtrace_test_irq(void)
 {
 	printk("Testing a backtrace from irq context.\n");
 	printk("The following trace is a kernel self test and not a bug!\n");
-	dump_stack();
+
+	init_completion(&backtrace_work);
+	tasklet_schedule(&backtrace_tasklet);
+	wait_for_completion(&backtrace_work);
+}
+
+#ifdef CONFIG_STACKTRACE
+static void backtrace_test_saved(void)
+{
+	struct stack_trace trace;
+	unsigned long entries[8];
+
+	printk("Testing a saved backtrace.\n");
+	printk("The following trace is a kernel self test and not a bug!\n");
+
+	trace.nr_entries = 0;
+	trace.max_entries = ARRAY_SIZE(entries);
+	trace.entries = entries;
+	trace.skip = 0;
+
+	save_stack_trace(&trace);
+	print_stack_trace(&trace, 0);
+}
+#else
+static void backtrace_test_saved(void)
+{
+	printk("Saved backtrace test skipped.\n");
 }
+#endif
+
 static int backtrace_regression_test(void)
 {
 	printk("====[ backtrace testing ]===========\n");
-	printk("Testing a backtrace from process context.\n");
-	printk("The following trace is a kernel self test and not a bug!\n");
-	dump_stack();
 
-	init_timer(&backtrace_timer);
-	backtrace_timer.function = backtrace_test_timer;
-	mod_timer(&backtrace_timer, jiffies + 10);
+	backtrace_test_normal();
+	backtrace_test_irq();
+	backtrace_test_saved();
 
-	msleep(10);
 	printk("====[ end of backtrace testing ]====\n");
 	return 0;
 }
diff --git a/kernel/capability.c b/kernel/capability.c
index cfbe44299488..33e51e78c2d8 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -115,11 +115,229 @@ static int cap_validate_magic(cap_user_header_t header, unsigned *tocopy)
115 return 0; 115 return 0;
116} 116}
117 117
118#ifndef CONFIG_SECURITY_FILE_CAPABILITIES
119
120/*
121 * Without filesystem capability support, we nominally support one process
122 * setting the capabilities of another
123 */
124static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp,
125 kernel_cap_t *pIp, kernel_cap_t *pPp)
126{
127 struct task_struct *target;
128 int ret;
129
130 spin_lock(&task_capability_lock);
131 read_lock(&tasklist_lock);
132
133 if (pid && pid != task_pid_vnr(current)) {
134 target = find_task_by_vpid(pid);
135 if (!target) {
136 ret = -ESRCH;
137 goto out;
138 }
139 } else
140 target = current;
141
142 ret = security_capget(target, pEp, pIp, pPp);
143
144out:
145 read_unlock(&tasklist_lock);
146 spin_unlock(&task_capability_lock);
147
148 return ret;
149}
150
151/*
152 * cap_set_pg - set capabilities for all processes in a given process
153 * group. We call this holding task_capability_lock and tasklist_lock.
154 */
155static inline int cap_set_pg(int pgrp_nr, kernel_cap_t *effective,
156 kernel_cap_t *inheritable,
157 kernel_cap_t *permitted)
158{
159 struct task_struct *g, *target;
160 int ret = -EPERM;
161 int found = 0;
162 struct pid *pgrp;
163
164 spin_lock(&task_capability_lock);
165 read_lock(&tasklist_lock);
166
167 pgrp = find_vpid(pgrp_nr);
168 do_each_pid_task(pgrp, PIDTYPE_PGID, g) {
169 target = g;
170 while_each_thread(g, target) {
171 if (!security_capset_check(target, effective,
172 inheritable, permitted)) {
173 security_capset_set(target, effective,
174 inheritable, permitted);
175 ret = 0;
176 }
177 found = 1;
178 }
179 } while_each_pid_task(pgrp, PIDTYPE_PGID, g);
180
181 read_unlock(&tasklist_lock);
182 spin_unlock(&task_capability_lock);
183
184 if (!found)
185 ret = 0;
186 return ret;
187}
188
189/*
190 * cap_set_all - set capabilities for all processes other than init
191 * and self. We call this holding task_capability_lock and tasklist_lock.
192 */
193static inline int cap_set_all(kernel_cap_t *effective,
194 kernel_cap_t *inheritable,
195 kernel_cap_t *permitted)
196{
197 struct task_struct *g, *target;
198 int ret = -EPERM;
199 int found = 0;
200
201 spin_lock(&task_capability_lock);
202 read_lock(&tasklist_lock);
203
204 do_each_thread(g, target) {
205 if (target == current
206 || is_container_init(target->group_leader))
207 continue;
208 found = 1;
209 if (security_capset_check(target, effective, inheritable,
210 permitted))
211 continue;
212 ret = 0;
213 security_capset_set(target, effective, inheritable, permitted);
214 } while_each_thread(g, target);
215
216 read_unlock(&tasklist_lock);
217 spin_unlock(&task_capability_lock);
218
219 if (!found)
220 ret = 0;
221
222 return ret;
223}
224
225/*
226 * Given the target pid does not refer to the current process we
227 * need more elaborate support... (This support is not present when
228 * filesystem capabilities are configured.)
229 */
230static inline int do_sys_capset_other_tasks(pid_t pid, kernel_cap_t *effective,
231 kernel_cap_t *inheritable,
232 kernel_cap_t *permitted)
233{
234 struct task_struct *target;
235 int ret;
236
237 if (!capable(CAP_SETPCAP))
238 return -EPERM;
239
240 if (pid == -1) /* all procs other than current and init */
241 return cap_set_all(effective, inheritable, permitted);
242
243 else if (pid < 0) /* all procs in process group */
244 return cap_set_pg(-pid, effective, inheritable, permitted);
245
246 /* target != current */
247 spin_lock(&task_capability_lock);
248 read_lock(&tasklist_lock);
249
250 target = find_task_by_vpid(pid);
251 if (!target)
252 ret = -ESRCH;
253 else {
254 ret = security_capset_check(target, effective, inheritable,
255 permitted);
256
257 /* having verified that the proposed changes are legal,
258 we now put them into effect. */
259 if (!ret)
260 security_capset_set(target, effective, inheritable,
261 permitted);
262 }
263
264 read_unlock(&tasklist_lock);
265 spin_unlock(&task_capability_lock);
266
267 return ret;
268}
269
270#else /* ie., def CONFIG_SECURITY_FILE_CAPABILITIES */
271
272/*
273 * If we have configured with filesystem capability support, then the
274 * only thing that can change the capabilities of the current process
275 * is the current process. As such, we can't be in this code at the
276 * same time as we are in the process of setting capabilities in this
277 * process. The net result is that we can limit our use of locks to
278 * when we are reading the caps of another process.
279 */
280static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp,
281 kernel_cap_t *pIp, kernel_cap_t *pPp)
282{
283 int ret;
284
285 if (pid && (pid != task_pid_vnr(current))) {
286 struct task_struct *target;
287
288 spin_lock(&task_capability_lock);
289 read_lock(&tasklist_lock);
290
291 target = find_task_by_vpid(pid);
292 if (!target)
293 ret = -ESRCH;
294 else
295 ret = security_capget(target, pEp, pIp, pPp);
296
297 read_unlock(&tasklist_lock);
298 spin_unlock(&task_capability_lock);
299 } else
300 ret = security_capget(current, pEp, pIp, pPp);
301
302 return ret;
303}
304
118/* 305/*
119 * For sys_getproccap() and sys_setproccap(), any of the three 306 * With filesystem capability support configured, the kernel does not
120 * capability set pointers may be NULL -- indicating that that set is 307 * permit the changing of capabilities in one process by another
121 * uninteresting and/or not to be changed. 308 * process. (CAP_SETPCAP has much less broad semantics when configured
309 * this way.)
122 */ 310 */
311static inline int do_sys_capset_other_tasks(pid_t pid,
312 kernel_cap_t *effective,
313 kernel_cap_t *inheritable,
314 kernel_cap_t *permitted)
315{
316 return -EPERM;
317}
318
319#endif /* ie., ndef CONFIG_SECURITY_FILE_CAPABILITIES */
320
321/*
322 * Atomically modify the effective capabilities returning the original
323 * value. No permission check is performed here - it is assumed that the
324 * caller is permitted to set the desired effective capabilities.
325 */
326kernel_cap_t cap_set_effective(const kernel_cap_t pE_new)
327{
328 kernel_cap_t pE_old;
329
330 spin_lock(&task_capability_lock);
331
332 pE_old = current->cap_effective;
333 current->cap_effective = pE_new;
334
335 spin_unlock(&task_capability_lock);
336
337 return pE_old;
338}
339
340EXPORT_SYMBOL(cap_set_effective);
123 341
124/** 342/**
125 * sys_capget - get the capabilities of a given process. 343 * sys_capget - get the capabilities of a given process.
@@ -134,7 +352,6 @@ asmlinkage long sys_capget(cap_user_header_t header, cap_user_data_t dataptr)
134{ 352{
135 int ret = 0; 353 int ret = 0;
136 pid_t pid; 354 pid_t pid;
137 struct task_struct *target;
138 unsigned tocopy; 355 unsigned tocopy;
139 kernel_cap_t pE, pI, pP; 356 kernel_cap_t pE, pI, pP;
140 357
@@ -148,23 +365,7 @@ asmlinkage long sys_capget(cap_user_header_t header, cap_user_data_t dataptr)
148 if (pid < 0) 365 if (pid < 0)
149 return -EINVAL; 366 return -EINVAL;
150 367
151 spin_lock(&task_capability_lock); 368 ret = cap_get_target_pid(pid, &pE, &pI, &pP);
152 read_lock(&tasklist_lock);
153
154 if (pid && pid != task_pid_vnr(current)) {
155 target = find_task_by_vpid(pid);
156 if (!target) {
157 ret = -ESRCH;
158 goto out;
159 }
160 } else
161 target = current;
162
163 ret = security_capget(target, &pE, &pI, &pP);
164
165out:
166 read_unlock(&tasklist_lock);
167 spin_unlock(&task_capability_lock);
168 369
169 if (!ret) { 370 if (!ret) {
170 struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S]; 371 struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S];
@@ -195,7 +396,6 @@ out:
195 * before modification is attempted and the application 396 * before modification is attempted and the application
196 * fails. 397 * fails.
197 */ 398 */
198
199 if (copy_to_user(dataptr, kdata, tocopy 399 if (copy_to_user(dataptr, kdata, tocopy
200 * sizeof(struct __user_cap_data_struct))) { 400 * sizeof(struct __user_cap_data_struct))) {
201 return -EFAULT; 401 return -EFAULT;
@@ -205,70 +405,8 @@ out:
205 return ret; 405 return ret;
206} 406}
207 407
208/*
209 * cap_set_pg - set capabilities for all processes in a given process
210 * group. We call this holding task_capability_lock and tasklist_lock.
211 */
212static inline int cap_set_pg(int pgrp_nr, kernel_cap_t *effective,
213 kernel_cap_t *inheritable,
214 kernel_cap_t *permitted)
215{
216 struct task_struct *g, *target;
217 int ret = -EPERM;
218 int found = 0;
219 struct pid *pgrp;
220
221 pgrp = find_vpid(pgrp_nr);
222 do_each_pid_task(pgrp, PIDTYPE_PGID, g) {
223 target = g;
224 while_each_thread(g, target) {
225 if (!security_capset_check(target, effective,
226 inheritable,
227 permitted)) {
228 security_capset_set(target, effective,
229 inheritable,
230 permitted);
231 ret = 0;
232 }
233 found = 1;
234 }
235 } while_each_pid_task(pgrp, PIDTYPE_PGID, g);
236
237 if (!found)
238 ret = 0;
239 return ret;
240}
241
242/*
243 * cap_set_all - set capabilities for all processes other than init
244 * and self. We call this holding task_capability_lock and tasklist_lock.
245 */
246static inline int cap_set_all(kernel_cap_t *effective,
247 kernel_cap_t *inheritable,
248 kernel_cap_t *permitted)
249{
250 struct task_struct *g, *target;
251 int ret = -EPERM;
252 int found = 0;
253
254 do_each_thread(g, target) {
255 if (target == current || is_container_init(target->group_leader))
256 continue;
257 found = 1;
258 if (security_capset_check(target, effective, inheritable,
259 permitted))
260 continue;
261 ret = 0;
262 security_capset_set(target, effective, inheritable, permitted);
263 } while_each_thread(g, target);
264
265 if (!found)
266 ret = 0;
267 return ret;
268}
269
270/** 408/**
271 * sys_capset - set capabilities for a process or a group of processes 409 * sys_capset - set capabilities for a process or (*) a group of processes
272 * @header: pointer to struct that contains capability version and 410 * @header: pointer to struct that contains capability version and
273 * target pid data 411 * target pid data
274 * @data: pointer to struct that contains the effective, permitted, 412 * @data: pointer to struct that contains the effective, permitted,
@@ -292,7 +430,6 @@ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data)
292 struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S]; 430 struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S];
293 unsigned i, tocopy; 431 unsigned i, tocopy;
294 kernel_cap_t inheritable, permitted, effective; 432 kernel_cap_t inheritable, permitted, effective;
295 struct task_struct *target;
296 int ret; 433 int ret;
297 pid_t pid; 434 pid_t pid;
298 435
@@ -303,9 +440,6 @@ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data)
303 if (get_user(pid, &header->pid)) 440 if (get_user(pid, &header->pid))
304 return -EFAULT; 441 return -EFAULT;
305 442
306 if (pid && pid != task_pid_vnr(current) && !capable(CAP_SETPCAP))
307 return -EPERM;
308
309 if (copy_from_user(&kdata, data, tocopy 443 if (copy_from_user(&kdata, data, tocopy
310 * sizeof(struct __user_cap_data_struct))) { 444 * sizeof(struct __user_cap_data_struct))) {
311 return -EFAULT; 445 return -EFAULT;
@@ -323,55 +457,51 @@ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data)
323 i++; 457 i++;
324 } 458 }
325 459
326 spin_lock(&task_capability_lock); 460 if (pid && (pid != task_pid_vnr(current)))
327 read_lock(&tasklist_lock); 461 ret = do_sys_capset_other_tasks(pid, &effective, &inheritable,
328 462 &permitted);
329 if (pid > 0 && pid != task_pid_vnr(current)) { 463 else {
330 target = find_task_by_vpid(pid); 464 /*
331 if (!target) { 465 * This lock is required even when filesystem
332 ret = -ESRCH; 466 * capability support is configured - it protects the
333 goto out; 467 * sys_capget() call from returning incorrect data in
334 } 468 * the case that the targeted process is not the
335 } else 469 * current one.
336 target = current; 470 */
337 471 spin_lock(&task_capability_lock);
338 ret = 0;
339
340 /* having verified that the proposed changes are legal,
341 we now put them into effect. */
342 if (pid < 0) {
343 if (pid == -1) /* all procs other than current and init */
344 ret = cap_set_all(&effective, &inheritable, &permitted);
345 472
346 else /* all procs in process group */ 473 ret = security_capset_check(current, &effective, &inheritable,
347 ret = cap_set_pg(-pid, &effective, &inheritable,
348 &permitted);
349 } else {
350 ret = security_capset_check(target, &effective, &inheritable,
351 &permitted); 474 &permitted);
475 /*
476 * Having verified that the proposed changes are
477 * legal, we now put them into effect.
478 */
352 if (!ret) 479 if (!ret)
353 security_capset_set(target, &effective, &inheritable, 480 security_capset_set(current, &effective, &inheritable,
354 &permitted); 481 &permitted);
482 spin_unlock(&task_capability_lock);
355 } 483 }
356 484
357out:
358 read_unlock(&tasklist_lock);
359 spin_unlock(&task_capability_lock);
360 485
361 return ret; 486 return ret;
362} 487}
363 488
364int __capable(struct task_struct *t, int cap) 489/**
490 * capable - Determine if the current task has a superior capability in effect
491 * @cap: The capability to be tested for
492 *
493 * Return true if the current task has the given superior capability currently
494 * available for use, false if not.
495 *
496 * This sets PF_SUPERPRIV on the task if the capability is available on the
497 * assumption that it's about to be used.
498 */
499int capable(int cap)
365{ 500{
366 if (security_capable(t, cap) == 0) { 501 if (has_capability(current, cap)) {
367 t->flags |= PF_SUPERPRIV; 502 current->flags |= PF_SUPERPRIV;
368 return 1; 503 return 1;
369 } 504 }
370 return 0; 505 return 0;
371} 506}
372
373int capable(int cap)
374{
375 return __capable(current, cap);
376}
377EXPORT_SYMBOL(capable); 507EXPORT_SYMBOL(capable);
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 15ac0e1e4f4d..a0123d75ec9a 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -45,6 +45,7 @@
45#include <linux/delayacct.h> 45#include <linux/delayacct.h>
46#include <linux/cgroupstats.h> 46#include <linux/cgroupstats.h>
47#include <linux/hash.h> 47#include <linux/hash.h>
48#include <linux/namei.h>
48 49
49#include <asm/atomic.h> 50#include <asm/atomic.h>
50 51
@@ -89,11 +90,7 @@ struct cgroupfs_root {
89 /* Hierarchy-specific flags */ 90 /* Hierarchy-specific flags */
90 unsigned long flags; 91 unsigned long flags;
91 92
92 /* The path to use for release notifications. No locking 93 /* The path to use for release notifications. */
93 * between setting and use - so if userspace updates this
94 * while child cgroups exist, you could miss a
95 * notification. We ensure that it's always a valid
96 * NUL-terminated string */
97 char release_agent_path[PATH_MAX]; 94 char release_agent_path[PATH_MAX];
98}; 95};
99 96
@@ -118,7 +115,7 @@ static int root_count;
118 * extra work in the fork/exit path if none of the subsystems need to 115 * extra work in the fork/exit path if none of the subsystems need to
119 * be called. 116 * be called.
120 */ 117 */
121static int need_forkexit_callback; 118static int need_forkexit_callback __read_mostly;
122static int need_mm_owner_callback __read_mostly; 119static int need_mm_owner_callback __read_mostly;
123 120
124/* convenient tests for these bits */ 121/* convenient tests for these bits */
@@ -220,7 +217,7 @@ static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[])
220 * task until after the first call to cgroup_iter_start(). This 217 * task until after the first call to cgroup_iter_start(). This
221 * reduces the fork()/exit() overhead for people who have cgroups 218 * reduces the fork()/exit() overhead for people who have cgroups
222 * compiled into their kernel but not actually in use */ 219 * compiled into their kernel but not actually in use */
223static int use_task_css_set_links; 220static int use_task_css_set_links __read_mostly;
224 221
225/* When we create or destroy a css_set, the operation simply 222/* When we create or destroy a css_set, the operation simply
226 * takes/releases a reference count on all the cgroups referenced 223 * takes/releases a reference count on all the cgroups referenced
@@ -241,17 +238,20 @@ static int use_task_css_set_links;
241 */ 238 */
242static void unlink_css_set(struct css_set *cg) 239static void unlink_css_set(struct css_set *cg)
243{ 240{
241 struct cg_cgroup_link *link;
242 struct cg_cgroup_link *saved_link;
243
244 write_lock(&css_set_lock); 244 write_lock(&css_set_lock);
245 hlist_del(&cg->hlist); 245 hlist_del(&cg->hlist);
246 css_set_count--; 246 css_set_count--;
247 while (!list_empty(&cg->cg_links)) { 247
248 struct cg_cgroup_link *link; 248 list_for_each_entry_safe(link, saved_link, &cg->cg_links,
249 link = list_entry(cg->cg_links.next, 249 cg_link_list) {
250 struct cg_cgroup_link, cg_link_list);
251 list_del(&link->cg_link_list); 250 list_del(&link->cg_link_list);
252 list_del(&link->cgrp_link_list); 251 list_del(&link->cgrp_link_list);
253 kfree(link); 252 kfree(link);
254 } 253 }
254
255 write_unlock(&css_set_lock); 255 write_unlock(&css_set_lock);
256} 256}
257 257
@@ -355,6 +355,17 @@ static struct css_set *find_existing_css_set(
355 return NULL; 355 return NULL;
356} 356}
357 357
358static void free_cg_links(struct list_head *tmp)
359{
360 struct cg_cgroup_link *link;
361 struct cg_cgroup_link *saved_link;
362
363 list_for_each_entry_safe(link, saved_link, tmp, cgrp_link_list) {
364 list_del(&link->cgrp_link_list);
365 kfree(link);
366 }
367}
368
358/* 369/*
359 * allocate_cg_links() allocates "count" cg_cgroup_link structures 370 * allocate_cg_links() allocates "count" cg_cgroup_link structures
360 * and chains them on tmp through their cgrp_link_list fields. Returns 0 on 371 * and chains them on tmp through their cgrp_link_list fields. Returns 0 on
@@ -368,13 +379,7 @@ static int allocate_cg_links(int count, struct list_head *tmp)
368 for (i = 0; i < count; i++) { 379 for (i = 0; i < count; i++) {
369 link = kmalloc(sizeof(*link), GFP_KERNEL); 380 link = kmalloc(sizeof(*link), GFP_KERNEL);
370 if (!link) { 381 if (!link) {
371 while (!list_empty(tmp)) { 382 free_cg_links(tmp);
372 link = list_entry(tmp->next,
373 struct cg_cgroup_link,
374 cgrp_link_list);
375 list_del(&link->cgrp_link_list);
376 kfree(link);
377 }
378 return -ENOMEM; 383 return -ENOMEM;
379 } 384 }
380 list_add(&link->cgrp_link_list, tmp); 385 list_add(&link->cgrp_link_list, tmp);
@@ -382,18 +387,6 @@ static int allocate_cg_links(int count, struct list_head *tmp)
382 return 0; 387 return 0;
383} 388}
384 389
385static void free_cg_links(struct list_head *tmp)
386{
387 while (!list_empty(tmp)) {
388 struct cg_cgroup_link *link;
389 link = list_entry(tmp->next,
390 struct cg_cgroup_link,
391 cgrp_link_list);
392 list_del(&link->cgrp_link_list);
393 kfree(link);
394 }
395}
396
397/* 390/*
398 * find_css_set() takes an existing cgroup group and a 391 * find_css_set() takes an existing cgroup group and a
399 * cgroup object, and returns a css_set object that's 392 * cgroup object, and returns a css_set object that's
@@ -415,11 +408,11 @@ static struct css_set *find_css_set(
415 408
416 /* First see if we already have a cgroup group that matches 409 /* First see if we already have a cgroup group that matches
417 * the desired set */ 410 * the desired set */
418 write_lock(&css_set_lock); 411 read_lock(&css_set_lock);
419 res = find_existing_css_set(oldcg, cgrp, template); 412 res = find_existing_css_set(oldcg, cgrp, template);
420 if (res) 413 if (res)
421 get_css_set(res); 414 get_css_set(res);
422 write_unlock(&css_set_lock); 415 read_unlock(&css_set_lock);
423 416
424 if (res) 417 if (res)
425 return res; 418 return res;
@@ -507,10 +500,6 @@ static struct css_set *find_css_set(
507 * knows that the cgroup won't be removed, as cgroup_rmdir() 500 * knows that the cgroup won't be removed, as cgroup_rmdir()
508 * needs that mutex. 501 * needs that mutex.
509 * 502 *
510 * The cgroup_common_file_write handler for operations that modify
511 * the cgroup hierarchy holds cgroup_mutex across the entire operation,
512 * single threading all such cgroup modifications across the system.
513 *
514 * The fork and exit callbacks cgroup_fork() and cgroup_exit(), don't 503 * The fork and exit callbacks cgroup_fork() and cgroup_exit(), don't
515 * (usually) take cgroup_mutex. These are the two most performance 504 * (usually) take cgroup_mutex. These are the two most performance
516 * critical pieces of code here. The exception occurs on cgroup_exit(), 505 * critical pieces of code here. The exception occurs on cgroup_exit(),
@@ -962,7 +951,6 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
962 struct super_block *sb; 951 struct super_block *sb;
963 struct cgroupfs_root *root; 952 struct cgroupfs_root *root;
964 struct list_head tmp_cg_links; 953 struct list_head tmp_cg_links;
965 INIT_LIST_HEAD(&tmp_cg_links);
966 954
967 /* First find the desired set of subsystems */ 955 /* First find the desired set of subsystems */
968 ret = parse_cgroupfs_options(data, &opts); 956 ret = parse_cgroupfs_options(data, &opts);
@@ -1093,6 +1081,8 @@ static void cgroup_kill_sb(struct super_block *sb) {
1093 struct cgroupfs_root *root = sb->s_fs_info; 1081 struct cgroupfs_root *root = sb->s_fs_info;
1094 struct cgroup *cgrp = &root->top_cgroup; 1082 struct cgroup *cgrp = &root->top_cgroup;
1095 int ret; 1083 int ret;
1084 struct cg_cgroup_link *link;
1085 struct cg_cgroup_link *saved_link;
1096 1086
1097 BUG_ON(!root); 1087 BUG_ON(!root);
1098 1088
@@ -1112,10 +1102,9 @@ static void cgroup_kill_sb(struct super_block *sb) {
1112 * root cgroup 1102 * root cgroup
1113 */ 1103 */
1114 write_lock(&css_set_lock); 1104 write_lock(&css_set_lock);
1115 while (!list_empty(&cgrp->css_sets)) { 1105
1116 struct cg_cgroup_link *link; 1106 list_for_each_entry_safe(link, saved_link, &cgrp->css_sets,
1117 link = list_entry(cgrp->css_sets.next, 1107 cgrp_link_list) {
1118 struct cg_cgroup_link, cgrp_link_list);
1119 list_del(&link->cg_link_list); 1108 list_del(&link->cg_link_list);
1120 list_del(&link->cgrp_link_list); 1109 list_del(&link->cgrp_link_list);
1121 kfree(link); 1110 kfree(link);
@@ -1281,18 +1270,14 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1281} 1270}
1282 1271
1283/* 1272/*
1284 * Attach task with pid 'pid' to cgroup 'cgrp'. Call with 1273 * Attach task with pid 'pid' to cgroup 'cgrp'. Call with cgroup_mutex
1285 * cgroup_mutex, may take task_lock of task 1274 * held. May take task_lock of task
1286 */ 1275 */
1287static int attach_task_by_pid(struct cgroup *cgrp, char *pidbuf) 1276static int attach_task_by_pid(struct cgroup *cgrp, u64 pid)
1288{ 1277{
1289 pid_t pid;
1290 struct task_struct *tsk; 1278 struct task_struct *tsk;
1291 int ret; 1279 int ret;
1292 1280
1293 if (sscanf(pidbuf, "%d", &pid) != 1)
1294 return -EIO;
1295
1296 if (pid) { 1281 if (pid) {
1297 rcu_read_lock(); 1282 rcu_read_lock();
1298 tsk = find_task_by_vpid(pid); 1283 tsk = find_task_by_vpid(pid);
@@ -1318,6 +1303,16 @@ static int attach_task_by_pid(struct cgroup *cgrp, char *pidbuf)
1318 return ret; 1303 return ret;
1319} 1304}
1320 1305
1306static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid)
1307{
1308 int ret;
1309 if (!cgroup_lock_live_group(cgrp))
1310 return -ENODEV;
1311 ret = attach_task_by_pid(cgrp, pid);
1312 cgroup_unlock();
1313 return ret;
1314}
1315
1321/* The various types of files and directories in a cgroup file system */ 1316/* The various types of files and directories in a cgroup file system */
1322enum cgroup_filetype { 1317enum cgroup_filetype {
1323 FILE_ROOT, 1318 FILE_ROOT,
@@ -1327,12 +1322,54 @@ enum cgroup_filetype {
1327 FILE_RELEASE_AGENT, 1322 FILE_RELEASE_AGENT,
1328}; 1323};
1329 1324
1325/**
1326 * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive.
1327 * @cgrp: the cgroup to be checked for liveness
1328 *
1329 * On success, returns true; the lock should be later released with
1330 * cgroup_unlock(). On failure returns false with no lock held.
1331 */
1332bool cgroup_lock_live_group(struct cgroup *cgrp)
1333{
1334 mutex_lock(&cgroup_mutex);
1335 if (cgroup_is_removed(cgrp)) {
1336 mutex_unlock(&cgroup_mutex);
1337 return false;
1338 }
1339 return true;
1340}
1341
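
cgroup_lock_live_group() packages "take cgroup_mutex and verify the group has not been removed underneath us" into a single call that either returns true with the lock held or false with it already dropped. A minimal user-space sketch of that contract, with struct group, group_mutex and lock_live_group() made up for illustration:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

/* Hypothetical object guarded by a global mutex, with a "removed"
 * flag that another path may have set while we were not looking. */
struct group {
    bool removed;
};

static pthread_mutex_t group_mutex = PTHREAD_MUTEX_INITIALIZER;

/* Returns true with group_mutex held, or false with it released,
 * mirroring the cgroup_lock_live_group() contract. */
static bool lock_live_group(struct group *g)
{
    pthread_mutex_lock(&group_mutex);
    if (g->removed) {
        pthread_mutex_unlock(&group_mutex);
        return false;
    }
    return true;
}

static int write_to_group(struct group *g, int value)
{
    if (!lock_live_group(g))
        return -1;                  /* like returning -ENODEV */
    printf("writing %d\n", value);
    pthread_mutex_unlock(&group_mutex);
    return 0;
}

int main(void)
{
    struct group g = { .removed = false };

    write_to_group(&g, 42);
    g.removed = true;
    if (write_to_group(&g, 7) < 0)
        printf("group already removed\n");
    return 0;
}
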
1342static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft,
1343 const char *buffer)
1344{
1345 BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);
1346 if (!cgroup_lock_live_group(cgrp))
1347 return -ENODEV;
1348 strcpy(cgrp->root->release_agent_path, buffer);
1349 cgroup_unlock();
1350 return 0;
1351}
1352
1353static int cgroup_release_agent_show(struct cgroup *cgrp, struct cftype *cft,
1354 struct seq_file *seq)
1355{
1356 if (!cgroup_lock_live_group(cgrp))
1357 return -ENODEV;
1358 seq_puts(seq, cgrp->root->release_agent_path);
1359 seq_putc(seq, '\n');
1360 cgroup_unlock();
1361 return 0;
1362}
1363
1364/* A buffer size big enough for numbers or short strings */
1365#define CGROUP_LOCAL_BUFFER_SIZE 64
1366
1330static ssize_t cgroup_write_X64(struct cgroup *cgrp, struct cftype *cft, 1367static ssize_t cgroup_write_X64(struct cgroup *cgrp, struct cftype *cft,
1331 struct file *file, 1368 struct file *file,
1332 const char __user *userbuf, 1369 const char __user *userbuf,
1333 size_t nbytes, loff_t *unused_ppos) 1370 size_t nbytes, loff_t *unused_ppos)
1334{ 1371{
1335 char buffer[64]; 1372 char buffer[CGROUP_LOCAL_BUFFER_SIZE];
1336 int retval = 0; 1373 int retval = 0;
1337 char *end; 1374 char *end;
1338 1375
@@ -1361,68 +1398,39 @@ static ssize_t cgroup_write_X64(struct cgroup *cgrp, struct cftype *cft,
1361 return retval; 1398 return retval;
1362} 1399}
1363 1400
1364static ssize_t cgroup_common_file_write(struct cgroup *cgrp, 1401static ssize_t cgroup_write_string(struct cgroup *cgrp, struct cftype *cft,
1365 struct cftype *cft, 1402 struct file *file,
1366 struct file *file, 1403 const char __user *userbuf,
1367 const char __user *userbuf, 1404 size_t nbytes, loff_t *unused_ppos)
1368 size_t nbytes, loff_t *unused_ppos)
1369{ 1405{
1370 enum cgroup_filetype type = cft->private; 1406 char local_buffer[CGROUP_LOCAL_BUFFER_SIZE];
1371 char *buffer;
1372 int retval = 0; 1407 int retval = 0;
1408 size_t max_bytes = cft->max_write_len;
1409 char *buffer = local_buffer;
1373 1410
1374 if (nbytes >= PATH_MAX) 1411 if (!max_bytes)
1412 max_bytes = sizeof(local_buffer) - 1;
1413 if (nbytes >= max_bytes)
1375 return -E2BIG; 1414 return -E2BIG;
1376 1415 /* Allocate a dynamic buffer if we need one */
1377 /* +1 for nul-terminator */ 1416 if (nbytes >= sizeof(local_buffer)) {
1378 buffer = kmalloc(nbytes + 1, GFP_KERNEL); 1417 buffer = kmalloc(nbytes + 1, GFP_KERNEL);
1379 if (buffer == NULL) 1418 if (buffer == NULL)
1380 return -ENOMEM; 1419 return -ENOMEM;
1381 1420 }
1382 if (copy_from_user(buffer, userbuf, nbytes)) { 1421 if (nbytes && copy_from_user(buffer, userbuf, nbytes)) {
1383 retval = -EFAULT; 1422 retval = -EFAULT;
1384 goto out1; 1423 goto out;
1385 } 1424 }
1386 buffer[nbytes] = 0; /* nul-terminate */
1387 strstrip(buffer); /* strip -just- trailing whitespace */
1388 1425
1389 mutex_lock(&cgroup_mutex); 1426 buffer[nbytes] = 0; /* nul-terminate */
1390 1427 strstrip(buffer);
1391 /* 1428 retval = cft->write_string(cgrp, cft, buffer);
1392 * This was already checked for in cgroup_file_write(), but 1429 if (!retval)
1393 * check again now we're holding cgroup_mutex.
1394 */
1395 if (cgroup_is_removed(cgrp)) {
1396 retval = -ENODEV;
1397 goto out2;
1398 }
1399
1400 switch (type) {
1401 case FILE_TASKLIST:
1402 retval = attach_task_by_pid(cgrp, buffer);
1403 break;
1404 case FILE_NOTIFY_ON_RELEASE:
1405 clear_bit(CGRP_RELEASABLE, &cgrp->flags);
1406 if (simple_strtoul(buffer, NULL, 10) != 0)
1407 set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
1408 else
1409 clear_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
1410 break;
1411 case FILE_RELEASE_AGENT:
1412 BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);
1413 strcpy(cgrp->root->release_agent_path, buffer);
1414 break;
1415 default:
1416 retval = -EINVAL;
1417 goto out2;
1418 }
1419
1420 if (retval == 0)
1421 retval = nbytes; 1430 retval = nbytes;
1422out2: 1431out:
1423 mutex_unlock(&cgroup_mutex); 1432 if (buffer != local_buffer)
1424out1: 1433 kfree(buffer);
1425 kfree(buffer);
1426 return retval; 1434 return retval;
1427} 1435}
1428 1436
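
cgroup_write_string() keeps short writes in a stack buffer and only falls back to kmalloc() for longer ones, bounded by the new per-file max_write_len. A simplified user-space analogue, without copy_from_user() or the strstrip() step, and with handle_string_write() invented for the example:

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define LOCAL_BUFFER_SIZE 64        /* analogous to CGROUP_LOCAL_BUFFER_SIZE */

/* Copy an nbytes-long request into a nul-terminated buffer, using the
 * stack for small writes and the heap only when needed.  max_bytes
 * plays the role of cftype->max_write_len. */
static int handle_string_write(const char *userbuf, size_t nbytes,
                               size_t max_bytes)
{
    char local_buffer[LOCAL_BUFFER_SIZE];
    char *buffer = local_buffer;
    int retval;

    if (!max_bytes)
        max_bytes = sizeof(local_buffer) - 1;
    if (nbytes >= max_bytes)
        return -E2BIG;

    if (nbytes >= sizeof(local_buffer)) {
        buffer = malloc(nbytes + 1);
        if (!buffer)
            return -ENOMEM;
    }

    memcpy(buffer, userbuf, nbytes);
    buffer[nbytes] = '\0';
    printf("handler got \"%s\"\n", buffer);
    retval = (int)nbytes;

    if (buffer != local_buffer)
        free(buffer);
    return retval;
}

int main(void)
{
    const char *longmsg =
        "a deliberately long request that does not fit in the 64-byte "
        "local buffer and therefore takes the heap-allocation path";

    handle_string_write("0", 1, 0);
    handle_string_write(longmsg, strlen(longmsg), 4096);
    return 0;
}
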
@@ -1438,6 +1446,8 @@ static ssize_t cgroup_file_write(struct file *file, const char __user *buf,
1438 return cft->write(cgrp, cft, file, buf, nbytes, ppos); 1446 return cft->write(cgrp, cft, file, buf, nbytes, ppos);
1439 if (cft->write_u64 || cft->write_s64) 1447 if (cft->write_u64 || cft->write_s64)
1440 return cgroup_write_X64(cgrp, cft, file, buf, nbytes, ppos); 1448 return cgroup_write_X64(cgrp, cft, file, buf, nbytes, ppos);
1449 if (cft->write_string)
1450 return cgroup_write_string(cgrp, cft, file, buf, nbytes, ppos);
1441 if (cft->trigger) { 1451 if (cft->trigger) {
1442 int ret = cft->trigger(cgrp, (unsigned int)cft->private); 1452 int ret = cft->trigger(cgrp, (unsigned int)cft->private);
1443 return ret ? ret : nbytes; 1453 return ret ? ret : nbytes;
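
cgroup_file_write() now picks a handler by whichever cftype method is non-NULL: raw write, write_u64/write_s64, the new write_string, or trigger. A cut-down sketch of that dispatch, with only two invented handler flavours and toy callback names:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Toy control-file type: each file supplies only the handler flavour
 * it needs, and the generic write path dispatches on whichever
 * pointer is set. */
struct cftype {
    const char *name;
    int (*write_u64)(const char *name, uint64_t val);
    int (*write_string)(const char *name, const char *buf);
};

static int file_write(struct cftype *cft, const char *buf)
{
    if (cft->write_u64) {
        uint64_t val = strtoull(buf, NULL, 10);
        return cft->write_u64(cft->name, val);
    }
    if (cft->write_string)
        return cft->write_string(cft->name, buf);
    return -1;                      /* like -EINVAL */
}

static int set_flag(const char *name, uint64_t val)
{
    printf("%s <- %llu\n", name, (unsigned long long)val);
    return 0;
}

static int set_path(const char *name, const char *buf)
{
    printf("%s <- \"%s\"\n", name, buf);
    return 0;
}

int main(void)
{
    struct cftype notify = { "notify_on_release", set_flag, NULL };
    struct cftype agent  = { "release_agent", NULL, set_path };

    file_write(&notify, "1");
    file_write(&agent, "/sbin/my-agent");
    return 0;
}
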
@@ -1450,7 +1460,7 @@ static ssize_t cgroup_read_u64(struct cgroup *cgrp, struct cftype *cft,
1450 char __user *buf, size_t nbytes, 1460 char __user *buf, size_t nbytes,
1451 loff_t *ppos) 1461 loff_t *ppos)
1452{ 1462{
1453 char tmp[64]; 1463 char tmp[CGROUP_LOCAL_BUFFER_SIZE];
1454 u64 val = cft->read_u64(cgrp, cft); 1464 u64 val = cft->read_u64(cgrp, cft);
1455 int len = sprintf(tmp, "%llu\n", (unsigned long long) val); 1465 int len = sprintf(tmp, "%llu\n", (unsigned long long) val);
1456 1466
@@ -1462,56 +1472,13 @@ static ssize_t cgroup_read_s64(struct cgroup *cgrp, struct cftype *cft,
1462 char __user *buf, size_t nbytes, 1472 char __user *buf, size_t nbytes,
1463 loff_t *ppos) 1473 loff_t *ppos)
1464{ 1474{
1465 char tmp[64]; 1475 char tmp[CGROUP_LOCAL_BUFFER_SIZE];
1466 s64 val = cft->read_s64(cgrp, cft); 1476 s64 val = cft->read_s64(cgrp, cft);
1467 int len = sprintf(tmp, "%lld\n", (long long) val); 1477 int len = sprintf(tmp, "%lld\n", (long long) val);
1468 1478
1469 return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); 1479 return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
1470} 1480}
1471 1481
1472static ssize_t cgroup_common_file_read(struct cgroup *cgrp,
1473 struct cftype *cft,
1474 struct file *file,
1475 char __user *buf,
1476 size_t nbytes, loff_t *ppos)
1477{
1478 enum cgroup_filetype type = cft->private;
1479 char *page;
1480 ssize_t retval = 0;
1481 char *s;
1482
1483 if (!(page = (char *)__get_free_page(GFP_KERNEL)))
1484 return -ENOMEM;
1485
1486 s = page;
1487
1488 switch (type) {
1489 case FILE_RELEASE_AGENT:
1490 {
1491 struct cgroupfs_root *root;
1492 size_t n;
1493 mutex_lock(&cgroup_mutex);
1494 root = cgrp->root;
1495 n = strnlen(root->release_agent_path,
1496 sizeof(root->release_agent_path));
1497 n = min(n, (size_t) PAGE_SIZE);
1498 strncpy(s, root->release_agent_path, n);
1499 mutex_unlock(&cgroup_mutex);
1500 s += n;
1501 break;
1502 }
1503 default:
1504 retval = -EINVAL;
1505 goto out;
1506 }
1507 *s++ = '\n';
1508
1509 retval = simple_read_from_buffer(buf, nbytes, ppos, page, s - page);
1510out:
1511 free_page((unsigned long)page);
1512 return retval;
1513}
1514
1515static ssize_t cgroup_file_read(struct file *file, char __user *buf, 1482static ssize_t cgroup_file_read(struct file *file, char __user *buf,
1516 size_t nbytes, loff_t *ppos) 1483 size_t nbytes, loff_t *ppos)
1517{ 1484{
@@ -1560,7 +1527,7 @@ static int cgroup_seqfile_show(struct seq_file *m, void *arg)
1560 return cft->read_seq_string(state->cgroup, cft, m); 1527 return cft->read_seq_string(state->cgroup, cft, m);
1561} 1528}
1562 1529
1563int cgroup_seqfile_release(struct inode *inode, struct file *file) 1530static int cgroup_seqfile_release(struct inode *inode, struct file *file)
1564{ 1531{
1565 struct seq_file *seq = file->private_data; 1532 struct seq_file *seq = file->private_data;
1566 kfree(seq->private); 1533 kfree(seq->private);
@@ -1569,6 +1536,7 @@ int cgroup_seqfile_release(struct inode *inode, struct file *file)
1569 1536
1570static struct file_operations cgroup_seqfile_operations = { 1537static struct file_operations cgroup_seqfile_operations = {
1571 .read = seq_read, 1538 .read = seq_read,
1539 .write = cgroup_file_write,
1572 .llseek = seq_lseek, 1540 .llseek = seq_lseek,
1573 .release = cgroup_seqfile_release, 1541 .release = cgroup_seqfile_release,
1574}; 1542};
@@ -1756,15 +1724,11 @@ int cgroup_add_files(struct cgroup *cgrp,
1756int cgroup_task_count(const struct cgroup *cgrp) 1724int cgroup_task_count(const struct cgroup *cgrp)
1757{ 1725{
1758 int count = 0; 1726 int count = 0;
1759 struct list_head *l; 1727 struct cg_cgroup_link *link;
1760 1728
1761 read_lock(&css_set_lock); 1729 read_lock(&css_set_lock);
1762 l = cgrp->css_sets.next; 1730 list_for_each_entry(link, &cgrp->css_sets, cgrp_link_list) {
1763 while (l != &cgrp->css_sets) {
1764 struct cg_cgroup_link *link =
1765 list_entry(l, struct cg_cgroup_link, cgrp_link_list);
1766 count += atomic_read(&link->cg->ref.refcount); 1731 count += atomic_read(&link->cg->ref.refcount);
1767 l = l->next;
1768 } 1732 }
1769 read_unlock(&css_set_lock); 1733 read_unlock(&css_set_lock);
1770 return count; 1734 return count;
@@ -2227,6 +2191,18 @@ static u64 cgroup_read_notify_on_release(struct cgroup *cgrp,
2227 return notify_on_release(cgrp); 2191 return notify_on_release(cgrp);
2228} 2192}
2229 2193
2194static int cgroup_write_notify_on_release(struct cgroup *cgrp,
2195 struct cftype *cft,
2196 u64 val)
2197{
2198 clear_bit(CGRP_RELEASABLE, &cgrp->flags);
2199 if (val)
2200 set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
2201 else
2202 clear_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
2203 return 0;
2204}
2205
2230/* 2206/*
2231 * for the common functions, 'private' gives the type of file 2207 * for the common functions, 'private' gives the type of file
2232 */ 2208 */
@@ -2235,7 +2211,7 @@ static struct cftype files[] = {
2235 .name = "tasks", 2211 .name = "tasks",
2236 .open = cgroup_tasks_open, 2212 .open = cgroup_tasks_open,
2237 .read = cgroup_tasks_read, 2213 .read = cgroup_tasks_read,
2238 .write = cgroup_common_file_write, 2214 .write_u64 = cgroup_tasks_write,
2239 .release = cgroup_tasks_release, 2215 .release = cgroup_tasks_release,
2240 .private = FILE_TASKLIST, 2216 .private = FILE_TASKLIST,
2241 }, 2217 },
@@ -2243,15 +2219,16 @@ static struct cftype files[] = {
2243 { 2219 {
2244 .name = "notify_on_release", 2220 .name = "notify_on_release",
2245 .read_u64 = cgroup_read_notify_on_release, 2221 .read_u64 = cgroup_read_notify_on_release,
2246 .write = cgroup_common_file_write, 2222 .write_u64 = cgroup_write_notify_on_release,
2247 .private = FILE_NOTIFY_ON_RELEASE, 2223 .private = FILE_NOTIFY_ON_RELEASE,
2248 }, 2224 },
2249}; 2225};
2250 2226
2251static struct cftype cft_release_agent = { 2227static struct cftype cft_release_agent = {
2252 .name = "release_agent", 2228 .name = "release_agent",
2253 .read = cgroup_common_file_read, 2229 .read_seq_string = cgroup_release_agent_show,
2254 .write = cgroup_common_file_write, 2230 .write_string = cgroup_release_agent_write,
2231 .max_write_len = PATH_MAX,
2255 .private = FILE_RELEASE_AGENT, 2232 .private = FILE_RELEASE_AGENT,
2256}; 2233};
2257 2234
@@ -2391,7 +2368,7 @@ static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode)
2391 return cgroup_create(c_parent, dentry, mode | S_IFDIR); 2368 return cgroup_create(c_parent, dentry, mode | S_IFDIR);
2392} 2369}
2393 2370
2394static inline int cgroup_has_css_refs(struct cgroup *cgrp) 2371static int cgroup_has_css_refs(struct cgroup *cgrp)
2395{ 2372{
2396 /* Check the reference count on each subsystem. Since we 2373 /* Check the reference count on each subsystem. Since we
2397 * already established that there are no tasks in the 2374 * already established that there are no tasks in the
@@ -2761,14 +2738,15 @@ void cgroup_fork_callbacks(struct task_struct *child)
2761 */ 2738 */
2762void cgroup_mm_owner_callbacks(struct task_struct *old, struct task_struct *new) 2739void cgroup_mm_owner_callbacks(struct task_struct *old, struct task_struct *new)
2763{ 2740{
2764 struct cgroup *oldcgrp, *newcgrp; 2741 struct cgroup *oldcgrp, *newcgrp = NULL;
2765 2742
2766 if (need_mm_owner_callback) { 2743 if (need_mm_owner_callback) {
2767 int i; 2744 int i;
2768 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 2745 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
2769 struct cgroup_subsys *ss = subsys[i]; 2746 struct cgroup_subsys *ss = subsys[i];
2770 oldcgrp = task_cgroup(old, ss->subsys_id); 2747 oldcgrp = task_cgroup(old, ss->subsys_id);
2771 newcgrp = task_cgroup(new, ss->subsys_id); 2748 if (new)
2749 newcgrp = task_cgroup(new, ss->subsys_id);
2772 if (oldcgrp == newcgrp) 2750 if (oldcgrp == newcgrp)
2773 continue; 2751 continue;
2774 if (ss->mm_owner_changed) 2752 if (ss->mm_owner_changed)
@@ -2869,16 +2847,17 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
2869 * cgroup_clone - clone the cgroup the given subsystem is attached to 2847 * cgroup_clone - clone the cgroup the given subsystem is attached to
2870 * @tsk: the task to be moved 2848 * @tsk: the task to be moved
2871 * @subsys: the given subsystem 2849 * @subsys: the given subsystem
2850 * @nodename: the name for the new cgroup
2872 * 2851 *
2873 * Duplicate the current cgroup in the hierarchy that the given 2852 * Duplicate the current cgroup in the hierarchy that the given
2874 * subsystem is attached to, and move this task into the new 2853 * subsystem is attached to, and move this task into the new
2875 * child. 2854 * child.
2876 */ 2855 */
2877int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys) 2856int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys,
2857 char *nodename)
2878{ 2858{
2879 struct dentry *dentry; 2859 struct dentry *dentry;
2880 int ret = 0; 2860 int ret = 0;
2881 char nodename[MAX_CGROUP_TYPE_NAMELEN];
2882 struct cgroup *parent, *child; 2861 struct cgroup *parent, *child;
2883 struct inode *inode; 2862 struct inode *inode;
2884 struct css_set *cg; 2863 struct css_set *cg;
@@ -2903,8 +2882,6 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys)
2903 cg = tsk->cgroups; 2882 cg = tsk->cgroups;
2904 parent = task_cgroup(tsk, subsys->subsys_id); 2883 parent = task_cgroup(tsk, subsys->subsys_id);
2905 2884
2906 snprintf(nodename, MAX_CGROUP_TYPE_NAMELEN, "%d", tsk->pid);
2907
2908 /* Pin the hierarchy */ 2885 /* Pin the hierarchy */
2909 atomic_inc(&parent->root->sb->s_active); 2886 atomic_inc(&parent->root->sb->s_active);
2910 2887
@@ -3078,27 +3055,24 @@ static void cgroup_release_agent(struct work_struct *work)
3078 while (!list_empty(&release_list)) { 3055 while (!list_empty(&release_list)) {
3079 char *argv[3], *envp[3]; 3056 char *argv[3], *envp[3];
3080 int i; 3057 int i;
3081 char *pathbuf; 3058 char *pathbuf = NULL, *agentbuf = NULL;
3082 struct cgroup *cgrp = list_entry(release_list.next, 3059 struct cgroup *cgrp = list_entry(release_list.next,
3083 struct cgroup, 3060 struct cgroup,
3084 release_list); 3061 release_list);
3085 list_del_init(&cgrp->release_list); 3062 list_del_init(&cgrp->release_list);
3086 spin_unlock(&release_list_lock); 3063 spin_unlock(&release_list_lock);
3087 pathbuf = kmalloc(PAGE_SIZE, GFP_KERNEL); 3064 pathbuf = kmalloc(PAGE_SIZE, GFP_KERNEL);
3088 if (!pathbuf) { 3065 if (!pathbuf)
3089 spin_lock(&release_list_lock); 3066 goto continue_free;
3090 continue; 3067 if (cgroup_path(cgrp, pathbuf, PAGE_SIZE) < 0)
3091 } 3068 goto continue_free;
3092 3069 agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL);
3093 if (cgroup_path(cgrp, pathbuf, PAGE_SIZE) < 0) { 3070 if (!agentbuf)
3094 kfree(pathbuf); 3071 goto continue_free;
3095 spin_lock(&release_list_lock);
3096 continue;
3097 }
3098 3072
3099 i = 0; 3073 i = 0;
3100 argv[i++] = cgrp->root->release_agent_path; 3074 argv[i++] = agentbuf;
3101 argv[i++] = (char *)pathbuf; 3075 argv[i++] = pathbuf;
3102 argv[i] = NULL; 3076 argv[i] = NULL;
3103 3077
3104 i = 0; 3078 i = 0;
@@ -3112,8 +3086,10 @@ static void cgroup_release_agent(struct work_struct *work)
3112 * be a slow process */ 3086 * be a slow process */
3113 mutex_unlock(&cgroup_mutex); 3087 mutex_unlock(&cgroup_mutex);
3114 call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC); 3088 call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
3115 kfree(pathbuf);
3116 mutex_lock(&cgroup_mutex); 3089 mutex_lock(&cgroup_mutex);
3090 continue_free:
3091 kfree(pathbuf);
3092 kfree(agentbuf);
3117 spin_lock(&release_list_lock); 3093 spin_lock(&release_list_lock);
3118 } 3094 }
3119 spin_unlock(&release_list_lock); 3095 spin_unlock(&release_list_lock);
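
The two hunks above funnel every failure inside the loop through one continue_free label and free both buffers unconditionally, which works because kfree(NULL) is a no-op. The same pattern in plain C, where free(NULL) is likewise a no-op and the paths and agent string are invented:

#define _POSIX_C_SOURCE 200809L
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Process a batch of items, allocating two temporary buffers per
 * iteration.  All error paths fall through one label that frees both
 * pointers unconditionally, so early failures need no special cases. */
static void process_batch(const char *const *items, int n)
{
    for (int i = 0; i < n; i++) {
        char *pathbuf = NULL, *agentbuf = NULL;

        pathbuf = malloc(128);
        if (!pathbuf)
            goto continue_free;
        if (snprintf(pathbuf, 128, "/fake/path/%s", items[i]) < 0)
            goto continue_free;
        agentbuf = strdup("/sbin/fake-agent");
        if (!agentbuf)
            goto continue_free;

        printf("would run %s %s\n", agentbuf, pathbuf);

continue_free:
        free(pathbuf);
        free(agentbuf);
    }
}

int main(void)
{
    const char *items[] = { "one", "two" };

    process_batch(items, 2);
    return 0;
}
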
diff --git a/kernel/cpu.c b/kernel/cpu.c
index c77bc3a1c722..86d49045daed 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -15,6 +15,28 @@
15#include <linux/stop_machine.h> 15#include <linux/stop_machine.h>
16#include <linux/mutex.h> 16#include <linux/mutex.h>
17 17
18/*
 19 * Represents all CPUs present in the system.
 20 * In systems capable of hotplug, this map could dynamically grow
 21 * as new CPUs are detected in the system via any platform-specific
 22 * method, such as ACPI, for example.
23 */
24cpumask_t cpu_present_map __read_mostly;
25EXPORT_SYMBOL(cpu_present_map);
26
27#ifndef CONFIG_SMP
28
29/*
 30 * Represents all CPUs that are currently online.
31 */
32cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL;
33EXPORT_SYMBOL(cpu_online_map);
34
35cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL;
36EXPORT_SYMBOL(cpu_possible_map);
37
38#else /* CONFIG_SMP */
39
18/* Serializes the updates to cpu_online_map, cpu_present_map */ 40/* Serializes the updates to cpu_online_map, cpu_present_map */
19static DEFINE_MUTEX(cpu_add_remove_lock); 41static DEFINE_MUTEX(cpu_add_remove_lock);
20 42
@@ -42,6 +64,8 @@ void __init cpu_hotplug_init(void)
42 cpu_hotplug.refcount = 0; 64 cpu_hotplug.refcount = 0;
43} 65}
44 66
67cpumask_t cpu_active_map;
68
45#ifdef CONFIG_HOTPLUG_CPU 69#ifdef CONFIG_HOTPLUG_CPU
46 70
47void get_online_cpus(void) 71void get_online_cpus(void)
@@ -175,13 +199,14 @@ static int __ref take_cpu_down(void *_param)
175 struct take_cpu_down_param *param = _param; 199 struct take_cpu_down_param *param = _param;
176 int err; 200 int err;
177 201
178 raw_notifier_call_chain(&cpu_chain, CPU_DYING | param->mod,
179 param->hcpu);
180 /* Ensure this CPU doesn't handle any more interrupts. */ 202 /* Ensure this CPU doesn't handle any more interrupts. */
181 err = __cpu_disable(); 203 err = __cpu_disable();
182 if (err < 0) 204 if (err < 0)
183 return err; 205 return err;
184 206
207 raw_notifier_call_chain(&cpu_chain, CPU_DYING | param->mod,
208 param->hcpu);
209
185 /* Force idle task to run as soon as we yield: it should 210 /* Force idle task to run as soon as we yield: it should
186 immediately notice cpu is offline and die quickly. */ 211 immediately notice cpu is offline and die quickly. */
187 sched_idle_next(); 212 sched_idle_next();
@@ -192,7 +217,6 @@ static int __ref take_cpu_down(void *_param)
192static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) 217static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
193{ 218{
194 int err, nr_calls = 0; 219 int err, nr_calls = 0;
195 struct task_struct *p;
196 cpumask_t old_allowed, tmp; 220 cpumask_t old_allowed, tmp;
197 void *hcpu = (void *)(long)cpu; 221 void *hcpu = (void *)(long)cpu;
198 unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0; 222 unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0;
@@ -225,21 +249,18 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
225 cpus_setall(tmp); 249 cpus_setall(tmp);
226 cpu_clear(cpu, tmp); 250 cpu_clear(cpu, tmp);
227 set_cpus_allowed_ptr(current, &tmp); 251 set_cpus_allowed_ptr(current, &tmp);
252 tmp = cpumask_of_cpu(cpu);
228 253
229 p = __stop_machine_run(take_cpu_down, &tcd_param, cpu); 254 err = __stop_machine(take_cpu_down, &tcd_param, &tmp);
230 255 if (err) {
231 if (IS_ERR(p) || cpu_online(cpu)) {
232 /* CPU didn't die: tell everyone. Can't complain. */ 256 /* CPU didn't die: tell everyone. Can't complain. */
233 if (raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod, 257 if (raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod,
234 hcpu) == NOTIFY_BAD) 258 hcpu) == NOTIFY_BAD)
235 BUG(); 259 BUG();
236 260
237 if (IS_ERR(p)) { 261 goto out_allowed;
238 err = PTR_ERR(p);
239 goto out_allowed;
240 }
241 goto out_thread;
242 } 262 }
263 BUG_ON(cpu_online(cpu));
243 264
244 /* Wait for it to sleep (leaving idle task). */ 265 /* Wait for it to sleep (leaving idle task). */
245 while (!idle_cpu(cpu)) 266 while (!idle_cpu(cpu))
@@ -255,12 +276,15 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
255 276
256 check_for_tasks(cpu); 277 check_for_tasks(cpu);
257 278
258out_thread:
259 err = kthread_stop(p);
260out_allowed: 279out_allowed:
261 set_cpus_allowed_ptr(current, &old_allowed); 280 set_cpus_allowed_ptr(current, &old_allowed);
262out_release: 281out_release:
263 cpu_hotplug_done(); 282 cpu_hotplug_done();
283 if (!err) {
284 if (raw_notifier_call_chain(&cpu_chain, CPU_POST_DEAD | mod,
285 hcpu) == NOTIFY_BAD)
286 BUG();
287 }
264 return err; 288 return err;
265} 289}
266 290
@@ -269,14 +293,34 @@ int __ref cpu_down(unsigned int cpu)
269 int err = 0; 293 int err = 0;
270 294
271 cpu_maps_update_begin(); 295 cpu_maps_update_begin();
272 if (cpu_hotplug_disabled) 296
297 if (cpu_hotplug_disabled) {
273 err = -EBUSY; 298 err = -EBUSY;
274 else 299 goto out;
275 err = _cpu_down(cpu, 0); 300 }
301
302 cpu_clear(cpu, cpu_active_map);
303
304 /*
 305 * Make sure all cpus did the reschedule and are not
 306 * using a stale version of the cpu_active_map.
 307 * This is not strictly necessary because the stop_machine()
 308 * that we run down the line already provides the required
 309 * synchronization. But it's really a side effect and we do not
 310 * want to depend on the innards of stop_machine here.
311 */
312 synchronize_sched();
276 313
314 err = _cpu_down(cpu, 0);
315
316 if (cpu_online(cpu))
317 cpu_set(cpu, cpu_active_map);
318
319out:
277 cpu_maps_update_done(); 320 cpu_maps_update_done();
278 return err; 321 return err;
279} 322}
323EXPORT_SYMBOL(cpu_down);
280#endif /*CONFIG_HOTPLUG_CPU*/ 324#endif /*CONFIG_HOTPLUG_CPU*/
281 325
282/* Requires cpu_add_remove_lock to be held */ 326/* Requires cpu_add_remove_lock to be held */
@@ -306,6 +350,8 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
306 goto out_notify; 350 goto out_notify;
307 BUG_ON(!cpu_online(cpu)); 351 BUG_ON(!cpu_online(cpu));
308 352
353 cpu_set(cpu, cpu_active_map);
354
309 /* Now call notifier in preparation. */ 355 /* Now call notifier in preparation. */
310 raw_notifier_call_chain(&cpu_chain, CPU_ONLINE | mod, hcpu); 356 raw_notifier_call_chain(&cpu_chain, CPU_ONLINE | mod, hcpu);
311 357
@@ -324,7 +370,7 @@ int __cpuinit cpu_up(unsigned int cpu)
324 if (!cpu_isset(cpu, cpu_possible_map)) { 370 if (!cpu_isset(cpu, cpu_possible_map)) {
325 printk(KERN_ERR "can't online cpu %d because it is not " 371 printk(KERN_ERR "can't online cpu %d because it is not "
326 "configured as may-hotadd at boot time\n", cpu); 372 "configured as may-hotadd at boot time\n", cpu);
327#if defined(CONFIG_IA64) || defined(CONFIG_X86_64) || defined(CONFIG_S390) 373#if defined(CONFIG_IA64) || defined(CONFIG_X86_64)
328 printk(KERN_ERR "please check additional_cpus= boot " 374 printk(KERN_ERR "please check additional_cpus= boot "
329 "parameter\n"); 375 "parameter\n");
330#endif 376#endif
@@ -332,11 +378,15 @@ int __cpuinit cpu_up(unsigned int cpu)
332 } 378 }
333 379
334 cpu_maps_update_begin(); 380 cpu_maps_update_begin();
335 if (cpu_hotplug_disabled) 381
382 if (cpu_hotplug_disabled) {
336 err = -EBUSY; 383 err = -EBUSY;
337 else 384 goto out;
338 err = _cpu_up(cpu, 0); 385 }
339 386
387 err = _cpu_up(cpu, 0);
388
389out:
340 cpu_maps_update_done(); 390 cpu_maps_update_done();
341 return err; 391 return err;
342} 392}
@@ -390,7 +440,7 @@ void __ref enable_nonboot_cpus(void)
390 goto out; 440 goto out;
391 441
392 printk("Enabling non-boot CPUs ...\n"); 442 printk("Enabling non-boot CPUs ...\n");
393 for_each_cpu_mask(cpu, frozen_cpus) { 443 for_each_cpu_mask_nr(cpu, frozen_cpus) {
394 error = _cpu_up(cpu, 1); 444 error = _cpu_up(cpu, 1);
395 if (!error) { 445 if (!error) {
396 printk("CPU%d is up\n", cpu); 446 printk("CPU%d is up\n", cpu);
@@ -403,3 +453,49 @@ out:
403 cpu_maps_update_done(); 453 cpu_maps_update_done();
404} 454}
405#endif /* CONFIG_PM_SLEEP_SMP */ 455#endif /* CONFIG_PM_SLEEP_SMP */
456
457/**
458 * notify_cpu_starting(cpu) - call the CPU_STARTING notifiers
459 * @cpu: cpu that just started
460 *
461 * This function calls the cpu_chain notifiers with CPU_STARTING.
462 * It must be called by the arch code on the new cpu, before the new cpu
463 * enables interrupts and before the "boot" cpu returns from __cpu_up().
464 */
465void notify_cpu_starting(unsigned int cpu)
466{
467 unsigned long val = CPU_STARTING;
468
469#ifdef CONFIG_PM_SLEEP_SMP
470 if (cpu_isset(cpu, frozen_cpus))
471 val = CPU_STARTING_FROZEN;
472#endif /* CONFIG_PM_SLEEP_SMP */
473 raw_notifier_call_chain(&cpu_chain, val, (void *)(long)cpu);
474}
475
476#endif /* CONFIG_SMP */
477
478/*
479 * cpu_bit_bitmap[] is a special, "compressed" data structure that
 480 * represents, for each bit number nr, the NR_CPUS-bit value 1<<nr.
481 *
482 * It is used by cpumask_of_cpu() to get a constant address to a CPU
483 * mask value that has a single bit set only.
484 */
485
486/* cpu_bit_bitmap[0] is empty - so we can back into it */
487#define MASK_DECLARE_1(x) [x+1][0] = 1UL << (x)
488#define MASK_DECLARE_2(x) MASK_DECLARE_1(x), MASK_DECLARE_1(x+1)
489#define MASK_DECLARE_4(x) MASK_DECLARE_2(x), MASK_DECLARE_2(x+2)
490#define MASK_DECLARE_8(x) MASK_DECLARE_4(x), MASK_DECLARE_4(x+4)
491
492const unsigned long cpu_bit_bitmap[BITS_PER_LONG+1][BITS_TO_LONGS(NR_CPUS)] = {
493
494 MASK_DECLARE_8(0), MASK_DECLARE_8(8),
495 MASK_DECLARE_8(16), MASK_DECLARE_8(24),
496#if BITS_PER_LONG > 32
497 MASK_DECLARE_8(32), MASK_DECLARE_8(40),
498 MASK_DECLARE_8(48), MASK_DECLARE_8(56),
499#endif
500};
501EXPORT_SYMBOL_GPL(cpu_bit_bitmap);
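
The table works because row 0 stays all zero, row n+1 carries 1UL<<n in its first word only, and the companion cpumask_of_cpu() helper (not shown in this diff) steps the chosen row pointer back by cpu/BITS_PER_LONG words so the single bit lands in the right word of the returned mask. A small stand-alone model of the idea, flattened into one array and with mask_of_cpu() invented for the demonstration:

#include <stdio.h>

#define BPL   (8 * (int)sizeof(unsigned long))   /* BITS_PER_LONG */
#define NCPUS (2 * BPL)                           /* pretend NR_CPUS */
#define WORDS (NCPUS / BPL)                       /* BITS_TO_LONGS(NCPUS) */

/*
 * Flattened analogue of cpu_bit_bitmap[BPL + 1][WORDS]: row 0 stays
 * all zero, row n+1 has only bit n of its first word set.  Row 0 is
 * what a multi-word mask "backs into" when its set bit lives in a
 * later word.
 */
static unsigned long bit_table[(BPL + 1) * WORDS];

static void init_table(void)
{
    for (int n = 0; n < BPL; n++)
        bit_table[(n + 1) * WORDS] = 1UL << n;
}

/*
 * Return a pointer to a WORDS-long mask with only bit "cpu" set,
 * without allocating anything: pick the row whose first word carries
 * the right bit, then step back cpu/BPL words so that bit lands in
 * the right word of the returned view.
 */
static const unsigned long *mask_of_cpu(int cpu)
{
    return &bit_table[(1 + cpu % BPL) * WORDS] - cpu / BPL;
}

int main(void)
{
    const unsigned long *m;

    init_table();
    m = mask_of_cpu(BPL + 3);           /* bit 3 of word 1 */
    for (int w = 0; w < WORDS; w++)
        printf("word %d: %#lx\n", w, m[w]);
    return 0;
}
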
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 9fceb97e989c..eab7bd6628e0 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -14,6 +14,8 @@
14 * 2003-10-22 Updates by Stephen Hemminger. 14 * 2003-10-22 Updates by Stephen Hemminger.
15 * 2004 May-July Rework by Paul Jackson. 15 * 2004 May-July Rework by Paul Jackson.
16 * 2006 Rework by Paul Menage to use generic cgroups 16 * 2006 Rework by Paul Menage to use generic cgroups
17 * 2008 Rework of the scheduler domains and CPU hotplug handling
18 * by Max Krasnyansky
17 * 19 *
18 * This file is subject to the terms and conditions of the GNU General Public 20 * This file is subject to the terms and conditions of the GNU General Public
19 * License. See the file COPYING in the main directory of the Linux 21 * License. See the file COPYING in the main directory of the Linux
@@ -54,7 +56,6 @@
54#include <asm/uaccess.h> 56#include <asm/uaccess.h>
55#include <asm/atomic.h> 57#include <asm/atomic.h>
56#include <linux/mutex.h> 58#include <linux/mutex.h>
57#include <linux/kfifo.h>
58#include <linux/workqueue.h> 59#include <linux/workqueue.h>
59#include <linux/cgroup.h> 60#include <linux/cgroup.h>
60 61
@@ -227,10 +228,6 @@ static struct cpuset top_cpuset = {
227 * The task_struct fields mems_allowed and mems_generation may only 228 * The task_struct fields mems_allowed and mems_generation may only
228 * be accessed in the context of that task, so require no locks. 229 * be accessed in the context of that task, so require no locks.
229 * 230 *
230 * The cpuset_common_file_write handler for operations that modify
231 * the cpuset hierarchy holds cgroup_mutex across the entire operation,
232 * single threading all such cpuset modifications across the system.
233 *
234 * The cpuset_common_file_read() handlers only hold callback_mutex across 231 * The cpuset_common_file_read() handlers only hold callback_mutex across
235 * small pieces of code, such as when reading out possibly multi-word 232 * small pieces of code, such as when reading out possibly multi-word
236 * cpumasks and nodemasks. 233 * cpumasks and nodemasks.
@@ -241,9 +238,11 @@ static struct cpuset top_cpuset = {
241 238
242static DEFINE_MUTEX(callback_mutex); 239static DEFINE_MUTEX(callback_mutex);
243 240
244/* This is ugly, but preserves the userspace API for existing cpuset 241/*
242 * This is ugly, but preserves the userspace API for existing cpuset
245 * users. If someone tries to mount the "cpuset" filesystem, we 243 * users. If someone tries to mount the "cpuset" filesystem, we
246 * silently switch it to mount "cgroup" instead */ 244 * silently switch it to mount "cgroup" instead
245 */
247static int cpuset_get_sb(struct file_system_type *fs_type, 246static int cpuset_get_sb(struct file_system_type *fs_type,
248 int flags, const char *unused_dev_name, 247 int flags, const char *unused_dev_name,
249 void *data, struct vfsmount *mnt) 248 void *data, struct vfsmount *mnt)
@@ -369,7 +368,7 @@ void cpuset_update_task_memory_state(void)
369 my_cpusets_mem_gen = top_cpuset.mems_generation; 368 my_cpusets_mem_gen = top_cpuset.mems_generation;
370 } else { 369 } else {
371 rcu_read_lock(); 370 rcu_read_lock();
372 my_cpusets_mem_gen = task_cs(current)->mems_generation; 371 my_cpusets_mem_gen = task_cs(tsk)->mems_generation;
373 rcu_read_unlock(); 372 rcu_read_unlock();
374 } 373 }
375 374
@@ -478,10 +477,9 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
478} 477}
479 478
480/* 479/*
481 * Helper routine for rebuild_sched_domains(). 480 * Helper routine for generate_sched_domains().
482 * Do cpusets a, b have overlapping cpus_allowed masks? 481 * Do cpusets a, b have overlapping cpus_allowed masks?
483 */ 482 */
484
485static int cpusets_overlap(struct cpuset *a, struct cpuset *b) 483static int cpusets_overlap(struct cpuset *a, struct cpuset *b)
486{ 484{
487 return cpus_intersects(a->cpus_allowed, b->cpus_allowed); 485 return cpus_intersects(a->cpus_allowed, b->cpus_allowed);
@@ -490,29 +488,48 @@ static int cpusets_overlap(struct cpuset *a, struct cpuset *b)
490static void 488static void
491update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c) 489update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
492{ 490{
493 if (!dattr)
494 return;
495 if (dattr->relax_domain_level < c->relax_domain_level) 491 if (dattr->relax_domain_level < c->relax_domain_level)
496 dattr->relax_domain_level = c->relax_domain_level; 492 dattr->relax_domain_level = c->relax_domain_level;
497 return; 493 return;
498} 494}
499 495
496static void
497update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
498{
499 LIST_HEAD(q);
500
501 list_add(&c->stack_list, &q);
502 while (!list_empty(&q)) {
503 struct cpuset *cp;
504 struct cgroup *cont;
505 struct cpuset *child;
506
507 cp = list_first_entry(&q, struct cpuset, stack_list);
508 list_del(q.next);
509
510 if (cpus_empty(cp->cpus_allowed))
511 continue;
512
513 if (is_sched_load_balance(cp))
514 update_domain_attr(dattr, cp);
515
516 list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
517 child = cgroup_cs(cont);
518 list_add_tail(&child->stack_list, &q);
519 }
520 }
521}
522
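
update_domain_attr_tree() walks the whole cpuset subtree top-down with a plain list used as a FIFO queue: push the root, then repeatedly pop a node, act on it, and push its children. A stand-alone sketch of that traversal, using a small array instead of the kernel's list_head-threaded queue, with struct node and walk() invented for the example:

#include <stdio.h>

#define MAX_CHILDREN 4
#define MAX_NODES    32

/* Toy stand-in for a cpuset: a name, one flag, child pointers. */
struct node {
    const char *name;
    int balanced;                       /* like is_sched_load_balance() */
    struct node *child[MAX_CHILDREN];
};

/* Top-down scan of the tree with an explicit FIFO queue: push the
 * root, then pop, visit, and push children until the queue drains. */
static void walk(struct node *root)
{
    struct node *queue[MAX_NODES];
    int head = 0, tail = 0;

    queue[tail++] = root;
    while (head < tail) {
        struct node *n = queue[head++];

        if (n->balanced)
            printf("would collect %s\n", n->name);

        for (int i = 0; i < MAX_CHILDREN && n->child[i]; i++)
            if (tail < MAX_NODES)
                queue[tail++] = n->child[i];
    }
}

int main(void)
{
    struct node leaf_a = { .name = "cs-a", .balanced = 1 };
    struct node leaf_b = { .name = "cs-b", .balanced = 0 };
    struct node top    = { .name = "top",  .balanced = 1,
                           .child = { &leaf_a, &leaf_b } };

    walk(&top);
    return 0;
}
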
500/* 523/*
501 * rebuild_sched_domains() 524 * generate_sched_domains()
502 * 525 *
 503 * If the flag 'sched_load_balance' of any cpuset with non-empty 526 * This function builds a partial partition of the system's CPUs.
504 * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset 527 * A 'partial partition' is a set of non-overlapping subsets whose
505 * which has that flag enabled, or if any cpuset with a non-empty 528 * union is a subset of that set.
506 * 'cpus' is removed, then call this routine to rebuild the 529 * The output of this function needs to be passed to kernel/sched.c
507 * scheduler's dynamic sched domains. 530 * partition_sched_domains() routine, which will rebuild the scheduler's
508 * 531 * load balancing domains (sched domains) as specified by that partial
509 * This routine builds a partial partition of the systems CPUs 532 * partition.
510 * (the set of non-overlappping cpumask_t's in the array 'part'
511 * below), and passes that partial partition to the kernel/sched.c
512 * partition_sched_domains() routine, which will rebuild the
513 * schedulers load balancing domains (sched domains) as specified
514 * by that partial partition. A 'partial partition' is a set of
515 * non-overlapping subsets whose union is a subset of that set.
516 * 533 *
517 * See "What is sched_load_balance" in Documentation/cpusets.txt 534 * See "What is sched_load_balance" in Documentation/cpusets.txt
518 * for a background explanation of this. 535 * for a background explanation of this.
@@ -522,16 +539,10 @@ update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
522 * domains when operating in the severe memory shortage situations 539 * domains when operating in the severe memory shortage situations
523 * that could cause allocation failures below. 540 * that could cause allocation failures below.
524 * 541 *
525 * Call with cgroup_mutex held. May take callback_mutex during 542 * Must be called with cgroup_lock held.
526 * call due to the kfifo_alloc() and kmalloc() calls. May nest
527 * a call to the get_online_cpus()/put_online_cpus() pair.
528 * Must not be called holding callback_mutex, because we must not
529 * call get_online_cpus() while holding callback_mutex. Elsewhere
530 * the kernel nests callback_mutex inside get_online_cpus() calls.
531 * So the reverse nesting would risk an ABBA deadlock.
532 * 543 *
533 * The three key local variables below are: 544 * The three key local variables below are:
534 * q - a kfifo queue of cpuset pointers, used to implement a 545 * q - a linked-list queue of cpuset pointers, used to implement a
535 * top-down scan of all cpusets. This scan loads a pointer 546 * top-down scan of all cpusets. This scan loads a pointer
536 * to each cpuset marked is_sched_load_balance into the 547 * to each cpuset marked is_sched_load_balance into the
537 * array 'csa'. For our purposes, rebuilding the schedulers 548 * array 'csa'. For our purposes, rebuilding the schedulers
@@ -563,10 +574,10 @@ update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
563 * element of the partition (one sched domain) to be passed to 574 * element of the partition (one sched domain) to be passed to
564 * partition_sched_domains(). 575 * partition_sched_domains().
565 */ 576 */
566 577static int generate_sched_domains(cpumask_t **domains,
567static void rebuild_sched_domains(void) 578 struct sched_domain_attr **attributes)
568{ 579{
569 struct kfifo *q; /* queue of cpusets to be scanned */ 580 LIST_HEAD(q); /* queue of cpusets to be scanned */
570 struct cpuset *cp; /* scans q */ 581 struct cpuset *cp; /* scans q */
571 struct cpuset **csa; /* array of all cpuset ptrs */ 582 struct cpuset **csa; /* array of all cpuset ptrs */
572 int csn; /* how many cpuset ptrs in csa so far */ 583 int csn; /* how many cpuset ptrs in csa so far */
@@ -576,44 +587,58 @@ static void rebuild_sched_domains(void)
576 int ndoms; /* number of sched domains in result */ 587 int ndoms; /* number of sched domains in result */
577 int nslot; /* next empty doms[] cpumask_t slot */ 588 int nslot; /* next empty doms[] cpumask_t slot */
578 589
579 q = NULL; 590 ndoms = 0;
580 csa = NULL;
581 doms = NULL; 591 doms = NULL;
582 dattr = NULL; 592 dattr = NULL;
593 csa = NULL;
583 594
584 /* Special case for the 99% of systems with one, full, sched domain */ 595 /* Special case for the 99% of systems with one, full, sched domain */
585 if (is_sched_load_balance(&top_cpuset)) { 596 if (is_sched_load_balance(&top_cpuset)) {
586 ndoms = 1;
587 doms = kmalloc(sizeof(cpumask_t), GFP_KERNEL); 597 doms = kmalloc(sizeof(cpumask_t), GFP_KERNEL);
588 if (!doms) 598 if (!doms)
589 goto rebuild; 599 goto done;
600
590 dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL); 601 dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL);
591 if (dattr) { 602 if (dattr) {
592 *dattr = SD_ATTR_INIT; 603 *dattr = SD_ATTR_INIT;
593 update_domain_attr(dattr, &top_cpuset); 604 update_domain_attr_tree(dattr, &top_cpuset);
594 } 605 }
595 *doms = top_cpuset.cpus_allowed; 606 *doms = top_cpuset.cpus_allowed;
596 goto rebuild;
597 }
598 607
599 q = kfifo_alloc(number_of_cpusets * sizeof(cp), GFP_KERNEL, NULL); 608 ndoms = 1;
600 if (IS_ERR(q))
601 goto done; 609 goto done;
610 }
611
602 csa = kmalloc(number_of_cpusets * sizeof(cp), GFP_KERNEL); 612 csa = kmalloc(number_of_cpusets * sizeof(cp), GFP_KERNEL);
603 if (!csa) 613 if (!csa)
604 goto done; 614 goto done;
605 csn = 0; 615 csn = 0;
606 616
607 cp = &top_cpuset; 617 list_add(&top_cpuset.stack_list, &q);
608 __kfifo_put(q, (void *)&cp, sizeof(cp)); 618 while (!list_empty(&q)) {
609 while (__kfifo_get(q, (void *)&cp, sizeof(cp))) {
610 struct cgroup *cont; 619 struct cgroup *cont;
611 struct cpuset *child; /* scans child cpusets of cp */ 620 struct cpuset *child; /* scans child cpusets of cp */
612 if (is_sched_load_balance(cp)) 621
622 cp = list_first_entry(&q, struct cpuset, stack_list);
623 list_del(q.next);
624
625 if (cpus_empty(cp->cpus_allowed))
626 continue;
627
628 /*
629 * All child cpusets contain a subset of the parent's cpus, so
630 * just skip them, and then we call update_domain_attr_tree()
631 * to calc relax_domain_level of the corresponding sched
632 * domain.
633 */
634 if (is_sched_load_balance(cp)) {
613 csa[csn++] = cp; 635 csa[csn++] = cp;
636 continue;
637 }
638
614 list_for_each_entry(cont, &cp->css.cgroup->children, sibling) { 639 list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
615 child = cgroup_cs(cont); 640 child = cgroup_cs(cont);
616 __kfifo_put(q, (void *)&child, sizeof(cp)); 641 list_add_tail(&child->stack_list, &q);
617 } 642 }
618 } 643 }
619 644
@@ -644,91 +669,141 @@ restart:
644 } 669 }
645 } 670 }
646 671
647 /* Convert <csn, csa> to <ndoms, doms> */ 672 /*
673 * Now we know how many domains to create.
674 * Convert <csn, csa> to <ndoms, doms> and populate cpu masks.
675 */
648 doms = kmalloc(ndoms * sizeof(cpumask_t), GFP_KERNEL); 676 doms = kmalloc(ndoms * sizeof(cpumask_t), GFP_KERNEL);
649 if (!doms) 677 if (!doms) {
650 goto rebuild; 678 ndoms = 0;
679 goto done;
680 }
681
682 /*
683 * The rest of the code, including the scheduler, can deal with
684 * dattr==NULL case. No need to abort if alloc fails.
685 */
651 dattr = kmalloc(ndoms * sizeof(struct sched_domain_attr), GFP_KERNEL); 686 dattr = kmalloc(ndoms * sizeof(struct sched_domain_attr), GFP_KERNEL);
652 687
653 for (nslot = 0, i = 0; i < csn; i++) { 688 for (nslot = 0, i = 0; i < csn; i++) {
654 struct cpuset *a = csa[i]; 689 struct cpuset *a = csa[i];
690 cpumask_t *dp;
655 int apn = a->pn; 691 int apn = a->pn;
656 692
657 if (apn >= 0) { 693 if (apn < 0) {
658 cpumask_t *dp = doms + nslot; 694 /* Skip completed partitions */
659 695 continue;
660 if (nslot == ndoms) { 696 }
661 static int warnings = 10; 697
662 if (warnings) { 698 dp = doms + nslot;
663 printk(KERN_WARNING 699
664 "rebuild_sched_domains confused:" 700 if (nslot == ndoms) {
665 " nslot %d, ndoms %d, csn %d, i %d," 701 static int warnings = 10;
666 " apn %d\n", 702 if (warnings) {
667 nslot, ndoms, csn, i, apn); 703 printk(KERN_WARNING
668 warnings--; 704 "rebuild_sched_domains confused:"
669 } 705 " nslot %d, ndoms %d, csn %d, i %d,"
670 continue; 706 " apn %d\n",
707 nslot, ndoms, csn, i, apn);
708 warnings--;
671 } 709 }
710 continue;
711 }
672 712
673 cpus_clear(*dp); 713 cpus_clear(*dp);
674 if (dattr) 714 if (dattr)
675 *(dattr + nslot) = SD_ATTR_INIT; 715 *(dattr + nslot) = SD_ATTR_INIT;
676 for (j = i; j < csn; j++) { 716 for (j = i; j < csn; j++) {
677 struct cpuset *b = csa[j]; 717 struct cpuset *b = csa[j];
678 718
679 if (apn == b->pn) { 719 if (apn == b->pn) {
680 cpus_or(*dp, *dp, b->cpus_allowed); 720 cpus_or(*dp, *dp, b->cpus_allowed);
681 b->pn = -1; 721 if (dattr)
682 update_domain_attr(dattr, b); 722 update_domain_attr_tree(dattr + nslot, b);
683 } 723
724 /* Done with this partition */
725 b->pn = -1;
684 } 726 }
685 nslot++;
686 } 727 }
728 nslot++;
687 } 729 }
688 BUG_ON(nslot != ndoms); 730 BUG_ON(nslot != ndoms);
689 731
690rebuild:
691 /* Have scheduler rebuild sched domains */
692 get_online_cpus();
693 partition_sched_domains(ndoms, doms, dattr);
694 put_online_cpus();
695
696done: 732done:
697 if (q && !IS_ERR(q))
698 kfifo_free(q);
699 kfree(csa); 733 kfree(csa);
700 /* Don't kfree(doms) -- partition_sched_domains() does that. */ 734
701 /* Don't kfree(dattr) -- partition_sched_domains() does that. */ 735 *domains = doms;
736 *attributes = dattr;
737 return ndoms;
702} 738}
703 739
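
The conversion loop above groups every cpuset that received the same partition number (pn) into one sched-domain mask, OR-ing their cpus_allowed together and marking members done with pn = -1. A reduced model with plain unsigned long bitmasks, where struct cs and build_domains() are invented and the sched_domain_attr handling is left out:

#include <stdio.h>

/* Toy cpuset: a bitmask of allowed CPUs plus the partition number
 * (pn) assigned by the earlier graph-colouring pass. */
struct cs {
    unsigned long cpus;
    int pn;
};

/* Merge every cpuset that shares a partition number into one domain
 * mask, the same two-level loop used to convert <csn, csa> into
 * <ndoms, doms>. */
static int build_domains(struct cs *csa, int csn, unsigned long *doms)
{
    int nslot = 0;

    for (int i = 0; i < csn; i++) {
        int apn = csa[i].pn;

        if (apn < 0)
            continue;               /* already folded into a domain */

        doms[nslot] = 0;
        for (int j = i; j < csn; j++) {
            if (csa[j].pn == apn) {
                doms[nslot] |= csa[j].cpus;
                csa[j].pn = -1;     /* done with this partition */
            }
        }
        nslot++;
    }
    return nslot;                   /* number of domains built */
}

int main(void)
{
    struct cs csa[] = { { 0x0f, 0 }, { 0xf0, 1 }, { 0x30, 1 } };
    unsigned long doms[3];
    int ndoms = build_domains(csa, 3, doms);

    for (int i = 0; i < ndoms; i++)
        printf("domain %d: %#lx\n", i, doms[i]);
    return 0;
}
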
704static inline int started_after_time(struct task_struct *t1, 740/*
705 struct timespec *time, 741 * Rebuild scheduler domains.
706 struct task_struct *t2) 742 *
743 * Call with neither cgroup_mutex held nor within get_online_cpus().
744 * Takes both cgroup_mutex and get_online_cpus().
745 *
746 * Cannot be directly called from cpuset code handling changes
747 * to the cpuset pseudo-filesystem, because it cannot be called
748 * from code that already holds cgroup_mutex.
749 */
750static void do_rebuild_sched_domains(struct work_struct *unused)
707{ 751{
708 int start_diff = timespec_compare(&t1->start_time, time); 752 struct sched_domain_attr *attr;
709 if (start_diff > 0) { 753 cpumask_t *doms;
710 return 1; 754 int ndoms;
711 } else if (start_diff < 0) { 755
712 return 0; 756 get_online_cpus();
713 } else { 757
714 /* 758 /* Generate domain masks and attrs */
715 * Arbitrarily, if two processes started at the same 759 cgroup_lock();
716 * time, we'll say that the lower pointer value 760 ndoms = generate_sched_domains(&doms, &attr);
717 * started first. Note that t2 may have exited by now 761 cgroup_unlock();
718 * so this may not be a valid pointer any longer, but 762
719 * that's fine - it still serves to distinguish 763 /* Have scheduler rebuild the domains */
720 * between two tasks started (effectively) 764 partition_sched_domains(ndoms, doms, attr);
721 * simultaneously. 765
722 */ 766 put_online_cpus();
723 return t1 > t2; 767}
724 } 768
769static DECLARE_WORK(rebuild_sched_domains_work, do_rebuild_sched_domains);
770
771/*
772 * Rebuild scheduler domains, asynchronously via workqueue.
773 *
774 * If the flag 'sched_load_balance' of any cpuset with non-empty
775 * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset
776 * which has that flag enabled, or if any cpuset with a non-empty
777 * 'cpus' is removed, then call this routine to rebuild the
778 * scheduler's dynamic sched domains.
779 *
780 * The rebuild_sched_domains() and partition_sched_domains()
781 * routines must nest cgroup_lock() inside get_online_cpus(),
782 * but such cpuset changes as these must nest that locking the
783 * other way, holding cgroup_lock() for much of the code.
784 *
785 * So in order to avoid an ABBA deadlock, the cpuset code handling
786 * these user changes delegates the actual sched domain rebuilding
787 * to a separate workqueue thread, which ends up processing the
788 * above do_rebuild_sched_domains() function.
789 */
790static void async_rebuild_sched_domains(void)
791{
792 schedule_work(&rebuild_sched_domains_work);
725} 793}
726 794
727static inline int started_after(void *p1, void *p2) 795/*
796 * Accomplishes the same scheduler domain rebuild as the above
797 * async_rebuild_sched_domains(), however it directly calls the
798 * rebuild routine synchronously rather than calling it via an
799 * asynchronous work thread.
800 *
801 * This can only be called from code that is not holding
802 * cgroup_mutex (not nested in a cgroup_lock() call.)
803 */
804void rebuild_sched_domains(void)
728{ 805{
729 struct task_struct *t1 = p1; 806 do_rebuild_sched_domains(NULL);
730 struct task_struct *t2 = p2;
731 return started_after_time(t1, &t2->start_time, t2);
732} 807}
733 808
734/** 809/**
@@ -766,15 +841,38 @@ static void cpuset_change_cpumask(struct task_struct *tsk,
766} 841}
767 842
768/** 843/**
844 * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.
845 * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
846 * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks()
847 *
848 * Called with cgroup_mutex held
849 *
850 * The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
851 * calling callback functions for each.
852 *
853 * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0
854 * if @heap != NULL.
855 */
856static void update_tasks_cpumask(struct cpuset *cs, struct ptr_heap *heap)
857{
858 struct cgroup_scanner scan;
859
860 scan.cg = cs->css.cgroup;
861 scan.test_task = cpuset_test_cpumask;
862 scan.process_task = cpuset_change_cpumask;
863 scan.heap = heap;
864 cgroup_scan_tasks(&scan);
865}
866
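
update_tasks_cpumask() delegates the per-task loop to cgroup_scan_tasks(), which simply applies a filter callback and an action callback to every task in the cgroup. A stand-alone sketch of that scanner shape, with struct task, struct scanner and scan_tasks() invented for the example:

#include <stdbool.h>
#include <stdio.h>

/* Toy task and a scanner driven by two callbacks, mirroring the
 * cgroup_scanner shape: test_task() filters, process_task() acts. */
struct task {
    int id;
    unsigned long cpus_allowed;
};

struct scanner {
    bool (*test_task)(struct task *t, void *data);
    void (*process_task)(struct task *t, void *data);
    void *data;
};

static void scan_tasks(struct task *tasks, int n, struct scanner *scan)
{
    for (int i = 0; i < n; i++) {
        if (scan->test_task && !scan->test_task(&tasks[i], scan->data))
            continue;
        scan->process_task(&tasks[i], scan->data);
    }
}

/* Filter: only tasks whose mask differs from the cpuset's new mask. */
static bool needs_update(struct task *t, void *data)
{
    return t->cpus_allowed != *(unsigned long *)data;
}

/* Action: adopt the cpuset's new mask. */
static void change_mask(struct task *t, void *data)
{
    t->cpus_allowed = *(unsigned long *)data;
    printf("task %d rebound to %#lx\n", t->id, t->cpus_allowed);
}

int main(void)
{
    struct task tasks[] = { { 1, 0x3 }, { 2, 0xf } };
    unsigned long newmask = 0xf;
    struct scanner scan = { needs_update, change_mask, &newmask };

    scan_tasks(tasks, 2, &scan);
    return 0;
}
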
867/**
769 * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it 868 * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it
770 * @cs: the cpuset to consider 869 * @cs: the cpuset to consider
771 * @buf: buffer of cpu numbers written to this cpuset 870 * @buf: buffer of cpu numbers written to this cpuset
772 */ 871 */
773static int update_cpumask(struct cpuset *cs, char *buf) 872static int update_cpumask(struct cpuset *cs, const char *buf)
774{ 873{
775 struct cpuset trialcs;
776 struct cgroup_scanner scan;
777 struct ptr_heap heap; 874 struct ptr_heap heap;
875 struct cpuset trialcs;
778 int retval; 876 int retval;
779 int is_load_balanced; 877 int is_load_balanced;
780 878
@@ -790,7 +888,6 @@ static int update_cpumask(struct cpuset *cs, char *buf)
790 * that parsing. The validate_change() call ensures that cpusets 888 * that parsing. The validate_change() call ensures that cpusets
791 * with tasks have cpus. 889 * with tasks have cpus.
792 */ 890 */
793 buf = strstrip(buf);
794 if (!*buf) { 891 if (!*buf) {
795 cpus_clear(trialcs.cpus_allowed); 892 cpus_clear(trialcs.cpus_allowed);
796 } else { 893 } else {
@@ -809,7 +906,7 @@ static int update_cpumask(struct cpuset *cs, char *buf)
809 if (cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed)) 906 if (cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed))
810 return 0; 907 return 0;
811 908
812 retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, &started_after); 909 retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
813 if (retval) 910 if (retval)
814 return retval; 911 return retval;
815 912
@@ -823,15 +920,12 @@ static int update_cpumask(struct cpuset *cs, char *buf)
823 * Scan tasks in the cpuset, and update the cpumasks of any 920 * Scan tasks in the cpuset, and update the cpumasks of any
824 * that need an update. 921 * that need an update.
825 */ 922 */
826 scan.cg = cs->css.cgroup; 923 update_tasks_cpumask(cs, &heap);
827 scan.test_task = cpuset_test_cpumask; 924
828 scan.process_task = cpuset_change_cpumask;
829 scan.heap = &heap;
830 cgroup_scan_tasks(&scan);
831 heap_free(&heap); 925 heap_free(&heap);
832 926
833 if (is_load_balanced) 927 if (is_load_balanced)
834 rebuild_sched_domains(); 928 async_rebuild_sched_domains();
835 return 0; 929 return 0;
836} 930}
837 931
@@ -884,74 +978,25 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
884 mutex_unlock(&callback_mutex); 978 mutex_unlock(&callback_mutex);
885} 979}
886 980
887/*
888 * Handle user request to change the 'mems' memory placement
889 * of a cpuset. Needs to validate the request, update the
890 * cpusets mems_allowed and mems_generation, and for each
891 * task in the cpuset, rebind any vma mempolicies and if
892 * the cpuset is marked 'memory_migrate', migrate the tasks
893 * pages to the new memory.
894 *
895 * Call with cgroup_mutex held. May take callback_mutex during call.
896 * Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
897 * lock each such tasks mm->mmap_sem, scan its vma's and rebind
898 * their mempolicies to the cpusets new mems_allowed.
899 */
900
901static void *cpuset_being_rebound; 981static void *cpuset_being_rebound;
902 982
903static int update_nodemask(struct cpuset *cs, char *buf) 983/**
984 * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset.
985 * @cs: the cpuset in which each task's mems_allowed mask needs to be changed
986 * @oldmem: old mems_allowed of cpuset cs
987 *
988 * Called with cgroup_mutex held
989 * Return 0 if successful, -errno if not.
990 */
991static int update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem)
904{ 992{
905 struct cpuset trialcs;
906 nodemask_t oldmem;
907 struct task_struct *p; 993 struct task_struct *p;
908 struct mm_struct **mmarray; 994 struct mm_struct **mmarray;
909 int i, n, ntasks; 995 int i, n, ntasks;
910 int migrate; 996 int migrate;
911 int fudge; 997 int fudge;
912 int retval;
913 struct cgroup_iter it; 998 struct cgroup_iter it;
914 999 int retval;
915 /*
916 * top_cpuset.mems_allowed tracks node_stats[N_HIGH_MEMORY];
917 * it's read-only
918 */
919 if (cs == &top_cpuset)
920 return -EACCES;
921
922 trialcs = *cs;
923
924 /*
925 * An empty mems_allowed is ok iff there are no tasks in the cpuset.
926 * Since nodelist_parse() fails on an empty mask, we special case
927 * that parsing. The validate_change() call ensures that cpusets
928 * with tasks have memory.
929 */
930 buf = strstrip(buf);
931 if (!*buf) {
932 nodes_clear(trialcs.mems_allowed);
933 } else {
934 retval = nodelist_parse(buf, trialcs.mems_allowed);
935 if (retval < 0)
936 goto done;
937
938 if (!nodes_subset(trialcs.mems_allowed,
939 node_states[N_HIGH_MEMORY]))
940 return -EINVAL;
941 }
942 oldmem = cs->mems_allowed;
943 if (nodes_equal(oldmem, trialcs.mems_allowed)) {
944 retval = 0; /* Too easy - nothing to do */
945 goto done;
946 }
947 retval = validate_change(cs, &trialcs);
948 if (retval < 0)
949 goto done;
950
951 mutex_lock(&callback_mutex);
952 cs->mems_allowed = trialcs.mems_allowed;
953 cs->mems_generation = cpuset_mems_generation++;
954 mutex_unlock(&callback_mutex);
955 1000
956 cpuset_being_rebound = cs; /* causes mpol_dup() rebind */ 1001 cpuset_being_rebound = cs; /* causes mpol_dup() rebind */
957 1002
@@ -1018,7 +1063,7 @@ static int update_nodemask(struct cpuset *cs, char *buf)
1018 1063
1019 mpol_rebind_mm(mm, &cs->mems_allowed); 1064 mpol_rebind_mm(mm, &cs->mems_allowed);
1020 if (migrate) 1065 if (migrate)
1021 cpuset_migrate_mm(mm, &oldmem, &cs->mems_allowed); 1066 cpuset_migrate_mm(mm, oldmem, &cs->mems_allowed);
1022 mmput(mm); 1067 mmput(mm);
1023 } 1068 }
1024 1069
@@ -1030,6 +1075,70 @@ done:
1030 return retval; 1075 return retval;
1031} 1076}
1032 1077
1078/*
1079 * Handle user request to change the 'mems' memory placement
1080 * of a cpuset. Needs to validate the request, update the
1081 * cpusets mems_allowed and mems_generation, and for each
1082 * task in the cpuset, rebind any vma mempolicies and if
1083 * the cpuset is marked 'memory_migrate', migrate the tasks
1084 * pages to the new memory.
1085 *
1086 * Call with cgroup_mutex held. May take callback_mutex during call.
1087 * Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
1088 * lock each such tasks mm->mmap_sem, scan its vma's and rebind
1089 * their mempolicies to the cpusets new mems_allowed.
1090 */
1091static int update_nodemask(struct cpuset *cs, const char *buf)
1092{
1093 struct cpuset trialcs;
1094 nodemask_t oldmem;
1095 int retval;
1096
1097 /*
1098 * top_cpuset.mems_allowed tracks node_stats[N_HIGH_MEMORY];
1099 * it's read-only
1100 */
1101 if (cs == &top_cpuset)
1102 return -EACCES;
1103
1104 trialcs = *cs;
1105
1106 /*
1107 * An empty mems_allowed is ok iff there are no tasks in the cpuset.
1108 * Since nodelist_parse() fails on an empty mask, we special case
1109 * that parsing. The validate_change() call ensures that cpusets
1110 * with tasks have memory.
1111 */
1112 if (!*buf) {
1113 nodes_clear(trialcs.mems_allowed);
1114 } else {
1115 retval = nodelist_parse(buf, trialcs.mems_allowed);
1116 if (retval < 0)
1117 goto done;
1118
1119 if (!nodes_subset(trialcs.mems_allowed,
1120 node_states[N_HIGH_MEMORY]))
1121 return -EINVAL;
1122 }
1123 oldmem = cs->mems_allowed;
1124 if (nodes_equal(oldmem, trialcs.mems_allowed)) {
1125 retval = 0; /* Too easy - nothing to do */
1126 goto done;
1127 }
1128 retval = validate_change(cs, &trialcs);
1129 if (retval < 0)
1130 goto done;
1131
1132 mutex_lock(&callback_mutex);
1133 cs->mems_allowed = trialcs.mems_allowed;
1134 cs->mems_generation = cpuset_mems_generation++;
1135 mutex_unlock(&callback_mutex);
1136
1137 retval = update_tasks_nodemask(cs, &oldmem);
1138done:
1139 return retval;
1140}
1141
1033int current_cpuset_is_being_rebound(void) 1142int current_cpuset_is_being_rebound(void)
1034{ 1143{
1035 return task_cs(current) == cpuset_being_rebound; 1144 return task_cs(current) == cpuset_being_rebound;
@@ -1042,7 +1151,8 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val)
1042 1151
1043 if (val != cs->relax_domain_level) { 1152 if (val != cs->relax_domain_level) {
1044 cs->relax_domain_level = val; 1153 cs->relax_domain_level = val;
1045 rebuild_sched_domains(); 1154 if (!cpus_empty(cs->cpus_allowed) && is_sched_load_balance(cs))
1155 async_rebuild_sched_domains();
1046 } 1156 }
1047 1157
1048 return 0; 1158 return 0;
@@ -1083,7 +1193,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
1083 mutex_unlock(&callback_mutex); 1193 mutex_unlock(&callback_mutex);
1084 1194
1085 if (cpus_nonempty && balance_flag_changed) 1195 if (cpus_nonempty && balance_flag_changed)
1086 rebuild_sched_domains(); 1196 async_rebuild_sched_domains();
1087 1197
1088 return 0; 1198 return 0;
1089} 1199}
@@ -1194,6 +1304,15 @@ static int cpuset_can_attach(struct cgroup_subsys *ss,
1194 1304
1195 if (cpus_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) 1305 if (cpus_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
1196 return -ENOSPC; 1306 return -ENOSPC;
1307 if (tsk->flags & PF_THREAD_BOUND) {
1308 cpumask_t mask;
1309
1310 mutex_lock(&callback_mutex);
1311 mask = cs->cpus_allowed;
1312 mutex_unlock(&callback_mutex);
1313 if (!cpus_equal(tsk->cpus_allowed, mask))
1314 return -EINVAL;
1315 }
1197 1316
1198 return security_task_setscheduler(tsk, 0, NULL); 1317 return security_task_setscheduler(tsk, 0, NULL);
1199} 1318}
@@ -1207,11 +1326,14 @@ static void cpuset_attach(struct cgroup_subsys *ss,
1207 struct mm_struct *mm; 1326 struct mm_struct *mm;
1208 struct cpuset *cs = cgroup_cs(cont); 1327 struct cpuset *cs = cgroup_cs(cont);
1209 struct cpuset *oldcs = cgroup_cs(oldcont); 1328 struct cpuset *oldcs = cgroup_cs(oldcont);
1329 int err;
1210 1330
1211 mutex_lock(&callback_mutex); 1331 mutex_lock(&callback_mutex);
1212 guarantee_online_cpus(cs, &cpus); 1332 guarantee_online_cpus(cs, &cpus);
1213 set_cpus_allowed_ptr(tsk, &cpus); 1333 err = set_cpus_allowed_ptr(tsk, &cpus);
1214 mutex_unlock(&callback_mutex); 1334 mutex_unlock(&callback_mutex);
1335 if (err)
1336 return;
1215 1337
1216 from = oldcs->mems_allowed; 1338 from = oldcs->mems_allowed;
1217 to = cs->mems_allowed; 1339 to = cs->mems_allowed;
@@ -1242,72 +1364,14 @@ typedef enum {
1242 FILE_SPREAD_SLAB, 1364 FILE_SPREAD_SLAB,
1243} cpuset_filetype_t; 1365} cpuset_filetype_t;
1244 1366
1245static ssize_t cpuset_common_file_write(struct cgroup *cont,
1246 struct cftype *cft,
1247 struct file *file,
1248 const char __user *userbuf,
1249 size_t nbytes, loff_t *unused_ppos)
1250{
1251 struct cpuset *cs = cgroup_cs(cont);
1252 cpuset_filetype_t type = cft->private;
1253 char *buffer;
1254 int retval = 0;
1255
1256 /* Crude upper limit on largest legitimate cpulist user might write. */
1257 if (nbytes > 100U + 6 * max(NR_CPUS, MAX_NUMNODES))
1258 return -E2BIG;
1259
1260 /* +1 for nul-terminator */
1261 buffer = kmalloc(nbytes + 1, GFP_KERNEL);
1262 if (!buffer)
1263 return -ENOMEM;
1264
1265 if (copy_from_user(buffer, userbuf, nbytes)) {
1266 retval = -EFAULT;
1267 goto out1;
1268 }
1269 buffer[nbytes] = 0; /* nul-terminate */
1270
1271 cgroup_lock();
1272
1273 if (cgroup_is_removed(cont)) {
1274 retval = -ENODEV;
1275 goto out2;
1276 }
1277
1278 switch (type) {
1279 case FILE_CPULIST:
1280 retval = update_cpumask(cs, buffer);
1281 break;
1282 case FILE_MEMLIST:
1283 retval = update_nodemask(cs, buffer);
1284 break;
1285 default:
1286 retval = -EINVAL;
1287 goto out2;
1288 }
1289
1290 if (retval == 0)
1291 retval = nbytes;
1292out2:
1293 cgroup_unlock();
1294out1:
1295 kfree(buffer);
1296 return retval;
1297}
1298
1299static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val) 1367static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
1300{ 1368{
1301 int retval = 0; 1369 int retval = 0;
1302 struct cpuset *cs = cgroup_cs(cgrp); 1370 struct cpuset *cs = cgroup_cs(cgrp);
1303 cpuset_filetype_t type = cft->private; 1371 cpuset_filetype_t type = cft->private;
1304 1372
1305 cgroup_lock(); 1373 if (!cgroup_lock_live_group(cgrp))
1306
1307 if (cgroup_is_removed(cgrp)) {
1308 cgroup_unlock();
1309 return -ENODEV; 1374 return -ENODEV;
1310 }
1311 1375
1312 switch (type) { 1376 switch (type) {
1313 case FILE_CPU_EXCLUSIVE: 1377 case FILE_CPU_EXCLUSIVE:
@@ -1353,12 +1417,9 @@ static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val)
1353 struct cpuset *cs = cgroup_cs(cgrp); 1417 struct cpuset *cs = cgroup_cs(cgrp);
1354 cpuset_filetype_t type = cft->private; 1418 cpuset_filetype_t type = cft->private;
1355 1419
1356 cgroup_lock(); 1420 if (!cgroup_lock_live_group(cgrp))
1357
1358 if (cgroup_is_removed(cgrp)) {
1359 cgroup_unlock();
1360 return -ENODEV; 1421 return -ENODEV;
1361 } 1422
1362 switch (type) { 1423 switch (type) {
1363 case FILE_SCHED_RELAX_DOMAIN_LEVEL: 1424 case FILE_SCHED_RELAX_DOMAIN_LEVEL:
1364 retval = update_relax_domain_level(cs, val); 1425 retval = update_relax_domain_level(cs, val);
@@ -1372,6 +1433,32 @@ static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val)
1372} 1433}
1373 1434
1374/* 1435/*
1436 * Common handling for a write to a "cpus" or "mems" file.
1437 */
1438static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,
1439 const char *buf)
1440{
1441 int retval = 0;
1442
1443 if (!cgroup_lock_live_group(cgrp))
1444 return -ENODEV;
1445
1446 switch (cft->private) {
1447 case FILE_CPULIST:
1448 retval = update_cpumask(cgroup_cs(cgrp), buf);
1449 break;
1450 case FILE_MEMLIST:
1451 retval = update_nodemask(cgroup_cs(cgrp), buf);
1452 break;
1453 default:
1454 retval = -EINVAL;
1455 break;
1456 }
1457 cgroup_unlock();
1458 return retval;
1459}
1460
1461/*
1375 * These ascii lists should be read in a single call, by using a user 1462 * These ascii lists should be read in a single call, by using a user
1376 * buffer large enough to hold the entire map. If read in smaller 1463 * buffer large enough to hold the entire map. If read in smaller
1377 * chunks, there is no guarantee of atomicity. Since the display format 1464 * chunks, there is no guarantee of atomicity. Since the display format
@@ -1467,6 +1554,9 @@ static u64 cpuset_read_u64(struct cgroup *cont, struct cftype *cft)
1467 default: 1554 default:
1468 BUG(); 1555 BUG();
1469 } 1556 }
1557
1558 /* Unreachable but makes gcc happy */
1559 return 0;
1470} 1560}
1471 1561
1472static s64 cpuset_read_s64(struct cgroup *cont, struct cftype *cft) 1562static s64 cpuset_read_s64(struct cgroup *cont, struct cftype *cft)
@@ -1479,6 +1569,9 @@ static s64 cpuset_read_s64(struct cgroup *cont, struct cftype *cft)
1479 default: 1569 default:
1480 BUG(); 1570 BUG();
1481 } 1571 }
1572
 1573 /* Unreachable but makes gcc happy */
1574 return 0;
1482} 1575}
1483 1576
1484 1577
@@ -1490,14 +1583,16 @@ static struct cftype files[] = {
1490 { 1583 {
1491 .name = "cpus", 1584 .name = "cpus",
1492 .read = cpuset_common_file_read, 1585 .read = cpuset_common_file_read,
1493 .write = cpuset_common_file_write, 1586 .write_string = cpuset_write_resmask,
1587 .max_write_len = (100U + 6 * NR_CPUS),
1494 .private = FILE_CPULIST, 1588 .private = FILE_CPULIST,
1495 }, 1589 },
1496 1590
1497 { 1591 {
1498 .name = "mems", 1592 .name = "mems",
1499 .read = cpuset_common_file_read, 1593 .read = cpuset_common_file_read,
1500 .write = cpuset_common_file_write, 1594 .write_string = cpuset_write_resmask,
1595 .max_write_len = (100U + 6 * MAX_NUMNODES),
1501 .private = FILE_MEMLIST, 1596 .private = FILE_MEMLIST,
1502 }, 1597 },
1503 1598
@@ -1665,15 +1760,9 @@ static struct cgroup_subsys_state *cpuset_create(
1665} 1760}
1666 1761
1667/* 1762/*
1668 * Locking note on the strange update_flag() call below:
1669 *
1670 * If the cpuset being removed has its flag 'sched_load_balance' 1763 * If the cpuset being removed has its flag 'sched_load_balance'
1671 * enabled, then simulate turning sched_load_balance off, which 1764 * enabled, then simulate turning sched_load_balance off, which
1672 * will call rebuild_sched_domains(). The get_online_cpus() 1765 * will call async_rebuild_sched_domains().
1673 * call in rebuild_sched_domains() must not be made while holding
1674 * callback_mutex. Elsewhere the kernel nests callback_mutex inside
1675 * get_online_cpus() calls. So the reverse nesting would risk an
1676 * ABBA deadlock.
1677 */ 1766 */
1678 1767
1679static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont) 1768static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
@@ -1692,7 +1781,7 @@ static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
1692struct cgroup_subsys cpuset_subsys = { 1781struct cgroup_subsys cpuset_subsys = {
1693 .name = "cpuset", 1782 .name = "cpuset",
1694 .create = cpuset_create, 1783 .create = cpuset_create,
1695 .destroy = cpuset_destroy, 1784 .destroy = cpuset_destroy,
1696 .can_attach = cpuset_can_attach, 1785 .can_attach = cpuset_can_attach,
1697 .attach = cpuset_attach, 1786 .attach = cpuset_attach,
1698 .populate = cpuset_populate, 1787 .populate = cpuset_populate,
@@ -1778,13 +1867,13 @@ static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to)
1778 scan.scan.heap = NULL; 1867 scan.scan.heap = NULL;
1779 scan.to = to->css.cgroup; 1868 scan.to = to->css.cgroup;
1780 1869
1781 if (cgroup_scan_tasks((struct cgroup_scanner *)&scan)) 1870 if (cgroup_scan_tasks(&scan.scan))
1782 printk(KERN_ERR "move_member_tasks_to_cpuset: " 1871 printk(KERN_ERR "move_member_tasks_to_cpuset: "
1783 "cgroup_scan_tasks failed\n"); 1872 "cgroup_scan_tasks failed\n");
1784} 1873}
1785 1874
1786/* 1875/*
1787 * If common_cpu_mem_hotplug_unplug(), below, unplugs any CPUs 1876 * If CPU and/or memory hotplug handlers, below, unplug any CPUs
1788 * or memory nodes, we need to walk over the cpuset hierarchy, 1877 * or memory nodes, we need to walk over the cpuset hierarchy,
1789 * removing that CPU or node from all cpusets. If this removes the 1878 * removing that CPU or node from all cpusets. If this removes the
1790 * last CPU or node from a cpuset, then move the tasks in the empty 1879 * last CPU or node from a cpuset, then move the tasks in the empty
@@ -1832,31 +1921,31 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
1832 * that has tasks along with an empty 'mems'. But if we did see such 1921 * that has tasks along with an empty 'mems'. But if we did see such
1833 * a cpuset, we'd handle it just like we do if its 'cpus' was empty. 1922 * a cpuset, we'd handle it just like we do if its 'cpus' was empty.
1834 */ 1923 */
1835static void scan_for_empty_cpusets(const struct cpuset *root) 1924static void scan_for_empty_cpusets(struct cpuset *root)
1836{ 1925{
1926 LIST_HEAD(queue);
1837 struct cpuset *cp; /* scans cpusets being updated */ 1927 struct cpuset *cp; /* scans cpusets being updated */
1838 struct cpuset *child; /* scans child cpusets of cp */ 1928 struct cpuset *child; /* scans child cpusets of cp */
1839 struct list_head queue;
1840 struct cgroup *cont; 1929 struct cgroup *cont;
1841 1930 nodemask_t oldmems;
1842 INIT_LIST_HEAD(&queue);
1843 1931
1844 list_add_tail((struct list_head *)&root->stack_list, &queue); 1932 list_add_tail((struct list_head *)&root->stack_list, &queue);
1845 1933
1846 while (!list_empty(&queue)) { 1934 while (!list_empty(&queue)) {
1847 cp = container_of(queue.next, struct cpuset, stack_list); 1935 cp = list_first_entry(&queue, struct cpuset, stack_list);
1848 list_del(queue.next); 1936 list_del(queue.next);
1849 list_for_each_entry(cont, &cp->css.cgroup->children, sibling) { 1937 list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
1850 child = cgroup_cs(cont); 1938 child = cgroup_cs(cont);
1851 list_add_tail(&child->stack_list, &queue); 1939 list_add_tail(&child->stack_list, &queue);
1852 } 1940 }
1853 cont = cp->css.cgroup;
1854 1941
1855 /* Continue past cpusets with all cpus, mems online */ 1942 /* Continue past cpusets with all cpus, mems online */
1856 if (cpus_subset(cp->cpus_allowed, cpu_online_map) && 1943 if (cpus_subset(cp->cpus_allowed, cpu_online_map) &&
1857 nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY])) 1944 nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY]))
1858 continue; 1945 continue;
1859 1946
1947 oldmems = cp->mems_allowed;
1948
1860 /* Remove offline cpus and mems from this cpuset. */ 1949 /* Remove offline cpus and mems from this cpuset. */
1861 mutex_lock(&callback_mutex); 1950 mutex_lock(&callback_mutex);
1862 cpus_and(cp->cpus_allowed, cp->cpus_allowed, cpu_online_map); 1951 cpus_and(cp->cpus_allowed, cp->cpus_allowed, cpu_online_map);
@@ -1868,38 +1957,14 @@ static void scan_for_empty_cpusets(const struct cpuset *root)
1868 if (cpus_empty(cp->cpus_allowed) || 1957 if (cpus_empty(cp->cpus_allowed) ||
1869 nodes_empty(cp->mems_allowed)) 1958 nodes_empty(cp->mems_allowed))
1870 remove_tasks_in_empty_cpuset(cp); 1959 remove_tasks_in_empty_cpuset(cp);
1960 else {
1961 update_tasks_cpumask(cp, NULL);
1962 update_tasks_nodemask(cp, &oldmems);
1963 }
1871 } 1964 }
1872} 1965}
1873 1966
1874/* 1967/*
1875 * The cpus_allowed and mems_allowed nodemasks in the top_cpuset track
1876 * cpu_online_map and node_states[N_HIGH_MEMORY]. Force the top cpuset to
1877 * track what's online after any CPU or memory node hotplug or unplug event.
1878 *
1879 * Since there are two callers of this routine, one for CPU hotplug
1880 * events and one for memory node hotplug events, we could have coded
1881 * two separate routines here. We code it as a single common routine
1882 * in order to minimize text size.
1883 */
1884
1885static void common_cpu_mem_hotplug_unplug(void)
1886{
1887 cgroup_lock();
1888
1889 top_cpuset.cpus_allowed = cpu_online_map;
1890 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
1891 scan_for_empty_cpusets(&top_cpuset);
1892
1893 /*
1894 * Scheduler destroys domains on hotplug events.
1895 * Rebuild them based on the current settings.
1896 */
1897 rebuild_sched_domains();
1898
1899 cgroup_unlock();
1900}
1901
1902/*
1903 * The top_cpuset tracks what CPUs and Memory Nodes are online, 1968 * The top_cpuset tracks what CPUs and Memory Nodes are online,
1904 * period. This is necessary in order to make cpusets transparent 1969 * period. This is necessary in order to make cpusets transparent
 1905 * (of no effect) on systems that are actively using CPU hotplug 1970
@@ -1907,29 +1972,52 @@ static void common_cpu_mem_hotplug_unplug(void)
1907 * 1972 *
1908 * This routine ensures that top_cpuset.cpus_allowed tracks 1973 * This routine ensures that top_cpuset.cpus_allowed tracks
1909 * cpu_online_map on each CPU hotplug (cpuhp) event. 1974 * cpu_online_map on each CPU hotplug (cpuhp) event.
1975 *
1976 * Called within get_online_cpus(). Needs to call cgroup_lock()
1977 * before calling generate_sched_domains().
1910 */ 1978 */
1911 1979static int cpuset_track_online_cpus(struct notifier_block *unused_nb,
1912static int cpuset_handle_cpuhp(struct notifier_block *unused_nb,
1913 unsigned long phase, void *unused_cpu) 1980 unsigned long phase, void *unused_cpu)
1914{ 1981{
1915 if (phase == CPU_DYING || phase == CPU_DYING_FROZEN) 1982 struct sched_domain_attr *attr;
1983 cpumask_t *doms;
1984 int ndoms;
1985
1986 switch (phase) {
1987 case CPU_ONLINE:
1988 case CPU_ONLINE_FROZEN:
1989 case CPU_DEAD:
1990 case CPU_DEAD_FROZEN:
1991 break;
1992
1993 default:
1916 return NOTIFY_DONE; 1994 return NOTIFY_DONE;
1995 }
1917 1996
1918 common_cpu_mem_hotplug_unplug(); 1997 cgroup_lock();
1919 return 0; 1998 top_cpuset.cpus_allowed = cpu_online_map;
1999 scan_for_empty_cpusets(&top_cpuset);
2000 ndoms = generate_sched_domains(&doms, &attr);
2001 cgroup_unlock();
2002
2003 /* Have scheduler rebuild the domains */
2004 partition_sched_domains(ndoms, doms, attr);
2005
2006 return NOTIFY_OK;
1920} 2007}
1921 2008
1922#ifdef CONFIG_MEMORY_HOTPLUG 2009#ifdef CONFIG_MEMORY_HOTPLUG
1923/* 2010/*
1924 * Keep top_cpuset.mems_allowed tracking node_states[N_HIGH_MEMORY]. 2011 * Keep top_cpuset.mems_allowed tracking node_states[N_HIGH_MEMORY].
1925 * Call this routine anytime after you change 2012 * Call this routine anytime after node_states[N_HIGH_MEMORY] changes.
1926 * node_states[N_HIGH_MEMORY]. 2013 * See also the previous routine cpuset_track_online_cpus().
1927 * See also the previous routine cpuset_handle_cpuhp().
1928 */ 2014 */
1929
1930void cpuset_track_online_nodes(void) 2015void cpuset_track_online_nodes(void)
1931{ 2016{
1932 common_cpu_mem_hotplug_unplug(); 2017 cgroup_lock();
2018 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
2019 scan_for_empty_cpusets(&top_cpuset);
2020 cgroup_unlock();
1933} 2021}
1934#endif 2022#endif
1935 2023
@@ -1944,11 +2032,10 @@ void __init cpuset_init_smp(void)
1944 top_cpuset.cpus_allowed = cpu_online_map; 2032 top_cpuset.cpus_allowed = cpu_online_map;
1945 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; 2033 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
1946 2034
1947 hotcpu_notifier(cpuset_handle_cpuhp, 0); 2035 hotcpu_notifier(cpuset_track_online_cpus, 0);
1948} 2036}
1949 2037
1950/** 2038/**
1951
1952 * cpuset_cpus_allowed - return cpus_allowed mask from a tasks cpuset. 2039 * cpuset_cpus_allowed - return cpus_allowed mask from a tasks cpuset.
1953 * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed. 2040 * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed.
1954 * @pmask: pointer to cpumask_t variable to receive cpus_allowed set. 2041 * @pmask: pointer to cpumask_t variable to receive cpus_allowed set.
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index 10e43fd8b721..b3179dad71be 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -145,8 +145,11 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
145 d->blkio_delay_total = (tmp < d->blkio_delay_total) ? 0 : tmp; 145 d->blkio_delay_total = (tmp < d->blkio_delay_total) ? 0 : tmp;
146 tmp = d->swapin_delay_total + tsk->delays->swapin_delay; 146 tmp = d->swapin_delay_total + tsk->delays->swapin_delay;
147 d->swapin_delay_total = (tmp < d->swapin_delay_total) ? 0 : tmp; 147 d->swapin_delay_total = (tmp < d->swapin_delay_total) ? 0 : tmp;
148 tmp = d->freepages_delay_total + tsk->delays->freepages_delay;
149 d->freepages_delay_total = (tmp < d->freepages_delay_total) ? 0 : tmp;
148 d->blkio_count += tsk->delays->blkio_count; 150 d->blkio_count += tsk->delays->blkio_count;
149 d->swapin_count += tsk->delays->swapin_count; 151 d->swapin_count += tsk->delays->swapin_count;
152 d->freepages_count += tsk->delays->freepages_count;
150 spin_unlock_irqrestore(&tsk->delays->lock, flags); 153 spin_unlock_irqrestore(&tsk->delays->lock, flags);
151 154
152done: 155done:
@@ -165,3 +168,16 @@ __u64 __delayacct_blkio_ticks(struct task_struct *tsk)
165 return ret; 168 return ret;
166} 169}
167 170
171void __delayacct_freepages_start(void)
172{
173 delayacct_start(&current->delays->freepages_start);
174}
175
176void __delayacct_freepages_end(void)
177{
178 delayacct_end(&current->delays->freepages_start,
179 &current->delays->freepages_end,
180 &current->delays->freepages_delay,
181 &current->delays->freepages_count);
182}
183
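
The delayacct helpers added above only timestamp and accumulate; the expectation is that the memory-reclaim path brackets direct reclaim with them. The sketch below is purely illustrative of such a call site and assumes the delayacct_freepages_start()/delayacct_freepages_end() wrappers declared in <linux/delayacct.h> by the matching header change (not shown in this hunk).

/* Illustrative only: bracketing direct reclaim with freepages delay accounting. */
#include <linux/delayacct.h>
#include <linux/swap.h>

static unsigned long reclaim_with_accounting(struct zonelist *zonelist,
					     int order, gfp_t gfp_mask)
{
	unsigned long nr_reclaimed;

	delayacct_freepages_start();	/* begin measuring reclaim delay */
	nr_reclaimed = try_to_free_pages(zonelist, order, gfp_mask);
	delayacct_freepages_end();	/* accumulate into freepages_delay/count */

	return nr_reclaimed;
}
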
diff --git a/kernel/dma-coherent.c b/kernel/dma-coherent.c
new file mode 100644
index 000000000000..f013a0c2e111
--- /dev/null
+++ b/kernel/dma-coherent.c
@@ -0,0 +1,155 @@
1/*
2 * Coherent per-device memory handling.
3 * Borrowed from i386
4 */
5#include <linux/kernel.h>
6#include <linux/dma-mapping.h>
7
8struct dma_coherent_mem {
9 void *virt_base;
10 u32 device_base;
11 int size;
12 int flags;
13 unsigned long *bitmap;
14};
15
16int dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr,
17 dma_addr_t device_addr, size_t size, int flags)
18{
19 void __iomem *mem_base = NULL;
20 int pages = size >> PAGE_SHIFT;
21 int bitmap_size = BITS_TO_LONGS(pages) * sizeof(long);
22
23 if ((flags & (DMA_MEMORY_MAP | DMA_MEMORY_IO)) == 0)
24 goto out;
25 if (!size)
26 goto out;
27 if (dev->dma_mem)
28 goto out;
29
30 /* FIXME: this routine just ignores DMA_MEMORY_INCLUDES_CHILDREN */
31
32 mem_base = ioremap(bus_addr, size);
33 if (!mem_base)
34 goto out;
35
36 dev->dma_mem = kzalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL);
37 if (!dev->dma_mem)
38 goto out;
39 dev->dma_mem->bitmap = kzalloc(bitmap_size, GFP_KERNEL);
40 if (!dev->dma_mem->bitmap)
41 goto free1_out;
42
43 dev->dma_mem->virt_base = mem_base;
44 dev->dma_mem->device_base = device_addr;
45 dev->dma_mem->size = pages;
46 dev->dma_mem->flags = flags;
47
48 if (flags & DMA_MEMORY_MAP)
49 return DMA_MEMORY_MAP;
50
51 return DMA_MEMORY_IO;
52
53 free1_out:
54 kfree(dev->dma_mem);
55 out:
56 if (mem_base)
57 iounmap(mem_base);
58 return 0;
59}
60EXPORT_SYMBOL(dma_declare_coherent_memory);
61
62void dma_release_declared_memory(struct device *dev)
63{
64 struct dma_coherent_mem *mem = dev->dma_mem;
65
66 if (!mem)
67 return;
68 dev->dma_mem = NULL;
69 iounmap(mem->virt_base);
70 kfree(mem->bitmap);
71 kfree(mem);
72}
73EXPORT_SYMBOL(dma_release_declared_memory);
74
75void *dma_mark_declared_memory_occupied(struct device *dev,
76 dma_addr_t device_addr, size_t size)
77{
78 struct dma_coherent_mem *mem = dev->dma_mem;
79 int pos, err;
80
81 size += device_addr & ~PAGE_MASK;
82
83 if (!mem)
84 return ERR_PTR(-EINVAL);
85
86 pos = (device_addr - mem->device_base) >> PAGE_SHIFT;
87 err = bitmap_allocate_region(mem->bitmap, pos, get_order(size));
88 if (err != 0)
89 return ERR_PTR(err);
90 return mem->virt_base + (pos << PAGE_SHIFT);
91}
92EXPORT_SYMBOL(dma_mark_declared_memory_occupied);
93
94/**
95 * dma_alloc_from_coherent() - try to allocate memory from the per-device coherent area
96 *
97 * @dev: device from which we allocate memory
98 * @size: size of requested memory area
99 * @dma_handle: This will be filled with the correct dma handle
100 * @ret: This pointer will be filled with the virtual address
 101 * to the allocated area.
102 *
103 * This function should be only called from per-arch dma_alloc_coherent()
104 * to support allocation from per-device coherent memory pools.
105 *
106 * Returns 0 if dma_alloc_coherent should continue with allocating from
107 * generic memory areas, or !0 if dma_alloc_coherent should return @ret.
108 */
109int dma_alloc_from_coherent(struct device *dev, ssize_t size,
110 dma_addr_t *dma_handle, void **ret)
111{
112 struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL;
113 int order = get_order(size);
114
115 if (mem) {
116 int page = bitmap_find_free_region(mem->bitmap, mem->size,
117 order);
118 if (page >= 0) {
119 *dma_handle = mem->device_base + (page << PAGE_SHIFT);
120 *ret = mem->virt_base + (page << PAGE_SHIFT);
121 memset(*ret, 0, size);
122 } else if (mem->flags & DMA_MEMORY_EXCLUSIVE)
123 *ret = NULL;
124 }
125 return (mem != NULL);
126}
127EXPORT_SYMBOL(dma_alloc_from_coherent);
128
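
To make the calling convention described in the comment above concrete, here is a minimal sketch of a per-arch dma_alloc_coherent() that consults the per-device pool first. It is not any architecture's real code and assumes cache-coherent hardware for the fallback path.

/* Sketch only: per-arch allocator giving the declared pool first refusal. */
static void *example_dma_alloc_coherent(struct device *dev, size_t size,
					dma_addr_t *dma_handle, gfp_t gfp)
{
	void *ret;

	/* Nonzero return means the per-device pool handled the request. */
	if (dma_alloc_from_coherent(dev, size, dma_handle, &ret))
		return ret;	/* may be NULL if the pool is exclusive and full */

	/* Fall back to the page allocator (coherent hardware assumed). */
	ret = (void *)__get_free_pages(gfp, get_order(size));
	if (ret) {
		memset(ret, 0, size);
		*dma_handle = virt_to_phys(ret);
	}
	return ret;
}
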
129/**
130 * dma_release_from_coherent() - try to free the memory allocated from per-device coherent memory pool
131 * @dev: device from which the memory was allocated
132 * @order: the order of pages allocated
133 * @vaddr: virtual address of allocated pages
134 *
135 * This checks whether the memory was allocated from the per-device
136 * coherent memory pool and if so, releases that memory.
137 *
138 * Returns 1 if we correctly released the memory, or 0 if
139 * dma_release_coherent() should proceed with releasing memory from
140 * generic pools.
141 */
142int dma_release_from_coherent(struct device *dev, int order, void *vaddr)
143{
144 struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL;
145
146 if (mem && vaddr >= mem->virt_base && vaddr <
147 (mem->virt_base + (mem->size << PAGE_SHIFT))) {
148 int page = (vaddr - mem->virt_base) >> PAGE_SHIFT;
149
150 bitmap_release_region(mem->bitmap, page, order);
151 return 1;
152 }
153 return 0;
154}
155EXPORT_SYMBOL(dma_release_from_coherent);
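
The matching free path mirrors the allocation sketch above: try the per-device pool first, then fall back. Again, this is a hedged illustration rather than real architecture code.

/* Sketch only: release to the declared pool when the buffer came from it. */
static void example_dma_free_coherent(struct device *dev, size_t size,
				      void *vaddr, dma_addr_t dma_handle)
{
	int order = get_order(size);

	if (dma_release_from_coherent(dev, order, vaddr))
		return;		/* buffer belonged to the per-device pool */

	free_pages((unsigned long)vaddr, order);	/* dma_handle unused in this simplified fallback */
}
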
diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c
index a9e6bad9f706..0d407e886735 100644
--- a/kernel/exec_domain.c
+++ b/kernel/exec_domain.c
@@ -65,7 +65,7 @@ lookup_exec_domain(u_long personality)
65 goto out; 65 goto out;
66 } 66 }
67 67
68#ifdef CONFIG_KMOD 68#ifdef CONFIG_MODULES
69 read_unlock(&exec_domains_lock); 69 read_unlock(&exec_domains_lock);
70 request_module("personality-%ld", pers); 70 request_module("personality-%ld", pers);
71 read_lock(&exec_domains_lock); 71 read_lock(&exec_domains_lock);
@@ -168,7 +168,6 @@ __set_personality(u_long personality)
168 current->personality = personality; 168 current->personality = personality;
169 oep = current_thread_info()->exec_domain; 169 oep = current_thread_info()->exec_domain;
170 current_thread_info()->exec_domain = ep; 170 current_thread_info()->exec_domain = ep;
171 set_fs_altroot();
172 171
173 module_put(oep->module); 172 module_put(oep->module);
174 return 0; 173 return 0;
diff --git a/kernel/exit.c b/kernel/exit.c
index fb8de6cbf2c7..c8d0485578be 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -13,6 +13,7 @@
13#include <linux/personality.h> 13#include <linux/personality.h>
14#include <linux/tty.h> 14#include <linux/tty.h>
15#include <linux/mnt_namespace.h> 15#include <linux/mnt_namespace.h>
16#include <linux/iocontext.h>
16#include <linux/key.h> 17#include <linux/key.h>
17#include <linux/security.h> 18#include <linux/security.h>
18#include <linux/cpu.h> 19#include <linux/cpu.h>
@@ -45,6 +46,7 @@
45#include <linux/resource.h> 46#include <linux/resource.h>
46#include <linux/blkdev.h> 47#include <linux/blkdev.h>
47#include <linux/task_io_accounting_ops.h> 48#include <linux/task_io_accounting_ops.h>
49#include <linux/tracehook.h>
48 50
49#include <asm/uaccess.h> 51#include <asm/uaccess.h>
50#include <asm/unistd.h> 52#include <asm/unistd.h>
@@ -70,7 +72,7 @@ static void __unhash_process(struct task_struct *p)
70 __get_cpu_var(process_counts)--; 72 __get_cpu_var(process_counts)--;
71 } 73 }
72 list_del_rcu(&p->thread_group); 74 list_del_rcu(&p->thread_group);
73 remove_parent(p); 75 list_del_init(&p->sibling);
74} 76}
75 77
76/* 78/*
@@ -84,7 +86,6 @@ static void __exit_signal(struct task_struct *tsk)
84 BUG_ON(!sig); 86 BUG_ON(!sig);
85 BUG_ON(!atomic_read(&sig->count)); 87 BUG_ON(!atomic_read(&sig->count));
86 88
87 rcu_read_lock();
88 sighand = rcu_dereference(tsk->sighand); 89 sighand = rcu_dereference(tsk->sighand);
89 spin_lock(&sighand->siglock); 90 spin_lock(&sighand->siglock);
90 91
@@ -111,15 +112,16 @@ static void __exit_signal(struct task_struct *tsk)
111 * We won't ever get here for the group leader, since it 112 * We won't ever get here for the group leader, since it
112 * will have been the last reference on the signal_struct. 113 * will have been the last reference on the signal_struct.
113 */ 114 */
114 sig->utime = cputime_add(sig->utime, tsk->utime); 115 sig->utime = cputime_add(sig->utime, task_utime(tsk));
115 sig->stime = cputime_add(sig->stime, tsk->stime); 116 sig->stime = cputime_add(sig->stime, task_stime(tsk));
116 sig->gtime = cputime_add(sig->gtime, tsk->gtime); 117 sig->gtime = cputime_add(sig->gtime, task_gtime(tsk));
117 sig->min_flt += tsk->min_flt; 118 sig->min_flt += tsk->min_flt;
118 sig->maj_flt += tsk->maj_flt; 119 sig->maj_flt += tsk->maj_flt;
119 sig->nvcsw += tsk->nvcsw; 120 sig->nvcsw += tsk->nvcsw;
120 sig->nivcsw += tsk->nivcsw; 121 sig->nivcsw += tsk->nivcsw;
121 sig->inblock += task_io_get_inblock(tsk); 122 sig->inblock += task_io_get_inblock(tsk);
122 sig->oublock += task_io_get_oublock(tsk); 123 sig->oublock += task_io_get_oublock(tsk);
124 task_io_accounting_add(&sig->ioac, &tsk->ioac);
123 sig->sum_sched_runtime += tsk->se.sum_exec_runtime; 125 sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
124 sig = NULL; /* Marker for below. */ 126 sig = NULL; /* Marker for below. */
125 } 127 }
@@ -135,7 +137,6 @@ static void __exit_signal(struct task_struct *tsk)
135 tsk->signal = NULL; 137 tsk->signal = NULL;
136 tsk->sighand = NULL; 138 tsk->sighand = NULL;
137 spin_unlock(&sighand->siglock); 139 spin_unlock(&sighand->siglock);
138 rcu_read_unlock();
139 140
140 __cleanup_sighand(sighand); 141 __cleanup_sighand(sighand);
141 clear_tsk_thread_flag(tsk,TIF_SIGPENDING); 142 clear_tsk_thread_flag(tsk,TIF_SIGPENDING);
@@ -151,16 +152,17 @@ static void delayed_put_task_struct(struct rcu_head *rhp)
151 put_task_struct(container_of(rhp, struct task_struct, rcu)); 152 put_task_struct(container_of(rhp, struct task_struct, rcu));
152} 153}
153 154
155
154void release_task(struct task_struct * p) 156void release_task(struct task_struct * p)
155{ 157{
156 struct task_struct *leader; 158 struct task_struct *leader;
157 int zap_leader; 159 int zap_leader;
158repeat: 160repeat:
161 tracehook_prepare_release_task(p);
159 atomic_dec(&p->user->processes); 162 atomic_dec(&p->user->processes);
160 proc_flush_task(p); 163 proc_flush_task(p);
161 write_lock_irq(&tasklist_lock); 164 write_lock_irq(&tasklist_lock);
162 ptrace_unlink(p); 165 tracehook_finish_release_task(p);
163 BUG_ON(!list_empty(&p->ptrace_list) || !list_empty(&p->ptrace_children));
164 __exit_signal(p); 166 __exit_signal(p);
165 167
166 /* 168 /*
@@ -182,6 +184,13 @@ repeat:
182 * that case. 184 * that case.
183 */ 185 */
184 zap_leader = task_detached(leader); 186 zap_leader = task_detached(leader);
187
188 /*
189 * This maintains the invariant that release_task()
190 * only runs on a task in EXIT_DEAD, just for sanity.
191 */
192 if (zap_leader)
193 leader->exit_state = EXIT_DEAD;
185 } 194 }
186 195
187 write_unlock_irq(&tasklist_lock); 196 write_unlock_irq(&tasklist_lock);
@@ -314,9 +323,8 @@ static void reparent_to_kthreadd(void)
314 323
315 ptrace_unlink(current); 324 ptrace_unlink(current);
316 /* Reparent to init */ 325 /* Reparent to init */
317 remove_parent(current);
318 current->real_parent = current->parent = kthreadd_task; 326 current->real_parent = current->parent = kthreadd_task;
319 add_parent(current); 327 list_move_tail(&current->sibling, &current->real_parent->children);
320 328
321 /* Set the exit signal to SIGCHLD so we signal init on exit */ 329 /* Set the exit signal to SIGCHLD so we signal init on exit */
322 current->exit_signal = SIGCHLD; 330 current->exit_signal = SIGCHLD;
@@ -421,7 +429,7 @@ void daemonize(const char *name, ...)
421 * We don't want to have TIF_FREEZE set if the system-wide hibernation 429 * We don't want to have TIF_FREEZE set if the system-wide hibernation
422 * or suspend transition begins right now. 430 * or suspend transition begins right now.
423 */ 431 */
424 current->flags |= PF_NOFREEZE; 432 current->flags |= (PF_NOFREEZE | PF_KTHREAD);
425 433
426 if (current->nsproxy != &init_nsproxy) { 434 if (current->nsproxy != &init_nsproxy) {
427 get_nsproxy(&init_nsproxy); 435 get_nsproxy(&init_nsproxy);
@@ -546,8 +554,6 @@ void put_fs_struct(struct fs_struct *fs)
546 if (atomic_dec_and_test(&fs->count)) { 554 if (atomic_dec_and_test(&fs->count)) {
547 path_put(&fs->root); 555 path_put(&fs->root);
548 path_put(&fs->pwd); 556 path_put(&fs->pwd);
549 if (fs->altroot.dentry)
550 path_put(&fs->altroot);
551 kmem_cache_free(fs_cachep, fs); 557 kmem_cache_free(fs_cachep, fs);
552 } 558 }
553} 559}
@@ -577,8 +583,6 @@ mm_need_new_owner(struct mm_struct *mm, struct task_struct *p)
577 * If there are other users of the mm and the owner (us) is exiting 583 * If there are other users of the mm and the owner (us) is exiting
578 * we need to find a new owner to take on the responsibility. 584 * we need to find a new owner to take on the responsibility.
579 */ 585 */
580 if (!mm)
581 return 0;
582 if (atomic_read(&mm->mm_users) <= 1) 586 if (atomic_read(&mm->mm_users) <= 1)
583 return 0; 587 return 0;
584 if (mm->owner != p) 588 if (mm->owner != p)
@@ -621,6 +625,16 @@ retry:
621 } while_each_thread(g, c); 625 } while_each_thread(g, c);
622 626
623 read_unlock(&tasklist_lock); 627 read_unlock(&tasklist_lock);
628 /*
 629 * We found no owner, yet mm_users > 1: this implies that we are
630 * most likely racing with swapoff (try_to_unuse()) or /proc or
631 * ptrace or page migration (get_task_mm()). Mark owner as NULL,
632 * so that subsystems can understand the callback and take action.
633 */
634 down_write(&mm->mmap_sem);
635 cgroup_mm_owner_callbacks(mm->owner, NULL);
636 mm->owner = NULL;
637 up_write(&mm->mmap_sem);
624 return; 638 return;
625 639
626assign_new_owner: 640assign_new_owner:
@@ -655,26 +669,40 @@ assign_new_owner:
655static void exit_mm(struct task_struct * tsk) 669static void exit_mm(struct task_struct * tsk)
656{ 670{
657 struct mm_struct *mm = tsk->mm; 671 struct mm_struct *mm = tsk->mm;
672 struct core_state *core_state;
658 673
659 mm_release(tsk, mm); 674 mm_release(tsk, mm);
660 if (!mm) 675 if (!mm)
661 return; 676 return;
662 /* 677 /*
663 * Serialize with any possible pending coredump. 678 * Serialize with any possible pending coredump.
664 * We must hold mmap_sem around checking core_waiters 679 * We must hold mmap_sem around checking core_state
665 * and clearing tsk->mm. The core-inducing thread 680 * and clearing tsk->mm. The core-inducing thread
666 * will increment core_waiters for each thread in the 681 * will increment ->nr_threads for each thread in the
667 * group with ->mm != NULL. 682 * group with ->mm != NULL.
668 */ 683 */
669 down_read(&mm->mmap_sem); 684 down_read(&mm->mmap_sem);
670 if (mm->core_waiters) { 685 core_state = mm->core_state;
686 if (core_state) {
687 struct core_thread self;
671 up_read(&mm->mmap_sem); 688 up_read(&mm->mmap_sem);
672 down_write(&mm->mmap_sem);
673 if (!--mm->core_waiters)
674 complete(mm->core_startup_done);
675 up_write(&mm->mmap_sem);
676 689
677 wait_for_completion(&mm->core_done); 690 self.task = tsk;
691 self.next = xchg(&core_state->dumper.next, &self);
692 /*
693 * Implies mb(), the result of xchg() must be visible
694 * to core_state->dumper.
695 */
696 if (atomic_dec_and_test(&core_state->nr_threads))
697 complete(&core_state->startup);
698
699 for (;;) {
700 set_task_state(tsk, TASK_UNINTERRUPTIBLE);
701 if (!self.task) /* see coredump_finish() */
702 break;
703 schedule();
704 }
705 __set_task_state(tsk, TASK_RUNNING);
678 down_read(&mm->mmap_sem); 706 down_read(&mm->mmap_sem);
679 } 707 }
680 atomic_inc(&mm->mm_count); 708 atomic_inc(&mm->mm_count);
@@ -691,37 +719,97 @@ static void exit_mm(struct task_struct * tsk)
691 mmput(mm); 719 mmput(mm);
692} 720}
693 721
694static void 722/*
695reparent_thread(struct task_struct *p, struct task_struct *father, int traced) 723 * Return nonzero if @parent's children should reap themselves.
724 *
725 * Called with write_lock_irq(&tasklist_lock) held.
726 */
727static int ignoring_children(struct task_struct *parent)
696{ 728{
697 if (p->pdeath_signal) 729 int ret;
698 /* We already hold the tasklist_lock here. */ 730 struct sighand_struct *psig = parent->sighand;
699 group_send_sig_info(p->pdeath_signal, SEND_SIG_NOINFO, p); 731 unsigned long flags;
732 spin_lock_irqsave(&psig->siglock, flags);
733 ret = (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN ||
734 (psig->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDWAIT));
735 spin_unlock_irqrestore(&psig->siglock, flags);
736 return ret;
737}
700 738
701 /* Move the child from its dying parent to the new one. */ 739/*
702 if (unlikely(traced)) { 740 * Detach all tasks we were using ptrace on.
703 /* Preserve ptrace links if someone else is tracing this child. */ 741 * Any that need to be release_task'd are put on the @dead list.
704 list_del_init(&p->ptrace_list); 742 *
705 if (ptrace_reparented(p)) 743 * Called with write_lock(&tasklist_lock) held.
706 list_add(&p->ptrace_list, &p->real_parent->ptrace_children); 744 */
707 } else { 745static void ptrace_exit(struct task_struct *parent, struct list_head *dead)
708 /* If this child is being traced, then we're the one tracing it 746{
709 * anyway, so let go of it. 747 struct task_struct *p, *n;
748 int ign = -1;
749
750 list_for_each_entry_safe(p, n, &parent->ptraced, ptrace_entry) {
751 __ptrace_unlink(p);
752
753 if (p->exit_state != EXIT_ZOMBIE)
754 continue;
755
756 /*
757 * If it's a zombie, our attachedness prevented normal
758 * parent notification or self-reaping. Do notification
759 * now if it would have happened earlier. If it should
760 * reap itself, add it to the @dead list. We can't call
761 * release_task() here because we already hold tasklist_lock.
762 *
763 * If it's our own child, there is no notification to do.
764 * But if our normal children self-reap, then this child
765 * was prevented by ptrace and we must reap it now.
710 */ 766 */
711 p->ptrace = 0; 767 if (!task_detached(p) && thread_group_empty(p)) {
712 remove_parent(p); 768 if (!same_thread_group(p->real_parent, parent))
713 p->parent = p->real_parent; 769 do_notify_parent(p, p->exit_signal);
714 add_parent(p); 770 else {
771 if (ign < 0)
772 ign = ignoring_children(parent);
773 if (ign)
774 p->exit_signal = -1;
775 }
776 }
715 777
716 if (task_is_traced(p)) { 778 if (task_detached(p)) {
717 /* 779 /*
718 * If it was at a trace stop, turn it into 780 * Mark it as in the process of being reaped.
719 * a normal stop since it's no longer being
720 * traced.
721 */ 781 */
722 ptrace_untrace(p); 782 p->exit_state = EXIT_DEAD;
783 list_add(&p->ptrace_entry, dead);
723 } 784 }
724 } 785 }
786}
787
788/*
789 * Finish up exit-time ptrace cleanup.
790 *
791 * Called without locks.
792 */
793static void ptrace_exit_finish(struct task_struct *parent,
794 struct list_head *dead)
795{
796 struct task_struct *p, *n;
797
798 BUG_ON(!list_empty(&parent->ptraced));
799
800 list_for_each_entry_safe(p, n, dead, ptrace_entry) {
801 list_del_init(&p->ptrace_entry);
802 release_task(p);
803 }
804}
805
806static void reparent_thread(struct task_struct *p, struct task_struct *father)
807{
808 if (p->pdeath_signal)
809 /* We already hold the tasklist_lock here. */
810 group_send_sig_info(p->pdeath_signal, SEND_SIG_NOINFO, p);
811
812 list_move_tail(&p->sibling, &p->real_parent->children);
725 813
726 /* If this is a threaded reparent there is no need to 814 /* If this is a threaded reparent there is no need to
727 * notify anyone anything has happened. 815 * notify anyone anything has happened.
@@ -736,7 +824,8 @@ reparent_thread(struct task_struct *p, struct task_struct *father, int traced)
736 /* If we'd notified the old parent about this child's death, 824 /* If we'd notified the old parent about this child's death,
737 * also notify the new parent. 825 * also notify the new parent.
738 */ 826 */
739 if (!traced && p->exit_state == EXIT_ZOMBIE && 827 if (!ptrace_reparented(p) &&
828 p->exit_state == EXIT_ZOMBIE &&
740 !task_detached(p) && thread_group_empty(p)) 829 !task_detached(p) && thread_group_empty(p))
741 do_notify_parent(p, p->exit_signal); 830 do_notify_parent(p, p->exit_signal);
742 831
@@ -750,75 +839,63 @@ reparent_thread(struct task_struct *p, struct task_struct *father, int traced)
750 * the child reaper process (ie "init") in our pid 839 * the child reaper process (ie "init") in our pid
751 * space. 840 * space.
752 */ 841 */
753static void forget_original_parent(struct task_struct *father) 842static struct task_struct *find_new_reaper(struct task_struct *father)
754{ 843{
755 struct task_struct *p, *n, *reaper = father; 844 struct pid_namespace *pid_ns = task_active_pid_ns(father);
756 struct list_head ptrace_dead; 845 struct task_struct *thread;
757
758 INIT_LIST_HEAD(&ptrace_dead);
759
760 write_lock_irq(&tasklist_lock);
761
762 do {
763 reaper = next_thread(reaper);
764 if (reaper == father) {
765 reaper = task_child_reaper(father);
766 break;
767 }
768 } while (reaper->flags & PF_EXITING);
769 846
770 /* 847 thread = father;
771 * There are only two places where our children can be: 848 while_each_thread(father, thread) {
772 * 849 if (thread->flags & PF_EXITING)
773 * - in our child list 850 continue;
774 * - in our ptraced child list 851 if (unlikely(pid_ns->child_reaper == father))
775 * 852 pid_ns->child_reaper = thread;
776 * Search them and reparent children. 853 return thread;
777 */ 854 }
778 list_for_each_entry_safe(p, n, &father->children, sibling) {
779 int ptrace;
780
781 ptrace = p->ptrace;
782
783 /* if father isn't the real parent, then ptrace must be enabled */
784 BUG_ON(father != p->real_parent && !ptrace);
785 855
786 if (father == p->real_parent) { 856 if (unlikely(pid_ns->child_reaper == father)) {
787 /* reparent with a reaper, real father it's us */ 857 write_unlock_irq(&tasklist_lock);
788 p->real_parent = reaper; 858 if (unlikely(pid_ns == &init_pid_ns))
789 reparent_thread(p, father, 0); 859 panic("Attempted to kill init!");
790 } else {
791 /* reparent ptraced task to its real parent */
792 __ptrace_unlink (p);
793 if (p->exit_state == EXIT_ZOMBIE && !task_detached(p) &&
794 thread_group_empty(p))
795 do_notify_parent(p, p->exit_signal);
796 }
797 860
861 zap_pid_ns_processes(pid_ns);
862 write_lock_irq(&tasklist_lock);
798 /* 863 /*
799 * if the ptraced child is a detached zombie we must collect 864 * We can not clear ->child_reaper or leave it alone.
800 * it before we exit, or it will remain zombie forever since 865 * There may by stealth EXIT_DEAD tasks on ->children,
801 * we prevented it from self-reap itself while it was being 866 * forget_original_parent() must move them somewhere.
802 * traced by us, to be able to see it in wait4.
803 */ 867 */
804 if (unlikely(ptrace && p->exit_state == EXIT_ZOMBIE && task_detached(p))) 868 pid_ns->child_reaper = init_pid_ns.child_reaper;
805 list_add(&p->ptrace_list, &ptrace_dead);
806 } 869 }
807 870
808 list_for_each_entry_safe(p, n, &father->ptrace_children, ptrace_list) { 871 return pid_ns->child_reaper;
872}
873
874static void forget_original_parent(struct task_struct *father)
875{
876 struct task_struct *p, *n, *reaper;
877 LIST_HEAD(ptrace_dead);
878
879 write_lock_irq(&tasklist_lock);
880 reaper = find_new_reaper(father);
881 /*
882 * First clean up ptrace if we were using it.
883 */
884 ptrace_exit(father, &ptrace_dead);
885
886 list_for_each_entry_safe(p, n, &father->children, sibling) {
809 p->real_parent = reaper; 887 p->real_parent = reaper;
810 reparent_thread(p, father, 1); 888 if (p->parent == father) {
889 BUG_ON(p->ptrace);
890 p->parent = p->real_parent;
891 }
892 reparent_thread(p, father);
811 } 893 }
812 894
813 write_unlock_irq(&tasklist_lock); 895 write_unlock_irq(&tasklist_lock);
814 BUG_ON(!list_empty(&father->children)); 896 BUG_ON(!list_empty(&father->children));
815 BUG_ON(!list_empty(&father->ptrace_children));
816
817 list_for_each_entry_safe(p, n, &ptrace_dead, ptrace_list) {
818 list_del_init(&p->ptrace_list);
819 release_task(p);
820 }
821 897
898 ptrace_exit_finish(father, &ptrace_dead);
822} 899}
823 900
824/* 901/*
@@ -827,7 +904,8 @@ static void forget_original_parent(struct task_struct *father)
827 */ 904 */
828static void exit_notify(struct task_struct *tsk, int group_dead) 905static void exit_notify(struct task_struct *tsk, int group_dead)
829{ 906{
830 int state; 907 int signal;
908 void *cookie;
831 909
832 /* 910 /*
833 * This does two things: 911 * This does two things:
@@ -864,33 +942,24 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
864 !capable(CAP_KILL)) 942 !capable(CAP_KILL))
865 tsk->exit_signal = SIGCHLD; 943 tsk->exit_signal = SIGCHLD;
866 944
867 /* If something other than our normal parent is ptracing us, then 945 signal = tracehook_notify_death(tsk, &cookie, group_dead);
868 * send it a SIGCHLD instead of honoring exit_signal. exit_signal 946 if (signal >= 0)
869 * only has special meaning to our real parent. 947 signal = do_notify_parent(tsk, signal);
870 */
871 if (!task_detached(tsk) && thread_group_empty(tsk)) {
872 int signal = ptrace_reparented(tsk) ?
873 SIGCHLD : tsk->exit_signal;
874 do_notify_parent(tsk, signal);
875 } else if (tsk->ptrace) {
876 do_notify_parent(tsk, SIGCHLD);
877 }
878 948
879 state = EXIT_ZOMBIE; 949 tsk->exit_state = signal == DEATH_REAP ? EXIT_DEAD : EXIT_ZOMBIE;
880 if (task_detached(tsk) && likely(!tsk->ptrace))
881 state = EXIT_DEAD;
882 tsk->exit_state = state;
883 950
884 /* mt-exec, de_thread() is waiting for us */ 951 /* mt-exec, de_thread() is waiting for us */
885 if (thread_group_leader(tsk) && 952 if (thread_group_leader(tsk) &&
886 tsk->signal->notify_count < 0 && 953 tsk->signal->group_exit_task &&
887 tsk->signal->group_exit_task) 954 tsk->signal->notify_count < 0)
888 wake_up_process(tsk->signal->group_exit_task); 955 wake_up_process(tsk->signal->group_exit_task);
889 956
890 write_unlock_irq(&tasklist_lock); 957 write_unlock_irq(&tasklist_lock);
891 958
959 tracehook_report_death(tsk, signal, cookie, group_dead);
960
892 /* If the process is dead, release it - nobody will wait for it */ 961 /* If the process is dead, release it - nobody will wait for it */
893 if (state == EXIT_DEAD) 962 if (signal == DEATH_REAP)
894 release_task(tsk); 963 release_task(tsk);
895} 964}
896 965
@@ -919,39 +988,6 @@ static void check_stack_usage(void)
919static inline void check_stack_usage(void) {} 988static inline void check_stack_usage(void) {}
920#endif 989#endif
921 990
922static inline void exit_child_reaper(struct task_struct *tsk)
923{
924 if (likely(tsk->group_leader != task_child_reaper(tsk)))
925 return;
926
927 if (tsk->nsproxy->pid_ns == &init_pid_ns)
928 panic("Attempted to kill init!");
929
930 /*
931 * @tsk is the last thread in the 'cgroup-init' and is exiting.
932 * Terminate all remaining processes in the namespace and reap them
933 * before exiting @tsk.
934 *
935 * Note that @tsk (last thread of cgroup-init) may not necessarily
936 * be the child-reaper (i.e main thread of cgroup-init) of the
937 * namespace i.e the child_reaper may have already exited.
938 *
939 * Even after a child_reaper exits, we let it inherit orphaned children,
940 * because, pid_ns->child_reaper remains valid as long as there is
941 * at least one living sub-thread in the cgroup init.
942
943 * This living sub-thread of the cgroup-init will be notified when
944 * a child inherited by the 'child-reaper' exits (do_notify_parent()
945 * uses __group_send_sig_info()). Further, when reaping child processes,
946 * do_wait() iterates over children of all living sub threads.
947
948 * i.e even though 'child_reaper' thread is listed as the parent of the
949 * orphaned children, any living sub-thread in the cgroup-init can
950 * perform the role of the child_reaper.
951 */
952 zap_pid_ns_processes(tsk->nsproxy->pid_ns);
953}
954
955NORET_TYPE void do_exit(long code) 991NORET_TYPE void do_exit(long code)
956{ 992{
957 struct task_struct *tsk = current; 993 struct task_struct *tsk = current;
@@ -966,10 +1002,7 @@ NORET_TYPE void do_exit(long code)
966 if (unlikely(!tsk->pid)) 1002 if (unlikely(!tsk->pid))
967 panic("Attempted to kill the idle task!"); 1003 panic("Attempted to kill the idle task!");
968 1004
969 if (unlikely(current->ptrace & PT_TRACE_EXIT)) { 1005 tracehook_report_exit(&code);
970 current->ptrace_message = code;
971 ptrace_notify((PTRACE_EVENT_EXIT << 8) | SIGTRAP);
972 }
973 1006
974 /* 1007 /*
975 * We're taking recursive faults here in do_exit. Safest is to just 1008 * We're taking recursive faults here in do_exit. Safest is to just
@@ -1014,7 +1047,6 @@ NORET_TYPE void do_exit(long code)
1014 } 1047 }
1015 group_dead = atomic_dec_and_test(&tsk->signal->live); 1048 group_dead = atomic_dec_and_test(&tsk->signal->live);
1016 if (group_dead) { 1049 if (group_dead) {
1017 exit_child_reaper(tsk);
1018 hrtimer_cancel(&tsk->signal->real_timer); 1050 hrtimer_cancel(&tsk->signal->real_timer);
1019 exit_itimers(tsk->signal); 1051 exit_itimers(tsk->signal);
1020 } 1052 }
@@ -1176,13 +1208,6 @@ static int eligible_child(enum pid_type type, struct pid *pid, int options,
1176 return 0; 1208 return 0;
1177 } 1209 }
1178 1210
1179 /*
1180 * Do not consider detached threads that are
1181 * not ptraced:
1182 */
1183 if (task_detached(p) && !p->ptrace)
1184 return 0;
1185
1186 /* Wait for all children (clone and not) if __WALL is set; 1211 /* Wait for all children (clone and not) if __WALL is set;
1187 * otherwise, wait for clone children *only* if __WCLONE is 1212 * otherwise, wait for clone children *only* if __WCLONE is
1188 * set; otherwise, wait for non-clone children *only*. (Note: 1213 * set; otherwise, wait for non-clone children *only*. (Note:
@@ -1193,14 +1218,10 @@ static int eligible_child(enum pid_type type, struct pid *pid, int options,
1193 return 0; 1218 return 0;
1194 1219
1195 err = security_task_wait(p); 1220 err = security_task_wait(p);
1196 if (likely(!err)) 1221 if (err)
1197 return 1; 1222 return err;
1198 1223
1199 if (type != PIDTYPE_PID) 1224 return 1;
1200 return 0;
1201 /* This child was explicitly requested, abort */
1202 read_unlock(&tasklist_lock);
1203 return err;
1204} 1225}
1205 1226
1206static int wait_noreap_copyout(struct task_struct *p, pid_t pid, uid_t uid, 1227static int wait_noreap_copyout(struct task_struct *p, pid_t pid, uid_t uid,
@@ -1234,7 +1255,7 @@ static int wait_noreap_copyout(struct task_struct *p, pid_t pid, uid_t uid,
1234 * the lock and this task is uninteresting. If we return nonzero, we have 1255 * the lock and this task is uninteresting. If we return nonzero, we have
1235 * released the lock and the system call should return. 1256 * released the lock and the system call should return.
1236 */ 1257 */
1237static int wait_task_zombie(struct task_struct *p, int noreap, 1258static int wait_task_zombie(struct task_struct *p, int options,
1238 struct siginfo __user *infop, 1259 struct siginfo __user *infop,
1239 int __user *stat_addr, struct rusage __user *ru) 1260 int __user *stat_addr, struct rusage __user *ru)
1240{ 1261{
@@ -1242,7 +1263,10 @@ static int wait_task_zombie(struct task_struct *p, int noreap,
1242 int retval, status, traced; 1263 int retval, status, traced;
1243 pid_t pid = task_pid_vnr(p); 1264 pid_t pid = task_pid_vnr(p);
1244 1265
1245 if (unlikely(noreap)) { 1266 if (!likely(options & WEXITED))
1267 return 0;
1268
1269 if (unlikely(options & WNOWAIT)) {
1246 uid_t uid = p->uid; 1270 uid_t uid = p->uid;
1247 int exit_code = p->exit_code; 1271 int exit_code = p->exit_code;
1248 int why, status; 1272 int why, status;
@@ -1323,6 +1347,8 @@ static int wait_task_zombie(struct task_struct *p, int noreap,
1323 psig->coublock += 1347 psig->coublock +=
1324 task_io_get_oublock(p) + 1348 task_io_get_oublock(p) +
1325 sig->oublock + sig->coublock; 1349 sig->oublock + sig->coublock;
1350 task_io_accounting_add(&psig->ioac, &p->ioac);
1351 task_io_accounting_add(&psig->ioac, &sig->ioac);
1326 spin_unlock_irq(&p->parent->sighand->siglock); 1352 spin_unlock_irq(&p->parent->sighand->siglock);
1327 } 1353 }
1328 1354
@@ -1392,21 +1418,24 @@ static int wait_task_zombie(struct task_struct *p, int noreap,
1392 * the lock and this task is uninteresting. If we return nonzero, we have 1418 * the lock and this task is uninteresting. If we return nonzero, we have
1393 * released the lock and the system call should return. 1419 * released the lock and the system call should return.
1394 */ 1420 */
1395static int wait_task_stopped(struct task_struct *p, 1421static int wait_task_stopped(int ptrace, struct task_struct *p,
1396 int noreap, struct siginfo __user *infop, 1422 int options, struct siginfo __user *infop,
1397 int __user *stat_addr, struct rusage __user *ru) 1423 int __user *stat_addr, struct rusage __user *ru)
1398{ 1424{
1399 int retval, exit_code, why; 1425 int retval, exit_code, why;
1400 uid_t uid = 0; /* unneeded, required by compiler */ 1426 uid_t uid = 0; /* unneeded, required by compiler */
1401 pid_t pid; 1427 pid_t pid;
1402 1428
1429 if (!(options & WUNTRACED))
1430 return 0;
1431
1403 exit_code = 0; 1432 exit_code = 0;
1404 spin_lock_irq(&p->sighand->siglock); 1433 spin_lock_irq(&p->sighand->siglock);
1405 1434
1406 if (unlikely(!task_is_stopped_or_traced(p))) 1435 if (unlikely(!task_is_stopped_or_traced(p)))
1407 goto unlock_sig; 1436 goto unlock_sig;
1408 1437
1409 if (!(p->ptrace & PT_PTRACED) && p->signal->group_stop_count > 0) 1438 if (!ptrace && p->signal->group_stop_count > 0)
1410 /* 1439 /*
1411 * A group stop is in progress and this is the group leader. 1440 * A group stop is in progress and this is the group leader.
1412 * We won't report until all threads have stopped. 1441 * We won't report until all threads have stopped.
@@ -1417,7 +1446,7 @@ static int wait_task_stopped(struct task_struct *p,
1417 if (!exit_code) 1446 if (!exit_code)
1418 goto unlock_sig; 1447 goto unlock_sig;
1419 1448
1420 if (!noreap) 1449 if (!unlikely(options & WNOWAIT))
1421 p->exit_code = 0; 1450 p->exit_code = 0;
1422 1451
1423 uid = p->uid; 1452 uid = p->uid;
@@ -1435,10 +1464,10 @@ unlock_sig:
1435 */ 1464 */
1436 get_task_struct(p); 1465 get_task_struct(p);
1437 pid = task_pid_vnr(p); 1466 pid = task_pid_vnr(p);
1438 why = (p->ptrace & PT_PTRACED) ? CLD_TRAPPED : CLD_STOPPED; 1467 why = ptrace ? CLD_TRAPPED : CLD_STOPPED;
1439 read_unlock(&tasklist_lock); 1468 read_unlock(&tasklist_lock);
1440 1469
1441 if (unlikely(noreap)) 1470 if (unlikely(options & WNOWAIT))
1442 return wait_noreap_copyout(p, pid, uid, 1471 return wait_noreap_copyout(p, pid, uid,
1443 why, exit_code, 1472 why, exit_code,
1444 infop, ru); 1473 infop, ru);
@@ -1472,7 +1501,7 @@ unlock_sig:
1472 * the lock and this task is uninteresting. If we return nonzero, we have 1501 * the lock and this task is uninteresting. If we return nonzero, we have
1473 * released the lock and the system call should return. 1502 * released the lock and the system call should return.
1474 */ 1503 */
1475static int wait_task_continued(struct task_struct *p, int noreap, 1504static int wait_task_continued(struct task_struct *p, int options,
1476 struct siginfo __user *infop, 1505 struct siginfo __user *infop,
1477 int __user *stat_addr, struct rusage __user *ru) 1506 int __user *stat_addr, struct rusage __user *ru)
1478{ 1507{
@@ -1480,6 +1509,9 @@ static int wait_task_continued(struct task_struct *p, int noreap,
1480 pid_t pid; 1509 pid_t pid;
1481 uid_t uid; 1510 uid_t uid;
1482 1511
1512 if (!unlikely(options & WCONTINUED))
1513 return 0;
1514
1483 if (!(p->signal->flags & SIGNAL_STOP_CONTINUED)) 1515 if (!(p->signal->flags & SIGNAL_STOP_CONTINUED))
1484 return 0; 1516 return 0;
1485 1517
@@ -1489,7 +1521,7 @@ static int wait_task_continued(struct task_struct *p, int noreap,
1489 spin_unlock_irq(&p->sighand->siglock); 1521 spin_unlock_irq(&p->sighand->siglock);
1490 return 0; 1522 return 0;
1491 } 1523 }
1492 if (!noreap) 1524 if (!unlikely(options & WNOWAIT))
1493 p->signal->flags &= ~SIGNAL_STOP_CONTINUED; 1525 p->signal->flags &= ~SIGNAL_STOP_CONTINUED;
1494 spin_unlock_irq(&p->sighand->siglock); 1526 spin_unlock_irq(&p->sighand->siglock);
1495 1527
@@ -1515,89 +1547,161 @@ static int wait_task_continued(struct task_struct *p, int noreap,
1515 return retval; 1547 return retval;
1516} 1548}
1517 1549
1550/*
1551 * Consider @p for a wait by @parent.
1552 *
1553 * -ECHILD should be in *@notask_error before the first call.
1554 * Returns nonzero for a final return, when we have unlocked tasklist_lock.
1555 * Returns zero if the search for a child should continue;
1556 * then *@notask_error is 0 if @p is an eligible child,
1557 * or another error from security_task_wait(), or still -ECHILD.
1558 */
1559static int wait_consider_task(struct task_struct *parent, int ptrace,
1560 struct task_struct *p, int *notask_error,
1561 enum pid_type type, struct pid *pid, int options,
1562 struct siginfo __user *infop,
1563 int __user *stat_addr, struct rusage __user *ru)
1564{
1565 int ret = eligible_child(type, pid, options, p);
1566 if (!ret)
1567 return ret;
1568
1569 if (unlikely(ret < 0)) {
1570 /*
1571 * If we have not yet seen any eligible child,
1572 * then let this error code replace -ECHILD.
1573 * A permission error will give the user a clue
1574 * to look for security policy problems, rather
1575 * than for mysterious wait bugs.
1576 */
1577 if (*notask_error)
1578 *notask_error = ret;
1579 }
1580
1581 if (likely(!ptrace) && unlikely(p->ptrace)) {
1582 /*
1583 * This child is hidden by ptrace.
1584 * We aren't allowed to see it now, but eventually we will.
1585 */
1586 *notask_error = 0;
1587 return 0;
1588 }
1589
1590 if (p->exit_state == EXIT_DEAD)
1591 return 0;
1592
1593 /*
1594 * We don't reap group leaders with subthreads.
1595 */
1596 if (p->exit_state == EXIT_ZOMBIE && !delay_group_leader(p))
1597 return wait_task_zombie(p, options, infop, stat_addr, ru);
1598
1599 /*
1600 * It's stopped or running now, so it might
1601 * later continue, exit, or stop again.
1602 */
1603 *notask_error = 0;
1604
1605 if (task_is_stopped_or_traced(p))
1606 return wait_task_stopped(ptrace, p, options,
1607 infop, stat_addr, ru);
1608
1609 return wait_task_continued(p, options, infop, stat_addr, ru);
1610}
1611
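
wait_consider_task() and the helpers above implement the semantics of the waitid() options (WEXITED, WUNTRACED, WCONTINUED, WNOWAIT) now passed down as 'options'. A small userspace example of the WNOWAIT behaviour served by the wait_task_zombie() no-reap branch; illustrative only.

/* Hypothetical example: peek at an exited child with WNOWAIT, then reap it. */
#include <stdio.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	siginfo_t info;
	pid_t pid = fork();

	if (pid == 0)
		_exit(7);	/* child exits immediately */

	/* WNOWAIT leaves the child a zombie; it can be waited for again. */
	waitid(P_PID, pid, &info, WEXITED | WNOWAIT);
	printf("peeked: pid=%d status=%d\n", info.si_pid, info.si_status);

	/* Without WNOWAIT the child is actually reaped this time. */
	waitid(P_PID, pid, &info, WEXITED);
	return 0;
}
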
1612/*
1613 * Do the work of do_wait() for one thread in the group, @tsk.
1614 *
1615 * -ECHILD should be in *@notask_error before the first call.
1616 * Returns nonzero for a final return, when we have unlocked tasklist_lock.
1617 * Returns zero if the search for a child should continue; then
1618 * *@notask_error is 0 if there were any eligible children,
1619 * or another error from security_task_wait(), or still -ECHILD.
1620 */
1621static int do_wait_thread(struct task_struct *tsk, int *notask_error,
1622 enum pid_type type, struct pid *pid, int options,
1623 struct siginfo __user *infop, int __user *stat_addr,
1624 struct rusage __user *ru)
1625{
1626 struct task_struct *p;
1627
1628 list_for_each_entry(p, &tsk->children, sibling) {
1629 /*
1630 * Do not consider detached threads.
1631 */
1632 if (!task_detached(p)) {
1633 int ret = wait_consider_task(tsk, 0, p, notask_error,
1634 type, pid, options,
1635 infop, stat_addr, ru);
1636 if (ret)
1637 return ret;
1638 }
1639 }
1640
1641 return 0;
1642}
1643
1644static int ptrace_do_wait(struct task_struct *tsk, int *notask_error,
1645 enum pid_type type, struct pid *pid, int options,
1646 struct siginfo __user *infop, int __user *stat_addr,
1647 struct rusage __user *ru)
1648{
1649 struct task_struct *p;
1650
1651 /*
1652 * Traditionally we see ptrace'd stopped tasks regardless of options.
1653 */
1654 options |= WUNTRACED;
1655
1656 list_for_each_entry(p, &tsk->ptraced, ptrace_entry) {
1657 int ret = wait_consider_task(tsk, 1, p, notask_error,
1658 type, pid, options,
1659 infop, stat_addr, ru);
1660 if (ret)
1661 return ret;
1662 }
1663
1664 return 0;
1665}
1666
1518static long do_wait(enum pid_type type, struct pid *pid, int options, 1667static long do_wait(enum pid_type type, struct pid *pid, int options,
1519 struct siginfo __user *infop, int __user *stat_addr, 1668 struct siginfo __user *infop, int __user *stat_addr,
1520 struct rusage __user *ru) 1669 struct rusage __user *ru)
1521{ 1670{
1522 DECLARE_WAITQUEUE(wait, current); 1671 DECLARE_WAITQUEUE(wait, current);
1523 struct task_struct *tsk; 1672 struct task_struct *tsk;
1524 int flag, retval; 1673 int retval;
1525 1674
1526 add_wait_queue(&current->signal->wait_chldexit,&wait); 1675 add_wait_queue(&current->signal->wait_chldexit,&wait);
1527repeat: 1676repeat:
1528 /* If there is nothing that can match our critier just get out */ 1677 /*
 1678	 * If there is nothing that can match our criteria just get out.
1679 * We will clear @retval to zero if we see any child that might later
1680 * match our criteria, even if we are not able to reap it yet.
1681 */
1529 retval = -ECHILD; 1682 retval = -ECHILD;
1530 if ((type < PIDTYPE_MAX) && (!pid || hlist_empty(&pid->tasks[type]))) 1683 if ((type < PIDTYPE_MAX) && (!pid || hlist_empty(&pid->tasks[type])))
1531 goto end; 1684 goto end;
1532 1685
1533 /*
1534 * We will set this flag if we see any child that might later
1535 * match our criteria, even if we are not able to reap it yet.
1536 */
1537 flag = retval = 0;
1538 current->state = TASK_INTERRUPTIBLE; 1686 current->state = TASK_INTERRUPTIBLE;
1539 read_lock(&tasklist_lock); 1687 read_lock(&tasklist_lock);
1540 tsk = current; 1688 tsk = current;
1541 do { 1689 do {
1542 struct task_struct *p; 1690 int tsk_result = do_wait_thread(tsk, &retval,
1543 1691 type, pid, options,
1544 list_for_each_entry(p, &tsk->children, sibling) { 1692 infop, stat_addr, ru);
1545 int ret = eligible_child(type, pid, options, p); 1693 if (!tsk_result)
1546 if (!ret) 1694 tsk_result = ptrace_do_wait(tsk, &retval,
1547 continue; 1695 type, pid, options,
1548 1696 infop, stat_addr, ru);
1549 if (unlikely(ret < 0)) { 1697 if (tsk_result) {
1550 retval = ret; 1698 /*
1551 } else if (task_is_stopped_or_traced(p)) { 1699 * tasklist_lock is unlocked and we have a final result.
1552 /* 1700 */
1553 * It's stopped now, so it might later 1701 retval = tsk_result;
1554 * continue, exit, or stop again. 1702 goto end;
1555 */
1556 flag = 1;
1557 if (!(p->ptrace & PT_PTRACED) &&
1558 !(options & WUNTRACED))
1559 continue;
1560
1561 retval = wait_task_stopped(p,
1562 (options & WNOWAIT), infop,
1563 stat_addr, ru);
1564 } else if (p->exit_state == EXIT_ZOMBIE &&
1565 !delay_group_leader(p)) {
1566 /*
1567 * We don't reap group leaders with subthreads.
1568 */
1569 if (!likely(options & WEXITED))
1570 continue;
1571 retval = wait_task_zombie(p,
1572 (options & WNOWAIT), infop,
1573 stat_addr, ru);
1574 } else if (p->exit_state != EXIT_DEAD) {
1575 /*
1576 * It's running now, so it might later
1577 * exit, stop, or stop and then continue.
1578 */
1579 flag = 1;
1580 if (!unlikely(options & WCONTINUED))
1581 continue;
1582 retval = wait_task_continued(p,
1583 (options & WNOWAIT), infop,
1584 stat_addr, ru);
1585 }
1586 if (retval != 0) /* tasklist_lock released */
1587 goto end;
1588 }
1589 if (!flag) {
1590 list_for_each_entry(p, &tsk->ptrace_children,
1591 ptrace_list) {
1592 flag = eligible_child(type, pid, options, p);
1593 if (!flag)
1594 continue;
1595 if (likely(flag > 0))
1596 break;
1597 retval = flag;
1598 goto end;
1599 }
1600 } 1703 }
1704
1601 if (options & __WNOTHREAD) 1705 if (options & __WNOTHREAD)
1602 break; 1706 break;
1603 tsk = next_thread(tsk); 1707 tsk = next_thread(tsk);
@@ -1605,16 +1709,14 @@ repeat:
1605 } while (tsk != current); 1709 } while (tsk != current);
1606 read_unlock(&tasklist_lock); 1710 read_unlock(&tasklist_lock);
1607 1711
1608 if (flag) { 1712 if (!retval && !(options & WNOHANG)) {
1609 if (options & WNOHANG)
1610 goto end;
1611 retval = -ERESTARTSYS; 1713 retval = -ERESTARTSYS;
1612 if (signal_pending(current)) 1714 if (!signal_pending(current)) {
1613 goto end; 1715 schedule();
1614 schedule(); 1716 goto repeat;
1615 goto repeat; 1717 }
1616 } 1718 }
1617 retval = -ECHILD; 1719
1618end: 1720end:
1619 current->state = TASK_RUNNING; 1721 current->state = TASK_RUNNING;
1620 remove_wait_queue(&current->signal->wait_chldexit,&wait); 1722 remove_wait_queue(&current->signal->wait_chldexit,&wait);
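Editor's note: the reworked wait path above now tests the options word (WNOWAIT, WEXITED, WCONTINUED) directly instead of a separate noreap flag. The effect is easiest to see from userspace; a minimal sketch, assuming a previously forked child and using the standard waitid() interface (not part of this patch):

	#include <sys/types.h>
	#include <sys/wait.h>
	#include <stdio.h>

	int peek_then_reap(pid_t child)
	{
		siginfo_t info;

		/* WNOWAIT: report the child's state but do not reap it,
		 * which is why the kernel keeps SIGNAL_STOP_CONTINUED set. */
		if (waitid(P_PID, child, &info, WEXITED | WNOWAIT) < 0)
			return -1;
		printf("child %d exited with %d (still reapable)\n",
		       info.si_pid, info.si_status);

		/* Second call without WNOWAIT actually reaps the child. */
		return waitid(P_PID, child, &info, WEXITED);
	}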
diff --git a/kernel/fork.c b/kernel/fork.c
index d428336e7aa1..99c5c655b098 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -23,18 +23,22 @@
23#include <linux/sem.h> 23#include <linux/sem.h>
24#include <linux/file.h> 24#include <linux/file.h>
25#include <linux/fdtable.h> 25#include <linux/fdtable.h>
26#include <linux/iocontext.h>
26#include <linux/key.h> 27#include <linux/key.h>
27#include <linux/binfmts.h> 28#include <linux/binfmts.h>
28#include <linux/mman.h> 29#include <linux/mman.h>
30#include <linux/mmu_notifier.h>
29#include <linux/fs.h> 31#include <linux/fs.h>
30#include <linux/nsproxy.h> 32#include <linux/nsproxy.h>
31#include <linux/capability.h> 33#include <linux/capability.h>
32#include <linux/cpu.h> 34#include <linux/cpu.h>
33#include <linux/cgroup.h> 35#include <linux/cgroup.h>
34#include <linux/security.h> 36#include <linux/security.h>
37#include <linux/hugetlb.h>
35#include <linux/swap.h> 38#include <linux/swap.h>
36#include <linux/syscalls.h> 39#include <linux/syscalls.h>
37#include <linux/jiffies.h> 40#include <linux/jiffies.h>
41#include <linux/tracehook.h>
38#include <linux/futex.h> 42#include <linux/futex.h>
39#include <linux/task_io_accounting_ops.h> 43#include <linux/task_io_accounting_ops.h>
40#include <linux/rcupdate.h> 44#include <linux/rcupdate.h>
@@ -92,6 +96,23 @@ int nr_processes(void)
92static struct kmem_cache *task_struct_cachep; 96static struct kmem_cache *task_struct_cachep;
93#endif 97#endif
94 98
99#ifndef __HAVE_ARCH_THREAD_INFO_ALLOCATOR
100static inline struct thread_info *alloc_thread_info(struct task_struct *tsk)
101{
102#ifdef CONFIG_DEBUG_STACK_USAGE
103 gfp_t mask = GFP_KERNEL | __GFP_ZERO;
104#else
105 gfp_t mask = GFP_KERNEL;
106#endif
107 return (struct thread_info *)__get_free_pages(mask, THREAD_SIZE_ORDER);
108}
109
110static inline void free_thread_info(struct thread_info *ti)
111{
112 free_pages((unsigned long)ti, THREAD_SIZE_ORDER);
113}
114#endif
115
95/* SLAB cache for signal_struct structures (tsk->signal) */ 116/* SLAB cache for signal_struct structures (tsk->signal) */
96static struct kmem_cache *signal_cachep; 117static struct kmem_cache *signal_cachep;
97 118
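Editor's note: the alloc_thread_info()/free_thread_info() pair added above is only built when an architecture does not define __HAVE_ARCH_THREAD_INFO_ALLOCATOR. A hedged sketch of what an arch override might look like (the thread_info_cache name is illustrative, not taken from this patch); a slab-backed variant would be:

	#define __HAVE_ARCH_THREAD_INFO_ALLOCATOR

	extern struct kmem_cache *thread_info_cache;

	static inline struct thread_info *alloc_thread_info(struct task_struct *tsk)
	{
		/* same contract as the generic helper, different backing store */
		return kmem_cache_alloc(thread_info_cache, GFP_KERNEL);
	}

	static inline void free_thread_info(struct thread_info *ti)
	{
		kmem_cache_free(thread_info_cache, ti);
	}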
@@ -311,6 +332,14 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
311 } 332 }
312 333
313 /* 334 /*
335 * Clear hugetlb-related page reserves for children. This only
336 * affects MAP_PRIVATE mappings. Faults generated by the child
337 * are not guaranteed to succeed, even if read-only
338 */
339 if (is_vm_hugetlb_page(tmp))
340 reset_vma_resv_huge_pages(tmp);
341
342 /*
314 * Link in the new vma and copy the page table entries. 343 * Link in the new vma and copy the page table entries.
315 */ 344 */
316 *pprev = tmp; 345 *pprev = tmp;
@@ -378,7 +407,7 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
378 INIT_LIST_HEAD(&mm->mmlist); 407 INIT_LIST_HEAD(&mm->mmlist);
379 mm->flags = (current->mm) ? current->mm->flags 408 mm->flags = (current->mm) ? current->mm->flags
380 : MMF_DUMP_FILTER_DEFAULT; 409 : MMF_DUMP_FILTER_DEFAULT;
381 mm->core_waiters = 0; 410 mm->core_state = NULL;
382 mm->nr_ptes = 0; 411 mm->nr_ptes = 0;
383 set_mm_counter(mm, file_rss, 0); 412 set_mm_counter(mm, file_rss, 0);
384 set_mm_counter(mm, anon_rss, 0); 413 set_mm_counter(mm, anon_rss, 0);
@@ -391,6 +420,7 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
391 420
392 if (likely(!mm_alloc_pgd(mm))) { 421 if (likely(!mm_alloc_pgd(mm))) {
393 mm->def_flags = 0; 422 mm->def_flags = 0;
423 mmu_notifier_mm_init(mm);
394 return mm; 424 return mm;
395 } 425 }
396 426
@@ -423,6 +453,7 @@ void __mmdrop(struct mm_struct *mm)
423 BUG_ON(mm == &init_mm); 453 BUG_ON(mm == &init_mm);
424 mm_free_pgd(mm); 454 mm_free_pgd(mm);
425 destroy_context(mm); 455 destroy_context(mm);
456 mmu_notifier_mm_destroy(mm);
426 free_mm(mm); 457 free_mm(mm);
427} 458}
428EXPORT_SYMBOL_GPL(__mmdrop); 459EXPORT_SYMBOL_GPL(__mmdrop);
@@ -452,7 +483,7 @@ EXPORT_SYMBOL_GPL(mmput);
452/** 483/**
453 * get_task_mm - acquire a reference to the task's mm 484 * get_task_mm - acquire a reference to the task's mm
454 * 485 *
455 * Returns %NULL if the task has no mm. Checks PF_BORROWED_MM (meaning 486 * Returns %NULL if the task has no mm. Checks PF_KTHREAD (meaning
456 * this kernel workthread has transiently adopted a user mm with use_mm, 487 * this kernel workthread has transiently adopted a user mm with use_mm,
457 * to do its AIO) is not set and if so returns a reference to it, after 488 * to do its AIO) is not set and if so returns a reference to it, after
458 * bumping up the use count. User must release the mm via mmput() 489 * bumping up the use count. User must release the mm via mmput()
@@ -465,7 +496,7 @@ struct mm_struct *get_task_mm(struct task_struct *task)
465 task_lock(task); 496 task_lock(task);
466 mm = task->mm; 497 mm = task->mm;
467 if (mm) { 498 if (mm) {
468 if (task->flags & PF_BORROWED_MM) 499 if (task->flags & PF_KTHREAD)
469 mm = NULL; 500 mm = NULL;
470 else 501 else
471 atomic_inc(&mm->mm_users); 502 atomic_inc(&mm->mm_users);
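Editor's note: the PF_BORROWED_MM to PF_KTHREAD switch above does not change how callers use the helper. A minimal usage sketch, assuming the caller already holds a task reference (the rss helper itself is illustrative):

	static unsigned long demo_task_rss(struct task_struct *task)
	{
		struct mm_struct *mm = get_task_mm(task);
		unsigned long rss = 0;

		if (mm) {
			/* NULL means no mm, or a kernel thread (PF_KTHREAD) */
			rss = get_mm_counter(mm, file_rss) +
			      get_mm_counter(mm, anon_rss);
			mmput(mm);	/* drop the mm_users reference we took */
		}
		return rss;
	}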
@@ -634,13 +665,6 @@ static struct fs_struct *__copy_fs_struct(struct fs_struct *old)
634 path_get(&old->root); 665 path_get(&old->root);
635 fs->pwd = old->pwd; 666 fs->pwd = old->pwd;
636 path_get(&old->pwd); 667 path_get(&old->pwd);
637 if (old->altroot.dentry) {
638 fs->altroot = old->altroot;
639 path_get(&old->altroot);
640 } else {
641 fs->altroot.mnt = NULL;
642 fs->altroot.dentry = NULL;
643 }
644 read_unlock(&old->lock); 668 read_unlock(&old->lock);
645 } 669 }
646 return fs; 670 return fs;
@@ -783,6 +807,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
783 807
784 sig->leader = 0; /* session leadership doesn't inherit */ 808 sig->leader = 0; /* session leadership doesn't inherit */
785 sig->tty_old_pgrp = NULL; 809 sig->tty_old_pgrp = NULL;
810 sig->tty = NULL;
786 811
787 sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero; 812 sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero;
788 sig->gtime = cputime_zero; 813 sig->gtime = cputime_zero;
@@ -790,6 +815,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
790 sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0; 815 sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0;
791 sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0; 816 sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0;
792 sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0; 817 sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0;
818 task_io_accounting_init(&sig->ioac);
793 sig->sum_sched_runtime = 0; 819 sig->sum_sched_runtime = 0;
794 INIT_LIST_HEAD(&sig->cpu_timers[0]); 820 INIT_LIST_HEAD(&sig->cpu_timers[0]);
795 INIT_LIST_HEAD(&sig->cpu_timers[1]); 821 INIT_LIST_HEAD(&sig->cpu_timers[1]);
@@ -818,6 +844,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
818void __cleanup_signal(struct signal_struct *sig) 844void __cleanup_signal(struct signal_struct *sig)
819{ 845{
820 exit_thread_group_keys(sig); 846 exit_thread_group_keys(sig);
847 tty_kref_put(sig->tty);
821 kmem_cache_free(signal_cachep, sig); 848 kmem_cache_free(signal_cachep, sig);
822} 849}
823 850
@@ -837,8 +864,7 @@ static void copy_flags(unsigned long clone_flags, struct task_struct *p)
837 864
838 new_flags &= ~PF_SUPERPRIV; 865 new_flags &= ~PF_SUPERPRIV;
839 new_flags |= PF_FORKNOEXEC; 866 new_flags |= PF_FORKNOEXEC;
840 if (!(clone_flags & CLONE_PTRACE)) 867 new_flags |= PF_STARTING;
841 p->ptrace = 0;
842 p->flags = new_flags; 868 p->flags = new_flags;
843 clear_freeze_flag(p); 869 clear_freeze_flag(p);
844} 870}
@@ -879,7 +905,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
879 struct pt_regs *regs, 905 struct pt_regs *regs,
880 unsigned long stack_size, 906 unsigned long stack_size,
881 int __user *child_tidptr, 907 int __user *child_tidptr,
882 struct pid *pid) 908 struct pid *pid,
909 int trace)
883{ 910{
884 int retval; 911 int retval;
885 struct task_struct *p; 912 struct task_struct *p;
@@ -914,7 +941,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
914 941
915 rt_mutex_init_task(p); 942 rt_mutex_init_task(p);
916 943
917#ifdef CONFIG_TRACE_IRQFLAGS 944#ifdef CONFIG_PROVE_LOCKING
918 DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled); 945 DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled);
919 DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled); 946 DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled);
920#endif 947#endif
@@ -972,13 +999,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
972 p->last_switch_timestamp = 0; 999 p->last_switch_timestamp = 0;
973#endif 1000#endif
974 1001
975#ifdef CONFIG_TASK_XACCT 1002 task_io_accounting_init(&p->ioac);
976 p->rchar = 0; /* I/O counter: bytes read */
977 p->wchar = 0; /* I/O counter: bytes written */
978 p->syscr = 0; /* I/O counter: read syscalls */
979 p->syscw = 0; /* I/O counter: write syscalls */
980#endif
981 task_io_accounting_init(p);
982 acct_clear_integrals(p); 1003 acct_clear_integrals(p);
983 1004
984 p->it_virt_expires = cputime_zero; 1005 p->it_virt_expires = cputime_zero;
@@ -1085,6 +1106,12 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1085 if (clone_flags & CLONE_THREAD) 1106 if (clone_flags & CLONE_THREAD)
1086 p->tgid = current->tgid; 1107 p->tgid = current->tgid;
1087 1108
1109 if (current->nsproxy != p->nsproxy) {
1110 retval = ns_cgroup_clone(p, pid);
1111 if (retval)
1112 goto bad_fork_free_pid;
1113 }
1114
1088 p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL; 1115 p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
1089 /* 1116 /*
1090 * Clear TID on mm_release()? 1117 * Clear TID on mm_release()?
@@ -1129,8 +1156,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1129 */ 1156 */
1130 p->group_leader = p; 1157 p->group_leader = p;
1131 INIT_LIST_HEAD(&p->thread_group); 1158 INIT_LIST_HEAD(&p->thread_group);
1132 INIT_LIST_HEAD(&p->ptrace_children);
1133 INIT_LIST_HEAD(&p->ptrace_list);
1134 1159
1135 /* Now that the task is set up, run cgroup callbacks if 1160 /* Now that the task is set up, run cgroup callbacks if
1136 * necessary. We need to run them before the task is visible 1161 * necessary. We need to run them before the task is visible
@@ -1161,7 +1186,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1161 p->real_parent = current->real_parent; 1186 p->real_parent = current->real_parent;
1162 else 1187 else
1163 p->real_parent = current; 1188 p->real_parent = current;
1164 p->parent = p->real_parent;
1165 1189
1166 spin_lock(&current->sighand->siglock); 1190 spin_lock(&current->sighand->siglock);
1167 1191
@@ -1202,16 +1226,16 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1202 } 1226 }
1203 1227
1204 if (likely(p->pid)) { 1228 if (likely(p->pid)) {
1205 add_parent(p); 1229 list_add_tail(&p->sibling, &p->real_parent->children);
1206 if (unlikely(p->ptrace & PT_PTRACED)) 1230 tracehook_finish_clone(p, clone_flags, trace);
1207 __ptrace_link(p, current->parent);
1208 1231
1209 if (thread_group_leader(p)) { 1232 if (thread_group_leader(p)) {
1210 if (clone_flags & CLONE_NEWPID) 1233 if (clone_flags & CLONE_NEWPID)
1211 p->nsproxy->pid_ns->child_reaper = p; 1234 p->nsproxy->pid_ns->child_reaper = p;
1212 1235
1213 p->signal->leader_pid = pid; 1236 p->signal->leader_pid = pid;
1214 p->signal->tty = current->signal->tty; 1237 tty_kref_put(p->signal->tty);
1238 p->signal->tty = tty_kref_get(current->signal->tty);
1215 set_task_pgrp(p, task_pgrp_nr(current)); 1239 set_task_pgrp(p, task_pgrp_nr(current));
1216 set_task_session(p, task_session_nr(current)); 1240 set_task_session(p, task_session_nr(current));
1217 attach_pid(p, PIDTYPE_PGID, task_pgrp(current)); 1241 attach_pid(p, PIDTYPE_PGID, task_pgrp(current));
@@ -1289,29 +1313,13 @@ struct task_struct * __cpuinit fork_idle(int cpu)
1289 struct pt_regs regs; 1313 struct pt_regs regs;
1290 1314
1291 task = copy_process(CLONE_VM, 0, idle_regs(&regs), 0, NULL, 1315 task = copy_process(CLONE_VM, 0, idle_regs(&regs), 0, NULL,
1292 &init_struct_pid); 1316 &init_struct_pid, 0);
1293 if (!IS_ERR(task)) 1317 if (!IS_ERR(task))
1294 init_idle(task, cpu); 1318 init_idle(task, cpu);
1295 1319
1296 return task; 1320 return task;
1297} 1321}
1298 1322
1299static int fork_traceflag(unsigned clone_flags)
1300{
1301 if (clone_flags & CLONE_UNTRACED)
1302 return 0;
1303 else if (clone_flags & CLONE_VFORK) {
1304 if (current->ptrace & PT_TRACE_VFORK)
1305 return PTRACE_EVENT_VFORK;
1306 } else if ((clone_flags & CSIGNAL) != SIGCHLD) {
1307 if (current->ptrace & PT_TRACE_CLONE)
1308 return PTRACE_EVENT_CLONE;
1309 } else if (current->ptrace & PT_TRACE_FORK)
1310 return PTRACE_EVENT_FORK;
1311
1312 return 0;
1313}
1314
1315/* 1323/*
1316 * Ok, this is the main fork-routine. 1324 * Ok, this is the main fork-routine.
1317 * 1325 *
@@ -1346,14 +1354,14 @@ long do_fork(unsigned long clone_flags,
1346 } 1354 }
1347 } 1355 }
1348 1356
1349 if (unlikely(current->ptrace)) { 1357 /*
1350 trace = fork_traceflag (clone_flags); 1358 * When called from kernel_thread, don't do user tracing stuff.
1351 if (trace) 1359 */
1352 clone_flags |= CLONE_PTRACE; 1360 if (likely(user_mode(regs)))
1353 } 1361 trace = tracehook_prepare_clone(clone_flags);
1354 1362
1355 p = copy_process(clone_flags, stack_start, regs, stack_size, 1363 p = copy_process(clone_flags, stack_start, regs, stack_size,
1356 child_tidptr, NULL); 1364 child_tidptr, NULL, trace);
1357 /* 1365 /*
1358 * Do this prior waking up the new thread - the thread pointer 1366 * Do this prior waking up the new thread - the thread pointer
1359 * might get invalid after that point, if the thread exits quickly. 1367 * might get invalid after that point, if the thread exits quickly.
@@ -1371,32 +1379,35 @@ long do_fork(unsigned long clone_flags,
1371 init_completion(&vfork); 1379 init_completion(&vfork);
1372 } 1380 }
1373 1381
1374 if ((p->ptrace & PT_PTRACED) || (clone_flags & CLONE_STOPPED)) { 1382 tracehook_report_clone(trace, regs, clone_flags, nr, p);
1383
1384 /*
1385 * We set PF_STARTING at creation in case tracing wants to
1386 * use this to distinguish a fully live task from one that
1387 * hasn't gotten to tracehook_report_clone() yet. Now we
1388 * clear it and set the child going.
1389 */
1390 p->flags &= ~PF_STARTING;
1391
1392 if (unlikely(clone_flags & CLONE_STOPPED)) {
1375 /* 1393 /*
1376 * We'll start up with an immediate SIGSTOP. 1394 * We'll start up with an immediate SIGSTOP.
1377 */ 1395 */
1378 sigaddset(&p->pending.signal, SIGSTOP); 1396 sigaddset(&p->pending.signal, SIGSTOP);
1379 set_tsk_thread_flag(p, TIF_SIGPENDING); 1397 set_tsk_thread_flag(p, TIF_SIGPENDING);
1380 }
1381
1382 if (!(clone_flags & CLONE_STOPPED))
1383 wake_up_new_task(p, clone_flags);
1384 else
1385 __set_task_state(p, TASK_STOPPED); 1398 __set_task_state(p, TASK_STOPPED);
1386 1399 } else {
1387 if (unlikely (trace)) { 1400 wake_up_new_task(p, clone_flags);
1388 current->ptrace_message = nr;
1389 ptrace_notify ((trace << 8) | SIGTRAP);
1390 } 1401 }
1391 1402
1403 tracehook_report_clone_complete(trace, regs,
1404 clone_flags, nr, p);
1405
1392 if (clone_flags & CLONE_VFORK) { 1406 if (clone_flags & CLONE_VFORK) {
1393 freezer_do_not_count(); 1407 freezer_do_not_count();
1394 wait_for_completion(&vfork); 1408 wait_for_completion(&vfork);
1395 freezer_count(); 1409 freezer_count();
1396 if (unlikely (current->ptrace & PT_TRACE_VFORK_DONE)) { 1410 tracehook_report_vfork_done(p, nr);
1397 current->ptrace_message = nr;
1398 ptrace_notify ((PTRACE_EVENT_VFORK_DONE << 8) | SIGTRAP);
1399 }
1400 } 1411 }
1401 } else { 1412 } else {
1402 nr = PTR_ERR(p); 1413 nr = PTR_ERR(p);
@@ -1408,7 +1419,7 @@ long do_fork(unsigned long clone_flags,
1408#define ARCH_MIN_MMSTRUCT_ALIGN 0 1419#define ARCH_MIN_MMSTRUCT_ALIGN 0
1409#endif 1420#endif
1410 1421
1411static void sighand_ctor(struct kmem_cache *cachep, void *data) 1422static void sighand_ctor(void *data)
1412{ 1423{
1413 struct sighand_struct *sighand = data; 1424 struct sighand_struct *sighand = data;
1414 1425
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 421be5fe5cc7..cdec83e722fa 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -300,11 +300,10 @@ EXPORT_SYMBOL_GPL(ktime_sub_ns);
300 */ 300 */
301u64 ktime_divns(const ktime_t kt, s64 div) 301u64 ktime_divns(const ktime_t kt, s64 div)
302{ 302{
303 u64 dclc, inc, dns; 303 u64 dclc;
304 int sft = 0; 304 int sft = 0;
305 305
306 dclc = dns = ktime_to_ns(kt); 306 dclc = ktime_to_ns(kt);
307 inc = div;
308 /* Make sure the divisor is less than 2^32: */ 307 /* Make sure the divisor is less than 2^32: */
309 while (div >> 32) { 308 while (div >> 32) {
310 sft++; 309 sft++;
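Editor's note: the dropped inc/dns locals above were simply unused; the division strategy is unchanged. A standalone sketch of that strategy (plain C demo; the kernel's do_div() is replaced by a native 64-bit divide):

	/* Shift the divisor below 2^32 so a 64/32-bit divide suffices, and
	 * shift the dividend by the same amount to keep the quotient right --
	 * the same trick ktime_divns() uses around do_div(). */
	static unsigned long long div_by_wide(unsigned long long ns, long long div)
	{
		int sft = 0;

		while (div >> 32) {
			sft++;
			div >>= 1;
		}
		ns >>= sft;
		return ns / (unsigned long long)div;
	}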
@@ -623,7 +622,7 @@ static void retrigger_next_event(void *arg)
623void clock_was_set(void) 622void clock_was_set(void)
624{ 623{
625 /* Retrigger the CPU local events everywhere */ 624 /* Retrigger the CPU local events everywhere */
626 on_each_cpu(retrigger_next_event, NULL, 0, 1); 625 on_each_cpu(retrigger_next_event, NULL, 1);
627} 626}
628 627
629/* 628/*
@@ -632,8 +631,6 @@ void clock_was_set(void)
632 */ 631 */
633void hres_timers_resume(void) 632void hres_timers_resume(void)
634{ 633{
635 WARN_ON_ONCE(num_online_cpus() > 1);
636
637 /* Retrigger the CPU local events: */ 634 /* Retrigger the CPU local events: */
638 retrigger_next_event(NULL); 635 retrigger_next_event(NULL);
639} 636}
@@ -675,13 +672,14 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
675 */ 672 */
676 BUG_ON(timer->function(timer) != HRTIMER_NORESTART); 673 BUG_ON(timer->function(timer) != HRTIMER_NORESTART);
677 return 1; 674 return 1;
678 case HRTIMER_CB_IRQSAFE_NO_SOFTIRQ: 675 case HRTIMER_CB_IRQSAFE_PERCPU:
676 case HRTIMER_CB_IRQSAFE_UNLOCKED:
679 /* 677 /*
680 * This is solely for the sched tick emulation with 678 * This is solely for the sched tick emulation with
681 * dynamic tick support to ensure that we do not 679 * dynamic tick support to ensure that we do not
682 * restart the tick right on the edge and end up with 680 * restart the tick right on the edge and end up with
683 * the tick timer in the softirq ! The calling site 681 * the tick timer in the softirq ! The calling site
684 * takes care of this. 682 * takes care of this. Also used for hrtimer sleeper !
685 */ 683 */
686 debug_hrtimer_deactivate(timer); 684 debug_hrtimer_deactivate(timer);
687 return 1; 685 return 1;
@@ -1003,10 +1001,18 @@ hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
1003 */ 1001 */
1004 raise = timer->state == HRTIMER_STATE_PENDING; 1002 raise = timer->state == HRTIMER_STATE_PENDING;
1005 1003
1004 /*
1005 * We use preempt_disable to prevent this task from migrating after
 1006	 * setting up the softirq and raising it. Otherwise, if we migrate
1007 * we will raise the softirq on the wrong CPU.
1008 */
1009 preempt_disable();
1010
1006 unlock_hrtimer_base(timer, &flags); 1011 unlock_hrtimer_base(timer, &flags);
1007 1012
1008 if (raise) 1013 if (raise)
1009 hrtimer_raise_softirq(); 1014 hrtimer_raise_softirq();
1015 preempt_enable();
1010 1016
1011 return ret; 1017 return ret;
1012} 1018}
@@ -1078,7 +1084,7 @@ ktime_t hrtimer_get_remaining(const struct hrtimer *timer)
1078} 1084}
1079EXPORT_SYMBOL_GPL(hrtimer_get_remaining); 1085EXPORT_SYMBOL_GPL(hrtimer_get_remaining);
1080 1086
1081#if defined(CONFIG_NO_IDLE_HZ) || defined(CONFIG_NO_HZ) 1087#ifdef CONFIG_NO_HZ
1082/** 1088/**
1083 * hrtimer_get_next_event - get the time until next expiry event 1089 * hrtimer_get_next_event - get the time until next expiry event
1084 * 1090 *
@@ -1240,7 +1246,8 @@ static void __run_hrtimer(struct hrtimer *timer)
1240 timer_stats_account_hrtimer(timer); 1246 timer_stats_account_hrtimer(timer);
1241 1247
1242 fn = timer->function; 1248 fn = timer->function;
1243 if (timer->cb_mode == HRTIMER_CB_IRQSAFE_NO_SOFTIRQ) { 1249 if (timer->cb_mode == HRTIMER_CB_IRQSAFE_PERCPU ||
1250 timer->cb_mode == HRTIMER_CB_IRQSAFE_UNLOCKED) {
1244 /* 1251 /*
1245 * Used for scheduler timers, avoid lock inversion with 1252 * Used for scheduler timers, avoid lock inversion with
1246 * rq->lock and tasklist_lock. 1253 * rq->lock and tasklist_lock.
@@ -1447,7 +1454,7 @@ void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task)
1447 sl->timer.function = hrtimer_wakeup; 1454 sl->timer.function = hrtimer_wakeup;
1448 sl->task = task; 1455 sl->task = task;
1449#ifdef CONFIG_HIGH_RES_TIMERS 1456#ifdef CONFIG_HIGH_RES_TIMERS
1450 sl->timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; 1457 sl->timer.cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED;
1451#endif 1458#endif
1452} 1459}
1453 1460
@@ -1586,29 +1593,95 @@ static void __cpuinit init_hrtimers_cpu(int cpu)
1586 1593
1587#ifdef CONFIG_HOTPLUG_CPU 1594#ifdef CONFIG_HOTPLUG_CPU
1588 1595
1589static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, 1596static int migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
1590 struct hrtimer_clock_base *new_base) 1597 struct hrtimer_clock_base *new_base, int dcpu)
1591{ 1598{
1592 struct hrtimer *timer; 1599 struct hrtimer *timer;
1593 struct rb_node *node; 1600 struct rb_node *node;
1601 int raise = 0;
1594 1602
1595 while ((node = rb_first(&old_base->active))) { 1603 while ((node = rb_first(&old_base->active))) {
1596 timer = rb_entry(node, struct hrtimer, node); 1604 timer = rb_entry(node, struct hrtimer, node);
1597 BUG_ON(hrtimer_callback_running(timer)); 1605 BUG_ON(hrtimer_callback_running(timer));
1598 debug_hrtimer_deactivate(timer); 1606 debug_hrtimer_deactivate(timer);
1599 __remove_hrtimer(timer, old_base, HRTIMER_STATE_INACTIVE, 0); 1607
1608 /*
1609 * Should not happen. Per CPU timers should be
1610 * canceled _before_ the migration code is called
1611 */
1612 if (timer->cb_mode == HRTIMER_CB_IRQSAFE_PERCPU) {
1613 __remove_hrtimer(timer, old_base,
1614 HRTIMER_STATE_INACTIVE, 0);
1615 WARN(1, "hrtimer (%p %p)active but cpu %d dead\n",
1616 timer, timer->function, dcpu);
1617 continue;
1618 }
1619
1620 /*
1621 * Mark it as STATE_MIGRATE not INACTIVE otherwise the
1622 * timer could be seen as !active and just vanish away
1623 * under us on another CPU
1624 */
1625 __remove_hrtimer(timer, old_base, HRTIMER_STATE_MIGRATE, 0);
1600 timer->base = new_base; 1626 timer->base = new_base;
1601 /* 1627 /*
1602 * Enqueue the timer. Allow reprogramming of the event device 1628 * Enqueue the timer. Allow reprogramming of the event device
1603 */ 1629 */
1604 enqueue_hrtimer(timer, new_base, 1); 1630 enqueue_hrtimer(timer, new_base, 1);
1631
1632#ifdef CONFIG_HIGH_RES_TIMERS
1633 /*
1634 * Happens with high res enabled when the timer was
1635 * already expired and the callback mode is
1636 * HRTIMER_CB_IRQSAFE_UNLOCKED (hrtimer_sleeper). The
1637 * enqueue code does not move them to the soft irq
1638 * pending list for performance/latency reasons, but
1639 * in the migration state, we need to do that
1640 * otherwise we end up with a stale timer.
1641 */
1642 if (timer->state == HRTIMER_STATE_MIGRATE) {
1643 timer->state = HRTIMER_STATE_PENDING;
1644 list_add_tail(&timer->cb_entry,
1645 &new_base->cpu_base->cb_pending);
1646 raise = 1;
1647 }
1648#endif
1649 /* Clear the migration state bit */
1650 timer->state &= ~HRTIMER_STATE_MIGRATE;
1605 } 1651 }
1652 return raise;
1606} 1653}
1607 1654
1655#ifdef CONFIG_HIGH_RES_TIMERS
1656static int migrate_hrtimer_pending(struct hrtimer_cpu_base *old_base,
1657 struct hrtimer_cpu_base *new_base)
1658{
1659 struct hrtimer *timer;
1660 int raise = 0;
1661
1662 while (!list_empty(&old_base->cb_pending)) {
1663 timer = list_entry(old_base->cb_pending.next,
1664 struct hrtimer, cb_entry);
1665
1666 __remove_hrtimer(timer, timer->base, HRTIMER_STATE_PENDING, 0);
1667 timer->base = &new_base->clock_base[timer->base->index];
1668 list_add_tail(&timer->cb_entry, &new_base->cb_pending);
1669 raise = 1;
1670 }
1671 return raise;
1672}
1673#else
1674static int migrate_hrtimer_pending(struct hrtimer_cpu_base *old_base,
1675 struct hrtimer_cpu_base *new_base)
1676{
1677 return 0;
1678}
1679#endif
1680
1608static void migrate_hrtimers(int cpu) 1681static void migrate_hrtimers(int cpu)
1609{ 1682{
1610 struct hrtimer_cpu_base *old_base, *new_base; 1683 struct hrtimer_cpu_base *old_base, *new_base;
1611 int i; 1684 int i, raise = 0;
1612 1685
1613 BUG_ON(cpu_online(cpu)); 1686 BUG_ON(cpu_online(cpu));
1614 old_base = &per_cpu(hrtimer_bases, cpu); 1687 old_base = &per_cpu(hrtimer_bases, cpu);
@@ -1621,14 +1694,21 @@ static void migrate_hrtimers(int cpu)
1621 spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); 1694 spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
1622 1695
1623 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { 1696 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
1624 migrate_hrtimer_list(&old_base->clock_base[i], 1697 if (migrate_hrtimer_list(&old_base->clock_base[i],
1625 &new_base->clock_base[i]); 1698 &new_base->clock_base[i], cpu))
1699 raise = 1;
1626 } 1700 }
1627 1701
1702 if (migrate_hrtimer_pending(old_base, new_base))
1703 raise = 1;
1704
1628 spin_unlock(&old_base->lock); 1705 spin_unlock(&old_base->lock);
1629 spin_unlock(&new_base->lock); 1706 spin_unlock(&new_base->lock);
1630 local_irq_enable(); 1707 local_irq_enable();
1631 put_cpu_var(hrtimer_bases); 1708 put_cpu_var(hrtimer_bases);
1709
1710 if (raise)
1711 hrtimer_raise_softirq();
1632} 1712}
1633#endif /* CONFIG_HOTPLUG_CPU */ 1713#endif /* CONFIG_HOTPLUG_CPU */
1634 1714
@@ -1669,7 +1749,7 @@ void __init hrtimers_init(void)
1669 (void *)(long)smp_processor_id()); 1749 (void *)(long)smp_processor_id());
1670 register_cpu_notifier(&hrtimers_nb); 1750 register_cpu_notifier(&hrtimers_nb);
1671#ifdef CONFIG_HIGH_RES_TIMERS 1751#ifdef CONFIG_HIGH_RES_TIMERS
1672 open_softirq(HRTIMER_SOFTIRQ, run_hrtimer_softirq, NULL); 1752 open_softirq(HRTIMER_SOFTIRQ, run_hrtimer_softirq);
1673#endif 1753#endif
1674} 1754}
1675 1755
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 964964baefa2..3cd441ebf5d2 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -28,8 +28,7 @@ void dynamic_irq_init(unsigned int irq)
28 unsigned long flags; 28 unsigned long flags;
29 29
30 if (irq >= NR_IRQS) { 30 if (irq >= NR_IRQS) {
31 printk(KERN_ERR "Trying to initialize invalid IRQ%d\n", irq); 31 WARN(1, KERN_ERR "Trying to initialize invalid IRQ%d\n", irq);
32 WARN_ON(1);
33 return; 32 return;
34 } 33 }
35 34
@@ -62,8 +61,7 @@ void dynamic_irq_cleanup(unsigned int irq)
62 unsigned long flags; 61 unsigned long flags;
63 62
64 if (irq >= NR_IRQS) { 63 if (irq >= NR_IRQS) {
65 printk(KERN_ERR "Trying to cleanup invalid IRQ%d\n", irq); 64 WARN(1, KERN_ERR "Trying to cleanup invalid IRQ%d\n", irq);
66 WARN_ON(1);
67 return; 65 return;
68 } 66 }
69 67
@@ -71,9 +69,8 @@ void dynamic_irq_cleanup(unsigned int irq)
71 spin_lock_irqsave(&desc->lock, flags); 69 spin_lock_irqsave(&desc->lock, flags);
72 if (desc->action) { 70 if (desc->action) {
73 spin_unlock_irqrestore(&desc->lock, flags); 71 spin_unlock_irqrestore(&desc->lock, flags);
74 printk(KERN_ERR "Destroying IRQ%d without calling free_irq\n", 72 WARN(1, KERN_ERR "Destroying IRQ%d without calling free_irq\n",
75 irq); 73 irq);
76 WARN_ON(1);
77 return; 74 return;
78 } 75 }
79 desc->msi_desc = NULL; 76 desc->msi_desc = NULL;
@@ -96,8 +93,7 @@ int set_irq_chip(unsigned int irq, struct irq_chip *chip)
96 unsigned long flags; 93 unsigned long flags;
97 94
98 if (irq >= NR_IRQS) { 95 if (irq >= NR_IRQS) {
99 printk(KERN_ERR "Trying to install chip for IRQ%d\n", irq); 96 WARN(1, KERN_ERR "Trying to install chip for IRQ%d\n", irq);
100 WARN_ON(1);
101 return -EINVAL; 97 return -EINVAL;
102 } 98 }
103 99
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 46d6611a33bb..60c49e324390 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -17,6 +17,8 @@
17 17
18#ifdef CONFIG_SMP 18#ifdef CONFIG_SMP
19 19
20cpumask_t irq_default_affinity = CPU_MASK_ALL;
21
20/** 22/**
21 * synchronize_irq - wait for pending IRQ handlers (on other CPUs) 23 * synchronize_irq - wait for pending IRQ handlers (on other CPUs)
22 * @irq: interrupt number to wait for 24 * @irq: interrupt number to wait for
@@ -87,7 +89,14 @@ int irq_set_affinity(unsigned int irq, cpumask_t cpumask)
87 set_balance_irq_affinity(irq, cpumask); 89 set_balance_irq_affinity(irq, cpumask);
88 90
89#ifdef CONFIG_GENERIC_PENDING_IRQ 91#ifdef CONFIG_GENERIC_PENDING_IRQ
90 set_pending_irq(irq, cpumask); 92 if (desc->status & IRQ_MOVE_PCNTXT) {
93 unsigned long flags;
94
95 spin_lock_irqsave(&desc->lock, flags);
96 desc->chip->set_affinity(irq, cpumask);
97 spin_unlock_irqrestore(&desc->lock, flags);
98 } else
99 set_pending_irq(irq, cpumask);
91#else 100#else
92 desc->affinity = cpumask; 101 desc->affinity = cpumask;
93 desc->chip->set_affinity(irq, cpumask); 102 desc->chip->set_affinity(irq, cpumask);
@@ -95,6 +104,27 @@ int irq_set_affinity(unsigned int irq, cpumask_t cpumask)
95 return 0; 104 return 0;
96} 105}
97 106
107#ifndef CONFIG_AUTO_IRQ_AFFINITY
108/*
109 * Generic version of the affinity autoselector.
110 */
111int irq_select_affinity(unsigned int irq)
112{
113 cpumask_t mask;
114
115 if (!irq_can_set_affinity(irq))
116 return 0;
117
118 cpus_and(mask, cpu_online_map, irq_default_affinity);
119
120 irq_desc[irq].affinity = mask;
121 irq_desc[irq].chip->set_affinity(irq, mask);
122
123 set_balance_irq_affinity(irq, mask);
124 return 0;
125}
126#endif
127
98#endif 128#endif
99 129
100/** 130/**
@@ -154,8 +184,7 @@ static void __enable_irq(struct irq_desc *desc, unsigned int irq)
154{ 184{
155 switch (desc->depth) { 185 switch (desc->depth) {
156 case 0: 186 case 0:
157 printk(KERN_WARNING "Unbalanced enable for IRQ %d\n", irq); 187 WARN(1, KERN_WARNING "Unbalanced enable for IRQ %d\n", irq);
158 WARN_ON(1);
159 break; 188 break;
160 case 1: { 189 case 1: {
161 unsigned int status = desc->status & ~IRQ_DISABLED; 190 unsigned int status = desc->status & ~IRQ_DISABLED;
@@ -194,6 +223,17 @@ void enable_irq(unsigned int irq)
194} 223}
195EXPORT_SYMBOL(enable_irq); 224EXPORT_SYMBOL(enable_irq);
196 225
226int set_irq_wake_real(unsigned int irq, unsigned int on)
227{
228 struct irq_desc *desc = irq_desc + irq;
229 int ret = -ENXIO;
230
231 if (desc->chip->set_wake)
232 ret = desc->chip->set_wake(irq, on);
233
234 return ret;
235}
236
197/** 237/**
198 * set_irq_wake - control irq power management wakeup 238 * set_irq_wake - control irq power management wakeup
199 * @irq: interrupt to control 239 * @irq: interrupt to control
@@ -210,30 +250,32 @@ int set_irq_wake(unsigned int irq, unsigned int on)
210{ 250{
211 struct irq_desc *desc = irq_desc + irq; 251 struct irq_desc *desc = irq_desc + irq;
212 unsigned long flags; 252 unsigned long flags;
213 int ret = -ENXIO; 253 int ret = 0;
214 int (*set_wake)(unsigned, unsigned) = desc->chip->set_wake;
215 254
216 /* wakeup-capable irqs can be shared between drivers that 255 /* wakeup-capable irqs can be shared between drivers that
217 * don't need to have the same sleep mode behaviors. 256 * don't need to have the same sleep mode behaviors.
218 */ 257 */
219 spin_lock_irqsave(&desc->lock, flags); 258 spin_lock_irqsave(&desc->lock, flags);
220 if (on) { 259 if (on) {
221 if (desc->wake_depth++ == 0) 260 if (desc->wake_depth++ == 0) {
222 desc->status |= IRQ_WAKEUP; 261 ret = set_irq_wake_real(irq, on);
223 else 262 if (ret)
224 set_wake = NULL; 263 desc->wake_depth = 0;
264 else
265 desc->status |= IRQ_WAKEUP;
266 }
225 } else { 267 } else {
226 if (desc->wake_depth == 0) { 268 if (desc->wake_depth == 0) {
227 printk(KERN_WARNING "Unbalanced IRQ %d " 269 WARN(1, "Unbalanced IRQ %d wake disable\n", irq);
228 "wake disable\n", irq); 270 } else if (--desc->wake_depth == 0) {
229 WARN_ON(1); 271 ret = set_irq_wake_real(irq, on);
230 } else if (--desc->wake_depth == 0) 272 if (ret)
231 desc->status &= ~IRQ_WAKEUP; 273 desc->wake_depth = 1;
232 else 274 else
233 set_wake = NULL; 275 desc->status &= ~IRQ_WAKEUP;
276 }
234 } 277 }
235 if (set_wake) 278
236 ret = desc->chip->set_wake(irq, on);
237 spin_unlock_irqrestore(&desc->lock, flags); 279 spin_unlock_irqrestore(&desc->lock, flags);
238 return ret; 280 return ret;
239} 281}
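Editor's note: with the rework above, chip->set_wake() is only called on the 0->1 and 1->0 transitions of wake_depth, and a failure rolls the depth back. A hedged driver-side sketch (the IRQ number and the suspend/resume hooks are made up) using the enable_irq_wake()/disable_irq_wake() wrappers:

	#include <linux/interrupt.h>

	static int demo_suspend(void)
	{
		/* fails cleanly (-ENXIO) if the chip provides no set_wake hook */
		return enable_irq_wake(42);
	}

	static int demo_resume(void)
	{
		/* must stay balanced with the enable above */
		return disable_irq_wake(42);
	}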
@@ -270,6 +312,31 @@ void compat_irq_chip_set_default_handler(struct irq_desc *desc)
270 desc->handle_irq = NULL; 312 desc->handle_irq = NULL;
271} 313}
272 314
315static int __irq_set_trigger(struct irq_chip *chip, unsigned int irq,
316 unsigned long flags)
317{
318 int ret;
319
320 if (!chip || !chip->set_type) {
321 /*
322 * IRQF_TRIGGER_* but the PIC does not support multiple
323 * flow-types?
324 */
325 pr_warning("No set_type function for IRQ %d (%s)\n", irq,
326 chip ? (chip->name ? : "unknown") : "unknown");
327 return 0;
328 }
329
330 ret = chip->set_type(irq, flags & IRQF_TRIGGER_MASK);
331
332 if (ret)
333 pr_err("setting trigger mode %d for irq %u failed (%pF)\n",
334 (int)(flags & IRQF_TRIGGER_MASK),
335 irq, chip->set_type);
336
337 return ret;
338}
339
273/* 340/*
274 * Internal function to register an irqaction - typically used to 341 * Internal function to register an irqaction - typically used to
275 * allocate special interrupts that are part of the architecture. 342 * allocate special interrupts that are part of the architecture.
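Editor's note: with __irq_set_trigger() factored out, a request that passes IRQF_TRIGGER_* flags now fails if the chip rejects the flow type instead of only logging a warning. A hedged caller-side sketch (handler, IRQ number and name are illustrative):

	#include <linux/interrupt.h>

	static irqreturn_t demo_isr(int irq, void *dev_id)
	{
		return IRQ_HANDLED;
	}

	static int demo_setup(void)
	{
		int err = request_irq(42, demo_isr, IRQF_TRIGGER_RISING,
				      "demo-device", NULL);
		if (err)
			return err;	/* includes chip->set_type() refusing the mode */
		return 0;
	}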
@@ -281,6 +348,7 @@ int setup_irq(unsigned int irq, struct irqaction *new)
281 const char *old_name = NULL; 348 const char *old_name = NULL;
282 unsigned long flags; 349 unsigned long flags;
283 int shared = 0; 350 int shared = 0;
351 int ret;
284 352
285 if (irq >= NR_IRQS) 353 if (irq >= NR_IRQS)
286 return -EINVAL; 354 return -EINVAL;
@@ -338,36 +406,23 @@ int setup_irq(unsigned int irq, struct irqaction *new)
338 shared = 1; 406 shared = 1;
339 } 407 }
340 408
341 *p = new;
342
343 /* Exclude IRQ from balancing */
344 if (new->flags & IRQF_NOBALANCING)
345 desc->status |= IRQ_NO_BALANCING;
346
347 if (!shared) { 409 if (!shared) {
348 irq_chip_set_defaults(desc->chip); 410 irq_chip_set_defaults(desc->chip);
349 411
350#if defined(CONFIG_IRQ_PER_CPU)
351 if (new->flags & IRQF_PERCPU)
352 desc->status |= IRQ_PER_CPU;
353#endif
354
355 /* Setup the type (level, edge polarity) if configured: */ 412 /* Setup the type (level, edge polarity) if configured: */
356 if (new->flags & IRQF_TRIGGER_MASK) { 413 if (new->flags & IRQF_TRIGGER_MASK) {
357 if (desc->chip && desc->chip->set_type) 414 ret = __irq_set_trigger(desc->chip, irq, new->flags);
358 desc->chip->set_type(irq, 415
359 new->flags & IRQF_TRIGGER_MASK); 416 if (ret) {
360 else 417 spin_unlock_irqrestore(&desc->lock, flags);
361 /* 418 return ret;
362 * IRQF_TRIGGER_* but the PIC does not support 419 }
363 * multiple flow-types?
364 */
365 printk(KERN_WARNING "No IRQF_TRIGGER set_type "
366 "function for IRQ %d (%s)\n", irq,
367 desc->chip ? desc->chip->name :
368 "unknown");
369 } else 420 } else
370 compat_irq_chip_set_default_handler(desc); 421 compat_irq_chip_set_default_handler(desc);
422#if defined(CONFIG_IRQ_PER_CPU)
423 if (new->flags & IRQF_PERCPU)
424 desc->status |= IRQ_PER_CPU;
425#endif
371 426
372 desc->status &= ~(IRQ_AUTODETECT | IRQ_WAITING | 427 desc->status &= ~(IRQ_AUTODETECT | IRQ_WAITING |
373 IRQ_INPROGRESS | IRQ_SPURIOUS_DISABLED); 428 IRQ_INPROGRESS | IRQ_SPURIOUS_DISABLED);
@@ -382,7 +437,17 @@ int setup_irq(unsigned int irq, struct irqaction *new)
382 } else 437 } else
383 /* Undo nested disables: */ 438 /* Undo nested disables: */
384 desc->depth = 1; 439 desc->depth = 1;
440
441 /* Set default affinity mask once everything is setup */
442 irq_select_affinity(irq);
385 } 443 }
444
445 *p = new;
446
447 /* Exclude IRQ from balancing */
448 if (new->flags & IRQF_NOBALANCING)
449 desc->status |= IRQ_NO_BALANCING;
450
386 /* Reset broken irq detection when installing new handler */ 451 /* Reset broken irq detection when installing new handler */
387 desc->irq_count = 0; 452 desc->irq_count = 0;
388 desc->irqs_unhandled = 0; 453 desc->irqs_unhandled = 0;
@@ -571,8 +636,6 @@ int request_irq(unsigned int irq, irq_handler_t handler,
571 action->next = NULL; 636 action->next = NULL;
572 action->dev_id = dev_id; 637 action->dev_id = dev_id;
573 638
574 select_smp_affinity(irq);
575
576#ifdef CONFIG_DEBUG_SHIRQ 639#ifdef CONFIG_DEBUG_SHIRQ
577 if (irqflags & IRQF_SHARED) { 640 if (irqflags & IRQF_SHARED) {
578 /* 641 /*
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index c2f2ccb0549a..a09dd29c2fd7 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -8,6 +8,7 @@
8 8
9#include <linux/irq.h> 9#include <linux/irq.h>
10#include <linux/proc_fs.h> 10#include <linux/proc_fs.h>
11#include <linux/seq_file.h>
11#include <linux/interrupt.h> 12#include <linux/interrupt.h>
12 13
13#include "internals.h" 14#include "internals.h"
@@ -16,23 +17,18 @@ static struct proc_dir_entry *root_irq_dir;
16 17
17#ifdef CONFIG_SMP 18#ifdef CONFIG_SMP
18 19
19static int irq_affinity_read_proc(char *page, char **start, off_t off, 20static int irq_affinity_proc_show(struct seq_file *m, void *v)
20 int count, int *eof, void *data)
21{ 21{
22 struct irq_desc *desc = irq_desc + (long)data; 22 struct irq_desc *desc = irq_desc + (long)m->private;
23 cpumask_t *mask = &desc->affinity; 23 cpumask_t *mask = &desc->affinity;
24 int len;
25 24
26#ifdef CONFIG_GENERIC_PENDING_IRQ 25#ifdef CONFIG_GENERIC_PENDING_IRQ
27 if (desc->status & IRQ_MOVE_PENDING) 26 if (desc->status & IRQ_MOVE_PENDING)
28 mask = &desc->pending_mask; 27 mask = &desc->pending_mask;
29#endif 28#endif
30 len = cpumask_scnprintf(page, count, *mask); 29 seq_cpumask(m, mask);
31 30 seq_putc(m, '\n');
32 if (count - len < 2) 31 return 0;
33 return -EINVAL;
34 len += sprintf(page + len, "\n");
35 return len;
36} 32}
37 33
38#ifndef is_affinity_mask_valid 34#ifndef is_affinity_mask_valid
@@ -40,11 +36,12 @@ static int irq_affinity_read_proc(char *page, char **start, off_t off,
40#endif 36#endif
41 37
42int no_irq_affinity; 38int no_irq_affinity;
43static int irq_affinity_write_proc(struct file *file, const char __user *buffer, 39static ssize_t irq_affinity_proc_write(struct file *file,
44 unsigned long count, void *data) 40 const char __user *buffer, size_t count, loff_t *pos)
45{ 41{
46 unsigned int irq = (int)(long)data, full_count = count, err; 42 unsigned int irq = (int)(long)PDE(file->f_path.dentry->d_inode)->data;
47 cpumask_t new_value, tmp; 43 cpumask_t new_value;
44 int err;
48 45
49 if (!irq_desc[irq].chip->set_affinity || no_irq_affinity || 46 if (!irq_desc[irq].chip->set_affinity || no_irq_affinity ||
50 irq_balancing_disabled(irq)) 47 irq_balancing_disabled(irq))
@@ -62,17 +59,74 @@ static int irq_affinity_write_proc(struct file *file, const char __user *buffer,
62 * way to make the system unusable accidentally :-) At least 59 * way to make the system unusable accidentally :-) At least
63 * one online CPU still has to be targeted. 60 * one online CPU still has to be targeted.
64 */ 61 */
65 cpus_and(tmp, new_value, cpu_online_map); 62 if (!cpus_intersects(new_value, cpu_online_map))
66 if (cpus_empty(tmp))
67 /* Special case for empty set - allow the architecture 63 /* Special case for empty set - allow the architecture
68 code to set default SMP affinity. */ 64 code to set default SMP affinity. */
69 return select_smp_affinity(irq) ? -EINVAL : full_count; 65 return irq_select_affinity(irq) ? -EINVAL : count;
70 66
71 irq_set_affinity(irq, new_value); 67 irq_set_affinity(irq, new_value);
72 68
73 return full_count; 69 return count;
70}
71
72static int irq_affinity_proc_open(struct inode *inode, struct file *file)
73{
74 return single_open(file, irq_affinity_proc_show, PDE(inode)->data);
75}
76
77static const struct file_operations irq_affinity_proc_fops = {
78 .open = irq_affinity_proc_open,
79 .read = seq_read,
80 .llseek = seq_lseek,
81 .release = single_release,
82 .write = irq_affinity_proc_write,
83};
84
85static int default_affinity_show(struct seq_file *m, void *v)
86{
87 seq_cpumask(m, &irq_default_affinity);
88 seq_putc(m, '\n');
89 return 0;
90}
91
92static ssize_t default_affinity_write(struct file *file,
93 const char __user *buffer, size_t count, loff_t *ppos)
94{
95 cpumask_t new_value;
96 int err;
97
98 err = cpumask_parse_user(buffer, count, new_value);
99 if (err)
100 return err;
101
102 if (!is_affinity_mask_valid(new_value))
103 return -EINVAL;
104
105 /*
106 * Do not allow disabling IRQs completely - it's a too easy
107 * way to make the system unusable accidentally :-) At least
108 * one online CPU still has to be targeted.
109 */
110 if (!cpus_intersects(new_value, cpu_online_map))
111 return -EINVAL;
112
113 irq_default_affinity = new_value;
114
115 return count;
116}
117
118static int default_affinity_open(struct inode *inode, struct file *file)
119{
120 return single_open(file, default_affinity_show, NULL);
74} 121}
75 122
123static const struct file_operations default_affinity_proc_fops = {
124 .open = default_affinity_open,
125 .read = seq_read,
126 .llseek = seq_lseek,
127 .release = single_release,
128 .write = default_affinity_write,
129};
76#endif 130#endif
77 131
78static int irq_spurious_read(char *page, char **start, off_t off, 132static int irq_spurious_read(char *page, char **start, off_t off,
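Editor's note: the smp_affinity handlers above now follow the standard single_open()/seq_file pattern rather than raw read_proc/write_proc callbacks. A minimal sketch of that pattern for a read-only proc file (the "demo" name and payload are illustrative, not part of this patch):

	#include <linux/proc_fs.h>
	#include <linux/seq_file.h>

	static int demo_show(struct seq_file *m, void *v)
	{
		seq_printf(m, "%lu\n", (unsigned long)m->private);
		return 0;
	}

	static int demo_open(struct inode *inode, struct file *file)
	{
		/* hand the proc entry's data pointer through as m->private */
		return single_open(file, demo_show, PDE(inode)->data);
	}

	static const struct file_operations demo_proc_fops = {
		.open		= demo_open,
		.read		= seq_read,
		.llseek		= seq_lseek,
		.release	= single_release,
	};

	/* registered with proc_create_data("demo", 0444, NULL, &demo_proc_fops, data) */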
@@ -144,16 +198,9 @@ void register_irq_proc(unsigned int irq)
144 irq_desc[irq].dir = proc_mkdir(name, root_irq_dir); 198 irq_desc[irq].dir = proc_mkdir(name, root_irq_dir);
145 199
146#ifdef CONFIG_SMP 200#ifdef CONFIG_SMP
147 { 201 /* create /proc/irq/<irq>/smp_affinity */
148 /* create /proc/irq/<irq>/smp_affinity */ 202 proc_create_data("smp_affinity", 0600, irq_desc[irq].dir,
149 entry = create_proc_entry("smp_affinity", 0600, irq_desc[irq].dir); 203 &irq_affinity_proc_fops, (void *)(long)irq);
150
151 if (entry) {
152 entry->data = (void *)(long)irq;
153 entry->read_proc = irq_affinity_read_proc;
154 entry->write_proc = irq_affinity_write_proc;
155 }
156 }
157#endif 204#endif
158 205
159 entry = create_proc_entry("spurious", 0444, irq_desc[irq].dir); 206 entry = create_proc_entry("spurious", 0444, irq_desc[irq].dir);
@@ -171,6 +218,14 @@ void unregister_handler_proc(unsigned int irq, struct irqaction *action)
171 remove_proc_entry(action->dir->name, irq_desc[irq].dir); 218 remove_proc_entry(action->dir->name, irq_desc[irq].dir);
172} 219}
173 220
221void register_default_affinity_proc(void)
222{
223#ifdef CONFIG_SMP
224 proc_create("irq/default_smp_affinity", 0600, NULL,
225 &default_affinity_proc_fops);
226#endif
227}
228
174void init_irq_proc(void) 229void init_irq_proc(void)
175{ 230{
176 int i; 231 int i;
@@ -180,6 +235,8 @@ void init_irq_proc(void)
180 if (!root_irq_dir) 235 if (!root_irq_dir)
181 return; 236 return;
182 237
238 register_default_affinity_proc();
239
183 /* 240 /*
184 * Create entries for all existing IRQs. 241 * Create entries for all existing IRQs.
185 */ 242 */
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 6fc0040f3e3a..38fc10ac7541 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -176,7 +176,7 @@ static unsigned long get_symbol_pos(unsigned long addr,
176 high = kallsyms_num_syms; 176 high = kallsyms_num_syms;
177 177
178 while (high - low > 1) { 178 while (high - low > 1) {
179 mid = (low + high) / 2; 179 mid = low + (high - low) / 2;
180 if (kallsyms_addresses[mid] <= addr) 180 if (kallsyms_addresses[mid] <= addr)
181 low = mid; 181 low = mid;
182 else 182 else
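Editor's note: the kallsyms change above is the classic overflow-safe midpoint: low + (high - low) / 2 can never exceed high, whereas (low + high) / 2 can wrap when both indices are large. A tiny standalone demo (plain C, made-up values):

	#include <stdio.h>

	static unsigned int midpoint(unsigned int low, unsigned int high)
	{
		return low + (high - low) / 2;
	}

	int main(void)
	{
		unsigned int low = 4000000000u, high = 4000000100u;

		printf("safe : %u\n", midpoint(low, high));	/* 4000000050 */
		printf("naive: %u\n", (low + high) / 2);	/* sum wrapped, result wrong */
		return 0;
	}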
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 1c5fcacbcf33..aef265325cd3 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -12,7 +12,7 @@
12#include <linux/slab.h> 12#include <linux/slab.h>
13#include <linux/fs.h> 13#include <linux/fs.h>
14#include <linux/kexec.h> 14#include <linux/kexec.h>
15#include <linux/spinlock.h> 15#include <linux/mutex.h>
16#include <linux/list.h> 16#include <linux/list.h>
17#include <linux/highmem.h> 17#include <linux/highmem.h>
18#include <linux/syscalls.h> 18#include <linux/syscalls.h>
@@ -24,6 +24,12 @@
24#include <linux/utsrelease.h> 24#include <linux/utsrelease.h>
25#include <linux/utsname.h> 25#include <linux/utsname.h>
26#include <linux/numa.h> 26#include <linux/numa.h>
27#include <linux/suspend.h>
28#include <linux/device.h>
29#include <linux/freezer.h>
30#include <linux/pm.h>
31#include <linux/cpu.h>
32#include <linux/console.h>
27 33
28#include <asm/page.h> 34#include <asm/page.h>
29#include <asm/uaccess.h> 35#include <asm/uaccess.h>
@@ -71,7 +77,7 @@ int kexec_should_crash(struct task_struct *p)
71 * 77 *
72 * The code for the transition from the current kernel to the 78 * The code for the transition from the current kernel to the
73 * the new kernel is placed in the control_code_buffer, whose size 79 * the new kernel is placed in the control_code_buffer, whose size
74 * is given by KEXEC_CONTROL_CODE_SIZE. In the best case only a single 80 * is given by KEXEC_CONTROL_PAGE_SIZE. In the best case only a single
75 * page of memory is necessary, but some architectures require more. 81 * page of memory is necessary, but some architectures require more.
76 * Because this memory must be identity mapped in the transition from 82 * Because this memory must be identity mapped in the transition from
77 * virtual to physical addresses it must live in the range 83 * virtual to physical addresses it must live in the range
@@ -236,12 +242,18 @@ static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry,
236 */ 242 */
237 result = -ENOMEM; 243 result = -ENOMEM;
238 image->control_code_page = kimage_alloc_control_pages(image, 244 image->control_code_page = kimage_alloc_control_pages(image,
239 get_order(KEXEC_CONTROL_CODE_SIZE)); 245 get_order(KEXEC_CONTROL_PAGE_SIZE));
240 if (!image->control_code_page) { 246 if (!image->control_code_page) {
241 printk(KERN_ERR "Could not allocate control_code_buffer\n"); 247 printk(KERN_ERR "Could not allocate control_code_buffer\n");
242 goto out; 248 goto out;
243 } 249 }
244 250
251 image->swap_page = kimage_alloc_control_pages(image, 0);
252 if (!image->swap_page) {
253 printk(KERN_ERR "Could not allocate swap buffer\n");
254 goto out;
255 }
256
245 result = 0; 257 result = 0;
246 out: 258 out:
247 if (result == 0) 259 if (result == 0)
@@ -305,7 +317,7 @@ static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry,
305 */ 317 */
306 result = -ENOMEM; 318 result = -ENOMEM;
307 image->control_code_page = kimage_alloc_control_pages(image, 319 image->control_code_page = kimage_alloc_control_pages(image,
308 get_order(KEXEC_CONTROL_CODE_SIZE)); 320 get_order(KEXEC_CONTROL_PAGE_SIZE));
309 if (!image->control_code_page) { 321 if (!image->control_code_page) {
310 printk(KERN_ERR "Could not allocate control_code_buffer\n"); 322 printk(KERN_ERR "Could not allocate control_code_buffer\n");
311 goto out; 323 goto out;
@@ -589,14 +601,12 @@ static void kimage_free_extra_pages(struct kimage *image)
589 kimage_free_page_list(&image->unuseable_pages); 601 kimage_free_page_list(&image->unuseable_pages);
590 602
591} 603}
592static int kimage_terminate(struct kimage *image) 604static void kimage_terminate(struct kimage *image)
593{ 605{
594 if (*image->entry != 0) 606 if (*image->entry != 0)
595 image->entry++; 607 image->entry++;
596 608
597 *image->entry = IND_DONE; 609 *image->entry = IND_DONE;
598
599 return 0;
600} 610}
601 611
602#define for_each_kimage_entry(image, ptr, entry) \ 612#define for_each_kimage_entry(image, ptr, entry) \
@@ -743,8 +753,14 @@ static struct page *kimage_alloc_page(struct kimage *image,
743 *old = addr | (*old & ~PAGE_MASK); 753 *old = addr | (*old & ~PAGE_MASK);
744 754
745 /* The old page I have found cannot be a 755 /* The old page I have found cannot be a
746 * destination page, so return it. 756 * destination page, so return it if it's
757 * gfp_flags honor the ones passed in.
747 */ 758 */
759 if (!(gfp_mask & __GFP_HIGHMEM) &&
760 PageHighMem(old_page)) {
761 kimage_free_pages(old_page);
762 continue;
763 }
748 addr = old_addr; 764 addr = old_addr;
749 page = old_page; 765 page = old_page;
750 break; 766 break;
@@ -914,19 +930,14 @@ static int kimage_load_segment(struct kimage *image,
914 */ 930 */
915struct kimage *kexec_image; 931struct kimage *kexec_image;
916struct kimage *kexec_crash_image; 932struct kimage *kexec_crash_image;
917/* 933
918 * A home grown binary mutex. 934static DEFINE_MUTEX(kexec_mutex);
919 * Nothing can wait so this mutex is safe to use
920 * in interrupt context :)
921 */
922static int kexec_lock;
923 935
924asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments, 936asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments,
925 struct kexec_segment __user *segments, 937 struct kexec_segment __user *segments,
926 unsigned long flags) 938 unsigned long flags)
927{ 939{
928 struct kimage **dest_image, *image; 940 struct kimage **dest_image, *image;
929 int locked;
930 int result; 941 int result;
931 942
932 /* We only trust the superuser with rebooting the system. */ 943 /* We only trust the superuser with rebooting the system. */
@@ -962,8 +973,7 @@ asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments,
962 * 973 *
963 * KISS: always take the mutex. 974 * KISS: always take the mutex.
964 */ 975 */
965 locked = xchg(&kexec_lock, 1); 976 if (!mutex_trylock(&kexec_mutex))
966 if (locked)
967 return -EBUSY; 977 return -EBUSY;
968 978
969 dest_image = &kexec_image; 979 dest_image = &kexec_image;
@@ -988,6 +998,8 @@ asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments,
988 if (result) 998 if (result)
989 goto out; 999 goto out;
990 1000
1001 if (flags & KEXEC_PRESERVE_CONTEXT)
1002 image->preserve_context = 1;
991 result = machine_kexec_prepare(image); 1003 result = machine_kexec_prepare(image);
992 if (result) 1004 if (result)
993 goto out; 1005 goto out;
@@ -997,16 +1009,13 @@ asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments,
997 if (result) 1009 if (result)
998 goto out; 1010 goto out;
999 } 1011 }
1000 result = kimage_terminate(image); 1012 kimage_terminate(image);
1001 if (result)
1002 goto out;
1003 } 1013 }
1004 /* Install the new kernel, and Uninstall the old */ 1014 /* Install the new kernel, and Uninstall the old */
1005 image = xchg(dest_image, image); 1015 image = xchg(dest_image, image);
1006 1016
1007out: 1017out:
1008 locked = xchg(&kexec_lock, 0); /* Release the mutex */ 1018 mutex_unlock(&kexec_mutex);
1009 BUG_ON(!locked);
1010 kimage_free(image); 1019 kimage_free(image);
1011 1020
1012 return result; 1021 return result;
@@ -1053,10 +1062,7 @@ asmlinkage long compat_sys_kexec_load(unsigned long entry,
1053 1062
1054void crash_kexec(struct pt_regs *regs) 1063void crash_kexec(struct pt_regs *regs)
1055{ 1064{
1056 int locked; 1065 /* Take the kexec_mutex here to prevent sys_kexec_load
1057
1058
1059 /* Take the kexec_lock here to prevent sys_kexec_load
1060 * running on one cpu from replacing the crash kernel 1066 * running on one cpu from replacing the crash kernel
1061 * we are using after a panic on a different cpu. 1067 * we are using after a panic on a different cpu.
1062 * 1068 *
@@ -1064,8 +1070,7 @@ void crash_kexec(struct pt_regs *regs)
1064 * of memory the xchg(&kexec_crash_image) would be 1070 * of memory the xchg(&kexec_crash_image) would be
1065 * sufficient. But since I reuse the memory... 1071 * sufficient. But since I reuse the memory...
1066 */ 1072 */
1067 locked = xchg(&kexec_lock, 1); 1073 if (mutex_trylock(&kexec_mutex)) {
1068 if (!locked) {
1069 if (kexec_crash_image) { 1074 if (kexec_crash_image) {
1070 struct pt_regs fixed_regs; 1075 struct pt_regs fixed_regs;
1071 crash_setup_regs(&fixed_regs, regs); 1076 crash_setup_regs(&fixed_regs, regs);
@@ -1073,8 +1078,7 @@ void crash_kexec(struct pt_regs *regs)
1073 machine_crash_shutdown(&fixed_regs); 1078 machine_crash_shutdown(&fixed_regs);
1074 machine_kexec(kexec_crash_image); 1079 machine_kexec(kexec_crash_image);
1075 } 1080 }
1076 locked = xchg(&kexec_lock, 0); 1081 mutex_unlock(&kexec_mutex);
1077 BUG_ON(!locked);
1078 } 1082 }
1079} 1083}
1080 1084
@@ -1415,3 +1419,79 @@ static int __init crash_save_vmcoreinfo_init(void)
1415} 1419}
1416 1420
1417module_init(crash_save_vmcoreinfo_init) 1421module_init(crash_save_vmcoreinfo_init)
1422
1423/*
1424 * Move into place and start executing a preloaded standalone
1425 * executable. If nothing was preloaded return an error.
1426 */
1427int kernel_kexec(void)
1428{
1429 int error = 0;
1430
1431 if (!mutex_trylock(&kexec_mutex))
1432 return -EBUSY;
1433 if (!kexec_image) {
1434 error = -EINVAL;
1435 goto Unlock;
1436 }
1437
1438#ifdef CONFIG_KEXEC_JUMP
1439 if (kexec_image->preserve_context) {
1440 mutex_lock(&pm_mutex);
1441 pm_prepare_console();
1442 error = freeze_processes();
1443 if (error) {
1444 error = -EBUSY;
1445 goto Restore_console;
1446 }
1447 suspend_console();
1448 error = device_suspend(PMSG_FREEZE);
1449 if (error)
1450 goto Resume_console;
1451 error = disable_nonboot_cpus();
1452 if (error)
1453 goto Resume_devices;
1454 device_pm_lock();
1455 local_irq_disable();
1456 /* At this point, device_suspend() has been called,
1457 * but *not* device_power_down(). We *must*
1458 * device_power_down() now. Otherwise, drivers for
1459 * some devices (e.g. interrupt controllers) become
1460 * desynchronized with the actual state of the
1461 * hardware at resume time, and evil weirdness ensues.
1462 */
1463 error = device_power_down(PMSG_FREEZE);
1464 if (error)
1465 goto Enable_irqs;
1466 } else
1467#endif
1468 {
1469 kernel_restart_prepare(NULL);
1470 printk(KERN_EMERG "Starting new kernel\n");
1471 machine_shutdown();
1472 }
1473
1474 machine_kexec(kexec_image);
1475
1476#ifdef CONFIG_KEXEC_JUMP
1477 if (kexec_image->preserve_context) {
1478 device_power_up(PMSG_RESTORE);
1479 Enable_irqs:
1480 local_irq_enable();
1481 device_pm_unlock();
1482 enable_nonboot_cpus();
1483 Resume_devices:
1484 device_resume(PMSG_RESTORE);
1485 Resume_console:
1486 resume_console();
1487 thaw_processes();
1488 Restore_console:
1489 pm_restore_console();
1490 mutex_unlock(&pm_mutex);
1491 }
1492#endif
1493
1494 Unlock:
1495 mutex_unlock(&kexec_mutex);
1496 return error;
1497}
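kernel_kexec() above follows the kernel's staged-setup idiom: each preparation step either succeeds or jumps to a label that unwinds everything done so far in reverse order, and the preserve_context path falls through the same labels on the way back from the jump. A stand-alone sketch of that control flow (the step names are invented for illustration):

/* Sketch: staged setup with reverse-order unwinding via goto labels. */
#include <stdio.h>

static int step(const char *name, int fail)
{
        printf("%s\n", name);
        return fail ? -1 : 0;
}

static int do_transition(void)
{
        int error;

        error = step("freeze processes", 0);
        if (error)
                goto out;
        error = step("suspend devices", 0);
        if (error)
                goto thaw;
        error = step("disable nonboot cpus", 1);        /* simulate a failure */
        if (error)
                goto resume_devices;

        printf("jump to the new kernel and back\n");

resume_devices:
        step("resume devices", 0);
thaw:
        step("thaw processes", 0);
out:
        return error;
}

int main(void)
{
        return do_transition() ? 1 : 0;
}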
diff --git a/kernel/kgdb.c b/kernel/kgdb.c
index 3ec23c3ec97f..e4dcfb2272a4 100644
--- a/kernel/kgdb.c
+++ b/kernel/kgdb.c
@@ -56,12 +56,14 @@
56 56
57static int kgdb_break_asap; 57static int kgdb_break_asap;
58 58
59#define KGDB_MAX_THREAD_QUERY 17
59struct kgdb_state { 60struct kgdb_state {
60 int ex_vector; 61 int ex_vector;
61 int signo; 62 int signo;
62 int err_code; 63 int err_code;
63 int cpu; 64 int cpu;
64 int pass_exception; 65 int pass_exception;
66 unsigned long thr_query;
65 unsigned long threadid; 67 unsigned long threadid;
66 long kgdb_usethreadid; 68 long kgdb_usethreadid;
67 struct pt_regs *linux_regs; 69 struct pt_regs *linux_regs;
@@ -166,13 +168,6 @@ early_param("nokgdbroundup", opt_nokgdbroundup);
166 * Weak aliases for breakpoint management, 168 * Weak aliases for breakpoint management,
167 * can be overriden by architectures when needed: 169 * can be overriden by architectures when needed:
168 */ 170 */
169int __weak kgdb_validate_break_address(unsigned long addr)
170{
171 char tmp_variable[BREAK_INSTR_SIZE];
172
173 return probe_kernel_read(tmp_variable, (char *)addr, BREAK_INSTR_SIZE);
174}
175
176int __weak kgdb_arch_set_breakpoint(unsigned long addr, char *saved_instr) 171int __weak kgdb_arch_set_breakpoint(unsigned long addr, char *saved_instr)
177{ 172{
178 int err; 173 int err;
@@ -191,6 +186,25 @@ int __weak kgdb_arch_remove_breakpoint(unsigned long addr, char *bundle)
191 (char *)bundle, BREAK_INSTR_SIZE); 186 (char *)bundle, BREAK_INSTR_SIZE);
192} 187}
193 188
189int __weak kgdb_validate_break_address(unsigned long addr)
190{
191 char tmp_variable[BREAK_INSTR_SIZE];
192 int err;
193 /* Validate setting the breakpoint and then removing it. If the
194 * remove fails, the kernel needs to emit a bad message because we
195 * are in deep trouble, not being able to put things back the way we
196 * found them.
197 */
198 err = kgdb_arch_set_breakpoint(addr, tmp_variable);
199 if (err)
200 return err;
201 err = kgdb_arch_remove_breakpoint(addr, tmp_variable);
202 if (err)
203 printk(KERN_ERR "KGDB: Critical breakpoint error, kernel "
204 "memory destroyed at: %lx", addr);
205 return err;
206}
207
194unsigned long __weak kgdb_arch_pc(int exception, struct pt_regs *regs) 208unsigned long __weak kgdb_arch_pc(int exception, struct pt_regs *regs)
195{ 209{
196 return instruction_pointer(regs); 210 return instruction_pointer(regs);
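The relocated kgdb_validate_break_address() now validates an address the direct way: plant the breakpoint, immediately remove it, and complain loudly if the removal fails, because at that point kernel text has been modified and not restored. A hedged userspace sketch of the same "do it, undo it, treat a failed undo as critical" pattern, with stub set/remove helpers standing in for the arch hooks:

/* Sketch: validate by performing and reverting an operation; a failed
 * revert is reported as a critical error.  set_probe()/remove_probe()
 * are stand-ins, not real kernel interfaces. */
#include <stdio.h>

static int set_probe(unsigned long addr, char *saved)
{
        (void)addr; (void)saved;
        return 0;                       /* pretend the write succeeded */
}

static int remove_probe(unsigned long addr, char *saved)
{
        (void)addr; (void)saved;
        return 0;                       /* pretend the restore succeeded */
}

static int validate_addr(unsigned long addr)
{
        char saved[4];
        int err;

        err = set_probe(addr, saved);
        if (err)
                return err;             /* cannot even write there */
        err = remove_probe(addr, saved);
        if (err)
                fprintf(stderr, "critical: could not restore %#lx\n", addr);
        return err;
}

int main(void)
{
        return validate_addr(0x1000) ? 1 : 0;
}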
@@ -433,9 +447,14 @@ int kgdb_hex2long(char **ptr, unsigned long *long_val)
433{ 447{
434 int hex_val; 448 int hex_val;
435 int num = 0; 449 int num = 0;
450 int negate = 0;
436 451
437 *long_val = 0; 452 *long_val = 0;
438 453
454 if (**ptr == '-') {
455 negate = 1;
456 (*ptr)++;
457 }
439 while (**ptr) { 458 while (**ptr) {
440 hex_val = hex(**ptr); 459 hex_val = hex(**ptr);
441 if (hex_val < 0) 460 if (hex_val < 0)
@@ -446,6 +465,9 @@ int kgdb_hex2long(char **ptr, unsigned long *long_val)
446 (*ptr)++; 465 (*ptr)++;
447 } 466 }
448 467
468 if (negate)
469 *long_val = -*long_val;
470
449 return num; 471 return num;
450} 472}
451 473
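The kgdb_hex2long() change above teaches the parser to accept a leading '-' and negate the accumulated value, which gdb needs for the negative shadow-thread IDs introduced later in this patch. A self-contained sketch of the same parsing logic (the hex() helper is reproduced here so the sketch compiles on its own):

/* Sketch: parse an optionally negative hex number, mirroring the new
 * kgdb_hex2long() behaviour. */
#include <stdio.h>

static int hex(char ch)
{
        if (ch >= '0' && ch <= '9') return ch - '0';
        if (ch >= 'a' && ch <= 'f') return ch - 'a' + 10;
        if (ch >= 'A' && ch <= 'F') return ch - 'A' + 10;
        return -1;
}

static int hex2long(const char **ptr, long *long_val)
{
        int digits = 0, negate = 0, d;

        *long_val = 0;
        if (**ptr == '-') {
                negate = 1;
                (*ptr)++;
        }
        while ((d = hex(**ptr)) >= 0) {
                *long_val = (*long_val << 4) | d;
                digits++;
                (*ptr)++;
        }
        if (negate)
                *long_val = -*long_val;
        return digits;
}

int main(void)
{
        const char *s = "-1a";
        long v;

        hex2long(&s, &v);
        printf("%ld\n", v);             /* prints -26 */
        return 0;
}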
@@ -466,7 +488,7 @@ static int write_mem_msg(int binary)
466 if (err) 488 if (err)
467 return err; 489 return err;
468 if (CACHE_FLUSH_IS_SAFE) 490 if (CACHE_FLUSH_IS_SAFE)
469 flush_icache_range(addr, addr + length + 1); 491 flush_icache_range(addr, addr + length);
470 return 0; 492 return 0;
471 } 493 }
472 494
@@ -515,10 +537,16 @@ static void int_to_threadref(unsigned char *id, int value)
515static struct task_struct *getthread(struct pt_regs *regs, int tid) 537static struct task_struct *getthread(struct pt_regs *regs, int tid)
516{ 538{
517 /* 539 /*
518 * Non-positive TIDs are remapped idle tasks: 540 * Non-positive TIDs are remapped to the cpu shadow information
519 */ 541 */
520 if (tid <= 0) 542 if (tid == 0 || tid == -1)
521 return idle_task(-tid); 543 tid = -atomic_read(&kgdb_active) - 2;
544 if (tid < 0) {
545 if (kgdb_info[-tid - 2].task)
546 return kgdb_info[-tid - 2].task;
547 else
548 return idle_task(-tid - 2);
549 }
522 550
523 /* 551 /*
524 * find_task_by_pid_ns() does not take the tasklist lock anymore 552 * find_task_by_pid_ns() does not take the tasklist lock anymore
@@ -562,6 +590,7 @@ static void kgdb_wait(struct pt_regs *regs)
562 590
563 /* Signal the primary CPU that we are done: */ 591 /* Signal the primary CPU that we are done: */
564 atomic_set(&cpu_in_kgdb[cpu], 0); 592 atomic_set(&cpu_in_kgdb[cpu], 0);
593 touch_softlockup_watchdog();
565 clocksource_touch_watchdog(); 594 clocksource_touch_watchdog();
566 local_irq_restore(flags); 595 local_irq_restore(flags);
567} 596}
@@ -725,14 +754,15 @@ setundefined:
725} 754}
726 755
727/* 756/*
728 * Remap normal tasks to their real PID, idle tasks to -1 ... -NR_CPUs: 757 * Remap normal tasks to their real PID,
758 * CPU shadow threads are mapped to -CPU - 2
729 */ 759 */
730static inline int shadow_pid(int realpid) 760static inline int shadow_pid(int realpid)
731{ 761{
732 if (realpid) 762 if (realpid)
733 return realpid; 763 return realpid;
734 764
735 return -1-raw_smp_processor_id(); 765 return -raw_smp_processor_id() - 2;
736} 766}
737 767
738static char gdbmsgbuf[BUFMAX + 1]; 768static char gdbmsgbuf[BUFMAX + 1];
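With the shadow_pid() change above, per-CPU debugger state is no longer presented as the idle task's PID but as a synthetic thread ID of -cpu - 2; 0 and -1 are kept free because the gdb remote protocol reserves them for "any thread" and "all threads". A tiny sketch of the encode/decode pair:

/* Sketch: CPUs as negative shadow thread IDs, -cpu - 2, keeping 0 and -1
 * free for the gdb remote protocol's special meanings. */
#include <stdio.h>

static long shadow_id(int cpu)          { return -cpu - 2; }
static int  shadow_cpu(long tid)        { return (int)(-tid - 2); }

int main(void)
{
        for (int cpu = 0; cpu < 4; cpu++) {
                long tid = shadow_id(cpu);

                printf("cpu %d <-> tid %ld <-> cpu %d\n",
                       cpu, tid, shadow_cpu(tid));
        }
        return 0;
}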
@@ -826,7 +856,7 @@ static void gdb_cmd_getregs(struct kgdb_state *ks)
826 local_debuggerinfo = kgdb_info[ks->cpu].debuggerinfo; 856 local_debuggerinfo = kgdb_info[ks->cpu].debuggerinfo;
827 } else { 857 } else {
828 local_debuggerinfo = NULL; 858 local_debuggerinfo = NULL;
829 for (i = 0; i < NR_CPUS; i++) { 859 for_each_online_cpu(i) {
830 /* 860 /*
831 * Try to find the task on some other 861 * Try to find the task on some other
832 * or possibly this node if we do not 862 * or possibly this node if we do not
@@ -960,10 +990,13 @@ static int gdb_cmd_reboot(struct kgdb_state *ks)
960/* Handle the 'q' query packets */ 990/* Handle the 'q' query packets */
961static void gdb_cmd_query(struct kgdb_state *ks) 991static void gdb_cmd_query(struct kgdb_state *ks)
962{ 992{
963 struct task_struct *thread; 993 struct task_struct *g;
994 struct task_struct *p;
964 unsigned char thref[8]; 995 unsigned char thref[8];
965 char *ptr; 996 char *ptr;
966 int i; 997 int i;
998 int cpu;
999 int finished = 0;
967 1000
968 switch (remcom_in_buffer[1]) { 1001 switch (remcom_in_buffer[1]) {
969 case 's': 1002 case 's':
@@ -973,22 +1006,34 @@ static void gdb_cmd_query(struct kgdb_state *ks)
973 break; 1006 break;
974 } 1007 }
975 1008
976 if (remcom_in_buffer[1] == 'f') 1009 i = 0;
977 ks->threadid = 1;
978
979 remcom_out_buffer[0] = 'm'; 1010 remcom_out_buffer[0] = 'm';
980 ptr = remcom_out_buffer + 1; 1011 ptr = remcom_out_buffer + 1;
981 1012 if (remcom_in_buffer[1] == 'f') {
982 for (i = 0; i < 17; ks->threadid++) { 1013 /* Each cpu is a shadow thread */
983 thread = getthread(ks->linux_regs, ks->threadid); 1014 for_each_online_cpu(cpu) {
984 if (thread) { 1015 ks->thr_query = 0;
985 int_to_threadref(thref, ks->threadid); 1016 int_to_threadref(thref, -cpu - 2);
986 pack_threadid(ptr, thref); 1017 pack_threadid(ptr, thref);
987 ptr += BUF_THREAD_ID_SIZE; 1018 ptr += BUF_THREAD_ID_SIZE;
988 *(ptr++) = ','; 1019 *(ptr++) = ',';
989 i++; 1020 i++;
990 } 1021 }
991 } 1022 }
1023
1024 do_each_thread(g, p) {
1025 if (i >= ks->thr_query && !finished) {
1026 int_to_threadref(thref, p->pid);
1027 pack_threadid(ptr, thref);
1028 ptr += BUF_THREAD_ID_SIZE;
1029 *(ptr++) = ',';
1030 ks->thr_query++;
1031 if (ks->thr_query % KGDB_MAX_THREAD_QUERY == 0)
1032 finished = 1;
1033 }
1034 i++;
1035 } while_each_thread(g, p);
1036
992 *(--ptr) = '\0'; 1037 *(--ptr) = '\0';
993 break; 1038 break;
994 1039
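The reworked qfThreadInfo/qsThreadInfo handling first reports one shadow thread per online CPU and then walks the real task list, but emits at most KGDB_MAX_THREAD_QUERY entries per reply, using ks->thr_query as a cursor so the next 'qs' packet resumes where the previous reply stopped. A sketch of that cursor-based chunking over a plain array (sizes and names here are illustrative):

/* Sketch: report a long list in fixed-size chunks, resuming from a
 * saved cursor on each request. */
#include <stdio.h>

#define CHUNK 4

static int items[] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 };
static int nitems = sizeof(items) / sizeof(items[0]);
static int cursor;                      /* plays the role of ks->thr_query */

/* Returns how many items were emitted; 0 means the listing is done. */
static int next_chunk(void)
{
        int emitted = 0;

        while (cursor < nitems && emitted < CHUNK) {
                printf("%d,", items[cursor]);
                cursor++;
                emitted++;
        }
        if (emitted)
                printf("\n");
        return emitted;
}

int main(void)
{
        while (next_chunk() > 0)
                ;                       /* each call answers one 'qs' request */
        return 0;
}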
@@ -1011,15 +1056,15 @@ static void gdb_cmd_query(struct kgdb_state *ks)
1011 error_packet(remcom_out_buffer, -EINVAL); 1056 error_packet(remcom_out_buffer, -EINVAL);
1012 break; 1057 break;
1013 } 1058 }
1014 if (ks->threadid > 0) { 1059 if ((int)ks->threadid > 0) {
1015 kgdb_mem2hex(getthread(ks->linux_regs, 1060 kgdb_mem2hex(getthread(ks->linux_regs,
1016 ks->threadid)->comm, 1061 ks->threadid)->comm,
1017 remcom_out_buffer, 16); 1062 remcom_out_buffer, 16);
1018 } else { 1063 } else {
1019 static char tmpstr[23 + BUF_THREAD_ID_SIZE]; 1064 static char tmpstr[23 + BUF_THREAD_ID_SIZE];
1020 1065
1021 sprintf(tmpstr, "Shadow task %d for pid 0", 1066 sprintf(tmpstr, "shadowCPU%d",
1022 (int)(-ks->threadid-1)); 1067 (int)(-ks->threadid - 2));
1023 kgdb_mem2hex(tmpstr, remcom_out_buffer, strlen(tmpstr)); 1068 kgdb_mem2hex(tmpstr, remcom_out_buffer, strlen(tmpstr));
1024 } 1069 }
1025 break; 1070 break;
@@ -1388,6 +1433,7 @@ acquirelock:
1388 atomic_read(&kgdb_cpu_doing_single_step) != cpu) { 1433 atomic_read(&kgdb_cpu_doing_single_step) != cpu) {
1389 1434
1390 atomic_set(&kgdb_active, -1); 1435 atomic_set(&kgdb_active, -1);
1436 touch_softlockup_watchdog();
1391 clocksource_touch_watchdog(); 1437 clocksource_touch_watchdog();
1392 local_irq_restore(flags); 1438 local_irq_restore(flags);
1393 1439
@@ -1418,7 +1464,7 @@ acquirelock:
1418 * Get the passive CPU lock which will hold all the non-primary 1464 * Get the passive CPU lock which will hold all the non-primary
1419 * CPU in a spin state while the debugger is active 1465 * CPU in a spin state while the debugger is active
1420 */ 1466 */
1421 if (!kgdb_single_step || !kgdb_contthread) { 1467 if (!kgdb_single_step) {
1422 for (i = 0; i < NR_CPUS; i++) 1468 for (i = 0; i < NR_CPUS; i++)
1423 atomic_set(&passive_cpu_wait[i], 1); 1469 atomic_set(&passive_cpu_wait[i], 1);
1424 } 1470 }
@@ -1431,7 +1477,7 @@ acquirelock:
1431 1477
1432#ifdef CONFIG_SMP 1478#ifdef CONFIG_SMP
1433 /* Signal the other CPUs to enter kgdb_wait() */ 1479 /* Signal the other CPUs to enter kgdb_wait() */
1434 if ((!kgdb_single_step || !kgdb_contthread) && kgdb_do_roundup) 1480 if ((!kgdb_single_step) && kgdb_do_roundup)
1435 kgdb_roundup_cpus(flags); 1481 kgdb_roundup_cpus(flags);
1436#endif 1482#endif
1437 1483
@@ -1450,7 +1496,7 @@ acquirelock:
1450 kgdb_post_primary_code(ks->linux_regs, ks->ex_vector, ks->err_code); 1496 kgdb_post_primary_code(ks->linux_regs, ks->ex_vector, ks->err_code);
1451 kgdb_deactivate_sw_breakpoints(); 1497 kgdb_deactivate_sw_breakpoints();
1452 kgdb_single_step = 0; 1498 kgdb_single_step = 0;
1453 kgdb_contthread = NULL; 1499 kgdb_contthread = current;
1454 exception_level = 0; 1500 exception_level = 0;
1455 1501
1456 /* Talk to debugger with gdbserial protocol */ 1502 /* Talk to debugger with gdbserial protocol */
@@ -1464,7 +1510,7 @@ acquirelock:
1464 kgdb_info[ks->cpu].task = NULL; 1510 kgdb_info[ks->cpu].task = NULL;
1465 atomic_set(&cpu_in_kgdb[ks->cpu], 0); 1511 atomic_set(&cpu_in_kgdb[ks->cpu], 0);
1466 1512
1467 if (!kgdb_single_step || !kgdb_contthread) { 1513 if (!kgdb_single_step) {
1468 for (i = NR_CPUS-1; i >= 0; i--) 1514 for (i = NR_CPUS-1; i >= 0; i--)
1469 atomic_set(&passive_cpu_wait[i], 0); 1515 atomic_set(&passive_cpu_wait[i], 0);
1470 /* 1516 /*
@@ -1480,6 +1526,7 @@ acquirelock:
1480kgdb_restore: 1526kgdb_restore:
1481 /* Free kgdb_active */ 1527 /* Free kgdb_active */
1482 atomic_set(&kgdb_active, -1); 1528 atomic_set(&kgdb_active, -1);
1529 touch_softlockup_watchdog();
1483 clocksource_touch_watchdog(); 1530 clocksource_touch_watchdog();
1484 local_irq_restore(flags); 1531 local_irq_restore(flags);
1485 1532
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 8df97d3dfda8..2456d1a0befb 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -42,7 +42,7 @@ extern int max_threads;
42 42
43static struct workqueue_struct *khelper_wq; 43static struct workqueue_struct *khelper_wq;
44 44
45#ifdef CONFIG_KMOD 45#ifdef CONFIG_MODULES
46 46
47/* 47/*
48 modprobe_path is set via /proc/sys. 48 modprobe_path is set via /proc/sys.
@@ -352,16 +352,17 @@ static inline void register_pm_notifier_callback(void) {}
352 * @path: path to usermode executable 352 * @path: path to usermode executable
353 * @argv: arg vector for process 353 * @argv: arg vector for process
354 * @envp: environment for process 354 * @envp: environment for process
355 * @gfp_mask: gfp mask for memory allocation
355 * 356 *
356 * Returns either %NULL on allocation failure, or a subprocess_info 357 * Returns either %NULL on allocation failure, or a subprocess_info
357 * structure. This should be passed to call_usermodehelper_exec to 358 * structure. This should be passed to call_usermodehelper_exec to
358 * exec the process and free the structure. 359 * exec the process and free the structure.
359 */ 360 */
360struct subprocess_info *call_usermodehelper_setup(char *path, 361struct subprocess_info *call_usermodehelper_setup(char *path, char **argv,
361 char **argv, char **envp) 362 char **envp, gfp_t gfp_mask)
362{ 363{
363 struct subprocess_info *sub_info; 364 struct subprocess_info *sub_info;
364 sub_info = kzalloc(sizeof(struct subprocess_info), GFP_ATOMIC); 365 sub_info = kzalloc(sizeof(struct subprocess_info), gfp_mask);
365 if (!sub_info) 366 if (!sub_info)
366 goto out; 367 goto out;
367 368
@@ -417,12 +418,12 @@ int call_usermodehelper_stdinpipe(struct subprocess_info *sub_info,
417{ 418{
418 struct file *f; 419 struct file *f;
419 420
420 f = create_write_pipe(); 421 f = create_write_pipe(0);
421 if (IS_ERR(f)) 422 if (IS_ERR(f))
422 return PTR_ERR(f); 423 return PTR_ERR(f);
423 *filp = f; 424 *filp = f;
424 425
425 f = create_read_pipe(f); 426 f = create_read_pipe(f, 0);
426 if (IS_ERR(f)) { 427 if (IS_ERR(f)) {
427 free_write_pipe(*filp); 428 free_write_pipe(*filp);
428 return PTR_ERR(f); 429 return PTR_ERR(f);
@@ -494,7 +495,7 @@ int call_usermodehelper_pipe(char *path, char **argv, char **envp,
494 struct subprocess_info *sub_info; 495 struct subprocess_info *sub_info;
495 int ret; 496 int ret;
496 497
497 sub_info = call_usermodehelper_setup(path, argv, envp); 498 sub_info = call_usermodehelper_setup(path, argv, envp, GFP_KERNEL);
498 if (sub_info == NULL) 499 if (sub_info == NULL)
499 return -ENOMEM; 500 return -ENOMEM;
500 501
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index d4998f81e229..75bc2cd9ebc6 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -62,6 +62,7 @@
62 addr = ((kprobe_opcode_t *)(kallsyms_lookup_name(name))) 62 addr = ((kprobe_opcode_t *)(kallsyms_lookup_name(name)))
63#endif 63#endif
64 64
65static int kprobes_initialized;
65static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE]; 66static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE];
66static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE]; 67static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE];
67 68
@@ -69,8 +70,15 @@ static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE];
69static bool kprobe_enabled; 70static bool kprobe_enabled;
70 71
71DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */ 72DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */
72DEFINE_SPINLOCK(kretprobe_lock); /* Protects kretprobe_inst_table */
73static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; 73static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL;
74static struct {
75 spinlock_t lock ____cacheline_aligned;
76} kretprobe_table_locks[KPROBE_TABLE_SIZE];
77
78static spinlock_t *kretprobe_table_lock_ptr(unsigned long hash)
79{
80 return &(kretprobe_table_locks[hash].lock);
81}
74 82
75/* 83/*
76 * Normally, functions that we'd want to prohibit kprobes in, are marked 84 * Normally, functions that we'd want to prohibit kprobes in, are marked
@@ -79,7 +87,7 @@ static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL;
79 * 87 *
80 * For such cases, we now have a blacklist 88 * For such cases, we now have a blacklist
81 */ 89 */
82struct kprobe_blackpoint kprobe_blacklist[] = { 90static struct kprobe_blackpoint kprobe_blacklist[] = {
83 {"preempt_schedule",}, 91 {"preempt_schedule",},
84 {NULL} /* Terminator */ 92 {NULL} /* Terminator */
85}; 93};
@@ -368,26 +376,53 @@ void __kprobes kprobes_inc_nmissed_count(struct kprobe *p)
368 return; 376 return;
369} 377}
370 378
371/* Called with kretprobe_lock held */
372void __kprobes recycle_rp_inst(struct kretprobe_instance *ri, 379void __kprobes recycle_rp_inst(struct kretprobe_instance *ri,
373 struct hlist_head *head) 380 struct hlist_head *head)
374{ 381{
382 struct kretprobe *rp = ri->rp;
383
375 /* remove rp inst off the rprobe_inst_table */ 384 /* remove rp inst off the rprobe_inst_table */
376 hlist_del(&ri->hlist); 385 hlist_del(&ri->hlist);
377 if (ri->rp) { 386 INIT_HLIST_NODE(&ri->hlist);
378 /* remove rp inst off the used list */ 387 if (likely(rp)) {
379 hlist_del(&ri->uflist); 388 spin_lock(&rp->lock);
380 /* put rp inst back onto the free list */ 389 hlist_add_head(&ri->hlist, &rp->free_instances);
381 INIT_HLIST_NODE(&ri->uflist); 390 spin_unlock(&rp->lock);
382 hlist_add_head(&ri->uflist, &ri->rp->free_instances);
383 } else 391 } else
384 /* Unregistering */ 392 /* Unregistering */
385 hlist_add_head(&ri->hlist, head); 393 hlist_add_head(&ri->hlist, head);
386} 394}
387 395
388struct hlist_head __kprobes *kretprobe_inst_table_head(struct task_struct *tsk) 396void kretprobe_hash_lock(struct task_struct *tsk,
397 struct hlist_head **head, unsigned long *flags)
389{ 398{
390 return &kretprobe_inst_table[hash_ptr(tsk, KPROBE_HASH_BITS)]; 399 unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS);
400 spinlock_t *hlist_lock;
401
402 *head = &kretprobe_inst_table[hash];
403 hlist_lock = kretprobe_table_lock_ptr(hash);
404 spin_lock_irqsave(hlist_lock, *flags);
405}
406
407void kretprobe_table_lock(unsigned long hash, unsigned long *flags)
408{
409 spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash);
410 spin_lock_irqsave(hlist_lock, *flags);
411}
412
413void kretprobe_hash_unlock(struct task_struct *tsk, unsigned long *flags)
414{
415 unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS);
416 spinlock_t *hlist_lock;
417
418 hlist_lock = kretprobe_table_lock_ptr(hash);
419 spin_unlock_irqrestore(hlist_lock, *flags);
420}
421
422void kretprobe_table_unlock(unsigned long hash, unsigned long *flags)
423{
424 spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash);
425 spin_unlock_irqrestore(hlist_lock, *flags);
391} 426}
392 427
393/* 428/*
@@ -401,17 +436,21 @@ void __kprobes kprobe_flush_task(struct task_struct *tk)
401 struct kretprobe_instance *ri; 436 struct kretprobe_instance *ri;
402 struct hlist_head *head, empty_rp; 437 struct hlist_head *head, empty_rp;
403 struct hlist_node *node, *tmp; 438 struct hlist_node *node, *tmp;
404 unsigned long flags = 0; 439 unsigned long hash, flags = 0;
405 440
406 INIT_HLIST_HEAD(&empty_rp); 441 if (unlikely(!kprobes_initialized))
407 spin_lock_irqsave(&kretprobe_lock, flags); 442 /* Early boot. kretprobe_table_locks not yet initialized. */
408 head = kretprobe_inst_table_head(tk); 443 return;
444
445 hash = hash_ptr(tk, KPROBE_HASH_BITS);
446 head = &kretprobe_inst_table[hash];
447 kretprobe_table_lock(hash, &flags);
409 hlist_for_each_entry_safe(ri, node, tmp, head, hlist) { 448 hlist_for_each_entry_safe(ri, node, tmp, head, hlist) {
410 if (ri->task == tk) 449 if (ri->task == tk)
411 recycle_rp_inst(ri, &empty_rp); 450 recycle_rp_inst(ri, &empty_rp);
412 } 451 }
413 spin_unlock_irqrestore(&kretprobe_lock, flags); 452 kretprobe_table_unlock(hash, &flags);
414 453 INIT_HLIST_HEAD(&empty_rp);
415 hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) { 454 hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) {
416 hlist_del(&ri->hlist); 455 hlist_del(&ri->hlist);
417 kfree(ri); 456 kfree(ri);
@@ -423,24 +462,29 @@ static inline void free_rp_inst(struct kretprobe *rp)
423 struct kretprobe_instance *ri; 462 struct kretprobe_instance *ri;
424 struct hlist_node *pos, *next; 463 struct hlist_node *pos, *next;
425 464
426 hlist_for_each_entry_safe(ri, pos, next, &rp->free_instances, uflist) { 465 hlist_for_each_entry_safe(ri, pos, next, &rp->free_instances, hlist) {
427 hlist_del(&ri->uflist); 466 hlist_del(&ri->hlist);
428 kfree(ri); 467 kfree(ri);
429 } 468 }
430} 469}
431 470
432static void __kprobes cleanup_rp_inst(struct kretprobe *rp) 471static void __kprobes cleanup_rp_inst(struct kretprobe *rp)
433{ 472{
434 unsigned long flags; 473 unsigned long flags, hash;
435 struct kretprobe_instance *ri; 474 struct kretprobe_instance *ri;
436 struct hlist_node *pos, *next; 475 struct hlist_node *pos, *next;
476 struct hlist_head *head;
477
437 /* No race here */ 478 /* No race here */
438 spin_lock_irqsave(&kretprobe_lock, flags); 479 for (hash = 0; hash < KPROBE_TABLE_SIZE; hash++) {
439 hlist_for_each_entry_safe(ri, pos, next, &rp->used_instances, uflist) { 480 kretprobe_table_lock(hash, &flags);
440 ri->rp = NULL; 481 head = &kretprobe_inst_table[hash];
441 hlist_del(&ri->uflist); 482 hlist_for_each_entry_safe(ri, pos, next, head, hlist) {
483 if (ri->rp == rp)
484 ri->rp = NULL;
485 }
486 kretprobe_table_unlock(hash, &flags);
442 } 487 }
443 spin_unlock_irqrestore(&kretprobe_lock, flags);
444 free_rp_inst(rp); 488 free_rp_inst(rp);
445} 489}
446 490
@@ -831,32 +875,37 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p,
831 struct pt_regs *regs) 875 struct pt_regs *regs)
832{ 876{
833 struct kretprobe *rp = container_of(p, struct kretprobe, kp); 877 struct kretprobe *rp = container_of(p, struct kretprobe, kp);
834 unsigned long flags = 0; 878 unsigned long hash, flags = 0;
879 struct kretprobe_instance *ri;
835 880
836 /*TODO: consider to only swap the RA after the last pre_handler fired */ 881 /*TODO: consider to only swap the RA after the last pre_handler fired */
837 spin_lock_irqsave(&kretprobe_lock, flags); 882 hash = hash_ptr(current, KPROBE_HASH_BITS);
883 spin_lock_irqsave(&rp->lock, flags);
838 if (!hlist_empty(&rp->free_instances)) { 884 if (!hlist_empty(&rp->free_instances)) {
839 struct kretprobe_instance *ri;
840
841 ri = hlist_entry(rp->free_instances.first, 885 ri = hlist_entry(rp->free_instances.first,
842 struct kretprobe_instance, uflist); 886 struct kretprobe_instance, hlist);
887 hlist_del(&ri->hlist);
888 spin_unlock_irqrestore(&rp->lock, flags);
889
843 ri->rp = rp; 890 ri->rp = rp;
844 ri->task = current; 891 ri->task = current;
845 892
846 if (rp->entry_handler && rp->entry_handler(ri, regs)) { 893 if (rp->entry_handler && rp->entry_handler(ri, regs)) {
847 spin_unlock_irqrestore(&kretprobe_lock, flags); 894 spin_unlock_irqrestore(&rp->lock, flags);
848 return 0; 895 return 0;
849 } 896 }
850 897
851 arch_prepare_kretprobe(ri, regs); 898 arch_prepare_kretprobe(ri, regs);
852 899
853 /* XXX(hch): why is there no hlist_move_head? */ 900 /* XXX(hch): why is there no hlist_move_head? */
854 hlist_del(&ri->uflist); 901 INIT_HLIST_NODE(&ri->hlist);
855 hlist_add_head(&ri->uflist, &ri->rp->used_instances); 902 kretprobe_table_lock(hash, &flags);
856 hlist_add_head(&ri->hlist, kretprobe_inst_table_head(ri->task)); 903 hlist_add_head(&ri->hlist, &kretprobe_inst_table[hash]);
857 } else 904 kretprobe_table_unlock(hash, &flags);
905 } else {
858 rp->nmissed++; 906 rp->nmissed++;
859 spin_unlock_irqrestore(&kretprobe_lock, flags); 907 spin_unlock_irqrestore(&rp->lock, flags);
908 }
860 return 0; 909 return 0;
861} 910}
862 911
@@ -892,7 +941,7 @@ static int __kprobes __register_kretprobe(struct kretprobe *rp,
892 rp->maxactive = NR_CPUS; 941 rp->maxactive = NR_CPUS;
893#endif 942#endif
894 } 943 }
895 INIT_HLIST_HEAD(&rp->used_instances); 944 spin_lock_init(&rp->lock);
896 INIT_HLIST_HEAD(&rp->free_instances); 945 INIT_HLIST_HEAD(&rp->free_instances);
897 for (i = 0; i < rp->maxactive; i++) { 946 for (i = 0; i < rp->maxactive; i++) {
898 inst = kmalloc(sizeof(struct kretprobe_instance) + 947 inst = kmalloc(sizeof(struct kretprobe_instance) +
@@ -901,8 +950,8 @@ static int __kprobes __register_kretprobe(struct kretprobe *rp,
901 free_rp_inst(rp); 950 free_rp_inst(rp);
902 return -ENOMEM; 951 return -ENOMEM;
903 } 952 }
904 INIT_HLIST_NODE(&inst->uflist); 953 INIT_HLIST_NODE(&inst->hlist);
905 hlist_add_head(&inst->uflist, &rp->free_instances); 954 hlist_add_head(&inst->hlist, &rp->free_instances);
906 } 955 }
907 956
908 rp->nmissed = 0; 957 rp->nmissed = 0;
@@ -1009,6 +1058,7 @@ static int __init init_kprobes(void)
1009 for (i = 0; i < KPROBE_TABLE_SIZE; i++) { 1058 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
1010 INIT_HLIST_HEAD(&kprobe_table[i]); 1059 INIT_HLIST_HEAD(&kprobe_table[i]);
1011 INIT_HLIST_HEAD(&kretprobe_inst_table[i]); 1060 INIT_HLIST_HEAD(&kretprobe_inst_table[i]);
1061 spin_lock_init(&(kretprobe_table_locks[i].lock));
1012 } 1062 }
1013 1063
1014 /* 1064 /*
@@ -1050,6 +1100,7 @@ static int __init init_kprobes(void)
1050 err = arch_init_kprobes(); 1100 err = arch_init_kprobes();
1051 if (!err) 1101 if (!err)
1052 err = register_die_notifier(&kprobe_exceptions_nb); 1102 err = register_die_notifier(&kprobe_exceptions_nb);
1103 kprobes_initialized = (err == 0);
1053 1104
1054 if (!err) 1105 if (!err)
1055 init_test_probes(); 1106 init_test_probes();
@@ -1286,13 +1337,8 @@ EXPORT_SYMBOL_GPL(register_jprobe);
1286EXPORT_SYMBOL_GPL(unregister_jprobe); 1337EXPORT_SYMBOL_GPL(unregister_jprobe);
1287EXPORT_SYMBOL_GPL(register_jprobes); 1338EXPORT_SYMBOL_GPL(register_jprobes);
1288EXPORT_SYMBOL_GPL(unregister_jprobes); 1339EXPORT_SYMBOL_GPL(unregister_jprobes);
1289#ifdef CONFIG_KPROBES
1290EXPORT_SYMBOL_GPL(jprobe_return); 1340EXPORT_SYMBOL_GPL(jprobe_return);
1291#endif
1292
1293#ifdef CONFIG_KPROBES
1294EXPORT_SYMBOL_GPL(register_kretprobe); 1341EXPORT_SYMBOL_GPL(register_kretprobe);
1295EXPORT_SYMBOL_GPL(unregister_kretprobe); 1342EXPORT_SYMBOL_GPL(unregister_kretprobe);
1296EXPORT_SYMBOL_GPL(register_kretprobes); 1343EXPORT_SYMBOL_GPL(register_kretprobes);
1297EXPORT_SYMBOL_GPL(unregister_kretprobes); 1344EXPORT_SYMBOL_GPL(unregister_kretprobes);
1298#endif
diff --git a/kernel/kthread.c b/kernel/kthread.c
index bd1b9ea024e1..96cff2f8710b 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -106,7 +106,7 @@ static void create_kthread(struct kthread_create_info *create)
106 */ 106 */
107 sched_setscheduler(create->result, SCHED_NORMAL, &param); 107 sched_setscheduler(create->result, SCHED_NORMAL, &param);
108 set_user_nice(create->result, KTHREAD_NICE_LEVEL); 108 set_user_nice(create->result, KTHREAD_NICE_LEVEL);
109 set_cpus_allowed(create->result, CPU_MASK_ALL); 109 set_cpus_allowed_ptr(create->result, CPU_MASK_ALL_PTR);
110 } 110 }
111 complete(&create->done); 111 complete(&create->done);
112} 112}
@@ -176,10 +176,11 @@ void kthread_bind(struct task_struct *k, unsigned int cpu)
176 return; 176 return;
177 } 177 }
178 /* Must have done schedule() in kthread() before we set_task_cpu */ 178 /* Must have done schedule() in kthread() before we set_task_cpu */
179 wait_task_inactive(k); 179 wait_task_inactive(k, 0);
180 set_task_cpu(k, cpu); 180 set_task_cpu(k, cpu);
181 k->cpus_allowed = cpumask_of_cpu(cpu); 181 k->cpus_allowed = cpumask_of_cpu(cpu);
182 k->rt.nr_cpus_allowed = 1; 182 k->rt.nr_cpus_allowed = 1;
183 k->flags |= PF_THREAD_BOUND;
183} 184}
184EXPORT_SYMBOL(kthread_bind); 185EXPORT_SYMBOL(kthread_bind);
185 186
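kthread_bind() pins a kernel thread to a single CPU and, with this change, also sets PF_THREAD_BOUND so later attempts to change the affinity are refused. There is no userspace equivalent of that flag, but the binding itself looks much like pthread affinity; a hedged, Linux-only sketch (pthread_setaffinity_np() and sched_getcpu() are GNU extensions):

/* Sketch: bind the calling thread to CPU 0.  Linux-specific; build with
 * cc -pthread bind.c */
#define _GNU_SOURCE
#include <pthread.h>
#include <sched.h>
#include <stdio.h>

int main(void)
{
        cpu_set_t set;
        int err;

        CPU_ZERO(&set);
        CPU_SET(0, &set);
        err = pthread_setaffinity_np(pthread_self(), sizeof(set), &set);
        if (err) {
                fprintf(stderr, "pthread_setaffinity_np: %d\n", err);
                return 1;
        }
        printf("now running on CPU %d\n", sched_getcpu());
        return 0;
}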
@@ -232,9 +233,9 @@ int kthreadd(void *unused)
232 set_task_comm(tsk, "kthreadd"); 233 set_task_comm(tsk, "kthreadd");
233 ignore_signals(tsk); 234 ignore_signals(tsk);
234 set_user_nice(tsk, KTHREAD_NICE_LEVEL); 235 set_user_nice(tsk, KTHREAD_NICE_LEVEL);
235 set_cpus_allowed(tsk, CPU_MASK_ALL); 236 set_cpus_allowed_ptr(tsk, CPU_MASK_ALL_PTR);
236 237
237 current->flags |= PF_NOFREEZE; 238 current->flags |= PF_NOFREEZE | PF_FREEZER_NOSIG;
238 239
239 for (;;) { 240 for (;;) {
240 set_current_state(TASK_INTERRUPTIBLE); 241 set_current_state(TASK_INTERRUPTIBLE);
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 81a4e4a3f087..dbda475b13bd 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -39,6 +39,7 @@
39#include <linux/irqflags.h> 39#include <linux/irqflags.h>
40#include <linux/utsname.h> 40#include <linux/utsname.h>
41#include <linux/hash.h> 41#include <linux/hash.h>
42#include <linux/ftrace.h>
42 43
43#include <asm/sections.h> 44#include <asm/sections.h>
44 45
@@ -81,6 +82,8 @@ static int graph_lock(void)
81 __raw_spin_unlock(&lockdep_lock); 82 __raw_spin_unlock(&lockdep_lock);
82 return 0; 83 return 0;
83 } 84 }
85 /* prevent any recursions within lockdep from causing deadlocks */
86 current->lockdep_recursion++;
84 return 1; 87 return 1;
85} 88}
86 89
@@ -89,6 +92,7 @@ static inline int graph_unlock(void)
89 if (debug_locks && !__raw_spin_is_locked(&lockdep_lock)) 92 if (debug_locks && !__raw_spin_is_locked(&lockdep_lock))
90 return DEBUG_LOCKS_WARN_ON(1); 93 return DEBUG_LOCKS_WARN_ON(1);
91 94
95 current->lockdep_recursion--;
92 __raw_spin_unlock(&lockdep_lock); 96 __raw_spin_unlock(&lockdep_lock);
93 return 0; 97 return 0;
94} 98}
@@ -120,6 +124,15 @@ static struct lock_list list_entries[MAX_LOCKDEP_ENTRIES];
120unsigned long nr_lock_classes; 124unsigned long nr_lock_classes;
121static struct lock_class lock_classes[MAX_LOCKDEP_KEYS]; 125static struct lock_class lock_classes[MAX_LOCKDEP_KEYS];
122 126
127static inline struct lock_class *hlock_class(struct held_lock *hlock)
128{
129 if (!hlock->class_idx) {
130 DEBUG_LOCKS_WARN_ON(1);
131 return NULL;
132 }
133 return lock_classes + hlock->class_idx - 1;
134}
135
123#ifdef CONFIG_LOCK_STAT 136#ifdef CONFIG_LOCK_STAT
124static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS], lock_stats); 137static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS], lock_stats);
125 138
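The new hlock_class() helper exists because struct held_lock now records its class as a small 1-based index into the static lock_classes[] array instead of a pointer, with index 0 reserved to mean "no class set" (hence the warning and the "- 1"). A sketch of that encoding convention over a toy table (struct and field names are illustrative, not lockdep's):

/* Sketch: 1-based indices into a static table, with 0 meaning "unset". */
#include <assert.h>
#include <stdio.h>

struct klass {
        const char *name;
};

static struct klass classes[] = { { "A" }, { "B" }, { "C" } };

struct holder {
        unsigned int class_idx;         /* 0 = none, i + 1 = classes[i] */
};

static struct klass *holder_class(struct holder *h)
{
        if (!h->class_idx) {
                assert(0 && "holder has no class");
                return NULL;
        }
        return &classes[h->class_idx - 1];
}

int main(void)
{
        struct holder h = { .class_idx = 2 };   /* refers to classes[1] */

        printf("%s\n", holder_class(&h)->name); /* prints B */
        return 0;
}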
@@ -218,7 +231,7 @@ static void lock_release_holdtime(struct held_lock *hlock)
218 231
219 holdtime = sched_clock() - hlock->holdtime_stamp; 232 holdtime = sched_clock() - hlock->holdtime_stamp;
220 233
221 stats = get_lock_stats(hlock->class); 234 stats = get_lock_stats(hlock_class(hlock));
222 if (hlock->read) 235 if (hlock->read)
223 lock_time_inc(&stats->read_holdtime, holdtime); 236 lock_time_inc(&stats->read_holdtime, holdtime);
224 else 237 else
@@ -368,6 +381,19 @@ unsigned int nr_process_chains;
368unsigned int max_lockdep_depth; 381unsigned int max_lockdep_depth;
369unsigned int max_recursion_depth; 382unsigned int max_recursion_depth;
370 383
384static unsigned int lockdep_dependency_gen_id;
385
386static bool lockdep_dependency_visit(struct lock_class *source,
387 unsigned int depth)
388{
389 if (!depth)
390 lockdep_dependency_gen_id++;
391 if (source->dep_gen_id == lockdep_dependency_gen_id)
392 return true;
393 source->dep_gen_id = lockdep_dependency_gen_id;
394 return false;
395}
396
371#ifdef CONFIG_DEBUG_LOCKDEP 397#ifdef CONFIG_DEBUG_LOCKDEP
372/* 398/*
373 * We cannot printk in early bootup code. Not even early_printk() 399 * We cannot printk in early bootup code. Not even early_printk()
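lockdep_dependency_visit() above implements "visited" marking without a clearing pass: a global generation counter is bumped at the start of every traversal (depth == 0), and a node counts as visited if it already carries the current generation stamp. A self-contained sketch of the trick on a tiny graph, in the same shape the lockdep_count_*_deps() functions later in this patch use:

/* Sketch: per-traversal generation stamps instead of a reset pass over
 * all "visited" flags. */
#include <stdio.h>

#define NNODES 4

struct node {
        int edges[NNODES];              /* adjacency-matrix row */
        unsigned int gen_seen;          /* generation this node was last seen in */
};

static struct node graph[NNODES];
static unsigned int current_gen;

static int visited(struct node *n, unsigned int depth)
{
        if (!depth)
                current_gen++;          /* new traversal, new generation */
        if (n->gen_seen == current_gen)
                return 1;
        n->gen_seen = current_gen;
        return 0;
}

static unsigned long count_reachable(int idx, unsigned int depth)
{
        unsigned long ret = 1;

        if (visited(&graph[idx], depth))
                return 0;
        for (int j = 0; j < NNODES; j++)
                if (graph[idx].edges[j])
                        ret += count_reachable(j, depth + 1);
        return ret;
}

int main(void)
{
        graph[0].edges[1] = graph[1].edges[2] = graph[2].edges[0] = 1;  /* a cycle */
        printf("%lu reachable from node 0\n", count_reachable(0, 0));
        printf("%lu reachable from node 3\n", count_reachable(3, 0));
        return 0;
}

Without the visit check, the cycle in the example would recurse forever; with it, each traversal touches every node at most once and no state needs to be cleared afterwards.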
@@ -501,7 +527,7 @@ static void print_lockdep_cache(struct lockdep_map *lock)
501 527
502static void print_lock(struct held_lock *hlock) 528static void print_lock(struct held_lock *hlock)
503{ 529{
504 print_lock_name(hlock->class); 530 print_lock_name(hlock_class(hlock));
505 printk(", at: "); 531 printk(", at: ");
506 print_ip_sym(hlock->acquire_ip); 532 print_ip_sym(hlock->acquire_ip);
507} 533}
@@ -554,6 +580,9 @@ static void print_lock_dependencies(struct lock_class *class, int depth)
554{ 580{
555 struct lock_list *entry; 581 struct lock_list *entry;
556 582
583 if (lockdep_dependency_visit(class, depth))
584 return;
585
557 if (DEBUG_LOCKS_WARN_ON(depth >= 20)) 586 if (DEBUG_LOCKS_WARN_ON(depth >= 20))
558 return; 587 return;
559 588
@@ -846,11 +875,11 @@ static int add_lock_to_list(struct lock_class *class, struct lock_class *this,
846 if (!entry) 875 if (!entry)
847 return 0; 876 return 0;
848 877
849 entry->class = this;
850 entry->distance = distance;
851 if (!save_trace(&entry->trace)) 878 if (!save_trace(&entry->trace))
852 return 0; 879 return 0;
853 880
881 entry->class = this;
882 entry->distance = distance;
854 /* 883 /*
855 * Since we never remove from the dependency list, the list can 884 * Since we never remove from the dependency list, the list can
856 * be walked lockless by other CPUs, it's only allocation 885 * be walked lockless by other CPUs, it's only allocation
@@ -928,7 +957,7 @@ static noinline int print_circular_bug_tail(void)
928 if (debug_locks_silent) 957 if (debug_locks_silent)
929 return 0; 958 return 0;
930 959
931 this.class = check_source->class; 960 this.class = hlock_class(check_source);
932 if (!save_trace(&this.trace)) 961 if (!save_trace(&this.trace))
933 return 0; 962 return 0;
934 963
@@ -955,6 +984,67 @@ static int noinline print_infinite_recursion_bug(void)
955 return 0; 984 return 0;
956} 985}
957 986
987unsigned long __lockdep_count_forward_deps(struct lock_class *class,
988 unsigned int depth)
989{
990 struct lock_list *entry;
991 unsigned long ret = 1;
992
993 if (lockdep_dependency_visit(class, depth))
994 return 0;
995
996 /*
997 * Recurse this class's dependency list:
998 */
999 list_for_each_entry(entry, &class->locks_after, entry)
1000 ret += __lockdep_count_forward_deps(entry->class, depth + 1);
1001
1002 return ret;
1003}
1004
1005unsigned long lockdep_count_forward_deps(struct lock_class *class)
1006{
1007 unsigned long ret, flags;
1008
1009 local_irq_save(flags);
1010 __raw_spin_lock(&lockdep_lock);
1011 ret = __lockdep_count_forward_deps(class, 0);
1012 __raw_spin_unlock(&lockdep_lock);
1013 local_irq_restore(flags);
1014
1015 return ret;
1016}
1017
1018unsigned long __lockdep_count_backward_deps(struct lock_class *class,
1019 unsigned int depth)
1020{
1021 struct lock_list *entry;
1022 unsigned long ret = 1;
1023
1024 if (lockdep_dependency_visit(class, depth))
1025 return 0;
1026 /*
1027 * Recurse this class's dependency list:
1028 */
1029 list_for_each_entry(entry, &class->locks_before, entry)
1030 ret += __lockdep_count_backward_deps(entry->class, depth + 1);
1031
1032 return ret;
1033}
1034
1035unsigned long lockdep_count_backward_deps(struct lock_class *class)
1036{
1037 unsigned long ret, flags;
1038
1039 local_irq_save(flags);
1040 __raw_spin_lock(&lockdep_lock);
1041 ret = __lockdep_count_backward_deps(class, 0);
1042 __raw_spin_unlock(&lockdep_lock);
1043 local_irq_restore(flags);
1044
1045 return ret;
1046}
1047
958/* 1048/*
959 * Prove that the dependency graph starting at <entry> can not 1049 * Prove that the dependency graph starting at <entry> can not
960 * lead to <target>. Print an error and return 0 if it does. 1050 * lead to <target>. Print an error and return 0 if it does.
@@ -964,6 +1054,9 @@ check_noncircular(struct lock_class *source, unsigned int depth)
964{ 1054{
965 struct lock_list *entry; 1055 struct lock_list *entry;
966 1056
1057 if (lockdep_dependency_visit(source, depth))
1058 return 1;
1059
967 debug_atomic_inc(&nr_cyclic_check_recursions); 1060 debug_atomic_inc(&nr_cyclic_check_recursions);
968 if (depth > max_recursion_depth) 1061 if (depth > max_recursion_depth)
969 max_recursion_depth = depth; 1062 max_recursion_depth = depth;
@@ -973,7 +1066,7 @@ check_noncircular(struct lock_class *source, unsigned int depth)
973 * Check this lock's dependency list: 1066 * Check this lock's dependency list:
974 */ 1067 */
975 list_for_each_entry(entry, &source->locks_after, entry) { 1068 list_for_each_entry(entry, &source->locks_after, entry) {
976 if (entry->class == check_target->class) 1069 if (entry->class == hlock_class(check_target))
977 return print_circular_bug_header(entry, depth+1); 1070 return print_circular_bug_header(entry, depth+1);
978 debug_atomic_inc(&nr_cyclic_checks); 1071 debug_atomic_inc(&nr_cyclic_checks);
979 if (!check_noncircular(entry->class, depth+1)) 1072 if (!check_noncircular(entry->class, depth+1))
@@ -982,7 +1075,7 @@ check_noncircular(struct lock_class *source, unsigned int depth)
982 return 1; 1075 return 1;
983} 1076}
984 1077
985#ifdef CONFIG_TRACE_IRQFLAGS 1078#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING)
986/* 1079/*
987 * Forwards and backwards subgraph searching, for the purposes of 1080 * Forwards and backwards subgraph searching, for the purposes of
988 * proving that two subgraphs can be connected by a new dependency 1081 * proving that two subgraphs can be connected by a new dependency
@@ -1007,6 +1100,9 @@ find_usage_forwards(struct lock_class *source, unsigned int depth)
1007 struct lock_list *entry; 1100 struct lock_list *entry;
1008 int ret; 1101 int ret;
1009 1102
1103 if (lockdep_dependency_visit(source, depth))
1104 return 1;
1105
1010 if (depth > max_recursion_depth) 1106 if (depth > max_recursion_depth)
1011 max_recursion_depth = depth; 1107 max_recursion_depth = depth;
1012 if (depth >= RECURSION_LIMIT) 1108 if (depth >= RECURSION_LIMIT)
@@ -1046,6 +1142,9 @@ find_usage_backwards(struct lock_class *source, unsigned int depth)
1046 struct lock_list *entry; 1142 struct lock_list *entry;
1047 int ret; 1143 int ret;
1048 1144
1145 if (lockdep_dependency_visit(source, depth))
1146 return 1;
1147
1049 if (!__raw_spin_is_locked(&lockdep_lock)) 1148 if (!__raw_spin_is_locked(&lockdep_lock))
1050 return DEBUG_LOCKS_WARN_ON(1); 1149 return DEBUG_LOCKS_WARN_ON(1);
1051 1150
@@ -1060,6 +1159,11 @@ find_usage_backwards(struct lock_class *source, unsigned int depth)
1060 return 2; 1159 return 2;
1061 } 1160 }
1062 1161
1162 if (!source && debug_locks_off_graph_unlock()) {
1163 WARN_ON(1);
1164 return 0;
1165 }
1166
1063 /* 1167 /*
1064 * Check this lock's dependency list: 1168 * Check this lock's dependency list:
1065 */ 1169 */
@@ -1099,9 +1203,9 @@ print_bad_irq_dependency(struct task_struct *curr,
1099 printk("\nand this task is already holding:\n"); 1203 printk("\nand this task is already holding:\n");
1100 print_lock(prev); 1204 print_lock(prev);
1101 printk("which would create a new lock dependency:\n"); 1205 printk("which would create a new lock dependency:\n");
1102 print_lock_name(prev->class); 1206 print_lock_name(hlock_class(prev));
1103 printk(" ->"); 1207 printk(" ->");
1104 print_lock_name(next->class); 1208 print_lock_name(hlock_class(next));
1105 printk("\n"); 1209 printk("\n");
1106 1210
1107 printk("\nbut this new dependency connects a %s-irq-safe lock:\n", 1211 printk("\nbut this new dependency connects a %s-irq-safe lock:\n",
@@ -1142,12 +1246,12 @@ check_usage(struct task_struct *curr, struct held_lock *prev,
1142 1246
1143 find_usage_bit = bit_backwards; 1247 find_usage_bit = bit_backwards;
1144 /* fills in <backwards_match> */ 1248 /* fills in <backwards_match> */
1145 ret = find_usage_backwards(prev->class, 0); 1249 ret = find_usage_backwards(hlock_class(prev), 0);
1146 if (!ret || ret == 1) 1250 if (!ret || ret == 1)
1147 return ret; 1251 return ret;
1148 1252
1149 find_usage_bit = bit_forwards; 1253 find_usage_bit = bit_forwards;
1150 ret = find_usage_forwards(next->class, 0); 1254 ret = find_usage_forwards(hlock_class(next), 0);
1151 if (!ret || ret == 1) 1255 if (!ret || ret == 1)
1152 return ret; 1256 return ret;
1153 /* ret == 2 */ 1257 /* ret == 2 */
@@ -1268,18 +1372,32 @@ check_deadlock(struct task_struct *curr, struct held_lock *next,
1268 struct lockdep_map *next_instance, int read) 1372 struct lockdep_map *next_instance, int read)
1269{ 1373{
1270 struct held_lock *prev; 1374 struct held_lock *prev;
1375 struct held_lock *nest = NULL;
1271 int i; 1376 int i;
1272 1377
1273 for (i = 0; i < curr->lockdep_depth; i++) { 1378 for (i = 0; i < curr->lockdep_depth; i++) {
1274 prev = curr->held_locks + i; 1379 prev = curr->held_locks + i;
1275 if (prev->class != next->class) 1380
1381 if (prev->instance == next->nest_lock)
1382 nest = prev;
1383
1384 if (hlock_class(prev) != hlock_class(next))
1276 continue; 1385 continue;
1386
1277 /* 1387 /*
1278 * Allow read-after-read recursion of the same 1388 * Allow read-after-read recursion of the same
1279 * lock class (i.e. read_lock(lock)+read_lock(lock)): 1389 * lock class (i.e. read_lock(lock)+read_lock(lock)):
1280 */ 1390 */
1281 if ((read == 2) && prev->read) 1391 if ((read == 2) && prev->read)
1282 return 2; 1392 return 2;
1393
1394 /*
1395 * We're holding the nest_lock, which serializes this lock's
1396 * nesting behaviour.
1397 */
1398 if (nest)
1399 return 2;
1400
1283 return print_deadlock_bug(curr, prev, next); 1401 return print_deadlock_bug(curr, prev, next);
1284 } 1402 }
1285 return 1; 1403 return 1;
@@ -1325,7 +1443,7 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
1325 */ 1443 */
1326 check_source = next; 1444 check_source = next;
1327 check_target = prev; 1445 check_target = prev;
1328 if (!(check_noncircular(next->class, 0))) 1446 if (!(check_noncircular(hlock_class(next), 0)))
1329 return print_circular_bug_tail(); 1447 return print_circular_bug_tail();
1330 1448
1331 if (!check_prev_add_irq(curr, prev, next)) 1449 if (!check_prev_add_irq(curr, prev, next))
@@ -1349,8 +1467,8 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
1349 * chains - the second one will be new, but L1 already has 1467 * chains - the second one will be new, but L1 already has
1350 * L2 added to its dependency list, due to the first chain.) 1468 * L2 added to its dependency list, due to the first chain.)
1351 */ 1469 */
1352 list_for_each_entry(entry, &prev->class->locks_after, entry) { 1470 list_for_each_entry(entry, &hlock_class(prev)->locks_after, entry) {
1353 if (entry->class == next->class) { 1471 if (entry->class == hlock_class(next)) {
1354 if (distance == 1) 1472 if (distance == 1)
1355 entry->distance = 1; 1473 entry->distance = 1;
1356 return 2; 1474 return 2;
@@ -1361,26 +1479,28 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
1361 * Ok, all validations passed, add the new lock 1479 * Ok, all validations passed, add the new lock
1362 * to the previous lock's dependency list: 1480 * to the previous lock's dependency list:
1363 */ 1481 */
1364 ret = add_lock_to_list(prev->class, next->class, 1482 ret = add_lock_to_list(hlock_class(prev), hlock_class(next),
1365 &prev->class->locks_after, next->acquire_ip, distance); 1483 &hlock_class(prev)->locks_after,
1484 next->acquire_ip, distance);
1366 1485
1367 if (!ret) 1486 if (!ret)
1368 return 0; 1487 return 0;
1369 1488
1370 ret = add_lock_to_list(next->class, prev->class, 1489 ret = add_lock_to_list(hlock_class(next), hlock_class(prev),
1371 &next->class->locks_before, next->acquire_ip, distance); 1490 &hlock_class(next)->locks_before,
1491 next->acquire_ip, distance);
1372 if (!ret) 1492 if (!ret)
1373 return 0; 1493 return 0;
1374 1494
1375 /* 1495 /*
1376 * Debugging printouts: 1496 * Debugging printouts:
1377 */ 1497 */
1378 if (verbose(prev->class) || verbose(next->class)) { 1498 if (verbose(hlock_class(prev)) || verbose(hlock_class(next))) {
1379 graph_unlock(); 1499 graph_unlock();
1380 printk("\n new dependency: "); 1500 printk("\n new dependency: ");
1381 print_lock_name(prev->class); 1501 print_lock_name(hlock_class(prev));
1382 printk(" => "); 1502 printk(" => ");
1383 print_lock_name(next->class); 1503 print_lock_name(hlock_class(next));
1384 printk("\n"); 1504 printk("\n");
1385 dump_stack(); 1505 dump_stack();
1386 return graph_lock(); 1506 return graph_lock();
@@ -1458,7 +1578,14 @@ out_bug:
1458} 1578}
1459 1579
1460unsigned long nr_lock_chains; 1580unsigned long nr_lock_chains;
1461static struct lock_chain lock_chains[MAX_LOCKDEP_CHAINS]; 1581struct lock_chain lock_chains[MAX_LOCKDEP_CHAINS];
1582int nr_chain_hlocks;
1583static u16 chain_hlocks[MAX_LOCKDEP_CHAIN_HLOCKS];
1584
1585struct lock_class *lock_chain_get_class(struct lock_chain *chain, int i)
1586{
1587 return lock_classes + chain_hlocks[chain->base + i];
1588}
1462 1589
1463/* 1590/*
1464 * Look up a dependency chain. If the key is not present yet then 1591 * Look up a dependency chain. If the key is not present yet then
@@ -1466,10 +1593,15 @@ static struct lock_chain lock_chains[MAX_LOCKDEP_CHAINS];
1466 * validated. If the key is already hashed, return 0. 1593 * validated. If the key is already hashed, return 0.
1467 * (On return with 1 graph_lock is held.) 1594 * (On return with 1 graph_lock is held.)
1468 */ 1595 */
1469static inline int lookup_chain_cache(u64 chain_key, struct lock_class *class) 1596static inline int lookup_chain_cache(struct task_struct *curr,
1597 struct held_lock *hlock,
1598 u64 chain_key)
1470{ 1599{
1600 struct lock_class *class = hlock_class(hlock);
1471 struct list_head *hash_head = chainhashentry(chain_key); 1601 struct list_head *hash_head = chainhashentry(chain_key);
1472 struct lock_chain *chain; 1602 struct lock_chain *chain;
1603 struct held_lock *hlock_curr, *hlock_next;
1604 int i, j, n, cn;
1473 1605
1474 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) 1606 if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
1475 return 0; 1607 return 0;
@@ -1517,6 +1649,32 @@ cache_hit:
1517 } 1649 }
1518 chain = lock_chains + nr_lock_chains++; 1650 chain = lock_chains + nr_lock_chains++;
1519 chain->chain_key = chain_key; 1651 chain->chain_key = chain_key;
1652 chain->irq_context = hlock->irq_context;
1653 /* Find the first held_lock of current chain */
1654 hlock_next = hlock;
1655 for (i = curr->lockdep_depth - 1; i >= 0; i--) {
1656 hlock_curr = curr->held_locks + i;
1657 if (hlock_curr->irq_context != hlock_next->irq_context)
1658 break;
1659 hlock_next = hlock;
1660 }
1661 i++;
1662 chain->depth = curr->lockdep_depth + 1 - i;
1663 cn = nr_chain_hlocks;
1664 while (cn + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS) {
1665 n = cmpxchg(&nr_chain_hlocks, cn, cn + chain->depth);
1666 if (n == cn)
1667 break;
1668 cn = n;
1669 }
1670 if (likely(cn + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS)) {
1671 chain->base = cn;
1672 for (j = 0; j < chain->depth - 1; j++, i++) {
1673 int lock_id = curr->held_locks[i].class_idx - 1;
1674 chain_hlocks[chain->base + j] = lock_id;
1675 }
1676 chain_hlocks[chain->base + j] = class - lock_classes;
1677 }
1520 list_add_tail_rcu(&chain->entry, hash_head); 1678 list_add_tail_rcu(&chain->entry, hash_head);
1521 debug_atomic_inc(&chain_lookup_misses); 1679 debug_atomic_inc(&chain_lookup_misses);
1522 inc_chains(); 1680 inc_chains();
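Reserving space in chain_hlocks[] above is done with a cmpxchg loop on nr_chain_hlocks: read the current end of the array, try to advance it by chain->depth, and retry if another CPU advanced it first; only the winner writes into its reserved slots. A userspace sketch of the same lock-free bump reservation using C11 atomics (array size and names are illustrative):

/* Sketch: reserve 'want' consecutive slots in a shared array with a
 * compare-and-swap loop; no lock is needed for the reservation itself. */
#include <stdatomic.h>
#include <stdio.h>

#define CAPACITY 64

static unsigned short slots[CAPACITY];
static atomic_int next_free;            /* plays the role of nr_chain_hlocks */

/* Returns the base index of the reservation, or -1 if the array is full. */
static int reserve(int want)
{
        int cur = atomic_load(&next_free);

        while (cur + want <= CAPACITY) {
                if (atomic_compare_exchange_weak(&next_free, &cur, cur + want))
                        return cur;     /* cur .. cur + want - 1 is now ours */
                /* cur has been reloaded by the failed CAS; try again */
        }
        return -1;
}

int main(void)
{
        int base = reserve(3);

        if (base >= 0) {
                slots[base] = 7;        /* fill in the reserved entries */
                printf("reserved slots %d..%d\n", base, base + 2);
        }
        return 0;
}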
@@ -1538,7 +1696,7 @@ static int validate_chain(struct task_struct *curr, struct lockdep_map *lock,
1538 * graph_lock for us) 1696 * graph_lock for us)
1539 */ 1697 */
1540 if (!hlock->trylock && (hlock->check == 2) && 1698 if (!hlock->trylock && (hlock->check == 2) &&
1541 lookup_chain_cache(chain_key, hlock->class)) { 1699 lookup_chain_cache(curr, hlock, chain_key)) {
1542 /* 1700 /*
1543 * Check whether last held lock: 1701 * Check whether last held lock:
1544 * 1702 *
@@ -1601,14 +1759,13 @@ static void check_chain_key(struct task_struct *curr)
1601 hlock = curr->held_locks + i; 1759 hlock = curr->held_locks + i;
1602 if (chain_key != hlock->prev_chain_key) { 1760 if (chain_key != hlock->prev_chain_key) {
1603 debug_locks_off(); 1761 debug_locks_off();
1604 printk("hm#1, depth: %u [%u], %016Lx != %016Lx\n", 1762 WARN(1, "hm#1, depth: %u [%u], %016Lx != %016Lx\n",
1605 curr->lockdep_depth, i, 1763 curr->lockdep_depth, i,
1606 (unsigned long long)chain_key, 1764 (unsigned long long)chain_key,
1607 (unsigned long long)hlock->prev_chain_key); 1765 (unsigned long long)hlock->prev_chain_key);
1608 WARN_ON(1);
1609 return; 1766 return;
1610 } 1767 }
1611 id = hlock->class - lock_classes; 1768 id = hlock->class_idx - 1;
1612 if (DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS)) 1769 if (DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS))
1613 return; 1770 return;
1614 1771
@@ -1620,11 +1777,10 @@ static void check_chain_key(struct task_struct *curr)
1620 } 1777 }
1621 if (chain_key != curr->curr_chain_key) { 1778 if (chain_key != curr->curr_chain_key) {
1622 debug_locks_off(); 1779 debug_locks_off();
1623 printk("hm#2, depth: %u [%u], %016Lx != %016Lx\n", 1780 WARN(1, "hm#2, depth: %u [%u], %016Lx != %016Lx\n",
1624 curr->lockdep_depth, i, 1781 curr->lockdep_depth, i,
1625 (unsigned long long)chain_key, 1782 (unsigned long long)chain_key,
1626 (unsigned long long)curr->curr_chain_key); 1783 (unsigned long long)curr->curr_chain_key);
1627 WARN_ON(1);
1628 } 1784 }
1629#endif 1785#endif
1630} 1786}
@@ -1653,7 +1809,7 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this,
1653 print_lock(this); 1809 print_lock(this);
1654 1810
1655 printk("{%s} state was registered at:\n", usage_str[prev_bit]); 1811 printk("{%s} state was registered at:\n", usage_str[prev_bit]);
1656 print_stack_trace(this->class->usage_traces + prev_bit, 1); 1812 print_stack_trace(hlock_class(this)->usage_traces + prev_bit, 1);
1657 1813
1658 print_irqtrace_events(curr); 1814 print_irqtrace_events(curr);
1659 printk("\nother info that might help us debug this:\n"); 1815 printk("\nother info that might help us debug this:\n");
@@ -1672,7 +1828,7 @@ static inline int
1672valid_state(struct task_struct *curr, struct held_lock *this, 1828valid_state(struct task_struct *curr, struct held_lock *this,
1673 enum lock_usage_bit new_bit, enum lock_usage_bit bad_bit) 1829 enum lock_usage_bit new_bit, enum lock_usage_bit bad_bit)
1674{ 1830{
1675 if (unlikely(this->class->usage_mask & (1 << bad_bit))) 1831 if (unlikely(hlock_class(this)->usage_mask & (1 << bad_bit)))
1676 return print_usage_bug(curr, this, bad_bit, new_bit); 1832 return print_usage_bug(curr, this, bad_bit, new_bit);
1677 return 1; 1833 return 1;
1678} 1834}
@@ -1680,7 +1836,7 @@ valid_state(struct task_struct *curr, struct held_lock *this,
1680static int mark_lock(struct task_struct *curr, struct held_lock *this, 1836static int mark_lock(struct task_struct *curr, struct held_lock *this,
1681 enum lock_usage_bit new_bit); 1837 enum lock_usage_bit new_bit);
1682 1838
1683#ifdef CONFIG_TRACE_IRQFLAGS 1839#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_PROVE_LOCKING)
1684 1840
1685/* 1841/*
1686 * print irq inversion bug: 1842 * print irq inversion bug:
@@ -1711,7 +1867,7 @@ print_irq_inversion_bug(struct task_struct *curr, struct lock_class *other,
1711 lockdep_print_held_locks(curr); 1867 lockdep_print_held_locks(curr);
1712 1868
1713 printk("\nthe first lock's dependencies:\n"); 1869 printk("\nthe first lock's dependencies:\n");
1714 print_lock_dependencies(this->class, 0); 1870 print_lock_dependencies(hlock_class(this), 0);
1715 1871
1716 printk("\nthe second lock's dependencies:\n"); 1872 printk("\nthe second lock's dependencies:\n");
1717 print_lock_dependencies(other, 0); 1873 print_lock_dependencies(other, 0);
@@ -1734,7 +1890,7 @@ check_usage_forwards(struct task_struct *curr, struct held_lock *this,
1734 1890
1735 find_usage_bit = bit; 1891 find_usage_bit = bit;
1736 /* fills in <forwards_match> */ 1892 /* fills in <forwards_match> */
1737 ret = find_usage_forwards(this->class, 0); 1893 ret = find_usage_forwards(hlock_class(this), 0);
1738 if (!ret || ret == 1) 1894 if (!ret || ret == 1)
1739 return ret; 1895 return ret;
1740 1896
@@ -1753,7 +1909,7 @@ check_usage_backwards(struct task_struct *curr, struct held_lock *this,
1753 1909
1754 find_usage_bit = bit; 1910 find_usage_bit = bit;
1755 /* fills in <backwards_match> */ 1911 /* fills in <backwards_match> */
1756 ret = find_usage_backwards(this->class, 0); 1912 ret = find_usage_backwards(hlock_class(this), 0);
1757 if (!ret || ret == 1) 1913 if (!ret || ret == 1)
1758 return ret; 1914 return ret;
1759 1915
@@ -1819,7 +1975,7 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
1819 LOCK_ENABLED_HARDIRQS_READ, "hard-read")) 1975 LOCK_ENABLED_HARDIRQS_READ, "hard-read"))
1820 return 0; 1976 return 0;
1821#endif 1977#endif
1822 if (hardirq_verbose(this->class)) 1978 if (hardirq_verbose(hlock_class(this)))
1823 ret = 2; 1979 ret = 2;
1824 break; 1980 break;
1825 case LOCK_USED_IN_SOFTIRQ: 1981 case LOCK_USED_IN_SOFTIRQ:
@@ -1844,7 +2000,7 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
1844 LOCK_ENABLED_SOFTIRQS_READ, "soft-read")) 2000 LOCK_ENABLED_SOFTIRQS_READ, "soft-read"))
1845 return 0; 2001 return 0;
1846#endif 2002#endif
1847 if (softirq_verbose(this->class)) 2003 if (softirq_verbose(hlock_class(this)))
1848 ret = 2; 2004 ret = 2;
1849 break; 2005 break;
1850 case LOCK_USED_IN_HARDIRQ_READ: 2006 case LOCK_USED_IN_HARDIRQ_READ:
@@ -1857,7 +2013,7 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
1857 if (!check_usage_forwards(curr, this, 2013 if (!check_usage_forwards(curr, this,
1858 LOCK_ENABLED_HARDIRQS, "hard")) 2014 LOCK_ENABLED_HARDIRQS, "hard"))
1859 return 0; 2015 return 0;
1860 if (hardirq_verbose(this->class)) 2016 if (hardirq_verbose(hlock_class(this)))
1861 ret = 2; 2017 ret = 2;
1862 break; 2018 break;
1863 case LOCK_USED_IN_SOFTIRQ_READ: 2019 case LOCK_USED_IN_SOFTIRQ_READ:
@@ -1870,7 +2026,7 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
1870 if (!check_usage_forwards(curr, this, 2026 if (!check_usage_forwards(curr, this,
1871 LOCK_ENABLED_SOFTIRQS, "soft")) 2027 LOCK_ENABLED_SOFTIRQS, "soft"))
1872 return 0; 2028 return 0;
1873 if (softirq_verbose(this->class)) 2029 if (softirq_verbose(hlock_class(this)))
1874 ret = 2; 2030 ret = 2;
1875 break; 2031 break;
1876 case LOCK_ENABLED_HARDIRQS: 2032 case LOCK_ENABLED_HARDIRQS:
@@ -1896,7 +2052,7 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
1896 LOCK_USED_IN_HARDIRQ_READ, "hard-read")) 2052 LOCK_USED_IN_HARDIRQ_READ, "hard-read"))
1897 return 0; 2053 return 0;
1898#endif 2054#endif
1899 if (hardirq_verbose(this->class)) 2055 if (hardirq_verbose(hlock_class(this)))
1900 ret = 2; 2056 ret = 2;
1901 break; 2057 break;
1902 case LOCK_ENABLED_SOFTIRQS: 2058 case LOCK_ENABLED_SOFTIRQS:
@@ -1922,7 +2078,7 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
1922 LOCK_USED_IN_SOFTIRQ_READ, "soft-read")) 2078 LOCK_USED_IN_SOFTIRQ_READ, "soft-read"))
1923 return 0; 2079 return 0;
1924#endif 2080#endif
1925 if (softirq_verbose(this->class)) 2081 if (softirq_verbose(hlock_class(this)))
1926 ret = 2; 2082 ret = 2;
1927 break; 2083 break;
1928 case LOCK_ENABLED_HARDIRQS_READ: 2084 case LOCK_ENABLED_HARDIRQS_READ:
@@ -1937,7 +2093,7 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
1937 LOCK_USED_IN_HARDIRQ, "hard")) 2093 LOCK_USED_IN_HARDIRQ, "hard"))
1938 return 0; 2094 return 0;
1939#endif 2095#endif
1940 if (hardirq_verbose(this->class)) 2096 if (hardirq_verbose(hlock_class(this)))
1941 ret = 2; 2097 ret = 2;
1942 break; 2098 break;
1943 case LOCK_ENABLED_SOFTIRQS_READ: 2099 case LOCK_ENABLED_SOFTIRQS_READ:
@@ -1952,7 +2108,7 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
1952 LOCK_USED_IN_SOFTIRQ, "soft")) 2108 LOCK_USED_IN_SOFTIRQ, "soft"))
1953 return 0; 2109 return 0;
1954#endif 2110#endif
1955 if (softirq_verbose(this->class)) 2111 if (softirq_verbose(hlock_class(this)))
1956 ret = 2; 2112 ret = 2;
1957 break; 2113 break;
1958 default: 2114 default:
@@ -2013,11 +2169,13 @@ void early_boot_irqs_on(void)
2013/* 2169/*
2014 * Hardirqs will be enabled: 2170 * Hardirqs will be enabled:
2015 */ 2171 */
2016void trace_hardirqs_on(void) 2172void trace_hardirqs_on_caller(unsigned long a0)
2017{ 2173{
2018 struct task_struct *curr = current; 2174 struct task_struct *curr = current;
2019 unsigned long ip; 2175 unsigned long ip;
2020 2176
2177 time_hardirqs_on(CALLER_ADDR0, a0);
2178
2021 if (unlikely(!debug_locks || current->lockdep_recursion)) 2179 if (unlikely(!debug_locks || current->lockdep_recursion))
2022 return; 2180 return;
2023 2181
@@ -2055,16 +2213,23 @@ void trace_hardirqs_on(void)
2055 curr->hardirq_enable_event = ++curr->irq_events; 2213 curr->hardirq_enable_event = ++curr->irq_events;
2056 debug_atomic_inc(&hardirqs_on_events); 2214 debug_atomic_inc(&hardirqs_on_events);
2057} 2215}
2216EXPORT_SYMBOL(trace_hardirqs_on_caller);
2058 2217
2218void trace_hardirqs_on(void)
2219{
2220 trace_hardirqs_on_caller(CALLER_ADDR0);
2221}
2059EXPORT_SYMBOL(trace_hardirqs_on); 2222EXPORT_SYMBOL(trace_hardirqs_on);
2060 2223
2061/* 2224/*
2062 * Hardirqs were disabled: 2225 * Hardirqs were disabled:
2063 */ 2226 */
2064void trace_hardirqs_off(void) 2227void trace_hardirqs_off_caller(unsigned long a0)
2065{ 2228{
2066 struct task_struct *curr = current; 2229 struct task_struct *curr = current;
2067 2230
2231 time_hardirqs_off(CALLER_ADDR0, a0);
2232
2068 if (unlikely(!debug_locks || current->lockdep_recursion)) 2233 if (unlikely(!debug_locks || current->lockdep_recursion))
2069 return; 2234 return;
2070 2235
@@ -2082,7 +2247,12 @@ void trace_hardirqs_off(void)
2082 } else 2247 } else
2083 debug_atomic_inc(&redundant_hardirqs_off); 2248 debug_atomic_inc(&redundant_hardirqs_off);
2084} 2249}
2250EXPORT_SYMBOL(trace_hardirqs_off_caller);
2085 2251
2252void trace_hardirqs_off(void)
2253{
2254 trace_hardirqs_off_caller(CALLER_ADDR0);
2255}
2086EXPORT_SYMBOL(trace_hardirqs_off); 2256EXPORT_SYMBOL(trace_hardirqs_off);
2087 2257
2088/* 2258/*
@@ -2246,7 +2416,7 @@ static inline int separate_irq_context(struct task_struct *curr,
2246 * Mark a lock with a usage bit, and validate the state transition: 2416 * Mark a lock with a usage bit, and validate the state transition:
2247 */ 2417 */
2248static int mark_lock(struct task_struct *curr, struct held_lock *this, 2418static int mark_lock(struct task_struct *curr, struct held_lock *this,
2249 enum lock_usage_bit new_bit) 2419 enum lock_usage_bit new_bit)
2250{ 2420{
2251 unsigned int new_mask = 1 << new_bit, ret = 1; 2421 unsigned int new_mask = 1 << new_bit, ret = 1;
2252 2422
@@ -2254,7 +2424,7 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
2254 * If already set then do not dirty the cacheline, 2424 * If already set then do not dirty the cacheline,
2255 * nor do any checks: 2425 * nor do any checks:
2256 */ 2426 */
2257 if (likely(this->class->usage_mask & new_mask)) 2427 if (likely(hlock_class(this)->usage_mask & new_mask))
2258 return 1; 2428 return 1;
2259 2429
2260 if (!graph_lock()) 2430 if (!graph_lock())
@@ -2262,14 +2432,14 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
2262 /* 2432 /*
2263 * Make sure we didnt race: 2433 * Make sure we didnt race:
2264 */ 2434 */
2265 if (unlikely(this->class->usage_mask & new_mask)) { 2435 if (unlikely(hlock_class(this)->usage_mask & new_mask)) {
2266 graph_unlock(); 2436 graph_unlock();
2267 return 1; 2437 return 1;
2268 } 2438 }
2269 2439
2270 this->class->usage_mask |= new_mask; 2440 hlock_class(this)->usage_mask |= new_mask;
2271 2441
2272 if (!save_trace(this->class->usage_traces + new_bit)) 2442 if (!save_trace(hlock_class(this)->usage_traces + new_bit))
2273 return 0; 2443 return 0;
2274 2444
2275 switch (new_bit) { 2445 switch (new_bit) {
@@ -2349,7 +2519,7 @@ EXPORT_SYMBOL_GPL(lockdep_init_map);
2349 */ 2519 */
2350static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, 2520static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
2351 int trylock, int read, int check, int hardirqs_off, 2521 int trylock, int read, int check, int hardirqs_off,
2352 unsigned long ip) 2522 struct lockdep_map *nest_lock, unsigned long ip)
2353{ 2523{
2354 struct task_struct *curr = current; 2524 struct task_struct *curr = current;
2355 struct lock_class *class = NULL; 2525 struct lock_class *class = NULL;
@@ -2403,14 +2573,16 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
2403 return 0; 2573 return 0;
2404 2574
2405 hlock = curr->held_locks + depth; 2575 hlock = curr->held_locks + depth;
2406 2576 if (DEBUG_LOCKS_WARN_ON(!class))
2407 hlock->class = class; 2577 return 0;
2578 hlock->class_idx = class - lock_classes + 1;
2408 hlock->acquire_ip = ip; 2579 hlock->acquire_ip = ip;
2409 hlock->instance = lock; 2580 hlock->instance = lock;
2581 hlock->nest_lock = nest_lock;
2410 hlock->trylock = trylock; 2582 hlock->trylock = trylock;
2411 hlock->read = read; 2583 hlock->read = read;
2412 hlock->check = check; 2584 hlock->check = check;
2413 hlock->hardirqs_off = hardirqs_off; 2585 hlock->hardirqs_off = !!hardirqs_off;
2414#ifdef CONFIG_LOCK_STAT 2586#ifdef CONFIG_LOCK_STAT
2415 hlock->waittime_stamp = 0; 2587 hlock->waittime_stamp = 0;
2416 hlock->holdtime_stamp = sched_clock(); 2588 hlock->holdtime_stamp = sched_clock();
@@ -2518,6 +2690,55 @@ static int check_unlock(struct task_struct *curr, struct lockdep_map *lock,
2518 return 1; 2690 return 1;
2519} 2691}
2520 2692
2693static int
2694__lock_set_subclass(struct lockdep_map *lock,
2695 unsigned int subclass, unsigned long ip)
2696{
2697 struct task_struct *curr = current;
2698 struct held_lock *hlock, *prev_hlock;
2699 struct lock_class *class;
2700 unsigned int depth;
2701 int i;
2702
2703 depth = curr->lockdep_depth;
2704 if (DEBUG_LOCKS_WARN_ON(!depth))
2705 return 0;
2706
2707 prev_hlock = NULL;
2708 for (i = depth-1; i >= 0; i--) {
2709 hlock = curr->held_locks + i;
2710 /*
2711 * We must not cross into another context:
2712 */
2713 if (prev_hlock && prev_hlock->irq_context != hlock->irq_context)
2714 break;
2715 if (hlock->instance == lock)
2716 goto found_it;
2717 prev_hlock = hlock;
2718 }
2719 return print_unlock_inbalance_bug(curr, lock, ip);
2720
2721found_it:
2722 class = register_lock_class(lock, subclass, 0);
2723 hlock->class_idx = class - lock_classes + 1;
2724
2725 curr->lockdep_depth = i;
2726 curr->curr_chain_key = hlock->prev_chain_key;
2727
2728 for (; i < depth; i++) {
2729 hlock = curr->held_locks + i;
2730 if (!__lock_acquire(hlock->instance,
2731 hlock_class(hlock)->subclass, hlock->trylock,
2732 hlock->read, hlock->check, hlock->hardirqs_off,
2733 hlock->nest_lock, hlock->acquire_ip))
2734 return 0;
2735 }
2736
2737 if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth))
2738 return 0;
2739 return 1;
2740}
2741
2521/* 2742/*
2522 * Remove the lock to the list of currently held locks in a 2743 * Remove the lock to the list of currently held locks in a
2523 * potentially non-nested (out of order) manner. This is a 2744 * potentially non-nested (out of order) manner. This is a
@@ -2568,9 +2789,9 @@ found_it:
2568 for (i++; i < depth; i++) { 2789 for (i++; i < depth; i++) {
2569 hlock = curr->held_locks + i; 2790 hlock = curr->held_locks + i;
2570 if (!__lock_acquire(hlock->instance, 2791 if (!__lock_acquire(hlock->instance,
2571 hlock->class->subclass, hlock->trylock, 2792 hlock_class(hlock)->subclass, hlock->trylock,
2572 hlock->read, hlock->check, hlock->hardirqs_off, 2793 hlock->read, hlock->check, hlock->hardirqs_off,
2573 hlock->acquire_ip)) 2794 hlock->nest_lock, hlock->acquire_ip))
2574 return 0; 2795 return 0;
2575 } 2796 }
2576 2797
@@ -2613,7 +2834,7 @@ static int lock_release_nested(struct task_struct *curr,
2613 2834
2614#ifdef CONFIG_DEBUG_LOCKDEP 2835#ifdef CONFIG_DEBUG_LOCKDEP
2615 hlock->prev_chain_key = 0; 2836 hlock->prev_chain_key = 0;
2616 hlock->class = NULL; 2837 hlock->class_idx = 0;
2617 hlock->acquire_ip = 0; 2838 hlock->acquire_ip = 0;
2618 hlock->irq_context = 0; 2839 hlock->irq_context = 0;
2619#endif 2840#endif
@@ -2650,7 +2871,8 @@ __lock_release(struct lockdep_map *lock, int nested, unsigned long ip)
2650 */ 2871 */
2651static void check_flags(unsigned long flags) 2872static void check_flags(unsigned long flags)
2652{ 2873{
2653#if defined(CONFIG_DEBUG_LOCKDEP) && defined(CONFIG_TRACE_IRQFLAGS) 2874#if defined(CONFIG_PROVE_LOCKING) && defined(CONFIG_DEBUG_LOCKDEP) && \
2875 defined(CONFIG_TRACE_IRQFLAGS)
2654 if (!debug_locks) 2876 if (!debug_locks)
2655 return; 2877 return;
2656 2878
@@ -2681,18 +2903,36 @@ static void check_flags(unsigned long flags)
2681#endif 2903#endif
2682} 2904}
2683 2905
2906void
2907lock_set_subclass(struct lockdep_map *lock,
2908 unsigned int subclass, unsigned long ip)
2909{
2910 unsigned long flags;
2911
2912 if (unlikely(current->lockdep_recursion))
2913 return;
2914
2915 raw_local_irq_save(flags);
2916 current->lockdep_recursion = 1;
2917 check_flags(flags);
2918 if (__lock_set_subclass(lock, subclass, ip))
2919 check_chain_key(current);
2920 current->lockdep_recursion = 0;
2921 raw_local_irq_restore(flags);
2922}
2923
2924EXPORT_SYMBOL_GPL(lock_set_subclass);
2925
2684/* 2926/*
2685 * We are not always called with irqs disabled - do that here, 2927 * We are not always called with irqs disabled - do that here,
2686 * and also avoid lockdep recursion: 2928 * and also avoid lockdep recursion:
2687 */ 2929 */
2688void lock_acquire(struct lockdep_map *lock, unsigned int subclass, 2930void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
2689 int trylock, int read, int check, unsigned long ip) 2931 int trylock, int read, int check,
2932 struct lockdep_map *nest_lock, unsigned long ip)
2690{ 2933{
2691 unsigned long flags; 2934 unsigned long flags;
2692 2935
2693 if (unlikely(!lock_stat && !prove_locking))
2694 return;
2695
2696 if (unlikely(current->lockdep_recursion)) 2936 if (unlikely(current->lockdep_recursion))
2697 return; 2937 return;
2698 2938
@@ -2701,20 +2941,18 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
2701 2941
2702 current->lockdep_recursion = 1; 2942 current->lockdep_recursion = 1;
2703 __lock_acquire(lock, subclass, trylock, read, check, 2943 __lock_acquire(lock, subclass, trylock, read, check,
2704 irqs_disabled_flags(flags), ip); 2944 irqs_disabled_flags(flags), nest_lock, ip);
2705 current->lockdep_recursion = 0; 2945 current->lockdep_recursion = 0;
2706 raw_local_irq_restore(flags); 2946 raw_local_irq_restore(flags);
2707} 2947}
2708 2948
2709EXPORT_SYMBOL_GPL(lock_acquire); 2949EXPORT_SYMBOL_GPL(lock_acquire);
2710 2950
2711void lock_release(struct lockdep_map *lock, int nested, unsigned long ip) 2951void lock_release(struct lockdep_map *lock, int nested,
2952 unsigned long ip)
2712{ 2953{
2713 unsigned long flags; 2954 unsigned long flags;
2714 2955
2715 if (unlikely(!lock_stat && !prove_locking))
2716 return;
2717
2718 if (unlikely(current->lockdep_recursion)) 2956 if (unlikely(current->lockdep_recursion))
2719 return; 2957 return;
2720 2958
@@ -2787,11 +3025,11 @@ __lock_contended(struct lockdep_map *lock, unsigned long ip)
2787found_it: 3025found_it:
2788 hlock->waittime_stamp = sched_clock(); 3026 hlock->waittime_stamp = sched_clock();
2789 3027
2790 point = lock_contention_point(hlock->class, ip); 3028 point = lock_contention_point(hlock_class(hlock), ip);
2791 3029
2792 stats = get_lock_stats(hlock->class); 3030 stats = get_lock_stats(hlock_class(hlock));
2793 if (point < ARRAY_SIZE(stats->contention_point)) 3031 if (point < ARRAY_SIZE(stats->contention_point))
2794 stats->contention_point[i]++; 3032 stats->contention_point[point]++;
2795 if (lock->cpu != smp_processor_id()) 3033 if (lock->cpu != smp_processor_id())
2796 stats->bounces[bounce_contended + !!hlock->read]++; 3034 stats->bounces[bounce_contended + !!hlock->read]++;
2797 put_lock_stats(stats); 3035 put_lock_stats(stats);
@@ -2835,7 +3073,7 @@ found_it:
2835 hlock->holdtime_stamp = now; 3073 hlock->holdtime_stamp = now;
2836 } 3074 }
2837 3075
2838 stats = get_lock_stats(hlock->class); 3076 stats = get_lock_stats(hlock_class(hlock));
2839 if (waittime) { 3077 if (waittime) {
2840 if (hlock->read) 3078 if (hlock->read)
2841 lock_time_inc(&stats->read_waittime, waittime); 3079 lock_time_inc(&stats->read_waittime, waittime);
@@ -2930,6 +3168,7 @@ static void zap_class(struct lock_class *class)
2930 list_del_rcu(&class->hash_entry); 3168 list_del_rcu(&class->hash_entry);
2931 list_del_rcu(&class->lock_entry); 3169 list_del_rcu(&class->lock_entry);
2932 3170
3171 class->key = NULL;
2933} 3172}
2934 3173
2935static inline int within(const void *addr, void *start, unsigned long size) 3174static inline int within(const void *addr, void *start, unsigned long size)
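The lockdep.c hunks above switch struct held_lock from a direct class pointer to a class_idx offset into the static lock_classes[] array: it is stored as offset + 1 so that 0 can be left meaning "no class recorded" (lock_release_nested resets it to 0), and decoded again through hlock_class(). A minimal userspace sketch of that encode/decode convention, using made-up types rather than the kernel's (editor's illustration, not part of the patch):

#include <assert.h>
#include <stddef.h>
#include <stdio.h>

struct lock_class { const char *name; };

#define MAX_CLASSES 8
static struct lock_class classes[MAX_CLASSES] = {
        { "rq->lock" }, { "p->pi_lock" },
};

/* Encode: store (array offset + 1) so that 0 can mean "unset". */
static unsigned int class_to_idx(struct lock_class *class)
{
        return (unsigned int)(class - classes) + 1;
}

/* Decode: the inverse of the above, mirroring hlock_class(). */
static struct lock_class *idx_to_class(unsigned int idx)
{
        assert(idx != 0);               /* 0 means "no class recorded" */
        return classes + idx - 1;
}

int main(void)
{
        unsigned int idx = class_to_idx(&classes[1]);

        printf("idx=%u name=%s\n", idx, idx_to_class(idx)->name);
        return 0;
}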
diff --git a/kernel/lockdep_internals.h b/kernel/lockdep_internals.h
index 8ce09bc4613d..56b196932c08 100644
--- a/kernel/lockdep_internals.h
+++ b/kernel/lockdep_internals.h
@@ -17,12 +17,11 @@
17 */ 17 */
18#define MAX_LOCKDEP_ENTRIES 8192UL 18#define MAX_LOCKDEP_ENTRIES 8192UL
19 19
20#define MAX_LOCKDEP_KEYS_BITS 11
21#define MAX_LOCKDEP_KEYS (1UL << MAX_LOCKDEP_KEYS_BITS)
22
23#define MAX_LOCKDEP_CHAINS_BITS 14 20#define MAX_LOCKDEP_CHAINS_BITS 14
24#define MAX_LOCKDEP_CHAINS (1UL << MAX_LOCKDEP_CHAINS_BITS) 21#define MAX_LOCKDEP_CHAINS (1UL << MAX_LOCKDEP_CHAINS_BITS)
25 22
23#define MAX_LOCKDEP_CHAIN_HLOCKS (MAX_LOCKDEP_CHAINS*5)
24
26/* 25/*
27 * Stack-trace: tightly packed array of stack backtrace 26 * Stack-trace: tightly packed array of stack backtrace
28 * addresses. Protected by the hash_lock. 27 * addresses. Protected by the hash_lock.
@@ -30,15 +29,19 @@
30#define MAX_STACK_TRACE_ENTRIES 262144UL 29#define MAX_STACK_TRACE_ENTRIES 262144UL
31 30
32extern struct list_head all_lock_classes; 31extern struct list_head all_lock_classes;
32extern struct lock_chain lock_chains[];
33 33
34extern void 34extern void
35get_usage_chars(struct lock_class *class, char *c1, char *c2, char *c3, char *c4); 35get_usage_chars(struct lock_class *class, char *c1, char *c2, char *c3, char *c4);
36 36
37extern const char * __get_key_name(struct lockdep_subclass_key *key, char *str); 37extern const char * __get_key_name(struct lockdep_subclass_key *key, char *str);
38 38
39struct lock_class *lock_chain_get_class(struct lock_chain *chain, int i);
40
39extern unsigned long nr_lock_classes; 41extern unsigned long nr_lock_classes;
40extern unsigned long nr_list_entries; 42extern unsigned long nr_list_entries;
41extern unsigned long nr_lock_chains; 43extern unsigned long nr_lock_chains;
44extern int nr_chain_hlocks;
42extern unsigned long nr_stack_trace_entries; 45extern unsigned long nr_stack_trace_entries;
43 46
44extern unsigned int nr_hardirq_chains; 47extern unsigned int nr_hardirq_chains;
@@ -47,6 +50,22 @@ extern unsigned int nr_process_chains;
47extern unsigned int max_lockdep_depth; 50extern unsigned int max_lockdep_depth;
48extern unsigned int max_recursion_depth; 51extern unsigned int max_recursion_depth;
49 52
53#ifdef CONFIG_PROVE_LOCKING
54extern unsigned long lockdep_count_forward_deps(struct lock_class *);
55extern unsigned long lockdep_count_backward_deps(struct lock_class *);
56#else
57static inline unsigned long
58lockdep_count_forward_deps(struct lock_class *class)
59{
60 return 0;
61}
62static inline unsigned long
63lockdep_count_backward_deps(struct lock_class *class)
64{
65 return 0;
66}
67#endif
68
50#ifdef CONFIG_DEBUG_LOCKDEP 69#ifdef CONFIG_DEBUG_LOCKDEP
51/* 70/*
52 * Various lockdep statistics: 71 * Various lockdep statistics:
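lockdep_count_forward_deps()/lockdep_count_backward_deps() are declared here, with inline stubs returning 0 when CONFIG_PROVE_LOCKING is off; they take over the recursive counting that the lockdep_proc.c hunks below remove. A self-contained sketch of that recursion over a hypothetical dependency list — the kernel walks class->locks_after/locks_before the same way, and like the removed helper it does no deduplication of entries reachable over more than one path:

#include <stdio.h>

/* Hypothetical stand-ins for struct lock_class / struct lock_list. */
struct dep_class;

struct dep_entry {
        struct dep_class *class;
        struct dep_entry *next;
};

struct dep_class {
        const char *name;
        struct dep_entry *deps_after;   /* like class->locks_after */
};

/* Count this class plus everything reachable through its dependencies. */
static unsigned long count_forward_deps(struct dep_class *class)
{
        struct dep_entry *entry;
        unsigned long ret = 1;

        for (entry = class->deps_after; entry; entry = entry->next)
                ret += count_forward_deps(entry->class);

        return ret;
}

int main(void)
{
        struct dep_class c = { "c", NULL };
        struct dep_entry eb = { &c, NULL };
        struct dep_class b = { "b", &eb };
        struct dep_entry ea = { &b, NULL };
        struct dep_class a = { "a", &ea };

        printf("forward deps of a: %lu\n", count_forward_deps(&a)); /* 3 */
        return 0;
}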
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index dc5d29648d85..20dbcbf9c7dd 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -63,34 +63,6 @@ static void l_stop(struct seq_file *m, void *v)
63{ 63{
64} 64}
65 65
66static unsigned long count_forward_deps(struct lock_class *class)
67{
68 struct lock_list *entry;
69 unsigned long ret = 1;
70
71 /*
72 * Recurse this class's dependency list:
73 */
74 list_for_each_entry(entry, &class->locks_after, entry)
75 ret += count_forward_deps(entry->class);
76
77 return ret;
78}
79
80static unsigned long count_backward_deps(struct lock_class *class)
81{
82 struct lock_list *entry;
83 unsigned long ret = 1;
84
85 /*
86 * Recurse this class's dependency list:
87 */
88 list_for_each_entry(entry, &class->locks_before, entry)
89 ret += count_backward_deps(entry->class);
90
91 return ret;
92}
93
94static void print_name(struct seq_file *m, struct lock_class *class) 66static void print_name(struct seq_file *m, struct lock_class *class)
95{ 67{
96 char str[128]; 68 char str[128];
@@ -110,7 +82,6 @@ static void print_name(struct seq_file *m, struct lock_class *class)
110 82
111static int l_show(struct seq_file *m, void *v) 83static int l_show(struct seq_file *m, void *v)
112{ 84{
113 unsigned long nr_forward_deps, nr_backward_deps;
114 struct lock_class *class = v; 85 struct lock_class *class = v;
115 struct lock_list *entry; 86 struct lock_list *entry;
116 char c1, c2, c3, c4; 87 char c1, c2, c3, c4;
@@ -124,11 +95,10 @@ static int l_show(struct seq_file *m, void *v)
124#ifdef CONFIG_DEBUG_LOCKDEP 95#ifdef CONFIG_DEBUG_LOCKDEP
125 seq_printf(m, " OPS:%8ld", class->ops); 96 seq_printf(m, " OPS:%8ld", class->ops);
126#endif 97#endif
127 nr_forward_deps = count_forward_deps(class); 98#ifdef CONFIG_PROVE_LOCKING
128 seq_printf(m, " FD:%5ld", nr_forward_deps); 99 seq_printf(m, " FD:%5ld", lockdep_count_forward_deps(class));
129 100 seq_printf(m, " BD:%5ld", lockdep_count_backward_deps(class));
130 nr_backward_deps = count_backward_deps(class); 101#endif
131 seq_printf(m, " BD:%5ld", nr_backward_deps);
132 102
133 get_usage_chars(class, &c1, &c2, &c3, &c4); 103 get_usage_chars(class, &c1, &c2, &c3, &c4);
134 seq_printf(m, " %c%c%c%c", c1, c2, c3, c4); 104 seq_printf(m, " %c%c%c%c", c1, c2, c3, c4);
@@ -139,7 +109,7 @@ static int l_show(struct seq_file *m, void *v)
139 109
140 list_for_each_entry(entry, &class->locks_after, entry) { 110 list_for_each_entry(entry, &class->locks_after, entry) {
141 if (entry->distance == 1) { 111 if (entry->distance == 1) {
142 seq_printf(m, " -> [%p] ", entry->class); 112 seq_printf(m, " -> [%p] ", entry->class->key);
143 print_name(m, entry->class); 113 print_name(m, entry->class);
144 seq_puts(m, "\n"); 114 seq_puts(m, "\n");
145 } 115 }
@@ -178,6 +148,98 @@ static const struct file_operations proc_lockdep_operations = {
178 .release = seq_release, 148 .release = seq_release,
179}; 149};
180 150
151#ifdef CONFIG_PROVE_LOCKING
152static void *lc_next(struct seq_file *m, void *v, loff_t *pos)
153{
154 struct lock_chain *chain;
155
156 (*pos)++;
157
158 if (v == SEQ_START_TOKEN)
159 chain = m->private;
160 else {
161 chain = v;
162
163 if (*pos < nr_lock_chains)
164 chain = lock_chains + *pos;
165 else
166 chain = NULL;
167 }
168
169 return chain;
170}
171
172static void *lc_start(struct seq_file *m, loff_t *pos)
173{
174 if (*pos == 0)
175 return SEQ_START_TOKEN;
176
177 if (*pos < nr_lock_chains)
178 return lock_chains + *pos;
179
180 return NULL;
181}
182
183static void lc_stop(struct seq_file *m, void *v)
184{
185}
186
187static int lc_show(struct seq_file *m, void *v)
188{
189 struct lock_chain *chain = v;
190 struct lock_class *class;
191 int i;
192
193 if (v == SEQ_START_TOKEN) {
194 seq_printf(m, "all lock chains:\n");
195 return 0;
196 }
197
198 seq_printf(m, "irq_context: %d\n", chain->irq_context);
199
200 for (i = 0; i < chain->depth; i++) {
201 class = lock_chain_get_class(chain, i);
202 if (!class->key)
203 continue;
204
205 seq_printf(m, "[%p] ", class->key);
206 print_name(m, class);
207 seq_puts(m, "\n");
208 }
209 seq_puts(m, "\n");
210
211 return 0;
212}
213
214static const struct seq_operations lockdep_chains_ops = {
215 .start = lc_start,
216 .next = lc_next,
217 .stop = lc_stop,
218 .show = lc_show,
219};
220
221static int lockdep_chains_open(struct inode *inode, struct file *file)
222{
223 int res = seq_open(file, &lockdep_chains_ops);
224 if (!res) {
225 struct seq_file *m = file->private_data;
226
227 if (nr_lock_chains)
228 m->private = lock_chains;
229 else
230 m->private = NULL;
231 }
232 return res;
233}
234
235static const struct file_operations proc_lockdep_chains_operations = {
236 .open = lockdep_chains_open,
237 .read = seq_read,
238 .llseek = seq_lseek,
239 .release = seq_release,
240};
241#endif /* CONFIG_PROVE_LOCKING */
242
181static void lockdep_stats_debug_show(struct seq_file *m) 243static void lockdep_stats_debug_show(struct seq_file *m)
182{ 244{
183#ifdef CONFIG_DEBUG_LOCKDEP 245#ifdef CONFIG_DEBUG_LOCKDEP
@@ -261,7 +323,9 @@ static int lockdep_stats_show(struct seq_file *m, void *v)
261 if (class->usage_mask & LOCKF_ENABLED_HARDIRQS_READ) 323 if (class->usage_mask & LOCKF_ENABLED_HARDIRQS_READ)
262 nr_hardirq_read_unsafe++; 324 nr_hardirq_read_unsafe++;
263 325
264 sum_forward_deps += count_forward_deps(class); 326#ifdef CONFIG_PROVE_LOCKING
327 sum_forward_deps += lockdep_count_forward_deps(class);
328#endif
265 } 329 }
266#ifdef CONFIG_DEBUG_LOCKDEP 330#ifdef CONFIG_DEBUG_LOCKDEP
267 DEBUG_LOCKS_WARN_ON(debug_atomic_read(&nr_unused_locks) != nr_unused); 331 DEBUG_LOCKS_WARN_ON(debug_atomic_read(&nr_unused_locks) != nr_unused);
@@ -294,6 +358,8 @@ static int lockdep_stats_show(struct seq_file *m, void *v)
294#ifdef CONFIG_PROVE_LOCKING 358#ifdef CONFIG_PROVE_LOCKING
295 seq_printf(m, " dependency chains: %11lu [max: %lu]\n", 359 seq_printf(m, " dependency chains: %11lu [max: %lu]\n",
296 nr_lock_chains, MAX_LOCKDEP_CHAINS); 360 nr_lock_chains, MAX_LOCKDEP_CHAINS);
361 seq_printf(m, " dependency chain hlocks: %11d [max: %lu]\n",
362 nr_chain_hlocks, MAX_LOCKDEP_CHAIN_HLOCKS);
297#endif 363#endif
298 364
299#ifdef CONFIG_TRACE_IRQFLAGS 365#ifdef CONFIG_TRACE_IRQFLAGS
@@ -406,8 +472,9 @@ static void snprint_time(char *buf, size_t bufsiz, s64 nr)
406{ 472{
407 unsigned long rem; 473 unsigned long rem;
408 474
475 nr += 5; /* for display rounding */
409 rem = do_div(nr, 1000); /* XXX: do_div_signed */ 476 rem = do_div(nr, 1000); /* XXX: do_div_signed */
410 snprintf(buf, bufsiz, "%lld.%02d", (long long)nr, ((int)rem+5)/10); 477 snprintf(buf, bufsiz, "%lld.%02d", (long long)nr, (int)rem/10);
411} 478}
412 479
413static void seq_time(struct seq_file *m, s64 time) 480static void seq_time(struct seq_file *m, s64 time)
@@ -661,6 +728,10 @@ static const struct file_operations proc_lock_stat_operations = {
661static int __init lockdep_proc_init(void) 728static int __init lockdep_proc_init(void)
662{ 729{
663 proc_create("lockdep", S_IRUSR, NULL, &proc_lockdep_operations); 730 proc_create("lockdep", S_IRUSR, NULL, &proc_lockdep_operations);
731#ifdef CONFIG_PROVE_LOCKING
732 proc_create("lockdep_chains", S_IRUSR, NULL,
733 &proc_lockdep_chains_operations);
734#endif
664 proc_create("lockdep_stats", S_IRUSR, NULL, 735 proc_create("lockdep_stats", S_IRUSR, NULL,
665 &proc_lockdep_stats_operations); 736 &proc_lockdep_stats_operations);
666 737
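The snprint_time() change above folds the display rounding into nr before the division: with the old code a remainder of 996 or more made ((int)rem + 5) / 10 evaluate to 100 and spill a third digit into the %02d field, whereas adding 5 up front lets the carry propagate into the integer part. A quick userspace check of both formulas, with plain C division standing in for do_div() (editor's illustration, not part of the patch):

#include <stdio.h>

static void old_fmt(long long nr, char *buf, size_t len)
{
        int rem = (int)(nr % 1000);

        nr /= 1000;
        snprintf(buf, len, "%lld.%02d", nr, (rem + 5) / 10);
}

static void new_fmt(long long nr, char *buf, size_t len)
{
        int rem;

        nr += 5;                        /* display rounding, as in the patch */
        rem = (int)(nr % 1000);
        nr /= 1000;
        snprintf(buf, len, "%lld.%02d", nr, rem / 10);
}

int main(void)
{
        char a[32], b[32];

        old_fmt(1996, a, sizeof(a));    /* "1.100" - three digits overflow %02d */
        new_fmt(1996, b, sizeof(b));    /* "2.00"  - carry goes into the integer part */
        printf("old: %s  new: %s\n", a, b);
        return 0;
}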
diff --git a/kernel/marker.c b/kernel/marker.c
index b5a9fe1d50d5..7d1faecd7a51 100644
--- a/kernel/marker.c
+++ b/kernel/marker.c
@@ -55,8 +55,8 @@ static DEFINE_MUTEX(markers_mutex);
55struct marker_entry { 55struct marker_entry {
56 struct hlist_node hlist; 56 struct hlist_node hlist;
57 char *format; 57 char *format;
58 void (*call)(const struct marker *mdata, /* Probe wrapper */ 58 /* Probe wrapper */
59 void *call_private, const char *fmt, ...); 59 void (*call)(const struct marker *mdata, void *call_private, ...);
60 struct marker_probe_closure single; 60 struct marker_probe_closure single;
61 struct marker_probe_closure *multi; 61 struct marker_probe_closure *multi;
62 int refcount; /* Number of times armed. 0 if disarmed. */ 62 int refcount; /* Number of times armed. 0 if disarmed. */
@@ -91,15 +91,13 @@ EXPORT_SYMBOL_GPL(__mark_empty_function);
91 * marker_probe_cb Callback that prepares the variable argument list for probes. 91 * marker_probe_cb Callback that prepares the variable argument list for probes.
92 * @mdata: pointer of type struct marker 92 * @mdata: pointer of type struct marker
93 * @call_private: caller site private data 93 * @call_private: caller site private data
94 * @fmt: format string
95 * @...: Variable argument list. 94 * @...: Variable argument list.
96 * 95 *
97 * Since we do not use "typical" pointer based RCU in the 1 argument case, we 96 * Since we do not use "typical" pointer based RCU in the 1 argument case, we
98 * need to put a full smp_rmb() in this branch. This is why we do not use 97 * need to put a full smp_rmb() in this branch. This is why we do not use
99 * rcu_dereference() for the pointer read. 98 * rcu_dereference() for the pointer read.
100 */ 99 */
101void marker_probe_cb(const struct marker *mdata, void *call_private, 100void marker_probe_cb(const struct marker *mdata, void *call_private, ...)
102 const char *fmt, ...)
103{ 101{
104 va_list args; 102 va_list args;
105 char ptype; 103 char ptype;
@@ -120,13 +118,19 @@ void marker_probe_cb(const struct marker *mdata, void *call_private,
120 /* Must read the ptr before private data. They are not data 118 /* Must read the ptr before private data. They are not data
121 * dependant, so we put an explicit smp_rmb() here. */ 119 * dependant, so we put an explicit smp_rmb() here. */
122 smp_rmb(); 120 smp_rmb();
123 va_start(args, fmt); 121 va_start(args, call_private);
124 func(mdata->single.probe_private, call_private, fmt, &args); 122 func(mdata->single.probe_private, call_private, mdata->format,
123 &args);
125 va_end(args); 124 va_end(args);
126 } else { 125 } else {
127 struct marker_probe_closure *multi; 126 struct marker_probe_closure *multi;
128 int i; 127 int i;
129 /* 128 /*
129 * Read mdata->ptype before mdata->multi.
130 */
131 smp_rmb();
132 multi = mdata->multi;
133 /*
130 * multi points to an array, therefore accessing the array 134 * multi points to an array, therefore accessing the array
131 * depends on reading multi. However, even in this case, 135 * depends on reading multi. However, even in this case,
132 * we must insure that the pointer is read _before_ the array 136 * we must insure that the pointer is read _before_ the array
@@ -134,11 +138,10 @@ void marker_probe_cb(const struct marker *mdata, void *call_private,
134 * in the fast path, so put the explicit barrier here. 138 * in the fast path, so put the explicit barrier here.
135 */ 139 */
136 smp_read_barrier_depends(); 140 smp_read_barrier_depends();
137 multi = mdata->multi;
138 for (i = 0; multi[i].func; i++) { 141 for (i = 0; multi[i].func; i++) {
139 va_start(args, fmt); 142 va_start(args, call_private);
140 multi[i].func(multi[i].probe_private, call_private, fmt, 143 multi[i].func(multi[i].probe_private, call_private,
141 &args); 144 mdata->format, &args);
142 va_end(args); 145 va_end(args);
143 } 146 }
144 } 147 }
@@ -150,13 +153,11 @@ EXPORT_SYMBOL_GPL(marker_probe_cb);
150 * marker_probe_cb Callback that does not prepare the variable argument list. 153 * marker_probe_cb Callback that does not prepare the variable argument list.
151 * @mdata: pointer of type struct marker 154 * @mdata: pointer of type struct marker
152 * @call_private: caller site private data 155 * @call_private: caller site private data
153 * @fmt: format string
154 * @...: Variable argument list. 156 * @...: Variable argument list.
155 * 157 *
156 * Should be connected to markers "MARK_NOARGS". 158 * Should be connected to markers "MARK_NOARGS".
157 */ 159 */
158void marker_probe_cb_noarg(const struct marker *mdata, 160void marker_probe_cb_noarg(const struct marker *mdata, void *call_private, ...)
159 void *call_private, const char *fmt, ...)
160{ 161{
161 va_list args; /* not initialized */ 162 va_list args; /* not initialized */
162 char ptype; 163 char ptype;
@@ -172,11 +173,17 @@ void marker_probe_cb_noarg(const struct marker *mdata,
172 /* Must read the ptr before private data. They are not data 173 /* Must read the ptr before private data. They are not data
173 * dependant, so we put an explicit smp_rmb() here. */ 174 * dependant, so we put an explicit smp_rmb() here. */
174 smp_rmb(); 175 smp_rmb();
175 func(mdata->single.probe_private, call_private, fmt, &args); 176 func(mdata->single.probe_private, call_private, mdata->format,
177 &args);
176 } else { 178 } else {
177 struct marker_probe_closure *multi; 179 struct marker_probe_closure *multi;
178 int i; 180 int i;
179 /* 181 /*
182 * Read mdata->ptype before mdata->multi.
183 */
184 smp_rmb();
185 multi = mdata->multi;
186 /*
180 * multi points to an array, therefore accessing the array 187 * multi points to an array, therefore accessing the array
181 * depends on reading multi. However, even in this case, 188 * depends on reading multi. However, even in this case,
182 * we must insure that the pointer is read _before_ the array 189 * we must insure that the pointer is read _before_ the array
@@ -184,10 +191,9 @@ void marker_probe_cb_noarg(const struct marker *mdata,
184 * in the fast path, so put the explicit barrier here. 191 * in the fast path, so put the explicit barrier here.
185 */ 192 */
186 smp_read_barrier_depends(); 193 smp_read_barrier_depends();
187 multi = mdata->multi;
188 for (i = 0; multi[i].func; i++) 194 for (i = 0; multi[i].func; i++)
189 multi[i].func(multi[i].probe_private, call_private, fmt, 195 multi[i].func(multi[i].probe_private, call_private,
190 &args); 196 mdata->format, &args);
191 } 197 }
192 preempt_enable(); 198 preempt_enable();
193} 199}
@@ -443,7 +449,7 @@ static int remove_marker(const char *name)
443 hlist_del(&e->hlist); 449 hlist_del(&e->hlist);
444 /* Make sure the call_rcu has been executed */ 450 /* Make sure the call_rcu has been executed */
445 if (e->rcu_pending) 451 if (e->rcu_pending)
446 rcu_barrier(); 452 rcu_barrier_sched();
447 kfree(e); 453 kfree(e);
448 return 0; 454 return 0;
449} 455}
@@ -478,7 +484,7 @@ static int marker_set_format(struct marker_entry **entry, const char *format)
478 hlist_del(&(*entry)->hlist); 484 hlist_del(&(*entry)->hlist);
479 /* Make sure the call_rcu has been executed */ 485 /* Make sure the call_rcu has been executed */
480 if ((*entry)->rcu_pending) 486 if ((*entry)->rcu_pending)
481 rcu_barrier(); 487 rcu_barrier_sched();
482 kfree(*entry); 488 kfree(*entry);
483 *entry = e; 489 *entry = e;
484 trace_mark(core_marker_format, "name %s format %s", 490 trace_mark(core_marker_format, "name %s format %s",
@@ -657,7 +663,7 @@ int marker_probe_register(const char *name, const char *format,
657 * make sure it's executed now. 663 * make sure it's executed now.
658 */ 664 */
659 if (entry->rcu_pending) 665 if (entry->rcu_pending)
660 rcu_barrier(); 666 rcu_barrier_sched();
661 old = marker_entry_add_probe(entry, probe, probe_private); 667 old = marker_entry_add_probe(entry, probe, probe_private);
662 if (IS_ERR(old)) { 668 if (IS_ERR(old)) {
663 ret = PTR_ERR(old); 669 ret = PTR_ERR(old);
@@ -672,10 +678,7 @@ int marker_probe_register(const char *name, const char *format,
672 entry->rcu_pending = 1; 678 entry->rcu_pending = 1;
673 /* write rcu_pending before calling the RCU callback */ 679 /* write rcu_pending before calling the RCU callback */
674 smp_wmb(); 680 smp_wmb();
675#ifdef CONFIG_PREEMPT_RCU 681 call_rcu_sched(&entry->rcu, free_old_closure);
676 synchronize_sched(); /* Until we have the call_rcu_sched() */
677#endif
678 call_rcu(&entry->rcu, free_old_closure);
679end: 682end:
680 mutex_unlock(&markers_mutex); 683 mutex_unlock(&markers_mutex);
681 return ret; 684 return ret;
@@ -706,7 +709,7 @@ int marker_probe_unregister(const char *name,
706 if (!entry) 709 if (!entry)
707 goto end; 710 goto end;
708 if (entry->rcu_pending) 711 if (entry->rcu_pending)
709 rcu_barrier(); 712 rcu_barrier_sched();
710 old = marker_entry_remove_probe(entry, probe, probe_private); 713 old = marker_entry_remove_probe(entry, probe, probe_private);
711 mutex_unlock(&markers_mutex); 714 mutex_unlock(&markers_mutex);
712 marker_update_probes(); /* may update entry */ 715 marker_update_probes(); /* may update entry */
@@ -718,10 +721,7 @@ int marker_probe_unregister(const char *name,
718 entry->rcu_pending = 1; 721 entry->rcu_pending = 1;
719 /* write rcu_pending before calling the RCU callback */ 722 /* write rcu_pending before calling the RCU callback */
720 smp_wmb(); 723 smp_wmb();
721#ifdef CONFIG_PREEMPT_RCU 724 call_rcu_sched(&entry->rcu, free_old_closure);
722 synchronize_sched(); /* Until we have the call_rcu_sched() */
723#endif
724 call_rcu(&entry->rcu, free_old_closure);
725 remove_marker(name); /* Ignore busy error message */ 725 remove_marker(name); /* Ignore busy error message */
726 ret = 0; 726 ret = 0;
727end: 727end:
@@ -788,7 +788,7 @@ int marker_probe_unregister_private_data(marker_probe_func *probe,
788 goto end; 788 goto end;
789 } 789 }
790 if (entry->rcu_pending) 790 if (entry->rcu_pending)
791 rcu_barrier(); 791 rcu_barrier_sched();
792 old = marker_entry_remove_probe(entry, NULL, probe_private); 792 old = marker_entry_remove_probe(entry, NULL, probe_private);
793 mutex_unlock(&markers_mutex); 793 mutex_unlock(&markers_mutex);
794 marker_update_probes(); /* may update entry */ 794 marker_update_probes(); /* may update entry */
@@ -799,10 +799,7 @@ int marker_probe_unregister_private_data(marker_probe_func *probe,
799 entry->rcu_pending = 1; 799 entry->rcu_pending = 1;
800 /* write rcu_pending before calling the RCU callback */ 800 /* write rcu_pending before calling the RCU callback */
801 smp_wmb(); 801 smp_wmb();
802#ifdef CONFIG_PREEMPT_RCU 802 call_rcu_sched(&entry->rcu, free_old_closure);
803 synchronize_sched(); /* Until we have the call_rcu_sched() */
804#endif
805 call_rcu(&entry->rcu, free_old_closure);
806 remove_marker(entry->name); /* Ignore busy error message */ 803 remove_marker(entry->name); /* Ignore busy error message */
807end: 804end:
808 mutex_unlock(&markers_mutex); 805 mutex_unlock(&markers_mutex);
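The marker.c hunks drop the fmt parameter from the probe wrappers: va_start() now anchors on call_private, and the format string is read from mdata->format instead of being passed in at every call site. A small userspace sketch of that calling shape, using hypothetical names rather than the real marker API:

#include <stdarg.h>
#include <stdio.h>

struct probe_site {
        const char *format;             /* plays the role of mdata->format */
        void (*call)(const struct probe_site *site, void *call_private, ...);
};

/* Wrapper: the last named argument is call_private, the format comes from the site. */
static void probe_cb(const struct probe_site *site, void *call_private, ...)
{
        va_list args;

        va_start(args, call_private);
        vprintf(site->format, args);    /* a real probe would receive (fmt, &args) */
        va_end(args);
}

int main(void)
{
        struct probe_site site = { "event: count=%d name=%s\n", probe_cb };

        site.call(&site, NULL, 42, "example");
        return 0;
}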
diff --git a/kernel/module.c b/kernel/module.c
index 5f80478b746d..9db11911e04b 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -70,6 +70,9 @@ static DECLARE_WAIT_QUEUE_HEAD(module_wq);
70 70
71static BLOCKING_NOTIFIER_HEAD(module_notify_list); 71static BLOCKING_NOTIFIER_HEAD(module_notify_list);
72 72
73/* Bounds of module allocation, for speeding __module_text_address */
74static unsigned long module_addr_min = -1UL, module_addr_max = 0;
75
73int register_module_notifier(struct notifier_block * nb) 76int register_module_notifier(struct notifier_block * nb)
74{ 77{
75 return blocking_notifier_chain_register(&module_notify_list, nb); 78 return blocking_notifier_chain_register(&module_notify_list, nb);
@@ -134,17 +137,19 @@ extern const struct kernel_symbol __start___ksymtab_gpl[];
134extern const struct kernel_symbol __stop___ksymtab_gpl[]; 137extern const struct kernel_symbol __stop___ksymtab_gpl[];
135extern const struct kernel_symbol __start___ksymtab_gpl_future[]; 138extern const struct kernel_symbol __start___ksymtab_gpl_future[];
136extern const struct kernel_symbol __stop___ksymtab_gpl_future[]; 139extern const struct kernel_symbol __stop___ksymtab_gpl_future[];
137extern const struct kernel_symbol __start___ksymtab_unused[];
138extern const struct kernel_symbol __stop___ksymtab_unused[];
139extern const struct kernel_symbol __start___ksymtab_unused_gpl[];
140extern const struct kernel_symbol __stop___ksymtab_unused_gpl[];
141extern const struct kernel_symbol __start___ksymtab_gpl_future[]; 140extern const struct kernel_symbol __start___ksymtab_gpl_future[];
142extern const struct kernel_symbol __stop___ksymtab_gpl_future[]; 141extern const struct kernel_symbol __stop___ksymtab_gpl_future[];
143extern const unsigned long __start___kcrctab[]; 142extern const unsigned long __start___kcrctab[];
144extern const unsigned long __start___kcrctab_gpl[]; 143extern const unsigned long __start___kcrctab_gpl[];
145extern const unsigned long __start___kcrctab_gpl_future[]; 144extern const unsigned long __start___kcrctab_gpl_future[];
145#ifdef CONFIG_UNUSED_SYMBOLS
146extern const struct kernel_symbol __start___ksymtab_unused[];
147extern const struct kernel_symbol __stop___ksymtab_unused[];
148extern const struct kernel_symbol __start___ksymtab_unused_gpl[];
149extern const struct kernel_symbol __stop___ksymtab_unused_gpl[];
146extern const unsigned long __start___kcrctab_unused[]; 150extern const unsigned long __start___kcrctab_unused[];
147extern const unsigned long __start___kcrctab_unused_gpl[]; 151extern const unsigned long __start___kcrctab_unused_gpl[];
152#endif
148 153
149#ifndef CONFIG_MODVERSIONS 154#ifndef CONFIG_MODVERSIONS
150#define symversion(base, idx) NULL 155#define symversion(base, idx) NULL
@@ -152,152 +157,170 @@ extern const unsigned long __start___kcrctab_unused_gpl[];
152#define symversion(base, idx) ((base != NULL) ? ((base) + (idx)) : NULL) 157#define symversion(base, idx) ((base != NULL) ? ((base) + (idx)) : NULL)
153#endif 158#endif
154 159
155/* lookup symbol in given range of kernel_symbols */
156static const struct kernel_symbol *lookup_symbol(const char *name,
157 const struct kernel_symbol *start,
158 const struct kernel_symbol *stop)
159{
160 const struct kernel_symbol *ks = start;
161 for (; ks < stop; ks++)
162 if (strcmp(ks->name, name) == 0)
163 return ks;
164 return NULL;
165}
166
167static bool always_ok(bool gplok, bool warn, const char *name)
168{
169 return true;
170}
171
172static bool printk_unused_warning(bool gplok, bool warn, const char *name)
173{
174 if (warn) {
175 printk(KERN_WARNING "Symbol %s is marked as UNUSED, "
176 "however this module is using it.\n", name);
177 printk(KERN_WARNING
178 "This symbol will go away in the future.\n");
179 printk(KERN_WARNING
180 "Please evalute if this is the right api to use and if "
181 "it really is, submit a report the linux kernel "
182 "mailinglist together with submitting your code for "
183 "inclusion.\n");
184 }
185 return true;
186}
187
188static bool gpl_only_unused_warning(bool gplok, bool warn, const char *name)
189{
190 if (!gplok)
191 return false;
192 return printk_unused_warning(gplok, warn, name);
193}
194
195static bool gpl_only(bool gplok, bool warn, const char *name)
196{
197 return gplok;
198}
199
200static bool warn_if_not_gpl(bool gplok, bool warn, const char *name)
201{
202 if (!gplok && warn) {
203 printk(KERN_WARNING "Symbol %s is being used "
204 "by a non-GPL module, which will not "
205 "be allowed in the future\n", name);
206 printk(KERN_WARNING "Please see the file "
207 "Documentation/feature-removal-schedule.txt "
208 "in the kernel source tree for more details.\n");
209 }
210 return true;
211}
212
213struct symsearch { 160struct symsearch {
214 const struct kernel_symbol *start, *stop; 161 const struct kernel_symbol *start, *stop;
215 const unsigned long *crcs; 162 const unsigned long *crcs;
216 bool (*check)(bool gplok, bool warn, const char *name); 163 enum {
164 NOT_GPL_ONLY,
165 GPL_ONLY,
166 WILL_BE_GPL_ONLY,
167 } licence;
168 bool unused;
217}; 169};
218 170
219/* Look through this array of symbol tables for a symbol match which 171static bool each_symbol_in_section(const struct symsearch *arr,
220 * passes the check function. */ 172 unsigned int arrsize,
221static const struct kernel_symbol *search_symarrays(const struct symsearch *arr, 173 struct module *owner,
222 unsigned int num, 174 bool (*fn)(const struct symsearch *syms,
223 const char *name, 175 struct module *owner,
224 bool gplok, 176 unsigned int symnum, void *data),
225 bool warn, 177 void *data)
226 const unsigned long **crc)
227{ 178{
228 unsigned int i; 179 unsigned int i, j;
229 const struct kernel_symbol *ks;
230
231 for (i = 0; i < num; i++) {
232 ks = lookup_symbol(name, arr[i].start, arr[i].stop);
233 if (!ks || !arr[i].check(gplok, warn, name))
234 continue;
235 180
236 if (crc) 181 for (j = 0; j < arrsize; j++) {
237 *crc = symversion(arr[i].crcs, ks - arr[i].start); 182 for (i = 0; i < arr[j].stop - arr[j].start; i++)
238 return ks; 183 if (fn(&arr[j], owner, i, data))
184 return true;
239 } 185 }
240 return NULL; 186
187 return false;
241} 188}
242 189
243/* Find a symbol, return value, (optional) crc and (optional) module 190/* Returns true as soon as fn returns true, otherwise false. */
244 * which owns it */ 191static bool each_symbol(bool (*fn)(const struct symsearch *arr,
245static unsigned long find_symbol(const char *name, 192 struct module *owner,
246 struct module **owner, 193 unsigned int symnum, void *data),
247 const unsigned long **crc, 194 void *data)
248 bool gplok,
249 bool warn)
250{ 195{
251 struct module *mod; 196 struct module *mod;
252 const struct kernel_symbol *ks;
253 const struct symsearch arr[] = { 197 const struct symsearch arr[] = {
254 { __start___ksymtab, __stop___ksymtab, __start___kcrctab, 198 { __start___ksymtab, __stop___ksymtab, __start___kcrctab,
255 always_ok }, 199 NOT_GPL_ONLY, false },
256 { __start___ksymtab_gpl, __stop___ksymtab_gpl, 200 { __start___ksymtab_gpl, __stop___ksymtab_gpl,
257 __start___kcrctab_gpl, gpl_only }, 201 __start___kcrctab_gpl,
202 GPL_ONLY, false },
258 { __start___ksymtab_gpl_future, __stop___ksymtab_gpl_future, 203 { __start___ksymtab_gpl_future, __stop___ksymtab_gpl_future,
259 __start___kcrctab_gpl_future, warn_if_not_gpl }, 204 __start___kcrctab_gpl_future,
205 WILL_BE_GPL_ONLY, false },
206#ifdef CONFIG_UNUSED_SYMBOLS
260 { __start___ksymtab_unused, __stop___ksymtab_unused, 207 { __start___ksymtab_unused, __stop___ksymtab_unused,
261 __start___kcrctab_unused, printk_unused_warning }, 208 __start___kcrctab_unused,
209 NOT_GPL_ONLY, true },
262 { __start___ksymtab_unused_gpl, __stop___ksymtab_unused_gpl, 210 { __start___ksymtab_unused_gpl, __stop___ksymtab_unused_gpl,
263 __start___kcrctab_unused_gpl, gpl_only_unused_warning }, 211 __start___kcrctab_unused_gpl,
212 GPL_ONLY, true },
213#endif
264 }; 214 };
265 215
266 /* Core kernel first. */ 216 if (each_symbol_in_section(arr, ARRAY_SIZE(arr), NULL, fn, data))
267 ks = search_symarrays(arr, ARRAY_SIZE(arr), name, gplok, warn, crc); 217 return true;
268 if (ks) {
269 if (owner)
270 *owner = NULL;
271 return ks->value;
272 }
273 218
274 /* Now try modules. */
275 list_for_each_entry(mod, &modules, list) { 219 list_for_each_entry(mod, &modules, list) {
276 struct symsearch arr[] = { 220 struct symsearch arr[] = {
277 { mod->syms, mod->syms + mod->num_syms, mod->crcs, 221 { mod->syms, mod->syms + mod->num_syms, mod->crcs,
278 always_ok }, 222 NOT_GPL_ONLY, false },
279 { mod->gpl_syms, mod->gpl_syms + mod->num_gpl_syms, 223 { mod->gpl_syms, mod->gpl_syms + mod->num_gpl_syms,
280 mod->gpl_crcs, gpl_only }, 224 mod->gpl_crcs,
225 GPL_ONLY, false },
281 { mod->gpl_future_syms, 226 { mod->gpl_future_syms,
282 mod->gpl_future_syms + mod->num_gpl_future_syms, 227 mod->gpl_future_syms + mod->num_gpl_future_syms,
283 mod->gpl_future_crcs, warn_if_not_gpl }, 228 mod->gpl_future_crcs,
229 WILL_BE_GPL_ONLY, false },
230#ifdef CONFIG_UNUSED_SYMBOLS
284 { mod->unused_syms, 231 { mod->unused_syms,
285 mod->unused_syms + mod->num_unused_syms, 232 mod->unused_syms + mod->num_unused_syms,
286 mod->unused_crcs, printk_unused_warning }, 233 mod->unused_crcs,
234 NOT_GPL_ONLY, true },
287 { mod->unused_gpl_syms, 235 { mod->unused_gpl_syms,
288 mod->unused_gpl_syms + mod->num_unused_gpl_syms, 236 mod->unused_gpl_syms + mod->num_unused_gpl_syms,
289 mod->unused_gpl_crcs, gpl_only_unused_warning }, 237 mod->unused_gpl_crcs,
238 GPL_ONLY, true },
239#endif
290 }; 240 };
291 241
292 ks = search_symarrays(arr, ARRAY_SIZE(arr), 242 if (each_symbol_in_section(arr, ARRAY_SIZE(arr), mod, fn, data))
293 name, gplok, warn, crc); 243 return true;
294 if (ks) { 244 }
295 if (owner) 245 return false;
296 *owner = mod; 246}
297 return ks->value; 247
248struct find_symbol_arg {
249 /* Input */
250 const char *name;
251 bool gplok;
252 bool warn;
253
254 /* Output */
255 struct module *owner;
256 const unsigned long *crc;
257 unsigned long value;
258};
259
260static bool find_symbol_in_section(const struct symsearch *syms,
261 struct module *owner,
262 unsigned int symnum, void *data)
263{
264 struct find_symbol_arg *fsa = data;
265
266 if (strcmp(syms->start[symnum].name, fsa->name) != 0)
267 return false;
268
269 if (!fsa->gplok) {
270 if (syms->licence == GPL_ONLY)
271 return false;
272 if (syms->licence == WILL_BE_GPL_ONLY && fsa->warn) {
273 printk(KERN_WARNING "Symbol %s is being used "
274 "by a non-GPL module, which will not "
275 "be allowed in the future\n", fsa->name);
276 printk(KERN_WARNING "Please see the file "
277 "Documentation/feature-removal-schedule.txt "
278 "in the kernel source tree for more details.\n");
298 } 279 }
299 } 280 }
300 281
282#ifdef CONFIG_UNUSED_SYMBOLS
283 if (syms->unused && fsa->warn) {
284 printk(KERN_WARNING "Symbol %s is marked as UNUSED, "
285 "however this module is using it.\n", fsa->name);
286 printk(KERN_WARNING
287 "This symbol will go away in the future.\n");
288 printk(KERN_WARNING
289 "Please evalute if this is the right api to use and if "
290 "it really is, submit a report the linux kernel "
291 "mailinglist together with submitting your code for "
292 "inclusion.\n");
293 }
294#endif
295
296 fsa->owner = owner;
297 fsa->crc = symversion(syms->crcs, symnum);
298 fsa->value = syms->start[symnum].value;
299 return true;
300}
301
302/* Find a symbol, return value, (optional) crc and (optional) module
303 * which owns it */
304static unsigned long find_symbol(const char *name,
305 struct module **owner,
306 const unsigned long **crc,
307 bool gplok,
308 bool warn)
309{
310 struct find_symbol_arg fsa;
311
312 fsa.name = name;
313 fsa.gplok = gplok;
314 fsa.warn = warn;
315
316 if (each_symbol(find_symbol_in_section, &fsa)) {
317 if (owner)
318 *owner = fsa.owner;
319 if (crc)
320 *crc = fsa.crc;
321 return fsa.value;
322 }
323
301 DEBUGP("Failed to find symbol %s\n", name); 324 DEBUGP("Failed to find symbol %s\n", name);
302 return -ENOENT; 325 return -ENOENT;
303} 326}
@@ -639,8 +662,8 @@ static int __try_stop_module(void *_sref)
639{ 662{
640 struct stopref *sref = _sref; 663 struct stopref *sref = _sref;
641 664
642 /* If it's not unused, quit unless we are told to block. */ 665 /* If it's not unused, quit unless we're forcing. */
643 if ((sref->flags & O_NONBLOCK) && module_refcount(sref->mod) != 0) { 666 if (module_refcount(sref->mod) != 0) {
644 if (!(*sref->forced = try_force_unload(sref->flags))) 667 if (!(*sref->forced = try_force_unload(sref->flags)))
645 return -EWOULDBLOCK; 668 return -EWOULDBLOCK;
646 } 669 }
@@ -652,9 +675,16 @@ static int __try_stop_module(void *_sref)
652 675
653static int try_stop_module(struct module *mod, int flags, int *forced) 676static int try_stop_module(struct module *mod, int flags, int *forced)
654{ 677{
655 struct stopref sref = { mod, flags, forced }; 678 if (flags & O_NONBLOCK) {
679 struct stopref sref = { mod, flags, forced };
656 680
657 return stop_machine_run(__try_stop_module, &sref, NR_CPUS); 681 return stop_machine(__try_stop_module, &sref, NULL);
682 } else {
683 /* We don't need to stop the machine for this. */
684 mod->state = MODULE_STATE_GOING;
685 synchronize_sched();
686 return 0;
687 }
658} 688}
659 689
660unsigned int module_refcount(struct module *mod) 690unsigned int module_refcount(struct module *mod)
@@ -1386,7 +1416,7 @@ static int __unlink_module(void *_mod)
1386static void free_module(struct module *mod) 1416static void free_module(struct module *mod)
1387{ 1417{
1388 /* Delete from various lists */ 1418 /* Delete from various lists */
1389 stop_machine_run(__unlink_module, mod, NR_CPUS); 1419 stop_machine(__unlink_module, mod, NULL);
1390 remove_notes_attrs(mod); 1420 remove_notes_attrs(mod);
1391 remove_sect_attrs(mod); 1421 remove_sect_attrs(mod);
1392 mod_kobject_remove(mod); 1422 mod_kobject_remove(mod);
@@ -1445,8 +1475,10 @@ static int verify_export_symbols(struct module *mod)
1445 { mod->syms, mod->num_syms }, 1475 { mod->syms, mod->num_syms },
1446 { mod->gpl_syms, mod->num_gpl_syms }, 1476 { mod->gpl_syms, mod->num_gpl_syms },
1447 { mod->gpl_future_syms, mod->num_gpl_future_syms }, 1477 { mod->gpl_future_syms, mod->num_gpl_future_syms },
1478#ifdef CONFIG_UNUSED_SYMBOLS
1448 { mod->unused_syms, mod->num_unused_syms }, 1479 { mod->unused_syms, mod->num_unused_syms },
1449 { mod->unused_gpl_syms, mod->num_unused_gpl_syms }, 1480 { mod->unused_gpl_syms, mod->num_unused_gpl_syms },
1481#endif
1450 }; 1482 };
1451 1483
1452 for (i = 0; i < ARRAY_SIZE(arr); i++) { 1484 for (i = 0; i < ARRAY_SIZE(arr); i++) {
@@ -1526,7 +1558,7 @@ static int simplify_symbols(Elf_Shdr *sechdrs,
1526} 1558}
1527 1559
1528/* Update size with this section: return offset. */ 1560/* Update size with this section: return offset. */
1529static long get_offset(unsigned long *size, Elf_Shdr *sechdr) 1561static long get_offset(unsigned int *size, Elf_Shdr *sechdr)
1530{ 1562{
1531 long ret; 1563 long ret;
1532 1564
@@ -1659,6 +1691,19 @@ static void setup_modinfo(struct module *mod, Elf_Shdr *sechdrs,
1659} 1691}
1660 1692
1661#ifdef CONFIG_KALLSYMS 1693#ifdef CONFIG_KALLSYMS
1694
1695/* lookup symbol in given range of kernel_symbols */
1696static const struct kernel_symbol *lookup_symbol(const char *name,
1697 const struct kernel_symbol *start,
1698 const struct kernel_symbol *stop)
1699{
1700 const struct kernel_symbol *ks = start;
1701 for (; ks < stop; ks++)
1702 if (strcmp(ks->name, name) == 0)
1703 return ks;
1704 return NULL;
1705}
1706
1662static int is_exported(const char *name, const struct module *mod) 1707static int is_exported(const char *name, const struct module *mod)
1663{ 1708{
1664 if (!mod && lookup_symbol(name, __start___ksymtab, __stop___ksymtab)) 1709 if (!mod && lookup_symbol(name, __start___ksymtab, __stop___ksymtab))
@@ -1738,9 +1783,23 @@ static inline void add_kallsyms(struct module *mod,
1738} 1783}
1739#endif /* CONFIG_KALLSYMS */ 1784#endif /* CONFIG_KALLSYMS */
1740 1785
1786static void *module_alloc_update_bounds(unsigned long size)
1787{
1788 void *ret = module_alloc(size);
1789
1790 if (ret) {
1791 /* Update module bounds. */
1792 if ((unsigned long)ret < module_addr_min)
1793 module_addr_min = (unsigned long)ret;
1794 if ((unsigned long)ret + size > module_addr_max)
1795 module_addr_max = (unsigned long)ret + size;
1796 }
1797 return ret;
1798}
1799
1741/* Allocate and load the module: note that size of section 0 is always 1800/* Allocate and load the module: note that size of section 0 is always
1742 zero, and we rely on this for optional sections. */ 1801 zero, and we rely on this for optional sections. */
1743static struct module *load_module(void __user *umod, 1802static noinline struct module *load_module(void __user *umod,
1744 unsigned long len, 1803 unsigned long len,
1745 const char __user *uargs) 1804 const char __user *uargs)
1746{ 1805{
@@ -1764,10 +1823,12 @@ static struct module *load_module(void __user *umod,
1764 unsigned int gplfutureindex; 1823 unsigned int gplfutureindex;
1765 unsigned int gplfuturecrcindex; 1824 unsigned int gplfuturecrcindex;
1766 unsigned int unwindex = 0; 1825 unsigned int unwindex = 0;
1826#ifdef CONFIG_UNUSED_SYMBOLS
1767 unsigned int unusedindex; 1827 unsigned int unusedindex;
1768 unsigned int unusedcrcindex; 1828 unsigned int unusedcrcindex;
1769 unsigned int unusedgplindex; 1829 unsigned int unusedgplindex;
1770 unsigned int unusedgplcrcindex; 1830 unsigned int unusedgplcrcindex;
1831#endif
1771 unsigned int markersindex; 1832 unsigned int markersindex;
1772 unsigned int markersstringsindex; 1833 unsigned int markersstringsindex;
1773 struct module *mod; 1834 struct module *mod;
@@ -1850,13 +1911,15 @@ static struct module *load_module(void __user *umod,
1850 exportindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab"); 1911 exportindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab");
1851 gplindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_gpl"); 1912 gplindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_gpl");
1852 gplfutureindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_gpl_future"); 1913 gplfutureindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_gpl_future");
1853 unusedindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_unused");
1854 unusedgplindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_unused_gpl");
1855 crcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab"); 1914 crcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab");
1856 gplcrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_gpl"); 1915 gplcrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_gpl");
1857 gplfuturecrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_gpl_future"); 1916 gplfuturecrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_gpl_future");
1917#ifdef CONFIG_UNUSED_SYMBOLS
1918 unusedindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_unused");
1919 unusedgplindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_unused_gpl");
1858 unusedcrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_unused"); 1920 unusedcrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_unused");
1859 unusedgplcrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_unused_gpl"); 1921 unusedgplcrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_unused_gpl");
1922#endif
1860 setupindex = find_sec(hdr, sechdrs, secstrings, "__param"); 1923 setupindex = find_sec(hdr, sechdrs, secstrings, "__param");
1861 exindex = find_sec(hdr, sechdrs, secstrings, "__ex_table"); 1924 exindex = find_sec(hdr, sechdrs, secstrings, "__ex_table");
1862 obsparmindex = find_sec(hdr, sechdrs, secstrings, "__obsparm"); 1925 obsparmindex = find_sec(hdr, sechdrs, secstrings, "__obsparm");
@@ -1935,7 +1998,7 @@ static struct module *load_module(void __user *umod,
1935 layout_sections(mod, hdr, sechdrs, secstrings); 1998 layout_sections(mod, hdr, sechdrs, secstrings);
1936 1999
1937 /* Do the allocs. */ 2000 /* Do the allocs. */
1938 ptr = module_alloc(mod->core_size); 2001 ptr = module_alloc_update_bounds(mod->core_size);
1939 if (!ptr) { 2002 if (!ptr) {
1940 err = -ENOMEM; 2003 err = -ENOMEM;
1941 goto free_percpu; 2004 goto free_percpu;
@@ -1943,7 +2006,7 @@ static struct module *load_module(void __user *umod,
1943 memset(ptr, 0, mod->core_size); 2006 memset(ptr, 0, mod->core_size);
1944 mod->module_core = ptr; 2007 mod->module_core = ptr;
1945 2008
1946 ptr = module_alloc(mod->init_size); 2009 ptr = module_alloc_update_bounds(mod->init_size);
1947 if (!ptr && mod->init_size) { 2010 if (!ptr && mod->init_size) {
1948 err = -ENOMEM; 2011 err = -ENOMEM;
1949 goto free_core; 2012 goto free_core;
@@ -2018,14 +2081,15 @@ static struct module *load_module(void __user *umod,
2018 mod->gpl_crcs = (void *)sechdrs[gplcrcindex].sh_addr; 2081 mod->gpl_crcs = (void *)sechdrs[gplcrcindex].sh_addr;
2019 mod->num_gpl_future_syms = sechdrs[gplfutureindex].sh_size / 2082 mod->num_gpl_future_syms = sechdrs[gplfutureindex].sh_size /
2020 sizeof(*mod->gpl_future_syms); 2083 sizeof(*mod->gpl_future_syms);
2021 mod->num_unused_syms = sechdrs[unusedindex].sh_size /
2022 sizeof(*mod->unused_syms);
2023 mod->num_unused_gpl_syms = sechdrs[unusedgplindex].sh_size /
2024 sizeof(*mod->unused_gpl_syms);
2025 mod->gpl_future_syms = (void *)sechdrs[gplfutureindex].sh_addr; 2084 mod->gpl_future_syms = (void *)sechdrs[gplfutureindex].sh_addr;
2026 if (gplfuturecrcindex) 2085 if (gplfuturecrcindex)
2027 mod->gpl_future_crcs = (void *)sechdrs[gplfuturecrcindex].sh_addr; 2086 mod->gpl_future_crcs = (void *)sechdrs[gplfuturecrcindex].sh_addr;
2028 2087
2088#ifdef CONFIG_UNUSED_SYMBOLS
2089 mod->num_unused_syms = sechdrs[unusedindex].sh_size /
2090 sizeof(*mod->unused_syms);
2091 mod->num_unused_gpl_syms = sechdrs[unusedgplindex].sh_size /
2092 sizeof(*mod->unused_gpl_syms);
2029 mod->unused_syms = (void *)sechdrs[unusedindex].sh_addr; 2093 mod->unused_syms = (void *)sechdrs[unusedindex].sh_addr;
2030 if (unusedcrcindex) 2094 if (unusedcrcindex)
2031 mod->unused_crcs = (void *)sechdrs[unusedcrcindex].sh_addr; 2095 mod->unused_crcs = (void *)sechdrs[unusedcrcindex].sh_addr;
@@ -2033,13 +2097,17 @@ static struct module *load_module(void __user *umod,
2033 if (unusedgplcrcindex) 2097 if (unusedgplcrcindex)
2034 mod->unused_gpl_crcs 2098 mod->unused_gpl_crcs
2035 = (void *)sechdrs[unusedgplcrcindex].sh_addr; 2099 = (void *)sechdrs[unusedgplcrcindex].sh_addr;
2100#endif
2036 2101
2037#ifdef CONFIG_MODVERSIONS 2102#ifdef CONFIG_MODVERSIONS
2038 if ((mod->num_syms && !crcindex) || 2103 if ((mod->num_syms && !crcindex)
2039 (mod->num_gpl_syms && !gplcrcindex) || 2104 || (mod->num_gpl_syms && !gplcrcindex)
2040 (mod->num_gpl_future_syms && !gplfuturecrcindex) || 2105 || (mod->num_gpl_future_syms && !gplfuturecrcindex)
2041 (mod->num_unused_syms && !unusedcrcindex) || 2106#ifdef CONFIG_UNUSED_SYMBOLS
2042 (mod->num_unused_gpl_syms && !unusedgplcrcindex)) { 2107 || (mod->num_unused_syms && !unusedcrcindex)
2108 || (mod->num_unused_gpl_syms && !unusedgplcrcindex)
2109#endif
2110 ) {
2043 printk(KERN_WARNING "%s: No versions for exported symbols.\n", mod->name); 2111 printk(KERN_WARNING "%s: No versions for exported symbols.\n", mod->name);
2044 err = try_to_force_load(mod, "nocrc"); 2112 err = try_to_force_load(mod, "nocrc");
2045 if (err) 2113 if (err)
@@ -2129,7 +2197,7 @@ static struct module *load_module(void __user *umod,
2129 /* Now sew it into the lists so we can get lockdep and oops 2197 /* Now sew it into the lists so we can get lockdep and oops
2130 * info during argument parsing. Noone should access us, since 2198 * info during argument parsing. Noone should access us, since
2131 * strong_try_module_get() will fail. */ 2199 * strong_try_module_get() will fail. */
2132 stop_machine_run(__link_module, mod, NR_CPUS); 2200 stop_machine(__link_module, mod, NULL);
2133 2201
2134 /* Size of section 0 is 0, so this works well if no params */ 2202 /* Size of section 0 is 0, so this works well if no params */
2135 err = parse_args(mod->name, mod->args, 2203 err = parse_args(mod->name, mod->args,
@@ -2163,7 +2231,7 @@ static struct module *load_module(void __user *umod,
2163 return mod; 2231 return mod;
2164 2232
2165 unlink: 2233 unlink:
2166 stop_machine_run(__unlink_module, mod, NR_CPUS); 2234 stop_machine(__unlink_module, mod, NULL);
2167 module_arch_cleanup(mod); 2235 module_arch_cleanup(mod);
2168 cleanup: 2236 cleanup:
2169 kobject_del(&mod->mkobj.kobj); 2237 kobject_del(&mod->mkobj.kobj);
@@ -2220,7 +2288,7 @@ sys_init_module(void __user *umod,
2220 2288
2221 /* Start the module */ 2289 /* Start the module */
2222 if (mod->init != NULL) 2290 if (mod->init != NULL)
2223 ret = mod->init(); 2291 ret = do_one_initcall(mod->init);
2224 if (ret < 0) { 2292 if (ret < 0) {
2225 /* Init routine failed: abort. Try to protect us from 2293 /* Init routine failed: abort. Try to protect us from
2226 buggy refcounters. */ 2294 buggy refcounters. */
@@ -2512,7 +2580,7 @@ static int m_show(struct seq_file *m, void *p)
2512 struct module *mod = list_entry(p, struct module, list); 2580 struct module *mod = list_entry(p, struct module, list);
2513 char buf[8]; 2581 char buf[8];
2514 2582
2515 seq_printf(m, "%s %lu", 2583 seq_printf(m, "%s %u",
2516 mod->name, mod->init_size + mod->core_size); 2584 mod->name, mod->init_size + mod->core_size);
2517 print_unload_info(m, mod); 2585 print_unload_info(m, mod);
2518 2586
@@ -2595,6 +2663,9 @@ struct module *__module_text_address(unsigned long addr)
2595{ 2663{
2596 struct module *mod; 2664 struct module *mod;
2597 2665
2666 if (addr < module_addr_min || addr > module_addr_max)
2667 return NULL;
2668
2598 list_for_each_entry(mod, &modules, list) 2669 list_for_each_entry(mod, &modules, list)
2599 if (within(addr, mod->module_init, mod->init_text_size) 2670 if (within(addr, mod->module_init, mod->init_text_size)
2600 || within(addr, mod->module_core, mod->core_text_size)) 2671 || within(addr, mod->module_core, mod->core_text_size))
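[Note] The module_alloc_update_bounds() calls and the early-exit range check added above work together: every allocation widens the module_addr_min/module_addr_max window that __module_text_address() consults before walking the module list. A minimal sketch of the helper (illustrative, not necessarily the exact kernel code):

    static void *module_alloc_update_bounds(unsigned long size)
    {
            void *ret = module_alloc(size);

            if (ret) {
                    /* Grow the [module_addr_min, module_addr_max] window so
                     * address-to-module lookups can bail out early for
                     * addresses that cannot belong to any module. */
                    if ((unsigned long)ret < module_addr_min)
                            module_addr_min = (unsigned long)ret;
                    if ((unsigned long)ret + size > module_addr_max)
                            module_addr_max = (unsigned long)ret + size;
            }
            return ret;
    }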
diff --git a/kernel/mutex-debug.c b/kernel/mutex-debug.c
index 3aaa06c561de..1d94160eb532 100644
--- a/kernel/mutex-debug.c
+++ b/kernel/mutex-debug.c
@@ -79,8 +79,8 @@ void debug_mutex_unlock(struct mutex *lock)
79 if (unlikely(!debug_locks)) 79 if (unlikely(!debug_locks))
80 return; 80 return;
81 81
82 DEBUG_LOCKS_WARN_ON(lock->owner != current_thread_info());
83 DEBUG_LOCKS_WARN_ON(lock->magic != lock); 82 DEBUG_LOCKS_WARN_ON(lock->magic != lock);
83 DEBUG_LOCKS_WARN_ON(lock->owner != current_thread_info());
84 DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next); 84 DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next);
85 DEBUG_LOCKS_WARN_ON(lock->owner != current_thread_info()); 85 DEBUG_LOCKS_WARN_ON(lock->owner != current_thread_info());
86} 86}
diff --git a/kernel/mutex.c b/kernel/mutex.c
index d046a345d365..12c779dc65d4 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -34,6 +34,7 @@
34/*** 34/***
35 * mutex_init - initialize the mutex 35 * mutex_init - initialize the mutex
36 * @lock: the mutex to be initialized 36 * @lock: the mutex to be initialized
37 * @key: the lock_class_key for the class; used by mutex lock debugging
37 * 38 *
38 * Initialize the mutex to unlocked state. 39 * Initialize the mutex to unlocked state.
39 * 40 *
@@ -165,10 +166,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
165 * got a signal? (This code gets eliminated in the 166 * got a signal? (This code gets eliminated in the
166 * TASK_UNINTERRUPTIBLE case.) 167 * TASK_UNINTERRUPTIBLE case.)
167 */ 168 */
168 if (unlikely((state == TASK_INTERRUPTIBLE && 169 if (unlikely(signal_pending_state(state, task))) {
169 signal_pending(task)) ||
170 (state == TASK_KILLABLE &&
171 fatal_signal_pending(task)))) {
172 mutex_remove_waiter(lock, &waiter, 170 mutex_remove_waiter(lock, &waiter,
173 task_thread_info(task)); 171 task_thread_info(task));
174 mutex_release(&lock->dep_map, 1, ip); 172 mutex_release(&lock->dep_map, 1, ip);
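[Note] The open-coded TASK_INTERRUPTIBLE/TASK_KILLABLE tests removed above are folded into signal_pending_state(). Roughly, the helper behaves like the following sketch (an approximation, assuming TASK_KILLABLE is TASK_WAKEKILL | TASK_UNINTERRUPTIBLE):

    static inline int signal_pending_state(long state, struct task_struct *p)
    {
            /* Only interruptible or killable sleeps care about signals. */
            if (!(state & (TASK_INTERRUPTIBLE | TASK_WAKEKILL)))
                    return 0;
            if (!signal_pending(p))
                    return 0;
            /* Interruptible sleeps break on any signal,
             * killable sleeps only on fatal ones. */
            return (state & TASK_INTERRUPTIBLE) || __fatal_signal_pending(p);
    }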
diff --git a/kernel/ns_cgroup.c b/kernel/ns_cgroup.c
index 48d7ed6fc3a4..43c2111cd54d 100644
--- a/kernel/ns_cgroup.c
+++ b/kernel/ns_cgroup.c
@@ -7,6 +7,7 @@
7#include <linux/module.h> 7#include <linux/module.h>
8#include <linux/cgroup.h> 8#include <linux/cgroup.h>
9#include <linux/fs.h> 9#include <linux/fs.h>
10#include <linux/proc_fs.h>
10#include <linux/slab.h> 11#include <linux/slab.h>
11#include <linux/nsproxy.h> 12#include <linux/nsproxy.h>
12 13
@@ -24,9 +25,12 @@ static inline struct ns_cgroup *cgroup_to_ns(
24 struct ns_cgroup, css); 25 struct ns_cgroup, css);
25} 26}
26 27
27int ns_cgroup_clone(struct task_struct *task) 28int ns_cgroup_clone(struct task_struct *task, struct pid *pid)
28{ 29{
29 return cgroup_clone(task, &ns_subsys); 30 char name[PROC_NUMBUF];
31
32 snprintf(name, PROC_NUMBUF, "%d", pid_vnr(pid));
33 return cgroup_clone(task, &ns_subsys, name);
30} 34}
31 35
32/* 36/*
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index adc785146a1c..1d3ef29a2583 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -14,7 +14,6 @@
14 */ 14 */
15 15
16#include <linux/module.h> 16#include <linux/module.h>
17#include <linux/version.h>
18#include <linux/nsproxy.h> 17#include <linux/nsproxy.h>
19#include <linux/init_task.h> 18#include <linux/init_task.h>
20#include <linux/mnt_namespace.h> 19#include <linux/mnt_namespace.h>
@@ -157,12 +156,6 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
157 goto out; 156 goto out;
158 } 157 }
159 158
160 err = ns_cgroup_clone(tsk);
161 if (err) {
162 put_nsproxy(new_ns);
163 goto out;
164 }
165
166 tsk->nsproxy = new_ns; 159 tsk->nsproxy = new_ns;
167 160
168out: 161out:
@@ -209,7 +202,7 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags,
209 goto out; 202 goto out;
210 } 203 }
211 204
212 err = ns_cgroup_clone(current); 205 err = ns_cgroup_clone(current, task_pid(current));
213 if (err) 206 if (err)
214 put_nsproxy(*new_nsp); 207 put_nsproxy(*new_nsp);
215 208
diff --git a/kernel/panic.c b/kernel/panic.c
index c35c9eca3eb2..e0a87bb025c0 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -321,6 +321,28 @@ void warn_on_slowpath(const char *file, int line)
321 add_taint(TAINT_WARN); 321 add_taint(TAINT_WARN);
322} 322}
323EXPORT_SYMBOL(warn_on_slowpath); 323EXPORT_SYMBOL(warn_on_slowpath);
324
325
326void warn_slowpath(const char *file, int line, const char *fmt, ...)
327{
328 va_list args;
329 char function[KSYM_SYMBOL_LEN];
330 unsigned long caller = (unsigned long)__builtin_return_address(0);
331 sprint_symbol(function, caller);
332
333 printk(KERN_WARNING "------------[ cut here ]------------\n");
334 printk(KERN_WARNING "WARNING: at %s:%d %s()\n", file,
335 line, function);
336 va_start(args, fmt);
337 vprintk(fmt, args);
338 va_end(args);
339
340 print_modules();
341 dump_stack();
342 print_oops_end_marker();
343 add_taint(TAINT_WARN);
344}
345EXPORT_SYMBOL(warn_slowpath);
324#endif 346#endif
325 347
326#ifdef CONFIG_CC_STACKPROTECTOR 348#ifdef CONFIG_CC_STACKPROTECTOR
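[Note] Unlike warn_on_slowpath(), the new warn_slowpath() lets the warning backtrace carry a formatted message. A hypothetical caller (the condition and values here are invented purely for illustration):

    if (unlikely(len > buf_size))
            warn_slowpath(__FILE__, __LINE__,
                          "truncating write: %zu > %zu bytes\n",
                          len, buf_size);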
diff --git a/kernel/pid.c b/kernel/pid.c
index 20d59fa2d493..064e76afa507 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -30,6 +30,7 @@
30#include <linux/module.h> 30#include <linux/module.h>
31#include <linux/slab.h> 31#include <linux/slab.h>
32#include <linux/init.h> 32#include <linux/init.h>
33#include <linux/rculist.h>
33#include <linux/bootmem.h> 34#include <linux/bootmem.h>
34#include <linux/hash.h> 35#include <linux/hash.h>
35#include <linux/pid_namespace.h> 36#include <linux/pid_namespace.h>
@@ -308,12 +309,6 @@ struct pid *find_vpid(int nr)
308} 309}
309EXPORT_SYMBOL_GPL(find_vpid); 310EXPORT_SYMBOL_GPL(find_vpid);
310 311
311struct pid *find_pid(int nr)
312{
313 return find_pid_ns(nr, &init_pid_ns);
314}
315EXPORT_SYMBOL_GPL(find_pid);
316
317/* 312/*
318 * attach_pid() must be called with the tasklist_lock write-held. 313 * attach_pid() must be called with the tasklist_lock write-held.
319 */ 314 */
@@ -434,6 +429,7 @@ struct pid *find_get_pid(pid_t nr)
434 429
435 return pid; 430 return pid;
436} 431}
432EXPORT_SYMBOL_GPL(find_get_pid);
437 433
438pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns) 434pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns)
439{ 435{
@@ -481,7 +477,7 @@ EXPORT_SYMBOL(task_session_nr_ns);
481/* 477/*
482 * Used by proc to find the first pid that is greater then or equal to nr. 478 * Used by proc to find the first pid that is greater then or equal to nr.
483 * 479 *
484 * If there is a pid at nr this function is exactly the same as find_pid. 480 * If there is a pid at nr this function is exactly the same as find_pid_ns.
485 */ 481 */
486struct pid *find_ge_pid(int nr, struct pid_namespace *ns) 482struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
487{ 483{
@@ -496,7 +492,6 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
496 492
497 return pid; 493 return pid;
498} 494}
499EXPORT_SYMBOL_GPL(find_get_pid);
500 495
501/* 496/*
502 * The pid hash table is scaled according to the amount of memory in the 497 * The pid hash table is scaled according to the amount of memory in the
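[Note] With find_pid() removed, lookups are expected to go through the namespace-aware helpers instead; a sketch of the usual replacement pattern (the task lookup shown is illustrative):

    struct pid *p;
    struct task_struct *task = NULL;

    rcu_read_lock();
    /* nr interpreted in the caller's pid namespace ... */
    p = find_vpid(nr);
    /* ... or explicitly in the initial namespace:
     * p = find_pid_ns(nr, &init_pid_ns); */
    if (p)
            task = pid_task(p, PIDTYPE_PID);
    rcu_read_unlock();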
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 98702b4b8851..fab8ea86fac3 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -12,6 +12,7 @@
12#include <linux/pid_namespace.h> 12#include <linux/pid_namespace.h>
13#include <linux/syscalls.h> 13#include <linux/syscalls.h>
14#include <linux/err.h> 14#include <linux/err.h>
15#include <linux/acct.h>
15 16
16#define BITS_PER_PAGE (PAGE_SIZE*8) 17#define BITS_PER_PAGE (PAGE_SIZE*8)
17 18
@@ -71,7 +72,7 @@ static struct pid_namespace *create_pid_namespace(unsigned int level)
71 struct pid_namespace *ns; 72 struct pid_namespace *ns;
72 int i; 73 int i;
73 74
74 ns = kmem_cache_alloc(pid_ns_cachep, GFP_KERNEL); 75 ns = kmem_cache_zalloc(pid_ns_cachep, GFP_KERNEL);
75 if (ns == NULL) 76 if (ns == NULL)
76 goto out; 77 goto out;
77 78
@@ -84,17 +85,13 @@ static struct pid_namespace *create_pid_namespace(unsigned int level)
84 goto out_free_map; 85 goto out_free_map;
85 86
86 kref_init(&ns->kref); 87 kref_init(&ns->kref);
87 ns->last_pid = 0;
88 ns->child_reaper = NULL;
89 ns->level = level; 88 ns->level = level;
90 89
91 set_bit(0, ns->pidmap[0].page); 90 set_bit(0, ns->pidmap[0].page);
92 atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1); 91 atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1);
93 92
94 for (i = 1; i < PIDMAP_ENTRIES; i++) { 93 for (i = 1; i < PIDMAP_ENTRIES; i++)
95 ns->pidmap[i].page = NULL;
96 atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE); 94 atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE);
97 }
98 95
99 return ns; 96 return ns;
100 97
@@ -182,9 +179,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
182 rc = sys_wait4(-1, NULL, __WALL, NULL); 179 rc = sys_wait4(-1, NULL, __WALL, NULL);
183 } while (rc != -ECHILD); 180 } while (rc != -ECHILD);
184 181
185 182 acct_exit_ns(pid_ns);
186 /* Child reaper for the pid namespace is going away */
187 pid_ns->child_reaper = NULL;
188 return; 183 return;
189} 184}
190 185
diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c
index 0afe32be4c85..dfdec524d1b7 100644
--- a/kernel/pm_qos_params.c
+++ b/kernel/pm_qos_params.c
@@ -24,11 +24,12 @@
24 * requirement that the application has is cleaned up when closes the file 24 * requirement that the application has is cleaned up when closes the file
25 * pointer or exits the pm_qos_object will get an opportunity to clean up. 25 * pointer or exits the pm_qos_object will get an opportunity to clean up.
26 * 26 *
27 * mark gross mgross@linux.intel.com 27 * Mark Gross <mgross@linux.intel.com>
28 */ 28 */
29 29
30#include <linux/pm_qos_params.h> 30#include <linux/pm_qos_params.h>
31#include <linux/sched.h> 31#include <linux/sched.h>
32#include <linux/smp_lock.h>
32#include <linux/spinlock.h> 33#include <linux/spinlock.h>
33#include <linux/slab.h> 34#include <linux/slab.h>
34#include <linux/time.h> 35#include <linux/time.h>
@@ -42,7 +43,7 @@
42#include <linux/uaccess.h> 43#include <linux/uaccess.h>
43 44
44/* 45/*
45 * locking rule: all changes to target_value or requirements or notifiers lists 46 * locking rule: all changes to requirements or notifiers lists
46 * or pm_qos_object list and pm_qos_objects need to happen with pm_qos_lock 47 * or pm_qos_object list and pm_qos_objects need to happen with pm_qos_lock
47 * held, taken with _irqsave. One lock to rule them all 48 * held, taken with _irqsave. One lock to rule them all
48 */ 49 */
@@ -65,7 +66,7 @@ struct pm_qos_object {
65 struct miscdevice pm_qos_power_miscdev; 66 struct miscdevice pm_qos_power_miscdev;
66 char *name; 67 char *name;
67 s32 default_value; 68 s32 default_value;
68 s32 target_value; 69 atomic_t target_value;
69 s32 (*comparitor)(s32, s32); 70 s32 (*comparitor)(s32, s32);
70}; 71};
71 72
@@ -76,7 +77,7 @@ static struct pm_qos_object cpu_dma_pm_qos = {
76 .notifiers = &cpu_dma_lat_notifier, 77 .notifiers = &cpu_dma_lat_notifier,
77 .name = "cpu_dma_latency", 78 .name = "cpu_dma_latency",
78 .default_value = 2000 * USEC_PER_SEC, 79 .default_value = 2000 * USEC_PER_SEC,
79 .target_value = 2000 * USEC_PER_SEC, 80 .target_value = ATOMIC_INIT(2000 * USEC_PER_SEC),
80 .comparitor = min_compare 81 .comparitor = min_compare
81}; 82};
82 83
@@ -86,7 +87,7 @@ static struct pm_qos_object network_lat_pm_qos = {
86 .notifiers = &network_lat_notifier, 87 .notifiers = &network_lat_notifier,
87 .name = "network_latency", 88 .name = "network_latency",
88 .default_value = 2000 * USEC_PER_SEC, 89 .default_value = 2000 * USEC_PER_SEC,
89 .target_value = 2000 * USEC_PER_SEC, 90 .target_value = ATOMIC_INIT(2000 * USEC_PER_SEC),
90 .comparitor = min_compare 91 .comparitor = min_compare
91}; 92};
92 93
@@ -98,7 +99,7 @@ static struct pm_qos_object network_throughput_pm_qos = {
98 .notifiers = &network_throughput_notifier, 99 .notifiers = &network_throughput_notifier,
99 .name = "network_throughput", 100 .name = "network_throughput",
100 .default_value = 0, 101 .default_value = 0,
101 .target_value = 0, 102 .target_value = ATOMIC_INIT(0),
102 .comparitor = max_compare 103 .comparitor = max_compare
103}; 104};
104 105
@@ -149,11 +150,11 @@ static void update_target(int target)
149 extreme_value = pm_qos_array[target]->comparitor( 150 extreme_value = pm_qos_array[target]->comparitor(
150 extreme_value, node->value); 151 extreme_value, node->value);
151 } 152 }
152 if (pm_qos_array[target]->target_value != extreme_value) { 153 if (atomic_read(&pm_qos_array[target]->target_value) != extreme_value) {
153 call_notifier = 1; 154 call_notifier = 1;
154 pm_qos_array[target]->target_value = extreme_value; 155 atomic_set(&pm_qos_array[target]->target_value, extreme_value);
155 pr_debug(KERN_ERR "new target for qos %d is %d\n", target, 156 pr_debug(KERN_ERR "new target for qos %d is %d\n", target,
156 pm_qos_array[target]->target_value); 157 atomic_read(&pm_qos_array[target]->target_value));
157 } 158 }
158 spin_unlock_irqrestore(&pm_qos_lock, flags); 159 spin_unlock_irqrestore(&pm_qos_lock, flags);
159 160
@@ -192,14 +193,7 @@ static int find_pm_qos_object_by_minor(int minor)
192 */ 193 */
193int pm_qos_requirement(int pm_qos_class) 194int pm_qos_requirement(int pm_qos_class)
194{ 195{
195 int ret_val; 196 return atomic_read(&pm_qos_array[pm_qos_class]->target_value);
196 unsigned long flags;
197
198 spin_lock_irqsave(&pm_qos_lock, flags);
199 ret_val = pm_qos_array[pm_qos_class]->target_value;
200 spin_unlock_irqrestore(&pm_qos_lock, flags);
201
202 return ret_val;
203} 197}
204EXPORT_SYMBOL_GPL(pm_qos_requirement); 198EXPORT_SYMBOL_GPL(pm_qos_requirement);
205 199
@@ -210,8 +204,8 @@ EXPORT_SYMBOL_GPL(pm_qos_requirement);
210 * @value: defines the qos request 204 * @value: defines the qos request
211 * 205 *
212 * This function inserts a new entry in the pm_qos_class list of requested qos 206 * This function inserts a new entry in the pm_qos_class list of requested qos
213 * performance charactoistics. It recomputes the agregate QoS expectations for 207 * performance characteristics. It recomputes the aggregate QoS expectations
214 * the pm_qos_class of parrameters. 208 * for the pm_qos_class of parameters.
215 */ 209 */
216int pm_qos_add_requirement(int pm_qos_class, char *name, s32 value) 210int pm_qos_add_requirement(int pm_qos_class, char *name, s32 value)
217{ 211{
@@ -249,10 +243,10 @@ EXPORT_SYMBOL_GPL(pm_qos_add_requirement);
249 * @name: identifies the request 243 * @name: identifies the request
250 * @value: defines the qos request 244 * @value: defines the qos request
251 * 245 *
252 * Updates an existing qos requierement for the pm_qos_class of parameters along 246 * Updates an existing qos requirement for the pm_qos_class of parameters along
253 * with updating the target pm_qos_class value. 247 * with updating the target pm_qos_class value.
254 * 248 *
255 * If the named request isn't in the lest then no change is made. 249 * If the named request isn't in the list then no change is made.
256 */ 250 */
257int pm_qos_update_requirement(int pm_qos_class, char *name, s32 new_value) 251int pm_qos_update_requirement(int pm_qos_class, char *name, s32 new_value)
258{ 252{
@@ -286,7 +280,7 @@ EXPORT_SYMBOL_GPL(pm_qos_update_requirement);
286 * @pm_qos_class: identifies which list of qos request to us 280 * @pm_qos_class: identifies which list of qos request to us
287 * @name: identifies the request 281 * @name: identifies the request
288 * 282 *
289 * Will remove named qos request from pm_qos_class list of parrameters and 283 * Will remove named qos request from pm_qos_class list of parameters and
290 * recompute the current target value for the pm_qos_class. 284 * recompute the current target value for the pm_qos_class.
291 */ 285 */
292void pm_qos_remove_requirement(int pm_qos_class, char *name) 286void pm_qos_remove_requirement(int pm_qos_class, char *name)
@@ -318,7 +312,7 @@ EXPORT_SYMBOL_GPL(pm_qos_remove_requirement);
318 * @notifier: notifier block managed by caller. 312 * @notifier: notifier block managed by caller.
319 * 313 *
320 * will register the notifier into a notification chain that gets called 314 * will register the notifier into a notification chain that gets called
321 * uppon changes to the pm_qos_class target value. 315 * upon changes to the pm_qos_class target value.
322 */ 316 */
323 int pm_qos_add_notifier(int pm_qos_class, struct notifier_block *notifier) 317 int pm_qos_add_notifier(int pm_qos_class, struct notifier_block *notifier)
324{ 318{
@@ -337,7 +331,7 @@ EXPORT_SYMBOL_GPL(pm_qos_add_notifier);
337 * @notifier: notifier block to be removed. 331 * @notifier: notifier block to be removed.
338 * 332 *
339 * will remove the notifier from the notification chain that gets called 333 * will remove the notifier from the notification chain that gets called
340 * uppon changes to the pm_qos_class target value. 334 * upon changes to the pm_qos_class target value.
341 */ 335 */
342int pm_qos_remove_notifier(int pm_qos_class, struct notifier_block *notifier) 336int pm_qos_remove_notifier(int pm_qos_class, struct notifier_block *notifier)
343{ 337{
@@ -358,15 +352,19 @@ static int pm_qos_power_open(struct inode *inode, struct file *filp)
358 int ret; 352 int ret;
359 long pm_qos_class; 353 long pm_qos_class;
360 354
355 lock_kernel();
361 pm_qos_class = find_pm_qos_object_by_minor(iminor(inode)); 356 pm_qos_class = find_pm_qos_object_by_minor(iminor(inode));
362 if (pm_qos_class >= 0) { 357 if (pm_qos_class >= 0) {
363 filp->private_data = (void *)pm_qos_class; 358 filp->private_data = (void *)pm_qos_class;
364 sprintf(name, "process_%d", current->pid); 359 sprintf(name, "process_%d", current->pid);
365 ret = pm_qos_add_requirement(pm_qos_class, name, 360 ret = pm_qos_add_requirement(pm_qos_class, name,
366 PM_QOS_DEFAULT_VALUE); 361 PM_QOS_DEFAULT_VALUE);
367 if (ret >= 0) 362 if (ret >= 0) {
363 unlock_kernel();
368 return 0; 364 return 0;
365 }
369 } 366 }
367 unlock_kernel();
370 368
371 return -EPERM; 369 return -EPERM;
372} 370}
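[Note] With target_value now an atomic_t, pm_qos_requirement() no longer takes pm_qos_lock and can be polled from hot paths. A hypothetical reader, e.g. an idle-selection heuristic (the threshold and policy variable are invented for illustration):

    int latency_us = pm_qos_requirement(PM_QOS_CPU_DMA_LATENCY);

    /* Lockless read of the aggregated constraint; pick a shallower
     * idle state when someone has requested tight DMA latency. */
    if (latency_us < 100)
            max_state = SHALLOW_C_STATE;    /* hypothetical policy knob */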
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index f1525ad06cb3..c42a03aef36f 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -1037,6 +1037,9 @@ static void check_thread_timers(struct task_struct *tsk,
1037 sig->rlim[RLIMIT_RTTIME].rlim_cur += 1037 sig->rlim[RLIMIT_RTTIME].rlim_cur +=
1038 USEC_PER_SEC; 1038 USEC_PER_SEC;
1039 } 1039 }
1040 printk(KERN_INFO
1041 "RT Watchdog Timeout: %s[%d]\n",
1042 tsk->comm, task_pid_nr(tsk));
1040 __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk); 1043 __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk);
1041 } 1044 }
1042 } 1045 }
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index dbd8398ddb0b..5131e5471169 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -289,21 +289,29 @@ void do_schedule_next_timer(struct siginfo *info)
289 else 289 else
290 schedule_next_timer(timr); 290 schedule_next_timer(timr);
291 291
292 info->si_overrun = timr->it_overrun_last; 292 info->si_overrun += timr->it_overrun_last;
293 } 293 }
294 294
295 if (timr) 295 if (timr)
296 unlock_timer(timr, flags); 296 unlock_timer(timr, flags);
297} 297}
298 298
299int posix_timer_event(struct k_itimer *timr,int si_private) 299int posix_timer_event(struct k_itimer *timr, int si_private)
300{ 300{
301 memset(&timr->sigq->info, 0, sizeof(siginfo_t)); 301 /*
302 * FIXME: if ->sigq is queued we can race with
303 * dequeue_signal()->do_schedule_next_timer().
304 *
305 * If dequeue_signal() sees the "right" value of
306 * si_sys_private it calls do_schedule_next_timer().
307 * We re-queue ->sigq and drop ->it_lock().
308 * do_schedule_next_timer() locks the timer
309 * and re-schedules it while ->sigq is pending.
310 * Not really bad, but not that we want.
311 */
302 timr->sigq->info.si_sys_private = si_private; 312 timr->sigq->info.si_sys_private = si_private;
303 /* Send signal to the process that owns this timer.*/
304 313
305 timr->sigq->info.si_signo = timr->it_sigev_signo; 314 timr->sigq->info.si_signo = timr->it_sigev_signo;
306 timr->sigq->info.si_errno = 0;
307 timr->sigq->info.si_code = SI_TIMER; 315 timr->sigq->info.si_code = SI_TIMER;
308 timr->sigq->info.si_tid = timr->it_id; 316 timr->sigq->info.si_tid = timr->it_id;
309 timr->sigq->info.si_value = timr->it_sigev_value; 317 timr->sigq->info.si_value = timr->it_sigev_value;
@@ -433,8 +441,9 @@ static struct k_itimer * alloc_posix_timer(void)
433 return tmr; 441 return tmr;
434 if (unlikely(!(tmr->sigq = sigqueue_alloc()))) { 442 if (unlikely(!(tmr->sigq = sigqueue_alloc()))) {
435 kmem_cache_free(posix_timers_cache, tmr); 443 kmem_cache_free(posix_timers_cache, tmr);
436 tmr = NULL; 444 return NULL;
437 } 445 }
446 memset(&tmr->sigq->info, 0, sizeof(siginfo_t));
438 return tmr; 447 return tmr;
439} 448}
440 449
@@ -449,9 +458,6 @@ static void release_posix_timer(struct k_itimer *tmr, int it_id_set)
449 spin_unlock_irqrestore(&idr_lock, flags); 458 spin_unlock_irqrestore(&idr_lock, flags);
450 } 459 }
451 sigqueue_free(tmr->sigq); 460 sigqueue_free(tmr->sigq);
452 if (unlikely(tmr->it_process) &&
453 tmr->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID))
454 put_task_struct(tmr->it_process);
455 kmem_cache_free(posix_timers_cache, tmr); 461 kmem_cache_free(posix_timers_cache, tmr);
456} 462}
457 463
@@ -856,11 +862,10 @@ retry_delete:
856 * This keeps any tasks waiting on the spin lock from thinking 862 * This keeps any tasks waiting on the spin lock from thinking
857 * they got something (see the lock code above). 863 * they got something (see the lock code above).
858 */ 864 */
859 if (timer->it_process) { 865 if (timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID))
860 if (timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID)) 866 put_task_struct(timer->it_process);
861 put_task_struct(timer->it_process); 867 timer->it_process = NULL;
862 timer->it_process = NULL; 868
863 }
864 unlock_timer(timer, flags); 869 unlock_timer(timer, flags);
865 release_posix_timer(timer, IT_ID_SET); 870 release_posix_timer(timer, IT_ID_SET);
866 return 0; 871 return 0;
@@ -885,11 +890,10 @@ retry_delete:
885 * This keeps any tasks waiting on the spin lock from thinking 890 * This keeps any tasks waiting on the spin lock from thinking
886 * they got something (see the lock code above). 891 * they got something (see the lock code above).
887 */ 892 */
888 if (timer->it_process) { 893 if (timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID))
889 if (timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID)) 894 put_task_struct(timer->it_process);
890 put_task_struct(timer->it_process); 895 timer->it_process = NULL;
891 timer->it_process = NULL; 896
892 }
893 unlock_timer(timer, flags); 897 unlock_timer(timer, flags);
894 release_posix_timer(timer, IT_ID_SET); 898 release_posix_timer(timer, IT_ID_SET);
895} 899}
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index b45da40e8d25..dcd165f92a88 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -82,7 +82,7 @@ config PM_SLEEP_SMP
82 82
83config PM_SLEEP 83config PM_SLEEP
84 bool 84 bool
85 depends on SUSPEND || HIBERNATION 85 depends on SUSPEND || HIBERNATION || XEN_SAVE_RESTORE
86 default y 86 default y
87 87
88config SUSPEND 88config SUSPEND
@@ -94,6 +94,17 @@ config SUSPEND
94 powered and thus its contents are preserved, such as the 94 powered and thus its contents are preserved, such as the
95 suspend-to-RAM state (e.g. the ACPI S3 state). 95 suspend-to-RAM state (e.g. the ACPI S3 state).
96 96
97config PM_TEST_SUSPEND
98 bool "Test suspend/resume and wakealarm during bootup"
99 depends on SUSPEND && PM_DEBUG && RTC_LIB=y
100 ---help---
101 This option will let you suspend your machine during bootup, and
102 make it wake up a few seconds later using an RTC wakeup alarm.
103 Enable this with a kernel parameter like "test_suspend=mem".
104
105 You probably want to have your system's RTC driver statically
106 linked, ensuring that it's available when this test runs.
107
97config SUSPEND_FREEZER 108config SUSPEND_FREEZER
98 bool "Enable freezer for suspend to RAM/standby" \ 109 bool "Enable freezer for suspend to RAM/standby" \
99 if ARCH_WANTS_FREEZER_CONTROL || BROKEN 110 if ARCH_WANTS_FREEZER_CONTROL || BROKEN
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index 14a656cdc652..bbd85c60f741 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -21,6 +21,7 @@
21#include <linux/console.h> 21#include <linux/console.h>
22#include <linux/cpu.h> 22#include <linux/cpu.h>
23#include <linux/freezer.h> 23#include <linux/freezer.h>
24#include <linux/ftrace.h>
24 25
25#include "power.h" 26#include "power.h"
26 27
@@ -180,6 +181,17 @@ static void platform_restore_cleanup(int platform_mode)
180} 181}
181 182
182/** 183/**
184 * platform_recover - recover the platform from a failure to suspend
185 * devices.
186 */
187
188static void platform_recover(int platform_mode)
189{
190 if (platform_mode && hibernation_ops && hibernation_ops->recover)
191 hibernation_ops->recover();
192}
193
194/**
183 * create_image - freeze devices that need to be frozen with interrupts 195 * create_image - freeze devices that need to be frozen with interrupts
184 * off, create the hibernation image and thaw those devices. Control 196 * off, create the hibernation image and thaw those devices. Control
185 * reappears in this routine after a restore. 197 * reappears in this routine after a restore.
@@ -193,6 +205,7 @@ static int create_image(int platform_mode)
193 if (error) 205 if (error)
194 return error; 206 return error;
195 207
208 device_pm_lock();
196 local_irq_disable(); 209 local_irq_disable();
197 /* At this point, device_suspend() has been called, but *not* 210 /* At this point, device_suspend() has been called, but *not*
198 * device_power_down(). We *must* call device_power_down() now. 211 * device_power_down(). We *must* call device_power_down() now.
@@ -224,9 +237,11 @@ static int create_image(int platform_mode)
224 /* NOTE: device_power_up() is just a resume() for devices 237 /* NOTE: device_power_up() is just a resume() for devices
225 * that suspended with irqs off ... no overall powerup. 238 * that suspended with irqs off ... no overall powerup.
226 */ 239 */
227 device_power_up(); 240 device_power_up(in_suspend ?
241 (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE);
228 Enable_irqs: 242 Enable_irqs:
229 local_irq_enable(); 243 local_irq_enable();
244 device_pm_unlock();
230 return error; 245 return error;
231} 246}
232 247
@@ -241,7 +256,7 @@ static int create_image(int platform_mode)
241 256
242int hibernation_snapshot(int platform_mode) 257int hibernation_snapshot(int platform_mode)
243{ 258{
244 int error; 259 int error, ftrace_save;
245 260
246 /* Free memory before shutting down devices. */ 261 /* Free memory before shutting down devices. */
247 error = swsusp_shrink_memory(); 262 error = swsusp_shrink_memory();
@@ -253,12 +268,13 @@ int hibernation_snapshot(int platform_mode)
253 goto Close; 268 goto Close;
254 269
255 suspend_console(); 270 suspend_console();
271 ftrace_save = __ftrace_enabled_save();
256 error = device_suspend(PMSG_FREEZE); 272 error = device_suspend(PMSG_FREEZE);
257 if (error) 273 if (error)
258 goto Resume_console; 274 goto Recover_platform;
259 275
260 if (hibernation_test(TEST_DEVICES)) 276 if (hibernation_test(TEST_DEVICES))
261 goto Resume_devices; 277 goto Recover_platform;
262 278
263 error = platform_pre_snapshot(platform_mode); 279 error = platform_pre_snapshot(platform_mode);
264 if (error || hibernation_test(TEST_PLATFORM)) 280 if (error || hibernation_test(TEST_PLATFORM))
@@ -280,12 +296,17 @@ int hibernation_snapshot(int platform_mode)
280 Finish: 296 Finish:
281 platform_finish(platform_mode); 297 platform_finish(platform_mode);
282 Resume_devices: 298 Resume_devices:
283 device_resume(); 299 device_resume(in_suspend ?
284 Resume_console: 300 (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE);
301 __ftrace_enabled_restore(ftrace_save);
285 resume_console(); 302 resume_console();
286 Close: 303 Close:
287 platform_end(platform_mode); 304 platform_end(platform_mode);
288 return error; 305 return error;
306
307 Recover_platform:
308 platform_recover(platform_mode);
309 goto Resume_devices;
289} 310}
290 311
291/** 312/**
@@ -300,8 +321,9 @@ static int resume_target_kernel(void)
300{ 321{
301 int error; 322 int error;
302 323
324 device_pm_lock();
303 local_irq_disable(); 325 local_irq_disable();
304 error = device_power_down(PMSG_PRETHAW); 326 error = device_power_down(PMSG_QUIESCE);
305 if (error) { 327 if (error) {
306 printk(KERN_ERR "PM: Some devices failed to power down, " 328 printk(KERN_ERR "PM: Some devices failed to power down, "
307 "aborting resume\n"); 329 "aborting resume\n");
@@ -329,9 +351,10 @@ static int resume_target_kernel(void)
329 swsusp_free(); 351 swsusp_free();
330 restore_processor_state(); 352 restore_processor_state();
331 touch_softlockup_watchdog(); 353 touch_softlockup_watchdog();
332 device_power_up(); 354 device_power_up(PMSG_RECOVER);
333 Enable_irqs: 355 Enable_irqs:
334 local_irq_enable(); 356 local_irq_enable();
357 device_pm_unlock();
335 return error; 358 return error;
336} 359}
337 360
@@ -346,11 +369,12 @@ static int resume_target_kernel(void)
346 369
347int hibernation_restore(int platform_mode) 370int hibernation_restore(int platform_mode)
348{ 371{
349 int error; 372 int error, ftrace_save;
350 373
351 pm_prepare_console(); 374 pm_prepare_console();
352 suspend_console(); 375 suspend_console();
353 error = device_suspend(PMSG_PRETHAW); 376 ftrace_save = __ftrace_enabled_save();
377 error = device_suspend(PMSG_QUIESCE);
354 if (error) 378 if (error)
355 goto Finish; 379 goto Finish;
356 380
@@ -362,8 +386,9 @@ int hibernation_restore(int platform_mode)
362 enable_nonboot_cpus(); 386 enable_nonboot_cpus();
363 } 387 }
364 platform_restore_cleanup(platform_mode); 388 platform_restore_cleanup(platform_mode);
365 device_resume(); 389 device_resume(PMSG_RECOVER);
366 Finish: 390 Finish:
391 __ftrace_enabled_restore(ftrace_save);
367 resume_console(); 392 resume_console();
368 pm_restore_console(); 393 pm_restore_console();
369 return error; 394 return error;
@@ -376,7 +401,7 @@ int hibernation_restore(int platform_mode)
376 401
377int hibernation_platform_enter(void) 402int hibernation_platform_enter(void)
378{ 403{
379 int error; 404 int error, ftrace_save;
380 405
381 if (!hibernation_ops) 406 if (!hibernation_ops)
382 return -ENOSYS; 407 return -ENOSYS;
@@ -391,9 +416,13 @@ int hibernation_platform_enter(void)
391 goto Close; 416 goto Close;
392 417
393 suspend_console(); 418 suspend_console();
419 ftrace_save = __ftrace_enabled_save();
394 error = device_suspend(PMSG_HIBERNATE); 420 error = device_suspend(PMSG_HIBERNATE);
395 if (error) 421 if (error) {
396 goto Resume_console; 422 if (hibernation_ops->recover)
423 hibernation_ops->recover();
424 goto Resume_devices;
425 }
397 426
398 error = hibernation_ops->prepare(); 427 error = hibernation_ops->prepare();
399 if (error) 428 if (error)
@@ -403,6 +432,7 @@ int hibernation_platform_enter(void)
403 if (error) 432 if (error)
404 goto Finish; 433 goto Finish;
405 434
435 device_pm_lock();
406 local_irq_disable(); 436 local_irq_disable();
407 error = device_power_down(PMSG_HIBERNATE); 437 error = device_power_down(PMSG_HIBERNATE);
408 if (!error) { 438 if (!error) {
@@ -411,6 +441,7 @@ int hibernation_platform_enter(void)
411 while (1); 441 while (1);
412 } 442 }
413 local_irq_enable(); 443 local_irq_enable();
444 device_pm_unlock();
414 445
415 /* 446 /*
416 * We don't need to reenable the nonboot CPUs or resume consoles, since 447 * We don't need to reenable the nonboot CPUs or resume consoles, since
@@ -419,8 +450,8 @@ int hibernation_platform_enter(void)
419 Finish: 450 Finish:
420 hibernation_ops->finish(); 451 hibernation_ops->finish();
421 Resume_devices: 452 Resume_devices:
422 device_resume(); 453 device_resume(PMSG_RESTORE);
423 Resume_console: 454 __ftrace_enabled_restore(ftrace_save);
424 resume_console(); 455 resume_console();
425 Close: 456 Close:
426 hibernation_ops->end(); 457 hibernation_ops->end();
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 6a6d5eb3524e..540b16b68565 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -21,6 +21,7 @@
21#include <linux/freezer.h> 21#include <linux/freezer.h>
22#include <linux/vmstat.h> 22#include <linux/vmstat.h>
23#include <linux/syscalls.h> 23#include <linux/syscalls.h>
24#include <linux/ftrace.h>
24 25
25#include "power.h" 26#include "power.h"
26 27
@@ -132,6 +133,61 @@ static inline int suspend_test(int level) { return 0; }
132 133
133#ifdef CONFIG_SUSPEND 134#ifdef CONFIG_SUSPEND
134 135
136#ifdef CONFIG_PM_TEST_SUSPEND
137
138/*
139 * We test the system suspend code by setting an RTC wakealarm a short
140 * time in the future, then suspending. Suspending the devices won't
141 * normally take long ... some systems only need a few milliseconds.
142 *
143 * The time it takes is system-specific though, so when we test this
144 * during system bootup we allow a LOT of time.
145 */
146#define TEST_SUSPEND_SECONDS 5
147
148static unsigned long suspend_test_start_time;
149
150static void suspend_test_start(void)
151{
152 /* FIXME Use better timebase than "jiffies", ideally a clocksource.
153 * What we want is a hardware counter that will work correctly even
154 * during the irqs-are-off stages of the suspend/resume cycle...
155 */
156 suspend_test_start_time = jiffies;
157}
158
159static void suspend_test_finish(const char *label)
160{
161 long nj = jiffies - suspend_test_start_time;
162 unsigned msec;
163
164 msec = jiffies_to_msecs(abs(nj));
165 pr_info("PM: %s took %d.%03d seconds\n", label,
166 msec / 1000, msec % 1000);
167
168 /* Warning on suspend means the RTC alarm period needs to be
169 * larger -- the system was sooo slooowwww to suspend that the
170 * alarm (should have) fired before the system went to sleep!
171 *
172 * Warning on either suspend or resume also means the system
173 * has some performance issues. The stack dump of a WARN_ON
174 * is more likely to get the right attention than a printk...
175 */
176 WARN_ON(msec > (TEST_SUSPEND_SECONDS * 1000));
177}
178
179#else
180
181static void suspend_test_start(void)
182{
183}
184
185static void suspend_test_finish(const char *label)
186{
187}
188
189#endif
190
135/* This is just an arbitrary number */ 191/* This is just an arbitrary number */
136#define FREE_PAGE_NUMBER (100) 192#define FREE_PAGE_NUMBER (100)
137 193
@@ -228,6 +284,7 @@ static int suspend_enter(suspend_state_t state)
228{ 284{
229 int error = 0; 285 int error = 0;
230 286
287 device_pm_lock();
231 arch_suspend_disable_irqs(); 288 arch_suspend_disable_irqs();
232 BUG_ON(!irqs_disabled()); 289 BUG_ON(!irqs_disabled());
233 290
@@ -239,10 +296,11 @@ static int suspend_enter(suspend_state_t state)
239 if (!suspend_test(TEST_CORE)) 296 if (!suspend_test(TEST_CORE))
240 error = suspend_ops->enter(state); 297 error = suspend_ops->enter(state);
241 298
242 device_power_up(); 299 device_power_up(PMSG_RESUME);
243 Done: 300 Done:
244 arch_suspend_enable_irqs(); 301 arch_suspend_enable_irqs();
245 BUG_ON(irqs_disabled()); 302 BUG_ON(irqs_disabled());
303 device_pm_unlock();
246 return error; 304 return error;
247} 305}
248 306
@@ -253,7 +311,7 @@ static int suspend_enter(suspend_state_t state)
253 */ 311 */
254int suspend_devices_and_enter(suspend_state_t state) 312int suspend_devices_and_enter(suspend_state_t state)
255{ 313{
256 int error; 314 int error, ftrace_save;
257 315
258 if (!suspend_ops) 316 if (!suspend_ops)
259 return -ENOSYS; 317 return -ENOSYS;
@@ -264,14 +322,16 @@ int suspend_devices_and_enter(suspend_state_t state)
264 goto Close; 322 goto Close;
265 } 323 }
266 suspend_console(); 324 suspend_console();
325 ftrace_save = __ftrace_enabled_save();
326 suspend_test_start();
267 error = device_suspend(PMSG_SUSPEND); 327 error = device_suspend(PMSG_SUSPEND);
268 if (error) { 328 if (error) {
269 printk(KERN_ERR "PM: Some devices failed to suspend\n"); 329 printk(KERN_ERR "PM: Some devices failed to suspend\n");
270 goto Resume_console; 330 goto Recover_platform;
271 } 331 }
272 332 suspend_test_finish("suspend devices");
273 if (suspend_test(TEST_DEVICES)) 333 if (suspend_test(TEST_DEVICES))
274 goto Resume_devices; 334 goto Recover_platform;
275 335
276 if (suspend_ops->prepare) { 336 if (suspend_ops->prepare) {
277 error = suspend_ops->prepare(); 337 error = suspend_ops->prepare();
@@ -291,13 +351,20 @@ int suspend_devices_and_enter(suspend_state_t state)
291 if (suspend_ops->finish) 351 if (suspend_ops->finish)
292 suspend_ops->finish(); 352 suspend_ops->finish();
293 Resume_devices: 353 Resume_devices:
294 device_resume(); 354 suspend_test_start();
295 Resume_console: 355 device_resume(PMSG_RESUME);
356 suspend_test_finish("resume devices");
357 __ftrace_enabled_restore(ftrace_save);
296 resume_console(); 358 resume_console();
297 Close: 359 Close:
298 if (suspend_ops->end) 360 if (suspend_ops->end)
299 suspend_ops->end(); 361 suspend_ops->end();
300 return error; 362 return error;
363
364 Recover_platform:
365 if (suspend_ops->recover)
366 suspend_ops->recover();
367 goto Resume_devices;
301} 368}
302 369
303/** 370/**
@@ -515,3 +582,144 @@ static int __init pm_init(void)
515} 582}
516 583
517core_initcall(pm_init); 584core_initcall(pm_init);
585
586
587#ifdef CONFIG_PM_TEST_SUSPEND
588
589#include <linux/rtc.h>
590
591/*
592 * To test system suspend, we need a hands-off mechanism to resume the
593 * system. RTCs wake alarms are a common self-contained mechanism.
594 */
595
596static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state)
597{
598 static char err_readtime[] __initdata =
599 KERN_ERR "PM: can't read %s time, err %d\n";
600 static char err_wakealarm [] __initdata =
601 KERN_ERR "PM: can't set %s wakealarm, err %d\n";
602 static char err_suspend[] __initdata =
603 KERN_ERR "PM: suspend test failed, error %d\n";
604 static char info_test[] __initdata =
605 KERN_INFO "PM: test RTC wakeup from '%s' suspend\n";
606
607 unsigned long now;
608 struct rtc_wkalrm alm;
609 int status;
610
611 /* this may fail if the RTC hasn't been initialized */
612 status = rtc_read_time(rtc, &alm.time);
613 if (status < 0) {
614 printk(err_readtime, rtc->dev.bus_id, status);
615 return;
616 }
617 rtc_tm_to_time(&alm.time, &now);
618
619 memset(&alm, 0, sizeof alm);
620 rtc_time_to_tm(now + TEST_SUSPEND_SECONDS, &alm.time);
621 alm.enabled = true;
622
623 status = rtc_set_alarm(rtc, &alm);
624 if (status < 0) {
625 printk(err_wakealarm, rtc->dev.bus_id, status);
626 return;
627 }
628
629 if (state == PM_SUSPEND_MEM) {
630 printk(info_test, pm_states[state]);
631 status = pm_suspend(state);
632 if (status == -ENODEV)
633 state = PM_SUSPEND_STANDBY;
634 }
635 if (state == PM_SUSPEND_STANDBY) {
636 printk(info_test, pm_states[state]);
637 status = pm_suspend(state);
638 }
639 if (status < 0)
640 printk(err_suspend, status);
641
642 /* Some platforms can't detect that the alarm triggered the
643 * wakeup, or (accordingly) disable it after it afterwards.
644 * It's supposed to give oneshot behavior; cope.
645 */
646 alm.enabled = false;
647 rtc_set_alarm(rtc, &alm);
648}
649
650static int __init has_wakealarm(struct device *dev, void *name_ptr)
651{
652 struct rtc_device *candidate = to_rtc_device(dev);
653
654 if (!candidate->ops->set_alarm)
655 return 0;
656 if (!device_may_wakeup(candidate->dev.parent))
657 return 0;
658
659 *(char **)name_ptr = dev->bus_id;
660 return 1;
661}
662
663/*
664 * Kernel options like "test_suspend=mem" force suspend/resume sanity tests
665 * at startup time. They're normally disabled, for faster boot and because
666 * we can't know which states really work on this particular system.
667 */
668static suspend_state_t test_state __initdata = PM_SUSPEND_ON;
669
670static char warn_bad_state[] __initdata =
671 KERN_WARNING "PM: can't test '%s' suspend state\n";
672
673static int __init setup_test_suspend(char *value)
674{
675 unsigned i;
676
677 /* "=mem" ==> "mem" */
678 value++;
679 for (i = 0; i < PM_SUSPEND_MAX; i++) {
680 if (!pm_states[i])
681 continue;
682 if (strcmp(pm_states[i], value) != 0)
683 continue;
684 test_state = (__force suspend_state_t) i;
685 return 0;
686 }
687 printk(warn_bad_state, value);
688 return 0;
689}
690__setup("test_suspend", setup_test_suspend);
691
692static int __init test_suspend(void)
693{
694 static char warn_no_rtc[] __initdata =
695 KERN_WARNING "PM: no wakealarm-capable RTC driver is ready\n";
696
697 char *pony = NULL;
698 struct rtc_device *rtc = NULL;
699
700 /* PM is initialized by now; is that state testable? */
701 if (test_state == PM_SUSPEND_ON)
702 goto done;
703 if (!valid_state(test_state)) {
704 printk(warn_bad_state, pm_states[test_state]);
705 goto done;
706 }
707
708 /* RTCs have initialized by now too ... can we use one? */
709 class_find_device(rtc_class, NULL, &pony, has_wakealarm);
710 if (pony)
711 rtc = rtc_class_open(pony);
712 if (!rtc) {
713 printk(warn_no_rtc);
714 goto done;
715 }
716
717 /* go for it */
718 test_wakealarm(rtc, test_state);
719 rtc_class_close(rtc);
720done:
721 return 0;
722}
723late_initcall(test_suspend);
724
725#endif /* CONFIG_PM_TEST_SUSPEND */
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 700f44ec8406..acc0c101dbd5 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -53,8 +53,6 @@ extern int hibernation_platform_enter(void);
53 53
54extern int pfn_is_nosave(unsigned long); 54extern int pfn_is_nosave(unsigned long);
55 55
56extern struct mutex pm_mutex;
57
58#define power_attr(_name) \ 56#define power_attr(_name) \
59static struct kobj_attribute _name##_attr = { \ 57static struct kobj_attribute _name##_attr = { \
60 .attr = { \ 58 .attr = { \
diff --git a/kernel/power/poweroff.c b/kernel/power/poweroff.c
index 678ec736076b..72016f051477 100644
--- a/kernel/power/poweroff.c
+++ b/kernel/power/poweroff.c
@@ -10,6 +10,7 @@
10#include <linux/pm.h> 10#include <linux/pm.h>
11#include <linux/workqueue.h> 11#include <linux/workqueue.h>
12#include <linux/reboot.h> 12#include <linux/reboot.h>
13#include <linux/cpumask.h>
13 14
14/* 15/*
15 * When the user hits Sys-Rq o to power down the machine this is the 16 * When the user hits Sys-Rq o to power down the machine this is the
@@ -25,7 +26,8 @@ static DECLARE_WORK(poweroff_work, do_poweroff);
25 26
26static void handle_poweroff(int key, struct tty_struct *tty) 27static void handle_poweroff(int key, struct tty_struct *tty)
27{ 28{
28 schedule_work(&poweroff_work); 29 /* run sysrq poweroff on boot cpu */
30 schedule_work_on(first_cpu(cpu_online_map), &poweroff_work);
29} 31}
30 32
31static struct sysrq_key_op sysrq_poweroff_op = { 33static struct sysrq_key_op sysrq_poweroff_op = {
diff --git a/kernel/power/process.c b/kernel/power/process.c
index f1d0b345c9ba..278946aecaf0 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -19,9 +19,6 @@
19 */ 19 */
20#define TIMEOUT (20 * HZ) 20#define TIMEOUT (20 * HZ)
21 21
22#define FREEZER_KERNEL_THREADS 0
23#define FREEZER_USER_SPACE 1
24
25static inline int freezeable(struct task_struct * p) 22static inline int freezeable(struct task_struct * p)
26{ 23{
27 if ((p == current) || 24 if ((p == current) ||
@@ -84,63 +81,53 @@ static void fake_signal_wake_up(struct task_struct *p)
84 spin_unlock_irqrestore(&p->sighand->siglock, flags); 81 spin_unlock_irqrestore(&p->sighand->siglock, flags);
85} 82}
86 83
87static int has_mm(struct task_struct *p) 84static inline bool should_send_signal(struct task_struct *p)
88{ 85{
89 return (p->mm && !(p->flags & PF_BORROWED_MM)); 86 return !(p->flags & PF_FREEZER_NOSIG);
90} 87}
91 88
92/** 89/**
93 * freeze_task - send a freeze request to given task 90 * freeze_task - send a freeze request to given task
94 * @p: task to send the request to 91 * @p: task to send the request to
95 * @with_mm_only: if set, the request will only be sent if the task has its 92 * @sig_only: if set, the request will only be sent if the task has the
96 * own mm 93 * PF_FREEZER_NOSIG flag unset
97 * Return value: 0, if @with_mm_only is set and the task has no mm of its 94 * Return value: 'false', if @sig_only is set and the task has
98 * own or the task is frozen, 1, otherwise 95 * PF_FREEZER_NOSIG set or the task is frozen, 'true', otherwise
99 * 96 *
100 * The freeze request is sent by seting the tasks's TIF_FREEZE flag and 97 * The freeze request is sent by setting the tasks's TIF_FREEZE flag and
101 * either sending a fake signal to it or waking it up, depending on whether 98 * either sending a fake signal to it or waking it up, depending on whether
102 * or not it has its own mm (ie. it is a user land task). If @with_mm_only 99 * or not it has PF_FREEZER_NOSIG set. If @sig_only is set and the task
103 * is set and the task has no mm of its own (ie. it is a kernel thread), 100 * has PF_FREEZER_NOSIG set (ie. it is a typical kernel thread), its
104 * its TIF_FREEZE flag should not be set. 101 * TIF_FREEZE flag will not be set.
105 *
106 * The task_lock() is necessary to prevent races with exit_mm() or
107 * use_mm()/unuse_mm() from occuring.
108 */ 102 */
109static int freeze_task(struct task_struct *p, int with_mm_only) 103static bool freeze_task(struct task_struct *p, bool sig_only)
110{ 104{
111 int ret = 1; 105 /*
106 * We first check if the task is freezing and next if it has already
107 * been frozen to avoid the race with frozen_process() which first marks
108 * the task as frozen and next clears its TIF_FREEZE.
109 */
110 if (!freezing(p)) {
111 rmb();
112 if (frozen(p))
113 return false;
112 114
113 task_lock(p); 115 if (!sig_only || should_send_signal(p))
114 if (freezing(p)) { 116 set_freeze_flag(p);
115 if (has_mm(p)) { 117 else
116 if (!signal_pending(p)) 118 return false;
117 fake_signal_wake_up(p); 119 }
118 } else { 120
119 if (with_mm_only) 121 if (should_send_signal(p)) {
120 ret = 0; 122 if (!signal_pending(p))
121 else 123 fake_signal_wake_up(p);
122 wake_up_state(p, TASK_INTERRUPTIBLE); 124 } else if (sig_only) {
123 } 125 return false;
124 } else { 126 } else {
125 rmb(); 127 wake_up_state(p, TASK_INTERRUPTIBLE);
126 if (frozen(p)) {
127 ret = 0;
128 } else {
129 if (has_mm(p)) {
130 set_freeze_flag(p);
131 fake_signal_wake_up(p);
132 } else {
133 if (with_mm_only) {
134 ret = 0;
135 } else {
136 set_freeze_flag(p);
137 wake_up_state(p, TASK_INTERRUPTIBLE);
138 }
139 }
140 }
141 } 128 }
142 task_unlock(p); 129
143 return ret; 130 return true;
144} 131}
145 132
146static void cancel_freezing(struct task_struct *p) 133static void cancel_freezing(struct task_struct *p)
@@ -156,13 +143,13 @@ static void cancel_freezing(struct task_struct *p)
156 } 143 }
157} 144}
158 145
159static int try_to_freeze_tasks(int freeze_user_space) 146static int try_to_freeze_tasks(bool sig_only)
160{ 147{
161 struct task_struct *g, *p; 148 struct task_struct *g, *p;
162 unsigned long end_time; 149 unsigned long end_time;
163 unsigned int todo; 150 unsigned int todo;
164 struct timeval start, end; 151 struct timeval start, end;
165 s64 elapsed_csecs64; 152 u64 elapsed_csecs64;
166 unsigned int elapsed_csecs; 153 unsigned int elapsed_csecs;
167 154
168 do_gettimeofday(&start); 155 do_gettimeofday(&start);
@@ -175,7 +162,7 @@ static int try_to_freeze_tasks(int freeze_user_space)
175 if (frozen(p) || !freezeable(p)) 162 if (frozen(p) || !freezeable(p))
176 continue; 163 continue;
177 164
178 if (!freeze_task(p, freeze_user_space)) 165 if (!freeze_task(p, sig_only))
179 continue; 166 continue;
180 167
181 /* 168 /*
@@ -235,13 +222,13 @@ int freeze_processes(void)
235 int error; 222 int error;
236 223
237 printk("Freezing user space processes ... "); 224 printk("Freezing user space processes ... ");
238 error = try_to_freeze_tasks(FREEZER_USER_SPACE); 225 error = try_to_freeze_tasks(true);
239 if (error) 226 if (error)
240 goto Exit; 227 goto Exit;
241 printk("done.\n"); 228 printk("done.\n");
242 229
243 printk("Freezing remaining freezable tasks ... "); 230 printk("Freezing remaining freezable tasks ... ");
244 error = try_to_freeze_tasks(FREEZER_KERNEL_THREADS); 231 error = try_to_freeze_tasks(false);
245 if (error) 232 if (error)
246 goto Exit; 233 goto Exit;
247 printk("done."); 234 printk("done.");
@@ -251,7 +238,7 @@ int freeze_processes(void)
251 return error; 238 return error;
252} 239}
253 240
254static void thaw_tasks(int thaw_user_space) 241static void thaw_tasks(bool nosig_only)
255{ 242{
256 struct task_struct *g, *p; 243 struct task_struct *g, *p;
257 244
@@ -260,7 +247,7 @@ static void thaw_tasks(int thaw_user_space)
260 if (!freezeable(p)) 247 if (!freezeable(p))
261 continue; 248 continue;
262 249
263 if (!p->mm == thaw_user_space) 250 if (nosig_only && should_send_signal(p))
264 continue; 251 continue;
265 252
266 thaw_process(p); 253 thaw_process(p);
@@ -271,8 +258,8 @@ static void thaw_tasks(int thaw_user_space)
271void thaw_processes(void) 258void thaw_processes(void)
272{ 259{
273 printk("Restarting tasks ... "); 260 printk("Restarting tasks ... ");
274 thaw_tasks(FREEZER_KERNEL_THREADS); 261 thaw_tasks(true);
275 thaw_tasks(FREEZER_USER_SPACE); 262 thaw_tasks(false);
276 schedule(); 263 schedule();
277 printk("done.\n"); 264 printk("done.\n");
278} 265}
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 5f91a07c4eac..5d2ab836e998 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -205,8 +205,7 @@ static void chain_free(struct chain_allocator *ca, int clear_page_nosave)
205 * objects. The main list's elements are of type struct zone_bitmap 205 * objects. The main list's elements are of type struct zone_bitmap
206 * and each of them corresonds to one zone. For each zone bitmap 206 * and each of them corresonds to one zone. For each zone bitmap
207 * object there is a list of objects of type struct bm_block that 207 * object there is a list of objects of type struct bm_block that
208 * represent each blocks of bit chunks in which information is 208 * represent each blocks of bitmap in which information is stored.
209 * stored.
210 * 209 *
211 * struct memory_bitmap contains a pointer to the main list of zone 210 * struct memory_bitmap contains a pointer to the main list of zone
212 * bitmap objects, a struct bm_position used for browsing the bitmap, 211 * bitmap objects, a struct bm_position used for browsing the bitmap,
@@ -224,26 +223,27 @@ static void chain_free(struct chain_allocator *ca, int clear_page_nosave)
224 * pfns that correspond to the start and end of the represented zone. 223 * pfns that correspond to the start and end of the represented zone.
225 * 224 *
226 * struct bm_block contains a pointer to the memory page in which 225 * struct bm_block contains a pointer to the memory page in which
227 * information is stored (in the form of a block of bit chunks 226 * information is stored (in the form of a block of bitmap)
228 * of type unsigned long each). It also contains the pfns that 227 * It also contains the pfns that correspond to the start and end of
229 * correspond to the start and end of the represented memory area and 228 * the represented memory area.
230 * the number of bit chunks in the block.
231 */ 229 */
232 230
233#define BM_END_OF_MAP (~0UL) 231#define BM_END_OF_MAP (~0UL)
234 232
235#define BM_CHUNKS_PER_BLOCK (PAGE_SIZE / sizeof(long))
236#define BM_BITS_PER_CHUNK (sizeof(long) << 3)
237#define BM_BITS_PER_BLOCK (PAGE_SIZE << 3) 233#define BM_BITS_PER_BLOCK (PAGE_SIZE << 3)
238 234
239struct bm_block { 235struct bm_block {
240 struct bm_block *next; /* next element of the list */ 236 struct bm_block *next; /* next element of the list */
241 unsigned long start_pfn; /* pfn represented by the first bit */ 237 unsigned long start_pfn; /* pfn represented by the first bit */
242 unsigned long end_pfn; /* pfn represented by the last bit plus 1 */ 238 unsigned long end_pfn; /* pfn represented by the last bit plus 1 */
243 unsigned int size; /* number of bit chunks */ 239 unsigned long *data; /* bitmap representing pages */
244 unsigned long *data; /* chunks of bits representing pages */
245}; 240};
246 241
242static inline unsigned long bm_block_bits(struct bm_block *bb)
243{
244 return bb->end_pfn - bb->start_pfn;
245}
246
247struct zone_bitmap { 247struct zone_bitmap {
248 struct zone_bitmap *next; /* next element of the list */ 248 struct zone_bitmap *next; /* next element of the list */
249 unsigned long start_pfn; /* minimal pfn in this zone */ 249 unsigned long start_pfn; /* minimal pfn in this zone */
@@ -257,7 +257,6 @@ struct zone_bitmap {
257struct bm_position { 257struct bm_position {
258 struct zone_bitmap *zone_bm; 258 struct zone_bitmap *zone_bm;
259 struct bm_block *block; 259 struct bm_block *block;
260 int chunk;
261 int bit; 260 int bit;
262}; 261};
263 262
@@ -272,12 +271,6 @@ struct memory_bitmap {
272 271
273/* Functions that operate on memory bitmaps */ 272/* Functions that operate on memory bitmaps */
274 273
275static inline void memory_bm_reset_chunk(struct memory_bitmap *bm)
276{
277 bm->cur.chunk = 0;
278 bm->cur.bit = -1;
279}
280
281static void memory_bm_position_reset(struct memory_bitmap *bm) 274static void memory_bm_position_reset(struct memory_bitmap *bm)
282{ 275{
283 struct zone_bitmap *zone_bm; 276 struct zone_bitmap *zone_bm;
@@ -285,7 +278,7 @@ static void memory_bm_position_reset(struct memory_bitmap *bm)
285 zone_bm = bm->zone_bm_list; 278 zone_bm = bm->zone_bm_list;
286 bm->cur.zone_bm = zone_bm; 279 bm->cur.zone_bm = zone_bm;
287 bm->cur.block = zone_bm->bm_blocks; 280 bm->cur.block = zone_bm->bm_blocks;
288 memory_bm_reset_chunk(bm); 281 bm->cur.bit = 0;
289} 282}
290 283
291static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free); 284static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free);
@@ -394,12 +387,10 @@ memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed)
394 bb->start_pfn = pfn; 387 bb->start_pfn = pfn;
395 if (nr >= BM_BITS_PER_BLOCK) { 388 if (nr >= BM_BITS_PER_BLOCK) {
396 pfn += BM_BITS_PER_BLOCK; 389 pfn += BM_BITS_PER_BLOCK;
397 bb->size = BM_CHUNKS_PER_BLOCK;
398 nr -= BM_BITS_PER_BLOCK; 390 nr -= BM_BITS_PER_BLOCK;
399 } else { 391 } else {
400 /* This is executed only once in the loop */ 392 /* This is executed only once in the loop */
401 pfn += nr; 393 pfn += nr;
402 bb->size = DIV_ROUND_UP(nr, BM_BITS_PER_CHUNK);
403 } 394 }
404 bb->end_pfn = pfn; 395 bb->end_pfn = pfn;
405 bb = bb->next; 396 bb = bb->next;
@@ -478,8 +469,8 @@ static int memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn,
478 } 469 }
479 zone_bm->cur_block = bb; 470 zone_bm->cur_block = bb;
480 pfn -= bb->start_pfn; 471 pfn -= bb->start_pfn;
481 *bit_nr = pfn % BM_BITS_PER_CHUNK; 472 *bit_nr = pfn;
482 *addr = bb->data + pfn / BM_BITS_PER_CHUNK; 473 *addr = bb->data;
483 return 0; 474 return 0;
484} 475}
485 476
@@ -528,36 +519,6 @@ static int memory_bm_test_bit(struct memory_bitmap *bm, unsigned long pfn)
528 return test_bit(bit, addr); 519 return test_bit(bit, addr);
529} 520}
530 521
531/* Two auxiliary functions for memory_bm_next_pfn */
532
533/* Find the first set bit in the given chunk, if there is one */
534
535static inline int next_bit_in_chunk(int bit, unsigned long *chunk_p)
536{
537 bit++;
538 while (bit < BM_BITS_PER_CHUNK) {
539 if (test_bit(bit, chunk_p))
540 return bit;
541
542 bit++;
543 }
544 return -1;
545}
546
547/* Find a chunk containing some bits set in given block of bits */
548
549static inline int next_chunk_in_block(int n, struct bm_block *bb)
550{
551 n++;
552 while (n < bb->size) {
553 if (bb->data[n])
554 return n;
555
556 n++;
557 }
558 return -1;
559}
560
561/** 522/**
562 * memory_bm_next_pfn - find the pfn that corresponds to the next set bit 523 * memory_bm_next_pfn - find the pfn that corresponds to the next set bit
563 * in the bitmap @bm. If the pfn cannot be found, BM_END_OF_MAP is 524 * in the bitmap @bm. If the pfn cannot be found, BM_END_OF_MAP is
@@ -571,40 +532,33 @@ static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm)
571{ 532{
572 struct zone_bitmap *zone_bm; 533 struct zone_bitmap *zone_bm;
573 struct bm_block *bb; 534 struct bm_block *bb;
574 int chunk;
575 int bit; 535 int bit;
576 536
577 do { 537 do {
578 bb = bm->cur.block; 538 bb = bm->cur.block;
579 do { 539 do {
580 chunk = bm->cur.chunk;
581 bit = bm->cur.bit; 540 bit = bm->cur.bit;
582 do { 541 bit = find_next_bit(bb->data, bm_block_bits(bb), bit);
583 bit = next_bit_in_chunk(bit, bb->data + chunk); 542 if (bit < bm_block_bits(bb))
584 if (bit >= 0) 543 goto Return_pfn;
585 goto Return_pfn; 544
586
587 chunk = next_chunk_in_block(chunk, bb);
588 bit = -1;
589 } while (chunk >= 0);
590 bb = bb->next; 545 bb = bb->next;
591 bm->cur.block = bb; 546 bm->cur.block = bb;
592 memory_bm_reset_chunk(bm); 547 bm->cur.bit = 0;
593 } while (bb); 548 } while (bb);
594 zone_bm = bm->cur.zone_bm->next; 549 zone_bm = bm->cur.zone_bm->next;
595 if (zone_bm) { 550 if (zone_bm) {
596 bm->cur.zone_bm = zone_bm; 551 bm->cur.zone_bm = zone_bm;
597 bm->cur.block = zone_bm->bm_blocks; 552 bm->cur.block = zone_bm->bm_blocks;
598 memory_bm_reset_chunk(bm); 553 bm->cur.bit = 0;
599 } 554 }
600 } while (zone_bm); 555 } while (zone_bm);
601 memory_bm_position_reset(bm); 556 memory_bm_position_reset(bm);
602 return BM_END_OF_MAP; 557 return BM_END_OF_MAP;
603 558
604 Return_pfn: 559 Return_pfn:
605 bm->cur.chunk = chunk; 560 bm->cur.bit = bit + 1;
606 bm->cur.bit = bit; 561 return bb->start_pfn + bit;
607 return bb->start_pfn + chunk * BM_BITS_PER_CHUNK + bit;
608} 562}
609 563
610/** 564/**
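Editor's note: the snapshot.c hunks above drop the hand-rolled chunk bookkeeping and walk each bm_block with find_next_bit() over bm_block_bits(bb) bits. Below is a minimal user-space sketch of that iteration pattern; find_next_bit_demo(), set_bit_demo() and the pfn values are stand-ins written for this note, not the kernel helpers.

#include <limits.h>
#include <stdio.h>

#define BITS_PER_LONG (sizeof(unsigned long) * CHAR_BIT)

/* simplified stand-in for the kernel's find_next_bit() */
static unsigned long find_next_bit_demo(const unsigned long *addr,
					unsigned long size,
					unsigned long offset)
{
	unsigned long bit;

	for (bit = offset; bit < size; bit++)
		if (addr[bit / BITS_PER_LONG] & (1UL << (bit % BITS_PER_LONG)))
			return bit;
	return size;			/* "not found", like the real helper */
}

static void set_bit_demo(unsigned long *addr, unsigned long nr)
{
	addr[nr / BITS_PER_LONG] |= 1UL << (nr % BITS_PER_LONG);
}

int main(void)
{
	unsigned long data[2] = { 0, 0 };	/* one tiny "bm_block" */
	unsigned long nbits = 2 * BITS_PER_LONG;
	unsigned long start_pfn = 0x1000;	/* hypothetical block start */
	unsigned long bit = 0;

	set_bit_demo(data, 3);
	set_bit_demo(data, 12);
	set_bit_demo(data, BITS_PER_LONG + 7);

	/* same shape as the rewritten memory_bm_next_pfn() loop */
	while ((bit = find_next_bit_demo(data, nbits, bit)) < nbits) {
		printf("next pfn: %#lx\n", start_pfn + bit);
		bit++;			/* like bm->cur.bit = bit + 1 */
	}
	return 0;
}

Compiled with a plain C compiler, the loop prints one line per set bit, which is the role memory_bm_next_pfn() plays for the hibernation image writer.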
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index a0abf9a463f9..80ccac849e46 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -14,7 +14,6 @@
14#include <linux/module.h> 14#include <linux/module.h>
15#include <linux/file.h> 15#include <linux/file.h>
16#include <linux/utsname.h> 16#include <linux/utsname.h>
17#include <linux/version.h>
18#include <linux/delay.h> 17#include <linux/delay.h>
19#include <linux/bitops.h> 18#include <linux/bitops.h>
20#include <linux/genhd.h> 19#include <linux/genhd.h>
diff --git a/kernel/power/user.c b/kernel/power/user.c
index f5512cb3aa86..a6332a313262 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -23,6 +23,7 @@
23#include <linux/console.h> 23#include <linux/console.h>
24#include <linux/cpu.h> 24#include <linux/cpu.h>
25#include <linux/freezer.h> 25#include <linux/freezer.h>
26#include <linux/smp_lock.h>
26 27
27#include <asm/uaccess.h> 28#include <asm/uaccess.h>
28 29
@@ -69,16 +70,22 @@ static int snapshot_open(struct inode *inode, struct file *filp)
69 struct snapshot_data *data; 70 struct snapshot_data *data;
70 int error; 71 int error;
71 72
72 if (!atomic_add_unless(&snapshot_device_available, -1, 0)) 73 mutex_lock(&pm_mutex);
73 return -EBUSY; 74
75 if (!atomic_add_unless(&snapshot_device_available, -1, 0)) {
76 error = -EBUSY;
77 goto Unlock;
78 }
74 79
75 if ((filp->f_flags & O_ACCMODE) == O_RDWR) { 80 if ((filp->f_flags & O_ACCMODE) == O_RDWR) {
76 atomic_inc(&snapshot_device_available); 81 atomic_inc(&snapshot_device_available);
77 return -ENOSYS; 82 error = -ENOSYS;
83 goto Unlock;
78 } 84 }
79 if(create_basic_memory_bitmaps()) { 85 if(create_basic_memory_bitmaps()) {
80 atomic_inc(&snapshot_device_available); 86 atomic_inc(&snapshot_device_available);
81 return -ENOMEM; 87 error = -ENOMEM;
88 goto Unlock;
82 } 89 }
83 nonseekable_open(inode, filp); 90 nonseekable_open(inode, filp);
84 data = &snapshot_state; 91 data = &snapshot_state;
@@ -98,33 +105,36 @@ static int snapshot_open(struct inode *inode, struct file *filp)
98 if (error) 105 if (error)
99 pm_notifier_call_chain(PM_POST_HIBERNATION); 106 pm_notifier_call_chain(PM_POST_HIBERNATION);
100 } 107 }
101 if (error) { 108 if (error)
102 atomic_inc(&snapshot_device_available); 109 atomic_inc(&snapshot_device_available);
103 return error;
104 }
105 data->frozen = 0; 110 data->frozen = 0;
106 data->ready = 0; 111 data->ready = 0;
107 data->platform_support = 0; 112 data->platform_support = 0;
108 113
109 return 0; 114 Unlock:
115 mutex_unlock(&pm_mutex);
116
117 return error;
110} 118}
111 119
112static int snapshot_release(struct inode *inode, struct file *filp) 120static int snapshot_release(struct inode *inode, struct file *filp)
113{ 121{
114 struct snapshot_data *data; 122 struct snapshot_data *data;
115 123
124 mutex_lock(&pm_mutex);
125
116 swsusp_free(); 126 swsusp_free();
117 free_basic_memory_bitmaps(); 127 free_basic_memory_bitmaps();
118 data = filp->private_data; 128 data = filp->private_data;
119 free_all_swap_pages(data->swap); 129 free_all_swap_pages(data->swap);
120 if (data->frozen) { 130 if (data->frozen)
121 mutex_lock(&pm_mutex);
122 thaw_processes(); 131 thaw_processes();
123 mutex_unlock(&pm_mutex);
124 }
125 pm_notifier_call_chain(data->mode == O_WRONLY ? 132 pm_notifier_call_chain(data->mode == O_WRONLY ?
126 PM_POST_HIBERNATION : PM_POST_RESTORE); 133 PM_POST_HIBERNATION : PM_POST_RESTORE);
127 atomic_inc(&snapshot_device_available); 134 atomic_inc(&snapshot_device_available);
135
136 mutex_unlock(&pm_mutex);
137
128 return 0; 138 return 0;
129} 139}
130 140
@@ -134,9 +144,13 @@ static ssize_t snapshot_read(struct file *filp, char __user *buf,
134 struct snapshot_data *data; 144 struct snapshot_data *data;
135 ssize_t res; 145 ssize_t res;
136 146
147 mutex_lock(&pm_mutex);
148
137 data = filp->private_data; 149 data = filp->private_data;
138 if (!data->ready) 150 if (!data->ready) {
139 return -ENODATA; 151 res = -ENODATA;
152 goto Unlock;
153 }
140 res = snapshot_read_next(&data->handle, count); 154 res = snapshot_read_next(&data->handle, count);
141 if (res > 0) { 155 if (res > 0) {
142 if (copy_to_user(buf, data_of(data->handle), res)) 156 if (copy_to_user(buf, data_of(data->handle), res))
@@ -144,6 +158,10 @@ static ssize_t snapshot_read(struct file *filp, char __user *buf,
144 else 158 else
145 *offp = data->handle.offset; 159 *offp = data->handle.offset;
146 } 160 }
161
162 Unlock:
163 mutex_unlock(&pm_mutex);
164
147 return res; 165 return res;
148} 166}
149 167
@@ -153,6 +171,8 @@ static ssize_t snapshot_write(struct file *filp, const char __user *buf,
153 struct snapshot_data *data; 171 struct snapshot_data *data;
154 ssize_t res; 172 ssize_t res;
155 173
174 mutex_lock(&pm_mutex);
175
156 data = filp->private_data; 176 data = filp->private_data;
157 res = snapshot_write_next(&data->handle, count); 177 res = snapshot_write_next(&data->handle, count);
158 if (res > 0) { 178 if (res > 0) {
@@ -161,11 +181,14 @@ static ssize_t snapshot_write(struct file *filp, const char __user *buf,
161 else 181 else
162 *offp = data->handle.offset; 182 *offp = data->handle.offset;
163 } 183 }
184
185 mutex_unlock(&pm_mutex);
186
164 return res; 187 return res;
165} 188}
166 189
167static int snapshot_ioctl(struct inode *inode, struct file *filp, 190static long snapshot_ioctl(struct file *filp, unsigned int cmd,
168 unsigned int cmd, unsigned long arg) 191 unsigned long arg)
169{ 192{
170 int error = 0; 193 int error = 0;
171 struct snapshot_data *data; 194 struct snapshot_data *data;
@@ -179,6 +202,9 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
179 if (!capable(CAP_SYS_ADMIN)) 202 if (!capable(CAP_SYS_ADMIN))
180 return -EPERM; 203 return -EPERM;
181 204
205 if (!mutex_trylock(&pm_mutex))
206 return -EBUSY;
207
182 data = filp->private_data; 208 data = filp->private_data;
183 209
184 switch (cmd) { 210 switch (cmd) {
@@ -186,7 +212,6 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
186 case SNAPSHOT_FREEZE: 212 case SNAPSHOT_FREEZE:
187 if (data->frozen) 213 if (data->frozen)
188 break; 214 break;
189 mutex_lock(&pm_mutex);
190 printk("Syncing filesystems ... "); 215 printk("Syncing filesystems ... ");
191 sys_sync(); 216 sys_sync();
192 printk("done.\n"); 217 printk("done.\n");
@@ -194,7 +219,6 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
194 error = freeze_processes(); 219 error = freeze_processes();
195 if (error) 220 if (error)
196 thaw_processes(); 221 thaw_processes();
197 mutex_unlock(&pm_mutex);
198 if (!error) 222 if (!error)
199 data->frozen = 1; 223 data->frozen = 1;
200 break; 224 break;
@@ -202,9 +226,7 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
202 case SNAPSHOT_UNFREEZE: 226 case SNAPSHOT_UNFREEZE:
203 if (!data->frozen || data->ready) 227 if (!data->frozen || data->ready)
204 break; 228 break;
205 mutex_lock(&pm_mutex);
206 thaw_processes(); 229 thaw_processes();
207 mutex_unlock(&pm_mutex);
208 data->frozen = 0; 230 data->frozen = 0;
209 break; 231 break;
210 232
@@ -307,16 +329,11 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
307 error = -EPERM; 329 error = -EPERM;
308 break; 330 break;
309 } 331 }
310 if (!mutex_trylock(&pm_mutex)) {
311 error = -EBUSY;
312 break;
313 }
314 /* 332 /*
315 * Tasks are frozen and the notifiers have been called with 333 * Tasks are frozen and the notifiers have been called with
316 * PM_HIBERNATION_PREPARE 334 * PM_HIBERNATION_PREPARE
317 */ 335 */
318 error = suspend_devices_and_enter(PM_SUSPEND_MEM); 336 error = suspend_devices_and_enter(PM_SUSPEND_MEM);
319 mutex_unlock(&pm_mutex);
320 break; 337 break;
321 338
322 case SNAPSHOT_PLATFORM_SUPPORT: 339 case SNAPSHOT_PLATFORM_SUPPORT:
@@ -390,6 +407,8 @@ static int snapshot_ioctl(struct inode *inode, struct file *filp,
390 407
391 } 408 }
392 409
410 mutex_unlock(&pm_mutex);
411
393 return error; 412 return error;
394} 413}
395 414
@@ -399,7 +418,7 @@ static const struct file_operations snapshot_fops = {
399 .read = snapshot_read, 418 .read = snapshot_read,
400 .write = snapshot_write, 419 .write = snapshot_write,
401 .llseek = no_llseek, 420 .llseek = no_llseek,
402 .ioctl = snapshot_ioctl, 421 .unlocked_ioctl = snapshot_ioctl,
403}; 422};
404 423
405static struct miscdevice snapshot_device = { 424static struct miscdevice snapshot_device = {
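Editor's note: the user.c changes serialize every /dev/snapshot operation under pm_mutex and switch the handler to unlocked_ioctl with a mutex_trylock() that fails fast with -EBUSY. A rough user-space analogue of that pattern follows, with a pthread mutex standing in for pm_mutex; all names below are illustrative only.

#include <errno.h>
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t pm_mutex_demo = PTHREAD_MUTEX_INITIALIZER;

static long demo_ioctl(unsigned int cmd)
{
	long error = 0;

	/* trylock-or-bail, like the new snapshot_ioctl() */
	if (pthread_mutex_trylock(&pm_mutex_demo))
		return -EBUSY;

	switch (cmd) {
	case 1:				/* e.g. a "freeze" request */
		printf("handling cmd %u under the lock\n", cmd);
		break;
	default:
		error = -ENOTTY;
		break;
	}

	pthread_mutex_unlock(&pm_mutex_demo);	/* single unlock on exit */
	return error;
}

int main(void)
{
	printf("uncontended call: %ld\n", demo_ioctl(1));

	pthread_mutex_lock(&pm_mutex_demo);	/* simulate a busy PM core */
	printf("contended call:   %ld (-EBUSY is %d)\n", demo_ioctl(1), -EBUSY);
	pthread_mutex_unlock(&pm_mutex_demo);
	return 0;
}

The design point is that the ioctl path never sleeps waiting for the lock; a caller racing with another PM operation simply gets -EBUSY and can retry.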
diff --git a/kernel/printk.c b/kernel/printk.c
index 8fb01c32aa3b..a430fd04008b 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -38,7 +38,7 @@
38/* 38/*
39 * Architectures can override it: 39 * Architectures can override it:
40 */ 40 */
41void __attribute__((weak)) early_printk(const char *fmt, ...) 41void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...)
42{ 42{
43} 43}
44 44
@@ -75,6 +75,8 @@ EXPORT_SYMBOL(oops_in_progress);
75static DECLARE_MUTEX(console_sem); 75static DECLARE_MUTEX(console_sem);
76static DECLARE_MUTEX(secondary_console_sem); 76static DECLARE_MUTEX(secondary_console_sem);
77struct console *console_drivers; 77struct console *console_drivers;
78EXPORT_SYMBOL_GPL(console_drivers);
79
78/* 80/*
79 * This is used for debugging the mess that is the VT code by 81 * This is used for debugging the mess that is the VT code by
80 * keeping track if we have the console semaphore held. It's 82 * keeping track if we have the console semaphore held. It's
@@ -121,6 +123,8 @@ struct console_cmdline
121static struct console_cmdline console_cmdline[MAX_CMDLINECONSOLES]; 123static struct console_cmdline console_cmdline[MAX_CMDLINECONSOLES];
122static int selected_console = -1; 124static int selected_console = -1;
123static int preferred_console = -1; 125static int preferred_console = -1;
126int console_set_on_cmdline;
127EXPORT_SYMBOL(console_set_on_cmdline);
124 128
125/* Flag: console code may call schedule() */ 129/* Flag: console code may call schedule() */
126static int console_may_schedule; 130static int console_may_schedule;
@@ -231,7 +235,7 @@ static inline void boot_delay_msec(void)
231/* 235/*
232 * Return the number of unread characters in the log buffer. 236 * Return the number of unread characters in the log buffer.
233 */ 237 */
234int log_buf_get_len(void) 238static int log_buf_get_len(void)
235{ 239{
236 return logged_chars; 240 return logged_chars;
237} 241}
@@ -268,19 +272,6 @@ int log_buf_copy(char *dest, int idx, int len)
268} 272}
269 273
270/* 274/*
271 * Extract a single character from the log buffer.
272 */
273int log_buf_read(int idx)
274{
275 char ret;
276
277 if (log_buf_copy(&ret, idx, 1) == 1)
278 return ret;
279 else
280 return -1;
281}
282
283/*
284 * Commands to do_syslog: 275 * Commands to do_syslog:
285 * 276 *
286 * 0 -- Close the log. Currently a NOP. 277 * 0 -- Close the log. Currently a NOP.
@@ -665,18 +656,17 @@ static int acquire_console_semaphore_for_printk(unsigned int cpu)
665 spin_unlock(&logbuf_lock); 656 spin_unlock(&logbuf_lock);
666 return retval; 657 return retval;
667} 658}
668 659static const char recursion_bug_msg [] =
669const char printk_recursion_bug_msg [] = 660 KERN_CRIT "BUG: recent printk recursion!\n";
670 KERN_CRIT "BUG: recent printk recursion!\n"; 661static int recursion_bug;
671static int printk_recursion_bug; 662 static int new_text_line = 1;
663static char printk_buf[1024];
672 664
673asmlinkage int vprintk(const char *fmt, va_list args) 665asmlinkage int vprintk(const char *fmt, va_list args)
674{ 666{
675 static int log_level_unknown = 1;
676 static char printk_buf[1024];
677
678 unsigned long flags;
679 int printed_len = 0; 667 int printed_len = 0;
668 int current_log_level = default_message_loglevel;
669 unsigned long flags;
680 int this_cpu; 670 int this_cpu;
681 char *p; 671 char *p;
682 672
@@ -699,7 +689,7 @@ asmlinkage int vprintk(const char *fmt, va_list args)
699 * it can be printed at the next appropriate moment: 689 * it can be printed at the next appropriate moment:
700 */ 690 */
701 if (!oops_in_progress) { 691 if (!oops_in_progress) {
702 printk_recursion_bug = 1; 692 recursion_bug = 1;
703 goto out_restore_irqs; 693 goto out_restore_irqs;
704 } 694 }
705 zap_locks(); 695 zap_locks();
@@ -709,70 +699,62 @@ asmlinkage int vprintk(const char *fmt, va_list args)
709 spin_lock(&logbuf_lock); 699 spin_lock(&logbuf_lock);
710 printk_cpu = this_cpu; 700 printk_cpu = this_cpu;
711 701
712 if (printk_recursion_bug) { 702 if (recursion_bug) {
713 printk_recursion_bug = 0; 703 recursion_bug = 0;
714 strcpy(printk_buf, printk_recursion_bug_msg); 704 strcpy(printk_buf, recursion_bug_msg);
715 printed_len = sizeof(printk_recursion_bug_msg); 705 printed_len = sizeof(recursion_bug_msg);
716 } 706 }
717 /* Emit the output into the temporary buffer */ 707 /* Emit the output into the temporary buffer */
718 printed_len += vscnprintf(printk_buf + printed_len, 708 printed_len += vscnprintf(printk_buf + printed_len,
719 sizeof(printk_buf) - printed_len, fmt, args); 709 sizeof(printk_buf) - printed_len, fmt, args);
720 710
711
721 /* 712 /*
722 * Copy the output into log_buf. If the caller didn't provide 713 * Copy the output into log_buf. If the caller didn't provide
723 * appropriate log level tags, we insert them here 714 * appropriate log level tags, we insert them here
724 */ 715 */
725 for (p = printk_buf; *p; p++) { 716 for (p = printk_buf; *p; p++) {
726 if (log_level_unknown) { 717 if (new_text_line) {
727 /* log_level_unknown signals the start of a new line */ 718 /* If a token, set current_log_level and skip over */
719 if (p[0] == '<' && p[1] >= '0' && p[1] <= '7' &&
720 p[2] == '>') {
721 current_log_level = p[1] - '0';
722 p += 3;
723 printed_len -= 3;
724 }
725
726 /* Always output the token */
727 emit_log_char('<');
728 emit_log_char(current_log_level + '0');
729 emit_log_char('>');
730 printed_len += 3;
731 new_text_line = 0;
732
728 if (printk_time) { 733 if (printk_time) {
729 int loglev_char; 734 /* Follow the token with the time */
730 char tbuf[50], *tp; 735 char tbuf[50], *tp;
731 unsigned tlen; 736 unsigned tlen;
732 unsigned long long t; 737 unsigned long long t;
733 unsigned long nanosec_rem; 738 unsigned long nanosec_rem;
734 739
735 /*
736 * force the log level token to be
737 * before the time output.
738 */
739 if (p[0] == '<' && p[1] >='0' &&
740 p[1] <= '7' && p[2] == '>') {
741 loglev_char = p[1];
742 p += 3;
743 printed_len -= 3;
744 } else {
745 loglev_char = default_message_loglevel
746 + '0';
747 }
748 t = cpu_clock(printk_cpu); 740 t = cpu_clock(printk_cpu);
749 nanosec_rem = do_div(t, 1000000000); 741 nanosec_rem = do_div(t, 1000000000);
750 tlen = sprintf(tbuf, 742 tlen = sprintf(tbuf, "[%5lu.%06lu] ",
751 "<%c>[%5lu.%06lu] ", 743 (unsigned long) t,
752 loglev_char, 744 nanosec_rem / 1000);
753 (unsigned long)t,
754 nanosec_rem/1000);
755 745
756 for (tp = tbuf; tp < tbuf + tlen; tp++) 746 for (tp = tbuf; tp < tbuf + tlen; tp++)
757 emit_log_char(*tp); 747 emit_log_char(*tp);
758 printed_len += tlen; 748 printed_len += tlen;
759 } else {
760 if (p[0] != '<' || p[1] < '0' ||
761 p[1] > '7' || p[2] != '>') {
762 emit_log_char('<');
763 emit_log_char(default_message_loglevel
764 + '0');
765 emit_log_char('>');
766 printed_len += 3;
767 }
768 } 749 }
769 log_level_unknown = 0; 750
770 if (!*p) 751 if (!*p)
771 break; 752 break;
772 } 753 }
754
773 emit_log_char(*p); 755 emit_log_char(*p);
774 if (*p == '\n') 756 if (*p == '\n')
775 log_level_unknown = 1; 757 new_text_line = 1;
776 } 758 }
777 759
778 /* 760 /*
@@ -890,6 +872,7 @@ static int __init console_setup(char *str)
890 *s = 0; 872 *s = 0;
891 873
892 __add_preferred_console(buf, idx, options, brl_options); 874 __add_preferred_console(buf, idx, options, brl_options);
875 console_set_on_cmdline = 1;
893 return 1; 876 return 1;
894} 877}
895__setup("console=", console_setup); 878__setup("console=", console_setup);
@@ -950,7 +933,7 @@ void suspend_console(void)
950{ 933{
951 if (!console_suspend_enabled) 934 if (!console_suspend_enabled)
952 return; 935 return;
953 printk("Suspending console(s)\n"); 936 printk("Suspending console(s) (use no_console_suspend to debug)\n");
954 acquire_console_sem(); 937 acquire_console_sem();
955 console_suspended = 1; 938 console_suspended = 1;
956} 939}
@@ -1041,7 +1024,9 @@ void release_console_sem(void)
1041 _log_end = log_end; 1024 _log_end = log_end;
1042 con_start = log_end; /* Flush */ 1025 con_start = log_end; /* Flush */
1043 spin_unlock(&logbuf_lock); 1026 spin_unlock(&logbuf_lock);
1027 stop_critical_timings(); /* don't trace print latency */
1044 call_console_drivers(_con_start, _log_end); 1028 call_console_drivers(_con_start, _log_end);
1029 start_critical_timings();
1045 local_irq_restore(flags); 1030 local_irq_restore(flags);
1046 } 1031 }
1047 console_locked = 0; 1032 console_locked = 0;
@@ -1172,8 +1157,11 @@ void register_console(struct console *console)
1172 console->index = 0; 1157 console->index = 0;
1173 if (console->setup == NULL || 1158 if (console->setup == NULL ||
1174 console->setup(console, NULL) == 0) { 1159 console->setup(console, NULL) == 0) {
1175 console->flags |= CON_ENABLED | CON_CONSDEV; 1160 console->flags |= CON_ENABLED;
1176 preferred_console = 0; 1161 if (console->device) {
1162 console->flags |= CON_CONSDEV;
1163 preferred_console = 0;
1164 }
1177 } 1165 }
1178 } 1166 }
1179 1167
@@ -1303,46 +1291,19 @@ static int __init disable_boot_consoles(void)
1303} 1291}
1304late_initcall(disable_boot_consoles); 1292late_initcall(disable_boot_consoles);
1305 1293
1306/**
1307 * tty_write_message - write a message to a certain tty, not just the console.
1308 * @tty: the destination tty_struct
1309 * @msg: the message to write
1310 *
1311 * This is used for messages that need to be redirected to a specific tty.
1312 * We don't put it into the syslog queue right now maybe in the future if
1313 * really needed.
1314 */
1315void tty_write_message(struct tty_struct *tty, char *msg)
1316{
1317 if (tty && tty->ops->write)
1318 tty->ops->write(tty, msg, strlen(msg));
1319 return;
1320}
1321
1322#if defined CONFIG_PRINTK 1294#if defined CONFIG_PRINTK
1295
1323/* 1296/*
1324 * printk rate limiting, lifted from the networking subsystem. 1297 * printk rate limiting, lifted from the networking subsystem.
1325 * 1298 *
1326 * This enforces a rate limit: not more than one kernel message 1299 * This enforces a rate limit: not more than 10 kernel messages
1327 * every printk_ratelimit_jiffies to make a denial-of-service 1300 * every 5s to make a denial-of-service attack impossible.
1328 * attack impossible.
1329 */ 1301 */
1330int __printk_ratelimit(int ratelimit_jiffies, int ratelimit_burst) 1302DEFINE_RATELIMIT_STATE(printk_ratelimit_state, 5 * HZ, 10);
1331{
1332 return __ratelimit(ratelimit_jiffies, ratelimit_burst);
1333}
1334EXPORT_SYMBOL(__printk_ratelimit);
1335
1336/* minimum time in jiffies between messages */
1337int printk_ratelimit_jiffies = 5 * HZ;
1338
1339/* number of messages we send before ratelimiting */
1340int printk_ratelimit_burst = 10;
1341 1303
1342int printk_ratelimit(void) 1304int printk_ratelimit(void)
1343{ 1305{
1344 return __printk_ratelimit(printk_ratelimit_jiffies, 1306 return __ratelimit(&printk_ratelimit_state);
1345 printk_ratelimit_burst);
1346} 1307}
1347EXPORT_SYMBOL(printk_ratelimit); 1308EXPORT_SYMBOL(printk_ratelimit);
1348 1309
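Editor's note: the printk.c rework replaces the old log_level_unknown logic: at the start of every output line vprintk() now consumes an optional "<0>".."<7>" token, remembers it in current_log_level, and always emits a token before the text. The sketch below mirrors one such call in plain user-space C; emit_with_tokens() and DEFAULT_LOGLEVEL_DEMO are made up for this note, and stdout stands in for the log buffer.

#include <stdio.h>

#define DEFAULT_LOGLEVEL_DEMO 4	/* stands in for default_message_loglevel */

static void emit_with_tokens(const char *buf)
{
	int new_text_line = 1;
	int level = DEFAULT_LOGLEVEL_DEMO;
	const char *p;

	for (p = buf; *p; p++) {
		if (new_text_line) {
			/* if the caller supplied "<N>", honour it and skip it */
			if (p[0] == '<' && p[1] >= '0' && p[1] <= '7' &&
			    p[2] == '>') {
				level = p[1] - '0';
				p += 3;
			}
			/* always output a token, as the new vprintk() does */
			printf("<%d>", level);
			new_text_line = 0;
			if (!*p)
				break;
		}
		putchar(*p);
		if (*p == '\n')
			new_text_line = 1;
	}
}

int main(void)
{
	emit_with_tokens("<3>oops on line one\na plain continuation\n");
	return 0;
}

Run on that sample buffer the sketch prints both lines prefixed with "<3>", which is the multi-line level handling the patch is after.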
diff --git a/kernel/profile.c b/kernel/profile.c
index ae7ead82cbc9..cd26bed4cc26 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -112,8 +112,6 @@ void __init profile_init(void)
112 112
113/* Profile event notifications */ 113/* Profile event notifications */
114 114
115#ifdef CONFIG_PROFILING
116
117static BLOCKING_NOTIFIER_HEAD(task_exit_notifier); 115static BLOCKING_NOTIFIER_HEAD(task_exit_notifier);
118static ATOMIC_NOTIFIER_HEAD(task_free_notifier); 116static ATOMIC_NOTIFIER_HEAD(task_free_notifier);
119static BLOCKING_NOTIFIER_HEAD(munmap_notifier); 117static BLOCKING_NOTIFIER_HEAD(munmap_notifier);
@@ -203,8 +201,6 @@ void unregister_timer_hook(int (*hook)(struct pt_regs *))
203} 201}
204EXPORT_SYMBOL_GPL(unregister_timer_hook); 202EXPORT_SYMBOL_GPL(unregister_timer_hook);
205 203
206#endif /* CONFIG_PROFILING */
207
208 204
209#ifdef CONFIG_SMP 205#ifdef CONFIG_SMP
210/* 206/*
@@ -252,7 +248,7 @@ static void profile_flip_buffers(void)
252 mutex_lock(&profile_flip_mutex); 248 mutex_lock(&profile_flip_mutex);
253 j = per_cpu(cpu_profile_flip, get_cpu()); 249 j = per_cpu(cpu_profile_flip, get_cpu());
254 put_cpu(); 250 put_cpu();
255 on_each_cpu(__profile_flip_buffers, NULL, 0, 1); 251 on_each_cpu(__profile_flip_buffers, NULL, 1);
256 for_each_online_cpu(cpu) { 252 for_each_online_cpu(cpu) {
257 struct profile_hit *hits = per_cpu(cpu_profile_hits, cpu)[j]; 253 struct profile_hit *hits = per_cpu(cpu_profile_hits, cpu)[j];
258 for (i = 0; i < NR_PROFILE_HIT; ++i) { 254 for (i = 0; i < NR_PROFILE_HIT; ++i) {
@@ -275,7 +271,7 @@ static void profile_discard_flip_buffers(void)
275 mutex_lock(&profile_flip_mutex); 271 mutex_lock(&profile_flip_mutex);
276 i = per_cpu(cpu_profile_flip, get_cpu()); 272 i = per_cpu(cpu_profile_flip, get_cpu());
277 put_cpu(); 273 put_cpu();
278 on_each_cpu(__profile_flip_buffers, NULL, 0, 1); 274 on_each_cpu(__profile_flip_buffers, NULL, 1);
279 for_each_online_cpu(cpu) { 275 for_each_online_cpu(cpu) {
280 struct profile_hit *hits = per_cpu(cpu_profile_hits, cpu)[i]; 276 struct profile_hit *hits = per_cpu(cpu_profile_hits, cpu)[i];
281 memset(hits, 0, NR_PROFILE_HIT*sizeof(struct profile_hit)); 277 memset(hits, 0, NR_PROFILE_HIT*sizeof(struct profile_hit));
@@ -558,7 +554,7 @@ static int __init create_hash_tables(void)
558out_cleanup: 554out_cleanup:
559 prof_on = 0; 555 prof_on = 0;
560 smp_mb(); 556 smp_mb();
561 on_each_cpu(profile_nop, NULL, 0, 1); 557 on_each_cpu(profile_nop, NULL, 1);
562 for_each_online_cpu(cpu) { 558 for_each_online_cpu(cpu) {
563 struct page *page; 559 struct page *page;
564 560
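Editor's note: the profile.c hunks mostly track an API change: on_each_cpu() lost its unused "nonatomic" argument, so callers now pass (func, info, wait). A tiny single-threaded stand-in, just to show the new call shape; on_each_cpu_demo() is not the kernel function.

#include <stdio.h>

/* stand-in with the new three-argument shape: (func, info, wait) */
static int on_each_cpu_demo(void (*func)(void *), void *info, int wait)
{
	func(info);			/* pretend this ran on every CPU */
	if (wait)
		printf("waited for all CPUs to finish\n");
	return 0;
}

static void profile_flip_demo(void *unused)
{
	(void)unused;
	printf("flip per-cpu profile buffers\n");
}

int main(void)
{
	/* the old call sites read on_each_cpu(func, NULL, 0, 1) */
	on_each_cpu_demo(profile_flip_demo, NULL, 1);
	return 0;
}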
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 6c19e94fd0a5..356699a96d56 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -33,13 +33,9 @@
33 */ 33 */
34void __ptrace_link(struct task_struct *child, struct task_struct *new_parent) 34void __ptrace_link(struct task_struct *child, struct task_struct *new_parent)
35{ 35{
36 BUG_ON(!list_empty(&child->ptrace_list)); 36 BUG_ON(!list_empty(&child->ptrace_entry));
37 if (child->parent == new_parent) 37 list_add(&child->ptrace_entry, &new_parent->ptraced);
38 return;
39 list_add(&child->ptrace_list, &child->parent->ptrace_children);
40 remove_parent(child);
41 child->parent = new_parent; 38 child->parent = new_parent;
42 add_parent(child);
43} 39}
44 40
45/* 41/*
@@ -73,12 +69,8 @@ void __ptrace_unlink(struct task_struct *child)
73 BUG_ON(!child->ptrace); 69 BUG_ON(!child->ptrace);
74 70
75 child->ptrace = 0; 71 child->ptrace = 0;
76 if (ptrace_reparented(child)) { 72 child->parent = child->real_parent;
77 list_del_init(&child->ptrace_list); 73 list_del_init(&child->ptrace_entry);
78 remove_parent(child);
79 child->parent = child->real_parent;
80 add_parent(child);
81 }
82 74
83 if (task_is_traced(child)) 75 if (task_is_traced(child))
84 ptrace_untrace(child); 76 ptrace_untrace(child);
@@ -115,13 +107,13 @@ int ptrace_check_attach(struct task_struct *child, int kill)
115 read_unlock(&tasklist_lock); 107 read_unlock(&tasklist_lock);
116 108
117 if (!ret && !kill) 109 if (!ret && !kill)
118 wait_task_inactive(child); 110 ret = wait_task_inactive(child, TASK_TRACED) ? 0 : -ESRCH;
119 111
120 /* All systems go.. */ 112 /* All systems go.. */
121 return ret; 113 return ret;
122} 114}
123 115
124int __ptrace_may_attach(struct task_struct *task) 116int __ptrace_may_access(struct task_struct *task, unsigned int mode)
125{ 117{
126 /* May we inspect the given task? 118 /* May we inspect the given task?
127 * This check is used both for attaching with ptrace 119 * This check is used both for attaching with ptrace
@@ -148,16 +140,16 @@ int __ptrace_may_attach(struct task_struct *task)
148 if (!dumpable && !capable(CAP_SYS_PTRACE)) 140 if (!dumpable && !capable(CAP_SYS_PTRACE))
149 return -EPERM; 141 return -EPERM;
150 142
151 return security_ptrace(current, task); 143 return security_ptrace_may_access(task, mode);
152} 144}
153 145
154int ptrace_may_attach(struct task_struct *task) 146bool ptrace_may_access(struct task_struct *task, unsigned int mode)
155{ 147{
156 int err; 148 int err;
157 task_lock(task); 149 task_lock(task);
158 err = __ptrace_may_attach(task); 150 err = __ptrace_may_access(task, mode);
159 task_unlock(task); 151 task_unlock(task);
160 return !err; 152 return (!err ? true : false);
161} 153}
162 154
163int ptrace_attach(struct task_struct *task) 155int ptrace_attach(struct task_struct *task)
@@ -195,7 +187,7 @@ repeat:
195 /* the same process cannot be attached many times */ 187 /* the same process cannot be attached many times */
196 if (task->ptrace & PT_PTRACED) 188 if (task->ptrace & PT_PTRACED)
197 goto bad; 189 goto bad;
198 retval = __ptrace_may_attach(task); 190 retval = __ptrace_may_access(task, PTRACE_MODE_ATTACH);
199 if (retval) 191 if (retval)
200 goto bad; 192 goto bad;
201 193
@@ -492,14 +484,33 @@ int ptrace_traceme(void)
492 /* 484 /*
493 * Are we already being traced? 485 * Are we already being traced?
494 */ 486 */
487repeat:
495 task_lock(current); 488 task_lock(current);
496 if (!(current->ptrace & PT_PTRACED)) { 489 if (!(current->ptrace & PT_PTRACED)) {
497 ret = security_ptrace(current->parent, current); 490 /*
491 * See ptrace_attach() comments about the locking here.
492 */
493 unsigned long flags;
494 if (!write_trylock_irqsave(&tasklist_lock, flags)) {
495 task_unlock(current);
496 do {
497 cpu_relax();
498 } while (!write_can_lock(&tasklist_lock));
499 goto repeat;
500 }
501
502 ret = security_ptrace_traceme(current->parent);
503
498 /* 504 /*
499 * Set the ptrace bit in the process ptrace flags. 505 * Set the ptrace bit in the process ptrace flags.
506 * Then link us on our parent's ptraced list.
500 */ 507 */
501 if (!ret) 508 if (!ret) {
502 current->ptrace |= PT_PTRACED; 509 current->ptrace |= PT_PTRACED;
510 __ptrace_link(current, current->real_parent);
511 }
512
513 write_unlock_irqrestore(&tasklist_lock, flags);
503 } 514 }
504 task_unlock(current); 515 task_unlock(current);
505 return ret; 516 return ret;
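Editor's note: in the ptrace.c hunks the per-child ptrace_list/ptrace_children pair becomes a ->ptrace_entry node on the tracer's ->ptraced list, and unlinking is just a re-parent plus list_del_init(). A self-contained sketch with a minimal intrusive list follows; struct task_demo and the helper names are invented for illustration, not the kernel's task_struct or list API.

#include <stdio.h>

struct list_head { struct list_head *next, *prev; };

static void INIT_LIST_HEAD(struct list_head *h) { h->next = h->prev = h; }

static void list_add_demo(struct list_head *new, struct list_head *head)
{
	new->next = head->next;
	new->prev = head;
	head->next->prev = new;
	head->next = new;
}

static void list_del_init_demo(struct list_head *e)
{
	e->prev->next = e->next;
	e->next->prev = e->prev;
	INIT_LIST_HEAD(e);
}

struct task_demo {
	const char *comm;
	struct task_demo *parent, *real_parent;
	struct list_head ptraced;	/* tasks this task is tracing */
	struct list_head ptrace_entry;	/* our node on the tracer's list */
};

static void ptrace_link_demo(struct task_demo *child, struct task_demo *tracer)
{
	list_add_demo(&child->ptrace_entry, &tracer->ptraced);
	child->parent = tracer;
}

static void ptrace_unlink_demo(struct task_demo *child)
{
	child->parent = child->real_parent;
	list_del_init_demo(&child->ptrace_entry);
}

int main(void)
{
	struct task_demo shell  = { .comm = "bash"   };
	struct task_demo tracer = { .comm = "gdb"    };
	struct task_demo child  = { .comm = "victim" };

	INIT_LIST_HEAD(&shell.ptraced);
	INIT_LIST_HEAD(&tracer.ptraced);
	INIT_LIST_HEAD(&child.ptrace_entry);
	child.real_parent = child.parent = &shell;

	ptrace_link_demo(&child, &tracer);
	printf("%s traced by %s\n", child.comm, child.parent->comm);

	ptrace_unlink_demo(&child);
	printf("%s back under %s\n", child.comm, child.parent->comm);
	return 0;
}

Link and unlink now touch only the child's own list node, which is what lets __ptrace_unlink() shrink to a re-parent plus one list operation.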
diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c
index f4ffbd0f306f..37f72e551542 100644
--- a/kernel/rcuclassic.c
+++ b/kernel/rcuclassic.c
@@ -47,6 +47,7 @@
47#include <linux/notifier.h> 47#include <linux/notifier.h>
48#include <linux/cpu.h> 48#include <linux/cpu.h>
49#include <linux/mutex.h> 49#include <linux/mutex.h>
50#include <linux/time.h>
50 51
51#ifdef CONFIG_DEBUG_LOCK_ALLOC 52#ifdef CONFIG_DEBUG_LOCK_ALLOC
52static struct lock_class_key rcu_lock_key; 53static struct lock_class_key rcu_lock_key;
@@ -60,12 +61,14 @@ EXPORT_SYMBOL_GPL(rcu_lock_map);
60static struct rcu_ctrlblk rcu_ctrlblk = { 61static struct rcu_ctrlblk rcu_ctrlblk = {
61 .cur = -300, 62 .cur = -300,
62 .completed = -300, 63 .completed = -300,
64 .pending = -300,
63 .lock = __SPIN_LOCK_UNLOCKED(&rcu_ctrlblk.lock), 65 .lock = __SPIN_LOCK_UNLOCKED(&rcu_ctrlblk.lock),
64 .cpumask = CPU_MASK_NONE, 66 .cpumask = CPU_MASK_NONE,
65}; 67};
66static struct rcu_ctrlblk rcu_bh_ctrlblk = { 68static struct rcu_ctrlblk rcu_bh_ctrlblk = {
67 .cur = -300, 69 .cur = -300,
68 .completed = -300, 70 .completed = -300,
71 .pending = -300,
69 .lock = __SPIN_LOCK_UNLOCKED(&rcu_bh_ctrlblk.lock), 72 .lock = __SPIN_LOCK_UNLOCKED(&rcu_bh_ctrlblk.lock),
70 .cpumask = CPU_MASK_NONE, 73 .cpumask = CPU_MASK_NONE,
71}; 74};
@@ -83,18 +86,36 @@ static void force_quiescent_state(struct rcu_data *rdp,
83{ 86{
84 int cpu; 87 int cpu;
85 cpumask_t cpumask; 88 cpumask_t cpumask;
89 unsigned long flags;
90
86 set_need_resched(); 91 set_need_resched();
92 spin_lock_irqsave(&rcp->lock, flags);
87 if (unlikely(!rcp->signaled)) { 93 if (unlikely(!rcp->signaled)) {
88 rcp->signaled = 1; 94 rcp->signaled = 1;
89 /* 95 /*
90 * Don't send IPI to itself. With irqs disabled, 96 * Don't send IPI to itself. With irqs disabled,
91 * rdp->cpu is the current cpu. 97 * rdp->cpu is the current cpu.
98 *
99 * cpu_online_map is updated by the _cpu_down()
100 * using __stop_machine(). Since we're in irqs disabled
 101 * section, __stop_machine() is not executing, hence
102 * the cpu_online_map is stable.
103 *
104 * However, a cpu might have been offlined _just_ before
105 * we disabled irqs while entering here.
106 * And rcu subsystem might not yet have handled the CPU_DEAD
107 * notification, leading to the offlined cpu's bit
108 * being set in the rcp->cpumask.
109 *
110 * Hence cpumask = (rcp->cpumask & cpu_online_map) to prevent
111 * sending smp_reschedule() to an offlined CPU.
92 */ 112 */
93 cpumask = rcp->cpumask; 113 cpus_and(cpumask, rcp->cpumask, cpu_online_map);
94 cpu_clear(rdp->cpu, cpumask); 114 cpu_clear(rdp->cpu, cpumask);
95 for_each_cpu_mask(cpu, cpumask) 115 for_each_cpu_mask_nr(cpu, cpumask)
96 smp_send_reschedule(cpu); 116 smp_send_reschedule(cpu);
97 } 117 }
118 spin_unlock_irqrestore(&rcp->lock, flags);
98} 119}
99#else 120#else
100static inline void force_quiescent_state(struct rcu_data *rdp, 121static inline void force_quiescent_state(struct rcu_data *rdp,
@@ -104,6 +125,126 @@ static inline void force_quiescent_state(struct rcu_data *rdp,
104} 125}
105#endif 126#endif
106 127
128static void __call_rcu(struct rcu_head *head, struct rcu_ctrlblk *rcp,
129 struct rcu_data *rdp)
130{
131 long batch;
132
133 head->next = NULL;
134 smp_mb(); /* Read of rcu->cur must happen after any change by caller. */
135
136 /*
137 * Determine the batch number of this callback.
138 *
139 * Using ACCESS_ONCE to avoid the following error when gcc eliminates
140 * local variable "batch" and emits codes like this:
141 * 1) rdp->batch = rcp->cur + 1 # gets old value
142 * ......
143 * 2)rcu_batch_after(rcp->cur + 1, rdp->batch) # gets new value
144 * then [*nxttail[0], *nxttail[1]) may contain callbacks
145 * that batch# = rdp->batch, see the comment of struct rcu_data.
146 */
147 batch = ACCESS_ONCE(rcp->cur) + 1;
148
149 if (rdp->nxtlist && rcu_batch_after(batch, rdp->batch)) {
150 /* process callbacks */
151 rdp->nxttail[0] = rdp->nxttail[1];
152 rdp->nxttail[1] = rdp->nxttail[2];
153 if (rcu_batch_after(batch - 1, rdp->batch))
154 rdp->nxttail[0] = rdp->nxttail[2];
155 }
156
157 rdp->batch = batch;
158 *rdp->nxttail[2] = head;
159 rdp->nxttail[2] = &head->next;
160
161 if (unlikely(++rdp->qlen > qhimark)) {
162 rdp->blimit = INT_MAX;
163 force_quiescent_state(rdp, &rcu_ctrlblk);
164 }
165}
166
167#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
168
169static void record_gp_stall_check_time(struct rcu_ctrlblk *rcp)
170{
171 rcp->gp_start = jiffies;
172 rcp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_CHECK;
173}
174
175static void print_other_cpu_stall(struct rcu_ctrlblk *rcp)
176{
177 int cpu;
178 long delta;
179 unsigned long flags;
180
181 /* Only let one CPU complain about others per time interval. */
182
183 spin_lock_irqsave(&rcp->lock, flags);
184 delta = jiffies - rcp->jiffies_stall;
185 if (delta < 2 || rcp->cur != rcp->completed) {
186 spin_unlock_irqrestore(&rcp->lock, flags);
187 return;
188 }
189 rcp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK;
190 spin_unlock_irqrestore(&rcp->lock, flags);
191
192 /* OK, time to rat on our buddy... */
193
194 printk(KERN_ERR "RCU detected CPU stalls:");
195 for_each_possible_cpu(cpu) {
196 if (cpu_isset(cpu, rcp->cpumask))
197 printk(" %d", cpu);
198 }
199 printk(" (detected by %d, t=%ld jiffies)\n",
200 smp_processor_id(), (long)(jiffies - rcp->gp_start));
201}
202
203static void print_cpu_stall(struct rcu_ctrlblk *rcp)
204{
205 unsigned long flags;
206
207 printk(KERN_ERR "RCU detected CPU %d stall (t=%lu/%lu jiffies)\n",
208 smp_processor_id(), jiffies,
209 jiffies - rcp->gp_start);
210 dump_stack();
211 spin_lock_irqsave(&rcp->lock, flags);
212 if ((long)(jiffies - rcp->jiffies_stall) >= 0)
213 rcp->jiffies_stall =
214 jiffies + RCU_SECONDS_TILL_STALL_RECHECK;
215 spin_unlock_irqrestore(&rcp->lock, flags);
216 set_need_resched(); /* kick ourselves to get things going. */
217}
218
219static void check_cpu_stall(struct rcu_ctrlblk *rcp)
220{
221 long delta;
222
223 delta = jiffies - rcp->jiffies_stall;
224 if (cpu_isset(smp_processor_id(), rcp->cpumask) && delta >= 0) {
225
226 /* We haven't checked in, so go dump stack. */
227 print_cpu_stall(rcp);
228
229 } else if (rcp->cur != rcp->completed && delta >= 2) {
230
231 /* They had two seconds to dump stack, so complain. */
232 print_other_cpu_stall(rcp);
233 }
234}
235
236#else /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
237
238static void record_gp_stall_check_time(struct rcu_ctrlblk *rcp)
239{
240}
241
242static inline void check_cpu_stall(struct rcu_ctrlblk *rcp)
243{
244}
245
246#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
247
107/** 248/**
108 * call_rcu - Queue an RCU callback for invocation after a grace period. 249 * call_rcu - Queue an RCU callback for invocation after a grace period.
109 * @head: structure to be used for queueing the RCU updates. 250 * @head: structure to be used for queueing the RCU updates.
@@ -119,18 +260,10 @@ void call_rcu(struct rcu_head *head,
119 void (*func)(struct rcu_head *rcu)) 260 void (*func)(struct rcu_head *rcu))
120{ 261{
121 unsigned long flags; 262 unsigned long flags;
122 struct rcu_data *rdp;
123 263
124 head->func = func; 264 head->func = func;
125 head->next = NULL;
126 local_irq_save(flags); 265 local_irq_save(flags);
127 rdp = &__get_cpu_var(rcu_data); 266 __call_rcu(head, &rcu_ctrlblk, &__get_cpu_var(rcu_data));
128 *rdp->nxttail = head;
129 rdp->nxttail = &head->next;
130 if (unlikely(++rdp->qlen > qhimark)) {
131 rdp->blimit = INT_MAX;
132 force_quiescent_state(rdp, &rcu_ctrlblk);
133 }
134 local_irq_restore(flags); 267 local_irq_restore(flags);
135} 268}
136EXPORT_SYMBOL_GPL(call_rcu); 269EXPORT_SYMBOL_GPL(call_rcu);
@@ -155,20 +288,10 @@ void call_rcu_bh(struct rcu_head *head,
155 void (*func)(struct rcu_head *rcu)) 288 void (*func)(struct rcu_head *rcu))
156{ 289{
157 unsigned long flags; 290 unsigned long flags;
158 struct rcu_data *rdp;
159 291
160 head->func = func; 292 head->func = func;
161 head->next = NULL;
162 local_irq_save(flags); 293 local_irq_save(flags);
163 rdp = &__get_cpu_var(rcu_bh_data); 294 __call_rcu(head, &rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data));
164 *rdp->nxttail = head;
165 rdp->nxttail = &head->next;
166
167 if (unlikely(++rdp->qlen > qhimark)) {
168 rdp->blimit = INT_MAX;
169 force_quiescent_state(rdp, &rcu_bh_ctrlblk);
170 }
171
172 local_irq_restore(flags); 295 local_irq_restore(flags);
173} 296}
174EXPORT_SYMBOL_GPL(call_rcu_bh); 297EXPORT_SYMBOL_GPL(call_rcu_bh);
@@ -197,12 +320,6 @@ EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
197static inline void raise_rcu_softirq(void) 320static inline void raise_rcu_softirq(void)
198{ 321{
199 raise_softirq(RCU_SOFTIRQ); 322 raise_softirq(RCU_SOFTIRQ);
200 /*
201 * The smp_mb() here is required to ensure that this cpu's
202 * __rcu_process_callbacks() reads the most recently updated
203 * value of rcu->cur.
204 */
205 smp_mb();
206} 323}
207 324
208/* 325/*
@@ -211,6 +328,7 @@ static inline void raise_rcu_softirq(void)
211 */ 328 */
212static void rcu_do_batch(struct rcu_data *rdp) 329static void rcu_do_batch(struct rcu_data *rdp)
213{ 330{
331 unsigned long flags;
214 struct rcu_head *next, *list; 332 struct rcu_head *next, *list;
215 int count = 0; 333 int count = 0;
216 334
@@ -225,9 +343,9 @@ static void rcu_do_batch(struct rcu_data *rdp)
225 } 343 }
226 rdp->donelist = list; 344 rdp->donelist = list;
227 345
228 local_irq_disable(); 346 local_irq_save(flags);
229 rdp->qlen -= count; 347 rdp->qlen -= count;
230 local_irq_enable(); 348 local_irq_restore(flags);
231 if (rdp->blimit == INT_MAX && rdp->qlen <= qlowmark) 349 if (rdp->blimit == INT_MAX && rdp->qlen <= qlowmark)
232 rdp->blimit = blimit; 350 rdp->blimit = blimit;
233 351
@@ -255,6 +373,7 @@ static void rcu_do_batch(struct rcu_data *rdp)
255 * rcu_check_quiescent_state calls rcu_start_batch(0) to start the next grace 373 * rcu_check_quiescent_state calls rcu_start_batch(0) to start the next grace
256 * period (if necessary). 374 * period (if necessary).
257 */ 375 */
376
258/* 377/*
259 * Register a new batch of callbacks, and start it up if there is currently no 378 * Register a new batch of callbacks, and start it up if there is currently no
260 * active batch and the batch to be registered has not already occurred. 379 * active batch and the batch to be registered has not already occurred.
@@ -262,15 +381,10 @@ static void rcu_do_batch(struct rcu_data *rdp)
262 */ 381 */
263static void rcu_start_batch(struct rcu_ctrlblk *rcp) 382static void rcu_start_batch(struct rcu_ctrlblk *rcp)
264{ 383{
265 if (rcp->next_pending && 384 if (rcp->cur != rcp->pending &&
266 rcp->completed == rcp->cur) { 385 rcp->completed == rcp->cur) {
267 rcp->next_pending = 0;
268 /*
269 * next_pending == 0 must be visible in
270 * __rcu_process_callbacks() before it can see new value of cur.
271 */
272 smp_wmb();
273 rcp->cur++; 386 rcp->cur++;
387 record_gp_stall_check_time(rcp);
274 388
275 /* 389 /*
276 * Accessing nohz_cpu_mask before incrementing rcp->cur needs a 390 * Accessing nohz_cpu_mask before incrementing rcp->cur needs a
@@ -308,6 +422,8 @@ static void cpu_quiet(int cpu, struct rcu_ctrlblk *rcp)
308static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp, 422static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp,
309 struct rcu_data *rdp) 423 struct rcu_data *rdp)
310{ 424{
425 unsigned long flags;
426
311 if (rdp->quiescbatch != rcp->cur) { 427 if (rdp->quiescbatch != rcp->cur) {
312 /* start new grace period: */ 428 /* start new grace period: */
313 rdp->qs_pending = 1; 429 rdp->qs_pending = 1;
@@ -331,7 +447,7 @@ static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp,
331 return; 447 return;
332 rdp->qs_pending = 0; 448 rdp->qs_pending = 0;
333 449
334 spin_lock(&rcp->lock); 450 spin_lock_irqsave(&rcp->lock, flags);
335 /* 451 /*
336 * rdp->quiescbatch/rcp->cur and the cpu bitmap can come out of sync 452 * rdp->quiescbatch/rcp->cur and the cpu bitmap can come out of sync
337 * during cpu startup. Ignore the quiescent state. 453 * during cpu startup. Ignore the quiescent state.
@@ -339,7 +455,7 @@ static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp,
339 if (likely(rdp->quiescbatch == rcp->cur)) 455 if (likely(rdp->quiescbatch == rcp->cur))
340 cpu_quiet(rdp->cpu, rcp); 456 cpu_quiet(rdp->cpu, rcp);
341 457
342 spin_unlock(&rcp->lock); 458 spin_unlock_irqrestore(&rcp->lock, flags);
343} 459}
344 460
345 461
@@ -350,29 +466,38 @@ static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp,
350 * which is dead and hence not processing interrupts. 466 * which is dead and hence not processing interrupts.
351 */ 467 */
352static void rcu_move_batch(struct rcu_data *this_rdp, struct rcu_head *list, 468static void rcu_move_batch(struct rcu_data *this_rdp, struct rcu_head *list,
353 struct rcu_head **tail) 469 struct rcu_head **tail, long batch)
354{ 470{
355 local_irq_disable(); 471 unsigned long flags;
356 *this_rdp->nxttail = list; 472
357 if (list) 473 if (list) {
358 this_rdp->nxttail = tail; 474 local_irq_save(flags);
359 local_irq_enable(); 475 this_rdp->batch = batch;
476 *this_rdp->nxttail[2] = list;
477 this_rdp->nxttail[2] = tail;
478 local_irq_restore(flags);
479 }
360} 480}
361 481
362static void __rcu_offline_cpu(struct rcu_data *this_rdp, 482static void __rcu_offline_cpu(struct rcu_data *this_rdp,
363 struct rcu_ctrlblk *rcp, struct rcu_data *rdp) 483 struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
364{ 484{
365 /* if the cpu going offline owns the grace period 485 unsigned long flags;
486
487 /*
488 * if the cpu going offline owns the grace period
366 * we can block indefinitely waiting for it, so flush 489 * we can block indefinitely waiting for it, so flush
367 * it here 490 * it here
368 */ 491 */
369 spin_lock_bh(&rcp->lock); 492 spin_lock_irqsave(&rcp->lock, flags);
370 if (rcp->cur != rcp->completed) 493 if (rcp->cur != rcp->completed)
371 cpu_quiet(rdp->cpu, rcp); 494 cpu_quiet(rdp->cpu, rcp);
372 spin_unlock_bh(&rcp->lock); 495 rcu_move_batch(this_rdp, rdp->donelist, rdp->donetail, rcp->cur + 1);
373 rcu_move_batch(this_rdp, rdp->donelist, rdp->donetail); 496 rcu_move_batch(this_rdp, rdp->nxtlist, rdp->nxttail[2], rcp->cur + 1);
374 rcu_move_batch(this_rdp, rdp->curlist, rdp->curtail); 497 spin_unlock(&rcp->lock);
375 rcu_move_batch(this_rdp, rdp->nxtlist, rdp->nxttail); 498
499 this_rdp->qlen += rdp->qlen;
500 local_irq_restore(flags);
376} 501}
377 502
378static void rcu_offline_cpu(int cpu) 503static void rcu_offline_cpu(int cpu)
@@ -402,38 +527,52 @@ static void rcu_offline_cpu(int cpu)
402static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp, 527static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp,
403 struct rcu_data *rdp) 528 struct rcu_data *rdp)
404{ 529{
405 if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch)) { 530 unsigned long flags;
406 *rdp->donetail = rdp->curlist; 531 long completed_snap;
407 rdp->donetail = rdp->curtail;
408 rdp->curlist = NULL;
409 rdp->curtail = &rdp->curlist;
410 }
411 532
412 if (rdp->nxtlist && !rdp->curlist) { 533 if (rdp->nxtlist) {
413 local_irq_disable(); 534 local_irq_save(flags);
414 rdp->curlist = rdp->nxtlist; 535 completed_snap = ACCESS_ONCE(rcp->completed);
415 rdp->curtail = rdp->nxttail;
416 rdp->nxtlist = NULL;
417 rdp->nxttail = &rdp->nxtlist;
418 local_irq_enable();
419 536
420 /* 537 /*
421 * start the next batch of callbacks 538 * move the other grace-period-completed entries to
539 * [rdp->nxtlist, *rdp->nxttail[0]) temporarily
422 */ 540 */
541 if (!rcu_batch_before(completed_snap, rdp->batch))
542 rdp->nxttail[0] = rdp->nxttail[1] = rdp->nxttail[2];
543 else if (!rcu_batch_before(completed_snap, rdp->batch - 1))
544 rdp->nxttail[0] = rdp->nxttail[1];
423 545
424 /* determine batch number */ 546 /*
425 rdp->batch = rcp->cur + 1; 547 * the grace period for entries in
426 /* see the comment and corresponding wmb() in 548 * [rdp->nxtlist, *rdp->nxttail[0]) has completed and
427 * the rcu_start_batch() 549 * move these entries to donelist
428 */ 550 */
429 smp_rmb(); 551 if (rdp->nxttail[0] != &rdp->nxtlist) {
552 *rdp->donetail = rdp->nxtlist;
553 rdp->donetail = rdp->nxttail[0];
554 rdp->nxtlist = *rdp->nxttail[0];
555 *rdp->donetail = NULL;
556
557 if (rdp->nxttail[1] == rdp->nxttail[0])
558 rdp->nxttail[1] = &rdp->nxtlist;
559 if (rdp->nxttail[2] == rdp->nxttail[0])
560 rdp->nxttail[2] = &rdp->nxtlist;
561 rdp->nxttail[0] = &rdp->nxtlist;
562 }
563
564 local_irq_restore(flags);
565
566 if (rcu_batch_after(rdp->batch, rcp->pending)) {
567 unsigned long flags2;
430 568
431 if (!rcp->next_pending) {
432 /* and start it/schedule start if it's a new batch */ 569 /* and start it/schedule start if it's a new batch */
433 spin_lock(&rcp->lock); 570 spin_lock_irqsave(&rcp->lock, flags2);
434 rcp->next_pending = 1; 571 if (rcu_batch_after(rdp->batch, rcp->pending)) {
435 rcu_start_batch(rcp); 572 rcp->pending = rdp->batch;
436 spin_unlock(&rcp->lock); 573 rcu_start_batch(rcp);
574 }
575 spin_unlock_irqrestore(&rcp->lock, flags2);
437 } 576 }
438 } 577 }
439 578
@@ -444,21 +583,53 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp,
444 583
445static void rcu_process_callbacks(struct softirq_action *unused) 584static void rcu_process_callbacks(struct softirq_action *unused)
446{ 585{
586 /*
587 * Memory references from any prior RCU read-side critical sections
588 * executed by the interrupted code must be see before any RCU
589 * grace-period manupulations below.
590 */
591
592 smp_mb(); /* See above block comment. */
593
447 __rcu_process_callbacks(&rcu_ctrlblk, &__get_cpu_var(rcu_data)); 594 __rcu_process_callbacks(&rcu_ctrlblk, &__get_cpu_var(rcu_data));
448 __rcu_process_callbacks(&rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data)); 595 __rcu_process_callbacks(&rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data));
596
597 /*
598 * Memory references from any later RCU read-side critical sections
 599 * executed by the interrupted code must be seen after any RCU
 600 * grace-period manipulations above.
601 */
602
603 smp_mb(); /* See above block comment. */
449} 604}
450 605
451static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) 606static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
452{ 607{
453 /* This cpu has pending rcu entries and the grace period 608 /* Check for CPU stalls, if enabled. */
454 * for them has completed. 609 check_cpu_stall(rcp);
455 */
456 if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch))
457 return 1;
458 610
459 /* This cpu has no pending entries, but there are new entries */ 611 if (rdp->nxtlist) {
460 if (!rdp->curlist && rdp->nxtlist) 612 long completed_snap = ACCESS_ONCE(rcp->completed);
461 return 1; 613
614 /*
615 * This cpu has pending rcu entries and the grace period
616 * for them has completed.
617 */
618 if (!rcu_batch_before(completed_snap, rdp->batch))
619 return 1;
620 if (!rcu_batch_before(completed_snap, rdp->batch - 1) &&
621 rdp->nxttail[0] != rdp->nxttail[1])
622 return 1;
623 if (rdp->nxttail[0] != &rdp->nxtlist)
624 return 1;
625
626 /*
627 * This cpu has pending rcu entries and the new batch
 628 * for them hasn't been started nor scheduled to start
629 */
630 if (rcu_batch_after(rdp->batch, rcp->pending))
631 return 1;
632 }
462 633
463 /* This cpu has finished callbacks to invoke */ 634 /* This cpu has finished callbacks to invoke */
464 if (rdp->donelist) 635 if (rdp->donelist)
@@ -494,32 +665,69 @@ int rcu_needs_cpu(int cpu)
494 struct rcu_data *rdp = &per_cpu(rcu_data, cpu); 665 struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
495 struct rcu_data *rdp_bh = &per_cpu(rcu_bh_data, cpu); 666 struct rcu_data *rdp_bh = &per_cpu(rcu_bh_data, cpu);
496 667
497 return (!!rdp->curlist || !!rdp_bh->curlist || rcu_pending(cpu)); 668 return !!rdp->nxtlist || !!rdp_bh->nxtlist || rcu_pending(cpu);
498} 669}
499 670
671/*
672 * Top-level function driving RCU grace-period detection, normally
673 * invoked from the scheduler-clock interrupt. This function simply
674 * increments counters that are read only from softirq by this same
675 * CPU, so there are no memory barriers required.
676 */
500void rcu_check_callbacks(int cpu, int user) 677void rcu_check_callbacks(int cpu, int user)
501{ 678{
502 if (user || 679 if (user ||
503 (idle_cpu(cpu) && !in_softirq() && 680 (idle_cpu(cpu) && !in_softirq() &&
504 hardirq_count() <= (1 << HARDIRQ_SHIFT))) { 681 hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
682
683 /*
684 * Get here if this CPU took its interrupt from user
685 * mode or from the idle loop, and if this is not a
686 * nested interrupt. In this case, the CPU is in
687 * a quiescent state, so count it.
688 *
689 * Also do a memory barrier. This is needed to handle
690 * the case where writes from a preempt-disable section
691 * of code get reordered into schedule() by this CPU's
692 * write buffer. The memory barrier makes sure that
 693 * the rcu_qsctr_inc() and rcu_bh_qsctr_inc() are seen
694 * by other CPUs to happen after any such write.
695 */
696
697 smp_mb(); /* See above block comment. */
505 rcu_qsctr_inc(cpu); 698 rcu_qsctr_inc(cpu);
506 rcu_bh_qsctr_inc(cpu); 699 rcu_bh_qsctr_inc(cpu);
507 } else if (!in_softirq()) 700
701 } else if (!in_softirq()) {
702
703 /*
704 * Get here if this CPU did not take its interrupt from
705 * softirq, in other words, if it is not interrupting
706 * a rcu_bh read-side critical section. This is an _bh
707 * critical section, so count it. The memory barrier
708 * is needed for the same reason as is the above one.
709 */
710
711 smp_mb(); /* See above block comment. */
508 rcu_bh_qsctr_inc(cpu); 712 rcu_bh_qsctr_inc(cpu);
713 }
509 raise_rcu_softirq(); 714 raise_rcu_softirq();
510} 715}
511 716
512static void rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp, 717static void rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp,
513 struct rcu_data *rdp) 718 struct rcu_data *rdp)
514{ 719{
720 unsigned long flags;
721
722 spin_lock_irqsave(&rcp->lock, flags);
515 memset(rdp, 0, sizeof(*rdp)); 723 memset(rdp, 0, sizeof(*rdp));
516 rdp->curtail = &rdp->curlist; 724 rdp->nxttail[0] = rdp->nxttail[1] = rdp->nxttail[2] = &rdp->nxtlist;
517 rdp->nxttail = &rdp->nxtlist;
518 rdp->donetail = &rdp->donelist; 725 rdp->donetail = &rdp->donelist;
519 rdp->quiescbatch = rcp->completed; 726 rdp->quiescbatch = rcp->completed;
520 rdp->qs_pending = 0; 727 rdp->qs_pending = 0;
521 rdp->cpu = cpu; 728 rdp->cpu = cpu;
522 rdp->blimit = blimit; 729 rdp->blimit = blimit;
730 spin_unlock_irqrestore(&rcp->lock, flags);
523} 731}
524 732
525static void __cpuinit rcu_online_cpu(int cpu) 733static void __cpuinit rcu_online_cpu(int cpu)
@@ -529,7 +737,7 @@ static void __cpuinit rcu_online_cpu(int cpu)
529 737
530 rcu_init_percpu_data(cpu, &rcu_ctrlblk, rdp); 738 rcu_init_percpu_data(cpu, &rcu_ctrlblk, rdp);
531 rcu_init_percpu_data(cpu, &rcu_bh_ctrlblk, bh_rdp); 739 rcu_init_percpu_data(cpu, &rcu_bh_ctrlblk, bh_rdp);
532 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks, NULL); 740 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
533} 741}
534 742
535static int __cpuinit rcu_cpu_notify(struct notifier_block *self, 743static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
@@ -564,6 +772,9 @@ static struct notifier_block __cpuinitdata rcu_nb = {
564 */ 772 */
565void __init __rcu_init(void) 773void __init __rcu_init(void)
566{ 774{
775#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
776 printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n");
777#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
567 rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, 778 rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE,
568 (void *)(long)smp_processor_id()); 779 (void *)(long)smp_processor_id());
569 /* Register notifier for non-boot CPUs */ 780 /* Register notifier for non-boot CPUs */
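Editor's note: much of the rcuclassic.c churn above hinges on two small conventions: grace-period batch numbers are compared with signed subtraction (rcu_batch_before()/rcu_batch_after()), and __call_rcu() samples rcp->cur exactly once via ACCESS_ONCE(). A user-space illustration of both, assuming a GCC-style typeof; the _demo names are local stand-ins, not the kernel macros.

#include <stdio.h>

/* local stand-in for the kernel's ACCESS_ONCE() (GCC typeof extension) */
#define ACCESS_ONCE_DEMO(x) (*(volatile typeof(x) *)&(x))

/* same signed-subtraction idea as rcu_batch_before()/rcu_batch_after() */
static int batch_before(long a, long b) { return (a - b) < 0; }
static int batch_after(long a, long b)  { return (a - b) > 0; }

static long rcp_cur = -300;	/* grace-period counter starts at -300 */

int main(void)
{
	/* sample rcp->cur once, as the new __call_rcu() does */
	long batch = ACCESS_ONCE_DEMO(rcp_cur) + 1;

	printf("new callback queued for batch %ld\n", batch);
	printf("batch after cur?  %d\n", batch_after(batch, rcp_cur));
	printf("batch before cur? %d\n", batch_before(batch, rcp_cur));
	return 0;
}

Because the counters start at -300 and may wrap, the subtraction trick keeps the ordering correct, which is why the new code only compares batches through these helpers.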
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index c09605f8d16c..467d5940f624 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -39,16 +39,16 @@
39#include <linux/sched.h> 39#include <linux/sched.h>
40#include <asm/atomic.h> 40#include <asm/atomic.h>
41#include <linux/bitops.h> 41#include <linux/bitops.h>
42#include <linux/completion.h>
43#include <linux/percpu.h> 42#include <linux/percpu.h>
44#include <linux/notifier.h> 43#include <linux/notifier.h>
45#include <linux/cpu.h> 44#include <linux/cpu.h>
46#include <linux/mutex.h> 45#include <linux/mutex.h>
47#include <linux/module.h> 46#include <linux/module.h>
48 47
49struct rcu_synchronize { 48enum rcu_barrier {
50 struct rcu_head head; 49 RCU_BARRIER_STD,
51 struct completion completion; 50 RCU_BARRIER_BH,
51 RCU_BARRIER_SCHED,
52}; 52};
53 53
54static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL}; 54static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL};
@@ -60,7 +60,7 @@ static struct completion rcu_barrier_completion;
60 * Awaken the corresponding synchronize_rcu() instance now that a 60 * Awaken the corresponding synchronize_rcu() instance now that a
61 * grace period has elapsed. 61 * grace period has elapsed.
62 */ 62 */
63static void wakeme_after_rcu(struct rcu_head *head) 63void wakeme_after_rcu(struct rcu_head *head)
64{ 64{
65 struct rcu_synchronize *rcu; 65 struct rcu_synchronize *rcu;
66 66
@@ -77,17 +77,8 @@ static void wakeme_after_rcu(struct rcu_head *head)
77 * sections are delimited by rcu_read_lock() and rcu_read_unlock(), 77 * sections are delimited by rcu_read_lock() and rcu_read_unlock(),
78 * and may be nested. 78 * and may be nested.
79 */ 79 */
80void synchronize_rcu(void) 80void synchronize_rcu(void); /* Makes kernel-doc tools happy */
81{ 81synchronize_rcu_xxx(synchronize_rcu, call_rcu)
82 struct rcu_synchronize rcu;
83
84 init_completion(&rcu.completion);
85 /* Will wake me after RCU finished */
86 call_rcu(&rcu.head, wakeme_after_rcu);
87
88 /* Wait for it */
89 wait_for_completion(&rcu.completion);
90}
91EXPORT_SYMBOL_GPL(synchronize_rcu); 82EXPORT_SYMBOL_GPL(synchronize_rcu);
92 83
93static void rcu_barrier_callback(struct rcu_head *notused) 84static void rcu_barrier_callback(struct rcu_head *notused)
@@ -99,19 +90,30 @@ static void rcu_barrier_callback(struct rcu_head *notused)
99/* 90/*
100 * Called with preemption disabled, and from cross-cpu IRQ context. 91 * Called with preemption disabled, and from cross-cpu IRQ context.
101 */ 92 */
102static void rcu_barrier_func(void *notused) 93static void rcu_barrier_func(void *type)
103{ 94{
104 int cpu = smp_processor_id(); 95 int cpu = smp_processor_id();
105 struct rcu_head *head = &per_cpu(rcu_barrier_head, cpu); 96 struct rcu_head *head = &per_cpu(rcu_barrier_head, cpu);
106 97
107 atomic_inc(&rcu_barrier_cpu_count); 98 atomic_inc(&rcu_barrier_cpu_count);
108 call_rcu(head, rcu_barrier_callback); 99 switch ((enum rcu_barrier)type) {
100 case RCU_BARRIER_STD:
101 call_rcu(head, rcu_barrier_callback);
102 break;
103 case RCU_BARRIER_BH:
104 call_rcu_bh(head, rcu_barrier_callback);
105 break;
106 case RCU_BARRIER_SCHED:
107 call_rcu_sched(head, rcu_barrier_callback);
108 break;
109 }
109} 110}
110 111
111/** 112/*
112 * rcu_barrier - Wait until all the in-flight RCUs are complete. 113 * Orchestrate the specified type of RCU barrier, waiting for all
114 * RCU callbacks of the specified type to complete.
113 */ 115 */
114void rcu_barrier(void) 116static void _rcu_barrier(enum rcu_barrier type)
115{ 117{
116 BUG_ON(in_interrupt()); 118 BUG_ON(in_interrupt());
117 /* Take cpucontrol mutex to protect against CPU hotplug */ 119 /* Take cpucontrol mutex to protect against CPU hotplug */
@@ -127,13 +129,39 @@ void rcu_barrier(void)
127 * until all the callbacks are queued. 129 * until all the callbacks are queued.
128 */ 130 */
129 rcu_read_lock(); 131 rcu_read_lock();
130 on_each_cpu(rcu_barrier_func, NULL, 0, 1); 132 on_each_cpu(rcu_barrier_func, (void *)type, 1);
131 rcu_read_unlock(); 133 rcu_read_unlock();
132 wait_for_completion(&rcu_barrier_completion); 134 wait_for_completion(&rcu_barrier_completion);
133 mutex_unlock(&rcu_barrier_mutex); 135 mutex_unlock(&rcu_barrier_mutex);
134} 136}
137
138/**
139 * rcu_barrier - Wait until all in-flight call_rcu() callbacks complete.
140 */
141void rcu_barrier(void)
142{
143 _rcu_barrier(RCU_BARRIER_STD);
144}
135EXPORT_SYMBOL_GPL(rcu_barrier); 145EXPORT_SYMBOL_GPL(rcu_barrier);
136 146
147/**
148 * rcu_barrier_bh - Wait until all in-flight call_rcu_bh() callbacks complete.
149 */
150void rcu_barrier_bh(void)
151{
152 _rcu_barrier(RCU_BARRIER_BH);
153}
154EXPORT_SYMBOL_GPL(rcu_barrier_bh);
155
156/**
157 * rcu_barrier_sched - Wait for in-flight call_rcu_sched() callbacks.
158 */
159void rcu_barrier_sched(void)
160{
161 _rcu_barrier(RCU_BARRIER_SCHED);
162}
163EXPORT_SYMBOL_GPL(rcu_barrier_sched);
164
137void __init rcu_init(void) 165void __init rcu_init(void)
138{ 166{
139 __rcu_init(); 167 __rcu_init();
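A minimal usage sketch, not taken from this patch, of why the per-flavor barriers above matter: callbacks queued with call_rcu_sched() are only guaranteed to have run after rcu_barrier_sched(), not after plain rcu_barrier(). Everything except the RCU primitives themselves (the struct, my_free_rcu(), my_unregister_hooks()) is hypothetical.

	#include <linux/rcupdate.h>	/* call_rcu(), rcu_barrier*() */
	#include <linux/slab.h>		/* kfree() */

	struct my_obj {				/* hypothetical object */
		int data;
		struct rcu_head rcu;
	};

	static void my_free_rcu(struct rcu_head *head)
	{
		kfree(container_of(head, struct my_obj, rcu));
	}

	static void my_retire(struct my_obj *p)
	{
		call_rcu(&p->rcu, my_free_rcu);	/* queue a deferred free */
	}

	static void __exit my_module_exit(void)
	{
		my_unregister_hooks();	/* hypothetical: stop queueing new callbacks */

		/* Then wait for each callback flavor actually used. */
		rcu_barrier();		/* call_rcu() callbacks */
		rcu_barrier_sched();	/* call_rcu_sched() callbacks */
	}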
diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c
index 5e02b7740702..ca4bbbe04aa4 100644
--- a/kernel/rcupreempt.c
+++ b/kernel/rcupreempt.c
@@ -46,11 +46,11 @@
46#include <asm/atomic.h> 46#include <asm/atomic.h>
47#include <linux/bitops.h> 47#include <linux/bitops.h>
48#include <linux/module.h> 48#include <linux/module.h>
49#include <linux/kthread.h>
49#include <linux/completion.h> 50#include <linux/completion.h>
50#include <linux/moduleparam.h> 51#include <linux/moduleparam.h>
51#include <linux/percpu.h> 52#include <linux/percpu.h>
52#include <linux/notifier.h> 53#include <linux/notifier.h>
53#include <linux/rcupdate.h>
54#include <linux/cpu.h> 54#include <linux/cpu.h>
55#include <linux/random.h> 55#include <linux/random.h>
56#include <linux/delay.h> 56#include <linux/delay.h>
@@ -59,14 +59,6 @@
59#include <linux/rcupreempt_trace.h> 59#include <linux/rcupreempt_trace.h>
60 60
61/* 61/*
62 * Macro that prevents the compiler from reordering accesses, but does
63 * absolutely -nothing- to prevent CPUs from reordering. This is used
64 * only to mediate communication between mainline code and hardware
65 * interrupt and NMI handlers.
66 */
67#define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x))
68
69/*
70 * PREEMPT_RCU data structures. 62 * PREEMPT_RCU data structures.
71 */ 63 */
72 64
@@ -82,14 +74,18 @@ struct rcu_data {
82 spinlock_t lock; /* Protect rcu_data fields. */ 74 spinlock_t lock; /* Protect rcu_data fields. */
83 long completed; /* Number of last completed batch. */ 75 long completed; /* Number of last completed batch. */
84 int waitlistcount; 76 int waitlistcount;
85 struct tasklet_struct rcu_tasklet;
86 struct rcu_head *nextlist; 77 struct rcu_head *nextlist;
87 struct rcu_head **nexttail; 78 struct rcu_head **nexttail;
88 struct rcu_head *waitlist[GP_STAGES]; 79 struct rcu_head *waitlist[GP_STAGES];
89 struct rcu_head **waittail[GP_STAGES]; 80 struct rcu_head **waittail[GP_STAGES];
90 struct rcu_head *donelist; 81 struct rcu_head *donelist; /* from waitlist & waitschedlist */
91 struct rcu_head **donetail; 82 struct rcu_head **donetail;
92 long rcu_flipctr[2]; 83 long rcu_flipctr[2];
84 struct rcu_head *nextschedlist;
85 struct rcu_head **nextschedtail;
86 struct rcu_head *waitschedlist;
87 struct rcu_head **waitschedtail;
88 int rcu_sched_sleeping;
93#ifdef CONFIG_RCU_TRACE 89#ifdef CONFIG_RCU_TRACE
94 struct rcupreempt_trace trace; 90 struct rcupreempt_trace trace;
95#endif /* #ifdef CONFIG_RCU_TRACE */ 91#endif /* #ifdef CONFIG_RCU_TRACE */
@@ -131,11 +127,24 @@ enum rcu_try_flip_states {
131 rcu_try_flip_waitmb_state, 127 rcu_try_flip_waitmb_state,
132}; 128};
133 129
130/*
131 * States for rcu_ctrlblk.rcu_sched_sleep.
132 */
133
134enum rcu_sched_sleep_states {
135 rcu_sched_not_sleeping, /* Not sleeping, callbacks need GP. */
136 rcu_sched_sleep_prep, /* Thinking of sleeping, rechecking. */
137 rcu_sched_sleeping, /* Sleeping, awaken if GP needed. */
138};
139
134struct rcu_ctrlblk { 140struct rcu_ctrlblk {
135 spinlock_t fliplock; /* Protect state-machine transitions. */ 141 spinlock_t fliplock; /* Protect state-machine transitions. */
136 long completed; /* Number of last completed batch. */ 142 long completed; /* Number of last completed batch. */
137 enum rcu_try_flip_states rcu_try_flip_state; /* The current state of 143 enum rcu_try_flip_states rcu_try_flip_state; /* The current state of
138 the rcu state machine */ 144 the rcu state machine */
145 spinlock_t schedlock; /* Protect rcu_sched sleep state. */
146 enum rcu_sched_sleep_states sched_sleep; /* rcu_sched state. */
147 wait_queue_head_t sched_wq; /* Place for rcu_sched to sleep. */
139}; 148};
140 149
141static DEFINE_PER_CPU(struct rcu_data, rcu_data); 150static DEFINE_PER_CPU(struct rcu_data, rcu_data);
@@ -143,8 +152,12 @@ static struct rcu_ctrlblk rcu_ctrlblk = {
143 .fliplock = __SPIN_LOCK_UNLOCKED(rcu_ctrlblk.fliplock), 152 .fliplock = __SPIN_LOCK_UNLOCKED(rcu_ctrlblk.fliplock),
144 .completed = 0, 153 .completed = 0,
145 .rcu_try_flip_state = rcu_try_flip_idle_state, 154 .rcu_try_flip_state = rcu_try_flip_idle_state,
155 .schedlock = __SPIN_LOCK_UNLOCKED(rcu_ctrlblk.schedlock),
156 .sched_sleep = rcu_sched_not_sleeping,
157 .sched_wq = __WAIT_QUEUE_HEAD_INITIALIZER(rcu_ctrlblk.sched_wq),
146}; 158};
147 159
160static struct task_struct *rcu_sched_grace_period_task;
148 161
149#ifdef CONFIG_RCU_TRACE 162#ifdef CONFIG_RCU_TRACE
150static char *rcu_try_flip_state_names[] = 163static char *rcu_try_flip_state_names[] =
@@ -207,6 +220,8 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(enum rcu_mb_flag_values, rcu_mb_flag)
207 */ 220 */
208#define RCU_TRACE_RDP(f, rdp) RCU_TRACE(f, &((rdp)->trace)); 221#define RCU_TRACE_RDP(f, rdp) RCU_TRACE(f, &((rdp)->trace));
209 222
223#define RCU_SCHED_BATCH_TIME (HZ / 50)
224
210/* 225/*
211 * Return the number of RCU batches processed thus far. Useful 226 * Return the number of RCU batches processed thus far. Useful
212 * for debug and statistics. 227 * for debug and statistics.
@@ -411,32 +426,34 @@ static void __rcu_advance_callbacks(struct rcu_data *rdp)
411 } 426 }
412} 427}
413 428
414#ifdef CONFIG_NO_HZ 429DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_dyntick_sched, rcu_dyntick_sched) = {
430 .dynticks = 1,
431};
415 432
416DEFINE_PER_CPU(long, dynticks_progress_counter) = 1; 433#ifdef CONFIG_NO_HZ
417static DEFINE_PER_CPU(long, rcu_dyntick_snapshot);
418static DEFINE_PER_CPU(int, rcu_update_flag); 434static DEFINE_PER_CPU(int, rcu_update_flag);
419 435
420/** 436/**
421 * rcu_irq_enter - Called from Hard irq handlers and NMI/SMI. 437 * rcu_irq_enter - Called from Hard irq handlers and NMI/SMI.
422 * 438 *
423 * If the CPU was idle with dynamic ticks active, this updates the 439 * If the CPU was idle with dynamic ticks active, this updates the
424 * dynticks_progress_counter to let the RCU handling know that the 440 * rcu_dyntick_sched.dynticks to let the RCU handling know that the
425 * CPU is active. 441 * CPU is active.
426 */ 442 */
427void rcu_irq_enter(void) 443void rcu_irq_enter(void)
428{ 444{
429 int cpu = smp_processor_id(); 445 int cpu = smp_processor_id();
446 struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
430 447
431 if (per_cpu(rcu_update_flag, cpu)) 448 if (per_cpu(rcu_update_flag, cpu))
432 per_cpu(rcu_update_flag, cpu)++; 449 per_cpu(rcu_update_flag, cpu)++;
433 450
434 /* 451 /*
435 * Only update if we are coming from a stopped ticks mode 452 * Only update if we are coming from a stopped ticks mode
436 * (dynticks_progress_counter is even). 453 * (rcu_dyntick_sched.dynticks is even).
437 */ 454 */
438 if (!in_interrupt() && 455 if (!in_interrupt() &&
439 (per_cpu(dynticks_progress_counter, cpu) & 0x1) == 0) { 456 (rdssp->dynticks & 0x1) == 0) {
440 /* 457 /*
441 * The following might seem like we could have a race 458 * The following might seem like we could have a race
442 * with NMI/SMIs. But this really isn't a problem. 459 * with NMI/SMIs. But this really isn't a problem.
@@ -459,12 +476,12 @@ void rcu_irq_enter(void)
459 * RCU read-side critical sections on this CPU would 476 * RCU read-side critical sections on this CPU would
460 * have already completed. 477 * have already completed.
461 */ 478 */
462 per_cpu(dynticks_progress_counter, cpu)++; 479 rdssp->dynticks++;
463 /* 480 /*
464 * The following memory barrier ensures that any 481 * The following memory barrier ensures that any
465 * rcu_read_lock() primitives in the irq handler 482 * rcu_read_lock() primitives in the irq handler
466 * are seen by other CPUs to follow the above 483 * are seen by other CPUs to follow the above
467 * increment to dynticks_progress_counter. This is 484 * increment to rcu_dyntick_sched.dynticks. This is
468 * required in order for other CPUs to correctly 485 * required in order for other CPUs to correctly
469 * determine when it is safe to advance the RCU 486 * determine when it is safe to advance the RCU
470 * grace-period state machine. 487 * grace-period state machine.
@@ -472,7 +489,7 @@ void rcu_irq_enter(void)
472 smp_mb(); /* see above block comment. */ 489 smp_mb(); /* see above block comment. */
473 /* 490 /*
474 * Since we can't determine the dynamic tick mode from 491 * Since we can't determine the dynamic tick mode from
475 * the dynticks_progress_counter after this routine, 492 * the rcu_dyntick_sched.dynticks after this routine,
476 * we use a second flag to acknowledge that we came 493 * we use a second flag to acknowledge that we came
477 * from an idle state with ticks stopped. 494 * from an idle state with ticks stopped.
478 */ 495 */
@@ -480,7 +497,7 @@ void rcu_irq_enter(void)
480 /* 497 /*
481 * If we take an NMI/SMI now, they will also increment 498 * If we take an NMI/SMI now, they will also increment
482 * the rcu_update_flag, and will not update the 499 * the rcu_update_flag, and will not update the
483 * dynticks_progress_counter on exit. That is for 500 * rcu_dyntick_sched.dynticks on exit. That is for
484 * this IRQ to do. 501 * this IRQ to do.
485 */ 502 */
486 } 503 }
@@ -490,12 +507,13 @@ void rcu_irq_enter(void)
490 * rcu_irq_exit - Called from exiting Hard irq context. 507 * rcu_irq_exit - Called from exiting Hard irq context.
491 * 508 *
492 * If the CPU was idle with dynamic ticks active, update the 509 * If the CPU was idle with dynamic ticks active, update the
493 * dynticks_progress_counter to let the RCU handling be 510 * rcu_dyntick_sched.dynticks to let the RCU handling be
494 * aware that the CPU is going back to idle with no ticks. 511 * aware that the CPU is going back to idle with no ticks.
495 */ 512 */
496void rcu_irq_exit(void) 513void rcu_irq_exit(void)
497{ 514{
498 int cpu = smp_processor_id(); 515 int cpu = smp_processor_id();
516 struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
499 517
500 /* 518 /*
501 * rcu_update_flag is set if we interrupted the CPU 519 * rcu_update_flag is set if we interrupted the CPU
@@ -503,7 +521,7 @@ void rcu_irq_exit(void)
503 * Once this occurs, we keep track of interrupt nesting 521 * Once this occurs, we keep track of interrupt nesting
504 * because a NMI/SMI could also come in, and we still 522 * because a NMI/SMI could also come in, and we still
505 * only want the IRQ that started the increment of the 523 * only want the IRQ that started the increment of the
506 * dynticks_progress_counter to be the one that modifies 524 * rcu_dyntick_sched.dynticks to be the one that modifies
507 * it on exit. 525 * it on exit.
508 */ 526 */
509 if (per_cpu(rcu_update_flag, cpu)) { 527 if (per_cpu(rcu_update_flag, cpu)) {
@@ -515,28 +533,29 @@ void rcu_irq_exit(void)
515 533
516 /* 534 /*
517 * If an NMI/SMI happens now we are still 535 * If an NMI/SMI happens now we are still
518 * protected by the dynticks_progress_counter being odd. 536 * protected by the rcu_dyntick_sched.dynticks being odd.
519 */ 537 */
520 538
521 /* 539 /*
522 * The following memory barrier ensures that any 540 * The following memory barrier ensures that any
523 * rcu_read_unlock() primitives in the irq handler 541 * rcu_read_unlock() primitives in the irq handler
524 * are seen by other CPUs to precede the following 542 * are seen by other CPUs to precede the following
525 * increment to dynticks_progress_counter. This 543 * increment to rcu_dyntick_sched.dynticks. This
526 * is required in order for other CPUs to determine 544 * is required in order for other CPUs to determine
527 * when it is safe to advance the RCU grace-period 545 * when it is safe to advance the RCU grace-period
528 * state machine. 546 * state machine.
529 */ 547 */
530 smp_mb(); /* see above block comment. */ 548 smp_mb(); /* see above block comment. */
531 per_cpu(dynticks_progress_counter, cpu)++; 549 rdssp->dynticks++;
532 WARN_ON(per_cpu(dynticks_progress_counter, cpu) & 0x1); 550 WARN_ON(rdssp->dynticks & 0x1);
533 } 551 }
534} 552}
535 553
536static void dyntick_save_progress_counter(int cpu) 554static void dyntick_save_progress_counter(int cpu)
537{ 555{
538 per_cpu(rcu_dyntick_snapshot, cpu) = 556 struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
539 per_cpu(dynticks_progress_counter, cpu); 557
558 rdssp->dynticks_snap = rdssp->dynticks;
540} 559}
541 560
542static inline int 561static inline int
@@ -544,9 +563,10 @@ rcu_try_flip_waitack_needed(int cpu)
544{ 563{
545 long curr; 564 long curr;
546 long snap; 565 long snap;
566 struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
547 567
548 curr = per_cpu(dynticks_progress_counter, cpu); 568 curr = rdssp->dynticks;
549 snap = per_cpu(rcu_dyntick_snapshot, cpu); 569 snap = rdssp->dynticks_snap;
550 smp_mb(); /* force ordering with cpu entering/leaving dynticks. */ 570 smp_mb(); /* force ordering with cpu entering/leaving dynticks. */
551 571
552 /* 572 /*
@@ -567,7 +587,7 @@ rcu_try_flip_waitack_needed(int cpu)
567 * that this CPU already acknowledged the counter. 587 * that this CPU already acknowledged the counter.
568 */ 588 */
569 589
570 if ((curr - snap) > 2 || (snap & 0x1) == 0) 590 if ((curr - snap) > 2 || (curr & 0x1) == 0)
571 return 0; 591 return 0;
572 592
573 /* We need this CPU to explicitly acknowledge the counter flip. */ 593 /* We need this CPU to explicitly acknowledge the counter flip. */
@@ -580,9 +600,10 @@ rcu_try_flip_waitmb_needed(int cpu)
580{ 600{
581 long curr; 601 long curr;
582 long snap; 602 long snap;
603 struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
583 604
584 curr = per_cpu(dynticks_progress_counter, cpu); 605 curr = rdssp->dynticks;
585 snap = per_cpu(rcu_dyntick_snapshot, cpu); 606 snap = rdssp->dynticks_snap;
586 smp_mb(); /* force ordering with cpu entering/leaving dynticks. */ 607 smp_mb(); /* force ordering with cpu entering/leaving dynticks. */
587 608
588 /* 609 /*
@@ -609,14 +630,86 @@ rcu_try_flip_waitmb_needed(int cpu)
609 return 1; 630 return 1;
610} 631}
611 632
633static void dyntick_save_progress_counter_sched(int cpu)
634{
635 struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
636
637 rdssp->sched_dynticks_snap = rdssp->dynticks;
638}
639
640static int rcu_qsctr_inc_needed_dyntick(int cpu)
641{
642 long curr;
643 long snap;
644 struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
645
646 curr = rdssp->dynticks;
647 snap = rdssp->sched_dynticks_snap;
648 smp_mb(); /* force ordering with cpu entering/leaving dynticks. */
649
650 /*
651 * If the CPU remained in dynticks mode for the entire time
652 * and didn't take any interrupts, NMIs, SMIs, or whatever,
653 * then it cannot be in the middle of an rcu_read_lock(), so
654 * the next rcu_read_lock() it executes must use the new value
655 * of the counter. Therefore, this CPU has been in a quiescent
656 * state the entire time, and we don't need to wait for it.
657 */
658
659 if ((curr == snap) && ((curr & 0x1) == 0))
660 return 0;
661
662 /*
663 * If the CPU passed through or entered a dynticks idle phase with
664 * no active irq handlers, then, as above, this CPU has already
665 * passed through a quiescent state.
666 */
667
668 if ((curr - snap) > 2 || (snap & 0x1) == 0)
669 return 0;
670
671 /* We need this CPU to go through a quiescent state. */
672
673 return 1;
674}
675
612#else /* !CONFIG_NO_HZ */ 676#else /* !CONFIG_NO_HZ */
613 677
614# define dyntick_save_progress_counter(cpu) do { } while (0) 678# define dyntick_save_progress_counter(cpu) do { } while (0)
615# define rcu_try_flip_waitack_needed(cpu) (1) 679# define rcu_try_flip_waitack_needed(cpu) (1)
616# define rcu_try_flip_waitmb_needed(cpu) (1) 680# define rcu_try_flip_waitmb_needed(cpu) (1)
681
682# define dyntick_save_progress_counter_sched(cpu) do { } while (0)
683# define rcu_qsctr_inc_needed_dyntick(cpu) (1)
617 684
618#endif /* CONFIG_NO_HZ */ 685#endif /* CONFIG_NO_HZ */
619 686
687static void save_qsctr_sched(int cpu)
688{
689 struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
690
691 rdssp->sched_qs_snap = rdssp->sched_qs;
692}
693
694static inline int rcu_qsctr_inc_needed(int cpu)
695{
696 struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
697
698 /*
699 * If there has been a quiescent state, no more need to wait
700 * on this CPU.
701 */
702
703 if (rdssp->sched_qs != rdssp->sched_qs_snap) {
704 smp_mb(); /* force ordering with cpu entering schedule(). */
705 return 0;
706 }
707
708 /* We need this CPU to go through a quiescent state. */
709
710 return 1;
711}
712
620/* 713/*
621 * Get here when RCU is idle. Decide whether we need to 714 * Get here when RCU is idle. Decide whether we need to
622 * move out of idle state, and return non-zero if so. 715 * move out of idle state, and return non-zero if so.
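The dyntick checks above (rcu_try_flip_waitack_needed(), rcu_qsctr_inc_needed_dyntick()) all read the same even/odd counter. The following toy restatement, with hypothetical names and standing outside the kernel proper, assumes only that the counter is incremented on every idle entry and exit, so an even value means dynticks-idle and movement past one enter/exit pair means the CPU passed through idle.

	struct toy_dynticks {
		long dynticks;	/* even: in dynticks-idle, odd: active */
		long snap;	/* value sampled when the grace period began */
	};

	/* Return 1 if the CPU has necessarily been quiescent since the snapshot. */
	static int toy_was_quiescent(const struct toy_dynticks *t)
	{
		long curr = t->dynticks;
		long snap = t->snap;

		if (curr == snap && (curr & 0x1) == 0)
			return 1;	/* idle the whole time */

		if ((curr - snap) > 2 || (snap & 0x1) == 0)
			return 1;	/* passed through idle at least once */

		return 0;		/* still need an explicit quiescent state */
	}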
@@ -655,7 +748,7 @@ rcu_try_flip_idle(void)
655 748
656 /* Now ask each CPU for acknowledgement of the flip. */ 749 /* Now ask each CPU for acknowledgement of the flip. */
657 750
658 for_each_cpu_mask(cpu, rcu_cpu_online_map) { 751 for_each_cpu_mask_nr(cpu, rcu_cpu_online_map) {
659 per_cpu(rcu_flip_flag, cpu) = rcu_flipped; 752 per_cpu(rcu_flip_flag, cpu) = rcu_flipped;
660 dyntick_save_progress_counter(cpu); 753 dyntick_save_progress_counter(cpu);
661 } 754 }
@@ -673,7 +766,7 @@ rcu_try_flip_waitack(void)
673 int cpu; 766 int cpu;
674 767
675 RCU_TRACE_ME(rcupreempt_trace_try_flip_a1); 768 RCU_TRACE_ME(rcupreempt_trace_try_flip_a1);
676 for_each_cpu_mask(cpu, rcu_cpu_online_map) 769 for_each_cpu_mask_nr(cpu, rcu_cpu_online_map)
677 if (rcu_try_flip_waitack_needed(cpu) && 770 if (rcu_try_flip_waitack_needed(cpu) &&
678 per_cpu(rcu_flip_flag, cpu) != rcu_flip_seen) { 771 per_cpu(rcu_flip_flag, cpu) != rcu_flip_seen) {
679 RCU_TRACE_ME(rcupreempt_trace_try_flip_ae1); 772 RCU_TRACE_ME(rcupreempt_trace_try_flip_ae1);
@@ -705,7 +798,7 @@ rcu_try_flip_waitzero(void)
705 /* Check to see if the sum of the "last" counters is zero. */ 798 /* Check to see if the sum of the "last" counters is zero. */
706 799
707 RCU_TRACE_ME(rcupreempt_trace_try_flip_z1); 800 RCU_TRACE_ME(rcupreempt_trace_try_flip_z1);
708 for_each_cpu_mask(cpu, rcu_cpu_online_map) 801 for_each_cpu_mask_nr(cpu, rcu_cpu_online_map)
709 sum += RCU_DATA_CPU(cpu)->rcu_flipctr[lastidx]; 802 sum += RCU_DATA_CPU(cpu)->rcu_flipctr[lastidx];
710 if (sum != 0) { 803 if (sum != 0) {
711 RCU_TRACE_ME(rcupreempt_trace_try_flip_ze1); 804 RCU_TRACE_ME(rcupreempt_trace_try_flip_ze1);
@@ -720,7 +813,7 @@ rcu_try_flip_waitzero(void)
720 smp_mb(); /* ^^^^^^^^^^^^ */ 813 smp_mb(); /* ^^^^^^^^^^^^ */
721 814
722 /* Call for a memory barrier from each CPU. */ 815 /* Call for a memory barrier from each CPU. */
723 for_each_cpu_mask(cpu, rcu_cpu_online_map) { 816 for_each_cpu_mask_nr(cpu, rcu_cpu_online_map) {
724 per_cpu(rcu_mb_flag, cpu) = rcu_mb_needed; 817 per_cpu(rcu_mb_flag, cpu) = rcu_mb_needed;
725 dyntick_save_progress_counter(cpu); 818 dyntick_save_progress_counter(cpu);
726 } 819 }
@@ -740,7 +833,7 @@ rcu_try_flip_waitmb(void)
740 int cpu; 833 int cpu;
741 834
742 RCU_TRACE_ME(rcupreempt_trace_try_flip_m1); 835 RCU_TRACE_ME(rcupreempt_trace_try_flip_m1);
743 for_each_cpu_mask(cpu, rcu_cpu_online_map) 836 for_each_cpu_mask_nr(cpu, rcu_cpu_online_map)
744 if (rcu_try_flip_waitmb_needed(cpu) && 837 if (rcu_try_flip_waitmb_needed(cpu) &&
745 per_cpu(rcu_mb_flag, cpu) != rcu_mb_done) { 838 per_cpu(rcu_mb_flag, cpu) != rcu_mb_done) {
746 RCU_TRACE_ME(rcupreempt_trace_try_flip_me1); 839 RCU_TRACE_ME(rcupreempt_trace_try_flip_me1);
@@ -819,6 +912,26 @@ void rcu_check_callbacks(int cpu, int user)
819 unsigned long flags; 912 unsigned long flags;
820 struct rcu_data *rdp = RCU_DATA_CPU(cpu); 913 struct rcu_data *rdp = RCU_DATA_CPU(cpu);
821 914
915 /*
916 * If this CPU took its interrupt from user mode or from the
917 * idle loop, and this is not a nested interrupt, then
918 * this CPU has to have exited all prior preempt-disable
919 * sections of code. So increment the counter to note this.
920 *
921 * The memory barrier is needed to handle the case where
922 * writes from a preempt-disable section of code get reordered
923 * into schedule() by this CPU's write buffer. So the memory
924 * barrier makes sure that the rcu_qsctr_inc() is seen by other
925 * CPUs to happen after any such write.
926 */
927
928 if (user ||
929 (idle_cpu(cpu) && !in_softirq() &&
930 hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
931 smp_mb(); /* Guard against aggressive schedule(). */
932 rcu_qsctr_inc(cpu);
933 }
934
822 rcu_check_mb(cpu); 935 rcu_check_mb(cpu);
823 if (rcu_ctrlblk.completed == rdp->completed) 936 if (rcu_ctrlblk.completed == rdp->completed)
824 rcu_try_flip(); 937 rcu_try_flip();
@@ -869,6 +982,8 @@ void rcu_offline_cpu(int cpu)
869 struct rcu_head *list = NULL; 982 struct rcu_head *list = NULL;
870 unsigned long flags; 983 unsigned long flags;
871 struct rcu_data *rdp = RCU_DATA_CPU(cpu); 984 struct rcu_data *rdp = RCU_DATA_CPU(cpu);
985 struct rcu_head *schedlist = NULL;
986 struct rcu_head **schedtail = &schedlist;
872 struct rcu_head **tail = &list; 987 struct rcu_head **tail = &list;
873 988
874 /* 989 /*
@@ -882,6 +997,11 @@ void rcu_offline_cpu(int cpu)
882 rcu_offline_cpu_enqueue(rdp->waitlist[i], rdp->waittail[i], 997 rcu_offline_cpu_enqueue(rdp->waitlist[i], rdp->waittail[i],
883 list, tail); 998 list, tail);
884 rcu_offline_cpu_enqueue(rdp->nextlist, rdp->nexttail, list, tail); 999 rcu_offline_cpu_enqueue(rdp->nextlist, rdp->nexttail, list, tail);
1000 rcu_offline_cpu_enqueue(rdp->waitschedlist, rdp->waitschedtail,
1001 schedlist, schedtail);
1002 rcu_offline_cpu_enqueue(rdp->nextschedlist, rdp->nextschedtail,
1003 schedlist, schedtail);
1004 rdp->rcu_sched_sleeping = 0;
885 spin_unlock_irqrestore(&rdp->lock, flags); 1005 spin_unlock_irqrestore(&rdp->lock, flags);
886 rdp->waitlistcount = 0; 1006 rdp->waitlistcount = 0;
887 1007
@@ -916,36 +1036,50 @@ void rcu_offline_cpu(int cpu)
916 * fix. 1036 * fix.
917 */ 1037 */
918 1038
919 local_irq_save(flags); 1039 local_irq_save(flags); /* disable preempt till we know what lock. */
920 rdp = RCU_DATA_ME(); 1040 rdp = RCU_DATA_ME();
921 spin_lock(&rdp->lock); 1041 spin_lock(&rdp->lock);
922 *rdp->nexttail = list; 1042 *rdp->nexttail = list;
923 if (list) 1043 if (list)
924 rdp->nexttail = tail; 1044 rdp->nexttail = tail;
1045 *rdp->nextschedtail = schedlist;
1046 if (schedlist)
1047 rdp->nextschedtail = schedtail;
925 spin_unlock_irqrestore(&rdp->lock, flags); 1048 spin_unlock_irqrestore(&rdp->lock, flags);
926} 1049}
927 1050
928void __devinit rcu_online_cpu(int cpu) 1051#else /* #ifdef CONFIG_HOTPLUG_CPU */
1052
1053void rcu_offline_cpu(int cpu)
1054{
1055}
1056
1057#endif /* #else #ifdef CONFIG_HOTPLUG_CPU */
1058
1059void __cpuinit rcu_online_cpu(int cpu)
929{ 1060{
930 unsigned long flags; 1061 unsigned long flags;
1062 struct rcu_data *rdp;
931 1063
932 spin_lock_irqsave(&rcu_ctrlblk.fliplock, flags); 1064 spin_lock_irqsave(&rcu_ctrlblk.fliplock, flags);
933 cpu_set(cpu, rcu_cpu_online_map); 1065 cpu_set(cpu, rcu_cpu_online_map);
934 spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags); 1066 spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, flags);
935}
936 1067
937#else /* #ifdef CONFIG_HOTPLUG_CPU */ 1068 /*
938 1069 * The rcu_sched grace-period processing might have bypassed
939void rcu_offline_cpu(int cpu) 1070 * this CPU, given that it was not in the rcu_cpu_online_map
940{ 1071 * when the grace-period scan started. This means that the
941} 1072 * grace-period task might sleep. So make sure that if this
1073 * should happen, the first callback posted to this CPU will
1074 * wake up the grace-period task if need be.
1075 */
942 1076
943void __devinit rcu_online_cpu(int cpu) 1077 rdp = RCU_DATA_CPU(cpu);
944{ 1078 spin_lock_irqsave(&rdp->lock, flags);
1079 rdp->rcu_sched_sleeping = 1;
1080 spin_unlock_irqrestore(&rdp->lock, flags);
945} 1081}
946 1082
947#endif /* #else #ifdef CONFIG_HOTPLUG_CPU */
948
949static void rcu_process_callbacks(struct softirq_action *unused) 1083static void rcu_process_callbacks(struct softirq_action *unused)
950{ 1084{
951 unsigned long flags; 1085 unsigned long flags;
@@ -986,31 +1120,196 @@ void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
986 *rdp->nexttail = head; 1120 *rdp->nexttail = head;
987 rdp->nexttail = &head->next; 1121 rdp->nexttail = &head->next;
988 RCU_TRACE_RDP(rcupreempt_trace_next_add, rdp); 1122 RCU_TRACE_RDP(rcupreempt_trace_next_add, rdp);
989 spin_unlock(&rdp->lock); 1123 spin_unlock_irqrestore(&rdp->lock, flags);
990 local_irq_restore(flags);
991} 1124}
992EXPORT_SYMBOL_GPL(call_rcu); 1125EXPORT_SYMBOL_GPL(call_rcu);
993 1126
1127void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
1128{
1129 unsigned long flags;
1130 struct rcu_data *rdp;
1131 int wake_gp = 0;
1132
1133 head->func = func;
1134 head->next = NULL;
1135 local_irq_save(flags);
1136 rdp = RCU_DATA_ME();
1137 spin_lock(&rdp->lock);
1138 *rdp->nextschedtail = head;
1139 rdp->nextschedtail = &head->next;
1140 if (rdp->rcu_sched_sleeping) {
1141
1142 /* Grace-period processing might be sleeping... */
1143
1144 rdp->rcu_sched_sleeping = 0;
1145 wake_gp = 1;
1146 }
1147 spin_unlock_irqrestore(&rdp->lock, flags);
1148 if (wake_gp) {
1149
1150 /* Wake up grace-period processing, unless someone beat us. */
1151
1152 spin_lock_irqsave(&rcu_ctrlblk.schedlock, flags);
1153 if (rcu_ctrlblk.sched_sleep != rcu_sched_sleeping)
1154 wake_gp = 0;
1155 rcu_ctrlblk.sched_sleep = rcu_sched_not_sleeping;
1156 spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags);
1157 if (wake_gp)
1158 wake_up_interruptible(&rcu_ctrlblk.sched_wq);
1159 }
1160}
1161EXPORT_SYMBOL_GPL(call_rcu_sched);
1162
994/* 1163/*
995 * Wait until all currently running preempt_disable() code segments 1164 * Wait until all currently running preempt_disable() code segments
996 * (including hardware-irq-disable segments) complete. Note that 1165 * (including hardware-irq-disable segments) complete. Note that
997 * in -rt this does -not- necessarily result in all currently executing 1166 * in -rt this does -not- necessarily result in all currently executing
998 * interrupt -handlers- having completed. 1167 * interrupt -handlers- having completed.
999 */ 1168 */
1000void __synchronize_sched(void) 1169synchronize_rcu_xxx(__synchronize_sched, call_rcu_sched)
1170EXPORT_SYMBOL_GPL(__synchronize_sched);
1171
1172/*
1173 * kthread function that manages call_rcu_sched grace periods.
1174 */
1175static int rcu_sched_grace_period(void *arg)
1001{ 1176{
1002 cpumask_t oldmask; 1177 int couldsleep; /* might sleep after current pass. */
1178 int couldsleepnext = 0; /* might sleep after next pass. */
1003 int cpu; 1179 int cpu;
1180 unsigned long flags;
1181 struct rcu_data *rdp;
1182 int ret;
1004 1183
1005 if (sched_getaffinity(0, &oldmask) < 0) 1184 /*
1006 oldmask = cpu_possible_map; 1185 * Each pass through the following loop handles one
1007 for_each_online_cpu(cpu) { 1186 * rcu_sched grace period cycle.
1008 sched_setaffinity(0, &cpumask_of_cpu(cpu)); 1187 */
1009 schedule(); 1188 do {
1010 } 1189 /* Save each CPU's current state. */
1011 sched_setaffinity(0, &oldmask); 1190
1191 for_each_online_cpu(cpu) {
1192 dyntick_save_progress_counter_sched(cpu);
1193 save_qsctr_sched(cpu);
1194 }
1195
1196 /*
1197 * Sleep for about an RCU grace-period's worth to
1198 * allow better batching and to consume less CPU.
1199 */
1200 schedule_timeout_interruptible(RCU_SCHED_BATCH_TIME);
1201
1202 /*
1203 * If there was nothing to do last time, prepare to
1204 * sleep at the end of the current grace period cycle.
1205 */
1206 couldsleep = couldsleepnext;
1207 couldsleepnext = 1;
1208 if (couldsleep) {
1209 spin_lock_irqsave(&rcu_ctrlblk.schedlock, flags);
1210 rcu_ctrlblk.sched_sleep = rcu_sched_sleep_prep;
1211 spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags);
1212 }
1213
1214 /*
1215 * Wait on each CPU in turn to have either visited
1216 * a quiescent state or been in dynticks-idle mode.
1217 */
1218 for_each_online_cpu(cpu) {
1219 while (rcu_qsctr_inc_needed(cpu) &&
1220 rcu_qsctr_inc_needed_dyntick(cpu)) {
1221 /* resched_cpu(cpu); @@@ */
1222 schedule_timeout_interruptible(1);
1223 }
1224 }
1225
1226 /* Advance callbacks for each CPU. */
1227
1228 for_each_online_cpu(cpu) {
1229
1230 rdp = RCU_DATA_CPU(cpu);
1231 spin_lock_irqsave(&rdp->lock, flags);
1232
1233 /*
1234 * We are running on this CPU irq-disabled, so no
1235 * CPU can go offline until we re-enable irqs.
1236 * The current CPU might have already gone
1237 * offline (between the for_each_offline_cpu and
1238 * the spin_lock_irqsave), but in that case all its
1239 * callback lists will be empty, so no harm done.
1240 *
1241 * Advance the callbacks! We share normal RCU's
1242 * donelist, since callbacks are invoked the
1243 * same way in either case.
1244 */
1245 if (rdp->waitschedlist != NULL) {
1246 *rdp->donetail = rdp->waitschedlist;
1247 rdp->donetail = rdp->waitschedtail;
1248
1249 /*
1250 * Next rcu_check_callbacks() will
1251 * do the required raise_softirq().
1252 */
1253 }
1254 if (rdp->nextschedlist != NULL) {
1255 rdp->waitschedlist = rdp->nextschedlist;
1256 rdp->waitschedtail = rdp->nextschedtail;
1257 couldsleep = 0;
1258 couldsleepnext = 0;
1259 } else {
1260 rdp->waitschedlist = NULL;
1261 rdp->waitschedtail = &rdp->waitschedlist;
1262 }
1263 rdp->nextschedlist = NULL;
1264 rdp->nextschedtail = &rdp->nextschedlist;
1265
1266 /* Mark sleep intention. */
1267
1268 rdp->rcu_sched_sleeping = couldsleep;
1269
1270 spin_unlock_irqrestore(&rdp->lock, flags);
1271 }
1272
1273 /* If we saw callbacks on the last scan, go deal with them. */
1274
1275 if (!couldsleep)
1276 continue;
1277
1278 /* Attempt to block... */
1279
1280 spin_lock_irqsave(&rcu_ctrlblk.schedlock, flags);
1281 if (rcu_ctrlblk.sched_sleep != rcu_sched_sleep_prep) {
1282
1283 /*
1284 * Someone posted a callback after we scanned.
1285 * Go take care of it.
1286 */
1287 spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags);
1288 couldsleepnext = 0;
1289 continue;
1290 }
1291
1292 /* Block until the next person posts a callback. */
1293
1294 rcu_ctrlblk.sched_sleep = rcu_sched_sleeping;
1295 spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags);
1296 ret = 0;
1297 __wait_event_interruptible(rcu_ctrlblk.sched_wq,
1298 rcu_ctrlblk.sched_sleep != rcu_sched_sleeping,
1299 ret);
1300
1301 /*
1302 * Signals would prevent us from sleeping, and we cannot
1303 * do much with them in any case. So flush them.
1304 */
1305 if (ret)
1306 flush_signals(current);
1307 couldsleepnext = 0;
1308
1309 } while (!kthread_should_stop());
1310
1311 return (0);
1012} 1312}
1013EXPORT_SYMBOL_GPL(__synchronize_sched);
1014 1313
1015/* 1314/*
1016 * Check to see if any future RCU-related work will need to be done 1315 * Check to see if any future RCU-related work will need to be done
@@ -1027,7 +1326,9 @@ int rcu_needs_cpu(int cpu)
1027 1326
1028 return (rdp->donelist != NULL || 1327 return (rdp->donelist != NULL ||
1029 !!rdp->waitlistcount || 1328 !!rdp->waitlistcount ||
1030 rdp->nextlist != NULL); 1329 rdp->nextlist != NULL ||
1330 rdp->nextschedlist != NULL ||
1331 rdp->waitschedlist != NULL);
1031} 1332}
1032 1333
1033int rcu_pending(int cpu) 1334int rcu_pending(int cpu)
@@ -1038,7 +1339,9 @@ int rcu_pending(int cpu)
1038 1339
1039 if (rdp->donelist != NULL || 1340 if (rdp->donelist != NULL ||
1040 !!rdp->waitlistcount || 1341 !!rdp->waitlistcount ||
1041 rdp->nextlist != NULL) 1342 rdp->nextlist != NULL ||
1343 rdp->nextschedlist != NULL ||
1344 rdp->waitschedlist != NULL)
1042 return 1; 1345 return 1;
1043 1346
1044 /* The RCU core needs an acknowledgement from this CPU. */ 1347 /* The RCU core needs an acknowledgement from this CPU. */
@@ -1105,6 +1408,11 @@ void __init __rcu_init(void)
1105 rdp->donetail = &rdp->donelist; 1408 rdp->donetail = &rdp->donelist;
1106 rdp->rcu_flipctr[0] = 0; 1409 rdp->rcu_flipctr[0] = 0;
1107 rdp->rcu_flipctr[1] = 0; 1410 rdp->rcu_flipctr[1] = 0;
1411 rdp->nextschedlist = NULL;
1412 rdp->nextschedtail = &rdp->nextschedlist;
1413 rdp->waitschedlist = NULL;
1414 rdp->waitschedtail = &rdp->waitschedlist;
1415 rdp->rcu_sched_sleeping = 0;
1108 } 1416 }
1109 register_cpu_notifier(&rcu_nb); 1417 register_cpu_notifier(&rcu_nb);
1110 1418
@@ -1123,15 +1431,19 @@ void __init __rcu_init(void)
1123 for_each_online_cpu(cpu) 1431 for_each_online_cpu(cpu)
1124 rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, (void *)(long) cpu); 1432 rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, (void *)(long) cpu);
1125 1433
1126 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks, NULL); 1434 open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
1127} 1435}
1128 1436
1129/* 1437/*
1130 * Deprecated, use synchronize_rcu() or synchronize_sched() instead. 1438 * Late-boot-time RCU initialization that must wait until after scheduler
1439 * has been initialized.
1131 */ 1440 */
1132void synchronize_kernel(void) 1441void __init rcu_init_sched(void)
1133{ 1442{
1134 synchronize_rcu(); 1443 rcu_sched_grace_period_task = kthread_run(rcu_sched_grace_period,
1444 NULL,
1445 "rcu_sched_grace_period");
1446 WARN_ON(IS_ERR(rcu_sched_grace_period_task));
1135} 1447}
1136 1448
1137#ifdef CONFIG_RCU_TRACE 1449#ifdef CONFIG_RCU_TRACE
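A usage sketch for the call_rcu_sched() interface added above; the table structure and helpers are hypothetical. The point is that readers here rely only on preemption or interrupts being disabled rather than rcu_read_lock(), so the update side must use the sched flavor before reclaiming the old data.

	struct my_table {			/* hypothetical */
		int nentries;
		struct rcu_head rcu;
	};

	static struct my_table *cur_table;	/* read under preempt_disable() */

	static void my_table_reclaim(struct rcu_head *head)
	{
		kfree(container_of(head, struct my_table, rcu));
	}

	static void my_table_replace(struct my_table *newtab)
	{
		struct my_table *old = cur_table;

		rcu_assign_pointer(cur_table, newtab);
		/* Freed only after all in-flight preempt-disabled readers finish. */
		call_rcu_sched(&old->rcu, my_table_reclaim);
	}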
diff --git a/kernel/rcupreempt_trace.c b/kernel/rcupreempt_trace.c
index 49ac4947af24..35c2d3360ecf 100644
--- a/kernel/rcupreempt_trace.c
+++ b/kernel/rcupreempt_trace.c
@@ -38,7 +38,6 @@
38#include <linux/moduleparam.h> 38#include <linux/moduleparam.h>
39#include <linux/percpu.h> 39#include <linux/percpu.h>
40#include <linux/notifier.h> 40#include <linux/notifier.h>
41#include <linux/rcupdate.h>
42#include <linux/cpu.h> 41#include <linux/cpu.h>
43#include <linux/mutex.h> 42#include <linux/mutex.h>
44#include <linux/rcupreempt_trace.h> 43#include <linux/rcupreempt_trace.h>
@@ -309,11 +308,16 @@ out:
309 308
310static int __init rcupreempt_trace_init(void) 309static int __init rcupreempt_trace_init(void)
311{ 310{
311 int ret;
312
312 mutex_init(&rcupreempt_trace_mutex); 313 mutex_init(&rcupreempt_trace_mutex);
313 rcupreempt_trace_buf = kmalloc(RCUPREEMPT_TRACE_BUF_SIZE, GFP_KERNEL); 314 rcupreempt_trace_buf = kmalloc(RCUPREEMPT_TRACE_BUF_SIZE, GFP_KERNEL);
314 if (!rcupreempt_trace_buf) 315 if (!rcupreempt_trace_buf)
315 return 1; 316 return 1;
316 return rcupreempt_debugfs_init(); 317 ret = rcupreempt_debugfs_init();
318 if (ret)
319 kfree(rcupreempt_trace_buf);
320 return ret;
317} 321}
318 322
319static void __exit rcupreempt_trace_cleanup(void) 323static void __exit rcupreempt_trace_cleanup(void)
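The leak fix above is an instance of a general init-error pattern: any resource allocated before a failing init step must be released before the error is propagated. A minimal sketch with hypothetical names:

	static char *my_buf;			/* hypothetical */
	static int my_debugfs_init(void);	/* hypothetical second init step */

	static int __init my_init(void)
	{
		int ret;

		my_buf = kmalloc(4096, GFP_KERNEL);
		if (!my_buf)
			return -ENOMEM;

		ret = my_debugfs_init();
		if (ret)
			kfree(my_buf);		/* undo the first step on failure */
		return ret;
	}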
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 33acc424667e..90b5b123f7a1 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -57,7 +57,9 @@ static int stat_interval; /* Interval between stats, in seconds. */
57 /* Defaults to "only at end of test". */ 57 /* Defaults to "only at end of test". */
58static int verbose; /* Print more debug info. */ 58static int verbose; /* Print more debug info. */
59static int test_no_idle_hz; /* Test RCU's support for tickless idle CPUs. */ 59static int test_no_idle_hz; /* Test RCU's support for tickless idle CPUs. */
60static int shuffle_interval = 5; /* Interval between shuffles (in sec)*/ 60static int shuffle_interval = 3; /* Interval between shuffles (in sec)*/
61static int stutter = 5; /* Start/stop testing interval (in sec) */
62static int irqreader = 1; /* RCU readers from irq (timers). */
61static char *torture_type = "rcu"; /* What RCU implementation to torture. */ 63static char *torture_type = "rcu"; /* What RCU implementation to torture. */
62 64
63module_param(nreaders, int, 0444); 65module_param(nreaders, int, 0444);
@@ -72,6 +74,10 @@ module_param(test_no_idle_hz, bool, 0444);
72MODULE_PARM_DESC(test_no_idle_hz, "Test support for tickless idle CPUs"); 74MODULE_PARM_DESC(test_no_idle_hz, "Test support for tickless idle CPUs");
73module_param(shuffle_interval, int, 0444); 75module_param(shuffle_interval, int, 0444);
74MODULE_PARM_DESC(shuffle_interval, "Number of seconds between shuffles"); 76MODULE_PARM_DESC(shuffle_interval, "Number of seconds between shuffles");
77module_param(stutter, int, 0444);
78MODULE_PARM_DESC(stutter, "Number of seconds to run/halt test");
79module_param(irqreader, int, 0444);
80MODULE_PARM_DESC(irqreader, "Allow RCU readers from irq handlers");
75module_param(torture_type, charp, 0444); 81module_param(torture_type, charp, 0444);
76MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, srcu)"); 82MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, srcu)");
77 83
@@ -91,6 +97,7 @@ static struct task_struct **fakewriter_tasks;
91static struct task_struct **reader_tasks; 97static struct task_struct **reader_tasks;
92static struct task_struct *stats_task; 98static struct task_struct *stats_task;
93static struct task_struct *shuffler_task; 99static struct task_struct *shuffler_task;
100static struct task_struct *stutter_task;
94 101
95#define RCU_TORTURE_PIPE_LEN 10 102#define RCU_TORTURE_PIPE_LEN 10
96 103
@@ -117,8 +124,18 @@ static atomic_t n_rcu_torture_alloc_fail;
117static atomic_t n_rcu_torture_free; 124static atomic_t n_rcu_torture_free;
118static atomic_t n_rcu_torture_mberror; 125static atomic_t n_rcu_torture_mberror;
119static atomic_t n_rcu_torture_error; 126static atomic_t n_rcu_torture_error;
127static long n_rcu_torture_timers = 0;
120static struct list_head rcu_torture_removed; 128static struct list_head rcu_torture_removed;
121 129
130static int stutter_pause_test = 0;
131
132#if defined(MODULE) || defined(CONFIG_RCU_TORTURE_TEST_RUNNABLE)
133#define RCUTORTURE_RUNNABLE_INIT 1
134#else
135#define RCUTORTURE_RUNNABLE_INIT 0
136#endif
137int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT;
138
122/* 139/*
123 * Allocate an element from the rcu_tortures pool. 140 * Allocate an element from the rcu_tortures pool.
124 */ 141 */
@@ -179,6 +196,16 @@ rcu_random(struct rcu_random_state *rrsp)
179 return swahw32(rrsp->rrs_state); 196 return swahw32(rrsp->rrs_state);
180} 197}
181 198
199static void
200rcu_stutter_wait(void)
201{
202 while (stutter_pause_test || !rcutorture_runnable)
203 if (rcutorture_runnable)
204 schedule_timeout_interruptible(1);
205 else
206 schedule_timeout_interruptible(round_jiffies_relative(HZ));
207}
208
182/* 209/*
183 * Operations vector for selecting different types of tests. 210 * Operations vector for selecting different types of tests.
184 */ 211 */
@@ -192,7 +219,9 @@ struct rcu_torture_ops {
192 int (*completed)(void); 219 int (*completed)(void);
193 void (*deferredfree)(struct rcu_torture *p); 220 void (*deferredfree)(struct rcu_torture *p);
194 void (*sync)(void); 221 void (*sync)(void);
222 void (*cb_barrier)(void);
195 int (*stats)(char *page); 223 int (*stats)(char *page);
224 int irqcapable;
196 char *name; 225 char *name;
197}; 226};
198static struct rcu_torture_ops *cur_ops = NULL; 227static struct rcu_torture_ops *cur_ops = NULL;
@@ -265,7 +294,9 @@ static struct rcu_torture_ops rcu_ops = {
265 .completed = rcu_torture_completed, 294 .completed = rcu_torture_completed,
266 .deferredfree = rcu_torture_deferred_free, 295 .deferredfree = rcu_torture_deferred_free,
267 .sync = synchronize_rcu, 296 .sync = synchronize_rcu,
297 .cb_barrier = rcu_barrier,
268 .stats = NULL, 298 .stats = NULL,
299 .irqcapable = 1,
269 .name = "rcu" 300 .name = "rcu"
270}; 301};
271 302
@@ -304,7 +335,9 @@ static struct rcu_torture_ops rcu_sync_ops = {
304 .completed = rcu_torture_completed, 335 .completed = rcu_torture_completed,
305 .deferredfree = rcu_sync_torture_deferred_free, 336 .deferredfree = rcu_sync_torture_deferred_free,
306 .sync = synchronize_rcu, 337 .sync = synchronize_rcu,
338 .cb_barrier = NULL,
307 .stats = NULL, 339 .stats = NULL,
340 .irqcapable = 1,
308 .name = "rcu_sync" 341 .name = "rcu_sync"
309}; 342};
310 343
@@ -364,7 +397,9 @@ static struct rcu_torture_ops rcu_bh_ops = {
364 .completed = rcu_bh_torture_completed, 397 .completed = rcu_bh_torture_completed,
365 .deferredfree = rcu_bh_torture_deferred_free, 398 .deferredfree = rcu_bh_torture_deferred_free,
366 .sync = rcu_bh_torture_synchronize, 399 .sync = rcu_bh_torture_synchronize,
400 .cb_barrier = rcu_barrier_bh,
367 .stats = NULL, 401 .stats = NULL,
402 .irqcapable = 1,
368 .name = "rcu_bh" 403 .name = "rcu_bh"
369}; 404};
370 405
@@ -377,7 +412,9 @@ static struct rcu_torture_ops rcu_bh_sync_ops = {
377 .completed = rcu_bh_torture_completed, 412 .completed = rcu_bh_torture_completed,
378 .deferredfree = rcu_sync_torture_deferred_free, 413 .deferredfree = rcu_sync_torture_deferred_free,
379 .sync = rcu_bh_torture_synchronize, 414 .sync = rcu_bh_torture_synchronize,
415 .cb_barrier = NULL,
380 .stats = NULL, 416 .stats = NULL,
417 .irqcapable = 1,
381 .name = "rcu_bh_sync" 418 .name = "rcu_bh_sync"
382}; 419};
383 420
@@ -458,6 +495,7 @@ static struct rcu_torture_ops srcu_ops = {
458 .completed = srcu_torture_completed, 495 .completed = srcu_torture_completed,
459 .deferredfree = rcu_sync_torture_deferred_free, 496 .deferredfree = rcu_sync_torture_deferred_free,
460 .sync = srcu_torture_synchronize, 497 .sync = srcu_torture_synchronize,
498 .cb_barrier = NULL,
461 .stats = srcu_torture_stats, 499 .stats = srcu_torture_stats,
462 .name = "srcu" 500 .name = "srcu"
463}; 501};
@@ -482,6 +520,11 @@ static int sched_torture_completed(void)
482 return 0; 520 return 0;
483} 521}
484 522
523static void rcu_sched_torture_deferred_free(struct rcu_torture *p)
524{
525 call_rcu_sched(&p->rtort_rcu, rcu_torture_cb);
526}
527
485static void sched_torture_synchronize(void) 528static void sched_torture_synchronize(void)
486{ 529{
487 synchronize_sched(); 530 synchronize_sched();
@@ -494,12 +537,28 @@ static struct rcu_torture_ops sched_ops = {
494 .readdelay = rcu_read_delay, /* just reuse rcu's version. */ 537 .readdelay = rcu_read_delay, /* just reuse rcu's version. */
495 .readunlock = sched_torture_read_unlock, 538 .readunlock = sched_torture_read_unlock,
496 .completed = sched_torture_completed, 539 .completed = sched_torture_completed,
497 .deferredfree = rcu_sync_torture_deferred_free, 540 .deferredfree = rcu_sched_torture_deferred_free,
498 .sync = sched_torture_synchronize, 541 .sync = sched_torture_synchronize,
542 .cb_barrier = rcu_barrier_sched,
499 .stats = NULL, 543 .stats = NULL,
544 .irqcapable = 1,
500 .name = "sched" 545 .name = "sched"
501}; 546};
502 547
548static struct rcu_torture_ops sched_ops_sync = {
549 .init = rcu_sync_torture_init,
550 .cleanup = NULL,
551 .readlock = sched_torture_read_lock,
552 .readdelay = rcu_read_delay, /* just reuse rcu's version. */
553 .readunlock = sched_torture_read_unlock,
554 .completed = sched_torture_completed,
555 .deferredfree = rcu_sync_torture_deferred_free,
556 .sync = sched_torture_synchronize,
557 .cb_barrier = NULL,
558 .stats = NULL,
559 .name = "sched_sync"
560};
561
503/* 562/*
504 * RCU torture writer kthread. Repeatedly substitutes a new structure 563 * RCU torture writer kthread. Repeatedly substitutes a new structure
505 * for that pointed to by rcu_torture_current, freeing the old structure 564 * for that pointed to by rcu_torture_current, freeing the old structure
@@ -537,6 +596,7 @@ rcu_torture_writer(void *arg)
537 } 596 }
538 rcu_torture_current_version++; 597 rcu_torture_current_version++;
539 oldbatch = cur_ops->completed(); 598 oldbatch = cur_ops->completed();
599 rcu_stutter_wait();
540 } while (!kthread_should_stop() && !fullstop); 600 } while (!kthread_should_stop() && !fullstop);
541 VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping"); 601 VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping");
542 while (!kthread_should_stop()) 602 while (!kthread_should_stop())
@@ -560,6 +620,7 @@ rcu_torture_fakewriter(void *arg)
560 schedule_timeout_uninterruptible(1 + rcu_random(&rand)%10); 620 schedule_timeout_uninterruptible(1 + rcu_random(&rand)%10);
561 udelay(rcu_random(&rand) & 0x3ff); 621 udelay(rcu_random(&rand) & 0x3ff);
562 cur_ops->sync(); 622 cur_ops->sync();
623 rcu_stutter_wait();
563 } while (!kthread_should_stop() && !fullstop); 624 } while (!kthread_should_stop() && !fullstop);
564 625
565 VERBOSE_PRINTK_STRING("rcu_torture_fakewriter task stopping"); 626 VERBOSE_PRINTK_STRING("rcu_torture_fakewriter task stopping");
@@ -569,6 +630,52 @@ rcu_torture_fakewriter(void *arg)
569} 630}
570 631
571/* 632/*
633 * RCU torture reader from timer handler. Dereferences rcu_torture_current,
634 * incrementing the corresponding element of the pipeline array. The
635 * counter in the element should never be greater than 1, otherwise, the
636 * RCU implementation is broken.
637 */
638static void rcu_torture_timer(unsigned long unused)
639{
640 int idx;
641 int completed;
642 static DEFINE_RCU_RANDOM(rand);
643 static DEFINE_SPINLOCK(rand_lock);
644 struct rcu_torture *p;
645 int pipe_count;
646
647 idx = cur_ops->readlock();
648 completed = cur_ops->completed();
649 p = rcu_dereference(rcu_torture_current);
650 if (p == NULL) {
651 /* Leave because rcu_torture_writer is not yet underway */
652 cur_ops->readunlock(idx);
653 return;
654 }
655 if (p->rtort_mbtest == 0)
656 atomic_inc(&n_rcu_torture_mberror);
657 spin_lock(&rand_lock);
658 cur_ops->readdelay(&rand);
659 n_rcu_torture_timers++;
660 spin_unlock(&rand_lock);
661 preempt_disable();
662 pipe_count = p->rtort_pipe_count;
663 if (pipe_count > RCU_TORTURE_PIPE_LEN) {
664 /* Should not happen, but... */
665 pipe_count = RCU_TORTURE_PIPE_LEN;
666 }
667 ++__get_cpu_var(rcu_torture_count)[pipe_count];
668 completed = cur_ops->completed() - completed;
669 if (completed > RCU_TORTURE_PIPE_LEN) {
670 /* Should not happen, but... */
671 completed = RCU_TORTURE_PIPE_LEN;
672 }
673 ++__get_cpu_var(rcu_torture_batch)[completed];
674 preempt_enable();
675 cur_ops->readunlock(idx);
676}
677
678/*
572 * RCU torture reader kthread. Repeatedly dereferences rcu_torture_current, 679 * RCU torture reader kthread. Repeatedly dereferences rcu_torture_current,
573 * incrementing the corresponding element of the pipeline array. The 680 * incrementing the corresponding element of the pipeline array. The
574 * counter in the element should never be greater than 1, otherwise, the 681 * counter in the element should never be greater than 1, otherwise, the
@@ -582,11 +689,18 @@ rcu_torture_reader(void *arg)
582 DEFINE_RCU_RANDOM(rand); 689 DEFINE_RCU_RANDOM(rand);
583 struct rcu_torture *p; 690 struct rcu_torture *p;
584 int pipe_count; 691 int pipe_count;
692 struct timer_list t;
585 693
586 VERBOSE_PRINTK_STRING("rcu_torture_reader task started"); 694 VERBOSE_PRINTK_STRING("rcu_torture_reader task started");
587 set_user_nice(current, 19); 695 set_user_nice(current, 19);
696 if (irqreader && cur_ops->irqcapable)
697 setup_timer_on_stack(&t, rcu_torture_timer, 0);
588 698
589 do { 699 do {
700 if (irqreader && cur_ops->irqcapable) {
701 if (!timer_pending(&t))
702 mod_timer(&t, 1);
703 }
590 idx = cur_ops->readlock(); 704 idx = cur_ops->readlock();
591 completed = cur_ops->completed(); 705 completed = cur_ops->completed();
592 p = rcu_dereference(rcu_torture_current); 706 p = rcu_dereference(rcu_torture_current);
@@ -615,8 +729,11 @@ rcu_torture_reader(void *arg)
615 preempt_enable(); 729 preempt_enable();
616 cur_ops->readunlock(idx); 730 cur_ops->readunlock(idx);
617 schedule(); 731 schedule();
732 rcu_stutter_wait();
618 } while (!kthread_should_stop() && !fullstop); 733 } while (!kthread_should_stop() && !fullstop);
619 VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping"); 734 VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping");
735 if (irqreader && cur_ops->irqcapable)
736 del_timer_sync(&t);
620 while (!kthread_should_stop()) 737 while (!kthread_should_stop())
621 schedule_timeout_uninterruptible(1); 738 schedule_timeout_uninterruptible(1);
622 return 0; 739 return 0;
@@ -647,20 +764,22 @@ rcu_torture_printk(char *page)
647 cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG); 764 cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG);
648 cnt += sprintf(&page[cnt], 765 cnt += sprintf(&page[cnt],
649 "rtc: %p ver: %ld tfle: %d rta: %d rtaf: %d rtf: %d " 766 "rtc: %p ver: %ld tfle: %d rta: %d rtaf: %d rtf: %d "
650 "rtmbe: %d", 767 "rtmbe: %d nt: %ld",
651 rcu_torture_current, 768 rcu_torture_current,
652 rcu_torture_current_version, 769 rcu_torture_current_version,
653 list_empty(&rcu_torture_freelist), 770 list_empty(&rcu_torture_freelist),
654 atomic_read(&n_rcu_torture_alloc), 771 atomic_read(&n_rcu_torture_alloc),
655 atomic_read(&n_rcu_torture_alloc_fail), 772 atomic_read(&n_rcu_torture_alloc_fail),
656 atomic_read(&n_rcu_torture_free), 773 atomic_read(&n_rcu_torture_free),
657 atomic_read(&n_rcu_torture_mberror)); 774 atomic_read(&n_rcu_torture_mberror),
775 n_rcu_torture_timers);
658 if (atomic_read(&n_rcu_torture_mberror) != 0) 776 if (atomic_read(&n_rcu_torture_mberror) != 0)
659 cnt += sprintf(&page[cnt], " !!!"); 777 cnt += sprintf(&page[cnt], " !!!");
660 cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); 778 cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG);
661 if (i > 1) { 779 if (i > 1) {
662 cnt += sprintf(&page[cnt], "!!! "); 780 cnt += sprintf(&page[cnt], "!!! ");
663 atomic_inc(&n_rcu_torture_error); 781 atomic_inc(&n_rcu_torture_error);
782 WARN_ON_ONCE(1);
664 } 783 }
665 cnt += sprintf(&page[cnt], "Reader Pipe: "); 784 cnt += sprintf(&page[cnt], "Reader Pipe: ");
666 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) 785 for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++)
@@ -785,15 +904,34 @@ rcu_torture_shuffle(void *arg)
785 return 0; 904 return 0;
786} 905}
787 906
907/* Cause the rcutorture test to "stutter", starting and stopping all
908 * threads periodically.
909 */
910static int
911rcu_torture_stutter(void *arg)
912{
913 VERBOSE_PRINTK_STRING("rcu_torture_stutter task started");
914 do {
915 schedule_timeout_interruptible(stutter * HZ);
916 stutter_pause_test = 1;
917 if (!kthread_should_stop())
918 schedule_timeout_interruptible(stutter * HZ);
919 stutter_pause_test = 0;
920 } while (!kthread_should_stop());
921 VERBOSE_PRINTK_STRING("rcu_torture_stutter task stopping");
922 return 0;
923}
924
788static inline void 925static inline void
789rcu_torture_print_module_parms(char *tag) 926rcu_torture_print_module_parms(char *tag)
790{ 927{
791 printk(KERN_ALERT "%s" TORTURE_FLAG 928 printk(KERN_ALERT "%s" TORTURE_FLAG
792 "--- %s: nreaders=%d nfakewriters=%d " 929 "--- %s: nreaders=%d nfakewriters=%d "
793 "stat_interval=%d verbose=%d test_no_idle_hz=%d " 930 "stat_interval=%d verbose=%d test_no_idle_hz=%d "
794 "shuffle_interval = %d\n", 931 "shuffle_interval=%d stutter=%d irqreader=%d\n",
795 torture_type, tag, nrealreaders, nfakewriters, 932 torture_type, tag, nrealreaders, nfakewriters,
796 stat_interval, verbose, test_no_idle_hz, shuffle_interval); 933 stat_interval, verbose, test_no_idle_hz, shuffle_interval,
934 stutter, irqreader);
797} 935}
798 936
799static void 937static void
@@ -802,6 +940,11 @@ rcu_torture_cleanup(void)
802 int i; 940 int i;
803 941
804 fullstop = 1; 942 fullstop = 1;
943 if (stutter_task) {
944 VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task");
945 kthread_stop(stutter_task);
946 }
947 stutter_task = NULL;
805 if (shuffler_task) { 948 if (shuffler_task) {
806 VERBOSE_PRINTK_STRING("Stopping rcu_torture_shuffle task"); 949 VERBOSE_PRINTK_STRING("Stopping rcu_torture_shuffle task");
807 kthread_stop(shuffler_task); 950 kthread_stop(shuffler_task);
@@ -848,7 +991,9 @@ rcu_torture_cleanup(void)
848 stats_task = NULL; 991 stats_task = NULL;
849 992
850 /* Wait for all RCU callbacks to fire. */ 993 /* Wait for all RCU callbacks to fire. */
851 rcu_barrier(); 994
995 if (cur_ops->cb_barrier != NULL)
996 cur_ops->cb_barrier();
852 997
853 rcu_torture_stats_print(); /* -After- the stats thread is stopped! */ 998 rcu_torture_stats_print(); /* -After- the stats thread is stopped! */
854 999
@@ -868,7 +1013,7 @@ rcu_torture_init(void)
868 int firsterr = 0; 1013 int firsterr = 0;
869 static struct rcu_torture_ops *torture_ops[] = 1014 static struct rcu_torture_ops *torture_ops[] =
870 { &rcu_ops, &rcu_sync_ops, &rcu_bh_ops, &rcu_bh_sync_ops, 1015 { &rcu_ops, &rcu_sync_ops, &rcu_bh_ops, &rcu_bh_sync_ops,
871 &srcu_ops, &sched_ops, }; 1016 &srcu_ops, &sched_ops, &sched_ops_sync, };
872 1017
873 /* Process args and tell the world that the torturer is on the job. */ 1018 /* Process args and tell the world that the torturer is on the job. */
874 for (i = 0; i < ARRAY_SIZE(torture_ops); i++) { 1019 for (i = 0; i < ARRAY_SIZE(torture_ops); i++) {
@@ -988,6 +1133,19 @@ rcu_torture_init(void)
988 goto unwind; 1133 goto unwind;
989 } 1134 }
990 } 1135 }
1136 if (stutter < 0)
1137 stutter = 0;
1138 if (stutter) {
1139 /* Create the stutter thread */
1140 stutter_task = kthread_run(rcu_torture_stutter, NULL,
1141 "rcu_torture_stutter");
1142 if (IS_ERR(stutter_task)) {
1143 firsterr = PTR_ERR(stutter_task);
1144 VERBOSE_PRINTK_ERRSTRING("Failed to create stutter");
1145 stutter_task = NULL;
1146 goto unwind;
1147 }
1148 }
991 return 0; 1149 return 0;
992 1150
993unwind: 1151unwind:
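The cleanup path above now reaches the per-flavor barrier through an optional ops hook rather than calling rcu_barrier() unconditionally. A minimal sketch of that pattern, with hypothetical names:

	struct flavor_ops {
		void (*cb_barrier)(void);	/* NULL if the flavor queues no callbacks */
	};

	static struct flavor_ops plain_sync_ops = {
		.cb_barrier = NULL,		/* synchronous flavor: nothing to wait for */
	};

	static void flavor_cleanup(struct flavor_ops *ops)
	{
		/* Wait for outstanding callbacks only where that is meaningful. */
		if (ops->cb_barrier != NULL)
			ops->cb_barrier();
	}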
diff --git a/kernel/relay.c b/kernel/relay.c
index 7de644cdec43..8d13a7855c08 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -407,6 +407,35 @@ void relay_reset(struct rchan *chan)
407} 407}
408EXPORT_SYMBOL_GPL(relay_reset); 408EXPORT_SYMBOL_GPL(relay_reset);
409 409
410static inline void relay_set_buf_dentry(struct rchan_buf *buf,
411 struct dentry *dentry)
412{
413 buf->dentry = dentry;
414 buf->dentry->d_inode->i_size = buf->early_bytes;
415}
416
417static struct dentry *relay_create_buf_file(struct rchan *chan,
418 struct rchan_buf *buf,
419 unsigned int cpu)
420{
421 struct dentry *dentry;
422 char *tmpname;
423
424 tmpname = kzalloc(NAME_MAX + 1, GFP_KERNEL);
425 if (!tmpname)
426 return NULL;
427 snprintf(tmpname, NAME_MAX, "%s%d", chan->base_filename, cpu);
428
429 /* Create file in fs */
430 dentry = chan->cb->create_buf_file(tmpname, chan->parent,
431 S_IRUSR, buf,
432 &chan->is_global);
433
434 kfree(tmpname);
435
436 return dentry;
437}
438
410/* 439/*
411 * relay_open_buf - create a new relay channel buffer 440 * relay_open_buf - create a new relay channel buffer
412 * 441 *
@@ -416,45 +445,34 @@ static struct rchan_buf *relay_open_buf(struct rchan *chan, unsigned int cpu)
416{ 445{
417 struct rchan_buf *buf = NULL; 446 struct rchan_buf *buf = NULL;
418 struct dentry *dentry; 447 struct dentry *dentry;
419 char *tmpname;
420 448
421 if (chan->is_global) 449 if (chan->is_global)
422 return chan->buf[0]; 450 return chan->buf[0];
423 451
424 tmpname = kzalloc(NAME_MAX + 1, GFP_KERNEL);
425 if (!tmpname)
426 goto end;
427 snprintf(tmpname, NAME_MAX, "%s%d", chan->base_filename, cpu);
428
429 buf = relay_create_buf(chan); 452 buf = relay_create_buf(chan);
430 if (!buf) 453 if (!buf)
431 goto free_name; 454 return NULL;
455
456 if (chan->has_base_filename) {
457 dentry = relay_create_buf_file(chan, buf, cpu);
458 if (!dentry)
459 goto free_buf;
460 relay_set_buf_dentry(buf, dentry);
461 }
432 462
433 buf->cpu = cpu; 463 buf->cpu = cpu;
434 __relay_reset(buf, 1); 464 __relay_reset(buf, 1);
435 465
436 /* Create file in fs */
437 dentry = chan->cb->create_buf_file(tmpname, chan->parent, S_IRUSR,
438 buf, &chan->is_global);
439 if (!dentry)
440 goto free_buf;
441
442 buf->dentry = dentry;
443
444 if(chan->is_global) { 466 if(chan->is_global) {
445 chan->buf[0] = buf; 467 chan->buf[0] = buf;
446 buf->cpu = 0; 468 buf->cpu = 0;
447 } 469 }
448 470
449 goto free_name; 471 return buf;
450 472
451free_buf: 473free_buf:
452 relay_destroy_buf(buf); 474 relay_destroy_buf(buf);
453 buf = NULL; 475 return NULL;
454free_name:
455 kfree(tmpname);
456end:
457 return buf;
458} 476}
459 477
460/** 478/**
@@ -537,8 +555,8 @@ static int __cpuinit relay_hotcpu_callback(struct notifier_block *nb,
537 555
538/** 556/**
539 * relay_open - create a new relay channel 557 * relay_open - create a new relay channel
540 * @base_filename: base name of files to create 558 * @base_filename: base name of files to create, %NULL for buffering only
541 * @parent: dentry of parent directory, %NULL for root directory 559 * @parent: dentry of parent directory, %NULL for root directory or buffer
542 * @subbuf_size: size of sub-buffers 560 * @subbuf_size: size of sub-buffers
543 * @n_subbufs: number of sub-buffers 561 * @n_subbufs: number of sub-buffers
544 * @cb: client callback functions 562 * @cb: client callback functions
@@ -560,8 +578,6 @@ struct rchan *relay_open(const char *base_filename,
560{ 578{
561 unsigned int i; 579 unsigned int i;
562 struct rchan *chan; 580 struct rchan *chan;
563 if (!base_filename)
564 return NULL;
565 581
566 if (!(subbuf_size && n_subbufs)) 582 if (!(subbuf_size && n_subbufs))
567 return NULL; 583 return NULL;
@@ -576,7 +592,10 @@ struct rchan *relay_open(const char *base_filename,
576 chan->alloc_size = FIX_SIZE(subbuf_size * n_subbufs); 592 chan->alloc_size = FIX_SIZE(subbuf_size * n_subbufs);
577 chan->parent = parent; 593 chan->parent = parent;
578 chan->private_data = private_data; 594 chan->private_data = private_data;
579 strlcpy(chan->base_filename, base_filename, NAME_MAX); 595 if (base_filename) {
596 chan->has_base_filename = 1;
597 strlcpy(chan->base_filename, base_filename, NAME_MAX);
598 }
580 setup_callbacks(chan, cb); 599 setup_callbacks(chan, cb);
581 kref_init(&chan->kref); 600 kref_init(&chan->kref);
582 601
@@ -604,6 +623,94 @@ free_bufs:
604} 623}
605EXPORT_SYMBOL_GPL(relay_open); 624EXPORT_SYMBOL_GPL(relay_open);
606 625
626struct rchan_percpu_buf_dispatcher {
627 struct rchan_buf *buf;
628 struct dentry *dentry;
629};
630
631/* Called in atomic context. */
632static void __relay_set_buf_dentry(void *info)
633{
634 struct rchan_percpu_buf_dispatcher *p = info;
635
636 relay_set_buf_dentry(p->buf, p->dentry);
637}
638
639/**
640 * relay_late_setup_files - triggers file creation
641 * @chan: channel to operate on
642 * @base_filename: base name of files to create
643 * @parent: dentry of parent directory, %NULL for root directory
644 *
645 * Returns 0 if successful, non-zero otherwise.
646 *
 647 * Use to set up files for a previously buffer-only channel.
 648 * Useful for doing early tracing in the kernel, before the VFS is up, for example.
649 */
650int relay_late_setup_files(struct rchan *chan,
651 const char *base_filename,
652 struct dentry *parent)
653{
654 int err = 0;
655 unsigned int i, curr_cpu;
656 unsigned long flags;
657 struct dentry *dentry;
658 struct rchan_percpu_buf_dispatcher disp;
659
660 if (!chan || !base_filename)
661 return -EINVAL;
662
663 strlcpy(chan->base_filename, base_filename, NAME_MAX);
664
665 mutex_lock(&relay_channels_mutex);
666 /* Is chan already set up? */
667 if (unlikely(chan->has_base_filename))
668 return -EEXIST;
669 chan->has_base_filename = 1;
670 chan->parent = parent;
671 curr_cpu = get_cpu();
672 /*
673 * The CPU hotplug notifier ran before us and created buffers with
 674 * no files associated. So it's safe to call relay_create_buf_file()
675 * on all currently online CPUs.
676 */
677 for_each_online_cpu(i) {
678 if (unlikely(!chan->buf[i])) {
679 printk(KERN_ERR "relay_late_setup_files: CPU %u "
680 "has no buffer, it must have!\n", i);
681 BUG();
682 err = -EINVAL;
683 break;
684 }
685
686 dentry = relay_create_buf_file(chan, chan->buf[i], i);
687 if (unlikely(!dentry)) {
688 err = -EINVAL;
689 break;
690 }
691
692 if (curr_cpu == i) {
693 local_irq_save(flags);
694 relay_set_buf_dentry(chan->buf[i], dentry);
695 local_irq_restore(flags);
696 } else {
697 disp.buf = chan->buf[i];
698 disp.dentry = dentry;
699 smp_mb();
700 /* relay_channels_mutex must be held, so wait. */
701 err = smp_call_function_single(i,
702 __relay_set_buf_dentry,
703 &disp, 1);
704 }
705 if (unlikely(err))
706 break;
707 }
708 put_cpu();
709 mutex_unlock(&relay_channels_mutex);
710
711 return err;
712}
713
607/** 714/**
608 * relay_switch_subbuf - switch to a new sub-buffer 715 * relay_switch_subbuf - switch to a new sub-buffer
609 * @buf: channel buffer 716 * @buf: channel buffer
@@ -627,8 +734,13 @@ size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length)
627 old_subbuf = buf->subbufs_produced % buf->chan->n_subbufs; 734 old_subbuf = buf->subbufs_produced % buf->chan->n_subbufs;
628 buf->padding[old_subbuf] = buf->prev_padding; 735 buf->padding[old_subbuf] = buf->prev_padding;
629 buf->subbufs_produced++; 736 buf->subbufs_produced++;
630 buf->dentry->d_inode->i_size += buf->chan->subbuf_size - 737 if (buf->dentry)
631 buf->padding[old_subbuf]; 738 buf->dentry->d_inode->i_size +=
739 buf->chan->subbuf_size -
740 buf->padding[old_subbuf];
741 else
742 buf->early_bytes += buf->chan->subbuf_size -
743 buf->padding[old_subbuf];
632 smp_mb(); 744 smp_mb();
633 if (waitqueue_active(&buf->read_wait)) 745 if (waitqueue_active(&buf->read_wait))
634 /* 746 /*
@@ -832,6 +944,10 @@ static void relay_file_read_consume(struct rchan_buf *buf,
832 size_t n_subbufs = buf->chan->n_subbufs; 944 size_t n_subbufs = buf->chan->n_subbufs;
833 size_t read_subbuf; 945 size_t read_subbuf;
834 946
947 if (buf->subbufs_produced == buf->subbufs_consumed &&
948 buf->offset == buf->bytes_consumed)
949 return;
950
835 if (buf->bytes_consumed + bytes_consumed > subbuf_size) { 951 if (buf->bytes_consumed + bytes_consumed > subbuf_size) {
836 relay_subbufs_consumed(buf->chan, buf->cpu, 1); 952 relay_subbufs_consumed(buf->chan, buf->cpu, 1);
837 buf->bytes_consumed = 0; 953 buf->bytes_consumed = 0;
@@ -863,6 +979,8 @@ static int relay_file_read_avail(struct rchan_buf *buf, size_t read_pos)
863 979
864 relay_file_read_consume(buf, read_pos, 0); 980 relay_file_read_consume(buf, read_pos, 0);
865 981
982 consumed = buf->subbufs_consumed;
983
866 if (unlikely(buf->offset > subbuf_size)) { 984 if (unlikely(buf->offset > subbuf_size)) {
867 if (produced == consumed) 985 if (produced == consumed)
868 return 0; 986 return 0;
@@ -881,8 +999,12 @@ static int relay_file_read_avail(struct rchan_buf *buf, size_t read_pos)
881 if (consumed > produced) 999 if (consumed > produced)
882 produced += n_subbufs * subbuf_size; 1000 produced += n_subbufs * subbuf_size;
883 1001
884 if (consumed == produced) 1002 if (consumed == produced) {
1003 if (buf->offset == subbuf_size &&
1004 buf->subbufs_produced > buf->subbufs_consumed)
1005 return 1;
885 return 0; 1006 return 0;
1007 }
886 1008
887 return 1; 1009 return 1;
888} 1010}
@@ -1237,4 +1359,4 @@ static __init int relay_init(void)
1237 return 0; 1359 return 0;
1238} 1360}
1239 1361
1240module_init(relay_init); 1362early_initcall(relay_init);
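
The relay changes above make base_filename optional in relay_open() and add relay_late_setup_files() so the per-cpu files can be attached once a filesystem is available. A hedged sketch of that two-step usage, assuming debugfs as the backing filesystem; the callback and channel names are illustrative, not from this patch:

#include <linux/relay.h>
#include <linux/debugfs.h>
#include <linux/errno.h>

/* Illustrative callbacks backing the relay files with debugfs. */
static struct dentry *example_create_buf_file(const char *filename,
                                              struct dentry *parent, int mode,
                                              struct rchan_buf *buf,
                                              int *is_global)
{
        return debugfs_create_file(filename, mode, parent, buf,
                                   &relay_file_operations);
}

static int example_remove_buf_file(struct dentry *dentry)
{
        debugfs_remove(dentry);
        return 0;
}

static struct rchan_callbacks example_relay_callbacks = {
        .create_buf_file        = example_create_buf_file,
        .remove_buf_file        = example_remove_buf_file,
};

static struct rchan *example_chan;

/* Early boot: buffer-only channel, no files yet (NULL base_filename). */
static int example_early_setup(void)
{
        example_chan = relay_open(NULL, NULL, 256 * 1024, 4,
                                  &example_relay_callbacks, NULL);
        return example_chan ? 0 : -ENOMEM;
}

/* Later, once debugfs is mounted: create the per-cpu files. */
static int example_late_setup(void)
{
        return relay_late_setup_files(example_chan, "example_trace", NULL);
}

Data logged before the late setup is accounted in buf->early_bytes, which relay_set_buf_dentry() folds into the new inode's i_size, so nothing written early is lost.
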
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
index d3c61b4ebef2..f275c8eca772 100644
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -13,6 +13,7 @@
13#include <linux/slab.h> 13#include <linux/slab.h>
14#include <linux/res_counter.h> 14#include <linux/res_counter.h>
15#include <linux/uaccess.h> 15#include <linux/uaccess.h>
16#include <linux/mm.h>
16 17
17void res_counter_init(struct res_counter *counter) 18void res_counter_init(struct res_counter *counter)
18{ 19{
@@ -102,44 +103,37 @@ u64 res_counter_read_u64(struct res_counter *counter, int member)
102 return *res_counter_member(counter, member); 103 return *res_counter_member(counter, member);
103} 104}
104 105
105ssize_t res_counter_write(struct res_counter *counter, int member, 106int res_counter_memparse_write_strategy(const char *buf,
106 const char __user *userbuf, size_t nbytes, loff_t *pos, 107 unsigned long long *res)
107 int (*write_strategy)(char *st_buf, unsigned long long *val))
108{ 108{
109 int ret; 109 char *end;
110 char *buf, *end; 110 /* FIXME - make memparse() take const char* args */
111 unsigned long flags; 111 *res = memparse((char *)buf, &end);
112 unsigned long long tmp, *val; 112 if (*end != '\0')
113 113 return -EINVAL;
114 buf = kmalloc(nbytes + 1, GFP_KERNEL);
115 ret = -ENOMEM;
116 if (buf == NULL)
117 goto out;
118 114
119 buf[nbytes] = '\0'; 115 *res = PAGE_ALIGN(*res);
120 ret = -EFAULT; 116 return 0;
121 if (copy_from_user(buf, userbuf, nbytes)) 117}
122 goto out_free;
123 118
124 ret = -EINVAL; 119int res_counter_write(struct res_counter *counter, int member,
120 const char *buf, write_strategy_fn write_strategy)
121{
122 char *end;
123 unsigned long flags;
124 unsigned long long tmp, *val;
125 125
126 strstrip(buf);
127 if (write_strategy) { 126 if (write_strategy) {
128 if (write_strategy(buf, &tmp)) { 127 if (write_strategy(buf, &tmp))
129 goto out_free; 128 return -EINVAL;
130 }
131 } else { 129 } else {
132 tmp = simple_strtoull(buf, &end, 10); 130 tmp = simple_strtoull(buf, &end, 10);
133 if (*end != '\0') 131 if (*end != '\0')
134 goto out_free; 132 return -EINVAL;
135 } 133 }
136 spin_lock_irqsave(&counter->lock, flags); 134 spin_lock_irqsave(&counter->lock, flags);
137 val = res_counter_member(counter, member); 135 val = res_counter_member(counter, member);
138 *val = tmp; 136 *val = tmp;
139 spin_unlock_irqrestore(&counter->lock, flags); 137 spin_unlock_irqrestore(&counter->lock, flags);
140 ret = nbytes; 138 return 0;
141out_free:
142 kfree(buf);
143out:
144 return ret;
145} 139}
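
After the rework above, res_counter_write() takes a kernel buffer that the cgroup core has already copied in, plus an optional write_strategy callback; the exported res_counter_memparse_write_strategy() is one such strategy, accepting memparse() suffixes (K/M/G) and page-aligning the result. A hedged sketch of a caller, assuming the standard RES_LIMIT member; the helper name is illustrative:

#include <linux/res_counter.h>

/* Illustrative: set a page-aligned limit from a string such as "64M". */
static int example_set_limit(struct res_counter *cnt, const char *buf)
{
        return res_counter_write(cnt, RES_LIMIT, buf,
                                 res_counter_memparse_write_strategy);
}
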
diff --git a/kernel/resource.c b/kernel/resource.c
index 74af2d7cb5a1..414d6fc9131e 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -362,35 +362,21 @@ int allocate_resource(struct resource *root, struct resource *new,
362 362
363EXPORT_SYMBOL(allocate_resource); 363EXPORT_SYMBOL(allocate_resource);
364 364
365/** 365/*
366 * insert_resource - Inserts a resource in the resource tree 366 * Insert a resource into the resource tree. If successful, return NULL,
367 * @parent: parent of the new resource 367 * otherwise return the conflicting resource (compare to __request_resource())
368 * @new: new resource to insert
369 *
370 * Returns 0 on success, -EBUSY if the resource can't be inserted.
371 *
372 * This function is equivalent to request_resource when no conflict
373 * happens. If a conflict happens, and the conflicting resources
374 * entirely fit within the range of the new resource, then the new
375 * resource is inserted and the conflicting resources become children of
376 * the new resource.
377 */ 368 */
378int insert_resource(struct resource *parent, struct resource *new) 369static struct resource * __insert_resource(struct resource *parent, struct resource *new)
379{ 370{
380 int result;
381 struct resource *first, *next; 371 struct resource *first, *next;
382 372
383 write_lock(&resource_lock);
384
385 for (;; parent = first) { 373 for (;; parent = first) {
386 result = 0;
387 first = __request_resource(parent, new); 374 first = __request_resource(parent, new);
388 if (!first) 375 if (!first)
389 goto out; 376 return first;
390 377
391 result = -EBUSY;
392 if (first == parent) 378 if (first == parent)
393 goto out; 379 return first;
394 380
395 if ((first->start > new->start) || (first->end < new->end)) 381 if ((first->start > new->start) || (first->end < new->end))
396 break; 382 break;
@@ -401,15 +387,13 @@ int insert_resource(struct resource *parent, struct resource *new)
401 for (next = first; ; next = next->sibling) { 387 for (next = first; ; next = next->sibling) {
402 /* Partial overlap? Bad, and unfixable */ 388 /* Partial overlap? Bad, and unfixable */
403 if (next->start < new->start || next->end > new->end) 389 if (next->start < new->start || next->end > new->end)
404 goto out; 390 return next;
405 if (!next->sibling) 391 if (!next->sibling)
406 break; 392 break;
407 if (next->sibling->start > new->end) 393 if (next->sibling->start > new->end)
408 break; 394 break;
409 } 395 }
410 396
411 result = 0;
412
413 new->parent = parent; 397 new->parent = parent;
414 new->sibling = next->sibling; 398 new->sibling = next->sibling;
415 new->child = first; 399 new->child = first;
@@ -426,10 +410,64 @@ int insert_resource(struct resource *parent, struct resource *new)
426 next = next->sibling; 410 next = next->sibling;
427 next->sibling = new; 411 next->sibling = new;
428 } 412 }
413 return NULL;
414}
429 415
430 out: 416/**
417 * insert_resource - Inserts a resource in the resource tree
418 * @parent: parent of the new resource
419 * @new: new resource to insert
420 *
421 * Returns 0 on success, -EBUSY if the resource can't be inserted.
422 *
423 * This function is equivalent to request_resource when no conflict
424 * happens. If a conflict happens, and the conflicting resources
425 * entirely fit within the range of the new resource, then the new
426 * resource is inserted and the conflicting resources become children of
427 * the new resource.
428 */
429int insert_resource(struct resource *parent, struct resource *new)
430{
431 struct resource *conflict;
432
433 write_lock(&resource_lock);
434 conflict = __insert_resource(parent, new);
435 write_unlock(&resource_lock);
436 return conflict ? -EBUSY : 0;
437}
438
439/**
440 * insert_resource_expand_to_fit - Insert a resource into the resource tree
441 * @root: root resource descriptor
442 * @new: new resource to insert
443 *
444 * Insert a resource into the resource tree, possibly expanding it in order
445 * to make it encompass any conflicting resources.
446 */
447void insert_resource_expand_to_fit(struct resource *root, struct resource *new)
448{
449 if (new->parent)
450 return;
451
452 write_lock(&resource_lock);
453 for (;;) {
454 struct resource *conflict;
455
456 conflict = __insert_resource(root, new);
457 if (!conflict)
458 break;
459 if (conflict == root)
460 break;
461
462 /* Ok, expand resource to cover the conflict, then try again .. */
463 if (conflict->start < new->start)
464 new->start = conflict->start;
465 if (conflict->end > new->end)
466 new->end = conflict->end;
467
468 printk("Expanded resource %s due to conflict with %s\n", new->name, conflict->name);
469 }
431 write_unlock(&resource_lock); 470 write_unlock(&resource_lock);
432 return result;
433} 471}
434 472
435/** 473/**
@@ -478,6 +516,74 @@ int adjust_resource(struct resource *res, resource_size_t start, resource_size_t
478 return result; 516 return result;
479} 517}
480 518
519static void __init __reserve_region_with_split(struct resource *root,
520 resource_size_t start, resource_size_t end,
521 const char *name)
522{
523 struct resource *parent = root;
524 struct resource *conflict;
525 struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL);
526
527 if (!res)
528 return;
529
530 res->name = name;
531 res->start = start;
532 res->end = end;
533 res->flags = IORESOURCE_BUSY;
534
535 for (;;) {
536 conflict = __request_resource(parent, res);
537 if (!conflict)
538 break;
539 if (conflict != parent) {
540 parent = conflict;
541 if (!(conflict->flags & IORESOURCE_BUSY))
542 continue;
543 }
544
545 /* Uhhuh, that didn't work out.. */
546 kfree(res);
547 res = NULL;
548 break;
549 }
550
551 if (!res) {
552 printk(KERN_DEBUG " __reserve_region_with_split: (%s) [%llx, %llx], res: (%s) [%llx, %llx]\n",
553 conflict->name, conflict->start, conflict->end,
554 name, start, end);
555
556 /* failed, split and try again */
557
 558 /* conflict covered the whole area */
559 if (conflict->start <= start && conflict->end >= end)
560 return;
561
562 if (conflict->start > start)
563 __reserve_region_with_split(root, start, conflict->start-1, name);
564 if (!(conflict->flags & IORESOURCE_BUSY)) {
565 resource_size_t common_start, common_end;
566
567 common_start = max(conflict->start, start);
568 common_end = min(conflict->end, end);
569 if (common_start < common_end)
570 __reserve_region_with_split(root, common_start, common_end, name);
571 }
572 if (conflict->end < end)
573 __reserve_region_with_split(root, conflict->end+1, end, name);
574 }
575
576}
577
578void reserve_region_with_split(struct resource *root,
579 resource_size_t start, resource_size_t end,
580 const char *name)
581{
582 write_lock(&resource_lock);
583 __reserve_region_with_split(root, start, end, name);
584 write_unlock(&resource_lock);
585}
586
481EXPORT_SYMBOL(adjust_resource); 587EXPORT_SYMBOL(adjust_resource);
482 588
483/** 589/**
@@ -490,7 +596,7 @@ resource_size_t resource_alignment(struct resource *res)
490{ 596{
491 switch (res->flags & (IORESOURCE_SIZEALIGN | IORESOURCE_STARTALIGN)) { 597 switch (res->flags & (IORESOURCE_SIZEALIGN | IORESOURCE_STARTALIGN)) {
492 case IORESOURCE_SIZEALIGN: 598 case IORESOURCE_SIZEALIGN:
493 return res->end - res->start + 1; 599 return resource_size(res);
494 case IORESOURCE_STARTALIGN: 600 case IORESOURCE_STARTALIGN:
495 return res->start; 601 return res->start;
496 default: 602 default:
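
insert_resource_expand_to_fit() above keeps growing the new resource until it fully contains whatever it conflicts with, while reserve_region_with_split() recursively carves a busy reservation around existing conflicts. A hedged sketch of boot-time callers against iomem_resource; the region values and names are illustrative:

#include <linux/ioport.h>
#include <linux/init.h>

static struct resource example_fw_res = {
        .name   = "Example firmware data",
        .start  = 0x000f0000,
        .end    = 0x000fffff,
        .flags  = IORESOURCE_MEM | IORESOURCE_BUSY,
};

static void __init example_reserve_regions(void)
{
        /* Grow example_fw_res so it fully contains any conflicts. */
        insert_resource_expand_to_fit(&iomem_resource, &example_fw_res);

        /* Reserve a fixed window, splitting around busy conflicts. */
        reserve_region_with_split(&iomem_resource, 0x100000, 0x1fffff,
                                  "example reservation");
}
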
diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c
index 092e4c620af9..a56f629b057a 100644
--- a/kernel/rtmutex-tester.c
+++ b/kernel/rtmutex-tester.c
@@ -297,8 +297,8 @@ static int test_func(void *data)
297 * 297 *
298 * opcode:data 298 * opcode:data
299 */ 299 */
300static ssize_t sysfs_test_command(struct sys_device *dev, const char *buf, 300static ssize_t sysfs_test_command(struct sys_device *dev, struct sysdev_attribute *attr,
301 size_t count) 301 const char *buf, size_t count)
302{ 302{
303 struct sched_param schedpar; 303 struct sched_param schedpar;
304 struct test_thread_data *td; 304 struct test_thread_data *td;
@@ -360,7 +360,8 @@ static ssize_t sysfs_test_command(struct sys_device *dev, const char *buf,
360 * @dev: thread to query 360 * @dev: thread to query
361 * @buf: char buffer to be filled with thread status info 361 * @buf: char buffer to be filled with thread status info
362 */ 362 */
363static ssize_t sysfs_test_status(struct sys_device *dev, char *buf) 363static ssize_t sysfs_test_status(struct sys_device *dev, struct sysdev_attribute *attr,
364 char *buf)
364{ 365{
365 struct test_thread_data *td; 366 struct test_thread_data *td;
366 struct task_struct *tsk; 367 struct task_struct *tsk;
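
The two rtmutex-tester hunks adapt to the sysdev attribute API, which now passes the struct sysdev_attribute pointer to show/store methods. A hedged sketch of the updated callback shape; the attribute and helper names are illustrative:

#include <linux/sysdev.h>
#include <linux/kernel.h>

static ssize_t example_show(struct sys_device *dev,
                            struct sysdev_attribute *attr, char *buf)
{
        return sprintf(buf, "%u\n", dev->id);
}

static ssize_t example_store(struct sys_device *dev,
                             struct sysdev_attribute *attr,
                             const char *buf, size_t count)
{
        /* parse and apply @buf here */
        return count;
}

static SYSDEV_ATTR(example, 0644, example_show, example_store);
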
diff --git a/kernel/sched.c b/kernel/sched.c
index 0cdb50260dbf..d897a524e7d8 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -70,10 +70,13 @@
70#include <linux/bootmem.h> 70#include <linux/bootmem.h>
71#include <linux/debugfs.h> 71#include <linux/debugfs.h>
72#include <linux/ctype.h> 72#include <linux/ctype.h>
73#include <linux/ftrace.h>
73 74
74#include <asm/tlb.h> 75#include <asm/tlb.h>
75#include <asm/irq_regs.h> 76#include <asm/irq_regs.h>
76 77
78#include "sched_cpupri.h"
79
77/* 80/*
78 * Convert user-nice values [ -20 ... 0 ... 19 ] 81 * Convert user-nice values [ -20 ... 0 ... 19 ]
79 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], 82 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
@@ -198,14 +201,19 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
198 hrtimer_init(&rt_b->rt_period_timer, 201 hrtimer_init(&rt_b->rt_period_timer,
199 CLOCK_MONOTONIC, HRTIMER_MODE_REL); 202 CLOCK_MONOTONIC, HRTIMER_MODE_REL);
200 rt_b->rt_period_timer.function = sched_rt_period_timer; 203 rt_b->rt_period_timer.function = sched_rt_period_timer;
201 rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; 204 rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED;
205}
206
207static inline int rt_bandwidth_enabled(void)
208{
209 return sysctl_sched_rt_runtime >= 0;
202} 210}
203 211
204static void start_rt_bandwidth(struct rt_bandwidth *rt_b) 212static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
205{ 213{
206 ktime_t now; 214 ktime_t now;
207 215
208 if (rt_b->rt_runtime == RUNTIME_INF) 216 if (rt_bandwidth_enabled() && rt_b->rt_runtime == RUNTIME_INF)
209 return; 217 return;
210 218
211 if (hrtimer_active(&rt_b->rt_period_timer)) 219 if (hrtimer_active(&rt_b->rt_period_timer))
@@ -289,15 +297,15 @@ struct task_group root_task_group;
289static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); 297static DEFINE_PER_CPU(struct sched_entity, init_sched_entity);
290/* Default task group's cfs_rq on each cpu */ 298/* Default task group's cfs_rq on each cpu */
291static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; 299static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp;
292#endif 300#endif /* CONFIG_FAIR_GROUP_SCHED */
293 301
294#ifdef CONFIG_RT_GROUP_SCHED 302#ifdef CONFIG_RT_GROUP_SCHED
295static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); 303static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
296static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; 304static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
297#endif 305#endif /* CONFIG_RT_GROUP_SCHED */
298#else 306#else /* !CONFIG_USER_SCHED */
299#define root_task_group init_task_group 307#define root_task_group init_task_group
300#endif 308#endif /* CONFIG_USER_SCHED */
301 309
302/* task_group_lock serializes add/remove of task groups and also changes to 310/* task_group_lock serializes add/remove of task groups and also changes to
303 * a task group's cpu shares. 311 * a task group's cpu shares.
@@ -307,9 +315,9 @@ static DEFINE_SPINLOCK(task_group_lock);
307#ifdef CONFIG_FAIR_GROUP_SCHED 315#ifdef CONFIG_FAIR_GROUP_SCHED
308#ifdef CONFIG_USER_SCHED 316#ifdef CONFIG_USER_SCHED
309# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD) 317# define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD)
310#else 318#else /* !CONFIG_USER_SCHED */
311# define INIT_TASK_GROUP_LOAD NICE_0_LOAD 319# define INIT_TASK_GROUP_LOAD NICE_0_LOAD
312#endif 320#endif /* CONFIG_USER_SCHED */
313 321
314/* 322/*
315 * A weight of 0 or 1 can cause arithmetics problems. 323 * A weight of 0 or 1 can cause arithmetics problems.
@@ -363,6 +371,10 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
363#else 371#else
364 372
365static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } 373static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
374static inline struct task_group *task_group(struct task_struct *p)
375{
376 return NULL;
377}
366 378
367#endif /* CONFIG_GROUP_SCHED */ 379#endif /* CONFIG_GROUP_SCHED */
368 380
@@ -373,6 +385,7 @@ struct cfs_rq {
373 385
374 u64 exec_clock; 386 u64 exec_clock;
375 u64 min_vruntime; 387 u64 min_vruntime;
388 u64 pair_start;
376 389
377 struct rb_root tasks_timeline; 390 struct rb_root tasks_timeline;
378 struct rb_node *rb_leftmost; 391 struct rb_node *rb_leftmost;
@@ -401,6 +414,31 @@ struct cfs_rq {
401 */ 414 */
402 struct list_head leaf_cfs_rq_list; 415 struct list_head leaf_cfs_rq_list;
403 struct task_group *tg; /* group that "owns" this runqueue */ 416 struct task_group *tg; /* group that "owns" this runqueue */
417
418#ifdef CONFIG_SMP
419 /*
420 * the part of load.weight contributed by tasks
421 */
422 unsigned long task_weight;
423
424 /*
425 * h_load = weight * f(tg)
426 *
427 * Where f(tg) is the recursive weight fraction assigned to
428 * this group.
429 */
430 unsigned long h_load;
431
432 /*
433 * this cpu's part of tg->shares
434 */
435 unsigned long shares;
436
437 /*
438 * load.weight at the time we set shares
439 */
440 unsigned long rq_weight;
441#endif
404#endif 442#endif
405}; 443};
406 444
@@ -452,6 +490,9 @@ struct root_domain {
452 */ 490 */
453 cpumask_t rto_mask; 491 cpumask_t rto_mask;
454 atomic_t rto_count; 492 atomic_t rto_count;
493#ifdef CONFIG_SMP
494 struct cpupri cpupri;
495#endif
455}; 496};
456 497
457/* 498/*
@@ -526,14 +567,19 @@ struct rq {
526 int push_cpu; 567 int push_cpu;
527 /* cpu of this runqueue: */ 568 /* cpu of this runqueue: */
528 int cpu; 569 int cpu;
570 int online;
571
572 unsigned long avg_load_per_task;
529 573
530 struct task_struct *migration_thread; 574 struct task_struct *migration_thread;
531 struct list_head migration_queue; 575 struct list_head migration_queue;
532#endif 576#endif
533 577
534#ifdef CONFIG_SCHED_HRTICK 578#ifdef CONFIG_SCHED_HRTICK
535 unsigned long hrtick_flags; 579#ifdef CONFIG_SMP
536 ktime_t hrtick_expire; 580 int hrtick_csd_pending;
581 struct call_single_data hrtick_csd;
582#endif
537 struct hrtimer hrtick_timer; 583 struct hrtimer hrtick_timer;
538#endif 584#endif
539 585
@@ -559,14 +605,13 @@ struct rq {
559 /* BKL stats */ 605 /* BKL stats */
560 unsigned int bkl_count; 606 unsigned int bkl_count;
561#endif 607#endif
562 struct lock_class_key rq_lock_key;
563}; 608};
564 609
565static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); 610static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
566 611
567static inline void check_preempt_curr(struct rq *rq, struct task_struct *p) 612static inline void check_preempt_curr(struct rq *rq, struct task_struct *p, int sync)
568{ 613{
569 rq->curr->sched_class->check_preempt_curr(rq, p); 614 rq->curr->sched_class->check_preempt_curr(rq, p, sync);
570} 615}
571 616
572static inline int cpu_of(struct rq *rq) 617static inline int cpu_of(struct rq *rq)
@@ -607,6 +652,24 @@ static inline void update_rq_clock(struct rq *rq)
607# define const_debug static const 652# define const_debug static const
608#endif 653#endif
609 654
655/**
656 * runqueue_is_locked
657 *
658 * Returns true if the current cpu runqueue is locked.
659 * This interface allows printk to be called with the runqueue lock
660 * held and know whether or not it is OK to wake up the klogd.
661 */
662int runqueue_is_locked(void)
663{
664 int cpu = get_cpu();
665 struct rq *rq = cpu_rq(cpu);
666 int ret;
667
668 ret = spin_is_locked(&rq->lock);
669 put_cpu();
670 return ret;
671}
672
610/* 673/*
611 * Debugging: various feature bits 674 * Debugging: various feature bits
612 */ 675 */
@@ -749,6 +812,12 @@ late_initcall(sched_init_debug);
749const_debug unsigned int sysctl_sched_nr_migrate = 32; 812const_debug unsigned int sysctl_sched_nr_migrate = 32;
750 813
751/* 814/*
815 * ratelimit for updating the group shares.
816 * default: 0.25ms
817 */
818unsigned int sysctl_sched_shares_ratelimit = 250000;
819
820/*
752 * period over which we measure -rt task cpu usage in us. 821 * period over which we measure -rt task cpu usage in us.
753 * default: 1s 822 * default: 1s
754 */ 823 */
@@ -769,88 +838,12 @@ static inline u64 global_rt_period(void)
769 838
770static inline u64 global_rt_runtime(void) 839static inline u64 global_rt_runtime(void)
771{ 840{
772 if (sysctl_sched_rt_period < 0) 841 if (sysctl_sched_rt_runtime < 0)
773 return RUNTIME_INF; 842 return RUNTIME_INF;
774 843
775 return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; 844 return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
776} 845}
777 846
778unsigned long long time_sync_thresh = 100000;
779
780static DEFINE_PER_CPU(unsigned long long, time_offset);
781static DEFINE_PER_CPU(unsigned long long, prev_cpu_time);
782
783/*
784 * Global lock which we take every now and then to synchronize
785 * the CPUs time. This method is not warp-safe, but it's good
786 * enough to synchronize slowly diverging time sources and thus
787 * it's good enough for tracing:
788 */
789static DEFINE_SPINLOCK(time_sync_lock);
790static unsigned long long prev_global_time;
791
792static unsigned long long __sync_cpu_clock(unsigned long long time, int cpu)
793{
794 /*
795 * We want this inlined, to not get tracer function calls
796 * in this critical section:
797 */
798 spin_acquire(&time_sync_lock.dep_map, 0, 0, _THIS_IP_);
799 __raw_spin_lock(&time_sync_lock.raw_lock);
800
801 if (time < prev_global_time) {
802 per_cpu(time_offset, cpu) += prev_global_time - time;
803 time = prev_global_time;
804 } else {
805 prev_global_time = time;
806 }
807
808 __raw_spin_unlock(&time_sync_lock.raw_lock);
809 spin_release(&time_sync_lock.dep_map, 1, _THIS_IP_);
810
811 return time;
812}
813
814static unsigned long long __cpu_clock(int cpu)
815{
816 unsigned long long now;
817
818 /*
819 * Only call sched_clock() if the scheduler has already been
820 * initialized (some code might call cpu_clock() very early):
821 */
822 if (unlikely(!scheduler_running))
823 return 0;
824
825 now = sched_clock_cpu(cpu);
826
827 return now;
828}
829
830/*
831 * For kernel-internal use: high-speed (but slightly incorrect) per-cpu
832 * clock constructed from sched_clock():
833 */
834unsigned long long cpu_clock(int cpu)
835{
836 unsigned long long prev_cpu_time, time, delta_time;
837 unsigned long flags;
838
839 local_irq_save(flags);
840 prev_cpu_time = per_cpu(prev_cpu_time, cpu);
841 time = __cpu_clock(cpu) + per_cpu(time_offset, cpu);
842 delta_time = time-prev_cpu_time;
843
844 if (unlikely(delta_time > time_sync_thresh)) {
845 time = __sync_cpu_clock(time, cpu);
846 per_cpu(prev_cpu_time, cpu) = time;
847 }
848 local_irq_restore(flags);
849
850 return time;
851}
852EXPORT_SYMBOL_GPL(cpu_clock);
853
854#ifndef prepare_arch_switch 847#ifndef prepare_arch_switch
855# define prepare_arch_switch(next) do { } while (0) 848# define prepare_arch_switch(next) do { } while (0)
856#endif 849#endif
@@ -996,13 +989,6 @@ static struct rq *this_rq_lock(void)
996 return rq; 989 return rq;
997} 990}
998 991
999static void __resched_task(struct task_struct *p, int tif_bit);
1000
1001static inline void resched_task(struct task_struct *p)
1002{
1003 __resched_task(p, TIF_NEED_RESCHED);
1004}
1005
1006#ifdef CONFIG_SCHED_HRTICK 992#ifdef CONFIG_SCHED_HRTICK
1007/* 993/*
1008 * Use HR-timers to deliver accurate preemption points. 994 * Use HR-timers to deliver accurate preemption points.
@@ -1014,25 +1000,6 @@ static inline void resched_task(struct task_struct *p)
1014 * When we get rescheduled we reprogram the hrtick_timer outside of the 1000 * When we get rescheduled we reprogram the hrtick_timer outside of the
1015 * rq->lock. 1001 * rq->lock.
1016 */ 1002 */
1017static inline void resched_hrt(struct task_struct *p)
1018{
1019 __resched_task(p, TIF_HRTICK_RESCHED);
1020}
1021
1022static inline void resched_rq(struct rq *rq)
1023{
1024 unsigned long flags;
1025
1026 spin_lock_irqsave(&rq->lock, flags);
1027 resched_task(rq->curr);
1028 spin_unlock_irqrestore(&rq->lock, flags);
1029}
1030
1031enum {
1032 HRTICK_SET, /* re-programm hrtick_timer */
1033 HRTICK_RESET, /* not a new slice */
1034 HRTICK_BLOCK, /* stop hrtick operations */
1035};
1036 1003
1037/* 1004/*
1038 * Use hrtick when: 1005 * Use hrtick when:
@@ -1043,40 +1010,11 @@ static inline int hrtick_enabled(struct rq *rq)
1043{ 1010{
1044 if (!sched_feat(HRTICK)) 1011 if (!sched_feat(HRTICK))
1045 return 0; 1012 return 0;
1046 if (unlikely(test_bit(HRTICK_BLOCK, &rq->hrtick_flags))) 1013 if (!cpu_active(cpu_of(rq)))
1047 return 0; 1014 return 0;
1048 return hrtimer_is_hres_active(&rq->hrtick_timer); 1015 return hrtimer_is_hres_active(&rq->hrtick_timer);
1049} 1016}
1050 1017
1051/*
1052 * Called to set the hrtick timer state.
1053 *
1054 * called with rq->lock held and irqs disabled
1055 */
1056static void hrtick_start(struct rq *rq, u64 delay, int reset)
1057{
1058 assert_spin_locked(&rq->lock);
1059
1060 /*
1061 * preempt at: now + delay
1062 */
1063 rq->hrtick_expire =
1064 ktime_add_ns(rq->hrtick_timer.base->get_time(), delay);
1065 /*
1066 * indicate we need to program the timer
1067 */
1068 __set_bit(HRTICK_SET, &rq->hrtick_flags);
1069 if (reset)
1070 __set_bit(HRTICK_RESET, &rq->hrtick_flags);
1071
1072 /*
1073 * New slices are called from the schedule path and don't need a
1074 * forced reschedule.
1075 */
1076 if (reset)
1077 resched_hrt(rq->curr);
1078}
1079
1080static void hrtick_clear(struct rq *rq) 1018static void hrtick_clear(struct rq *rq)
1081{ 1019{
1082 if (hrtimer_active(&rq->hrtick_timer)) 1020 if (hrtimer_active(&rq->hrtick_timer))
@@ -1084,32 +1022,6 @@ static void hrtick_clear(struct rq *rq)
1084} 1022}
1085 1023
1086/* 1024/*
1087 * Update the timer from the possible pending state.
1088 */
1089static void hrtick_set(struct rq *rq)
1090{
1091 ktime_t time;
1092 int set, reset;
1093 unsigned long flags;
1094
1095 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
1096
1097 spin_lock_irqsave(&rq->lock, flags);
1098 set = __test_and_clear_bit(HRTICK_SET, &rq->hrtick_flags);
1099 reset = __test_and_clear_bit(HRTICK_RESET, &rq->hrtick_flags);
1100 time = rq->hrtick_expire;
1101 clear_thread_flag(TIF_HRTICK_RESCHED);
1102 spin_unlock_irqrestore(&rq->lock, flags);
1103
1104 if (set) {
1105 hrtimer_start(&rq->hrtick_timer, time, HRTIMER_MODE_ABS);
1106 if (reset && !hrtimer_active(&rq->hrtick_timer))
1107 resched_rq(rq);
1108 } else
1109 hrtick_clear(rq);
1110}
1111
1112/*
1113 * High-resolution timer tick. 1025 * High-resolution timer tick.
1114 * Runs from hardirq context with interrupts disabled. 1026 * Runs from hardirq context with interrupts disabled.
1115 */ 1027 */
@@ -1128,27 +1040,37 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer)
1128} 1040}
1129 1041
1130#ifdef CONFIG_SMP 1042#ifdef CONFIG_SMP
1131static void hotplug_hrtick_disable(int cpu) 1043/*
1044 * called from hardirq (IPI) context
1045 */
1046static void __hrtick_start(void *arg)
1132{ 1047{
1133 struct rq *rq = cpu_rq(cpu); 1048 struct rq *rq = arg;
1134 unsigned long flags;
1135 1049
1136 spin_lock_irqsave(&rq->lock, flags); 1050 spin_lock(&rq->lock);
1137 rq->hrtick_flags = 0; 1051 hrtimer_restart(&rq->hrtick_timer);
1138 __set_bit(HRTICK_BLOCK, &rq->hrtick_flags); 1052 rq->hrtick_csd_pending = 0;
1139 spin_unlock_irqrestore(&rq->lock, flags); 1053 spin_unlock(&rq->lock);
1140
1141 hrtick_clear(rq);
1142} 1054}
1143 1055
1144static void hotplug_hrtick_enable(int cpu) 1056/*
1057 * Called to set the hrtick timer state.
1058 *
1059 * called with rq->lock held and irqs disabled
1060 */
1061static void hrtick_start(struct rq *rq, u64 delay)
1145{ 1062{
1146 struct rq *rq = cpu_rq(cpu); 1063 struct hrtimer *timer = &rq->hrtick_timer;
1147 unsigned long flags; 1064 ktime_t time = ktime_add_ns(timer->base->get_time(), delay);
1148 1065
1149 spin_lock_irqsave(&rq->lock, flags); 1066 timer->expires = time;
1150 __clear_bit(HRTICK_BLOCK, &rq->hrtick_flags); 1067
1151 spin_unlock_irqrestore(&rq->lock, flags); 1068 if (rq == this_rq()) {
1069 hrtimer_restart(timer);
1070 } else if (!rq->hrtick_csd_pending) {
1071 __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd);
1072 rq->hrtick_csd_pending = 1;
1073 }
1152} 1074}
1153 1075
1154static int 1076static int
@@ -1163,70 +1085,60 @@ hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu)
1163 case CPU_DOWN_PREPARE_FROZEN: 1085 case CPU_DOWN_PREPARE_FROZEN:
1164 case CPU_DEAD: 1086 case CPU_DEAD:
1165 case CPU_DEAD_FROZEN: 1087 case CPU_DEAD_FROZEN:
1166 hotplug_hrtick_disable(cpu); 1088 hrtick_clear(cpu_rq(cpu));
1167 return NOTIFY_OK;
1168
1169 case CPU_UP_PREPARE:
1170 case CPU_UP_PREPARE_FROZEN:
1171 case CPU_DOWN_FAILED:
1172 case CPU_DOWN_FAILED_FROZEN:
1173 case CPU_ONLINE:
1174 case CPU_ONLINE_FROZEN:
1175 hotplug_hrtick_enable(cpu);
1176 return NOTIFY_OK; 1089 return NOTIFY_OK;
1177 } 1090 }
1178 1091
1179 return NOTIFY_DONE; 1092 return NOTIFY_DONE;
1180} 1093}
1181 1094
1182static void init_hrtick(void) 1095static __init void init_hrtick(void)
1183{ 1096{
1184 hotcpu_notifier(hotplug_hrtick, 0); 1097 hotcpu_notifier(hotplug_hrtick, 0);
1185} 1098}
1186#endif /* CONFIG_SMP */ 1099#else
1100/*
1101 * Called to set the hrtick timer state.
1102 *
1103 * called with rq->lock held and irqs disabled
1104 */
1105static void hrtick_start(struct rq *rq, u64 delay)
1106{
1107 hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), HRTIMER_MODE_REL);
1108}
1187 1109
1188static void init_rq_hrtick(struct rq *rq) 1110static inline void init_hrtick(void)
1189{ 1111{
1190 rq->hrtick_flags = 0;
1191 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
1192 rq->hrtick_timer.function = hrtick;
1193 rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
1194} 1112}
1113#endif /* CONFIG_SMP */
1195 1114
1196void hrtick_resched(void) 1115static void init_rq_hrtick(struct rq *rq)
1197{ 1116{
1198 struct rq *rq; 1117#ifdef CONFIG_SMP
1199 unsigned long flags; 1118 rq->hrtick_csd_pending = 0;
1200 1119
1201 if (!test_thread_flag(TIF_HRTICK_RESCHED)) 1120 rq->hrtick_csd.flags = 0;
1202 return; 1121 rq->hrtick_csd.func = __hrtick_start;
1122 rq->hrtick_csd.info = rq;
1123#endif
1203 1124
1204 local_irq_save(flags); 1125 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
1205 rq = cpu_rq(smp_processor_id()); 1126 rq->hrtick_timer.function = hrtick;
1206 hrtick_set(rq); 1127 rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU;
1207 local_irq_restore(flags);
1208} 1128}
1209#else 1129#else /* CONFIG_SCHED_HRTICK */
1210static inline void hrtick_clear(struct rq *rq) 1130static inline void hrtick_clear(struct rq *rq)
1211{ 1131{
1212} 1132}
1213 1133
1214static inline void hrtick_set(struct rq *rq)
1215{
1216}
1217
1218static inline void init_rq_hrtick(struct rq *rq) 1134static inline void init_rq_hrtick(struct rq *rq)
1219{ 1135{
1220} 1136}
1221 1137
1222void hrtick_resched(void)
1223{
1224}
1225
1226static inline void init_hrtick(void) 1138static inline void init_hrtick(void)
1227{ 1139{
1228} 1140}
1229#endif 1141#endif /* CONFIG_SCHED_HRTICK */
1230 1142
1231/* 1143/*
1232 * resched_task - mark a task 'to be rescheduled now'. 1144 * resched_task - mark a task 'to be rescheduled now'.
@@ -1241,16 +1153,16 @@ static inline void init_hrtick(void)
1241#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) 1153#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
1242#endif 1154#endif
1243 1155
1244static void __resched_task(struct task_struct *p, int tif_bit) 1156static void resched_task(struct task_struct *p)
1245{ 1157{
1246 int cpu; 1158 int cpu;
1247 1159
1248 assert_spin_locked(&task_rq(p)->lock); 1160 assert_spin_locked(&task_rq(p)->lock);
1249 1161
1250 if (unlikely(test_tsk_thread_flag(p, tif_bit))) 1162 if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED)))
1251 return; 1163 return;
1252 1164
1253 set_tsk_thread_flag(p, tif_bit); 1165 set_tsk_thread_flag(p, TIF_NEED_RESCHED);
1254 1166
1255 cpu = task_cpu(p); 1167 cpu = task_cpu(p);
1256 if (cpu == smp_processor_id()) 1168 if (cpu == smp_processor_id())
@@ -1313,15 +1225,15 @@ void wake_up_idle_cpu(int cpu)
1313 if (!tsk_is_polling(rq->idle)) 1225 if (!tsk_is_polling(rq->idle))
1314 smp_send_reschedule(cpu); 1226 smp_send_reschedule(cpu);
1315} 1227}
1316#endif 1228#endif /* CONFIG_NO_HZ */
1317 1229
1318#else 1230#else /* !CONFIG_SMP */
1319static void __resched_task(struct task_struct *p, int tif_bit) 1231static void resched_task(struct task_struct *p)
1320{ 1232{
1321 assert_spin_locked(&task_rq(p)->lock); 1233 assert_spin_locked(&task_rq(p)->lock);
1322 set_tsk_thread_flag(p, tif_bit); 1234 set_tsk_need_resched(p);
1323} 1235}
1324#endif 1236#endif /* CONFIG_SMP */
1325 1237
1326#if BITS_PER_LONG == 32 1238#if BITS_PER_LONG == 32
1327# define WMULT_CONST (~0UL) 1239# define WMULT_CONST (~0UL)
@@ -1336,6 +1248,9 @@ static void __resched_task(struct task_struct *p, int tif_bit)
1336 */ 1248 */
1337#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y)) 1249#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
1338 1250
1251/*
1252 * delta *= weight / lw
1253 */
1339static unsigned long 1254static unsigned long
1340calc_delta_mine(unsigned long delta_exec, unsigned long weight, 1255calc_delta_mine(unsigned long delta_exec, unsigned long weight,
1341 struct load_weight *lw) 1256 struct load_weight *lw)
@@ -1363,12 +1278,6 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight,
1363 return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX); 1278 return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
1364} 1279}
1365 1280
1366static inline unsigned long
1367calc_delta_fair(unsigned long delta_exec, struct load_weight *lw)
1368{
1369 return calc_delta_mine(delta_exec, NICE_0_LOAD, lw);
1370}
1371
1372static inline void update_load_add(struct load_weight *lw, unsigned long inc) 1281static inline void update_load_add(struct load_weight *lw, unsigned long inc)
1373{ 1282{
1374 lw->weight += inc; 1283 lw->weight += inc;
@@ -1476,20 +1385,227 @@ static inline void dec_cpu_load(struct rq *rq, unsigned long load)
1476 update_load_sub(&rq->load, load); 1385 update_load_sub(&rq->load, load);
1477} 1386}
1478 1387
1388#if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(CONFIG_RT_GROUP_SCHED)
1389typedef int (*tg_visitor)(struct task_group *, void *);
1390
1391/*
1392 * Iterate the full tree, calling @down when first entering a node and @up when
1393 * leaving it for the final time.
1394 */
1395static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
1396{
1397 struct task_group *parent, *child;
1398 int ret;
1399
1400 rcu_read_lock();
1401 parent = &root_task_group;
1402down:
1403 ret = (*down)(parent, data);
1404 if (ret)
1405 goto out_unlock;
1406 list_for_each_entry_rcu(child, &parent->children, siblings) {
1407 parent = child;
1408 goto down;
1409
1410up:
1411 continue;
1412 }
1413 ret = (*up)(parent, data);
1414 if (ret)
1415 goto out_unlock;
1416
1417 child = parent;
1418 parent = parent->parent;
1419 if (parent)
1420 goto up;
1421out_unlock:
1422 rcu_read_unlock();
1423
1424 return ret;
1425}
1426
1427static int tg_nop(struct task_group *tg, void *data)
1428{
1429 return 0;
1430}
1431#endif
1432
1479#ifdef CONFIG_SMP 1433#ifdef CONFIG_SMP
1480static unsigned long source_load(int cpu, int type); 1434static unsigned long source_load(int cpu, int type);
1481static unsigned long target_load(int cpu, int type); 1435static unsigned long target_load(int cpu, int type);
1482static unsigned long cpu_avg_load_per_task(int cpu);
1483static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); 1436static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
1484#else /* CONFIG_SMP */ 1437
1438static unsigned long cpu_avg_load_per_task(int cpu)
1439{
1440 struct rq *rq = cpu_rq(cpu);
1441
1442 if (rq->nr_running)
1443 rq->avg_load_per_task = rq->load.weight / rq->nr_running;
1444
1445 return rq->avg_load_per_task;
1446}
1485 1447
1486#ifdef CONFIG_FAIR_GROUP_SCHED 1448#ifdef CONFIG_FAIR_GROUP_SCHED
1487static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) 1449
1450static void __set_se_shares(struct sched_entity *se, unsigned long shares);
1451
1452/*
1453 * Calculate and set the cpu's group shares.
1454 */
1455static void
1456__update_group_shares_cpu(struct task_group *tg, int cpu,
1457 unsigned long sd_shares, unsigned long sd_rq_weight)
1458{
1459 int boost = 0;
1460 unsigned long shares;
1461 unsigned long rq_weight;
1462
1463 if (!tg->se[cpu])
1464 return;
1465
1466 rq_weight = tg->cfs_rq[cpu]->load.weight;
1467
1468 /*
 1469 * If there are currently no tasks on the cpu, pretend there is one of
1470 * average load so that when a new task gets to run here it will not
1471 * get delayed by group starvation.
1472 */
1473 if (!rq_weight) {
1474 boost = 1;
1475 rq_weight = NICE_0_LOAD;
1476 }
1477
1478 if (unlikely(rq_weight > sd_rq_weight))
1479 rq_weight = sd_rq_weight;
1480
1481 /*
1482 * \Sum shares * rq_weight
1483 * shares = -----------------------
1484 * \Sum rq_weight
1485 *
1486 */
1487 shares = (sd_shares * rq_weight) / (sd_rq_weight + 1);
1488
1489 /*
1490 * record the actual number of shares, not the boosted amount.
1491 */
1492 tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
1493 tg->cfs_rq[cpu]->rq_weight = rq_weight;
1494
1495 if (shares < MIN_SHARES)
1496 shares = MIN_SHARES;
1497 else if (shares > MAX_SHARES)
1498 shares = MAX_SHARES;
1499
1500 __set_se_shares(tg->se[cpu], shares);
1501}
1502
1503/*
 1504 * Re-compute the task group's per-cpu shares over the given domain.
1505 * This needs to be done in a bottom-up fashion because the rq weight of a
1506 * parent group depends on the shares of its child groups.
1507 */
1508static int tg_shares_up(struct task_group *tg, void *data)
1509{
1510 unsigned long rq_weight = 0;
1511 unsigned long shares = 0;
1512 struct sched_domain *sd = data;
1513 int i;
1514
1515 for_each_cpu_mask(i, sd->span) {
1516 rq_weight += tg->cfs_rq[i]->load.weight;
1517 shares += tg->cfs_rq[i]->shares;
1518 }
1519
1520 if ((!shares && rq_weight) || shares > tg->shares)
1521 shares = tg->shares;
1522
1523 if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
1524 shares = tg->shares;
1525
1526 if (!rq_weight)
1527 rq_weight = cpus_weight(sd->span) * NICE_0_LOAD;
1528
1529 for_each_cpu_mask(i, sd->span) {
1530 struct rq *rq = cpu_rq(i);
1531 unsigned long flags;
1532
1533 spin_lock_irqsave(&rq->lock, flags);
1534 __update_group_shares_cpu(tg, i, shares, rq_weight);
1535 spin_unlock_irqrestore(&rq->lock, flags);
1536 }
1537
1538 return 0;
1539}
1540
1541/*
1542 * Compute the cpu's hierarchical load factor for each task group.
1543 * This needs to be done in a top-down fashion because the load of a child
 1544 * group is a fraction of its parent's load.
1545 */
1546static int tg_load_down(struct task_group *tg, void *data)
1547{
1548 unsigned long load;
1549 long cpu = (long)data;
1550
1551 if (!tg->parent) {
1552 load = cpu_rq(cpu)->load.weight;
1553 } else {
1554 load = tg->parent->cfs_rq[cpu]->h_load;
1555 load *= tg->cfs_rq[cpu]->shares;
1556 load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
1557 }
1558
1559 tg->cfs_rq[cpu]->h_load = load;
1560
1561 return 0;
1562}
1563
1564static void update_shares(struct sched_domain *sd)
1565{
1566 u64 now = cpu_clock(raw_smp_processor_id());
1567 s64 elapsed = now - sd->last_update;
1568
1569 if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
1570 sd->last_update = now;
1571 walk_tg_tree(tg_nop, tg_shares_up, sd);
1572 }
1573}
1574
1575static void update_shares_locked(struct rq *rq, struct sched_domain *sd)
1576{
1577 spin_unlock(&rq->lock);
1578 update_shares(sd);
1579 spin_lock(&rq->lock);
1580}
1581
1582static void update_h_load(long cpu)
1583{
1584 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
1585}
1586
1587#else
1588
1589static inline void update_shares(struct sched_domain *sd)
1590{
1591}
1592
1593static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd)
1488{ 1594{
1489} 1595}
1596
1490#endif 1597#endif
1491 1598
1492#endif /* CONFIG_SMP */ 1599#endif
1600
1601#ifdef CONFIG_FAIR_GROUP_SCHED
1602static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
1603{
1604#ifdef CONFIG_SMP
1605 cfs_rq->shares = shares;
1606#endif
1607}
1608#endif
1493 1609
1494#include "sched_stats.h" 1610#include "sched_stats.h"
1495#include "sched_idletask.c" 1611#include "sched_idletask.c"
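
walk_tg_tree() above is a generic pre-order/post-order walk of the task-group hierarchy: @down runs when a group is first entered, @up when it is left for the last time, and a non-zero return from either visitor aborts the walk. The patch itself uses it as walk_tg_tree(tg_nop, tg_shares_up, sd) and walk_tg_tree(tg_load_down, tg_nop, (void *)cpu). A hedged sketch of a hypothetical extra visitor, assuming it lives in sched.c next to the helper:

/* Illustrative visitor: count task groups on the way down. */
static int example_count_tg(struct task_group *tg, void *data)
{
        (*(unsigned long *)data)++;
        return 0;               /* non-zero would abort the walk */
}

static unsigned long example_nr_task_groups(void)
{
        unsigned long nr = 0;

        walk_tg_tree(example_count_tg, tg_nop, &nr);
        return nr;
}
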
@@ -1500,27 +1616,17 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
1500#endif 1616#endif
1501 1617
1502#define sched_class_highest (&rt_sched_class) 1618#define sched_class_highest (&rt_sched_class)
1619#define for_each_class(class) \
1620 for (class = sched_class_highest; class; class = class->next)
1503 1621
1504static inline void inc_load(struct rq *rq, const struct task_struct *p) 1622static void inc_nr_running(struct rq *rq)
1505{
1506 update_load_add(&rq->load, p->se.load.weight);
1507}
1508
1509static inline void dec_load(struct rq *rq, const struct task_struct *p)
1510{
1511 update_load_sub(&rq->load, p->se.load.weight);
1512}
1513
1514static void inc_nr_running(struct task_struct *p, struct rq *rq)
1515{ 1623{
1516 rq->nr_running++; 1624 rq->nr_running++;
1517 inc_load(rq, p);
1518} 1625}
1519 1626
1520static void dec_nr_running(struct task_struct *p, struct rq *rq) 1627static void dec_nr_running(struct rq *rq)
1521{ 1628{
1522 rq->nr_running--; 1629 rq->nr_running--;
1523 dec_load(rq, p);
1524} 1630}
1525 1631
1526static void set_load_weight(struct task_struct *p) 1632static void set_load_weight(struct task_struct *p)
@@ -1544,6 +1650,12 @@ static void set_load_weight(struct task_struct *p)
1544 p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO]; 1650 p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO];
1545} 1651}
1546 1652
1653static void update_avg(u64 *avg, u64 sample)
1654{
1655 s64 diff = sample - *avg;
1656 *avg += diff >> 3;
1657}
1658
1547static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup) 1659static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup)
1548{ 1660{
1549 sched_info_queued(p); 1661 sched_info_queued(p);
@@ -1553,6 +1665,13 @@ static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup)
1553 1665
1554static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep) 1666static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep)
1555{ 1667{
1668 if (sleep && p->se.last_wakeup) {
1669 update_avg(&p->se.avg_overlap,
1670 p->se.sum_exec_runtime - p->se.last_wakeup);
1671 p->se.last_wakeup = 0;
1672 }
1673
1674 sched_info_dequeued(p);
1556 p->sched_class->dequeue_task(rq, p, sleep); 1675 p->sched_class->dequeue_task(rq, p, sleep);
1557 p->se.on_rq = 0; 1676 p->se.on_rq = 0;
1558} 1677}
@@ -1612,7 +1731,7 @@ static void activate_task(struct rq *rq, struct task_struct *p, int wakeup)
1612 rq->nr_uninterruptible--; 1731 rq->nr_uninterruptible--;
1613 1732
1614 enqueue_task(rq, p, wakeup); 1733 enqueue_task(rq, p, wakeup);
1615 inc_nr_running(p, rq); 1734 inc_nr_running(rq);
1616} 1735}
1617 1736
1618/* 1737/*
@@ -1624,7 +1743,7 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
1624 rq->nr_uninterruptible++; 1743 rq->nr_uninterruptible++;
1625 1744
1626 dequeue_task(rq, p, sleep); 1745 dequeue_task(rq, p, sleep);
1627 dec_nr_running(p, rq); 1746 dec_nr_running(rq);
1628} 1747}
1629 1748
1630/** 1749/**
@@ -1636,12 +1755,6 @@ inline int task_curr(const struct task_struct *p)
1636 return cpu_curr(task_cpu(p)) == p; 1755 return cpu_curr(task_cpu(p)) == p;
1637} 1756}
1638 1757
1639/* Used instead of source_load when we know the type == 0 */
1640unsigned long weighted_cpuload(const int cpu)
1641{
1642 return cpu_rq(cpu)->load.weight;
1643}
1644
1645static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) 1758static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
1646{ 1759{
1647 set_task_rq(p, cpu); 1760 set_task_rq(p, cpu);
@@ -1670,6 +1783,12 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,
1670 1783
1671#ifdef CONFIG_SMP 1784#ifdef CONFIG_SMP
1672 1785
1786/* Used instead of source_load when we know the type == 0 */
1787static unsigned long weighted_cpuload(const int cpu)
1788{
1789 return cpu_rq(cpu)->load.weight;
1790}
1791
1673/* 1792/*
1674 * Is this task likely cache-hot: 1793 * Is this task likely cache-hot:
1675 */ 1794 */
@@ -1765,16 +1884,24 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
1765/* 1884/*
1766 * wait_task_inactive - wait for a thread to unschedule. 1885 * wait_task_inactive - wait for a thread to unschedule.
1767 * 1886 *
1887 * If @match_state is nonzero, it's the @p->state value just checked and
1888 * not expected to change. If it changes, i.e. @p might have woken up,
1889 * then return zero. When we succeed in waiting for @p to be off its CPU,
1890 * we return a positive number (its total switch count). If a second call
1891 * a short while later returns the same number, the caller can be sure that
1892 * @p has remained unscheduled the whole time.
1893 *
1768 * The caller must ensure that the task *will* unschedule sometime soon, 1894 * The caller must ensure that the task *will* unschedule sometime soon,
1769 * else this function might spin for a *long* time. This function can't 1895 * else this function might spin for a *long* time. This function can't
1770 * be called with interrupts off, or it may introduce deadlock with 1896 * be called with interrupts off, or it may introduce deadlock with
1771 * smp_call_function() if an IPI is sent by the same process we are 1897 * smp_call_function() if an IPI is sent by the same process we are
1772 * waiting to become inactive. 1898 * waiting to become inactive.
1773 */ 1899 */
1774void wait_task_inactive(struct task_struct *p) 1900unsigned long wait_task_inactive(struct task_struct *p, long match_state)
1775{ 1901{
1776 unsigned long flags; 1902 unsigned long flags;
1777 int running, on_rq; 1903 int running, on_rq;
1904 unsigned long ncsw;
1778 struct rq *rq; 1905 struct rq *rq;
1779 1906
1780 for (;;) { 1907 for (;;) {
@@ -1797,8 +1924,11 @@ void wait_task_inactive(struct task_struct *p)
1797 * return false if the runqueue has changed and p 1924 * return false if the runqueue has changed and p
1798 * is actually now running somewhere else! 1925 * is actually now running somewhere else!
1799 */ 1926 */
1800 while (task_running(rq, p)) 1927 while (task_running(rq, p)) {
1928 if (match_state && unlikely(p->state != match_state))
1929 return 0;
1801 cpu_relax(); 1930 cpu_relax();
1931 }
1802 1932
1803 /* 1933 /*
1804 * Ok, time to look more closely! We need the rq 1934 * Ok, time to look more closely! We need the rq
@@ -1808,9 +1938,18 @@ void wait_task_inactive(struct task_struct *p)
1808 rq = task_rq_lock(p, &flags); 1938 rq = task_rq_lock(p, &flags);
1809 running = task_running(rq, p); 1939 running = task_running(rq, p);
1810 on_rq = p->se.on_rq; 1940 on_rq = p->se.on_rq;
1941 ncsw = 0;
1942 if (!match_state || p->state == match_state)
1943 ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
1811 task_rq_unlock(rq, &flags); 1944 task_rq_unlock(rq, &flags);
1812 1945
1813 /* 1946 /*
1947 * If it changed from the expected state, bail out now.
1948 */
1949 if (unlikely(!ncsw))
1950 break;
1951
1952 /*
1814 * Was it really running after all now that we 1953 * Was it really running after all now that we
1815 * checked with the proper locks actually held? 1954 * checked with the proper locks actually held?
1816 * 1955 *
@@ -1842,6 +1981,8 @@ void wait_task_inactive(struct task_struct *p)
1842 */ 1981 */
1843 break; 1982 break;
1844 } 1983 }
1984
1985 return ncsw;
1845} 1986}
1846 1987
1847/*** 1988/***
@@ -1880,7 +2021,7 @@ static unsigned long source_load(int cpu, int type)
1880 struct rq *rq = cpu_rq(cpu); 2021 struct rq *rq = cpu_rq(cpu);
1881 unsigned long total = weighted_cpuload(cpu); 2022 unsigned long total = weighted_cpuload(cpu);
1882 2023
1883 if (type == 0) 2024 if (type == 0 || !sched_feat(LB_BIAS))
1884 return total; 2025 return total;
1885 2026
1886 return min(rq->cpu_load[type-1], total); 2027 return min(rq->cpu_load[type-1], total);
@@ -1895,25 +2036,13 @@ static unsigned long target_load(int cpu, int type)
1895 struct rq *rq = cpu_rq(cpu); 2036 struct rq *rq = cpu_rq(cpu);
1896 unsigned long total = weighted_cpuload(cpu); 2037 unsigned long total = weighted_cpuload(cpu);
1897 2038
1898 if (type == 0) 2039 if (type == 0 || !sched_feat(LB_BIAS))
1899 return total; 2040 return total;
1900 2041
1901 return max(rq->cpu_load[type-1], total); 2042 return max(rq->cpu_load[type-1], total);
1902} 2043}
1903 2044
1904/* 2045/*
1905 * Return the average load per task on the cpu's run queue
1906 */
1907static unsigned long cpu_avg_load_per_task(int cpu)
1908{
1909 struct rq *rq = cpu_rq(cpu);
1910 unsigned long total = weighted_cpuload(cpu);
1911 unsigned long n = rq->nr_running;
1912
1913 return n ? total / n : SCHED_LOAD_SCALE;
1914}
1915
1916/*
1917 * find_idlest_group finds and returns the least busy CPU group within the 2046 * find_idlest_group finds and returns the least busy CPU group within the
1918 * domain. 2047 * domain.
1919 */ 2048 */
@@ -1939,7 +2068,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
1939 /* Tally up the load of all CPUs in the group */ 2068 /* Tally up the load of all CPUs in the group */
1940 avg_load = 0; 2069 avg_load = 0;
1941 2070
1942 for_each_cpu_mask(i, group->cpumask) { 2071 for_each_cpu_mask_nr(i, group->cpumask) {
1943 /* Bias balancing toward cpus of our domain */ 2072 /* Bias balancing toward cpus of our domain */
1944 if (local_group) 2073 if (local_group)
1945 load = source_load(i, load_idx); 2074 load = source_load(i, load_idx);
@@ -1981,7 +2110,7 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu,
1981 /* Traverse only the allowed CPUs */ 2110 /* Traverse only the allowed CPUs */
1982 cpus_and(*tmp, group->cpumask, p->cpus_allowed); 2111 cpus_and(*tmp, group->cpumask, p->cpus_allowed);
1983 2112
1984 for_each_cpu_mask(i, *tmp) { 2113 for_each_cpu_mask_nr(i, *tmp) {
1985 load = weighted_cpuload(i); 2114 load = weighted_cpuload(i);
1986 2115
1987 if (load < min_load || (load == min_load && i == this_cpu)) { 2116 if (load < min_load || (load == min_load && i == this_cpu)) {
@@ -2019,6 +2148,9 @@ static int sched_balance_self(int cpu, int flag)
2019 sd = tmp; 2148 sd = tmp;
2020 } 2149 }
2021 2150
2151 if (sd)
2152 update_shares(sd);
2153
2022 while (sd) { 2154 while (sd) {
2023 cpumask_t span, tmpmask; 2155 cpumask_t span, tmpmask;
2024 struct sched_group *group; 2156 struct sched_group *group;
@@ -2085,6 +2217,22 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
2085 if (!sched_feat(SYNC_WAKEUPS)) 2217 if (!sched_feat(SYNC_WAKEUPS))
2086 sync = 0; 2218 sync = 0;
2087 2219
2220#ifdef CONFIG_SMP
2221 if (sched_feat(LB_WAKEUP_UPDATE)) {
2222 struct sched_domain *sd;
2223
2224 this_cpu = raw_smp_processor_id();
2225 cpu = task_cpu(p);
2226
2227 for_each_domain(this_cpu, sd) {
2228 if (cpu_isset(cpu, sd->span)) {
2229 update_shares(sd);
2230 break;
2231 }
2232 }
2233 }
2234#endif
2235
2088 smp_wmb(); 2236 smp_wmb();
2089 rq = task_rq_lock(p, &flags); 2237 rq = task_rq_lock(p, &flags);
2090 old_state = p->state; 2238 old_state = p->state;
@@ -2131,7 +2279,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
2131 } 2279 }
2132 } 2280 }
2133 } 2281 }
2134#endif 2282#endif /* CONFIG_SCHEDSTATS */
2135 2283
2136out_activate: 2284out_activate:
2137#endif /* CONFIG_SMP */ 2285#endif /* CONFIG_SMP */
@@ -2149,7 +2297,10 @@ out_activate:
2149 success = 1; 2297 success = 1;
2150 2298
2151out_running: 2299out_running:
2152 check_preempt_curr(rq, p); 2300 trace_mark(kernel_sched_wakeup,
2301 "pid %d state %ld ## rq %p task %p rq->curr %p",
2302 p->pid, p->state, rq, p, rq->curr);
2303 check_preempt_curr(rq, p, sync);
2153 2304
2154 p->state = TASK_RUNNING; 2305 p->state = TASK_RUNNING;
2155#ifdef CONFIG_SMP 2306#ifdef CONFIG_SMP
@@ -2157,6 +2308,8 @@ out_running:
2157 p->sched_class->task_wake_up(rq, p); 2308 p->sched_class->task_wake_up(rq, p);
2158#endif 2309#endif
2159out: 2310out:
2311 current->se.last_wakeup = current->se.sum_exec_runtime;
2312
2160 task_rq_unlock(rq, &flags); 2313 task_rq_unlock(rq, &flags);
2161 2314
2162 return success; 2315 return success;
@@ -2277,9 +2430,12 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
2277 * management (if any): 2430 * management (if any):
2278 */ 2431 */
2279 p->sched_class->task_new(rq, p); 2432 p->sched_class->task_new(rq, p);
2280 inc_nr_running(p, rq); 2433 inc_nr_running(rq);
2281 } 2434 }
2282 check_preempt_curr(rq, p); 2435 trace_mark(kernel_sched_wakeup_new,
2436 "pid %d state %ld ## rq %p task %p rq->curr %p",
2437 p->pid, p->state, rq, p, rq->curr);
2438 check_preempt_curr(rq, p, 0);
2283#ifdef CONFIG_SMP 2439#ifdef CONFIG_SMP
2284 if (p->sched_class->task_wake_up) 2440 if (p->sched_class->task_wake_up)
2285 p->sched_class->task_wake_up(rq, p); 2441 p->sched_class->task_wake_up(rq, p);
@@ -2331,7 +2487,7 @@ fire_sched_out_preempt_notifiers(struct task_struct *curr,
2331 notifier->ops->sched_out(notifier, next); 2487 notifier->ops->sched_out(notifier, next);
2332} 2488}
2333 2489
2334#else 2490#else /* !CONFIG_PREEMPT_NOTIFIERS */
2335 2491
2336static void fire_sched_in_preempt_notifiers(struct task_struct *curr) 2492static void fire_sched_in_preempt_notifiers(struct task_struct *curr)
2337{ 2493{
@@ -2343,7 +2499,7 @@ fire_sched_out_preempt_notifiers(struct task_struct *curr,
2343{ 2499{
2344} 2500}
2345 2501
2346#endif 2502#endif /* CONFIG_PREEMPT_NOTIFIERS */
2347 2503
2348/** 2504/**
2349 * prepare_task_switch - prepare to switch tasks 2505 * prepare_task_switch - prepare to switch tasks
@@ -2451,6 +2607,11 @@ context_switch(struct rq *rq, struct task_struct *prev,
2451 struct mm_struct *mm, *oldmm; 2607 struct mm_struct *mm, *oldmm;
2452 2608
2453 prepare_task_switch(rq, prev, next); 2609 prepare_task_switch(rq, prev, next);
2610 trace_mark(kernel_sched_schedule,
2611 "prev_pid %d next_pid %d prev_state %ld "
2612 "## rq %p prev %p next %p",
2613 prev->pid, next->pid, prev->state,
2614 rq, prev, next);
2454 mm = next->mm; 2615 mm = next->mm;
2455 oldmm = prev->active_mm; 2616 oldmm = prev->active_mm;
2456 /* 2617 /*
@@ -2612,10 +2773,10 @@ static void double_rq_lock(struct rq *rq1, struct rq *rq2)
2612 } else { 2773 } else {
2613 if (rq1 < rq2) { 2774 if (rq1 < rq2) {
2614 spin_lock(&rq1->lock); 2775 spin_lock(&rq1->lock);
2615 spin_lock(&rq2->lock); 2776 spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
2616 } else { 2777 } else {
2617 spin_lock(&rq2->lock); 2778 spin_lock(&rq2->lock);
2618 spin_lock(&rq1->lock); 2779 spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
2619 } 2780 }
2620 } 2781 }
2621 update_rq_clock(rq1); 2782 update_rq_clock(rq1);
@@ -2658,14 +2819,21 @@ static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
2658 if (busiest < this_rq) { 2819 if (busiest < this_rq) {
2659 spin_unlock(&this_rq->lock); 2820 spin_unlock(&this_rq->lock);
2660 spin_lock(&busiest->lock); 2821 spin_lock(&busiest->lock);
2661 spin_lock(&this_rq->lock); 2822 spin_lock_nested(&this_rq->lock, SINGLE_DEPTH_NESTING);
2662 ret = 1; 2823 ret = 1;
2663 } else 2824 } else
2664 spin_lock(&busiest->lock); 2825 spin_lock_nested(&busiest->lock, SINGLE_DEPTH_NESTING);
2665 } 2826 }
2666 return ret; 2827 return ret;
2667} 2828}
2668 2829
2830static void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
2831 __releases(busiest->lock)
2832{
2833 spin_unlock(&busiest->lock);
2834 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
2835}
2836
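double_unlock_balance() is the counterpart to double_lock_balance(): it drops busiest->lock and uses lock_set_subclass() to tell lockdep that this_rq->lock is back at subclass 0, since double_lock_balance() may have re-taken it with SINGLE_DEPTH_NESTING. A kernel-context sketch of the intended pairing (assumes this_rq->lock is already held, as in the balancing paths further down; not a standalone program, the function name is invented):

static int pull_one(struct rq *this_rq, struct rq *busiest)
{
	int moved = 0;

	/* May briefly drop this_rq->lock to preserve lock ordering. */
	double_lock_balance(this_rq, busiest);

	/* ... move_tasks()/pull_task() work would go here ... */

	/* Drops busiest->lock, resets this_rq->lock's lockdep subclass. */
	double_unlock_balance(this_rq, busiest);
	return moved;
}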
2669/* 2837/*
2670 * If dest_cpu is allowed for this process, migrate the task to it. 2838 * If dest_cpu is allowed for this process, migrate the task to it.
2671 * This is accomplished by forcing the cpu_allowed mask to only 2839 * This is accomplished by forcing the cpu_allowed mask to only
@@ -2680,7 +2848,7 @@ static void sched_migrate_task(struct task_struct *p, int dest_cpu)
2680 2848
2681 rq = task_rq_lock(p, &flags); 2849 rq = task_rq_lock(p, &flags);
2682 if (!cpu_isset(dest_cpu, p->cpus_allowed) 2850 if (!cpu_isset(dest_cpu, p->cpus_allowed)
2683 || unlikely(cpu_is_offline(dest_cpu))) 2851 || unlikely(!cpu_active(dest_cpu)))
2684 goto out; 2852 goto out;
2685 2853
2686 /* force the process onto the specified CPU */ 2854 /* force the process onto the specified CPU */
@@ -2727,7 +2895,7 @@ static void pull_task(struct rq *src_rq, struct task_struct *p,
2727 * Note that idle threads have a prio of MAX_PRIO, for this test 2895 * Note that idle threads have a prio of MAX_PRIO, for this test
2728 * to be always true for them. 2896 * to be always true for them.
2729 */ 2897 */
2730 check_preempt_curr(this_rq, p); 2898 check_preempt_curr(this_rq, p, 0);
2731} 2899}
2732 2900
2733/* 2901/*
@@ -2785,7 +2953,7 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2785 enum cpu_idle_type idle, int *all_pinned, 2953 enum cpu_idle_type idle, int *all_pinned,
2786 int *this_best_prio, struct rq_iterator *iterator) 2954 int *this_best_prio, struct rq_iterator *iterator)
2787{ 2955{
2788 int loops = 0, pulled = 0, pinned = 0, skip_for_load; 2956 int loops = 0, pulled = 0, pinned = 0;
2789 struct task_struct *p; 2957 struct task_struct *p;
2790 long rem_load_move = max_load_move; 2958 long rem_load_move = max_load_move;
2791 2959
@@ -2801,14 +2969,8 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2801next: 2969next:
2802 if (!p || loops++ > sysctl_sched_nr_migrate) 2970 if (!p || loops++ > sysctl_sched_nr_migrate)
2803 goto out; 2971 goto out;
2804 /* 2972
2805 * To help distribute high priority tasks across CPUs we don't 2973 if ((p->se.load.weight >> 1) > rem_load_move ||
2806 * skip a task if it will be the highest priority task (i.e. smallest
2807 * prio value) on its new queue regardless of its load weight
2808 */
2809 skip_for_load = (p->se.load.weight >> 1) > rem_load_move +
2810 SCHED_LOAD_SCALE_FUZZ;
2811 if ((skip_for_load && p->prio >= *this_best_prio) ||
2812 !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) { 2974 !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) {
2813 p = iterator->next(iterator->arg); 2975 p = iterator->next(iterator->arg);
2814 goto next; 2976 goto next;
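The removed SCHED_LOAD_SCALE_FUZZ/priority heuristic is replaced by a plain weight test: a task is skipped once half of its load weight already exceeds the load still to be moved. A standalone model with example numbers (in this kernel a nice-0 task weighs 1024):

#include <stdio.h>

static int skip_task(unsigned long se_load_weight, long rem_load_move)
{
	/* Skip when half the task's weight exceeds what is left to move. */
	return (long)(se_load_weight >> 1) > rem_load_move;
}

int main(void)
{
	printf("weight 1024, rem 2048 -> skip=%d\n", skip_task(1024, 2048));
	printf("weight 3072, rem 1024 -> skip=%d\n", skip_task(3072, 1024));
	return 0;
}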
@@ -2863,6 +3025,10 @@ static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
2863 max_load_move - total_load_moved, 3025 max_load_move - total_load_moved,
2864 sd, idle, all_pinned, &this_best_prio); 3026 sd, idle, all_pinned, &this_best_prio);
2865 class = class->next; 3027 class = class->next;
3028
3029 if (idle == CPU_NEWLY_IDLE && this_rq->nr_running)
3030 break;
3031
2866 } while (class && max_load_move > total_load_moved); 3032 } while (class && max_load_move > total_load_moved);
2867 3033
2868 return total_load_moved > 0; 3034 return total_load_moved > 0;
@@ -2939,6 +3105,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2939 max_load = this_load = total_load = total_pwr = 0; 3105 max_load = this_load = total_load = total_pwr = 0;
2940 busiest_load_per_task = busiest_nr_running = 0; 3106 busiest_load_per_task = busiest_nr_running = 0;
2941 this_load_per_task = this_nr_running = 0; 3107 this_load_per_task = this_nr_running = 0;
3108
2942 if (idle == CPU_NOT_IDLE) 3109 if (idle == CPU_NOT_IDLE)
2943 load_idx = sd->busy_idx; 3110 load_idx = sd->busy_idx;
2944 else if (idle == CPU_NEWLY_IDLE) 3111 else if (idle == CPU_NEWLY_IDLE)
@@ -2953,6 +3120,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2953 int __group_imb = 0; 3120 int __group_imb = 0;
2954 unsigned int balance_cpu = -1, first_idle_cpu = 0; 3121 unsigned int balance_cpu = -1, first_idle_cpu = 0;
2955 unsigned long sum_nr_running, sum_weighted_load; 3122 unsigned long sum_nr_running, sum_weighted_load;
3123 unsigned long sum_avg_load_per_task;
3124 unsigned long avg_load_per_task;
2956 3125
2957 local_group = cpu_isset(this_cpu, group->cpumask); 3126 local_group = cpu_isset(this_cpu, group->cpumask);
2958 3127
@@ -2961,10 +3130,12 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2961 3130
2962 /* Tally up the load of all CPUs in the group */ 3131 /* Tally up the load of all CPUs in the group */
2963 sum_weighted_load = sum_nr_running = avg_load = 0; 3132 sum_weighted_load = sum_nr_running = avg_load = 0;
3133 sum_avg_load_per_task = avg_load_per_task = 0;
3134
2964 max_cpu_load = 0; 3135 max_cpu_load = 0;
2965 min_cpu_load = ~0UL; 3136 min_cpu_load = ~0UL;
2966 3137
2967 for_each_cpu_mask(i, group->cpumask) { 3138 for_each_cpu_mask_nr(i, group->cpumask) {
2968 struct rq *rq; 3139 struct rq *rq;
2969 3140
2970 if (!cpu_isset(i, *cpus)) 3141 if (!cpu_isset(i, *cpus))
@@ -2994,6 +3165,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
2994 avg_load += load; 3165 avg_load += load;
2995 sum_nr_running += rq->nr_running; 3166 sum_nr_running += rq->nr_running;
2996 sum_weighted_load += weighted_cpuload(i); 3167 sum_weighted_load += weighted_cpuload(i);
3168
3169 sum_avg_load_per_task += cpu_avg_load_per_task(i);
2997 } 3170 }
2998 3171
2999 /* 3172 /*
@@ -3015,7 +3188,20 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
3015 avg_load = sg_div_cpu_power(group, 3188 avg_load = sg_div_cpu_power(group,
3016 avg_load * SCHED_LOAD_SCALE); 3189 avg_load * SCHED_LOAD_SCALE);
3017 3190
3018 if ((max_cpu_load - min_cpu_load) > SCHED_LOAD_SCALE) 3191
3192 /*
3193 * Consider the group unbalanced when the imbalance is larger
3194 * than the average weight of two tasks.
3195 *
3196 * APZ: with cgroup the avg task weight can vary wildly and
3197 * might not be a suitable number - should we keep a
3198 * normalized nr_running number somewhere that negates
3199 * the hierarchy?
3200 */
3201 avg_load_per_task = sg_div_cpu_power(group,
3202 sum_avg_load_per_task * SCHED_LOAD_SCALE);
3203
3204 if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
3019 __group_imb = 1; 3205 __group_imb = 1;
3020 3206
3021 group_capacity = group->__cpu_power / SCHED_LOAD_SCALE; 3207 group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
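find_busiest_group() now derives a per-group average task weight (the summed cpu_avg_load_per_task() values, scaled by SCHED_LOAD_SCALE and divided by the group's cpu_power) and flags the group imbalanced when the spread between its busiest and idlest CPU exceeds twice that average, instead of the old fixed SCHED_LOAD_SCALE threshold. A standalone numeric sketch (all values invented):

#include <stdio.h>

#define SCHED_LOAD_SCALE 1024UL

int main(void)
{
	/* A two-CPU group at default power, each CPU averaging ~512 per task. */
	unsigned long group_power = 2 * SCHED_LOAD_SCALE;
	unsigned long sum_avg_load_per_task = 512 + 512;
	unsigned long max_cpu_load = 3072, min_cpu_load = 512;

	/* Models sg_div_cpu_power(group, x) as x / group_power. */
	unsigned long avg_load_per_task =
		sum_avg_load_per_task * SCHED_LOAD_SCALE / group_power;

	int group_imb = (max_cpu_load - min_cpu_load) > 2 * avg_load_per_task;

	printf("avg_load_per_task=%lu group_imb=%d\n",
	       avg_load_per_task, group_imb);
	return 0;
}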
@@ -3156,9 +3342,9 @@ small_imbalance:
3156 if (busiest_load_per_task > this_load_per_task) 3342 if (busiest_load_per_task > this_load_per_task)
3157 imbn = 1; 3343 imbn = 1;
3158 } else 3344 } else
3159 this_load_per_task = SCHED_LOAD_SCALE; 3345 this_load_per_task = cpu_avg_load_per_task(this_cpu);
3160 3346
3161 if (max_load - this_load + SCHED_LOAD_SCALE_FUZZ >= 3347 if (max_load - this_load + 2*busiest_load_per_task >=
3162 busiest_load_per_task * imbn) { 3348 busiest_load_per_task * imbn) {
3163 *imbalance = busiest_load_per_task; 3349 *imbalance = busiest_load_per_task;
3164 return busiest; 3350 return busiest;
@@ -3228,7 +3414,7 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
3228 unsigned long max_load = 0; 3414 unsigned long max_load = 0;
3229 int i; 3415 int i;
3230 3416
3231 for_each_cpu_mask(i, group->cpumask) { 3417 for_each_cpu_mask_nr(i, group->cpumask) {
3232 unsigned long wl; 3418 unsigned long wl;
3233 3419
3234 if (!cpu_isset(i, *cpus)) 3420 if (!cpu_isset(i, *cpus))
@@ -3284,6 +3470,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
3284 schedstat_inc(sd, lb_count[idle]); 3470 schedstat_inc(sd, lb_count[idle]);
3285 3471
3286redo: 3472redo:
3473 update_shares(sd);
3287 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, 3474 group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
3288 cpus, balance); 3475 cpus, balance);
3289 3476
@@ -3386,8 +3573,9 @@ redo:
3386 3573
3387 if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER && 3574 if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
3388 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) 3575 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
3389 return -1; 3576 ld_moved = -1;
3390 return ld_moved; 3577
3578 goto out;
3391 3579
3392out_balanced: 3580out_balanced:
3393 schedstat_inc(sd, lb_balanced[idle]); 3581 schedstat_inc(sd, lb_balanced[idle]);
@@ -3402,8 +3590,13 @@ out_one_pinned:
3402 3590
3403 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && 3591 if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
3404 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) 3592 !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
3405 return -1; 3593 ld_moved = -1;
3406 return 0; 3594 else
3595 ld_moved = 0;
3596out:
3597 if (ld_moved)
3598 update_shares(sd);
3599 return ld_moved;
3407} 3600}
3408 3601
3409/* 3602/*
@@ -3438,6 +3631,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd,
3438 3631
3439 schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]); 3632 schedstat_inc(sd, lb_count[CPU_NEWLY_IDLE]);
3440redo: 3633redo:
3634 update_shares_locked(this_rq, sd);
3441 group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE, 3635 group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE,
3442 &sd_idle, cpus, NULL); 3636 &sd_idle, cpus, NULL);
3443 if (!group) { 3637 if (!group) {
@@ -3464,7 +3658,7 @@ redo:
3464 ld_moved = move_tasks(this_rq, this_cpu, busiest, 3658 ld_moved = move_tasks(this_rq, this_cpu, busiest,
3465 imbalance, sd, CPU_NEWLY_IDLE, 3659 imbalance, sd, CPU_NEWLY_IDLE,
3466 &all_pinned); 3660 &all_pinned);
3467 spin_unlock(&busiest->lock); 3661 double_unlock_balance(this_rq, busiest);
3468 3662
3469 if (unlikely(all_pinned)) { 3663 if (unlikely(all_pinned)) {
3470 cpu_clear(cpu_of(busiest), *cpus); 3664 cpu_clear(cpu_of(busiest), *cpus);
@@ -3481,6 +3675,7 @@ redo:
3481 } else 3675 } else
3482 sd->nr_balance_failed = 0; 3676 sd->nr_balance_failed = 0;
3483 3677
3678 update_shares_locked(this_rq, sd);
3484 return ld_moved; 3679 return ld_moved;
3485 3680
3486out_balanced: 3681out_balanced:
@@ -3578,7 +3773,7 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
3578 else 3773 else
3579 schedstat_inc(sd, alb_failed); 3774 schedstat_inc(sd, alb_failed);
3580 } 3775 }
3581 spin_unlock(&target_rq->lock); 3776 double_unlock_balance(busiest_rq, target_rq);
3582} 3777}
3583 3778
3584#ifdef CONFIG_NO_HZ 3779#ifdef CONFIG_NO_HZ
@@ -3621,7 +3816,7 @@ int select_nohz_load_balancer(int stop_tick)
3621 /* 3816 /*
3622 * If we are going offline and still the leader, give up! 3817 * If we are going offline and still the leader, give up!
3623 */ 3818 */
3624 if (cpu_is_offline(cpu) && 3819 if (!cpu_active(cpu) &&
3625 atomic_read(&nohz.load_balancer) == cpu) { 3820 atomic_read(&nohz.load_balancer) == cpu) {
3626 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) 3821 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
3627 BUG(); 3822 BUG();
@@ -3672,6 +3867,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
3672 /* Earliest time when we have to do rebalance again */ 3867 /* Earliest time when we have to do rebalance again */
3673 unsigned long next_balance = jiffies + 60*HZ; 3868 unsigned long next_balance = jiffies + 60*HZ;
3674 int update_next_balance = 0; 3869 int update_next_balance = 0;
3870 int need_serialize;
3675 cpumask_t tmp; 3871 cpumask_t tmp;
3676 3872
3677 for_each_domain(cpu, sd) { 3873 for_each_domain(cpu, sd) {
@@ -3689,8 +3885,9 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
3689 if (interval > HZ*NR_CPUS/10) 3885 if (interval > HZ*NR_CPUS/10)
3690 interval = HZ*NR_CPUS/10; 3886 interval = HZ*NR_CPUS/10;
3691 3887
3888 need_serialize = sd->flags & SD_SERIALIZE;
3692 3889
3693 if (sd->flags & SD_SERIALIZE) { 3890 if (need_serialize) {
3694 if (!spin_trylock(&balancing)) 3891 if (!spin_trylock(&balancing))
3695 goto out; 3892 goto out;
3696 } 3893 }
@@ -3706,7 +3903,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
3706 } 3903 }
3707 sd->last_balance = jiffies; 3904 sd->last_balance = jiffies;
3708 } 3905 }
3709 if (sd->flags & SD_SERIALIZE) 3906 if (need_serialize)
3710 spin_unlock(&balancing); 3907 spin_unlock(&balancing);
3711out: 3908out:
3712 if (time_after(next_balance, sd->last_balance + interval)) { 3909 if (time_after(next_balance, sd->last_balance + interval)) {
@@ -3759,7 +3956,7 @@ static void run_rebalance_domains(struct softirq_action *h)
3759 int balance_cpu; 3956 int balance_cpu;
3760 3957
3761 cpu_clear(this_cpu, cpus); 3958 cpu_clear(this_cpu, cpus);
3762 for_each_cpu_mask(balance_cpu, cpus) { 3959 for_each_cpu_mask_nr(balance_cpu, cpus) {
3763 /* 3960 /*
3764 * If this cpu gets work to do, stop the load balancing 3961 * If this cpu gets work to do, stop the load balancing
3765 * work being done for other cpus. Next load 3962 * work being done for other cpus. Next load
@@ -3895,6 +4092,8 @@ void account_user_time(struct task_struct *p, cputime_t cputime)
3895 cpustat->nice = cputime64_add(cpustat->nice, tmp); 4092 cpustat->nice = cputime64_add(cpustat->nice, tmp);
3896 else 4093 else
3897 cpustat->user = cputime64_add(cpustat->user, tmp); 4094 cpustat->user = cputime64_add(cpustat->user, tmp);
4095 /* Account for user time used */
4096 acct_update_integrals(p);
3898} 4097}
3899 4098
3900/* 4099/*
@@ -3995,6 +4194,65 @@ void account_steal_time(struct task_struct *p, cputime_t steal)
3995} 4194}
3996 4195
3997/* 4196/*
4197 * Use precise platform statistics if available:
4198 */
4199#ifdef CONFIG_VIRT_CPU_ACCOUNTING
4200cputime_t task_utime(struct task_struct *p)
4201{
4202 return p->utime;
4203}
4204
4205cputime_t task_stime(struct task_struct *p)
4206{
4207 return p->stime;
4208}
4209#else
4210cputime_t task_utime(struct task_struct *p)
4211{
4212 clock_t utime = cputime_to_clock_t(p->utime),
4213 total = utime + cputime_to_clock_t(p->stime);
4214 u64 temp;
4215
4216 /*
4217 * Use CFS's precise accounting:
4218 */
4219 temp = (u64)nsec_to_clock_t(p->se.sum_exec_runtime);
4220
4221 if (total) {
4222 temp *= utime;
4223 do_div(temp, total);
4224 }
4225 utime = (clock_t)temp;
4226
4227 p->prev_utime = max(p->prev_utime, clock_t_to_cputime(utime));
4228 return p->prev_utime;
4229}
4230
4231cputime_t task_stime(struct task_struct *p)
4232{
4233 clock_t stime;
4234
4235 /*
4236 * Use CFS's precise accounting. (we subtract utime from
4237 * the total, to make sure the total observed by userspace
4238 * grows monotonically - apps rely on that):
4239 */
4240 stime = nsec_to_clock_t(p->se.sum_exec_runtime) -
4241 cputime_to_clock_t(task_utime(p));
4242
4243 if (stime >= 0)
4244 p->prev_stime = max(p->prev_stime, clock_t_to_cputime(stime));
4245
4246 return p->prev_stime;
4247}
4248#endif
4249
4250inline cputime_t task_gtime(struct task_struct *p)
4251{
4252 return p->gtime;
4253}
4254
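Without CONFIG_VIRT_CPU_ACCOUNTING, the new task_utime()/task_stime() split CFS's precise sum_exec_runtime between user and system time in the ratio of the sampled tick-based utime:stime, and the prev_utime/prev_stime maxima keep the values reported to userspace monotonic. A standalone model of that split (numbers invented):

#include <stdio.h>

int main(void)
{
	unsigned long long sum_exec = 1000;	/* precise runtime, in clock_t */
	unsigned long long utime_ticks = 30, stime_ticks = 10;
	unsigned long long prev_utime = 0, prev_stime = 0;

	unsigned long long total = utime_ticks + stime_ticks;
	unsigned long long utime = total ? sum_exec * utime_ticks / total
					 : sum_exec;
	unsigned long long stime = sum_exec - utime;

	/* Monotonicity: never report less than a previous reading. */
	if (utime > prev_utime)
		prev_utime = utime;
	if (stime > prev_stime)
		prev_stime = stime;

	printf("utime=%llu stime=%llu of %llu\n",
	       prev_utime, prev_stime, sum_exec);
	return 0;
}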
4255/*
3998 * This function gets called by the timer code, with HZ frequency. 4256 * This function gets called by the timer code, with HZ frequency.
3999 * We call it with interrupts disabled. 4257 * We call it with interrupts disabled.
4000 * 4258 *
@@ -4021,26 +4279,44 @@ void scheduler_tick(void)
4021#endif 4279#endif
4022} 4280}
4023 4281
4024#if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT) 4282#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
4283 defined(CONFIG_PREEMPT_TRACER))
4284
4285static inline unsigned long get_parent_ip(unsigned long addr)
4286{
4287 if (in_lock_functions(addr)) {
4288 addr = CALLER_ADDR2;
4289 if (in_lock_functions(addr))
4290 addr = CALLER_ADDR3;
4291 }
4292 return addr;
4293}
4025 4294
4026void __kprobes add_preempt_count(int val) 4295void __kprobes add_preempt_count(int val)
4027{ 4296{
4297#ifdef CONFIG_DEBUG_PREEMPT
4028 /* 4298 /*
4029 * Underflow? 4299 * Underflow?
4030 */ 4300 */
4031 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) 4301 if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
4032 return; 4302 return;
4303#endif
4033 preempt_count() += val; 4304 preempt_count() += val;
4305#ifdef CONFIG_DEBUG_PREEMPT
4034 /* 4306 /*
4035 * Spinlock count overflowing soon? 4307 * Spinlock count overflowing soon?
4036 */ 4308 */
4037 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= 4309 DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
4038 PREEMPT_MASK - 10); 4310 PREEMPT_MASK - 10);
4311#endif
4312 if (preempt_count() == val)
4313 trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
4039} 4314}
4040EXPORT_SYMBOL(add_preempt_count); 4315EXPORT_SYMBOL(add_preempt_count);
4041 4316
4042void __kprobes sub_preempt_count(int val) 4317void __kprobes sub_preempt_count(int val)
4043{ 4318{
4319#ifdef CONFIG_DEBUG_PREEMPT
4044 /* 4320 /*
4045 * Underflow? 4321 * Underflow?
4046 */ 4322 */
@@ -4052,7 +4328,10 @@ void __kprobes sub_preempt_count(int val)
4052 if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && 4328 if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
4053 !(preempt_count() & PREEMPT_MASK))) 4329 !(preempt_count() & PREEMPT_MASK)))
4054 return; 4330 return;
4331#endif
4055 4332
4333 if (preempt_count() == val)
4334 trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
4056 preempt_count() -= val; 4335 preempt_count() -= val;
4057} 4336}
4058EXPORT_SYMBOL(sub_preempt_count); 4337EXPORT_SYMBOL(sub_preempt_count);
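The new trace hooks only fire on the outermost transition: add_preempt_count() calls trace_preempt_off() when the addition is what took the count from zero, and sub_preempt_count() calls trace_preempt_on() just before the count drops back to zero. A standalone model of the nesting behaviour (the real kernel uses the preempt_count() macro and the latency tracer, not printf):

#include <stdio.h>

static int preempt_count;

static void add_preempt_count(int val)
{
	preempt_count += val;
	if (preempt_count == val)
		printf("trace_preempt_off()\n");	/* outermost disable */
}

static void sub_preempt_count(int val)
{
	if (preempt_count == val)
		printf("trace_preempt_on()\n");		/* outermost enable */
	preempt_count -= val;
}

int main(void)
{
	add_preempt_count(1);	/* traces off */
	add_preempt_count(1);	/* nested: silent */
	sub_preempt_count(1);	/* nested: silent */
	sub_preempt_count(1);	/* traces on */
	return 0;
}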
@@ -4070,6 +4349,7 @@ static noinline void __schedule_bug(struct task_struct *prev)
4070 prev->comm, prev->pid, preempt_count()); 4349 prev->comm, prev->pid, preempt_count());
4071 4350
4072 debug_show_held_locks(prev); 4351 debug_show_held_locks(prev);
4352 print_modules();
4073 if (irqs_disabled()) 4353 if (irqs_disabled())
4074 print_irqtrace_events(prev); 4354 print_irqtrace_events(prev);
4075 4355
@@ -4158,7 +4438,8 @@ need_resched_nonpreemptible:
4158 4438
4159 schedule_debug(prev); 4439 schedule_debug(prev);
4160 4440
4161 hrtick_clear(rq); 4441 if (sched_feat(HRTICK))
4442 hrtick_clear(rq);
4162 4443
4163 /* 4444 /*
4164 * Do the rq-clock update outside the rq lock: 4445 * Do the rq-clock update outside the rq lock:
@@ -4204,8 +4485,6 @@ need_resched_nonpreemptible:
4204 } else 4485 } else
4205 spin_unlock_irq(&rq->lock); 4486 spin_unlock_irq(&rq->lock);
4206 4487
4207 hrtick_set(rq);
4208
4209 if (unlikely(reacquire_kernel_lock(current) < 0)) 4488 if (unlikely(reacquire_kernel_lock(current) < 0))
4210 goto need_resched_nonpreemptible; 4489 goto need_resched_nonpreemptible;
4211 4490
@@ -4363,6 +4642,15 @@ __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
4363} 4642}
4364EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ 4643EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
4365 4644
4645/**
4646 * complete: - signals a single thread waiting on this completion
4647 * @x: holds the state of this particular completion
4648 *
4649 * This will wake up a single thread waiting on this completion. Threads will be
4650 * awakened in the same order in which they were queued.
4651 *
4652 * See also complete_all(), wait_for_completion() and related routines.
4653 */
4366void complete(struct completion *x) 4654void complete(struct completion *x)
4367{ 4655{
4368 unsigned long flags; 4656 unsigned long flags;
@@ -4374,6 +4662,12 @@ void complete(struct completion *x)
4374} 4662}
4375EXPORT_SYMBOL(complete); 4663EXPORT_SYMBOL(complete);
4376 4664
4665/**
4666 * complete_all: - signals all threads waiting on this completion
4667 * @x: holds the state of this particular completion
4668 *
4669 * This will wake up all threads waiting on this particular completion event.
4670 */
4377void complete_all(struct completion *x) 4671void complete_all(struct completion *x)
4378{ 4672{
4379 unsigned long flags; 4673 unsigned long flags;
@@ -4394,10 +4688,7 @@ do_wait_for_common(struct completion *x, long timeout, int state)
4394 wait.flags |= WQ_FLAG_EXCLUSIVE; 4688 wait.flags |= WQ_FLAG_EXCLUSIVE;
4395 __add_wait_queue_tail(&x->wait, &wait); 4689 __add_wait_queue_tail(&x->wait, &wait);
4396 do { 4690 do {
4397 if ((state == TASK_INTERRUPTIBLE && 4691 if (signal_pending_state(state, current)) {
4398 signal_pending(current)) ||
4399 (state == TASK_KILLABLE &&
4400 fatal_signal_pending(current))) {
4401 timeout = -ERESTARTSYS; 4692 timeout = -ERESTARTSYS;
4402 break; 4693 break;
4403 } 4694 }
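The two open-coded checks collapse into signal_pending_state(): an interruptible sleeper aborts the wait on any pending signal, a killable one only on a fatal signal. A standalone model of that distinction (the flag values and struct are invented, not the kernel's):

#include <stdio.h>

#define TASK_INTERRUPTIBLE	0x01
#define TASK_KILLABLE		0x02

struct task {
	int signal_pending;
	int fatal_signal_pending;
};

static int signal_pending_state(int state, const struct task *t)
{
	if (state & TASK_INTERRUPTIBLE)
		return t->signal_pending;
	if (state & TASK_KILLABLE)
		return t->fatal_signal_pending;
	return 0;
}

int main(void)
{
	struct task t = { .signal_pending = 1, .fatal_signal_pending = 0 };

	printf("interruptible: %d, killable: %d\n",
	       signal_pending_state(TASK_INTERRUPTIBLE, &t),
	       signal_pending_state(TASK_KILLABLE, &t));
	return 0;
}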
@@ -4425,12 +4716,31 @@ wait_for_common(struct completion *x, long timeout, int state)
4425 return timeout; 4716 return timeout;
4426} 4717}
4427 4718
4719/**
4720 * wait_for_completion: - waits for completion of a task
4721 * @x: holds the state of this particular completion
4722 *
4723 * This waits to be signaled for completion of a specific task. It is NOT
4724 * interruptible and there is no timeout.
4725 *
4726 * See also similar routines (i.e. wait_for_completion_timeout()) with timeout
4727 * and interrupt capability. Also see complete().
4728 */
4428void __sched wait_for_completion(struct completion *x) 4729void __sched wait_for_completion(struct completion *x)
4429{ 4730{
4430 wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE); 4731 wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
4431} 4732}
4432EXPORT_SYMBOL(wait_for_completion); 4733EXPORT_SYMBOL(wait_for_completion);
4433 4734
4735/**
4736 * wait_for_completion_timeout: - waits for completion of a task (w/timeout)
4737 * @x: holds the state of this particular completion
4738 * @timeout: timeout value in jiffies
4739 *
4740 * This waits for either a completion of a specific task to be signaled or for a
4741 * specified timeout to expire. The timeout is in jiffies. It is not
4742 * interruptible.
4743 */
4434unsigned long __sched 4744unsigned long __sched
4435wait_for_completion_timeout(struct completion *x, unsigned long timeout) 4745wait_for_completion_timeout(struct completion *x, unsigned long timeout)
4436{ 4746{
@@ -4438,6 +4748,13 @@ wait_for_completion_timeout(struct completion *x, unsigned long timeout)
4438} 4748}
4439EXPORT_SYMBOL(wait_for_completion_timeout); 4749EXPORT_SYMBOL(wait_for_completion_timeout);
4440 4750
4751/**
4752 * wait_for_completion_interruptible: - waits for completion of a task (w/intr)
4753 * @x: holds the state of this particular completion
4754 *
4755 * This waits for completion of a specific task to be signaled. It is
4756 * interruptible.
4757 */
4441int __sched wait_for_completion_interruptible(struct completion *x) 4758int __sched wait_for_completion_interruptible(struct completion *x)
4442{ 4759{
4443 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE); 4760 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);
@@ -4447,6 +4764,14 @@ int __sched wait_for_completion_interruptible(struct completion *x)
4447} 4764}
4448EXPORT_SYMBOL(wait_for_completion_interruptible); 4765EXPORT_SYMBOL(wait_for_completion_interruptible);
4449 4766
4767/**
4768 * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr))
4769 * @x: holds the state of this particular completion
4770 * @timeout: timeout value in jiffies
4771 *
4772 * This waits for either a completion of a specific task to be signaled or for a
4773 * specified timeout to expire. It is interruptible. The timeout is in jiffies.
4774 */
4450unsigned long __sched 4775unsigned long __sched
4451wait_for_completion_interruptible_timeout(struct completion *x, 4776wait_for_completion_interruptible_timeout(struct completion *x,
4452 unsigned long timeout) 4777 unsigned long timeout)
@@ -4455,6 +4780,13 @@ wait_for_completion_interruptible_timeout(struct completion *x,
4455} 4780}
4456EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); 4781EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
4457 4782
4783/**
4784 * wait_for_completion_killable: - waits for completion of a task (killable)
4785 * @x: holds the state of this particular completion
4786 *
4787 * This waits to be signaled for completion of a specific task. It can be
4788 * interrupted by a kill signal.
4789 */
4458int __sched wait_for_completion_killable(struct completion *x) 4790int __sched wait_for_completion_killable(struct completion *x)
4459{ 4791{
4460 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE); 4792 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE);
@@ -4464,6 +4796,52 @@ int __sched wait_for_completion_killable(struct completion *x)
4464} 4796}
4465EXPORT_SYMBOL(wait_for_completion_killable); 4797EXPORT_SYMBOL(wait_for_completion_killable);
4466 4798
4799/**
4800 * try_wait_for_completion - try to decrement a completion without blocking
4801 * @x: completion structure
4802 *
4803 * Returns: 0 if a decrement cannot be done without blocking
4804 * 1 if a decrement succeeded.
4805 *
4806 * If a completion is being used as a counting completion,
4807 * attempt to decrement the counter without blocking. This
4808 * enables us to avoid waiting if the resource the completion
4809 * is protecting is not available.
4810 */
4811bool try_wait_for_completion(struct completion *x)
4812{
4813 int ret = 1;
4814
4815 spin_lock_irq(&x->wait.lock);
4816 if (!x->done)
4817 ret = 0;
4818 else
4819 x->done--;
4820 spin_unlock_irq(&x->wait.lock);
4821 return ret;
4822}
4823EXPORT_SYMBOL(try_wait_for_completion);
4824
4825/**
4826 * completion_done - Test to see if a completion has any waiters
4827 * @x: completion structure
4828 *
4829 * Returns: 0 if there are waiters (wait_for_completion() in progress)
4830 * 1 if there are no waiters.
4831 *
4832 */
4833bool completion_done(struct completion *x)
4834{
4835 int ret = 1;
4836
4837 spin_lock_irq(&x->wait.lock);
4838 if (!x->done)
4839 ret = 0;
4840 spin_unlock_irq(&x->wait.lock);
4841 return ret;
4842}
4843EXPORT_SYMBOL(completion_done);
4844
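Taken together with the kerneldoc added above, the completion API now covers blocking, timed and non-blocking use. A kernel-context sketch (the completion name, the one-second timeout and the error choice are illustrative, not from the patch):

#include <linux/completion.h>
#include <linux/errno.h>
#include <linux/jiffies.h>

static DECLARE_COMPLETION(my_done);

static void producer(void)
{
	/* ... make the resource available ... */
	complete(&my_done);			/* wakes exactly one waiter */
}

static int consumer(void)
{
	if (try_wait_for_completion(&my_done))
		return 0;			/* already signalled, no sleep */

	/* Block, but give up after roughly one second. */
	if (!wait_for_completion_timeout(&my_done, HZ))
		return -ETIMEDOUT;

	return 0;
}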
4467static long __sched 4845static long __sched
4468sleep_on_common(wait_queue_head_t *q, int state, long timeout) 4846sleep_on_common(wait_queue_head_t *q, int state, long timeout)
4469{ 4847{
@@ -4586,10 +4964,8 @@ void set_user_nice(struct task_struct *p, long nice)
4586 goto out_unlock; 4964 goto out_unlock;
4587 } 4965 }
4588 on_rq = p->se.on_rq; 4966 on_rq = p->se.on_rq;
4589 if (on_rq) { 4967 if (on_rq)
4590 dequeue_task(rq, p, 0); 4968 dequeue_task(rq, p, 0);
4591 dec_load(rq, p);
4592 }
4593 4969
4594 p->static_prio = NICE_TO_PRIO(nice); 4970 p->static_prio = NICE_TO_PRIO(nice);
4595 set_load_weight(p); 4971 set_load_weight(p);
@@ -4599,7 +4975,6 @@ void set_user_nice(struct task_struct *p, long nice)
4599 4975
4600 if (on_rq) { 4976 if (on_rq) {
4601 enqueue_task(rq, p, 0); 4977 enqueue_task(rq, p, 0);
4602 inc_load(rq, p);
4603 /* 4978 /*
4604 * If the task increased its priority or is running and 4979 * If the task increased its priority or is running and
4605 * lowered its priority, then reschedule its CPU: 4980 * lowered its priority, then reschedule its CPU:
@@ -4744,16 +5119,8 @@ __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
4744 set_load_weight(p); 5119 set_load_weight(p);
4745} 5120}
4746 5121
4747/** 5122static int __sched_setscheduler(struct task_struct *p, int policy,
4748 * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. 5123 struct sched_param *param, bool user)
4749 * @p: the task in question.
4750 * @policy: new policy.
4751 * @param: structure containing the new RT priority.
4752 *
4753 * NOTE that the task may be already dead.
4754 */
4755int sched_setscheduler(struct task_struct *p, int policy,
4756 struct sched_param *param)
4757{ 5124{
4758 int retval, oldprio, oldpolicy = -1, on_rq, running; 5125 int retval, oldprio, oldpolicy = -1, on_rq, running;
4759 unsigned long flags; 5126 unsigned long flags;
@@ -4785,7 +5152,7 @@ recheck:
4785 /* 5152 /*
4786 * Allow unprivileged RT tasks to decrease priority: 5153 * Allow unprivileged RT tasks to decrease priority:
4787 */ 5154 */
4788 if (!capable(CAP_SYS_NICE)) { 5155 if (user && !capable(CAP_SYS_NICE)) {
4789 if (rt_policy(policy)) { 5156 if (rt_policy(policy)) {
4790 unsigned long rlim_rtprio; 5157 unsigned long rlim_rtprio;
4791 5158
@@ -4816,18 +5183,22 @@ recheck:
4816 return -EPERM; 5183 return -EPERM;
4817 } 5184 }
4818 5185
5186 if (user) {
4819#ifdef CONFIG_RT_GROUP_SCHED 5187#ifdef CONFIG_RT_GROUP_SCHED
4820 /* 5188 /*
4821 * Do not allow realtime tasks into groups that have no runtime 5189 * Do not allow realtime tasks into groups that have no runtime
4822 * assigned. 5190 * assigned.
4823 */ 5191 */
4824 if (rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0) 5192 if (rt_bandwidth_enabled() && rt_policy(policy) &&
4825 return -EPERM; 5193 task_group(p)->rt_bandwidth.rt_runtime == 0)
5194 return -EPERM;
4826#endif 5195#endif
4827 5196
4828 retval = security_task_setscheduler(p, policy, param); 5197 retval = security_task_setscheduler(p, policy, param);
4829 if (retval) 5198 if (retval)
4830 return retval; 5199 return retval;
5200 }
5201
4831 /* 5202 /*
4832 * make sure no PI-waiters arrive (or leave) while we are 5203 * make sure no PI-waiters arrive (or leave) while we are
4833 * changing the priority of the task: 5204 * changing the priority of the task:
@@ -4870,8 +5241,39 @@ recheck:
4870 5241
4871 return 0; 5242 return 0;
4872} 5243}
5244
5245/**
5246 * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
5247 * @p: the task in question.
5248 * @policy: new policy.
5249 * @param: structure containing the new RT priority.
5250 *
5251 * NOTE that the task may be already dead.
5252 */
5253int sched_setscheduler(struct task_struct *p, int policy,
5254 struct sched_param *param)
5255{
5256 return __sched_setscheduler(p, policy, param, true);
5257}
4873EXPORT_SYMBOL_GPL(sched_setscheduler); 5258EXPORT_SYMBOL_GPL(sched_setscheduler);
4874 5259
5260/**
5261 * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
5262 * @p: the task in question.
5263 * @policy: new policy.
5264 * @param: structure containing the new RT priority.
5265 *
5266 * Just like sched_setscheduler, only don't bother checking if the
5267 * current context has permission. For example, this is needed in
5268 * stop_machine(): we create temporary high priority worker threads,
5269 * but our caller might not have that capability.
5270 */
5271int sched_setscheduler_nocheck(struct task_struct *p, int policy,
5272 struct sched_param *param)
5273{
5274 return __sched_setscheduler(p, policy, param, false);
5275}
5276
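sched_setscheduler_nocheck() exists for callers like the sketch below: kernel code that creates its own worker thread and wants it to run as a high-priority FIFO task without tripping over the CAP_SYS_NICE and security checks. The worker function, the helper and the thread name are invented; the scheduler and kthread calls are existing APIs:

#include <linux/err.h>
#include <linux/kthread.h>
#include <linux/sched.h>

static int my_worker_fn(void *data)
{
	/* ... worker loop ... */
	return 0;
}

static struct task_struct *start_rt_worker(void)
{
	struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
	struct task_struct *t;

	t = kthread_create(my_worker_fn, NULL, "my-rt-worker");
	if (!IS_ERR(t))
		sched_setscheduler_nocheck(t, SCHED_FIFO, &param);

	/* Caller would wake_up_process(t) once it is ready to run. */
	return t;
}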
4875static int 5277static int
4876do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) 5278do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
4877{ 5279{
@@ -5070,24 +5472,6 @@ asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len,
5070 return sched_setaffinity(pid, &new_mask); 5472 return sched_setaffinity(pid, &new_mask);
5071} 5473}
5072 5474
5073/*
5074 * Represents all cpu's present in the system
5075 * In systems capable of hotplug, this map could dynamically grow
5076 * as new cpu's are detected in the system via any platform specific
5077 * method, such as ACPI for e.g.
5078 */
5079
5080cpumask_t cpu_present_map __read_mostly;
5081EXPORT_SYMBOL(cpu_present_map);
5082
5083#ifndef CONFIG_SMP
5084cpumask_t cpu_online_map __read_mostly = CPU_MASK_ALL;
5085EXPORT_SYMBOL(cpu_online_map);
5086
5087cpumask_t cpu_possible_map __read_mostly = CPU_MASK_ALL;
5088EXPORT_SYMBOL(cpu_possible_map);
5089#endif
5090
5091long sched_getaffinity(pid_t pid, cpumask_t *mask) 5475long sched_getaffinity(pid_t pid, cpumask_t *mask)
5092{ 5476{
5093 struct task_struct *p; 5477 struct task_struct *p;
@@ -5384,7 +5768,7 @@ out_unlock:
5384 return retval; 5768 return retval;
5385} 5769}
5386 5770
5387static const char stat_nam[] = "RSDTtZX"; 5771static const char stat_nam[] = TASK_STATE_TO_CHAR_STR;
5388 5772
5389void sched_show_task(struct task_struct *p) 5773void sched_show_task(struct task_struct *p)
5390{ 5774{
@@ -5525,6 +5909,8 @@ static inline void sched_init_granularity(void)
5525 sysctl_sched_latency = limit; 5909 sysctl_sched_latency = limit;
5526 5910
5527 sysctl_sched_wakeup_granularity *= factor; 5911 sysctl_sched_wakeup_granularity *= factor;
5912
5913 sysctl_sched_shares_ratelimit *= factor;
5528} 5914}
5529 5915
5530#ifdef CONFIG_SMP 5916#ifdef CONFIG_SMP
@@ -5566,6 +5952,12 @@ int set_cpus_allowed_ptr(struct task_struct *p, const cpumask_t *new_mask)
5566 goto out; 5952 goto out;
5567 } 5953 }
5568 5954
5955 if (unlikely((p->flags & PF_THREAD_BOUND) && p != current &&
5956 !cpus_equal(p->cpus_allowed, *new_mask))) {
5957 ret = -EINVAL;
5958 goto out;
5959 }
5960
5569 if (p->sched_class->set_cpus_allowed) 5961 if (p->sched_class->set_cpus_allowed)
5570 p->sched_class->set_cpus_allowed(p, new_mask); 5962 p->sched_class->set_cpus_allowed(p, new_mask);
5571 else { 5963 else {
@@ -5608,7 +6000,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
5608 struct rq *rq_dest, *rq_src; 6000 struct rq *rq_dest, *rq_src;
5609 int ret = 0, on_rq; 6001 int ret = 0, on_rq;
5610 6002
5611 if (unlikely(cpu_is_offline(dest_cpu))) 6003 if (unlikely(!cpu_active(dest_cpu)))
5612 return ret; 6004 return ret;
5613 6005
5614 rq_src = cpu_rq(src_cpu); 6006 rq_src = cpu_rq(src_cpu);
@@ -5617,10 +6009,10 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
5617 double_rq_lock(rq_src, rq_dest); 6009 double_rq_lock(rq_src, rq_dest);
5618 /* Already moved. */ 6010 /* Already moved. */
5619 if (task_cpu(p) != src_cpu) 6011 if (task_cpu(p) != src_cpu)
5620 goto out; 6012 goto done;
5621 /* Affinity changed (again). */ 6013 /* Affinity changed (again). */
5622 if (!cpu_isset(dest_cpu, p->cpus_allowed)) 6014 if (!cpu_isset(dest_cpu, p->cpus_allowed))
5623 goto out; 6015 goto fail;
5624 6016
5625 on_rq = p->se.on_rq; 6017 on_rq = p->se.on_rq;
5626 if (on_rq) 6018 if (on_rq)
@@ -5629,10 +6021,11 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
5629 set_task_cpu(p, dest_cpu); 6021 set_task_cpu(p, dest_cpu);
5630 if (on_rq) { 6022 if (on_rq) {
5631 activate_task(rq_dest, p, 0); 6023 activate_task(rq_dest, p, 0);
5632 check_preempt_curr(rq_dest, p); 6024 check_preempt_curr(rq_dest, p, 0);
5633 } 6025 }
6026done:
5634 ret = 1; 6027 ret = 1;
5635out: 6028fail:
5636 double_rq_unlock(rq_src, rq_dest); 6029 double_rq_unlock(rq_src, rq_dest);
5637 return ret; 6030 return ret;
5638} 6031}
@@ -5882,6 +6275,7 @@ static void migrate_dead_tasks(unsigned int dead_cpu)
5882 next = pick_next_task(rq, rq->curr); 6275 next = pick_next_task(rq, rq->curr);
5883 if (!next) 6276 if (!next)
5884 break; 6277 break;
6278 next->sched_class->put_prev_task(rq, next);
5885 migrate_dead(dead_cpu, next); 6279 migrate_dead(dead_cpu, next);
5886 6280
5887 } 6281 }
@@ -5952,7 +6346,7 @@ set_table_entry(struct ctl_table *entry,
5952static struct ctl_table * 6346static struct ctl_table *
5953sd_alloc_ctl_domain_table(struct sched_domain *sd) 6347sd_alloc_ctl_domain_table(struct sched_domain *sd)
5954{ 6348{
5955 struct ctl_table *table = sd_alloc_ctl_entry(12); 6349 struct ctl_table *table = sd_alloc_ctl_entry(13);
5956 6350
5957 if (table == NULL) 6351 if (table == NULL)
5958 return NULL; 6352 return NULL;
@@ -5980,7 +6374,9 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd)
5980 sizeof(int), 0644, proc_dointvec_minmax); 6374 sizeof(int), 0644, proc_dointvec_minmax);
5981 set_table_entry(&table[10], "flags", &sd->flags, 6375 set_table_entry(&table[10], "flags", &sd->flags,
5982 sizeof(int), 0644, proc_dointvec_minmax); 6376 sizeof(int), 0644, proc_dointvec_minmax);
5983 /* &table[11] is terminator */ 6377 set_table_entry(&table[11], "name", sd->name,
6378 CORENAME_MAX_SIZE, 0444, proc_dostring);
6379 /* &table[12] is terminator */
5984 6380
5985 return table; 6381 return table;
5986} 6382}
@@ -6053,6 +6449,36 @@ static void unregister_sched_domain_sysctl(void)
6053} 6449}
6054#endif 6450#endif
6055 6451
6452static void set_rq_online(struct rq *rq)
6453{
6454 if (!rq->online) {
6455 const struct sched_class *class;
6456
6457 cpu_set(rq->cpu, rq->rd->online);
6458 rq->online = 1;
6459
6460 for_each_class(class) {
6461 if (class->rq_online)
6462 class->rq_online(rq);
6463 }
6464 }
6465}
6466
6467static void set_rq_offline(struct rq *rq)
6468{
6469 if (rq->online) {
6470 const struct sched_class *class;
6471
6472 for_each_class(class) {
6473 if (class->rq_offline)
6474 class->rq_offline(rq);
6475 }
6476
6477 cpu_clear(rq->cpu, rq->rd->online);
6478 rq->online = 0;
6479 }
6480}
6481
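set_rq_online()/set_rq_offline() give each scheduling class a per-runqueue notification (the rq_online/rq_offline methods) when the runqueue joins or leaves its root domain, replacing the old join_domain/leave_domain hooks as the rq_attach_root() hunk further down shows. A sketch of the shape of such a class, with only the two new methods spelled out (everything named "example" is invented, and the remaining methods are elided):

static void example_rq_online(struct rq *rq)
{
	/* publish this runqueue's state to root-domain wide structures */
}

static void example_rq_offline(struct rq *rq)
{
	/* withdraw this runqueue from root-domain wide structures */
}

static const struct sched_class example_sched_class = {
	/* ... the usual enqueue/dequeue/pick_next_task methods ... */
	.rq_online	= example_rq_online,
	.rq_offline	= example_rq_offline,
};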
6056/* 6482/*
6057 * migration_call - callback that gets triggered when a CPU is added. 6483 * migration_call - callback that gets triggered when a CPU is added.
6058 * Here we can start up the necessary migration thread for the new CPU. 6484 * Here we can start up the necessary migration thread for the new CPU.
@@ -6090,7 +6516,8 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
6090 spin_lock_irqsave(&rq->lock, flags); 6516 spin_lock_irqsave(&rq->lock, flags);
6091 if (rq->rd) { 6517 if (rq->rd) {
6092 BUG_ON(!cpu_isset(cpu, rq->rd->span)); 6518 BUG_ON(!cpu_isset(cpu, rq->rd->span));
6093 cpu_set(cpu, rq->rd->online); 6519
6520 set_rq_online(rq);
6094 } 6521 }
6095 spin_unlock_irqrestore(&rq->lock, flags); 6522 spin_unlock_irqrestore(&rq->lock, flags);
6096 break; 6523 break;
@@ -6151,7 +6578,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
6151 spin_lock_irqsave(&rq->lock, flags); 6578 spin_lock_irqsave(&rq->lock, flags);
6152 if (rq->rd) { 6579 if (rq->rd) {
6153 BUG_ON(!cpu_isset(cpu, rq->rd->span)); 6580 BUG_ON(!cpu_isset(cpu, rq->rd->span));
6154 cpu_clear(cpu, rq->rd->online); 6581 set_rq_offline(rq);
6155 } 6582 }
6156 spin_unlock_irqrestore(&rq->lock, flags); 6583 spin_unlock_irqrestore(&rq->lock, flags);
6157 break; 6584 break;
@@ -6168,7 +6595,7 @@ static struct notifier_block __cpuinitdata migration_notifier = {
6168 .priority = 10 6595 .priority = 10
6169}; 6596};
6170 6597
6171void __init migration_init(void) 6598static int __init migration_init(void)
6172{ 6599{
6173 void *cpu = (void *)(long)smp_processor_id(); 6600 void *cpu = (void *)(long)smp_processor_id();
6174 int err; 6601 int err;
@@ -6178,13 +6605,38 @@ void __init migration_init(void)
6178 BUG_ON(err == NOTIFY_BAD); 6605 BUG_ON(err == NOTIFY_BAD);
6179 migration_call(&migration_notifier, CPU_ONLINE, cpu); 6606 migration_call(&migration_notifier, CPU_ONLINE, cpu);
6180 register_cpu_notifier(&migration_notifier); 6607 register_cpu_notifier(&migration_notifier);
6608
6609 return err;
6181} 6610}
6611early_initcall(migration_init);
6182#endif 6612#endif
6183 6613
6184#ifdef CONFIG_SMP 6614#ifdef CONFIG_SMP
6185 6615
6186#ifdef CONFIG_SCHED_DEBUG 6616#ifdef CONFIG_SCHED_DEBUG
6187 6617
6618static inline const char *sd_level_to_string(enum sched_domain_level lvl)
6619{
6620 switch (lvl) {
6621 case SD_LV_NONE:
6622 return "NONE";
6623 case SD_LV_SIBLING:
6624 return "SIBLING";
6625 case SD_LV_MC:
6626 return "MC";
6627 case SD_LV_CPU:
6628 return "CPU";
6629 case SD_LV_NODE:
6630 return "NODE";
6631 case SD_LV_ALLNODES:
6632 return "ALLNODES";
6633 case SD_LV_MAX:
6634 return "MAX";
6635
6636 }
6637 return "MAX";
6638}
6639
6188static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, 6640static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
6189 cpumask_t *groupmask) 6641 cpumask_t *groupmask)
6190{ 6642{
@@ -6204,7 +6656,8 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
6204 return -1; 6656 return -1;
6205 } 6657 }
6206 6658
6207 printk(KERN_CONT "span %s\n", str); 6659 printk(KERN_CONT "span %s level %s\n",
6660 str, sd_level_to_string(sd->level));
6208 6661
6209 if (!cpu_isset(cpu, sd->span)) { 6662 if (!cpu_isset(cpu, sd->span)) {
6210 printk(KERN_ERR "ERROR: domain->span does not contain " 6663 printk(KERN_ERR "ERROR: domain->span does not contain "
@@ -6288,9 +6741,9 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
6288 } 6741 }
6289 kfree(groupmask); 6742 kfree(groupmask);
6290} 6743}
6291#else 6744#else /* !CONFIG_SCHED_DEBUG */
6292# define sched_domain_debug(sd, cpu) do { } while (0) 6745# define sched_domain_debug(sd, cpu) do { } while (0)
6293#endif 6746#endif /* CONFIG_SCHED_DEBUG */
6294 6747
6295static int sd_degenerate(struct sched_domain *sd) 6748static int sd_degenerate(struct sched_domain *sd)
6296{ 6749{
@@ -6350,20 +6803,16 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
6350static void rq_attach_root(struct rq *rq, struct root_domain *rd) 6803static void rq_attach_root(struct rq *rq, struct root_domain *rd)
6351{ 6804{
6352 unsigned long flags; 6805 unsigned long flags;
6353 const struct sched_class *class;
6354 6806
6355 spin_lock_irqsave(&rq->lock, flags); 6807 spin_lock_irqsave(&rq->lock, flags);
6356 6808
6357 if (rq->rd) { 6809 if (rq->rd) {
6358 struct root_domain *old_rd = rq->rd; 6810 struct root_domain *old_rd = rq->rd;
6359 6811
6360 for (class = sched_class_highest; class; class = class->next) { 6812 if (cpu_isset(rq->cpu, old_rd->online))
6361 if (class->leave_domain) 6813 set_rq_offline(rq);
6362 class->leave_domain(rq);
6363 }
6364 6814
6365 cpu_clear(rq->cpu, old_rd->span); 6815 cpu_clear(rq->cpu, old_rd->span);
6366 cpu_clear(rq->cpu, old_rd->online);
6367 6816
6368 if (atomic_dec_and_test(&old_rd->refcount)) 6817 if (atomic_dec_and_test(&old_rd->refcount))
6369 kfree(old_rd); 6818 kfree(old_rd);
@@ -6374,12 +6823,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
6374 6823
6375 cpu_set(rq->cpu, rd->span); 6824 cpu_set(rq->cpu, rd->span);
6376 if (cpu_isset(rq->cpu, cpu_online_map)) 6825 if (cpu_isset(rq->cpu, cpu_online_map))
6377 cpu_set(rq->cpu, rd->online); 6826 set_rq_online(rq);
6378
6379 for (class = sched_class_highest; class; class = class->next) {
6380 if (class->join_domain)
6381 class->join_domain(rq);
6382 }
6383 6827
6384 spin_unlock_irqrestore(&rq->lock, flags); 6828 spin_unlock_irqrestore(&rq->lock, flags);
6385} 6829}
@@ -6390,6 +6834,8 @@ static void init_rootdomain(struct root_domain *rd)
6390 6834
6391 cpus_clear(rd->span); 6835 cpus_clear(rd->span);
6392 cpus_clear(rd->online); 6836 cpus_clear(rd->online);
6837
6838 cpupri_init(&rd->cpupri);
6393} 6839}
6394 6840
6395static void init_defrootdomain(void) 6841static void init_defrootdomain(void)
@@ -6451,7 +6897,8 @@ static cpumask_t cpu_isolated_map = CPU_MASK_NONE;
6451/* Setup the mask of cpus configured for isolated domains */ 6897/* Setup the mask of cpus configured for isolated domains */
6452static int __init isolated_cpu_setup(char *str) 6898static int __init isolated_cpu_setup(char *str)
6453{ 6899{
6454 int ints[NR_CPUS], i; 6900 static int __initdata ints[NR_CPUS];
6901 int i;
6455 6902
6456 str = get_options(str, ARRAY_SIZE(ints), ints); 6903 str = get_options(str, ARRAY_SIZE(ints), ints);
6457 cpus_clear(cpu_isolated_map); 6904 cpus_clear(cpu_isolated_map);
@@ -6485,7 +6932,7 @@ init_sched_build_groups(const cpumask_t *span, const cpumask_t *cpu_map,
6485 6932
6486 cpus_clear(*covered); 6933 cpus_clear(*covered);
6487 6934
6488 for_each_cpu_mask(i, *span) { 6935 for_each_cpu_mask_nr(i, *span) {
6489 struct sched_group *sg; 6936 struct sched_group *sg;
6490 int group = group_fn(i, cpu_map, &sg, tmpmask); 6937 int group = group_fn(i, cpu_map, &sg, tmpmask);
6491 int j; 6938 int j;
@@ -6496,7 +6943,7 @@ init_sched_build_groups(const cpumask_t *span, const cpumask_t *cpu_map,
6496 cpus_clear(sg->cpumask); 6943 cpus_clear(sg->cpumask);
6497 sg->__cpu_power = 0; 6944 sg->__cpu_power = 0;
6498 6945
6499 for_each_cpu_mask(j, *span) { 6946 for_each_cpu_mask_nr(j, *span) {
6500 if (group_fn(j, cpu_map, NULL, tmpmask) != group) 6947 if (group_fn(j, cpu_map, NULL, tmpmask) != group)
6501 continue; 6948 continue;
6502 6949
@@ -6532,9 +6979,9 @@ static int find_next_best_node(int node, nodemask_t *used_nodes)
6532 6979
6533 min_val = INT_MAX; 6980 min_val = INT_MAX;
6534 6981
6535 for (i = 0; i < MAX_NUMNODES; i++) { 6982 for (i = 0; i < nr_node_ids; i++) {
6536 /* Start at @node */ 6983 /* Start at @node */
6537 n = (node + i) % MAX_NUMNODES; 6984 n = (node + i) % nr_node_ids;
6538 6985
6539 if (!nr_cpus_node(n)) 6986 if (!nr_cpus_node(n))
6540 continue; 6987 continue;
@@ -6584,7 +7031,7 @@ static void sched_domain_node_span(int node, cpumask_t *span)
6584 cpus_or(*span, *span, *nodemask); 7031 cpus_or(*span, *span, *nodemask);
6585 } 7032 }
6586} 7033}
6587#endif 7034#endif /* CONFIG_NUMA */
6588 7035
6589int sched_smt_power_savings = 0, sched_mc_power_savings = 0; 7036int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
6590 7037
@@ -6603,7 +7050,7 @@ cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
6603 *sg = &per_cpu(sched_group_cpus, cpu); 7050 *sg = &per_cpu(sched_group_cpus, cpu);
6604 return cpu; 7051 return cpu;
6605} 7052}
6606#endif 7053#endif /* CONFIG_SCHED_SMT */
6607 7054
6608/* 7055/*
6609 * multi-core sched-domains: 7056 * multi-core sched-domains:
@@ -6611,7 +7058,7 @@ cpu_to_cpu_group(int cpu, const cpumask_t *cpu_map, struct sched_group **sg,
6611#ifdef CONFIG_SCHED_MC 7058#ifdef CONFIG_SCHED_MC
6612static DEFINE_PER_CPU(struct sched_domain, core_domains); 7059static DEFINE_PER_CPU(struct sched_domain, core_domains);
6613static DEFINE_PER_CPU(struct sched_group, sched_group_core); 7060static DEFINE_PER_CPU(struct sched_group, sched_group_core);
6614#endif 7061#endif /* CONFIG_SCHED_MC */
6615 7062
6616#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT) 7063#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
6617static int 7064static int
@@ -6696,7 +7143,7 @@ static void init_numa_sched_groups_power(struct sched_group *group_head)
6696 if (!sg) 7143 if (!sg)
6697 return; 7144 return;
6698 do { 7145 do {
6699 for_each_cpu_mask(j, sg->cpumask) { 7146 for_each_cpu_mask_nr(j, sg->cpumask) {
6700 struct sched_domain *sd; 7147 struct sched_domain *sd;
6701 7148
6702 sd = &per_cpu(phys_domains, j); 7149 sd = &per_cpu(phys_domains, j);
@@ -6713,7 +7160,7 @@ static void init_numa_sched_groups_power(struct sched_group *group_head)
6713 sg = sg->next; 7160 sg = sg->next;
6714 } while (sg != group_head); 7161 } while (sg != group_head);
6715} 7162}
6716#endif 7163#endif /* CONFIG_NUMA */
6717 7164
6718#ifdef CONFIG_NUMA 7165#ifdef CONFIG_NUMA
6719/* Free memory allocated for various sched_group structures */ 7166/* Free memory allocated for various sched_group structures */
@@ -6721,14 +7168,14 @@ static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask)
6721{ 7168{
6722 int cpu, i; 7169 int cpu, i;
6723 7170
6724 for_each_cpu_mask(cpu, *cpu_map) { 7171 for_each_cpu_mask_nr(cpu, *cpu_map) {
6725 struct sched_group **sched_group_nodes 7172 struct sched_group **sched_group_nodes
6726 = sched_group_nodes_bycpu[cpu]; 7173 = sched_group_nodes_bycpu[cpu];
6727 7174
6728 if (!sched_group_nodes) 7175 if (!sched_group_nodes)
6729 continue; 7176 continue;
6730 7177
6731 for (i = 0; i < MAX_NUMNODES; i++) { 7178 for (i = 0; i < nr_node_ids; i++) {
6732 struct sched_group *oldsg, *sg = sched_group_nodes[i]; 7179 struct sched_group *oldsg, *sg = sched_group_nodes[i];
6733 7180
6734 *nodemask = node_to_cpumask(i); 7181 *nodemask = node_to_cpumask(i);
@@ -6750,11 +7197,11 @@ next_sg:
6750 sched_group_nodes_bycpu[cpu] = NULL; 7197 sched_group_nodes_bycpu[cpu] = NULL;
6751 } 7198 }
6752} 7199}
6753#else 7200#else /* !CONFIG_NUMA */
6754static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask) 7201static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask)
6755{ 7202{
6756} 7203}
6757#endif 7204#endif /* CONFIG_NUMA */
6758 7205
6759/* 7206/*
6760 * Initialize sched groups cpu_power. 7207 * Initialize sched groups cpu_power.
@@ -6813,13 +7260,21 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
6813 * Non-inlined to reduce accumulated stack pressure in build_sched_domains() 7260 * Non-inlined to reduce accumulated stack pressure in build_sched_domains()
6814 */ 7261 */
6815 7262
7263#ifdef CONFIG_SCHED_DEBUG
7264# define SD_INIT_NAME(sd, type) sd->name = #type
7265#else
7266# define SD_INIT_NAME(sd, type) do { } while (0)
7267#endif
7268
6816#define SD_INIT(sd, type) sd_init_##type(sd) 7269#define SD_INIT(sd, type) sd_init_##type(sd)
7270
6817#define SD_INIT_FUNC(type) \ 7271#define SD_INIT_FUNC(type) \
6818static noinline void sd_init_##type(struct sched_domain *sd) \ 7272static noinline void sd_init_##type(struct sched_domain *sd) \
6819{ \ 7273{ \
6820 memset(sd, 0, sizeof(*sd)); \ 7274 memset(sd, 0, sizeof(*sd)); \
6821 *sd = SD_##type##_INIT; \ 7275 *sd = SD_##type##_INIT; \
6822 sd->level = SD_LV_##type; \ 7276 sd->level = SD_LV_##type; \
7277 SD_INIT_NAME(sd, type); \
6823} 7278}
6824 7279
6825SD_INIT_FUNC(CPU) 7280SD_INIT_FUNC(CPU)
@@ -6921,7 +7376,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
6921 /* 7376 /*
6922 * Allocate the per-node list of sched groups 7377 * Allocate the per-node list of sched groups
6923 */ 7378 */
6924 sched_group_nodes = kcalloc(MAX_NUMNODES, sizeof(struct sched_group *), 7379 sched_group_nodes = kcalloc(nr_node_ids, sizeof(struct sched_group *),
6925 GFP_KERNEL); 7380 GFP_KERNEL);
6926 if (!sched_group_nodes) { 7381 if (!sched_group_nodes) {
6927 printk(KERN_WARNING "Can not alloc sched group node list\n"); 7382 printk(KERN_WARNING "Can not alloc sched group node list\n");
@@ -6960,7 +7415,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
6960 /* 7415 /*
6961 * Set up domains for cpus specified by the cpu_map. 7416 * Set up domains for cpus specified by the cpu_map.
6962 */ 7417 */
6963 for_each_cpu_mask(i, *cpu_map) { 7418 for_each_cpu_mask_nr(i, *cpu_map) {
6964 struct sched_domain *sd = NULL, *p; 7419 struct sched_domain *sd = NULL, *p;
6965 SCHED_CPUMASK_VAR(nodemask, allmasks); 7420 SCHED_CPUMASK_VAR(nodemask, allmasks);
6966 7421
@@ -7027,7 +7482,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7027 7482
7028#ifdef CONFIG_SCHED_SMT 7483#ifdef CONFIG_SCHED_SMT
7029 /* Set up CPU (sibling) groups */ 7484 /* Set up CPU (sibling) groups */
7030 for_each_cpu_mask(i, *cpu_map) { 7485 for_each_cpu_mask_nr(i, *cpu_map) {
7031 SCHED_CPUMASK_VAR(this_sibling_map, allmasks); 7486 SCHED_CPUMASK_VAR(this_sibling_map, allmasks);
7032 SCHED_CPUMASK_VAR(send_covered, allmasks); 7487 SCHED_CPUMASK_VAR(send_covered, allmasks);
7033 7488
@@ -7044,7 +7499,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7044 7499
7045#ifdef CONFIG_SCHED_MC 7500#ifdef CONFIG_SCHED_MC
7046 /* Set up multi-core groups */ 7501 /* Set up multi-core groups */
7047 for_each_cpu_mask(i, *cpu_map) { 7502 for_each_cpu_mask_nr(i, *cpu_map) {
7048 SCHED_CPUMASK_VAR(this_core_map, allmasks); 7503 SCHED_CPUMASK_VAR(this_core_map, allmasks);
7049 SCHED_CPUMASK_VAR(send_covered, allmasks); 7504 SCHED_CPUMASK_VAR(send_covered, allmasks);
7050 7505
@@ -7060,7 +7515,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7060#endif 7515#endif
7061 7516
7062 /* Set up physical groups */ 7517 /* Set up physical groups */
7063 for (i = 0; i < MAX_NUMNODES; i++) { 7518 for (i = 0; i < nr_node_ids; i++) {
7064 SCHED_CPUMASK_VAR(nodemask, allmasks); 7519 SCHED_CPUMASK_VAR(nodemask, allmasks);
7065 SCHED_CPUMASK_VAR(send_covered, allmasks); 7520 SCHED_CPUMASK_VAR(send_covered, allmasks);
7066 7521
@@ -7084,7 +7539,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7084 send_covered, tmpmask); 7539 send_covered, tmpmask);
7085 } 7540 }
7086 7541
7087 for (i = 0; i < MAX_NUMNODES; i++) { 7542 for (i = 0; i < nr_node_ids; i++) {
7088 /* Set up node groups */ 7543 /* Set up node groups */
7089 struct sched_group *sg, *prev; 7544 struct sched_group *sg, *prev;
7090 SCHED_CPUMASK_VAR(nodemask, allmasks); 7545 SCHED_CPUMASK_VAR(nodemask, allmasks);
@@ -7111,7 +7566,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7111 goto error; 7566 goto error;
7112 } 7567 }
7113 sched_group_nodes[i] = sg; 7568 sched_group_nodes[i] = sg;
7114 for_each_cpu_mask(j, *nodemask) { 7569 for_each_cpu_mask_nr(j, *nodemask) {
7115 struct sched_domain *sd; 7570 struct sched_domain *sd;
7116 7571
7117 sd = &per_cpu(node_domains, j); 7572 sd = &per_cpu(node_domains, j);
@@ -7123,9 +7578,9 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7123 cpus_or(*covered, *covered, *nodemask); 7578 cpus_or(*covered, *covered, *nodemask);
7124 prev = sg; 7579 prev = sg;
7125 7580
7126 for (j = 0; j < MAX_NUMNODES; j++) { 7581 for (j = 0; j < nr_node_ids; j++) {
7127 SCHED_CPUMASK_VAR(notcovered, allmasks); 7582 SCHED_CPUMASK_VAR(notcovered, allmasks);
7128 int n = (i + j) % MAX_NUMNODES; 7583 int n = (i + j) % nr_node_ids;
7129 node_to_cpumask_ptr(pnodemask, n); 7584 node_to_cpumask_ptr(pnodemask, n);
7130 7585
7131 cpus_complement(*notcovered, *covered); 7586 cpus_complement(*notcovered, *covered);
@@ -7157,28 +7612,28 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7157 7612
7158 /* Calculate CPU power for physical packages and nodes */ 7613 /* Calculate CPU power for physical packages and nodes */
7159#ifdef CONFIG_SCHED_SMT 7614#ifdef CONFIG_SCHED_SMT
7160 for_each_cpu_mask(i, *cpu_map) { 7615 for_each_cpu_mask_nr(i, *cpu_map) {
7161 struct sched_domain *sd = &per_cpu(cpu_domains, i); 7616 struct sched_domain *sd = &per_cpu(cpu_domains, i);
7162 7617
7163 init_sched_groups_power(i, sd); 7618 init_sched_groups_power(i, sd);
7164 } 7619 }
7165#endif 7620#endif
7166#ifdef CONFIG_SCHED_MC 7621#ifdef CONFIG_SCHED_MC
7167 for_each_cpu_mask(i, *cpu_map) { 7622 for_each_cpu_mask_nr(i, *cpu_map) {
7168 struct sched_domain *sd = &per_cpu(core_domains, i); 7623 struct sched_domain *sd = &per_cpu(core_domains, i);
7169 7624
7170 init_sched_groups_power(i, sd); 7625 init_sched_groups_power(i, sd);
7171 } 7626 }
7172#endif 7627#endif
7173 7628
7174 for_each_cpu_mask(i, *cpu_map) { 7629 for_each_cpu_mask_nr(i, *cpu_map) {
7175 struct sched_domain *sd = &per_cpu(phys_domains, i); 7630 struct sched_domain *sd = &per_cpu(phys_domains, i);
7176 7631
7177 init_sched_groups_power(i, sd); 7632 init_sched_groups_power(i, sd);
7178 } 7633 }
7179 7634
7180#ifdef CONFIG_NUMA 7635#ifdef CONFIG_NUMA
7181 for (i = 0; i < MAX_NUMNODES; i++) 7636 for (i = 0; i < nr_node_ids; i++)
7182 init_numa_sched_groups_power(sched_group_nodes[i]); 7637 init_numa_sched_groups_power(sched_group_nodes[i]);
7183 7638
7184 if (sd_allnodes) { 7639 if (sd_allnodes) {
@@ -7191,7 +7646,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7191#endif 7646#endif
7192 7647
7193 /* Attach the domains */ 7648 /* Attach the domains */
7194 for_each_cpu_mask(i, *cpu_map) { 7649 for_each_cpu_mask_nr(i, *cpu_map) {
7195 struct sched_domain *sd; 7650 struct sched_domain *sd;
7196#ifdef CONFIG_SCHED_SMT 7651#ifdef CONFIG_SCHED_SMT
7197 sd = &per_cpu(cpu_domains, i); 7652 sd = &per_cpu(cpu_domains, i);
@@ -7236,18 +7691,6 @@ void __attribute__((weak)) arch_update_cpu_topology(void)
7236} 7691}
7237 7692
7238/* 7693/*
7239 * Free current domain masks.
7240 * Called after all cpus are attached to NULL domain.
7241 */
7242static void free_sched_domains(void)
7243{
7244 ndoms_cur = 0;
7245 if (doms_cur != &fallback_doms)
7246 kfree(doms_cur);
7247 doms_cur = &fallback_doms;
7248}
7249
7250/*
7251 * Set up scheduler domains and groups. Callers must hold the hotplug lock. 7694 * Set up scheduler domains and groups. Callers must hold the hotplug lock.
7252 * For now this just excludes isolated cpus, but could be used to 7695 * For now this just excludes isolated cpus, but could be used to
7253 * exclude other special cases in the future. 7696 * exclude other special cases in the future.
@@ -7286,7 +7729,7 @@ static void detach_destroy_domains(const cpumask_t *cpu_map)
7286 7729
7287 unregister_sched_domain_sysctl(); 7730 unregister_sched_domain_sysctl();
7288 7731
7289 for_each_cpu_mask(i, *cpu_map) 7732 for_each_cpu_mask_nr(i, *cpu_map)
7290 cpu_attach_domain(NULL, &def_root_domain, i); 7733 cpu_attach_domain(NULL, &def_root_domain, i);
7291 synchronize_sched(); 7734 synchronize_sched();
7292 arch_destroy_sched_domains(cpu_map, &tmpmask); 7735 arch_destroy_sched_domains(cpu_map, &tmpmask);
@@ -7325,30 +7768,29 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
7325 * ownership of it and will kfree it when done with it. If the caller 7768 * ownership of it and will kfree it when done with it. If the caller
7326 * failed the kmalloc call, then it can pass in doms_new == NULL, 7769 * failed the kmalloc call, then it can pass in doms_new == NULL,
7327 * and partition_sched_domains() will fallback to the single partition 7770 * and partition_sched_domains() will fallback to the single partition
 7328 * 'fallback_doms'. 7771 * 'fallback_doms'; it also forces the domains to be rebuilt.
7772 *
7773 * If doms_new==NULL it will be replaced with cpu_online_map.
7774 * ndoms_new==0 is a special case for destroying existing domains.
7775 * It will not create the default domain.
7329 * 7776 *
7330 * Call with hotplug lock held 7777 * Call with hotplug lock held
7331 */ 7778 */
7332void partition_sched_domains(int ndoms_new, cpumask_t *doms_new, 7779void partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
7333 struct sched_domain_attr *dattr_new) 7780 struct sched_domain_attr *dattr_new)
7334{ 7781{
7335 int i, j; 7782 int i, j, n;
7336 7783
7337 mutex_lock(&sched_domains_mutex); 7784 mutex_lock(&sched_domains_mutex);
7338 7785
7339 /* always unregister in case we don't destroy any domains */ 7786 /* always unregister in case we don't destroy any domains */
7340 unregister_sched_domain_sysctl(); 7787 unregister_sched_domain_sysctl();
7341 7788
7342 if (doms_new == NULL) { 7789 n = doms_new ? ndoms_new : 0;
7343 ndoms_new = 1;
7344 doms_new = &fallback_doms;
7345 cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map);
7346 dattr_new = NULL;
7347 }
7348 7790
7349 /* Destroy deleted domains */ 7791 /* Destroy deleted domains */
7350 for (i = 0; i < ndoms_cur; i++) { 7792 for (i = 0; i < ndoms_cur; i++) {
7351 for (j = 0; j < ndoms_new; j++) { 7793 for (j = 0; j < n; j++) {
7352 if (cpus_equal(doms_cur[i], doms_new[j]) 7794 if (cpus_equal(doms_cur[i], doms_new[j])
7353 && dattrs_equal(dattr_cur, i, dattr_new, j)) 7795 && dattrs_equal(dattr_cur, i, dattr_new, j))
7354 goto match1; 7796 goto match1;
@@ -7359,6 +7801,13 @@ match1:
7359 ; 7801 ;
7360 } 7802 }
7361 7803
7804 if (doms_new == NULL) {
7805 ndoms_cur = 0;
7806 doms_new = &fallback_doms;
7807 cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map);
7808 dattr_new = NULL;
7809 }
7810
7362 /* Build new domains */ 7811 /* Build new domains */
7363 for (i = 0; i < ndoms_new; i++) { 7812 for (i = 0; i < ndoms_new; i++) {
7364 for (j = 0; j < ndoms_cur; j++) { 7813 for (j = 0; j < ndoms_cur; j++) {
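The destroy/build loops above are a small set reconciliation: entries of the current partition with no match in the new one are torn down, unmatched new entries are built, and matches are left alone. A hedged standalone sketch of that pattern, with plain integer equality standing in for cpus_equal()/dattrs_equal():

#include <stdio.h>

/* Reconcile cur[] against new_[]: report what to destroy and what to build. */
static void reconcile(const int *cur, int ncur, const int *new_, int nnew)
{
        int i, j;

        /* destroy deleted entries: present in cur but not in new_ */
        for (i = 0; i < ncur; i++) {
                for (j = 0; j < nnew; j++)
                        if (cur[i] == new_[j])
                                goto match1;
                printf("destroy %d\n", cur[i]);
match1:
                ;
        }

        /* build new entries: present in new_ but not in cur */
        for (i = 0; i < nnew; i++) {
                for (j = 0; j < ncur; j++)
                        if (new_[i] == cur[j])
                                goto match2;
                printf("build %d\n", new_[i]);
match2:
                ;
        }
}

int main(void)
{
        int cur[]  = { 1, 2, 3 };
        int new_[] = { 2, 3, 4 };

        reconcile(cur, 3, new_, 3);     /* expect: destroy 1, build 4 */
        return 0;
}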
@@ -7389,17 +7838,15 @@ match2:
7389#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 7838#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
7390int arch_reinit_sched_domains(void) 7839int arch_reinit_sched_domains(void)
7391{ 7840{
7392 int err;
7393
7394 get_online_cpus(); 7841 get_online_cpus();
7395 mutex_lock(&sched_domains_mutex); 7842
7396 detach_destroy_domains(&cpu_online_map); 7843 /* Destroy domains first to force the rebuild */
7397 free_sched_domains(); 7844 partition_sched_domains(0, NULL, NULL);
7398 err = arch_init_sched_domains(&cpu_online_map); 7845
7399 mutex_unlock(&sched_domains_mutex); 7846 rebuild_sched_domains();
7400 put_online_cpus(); 7847 put_online_cpus();
7401 7848
7402 return err; 7849 return 0;
7403} 7850}
7404 7851
7405static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) 7852static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
@@ -7420,30 +7867,34 @@ static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
7420} 7867}
7421 7868
7422#ifdef CONFIG_SCHED_MC 7869#ifdef CONFIG_SCHED_MC
7423static ssize_t sched_mc_power_savings_show(struct sys_device *dev, char *page) 7870static ssize_t sched_mc_power_savings_show(struct sysdev_class *class,
7871 char *page)
7424{ 7872{
7425 return sprintf(page, "%u\n", sched_mc_power_savings); 7873 return sprintf(page, "%u\n", sched_mc_power_savings);
7426} 7874}
7427static ssize_t sched_mc_power_savings_store(struct sys_device *dev, 7875static ssize_t sched_mc_power_savings_store(struct sysdev_class *class,
7428 const char *buf, size_t count) 7876 const char *buf, size_t count)
7429{ 7877{
7430 return sched_power_savings_store(buf, count, 0); 7878 return sched_power_savings_store(buf, count, 0);
7431} 7879}
7432static SYSDEV_ATTR(sched_mc_power_savings, 0644, sched_mc_power_savings_show, 7880static SYSDEV_CLASS_ATTR(sched_mc_power_savings, 0644,
7433 sched_mc_power_savings_store); 7881 sched_mc_power_savings_show,
7882 sched_mc_power_savings_store);
7434#endif 7883#endif
7435 7884
7436#ifdef CONFIG_SCHED_SMT 7885#ifdef CONFIG_SCHED_SMT
7437static ssize_t sched_smt_power_savings_show(struct sys_device *dev, char *page) 7886static ssize_t sched_smt_power_savings_show(struct sysdev_class *dev,
7887 char *page)
7438{ 7888{
7439 return sprintf(page, "%u\n", sched_smt_power_savings); 7889 return sprintf(page, "%u\n", sched_smt_power_savings);
7440} 7890}
7441static ssize_t sched_smt_power_savings_store(struct sys_device *dev, 7891static ssize_t sched_smt_power_savings_store(struct sysdev_class *dev,
7442 const char *buf, size_t count) 7892 const char *buf, size_t count)
7443{ 7893{
7444 return sched_power_savings_store(buf, count, 1); 7894 return sched_power_savings_store(buf, count, 1);
7445} 7895}
7446static SYSDEV_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_show, 7896static SYSDEV_CLASS_ATTR(sched_smt_power_savings, 0644,
7897 sched_smt_power_savings_show,
7447 sched_smt_power_savings_store); 7898 sched_smt_power_savings_store);
7448#endif 7899#endif
7449 7900
@@ -7463,54 +7914,51 @@ int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
7463#endif 7914#endif
7464 return err; 7915 return err;
7465} 7916}
7466#endif 7917#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
7467 7918
7919#ifndef CONFIG_CPUSETS
7468/* 7920/*
7469 * Force a reinitialization of the sched domains hierarchy. The domains 7921 * Add online and remove offline CPUs from the scheduler domains.
7470 * and groups cannot be updated in place without racing with the balancing 7922 * When cpusets are enabled they take over this function.
7471 * code, so we temporarily attach all running cpus to the NULL domain
7472 * which will prevent rebalancing while the sched domains are recalculated.
7473 */ 7923 */
7474static int update_sched_domains(struct notifier_block *nfb, 7924static int update_sched_domains(struct notifier_block *nfb,
7475 unsigned long action, void *hcpu) 7925 unsigned long action, void *hcpu)
7476{ 7926{
7477 switch (action) { 7927 switch (action) {
7478 case CPU_UP_PREPARE: 7928 case CPU_ONLINE:
7479 case CPU_UP_PREPARE_FROZEN: 7929 case CPU_ONLINE_FROZEN:
7930 case CPU_DEAD:
7931 case CPU_DEAD_FROZEN:
7932 partition_sched_domains(1, NULL, NULL);
7933 return NOTIFY_OK;
7934
7935 default:
7936 return NOTIFY_DONE;
7937 }
7938}
7939#endif
7940
7941static int update_runtime(struct notifier_block *nfb,
7942 unsigned long action, void *hcpu)
7943{
7944 int cpu = (int)(long)hcpu;
7945
7946 switch (action) {
7480 case CPU_DOWN_PREPARE: 7947 case CPU_DOWN_PREPARE:
7481 case CPU_DOWN_PREPARE_FROZEN: 7948 case CPU_DOWN_PREPARE_FROZEN:
7482 detach_destroy_domains(&cpu_online_map); 7949 disable_runtime(cpu_rq(cpu));
7483 free_sched_domains();
7484 return NOTIFY_OK; 7950 return NOTIFY_OK;
7485 7951
7486 case CPU_UP_CANCELED:
7487 case CPU_UP_CANCELED_FROZEN:
7488 case CPU_DOWN_FAILED: 7952 case CPU_DOWN_FAILED:
7489 case CPU_DOWN_FAILED_FROZEN: 7953 case CPU_DOWN_FAILED_FROZEN:
7490 case CPU_ONLINE: 7954 case CPU_ONLINE:
7491 case CPU_ONLINE_FROZEN: 7955 case CPU_ONLINE_FROZEN:
7492 case CPU_DEAD: 7956 enable_runtime(cpu_rq(cpu));
7493 case CPU_DEAD_FROZEN: 7957 return NOTIFY_OK;
7494 /* 7958
7495 * Fall through and re-initialise the domains.
7496 */
7497 break;
7498 default: 7959 default:
7499 return NOTIFY_DONE; 7960 return NOTIFY_DONE;
7500 } 7961 }
7501
7502#ifndef CONFIG_CPUSETS
7503 /*
7504 * Create default domain partitioning if cpusets are disabled.
7505 * Otherwise we let cpusets rebuild the domains based on the
7506 * current setup.
7507 */
7508
7509 /* The hotplug lock is already held by cpu_up/cpu_down */
7510 arch_init_sched_domains(&cpu_online_map);
7511#endif
7512
7513 return NOTIFY_OK;
7514} 7962}
7515 7963
7516void __init sched_init_smp(void) 7964void __init sched_init_smp(void)
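The rework above splits the old catch-all notifier in two: scheduler domains are rebuilt only once a CPU has fully come online or gone dead (and only when cpusets are not in charge), while RT runtime is disabled before an offline attempt and re-enabled when the CPU comes back or the offline fails. A sketch of the same dispatch shape; the action names and NOTIFY_* values here are local stand-ins, not the kernel notifier API:

#include <stdio.h>

enum hp_action { CPU_ONLINE, CPU_DEAD, CPU_DOWN_PREPARE, CPU_DOWN_FAILED };

#define NOTIFY_DONE 0
#define NOTIFY_OK   1

/* Domain layout only changes once the CPU set has actually changed. */
static int update_domains(enum hp_action action)
{
        switch (action) {
        case CPU_ONLINE:
        case CPU_DEAD:
                printf("rebuild default domain partition\n");
                return NOTIFY_OK;
        default:
                return NOTIFY_DONE;
        }
}

/* RT runtime is fenced around the offline attempt itself. */
static int update_runtime(enum hp_action action, int cpu)
{
        switch (action) {
        case CPU_DOWN_PREPARE:
                printf("disable RT runtime on cpu %d\n", cpu);
                return NOTIFY_OK;
        case CPU_DOWN_FAILED:
        case CPU_ONLINE:
                printf("enable RT runtime on cpu %d\n", cpu);
                return NOTIFY_OK;
        default:
                return NOTIFY_DONE;
        }
}

int main(void)
{
        update_runtime(CPU_DOWN_PREPARE, 2);    /* cpu 2 about to go away */
        update_runtime(CPU_DOWN_FAILED, 2);     /* offline failed, undo */
        update_domains(CPU_ONLINE);             /* topology changed, rebuild */
        return 0;
}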
@@ -7530,8 +7978,15 @@ void __init sched_init_smp(void)
7530 cpu_set(smp_processor_id(), non_isolated_cpus); 7978 cpu_set(smp_processor_id(), non_isolated_cpus);
7531 mutex_unlock(&sched_domains_mutex); 7979 mutex_unlock(&sched_domains_mutex);
7532 put_online_cpus(); 7980 put_online_cpus();
7981
7982#ifndef CONFIG_CPUSETS
7533 /* XXX: Theoretical race here - CPU may be hotplugged now */ 7983 /* XXX: Theoretical race here - CPU may be hotplugged now */
7534 hotcpu_notifier(update_sched_domains, 0); 7984 hotcpu_notifier(update_sched_domains, 0);
7985#endif
7986
7987 /* RT runtime code needs to handle some hotplug events */
7988 hotcpu_notifier(update_runtime, 0);
7989
7535 init_hrtick(); 7990 init_hrtick();
7536 7991
7537 /* Move init over to a non-isolated CPU */ 7992 /* Move init over to a non-isolated CPU */
@@ -7688,8 +8143,8 @@ void __init sched_init(void)
7688 8143
7689 root_task_group.cfs_rq = (struct cfs_rq **)ptr; 8144 root_task_group.cfs_rq = (struct cfs_rq **)ptr;
7690 ptr += nr_cpu_ids * sizeof(void **); 8145 ptr += nr_cpu_ids * sizeof(void **);
7691#endif 8146#endif /* CONFIG_USER_SCHED */
7692#endif 8147#endif /* CONFIG_FAIR_GROUP_SCHED */
7693#ifdef CONFIG_RT_GROUP_SCHED 8148#ifdef CONFIG_RT_GROUP_SCHED
7694 init_task_group.rt_se = (struct sched_rt_entity **)ptr; 8149 init_task_group.rt_se = (struct sched_rt_entity **)ptr;
7695 ptr += nr_cpu_ids * sizeof(void **); 8150 ptr += nr_cpu_ids * sizeof(void **);
@@ -7703,8 +8158,8 @@ void __init sched_init(void)
7703 8158
7704 root_task_group.rt_rq = (struct rt_rq **)ptr; 8159 root_task_group.rt_rq = (struct rt_rq **)ptr;
7705 ptr += nr_cpu_ids * sizeof(void **); 8160 ptr += nr_cpu_ids * sizeof(void **);
7706#endif 8161#endif /* CONFIG_USER_SCHED */
7707#endif 8162#endif /* CONFIG_RT_GROUP_SCHED */
7708 } 8163 }
7709 8164
7710#ifdef CONFIG_SMP 8165#ifdef CONFIG_SMP
@@ -7720,8 +8175,8 @@ void __init sched_init(void)
7720#ifdef CONFIG_USER_SCHED 8175#ifdef CONFIG_USER_SCHED
7721 init_rt_bandwidth(&root_task_group.rt_bandwidth, 8176 init_rt_bandwidth(&root_task_group.rt_bandwidth,
7722 global_rt_period(), RUNTIME_INF); 8177 global_rt_period(), RUNTIME_INF);
7723#endif 8178#endif /* CONFIG_USER_SCHED */
7724#endif 8179#endif /* CONFIG_RT_GROUP_SCHED */
7725 8180
7726#ifdef CONFIG_GROUP_SCHED 8181#ifdef CONFIG_GROUP_SCHED
7727 list_add(&init_task_group.list, &task_groups); 8182 list_add(&init_task_group.list, &task_groups);
@@ -7731,15 +8186,14 @@ void __init sched_init(void)
7731 INIT_LIST_HEAD(&root_task_group.children); 8186 INIT_LIST_HEAD(&root_task_group.children);
7732 init_task_group.parent = &root_task_group; 8187 init_task_group.parent = &root_task_group;
7733 list_add(&init_task_group.siblings, &root_task_group.children); 8188 list_add(&init_task_group.siblings, &root_task_group.children);
7734#endif 8189#endif /* CONFIG_USER_SCHED */
7735#endif 8190#endif /* CONFIG_GROUP_SCHED */
7736 8191
7737 for_each_possible_cpu(i) { 8192 for_each_possible_cpu(i) {
7738 struct rq *rq; 8193 struct rq *rq;
7739 8194
7740 rq = cpu_rq(i); 8195 rq = cpu_rq(i);
7741 spin_lock_init(&rq->lock); 8196 spin_lock_init(&rq->lock);
7742 lockdep_set_class(&rq->lock, &rq->rq_lock_key);
7743 rq->nr_running = 0; 8197 rq->nr_running = 0;
7744 init_cfs_rq(&rq->cfs, rq); 8198 init_cfs_rq(&rq->cfs, rq);
7745 init_rt_rq(&rq->rt, rq); 8199 init_rt_rq(&rq->rt, rq);
@@ -7812,6 +8266,7 @@ void __init sched_init(void)
7812 rq->next_balance = jiffies; 8266 rq->next_balance = jiffies;
7813 rq->push_cpu = 0; 8267 rq->push_cpu = 0;
7814 rq->cpu = i; 8268 rq->cpu = i;
8269 rq->online = 0;
7815 rq->migration_thread = NULL; 8270 rq->migration_thread = NULL;
7816 INIT_LIST_HEAD(&rq->migration_queue); 8271 INIT_LIST_HEAD(&rq->migration_queue);
7817 rq_attach_root(rq, &def_root_domain); 8272 rq_attach_root(rq, &def_root_domain);
@@ -7827,7 +8282,7 @@ void __init sched_init(void)
7827#endif 8282#endif
7828 8283
7829#ifdef CONFIG_SMP 8284#ifdef CONFIG_SMP
7830 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains, NULL); 8285 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
7831#endif 8286#endif
7832 8287
7833#ifdef CONFIG_RT_MUTEXES 8288#ifdef CONFIG_RT_MUTEXES
@@ -7861,20 +8316,25 @@ void __might_sleep(char *file, int line)
7861#ifdef in_atomic 8316#ifdef in_atomic
7862 static unsigned long prev_jiffy; /* ratelimiting */ 8317 static unsigned long prev_jiffy; /* ratelimiting */
7863 8318
7864 if ((in_atomic() || irqs_disabled()) && 8319 if ((!in_atomic() && !irqs_disabled()) ||
7865 system_state == SYSTEM_RUNNING && !oops_in_progress) { 8320 system_state != SYSTEM_RUNNING || oops_in_progress)
7866 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) 8321 return;
7867 return; 8322 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
7868 prev_jiffy = jiffies; 8323 return;
7869 printk(KERN_ERR "BUG: sleeping function called from invalid" 8324 prev_jiffy = jiffies;
7870 " context at %s:%d\n", file, line); 8325
7871 printk("in_atomic():%d, irqs_disabled():%d\n", 8326 printk(KERN_ERR
7872 in_atomic(), irqs_disabled()); 8327 "BUG: sleeping function called from invalid context at %s:%d\n",
7873 debug_show_held_locks(current); 8328 file, line);
7874 if (irqs_disabled()) 8329 printk(KERN_ERR
7875 print_irqtrace_events(current); 8330 "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
7876 dump_stack(); 8331 in_atomic(), irqs_disabled(),
7877 } 8332 current->pid, current->comm);
8333
8334 debug_show_held_locks(current);
8335 if (irqs_disabled())
8336 print_irqtrace_events(current);
8337 dump_stack();
7878#endif 8338#endif
7879} 8339}
7880EXPORT_SYMBOL(__might_sleep); 8340EXPORT_SYMBOL(__might_sleep);
@@ -8051,7 +8511,7 @@ static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
8051{ 8511{
8052 list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list); 8512 list_del_rcu(&tg->cfs_rq[cpu]->leaf_cfs_rq_list);
8053} 8513}
8054#else 8514#else /* !CONFIG_FAIR_GROUP_SCHED */
8055static inline void free_fair_sched_group(struct task_group *tg) 8515static inline void free_fair_sched_group(struct task_group *tg)
8056{ 8516{
8057} 8517}
@@ -8069,7 +8529,7 @@ static inline void register_fair_sched_group(struct task_group *tg, int cpu)
8069static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) 8529static inline void unregister_fair_sched_group(struct task_group *tg, int cpu)
8070{ 8530{
8071} 8531}
8072#endif 8532#endif /* CONFIG_FAIR_GROUP_SCHED */
8073 8533
8074#ifdef CONFIG_RT_GROUP_SCHED 8534#ifdef CONFIG_RT_GROUP_SCHED
8075static void free_rt_sched_group(struct task_group *tg) 8535static void free_rt_sched_group(struct task_group *tg)
@@ -8140,7 +8600,7 @@ static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
8140{ 8600{
8141 list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list); 8601 list_del_rcu(&tg->rt_rq[cpu]->leaf_rt_rq_list);
8142} 8602}
8143#else 8603#else /* !CONFIG_RT_GROUP_SCHED */
8144static inline void free_rt_sched_group(struct task_group *tg) 8604static inline void free_rt_sched_group(struct task_group *tg)
8145{ 8605{
8146} 8606}
@@ -8158,7 +8618,7 @@ static inline void register_rt_sched_group(struct task_group *tg, int cpu)
8158static inline void unregister_rt_sched_group(struct task_group *tg, int cpu) 8618static inline void unregister_rt_sched_group(struct task_group *tg, int cpu)
8159{ 8619{
8160} 8620}
8161#endif 8621#endif /* CONFIG_RT_GROUP_SCHED */
8162 8622
8163#ifdef CONFIG_GROUP_SCHED 8623#ifdef CONFIG_GROUP_SCHED
8164static void free_sched_group(struct task_group *tg) 8624static void free_sched_group(struct task_group *tg)
@@ -8195,8 +8655,8 @@ struct task_group *sched_create_group(struct task_group *parent)
8195 WARN_ON(!parent); /* root should already exist */ 8655 WARN_ON(!parent); /* root should already exist */
8196 8656
8197 tg->parent = parent; 8657 tg->parent = parent;
8198 list_add_rcu(&tg->siblings, &parent->children);
8199 INIT_LIST_HEAD(&tg->children); 8658 INIT_LIST_HEAD(&tg->children);
8659 list_add_rcu(&tg->siblings, &parent->children);
8200 spin_unlock_irqrestore(&task_group_lock, flags); 8660 spin_unlock_irqrestore(&task_group_lock, flags);
8201 8661
8202 return tg; 8662 return tg;
@@ -8269,17 +8729,14 @@ void sched_move_task(struct task_struct *tsk)
8269 8729
8270 task_rq_unlock(rq, &flags); 8730 task_rq_unlock(rq, &flags);
8271} 8731}
8272#endif 8732#endif /* CONFIG_GROUP_SCHED */
8273 8733
8274#ifdef CONFIG_FAIR_GROUP_SCHED 8734#ifdef CONFIG_FAIR_GROUP_SCHED
8275static void set_se_shares(struct sched_entity *se, unsigned long shares) 8735static void __set_se_shares(struct sched_entity *se, unsigned long shares)
8276{ 8736{
8277 struct cfs_rq *cfs_rq = se->cfs_rq; 8737 struct cfs_rq *cfs_rq = se->cfs_rq;
8278 struct rq *rq = cfs_rq->rq;
8279 int on_rq; 8738 int on_rq;
8280 8739
8281 spin_lock_irq(&rq->lock);
8282
8283 on_rq = se->on_rq; 8740 on_rq = se->on_rq;
8284 if (on_rq) 8741 if (on_rq)
8285 dequeue_entity(cfs_rq, se, 0); 8742 dequeue_entity(cfs_rq, se, 0);
@@ -8289,8 +8746,17 @@ static void set_se_shares(struct sched_entity *se, unsigned long shares)
8289 8746
8290 if (on_rq) 8747 if (on_rq)
8291 enqueue_entity(cfs_rq, se, 0); 8748 enqueue_entity(cfs_rq, se, 0);
8749}
8292 8750
8293 spin_unlock_irq(&rq->lock); 8751static void set_se_shares(struct sched_entity *se, unsigned long shares)
8752{
8753 struct cfs_rq *cfs_rq = se->cfs_rq;
8754 struct rq *rq = cfs_rq->rq;
8755 unsigned long flags;
8756
8757 spin_lock_irqsave(&rq->lock, flags);
8758 __set_se_shares(se, shares);
8759 spin_unlock_irqrestore(&rq->lock, flags);
8294} 8760}
8295 8761
8296static DEFINE_MUTEX(shares_mutex); 8762static DEFINE_MUTEX(shares_mutex);
@@ -8329,8 +8795,13 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
8329 * w/o tripping rebalance_share or load_balance_fair. 8795 * w/o tripping rebalance_share or load_balance_fair.
8330 */ 8796 */
8331 tg->shares = shares; 8797 tg->shares = shares;
8332 for_each_possible_cpu(i) 8798 for_each_possible_cpu(i) {
8799 /*
8800 * force a rebalance
8801 */
8802 cfs_rq_set_shares(tg->cfs_rq[i], 0);
8333 set_se_shares(tg->se[i], shares); 8803 set_se_shares(tg->se[i], shares);
8804 }
8334 8805
8335 /* 8806 /*
8336 * Enable load balance activity on this group, by inserting it back on 8807 * Enable load balance activity on this group, by inserting it back on
@@ -8361,73 +8832,95 @@ static DEFINE_MUTEX(rt_constraints_mutex);
8361static unsigned long to_ratio(u64 period, u64 runtime) 8832static unsigned long to_ratio(u64 period, u64 runtime)
8362{ 8833{
8363 if (runtime == RUNTIME_INF) 8834 if (runtime == RUNTIME_INF)
8364 return 1ULL << 16; 8835 return 1ULL << 20;
8365 8836
8366 return div64_u64(runtime << 16, period); 8837 return div64_u64(runtime << 20, period);
8367} 8838}
8368 8839
8369#ifdef CONFIG_CGROUP_SCHED 8840/* Must be called with tasklist_lock held */
8370static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) 8841static inline int tg_has_rt_tasks(struct task_group *tg)
8371{ 8842{
8372 struct task_group *tgi, *parent = tg ? tg->parent : NULL; 8843 struct task_struct *g, *p;
8373 unsigned long total = 0;
8374 8844
8375 if (!parent) { 8845 do_each_thread(g, p) {
8376 if (global_rt_period() < period) 8846 if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg)
8377 return 0; 8847 return 1;
8848 } while_each_thread(g, p);
8378 8849
8379 return to_ratio(period, runtime) < 8850 return 0;
8380 to_ratio(global_rt_period(), global_rt_runtime()); 8851}
8381 }
8382 8852
8383 if (ktime_to_ns(parent->rt_bandwidth.rt_period) < period) 8853struct rt_schedulable_data {
8384 return 0; 8854 struct task_group *tg;
8855 u64 rt_period;
8856 u64 rt_runtime;
8857};
8385 8858
8386 rcu_read_lock(); 8859static int tg_schedulable(struct task_group *tg, void *data)
8387 list_for_each_entry_rcu(tgi, &parent->children, siblings) { 8860{
8388 if (tgi == tg) 8861 struct rt_schedulable_data *d = data;
8389 continue; 8862 struct task_group *child;
8863 unsigned long total, sum = 0;
8864 u64 period, runtime;
8865
8866 period = ktime_to_ns(tg->rt_bandwidth.rt_period);
8867 runtime = tg->rt_bandwidth.rt_runtime;
8390 8868
8391 total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period), 8869 if (tg == d->tg) {
8392 tgi->rt_bandwidth.rt_runtime); 8870 period = d->rt_period;
8871 runtime = d->rt_runtime;
8393 } 8872 }
8394 rcu_read_unlock();
8395 8873
8396 return total + to_ratio(period, runtime) < 8874 /*
8397 to_ratio(ktime_to_ns(parent->rt_bandwidth.rt_period), 8875 * Cannot have more runtime than the period.
8398 parent->rt_bandwidth.rt_runtime); 8876 */
8399} 8877 if (runtime > period && runtime != RUNTIME_INF)
8400#elif defined CONFIG_USER_SCHED 8878 return -EINVAL;
8401static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
8402{
8403 struct task_group *tgi;
8404 unsigned long total = 0;
8405 unsigned long global_ratio =
8406 to_ratio(global_rt_period(), global_rt_runtime());
8407 8879
8408 rcu_read_lock(); 8880 /*
8409 list_for_each_entry_rcu(tgi, &task_groups, list) { 8881 * Ensure we don't starve existing RT tasks.
8410 if (tgi == tg) 8882 */
8411 continue; 8883 if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg))
8884 return -EBUSY;
8885
8886 total = to_ratio(period, runtime);
8412 8887
8413 total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period), 8888 /*
8414 tgi->rt_bandwidth.rt_runtime); 8889 * Nobody can have more than the global setting allows.
8890 */
8891 if (total > to_ratio(global_rt_period(), global_rt_runtime()))
8892 return -EINVAL;
8893
8894 /*
8895 * The sum of our children's runtime should not exceed our own.
8896 */
8897 list_for_each_entry_rcu(child, &tg->children, siblings) {
8898 period = ktime_to_ns(child->rt_bandwidth.rt_period);
8899 runtime = child->rt_bandwidth.rt_runtime;
8900
8901 if (child == d->tg) {
8902 period = d->rt_period;
8903 runtime = d->rt_runtime;
8904 }
8905
8906 sum += to_ratio(period, runtime);
8415 } 8907 }
8416 rcu_read_unlock();
8417 8908
8418 return total + to_ratio(period, runtime) < global_ratio; 8909 if (sum > total)
8910 return -EINVAL;
8911
8912 return 0;
8419} 8913}
8420#endif
8421 8914
8422/* Must be called with tasklist_lock held */ 8915static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
8423static inline int tg_has_rt_tasks(struct task_group *tg)
8424{ 8916{
8425 struct task_struct *g, *p; 8917 struct rt_schedulable_data data = {
8426 do_each_thread(g, p) { 8918 .tg = tg,
8427 if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg) 8919 .rt_period = period,
8428 return 1; 8920 .rt_runtime = runtime,
8429 } while_each_thread(g, p); 8921 };
8430 return 0; 8922
8923 return walk_tg_tree(tg_schedulable, tg_nop, &data);
8431} 8924}
8432 8925
8433static int tg_set_bandwidth(struct task_group *tg, 8926static int tg_set_bandwidth(struct task_group *tg,
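to_ratio() now represents runtime/period as a 20-bit fixed-point fraction, and tg_schedulable() rejects a configuration whose children together claim more of that fraction than the group itself (or than the global limit). A hedged arithmetic sketch of that comparison, outside the task_group walk; the period and runtime figures are made up:

#include <stdio.h>
#include <stdint.h>

/* runtime/period as a 20-bit fixed-point fraction: 1<<20 means 100% */
static uint64_t to_ratio(uint64_t period, uint64_t runtime)
{
        return (runtime << 20) / period;
}

int main(void)
{
        /* parent group: 950000 ns of RT runtime per 1000000 ns period */
        uint64_t parent = to_ratio(1000000, 950000);

        /* two children asking for 400000 ns and 600000 ns per 1000000 ns */
        uint64_t sum = to_ratio(1000000, 400000) + to_ratio(1000000, 600000);

        printf("parent %llu, children %llu -> %s\n",
               (unsigned long long)parent, (unsigned long long)sum,
               sum > parent ? "-EINVAL (over-committed)" : "schedulable");
        return 0;
}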
@@ -8437,14 +8930,9 @@ static int tg_set_bandwidth(struct task_group *tg,
8437 8930
8438 mutex_lock(&rt_constraints_mutex); 8931 mutex_lock(&rt_constraints_mutex);
8439 read_lock(&tasklist_lock); 8932 read_lock(&tasklist_lock);
8440 if (rt_runtime == 0 && tg_has_rt_tasks(tg)) { 8933 err = __rt_schedulable(tg, rt_period, rt_runtime);
8441 err = -EBUSY; 8934 if (err)
8442 goto unlock; 8935 goto unlock;
8443 }
8444 if (!__rt_schedulable(tg, rt_period, rt_runtime)) {
8445 err = -EINVAL;
8446 goto unlock;
8447 }
8448 8936
8449 spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock); 8937 spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
8450 tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period); 8938 tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
@@ -8496,6 +8984,9 @@ int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
8496 rt_period = (u64)rt_period_us * NSEC_PER_USEC; 8984 rt_period = (u64)rt_period_us * NSEC_PER_USEC;
8497 rt_runtime = tg->rt_bandwidth.rt_runtime; 8985 rt_runtime = tg->rt_bandwidth.rt_runtime;
8498 8986
8987 if (rt_period == 0)
8988 return -EINVAL;
8989
8499 return tg_set_bandwidth(tg, rt_period, rt_runtime); 8990 return tg_set_bandwidth(tg, rt_period, rt_runtime);
8500} 8991}
8501 8992
@@ -8510,21 +9001,38 @@ long sched_group_rt_period(struct task_group *tg)
8510 9001
8511static int sched_rt_global_constraints(void) 9002static int sched_rt_global_constraints(void)
8512{ 9003{
9004 u64 runtime, period;
8513 int ret = 0; 9005 int ret = 0;
8514 9006
9007 if (sysctl_sched_rt_period <= 0)
9008 return -EINVAL;
9009
9010 runtime = global_rt_runtime();
9011 period = global_rt_period();
9012
9013 /*
9014 * Sanity check on the sysctl variables.
9015 */
9016 if (runtime > period && runtime != RUNTIME_INF)
9017 return -EINVAL;
9018
8515 mutex_lock(&rt_constraints_mutex); 9019 mutex_lock(&rt_constraints_mutex);
8516 if (!__rt_schedulable(NULL, 1, 0)) 9020 read_lock(&tasklist_lock);
8517 ret = -EINVAL; 9021 ret = __rt_schedulable(NULL, 0, 0);
9022 read_unlock(&tasklist_lock);
8518 mutex_unlock(&rt_constraints_mutex); 9023 mutex_unlock(&rt_constraints_mutex);
8519 9024
8520 return ret; 9025 return ret;
8521} 9026}
8522#else 9027#else /* !CONFIG_RT_GROUP_SCHED */
8523static int sched_rt_global_constraints(void) 9028static int sched_rt_global_constraints(void)
8524{ 9029{
8525 unsigned long flags; 9030 unsigned long flags;
8526 int i; 9031 int i;
8527 9032
9033 if (sysctl_sched_rt_period <= 0)
9034 return -EINVAL;
9035
8528 spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); 9036 spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
8529 for_each_possible_cpu(i) { 9037 for_each_possible_cpu(i) {
8530 struct rt_rq *rt_rq = &cpu_rq(i)->rt; 9038 struct rt_rq *rt_rq = &cpu_rq(i)->rt;
@@ -8537,7 +9045,7 @@ static int sched_rt_global_constraints(void)
8537 9045
8538 return 0; 9046 return 0;
8539} 9047}
8540#endif 9048#endif /* CONFIG_RT_GROUP_SCHED */
8541 9049
8542int sched_rt_handler(struct ctl_table *table, int write, 9050int sched_rt_handler(struct ctl_table *table, int write,
8543 struct file *filp, void __user *buffer, size_t *lenp, 9051 struct file *filp, void __user *buffer, size_t *lenp,
@@ -8585,7 +9093,6 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
8585 9093
8586 if (!cgrp->parent) { 9094 if (!cgrp->parent) {
8587 /* This is early initialization for the top cgroup */ 9095 /* This is early initialization for the top cgroup */
8588 init_task_group.css.cgroup = cgrp;
8589 return &init_task_group.css; 9096 return &init_task_group.css;
8590 } 9097 }
8591 9098
@@ -8594,9 +9101,6 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
8594 if (IS_ERR(tg)) 9101 if (IS_ERR(tg))
8595 return ERR_PTR(-ENOMEM); 9102 return ERR_PTR(-ENOMEM);
8596 9103
8597 /* Bind the cgroup to task_group object we just created */
8598 tg->css.cgroup = cgrp;
8599
8600 return &tg->css; 9104 return &tg->css;
8601} 9105}
8602 9106
@@ -8645,7 +9149,7 @@ static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)
8645 9149
8646 return (u64) tg->shares; 9150 return (u64) tg->shares;
8647} 9151}
8648#endif 9152#endif /* CONFIG_FAIR_GROUP_SCHED */
8649 9153
8650#ifdef CONFIG_RT_GROUP_SCHED 9154#ifdef CONFIG_RT_GROUP_SCHED
8651static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft, 9155static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft,
@@ -8669,7 +9173,7 @@ static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft)
8669{ 9173{
8670 return sched_group_rt_period(cgroup_tg(cgrp)); 9174 return sched_group_rt_period(cgroup_tg(cgrp));
8671} 9175}
8672#endif 9176#endif /* CONFIG_RT_GROUP_SCHED */
8673 9177
8674static struct cftype cpu_files[] = { 9178static struct cftype cpu_files[] = {
8675#ifdef CONFIG_FAIR_GROUP_SCHED 9179#ifdef CONFIG_FAIR_GROUP_SCHED
diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
index ce05271219ab..e8ab096ddfe3 100644
--- a/kernel/sched_clock.c
+++ b/kernel/sched_clock.c
@@ -3,25 +3,26 @@
3 * 3 *
4 * Copyright (C) 2008 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> 4 * Copyright (C) 2008 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
5 * 5 *
6 * Updates and enhancements:
7 * Copyright (C) 2008 Red Hat, Inc. Steven Rostedt <srostedt@redhat.com>
8 *
6 * Based on code by: 9 * Based on code by:
7 * Ingo Molnar <mingo@redhat.com> 10 * Ingo Molnar <mingo@redhat.com>
8 * Guillaume Chazarain <guichaz@gmail.com> 11 * Guillaume Chazarain <guichaz@gmail.com>
9 * 12 *
10 * Create a semi stable clock from a mixture of other events, including: 13 * Create a semi stable clock from a mixture of other events, including:
11 * - gtod 14 * - gtod
12 * - jiffies
13 * - sched_clock() 15 * - sched_clock()
14 * - explicit idle events 16 * - explicit idle events
15 * 17 *
16 * We use gtod as base and the unstable clock deltas. The deltas are filtered, 18 * We use gtod as base and the unstable clock deltas. The deltas are filtered,
17 * making it monotonic and keeping it within an expected window. This window 19 * making it monotonic and keeping it within an expected window.
18 * is set up using jiffies.
19 * 20 *
20 * Furthermore, explicit sleep and wakeup hooks allow us to account for time 21 * Furthermore, explicit sleep and wakeup hooks allow us to account for time
21 * that is otherwise invisible (TSC gets stopped). 22 * that is otherwise invisible (TSC gets stopped).
22 * 23 *
23 * The clock: sched_clock_cpu() is monotonic per cpu, and should be somewhat 24 * The clock: sched_clock_cpu() is monotonic per cpu, and should be somewhat
24 * consistent between cpus (never more than 1 jiffies difference). 25 * consistent between cpus (never more than 2 jiffies difference).
25 */ 26 */
26#include <linux/sched.h> 27#include <linux/sched.h>
27#include <linux/percpu.h> 28#include <linux/percpu.h>
@@ -29,6 +30,17 @@
29#include <linux/ktime.h> 30#include <linux/ktime.h>
30#include <linux/module.h> 31#include <linux/module.h>
31 32
33/*
34 * Scheduler clock - returns current time in nanosec units.
 35 * This is the default implementation.
36 * Architectures and sub-architectures can override this.
37 */
38unsigned long long __attribute__((weak)) sched_clock(void)
39{
40 return (unsigned long long)jiffies * (NSEC_PER_SEC / HZ);
41}
42
43static __read_mostly int sched_clock_running;
32 44
33#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK 45#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
34 46
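The weak sched_clock() fallback above simply scales the jiffies counter to nanoseconds, giving a clock with one-tick resolution. A small sketch of that conversion; HZ and the tick count are assumed values:

#include <stdio.h>

#define NSEC_PER_SEC 1000000000ULL
#define HZ 250ULL                       /* assumed tick rate */

/* jiffies-based clock: resolution is one tick, NSEC_PER_SEC / HZ nanoseconds */
static unsigned long long jiffies_clock(unsigned long long jiffies)
{
        return jiffies * (NSEC_PER_SEC / HZ);
}

int main(void)
{
        /* 1000 ticks at HZ=250 is 4 ms per tick, i.e. 4 seconds */
        printf("%llu ns\n", jiffies_clock(1000));
        return 0;
}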
@@ -40,8 +52,6 @@ struct sched_clock_data {
40 */ 52 */
41 raw_spinlock_t lock; 53 raw_spinlock_t lock;
42 54
43 unsigned long prev_jiffies;
44 u64 prev_raw;
45 u64 tick_raw; 55 u64 tick_raw;
46 u64 tick_gtod; 56 u64 tick_gtod;
47 u64 clock; 57 u64 clock;
@@ -59,20 +69,15 @@ static inline struct sched_clock_data *cpu_sdc(int cpu)
59 return &per_cpu(sched_clock_data, cpu); 69 return &per_cpu(sched_clock_data, cpu);
60} 70}
61 71
62static __read_mostly int sched_clock_running;
63
64void sched_clock_init(void) 72void sched_clock_init(void)
65{ 73{
66 u64 ktime_now = ktime_to_ns(ktime_get()); 74 u64 ktime_now = ktime_to_ns(ktime_get());
67 unsigned long now_jiffies = jiffies;
68 int cpu; 75 int cpu;
69 76
70 for_each_possible_cpu(cpu) { 77 for_each_possible_cpu(cpu) {
71 struct sched_clock_data *scd = cpu_sdc(cpu); 78 struct sched_clock_data *scd = cpu_sdc(cpu);
72 79
73 scd->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; 80 scd->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
74 scd->prev_jiffies = now_jiffies;
75 scd->prev_raw = 0;
76 scd->tick_raw = 0; 81 scd->tick_raw = 0;
77 scd->tick_gtod = ktime_now; 82 scd->tick_gtod = ktime_now;
78 scd->clock = ktime_now; 83 scd->clock = ktime_now;
@@ -82,45 +87,51 @@ void sched_clock_init(void)
82} 87}
83 88
84/* 89/*
90 * min,max except they take wrapping into account
91 */
92
93static inline u64 wrap_min(u64 x, u64 y)
94{
95 return (s64)(x - y) < 0 ? x : y;
96}
97
98static inline u64 wrap_max(u64 x, u64 y)
99{
100 return (s64)(x - y) > 0 ? x : y;
101}
102
103/*
85 * update the percpu scd from the raw @now value 104 * update the percpu scd from the raw @now value
86 * 105 *
87 * - filter out backward motion 106 * - filter out backward motion
88 * - use jiffies to generate a min,max window to clip the raw values 107 * - use the GTOD tick value to create a window to filter crazy TSC values
89 */ 108 */
90static void __update_sched_clock(struct sched_clock_data *scd, u64 now) 109static u64 __update_sched_clock(struct sched_clock_data *scd, u64 now)
91{ 110{
92 unsigned long now_jiffies = jiffies; 111 s64 delta = now - scd->tick_raw;
93 long delta_jiffies = now_jiffies - scd->prev_jiffies; 112 u64 clock, min_clock, max_clock;
94 u64 clock = scd->clock;
95 u64 min_clock, max_clock;
96 s64 delta = now - scd->prev_raw;
97 113
98 WARN_ON_ONCE(!irqs_disabled()); 114 WARN_ON_ONCE(!irqs_disabled());
99 min_clock = scd->tick_gtod + delta_jiffies * TICK_NSEC;
100 115
101 if (unlikely(delta < 0)) { 116 if (unlikely(delta < 0))
102 clock++; 117 delta = 0;
103 goto out;
104 }
105 118
106 max_clock = min_clock + TICK_NSEC; 119 /*
120 * scd->clock = clamp(scd->tick_gtod + delta,
121 * max(scd->tick_gtod, scd->clock),
122 * scd->tick_gtod + TICK_NSEC);
123 */
107 124
108 if (unlikely(clock + delta > max_clock)) { 125 clock = scd->tick_gtod + delta;
109 if (clock < max_clock) 126 min_clock = wrap_max(scd->tick_gtod, scd->clock);
110 clock = max_clock; 127 max_clock = scd->tick_gtod + TICK_NSEC;
111 else
112 clock++;
113 } else {
114 clock += delta;
115 }
116 128
117 out: 129 clock = wrap_max(clock, min_clock);
118 if (unlikely(clock < min_clock)) 130 clock = wrap_min(clock, max_clock);
119 clock = min_clock;
120 131
121 scd->prev_raw = now;
122 scd->prev_jiffies = now_jiffies;
123 scd->clock = clock; 132 scd->clock = clock;
133
134 return scd->clock;
124} 135}
125 136
126static void lock_double_clock(struct sched_clock_data *data1, 137static void lock_double_clock(struct sched_clock_data *data1,
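The clamp written out in the comment above pins the per-cpu clock to the window [max(tick_gtod, clock), tick_gtod + TICK_NSEC], with wrap-safe comparisons so a u64 rollover cannot pick the wrong bound. A standalone sketch of that filter; the TICK_NSEC value and the sample numbers are assumptions:

#include <stdio.h>
#include <stdint.h>

#define TICK_NSEC 1000000ULL            /* assumed 1 ms tick */

/* min/max that stay correct across u64 wrap-around */
static uint64_t wrap_min(uint64_t x, uint64_t y)
{
        return (int64_t)(x - y) < 0 ? x : y;
}

static uint64_t wrap_max(uint64_t x, uint64_t y)
{
        return (int64_t)(x - y) > 0 ? x : y;
}

/* clamp tick_gtod + delta into [max(tick_gtod, clock), tick_gtod + TICK_NSEC] */
static uint64_t filter_clock(uint64_t tick_gtod, uint64_t clock, int64_t delta)
{
        uint64_t val;

        if (delta < 0)                  /* backward raw motion is dropped */
                delta = 0;
        val = tick_gtod + (uint64_t)delta;

        val = wrap_max(val, wrap_max(tick_gtod, clock));
        val = wrap_min(val, tick_gtod + TICK_NSEC);
        return val;
}

int main(void)
{
        /* a crazy 50 ms TSC jump gets clipped to one tick past the GTOD base */
        printf("%llu\n", (unsigned long long)
               filter_clock(1000000000ULL, 1000100000ULL, 50000000));
        return 0;
}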
@@ -138,7 +149,7 @@ static void lock_double_clock(struct sched_clock_data *data1,
138u64 sched_clock_cpu(int cpu) 149u64 sched_clock_cpu(int cpu)
139{ 150{
140 struct sched_clock_data *scd = cpu_sdc(cpu); 151 struct sched_clock_data *scd = cpu_sdc(cpu);
141 u64 now, clock; 152 u64 now, clock, this_clock, remote_clock;
142 153
143 if (unlikely(!sched_clock_running)) 154 if (unlikely(!sched_clock_running))
144 return 0ull; 155 return 0ull;
@@ -147,30 +158,36 @@ u64 sched_clock_cpu(int cpu)
147 now = sched_clock(); 158 now = sched_clock();
148 159
149 if (cpu != raw_smp_processor_id()) { 160 if (cpu != raw_smp_processor_id()) {
150 /*
151 * in order to update a remote cpu's clock based on our
152 * unstable raw time rebase it against:
153 * tick_raw (offset between raw counters)
154 * tick_gotd (tick offset between cpus)
155 */
156 struct sched_clock_data *my_scd = this_scd(); 161 struct sched_clock_data *my_scd = this_scd();
157 162
158 lock_double_clock(scd, my_scd); 163 lock_double_clock(scd, my_scd);
159 164
160 now -= my_scd->tick_raw; 165 this_clock = __update_sched_clock(my_scd, now);
161 now += scd->tick_raw; 166 remote_clock = scd->clock;
162 167
163 now -= my_scd->tick_gtod; 168 /*
164 now += scd->tick_gtod; 169 * Use the opportunity that we have both locks
170 * taken to couple the two clocks: we take the
171 * larger time as the latest time for both
172 * runqueues. (this creates monotonic movement)
173 */
174 if (likely((s64)(remote_clock - this_clock) < 0)) {
175 clock = this_clock;
176 scd->clock = clock;
177 } else {
178 /*
179 * Should be rare, but possible:
180 */
181 clock = remote_clock;
182 my_scd->clock = remote_clock;
183 }
165 184
166 __raw_spin_unlock(&my_scd->lock); 185 __raw_spin_unlock(&my_scd->lock);
167 } else { 186 } else {
168 __raw_spin_lock(&scd->lock); 187 __raw_spin_lock(&scd->lock);
188 clock = __update_sched_clock(scd, now);
169 } 189 }
170 190
171 __update_sched_clock(scd, now);
172 clock = scd->clock;
173
174 __raw_spin_unlock(&scd->lock); 191 __raw_spin_unlock(&scd->lock);
175 192
176 return clock; 193 return clock;
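The remote-read path above couples the two runqueue clocks by letting whichever value is ahead win, so a reader never observes a remote clock moving backwards relative to its own. In sketch form, with the per-cpu locking elided:

#include <stdio.h>
#include <stdint.h>

/* Both runqueues converge on the larger value, so movement stays monotonic. */
static uint64_t couple_clocks(uint64_t *this_clock, uint64_t *remote_clock)
{
        if ((int64_t)(*remote_clock - *this_clock) < 0)
                *remote_clock = *this_clock;    /* common case: pull remote forward */
        else
                *this_clock = *remote_clock;    /* rare: remote was ahead */

        return *this_clock;
}

int main(void)
{
        uint64_t a = 2000, b = 1500;

        printf("%llu\n", (unsigned long long)couple_clocks(&a, &b));    /* 2000 */
        printf("%llu\n", (unsigned long long)couple_clocks(&b, &a));    /* 2000 */
        return 0;
}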
@@ -186,18 +203,13 @@ void sched_clock_tick(void)
186 203
187 WARN_ON_ONCE(!irqs_disabled()); 204 WARN_ON_ONCE(!irqs_disabled());
188 205
189 now = sched_clock();
190 now_gtod = ktime_to_ns(ktime_get()); 206 now_gtod = ktime_to_ns(ktime_get());
207 now = sched_clock();
191 208
192 __raw_spin_lock(&scd->lock); 209 __raw_spin_lock(&scd->lock);
193 __update_sched_clock(scd, now);
194 /*
195 * update tick_gtod after __update_sched_clock() because that will
196 * already observe 1 new jiffy; adding a new tick_gtod to that would
197 * increase the clock 2 jiffies.
198 */
199 scd->tick_raw = now; 210 scd->tick_raw = now;
200 scd->tick_gtod = now_gtod; 211 scd->tick_gtod = now_gtod;
212 __update_sched_clock(scd, now);
201 __raw_spin_unlock(&scd->lock); 213 __raw_spin_unlock(&scd->lock);
202} 214}
203 215
@@ -215,32 +227,37 @@ EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event);
215 */ 227 */
216void sched_clock_idle_wakeup_event(u64 delta_ns) 228void sched_clock_idle_wakeup_event(u64 delta_ns)
217{ 229{
218 struct sched_clock_data *scd = this_scd(); 230 sched_clock_tick();
219 u64 now = sched_clock();
220
221 /*
222 * Override the previous timestamp and ignore all
223 * sched_clock() deltas that occured while we idled,
224 * and use the PM-provided delta_ns to advance the
225 * rq clock:
226 */
227 __raw_spin_lock(&scd->lock);
228 scd->prev_raw = now;
229 scd->clock += delta_ns;
230 __raw_spin_unlock(&scd->lock);
231
232 touch_softlockup_watchdog(); 231 touch_softlockup_watchdog();
233} 232}
234EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); 233EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
235 234
235#else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
236
237void sched_clock_init(void)
238{
239 sched_clock_running = 1;
240}
241
242u64 sched_clock_cpu(int cpu)
243{
244 if (unlikely(!sched_clock_running))
245 return 0;
246
247 return sched_clock();
248}
249
236#endif 250#endif
237 251
238/* 252unsigned long long cpu_clock(int cpu)
239 * Scheduler clock - returns current time in nanosec units.
240 * This is default implementation.
241 * Architectures and sub-architectures can override this.
242 */
243unsigned long long __attribute__((weak)) sched_clock(void)
244{ 253{
245 return (unsigned long long)jiffies * (NSEC_PER_SEC / HZ); 254 unsigned long long clock;
255 unsigned long flags;
256
257 local_irq_save(flags);
258 clock = sched_clock_cpu(cpu);
259 local_irq_restore(flags);
260
261 return clock;
246} 262}
263EXPORT_SYMBOL_GPL(cpu_clock);
diff --git a/kernel/sched_cpupri.c b/kernel/sched_cpupri.c
new file mode 100644
index 000000000000..52154fefab7e
--- /dev/null
+++ b/kernel/sched_cpupri.c
@@ -0,0 +1,174 @@
1/*
2 * kernel/sched_cpupri.c
3 *
4 * CPU priority management
5 *
6 * Copyright (C) 2007-2008 Novell
7 *
8 * Author: Gregory Haskins <ghaskins@novell.com>
9 *
10 * This code tracks the priority of each CPU so that global migration
11 * decisions are easy to calculate. Each CPU can be in a state as follows:
12 *
13 * (INVALID), IDLE, NORMAL, RT1, ... RT99
14 *
15 * going from the lowest priority to the highest. CPUs in the INVALID state
16 * are not eligible for routing. The system maintains this state with
17 * a 2 dimensional bitmap (the first for priority class, the second for cpus
18 * in that class). Therefore a typical application without affinity
19 * restrictions can find a suitable CPU with O(1) complexity (e.g. two bit
20 * searches). For tasks with affinity restrictions, the algorithm has a
21 * worst case complexity of O(min(102, nr_domcpus)), though the scenario that
22 * yields the worst case search is fairly contrived.
23 *
24 * This program is free software; you can redistribute it and/or
25 * modify it under the terms of the GNU General Public License
26 * as published by the Free Software Foundation; version 2
27 * of the License.
28 */
29
30#include "sched_cpupri.h"
31
32/* Convert between a 140 based task->prio, and our 102 based cpupri */
33static int convert_prio(int prio)
34{
35 int cpupri;
36
37 if (prio == CPUPRI_INVALID)
38 cpupri = CPUPRI_INVALID;
39 else if (prio == MAX_PRIO)
40 cpupri = CPUPRI_IDLE;
41 else if (prio >= MAX_RT_PRIO)
42 cpupri = CPUPRI_NORMAL;
43 else
44 cpupri = MAX_RT_PRIO - prio + 1;
45
46 return cpupri;
47}
48
49#define for_each_cpupri_active(array, idx) \
50 for (idx = find_first_bit(array, CPUPRI_NR_PRIORITIES); \
51 idx < CPUPRI_NR_PRIORITIES; \
52 idx = find_next_bit(array, CPUPRI_NR_PRIORITIES, idx+1))
53
54/**
55 * cpupri_find - find the best (lowest-pri) CPU in the system
56 * @cp: The cpupri context
57 * @p: The task
58 * @lowest_mask: A mask to fill in with selected CPUs
59 *
60 * Note: This function returns the recommended CPUs as calculated during the
 61 * current invocation. By the time the call returns, the CPUs may have in
62 * fact changed priorities any number of times. While not ideal, it is not
63 * an issue of correctness since the normal rebalancer logic will correct
64 * any discrepancies created by racing against the uncertainty of the current
65 * priority configuration.
66 *
67 * Returns: (int)bool - CPUs were found
68 */
69int cpupri_find(struct cpupri *cp, struct task_struct *p,
70 cpumask_t *lowest_mask)
71{
72 int idx = 0;
73 int task_pri = convert_prio(p->prio);
74
75 for_each_cpupri_active(cp->pri_active, idx) {
76 struct cpupri_vec *vec = &cp->pri_to_cpu[idx];
77 cpumask_t mask;
78
79 if (idx >= task_pri)
80 break;
81
82 cpus_and(mask, p->cpus_allowed, vec->mask);
83
84 if (cpus_empty(mask))
85 continue;
86
87 *lowest_mask = mask;
88 return 1;
89 }
90
91 return 0;
92}
93
94/**
95 * cpupri_set - update the cpu priority setting
96 * @cp: The cpupri context
97 * @cpu: The target cpu
98 * @pri: The priority (INVALID-RT99) to assign to this CPU
99 *
100 * Note: Assumes cpu_rq(cpu)->lock is locked
101 *
102 * Returns: (void)
103 */
104void cpupri_set(struct cpupri *cp, int cpu, int newpri)
105{
106 int *currpri = &cp->cpu_to_pri[cpu];
107 int oldpri = *currpri;
108 unsigned long flags;
109
110 newpri = convert_prio(newpri);
111
112 BUG_ON(newpri >= CPUPRI_NR_PRIORITIES);
113
114 if (newpri == oldpri)
115 return;
116
117 /*
118 * If the cpu was currently mapped to a different value, we
119 * first need to unmap the old value
120 */
121 if (likely(oldpri != CPUPRI_INVALID)) {
122 struct cpupri_vec *vec = &cp->pri_to_cpu[oldpri];
123
124 spin_lock_irqsave(&vec->lock, flags);
125
126 vec->count--;
127 if (!vec->count)
128 clear_bit(oldpri, cp->pri_active);
129 cpu_clear(cpu, vec->mask);
130
131 spin_unlock_irqrestore(&vec->lock, flags);
132 }
133
134 if (likely(newpri != CPUPRI_INVALID)) {
135 struct cpupri_vec *vec = &cp->pri_to_cpu[newpri];
136
137 spin_lock_irqsave(&vec->lock, flags);
138
139 cpu_set(cpu, vec->mask);
140 vec->count++;
141 if (vec->count == 1)
142 set_bit(newpri, cp->pri_active);
143
144 spin_unlock_irqrestore(&vec->lock, flags);
145 }
146
147 *currpri = newpri;
148}
149
150/**
151 * cpupri_init - initialize the cpupri structure
152 * @cp: The cpupri context
153 *
154 * Returns: (void)
155 */
156void cpupri_init(struct cpupri *cp)
157{
158 int i;
159
160 memset(cp, 0, sizeof(*cp));
161
162 for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) {
163 struct cpupri_vec *vec = &cp->pri_to_cpu[i];
164
165 spin_lock_init(&vec->lock);
166 vec->count = 0;
167 cpus_clear(vec->mask);
168 }
169
170 for_each_possible_cpu(i)
171 cp->cpu_to_pri[i] = CPUPRI_INVALID;
172}
173
174
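cpupri keeps, per priority level, the set of CPUs currently running at that level plus a populated bit, so a pusher can find a lower-priority CPU with a couple of bit scans. A much-simplified userspace model of the lookup; the level numbering, array scan and sample priorities here are illustrative, not the kernel's bitmap/cpumask machinery:

#include <stdio.h>

#define NR_CPUS 4
#define NR_PRI  102             /* IDLE, NORMAL, then one level per RT priority */

/* current level of each CPU: two NORMAL CPUs and two running RT tasks */
static const int cpu_pri[NR_CPUS] = { 1, 1, 50, 99 };

/* Return a CPU running at a strictly lower level than task_pri, or -1. */
static int find_lowest(int task_pri)
{
        int pri, cpu;

        for (pri = 0; pri < task_pri && pri < NR_PRI; pri++)
                for (cpu = 0; cpu < NR_CPUS; cpu++)
                        if (cpu_pri[cpu] == pri)
                                return cpu;     /* lowest populated level wins */
        return -1;
}

int main(void)
{
        printf("task at level 91 -> cpu %d\n", find_lowest(91)); /* cpu 0 (NORMAL) */
        printf("task at level 1  -> cpu %d\n", find_lowest(1));  /* -1: nothing lower */
        return 0;
}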
diff --git a/kernel/sched_cpupri.h b/kernel/sched_cpupri.h
new file mode 100644
index 000000000000..f25811b0f931
--- /dev/null
+++ b/kernel/sched_cpupri.h
@@ -0,0 +1,36 @@
1#ifndef _LINUX_CPUPRI_H
2#define _LINUX_CPUPRI_H
3
4#include <linux/sched.h>
5
6#define CPUPRI_NR_PRIORITIES (MAX_RT_PRIO + 2)
7#define CPUPRI_NR_PRI_WORDS BITS_TO_LONGS(CPUPRI_NR_PRIORITIES)
8
9#define CPUPRI_INVALID -1
10#define CPUPRI_IDLE 0
11#define CPUPRI_NORMAL 1
12/* values 2-101 are RT priorities 0-99 */
13
14struct cpupri_vec {
15 spinlock_t lock;
16 int count;
17 cpumask_t mask;
18};
19
20struct cpupri {
21 struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES];
22 long pri_active[CPUPRI_NR_PRI_WORDS];
23 int cpu_to_pri[NR_CPUS];
24};
25
26#ifdef CONFIG_SMP
27int cpupri_find(struct cpupri *cp,
28 struct task_struct *p, cpumask_t *lowest_mask);
29void cpupri_set(struct cpupri *cp, int cpu, int pri);
30void cpupri_init(struct cpupri *cp);
31#else
32#define cpupri_set(cp, cpu, pri) do { } while (0)
33#define cpupri_init() do { } while (0)
34#endif
35
36#endif /* _LINUX_CPUPRI_H */
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 8bb713040ac9..ad958c1ec708 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -119,9 +119,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
119 struct sched_entity *last; 119 struct sched_entity *last;
120 unsigned long flags; 120 unsigned long flags;
121 121
122#if !defined(CONFIG_CGROUP_SCHED) || !defined(CONFIG_USER_SCHED) 122#if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_FAIR_GROUP_SCHED)
123 SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu);
124#else
125 char path[128] = ""; 123 char path[128] = "";
126 struct cgroup *cgroup = NULL; 124 struct cgroup *cgroup = NULL;
127 struct task_group *tg = cfs_rq->tg; 125 struct task_group *tg = cfs_rq->tg;
@@ -133,6 +131,8 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
133 cgroup_path(cgroup, path, sizeof(path)); 131 cgroup_path(cgroup, path, sizeof(path));
134 132
135 SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, path); 133 SEQ_printf(m, "\ncfs_rq[%d]:%s\n", cpu, path);
134#else
135 SEQ_printf(m, "\ncfs_rq[%d]:\n", cpu);
136#endif 136#endif
137 137
138 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock", 138 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock",
@@ -162,11 +162,64 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
162 SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running); 162 SEQ_printf(m, " .%-30s: %ld\n", "nr_running", cfs_rq->nr_running);
163 SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight); 163 SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
164#ifdef CONFIG_SCHEDSTATS 164#ifdef CONFIG_SCHEDSTATS
165 SEQ_printf(m, " .%-30s: %d\n", "bkl_count", 165#define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, rq->n);
166 rq->bkl_count); 166
167 P(yld_exp_empty);
168 P(yld_act_empty);
169 P(yld_both_empty);
170 P(yld_count);
171
172 P(sched_switch);
173 P(sched_count);
174 P(sched_goidle);
175
176 P(ttwu_count);
177 P(ttwu_local);
178
179 P(bkl_count);
180
181#undef P
167#endif 182#endif
168 SEQ_printf(m, " .%-30s: %ld\n", "nr_spread_over", 183 SEQ_printf(m, " .%-30s: %ld\n", "nr_spread_over",
169 cfs_rq->nr_spread_over); 184 cfs_rq->nr_spread_over);
185#ifdef CONFIG_FAIR_GROUP_SCHED
186#ifdef CONFIG_SMP
187 SEQ_printf(m, " .%-30s: %lu\n", "shares", cfs_rq->shares);
188#endif
189#endif
190}
191
192void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
193{
194#if defined(CONFIG_CGROUP_SCHED) && defined(CONFIG_RT_GROUP_SCHED)
195 char path[128] = "";
196 struct cgroup *cgroup = NULL;
197 struct task_group *tg = rt_rq->tg;
198
199 if (tg)
200 cgroup = tg->css.cgroup;
201
202 if (cgroup)
203 cgroup_path(cgroup, path, sizeof(path));
204
205 SEQ_printf(m, "\nrt_rq[%d]:%s\n", cpu, path);
206#else
207 SEQ_printf(m, "\nrt_rq[%d]:\n", cpu);
208#endif
209
210
211#define P(x) \
212 SEQ_printf(m, " .%-30s: %Ld\n", #x, (long long)(rt_rq->x))
213#define PN(x) \
214 SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rt_rq->x))
215
216 P(rt_nr_running);
217 P(rt_throttled);
218 PN(rt_time);
219 PN(rt_runtime);
220
221#undef PN
222#undef P
170} 223}
171 224
172static void print_cpu(struct seq_file *m, int cpu) 225static void print_cpu(struct seq_file *m, int cpu)
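The P(n) macro above relies on preprocessor stringizing (#n) so each schedstat field is printed with its own name without repeating it by hand. The same trick in a self-contained form, with a made-up stats struct:

#include <stdio.h>

struct stats {
        int yld_count;
        int sched_goidle;
        int ttwu_count;
};

int main(void)
{
        struct stats rq = { 7, 42, 1234 };

        /* #n turns the field name into the printed label; rq.n reads the field */
#define P(n) printf("  .%-30s: %d\n", #n, rq.n)
        P(yld_count);
        P(sched_goidle);
        P(ttwu_count);
#undef P

        return 0;
}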
@@ -208,6 +261,7 @@ static void print_cpu(struct seq_file *m, int cpu)
208#undef PN 261#undef PN
209 262
210 print_cfs_stats(m, cpu); 263 print_cfs_stats(m, cpu);
264 print_rt_stats(m, cpu);
211 265
212 print_rq(m, rq, cpu); 266 print_rq(m, rq, cpu);
213} 267}
@@ -279,12 +333,10 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
279 unsigned long flags; 333 unsigned long flags;
280 int num_threads = 1; 334 int num_threads = 1;
281 335
282 rcu_read_lock();
283 if (lock_task_sighand(p, &flags)) { 336 if (lock_task_sighand(p, &flags)) {
284 num_threads = atomic_read(&p->signal->count); 337 num_threads = atomic_read(&p->signal->count);
285 unlock_task_sighand(p, &flags); 338 unlock_task_sighand(p, &flags);
286 } 339 }
287 rcu_read_unlock();
288 340
289 SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, p->pid, num_threads); 341 SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, p->pid, num_threads);
290 SEQ_printf(m, 342 SEQ_printf(m,
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 08ae848b71d4..18fd17172eb6 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -63,13 +63,13 @@ unsigned int __read_mostly sysctl_sched_compat_yield;
63 63
64/* 64/*
65 * SCHED_OTHER wake-up granularity. 65 * SCHED_OTHER wake-up granularity.
66 * (default: 10 msec * (1 + ilog(ncpus)), units: nanoseconds) 66 * (default: 5 msec * (1 + ilog(ncpus)), units: nanoseconds)
67 * 67 *
68 * This option delays the preemption effects of decoupled workloads 68 * This option delays the preemption effects of decoupled workloads
69 * and reduces their over-scheduling. Synchronous workloads will still 69 * and reduces their over-scheduling. Synchronous workloads will still
70 * have immediate wakeup/sleep latencies. 70 * have immediate wakeup/sleep latencies.
71 */ 71 */
72unsigned int sysctl_sched_wakeup_granularity = 10000000UL; 72unsigned int sysctl_sched_wakeup_granularity = 5000000UL;
73 73
74const_debug unsigned int sysctl_sched_migration_cost = 500000UL; 74const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
75 75
@@ -334,6 +334,34 @@ int sched_nr_latency_handler(struct ctl_table *table, int write,
334#endif 334#endif
335 335
336/* 336/*
337 * delta *= w / rw
338 */
339static inline unsigned long
340calc_delta_weight(unsigned long delta, struct sched_entity *se)
341{
342 for_each_sched_entity(se) {
343 delta = calc_delta_mine(delta,
344 se->load.weight, &cfs_rq_of(se)->load);
345 }
346
347 return delta;
348}
349
350/*
351 * delta *= rw / w
352 */
353static inline unsigned long
354calc_delta_fair(unsigned long delta, struct sched_entity *se)
355{
356 for_each_sched_entity(se) {
357 delta = calc_delta_mine(delta,
358 cfs_rq_of(se)->load.weight, &se->load);
359 }
360
361 return delta;
362}
363
364/*
337 * The idea is to set a period in which each task runs once. 365 * The idea is to set a period in which each task runs once.
338 * 366 *
339 * When there are too many tasks (sysctl_sched_nr_latency) we have to stretch 367 * When there are too many tasks (sysctl_sched_nr_latency) we have to stretch
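The two helpers above scale a delta by w/rw (an entity's share of the period) or by rw/w (the same delta expressed in that entity's virtual time), which is why heavier entities get longer slices but slower-moving vruntime. A flat two-task sketch of those two formulas; the kernel walks the group hierarchy and uses calc_delta_mine() with overflow handling, all of which is omitted here:

#include <stdio.h>

/* delta *= w / rw : this entity's share of a period */
static unsigned long long calc_delta_weight(unsigned long long delta,
                                            unsigned long long w,
                                            unsigned long long rw)
{
        return delta * w / rw;
}

/* delta *= rw / w : the same delta in the entity's virtual time */
static unsigned long long calc_delta_fair(unsigned long long delta,
                                          unsigned long long w,
                                          unsigned long long rw)
{
        return delta * rw / w;
}

int main(void)
{
        unsigned long long period = 6000000;    /* 6 ms, in ns */
        unsigned long long rw = 1024 + 2048;    /* nice-0 task plus a heavier one */

        printf("slice(w=1024)  = %llu ns\n", calc_delta_weight(period, 1024, rw));
        printf("slice(w=2048)  = %llu ns\n", calc_delta_weight(period, 2048, rw));

        /* for the same 1 ms of real runtime, the lighter task is charged more */
        printf("vdelta(w=1024) = %llu ns\n", calc_delta_fair(1000000, 1024, rw));
        printf("vdelta(w=2048) = %llu ns\n", calc_delta_fair(1000000, 2048, rw));
        return 0;
}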
@@ -362,47 +390,22 @@ static u64 __sched_period(unsigned long nr_running)
362 */ 390 */
363static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) 391static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
364{ 392{
365 u64 slice = __sched_period(cfs_rq->nr_running); 393 return calc_delta_weight(__sched_period(cfs_rq->nr_running), se);
366
367 for_each_sched_entity(se) {
368 cfs_rq = cfs_rq_of(se);
369
370 slice *= se->load.weight;
371 do_div(slice, cfs_rq->load.weight);
372 }
373
374
375 return slice;
376} 394}
377 395
378/* 396/*
379 * We calculate the vruntime slice of a to be inserted task 397 * We calculate the vruntime slice of a to be inserted task
380 * 398 *
381 * vs = s/w = p/rw 399 * vs = s*rw/w = p
382 */ 400 */
383static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se) 401static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
384{ 402{
385 unsigned long nr_running = cfs_rq->nr_running; 403 unsigned long nr_running = cfs_rq->nr_running;
386 unsigned long weight;
387 u64 vslice;
388 404
389 if (!se->on_rq) 405 if (!se->on_rq)
390 nr_running++; 406 nr_running++;
391 407
392 vslice = __sched_period(nr_running); 408 return __sched_period(nr_running);
393
394 for_each_sched_entity(se) {
395 cfs_rq = cfs_rq_of(se);
396
397 weight = cfs_rq->load.weight;
398 if (!se->on_rq)
399 weight += se->load.weight;
400
401 vslice *= NICE_0_LOAD;
402 do_div(vslice, weight);
403 }
404
405 return vslice;
406} 409}
407 410
408/* 411/*
@@ -419,11 +422,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
419 422
420 curr->sum_exec_runtime += delta_exec; 423 curr->sum_exec_runtime += delta_exec;
421 schedstat_add(cfs_rq, exec_clock, delta_exec); 424 schedstat_add(cfs_rq, exec_clock, delta_exec);
422 delta_exec_weighted = delta_exec; 425 delta_exec_weighted = calc_delta_fair(delta_exec, curr);
423 if (unlikely(curr->load.weight != NICE_0_LOAD)) {
424 delta_exec_weighted = calc_delta_fair(delta_exec_weighted,
425 &curr->load);
426 }
427 curr->vruntime += delta_exec_weighted; 426 curr->vruntime += delta_exec_weighted;
428} 427}
429 428
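With the hunk above, every execution delta is pushed through calc_delta_fair() before being added to vruntime, so heavier entities accumulate virtual time more slowly. A toy model of that effect, assuming a flat delta * NICE_0_LOAD / weight scaling purely for illustration:

#include <stdint.h>
#include <stdio.h>

#define NICE_0_LOAD 1024ULL

struct toy_se {
        uint64_t weight;        /* load weight of the entity      */
        uint64_t vruntime;      /* accumulated virtual runtime    */
};

/* advance vruntime by a wall-clock delta, scaled by the entity's weight */
static void toy_update_curr(struct toy_se *se, uint64_t delta_exec)
{
        se->vruntime += delta_exec * NICE_0_LOAD / se->weight;
}

int main(void)
{
        struct toy_se nice0 = { .weight = 1024 };
        struct toy_se heavy = { .weight = 2048 };

        toy_update_curr(&nice0, 1000000);       /* both run 1 ms of real time */
        toy_update_curr(&heavy, 1000000);

        printf("nice0 vruntime: %llu\n", (unsigned long long)nice0.vruntime);
        printf("heavy vruntime: %llu\n", (unsigned long long)heavy.vruntime);
        return 0;
}

Running both entities for the same 1 ms of wall time leaves the weight-2048 entity with half the vruntime advance, which is what keeps the red-black tree ordering fair.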
@@ -510,22 +509,45 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
510 * Scheduling class queueing methods: 509 * Scheduling class queueing methods:
511 */ 510 */
512 511
512#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
513static void
514add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight)
515{
516 cfs_rq->task_weight += weight;
517}
518#else
519static inline void
520add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight)
521{
522}
523#endif
524
513static void 525static void
514account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) 526account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
515{ 527{
516 update_load_add(&cfs_rq->load, se->load.weight); 528 update_load_add(&cfs_rq->load, se->load.weight);
529 if (!parent_entity(se))
530 inc_cpu_load(rq_of(cfs_rq), se->load.weight);
531 if (entity_is_task(se)) {
532 add_cfs_task_weight(cfs_rq, se->load.weight);
533 list_add(&se->group_node, &cfs_rq->tasks);
534 }
517 cfs_rq->nr_running++; 535 cfs_rq->nr_running++;
518 se->on_rq = 1; 536 se->on_rq = 1;
519 list_add(&se->group_node, &cfs_rq->tasks);
520} 537}
521 538
522static void 539static void
523account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) 540account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
524{ 541{
525 update_load_sub(&cfs_rq->load, se->load.weight); 542 update_load_sub(&cfs_rq->load, se->load.weight);
543 if (!parent_entity(se))
544 dec_cpu_load(rq_of(cfs_rq), se->load.weight);
545 if (entity_is_task(se)) {
546 add_cfs_task_weight(cfs_rq, -se->load.weight);
547 list_del_init(&se->group_node);
548 }
526 cfs_rq->nr_running--; 549 cfs_rq->nr_running--;
527 se->on_rq = 0; 550 se->on_rq = 0;
528 list_del_init(&se->group_node);
529} 551}
530 552
531static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) 553static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -609,8 +631,17 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
609 631
610 if (!initial) { 632 if (!initial) {
611 /* sleeps up to a single latency don't count. */ 633 /* sleeps up to a single latency don't count. */
611 /* sleeps up to a single latency don't count. */ 633 /* sleeps up to a single latency don't count. */
612 if (sched_feat(NEW_FAIR_SLEEPERS)) 634 if (sched_feat(NEW_FAIR_SLEEPERS)) {
613 vruntime -= sysctl_sched_latency; 635 unsigned long thresh = sysctl_sched_latency;
636
637 /*
638 * convert the sleeper threshold into virtual time
639 */
640 if (sched_feat(NORMALIZED_SLEEPER))
641 thresh = calc_delta_fair(thresh, se);
642
643 vruntime -= thresh;
644 }
614 645
615 /* ensure we never gain time by being placed backwards. */ 646 /* ensure we never gain time by being placed backwards. */
616 vruntime = max_vruntime(se->vruntime, vruntime); 647 vruntime = max_vruntime(se->vruntime, vruntime);
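The NEW_FAIR_SLEEPERS path above now optionally normalizes the sleeper bonus: with NORMALIZED_SLEEPER set, the latency credit is converted into the entity's virtual-time scale before being subtracted, and the max_vruntime() clamp still guarantees a sleeper can never gain time. A rough standalone sketch of that placement step (the to_virtual() scaling below is a stand-in, not the kernel's calc_delta_fair()):

#include <stdint.h>
#include <stdio.h>

/* stand-in: scale a wall-clock threshold into an entity's virtual time */
static uint64_t to_virtual(uint64_t ns, uint64_t weight)
{
        return ns * 1024 / weight;      /* 1024 ~ NICE_0_LOAD */
}

/* give a waking sleeper a bounded credit, but never let it gain time */
static uint64_t toy_place_entity(uint64_t min_vruntime, uint64_t se_vruntime,
                                 uint64_t latency_ns, uint64_t weight)
{
        uint64_t vruntime = min_vruntime;
        uint64_t thresh = to_virtual(latency_ns, weight);

        if (thresh < vruntime)
                vruntime -= thresh;     /* sleeper credit, in virtual time */

        return vruntime > se_vruntime ? vruntime : se_vruntime;
}

int main(void)
{
        /* 20 ms latency credit for a nice-0 task waking far behind min_vruntime */
        printf("placed at: %llu\n", (unsigned long long)
               toy_place_entity(100000000, 10000000, 20000000, 1024));  /* 80000000 */
        return 0;
}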
@@ -639,21 +670,6 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup)
639 __enqueue_entity(cfs_rq, se); 670 __enqueue_entity(cfs_rq, se);
640} 671}
641 672
642static void update_avg(u64 *avg, u64 sample)
643{
644 s64 diff = sample - *avg;
645 *avg += diff >> 3;
646}
647
648static void update_avg_stats(struct cfs_rq *cfs_rq, struct sched_entity *se)
649{
650 if (!se->last_wakeup)
651 return;
652
653 update_avg(&se->avg_overlap, se->sum_exec_runtime - se->last_wakeup);
654 se->last_wakeup = 0;
655}
656
657static void 673static void
658dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) 674dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
659{ 675{
@@ -664,7 +680,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
664 680
665 update_stats_dequeue(cfs_rq, se); 681 update_stats_dequeue(cfs_rq, se);
666 if (sleep) { 682 if (sleep) {
667 update_avg_stats(cfs_rq, se);
668#ifdef CONFIG_SCHEDSTATS 683#ifdef CONFIG_SCHEDSTATS
669 if (entity_is_task(se)) { 684 if (entity_is_task(se)) {
670 struct task_struct *tsk = task_of(se); 685 struct task_struct *tsk = task_of(se);
@@ -726,17 +741,16 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
726 se->prev_sum_exec_runtime = se->sum_exec_runtime; 741 se->prev_sum_exec_runtime = se->sum_exec_runtime;
727} 742}
728 743
729static int
730wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
731
732static struct sched_entity * 744static struct sched_entity *
733pick_next(struct cfs_rq *cfs_rq, struct sched_entity *se) 745pick_next(struct cfs_rq *cfs_rq, struct sched_entity *se)
734{ 746{
735 if (!cfs_rq->next) 747 struct rq *rq = rq_of(cfs_rq);
736 return se; 748 u64 pair_slice = rq->clock - cfs_rq->pair_start;
737 749
738 if (wakeup_preempt_entity(cfs_rq->next, se) != 0) 750 if (!cfs_rq->next || pair_slice > sched_slice(cfs_rq, cfs_rq->next)) {
751 cfs_rq->pair_start = rq->clock;
739 return se; 752 return se;
753 }
740 754
741 return cfs_rq->next; 755 return cfs_rq->next;
742} 756}
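pick_next() above replaces the wakeup_preempt_entity() check with a simple timeout: the cached "next" buddy is preferred only until it has consumed one sched_slice() worth of runqueue clock since pair_start, after which the leftmost entity is picked and the pair window restarts. A compact sketch of that eligibility test, with rq->clock and the slice passed in as plain parameters (illustrative values only):

#include <stdint.h>
#include <stdio.h>

/* return 1 if the buddy should still be preferred, 0 once it used up its slice */
static int buddy_still_eligible(uint64_t now, uint64_t pair_start, uint64_t slice_ns)
{
        return (now - pair_start) <= slice_ns;
}

int main(void)
{
        uint64_t pair_start = 1000000000ULL;    /* when the pair was established */
        uint64_t slice_ns   = 4000000ULL;       /* 4 ms slice for the buddy      */

        printf("%d\n", buddy_still_eligible(pair_start + 3000000, pair_start, slice_ns)); /* 1 */
        printf("%d\n", buddy_still_eligible(pair_start + 9000000, pair_start, slice_ns)); /* 0 */
        return 0;
}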
@@ -808,7 +822,6 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
808#ifdef CONFIG_SCHED_HRTICK 822#ifdef CONFIG_SCHED_HRTICK
809static void hrtick_start_fair(struct rq *rq, struct task_struct *p) 823static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
810{ 824{
811 int requeue = rq->curr == p;
812 struct sched_entity *se = &p->se; 825 struct sched_entity *se = &p->se;
813 struct cfs_rq *cfs_rq = cfs_rq_of(se); 826 struct cfs_rq *cfs_rq = cfs_rq_of(se);
814 827
@@ -829,13 +842,13 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
829 * Don't schedule slices shorter than 10000ns, that just 842 * Don't schedule slices shorter than 10000ns, that just
830 * doesn't make sense. Rely on vruntime for fairness. 843 * doesn't make sense. Rely on vruntime for fairness.
831 */ 844 */
832 if (!requeue) 845 if (rq->curr != p)
833 delta = max(10000LL, delta); 846 delta = max_t(s64, 10000LL, delta);
834 847
835 hrtick_start(rq, delta, requeue); 848 hrtick_start(rq, delta);
836 } 849 }
837} 850}
838#else 851#else /* !CONFIG_SCHED_HRTICK */
839static inline void 852static inline void
840hrtick_start_fair(struct rq *rq, struct task_struct *p) 853hrtick_start_fair(struct rq *rq, struct task_struct *p)
841{ 854{
@@ -934,6 +947,8 @@ static void yield_task_fair(struct rq *rq)
934 * not idle and an idle cpu is available. The span of cpus to 947 * not idle and an idle cpu is available. The span of cpus to
935 * search starts with cpus closest then further out as needed, 948 * search starts with cpus closest then further out as needed,
936 * so we always favor a closer, idle cpu. 949 * so we always favor a closer, idle cpu.
950 * Domains may include CPUs that are not usable for migration,
951 * hence we need to mask them out (cpu_active_map)
937 * 952 *
938 * Returns the CPU we should wake onto. 953 * Returns the CPU we should wake onto.
939 */ 954 */
@@ -961,7 +976,8 @@ static int wake_idle(int cpu, struct task_struct *p)
961 || ((sd->flags & SD_WAKE_IDLE_FAR) 976 || ((sd->flags & SD_WAKE_IDLE_FAR)
962 && !task_hot(p, task_rq(p)->clock, sd))) { 977 && !task_hot(p, task_rq(p)->clock, sd))) {
963 cpus_and(tmp, sd->span, p->cpus_allowed); 978 cpus_and(tmp, sd->span, p->cpus_allowed);
964 for_each_cpu_mask(i, tmp) { 979 cpus_and(tmp, tmp, cpu_active_map);
980 for_each_cpu_mask_nr(i, tmp) {
965 if (idle_cpu(i)) { 981 if (idle_cpu(i)) {
966 if (i != task_cpu(p)) { 982 if (i != task_cpu(p)) {
967 schedstat_inc(p, 983 schedstat_inc(p,
@@ -976,7 +992,7 @@ static int wake_idle(int cpu, struct task_struct *p)
976 } 992 }
977 return cpu; 993 return cpu;
978} 994}
979#else 995#else /* !ARCH_HAS_SCHED_WAKE_IDLE*/
980static inline int wake_idle(int cpu, struct task_struct *p) 996static inline int wake_idle(int cpu, struct task_struct *p)
981{ 997{
982 return cpu; 998 return cpu;
@@ -987,46 +1003,143 @@ static inline int wake_idle(int cpu, struct task_struct *p)
987 1003
988static const struct sched_class fair_sched_class; 1004static const struct sched_class fair_sched_class;
989 1005
1006#ifdef CONFIG_FAIR_GROUP_SCHED
1007/*
1008 * effective_load() calculates the load change as seen from the root_task_group
1009 *
1010 * Adding load to a group doesn't make a group heavier, but can cause movement
1011 * of group shares between cpus. Assuming the shares were perfectly aligned one
1012 * can calculate the shift in shares.
1013 *
1014 * The problem is that perfectly aligning the shares is rather expensive, hence
1015 * we try to avoid doing that too often - see update_shares(), which ratelimits
1016 * this change.
1017 *
1018 * We compensate this by not only taking the current delta into account, but
1019 * also considering the delta between when the shares were last adjusted and
1020 * now.
1021 *
1022 * We still saw a performance dip; some tracing taught us that between
1023 * cgroup:/ and cgroup:/foo balancing the number of affine wakeups increased
1024 * significantly. Therefore try to bias the error in the direction of failing
1025 * the affine wakeup.
1026 *
1027 */
1028static long effective_load(struct task_group *tg, int cpu,
1029 long wl, long wg)
1030{
1031 struct sched_entity *se = tg->se[cpu];
1032
1033 if (!tg->parent)
1034 return wl;
1035
1036 /*
1037 * By not taking the decrease of shares on the other cpu into
1038 * account our error leans towards reducing the affine wakeups.
1039 */
1040 if (!wl && sched_feat(ASYM_EFF_LOAD))
1041 return wl;
1042
1043 for_each_sched_entity(se) {
1044 long S, rw, s, a, b;
1045 long more_w;
1046
1047 /*
1048 * Instead of using this increment, also add the difference
1049 * between when the shares were last updated and now.
1050 */
1051 more_w = se->my_q->load.weight - se->my_q->rq_weight;
1052 wl += more_w;
1053 wg += more_w;
1054
1055 S = se->my_q->tg->shares;
1056 s = se->my_q->shares;
1057 rw = se->my_q->rq_weight;
1058
1059 a = S*(rw + wl);
1060 b = S*rw + s*wg;
1061
1062 wl = s*(a-b);
1063
1064 if (likely(b))
1065 wl /= b;
1066
1067 /*
1068 * Assume the group is already running and will
1069 * thus already be accounted for in the weight.
1070 *
1071 * That is, moving shares between CPUs, does not
1072 * alter the group weight.
1073 */
1074 wg = 0;
1075 }
1076
1077 return wl;
1078}
1079
1080#else
1081
1082static inline unsigned long effective_load(struct task_group *tg, int cpu,
1083 unsigned long wl, unsigned long wg)
1084{
1085 return wl;
1086}
1087
1088#endif
1089
990static int 1090static int
991wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq, 1091wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
992 struct task_struct *p, int prev_cpu, int this_cpu, int sync, 1092 struct task_struct *p, int prev_cpu, int this_cpu, int sync,
993 int idx, unsigned long load, unsigned long this_load, 1093 int idx, unsigned long load, unsigned long this_load,
994 unsigned int imbalance) 1094 unsigned int imbalance)
995{ 1095{
996 struct task_struct *curr = this_rq->curr; 1096 struct task_struct *curr = this_rq->curr;
1097 struct task_group *tg;
997 unsigned long tl = this_load; 1098 unsigned long tl = this_load;
998 unsigned long tl_per_task; 1099 unsigned long tl_per_task;
1100 unsigned long weight;
999 int balanced; 1101 int balanced;
1000 1102
1001 if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS)) 1103 if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS))
1002 return 0; 1104 return 0;
1003 1105
1106 if (!sync && sched_feat(SYNC_WAKEUPS) &&
1107 curr->se.avg_overlap < sysctl_sched_migration_cost &&
1108 p->se.avg_overlap < sysctl_sched_migration_cost)
1109 sync = 1;
1110
1004 /* 1111 /*
1005 * If sync wakeup then subtract the (maximum possible) 1112 * If sync wakeup then subtract the (maximum possible)
1006 * effect of the currently running task from the load 1113 * effect of the currently running task from the load
1007 * of the current CPU: 1114 * of the current CPU:
1008 */ 1115 */
1009 if (sync) 1116 if (sync) {
1010 tl -= current->se.load.weight; 1117 tg = task_group(current);
1118 weight = current->se.load.weight;
1119
1120 tl += effective_load(tg, this_cpu, -weight, -weight);
1121 load += effective_load(tg, prev_cpu, 0, -weight);
1122 }
1123
1124 tg = task_group(p);
1125 weight = p->se.load.weight;
1011 1126
1012 balanced = 100*(tl + p->se.load.weight) <= imbalance*load; 1127 balanced = 100*(tl + effective_load(tg, this_cpu, weight, weight)) <=
1128 imbalance*(load + effective_load(tg, prev_cpu, 0, weight));
1013 1129
1014 /* 1130 /*
1015 * If the currently running task will sleep within 1131 * If the currently running task will sleep within
1016 * a reasonable amount of time then attract this newly 1132 * a reasonable amount of time then attract this newly
1017 * woken task: 1133 * woken task:
1018 */ 1134 */
1019 if (sync && balanced && curr->sched_class == &fair_sched_class) { 1135 if (sync && balanced)
1020 if (curr->se.avg_overlap < sysctl_sched_migration_cost && 1136 return 1;
1021 p->se.avg_overlap < sysctl_sched_migration_cost)
1022 return 1;
1023 }
1024 1137
1025 schedstat_inc(p, se.nr_wakeups_affine_attempts); 1138 schedstat_inc(p, se.nr_wakeups_affine_attempts);
1026 tl_per_task = cpu_avg_load_per_task(this_cpu); 1139 tl_per_task = cpu_avg_load_per_task(this_cpu);
1027 1140
1028 if ((tl <= load && tl + target_load(prev_cpu, idx) <= tl_per_task) || 1141 if (balanced || (tl <= load && tl + target_load(prev_cpu, idx) <=
1029 balanced) { 1142 tl_per_task)) {
1030 /* 1143 /*
1031 * This domain has SD_WAKE_AFFINE and 1144 * This domain has SD_WAKE_AFFINE and
1032 * p is cache cold in this domain, and 1145 * p is cache cold in this domain, and
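The per-level update in effective_load() above reduces to wl' = s * (a - b) / b with a = S * (rw + wl) and b = S * rw + s * wg: the shift in this CPU's portion of the group's shares S when wl of task weight lands here while wg lands group-wide. A standalone sketch of one level of that recurrence in signed 64-bit arithmetic (it mirrors the expression above, not the kernel's cfs_rq/task_group layout):

#include <stdio.h>

/*
 * One level of the recurrence above:
 *   S  - total shares of the task group
 *   rw - this cpu's runqueue weight for the group
 *   s  - this cpu's current share of the group
 *   wl - task weight arriving on this cpu
 *   wg - weight arriving in the group as a whole
 */
static long long level_load_change(long long S, long long rw, long long s,
                                   long long wl, long long wg)
{
        long long a = S * (rw + wl);
        long long b = S * rw + s * wg;
        long long d = s * (a - b);

        return b ? d / b : d;
}

int main(void)
{
        /* a weight-1024 task waking into a group holding 1/4 of its shares here */
        printf("share shift: %lld\n",
               level_load_change(4096, 3072, 1024, 1024, 1024));
        return 0;
}

With these example numbers the wakeup shifts roughly 236 units of effective load onto this CPU, and as the comment explains, wg is zeroed after the first level because moving shares between CPUs does not change the group's total weight.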
@@ -1045,16 +1158,17 @@ static int select_task_rq_fair(struct task_struct *p, int sync)
1045 struct sched_domain *sd, *this_sd = NULL; 1158 struct sched_domain *sd, *this_sd = NULL;
1046 int prev_cpu, this_cpu, new_cpu; 1159 int prev_cpu, this_cpu, new_cpu;
1047 unsigned long load, this_load; 1160 unsigned long load, this_load;
1048 struct rq *rq, *this_rq; 1161 struct rq *this_rq;
1049 unsigned int imbalance; 1162 unsigned int imbalance;
1050 int idx; 1163 int idx;
1051 1164
1052 prev_cpu = task_cpu(p); 1165 prev_cpu = task_cpu(p);
1053 rq = task_rq(p);
1054 this_cpu = smp_processor_id(); 1166 this_cpu = smp_processor_id();
1055 this_rq = cpu_rq(this_cpu); 1167 this_rq = cpu_rq(this_cpu);
1056 new_cpu = prev_cpu; 1168 new_cpu = prev_cpu;
1057 1169
1170 if (prev_cpu == this_cpu)
1171 goto out;
1058 /* 1172 /*
1059 * 'this_sd' is the first domain that both 1173 * 'this_sd' is the first domain that both
1060 * this_cpu and prev_cpu are present in: 1174 * this_cpu and prev_cpu are present in:
@@ -1082,13 +1196,10 @@ static int select_task_rq_fair(struct task_struct *p, int sync)
1082 load = source_load(prev_cpu, idx); 1196 load = source_load(prev_cpu, idx);
1083 this_load = target_load(this_cpu, idx); 1197 this_load = target_load(this_cpu, idx);
1084 1198
1085 if (wake_affine(rq, this_sd, this_rq, p, prev_cpu, this_cpu, sync, idx, 1199 if (wake_affine(this_sd, this_rq, p, prev_cpu, this_cpu, sync, idx,
1086 load, this_load, imbalance)) 1200 load, this_load, imbalance))
1087 return this_cpu; 1201 return this_cpu;
1088 1202
1089 if (prev_cpu == this_cpu)
1090 goto out;
1091
1092 /* 1203 /*
1093 * Start passive balancing when half the imbalance_pct 1204 * Start passive balancing when half the imbalance_pct
1094 * limit is reached. 1205 * limit is reached.
@@ -1111,64 +1222,24 @@ static unsigned long wakeup_gran(struct sched_entity *se)
1111 unsigned long gran = sysctl_sched_wakeup_granularity; 1222 unsigned long gran = sysctl_sched_wakeup_granularity;
1112 1223
1113 /* 1224 /*
1114 * More easily preempt - nice tasks, while not making 1225 * More easily preempt - nice tasks, while not making it harder for
1115 * it harder for + nice tasks. 1226 * + nice tasks.
1116 */ 1227 */
1117 if (unlikely(se->load.weight > NICE_0_LOAD)) 1228 if (sched_feat(ASYM_GRAN))
1118 gran = calc_delta_fair(gran, &se->load); 1229 gran = calc_delta_mine(gran, NICE_0_LOAD, &se->load);
1119 1230
1120 return gran; 1231 return gran;
1121} 1232}
1122 1233
1123/* 1234/*
1124 * Should 'se' preempt 'curr'.
1125 *
1126 * |s1
1127 * |s2
1128 * |s3
1129 * g
1130 * |<--->|c
1131 *
1132 * w(c, s1) = -1
1133 * w(c, s2) = 0
1134 * w(c, s3) = 1
1135 *
1136 */
1137static int
1138wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
1139{
1140 s64 gran, vdiff = curr->vruntime - se->vruntime;
1141
1142 if (vdiff < 0)
1143 return -1;
1144
1145 gran = wakeup_gran(curr);
1146 if (vdiff > gran)
1147 return 1;
1148
1149 return 0;
1150}
1151
1152/* return depth at which a sched entity is present in the hierarchy */
1153static inline int depth_se(struct sched_entity *se)
1154{
1155 int depth = 0;
1156
1157 for_each_sched_entity(se)
1158 depth++;
1159
1160 return depth;
1161}
1162
1163/*
1164 * Preempt the current task with a newly woken task if needed: 1235 * Preempt the current task with a newly woken task if needed:
1165 */ 1236 */
1166static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) 1237static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
1167{ 1238{
1168 struct task_struct *curr = rq->curr; 1239 struct task_struct *curr = rq->curr;
1169 struct cfs_rq *cfs_rq = task_cfs_rq(curr); 1240 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
1170 struct sched_entity *se = &curr->se, *pse = &p->se; 1241 struct sched_entity *se = &curr->se, *pse = &p->se;
1171 int se_depth, pse_depth; 1242 s64 delta_exec;
1172 1243
1173 if (unlikely(rt_prio(p->prio))) { 1244 if (unlikely(rt_prio(p->prio))) {
1174 update_rq_clock(rq); 1245 update_rq_clock(rq);
@@ -1177,13 +1248,19 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
1177 return; 1248 return;
1178 } 1249 }
1179 1250
1180 se->last_wakeup = se->sum_exec_runtime;
1181 if (unlikely(se == pse)) 1251 if (unlikely(se == pse))
1182 return; 1252 return;
1183 1253
1184 cfs_rq_of(pse)->next = pse; 1254 cfs_rq_of(pse)->next = pse;
1185 1255
1186 /* 1256 /*
1257 * We can come here with TIF_NEED_RESCHED already set from new task
1258 * wake up path.
1259 */
1260 if (test_tsk_need_resched(curr))
1261 return;
1262
1263 /*
1187 * Batch tasks do not preempt (their preemption is driven by 1264 * Batch tasks do not preempt (their preemption is driven by
1188 * the tick): 1265 * the tick):
1189 */ 1266 */
@@ -1193,33 +1270,15 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
1193 if (!sched_feat(WAKEUP_PREEMPT)) 1270 if (!sched_feat(WAKEUP_PREEMPT))
1194 return; 1271 return;
1195 1272
1196 /* 1273 if (sched_feat(WAKEUP_OVERLAP) && (sync ||
1197 * preemption test can be made between sibling entities who are in the 1274 (se->avg_overlap < sysctl_sched_migration_cost &&
1198 * same cfs_rq i.e who have a common parent. Walk up the hierarchy of 1275 pse->avg_overlap < sysctl_sched_migration_cost))) {
1199 * both tasks until we find their ancestors who are siblings of common 1276 resched_task(curr);
1200 * parent. 1277 return;
1201 */
1202
1203 /* First walk up until both entities are at same depth */
1204 se_depth = depth_se(se);
1205 pse_depth = depth_se(pse);
1206
1207 while (se_depth > pse_depth) {
1208 se_depth--;
1209 se = parent_entity(se);
1210 }
1211
1212 while (pse_depth > se_depth) {
1213 pse_depth--;
1214 pse = parent_entity(pse);
1215 }
1216
1217 while (!is_same_group(se, pse)) {
1218 se = parent_entity(se);
1219 pse = parent_entity(pse);
1220 } 1278 }
1221 1279
1222 if (wakeup_preempt_entity(se, pse) == 1) 1280 delta_exec = se->sum_exec_runtime - se->prev_sum_exec_runtime;
1281 if (delta_exec > wakeup_gran(pse))
1223 resched_task(curr); 1282 resched_task(curr);
1224} 1283}
1225 1284
@@ -1278,19 +1337,9 @@ __load_balance_iterator(struct cfs_rq *cfs_rq, struct list_head *next)
1278 if (next == &cfs_rq->tasks) 1337 if (next == &cfs_rq->tasks)
1279 return NULL; 1338 return NULL;
1280 1339
1281 /* Skip over entities that are not tasks */ 1340 se = list_entry(next, struct sched_entity, group_node);
1282 do { 1341 p = task_of(se);
1283 se = list_entry(next, struct sched_entity, group_node); 1342 cfs_rq->balance_iterator = next->next;
1284 next = next->next;
1285 } while (next != &cfs_rq->tasks && !entity_is_task(se));
1286
1287 if (next == &cfs_rq->tasks)
1288 return NULL;
1289
1290 cfs_rq->balance_iterator = next;
1291
1292 if (entity_is_task(se))
1293 p = task_of(se);
1294 1343
1295 return p; 1344 return p;
1296} 1345}
@@ -1309,75 +1358,82 @@ static struct task_struct *load_balance_next_fair(void *arg)
1309 return __load_balance_iterator(cfs_rq, cfs_rq->balance_iterator); 1358 return __load_balance_iterator(cfs_rq, cfs_rq->balance_iterator);
1310} 1359}
1311 1360
1312#ifdef CONFIG_FAIR_GROUP_SCHED 1361static unsigned long
1313static int cfs_rq_best_prio(struct cfs_rq *cfs_rq) 1362__load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
1363 unsigned long max_load_move, struct sched_domain *sd,
1364 enum cpu_idle_type idle, int *all_pinned, int *this_best_prio,
1365 struct cfs_rq *cfs_rq)
1314{ 1366{
1315 struct sched_entity *curr; 1367 struct rq_iterator cfs_rq_iterator;
1316 struct task_struct *p;
1317
1318 if (!cfs_rq->nr_running || !first_fair(cfs_rq))
1319 return MAX_PRIO;
1320
1321 curr = cfs_rq->curr;
1322 if (!curr)
1323 curr = __pick_next_entity(cfs_rq);
1324 1368
1325 p = task_of(curr); 1369 cfs_rq_iterator.start = load_balance_start_fair;
1370 cfs_rq_iterator.next = load_balance_next_fair;
1371 cfs_rq_iterator.arg = cfs_rq;
1326 1372
1327 return p->prio; 1373 return balance_tasks(this_rq, this_cpu, busiest,
1374 max_load_move, sd, idle, all_pinned,
1375 this_best_prio, &cfs_rq_iterator);
1328} 1376}
1329#endif
1330 1377
1378#ifdef CONFIG_FAIR_GROUP_SCHED
1331static unsigned long 1379static unsigned long
1332load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, 1380load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
1333 unsigned long max_load_move, 1381 unsigned long max_load_move,
1334 struct sched_domain *sd, enum cpu_idle_type idle, 1382 struct sched_domain *sd, enum cpu_idle_type idle,
1335 int *all_pinned, int *this_best_prio) 1383 int *all_pinned, int *this_best_prio)
1336{ 1384{
1337 struct cfs_rq *busy_cfs_rq;
1338 long rem_load_move = max_load_move; 1385 long rem_load_move = max_load_move;
1339 struct rq_iterator cfs_rq_iterator; 1386 int busiest_cpu = cpu_of(busiest);
1340 1387 struct task_group *tg;
1341 cfs_rq_iterator.start = load_balance_start_fair;
1342 cfs_rq_iterator.next = load_balance_next_fair;
1343 1388
1344 for_each_leaf_cfs_rq(busiest, busy_cfs_rq) { 1389 rcu_read_lock();
1345#ifdef CONFIG_FAIR_GROUP_SCHED 1390 update_h_load(busiest_cpu);
1346 struct cfs_rq *this_cfs_rq;
1347 long imbalance;
1348 unsigned long maxload;
1349 1391
1350 this_cfs_rq = cpu_cfs_rq(busy_cfs_rq, this_cpu); 1392 list_for_each_entry_rcu(tg, &task_groups, list) {
1393 struct cfs_rq *busiest_cfs_rq = tg->cfs_rq[busiest_cpu];
1394 unsigned long busiest_h_load = busiest_cfs_rq->h_load;
1395 unsigned long busiest_weight = busiest_cfs_rq->load.weight;
1396 u64 rem_load, moved_load;
1351 1397
1352 imbalance = busy_cfs_rq->load.weight - this_cfs_rq->load.weight; 1398 /*
1353 /* Don't pull if this_cfs_rq has more load than busy_cfs_rq */ 1399 * empty group
1354 if (imbalance <= 0) 1400 */
1401 if (!busiest_cfs_rq->task_weight)
1355 continue; 1402 continue;
1356 1403
1357 /* Don't pull more than imbalance/2 */ 1404 rem_load = (u64)rem_load_move * busiest_weight;
1358 imbalance /= 2; 1405 rem_load = div_u64(rem_load, busiest_h_load + 1);
1359 maxload = min(rem_load_move, imbalance);
1360 1406
1361 *this_best_prio = cfs_rq_best_prio(this_cfs_rq); 1407 moved_load = __load_balance_fair(this_rq, this_cpu, busiest,
1362#else 1408 rem_load, sd, idle, all_pinned, this_best_prio,
1363# define maxload rem_load_move 1409 tg->cfs_rq[busiest_cpu]);
1364#endif 1410
1365 /* 1411 if (!moved_load)
1366 * pass busy_cfs_rq argument into 1412 continue;
1367 * load_balance_[start|next]_fair iterators 1413
1368 */ 1414 moved_load *= busiest_h_load;
1369 cfs_rq_iterator.arg = busy_cfs_rq; 1415 moved_load = div_u64(moved_load, busiest_weight + 1);
1370 rem_load_move -= balance_tasks(this_rq, this_cpu, busiest,
1371 maxload, sd, idle, all_pinned,
1372 this_best_prio,
1373 &cfs_rq_iterator);
1374 1416
1375 if (rem_load_move <= 0) 1417 rem_load_move -= moved_load;
1418 if (rem_load_move < 0)
1376 break; 1419 break;
1377 } 1420 }
1421 rcu_read_unlock();
1378 1422
1379 return max_load_move - rem_load_move; 1423 return max_load_move - rem_load_move;
1380} 1424}
1425#else
1426static unsigned long
1427load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
1428 unsigned long max_load_move,
1429 struct sched_domain *sd, enum cpu_idle_type idle,
1430 int *all_pinned, int *this_best_prio)
1431{
1432 return __load_balance_fair(this_rq, this_cpu, busiest,
1433 max_load_move, sd, idle, all_pinned,
1434 this_best_prio, &busiest->cfs);
1435}
1436#endif
1381 1437
1382static int 1438static int
1383move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, 1439move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
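The group-aware load_balance_fair() above converts the remaining global load into each group's local weight units with rem_load = rem_load_move * busiest_weight / (busiest_h_load + 1), and scales whatever __load_balance_fair() actually moved back by the inverse ratio. A small sketch of just that two-way scaling over plain integers (not the kernel's cfs_rq fields):

#include <stdint.h>
#include <stdio.h>

/* scale a global load amount into a group's local weight units */
static uint64_t to_group_units(uint64_t rem_load_move, uint64_t grp_weight,
                               uint64_t grp_h_load)
{
        return rem_load_move * grp_weight / (grp_h_load + 1);
}

/* scale a locally moved amount back into global units */
static uint64_t to_global_units(uint64_t moved_local, uint64_t grp_weight,
                                uint64_t grp_h_load)
{
        return moved_local * grp_h_load / (grp_weight + 1);
}

int main(void)
{
        uint64_t rem = 2048, weight = 3072, h_load = 1024;
        uint64_t local = to_group_units(rem, weight, h_load);

        printf("local quota:  %llu\n", (unsigned long long)local);
        printf("global moved: %llu\n",
               (unsigned long long)to_global_units(local, weight, h_load));
        return 0;
}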
@@ -1402,7 +1458,7 @@ move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
1402 1458
1403 return 0; 1459 return 0;
1404} 1460}
1405#endif 1461#endif /* CONFIG_SMP */
1406 1462
1407/* 1463/*
1408 * scheduler tick hitting a task of our scheduling class: 1464 * scheduler tick hitting a task of our scheduling class:
@@ -1446,10 +1502,10 @@ static void task_new_fair(struct rq *rq, struct task_struct *p)
1446 * 'current' within the tree based on its new key value. 1502 * 'current' within the tree based on its new key value.
1447 */ 1503 */
1448 swap(curr->vruntime, se->vruntime); 1504 swap(curr->vruntime, se->vruntime);
1505 resched_task(rq->curr);
1449 } 1506 }
1450 1507
1451 enqueue_task_fair(rq, p, 0); 1508 enqueue_task_fair(rq, p, 0);
1452 resched_task(rq->curr);
1453} 1509}
1454 1510
1455/* 1511/*
@@ -1468,7 +1524,7 @@ static void prio_changed_fair(struct rq *rq, struct task_struct *p,
1468 if (p->prio > oldprio) 1524 if (p->prio > oldprio)
1469 resched_task(rq->curr); 1525 resched_task(rq->curr);
1470 } else 1526 } else
1471 check_preempt_curr(rq, p); 1527 check_preempt_curr(rq, p, 0);
1472} 1528}
1473 1529
1474/* 1530/*
@@ -1485,7 +1541,7 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p,
1485 if (running) 1541 if (running)
1486 resched_task(rq->curr); 1542 resched_task(rq->curr);
1487 else 1543 else
1488 check_preempt_curr(rq, p); 1544 check_preempt_curr(rq, p, 0);
1489} 1545}
1490 1546
1491/* Account for a task changing its policy or group. 1547/* Account for a task changing its policy or group.
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index 1c7283cb9581..7c9e8f4a049f 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -1,4 +1,5 @@
1SCHED_FEAT(NEW_FAIR_SLEEPERS, 1) 1SCHED_FEAT(NEW_FAIR_SLEEPERS, 1)
2SCHED_FEAT(NORMALIZED_SLEEPER, 1)
2SCHED_FEAT(WAKEUP_PREEMPT, 1) 3SCHED_FEAT(WAKEUP_PREEMPT, 1)
3SCHED_FEAT(START_DEBIT, 1) 4SCHED_FEAT(START_DEBIT, 1)
4SCHED_FEAT(AFFINE_WAKEUPS, 1) 5SCHED_FEAT(AFFINE_WAKEUPS, 1)
@@ -6,5 +7,8 @@ SCHED_FEAT(CACHE_HOT_BUDDY, 1)
6SCHED_FEAT(SYNC_WAKEUPS, 1) 7SCHED_FEAT(SYNC_WAKEUPS, 1)
7SCHED_FEAT(HRTICK, 1) 8SCHED_FEAT(HRTICK, 1)
8SCHED_FEAT(DOUBLE_TICK, 0) 9SCHED_FEAT(DOUBLE_TICK, 0)
9SCHED_FEAT(NORMALIZED_SLEEPER, 1) 10SCHED_FEAT(ASYM_GRAN, 1)
10SCHED_FEAT(DEADLINE, 1) 11SCHED_FEAT(LB_BIAS, 1)
12SCHED_FEAT(LB_WAKEUP_UPDATE, 1)
13SCHED_FEAT(ASYM_EFF_LOAD, 1)
14SCHED_FEAT(WAKEUP_OVERLAP, 0)
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index 3a4f92dbbe66..dec4ccabe2f5 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -14,7 +14,7 @@ static int select_task_rq_idle(struct task_struct *p, int sync)
14/* 14/*
15 * Idle tasks are unconditionally rescheduled: 15 * Idle tasks are unconditionally rescheduled:
16 */ 16 */
17static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p) 17static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int sync)
18{ 18{
19 resched_task(rq->idle); 19 resched_task(rq->idle);
20} 20}
@@ -76,7 +76,7 @@ static void switched_to_idle(struct rq *rq, struct task_struct *p,
76 if (running) 76 if (running)
77 resched_task(rq->curr); 77 resched_task(rq->curr);
78 else 78 else
79 check_preempt_curr(rq, p); 79 check_preempt_curr(rq, p, 0);
80} 80}
81 81
82static void prio_changed_idle(struct rq *rq, struct task_struct *p, 82static void prio_changed_idle(struct rq *rq, struct task_struct *p,
@@ -93,7 +93,7 @@ static void prio_changed_idle(struct rq *rq, struct task_struct *p,
93 if (p->prio > oldprio) 93 if (p->prio > oldprio)
94 resched_task(rq->curr); 94 resched_task(rq->curr);
95 } else 95 } else
96 check_preempt_curr(rq, p); 96 check_preempt_curr(rq, p, 0);
97} 97}
98 98
99/* 99/*
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 0f3c19197fa4..cdf5740ab03e 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -12,6 +12,9 @@ static inline int rt_overloaded(struct rq *rq)
12 12
13static inline void rt_set_overload(struct rq *rq) 13static inline void rt_set_overload(struct rq *rq)
14{ 14{
15 if (!rq->online)
16 return;
17
15 cpu_set(rq->cpu, rq->rd->rto_mask); 18 cpu_set(rq->cpu, rq->rd->rto_mask);
16 /* 19 /*
17 * Make sure the mask is visible before we set 20 * Make sure the mask is visible before we set
@@ -26,6 +29,9 @@ static inline void rt_set_overload(struct rq *rq)
26 29
27static inline void rt_clear_overload(struct rq *rq) 30static inline void rt_clear_overload(struct rq *rq)
28{ 31{
32 if (!rq->online)
33 return;
34
29 /* the order here really doesn't matter */ 35 /* the order here really doesn't matter */
30 atomic_dec(&rq->rd->rto_count); 36 atomic_dec(&rq->rd->rto_count);
31 cpu_clear(rq->cpu, rq->rd->rto_mask); 37 cpu_clear(rq->cpu, rq->rd->rto_mask);
@@ -96,12 +102,12 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se);
96 102
97static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) 103static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
98{ 104{
105 struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr;
99 struct sched_rt_entity *rt_se = rt_rq->rt_se; 106 struct sched_rt_entity *rt_se = rt_rq->rt_se;
100 107
101 if (rt_se && !on_rt_rq(rt_se) && rt_rq->rt_nr_running) { 108 if (rt_rq->rt_nr_running) {
102 struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr; 109 if (rt_se && !on_rt_rq(rt_se))
103 110 enqueue_rt_entity(rt_se);
104 enqueue_rt_entity(rt_se);
105 if (rt_rq->highest_prio < curr->prio) 111 if (rt_rq->highest_prio < curr->prio)
106 resched_task(curr); 112 resched_task(curr);
107 } 113 }
@@ -155,7 +161,7 @@ static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
155 return &rt_rq->tg->rt_bandwidth; 161 return &rt_rq->tg->rt_bandwidth;
156} 162}
157 163
158#else 164#else /* !CONFIG_RT_GROUP_SCHED */
159 165
160static inline u64 sched_rt_runtime(struct rt_rq *rt_rq) 166static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
161{ 167{
@@ -193,6 +199,8 @@ static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
193 199
194static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq) 200static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
195{ 201{
202 if (rt_rq->rt_nr_running)
203 resched_task(rq_of_rt_rq(rt_rq)->curr);
196} 204}
197 205
198static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq) 206static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
@@ -220,14 +228,210 @@ static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
220 return &def_rt_bandwidth; 228 return &def_rt_bandwidth;
221} 229}
222 230
223#endif 231#endif /* CONFIG_RT_GROUP_SCHED */
232
233#ifdef CONFIG_SMP
234/*
235 * We ran out of runtime, see if we can borrow some from our neighbours.
236 */
237static int do_balance_runtime(struct rt_rq *rt_rq)
238{
239 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
240 struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
241 int i, weight, more = 0;
242 u64 rt_period;
243
244 weight = cpus_weight(rd->span);
245
246 spin_lock(&rt_b->rt_runtime_lock);
247 rt_period = ktime_to_ns(rt_b->rt_period);
248 for_each_cpu_mask_nr(i, rd->span) {
249 struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
250 s64 diff;
251
252 if (iter == rt_rq)
253 continue;
254
255 spin_lock(&iter->rt_runtime_lock);
256 /*
257 * Either all rqs have inf runtime and there's nothing to steal
258 * or __disable_runtime() below sets a specific rq to inf to
259 * indicate it's been disabled and disallow stealing.
260 */
261 if (iter->rt_runtime == RUNTIME_INF)
262 goto next;
263
264 /*
265 * From runqueues with spare time, take 1/n part of their
266 * spare time, but no more than our period.
267 */
268 diff = iter->rt_runtime - iter->rt_time;
269 if (diff > 0) {
270 diff = div_u64((u64)diff, weight);
271 if (rt_rq->rt_runtime + diff > rt_period)
272 diff = rt_period - rt_rq->rt_runtime;
273 iter->rt_runtime -= diff;
274 rt_rq->rt_runtime += diff;
275 more = 1;
276 if (rt_rq->rt_runtime == rt_period) {
277 spin_unlock(&iter->rt_runtime_lock);
278 break;
279 }
280 }
281next:
282 spin_unlock(&iter->rt_runtime_lock);
283 }
284 spin_unlock(&rt_b->rt_runtime_lock);
285
286 return more;
287}
288
289/*
290 * Ensure this RQ takes back all the runtime it lent to its neighbours.
291 */
292static void __disable_runtime(struct rq *rq)
293{
294 struct root_domain *rd = rq->rd;
295 struct rt_rq *rt_rq;
296
297 if (unlikely(!scheduler_running))
298 return;
299
300 for_each_leaf_rt_rq(rt_rq, rq) {
301 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
302 s64 want;
303 int i;
304
305 spin_lock(&rt_b->rt_runtime_lock);
306 spin_lock(&rt_rq->rt_runtime_lock);
307 /*
308 * Either we're all inf and nobody needs to borrow, or we're
309 * already disabled and thus have nothing to do, or we have
310 * exactly the right amount of runtime to take out.
311 */
312 if (rt_rq->rt_runtime == RUNTIME_INF ||
313 rt_rq->rt_runtime == rt_b->rt_runtime)
314 goto balanced;
315 spin_unlock(&rt_rq->rt_runtime_lock);
316
317 /*
318 * Calculate the difference between what we started out with
319 * and what we currently have; that's the amount of runtime
320 * we lent out and now have to reclaim.
321 */
322 want = rt_b->rt_runtime - rt_rq->rt_runtime;
323
324 /*
325 * Greedy reclaim, take back as much as we can.
326 */
327 for_each_cpu_mask(i, rd->span) {
328 struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
329 s64 diff;
330
331 /*
332 * Can't reclaim from ourselves or disabled runqueues.
333 */
334 if (iter == rt_rq || iter->rt_runtime == RUNTIME_INF)
335 continue;
336
337 spin_lock(&iter->rt_runtime_lock);
338 if (want > 0) {
339 diff = min_t(s64, iter->rt_runtime, want);
340 iter->rt_runtime -= diff;
341 want -= diff;
342 } else {
343 iter->rt_runtime -= want;
344 want -= want;
345 }
346 spin_unlock(&iter->rt_runtime_lock);
347
348 if (!want)
349 break;
350 }
351
352 spin_lock(&rt_rq->rt_runtime_lock);
353 /*
354 * We cannot be left wanting - that would mean some runtime
355 * leaked out of the system.
356 */
357 BUG_ON(want);
358balanced:
359 /*
360 * Disable all the borrow logic by pretending we have inf
361 * runtime - in which case borrowing doesn't make sense.
362 */
363 rt_rq->rt_runtime = RUNTIME_INF;
364 spin_unlock(&rt_rq->rt_runtime_lock);
365 spin_unlock(&rt_b->rt_runtime_lock);
366 }
367}
368
369static void disable_runtime(struct rq *rq)
370{
371 unsigned long flags;
372
373 spin_lock_irqsave(&rq->lock, flags);
374 __disable_runtime(rq);
375 spin_unlock_irqrestore(&rq->lock, flags);
376}
377
378static void __enable_runtime(struct rq *rq)
379{
380 struct rt_rq *rt_rq;
381
382 if (unlikely(!scheduler_running))
383 return;
384
385 /*
386 * Reset each runqueue's bandwidth settings
387 */
388 for_each_leaf_rt_rq(rt_rq, rq) {
389 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
390
391 spin_lock(&rt_b->rt_runtime_lock);
392 spin_lock(&rt_rq->rt_runtime_lock);
393 rt_rq->rt_runtime = rt_b->rt_runtime;
394 rt_rq->rt_time = 0;
395 rt_rq->rt_throttled = 0;
396 spin_unlock(&rt_rq->rt_runtime_lock);
397 spin_unlock(&rt_b->rt_runtime_lock);
398 }
399}
400
401static void enable_runtime(struct rq *rq)
402{
403 unsigned long flags;
404
405 spin_lock_irqsave(&rq->lock, flags);
406 __enable_runtime(rq);
407 spin_unlock_irqrestore(&rq->lock, flags);
408}
409
410static int balance_runtime(struct rt_rq *rt_rq)
411{
412 int more = 0;
413
414 if (rt_rq->rt_time > rt_rq->rt_runtime) {
415 spin_unlock(&rt_rq->rt_runtime_lock);
416 more = do_balance_runtime(rt_rq);
417 spin_lock(&rt_rq->rt_runtime_lock);
418 }
419
420 return more;
421}
422#else /* !CONFIG_SMP */
423static inline int balance_runtime(struct rt_rq *rt_rq)
424{
425 return 0;
426}
427#endif /* CONFIG_SMP */
224 428
225static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) 429static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
226{ 430{
227 int i, idle = 1; 431 int i, idle = 1;
228 cpumask_t span; 432 cpumask_t span;
229 433
230 if (rt_b->rt_runtime == RUNTIME_INF) 434 if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
231 return 1; 435 return 1;
232 436
233 span = sched_rt_period_mask(); 437 span = sched_rt_period_mask();
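do_balance_runtime() above lets a runqueue that ran out of RT budget borrow from its neighbours: from each donor it takes at most a 1/nr_cpus share of the donor's spare runtime, stopping once the borrower reaches a full period. A standalone sketch of that loop over plain arrays (locking, RUNTIME_INF handling and the root-domain span are all omitted here):

#include <stdint.h>
#include <stdio.h>

/*
 * Borrow spare RT runtime from neighbours: from each donor take 1/ncpus of
 * (runtime - time already used), capping the borrower at rt_period.
 */
static void toy_balance_runtime(int64_t *runtime, const int64_t *used,
                                int ncpus, int target, int64_t rt_period)
{
        for (int i = 0; i < ncpus; i++) {
                if (i == target)
                        continue;

                int64_t diff = runtime[i] - used[i];    /* donor's spare time */
                if (diff <= 0)
                        continue;

                diff /= ncpus;                          /* take only 1/n of it */
                if (runtime[target] + diff > rt_period)
                        diff = rt_period - runtime[target];

                runtime[i] -= diff;
                runtime[target] += diff;
                if (runtime[target] == rt_period)
                        break;
        }
}

int main(void)
{
        int64_t runtime[4] = { 950000, 950000, 950000, 950000 };
        int64_t used[4]    = { 950000, 100000, 200000, 900000 };

        toy_balance_runtime(runtime, used, 4, 0, 1000000);
        /* prints 1000000: cpu0 was topped up to the full period */
        printf("cpu0 runtime after borrowing: %lld\n", (long long)runtime[0]);
        return 0;
}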
@@ -241,6 +445,8 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
241 u64 runtime; 445 u64 runtime;
242 446
243 spin_lock(&rt_rq->rt_runtime_lock); 447 spin_lock(&rt_rq->rt_runtime_lock);
448 if (rt_rq->rt_throttled)
449 balance_runtime(rt_rq);
244 runtime = rt_rq->rt_runtime; 450 runtime = rt_rq->rt_runtime;
245 rt_rq->rt_time -= min(rt_rq->rt_time, overrun*runtime); 451 rt_rq->rt_time -= min(rt_rq->rt_time, overrun*runtime);
246 if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) { 452 if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) {
@@ -261,47 +467,6 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
261 return idle; 467 return idle;
262} 468}
263 469
264#ifdef CONFIG_SMP
265static int balance_runtime(struct rt_rq *rt_rq)
266{
267 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
268 struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
269 int i, weight, more = 0;
270 u64 rt_period;
271
272 weight = cpus_weight(rd->span);
273
274 spin_lock(&rt_b->rt_runtime_lock);
275 rt_period = ktime_to_ns(rt_b->rt_period);
276 for_each_cpu_mask(i, rd->span) {
277 struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
278 s64 diff;
279
280 if (iter == rt_rq)
281 continue;
282
283 spin_lock(&iter->rt_runtime_lock);
284 diff = iter->rt_runtime - iter->rt_time;
285 if (diff > 0) {
286 do_div(diff, weight);
287 if (rt_rq->rt_runtime + diff > rt_period)
288 diff = rt_period - rt_rq->rt_runtime;
289 iter->rt_runtime -= diff;
290 rt_rq->rt_runtime += diff;
291 more = 1;
292 if (rt_rq->rt_runtime == rt_period) {
293 spin_unlock(&iter->rt_runtime_lock);
294 break;
295 }
296 }
297 spin_unlock(&iter->rt_runtime_lock);
298 }
299 spin_unlock(&rt_b->rt_runtime_lock);
300
301 return more;
302}
303#endif
304
305static inline int rt_se_prio(struct sched_rt_entity *rt_se) 470static inline int rt_se_prio(struct sched_rt_entity *rt_se)
306{ 471{
307#ifdef CONFIG_RT_GROUP_SCHED 472#ifdef CONFIG_RT_GROUP_SCHED
@@ -318,27 +483,16 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
318{ 483{
319 u64 runtime = sched_rt_runtime(rt_rq); 484 u64 runtime = sched_rt_runtime(rt_rq);
320 485
321 if (runtime == RUNTIME_INF)
322 return 0;
323
324 if (rt_rq->rt_throttled) 486 if (rt_rq->rt_throttled)
325 return rt_rq_throttled(rt_rq); 487 return rt_rq_throttled(rt_rq);
326 488
327 if (sched_rt_runtime(rt_rq) >= sched_rt_period(rt_rq)) 489 if (sched_rt_runtime(rt_rq) >= sched_rt_period(rt_rq))
328 return 0; 490 return 0;
329 491
330#ifdef CONFIG_SMP 492 balance_runtime(rt_rq);
331 if (rt_rq->rt_time > runtime) { 493 runtime = sched_rt_runtime(rt_rq);
332 int more; 494 if (runtime == RUNTIME_INF)
333 495 return 0;
334 spin_unlock(&rt_rq->rt_runtime_lock);
335 more = balance_runtime(rt_rq);
336 spin_lock(&rt_rq->rt_runtime_lock);
337
338 if (more)
339 runtime = sched_rt_runtime(rt_rq);
340 }
341#endif
342 496
343 if (rt_rq->rt_time > runtime) { 497 if (rt_rq->rt_time > runtime) {
344 rt_rq->rt_throttled = 1; 498 rt_rq->rt_throttled = 1;
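sched_rt_runtime_exceeded() above now calls balance_runtime() before deciding, so a runqueue is throttled only if it is still over budget after any borrowing succeeded. A compact sketch of that ordering, with borrowing modelled as a simple callback (an assumption for illustration only):

#include <stdint.h>
#include <stdio.h>

struct toy_rt_rq {
        uint64_t rt_time;       /* RT time consumed this period */
        uint64_t rt_runtime;    /* budget for this period       */
        int      rt_throttled;
};

/* try to borrow first, then throttle only if we are still over budget */
static int toy_runtime_exceeded(struct toy_rt_rq *rq,
                                void (*balance)(struct toy_rt_rq *))
{
        if (rq->rt_throttled)
                return 1;

        balance(rq);                    /* may raise rt_runtime */
        if (rq->rt_time > rq->rt_runtime) {
                rq->rt_throttled = 1;
                return 1;
        }
        return 0;
}

static void toy_balance(struct toy_rt_rq *rq)
{
        rq->rt_runtime += 50000;        /* pretend a neighbour lent us 50 us */
}

int main(void)
{
        struct toy_rt_rq rq = { .rt_time = 960000, .rt_runtime = 950000 };

        /* prints 0: the borrowed runtime covered the overrun */
        printf("throttled: %d\n", toy_runtime_exceeded(&rq, toy_balance));
        return 0;
}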
@@ -375,13 +529,18 @@ static void update_curr_rt(struct rq *rq)
375 curr->se.exec_start = rq->clock; 529 curr->se.exec_start = rq->clock;
376 cpuacct_charge(curr, delta_exec); 530 cpuacct_charge(curr, delta_exec);
377 531
532 if (!rt_bandwidth_enabled())
533 return;
534
378 for_each_sched_rt_entity(rt_se) { 535 for_each_sched_rt_entity(rt_se) {
379 rt_rq = rt_rq_of_se(rt_se); 536 rt_rq = rt_rq_of_se(rt_se);
380 537
381 spin_lock(&rt_rq->rt_runtime_lock); 538 spin_lock(&rt_rq->rt_runtime_lock);
382 rt_rq->rt_time += delta_exec; 539 if (sched_rt_runtime(rt_rq) != RUNTIME_INF) {
383 if (sched_rt_runtime_exceeded(rt_rq)) 540 rt_rq->rt_time += delta_exec;
384 resched_task(curr); 541 if (sched_rt_runtime_exceeded(rt_rq))
542 resched_task(curr);
543 }
385 spin_unlock(&rt_rq->rt_runtime_lock); 544 spin_unlock(&rt_rq->rt_runtime_lock);
386 } 545 }
387} 546}
@@ -392,12 +551,23 @@ void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
392 WARN_ON(!rt_prio(rt_se_prio(rt_se))); 551 WARN_ON(!rt_prio(rt_se_prio(rt_se)));
393 rt_rq->rt_nr_running++; 552 rt_rq->rt_nr_running++;
394#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED 553#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
395 if (rt_se_prio(rt_se) < rt_rq->highest_prio) 554 if (rt_se_prio(rt_se) < rt_rq->highest_prio) {
555#ifdef CONFIG_SMP
556 struct rq *rq = rq_of_rt_rq(rt_rq);
557#endif
558
396 rt_rq->highest_prio = rt_se_prio(rt_se); 559 rt_rq->highest_prio = rt_se_prio(rt_se);
560#ifdef CONFIG_SMP
561 if (rq->online)
562 cpupri_set(&rq->rd->cpupri, rq->cpu,
563 rt_se_prio(rt_se));
564#endif
565 }
397#endif 566#endif
398#ifdef CONFIG_SMP 567#ifdef CONFIG_SMP
399 if (rt_se->nr_cpus_allowed > 1) { 568 if (rt_se->nr_cpus_allowed > 1) {
400 struct rq *rq = rq_of_rt_rq(rt_rq); 569 struct rq *rq = rq_of_rt_rq(rt_rq);
570
401 rq->rt.rt_nr_migratory++; 571 rq->rt.rt_nr_migratory++;
402 } 572 }
403 573
@@ -417,6 +587,10 @@ void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
417static inline 587static inline
418void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) 588void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
419{ 589{
590#ifdef CONFIG_SMP
591 int highest_prio = rt_rq->highest_prio;
592#endif
593
420 WARN_ON(!rt_prio(rt_se_prio(rt_se))); 594 WARN_ON(!rt_prio(rt_se_prio(rt_se)));
421 WARN_ON(!rt_rq->rt_nr_running); 595 WARN_ON(!rt_rq->rt_nr_running);
422 rt_rq->rt_nr_running--; 596 rt_rq->rt_nr_running--;
@@ -440,6 +614,14 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
440 rq->rt.rt_nr_migratory--; 614 rq->rt.rt_nr_migratory--;
441 } 615 }
442 616
617 if (rt_rq->highest_prio != highest_prio) {
618 struct rq *rq = rq_of_rt_rq(rt_rq);
619
620 if (rq->online)
621 cpupri_set(&rq->rd->cpupri, rq->cpu,
622 rt_rq->highest_prio);
623 }
624
443 update_rt_migration(rq_of_rt_rq(rt_rq)); 625 update_rt_migration(rq_of_rt_rq(rt_rq));
444#endif /* CONFIG_SMP */ 626#endif /* CONFIG_SMP */
445#ifdef CONFIG_RT_GROUP_SCHED 627#ifdef CONFIG_RT_GROUP_SCHED
@@ -455,6 +637,7 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se)
455 struct rt_rq *rt_rq = rt_rq_of_se(rt_se); 637 struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
456 struct rt_prio_array *array = &rt_rq->active; 638 struct rt_prio_array *array = &rt_rq->active;
457 struct rt_rq *group_rq = group_rt_rq(rt_se); 639 struct rt_rq *group_rq = group_rt_rq(rt_se);
640 struct list_head *queue = array->queue + rt_se_prio(rt_se);
458 641
459 /* 642 /*
460 * Don't enqueue the group if its throttled, or when empty. 643 * Don't enqueue the group if its throttled, or when empty.
@@ -465,7 +648,7 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se)
465 if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) 648 if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running))
466 return; 649 return;
467 650
468 list_add_tail(&rt_se->run_list, array->queue + rt_se_prio(rt_se)); 651 list_add_tail(&rt_se->run_list, queue);
469 __set_bit(rt_se_prio(rt_se), array->bitmap); 652 __set_bit(rt_se_prio(rt_se), array->bitmap);
470 653
471 inc_rt_tasks(rt_se, rt_rq); 654 inc_rt_tasks(rt_se, rt_rq);
@@ -532,6 +715,8 @@ static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup)
532 rt_se->timeout = 0; 715 rt_se->timeout = 0;
533 716
534 enqueue_rt_entity(rt_se); 717 enqueue_rt_entity(rt_se);
718
719 inc_cpu_load(rq, p->se.load.weight);
535} 720}
536 721
537static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep) 722static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
@@ -540,36 +725,42 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
540 725
541 update_curr_rt(rq); 726 update_curr_rt(rq);
542 dequeue_rt_entity(rt_se); 727 dequeue_rt_entity(rt_se);
728
729 dec_cpu_load(rq, p->se.load.weight);
543} 730}
544 731
545/* 732/*
546 * Put task to the end of the run list without the overhead of dequeue 733 * Put task to the end of the run list without the overhead of dequeue
547 * followed by enqueue. 734 * followed by enqueue.
548 */ 735 */
549static 736static void
550void requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se) 737requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, int head)
551{ 738{
552 struct rt_prio_array *array = &rt_rq->active; 739 if (on_rt_rq(rt_se)) {
553 struct list_head *queue = array->queue + rt_se_prio(rt_se); 740 struct rt_prio_array *array = &rt_rq->active;
741 struct list_head *queue = array->queue + rt_se_prio(rt_se);
554 742
555 if (on_rt_rq(rt_se)) 743 if (head)
556 list_move_tail(&rt_se->run_list, queue); 744 list_move(&rt_se->run_list, queue);
745 else
746 list_move_tail(&rt_se->run_list, queue);
747 }
557} 748}
558 749
559static void requeue_task_rt(struct rq *rq, struct task_struct *p) 750static void requeue_task_rt(struct rq *rq, struct task_struct *p, int head)
560{ 751{
561 struct sched_rt_entity *rt_se = &p->rt; 752 struct sched_rt_entity *rt_se = &p->rt;
562 struct rt_rq *rt_rq; 753 struct rt_rq *rt_rq;
563 754
564 for_each_sched_rt_entity(rt_se) { 755 for_each_sched_rt_entity(rt_se) {
565 rt_rq = rt_rq_of_se(rt_se); 756 rt_rq = rt_rq_of_se(rt_se);
566 requeue_rt_entity(rt_rq, rt_se); 757 requeue_rt_entity(rt_rq, rt_se, head);
567 } 758 }
568} 759}
569 760
570static void yield_task_rt(struct rq *rq) 761static void yield_task_rt(struct rq *rq)
571{ 762{
572 requeue_task_rt(rq, rq->curr); 763 requeue_task_rt(rq, rq->curr, 0);
573} 764}
574 765
575#ifdef CONFIG_SMP 766#ifdef CONFIG_SMP
@@ -609,15 +800,58 @@ static int select_task_rq_rt(struct task_struct *p, int sync)
609 */ 800 */
610 return task_cpu(p); 801 return task_cpu(p);
611} 802}
803
804static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
805{
806 cpumask_t mask;
807
808 if (rq->curr->rt.nr_cpus_allowed == 1)
809 return;
810
811 if (p->rt.nr_cpus_allowed != 1
812 && cpupri_find(&rq->rd->cpupri, p, &mask))
813 return;
814
815 if (!cpupri_find(&rq->rd->cpupri, rq->curr, &mask))
816 return;
817
818 /*
819 * There appears to be other cpus that can accept
820 * current and none to run 'p', so lets reschedule
821 * to try and push current away:
822 */
823 requeue_task_rt(rq, p, 1);
824 resched_task(rq->curr);
825}
826
612#endif /* CONFIG_SMP */ 827#endif /* CONFIG_SMP */
613 828
614/* 829/*
615 * Preempt the current task with a newly woken task if needed: 830 * Preempt the current task with a newly woken task if needed:
616 */ 831 */
617static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p) 832static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int sync)
618{ 833{
619 if (p->prio < rq->curr->prio) 834 if (p->prio < rq->curr->prio) {
620 resched_task(rq->curr); 835 resched_task(rq->curr);
836 return;
837 }
838
839#ifdef CONFIG_SMP
840 /*
841 * If:
842 *
843 * - the newly woken task is of equal priority to the current task
844 * - the newly woken task is non-migratable while current is migratable
845 * - current will be preempted on the next reschedule
846 *
847 * we should check to see if current can readily move to a different
848 * cpu. If so, we will reschedule to allow the push logic to try
849 * to move current somewhere else, making room for our non-migratable
850 * task.
851 */
852 if (p->prio == rq->curr->prio && !need_resched())
853 check_preempt_equal_prio(rq, p);
854#endif
621} 855}
622 856
623static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq, 857static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
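check_preempt_equal_prio() above handles the equal-priority case: if the waking task is pinned to this CPU while the current task could run elsewhere, the wakee is requeued at the head and current is rescheduled so the push logic can move it away. A boolean caricature of that decision (cpupri_find() and the affinity checks are collapsed into two flags, an illustrative simplification):

#include <stdio.h>

/*
 * Toy version of the equal-priority wakeup decision: push the current task
 * away only when the wakee cannot run anywhere else but current can.
 */
static int should_push_current(int curr_can_run_elsewhere, int wakee_can_run_elsewhere)
{
        if (!curr_can_run_elsewhere)
                return 0;       /* nowhere to push current to          */
        if (wakee_can_run_elsewhere)
                return 0;       /* the wakee can find another cpu itself */
        return 1;               /* make room for the pinned wakee here  */
}

int main(void)
{
        printf("%d\n", should_push_current(1, 0));      /* 1: push current away */
        printf("%d\n", should_push_current(1, 1));      /* 0: wakee can migrate */
        return 0;
}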
@@ -674,6 +908,8 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
674#define RT_MAX_TRIES 3 908#define RT_MAX_TRIES 3
675 909
676static int double_lock_balance(struct rq *this_rq, struct rq *busiest); 910static int double_lock_balance(struct rq *this_rq, struct rq *busiest);
911static void double_unlock_balance(struct rq *this_rq, struct rq *busiest);
912
677static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep); 913static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep);
678 914
679static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) 915static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
@@ -720,73 +956,6 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu)
720 956
721static DEFINE_PER_CPU(cpumask_t, local_cpu_mask); 957static DEFINE_PER_CPU(cpumask_t, local_cpu_mask);
722 958
723static int find_lowest_cpus(struct task_struct *task, cpumask_t *lowest_mask)
724{
725 int lowest_prio = -1;
726 int lowest_cpu = -1;
727 int count = 0;
728 int cpu;
729
730 cpus_and(*lowest_mask, task_rq(task)->rd->online, task->cpus_allowed);
731
732 /*
733 * Scan each rq for the lowest prio.
734 */
735 for_each_cpu_mask(cpu, *lowest_mask) {
736 struct rq *rq = cpu_rq(cpu);
737
738 /* We look for lowest RT prio or non-rt CPU */
739 if (rq->rt.highest_prio >= MAX_RT_PRIO) {
740 /*
741 * if we already found a low RT queue
742 * and now we found this non-rt queue
743 * clear the mask and set our bit.
744 * Otherwise just return the queue as is
745 * and the count==1 will cause the algorithm
746 * to use the first bit found.
747 */
748 if (lowest_cpu != -1) {
749 cpus_clear(*lowest_mask);
750 cpu_set(rq->cpu, *lowest_mask);
751 }
752 return 1;
753 }
754
755 /* no locking for now */
756 if ((rq->rt.highest_prio > task->prio)
757 && (rq->rt.highest_prio >= lowest_prio)) {
758 if (rq->rt.highest_prio > lowest_prio) {
759 /* new low - clear old data */
760 lowest_prio = rq->rt.highest_prio;
761 lowest_cpu = cpu;
762 count = 0;
763 }
764 count++;
765 } else
766 cpu_clear(cpu, *lowest_mask);
767 }
768
769 /*
770 * Clear out all the set bits that represent
771 * runqueues that were of higher prio than
772 * the lowest_prio.
773 */
774 if (lowest_cpu > 0) {
775 /*
776 * Perhaps we could add another cpumask op to
777 * zero out bits. Like cpu_zero_bits(cpumask, nrbits);
778 * Then that could be optimized to use memset and such.
779 */
780 for_each_cpu_mask(cpu, *lowest_mask) {
781 if (cpu >= lowest_cpu)
782 break;
783 cpu_clear(cpu, *lowest_mask);
784 }
785 }
786
787 return count;
788}
789
790static inline int pick_optimal_cpu(int this_cpu, cpumask_t *mask) 959static inline int pick_optimal_cpu(int this_cpu, cpumask_t *mask)
791{ 960{
792 int first; 961 int first;
@@ -808,17 +977,19 @@ static int find_lowest_rq(struct task_struct *task)
808 cpumask_t *lowest_mask = &__get_cpu_var(local_cpu_mask); 977 cpumask_t *lowest_mask = &__get_cpu_var(local_cpu_mask);
809 int this_cpu = smp_processor_id(); 978 int this_cpu = smp_processor_id();
810 int cpu = task_cpu(task); 979 int cpu = task_cpu(task);
811 int count = find_lowest_cpus(task, lowest_mask);
812 980
813 if (!count) 981 if (task->rt.nr_cpus_allowed == 1)
982 return -1; /* No other targets possible */
983
984 if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask))
814 return -1; /* No targets found */ 985 return -1; /* No targets found */
815 986
816 /* 987 /*
817 * There is no sense in performing an optimal search if only one 988 * Only consider CPUs that are usable for migration.
818 * target is found. 989 * I guess we might want to change cpupri_find() to ignore those
990 * in the first place.
819 */ 991 */
820 if (count == 1) 992 cpus_and(*lowest_mask, *lowest_mask, cpu_active_map);
821 return first_cpu(*lowest_mask);
822 993
823 /* 994 /*
824 * At this point we have built a mask of cpus representing the 995 * At this point we have built a mask of cpus representing the
@@ -900,7 +1071,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
900 break; 1071 break;
901 1072
902 /* try again */ 1073 /* try again */
903 spin_unlock(&lowest_rq->lock); 1074 double_unlock_balance(rq, lowest_rq);
904 lowest_rq = NULL; 1075 lowest_rq = NULL;
905 } 1076 }
906 1077
@@ -969,7 +1140,7 @@ static int push_rt_task(struct rq *rq)
969 1140
970 resched_task(lowest_rq->curr); 1141 resched_task(lowest_rq->curr);
971 1142
972 spin_unlock(&lowest_rq->lock); 1143 double_unlock_balance(rq, lowest_rq);
973 1144
974 ret = 1; 1145 ret = 1;
975out: 1146out:
@@ -1006,7 +1177,7 @@ static int pull_rt_task(struct rq *this_rq)
1006 1177
1007 next = pick_next_task_rt(this_rq); 1178 next = pick_next_task_rt(this_rq);
1008 1179
1009 for_each_cpu_mask(cpu, this_rq->rd->rto_mask) { 1180 for_each_cpu_mask_nr(cpu, this_rq->rd->rto_mask) {
1010 if (this_cpu == cpu) 1181 if (this_cpu == cpu)
1011 continue; 1182 continue;
1012 1183
@@ -1075,7 +1246,7 @@ static int pull_rt_task(struct rq *this_rq)
1075 1246
1076 } 1247 }
1077 skip: 1248 skip:
1078 spin_unlock(&src_rq->lock); 1249 double_unlock_balance(this_rq, src_rq);
1079 } 1250 }
1080 1251
1081 return ret; 1252 return ret;
@@ -1163,17 +1334,25 @@ static void set_cpus_allowed_rt(struct task_struct *p,
1163} 1334}
1164 1335
1165/* Assumes rq->lock is held */ 1336/* Assumes rq->lock is held */
1166static void join_domain_rt(struct rq *rq) 1337static void rq_online_rt(struct rq *rq)
1167{ 1338{
1168 if (rq->rt.overloaded) 1339 if (rq->rt.overloaded)
1169 rt_set_overload(rq); 1340 rt_set_overload(rq);
1341
1342 __enable_runtime(rq);
1343
1344 cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio);
1170} 1345}
1171 1346
1172/* Assumes rq->lock is held */ 1347/* Assumes rq->lock is held */
1173static void leave_domain_rt(struct rq *rq) 1348static void rq_offline_rt(struct rq *rq)
1174{ 1349{
1175 if (rq->rt.overloaded) 1350 if (rq->rt.overloaded)
1176 rt_clear_overload(rq); 1351 rt_clear_overload(rq);
1352
1353 __disable_runtime(rq);
1354
1355 cpupri_set(&rq->rd->cpupri, rq->cpu, CPUPRI_INVALID);
1177} 1356}
1178 1357
1179/* 1358/*
@@ -1306,7 +1485,7 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
1306 * on the queue: 1485 * on the queue:
1307 */ 1486 */
1308 if (p->rt.run_list.prev != p->rt.run_list.next) { 1487 if (p->rt.run_list.prev != p->rt.run_list.next) {
1309 requeue_task_rt(rq, p); 1488 requeue_task_rt(rq, p, 0);
1310 set_tsk_need_resched(p); 1489 set_tsk_need_resched(p);
1311 } 1490 }
1312} 1491}
@@ -1336,8 +1515,8 @@ static const struct sched_class rt_sched_class = {
1336 .load_balance = load_balance_rt, 1515 .load_balance = load_balance_rt,
1337 .move_one_task = move_one_task_rt, 1516 .move_one_task = move_one_task_rt,
1338 .set_cpus_allowed = set_cpus_allowed_rt, 1517 .set_cpus_allowed = set_cpus_allowed_rt,
1339 .join_domain = join_domain_rt, 1518 .rq_online = rq_online_rt,
1340 .leave_domain = leave_domain_rt, 1519 .rq_offline = rq_offline_rt,
1341 .pre_schedule = pre_schedule_rt, 1520 .pre_schedule = pre_schedule_rt,
1342 .post_schedule = post_schedule_rt, 1521 .post_schedule = post_schedule_rt,
1343 .task_wake_up = task_wake_up_rt, 1522 .task_wake_up = task_wake_up_rt,
@@ -1350,3 +1529,17 @@ static const struct sched_class rt_sched_class = {
1350 .prio_changed = prio_changed_rt, 1529 .prio_changed = prio_changed_rt,
1351 .switched_to = switched_to_rt, 1530 .switched_to = switched_to_rt,
1352}; 1531};
1532
1533#ifdef CONFIG_SCHED_DEBUG
1534extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq);
1535
1536static void print_rt_stats(struct seq_file *m, int cpu)
1537{
1538 struct rt_rq *rt_rq;
1539
1540 rcu_read_lock();
1541 for_each_leaf_rt_rq(rt_rq, cpu_rq(cpu))
1542 print_rt_rq(m, cpu, rt_rq);
1543 rcu_read_unlock();
1544}
1545#endif /* CONFIG_SCHED_DEBUG */
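
The sched_rt.c hunks above rename the root-domain hooks (join_domain/leave_domain become rq_online/rq_offline) and make them symmetric: the online hook re-enables RT runtime and publishes the runqueue's highest RT priority into cpupri, while the offline hook withdraws both again. A minimal sketch of how another scheduling class might wire the same pair of hooks -- the "foo" names are purely illustrative; only the .rq_online/.rq_offline members and the cpupri_set() calls come from the hunk above:

        static void rq_online_foo(struct rq *rq)
        {
                /* publish per-CPU state when the runqueue joins its root domain */
                cpupri_set(&rq->rd->cpupri, rq->cpu, rq->foo.highest_prio);
        }

        static void rq_offline_foo(struct rq *rq)
        {
                /* withdraw that state again when the runqueue leaves the domain */
                cpupri_set(&rq->rd->cpupri, rq->cpu, CPUPRI_INVALID);
        }

        static const struct sched_class foo_sched_class = {
                .rq_online      = rq_online_foo,
                .rq_offline     = rq_offline_foo,
                /* remaining callbacks elided */
        };
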
diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h
index 80179ef7450e..8385d43987e2 100644
--- a/kernel/sched_stats.h
+++ b/kernel/sched_stats.h
@@ -118,6 +118,13 @@ rq_sched_info_depart(struct rq *rq, unsigned long long delta)
118 if (rq) 118 if (rq)
119 rq->rq_sched_info.cpu_time += delta; 119 rq->rq_sched_info.cpu_time += delta;
120} 120}
121
122static inline void
123rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
124{
125 if (rq)
126 rq->rq_sched_info.run_delay += delta;
127}
121# define schedstat_inc(rq, field) do { (rq)->field++; } while (0) 128# define schedstat_inc(rq, field) do { (rq)->field++; } while (0)
122# define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0) 129# define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0)
123# define schedstat_set(var, val) do { var = (val); } while (0) 130# define schedstat_set(var, val) do { var = (val); } while (0)
@@ -126,6 +133,9 @@ static inline void
126rq_sched_info_arrive(struct rq *rq, unsigned long long delta) 133rq_sched_info_arrive(struct rq *rq, unsigned long long delta)
127{} 134{}
128static inline void 135static inline void
136rq_sched_info_dequeued(struct rq *rq, unsigned long long delta)
137{}
138static inline void
129rq_sched_info_depart(struct rq *rq, unsigned long long delta) 139rq_sched_info_depart(struct rq *rq, unsigned long long delta)
130{} 140{}
131# define schedstat_inc(rq, field) do { } while (0) 141# define schedstat_inc(rq, field) do { } while (0)
@@ -134,6 +144,11 @@ rq_sched_info_depart(struct rq *rq, unsigned long long delta)
134#endif 144#endif
135 145
136#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) 146#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
147static inline void sched_info_reset_dequeued(struct task_struct *t)
148{
149 t->sched_info.last_queued = 0;
150}
151
137/* 152/*
138 * Called when a process is dequeued from the active array and given 153 * Called when a process is dequeued from the active array and given
139 * the cpu. We should note that with the exception of interactive 154 * the cpu. We should note that with the exception of interactive
@@ -143,15 +158,22 @@ rq_sched_info_depart(struct rq *rq, unsigned long long delta)
143 * active queue, thus delaying tasks in the expired queue from running; 158 * active queue, thus delaying tasks in the expired queue from running;
144 * see scheduler_tick()). 159 * see scheduler_tick()).
145 * 160 *
146 * This function is only called from sched_info_arrive(), rather than 161 * Though we are interested in knowing how long it was from the *first* time a
147 * dequeue_task(). Even though a task may be queued and dequeued multiple 162 * task was queued to the time that it finally hit a cpu, we call this routine
148 * times as it is shuffled about, we're really interested in knowing how 163 * from dequeue_task() to account for possible rq->clock skew across cpus. The
149 * long it was from the *first* time it was queued to the time that it 164 * delta taken on each cpu would annul the skew.
150 * finally hit a cpu.
151 */ 165 */
152static inline void sched_info_dequeued(struct task_struct *t) 166static inline void sched_info_dequeued(struct task_struct *t)
153{ 167{
154 t->sched_info.last_queued = 0; 168 unsigned long long now = task_rq(t)->clock, delta = 0;
169
170 if (unlikely(sched_info_on()))
171 if (t->sched_info.last_queued)
172 delta = now - t->sched_info.last_queued;
173 sched_info_reset_dequeued(t);
174 t->sched_info.run_delay += delta;
175
176 rq_sched_info_dequeued(task_rq(t), delta);
155} 177}
156 178
157/* 179/*
@@ -165,7 +187,7 @@ static void sched_info_arrive(struct task_struct *t)
165 187
166 if (t->sched_info.last_queued) 188 if (t->sched_info.last_queued)
167 delta = now - t->sched_info.last_queued; 189 delta = now - t->sched_info.last_queued;
168 sched_info_dequeued(t); 190 sched_info_reset_dequeued(t);
169 t->sched_info.run_delay += delta; 191 t->sched_info.run_delay += delta;
170 t->sched_info.last_arrival = now; 192 t->sched_info.last_arrival = now;
171 t->sched_info.pcount++; 193 t->sched_info.pcount++;
@@ -242,7 +264,9 @@ sched_info_switch(struct task_struct *prev, struct task_struct *next)
242 __sched_info_switch(prev, next); 264 __sched_info_switch(prev, next);
243} 265}
244#else 266#else
245#define sched_info_queued(t) do { } while (0) 267#define sched_info_queued(t) do { } while (0)
246#define sched_info_switch(t, next) do { } while (0) 268#define sched_info_reset_dequeued(t) do { } while (0)
269#define sched_info_dequeued(t) do { } while (0)
270#define sched_info_switch(t, next) do { } while (0)
247#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */ 271#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */
248 272
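
The reason the accounting now runs from dequeue_task() is that every delta is measured against a single CPU's rq->clock, so a constant skew between two CPUs' clocks cancels out of the accumulated run delay. A rough worked example (CPU names and the skew scenario are illustrative):

        /*
         * Task queued on CPU A, dequeued on A, later queued again and run on CPU B:
         *
         *      run_delay += dequeue_on_A - queued_on_A;   (sched_info_dequeued)
         *      run_delay += arrive_on_B  - queued_on_B;   (sched_info_arrive)
         *
         * Each difference uses one CPU's clock, so a fixed offset between
         * clock_A and clock_B drops out.  The old scheme charged the single
         * cross-clock difference (arrive_on_B - queued_on_A), which carried
         * the full clock_B - clock_A skew.
         */
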
diff --git a/kernel/semaphore.c b/kernel/semaphore.c
index 5c2942e768cd..94a62c0d4ade 100644
--- a/kernel/semaphore.c
+++ b/kernel/semaphore.c
@@ -31,6 +31,7 @@
31#include <linux/sched.h> 31#include <linux/sched.h>
32#include <linux/semaphore.h> 32#include <linux/semaphore.h>
33#include <linux/spinlock.h> 33#include <linux/spinlock.h>
34#include <linux/ftrace.h>
34 35
35static noinline void __down(struct semaphore *sem); 36static noinline void __down(struct semaphore *sem);
36static noinline int __down_interruptible(struct semaphore *sem); 37static noinline int __down_interruptible(struct semaphore *sem);
@@ -211,9 +212,7 @@ static inline int __sched __down_common(struct semaphore *sem, long state,
211 waiter.up = 0; 212 waiter.up = 0;
212 213
213 for (;;) { 214 for (;;) {
214 if (state == TASK_INTERRUPTIBLE && signal_pending(task)) 215 if (signal_pending_state(state, task))
215 goto interrupted;
216 if (state == TASK_KILLABLE && fatal_signal_pending(task))
217 goto interrupted; 216 goto interrupted;
218 if (timeout <= 0) 217 if (timeout <= 0)
219 goto timed_out; 218 goto timed_out;
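
The semaphore change folds the two open-coded signal checks into a single signal_pending_state() call. A rough sketch of the logic that helper encapsulates, reconstructed here for reference rather than taken from this patch (it relies on TASK_KILLABLE being TASK_WAKEKILL | TASK_UNINTERRUPTIBLE):

        static inline int signal_pending_state_sketch(long state, struct task_struct *p)
        {
                if (!(state & (TASK_INTERRUPTIBLE | TASK_WAKEKILL)))
                        return 0;       /* plain uninterruptible sleep: signals never wake it */
                if (!signal_pending(p))
                        return 0;
                /* interruptible sleepers take any signal, killable ones only fatal ones */
                return (state & TASK_INTERRUPTIBLE) || __fatal_signal_pending(p);
        }
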
diff --git a/kernel/signal.c b/kernel/signal.c
index 6c0958e52ea7..e661b01d340f 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -22,6 +22,7 @@
22#include <linux/ptrace.h> 22#include <linux/ptrace.h>
23#include <linux/signal.h> 23#include <linux/signal.h>
24#include <linux/signalfd.h> 24#include <linux/signalfd.h>
25#include <linux/tracehook.h>
25#include <linux/capability.h> 26#include <linux/capability.h>
26#include <linux/freezer.h> 27#include <linux/freezer.h>
27#include <linux/pid_namespace.h> 28#include <linux/pid_namespace.h>
@@ -39,24 +40,21 @@
39 40
40static struct kmem_cache *sigqueue_cachep; 41static struct kmem_cache *sigqueue_cachep;
41 42
42static int __sig_ignored(struct task_struct *t, int sig) 43static void __user *sig_handler(struct task_struct *t, int sig)
43{ 44{
44 void __user *handler; 45 return t->sighand->action[sig - 1].sa.sa_handler;
46}
45 47
48static int sig_handler_ignored(void __user *handler, int sig)
49{
46 /* Is it explicitly or implicitly ignored? */ 50 /* Is it explicitly or implicitly ignored? */
47
48 handler = t->sighand->action[sig - 1].sa.sa_handler;
49 return handler == SIG_IGN || 51 return handler == SIG_IGN ||
50 (handler == SIG_DFL && sig_kernel_ignore(sig)); 52 (handler == SIG_DFL && sig_kernel_ignore(sig));
51} 53}
52 54
53static int sig_ignored(struct task_struct *t, int sig) 55static int sig_ignored(struct task_struct *t, int sig)
54{ 56{
55 /* 57 void __user *handler;
56 * Tracers always want to know about signals..
57 */
58 if (t->ptrace & PT_PTRACED)
59 return 0;
60 58
61 /* 59 /*
62 * Blocked signals are never ignored, since the 60 * Blocked signals are never ignored, since the
@@ -66,7 +64,14 @@ static int sig_ignored(struct task_struct *t, int sig)
66 if (sigismember(&t->blocked, sig) || sigismember(&t->real_blocked, sig)) 64 if (sigismember(&t->blocked, sig) || sigismember(&t->real_blocked, sig))
67 return 0; 65 return 0;
68 66
69 return __sig_ignored(t, sig); 67 handler = sig_handler(t, sig);
68 if (!sig_handler_ignored(handler, sig))
69 return 0;
70
71 /*
72 * Tracers may want to know about even ignored signals.
73 */
74 return !tracehook_consider_ignored_signal(t, sig, handler);
70} 75}
71 76
72/* 77/*
@@ -129,7 +134,9 @@ void recalc_sigpending_and_wake(struct task_struct *t)
129 134
130void recalc_sigpending(void) 135void recalc_sigpending(void)
131{ 136{
132 if (!recalc_sigpending_tsk(current) && !freezing(current)) 137 if (unlikely(tracehook_force_sigpending()))
138 set_thread_flag(TIF_SIGPENDING);
139 else if (!recalc_sigpending_tsk(current) && !freezing(current))
133 clear_thread_flag(TIF_SIGPENDING); 140 clear_thread_flag(TIF_SIGPENDING);
134 141
135} 142}
@@ -295,12 +302,12 @@ flush_signal_handlers(struct task_struct *t, int force_default)
295 302
296int unhandled_signal(struct task_struct *tsk, int sig) 303int unhandled_signal(struct task_struct *tsk, int sig)
297{ 304{
305 void __user *handler = tsk->sighand->action[sig-1].sa.sa_handler;
298 if (is_global_init(tsk)) 306 if (is_global_init(tsk))
299 return 1; 307 return 1;
300 if (tsk->ptrace & PT_PTRACED) 308 if (handler != SIG_IGN && handler != SIG_DFL)
301 return 0; 309 return 0;
302 return (tsk->sighand->action[sig-1].sa.sa_handler == SIG_IGN) || 310 return !tracehook_consider_fatal_signal(tsk, sig, handler);
303 (tsk->sighand->action[sig-1].sa.sa_handler == SIG_DFL);
304} 311}
305 312
306 313
@@ -338,13 +345,9 @@ unblock_all_signals(void)
338 spin_unlock_irqrestore(&current->sighand->siglock, flags); 345 spin_unlock_irqrestore(&current->sighand->siglock, flags);
339} 346}
340 347
341static int collect_signal(int sig, struct sigpending *list, siginfo_t *info) 348static void collect_signal(int sig, struct sigpending *list, siginfo_t *info)
342{ 349{
343 struct sigqueue *q, *first = NULL; 350 struct sigqueue *q, *first = NULL;
344 int still_pending = 0;
345
346 if (unlikely(!sigismember(&list->signal, sig)))
347 return 0;
348 351
349 /* 352 /*
350 * Collect the siginfo appropriate to this signal. Check if 353 * Collect the siginfo appropriate to this signal. Check if
@@ -352,33 +355,30 @@ static int collect_signal(int sig, struct sigpending *list, siginfo_t *info)
352 */ 355 */
353 list_for_each_entry(q, &list->list, list) { 356 list_for_each_entry(q, &list->list, list) {
354 if (q->info.si_signo == sig) { 357 if (q->info.si_signo == sig) {
355 if (first) { 358 if (first)
356 still_pending = 1; 359 goto still_pending;
357 break;
358 }
359 first = q; 360 first = q;
360 } 361 }
361 } 362 }
363
364 sigdelset(&list->signal, sig);
365
362 if (first) { 366 if (first) {
367still_pending:
363 list_del_init(&first->list); 368 list_del_init(&first->list);
364 copy_siginfo(info, &first->info); 369 copy_siginfo(info, &first->info);
365 __sigqueue_free(first); 370 __sigqueue_free(first);
366 if (!still_pending)
367 sigdelset(&list->signal, sig);
368 } else { 371 } else {
369
370 /* Ok, it wasn't in the queue. This must be 372 /* Ok, it wasn't in the queue. This must be
371 a fast-pathed signal or we must have been 373 a fast-pathed signal or we must have been
372 out of queue space. So zero out the info. 374 out of queue space. So zero out the info.
373 */ 375 */
374 sigdelset(&list->signal, sig);
375 info->si_signo = sig; 376 info->si_signo = sig;
376 info->si_errno = 0; 377 info->si_errno = 0;
377 info->si_code = 0; 378 info->si_code = 0;
378 info->si_pid = 0; 379 info->si_pid = 0;
379 info->si_uid = 0; 380 info->si_uid = 0;
380 } 381 }
381 return 1;
382} 382}
383 383
384static int __dequeue_signal(struct sigpending *pending, sigset_t *mask, 384static int __dequeue_signal(struct sigpending *pending, sigset_t *mask,
@@ -396,8 +396,7 @@ static int __dequeue_signal(struct sigpending *pending, sigset_t *mask,
396 } 396 }
397 } 397 }
398 398
399 if (!collect_signal(sig, pending, info)) 399 collect_signal(sig, pending, info);
400 sig = 0;
401 } 400 }
402 401
403 return sig; 402 return sig;
@@ -462,8 +461,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
462 * is to alert stop-signal processing code when another 461 * is to alert stop-signal processing code when another
463 * processor has come along and cleared the flag. 462 * processor has come along and cleared the flag.
464 */ 463 */
465 if (!(tsk->signal->flags & SIGNAL_GROUP_EXIT)) 464 tsk->signal->flags |= SIGNAL_STOP_DEQUEUED;
466 tsk->signal->flags |= SIGNAL_STOP_DEQUEUED;
467 } 465 }
468 if ((info->si_code & __SI_MASK) == __SI_TIMER && info->si_sys_private) { 466 if ((info->si_code & __SI_MASK) == __SI_TIMER && info->si_sys_private) {
469 /* 467 /*
@@ -600,9 +598,6 @@ static int check_kill_permission(int sig, struct siginfo *info,
600 return security_task_kill(t, info, sig, 0); 598 return security_task_kill(t, info, sig, 0);
601} 599}
602 600
603/* forward decl */
604static void do_notify_parent_cldstop(struct task_struct *tsk, int why);
605
606/* 601/*
607 * Handle magic process-wide effects of stop/continue signals. Unlike 602 * Handle magic process-wide effects of stop/continue signals. Unlike
608 * the signal actions, these happen immediately at signal-generation 603 * the signal actions, these happen immediately at signal-generation
@@ -765,7 +760,8 @@ static void complete_signal(int sig, struct task_struct *p, int group)
765 if (sig_fatal(p, sig) && 760 if (sig_fatal(p, sig) &&
766 !(signal->flags & (SIGNAL_UNKILLABLE | SIGNAL_GROUP_EXIT)) && 761 !(signal->flags & (SIGNAL_UNKILLABLE | SIGNAL_GROUP_EXIT)) &&
767 !sigismember(&t->real_blocked, sig) && 762 !sigismember(&t->real_blocked, sig) &&
768 (sig == SIGKILL || !(t->ptrace & PT_PTRACED))) { 763 (sig == SIGKILL ||
764 !tracehook_consider_fatal_signal(t, sig, SIG_DFL))) {
769 /* 765 /*
770 * This signal will be fatal to the whole group. 766 * This signal will be fatal to the whole group.
771 */ 767 */
@@ -1125,7 +1121,7 @@ EXPORT_SYMBOL_GPL(kill_pid_info_as_uid);
1125 * is probably wrong. Should make it like BSD or SYSV. 1121 * is probably wrong. Should make it like BSD or SYSV.
1126 */ 1122 */
1127 1123
1128static int kill_something_info(int sig, struct siginfo *info, int pid) 1124static int kill_something_info(int sig, struct siginfo *info, pid_t pid)
1129{ 1125{
1130 int ret; 1126 int ret;
1131 1127
@@ -1237,17 +1233,6 @@ int kill_pid(struct pid *pid, int sig, int priv)
1237} 1233}
1238EXPORT_SYMBOL(kill_pid); 1234EXPORT_SYMBOL(kill_pid);
1239 1235
1240int
1241kill_proc(pid_t pid, int sig, int priv)
1242{
1243 int ret;
1244
1245 rcu_read_lock();
1246 ret = kill_pid_info(sig, __si_special(priv), find_pid(pid));
1247 rcu_read_unlock();
1248 return ret;
1249}
1250
1251/* 1236/*
1252 * These functions support sending signals using preallocated sigqueue 1237 * These functions support sending signals using preallocated sigqueue
1253 * structures. This is needed "because realtime applications cannot 1238 * structures. This is needed "because realtime applications cannot
@@ -1319,6 +1304,7 @@ int send_sigqueue(struct sigqueue *q, struct task_struct *t, int group)
1319 q->info.si_overrun++; 1304 q->info.si_overrun++;
1320 goto out; 1305 goto out;
1321 } 1306 }
1307 q->info.si_overrun = 0;
1322 1308
1323 signalfd_notify(t, sig); 1309 signalfd_notify(t, sig);
1324 pending = group ? &t->signal->shared_pending : &t->pending; 1310 pending = group ? &t->signal->shared_pending : &t->pending;
@@ -1343,13 +1329,16 @@ static inline void __wake_up_parent(struct task_struct *p,
1343/* 1329/*
1344 * Let a parent know about the death of a child. 1330 * Let a parent know about the death of a child.
1345 * For a stopped/continued status change, use do_notify_parent_cldstop instead. 1331 * For a stopped/continued status change, use do_notify_parent_cldstop instead.
1332 *
1333 * Returns -1 if our parent ignored us and so we've switched to
1334 * self-reaping, or else @sig.
1346 */ 1335 */
1347 1336int do_notify_parent(struct task_struct *tsk, int sig)
1348void do_notify_parent(struct task_struct *tsk, int sig)
1349{ 1337{
1350 struct siginfo info; 1338 struct siginfo info;
1351 unsigned long flags; 1339 unsigned long flags;
1352 struct sighand_struct *psig; 1340 struct sighand_struct *psig;
1341 int ret = sig;
1353 1342
1354 BUG_ON(sig == -1); 1343 BUG_ON(sig == -1);
1355 1344
@@ -1379,10 +1368,9 @@ void do_notify_parent(struct task_struct *tsk, int sig)
1379 1368
1380 info.si_uid = tsk->uid; 1369 info.si_uid = tsk->uid;
1381 1370
1382 /* FIXME: find out whether or not this is supposed to be c*time. */ 1371 info.si_utime = cputime_to_clock_t(cputime_add(tsk->utime,
1383 info.si_utime = cputime_to_jiffies(cputime_add(tsk->utime,
1384 tsk->signal->utime)); 1372 tsk->signal->utime));
1385 info.si_stime = cputime_to_jiffies(cputime_add(tsk->stime, 1373 info.si_stime = cputime_to_clock_t(cputime_add(tsk->stime,
1386 tsk->signal->stime)); 1374 tsk->signal->stime));
1387 1375
1388 info.si_status = tsk->exit_code & 0x7f; 1376 info.si_status = tsk->exit_code & 0x7f;
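
The switch from cputime_to_jiffies() to cputime_to_clock_t() matters because siginfo reports CPU time in USER_HZ clock ticks, not in kernel jiffies. A quick worked example with illustrative tick rates:

        /*
         * With HZ = 1000 and USER_HZ = 100 (illustrative values), one second
         * of CPU time used to be reported as
         *
         *      cputime_to_jiffies(1 s)  == 1000        (kernel tick count)
         *
         * while userspace expects
         *
         *      cputime_to_clock_t(1 s)  ==  100        (USER_HZ ticks)
         */
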
@@ -1415,14 +1403,16 @@ void do_notify_parent(struct task_struct *tsk, int sig)
1415 * is implementation-defined: we do (if you don't want 1403 * is implementation-defined: we do (if you don't want
1416 * it, just use SIG_IGN instead). 1404 * it, just use SIG_IGN instead).
1417 */ 1405 */
1418 tsk->exit_signal = -1; 1406 ret = tsk->exit_signal = -1;
1419 if (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN) 1407 if (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN)
1420 sig = 0; 1408 sig = -1;
1421 } 1409 }
1422 if (valid_signal(sig) && sig > 0) 1410 if (valid_signal(sig) && sig > 0)
1423 __group_send_sig_info(sig, &info, tsk->parent); 1411 __group_send_sig_info(sig, &info, tsk->parent);
1424 __wake_up_parent(tsk, tsk->parent); 1412 __wake_up_parent(tsk, tsk->parent);
1425 spin_unlock_irqrestore(&psig->siglock, flags); 1413 spin_unlock_irqrestore(&psig->siglock, flags);
1414
1415 return ret;
1426} 1416}
1427 1417
1428static void do_notify_parent_cldstop(struct task_struct *tsk, int why) 1418static void do_notify_parent_cldstop(struct task_struct *tsk, int why)
@@ -1450,9 +1440,8 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, int why)
1450 1440
1451 info.si_uid = tsk->uid; 1441 info.si_uid = tsk->uid;
1452 1442
1453 /* FIXME: find out whether or not this is supposed to be c*time. */ 1443 info.si_utime = cputime_to_clock_t(tsk->utime);
1454 info.si_utime = cputime_to_jiffies(tsk->utime); 1444 info.si_stime = cputime_to_clock_t(tsk->stime);
1455 info.si_stime = cputime_to_jiffies(tsk->stime);
1456 1445
1457 info.si_code = why; 1446 info.si_code = why;
1458 switch (why) { 1447 switch (why) {
@@ -1491,10 +1480,10 @@ static inline int may_ptrace_stop(void)
1491 * is a deadlock situation, and pointless because our tracer 1480 * is a deadlock situation, and pointless because our tracer
1492 * is dead so don't allow us to stop. 1481 * is dead so don't allow us to stop.
1493 * If SIGKILL was already sent before the caller unlocked 1482 * If SIGKILL was already sent before the caller unlocked
1494 * ->siglock we must see ->core_waiters != 0. Otherwise it 1483 * ->siglock we must see ->core_state != NULL. Otherwise it
1495 * is safe to enter schedule(). 1484 * is safe to enter schedule().
1496 */ 1485 */
1497 if (unlikely(current->mm->core_waiters) && 1486 if (unlikely(current->mm->core_state) &&
1498 unlikely(current->mm == current->parent->mm)) 1487 unlikely(current->mm == current->parent->mm))
1499 return 0; 1488 return 0;
1500 1489
@@ -1507,9 +1496,8 @@ static inline int may_ptrace_stop(void)
1507 */ 1496 */
1508static int sigkill_pending(struct task_struct *tsk) 1497static int sigkill_pending(struct task_struct *tsk)
1509{ 1498{
1510 return ((sigismember(&tsk->pending.signal, SIGKILL) || 1499 return sigismember(&tsk->pending.signal, SIGKILL) ||
1511 sigismember(&tsk->signal->shared_pending.signal, SIGKILL)) && 1500 sigismember(&tsk->signal->shared_pending.signal, SIGKILL);
1512 !unlikely(sigismember(&tsk->blocked, SIGKILL)));
1513} 1501}
1514 1502
1515/* 1503/*
@@ -1525,8 +1513,6 @@ static int sigkill_pending(struct task_struct *tsk)
1525 */ 1513 */
1526static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info) 1514static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info)
1527{ 1515{
1528 int killed = 0;
1529
1530 if (arch_ptrace_stop_needed(exit_code, info)) { 1516 if (arch_ptrace_stop_needed(exit_code, info)) {
1531 /* 1517 /*
1532 * The arch code has something special to do before a 1518 * The arch code has something special to do before a
@@ -1542,7 +1528,8 @@ static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info)
1542 spin_unlock_irq(&current->sighand->siglock); 1528 spin_unlock_irq(&current->sighand->siglock);
1543 arch_ptrace_stop(exit_code, info); 1529 arch_ptrace_stop(exit_code, info);
1544 spin_lock_irq(&current->sighand->siglock); 1530 spin_lock_irq(&current->sighand->siglock);
1545 killed = sigkill_pending(current); 1531 if (sigkill_pending(current))
1532 return;
1546 } 1533 }
1547 1534
1548 /* 1535 /*
@@ -1559,7 +1546,7 @@ static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info)
1559 __set_current_state(TASK_TRACED); 1546 __set_current_state(TASK_TRACED);
1560 spin_unlock_irq(&current->sighand->siglock); 1547 spin_unlock_irq(&current->sighand->siglock);
1561 read_lock(&tasklist_lock); 1548 read_lock(&tasklist_lock);
1562 if (!unlikely(killed) && may_ptrace_stop()) { 1549 if (may_ptrace_stop()) {
1563 do_notify_parent_cldstop(current, CLD_TRAPPED); 1550 do_notify_parent_cldstop(current, CLD_TRAPPED);
1564 read_unlock(&tasklist_lock); 1551 read_unlock(&tasklist_lock);
1565 schedule(); 1552 schedule();
@@ -1623,7 +1610,7 @@ finish_stop(int stop_count)
1623 * a group stop in progress and we are the last to stop, 1610 * a group stop in progress and we are the last to stop,
1624 * report to the parent. When ptraced, every thread reports itself. 1611 * report to the parent. When ptraced, every thread reports itself.
1625 */ 1612 */
1626 if (stop_count == 0 || (current->ptrace & PT_PTRACED)) { 1613 if (tracehook_notify_jctl(stop_count == 0, CLD_STOPPED)) {
1627 read_lock(&tasklist_lock); 1614 read_lock(&tasklist_lock);
1628 do_notify_parent_cldstop(current, CLD_STOPPED); 1615 do_notify_parent_cldstop(current, CLD_STOPPED);
1629 read_unlock(&tasklist_lock); 1616 read_unlock(&tasklist_lock);
@@ -1658,8 +1645,7 @@ static int do_signal_stop(int signr)
1658 } else { 1645 } else {
1659 struct task_struct *t; 1646 struct task_struct *t;
1660 1647
1661 if (unlikely((sig->flags & (SIGNAL_STOP_DEQUEUED | SIGNAL_UNKILLABLE)) 1648 if (!likely(sig->flags & SIGNAL_STOP_DEQUEUED) ||
1662 != SIGNAL_STOP_DEQUEUED) ||
1663 unlikely(signal_group_exit(sig))) 1649 unlikely(signal_group_exit(sig)))
1664 return 0; 1650 return 0;
1665 /* 1651 /*
@@ -1760,6 +1746,9 @@ relock:
1760 signal->flags &= ~SIGNAL_CLD_MASK; 1746 signal->flags &= ~SIGNAL_CLD_MASK;
1761 spin_unlock_irq(&sighand->siglock); 1747 spin_unlock_irq(&sighand->siglock);
1762 1748
1749 if (unlikely(!tracehook_notify_jctl(1, why)))
1750 goto relock;
1751
1763 read_lock(&tasklist_lock); 1752 read_lock(&tasklist_lock);
1764 do_notify_parent_cldstop(current->group_leader, why); 1753 do_notify_parent_cldstop(current->group_leader, why);
1765 read_unlock(&tasklist_lock); 1754 read_unlock(&tasklist_lock);
@@ -1773,17 +1762,33 @@ relock:
1773 do_signal_stop(0)) 1762 do_signal_stop(0))
1774 goto relock; 1763 goto relock;
1775 1764
1776 signr = dequeue_signal(current, &current->blocked, info); 1765 /*
1777 if (!signr) 1766 * Tracing can induce an artificial signal and choose sigaction.

1778 break; /* will return 0 */ 1767 * The return value in @signr determines the default action,
1768 * but @info->si_signo is the signal number we will report.
1769 */
1770 signr = tracehook_get_signal(current, regs, info, return_ka);
1771 if (unlikely(signr < 0))
1772 goto relock;
1773 if (unlikely(signr != 0))
1774 ka = return_ka;
1775 else {
1776 signr = dequeue_signal(current, &current->blocked,
1777 info);
1779 1778
1780 if (signr != SIGKILL) {
1781 signr = ptrace_signal(signr, info, regs, cookie);
1782 if (!signr) 1779 if (!signr)
1783 continue; 1780 break; /* will return 0 */
1781
1782 if (signr != SIGKILL) {
1783 signr = ptrace_signal(signr, info,
1784 regs, cookie);
1785 if (!signr)
1786 continue;
1787 }
1788
1789 ka = &sighand->action[signr-1];
1784 } 1790 }
1785 1791
1786 ka = &sighand->action[signr-1];
1787 if (ka->sa.sa_handler == SIG_IGN) /* Do nothing. */ 1792 if (ka->sa.sa_handler == SIG_IGN) /* Do nothing. */
1788 continue; 1793 continue;
1789 if (ka->sa.sa_handler != SIG_DFL) { 1794 if (ka->sa.sa_handler != SIG_DFL) {
@@ -1831,7 +1836,7 @@ relock:
1831 spin_lock_irq(&sighand->siglock); 1836 spin_lock_irq(&sighand->siglock);
1832 } 1837 }
1833 1838
1834 if (likely(do_signal_stop(signr))) { 1839 if (likely(do_signal_stop(info->si_signo))) {
1835 /* It released the siglock. */ 1840 /* It released the siglock. */
1836 goto relock; 1841 goto relock;
1837 } 1842 }
@@ -1852,7 +1857,7 @@ relock:
1852 1857
1853 if (sig_kernel_coredump(signr)) { 1858 if (sig_kernel_coredump(signr)) {
1854 if (print_fatal_signals) 1859 if (print_fatal_signals)
1855 print_fatal_signal(regs, signr); 1860 print_fatal_signal(regs, info->si_signo);
1856 /* 1861 /*
1857 * If it was able to dump core, this kills all 1862 * If it was able to dump core, this kills all
1858 * other threads in the group and synchronizes with 1863 * other threads in the group and synchronizes with
@@ -1861,13 +1866,13 @@ relock:
1861 * first and our do_group_exit call below will use 1866 * first and our do_group_exit call below will use
1862 * that value and ignore the one we pass it. 1867 * that value and ignore the one we pass it.
1863 */ 1868 */
1864 do_coredump((long)signr, signr, regs); 1869 do_coredump(info->si_signo, info->si_signo, regs);
1865 } 1870 }
1866 1871
1867 /* 1872 /*
1868 * Death signals, no core dump. 1873 * Death signals, no core dump.
1869 */ 1874 */
1870 do_group_exit(signr); 1875 do_group_exit(info->si_signo);
1871 /* NOTREACHED */ 1876 /* NOTREACHED */
1872 } 1877 }
1873 spin_unlock_irq(&sighand->siglock); 1878 spin_unlock_irq(&sighand->siglock);
@@ -1909,7 +1914,7 @@ void exit_signals(struct task_struct *tsk)
1909out: 1914out:
1910 spin_unlock_irq(&tsk->sighand->siglock); 1915 spin_unlock_irq(&tsk->sighand->siglock);
1911 1916
1912 if (unlikely(group_stop)) { 1917 if (unlikely(group_stop) && tracehook_notify_jctl(1, CLD_STOPPED)) {
1913 read_lock(&tasklist_lock); 1918 read_lock(&tasklist_lock);
1914 do_notify_parent_cldstop(tsk, CLD_STOPPED); 1919 do_notify_parent_cldstop(tsk, CLD_STOPPED);
1915 read_unlock(&tasklist_lock); 1920 read_unlock(&tasklist_lock);
@@ -1920,8 +1925,6 @@ EXPORT_SYMBOL(recalc_sigpending);
1920EXPORT_SYMBOL_GPL(dequeue_signal); 1925EXPORT_SYMBOL_GPL(dequeue_signal);
1921EXPORT_SYMBOL(flush_signals); 1926EXPORT_SYMBOL(flush_signals);
1922EXPORT_SYMBOL(force_sig); 1927EXPORT_SYMBOL(force_sig);
1923EXPORT_SYMBOL(kill_proc);
1924EXPORT_SYMBOL(ptrace_notify);
1925EXPORT_SYMBOL(send_sig); 1928EXPORT_SYMBOL(send_sig);
1926EXPORT_SYMBOL(send_sig_info); 1929EXPORT_SYMBOL(send_sig_info);
1927EXPORT_SYMBOL(sigprocmask); 1930EXPORT_SYMBOL(sigprocmask);
@@ -2196,7 +2199,7 @@ sys_rt_sigtimedwait(const sigset_t __user *uthese,
2196} 2199}
2197 2200
2198asmlinkage long 2201asmlinkage long
2199sys_kill(int pid, int sig) 2202sys_kill(pid_t pid, int sig)
2200{ 2203{
2201 struct siginfo info; 2204 struct siginfo info;
2202 2205
@@ -2209,7 +2212,7 @@ sys_kill(int pid, int sig)
2209 return kill_something_info(sig, &info, pid); 2212 return kill_something_info(sig, &info, pid);
2210} 2213}
2211 2214
2212static int do_tkill(int tgid, int pid, int sig) 2215static int do_tkill(pid_t tgid, pid_t pid, int sig)
2213{ 2216{
2214 int error; 2217 int error;
2215 struct siginfo info; 2218 struct siginfo info;
@@ -2255,7 +2258,7 @@ static int do_tkill(int tgid, int pid, int sig)
2255 * exists but it's not belonging to the target process anymore. This 2258 * exists but it's not belonging to the target process anymore. This
2256 * method solves the problem of threads exiting and PIDs getting reused. 2259 * method solves the problem of threads exiting and PIDs getting reused.
2257 */ 2260 */
2258asmlinkage long sys_tgkill(int tgid, int pid, int sig) 2261asmlinkage long sys_tgkill(pid_t tgid, pid_t pid, int sig)
2259{ 2262{
2260 /* This is only valid for single tasks */ 2263 /* This is only valid for single tasks */
2261 if (pid <= 0 || tgid <= 0) 2264 if (pid <= 0 || tgid <= 0)
@@ -2268,7 +2271,7 @@ asmlinkage long sys_tgkill(int tgid, int pid, int sig)
2268 * Send a signal to only one task, even if it's a CLONE_THREAD task. 2271 * Send a signal to only one task, even if it's a CLONE_THREAD task.
2269 */ 2272 */
2270asmlinkage long 2273asmlinkage long
2271sys_tkill(int pid, int sig) 2274sys_tkill(pid_t pid, int sig)
2272{ 2275{
2273 /* This is only valid for single tasks */ 2276 /* This is only valid for single tasks */
2274 if (pid <= 0) 2277 if (pid <= 0)
@@ -2278,7 +2281,7 @@ sys_tkill(int pid, int sig)
2278} 2281}
2279 2282
2280asmlinkage long 2283asmlinkage long
2281sys_rt_sigqueueinfo(int pid, int sig, siginfo_t __user *uinfo) 2284sys_rt_sigqueueinfo(pid_t pid, int sig, siginfo_t __user *uinfo)
2282{ 2285{
2283 siginfo_t info; 2286 siginfo_t info;
2284 2287
@@ -2325,7 +2328,7 @@ int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
2325 * (for example, SIGCHLD), shall cause the pending signal to 2328 * (for example, SIGCHLD), shall cause the pending signal to
2326 * be discarded, whether or not it is blocked" 2329 * be discarded, whether or not it is blocked"
2327 */ 2330 */
2328 if (__sig_ignored(t, sig)) { 2331 if (sig_handler_ignored(sig_handler(t, sig), sig)) {
2329 sigemptyset(&mask); 2332 sigemptyset(&mask);
2330 sigaddset(&mask, sig); 2333 sigaddset(&mask, sig);
2331 rm_from_queue_full(&mask, &t->signal->shared_pending); 2334 rm_from_queue_full(&mask, &t->signal->shared_pending);
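
Among the signal.c changes, do_notify_parent() now returns a value: -1 when the parent ignored us and the child has switched to self-reaping, otherwise the signal that was sent. A hedged sketch of how an exit path could consume that return value -- the caller below is illustrative and not part of this diff:

        static void notify_and_reap(struct task_struct *tsk)
        {
                int sig;

                sig = do_notify_parent(tsk, tsk->exit_signal);
                if (sig == -1)
                        tsk->exit_state = EXIT_DEAD;    /* nobody will wait() for us */
                else
                        tsk->exit_state = EXIT_ZOMBIE;  /* parent will reap via wait() */
        }
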
diff --git a/kernel/smp.c b/kernel/smp.c
new file mode 100644
index 000000000000..f362a8553777
--- /dev/null
+++ b/kernel/smp.c
@@ -0,0 +1,431 @@
1/*
2 * Generic helpers for smp ipi calls
3 *
4 * (C) Jens Axboe <jens.axboe@oracle.com> 2008
5 *
6 */
7#include <linux/init.h>
8#include <linux/module.h>
9#include <linux/percpu.h>
10#include <linux/rcupdate.h>
11#include <linux/rculist.h>
12#include <linux/smp.h>
13
14static DEFINE_PER_CPU(struct call_single_queue, call_single_queue);
15static LIST_HEAD(call_function_queue);
16__cacheline_aligned_in_smp DEFINE_SPINLOCK(call_function_lock);
17
18enum {
19 CSD_FLAG_WAIT = 0x01,
20 CSD_FLAG_ALLOC = 0x02,
21};
22
23struct call_function_data {
24 struct call_single_data csd;
25 spinlock_t lock;
26 unsigned int refs;
27 cpumask_t cpumask;
28 struct rcu_head rcu_head;
29};
30
31struct call_single_queue {
32 struct list_head list;
33 spinlock_t lock;
34};
35
36static int __cpuinit init_call_single_data(void)
37{
38 int i;
39
40 for_each_possible_cpu(i) {
41 struct call_single_queue *q = &per_cpu(call_single_queue, i);
42
43 spin_lock_init(&q->lock);
44 INIT_LIST_HEAD(&q->list);
45 }
46 return 0;
47}
48early_initcall(init_call_single_data);
49
50static void csd_flag_wait(struct call_single_data *data)
51{
52 /* Wait for response */
53 do {
54 /*
55 * We need to see the flags store in the IPI handler
56 */
57 smp_mb();
58 if (!(data->flags & CSD_FLAG_WAIT))
59 break;
60 cpu_relax();
61 } while (1);
62}
63
64/*
65 * Insert a previously allocated call_single_data element for execution
66 * on the given CPU. data must already have ->func, ->info, and ->flags set.
67 */
68static void generic_exec_single(int cpu, struct call_single_data *data)
69{
70 struct call_single_queue *dst = &per_cpu(call_single_queue, cpu);
71 int wait = data->flags & CSD_FLAG_WAIT, ipi;
72 unsigned long flags;
73
74 spin_lock_irqsave(&dst->lock, flags);
75 ipi = list_empty(&dst->list);
76 list_add_tail(&data->list, &dst->list);
77 spin_unlock_irqrestore(&dst->lock, flags);
78
79 if (ipi)
80 arch_send_call_function_single_ipi(cpu);
81
82 if (wait)
83 csd_flag_wait(data);
84}
85
86static void rcu_free_call_data(struct rcu_head *head)
87{
88 struct call_function_data *data;
89
90 data = container_of(head, struct call_function_data, rcu_head);
91
92 kfree(data);
93}
94
95/*
96 * Invoked by arch to handle an IPI for call function. Must be called with
97 * interrupts disabled.
98 */
99void generic_smp_call_function_interrupt(void)
100{
101 struct call_function_data *data;
102 int cpu = get_cpu();
103
104 /*
105 * It's ok to use list_for_each_rcu() here even though we may delete
106 * 'pos', since list_del_rcu() doesn't clear ->next
107 */
108 rcu_read_lock();
109 list_for_each_entry_rcu(data, &call_function_queue, csd.list) {
110 int refs;
111
112 if (!cpu_isset(cpu, data->cpumask))
113 continue;
114
115 data->csd.func(data->csd.info);
116
117 spin_lock(&data->lock);
118 cpu_clear(cpu, data->cpumask);
119 WARN_ON(data->refs == 0);
120 data->refs--;
121 refs = data->refs;
122 spin_unlock(&data->lock);
123
124 if (refs)
125 continue;
126
127 spin_lock(&call_function_lock);
128 list_del_rcu(&data->csd.list);
129 spin_unlock(&call_function_lock);
130
131 if (data->csd.flags & CSD_FLAG_WAIT) {
132 /*
133 * serialize stores to data with the flag clear
134 * and wakeup
135 */
136 smp_wmb();
137 data->csd.flags &= ~CSD_FLAG_WAIT;
138 }
139 if (data->csd.flags & CSD_FLAG_ALLOC)
140 call_rcu(&data->rcu_head, rcu_free_call_data);
141 }
142 rcu_read_unlock();
143
144 put_cpu();
145}
146
147/*
148 * Invoked by arch to handle an IPI for call function single. Must be called
149 * from the arch with interrupts disabled.
150 */
151void generic_smp_call_function_single_interrupt(void)
152{
153 struct call_single_queue *q = &__get_cpu_var(call_single_queue);
154 LIST_HEAD(list);
155
156 /*
157 * Need to see other stores to list head for checking whether
158 * list is empty without holding q->lock
159 */
160 smp_mb();
161 while (!list_empty(&q->list)) {
162 unsigned int data_flags;
163
164 spin_lock(&q->lock);
165 list_replace_init(&q->list, &list);
166 spin_unlock(&q->lock);
167
168 while (!list_empty(&list)) {
169 struct call_single_data *data;
170
171 data = list_entry(list.next, struct call_single_data,
172 list);
173 list_del(&data->list);
174
175 /*
176 * 'data' can be invalid after this call if
177 * flags == 0 (when called through
178 * generic_exec_single()), so save them away before
179 * making the call.
180 */
181 data_flags = data->flags;
182
183 data->func(data->info);
184
185 if (data_flags & CSD_FLAG_WAIT) {
186 smp_wmb();
187 data->flags &= ~CSD_FLAG_WAIT;
188 } else if (data_flags & CSD_FLAG_ALLOC)
189 kfree(data);
190 }
191 /*
192 * See comment on outer loop
193 */
194 smp_mb();
195 }
196}
197
198/*
199 * smp_call_function_single - Run a function on a specific CPU
200 * @func: The function to run. This must be fast and non-blocking.
201 * @info: An arbitrary pointer to pass to the function.
202 * @wait: If true, wait until function has completed on other CPUs.
203 *
204 * Returns 0 on success, else a negative status code. Note that @wait
205 * will be implicitly turned on in case of allocation failures, since
206 * we fall back to on-stack allocation.
207 */
208int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
209 int wait)
210{
211 struct call_single_data d;
212 unsigned long flags;
213 /* prevent preemption and reschedule on another processor,
214 as well as CPU removal */
215 int me = get_cpu();
216 int err = 0;
217
218 /* Can deadlock when called with interrupts disabled */
219 WARN_ON(irqs_disabled());
220
221 if (cpu == me) {
222 local_irq_save(flags);
223 func(info);
224 local_irq_restore(flags);
225 } else if ((unsigned)cpu < NR_CPUS && cpu_online(cpu)) {
226 struct call_single_data *data = NULL;
227
228 if (!wait) {
229 data = kmalloc(sizeof(*data), GFP_ATOMIC);
230 if (data)
231 data->flags = CSD_FLAG_ALLOC;
232 }
233 if (!data) {
234 data = &d;
235 data->flags = CSD_FLAG_WAIT;
236 }
237
238 data->func = func;
239 data->info = info;
240 generic_exec_single(cpu, data);
241 } else {
242 err = -ENXIO; /* CPU not online */
243 }
244
245 put_cpu();
246 return err;
247}
248EXPORT_SYMBOL(smp_call_function_single);
249
250/**
251 * __smp_call_function_single(): Run a function on another CPU
252 * @cpu: The CPU to run on.
253 * @data: Pre-allocated and setup data structure
254 *
255 * Like smp_call_function_single(), but allow caller to pass in a pre-allocated
256 * data structure. Useful for embedding @data inside other structures, for
257 * instance.
258 *
259 */
260void __smp_call_function_single(int cpu, struct call_single_data *data)
261{
262 /* Can deadlock when called with interrupts disabled */
263 WARN_ON((data->flags & CSD_FLAG_WAIT) && irqs_disabled());
264
265 generic_exec_single(cpu, data);
266}
267
268/* Dummy function */
269static void quiesce_dummy(void *unused)
270{
271}
272
273/*
274 * Ensure stack based data used in call function mask is safe to free.
275 *
276 * This is needed by smp_call_function_mask when using on-stack data, because
277 * a single call function queue is shared by all CPUs, and any CPU may pick up
278 * the data item on the queue at any time before it is deleted. So we need to
279 * ensure that all CPUs have transitioned through a quiescent state after
280 * this call.
281 *
282 * This is a very slow function, implemented by sending synchronous IPIs to
283 * all possible CPUs. For this reason, we have to alloc data rather than use
284 * stack based data even in the case of synchronous calls. The stack based
285 * data is then just used for deadlock/oom fallback which will be very rare.
286 *
287 * If a faster scheme can be made, we could go back to preferring stack based
288 * data -- the data allocation/free is non-zero cost.
289 */
290static void smp_call_function_mask_quiesce_stack(cpumask_t mask)
291{
292 struct call_single_data data;
293 int cpu;
294
295 data.func = quiesce_dummy;
296 data.info = NULL;
297
298 for_each_cpu_mask(cpu, mask) {
299 data.flags = CSD_FLAG_WAIT;
300 generic_exec_single(cpu, &data);
301 }
302}
303
304/**
305 * smp_call_function_mask(): Run a function on a set of other CPUs.
306 * @mask: The set of cpus to run on.
307 * @func: The function to run. This must be fast and non-blocking.
308 * @info: An arbitrary pointer to pass to the function.
309 * @wait: If true, wait (atomically) until function has completed on other CPUs.
310 *
311 * Returns 0 on success, else a negative status code.
312 *
313 * If @wait is true, then returns once @func has returned. Note that @wait
314 * will be implicitly turned on in case of allocation failures, since
315 * we fall back to on-stack allocation.
316 *
317 * You must not call this function with disabled interrupts or from a
318 * hardware interrupt handler or from a bottom half handler. Preemption
319 * must be disabled when calling this function.
320 */
321int smp_call_function_mask(cpumask_t mask, void (*func)(void *), void *info,
322 int wait)
323{
324 struct call_function_data d;
325 struct call_function_data *data = NULL;
326 cpumask_t allbutself;
327 unsigned long flags;
328 int cpu, num_cpus;
329 int slowpath = 0;
330
331 /* Can deadlock when called with interrupts disabled */
332 WARN_ON(irqs_disabled());
333
334 cpu = smp_processor_id();
335 allbutself = cpu_online_map;
336 cpu_clear(cpu, allbutself);
337 cpus_and(mask, mask, allbutself);
338 num_cpus = cpus_weight(mask);
339
340 /*
341 * If zero CPUs, return. If just a single CPU, turn this request
342 * into a targeted single call instead since it's faster.
343 */
344 if (!num_cpus)
345 return 0;
346 else if (num_cpus == 1) {
347 cpu = first_cpu(mask);
348 return smp_call_function_single(cpu, func, info, wait);
349 }
350
351 data = kmalloc(sizeof(*data), GFP_ATOMIC);
352 if (data) {
353 data->csd.flags = CSD_FLAG_ALLOC;
354 if (wait)
355 data->csd.flags |= CSD_FLAG_WAIT;
356 } else {
357 data = &d;
358 data->csd.flags = CSD_FLAG_WAIT;
359 wait = 1;
360 slowpath = 1;
361 }
362
363 spin_lock_init(&data->lock);
364 data->csd.func = func;
365 data->csd.info = info;
366 data->refs = num_cpus;
367 data->cpumask = mask;
368
369 spin_lock_irqsave(&call_function_lock, flags);
370 list_add_tail_rcu(&data->csd.list, &call_function_queue);
371 spin_unlock_irqrestore(&call_function_lock, flags);
372
373 /* Send a message to all CPUs in the map */
374 arch_send_call_function_ipi(mask);
375
376 /* optionally wait for the CPUs to complete */
377 if (wait) {
378 csd_flag_wait(&data->csd);
379 if (unlikely(slowpath))
380 smp_call_function_mask_quiesce_stack(mask);
381 }
382
383 return 0;
384}
385EXPORT_SYMBOL(smp_call_function_mask);
386
387/**
388 * smp_call_function(): Run a function on all other CPUs.
389 * @func: The function to run. This must be fast and non-blocking.
390 * @info: An arbitrary pointer to pass to the function.
391 * @wait: If true, wait (atomically) until function has completed on other CPUs.
392 *
393 * Returns 0 on success, else a negative status code.
394 *
395 * If @wait is true, then returns once @func has returned; otherwise
396 * it returns just before the target cpu calls @func. In case of allocation
397 * failure, @wait will be implicitly turned on.
398 *
399 * You must not call this function with disabled interrupts or from a
400 * hardware interrupt handler or from a bottom half handler.
401 */
402int smp_call_function(void (*func)(void *), void *info, int wait)
403{
404 int ret;
405
406 preempt_disable();
407 ret = smp_call_function_mask(cpu_online_map, func, info, wait);
408 preempt_enable();
409 return ret;
410}
411EXPORT_SYMBOL(smp_call_function);
412
413void ipi_call_lock(void)
414{
415 spin_lock(&call_function_lock);
416}
417
418void ipi_call_unlock(void)
419{
420 spin_unlock(&call_function_lock);
421}
422
423void ipi_call_lock_irq(void)
424{
425 spin_lock_irq(&call_function_lock);
426}
427
428void ipi_call_unlock_irq(void)
429{
430 spin_unlock_irq(&call_function_lock);
431}
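
The new kernel/smp.c provides the arch-independent cross-CPU call helpers. A short usage sketch against the interfaces introduced above (the flush_local_state() callback and the CPU number are made up for the example); the callback runs in IPI context with interrupts disabled, so it must be fast and must not sleep:

        #include <linux/smp.h>

        static void flush_local_state(void *info)
        {
                /* runs on the target CPU, IPI context, interrupts off */
        }

        static void example(void)
        {
                /* run on CPU 2 and wait for it to finish */
                smp_call_function_single(2, flush_local_state, NULL, 1);

                /* run on every other online CPU, do not wait */
                smp_call_function(flush_local_state, NULL, 0);
        }
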
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 36e061740047..c506f266a6b9 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -131,23 +131,17 @@ void _local_bh_enable(void)
131 131
132EXPORT_SYMBOL(_local_bh_enable); 132EXPORT_SYMBOL(_local_bh_enable);
133 133
134void local_bh_enable(void) 134static inline void _local_bh_enable_ip(unsigned long ip)
135{ 135{
136 WARN_ON_ONCE(in_irq() || irqs_disabled());
136#ifdef CONFIG_TRACE_IRQFLAGS 137#ifdef CONFIG_TRACE_IRQFLAGS
137 unsigned long flags; 138 local_irq_disable();
138
139 WARN_ON_ONCE(in_irq());
140#endif
141 WARN_ON_ONCE(irqs_disabled());
142
143#ifdef CONFIG_TRACE_IRQFLAGS
144 local_irq_save(flags);
145#endif 139#endif
146 /* 140 /*
147 * Are softirqs going to be turned on now: 141 * Are softirqs going to be turned on now:
148 */ 142 */
149 if (softirq_count() == SOFTIRQ_OFFSET) 143 if (softirq_count() == SOFTIRQ_OFFSET)
150 trace_softirqs_on((unsigned long)__builtin_return_address(0)); 144 trace_softirqs_on(ip);
151 /* 145 /*
152 * Keep preemption disabled until we are done with 146 * Keep preemption disabled until we are done with
153 * softirq processing: 147 * softirq processing:
@@ -159,40 +153,20 @@ void local_bh_enable(void)
159 153
160 dec_preempt_count(); 154 dec_preempt_count();
161#ifdef CONFIG_TRACE_IRQFLAGS 155#ifdef CONFIG_TRACE_IRQFLAGS
162 local_irq_restore(flags); 156 local_irq_enable();
163#endif 157#endif
164 preempt_check_resched(); 158 preempt_check_resched();
165} 159}
160
161void local_bh_enable(void)
162{
163 _local_bh_enable_ip((unsigned long)__builtin_return_address(0));
164}
166EXPORT_SYMBOL(local_bh_enable); 165EXPORT_SYMBOL(local_bh_enable);
167 166
168void local_bh_enable_ip(unsigned long ip) 167void local_bh_enable_ip(unsigned long ip)
169{ 168{
170#ifdef CONFIG_TRACE_IRQFLAGS 169 _local_bh_enable_ip(ip);
171 unsigned long flags;
172
173 WARN_ON_ONCE(in_irq());
174
175 local_irq_save(flags);
176#endif
177 /*
178 * Are softirqs going to be turned on now:
179 */
180 if (softirq_count() == SOFTIRQ_OFFSET)
181 trace_softirqs_on(ip);
182 /*
183 * Keep preemption disabled until we are done with
184 * softirq processing:
185 */
186 sub_preempt_count(SOFTIRQ_OFFSET - 1);
187
188 if (unlikely(!in_interrupt() && local_softirq_pending()))
189 do_softirq();
190
191 dec_preempt_count();
192#ifdef CONFIG_TRACE_IRQFLAGS
193 local_irq_restore(flags);
194#endif
195 preempt_check_resched();
196} 170}
197EXPORT_SYMBOL(local_bh_enable_ip); 171EXPORT_SYMBOL(local_bh_enable_ip);
198 172
@@ -312,7 +286,7 @@ void irq_exit(void)
312#ifdef CONFIG_NO_HZ 286#ifdef CONFIG_NO_HZ
313 /* Make sure that timer wheel updates are propagated */ 287 /* Make sure that timer wheel updates are propagated */
314 if (!in_interrupt() && idle_cpu(smp_processor_id()) && !need_resched()) 288 if (!in_interrupt() && idle_cpu(smp_processor_id()) && !need_resched())
315 tick_nohz_stop_sched_tick(); 289 tick_nohz_stop_sched_tick(0);
316 rcu_irq_exit(); 290 rcu_irq_exit();
317#endif 291#endif
318 preempt_enable_no_resched(); 292 preempt_enable_no_resched();
@@ -347,9 +321,8 @@ void raise_softirq(unsigned int nr)
347 local_irq_restore(flags); 321 local_irq_restore(flags);
348} 322}
349 323
350void open_softirq(int nr, void (*action)(struct softirq_action*), void *data) 324void open_softirq(int nr, void (*action)(struct softirq_action *))
351{ 325{
352 softirq_vec[nr].data = data;
353 softirq_vec[nr].action = action; 326 softirq_vec[nr].action = action;
354} 327}
355 328
@@ -360,10 +333,8 @@ struct tasklet_head
360 struct tasklet_struct **tail; 333 struct tasklet_struct **tail;
361}; 334};
362 335
363/* Some compilers disobey section attribute on statics when not 336static DEFINE_PER_CPU(struct tasklet_head, tasklet_vec);
364 initialized -- RR */ 337static DEFINE_PER_CPU(struct tasklet_head, tasklet_hi_vec);
365static DEFINE_PER_CPU(struct tasklet_head, tasklet_vec) = { NULL };
366static DEFINE_PER_CPU(struct tasklet_head, tasklet_hi_vec) = { NULL };
367 338
368void __tasklet_schedule(struct tasklet_struct *t) 339void __tasklet_schedule(struct tasklet_struct *t)
369{ 340{
@@ -503,8 +474,8 @@ void __init softirq_init(void)
503 &per_cpu(tasklet_hi_vec, cpu).head; 474 &per_cpu(tasklet_hi_vec, cpu).head;
504 } 475 }
505 476
506 open_softirq(TASKLET_SOFTIRQ, tasklet_action, NULL); 477 open_softirq(TASKLET_SOFTIRQ, tasklet_action);
507 open_softirq(HI_SOFTIRQ, tasklet_hi_action, NULL); 478 open_softirq(HI_SOFTIRQ, tasklet_hi_action);
508} 479}
509 480
510static int ksoftirqd(void * __bind_cpu) 481static int ksoftirqd(void * __bind_cpu)
@@ -645,7 +616,7 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb,
645 616
646 p = per_cpu(ksoftirqd, hotcpu); 617 p = per_cpu(ksoftirqd, hotcpu);
647 per_cpu(ksoftirqd, hotcpu) = NULL; 618 per_cpu(ksoftirqd, hotcpu) = NULL;
648 sched_setscheduler(p, SCHED_FIFO, &param); 619 sched_setscheduler_nocheck(p, SCHED_FIFO, &param);
649 kthread_stop(p); 620 kthread_stop(p);
650 takeover_tasklets(hotcpu); 621 takeover_tasklets(hotcpu);
651 break; 622 break;
@@ -659,7 +630,7 @@ static struct notifier_block __cpuinitdata cpu_nfb = {
659 .notifier_call = cpu_callback 630 .notifier_call = cpu_callback
660}; 631};
661 632
662__init int spawn_ksoftirqd(void) 633static __init int spawn_ksoftirqd(void)
663{ 634{
664 void *cpu = (void *)(long)smp_processor_id(); 635 void *cpu = (void *)(long)smp_processor_id();
665 int err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); 636 int err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
@@ -669,17 +640,18 @@ __init int spawn_ksoftirqd(void)
669 register_cpu_notifier(&cpu_nfb); 640 register_cpu_notifier(&cpu_nfb);
670 return 0; 641 return 0;
671} 642}
643early_initcall(spawn_ksoftirqd);
672 644
673#ifdef CONFIG_SMP 645#ifdef CONFIG_SMP
674/* 646/*
675 * Call a function on all processors 647 * Call a function on all processors
676 */ 648 */
677int on_each_cpu(void (*func) (void *info), void *info, int retry, int wait) 649int on_each_cpu(void (*func) (void *info), void *info, int wait)
678{ 650{
679 int ret = 0; 651 int ret = 0;
680 652
681 preempt_disable(); 653 preempt_disable();
682 ret = smp_call_function(func, info, retry, wait); 654 ret = smp_call_function(func, info, wait);
683 local_irq_disable(); 655 local_irq_disable();
684 func(info); 656 func(info);
685 local_irq_enable(); 657 local_irq_enable();
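
Two interface changes ripple through callers here: open_softirq() loses its void *data cookie and on_each_cpu() loses the retry argument. A hedged sketch of a caller updated for the new signatures (MY_SOFTIRQ and the handler names are hypothetical; real code would have to add its own entry to the softirq enum):

        #include <linux/interrupt.h>
        #include <linux/smp.h>

        static void my_softirq_action(struct softirq_action *a)
        {
                /* the per-softirq data cookie is gone; use per-CPU or global state */
        }

        static void my_percpu_sync(void *info)
        {
                /* runs on every CPU with interrupts disabled */
        }

        static int __init my_init(void)
        {
                open_softirq(MY_SOFTIRQ, my_softirq_action);    /* no data argument */
                return on_each_cpu(my_percpu_sync, NULL, 1);    /* no retry argument */
        }
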
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index c828c2339cc9..cb838ee93a82 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -13,6 +13,7 @@
13#include <linux/delay.h> 13#include <linux/delay.h>
14#include <linux/freezer.h> 14#include <linux/freezer.h>
15#include <linux/kthread.h> 15#include <linux/kthread.h>
16#include <linux/lockdep.h>
16#include <linux/notifier.h> 17#include <linux/notifier.h>
17#include <linux/module.h> 18#include <linux/module.h>
18 19
@@ -25,7 +26,22 @@ static DEFINE_PER_CPU(unsigned long, print_timestamp);
25static DEFINE_PER_CPU(struct task_struct *, watchdog_task); 26static DEFINE_PER_CPU(struct task_struct *, watchdog_task);
26 27
27static int __read_mostly did_panic; 28static int __read_mostly did_panic;
28unsigned long __read_mostly softlockup_thresh = 60; 29int __read_mostly softlockup_thresh = 60;
30
31/*
32 * Should we panic (and reboot, if panic_timeout= is set) when a
33 * soft-lockup occurs:
34 */
35unsigned int __read_mostly softlockup_panic =
36 CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE;
37
38static int __init softlockup_panic_setup(char *str)
39{
40 softlockup_panic = simple_strtoul(str, NULL, 0);
41
42 return 1;
43}
44__setup("softlockup_panic=", softlockup_panic_setup);
29 45
30static int 46static int
31softlock_panic(struct notifier_block *this, unsigned long event, void *ptr) 47softlock_panic(struct notifier_block *this, unsigned long event, void *ptr)
@@ -84,6 +100,14 @@ void softlockup_tick(void)
84 struct pt_regs *regs = get_irq_regs(); 100 struct pt_regs *regs = get_irq_regs();
85 unsigned long now; 101 unsigned long now;
86 102
103 /* Is detection switched off? */
104 if (!per_cpu(watchdog_task, this_cpu) || softlockup_thresh <= 0) {
105 /* Be sure we don't false trigger if switched back on */
106 if (touch_timestamp)
107 per_cpu(touch_timestamp, this_cpu) = 0;
108 return;
109 }
110
87 if (touch_timestamp == 0) { 111 if (touch_timestamp == 0) {
88 __touch_softlockup_watchdog(); 112 __touch_softlockup_watchdog();
89 return; 113 return;
@@ -92,11 +116,8 @@ void softlockup_tick(void)
92 print_timestamp = per_cpu(print_timestamp, this_cpu); 116 print_timestamp = per_cpu(print_timestamp, this_cpu);
93 117
94 /* report at most once a second */ 118 /* report at most once a second */
95 if ((print_timestamp >= touch_timestamp && 119 if (print_timestamp == touch_timestamp || did_panic)
96 print_timestamp < (touch_timestamp + 1)) ||
97 did_panic || !per_cpu(watchdog_task, this_cpu)) {
98 return; 120 return;
99 }
100 121
101 /* do not print during early bootup: */ 122 /* do not print during early bootup: */
102 if (unlikely(system_state != SYSTEM_RUNNING)) { 123 if (unlikely(system_state != SYSTEM_RUNNING)) {
@@ -106,8 +127,11 @@ void softlockup_tick(void)
106 127
107 now = get_timestamp(this_cpu); 128 now = get_timestamp(this_cpu);
108 129
109 /* Wake up the high-prio watchdog task every second: */ 130 /*
110 if (now > (touch_timestamp + 1)) 131 * Wake up the high-prio watchdog task twice per
132 * threshold timespan.
133 */
134 if (now > touch_timestamp + softlockup_thresh/2)
111 wake_up_process(per_cpu(watchdog_task, this_cpu)); 135 wake_up_process(per_cpu(watchdog_task, this_cpu));
112 136
113 /* Warn about unreasonable delays: */ 137 /* Warn about unreasonable delays: */
@@ -120,11 +144,16 @@ void softlockup_tick(void)
120 printk(KERN_ERR "BUG: soft lockup - CPU#%d stuck for %lus! [%s:%d]\n", 144 printk(KERN_ERR "BUG: soft lockup - CPU#%d stuck for %lus! [%s:%d]\n",
121 this_cpu, now - touch_timestamp, 145 this_cpu, now - touch_timestamp,
122 current->comm, task_pid_nr(current)); 146 current->comm, task_pid_nr(current));
147 print_modules();
148 print_irqtrace_events(current);
123 if (regs) 149 if (regs)
124 show_regs(regs); 150 show_regs(regs);
125 else 151 else
126 dump_stack(); 152 dump_stack();
127 spin_unlock(&print_lock); 153 spin_unlock(&print_lock);
154
155 if (softlockup_panic)
156 panic("softlockup: hung tasks");
128} 157}
129 158
130/* 159/*
@@ -177,6 +206,9 @@ static void check_hung_task(struct task_struct *t, unsigned long now)
177 206
178 t->last_switch_timestamp = now; 207 t->last_switch_timestamp = now;
179 touch_nmi_watchdog(); 208 touch_nmi_watchdog();
209
210 if (softlockup_panic)
211 panic("softlockup: blocked tasks");
180} 212}
181 213
182/* 214/*
@@ -201,7 +233,8 @@ static void check_hung_uninterruptible_tasks(int this_cpu)
201 do_each_thread(g, t) { 233 do_each_thread(g, t) {
202 if (!--max_count) 234 if (!--max_count)
203 goto unlock; 235 goto unlock;
204 if (t->state & TASK_UNINTERRUPTIBLE) 236 /* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */
237 if (t->state == TASK_UNINTERRUPTIBLE)
205 check_hung_task(t, now); 238 check_hung_task(t, now);
206 } while_each_thread(g, t); 239 } while_each_thread(g, t);
207 unlock: 240 unlock:
@@ -306,14 +339,33 @@ static struct notifier_block __cpuinitdata cpu_nfb = {
306 .notifier_call = cpu_callback 339 .notifier_call = cpu_callback
307}; 340};
308 341
309__init void spawn_softlockup_task(void) 342static int __initdata nosoftlockup;
343
344static int __init nosoftlockup_setup(char *str)
345{
346 nosoftlockup = 1;
347 return 1;
348}
349__setup("nosoftlockup", nosoftlockup_setup);
350
351static int __init spawn_softlockup_task(void)
310{ 352{
311 void *cpu = (void *)(long)smp_processor_id(); 353 void *cpu = (void *)(long)smp_processor_id();
312 int err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); 354 int err;
355
356 if (nosoftlockup)
357 return 0;
313 358
314 BUG_ON(err == NOTIFY_BAD); 359 err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
360 if (err == NOTIFY_BAD) {
361 BUG();
362 return 1;
363 }
315 cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); 364 cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
316 register_cpu_notifier(&cpu_nfb); 365 register_cpu_notifier(&cpu_nfb);
317 366
318 atomic_notifier_chain_register(&panic_notifier_list, &panic_block); 367 atomic_notifier_chain_register(&panic_notifier_list, &panic_block);
368
369 return 0;
319} 370}
371early_initcall(spawn_softlockup_task);
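
One subtle hunk above is the switch from a bitwise test to an exact comparison in check_hung_uninterruptible_tasks(). The reasoning, spelled out with the TASK_* definitions of this era:

        /*
         * TASK_KILLABLE == (TASK_WAKEKILL | TASK_UNINTERRUPTIBLE), so for a
         * task sleeping killably (e.g. waiting on NFS):
         *
         *      t->state & TASK_UNINTERRUPTIBLE    -> nonzero: old code flagged it
         *      t->state == TASK_UNINTERRUPTIBLE   -> false:   new code skips it
         *
         * Plain D-state sleepers still compare equal and are still checked.
         */
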
diff --git a/kernel/spinlock.c b/kernel/spinlock.c
index ae28c8245123..29ab20749dd3 100644
--- a/kernel/spinlock.c
+++ b/kernel/spinlock.c
@@ -290,8 +290,8 @@ void __lockfunc _spin_lock_nested(spinlock_t *lock, int subclass)
290 spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_); 290 spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
291 LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock); 291 LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock);
292} 292}
293
294EXPORT_SYMBOL(_spin_lock_nested); 293EXPORT_SYMBOL(_spin_lock_nested);
294
295unsigned long __lockfunc _spin_lock_irqsave_nested(spinlock_t *lock, int subclass) 295unsigned long __lockfunc _spin_lock_irqsave_nested(spinlock_t *lock, int subclass)
296{ 296{
297 unsigned long flags; 297 unsigned long flags;
@@ -311,9 +311,17 @@ unsigned long __lockfunc _spin_lock_irqsave_nested(spinlock_t *lock, int subclas
311#endif 311#endif
312 return flags; 312 return flags;
313} 313}
314
315EXPORT_SYMBOL(_spin_lock_irqsave_nested); 314EXPORT_SYMBOL(_spin_lock_irqsave_nested);
316 315
316void __lockfunc _spin_lock_nest_lock(spinlock_t *lock,
317 struct lockdep_map *nest_lock)
318{
319 preempt_disable();
320 spin_acquire_nest(&lock->dep_map, 0, 0, nest_lock, _RET_IP_);
321 LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock);
322}
323EXPORT_SYMBOL(_spin_lock_nest_lock);
324
317#endif 325#endif
318 326
319void __lockfunc _spin_unlock(spinlock_t *lock) 327void __lockfunc _spin_unlock(spinlock_t *lock)
@@ -436,7 +444,7 @@ int __lockfunc _spin_trylock_bh(spinlock_t *lock)
436} 444}
437EXPORT_SYMBOL(_spin_trylock_bh); 445EXPORT_SYMBOL(_spin_trylock_bh);
438 446
439int in_lock_functions(unsigned long addr) 447notrace int in_lock_functions(unsigned long addr)
440{ 448{
441 /* Linker adds these: start and end of __lockfunc functions */ 449 /* Linker adds these: start and end of __lockfunc functions */
442 extern char __lock_text_start[], __lock_text_end[]; 450 extern char __lock_text_start[], __lock_text_end[];
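
The new _spin_lock_nest_lock() variant lets lockdep treat a whole set of inner locks as serialized by one outer lock instead of tracking each as a separately nested subclass. A hedged usage sketch -- the spin_lock_nest_lock() wrapper macro and the data structures are assumed here; only _spin_lock_nest_lock() itself appears in the hunk:

        struct parent {
                spinlock_t lock;
                struct list_head children;
        };

        struct child {
                spinlock_t lock;
                struct list_head node;
        };

        static void update_children(struct parent *p)
        {
                struct child *c;

                spin_lock(&p->lock);
                list_for_each_entry(c, &p->children, node) {
                        /* child locks are only ever taken under p->lock;
                         * tell lockdep so instead of using subclasses */
                        spin_lock_nest_lock(&c->lock, &p->lock);
                        /* ... touch *c ... */
                        spin_unlock(&c->lock);
                }
                spin_unlock(&p->lock);
        }
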
diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c
index b71816e47a30..94b527ef1d1e 100644
--- a/kernel/stacktrace.c
+++ b/kernel/stacktrace.c
@@ -6,19 +6,21 @@
6 * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> 6 * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
7 */ 7 */
8#include <linux/sched.h> 8#include <linux/sched.h>
9#include <linux/module.h>
9#include <linux/kallsyms.h> 10#include <linux/kallsyms.h>
10#include <linux/stacktrace.h> 11#include <linux/stacktrace.h>
11 12
12void print_stack_trace(struct stack_trace *trace, int spaces) 13void print_stack_trace(struct stack_trace *trace, int spaces)
13{ 14{
14 int i, j; 15 int i;
15 16
16 for (i = 0; i < trace->nr_entries; i++) { 17 if (WARN_ON(!trace->entries))
17 unsigned long ip = trace->entries[i]; 18 return;
18 19
19 for (j = 0; j < spaces + 1; j++) 20 for (i = 0; i < trace->nr_entries; i++) {
20 printk(" "); 21 printk("%*c", 1 + spaces, ' ');
21 print_ip_sym(ip); 22 print_ip_sym(trace->entries[i]);
22 } 23 }
23} 24}
25EXPORT_SYMBOL_GPL(print_stack_trace);
24 26
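
In print_stack_trace() the inner loop that emitted one blank per iteration is replaced by a single "%*c" conversion, which pads the lone space character out to the requested field width. A quick standalone check that the two forms produce the same indentation:

#include <stdio.h>

int main(void)
{
        int spaces = 3, j;

        /* old style: explicit loop, one blank per iteration */
        for (j = 0; j < spaces + 1; j++)
                printf(" ");
        printf("|old|\n");

        /* new style: pad the single ' ' to a field of width 4,
         * producing exactly the same four blanks in one call */
        printf("%*c", 1 + spaces, ' ');
        printf("|new|\n");
        return 0;
}
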
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index b7350bbfb076..af3c7cea258b 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -1,4 +1,4 @@
1/* Copyright 2005 Rusty Russell rusty@rustcorp.com.au IBM Corporation. 1/* Copyright 2008, 2005 Rusty Russell rusty@rustcorp.com.au IBM Corporation.
2 * GPL v2 and any later version. 2 * GPL v2 and any later version.
3 */ 3 */
4#include <linux/cpu.h> 4#include <linux/cpu.h>
@@ -13,203 +13,177 @@
13#include <asm/atomic.h> 13#include <asm/atomic.h>
14#include <asm/uaccess.h> 14#include <asm/uaccess.h>
15 15
16/* Since we effect priority and affinity (both of which are visible 16/* This controls the threads on each CPU. */
17 * to, and settable by outside processes) we do indirection via a
18 * kthread. */
19
20/* Thread to stop each CPU in user context. */
21enum stopmachine_state { 17enum stopmachine_state {
22 STOPMACHINE_WAIT, 18 /* Dummy starting state for thread. */
19 STOPMACHINE_NONE,
20 /* Awaiting everyone to be scheduled. */
23 STOPMACHINE_PREPARE, 21 STOPMACHINE_PREPARE,
22 /* Disable interrupts. */
24 STOPMACHINE_DISABLE_IRQ, 23 STOPMACHINE_DISABLE_IRQ,
24 /* Run the function */
25 STOPMACHINE_RUN,
26 /* Exit */
25 STOPMACHINE_EXIT, 27 STOPMACHINE_EXIT,
26}; 28};
29static enum stopmachine_state state;
27 30
28static enum stopmachine_state stopmachine_state; 31struct stop_machine_data {
29static unsigned int stopmachine_num_threads; 32 int (*fn)(void *);
30static atomic_t stopmachine_thread_ack; 33 void *data;
31 34 int fnret;
32static int stopmachine(void *cpu) 35};
33{
34 int irqs_disabled = 0;
35 int prepared = 0;
36
37 set_cpus_allowed_ptr(current, &cpumask_of_cpu((int)(long)cpu));
38
39 /* Ack: we are alive */
40 smp_mb(); /* Theoretically the ack = 0 might not be on this CPU yet. */
41 atomic_inc(&stopmachine_thread_ack);
42
43 /* Simple state machine */
44 while (stopmachine_state != STOPMACHINE_EXIT) {
45 if (stopmachine_state == STOPMACHINE_DISABLE_IRQ
46 && !irqs_disabled) {
47 local_irq_disable();
48 hard_irq_disable();
49 irqs_disabled = 1;
50 /* Ack: irqs disabled. */
51 smp_mb(); /* Must read state first. */
52 atomic_inc(&stopmachine_thread_ack);
53 } else if (stopmachine_state == STOPMACHINE_PREPARE
54 && !prepared) {
55 /* Everyone is in place, hold CPU. */
56 preempt_disable();
57 prepared = 1;
58 smp_mb(); /* Must read state first. */
59 atomic_inc(&stopmachine_thread_ack);
60 }
61 /* Yield in first stage: migration threads need to
62 * help our sisters onto their CPUs. */
63 if (!prepared && !irqs_disabled)
64 yield();
65 cpu_relax();
66 }
67
68 /* Ack: we are exiting. */
69 smp_mb(); /* Must read state first. */
70 atomic_inc(&stopmachine_thread_ack);
71
72 if (irqs_disabled)
73 local_irq_enable();
74 if (prepared)
75 preempt_enable();
76 36
77 return 0; 37/* Like num_online_cpus(), but hotplug cpu uses us, so we need this. */
78} 38static unsigned int num_threads;
39static atomic_t thread_ack;
40static struct completion finished;
41static DEFINE_MUTEX(lock);
79 42
80/* Change the thread state */ 43static void set_state(enum stopmachine_state newstate)
81static void stopmachine_set_state(enum stopmachine_state state)
82{ 44{
83 atomic_set(&stopmachine_thread_ack, 0); 45 /* Reset ack counter. */
46 atomic_set(&thread_ack, num_threads);
84 smp_wmb(); 47 smp_wmb();
85 stopmachine_state = state; 48 state = newstate;
86 while (atomic_read(&stopmachine_thread_ack) != stopmachine_num_threads)
87 cpu_relax();
88} 49}
89 50
90static int stop_machine(void) 51/* Last one to ack a state moves to the next state. */
52static void ack_state(void)
91{ 53{
92 int i, ret = 0; 54 if (atomic_dec_and_test(&thread_ack)) {
93 55 /* If we're the last one to ack the EXIT, we're finished. */
94 atomic_set(&stopmachine_thread_ack, 0); 56 if (state == STOPMACHINE_EXIT)
95 stopmachine_num_threads = 0; 57 complete(&finished);
96 stopmachine_state = STOPMACHINE_WAIT; 58 else
97 59 set_state(state + 1);
98 for_each_online_cpu(i) {
99 if (i == raw_smp_processor_id())
100 continue;
101 ret = kernel_thread(stopmachine, (void *)(long)i,CLONE_KERNEL);
102 if (ret < 0)
103 break;
104 stopmachine_num_threads++;
105 }
106
107 /* Wait for them all to come to life. */
108 while (atomic_read(&stopmachine_thread_ack) != stopmachine_num_threads) {
109 yield();
110 cpu_relax();
111 } 60 }
61}
112 62
113 /* If some failed, kill them all. */ 63/* This is the actual thread which stops the CPU. It exits by itself rather
114 if (ret < 0) { 64 * than waiting for kthread_stop(), because it's easier for hotplug CPU. */
115 stopmachine_set_state(STOPMACHINE_EXIT); 65static int stop_cpu(struct stop_machine_data *smdata)
116 return ret; 66{
117 } 67 enum stopmachine_state curstate = STOPMACHINE_NONE;
118 68
119 /* Now they are all started, make them hold the CPUs, ready. */ 69 /* Simple state machine */
120 preempt_disable(); 70 do {
121 stopmachine_set_state(STOPMACHINE_PREPARE); 71 /* Chill out and ensure we re-read stopmachine_state. */
72 cpu_relax();
73 if (state != curstate) {
74 curstate = state;
75 switch (curstate) {
76 case STOPMACHINE_DISABLE_IRQ:
77 local_irq_disable();
78 hard_irq_disable();
79 break;
80 case STOPMACHINE_RUN:
81 /* |= allows error detection if functions on
82 * multiple CPUs. */
83 smdata->fnret |= smdata->fn(smdata->data);
84 break;
85 default:
86 break;
87 }
88 ack_state();
89 }
90 } while (curstate != STOPMACHINE_EXIT);
122 91
123 /* Make them disable irqs. */ 92 local_irq_enable();
124 local_irq_disable(); 93 do_exit(0);
125 hard_irq_disable(); 94}
126 stopmachine_set_state(STOPMACHINE_DISABLE_IRQ);
127 95
96/* Callback for CPUs which aren't supposed to do anything. */
97static int chill(void *unused)
98{
128 return 0; 99 return 0;
129} 100}
130 101
131static void restart_machine(void) 102int __stop_machine(int (*fn)(void *), void *data, const cpumask_t *cpus)
132{ 103{
133 stopmachine_set_state(STOPMACHINE_EXIT); 104 int i, err;
134 local_irq_enable(); 105 struct stop_machine_data active, idle;
135 preempt_enable_no_resched(); 106 struct task_struct **threads;
136} 107
108 active.fn = fn;
109 active.data = data;
110 active.fnret = 0;
111 idle.fn = chill;
112 idle.data = NULL;
113
114 /* This could be too big for stack on large machines. */
115 threads = kcalloc(NR_CPUS, sizeof(threads[0]), GFP_KERNEL);
116 if (!threads)
117 return -ENOMEM;
118
119 /* Set up initial state. */
120 mutex_lock(&lock);
121 init_completion(&finished);
122 num_threads = num_online_cpus();
123 set_state(STOPMACHINE_PREPARE);
137 124
138struct stop_machine_data { 125 for_each_online_cpu(i) {
139 int (*fn)(void *); 126 struct stop_machine_data *smdata = &idle;
140 void *data; 127 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
141 struct completion done;
142};
143 128
144static int do_stop(void *_smdata) 129 if (!cpus) {
145{ 130 if (i == first_cpu(cpu_online_map))
146 struct stop_machine_data *smdata = _smdata; 131 smdata = &active;
147 int ret; 132 } else {
133 if (cpu_isset(i, *cpus))
134 smdata = &active;
135 }
148 136
149 ret = stop_machine(); 137 threads[i] = kthread_create((void *)stop_cpu, smdata, "kstop%u",
150 if (ret == 0) { 138 i);
151 ret = smdata->fn(smdata->data); 139 if (IS_ERR(threads[i])) {
152 restart_machine(); 140 err = PTR_ERR(threads[i]);
153 } 141 threads[i] = NULL;
142 goto kill_threads;
143 }
154 144
155 /* We're done: you can kthread_stop us now */ 145 /* Place it onto correct cpu. */
156 complete(&smdata->done); 146 kthread_bind(threads[i], i);
157 147
158 /* Wait for kthread_stop */ 148 /* Make it highest prio. */
159 set_current_state(TASK_INTERRUPTIBLE); 149 if (sched_setscheduler_nocheck(threads[i], SCHED_FIFO, &param))
160 while (!kthread_should_stop()) { 150 BUG();
161 schedule();
162 set_current_state(TASK_INTERRUPTIBLE);
163 } 151 }
164 __set_current_state(TASK_RUNNING);
165 return ret;
166}
167 152
168struct task_struct *__stop_machine_run(int (*fn)(void *), void *data, 153 /* We've created all the threads. Wake them all: hold this CPU so one
169 unsigned int cpu) 154 * doesn't hit this CPU until we're ready. */
170{ 155 get_cpu();
171 static DEFINE_MUTEX(stopmachine_mutex); 156 for_each_online_cpu(i)
172 struct stop_machine_data smdata; 157 wake_up_process(threads[i]);
173 struct task_struct *p;
174 158
175 smdata.fn = fn; 159 /* This will release the thread on our CPU. */
176 smdata.data = data; 160 put_cpu();
177 init_completion(&smdata.done); 161 wait_for_completion(&finished);
162 mutex_unlock(&lock);
178 163
179 mutex_lock(&stopmachine_mutex); 164 kfree(threads);
180 165
181 /* If they don't care which CPU fn runs on, bind to any online one. */ 166 return active.fnret;
182 if (cpu == NR_CPUS)
183 cpu = raw_smp_processor_id();
184 167
185 p = kthread_create(do_stop, &smdata, "kstopmachine"); 168kill_threads:
186 if (!IS_ERR(p)) { 169 for_each_online_cpu(i)
187 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; 170 if (threads[i])
171 kthread_stop(threads[i]);
172 mutex_unlock(&lock);
188 173
189 /* One high-prio thread per cpu. We'll do this one. */ 174 kfree(threads);
190 sched_setscheduler(p, SCHED_FIFO, &param); 175 return err;
191 kthread_bind(p, cpu);
192 wake_up_process(p);
193 wait_for_completion(&smdata.done);
194 }
195 mutex_unlock(&stopmachine_mutex);
196 return p;
197} 176}
198 177
199int stop_machine_run(int (*fn)(void *), void *data, unsigned int cpu) 178int stop_machine(int (*fn)(void *), void *data, const cpumask_t *cpus)
200{ 179{
201 struct task_struct *p;
202 int ret; 180 int ret;
203 181
204 /* No CPUs can come up or down during this. */ 182 /* No CPUs can come up or down during this. */
205 get_online_cpus(); 183 get_online_cpus();
206 p = __stop_machine_run(fn, data, cpu); 184 ret = __stop_machine(fn, data, cpus);
207 if (!IS_ERR(p))
208 ret = kthread_stop(p);
209 else
210 ret = PTR_ERR(p);
211 put_online_cpus(); 185 put_online_cpus();
212 186
213 return ret; 187 return ret;
214} 188}
215EXPORT_SYMBOL_GPL(stop_machine_run); 189EXPORT_SYMBOL_GPL(stop_machine);
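
The rewritten stop_machine above keeps one shared state variable plus an atomic ack counter: every per-CPU thread acknowledges each state, and the last thread to ack advances everyone to the next state. A compressed userspace sketch of that protocol, assuming POSIX threads and C11 atomics; the thread count and states are simplified, and the real code additionally disables interrupts and signals a completion at exit:

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

#define NTHREADS 4

enum stopstate { NONE, PREPARE, RUN, EXIT_STATE };

static _Atomic enum stopstate state = NONE;
static atomic_int thread_ack;

static void set_state(enum stopstate newstate)
{
        atomic_store(&thread_ack, NTHREADS);    /* reset the ack counter */
        atomic_store(&state, newstate);
}

/* last thread to ack a state moves the machine to the next state */
static void ack_state(void)
{
        if (atomic_fetch_sub(&thread_ack, 1) == 1 &&
            atomic_load(&state) != EXIT_STATE)
                set_state(atomic_load(&state) + 1);
}

static void *stop_cpu(void *arg)
{
        enum stopstate curstate = NONE, newstate;
        long id = (long)arg;

        do {
                newstate = atomic_load(&state);
                if (newstate != curstate) {
                        curstate = newstate;
                        if (curstate == RUN)
                                printf("worker %ld runs the payload\n", id);
                        ack_state();
                }
        } while (curstate != EXIT_STATE);
        return NULL;
}

int main(void)
{
        pthread_t t[NTHREADS];
        long i;

        set_state(PREPARE);                     /* kick off the sequence */
        for (i = 0; i < NTHREADS; i++)
                pthread_create(&t[i], NULL, stop_cpu, (void *)i);
        for (i = 0; i < NTHREADS; i++)
                pthread_join(t[i], NULL);
        return 0;
}
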
diff --git a/kernel/sys.c b/kernel/sys.c
index 14e97282eb6c..234d9454294e 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -169,9 +169,9 @@ asmlinkage long sys_setpriority(int which, int who, int niceval)
169 pgrp = find_vpid(who); 169 pgrp = find_vpid(who);
170 else 170 else
171 pgrp = task_pgrp(current); 171 pgrp = task_pgrp(current);
172 do_each_pid_task(pgrp, PIDTYPE_PGID, p) { 172 do_each_pid_thread(pgrp, PIDTYPE_PGID, p) {
173 error = set_one_prio(p, niceval, error); 173 error = set_one_prio(p, niceval, error);
174 } while_each_pid_task(pgrp, PIDTYPE_PGID, p); 174 } while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
175 break; 175 break;
176 case PRIO_USER: 176 case PRIO_USER:
177 user = current->user; 177 user = current->user;
@@ -229,11 +229,11 @@ asmlinkage long sys_getpriority(int which, int who)
229 pgrp = find_vpid(who); 229 pgrp = find_vpid(who);
230 else 230 else
231 pgrp = task_pgrp(current); 231 pgrp = task_pgrp(current);
232 do_each_pid_task(pgrp, PIDTYPE_PGID, p) { 232 do_each_pid_thread(pgrp, PIDTYPE_PGID, p) {
233 niceval = 20 - task_nice(p); 233 niceval = 20 - task_nice(p);
234 if (niceval > retval) 234 if (niceval > retval)
235 retval = niceval; 235 retval = niceval;
236 } while_each_pid_task(pgrp, PIDTYPE_PGID, p); 236 } while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
237 break; 237 break;
238 case PRIO_USER: 238 case PRIO_USER:
239 user = current->user; 239 user = current->user;
@@ -274,7 +274,7 @@ void emergency_restart(void)
274} 274}
275EXPORT_SYMBOL_GPL(emergency_restart); 275EXPORT_SYMBOL_GPL(emergency_restart);
276 276
277static void kernel_restart_prepare(char *cmd) 277void kernel_restart_prepare(char *cmd)
278{ 278{
279 blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd); 279 blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd);
280 system_state = SYSTEM_RESTART; 280 system_state = SYSTEM_RESTART;
@@ -301,26 +301,6 @@ void kernel_restart(char *cmd)
301} 301}
302EXPORT_SYMBOL_GPL(kernel_restart); 302EXPORT_SYMBOL_GPL(kernel_restart);
303 303
304/**
305 * kernel_kexec - reboot the system
306 *
307 * Move into place and start executing a preloaded standalone
308 * executable. If nothing was preloaded return an error.
309 */
310static void kernel_kexec(void)
311{
312#ifdef CONFIG_KEXEC
313 struct kimage *image;
314 image = xchg(&kexec_image, NULL);
315 if (!image)
316 return;
317 kernel_restart_prepare(NULL);
318 printk(KERN_EMERG "Starting new kernel\n");
319 machine_shutdown();
320 machine_kexec(image);
321#endif
322}
323
324static void kernel_shutdown_prepare(enum system_states state) 304static void kernel_shutdown_prepare(enum system_states state)
325{ 305{
326 blocking_notifier_call_chain(&reboot_notifier_list, 306 blocking_notifier_call_chain(&reboot_notifier_list,
@@ -425,10 +405,15 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user
425 kernel_restart(buffer); 405 kernel_restart(buffer);
426 break; 406 break;
427 407
408#ifdef CONFIG_KEXEC
428 case LINUX_REBOOT_CMD_KEXEC: 409 case LINUX_REBOOT_CMD_KEXEC:
429 kernel_kexec(); 410 {
430 unlock_kernel(); 411 int ret;
431 return -EINVAL; 412 ret = kernel_kexec();
413 unlock_kernel();
414 return ret;
415 }
416#endif
432 417
433#ifdef CONFIG_HIBERNATION 418#ifdef CONFIG_HIBERNATION
434 case LINUX_REBOOT_CMD_SW_SUSPEND: 419 case LINUX_REBOOT_CMD_SW_SUSPEND:
@@ -1075,9 +1060,7 @@ asmlinkage long sys_setsid(void)
1075 group_leader->signal->leader = 1; 1060 group_leader->signal->leader = 1;
1076 __set_special_pids(sid); 1061 __set_special_pids(sid);
1077 1062
1078 spin_lock(&group_leader->sighand->siglock); 1063 proc_clear_tty(group_leader);
1079 group_leader->signal->tty = NULL;
1080 spin_unlock(&group_leader->sighand->siglock);
1081 1064
1082 err = session; 1065 err = session;
1083out: 1066out:
@@ -1343,8 +1326,6 @@ EXPORT_SYMBOL(in_egroup_p);
1343 1326
1344DECLARE_RWSEM(uts_sem); 1327DECLARE_RWSEM(uts_sem);
1345 1328
1346EXPORT_SYMBOL(uts_sem);
1347
1348asmlinkage long sys_newuname(struct new_utsname __user * name) 1329asmlinkage long sys_newuname(struct new_utsname __user * name)
1349{ 1330{
1350 int errno = 0; 1331 int errno = 0;
@@ -1795,7 +1776,7 @@ int orderly_poweroff(bool force)
1795 goto out; 1776 goto out;
1796 } 1777 }
1797 1778
1798 info = call_usermodehelper_setup(argv[0], argv, envp); 1779 info = call_usermodehelper_setup(argv[0], argv, envp, GFP_ATOMIC);
1799 if (info == NULL) { 1780 if (info == NULL) {
1800 argv_free(argv); 1781 argv_free(argv);
1801 goto out; 1782 goto out;
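
The sys_getpriority() loop above keeps the largest value of 20 - nice across the group, since the syscall historically reports priorities as positive numbers in the range 1..40. A small standalone illustration of that encoding over a made-up set of nice values:

#include <stdio.h>

int main(void)
{
        int nice_values[] = { 0, -5, 10 };      /* hypothetical tasks */
        int i, retval = -20;                    /* below any possible result */

        for (i = 0; i < 3; i++) {
                int niceval = 20 - nice_values[i];
                if (niceval > retval)
                        retval = niceval;
        }
        /* prints 25, i.e. the best (lowest) nice value -5 in the group */
        printf("getpriority-style result: %d (nice %d)\n",
               retval, 20 - retval);
        return 0;
}
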
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 5b9b467de070..503d8d4eb80a 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -31,6 +31,7 @@ cond_syscall(sys_socketpair);
31cond_syscall(sys_bind); 31cond_syscall(sys_bind);
32cond_syscall(sys_listen); 32cond_syscall(sys_listen);
33cond_syscall(sys_accept); 33cond_syscall(sys_accept);
34cond_syscall(sys_paccept);
34cond_syscall(sys_connect); 35cond_syscall(sys_connect);
35cond_syscall(sys_getsockname); 36cond_syscall(sys_getsockname);
36cond_syscall(sys_getpeername); 37cond_syscall(sys_getpeername);
@@ -56,9 +57,11 @@ cond_syscall(compat_sys_set_robust_list);
56cond_syscall(sys_get_robust_list); 57cond_syscall(sys_get_robust_list);
57cond_syscall(compat_sys_get_robust_list); 58cond_syscall(compat_sys_get_robust_list);
58cond_syscall(sys_epoll_create); 59cond_syscall(sys_epoll_create);
60cond_syscall(sys_epoll_create1);
59cond_syscall(sys_epoll_ctl); 61cond_syscall(sys_epoll_ctl);
60cond_syscall(sys_epoll_wait); 62cond_syscall(sys_epoll_wait);
61cond_syscall(sys_epoll_pwait); 63cond_syscall(sys_epoll_pwait);
64cond_syscall(compat_sys_epoll_pwait);
62cond_syscall(sys_semget); 65cond_syscall(sys_semget);
63cond_syscall(sys_semop); 66cond_syscall(sys_semop);
64cond_syscall(sys_semtimedop); 67cond_syscall(sys_semtimedop);
@@ -94,6 +97,7 @@ cond_syscall(sys_keyctl);
94cond_syscall(compat_sys_keyctl); 97cond_syscall(compat_sys_keyctl);
95cond_syscall(compat_sys_socketcall); 98cond_syscall(compat_sys_socketcall);
96cond_syscall(sys_inotify_init); 99cond_syscall(sys_inotify_init);
100cond_syscall(sys_inotify_init1);
97cond_syscall(sys_inotify_add_watch); 101cond_syscall(sys_inotify_add_watch);
98cond_syscall(sys_inotify_rm_watch); 102cond_syscall(sys_inotify_rm_watch);
99cond_syscall(sys_migrate_pages); 103cond_syscall(sys_migrate_pages);
@@ -121,6 +125,7 @@ cond_syscall(sys_vm86old);
121cond_syscall(sys_vm86); 125cond_syscall(sys_vm86);
122cond_syscall(compat_sys_ipc); 126cond_syscall(compat_sys_ipc);
123cond_syscall(compat_sys_sysctl); 127cond_syscall(compat_sys_sysctl);
128cond_syscall(sys_flock);
124 129
125/* arch-specific weak syscall entries */ 130/* arch-specific weak syscall entries */
126cond_syscall(sys_pciconfig_read); 131cond_syscall(sys_pciconfig_read);
@@ -154,10 +159,13 @@ cond_syscall(sys_ioprio_get);
154 159
155/* New file descriptors */ 160/* New file descriptors */
156cond_syscall(sys_signalfd); 161cond_syscall(sys_signalfd);
162cond_syscall(sys_signalfd4);
157cond_syscall(compat_sys_signalfd); 163cond_syscall(compat_sys_signalfd);
164cond_syscall(compat_sys_signalfd4);
158cond_syscall(sys_timerfd_create); 165cond_syscall(sys_timerfd_create);
159cond_syscall(sys_timerfd_settime); 166cond_syscall(sys_timerfd_settime);
160cond_syscall(sys_timerfd_gettime); 167cond_syscall(sys_timerfd_gettime);
161cond_syscall(compat_sys_timerfd_settime); 168cond_syscall(compat_sys_timerfd_settime);
162cond_syscall(compat_sys_timerfd_gettime); 169cond_syscall(compat_sys_timerfd_gettime);
163cond_syscall(sys_eventfd); 170cond_syscall(sys_eventfd);
171cond_syscall(sys_eventfd2);
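
Every cond_syscall() entry above amounts to a weak default that returns -ENOSYS when the real syscall is configured out. A userspace sketch of the same idea with a GCC weak symbol; sys_example is a made-up name, and the real macro aliases the symbol to sys_ni_syscall rather than defining a body:

#include <errno.h>
#include <stdio.h>

/* weak default: used only if no strong definition exists elsewhere */
__attribute__((weak)) long sys_example(void)
{
        return -ENOSYS;
}

int main(void)
{
        long ret = sys_example();

        if (ret == -ENOSYS)
                printf("sys_example not implemented in this build\n");
        else
                printf("sys_example returned %ld\n", ret);
        return 0;
}
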
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 29116652dca8..cfc5295f1e82 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -43,9 +43,11 @@
43#include <linux/limits.h> 43#include <linux/limits.h>
44#include <linux/dcache.h> 44#include <linux/dcache.h>
45#include <linux/syscalls.h> 45#include <linux/syscalls.h>
46#include <linux/vmstat.h>
46#include <linux/nfs_fs.h> 47#include <linux/nfs_fs.h>
47#include <linux/acpi.h> 48#include <linux/acpi.h>
48#include <linux/reboot.h> 49#include <linux/reboot.h>
50#include <linux/ftrace.h>
49 51
50#include <asm/uaccess.h> 52#include <asm/uaccess.h>
51#include <asm/processor.h> 53#include <asm/processor.h>
@@ -78,21 +80,23 @@ extern int pid_max_min, pid_max_max;
78extern int sysctl_drop_caches; 80extern int sysctl_drop_caches;
79extern int percpu_pagelist_fraction; 81extern int percpu_pagelist_fraction;
80extern int compat_log; 82extern int compat_log;
81extern int maps_protect;
82extern int sysctl_stat_interval;
83extern int latencytop_enabled; 83extern int latencytop_enabled;
84extern int sysctl_nr_open_min, sysctl_nr_open_max; 84extern int sysctl_nr_open_min, sysctl_nr_open_max;
85#ifdef CONFIG_RCU_TORTURE_TEST
86extern int rcutorture_runnable;
87#endif /* #ifdef CONFIG_RCU_TORTURE_TEST */
85 88
86/* Constants used for minimum and maximum */ 89/* Constants used for minimum and maximum */
87#if defined(CONFIG_DETECT_SOFTLOCKUP) || defined(CONFIG_HIGHMEM) 90#if defined(CONFIG_HIGHMEM) || defined(CONFIG_DETECT_SOFTLOCKUP)
88static int one = 1; 91static int one = 1;
89#endif 92#endif
90 93
91#ifdef CONFIG_DETECT_SOFTLOCKUP 94#ifdef CONFIG_DETECT_SOFTLOCKUP
92static int sixty = 60; 95static int sixty = 60;
96static int neg_one = -1;
93#endif 97#endif
94 98
95#ifdef CONFIG_MMU 99#if defined(CONFIG_MMU) && defined(CONFIG_FILE_LOCKING)
96static int two = 2; 100static int two = 2;
97#endif 101#endif
98 102
@@ -106,17 +110,15 @@ static int min_percpu_pagelist_fract = 8;
106 110
107static int ngroups_max = NGROUPS_MAX; 111static int ngroups_max = NGROUPS_MAX;
108 112
109#ifdef CONFIG_KMOD 113#ifdef CONFIG_MODULES
110extern char modprobe_path[]; 114extern char modprobe_path[];
111#endif 115#endif
112#ifdef CONFIG_CHR_DEV_SG 116#ifdef CONFIG_CHR_DEV_SG
113extern int sg_big_buff; 117extern int sg_big_buff;
114#endif 118#endif
115 119
116#ifdef __sparc__ 120#ifdef CONFIG_SPARC
117extern char reboot_command []; 121#include <asm/system.h>
118extern int stop_a_enabled;
119extern int scons_pwroff;
120#endif 122#endif
121 123
122#ifdef __hppa__ 124#ifdef __hppa__
@@ -132,8 +134,6 @@ extern int sysctl_userprocess_debug;
132extern int spin_retry; 134extern int spin_retry;
133#endif 135#endif
134 136
135extern int sysctl_hz_timer;
136
137#ifdef CONFIG_BSD_PROCESS_ACCT 137#ifdef CONFIG_BSD_PROCESS_ACCT
138extern int acct_parm[]; 138extern int acct_parm[];
139#endif 139#endif
@@ -156,13 +156,15 @@ static int proc_dointvec_taint(struct ctl_table *table, int write, struct file *
156static struct ctl_table root_table[]; 156static struct ctl_table root_table[];
157static struct ctl_table_root sysctl_table_root; 157static struct ctl_table_root sysctl_table_root;
158static struct ctl_table_header root_table_header = { 158static struct ctl_table_header root_table_header = {
159 .count = 1,
159 .ctl_table = root_table, 160 .ctl_table = root_table,
160 .ctl_entry = LIST_HEAD_INIT(sysctl_table_root.header_list), 161 .ctl_entry = LIST_HEAD_INIT(sysctl_table_root.default_set.list),
161 .root = &sysctl_table_root, 162 .root = &sysctl_table_root,
163 .set = &sysctl_table_root.default_set,
162}; 164};
163static struct ctl_table_root sysctl_table_root = { 165static struct ctl_table_root sysctl_table_root = {
164 .root_list = LIST_HEAD_INIT(sysctl_table_root.root_list), 166 .root_list = LIST_HEAD_INIT(sysctl_table_root.root_list),
165 .header_list = LIST_HEAD_INIT(root_table_header.ctl_entry), 167 .default_set.list = LIST_HEAD_INIT(root_table_header.ctl_entry),
166}; 168};
167 169
168static struct ctl_table kern_table[]; 170static struct ctl_table kern_table[];
@@ -266,6 +268,14 @@ static struct ctl_table kern_table[] = {
266 }, 268 },
267 { 269 {
268 .ctl_name = CTL_UNNUMBERED, 270 .ctl_name = CTL_UNNUMBERED,
271 .procname = "sched_shares_ratelimit",
272 .data = &sysctl_sched_shares_ratelimit,
273 .maxlen = sizeof(unsigned int),
274 .mode = 0644,
275 .proc_handler = &proc_dointvec,
276 },
277 {
278 .ctl_name = CTL_UNNUMBERED,
269 .procname = "sched_child_runs_first", 279 .procname = "sched_child_runs_first",
270 .data = &sysctl_sched_child_runs_first, 280 .data = &sysctl_sched_child_runs_first,
271 .maxlen = sizeof(unsigned int), 281 .maxlen = sizeof(unsigned int),
@@ -402,7 +412,7 @@ static struct ctl_table kern_table[] = {
402 .mode = 0644, 412 .mode = 0644,
403 .proc_handler = &proc_dointvec, 413 .proc_handler = &proc_dointvec,
404 }, 414 },
405#ifdef __sparc__ 415#ifdef CONFIG_SPARC
406 { 416 {
407 .ctl_name = KERN_SPARC_REBOOT, 417 .ctl_name = KERN_SPARC_REBOOT,
408 .procname = "reboot-cmd", 418 .procname = "reboot-cmd",
@@ -455,7 +465,17 @@ static struct ctl_table kern_table[] = {
455 .mode = 0644, 465 .mode = 0644,
456 .proc_handler = &proc_dointvec, 466 .proc_handler = &proc_dointvec,
457 }, 467 },
458#ifdef CONFIG_KMOD 468#ifdef CONFIG_FTRACE
469 {
470 .ctl_name = CTL_UNNUMBERED,
471 .procname = "ftrace_enabled",
472 .data = &ftrace_enabled,
473 .maxlen = sizeof(int),
474 .mode = 0644,
475 .proc_handler = &ftrace_enable_sysctl,
476 },
477#endif
478#ifdef CONFIG_MODULES
459 { 479 {
460 .ctl_name = KERN_MODPROBE, 480 .ctl_name = KERN_MODPROBE,
461 .procname = "modprobe", 481 .procname = "modprobe",
@@ -563,16 +583,6 @@ static struct ctl_table kern_table[] = {
563 .proc_handler = &proc_dointvec, 583 .proc_handler = &proc_dointvec,
564 }, 584 },
565#endif 585#endif
566#ifdef CONFIG_NO_IDLE_HZ
567 {
568 .ctl_name = KERN_HZ_TIMER,
569 .procname = "hz_timer",
570 .data = &sysctl_hz_timer,
571 .maxlen = sizeof(int),
572 .mode = 0644,
573 .proc_handler = &proc_dointvec,
574 },
575#endif
576 { 586 {
577 .ctl_name = KERN_S390_USER_DEBUG_LOGGING, 587 .ctl_name = KERN_S390_USER_DEBUG_LOGGING,
578 .procname = "userprocess_debug", 588 .procname = "userprocess_debug",
@@ -613,7 +623,7 @@ static struct ctl_table kern_table[] = {
613 { 623 {
614 .ctl_name = KERN_PRINTK_RATELIMIT, 624 .ctl_name = KERN_PRINTK_RATELIMIT,
615 .procname = "printk_ratelimit", 625 .procname = "printk_ratelimit",
616 .data = &printk_ratelimit_jiffies, 626 .data = &printk_ratelimit_state.interval,
617 .maxlen = sizeof(int), 627 .maxlen = sizeof(int),
618 .mode = 0644, 628 .mode = 0644,
619 .proc_handler = &proc_dointvec_jiffies, 629 .proc_handler = &proc_dointvec_jiffies,
@@ -622,7 +632,7 @@ static struct ctl_table kern_table[] = {
622 { 632 {
623 .ctl_name = KERN_PRINTK_RATELIMIT_BURST, 633 .ctl_name = KERN_PRINTK_RATELIMIT_BURST,
624 .procname = "printk_ratelimit_burst", 634 .procname = "printk_ratelimit_burst",
625 .data = &printk_ratelimit_burst, 635 .data = &printk_ratelimit_state.burst,
626 .maxlen = sizeof(int), 636 .maxlen = sizeof(int),
627 .mode = 0644, 637 .mode = 0644,
628 .proc_handler = &proc_dointvec, 638 .proc_handler = &proc_dointvec,
@@ -729,13 +739,24 @@ static struct ctl_table kern_table[] = {
729#ifdef CONFIG_DETECT_SOFTLOCKUP 739#ifdef CONFIG_DETECT_SOFTLOCKUP
730 { 740 {
731 .ctl_name = CTL_UNNUMBERED, 741 .ctl_name = CTL_UNNUMBERED,
742 .procname = "softlockup_panic",
743 .data = &softlockup_panic,
744 .maxlen = sizeof(int),
745 .mode = 0644,
746 .proc_handler = &proc_dointvec_minmax,
747 .strategy = &sysctl_intvec,
748 .extra1 = &zero,
749 .extra2 = &one,
750 },
751 {
752 .ctl_name = CTL_UNNUMBERED,
732 .procname = "softlockup_thresh", 753 .procname = "softlockup_thresh",
733 .data = &softlockup_thresh, 754 .data = &softlockup_thresh,
734 .maxlen = sizeof(unsigned long), 755 .maxlen = sizeof(int),
735 .mode = 0644, 756 .mode = 0644,
736 .proc_handler = &proc_doulongvec_minmax, 757 .proc_handler = &proc_dointvec_minmax,
737 .strategy = &sysctl_intvec, 758 .strategy = &sysctl_intvec,
738 .extra1 = &one, 759 .extra1 = &neg_one,
739 .extra2 = &sixty, 760 .extra2 = &sixty,
740 }, 761 },
741 { 762 {
@@ -786,16 +807,6 @@ static struct ctl_table kern_table[] = {
786 .proc_handler = &proc_dointvec, 807 .proc_handler = &proc_dointvec,
787 }, 808 },
788#endif 809#endif
789#ifdef CONFIG_PROC_FS
790 {
791 .ctl_name = CTL_UNNUMBERED,
792 .procname = "maps_protect",
793 .data = &maps_protect,
794 .maxlen = sizeof(int),
795 .mode = 0644,
796 .proc_handler = &proc_dointvec,
797 },
798#endif
799 { 810 {
800 .ctl_name = CTL_UNNUMBERED, 811 .ctl_name = CTL_UNNUMBERED,
801 .procname = "poweroff_cmd", 812 .procname = "poweroff_cmd",
@@ -813,6 +824,16 @@ static struct ctl_table kern_table[] = {
813 .child = key_sysctls, 824 .child = key_sysctls,
814 }, 825 },
815#endif 826#endif
827#ifdef CONFIG_RCU_TORTURE_TEST
828 {
829 .ctl_name = CTL_UNNUMBERED,
830 .procname = "rcutorture_runnable",
831 .data = &rcutorture_runnable,
832 .maxlen = sizeof(int),
833 .mode = 0644,
834 .proc_handler = &proc_dointvec,
835 },
836#endif
816/* 837/*
817 * NOTE: do not add new entries to this table unless you have read 838 * NOTE: do not add new entries to this table unless you have read
818 * Documentation/sysctl/ctl_unnumbered.txt 839 * Documentation/sysctl/ctl_unnumbered.txt
@@ -927,7 +948,7 @@ static struct ctl_table vm_table[] = {
927#ifdef CONFIG_HUGETLB_PAGE 948#ifdef CONFIG_HUGETLB_PAGE
928 { 949 {
929 .procname = "nr_hugepages", 950 .procname = "nr_hugepages",
930 .data = &max_huge_pages, 951 .data = NULL,
931 .maxlen = sizeof(unsigned long), 952 .maxlen = sizeof(unsigned long),
932 .mode = 0644, 953 .mode = 0644,
933 .proc_handler = &hugetlb_sysctl_handler, 954 .proc_handler = &hugetlb_sysctl_handler,
@@ -953,10 +974,12 @@ static struct ctl_table vm_table[] = {
953 { 974 {
954 .ctl_name = CTL_UNNUMBERED, 975 .ctl_name = CTL_UNNUMBERED,
955 .procname = "nr_overcommit_hugepages", 976 .procname = "nr_overcommit_hugepages",
956 .data = &sysctl_overcommit_huge_pages, 977 .data = NULL,
957 .maxlen = sizeof(sysctl_overcommit_huge_pages), 978 .maxlen = sizeof(unsigned long),
958 .mode = 0644, 979 .mode = 0644,
959 .proc_handler = &hugetlb_overcommit_handler, 980 .proc_handler = &hugetlb_overcommit_handler,
981 .extra1 = (void *)&hugetlb_zero,
982 .extra2 = (void *)&hugetlb_infinity,
960 }, 983 },
961#endif 984#endif
962 { 985 {
@@ -1225,6 +1248,7 @@ static struct ctl_table fs_table[] = {
1225 .extra1 = &minolduid, 1248 .extra1 = &minolduid,
1226 .extra2 = &maxolduid, 1249 .extra2 = &maxolduid,
1227 }, 1250 },
1251#ifdef CONFIG_FILE_LOCKING
1228 { 1252 {
1229 .ctl_name = FS_LEASES, 1253 .ctl_name = FS_LEASES,
1230 .procname = "leases-enable", 1254 .procname = "leases-enable",
@@ -1233,6 +1257,7 @@ static struct ctl_table fs_table[] = {
1233 .mode = 0644, 1257 .mode = 0644,
1234 .proc_handler = &proc_dointvec, 1258 .proc_handler = &proc_dointvec,
1235 }, 1259 },
1260#endif
1236#ifdef CONFIG_DNOTIFY 1261#ifdef CONFIG_DNOTIFY
1237 { 1262 {
1238 .ctl_name = FS_DIR_NOTIFY, 1263 .ctl_name = FS_DIR_NOTIFY,
@@ -1244,6 +1269,7 @@ static struct ctl_table fs_table[] = {
1244 }, 1269 },
1245#endif 1270#endif
1246#ifdef CONFIG_MMU 1271#ifdef CONFIG_MMU
1272#ifdef CONFIG_FILE_LOCKING
1247 { 1273 {
1248 .ctl_name = FS_LEASE_TIME, 1274 .ctl_name = FS_LEASE_TIME,
1249 .procname = "lease-break-time", 1275 .procname = "lease-break-time",
@@ -1255,6 +1281,7 @@ static struct ctl_table fs_table[] = {
1255 .extra1 = &zero, 1281 .extra1 = &zero,
1256 .extra2 = &two, 1282 .extra2 = &two,
1257 }, 1283 },
1284#endif
1258 { 1285 {
1259 .procname = "aio-nr", 1286 .procname = "aio-nr",
1260 .data = &aio_nr, 1287 .data = &aio_nr,
@@ -1352,6 +1379,9 @@ static void start_unregistering(struct ctl_table_header *p)
1352 spin_unlock(&sysctl_lock); 1379 spin_unlock(&sysctl_lock);
1353 wait_for_completion(&wait); 1380 wait_for_completion(&wait);
1354 spin_lock(&sysctl_lock); 1381 spin_lock(&sysctl_lock);
1382 } else {
1383 /* anything non-NULL; we'll never dereference it */
1384 p->unregistering = ERR_PTR(-EINVAL);
1355 } 1385 }
1356 /* 1386 /*
1357 * do not remove from the list until nobody holds it; walking the 1387 * do not remove from the list until nobody holds it; walking the
@@ -1360,6 +1390,32 @@ static void start_unregistering(struct ctl_table_header *p)
1360 list_del_init(&p->ctl_entry); 1390 list_del_init(&p->ctl_entry);
1361} 1391}
1362 1392
1393void sysctl_head_get(struct ctl_table_header *head)
1394{
1395 spin_lock(&sysctl_lock);
1396 head->count++;
1397 spin_unlock(&sysctl_lock);
1398}
1399
1400void sysctl_head_put(struct ctl_table_header *head)
1401{
1402 spin_lock(&sysctl_lock);
1403 if (!--head->count)
1404 kfree(head);
1405 spin_unlock(&sysctl_lock);
1406}
1407
1408struct ctl_table_header *sysctl_head_grab(struct ctl_table_header *head)
1409{
1410 if (!head)
1411 BUG();
1412 spin_lock(&sysctl_lock);
1413 if (!use_table(head))
1414 head = ERR_PTR(-ENOENT);
1415 spin_unlock(&sysctl_lock);
1416 return head;
1417}
1418
1363void sysctl_head_finish(struct ctl_table_header *head) 1419void sysctl_head_finish(struct ctl_table_header *head)
1364{ 1420{
1365 if (!head) 1421 if (!head)
@@ -1369,14 +1425,20 @@ void sysctl_head_finish(struct ctl_table_header *head)
1369 spin_unlock(&sysctl_lock); 1425 spin_unlock(&sysctl_lock);
1370} 1426}
1371 1427
1428static struct ctl_table_set *
1429lookup_header_set(struct ctl_table_root *root, struct nsproxy *namespaces)
1430{
1431 struct ctl_table_set *set = &root->default_set;
1432 if (root->lookup)
1433 set = root->lookup(root, namespaces);
1434 return set;
1435}
1436
1372static struct list_head * 1437static struct list_head *
1373lookup_header_list(struct ctl_table_root *root, struct nsproxy *namespaces) 1438lookup_header_list(struct ctl_table_root *root, struct nsproxy *namespaces)
1374{ 1439{
1375 struct list_head *header_list; 1440 struct ctl_table_set *set = lookup_header_set(root, namespaces);
1376 header_list = &root->header_list; 1441 return &set->list;
1377 if (root->lookup)
1378 header_list = root->lookup(root, namespaces);
1379 return header_list;
1380} 1442}
1381 1443
1382struct ctl_table_header *__sysctl_head_next(struct nsproxy *namespaces, 1444struct ctl_table_header *__sysctl_head_next(struct nsproxy *namespaces,
@@ -1446,9 +1508,9 @@ static int do_sysctl_strategy(struct ctl_table_root *root,
1446 int op = 0, rc; 1508 int op = 0, rc;
1447 1509
1448 if (oldval) 1510 if (oldval)
1449 op |= 004; 1511 op |= MAY_READ;
1450 if (newval) 1512 if (newval)
1451 op |= 002; 1513 op |= MAY_WRITE;
1452 if (sysctl_perm(root, table, op)) 1514 if (sysctl_perm(root, table, op))
1453 return -EPERM; 1515 return -EPERM;
1454 1516
@@ -1490,7 +1552,7 @@ repeat:
1490 if (n == table->ctl_name) { 1552 if (n == table->ctl_name) {
1491 int error; 1553 int error;
1492 if (table->child) { 1554 if (table->child) {
1493 if (sysctl_perm(root, table, 001)) 1555 if (sysctl_perm(root, table, MAY_EXEC))
1494 return -EPERM; 1556 return -EPERM;
1495 name++; 1557 name++;
1496 nlen--; 1558 nlen--;
@@ -1565,7 +1627,7 @@ static int test_perm(int mode, int op)
1565 mode >>= 6; 1627 mode >>= 6;
1566 else if (in_egroup_p(0)) 1628 else if (in_egroup_p(0))
1567 mode >>= 3; 1629 mode >>= 3;
1568 if ((mode & op & 0007) == op) 1630 if ((op & ~mode & (MAY_READ|MAY_WRITE|MAY_EXEC)) == 0)
1569 return 0; 1631 return 0;
1570 return -EACCES; 1632 return -EACCES;
1571} 1633}
@@ -1575,7 +1637,7 @@ int sysctl_perm(struct ctl_table_root *root, struct ctl_table *table, int op)
1575 int error; 1637 int error;
1576 int mode; 1638 int mode;
1577 1639
1578 error = security_sysctl(table, op); 1640 error = security_sysctl(table, op & (MAY_READ | MAY_WRITE | MAY_EXEC));
1579 if (error) 1641 if (error)
1580 return error; 1642 return error;
1581 1643
@@ -1610,6 +1672,54 @@ static __init int sysctl_init(void)
1610 1672
1611core_initcall(sysctl_init); 1673core_initcall(sysctl_init);
1612 1674
1675static struct ctl_table *is_branch_in(struct ctl_table *branch,
1676 struct ctl_table *table)
1677{
1678 struct ctl_table *p;
1679 const char *s = branch->procname;
1680
1681 /* branch should have named subdirectory as its first element */
1682 if (!s || !branch->child)
1683 return NULL;
1684
1685 /* ... and nothing else */
1686 if (branch[1].procname || branch[1].ctl_name)
1687 return NULL;
1688
1689 /* table should contain subdirectory with the same name */
1690 for (p = table; p->procname || p->ctl_name; p++) {
1691 if (!p->child)
1692 continue;
1693 if (p->procname && strcmp(p->procname, s) == 0)
1694 return p;
1695 }
1696 return NULL;
1697}
1698
1699/* see if attaching q to p would be an improvement */
1700static void try_attach(struct ctl_table_header *p, struct ctl_table_header *q)
1701{
1702 struct ctl_table *to = p->ctl_table, *by = q->ctl_table;
1703 struct ctl_table *next;
1704 int is_better = 0;
1705 int not_in_parent = !p->attached_by;
1706
1707 while ((next = is_branch_in(by, to)) != NULL) {
1708 if (by == q->attached_by)
1709 is_better = 1;
1710 if (to == p->attached_by)
1711 not_in_parent = 1;
1712 by = by->child;
1713 to = next->child;
1714 }
1715
1716 if (is_better && not_in_parent) {
1717 q->attached_by = by;
1718 q->attached_to = to;
1719 q->parent = p;
1720 }
1721}
1722
1613/** 1723/**
1614 * __register_sysctl_paths - register a sysctl hierarchy 1724 * __register_sysctl_paths - register a sysctl hierarchy
1615 * @root: List of sysctl headers to register on 1725 * @root: List of sysctl headers to register on
@@ -1686,10 +1796,10 @@ struct ctl_table_header *__register_sysctl_paths(
1686 struct nsproxy *namespaces, 1796 struct nsproxy *namespaces,
1687 const struct ctl_path *path, struct ctl_table *table) 1797 const struct ctl_path *path, struct ctl_table *table)
1688{ 1798{
1689 struct list_head *header_list;
1690 struct ctl_table_header *header; 1799 struct ctl_table_header *header;
1691 struct ctl_table *new, **prevp; 1800 struct ctl_table *new, **prevp;
1692 unsigned int n, npath; 1801 unsigned int n, npath;
1802 struct ctl_table_set *set;
1693 1803
1694 /* Count the path components */ 1804 /* Count the path components */
1695 for (npath = 0; path[npath].ctl_name || path[npath].procname; ++npath) 1805 for (npath = 0; path[npath].ctl_name || path[npath].procname; ++npath)
@@ -1731,6 +1841,7 @@ struct ctl_table_header *__register_sysctl_paths(
1731 header->unregistering = NULL; 1841 header->unregistering = NULL;
1732 header->root = root; 1842 header->root = root;
1733 sysctl_set_parent(NULL, header->ctl_table); 1843 sysctl_set_parent(NULL, header->ctl_table);
1844 header->count = 1;
1734#ifdef CONFIG_SYSCTL_SYSCALL_CHECK 1845#ifdef CONFIG_SYSCTL_SYSCALL_CHECK
1735 if (sysctl_check_table(namespaces, header->ctl_table)) { 1846 if (sysctl_check_table(namespaces, header->ctl_table)) {
1736 kfree(header); 1847 kfree(header);
@@ -1738,8 +1849,20 @@ struct ctl_table_header *__register_sysctl_paths(
1738 } 1849 }
1739#endif 1850#endif
1740 spin_lock(&sysctl_lock); 1851 spin_lock(&sysctl_lock);
1741 header_list = lookup_header_list(root, namespaces); 1852 header->set = lookup_header_set(root, namespaces);
1742 list_add_tail(&header->ctl_entry, header_list); 1853 header->attached_by = header->ctl_table;
1854 header->attached_to = root_table;
1855 header->parent = &root_table_header;
1856 for (set = header->set; set; set = set->parent) {
1857 struct ctl_table_header *p;
1858 list_for_each_entry(p, &set->list, ctl_entry) {
1859 if (p->unregistering)
1860 continue;
1861 try_attach(p, header);
1862 }
1863 }
1864 header->parent->count++;
1865 list_add_tail(&header->ctl_entry, &header->set->list);
1743 spin_unlock(&sysctl_lock); 1866 spin_unlock(&sysctl_lock);
1744 1867
1745 return header; 1868 return header;
@@ -1794,8 +1917,37 @@ void unregister_sysctl_table(struct ctl_table_header * header)
1794 1917
1795 spin_lock(&sysctl_lock); 1918 spin_lock(&sysctl_lock);
1796 start_unregistering(header); 1919 start_unregistering(header);
1920 if (!--header->parent->count) {
1921 WARN_ON(1);
1922 kfree(header->parent);
1923 }
1924 if (!--header->count)
1925 kfree(header);
1797 spin_unlock(&sysctl_lock); 1926 spin_unlock(&sysctl_lock);
1798 kfree(header); 1927}
1928
1929int sysctl_is_seen(struct ctl_table_header *p)
1930{
1931 struct ctl_table_set *set = p->set;
1932 int res;
1933 spin_lock(&sysctl_lock);
1934 if (p->unregistering)
1935 res = 0;
1936 else if (!set->is_seen)
1937 res = 1;
1938 else
1939 res = set->is_seen(set);
1940 spin_unlock(&sysctl_lock);
1941 return res;
1942}
1943
1944void setup_sysctl_set(struct ctl_table_set *p,
1945 struct ctl_table_set *parent,
1946 int (*is_seen)(struct ctl_table_set *))
1947{
1948 INIT_LIST_HEAD(&p->list);
1949 p->parent = parent ? parent : &sysctl_table_root.default_set;
1950 p->is_seen = is_seen;
1799} 1951}
1800 1952
1801#else /* !CONFIG_SYSCTL */ 1953#else /* !CONFIG_SYSCTL */
@@ -1814,6 +1966,16 @@ void unregister_sysctl_table(struct ctl_table_header * table)
1814{ 1966{
1815} 1967}
1816 1968
1969void setup_sysctl_set(struct ctl_table_set *p,
1970 struct ctl_table_set *parent,
1971 int (*is_seen)(struct ctl_table_set *))
1972{
1973}
1974
1975void sysctl_head_put(struct ctl_table_header *head)
1976{
1977}
1978
1817#endif /* CONFIG_SYSCTL */ 1979#endif /* CONFIG_SYSCTL */
1818 1980
1819/* 1981/*
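
test_perm() and sysctl_perm() now express the requested operation with MAY_READ/MAY_WRITE/MAY_EXEC (4, 2 and 1 in the VFS) instead of raw octal digits, and the check reads "no requested bit may be missing from mode". A short standalone program confirming that, for 3-bit operations, the new expression matches the old one:

#include <stdio.h>

#define MAY_EXEC  1     /* matches the VFS permission bits */
#define MAY_WRITE 2
#define MAY_READ  4

int main(void)
{
        int mode, op, mismatch = 0;

        /* exhaustively compare the old and new test_perm() expressions */
        for (mode = 0; mode < 8; mode++) {
                for (op = 0; op < 8; op++) {
                        int old_ok = (mode & op & 0007) == op;
                        int new_ok = (op & ~mode &
                                      (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0;
                        if (old_ok != new_ok)
                                mismatch++;
                }
        }
        printf("mismatches: %d\n", mismatch);   /* prints 0 */
        return 0;
}
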
diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c
index c09350d564f2..c35da23ab8fb 100644
--- a/kernel/sysctl_check.c
+++ b/kernel/sysctl_check.c
@@ -1532,6 +1532,8 @@ int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table)
1532 sysctl_check_leaf(namespaces, table, &fail); 1532 sysctl_check_leaf(namespaces, table, &fail);
1533 } 1533 }
1534 sysctl_check_bin_path(table, &fail); 1534 sysctl_check_bin_path(table, &fail);
1535 if (table->mode > 0777)
1536 set_fail(&fail, table, "bogus .mode");
1535 if (fail) { 1537 if (fail) {
1536 set_fail(&fail, table, NULL); 1538 set_fail(&fail, table, NULL);
1537 error = -EINVAL; 1539 error = -EINVAL;
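
The added sysctl_check_table() test simply rejects any ctl_table whose .mode carries bits above the rwxrwxrwx range. The same predicate in isolation, with a couple of sample modes:

#include <stdio.h>

static int mode_is_bogus(unsigned int mode)
{
        return mode > 0777;     /* anything beyond rwxrwxrwx is bogus */
}

int main(void)
{
        printf("0644 bogus? %d\n", mode_is_bogus(0644));        /* 0 */
        printf("04755 bogus? %d\n", mode_is_bogus(04755));      /* 1: setuid bit */
        return 0;
}
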
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 4a23517169a6..bd6be76303cf 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -35,7 +35,7 @@
35 */ 35 */
36#define TASKSTATS_CPUMASK_MAXLEN (100+6*NR_CPUS) 36#define TASKSTATS_CPUMASK_MAXLEN (100+6*NR_CPUS)
37 37
38static DEFINE_PER_CPU(__u32, taskstats_seqnum) = { 0 }; 38static DEFINE_PER_CPU(__u32, taskstats_seqnum);
39static int family_registered; 39static int family_registered;
40struct kmem_cache *taskstats_cache; 40struct kmem_cache *taskstats_cache;
41 41
@@ -301,7 +301,7 @@ static int add_del_listener(pid_t pid, cpumask_t *maskp, int isadd)
301 return -EINVAL; 301 return -EINVAL;
302 302
303 if (isadd == REGISTER) { 303 if (isadd == REGISTER) {
304 for_each_cpu_mask(cpu, mask) { 304 for_each_cpu_mask_nr(cpu, mask) {
305 s = kmalloc_node(sizeof(struct listener), GFP_KERNEL, 305 s = kmalloc_node(sizeof(struct listener), GFP_KERNEL,
306 cpu_to_node(cpu)); 306 cpu_to_node(cpu));
307 if (!s) 307 if (!s)
@@ -320,7 +320,7 @@ static int add_del_listener(pid_t pid, cpumask_t *maskp, int isadd)
320 320
321 /* Deregister or cleanup */ 321 /* Deregister or cleanup */
322cleanup: 322cleanup:
323 for_each_cpu_mask(cpu, mask) { 323 for_each_cpu_mask_nr(cpu, mask) {
324 listeners = &per_cpu(listener_array, cpu); 324 listeners = &per_cpu(listener_array, cpu);
325 down_write(&listeners->sem); 325 down_write(&listeners->sem);
326 list_for_each_entry_safe(s, tmp, &listeners->list, list) { 326 list_for_each_entry_safe(s, tmp, &listeners->list, list) {
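
for_each_cpu_mask_nr() differs from the older iterator in that it stops at the runtime CPU count (nr_cpu_ids) rather than scanning every possible NR_CPUS bit position. A simplified userspace sketch of that bounded walk over a CPU bitmask; the mask value and limit are made up:

#include <stdio.h>

int main(void)
{
        unsigned long long mask = 0x15;         /* CPUs 0, 2 and 4 online */
        int nr_cpu_ids = 8;                     /* runtime limit */
        int cpu;

        /* the old iterators walked all NR_CPUS bit positions; the _nr
         * variant only walks bits below the runtime limit */
        for (cpu = 0; cpu < nr_cpu_ids; cpu++) {
                if (mask & (1ULL << cpu))
                        printf("visiting cpu %d\n", cpu);
        }
        return 0;
}
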
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 3d1e3e1a1971..f8d968063cea 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -72,6 +72,16 @@ void clockevents_set_mode(struct clock_event_device *dev,
72} 72}
73 73
74/** 74/**
75 * clockevents_shutdown - shutdown the device and clear next_event
76 * @dev: device to shutdown
77 */
78void clockevents_shutdown(struct clock_event_device *dev)
79{
80 clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN);
81 dev->next_event.tv64 = KTIME_MAX;
82}
83
84/**
75 * clockevents_program_event - Reprogram the clock event device. 85 * clockevents_program_event - Reprogram the clock event device.
76 * @expires: absolute expiry time (monotonic clock) 86 * @expires: absolute expiry time (monotonic clock)
77 * 87 *
@@ -177,7 +187,7 @@ void clockevents_register_device(struct clock_event_device *dev)
177/* 187/*
178 * Noop handler when we shut down an event device 188 * Noop handler when we shut down an event device
179 */ 189 */
180static void clockevents_handle_noop(struct clock_event_device *dev) 190void clockevents_handle_noop(struct clock_event_device *dev)
181{ 191{
182} 192}
183 193
@@ -199,7 +209,6 @@ void clockevents_exchange_device(struct clock_event_device *old,
199 * released list and do a notify add later. 209 * released list and do a notify add later.
200 */ 210 */
201 if (old) { 211 if (old) {
202 old->event_handler = clockevents_handle_noop;
203 clockevents_set_mode(old, CLOCK_EVT_MODE_UNUSED); 212 clockevents_set_mode(old, CLOCK_EVT_MODE_UNUSED);
204 list_del(&old->list); 213 list_del(&old->list);
205 list_add(&old->list, &clockevents_released); 214 list_add(&old->list, &clockevents_released);
@@ -207,7 +216,7 @@ void clockevents_exchange_device(struct clock_event_device *old,
207 216
208 if (new) { 217 if (new) {
209 BUG_ON(new->mode != CLOCK_EVT_MODE_UNUSED); 218 BUG_ON(new->mode != CLOCK_EVT_MODE_UNUSED);
210 clockevents_set_mode(new, CLOCK_EVT_MODE_SHUTDOWN); 219 clockevents_shutdown(new);
211 } 220 }
212 local_irq_restore(flags); 221 local_irq_restore(flags);
213} 222}
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index dadde5361f32..093d4acf993b 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -145,9 +145,9 @@ static void clocksource_watchdog(unsigned long data)
145 * Cycle through CPUs to check if the CPUs stay 145 * Cycle through CPUs to check if the CPUs stay
146 * synchronized to each other. 146 * synchronized to each other.
147 */ 147 */
148 int next_cpu = next_cpu(raw_smp_processor_id(), cpu_online_map); 148 int next_cpu = next_cpu_nr(raw_smp_processor_id(), cpu_online_map);
149 149
150 if (next_cpu >= NR_CPUS) 150 if (next_cpu >= nr_cpu_ids)
151 next_cpu = first_cpu(cpu_online_map); 151 next_cpu = first_cpu(cpu_online_map);
152 watchdog_timer.expires += WATCHDOG_INTERVAL; 152 watchdog_timer.expires += WATCHDOG_INTERVAL;
153 add_timer_on(&watchdog_timer, next_cpu); 153 add_timer_on(&watchdog_timer, next_cpu);
@@ -376,7 +376,8 @@ void clocksource_unregister(struct clocksource *cs)
376 * Provides sysfs interface for listing current clocksource. 376 * Provides sysfs interface for listing current clocksource.
377 */ 377 */
378static ssize_t 378static ssize_t
379sysfs_show_current_clocksources(struct sys_device *dev, char *buf) 379sysfs_show_current_clocksources(struct sys_device *dev,
380 struct sysdev_attribute *attr, char *buf)
380{ 381{
381 ssize_t count = 0; 382 ssize_t count = 0;
382 383
@@ -397,6 +398,7 @@ sysfs_show_current_clocksources(struct sys_device *dev, char *buf)
397 * clocksource selection. 398 * clocksource selection.
398 */ 399 */
399static ssize_t sysfs_override_clocksource(struct sys_device *dev, 400static ssize_t sysfs_override_clocksource(struct sys_device *dev,
401 struct sysdev_attribute *attr,
400 const char *buf, size_t count) 402 const char *buf, size_t count)
401{ 403{
402 struct clocksource *ovr = NULL; 404 struct clocksource *ovr = NULL;
@@ -449,7 +451,9 @@ static ssize_t sysfs_override_clocksource(struct sys_device *dev,
449 * Provides sysfs interface for listing registered clocksources 451 * Provides sysfs interface for listing registered clocksources
450 */ 452 */
451static ssize_t 453static ssize_t
452sysfs_show_available_clocksources(struct sys_device *dev, char *buf) 454sysfs_show_available_clocksources(struct sys_device *dev,
455 struct sysdev_attribute *attr,
456 char *buf)
453{ 457{
454 struct clocksource *src; 458 struct clocksource *src;
455 ssize_t count = 0; 459 ssize_t count = 0;
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 5125ddd8196b..1ad46f3df6e7 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -245,7 +245,7 @@ static void sync_cmos_clock(unsigned long dummy)
245 if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec / 2) 245 if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec / 2)
246 fail = update_persistent_clock(now); 246 fail = update_persistent_clock(now);
247 247
248 next.tv_nsec = (NSEC_PER_SEC / 2) - now.tv_nsec; 248 next.tv_nsec = (NSEC_PER_SEC / 2) - now.tv_nsec - (TICK_NSEC / 2);
249 if (next.tv_nsec <= 0) 249 if (next.tv_nsec <= 0)
250 next.tv_nsec += NSEC_PER_SEC; 250 next.tv_nsec += NSEC_PER_SEC;
251 251
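
The ntp.c change biases the CMOS-sync expiry back by half a tick so that, once the timer layer rounds the expiry up to a tick boundary, it straddles the 500 ms point instead of always landing after it. A standalone sketch of that arithmetic, assuming HZ=250; the kernel derives TICK_NSEC slightly differently, but it is close to NSEC_PER_SEC/HZ:

#include <stdio.h>

#define NSEC_PER_SEC 1000000000L
#define HZ           250
#define TICK_NSEC    (NSEC_PER_SEC / HZ)

static long delay_to_half_second(long now_nsec, int bias_half_tick)
{
        long next = NSEC_PER_SEC / 2 - now_nsec;

        if (bias_half_tick)
                next -= TICK_NSEC / 2;          /* the new bias */
        if (next <= 0)
                next += NSEC_PER_SEC;           /* aim for the next second */
        return next;
}

int main(void)
{
        long now = 123456789L;   /* hypothetical tv_nsec of "now" */

        printf("old delay: %ld ns\n", delay_to_half_second(now, 0));
        printf("new delay: %ld ns\n", delay_to_half_second(now, 1));
        return 0;
}
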
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 57a1f02e5ec0..cb01cd8f919b 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -30,6 +30,7 @@
30struct tick_device tick_broadcast_device; 30struct tick_device tick_broadcast_device;
31static cpumask_t tick_broadcast_mask; 31static cpumask_t tick_broadcast_mask;
32static DEFINE_SPINLOCK(tick_broadcast_lock); 32static DEFINE_SPINLOCK(tick_broadcast_lock);
33static int tick_broadcast_force;
33 34
34#ifdef CONFIG_TICK_ONESHOT 35#ifdef CONFIG_TICK_ONESHOT
35static void tick_broadcast_clear_oneshot(int cpu); 36static void tick_broadcast_clear_oneshot(int cpu);
@@ -174,6 +175,8 @@ static void tick_do_periodic_broadcast(void)
174 */ 175 */
175static void tick_handle_periodic_broadcast(struct clock_event_device *dev) 176static void tick_handle_periodic_broadcast(struct clock_event_device *dev)
176{ 177{
178 ktime_t next;
179
177 tick_do_periodic_broadcast(); 180 tick_do_periodic_broadcast();
178 181
179 /* 182 /*
@@ -184,10 +187,13 @@ static void tick_handle_periodic_broadcast(struct clock_event_device *dev)
184 187
185 /* 188 /*
186 * Setup the next period for devices, which do not have 189 * Setup the next period for devices, which do not have
187 * periodic mode: 190 * periodic mode. We read dev->next_event first and add to it
191 * when the event already expired. clockevents_program_event()
192 * sets dev->next_event only when the event is really
193 * programmed to the device.
188 */ 194 */
189 for (;;) { 195 for (next = dev->next_event; ;) {
190 ktime_t next = ktime_add(dev->next_event, tick_period); 196 next = ktime_add(next, tick_period);
191 197
192 if (!clockevents_program_event(dev, next, ktime_get())) 198 if (!clockevents_program_event(dev, next, ktime_get()))
193 return; 199 return;
@@ -204,7 +210,7 @@ static void tick_do_broadcast_on_off(void *why)
204 struct clock_event_device *bc, *dev; 210 struct clock_event_device *bc, *dev;
205 struct tick_device *td; 211 struct tick_device *td;
206 unsigned long flags, *reason = why; 212 unsigned long flags, *reason = why;
207 int cpu; 213 int cpu, bc_stopped;
208 214
209 spin_lock_irqsave(&tick_broadcast_lock, flags); 215 spin_lock_irqsave(&tick_broadcast_lock, flags);
210 216
@@ -222,30 +228,35 @@ static void tick_do_broadcast_on_off(void *why)
222 if (!tick_device_is_functional(dev)) 228 if (!tick_device_is_functional(dev))
223 goto out; 229 goto out;
224 230
231 bc_stopped = cpus_empty(tick_broadcast_mask);
232
225 switch (*reason) { 233 switch (*reason) {
226 case CLOCK_EVT_NOTIFY_BROADCAST_ON: 234 case CLOCK_EVT_NOTIFY_BROADCAST_ON:
227 case CLOCK_EVT_NOTIFY_BROADCAST_FORCE: 235 case CLOCK_EVT_NOTIFY_BROADCAST_FORCE:
228 if (!cpu_isset(cpu, tick_broadcast_mask)) { 236 if (!cpu_isset(cpu, tick_broadcast_mask)) {
229 cpu_set(cpu, tick_broadcast_mask); 237 cpu_set(cpu, tick_broadcast_mask);
230 if (td->mode == TICKDEV_MODE_PERIODIC) 238 if (tick_broadcast_device.mode ==
231 clockevents_set_mode(dev, 239 TICKDEV_MODE_PERIODIC)
232 CLOCK_EVT_MODE_SHUTDOWN); 240 clockevents_shutdown(dev);
233 } 241 }
234 if (*reason == CLOCK_EVT_NOTIFY_BROADCAST_FORCE) 242 if (*reason == CLOCK_EVT_NOTIFY_BROADCAST_FORCE)
235 dev->features |= CLOCK_EVT_FEAT_DUMMY; 243 tick_broadcast_force = 1;
236 break; 244 break;
237 case CLOCK_EVT_NOTIFY_BROADCAST_OFF: 245 case CLOCK_EVT_NOTIFY_BROADCAST_OFF:
238 if (cpu_isset(cpu, tick_broadcast_mask)) { 246 if (!tick_broadcast_force &&
247 cpu_isset(cpu, tick_broadcast_mask)) {
239 cpu_clear(cpu, tick_broadcast_mask); 248 cpu_clear(cpu, tick_broadcast_mask);
240 if (td->mode == TICKDEV_MODE_PERIODIC) 249 if (tick_broadcast_device.mode ==
250 TICKDEV_MODE_PERIODIC)
241 tick_setup_periodic(dev, 0); 251 tick_setup_periodic(dev, 0);
242 } 252 }
243 break; 253 break;
244 } 254 }
245 255
246 if (cpus_empty(tick_broadcast_mask)) 256 if (cpus_empty(tick_broadcast_mask)) {
247 clockevents_set_mode(bc, CLOCK_EVT_MODE_SHUTDOWN); 257 if (!bc_stopped)
248 else { 258 clockevents_shutdown(bc);
259 } else if (bc_stopped) {
249 if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) 260 if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC)
250 tick_broadcast_start_periodic(bc); 261 tick_broadcast_start_periodic(bc);
251 else 262 else
@@ -266,7 +277,7 @@ void tick_broadcast_on_off(unsigned long reason, int *oncpu)
266 "offline CPU #%d\n", *oncpu); 277 "offline CPU #%d\n", *oncpu);
267 else 278 else
268 smp_call_function_single(*oncpu, tick_do_broadcast_on_off, 279 smp_call_function_single(*oncpu, tick_do_broadcast_on_off,
269 &reason, 1, 1); 280 &reason, 1);
270} 281}
271 282
272/* 283/*
@@ -296,7 +307,7 @@ void tick_shutdown_broadcast(unsigned int *cpup)
296 307
297 if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) { 308 if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) {
298 if (bc && cpus_empty(tick_broadcast_mask)) 309 if (bc && cpus_empty(tick_broadcast_mask))
299 clockevents_set_mode(bc, CLOCK_EVT_MODE_SHUTDOWN); 310 clockevents_shutdown(bc);
300 } 311 }
301 312
302 spin_unlock_irqrestore(&tick_broadcast_lock, flags); 313 spin_unlock_irqrestore(&tick_broadcast_lock, flags);
@@ -311,7 +322,7 @@ void tick_suspend_broadcast(void)
311 322
312 bc = tick_broadcast_device.evtdev; 323 bc = tick_broadcast_device.evtdev;
313 if (bc) 324 if (bc)
314 clockevents_set_mode(bc, CLOCK_EVT_MODE_SHUTDOWN); 325 clockevents_shutdown(bc);
315 326
316 spin_unlock_irqrestore(&tick_broadcast_lock, flags); 327 spin_unlock_irqrestore(&tick_broadcast_lock, flags);
317} 328}
@@ -362,16 +373,8 @@ cpumask_t *tick_get_broadcast_oneshot_mask(void)
362static int tick_broadcast_set_event(ktime_t expires, int force) 373static int tick_broadcast_set_event(ktime_t expires, int force)
363{ 374{
364 struct clock_event_device *bc = tick_broadcast_device.evtdev; 375 struct clock_event_device *bc = tick_broadcast_device.evtdev;
365 ktime_t now = ktime_get(); 376
366 int res; 377 return tick_dev_program_event(bc, expires, force);
367
368 for(;;) {
369 res = clockevents_program_event(bc, expires, now);
370 if (!res || !force)
371 return res;
372 now = ktime_get();
373 expires = ktime_add(now, ktime_set(0, bc->min_delta_ns));
374 }
375} 378}
376 379
377int tick_resume_broadcast_oneshot(struct clock_event_device *bc) 380int tick_resume_broadcast_oneshot(struct clock_event_device *bc)
@@ -397,8 +400,7 @@ again:
397 mask = CPU_MASK_NONE; 400 mask = CPU_MASK_NONE;
398 now = ktime_get(); 401 now = ktime_get();
399 /* Find all expired events */ 402 /* Find all expired events */
400 for (cpu = first_cpu(tick_broadcast_oneshot_mask); cpu != NR_CPUS; 403 for_each_cpu_mask_nr(cpu, tick_broadcast_oneshot_mask) {
401 cpu = next_cpu(cpu, tick_broadcast_oneshot_mask)) {
402 td = &per_cpu(tick_cpu_device, cpu); 404 td = &per_cpu(tick_cpu_device, cpu);
403 if (td->evtdev->next_event.tv64 <= now.tv64) 405 if (td->evtdev->next_event.tv64 <= now.tv64)
404 cpu_set(cpu, mask); 406 cpu_set(cpu, mask);
@@ -490,14 +492,52 @@ static void tick_broadcast_clear_oneshot(int cpu)
490 cpu_clear(cpu, tick_broadcast_oneshot_mask); 492 cpu_clear(cpu, tick_broadcast_oneshot_mask);
491} 493}
492 494
495static void tick_broadcast_init_next_event(cpumask_t *mask, ktime_t expires)
496{
497 struct tick_device *td;
498 int cpu;
499
500 for_each_cpu_mask_nr(cpu, *mask) {
501 td = &per_cpu(tick_cpu_device, cpu);
502 if (td->evtdev)
503 td->evtdev->next_event = expires;
504 }
505}
506
493/** 507/**
494 * tick_broadcast_setup_oneshot - setup the broadcast device 508 * tick_broadcast_setup_oneshot - setup the broadcast device
495 */ 509 */
496void tick_broadcast_setup_oneshot(struct clock_event_device *bc) 510void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
497{ 511{
498 bc->event_handler = tick_handle_oneshot_broadcast; 512 /* Set it up only once ! */
499 clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); 513 if (bc->event_handler != tick_handle_oneshot_broadcast) {
500 bc->next_event.tv64 = KTIME_MAX; 514 int was_periodic = bc->mode == CLOCK_EVT_MODE_PERIODIC;
515 int cpu = smp_processor_id();
516 cpumask_t mask;
517
518 bc->event_handler = tick_handle_oneshot_broadcast;
519 clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT);
520
521 /* Take the do_timer update */
522 tick_do_timer_cpu = cpu;
523
524 /*
525 * We must be careful here. There might be other CPUs
526 * waiting for periodic broadcast. We need to set the
527 * oneshot_mask bits for those and program the
528 * broadcast device to fire.
529 */
530 mask = tick_broadcast_mask;
531 cpu_clear(cpu, mask);
532 cpus_or(tick_broadcast_oneshot_mask,
533 tick_broadcast_oneshot_mask, mask);
534
535 if (was_periodic && !cpus_empty(mask)) {
536 tick_broadcast_init_next_event(&mask, tick_next_period);
537 tick_broadcast_set_event(tick_next_period, 1);
538 } else
539 bc->next_event.tv64 = KTIME_MAX;
540 }
501} 541}
502 542
503/* 543/*
@@ -537,4 +577,12 @@ void tick_shutdown_broadcast_oneshot(unsigned int *cpup)
537 spin_unlock_irqrestore(&tick_broadcast_lock, flags); 577 spin_unlock_irqrestore(&tick_broadcast_lock, flags);
538} 578}
539 579
580/*
581 * Check, whether the broadcast device is in one shot mode
582 */
583int tick_broadcast_oneshot_active(void)
584{
585 return tick_broadcast_device.mode == TICKDEV_MODE_ONESHOT;
586}
587
540#endif 588#endif
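
tick_handle_periodic_broadcast() now starts its catch-up loop from the previously stored expiry and keeps adding tick_period until programming succeeds, because dev->next_event is only updated when an event is actually programmed. A userspace sketch of that loop with nanosecond integers standing in for ktime_t; the period and timestamps are made up:

#include <stdio.h>

typedef long long ktime_t;              /* nanoseconds, like ktime_t.tv64 */

/* returns 0 on success, i.e. when expires is still in the future */
static int program_event(ktime_t expires, ktime_t now)
{
        return expires > now ? 0 : -1;
}

int main(void)
{
        ktime_t period = 4000000;       /* 4 ms tick, for illustration */
        ktime_t next   = 1000000000;    /* last programmed expiry */
        ktime_t now    = 1013500000;    /* already several ticks late */

        /* advance in whole periods until the event lands in the future */
        for (;;) {
                next += period;
                if (!program_event(next, now)) {
                        printf("programmed expiry at %lld ns\n", next);
                        break;
                }
        }
        return 0;
}
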
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 4f3886562b8c..df12434b43ca 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -33,7 +33,7 @@ DEFINE_PER_CPU(struct tick_device, tick_cpu_device);
33 */ 33 */
34ktime_t tick_next_period; 34ktime_t tick_next_period;
35ktime_t tick_period; 35ktime_t tick_period;
36int tick_do_timer_cpu __read_mostly = -1; 36int tick_do_timer_cpu __read_mostly = TICK_DO_TIMER_BOOT;
37DEFINE_SPINLOCK(tick_device_lock); 37DEFINE_SPINLOCK(tick_device_lock);
38 38
39/* 39/*
@@ -109,7 +109,8 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast)
109 if (!tick_device_is_functional(dev)) 109 if (!tick_device_is_functional(dev))
110 return; 110 return;
111 111
112 if (dev->features & CLOCK_EVT_FEAT_PERIODIC) { 112 if ((dev->features & CLOCK_EVT_FEAT_PERIODIC) &&
113 !tick_broadcast_oneshot_active()) {
113 clockevents_set_mode(dev, CLOCK_EVT_MODE_PERIODIC); 114 clockevents_set_mode(dev, CLOCK_EVT_MODE_PERIODIC);
114 } else { 115 } else {
115 unsigned long seq; 116 unsigned long seq;
@@ -135,7 +136,7 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast)
135 */ 136 */
136static void tick_setup_device(struct tick_device *td, 137static void tick_setup_device(struct tick_device *td,
137 struct clock_event_device *newdev, int cpu, 138 struct clock_event_device *newdev, int cpu,
138 cpumask_t cpumask) 139 const cpumask_t *cpumask)
139{ 140{
140 ktime_t next_event; 141 ktime_t next_event;
141 void (*handler)(struct clock_event_device *) = NULL; 142 void (*handler)(struct clock_event_device *) = NULL;
@@ -148,7 +149,7 @@ static void tick_setup_device(struct tick_device *td,
148 * If no cpu took the do_timer update, assign it to 149 * If no cpu took the do_timer update, assign it to
149 * this cpu: 150 * this cpu:
150 */ 151 */
151 if (tick_do_timer_cpu == -1) { 152 if (tick_do_timer_cpu == TICK_DO_TIMER_BOOT) {
152 tick_do_timer_cpu = cpu; 153 tick_do_timer_cpu = cpu;
153 tick_next_period = ktime_get(); 154 tick_next_period = ktime_get();
154 tick_period = ktime_set(0, NSEC_PER_SEC / HZ); 155 tick_period = ktime_set(0, NSEC_PER_SEC / HZ);
@@ -161,6 +162,7 @@ static void tick_setup_device(struct tick_device *td,
161 } else { 162 } else {
162 handler = td->evtdev->event_handler; 163 handler = td->evtdev->event_handler;
163 next_event = td->evtdev->next_event; 164 next_event = td->evtdev->next_event;
165 td->evtdev->event_handler = clockevents_handle_noop;
164 } 166 }
165 167
166 td->evtdev = newdev; 168 td->evtdev = newdev;
@@ -169,8 +171,8 @@ static void tick_setup_device(struct tick_device *td,
169 * When the device is not per cpu, pin the interrupt to the 171 * When the device is not per cpu, pin the interrupt to the
170 * current cpu: 172 * current cpu:
171 */ 173 */
172 if (!cpus_equal(newdev->cpumask, cpumask)) 174 if (!cpus_equal(newdev->cpumask, *cpumask))
173 irq_set_affinity(newdev->irq, cpumask); 175 irq_set_affinity(newdev->irq, *cpumask);
174 176
175 /* 177 /*
176 * When global broadcasting is active, check if the current 178 * When global broadcasting is active, check if the current
@@ -196,7 +198,6 @@ static int tick_check_new_device(struct clock_event_device *newdev)
196 struct tick_device *td; 198 struct tick_device *td;
197 int cpu, ret = NOTIFY_OK; 199 int cpu, ret = NOTIFY_OK;
198 unsigned long flags; 200 unsigned long flags;
199 cpumask_t cpumask;
200 201
201 spin_lock_irqsave(&tick_device_lock, flags); 202 spin_lock_irqsave(&tick_device_lock, flags);
202 203
@@ -206,10 +207,9 @@ static int tick_check_new_device(struct clock_event_device *newdev)
206 207
207 td = &per_cpu(tick_cpu_device, cpu); 208 td = &per_cpu(tick_cpu_device, cpu);
208 curdev = td->evtdev; 209 curdev = td->evtdev;
209 cpumask = cpumask_of_cpu(cpu);
210 210
211 /* cpu local device ? */ 211 /* cpu local device ? */
212 if (!cpus_equal(newdev->cpumask, cpumask)) { 212 if (!cpus_equal(newdev->cpumask, cpumask_of_cpu(cpu))) {
213 213
214 /* 214 /*
215 * If the cpu affinity of the device interrupt can not 215 * If the cpu affinity of the device interrupt can not
@@ -222,7 +222,7 @@ static int tick_check_new_device(struct clock_event_device *newdev)
222 * If we have a cpu local device already, do not replace it 222 * If we have a cpu local device already, do not replace it
223 * by a non cpu local device 223 * by a non cpu local device
224 */ 224 */
225 if (curdev && cpus_equal(curdev->cpumask, cpumask)) 225 if (curdev && cpus_equal(curdev->cpumask, cpumask_of_cpu(cpu)))
226 goto out_bc; 226 goto out_bc;
227 } 227 }
228 228
@@ -250,11 +250,11 @@ static int tick_check_new_device(struct clock_event_device *newdev)
250 * not give it back to the clockevents layer ! 250 * not give it back to the clockevents layer !
251 */ 251 */
252 if (tick_is_broadcast_device(curdev)) { 252 if (tick_is_broadcast_device(curdev)) {
253 clockevents_set_mode(curdev, CLOCK_EVT_MODE_SHUTDOWN); 253 clockevents_shutdown(curdev);
254 curdev = NULL; 254 curdev = NULL;
255 } 255 }
256 clockevents_exchange_device(curdev, newdev); 256 clockevents_exchange_device(curdev, newdev);
257 tick_setup_device(td, newdev, cpu, cpumask); 257 tick_setup_device(td, newdev, cpu, &cpumask_of_cpu(cpu));
258 if (newdev->features & CLOCK_EVT_FEAT_ONESHOT) 258 if (newdev->features & CLOCK_EVT_FEAT_ONESHOT)
259 tick_oneshot_notify(); 259 tick_oneshot_notify();
260 260
@@ -301,7 +301,8 @@ static void tick_shutdown(unsigned int *cpup)
301 if (*cpup == tick_do_timer_cpu) { 301 if (*cpup == tick_do_timer_cpu) {
302 int cpu = first_cpu(cpu_online_map); 302 int cpu = first_cpu(cpu_online_map);
303 303
304 tick_do_timer_cpu = (cpu != NR_CPUS) ? cpu : -1; 304 tick_do_timer_cpu = (cpu != NR_CPUS) ? cpu :
305 TICK_DO_TIMER_NONE;
305 } 306 }
306 spin_unlock_irqrestore(&tick_device_lock, flags); 307 spin_unlock_irqrestore(&tick_device_lock, flags);
307} 308}
@@ -312,7 +313,7 @@ static void tick_suspend(void)
312 unsigned long flags; 313 unsigned long flags;
313 314
314 spin_lock_irqsave(&tick_device_lock, flags); 315 spin_lock_irqsave(&tick_device_lock, flags);
315 clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_SHUTDOWN); 316 clockevents_shutdown(td->evtdev);
316 spin_unlock_irqrestore(&tick_device_lock, flags); 317 spin_unlock_irqrestore(&tick_device_lock, flags);
317} 318}
318 319
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index f13f2b7f4fd4..469248782c23 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -1,6 +1,10 @@
1/* 1/*
2 * tick internal variable and functions used by low/high res code 2 * tick internal variable and functions used by low/high res code
3 */ 3 */
4
5#define TICK_DO_TIMER_NONE -1
6#define TICK_DO_TIMER_BOOT -2
7
4DECLARE_PER_CPU(struct tick_device, tick_cpu_device); 8DECLARE_PER_CPU(struct tick_device, tick_cpu_device);
5extern spinlock_t tick_device_lock; 9extern spinlock_t tick_device_lock;
6extern ktime_t tick_next_period; 10extern ktime_t tick_next_period;
@@ -10,6 +14,8 @@ extern int tick_do_timer_cpu __read_mostly;
10extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast); 14extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast);
11extern void tick_handle_periodic(struct clock_event_device *dev); 15extern void tick_handle_periodic(struct clock_event_device *dev);
12 16
17extern void clockevents_shutdown(struct clock_event_device *dev);
18
13/* 19/*
14 * NO_HZ / high resolution timer shared code 20 * NO_HZ / high resolution timer shared code
15 */ 21 */
@@ -17,6 +23,8 @@ extern void tick_handle_periodic(struct clock_event_device *dev);
17extern void tick_setup_oneshot(struct clock_event_device *newdev, 23extern void tick_setup_oneshot(struct clock_event_device *newdev,
18 void (*handler)(struct clock_event_device *), 24 void (*handler)(struct clock_event_device *),
19 ktime_t nextevt); 25 ktime_t nextevt);
26extern int tick_dev_program_event(struct clock_event_device *dev,
27 ktime_t expires, int force);
20extern int tick_program_event(ktime_t expires, int force); 28extern int tick_program_event(ktime_t expires, int force);
21extern void tick_oneshot_notify(void); 29extern void tick_oneshot_notify(void);
22extern int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *)); 30extern int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *));
@@ -27,6 +35,7 @@ extern void tick_broadcast_oneshot_control(unsigned long reason);
27extern void tick_broadcast_switch_to_oneshot(void); 35extern void tick_broadcast_switch_to_oneshot(void);
28extern void tick_shutdown_broadcast_oneshot(unsigned int *cpup); 36extern void tick_shutdown_broadcast_oneshot(unsigned int *cpup);
29extern int tick_resume_broadcast_oneshot(struct clock_event_device *bc); 37extern int tick_resume_broadcast_oneshot(struct clock_event_device *bc);
38extern int tick_broadcast_oneshot_active(void);
30# else /* BROADCAST */ 39# else /* BROADCAST */
31static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) 40static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
32{ 41{
@@ -35,6 +44,7 @@ static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
35static inline void tick_broadcast_oneshot_control(unsigned long reason) { } 44static inline void tick_broadcast_oneshot_control(unsigned long reason) { }
36static inline void tick_broadcast_switch_to_oneshot(void) { } 45static inline void tick_broadcast_switch_to_oneshot(void) { }
37static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { } 46static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { }
47static inline int tick_broadcast_oneshot_active(void) { return 0; }
38# endif /* !BROADCAST */ 48# endif /* !BROADCAST */
39 49
40#else /* !ONESHOT */ 50#else /* !ONESHOT */
@@ -64,6 +74,7 @@ static inline int tick_resume_broadcast_oneshot(struct clock_event_device *bc)
64{ 74{
65 return 0; 75 return 0;
66} 76}
77static inline int tick_broadcast_oneshot_active(void) { return 0; }
67#endif /* !TICK_ONESHOT */ 78#endif /* !TICK_ONESHOT */
68 79
69/* 80/*
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c
index 450c04935b66..2e8de678e767 100644
--- a/kernel/time/tick-oneshot.c
+++ b/kernel/time/tick-oneshot.c
@@ -23,24 +23,56 @@
23#include "tick-internal.h" 23#include "tick-internal.h"
24 24
25/** 25/**
26 * tick_program_event 26 * tick_program_event internal worker function
27 */ 27 */
28int tick_program_event(ktime_t expires, int force) 28int tick_dev_program_event(struct clock_event_device *dev, ktime_t expires,
29 int force)
29{ 30{
30 struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev;
31 ktime_t now = ktime_get(); 31 ktime_t now = ktime_get();
32 int i;
32 33
33 while (1) { 34 for (i = 0;;) {
34 int ret = clockevents_program_event(dev, expires, now); 35 int ret = clockevents_program_event(dev, expires, now);
35 36
36 if (!ret || !force) 37 if (!ret || !force)
37 return ret; 38 return ret;
39
40 /*
41 * We tried 2 times to program the device with the given
42 * min_delta_ns. If that's not working then we double it
43 * and emit a warning.
44 */
45 if (++i > 2) {
46 /* Increase the min. delta and try again */
47 if (!dev->min_delta_ns)
48 dev->min_delta_ns = 5000;
49 else
50 dev->min_delta_ns += dev->min_delta_ns >> 1;
51
52 printk(KERN_WARNING
53 "CE: %s increasing min_delta_ns to %lu nsec\n",
54 dev->name ? dev->name : "?",
55 dev->min_delta_ns << 1);
56
57 i = 0;
58 }
59
38 now = ktime_get(); 60 now = ktime_get();
39 expires = ktime_add(now, ktime_set(0, dev->min_delta_ns)); 61 expires = ktime_add_ns(now, dev->min_delta_ns);
40 } 62 }
41} 63}
42 64
43/** 65/**
66 * tick_program_event
67 */
68int tick_program_event(ktime_t expires, int force)
69{
70 struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev;
71
72 return tick_dev_program_event(dev, expires, force);
73}
74
75/**
44 * tick_resume_oneshot - resume oneshot mode 76
45 */ 77 */
46void tick_resume_oneshot(void) 78void tick_resume_oneshot(void)
@@ -61,7 +93,7 @@ void tick_setup_oneshot(struct clock_event_device *newdev,
61{ 93{
62 newdev->event_handler = handler; 94 newdev->event_handler = handler;
63 clockevents_set_mode(newdev, CLOCK_EVT_MODE_ONESHOT); 95 clockevents_set_mode(newdev, CLOCK_EVT_MODE_ONESHOT);
64 clockevents_program_event(newdev, next_event, ktime_get()); 96 tick_dev_program_event(newdev, next_event, 1);
65} 97}
66 98
67/** 99/**
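
The forced-programming retry loop in tick_dev_program_event() can be exercised on its own; the sketch below (plain userspace C with a hypothetical program_event() and hardware limit, not kernel code) shows how min_delta_ns grows by 50% after every third consecutive failure until programming succeeds:

/* Standalone sketch of the retry policy: keep reprogramming, and bump
 * the minimum delta after three failures in a row. */
#include <stdio.h>

static unsigned long hw_min_ns = 20000;     /* assumed device limit */

static int program_event(unsigned long delta_ns)
{
    return delta_ns >= hw_min_ns ? 0 : -1;  /* 0 == success */
}

int main(void)
{
    unsigned long min_delta_ns = 5000;
    int i, tries = 0;

    for (i = 0;;) {
        tries++;
        if (!program_event(min_delta_ns))
            break;
        if (++i > 2) {                      /* third consecutive failure */
            min_delta_ns += min_delta_ns >> 1;
            printf("increasing min_delta_ns to %lu\n", min_delta_ns);
            i = 0;
        }
    }
    printf("programmed after %d tries, min_delta_ns=%lu\n",
           tries, min_delta_ns);
    return 0;
}
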
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index b854a895591e..a4d219398167 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -20,6 +20,7 @@
20#include <linux/profile.h> 20#include <linux/profile.h>
21#include <linux/sched.h> 21#include <linux/sched.h>
22#include <linux/tick.h> 22#include <linux/tick.h>
23#include <linux/module.h>
23 24
24#include <asm/irq_regs.h> 25#include <asm/irq_regs.h>
25 26
@@ -48,6 +49,13 @@ static void tick_do_update_jiffies64(ktime_t now)
48 unsigned long ticks = 0; 49 unsigned long ticks = 0;
49 ktime_t delta; 50 ktime_t delta;
50 51
52 /*
53 * Do a quick check without holding xtime_lock:
54 */
55 delta = ktime_sub(now, last_jiffies_update);
56 if (delta.tv64 < tick_period.tv64)
57 return;
58
51 /* Reevaluate with xtime_lock held */ 59
52 write_seqlock(&xtime_lock); 60 write_seqlock(&xtime_lock);
53 61
@@ -68,6 +76,9 @@ static void tick_do_update_jiffies64(ktime_t now)
68 incr * ticks); 76 incr * ticks);
69 } 77 }
70 do_timer(++ticks); 78 do_timer(++ticks);
79
80 /* Keep the tick_next_period variable up to date */
81 tick_next_period = ktime_add(last_jiffies_update, tick_period);
71 } 82 }
72 write_sequnlock(&xtime_lock); 83 write_sequnlock(&xtime_lock);
73} 84}
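
The unlocked early-return added to tick_do_update_jiffies64() is a check-then-recheck pattern: most callers bail out on a cheap comparison and never take xtime_lock. A standalone sketch of the same structure (pthreads, arbitrary time units, hypothetical names; the kernel uses a seqlock rather than a mutex):

/* Sketch: cheap unlocked test first, then re-evaluate under the lock. */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t xtime_lock = PTHREAD_MUTEX_INITIALIZER;
static long last_update;            /* protected by xtime_lock          */
static long period = 10;            /* "tick_period" in arbitrary units */
static long jiffies;

static void update_jiffies(long now)
{
    if (now - last_update < period)  /* cheap unlocked early-out        */
        return;

    pthread_mutex_lock(&xtime_lock);
    while (now - last_update >= period) {   /* re-check under the lock  */
        last_update += period;
        jiffies++;
    }
    pthread_mutex_unlock(&xtime_lock);
}

int main(void)
{
    update_jiffies(5);               /* too early: no lock, no update   */
    update_jiffies(35);              /* catches up by three ticks       */
    printf("jiffies=%ld last_update=%ld\n", jiffies, last_update);
    return 0;
}
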
@@ -133,8 +144,6 @@ void tick_nohz_update_jiffies(void)
133 if (!ts->tick_stopped) 144 if (!ts->tick_stopped)
134 return; 145 return;
135 146
136 touch_softlockup_watchdog();
137
138 cpu_clear(cpu, nohz_cpu_mask); 147 cpu_clear(cpu, nohz_cpu_mask);
139 now = ktime_get(); 148 now = ktime_get();
140 ts->idle_waketime = now; 149 ts->idle_waketime = now;
@@ -142,6 +151,8 @@ void tick_nohz_update_jiffies(void)
142 local_irq_save(flags); 151 local_irq_save(flags);
143 tick_do_update_jiffies64(now); 152 tick_do_update_jiffies64(now);
144 local_irq_restore(flags); 153 local_irq_restore(flags);
154
155 touch_softlockup_watchdog();
145} 156}
146 157
147void tick_nohz_stop_idle(int cpu) 158void tick_nohz_stop_idle(int cpu)
@@ -155,6 +166,8 @@ void tick_nohz_stop_idle(int cpu)
155 ts->idle_lastupdate = now; 166 ts->idle_lastupdate = now;
156 ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta); 167 ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
157 ts->idle_active = 0; 168 ts->idle_active = 0;
169
170 sched_clock_idle_wakeup_event(0);
158 } 171 }
159} 172}
160 173
@@ -170,6 +183,7 @@ static ktime_t tick_nohz_start_idle(struct tick_sched *ts)
170 } 183 }
171 ts->idle_entrytime = now; 184 ts->idle_entrytime = now;
172 ts->idle_active = 1; 185 ts->idle_active = 1;
186 sched_clock_idle_sleep_event();
173 return now; 187 return now;
174} 188}
175 189
@@ -177,9 +191,17 @@ u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
177{ 191{
178 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); 192 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
179 193
180 *last_update_time = ktime_to_us(ts->idle_lastupdate); 194 if (!tick_nohz_enabled)
195 return -1;
196
197 if (ts->idle_active)
198 *last_update_time = ktime_to_us(ts->idle_lastupdate);
199 else
200 *last_update_time = ktime_to_us(ktime_get());
201
181 return ktime_to_us(ts->idle_sleeptime); 202 return ktime_to_us(ts->idle_sleeptime);
182} 203}
204EXPORT_SYMBOL_GPL(get_cpu_idle_time_us);
183 205
184/** 206/**
185 * tick_nohz_stop_sched_tick - stop the idle tick from the idle task 207 * tick_nohz_stop_sched_tick - stop the idle tick from the idle task
@@ -188,7 +210,7 @@ u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
188 * Called either from the idle loop or from irq_exit() when an idle period was 210 * Called either from the idle loop or from irq_exit() when an idle period was
189 * just interrupted by an interrupt which did not cause a reschedule. 211 * just interrupted by an interrupt which did not cause a reschedule.
190 */ 212 */
191void tick_nohz_stop_sched_tick(void) 213void tick_nohz_stop_sched_tick(int inidle)
192{ 214{
193 unsigned long seq, last_jiffies, next_jiffies, delta_jiffies, flags; 215 unsigned long seq, last_jiffies, next_jiffies, delta_jiffies, flags;
194 struct tick_sched *ts; 216 struct tick_sched *ts;
@@ -211,12 +233,17 @@ void tick_nohz_stop_sched_tick(void)
211 */ 233 */
212 if (unlikely(!cpu_online(cpu))) { 234 if (unlikely(!cpu_online(cpu))) {
213 if (cpu == tick_do_timer_cpu) 235 if (cpu == tick_do_timer_cpu)
214 tick_do_timer_cpu = -1; 236 tick_do_timer_cpu = TICK_DO_TIMER_NONE;
215 } 237 }
216 238
217 if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) 239 if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE))
218 goto end; 240 goto end;
219 241
242 if (!inidle && !ts->inidle)
243 goto end;
244
245 ts->inidle = 1;
246
220 if (need_resched()) 247 if (need_resched())
221 goto end; 248 goto end;
222 249
@@ -228,6 +255,7 @@ void tick_nohz_stop_sched_tick(void)
228 local_softirq_pending()); 255 local_softirq_pending());
229 ratelimit++; 256 ratelimit++;
230 } 257 }
258 goto end;
231 } 259 }
232 260
233 ts->idle_calls++; 261 ts->idle_calls++;
@@ -287,7 +315,7 @@ void tick_nohz_stop_sched_tick(void)
287 * invoked. 315 * invoked.
288 */ 316 */
289 if (cpu == tick_do_timer_cpu) 317 if (cpu == tick_do_timer_cpu)
290 tick_do_timer_cpu = -1; 318 tick_do_timer_cpu = TICK_DO_TIMER_NONE;
291 319
292 ts->idle_sleeps++; 320 ts->idle_sleeps++;
293 321
@@ -364,11 +392,14 @@ void tick_nohz_restart_sched_tick(void)
364 local_irq_disable(); 392 local_irq_disable();
365 tick_nohz_stop_idle(cpu); 393 tick_nohz_stop_idle(cpu);
366 394
367 if (!ts->tick_stopped) { 395 if (!ts->inidle || !ts->tick_stopped) {
396 ts->inidle = 0;
368 local_irq_enable(); 397 local_irq_enable();
369 return; 398 return;
370 } 399 }
371 400
401 ts->inidle = 0;
402
372 rcu_exit_nohz(); 403 rcu_exit_nohz();
373 404
374 /* Update jiffies first */ 405 /* Update jiffies first */
@@ -449,7 +480,7 @@ static void tick_nohz_handler(struct clock_event_device *dev)
449 * this duty, then the jiffies update is still serialized by 480 * this duty, then the jiffies update is still serialized by
450 * xtime_lock. 481 * xtime_lock.
451 */ 482 */
452 if (unlikely(tick_do_timer_cpu == -1)) 483 if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE))
453 tick_do_timer_cpu = cpu; 484 tick_do_timer_cpu = cpu;
454 485
455 /* Check, if the jiffies need an update */ 486 /* Check, if the jiffies need an update */
@@ -551,7 +582,7 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
551 * this duty, then the jiffies update is still serialized by 582 * this duty, then the jiffies update is still serialized by
552 * xtime_lock. 583 * xtime_lock.
553 */ 584 */
554 if (unlikely(tick_do_timer_cpu == -1)) 585 if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE))
555 tick_do_timer_cpu = cpu; 586 tick_do_timer_cpu = cpu;
556#endif 587#endif
557 588
@@ -603,7 +634,7 @@ void tick_setup_sched_timer(void)
603 */ 634 */
604 hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); 635 hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
605 ts->sched_timer.function = tick_sched_timer; 636 ts->sched_timer.function = tick_sched_timer;
606 ts->sched_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; 637 ts->sched_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU;
607 638
608 /* Get the next period (per cpu) */ 639 /* Get the next period (per cpu) */
609 ts->sched_timer.expires = tick_init_jiffy_update(); 640 ts->sched_timer.expires = tick_init_jiffy_update();
@@ -627,17 +658,21 @@ void tick_setup_sched_timer(void)
627 ts->nohz_mode = NOHZ_MODE_HIGHRES; 658 ts->nohz_mode = NOHZ_MODE_HIGHRES;
628#endif 659#endif
629} 660}
661#endif /* HIGH_RES_TIMERS */
630 662
663#if defined CONFIG_NO_HZ || defined CONFIG_HIGH_RES_TIMERS
631void tick_cancel_sched_timer(int cpu) 664void tick_cancel_sched_timer(int cpu)
632{ 665{
633 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); 666 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
634 667
668# ifdef CONFIG_HIGH_RES_TIMERS
635 if (ts->sched_timer.base) 669 if (ts->sched_timer.base)
636 hrtimer_cancel(&ts->sched_timer); 670 hrtimer_cancel(&ts->sched_timer);
671# endif
637 672
638 ts->nohz_mode = NOHZ_MODE_INACTIVE; 673 ts->nohz_mode = NOHZ_MODE_INACTIVE;
639} 674}
640#endif /* HIGH_RES_TIMERS */ 675#endif
641 676
642/** 677/**
643 * Async notification about clocksource changes 678 * Async notification about clocksource changes
diff --git a/kernel/timer.c b/kernel/timer.c
index ceacc6626572..03bc7f1f1593 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -812,7 +812,7 @@ static inline void __run_timers(struct tvec_base *base)
812 spin_unlock_irq(&base->lock); 812 spin_unlock_irq(&base->lock);
813} 813}
814 814
815#if defined(CONFIG_NO_IDLE_HZ) || defined(CONFIG_NO_HZ) 815#ifdef CONFIG_NO_HZ
816/* 816/*
817 * Find out when the next timer event is due to happen. This 817 * Find out when the next timer event is due to happen. This
818 * is used on S/390 to stop all activity when a cpu is idle. 818
@@ -947,14 +947,6 @@ unsigned long get_next_timer_interrupt(unsigned long now)
947 947
948 return cmp_next_hrtimer_event(now, expires); 948 return cmp_next_hrtimer_event(now, expires);
949} 949}
950
951#ifdef CONFIG_NO_IDLE_HZ
952unsigned long next_timer_interrupt(void)
953{
954 return get_next_timer_interrupt(jiffies);
955}
956#endif
957
958#endif 950#endif
959 951
960#ifndef CONFIG_VIRT_CPU_ACCOUNTING 952#ifndef CONFIG_VIRT_CPU_ACCOUNTING
@@ -1502,7 +1494,7 @@ void __init init_timers(void)
1502 1494
1503 BUG_ON(err == NOTIFY_BAD); 1495 BUG_ON(err == NOTIFY_BAD);
1504 register_cpu_notifier(&timers_nb); 1496 register_cpu_notifier(&timers_nb);
1505 open_softirq(TIMER_SOFTIRQ, run_timer_softirq, NULL); 1497 open_softirq(TIMER_SOFTIRQ, run_timer_softirq);
1506} 1498}
1507 1499
1508/** 1500/**
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
new file mode 100644
index 000000000000..263e9e6bbd60
--- /dev/null
+++ b/kernel/trace/Kconfig
@@ -0,0 +1,135 @@
1#
2# Architectures that offer an FTRACE implementation should select HAVE_FTRACE:
3#
4config HAVE_FTRACE
5 bool
6
7config HAVE_DYNAMIC_FTRACE
8 bool
9
10config TRACER_MAX_TRACE
11 bool
12
13config TRACING
14 bool
15 select DEBUG_FS
16 select STACKTRACE
17
18config FTRACE
19 bool "Kernel Function Tracer"
20 depends on HAVE_FTRACE
21 select FRAME_POINTER
22 select TRACING
23 select CONTEXT_SWITCH_TRACER
24 help
25	  Enable the kernel to trace every kernel function. This is done
26	  by using a compiler feature to insert a small, 5-byte No-Operation
27	  instruction at the beginning of every kernel function; this NOP
28	  sequence is then dynamically patched into a tracer call when
29	  tracing is enabled by the administrator. If it is disabled at runtime
30	  (the bootup default), then the overhead of the instructions is very
31	  small and not measurable even in micro-benchmarks.
32
33config IRQSOFF_TRACER
34 bool "Interrupts-off Latency Tracer"
35 default n
36 depends on TRACE_IRQFLAGS_SUPPORT
37 depends on GENERIC_TIME
38 depends on HAVE_FTRACE
39 select TRACE_IRQFLAGS
40 select TRACING
41 select TRACER_MAX_TRACE
42 help
43 This option measures the time spent in irqs-off critical
44 sections, with microsecond accuracy.
45
46 The default measurement method is a maximum search, which is
47 disabled by default and can be runtime (re-)started
48 via:
49
50 echo 0 > /debugfs/tracing/tracing_max_latency
51
52	  (Note that kernel size and overhead increase with this option
53 enabled. This option and the preempt-off timing option can be
54 used together or separately.)
55
56config PREEMPT_TRACER
57 bool "Preemption-off Latency Tracer"
58 default n
59 depends on GENERIC_TIME
60 depends on PREEMPT
61 depends on HAVE_FTRACE
62 select TRACING
63 select TRACER_MAX_TRACE
64 help
65 This option measures the time spent in preemption off critical
66 sections, with microsecond accuracy.
67
68 The default measurement method is a maximum search, which is
69 disabled by default and can be runtime (re-)started
70 via:
71
72 echo 0 > /debugfs/tracing/tracing_max_latency
73
74	  (Note that kernel size and overhead increase with this option
75 enabled. This option and the irqs-off timing option can be
76 used together or separately.)
77
78config SYSPROF_TRACER
79 bool "Sysprof Tracer"
80 depends on X86
81 select TRACING
82 help
83 This tracer provides the trace needed by the 'Sysprof' userspace
84 tool.
85
86config SCHED_TRACER
87 bool "Scheduling Latency Tracer"
88 depends on HAVE_FTRACE
89 select TRACING
90 select CONTEXT_SWITCH_TRACER
91 select TRACER_MAX_TRACE
92 help
93 This tracer tracks the latency of the highest priority task
94 to be scheduled in, starting from the point it has woken up.
95
96config CONTEXT_SWITCH_TRACER
97 bool "Trace process context switches"
98 depends on HAVE_FTRACE
99 select TRACING
100 select MARKERS
101 help
102 This tracer gets called from the context switch and records
103 all switching of tasks.
104
105config DYNAMIC_FTRACE
106 bool "enable/disable ftrace tracepoints dynamically"
107 depends on FTRACE
108 depends on HAVE_DYNAMIC_FTRACE
109 default y
110 help
111 This option will modify all the calls to ftrace dynamically
112	 (will patch them out of the binary image and replace them
113 with a No-Op instruction) as they are called. A table is
114 created to dynamically enable them again.
115
116 This way a CONFIG_FTRACE kernel is slightly larger, but otherwise
117 has native performance as long as no tracing is active.
118
119 The changes to the code are done by a kernel thread that
120 wakes up once a second and checks to see if any ftrace calls
121	 were made. If so, it runs stop_machine (stops all CPUs)
122 and modifies the code to jump over the call to ftrace.
123
124config FTRACE_SELFTEST
125 bool
126
127config FTRACE_STARTUP_TEST
128 bool "Perform a startup test on ftrace"
129 depends on TRACING
130 select FTRACE_SELFTEST
131 help
132	  This option performs a series of startup tests on ftrace. On bootup,
133	  these tests are run to verify that the tracer is
134	  functioning properly. Tests are run for all of the configured
135	  tracers of ftrace.
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
new file mode 100644
index 000000000000..71d17de17288
--- /dev/null
+++ b/kernel/trace/Makefile
@@ -0,0 +1,24 @@
1
2# Do not instrument the tracer itself:
3
4ifdef CONFIG_FTRACE
5ORIG_CFLAGS := $(KBUILD_CFLAGS)
6KBUILD_CFLAGS = $(subst -pg,,$(ORIG_CFLAGS))
7
8# selftest needs instrumentation
9CFLAGS_trace_selftest_dynamic.o = -pg
10obj-y += trace_selftest_dynamic.o
11endif
12
13obj-$(CONFIG_FTRACE) += libftrace.o
14
15obj-$(CONFIG_TRACING) += trace.o
16obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o
17obj-$(CONFIG_SYSPROF_TRACER) += trace_sysprof.o
18obj-$(CONFIG_FTRACE) += trace_functions.o
19obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o
20obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o
21obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o
22obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o
23
24libftrace-y := ftrace.o
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
new file mode 100644
index 000000000000..f6e3af31b403
--- /dev/null
+++ b/kernel/trace/ftrace.c
@@ -0,0 +1,1727 @@
1/*
2 * Infrastructure for profiling code inserted by 'gcc -pg'.
3 *
4 * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com>
5 * Copyright (C) 2004-2008 Ingo Molnar <mingo@redhat.com>
6 *
7 * Originally ported from the -rt patch by:
8 * Copyright (C) 2007 Arnaldo Carvalho de Melo <acme@redhat.com>
9 *
10 * Based on code in the latency_tracer, that is:
11 *
12 * Copyright (C) 2004-2006 Ingo Molnar
13 * Copyright (C) 2004 William Lee Irwin III
14 */
15
16#include <linux/stop_machine.h>
17#include <linux/clocksource.h>
18#include <linux/kallsyms.h>
19#include <linux/seq_file.h>
20#include <linux/debugfs.h>
21#include <linux/hardirq.h>
22#include <linux/kthread.h>
23#include <linux/uaccess.h>
24#include <linux/kprobes.h>
25#include <linux/ftrace.h>
26#include <linux/sysctl.h>
27#include <linux/ctype.h>
28#include <linux/hash.h>
29#include <linux/list.h>
30
31#include <asm/ftrace.h>
32
33#include "trace.h"
34
35/* ftrace_enabled is a method to turn ftrace on or off */
36int ftrace_enabled __read_mostly;
37static int last_ftrace_enabled;
38
39/*
40 * ftrace_disabled is set when an anomaly is discovered.
41 * ftrace_disabled is much stronger than ftrace_enabled.
42 */
43static int ftrace_disabled __read_mostly;
44
45static DEFINE_SPINLOCK(ftrace_lock);
46static DEFINE_MUTEX(ftrace_sysctl_lock);
47
48static struct ftrace_ops ftrace_list_end __read_mostly =
49{
50 .func = ftrace_stub,
51};
52
53static struct ftrace_ops *ftrace_list __read_mostly = &ftrace_list_end;
54ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub;
55
56static void ftrace_list_func(unsigned long ip, unsigned long parent_ip)
57{
58 struct ftrace_ops *op = ftrace_list;
59
60 /* in case someone actually ports this to alpha! */
61 read_barrier_depends();
62
63 while (op != &ftrace_list_end) {
64 /* silly alpha */
65 read_barrier_depends();
66 op->func(ip, parent_ip);
67 op = op->next;
68 };
69}
70
71/**
72 * clear_ftrace_function - reset the ftrace function
73 *
74 * This NULLs the ftrace function and in essence stops
75 * tracing. There may be a short lag before all CPUs stop calling the old function.
76 */
77void clear_ftrace_function(void)
78{
79 ftrace_trace_function = ftrace_stub;
80}
81
82static int __register_ftrace_function(struct ftrace_ops *ops)
83{
84 /* Should never be called by interrupts */
85 spin_lock(&ftrace_lock);
86
87 ops->next = ftrace_list;
88 /*
89 * We are entering ops into the ftrace_list but another
90 * CPU might be walking that list. We need to make sure
91 * the ops->next pointer is valid before another CPU sees
92 * the ops pointer included into the ftrace_list.
93 */
94 smp_wmb();
95 ftrace_list = ops;
96
97 if (ftrace_enabled) {
98 /*
99 * For one func, simply call it directly.
100 * For more than one func, call the chain.
101 */
102 if (ops->next == &ftrace_list_end)
103 ftrace_trace_function = ops->func;
104 else
105 ftrace_trace_function = ftrace_list_func;
106 }
107
108 spin_unlock(&ftrace_lock);
109
110 return 0;
111}
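
A standalone illustration of the publication order used in __register_ftrace_function(), with C11 release/acquire atomics standing in for the kernel's smp_wmb()/read_barrier_depends() pair (the ops structure and names here are hypothetical, not kernel code):

/* Sketch: fill in op->next first, then publish the new list head, so a
 * concurrent reader never sees a half-initialised node. */
#include <stdatomic.h>
#include <stdio.h>

struct ops {
    void (*func)(unsigned long ip);
    struct ops *next;
};

static void stub(unsigned long ip) { (void)ip; }

static struct ops list_end = { .func = stub, .next = NULL };
static _Atomic(struct ops *) list_head = &list_end;

static void register_op(struct ops *op)
{
    op->next = atomic_load_explicit(&list_head, memory_order_relaxed);
    /* release ordering plays the role of the kernel's smp_wmb() */
    atomic_store_explicit(&list_head, op, memory_order_release);
}

static void call_chain(unsigned long ip)
{
    struct ops *op = atomic_load_explicit(&list_head, memory_order_acquire);

    while (op != &list_end) {
        op->func(ip);
        op = op->next;
    }
}

static void hello(unsigned long ip) { printf("traced ip=%#lx\n", ip); }

int main(void)
{
    static struct ops my_op = { .func = hello };

    register_op(&my_op);
    call_chain(0xc0de);
    return 0;
}
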
112
113static int __unregister_ftrace_function(struct ftrace_ops *ops)
114{
115 struct ftrace_ops **p;
116 int ret = 0;
117
118 spin_lock(&ftrace_lock);
119
120 /*
121 * If we are removing the last function, then simply point
122 * to the ftrace_stub.
123 */
124 if (ftrace_list == ops && ops->next == &ftrace_list_end) {
125 ftrace_trace_function = ftrace_stub;
126 ftrace_list = &ftrace_list_end;
127 goto out;
128 }
129
130 for (p = &ftrace_list; *p != &ftrace_list_end; p = &(*p)->next)
131 if (*p == ops)
132 break;
133
134 if (*p != ops) {
135 ret = -1;
136 goto out;
137 }
138
139 *p = (*p)->next;
140
141 if (ftrace_enabled) {
142 /* If we only have one func left, then call that directly */
143 if (ftrace_list == &ftrace_list_end ||
144 ftrace_list->next == &ftrace_list_end)
145 ftrace_trace_function = ftrace_list->func;
146 }
147
148 out:
149 spin_unlock(&ftrace_lock);
150
151 return ret;
152}
153
154#ifdef CONFIG_DYNAMIC_FTRACE
155
156static struct task_struct *ftraced_task;
157
158enum {
159 FTRACE_ENABLE_CALLS = (1 << 0),
160 FTRACE_DISABLE_CALLS = (1 << 1),
161 FTRACE_UPDATE_TRACE_FUNC = (1 << 2),
162 FTRACE_ENABLE_MCOUNT = (1 << 3),
163 FTRACE_DISABLE_MCOUNT = (1 << 4),
164};
165
166static int ftrace_filtered;
167static int tracing_on;
168static int frozen_record_count;
169
170static struct hlist_head ftrace_hash[FTRACE_HASHSIZE];
171
172static DEFINE_PER_CPU(int, ftrace_shutdown_disable_cpu);
173
174static DEFINE_SPINLOCK(ftrace_shutdown_lock);
175static DEFINE_MUTEX(ftraced_lock);
176static DEFINE_MUTEX(ftrace_regex_lock);
177
178struct ftrace_page {
179 struct ftrace_page *next;
180 unsigned long index;
181 struct dyn_ftrace records[];
182};
183
184#define ENTRIES_PER_PAGE \
185 ((PAGE_SIZE - sizeof(struct ftrace_page)) / sizeof(struct dyn_ftrace))
186
187/* estimate from running different kernels */
188#define NR_TO_INIT 10000
189
190static struct ftrace_page *ftrace_pages_start;
191static struct ftrace_page *ftrace_pages;
192
193static int ftraced_trigger;
194static int ftraced_suspend;
195static int ftraced_stop;
196
197static int ftrace_record_suspend;
198
199static struct dyn_ftrace *ftrace_free_records;
200
201
202#ifdef CONFIG_KPROBES
203static inline void freeze_record(struct dyn_ftrace *rec)
204{
205 if (!(rec->flags & FTRACE_FL_FROZEN)) {
206 rec->flags |= FTRACE_FL_FROZEN;
207 frozen_record_count++;
208 }
209}
210
211static inline void unfreeze_record(struct dyn_ftrace *rec)
212{
213 if (rec->flags & FTRACE_FL_FROZEN) {
214 rec->flags &= ~FTRACE_FL_FROZEN;
215 frozen_record_count--;
216 }
217}
218
219static inline int record_frozen(struct dyn_ftrace *rec)
220{
221 return rec->flags & FTRACE_FL_FROZEN;
222}
223#else
224# define freeze_record(rec) ({ 0; })
225# define unfreeze_record(rec) ({ 0; })
226# define record_frozen(rec) ({ 0; })
227#endif /* CONFIG_KPROBES */
228
229int skip_trace(unsigned long ip)
230{
231 unsigned long fl;
232 struct dyn_ftrace *rec;
233 struct hlist_node *t;
234 struct hlist_head *head;
235
236 if (frozen_record_count == 0)
237 return 0;
238
239 head = &ftrace_hash[hash_long(ip, FTRACE_HASHBITS)];
240 hlist_for_each_entry_rcu(rec, t, head, node) {
241 if (rec->ip == ip) {
242 if (record_frozen(rec)) {
243 if (rec->flags & FTRACE_FL_FAILED)
244 return 1;
245
246 if (!(rec->flags & FTRACE_FL_CONVERTED))
247 return 1;
248
249 if (!tracing_on || !ftrace_enabled)
250 return 1;
251
252 if (ftrace_filtered) {
253 fl = rec->flags & (FTRACE_FL_FILTER |
254 FTRACE_FL_NOTRACE);
255 if (!fl || (fl & FTRACE_FL_NOTRACE))
256 return 1;
257 }
258 }
259 break;
260 }
261 }
262
263 return 0;
264}
265
266static inline int
267ftrace_ip_in_hash(unsigned long ip, unsigned long key)
268{
269 struct dyn_ftrace *p;
270 struct hlist_node *t;
271 int found = 0;
272
273 hlist_for_each_entry_rcu(p, t, &ftrace_hash[key], node) {
274 if (p->ip == ip) {
275 found = 1;
276 break;
277 }
278 }
279
280 return found;
281}
282
283static inline void
284ftrace_add_hash(struct dyn_ftrace *node, unsigned long key)
285{
286 hlist_add_head_rcu(&node->node, &ftrace_hash[key]);
287}
288
289/* called from kstop_machine */
290static inline void ftrace_del_hash(struct dyn_ftrace *node)
291{
292 hlist_del(&node->node);
293}
294
295static void ftrace_free_rec(struct dyn_ftrace *rec)
296{
297 /* no locking, only called from kstop_machine */
298
299 rec->ip = (unsigned long)ftrace_free_records;
300 ftrace_free_records = rec;
301 rec->flags |= FTRACE_FL_FREE;
302}
303
304static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip)
305{
306 struct dyn_ftrace *rec;
307
308 /* First check for freed records */
309 if (ftrace_free_records) {
310 rec = ftrace_free_records;
311
312 if (unlikely(!(rec->flags & FTRACE_FL_FREE))) {
313 WARN_ON_ONCE(1);
314 ftrace_free_records = NULL;
315 ftrace_disabled = 1;
316 ftrace_enabled = 0;
317 return NULL;
318 }
319
320 ftrace_free_records = (void *)rec->ip;
321 memset(rec, 0, sizeof(*rec));
322 return rec;
323 }
324
325 if (ftrace_pages->index == ENTRIES_PER_PAGE) {
326 if (!ftrace_pages->next)
327 return NULL;
328 ftrace_pages = ftrace_pages->next;
329 }
330
331 return &ftrace_pages->records[ftrace_pages->index++];
332}
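
A small userspace sketch (hypothetical pool size, not kernel code) of the recycling scheme used by ftrace_free_rec() and ftrace_alloc_dyn_node(): a freed record's ip field doubles as the next-free pointer, so no separate list node is needed:

/* Sketch: freed records are chained through their ip field and handed
 * out again before the backing pool grows. */
#include <stdio.h>
#include <string.h>

#define FL_FREE (1 << 0)

struct rec {
    unsigned long ip;      /* call site, or next free record when FL_FREE */
    unsigned long flags;
};

static struct rec pool[4];
static unsigned int pool_index;
static struct rec *free_records;

static void free_rec(struct rec *r)
{
    r->ip = (unsigned long)free_records;
    free_records = r;
    r->flags |= FL_FREE;
}

static struct rec *alloc_rec(unsigned long ip)
{
    struct rec *r;

    if (free_records) {                    /* recycle a freed slot first */
        r = free_records;
        free_records = (struct rec *)r->ip;
        memset(r, 0, sizeof(*r));
    } else if (pool_index < sizeof(pool) / sizeof(pool[0])) {
        r = &pool[pool_index++];
    } else {
        return NULL;                       /* pool exhausted */
    }
    r->ip = ip;
    return r;
}

int main(void)
{
    struct rec *a = alloc_rec(0x1000);
    struct rec *b = alloc_rec(0x2000);

    free_rec(a);
    printf("recycled: %d\n", alloc_rec(0x3000) == a);  /* prints 1 */
    (void)b;
    return 0;
}

Reusing the ip field keeps struct dyn_ftrace small, at the cost of having to check FTRACE_FL_FREE before trusting the value, which is exactly what ftrace_alloc_dyn_node() does above.
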
333
334static void
335ftrace_record_ip(unsigned long ip)
336{
337 struct dyn_ftrace *node;
338 unsigned long flags;
339 unsigned long key;
340 int resched;
341 int atomic;
342 int cpu;
343
344 if (!ftrace_enabled || ftrace_disabled)
345 return;
346
347 resched = need_resched();
348 preempt_disable_notrace();
349
350 /*
351 * We simply need to protect against recursion.
352 * Use the raw version of smp_processor_id and not
353 * __get_cpu_var which can call debug hooks that can
354 * cause a recursive crash here.
355 */
356 cpu = raw_smp_processor_id();
357 per_cpu(ftrace_shutdown_disable_cpu, cpu)++;
358 if (per_cpu(ftrace_shutdown_disable_cpu, cpu) != 1)
359 goto out;
360
361 if (unlikely(ftrace_record_suspend))
362 goto out;
363
364 key = hash_long(ip, FTRACE_HASHBITS);
365
366 WARN_ON_ONCE(key >= FTRACE_HASHSIZE);
367
368 if (ftrace_ip_in_hash(ip, key))
369 goto out;
370
371 atomic = irqs_disabled();
372
373 spin_lock_irqsave(&ftrace_shutdown_lock, flags);
374
375 /* This ip may have hit the hash before the lock */
376 if (ftrace_ip_in_hash(ip, key))
377 goto out_unlock;
378
379 node = ftrace_alloc_dyn_node(ip);
380 if (!node)
381 goto out_unlock;
382
383 node->ip = ip;
384
385 ftrace_add_hash(node, key);
386
387 ftraced_trigger = 1;
388
389 out_unlock:
390 spin_unlock_irqrestore(&ftrace_shutdown_lock, flags);
391 out:
392 per_cpu(ftrace_shutdown_disable_cpu, cpu)--;
393
394 /* prevent recursion with scheduler */
395 if (resched)
396 preempt_enable_no_resched_notrace();
397 else
398 preempt_enable_notrace();
399}
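
Stripped of the preemption and recursion guards, ftrace_record_ip() boils down to hashing the call-site address and recording it once; a standalone sketch with a toy hash function (hypothetical sizes, not hash_long(), not kernel code):

/* Sketch: record each call-site address once and raise a trigger so
 * the updater knows there is work to do. */
#include <stdio.h>

#define HASHBITS 4
#define HASHSIZE (1 << HASHBITS)

struct node { unsigned long ip; struct node *next; };

static struct node pool[64];
static unsigned int pool_used;
static struct node *hash[HASHSIZE];
static int trigger;

static unsigned int hash_ip(unsigned long ip)
{
    return (ip >> 4) & (HASHSIZE - 1);      /* toy hash, not hash_long() */
}

static void record_ip(unsigned long ip)
{
    unsigned int key = hash_ip(ip);
    struct node *n;

    for (n = hash[key]; n; n = n->next)
        if (n->ip == ip)
            return;                         /* already recorded */

    if (pool_used == sizeof(pool) / sizeof(pool[0]))
        return;                             /* out of records   */

    n = &pool[pool_used++];
    n->ip = ip;
    n->next = hash[key];
    hash[key] = n;
    trigger = 1;                            /* wake the updater */
}

int main(void)
{
    record_ip(0xc0100000);
    record_ip(0xc0100000);                  /* duplicate: ignored */
    record_ip(0xc0200040);
    printf("recorded %u sites, trigger=%d\n", pool_used, trigger);
    return 0;
}
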
400
401#define FTRACE_ADDR ((long)(ftrace_caller))
402
403static int
404__ftrace_replace_code(struct dyn_ftrace *rec,
405 unsigned char *old, unsigned char *new, int enable)
406{
407 unsigned long ip, fl;
408
409 ip = rec->ip;
410
411 if (ftrace_filtered && enable) {
412 /*
413 * If filtering is on:
414 *
415 * If this record is set to be filtered and
416 * is enabled then do nothing.
417 *
418 * If this record is set to be filtered and
419 * it is not enabled, enable it.
420 *
421 * If this record is not set to be filtered
422 * and it is not enabled do nothing.
423 *
424 * If this record is set not to trace then
425 * do nothing.
426 *
427 * If this record is set not to trace and
428 * it is enabled then disable it.
429 *
430 * If this record is not set to be filtered and
431 * it is enabled, disable it.
432 */
433
434 fl = rec->flags & (FTRACE_FL_FILTER | FTRACE_FL_NOTRACE |
435 FTRACE_FL_ENABLED);
436
437 if ((fl == (FTRACE_FL_FILTER | FTRACE_FL_ENABLED)) ||
438 (fl == (FTRACE_FL_FILTER | FTRACE_FL_NOTRACE)) ||
439 !fl || (fl == FTRACE_FL_NOTRACE))
440 return 0;
441
442 /*
443 * If it is enabled disable it,
444 * otherwise enable it!
445 */
446 if (fl & FTRACE_FL_ENABLED) {
447 /* swap new and old */
448 new = old;
449 old = ftrace_call_replace(ip, FTRACE_ADDR);
450 rec->flags &= ~FTRACE_FL_ENABLED;
451 } else {
452 new = ftrace_call_replace(ip, FTRACE_ADDR);
453 rec->flags |= FTRACE_FL_ENABLED;
454 }
455 } else {
456
457 if (enable) {
458 /*
459 * If this record is set not to trace and is
460 * not enabled, do nothing.
461 */
462 fl = rec->flags & (FTRACE_FL_NOTRACE | FTRACE_FL_ENABLED);
463 if (fl == FTRACE_FL_NOTRACE)
464 return 0;
465
466 new = ftrace_call_replace(ip, FTRACE_ADDR);
467 } else
468 old = ftrace_call_replace(ip, FTRACE_ADDR);
469
470 if (enable) {
471 if (rec->flags & FTRACE_FL_ENABLED)
472 return 0;
473 rec->flags |= FTRACE_FL_ENABLED;
474 } else {
475 if (!(rec->flags & FTRACE_FL_ENABLED))
476 return 0;
477 rec->flags &= ~FTRACE_FL_ENABLED;
478 }
479 }
480
481 return ftrace_modify_code(ip, old, new);
482}
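
The comment block in __ftrace_replace_code() enumerates the filtered cases in prose; the same decision table can be written as a small standalone function (flag values and names below are illustrative, not the kernel's):

/* Sketch of the "filtering on" decision: toggle a call site only when
 * its FILTER/NOTRACE/ENABLED flags disagree with the desired state. */
#include <stdio.h>

#define FL_FILTER  (1 << 0)
#define FL_ENABLED (1 << 1)
#define FL_NOTRACE (1 << 2)

/* returns +1 to enable the call site, -1 to disable it, 0 to leave it */
static int filtered_action(unsigned long flags)
{
    unsigned long fl = flags & (FL_FILTER | FL_NOTRACE | FL_ENABLED);

    if (fl == (FL_FILTER | FL_ENABLED) ||        /* already enabled */
        fl == (FL_FILTER | FL_NOTRACE) ||        /* notrace wins    */
        !fl || fl == FL_NOTRACE)                 /* nothing to do   */
        return 0;

    return (fl & FL_ENABLED) ? -1 : +1;          /* toggle the site */
}

int main(void)
{
    printf("%+d\n", filtered_action(FL_FILTER));                /* +1 */
    printf("%+d\n", filtered_action(FL_ENABLED));               /* -1 */
    printf("%+d\n", filtered_action(FL_FILTER | FL_ENABLED));   /*  0 */
    printf("%+d\n", filtered_action(FL_NOTRACE | FL_ENABLED));  /* -1 */
    return 0;
}
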
483
484static void ftrace_replace_code(int enable)
485{
486 int i, failed;
487 unsigned char *new = NULL, *old = NULL;
488 struct dyn_ftrace *rec;
489 struct ftrace_page *pg;
490
491 if (enable)
492 old = ftrace_nop_replace();
493 else
494 new = ftrace_nop_replace();
495
496 for (pg = ftrace_pages_start; pg; pg = pg->next) {
497 for (i = 0; i < pg->index; i++) {
498 rec = &pg->records[i];
499
500 /* don't modify code that has already faulted */
501 if (rec->flags & FTRACE_FL_FAILED)
502 continue;
503
504 /* ignore updates to this record's mcount site */
505 if (get_kprobe((void *)rec->ip)) {
506 freeze_record(rec);
507 continue;
508 } else {
509 unfreeze_record(rec);
510 }
511
512 failed = __ftrace_replace_code(rec, old, new, enable);
513 if (failed && (rec->flags & FTRACE_FL_CONVERTED)) {
514 rec->flags |= FTRACE_FL_FAILED;
515 if ((system_state == SYSTEM_BOOTING) ||
516 !core_kernel_text(rec->ip)) {
517 ftrace_del_hash(rec);
518 ftrace_free_rec(rec);
519 }
520 }
521 }
522 }
523}
524
525static void ftrace_shutdown_replenish(void)
526{
527 if (ftrace_pages->next)
528 return;
529
530 /* allocate another page */
531 ftrace_pages->next = (void *)get_zeroed_page(GFP_KERNEL);
532}
533
534static int
535ftrace_code_disable(struct dyn_ftrace *rec)
536{
537 unsigned long ip;
538 unsigned char *nop, *call;
539 int failed;
540
541 ip = rec->ip;
542
543 nop = ftrace_nop_replace();
544 call = ftrace_call_replace(ip, MCOUNT_ADDR);
545
546 failed = ftrace_modify_code(ip, call, nop);
547 if (failed) {
548 rec->flags |= FTRACE_FL_FAILED;
549 return 0;
550 }
551 return 1;
552}
553
554static int __ftrace_update_code(void *ignore);
555
556static int __ftrace_modify_code(void *data)
557{
558 unsigned long addr;
559 int *command = data;
560
561 if (*command & FTRACE_ENABLE_CALLS) {
562 /*
563 * Update any recorded ips now that we have the
564 * machine stopped
565 */
566 __ftrace_update_code(NULL);
567 ftrace_replace_code(1);
568 tracing_on = 1;
569 } else if (*command & FTRACE_DISABLE_CALLS) {
570 ftrace_replace_code(0);
571 tracing_on = 0;
572 }
573
574 if (*command & FTRACE_UPDATE_TRACE_FUNC)
575 ftrace_update_ftrace_func(ftrace_trace_function);
576
577 if (*command & FTRACE_ENABLE_MCOUNT) {
578 addr = (unsigned long)ftrace_record_ip;
579 ftrace_mcount_set(&addr);
580 } else if (*command & FTRACE_DISABLE_MCOUNT) {
581 addr = (unsigned long)ftrace_stub;
582 ftrace_mcount_set(&addr);
583 }
584
585 return 0;
586}
587
588static void ftrace_run_update_code(int command)
589{
590 stop_machine(__ftrace_modify_code, &command, NULL);
591}
592
593void ftrace_disable_daemon(void)
594{
595 /* Stop the daemon from calling kstop_machine */
596 mutex_lock(&ftraced_lock);
597 ftraced_stop = 1;
598 mutex_unlock(&ftraced_lock);
599
600 ftrace_force_update();
601}
602
603void ftrace_enable_daemon(void)
604{
605 mutex_lock(&ftraced_lock);
606 ftraced_stop = 0;
607 mutex_unlock(&ftraced_lock);
608
609 ftrace_force_update();
610}
611
612static ftrace_func_t saved_ftrace_func;
613
614static void ftrace_startup(void)
615{
616 int command = 0;
617
618 if (unlikely(ftrace_disabled))
619 return;
620
621 mutex_lock(&ftraced_lock);
622 ftraced_suspend++;
623 if (ftraced_suspend == 1)
624 command |= FTRACE_ENABLE_CALLS;
625
626 if (saved_ftrace_func != ftrace_trace_function) {
627 saved_ftrace_func = ftrace_trace_function;
628 command |= FTRACE_UPDATE_TRACE_FUNC;
629 }
630
631 if (!command || !ftrace_enabled)
632 goto out;
633
634 ftrace_run_update_code(command);
635 out:
636 mutex_unlock(&ftraced_lock);
637}
638
639static void ftrace_shutdown(void)
640{
641 int command = 0;
642
643 if (unlikely(ftrace_disabled))
644 return;
645
646 mutex_lock(&ftraced_lock);
647 ftraced_suspend--;
648 if (!ftraced_suspend)
649 command |= FTRACE_DISABLE_CALLS;
650
651 if (saved_ftrace_func != ftrace_trace_function) {
652 saved_ftrace_func = ftrace_trace_function;
653 command |= FTRACE_UPDATE_TRACE_FUNC;
654 }
655
656 if (!command || !ftrace_enabled)
657 goto out;
658
659 ftrace_run_update_code(command);
660 out:
661 mutex_unlock(&ftraced_lock);
662}
663
664static void ftrace_startup_sysctl(void)
665{
666 int command = FTRACE_ENABLE_MCOUNT;
667
668 if (unlikely(ftrace_disabled))
669 return;
670
671 mutex_lock(&ftraced_lock);
672 /* Force update next time */
673 saved_ftrace_func = NULL;
674 /* ftraced_suspend is true if we want ftrace running */
675 if (ftraced_suspend)
676 command |= FTRACE_ENABLE_CALLS;
677
678 ftrace_run_update_code(command);
679 mutex_unlock(&ftraced_lock);
680}
681
682static void ftrace_shutdown_sysctl(void)
683{
684 int command = FTRACE_DISABLE_MCOUNT;
685
686 if (unlikely(ftrace_disabled))
687 return;
688
689 mutex_lock(&ftraced_lock);
690 /* ftraced_suspend is true if ftrace is running */
691 if (ftraced_suspend)
692 command |= FTRACE_DISABLE_CALLS;
693
694 ftrace_run_update_code(command);
695 mutex_unlock(&ftraced_lock);
696}
697
698static cycle_t ftrace_update_time;
699static unsigned long ftrace_update_cnt;
700unsigned long ftrace_update_tot_cnt;
701
702static int __ftrace_update_code(void *ignore)
703{
704 int i, save_ftrace_enabled;
705 cycle_t start, stop;
706 struct dyn_ftrace *p;
707 struct hlist_node *t, *n;
708 struct hlist_head *head, temp_list;
709
710 /* Don't be recording funcs now */
711 ftrace_record_suspend++;
712 save_ftrace_enabled = ftrace_enabled;
713 ftrace_enabled = 0;
714
715 start = ftrace_now(raw_smp_processor_id());
716 ftrace_update_cnt = 0;
717
718 /* No locks needed, the machine is stopped! */
719 for (i = 0; i < FTRACE_HASHSIZE; i++) {
720 INIT_HLIST_HEAD(&temp_list);
721 head = &ftrace_hash[i];
722
723 /* all CPUs are stopped, we are safe to modify code */
724 hlist_for_each_entry_safe(p, t, n, head, node) {
725 /* Skip over failed records which have not been
726 * freed. */
727 if (p->flags & FTRACE_FL_FAILED)
728 continue;
729
730 /* Unconverted records are always at the head of the
731 * hash bucket. Once we encounter a converted record,
732 * simply skip over to the next bucket. Saves ftraced
733 * some processor cycles (ftrace does its bid for
734 * global warming :-p ). */
735 if (p->flags & (FTRACE_FL_CONVERTED))
736 break;
737
738 /* Ignore updates to this record's mcount site.
739 * Reintroduce this record at the head of this
740 * bucket to attempt to "convert" it again if
741 * the kprobe on it is unregistered before the
742 * next run. */
743 if (get_kprobe((void *)p->ip)) {
744 ftrace_del_hash(p);
745 INIT_HLIST_NODE(&p->node);
746 hlist_add_head(&p->node, &temp_list);
747 freeze_record(p);
748 continue;
749 } else {
750 unfreeze_record(p);
751 }
752
753 /* convert record (i.e, patch mcount-call with NOP) */
754 if (ftrace_code_disable(p)) {
755 p->flags |= FTRACE_FL_CONVERTED;
756 ftrace_update_cnt++;
757 } else {
758 if ((system_state == SYSTEM_BOOTING) ||
759 !core_kernel_text(p->ip)) {
760 ftrace_del_hash(p);
761 ftrace_free_rec(p);
762 }
763 }
764 }
765
766 hlist_for_each_entry_safe(p, t, n, &temp_list, node) {
767 hlist_del(&p->node);
768 INIT_HLIST_NODE(&p->node);
769 hlist_add_head(&p->node, head);
770 }
771 }
772
773 stop = ftrace_now(raw_smp_processor_id());
774 ftrace_update_time = stop - start;
775 ftrace_update_tot_cnt += ftrace_update_cnt;
776 ftraced_trigger = 0;
777
778 ftrace_enabled = save_ftrace_enabled;
779 ftrace_record_suspend--;
780
781 return 0;
782}
783
784static int ftrace_update_code(void)
785{
786 if (unlikely(ftrace_disabled) ||
787 !ftrace_enabled || !ftraced_trigger)
788 return 0;
789
790 stop_machine(__ftrace_update_code, NULL, NULL);
791
792 return 1;
793}
794
795static int ftraced(void *ignore)
796{
797 unsigned long usecs;
798
799 while (!kthread_should_stop()) {
800
801 set_current_state(TASK_INTERRUPTIBLE);
802
803 /* check once a second */
804 schedule_timeout(HZ);
805
806 if (unlikely(ftrace_disabled))
807 continue;
808
809 mutex_lock(&ftrace_sysctl_lock);
810 mutex_lock(&ftraced_lock);
811 if (!ftraced_suspend && !ftraced_stop &&
812 ftrace_update_code()) {
813 usecs = nsecs_to_usecs(ftrace_update_time);
814 if (ftrace_update_tot_cnt > 100000) {
815 ftrace_update_tot_cnt = 0;
816 pr_info("hm, dftrace overflow: %lu change%s"
817 " (%lu total) in %lu usec%s\n",
818 ftrace_update_cnt,
819 ftrace_update_cnt != 1 ? "s" : "",
820 ftrace_update_tot_cnt,
821 usecs, usecs != 1 ? "s" : "");
822 ftrace_disabled = 1;
823 WARN_ON_ONCE(1);
824 }
825 }
826 mutex_unlock(&ftraced_lock);
827 mutex_unlock(&ftrace_sysctl_lock);
828
829 ftrace_shutdown_replenish();
830 }
831 __set_current_state(TASK_RUNNING);
832 return 0;
833}
834
835static int __init ftrace_dyn_table_alloc(void)
836{
837 struct ftrace_page *pg;
838 int cnt;
839 int i;
840
841 /* allocate a few pages */
842 ftrace_pages_start = (void *)get_zeroed_page(GFP_KERNEL);
843 if (!ftrace_pages_start)
844 return -1;
845
846 /*
847 * Allocate a few more pages.
848 *
849 * TODO: have some parser search vmlinux before
850 * final linking to find all calls to ftrace.
851 * Then we can:
852 * a) know how many pages to allocate.
853 * and/or
854 * b) set up the table then.
855 *
856 * The dynamic code is still necessary for
857 * modules.
858 */
859
860 pg = ftrace_pages = ftrace_pages_start;
861
862 cnt = NR_TO_INIT / ENTRIES_PER_PAGE;
863
864 for (i = 0; i < cnt; i++) {
865 pg->next = (void *)get_zeroed_page(GFP_KERNEL);
866
867 /* If we fail, we'll try later anyway */
868 if (!pg->next)
869 break;
870
871 pg = pg->next;
872 }
873
874 return 0;
875}
876
877enum {
878 FTRACE_ITER_FILTER = (1 << 0),
879 FTRACE_ITER_CONT = (1 << 1),
880 FTRACE_ITER_NOTRACE = (1 << 2),
881 FTRACE_ITER_FAILURES = (1 << 3),
882};
883
884#define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */
885
886struct ftrace_iterator {
887 loff_t pos;
888 struct ftrace_page *pg;
889 unsigned idx;
890 unsigned flags;
891 unsigned char buffer[FTRACE_BUFF_MAX+1];
892 unsigned buffer_idx;
893 unsigned filtered;
894};
895
896static void *
897t_next(struct seq_file *m, void *v, loff_t *pos)
898{
899 struct ftrace_iterator *iter = m->private;
900 struct dyn_ftrace *rec = NULL;
901
902 (*pos)++;
903
904 retry:
905 if (iter->idx >= iter->pg->index) {
906 if (iter->pg->next) {
907 iter->pg = iter->pg->next;
908 iter->idx = 0;
909 goto retry;
910 }
911 } else {
912 rec = &iter->pg->records[iter->idx++];
913 if ((!(iter->flags & FTRACE_ITER_FAILURES) &&
914 (rec->flags & FTRACE_FL_FAILED)) ||
915
916 ((iter->flags & FTRACE_ITER_FAILURES) &&
917 (!(rec->flags & FTRACE_FL_FAILED) ||
918 (rec->flags & FTRACE_FL_FREE))) ||
919
920 ((iter->flags & FTRACE_ITER_FILTER) &&
921 !(rec->flags & FTRACE_FL_FILTER)) ||
922
923 ((iter->flags & FTRACE_ITER_NOTRACE) &&
924 !(rec->flags & FTRACE_FL_NOTRACE))) {
925 rec = NULL;
926 goto retry;
927 }
928 }
929
930 iter->pos = *pos;
931
932 return rec;
933}
934
935static void *t_start(struct seq_file *m, loff_t *pos)
936{
937 struct ftrace_iterator *iter = m->private;
938 void *p = NULL;
939 loff_t l = -1;
940
941 if (*pos != iter->pos) {
942 for (p = t_next(m, p, &l); p && l < *pos; p = t_next(m, p, &l))
943 ;
944 } else {
945 l = *pos;
946 p = t_next(m, p, &l);
947 }
948
949 return p;
950}
951
952static void t_stop(struct seq_file *m, void *p)
953{
954}
955
956static int t_show(struct seq_file *m, void *v)
957{
958 struct dyn_ftrace *rec = v;
959 char str[KSYM_SYMBOL_LEN];
960
961 if (!rec)
962 return 0;
963
964 kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
965
966 seq_printf(m, "%s\n", str);
967
968 return 0;
969}
970
971static struct seq_operations show_ftrace_seq_ops = {
972 .start = t_start,
973 .next = t_next,
974 .stop = t_stop,
975 .show = t_show,
976};
977
978static int
979ftrace_avail_open(struct inode *inode, struct file *file)
980{
981 struct ftrace_iterator *iter;
982 int ret;
983
984 if (unlikely(ftrace_disabled))
985 return -ENODEV;
986
987 iter = kzalloc(sizeof(*iter), GFP_KERNEL);
988 if (!iter)
989 return -ENOMEM;
990
991 iter->pg = ftrace_pages_start;
992 iter->pos = -1;
993
994 ret = seq_open(file, &show_ftrace_seq_ops);
995 if (!ret) {
996 struct seq_file *m = file->private_data;
997
998 m->private = iter;
999 } else {
1000 kfree(iter);
1001 }
1002
1003 return ret;
1004}
1005
1006int ftrace_avail_release(struct inode *inode, struct file *file)
1007{
1008 struct seq_file *m = (struct seq_file *)file->private_data;
1009 struct ftrace_iterator *iter = m->private;
1010
1011 seq_release(inode, file);
1012 kfree(iter);
1013
1014 return 0;
1015}
1016
1017static int
1018ftrace_failures_open(struct inode *inode, struct file *file)
1019{
1020 int ret;
1021 struct seq_file *m;
1022 struct ftrace_iterator *iter;
1023
1024 ret = ftrace_avail_open(inode, file);
1025 if (!ret) {
1026 m = (struct seq_file *)file->private_data;
1027 iter = (struct ftrace_iterator *)m->private;
1028 iter->flags = FTRACE_ITER_FAILURES;
1029 }
1030
1031 return ret;
1032}
1033
1034
1035static void ftrace_filter_reset(int enable)
1036{
1037 struct ftrace_page *pg;
1038 struct dyn_ftrace *rec;
1039 unsigned long type = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE;
1040 unsigned i;
1041
1042 /* keep kstop machine from running */
1043 preempt_disable();
1044 if (enable)
1045 ftrace_filtered = 0;
1046 pg = ftrace_pages_start;
1047 while (pg) {
1048 for (i = 0; i < pg->index; i++) {
1049 rec = &pg->records[i];
1050 if (rec->flags & FTRACE_FL_FAILED)
1051 continue;
1052 rec->flags &= ~type;
1053 }
1054 pg = pg->next;
1055 }
1056 preempt_enable();
1057}
1058
1059static int
1060ftrace_regex_open(struct inode *inode, struct file *file, int enable)
1061{
1062 struct ftrace_iterator *iter;
1063 int ret = 0;
1064
1065 if (unlikely(ftrace_disabled))
1066 return -ENODEV;
1067
1068 iter = kzalloc(sizeof(*iter), GFP_KERNEL);
1069 if (!iter)
1070 return -ENOMEM;
1071
1072 mutex_lock(&ftrace_regex_lock);
1073 if ((file->f_mode & FMODE_WRITE) &&
1074 !(file->f_flags & O_APPEND))
1075 ftrace_filter_reset(enable);
1076
1077 if (file->f_mode & FMODE_READ) {
1078 iter->pg = ftrace_pages_start;
1079 iter->pos = -1;
1080 iter->flags = enable ? FTRACE_ITER_FILTER :
1081 FTRACE_ITER_NOTRACE;
1082
1083 ret = seq_open(file, &show_ftrace_seq_ops);
1084 if (!ret) {
1085 struct seq_file *m = file->private_data;
1086 m->private = iter;
1087 } else
1088 kfree(iter);
1089 } else
1090 file->private_data = iter;
1091 mutex_unlock(&ftrace_regex_lock);
1092
1093 return ret;
1094}
1095
1096static int
1097ftrace_filter_open(struct inode *inode, struct file *file)
1098{
1099 return ftrace_regex_open(inode, file, 1);
1100}
1101
1102static int
1103ftrace_notrace_open(struct inode *inode, struct file *file)
1104{
1105 return ftrace_regex_open(inode, file, 0);
1106}
1107
1108static ssize_t
1109ftrace_regex_read(struct file *file, char __user *ubuf,
1110 size_t cnt, loff_t *ppos)
1111{
1112 if (file->f_mode & FMODE_READ)
1113 return seq_read(file, ubuf, cnt, ppos);
1114 else
1115 return -EPERM;
1116}
1117
1118static loff_t
1119ftrace_regex_lseek(struct file *file, loff_t offset, int origin)
1120{
1121 loff_t ret;
1122
1123 if (file->f_mode & FMODE_READ)
1124 ret = seq_lseek(file, offset, origin);
1125 else
1126 file->f_pos = ret = 1;
1127
1128 return ret;
1129}
1130
1131enum {
1132 MATCH_FULL,
1133 MATCH_FRONT_ONLY,
1134 MATCH_MIDDLE_ONLY,
1135 MATCH_END_ONLY,
1136};
1137
1138static void
1139ftrace_match(unsigned char *buff, int len, int enable)
1140{
1141 char str[KSYM_SYMBOL_LEN];
1142 char *search = NULL;
1143 struct ftrace_page *pg;
1144 struct dyn_ftrace *rec;
1145 int type = MATCH_FULL;
1146 unsigned long flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE;
1147 unsigned i, match = 0, search_len = 0;
1148
1149 for (i = 0; i < len; i++) {
1150 if (buff[i] == '*') {
1151 if (!i) {
1152 search = buff + i + 1;
1153 type = MATCH_END_ONLY;
1154 search_len = len - (i + 1);
1155 } else {
1156 if (type == MATCH_END_ONLY) {
1157 type = MATCH_MIDDLE_ONLY;
1158 } else {
1159 match = i;
1160 type = MATCH_FRONT_ONLY;
1161 }
1162 buff[i] = 0;
1163 break;
1164 }
1165 }
1166 }
1167
1168 /* keep kstop machine from running */
1169 preempt_disable();
1170 if (enable)
1171 ftrace_filtered = 1;
1172 pg = ftrace_pages_start;
1173 while (pg) {
1174 for (i = 0; i < pg->index; i++) {
1175 int matched = 0;
1176 char *ptr;
1177
1178 rec = &pg->records[i];
1179 if (rec->flags & FTRACE_FL_FAILED)
1180 continue;
1181 kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
1182 switch (type) {
1183 case MATCH_FULL:
1184 if (strcmp(str, buff) == 0)
1185 matched = 1;
1186 break;
1187 case MATCH_FRONT_ONLY:
1188 if (memcmp(str, buff, match) == 0)
1189 matched = 1;
1190 break;
1191 case MATCH_MIDDLE_ONLY:
1192 if (strstr(str, search))
1193 matched = 1;
1194 break;
1195 case MATCH_END_ONLY:
1196 ptr = strstr(str, search);
1197 if (ptr && (ptr[search_len] == 0))
1198 matched = 1;
1199 break;
1200 }
1201 if (matched)
1202 rec->flags |= flag;
1203 }
1204 pg = pg->next;
1205 }
1206 preempt_enable();
1207}
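
The wildcard handling in ftrace_match() supports exactly one '*' position: full, prefix, suffix, or substring match. A standalone sketch of that classification (userspace C, hypothetical helper names, not kernel code):

/* Sketch: a single '*' selects the match type, then plain string
 * functions do the comparison. */
#include <stdio.h>
#include <string.h>

enum { MATCH_FULL, MATCH_FRONT_ONLY, MATCH_MIDDLE_ONLY, MATCH_END_ONLY };

static int classify(char *buff, char **search, int *search_len)
{
    int i, len = strlen(buff), type = MATCH_FULL;

    *search = buff;
    *search_len = len;
    for (i = 0; i < len; i++) {
        if (buff[i] != '*')
            continue;
        if (!i) {                    /* "*foo"  -> suffix match  */
            *search = buff + 1;
            *search_len = len - 1;
            type = MATCH_END_ONLY;
        } else {                     /* "foo*"  -> prefix match  */
                                     /* "*foo*" -> substring     */
            type = (type == MATCH_END_ONLY) ? MATCH_MIDDLE_ONLY
                                            : MATCH_FRONT_ONLY;
            buff[i] = '\0';
            *search_len = strlen(*search);
            break;
        }
    }
    return type;
}

static int matches(const char *name, char *pattern)
{
    char *search;
    int len, type = classify(pattern, &search, &len);
    const char *p;

    switch (type) {
    case MATCH_FULL:        return strcmp(name, search) == 0;
    case MATCH_FRONT_ONLY:  return strncmp(name, search, len) == 0;
    case MATCH_MIDDLE_ONLY: return strstr(name, search) != NULL;
    case MATCH_END_ONLY:
        p = strstr(name, search);
        return p && p[len] == '\0';
    }
    return 0;
}

int main(void)
{
    char pat1[] = "sched*", pat2[] = "*lock", pat3[] = "*idle*";

    printf("%d %d %d\n",                    /* prints: 1 1 1 */
           matches("schedule", pat1),
           matches("spin_lock", pat2),
           matches("cpu_idle_loop", pat3));
    return 0;
}
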
1208
1209static ssize_t
1210ftrace_regex_write(struct file *file, const char __user *ubuf,
1211 size_t cnt, loff_t *ppos, int enable)
1212{
1213 struct ftrace_iterator *iter;
1214 char ch;
1215 size_t read = 0;
1216 ssize_t ret;
1217
1218 if (!cnt || cnt < 0)
1219 return 0;
1220
1221 mutex_lock(&ftrace_regex_lock);
1222
1223 if (file->f_mode & FMODE_READ) {
1224 struct seq_file *m = file->private_data;
1225 iter = m->private;
1226 } else
1227 iter = file->private_data;
1228
1229 if (!*ppos) {
1230 iter->flags &= ~FTRACE_ITER_CONT;
1231 iter->buffer_idx = 0;
1232 }
1233
1234 ret = get_user(ch, ubuf++);
1235 if (ret)
1236 goto out;
1237 read++;
1238 cnt--;
1239
1240 if (!(iter->flags & ~FTRACE_ITER_CONT)) {
1241 /* skip white space */
1242 while (cnt && isspace(ch)) {
1243 ret = get_user(ch, ubuf++);
1244 if (ret)
1245 goto out;
1246 read++;
1247 cnt--;
1248 }
1249
1250 if (isspace(ch)) {
1251 file->f_pos += read;
1252 ret = read;
1253 goto out;
1254 }
1255
1256 iter->buffer_idx = 0;
1257 }
1258
1259 while (cnt && !isspace(ch)) {
1260 if (iter->buffer_idx < FTRACE_BUFF_MAX)
1261 iter->buffer[iter->buffer_idx++] = ch;
1262 else {
1263 ret = -EINVAL;
1264 goto out;
1265 }
1266 ret = get_user(ch, ubuf++);
1267 if (ret)
1268 goto out;
1269 read++;
1270 cnt--;
1271 }
1272
1273 if (isspace(ch)) {
1274 iter->filtered++;
1275 iter->buffer[iter->buffer_idx] = 0;
1276 ftrace_match(iter->buffer, iter->buffer_idx, enable);
1277 iter->buffer_idx = 0;
1278 } else
1279 iter->flags |= FTRACE_ITER_CONT;
1280
1281
1282 file->f_pos += read;
1283
1284 ret = read;
1285 out:
1286 mutex_unlock(&ftrace_regex_lock);
1287
1288 return ret;
1289}
1290
1291static ssize_t
1292ftrace_filter_write(struct file *file, const char __user *ubuf,
1293 size_t cnt, loff_t *ppos)
1294{
1295 return ftrace_regex_write(file, ubuf, cnt, ppos, 1);
1296}
1297
1298static ssize_t
1299ftrace_notrace_write(struct file *file, const char __user *ubuf,
1300 size_t cnt, loff_t *ppos)
1301{
1302 return ftrace_regex_write(file, ubuf, cnt, ppos, 0);
1303}
1304
1305static void
1306ftrace_set_regex(unsigned char *buf, int len, int reset, int enable)
1307{
1308 if (unlikely(ftrace_disabled))
1309 return;
1310
1311 mutex_lock(&ftrace_regex_lock);
1312 if (reset)
1313 ftrace_filter_reset(enable);
1314 if (buf)
1315 ftrace_match(buf, len, enable);
1316 mutex_unlock(&ftrace_regex_lock);
1317}
1318
1319/**
1320 * ftrace_set_filter - set a function to filter on in ftrace
1321 * @buf - the string that holds the function filter text.
1322 * @len - the length of the string.
1323 * @reset - non-zero to reset all filters before applying this filter.
1324 *
1325 * Filters denote which functions should be enabled when tracing is enabled.
1326 * If @buf is NULL and reset is set, all functions will be enabled for tracing.
1327 */
1328void ftrace_set_filter(unsigned char *buf, int len, int reset)
1329{
1330 ftrace_set_regex(buf, len, reset, 1);
1331}
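A minimal caller sketch for the interface documented above (not part of the patch); the pattern lives in a writable array because ftrace_match() NUL-terminates the buffer at the '*':

static void example_trace_only_sched(void)
{
	unsigned char pat[] = "sched*";

	/* reset any existing filter, then trace only sched* functions */
	ftrace_set_filter(pat, sizeof(pat) - 1, 1);
}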
1332
1333/**
1334 * ftrace_set_notrace - set a function to not trace in ftrace
1335 * @buf - the string that holds the function notrace text.
1336 * @len - the length of the string.
1337 * @reset - non-zero to reset all filters before applying this filter.
1338 *
1339 * Notrace Filters denote which functions should not be enabled when tracing
1340 * is enabled. If @buf is NULL and reset is set, all functions will be enabled
1341 * for tracing.
1342 */
1343void ftrace_set_notrace(unsigned char *buf, int len, int reset)
1344{
1345 ftrace_set_regex(buf, len, reset, 0);
1346}
1347
1348static int
1349ftrace_regex_release(struct inode *inode, struct file *file, int enable)
1350{
1351 struct seq_file *m = (struct seq_file *)file->private_data;
1352 struct ftrace_iterator *iter;
1353
1354 mutex_lock(&ftrace_regex_lock);
1355 if (file->f_mode & FMODE_READ) {
1356 iter = m->private;
1357
1358 seq_release(inode, file);
1359 } else
1360 iter = file->private_data;
1361
1362 if (iter->buffer_idx) {
1363 iter->filtered++;
1364 iter->buffer[iter->buffer_idx] = 0;
1365 ftrace_match(iter->buffer, iter->buffer_idx, enable);
1366 }
1367
1368 mutex_lock(&ftrace_sysctl_lock);
1369 mutex_lock(&ftraced_lock);
1370 if (iter->filtered && ftraced_suspend && ftrace_enabled)
1371 ftrace_run_update_code(FTRACE_ENABLE_CALLS);
1372 mutex_unlock(&ftraced_lock);
1373 mutex_unlock(&ftrace_sysctl_lock);
1374
1375 kfree(iter);
1376 mutex_unlock(&ftrace_regex_lock);
1377 return 0;
1378}
1379
1380static int
1381ftrace_filter_release(struct inode *inode, struct file *file)
1382{
1383 return ftrace_regex_release(inode, file, 1);
1384}
1385
1386static int
1387ftrace_notrace_release(struct inode *inode, struct file *file)
1388{
1389 return ftrace_regex_release(inode, file, 0);
1390}
1391
1392static ssize_t
1393ftraced_read(struct file *filp, char __user *ubuf,
1394 size_t cnt, loff_t *ppos)
1395{
1396 /* don't worry about races */
1397 char *buf = ftraced_stop ? "disabled\n" : "enabled\n";
1398 int r = strlen(buf);
1399
1400 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
1401}
1402
1403static ssize_t
1404ftraced_write(struct file *filp, const char __user *ubuf,
1405 size_t cnt, loff_t *ppos)
1406{
1407 char buf[64];
1408 long val;
1409 int ret;
1410
1411 if (cnt >= sizeof(buf))
1412 return -EINVAL;
1413
1414 if (copy_from_user(&buf, ubuf, cnt))
1415 return -EFAULT;
1416
1417 if (strncmp(buf, "enable", 6) == 0)
1418 val = 1;
1419 else if (strncmp(buf, "disable", 7) == 0)
1420 val = 0;
1421 else {
1422 buf[cnt] = 0;
1423
1424 ret = strict_strtoul(buf, 10, &val);
1425 if (ret < 0)
1426 return ret;
1427
1428 val = !!val;
1429 }
1430
1431 if (val)
1432 ftrace_enable_daemon();
1433 else
1434 ftrace_disable_daemon();
1435
1436 filp->f_pos += cnt;
1437
1438 return cnt;
1439}
1440
1441static struct file_operations ftrace_avail_fops = {
1442 .open = ftrace_avail_open,
1443 .read = seq_read,
1444 .llseek = seq_lseek,
1445 .release = ftrace_avail_release,
1446};
1447
1448static struct file_operations ftrace_failures_fops = {
1449 .open = ftrace_failures_open,
1450 .read = seq_read,
1451 .llseek = seq_lseek,
1452 .release = ftrace_avail_release,
1453};
1454
1455static struct file_operations ftrace_filter_fops = {
1456 .open = ftrace_filter_open,
1457 .read = ftrace_regex_read,
1458 .write = ftrace_filter_write,
1459 .llseek = ftrace_regex_lseek,
1460 .release = ftrace_filter_release,
1461};
1462
1463static struct file_operations ftrace_notrace_fops = {
1464 .open = ftrace_notrace_open,
1465 .read = ftrace_regex_read,
1466 .write = ftrace_notrace_write,
1467 .llseek = ftrace_regex_lseek,
1468 .release = ftrace_notrace_release,
1469};
1470
1471static struct file_operations ftraced_fops = {
1472 .open = tracing_open_generic,
1473 .read = ftraced_read,
1474 .write = ftraced_write,
1475};
1476
1477/**
1478 * ftrace_force_update - force an update to all recording ftrace functions
1479 */
1480int ftrace_force_update(void)
1481{
1482 int ret = 0;
1483
1484 if (unlikely(ftrace_disabled))
1485 return -ENODEV;
1486
1487 mutex_lock(&ftrace_sysctl_lock);
1488 mutex_lock(&ftraced_lock);
1489
1490 /*
1491 * If ftraced_trigger is not set, then there is nothing
1492 * to update.
1493 */
1494 if (ftraced_trigger && !ftrace_update_code())
1495 ret = -EBUSY;
1496
1497 mutex_unlock(&ftraced_lock);
1498 mutex_unlock(&ftrace_sysctl_lock);
1499
1500 return ret;
1501}
1502
1503static void ftrace_force_shutdown(void)
1504{
1505 struct task_struct *task;
1506 int command = FTRACE_DISABLE_CALLS | FTRACE_UPDATE_TRACE_FUNC;
1507
1508 mutex_lock(&ftraced_lock);
1509 task = ftraced_task;
1510 ftraced_task = NULL;
1511 ftraced_suspend = -1;
1512 ftrace_run_update_code(command);
1513 mutex_unlock(&ftraced_lock);
1514
1515 if (task)
1516 kthread_stop(task);
1517}
1518
1519static __init int ftrace_init_debugfs(void)
1520{
1521 struct dentry *d_tracer;
1522 struct dentry *entry;
1523
1524 d_tracer = tracing_init_dentry();
1525
1526 entry = debugfs_create_file("available_filter_functions", 0444,
1527 d_tracer, NULL, &ftrace_avail_fops);
1528 if (!entry)
1529 pr_warning("Could not create debugfs "
1530 "'available_filter_functions' entry\n");
1531
1532 entry = debugfs_create_file("failures", 0444,
1533 d_tracer, NULL, &ftrace_failures_fops);
1534 if (!entry)
1535 pr_warning("Could not create debugfs 'failures' entry\n");
1536
1537 entry = debugfs_create_file("set_ftrace_filter", 0644, d_tracer,
1538 NULL, &ftrace_filter_fops);
1539 if (!entry)
1540 pr_warning("Could not create debugfs "
1541 "'set_ftrace_filter' entry\n");
1542
1543 entry = debugfs_create_file("set_ftrace_notrace", 0644, d_tracer,
1544 NULL, &ftrace_notrace_fops);
1545 if (!entry)
1546 pr_warning("Could not create debugfs "
1547 "'set_ftrace_notrace' entry\n");
1548
1549 entry = debugfs_create_file("ftraced_enabled", 0644, d_tracer,
1550 NULL, &ftraced_fops);
1551 if (!entry)
1552 pr_warning("Could not create debugfs "
1553 "'ftraced_enabled' entry\n");
1554 return 0;
1555}
1556
1557fs_initcall(ftrace_init_debugfs);
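A hedged userspace sketch of driving the set_ftrace_filter file created above. The debugfs mount point is an assumption (commonly /sys/kernel/debug), and patterns are separated by whitespace, matching the parsing in ftrace_regex_write():

#include <fcntl.h>
#include <string.h>
#include <unistd.h>

static int write_ftrace_filter(const char *patterns)
{
	/* path assumes debugfs is mounted at /sys/kernel/debug */
	int fd = open("/sys/kernel/debug/tracing/set_ftrace_filter", O_WRONLY);

	if (fd < 0)
		return -1;
	/* e.g. patterns = "sched* *rcu*\n" */
	if (write(fd, patterns, strlen(patterns)) < 0) {
		close(fd);
		return -1;
	}
	return close(fd);
}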
1558
1559static int __init ftrace_dynamic_init(void)
1560{
1561 struct task_struct *p;
1562 unsigned long addr;
1563 int ret;
1564
1565 addr = (unsigned long)ftrace_record_ip;
1566
1567 stop_machine(ftrace_dyn_arch_init, &addr, NULL);
1568
1569 /* ftrace_dyn_arch_init places the return code in addr */
1570 if (addr) {
1571 ret = (int)addr;
1572 goto failed;
1573 }
1574
1575 ret = ftrace_dyn_table_alloc();
1576 if (ret)
1577 goto failed;
1578
1579 p = kthread_run(ftraced, NULL, "ftraced");
1580 if (IS_ERR(p)) {
1581 ret = -1;
1582 goto failed;
1583 }
1584
1585 last_ftrace_enabled = ftrace_enabled = 1;
1586 ftraced_task = p;
1587
1588 return 0;
1589
1590 failed:
1591 ftrace_disabled = 1;
1592 return ret;
1593}
1594
1595core_initcall(ftrace_dynamic_init);
1596#else
1597# define ftrace_startup() do { } while (0)
1598# define ftrace_shutdown() do { } while (0)
1599# define ftrace_startup_sysctl() do { } while (0)
1600# define ftrace_shutdown_sysctl() do { } while (0)
1601# define ftrace_force_shutdown() do { } while (0)
1602#endif /* CONFIG_DYNAMIC_FTRACE */
1603
1604/**
1605 * ftrace_kill_atomic - kill ftrace from critical sections
1606 *
1607 * This function should be used by panic code. It stops ftrace
1608 * but in a not so nice way. If you need to simply kill ftrace
1609 * from a non-atomic section, use ftrace_kill.
1610 */
1611void ftrace_kill_atomic(void)
1612{
1613 ftrace_disabled = 1;
1614 ftrace_enabled = 0;
1615#ifdef CONFIG_DYNAMIC_FTRACE
1616 ftraced_suspend = -1;
1617#endif
1618 clear_ftrace_function();
1619}
1620
1621/**
1622 * ftrace_kill - totally shutdown ftrace
1623 *
1624 * This is a safety measure. If something was detected that seems
1625 * wrong, calling this function will keep ftrace from doing
1626 * any more modifications or updates. It is used when
1627 * something has gone wrong.
1628 */
1629void ftrace_kill(void)
1630{
1631 mutex_lock(&ftrace_sysctl_lock);
1632 ftrace_disabled = 1;
1633 ftrace_enabled = 0;
1634
1635 clear_ftrace_function();
1636 mutex_unlock(&ftrace_sysctl_lock);
1637
1638 /* Try to totally disable ftrace */
1639 ftrace_force_shutdown();
1640}
1641
1642/**
1643 * register_ftrace_function - register a function for profiling
1644 * @ops - ops structure that holds the function for profiling.
1645 *
1646 * Register a function to be called by all functions in the
1647 * kernel.
1648 *
1649 * Note: @ops->func and all the functions it calls must be labeled
1650 * with "notrace", otherwise it will go into a
1651 * recursive loop.
1652 */
1653int register_ftrace_function(struct ftrace_ops *ops)
1654{
1655 int ret;
1656
1657 if (unlikely(ftrace_disabled))
1658 return -1;
1659
1660 mutex_lock(&ftrace_sysctl_lock);
1661 ret = __register_ftrace_function(ops);
1662 ftrace_startup();
1663 mutex_unlock(&ftrace_sysctl_lock);
1664
1665 return ret;
1666}
1667
1668/**
1669 * unregister_ftrace_function - unregister a function for profiling.
1670 * @ops - ops structure that holds the function to unregister
1671 *
1672 * Unregister a function that was added to be called by ftrace profiling.
1673 */
1674int unregister_ftrace_function(struct ftrace_ops *ops)
1675{
1676 int ret;
1677
1678 mutex_lock(&ftrace_sysctl_lock);
1679 ret = __unregister_ftrace_function(ops);
1680 ftrace_shutdown();
1681 mutex_unlock(&ftrace_sysctl_lock);
1682
1683 return ret;
1684}
1685
1686int
1687ftrace_enable_sysctl(struct ctl_table *table, int write,
1688 struct file *file, void __user *buffer, size_t *lenp,
1689 loff_t *ppos)
1690{
1691 int ret;
1692
1693 if (unlikely(ftrace_disabled))
1694 return -ENODEV;
1695
1696 mutex_lock(&ftrace_sysctl_lock);
1697
1698 ret = proc_dointvec(table, write, file, buffer, lenp, ppos);
1699
1700 if (ret || !write || (last_ftrace_enabled == ftrace_enabled))
1701 goto out;
1702
1703 last_ftrace_enabled = ftrace_enabled;
1704
1705 if (ftrace_enabled) {
1706
1707 ftrace_startup_sysctl();
1708
1709 /* we are starting ftrace again */
1710 if (ftrace_list != &ftrace_list_end) {
1711 if (ftrace_list->next == &ftrace_list_end)
1712 ftrace_trace_function = ftrace_list->func;
1713 else
1714 ftrace_trace_function = ftrace_list_func;
1715 }
1716
1717 } else {
1718 /* stopping ftrace calls (just send to ftrace_stub) */
1719 ftrace_trace_function = ftrace_stub;
1720
1721 ftrace_shutdown_sysctl();
1722 }
1723
1724 out:
1725 mutex_unlock(&ftrace_sysctl_lock);
1726 return ret;
1727}
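For reference, this routine is the proc handler behind the ftrace_enabled sysctl; on kernels of this vintage it is typically reachable as /proc/sys/kernel/ftrace_enabled, so writing 0 or 1 there funnels into the startup/shutdown paths above (the sysctl wiring itself lives in kernel/sysctl.c and is not part of this hunk).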
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
new file mode 100644
index 000000000000..8f3fb3db61c3
--- /dev/null
+++ b/kernel/trace/trace.c
@@ -0,0 +1,3157 @@
1/*
2 * ring buffer based function tracer
3 *
4 * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com>
5 * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com>
6 *
7 * Originally taken from the RT patch by:
8 * Arnaldo Carvalho de Melo <acme@redhat.com>
9 *
10 * Based on code from the latency_tracer, that is:
11 * Copyright (C) 2004-2006 Ingo Molnar
12 * Copyright (C) 2004 William Lee Irwin III
13 */
14#include <linux/utsrelease.h>
15#include <linux/kallsyms.h>
16#include <linux/seq_file.h>
17#include <linux/debugfs.h>
18#include <linux/pagemap.h>
19#include <linux/hardirq.h>
20#include <linux/linkage.h>
21#include <linux/uaccess.h>
22#include <linux/ftrace.h>
23#include <linux/module.h>
24#include <linux/percpu.h>
25#include <linux/ctype.h>
26#include <linux/init.h>
27#include <linux/poll.h>
28#include <linux/gfp.h>
29#include <linux/fs.h>
30#include <linux/kprobes.h>
31#include <linux/writeback.h>
32
33#include <linux/stacktrace.h>
34
35#include "trace.h"
36
37unsigned long __read_mostly tracing_max_latency = (cycle_t)ULONG_MAX;
38unsigned long __read_mostly tracing_thresh;
39
40static unsigned long __read_mostly tracing_nr_buffers;
41static cpumask_t __read_mostly tracing_buffer_mask;
42
43#define for_each_tracing_cpu(cpu) \
44 for_each_cpu_mask(cpu, tracing_buffer_mask)
45
46static int trace_alloc_page(void);
47static int trace_free_page(void);
48
49static int tracing_disabled = 1;
50
51static unsigned long tracing_pages_allocated;
52
53long
54ns2usecs(cycle_t nsec)
55{
56 nsec += 500;
57 do_div(nsec, 1000);
58 return nsec;
59}
60
61cycle_t ftrace_now(int cpu)
62{
63 return cpu_clock(cpu);
64}
65
66/*
67 * The global_trace is the descriptor that holds the tracing
68 * buffers for the live tracing. For each CPU, it contains
69 * a linked list of pages that will store trace entries. The
70 * page descriptor of the pages in memory is used to hold
71 * the linked list by linking the lru item in the page descriptor
72 * to each of the pages in the buffer per CPU.
73 *
74 * For each active CPU there is a data field that holds the
75 * pages for the buffer for that CPU. Each CPU has the same number
76 * of pages allocated for its buffer.
77 */
78static struct trace_array global_trace;
79
80static DEFINE_PER_CPU(struct trace_array_cpu, global_trace_cpu);
81
82/*
83 * The max_tr is used to snapshot the global_trace when a maximum
84 * latency is reached. Some tracers will use this to store a maximum
85 * trace while it continues examining live traces.
86 *
87 * The buffers for the max_tr are set up the same as the global_trace.
88 * When a snapshot is taken, the linked list of the max_tr is swapped
89 * with the linked list of the global_trace and the buffers are reset for
90 * the global_trace so the tracing can continue.
91 */
92static struct trace_array max_tr;
93
94static DEFINE_PER_CPU(struct trace_array_cpu, max_data);
95
96/* tracer_enabled is used to toggle activation of a tracer */
97static int tracer_enabled = 1;
98
99/* function tracing enabled */
100int ftrace_function_enabled;
101
102/*
103 * trace_nr_entries is the number of entries that is allocated
104 * for a buffer. Note, the number of entries is always rounded
105 * to ENTRIES_PER_PAGE.
106 */
107static unsigned long trace_nr_entries = 65536UL;
108
110/* trace_types holds a linked list of available tracers. */
110static struct tracer *trace_types __read_mostly;
111
112/* current_trace points to the tracer that is currently active */
113static struct tracer *current_trace __read_mostly;
114
115/*
116 * max_tracer_type_len is used to simplify the allocation of
117 * buffers used to read tracer names from userspace. We keep track of
118 * the longest tracer name registered.
119 */
120static int max_tracer_type_len;
121
122/*
123 * trace_types_lock is used to protect the trace_types list.
124 * This lock is also used to keep user access serialized.
125 * Accesses from userspace will grab this lock while userspace
126 * activities happen inside the kernel.
127 */
128static DEFINE_MUTEX(trace_types_lock);
129
130/* trace_wait is a waitqueue for tasks blocked on trace_poll */
131static DECLARE_WAIT_QUEUE_HEAD(trace_wait);
132
133/* trace_flags holds iter_ctrl options */
134unsigned long trace_flags = TRACE_ITER_PRINT_PARENT;
135
136static notrace void no_trace_init(struct trace_array *tr)
137{
138 int cpu;
139
140 ftrace_function_enabled = 0;
141 if(tr->ctrl)
142 for_each_online_cpu(cpu)
143 tracing_reset(tr->data[cpu]);
144 tracer_enabled = 0;
145}
146
147/* dummy trace to disable tracing */
148static struct tracer no_tracer __read_mostly = {
149 .name = "none",
150 .init = no_trace_init
151};
152
153
154/**
155 * trace_wake_up - wake up tasks waiting for trace input
156 *
157 * Simply wakes up any task that is blocked on the trace_wait
158 * queue. This is used with trace_poll for tasks polling the trace.
159 */
160void trace_wake_up(void)
161{
162 /*
163 * The runqueue_is_locked() can fail, but this is the best we
164 * have for now:
165 */
166 if (!(trace_flags & TRACE_ITER_BLOCK) && !runqueue_is_locked())
167 wake_up(&trace_wait);
168}
169
170#define ENTRIES_PER_PAGE (PAGE_SIZE / sizeof(struct trace_entry))
171
172static int __init set_nr_entries(char *str)
173{
174 unsigned long nr_entries;
175 int ret;
176
177 if (!str)
178 return 0;
179 ret = strict_strtoul(str, 0, &nr_entries);
180 /* nr_entries can not be zero */
181 if (ret < 0 || nr_entries == 0)
182 return 0;
183 trace_nr_entries = nr_entries;
184 return 1;
185}
186__setup("trace_entries=", set_nr_entries);
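For example, booting with trace_entries=131072 requests 131072 entries per buffer; as noted in the comment above trace_nr_entries, the value is later rounded to a multiple of ENTRIES_PER_PAGE when the pages are allocated (the concrete number here is only an illustration).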
187
188unsigned long nsecs_to_usecs(unsigned long nsecs)
189{
190 return nsecs / 1000;
191}
192
193/*
194 * trace_flag_type is an enumeration that holds different
195 * states when a trace occurs. These are:
196 * IRQS_OFF - interrupts were disabled
197 * NEED_RESCHED - reschedule is requested
198 * HARDIRQ - inside an interrupt handler
199 * SOFTIRQ - inside a softirq handler
200 */
201enum trace_flag_type {
202 TRACE_FLAG_IRQS_OFF = 0x01,
203 TRACE_FLAG_NEED_RESCHED = 0x02,
204 TRACE_FLAG_HARDIRQ = 0x04,
205 TRACE_FLAG_SOFTIRQ = 0x08,
206};
207
208/*
209 * TRACE_ITER_SYM_MASK masks the options in trace_flags that
210 * control the output of kernel symbols.
211 */
212#define TRACE_ITER_SYM_MASK \
213 (TRACE_ITER_PRINT_PARENT|TRACE_ITER_SYM_OFFSET|TRACE_ITER_SYM_ADDR)
214
215/* These must match the bit positions in trace_iterator_flags */
216static const char *trace_options[] = {
217 "print-parent",
218 "sym-offset",
219 "sym-addr",
220 "verbose",
221 "raw",
222 "hex",
223 "bin",
224 "block",
225 "stacktrace",
226 "sched-tree",
227 NULL
228};
229
230/*
231 * ftrace_max_lock is used to protect the swapping of buffers
232 * when taking a max snapshot. The buffers themselves are
233 * protected by per_cpu spinlocks. But the action of the swap
234 * needs its own lock.
235 *
236 * This is defined as a raw_spinlock_t in order to help
237 * with performance when lockdep debugging is enabled.
238 */
239static raw_spinlock_t ftrace_max_lock =
240 (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
241
242/*
243 * Copy the new maximum trace into the separate maximum-trace
244 * structure. (this way the maximum trace is permanently saved,
245 * for later retrieval via /debugfs/tracing/latency_trace)
246 */
247static void
248__update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
249{
250 struct trace_array_cpu *data = tr->data[cpu];
251
252 max_tr.cpu = cpu;
253 max_tr.time_start = data->preempt_timestamp;
254
255 data = max_tr.data[cpu];
256 data->saved_latency = tracing_max_latency;
257
258 memcpy(data->comm, tsk->comm, TASK_COMM_LEN);
259 data->pid = tsk->pid;
260 data->uid = tsk->uid;
261 data->nice = tsk->static_prio - 20 - MAX_RT_PRIO;
262 data->policy = tsk->policy;
263 data->rt_priority = tsk->rt_priority;
264
265 /* record this task's comm */
266 tracing_record_cmdline(current);
267}
268
269#define CHECK_COND(cond) \
270 if (unlikely(cond)) { \
271 tracing_disabled = 1; \
272 WARN_ON(1); \
273 return -1; \
274 }
275
276/**
277 * check_pages - integrity check of trace buffers
278 *
279 * As a safety measure we check to make sure the data pages have not
280 * been corrupted.
281 */
282int check_pages(struct trace_array_cpu *data)
283{
284 struct page *page, *tmp;
285
286 CHECK_COND(data->trace_pages.next->prev != &data->trace_pages);
287 CHECK_COND(data->trace_pages.prev->next != &data->trace_pages);
288
289 list_for_each_entry_safe(page, tmp, &data->trace_pages, lru) {
290 CHECK_COND(page->lru.next->prev != &page->lru);
291 CHECK_COND(page->lru.prev->next != &page->lru);
292 }
293
294 return 0;
295}
296
297/**
298 * head_page - page address of the first page in per_cpu buffer.
299 *
300 * head_page returns the page address of the first page in
301 * a per_cpu buffer. This also performs various consistency
302 * checks to make sure the buffer has not been corrupted.
303 */
304void *head_page(struct trace_array_cpu *data)
305{
306 struct page *page;
307
308 if (list_empty(&data->trace_pages))
309 return NULL;
310
311 page = list_entry(data->trace_pages.next, struct page, lru);
312 BUG_ON(&page->lru == &data->trace_pages);
313
314 return page_address(page);
315}
316
317/**
318 * trace_seq_printf - sequence printing of trace information
319 * @s: trace sequence descriptor
320 * @fmt: printf format string
321 *
322 * The tracer may use either sequence operations or its own
323 * copy to user routines. To simplify formatting of a trace,
324 * trace_seq_printf is used to store strings into a special
325 * buffer (@s). Then the output may be either used by
326 * the sequencer or pulled into another buffer.
327 */
328int
329trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
330{
331 int len = (PAGE_SIZE - 1) - s->len;
332 va_list ap;
333 int ret;
334
335 if (!len)
336 return 0;
337
338 va_start(ap, fmt);
339 ret = vsnprintf(s->buffer + s->len, len, fmt, ap);
340 va_end(ap);
341
342 /* If we can't write it all, don't bother writing anything */
343 if (ret >= len)
344 return 0;
345
346 s->len += ret;
347
348 return len;
349}
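A usage sketch (not part of the patch) showing the return-value convention: trace_seq_printf() returns 0 once the page-sized buffer is full, so callers bail out, just as the print_*_fmt handlers later in this file do. example_seq_output is an invented name:

static int example_seq_output(struct trace_seq *s, struct trace_entry *entry)
{
	if (!trace_seq_printf(s, "%5d: ", entry->pid))
		return 0;	/* seq buffer full, tell the caller to stop */
	return trace_seq_printf(s, "%lx\n", entry->fn.ip);
}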
350
351/**
352 * trace_seq_puts - trace sequence printing of simple string
353 * @s: trace sequence descriptor
354 * @str: simple string to record
355 *
356 * The tracer may use either the sequence operations or its own
357 * copy to user routines. This function records a simple string
358 * into a special buffer (@s) for later retrieval by a sequencer
359 * or other mechanism.
360 */
361static int
362trace_seq_puts(struct trace_seq *s, const char *str)
363{
364 int len = strlen(str);
365
366 if (len > ((PAGE_SIZE - 1) - s->len))
367 return 0;
368
369 memcpy(s->buffer + s->len, str, len);
370 s->len += len;
371
372 return len;
373}
374
375static int
376trace_seq_putc(struct trace_seq *s, unsigned char c)
377{
378 if (s->len >= (PAGE_SIZE - 1))
379 return 0;
380
381 s->buffer[s->len++] = c;
382
383 return 1;
384}
385
386static int
387trace_seq_putmem(struct trace_seq *s, void *mem, size_t len)
388{
389 if (len > ((PAGE_SIZE - 1) - s->len))
390 return 0;
391
392 memcpy(s->buffer + s->len, mem, len);
393 s->len += len;
394
395 return len;
396}
397
398#define HEX_CHARS 17
399static const char hex2asc[] = "0123456789abcdef";
400
401static int
402trace_seq_putmem_hex(struct trace_seq *s, void *mem, size_t len)
403{
404 unsigned char hex[HEX_CHARS];
405 unsigned char *data = mem;
406 unsigned char byte;
407 int i, j;
408
409 BUG_ON(len >= HEX_CHARS);
410
411#ifdef __BIG_ENDIAN
412 for (i = 0, j = 0; i < len; i++) {
413#else
414 for (i = len-1, j = 0; i >= 0; i--) {
415#endif
416 byte = data[i];
417
418 hex[j++] = hex2asc[byte & 0x0f];
419 hex[j++] = hex2asc[byte >> 4];
420 }
421 hex[j++] = ' ';
422
423 return trace_seq_putmem(s, hex, j);
424}
425
426static void
427trace_seq_reset(struct trace_seq *s)
428{
429 s->len = 0;
430 s->readpos = 0;
431}
432
433ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt)
434{
435 int len;
436 int ret;
437
438 if (s->len <= s->readpos)
439 return -EBUSY;
440
441 len = s->len - s->readpos;
442 if (cnt > len)
443 cnt = len;
444 ret = copy_to_user(ubuf, s->buffer + s->readpos, cnt);
445 if (ret)
446 return -EFAULT;
447
448 s->readpos += cnt;
449 return cnt;
450}
451
452static void
453trace_print_seq(struct seq_file *m, struct trace_seq *s)
454{
455 int len = s->len >= PAGE_SIZE ? PAGE_SIZE - 1 : s->len;
456
457 s->buffer[len] = 0;
458 seq_puts(m, s->buffer);
459
460 trace_seq_reset(s);
461}
462
463/*
464 * flip the trace buffers between two trace descriptors.
465 * This is usually done between the global_trace and
466 * the max_tr to record a snapshot of the current trace.
467 *
468 * The ftrace_max_lock must be held.
469 */
470static void
471flip_trace(struct trace_array_cpu *tr1, struct trace_array_cpu *tr2)
472{
473 struct list_head flip_pages;
474
475 INIT_LIST_HEAD(&flip_pages);
476
477 memcpy(&tr1->trace_head_idx, &tr2->trace_head_idx,
478 sizeof(struct trace_array_cpu) -
479 offsetof(struct trace_array_cpu, trace_head_idx));
480
481 check_pages(tr1);
482 check_pages(tr2);
483 list_splice_init(&tr1->trace_pages, &flip_pages);
484 list_splice_init(&tr2->trace_pages, &tr1->trace_pages);
485 list_splice_init(&flip_pages, &tr2->trace_pages);
486 BUG_ON(!list_empty(&flip_pages));
487 check_pages(tr1);
488 check_pages(tr2);
489}
490
491/**
492 * update_max_tr - snapshot all trace buffers from global_trace to max_tr
493 * @tr: tracer
494 * @tsk: the task with the latency
495 * @cpu: The cpu that initiated the trace.
496 *
497 * Flip the buffers between the @tr and the max_tr and record information
498 * about which task was the cause of this latency.
499 */
500void
501update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
502{
503 struct trace_array_cpu *data;
504 int i;
505
506 WARN_ON_ONCE(!irqs_disabled());
507 __raw_spin_lock(&ftrace_max_lock);
508 /* clear out all the previous traces */
509 for_each_tracing_cpu(i) {
510 data = tr->data[i];
511 flip_trace(max_tr.data[i], data);
512 tracing_reset(data);
513 }
514
515 __update_max_tr(tr, tsk, cpu);
516 __raw_spin_unlock(&ftrace_max_lock);
517}
518
519/**
520 * update_max_tr_single - only copy one trace over, and reset the rest
521 * @tr - tracer
522 * @tsk - task with the latency
523 * @cpu - the cpu of the buffer to copy.
524 *
525 * Flip the trace of a single CPU buffer between the @tr and the max_tr.
526 */
527void
528update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
529{
530 struct trace_array_cpu *data = tr->data[cpu];
531 int i;
532
533 WARN_ON_ONCE(!irqs_disabled());
534 __raw_spin_lock(&ftrace_max_lock);
535 for_each_tracing_cpu(i)
536 tracing_reset(max_tr.data[i]);
537
538 flip_trace(max_tr.data[cpu], data);
539 tracing_reset(data);
540
541 __update_max_tr(tr, tsk, cpu);
542 __raw_spin_unlock(&ftrace_max_lock);
543}
544
545/**
546 * register_tracer - register a tracer with the ftrace system.
547 * @type - the plugin for the tracer
548 *
549 * Register a new plugin tracer.
550 */
551int register_tracer(struct tracer *type)
552{
553 struct tracer *t;
554 int len;
555 int ret = 0;
556
557 if (!type->name) {
558 pr_info("Tracer must have a name\n");
559 return -1;
560 }
561
562 mutex_lock(&trace_types_lock);
563 for (t = trace_types; t; t = t->next) {
564 if (strcmp(type->name, t->name) == 0) {
565 /* already found */
566 pr_info("Trace %s already registered\n",
567 type->name);
568 ret = -1;
569 goto out;
570 }
571 }
572
573#ifdef CONFIG_FTRACE_STARTUP_TEST
574 if (type->selftest) {
575 struct tracer *saved_tracer = current_trace;
576 struct trace_array_cpu *data;
577 struct trace_array *tr = &global_trace;
578 int saved_ctrl = tr->ctrl;
579 int i;
580 /*
581 * Run a selftest on this tracer.
582 * Here we reset the trace buffer, and set the current
583 * tracer to be this tracer. The tracer can then run some
584 * internal tracing to verify that everything is in order.
585 * If we fail, we do not register this tracer.
586 */
587 for_each_tracing_cpu(i) {
588 data = tr->data[i];
589 if (!head_page(data))
590 continue;
591 tracing_reset(data);
592 }
593 current_trace = type;
594 tr->ctrl = 0;
595 /* the test is responsible for initializing and enabling */
596 pr_info("Testing tracer %s: ", type->name);
597 ret = type->selftest(type, tr);
598 /* the test is responsible for resetting too */
599 current_trace = saved_tracer;
600 tr->ctrl = saved_ctrl;
601 if (ret) {
602 printk(KERN_CONT "FAILED!\n");
603 goto out;
604 }
605 /* Only reset on passing, to avoid touching corrupted buffers */
606 for_each_tracing_cpu(i) {
607 data = tr->data[i];
608 if (!head_page(data))
609 continue;
610 tracing_reset(data);
611 }
612 printk(KERN_CONT "PASSED\n");
613 }
614#endif
615
616 type->next = trace_types;
617 trace_types = type;
618 len = strlen(type->name);
619 if (len > max_tracer_type_len)
620 max_tracer_type_len = len;
621
622 out:
623 mutex_unlock(&trace_types_lock);
624
625 return ret;
626}
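A plugin sketch (not part of the patch) mirroring the no_tracer stub near the top of the file; example_tracer and example_trace_init are invented names:

static void example_trace_init(struct trace_array *tr)
{
	/* a real tracer would reset per-cpu buffers and hook itself up here */
}

static struct tracer example_tracer __read_mostly =
{
	.name	= "example",
	.init	= example_trace_init,
};

/* from an initcall: register_tracer(&example_tracer); */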
627
628void unregister_tracer(struct tracer *type)
629{
630 struct tracer **t;
631 int len;
632
633 mutex_lock(&trace_types_lock);
634 for (t = &trace_types; *t; t = &(*t)->next) {
635 if (*t == type)
636 goto found;
637 }
638 pr_info("Trace %s not registered\n", type->name);
639 goto out;
640
641 found:
642 *t = (*t)->next;
643 if (strlen(type->name) != max_tracer_type_len)
644 goto out;
645
646 max_tracer_type_len = 0;
647 for (t = &trace_types; *t; t = &(*t)->next) {
648 len = strlen((*t)->name);
649 if (len > max_tracer_type_len)
650 max_tracer_type_len = len;
651 }
652 out:
653 mutex_unlock(&trace_types_lock);
654}
655
656void tracing_reset(struct trace_array_cpu *data)
657{
658 data->trace_idx = 0;
659 data->overrun = 0;
660 data->trace_head = data->trace_tail = head_page(data);
661 data->trace_head_idx = 0;
662 data->trace_tail_idx = 0;
663}
664
665#define SAVED_CMDLINES 128
666static unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1];
667static unsigned map_cmdline_to_pid[SAVED_CMDLINES];
668static char saved_cmdlines[SAVED_CMDLINES][TASK_COMM_LEN];
669static int cmdline_idx;
670static DEFINE_SPINLOCK(trace_cmdline_lock);
671
672/* temporarily disable recording */
673atomic_t trace_record_cmdline_disabled __read_mostly;
674
675static void trace_init_cmdlines(void)
676{
677 memset(&map_pid_to_cmdline, -1, sizeof(map_pid_to_cmdline));
678 memset(&map_cmdline_to_pid, -1, sizeof(map_cmdline_to_pid));
679 cmdline_idx = 0;
680}
681
682void trace_stop_cmdline_recording(void);
683
684static void trace_save_cmdline(struct task_struct *tsk)
685{
686 unsigned map;
687 unsigned idx;
688
689 if (!tsk->pid || unlikely(tsk->pid > PID_MAX_DEFAULT))
690 return;
691
692 /*
693 * It's not the end of the world if we don't get
694 * the lock, but we also don't want to spin
695 * nor do we want to disable interrupts,
696 * so if we miss here, then better luck next time.
697 */
698 if (!spin_trylock(&trace_cmdline_lock))
699 return;
700
701 idx = map_pid_to_cmdline[tsk->pid];
702 if (idx >= SAVED_CMDLINES) {
703 idx = (cmdline_idx + 1) % SAVED_CMDLINES;
704
705 map = map_cmdline_to_pid[idx];
706 if (map <= PID_MAX_DEFAULT)
707 map_pid_to_cmdline[map] = (unsigned)-1;
708
709 map_pid_to_cmdline[tsk->pid] = idx;
710
711 cmdline_idx = idx;
712 }
713
714 memcpy(&saved_cmdlines[idx], tsk->comm, TASK_COMM_LEN);
715
716 spin_unlock(&trace_cmdline_lock);
717}
718
719static char *trace_find_cmdline(int pid)
720{
721 char *cmdline = "<...>";
722 unsigned map;
723
724 if (!pid)
725 return "<idle>";
726
727 if (pid > PID_MAX_DEFAULT)
728 goto out;
729
730 map = map_pid_to_cmdline[pid];
731 if (map >= SAVED_CMDLINES)
732 goto out;
733
734 cmdline = saved_cmdlines[map];
735
736 out:
737 return cmdline;
738}
739
740void tracing_record_cmdline(struct task_struct *tsk)
741{
742 if (atomic_read(&trace_record_cmdline_disabled))
743 return;
744
745 trace_save_cmdline(tsk);
746}
747
748static inline struct list_head *
749trace_next_list(struct trace_array_cpu *data, struct list_head *next)
750{
751 /*
752 * Round-robin - but skip the head (which is not a real page):
753 */
754 next = next->next;
755 if (unlikely(next == &data->trace_pages))
756 next = next->next;
757 BUG_ON(next == &data->trace_pages);
758
759 return next;
760}
761
762static inline void *
763trace_next_page(struct trace_array_cpu *data, void *addr)
764{
765 struct list_head *next;
766 struct page *page;
767
768 page = virt_to_page(addr);
769
770 next = trace_next_list(data, &page->lru);
771 page = list_entry(next, struct page, lru);
772
773 return page_address(page);
774}
775
776static inline struct trace_entry *
777tracing_get_trace_entry(struct trace_array *tr, struct trace_array_cpu *data)
778{
779 unsigned long idx, idx_next;
780 struct trace_entry *entry;
781
782 data->trace_idx++;
783 idx = data->trace_head_idx;
784 idx_next = idx + 1;
785
786 BUG_ON(idx * TRACE_ENTRY_SIZE >= PAGE_SIZE);
787
788 entry = data->trace_head + idx * TRACE_ENTRY_SIZE;
789
790 if (unlikely(idx_next >= ENTRIES_PER_PAGE)) {
791 data->trace_head = trace_next_page(data, data->trace_head);
792 idx_next = 0;
793 }
794
795 if (data->trace_head == data->trace_tail &&
796 idx_next == data->trace_tail_idx) {
797 /* overrun */
798 data->overrun++;
799 data->trace_tail_idx++;
800 if (data->trace_tail_idx >= ENTRIES_PER_PAGE) {
801 data->trace_tail =
802 trace_next_page(data, data->trace_tail);
803 data->trace_tail_idx = 0;
804 }
805 }
806
807 data->trace_head_idx = idx_next;
808
809 return entry;
810}
811
812static inline void
813tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags)
814{
815 struct task_struct *tsk = current;
816 unsigned long pc;
817
818 pc = preempt_count();
819
820 entry->preempt_count = pc & 0xff;
821 entry->pid = (tsk) ? tsk->pid : 0;
822 entry->t = ftrace_now(raw_smp_processor_id());
823 entry->flags = (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) |
824 ((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) |
825 ((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) |
826 (need_resched() ? TRACE_FLAG_NEED_RESCHED : 0);
827}
828
829void
830trace_function(struct trace_array *tr, struct trace_array_cpu *data,
831 unsigned long ip, unsigned long parent_ip, unsigned long flags)
832{
833 struct trace_entry *entry;
834 unsigned long irq_flags;
835
836 raw_local_irq_save(irq_flags);
837 __raw_spin_lock(&data->lock);
838 entry = tracing_get_trace_entry(tr, data);
839 tracing_generic_entry_update(entry, flags);
840 entry->type = TRACE_FN;
841 entry->fn.ip = ip;
842 entry->fn.parent_ip = parent_ip;
843 __raw_spin_unlock(&data->lock);
844 raw_local_irq_restore(irq_flags);
845}
846
847void
848ftrace(struct trace_array *tr, struct trace_array_cpu *data,
849 unsigned long ip, unsigned long parent_ip, unsigned long flags)
850{
851 if (likely(!atomic_read(&data->disabled)))
852 trace_function(tr, data, ip, parent_ip, flags);
853}
854
855#ifdef CONFIG_MMIOTRACE
856void __trace_mmiotrace_rw(struct trace_array *tr, struct trace_array_cpu *data,
857 struct mmiotrace_rw *rw)
858{
859 struct trace_entry *entry;
860 unsigned long irq_flags;
861
862 raw_local_irq_save(irq_flags);
863 __raw_spin_lock(&data->lock);
864
865 entry = tracing_get_trace_entry(tr, data);
866 tracing_generic_entry_update(entry, 0);
867 entry->type = TRACE_MMIO_RW;
868 entry->mmiorw = *rw;
869
870 __raw_spin_unlock(&data->lock);
871 raw_local_irq_restore(irq_flags);
872
873 trace_wake_up();
874}
875
876void __trace_mmiotrace_map(struct trace_array *tr, struct trace_array_cpu *data,
877 struct mmiotrace_map *map)
878{
879 struct trace_entry *entry;
880 unsigned long irq_flags;
881
882 raw_local_irq_save(irq_flags);
883 __raw_spin_lock(&data->lock);
884
885 entry = tracing_get_trace_entry(tr, data);
886 tracing_generic_entry_update(entry, 0);
887 entry->type = TRACE_MMIO_MAP;
888 entry->mmiomap = *map;
889
890 __raw_spin_unlock(&data->lock);
891 raw_local_irq_restore(irq_flags);
892
893 trace_wake_up();
894}
895#endif
896
897void __trace_stack(struct trace_array *tr,
898 struct trace_array_cpu *data,
899 unsigned long flags,
900 int skip)
901{
902 struct trace_entry *entry;
903 struct stack_trace trace;
904
905 if (!(trace_flags & TRACE_ITER_STACKTRACE))
906 return;
907
908 entry = tracing_get_trace_entry(tr, data);
909 tracing_generic_entry_update(entry, flags);
910 entry->type = TRACE_STACK;
911
912 memset(&entry->stack, 0, sizeof(entry->stack));
913
914 trace.nr_entries = 0;
915 trace.max_entries = FTRACE_STACK_ENTRIES;
916 trace.skip = skip;
917 trace.entries = entry->stack.caller;
918
919 save_stack_trace(&trace);
920}
921
922void
923__trace_special(void *__tr, void *__data,
924 unsigned long arg1, unsigned long arg2, unsigned long arg3)
925{
926 struct trace_array_cpu *data = __data;
927 struct trace_array *tr = __tr;
928 struct trace_entry *entry;
929 unsigned long irq_flags;
930
931 raw_local_irq_save(irq_flags);
932 __raw_spin_lock(&data->lock);
933 entry = tracing_get_trace_entry(tr, data);
934 tracing_generic_entry_update(entry, 0);
935 entry->type = TRACE_SPECIAL;
936 entry->special.arg1 = arg1;
937 entry->special.arg2 = arg2;
938 entry->special.arg3 = arg3;
939 __trace_stack(tr, data, irq_flags, 4);
940 __raw_spin_unlock(&data->lock);
941 raw_local_irq_restore(irq_flags);
942
943 trace_wake_up();
944}
945
946void
947tracing_sched_switch_trace(struct trace_array *tr,
948 struct trace_array_cpu *data,
949 struct task_struct *prev,
950 struct task_struct *next,
951 unsigned long flags)
952{
953 struct trace_entry *entry;
954 unsigned long irq_flags;
955
956 raw_local_irq_save(irq_flags);
957 __raw_spin_lock(&data->lock);
958 entry = tracing_get_trace_entry(tr, data);
959 tracing_generic_entry_update(entry, flags);
960 entry->type = TRACE_CTX;
961 entry->ctx.prev_pid = prev->pid;
962 entry->ctx.prev_prio = prev->prio;
963 entry->ctx.prev_state = prev->state;
964 entry->ctx.next_pid = next->pid;
965 entry->ctx.next_prio = next->prio;
966 entry->ctx.next_state = next->state;
967 __trace_stack(tr, data, flags, 5);
968 __raw_spin_unlock(&data->lock);
969 raw_local_irq_restore(irq_flags);
970}
971
972void
973tracing_sched_wakeup_trace(struct trace_array *tr,
974 struct trace_array_cpu *data,
975 struct task_struct *wakee,
976 struct task_struct *curr,
977 unsigned long flags)
978{
979 struct trace_entry *entry;
980 unsigned long irq_flags;
981
982 raw_local_irq_save(irq_flags);
983 __raw_spin_lock(&data->lock);
984 entry = tracing_get_trace_entry(tr, data);
985 tracing_generic_entry_update(entry, flags);
986 entry->type = TRACE_WAKE;
987 entry->ctx.prev_pid = curr->pid;
988 entry->ctx.prev_prio = curr->prio;
989 entry->ctx.prev_state = curr->state;
990 entry->ctx.next_pid = wakee->pid;
991 entry->ctx.next_prio = wakee->prio;
992 entry->ctx.next_state = wakee->state;
993 __trace_stack(tr, data, flags, 6);
994 __raw_spin_unlock(&data->lock);
995 raw_local_irq_restore(irq_flags);
996
997 trace_wake_up();
998}
999
1000void
1001ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3)
1002{
1003 struct trace_array *tr = &global_trace;
1004 struct trace_array_cpu *data;
1005 unsigned long flags;
1006 long disabled;
1007 int cpu;
1008
1009 if (tracing_disabled || current_trace == &no_tracer || !tr->ctrl)
1010 return;
1011
1012 local_irq_save(flags);
1013 cpu = raw_smp_processor_id();
1014 data = tr->data[cpu];
1015 disabled = atomic_inc_return(&data->disabled);
1016
1017 if (likely(disabled == 1))
1018 __trace_special(tr, data, arg1, arg2, arg3);
1019
1020 atomic_dec(&data->disabled);
1021 local_irq_restore(flags);
1022}
1023
1024#ifdef CONFIG_FTRACE
1025static void
1026function_trace_call(unsigned long ip, unsigned long parent_ip)
1027{
1028 struct trace_array *tr = &global_trace;
1029 struct trace_array_cpu *data;
1030 unsigned long flags;
1031 long disabled;
1032 int cpu;
1033
1034 if (unlikely(!ftrace_function_enabled))
1035 return;
1036
1037 if (skip_trace(ip))
1038 return;
1039
1040 local_irq_save(flags);
1041 cpu = raw_smp_processor_id();
1042 data = tr->data[cpu];
1043 disabled = atomic_inc_return(&data->disabled);
1044
1045 if (likely(disabled == 1))
1046 trace_function(tr, data, ip, parent_ip, flags);
1047
1048 atomic_dec(&data->disabled);
1049 local_irq_restore(flags);
1050}
1051
1052static struct ftrace_ops trace_ops __read_mostly =
1053{
1054 .func = function_trace_call,
1055};
1056
1057void tracing_start_function_trace(void)
1058{
1059 ftrace_function_enabled = 0;
1060 register_ftrace_function(&trace_ops);
1061 if (tracer_enabled)
1062 ftrace_function_enabled = 1;
1063}
1064
1065void tracing_stop_function_trace(void)
1066{
1067 ftrace_function_enabled = 0;
1068 unregister_ftrace_function(&trace_ops);
1069}
1070#endif
1071
1072enum trace_file_type {
1073 TRACE_FILE_LAT_FMT = 1,
1074};
1075
1076static struct trace_entry *
1077trace_entry_idx(struct trace_array *tr, struct trace_array_cpu *data,
1078 struct trace_iterator *iter, int cpu)
1079{
1080 struct page *page;
1081 struct trace_entry *array;
1082
1083 if (iter->next_idx[cpu] >= tr->entries ||
1084 iter->next_idx[cpu] >= data->trace_idx ||
1085 (data->trace_head == data->trace_tail &&
1086 data->trace_head_idx == data->trace_tail_idx))
1087 return NULL;
1088
1089 if (!iter->next_page[cpu]) {
1090 /* Initialize the iterator for this cpu trace buffer */
1091 WARN_ON(!data->trace_tail);
1092 page = virt_to_page(data->trace_tail);
1093 iter->next_page[cpu] = &page->lru;
1094 iter->next_page_idx[cpu] = data->trace_tail_idx;
1095 }
1096
1097 page = list_entry(iter->next_page[cpu], struct page, lru);
1098 BUG_ON(&data->trace_pages == &page->lru);
1099
1100 array = page_address(page);
1101
1102 WARN_ON(iter->next_page_idx[cpu] >= ENTRIES_PER_PAGE);
1103 return &array[iter->next_page_idx[cpu]];
1104}
1105
1106static struct trace_entry *
1107find_next_entry(struct trace_iterator *iter, int *ent_cpu)
1108{
1109 struct trace_array *tr = iter->tr;
1110 struct trace_entry *ent, *next = NULL;
1111 int next_cpu = -1;
1112 int cpu;
1113
1114 for_each_tracing_cpu(cpu) {
1115 if (!head_page(tr->data[cpu]))
1116 continue;
1117 ent = trace_entry_idx(tr, tr->data[cpu], iter, cpu);
1118 /*
1119 * Pick the entry with the smallest timestamp:
1120 */
1121 if (ent && (!next || ent->t < next->t)) {
1122 next = ent;
1123 next_cpu = cpu;
1124 }
1125 }
1126
1127 if (ent_cpu)
1128 *ent_cpu = next_cpu;
1129
1130 return next;
1131}
1132
1133static void trace_iterator_increment(struct trace_iterator *iter)
1134{
1135 iter->idx++;
1136 iter->next_idx[iter->cpu]++;
1137 iter->next_page_idx[iter->cpu]++;
1138
1139 if (iter->next_page_idx[iter->cpu] >= ENTRIES_PER_PAGE) {
1140 struct trace_array_cpu *data = iter->tr->data[iter->cpu];
1141
1142 iter->next_page_idx[iter->cpu] = 0;
1143 iter->next_page[iter->cpu] =
1144 trace_next_list(data, iter->next_page[iter->cpu]);
1145 }
1146}
1147
1148static void trace_consume(struct trace_iterator *iter)
1149{
1150 struct trace_array_cpu *data = iter->tr->data[iter->cpu];
1151
1152 data->trace_tail_idx++;
1153 if (data->trace_tail_idx >= ENTRIES_PER_PAGE) {
1154 data->trace_tail = trace_next_page(data, data->trace_tail);
1155 data->trace_tail_idx = 0;
1156 }
1157
1158 /* If we emptied the buffer, reset the index */
1159 if (data->trace_head == data->trace_tail &&
1160 data->trace_head_idx == data->trace_tail_idx)
1161 data->trace_idx = 0;
1162}
1163
1164static void *find_next_entry_inc(struct trace_iterator *iter)
1165{
1166 struct trace_entry *next;
1167 int next_cpu = -1;
1168
1169 next = find_next_entry(iter, &next_cpu);
1170
1171 iter->prev_ent = iter->ent;
1172 iter->prev_cpu = iter->cpu;
1173
1174 iter->ent = next;
1175 iter->cpu = next_cpu;
1176
1177 if (next)
1178 trace_iterator_increment(iter);
1179
1180 return next ? iter : NULL;
1181}
1182
1183static void *s_next(struct seq_file *m, void *v, loff_t *pos)
1184{
1185 struct trace_iterator *iter = m->private;
1186 int i = (int)*pos;
1187 void *ent;
1188
1189 (*pos)++;
1190
1191 /* can't go backwards */
1192 if (iter->idx > i)
1193 return NULL;
1194
1195 if (iter->idx < 0)
1196 ent = find_next_entry_inc(iter);
1197 else
1198 ent = iter;
1199
1200 while (ent && iter->idx < i)
1201 ent = find_next_entry_inc(iter);
1202
1203 iter->pos = *pos;
1204
1205 return ent;
1206}
1207
1208static void *s_start(struct seq_file *m, loff_t *pos)
1209{
1210 struct trace_iterator *iter = m->private;
1211 void *p = NULL;
1212 loff_t l = 0;
1213 int i;
1214
1215 mutex_lock(&trace_types_lock);
1216
1217 if (!current_trace || current_trace != iter->trace) {
1218 mutex_unlock(&trace_types_lock);
1219 return NULL;
1220 }
1221
1222 atomic_inc(&trace_record_cmdline_disabled);
1223
1224 /* let the tracer grab locks here if needed */
1225 if (current_trace->start)
1226 current_trace->start(iter);
1227
1228 if (*pos != iter->pos) {
1229 iter->ent = NULL;
1230 iter->cpu = 0;
1231 iter->idx = -1;
1232 iter->prev_ent = NULL;
1233 iter->prev_cpu = -1;
1234
1235 for_each_tracing_cpu(i) {
1236 iter->next_idx[i] = 0;
1237 iter->next_page[i] = NULL;
1238 }
1239
1240 for (p = iter; p && l < *pos; p = s_next(m, p, &l))
1241 ;
1242
1243 } else {
1244 l = *pos - 1;
1245 p = s_next(m, p, &l);
1246 }
1247
1248 return p;
1249}
1250
1251static void s_stop(struct seq_file *m, void *p)
1252{
1253 struct trace_iterator *iter = m->private;
1254
1255 atomic_dec(&trace_record_cmdline_disabled);
1256
1257 /* let the tracer release locks here if needed */
1258 if (current_trace && current_trace == iter->trace && iter->trace->stop)
1259 iter->trace->stop(iter);
1260
1261 mutex_unlock(&trace_types_lock);
1262}
1263
1264#define KRETPROBE_MSG "[unknown/kretprobe'd]"
1265
1266#ifdef CONFIG_KRETPROBES
1267static inline int kretprobed(unsigned long addr)
1268{
1269 return addr == (unsigned long)kretprobe_trampoline;
1270}
1271#else
1272static inline int kretprobed(unsigned long addr)
1273{
1274 return 0;
1275}
1276#endif /* CONFIG_KRETPROBES */
1277
1278static int
1279seq_print_sym_short(struct trace_seq *s, const char *fmt, unsigned long address)
1280{
1281#ifdef CONFIG_KALLSYMS
1282 char str[KSYM_SYMBOL_LEN];
1283
1284 kallsyms_lookup(address, NULL, NULL, NULL, str);
1285
1286 return trace_seq_printf(s, fmt, str);
1287#endif
1288 return 1;
1289}
1290
1291static int
1292seq_print_sym_offset(struct trace_seq *s, const char *fmt,
1293 unsigned long address)
1294{
1295#ifdef CONFIG_KALLSYMS
1296 char str[KSYM_SYMBOL_LEN];
1297
1298 sprint_symbol(str, address);
1299 return trace_seq_printf(s, fmt, str);
1300#endif
1301 return 1;
1302}
1303
1304#ifndef CONFIG_64BIT
1305# define IP_FMT "%08lx"
1306#else
1307# define IP_FMT "%016lx"
1308#endif
1309
1310static int
1311seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags)
1312{
1313 int ret;
1314
1315 if (!ip)
1316 return trace_seq_printf(s, "0");
1317
1318 if (sym_flags & TRACE_ITER_SYM_OFFSET)
1319 ret = seq_print_sym_offset(s, "%s", ip);
1320 else
1321 ret = seq_print_sym_short(s, "%s", ip);
1322
1323 if (!ret)
1324 return 0;
1325
1326 if (sym_flags & TRACE_ITER_SYM_ADDR)
1327 ret = trace_seq_printf(s, " <" IP_FMT ">", ip);
1328 return ret;
1329}
1330
1331static void print_lat_help_header(struct seq_file *m)
1332{
1333 seq_puts(m, "# _------=> CPU# \n");
1334 seq_puts(m, "# / _-----=> irqs-off \n");
1335 seq_puts(m, "# | / _----=> need-resched \n");
1336 seq_puts(m, "# || / _---=> hardirq/softirq \n");
1337 seq_puts(m, "# ||| / _--=> preempt-depth \n");
1338 seq_puts(m, "# |||| / \n");
1339 seq_puts(m, "# ||||| delay \n");
1340 seq_puts(m, "# cmd pid ||||| time | caller \n");
1341 seq_puts(m, "# \\ / ||||| \\ | / \n");
1342}
1343
1344static void print_func_help_header(struct seq_file *m)
1345{
1346 seq_puts(m, "# TASK-PID CPU# TIMESTAMP FUNCTION\n");
1347 seq_puts(m, "# | | | | |\n");
1348}
1349
1350
1351static void
1352print_trace_header(struct seq_file *m, struct trace_iterator *iter)
1353{
1354 unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK);
1355 struct trace_array *tr = iter->tr;
1356 struct trace_array_cpu *data = tr->data[tr->cpu];
1357 struct tracer *type = current_trace;
1358 unsigned long total = 0;
1359 unsigned long entries = 0;
1360 int cpu;
1361 const char *name = "preemption";
1362
1363 if (type)
1364 name = type->name;
1365
1366 for_each_tracing_cpu(cpu) {
1367 if (head_page(tr->data[cpu])) {
1368 total += tr->data[cpu]->trace_idx;
1369 if (tr->data[cpu]->trace_idx > tr->entries)
1370 entries += tr->entries;
1371 else
1372 entries += tr->data[cpu]->trace_idx;
1373 }
1374 }
1375
1376 seq_printf(m, "%s latency trace v1.1.5 on %s\n",
1377 name, UTS_RELEASE);
1378 seq_puts(m, "-----------------------------------"
1379 "---------------------------------\n");
1380 seq_printf(m, " latency: %lu us, #%lu/%lu, CPU#%d |"
1381 " (M:%s VP:%d, KP:%d, SP:%d HP:%d",
1382 nsecs_to_usecs(data->saved_latency),
1383 entries,
1384 total,
1385 tr->cpu,
1386#if defined(CONFIG_PREEMPT_NONE)
1387 "server",
1388#elif defined(CONFIG_PREEMPT_VOLUNTARY)
1389 "desktop",
1390#elif defined(CONFIG_PREEMPT)
1391 "preempt",
1392#else
1393 "unknown",
1394#endif
1395 /* These are reserved for later use */
1396 0, 0, 0, 0);
1397#ifdef CONFIG_SMP
1398 seq_printf(m, " #P:%d)\n", num_online_cpus());
1399#else
1400 seq_puts(m, ")\n");
1401#endif
1402 seq_puts(m, " -----------------\n");
1403 seq_printf(m, " | task: %.16s-%d "
1404 "(uid:%d nice:%ld policy:%ld rt_prio:%ld)\n",
1405 data->comm, data->pid, data->uid, data->nice,
1406 data->policy, data->rt_priority);
1407 seq_puts(m, " -----------------\n");
1408
1409 if (data->critical_start) {
1410 seq_puts(m, " => started at: ");
1411 seq_print_ip_sym(&iter->seq, data->critical_start, sym_flags);
1412 trace_print_seq(m, &iter->seq);
1413 seq_puts(m, "\n => ended at: ");
1414 seq_print_ip_sym(&iter->seq, data->critical_end, sym_flags);
1415 trace_print_seq(m, &iter->seq);
1416 seq_puts(m, "\n");
1417 }
1418
1419 seq_puts(m, "\n");
1420}
1421
1422static void
1423lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu)
1424{
1425 int hardirq, softirq;
1426 char *comm;
1427
1428 comm = trace_find_cmdline(entry->pid);
1429
1430 trace_seq_printf(s, "%8.8s-%-5d ", comm, entry->pid);
1431 trace_seq_printf(s, "%d", cpu);
1432 trace_seq_printf(s, "%c%c",
1433 (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : '.',
1434 ((entry->flags & TRACE_FLAG_NEED_RESCHED) ? 'N' : '.'));
1435
1436 hardirq = entry->flags & TRACE_FLAG_HARDIRQ;
1437 softirq = entry->flags & TRACE_FLAG_SOFTIRQ;
1438 if (hardirq && softirq) {
1439 trace_seq_putc(s, 'H');
1440 } else {
1441 if (hardirq) {
1442 trace_seq_putc(s, 'h');
1443 } else {
1444 if (softirq)
1445 trace_seq_putc(s, 's');
1446 else
1447 trace_seq_putc(s, '.');
1448 }
1449 }
1450
1451 if (entry->preempt_count)
1452 trace_seq_printf(s, "%x", entry->preempt_count);
1453 else
1454 trace_seq_puts(s, ".");
1455}
1456
1457unsigned long preempt_mark_thresh = 100;
1458
1459static void
1460lat_print_timestamp(struct trace_seq *s, unsigned long long abs_usecs,
1461 unsigned long rel_usecs)
1462{
1463 trace_seq_printf(s, " %4lldus", abs_usecs);
1464 if (rel_usecs > preempt_mark_thresh)
1465 trace_seq_puts(s, "!: ");
1466 else if (rel_usecs > 1)
1467 trace_seq_puts(s, "+: ");
1468 else
1469 trace_seq_puts(s, " : ");
1470}
1471
1472static const char state_to_char[] = TASK_STATE_TO_CHAR_STR;
1473
1474static int
1475print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu)
1476{
1477 struct trace_seq *s = &iter->seq;
1478 unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK);
1479 struct trace_entry *next_entry = find_next_entry(iter, NULL);
1480 unsigned long verbose = (trace_flags & TRACE_ITER_VERBOSE);
1481 struct trace_entry *entry = iter->ent;
1482 unsigned long abs_usecs;
1483 unsigned long rel_usecs;
1484 char *comm;
1485 int S, T;
1486 int i;
1487 unsigned state;
1488
1489 if (!next_entry)
1490 next_entry = entry;
1491 rel_usecs = ns2usecs(next_entry->t - entry->t);
1492 abs_usecs = ns2usecs(entry->t - iter->tr->time_start);
1493
1494 if (verbose) {
1495 comm = trace_find_cmdline(entry->pid);
1496 trace_seq_printf(s, "%16s %5d %d %d %08x %08x [%08lx]"
1497 " %ld.%03ldms (+%ld.%03ldms): ",
1498 comm,
1499 entry->pid, cpu, entry->flags,
1500 entry->preempt_count, trace_idx,
1501 ns2usecs(entry->t),
1502 abs_usecs/1000,
1503 abs_usecs % 1000, rel_usecs/1000,
1504 rel_usecs % 1000);
1505 } else {
1506 lat_print_generic(s, entry, cpu);
1507 lat_print_timestamp(s, abs_usecs, rel_usecs);
1508 }
1509 switch (entry->type) {
1510 case TRACE_FN:
1511 seq_print_ip_sym(s, entry->fn.ip, sym_flags);
1512 trace_seq_puts(s, " (");
1513 if (kretprobed(entry->fn.parent_ip))
1514 trace_seq_puts(s, KRETPROBE_MSG);
1515 else
1516 seq_print_ip_sym(s, entry->fn.parent_ip, sym_flags);
1517 trace_seq_puts(s, ")\n");
1518 break;
1519 case TRACE_CTX:
1520 case TRACE_WAKE:
1521 T = entry->ctx.next_state < sizeof(state_to_char) ?
1522 state_to_char[entry->ctx.next_state] : 'X';
1523
1524 state = entry->ctx.prev_state ? __ffs(entry->ctx.prev_state) + 1 : 0;
1525 S = state < sizeof(state_to_char) - 1 ? state_to_char[state] : 'X';
1526 comm = trace_find_cmdline(entry->ctx.next_pid);
1527 trace_seq_printf(s, " %5d:%3d:%c %s %5d:%3d:%c %s\n",
1528 entry->ctx.prev_pid,
1529 entry->ctx.prev_prio,
1530 S, entry->type == TRACE_CTX ? "==>" : " +",
1531 entry->ctx.next_pid,
1532 entry->ctx.next_prio,
1533 T, comm);
1534 break;
1535 case TRACE_SPECIAL:
1536 trace_seq_printf(s, "# %ld %ld %ld\n",
1537 entry->special.arg1,
1538 entry->special.arg2,
1539 entry->special.arg3);
1540 break;
1541 case TRACE_STACK:
1542 for (i = 0; i < FTRACE_STACK_ENTRIES; i++) {
1543 if (i)
1544 trace_seq_puts(s, " <= ");
1545 seq_print_ip_sym(s, entry->stack.caller[i], sym_flags);
1546 }
1547 trace_seq_puts(s, "\n");
1548 break;
1549 default:
1550 trace_seq_printf(s, "Unknown type %d\n", entry->type);
1551 }
1552 return 1;
1553}
1554
1555static int print_trace_fmt(struct trace_iterator *iter)
1556{
1557 struct trace_seq *s = &iter->seq;
1558 unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK);
1559 struct trace_entry *entry;
1560 unsigned long usec_rem;
1561 unsigned long long t;
1562 unsigned long secs;
1563 char *comm;
1564 int ret;
1565 int S, T;
1566 int i;
1567
1568 entry = iter->ent;
1569
1570 comm = trace_find_cmdline(iter->ent->pid);
1571
1572 t = ns2usecs(entry->t);
1573 usec_rem = do_div(t, 1000000ULL);
1574 secs = (unsigned long)t;
1575
1576 ret = trace_seq_printf(s, "%16s-%-5d ", comm, entry->pid);
1577 if (!ret)
1578 return 0;
1579 ret = trace_seq_printf(s, "[%02d] ", iter->cpu);
1580 if (!ret)
1581 return 0;
1582 ret = trace_seq_printf(s, "%5lu.%06lu: ", secs, usec_rem);
1583 if (!ret)
1584 return 0;
1585
1586 switch (entry->type) {
1587 case TRACE_FN:
1588 ret = seq_print_ip_sym(s, entry->fn.ip, sym_flags);
1589 if (!ret)
1590 return 0;
1591 if ((sym_flags & TRACE_ITER_PRINT_PARENT) &&
1592 entry->fn.parent_ip) {
1593 ret = trace_seq_printf(s, " <-");
1594 if (!ret)
1595 return 0;
1596 if (kretprobed(entry->fn.parent_ip))
1597 ret = trace_seq_puts(s, KRETPROBE_MSG);
1598 else
1599 ret = seq_print_ip_sym(s, entry->fn.parent_ip,
1600 sym_flags);
1601 if (!ret)
1602 return 0;
1603 }
1604 ret = trace_seq_printf(s, "\n");
1605 if (!ret)
1606 return 0;
1607 break;
1608 case TRACE_CTX:
1609 case TRACE_WAKE:
1610 S = entry->ctx.prev_state < sizeof(state_to_char) ?
1611 state_to_char[entry->ctx.prev_state] : 'X';
1612 T = entry->ctx.next_state < sizeof(state_to_char) ?
1613 state_to_char[entry->ctx.next_state] : 'X';
1614 ret = trace_seq_printf(s, " %5d:%3d:%c %s %5d:%3d:%c\n",
1615 entry->ctx.prev_pid,
1616 entry->ctx.prev_prio,
1617 S,
1618 entry->type == TRACE_CTX ? "==>" : " +",
1619 entry->ctx.next_pid,
1620 entry->ctx.next_prio,
1621 T);
1622 if (!ret)
1623 return 0;
1624 break;
1625 case TRACE_SPECIAL:
1626 ret = trace_seq_printf(s, "# %ld %ld %ld\n",
1627 entry->special.arg1,
1628 entry->special.arg2,
1629 entry->special.arg3);
1630 if (!ret)
1631 return 0;
1632 break;
1633 case TRACE_STACK:
1634 for (i = 0; i < FTRACE_STACK_ENTRIES; i++) {
1635 if (i) {
1636 ret = trace_seq_puts(s, " <= ");
1637 if (!ret)
1638 return 0;
1639 }
1640 ret = seq_print_ip_sym(s, entry->stack.caller[i],
1641 sym_flags);
1642 if (!ret)
1643 return 0;
1644 }
1645 ret = trace_seq_puts(s, "\n");
1646 if (!ret)
1647 return 0;
1648 break;
1649 }
1650 return 1;
1651}
1652
1653static int print_raw_fmt(struct trace_iterator *iter)
1654{
1655 struct trace_seq *s = &iter->seq;
1656 struct trace_entry *entry;
1657 int ret;
1658 int S, T;
1659
1660 entry = iter->ent;
1661
1662 ret = trace_seq_printf(s, "%d %d %llu ",
1663 entry->pid, iter->cpu, entry->t);
1664 if (!ret)
1665 return 0;
1666
1667 switch (entry->type) {
1668 case TRACE_FN:
1669 ret = trace_seq_printf(s, "%x %x\n",
1670 entry->fn.ip, entry->fn.parent_ip);
1671 if (!ret)
1672 return 0;
1673 break;
1674 case TRACE_CTX:
1675 case TRACE_WAKE:
1676 S = entry->ctx.prev_state < sizeof(state_to_char) ?
1677 state_to_char[entry->ctx.prev_state] : 'X';
1678 T = entry->ctx.next_state < sizeof(state_to_char) ?
1679 state_to_char[entry->ctx.next_state] : 'X';
1680 if (entry->type == TRACE_WAKE)
1681 S = '+';
1682 ret = trace_seq_printf(s, "%d %d %c %d %d %c\n",
1683 entry->ctx.prev_pid,
1684 entry->ctx.prev_prio,
1685 S,
1686 entry->ctx.next_pid,
1687 entry->ctx.next_prio,
1688 T);
1689 if (!ret)
1690 return 0;
1691 break;
1692 case TRACE_SPECIAL:
1693 case TRACE_STACK:
1694 ret = trace_seq_printf(s, "# %ld %ld %ld\n",
1695 entry->special.arg1,
1696 entry->special.arg2,
1697 entry->special.arg3);
1698 if (!ret)
1699 return 0;
1700 break;
1701 }
1702 return 1;
1703}
1704
1705#define SEQ_PUT_FIELD_RET(s, x) \
1706do { \
1707 if (!trace_seq_putmem(s, &(x), sizeof(x))) \
1708 return 0; \
1709} while (0)
1710
1711#define SEQ_PUT_HEX_FIELD_RET(s, x) \
1712do { \
1713 if (!trace_seq_putmem_hex(s, &(x), sizeof(x))) \
1714 return 0; \
1715} while (0)
1716
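A minimal sketch of how a printer built on these macros might look; print_pid_and_stamp() is a hypothetical helper, not part of this patch:
static int print_pid_and_stamp(struct trace_seq *s, struct trace_entry *entry)
{
	/* each macro emits the field, or returns 0 when the seq buffer is full */
	SEQ_PUT_FIELD_RET(s, entry->pid);
	SEQ_PUT_HEX_FIELD_RET(s, entry->t);
	return 1;
}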
1717static int print_hex_fmt(struct trace_iterator *iter)
1718{
1719 struct trace_seq *s = &iter->seq;
1720 unsigned char newline = '\n';
1721 struct trace_entry *entry;
1722 int S, T;
1723
1724 entry = iter->ent;
1725
1726 SEQ_PUT_HEX_FIELD_RET(s, entry->pid);
1727 SEQ_PUT_HEX_FIELD_RET(s, iter->cpu);
1728 SEQ_PUT_HEX_FIELD_RET(s, entry->t);
1729
1730 switch (entry->type) {
1731 case TRACE_FN:
1732 SEQ_PUT_HEX_FIELD_RET(s, entry->fn.ip);
1733 SEQ_PUT_HEX_FIELD_RET(s, entry->fn.parent_ip);
1734 break;
1735 case TRACE_CTX:
1736 case TRACE_WAKE:
1737 S = entry->ctx.prev_state < sizeof(state_to_char) ?
1738 state_to_char[entry->ctx.prev_state] : 'X';
1739 T = entry->ctx.next_state < sizeof(state_to_char) ?
1740 state_to_char[entry->ctx.next_state] : 'X';
1741 if (entry->type == TRACE_WAKE)
1742 S = '+';
1743 SEQ_PUT_HEX_FIELD_RET(s, entry->ctx.prev_pid);
1744 SEQ_PUT_HEX_FIELD_RET(s, entry->ctx.prev_prio);
1745 SEQ_PUT_HEX_FIELD_RET(s, S);
1746 SEQ_PUT_HEX_FIELD_RET(s, entry->ctx.next_pid);
1747 SEQ_PUT_HEX_FIELD_RET(s, entry->ctx.next_prio);
1748 SEQ_PUT_HEX_FIELD_RET(s, entry->fn.parent_ip);
1749 SEQ_PUT_HEX_FIELD_RET(s, T);
1750 break;
1751 case TRACE_SPECIAL:
1752 case TRACE_STACK:
1753 SEQ_PUT_HEX_FIELD_RET(s, entry->special.arg1);
1754 SEQ_PUT_HEX_FIELD_RET(s, entry->special.arg2);
1755 SEQ_PUT_HEX_FIELD_RET(s, entry->special.arg3);
1756 break;
1757 }
1758 SEQ_PUT_FIELD_RET(s, newline);
1759
1760 return 1;
1761}
1762
1763static int print_bin_fmt(struct trace_iterator *iter)
1764{
1765 struct trace_seq *s = &iter->seq;
1766 struct trace_entry *entry;
1767
1768 entry = iter->ent;
1769
1770 SEQ_PUT_FIELD_RET(s, entry->pid);
1771 SEQ_PUT_FIELD_RET(s, entry->cpu);
1772 SEQ_PUT_FIELD_RET(s, entry->t);
1773
1774 switch (entry->type) {
1775 case TRACE_FN:
1776 SEQ_PUT_FIELD_RET(s, entry->fn.ip);
1777 SEQ_PUT_FIELD_RET(s, entry->fn.parent_ip);
1778 break;
1779 case TRACE_CTX:
1780 SEQ_PUT_FIELD_RET(s, entry->ctx.prev_pid);
1781 SEQ_PUT_FIELD_RET(s, entry->ctx.prev_prio);
1782 SEQ_PUT_FIELD_RET(s, entry->ctx.prev_state);
1783 SEQ_PUT_FIELD_RET(s, entry->ctx.next_pid);
1784 SEQ_PUT_FIELD_RET(s, entry->ctx.next_prio);
1785 SEQ_PUT_FIELD_RET(s, entry->ctx.next_state);
1786 break;
1787 case TRACE_SPECIAL:
1788 case TRACE_STACK:
1789 SEQ_PUT_FIELD_RET(s, entry->special.arg1);
1790 SEQ_PUT_FIELD_RET(s, entry->special.arg2);
1791 SEQ_PUT_FIELD_RET(s, entry->special.arg3);
1792 break;
1793 }
1794 return 1;
1795}
1796
1797static int trace_empty(struct trace_iterator *iter)
1798{
1799 struct trace_array_cpu *data;
1800 int cpu;
1801
1802 for_each_tracing_cpu(cpu) {
1803 data = iter->tr->data[cpu];
1804
1805 if (head_page(data) && data->trace_idx &&
1806 (data->trace_tail != data->trace_head ||
1807 data->trace_tail_idx != data->trace_head_idx))
1808 return 0;
1809 }
1810 return 1;
1811}
1812
1813static int print_trace_line(struct trace_iterator *iter)
1814{
1815 if (iter->trace && iter->trace->print_line)
1816 return iter->trace->print_line(iter);
1817
1818 if (trace_flags & TRACE_ITER_BIN)
1819 return print_bin_fmt(iter);
1820
1821 if (trace_flags & TRACE_ITER_HEX)
1822 return print_hex_fmt(iter);
1823
1824 if (trace_flags & TRACE_ITER_RAW)
1825 return print_raw_fmt(iter);
1826
1827 if (iter->iter_flags & TRACE_FILE_LAT_FMT)
1828 return print_lat_fmt(iter, iter->idx, iter->cpu);
1829
1830 return print_trace_fmt(iter);
1831}
1832
1833static int s_show(struct seq_file *m, void *v)
1834{
1835 struct trace_iterator *iter = v;
1836
1837 if (iter->ent == NULL) {
1838 if (iter->tr) {
1839 seq_printf(m, "# tracer: %s\n", iter->trace->name);
1840 seq_puts(m, "#\n");
1841 }
1842 if (iter->iter_flags & TRACE_FILE_LAT_FMT) {
1843 /* print nothing if the buffers are empty */
1844 if (trace_empty(iter))
1845 return 0;
1846 print_trace_header(m, iter);
1847 if (!(trace_flags & TRACE_ITER_VERBOSE))
1848 print_lat_help_header(m);
1849 } else {
1850 if (!(trace_flags & TRACE_ITER_VERBOSE))
1851 print_func_help_header(m);
1852 }
1853 } else {
1854 print_trace_line(iter);
1855 trace_print_seq(m, &iter->seq);
1856 }
1857
1858 return 0;
1859}
1860
1861static struct seq_operations tracer_seq_ops = {
1862 .start = s_start,
1863 .next = s_next,
1864 .stop = s_stop,
1865 .show = s_show,
1866};
1867
1868static struct trace_iterator *
1869__tracing_open(struct inode *inode, struct file *file, int *ret)
1870{
1871 struct trace_iterator *iter;
1872
1873 if (tracing_disabled) {
1874 *ret = -ENODEV;
1875 return NULL;
1876 }
1877
1878 iter = kzalloc(sizeof(*iter), GFP_KERNEL);
1879 if (!iter) {
1880 *ret = -ENOMEM;
1881 goto out;
1882 }
1883
1884 mutex_lock(&trace_types_lock);
1885 if (current_trace && current_trace->print_max)
1886 iter->tr = &max_tr;
1887 else
1888 iter->tr = inode->i_private;
1889 iter->trace = current_trace;
1890 iter->pos = -1;
1891
1892 /* TODO stop tracer */
1893 *ret = seq_open(file, &tracer_seq_ops);
1894 if (!*ret) {
1895 struct seq_file *m = file->private_data;
1896 m->private = iter;
1897
1898 /* stop the trace while dumping */
1899 if (iter->tr->ctrl) {
1900 tracer_enabled = 0;
1901 ftrace_function_enabled = 0;
1902 }
1903
1904 if (iter->trace && iter->trace->open)
1905 iter->trace->open(iter);
1906 } else {
1907 kfree(iter);
1908 iter = NULL;
1909 }
1910 mutex_unlock(&trace_types_lock);
1911
1912 out:
1913 return iter;
1914}
1915
1916int tracing_open_generic(struct inode *inode, struct file *filp)
1917{
1918 if (tracing_disabled)
1919 return -ENODEV;
1920
1921 filp->private_data = inode->i_private;
1922 return 0;
1923}
1924
1925int tracing_release(struct inode *inode, struct file *file)
1926{
1927 struct seq_file *m = (struct seq_file *)file->private_data;
1928 struct trace_iterator *iter = m->private;
1929
1930 mutex_lock(&trace_types_lock);
1931 if (iter->trace && iter->trace->close)
1932 iter->trace->close(iter);
1933
1934 /* reenable tracing if it was previously enabled */
1935 if (iter->tr->ctrl) {
1936 tracer_enabled = 1;
1937 /*
1938 * It is safe to enable function tracing even if it
1939 * isn't used
1940 */
1941 ftrace_function_enabled = 1;
1942 }
1943 mutex_unlock(&trace_types_lock);
1944
1945 seq_release(inode, file);
1946 kfree(iter);
1947 return 0;
1948}
1949
1950static int tracing_open(struct inode *inode, struct file *file)
1951{
1952 int ret;
1953
1954 __tracing_open(inode, file, &ret);
1955
1956 return ret;
1957}
1958
1959static int tracing_lt_open(struct inode *inode, struct file *file)
1960{
1961 struct trace_iterator *iter;
1962 int ret;
1963
1964 iter = __tracing_open(inode, file, &ret);
1965
1966 if (!ret)
1967 iter->iter_flags |= TRACE_FILE_LAT_FMT;
1968
1969 return ret;
1970}
1971
1972
1973static void *
1974t_next(struct seq_file *m, void *v, loff_t *pos)
1975{
1976 struct tracer *t = m->private;
1977
1978 (*pos)++;
1979
1980 if (t)
1981 t = t->next;
1982
1983 m->private = t;
1984
1985 return t;
1986}
1987
1988static void *t_start(struct seq_file *m, loff_t *pos)
1989{
1990 struct tracer *t = m->private;
1991 loff_t l = 0;
1992
1993 mutex_lock(&trace_types_lock);
1994 for (; t && l < *pos; t = t_next(m, t, &l))
1995 ;
1996
1997 return t;
1998}
1999
2000static void t_stop(struct seq_file *m, void *p)
2001{
2002 mutex_unlock(&trace_types_lock);
2003}
2004
2005static int t_show(struct seq_file *m, void *v)
2006{
2007 struct tracer *t = v;
2008
2009 if (!t)
2010 return 0;
2011
2012 seq_printf(m, "%s", t->name);
2013 if (t->next)
2014 seq_putc(m, ' ');
2015 else
2016 seq_putc(m, '\n');
2017
2018 return 0;
2019}
2020
2021static struct seq_operations show_traces_seq_ops = {
2022 .start = t_start,
2023 .next = t_next,
2024 .stop = t_stop,
2025 .show = t_show,
2026};
2027
2028static int show_traces_open(struct inode *inode, struct file *file)
2029{
2030 int ret;
2031
2032 if (tracing_disabled)
2033 return -ENODEV;
2034
2035 ret = seq_open(file, &show_traces_seq_ops);
2036 if (!ret) {
2037 struct seq_file *m = file->private_data;
2038 m->private = trace_types;
2039 }
2040
2041 return ret;
2042}
2043
2044static struct file_operations tracing_fops = {
2045 .open = tracing_open,
2046 .read = seq_read,
2047 .llseek = seq_lseek,
2048 .release = tracing_release,
2049};
2050
2051static struct file_operations tracing_lt_fops = {
2052 .open = tracing_lt_open,
2053 .read = seq_read,
2054 .llseek = seq_lseek,
2055 .release = tracing_release,
2056};
2057
2058static struct file_operations show_traces_fops = {
2059 .open = show_traces_open,
2060 .read = seq_read,
2061 .release = seq_release,
2062};
2063
2064/*
2065 * Only trace on a CPU if the bitmask is set:
2066 */
2067static cpumask_t tracing_cpumask = CPU_MASK_ALL;
2068
2069/*
2070 * When tracing/tracing_cpumask is modified, this holds
2071 * the new bitmask we are about to install:
2072 */
2073static cpumask_t tracing_cpumask_new;
2074
2075/*
2076 * The tracer itself will not take this lock, but we still want
2077 * to provide a consistent cpumask to user-space:
2078 */
2079static DEFINE_MUTEX(tracing_cpumask_update_lock);
2080
2081/*
2082 * Temporary storage for the character representation of the
2083 * CPU bitmask (and one more byte for the newline):
2084 */
2085static char mask_str[NR_CPUS + 1];
2086
2087static ssize_t
2088tracing_cpumask_read(struct file *filp, char __user *ubuf,
2089 size_t count, loff_t *ppos)
2090{
2091 int len;
2092
2093 mutex_lock(&tracing_cpumask_update_lock);
2094
2095 len = cpumask_scnprintf(mask_str, count, tracing_cpumask);
2096 if (count - len < 2) {
2097 count = -EINVAL;
2098 goto out_err;
2099 }
2100 len += sprintf(mask_str + len, "\n");
2101 count = simple_read_from_buffer(ubuf, count, ppos, mask_str, NR_CPUS+1);
2102
2103out_err:
2104 mutex_unlock(&tracing_cpumask_update_lock);
2105
2106 return count;
2107}
2108
2109static ssize_t
2110tracing_cpumask_write(struct file *filp, const char __user *ubuf,
2111 size_t count, loff_t *ppos)
2112{
2113 int err, cpu;
2114
2115 mutex_lock(&tracing_cpumask_update_lock);
2116 err = cpumask_parse_user(ubuf, count, tracing_cpumask_new);
2117 if (err)
2118 goto err_unlock;
2119
2120 raw_local_irq_disable();
2121 __raw_spin_lock(&ftrace_max_lock);
2122 for_each_tracing_cpu(cpu) {
2123 /*
2124 * Increase/decrease the disabled counter if we are
2125 * about to flip a bit in the cpumask:
2126 */
2127 if (cpu_isset(cpu, tracing_cpumask) &&
2128 !cpu_isset(cpu, tracing_cpumask_new)) {
2129 atomic_inc(&global_trace.data[cpu]->disabled);
2130 }
2131 if (!cpu_isset(cpu, tracing_cpumask) &&
2132 cpu_isset(cpu, tracing_cpumask_new)) {
2133 atomic_dec(&global_trace.data[cpu]->disabled);
2134 }
2135 }
2136 __raw_spin_unlock(&ftrace_max_lock);
2137 raw_local_irq_enable();
2138
2139 tracing_cpumask = tracing_cpumask_new;
2140
2141 mutex_unlock(&tracing_cpumask_update_lock);
2142
2143 return count;
2144
2145err_unlock:
2146 mutex_unlock(&tracing_cpumask_update_lock);
2147
2148 return err;
2149}
2150
2151static struct file_operations tracing_cpumask_fops = {
2152 .open = tracing_open_generic,
2153 .read = tracing_cpumask_read,
2154 .write = tracing_cpumask_write,
2155};
2156
2157static ssize_t
2158tracing_iter_ctrl_read(struct file *filp, char __user *ubuf,
2159 size_t cnt, loff_t *ppos)
2160{
2161 char *buf;
2162 int r = 0;
2163 int len = 0;
2164 int i;
2165
2166	/* calculate max size */
2167 for (i = 0; trace_options[i]; i++) {
2168 len += strlen(trace_options[i]);
2169 len += 3; /* "no" and space */
2170 }
2171
2172 /* +2 for \n and \0 */
2173 buf = kmalloc(len + 2, GFP_KERNEL);
2174 if (!buf)
2175 return -ENOMEM;
2176
2177 for (i = 0; trace_options[i]; i++) {
2178 if (trace_flags & (1 << i))
2179 r += sprintf(buf + r, "%s ", trace_options[i]);
2180 else
2181 r += sprintf(buf + r, "no%s ", trace_options[i]);
2182 }
2183
2184 r += sprintf(buf + r, "\n");
2185 WARN_ON(r >= len + 2);
2186
2187 r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
2188
2189 kfree(buf);
2190
2191 return r;
2192}
2193
2194static ssize_t
2195tracing_iter_ctrl_write(struct file *filp, const char __user *ubuf,
2196 size_t cnt, loff_t *ppos)
2197{
2198 char buf[64];
2199 char *cmp = buf;
2200 int neg = 0;
2201 int i;
2202
2203 if (cnt >= sizeof(buf))
2204 return -EINVAL;
2205
2206 if (copy_from_user(&buf, ubuf, cnt))
2207 return -EFAULT;
2208
2209 buf[cnt] = 0;
2210
2211 if (strncmp(buf, "no", 2) == 0) {
2212 neg = 1;
2213 cmp += 2;
2214 }
2215
2216 for (i = 0; trace_options[i]; i++) {
2217 int len = strlen(trace_options[i]);
2218
2219 if (strncmp(cmp, trace_options[i], len) == 0) {
2220 if (neg)
2221 trace_flags &= ~(1 << i);
2222 else
2223 trace_flags |= (1 << i);
2224 break;
2225 }
2226 }
2227 /*
2228 * If no option could be set, return an error:
2229 */
2230 if (!trace_options[i])
2231 return -EINVAL;
2232
2233 filp->f_pos += cnt;
2234
2235 return cnt;
2236}
2237
2238static struct file_operations tracing_iter_fops = {
2239 .open = tracing_open_generic,
2240 .read = tracing_iter_ctrl_read,
2241 .write = tracing_iter_ctrl_write,
2242};
2243
2244static const char readme_msg[] =
2245 "tracing mini-HOWTO:\n\n"
2246 "# mkdir /debug\n"
2247 "# mount -t debugfs nodev /debug\n\n"
2248 "# cat /debug/tracing/available_tracers\n"
2249 "wakeup preemptirqsoff preemptoff irqsoff ftrace sched_switch none\n\n"
2250 "# cat /debug/tracing/current_tracer\n"
2251 "none\n"
2252 "# echo sched_switch > /debug/tracing/current_tracer\n"
2253 "# cat /debug/tracing/current_tracer\n"
2254 "sched_switch\n"
2255 "# cat /debug/tracing/iter_ctrl\n"
2256 "noprint-parent nosym-offset nosym-addr noverbose\n"
2257 "# echo print-parent > /debug/tracing/iter_ctrl\n"
2258 "# echo 1 > /debug/tracing/tracing_enabled\n"
2259 "# cat /debug/tracing/trace > /tmp/trace.txt\n"
2260 "echo 0 > /debug/tracing/tracing_enabled\n"
2261;
2262
2263static ssize_t
2264tracing_readme_read(struct file *filp, char __user *ubuf,
2265 size_t cnt, loff_t *ppos)
2266{
2267 return simple_read_from_buffer(ubuf, cnt, ppos,
2268 readme_msg, strlen(readme_msg));
2269}
2270
2271static struct file_operations tracing_readme_fops = {
2272 .open = tracing_open_generic,
2273 .read = tracing_readme_read,
2274};
2275
2276static ssize_t
2277tracing_ctrl_read(struct file *filp, char __user *ubuf,
2278 size_t cnt, loff_t *ppos)
2279{
2280 struct trace_array *tr = filp->private_data;
2281 char buf[64];
2282 int r;
2283
2284 r = sprintf(buf, "%ld\n", tr->ctrl);
2285 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
2286}
2287
2288static ssize_t
2289tracing_ctrl_write(struct file *filp, const char __user *ubuf,
2290 size_t cnt, loff_t *ppos)
2291{
2292 struct trace_array *tr = filp->private_data;
2293 char buf[64];
2294 long val;
2295 int ret;
2296
2297 if (cnt >= sizeof(buf))
2298 return -EINVAL;
2299
2300 if (copy_from_user(&buf, ubuf, cnt))
2301 return -EFAULT;
2302
2303 buf[cnt] = 0;
2304
2305 ret = strict_strtoul(buf, 10, &val);
2306 if (ret < 0)
2307 return ret;
2308
2309 val = !!val;
2310
2311 mutex_lock(&trace_types_lock);
2312 if (tr->ctrl ^ val) {
2313 if (val)
2314 tracer_enabled = 1;
2315 else
2316 tracer_enabled = 0;
2317
2318 tr->ctrl = val;
2319
2320 if (current_trace && current_trace->ctrl_update)
2321 current_trace->ctrl_update(tr);
2322 }
2323 mutex_unlock(&trace_types_lock);
2324
2325 filp->f_pos += cnt;
2326
2327 return cnt;
2328}
2329
2330static ssize_t
2331tracing_set_trace_read(struct file *filp, char __user *ubuf,
2332 size_t cnt, loff_t *ppos)
2333{
2334 char buf[max_tracer_type_len+2];
2335 int r;
2336
2337 mutex_lock(&trace_types_lock);
2338 if (current_trace)
2339 r = sprintf(buf, "%s\n", current_trace->name);
2340 else
2341 r = sprintf(buf, "\n");
2342 mutex_unlock(&trace_types_lock);
2343
2344 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
2345}
2346
2347static ssize_t
2348tracing_set_trace_write(struct file *filp, const char __user *ubuf,
2349 size_t cnt, loff_t *ppos)
2350{
2351 struct trace_array *tr = &global_trace;
2352 struct tracer *t;
2353 char buf[max_tracer_type_len+1];
2354 int i;
2355
2356 if (cnt > max_tracer_type_len)
2357 cnt = max_tracer_type_len;
2358
2359 if (copy_from_user(&buf, ubuf, cnt))
2360 return -EFAULT;
2361
2362 buf[cnt] = 0;
2363
2364	/* strip trailing whitespace. */
2365 for (i = cnt - 1; i > 0 && isspace(buf[i]); i--)
2366 buf[i] = 0;
2367
2368 mutex_lock(&trace_types_lock);
2369 for (t = trace_types; t; t = t->next) {
2370 if (strcmp(t->name, buf) == 0)
2371 break;
2372 }
2373 if (!t || t == current_trace)
2374 goto out;
2375
2376 if (current_trace && current_trace->reset)
2377 current_trace->reset(tr);
2378
2379 current_trace = t;
2380 if (t->init)
2381 t->init(tr);
2382
2383 out:
2384 mutex_unlock(&trace_types_lock);
2385
2386 filp->f_pos += cnt;
2387
2388 return cnt;
2389}
2390
2391static ssize_t
2392tracing_max_lat_read(struct file *filp, char __user *ubuf,
2393 size_t cnt, loff_t *ppos)
2394{
2395 unsigned long *ptr = filp->private_data;
2396 char buf[64];
2397 int r;
2398
2399 r = snprintf(buf, sizeof(buf), "%ld\n",
2400 *ptr == (unsigned long)-1 ? -1 : nsecs_to_usecs(*ptr));
2401 if (r > sizeof(buf))
2402 r = sizeof(buf);
2403 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
2404}
2405
2406static ssize_t
2407tracing_max_lat_write(struct file *filp, const char __user *ubuf,
2408 size_t cnt, loff_t *ppos)
2409{
2410 long *ptr = filp->private_data;
2411 char buf[64];
2412 long val;
2413 int ret;
2414
2415 if (cnt >= sizeof(buf))
2416 return -EINVAL;
2417
2418 if (copy_from_user(&buf, ubuf, cnt))
2419 return -EFAULT;
2420
2421 buf[cnt] = 0;
2422
2423 ret = strict_strtoul(buf, 10, &val);
2424 if (ret < 0)
2425 return ret;
2426
2427 *ptr = val * 1000;
2428
2429 return cnt;
2430}
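An illustrative note on the units handled here (the number is made up):
/*
 * Writing "250" stores 250 * 1000 == 250000: the value is taken in
 * microseconds and kept internally in nanoseconds, which matches the
 * nsecs_to_usecs() conversion in the read path above.
 */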
2431
2432static atomic_t tracing_reader;
2433
2434static int tracing_open_pipe(struct inode *inode, struct file *filp)
2435{
2436 struct trace_iterator *iter;
2437
2438 if (tracing_disabled)
2439 return -ENODEV;
2440
2441	/* We only allow one reader of the pipe */
2442 if (atomic_inc_return(&tracing_reader) != 1) {
2443 atomic_dec(&tracing_reader);
2444 return -EBUSY;
2445 }
2446
2447 /* create a buffer to store the information to pass to userspace */
2448 iter = kzalloc(sizeof(*iter), GFP_KERNEL);
2449 if (!iter)
2450 return -ENOMEM;
2451
2452 mutex_lock(&trace_types_lock);
2453 iter->tr = &global_trace;
2454 iter->trace = current_trace;
2455 filp->private_data = iter;
2456
2457 if (iter->trace->pipe_open)
2458 iter->trace->pipe_open(iter);
2459 mutex_unlock(&trace_types_lock);
2460
2461 return 0;
2462}
2463
2464static int tracing_release_pipe(struct inode *inode, struct file *file)
2465{
2466 struct trace_iterator *iter = file->private_data;
2467
2468 kfree(iter);
2469 atomic_dec(&tracing_reader);
2470
2471 return 0;
2472}
2473
2474static unsigned int
2475tracing_poll_pipe(struct file *filp, poll_table *poll_table)
2476{
2477 struct trace_iterator *iter = filp->private_data;
2478
2479 if (trace_flags & TRACE_ITER_BLOCK) {
2480 /*
2481 * Always select as readable when in blocking mode
2482 */
2483 return POLLIN | POLLRDNORM;
2484 } else {
2485 if (!trace_empty(iter))
2486 return POLLIN | POLLRDNORM;
2487 poll_wait(filp, &trace_wait, poll_table);
2488 if (!trace_empty(iter))
2489 return POLLIN | POLLRDNORM;
2490
2491 return 0;
2492 }
2493}
2494
2495/*
2496 * Consumer reader.
2497 */
2498static ssize_t
2499tracing_read_pipe(struct file *filp, char __user *ubuf,
2500 size_t cnt, loff_t *ppos)
2501{
2502 struct trace_iterator *iter = filp->private_data;
2503 struct trace_array_cpu *data;
2504 static cpumask_t mask;
2505 unsigned long flags;
2506#ifdef CONFIG_FTRACE
2507 int ftrace_save;
2508#endif
2509 int cpu;
2510 ssize_t sret;
2511
2512 /* return any leftover data */
2513 sret = trace_seq_to_user(&iter->seq, ubuf, cnt);
2514 if (sret != -EBUSY)
2515 return sret;
2516 sret = 0;
2517
2518 trace_seq_reset(&iter->seq);
2519
2520 mutex_lock(&trace_types_lock);
2521 if (iter->trace->read) {
2522 sret = iter->trace->read(iter, filp, ubuf, cnt, ppos);
2523 if (sret)
2524 goto out;
2525 }
2526
2527 while (trace_empty(iter)) {
2528
2529 if ((filp->f_flags & O_NONBLOCK)) {
2530 sret = -EAGAIN;
2531 goto out;
2532 }
2533
2534 /*
2535		 * This is a makeshift waitqueue. We don't use
2536		 * an actual wait queue because:
2537		 * 1) we only ever have one waiter
2538		 * 2) the tracer traces all functions, and we don't want
2539		 *    the overhead of calling wake_up and friends
2540		 *    (and tracing them too)
2541		 * Anyway, this is really a very primitive wakeup.
2542 */
2543 set_current_state(TASK_INTERRUPTIBLE);
2544 iter->tr->waiter = current;
2545
2546 mutex_unlock(&trace_types_lock);
2547
2548 /* sleep for 100 msecs, and try again. */
2549 schedule_timeout(HZ/10);
2550
2551 mutex_lock(&trace_types_lock);
2552
2553 iter->tr->waiter = NULL;
2554
2555 if (signal_pending(current)) {
2556 sret = -EINTR;
2557 goto out;
2558 }
2559
2560 if (iter->trace != current_trace)
2561 goto out;
2562
2563 /*
2564 * We block until we read something and tracing is disabled.
2565 * We still block if tracing is disabled, but we have never
2566 * read anything. This allows a user to cat this file, and
2567 * then enable tracing. But after we have read something,
2568 * we give an EOF when tracing is again disabled.
2569 *
2570 * iter->pos will be 0 if we haven't read anything.
2571 */
2572 if (!tracer_enabled && iter->pos)
2573 break;
2574
2575 continue;
2576 }
2577
2578 /* stop when tracing is finished */
2579 if (trace_empty(iter))
2580 goto out;
2581
2582 if (cnt >= PAGE_SIZE)
2583 cnt = PAGE_SIZE - 1;
2584
2585 /* reset all but tr, trace, and overruns */
2586 memset(&iter->seq, 0,
2587 sizeof(struct trace_iterator) -
2588 offsetof(struct trace_iterator, seq));
2589 iter->pos = -1;
2590
2591 /*
2592	 * We need to stop all tracing on all CPUs to read
2593	 * the next buffer. This is a bit expensive, but is
2594	 * not done often. We fill all that we can read,
2595 * and then release the locks again.
2596 */
2597
2598 cpus_clear(mask);
2599 local_irq_save(flags);
2600#ifdef CONFIG_FTRACE
2601 ftrace_save = ftrace_enabled;
2602 ftrace_enabled = 0;
2603#endif
2604 smp_wmb();
2605 for_each_tracing_cpu(cpu) {
2606 data = iter->tr->data[cpu];
2607
2608 if (!head_page(data) || !data->trace_idx)
2609 continue;
2610
2611 atomic_inc(&data->disabled);
2612 cpu_set(cpu, mask);
2613 }
2614
2615 for_each_cpu_mask(cpu, mask) {
2616 data = iter->tr->data[cpu];
2617 __raw_spin_lock(&data->lock);
2618
2619 if (data->overrun > iter->last_overrun[cpu])
2620 iter->overrun[cpu] +=
2621 data->overrun - iter->last_overrun[cpu];
2622 iter->last_overrun[cpu] = data->overrun;
2623 }
2624
2625 while (find_next_entry_inc(iter) != NULL) {
2626 int ret;
2627 int len = iter->seq.len;
2628
2629 ret = print_trace_line(iter);
2630 if (!ret) {
2631 /* don't print partial lines */
2632 iter->seq.len = len;
2633 break;
2634 }
2635
2636 trace_consume(iter);
2637
2638 if (iter->seq.len >= cnt)
2639 break;
2640 }
2641
2642 for_each_cpu_mask(cpu, mask) {
2643 data = iter->tr->data[cpu];
2644 __raw_spin_unlock(&data->lock);
2645 }
2646
2647 for_each_cpu_mask(cpu, mask) {
2648 data = iter->tr->data[cpu];
2649 atomic_dec(&data->disabled);
2650 }
2651#ifdef CONFIG_FTRACE
2652 ftrace_enabled = ftrace_save;
2653#endif
2654 local_irq_restore(flags);
2655
2656 /* Now copy what we have to the user */
2657 sret = trace_seq_to_user(&iter->seq, ubuf, cnt);
2658 if (iter->seq.readpos >= iter->seq.len)
2659 trace_seq_reset(&iter->seq);
2660 if (sret == -EBUSY)
2661 sret = 0;
2662
2663out:
2664 mutex_unlock(&trace_types_lock);
2665
2666 return sret;
2667}
2668
2669static ssize_t
2670tracing_entries_read(struct file *filp, char __user *ubuf,
2671 size_t cnt, loff_t *ppos)
2672{
2673 struct trace_array *tr = filp->private_data;
2674 char buf[64];
2675 int r;
2676
2677 r = sprintf(buf, "%lu\n", tr->entries);
2678 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
2679}
2680
2681static ssize_t
2682tracing_entries_write(struct file *filp, const char __user *ubuf,
2683 size_t cnt, loff_t *ppos)
2684{
2685 unsigned long val;
2686 char buf[64];
2687 int i, ret;
2688
2689 if (cnt >= sizeof(buf))
2690 return -EINVAL;
2691
2692 if (copy_from_user(&buf, ubuf, cnt))
2693 return -EFAULT;
2694
2695 buf[cnt] = 0;
2696
2697 ret = strict_strtoul(buf, 10, &val);
2698 if (ret < 0)
2699 return ret;
2700
2701 /* must have at least 1 entry */
2702 if (!val)
2703 return -EINVAL;
2704
2705 mutex_lock(&trace_types_lock);
2706
2707 if (current_trace != &no_tracer) {
2708 cnt = -EBUSY;
2709 pr_info("ftrace: set current_tracer to none"
2710 " before modifying buffer size\n");
2711 goto out;
2712 }
2713
2714 if (val > global_trace.entries) {
2715 long pages_requested;
2716 unsigned long freeable_pages;
2717
2718 /* make sure we have enough memory before mapping */
2719 pages_requested =
2720 (val + (ENTRIES_PER_PAGE-1)) / ENTRIES_PER_PAGE;
2721
2722 /* account for each buffer (and max_tr) */
2723 pages_requested *= tracing_nr_buffers * 2;
2724
2725 /* Check for overflow */
2726 if (pages_requested < 0) {
2727 cnt = -ENOMEM;
2728 goto out;
2729 }
2730
2731 freeable_pages = determine_dirtyable_memory();
2732
2733		/* we only allow requesting 1/4 of usable memory */
2734 if (pages_requested >
2735 ((freeable_pages + tracing_pages_allocated) / 4)) {
2736 cnt = -ENOMEM;
2737 goto out;
2738 }
2739
2740 while (global_trace.entries < val) {
2741 if (trace_alloc_page()) {
2742 cnt = -ENOMEM;
2743 goto out;
2744 }
2745 /* double check that we don't go over the known pages */
2746 if (tracing_pages_allocated > pages_requested)
2747 break;
2748 }
2749
2750 } else {
2751		/* free pages until we are within one page's worth of entries of val */
2752 while (global_trace.entries > val + (ENTRIES_PER_PAGE - 1))
2753 trace_free_page();
2754 }
2755
2756 /* check integrity */
2757 for_each_tracing_cpu(i)
2758 check_pages(global_trace.data[i]);
2759
2760 filp->f_pos += cnt;
2761
2762 /* If check pages failed, return ENOMEM */
2763 if (tracing_disabled)
2764 cnt = -ENOMEM;
2765 out:
2766 max_tr.entries = global_trace.entries;
2767 mutex_unlock(&trace_types_lock);
2768
2769 return cnt;
2770}
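A worked example of the sizing math above; the ENTRIES_PER_PAGE and CPU-count values are illustrative only:
/*
 * If ENTRIES_PER_PAGE were 170 and there were 4 possible CPUs, writing
 * "1000" would request (1000 + 169) / 170 == 6 pages per ring, and with
 * tracing_nr_buffers * 2 == 8 rings (each CPU's trace buffer plus its
 * max_tr shadow) that comes to 48 pages, checked against roughly 1/4 of
 * the freeable memory before any allocation is attempted.
 */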
2771
2772static struct file_operations tracing_max_lat_fops = {
2773 .open = tracing_open_generic,
2774 .read = tracing_max_lat_read,
2775 .write = tracing_max_lat_write,
2776};
2777
2778static struct file_operations tracing_ctrl_fops = {
2779 .open = tracing_open_generic,
2780 .read = tracing_ctrl_read,
2781 .write = tracing_ctrl_write,
2782};
2783
2784static struct file_operations set_tracer_fops = {
2785 .open = tracing_open_generic,
2786 .read = tracing_set_trace_read,
2787 .write = tracing_set_trace_write,
2788};
2789
2790static struct file_operations tracing_pipe_fops = {
2791 .open = tracing_open_pipe,
2792 .poll = tracing_poll_pipe,
2793 .read = tracing_read_pipe,
2794 .release = tracing_release_pipe,
2795};
2796
2797static struct file_operations tracing_entries_fops = {
2798 .open = tracing_open_generic,
2799 .read = tracing_entries_read,
2800 .write = tracing_entries_write,
2801};
2802
2803#ifdef CONFIG_DYNAMIC_FTRACE
2804
2805static ssize_t
2806tracing_read_long(struct file *filp, char __user *ubuf,
2807 size_t cnt, loff_t *ppos)
2808{
2809 unsigned long *p = filp->private_data;
2810 char buf[64];
2811 int r;
2812
2813 r = sprintf(buf, "%ld\n", *p);
2814
2815 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
2816}
2817
2818static struct file_operations tracing_read_long_fops = {
2819 .open = tracing_open_generic,
2820 .read = tracing_read_long,
2821};
2822#endif
2823
2824static struct dentry *d_tracer;
2825
2826struct dentry *tracing_init_dentry(void)
2827{
2828 static int once;
2829
2830 if (d_tracer)
2831 return d_tracer;
2832
2833 d_tracer = debugfs_create_dir("tracing", NULL);
2834
2835 if (!d_tracer && !once) {
2836 once = 1;
2837 pr_warning("Could not create debugfs directory 'tracing'\n");
2838 return NULL;
2839 }
2840
2841 return d_tracer;
2842}
2843
2844#ifdef CONFIG_FTRACE_SELFTEST
2845/* Let selftest have access to static functions in this file */
2846#include "trace_selftest.c"
2847#endif
2848
2849static __init void tracer_init_debugfs(void)
2850{
2851 struct dentry *d_tracer;
2852 struct dentry *entry;
2853
2854 d_tracer = tracing_init_dentry();
2855
2856 entry = debugfs_create_file("tracing_enabled", 0644, d_tracer,
2857 &global_trace, &tracing_ctrl_fops);
2858 if (!entry)
2859 pr_warning("Could not create debugfs 'tracing_enabled' entry\n");
2860
2861 entry = debugfs_create_file("iter_ctrl", 0644, d_tracer,
2862 NULL, &tracing_iter_fops);
2863 if (!entry)
2864 pr_warning("Could not create debugfs 'iter_ctrl' entry\n");
2865
2866 entry = debugfs_create_file("tracing_cpumask", 0644, d_tracer,
2867 NULL, &tracing_cpumask_fops);
2868 if (!entry)
2869 pr_warning("Could not create debugfs 'tracing_cpumask' entry\n");
2870
2871 entry = debugfs_create_file("latency_trace", 0444, d_tracer,
2872 &global_trace, &tracing_lt_fops);
2873 if (!entry)
2874 pr_warning("Could not create debugfs 'latency_trace' entry\n");
2875
2876 entry = debugfs_create_file("trace", 0444, d_tracer,
2877 &global_trace, &tracing_fops);
2878 if (!entry)
2879 pr_warning("Could not create debugfs 'trace' entry\n");
2880
2881 entry = debugfs_create_file("available_tracers", 0444, d_tracer,
2882 &global_trace, &show_traces_fops);
2883 if (!entry)
2884		pr_warning("Could not create debugfs 'available_tracers' entry\n");
2885
2886 entry = debugfs_create_file("current_tracer", 0444, d_tracer,
2887 &global_trace, &set_tracer_fops);
2888 if (!entry)
2889		pr_warning("Could not create debugfs 'current_tracer' entry\n");
2890
2891 entry = debugfs_create_file("tracing_max_latency", 0644, d_tracer,
2892 &tracing_max_latency,
2893 &tracing_max_lat_fops);
2894 if (!entry)
2895 pr_warning("Could not create debugfs "
2896 "'tracing_max_latency' entry\n");
2897
2898 entry = debugfs_create_file("tracing_thresh", 0644, d_tracer,
2899 &tracing_thresh, &tracing_max_lat_fops);
2900 if (!entry)
2901 pr_warning("Could not create debugfs "
2902 "'tracing_threash' entry\n");
2903 entry = debugfs_create_file("README", 0644, d_tracer,
2904 NULL, &tracing_readme_fops);
2905 if (!entry)
2906 pr_warning("Could not create debugfs 'README' entry\n");
2907
2908 entry = debugfs_create_file("trace_pipe", 0644, d_tracer,
2909 NULL, &tracing_pipe_fops);
2910 if (!entry)
2911 pr_warning("Could not create debugfs "
2912 "'tracing_threash' entry\n");
2913
2914 entry = debugfs_create_file("trace_entries", 0644, d_tracer,
2915 &global_trace, &tracing_entries_fops);
2916 if (!entry)
2917 pr_warning("Could not create debugfs "
2918 "'tracing_threash' entry\n");
2919
2920#ifdef CONFIG_DYNAMIC_FTRACE
2921 entry = debugfs_create_file("dyn_ftrace_total_info", 0444, d_tracer,
2922 &ftrace_update_tot_cnt,
2923 &tracing_read_long_fops);
2924 if (!entry)
2925 pr_warning("Could not create debugfs "
2926 "'dyn_ftrace_total_info' entry\n");
2927#endif
2928#ifdef CONFIG_SYSPROF_TRACER
2929 init_tracer_sysprof_debugfs(d_tracer);
2930#endif
2931}
2932
2933static int trace_alloc_page(void)
2934{
2935 struct trace_array_cpu *data;
2936 struct page *page, *tmp;
2937 LIST_HEAD(pages);
2938 void *array;
2939 unsigned pages_allocated = 0;
2940 int i;
2941
2942 /* first allocate a page for each CPU */
2943 for_each_tracing_cpu(i) {
2944 array = (void *)__get_free_page(GFP_KERNEL);
2945 if (array == NULL) {
2946 printk(KERN_ERR "tracer: failed to allocate page"
2947 "for trace buffer!\n");
2948 goto free_pages;
2949 }
2950
2951 pages_allocated++;
2952 page = virt_to_page(array);
2953 list_add(&page->lru, &pages);
2954
2955/* Only allocate if we are actually using the max trace */
2956#ifdef CONFIG_TRACER_MAX_TRACE
2957 array = (void *)__get_free_page(GFP_KERNEL);
2958 if (array == NULL) {
2959 printk(KERN_ERR "tracer: failed to allocate page"
2960 "for trace buffer!\n");
2961 goto free_pages;
2962 }
2963 pages_allocated++;
2964 page = virt_to_page(array);
2965 list_add(&page->lru, &pages);
2966#endif
2967 }
2968
2969	/* Now that we have successfully allocated a page per CPU, add them */
2970 for_each_tracing_cpu(i) {
2971 data = global_trace.data[i];
2972 page = list_entry(pages.next, struct page, lru);
2973 list_del_init(&page->lru);
2974 list_add_tail(&page->lru, &data->trace_pages);
2975 ClearPageLRU(page);
2976
2977#ifdef CONFIG_TRACER_MAX_TRACE
2978 data = max_tr.data[i];
2979 page = list_entry(pages.next, struct page, lru);
2980 list_del_init(&page->lru);
2981 list_add_tail(&page->lru, &data->trace_pages);
2982 SetPageLRU(page);
2983#endif
2984 }
2985 tracing_pages_allocated += pages_allocated;
2986 global_trace.entries += ENTRIES_PER_PAGE;
2987
2988 return 0;
2989
2990 free_pages:
2991 list_for_each_entry_safe(page, tmp, &pages, lru) {
2992 list_del_init(&page->lru);
2993 __free_page(page);
2994 }
2995 return -ENOMEM;
2996}
2997
2998static int trace_free_page(void)
2999{
3000 struct trace_array_cpu *data;
3001 struct page *page;
3002 struct list_head *p;
3003 int i;
3004 int ret = 0;
3005
3006 /* free one page from each buffer */
3007 for_each_tracing_cpu(i) {
3008 data = global_trace.data[i];
3009 p = data->trace_pages.next;
3010 if (p == &data->trace_pages) {
3011 /* should never happen */
3012 WARN_ON(1);
3013 tracing_disabled = 1;
3014 ret = -1;
3015 break;
3016 }
3017 page = list_entry(p, struct page, lru);
3018 ClearPageLRU(page);
3019 list_del(&page->lru);
3020 tracing_pages_allocated--;
3021 tracing_pages_allocated--;
3022 __free_page(page);
3023
3024 tracing_reset(data);
3025
3026#ifdef CONFIG_TRACER_MAX_TRACE
3027 data = max_tr.data[i];
3028 p = data->trace_pages.next;
3029 if (p == &data->trace_pages) {
3030 /* should never happen */
3031 WARN_ON(1);
3032 tracing_disabled = 1;
3033 ret = -1;
3034 break;
3035 }
3036 page = list_entry(p, struct page, lru);
3037 ClearPageLRU(page);
3038 list_del(&page->lru);
3039 __free_page(page);
3040
3041 tracing_reset(data);
3042#endif
3043 }
3044 global_trace.entries -= ENTRIES_PER_PAGE;
3045
3046 return ret;
3047}
3048
3049__init static int tracer_alloc_buffers(void)
3050{
3051 struct trace_array_cpu *data;
3052 void *array;
3053 struct page *page;
3054 int pages = 0;
3055 int ret = -ENOMEM;
3056 int i;
3057
3058	/* TODO: make the number of buffers hot pluggable with CPUs */
3059 tracing_nr_buffers = num_possible_cpus();
3060 tracing_buffer_mask = cpu_possible_map;
3061
3062 /* Allocate the first page for all buffers */
3063 for_each_tracing_cpu(i) {
3064 data = global_trace.data[i] = &per_cpu(global_trace_cpu, i);
3065 max_tr.data[i] = &per_cpu(max_data, i);
3066
3067 array = (void *)__get_free_page(GFP_KERNEL);
3068 if (array == NULL) {
3069 printk(KERN_ERR "tracer: failed to allocate page"
3070 "for trace buffer!\n");
3071 goto free_buffers;
3072 }
3073
3074		/* add the page backing this array to the buffer list */
3075 INIT_LIST_HEAD(&data->trace_pages);
3076 page = virt_to_page(array);
3077 list_add(&page->lru, &data->trace_pages);
3078 /* use the LRU flag to differentiate the two buffers */
3079 ClearPageLRU(page);
3080
3081 data->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
3082 max_tr.data[i]->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
3083
3084/* Only allocate if we are actually using the max trace */
3085#ifdef CONFIG_TRACER_MAX_TRACE
3086 array = (void *)__get_free_page(GFP_KERNEL);
3087 if (array == NULL) {
3088 printk(KERN_ERR "tracer: failed to allocate page"
3089 "for trace buffer!\n");
3090 goto free_buffers;
3091 }
3092
3093 INIT_LIST_HEAD(&max_tr.data[i]->trace_pages);
3094 page = virt_to_page(array);
3095 list_add(&page->lru, &max_tr.data[i]->trace_pages);
3096 SetPageLRU(page);
3097#endif
3098 }
3099
3100 /*
3101 * Since we allocate by orders of pages, we may be able to
3102 * round up a bit.
3103 */
3104 global_trace.entries = ENTRIES_PER_PAGE;
3105 pages++;
3106
3107 while (global_trace.entries < trace_nr_entries) {
3108 if (trace_alloc_page())
3109 break;
3110 pages++;
3111 }
3112 max_tr.entries = global_trace.entries;
3113
3114 pr_info("tracer: %d pages allocated for %ld entries of %ld bytes\n",
3115 pages, trace_nr_entries, (long)TRACE_ENTRY_SIZE);
3116 pr_info(" actual entries %ld\n", global_trace.entries);
3117
3118 tracer_init_debugfs();
3119
3120 trace_init_cmdlines();
3121
3122 register_tracer(&no_tracer);
3123 current_trace = &no_tracer;
3124
3125 /* All seems OK, enable tracing */
3126 global_trace.ctrl = tracer_enabled;
3127 tracing_disabled = 0;
3128
3129 return 0;
3130
3131 free_buffers:
3132 for (i-- ; i >= 0; i--) {
3133 struct page *page, *tmp;
3134 struct trace_array_cpu *data = global_trace.data[i];
3135
3136 if (data) {
3137 list_for_each_entry_safe(page, tmp,
3138 &data->trace_pages, lru) {
3139 list_del_init(&page->lru);
3140 __free_page(page);
3141 }
3142 }
3143
3144#ifdef CONFIG_TRACER_MAX_TRACE
3145 data = max_tr.data[i];
3146 if (data) {
3147 list_for_each_entry_safe(page, tmp,
3148 &data->trace_pages, lru) {
3149 list_del_init(&page->lru);
3150 __free_page(page);
3151 }
3152 }
3153#endif
3154 }
3155 return ret;
3156}
3157fs_initcall(tracer_alloc_buffers);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
new file mode 100644
index 000000000000..f69f86788c2b
--- /dev/null
+++ b/kernel/trace/trace.h
@@ -0,0 +1,339 @@
1#ifndef _LINUX_KERNEL_TRACE_H
2#define _LINUX_KERNEL_TRACE_H
3
4#include <linux/fs.h>
5#include <asm/atomic.h>
6#include <linux/sched.h>
7#include <linux/clocksource.h>
8#include <linux/mmiotrace.h>
9
10enum trace_type {
11 __TRACE_FIRST_TYPE = 0,
12
13 TRACE_FN,
14 TRACE_CTX,
15 TRACE_WAKE,
16 TRACE_STACK,
17 TRACE_SPECIAL,
18 TRACE_MMIO_RW,
19 TRACE_MMIO_MAP,
20
21 __TRACE_LAST_TYPE
22};
23
24/*
25 * Function trace entry - function address and parent function address:
26 */
27struct ftrace_entry {
28 unsigned long ip;
29 unsigned long parent_ip;
30};
31
32/*
33 * Context switch trace entry - which task (and prio) we switched from/to:
34 */
35struct ctx_switch_entry {
36 unsigned int prev_pid;
37 unsigned char prev_prio;
38 unsigned char prev_state;
39 unsigned int next_pid;
40 unsigned char next_prio;
41 unsigned char next_state;
42};
43
44/*
45 * Special (free-form) trace entry:
46 */
47struct special_entry {
48 unsigned long arg1;
49 unsigned long arg2;
50 unsigned long arg3;
51};
52
53/*
54 * Stack-trace entry:
55 */
56
57#define FTRACE_STACK_ENTRIES 8
58
59struct stack_entry {
60 unsigned long caller[FTRACE_STACK_ENTRIES];
61};
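For illustration (the symbol names below are invented), a TRACE_STACK record holds up to FTRACE_STACK_ENTRIES return addresses and is rendered by print_trace_fmt() in trace.c as a caller chain:
/*
 *   sub_function <= calling_function <= sys_entry_point
 *
 * with " <= " separating successive callers.
 */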
62
63/*
64 * The trace entry - the most basic unit of tracing. This is what
65 * is printed in the end as a single line in the trace output, such as:
66 *
67 * bash-15816 [01] 235.197585: idle_cpu <- irq_enter
68 */
69struct trace_entry {
70 char type;
71 char cpu;
72 char flags;
73 char preempt_count;
74 int pid;
75 cycle_t t;
76 union {
77 struct ftrace_entry fn;
78 struct ctx_switch_entry ctx;
79 struct special_entry special;
80 struct stack_entry stack;
81 struct mmiotrace_rw mmiorw;
82 struct mmiotrace_map mmiomap;
83 };
84};
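A minimal sketch of how a consumer selects the active union member based on ->type; entry_ip() is a hypothetical helper, the real printers live in trace.c:
static inline unsigned long entry_ip(struct trace_entry *entry)
{
	/* only TRACE_FN records carry a sampled instruction pointer */
	return entry->type == TRACE_FN ? entry->fn.ip : 0UL;
}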
85
86#define TRACE_ENTRY_SIZE sizeof(struct trace_entry)
87
88/*
89 * The CPU trace array - it consists of thousands of trace entries
90 * plus some other descriptor data (for example, which task started
91 * the trace, etc.):
92 */
93struct trace_array_cpu {
94 struct list_head trace_pages;
95 atomic_t disabled;
96 raw_spinlock_t lock;
97 struct lock_class_key lock_key;
98
99 /* these fields get copied into max-trace: */
100 unsigned trace_head_idx;
101 unsigned trace_tail_idx;
102 void *trace_head; /* producer */
103 void *trace_tail; /* consumer */
104 unsigned long trace_idx;
105 unsigned long overrun;
106 unsigned long saved_latency;
107 unsigned long critical_start;
108 unsigned long critical_end;
109 unsigned long critical_sequence;
110 unsigned long nice;
111 unsigned long policy;
112 unsigned long rt_priority;
113 cycle_t preempt_timestamp;
114 pid_t pid;
115 uid_t uid;
116 char comm[TASK_COMM_LEN];
117};
118
119struct trace_iterator;
120
121/*
122 * The trace array - an array of per-CPU trace arrays. This is the
123 * highest level data structure that individual tracers deal with.
124 * They have on/off state as well:
125 */
126struct trace_array {
127 unsigned long entries;
128 long ctrl;
129 int cpu;
130 cycle_t time_start;
131 struct task_struct *waiter;
132 struct trace_array_cpu *data[NR_CPUS];
133};
134
135/*
136 * A specific tracer, represented by methods that operate on a trace array:
137 */
138struct tracer {
139 const char *name;
140 void (*init)(struct trace_array *tr);
141 void (*reset)(struct trace_array *tr);
142 void (*open)(struct trace_iterator *iter);
143 void (*pipe_open)(struct trace_iterator *iter);
144 void (*close)(struct trace_iterator *iter);
145 void (*start)(struct trace_iterator *iter);
146 void (*stop)(struct trace_iterator *iter);
147 ssize_t (*read)(struct trace_iterator *iter,
148 struct file *filp, char __user *ubuf,
149 size_t cnt, loff_t *ppos);
150 void (*ctrl_update)(struct trace_array *tr);
151#ifdef CONFIG_FTRACE_STARTUP_TEST
152 int (*selftest)(struct tracer *trace,
153 struct trace_array *tr);
154#endif
155 int (*print_line)(struct trace_iterator *iter);
156 struct tracer *next;
157 int print_max;
158};
159
160struct trace_seq {
161 unsigned char buffer[PAGE_SIZE];
162 unsigned int len;
163 unsigned int readpos;
164};
165
166/*
167 * Trace iterator - used by printout routines that present trace
168 * results to users; these routines might sleep, etc:
169 */
170struct trace_iterator {
171 struct trace_array *tr;
172 struct tracer *trace;
173 void *private;
174 long last_overrun[NR_CPUS];
175 long overrun[NR_CPUS];
176
177 /* The below is zeroed out in pipe_read */
178 struct trace_seq seq;
179 struct trace_entry *ent;
180 int cpu;
181
182 struct trace_entry *prev_ent;
183 int prev_cpu;
184
185 unsigned long iter_flags;
186 loff_t pos;
187 unsigned long next_idx[NR_CPUS];
188 struct list_head *next_page[NR_CPUS];
189 unsigned next_page_idx[NR_CPUS];
190 long idx;
191};
192
193void tracing_reset(struct trace_array_cpu *data);
194int tracing_open_generic(struct inode *inode, struct file *filp);
195struct dentry *tracing_init_dentry(void);
196void init_tracer_sysprof_debugfs(struct dentry *d_tracer);
197
198void ftrace(struct trace_array *tr,
199 struct trace_array_cpu *data,
200 unsigned long ip,
201 unsigned long parent_ip,
202 unsigned long flags);
203void tracing_sched_switch_trace(struct trace_array *tr,
204 struct trace_array_cpu *data,
205 struct task_struct *prev,
206 struct task_struct *next,
207 unsigned long flags);
208void tracing_record_cmdline(struct task_struct *tsk);
209
210void tracing_sched_wakeup_trace(struct trace_array *tr,
211 struct trace_array_cpu *data,
212 struct task_struct *wakee,
213 struct task_struct *cur,
214 unsigned long flags);
215void trace_special(struct trace_array *tr,
216 struct trace_array_cpu *data,
217 unsigned long arg1,
218 unsigned long arg2,
219 unsigned long arg3);
220void trace_function(struct trace_array *tr,
221 struct trace_array_cpu *data,
222 unsigned long ip,
223 unsigned long parent_ip,
224 unsigned long flags);
225
226void tracing_start_cmdline_record(void);
227void tracing_stop_cmdline_record(void);
228int register_tracer(struct tracer *type);
229void unregister_tracer(struct tracer *type);
230
231extern unsigned long nsecs_to_usecs(unsigned long nsecs);
232
233extern unsigned long tracing_max_latency;
234extern unsigned long tracing_thresh;
235
236void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu);
237void update_max_tr_single(struct trace_array *tr,
238 struct task_struct *tsk, int cpu);
239
240extern cycle_t ftrace_now(int cpu);
241
242#ifdef CONFIG_FTRACE
243void tracing_start_function_trace(void);
244void tracing_stop_function_trace(void);
245#else
246# define tracing_start_function_trace() do { } while (0)
247# define tracing_stop_function_trace() do { } while (0)
248#endif
249
250#ifdef CONFIG_CONTEXT_SWITCH_TRACER
251typedef void
252(*tracer_switch_func_t)(void *private,
253 void *__rq,
254 struct task_struct *prev,
255 struct task_struct *next);
256
257struct tracer_switch_ops {
258 tracer_switch_func_t func;
259 void *private;
260 struct tracer_switch_ops *next;
261};
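A sketch of the callback shape a tracer could chain through tracer_switch_ops; the names below are hypothetical:
static void example_switch_func(void *private, void *__rq,
				struct task_struct *prev,
				struct task_struct *next)
{
	/* a real callback would record prev->pid and next->pid here */
}

static struct tracer_switch_ops example_switch_ops = {
	.func		= example_switch_func,
	.private	= NULL,
};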
262
263#endif /* CONFIG_CONTEXT_SWITCH_TRACER */
264
265#ifdef CONFIG_DYNAMIC_FTRACE
266extern unsigned long ftrace_update_tot_cnt;
267#define DYN_FTRACE_TEST_NAME trace_selftest_dynamic_test_func
268extern int DYN_FTRACE_TEST_NAME(void);
269#endif
270
271#ifdef CONFIG_MMIOTRACE
272extern void __trace_mmiotrace_rw(struct trace_array *tr,
273 struct trace_array_cpu *data,
274 struct mmiotrace_rw *rw);
275extern void __trace_mmiotrace_map(struct trace_array *tr,
276 struct trace_array_cpu *data,
277 struct mmiotrace_map *map);
278#endif
279
280#ifdef CONFIG_FTRACE_STARTUP_TEST
281#ifdef CONFIG_FTRACE
282extern int trace_selftest_startup_function(struct tracer *trace,
283 struct trace_array *tr);
284#endif
285#ifdef CONFIG_IRQSOFF_TRACER
286extern int trace_selftest_startup_irqsoff(struct tracer *trace,
287 struct trace_array *tr);
288#endif
289#ifdef CONFIG_PREEMPT_TRACER
290extern int trace_selftest_startup_preemptoff(struct tracer *trace,
291 struct trace_array *tr);
292#endif
293#if defined(CONFIG_IRQSOFF_TRACER) && defined(CONFIG_PREEMPT_TRACER)
294extern int trace_selftest_startup_preemptirqsoff(struct tracer *trace,
295 struct trace_array *tr);
296#endif
297#ifdef CONFIG_SCHED_TRACER
298extern int trace_selftest_startup_wakeup(struct tracer *trace,
299 struct trace_array *tr);
300#endif
301#ifdef CONFIG_CONTEXT_SWITCH_TRACER
302extern int trace_selftest_startup_sched_switch(struct tracer *trace,
303 struct trace_array *tr);
304#endif
305#ifdef CONFIG_SYSPROF_TRACER
306extern int trace_selftest_startup_sysprof(struct tracer *trace,
307 struct trace_array *tr);
308#endif
309#endif /* CONFIG_FTRACE_STARTUP_TEST */
310
311extern void *head_page(struct trace_array_cpu *data);
312extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...);
313extern ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf,
314 size_t cnt);
315extern long ns2usecs(cycle_t nsec);
316
317extern unsigned long trace_flags;
318
319/*
320 * trace_iterator_flags is an enumeration that defines bit
321 * positions into trace_flags that control the output.
322 *
323 * NOTE: These bits must match the trace_options array in
324 * trace.c.
325 */
326enum trace_iterator_flags {
327 TRACE_ITER_PRINT_PARENT = 0x01,
328 TRACE_ITER_SYM_OFFSET = 0x02,
329 TRACE_ITER_SYM_ADDR = 0x04,
330 TRACE_ITER_VERBOSE = 0x08,
331 TRACE_ITER_RAW = 0x10,
332 TRACE_ITER_HEX = 0x20,
333 TRACE_ITER_BIN = 0x40,
334 TRACE_ITER_BLOCK = 0x80,
335 TRACE_ITER_STACKTRACE = 0x100,
336 TRACE_ITER_SCHED_TREE = 0x200,
337};
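A minimal sketch of how these bits are meant to be tested against trace_flags; print_parent_enabled() is a hypothetical helper mirroring what trace.c does for the "print-parent" option:
static inline int print_parent_enabled(void)
{
	return !!(trace_flags & TRACE_ITER_PRINT_PARENT);
}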
338
339#endif /* _LINUX_KERNEL_TRACE_H */
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
new file mode 100644
index 000000000000..312144897970
--- /dev/null
+++ b/kernel/trace/trace_functions.c
@@ -0,0 +1,81 @@
1/*
2 * ring buffer based function tracer
3 *
4 * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com>
5 * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com>
6 *
7 * Based on code from the latency_tracer, that is:
8 *
9 * Copyright (C) 2004-2006 Ingo Molnar
10 * Copyright (C) 2004 William Lee Irwin III
11 */
12#include <linux/debugfs.h>
13#include <linux/uaccess.h>
14#include <linux/ftrace.h>
15#include <linux/fs.h>
16
17#include "trace.h"
18
19static void function_reset(struct trace_array *tr)
20{
21 int cpu;
22
23 tr->time_start = ftrace_now(tr->cpu);
24
25 for_each_online_cpu(cpu)
26 tracing_reset(tr->data[cpu]);
27}
28
29static void start_function_trace(struct trace_array *tr)
30{
31 tr->cpu = get_cpu();
32 function_reset(tr);
33 put_cpu();
34
35 tracing_start_cmdline_record();
36 tracing_start_function_trace();
37}
38
39static void stop_function_trace(struct trace_array *tr)
40{
41 tracing_stop_function_trace();
42 tracing_stop_cmdline_record();
43}
44
45static void function_trace_init(struct trace_array *tr)
46{
47 if (tr->ctrl)
48 start_function_trace(tr);
49}
50
51static void function_trace_reset(struct trace_array *tr)
52{
53 if (tr->ctrl)
54 stop_function_trace(tr);
55}
56
57static void function_trace_ctrl_update(struct trace_array *tr)
58{
59 if (tr->ctrl)
60 start_function_trace(tr);
61 else
62 stop_function_trace(tr);
63}
64
65static struct tracer function_trace __read_mostly =
66{
67 .name = "ftrace",
68 .init = function_trace_init,
69 .reset = function_trace_reset,
70 .ctrl_update = function_trace_ctrl_update,
71#ifdef CONFIG_FTRACE_SELFTEST
72 .selftest = trace_selftest_startup_function,
73#endif
74};
75
76static __init int init_function_trace(void)
77{
78 return register_tracer(&function_trace);
79}
80
81device_initcall(init_function_trace);
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
new file mode 100644
index 000000000000..ece6cfb649fa
--- /dev/null
+++ b/kernel/trace/trace_irqsoff.c
@@ -0,0 +1,490 @@
1/*
2 * trace irqs off critical timings
3 *
4 * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com>
5 * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com>
6 *
7 * From code in the latency_tracer, that is:
8 *
9 * Copyright (C) 2004-2006 Ingo Molnar
10 * Copyright (C) 2004 William Lee Irwin III
11 */
12#include <linux/kallsyms.h>
13#include <linux/debugfs.h>
14#include <linux/uaccess.h>
15#include <linux/module.h>
16#include <linux/ftrace.h>
17#include <linux/fs.h>
18
19#include "trace.h"
20
21static struct trace_array *irqsoff_trace __read_mostly;
22static int tracer_enabled __read_mostly;
23
24static DEFINE_PER_CPU(int, tracing_cpu);
25
26static DEFINE_SPINLOCK(max_trace_lock);
27
28enum {
29 TRACER_IRQS_OFF = (1 << 1),
30 TRACER_PREEMPT_OFF = (1 << 2),
31};
32
33static int trace_type __read_mostly;
34
35#ifdef CONFIG_PREEMPT_TRACER
36static inline int
37preempt_trace(void)
38{
39 return ((trace_type & TRACER_PREEMPT_OFF) && preempt_count());
40}
41#else
42# define preempt_trace() (0)
43#endif
44
45#ifdef CONFIG_IRQSOFF_TRACER
46static inline int
47irq_trace(void)
48{
49 return ((trace_type & TRACER_IRQS_OFF) &&
50 irqs_disabled());
51}
52#else
53# define irq_trace() (0)
54#endif
55
56/*
57 * Sequence count - we record it when starting a measurement and
58 * skip the latency if the sequence has changed - some other section
59 * did a maximum and could disturb our measurement with serial console
60 * printouts, etc. Truly coinciding maximum latencies should be rare
61 * and what happens together happens separately as well, so this doesn't
62 * decrease the validity of the maximum found:
63 */
64static __cacheline_aligned_in_smp unsigned long max_sequence;
65
66#ifdef CONFIG_FTRACE
67/*
68 * irqsoff uses its own tracer function to keep the overhead down:
69 */
70static void
71irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip)
72{
73 struct trace_array *tr = irqsoff_trace;
74 struct trace_array_cpu *data;
75 unsigned long flags;
76 long disabled;
77 int cpu;
78
79 /*
80 * Does not matter if we preempt. We test the flags
81 * afterward, to see if irqs are disabled or not.
82 * If we preempt and get a false positive, the flags
83 * test will fail.
84 */
85 cpu = raw_smp_processor_id();
86 if (likely(!per_cpu(tracing_cpu, cpu)))
87 return;
88
89 local_save_flags(flags);
90 /* slight chance to get a false positive on tracing_cpu */
91 if (!irqs_disabled_flags(flags))
92 return;
93
94 data = tr->data[cpu];
95 disabled = atomic_inc_return(&data->disabled);
96
97 if (likely(disabled == 1))
98 trace_function(tr, data, ip, parent_ip, flags);
99
100 atomic_dec(&data->disabled);
101}
102
103static struct ftrace_ops trace_ops __read_mostly =
104{
105 .func = irqsoff_tracer_call,
106};
107#endif /* CONFIG_FTRACE */
108
109/*
110 * Should this new latency be reported/recorded?
111 */
112static int report_latency(cycle_t delta)
113{
114 if (tracing_thresh) {
115 if (delta < tracing_thresh)
116 return 0;
117 } else {
118 if (delta <= tracing_max_latency)
119 return 0;
120 }
121 return 1;
122}
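A worked example of report_latency()'s two modes (all numbers are illustrative):
/*
 * With tracing_thresh == 0 and tracing_max_latency == 5000, a delta of
 * 4000 returns 0 (not a new maximum) while 6000 returns 1.  With
 * tracing_thresh == 10000, any delta of at least 10000 is reported,
 * regardless of the current maximum.
 */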
123
124static void
125check_critical_timing(struct trace_array *tr,
126 struct trace_array_cpu *data,
127 unsigned long parent_ip,
128 int cpu)
129{
130 unsigned long latency, t0, t1;
131 cycle_t T0, T1, delta;
132 unsigned long flags;
133
134 /*
135 * usecs conversion is slow so we try to delay the conversion
136 * as long as possible:
137 */
138 T0 = data->preempt_timestamp;
139 T1 = ftrace_now(cpu);
140 delta = T1-T0;
141
142 local_save_flags(flags);
143
144 if (!report_latency(delta))
145 goto out;
146
147 spin_lock_irqsave(&max_trace_lock, flags);
148
149 /* check if we are still the max latency */
150 if (!report_latency(delta))
151 goto out_unlock;
152
153 trace_function(tr, data, CALLER_ADDR0, parent_ip, flags);
154
155 latency = nsecs_to_usecs(delta);
156
157 if (data->critical_sequence != max_sequence)
158 goto out_unlock;
159
160 tracing_max_latency = delta;
161 t0 = nsecs_to_usecs(T0);
162 t1 = nsecs_to_usecs(T1);
163
164 data->critical_end = parent_ip;
165
166 update_max_tr_single(tr, current, cpu);
167
168 max_sequence++;
169
170out_unlock:
171 spin_unlock_irqrestore(&max_trace_lock, flags);
172
173out:
174 data->critical_sequence = max_sequence;
175 data->preempt_timestamp = ftrace_now(cpu);
176 tracing_reset(data);
177 trace_function(tr, data, CALLER_ADDR0, parent_ip, flags);
178}
179
180static inline void
181start_critical_timing(unsigned long ip, unsigned long parent_ip)
182{
183 int cpu;
184 struct trace_array *tr = irqsoff_trace;
185 struct trace_array_cpu *data;
186 unsigned long flags;
187
188 if (likely(!tracer_enabled))
189 return;
190
191 cpu = raw_smp_processor_id();
192
193 if (per_cpu(tracing_cpu, cpu))
194 return;
195
196 data = tr->data[cpu];
197
198 if (unlikely(!data) || atomic_read(&data->disabled))
199 return;
200
201 atomic_inc(&data->disabled);
202
203 data->critical_sequence = max_sequence;
204 data->preempt_timestamp = ftrace_now(cpu);
205 data->critical_start = parent_ip ? : ip;
206 tracing_reset(data);
207
208 local_save_flags(flags);
209
210 trace_function(tr, data, ip, parent_ip, flags);
211
212 per_cpu(tracing_cpu, cpu) = 1;
213
214 atomic_dec(&data->disabled);
215}
216
217static inline void
218stop_critical_timing(unsigned long ip, unsigned long parent_ip)
219{
220 int cpu;
221 struct trace_array *tr = irqsoff_trace;
222 struct trace_array_cpu *data;
223 unsigned long flags;
224
225 cpu = raw_smp_processor_id();
226 /* Always clear the tracing cpu on stopping the trace */
227 if (unlikely(per_cpu(tracing_cpu, cpu)))
228 per_cpu(tracing_cpu, cpu) = 0;
229 else
230 return;
231
232 if (!tracer_enabled)
233 return;
234
235 data = tr->data[cpu];
236
237 if (unlikely(!data) || unlikely(!head_page(data)) ||
238 !data->critical_start || atomic_read(&data->disabled))
239 return;
240
241 atomic_inc(&data->disabled);
242
243 local_save_flags(flags);
244 trace_function(tr, data, ip, parent_ip, flags);
245 check_critical_timing(tr, data, parent_ip ? : ip, cpu);
246 data->critical_start = 0;
247 atomic_dec(&data->disabled);
248}
249
250/* start and stop critical timings, used for stoppage (in idle) */
251void start_critical_timings(void)
252{
253 if (preempt_trace() || irq_trace())
254 start_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
255}
256EXPORT_SYMBOL_GPL(start_critical_timings);
257
258void stop_critical_timings(void)
259{
260 if (preempt_trace() || irq_trace())
261 stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
262}
263EXPORT_SYMBOL_GPL(stop_critical_timings);
264
265#ifdef CONFIG_IRQSOFF_TRACER
266#ifdef CONFIG_PROVE_LOCKING
267void time_hardirqs_on(unsigned long a0, unsigned long a1)
268{
269 if (!preempt_trace() && irq_trace())
270 stop_critical_timing(a0, a1);
271}
272
273void time_hardirqs_off(unsigned long a0, unsigned long a1)
274{
275 if (!preempt_trace() && irq_trace())
276 start_critical_timing(a0, a1);
277}
278
279#else /* !CONFIG_PROVE_LOCKING */
280
281/*
282 * Stubs:
283 */
284
285void early_boot_irqs_off(void)
286{
287}
288
289void early_boot_irqs_on(void)
290{
291}
292
293void trace_softirqs_on(unsigned long ip)
294{
295}
296
297void trace_softirqs_off(unsigned long ip)
298{
299}
300
301inline void print_irqtrace_events(struct task_struct *curr)
302{
303}
304
305/*
306 * We are only interested in hardirq on/off events:
307 */
308void trace_hardirqs_on(void)
309{
310 if (!preempt_trace() && irq_trace())
311 stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
312}
313EXPORT_SYMBOL(trace_hardirqs_on);
314
315void trace_hardirqs_off(void)
316{
317 if (!preempt_trace() && irq_trace())
318 start_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
319}
320EXPORT_SYMBOL(trace_hardirqs_off);
321
322void trace_hardirqs_on_caller(unsigned long caller_addr)
323{
324 if (!preempt_trace() && irq_trace())
325 stop_critical_timing(CALLER_ADDR0, caller_addr);
326}
327EXPORT_SYMBOL(trace_hardirqs_on_caller);
328
329void trace_hardirqs_off_caller(unsigned long caller_addr)
330{
331 if (!preempt_trace() && irq_trace())
332 start_critical_timing(CALLER_ADDR0, caller_addr);
333}
334EXPORT_SYMBOL(trace_hardirqs_off_caller);
335
336#endif /* CONFIG_PROVE_LOCKING */
337#endif /* CONFIG_IRQSOFF_TRACER */
338
339#ifdef CONFIG_PREEMPT_TRACER
340void trace_preempt_on(unsigned long a0, unsigned long a1)
341{
342 if (preempt_trace())
343 stop_critical_timing(a0, a1);
344}
345
346void trace_preempt_off(unsigned long a0, unsigned long a1)
347{
348 if (preempt_trace())
349 start_critical_timing(a0, a1);
350}
351#endif /* CONFIG_PREEMPT_TRACER */
352
353static void start_irqsoff_tracer(struct trace_array *tr)
354{
355 register_ftrace_function(&trace_ops);
356 tracer_enabled = 1;
357}
358
359static void stop_irqsoff_tracer(struct trace_array *tr)
360{
361 tracer_enabled = 0;
362 unregister_ftrace_function(&trace_ops);
363}
364
365static void __irqsoff_tracer_init(struct trace_array *tr)
366{
367 irqsoff_trace = tr;
368 /* make sure that the tracer is visible */
369 smp_wmb();
370
371 if (tr->ctrl)
372 start_irqsoff_tracer(tr);
373}
374
375static void irqsoff_tracer_reset(struct trace_array *tr)
376{
377 if (tr->ctrl)
378 stop_irqsoff_tracer(tr);
379}
380
381static void irqsoff_tracer_ctrl_update(struct trace_array *tr)
382{
383 if (tr->ctrl)
384 start_irqsoff_tracer(tr);
385 else
386 stop_irqsoff_tracer(tr);
387}
388
389static void irqsoff_tracer_open(struct trace_iterator *iter)
390{
391 /* stop the trace while dumping */
392 if (iter->tr->ctrl)
393 stop_irqsoff_tracer(iter->tr);
394}
395
396static void irqsoff_tracer_close(struct trace_iterator *iter)
397{
398 if (iter->tr->ctrl)
399 start_irqsoff_tracer(iter->tr);
400}
401
402#ifdef CONFIG_IRQSOFF_TRACER
403static void irqsoff_tracer_init(struct trace_array *tr)
404{
405 trace_type = TRACER_IRQS_OFF;
406
407 __irqsoff_tracer_init(tr);
408}
409static struct tracer irqsoff_tracer __read_mostly =
410{
411 .name = "irqsoff",
412 .init = irqsoff_tracer_init,
413 .reset = irqsoff_tracer_reset,
414 .open = irqsoff_tracer_open,
415 .close = irqsoff_tracer_close,
416 .ctrl_update = irqsoff_tracer_ctrl_update,
417 .print_max = 1,
418#ifdef CONFIG_FTRACE_SELFTEST
419 .selftest = trace_selftest_startup_irqsoff,
420#endif
421};
422# define register_irqsoff(trace) register_tracer(&trace)
423#else
424# define register_irqsoff(trace) do { } while (0)
425#endif
426
427#ifdef CONFIG_PREEMPT_TRACER
428static void preemptoff_tracer_init(struct trace_array *tr)
429{
430 trace_type = TRACER_PREEMPT_OFF;
431
432 __irqsoff_tracer_init(tr);
433}
434
435static struct tracer preemptoff_tracer __read_mostly =
436{
437 .name = "preemptoff",
438 .init = preemptoff_tracer_init,
439 .reset = irqsoff_tracer_reset,
440 .open = irqsoff_tracer_open,
441 .close = irqsoff_tracer_close,
442 .ctrl_update = irqsoff_tracer_ctrl_update,
443 .print_max = 1,
444#ifdef CONFIG_FTRACE_SELFTEST
445 .selftest = trace_selftest_startup_preemptoff,
446#endif
447};
448# define register_preemptoff(trace) register_tracer(&trace)
449#else
450# define register_preemptoff(trace) do { } while (0)
451#endif
452
453#if defined(CONFIG_IRQSOFF_TRACER) && \
454 defined(CONFIG_PREEMPT_TRACER)
455
456static void preemptirqsoff_tracer_init(struct trace_array *tr)
457{
458 trace_type = TRACER_IRQS_OFF | TRACER_PREEMPT_OFF;
459
460 __irqsoff_tracer_init(tr);
461}
462
463static struct tracer preemptirqsoff_tracer __read_mostly =
464{
465 .name = "preemptirqsoff",
466 .init = preemptirqsoff_tracer_init,
467 .reset = irqsoff_tracer_reset,
468 .open = irqsoff_tracer_open,
469 .close = irqsoff_tracer_close,
470 .ctrl_update = irqsoff_tracer_ctrl_update,
471 .print_max = 1,
472#ifdef CONFIG_FTRACE_SELFTEST
473 .selftest = trace_selftest_startup_preemptirqsoff,
474#endif
475};
476
477# define register_preemptirqsoff(trace) register_tracer(&trace)
478#else
479# define register_preemptirqsoff(trace) do { } while (0)
480#endif
481
482__init static int init_irqsoff_tracer(void)
483{
484 register_irqsoff(irqsoff_tracer);
485 register_preemptoff(preemptoff_tracer);
486 register_preemptirqsoff(preemptirqsoff_tracer);
487
488 return 0;
489}
490device_initcall(init_irqsoff_tracer);
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
new file mode 100644
index 000000000000..b13dc19dcbb4
--- /dev/null
+++ b/kernel/trace/trace_mmiotrace.c
@@ -0,0 +1,295 @@
1/*
2 * Memory mapped I/O tracing
3 *
4 * Copyright (C) 2008 Pekka Paalanen <pq@iki.fi>
5 */
6
7#define DEBUG 1
8
9#include <linux/kernel.h>
10#include <linux/mmiotrace.h>
11#include <linux/pci.h>
12
13#include "trace.h"
14
15struct header_iter {
16 struct pci_dev *dev;
17};
18
19static struct trace_array *mmio_trace_array;
20static bool overrun_detected;
21
22static void mmio_reset_data(struct trace_array *tr)
23{
24 int cpu;
25
26 overrun_detected = false;
27 tr->time_start = ftrace_now(tr->cpu);
28
29 for_each_online_cpu(cpu)
30 tracing_reset(tr->data[cpu]);
31}
32
33static void mmio_trace_init(struct trace_array *tr)
34{
35 pr_debug("in %s\n", __func__);
36 mmio_trace_array = tr;
37 if (tr->ctrl) {
38 mmio_reset_data(tr);
39 enable_mmiotrace();
40 }
41}
42
43static void mmio_trace_reset(struct trace_array *tr)
44{
45 pr_debug("in %s\n", __func__);
46 if (tr->ctrl)
47 disable_mmiotrace();
48 mmio_reset_data(tr);
49 mmio_trace_array = NULL;
50}
51
52static void mmio_trace_ctrl_update(struct trace_array *tr)
53{
54 pr_debug("in %s\n", __func__);
55 if (tr->ctrl) {
56 mmio_reset_data(tr);
57 enable_mmiotrace();
58 } else {
59 disable_mmiotrace();
60 }
61}
62
63static int mmio_print_pcidev(struct trace_seq *s, const struct pci_dev *dev)
64{
65 int ret = 0;
66 int i;
67 resource_size_t start, end;
68 const struct pci_driver *drv = pci_dev_driver(dev);
69
70 /* XXX: incomplete checks for trace_seq_printf() return value */
71 ret += trace_seq_printf(s, "PCIDEV %02x%02x %04x%04x %x",
72 dev->bus->number, dev->devfn,
73 dev->vendor, dev->device, dev->irq);
74 /*
75 * XXX: is pci_resource_to_user() appropriate, since we are
76 * supposed to interpret the __ioremap() phys_addr argument based on
77 * these printed values?
78 */
79 for (i = 0; i < 7; i++) {
80 pci_resource_to_user(dev, i, &dev->resource[i], &start, &end);
81 ret += trace_seq_printf(s, " %llx",
82 (unsigned long long)(start |
83 (dev->resource[i].flags & PCI_REGION_FLAG_MASK)));
84 }
85 for (i = 0; i < 7; i++) {
86 pci_resource_to_user(dev, i, &dev->resource[i], &start, &end);
87 ret += trace_seq_printf(s, " %llx",
88 dev->resource[i].start < dev->resource[i].end ?
89 (unsigned long long)(end - start) + 1 : 0);
90 }
91 if (drv)
92 ret += trace_seq_printf(s, " %s\n", drv->name);
93 else
94 ret += trace_seq_printf(s, " \n");
95 return ret;
96}
97
98static void destroy_header_iter(struct header_iter *hiter)
99{
100 if (!hiter)
101 return;
102 pci_dev_put(hiter->dev);
103 kfree(hiter);
104}
105
106static void mmio_pipe_open(struct trace_iterator *iter)
107{
108 struct header_iter *hiter;
109 struct trace_seq *s = &iter->seq;
110
111 trace_seq_printf(s, "VERSION 20070824\n");
112
113 hiter = kzalloc(sizeof(*hiter), GFP_KERNEL);
114 if (!hiter)
115 return;
116
117 hiter->dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, NULL);
118 iter->private = hiter;
119}
120
121/* XXX: This is not called when the pipe is closed! */
122static void mmio_close(struct trace_iterator *iter)
123{
124 struct header_iter *hiter = iter->private;
125 destroy_header_iter(hiter);
126 iter->private = NULL;
127}
128
129static unsigned long count_overruns(struct trace_iterator *iter)
130{
131 int cpu;
132 unsigned long cnt = 0;
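	/* sum the per-cpu overrun counts, clearing them so each loss is reported only once */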
133 for_each_online_cpu(cpu) {
134 cnt += iter->overrun[cpu];
135 iter->overrun[cpu] = 0;
136 }
137 return cnt;
138}
139
140static ssize_t mmio_read(struct trace_iterator *iter, struct file *filp,
141 char __user *ubuf, size_t cnt, loff_t *ppos)
142{
143 ssize_t ret;
144 struct header_iter *hiter = iter->private;
145 struct trace_seq *s = &iter->seq;
146 unsigned long n;
147
148 n = count_overruns(iter);
149 if (n) {
150 /* XXX: This is later than where events were lost. */
151 trace_seq_printf(s, "MARK 0.000000 Lost %lu events.\n", n);
152 if (!overrun_detected)
153 pr_warning("mmiotrace has lost events.\n");
154 overrun_detected = true;
155 goto print_out;
156 }
157
158 if (!hiter)
159 return 0;
160
161 mmio_print_pcidev(s, hiter->dev);
162 hiter->dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, hiter->dev);
163
164 if (!hiter->dev) {
165 destroy_header_iter(hiter);
166 iter->private = NULL;
167 }
168
169print_out:
170 ret = trace_seq_to_user(s, ubuf, cnt);
171 return (ret == -EBUSY) ? 0 : ret;
172}
173
174static int mmio_print_rw(struct trace_iterator *iter)
175{
176 struct trace_entry *entry = iter->ent;
177 struct mmiotrace_rw *rw = &entry->mmiorw;
178 struct trace_seq *s = &iter->seq;
179 unsigned long long t = ns2usecs(entry->t);
180 unsigned long usec_rem = do_div(t, 1000000ULL);
181 unsigned secs = (unsigned long)t;
182 int ret = 1;
183
184 switch (entry->mmiorw.opcode) {
185 case MMIO_READ:
186 ret = trace_seq_printf(s,
187 "R %d %lu.%06lu %d 0x%llx 0x%lx 0x%lx %d\n",
188 rw->width, secs, usec_rem, rw->map_id,
189 (unsigned long long)rw->phys,
190 rw->value, rw->pc, 0);
191 break;
192 case MMIO_WRITE:
193 ret = trace_seq_printf(s,
194 "W %d %lu.%06lu %d 0x%llx 0x%lx 0x%lx %d\n",
195 rw->width, secs, usec_rem, rw->map_id,
196 (unsigned long long)rw->phys,
197 rw->value, rw->pc, 0);
198 break;
199 case MMIO_UNKNOWN_OP:
200 ret = trace_seq_printf(s,
201 "UNKNOWN %lu.%06lu %d 0x%llx %02x,%02x,%02x 0x%lx %d\n",
202 secs, usec_rem, rw->map_id,
203 (unsigned long long)rw->phys,
204 (rw->value >> 16) & 0xff, (rw->value >> 8) & 0xff,
205 (rw->value >> 0) & 0xff, rw->pc, 0);
206 break;
207 default:
208 ret = trace_seq_printf(s, "rw what?\n");
209 break;
210 }
211 if (ret)
212 return 1;
213 return 0;
214}
215
216static int mmio_print_map(struct trace_iterator *iter)
217{
218 struct trace_entry *entry = iter->ent;
219 struct mmiotrace_map *m = &entry->mmiomap;
220 struct trace_seq *s = &iter->seq;
221 unsigned long long t = ns2usecs(entry->t);
222 unsigned long usec_rem = do_div(t, 1000000ULL);
223 unsigned secs = (unsigned long)t;
224 int ret = 1;
225
226 switch (entry->mmiorw.opcode) {
227 case MMIO_PROBE:
228 ret = trace_seq_printf(s,
229 "MAP %lu.%06lu %d 0x%llx 0x%lx 0x%lx 0x%lx %d\n",
230 secs, usec_rem, m->map_id,
231 (unsigned long long)m->phys, m->virt, m->len,
232 0UL, 0);
233 break;
234 case MMIO_UNPROBE:
235 ret = trace_seq_printf(s,
236 "UNMAP %lu.%06lu %d 0x%lx %d\n",
237 secs, usec_rem, m->map_id, 0UL, 0);
238 break;
239 default:
240 ret = trace_seq_printf(s, "map what?\n");
241 break;
242 }
243 if (ret)
244 return 1;
245 return 0;
246}
247
248/* return 0 to abort printing without consuming current entry in pipe mode */
249static int mmio_print_line(struct trace_iterator *iter)
250{
251 switch (iter->ent->type) {
252 case TRACE_MMIO_RW:
253 return mmio_print_rw(iter);
254 case TRACE_MMIO_MAP:
255 return mmio_print_map(iter);
256 default:
257 return 1; /* ignore unknown entries */
258 }
259}
260
261static struct tracer mmio_tracer __read_mostly =
262{
263 .name = "mmiotrace",
264 .init = mmio_trace_init,
265 .reset = mmio_trace_reset,
266 .pipe_open = mmio_pipe_open,
267 .close = mmio_close,
268 .read = mmio_read,
269 .ctrl_update = mmio_trace_ctrl_update,
270 .print_line = mmio_print_line,
271};
272
273__init static int init_mmio_trace(void)
274{
275 return register_tracer(&mmio_tracer);
276}
277device_initcall(init_mmio_trace);
278
279void mmio_trace_rw(struct mmiotrace_rw *rw)
280{
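	/* smp_processor_id() here relies on the caller already running with preemption disabled; contrast mmio_trace_mapping() below, which disables it explicitly */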
281 struct trace_array *tr = mmio_trace_array;
282 struct trace_array_cpu *data = tr->data[smp_processor_id()];
283 __trace_mmiotrace_rw(tr, data, rw);
284}
285
286void mmio_trace_mapping(struct mmiotrace_map *map)
287{
288 struct trace_array *tr = mmio_trace_array;
289 struct trace_array_cpu *data;
290
291 preempt_disable();
292 data = tr->data[smp_processor_id()];
293 __trace_mmiotrace_map(tr, data, map);
294 preempt_enable();
295}
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
new file mode 100644
index 000000000000..cb817a209aa0
--- /dev/null
+++ b/kernel/trace/trace_sched_switch.c
@@ -0,0 +1,286 @@
1/*
2 * trace context switch
3 *
4 * Copyright (C) 2007 Steven Rostedt <srostedt@redhat.com>
5 *
6 */
7#include <linux/module.h>
8#include <linux/fs.h>
9#include <linux/debugfs.h>
10#include <linux/kallsyms.h>
11#include <linux/uaccess.h>
12#include <linux/marker.h>
13#include <linux/ftrace.h>
14
15#include "trace.h"
16
17static struct trace_array *ctx_trace;
18static int __read_mostly tracer_enabled;
19static atomic_t sched_ref;
20
21static void
22sched_switch_func(void *private, void *__rq, struct task_struct *prev,
23 struct task_struct *next)
24{
25 struct trace_array **ptr = private;
26 struct trace_array *tr = *ptr;
27 struct trace_array_cpu *data;
28 unsigned long flags;
29 long disabled;
30 int cpu;
31
32 tracing_record_cmdline(prev);
33 tracing_record_cmdline(next);
34
35 if (!tracer_enabled)
36 return;
37
38 local_irq_save(flags);
39 cpu = raw_smp_processor_id();
40 data = tr->data[cpu];
41 disabled = atomic_inc_return(&data->disabled);
42
43 if (likely(disabled == 1))
44 tracing_sched_switch_trace(tr, data, prev, next, flags);
45
46 atomic_dec(&data->disabled);
47 local_irq_restore(flags);
48}
49
50static notrace void
51sched_switch_callback(void *probe_data, void *call_data,
52 const char *format, va_list *args)
53{
54 struct task_struct *prev;
55 struct task_struct *next;
56 struct rq *__rq;
57
58 if (!atomic_read(&sched_ref))
59 return;
60
61 /* skip prev_pid %d next_pid %d prev_state %ld */
62 (void)va_arg(*args, int);
63 (void)va_arg(*args, int);
64 (void)va_arg(*args, long);
65 __rq = va_arg(*args, typeof(__rq));
66 prev = va_arg(*args, typeof(prev));
67 next = va_arg(*args, typeof(next));
68
69 /*
70 * If tracer_switch_func only points to the local
71 * switch func, it still needs the ptr passed to it.
72 */
73 sched_switch_func(probe_data, __rq, prev, next);
74}
75
76static void
77wakeup_func(void *private, void *__rq, struct task_struct *wakee, struct
78 task_struct *curr)
79{
80 struct trace_array **ptr = private;
81 struct trace_array *tr = *ptr;
82 struct trace_array_cpu *data;
83 unsigned long flags;
84 long disabled;
85 int cpu;
86
87 if (!tracer_enabled)
88 return;
89
90 tracing_record_cmdline(curr);
91
92 local_irq_save(flags);
93 cpu = raw_smp_processor_id();
94 data = tr->data[cpu];
95 disabled = atomic_inc_return(&data->disabled);
96
97 if (likely(disabled == 1))
98 tracing_sched_wakeup_trace(tr, data, wakee, curr, flags);
99
100 atomic_dec(&data->disabled);
101 local_irq_restore(flags);
102}
103
104static notrace void
105wake_up_callback(void *probe_data, void *call_data,
106 const char *format, va_list *args)
107{
108 struct task_struct *curr;
109 struct task_struct *task;
110 struct rq *__rq;
111
112 if (likely(!tracer_enabled))
113 return;
114
115 /* Skip pid %d state %ld */
116 (void)va_arg(*args, int);
117 (void)va_arg(*args, long);
118 /* now get the meat: "rq %p task %p rq->curr %p" */
119 __rq = va_arg(*args, typeof(__rq));
120 task = va_arg(*args, typeof(task));
121 curr = va_arg(*args, typeof(curr));
122
123 tracing_record_cmdline(task);
124 tracing_record_cmdline(curr);
125
126 wakeup_func(probe_data, __rq, task, curr);
127}
128
129static void sched_switch_reset(struct trace_array *tr)
130{
131 int cpu;
132
133 tr->time_start = ftrace_now(tr->cpu);
134
135 for_each_online_cpu(cpu)
136 tracing_reset(tr->data[cpu]);
137}
138
139static int tracing_sched_register(void)
140{
141 int ret;
142
143 ret = marker_probe_register("kernel_sched_wakeup",
144 "pid %d state %ld ## rq %p task %p rq->curr %p",
145 wake_up_callback,
146 &ctx_trace);
147 if (ret) {
148 pr_info("wakeup trace: Couldn't add marker"
149 " probe to kernel_sched_wakeup\n");
150 return ret;
151 }
152
153 ret = marker_probe_register("kernel_sched_wakeup_new",
154 "pid %d state %ld ## rq %p task %p rq->curr %p",
155 wake_up_callback,
156 &ctx_trace);
157 if (ret) {
158 pr_info("wakeup trace: Couldn't add marker"
159 " probe to kernel_sched_wakeup_new\n");
160 goto fail_deprobe;
161 }
162
163 ret = marker_probe_register("kernel_sched_schedule",
164 "prev_pid %d next_pid %d prev_state %ld "
165 "## rq %p prev %p next %p",
166 sched_switch_callback,
167 &ctx_trace);
168 if (ret) {
169 pr_info("sched trace: Couldn't add marker"
170 " probe to kernel_sched_schedule\n");
171 goto fail_deprobe_wake_new;
172 }
173
174 return ret;
175fail_deprobe_wake_new:
176 marker_probe_unregister("kernel_sched_wakeup_new",
177 wake_up_callback,
178 &ctx_trace);
179fail_deprobe:
180 marker_probe_unregister("kernel_sched_wakeup",
181 wake_up_callback,
182 &ctx_trace);
183 return ret;
184}
185
186static void tracing_sched_unregister(void)
187{
188 marker_probe_unregister("kernel_sched_schedule",
189 sched_switch_callback,
190 &ctx_trace);
191 marker_probe_unregister("kernel_sched_wakeup_new",
192 wake_up_callback,
193 &ctx_trace);
194 marker_probe_unregister("kernel_sched_wakeup",
195 wake_up_callback,
196 &ctx_trace);
197}
198
199static void tracing_start_sched_switch(void)
200{
201 long ref;
202
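	/* only the first user registers the marker probes */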
203 ref = atomic_inc_return(&sched_ref);
204 if (ref == 1)
205 tracing_sched_register();
206}
207
208static void tracing_stop_sched_switch(void)
209{
210 long ref;
211
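	/* atomic_dec_and_test() returns true only when sched_ref reaches zero, so the last user unregisters the probes */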
212 ref = atomic_dec_and_test(&sched_ref);
213 if (ref)
214 tracing_sched_unregister();
215}
216
217void tracing_start_cmdline_record(void)
218{
219 tracing_start_sched_switch();
220}
221
222void tracing_stop_cmdline_record(void)
223{
224 tracing_stop_sched_switch();
225}
226
227static void start_sched_trace(struct trace_array *tr)
228{
229 sched_switch_reset(tr);
230 tracing_start_cmdline_record();
231 tracer_enabled = 1;
232}
233
234static void stop_sched_trace(struct trace_array *tr)
235{
236 tracer_enabled = 0;
237 tracing_stop_cmdline_record();
238}
239
240static void sched_switch_trace_init(struct trace_array *tr)
241{
242 ctx_trace = tr;
243
244 if (tr->ctrl)
245 start_sched_trace(tr);
246}
247
248static void sched_switch_trace_reset(struct trace_array *tr)
249{
250 if (tr->ctrl)
251 stop_sched_trace(tr);
252}
253
254static void sched_switch_trace_ctrl_update(struct trace_array *tr)
255{
256 /* When starting a new trace, reset the buffers */
257 if (tr->ctrl)
258 start_sched_trace(tr);
259 else
260 stop_sched_trace(tr);
261}
262
263static struct tracer sched_switch_trace __read_mostly =
264{
265 .name = "sched_switch",
266 .init = sched_switch_trace_init,
267 .reset = sched_switch_trace_reset,
268 .ctrl_update = sched_switch_trace_ctrl_update,
269#ifdef CONFIG_FTRACE_SELFTEST
270 .selftest = trace_selftest_startup_sched_switch,
271#endif
272};
273
274__init static int init_sched_switch_trace(void)
275{
276 int ret = 0;
277
278 if (atomic_read(&sched_ref))
279 ret = tracing_sched_register();
280 if (ret) {
281 pr_info("error registering scheduler trace\n");
282 return ret;
283 }
284 return register_tracer(&sched_switch_trace);
285}
286device_initcall(init_sched_switch_trace);
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
new file mode 100644
index 000000000000..e303ccb62cdf
--- /dev/null
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -0,0 +1,453 @@
1/*
2 * trace task wakeup timings
3 *
4 * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com>
5 * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com>
6 *
7 * Based on code from the latency_tracer, that is:
8 *
9 * Copyright (C) 2004-2006 Ingo Molnar
10 * Copyright (C) 2004 William Lee Irwin III
11 */
12#include <linux/module.h>
13#include <linux/fs.h>
14#include <linux/debugfs.h>
15#include <linux/kallsyms.h>
16#include <linux/uaccess.h>
17#include <linux/ftrace.h>
18#include <linux/marker.h>
19
20#include "trace.h"
21
22static struct trace_array *wakeup_trace;
23static int __read_mostly tracer_enabled;
24
25static struct task_struct *wakeup_task;
26static int wakeup_cpu;
27static unsigned wakeup_prio = -1;
28
29static raw_spinlock_t wakeup_lock =
30 (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
31
32static void __wakeup_reset(struct trace_array *tr);
33
34#ifdef CONFIG_FTRACE
35/*
36 * wakeup uses its own tracer function to keep the overhead down:
37 */
38static void
39wakeup_tracer_call(unsigned long ip, unsigned long parent_ip)
40{
41 struct trace_array *tr = wakeup_trace;
42 struct trace_array_cpu *data;
43 unsigned long flags;
44 long disabled;
45 int resched;
46 int cpu;
47
48 if (likely(!wakeup_task))
49 return;
50
51 resched = need_resched();
52 preempt_disable_notrace();
53
54 cpu = raw_smp_processor_id();
55 data = tr->data[cpu];
56 disabled = atomic_inc_return(&data->disabled);
57 if (unlikely(disabled != 1))
58 goto out;
59
60 local_irq_save(flags);
61 __raw_spin_lock(&wakeup_lock);
62
63 if (unlikely(!wakeup_task))
64 goto unlock;
65
66 /*
67 * The task can't disappear because it needs to
68 * wake up first, and we have the wakeup_lock.
69 */
70 if (task_cpu(wakeup_task) != cpu)
71 goto unlock;
72
73 trace_function(tr, data, ip, parent_ip, flags);
74
75 unlock:
76 __raw_spin_unlock(&wakeup_lock);
77 local_irq_restore(flags);
78
79 out:
80 atomic_dec(&data->disabled);
81
82 /*
83 * To prevent recursion from the scheduler, if the
84 * resched flag was set before we entered, then
85 * don't reschedule.
86 */
87 if (resched)
88 preempt_enable_no_resched_notrace();
89 else
90 preempt_enable_notrace();
91}
92
93static struct ftrace_ops trace_ops __read_mostly =
94{
95 .func = wakeup_tracer_call,
96};
97#endif /* CONFIG_FTRACE */
98
99/*
100 * Should this new latency be reported/recorded?
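 * If tracing_thresh is set, report anything above it; otherwise report only new maxima.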
101 */
102static int report_latency(cycle_t delta)
103{
104 if (tracing_thresh) {
105 if (delta < tracing_thresh)
106 return 0;
107 } else {
108 if (delta <= tracing_max_latency)
109 return 0;
110 }
111 return 1;
112}
113
114static void notrace
115wakeup_sched_switch(void *private, void *rq, struct task_struct *prev,
116 struct task_struct *next)
117{
118 unsigned long latency = 0, t0 = 0, t1 = 0;
119 struct trace_array **ptr = private;
120 struct trace_array *tr = *ptr;
121 struct trace_array_cpu *data;
122 cycle_t T0, T1, delta;
123 unsigned long flags;
124 long disabled;
125 int cpu;
126
127 if (unlikely(!tracer_enabled))
128 return;
129
130 /*
131 * When we start a new trace, we set wakeup_task to NULL
132 * and then set tracer_enabled = 1. We want to make sure
133 * that another CPU does not see the tracer_enabled = 1
134 * and the wakeup_task with an older task, that might
135 * actually be the same as next.
136 */
137 smp_rmb();
138
139 if (next != wakeup_task)
140 return;
141
142 /* The task we are waiting for is waking up */
143 data = tr->data[wakeup_cpu];
144
145 /* disable local data, not wakeup_cpu data */
146 cpu = raw_smp_processor_id();
147 disabled = atomic_inc_return(&tr->data[cpu]->disabled);
148 if (likely(disabled != 1))
149 goto out;
150
151 local_irq_save(flags);
152 __raw_spin_lock(&wakeup_lock);
153
154 /* We could race with grabbing wakeup_lock */
155 if (unlikely(!tracer_enabled || next != wakeup_task))
156 goto out_unlock;
157
158 trace_function(tr, data, CALLER_ADDR1, CALLER_ADDR2, flags);
159
160 /*
161 * usecs conversion is slow so we try to delay the conversion
162 * as long as possible:
163 */
164 T0 = data->preempt_timestamp;
165 T1 = ftrace_now(cpu);
166 delta = T1-T0;
167
168 if (!report_latency(delta))
169 goto out_unlock;
170
171 latency = nsecs_to_usecs(delta);
172
173 tracing_max_latency = delta;
174 t0 = nsecs_to_usecs(T0);
175 t1 = nsecs_to_usecs(T1);
176
177 update_max_tr(tr, wakeup_task, wakeup_cpu);
178
179out_unlock:
180 __wakeup_reset(tr);
181 __raw_spin_unlock(&wakeup_lock);
182 local_irq_restore(flags);
183out:
184 atomic_dec(&tr->data[cpu]->disabled);
185}
186
187static notrace void
188sched_switch_callback(void *probe_data, void *call_data,
189 const char *format, va_list *args)
190{
191 struct task_struct *prev;
192 struct task_struct *next;
193 struct rq *__rq;
194
195 /* skip prev_pid %d next_pid %d prev_state %ld */
196 (void)va_arg(*args, int);
197 (void)va_arg(*args, int);
198 (void)va_arg(*args, long);
199 __rq = va_arg(*args, typeof(__rq));
200 prev = va_arg(*args, typeof(prev));
201 next = va_arg(*args, typeof(next));
202
203 tracing_record_cmdline(prev);
204
205 /*
206 * If tracer_switch_func only points to the local
207 * switch func, it still needs the ptr passed to it.
208 */
209 wakeup_sched_switch(probe_data, __rq, prev, next);
210}
211
212static void __wakeup_reset(struct trace_array *tr)
213{
214 struct trace_array_cpu *data;
215 int cpu;
216
217 for_each_possible_cpu(cpu) {
218 data = tr->data[cpu];
219 tracing_reset(data);
220 }
221
222 wakeup_cpu = -1;
223 wakeup_prio = -1;
224
225 if (wakeup_task)
226 put_task_struct(wakeup_task);
227
228 wakeup_task = NULL;
229}
230
231static void wakeup_reset(struct trace_array *tr)
232{
233 unsigned long flags;
234
235 local_irq_save(flags);
236 __raw_spin_lock(&wakeup_lock);
237 __wakeup_reset(tr);
238 __raw_spin_unlock(&wakeup_lock);
239 local_irq_restore(flags);
240}
241
242static void
243wakeup_check_start(struct trace_array *tr, struct task_struct *p,
244 struct task_struct *curr)
245{
246 int cpu = smp_processor_id();
247 unsigned long flags;
248 long disabled;
249
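	/* only trace RT wakeups whose priority beats both the running task and any candidate already being tracked (lower value == higher prio) */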
250 if (likely(!rt_task(p)) ||
251 p->prio >= wakeup_prio ||
252 p->prio >= curr->prio)
253 return;
254
255 disabled = atomic_inc_return(&tr->data[cpu]->disabled);
256 if (unlikely(disabled != 1))
257 goto out;
258
259 /* interrupts should be off from try_to_wake_up */
260 __raw_spin_lock(&wakeup_lock);
261
262 /* check for races. */
263 if (!tracer_enabled || p->prio >= wakeup_prio)
264 goto out_locked;
265
266 /* reset the trace */
267 __wakeup_reset(tr);
268
269 wakeup_cpu = task_cpu(p);
270 wakeup_prio = p->prio;
271
272 wakeup_task = p;
273 get_task_struct(wakeup_task);
274
275 local_save_flags(flags);
276
277 tr->data[wakeup_cpu]->preempt_timestamp = ftrace_now(cpu);
278 trace_function(tr, tr->data[wakeup_cpu],
279 CALLER_ADDR1, CALLER_ADDR2, flags);
280
281out_locked:
282 __raw_spin_unlock(&wakeup_lock);
283out:
284 atomic_dec(&tr->data[cpu]->disabled);
285}
286
287static notrace void
288wake_up_callback(void *probe_data, void *call_data,
289 const char *format, va_list *args)
290{
291 struct trace_array **ptr = probe_data;
292 struct trace_array *tr = *ptr;
293 struct task_struct *curr;
294 struct task_struct *task;
295 struct rq *__rq;
296
297 if (likely(!tracer_enabled))
298 return;
299
300 /* Skip pid %d state %ld */
301 (void)va_arg(*args, int);
302 (void)va_arg(*args, long);
303 /* now get the meat: "rq %p task %p rq->curr %p" */
304 __rq = va_arg(*args, typeof(__rq));
305 task = va_arg(*args, typeof(task));
306 curr = va_arg(*args, typeof(curr));
307
308 tracing_record_cmdline(task);
309 tracing_record_cmdline(curr);
310
311 wakeup_check_start(tr, task, curr);
312}
313
314static void start_wakeup_tracer(struct trace_array *tr)
315{
316 int ret;
317
318 ret = marker_probe_register("kernel_sched_wakeup",
319 "pid %d state %ld ## rq %p task %p rq->curr %p",
320 wake_up_callback,
321 &wakeup_trace);
322 if (ret) {
323 pr_info("wakeup trace: Couldn't add marker"
324 " probe to kernel_sched_wakeup\n");
325 return;
326 }
327
328 ret = marker_probe_register("kernel_sched_wakeup_new",
329 "pid %d state %ld ## rq %p task %p rq->curr %p",
330 wake_up_callback,
331 &wakeup_trace);
332 if (ret) {
333 pr_info("wakeup trace: Couldn't add marker"
334 " probe to kernel_sched_wakeup_new\n");
335 goto fail_deprobe;
336 }
337
338 ret = marker_probe_register("kernel_sched_schedule",
339 "prev_pid %d next_pid %d prev_state %ld "
340 "## rq %p prev %p next %p",
341 sched_switch_callback,
342 &wakeup_trace);
343 if (ret) {
344 pr_info("sched trace: Couldn't add marker"
345 " probe to kernel_sched_schedule\n");
346 goto fail_deprobe_wake_new;
347 }
348
349 wakeup_reset(tr);
350
351 /*
352 * Don't let the tracer_enabled = 1 show up before
353 * the wakeup_task is reset. This may be overkill since
354 * wakeup_reset does a spin_unlock after setting the
355 * wakeup_task to NULL, but I want to be safe.
356 * This is a slow path anyway.
357 */
358 smp_wmb();
359
360 register_ftrace_function(&trace_ops);
361
362 tracer_enabled = 1;
363
364 return;
365fail_deprobe_wake_new:
366 marker_probe_unregister("kernel_sched_wakeup_new",
367 wake_up_callback,
368 &wakeup_trace);
369fail_deprobe:
370 marker_probe_unregister("kernel_sched_wakeup",
371 wake_up_callback,
372 &wakeup_trace);
373}
374
375static void stop_wakeup_tracer(struct trace_array *tr)
376{
377 tracer_enabled = 0;
378 unregister_ftrace_function(&trace_ops);
379 marker_probe_unregister("kernel_sched_schedule",
380 sched_switch_callback,
381 &wakeup_trace);
382 marker_probe_unregister("kernel_sched_wakeup_new",
383 wake_up_callback,
384 &wakeup_trace);
385 marker_probe_unregister("kernel_sched_wakeup",
386 wake_up_callback,
387 &wakeup_trace);
388}
389
390static void wakeup_tracer_init(struct trace_array *tr)
391{
392 wakeup_trace = tr;
393
394 if (tr->ctrl)
395 start_wakeup_tracer(tr);
396}
397
398static void wakeup_tracer_reset(struct trace_array *tr)
399{
400 if (tr->ctrl) {
401 stop_wakeup_tracer(tr);
402 /* make sure we put back any tasks we are tracing */
403 wakeup_reset(tr);
404 }
405}
406
407static void wakeup_tracer_ctrl_update(struct trace_array *tr)
408{
409 if (tr->ctrl)
410 start_wakeup_tracer(tr);
411 else
412 stop_wakeup_tracer(tr);
413}
414
415static void wakeup_tracer_open(struct trace_iterator *iter)
416{
417 /* stop the trace while dumping */
418 if (iter->tr->ctrl)
419 stop_wakeup_tracer(iter->tr);
420}
421
422static void wakeup_tracer_close(struct trace_iterator *iter)
423{
424 /* forget about any processes we were recording */
425 if (iter->tr->ctrl)
426 start_wakeup_tracer(iter->tr);
427}
428
429static struct tracer wakeup_tracer __read_mostly =
430{
431 .name = "wakeup",
432 .init = wakeup_tracer_init,
433 .reset = wakeup_tracer_reset,
434 .open = wakeup_tracer_open,
435 .close = wakeup_tracer_close,
436 .ctrl_update = wakeup_tracer_ctrl_update,
437 .print_max = 1,
438#ifdef CONFIG_FTRACE_SELFTEST
439 .selftest = trace_selftest_startup_wakeup,
440#endif
441};
442
443__init static int init_wakeup_tracer(void)
444{
445 int ret;
446
447 ret = register_tracer(&wakeup_tracer);
448 if (ret)
449 return ret;
450
451 return 0;
452}
453device_initcall(init_wakeup_tracer);
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
new file mode 100644
index 000000000000..0911b7e073bf
--- /dev/null
+++ b/kernel/trace/trace_selftest.c
@@ -0,0 +1,563 @@
1/* Include in trace.c */
2
3#include <linux/kthread.h>
4#include <linux/delay.h>
5
6static inline int trace_valid_entry(struct trace_entry *entry)
7{
8 switch (entry->type) {
9 case TRACE_FN:
10 case TRACE_CTX:
11 case TRACE_WAKE:
12 case TRACE_STACK:
13 case TRACE_SPECIAL:
14 return 1;
15 }
16 return 0;
17}
18
19static int
20trace_test_buffer_cpu(struct trace_array *tr, struct trace_array_cpu *data)
21{
22 struct trace_entry *entries;
23 struct page *page;
24 int idx = 0;
25 int i;
26
27 BUG_ON(list_empty(&data->trace_pages));
28 page = list_entry(data->trace_pages.next, struct page, lru);
29 entries = page_address(page);
30
31 check_pages(data);
32 if (head_page(data) != entries)
33 goto failed;
34
35 /*
36 * The starting trace buffer always has valid elements,
37 * if any element exists.
38 */
39 entries = head_page(data);
40
41 for (i = 0; i < tr->entries; i++) {
42
43 if (i < data->trace_idx && !trace_valid_entry(&entries[idx])) {
44 printk(KERN_CONT ".. invalid entry %d ",
45 entries[idx].type);
46 goto failed;
47 }
48
49 idx++;
50 if (idx >= ENTRIES_PER_PAGE) {
51 page = virt_to_page(entries);
52 if (page->lru.next == &data->trace_pages) {
53 if (i != tr->entries - 1) {
54 printk(KERN_CONT ".. entries buffer mismatch");
55 goto failed;
56 }
57 } else {
58 page = list_entry(page->lru.next, struct page, lru);
59 entries = page_address(page);
60 }
61 idx = 0;
62 }
63 }
64
65 page = virt_to_page(entries);
66 if (page->lru.next != &data->trace_pages) {
67 printk(KERN_CONT ".. too many entries");
68 goto failed;
69 }
70
71 return 0;
72
73 failed:
74 /* disable tracing */
75 tracing_disabled = 1;
76 printk(KERN_CONT ".. corrupted trace buffer .. ");
77 return -1;
78}
79
80/*
81 * Test the trace buffer to see if all the elements
82 * are still sane.
83 */
84static int trace_test_buffer(struct trace_array *tr, unsigned long *count)
85{
86 unsigned long flags, cnt = 0;
87 int cpu, ret = 0;
88
89 /* Don't allow flipping of max traces now */
90 raw_local_irq_save(flags);
91 __raw_spin_lock(&ftrace_max_lock);
92 for_each_possible_cpu(cpu) {
93 if (!head_page(tr->data[cpu]))
94 continue;
95
96 cnt += tr->data[cpu]->trace_idx;
97
98 ret = trace_test_buffer_cpu(tr, tr->data[cpu]);
99 if (ret)
100 break;
101 }
102 __raw_spin_unlock(&ftrace_max_lock);
103 raw_local_irq_restore(flags);
104
105 if (count)
106 *count = cnt;
107
108 return ret;
109}
110
111#ifdef CONFIG_FTRACE
112
113#ifdef CONFIG_DYNAMIC_FTRACE
114
115#define __STR(x) #x
116#define STR(x) __STR(x)
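/* two-step expansion so DYN_FTRACE_TEST_NAME is macro-expanded before being stringified */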
117
118/* Test dynamic code modification and ftrace filters */
119int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
120 struct trace_array *tr,
121 int (*func)(void))
122{
123 unsigned long count;
124 int ret;
125 int save_ftrace_enabled = ftrace_enabled;
126 int save_tracer_enabled = tracer_enabled;
127 char *func_name;
128
129 /* The ftrace test PASSED */
130 printk(KERN_CONT "PASSED\n");
131 pr_info("Testing dynamic ftrace: ");
132
133 /* enable tracing, and record the filter function */
134 ftrace_enabled = 1;
135 tracer_enabled = 1;
136
137	/* passed in by parameter to keep gcc from optimizing it away */
138 func();
139
140 /* update the records */
141 ret = ftrace_force_update();
142 if (ret) {
143 printk(KERN_CONT ".. ftraced failed .. ");
144 return ret;
145 }
146
147 /*
148	 * Some archs *cough*PowerPC*cough* add characters to the
149	 * start of the function names. We simply put a '*' to
150	 * accommodate them.
151 */
152 func_name = "*" STR(DYN_FTRACE_TEST_NAME);
153
154 /* filter only on our function */
155 ftrace_set_filter(func_name, strlen(func_name), 1);
156
157 /* enable tracing */
158 tr->ctrl = 1;
159 trace->init(tr);
160	/* Sleep for 1/10 of a second */
161 msleep(100);
162
163 /* we should have nothing in the buffer */
164 ret = trace_test_buffer(tr, &count);
165 if (ret)
166 goto out;
167
168 if (count) {
169 ret = -1;
170 printk(KERN_CONT ".. filter did not filter .. ");
171 goto out;
172 }
173
174 /* call our function again */
175 func();
176
177 /* sleep again */
178 msleep(100);
179
180 /* stop the tracing. */
181 tr->ctrl = 0;
182 trace->ctrl_update(tr);
183 ftrace_enabled = 0;
184
185 /* check the trace buffer */
186 ret = trace_test_buffer(tr, &count);
187 trace->reset(tr);
188
189 /* we should only have one item */
190 if (!ret && count != 1) {
191 printk(KERN_CONT ".. filter failed count=%ld ..", count);
192 ret = -1;
193 goto out;
194 }
195 out:
196 ftrace_enabled = save_ftrace_enabled;
197 tracer_enabled = save_tracer_enabled;
198
199 /* Enable tracing on all functions again */
200 ftrace_set_filter(NULL, 0, 1);
201
202 return ret;
203}
204#else
205# define trace_selftest_startup_dynamic_tracing(trace, tr, func) ({ 0; })
206#endif /* CONFIG_DYNAMIC_FTRACE */
207/*
208 * Simple verification test of ftrace function tracer.
209 * Enable ftrace, sleep 1/10 second, and then read the trace
210 * buffer to see if all is in order.
211 */
212int
213trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)
214{
215 unsigned long count;
216 int ret;
217 int save_ftrace_enabled = ftrace_enabled;
218 int save_tracer_enabled = tracer_enabled;
219
220 /* make sure msleep has been recorded */
221 msleep(1);
222
223 /* force the recorded functions to be traced */
224 ret = ftrace_force_update();
225 if (ret) {
226 printk(KERN_CONT ".. ftraced failed .. ");
227 return ret;
228 }
229
230 /* start the tracing */
231 ftrace_enabled = 1;
232 tracer_enabled = 1;
233
234 tr->ctrl = 1;
235 trace->init(tr);
236	/* Sleep for 1/10 of a second */
237 msleep(100);
238 /* stop the tracing. */
239 tr->ctrl = 0;
240 trace->ctrl_update(tr);
241 ftrace_enabled = 0;
242
243 /* check the trace buffer */
244 ret = trace_test_buffer(tr, &count);
245 trace->reset(tr);
246
247 if (!ret && !count) {
248 printk(KERN_CONT ".. no entries found ..");
249 ret = -1;
250 goto out;
251 }
252
253 ret = trace_selftest_startup_dynamic_tracing(trace, tr,
254 DYN_FTRACE_TEST_NAME);
255
256 out:
257 ftrace_enabled = save_ftrace_enabled;
258 tracer_enabled = save_tracer_enabled;
259
260 /* kill ftrace totally if we failed */
261 if (ret)
262 ftrace_kill();
263
264 return ret;
265}
266#endif /* CONFIG_FTRACE */
267
268#ifdef CONFIG_IRQSOFF_TRACER
269int
270trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr)
271{
272 unsigned long save_max = tracing_max_latency;
273 unsigned long count;
274 int ret;
275
276 /* start the tracing */
277 tr->ctrl = 1;
278 trace->init(tr);
279 /* reset the max latency */
280 tracing_max_latency = 0;
281 /* disable interrupts for a bit */
282 local_irq_disable();
283 udelay(100);
284 local_irq_enable();
285 /* stop the tracing. */
286 tr->ctrl = 0;
287 trace->ctrl_update(tr);
288 /* check both trace buffers */
289 ret = trace_test_buffer(tr, NULL);
290 if (!ret)
291 ret = trace_test_buffer(&max_tr, &count);
292 trace->reset(tr);
293
294 if (!ret && !count) {
295 printk(KERN_CONT ".. no entries found ..");
296 ret = -1;
297 }
298
299 tracing_max_latency = save_max;
300
301 return ret;
302}
303#endif /* CONFIG_IRQSOFF_TRACER */
304
305#ifdef CONFIG_PREEMPT_TRACER
306int
307trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr)
308{
309 unsigned long save_max = tracing_max_latency;
310 unsigned long count;
311 int ret;
312
313 /* start the tracing */
314 tr->ctrl = 1;
315 trace->init(tr);
316 /* reset the max latency */
317 tracing_max_latency = 0;
318 /* disable preemption for a bit */
319 preempt_disable();
320 udelay(100);
321 preempt_enable();
322 /* stop the tracing. */
323 tr->ctrl = 0;
324 trace->ctrl_update(tr);
325 /* check both trace buffers */
326 ret = trace_test_buffer(tr, NULL);
327 if (!ret)
328 ret = trace_test_buffer(&max_tr, &count);
329 trace->reset(tr);
330
331 if (!ret && !count) {
332 printk(KERN_CONT ".. no entries found ..");
333 ret = -1;
334 }
335
336 tracing_max_latency = save_max;
337
338 return ret;
339}
340#endif /* CONFIG_PREEMPT_TRACER */
341
342#if defined(CONFIG_IRQSOFF_TRACER) && defined(CONFIG_PREEMPT_TRACER)
343int
344trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *tr)
345{
346 unsigned long save_max = tracing_max_latency;
347 unsigned long count;
348 int ret;
349
350 /* start the tracing */
351 tr->ctrl = 1;
352 trace->init(tr);
353
354 /* reset the max latency */
355 tracing_max_latency = 0;
356
357 /* disable preemption and interrupts for a bit */
358 preempt_disable();
359 local_irq_disable();
360 udelay(100);
361 preempt_enable();
362 /* reverse the order of preempt vs irqs */
363 local_irq_enable();
364
365 /* stop the tracing. */
366 tr->ctrl = 0;
367 trace->ctrl_update(tr);
368 /* check both trace buffers */
369 ret = trace_test_buffer(tr, NULL);
370 if (ret)
371 goto out;
372
373 ret = trace_test_buffer(&max_tr, &count);
374 if (ret)
375 goto out;
376
377 if (!ret && !count) {
378 printk(KERN_CONT ".. no entries found ..");
379 ret = -1;
380 goto out;
381 }
382
383 /* do the test by disabling interrupts first this time */
384 tracing_max_latency = 0;
385 tr->ctrl = 1;
386 trace->ctrl_update(tr);
387 preempt_disable();
388 local_irq_disable();
389 udelay(100);
390 preempt_enable();
391 /* reverse the order of preempt vs irqs */
392 local_irq_enable();
393
394 /* stop the tracing. */
395 tr->ctrl = 0;
396 trace->ctrl_update(tr);
397 /* check both trace buffers */
398 ret = trace_test_buffer(tr, NULL);
399 if (ret)
400 goto out;
401
402 ret = trace_test_buffer(&max_tr, &count);
403
404 if (!ret && !count) {
405 printk(KERN_CONT ".. no entries found ..");
406 ret = -1;
407 goto out;
408 }
409
410 out:
411 trace->reset(tr);
412 tracing_max_latency = save_max;
413
414 return ret;
415}
416#endif /* CONFIG_IRQSOFF_TRACER && CONFIG_PREEMPT_TRACER */
417
418#ifdef CONFIG_SCHED_TRACER
419static int trace_wakeup_test_thread(void *data)
420{
421	/* Make this an RT thread; the priority doesn't need to be too high */
422 struct sched_param param = { .sched_priority = 5 };
423 struct completion *x = data;
424
425 sched_setscheduler(current, SCHED_FIFO, &param);
426
427	/* Let the waiter know we have our new prio */
428 complete(x);
429
430 /* now go to sleep and let the test wake us up */
431 set_current_state(TASK_INTERRUPTIBLE);
432 schedule();
433
434 /* we are awake, now wait to disappear */
435 while (!kthread_should_stop()) {
436 /*
437 * This is an RT task, do short sleeps to let
438 * others run.
439 */
440 msleep(100);
441 }
442
443 return 0;
444}
445
446int
447trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr)
448{
449 unsigned long save_max = tracing_max_latency;
450 struct task_struct *p;
451 struct completion isrt;
452 unsigned long count;
453 int ret;
454
455 init_completion(&isrt);
456
457 /* create a high prio thread */
458 p = kthread_run(trace_wakeup_test_thread, &isrt, "ftrace-test");
459 if (IS_ERR(p)) {
460 printk(KERN_CONT "Failed to create ftrace wakeup test thread ");
461 return -1;
462 }
463
464 /* make sure the thread is running at an RT prio */
465 wait_for_completion(&isrt);
466
467 /* start the tracing */
468 tr->ctrl = 1;
469 trace->init(tr);
470 /* reset the max latency */
471 tracing_max_latency = 0;
472
473 /* sleep to let the RT thread sleep too */
474 msleep(100);
475
476 /*
477 * Yes this is slightly racy. It is possible that for some
478 * strange reason that the RT thread we created, did not
479 * call schedule for 100ms after doing the completion,
480 * and we do a wakeup on a task that already is awake.
481 * But that is extremely unlikely, and the worst thing that
482 * happens in such a case, is that we disable tracing.
483	 * Honestly, if this race does happen something is horribly
484 * wrong with the system.
485 */
486
487 wake_up_process(p);
488
489 /* stop the tracing. */
490 tr->ctrl = 0;
491 trace->ctrl_update(tr);
492 /* check both trace buffers */
493 ret = trace_test_buffer(tr, NULL);
494 if (!ret)
495 ret = trace_test_buffer(&max_tr, &count);
496
497
498 trace->reset(tr);
499
500 tracing_max_latency = save_max;
501
502 /* kill the thread */
503 kthread_stop(p);
504
505 if (!ret && !count) {
506 printk(KERN_CONT ".. no entries found ..");
507 ret = -1;
508 }
509
510 return ret;
511}
512#endif /* CONFIG_SCHED_TRACER */
513
514#ifdef CONFIG_CONTEXT_SWITCH_TRACER
515int
516trace_selftest_startup_sched_switch(struct tracer *trace, struct trace_array *tr)
517{
518 unsigned long count;
519 int ret;
520
521 /* start the tracing */
522 tr->ctrl = 1;
523 trace->init(tr);
524	/* Sleep for 1/10 of a second */
525 msleep(100);
526 /* stop the tracing. */
527 tr->ctrl = 0;
528 trace->ctrl_update(tr);
529 /* check the trace buffer */
530 ret = trace_test_buffer(tr, &count);
531 trace->reset(tr);
532
533 if (!ret && !count) {
534 printk(KERN_CONT ".. no entries found ..");
535 ret = -1;
536 }
537
538 return ret;
539}
540#endif /* CONFIG_CONTEXT_SWITCH_TRACER */
541
542#ifdef CONFIG_SYSPROF_TRACER
543int
544trace_selftest_startup_sysprof(struct tracer *trace, struct trace_array *tr)
545{
546 unsigned long count;
547 int ret;
548
549 /* start the tracing */
550 tr->ctrl = 1;
551 trace->init(tr);
552	/* Sleep for 1/10 of a second */
553 msleep(100);
554 /* stop the tracing. */
555 tr->ctrl = 0;
556 trace->ctrl_update(tr);
557 /* check the trace buffer */
558 ret = trace_test_buffer(tr, &count);
559 trace->reset(tr);
560
561 return ret;
562}
563#endif /* CONFIG_SYSPROF_TRACER */
diff --git a/kernel/trace/trace_selftest_dynamic.c b/kernel/trace/trace_selftest_dynamic.c
new file mode 100644
index 000000000000..54dd77cce5bf
--- /dev/null
+++ b/kernel/trace/trace_selftest_dynamic.c
@@ -0,0 +1,7 @@
1#include "trace.h"
2
3int DYN_FTRACE_TEST_NAME(void)
4{
5 /* used to call mcount */
6 return 0;
7}
diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c
new file mode 100644
index 000000000000..db58fb66a135
--- /dev/null
+++ b/kernel/trace/trace_sysprof.c
@@ -0,0 +1,363 @@
1/*
2 * trace stack traces
3 *
4 * Copyright (C) 2004-2008, Soeren Sandmann
5 * Copyright (C) 2007 Steven Rostedt <srostedt@redhat.com>
6 * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com>
7 */
8#include <linux/kallsyms.h>
9#include <linux/debugfs.h>
10#include <linux/hrtimer.h>
11#include <linux/uaccess.h>
12#include <linux/ftrace.h>
13#include <linux/module.h>
14#include <linux/irq.h>
15#include <linux/fs.h>
16
17#include <asm/stacktrace.h>
18
19#include "trace.h"
20
21static struct trace_array *sysprof_trace;
22static int __read_mostly tracer_enabled;
23
24/*
25 * 1 msec sample interval by default:
26 */
27static unsigned long sample_period = 1000000;
28static const unsigned int sample_max_depth = 512;
29
30static DEFINE_MUTEX(sample_timer_lock);
31/*
32 * Per CPU hrtimers that do the profiling:
33 */
34static DEFINE_PER_CPU(struct hrtimer, stack_trace_hrtimer);
35
36struct stack_frame {
37 const void __user *next_fp;
38 unsigned long return_address;
39};
40
41static int copy_stack_frame(const void __user *fp, struct stack_frame *frame)
42{
43 int ret;
44
45 if (!access_ok(VERIFY_READ, fp, sizeof(*frame)))
46 return 0;
47
48 ret = 1;
49 pagefault_disable();
50 if (__copy_from_user_inatomic(frame, fp, sizeof(*frame)))
51 ret = 0;
52 pagefault_enable();
53
54 return ret;
55}
56
57struct backtrace_info {
58 struct trace_array_cpu *data;
59 struct trace_array *tr;
60 int pos;
61};
62
63static void
64backtrace_warning_symbol(void *data, char *msg, unsigned long symbol)
65{
66 /* Ignore warnings */
67}
68
69static void backtrace_warning(void *data, char *msg)
70{
71 /* Ignore warnings */
72}
73
74static int backtrace_stack(void *data, char *name)
75{
76 /* Don't bother with IRQ stacks for now */
77 return -1;
78}
79
80static void backtrace_address(void *data, unsigned long addr, int reliable)
81{
82 struct backtrace_info *info = data;
83
84 if (info->pos < sample_max_depth && reliable) {
85 __trace_special(info->tr, info->data, 1, addr, 0);
86
87 info->pos++;
88 }
89}
90
91static const struct stacktrace_ops backtrace_ops = {
92 .warning = backtrace_warning,
93 .warning_symbol = backtrace_warning_symbol,
94 .stack = backtrace_stack,
95 .address = backtrace_address,
96};
97
98static int
99trace_kernel(struct pt_regs *regs, struct trace_array *tr,
100 struct trace_array_cpu *data)
101{
102 struct backtrace_info info;
103 unsigned long bp;
104 char *stack;
105
106 info.tr = tr;
107 info.data = data;
108 info.pos = 1;
109
110 __trace_special(info.tr, info.data, 1, regs->ip, 0);
111
112 stack = ((char *)regs + sizeof(struct pt_regs));
113#ifdef CONFIG_FRAME_POINTER
114 bp = regs->bp;
115#else
116 bp = 0;
117#endif
118
119 dump_trace(NULL, regs, (void *)stack, bp, &backtrace_ops, &info);
120
121 return info.pos;
122}
123
124static void timer_notify(struct pt_regs *regs, int cpu)
125{
126 struct trace_array_cpu *data;
127 struct stack_frame frame;
128 struct trace_array *tr;
129 const void __user *fp;
130 int is_user;
131 int i;
132
133 if (!regs)
134 return;
135
136 tr = sysprof_trace;
137 data = tr->data[cpu];
138 is_user = user_mode(regs);
139
140 if (!current || current->pid == 0)
141 return;
142
143 if (is_user && current->state != TASK_RUNNING)
144 return;
145
146 __trace_special(tr, data, 0, 0, current->pid);
147
148 if (!is_user)
149 i = trace_kernel(regs, tr, data);
150 else
151 i = 0;
152
153 /*
154 * Trace user stack if we are not a kernel thread
155 */
156 if (current->mm && i < sample_max_depth) {
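		/* the user-mode registers are saved at the top of the kernel stack (what task_pt_regs() would return on x86) */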
157 regs = (struct pt_regs *)current->thread.sp0 - 1;
158
159 fp = (void __user *)regs->bp;
160
161 __trace_special(tr, data, 2, regs->ip, 0);
162
163 while (i < sample_max_depth) {
164 frame.next_fp = NULL;
165 frame.return_address = 0;
166 if (!copy_stack_frame(fp, &frame))
167 break;
168 if ((unsigned long)fp < regs->sp)
169 break;
170
171 __trace_special(tr, data, 2, frame.return_address,
172 (unsigned long)fp);
173 fp = frame.next_fp;
174
175 i++;
176 }
177
178 }
179
180 /*
181 * Special trace entry if we overflow the max depth:
182 */
183 if (i == sample_max_depth)
184 __trace_special(tr, data, -1, -1, -1);
185
186 __trace_special(tr, data, 3, current->pid, i);
187}
188
189static enum hrtimer_restart stack_trace_timer_fn(struct hrtimer *hrtimer)
190{
191 /* trace here */
192 timer_notify(get_irq_regs(), smp_processor_id());
193
194 hrtimer_forward_now(hrtimer, ns_to_ktime(sample_period));
195
196 return HRTIMER_RESTART;
197}
198
199static void start_stack_timer(int cpu)
200{
201 struct hrtimer *hrtimer = &per_cpu(stack_trace_hrtimer, cpu);
202
203 hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
204 hrtimer->function = stack_trace_timer_fn;
205 hrtimer->cb_mode = HRTIMER_CB_IRQSAFE_PERCPU;
206
207 hrtimer_start(hrtimer, ns_to_ktime(sample_period), HRTIMER_MODE_REL);
208}
209
210static void start_stack_timers(void)
211{
212 cpumask_t saved_mask = current->cpus_allowed;
213 int cpu;
214
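	/* migrate to each CPU in turn so its per-cpu hrtimer is started on that CPU */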
215 for_each_online_cpu(cpu) {
216 set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
217 start_stack_timer(cpu);
218 }
219 set_cpus_allowed_ptr(current, &saved_mask);
220}
221
222static void stop_stack_timer(int cpu)
223{
224 struct hrtimer *hrtimer = &per_cpu(stack_trace_hrtimer, cpu);
225
226 hrtimer_cancel(hrtimer);
227}
228
229static void stop_stack_timers(void)
230{
231 int cpu;
232
233 for_each_online_cpu(cpu)
234 stop_stack_timer(cpu);
235}
236
237static void stack_reset(struct trace_array *tr)
238{
239 int cpu;
240
241 tr->time_start = ftrace_now(tr->cpu);
242
243 for_each_online_cpu(cpu)
244 tracing_reset(tr->data[cpu]);
245}
246
247static void start_stack_trace(struct trace_array *tr)
248{
249 mutex_lock(&sample_timer_lock);
250 stack_reset(tr);
251 start_stack_timers();
252 tracer_enabled = 1;
253 mutex_unlock(&sample_timer_lock);
254}
255
256static void stop_stack_trace(struct trace_array *tr)
257{
258 mutex_lock(&sample_timer_lock);
259 stop_stack_timers();
260 tracer_enabled = 0;
261 mutex_unlock(&sample_timer_lock);
262}
263
264static void stack_trace_init(struct trace_array *tr)
265{
266 sysprof_trace = tr;
267
268 if (tr->ctrl)
269 start_stack_trace(tr);
270}
271
272static void stack_trace_reset(struct trace_array *tr)
273{
274 if (tr->ctrl)
275 stop_stack_trace(tr);
276}
277
278static void stack_trace_ctrl_update(struct trace_array *tr)
279{
280 /* When starting a new trace, reset the buffers */
281 if (tr->ctrl)
282 start_stack_trace(tr);
283 else
284 stop_stack_trace(tr);
285}
286
287static struct tracer stack_trace __read_mostly =
288{
289 .name = "sysprof",
290 .init = stack_trace_init,
291 .reset = stack_trace_reset,
292 .ctrl_update = stack_trace_ctrl_update,
293#ifdef CONFIG_FTRACE_SELFTEST
294 .selftest = trace_selftest_startup_sysprof,
295#endif
296};
297
298__init static int init_stack_trace(void)
299{
300 return register_tracer(&stack_trace);
301}
302device_initcall(init_stack_trace);
303
304#define MAX_LONG_DIGITS 22
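/*
 * debugfs knob for the sample period. Values are read and written in
 * usecs; sysprof_sample_write() converts to nsecs and enforces a 100 usec
 * floor. Example (path assumes debugfs mounted at /sys/kernel/debug and
 * the tracing directory in use):
 *   echo 500 > /sys/kernel/debug/tracing/sysprof_sample_period
 */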
305
306static ssize_t
307sysprof_sample_read(struct file *filp, char __user *ubuf,
308 size_t cnt, loff_t *ppos)
309{
310 char buf[MAX_LONG_DIGITS];
311 int r;
312
313 r = sprintf(buf, "%ld\n", nsecs_to_usecs(sample_period));
314
315 return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
316}
317
318static ssize_t
319sysprof_sample_write(struct file *filp, const char __user *ubuf,
320 size_t cnt, loff_t *ppos)
321{
322 char buf[MAX_LONG_DIGITS];
323 unsigned long val;
324
325 if (cnt > MAX_LONG_DIGITS-1)
326 cnt = MAX_LONG_DIGITS-1;
327
328 if (copy_from_user(&buf, ubuf, cnt))
329 return -EFAULT;
330
331 buf[cnt] = 0;
332
333 val = simple_strtoul(buf, NULL, 10);
334 /*
335 * Enforce a minimum sample period of 100 usecs:
336 */
337 if (val < 100)
338 val = 100;
339
340 mutex_lock(&sample_timer_lock);
341 stop_stack_timers();
342 sample_period = val * 1000;
343 start_stack_timers();
344 mutex_unlock(&sample_timer_lock);
345
346 return cnt;
347}
348
349static struct file_operations sysprof_sample_fops = {
350 .read = sysprof_sample_read,
351 .write = sysprof_sample_write,
352};
353
354void init_tracer_sysprof_debugfs(struct dentry *d_tracer)
355{
356 struct dentry *entry;
357
358 entry = debugfs_create_file("sysprof_sample_period", 0644,
359 d_tracer, NULL, &sysprof_sample_fops);
360 if (entry)
361 return;
362	pr_warning("Could not create debugfs 'sysprof_sample_period' entry\n");
363}
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index 4ab1b584961b..8ebcd8532dfb 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -28,14 +28,14 @@
 void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk)
 {
 	struct timespec uptime, ts;
-	s64 ac_etime;
+	u64 ac_etime;
 
 	BUILD_BUG_ON(TS_COMM_LEN < TASK_COMM_LEN);
 
 	/* calculate task elapsed time in timespec */
 	do_posix_clock_monotonic_gettime(&uptime);
 	ts = timespec_sub(uptime, tsk->start_time);
-	/* rebase elapsed time to usec */
+	/* rebase elapsed time to usec (should never be negative) */
 	ac_etime = timespec_to_ns(&ts);
 	do_div(ac_etime, NSEC_PER_USEC);
 	stats->ac_etime = ac_etime;
@@ -84,9 +84,9 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p)
 {
 	struct mm_struct *mm;
 
-	/* convert pages-jiffies to Mbyte-usec */
-	stats->coremem = jiffies_to_usecs(p->acct_rss_mem1) * PAGE_SIZE / MB;
-	stats->virtmem = jiffies_to_usecs(p->acct_vm_mem1) * PAGE_SIZE / MB;
+	/* convert pages-usec to Mbyte-usec */
+	stats->coremem = p->acct_rss_mem1 * PAGE_SIZE / MB;
+	stats->virtmem = p->acct_vm_mem1 * PAGE_SIZE / MB;
 	mm = get_task_mm(p);
 	if (mm) {
 		/* adjust to KB unit */
@@ -94,10 +94,10 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p)
 		stats->hiwater_vm = mm->hiwater_vm * PAGE_SIZE / KB;
 		mmput(mm);
 	}
-	stats->read_char = p->rchar;
-	stats->write_char = p->wchar;
-	stats->read_syscalls = p->syscr;
-	stats->write_syscalls = p->syscw;
+	stats->read_char = p->ioac.rchar;
+	stats->write_char = p->ioac.wchar;
+	stats->read_syscalls = p->ioac.syscr;
+	stats->write_syscalls = p->ioac.syscw;
 #ifdef CONFIG_TASK_IO_ACCOUNTING
 	stats->read_bytes = p->ioac.read_bytes;
 	stats->write_bytes = p->ioac.write_bytes;
@@ -118,12 +118,19 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p)
 void acct_update_integrals(struct task_struct *tsk)
 {
 	if (likely(tsk->mm)) {
-		long delta = cputime_to_jiffies(
-			cputime_sub(tsk->stime, tsk->acct_stimexpd));
+		cputime_t time, dtime;
+		struct timeval value;
+		u64 delta;
+
+		time = tsk->stime + tsk->utime;
+		dtime = cputime_sub(time, tsk->acct_timexpd);
+		jiffies_to_timeval(cputime_to_jiffies(dtime), &value);
+		delta = value.tv_sec;
+		delta = delta * USEC_PER_SEC + value.tv_usec;
 
 		if (delta == 0)
 			return;
-		tsk->acct_stimexpd = tsk->stime;
+		tsk->acct_timexpd = time;
 		tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm);
 		tsk->acct_vm_mem1 += delta * tsk->mm->total_vm;
 	}
@@ -135,7 +142,7 @@ void acct_update_integrals(struct task_struct *tsk)
135 */ 142 */
136void acct_clear_integrals(struct task_struct *tsk) 143void acct_clear_integrals(struct task_struct *tsk)
137{ 144{
138 tsk->acct_stimexpd = 0; 145 tsk->acct_timexpd = 0;
139 tsk->acct_rss_mem1 = 0; 146 tsk->acct_rss_mem1 = 0;
140 tsk->acct_vm_mem1 = 0; 147 tsk->acct_vm_mem1 = 0;
141} 148}
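
The acct_update_integrals() hunk above switches the accounting delta from jiffies of system time to microseconds of combined user plus system time. As an illustration of the rebasing arithmetic only, here is a standalone sketch with made-up numbers (not kernel API): the seconds/microseconds pair that jiffies_to_timeval() would produce is folded into a single usec count before it weights the memory figures.

#include <stdio.h>
#include <stdint.h>

#define USEC_PER_SEC 1000000ULL

int main(void)
{
	/* Pretend jiffies_to_timeval() reported 2.340000 s of new CPU time. */
	uint64_t tv_sec = 2, tv_usec = 340000;
	uint64_t rss_pages = 1500;	/* hypothetical get_mm_rss() result */
	uint64_t acct_rss_mem1 = 0;	/* accumulated page-usec product */

	/* Same fold as the patched code: delta ends up in microseconds. */
	uint64_t delta = tv_sec * USEC_PER_SEC + tv_usec;

	if (delta != 0)
		acct_rss_mem1 += delta * rss_pages;

	printf("delta=%llu usec, acct_rss_mem1=%llu page-usec\n",
	       (unsigned long long)delta, (unsigned long long)acct_rss_mem1);
	return 0;
}
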
diff --git a/kernel/user.c b/kernel/user.c
index 865ecf57a096..39d6159fae43 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -169,7 +169,7 @@ static ssize_t cpu_rt_runtime_show(struct kobject *kobj,
 {
 	struct user_struct *up = container_of(kobj, struct user_struct, kobj);
 
-	return sprintf(buf, "%lu\n", sched_group_rt_runtime(up->tg));
+	return sprintf(buf, "%ld\n", sched_group_rt_runtime(up->tg));
 }
 
 static ssize_t cpu_rt_runtime_store(struct kobject *kobj,
@@ -180,7 +180,7 @@ static ssize_t cpu_rt_runtime_store(struct kobject *kobj,
 	unsigned long rt_runtime;
 	int rc;
 
-	sscanf(buf, "%lu", &rt_runtime);
+	sscanf(buf, "%ld", &rt_runtime);
 
 	rc = sched_group_set_rt_runtime(up->tg, rt_runtime);
 
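
The "%lu" to "%ld" switch above matters because sched_group_rt_runtime() returns a signed long and, in the scheduler of this era, reports the "no limit" case as -1; printed unsigned, that sentinel shows up as a huge number. A standalone illustration of the formatting difference (not kernel code):

#include <stdio.h>

int main(void)
{
	long rt_runtime = -1;	/* "unlimited" sentinel value */

	printf("%%lu prints %lu\n", (unsigned long)rt_runtime);	/* e.g. 18446744073709551615 on 64-bit */
	printf("%%ld prints %ld\n", rt_runtime);			/* -1, which sscanf("%ld") can read back */
	return 0;
}
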
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index a9ab0596de44..532858fa5b88 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -6,7 +6,6 @@
  */
 
 #include <linux/module.h>
-#include <linux/version.h>
 #include <linux/nsproxy.h>
 #include <linux/slab.h>
 #include <linux/user_namespace.h>
diff --git a/kernel/utsname.c b/kernel/utsname.c
index 64d398f12444..815237a55af8 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -12,7 +12,6 @@
 #include <linux/module.h>
 #include <linux/uts.h>
 #include <linux/utsname.h>
-#include <linux/version.h>
 #include <linux/err.h>
 #include <linux/slab.h>
 
diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c
index fe3a56c2256d..4ab9659d269e 100644
--- a/kernel/utsname_sysctl.c
+++ b/kernel/utsname_sysctl.c
@@ -12,7 +12,6 @@
 #include <linux/module.h>
 #include <linux/uts.h>
 #include <linux/utsname.h>
-#include <linux/version.h>
 #include <linux/sysctl.h>
 
 static void *get_uts(ctl_table *table, int write)
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 29fc39f1029c..4048e92aa04f 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -13,7 +13,7 @@
  * Kai Petzke <wpp@marie.physik.tu-berlin.de>
  * Theodore Ts'o <tytso@mit.edu>
  *
- * Made to use alloc_percpu by Christoph Lameter <clameter@sgi.com>.
+ * Made to use alloc_percpu by Christoph Lameter.
  */
 
 #include <linux/module.h>
@@ -125,7 +125,7 @@ struct cpu_workqueue_struct *get_wq_data(struct work_struct *work)
 }
 
 static void insert_work(struct cpu_workqueue_struct *cwq,
-			struct work_struct *work, int tail)
+			struct work_struct *work, struct list_head *head)
 {
 	set_wq_data(work, cwq);
 	/*
@@ -133,21 +133,17 @@ static void insert_work(struct cpu_workqueue_struct *cwq,
 	 * result of list_add() below, see try_to_grab_pending().
 	 */
 	smp_wmb();
-	if (tail)
-		list_add_tail(&work->entry, &cwq->worklist);
-	else
-		list_add(&work->entry, &cwq->worklist);
+	list_add_tail(&work->entry, head);
 	wake_up(&cwq->more_work);
 }
 
-/* Preempt must be disabled. */
 static void __queue_work(struct cpu_workqueue_struct *cwq,
 			 struct work_struct *work)
 {
 	unsigned long flags;
 
 	spin_lock_irqsave(&cwq->lock, flags);
-	insert_work(cwq, work, 1);
+	insert_work(cwq, work, &cwq->worklist);
 	spin_unlock_irqrestore(&cwq->lock, flags);
 }
 
@@ -163,17 +159,39 @@ static void __queue_work(struct cpu_workqueue_struct *cwq,
  */
 int queue_work(struct workqueue_struct *wq, struct work_struct *work)
 {
+	int ret;
+
+	ret = queue_work_on(get_cpu(), wq, work);
+	put_cpu();
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(queue_work);
+
+/**
+ * queue_work_on - queue work on specific cpu
+ * @cpu: CPU number to execute work on
+ * @wq: workqueue to use
+ * @work: work to queue
+ *
+ * Returns 0 if @work was already on a queue, non-zero otherwise.
+ *
+ * We queue the work to a specific CPU, the caller must ensure it
+ * can't go away.
+ */
+int
+queue_work_on(int cpu, struct workqueue_struct *wq, struct work_struct *work)
+{
 	int ret = 0;
 
 	if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) {
 		BUG_ON(!list_empty(&work->entry));
-		__queue_work(wq_per_cpu(wq, get_cpu()), work);
-		put_cpu();
+		__queue_work(wq_per_cpu(wq, cpu), work);
 		ret = 1;
 	}
 	return ret;
 }
-EXPORT_SYMBOL_GPL(queue_work);
+EXPORT_SYMBOL_GPL(queue_work_on);
 
 static void delayed_work_timer_fn(unsigned long __data)
 {
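
With the hunk above, queue_work() becomes a thin wrapper that picks the current CPU, while the new queue_work_on() lets callers pin a work item to an explicit CPU. A hedged usage sketch follows; the module, workqueue name and CPU number are invented for illustration, and the caller is responsible for keeping that CPU online, as the kerneldoc notes.

#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/module.h>
#include <linux/smp.h>
#include <linux/workqueue.h>

static struct workqueue_struct *demo_wq;	/* hypothetical workqueue */
static struct work_struct demo_work;

static void demo_work_fn(struct work_struct *work)
{
	pr_info("demo work ran on cpu %d\n", smp_processor_id());
}

static int __init demo_init(void)
{
	demo_wq = create_workqueue("demo_wq");
	if (!demo_wq)
		return -ENOMEM;

	INIT_WORK(&demo_work, demo_work_fn);
	/* Pin the work to CPU 0 instead of whatever CPU we happen to run on. */
	queue_work_on(0, demo_wq, &demo_work);
	return 0;
}

static void __exit demo_exit(void)
{
	flush_workqueue(demo_wq);
	destroy_workqueue(demo_wq);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");
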
@@ -272,11 +290,11 @@ static void run_workqueue(struct cpu_workqueue_struct *cwq)
 
 		BUG_ON(get_wq_data(work) != cwq);
 		work_clear_pending(work);
-		lock_acquire(&cwq->wq->lockdep_map, 0, 0, 0, 2, _THIS_IP_);
-		lock_acquire(&lockdep_map, 0, 0, 0, 2, _THIS_IP_);
+		lock_map_acquire(&cwq->wq->lockdep_map);
+		lock_map_acquire(&lockdep_map);
 		f(work);
-		lock_release(&lockdep_map, 1, _THIS_IP_);
-		lock_release(&cwq->wq->lockdep_map, 1, _THIS_IP_);
+		lock_map_release(&lockdep_map);
+		lock_map_release(&cwq->wq->lockdep_map);
 
 		if (unlikely(in_atomic() || lockdep_depth(current) > 0)) {
 			printk(KERN_ERR "BUG: workqueue leaked lock or atomic: "
@@ -337,14 +355,14 @@ static void wq_barrier_func(struct work_struct *work)
 }
 
 static void insert_wq_barrier(struct cpu_workqueue_struct *cwq,
-			struct wq_barrier *barr, int tail)
+			struct wq_barrier *barr, struct list_head *head)
 {
 	INIT_WORK(&barr->work, wq_barrier_func);
 	__set_bit(WORK_STRUCT_PENDING, work_data_bits(&barr->work));
 
 	init_completion(&barr->done);
 
-	insert_work(cwq, &barr->work, tail);
+	insert_work(cwq, &barr->work, head);
 }
 
 static int flush_cpu_workqueue(struct cpu_workqueue_struct *cwq)
@@ -364,7 +382,7 @@ static int flush_cpu_workqueue(struct cpu_workqueue_struct *cwq)
 	active = 0;
 	spin_lock_irq(&cwq->lock);
 	if (!list_empty(&cwq->worklist) || cwq->current_work != NULL) {
-		insert_wq_barrier(cwq, &barr, 1);
+		insert_wq_barrier(cwq, &barr, &cwq->worklist);
 		active = 1;
 	}
 	spin_unlock_irq(&cwq->lock);
@@ -395,13 +413,64 @@ void flush_workqueue(struct workqueue_struct *wq)
 	int cpu;
 
 	might_sleep();
-	lock_acquire(&wq->lockdep_map, 0, 0, 0, 2, _THIS_IP_);
-	lock_release(&wq->lockdep_map, 1, _THIS_IP_);
-	for_each_cpu_mask(cpu, *cpu_map)
+	lock_map_acquire(&wq->lockdep_map);
+	lock_map_release(&wq->lockdep_map);
+	for_each_cpu_mask_nr(cpu, *cpu_map)
 		flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, cpu));
 }
 EXPORT_SYMBOL_GPL(flush_workqueue);
 
+/**
+ * flush_work - block until a work_struct's callback has terminated
+ * @work: the work which is to be flushed
+ *
+ * Returns false if @work has already terminated.
+ *
+ * It is expected that, prior to calling flush_work(), the caller has
+ * arranged for the work to not be requeued, otherwise it doesn't make
+ * sense to use this function.
+ */
+int flush_work(struct work_struct *work)
+{
+	struct cpu_workqueue_struct *cwq;
+	struct list_head *prev;
+	struct wq_barrier barr;
+
+	might_sleep();
+	cwq = get_wq_data(work);
+	if (!cwq)
+		return 0;
+
+	lock_map_acquire(&cwq->wq->lockdep_map);
+	lock_map_release(&cwq->wq->lockdep_map);
+
+	prev = NULL;
+	spin_lock_irq(&cwq->lock);
+	if (!list_empty(&work->entry)) {
+		/*
+		 * See the comment near try_to_grab_pending()->smp_rmb().
+		 * If it was re-queued under us we are not going to wait.
+		 */
+		smp_rmb();
+		if (unlikely(cwq != get_wq_data(work)))
+			goto out;
+		prev = &work->entry;
+	} else {
+		if (cwq->current_work != work)
+			goto out;
+		prev = &cwq->worklist;
+	}
+	insert_wq_barrier(cwq, &barr, prev->next);
+out:
+	spin_unlock_irq(&cwq->lock);
+	if (!prev)
+		return 0;
+
+	wait_for_completion(&barr.done);
+	return 1;
+}
+EXPORT_SYMBOL_GPL(flush_work);
+
 /*
  * Upon a successful return (>= 0), the caller "owns" WORK_STRUCT_PENDING bit,
  * so this work can't be re-armed in any way.
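
The new flush_work() above waits for one specific work item, by queueing a barrier right behind it, instead of draining a whole workqueue. Its kerneldoc requires that the caller first prevent requeueing; below is a minimal sketch of that calling pattern, with invented names and with the memory-ordering details of the stop flag glossed over.

#include <linux/module.h>
#include <linux/workqueue.h>

static struct work_struct poll_work;	/* hypothetical self-requeueing work */
static int stopping;			/* set before flushing, see poll_exit() */

static void poll_work_fn(struct work_struct *work)
{
	/* ... one polling pass ... */

	/* Only re-arm while live, so flush_work() below cannot race a requeue. */
	if (!stopping)
		schedule_work(&poll_work);
}

static int __init poll_init(void)
{
	INIT_WORK(&poll_work, poll_work_fn);
	schedule_work(&poll_work);
	return 0;
}

static void __exit poll_exit(void)
{
	stopping = 1;			/* arrange for no further requeueing */
	flush_work(&poll_work);		/* then wait for the in-flight pass */
}

module_init(poll_init);
module_exit(poll_exit);
MODULE_LICENSE("GPL");
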
@@ -449,7 +518,7 @@ static void wait_on_cpu_work(struct cpu_workqueue_struct *cwq,
 
 	spin_lock_irq(&cwq->lock);
 	if (unlikely(cwq->current_work == work)) {
-		insert_wq_barrier(cwq, &barr, 0);
+		insert_wq_barrier(cwq, &barr, cwq->worklist.next);
 		running = 1;
 	}
 	spin_unlock_irq(&cwq->lock);
@@ -467,8 +536,8 @@ static void wait_on_work(struct work_struct *work)
 
 	might_sleep();
 
-	lock_acquire(&work->lockdep_map, 0, 0, 0, 2, _THIS_IP_);
-	lock_release(&work->lockdep_map, 1, _THIS_IP_);
+	lock_map_acquire(&work->lockdep_map);
+	lock_map_release(&work->lockdep_map);
 
 	cwq = get_wq_data(work);
 	if (!cwq)
@@ -477,7 +546,7 @@ static void wait_on_work(struct work_struct *work)
 	wq = cwq->wq;
 	cpu_map = wq_cpu_map(wq);
 
-	for_each_cpu_mask(cpu, *cpu_map)
+	for_each_cpu_mask_nr(cpu, *cpu_map)
 		wait_on_cpu_work(per_cpu_ptr(wq->cpu_wq, cpu), work);
 }
 
@@ -553,6 +622,19 @@ int schedule_work(struct work_struct *work)
 }
 EXPORT_SYMBOL(schedule_work);
 
+/*
+ * schedule_work_on - put work task on a specific cpu
+ * @cpu: cpu to put the work task on
+ * @work: job to be done
+ *
+ * This puts a job on a specific cpu
+ */
+int schedule_work_on(int cpu, struct work_struct *work)
+{
+	return queue_work_on(cpu, keventd_wq, work);
+}
+EXPORT_SYMBOL(schedule_work_on);
+
 /**
  * schedule_delayed_work - put work task in global workqueue after delay
  * @dwork: job to be done
@@ -607,10 +689,10 @@ int schedule_on_each_cpu(work_func_t func)
 		struct work_struct *work = per_cpu_ptr(works, cpu);
 
 		INIT_WORK(work, func);
-		set_bit(WORK_STRUCT_PENDING, work_data_bits(work));
-		__queue_work(per_cpu_ptr(keventd_wq->cpu_wq, cpu), work);
+		schedule_work_on(cpu, work);
 	}
-	flush_workqueue(keventd_wq);
+	for_each_online_cpu(cpu)
+		flush_work(per_cpu_ptr(works, cpu));
 	put_online_cpus();
 	free_percpu(works);
 	return 0;
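
Put together, the schedule_on_each_cpu() hunk above now queues one per-cpu work item with schedule_work_on() and then waits on each item individually with flush_work(), rather than flushing all of keventd. For readability, here is the resulting function spelled out in one piece; the lines outside the hunk are recalled from the surrounding file and should be read as an approximation.

int schedule_on_each_cpu(work_func_t func)
{
	int cpu;
	struct work_struct *works;

	works = alloc_percpu(struct work_struct);
	if (!works)
		return -ENOMEM;

	get_online_cpus();
	for_each_online_cpu(cpu) {
		struct work_struct *work = per_cpu_ptr(works, cpu);

		INIT_WORK(work, func);
		schedule_work_on(cpu, work);
	}
	for_each_online_cpu(cpu)
		flush_work(per_cpu_ptr(works, cpu));
	put_online_cpus();
	free_percpu(works);
	return 0;
}
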
@@ -747,11 +829,22 @@ struct workqueue_struct *__create_workqueue_key(const char *name,
 		err = create_workqueue_thread(cwq, singlethread_cpu);
 		start_workqueue_thread(cwq, -1);
 	} else {
-		get_online_cpus();
+		cpu_maps_update_begin();
+		/*
+		 * We must place this wq on list even if the code below fails.
+		 * cpu_down(cpu) can remove cpu from cpu_populated_map before
+		 * destroy_workqueue() takes the lock, in that case we leak
+		 * cwq[cpu]->thread.
+		 */
 		spin_lock(&workqueue_lock);
 		list_add(&wq->list, &workqueues);
 		spin_unlock(&workqueue_lock);
-
+		/*
+		 * We must initialize cwqs for each possible cpu even if we
+		 * are going to call destroy_workqueue() finally. Otherwise
+		 * cpu_up() can hit the uninitialized cwq once we drop the
+		 * lock.
+		 */
 		for_each_possible_cpu(cpu) {
 			cwq = init_cpu_workqueue(wq, cpu);
 			if (err || !cpu_online(cpu))
@@ -759,7 +852,7 @@ struct workqueue_struct *__create_workqueue_key(const char *name,
 			err = create_workqueue_thread(cwq, cpu);
 			start_workqueue_thread(cwq, cpu);
 		}
-		put_online_cpus();
+		cpu_maps_update_done();
 	}
 
 	if (err) {
@@ -773,18 +866,18 @@ EXPORT_SYMBOL_GPL(__create_workqueue_key);
 static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq)
 {
 	/*
-	 * Our caller is either destroy_workqueue() or CPU_DEAD,
-	 * get_online_cpus() protects cwq->thread.
+	 * Our caller is either destroy_workqueue() or CPU_POST_DEAD,
+	 * cpu_add_remove_lock protects cwq->thread.
 	 */
 	if (cwq->thread == NULL)
 		return;
 
-	lock_acquire(&cwq->wq->lockdep_map, 0, 0, 0, 2, _THIS_IP_);
-	lock_release(&cwq->wq->lockdep_map, 1, _THIS_IP_);
+	lock_map_acquire(&cwq->wq->lockdep_map);
+	lock_map_release(&cwq->wq->lockdep_map);
 
 	flush_cpu_workqueue(cwq);
 	/*
-	 * If the caller is CPU_DEAD and cwq->worklist was not empty,
+	 * If the caller is CPU_POST_DEAD and cwq->worklist was not empty,
 	 * a concurrent flush_workqueue() can insert a barrier after us.
 	 * However, in that case run_workqueue() won't return and check
 	 * kthread_should_stop() until it flushes all work_struct's.
@@ -808,14 +901,14 @@ void destroy_workqueue(struct workqueue_struct *wq)
 	const cpumask_t *cpu_map = wq_cpu_map(wq);
 	int cpu;
 
-	get_online_cpus();
+	cpu_maps_update_begin();
 	spin_lock(&workqueue_lock);
 	list_del(&wq->list);
 	spin_unlock(&workqueue_lock);
 
-	for_each_cpu_mask(cpu, *cpu_map)
+	for_each_cpu_mask_nr(cpu, *cpu_map)
 		cleanup_workqueue_thread(per_cpu_ptr(wq->cpu_wq, cpu));
-	put_online_cpus();
+	cpu_maps_update_done();
 
 	free_percpu(wq->cpu_wq);
 	kfree(wq);
@@ -829,6 +922,7 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
 	unsigned int cpu = (unsigned long)hcpu;
 	struct cpu_workqueue_struct *cwq;
 	struct workqueue_struct *wq;
+	int ret = NOTIFY_OK;
 
 	action &= ~CPU_TASKS_FROZEN;
 
@@ -836,7 +930,7 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
 	case CPU_UP_PREPARE:
 		cpu_set(cpu, cpu_populated_map);
 	}
-
+undo:
 	list_for_each_entry(wq, &workqueues, list) {
 		cwq = per_cpu_ptr(wq->cpu_wq, cpu);
 
@@ -846,7 +940,9 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
 				break;
 			printk(KERN_ERR "workqueue [%s] for %i failed\n",
 				wq->name, cpu);
-			return NOTIFY_BAD;
+			action = CPU_UP_CANCELED;
+			ret = NOTIFY_BAD;
+			goto undo;
 
 		case CPU_ONLINE:
 			start_workqueue_thread(cwq, cpu);
@@ -854,7 +950,7 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
 
 		case CPU_UP_CANCELED:
 			start_workqueue_thread(cwq, -1);
-		case CPU_DEAD:
+		case CPU_POST_DEAD:
 			cleanup_workqueue_thread(cwq);
 			break;
 		}
@@ -862,11 +958,11 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
 
 	switch (action) {
 	case CPU_UP_CANCELED:
-	case CPU_DEAD:
+	case CPU_POST_DEAD:
 		cpu_clear(cpu, cpu_populated_map);
 	}
 
-	return NOTIFY_OK;
+	return ret;
 }
 
 void __init init_workqueues(void)
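
The notifier changes above replace the early "return NOTIFY_BAD" with a rollback: when a worker thread cannot be created, the action is rewritten to CPU_UP_CANCELED and the walk restarts at the undo: label, so every workqueue visited so far is cleaned up before the bad status is returned. A standalone sketch of that replay-to-undo pattern follows; it is generic C with invented setup/teardown helpers, not the kernel code, and it relies on the undo step tolerating items that were never set up.

#include <stdio.h>

#define NITEMS 4

enum phase { SETUP, CANCEL };

/* Pretend item 2 fails to set up, like a CPU whose worker thread won't start. */
static int setup_item(int i)   { return i == 2 ? -1 : 0; }
static void cancel_item(int i) { printf("cancel %d\n", i); }	/* must be a no-op for untouched items */

int main(void)
{
	enum phase action = SETUP;
	int ret = 0, i;

undo:
	for (i = 0; i < NITEMS; i++) {
		switch (action) {
		case SETUP:
			if (setup_item(i) == 0)
				break;
			printf("setup %d failed\n", i);
			action = CANCEL;	/* replay the same walk, undoing as we go */
			ret = -1;
			goto undo;
		case CANCEL:
			cancel_item(i);
			break;
		}
	}
	return ret;
}
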