author     Ingo Molnar <mingo@elte.hu>  2008-10-28 11:26:12 -0400
committer  Ingo Molnar <mingo@elte.hu>  2008-10-28 11:26:12 -0400
commit     7a9787e1eba95a166265e6a260cf30af04ef0a99 (patch)
tree       e730a4565e0318140d2fbd2f0415d18a339d7336 /kernel
parent     41b9eb264c8407655db57b60b4457fe1b2ec9977 (diff)
parent     0173a3265b228da319ceb9c1ec6a5682fd1b2d92 (diff)
Merge commit 'v2.6.28-rc2' into x86/pci-ioapic-boot-irq-quirks
Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Kconfig.freezer | 2
-rw-r--r--  kernel/Kconfig.hz | 2
-rw-r--r--  kernel/Makefile | 9
-rw-r--r--  kernel/acct.c | 224
-rw-r--r--  kernel/audit.c | 13
-rw-r--r--  kernel/audit_tree.c | 48
-rw-r--r--  kernel/auditfilter.c | 10
-rw-r--r--  kernel/auditsc.c | 24
-rw-r--r--  kernel/capability.c | 359
-rw-r--r--  kernel/cgroup.c | 603
-rw-r--r--  kernel/cgroup_debug.c | 4
-rw-r--r--  kernel/cgroup_freezer.c | 379
-rw-r--r--  kernel/compat.c | 111
-rw-r--r--  kernel/configs.c | 9
-rw-r--r--  kernel/cpu.c | 113
-rw-r--r--  kernel/cpuset.c | 702
-rw-r--r--  kernel/delayacct.c | 16
-rw-r--r--  kernel/dma-coherent.c | 155
-rw-r--r--  kernel/dma.c | 2
-rw-r--r--  kernel/exec_domain.c | 36
-rw-r--r--  kernel/exit.c | 226
-rw-r--r--  kernel/fork.c | 235
-rw-r--r--  kernel/freezer.c | 154
-rw-r--r--  kernel/futex.c | 11
-rw-r--r--  kernel/hrtimer.c | 316
-rw-r--r--  kernel/irq/autoprobe.c | 43
-rw-r--r--  kernel/irq/chip.c | 114
-rw-r--r--  kernel/irq/handle.c | 27
-rw-r--r--  kernel/irq/internals.h | 7
-rw-r--r--  kernel/irq/manage.c | 233
-rw-r--r--  kernel/irq/migration.c | 14
-rw-r--r--  kernel/irq/proc.c | 137
-rw-r--r--  kernel/irq/resend.c | 6
-rw-r--r--  kernel/irq/spurious.c | 162
-rw-r--r--  kernel/itimer.c | 33
-rw-r--r--  kernel/kallsyms.c | 3
-rw-r--r--  kernel/kexec.c | 143
-rw-r--r--  kernel/kgdb.c | 107
-rw-r--r--  kernel/kmod.c | 82
-rw-r--r--  kernel/kprobes.c | 132
-rw-r--r--  kernel/ksysfs.c | 35
-rw-r--r--  kernel/kthread.c | 14
-rw-r--r--  kernel/lockdep.c | 309
-rw-r--r--  kernel/lockdep_internals.h | 19
-rw-r--r--  kernel/lockdep_proc.c | 48
-rw-r--r--  kernel/marker.c | 73
-rw-r--r--  kernel/module.c | 711
-rw-r--r--  kernel/mutex.c | 1
-rw-r--r--  kernel/notifier.c | 2
-rw-r--r--  kernel/ns_cgroup.c | 8
-rw-r--r--  kernel/nsproxy.c | 9
-rw-r--r--  kernel/panic.c | 106
-rw-r--r--  kernel/params.c | 276
-rw-r--r--  kernel/pid.c | 10
-rw-r--r--  kernel/pid_namespace.c | 13
-rw-r--r--  kernel/pm_qos_params.c | 41
-rw-r--r--  kernel/posix-cpu-timers.c | 512
-rw-r--r--  kernel/posix-timers.c | 197
-rw-r--r--  kernel/power/Kconfig | 13
-rw-r--r--  kernel/power/disk.c | 26
-rw-r--r--  kernel/power/main.c | 213
-rw-r--r--  kernel/power/power.h | 4
-rw-r--r--  kernel/power/poweroff.c | 4
-rw-r--r--  kernel/power/process.c | 121
-rw-r--r--  kernel/power/snapshot.c | 88
-rw-r--r--  kernel/power/swap.c | 15
-rw-r--r--  kernel/power/user.c | 10
-rw-r--r--  kernel/printk.c | 65
-rw-r--r--  kernel/profile.c | 45
-rw-r--r--  kernel/ptrace.c | 9
-rw-r--r--  kernel/rcuclassic.c | 343
-rw-r--r--  kernel/rcupdate.c | 20
-rw-r--r--  kernel/rcupreempt.c | 20
-rw-r--r--  kernel/rcupreempt_trace.c | 7
-rw-r--r--  kernel/rcutorture.c | 2
-rw-r--r--  kernel/relay.c | 182
-rw-r--r--  kernel/res_counter.c | 48
-rw-r--r--  kernel/resource.c | 240
-rw-r--r--  kernel/rtmutex-tester.c | 7
-rw-r--r--  kernel/rtmutex.c | 3
-rw-r--r--  kernel/sched.c | 1034
-rw-r--r--  kernel/sched_clock.c | 224
-rw-r--r--  kernel/sched_debug.c | 2
-rw-r--r--  kernel/sched_fair.c | 302
-rw-r--r--  kernel/sched_features.h | 5
-rw-r--r--  kernel/sched_idletask.c | 6
-rw-r--r--  kernel/sched_rt.c | 168
-rw-r--r--  kernel/sched_stats.h | 97
-rw-r--r--  kernel/semaphore.c | 4
-rw-r--r--  kernel/signal.c | 190
-rw-r--r--  kernel/smp.c | 72
-rw-r--r--  kernel/softirq.c | 157
-rw-r--r--  kernel/softlockup.c | 75
-rw-r--r--  kernel/spinlock.c | 12
-rw-r--r--  kernel/stop_machine.c | 276
-rw-r--r--  kernel/sys.c | 168
-rw-r--r--  kernel/sys_ni.c | 13
-rw-r--r--  kernel/sysctl.c | 350
-rw-r--r--  kernel/sysctl_check.c | 2
-rw-r--r--  kernel/taskstats.c | 6
-rw-r--r--  kernel/time.c | 18
-rw-r--r--  kernel/time/Kconfig | 1
-rw-r--r--  kernel/time/clockevents.c | 15
-rw-r--r--  kernel/time/clocksource.c | 15
-rw-r--r--  kernel/time/jiffies.c | 1
-rw-r--r--  kernel/time/ntp.c | 98
-rw-r--r--  kernel/time/tick-broadcast.c | 115
-rw-r--r--  kernel/time/tick-common.c | 29
-rw-r--r--  kernel/time/tick-internal.h | 13
-rw-r--r--  kernel/time/tick-oneshot.c | 44
-rw-r--r--  kernel/time/tick-sched.c | 167
-rw-r--r--  kernel/time/timekeeping.c | 122
-rw-r--r--  kernel/time/timer_list.c | 28
-rw-r--r--  kernel/timer.c | 12
-rw-r--r--  kernel/trace/Kconfig | 64
-rw-r--r--  kernel/trace/Makefile | 4
-rw-r--r--  kernel/trace/ftrace.c | 281
-rw-r--r--  kernel/trace/ring_buffer.c | 2014
-rw-r--r--  kernel/trace/trace.c | 1849
-rw-r--r--  kernel/trace/trace.h | 211
-rw-r--r--  kernel/trace/trace_boot.c | 126
-rw-r--r--  kernel/trace/trace_functions.c | 2
-rw-r--r--  kernel/trace/trace_irqsoff.c | 27
-rw-r--r--  kernel/trace/trace_mmiotrace.c | 116
-rw-r--r--  kernel/trace/trace_nop.c | 64
-rw-r--r--  kernel/trace/trace_sched_switch.c | 137
-rw-r--r--  kernel/trace/trace_sched_wakeup.c | 175
-rw-r--r--  kernel/trace/trace_selftest.c | 83
-rw-r--r--  kernel/trace/trace_stack.c | 310
-rw-r--r--  kernel/trace/trace_sysprof.c | 6
-rw-r--r--  kernel/tracepoint.c | 477
-rw-r--r--  kernel/tsacct.c | 33
-rw-r--r--  kernel/user.c | 4
-rw-r--r--  kernel/user_namespace.c | 1
-rw-r--r--  kernel/utsname.c | 1
-rw-r--r--  kernel/utsname_sysctl.c | 6
-rw-r--r--  kernel/wait.c | 14
-rw-r--r--  kernel/workqueue.c | 191
138 files changed, 12801 insertions, 6111 deletions
diff --git a/kernel/Kconfig.freezer b/kernel/Kconfig.freezer
new file mode 100644
index 000000000000..a3bb4cb52539
--- /dev/null
+++ b/kernel/Kconfig.freezer
@@ -0,0 +1,2 @@
1config FREEZER
2 def_bool PM_SLEEP || CGROUP_FREEZER
diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz
index 526128a2e622..94fabd534b03 100644
--- a/kernel/Kconfig.hz
+++ b/kernel/Kconfig.hz
@@ -55,4 +55,4 @@ config HZ
55 default 1000 if HZ_1000 55 default 1000 if HZ_1000
56 56
57config SCHED_HRTICK 57config SCHED_HRTICK
58 def_bool HIGH_RES_TIMERS && X86 58 def_bool HIGH_RES_TIMERS && (!SMP || USE_GENERIC_SMP_HELPERS)
diff --git a/kernel/Makefile b/kernel/Makefile
index 985ddb7da4d0..305f11dbef21 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -2,7 +2,7 @@
2# Makefile for the linux kernel. 2# Makefile for the linux kernel.
3# 3#
4 4
5obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \ 5obj-y = sched.o fork.o exec_domain.o panic.o printk.o \
6 cpu.o exit.o itimer.o time.o softirq.o resource.o \ 6 cpu.o exit.o itimer.o time.o softirq.o resource.o \
7 sysctl.o capability.o ptrace.o timer.o user.o \ 7 sysctl.o capability.o ptrace.o timer.o user.o \
8 signal.o sys.o kmod.o workqueue.o pid.o \ 8 signal.o sys.o kmod.o workqueue.o pid.o \
@@ -11,6 +11,8 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \
11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \ 11 hrtimer.o rwsem.o nsproxy.o srcu.o semaphore.o \
12 notifier.o ksysfs.o pm_qos_params.o sched_clock.o 12 notifier.o ksysfs.o pm_qos_params.o sched_clock.o
13 13
14CFLAGS_REMOVE_sched.o = -mno-spe
15
14ifdef CONFIG_FTRACE 16ifdef CONFIG_FTRACE
15# Do not trace debug files and internal ftrace files 17# Do not trace debug files and internal ftrace files
16CFLAGS_REMOVE_lockdep.o = -pg 18CFLAGS_REMOVE_lockdep.o = -pg
@@ -22,6 +24,8 @@ CFLAGS_REMOVE_sched_clock.o = -pg
22CFLAGS_REMOVE_sched.o = -mno-spe -pg 24CFLAGS_REMOVE_sched.o = -mno-spe -pg
23endif 25endif
24 26
27obj-$(CONFIG_FREEZER) += freezer.o
28obj-$(CONFIG_PROFILING) += profile.o
25obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o 29obj-$(CONFIG_SYSCTL_SYSCALL_CHECK) += sysctl_check.o
26obj-$(CONFIG_STACKTRACE) += stacktrace.o 30obj-$(CONFIG_STACKTRACE) += stacktrace.o
27obj-y += time/ 31obj-y += time/
@@ -52,6 +56,7 @@ obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o
52obj-$(CONFIG_COMPAT) += compat.o 56obj-$(CONFIG_COMPAT) += compat.o
53obj-$(CONFIG_CGROUPS) += cgroup.o 57obj-$(CONFIG_CGROUPS) += cgroup.o
54obj-$(CONFIG_CGROUP_DEBUG) += cgroup_debug.o 58obj-$(CONFIG_CGROUP_DEBUG) += cgroup_debug.o
59obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o
55obj-$(CONFIG_CPUSETS) += cpuset.o 60obj-$(CONFIG_CPUSETS) += cpuset.o
56obj-$(CONFIG_CGROUP_NS) += ns_cgroup.o 61obj-$(CONFIG_CGROUP_NS) += ns_cgroup.o
57obj-$(CONFIG_UTS_NS) += utsname.o 62obj-$(CONFIG_UTS_NS) += utsname.o
@@ -80,7 +85,9 @@ obj-$(CONFIG_SYSCTL) += utsname_sysctl.o
80obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o 85obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
81obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o 86obj-$(CONFIG_TASKSTATS) += taskstats.o tsacct.o
82obj-$(CONFIG_MARKERS) += marker.o 87obj-$(CONFIG_MARKERS) += marker.o
88obj-$(CONFIG_TRACEPOINTS) += tracepoint.o
83obj-$(CONFIG_LATENCYTOP) += latencytop.o 89obj-$(CONFIG_LATENCYTOP) += latencytop.o
90obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += dma-coherent.o
84obj-$(CONFIG_FTRACE) += trace/ 91obj-$(CONFIG_FTRACE) += trace/
85obj-$(CONFIG_TRACING) += trace/ 92obj-$(CONFIG_TRACING) += trace/
86obj-$(CONFIG_SMP) += sched_cpupri.o 93obj-$(CONFIG_SMP) += sched_cpupri.o
diff --git a/kernel/acct.c b/kernel/acct.c
index 91e1cfd734d2..f6006a60df5d 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -75,37 +75,39 @@ int acct_parm[3] = {4, 2, 30};
75/* 75/*
76 * External references and all of the globals. 76 * External references and all of the globals.
77 */ 77 */
78static void do_acct_process(struct pid_namespace *ns, struct file *); 78static void do_acct_process(struct bsd_acct_struct *acct,
79 struct pid_namespace *ns, struct file *);
79 80
80/* 81/*
81 * This structure is used so that all the data protected by lock 82 * This structure is used so that all the data protected by lock
82 * can be placed in the same cache line as the lock. This primes 83 * can be placed in the same cache line as the lock. This primes
83 * the cache line to have the data after getting the lock. 84 * the cache line to have the data after getting the lock.
84 */ 85 */
85struct acct_glbs { 86struct bsd_acct_struct {
86 spinlock_t lock;
87 volatile int active; 87 volatile int active;
88 volatile int needcheck; 88 volatile int needcheck;
89 struct file *file; 89 struct file *file;
90 struct pid_namespace *ns; 90 struct pid_namespace *ns;
91 struct timer_list timer; 91 struct timer_list timer;
92 struct list_head list;
92}; 93};
93 94
94static struct acct_glbs acct_globals __cacheline_aligned = 95static DEFINE_SPINLOCK(acct_lock);
95 {__SPIN_LOCK_UNLOCKED(acct_globals.lock)}; 96static LIST_HEAD(acct_list);
96 97
97/* 98/*
98 * Called whenever the timer says to check the free space. 99 * Called whenever the timer says to check the free space.
99 */ 100 */
100static void acct_timeout(unsigned long unused) 101static void acct_timeout(unsigned long x)
101{ 102{
102 acct_globals.needcheck = 1; 103 struct bsd_acct_struct *acct = (struct bsd_acct_struct *)x;
104 acct->needcheck = 1;
103} 105}
104 106
105/* 107/*
106 * Check the amount of free space and suspend/resume accordingly. 108 * Check the amount of free space and suspend/resume accordingly.
107 */ 109 */
108static int check_free_space(struct file *file) 110static int check_free_space(struct bsd_acct_struct *acct, struct file *file)
109{ 111{
110 struct kstatfs sbuf; 112 struct kstatfs sbuf;
111 int res; 113 int res;
@@ -113,11 +115,11 @@ static int check_free_space(struct file *file)
113 sector_t resume; 115 sector_t resume;
114 sector_t suspend; 116 sector_t suspend;
115 117
116 spin_lock(&acct_globals.lock); 118 spin_lock(&acct_lock);
117 res = acct_globals.active; 119 res = acct->active;
118 if (!file || !acct_globals.needcheck) 120 if (!file || !acct->needcheck)
119 goto out; 121 goto out;
120 spin_unlock(&acct_globals.lock); 122 spin_unlock(&acct_lock);
121 123
122 /* May block */ 124 /* May block */
123 if (vfs_statfs(file->f_path.dentry, &sbuf)) 125 if (vfs_statfs(file->f_path.dentry, &sbuf))
@@ -136,35 +138,35 @@ static int check_free_space(struct file *file)
136 act = 0; 138 act = 0;
137 139
138 /* 140 /*
139 * If some joker switched acct_globals.file under us we'ld better be 141 * If some joker switched acct->file under us we'ld better be
140 * silent and _not_ touch anything. 142 * silent and _not_ touch anything.
141 */ 143 */
142 spin_lock(&acct_globals.lock); 144 spin_lock(&acct_lock);
143 if (file != acct_globals.file) { 145 if (file != acct->file) {
144 if (act) 146 if (act)
145 res = act>0; 147 res = act>0;
146 goto out; 148 goto out;
147 } 149 }
148 150
149 if (acct_globals.active) { 151 if (acct->active) {
150 if (act < 0) { 152 if (act < 0) {
151 acct_globals.active = 0; 153 acct->active = 0;
152 printk(KERN_INFO "Process accounting paused\n"); 154 printk(KERN_INFO "Process accounting paused\n");
153 } 155 }
154 } else { 156 } else {
155 if (act > 0) { 157 if (act > 0) {
156 acct_globals.active = 1; 158 acct->active = 1;
157 printk(KERN_INFO "Process accounting resumed\n"); 159 printk(KERN_INFO "Process accounting resumed\n");
158 } 160 }
159 } 161 }
160 162
161 del_timer(&acct_globals.timer); 163 del_timer(&acct->timer);
162 acct_globals.needcheck = 0; 164 acct->needcheck = 0;
163 acct_globals.timer.expires = jiffies + ACCT_TIMEOUT*HZ; 165 acct->timer.expires = jiffies + ACCT_TIMEOUT*HZ;
164 add_timer(&acct_globals.timer); 166 add_timer(&acct->timer);
165 res = acct_globals.active; 167 res = acct->active;
166out: 168out:
167 spin_unlock(&acct_globals.lock); 169 spin_unlock(&acct_lock);
168 return res; 170 return res;
169} 171}
170 172
@@ -172,39 +174,41 @@ out:
172 * Close the old accounting file (if currently open) and then replace 174 * Close the old accounting file (if currently open) and then replace
173 * it with file (if non-NULL). 175 * it with file (if non-NULL).
174 * 176 *
175 * NOTE: acct_globals.lock MUST be held on entry and exit. 177 * NOTE: acct_lock MUST be held on entry and exit.
176 */ 178 */
177static void acct_file_reopen(struct file *file) 179static void acct_file_reopen(struct bsd_acct_struct *acct, struct file *file,
180 struct pid_namespace *ns)
178{ 181{
179 struct file *old_acct = NULL; 182 struct file *old_acct = NULL;
180 struct pid_namespace *old_ns = NULL; 183 struct pid_namespace *old_ns = NULL;
181 184
182 if (acct_globals.file) { 185 if (acct->file) {
183 old_acct = acct_globals.file; 186 old_acct = acct->file;
184 old_ns = acct_globals.ns; 187 old_ns = acct->ns;
185 del_timer(&acct_globals.timer); 188 del_timer(&acct->timer);
186 acct_globals.active = 0; 189 acct->active = 0;
187 acct_globals.needcheck = 0; 190 acct->needcheck = 0;
188 acct_globals.file = NULL; 191 acct->file = NULL;
192 acct->ns = NULL;
193 list_del(&acct->list);
189 } 194 }
190 if (file) { 195 if (file) {
191 acct_globals.file = file; 196 acct->file = file;
192 acct_globals.ns = get_pid_ns(task_active_pid_ns(current)); 197 acct->ns = ns;
193 acct_globals.needcheck = 0; 198 acct->needcheck = 0;
194 acct_globals.active = 1; 199 acct->active = 1;
200 list_add(&acct->list, &acct_list);
195 /* It's been deleted if it was used before so this is safe */ 201 /* It's been deleted if it was used before so this is safe */
196 init_timer(&acct_globals.timer); 202 setup_timer(&acct->timer, acct_timeout, (unsigned long)acct);
197 acct_globals.timer.function = acct_timeout; 203 acct->timer.expires = jiffies + ACCT_TIMEOUT*HZ;
198 acct_globals.timer.expires = jiffies + ACCT_TIMEOUT*HZ; 204 add_timer(&acct->timer);
199 add_timer(&acct_globals.timer);
200 } 205 }
201 if (old_acct) { 206 if (old_acct) {
202 mnt_unpin(old_acct->f_path.mnt); 207 mnt_unpin(old_acct->f_path.mnt);
203 spin_unlock(&acct_globals.lock); 208 spin_unlock(&acct_lock);
204 do_acct_process(old_ns, old_acct); 209 do_acct_process(acct, old_ns, old_acct);
205 filp_close(old_acct, NULL); 210 filp_close(old_acct, NULL);
206 put_pid_ns(old_ns); 211 spin_lock(&acct_lock);
207 spin_lock(&acct_globals.lock);
208 } 212 }
209} 213}
210 214
@@ -212,6 +216,8 @@ static int acct_on(char *name)
212{ 216{
213 struct file *file; 217 struct file *file;
214 int error; 218 int error;
219 struct pid_namespace *ns;
220 struct bsd_acct_struct *acct = NULL;
215 221
216 /* Difference from BSD - they don't do O_APPEND */ 222 /* Difference from BSD - they don't do O_APPEND */
217 file = filp_open(name, O_WRONLY|O_APPEND|O_LARGEFILE, 0); 223 file = filp_open(name, O_WRONLY|O_APPEND|O_LARGEFILE, 0);
@@ -228,18 +234,34 @@ static int acct_on(char *name)
228 return -EIO; 234 return -EIO;
229 } 235 }
230 236
237 ns = task_active_pid_ns(current);
238 if (ns->bacct == NULL) {
239 acct = kzalloc(sizeof(struct bsd_acct_struct), GFP_KERNEL);
240 if (acct == NULL) {
241 filp_close(file, NULL);
242 return -ENOMEM;
243 }
244 }
245
231 error = security_acct(file); 246 error = security_acct(file);
232 if (error) { 247 if (error) {
248 kfree(acct);
233 filp_close(file, NULL); 249 filp_close(file, NULL);
234 return error; 250 return error;
235 } 251 }
236 252
237 spin_lock(&acct_globals.lock); 253 spin_lock(&acct_lock);
254 if (ns->bacct == NULL) {
255 ns->bacct = acct;
256 acct = NULL;
257 }
258
238 mnt_pin(file->f_path.mnt); 259 mnt_pin(file->f_path.mnt);
239 acct_file_reopen(file); 260 acct_file_reopen(ns->bacct, file, ns);
240 spin_unlock(&acct_globals.lock); 261 spin_unlock(&acct_lock);
241 262
242 mntput(file->f_path.mnt); /* it's pinned, now give up active reference */ 263 mntput(file->f_path.mnt); /* it's pinned, now give up active reference */
264 kfree(acct);
243 265
244 return 0; 266 return 0;
245} 267}
@@ -269,11 +291,17 @@ asmlinkage long sys_acct(const char __user *name)
269 error = acct_on(tmp); 291 error = acct_on(tmp);
270 putname(tmp); 292 putname(tmp);
271 } else { 293 } else {
294 struct bsd_acct_struct *acct;
295
296 acct = task_active_pid_ns(current)->bacct;
297 if (acct == NULL)
298 return 0;
299
272 error = security_acct(NULL); 300 error = security_acct(NULL);
273 if (!error) { 301 if (!error) {
274 spin_lock(&acct_globals.lock); 302 spin_lock(&acct_lock);
275 acct_file_reopen(NULL); 303 acct_file_reopen(acct, NULL, NULL);
276 spin_unlock(&acct_globals.lock); 304 spin_unlock(&acct_lock);
277 } 305 }
278 } 306 }
279 return error; 307 return error;
@@ -288,10 +316,16 @@ asmlinkage long sys_acct(const char __user *name)
288 */ 316 */
289void acct_auto_close_mnt(struct vfsmount *m) 317void acct_auto_close_mnt(struct vfsmount *m)
290{ 318{
291 spin_lock(&acct_globals.lock); 319 struct bsd_acct_struct *acct;
292 if (acct_globals.file && acct_globals.file->f_path.mnt == m) 320
293 acct_file_reopen(NULL); 321 spin_lock(&acct_lock);
294 spin_unlock(&acct_globals.lock); 322restart:
323 list_for_each_entry(acct, &acct_list, list)
324 if (acct->file && acct->file->f_path.mnt == m) {
325 acct_file_reopen(acct, NULL, NULL);
326 goto restart;
327 }
328 spin_unlock(&acct_lock);
295} 329}
296 330
297/** 331/**
@@ -303,12 +337,31 @@ void acct_auto_close_mnt(struct vfsmount *m)
303 */ 337 */
304void acct_auto_close(struct super_block *sb) 338void acct_auto_close(struct super_block *sb)
305{ 339{
306 spin_lock(&acct_globals.lock); 340 struct bsd_acct_struct *acct;
307 if (acct_globals.file && 341
308 acct_globals.file->f_path.mnt->mnt_sb == sb) { 342 spin_lock(&acct_lock);
309 acct_file_reopen(NULL); 343restart:
344 list_for_each_entry(acct, &acct_list, list)
345 if (acct->file && acct->file->f_path.mnt->mnt_sb == sb) {
346 acct_file_reopen(acct, NULL, NULL);
347 goto restart;
348 }
349 spin_unlock(&acct_lock);
350}
351
352void acct_exit_ns(struct pid_namespace *ns)
353{
354 struct bsd_acct_struct *acct;
355
356 spin_lock(&acct_lock);
357 acct = ns->bacct;
358 if (acct != NULL) {
359 if (acct->file != NULL)
360 acct_file_reopen(acct, NULL, NULL);
361
362 kfree(acct);
310 } 363 }
311 spin_unlock(&acct_globals.lock); 364 spin_unlock(&acct_lock);
312} 365}
313 366
314/* 367/*
@@ -425,7 +478,8 @@ static u32 encode_float(u64 value)
425/* 478/*
426 * do_acct_process does all actual work. Caller holds the reference to file. 479 * do_acct_process does all actual work. Caller holds the reference to file.
427 */ 480 */
428static void do_acct_process(struct pid_namespace *ns, struct file *file) 481static void do_acct_process(struct bsd_acct_struct *acct,
482 struct pid_namespace *ns, struct file *file)
429{ 483{
430 struct pacct_struct *pacct = &current->signal->pacct; 484 struct pacct_struct *pacct = &current->signal->pacct;
431 acct_t ac; 485 acct_t ac;
@@ -440,7 +494,7 @@ static void do_acct_process(struct pid_namespace *ns, struct file *file)
440 * First check to see if there is enough free_space to continue 494 * First check to see if there is enough free_space to continue
441 * the process accounting system. 495 * the process accounting system.
442 */ 496 */
443 if (!check_free_space(file)) 497 if (!check_free_space(acct, file))
444 return; 498 return;
445 499
446 /* 500 /*
@@ -494,7 +548,7 @@ static void do_acct_process(struct pid_namespace *ns, struct file *file)
494#endif 548#endif
495 549
496 spin_lock_irq(&current->sighand->siglock); 550 spin_lock_irq(&current->sighand->siglock);
497 tty = current->signal->tty; 551 tty = current->signal->tty; /* Safe as we hold the siglock */
498 ac.ac_tty = tty ? old_encode_dev(tty_devnum(tty)) : 0; 552 ac.ac_tty = tty ? old_encode_dev(tty_devnum(tty)) : 0;
499 ac.ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime))); 553 ac.ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime)));
500 ac.ac_stime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_stime))); 554 ac.ac_stime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_stime)));
@@ -577,34 +631,46 @@ void acct_collect(long exitcode, int group_dead)
577 spin_unlock_irq(&current->sighand->siglock); 631 spin_unlock_irq(&current->sighand->siglock);
578} 632}
579 633
580/** 634static void acct_process_in_ns(struct pid_namespace *ns)
581 * acct_process - now just a wrapper around do_acct_process
582 * @exitcode: task exit code
583 *
584 * handles process accounting for an exiting task
585 */
586void acct_process(void)
587{ 635{
588 struct file *file = NULL; 636 struct file *file = NULL;
589 struct pid_namespace *ns; 637 struct bsd_acct_struct *acct;
590 638
639 acct = ns->bacct;
591 /* 640 /*
592 * accelerate the common fastpath: 641 * accelerate the common fastpath:
593 */ 642 */
594 if (!acct_globals.file) 643 if (!acct || !acct->file)
595 return; 644 return;
596 645
597 spin_lock(&acct_globals.lock); 646 spin_lock(&acct_lock);
598 file = acct_globals.file; 647 file = acct->file;
599 if (unlikely(!file)) { 648 if (unlikely(!file)) {
600 spin_unlock(&acct_globals.lock); 649 spin_unlock(&acct_lock);
601 return; 650 return;
602 } 651 }
603 get_file(file); 652 get_file(file);
604 ns = get_pid_ns(acct_globals.ns); 653 spin_unlock(&acct_lock);
605 spin_unlock(&acct_globals.lock);
606 654
607 do_acct_process(ns, file); 655 do_acct_process(acct, ns, file);
608 fput(file); 656 fput(file);
609 put_pid_ns(ns); 657}
658
659/**
660 * acct_process - now just a wrapper around acct_process_in_ns,
661 * which in turn is a wrapper around do_acct_process.
662 *
663 * handles process accounting for an exiting task
664 */
665void acct_process(void)
666{
667 struct pid_namespace *ns;
668
669 /*
670 * This loop is safe lockless, since current is still
671 * alive and holds its namespace, which in turn holds
672 * its parent.
673 */
674 for (ns = task_active_pid_ns(current); ns != NULL; ns = ns->parent)
675 acct_process_in_ns(ns);
610} 676}
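The acct.c hunks above replace the single acct_globals instance, whose timer was wired up by hand with init_timer(), with one bsd_acct_struct per PID namespace armed via setup_timer(). A minimal sketch of that per-object timer idiom follows, for the timer API of this kernel generation (callbacks take an unsigned long cookie); the names foo, foo_timeout and foo_arm, and the 30-second period, are hypothetical, not from the patch:

#include <linux/jiffies.h>
#include <linux/timer.h>

struct foo {
	struct timer_list timer;
	volatile int needcheck;
};

/* Timer callback: recover the owning object from the cookie that was
 * handed to setup_timer(), just as acct_timeout() now does. */
static void foo_timeout(unsigned long data)
{
	struct foo *f = (struct foo *)data;

	f->needcheck = 1;
}

/* Arm (or re-arm) the per-object timer. */
static void foo_arm(struct foo *f)
{
	setup_timer(&f->timer, foo_timeout, (unsigned long)f);
	f->timer.expires = jiffies + 30 * HZ;
	add_timer(&f->timer);
}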
diff --git a/kernel/audit.c b/kernel/audit.c
index e092f1c0ce30..4414e93d8750 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -707,12 +707,14 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
707 if (status_get->mask & AUDIT_STATUS_ENABLED) { 707 if (status_get->mask & AUDIT_STATUS_ENABLED) {
708 err = audit_set_enabled(status_get->enabled, 708 err = audit_set_enabled(status_get->enabled,
709 loginuid, sessionid, sid); 709 loginuid, sessionid, sid);
710 if (err < 0) return err; 710 if (err < 0)
711 return err;
711 } 712 }
712 if (status_get->mask & AUDIT_STATUS_FAILURE) { 713 if (status_get->mask & AUDIT_STATUS_FAILURE) {
713 err = audit_set_failure(status_get->failure, 714 err = audit_set_failure(status_get->failure,
714 loginuid, sessionid, sid); 715 loginuid, sessionid, sid);
715 if (err < 0) return err; 716 if (err < 0)
717 return err;
716 } 718 }
717 if (status_get->mask & AUDIT_STATUS_PID) { 719 if (status_get->mask & AUDIT_STATUS_PID) {
718 int new_pid = status_get->pid; 720 int new_pid = status_get->pid;
@@ -725,9 +727,12 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
725 audit_pid = new_pid; 727 audit_pid = new_pid;
726 audit_nlk_pid = NETLINK_CB(skb).pid; 728 audit_nlk_pid = NETLINK_CB(skb).pid;
727 } 729 }
728 if (status_get->mask & AUDIT_STATUS_RATE_LIMIT) 730 if (status_get->mask & AUDIT_STATUS_RATE_LIMIT) {
729 err = audit_set_rate_limit(status_get->rate_limit, 731 err = audit_set_rate_limit(status_get->rate_limit,
730 loginuid, sessionid, sid); 732 loginuid, sessionid, sid);
733 if (err < 0)
734 return err;
735 }
731 if (status_get->mask & AUDIT_STATUS_BACKLOG_LIMIT) 736 if (status_get->mask & AUDIT_STATUS_BACKLOG_LIMIT)
732 err = audit_set_backlog_limit(status_get->backlog_limit, 737 err = audit_set_backlog_limit(status_get->backlog_limit,
733 loginuid, sessionid, sid); 738 loginuid, sessionid, sid);
@@ -1366,7 +1371,7 @@ int audit_string_contains_control(const char *string, size_t len)
1366{ 1371{
1367 const unsigned char *p; 1372 const unsigned char *p;
1368 for (p = string; p < (const unsigned char *)string + len && *p; p++) { 1373 for (p = string; p < (const unsigned char *)string + len && *p; p++) {
1369 if (*p == '"' || *p < 0x21 || *p > 0x7f) 1374 if (*p == '"' || *p < 0x21 || *p > 0x7e)
1370 return 1; 1375 return 1;
1371 } 1376 }
1372 return 0; 1377 return 0;
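The audit.c hunk above tightens the printable-character test from > 0x7f to > 0x7e, so that DEL (0x7f) is now treated as a control character. A tiny sketch of the resulting predicate, under a hypothetical helper name:

/* Non-zero if the byte must be hex-encoded in an audit record: a double
 * quote, or anything outside printable ASCII 0x21..0x7e (a range that
 * excludes DEL, 0x7f). */
static int audit_char_needs_encoding(unsigned char c)
{
	return c == '"' || c < 0x21 || c > 0x7e;
}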
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index f7921a2ecf16..8ba0e0d934f2 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -532,7 +532,7 @@ void audit_trim_trees(void)
532 list_add(&cursor, &tree_list); 532 list_add(&cursor, &tree_list);
533 while (cursor.next != &tree_list) { 533 while (cursor.next != &tree_list) {
534 struct audit_tree *tree; 534 struct audit_tree *tree;
535 struct nameidata nd; 535 struct path path;
536 struct vfsmount *root_mnt; 536 struct vfsmount *root_mnt;
537 struct node *node; 537 struct node *node;
538 struct list_head list; 538 struct list_head list;
@@ -544,12 +544,12 @@ void audit_trim_trees(void)
544 list_add(&cursor, &tree->list); 544 list_add(&cursor, &tree->list);
545 mutex_unlock(&audit_filter_mutex); 545 mutex_unlock(&audit_filter_mutex);
546 546
547 err = path_lookup(tree->pathname, 0, &nd); 547 err = kern_path(tree->pathname, 0, &path);
548 if (err) 548 if (err)
549 goto skip_it; 549 goto skip_it;
550 550
551 root_mnt = collect_mounts(nd.path.mnt, nd.path.dentry); 551 root_mnt = collect_mounts(path.mnt, path.dentry);
552 path_put(&nd.path); 552 path_put(&path);
553 if (!root_mnt) 553 if (!root_mnt)
554 goto skip_it; 554 goto skip_it;
555 555
@@ -580,19 +580,19 @@ skip_it:
580} 580}
581 581
582static int is_under(struct vfsmount *mnt, struct dentry *dentry, 582static int is_under(struct vfsmount *mnt, struct dentry *dentry,
583 struct nameidata *nd) 583 struct path *path)
584{ 584{
585 if (mnt != nd->path.mnt) { 585 if (mnt != path->mnt) {
586 for (;;) { 586 for (;;) {
587 if (mnt->mnt_parent == mnt) 587 if (mnt->mnt_parent == mnt)
588 return 0; 588 return 0;
589 if (mnt->mnt_parent == nd->path.mnt) 589 if (mnt->mnt_parent == path->mnt)
590 break; 590 break;
591 mnt = mnt->mnt_parent; 591 mnt = mnt->mnt_parent;
592 } 592 }
593 dentry = mnt->mnt_mountpoint; 593 dentry = mnt->mnt_mountpoint;
594 } 594 }
595 return is_subdir(dentry, nd->path.dentry); 595 return is_subdir(dentry, path->dentry);
596} 596}
597 597
598int audit_make_tree(struct audit_krule *rule, char *pathname, u32 op) 598int audit_make_tree(struct audit_krule *rule, char *pathname, u32 op)
@@ -618,7 +618,7 @@ void audit_put_tree(struct audit_tree *tree)
618int audit_add_tree_rule(struct audit_krule *rule) 618int audit_add_tree_rule(struct audit_krule *rule)
619{ 619{
620 struct audit_tree *seed = rule->tree, *tree; 620 struct audit_tree *seed = rule->tree, *tree;
621 struct nameidata nd; 621 struct path path;
622 struct vfsmount *mnt, *p; 622 struct vfsmount *mnt, *p;
623 struct list_head list; 623 struct list_head list;
624 int err; 624 int err;
@@ -637,11 +637,11 @@ int audit_add_tree_rule(struct audit_krule *rule)
637 /* do not set rule->tree yet */ 637 /* do not set rule->tree yet */
638 mutex_unlock(&audit_filter_mutex); 638 mutex_unlock(&audit_filter_mutex);
639 639
640 err = path_lookup(tree->pathname, 0, &nd); 640 err = kern_path(tree->pathname, 0, &path);
641 if (err) 641 if (err)
642 goto Err; 642 goto Err;
643 mnt = collect_mounts(nd.path.mnt, nd.path.dentry); 643 mnt = collect_mounts(path.mnt, path.dentry);
644 path_put(&nd.path); 644 path_put(&path);
645 if (!mnt) { 645 if (!mnt) {
646 err = -ENOMEM; 646 err = -ENOMEM;
647 goto Err; 647 goto Err;
@@ -690,29 +690,29 @@ int audit_tag_tree(char *old, char *new)
690{ 690{
691 struct list_head cursor, barrier; 691 struct list_head cursor, barrier;
692 int failed = 0; 692 int failed = 0;
693 struct nameidata nd; 693 struct path path;
694 struct vfsmount *tagged; 694 struct vfsmount *tagged;
695 struct list_head list; 695 struct list_head list;
696 struct vfsmount *mnt; 696 struct vfsmount *mnt;
697 struct dentry *dentry; 697 struct dentry *dentry;
698 int err; 698 int err;
699 699
700 err = path_lookup(new, 0, &nd); 700 err = kern_path(new, 0, &path);
701 if (err) 701 if (err)
702 return err; 702 return err;
703 tagged = collect_mounts(nd.path.mnt, nd.path.dentry); 703 tagged = collect_mounts(path.mnt, path.dentry);
704 path_put(&nd.path); 704 path_put(&path);
705 if (!tagged) 705 if (!tagged)
706 return -ENOMEM; 706 return -ENOMEM;
707 707
708 err = path_lookup(old, 0, &nd); 708 err = kern_path(old, 0, &path);
709 if (err) { 709 if (err) {
710 drop_collected_mounts(tagged); 710 drop_collected_mounts(tagged);
711 return err; 711 return err;
712 } 712 }
713 mnt = mntget(nd.path.mnt); 713 mnt = mntget(path.mnt);
714 dentry = dget(nd.path.dentry); 714 dentry = dget(path.dentry);
715 path_put(&nd.path); 715 path_put(&path);
716 716
717 if (dentry == tagged->mnt_root && dentry == mnt->mnt_root) 717 if (dentry == tagged->mnt_root && dentry == mnt->mnt_root)
718 follow_up(&mnt, &dentry); 718 follow_up(&mnt, &dentry);
@@ -733,7 +733,7 @@ int audit_tag_tree(char *old, char *new)
733 list_add(&cursor, &tree->list); 733 list_add(&cursor, &tree->list);
734 mutex_unlock(&audit_filter_mutex); 734 mutex_unlock(&audit_filter_mutex);
735 735
736 err = path_lookup(tree->pathname, 0, &nd); 736 err = kern_path(tree->pathname, 0, &path);
737 if (err) { 737 if (err) {
738 put_tree(tree); 738 put_tree(tree);
739 mutex_lock(&audit_filter_mutex); 739 mutex_lock(&audit_filter_mutex);
@@ -741,15 +741,15 @@ int audit_tag_tree(char *old, char *new)
741 } 741 }
742 742
743 spin_lock(&vfsmount_lock); 743 spin_lock(&vfsmount_lock);
744 if (!is_under(mnt, dentry, &nd)) { 744 if (!is_under(mnt, dentry, &path)) {
745 spin_unlock(&vfsmount_lock); 745 spin_unlock(&vfsmount_lock);
746 path_put(&nd.path); 746 path_put(&path);
747 put_tree(tree); 747 put_tree(tree);
748 mutex_lock(&audit_filter_mutex); 748 mutex_lock(&audit_filter_mutex);
749 continue; 749 continue;
750 } 750 }
751 spin_unlock(&vfsmount_lock); 751 spin_unlock(&vfsmount_lock);
752 path_put(&nd.path); 752 path_put(&path);
753 753
754 list_for_each_entry(p, &list, mnt_list) { 754 list_for_each_entry(p, &list, mnt_list) {
755 failed = tag_chunk(p->mnt_root->d_inode, tree); 755 failed = tag_chunk(p->mnt_root->d_inode, tree);
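The audit_tree.c changes above retire struct nameidata and path_lookup() in favour of kern_path() and a bare struct path. A minimal sketch of the lookup-and-collect pattern as the hunks use it; the wrapper name is hypothetical and error handling is trimmed:

#include <linux/err.h>
#include <linux/mount.h>
#include <linux/namei.h>

static struct vfsmount *collect_tree_mounts(const char *pathname)
{
	struct path path;
	struct vfsmount *mnt;
	int err;

	err = kern_path(pathname, 0, &path);
	if (err)
		return ERR_PTR(err);

	mnt = collect_mounts(path.mnt, path.dentry);
	/* collect_mounts() takes its own references, so the lookup
	 * reference can be dropped right away, as the code above does. */
	path_put(&path);

	return mnt ? mnt : ERR_PTR(-ENOMEM);
}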
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index 98c50cc671bb..b7d354e2b0ef 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -1022,8 +1022,11 @@ static void audit_update_watch(struct audit_parent *parent,
1022 struct audit_buffer *ab; 1022 struct audit_buffer *ab;
1023 ab = audit_log_start(NULL, GFP_KERNEL, 1023 ab = audit_log_start(NULL, GFP_KERNEL,
1024 AUDIT_CONFIG_CHANGE); 1024 AUDIT_CONFIG_CHANGE);
1025 audit_log_format(ab, "auid=%u ses=%u",
1026 audit_get_loginuid(current),
1027 audit_get_sessionid(current));
1025 audit_log_format(ab, 1028 audit_log_format(ab,
1026 "op=updated rules specifying path="); 1029 " op=updated rules specifying path=");
1027 audit_log_untrustedstring(ab, owatch->path); 1030 audit_log_untrustedstring(ab, owatch->path);
1028 audit_log_format(ab, " with dev=%u ino=%lu\n", 1031 audit_log_format(ab, " with dev=%u ino=%lu\n",
1029 dev, ino); 1032 dev, ino);
@@ -1058,7 +1061,10 @@ static void audit_remove_parent_watches(struct audit_parent *parent)
1058 struct audit_buffer *ab; 1061 struct audit_buffer *ab;
1059 ab = audit_log_start(NULL, GFP_KERNEL, 1062 ab = audit_log_start(NULL, GFP_KERNEL,
1060 AUDIT_CONFIG_CHANGE); 1063 AUDIT_CONFIG_CHANGE);
1061 audit_log_format(ab, "op=remove rule path="); 1064 audit_log_format(ab, "auid=%u ses=%u",
1065 audit_get_loginuid(current),
1066 audit_get_sessionid(current));
1067 audit_log_format(ab, " op=remove rule path=");
1062 audit_log_untrustedstring(ab, w->path); 1068 audit_log_untrustedstring(ab, w->path);
1063 if (r->filterkey) { 1069 if (r->filterkey) {
1064 audit_log_format(ab, " key="); 1070 audit_log_format(ab, " key=");
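The auditfilter.c hunks above prepend the acting login uid and session id to the watch-update and rule-removal records, building one record with successive audit_log_format() calls on the same buffer. A short sketch of that record-building pattern; the helper name and the op= value are illustrative only:

#include <linux/audit.h>
#include <linux/gfp.h>
#include <linux/sched.h>

static void foo_log_config_change(void)
{
	struct audit_buffer *ab;

	ab = audit_log_start(NULL, GFP_KERNEL, AUDIT_CONFIG_CHANGE);
	if (!ab)
		return;		/* auditing disabled or allocation failed */
	audit_log_format(ab, "auid=%u ses=%u",
			 audit_get_loginuid(current),
			 audit_get_sessionid(current));
	audit_log_format(ab, " op=example-change");	/* illustrative op value */
	audit_log_end(ab);	/* emit the assembled record */
}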
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index c10e7aae04d7..cf5bc2f5f9c3 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -243,7 +243,11 @@ static inline int open_arg(int flags, int mask)
243 243
244static int audit_match_perm(struct audit_context *ctx, int mask) 244static int audit_match_perm(struct audit_context *ctx, int mask)
245{ 245{
246 unsigned n = ctx->major; 246 unsigned n;
247 if (unlikely(!ctx))
248 return 0;
249 n = ctx->major;
250
247 switch (audit_classify_syscall(ctx->arch, n)) { 251 switch (audit_classify_syscall(ctx->arch, n)) {
248 case 0: /* native */ 252 case 0: /* native */
249 if ((mask & AUDIT_PERM_WRITE) && 253 if ((mask & AUDIT_PERM_WRITE) &&
@@ -284,6 +288,10 @@ static int audit_match_filetype(struct audit_context *ctx, int which)
284{ 288{
285 unsigned index = which & ~S_IFMT; 289 unsigned index = which & ~S_IFMT;
286 mode_t mode = which & S_IFMT; 290 mode_t mode = which & S_IFMT;
291
292 if (unlikely(!ctx))
293 return 0;
294
287 if (index >= ctx->name_count) 295 if (index >= ctx->name_count)
288 return 0; 296 return 0;
289 if (ctx->names[index].ino == -1) 297 if (ctx->names[index].ino == -1)
@@ -610,7 +618,7 @@ static int audit_filter_rules(struct task_struct *tsk,
610 if (!result) 618 if (!result)
611 return 0; 619 return 0;
612 } 620 }
613 if (rule->filterkey) 621 if (rule->filterkey && ctx)
614 ctx->filterkey = kstrdup(rule->filterkey, GFP_ATOMIC); 622 ctx->filterkey = kstrdup(rule->filterkey, GFP_ATOMIC);
615 switch (rule->action) { 623 switch (rule->action) {
616 case AUDIT_NEVER: *state = AUDIT_DISABLED; break; 624 case AUDIT_NEVER: *state = AUDIT_DISABLED; break;
@@ -1196,13 +1204,13 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
1196 (context->return_valid==AUDITSC_SUCCESS)?"yes":"no", 1204 (context->return_valid==AUDITSC_SUCCESS)?"yes":"no",
1197 context->return_code); 1205 context->return_code);
1198 1206
1199 mutex_lock(&tty_mutex); 1207 spin_lock_irq(&tsk->sighand->siglock);
1200 read_lock(&tasklist_lock);
1201 if (tsk->signal && tsk->signal->tty && tsk->signal->tty->name) 1208 if (tsk->signal && tsk->signal->tty && tsk->signal->tty->name)
1202 tty = tsk->signal->tty->name; 1209 tty = tsk->signal->tty->name;
1203 else 1210 else
1204 tty = "(none)"; 1211 tty = "(none)";
1205 read_unlock(&tasklist_lock); 1212 spin_unlock_irq(&tsk->sighand->siglock);
1213
1206 audit_log_format(ab, 1214 audit_log_format(ab,
1207 " a0=%lx a1=%lx a2=%lx a3=%lx items=%d" 1215 " a0=%lx a1=%lx a2=%lx a3=%lx items=%d"
1208 " ppid=%d pid=%d auid=%u uid=%u gid=%u" 1216 " ppid=%d pid=%d auid=%u uid=%u gid=%u"
@@ -1222,7 +1230,6 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
1222 context->egid, context->sgid, context->fsgid, tty, 1230 context->egid, context->sgid, context->fsgid, tty,
1223 tsk->sessionid); 1231 tsk->sessionid);
1224 1232
1225 mutex_unlock(&tty_mutex);
1226 1233
1227 audit_log_task_info(ab, tsk); 1234 audit_log_task_info(ab, tsk);
1228 if (context->filterkey) { 1235 if (context->filterkey) {
@@ -1476,7 +1483,8 @@ void audit_syscall_entry(int arch, int major,
1476 struct audit_context *context = tsk->audit_context; 1483 struct audit_context *context = tsk->audit_context;
1477 enum audit_state state; 1484 enum audit_state state;
1478 1485
1479 BUG_ON(!context); 1486 if (unlikely(!context))
1487 return;
1480 1488
1481 /* 1489 /*
1482 * This happens only on certain architectures that make system 1490 * This happens only on certain architectures that make system
@@ -2374,7 +2382,7 @@ int __audit_signal_info(int sig, struct task_struct *t)
2374 struct audit_context *ctx = tsk->audit_context; 2382 struct audit_context *ctx = tsk->audit_context;
2375 2383
2376 if (audit_pid && t->tgid == audit_pid) { 2384 if (audit_pid && t->tgid == audit_pid) {
2377 if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1) { 2385 if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1 || sig == SIGUSR2) {
2378 audit_sig_pid = tsk->pid; 2386 audit_sig_pid = tsk->pid;
2379 if (tsk->loginuid != -1) 2387 if (tsk->loginuid != -1)
2380 audit_sig_uid = tsk->loginuid; 2388 audit_sig_uid = tsk->loginuid;
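The auditsc.c hunk above stops taking tty_mutex and tasklist_lock to read a task's controlling tty name and instead holds the task's siglock, which is what keeps signal->tty stable. A small sketch of that access pattern; the helper name is hypothetical, and the string is copied out before the lock is dropped because the tty pointer must not be used afterwards:

#include <linux/sched.h>
#include <linux/string.h>
#include <linux/tty.h>

static void foo_get_tty_name(struct task_struct *tsk, char *buf, size_t len)
{
	const char *tty = "(none)";

	spin_lock_irq(&tsk->sighand->siglock);
	if (tsk->signal && tsk->signal->tty && tsk->signal->tty->name)
		tty = tsk->signal->tty->name;
	strlcpy(buf, tty, len);		/* copy while the name is still pinned */
	spin_unlock_irq(&tsk->sighand->siglock);
}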
diff --git a/kernel/capability.c b/kernel/capability.c
index 901e0fdc3fff..33e51e78c2d8 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -115,11 +115,208 @@ static int cap_validate_magic(cap_user_header_t header, unsigned *tocopy)
115 return 0; 115 return 0;
116} 116}
117 117
118#ifndef CONFIG_SECURITY_FILE_CAPABILITIES
119
120/*
121 * Without filesystem capability support, we nominally support one process
122 * setting the capabilities of another
123 */
124static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp,
125 kernel_cap_t *pIp, kernel_cap_t *pPp)
126{
127 struct task_struct *target;
128 int ret;
129
130 spin_lock(&task_capability_lock);
131 read_lock(&tasklist_lock);
132
133 if (pid && pid != task_pid_vnr(current)) {
134 target = find_task_by_vpid(pid);
135 if (!target) {
136 ret = -ESRCH;
137 goto out;
138 }
139 } else
140 target = current;
141
142 ret = security_capget(target, pEp, pIp, pPp);
143
144out:
145 read_unlock(&tasklist_lock);
146 spin_unlock(&task_capability_lock);
147
148 return ret;
149}
150
151/*
152 * cap_set_pg - set capabilities for all processes in a given process
153 * group. We call this holding task_capability_lock and tasklist_lock.
154 */
155static inline int cap_set_pg(int pgrp_nr, kernel_cap_t *effective,
156 kernel_cap_t *inheritable,
157 kernel_cap_t *permitted)
158{
159 struct task_struct *g, *target;
160 int ret = -EPERM;
161 int found = 0;
162 struct pid *pgrp;
163
164 spin_lock(&task_capability_lock);
165 read_lock(&tasklist_lock);
166
167 pgrp = find_vpid(pgrp_nr);
168 do_each_pid_task(pgrp, PIDTYPE_PGID, g) {
169 target = g;
170 while_each_thread(g, target) {
171 if (!security_capset_check(target, effective,
172 inheritable, permitted)) {
173 security_capset_set(target, effective,
174 inheritable, permitted);
175 ret = 0;
176 }
177 found = 1;
178 }
179 } while_each_pid_task(pgrp, PIDTYPE_PGID, g);
180
181 read_unlock(&tasklist_lock);
182 spin_unlock(&task_capability_lock);
183
184 if (!found)
185 ret = 0;
186 return ret;
187}
188
189/*
190 * cap_set_all - set capabilities for all processes other than init
191 * and self. We call this holding task_capability_lock and tasklist_lock.
192 */
193static inline int cap_set_all(kernel_cap_t *effective,
194 kernel_cap_t *inheritable,
195 kernel_cap_t *permitted)
196{
197 struct task_struct *g, *target;
198 int ret = -EPERM;
199 int found = 0;
200
201 spin_lock(&task_capability_lock);
202 read_lock(&tasklist_lock);
203
204 do_each_thread(g, target) {
205 if (target == current
206 || is_container_init(target->group_leader))
207 continue;
208 found = 1;
209 if (security_capset_check(target, effective, inheritable,
210 permitted))
211 continue;
212 ret = 0;
213 security_capset_set(target, effective, inheritable, permitted);
214 } while_each_thread(g, target);
215
216 read_unlock(&tasklist_lock);
217 spin_unlock(&task_capability_lock);
218
219 if (!found)
220 ret = 0;
221
222 return ret;
223}
224
225/*
226 * Given the target pid does not refer to the current process we
227 * need more elaborate support... (This support is not present when
228 * filesystem capabilities are configured.)
229 */
230static inline int do_sys_capset_other_tasks(pid_t pid, kernel_cap_t *effective,
231 kernel_cap_t *inheritable,
232 kernel_cap_t *permitted)
233{
234 struct task_struct *target;
235 int ret;
236
237 if (!capable(CAP_SETPCAP))
238 return -EPERM;
239
240 if (pid == -1) /* all procs other than current and init */
241 return cap_set_all(effective, inheritable, permitted);
242
243 else if (pid < 0) /* all procs in process group */
244 return cap_set_pg(-pid, effective, inheritable, permitted);
245
246 /* target != current */
247 spin_lock(&task_capability_lock);
248 read_lock(&tasklist_lock);
249
250 target = find_task_by_vpid(pid);
251 if (!target)
252 ret = -ESRCH;
253 else {
254 ret = security_capset_check(target, effective, inheritable,
255 permitted);
256
257 /* having verified that the proposed changes are legal,
258 we now put them into effect. */
259 if (!ret)
260 security_capset_set(target, effective, inheritable,
261 permitted);
262 }
263
264 read_unlock(&tasklist_lock);
265 spin_unlock(&task_capability_lock);
266
267 return ret;
268}
269
270#else /* ie., def CONFIG_SECURITY_FILE_CAPABILITIES */
271
118/* 272/*
119 * For sys_getproccap() and sys_setproccap(), any of the three 273 * If we have configured with filesystem capability support, then the
120 * capability set pointers may be NULL -- indicating that that set is 274 * only thing that can change the capabilities of the current process
121 * uninteresting and/or not to be changed. 275 * is the current process. As such, we can't be in this code at the
276 * same time as we are in the process of setting capabilities in this
277 * process. The net result is that we can limit our use of locks to
278 * when we are reading the caps of another process.
122 */ 279 */
280static inline int cap_get_target_pid(pid_t pid, kernel_cap_t *pEp,
281 kernel_cap_t *pIp, kernel_cap_t *pPp)
282{
283 int ret;
284
285 if (pid && (pid != task_pid_vnr(current))) {
286 struct task_struct *target;
287
288 spin_lock(&task_capability_lock);
289 read_lock(&tasklist_lock);
290
291 target = find_task_by_vpid(pid);
292 if (!target)
293 ret = -ESRCH;
294 else
295 ret = security_capget(target, pEp, pIp, pPp);
296
297 read_unlock(&tasklist_lock);
298 spin_unlock(&task_capability_lock);
299 } else
300 ret = security_capget(current, pEp, pIp, pPp);
301
302 return ret;
303}
304
305/*
306 * With filesystem capability support configured, the kernel does not
307 * permit the changing of capabilities in one process by another
308 * process. (CAP_SETPCAP has much less broad semantics when configured
309 * this way.)
310 */
311static inline int do_sys_capset_other_tasks(pid_t pid,
312 kernel_cap_t *effective,
313 kernel_cap_t *inheritable,
314 kernel_cap_t *permitted)
315{
316 return -EPERM;
317}
318
319#endif /* ie., ndef CONFIG_SECURITY_FILE_CAPABILITIES */
123 320
124/* 321/*
125 * Atomically modify the effective capabilities returning the original 322 * Atomically modify the effective capabilities returning the original
@@ -155,7 +352,6 @@ asmlinkage long sys_capget(cap_user_header_t header, cap_user_data_t dataptr)
155{ 352{
156 int ret = 0; 353 int ret = 0;
157 pid_t pid; 354 pid_t pid;
158 struct task_struct *target;
159 unsigned tocopy; 355 unsigned tocopy;
160 kernel_cap_t pE, pI, pP; 356 kernel_cap_t pE, pI, pP;
161 357
@@ -169,23 +365,7 @@ asmlinkage long sys_capget(cap_user_header_t header, cap_user_data_t dataptr)
169 if (pid < 0) 365 if (pid < 0)
170 return -EINVAL; 366 return -EINVAL;
171 367
172 spin_lock(&task_capability_lock); 368 ret = cap_get_target_pid(pid, &pE, &pI, &pP);
173 read_lock(&tasklist_lock);
174
175 if (pid && pid != task_pid_vnr(current)) {
176 target = find_task_by_vpid(pid);
177 if (!target) {
178 ret = -ESRCH;
179 goto out;
180 }
181 } else
182 target = current;
183
184 ret = security_capget(target, &pE, &pI, &pP);
185
186out:
187 read_unlock(&tasklist_lock);
188 spin_unlock(&task_capability_lock);
189 369
190 if (!ret) { 370 if (!ret) {
191 struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S]; 371 struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S];
@@ -216,7 +396,6 @@ out:
216 * before modification is attempted and the application 396 * before modification is attempted and the application
217 * fails. 397 * fails.
218 */ 398 */
219
220 if (copy_to_user(dataptr, kdata, tocopy 399 if (copy_to_user(dataptr, kdata, tocopy
221 * sizeof(struct __user_cap_data_struct))) { 400 * sizeof(struct __user_cap_data_struct))) {
222 return -EFAULT; 401 return -EFAULT;
@@ -226,70 +405,8 @@ out:
226 return ret; 405 return ret;
227} 406}
228 407
229/*
230 * cap_set_pg - set capabilities for all processes in a given process
231 * group. We call this holding task_capability_lock and tasklist_lock.
232 */
233static inline int cap_set_pg(int pgrp_nr, kernel_cap_t *effective,
234 kernel_cap_t *inheritable,
235 kernel_cap_t *permitted)
236{
237 struct task_struct *g, *target;
238 int ret = -EPERM;
239 int found = 0;
240 struct pid *pgrp;
241
242 pgrp = find_vpid(pgrp_nr);
243 do_each_pid_task(pgrp, PIDTYPE_PGID, g) {
244 target = g;
245 while_each_thread(g, target) {
246 if (!security_capset_check(target, effective,
247 inheritable,
248 permitted)) {
249 security_capset_set(target, effective,
250 inheritable,
251 permitted);
252 ret = 0;
253 }
254 found = 1;
255 }
256 } while_each_pid_task(pgrp, PIDTYPE_PGID, g);
257
258 if (!found)
259 ret = 0;
260 return ret;
261}
262
263/*
264 * cap_set_all - set capabilities for all processes other than init
265 * and self. We call this holding task_capability_lock and tasklist_lock.
266 */
267static inline int cap_set_all(kernel_cap_t *effective,
268 kernel_cap_t *inheritable,
269 kernel_cap_t *permitted)
270{
271 struct task_struct *g, *target;
272 int ret = -EPERM;
273 int found = 0;
274
275 do_each_thread(g, target) {
276 if (target == current || is_container_init(target->group_leader))
277 continue;
278 found = 1;
279 if (security_capset_check(target, effective, inheritable,
280 permitted))
281 continue;
282 ret = 0;
283 security_capset_set(target, effective, inheritable, permitted);
284 } while_each_thread(g, target);
285
286 if (!found)
287 ret = 0;
288 return ret;
289}
290
291/** 408/**
292 * sys_capset - set capabilities for a process or a group of processes 409 * sys_capset - set capabilities for a process or (*) a group of processes
293 * @header: pointer to struct that contains capability version and 410 * @header: pointer to struct that contains capability version and
294 * target pid data 411 * target pid data
295 * @data: pointer to struct that contains the effective, permitted, 412 * @data: pointer to struct that contains the effective, permitted,
@@ -313,7 +430,6 @@ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data)
313 struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S]; 430 struct __user_cap_data_struct kdata[_KERNEL_CAPABILITY_U32S];
314 unsigned i, tocopy; 431 unsigned i, tocopy;
315 kernel_cap_t inheritable, permitted, effective; 432 kernel_cap_t inheritable, permitted, effective;
316 struct task_struct *target;
317 int ret; 433 int ret;
318 pid_t pid; 434 pid_t pid;
319 435
@@ -324,9 +440,6 @@ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data)
324 if (get_user(pid, &header->pid)) 440 if (get_user(pid, &header->pid))
325 return -EFAULT; 441 return -EFAULT;
326 442
327 if (pid && pid != task_pid_vnr(current) && !capable(CAP_SETPCAP))
328 return -EPERM;
329
330 if (copy_from_user(&kdata, data, tocopy 443 if (copy_from_user(&kdata, data, tocopy
331 * sizeof(struct __user_cap_data_struct))) { 444 * sizeof(struct __user_cap_data_struct))) {
332 return -EFAULT; 445 return -EFAULT;
@@ -344,55 +457,51 @@ asmlinkage long sys_capset(cap_user_header_t header, const cap_user_data_t data)
344 i++; 457 i++;
345 } 458 }
346 459
347 spin_lock(&task_capability_lock); 460 if (pid && (pid != task_pid_vnr(current)))
348 read_lock(&tasklist_lock); 461 ret = do_sys_capset_other_tasks(pid, &effective, &inheritable,
349 462 &permitted);
350 if (pid > 0 && pid != task_pid_vnr(current)) { 463 else {
351 target = find_task_by_vpid(pid); 464 /*
352 if (!target) { 465 * This lock is required even when filesystem
353 ret = -ESRCH; 466 * capability support is configured - it protects the
354 goto out; 467 * sys_capget() call from returning incorrect data in
355 } 468 * the case that the targeted process is not the
356 } else 469 * current one.
357 target = current; 470 */
358 471 spin_lock(&task_capability_lock);
359 ret = 0;
360
361 /* having verified that the proposed changes are legal,
362 we now put them into effect. */
363 if (pid < 0) {
364 if (pid == -1) /* all procs other than current and init */
365 ret = cap_set_all(&effective, &inheritable, &permitted);
366 472
367 else /* all procs in process group */ 473 ret = security_capset_check(current, &effective, &inheritable,
368 ret = cap_set_pg(-pid, &effective, &inheritable,
369 &permitted);
370 } else {
371 ret = security_capset_check(target, &effective, &inheritable,
372 &permitted); 474 &permitted);
475 /*
476 * Having verified that the proposed changes are
477 * legal, we now put them into effect.
478 */
373 if (!ret) 479 if (!ret)
374 security_capset_set(target, &effective, &inheritable, 480 security_capset_set(current, &effective, &inheritable,
375 &permitted); 481 &permitted);
482 spin_unlock(&task_capability_lock);
376 } 483 }
377 484
378out:
379 read_unlock(&tasklist_lock);
380 spin_unlock(&task_capability_lock);
381 485
382 return ret; 486 return ret;
383} 487}
384 488
385int __capable(struct task_struct *t, int cap) 489/**
490 * capable - Determine if the current task has a superior capability in effect
491 * @cap: The capability to be tested for
492 *
493 * Return true if the current task has the given superior capability currently
494 * available for use, false if not.
495 *
496 * This sets PF_SUPERPRIV on the task if the capability is available on the
497 * assumption that it's about to be used.
498 */
499int capable(int cap)
386{ 500{
387 if (security_capable(t, cap) == 0) { 501 if (has_capability(current, cap)) {
388 t->flags |= PF_SUPERPRIV; 502 current->flags |= PF_SUPERPRIV;
389 return 1; 503 return 1;
390 } 504 }
391 return 0; 505 return 0;
392} 506}
393
394int capable(int cap)
395{
396 return __capable(current, cap);
397}
398EXPORT_SYMBOL(capable); 507EXPORT_SYMBOL(capable);
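The capability.c rework above drops __capable() and has capable() check the current task through has_capability(), setting PF_SUPERPRIV when the capability is usable. A short usage sketch with a hypothetical privileged operation:

#include <linux/capability.h>
#include <linux/errno.h>

/* Hypothetical privileged configuration hook: only a task that currently
 * has CAP_SYS_ADMIN in effect may proceed. */
static int foo_set_policy(int policy)
{
	if (!capable(CAP_SYS_ADMIN))	/* marks the caller PF_SUPERPRIV on success */
		return -EPERM;

	/* ... apply the new policy ... */
	return 0;
}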
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 15ac0e1e4f4d..35eebd5510c2 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -45,6 +45,7 @@
45#include <linux/delayacct.h> 45#include <linux/delayacct.h>
46#include <linux/cgroupstats.h> 46#include <linux/cgroupstats.h>
47#include <linux/hash.h> 47#include <linux/hash.h>
48#include <linux/namei.h>
48 49
49#include <asm/atomic.h> 50#include <asm/atomic.h>
50 51
@@ -89,11 +90,7 @@ struct cgroupfs_root {
89 /* Hierarchy-specific flags */ 90 /* Hierarchy-specific flags */
90 unsigned long flags; 91 unsigned long flags;
91 92
92 /* The path to use for release notifications. No locking 93 /* The path to use for release notifications. */
93 * between setting and use - so if userspace updates this
94 * while child cgroups exist, you could miss a
95 * notification. We ensure that it's always a valid
96 * NUL-terminated string */
97 char release_agent_path[PATH_MAX]; 94 char release_agent_path[PATH_MAX];
98}; 95};
99 96
@@ -118,7 +115,7 @@ static int root_count;
118 * extra work in the fork/exit path if none of the subsystems need to 115 * extra work in the fork/exit path if none of the subsystems need to
119 * be called. 116 * be called.
120 */ 117 */
121static int need_forkexit_callback; 118static int need_forkexit_callback __read_mostly;
122static int need_mm_owner_callback __read_mostly; 119static int need_mm_owner_callback __read_mostly;
123 120
124/* convenient tests for these bits */ 121/* convenient tests for these bits */
@@ -220,7 +217,7 @@ static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[])
220 * task until after the first call to cgroup_iter_start(). This 217 * task until after the first call to cgroup_iter_start(). This
221 * reduces the fork()/exit() overhead for people who have cgroups 218 * reduces the fork()/exit() overhead for people who have cgroups
222 * compiled into their kernel but not actually in use */ 219 * compiled into their kernel but not actually in use */
223static int use_task_css_set_links; 220static int use_task_css_set_links __read_mostly;
224 221
225/* When we create or destroy a css_set, the operation simply 222/* When we create or destroy a css_set, the operation simply
226 * takes/releases a reference count on all the cgroups referenced 223 * takes/releases a reference count on all the cgroups referenced
@@ -241,26 +238,37 @@ static int use_task_css_set_links;
241 */ 238 */
242static void unlink_css_set(struct css_set *cg) 239static void unlink_css_set(struct css_set *cg)
243{ 240{
244 write_lock(&css_set_lock); 241 struct cg_cgroup_link *link;
242 struct cg_cgroup_link *saved_link;
243
245 hlist_del(&cg->hlist); 244 hlist_del(&cg->hlist);
246 css_set_count--; 245 css_set_count--;
247 while (!list_empty(&cg->cg_links)) { 246
248 struct cg_cgroup_link *link; 247 list_for_each_entry_safe(link, saved_link, &cg->cg_links,
249 link = list_entry(cg->cg_links.next, 248 cg_link_list) {
250 struct cg_cgroup_link, cg_link_list);
251 list_del(&link->cg_link_list); 249 list_del(&link->cg_link_list);
252 list_del(&link->cgrp_link_list); 250 list_del(&link->cgrp_link_list);
253 kfree(link); 251 kfree(link);
254 } 252 }
255 write_unlock(&css_set_lock);
256} 253}
257 254
258static void __release_css_set(struct kref *k, int taskexit) 255static void __put_css_set(struct css_set *cg, int taskexit)
259{ 256{
260 int i; 257 int i;
261 struct css_set *cg = container_of(k, struct css_set, ref); 258 /*
262 259 * Ensure that the refcount doesn't hit zero while any readers
260 * can see it. Similar to atomic_dec_and_lock(), but for an
261 * rwlock
262 */
263 if (atomic_add_unless(&cg->refcount, -1, 1))
264 return;
265 write_lock(&css_set_lock);
266 if (!atomic_dec_and_test(&cg->refcount)) {
267 write_unlock(&css_set_lock);
268 return;
269 }
263 unlink_css_set(cg); 270 unlink_css_set(cg);
271 write_unlock(&css_set_lock);
264 272
265 rcu_read_lock(); 273 rcu_read_lock();
266 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 274 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
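The __put_css_set() hunk above replaces the kref with a raw atomic_t so the final reference drop can be serialized against css_set_lock readers, an atomic_dec_and_lock()-style pattern adapted to an rwlock, as the in-code comment notes. A compact sketch of that pattern with hypothetical names (struct foo, foo_lock, foo_put):

#include <linux/list.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <asm/atomic.h>

struct foo {
	atomic_t refcount;
	struct list_head list;		/* on a lookup list protected by foo_lock */
};

static DEFINE_RWLOCK(foo_lock);

static void foo_put(struct foo *f)
{
	/* Fast path: drop a reference unless it would be the last one;
	 * readers under read_lock(&foo_lock) may still be looking at f. */
	if (atomic_add_unless(&f->refcount, -1, 1))
		return;

	/* Slow path: take the write lock before the count can hit zero,
	 * so no reader can find an object that is about to go away. */
	write_lock(&foo_lock);
	if (!atomic_dec_and_test(&f->refcount)) {
		write_unlock(&foo_lock);
		return;
	}
	list_del(&f->list);		/* unlink while the lock is held */
	write_unlock(&foo_lock);
	kfree(f);			/* unreachable now, safe to free */
}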
@@ -276,32 +284,22 @@ static void __release_css_set(struct kref *k, int taskexit)
276 kfree(cg); 284 kfree(cg);
277} 285}
278 286
279static void release_css_set(struct kref *k)
280{
281 __release_css_set(k, 0);
282}
283
284static void release_css_set_taskexit(struct kref *k)
285{
286 __release_css_set(k, 1);
287}
288
289/* 287/*
290 * refcounted get/put for css_set objects 288 * refcounted get/put for css_set objects
291 */ 289 */
292static inline void get_css_set(struct css_set *cg) 290static inline void get_css_set(struct css_set *cg)
293{ 291{
294 kref_get(&cg->ref); 292 atomic_inc(&cg->refcount);
295} 293}
296 294
297static inline void put_css_set(struct css_set *cg) 295static inline void put_css_set(struct css_set *cg)
298{ 296{
299 kref_put(&cg->ref, release_css_set); 297 __put_css_set(cg, 0);
300} 298}
301 299
302static inline void put_css_set_taskexit(struct css_set *cg) 300static inline void put_css_set_taskexit(struct css_set *cg)
303{ 301{
304 kref_put(&cg->ref, release_css_set_taskexit); 302 __put_css_set(cg, 1);
305} 303}
306 304
307/* 305/*
@@ -355,6 +353,17 @@ static struct css_set *find_existing_css_set(
355 return NULL; 353 return NULL;
356} 354}
357 355
356static void free_cg_links(struct list_head *tmp)
357{
358 struct cg_cgroup_link *link;
359 struct cg_cgroup_link *saved_link;
360
361 list_for_each_entry_safe(link, saved_link, tmp, cgrp_link_list) {
362 list_del(&link->cgrp_link_list);
363 kfree(link);
364 }
365}
366
358/* 367/*
359 * allocate_cg_links() allocates "count" cg_cgroup_link structures 368 * allocate_cg_links() allocates "count" cg_cgroup_link structures
360 * and chains them on tmp through their cgrp_link_list fields. Returns 0 on 369 * and chains them on tmp through their cgrp_link_list fields. Returns 0 on
@@ -368,13 +377,7 @@ static int allocate_cg_links(int count, struct list_head *tmp)
368 for (i = 0; i < count; i++) { 377 for (i = 0; i < count; i++) {
369 link = kmalloc(sizeof(*link), GFP_KERNEL); 378 link = kmalloc(sizeof(*link), GFP_KERNEL);
370 if (!link) { 379 if (!link) {
371 while (!list_empty(tmp)) { 380 free_cg_links(tmp);
372 link = list_entry(tmp->next,
373 struct cg_cgroup_link,
374 cgrp_link_list);
375 list_del(&link->cgrp_link_list);
376 kfree(link);
377 }
378 return -ENOMEM; 381 return -ENOMEM;
379 } 382 }
380 list_add(&link->cgrp_link_list, tmp); 383 list_add(&link->cgrp_link_list, tmp);
@@ -382,18 +385,6 @@ static int allocate_cg_links(int count, struct list_head *tmp)
382 return 0; 385 return 0;
383} 386}
384 387
385static void free_cg_links(struct list_head *tmp)
386{
387 while (!list_empty(tmp)) {
388 struct cg_cgroup_link *link;
389 link = list_entry(tmp->next,
390 struct cg_cgroup_link,
391 cgrp_link_list);
392 list_del(&link->cgrp_link_list);
393 kfree(link);
394 }
395}
396
397/* 388/*
398 * find_css_set() takes an existing cgroup group and a 389 * find_css_set() takes an existing cgroup group and a
399 * cgroup object, and returns a css_set object that's 390 * cgroup object, and returns a css_set object that's
@@ -415,11 +406,11 @@ static struct css_set *find_css_set(
415 406
416 /* First see if we already have a cgroup group that matches 407 /* First see if we already have a cgroup group that matches
417 * the desired set */ 408 * the desired set */
418 write_lock(&css_set_lock); 409 read_lock(&css_set_lock);
419 res = find_existing_css_set(oldcg, cgrp, template); 410 res = find_existing_css_set(oldcg, cgrp, template);
420 if (res) 411 if (res)
421 get_css_set(res); 412 get_css_set(res);
422 write_unlock(&css_set_lock); 413 read_unlock(&css_set_lock);
423 414
424 if (res) 415 if (res)
425 return res; 416 return res;
@@ -434,7 +425,7 @@ static struct css_set *find_css_set(
434 return NULL; 425 return NULL;
435 } 426 }
436 427
437 kref_init(&res->ref); 428 atomic_set(&res->refcount, 1);
438 INIT_LIST_HEAD(&res->cg_links); 429 INIT_LIST_HEAD(&res->cg_links);
439 INIT_LIST_HEAD(&res->tasks); 430 INIT_LIST_HEAD(&res->tasks);
440 INIT_HLIST_NODE(&res->hlist); 431 INIT_HLIST_NODE(&res->hlist);
@@ -507,10 +498,6 @@ static struct css_set *find_css_set(
507 * knows that the cgroup won't be removed, as cgroup_rmdir() 498 * knows that the cgroup won't be removed, as cgroup_rmdir()
508 * needs that mutex. 499 * needs that mutex.
509 * 500 *
510 * The cgroup_common_file_write handler for operations that modify
511 * the cgroup hierarchy holds cgroup_mutex across the entire operation,
512 * single threading all such cgroup modifications across the system.
513 *
514 * The fork and exit callbacks cgroup_fork() and cgroup_exit(), don't 501 * The fork and exit callbacks cgroup_fork() and cgroup_exit(), don't
515 * (usually) take cgroup_mutex. These are the two most performance 502 * (usually) take cgroup_mutex. These are the two most performance
516 * critical pieces of code here. The exception occurs on cgroup_exit(), 503 * critical pieces of code here. The exception occurs on cgroup_exit(),
@@ -881,6 +868,14 @@ static struct super_operations cgroup_ops = {
881 .remount_fs = cgroup_remount, 868 .remount_fs = cgroup_remount,
882}; 869};
883 870
871static void init_cgroup_housekeeping(struct cgroup *cgrp)
872{
873 INIT_LIST_HEAD(&cgrp->sibling);
874 INIT_LIST_HEAD(&cgrp->children);
875 INIT_LIST_HEAD(&cgrp->css_sets);
876 INIT_LIST_HEAD(&cgrp->release_list);
877 init_rwsem(&cgrp->pids_mutex);
878}
884static void init_cgroup_root(struct cgroupfs_root *root) 879static void init_cgroup_root(struct cgroupfs_root *root)
885{ 880{
886 struct cgroup *cgrp = &root->top_cgroup; 881 struct cgroup *cgrp = &root->top_cgroup;
@@ -889,10 +884,7 @@ static void init_cgroup_root(struct cgroupfs_root *root)
889 root->number_of_cgroups = 1; 884 root->number_of_cgroups = 1;
890 cgrp->root = root; 885 cgrp->root = root;
891 cgrp->top_cgroup = cgrp; 886 cgrp->top_cgroup = cgrp;
892 INIT_LIST_HEAD(&cgrp->sibling); 887 init_cgroup_housekeeping(cgrp);
893 INIT_LIST_HEAD(&cgrp->children);
894 INIT_LIST_HEAD(&cgrp->css_sets);
895 INIT_LIST_HEAD(&cgrp->release_list);
896} 888}
897 889
898static int cgroup_test_super(struct super_block *sb, void *data) 890static int cgroup_test_super(struct super_block *sb, void *data)
@@ -962,7 +954,6 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
962 struct super_block *sb; 954 struct super_block *sb;
963 struct cgroupfs_root *root; 955 struct cgroupfs_root *root;
964 struct list_head tmp_cg_links; 956 struct list_head tmp_cg_links;
965 INIT_LIST_HEAD(&tmp_cg_links);
966 957
967 /* First find the desired set of subsystems */ 958 /* First find the desired set of subsystems */
968 ret = parse_cgroupfs_options(data, &opts); 959 ret = parse_cgroupfs_options(data, &opts);
@@ -1093,6 +1084,8 @@ static void cgroup_kill_sb(struct super_block *sb) {
1093 struct cgroupfs_root *root = sb->s_fs_info; 1084 struct cgroupfs_root *root = sb->s_fs_info;
1094 struct cgroup *cgrp = &root->top_cgroup; 1085 struct cgroup *cgrp = &root->top_cgroup;
1095 int ret; 1086 int ret;
1087 struct cg_cgroup_link *link;
1088 struct cg_cgroup_link *saved_link;
1096 1089
1097 BUG_ON(!root); 1090 BUG_ON(!root);
1098 1091
@@ -1112,10 +1105,9 @@ static void cgroup_kill_sb(struct super_block *sb) {
1112 * root cgroup 1105 * root cgroup
1113 */ 1106 */
1114 write_lock(&css_set_lock); 1107 write_lock(&css_set_lock);
1115 while (!list_empty(&cgrp->css_sets)) { 1108
1116 struct cg_cgroup_link *link; 1109 list_for_each_entry_safe(link, saved_link, &cgrp->css_sets,
1117 link = list_entry(cgrp->css_sets.next, 1110 cgrp_link_list) {
1118 struct cg_cgroup_link, cgrp_link_list);
1119 list_del(&link->cg_link_list); 1111 list_del(&link->cg_link_list);
1120 list_del(&link->cgrp_link_list); 1112 list_del(&link->cgrp_link_list);
1121 kfree(link); 1113 kfree(link);
@@ -1281,18 +1273,14 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
1281} 1273}
1282 1274
1283/* 1275/*
1284 * Attach task with pid 'pid' to cgroup 'cgrp'. Call with 1276 * Attach task with pid 'pid' to cgroup 'cgrp'. Call with cgroup_mutex
1285 * cgroup_mutex, may take task_lock of task 1277 * held. May take task_lock of task
1286 */ 1278 */
1287static int attach_task_by_pid(struct cgroup *cgrp, char *pidbuf) 1279static int attach_task_by_pid(struct cgroup *cgrp, u64 pid)
1288{ 1280{
1289 pid_t pid;
1290 struct task_struct *tsk; 1281 struct task_struct *tsk;
1291 int ret; 1282 int ret;
1292 1283
1293 if (sscanf(pidbuf, "%d", &pid) != 1)
1294 return -EIO;
1295
1296 if (pid) { 1284 if (pid) {
1297 rcu_read_lock(); 1285 rcu_read_lock();
1298 tsk = find_task_by_vpid(pid); 1286 tsk = find_task_by_vpid(pid);
@@ -1318,6 +1306,16 @@ static int attach_task_by_pid(struct cgroup *cgrp, char *pidbuf)
1318 return ret; 1306 return ret;
1319} 1307}
1320 1308
1309static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid)
1310{
1311 int ret;
1312 if (!cgroup_lock_live_group(cgrp))
1313 return -ENODEV;
1314 ret = attach_task_by_pid(cgrp, pid);
1315 cgroup_unlock();
1316 return ret;
1317}
1318
1321/* The various types of files and directories in a cgroup file system */ 1319/* The various types of files and directories in a cgroup file system */
1322enum cgroup_filetype { 1320enum cgroup_filetype {
1323 FILE_ROOT, 1321 FILE_ROOT,
@@ -1327,12 +1325,54 @@ enum cgroup_filetype {
1327 FILE_RELEASE_AGENT, 1325 FILE_RELEASE_AGENT,
1328}; 1326};
1329 1327
1328/**
1329 * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive.
1330 * @cgrp: the cgroup to be checked for liveness
1331 *
1332 * On success, returns true; the lock should be later released with
1333 * cgroup_unlock(). On failure returns false with no lock held.
1334 */
1335bool cgroup_lock_live_group(struct cgroup *cgrp)
1336{
1337 mutex_lock(&cgroup_mutex);
1338 if (cgroup_is_removed(cgrp)) {
1339 mutex_unlock(&cgroup_mutex);
1340 return false;
1341 }
1342 return true;
1343}
1344
1345static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft,
1346 const char *buffer)
1347{
1348 BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);
1349 if (!cgroup_lock_live_group(cgrp))
1350 return -ENODEV;
1351 strcpy(cgrp->root->release_agent_path, buffer);
1352 cgroup_unlock();
1353 return 0;
1354}
1355
1356static int cgroup_release_agent_show(struct cgroup *cgrp, struct cftype *cft,
1357 struct seq_file *seq)
1358{
1359 if (!cgroup_lock_live_group(cgrp))
1360 return -ENODEV;
1361 seq_puts(seq, cgrp->root->release_agent_path);
1362 seq_putc(seq, '\n');
1363 cgroup_unlock();
1364 return 0;
1365}
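
cgroup_lock_live_group() has a slightly unusual contract: true means the caller now holds cgroup_mutex and the group is alive, false means the lock has already been dropped. A hedged user-space analogue of that contract and of how a handler such as the release_agent write above consumes it (a global pthread mutex stands in for cgroup_mutex; names are invented for illustration):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t group_mutex = PTHREAD_MUTEX_INITIALIZER;

struct group {
	bool removed;
	char release_agent_path[4096];
};

/* true: lock held and group alive.  false: no lock held. */
static bool lock_live_group(struct group *grp)
{
	pthread_mutex_lock(&group_mutex);
	if (grp->removed) {
		pthread_mutex_unlock(&group_mutex);
		return false;
	}
	return true;
}

int set_release_agent(struct group *grp, const char *buffer)
{
	if (!lock_live_group(grp))
		return -1;		/* the kernel handler returns -ENODEV */
	snprintf(grp->release_agent_path,
		 sizeof(grp->release_agent_path), "%s", buffer);
	pthread_mutex_unlock(&group_mutex);
	return 0;
}
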
1366
1367/* A buffer size big enough for numbers or short strings */
1368#define CGROUP_LOCAL_BUFFER_SIZE 64
1369
1330static ssize_t cgroup_write_X64(struct cgroup *cgrp, struct cftype *cft, 1370static ssize_t cgroup_write_X64(struct cgroup *cgrp, struct cftype *cft,
1331 struct file *file, 1371 struct file *file,
1332 const char __user *userbuf, 1372 const char __user *userbuf,
1333 size_t nbytes, loff_t *unused_ppos) 1373 size_t nbytes, loff_t *unused_ppos)
1334{ 1374{
1335 char buffer[64]; 1375 char buffer[CGROUP_LOCAL_BUFFER_SIZE];
1336 int retval = 0; 1376 int retval = 0;
1337 char *end; 1377 char *end;
1338 1378
@@ -1361,68 +1401,39 @@ static ssize_t cgroup_write_X64(struct cgroup *cgrp, struct cftype *cft,
1361 return retval; 1401 return retval;
1362} 1402}
1363 1403
1364static ssize_t cgroup_common_file_write(struct cgroup *cgrp, 1404static ssize_t cgroup_write_string(struct cgroup *cgrp, struct cftype *cft,
1365 struct cftype *cft, 1405 struct file *file,
1366 struct file *file, 1406 const char __user *userbuf,
1367 const char __user *userbuf, 1407 size_t nbytes, loff_t *unused_ppos)
1368 size_t nbytes, loff_t *unused_ppos)
1369{ 1408{
1370 enum cgroup_filetype type = cft->private; 1409 char local_buffer[CGROUP_LOCAL_BUFFER_SIZE];
1371 char *buffer;
1372 int retval = 0; 1410 int retval = 0;
1411 size_t max_bytes = cft->max_write_len;
1412 char *buffer = local_buffer;
1373 1413
1374 if (nbytes >= PATH_MAX) 1414 if (!max_bytes)
1415 max_bytes = sizeof(local_buffer) - 1;
1416 if (nbytes >= max_bytes)
1375 return -E2BIG; 1417 return -E2BIG;
1376 1418 /* Allocate a dynamic buffer if we need one */
1377 /* +1 for nul-terminator */ 1419 if (nbytes >= sizeof(local_buffer)) {
1378 buffer = kmalloc(nbytes + 1, GFP_KERNEL); 1420 buffer = kmalloc(nbytes + 1, GFP_KERNEL);
1379 if (buffer == NULL) 1421 if (buffer == NULL)
1380 return -ENOMEM; 1422 return -ENOMEM;
1381 1423 }
1382 if (copy_from_user(buffer, userbuf, nbytes)) { 1424 if (nbytes && copy_from_user(buffer, userbuf, nbytes)) {
1383 retval = -EFAULT; 1425 retval = -EFAULT;
1384 goto out1; 1426 goto out;
1385 } 1427 }
1386 buffer[nbytes] = 0; /* nul-terminate */
1387 strstrip(buffer); /* strip -just- trailing whitespace */
1388
1389 mutex_lock(&cgroup_mutex);
1390 1428
1391 /* 1429 buffer[nbytes] = 0; /* nul-terminate */
1392 * This was already checked for in cgroup_file_write(), but 1430 strstrip(buffer);
1393 * check again now we're holding cgroup_mutex. 1431 retval = cft->write_string(cgrp, cft, buffer);
1394 */ 1432 if (!retval)
1395 if (cgroup_is_removed(cgrp)) {
1396 retval = -ENODEV;
1397 goto out2;
1398 }
1399
1400 switch (type) {
1401 case FILE_TASKLIST:
1402 retval = attach_task_by_pid(cgrp, buffer);
1403 break;
1404 case FILE_NOTIFY_ON_RELEASE:
1405 clear_bit(CGRP_RELEASABLE, &cgrp->flags);
1406 if (simple_strtoul(buffer, NULL, 10) != 0)
1407 set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
1408 else
1409 clear_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
1410 break;
1411 case FILE_RELEASE_AGENT:
1412 BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);
1413 strcpy(cgrp->root->release_agent_path, buffer);
1414 break;
1415 default:
1416 retval = -EINVAL;
1417 goto out2;
1418 }
1419
1420 if (retval == 0)
1421 retval = nbytes; 1433 retval = nbytes;
1422out2: 1434out:
1423 mutex_unlock(&cgroup_mutex); 1435 if (buffer != local_buffer)
1424out1: 1436 kfree(buffer);
1425 kfree(buffer);
1426 return retval; 1437 return retval;
1427} 1438}
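
cgroup_write_string() caps every write at cft->max_write_len (or just under the 64-byte local buffer when no cap is set) and only reaches for kmalloc() when the string does not fit on the stack. A rough user-space rendering of that buffer strategy, with memcpy() standing in for copy_from_user() and an injected callback standing in for cft->write_string() (a sketch, not the kernel code):

#include <errno.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>

#define LOCAL_BUFFER_SIZE 64

ssize_t write_string(const char *userbuf, size_t nbytes,
		     size_t max_write_len,
		     int (*handler)(const char *))
{
	char local_buffer[LOCAL_BUFFER_SIZE];
	char *buffer = local_buffer;
	size_t max_bytes = max_write_len ? max_write_len
					 : sizeof(local_buffer) - 1;
	ssize_t retval;

	if (nbytes >= max_bytes)
		return -E2BIG;
	if (nbytes >= sizeof(local_buffer)) {	/* too big for the stack copy */
		buffer = malloc(nbytes + 1);
		if (!buffer)
			return -ENOMEM;
	}
	memcpy(buffer, userbuf, nbytes);
	buffer[nbytes] = '\0';

	retval = handler(buffer);		/* cft->write_string() */
	if (!retval)
		retval = (ssize_t)nbytes;
	if (buffer != local_buffer)
		free(buffer);
	return retval;
}
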
1428 1439
@@ -1438,6 +1449,8 @@ static ssize_t cgroup_file_write(struct file *file, const char __user *buf,
1438 return cft->write(cgrp, cft, file, buf, nbytes, ppos); 1449 return cft->write(cgrp, cft, file, buf, nbytes, ppos);
1439 if (cft->write_u64 || cft->write_s64) 1450 if (cft->write_u64 || cft->write_s64)
1440 return cgroup_write_X64(cgrp, cft, file, buf, nbytes, ppos); 1451 return cgroup_write_X64(cgrp, cft, file, buf, nbytes, ppos);
1452 if (cft->write_string)
1453 return cgroup_write_string(cgrp, cft, file, buf, nbytes, ppos);
1441 if (cft->trigger) { 1454 if (cft->trigger) {
1442 int ret = cft->trigger(cgrp, (unsigned int)cft->private); 1455 int ret = cft->trigger(cgrp, (unsigned int)cft->private);
1443 return ret ? ret : nbytes; 1456 return ret ? ret : nbytes;
@@ -1450,7 +1463,7 @@ static ssize_t cgroup_read_u64(struct cgroup *cgrp, struct cftype *cft,
1450 char __user *buf, size_t nbytes, 1463 char __user *buf, size_t nbytes,
1451 loff_t *ppos) 1464 loff_t *ppos)
1452{ 1465{
1453 char tmp[64]; 1466 char tmp[CGROUP_LOCAL_BUFFER_SIZE];
1454 u64 val = cft->read_u64(cgrp, cft); 1467 u64 val = cft->read_u64(cgrp, cft);
1455 int len = sprintf(tmp, "%llu\n", (unsigned long long) val); 1468 int len = sprintf(tmp, "%llu\n", (unsigned long long) val);
1456 1469
@@ -1462,56 +1475,13 @@ static ssize_t cgroup_read_s64(struct cgroup *cgrp, struct cftype *cft,
1462 char __user *buf, size_t nbytes, 1475 char __user *buf, size_t nbytes,
1463 loff_t *ppos) 1476 loff_t *ppos)
1464{ 1477{
1465 char tmp[64]; 1478 char tmp[CGROUP_LOCAL_BUFFER_SIZE];
1466 s64 val = cft->read_s64(cgrp, cft); 1479 s64 val = cft->read_s64(cgrp, cft);
1467 int len = sprintf(tmp, "%lld\n", (long long) val); 1480 int len = sprintf(tmp, "%lld\n", (long long) val);
1468 1481
1469 return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); 1482 return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
1470} 1483}
1471 1484
1472static ssize_t cgroup_common_file_read(struct cgroup *cgrp,
1473 struct cftype *cft,
1474 struct file *file,
1475 char __user *buf,
1476 size_t nbytes, loff_t *ppos)
1477{
1478 enum cgroup_filetype type = cft->private;
1479 char *page;
1480 ssize_t retval = 0;
1481 char *s;
1482
1483 if (!(page = (char *)__get_free_page(GFP_KERNEL)))
1484 return -ENOMEM;
1485
1486 s = page;
1487
1488 switch (type) {
1489 case FILE_RELEASE_AGENT:
1490 {
1491 struct cgroupfs_root *root;
1492 size_t n;
1493 mutex_lock(&cgroup_mutex);
1494 root = cgrp->root;
1495 n = strnlen(root->release_agent_path,
1496 sizeof(root->release_agent_path));
1497 n = min(n, (size_t) PAGE_SIZE);
1498 strncpy(s, root->release_agent_path, n);
1499 mutex_unlock(&cgroup_mutex);
1500 s += n;
1501 break;
1502 }
1503 default:
1504 retval = -EINVAL;
1505 goto out;
1506 }
1507 *s++ = '\n';
1508
1509 retval = simple_read_from_buffer(buf, nbytes, ppos, page, s - page);
1510out:
1511 free_page((unsigned long)page);
1512 return retval;
1513}
1514
1515static ssize_t cgroup_file_read(struct file *file, char __user *buf, 1485static ssize_t cgroup_file_read(struct file *file, char __user *buf,
1516 size_t nbytes, loff_t *ppos) 1486 size_t nbytes, loff_t *ppos)
1517{ 1487{
@@ -1560,7 +1530,7 @@ static int cgroup_seqfile_show(struct seq_file *m, void *arg)
1560 return cft->read_seq_string(state->cgroup, cft, m); 1530 return cft->read_seq_string(state->cgroup, cft, m);
1561} 1531}
1562 1532
1563int cgroup_seqfile_release(struct inode *inode, struct file *file) 1533static int cgroup_seqfile_release(struct inode *inode, struct file *file)
1564{ 1534{
1565 struct seq_file *seq = file->private_data; 1535 struct seq_file *seq = file->private_data;
1566 kfree(seq->private); 1536 kfree(seq->private);
@@ -1569,6 +1539,7 @@ int cgroup_seqfile_release(struct inode *inode, struct file *file)
1569 1539
1570static struct file_operations cgroup_seqfile_operations = { 1540static struct file_operations cgroup_seqfile_operations = {
1571 .read = seq_read, 1541 .read = seq_read,
1542 .write = cgroup_file_write,
1572 .llseek = seq_lseek, 1543 .llseek = seq_lseek,
1573 .release = cgroup_seqfile_release, 1544 .release = cgroup_seqfile_release,
1574}; 1545};
@@ -1756,15 +1727,11 @@ int cgroup_add_files(struct cgroup *cgrp,
1756int cgroup_task_count(const struct cgroup *cgrp) 1727int cgroup_task_count(const struct cgroup *cgrp)
1757{ 1728{
1758 int count = 0; 1729 int count = 0;
1759 struct list_head *l; 1730 struct cg_cgroup_link *link;
1760 1731
1761 read_lock(&css_set_lock); 1732 read_lock(&css_set_lock);
1762 l = cgrp->css_sets.next; 1733 list_for_each_entry(link, &cgrp->css_sets, cgrp_link_list) {
1763 while (l != &cgrp->css_sets) { 1734 count += atomic_read(&link->cg->refcount);
1764 struct cg_cgroup_link *link =
1765 list_entry(l, struct cg_cgroup_link, cgrp_link_list);
1766 count += atomic_read(&link->cg->ref.refcount);
1767 l = l->next;
1768 } 1735 }
1769 read_unlock(&css_set_lock); 1736 read_unlock(&css_set_lock);
1770 return count; 1737 return count;
@@ -2033,16 +2000,7 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)
2033 * but we cannot guarantee that the information we produce is correct 2000 * but we cannot guarantee that the information we produce is correct
2034 * unless we produce it entirely atomically. 2001 * unless we produce it entirely atomically.
2035 * 2002 *
2036 * Upon tasks file open(), a struct ctr_struct is allocated, that
2037 * will have a pointer to an array (also allocated here). The struct
2038 * ctr_struct * is stored in file->private_data. Its resources will
2039 * be freed by release() when the file is closed. The array is used
2040 * to sprintf the PIDs and then used by read().
2041 */ 2003 */
2042struct ctr_struct {
2043 char *buf;
2044 int bufsz;
2045};
2046 2004
2047/* 2005/*
2048 * Load into 'pidarray' up to 'npids' of the tasks using cgroup 2006 * Load into 'pidarray' up to 'npids' of the tasks using cgroup
@@ -2124,42 +2082,132 @@ static int cmppid(const void *a, const void *b)
2124 return *(pid_t *)a - *(pid_t *)b; 2082 return *(pid_t *)a - *(pid_t *)b;
2125} 2083}
2126 2084
2085
2127/* 2086/*
2128 * Convert array 'a' of 'npids' pid_t's to a string of newline separated 2087 * seq_file methods for the "tasks" file. The seq_file position is the
2129 * decimal pids in 'buf'. Don't write more than 'sz' chars, but return 2088 * next pid to display; the seq_file iterator is a pointer to the pid
2130 * count 'cnt' of how many chars would be written if buf were large enough. 2089 * in the cgroup->tasks_pids array.
2131 */ 2090 */
2132static int pid_array_to_buf(char *buf, int sz, pid_t *a, int npids) 2091
2092static void *cgroup_tasks_start(struct seq_file *s, loff_t *pos)
2133{ 2093{
2134 int cnt = 0; 2094 /*
2135 int i; 2095 * Initially we receive a position value that corresponds to
2096 * one more than the last pid shown (or 0 on the first call or
2097 * after a seek to the start). Use a binary-search to find the
2098 * next pid to display, if any
2099 */
2100 struct cgroup *cgrp = s->private;
2101 int index = 0, pid = *pos;
2102 int *iter;
2103
2104 down_read(&cgrp->pids_mutex);
2105 if (pid) {
2106 int end = cgrp->pids_length;
2107
2108 while (index < end) {
2109 int mid = (index + end) / 2;
2110 if (cgrp->tasks_pids[mid] == pid) {
2111 index = mid;
2112 break;
2113 } else if (cgrp->tasks_pids[mid] <= pid)
2114 index = mid + 1;
2115 else
2116 end = mid;
2117 }
2118 }
2119 /* If we're off the end of the array, we're done */
2120 if (index >= cgrp->pids_length)
2121 return NULL;
2122 /* Update the abstract position to be the actual pid that we found */
2123 iter = cgrp->tasks_pids + index;
2124 *pos = *iter;
2125 return iter;
2126}
2127
2128static void cgroup_tasks_stop(struct seq_file *s, void *v)
2129{
2130 struct cgroup *cgrp = s->private;
2131 up_read(&cgrp->pids_mutex);
2132}
2133
2134static void *cgroup_tasks_next(struct seq_file *s, void *v, loff_t *pos)
2135{
2136 struct cgroup *cgrp = s->private;
2137 int *p = v;
2138 int *end = cgrp->tasks_pids + cgrp->pids_length;
2139
2140 /*
2141 * Advance to the next pid in the array. If this goes off the
2142 * end, we're done
2143 */
2144 p++;
2145 if (p >= end) {
2146 return NULL;
2147 } else {
2148 *pos = *p;
2149 return p;
2150 }
2151}
2152
2153static int cgroup_tasks_show(struct seq_file *s, void *v)
2154{
2155 return seq_printf(s, "%d\n", *(int *)v);
2156}
2157
2158static struct seq_operations cgroup_tasks_seq_operations = {
2159 .start = cgroup_tasks_start,
2160 .stop = cgroup_tasks_stop,
2161 .next = cgroup_tasks_next,
2162 .show = cgroup_tasks_show,
2163};
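
The start/next/stop methods above let an arbitrarily large tasks file be read in chunks without pinning any state in the seq_file layer itself: start() binary-searches the sorted pid array for the first pid >= *pos and next() simply walks forward. A small self-contained sketch of that resume logic over ordinary ints (no seq_file machinery, illustrative names):

#include <stddef.h>
#include <stdio.h>

/* Return a pointer to the first pid >= pos, or NULL when iteration is done. */
static const int *tasks_start(const int *pids, size_t len, int pos)
{
	size_t index = 0, end = len;

	if (pos) {
		while (index < end) {
			size_t mid = (index + end) / 2;

			if (pids[mid] == pos) {
				index = mid;
				break;
			} else if (pids[mid] <= pos) {
				index = mid + 1;
			} else {
				end = mid;
			}
		}
	}
	return index < len ? &pids[index] : NULL;
}

int main(void)
{
	static const int pids[] = { 3, 8, 21, 42, 99 };
	const size_t len = sizeof(pids) / sizeof(pids[0]);
	const int *it;

	/* Resume the listing at pid 21, the value a previous next() left in pos. */
	for (it = tasks_start(pids, len, 21); it;
	     it = (it + 1 < pids + len) ? it + 1 : NULL)	/* next() */
		printf("%d\n", *it);
	return 0;
}
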
2136 2164
2137 for (i = 0; i < npids; i++) 2165static void release_cgroup_pid_array(struct cgroup *cgrp)
2138 cnt += snprintf(buf + cnt, max(sz - cnt, 0), "%d\n", a[i]); 2166{
2139 return cnt; 2167 down_write(&cgrp->pids_mutex);
2168 BUG_ON(!cgrp->pids_use_count);
2169 if (!--cgrp->pids_use_count) {
2170 kfree(cgrp->tasks_pids);
2171 cgrp->tasks_pids = NULL;
2172 cgrp->pids_length = 0;
2173 }
2174 up_write(&cgrp->pids_mutex);
2140} 2175}
2141 2176
2177static int cgroup_tasks_release(struct inode *inode, struct file *file)
2178{
2179 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
2180
2181 if (!(file->f_mode & FMODE_READ))
2182 return 0;
2183
2184 release_cgroup_pid_array(cgrp);
2185 return seq_release(inode, file);
2186}
2187
2188static struct file_operations cgroup_tasks_operations = {
2189 .read = seq_read,
2190 .llseek = seq_lseek,
2191 .write = cgroup_file_write,
2192 .release = cgroup_tasks_release,
2193};
2194
2142/* 2195/*
2143 * Handle an open on 'tasks' file. Prepare a buffer listing the 2196 * Handle an open on 'tasks' file. Prepare an array containing the
2144 * process id's of tasks currently attached to the cgroup being opened. 2197 * process id's of tasks currently attached to the cgroup being opened.
2145 *
2146 * Does not require any specific cgroup mutexes, and does not take any.
2147 */ 2198 */
2199
2148static int cgroup_tasks_open(struct inode *unused, struct file *file) 2200static int cgroup_tasks_open(struct inode *unused, struct file *file)
2149{ 2201{
2150 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); 2202 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
2151 struct ctr_struct *ctr;
2152 pid_t *pidarray; 2203 pid_t *pidarray;
2153 int npids; 2204 int npids;
2154 char c; 2205 int retval;
2155 2206
2207 /* Nothing to do for write-only files */
2156 if (!(file->f_mode & FMODE_READ)) 2208 if (!(file->f_mode & FMODE_READ))
2157 return 0; 2209 return 0;
2158 2210
2159 ctr = kmalloc(sizeof(*ctr), GFP_KERNEL);
2160 if (!ctr)
2161 goto err0;
2162
2163 /* 2211 /*
2164 * If cgroup gets more users after we read count, we won't have 2212 * If cgroup gets more users after we read count, we won't have
2165 * enough space - tough. This race is indistinguishable to the 2213 * enough space - tough. This race is indistinguishable to the
@@ -2167,57 +2215,31 @@ static int cgroup_tasks_open(struct inode *unused, struct file *file)
2167 * show up until sometime later on. 2215 * show up until sometime later on.
2168 */ 2216 */
2169 npids = cgroup_task_count(cgrp); 2217 npids = cgroup_task_count(cgrp);
2170 if (npids) { 2218 pidarray = kmalloc(npids * sizeof(pid_t), GFP_KERNEL);
2171 pidarray = kmalloc(npids * sizeof(pid_t), GFP_KERNEL); 2219 if (!pidarray)
2172 if (!pidarray) 2220 return -ENOMEM;
2173 goto err1; 2221 npids = pid_array_load(pidarray, npids, cgrp);
2174 2222 sort(pidarray, npids, sizeof(pid_t), cmppid, NULL);
2175 npids = pid_array_load(pidarray, npids, cgrp);
2176 sort(pidarray, npids, sizeof(pid_t), cmppid, NULL);
2177
2178 /* Call pid_array_to_buf() twice, first just to get bufsz */
2179 ctr->bufsz = pid_array_to_buf(&c, sizeof(c), pidarray, npids) + 1;
2180 ctr->buf = kmalloc(ctr->bufsz, GFP_KERNEL);
2181 if (!ctr->buf)
2182 goto err2;
2183 ctr->bufsz = pid_array_to_buf(ctr->buf, ctr->bufsz, pidarray, npids);
2184
2185 kfree(pidarray);
2186 } else {
2187 ctr->buf = NULL;
2188 ctr->bufsz = 0;
2189 }
2190 file->private_data = ctr;
2191 return 0;
2192
2193err2:
2194 kfree(pidarray);
2195err1:
2196 kfree(ctr);
2197err0:
2198 return -ENOMEM;
2199}
2200
2201static ssize_t cgroup_tasks_read(struct cgroup *cgrp,
2202 struct cftype *cft,
2203 struct file *file, char __user *buf,
2204 size_t nbytes, loff_t *ppos)
2205{
2206 struct ctr_struct *ctr = file->private_data;
2207 2223
2208 return simple_read_from_buffer(buf, nbytes, ppos, ctr->buf, ctr->bufsz); 2224 /*
2209} 2225 * Store the array in the cgroup, freeing the old
2226 * array if necessary
2227 */
2228 down_write(&cgrp->pids_mutex);
2229 kfree(cgrp->tasks_pids);
2230 cgrp->tasks_pids = pidarray;
2231 cgrp->pids_length = npids;
2232 cgrp->pids_use_count++;
2233 up_write(&cgrp->pids_mutex);
2210 2234
2211static int cgroup_tasks_release(struct inode *unused_inode, 2235 file->f_op = &cgroup_tasks_operations;
2212 struct file *file)
2213{
2214 struct ctr_struct *ctr;
2215 2236
2216 if (file->f_mode & FMODE_READ) { 2237 retval = seq_open(file, &cgroup_tasks_seq_operations);
2217 ctr = file->private_data; 2238 if (retval) {
2218 kfree(ctr->buf); 2239 release_cgroup_pid_array(cgrp);
2219 kfree(ctr); 2240 return retval;
2220 } 2241 }
2242 ((struct seq_file *)file->private_data)->private = cgrp;
2221 return 0; 2243 return 0;
2222} 2244}
2223 2245
@@ -2227,6 +2249,18 @@ static u64 cgroup_read_notify_on_release(struct cgroup *cgrp,
2227 return notify_on_release(cgrp); 2249 return notify_on_release(cgrp);
2228} 2250}
2229 2251
2252static int cgroup_write_notify_on_release(struct cgroup *cgrp,
2253 struct cftype *cft,
2254 u64 val)
2255{
2256 clear_bit(CGRP_RELEASABLE, &cgrp->flags);
2257 if (val)
2258 set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
2259 else
2260 clear_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags);
2261 return 0;
2262}
2263
2230/* 2264/*
2231 * for the common functions, 'private' gives the type of file 2265 * for the common functions, 'private' gives the type of file
2232 */ 2266 */
@@ -2234,8 +2268,7 @@ static struct cftype files[] = {
2234 { 2268 {
2235 .name = "tasks", 2269 .name = "tasks",
2236 .open = cgroup_tasks_open, 2270 .open = cgroup_tasks_open,
2237 .read = cgroup_tasks_read, 2271 .write_u64 = cgroup_tasks_write,
2238 .write = cgroup_common_file_write,
2239 .release = cgroup_tasks_release, 2272 .release = cgroup_tasks_release,
2240 .private = FILE_TASKLIST, 2273 .private = FILE_TASKLIST,
2241 }, 2274 },
@@ -2243,15 +2276,16 @@ static struct cftype files[] = {
2243 { 2276 {
2244 .name = "notify_on_release", 2277 .name = "notify_on_release",
2245 .read_u64 = cgroup_read_notify_on_release, 2278 .read_u64 = cgroup_read_notify_on_release,
2246 .write = cgroup_common_file_write, 2279 .write_u64 = cgroup_write_notify_on_release,
2247 .private = FILE_NOTIFY_ON_RELEASE, 2280 .private = FILE_NOTIFY_ON_RELEASE,
2248 }, 2281 },
2249}; 2282};
2250 2283
2251static struct cftype cft_release_agent = { 2284static struct cftype cft_release_agent = {
2252 .name = "release_agent", 2285 .name = "release_agent",
2253 .read = cgroup_common_file_read, 2286 .read_seq_string = cgroup_release_agent_show,
2254 .write = cgroup_common_file_write, 2287 .write_string = cgroup_release_agent_write,
2288 .max_write_len = PATH_MAX,
2255 .private = FILE_RELEASE_AGENT, 2289 .private = FILE_RELEASE_AGENT,
2256}; 2290};
2257 2291
@@ -2323,10 +2357,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
2323 2357
2324 mutex_lock(&cgroup_mutex); 2358 mutex_lock(&cgroup_mutex);
2325 2359
2326 INIT_LIST_HEAD(&cgrp->sibling); 2360 init_cgroup_housekeeping(cgrp);
2327 INIT_LIST_HEAD(&cgrp->children);
2328 INIT_LIST_HEAD(&cgrp->css_sets);
2329 INIT_LIST_HEAD(&cgrp->release_list);
2330 2361
2331 cgrp->parent = parent; 2362 cgrp->parent = parent;
2332 cgrp->root = parent->root; 2363 cgrp->root = parent->root;
@@ -2391,7 +2422,7 @@ static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode)
2391 return cgroup_create(c_parent, dentry, mode | S_IFDIR); 2422 return cgroup_create(c_parent, dentry, mode | S_IFDIR);
2392} 2423}
2393 2424
2394static inline int cgroup_has_css_refs(struct cgroup *cgrp) 2425static int cgroup_has_css_refs(struct cgroup *cgrp)
2395{ 2426{
2396 /* Check the reference count on each subsystem. Since we 2427 /* Check the reference count on each subsystem. Since we
2397 * already established that there are no tasks in the 2428 * already established that there are no tasks in the
@@ -2518,8 +2549,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
2518int __init cgroup_init_early(void) 2549int __init cgroup_init_early(void)
2519{ 2550{
2520 int i; 2551 int i;
2521 kref_init(&init_css_set.ref); 2552 atomic_set(&init_css_set.refcount, 1);
2522 kref_get(&init_css_set.ref);
2523 INIT_LIST_HEAD(&init_css_set.cg_links); 2553 INIT_LIST_HEAD(&init_css_set.cg_links);
2524 INIT_LIST_HEAD(&init_css_set.tasks); 2554 INIT_LIST_HEAD(&init_css_set.tasks);
2525 INIT_HLIST_NODE(&init_css_set.hlist); 2555 INIT_HLIST_NODE(&init_css_set.hlist);
@@ -2758,21 +2788,24 @@ void cgroup_fork_callbacks(struct task_struct *child)
2758 * Called on every change to mm->owner. mm_init_owner() does not 2788 * Called on every change to mm->owner. mm_init_owner() does not
2759 * invoke this routine, since it assigns the mm->owner the first time 2789 * invoke this routine, since it assigns the mm->owner the first time
2760 * and does not change it. 2790 * and does not change it.
2791 *
2792 * The callbacks are invoked with mmap_sem held in read mode.
2761 */ 2793 */
2762void cgroup_mm_owner_callbacks(struct task_struct *old, struct task_struct *new) 2794void cgroup_mm_owner_callbacks(struct task_struct *old, struct task_struct *new)
2763{ 2795{
2764 struct cgroup *oldcgrp, *newcgrp; 2796 struct cgroup *oldcgrp, *newcgrp = NULL;
2765 2797
2766 if (need_mm_owner_callback) { 2798 if (need_mm_owner_callback) {
2767 int i; 2799 int i;
2768 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 2800 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
2769 struct cgroup_subsys *ss = subsys[i]; 2801 struct cgroup_subsys *ss = subsys[i];
2770 oldcgrp = task_cgroup(old, ss->subsys_id); 2802 oldcgrp = task_cgroup(old, ss->subsys_id);
2771 newcgrp = task_cgroup(new, ss->subsys_id); 2803 if (new)
2804 newcgrp = task_cgroup(new, ss->subsys_id);
2772 if (oldcgrp == newcgrp) 2805 if (oldcgrp == newcgrp)
2773 continue; 2806 continue;
2774 if (ss->mm_owner_changed) 2807 if (ss->mm_owner_changed)
2775 ss->mm_owner_changed(ss, oldcgrp, newcgrp); 2808 ss->mm_owner_changed(ss, oldcgrp, newcgrp, new);
2776 } 2809 }
2777 } 2810 }
2778} 2811}
@@ -2869,16 +2902,17 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
2869 * cgroup_clone - clone the cgroup the given subsystem is attached to 2902 * cgroup_clone - clone the cgroup the given subsystem is attached to
2870 * @tsk: the task to be moved 2903 * @tsk: the task to be moved
2871 * @subsys: the given subsystem 2904 * @subsys: the given subsystem
2905 * @nodename: the name for the new cgroup
2872 * 2906 *
2873 * Duplicate the current cgroup in the hierarchy that the given 2907 * Duplicate the current cgroup in the hierarchy that the given
2874 * subsystem is attached to, and move this task into the new 2908 * subsystem is attached to, and move this task into the new
2875 * child. 2909 * child.
2876 */ 2910 */
2877int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys) 2911int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys,
2912 char *nodename)
2878{ 2913{
2879 struct dentry *dentry; 2914 struct dentry *dentry;
2880 int ret = 0; 2915 int ret = 0;
2881 char nodename[MAX_CGROUP_TYPE_NAMELEN];
2882 struct cgroup *parent, *child; 2916 struct cgroup *parent, *child;
2883 struct inode *inode; 2917 struct inode *inode;
2884 struct css_set *cg; 2918 struct css_set *cg;
@@ -2903,8 +2937,6 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys)
2903 cg = tsk->cgroups; 2937 cg = tsk->cgroups;
2904 parent = task_cgroup(tsk, subsys->subsys_id); 2938 parent = task_cgroup(tsk, subsys->subsys_id);
2905 2939
2906 snprintf(nodename, MAX_CGROUP_TYPE_NAMELEN, "%d", tsk->pid);
2907
2908 /* Pin the hierarchy */ 2940 /* Pin the hierarchy */
2909 atomic_inc(&parent->root->sb->s_active); 2941 atomic_inc(&parent->root->sb->s_active);
2910 2942
@@ -3078,27 +3110,24 @@ static void cgroup_release_agent(struct work_struct *work)
3078 while (!list_empty(&release_list)) { 3110 while (!list_empty(&release_list)) {
3079 char *argv[3], *envp[3]; 3111 char *argv[3], *envp[3];
3080 int i; 3112 int i;
3081 char *pathbuf; 3113 char *pathbuf = NULL, *agentbuf = NULL;
3082 struct cgroup *cgrp = list_entry(release_list.next, 3114 struct cgroup *cgrp = list_entry(release_list.next,
3083 struct cgroup, 3115 struct cgroup,
3084 release_list); 3116 release_list);
3085 list_del_init(&cgrp->release_list); 3117 list_del_init(&cgrp->release_list);
3086 spin_unlock(&release_list_lock); 3118 spin_unlock(&release_list_lock);
3087 pathbuf = kmalloc(PAGE_SIZE, GFP_KERNEL); 3119 pathbuf = kmalloc(PAGE_SIZE, GFP_KERNEL);
3088 if (!pathbuf) { 3120 if (!pathbuf)
3089 spin_lock(&release_list_lock); 3121 goto continue_free;
3090 continue; 3122 if (cgroup_path(cgrp, pathbuf, PAGE_SIZE) < 0)
3091 } 3123 goto continue_free;
3092 3124 agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL);
3093 if (cgroup_path(cgrp, pathbuf, PAGE_SIZE) < 0) { 3125 if (!agentbuf)
3094 kfree(pathbuf); 3126 goto continue_free;
3095 spin_lock(&release_list_lock);
3096 continue;
3097 }
3098 3127
3099 i = 0; 3128 i = 0;
3100 argv[i++] = cgrp->root->release_agent_path; 3129 argv[i++] = agentbuf;
3101 argv[i++] = (char *)pathbuf; 3130 argv[i++] = pathbuf;
3102 argv[i] = NULL; 3131 argv[i] = NULL;
3103 3132
3104 i = 0; 3133 i = 0;
@@ -3112,8 +3141,10 @@ static void cgroup_release_agent(struct work_struct *work)
3112 * be a slow process */ 3141 * be a slow process */
3113 mutex_unlock(&cgroup_mutex); 3142 mutex_unlock(&cgroup_mutex);
3114 call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC); 3143 call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
3115 kfree(pathbuf);
3116 mutex_lock(&cgroup_mutex); 3144 mutex_lock(&cgroup_mutex);
3145 continue_free:
3146 kfree(pathbuf);
3147 kfree(agentbuf);
3117 spin_lock(&release_list_lock); 3148 spin_lock(&release_list_lock);
3118 } 3149 }
3119 spin_unlock(&release_list_lock); 3150 spin_unlock(&release_list_lock);
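
The rewritten loop copies the release-agent path into its own agentbuf before cgroup_mutex is dropped, and every early exit now funnels through the single continue_free label so both allocations are freed on every path. A stand-alone user-space sketch of that cleanup idiom (invented names; printf() stands in for call_usermodehelper()):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

void notify_one(const char *path, const char *agent)
{
	char *pathbuf = NULL, *agentbuf = NULL;

	pathbuf = malloc(4096);
	if (!pathbuf)
		goto out_free;
	snprintf(pathbuf, 4096, "%s", path);

	agentbuf = strdup(agent);	/* kstrdup() stand-in */
	if (!agentbuf)
		goto out_free;

	printf("would run: %s %s\n", agentbuf, pathbuf);

out_free:				/* mirrors the continue_free: label */
	free(pathbuf);			/* free(NULL) is a no-op */
	free(agentbuf);
}
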
diff --git a/kernel/cgroup_debug.c b/kernel/cgroup_debug.c
index c3dc3aba4c02..daca6209202d 100644
--- a/kernel/cgroup_debug.c
+++ b/kernel/cgroup_debug.c
@@ -57,7 +57,7 @@ static u64 current_css_set_refcount_read(struct cgroup *cont,
57 u64 count; 57 u64 count;
58 58
59 rcu_read_lock(); 59 rcu_read_lock();
60 count = atomic_read(&current->cgroups->ref.refcount); 60 count = atomic_read(&current->cgroups->refcount);
61 rcu_read_unlock(); 61 rcu_read_unlock();
62 return count; 62 return count;
63} 63}
@@ -90,7 +90,7 @@ static struct cftype files[] = {
90 { 90 {
91 .name = "releasable", 91 .name = "releasable",
92 .read_u64 = releasable_read, 92 .read_u64 = releasable_read,
93 } 93 },
94}; 94};
95 95
96static int debug_populate(struct cgroup_subsys *ss, struct cgroup *cont) 96static int debug_populate(struct cgroup_subsys *ss, struct cgroup *cont)
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
new file mode 100644
index 000000000000..e95056954498
--- /dev/null
+++ b/kernel/cgroup_freezer.c
@@ -0,0 +1,379 @@
1/*
2 * cgroup_freezer.c - control group freezer subsystem
3 *
4 * Copyright IBM Corporation, 2007
5 *
6 * Author : Cedric Le Goater <clg@fr.ibm.com>
7 *
8 * This program is free software; you can redistribute it and/or modify it
9 * under the terms of version 2.1 of the GNU Lesser General Public License
10 * as published by the Free Software Foundation.
11 *
12 * This program is distributed in the hope that it would be useful, but
13 * WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
15 */
16
17#include <linux/module.h>
18#include <linux/cgroup.h>
19#include <linux/fs.h>
20#include <linux/uaccess.h>
21#include <linux/freezer.h>
22#include <linux/seq_file.h>
23
24enum freezer_state {
25 CGROUP_THAWED = 0,
26 CGROUP_FREEZING,
27 CGROUP_FROZEN,
28};
29
30struct freezer {
31 struct cgroup_subsys_state css;
32 enum freezer_state state;
33 spinlock_t lock; /* protects _writes_ to state */
34};
35
36static inline struct freezer *cgroup_freezer(
37 struct cgroup *cgroup)
38{
39 return container_of(
40 cgroup_subsys_state(cgroup, freezer_subsys_id),
41 struct freezer, css);
42}
43
44static inline struct freezer *task_freezer(struct task_struct *task)
45{
46 return container_of(task_subsys_state(task, freezer_subsys_id),
47 struct freezer, css);
48}
49
50int cgroup_frozen(struct task_struct *task)
51{
52 struct freezer *freezer;
53 enum freezer_state state;
54
55 task_lock(task);
56 freezer = task_freezer(task);
57 state = freezer->state;
58 task_unlock(task);
59
60 return state == CGROUP_FROZEN;
61}
62
63/*
64 * cgroups_write_string() limits the size of freezer state strings to
65 * CGROUP_LOCAL_BUFFER_SIZE
66 */
67static const char *freezer_state_strs[] = {
68 "THAWED",
69 "FREEZING",
70 "FROZEN",
71};
72
73/*
74 * State diagram
75 * Transitions are caused by userspace writes to the freezer.state file.
 76 * The values in parentheses are state labels. The rest are edge labels.
77 *
 78 * (THAWED) --FROZEN--> (FREEZING) --FROZEN--> (FROZEN)
 79 *    ^ ^                    |                     |
 80 *    | \_______THAWED_______/                     |
 81 *    \__________________________THAWED____________/
82 */
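
Read literally, the diagram says a FROZEN write moves THAWED to FREEZING, a repeated FROZEN write moves FREEZING to FROZEN once every task has frozen, and a THAWED write from either non-THAWED state thaws the group. The same edges as a small C lookup table, offered purely as a restatement of the comment; the code below derives the state from the member tasks rather than from a table:

enum freezer_state { CGROUP_THAWED, CGROUP_FREEZING, CGROUP_FROZEN };
enum freezer_write { WRITE_THAWED, WRITE_FROZEN };

/* Target state after a successful write; a FREEZING group only reaches
 * FROZEN once update_freezer_state() sees every member task frozen. */
static const enum freezer_state next_state[3][2] = {
	[CGROUP_THAWED]   = { [WRITE_THAWED] = CGROUP_THAWED,
			      [WRITE_FROZEN] = CGROUP_FREEZING },
	[CGROUP_FREEZING] = { [WRITE_THAWED] = CGROUP_THAWED,
			      [WRITE_FROZEN] = CGROUP_FROZEN },
	[CGROUP_FROZEN]   = { [WRITE_THAWED] = CGROUP_THAWED,
			      [WRITE_FROZEN] = CGROUP_FROZEN },
};
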
83
84struct cgroup_subsys freezer_subsys;
85
86/* Locks taken and their ordering
87 * ------------------------------
88 * css_set_lock
89 * cgroup_mutex (AKA cgroup_lock)
90 * task->alloc_lock (AKA task_lock)
91 * freezer->lock
92 * task->sighand->siglock
93 *
94 * cgroup code forces css_set_lock to be taken before task->alloc_lock
95 *
96 * freezer_create(), freezer_destroy():
97 * cgroup_mutex [ by cgroup core ]
98 *
99 * can_attach():
100 * cgroup_mutex
101 *
102 * cgroup_frozen():
103 * task->alloc_lock (to get task's cgroup)
104 *
105 * freezer_fork() (preserving fork() performance means can't take cgroup_mutex):
106 * task->alloc_lock (to get task's cgroup)
107 * freezer->lock
108 * sighand->siglock (if the cgroup is freezing)
109 *
110 * freezer_read():
111 * cgroup_mutex
112 * freezer->lock
113 * read_lock css_set_lock (cgroup iterator start)
114 *
115 * freezer_write() (freeze):
116 * cgroup_mutex
117 * freezer->lock
118 * read_lock css_set_lock (cgroup iterator start)
119 * sighand->siglock
120 *
121 * freezer_write() (unfreeze):
122 * cgroup_mutex
123 * freezer->lock
124 * read_lock css_set_lock (cgroup iterator start)
125 * task->alloc_lock (to prevent races with freeze_task())
126 * sighand->siglock
127 */
128static struct cgroup_subsys_state *freezer_create(struct cgroup_subsys *ss,
129 struct cgroup *cgroup)
130{
131 struct freezer *freezer;
132
133 freezer = kzalloc(sizeof(struct freezer), GFP_KERNEL);
134 if (!freezer)
135 return ERR_PTR(-ENOMEM);
136
137 spin_lock_init(&freezer->lock);
138 freezer->state = CGROUP_THAWED;
139 return &freezer->css;
140}
141
142static void freezer_destroy(struct cgroup_subsys *ss,
143 struct cgroup *cgroup)
144{
145 kfree(cgroup_freezer(cgroup));
146}
147
148/* Task is frozen or will freeze immediately when next it gets woken */
149static bool is_task_frozen_enough(struct task_struct *task)
150{
151 return frozen(task) ||
152 (task_is_stopped_or_traced(task) && freezing(task));
153}
154
155/*
156 * The call to cgroup_lock() in the freezer.state write method prevents
157 * a write to that file racing against an attach, and hence the
158 * can_attach() result will remain valid until the attach completes.
159 */
160static int freezer_can_attach(struct cgroup_subsys *ss,
161 struct cgroup *new_cgroup,
162 struct task_struct *task)
163{
164 struct freezer *freezer;
165 int retval;
166
167 /* Anything frozen can't move or be moved to/from */
168
169 if (is_task_frozen_enough(task))
170 return -EBUSY;
171
172 freezer = cgroup_freezer(new_cgroup);
173 if (freezer->state == CGROUP_FROZEN)
174 return -EBUSY;
175
176 retval = 0;
177 task_lock(task);
178 freezer = task_freezer(task);
179 if (freezer->state == CGROUP_FROZEN)
180 retval = -EBUSY;
181 task_unlock(task);
182 return retval;
183}
184
185static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task)
186{
187 struct freezer *freezer;
188
189 task_lock(task);
190 freezer = task_freezer(task);
191 task_unlock(task);
192
193 BUG_ON(freezer->state == CGROUP_FROZEN);
194 spin_lock_irq(&freezer->lock);
195 /* Locking avoids race with FREEZING -> THAWED transitions. */
196 if (freezer->state == CGROUP_FREEZING)
197 freeze_task(task, true);
198 spin_unlock_irq(&freezer->lock);
199}
200
201/*
202 * caller must hold freezer->lock
203 */
204static void update_freezer_state(struct cgroup *cgroup,
205 struct freezer *freezer)
206{
207 struct cgroup_iter it;
208 struct task_struct *task;
209 unsigned int nfrozen = 0, ntotal = 0;
210
211 cgroup_iter_start(cgroup, &it);
212 while ((task = cgroup_iter_next(cgroup, &it))) {
213 ntotal++;
214 if (is_task_frozen_enough(task))
215 nfrozen++;
216 }
217
218 /*
 219 * Transitioning to FROZEN only when no new tasks can be added
 220 * ensures that we are never in the FROZEN state while there are
 221 * unfrozen tasks.
222 */
223 if (nfrozen == ntotal)
224 freezer->state = CGROUP_FROZEN;
225 else if (nfrozen > 0)
226 freezer->state = CGROUP_FREEZING;
227 else
228 freezer->state = CGROUP_THAWED;
229 cgroup_iter_end(cgroup, &it);
230}
231
232static int freezer_read(struct cgroup *cgroup, struct cftype *cft,
233 struct seq_file *m)
234{
235 struct freezer *freezer;
236 enum freezer_state state;
237
238 if (!cgroup_lock_live_group(cgroup))
239 return -ENODEV;
240
241 freezer = cgroup_freezer(cgroup);
242 spin_lock_irq(&freezer->lock);
243 state = freezer->state;
244 if (state == CGROUP_FREEZING) {
245 /* We change from FREEZING to FROZEN lazily if the cgroup was
 246 * only partially frozen when we exited the write. */
247 update_freezer_state(cgroup, freezer);
248 state = freezer->state;
249 }
250 spin_unlock_irq(&freezer->lock);
251 cgroup_unlock();
252
253 seq_puts(m, freezer_state_strs[state]);
254 seq_putc(m, '\n');
255 return 0;
256}
257
258static int try_to_freeze_cgroup(struct cgroup *cgroup, struct freezer *freezer)
259{
260 struct cgroup_iter it;
261 struct task_struct *task;
262 unsigned int num_cant_freeze_now = 0;
263
264 freezer->state = CGROUP_FREEZING;
265 cgroup_iter_start(cgroup, &it);
266 while ((task = cgroup_iter_next(cgroup, &it))) {
267 if (!freeze_task(task, true))
268 continue;
269 if (is_task_frozen_enough(task))
270 continue;
271 if (!freezing(task) && !freezer_should_skip(task))
272 num_cant_freeze_now++;
273 }
274 cgroup_iter_end(cgroup, &it);
275
276 return num_cant_freeze_now ? -EBUSY : 0;
277}
278
279static int unfreeze_cgroup(struct cgroup *cgroup, struct freezer *freezer)
280{
281 struct cgroup_iter it;
282 struct task_struct *task;
283
284 cgroup_iter_start(cgroup, &it);
285 while ((task = cgroup_iter_next(cgroup, &it))) {
286 int do_wake;
287
288 task_lock(task);
289 do_wake = __thaw_process(task);
290 task_unlock(task);
291 if (do_wake)
292 wake_up_process(task);
293 }
294 cgroup_iter_end(cgroup, &it);
295 freezer->state = CGROUP_THAWED;
296
297 return 0;
298}
299
300static int freezer_change_state(struct cgroup *cgroup,
301 enum freezer_state goal_state)
302{
303 struct freezer *freezer;
304 int retval = 0;
305
306 freezer = cgroup_freezer(cgroup);
307 spin_lock_irq(&freezer->lock);
308 update_freezer_state(cgroup, freezer);
309 if (goal_state == freezer->state)
310 goto out;
311 switch (freezer->state) {
312 case CGROUP_THAWED:
313 retval = try_to_freeze_cgroup(cgroup, freezer);
314 break;
315 case CGROUP_FREEZING:
316 if (goal_state == CGROUP_FROZEN) {
317 /* Userspace is retrying after
318 * "/bin/echo FROZEN > freezer.state" returned -EBUSY */
319 retval = try_to_freeze_cgroup(cgroup, freezer);
320 break;
321 }
322 /* state == FREEZING and goal_state == THAWED, so unfreeze */
323 case CGROUP_FROZEN:
324 retval = unfreeze_cgroup(cgroup, freezer);
325 break;
326 default:
327 break;
328 }
329out:
330 spin_unlock_irq(&freezer->lock);
331
332 return retval;
333}
334
335static int freezer_write(struct cgroup *cgroup,
336 struct cftype *cft,
337 const char *buffer)
338{
339 int retval;
340 enum freezer_state goal_state;
341
342 if (strcmp(buffer, freezer_state_strs[CGROUP_THAWED]) == 0)
343 goal_state = CGROUP_THAWED;
344 else if (strcmp(buffer, freezer_state_strs[CGROUP_FROZEN]) == 0)
345 goal_state = CGROUP_FROZEN;
346 else
347 return -EIO;
348
349 if (!cgroup_lock_live_group(cgroup))
350 return -ENODEV;
351 retval = freezer_change_state(cgroup, goal_state);
352 cgroup_unlock();
353 return retval;
354}
355
356static struct cftype files[] = {
357 {
358 .name = "state",
359 .read_seq_string = freezer_read,
360 .write_string = freezer_write,
361 },
362};
363
364static int freezer_populate(struct cgroup_subsys *ss, struct cgroup *cgroup)
365{
366 return cgroup_add_files(cgroup, ss, files, ARRAY_SIZE(files));
367}
368
369struct cgroup_subsys freezer_subsys = {
370 .name = "freezer",
371 .create = freezer_create,
372 .destroy = freezer_destroy,
373 .populate = freezer_populate,
374 .subsys_id = freezer_subsys_id,
375 .can_attach = freezer_can_attach,
376 .attach = NULL,
377 .fork = freezer_fork,
378 .exit = NULL,
379};
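
For completeness, driving the new subsystem from user space is just a write to the per-group state file; strstrip() in cgroup_write_string() drops the trailing newline before freezer_write() compares the strings. A minimal client sketch, assuming the hierarchy is mounted with the freezer subsystem at /dev/cgroup and that a group named "mygroup" already exists (both paths are assumptions, not part of the patch):

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/dev/cgroup/mygroup/freezer.state", "w");

	if (!f) {
		perror("freezer.state");
		return 1;
	}
	fputs("FROZEN\n", f);		/* write "THAWED\n" to resume */
	return fclose(f) ? 1 : 0;	/* a failed write may report -EBUSY here */
}
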
diff --git a/kernel/compat.c b/kernel/compat.c
index 32c254a8ab9a..8eafe3eb50d9 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -23,9 +23,68 @@
23#include <linux/timex.h> 23#include <linux/timex.h>
24#include <linux/migrate.h> 24#include <linux/migrate.h>
25#include <linux/posix-timers.h> 25#include <linux/posix-timers.h>
26#include <linux/times.h>
26 27
27#include <asm/uaccess.h> 28#include <asm/uaccess.h>
28 29
30/*
31 * Note that the native side is already converted to a timespec, because
32 * that's what we want anyway.
33 */
34static int compat_get_timeval(struct timespec *o,
35 struct compat_timeval __user *i)
36{
37 long usec;
38
39 if (get_user(o->tv_sec, &i->tv_sec) ||
40 get_user(usec, &i->tv_usec))
41 return -EFAULT;
42 o->tv_nsec = usec * 1000;
43 return 0;
44}
45
46static int compat_put_timeval(struct compat_timeval __user *o,
47 struct timeval *i)
48{
49 return (put_user(i->tv_sec, &o->tv_sec) ||
50 put_user(i->tv_usec, &o->tv_usec)) ? -EFAULT : 0;
51}
52
53asmlinkage long compat_sys_gettimeofday(struct compat_timeval __user *tv,
54 struct timezone __user *tz)
55{
56 if (tv) {
57 struct timeval ktv;
58 do_gettimeofday(&ktv);
59 if (compat_put_timeval(tv, &ktv))
60 return -EFAULT;
61 }
62 if (tz) {
63 if (copy_to_user(tz, &sys_tz, sizeof(sys_tz)))
64 return -EFAULT;
65 }
66
67 return 0;
68}
69
70asmlinkage long compat_sys_settimeofday(struct compat_timeval __user *tv,
71 struct timezone __user *tz)
72{
73 struct timespec kts;
74 struct timezone ktz;
75
76 if (tv) {
77 if (compat_get_timeval(&kts, tv))
78 return -EFAULT;
79 }
80 if (tz) {
81 if (copy_from_user(&ktz, tz, sizeof(ktz)))
82 return -EFAULT;
83 }
84
85 return do_sys_settimeofday(tv ? &kts : NULL, tz ? &ktz : NULL);
86}
87
29int get_compat_timespec(struct timespec *ts, const struct compat_timespec __user *cts) 88int get_compat_timespec(struct timespec *ts, const struct compat_timespec __user *cts)
30{ 89{
31 return (!access_ok(VERIFY_READ, cts, sizeof(*cts)) || 90 return (!access_ok(VERIFY_READ, cts, sizeof(*cts)) ||
@@ -150,49 +209,23 @@ asmlinkage long compat_sys_setitimer(int which,
150 return 0; 209 return 0;
151} 210}
152 211
212static compat_clock_t clock_t_to_compat_clock_t(clock_t x)
213{
214 return compat_jiffies_to_clock_t(clock_t_to_jiffies(x));
215}
216
153asmlinkage long compat_sys_times(struct compat_tms __user *tbuf) 217asmlinkage long compat_sys_times(struct compat_tms __user *tbuf)
154{ 218{
155 /*
156 * In the SMP world we might just be unlucky and have one of
157 * the times increment as we use it. Since the value is an
158 * atomically safe type this is just fine. Conceptually its
159 * as if the syscall took an instant longer to occur.
160 */
161 if (tbuf) { 219 if (tbuf) {
220 struct tms tms;
162 struct compat_tms tmp; 221 struct compat_tms tmp;
163 struct task_struct *tsk = current; 222
164 struct task_struct *t; 223 do_sys_times(&tms);
165 cputime_t utime, stime, cutime, cstime; 224 /* Convert our struct tms to the compat version. */
166 225 tmp.tms_utime = clock_t_to_compat_clock_t(tms.tms_utime);
167 read_lock(&tasklist_lock); 226 tmp.tms_stime = clock_t_to_compat_clock_t(tms.tms_stime);
168 utime = tsk->signal->utime; 227 tmp.tms_cutime = clock_t_to_compat_clock_t(tms.tms_cutime);
169 stime = tsk->signal->stime; 228 tmp.tms_cstime = clock_t_to_compat_clock_t(tms.tms_cstime);
170 t = tsk;
171 do {
172 utime = cputime_add(utime, t->utime);
173 stime = cputime_add(stime, t->stime);
174 t = next_thread(t);
175 } while (t != tsk);
176
177 /*
178 * While we have tasklist_lock read-locked, no dying thread
179 * can be updating current->signal->[us]time. Instead,
180 * we got their counts included in the live thread loop.
181 * However, another thread can come in right now and
182 * do a wait call that updates current->signal->c[us]time.
183 * To make sure we always see that pair updated atomically,
184 * we take the siglock around fetching them.
185 */
186 spin_lock_irq(&tsk->sighand->siglock);
187 cutime = tsk->signal->cutime;
188 cstime = tsk->signal->cstime;
189 spin_unlock_irq(&tsk->sighand->siglock);
190 read_unlock(&tasklist_lock);
191
192 tmp.tms_utime = compat_jiffies_to_clock_t(cputime_to_jiffies(utime));
193 tmp.tms_stime = compat_jiffies_to_clock_t(cputime_to_jiffies(stime));
194 tmp.tms_cutime = compat_jiffies_to_clock_t(cputime_to_jiffies(cutime));
195 tmp.tms_cstime = compat_jiffies_to_clock_t(cputime_to_jiffies(cstime));
196 if (copy_to_user(tbuf, &tmp, sizeof(tmp))) 229 if (copy_to_user(tbuf, &tmp, sizeof(tmp)))
197 return -EFAULT; 230 return -EFAULT;
198 } 231 }
diff --git a/kernel/configs.c b/kernel/configs.c
index 4c345210ed8c..abaee684ecbf 100644
--- a/kernel/configs.c
+++ b/kernel/configs.c
@@ -54,9 +54,6 @@
54 54
55#ifdef CONFIG_IKCONFIG_PROC 55#ifdef CONFIG_IKCONFIG_PROC
56 56
57/**************************************************/
58/* globals and useful constants */
59
60static ssize_t 57static ssize_t
61ikconfig_read_current(struct file *file, char __user *buf, 58ikconfig_read_current(struct file *file, char __user *buf,
62 size_t len, loff_t * offset) 59 size_t len, loff_t * offset)
@@ -71,9 +68,6 @@ static const struct file_operations ikconfig_file_ops = {
71 .read = ikconfig_read_current, 68 .read = ikconfig_read_current,
72}; 69};
73 70
74/***************************************************/
75/* ikconfig_init: start up everything we need to */
76
77static int __init ikconfig_init(void) 71static int __init ikconfig_init(void)
78{ 72{
79 struct proc_dir_entry *entry; 73 struct proc_dir_entry *entry;
@@ -89,9 +83,6 @@ static int __init ikconfig_init(void)
89 return 0; 83 return 0;
90} 84}
91 85
92/***************************************************/
93/* ikconfig_cleanup: clean up our mess */
94
95static void __exit ikconfig_cleanup(void) 86static void __exit ikconfig_cleanup(void)
96{ 87{
97 remove_proc_entry("config.gz", NULL); 88 remove_proc_entry("config.gz", NULL);
diff --git a/kernel/cpu.c b/kernel/cpu.c
index cfb1d43ab801..86d49045daed 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -64,6 +64,8 @@ void __init cpu_hotplug_init(void)
64 cpu_hotplug.refcount = 0; 64 cpu_hotplug.refcount = 0;
65} 65}
66 66
67cpumask_t cpu_active_map;
68
67#ifdef CONFIG_HOTPLUG_CPU 69#ifdef CONFIG_HOTPLUG_CPU
68 70
69void get_online_cpus(void) 71void get_online_cpus(void)
@@ -197,13 +199,14 @@ static int __ref take_cpu_down(void *_param)
197 struct take_cpu_down_param *param = _param; 199 struct take_cpu_down_param *param = _param;
198 int err; 200 int err;
199 201
200 raw_notifier_call_chain(&cpu_chain, CPU_DYING | param->mod,
201 param->hcpu);
202 /* Ensure this CPU doesn't handle any more interrupts. */ 202 /* Ensure this CPU doesn't handle any more interrupts. */
203 err = __cpu_disable(); 203 err = __cpu_disable();
204 if (err < 0) 204 if (err < 0)
205 return err; 205 return err;
206 206
207 raw_notifier_call_chain(&cpu_chain, CPU_DYING | param->mod,
208 param->hcpu);
209
207 /* Force idle task to run as soon as we yield: it should 210 /* Force idle task to run as soon as we yield: it should
208 immediately notice cpu is offline and die quickly. */ 211 immediately notice cpu is offline and die quickly. */
209 sched_idle_next(); 212 sched_idle_next();
@@ -214,7 +217,6 @@ static int __ref take_cpu_down(void *_param)
214static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) 217static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
215{ 218{
216 int err, nr_calls = 0; 219 int err, nr_calls = 0;
217 struct task_struct *p;
218 cpumask_t old_allowed, tmp; 220 cpumask_t old_allowed, tmp;
219 void *hcpu = (void *)(long)cpu; 221 void *hcpu = (void *)(long)cpu;
220 unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0; 222 unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0;
@@ -247,21 +249,18 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
247 cpus_setall(tmp); 249 cpus_setall(tmp);
248 cpu_clear(cpu, tmp); 250 cpu_clear(cpu, tmp);
249 set_cpus_allowed_ptr(current, &tmp); 251 set_cpus_allowed_ptr(current, &tmp);
252 tmp = cpumask_of_cpu(cpu);
250 253
251 p = __stop_machine_run(take_cpu_down, &tcd_param, cpu); 254 err = __stop_machine(take_cpu_down, &tcd_param, &tmp);
252 255 if (err) {
253 if (IS_ERR(p) || cpu_online(cpu)) {
254 /* CPU didn't die: tell everyone. Can't complain. */ 256 /* CPU didn't die: tell everyone. Can't complain. */
255 if (raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod, 257 if (raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod,
256 hcpu) == NOTIFY_BAD) 258 hcpu) == NOTIFY_BAD)
257 BUG(); 259 BUG();
258 260
259 if (IS_ERR(p)) { 261 goto out_allowed;
260 err = PTR_ERR(p);
261 goto out_allowed;
262 }
263 goto out_thread;
264 } 262 }
263 BUG_ON(cpu_online(cpu));
265 264
266 /* Wait for it to sleep (leaving idle task). */ 265 /* Wait for it to sleep (leaving idle task). */
267 while (!idle_cpu(cpu)) 266 while (!idle_cpu(cpu))
@@ -277,12 +276,15 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
277 276
278 check_for_tasks(cpu); 277 check_for_tasks(cpu);
279 278
280out_thread:
281 err = kthread_stop(p);
282out_allowed: 279out_allowed:
283 set_cpus_allowed_ptr(current, &old_allowed); 280 set_cpus_allowed_ptr(current, &old_allowed);
284out_release: 281out_release:
285 cpu_hotplug_done(); 282 cpu_hotplug_done();
283 if (!err) {
284 if (raw_notifier_call_chain(&cpu_chain, CPU_POST_DEAD | mod,
285 hcpu) == NOTIFY_BAD)
286 BUG();
287 }
286 return err; 288 return err;
287} 289}
288 290
@@ -291,11 +293,30 @@ int __ref cpu_down(unsigned int cpu)
291 int err = 0; 293 int err = 0;
292 294
293 cpu_maps_update_begin(); 295 cpu_maps_update_begin();
294 if (cpu_hotplug_disabled) 296
297 if (cpu_hotplug_disabled) {
295 err = -EBUSY; 298 err = -EBUSY;
296 else 299 goto out;
297 err = _cpu_down(cpu, 0); 300 }
298 301
302 cpu_clear(cpu, cpu_active_map);
303
304 /*
 305 * Make sure all the CPUs did the reschedule and are not
 306 * using a stale version of the cpu_active_map.
 307 * This is not strictly necessary because the stop_machine()
 308 * that we run down the line already provides the required
 309 * synchronization. But it's really a side effect and we do not
 310 * want to depend on the innards of stop_machine() here.
311 */
312 synchronize_sched();
313
314 err = _cpu_down(cpu, 0);
315
316 if (cpu_online(cpu))
317 cpu_set(cpu, cpu_active_map);
318
319out:
299 cpu_maps_update_done(); 320 cpu_maps_update_done();
300 return err; 321 return err;
301} 322}
@@ -329,6 +350,8 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
329 goto out_notify; 350 goto out_notify;
330 BUG_ON(!cpu_online(cpu)); 351 BUG_ON(!cpu_online(cpu));
331 352
353 cpu_set(cpu, cpu_active_map);
354
332 /* Now call notifier in preparation. */ 355 /* Now call notifier in preparation. */
333 raw_notifier_call_chain(&cpu_chain, CPU_ONLINE | mod, hcpu); 356 raw_notifier_call_chain(&cpu_chain, CPU_ONLINE | mod, hcpu);
334 357
@@ -347,7 +370,7 @@ int __cpuinit cpu_up(unsigned int cpu)
347 if (!cpu_isset(cpu, cpu_possible_map)) { 370 if (!cpu_isset(cpu, cpu_possible_map)) {
348 printk(KERN_ERR "can't online cpu %d because it is not " 371 printk(KERN_ERR "can't online cpu %d because it is not "
349 "configured as may-hotadd at boot time\n", cpu); 372 "configured as may-hotadd at boot time\n", cpu);
350#if defined(CONFIG_IA64) || defined(CONFIG_X86_64) || defined(CONFIG_S390) 373#if defined(CONFIG_IA64) || defined(CONFIG_X86_64)
351 printk(KERN_ERR "please check additional_cpus= boot " 374 printk(KERN_ERR "please check additional_cpus= boot "
352 "parameter\n"); 375 "parameter\n");
353#endif 376#endif
@@ -355,11 +378,15 @@ int __cpuinit cpu_up(unsigned int cpu)
355 } 378 }
356 379
357 cpu_maps_update_begin(); 380 cpu_maps_update_begin();
358 if (cpu_hotplug_disabled) 381
382 if (cpu_hotplug_disabled) {
359 err = -EBUSY; 383 err = -EBUSY;
360 else 384 goto out;
361 err = _cpu_up(cpu, 0); 385 }
362 386
387 err = _cpu_up(cpu, 0);
388
389out:
363 cpu_maps_update_done(); 390 cpu_maps_update_done();
364 return err; 391 return err;
365} 392}
@@ -413,7 +440,7 @@ void __ref enable_nonboot_cpus(void)
413 goto out; 440 goto out;
414 441
415 printk("Enabling non-boot CPUs ...\n"); 442 printk("Enabling non-boot CPUs ...\n");
416 for_each_cpu_mask(cpu, frozen_cpus) { 443 for_each_cpu_mask_nr(cpu, frozen_cpus) {
417 error = _cpu_up(cpu, 1); 444 error = _cpu_up(cpu, 1);
418 if (!error) { 445 if (!error) {
419 printk("CPU%d is up\n", cpu); 446 printk("CPU%d is up\n", cpu);
@@ -427,4 +454,48 @@ out:
427} 454}
428#endif /* CONFIG_PM_SLEEP_SMP */ 455#endif /* CONFIG_PM_SLEEP_SMP */
429 456
457/**
458 * notify_cpu_starting(cpu) - call the CPU_STARTING notifiers
459 * @cpu: cpu that just started
460 *
461 * This function calls the cpu_chain notifiers with CPU_STARTING.
462 * It must be called by the arch code on the new cpu, before the new cpu
463 * enables interrupts and before the "boot" cpu returns from __cpu_up().
464 */
465void notify_cpu_starting(unsigned int cpu)
466{
467 unsigned long val = CPU_STARTING;
468
469#ifdef CONFIG_PM_SLEEP_SMP
470 if (cpu_isset(cpu, frozen_cpus))
471 val = CPU_STARTING_FROZEN;
472#endif /* CONFIG_PM_SLEEP_SMP */
473 raw_notifier_call_chain(&cpu_chain, val, (void *)(long)cpu);
474}
475
430#endif /* CONFIG_SMP */ 476#endif /* CONFIG_SMP */
477
478/*
479 * cpu_bit_bitmap[] is a special, "compressed" data structure that
 480 * represents all NR_CPUS-bit binary values of the form 1<<nr.
481 *
482 * It is used by cpumask_of_cpu() to get a constant address to a CPU
483 * mask value that has a single bit set only.
484 */
485
486/* cpu_bit_bitmap[0] is empty - so we can back into it */
487#define MASK_DECLARE_1(x) [x+1][0] = 1UL << (x)
488#define MASK_DECLARE_2(x) MASK_DECLARE_1(x), MASK_DECLARE_1(x+1)
489#define MASK_DECLARE_4(x) MASK_DECLARE_2(x), MASK_DECLARE_2(x+2)
490#define MASK_DECLARE_8(x) MASK_DECLARE_4(x), MASK_DECLARE_4(x+4)
491
492const unsigned long cpu_bit_bitmap[BITS_PER_LONG+1][BITS_TO_LONGS(NR_CPUS)] = {
493
494 MASK_DECLARE_8(0), MASK_DECLARE_8(8),
495 MASK_DECLARE_8(16), MASK_DECLARE_8(24),
496#if BITS_PER_LONG > 32
497 MASK_DECLARE_8(32), MASK_DECLARE_8(40),
498 MASK_DECLARE_8(48), MASK_DECLARE_8(56),
499#endif
500};
501EXPORT_SYMBOL_GPL(cpu_bit_bitmap);
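The comment above is terse, so here is a user-space sketch of the same trick with an assumed NR_CPUS of 128: row x+1 of the table holds the single-bit value 1UL << x in its first word, row 0 is all zero, and the mask for a given cpu is obtained by stepping backwards from the row selected by cpu % BITS_PER_LONG. Everything below (the helper name mask_of_cpu(), the sizes) is illustrative, not the kernel API.

#include <assert.h>
#include <limits.h>
#include <stdio.h>

#if ULONG_MAX > 0xffffffffUL
#define BITS_PER_LONG	64
#else
#define BITS_PER_LONG	32
#endif

#define NR_CPUS		128
#define LONGS_PER_MASK	((NR_CPUS + BITS_PER_LONG - 1) / BITS_PER_LONG)

#define MASK_DECLARE_1(x) [x + 1][0] = 1UL << (x)
#define MASK_DECLARE_2(x) MASK_DECLARE_1(x), MASK_DECLARE_1(x + 1)
#define MASK_DECLARE_4(x) MASK_DECLARE_2(x), MASK_DECLARE_2(x + 2)
#define MASK_DECLARE_8(x) MASK_DECLARE_4(x), MASK_DECLARE_4(x + 4)

/* Row x+1 has only bit x of word 0 set; row 0 is left entirely zero. */
static const unsigned long bit_bitmap[BITS_PER_LONG + 1][LONGS_PER_MASK] = {
	MASK_DECLARE_8(0),  MASK_DECLARE_8(8),
	MASK_DECLARE_8(16), MASK_DECLARE_8(24),
#if BITS_PER_LONG > 32
	MASK_DECLARE_8(32), MASK_DECLARE_8(40),
	MASK_DECLARE_8(48), MASK_DECLARE_8(56),
#endif
};

/*
 * Step back from the row picked by cpu % BITS_PER_LONG so that the single
 * set bit lands in word cpu / BITS_PER_LONG of the returned mask.  The
 * words walked over on the way belong to earlier rows and are all zero;
 * this relies on the rows being contiguous, exactly like the kernel trick.
 */
static const unsigned long *mask_of_cpu(unsigned int cpu)
{
	return bit_bitmap[1 + cpu % BITS_PER_LONG] - cpu / BITS_PER_LONG;
}

int main(void)
{
	unsigned int cpu, i;

	for (cpu = 0; cpu < NR_CPUS; cpu++) {
		const unsigned long *m = mask_of_cpu(cpu);

		for (i = 0; i < LONGS_PER_MASK; i++) {
			unsigned long want = (i == cpu / BITS_PER_LONG) ?
					     1UL << (cpu % BITS_PER_LONG) : 0;
			assert(m[i] == want);
		}
	}
	printf("all %d single-cpu masks check out\n", NR_CPUS);
	return 0;
}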
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 459d601947a8..3e00526f52ec 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -14,6 +14,8 @@
14 * 2003-10-22 Updates by Stephen Hemminger. 14 * 2003-10-22 Updates by Stephen Hemminger.
15 * 2004 May-July Rework by Paul Jackson. 15 * 2004 May-July Rework by Paul Jackson.
16 * 2006 Rework by Paul Menage to use generic cgroups 16 * 2006 Rework by Paul Menage to use generic cgroups
17 * 2008 Rework of the scheduler domains and CPU hotplug handling
18 * by Max Krasnyansky
17 * 19 *
18 * This file is subject to the terms and conditions of the GNU General Public 20 * This file is subject to the terms and conditions of the GNU General Public
19 * License. See the file COPYING in the main directory of the Linux 21 * License. See the file COPYING in the main directory of the Linux
@@ -54,7 +56,6 @@
54#include <asm/uaccess.h> 56#include <asm/uaccess.h>
55#include <asm/atomic.h> 57#include <asm/atomic.h>
56#include <linux/mutex.h> 58#include <linux/mutex.h>
57#include <linux/kfifo.h>
58#include <linux/workqueue.h> 59#include <linux/workqueue.h>
59#include <linux/cgroup.h> 60#include <linux/cgroup.h>
60 61
@@ -227,10 +228,6 @@ static struct cpuset top_cpuset = {
227 * The task_struct fields mems_allowed and mems_generation may only 228 * The task_struct fields mems_allowed and mems_generation may only
228 * be accessed in the context of that task, so require no locks. 229 * be accessed in the context of that task, so require no locks.
229 * 230 *
230 * The cpuset_common_file_write handler for operations that modify
231 * the cpuset hierarchy holds cgroup_mutex across the entire operation,
232 * single threading all such cpuset modifications across the system.
233 *
234 * The cpuset_common_file_read() handlers only hold callback_mutex across 231 * The cpuset_common_file_read() handlers only hold callback_mutex across
235 * small pieces of code, such as when reading out possibly multi-word 232 * small pieces of code, such as when reading out possibly multi-word
236 * cpumasks and nodemasks. 233 * cpumasks and nodemasks.
@@ -241,9 +238,11 @@ static struct cpuset top_cpuset = {
241 238
242static DEFINE_MUTEX(callback_mutex); 239static DEFINE_MUTEX(callback_mutex);
243 240
244/* This is ugly, but preserves the userspace API for existing cpuset 241/*
242 * This is ugly, but preserves the userspace API for existing cpuset
245 * users. If someone tries to mount the "cpuset" filesystem, we 243 * users. If someone tries to mount the "cpuset" filesystem, we
246 * silently switch it to mount "cgroup" instead */ 244 * silently switch it to mount "cgroup" instead
245 */
247static int cpuset_get_sb(struct file_system_type *fs_type, 246static int cpuset_get_sb(struct file_system_type *fs_type,
248 int flags, const char *unused_dev_name, 247 int flags, const char *unused_dev_name,
249 void *data, struct vfsmount *mnt) 248 void *data, struct vfsmount *mnt)
@@ -369,7 +368,7 @@ void cpuset_update_task_memory_state(void)
369 my_cpusets_mem_gen = top_cpuset.mems_generation; 368 my_cpusets_mem_gen = top_cpuset.mems_generation;
370 } else { 369 } else {
371 rcu_read_lock(); 370 rcu_read_lock();
372 my_cpusets_mem_gen = task_cs(current)->mems_generation; 371 my_cpusets_mem_gen = task_cs(tsk)->mems_generation;
373 rcu_read_unlock(); 372 rcu_read_unlock();
374 } 373 }
375 374
@@ -478,10 +477,9 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
478} 477}
479 478
480/* 479/*
481 * Helper routine for rebuild_sched_domains(). 480 * Helper routine for generate_sched_domains().
482 * Do cpusets a, b have overlapping cpus_allowed masks? 481 * Do cpusets a, b have overlapping cpus_allowed masks?
483 */ 482 */
484
485static int cpusets_overlap(struct cpuset *a, struct cpuset *b) 483static int cpusets_overlap(struct cpuset *a, struct cpuset *b)
486{ 484{
487 return cpus_intersects(a->cpus_allowed, b->cpus_allowed); 485 return cpus_intersects(a->cpus_allowed, b->cpus_allowed);
@@ -490,29 +488,48 @@ static int cpusets_overlap(struct cpuset *a, struct cpuset *b)
490static void 488static void
491update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c) 489update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
492{ 490{
493 if (!dattr)
494 return;
495 if (dattr->relax_domain_level < c->relax_domain_level) 491 if (dattr->relax_domain_level < c->relax_domain_level)
496 dattr->relax_domain_level = c->relax_domain_level; 492 dattr->relax_domain_level = c->relax_domain_level;
497 return; 493 return;
498} 494}
499 495
496static void
497update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
498{
499 LIST_HEAD(q);
500
501 list_add(&c->stack_list, &q);
502 while (!list_empty(&q)) {
503 struct cpuset *cp;
504 struct cgroup *cont;
505 struct cpuset *child;
506
507 cp = list_first_entry(&q, struct cpuset, stack_list);
508 list_del(q.next);
509
510 if (cpus_empty(cp->cpus_allowed))
511 continue;
512
513 if (is_sched_load_balance(cp))
514 update_domain_attr(dattr, cp);
515
516 list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
517 child = cgroup_cs(cont);
518 list_add_tail(&child->stack_list, &q);
519 }
520 }
521}
522
500/* 523/*
501 * rebuild_sched_domains() 524 * generate_sched_domains()
502 * 525 *
503 * If the flag 'sched_load_balance' of any cpuset with non-empty 526 * This function builds a partial partition of the system's CPUs.
504 * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset 527 * A 'partial partition' is a set of non-overlapping subsets whose
505 * which has that flag enabled, or if any cpuset with a non-empty 528 * union is a subset of that set.
506 * 'cpus' is removed, then call this routine to rebuild the 529 * The output of this function needs to be passed to kernel/sched.c
507 * scheduler's dynamic sched domains. 530 * partition_sched_domains() routine, which will rebuild the scheduler's
508 * 531 * load balancing domains (sched domains) as specified by that partial
509 * This routine builds a partial partition of the systems CPUs 532 * partition.
510 * (the set of non-overlappping cpumask_t's in the array 'part'
511 * below), and passes that partial partition to the kernel/sched.c
512 * partition_sched_domains() routine, which will rebuild the
513 * schedulers load balancing domains (sched domains) as specified
514 * by that partial partition. A 'partial partition' is a set of
515 * non-overlapping subsets whose union is a subset of that set.
516 * 533 *
517 * See "What is sched_load_balance" in Documentation/cpusets.txt 534 * See "What is sched_load_balance" in Documentation/cpusets.txt
518 * for a background explanation of this. 535 * for a background explanation of this.
@@ -522,16 +539,10 @@ update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
522 * domains when operating in the severe memory shortage situations 539 * domains when operating in the severe memory shortage situations
523 * that could cause allocation failures below. 540 * that could cause allocation failures below.
524 * 541 *
525 * Call with cgroup_mutex held. May take callback_mutex during 542 * Must be called with cgroup_lock held.
526 * call due to the kfifo_alloc() and kmalloc() calls. May nest
527 * a call to the get_online_cpus()/put_online_cpus() pair.
528 * Must not be called holding callback_mutex, because we must not
529 * call get_online_cpus() while holding callback_mutex. Elsewhere
530 * the kernel nests callback_mutex inside get_online_cpus() calls.
531 * So the reverse nesting would risk an ABBA deadlock.
532 * 543 *
533 * The three key local variables below are: 544 * The three key local variables below are:
534 * q - a kfifo queue of cpuset pointers, used to implement a 545 * q - a linked-list queue of cpuset pointers, used to implement a
535 * top-down scan of all cpusets. This scan loads a pointer 546 * top-down scan of all cpusets. This scan loads a pointer
536 * to each cpuset marked is_sched_load_balance into the 547 * to each cpuset marked is_sched_load_balance into the
537 * array 'csa'. For our purposes, rebuilding the schedulers 548 * array 'csa'. For our purposes, rebuilding the schedulers
@@ -563,10 +574,10 @@ update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
563 * element of the partition (one sched domain) to be passed to 574 * element of the partition (one sched domain) to be passed to
564 * partition_sched_domains(). 575 * partition_sched_domains().
565 */ 576 */
566 577static int generate_sched_domains(cpumask_t **domains,
567static void rebuild_sched_domains(void) 578 struct sched_domain_attr **attributes)
568{ 579{
569 struct kfifo *q; /* queue of cpusets to be scanned */ 580 LIST_HEAD(q); /* queue of cpusets to be scanned */
570 struct cpuset *cp; /* scans q */ 581 struct cpuset *cp; /* scans q */
571 struct cpuset **csa; /* array of all cpuset ptrs */ 582 struct cpuset **csa; /* array of all cpuset ptrs */
572 int csn; /* how many cpuset ptrs in csa so far */ 583 int csn; /* how many cpuset ptrs in csa so far */
@@ -576,44 +587,58 @@ static void rebuild_sched_domains(void)
576 int ndoms; /* number of sched domains in result */ 587 int ndoms; /* number of sched domains in result */
577 int nslot; /* next empty doms[] cpumask_t slot */ 588 int nslot; /* next empty doms[] cpumask_t slot */
578 589
579 q = NULL; 590 ndoms = 0;
580 csa = NULL;
581 doms = NULL; 591 doms = NULL;
582 dattr = NULL; 592 dattr = NULL;
593 csa = NULL;
583 594
584 /* Special case for the 99% of systems with one, full, sched domain */ 595 /* Special case for the 99% of systems with one, full, sched domain */
585 if (is_sched_load_balance(&top_cpuset)) { 596 if (is_sched_load_balance(&top_cpuset)) {
586 ndoms = 1;
587 doms = kmalloc(sizeof(cpumask_t), GFP_KERNEL); 597 doms = kmalloc(sizeof(cpumask_t), GFP_KERNEL);
588 if (!doms) 598 if (!doms)
589 goto rebuild; 599 goto done;
600
590 dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL); 601 dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL);
591 if (dattr) { 602 if (dattr) {
592 *dattr = SD_ATTR_INIT; 603 *dattr = SD_ATTR_INIT;
593 update_domain_attr(dattr, &top_cpuset); 604 update_domain_attr_tree(dattr, &top_cpuset);
594 } 605 }
595 *doms = top_cpuset.cpus_allowed; 606 *doms = top_cpuset.cpus_allowed;
596 goto rebuild;
597 }
598 607
599 q = kfifo_alloc(number_of_cpusets * sizeof(cp), GFP_KERNEL, NULL); 608 ndoms = 1;
600 if (IS_ERR(q))
601 goto done; 609 goto done;
610 }
611
602 csa = kmalloc(number_of_cpusets * sizeof(cp), GFP_KERNEL); 612 csa = kmalloc(number_of_cpusets * sizeof(cp), GFP_KERNEL);
603 if (!csa) 613 if (!csa)
604 goto done; 614 goto done;
605 csn = 0; 615 csn = 0;
606 616
607 cp = &top_cpuset; 617 list_add(&top_cpuset.stack_list, &q);
608 __kfifo_put(q, (void *)&cp, sizeof(cp)); 618 while (!list_empty(&q)) {
609 while (__kfifo_get(q, (void *)&cp, sizeof(cp))) {
610 struct cgroup *cont; 619 struct cgroup *cont;
611 struct cpuset *child; /* scans child cpusets of cp */ 620 struct cpuset *child; /* scans child cpusets of cp */
612 if (is_sched_load_balance(cp)) 621
622 cp = list_first_entry(&q, struct cpuset, stack_list);
623 list_del(q.next);
624
625 if (cpus_empty(cp->cpus_allowed))
626 continue;
627
628 /*
629 * All child cpusets contain a subset of the parent's cpus, so
630 * just skip them, and then we call update_domain_attr_tree()
631 * to calc relax_domain_level of the corresponding sched
632 * domain.
633 */
634 if (is_sched_load_balance(cp)) {
613 csa[csn++] = cp; 635 csa[csn++] = cp;
636 continue;
637 }
638
614 list_for_each_entry(cont, &cp->css.cgroup->children, sibling) { 639 list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
615 child = cgroup_cs(cont); 640 child = cgroup_cs(cont);
616 __kfifo_put(q, (void *)&child, sizeof(cp)); 641 list_add_tail(&child->stack_list, &q);
617 } 642 }
618 } 643 }
619 644
@@ -644,91 +669,141 @@ restart:
644 } 669 }
645 } 670 }
646 671
647 /* Convert <csn, csa> to <ndoms, doms> */ 672 /*
673 * Now we know how many domains to create.
674 * Convert <csn, csa> to <ndoms, doms> and populate cpu masks.
675 */
648 doms = kmalloc(ndoms * sizeof(cpumask_t), GFP_KERNEL); 676 doms = kmalloc(ndoms * sizeof(cpumask_t), GFP_KERNEL);
649 if (!doms) 677 if (!doms) {
650 goto rebuild; 678 ndoms = 0;
679 goto done;
680 }
681
682 /*
683 * The rest of the code, including the scheduler, can deal with
684 * dattr==NULL case. No need to abort if alloc fails.
685 */
651 dattr = kmalloc(ndoms * sizeof(struct sched_domain_attr), GFP_KERNEL); 686 dattr = kmalloc(ndoms * sizeof(struct sched_domain_attr), GFP_KERNEL);
652 687
653 for (nslot = 0, i = 0; i < csn; i++) { 688 for (nslot = 0, i = 0; i < csn; i++) {
654 struct cpuset *a = csa[i]; 689 struct cpuset *a = csa[i];
690 cpumask_t *dp;
655 int apn = a->pn; 691 int apn = a->pn;
656 692
657 if (apn >= 0) { 693 if (apn < 0) {
658 cpumask_t *dp = doms + nslot; 694 /* Skip completed partitions */
659 695 continue;
660 if (nslot == ndoms) { 696 }
661 static int warnings = 10; 697
662 if (warnings) { 698 dp = doms + nslot;
663 printk(KERN_WARNING 699
664 "rebuild_sched_domains confused:" 700 if (nslot == ndoms) {
665 " nslot %d, ndoms %d, csn %d, i %d," 701 static int warnings = 10;
666 " apn %d\n", 702 if (warnings) {
667 nslot, ndoms, csn, i, apn); 703 printk(KERN_WARNING
668 warnings--; 704 "rebuild_sched_domains confused:"
669 } 705 " nslot %d, ndoms %d, csn %d, i %d,"
670 continue; 706 " apn %d\n",
707 nslot, ndoms, csn, i, apn);
708 warnings--;
671 } 709 }
710 continue;
711 }
672 712
673 cpus_clear(*dp); 713 cpus_clear(*dp);
674 if (dattr) 714 if (dattr)
675 *(dattr + nslot) = SD_ATTR_INIT; 715 *(dattr + nslot) = SD_ATTR_INIT;
676 for (j = i; j < csn; j++) { 716 for (j = i; j < csn; j++) {
677 struct cpuset *b = csa[j]; 717 struct cpuset *b = csa[j];
678 718
679 if (apn == b->pn) { 719 if (apn == b->pn) {
680 cpus_or(*dp, *dp, b->cpus_allowed); 720 cpus_or(*dp, *dp, b->cpus_allowed);
681 b->pn = -1; 721 if (dattr)
682 update_domain_attr(dattr, b); 722 update_domain_attr_tree(dattr + nslot, b);
683 } 723
724 /* Done with this partition */
725 b->pn = -1;
684 } 726 }
685 nslot++;
686 } 727 }
728 nslot++;
687 } 729 }
688 BUG_ON(nslot != ndoms); 730 BUG_ON(nslot != ndoms);
689 731
690rebuild:
691 /* Have scheduler rebuild sched domains */
692 get_online_cpus();
693 partition_sched_domains(ndoms, doms, dattr);
694 put_online_cpus();
695
696done: 732done:
697 if (q && !IS_ERR(q))
698 kfifo_free(q);
699 kfree(csa); 733 kfree(csa);
700 /* Don't kfree(doms) -- partition_sched_domains() does that. */ 734
701 /* Don't kfree(dattr) -- partition_sched_domains() does that. */ 735 *domains = doms;
736 *attributes = dattr;
737 return ndoms;
702} 738}
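The conversion loop above is easier to follow with concrete numbers. Below is a user-space sketch in which plain unsigned ints stand in for cpumask_t and the partition numbers (pn) are assumed to have been computed already; it folds cpusets that share a partition number into one domain mask, which is exactly the <csn, csa> to <ndoms, doms> step.

#include <stdio.h>

struct cs { unsigned int cpus; int pn; };

int main(void)
{
	/* three cpusets, two partitions: 0 and 1 */
	struct cs csa[] = {
		{ 0x03, 0 },	/* cpus 0-1 */
		{ 0x0c, 1 },	/* cpus 2-3 */
		{ 0x30, 0 },	/* cpus 4-5, same partition as the first */
	};
	int csn = 3, ndoms = 2;
	unsigned int doms[2];
	int nslot = 0, i, j;

	for (i = 0; i < csn; i++) {
		int apn = csa[i].pn;

		if (apn < 0)
			continue;	/* already folded into a domain */

		doms[nslot] = 0;
		for (j = i; j < csn; j++) {
			if (csa[j].pn == apn) {
				doms[nslot] |= csa[j].cpus;
				csa[j].pn = -1;	/* done with this one */
			}
		}
		nslot++;
	}

	for (i = 0; i < nslot; i++)
		printf("sched domain %d: cpu mask 0x%02x\n", i, doms[i]);
	/* expected: domain 0 -> 0x33, domain 1 -> 0x0c */
	return (nslot == ndoms) ? 0 : 1;
}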
703 739
704static inline int started_after_time(struct task_struct *t1, 740/*
705 struct timespec *time, 741 * Rebuild scheduler domains.
706 struct task_struct *t2) 742 *
743 * Call with neither cgroup_mutex held nor within get_online_cpus().
744 * Takes both cgroup_mutex and get_online_cpus().
745 *
746 * Cannot be directly called from cpuset code handling changes
747 * to the cpuset pseudo-filesystem, because it cannot be called
748 * from code that already holds cgroup_mutex.
749 */
750static void do_rebuild_sched_domains(struct work_struct *unused)
707{ 751{
708 int start_diff = timespec_compare(&t1->start_time, time); 752 struct sched_domain_attr *attr;
709 if (start_diff > 0) { 753 cpumask_t *doms;
710 return 1; 754 int ndoms;
711 } else if (start_diff < 0) { 755
712 return 0; 756 get_online_cpus();
713 } else { 757
714 /* 758 /* Generate domain masks and attrs */
715 * Arbitrarily, if two processes started at the same 759 cgroup_lock();
716 * time, we'll say that the lower pointer value 760 ndoms = generate_sched_domains(&doms, &attr);
717 * started first. Note that t2 may have exited by now 761 cgroup_unlock();
718 * so this may not be a valid pointer any longer, but 762
719 * that's fine - it still serves to distinguish 763 /* Have scheduler rebuild the domains */
720 * between two tasks started (effectively) 764 partition_sched_domains(ndoms, doms, attr);
721 * simultaneously. 765
722 */ 766 put_online_cpus();
723 return t1 > t2; 767}
724 } 768
769static DECLARE_WORK(rebuild_sched_domains_work, do_rebuild_sched_domains);
770
771/*
772 * Rebuild scheduler domains, asynchronously via workqueue.
773 *
774 * If the flag 'sched_load_balance' of any cpuset with non-empty
775 * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset
776 * which has that flag enabled, or if any cpuset with a non-empty
777 * 'cpus' is removed, then call this routine to rebuild the
778 * scheduler's dynamic sched domains.
779 *
780 * The rebuild_sched_domains() and partition_sched_domains()
781 * routines must nest cgroup_lock() inside get_online_cpus(),
782 * but such cpuset changes as these must nest that locking the
783 * other way, holding cgroup_lock() for much of the code.
784 *
785 * So in order to avoid an ABBA deadlock, the cpuset code handling
786 * these user changes delegates the actual sched domain rebuilding
787 * to a separate workqueue thread, which ends up processing the
788 * above do_rebuild_sched_domains() function.
789 */
790static void async_rebuild_sched_domains(void)
791{
792 schedule_work(&rebuild_sched_domains_work);
725} 793}
726 794
727static inline int started_after(void *p1, void *p2) 795/*
796 * Accomplishes the same scheduler domain rebuild as the above
797 * async_rebuild_sched_domains(), however it directly calls the
798 * rebuild routine synchronously rather than calling it via an
799 * asynchronous work thread.
800 *
801 * This can only be called from code that is not holding
802 * cgroup_mutex (not nested in a cgroup_lock() call.)
803 */
804void rebuild_sched_domains(void)
728{ 805{
729 struct task_struct *t1 = p1; 806 do_rebuild_sched_domains(NULL);
730 struct task_struct *t2 = p2;
731 return started_after_time(t1, &t2->start_time, t2);
732} 807}
733 808
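The comment block above explains why the rebuild is handed to a workqueue: the canonical order is get_online_cpus() then cgroup_lock(), while the write handlers already hold cgroup_lock(). The pthread sketch below shows the same deferral pattern outside the kernel; the lock names and the single-flag work queue are analogies, not kernel interfaces.

/*
 * Deferral sketch: the canonical lock order is hotplug_lock -> cgroup_lock,
 * but the "write handler" runs with cgroup_lock held, so it must not take
 * hotplug_lock itself.  It just kicks a worker, which takes both locks in
 * the right order.  Build with -lpthread.
 */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t hotplug_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t cgroup_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t work_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t work_cond = PTHREAD_COND_INITIALIZER;
static int work_pending, stop;

static void rebuild_domains(void)	/* needs hotplug -> cgroup order */
{
	pthread_mutex_lock(&hotplug_lock);
	pthread_mutex_lock(&cgroup_lock);
	printf("rebuilding sched domains\n");
	pthread_mutex_unlock(&cgroup_lock);
	pthread_mutex_unlock(&hotplug_lock);
}

static void *worker(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&work_lock);
	while (!stop) {
		while (!work_pending && !stop)
			pthread_cond_wait(&work_cond, &work_lock);
		if (work_pending) {
			work_pending = 0;
			pthread_mutex_unlock(&work_lock);
			rebuild_domains();
			pthread_mutex_lock(&work_lock);
		}
	}
	pthread_mutex_unlock(&work_lock);
	return NULL;
}

static void async_rebuild(void)		/* safe while cgroup_lock is held */
{
	pthread_mutex_lock(&work_lock);
	work_pending = 1;
	pthread_cond_signal(&work_cond);
	pthread_mutex_unlock(&work_lock);
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, worker, NULL);

	/* A "write to the cpus file": cgroup_lock is held, so defer. */
	pthread_mutex_lock(&cgroup_lock);
	async_rebuild();
	pthread_mutex_unlock(&cgroup_lock);

	pthread_mutex_lock(&work_lock);
	stop = 1;
	pthread_cond_signal(&work_cond);
	pthread_mutex_unlock(&work_lock);
	pthread_join(t, NULL);
	return 0;
}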
734/** 809/**
@@ -766,15 +841,38 @@ static void cpuset_change_cpumask(struct task_struct *tsk,
766} 841}
767 842
768/** 843/**
844 * update_tasks_cpumask - Update the cpumasks of tasks in the cpuset.
845 * @cs: the cpuset in which each task's cpus_allowed mask needs to be changed
846 * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks()
847 *
848 * Called with cgroup_mutex held
849 *
850 * The cgroup_scan_tasks() function will scan all the tasks in a cgroup,
851 * calling callback functions for each.
852 *
853 * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0
854 * if @heap != NULL.
855 */
856static void update_tasks_cpumask(struct cpuset *cs, struct ptr_heap *heap)
857{
858 struct cgroup_scanner scan;
859
860 scan.cg = cs->css.cgroup;
861 scan.test_task = cpuset_test_cpumask;
862 scan.process_task = cpuset_change_cpumask;
863 scan.heap = heap;
864 cgroup_scan_tasks(&scan);
865}
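update_tasks_cpumask() is just a thin wrapper that fills in a cgroup_scanner with a predicate and an action and lets cgroup_scan_tasks() do the iteration. A rough user-space analogue of that callback pattern, with invented names:

#include <stdio.h>

struct scanner {
	int  (*test)(int item);		/* like .test_task    */
	void (*process)(int item);	/* like .process_task */
};

static void scan(const struct scanner *s, const int *items, int n)
{
	int i;

	for (i = 0; i < n; i++)
		if (!s->test || s->test(items[i]))
			s->process(items[i]);
}

static int is_odd(int v)	{ return v & 1; }
static void report(int v)	{ printf("updating item %d\n", v); }

int main(void)
{
	const int tasks[] = { 1, 2, 3, 4, 5 };
	struct scanner s = { .test = is_odd, .process = report };

	scan(&s, tasks, 5);
	return 0;
}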
866
867/**
769 * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it 868 * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it
770 * @cs: the cpuset to consider 869 * @cs: the cpuset to consider
771 * @buf: buffer of cpu numbers written to this cpuset 870 * @buf: buffer of cpu numbers written to this cpuset
772 */ 871 */
773static int update_cpumask(struct cpuset *cs, char *buf) 872static int update_cpumask(struct cpuset *cs, const char *buf)
774{ 873{
775 struct cpuset trialcs;
776 struct cgroup_scanner scan;
777 struct ptr_heap heap; 874 struct ptr_heap heap;
875 struct cpuset trialcs;
778 int retval; 876 int retval;
779 int is_load_balanced; 877 int is_load_balanced;
780 878
@@ -790,7 +888,6 @@ static int update_cpumask(struct cpuset *cs, char *buf)
790 * that parsing. The validate_change() call ensures that cpusets 888 * that parsing. The validate_change() call ensures that cpusets
791 * with tasks have cpus. 889 * with tasks have cpus.
792 */ 890 */
793 buf = strstrip(buf);
794 if (!*buf) { 891 if (!*buf) {
795 cpus_clear(trialcs.cpus_allowed); 892 cpus_clear(trialcs.cpus_allowed);
796 } else { 893 } else {
@@ -809,7 +906,7 @@ static int update_cpumask(struct cpuset *cs, char *buf)
809 if (cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed)) 906 if (cpus_equal(cs->cpus_allowed, trialcs.cpus_allowed))
810 return 0; 907 return 0;
811 908
812 retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, &started_after); 909 retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
813 if (retval) 910 if (retval)
814 return retval; 911 return retval;
815 912
@@ -823,15 +920,12 @@ static int update_cpumask(struct cpuset *cs, char *buf)
823 * Scan tasks in the cpuset, and update the cpumasks of any 920 * Scan tasks in the cpuset, and update the cpumasks of any
824 * that need an update. 921 * that need an update.
825 */ 922 */
826 scan.cg = cs->css.cgroup; 923 update_tasks_cpumask(cs, &heap);
827 scan.test_task = cpuset_test_cpumask; 924
828 scan.process_task = cpuset_change_cpumask;
829 scan.heap = &heap;
830 cgroup_scan_tasks(&scan);
831 heap_free(&heap); 925 heap_free(&heap);
832 926
833 if (is_load_balanced) 927 if (is_load_balanced)
834 rebuild_sched_domains(); 928 async_rebuild_sched_domains();
835 return 0; 929 return 0;
836} 930}
837 931
@@ -884,74 +978,25 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
884 mutex_unlock(&callback_mutex); 978 mutex_unlock(&callback_mutex);
885} 979}
886 980
887/*
888 * Handle user request to change the 'mems' memory placement
889 * of a cpuset. Needs to validate the request, update the
890 * cpusets mems_allowed and mems_generation, and for each
891 * task in the cpuset, rebind any vma mempolicies and if
892 * the cpuset is marked 'memory_migrate', migrate the tasks
893 * pages to the new memory.
894 *
895 * Call with cgroup_mutex held. May take callback_mutex during call.
896 * Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
897 * lock each such tasks mm->mmap_sem, scan its vma's and rebind
898 * their mempolicies to the cpusets new mems_allowed.
899 */
900
901static void *cpuset_being_rebound; 981static void *cpuset_being_rebound;
902 982
903static int update_nodemask(struct cpuset *cs, char *buf) 983/**
984 * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset.
985 * @cs: the cpuset in which each task's mems_allowed mask needs to be changed
986 * @oldmem: old mems_allowed of cpuset cs
987 *
988 * Called with cgroup_mutex held
989 * Return 0 if successful, -errno if not.
990 */
991static int update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem)
904{ 992{
905 struct cpuset trialcs;
906 nodemask_t oldmem;
907 struct task_struct *p; 993 struct task_struct *p;
908 struct mm_struct **mmarray; 994 struct mm_struct **mmarray;
909 int i, n, ntasks; 995 int i, n, ntasks;
910 int migrate; 996 int migrate;
911 int fudge; 997 int fudge;
912 int retval;
913 struct cgroup_iter it; 998 struct cgroup_iter it;
914 999 int retval;
915 /*
916 * top_cpuset.mems_allowed tracks node_stats[N_HIGH_MEMORY];
917 * it's read-only
918 */
919 if (cs == &top_cpuset)
920 return -EACCES;
921
922 trialcs = *cs;
923
924 /*
925 * An empty mems_allowed is ok iff there are no tasks in the cpuset.
926 * Since nodelist_parse() fails on an empty mask, we special case
927 * that parsing. The validate_change() call ensures that cpusets
928 * with tasks have memory.
929 */
930 buf = strstrip(buf);
931 if (!*buf) {
932 nodes_clear(trialcs.mems_allowed);
933 } else {
934 retval = nodelist_parse(buf, trialcs.mems_allowed);
935 if (retval < 0)
936 goto done;
937
938 if (!nodes_subset(trialcs.mems_allowed,
939 node_states[N_HIGH_MEMORY]))
940 return -EINVAL;
941 }
942 oldmem = cs->mems_allowed;
943 if (nodes_equal(oldmem, trialcs.mems_allowed)) {
944 retval = 0; /* Too easy - nothing to do */
945 goto done;
946 }
947 retval = validate_change(cs, &trialcs);
948 if (retval < 0)
949 goto done;
950
951 mutex_lock(&callback_mutex);
952 cs->mems_allowed = trialcs.mems_allowed;
953 cs->mems_generation = cpuset_mems_generation++;
954 mutex_unlock(&callback_mutex);
955 1000
956 cpuset_being_rebound = cs; /* causes mpol_dup() rebind */ 1001 cpuset_being_rebound = cs; /* causes mpol_dup() rebind */
957 1002
@@ -1018,7 +1063,7 @@ static int update_nodemask(struct cpuset *cs, char *buf)
1018 1063
1019 mpol_rebind_mm(mm, &cs->mems_allowed); 1064 mpol_rebind_mm(mm, &cs->mems_allowed);
1020 if (migrate) 1065 if (migrate)
1021 cpuset_migrate_mm(mm, &oldmem, &cs->mems_allowed); 1066 cpuset_migrate_mm(mm, oldmem, &cs->mems_allowed);
1022 mmput(mm); 1067 mmput(mm);
1023 } 1068 }
1024 1069
@@ -1030,6 +1075,70 @@ done:
1030 return retval; 1075 return retval;
1031} 1076}
1032 1077
1078/*
1079 * Handle user request to change the 'mems' memory placement
1080 * of a cpuset. Needs to validate the request, update the
1081 * cpusets mems_allowed and mems_generation, and for each
1082 * task in the cpuset, rebind any vma mempolicies and if
1083 * the cpuset is marked 'memory_migrate', migrate the tasks
1084 * pages to the new memory.
1085 *
1086 * Call with cgroup_mutex held. May take callback_mutex during call.
1087 * Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
1088 * lock each such tasks mm->mmap_sem, scan its vma's and rebind
1089 * their mempolicies to the cpusets new mems_allowed.
1090 */
1091static int update_nodemask(struct cpuset *cs, const char *buf)
1092{
1093 struct cpuset trialcs;
1094 nodemask_t oldmem;
1095 int retval;
1096
1097 /*
1098 * top_cpuset.mems_allowed tracks node_stats[N_HIGH_MEMORY];
1099 * it's read-only
1100 */
1101 if (cs == &top_cpuset)
1102 return -EACCES;
1103
1104 trialcs = *cs;
1105
1106 /*
1107 * An empty mems_allowed is ok iff there are no tasks in the cpuset.
1108 * Since nodelist_parse() fails on an empty mask, we special case
1109 * that parsing. The validate_change() call ensures that cpusets
1110 * with tasks have memory.
1111 */
1112 if (!*buf) {
1113 nodes_clear(trialcs.mems_allowed);
1114 } else {
1115 retval = nodelist_parse(buf, trialcs.mems_allowed);
1116 if (retval < 0)
1117 goto done;
1118
1119 if (!nodes_subset(trialcs.mems_allowed,
1120 node_states[N_HIGH_MEMORY]))
1121 return -EINVAL;
1122 }
1123 oldmem = cs->mems_allowed;
1124 if (nodes_equal(oldmem, trialcs.mems_allowed)) {
1125 retval = 0; /* Too easy - nothing to do */
1126 goto done;
1127 }
1128 retval = validate_change(cs, &trialcs);
1129 if (retval < 0)
1130 goto done;
1131
1132 mutex_lock(&callback_mutex);
1133 cs->mems_allowed = trialcs.mems_allowed;
1134 cs->mems_generation = cpuset_mems_generation++;
1135 mutex_unlock(&callback_mutex);
1136
1137 retval = update_tasks_nodemask(cs, &oldmem);
1138done:
1139 return retval;
1140}
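The split above leaves update_nodemask() with only the parse/validate/commit steps. A user-space sketch of the first two steps follows, with an unsigned long standing in for nodemask_t and a hand-rolled stand-in for the kernel's nodelist_parse(); the helper name and node limit are made up for illustration.

#include <stdio.h>
#include <stdlib.h>

/* Parse a "0-2,5"-style list into a bitmask; return -1 on bad input. */
static int nodelist_parse_sketch(const char *buf, unsigned long *mask)
{
	unsigned long m = 0;
	char *end;

	while (*buf) {
		long a = strtol(buf, &end, 10), b = a;

		if (end == buf || a < 0 || a > 63)
			return -1;
		if (*end == '-') {
			b = strtol(end + 1, &end, 10);
			if (b < a || b > 63)
				return -1;
		}
		while (a <= b)
			m |= 1UL << a++;
		if (*end == ',')
			end++;
		else if (*end)
			return -1;
		buf = end;
	}
	*mask = m;
	return 0;
}

int main(void)
{
	const unsigned long online = 0x0f;	/* nodes 0-3 exist */
	unsigned long requested;

	if (nodelist_parse_sketch("0-2", &requested))
		return 1;

	/* validate: the request must be a subset of the online nodes */
	if (requested & ~online) {
		fprintf(stderr, "EINVAL: nodes outside the online set\n");
		return 1;
	}
	printf("new mems_allowed = 0x%lx\n", requested);
	return 0;
}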
1141
1033int current_cpuset_is_being_rebound(void) 1142int current_cpuset_is_being_rebound(void)
1034{ 1143{
1035 return task_cs(current) == cpuset_being_rebound; 1144 return task_cs(current) == cpuset_being_rebound;
@@ -1042,7 +1151,8 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val)
1042 1151
1043 if (val != cs->relax_domain_level) { 1152 if (val != cs->relax_domain_level) {
1044 cs->relax_domain_level = val; 1153 cs->relax_domain_level = val;
1045 rebuild_sched_domains(); 1154 if (!cpus_empty(cs->cpus_allowed) && is_sched_load_balance(cs))
1155 async_rebuild_sched_domains();
1046 } 1156 }
1047 1157
1048 return 0; 1158 return 0;
@@ -1062,7 +1172,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
1062{ 1172{
1063 struct cpuset trialcs; 1173 struct cpuset trialcs;
1064 int err; 1174 int err;
1065 int cpus_nonempty, balance_flag_changed; 1175 int balance_flag_changed;
1066 1176
1067 trialcs = *cs; 1177 trialcs = *cs;
1068 if (turning_on) 1178 if (turning_on)
@@ -1074,7 +1184,6 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
1074 if (err < 0) 1184 if (err < 0)
1075 return err; 1185 return err;
1076 1186
1077 cpus_nonempty = !cpus_empty(trialcs.cpus_allowed);
1078 balance_flag_changed = (is_sched_load_balance(cs) != 1187 balance_flag_changed = (is_sched_load_balance(cs) !=
1079 is_sched_load_balance(&trialcs)); 1188 is_sched_load_balance(&trialcs));
1080 1189
@@ -1082,8 +1191,8 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
1082 cs->flags = trialcs.flags; 1191 cs->flags = trialcs.flags;
1083 mutex_unlock(&callback_mutex); 1192 mutex_unlock(&callback_mutex);
1084 1193
1085 if (cpus_nonempty && balance_flag_changed) 1194 if (!cpus_empty(trialcs.cpus_allowed) && balance_flag_changed)
1086 rebuild_sched_domains(); 1195 async_rebuild_sched_domains();
1087 1196
1088 return 0; 1197 return 0;
1089} 1198}
@@ -1254,72 +1363,14 @@ typedef enum {
1254 FILE_SPREAD_SLAB, 1363 FILE_SPREAD_SLAB,
1255} cpuset_filetype_t; 1364} cpuset_filetype_t;
1256 1365
1257static ssize_t cpuset_common_file_write(struct cgroup *cont,
1258 struct cftype *cft,
1259 struct file *file,
1260 const char __user *userbuf,
1261 size_t nbytes, loff_t *unused_ppos)
1262{
1263 struct cpuset *cs = cgroup_cs(cont);
1264 cpuset_filetype_t type = cft->private;
1265 char *buffer;
1266 int retval = 0;
1267
1268 /* Crude upper limit on largest legitimate cpulist user might write. */
1269 if (nbytes > 100U + 6 * max(NR_CPUS, MAX_NUMNODES))
1270 return -E2BIG;
1271
1272 /* +1 for nul-terminator */
1273 buffer = kmalloc(nbytes + 1, GFP_KERNEL);
1274 if (!buffer)
1275 return -ENOMEM;
1276
1277 if (copy_from_user(buffer, userbuf, nbytes)) {
1278 retval = -EFAULT;
1279 goto out1;
1280 }
1281 buffer[nbytes] = 0; /* nul-terminate */
1282
1283 cgroup_lock();
1284
1285 if (cgroup_is_removed(cont)) {
1286 retval = -ENODEV;
1287 goto out2;
1288 }
1289
1290 switch (type) {
1291 case FILE_CPULIST:
1292 retval = update_cpumask(cs, buffer);
1293 break;
1294 case FILE_MEMLIST:
1295 retval = update_nodemask(cs, buffer);
1296 break;
1297 default:
1298 retval = -EINVAL;
1299 goto out2;
1300 }
1301
1302 if (retval == 0)
1303 retval = nbytes;
1304out2:
1305 cgroup_unlock();
1306out1:
1307 kfree(buffer);
1308 return retval;
1309}
1310
1311static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val) 1366static int cpuset_write_u64(struct cgroup *cgrp, struct cftype *cft, u64 val)
1312{ 1367{
1313 int retval = 0; 1368 int retval = 0;
1314 struct cpuset *cs = cgroup_cs(cgrp); 1369 struct cpuset *cs = cgroup_cs(cgrp);
1315 cpuset_filetype_t type = cft->private; 1370 cpuset_filetype_t type = cft->private;
1316 1371
1317 cgroup_lock(); 1372 if (!cgroup_lock_live_group(cgrp))
1318
1319 if (cgroup_is_removed(cgrp)) {
1320 cgroup_unlock();
1321 return -ENODEV; 1373 return -ENODEV;
1322 }
1323 1374
1324 switch (type) { 1375 switch (type) {
1325 case FILE_CPU_EXCLUSIVE: 1376 case FILE_CPU_EXCLUSIVE:
@@ -1365,12 +1416,9 @@ static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val)
1365 struct cpuset *cs = cgroup_cs(cgrp); 1416 struct cpuset *cs = cgroup_cs(cgrp);
1366 cpuset_filetype_t type = cft->private; 1417 cpuset_filetype_t type = cft->private;
1367 1418
1368 cgroup_lock(); 1419 if (!cgroup_lock_live_group(cgrp))
1369
1370 if (cgroup_is_removed(cgrp)) {
1371 cgroup_unlock();
1372 return -ENODEV; 1420 return -ENODEV;
1373 } 1421
1374 switch (type) { 1422 switch (type) {
1375 case FILE_SCHED_RELAX_DOMAIN_LEVEL: 1423 case FILE_SCHED_RELAX_DOMAIN_LEVEL:
1376 retval = update_relax_domain_level(cs, val); 1424 retval = update_relax_domain_level(cs, val);
@@ -1384,6 +1432,32 @@ static int cpuset_write_s64(struct cgroup *cgrp, struct cftype *cft, s64 val)
1384} 1432}
1385 1433
1386/* 1434/*
1435 * Common handling for a write to a "cpus" or "mems" file.
1436 */
1437static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,
1438 const char *buf)
1439{
1440 int retval = 0;
1441
1442 if (!cgroup_lock_live_group(cgrp))
1443 return -ENODEV;
1444
1445 switch (cft->private) {
1446 case FILE_CPULIST:
1447 retval = update_cpumask(cgroup_cs(cgrp), buf);
1448 break;
1449 case FILE_MEMLIST:
1450 retval = update_nodemask(cgroup_cs(cgrp), buf);
1451 break;
1452 default:
1453 retval = -EINVAL;
1454 break;
1455 }
1456 cgroup_unlock();
1457 return retval;
1458}
1459
1460/*
1387 * These ascii lists should be read in a single call, by using a user 1461 * These ascii lists should be read in a single call, by using a user
1388 * buffer large enough to hold the entire map. If read in smaller 1462 * buffer large enough to hold the entire map. If read in smaller
1389 * chunks, there is no guarantee of atomicity. Since the display format 1463 * chunks, there is no guarantee of atomicity. Since the display format
@@ -1479,6 +1553,9 @@ static u64 cpuset_read_u64(struct cgroup *cont, struct cftype *cft)
1479 default: 1553 default:
1480 BUG(); 1554 BUG();
1481 } 1555 }
1556
1557 /* Unreachable but makes gcc happy */
1558 return 0;
1482} 1559}
1483 1560
1484static s64 cpuset_read_s64(struct cgroup *cont, struct cftype *cft) 1561static s64 cpuset_read_s64(struct cgroup *cont, struct cftype *cft)
@@ -1491,6 +1568,9 @@ static s64 cpuset_read_s64(struct cgroup *cont, struct cftype *cft)
1491 default: 1568 default:
1492 BUG(); 1569 BUG();
1493 } 1570 }
1571
 1572 /* Unreachable but makes gcc happy */
1573 return 0;
1494} 1574}
1495 1575
1496 1576
@@ -1502,14 +1582,16 @@ static struct cftype files[] = {
1502 { 1582 {
1503 .name = "cpus", 1583 .name = "cpus",
1504 .read = cpuset_common_file_read, 1584 .read = cpuset_common_file_read,
1505 .write = cpuset_common_file_write, 1585 .write_string = cpuset_write_resmask,
1586 .max_write_len = (100U + 6 * NR_CPUS),
1506 .private = FILE_CPULIST, 1587 .private = FILE_CPULIST,
1507 }, 1588 },
1508 1589
1509 { 1590 {
1510 .name = "mems", 1591 .name = "mems",
1511 .read = cpuset_common_file_read, 1592 .read = cpuset_common_file_read,
1512 .write = cpuset_common_file_write, 1593 .write_string = cpuset_write_resmask,
1594 .max_write_len = (100U + 6 * MAX_NUMNODES),
1513 .private = FILE_MEMLIST, 1595 .private = FILE_MEMLIST,
1514 }, 1596 },
1515 1597
@@ -1677,15 +1759,9 @@ static struct cgroup_subsys_state *cpuset_create(
1677} 1759}
1678 1760
1679/* 1761/*
1680 * Locking note on the strange update_flag() call below:
1681 *
1682 * If the cpuset being removed has its flag 'sched_load_balance' 1762 * If the cpuset being removed has its flag 'sched_load_balance'
1683 * enabled, then simulate turning sched_load_balance off, which 1763 * enabled, then simulate turning sched_load_balance off, which
1684 * will call rebuild_sched_domains(). The get_online_cpus() 1764 * will call async_rebuild_sched_domains().
1685 * call in rebuild_sched_domains() must not be made while holding
1686 * callback_mutex. Elsewhere the kernel nests callback_mutex inside
1687 * get_online_cpus() calls. So the reverse nesting would risk an
1688 * ABBA deadlock.
1689 */ 1765 */
1690 1766
1691static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont) 1767static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
@@ -1704,7 +1780,7 @@ static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont)
1704struct cgroup_subsys cpuset_subsys = { 1780struct cgroup_subsys cpuset_subsys = {
1705 .name = "cpuset", 1781 .name = "cpuset",
1706 .create = cpuset_create, 1782 .create = cpuset_create,
1707 .destroy = cpuset_destroy, 1783 .destroy = cpuset_destroy,
1708 .can_attach = cpuset_can_attach, 1784 .can_attach = cpuset_can_attach,
1709 .attach = cpuset_attach, 1785 .attach = cpuset_attach,
1710 .populate = cpuset_populate, 1786 .populate = cpuset_populate,
@@ -1790,13 +1866,13 @@ static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to)
1790 scan.scan.heap = NULL; 1866 scan.scan.heap = NULL;
1791 scan.to = to->css.cgroup; 1867 scan.to = to->css.cgroup;
1792 1868
1793 if (cgroup_scan_tasks((struct cgroup_scanner *)&scan)) 1869 if (cgroup_scan_tasks(&scan.scan))
1794 printk(KERN_ERR "move_member_tasks_to_cpuset: " 1870 printk(KERN_ERR "move_member_tasks_to_cpuset: "
1795 "cgroup_scan_tasks failed\n"); 1871 "cgroup_scan_tasks failed\n");
1796} 1872}
1797 1873
1798/* 1874/*
1799 * If common_cpu_mem_hotplug_unplug(), below, unplugs any CPUs 1875 * If CPU and/or memory hotplug handlers, below, unplug any CPUs
1800 * or memory nodes, we need to walk over the cpuset hierarchy, 1876 * or memory nodes, we need to walk over the cpuset hierarchy,
1801 * removing that CPU or node from all cpusets. If this removes the 1877 * removing that CPU or node from all cpusets. If this removes the
1802 * last CPU or node from a cpuset, then move the tasks in the empty 1878 * last CPU or node from a cpuset, then move the tasks in the empty
@@ -1844,31 +1920,31 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
1844 * that has tasks along with an empty 'mems'. But if we did see such 1920 * that has tasks along with an empty 'mems'. But if we did see such
1845 * a cpuset, we'd handle it just like we do if its 'cpus' was empty. 1921 * a cpuset, we'd handle it just like we do if its 'cpus' was empty.
1846 */ 1922 */
1847static void scan_for_empty_cpusets(const struct cpuset *root) 1923static void scan_for_empty_cpusets(struct cpuset *root)
1848{ 1924{
1925 LIST_HEAD(queue);
1849 struct cpuset *cp; /* scans cpusets being updated */ 1926 struct cpuset *cp; /* scans cpusets being updated */
1850 struct cpuset *child; /* scans child cpusets of cp */ 1927 struct cpuset *child; /* scans child cpusets of cp */
1851 struct list_head queue;
1852 struct cgroup *cont; 1928 struct cgroup *cont;
1853 1929 nodemask_t oldmems;
1854 INIT_LIST_HEAD(&queue);
1855 1930
1856 list_add_tail((struct list_head *)&root->stack_list, &queue); 1931 list_add_tail((struct list_head *)&root->stack_list, &queue);
1857 1932
1858 while (!list_empty(&queue)) { 1933 while (!list_empty(&queue)) {
1859 cp = container_of(queue.next, struct cpuset, stack_list); 1934 cp = list_first_entry(&queue, struct cpuset, stack_list);
1860 list_del(queue.next); 1935 list_del(queue.next);
1861 list_for_each_entry(cont, &cp->css.cgroup->children, sibling) { 1936 list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
1862 child = cgroup_cs(cont); 1937 child = cgroup_cs(cont);
1863 list_add_tail(&child->stack_list, &queue); 1938 list_add_tail(&child->stack_list, &queue);
1864 } 1939 }
1865 cont = cp->css.cgroup;
1866 1940
1867 /* Continue past cpusets with all cpus, mems online */ 1941 /* Continue past cpusets with all cpus, mems online */
1868 if (cpus_subset(cp->cpus_allowed, cpu_online_map) && 1942 if (cpus_subset(cp->cpus_allowed, cpu_online_map) &&
1869 nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY])) 1943 nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY]))
1870 continue; 1944 continue;
1871 1945
1946 oldmems = cp->mems_allowed;
1947
1872 /* Remove offline cpus and mems from this cpuset. */ 1948 /* Remove offline cpus and mems from this cpuset. */
1873 mutex_lock(&callback_mutex); 1949 mutex_lock(&callback_mutex);
1874 cpus_and(cp->cpus_allowed, cp->cpus_allowed, cpu_online_map); 1950 cpus_and(cp->cpus_allowed, cp->cpus_allowed, cpu_online_map);
@@ -1880,39 +1956,14 @@ static void scan_for_empty_cpusets(const struct cpuset *root)
1880 if (cpus_empty(cp->cpus_allowed) || 1956 if (cpus_empty(cp->cpus_allowed) ||
1881 nodes_empty(cp->mems_allowed)) 1957 nodes_empty(cp->mems_allowed))
1882 remove_tasks_in_empty_cpuset(cp); 1958 remove_tasks_in_empty_cpuset(cp);
1959 else {
1960 update_tasks_cpumask(cp, NULL);
1961 update_tasks_nodemask(cp, &oldmems);
1962 }
1883 } 1963 }
1884} 1964}
1885 1965
1886/* 1966/*
1887 * The cpus_allowed and mems_allowed nodemasks in the top_cpuset track
1888 * cpu_online_map and node_states[N_HIGH_MEMORY]. Force the top cpuset to
1889 * track what's online after any CPU or memory node hotplug or unplug event.
1890 *
1891 * Since there are two callers of this routine, one for CPU hotplug
1892 * events and one for memory node hotplug events, we could have coded
1893 * two separate routines here. We code it as a single common routine
1894 * in order to minimize text size.
1895 */
1896
1897static void common_cpu_mem_hotplug_unplug(int rebuild_sd)
1898{
1899 cgroup_lock();
1900
1901 top_cpuset.cpus_allowed = cpu_online_map;
1902 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
1903 scan_for_empty_cpusets(&top_cpuset);
1904
1905 /*
1906 * Scheduler destroys domains on hotplug events.
1907 * Rebuild them based on the current settings.
1908 */
1909 if (rebuild_sd)
1910 rebuild_sched_domains();
1911
1912 cgroup_unlock();
1913}
1914
1915/*
1916 * The top_cpuset tracks what CPUs and Memory Nodes are online, 1967 * The top_cpuset tracks what CPUs and Memory Nodes are online,
1917 * period. This is necessary in order to make cpusets transparent 1968 * period. This is necessary in order to make cpusets transparent
1918 * (of no effect) on systems that are actively using CPU hotplug 1969 * (of no effect) on systems that are actively using CPU hotplug
@@ -1920,40 +1971,52 @@ static void common_cpu_mem_hotplug_unplug(int rebuild_sd)
1920 * 1971 *
1921 * This routine ensures that top_cpuset.cpus_allowed tracks 1972 * This routine ensures that top_cpuset.cpus_allowed tracks
1922 * cpu_online_map on each CPU hotplug (cpuhp) event. 1973 * cpu_online_map on each CPU hotplug (cpuhp) event.
1974 *
1975 * Called within get_online_cpus(). Needs to call cgroup_lock()
1976 * before calling generate_sched_domains().
1923 */ 1977 */
1924 1978static int cpuset_track_online_cpus(struct notifier_block *unused_nb,
1925static int cpuset_handle_cpuhp(struct notifier_block *unused_nb,
1926 unsigned long phase, void *unused_cpu) 1979 unsigned long phase, void *unused_cpu)
1927{ 1980{
1981 struct sched_domain_attr *attr;
1982 cpumask_t *doms;
1983 int ndoms;
1984
1928 switch (phase) { 1985 switch (phase) {
1929 case CPU_UP_CANCELED:
1930 case CPU_UP_CANCELED_FROZEN:
1931 case CPU_DOWN_FAILED:
1932 case CPU_DOWN_FAILED_FROZEN:
1933 case CPU_ONLINE: 1986 case CPU_ONLINE:
1934 case CPU_ONLINE_FROZEN: 1987 case CPU_ONLINE_FROZEN:
1935 case CPU_DEAD: 1988 case CPU_DEAD:
1936 case CPU_DEAD_FROZEN: 1989 case CPU_DEAD_FROZEN:
1937 common_cpu_mem_hotplug_unplug(1);
1938 break; 1990 break;
1991
1939 default: 1992 default:
1940 return NOTIFY_DONE; 1993 return NOTIFY_DONE;
1941 } 1994 }
1942 1995
1996 cgroup_lock();
1997 top_cpuset.cpus_allowed = cpu_online_map;
1998 scan_for_empty_cpusets(&top_cpuset);
1999 ndoms = generate_sched_domains(&doms, &attr);
2000 cgroup_unlock();
2001
2002 /* Have scheduler rebuild the domains */
2003 partition_sched_domains(ndoms, doms, attr);
2004
1943 return NOTIFY_OK; 2005 return NOTIFY_OK;
1944} 2006}
1945 2007
1946#ifdef CONFIG_MEMORY_HOTPLUG 2008#ifdef CONFIG_MEMORY_HOTPLUG
1947/* 2009/*
1948 * Keep top_cpuset.mems_allowed tracking node_states[N_HIGH_MEMORY]. 2010 * Keep top_cpuset.mems_allowed tracking node_states[N_HIGH_MEMORY].
1949 * Call this routine anytime after you change 2011 * Call this routine anytime after node_states[N_HIGH_MEMORY] changes.
1950 * node_states[N_HIGH_MEMORY]. 2012 * See also the previous routine cpuset_track_online_cpus().
1951 * See also the previous routine cpuset_handle_cpuhp().
1952 */ 2013 */
1953
1954void cpuset_track_online_nodes(void) 2014void cpuset_track_online_nodes(void)
1955{ 2015{
1956 common_cpu_mem_hotplug_unplug(0); 2016 cgroup_lock();
2017 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
2018 scan_for_empty_cpusets(&top_cpuset);
2019 cgroup_unlock();
1957} 2020}
1958#endif 2021#endif
1959 2022
@@ -1968,11 +2031,10 @@ void __init cpuset_init_smp(void)
1968 top_cpuset.cpus_allowed = cpu_online_map; 2031 top_cpuset.cpus_allowed = cpu_online_map;
1969 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; 2032 top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
1970 2033
1971 hotcpu_notifier(cpuset_handle_cpuhp, 0); 2034 hotcpu_notifier(cpuset_track_online_cpus, 0);
1972} 2035}
1973 2036
1974/** 2037/**
1975
1976 * cpuset_cpus_allowed - return cpus_allowed mask from a tasks cpuset. 2038 * cpuset_cpus_allowed - return cpus_allowed mask from a tasks cpuset.
1977 * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed. 2039 * @tsk: pointer to task_struct from which to obtain cpuset->cpus_allowed.
1978 * @pmask: pointer to cpumask_t variable to receive cpus_allowed set. 2040 * @pmask: pointer to cpumask_t variable to receive cpus_allowed set.
@@ -2374,19 +2436,15 @@ const struct file_operations proc_cpuset_operations = {
2374void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task) 2436void cpuset_task_status_allowed(struct seq_file *m, struct task_struct *task)
2375{ 2437{
2376 seq_printf(m, "Cpus_allowed:\t"); 2438 seq_printf(m, "Cpus_allowed:\t");
2377 m->count += cpumask_scnprintf(m->buf + m->count, m->size - m->count, 2439 seq_cpumask(m, &task->cpus_allowed);
2378 task->cpus_allowed);
2379 seq_printf(m, "\n"); 2440 seq_printf(m, "\n");
2380 seq_printf(m, "Cpus_allowed_list:\t"); 2441 seq_printf(m, "Cpus_allowed_list:\t");
2381 m->count += cpulist_scnprintf(m->buf + m->count, m->size - m->count, 2442 seq_cpumask_list(m, &task->cpus_allowed);
2382 task->cpus_allowed);
2383 seq_printf(m, "\n"); 2443 seq_printf(m, "\n");
2384 seq_printf(m, "Mems_allowed:\t"); 2444 seq_printf(m, "Mems_allowed:\t");
2385 m->count += nodemask_scnprintf(m->buf + m->count, m->size - m->count, 2445 seq_nodemask(m, &task->mems_allowed);
2386 task->mems_allowed);
2387 seq_printf(m, "\n"); 2446 seq_printf(m, "\n");
2388 seq_printf(m, "Mems_allowed_list:\t"); 2447 seq_printf(m, "Mems_allowed_list:\t");
2389 m->count += nodelist_scnprintf(m->buf + m->count, m->size - m->count, 2448 seq_nodemask_list(m, &task->mems_allowed);
2390 task->mems_allowed);
2391 seq_printf(m, "\n"); 2449 seq_printf(m, "\n");
2392} 2450}
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
index 10e43fd8b721..b3179dad71be 100644
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -145,8 +145,11 @@ int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
145 d->blkio_delay_total = (tmp < d->blkio_delay_total) ? 0 : tmp; 145 d->blkio_delay_total = (tmp < d->blkio_delay_total) ? 0 : tmp;
146 tmp = d->swapin_delay_total + tsk->delays->swapin_delay; 146 tmp = d->swapin_delay_total + tsk->delays->swapin_delay;
147 d->swapin_delay_total = (tmp < d->swapin_delay_total) ? 0 : tmp; 147 d->swapin_delay_total = (tmp < d->swapin_delay_total) ? 0 : tmp;
148 tmp = d->freepages_delay_total + tsk->delays->freepages_delay;
149 d->freepages_delay_total = (tmp < d->freepages_delay_total) ? 0 : tmp;
148 d->blkio_count += tsk->delays->blkio_count; 150 d->blkio_count += tsk->delays->blkio_count;
149 d->swapin_count += tsk->delays->swapin_count; 151 d->swapin_count += tsk->delays->swapin_count;
152 d->freepages_count += tsk->delays->freepages_count;
150 spin_unlock_irqrestore(&tsk->delays->lock, flags); 153 spin_unlock_irqrestore(&tsk->delays->lock, flags);
151 154
152done: 155done:
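The freepages counters are folded in with the same wraparound guard used for the blkio and swapin totals above: add the delta and, if the unsigned sum wrapped, restart from zero instead of publishing a bogus huge value. A small stand-alone demonstration of that guard:

#include <stdint.h>
#include <stdio.h>

static uint64_t accumulate(uint64_t total, uint64_t delta)
{
	uint64_t tmp = total + delta;

	return (tmp < total) ? 0 : tmp;	/* wrapped -> start over at 0 */
}

int main(void)
{
	uint64_t t = 0;

	t = accumulate(t, 1000);			/* normal case   */
	printf("total = %llu\n", (unsigned long long)t);

	t = accumulate(UINT64_MAX - 5, 100);		/* wraps, resets */
	printf("total after overflow = %llu\n", (unsigned long long)t);
	return 0;
}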
@@ -165,3 +168,16 @@ __u64 __delayacct_blkio_ticks(struct task_struct *tsk)
165 return ret; 168 return ret;
166} 169}
167 170
171void __delayacct_freepages_start(void)
172{
173 delayacct_start(&current->delays->freepages_start);
174}
175
176void __delayacct_freepages_end(void)
177{
178 delayacct_end(&current->delays->freepages_start,
179 &current->delays->freepages_end,
180 &current->delays->freepages_delay,
181 &current->delays->freepages_count);
182}
183
diff --git a/kernel/dma-coherent.c b/kernel/dma-coherent.c
new file mode 100644
index 000000000000..f013a0c2e111
--- /dev/null
+++ b/kernel/dma-coherent.c
@@ -0,0 +1,155 @@
1/*
2 * Coherent per-device memory handling.
3 * Borrowed from i386
4 */
5#include <linux/kernel.h>
6#include <linux/dma-mapping.h>
7
8struct dma_coherent_mem {
9 void *virt_base;
10 u32 device_base;
11 int size;
12 int flags;
13 unsigned long *bitmap;
14};
15
16int dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr,
17 dma_addr_t device_addr, size_t size, int flags)
18{
19 void __iomem *mem_base = NULL;
20 int pages = size >> PAGE_SHIFT;
21 int bitmap_size = BITS_TO_LONGS(pages) * sizeof(long);
22
23 if ((flags & (DMA_MEMORY_MAP | DMA_MEMORY_IO)) == 0)
24 goto out;
25 if (!size)
26 goto out;
27 if (dev->dma_mem)
28 goto out;
29
30 /* FIXME: this routine just ignores DMA_MEMORY_INCLUDES_CHILDREN */
31
32 mem_base = ioremap(bus_addr, size);
33 if (!mem_base)
34 goto out;
35
36 dev->dma_mem = kzalloc(sizeof(struct dma_coherent_mem), GFP_KERNEL);
37 if (!dev->dma_mem)
38 goto out;
39 dev->dma_mem->bitmap = kzalloc(bitmap_size, GFP_KERNEL);
40 if (!dev->dma_mem->bitmap)
41 goto free1_out;
42
43 dev->dma_mem->virt_base = mem_base;
44 dev->dma_mem->device_base = device_addr;
45 dev->dma_mem->size = pages;
46 dev->dma_mem->flags = flags;
47
48 if (flags & DMA_MEMORY_MAP)
49 return DMA_MEMORY_MAP;
50
51 return DMA_MEMORY_IO;
52
53 free1_out:
54 kfree(dev->dma_mem);
55 out:
56 if (mem_base)
57 iounmap(mem_base);
58 return 0;
59}
60EXPORT_SYMBOL(dma_declare_coherent_memory);
61
62void dma_release_declared_memory(struct device *dev)
63{
64 struct dma_coherent_mem *mem = dev->dma_mem;
65
66 if (!mem)
67 return;
68 dev->dma_mem = NULL;
69 iounmap(mem->virt_base);
70 kfree(mem->bitmap);
71 kfree(mem);
72}
73EXPORT_SYMBOL(dma_release_declared_memory);
74
75void *dma_mark_declared_memory_occupied(struct device *dev,
76 dma_addr_t device_addr, size_t size)
77{
78 struct dma_coherent_mem *mem = dev->dma_mem;
79 int pos, err;
80
81 size += device_addr & ~PAGE_MASK;
82
83 if (!mem)
84 return ERR_PTR(-EINVAL);
85
86 pos = (device_addr - mem->device_base) >> PAGE_SHIFT;
87 err = bitmap_allocate_region(mem->bitmap, pos, get_order(size));
88 if (err != 0)
89 return ERR_PTR(err);
90 return mem->virt_base + (pos << PAGE_SHIFT);
91}
92EXPORT_SYMBOL(dma_mark_declared_memory_occupied);
93
94/**
95 * dma_alloc_from_coherent() - try to allocate memory from the per-device coherent area
96 *
97 * @dev: device from which we allocate memory
98 * @size: size of requested memory area
99 * @dma_handle: This will be filled with the correct dma handle
100 * @ret: This pointer will be filled with the virtual address
101 * to allocated area.
102 *
103 * This function should be only called from per-arch dma_alloc_coherent()
104 * to support allocation from per-device coherent memory pools.
105 *
106 * Returns 0 if dma_alloc_coherent should continue with allocating from
107 * generic memory areas, or !0 if dma_alloc_coherent should return @ret.
108 */
109int dma_alloc_from_coherent(struct device *dev, ssize_t size,
110 dma_addr_t *dma_handle, void **ret)
111{
112 struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL;
113 int order = get_order(size);
114
115 if (mem) {
116 int page = bitmap_find_free_region(mem->bitmap, mem->size,
117 order);
118 if (page >= 0) {
119 *dma_handle = mem->device_base + (page << PAGE_SHIFT);
120 *ret = mem->virt_base + (page << PAGE_SHIFT);
121 memset(*ret, 0, size);
122 } else if (mem->flags & DMA_MEMORY_EXCLUSIVE)
123 *ret = NULL;
124 }
125 return (mem != NULL);
126}
127EXPORT_SYMBOL(dma_alloc_from_coherent);
128
129/**
130 * dma_release_from_coherent() - try to free the memory allocated from per-device coherent memory pool
131 * @dev: device from which the memory was allocated
132 * @order: the order of pages allocated
133 * @vaddr: virtual address of allocated pages
134 *
135 * This checks whether the memory was allocated from the per-device
136 * coherent memory pool and if so, releases that memory.
137 *
138 * Returns 1 if we correctly released the memory, or 0 if
139 * dma_release_coherent() should proceed with releasing memory from
140 * generic pools.
141 */
142int dma_release_from_coherent(struct device *dev, int order, void *vaddr)
143{
144 struct dma_coherent_mem *mem = dev ? dev->dma_mem : NULL;
145
146 if (mem && vaddr >= mem->virt_base && vaddr <
147 (mem->virt_base + (mem->size << PAGE_SHIFT))) {
148 int page = (vaddr - mem->virt_base) >> PAGE_SHIFT;
149
150 bitmap_release_region(mem->bitmap, page, order);
151 return 1;
152 }
153 return 0;
154}
155EXPORT_SYMBOL(dma_release_from_coherent);
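Both helpers above come down to bitmap bookkeeping over the per-device pool: dma_alloc_from_coherent() grabs a naturally aligned run of 2^order pages and dma_release_from_coherent() gives it back. The user-space sketch below models that bookkeeping with a byte-per-page map instead of the kernel's bitmap_*_region() helpers; the pool size and function names are illustrative.

#include <stdio.h>
#include <string.h>

#define POOL_PAGES 16

static unsigned char used[POOL_PAGES];	/* one byte per page: 0 = free */

/* Find a free, naturally aligned run of 2^order pages; -1 if none. */
static int pool_alloc(int order)
{
	int run = 1 << order, base, i;

	for (base = 0; base + run <= POOL_PAGES; base += run) {
		for (i = 0; i < run && !used[base + i]; i++)
			;
		if (i == run) {
			memset(used + base, 1, run);
			return base;
		}
	}
	return -1;
}

static void pool_free(int base, int order)
{
	memset(used + base, 0, 1 << order);
}

int main(void)
{
	int a = pool_alloc(2);	/* 4 pages */
	int b = pool_alloc(1);	/* 2 pages */

	printf("a at page %d, b at page %d\n", a, b);
	pool_free(a, 2);
	printf("after freeing a, a 4-page block fits at %d\n", pool_alloc(2));
	pool_free(b, 1);
	return 0;
}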
diff --git a/kernel/dma.c b/kernel/dma.c
index d2c60a822790..f903189c5304 100644
--- a/kernel/dma.c
+++ b/kernel/dma.c
@@ -1,4 +1,4 @@
1/* $Id: dma.c,v 1.7 1994/12/28 03:35:33 root Exp root $ 1/*
2 * linux/kernel/dma.c: A DMA channel allocator. Inspired by linux/kernel/irq.c. 2 * linux/kernel/dma.c: A DMA channel allocator. Inspired by linux/kernel/irq.c.
3 * 3 *
4 * Written by Hennus Bergman, 1992. 4 * Written by Hennus Bergman, 1992.
diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c
index a9e6bad9f706..0511716e9424 100644
--- a/kernel/exec_domain.c
+++ b/kernel/exec_domain.c
@@ -12,7 +12,9 @@
12#include <linux/kmod.h> 12#include <linux/kmod.h>
13#include <linux/module.h> 13#include <linux/module.h>
14#include <linux/personality.h> 14#include <linux/personality.h>
15#include <linux/proc_fs.h>
15#include <linux/sched.h> 16#include <linux/sched.h>
17#include <linux/seq_file.h>
16#include <linux/syscalls.h> 18#include <linux/syscalls.h>
17#include <linux/sysctl.h> 19#include <linux/sysctl.h>
18#include <linux/types.h> 20#include <linux/types.h>
@@ -65,7 +67,7 @@ lookup_exec_domain(u_long personality)
65 goto out; 67 goto out;
66 } 68 }
67 69
68#ifdef CONFIG_KMOD 70#ifdef CONFIG_MODULES
69 read_unlock(&exec_domains_lock); 71 read_unlock(&exec_domains_lock);
70 request_module("personality-%ld", pers); 72 request_module("personality-%ld", pers);
71 read_lock(&exec_domains_lock); 73 read_lock(&exec_domains_lock);
@@ -168,26 +170,44 @@ __set_personality(u_long personality)
168 current->personality = personality; 170 current->personality = personality;
169 oep = current_thread_info()->exec_domain; 171 oep = current_thread_info()->exec_domain;
170 current_thread_info()->exec_domain = ep; 172 current_thread_info()->exec_domain = ep;
171 set_fs_altroot();
172 173
173 module_put(oep->module); 174 module_put(oep->module);
174 return 0; 175 return 0;
175} 176}
176 177
177int 178#ifdef CONFIG_PROC_FS
178get_exec_domain_list(char *page) 179static int execdomains_proc_show(struct seq_file *m, void *v)
179{ 180{
180 struct exec_domain *ep; 181 struct exec_domain *ep;
181 int len = 0;
182 182
183 read_lock(&exec_domains_lock); 183 read_lock(&exec_domains_lock);
184 for (ep = exec_domains; ep && len < PAGE_SIZE - 80; ep = ep->next) 184 for (ep = exec_domains; ep; ep = ep->next)
185 len += sprintf(page + len, "%d-%d\t%-16s\t[%s]\n", 185 seq_printf(m, "%d-%d\t%-16s\t[%s]\n",
186 ep->pers_low, ep->pers_high, ep->name, 186 ep->pers_low, ep->pers_high, ep->name,
187 module_name(ep->module)); 187 module_name(ep->module));
188 read_unlock(&exec_domains_lock); 188 read_unlock(&exec_domains_lock);
189 return (len); 189 return 0;
190}
191
192static int execdomains_proc_open(struct inode *inode, struct file *file)
193{
194 return single_open(file, execdomains_proc_show, NULL);
195}
196
197static const struct file_operations execdomains_proc_fops = {
198 .open = execdomains_proc_open,
199 .read = seq_read,
200 .llseek = seq_lseek,
201 .release = single_release,
202};
203
204static int __init proc_execdomains_init(void)
205{
206 proc_create("execdomains", 0, NULL, &execdomains_proc_fops);
207 return 0;
190} 208}
209module_init(proc_execdomains_init);
210#endif
191 211
192asmlinkage long 212asmlinkage long
193sys_personality(u_long personality) 213sys_personality(u_long personality)
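The hunk above converts the execdomains handler to the proc_create()/single_open() seq_file pattern. As a stand-alone illustration of that same pattern, here is a hedged sketch for a hypothetical "example" proc file; the example_* names are made up and not from this patch.

#include <linux/fs.h>
#include <linux/module.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

static int example_proc_show(struct seq_file *m, void *v)
{
	/* single_open() arranges for one show() call that emits the whole file. */
	seq_printf(m, "example: %d\n", 42);
	return 0;
}

static int example_proc_open(struct inode *inode, struct file *file)
{
	return single_open(file, example_proc_show, NULL);
}

static const struct file_operations example_proc_fops = {
	.owner	 = THIS_MODULE,
	.open	 = example_proc_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = single_release,
};

static int __init example_proc_init(void)
{
	proc_create("example", 0, NULL, &example_proc_fops);
	return 0;
}
module_init(example_proc_init);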
diff --git a/kernel/exit.c b/kernel/exit.c
index 93d2711b9381..80137a5d9467 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -46,6 +46,8 @@
46#include <linux/resource.h> 46#include <linux/resource.h>
47#include <linux/blkdev.h> 47#include <linux/blkdev.h>
48#include <linux/task_io_accounting_ops.h> 48#include <linux/task_io_accounting_ops.h>
49#include <linux/tracehook.h>
50#include <trace/sched.h>
49 51
50#include <asm/uaccess.h> 52#include <asm/uaccess.h>
51#include <asm/unistd.h> 53#include <asm/unistd.h>
@@ -85,7 +87,6 @@ static void __exit_signal(struct task_struct *tsk)
85 BUG_ON(!sig); 87 BUG_ON(!sig);
86 BUG_ON(!atomic_read(&sig->count)); 88 BUG_ON(!atomic_read(&sig->count));
87 89
88 rcu_read_lock();
89 sighand = rcu_dereference(tsk->sighand); 90 sighand = rcu_dereference(tsk->sighand);
90 spin_lock(&sighand->siglock); 91 spin_lock(&sighand->siglock);
91 92
@@ -112,16 +113,14 @@ static void __exit_signal(struct task_struct *tsk)
112 * We won't ever get here for the group leader, since it 113 * We won't ever get here for the group leader, since it
113 * will have been the last reference on the signal_struct. 114 * will have been the last reference on the signal_struct.
114 */ 115 */
115 sig->utime = cputime_add(sig->utime, tsk->utime); 116 sig->gtime = cputime_add(sig->gtime, task_gtime(tsk));
116 sig->stime = cputime_add(sig->stime, tsk->stime);
117 sig->gtime = cputime_add(sig->gtime, tsk->gtime);
118 sig->min_flt += tsk->min_flt; 117 sig->min_flt += tsk->min_flt;
119 sig->maj_flt += tsk->maj_flt; 118 sig->maj_flt += tsk->maj_flt;
120 sig->nvcsw += tsk->nvcsw; 119 sig->nvcsw += tsk->nvcsw;
121 sig->nivcsw += tsk->nivcsw; 120 sig->nivcsw += tsk->nivcsw;
122 sig->inblock += task_io_get_inblock(tsk); 121 sig->inblock += task_io_get_inblock(tsk);
123 sig->oublock += task_io_get_oublock(tsk); 122 sig->oublock += task_io_get_oublock(tsk);
124 sig->sum_sched_runtime += tsk->se.sum_exec_runtime; 123 task_io_accounting_add(&sig->ioac, &tsk->ioac);
125 sig = NULL; /* Marker for below. */ 124 sig = NULL; /* Marker for below. */
126 } 125 }
127 126
@@ -136,7 +135,6 @@ static void __exit_signal(struct task_struct *tsk)
136 tsk->signal = NULL; 135 tsk->signal = NULL;
137 tsk->sighand = NULL; 136 tsk->sighand = NULL;
138 spin_unlock(&sighand->siglock); 137 spin_unlock(&sighand->siglock);
139 rcu_read_unlock();
140 138
141 __cleanup_sighand(sighand); 139 __cleanup_sighand(sighand);
142 clear_tsk_thread_flag(tsk,TIF_SIGPENDING); 140 clear_tsk_thread_flag(tsk,TIF_SIGPENDING);
@@ -149,30 +147,23 @@ static void __exit_signal(struct task_struct *tsk)
149 147
150static void delayed_put_task_struct(struct rcu_head *rhp) 148static void delayed_put_task_struct(struct rcu_head *rhp)
151{ 149{
152 put_task_struct(container_of(rhp, struct task_struct, rcu)); 150 struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
153}
154 151
155/* 152 trace_sched_process_free(tsk);
156 * Do final ptrace-related cleanup of a zombie being reaped. 153 put_task_struct(tsk);
157 *
158 * Called with write_lock(&tasklist_lock) held.
159 */
160static void ptrace_release_task(struct task_struct *p)
161{
162 BUG_ON(!list_empty(&p->ptraced));
163 ptrace_unlink(p);
164 BUG_ON(!list_empty(&p->ptrace_entry));
165} 154}
166 155
156
167void release_task(struct task_struct * p) 157void release_task(struct task_struct * p)
168{ 158{
169 struct task_struct *leader; 159 struct task_struct *leader;
170 int zap_leader; 160 int zap_leader;
171repeat: 161repeat:
162 tracehook_prepare_release_task(p);
172 atomic_dec(&p->user->processes); 163 atomic_dec(&p->user->processes);
173 proc_flush_task(p); 164 proc_flush_task(p);
174 write_lock_irq(&tasklist_lock); 165 write_lock_irq(&tasklist_lock);
175 ptrace_release_task(p); 166 tracehook_finish_release_task(p);
176 __exit_signal(p); 167 __exit_signal(p);
177 168
178 /* 169 /*
@@ -194,6 +185,13 @@ repeat:
194 * that case. 185 * that case.
195 */ 186 */
196 zap_leader = task_detached(leader); 187 zap_leader = task_detached(leader);
188
189 /*
190 * This maintains the invariant that release_task()
191 * only runs on a task in EXIT_DEAD, just for sanity.
192 */
193 if (zap_leader)
194 leader->exit_state = EXIT_DEAD;
197 } 195 }
198 196
199 write_unlock_irq(&tasklist_lock); 197 write_unlock_irq(&tasklist_lock);
@@ -432,7 +430,7 @@ void daemonize(const char *name, ...)
432 * We don't want to have TIF_FREEZE set if the system-wide hibernation 430 * We don't want to have TIF_FREEZE set if the system-wide hibernation
433 * or suspend transition begins right now. 431 * or suspend transition begins right now.
434 */ 432 */
435 current->flags |= PF_NOFREEZE; 433 current->flags |= (PF_NOFREEZE | PF_KTHREAD);
436 434
437 if (current->nsproxy != &init_nsproxy) { 435 if (current->nsproxy != &init_nsproxy) {
438 get_nsproxy(&init_nsproxy); 436 get_nsproxy(&init_nsproxy);
@@ -557,8 +555,6 @@ void put_fs_struct(struct fs_struct *fs)
557 if (atomic_dec_and_test(&fs->count)) { 555 if (atomic_dec_and_test(&fs->count)) {
558 path_put(&fs->root); 556 path_put(&fs->root);
559 path_put(&fs->pwd); 557 path_put(&fs->pwd);
560 if (fs->altroot.dentry)
561 path_put(&fs->altroot);
562 kmem_cache_free(fs_cachep, fs); 558 kmem_cache_free(fs_cachep, fs);
563 } 559 }
564} 560}
@@ -588,8 +584,6 @@ mm_need_new_owner(struct mm_struct *mm, struct task_struct *p)
588 * If there are other users of the mm and the owner (us) is exiting 584 * If there are other users of the mm and the owner (us) is exiting
589 * we need to find a new owner to take on the responsibility. 585 * we need to find a new owner to take on the responsibility.
590 */ 586 */
591 if (!mm)
592 return 0;
593 if (atomic_read(&mm->mm_users) <= 1) 587 if (atomic_read(&mm->mm_users) <= 1)
594 return 0; 588 return 0;
595 if (mm->owner != p) 589 if (mm->owner != p)
@@ -632,29 +626,38 @@ retry:
632 } while_each_thread(g, c); 626 } while_each_thread(g, c);
633 627
634 read_unlock(&tasklist_lock); 628 read_unlock(&tasklist_lock);
629 /*
630 * We found no owner yet mm_users > 1: this implies that we are
631 * most likely racing with swapoff (try_to_unuse()) or /proc or
632 * ptrace or page migration (get_task_mm()). Mark owner as NULL,
633 * so that subsystems can understand the callback and take action.
634 */
635 down_write(&mm->mmap_sem);
636 cgroup_mm_owner_callbacks(mm->owner, NULL);
637 mm->owner = NULL;
638 up_write(&mm->mmap_sem);
635 return; 639 return;
636 640
637assign_new_owner: 641assign_new_owner:
638 BUG_ON(c == p); 642 BUG_ON(c == p);
639 get_task_struct(c); 643 get_task_struct(c);
644 read_unlock(&tasklist_lock);
645 down_write(&mm->mmap_sem);
640 /* 646 /*
641 * The task_lock protects c->mm from changing. 647 * The task_lock protects c->mm from changing.
642 * We always want mm->owner->mm == mm 648 * We always want mm->owner->mm == mm
643 */ 649 */
644 task_lock(c); 650 task_lock(c);
645 /*
646 * Delay read_unlock() till we have the task_lock()
647 * to ensure that c does not slip away underneath us
648 */
649 read_unlock(&tasklist_lock);
650 if (c->mm != mm) { 651 if (c->mm != mm) {
651 task_unlock(c); 652 task_unlock(c);
653 up_write(&mm->mmap_sem);
652 put_task_struct(c); 654 put_task_struct(c);
653 goto retry; 655 goto retry;
654 } 656 }
655 cgroup_mm_owner_callbacks(mm->owner, c); 657 cgroup_mm_owner_callbacks(mm->owner, c);
656 mm->owner = c; 658 mm->owner = c;
657 task_unlock(c); 659 task_unlock(c);
660 up_write(&mm->mmap_sem);
658 put_task_struct(c); 661 put_task_struct(c);
659} 662}
660#endif /* CONFIG_MM_OWNER */ 663#endif /* CONFIG_MM_OWNER */
@@ -666,26 +669,40 @@ assign_new_owner:
666static void exit_mm(struct task_struct * tsk) 669static void exit_mm(struct task_struct * tsk)
667{ 670{
668 struct mm_struct *mm = tsk->mm; 671 struct mm_struct *mm = tsk->mm;
672 struct core_state *core_state;
669 673
670 mm_release(tsk, mm); 674 mm_release(tsk, mm);
671 if (!mm) 675 if (!mm)
672 return; 676 return;
673 /* 677 /*
674 * Serialize with any possible pending coredump. 678 * Serialize with any possible pending coredump.
675 * We must hold mmap_sem around checking core_waiters 679 * We must hold mmap_sem around checking core_state
676 * and clearing tsk->mm. The core-inducing thread 680 * and clearing tsk->mm. The core-inducing thread
677 * will increment core_waiters for each thread in the 681 * will increment ->nr_threads for each thread in the
678 * group with ->mm != NULL. 682 * group with ->mm != NULL.
679 */ 683 */
680 down_read(&mm->mmap_sem); 684 down_read(&mm->mmap_sem);
681 if (mm->core_waiters) { 685 core_state = mm->core_state;
686 if (core_state) {
687 struct core_thread self;
682 up_read(&mm->mmap_sem); 688 up_read(&mm->mmap_sem);
683 down_write(&mm->mmap_sem);
684 if (!--mm->core_waiters)
685 complete(mm->core_startup_done);
686 up_write(&mm->mmap_sem);
687 689
688 wait_for_completion(&mm->core_done); 690 self.task = tsk;
691 self.next = xchg(&core_state->dumper.next, &self);
692 /*
693 * Implies mb(), the result of xchg() must be visible
694 * to core_state->dumper.
695 */
696 if (atomic_dec_and_test(&core_state->nr_threads))
697 complete(&core_state->startup);
698
699 for (;;) {
700 set_task_state(tsk, TASK_UNINTERRUPTIBLE);
701 if (!self.task) /* see coredump_finish() */
702 break;
703 schedule();
704 }
705 __set_task_state(tsk, TASK_RUNNING);
689 down_read(&mm->mmap_sem); 706 down_read(&mm->mmap_sem);
690 } 707 }
691 atomic_inc(&mm->mm_count); 708 atomic_inc(&mm->mm_count);
@@ -822,26 +839,50 @@ static void reparent_thread(struct task_struct *p, struct task_struct *father)
822 * the child reaper process (ie "init") in our pid 839 * the child reaper process (ie "init") in our pid
823 * space. 840 * space.
824 */ 841 */
842static struct task_struct *find_new_reaper(struct task_struct *father)
843{
844 struct pid_namespace *pid_ns = task_active_pid_ns(father);
845 struct task_struct *thread;
846
847 thread = father;
848 while_each_thread(father, thread) {
849 if (thread->flags & PF_EXITING)
850 continue;
851 if (unlikely(pid_ns->child_reaper == father))
852 pid_ns->child_reaper = thread;
853 return thread;
854 }
855
856 if (unlikely(pid_ns->child_reaper == father)) {
857 write_unlock_irq(&tasklist_lock);
858 if (unlikely(pid_ns == &init_pid_ns))
859 panic("Attempted to kill init!");
860
861 zap_pid_ns_processes(pid_ns);
862 write_lock_irq(&tasklist_lock);
863 /*
864 * We can not clear ->child_reaper or leave it alone.
 865 * There may be stealth EXIT_DEAD tasks on ->children,
866 * forget_original_parent() must move them somewhere.
867 */
868 pid_ns->child_reaper = init_pid_ns.child_reaper;
869 }
870
871 return pid_ns->child_reaper;
872}
873
825static void forget_original_parent(struct task_struct *father) 874static void forget_original_parent(struct task_struct *father)
826{ 875{
827 struct task_struct *p, *n, *reaper = father; 876 struct task_struct *p, *n, *reaper;
828 LIST_HEAD(ptrace_dead); 877 LIST_HEAD(ptrace_dead);
829 878
830 write_lock_irq(&tasklist_lock); 879 write_lock_irq(&tasklist_lock);
831 880 reaper = find_new_reaper(father);
832 /* 881 /*
833 * First clean up ptrace if we were using it. 882 * First clean up ptrace if we were using it.
834 */ 883 */
835 ptrace_exit(father, &ptrace_dead); 884 ptrace_exit(father, &ptrace_dead);
836 885
837 do {
838 reaper = next_thread(reaper);
839 if (reaper == father) {
840 reaper = task_child_reaper(father);
841 break;
842 }
843 } while (reaper->flags & PF_EXITING);
844
845 list_for_each_entry_safe(p, n, &father->children, sibling) { 886 list_for_each_entry_safe(p, n, &father->children, sibling) {
846 p->real_parent = reaper; 887 p->real_parent = reaper;
847 if (p->parent == father) { 888 if (p->parent == father) {
@@ -863,7 +904,8 @@ static void forget_original_parent(struct task_struct *father)
863 */ 904 */
864static void exit_notify(struct task_struct *tsk, int group_dead) 905static void exit_notify(struct task_struct *tsk, int group_dead)
865{ 906{
866 int state; 907 int signal;
908 void *cookie;
867 909
868 /* 910 /*
869 * This does two things: 911 * This does two things:
@@ -900,33 +942,24 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
900 !capable(CAP_KILL)) 942 !capable(CAP_KILL))
901 tsk->exit_signal = SIGCHLD; 943 tsk->exit_signal = SIGCHLD;
902 944
903 /* If something other than our normal parent is ptracing us, then 945 signal = tracehook_notify_death(tsk, &cookie, group_dead);
904 * send it a SIGCHLD instead of honoring exit_signal. exit_signal 946 if (signal >= 0)
905 * only has special meaning to our real parent. 947 signal = do_notify_parent(tsk, signal);
906 */
907 if (!task_detached(tsk) && thread_group_empty(tsk)) {
908 int signal = ptrace_reparented(tsk) ?
909 SIGCHLD : tsk->exit_signal;
910 do_notify_parent(tsk, signal);
911 } else if (tsk->ptrace) {
912 do_notify_parent(tsk, SIGCHLD);
913 }
914 948
915 state = EXIT_ZOMBIE; 949 tsk->exit_state = signal == DEATH_REAP ? EXIT_DEAD : EXIT_ZOMBIE;
916 if (task_detached(tsk) && likely(!tsk->ptrace))
917 state = EXIT_DEAD;
918 tsk->exit_state = state;
919 950
920 /* mt-exec, de_thread() is waiting for us */ 951 /* mt-exec, de_thread() is waiting for us */
921 if (thread_group_leader(tsk) && 952 if (thread_group_leader(tsk) &&
922 tsk->signal->notify_count < 0 && 953 tsk->signal->group_exit_task &&
923 tsk->signal->group_exit_task) 954 tsk->signal->notify_count < 0)
924 wake_up_process(tsk->signal->group_exit_task); 955 wake_up_process(tsk->signal->group_exit_task);
925 956
926 write_unlock_irq(&tasklist_lock); 957 write_unlock_irq(&tasklist_lock);
927 958
959 tracehook_report_death(tsk, signal, cookie, group_dead);
960
928 /* If the process is dead, release it - nobody will wait for it */ 961 /* If the process is dead, release it - nobody will wait for it */
929 if (state == EXIT_DEAD) 962 if (signal == DEATH_REAP)
930 release_task(tsk); 963 release_task(tsk);
931} 964}
932 965
@@ -958,39 +991,6 @@ static void check_stack_usage(void)
958static inline void check_stack_usage(void) {} 991static inline void check_stack_usage(void) {}
959#endif 992#endif
960 993
961static inline void exit_child_reaper(struct task_struct *tsk)
962{
963 if (likely(tsk->group_leader != task_child_reaper(tsk)))
964 return;
965
966 if (tsk->nsproxy->pid_ns == &init_pid_ns)
967 panic("Attempted to kill init!");
968
969 /*
970 * @tsk is the last thread in the 'cgroup-init' and is exiting.
971 * Terminate all remaining processes in the namespace and reap them
972 * before exiting @tsk.
973 *
974 * Note that @tsk (last thread of cgroup-init) may not necessarily
975 * be the child-reaper (i.e main thread of cgroup-init) of the
976 * namespace i.e the child_reaper may have already exited.
977 *
978 * Even after a child_reaper exits, we let it inherit orphaned children,
979 * because, pid_ns->child_reaper remains valid as long as there is
980 * at least one living sub-thread in the cgroup init.
981
982 * This living sub-thread of the cgroup-init will be notified when
983 * a child inherited by the 'child-reaper' exits (do_notify_parent()
984 * uses __group_send_sig_info()). Further, when reaping child processes,
985 * do_wait() iterates over children of all living sub threads.
986
987 * i.e even though 'child_reaper' thread is listed as the parent of the
988 * orphaned children, any living sub-thread in the cgroup-init can
989 * perform the role of the child_reaper.
990 */
991 zap_pid_ns_processes(tsk->nsproxy->pid_ns);
992}
993
994NORET_TYPE void do_exit(long code) 994NORET_TYPE void do_exit(long code)
995{ 995{
996 struct task_struct *tsk = current; 996 struct task_struct *tsk = current;
@@ -1005,10 +1005,7 @@ NORET_TYPE void do_exit(long code)
1005 if (unlikely(!tsk->pid)) 1005 if (unlikely(!tsk->pid))
1006 panic("Attempted to kill the idle task!"); 1006 panic("Attempted to kill the idle task!");
1007 1007
1008 if (unlikely(current->ptrace & PT_TRACE_EXIT)) { 1008 tracehook_report_exit(&code);
1009 current->ptrace_message = code;
1010 ptrace_notify((PTRACE_EVENT_EXIT << 8) | SIGTRAP);
1011 }
1012 1009
1013 /* 1010 /*
1014 * We're taking recursive faults here in do_exit. Safest is to just 1011 * We're taking recursive faults here in do_exit. Safest is to just
@@ -1053,7 +1050,6 @@ NORET_TYPE void do_exit(long code)
1053 } 1050 }
1054 group_dead = atomic_dec_and_test(&tsk->signal->live); 1051 group_dead = atomic_dec_and_test(&tsk->signal->live);
1055 if (group_dead) { 1052 if (group_dead) {
1056 exit_child_reaper(tsk);
1057 hrtimer_cancel(&tsk->signal->real_timer); 1053 hrtimer_cancel(&tsk->signal->real_timer);
1058 exit_itimers(tsk->signal); 1054 exit_itimers(tsk->signal);
1059 } 1055 }
@@ -1078,6 +1074,8 @@ NORET_TYPE void do_exit(long code)
1078 1074
1079 if (group_dead) 1075 if (group_dead)
1080 acct_process(); 1076 acct_process();
1077 trace_sched_process_exit(tsk);
1078
1081 exit_sem(tsk); 1079 exit_sem(tsk);
1082 exit_files(tsk); 1080 exit_files(tsk);
1083 exit_fs(tsk); 1081 exit_fs(tsk);
@@ -1306,6 +1304,7 @@ static int wait_task_zombie(struct task_struct *p, int options,
1306 if (likely(!traced)) { 1304 if (likely(!traced)) {
1307 struct signal_struct *psig; 1305 struct signal_struct *psig;
1308 struct signal_struct *sig; 1306 struct signal_struct *sig;
1307 struct task_cputime cputime;
1309 1308
1310 /* 1309 /*
1311 * The resource counters for the group leader are in its 1310 * The resource counters for the group leader are in its
@@ -1321,20 +1320,23 @@ static int wait_task_zombie(struct task_struct *p, int options,
1321 * need to protect the access to p->parent->signal fields, 1320 * need to protect the access to p->parent->signal fields,
1322 * as other threads in the parent group can be right 1321 * as other threads in the parent group can be right
1323 * here reaping other children at the same time. 1322 * here reaping other children at the same time.
1323 *
1324 * We use thread_group_cputime() to get times for the thread
1325 * group, which consolidates times for all threads in the
1326 * group including the group leader.
1324 */ 1327 */
1325 spin_lock_irq(&p->parent->sighand->siglock); 1328 spin_lock_irq(&p->parent->sighand->siglock);
1326 psig = p->parent->signal; 1329 psig = p->parent->signal;
1327 sig = p->signal; 1330 sig = p->signal;
1331 thread_group_cputime(p, &cputime);
1328 psig->cutime = 1332 psig->cutime =
1329 cputime_add(psig->cutime, 1333 cputime_add(psig->cutime,
1330 cputime_add(p->utime, 1334 cputime_add(cputime.utime,
1331 cputime_add(sig->utime, 1335 sig->cutime));
1332 sig->cutime)));
1333 psig->cstime = 1336 psig->cstime =
1334 cputime_add(psig->cstime, 1337 cputime_add(psig->cstime,
1335 cputime_add(p->stime, 1338 cputime_add(cputime.stime,
1336 cputime_add(sig->stime, 1339 sig->cstime));
1337 sig->cstime)));
1338 psig->cgtime = 1340 psig->cgtime =
1339 cputime_add(psig->cgtime, 1341 cputime_add(psig->cgtime,
1340 cputime_add(p->gtime, 1342 cputime_add(p->gtime,
@@ -1354,6 +1356,8 @@ static int wait_task_zombie(struct task_struct *p, int options,
1354 psig->coublock += 1356 psig->coublock +=
1355 task_io_get_oublock(p) + 1357 task_io_get_oublock(p) +
1356 sig->oublock + sig->coublock; 1358 sig->oublock + sig->coublock;
1359 task_io_accounting_add(&psig->ioac, &p->ioac);
1360 task_io_accounting_add(&psig->ioac, &sig->ioac);
1357 spin_unlock_irq(&p->parent->sighand->siglock); 1361 spin_unlock_irq(&p->parent->sighand->siglock);
1358 } 1362 }
1359 1363
@@ -1677,6 +1681,8 @@ static long do_wait(enum pid_type type, struct pid *pid, int options,
1677 struct task_struct *tsk; 1681 struct task_struct *tsk;
1678 int retval; 1682 int retval;
1679 1683
1684 trace_sched_process_wait(pid);
1685
1680 add_wait_queue(&current->signal->wait_chldexit,&wait); 1686 add_wait_queue(&current->signal->wait_chldexit,&wait);
1681repeat: 1687repeat:
1682 /* 1688 /*
diff --git a/kernel/fork.c b/kernel/fork.c
index adefc1131f27..f6083561dfe0 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -27,15 +27,18 @@
27#include <linux/key.h> 27#include <linux/key.h>
28#include <linux/binfmts.h> 28#include <linux/binfmts.h>
29#include <linux/mman.h> 29#include <linux/mman.h>
30#include <linux/mmu_notifier.h>
30#include <linux/fs.h> 31#include <linux/fs.h>
31#include <linux/nsproxy.h> 32#include <linux/nsproxy.h>
32#include <linux/capability.h> 33#include <linux/capability.h>
33#include <linux/cpu.h> 34#include <linux/cpu.h>
34#include <linux/cgroup.h> 35#include <linux/cgroup.h>
35#include <linux/security.h> 36#include <linux/security.h>
37#include <linux/hugetlb.h>
36#include <linux/swap.h> 38#include <linux/swap.h>
37#include <linux/syscalls.h> 39#include <linux/syscalls.h>
38#include <linux/jiffies.h> 40#include <linux/jiffies.h>
41#include <linux/tracehook.h>
39#include <linux/futex.h> 42#include <linux/futex.h>
40#include <linux/task_io_accounting_ops.h> 43#include <linux/task_io_accounting_ops.h>
41#include <linux/rcupdate.h> 44#include <linux/rcupdate.h>
@@ -55,6 +58,7 @@
55#include <linux/tty.h> 58#include <linux/tty.h>
56#include <linux/proc_fs.h> 59#include <linux/proc_fs.h>
57#include <linux/blkdev.h> 60#include <linux/blkdev.h>
61#include <trace/sched.h>
58 62
59#include <asm/pgtable.h> 63#include <asm/pgtable.h>
60#include <asm/pgalloc.h> 64#include <asm/pgalloc.h>
@@ -92,6 +96,23 @@ int nr_processes(void)
92static struct kmem_cache *task_struct_cachep; 96static struct kmem_cache *task_struct_cachep;
93#endif 97#endif
94 98
99#ifndef __HAVE_ARCH_THREAD_INFO_ALLOCATOR
100static inline struct thread_info *alloc_thread_info(struct task_struct *tsk)
101{
102#ifdef CONFIG_DEBUG_STACK_USAGE
103 gfp_t mask = GFP_KERNEL | __GFP_ZERO;
104#else
105 gfp_t mask = GFP_KERNEL;
106#endif
107 return (struct thread_info *)__get_free_pages(mask, THREAD_SIZE_ORDER);
108}
109
110static inline void free_thread_info(struct thread_info *ti)
111{
112 free_pages((unsigned long)ti, THREAD_SIZE_ORDER);
113}
114#endif
115
95/* SLAB cache for signal_struct structures (tsk->signal) */ 116/* SLAB cache for signal_struct structures (tsk->signal) */
96static struct kmem_cache *signal_cachep; 117static struct kmem_cache *signal_cachep;
97 118
@@ -307,6 +328,14 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
307 } 328 }
308 329
309 /* 330 /*
331 * Clear hugetlb-related page reserves for children. This only
332 * affects MAP_PRIVATE mappings. Faults generated by the child
333 * are not guaranteed to succeed, even if read-only
334 */
335 if (is_vm_hugetlb_page(tmp))
336 reset_vma_resv_huge_pages(tmp);
337
338 /*
310 * Link in the new vma and copy the page table entries. 339 * Link in the new vma and copy the page table entries.
311 */ 340 */
312 *pprev = tmp; 341 *pprev = tmp;
@@ -374,7 +403,7 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
374 INIT_LIST_HEAD(&mm->mmlist); 403 INIT_LIST_HEAD(&mm->mmlist);
375 mm->flags = (current->mm) ? current->mm->flags 404 mm->flags = (current->mm) ? current->mm->flags
376 : MMF_DUMP_FILTER_DEFAULT; 405 : MMF_DUMP_FILTER_DEFAULT;
377 mm->core_waiters = 0; 406 mm->core_state = NULL;
378 mm->nr_ptes = 0; 407 mm->nr_ptes = 0;
379 set_mm_counter(mm, file_rss, 0); 408 set_mm_counter(mm, file_rss, 0);
380 set_mm_counter(mm, anon_rss, 0); 409 set_mm_counter(mm, anon_rss, 0);
@@ -387,6 +416,7 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
387 416
388 if (likely(!mm_alloc_pgd(mm))) { 417 if (likely(!mm_alloc_pgd(mm))) {
389 mm->def_flags = 0; 418 mm->def_flags = 0;
419 mmu_notifier_mm_init(mm);
390 return mm; 420 return mm;
391 } 421 }
392 422
@@ -419,6 +449,7 @@ void __mmdrop(struct mm_struct *mm)
419 BUG_ON(mm == &init_mm); 449 BUG_ON(mm == &init_mm);
420 mm_free_pgd(mm); 450 mm_free_pgd(mm);
421 destroy_context(mm); 451 destroy_context(mm);
452 mmu_notifier_mm_destroy(mm);
422 free_mm(mm); 453 free_mm(mm);
423} 454}
424EXPORT_SYMBOL_GPL(__mmdrop); 455EXPORT_SYMBOL_GPL(__mmdrop);
@@ -448,7 +479,7 @@ EXPORT_SYMBOL_GPL(mmput);
448/** 479/**
449 * get_task_mm - acquire a reference to the task's mm 480 * get_task_mm - acquire a reference to the task's mm
450 * 481 *
451 * Returns %NULL if the task has no mm. Checks PF_BORROWED_MM (meaning 482 * Returns %NULL if the task has no mm. Checks PF_KTHREAD (meaning
452 * this kernel workthread has transiently adopted a user mm with use_mm, 483 * this kernel workthread has transiently adopted a user mm with use_mm,
453 * to do its AIO) is not set and if so returns a reference to it, after 484 * to do its AIO) is not set and if so returns a reference to it, after
454 * bumping up the use count. User must release the mm via mmput() 485 * bumping up the use count. User must release the mm via mmput()
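As a usage note for the comment above, a hedged sketch of the usual get_task_mm()/mmput() pairing; the example_ helper is hypothetical and not part of this patch.

#include <linux/mm_types.h>
#include <linux/sched.h>

static unsigned long example_total_vm(struct task_struct *task)
{
	struct mm_struct *mm = get_task_mm(task);	/* NULL for kernel threads */
	unsigned long total = 0;

	if (mm) {
		total = mm->total_vm;	/* safe: we hold a reference on the mm */
		mmput(mm);		/* drop the reference from get_task_mm() */
	}
	return total;
}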
@@ -461,7 +492,7 @@ struct mm_struct *get_task_mm(struct task_struct *task)
461 task_lock(task); 492 task_lock(task);
462 mm = task->mm; 493 mm = task->mm;
463 if (mm) { 494 if (mm) {
464 if (task->flags & PF_BORROWED_MM) 495 if (task->flags & PF_KTHREAD)
465 mm = NULL; 496 mm = NULL;
466 else 497 else
467 atomic_inc(&mm->mm_users); 498 atomic_inc(&mm->mm_users);
@@ -630,13 +661,6 @@ static struct fs_struct *__copy_fs_struct(struct fs_struct *old)
630 path_get(&old->root); 661 path_get(&old->root);
631 fs->pwd = old->pwd; 662 fs->pwd = old->pwd;
632 path_get(&old->pwd); 663 path_get(&old->pwd);
633 if (old->altroot.dentry) {
634 fs->altroot = old->altroot;
635 path_get(&old->altroot);
636 } else {
637 fs->altroot.mnt = NULL;
638 fs->altroot.dentry = NULL;
639 }
640 read_unlock(&old->lock); 664 read_unlock(&old->lock);
641 } 665 }
642 return fs; 666 return fs;
@@ -736,15 +760,44 @@ void __cleanup_sighand(struct sighand_struct *sighand)
736 kmem_cache_free(sighand_cachep, sighand); 760 kmem_cache_free(sighand_cachep, sighand);
737} 761}
738 762
763
764/*
765 * Initialize POSIX timer handling for a thread group.
766 */
767static void posix_cpu_timers_init_group(struct signal_struct *sig)
768{
769 /* Thread group counters. */
770 thread_group_cputime_init(sig);
771
772 /* Expiration times and increments. */
773 sig->it_virt_expires = cputime_zero;
774 sig->it_virt_incr = cputime_zero;
775 sig->it_prof_expires = cputime_zero;
776 sig->it_prof_incr = cputime_zero;
777
778 /* Cached expiration times. */
779 sig->cputime_expires.prof_exp = cputime_zero;
780 sig->cputime_expires.virt_exp = cputime_zero;
781 sig->cputime_expires.sched_exp = 0;
782
783 /* The timer lists. */
784 INIT_LIST_HEAD(&sig->cpu_timers[0]);
785 INIT_LIST_HEAD(&sig->cpu_timers[1]);
786 INIT_LIST_HEAD(&sig->cpu_timers[2]);
787}
788
739static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) 789static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
740{ 790{
741 struct signal_struct *sig; 791 struct signal_struct *sig;
742 int ret; 792 int ret;
743 793
744 if (clone_flags & CLONE_THREAD) { 794 if (clone_flags & CLONE_THREAD) {
745 atomic_inc(&current->signal->count); 795 ret = thread_group_cputime_clone_thread(current);
746 atomic_inc(&current->signal->live); 796 if (likely(!ret)) {
747 return 0; 797 atomic_inc(&current->signal->count);
798 atomic_inc(&current->signal->live);
799 }
800 return ret;
748 } 801 }
749 sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL); 802 sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL);
750 tsk->signal = sig; 803 tsk->signal = sig;
@@ -772,38 +825,25 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
772 sig->it_real_incr.tv64 = 0; 825 sig->it_real_incr.tv64 = 0;
773 sig->real_timer.function = it_real_fn; 826 sig->real_timer.function = it_real_fn;
774 827
775 sig->it_virt_expires = cputime_zero;
776 sig->it_virt_incr = cputime_zero;
777 sig->it_prof_expires = cputime_zero;
778 sig->it_prof_incr = cputime_zero;
779
780 sig->leader = 0; /* session leadership doesn't inherit */ 828 sig->leader = 0; /* session leadership doesn't inherit */
781 sig->tty_old_pgrp = NULL; 829 sig->tty_old_pgrp = NULL;
830 sig->tty = NULL;
782 831
783 sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero; 832 sig->cutime = sig->cstime = cputime_zero;
784 sig->gtime = cputime_zero; 833 sig->gtime = cputime_zero;
785 sig->cgtime = cputime_zero; 834 sig->cgtime = cputime_zero;
786 sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0; 835 sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0;
787 sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0; 836 sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0;
788 sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0; 837 sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0;
789 sig->sum_sched_runtime = 0; 838 task_io_accounting_init(&sig->ioac);
790 INIT_LIST_HEAD(&sig->cpu_timers[0]);
791 INIT_LIST_HEAD(&sig->cpu_timers[1]);
792 INIT_LIST_HEAD(&sig->cpu_timers[2]);
793 taskstats_tgid_init(sig); 839 taskstats_tgid_init(sig);
794 840
795 task_lock(current->group_leader); 841 task_lock(current->group_leader);
796 memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim); 842 memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim);
797 task_unlock(current->group_leader); 843 task_unlock(current->group_leader);
798 844
799 if (sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY) { 845 posix_cpu_timers_init_group(sig);
800 /* 846
801 * New sole thread in the process gets an expiry time
802 * of the whole CPU time limit.
803 */
804 tsk->it_prof_expires =
805 secs_to_cputime(sig->rlim[RLIMIT_CPU].rlim_cur);
806 }
807 acct_init_pacct(&sig->pacct); 847 acct_init_pacct(&sig->pacct);
808 848
809 tty_audit_fork(sig); 849 tty_audit_fork(sig);
@@ -813,7 +853,9 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
813 853
814void __cleanup_signal(struct signal_struct *sig) 854void __cleanup_signal(struct signal_struct *sig)
815{ 855{
856 thread_group_cputime_free(sig);
816 exit_thread_group_keys(sig); 857 exit_thread_group_keys(sig);
858 tty_kref_put(sig->tty);
817 kmem_cache_free(signal_cachep, sig); 859 kmem_cache_free(signal_cachep, sig);
818} 860}
819 861
@@ -833,8 +875,7 @@ static void copy_flags(unsigned long clone_flags, struct task_struct *p)
833 875
834 new_flags &= ~PF_SUPERPRIV; 876 new_flags &= ~PF_SUPERPRIV;
835 new_flags |= PF_FORKNOEXEC; 877 new_flags |= PF_FORKNOEXEC;
836 if (!(clone_flags & CLONE_PTRACE)) 878 new_flags |= PF_STARTING;
837 p->ptrace = 0;
838 p->flags = new_flags; 879 p->flags = new_flags;
839 clear_freeze_flag(p); 880 clear_freeze_flag(p);
840} 881}
@@ -863,6 +904,19 @@ void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
863#endif /* CONFIG_MM_OWNER */ 904#endif /* CONFIG_MM_OWNER */
864 905
865/* 906/*
907 * Initialize POSIX timer handling for a single task.
908 */
909static void posix_cpu_timers_init(struct task_struct *tsk)
910{
911 tsk->cputime_expires.prof_exp = cputime_zero;
912 tsk->cputime_expires.virt_exp = cputime_zero;
913 tsk->cputime_expires.sched_exp = 0;
914 INIT_LIST_HEAD(&tsk->cpu_timers[0]);
915 INIT_LIST_HEAD(&tsk->cpu_timers[1]);
916 INIT_LIST_HEAD(&tsk->cpu_timers[2]);
917}
918
919/*
866 * This creates a new process as a copy of the old one, 920 * This creates a new process as a copy of the old one,
867 * but does not actually start it yet. 921 * but does not actually start it yet.
868 * 922 *
@@ -875,7 +929,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
875 struct pt_regs *regs, 929 struct pt_regs *regs,
876 unsigned long stack_size, 930 unsigned long stack_size,
877 int __user *child_tidptr, 931 int __user *child_tidptr,
878 struct pid *pid) 932 struct pid *pid,
933 int trace)
879{ 934{
880 int retval; 935 int retval;
881 struct task_struct *p; 936 struct task_struct *p;
@@ -963,26 +1018,17 @@ static struct task_struct *copy_process(unsigned long clone_flags,
963 p->prev_utime = cputime_zero; 1018 p->prev_utime = cputime_zero;
964 p->prev_stime = cputime_zero; 1019 p->prev_stime = cputime_zero;
965 1020
1021 p->default_timer_slack_ns = current->timer_slack_ns;
1022
966#ifdef CONFIG_DETECT_SOFTLOCKUP 1023#ifdef CONFIG_DETECT_SOFTLOCKUP
967 p->last_switch_count = 0; 1024 p->last_switch_count = 0;
968 p->last_switch_timestamp = 0; 1025 p->last_switch_timestamp = 0;
969#endif 1026#endif
970 1027
971#ifdef CONFIG_TASK_XACCT 1028 task_io_accounting_init(&p->ioac);
972 p->rchar = 0; /* I/O counter: bytes read */
973 p->wchar = 0; /* I/O counter: bytes written */
974 p->syscr = 0; /* I/O counter: read syscalls */
975 p->syscw = 0; /* I/O counter: write syscalls */
976#endif
977 task_io_accounting_init(p);
978 acct_clear_integrals(p); 1029 acct_clear_integrals(p);
979 1030
980 p->it_virt_expires = cputime_zero; 1031 posix_cpu_timers_init(p);
981 p->it_prof_expires = cputime_zero;
982 p->it_sched_expires = 0;
983 INIT_LIST_HEAD(&p->cpu_timers[0]);
984 INIT_LIST_HEAD(&p->cpu_timers[1]);
985 INIT_LIST_HEAD(&p->cpu_timers[2]);
986 1032
987 p->lock_depth = -1; /* -1 = no lock */ 1033 p->lock_depth = -1; /* -1 = no lock */
988 do_posix_clock_monotonic_gettime(&p->start_time); 1034 do_posix_clock_monotonic_gettime(&p->start_time);
@@ -1081,6 +1127,12 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1081 if (clone_flags & CLONE_THREAD) 1127 if (clone_flags & CLONE_THREAD)
1082 p->tgid = current->tgid; 1128 p->tgid = current->tgid;
1083 1129
1130 if (current->nsproxy != p->nsproxy) {
1131 retval = ns_cgroup_clone(p, pid);
1132 if (retval)
1133 goto bad_fork_free_pid;
1134 }
1135
1084 p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL; 1136 p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
1085 /* 1137 /*
1086 * Clear TID on mm_release()? 1138 * Clear TID on mm_release()?
@@ -1125,8 +1177,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1125 */ 1177 */
1126 p->group_leader = p; 1178 p->group_leader = p;
1127 INIT_LIST_HEAD(&p->thread_group); 1179 INIT_LIST_HEAD(&p->thread_group);
1128 INIT_LIST_HEAD(&p->ptrace_entry);
1129 INIT_LIST_HEAD(&p->ptraced);
1130 1180
1131 /* Now that the task is set up, run cgroup callbacks if 1181 /* Now that the task is set up, run cgroup callbacks if
1132 * necessary. We need to run them before the task is visible 1182 * necessary. We need to run them before the task is visible
@@ -1157,7 +1207,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1157 p->real_parent = current->real_parent; 1207 p->real_parent = current->real_parent;
1158 else 1208 else
1159 p->real_parent = current; 1209 p->real_parent = current;
1160 p->parent = p->real_parent;
1161 1210
1162 spin_lock(&current->sighand->siglock); 1211 spin_lock(&current->sighand->siglock);
1163 1212
@@ -1180,34 +1229,19 @@ static struct task_struct *copy_process(unsigned long clone_flags,
1180 if (clone_flags & CLONE_THREAD) { 1229 if (clone_flags & CLONE_THREAD) {
1181 p->group_leader = current->group_leader; 1230 p->group_leader = current->group_leader;
1182 list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group); 1231 list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group);
1183
1184 if (!cputime_eq(current->signal->it_virt_expires,
1185 cputime_zero) ||
1186 !cputime_eq(current->signal->it_prof_expires,
1187 cputime_zero) ||
1188 current->signal->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY ||
1189 !list_empty(&current->signal->cpu_timers[0]) ||
1190 !list_empty(&current->signal->cpu_timers[1]) ||
1191 !list_empty(&current->signal->cpu_timers[2])) {
1192 /*
1193 * Have child wake up on its first tick to check
1194 * for process CPU timers.
1195 */
1196 p->it_prof_expires = jiffies_to_cputime(1);
1197 }
1198 } 1232 }
1199 1233
1200 if (likely(p->pid)) { 1234 if (likely(p->pid)) {
1201 list_add_tail(&p->sibling, &p->real_parent->children); 1235 list_add_tail(&p->sibling, &p->real_parent->children);
1202 if (unlikely(p->ptrace & PT_PTRACED)) 1236 tracehook_finish_clone(p, clone_flags, trace);
1203 __ptrace_link(p, current->parent);
1204 1237
1205 if (thread_group_leader(p)) { 1238 if (thread_group_leader(p)) {
1206 if (clone_flags & CLONE_NEWPID) 1239 if (clone_flags & CLONE_NEWPID)
1207 p->nsproxy->pid_ns->child_reaper = p; 1240 p->nsproxy->pid_ns->child_reaper = p;
1208 1241
1209 p->signal->leader_pid = pid; 1242 p->signal->leader_pid = pid;
1210 p->signal->tty = current->signal->tty; 1243 tty_kref_put(p->signal->tty);
1244 p->signal->tty = tty_kref_get(current->signal->tty);
1211 set_task_pgrp(p, task_pgrp_nr(current)); 1245 set_task_pgrp(p, task_pgrp_nr(current));
1212 set_task_session(p, task_session_nr(current)); 1246 set_task_session(p, task_session_nr(current));
1213 attach_pid(p, PIDTYPE_PGID, task_pgrp(current)); 1247 attach_pid(p, PIDTYPE_PGID, task_pgrp(current));
@@ -1285,29 +1319,13 @@ struct task_struct * __cpuinit fork_idle(int cpu)
1285 struct pt_regs regs; 1319 struct pt_regs regs;
1286 1320
1287 task = copy_process(CLONE_VM, 0, idle_regs(&regs), 0, NULL, 1321 task = copy_process(CLONE_VM, 0, idle_regs(&regs), 0, NULL,
1288 &init_struct_pid); 1322 &init_struct_pid, 0);
1289 if (!IS_ERR(task)) 1323 if (!IS_ERR(task))
1290 init_idle(task, cpu); 1324 init_idle(task, cpu);
1291 1325
1292 return task; 1326 return task;
1293} 1327}
1294 1328
1295static int fork_traceflag(unsigned clone_flags)
1296{
1297 if (clone_flags & CLONE_UNTRACED)
1298 return 0;
1299 else if (clone_flags & CLONE_VFORK) {
1300 if (current->ptrace & PT_TRACE_VFORK)
1301 return PTRACE_EVENT_VFORK;
1302 } else if ((clone_flags & CSIGNAL) != SIGCHLD) {
1303 if (current->ptrace & PT_TRACE_CLONE)
1304 return PTRACE_EVENT_CLONE;
1305 } else if (current->ptrace & PT_TRACE_FORK)
1306 return PTRACE_EVENT_FORK;
1307
1308 return 0;
1309}
1310
1311/* 1329/*
1312 * Ok, this is the main fork-routine. 1330 * Ok, this is the main fork-routine.
1313 * 1331 *
@@ -1342,14 +1360,14 @@ long do_fork(unsigned long clone_flags,
1342 } 1360 }
1343 } 1361 }
1344 1362
1345 if (unlikely(current->ptrace)) { 1363 /*
1346 trace = fork_traceflag (clone_flags); 1364 * When called from kernel_thread, don't do user tracing stuff.
1347 if (trace) 1365 */
1348 clone_flags |= CLONE_PTRACE; 1366 if (likely(user_mode(regs)))
1349 } 1367 trace = tracehook_prepare_clone(clone_flags);
1350 1368
1351 p = copy_process(clone_flags, stack_start, regs, stack_size, 1369 p = copy_process(clone_flags, stack_start, regs, stack_size,
1352 child_tidptr, NULL); 1370 child_tidptr, NULL, trace);
1353 /* 1371 /*
1354 * Do this prior waking up the new thread - the thread pointer 1372 * Do this prior waking up the new thread - the thread pointer
1355 * might get invalid after that point, if the thread exits quickly. 1373 * might get invalid after that point, if the thread exits quickly.
@@ -1357,6 +1375,8 @@ long do_fork(unsigned long clone_flags,
1357 if (!IS_ERR(p)) { 1375 if (!IS_ERR(p)) {
1358 struct completion vfork; 1376 struct completion vfork;
1359 1377
1378 trace_sched_process_fork(current, p);
1379
1360 nr = task_pid_vnr(p); 1380 nr = task_pid_vnr(p);
1361 1381
1362 if (clone_flags & CLONE_PARENT_SETTID) 1382 if (clone_flags & CLONE_PARENT_SETTID)
@@ -1367,32 +1387,35 @@ long do_fork(unsigned long clone_flags,
1367 init_completion(&vfork); 1387 init_completion(&vfork);
1368 } 1388 }
1369 1389
1370 if ((p->ptrace & PT_PTRACED) || (clone_flags & CLONE_STOPPED)) { 1390 tracehook_report_clone(trace, regs, clone_flags, nr, p);
1391
1392 /*
1393 * We set PF_STARTING at creation in case tracing wants to
1394 * use this to distinguish a fully live task from one that
1395 * hasn't gotten to tracehook_report_clone() yet. Now we
1396 * clear it and set the child going.
1397 */
1398 p->flags &= ~PF_STARTING;
1399
1400 if (unlikely(clone_flags & CLONE_STOPPED)) {
1371 /* 1401 /*
1372 * We'll start up with an immediate SIGSTOP. 1402 * We'll start up with an immediate SIGSTOP.
1373 */ 1403 */
1374 sigaddset(&p->pending.signal, SIGSTOP); 1404 sigaddset(&p->pending.signal, SIGSTOP);
1375 set_tsk_thread_flag(p, TIF_SIGPENDING); 1405 set_tsk_thread_flag(p, TIF_SIGPENDING);
1376 }
1377
1378 if (!(clone_flags & CLONE_STOPPED))
1379 wake_up_new_task(p, clone_flags);
1380 else
1381 __set_task_state(p, TASK_STOPPED); 1406 __set_task_state(p, TASK_STOPPED);
1382 1407 } else {
1383 if (unlikely (trace)) { 1408 wake_up_new_task(p, clone_flags);
1384 current->ptrace_message = nr;
1385 ptrace_notify ((trace << 8) | SIGTRAP);
1386 } 1409 }
1387 1410
1411 tracehook_report_clone_complete(trace, regs,
1412 clone_flags, nr, p);
1413
1388 if (clone_flags & CLONE_VFORK) { 1414 if (clone_flags & CLONE_VFORK) {
1389 freezer_do_not_count(); 1415 freezer_do_not_count();
1390 wait_for_completion(&vfork); 1416 wait_for_completion(&vfork);
1391 freezer_count(); 1417 freezer_count();
1392 if (unlikely (current->ptrace & PT_TRACE_VFORK_DONE)) { 1418 tracehook_report_vfork_done(p, nr);
1393 current->ptrace_message = nr;
1394 ptrace_notify ((PTRACE_EVENT_VFORK_DONE << 8) | SIGTRAP);
1395 }
1396 } 1419 }
1397 } else { 1420 } else {
1398 nr = PTR_ERR(p); 1421 nr = PTR_ERR(p);
@@ -1404,7 +1427,7 @@ long do_fork(unsigned long clone_flags,
1404#define ARCH_MIN_MMSTRUCT_ALIGN 0 1427#define ARCH_MIN_MMSTRUCT_ALIGN 0
1405#endif 1428#endif
1406 1429
1407static void sighand_ctor(struct kmem_cache *cachep, void *data) 1430static void sighand_ctor(void *data)
1408{ 1431{
1409 struct sighand_struct *sighand = data; 1432 struct sighand_struct *sighand = data;
1410 1433
diff --git a/kernel/freezer.c b/kernel/freezer.c
new file mode 100644
index 000000000000..ba6248b323ef
--- /dev/null
+++ b/kernel/freezer.c
@@ -0,0 +1,154 @@
1/*
2 * kernel/freezer.c - Function to freeze a process
3 *
4 * Originally from kernel/power/process.c
5 */
6
7#include <linux/interrupt.h>
8#include <linux/suspend.h>
9#include <linux/module.h>
10#include <linux/syscalls.h>
11#include <linux/freezer.h>
12
13/*
14 * freezing is complete, mark current process as frozen
15 */
16static inline void frozen_process(void)
17{
18 if (!unlikely(current->flags & PF_NOFREEZE)) {
19 current->flags |= PF_FROZEN;
20 wmb();
21 }
22 clear_freeze_flag(current);
23}
24
 25/* Refrigerator is the place where frozen processes are stored :-). */
26void refrigerator(void)
27{
28 /* Hmm, should we be allowed to suspend when there are realtime
29 processes around? */
30 long save;
31
32 task_lock(current);
33 if (freezing(current)) {
34 frozen_process();
35 task_unlock(current);
36 } else {
37 task_unlock(current);
38 return;
39 }
40 save = current->state;
41 pr_debug("%s entered refrigerator\n", current->comm);
42
43 spin_lock_irq(&current->sighand->siglock);
44 recalc_sigpending(); /* We sent fake signal, clean it up */
45 spin_unlock_irq(&current->sighand->siglock);
46
47 for (;;) {
48 set_current_state(TASK_UNINTERRUPTIBLE);
49 if (!frozen(current))
50 break;
51 schedule();
52 }
53 pr_debug("%s left refrigerator\n", current->comm);
54 __set_current_state(save);
55}
56EXPORT_SYMBOL(refrigerator);
57
58static void fake_signal_wake_up(struct task_struct *p)
59{
60 unsigned long flags;
61
62 spin_lock_irqsave(&p->sighand->siglock, flags);
63 signal_wake_up(p, 0);
64 spin_unlock_irqrestore(&p->sighand->siglock, flags);
65}
66
67/**
68 * freeze_task - send a freeze request to given task
69 * @p: task to send the request to
70 * @sig_only: if set, the request will only be sent if the task has the
71 * PF_FREEZER_NOSIG flag unset
 72 * Return value: 'false' if @sig_only is set and the task has
 73 * PF_FREEZER_NOSIG set or the task is frozen; 'true' otherwise.
74 *
 75 * The freeze request is sent by setting the task's TIF_FREEZE flag and
76 * either sending a fake signal to it or waking it up, depending on whether
77 * or not it has PF_FREEZER_NOSIG set. If @sig_only is set and the task
78 * has PF_FREEZER_NOSIG set (ie. it is a typical kernel thread), its
79 * TIF_FREEZE flag will not be set.
80 */
81bool freeze_task(struct task_struct *p, bool sig_only)
82{
83 /*
84 * We first check if the task is freezing and next if it has already
85 * been frozen to avoid the race with frozen_process() which first marks
86 * the task as frozen and next clears its TIF_FREEZE.
87 */
88 if (!freezing(p)) {
89 rmb();
90 if (frozen(p))
91 return false;
92
93 if (!sig_only || should_send_signal(p))
94 set_freeze_flag(p);
95 else
96 return false;
97 }
98
99 if (should_send_signal(p)) {
100 if (!signal_pending(p))
101 fake_signal_wake_up(p);
102 } else if (sig_only) {
103 return false;
104 } else {
105 wake_up_state(p, TASK_INTERRUPTIBLE);
106 }
107
108 return true;
109}
110
111void cancel_freezing(struct task_struct *p)
112{
113 unsigned long flags;
114
115 if (freezing(p)) {
116 pr_debug(" clean up: %s\n", p->comm);
117 clear_freeze_flag(p);
118 spin_lock_irqsave(&p->sighand->siglock, flags);
119 recalc_sigpending_and_wake(p);
120 spin_unlock_irqrestore(&p->sighand->siglock, flags);
121 }
122}
123
124/*
125 * Wake up a frozen process
126 *
127 * task_lock() is needed to prevent the race with refrigerator() which may
128 * occur if the freezing of tasks fails. Namely, without the lock, if the
129 * freezing of tasks failed, thaw_tasks() might have run before a task in
130 * refrigerator() could call frozen_process(), in which case the task would be
131 * frozen and no one would thaw it.
132 */
133int __thaw_process(struct task_struct *p)
134{
135 if (frozen(p)) {
136 p->flags &= ~PF_FROZEN;
137 return 1;
138 }
139 clear_freeze_flag(p);
140 return 0;
141}
142
143int thaw_process(struct task_struct *p)
144{
145 task_lock(p);
146 if (__thaw_process(p) == 1) {
147 task_unlock(p);
148 wake_up_process(p);
149 return 1;
150 }
151 task_unlock(p);
152 return 0;
153}
154EXPORT_SYMBOL(thaw_process);
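To show how code elsewhere cooperates with these helpers, a hedged sketch of a freezable kernel thread. It assumes set_freezable() and try_to_freeze() from <linux/freezer.h> behave as in this era of the tree; try_to_freeze() is what ends up calling refrigerator() once freeze_task() has marked the thread. The example_ thread is hypothetical.

#include <linux/freezer.h>
#include <linux/kthread.h>
#include <linux/sched.h>

static int example_kthread(void *unused)
{
	set_freezable();	/* clear PF_NOFREEZE so the freezer may stop us */

	while (!kthread_should_stop()) {
		try_to_freeze();	/* enters refrigerator() when TIF_FREEZE is set */
		schedule_timeout_interruptible(HZ);
	}
	return 0;
}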
diff --git a/kernel/futex.c b/kernel/futex.c
index 7d1136e97c14..8af10027514b 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1296,13 +1296,16 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared,
1296 if (!abs_time) 1296 if (!abs_time)
1297 schedule(); 1297 schedule();
1298 else { 1298 else {
1299 unsigned long slack;
1300 slack = current->timer_slack_ns;
1301 if (rt_task(current))
1302 slack = 0;
1299 hrtimer_init_on_stack(&t.timer, CLOCK_MONOTONIC, 1303 hrtimer_init_on_stack(&t.timer, CLOCK_MONOTONIC,
1300 HRTIMER_MODE_ABS); 1304 HRTIMER_MODE_ABS);
1301 hrtimer_init_sleeper(&t, current); 1305 hrtimer_init_sleeper(&t, current);
1302 t.timer.expires = *abs_time; 1306 hrtimer_set_expires_range_ns(&t.timer, *abs_time, slack);
1303 1307
1304 hrtimer_start(&t.timer, t.timer.expires, 1308 hrtimer_start_expires(&t.timer, HRTIMER_MODE_ABS);
1305 HRTIMER_MODE_ABS);
1306 if (!hrtimer_active(&t.timer)) 1309 if (!hrtimer_active(&t.timer))
1307 t.task = NULL; 1310 t.task = NULL;
1308 1311
@@ -1404,7 +1407,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared,
1404 hrtimer_init_on_stack(&to->timer, CLOCK_REALTIME, 1407 hrtimer_init_on_stack(&to->timer, CLOCK_REALTIME,
1405 HRTIMER_MODE_ABS); 1408 HRTIMER_MODE_ABS);
1406 hrtimer_init_sleeper(to, current); 1409 hrtimer_init_sleeper(to, current);
1407 to->timer.expires = *time; 1410 hrtimer_set_expires(&to->timer, *time);
1408 } 1411 }
1409 1412
1410 q.pi_state = NULL; 1413 q.pi_state = NULL;
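The hunk above makes non-realtime futex waits honor the per-task timer slack. For context, a hedged userspace sketch of the knob that sets that slack; it assumes the PR_SET_TIMERSLACK prctl (and its value 29) from the same timer-slack series, which is not shown in this diff.

#include <stdio.h>
#include <sys/prctl.h>

#ifndef PR_SET_TIMERSLACK
#define PR_SET_TIMERSLACK 29	/* assumed constant from the timer-slack series */
#endif

int main(void)
{
	/* Allow the kernel to coalesce this task's timer wakeups within 2 ms. */
	if (prctl(PR_SET_TIMERSLACK, 2 * 1000 * 1000UL, 0, 0, 0))
		perror("prctl(PR_SET_TIMERSLACK)");
	return 0;
}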
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index b8e4dce80a74..2b465dfde426 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -517,7 +517,7 @@ static void hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base)
517 if (!base->first) 517 if (!base->first)
518 continue; 518 continue;
519 timer = rb_entry(base->first, struct hrtimer, node); 519 timer = rb_entry(base->first, struct hrtimer, node);
520 expires = ktime_sub(timer->expires, base->offset); 520 expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
521 if (expires.tv64 < cpu_base->expires_next.tv64) 521 if (expires.tv64 < cpu_base->expires_next.tv64)
522 cpu_base->expires_next = expires; 522 cpu_base->expires_next = expires;
523 } 523 }
@@ -539,10 +539,10 @@ static int hrtimer_reprogram(struct hrtimer *timer,
539 struct hrtimer_clock_base *base) 539 struct hrtimer_clock_base *base)
540{ 540{
541 ktime_t *expires_next = &__get_cpu_var(hrtimer_bases).expires_next; 541 ktime_t *expires_next = &__get_cpu_var(hrtimer_bases).expires_next;
542 ktime_t expires = ktime_sub(timer->expires, base->offset); 542 ktime_t expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
543 int res; 543 int res;
544 544
545 WARN_ON_ONCE(timer->expires.tv64 < 0); 545 WARN_ON_ONCE(hrtimer_get_expires_tv64(timer) < 0);
546 546
547 /* 547 /*
548 * When the callback is running, we do not reprogram the clock event 548 * When the callback is running, we do not reprogram the clock event
@@ -672,13 +672,14 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
672 */ 672 */
673 BUG_ON(timer->function(timer) != HRTIMER_NORESTART); 673 BUG_ON(timer->function(timer) != HRTIMER_NORESTART);
674 return 1; 674 return 1;
675 case HRTIMER_CB_IRQSAFE_NO_SOFTIRQ: 675 case HRTIMER_CB_IRQSAFE_PERCPU:
676 case HRTIMER_CB_IRQSAFE_UNLOCKED:
676 /* 677 /*
677 * This is solely for the sched tick emulation with 678 * This is solely for the sched tick emulation with
678 * dynamic tick support to ensure that we do not 679 * dynamic tick support to ensure that we do not
679 * restart the tick right on the edge and end up with 680 * restart the tick right on the edge and end up with
680 * the tick timer in the softirq ! The calling site 681 * the tick timer in the softirq ! The calling site
681 * takes care of this. 682 * takes care of this. Also used for hrtimer sleeper !
682 */ 683 */
683 debug_hrtimer_deactivate(timer); 684 debug_hrtimer_deactivate(timer);
684 return 1; 685 return 1;
@@ -794,7 +795,7 @@ u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval)
794 u64 orun = 1; 795 u64 orun = 1;
795 ktime_t delta; 796 ktime_t delta;
796 797
797 delta = ktime_sub(now, timer->expires); 798 delta = ktime_sub(now, hrtimer_get_expires(timer));
798 799
799 if (delta.tv64 < 0) 800 if (delta.tv64 < 0)
800 return 0; 801 return 0;
@@ -806,8 +807,8 @@ u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval)
806 s64 incr = ktime_to_ns(interval); 807 s64 incr = ktime_to_ns(interval);
807 808
808 orun = ktime_divns(delta, incr); 809 orun = ktime_divns(delta, incr);
809 timer->expires = ktime_add_ns(timer->expires, incr * orun); 810 hrtimer_add_expires_ns(timer, incr * orun);
810 if (timer->expires.tv64 > now.tv64) 811 if (hrtimer_get_expires_tv64(timer) > now.tv64)
811 return orun; 812 return orun;
812 /* 813 /*
813 * This (and the ktime_add() below) is the 814 * This (and the ktime_add() below) is the
@@ -815,7 +816,7 @@ u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval)
815 */ 816 */
816 orun++; 817 orun++;
817 } 818 }
818 timer->expires = ktime_add_safe(timer->expires, interval); 819 hrtimer_add_expires(timer, interval);
819 820
820 return orun; 821 return orun;
821} 822}
@@ -847,7 +848,8 @@ static void enqueue_hrtimer(struct hrtimer *timer,
 847 * We don't care about collisions. Nodes with 848 * We don't care about collisions. Nodes with
 848 * the same expiry time stay together. 849 * the same expiry time stay together.
 849 */ 850 */
850 if (timer->expires.tv64 < entry->expires.tv64) { 851 if (hrtimer_get_expires_tv64(timer) <
852 hrtimer_get_expires_tv64(entry)) {
851 link = &(*link)->rb_left; 853 link = &(*link)->rb_left;
852 } else { 854 } else {
853 link = &(*link)->rb_right; 855 link = &(*link)->rb_right;
@@ -944,9 +946,10 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base)
944} 946}
945 947
946/** 948/**
947 * hrtimer_start - (re)start an relative timer on the current CPU 949 * hrtimer_start_range_ns - (re)start an hrtimer on the current CPU
948 * @timer: the timer to be added 950 * @timer: the timer to be added
949 * @tim: expiry time 951 * @tim: expiry time
952 * @delta_ns: "slack" range for the timer
950 * @mode: expiry mode: absolute (HRTIMER_ABS) or relative (HRTIMER_REL) 953 * @mode: expiry mode: absolute (HRTIMER_ABS) or relative (HRTIMER_REL)
951 * 954 *
952 * Returns: 955 * Returns:
@@ -954,7 +957,8 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base)
954 * 1 when the timer was active 957 * 1 when the timer was active
955 */ 958 */
956int 959int
957hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode) 960hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, unsigned long delta_ns,
961 const enum hrtimer_mode mode)
958{ 962{
959 struct hrtimer_clock_base *base, *new_base; 963 struct hrtimer_clock_base *base, *new_base;
960 unsigned long flags; 964 unsigned long flags;
@@ -982,7 +986,7 @@ hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
982#endif 986#endif
983 } 987 }
984 988
985 timer->expires = tim; 989 hrtimer_set_expires_range_ns(timer, tim, delta_ns);
986 990
987 timer_stats_hrtimer_set_start_info(timer); 991 timer_stats_hrtimer_set_start_info(timer);
988 992
@@ -1015,8 +1019,26 @@ hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
1015 1019
1016 return ret; 1020 return ret;
1017} 1021}
1022EXPORT_SYMBOL_GPL(hrtimer_start_range_ns);
1023
1024/**
1025 * hrtimer_start - (re)start an hrtimer on the current CPU
1026 * @timer: the timer to be added
1027 * @tim: expiry time
1028 * @mode: expiry mode: absolute (HRTIMER_ABS) or relative (HRTIMER_REL)
1029 *
1030 * Returns:
1031 * 0 on success
1032 * 1 when the timer was active
1033 */
1034int
1035hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
1036{
1037 return hrtimer_start_range_ns(timer, tim, 0, mode);
1038}
1018EXPORT_SYMBOL_GPL(hrtimer_start); 1039EXPORT_SYMBOL_GPL(hrtimer_start);
1019 1040
1041
1020/** 1042/**
1021 * hrtimer_try_to_cancel - try to deactivate a timer 1043 * hrtimer_try_to_cancel - try to deactivate a timer
1022 * @timer: hrtimer to stop 1044 * @timer: hrtimer to stop
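A hedged driver-side sketch of the new range interface: the example_ timer is hypothetical, but the hrtimer_start_range_ns() call matches the signature added above (an expiry time plus a slack window in nanoseconds).

#include <linux/hrtimer.h>
#include <linux/ktime.h>

static struct hrtimer example_timer;

static enum hrtimer_restart example_timer_fn(struct hrtimer *timer)
{
	/* timer work would go here */
	return HRTIMER_NORESTART;
}

static void example_arm_timer(void)
{
	hrtimer_init(&example_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	example_timer.function = example_timer_fn;

	/*
	 * Expire 10 ms from now, but allow up to 1 ms of slack so the
	 * wakeup can be coalesced with a nearby timer or the tick.
	 */
	hrtimer_start_range_ns(&example_timer,
			       ktime_set(0, 10 * NSEC_PER_MSEC),
			       NSEC_PER_MSEC, HRTIMER_MODE_REL);
}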
@@ -1076,7 +1098,7 @@ ktime_t hrtimer_get_remaining(const struct hrtimer *timer)
1076 ktime_t rem; 1098 ktime_t rem;
1077 1099
1078 base = lock_hrtimer_base(timer, &flags); 1100 base = lock_hrtimer_base(timer, &flags);
1079 rem = ktime_sub(timer->expires, base->get_time()); 1101 rem = hrtimer_expires_remaining(timer);
1080 unlock_hrtimer_base(timer, &flags); 1102 unlock_hrtimer_base(timer, &flags);
1081 1103
1082 return rem; 1104 return rem;
@@ -1108,7 +1130,7 @@ ktime_t hrtimer_get_next_event(void)
1108 continue; 1130 continue;
1109 1131
1110 timer = rb_entry(base->first, struct hrtimer, node); 1132 timer = rb_entry(base->first, struct hrtimer, node);
1111 delta.tv64 = timer->expires.tv64; 1133 delta.tv64 = hrtimer_get_expires_tv64(timer);
1112 delta = ktime_sub(delta, base->get_time()); 1134 delta = ktime_sub(delta, base->get_time());
1113 if (delta.tv64 < mindelta.tv64) 1135 if (delta.tv64 < mindelta.tv64)
1114 mindelta.tv64 = delta.tv64; 1136 mindelta.tv64 = delta.tv64;
@@ -1245,7 +1267,8 @@ static void __run_hrtimer(struct hrtimer *timer)
1245 timer_stats_account_hrtimer(timer); 1267 timer_stats_account_hrtimer(timer);
1246 1268
1247 fn = timer->function; 1269 fn = timer->function;
1248 if (timer->cb_mode == HRTIMER_CB_IRQSAFE_NO_SOFTIRQ) { 1270 if (timer->cb_mode == HRTIMER_CB_IRQSAFE_PERCPU ||
1271 timer->cb_mode == HRTIMER_CB_IRQSAFE_UNLOCKED) {
1249 /* 1272 /*
1250 * Used for scheduler timers, avoid lock inversion with 1273 * Used for scheduler timers, avoid lock inversion with
1251 * rq->lock and tasklist_lock. 1274 * rq->lock and tasklist_lock.
@@ -1308,10 +1331,23 @@ void hrtimer_interrupt(struct clock_event_device *dev)
1308 1331
1309 timer = rb_entry(node, struct hrtimer, node); 1332 timer = rb_entry(node, struct hrtimer, node);
1310 1333
1311 if (basenow.tv64 < timer->expires.tv64) { 1334 /*
1335 * The immediate goal for using the softexpires is
1336 * minimizing wakeups, not running timers at the
1337 * earliest interrupt after their soft expiration.
1338 * This allows us to avoid using a Priority Search
 1339 * Tree, which can answer a stabbing query for
1340 * overlapping intervals and instead use the simple
1341 * BST we already have.
1342 * We don't add extra wakeups by delaying timers that
1343 * are right-of a not yet expired timer, because that
1344 * timer will have to trigger a wakeup anyway.
1345 */
1346
1347 if (basenow.tv64 < hrtimer_get_softexpires_tv64(timer)) {
1312 ktime_t expires; 1348 ktime_t expires;
1313 1349
1314 expires = ktime_sub(timer->expires, 1350 expires = ktime_sub(hrtimer_get_expires(timer),
1315 base->offset); 1351 base->offset);
1316 if (expires.tv64 < expires_next.tv64) 1352 if (expires.tv64 < expires_next.tv64)
1317 expires_next = expires; 1353 expires_next = expires;
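The check above uses the soft expiry time: the interrupt only has to run a timer once its soft expiry has passed, while the hard expiry (soft expiry plus slack) is what the clock event device is programmed with, so several timers with overlapping ranges can be served by one wakeup. Below is a rough sketch of the relationship the accessors are assumed to encode; the field names _softexpires and _expires are assumptions taken from the range-timer rework, not a verified excerpt of this tree.

/*
 * Illustrative only: how the expiry range is assumed to be stored.
 * A range timer keeps the earliest allowed expiry (soft) and the
 * latest allowed expiry (hard = soft + slack).
 */
static inline void sketch_set_expires_range_ns(struct hrtimer *timer,
					       ktime_t tim, unsigned long delta)
{
	timer->_softexpires = tim;			/* earliest wakeup */
	timer->_expires = ktime_add_ns(tim, delta);	/* latest wakeup   */
}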
@@ -1347,6 +1383,30 @@ void hrtimer_interrupt(struct clock_event_device *dev)
1347 raise_softirq(HRTIMER_SOFTIRQ); 1383 raise_softirq(HRTIMER_SOFTIRQ);
1348} 1384}
1349 1385
1386/**
1387 * hrtimer_peek_ahead_timers -- run soft-expired timers now
1388 *
1389 * hrtimer_peek_ahead_timers will peek at the timer queue of
1390 * the current cpu and check if there are any timers for which
1391 * the soft expires time has passed. If any such timers exist,
1392 * they are run immediately and then removed from the timer queue.
1393 *
1394 */
1395void hrtimer_peek_ahead_timers(void)
1396{
1397 struct tick_device *td;
1398 unsigned long flags;
1399
1400 if (!hrtimer_hres_active())
1401 return;
1402
1403 local_irq_save(flags);
1404 td = &__get_cpu_var(tick_cpu_device);
1405 if (td && td->evtdev)
1406 hrtimer_interrupt(td->evtdev);
1407 local_irq_restore(flags);
1408}
1409
1350static void run_hrtimer_softirq(struct softirq_action *h) 1410static void run_hrtimer_softirq(struct softirq_action *h)
1351{ 1411{
1352 run_hrtimer_pending(&__get_cpu_var(hrtimer_bases)); 1412 run_hrtimer_pending(&__get_cpu_var(hrtimer_bases));
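hrtimer_peek_ahead_timers() is meant to be called from process or softirq context with interrupts enabled; it briefly disables interrupts and replays the hrtimer interrupt handler on the local clock event device so that soft-expired timers run right away. A hypothetical call site is sketched below.

/* Hypothetical hook: flush soft-expired timers before entering an idle state. */
static void example_pre_idle(void)
{
	/* returns immediately unless high resolution mode is active */
	hrtimer_peek_ahead_timers();
}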
@@ -1401,9 +1461,7 @@ void hrtimer_run_queues(void)
1401 if (!base->first) 1461 if (!base->first)
1402 continue; 1462 continue;
1403 1463
1404 if (base->get_softirq_time) 1464 if (gettime) {
1405 base->softirq_time = base->get_softirq_time();
1406 else if (gettime) {
1407 hrtimer_get_softirq_time(cpu_base); 1465 hrtimer_get_softirq_time(cpu_base);
1408 gettime = 0; 1466 gettime = 0;
1409 } 1467 }
@@ -1414,7 +1472,8 @@ void hrtimer_run_queues(void)
1414 struct hrtimer *timer; 1472 struct hrtimer *timer;
1415 1473
1416 timer = rb_entry(node, struct hrtimer, node); 1474 timer = rb_entry(node, struct hrtimer, node);
1417 if (base->softirq_time.tv64 <= timer->expires.tv64) 1475 if (base->softirq_time.tv64 <=
1476 hrtimer_get_expires_tv64(timer))
1418 break; 1477 break;
1419 1478
1420 if (timer->cb_mode == HRTIMER_CB_SOFTIRQ) { 1479 if (timer->cb_mode == HRTIMER_CB_SOFTIRQ) {
@@ -1452,7 +1511,7 @@ void hrtimer_init_sleeper(struct hrtimer_sleeper *sl, struct task_struct *task)
1452 sl->timer.function = hrtimer_wakeup; 1511 sl->timer.function = hrtimer_wakeup;
1453 sl->task = task; 1512 sl->task = task;
1454#ifdef CONFIG_HIGH_RES_TIMERS 1513#ifdef CONFIG_HIGH_RES_TIMERS
1455 sl->timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; 1514 sl->timer.cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED;
1456#endif 1515#endif
1457} 1516}
1458 1517
@@ -1462,7 +1521,7 @@ static int __sched do_nanosleep(struct hrtimer_sleeper *t, enum hrtimer_mode mod
1462 1521
1463 do { 1522 do {
1464 set_current_state(TASK_INTERRUPTIBLE); 1523 set_current_state(TASK_INTERRUPTIBLE);
1465 hrtimer_start(&t->timer, t->timer.expires, mode); 1524 hrtimer_start_expires(&t->timer, mode);
1466 if (!hrtimer_active(&t->timer)) 1525 if (!hrtimer_active(&t->timer))
1467 t->task = NULL; 1526 t->task = NULL;
1468 1527
@@ -1484,7 +1543,7 @@ static int update_rmtp(struct hrtimer *timer, struct timespec __user *rmtp)
1484 struct timespec rmt; 1543 struct timespec rmt;
1485 ktime_t rem; 1544 ktime_t rem;
1486 1545
1487 rem = ktime_sub(timer->expires, timer->base->get_time()); 1546 rem = hrtimer_expires_remaining(timer);
1488 if (rem.tv64 <= 0) 1547 if (rem.tv64 <= 0)
1489 return 0; 1548 return 0;
1490 rmt = ktime_to_timespec(rem); 1549 rmt = ktime_to_timespec(rem);
@@ -1503,7 +1562,7 @@ long __sched hrtimer_nanosleep_restart(struct restart_block *restart)
1503 1562
1504 hrtimer_init_on_stack(&t.timer, restart->nanosleep.index, 1563 hrtimer_init_on_stack(&t.timer, restart->nanosleep.index,
1505 HRTIMER_MODE_ABS); 1564 HRTIMER_MODE_ABS);
1506 t.timer.expires.tv64 = restart->nanosleep.expires; 1565 hrtimer_set_expires_tv64(&t.timer, restart->nanosleep.expires);
1507 1566
1508 if (do_nanosleep(&t, HRTIMER_MODE_ABS)) 1567 if (do_nanosleep(&t, HRTIMER_MODE_ABS))
1509 goto out; 1568 goto out;
@@ -1528,9 +1587,14 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
1528 struct restart_block *restart; 1587 struct restart_block *restart;
1529 struct hrtimer_sleeper t; 1588 struct hrtimer_sleeper t;
1530 int ret = 0; 1589 int ret = 0;
1590 unsigned long slack;
1591
1592 slack = current->timer_slack_ns;
1593 if (rt_task(current))
1594 slack = 0;
1531 1595
1532 hrtimer_init_on_stack(&t.timer, clockid, mode); 1596 hrtimer_init_on_stack(&t.timer, clockid, mode);
1533 t.timer.expires = timespec_to_ktime(*rqtp); 1597 hrtimer_set_expires_range_ns(&t.timer, timespec_to_ktime(*rqtp), slack);
1534 if (do_nanosleep(&t, mode)) 1598 if (do_nanosleep(&t, mode))
1535 goto out; 1599 goto out;
1536 1600
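The nanosleep path now sleeps on a range rather than a point in time: the requested expiry is the earliest wakeup and current->timer_slack_ns widens the window, while realtime tasks keep exact wakeups. A small sketch of that policy, assuming the timer_slack_ns task field introduced alongside this series:

#include <linux/sched.h>

/* Sketch of the slack policy used above: RT tasks get exact wakeups. */
static unsigned long sketch_nanosleep_slack(struct task_struct *tsk)
{
	return rt_task(tsk) ? 0 : tsk->timer_slack_ns;
}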
@@ -1550,7 +1614,7 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
1550 restart->fn = hrtimer_nanosleep_restart; 1614 restart->fn = hrtimer_nanosleep_restart;
1551 restart->nanosleep.index = t.timer.base->index; 1615 restart->nanosleep.index = t.timer.base->index;
1552 restart->nanosleep.rmtp = rmtp; 1616 restart->nanosleep.rmtp = rmtp;
1553 restart->nanosleep.expires = t.timer.expires.tv64; 1617 restart->nanosleep.expires = hrtimer_get_expires_tv64(&t.timer);
1554 1618
1555 ret = -ERESTART_RESTARTBLOCK; 1619 ret = -ERESTART_RESTARTBLOCK;
1556out: 1620out:
@@ -1591,49 +1655,123 @@ static void __cpuinit init_hrtimers_cpu(int cpu)
1591 1655
1592#ifdef CONFIG_HOTPLUG_CPU 1656#ifdef CONFIG_HOTPLUG_CPU
1593 1657
1594static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, 1658static int migrate_hrtimer_list(struct hrtimer_clock_base *old_base,
1595 struct hrtimer_clock_base *new_base) 1659 struct hrtimer_clock_base *new_base, int dcpu)
1596{ 1660{
1597 struct hrtimer *timer; 1661 struct hrtimer *timer;
1598 struct rb_node *node; 1662 struct rb_node *node;
1663 int raise = 0;
1599 1664
1600 while ((node = rb_first(&old_base->active))) { 1665 while ((node = rb_first(&old_base->active))) {
1601 timer = rb_entry(node, struct hrtimer, node); 1666 timer = rb_entry(node, struct hrtimer, node);
1602 BUG_ON(hrtimer_callback_running(timer)); 1667 BUG_ON(hrtimer_callback_running(timer));
1603 debug_hrtimer_deactivate(timer); 1668 debug_hrtimer_deactivate(timer);
1604 __remove_hrtimer(timer, old_base, HRTIMER_STATE_INACTIVE, 0); 1669
1670 /*
1671 * Should not happen. Per CPU timers should be
1672 * canceled _before_ the migration code is called
1673 */
1674 if (timer->cb_mode == HRTIMER_CB_IRQSAFE_PERCPU) {
1675 __remove_hrtimer(timer, old_base,
1676 HRTIMER_STATE_INACTIVE, 0);
1677 WARN(1, "hrtimer (%p %p)active but cpu %d dead\n",
1678 timer, timer->function, dcpu);
1679 continue;
1680 }
1681
1682 /*
1683 * Mark it as STATE_MIGRATE not INACTIVE otherwise the
1684 * timer could be seen as !active and just vanish away
1685 * under us on another CPU
1686 */
1687 __remove_hrtimer(timer, old_base, HRTIMER_STATE_MIGRATE, 0);
1605 timer->base = new_base; 1688 timer->base = new_base;
1606 /* 1689 /*
1607 * Enqueue the timer. Allow reprogramming of the event device 1690 * Enqueue the timer. Allow reprogramming of the event device
1608 */ 1691 */
1609 enqueue_hrtimer(timer, new_base, 1); 1692 enqueue_hrtimer(timer, new_base, 1);
1693
1694#ifdef CONFIG_HIGH_RES_TIMERS
1695 /*
1696 * Happens with high res enabled when the timer was
1697 * already expired and the callback mode is
1698 * HRTIMER_CB_IRQSAFE_UNLOCKED (hrtimer_sleeper). The
1699 * enqueue code does not move them to the soft irq
1700 * pending list for performance/latency reasons, but
1701 * in the migration state, we need to do that
1702 * otherwise we end up with a stale timer.
1703 */
1704 if (timer->state == HRTIMER_STATE_MIGRATE) {
1705 timer->state = HRTIMER_STATE_PENDING;
1706 list_add_tail(&timer->cb_entry,
1707 &new_base->cpu_base->cb_pending);
1708 raise = 1;
1709 }
1710#endif
1711 /* Clear the migration state bit */
1712 timer->state &= ~HRTIMER_STATE_MIGRATE;
1713 }
1714 return raise;
1715}
1716
1717#ifdef CONFIG_HIGH_RES_TIMERS
1718static int migrate_hrtimer_pending(struct hrtimer_cpu_base *old_base,
1719 struct hrtimer_cpu_base *new_base)
1720{
1721 struct hrtimer *timer;
1722 int raise = 0;
1723
1724 while (!list_empty(&old_base->cb_pending)) {
1725 timer = list_entry(old_base->cb_pending.next,
1726 struct hrtimer, cb_entry);
1727
1728 __remove_hrtimer(timer, timer->base, HRTIMER_STATE_PENDING, 0);
1729 timer->base = &new_base->clock_base[timer->base->index];
1730 list_add_tail(&timer->cb_entry, &new_base->cb_pending);
1731 raise = 1;
1610 } 1732 }
1733 return raise;
1734}
1735#else
1736static int migrate_hrtimer_pending(struct hrtimer_cpu_base *old_base,
1737 struct hrtimer_cpu_base *new_base)
1738{
1739 return 0;
1611} 1740}
1741#endif
1612 1742
1613static void migrate_hrtimers(int cpu) 1743static void migrate_hrtimers(int cpu)
1614{ 1744{
1615 struct hrtimer_cpu_base *old_base, *new_base; 1745 struct hrtimer_cpu_base *old_base, *new_base;
1616 int i; 1746 int i, raise = 0;
1617 1747
1618 BUG_ON(cpu_online(cpu)); 1748 BUG_ON(cpu_online(cpu));
1619 old_base = &per_cpu(hrtimer_bases, cpu); 1749 old_base = &per_cpu(hrtimer_bases, cpu);
1620 new_base = &get_cpu_var(hrtimer_bases); 1750 new_base = &get_cpu_var(hrtimer_bases);
1621 1751
1622 tick_cancel_sched_timer(cpu); 1752 tick_cancel_sched_timer(cpu);
1623 1753 /*
1624 local_irq_disable(); 1754 * The caller is globally serialized and nobody else
1625 spin_lock(&new_base->lock); 1755 * takes two locks at once, deadlock is not possible.
1756 */
1757 spin_lock_irq(&new_base->lock);
1626 spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); 1758 spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
1627 1759
1628 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { 1760 for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
1629 migrate_hrtimer_list(&old_base->clock_base[i], 1761 if (migrate_hrtimer_list(&old_base->clock_base[i],
1630 &new_base->clock_base[i]); 1762 &new_base->clock_base[i], cpu))
1763 raise = 1;
1631 } 1764 }
1632 1765
1766 if (migrate_hrtimer_pending(old_base, new_base))
1767 raise = 1;
1768
1633 spin_unlock(&old_base->lock); 1769 spin_unlock(&old_base->lock);
1634 spin_unlock(&new_base->lock); 1770 spin_unlock_irq(&new_base->lock);
1635 local_irq_enable();
1636 put_cpu_var(hrtimer_bases); 1771 put_cpu_var(hrtimer_bases);
1772
1773 if (raise)
1774 hrtimer_raise_softirq();
1637} 1775}
1638#endif /* CONFIG_HOTPLUG_CPU */ 1776#endif /* CONFIG_HOTPLUG_CPU */
1639 1777
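The migration rework above distinguishes three cases: per-CPU timers should already be gone and only trigger a warning, normal timers are tagged HRTIMER_STATE_MIGRATE while they move so they are never observed as inactive, and already soft-expired sleeper timers are pushed onto the new CPU's cb_pending list, with a softirq raised afterwards if anything was queued. For context, migrate_hrtimers() is reached from the CPU hotplug notifier roughly as sketched below; this is an assumption about the surrounding notifier code, not a quoted hunk.

/* Sketch: the hotplug notifier path assumed to invoke migrate_hrtimers(). */
static int __cpuinit sketch_hrtimer_cpu_notify(struct notifier_block *self,
					       unsigned long action, void *hcpu)
{
	int dcpu = (long)hcpu;

	switch (action) {
	case CPU_DEAD:
	case CPU_DEAD_FROZEN:
		migrate_hrtimers(dcpu);		/* pull the dead CPU's timers over */
		break;
	default:
		break;
	}
	return NOTIFY_OK;
}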
@@ -1678,3 +1816,103 @@ void __init hrtimers_init(void)
1678#endif 1816#endif
1679} 1817}
1680 1818
1819/**
1820 * schedule_hrtimeout_range - sleep until timeout
1821 * @expires: timeout value (ktime_t)
1822 * @delta: slack in expires timeout (ktime_t)
1823 * @mode: timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL
1824 *
1825 * Make the current task sleep until the given expiry time has
1826 * elapsed. The routine will return immediately unless
1827 * the current task state has been set (see set_current_state()).
1828 *
1829 * The @delta argument gives the kernel the freedom to schedule the
1830 * actual wakeup to a time that is both power and performance friendly.
1831 * The kernel gives normal best-effort behavior for "@expires+@delta",
1832 * but may decide to fire the timer earlier, though never earlier than @expires.
1833 *
1834 * You can set the task state as follows -
1835 *
1836 * %TASK_UNINTERRUPTIBLE - at least the @expires time is guaranteed to
1837 * pass before the routine returns.
1838 *
1839 * %TASK_INTERRUPTIBLE - the routine may return early if a signal is
1840 * delivered to the current task.
1841 *
1842 * The current task state is guaranteed to be TASK_RUNNING when this
1843 * routine returns.
1844 *
1845 * Returns 0 when the timer has expired otherwise -EINTR
1846 */
1847int __sched schedule_hrtimeout_range(ktime_t *expires, unsigned long delta,
1848 const enum hrtimer_mode mode)
1849{
1850 struct hrtimer_sleeper t;
1851
1852 /*
1853 * Optimize when a zero timeout value is given. It does not
1854 * matter whether this is an absolute or a relative time.
1855 */
1856 if (expires && !expires->tv64) {
1857 __set_current_state(TASK_RUNNING);
1858 return 0;
1859 }
1860
1861 /*
1862 * A NULL parameter means "infinite"
1863 */
1864 if (!expires) {
1865 schedule();
1866 __set_current_state(TASK_RUNNING);
1867 return -EINTR;
1868 }
1869
1870 hrtimer_init_on_stack(&t.timer, CLOCK_MONOTONIC, mode);
1871 hrtimer_set_expires_range_ns(&t.timer, *expires, delta);
1872
1873 hrtimer_init_sleeper(&t, current);
1874
1875 hrtimer_start_expires(&t.timer, mode);
1876 if (!hrtimer_active(&t.timer))
1877 t.task = NULL;
1878
1879 if (likely(t.task))
1880 schedule();
1881
1882 hrtimer_cancel(&t.timer);
1883 destroy_hrtimer_on_stack(&t.timer);
1884
1885 __set_current_state(TASK_RUNNING);
1886
1887 return !t.task ? 0 : -EINTR;
1888}
1889EXPORT_SYMBOL_GPL(schedule_hrtimeout_range);
1890
1891/**
1892 * schedule_hrtimeout - sleep until timeout
1893 * @expires: timeout value (ktime_t)
1894 * @mode: timer mode, HRTIMER_MODE_ABS or HRTIMER_MODE_REL
1895 *
1896 * Make the current task sleep until the given expiry time has
1897 * elapsed. The routine will return immediately unless
1898 * the current task state has been set (see set_current_state()).
1899 *
1900 * You can set the task state as follows -
1901 *
1902 * %TASK_UNINTERRUPTIBLE - at least the @expires time is guaranteed to
1903 * pass before the routine returns.
1904 *
1905 * %TASK_INTERRUPTIBLE - the routine may return early if a signal is
1906 * delivered to the current task.
1907 *
1908 * The current task state is guaranteed to be TASK_RUNNING when this
1909 * routine returns.
1910 *
1911 * Returns 0 when the timer has expired otherwise -EINTR
1912 */
1913int __sched schedule_hrtimeout(ktime_t *expires,
1914 const enum hrtimer_mode mode)
1915{
1916 return schedule_hrtimeout_range(expires, 0, mode);
1917}
1918EXPORT_SYMBOL_GPL(schedule_hrtimeout);
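schedule_hrtimeout_range() is the building block drivers are expected to use for slack-tolerant sleeps; the caller sets the task state first, exactly as with schedule_timeout(). A hypothetical example that waits about 10ms and tolerates up to 1ms of extra delay:

/* Hypothetical wait: ~10ms, up to 1ms late, interruptible by signals. */
static int example_wait(void)
{
	ktime_t timeout = ktime_set(0, 10 * NSEC_PER_MSEC);

	set_current_state(TASK_INTERRUPTIBLE);
	if (schedule_hrtimeout_range(&timeout, NSEC_PER_MSEC, HRTIMER_MODE_REL))
		return -EINTR;	/* woken early, e.g. by a signal */
	return 0;		/* the (ranged) timeout elapsed */
}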
diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c
index 533068cfb607..cc0f7321b8ce 100644
--- a/kernel/irq/autoprobe.c
+++ b/kernel/irq/autoprobe.c
@@ -30,17 +30,16 @@ static DEFINE_MUTEX(probing_active);
30unsigned long probe_irq_on(void) 30unsigned long probe_irq_on(void)
31{ 31{
32 struct irq_desc *desc; 32 struct irq_desc *desc;
33 unsigned long mask; 33 unsigned long mask = 0;
34 unsigned int i; 34 unsigned int status;
35 int i;
35 36
36 mutex_lock(&probing_active); 37 mutex_lock(&probing_active);
37 /* 38 /*
38 * something may have generated an irq long ago and we want to 39 * something may have generated an irq long ago and we want to
39 * flush such a longstanding irq before considering it as spurious. 40 * flush such a longstanding irq before considering it as spurious.
40 */ 41 */
41 for (i = NR_IRQS-1; i > 0; i--) { 42 for_each_irq_desc_reverse(i, desc) {
42 desc = irq_desc + i;
43
44 spin_lock_irq(&desc->lock); 43 spin_lock_irq(&desc->lock);
45 if (!desc->action && !(desc->status & IRQ_NOPROBE)) { 44 if (!desc->action && !(desc->status & IRQ_NOPROBE)) {
46 /* 45 /*
@@ -68,9 +67,7 @@ unsigned long probe_irq_on(void)
68 * (we must startup again here because if a longstanding irq 67 * (we must startup again here because if a longstanding irq
69 * happened in the previous stage, it may have masked itself) 68 * happened in the previous stage, it may have masked itself)
70 */ 69 */
71 for (i = NR_IRQS-1; i > 0; i--) { 70 for_each_irq_desc_reverse(i, desc) {
72 desc = irq_desc + i;
73
74 spin_lock_irq(&desc->lock); 71 spin_lock_irq(&desc->lock);
75 if (!desc->action && !(desc->status & IRQ_NOPROBE)) { 72 if (!desc->action && !(desc->status & IRQ_NOPROBE)) {
76 desc->status |= IRQ_AUTODETECT | IRQ_WAITING; 73 desc->status |= IRQ_AUTODETECT | IRQ_WAITING;
@@ -88,11 +85,7 @@ unsigned long probe_irq_on(void)
88 /* 85 /*
89 * Now filter out any obviously spurious interrupts 86 * Now filter out any obviously spurious interrupts
90 */ 87 */
91 mask = 0; 88 for_each_irq_desc(i, desc) {
92 for (i = 0; i < NR_IRQS; i++) {
93 unsigned int status;
94
95 desc = irq_desc + i;
96 spin_lock_irq(&desc->lock); 89 spin_lock_irq(&desc->lock);
97 status = desc->status; 90 status = desc->status;
98 91
@@ -126,14 +119,11 @@ EXPORT_SYMBOL(probe_irq_on);
126 */ 119 */
127unsigned int probe_irq_mask(unsigned long val) 120unsigned int probe_irq_mask(unsigned long val)
128{ 121{
129 unsigned int mask; 122 unsigned int status, mask = 0;
123 struct irq_desc *desc;
130 int i; 124 int i;
131 125
132 mask = 0; 126 for_each_irq_desc(i, desc) {
133 for (i = 0; i < NR_IRQS; i++) {
134 struct irq_desc *desc = irq_desc + i;
135 unsigned int status;
136
137 spin_lock_irq(&desc->lock); 127 spin_lock_irq(&desc->lock);
138 status = desc->status; 128 status = desc->status;
139 129
@@ -171,20 +161,19 @@ EXPORT_SYMBOL(probe_irq_mask);
171 */ 161 */
172int probe_irq_off(unsigned long val) 162int probe_irq_off(unsigned long val)
173{ 163{
174 int i, irq_found = 0, nr_irqs = 0; 164 int i, irq_found = 0, nr_of_irqs = 0;
175 165 struct irq_desc *desc;
176 for (i = 0; i < NR_IRQS; i++) { 166 unsigned int status;
177 struct irq_desc *desc = irq_desc + i;
178 unsigned int status;
179 167
168 for_each_irq_desc(i, desc) {
180 spin_lock_irq(&desc->lock); 169 spin_lock_irq(&desc->lock);
181 status = desc->status; 170 status = desc->status;
182 171
183 if (status & IRQ_AUTODETECT) { 172 if (status & IRQ_AUTODETECT) {
184 if (!(status & IRQ_WAITING)) { 173 if (!(status & IRQ_WAITING)) {
185 if (!nr_irqs) 174 if (!nr_of_irqs)
186 irq_found = i; 175 irq_found = i;
187 nr_irqs++; 176 nr_of_irqs++;
188 } 177 }
189 desc->status = status & ~IRQ_AUTODETECT; 178 desc->status = status & ~IRQ_AUTODETECT;
190 desc->chip->shutdown(i); 179 desc->chip->shutdown(i);
@@ -193,7 +182,7 @@ int probe_irq_off(unsigned long val)
193 } 182 }
194 mutex_unlock(&probing_active); 183 mutex_unlock(&probing_active);
195 184
196 if (nr_irqs > 1) 185 if (nr_of_irqs > 1)
197 irq_found = -irq_found; 186 irq_found = -irq_found;
198 187
199 return irq_found; 188 return irq_found;
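The probing loops now walk interrupt descriptors through for_each_irq_desc()/for_each_irq_desc_reverse() instead of indexing irq_desc[0..NR_IRQS-1] directly, which keeps this code working once descriptors stop being a plain static array. For the still-static layout the iterator is presumably just a wrapper around the array walk, along the lines of the sketch below (an assumption; the real macro lives in linux/irq.h).

/* Sketch of the iterator for the flat irq_desc[] case. */
#define sketch_for_each_irq_desc(irq, desc)		\
	for (irq = 0, desc = irq_desc; irq < nr_irqs;	\
	     irq++, desc++)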
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 964964baefa2..10b5092e9bfe 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -24,17 +24,15 @@
24 */ 24 */
25void dynamic_irq_init(unsigned int irq) 25void dynamic_irq_init(unsigned int irq)
26{ 26{
27 struct irq_desc *desc; 27 struct irq_desc *desc = irq_to_desc(irq);
28 unsigned long flags; 28 unsigned long flags;
29 29
30 if (irq >= NR_IRQS) { 30 if (!desc) {
31 printk(KERN_ERR "Trying to initialize invalid IRQ%d\n", irq); 31 WARN(1, KERN_ERR "Trying to initialize invalid IRQ%d\n", irq);
32 WARN_ON(1);
33 return; 32 return;
34 } 33 }
35 34
36 /* Ensure we don't have left over values from a previous use of this irq */ 35 /* Ensure we don't have left over values from a previous use of this irq */
37 desc = irq_desc + irq;
38 spin_lock_irqsave(&desc->lock, flags); 36 spin_lock_irqsave(&desc->lock, flags);
39 desc->status = IRQ_DISABLED; 37 desc->status = IRQ_DISABLED;
40 desc->chip = &no_irq_chip; 38 desc->chip = &no_irq_chip;
@@ -58,22 +56,19 @@ void dynamic_irq_init(unsigned int irq)
58 */ 56 */
59void dynamic_irq_cleanup(unsigned int irq) 57void dynamic_irq_cleanup(unsigned int irq)
60{ 58{
61 struct irq_desc *desc; 59 struct irq_desc *desc = irq_to_desc(irq);
62 unsigned long flags; 60 unsigned long flags;
63 61
64 if (irq >= NR_IRQS) { 62 if (!desc) {
65 printk(KERN_ERR "Trying to cleanup invalid IRQ%d\n", irq); 63 WARN(1, KERN_ERR "Trying to cleanup invalid IRQ%d\n", irq);
66 WARN_ON(1);
67 return; 64 return;
68 } 65 }
69 66
70 desc = irq_desc + irq;
71 spin_lock_irqsave(&desc->lock, flags); 67 spin_lock_irqsave(&desc->lock, flags);
72 if (desc->action) { 68 if (desc->action) {
73 spin_unlock_irqrestore(&desc->lock, flags); 69 spin_unlock_irqrestore(&desc->lock, flags);
74 printk(KERN_ERR "Destroying IRQ%d without calling free_irq\n", 70 WARN(1, KERN_ERR "Destroying IRQ%d without calling free_irq\n",
75 irq); 71 irq);
76 WARN_ON(1);
77 return; 72 return;
78 } 73 }
79 desc->msi_desc = NULL; 74 desc->msi_desc = NULL;
@@ -81,6 +76,7 @@ void dynamic_irq_cleanup(unsigned int irq)
81 desc->chip_data = NULL; 76 desc->chip_data = NULL;
82 desc->handle_irq = handle_bad_irq; 77 desc->handle_irq = handle_bad_irq;
83 desc->chip = &no_irq_chip; 78 desc->chip = &no_irq_chip;
79 desc->name = NULL;
84 spin_unlock_irqrestore(&desc->lock, flags); 80 spin_unlock_irqrestore(&desc->lock, flags);
85} 81}
86 82
@@ -92,19 +88,17 @@ void dynamic_irq_cleanup(unsigned int irq)
92 */ 88 */
93int set_irq_chip(unsigned int irq, struct irq_chip *chip) 89int set_irq_chip(unsigned int irq, struct irq_chip *chip)
94{ 90{
95 struct irq_desc *desc; 91 struct irq_desc *desc = irq_to_desc(irq);
96 unsigned long flags; 92 unsigned long flags;
97 93
98 if (irq >= NR_IRQS) { 94 if (!desc) {
99 printk(KERN_ERR "Trying to install chip for IRQ%d\n", irq); 95 WARN(1, KERN_ERR "Trying to install chip for IRQ%d\n", irq);
100 WARN_ON(1);
101 return -EINVAL; 96 return -EINVAL;
102 } 97 }
103 98
104 if (!chip) 99 if (!chip)
105 chip = &no_irq_chip; 100 chip = &no_irq_chip;
106 101
107 desc = irq_desc + irq;
108 spin_lock_irqsave(&desc->lock, flags); 102 spin_lock_irqsave(&desc->lock, flags);
109 irq_chip_set_defaults(chip); 103 irq_chip_set_defaults(chip);
110 desc->chip = chip; 104 desc->chip = chip;
@@ -115,27 +109,27 @@ int set_irq_chip(unsigned int irq, struct irq_chip *chip)
115EXPORT_SYMBOL(set_irq_chip); 109EXPORT_SYMBOL(set_irq_chip);
116 110
117/** 111/**
118 * set_irq_type - set the irq type for an irq 112 * set_irq_type - set the irq trigger type for an irq
119 * @irq: irq number 113 * @irq: irq number
120 * @type: interrupt type - see include/linux/interrupt.h 114 * @type: IRQ_TYPE_{LEVEL,EDGE}_* value - see include/linux/irq.h
121 */ 115 */
122int set_irq_type(unsigned int irq, unsigned int type) 116int set_irq_type(unsigned int irq, unsigned int type)
123{ 117{
124 struct irq_desc *desc; 118 struct irq_desc *desc = irq_to_desc(irq);
125 unsigned long flags; 119 unsigned long flags;
126 int ret = -ENXIO; 120 int ret = -ENXIO;
127 121
128 if (irq >= NR_IRQS) { 122 if (!desc) {
129 printk(KERN_ERR "Trying to set irq type for IRQ%d\n", irq); 123 printk(KERN_ERR "Trying to set irq type for IRQ%d\n", irq);
130 return -ENODEV; 124 return -ENODEV;
131 } 125 }
132 126
133 desc = irq_desc + irq; 127 if (type == IRQ_TYPE_NONE)
134 if (desc->chip->set_type) { 128 return 0;
135 spin_lock_irqsave(&desc->lock, flags); 129
136 ret = desc->chip->set_type(irq, type); 130 spin_lock_irqsave(&desc->lock, flags);
137 spin_unlock_irqrestore(&desc->lock, flags); 131 ret = __irq_set_trigger(desc, irq, type);
138 } 132 spin_unlock_irqrestore(&desc->lock, flags);
139 return ret; 133 return ret;
140} 134}
141EXPORT_SYMBOL(set_irq_type); 135EXPORT_SYMBOL(set_irq_type);
@@ -149,16 +143,15 @@ EXPORT_SYMBOL(set_irq_type);
149 */ 143 */
150int set_irq_data(unsigned int irq, void *data) 144int set_irq_data(unsigned int irq, void *data)
151{ 145{
152 struct irq_desc *desc; 146 struct irq_desc *desc = irq_to_desc(irq);
153 unsigned long flags; 147 unsigned long flags;
154 148
155 if (irq >= NR_IRQS) { 149 if (!desc) {
156 printk(KERN_ERR 150 printk(KERN_ERR
157 "Trying to install controller data for IRQ%d\n", irq); 151 "Trying to install controller data for IRQ%d\n", irq);
158 return -EINVAL; 152 return -EINVAL;
159 } 153 }
160 154
161 desc = irq_desc + irq;
162 spin_lock_irqsave(&desc->lock, flags); 155 spin_lock_irqsave(&desc->lock, flags);
163 desc->handler_data = data; 156 desc->handler_data = data;
164 spin_unlock_irqrestore(&desc->lock, flags); 157 spin_unlock_irqrestore(&desc->lock, flags);
@@ -175,15 +168,15 @@ EXPORT_SYMBOL(set_irq_data);
175 */ 168 */
176int set_irq_msi(unsigned int irq, struct msi_desc *entry) 169int set_irq_msi(unsigned int irq, struct msi_desc *entry)
177{ 170{
178 struct irq_desc *desc; 171 struct irq_desc *desc = irq_to_desc(irq);
179 unsigned long flags; 172 unsigned long flags;
180 173
181 if (irq >= NR_IRQS) { 174 if (!desc) {
182 printk(KERN_ERR 175 printk(KERN_ERR
183 "Trying to install msi data for IRQ%d\n", irq); 176 "Trying to install msi data for IRQ%d\n", irq);
184 return -EINVAL; 177 return -EINVAL;
185 } 178 }
186 desc = irq_desc + irq; 179
187 spin_lock_irqsave(&desc->lock, flags); 180 spin_lock_irqsave(&desc->lock, flags);
188 desc->msi_desc = entry; 181 desc->msi_desc = entry;
189 if (entry) 182 if (entry)
@@ -201,10 +194,16 @@ int set_irq_msi(unsigned int irq, struct msi_desc *entry)
201 */ 194 */
202int set_irq_chip_data(unsigned int irq, void *data) 195int set_irq_chip_data(unsigned int irq, void *data)
203{ 196{
204 struct irq_desc *desc = irq_desc + irq; 197 struct irq_desc *desc = irq_to_desc(irq);
205 unsigned long flags; 198 unsigned long flags;
206 199
207 if (irq >= NR_IRQS || !desc->chip) { 200 if (!desc) {
201 printk(KERN_ERR
202 "Trying to install chip data for IRQ%d\n", irq);
203 return -EINVAL;
204 }
205
206 if (!desc->chip) {
208 printk(KERN_ERR "BUG: bad set_irq_chip_data(IRQ#%d)\n", irq); 207 printk(KERN_ERR "BUG: bad set_irq_chip_data(IRQ#%d)\n", irq);
209 return -EINVAL; 208 return -EINVAL;
210 } 209 }
@@ -222,7 +221,7 @@ EXPORT_SYMBOL(set_irq_chip_data);
222 */ 221 */
223static void default_enable(unsigned int irq) 222static void default_enable(unsigned int irq)
224{ 223{
225 struct irq_desc *desc = irq_desc + irq; 224 struct irq_desc *desc = irq_to_desc(irq);
226 225
227 desc->chip->unmask(irq); 226 desc->chip->unmask(irq);
228 desc->status &= ~IRQ_MASKED; 227 desc->status &= ~IRQ_MASKED;
@@ -240,8 +239,9 @@ static void default_disable(unsigned int irq)
240 */ 239 */
241static unsigned int default_startup(unsigned int irq) 240static unsigned int default_startup(unsigned int irq)
242{ 241{
243 irq_desc[irq].chip->enable(irq); 242 struct irq_desc *desc = irq_to_desc(irq);
244 243
244 desc->chip->enable(irq);
245 return 0; 245 return 0;
246} 246}
247 247
@@ -250,7 +250,7 @@ static unsigned int default_startup(unsigned int irq)
250 */ 250 */
251static void default_shutdown(unsigned int irq) 251static void default_shutdown(unsigned int irq)
252{ 252{
253 struct irq_desc *desc = irq_desc + irq; 253 struct irq_desc *desc = irq_to_desc(irq);
254 254
255 desc->chip->mask(irq); 255 desc->chip->mask(irq);
256 desc->status |= IRQ_MASKED; 256 desc->status |= IRQ_MASKED;
@@ -309,14 +309,13 @@ handle_simple_irq(unsigned int irq, struct irq_desc *desc)
309{ 309{
310 struct irqaction *action; 310 struct irqaction *action;
311 irqreturn_t action_ret; 311 irqreturn_t action_ret;
312 const unsigned int cpu = smp_processor_id();
313 312
314 spin_lock(&desc->lock); 313 spin_lock(&desc->lock);
315 314
316 if (unlikely(desc->status & IRQ_INPROGRESS)) 315 if (unlikely(desc->status & IRQ_INPROGRESS))
317 goto out_unlock; 316 goto out_unlock;
318 desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); 317 desc->status &= ~(IRQ_REPLAY | IRQ_WAITING);
319 kstat_cpu(cpu).irqs[irq]++; 318 kstat_incr_irqs_this_cpu(irq, desc);
320 319
321 action = desc->action; 320 action = desc->action;
322 if (unlikely(!action || (desc->status & IRQ_DISABLED))) 321 if (unlikely(!action || (desc->status & IRQ_DISABLED)))
@@ -348,7 +347,6 @@ out_unlock:
348void 347void
349handle_level_irq(unsigned int irq, struct irq_desc *desc) 348handle_level_irq(unsigned int irq, struct irq_desc *desc)
350{ 349{
351 unsigned int cpu = smp_processor_id();
352 struct irqaction *action; 350 struct irqaction *action;
353 irqreturn_t action_ret; 351 irqreturn_t action_ret;
354 352
@@ -358,7 +356,7 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc)
358 if (unlikely(desc->status & IRQ_INPROGRESS)) 356 if (unlikely(desc->status & IRQ_INPROGRESS))
359 goto out_unlock; 357 goto out_unlock;
360 desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); 358 desc->status &= ~(IRQ_REPLAY | IRQ_WAITING);
361 kstat_cpu(cpu).irqs[irq]++; 359 kstat_incr_irqs_this_cpu(irq, desc);
362 360
363 /* 361 /*
364 * If its disabled or no action available 362 * If its disabled or no action available
@@ -396,7 +394,6 @@ out_unlock:
396void 394void
397handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc) 395handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
398{ 396{
399 unsigned int cpu = smp_processor_id();
400 struct irqaction *action; 397 struct irqaction *action;
401 irqreturn_t action_ret; 398 irqreturn_t action_ret;
402 399
@@ -406,7 +403,7 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
406 goto out; 403 goto out;
407 404
408 desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); 405 desc->status &= ~(IRQ_REPLAY | IRQ_WAITING);
409 kstat_cpu(cpu).irqs[irq]++; 406 kstat_incr_irqs_this_cpu(irq, desc);
410 407
411 /* 408 /*
412 * If its disabled or no action available 409 * If its disabled or no action available
@@ -455,8 +452,6 @@ out:
455void 452void
456handle_edge_irq(unsigned int irq, struct irq_desc *desc) 453handle_edge_irq(unsigned int irq, struct irq_desc *desc)
457{ 454{
458 const unsigned int cpu = smp_processor_id();
459
460 spin_lock(&desc->lock); 455 spin_lock(&desc->lock);
461 456
462 desc->status &= ~(IRQ_REPLAY | IRQ_WAITING); 457 desc->status &= ~(IRQ_REPLAY | IRQ_WAITING);
@@ -472,8 +467,7 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
472 mask_ack_irq(desc, irq); 467 mask_ack_irq(desc, irq);
473 goto out_unlock; 468 goto out_unlock;
474 } 469 }
475 470 kstat_incr_irqs_this_cpu(irq, desc);
476 kstat_cpu(cpu).irqs[irq]++;
477 471
478 /* Start handling the irq */ 472 /* Start handling the irq */
479 desc->chip->ack(irq); 473 desc->chip->ack(irq);
@@ -528,7 +522,7 @@ handle_percpu_irq(unsigned int irq, struct irq_desc *desc)
528{ 522{
529 irqreturn_t action_ret; 523 irqreturn_t action_ret;
530 524
531 kstat_this_cpu.irqs[irq]++; 525 kstat_incr_irqs_this_cpu(irq, desc);
532 526
533 if (desc->chip->ack) 527 if (desc->chip->ack)
534 desc->chip->ack(irq); 528 desc->chip->ack(irq);
@@ -545,17 +539,15 @@ void
545__set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained, 539__set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
546 const char *name) 540 const char *name)
547{ 541{
548 struct irq_desc *desc; 542 struct irq_desc *desc = irq_to_desc(irq);
549 unsigned long flags; 543 unsigned long flags;
550 544
551 if (irq >= NR_IRQS) { 545 if (!desc) {
552 printk(KERN_ERR 546 printk(KERN_ERR
553 "Trying to install type control for IRQ%d\n", irq); 547 "Trying to install type control for IRQ%d\n", irq);
554 return; 548 return;
555 } 549 }
556 550
557 desc = irq_desc + irq;
558
559 if (!handle) 551 if (!handle)
560 handle = handle_bad_irq; 552 handle = handle_bad_irq;
561 else if (desc->chip == &no_irq_chip) { 553 else if (desc->chip == &no_irq_chip) {
@@ -587,7 +579,7 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
587 desc->status &= ~IRQ_DISABLED; 579 desc->status &= ~IRQ_DISABLED;
588 desc->status |= IRQ_NOREQUEST | IRQ_NOPROBE; 580 desc->status |= IRQ_NOREQUEST | IRQ_NOPROBE;
589 desc->depth = 0; 581 desc->depth = 0;
590 desc->chip->unmask(irq); 582 desc->chip->startup(irq);
591 } 583 }
592 spin_unlock_irqrestore(&desc->lock, flags); 584 spin_unlock_irqrestore(&desc->lock, flags);
593} 585}
@@ -610,17 +602,14 @@ set_irq_chip_and_handler_name(unsigned int irq, struct irq_chip *chip,
610 602
611void __init set_irq_noprobe(unsigned int irq) 603void __init set_irq_noprobe(unsigned int irq)
612{ 604{
613 struct irq_desc *desc; 605 struct irq_desc *desc = irq_to_desc(irq);
614 unsigned long flags; 606 unsigned long flags;
615 607
616 if (irq >= NR_IRQS) { 608 if (!desc) {
617 printk(KERN_ERR "Trying to mark IRQ%d non-probeable\n", irq); 609 printk(KERN_ERR "Trying to mark IRQ%d non-probeable\n", irq);
618
619 return; 610 return;
620 } 611 }
621 612
622 desc = irq_desc + irq;
623
624 spin_lock_irqsave(&desc->lock, flags); 613 spin_lock_irqsave(&desc->lock, flags);
625 desc->status |= IRQ_NOPROBE; 614 desc->status |= IRQ_NOPROBE;
626 spin_unlock_irqrestore(&desc->lock, flags); 615 spin_unlock_irqrestore(&desc->lock, flags);
@@ -628,17 +617,14 @@ void __init set_irq_noprobe(unsigned int irq)
628 617
629void __init set_irq_probe(unsigned int irq) 618void __init set_irq_probe(unsigned int irq)
630{ 619{
631 struct irq_desc *desc; 620 struct irq_desc *desc = irq_to_desc(irq);
632 unsigned long flags; 621 unsigned long flags;
633 622
634 if (irq >= NR_IRQS) { 623 if (!desc) {
635 printk(KERN_ERR "Trying to mark IRQ%d probeable\n", irq); 624 printk(KERN_ERR "Trying to mark IRQ%d probeable\n", irq);
636
637 return; 625 return;
638 } 626 }
639 627
640 desc = irq_desc + irq;
641
642 spin_lock_irqsave(&desc->lock, flags); 628 spin_lock_irqsave(&desc->lock, flags);
643 desc->status &= ~IRQ_NOPROBE; 629 desc->status &= ~IRQ_NOPROBE;
644 spin_unlock_irqrestore(&desc->lock, flags); 630 spin_unlock_irqrestore(&desc->lock, flags);
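Throughout chip.c the open-coded "irq >= NR_IRQS" checks and "irq_desc + irq" arithmetic are replaced by irq_to_desc(), with a NULL return signalling an invalid interrupt number. For the non-sparse case the lookup presumably reduces to a bounds-checked array access, roughly:

/* Sketch: a plausible flat-array irq_to_desc(); returns NULL for bad numbers. */
static inline struct irq_desc *sketch_irq_to_desc(unsigned int irq)
{
	return (irq < nr_irqs) ? irq_desc + irq : NULL;
}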
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 5fa6198e9139..c815b42d0f5b 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -25,11 +25,10 @@
25 * 25 *
26 * Handles spurious and unhandled IRQ's. It also prints a debugmessage. 26 * Handles spurious and unhandled IRQ's. It also prints a debugmessage.
27 */ 27 */
28void 28void handle_bad_irq(unsigned int irq, struct irq_desc *desc)
29handle_bad_irq(unsigned int irq, struct irq_desc *desc)
30{ 29{
31 print_irq_desc(irq, desc); 30 print_irq_desc(irq, desc);
32 kstat_this_cpu.irqs[irq]++; 31 kstat_incr_irqs_this_cpu(irq, desc);
33 ack_bad_irq(irq); 32 ack_bad_irq(irq);
34} 33}
35 34
@@ -47,6 +46,9 @@ handle_bad_irq(unsigned int irq, struct irq_desc *desc)
47 * 46 *
48 * Controller mappings for all interrupt sources: 47 * Controller mappings for all interrupt sources:
49 */ 48 */
49int nr_irqs = NR_IRQS;
50EXPORT_SYMBOL_GPL(nr_irqs);
51
50struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = { 52struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = {
51 [0 ... NR_IRQS-1] = { 53 [0 ... NR_IRQS-1] = {
52 .status = IRQ_DISABLED, 54 .status = IRQ_DISABLED,
@@ -66,7 +68,9 @@ struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = {
66 */ 68 */
67static void ack_bad(unsigned int irq) 69static void ack_bad(unsigned int irq)
68{ 70{
69 print_irq_desc(irq, irq_desc + irq); 71 struct irq_desc *desc = irq_to_desc(irq);
72
73 print_irq_desc(irq, desc);
70 ack_bad_irq(irq); 74 ack_bad_irq(irq);
71} 75}
72 76
@@ -131,8 +135,6 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action)
131 irqreturn_t ret, retval = IRQ_NONE; 135 irqreturn_t ret, retval = IRQ_NONE;
132 unsigned int status = 0; 136 unsigned int status = 0;
133 137
134 handle_dynamic_tick(action);
135
136 if (!(action->flags & IRQF_DISABLED)) 138 if (!(action->flags & IRQF_DISABLED))
137 local_irq_enable_in_hardirq(); 139 local_irq_enable_in_hardirq();
138 140
@@ -165,11 +167,12 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action)
165 */ 167 */
166unsigned int __do_IRQ(unsigned int irq) 168unsigned int __do_IRQ(unsigned int irq)
167{ 169{
168 struct irq_desc *desc = irq_desc + irq; 170 struct irq_desc *desc = irq_to_desc(irq);
169 struct irqaction *action; 171 struct irqaction *action;
170 unsigned int status; 172 unsigned int status;
171 173
172 kstat_this_cpu.irqs[irq]++; 174 kstat_incr_irqs_this_cpu(irq, desc);
175
173 if (CHECK_IRQ_PER_CPU(desc->status)) { 176 if (CHECK_IRQ_PER_CPU(desc->status)) {
174 irqreturn_t action_ret; 177 irqreturn_t action_ret;
175 178
@@ -256,8 +259,8 @@ out:
256} 259}
257#endif 260#endif
258 261
259#ifdef CONFIG_TRACE_IRQFLAGS
260 262
263#ifdef CONFIG_TRACE_IRQFLAGS
261/* 264/*
262 * lockdep: we want to handle all irq_desc locks as a single lock-class: 265 * lockdep: we want to handle all irq_desc locks as a single lock-class:
263 */ 266 */
@@ -265,10 +268,10 @@ static struct lock_class_key irq_desc_lock_class;
265 268
266void early_init_irq_lock_class(void) 269void early_init_irq_lock_class(void)
267{ 270{
271 struct irq_desc *desc;
268 int i; 272 int i;
269 273
270 for (i = 0; i < NR_IRQS; i++) 274 for_each_irq_desc(i, desc)
271 lockdep_set_class(&irq_desc[i].lock, &irq_desc_lock_class); 275 lockdep_set_class(&desc->lock, &irq_desc_lock_class);
272} 276}
273
274#endif 277#endif
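Per-interrupt statistics are now bumped through kstat_incr_irqs_this_cpu(irq, desc); passing the descriptor prepares for moving the counters into the descriptor itself. A transitional definition compatible with the kstat_this_cpu.irqs[irq]++ lines it replaces might look like the following; this is an assumption, not a verified excerpt of kernel_stat.h.

/* Assumed transitional form: desc is accepted but not yet used. */
#define sketch_kstat_incr_irqs_this_cpu(irq, desc)	\
	(kstat_this_cpu.irqs[irq]++)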
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 08a849a22447..c9767e641980 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -10,12 +10,15 @@ extern void irq_chip_set_defaults(struct irq_chip *chip);
10/* Set default handler: */ 10/* Set default handler: */
11extern void compat_irq_chip_set_default_handler(struct irq_desc *desc); 11extern void compat_irq_chip_set_default_handler(struct irq_desc *desc);
12 12
13extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
14 unsigned long flags);
15
13#ifdef CONFIG_PROC_FS 16#ifdef CONFIG_PROC_FS
14extern void register_irq_proc(unsigned int irq); 17extern void register_irq_proc(unsigned int irq, struct irq_desc *desc);
15extern void register_handler_proc(unsigned int irq, struct irqaction *action); 18extern void register_handler_proc(unsigned int irq, struct irqaction *action);
16extern void unregister_handler_proc(unsigned int irq, struct irqaction *action); 19extern void unregister_handler_proc(unsigned int irq, struct irqaction *action);
17#else 20#else
18static inline void register_irq_proc(unsigned int irq) { } 21static inline void register_irq_proc(unsigned int irq, struct irq_desc *desc) { }
19static inline void register_handler_proc(unsigned int irq, 22static inline void register_handler_proc(unsigned int irq,
20 struct irqaction *action) { } 23 struct irqaction *action) { }
21static inline void unregister_handler_proc(unsigned int irq, 24static inline void unregister_handler_proc(unsigned int irq,
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 77a51be36010..c498a1b8c621 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -31,10 +31,10 @@ cpumask_t irq_default_affinity = CPU_MASK_ALL;
31 */ 31 */
32void synchronize_irq(unsigned int irq) 32void synchronize_irq(unsigned int irq)
33{ 33{
34 struct irq_desc *desc = irq_desc + irq; 34 struct irq_desc *desc = irq_to_desc(irq);
35 unsigned int status; 35 unsigned int status;
36 36
37 if (irq >= NR_IRQS) 37 if (!desc)
38 return; 38 return;
39 39
40 do { 40 do {
@@ -64,7 +64,7 @@ EXPORT_SYMBOL(synchronize_irq);
64 */ 64 */
65int irq_can_set_affinity(unsigned int irq) 65int irq_can_set_affinity(unsigned int irq)
66{ 66{
67 struct irq_desc *desc = irq_desc + irq; 67 struct irq_desc *desc = irq_to_desc(irq);
68 68
69 if (CHECK_IRQ_PER_CPU(desc->status) || !desc->chip || 69 if (CHECK_IRQ_PER_CPU(desc->status) || !desc->chip ||
70 !desc->chip->set_affinity) 70 !desc->chip->set_affinity)
@@ -81,15 +81,21 @@ int irq_can_set_affinity(unsigned int irq)
81 */ 81 */
82int irq_set_affinity(unsigned int irq, cpumask_t cpumask) 82int irq_set_affinity(unsigned int irq, cpumask_t cpumask)
83{ 83{
84 struct irq_desc *desc = irq_desc + irq; 84 struct irq_desc *desc = irq_to_desc(irq);
85 85
86 if (!desc->chip->set_affinity) 86 if (!desc->chip->set_affinity)
87 return -EINVAL; 87 return -EINVAL;
88 88
89 set_balance_irq_affinity(irq, cpumask);
90
91#ifdef CONFIG_GENERIC_PENDING_IRQ 89#ifdef CONFIG_GENERIC_PENDING_IRQ
92 set_pending_irq(irq, cpumask); 90 if (desc->status & IRQ_MOVE_PCNTXT || desc->status & IRQ_DISABLED) {
91 unsigned long flags;
92
93 spin_lock_irqsave(&desc->lock, flags);
94 desc->affinity = cpumask;
95 desc->chip->set_affinity(irq, cpumask);
96 spin_unlock_irqrestore(&desc->lock, flags);
97 } else
98 set_pending_irq(irq, cpumask);
93#else 99#else
94 desc->affinity = cpumask; 100 desc->affinity = cpumask;
95 desc->chip->set_affinity(irq, cpumask); 101 desc->chip->set_affinity(irq, cpumask);
@@ -104,16 +110,17 @@ int irq_set_affinity(unsigned int irq, cpumask_t cpumask)
104int irq_select_affinity(unsigned int irq) 110int irq_select_affinity(unsigned int irq)
105{ 111{
106 cpumask_t mask; 112 cpumask_t mask;
113 struct irq_desc *desc;
107 114
108 if (!irq_can_set_affinity(irq)) 115 if (!irq_can_set_affinity(irq))
109 return 0; 116 return 0;
110 117
111 cpus_and(mask, cpu_online_map, irq_default_affinity); 118 cpus_and(mask, cpu_online_map, irq_default_affinity);
112 119
113 irq_desc[irq].affinity = mask; 120 desc = irq_to_desc(irq);
114 irq_desc[irq].chip->set_affinity(irq, mask); 121 desc->affinity = mask;
122 desc->chip->set_affinity(irq, mask);
115 123
116 set_balance_irq_affinity(irq, mask);
117 return 0; 124 return 0;
118} 125}
119#endif 126#endif
@@ -133,10 +140,10 @@ int irq_select_affinity(unsigned int irq)
133 */ 140 */
134void disable_irq_nosync(unsigned int irq) 141void disable_irq_nosync(unsigned int irq)
135{ 142{
136 struct irq_desc *desc = irq_desc + irq; 143 struct irq_desc *desc = irq_to_desc(irq);
137 unsigned long flags; 144 unsigned long flags;
138 145
139 if (irq >= NR_IRQS) 146 if (!desc)
140 return; 147 return;
141 148
142 spin_lock_irqsave(&desc->lock, flags); 149 spin_lock_irqsave(&desc->lock, flags);
@@ -162,9 +169,9 @@ EXPORT_SYMBOL(disable_irq_nosync);
162 */ 169 */
163void disable_irq(unsigned int irq) 170void disable_irq(unsigned int irq)
164{ 171{
165 struct irq_desc *desc = irq_desc + irq; 172 struct irq_desc *desc = irq_to_desc(irq);
166 173
167 if (irq >= NR_IRQS) 174 if (!desc)
168 return; 175 return;
169 176
170 disable_irq_nosync(irq); 177 disable_irq_nosync(irq);
@@ -177,8 +184,7 @@ static void __enable_irq(struct irq_desc *desc, unsigned int irq)
177{ 184{
178 switch (desc->depth) { 185 switch (desc->depth) {
179 case 0: 186 case 0:
180 printk(KERN_WARNING "Unbalanced enable for IRQ %d\n", irq); 187 WARN(1, KERN_WARNING "Unbalanced enable for IRQ %d\n", irq);
181 WARN_ON(1);
182 break; 188 break;
183 case 1: { 189 case 1: {
184 unsigned int status = desc->status & ~IRQ_DISABLED; 190 unsigned int status = desc->status & ~IRQ_DISABLED;
@@ -205,10 +211,10 @@ static void __enable_irq(struct irq_desc *desc, unsigned int irq)
205 */ 211 */
206void enable_irq(unsigned int irq) 212void enable_irq(unsigned int irq)
207{ 213{
208 struct irq_desc *desc = irq_desc + irq; 214 struct irq_desc *desc = irq_to_desc(irq);
209 unsigned long flags; 215 unsigned long flags;
210 216
211 if (irq >= NR_IRQS) 217 if (!desc)
212 return; 218 return;
213 219
214 spin_lock_irqsave(&desc->lock, flags); 220 spin_lock_irqsave(&desc->lock, flags);
@@ -217,6 +223,17 @@ void enable_irq(unsigned int irq)
217} 223}
218EXPORT_SYMBOL(enable_irq); 224EXPORT_SYMBOL(enable_irq);
219 225
226static int set_irq_wake_real(unsigned int irq, unsigned int on)
227{
228 struct irq_desc *desc = irq_to_desc(irq);
229 int ret = -ENXIO;
230
231 if (desc->chip->set_wake)
232 ret = desc->chip->set_wake(irq, on);
233
234 return ret;
235}
236
220/** 237/**
221 * set_irq_wake - control irq power management wakeup 238 * set_irq_wake - control irq power management wakeup
222 * @irq: interrupt to control 239 * @irq: interrupt to control
@@ -231,32 +248,34 @@ EXPORT_SYMBOL(enable_irq);
231 */ 248 */
232int set_irq_wake(unsigned int irq, unsigned int on) 249int set_irq_wake(unsigned int irq, unsigned int on)
233{ 250{
234 struct irq_desc *desc = irq_desc + irq; 251 struct irq_desc *desc = irq_to_desc(irq);
235 unsigned long flags; 252 unsigned long flags;
236 int ret = -ENXIO; 253 int ret = 0;
237 int (*set_wake)(unsigned, unsigned) = desc->chip->set_wake;
238 254
239 /* wakeup-capable irqs can be shared between drivers that 255 /* wakeup-capable irqs can be shared between drivers that
240 * don't need to have the same sleep mode behaviors. 256 * don't need to have the same sleep mode behaviors.
241 */ 257 */
242 spin_lock_irqsave(&desc->lock, flags); 258 spin_lock_irqsave(&desc->lock, flags);
243 if (on) { 259 if (on) {
244 if (desc->wake_depth++ == 0) 260 if (desc->wake_depth++ == 0) {
245 desc->status |= IRQ_WAKEUP; 261 ret = set_irq_wake_real(irq, on);
246 else 262 if (ret)
247 set_wake = NULL; 263 desc->wake_depth = 0;
264 else
265 desc->status |= IRQ_WAKEUP;
266 }
248 } else { 267 } else {
249 if (desc->wake_depth == 0) { 268 if (desc->wake_depth == 0) {
250 printk(KERN_WARNING "Unbalanced IRQ %d " 269 WARN(1, "Unbalanced IRQ %d wake disable\n", irq);
251 "wake disable\n", irq); 270 } else if (--desc->wake_depth == 0) {
252 WARN_ON(1); 271 ret = set_irq_wake_real(irq, on);
253 } else if (--desc->wake_depth == 0) 272 if (ret)
254 desc->status &= ~IRQ_WAKEUP; 273 desc->wake_depth = 1;
255 else 274 else
256 set_wake = NULL; 275 desc->status &= ~IRQ_WAKEUP;
276 }
257 } 277 }
258 if (set_wake) 278
259 ret = desc->chip->set_wake(irq, on);
260 spin_unlock_irqrestore(&desc->lock, flags); 279 spin_unlock_irqrestore(&desc->lock, flags);
261 return ret; 280 return ret;
262} 281}
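set_irq_wake() now keeps wake_depth balanced even when the chip's set_wake() callback fails: a failed enable rolls the depth back to 0 and a failed disable restores it to 1, so later calls neither underflow nor leak a wake reference. A hypothetical driver pairing that relies on this refcounting:

#include <linux/platform_device.h>

static unsigned int example_irq;	/* hypothetical, assumed set up elsewhere */

/* Hypothetical suspend/resume pairing around set_irq_wake(). */
static int example_suspend(struct platform_device *pdev, pm_message_t state)
{
	if (device_may_wakeup(&pdev->dev))
		set_irq_wake(example_irq, 1);	/* arm the line as a wake source */
	return 0;
}

static int example_resume(struct platform_device *pdev)
{
	if (device_may_wakeup(&pdev->dev))
		set_irq_wake(example_irq, 0);	/* drop the wake reference */
	return 0;
}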
@@ -269,12 +288,16 @@ EXPORT_SYMBOL(set_irq_wake);
269 */ 288 */
270int can_request_irq(unsigned int irq, unsigned long irqflags) 289int can_request_irq(unsigned int irq, unsigned long irqflags)
271{ 290{
291 struct irq_desc *desc = irq_to_desc(irq);
272 struct irqaction *action; 292 struct irqaction *action;
273 293
274 if (irq >= NR_IRQS || irq_desc[irq].status & IRQ_NOREQUEST) 294 if (!desc)
275 return 0; 295 return 0;
276 296
277 action = irq_desc[irq].action; 297 if (desc->status & IRQ_NOREQUEST)
298 return 0;
299
300 action = desc->action;
278 if (action) 301 if (action)
279 if (irqflags & action->flags & IRQF_SHARED) 302 if (irqflags & action->flags & IRQF_SHARED)
280 action = NULL; 303 action = NULL;
@@ -293,19 +316,51 @@ void compat_irq_chip_set_default_handler(struct irq_desc *desc)
293 desc->handle_irq = NULL; 316 desc->handle_irq = NULL;
294} 317}
295 318
319int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
320 unsigned long flags)
321{
322 int ret;
323 struct irq_chip *chip = desc->chip;
324
325 if (!chip || !chip->set_type) {
326 /*
327 * IRQF_TRIGGER_* but the PIC does not support multiple
328 * flow-types?
329 */
330 pr_warning("No set_type function for IRQ %d (%s)\n", irq,
331 chip ? (chip->name ? : "unknown") : "unknown");
332 return 0;
333 }
334
335 ret = chip->set_type(irq, flags & IRQF_TRIGGER_MASK);
336
337 if (ret)
338 pr_err("setting trigger mode %d for irq %u failed (%pF)\n",
339 (int)(flags & IRQF_TRIGGER_MASK),
340 irq, chip->set_type);
341 else {
342 /* note that IRQF_TRIGGER_MASK == IRQ_TYPE_SENSE_MASK */
343 desc->status &= ~IRQ_TYPE_SENSE_MASK;
344 desc->status |= flags & IRQ_TYPE_SENSE_MASK;
345 }
346
347 return ret;
348}
349
296/* 350/*
297 * Internal function to register an irqaction - typically used to 351 * Internal function to register an irqaction - typically used to
298 * allocate special interrupts that are part of the architecture. 352 * allocate special interrupts that are part of the architecture.
299 */ 353 */
300int setup_irq(unsigned int irq, struct irqaction *new) 354static int
355__setup_irq(unsigned int irq, struct irq_desc * desc, struct irqaction *new)
301{ 356{
302 struct irq_desc *desc = irq_desc + irq;
303 struct irqaction *old, **p; 357 struct irqaction *old, **p;
304 const char *old_name = NULL; 358 const char *old_name = NULL;
305 unsigned long flags; 359 unsigned long flags;
306 int shared = 0; 360 int shared = 0;
361 int ret;
307 362
308 if (irq >= NR_IRQS) 363 if (!desc)
309 return -EINVAL; 364 return -EINVAL;
310 365
311 if (desc->chip == &no_irq_chip) 366 if (desc->chip == &no_irq_chip)
@@ -361,35 +416,23 @@ int setup_irq(unsigned int irq, struct irqaction *new)
361 shared = 1; 416 shared = 1;
362 } 417 }
363 418
364 *p = new;
365
366 /* Exclude IRQ from balancing */
367 if (new->flags & IRQF_NOBALANCING)
368 desc->status |= IRQ_NO_BALANCING;
369
370 if (!shared) { 419 if (!shared) {
371 irq_chip_set_defaults(desc->chip); 420 irq_chip_set_defaults(desc->chip);
372 421
373#if defined(CONFIG_IRQ_PER_CPU)
374 if (new->flags & IRQF_PERCPU)
375 desc->status |= IRQ_PER_CPU;
376#endif
377
378 /* Setup the type (level, edge polarity) if configured: */ 422 /* Setup the type (level, edge polarity) if configured: */
379 if (new->flags & IRQF_TRIGGER_MASK) { 423 if (new->flags & IRQF_TRIGGER_MASK) {
380 if (desc->chip->set_type) 424 ret = __irq_set_trigger(desc, irq, new->flags);
381 desc->chip->set_type(irq, 425
382 new->flags & IRQF_TRIGGER_MASK); 426 if (ret) {
383 else 427 spin_unlock_irqrestore(&desc->lock, flags);
384 /* 428 return ret;
385 * IRQF_TRIGGER_* but the PIC does not support 429 }
386 * multiple flow-types?
387 */
388 printk(KERN_WARNING "No IRQF_TRIGGER set_type "
389 "function for IRQ %d (%s)\n", irq,
390 desc->chip->name);
391 } else 430 } else
392 compat_irq_chip_set_default_handler(desc); 431 compat_irq_chip_set_default_handler(desc);
432#if defined(CONFIG_IRQ_PER_CPU)
433 if (new->flags & IRQF_PERCPU)
434 desc->status |= IRQ_PER_CPU;
435#endif
393 436
394 desc->status &= ~(IRQ_AUTODETECT | IRQ_WAITING | 437 desc->status &= ~(IRQ_AUTODETECT | IRQ_WAITING |
395 IRQ_INPROGRESS | IRQ_SPURIOUS_DISABLED); 438 IRQ_INPROGRESS | IRQ_SPURIOUS_DISABLED);
@@ -397,17 +440,29 @@ int setup_irq(unsigned int irq, struct irqaction *new)
397 if (!(desc->status & IRQ_NOAUTOEN)) { 440 if (!(desc->status & IRQ_NOAUTOEN)) {
398 desc->depth = 0; 441 desc->depth = 0;
399 desc->status &= ~IRQ_DISABLED; 442 desc->status &= ~IRQ_DISABLED;
400 if (desc->chip->startup) 443 desc->chip->startup(irq);
401 desc->chip->startup(irq);
402 else
403 desc->chip->enable(irq);
404 } else 444 } else
405 /* Undo nested disables: */ 445 /* Undo nested disables: */
406 desc->depth = 1; 446 desc->depth = 1;
407 447
408 /* Set default affinity mask once everything is setup */ 448 /* Set default affinity mask once everything is setup */
409 irq_select_affinity(irq); 449 irq_select_affinity(irq);
450
451 } else if ((new->flags & IRQF_TRIGGER_MASK)
452 && (new->flags & IRQF_TRIGGER_MASK)
453 != (desc->status & IRQ_TYPE_SENSE_MASK)) {
454 /* hope the handler works with the actual trigger mode... */
455 pr_warning("IRQ %d uses trigger mode %d; requested %d\n",
456 irq, (int)(desc->status & IRQ_TYPE_SENSE_MASK),
457 (int)(new->flags & IRQF_TRIGGER_MASK));
410 } 458 }
459
460 *p = new;
461
462 /* Exclude IRQ from balancing */
463 if (new->flags & IRQF_NOBALANCING)
464 desc->status |= IRQ_NO_BALANCING;
465
411 /* Reset broken irq detection when installing new handler */ 466 /* Reset broken irq detection when installing new handler */
412 desc->irq_count = 0; 467 desc->irq_count = 0;
413 desc->irqs_unhandled = 0; 468 desc->irqs_unhandled = 0;
@@ -424,7 +479,7 @@ int setup_irq(unsigned int irq, struct irqaction *new)
424 spin_unlock_irqrestore(&desc->lock, flags); 479 spin_unlock_irqrestore(&desc->lock, flags);
425 480
426 new->irq = irq; 481 new->irq = irq;
427 register_irq_proc(irq); 482 register_irq_proc(irq, desc);
428 new->dir = NULL; 483 new->dir = NULL;
429 register_handler_proc(irq, new); 484 register_handler_proc(irq, new);
430 485
@@ -444,6 +499,20 @@ mismatch:
444} 499}
445 500
446/** 501/**
502 * setup_irq - setup an interrupt
503 * @irq: Interrupt line to setup
504 * @act: irqaction for the interrupt
505 *
506 * Used to statically setup interrupts in the early boot process.
507 */
508int setup_irq(unsigned int irq, struct irqaction *act)
509{
510 struct irq_desc *desc = irq_to_desc(irq);
511
512 return __setup_irq(irq, desc, act);
513}
514
515/**
447 * free_irq - free an interrupt 516 * free_irq - free an interrupt
448 * @irq: Interrupt line to free 517 * @irq: Interrupt line to free
449 * @dev_id: Device identity to free 518 * @dev_id: Device identity to free
@@ -459,15 +528,15 @@ mismatch:
459 */ 528 */
460void free_irq(unsigned int irq, void *dev_id) 529void free_irq(unsigned int irq, void *dev_id)
461{ 530{
462 struct irq_desc *desc; 531 struct irq_desc *desc = irq_to_desc(irq);
463 struct irqaction **p; 532 struct irqaction **p;
464 unsigned long flags; 533 unsigned long flags;
465 534
466 WARN_ON(in_interrupt()); 535 WARN_ON(in_interrupt());
467 if (irq >= NR_IRQS) 536
537 if (!desc)
468 return; 538 return;
469 539
470 desc = irq_desc + irq;
471 spin_lock_irqsave(&desc->lock, flags); 540 spin_lock_irqsave(&desc->lock, flags);
472 p = &desc->action; 541 p = &desc->action;
473 for (;;) { 542 for (;;) {
@@ -556,12 +625,14 @@ EXPORT_SYMBOL(free_irq);
556 * IRQF_SHARED Interrupt is shared 625 * IRQF_SHARED Interrupt is shared
557 * IRQF_DISABLED Disable local interrupts while processing 626 * IRQF_DISABLED Disable local interrupts while processing
558 * IRQF_SAMPLE_RANDOM The interrupt can be used for entropy 627 * IRQF_SAMPLE_RANDOM The interrupt can be used for entropy
628 * IRQF_TRIGGER_* Specify active edge(s) or level
559 * 629 *
560 */ 630 */
561int request_irq(unsigned int irq, irq_handler_t handler, 631int request_irq(unsigned int irq, irq_handler_t handler,
562 unsigned long irqflags, const char *devname, void *dev_id) 632 unsigned long irqflags, const char *devname, void *dev_id)
563{ 633{
564 struct irqaction *action; 634 struct irqaction *action;
635 struct irq_desc *desc;
565 int retval; 636 int retval;
566 637
567#ifdef CONFIG_LOCKDEP 638#ifdef CONFIG_LOCKDEP
@@ -578,9 +649,12 @@ int request_irq(unsigned int irq, irq_handler_t handler,
578 */ 649 */
579 if ((irqflags & IRQF_SHARED) && !dev_id) 650 if ((irqflags & IRQF_SHARED) && !dev_id)
580 return -EINVAL; 651 return -EINVAL;
581 if (irq >= NR_IRQS) 652
653 desc = irq_to_desc(irq);
654 if (!desc)
582 return -EINVAL; 655 return -EINVAL;
583 if (irq_desc[irq].status & IRQ_NOREQUEST) 656
657 if (desc->status & IRQ_NOREQUEST)
584 return -EINVAL; 658 return -EINVAL;
585 if (!handler) 659 if (!handler)
586 return -EINVAL; 660 return -EINVAL;
@@ -596,26 +670,29 @@ int request_irq(unsigned int irq, irq_handler_t handler,
596 action->next = NULL; 670 action->next = NULL;
597 action->dev_id = dev_id; 671 action->dev_id = dev_id;
598 672
673 retval = __setup_irq(irq, desc, action);
674 if (retval)
675 kfree(action);
676
599#ifdef CONFIG_DEBUG_SHIRQ 677#ifdef CONFIG_DEBUG_SHIRQ
600 if (irqflags & IRQF_SHARED) { 678 if (irqflags & IRQF_SHARED) {
601 /* 679 /*
602 * It's a shared IRQ -- the driver ought to be prepared for it 680 * It's a shared IRQ -- the driver ought to be prepared for it
603 * to happen immediately, so let's make sure.... 681 * to happen immediately, so let's make sure....
604 * We do this before actually registering it, to make sure that 682 * We disable the irq to make sure that a 'real' IRQ doesn't
605 * a 'real' IRQ doesn't run in parallel with our fake 683 * run in parallel with our fake.
606 */ 684 */
607 unsigned long flags; 685 unsigned long flags;
608 686
687 disable_irq(irq);
609 local_irq_save(flags); 688 local_irq_save(flags);
689
610 handler(irq, dev_id); 690 handler(irq, dev_id);
691
611 local_irq_restore(flags); 692 local_irq_restore(flags);
693 enable_irq(irq);
612 } 694 }
613#endif 695#endif
614
615 retval = setup_irq(irq, action);
616 if (retval)
617 kfree(action);
618
619 return retval; 696 return retval;
620} 697}
621EXPORT_SYMBOL(request_irq); 698EXPORT_SYMBOL(request_irq);
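request_irq() now resolves the descriptor once, rejects unknown or IRQ_NOREQUEST lines up front, and hands the real work to the internal __setup_irq(); the CONFIG_DEBUG_SHIRQ fake invocation moves after registration and runs with the line disabled. Usage is unchanged, as in this hypothetical shared, falling-edge request:

/* Hypothetical request using the IRQF_TRIGGER_* flags documented above. */
static irqreturn_t example_handler(int irq, void *dev_id)
{
	/* ... check whether our device raised the interrupt ... */
	return IRQ_HANDLED;
}

static int example_attach(unsigned int irq, void *dev)
{
	int ret;

	ret = request_irq(irq, example_handler,
			  IRQF_SHARED | IRQF_TRIGGER_FALLING,
			  "example-dev", dev);	/* dev_id is mandatory for IRQF_SHARED */
	if (ret)
		printk(KERN_ERR "example: request_irq(%u) failed: %d\n", irq, ret);
	return ret;
}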
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index 77b7acc875c5..90b920d3f52b 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -3,18 +3,18 @@
3 3
4void set_pending_irq(unsigned int irq, cpumask_t mask) 4void set_pending_irq(unsigned int irq, cpumask_t mask)
5{ 5{
6 struct irq_desc *desc = irq_desc + irq; 6 struct irq_desc *desc = irq_to_desc(irq);
7 unsigned long flags; 7 unsigned long flags;
8 8
9 spin_lock_irqsave(&desc->lock, flags); 9 spin_lock_irqsave(&desc->lock, flags);
10 desc->status |= IRQ_MOVE_PENDING; 10 desc->status |= IRQ_MOVE_PENDING;
11 irq_desc[irq].pending_mask = mask; 11 desc->pending_mask = mask;
12 spin_unlock_irqrestore(&desc->lock, flags); 12 spin_unlock_irqrestore(&desc->lock, flags);
13} 13}
14 14
15void move_masked_irq(int irq) 15void move_masked_irq(int irq)
16{ 16{
17 struct irq_desc *desc = irq_desc + irq; 17 struct irq_desc *desc = irq_to_desc(irq);
18 cpumask_t tmp; 18 cpumask_t tmp;
19 19
20 if (likely(!(desc->status & IRQ_MOVE_PENDING))) 20 if (likely(!(desc->status & IRQ_MOVE_PENDING)))
@@ -30,7 +30,7 @@ void move_masked_irq(int irq)
30 30
31 desc->status &= ~IRQ_MOVE_PENDING; 31 desc->status &= ~IRQ_MOVE_PENDING;
32 32
33 if (unlikely(cpus_empty(irq_desc[irq].pending_mask))) 33 if (unlikely(cpus_empty(desc->pending_mask)))
34 return; 34 return;
35 35
36 if (!desc->chip->set_affinity) 36 if (!desc->chip->set_affinity)
@@ -38,7 +38,7 @@ void move_masked_irq(int irq)
38 38
39 assert_spin_locked(&desc->lock); 39 assert_spin_locked(&desc->lock);
40 40
41 cpus_and(tmp, irq_desc[irq].pending_mask, cpu_online_map); 41 cpus_and(tmp, desc->pending_mask, cpu_online_map);
42 42
43 /* 43 /*
44 * If there was a valid mask to work with, please 44 * If there was a valid mask to work with, please
@@ -55,12 +55,12 @@ void move_masked_irq(int irq)
55 if (likely(!cpus_empty(tmp))) { 55 if (likely(!cpus_empty(tmp))) {
56 desc->chip->set_affinity(irq,tmp); 56 desc->chip->set_affinity(irq,tmp);
57 } 57 }
58 cpus_clear(irq_desc[irq].pending_mask); 58 cpus_clear(desc->pending_mask);
59} 59}
60 60
61void move_native_irq(int irq) 61void move_native_irq(int irq)
62{ 62{
63 struct irq_desc *desc = irq_desc + irq; 63 struct irq_desc *desc = irq_to_desc(irq);
64 64
65 if (likely(!(desc->status & IRQ_MOVE_PENDING))) 65 if (likely(!(desc->status & IRQ_MOVE_PENDING)))
66 return; 66 return;
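Note: throughout this file the direct irq_desc[irq] indexing is replaced by the irq_to_desc() accessor, which hides how descriptors are stored from the callers. As a rough illustration (a sketch, not the exact in-tree definition), the flat-array case behaves like:

static inline struct irq_desc *irq_to_desc(unsigned int irq)
{
        return (irq < nr_irqs) ? irq_desc + irq : NULL;   /* sketch only */
}

Caching the returned pointer in a local, as done above, also avoids recomputing the lookup on every access.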
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 6c6d35d68ee9..fac014a81b24 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -8,6 +8,7 @@
8 8
9#include <linux/irq.h> 9#include <linux/irq.h>
10#include <linux/proc_fs.h> 10#include <linux/proc_fs.h>
11#include <linux/seq_file.h>
11#include <linux/interrupt.h> 12#include <linux/interrupt.h>
12 13
13#include "internals.h" 14#include "internals.h"
@@ -16,23 +17,18 @@ static struct proc_dir_entry *root_irq_dir;
16 17
17#ifdef CONFIG_SMP 18#ifdef CONFIG_SMP
18 19
19static int irq_affinity_read_proc(char *page, char **start, off_t off, 20static int irq_affinity_proc_show(struct seq_file *m, void *v)
20 int count, int *eof, void *data)
21{ 21{
22 struct irq_desc *desc = irq_desc + (long)data; 22 struct irq_desc *desc = irq_to_desc((long)m->private);
23 cpumask_t *mask = &desc->affinity; 23 cpumask_t *mask = &desc->affinity;
24 int len;
25 24
26#ifdef CONFIG_GENERIC_PENDING_IRQ 25#ifdef CONFIG_GENERIC_PENDING_IRQ
27 if (desc->status & IRQ_MOVE_PENDING) 26 if (desc->status & IRQ_MOVE_PENDING)
28 mask = &desc->pending_mask; 27 mask = &desc->pending_mask;
29#endif 28#endif
30 len = cpumask_scnprintf(page, count, *mask); 29 seq_cpumask(m, mask);
31 30 seq_putc(m, '\n');
32 if (count - len < 2) 31 return 0;
33 return -EINVAL;
34 len += sprintf(page + len, "\n");
35 return len;
36} 32}
37 33
38#ifndef is_affinity_mask_valid 34#ifndef is_affinity_mask_valid
@@ -40,13 +36,14 @@ static int irq_affinity_read_proc(char *page, char **start, off_t off,
40#endif 36#endif
41 37
42int no_irq_affinity; 38int no_irq_affinity;
43static int irq_affinity_write_proc(struct file *file, const char __user *buffer, 39static ssize_t irq_affinity_proc_write(struct file *file,
44 unsigned long count, void *data) 40 const char __user *buffer, size_t count, loff_t *pos)
45{ 41{
46 unsigned int irq = (int)(long)data, full_count = count, err; 42 unsigned int irq = (int)(long)PDE(file->f_path.dentry->d_inode)->data;
47 cpumask_t new_value; 43 cpumask_t new_value;
44 int err;
48 45
49 if (!irq_desc[irq].chip->set_affinity || no_irq_affinity || 46 if (!irq_to_desc(irq)->chip->set_affinity || no_irq_affinity ||
50 irq_balancing_disabled(irq)) 47 irq_balancing_disabled(irq))
51 return -EIO; 48 return -EIO;
52 49
@@ -65,28 +62,38 @@ static int irq_affinity_write_proc(struct file *file, const char __user *buffer,
65 if (!cpus_intersects(new_value, cpu_online_map)) 62 if (!cpus_intersects(new_value, cpu_online_map))
66 /* Special case for empty set - allow the architecture 63 /* Special case for empty set - allow the architecture
67 code to set default SMP affinity. */ 64 code to set default SMP affinity. */
68 return irq_select_affinity(irq) ? -EINVAL : full_count; 65 return irq_select_affinity(irq) ? -EINVAL : count;
69 66
70 irq_set_affinity(irq, new_value); 67 irq_set_affinity(irq, new_value);
71 68
72 return full_count; 69 return count;
73} 70}
74 71
75static int default_affinity_read(char *page, char **start, off_t off, 72static int irq_affinity_proc_open(struct inode *inode, struct file *file)
76 int count, int *eof, void *data)
77{ 73{
78 int len = cpumask_scnprintf(page, count, irq_default_affinity); 74 return single_open(file, irq_affinity_proc_show, PDE(inode)->data);
79 if (count - len < 2)
80 return -EINVAL;
81 len += sprintf(page + len, "\n");
82 return len;
83} 75}
84 76
85static int default_affinity_write(struct file *file, const char __user *buffer, 77static const struct file_operations irq_affinity_proc_fops = {
86 unsigned long count, void *data) 78 .open = irq_affinity_proc_open,
79 .read = seq_read,
80 .llseek = seq_lseek,
81 .release = single_release,
82 .write = irq_affinity_proc_write,
83};
84
85static int default_affinity_show(struct seq_file *m, void *v)
86{
87 seq_cpumask(m, &irq_default_affinity);
88 seq_putc(m, '\n');
89 return 0;
90}
91
92static ssize_t default_affinity_write(struct file *file,
93 const char __user *buffer, size_t count, loff_t *ppos)
87{ 94{
88 unsigned int full_count = count, err;
89 cpumask_t new_value; 95 cpumask_t new_value;
96 int err;
90 97
91 err = cpumask_parse_user(buffer, count, new_value); 98 err = cpumask_parse_user(buffer, count, new_value);
92 if (err) 99 if (err)
@@ -105,27 +112,40 @@ static int default_affinity_write(struct file *file, const char __user *buffer,
105 112
106 irq_default_affinity = new_value; 113 irq_default_affinity = new_value;
107 114
108 return full_count; 115 return count;
109} 116}
117
118static int default_affinity_open(struct inode *inode, struct file *file)
119{
120 return single_open(file, default_affinity_show, NULL);
121}
122
123static const struct file_operations default_affinity_proc_fops = {
124 .open = default_affinity_open,
125 .read = seq_read,
126 .llseek = seq_lseek,
127 .release = single_release,
128 .write = default_affinity_write,
129};
110#endif 130#endif
111 131
112static int irq_spurious_read(char *page, char **start, off_t off, 132static int irq_spurious_read(char *page, char **start, off_t off,
113 int count, int *eof, void *data) 133 int count, int *eof, void *data)
114{ 134{
115 struct irq_desc *d = &irq_desc[(long) data]; 135 struct irq_desc *desc = irq_to_desc((long) data);
116 return sprintf(page, "count %u\n" 136 return sprintf(page, "count %u\n"
117 "unhandled %u\n" 137 "unhandled %u\n"
118 "last_unhandled %u ms\n", 138 "last_unhandled %u ms\n",
119 d->irq_count, 139 desc->irq_count,
120 d->irqs_unhandled, 140 desc->irqs_unhandled,
121 jiffies_to_msecs(d->last_unhandled)); 141 jiffies_to_msecs(desc->last_unhandled));
122} 142}
123 143
124#define MAX_NAMELEN 128 144#define MAX_NAMELEN 128
125 145
126static int name_unique(unsigned int irq, struct irqaction *new_action) 146static int name_unique(unsigned int irq, struct irqaction *new_action)
127{ 147{
128 struct irq_desc *desc = irq_desc + irq; 148 struct irq_desc *desc = irq_to_desc(irq);
129 struct irqaction *action; 149 struct irqaction *action;
130 unsigned long flags; 150 unsigned long flags;
131 int ret = 1; 151 int ret = 1;
@@ -145,8 +165,9 @@ static int name_unique(unsigned int irq, struct irqaction *new_action)
145void register_handler_proc(unsigned int irq, struct irqaction *action) 165void register_handler_proc(unsigned int irq, struct irqaction *action)
146{ 166{
147 char name [MAX_NAMELEN]; 167 char name [MAX_NAMELEN];
168 struct irq_desc *desc = irq_to_desc(irq);
148 169
149 if (!irq_desc[irq].dir || action->dir || !action->name || 170 if (!desc->dir || action->dir || !action->name ||
150 !name_unique(irq, action)) 171 !name_unique(irq, action))
151 return; 172 return;
152 173
@@ -154,43 +175,34 @@ void register_handler_proc(unsigned int irq, struct irqaction *action)
154 snprintf(name, MAX_NAMELEN, "%s", action->name); 175 snprintf(name, MAX_NAMELEN, "%s", action->name);
155 176
156 /* create /proc/irq/1234/handler/ */ 177 /* create /proc/irq/1234/handler/ */
157 action->dir = proc_mkdir(name, irq_desc[irq].dir); 178 action->dir = proc_mkdir(name, desc->dir);
158} 179}
159 180
160#undef MAX_NAMELEN 181#undef MAX_NAMELEN
161 182
162#define MAX_NAMELEN 10 183#define MAX_NAMELEN 10
163 184
164void register_irq_proc(unsigned int irq) 185void register_irq_proc(unsigned int irq, struct irq_desc *desc)
165{ 186{
166 char name [MAX_NAMELEN]; 187 char name [MAX_NAMELEN];
167 struct proc_dir_entry *entry; 188 struct proc_dir_entry *entry;
168 189
169 if (!root_irq_dir || 190 if (!root_irq_dir || (desc->chip == &no_irq_chip) || desc->dir)
170 (irq_desc[irq].chip == &no_irq_chip) ||
171 irq_desc[irq].dir)
172 return; 191 return;
173 192
174 memset(name, 0, MAX_NAMELEN); 193 memset(name, 0, MAX_NAMELEN);
175 sprintf(name, "%d", irq); 194 sprintf(name, "%d", irq);
176 195
177 /* create /proc/irq/1234 */ 196 /* create /proc/irq/1234 */
178 irq_desc[irq].dir = proc_mkdir(name, root_irq_dir); 197 desc->dir = proc_mkdir(name, root_irq_dir);
179 198
180#ifdef CONFIG_SMP 199#ifdef CONFIG_SMP
181 { 200 /* create /proc/irq/<irq>/smp_affinity */
182 /* create /proc/irq/<irq>/smp_affinity */ 201 proc_create_data("smp_affinity", 0600, desc->dir,
183 entry = create_proc_entry("smp_affinity", 0600, irq_desc[irq].dir); 202 &irq_affinity_proc_fops, (void *)(long)irq);
184
185 if (entry) {
186 entry->data = (void *)(long)irq;
187 entry->read_proc = irq_affinity_read_proc;
188 entry->write_proc = irq_affinity_write_proc;
189 }
190 }
191#endif 203#endif
192 204
193 entry = create_proc_entry("spurious", 0444, irq_desc[irq].dir); 205 entry = create_proc_entry("spurious", 0444, desc->dir);
194 if (entry) { 206 if (entry) {
195 entry->data = (void *)(long)irq; 207 entry->data = (void *)(long)irq;
196 entry->read_proc = irq_spurious_read; 208 entry->read_proc = irq_spurious_read;
@@ -201,28 +213,25 @@ void register_irq_proc(unsigned int irq)
201 213
202void unregister_handler_proc(unsigned int irq, struct irqaction *action) 214void unregister_handler_proc(unsigned int irq, struct irqaction *action)
203{ 215{
204 if (action->dir) 216 if (action->dir) {
205 remove_proc_entry(action->dir->name, irq_desc[irq].dir); 217 struct irq_desc *desc = irq_to_desc(irq);
218
219 remove_proc_entry(action->dir->name, desc->dir);
220 }
206} 221}
207 222
208void register_default_affinity_proc(void) 223void register_default_affinity_proc(void)
209{ 224{
210#ifdef CONFIG_SMP 225#ifdef CONFIG_SMP
211 struct proc_dir_entry *entry; 226 proc_create("irq/default_smp_affinity", 0600, NULL,
212 227 &default_affinity_proc_fops);
213 /* create /proc/irq/default_smp_affinity */
214 entry = create_proc_entry("default_smp_affinity", 0600, root_irq_dir);
215 if (entry) {
216 entry->data = NULL;
217 entry->read_proc = default_affinity_read;
218 entry->write_proc = default_affinity_write;
219 }
220#endif 228#endif
221} 229}
222 230
223void init_irq_proc(void) 231void init_irq_proc(void)
224{ 232{
225 int i; 233 unsigned int irq;
234 struct irq_desc *desc;
226 235
227 /* create /proc/irq */ 236 /* create /proc/irq */
228 root_irq_dir = proc_mkdir("irq", NULL); 237 root_irq_dir = proc_mkdir("irq", NULL);
@@ -234,7 +243,7 @@ void init_irq_proc(void)
234 /* 243 /*
235 * Create entries for all existing IRQs. 244 * Create entries for all existing IRQs.
236 */ 245 */
237 for (i = 0; i < NR_IRQS; i++) 246 for_each_irq_desc(irq, desc)
238 register_irq_proc(i); 247 register_irq_proc(irq, desc);
239} 248}
240 249
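Note: the /proc/irq affinity files are converted here from the old read_proc/write_proc callbacks to the seq_file interface registered through proc_create_data(). A self-contained sketch of the same pattern; the demo_* names are illustrative, not part of this patch:

#include <linux/proc_fs.h>
#include <linux/seq_file.h>

static int demo_show(struct seq_file *m, void *v)
{
        seq_printf(m, "key %ld\n", (long)m->private);   /* m->private is the data pointer */
        return 0;
}

static int demo_open(struct inode *inode, struct file *file)
{
        return single_open(file, demo_show, PDE(inode)->data);
}

static const struct file_operations demo_proc_fops = {
        .open    = demo_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = single_release,
};

/* registration, e.g. from an __init function:
 *   proc_create_data("demo", 0444, NULL, &demo_proc_fops, (void *)(long)42);
 */

Compared with read_proc, the seq_file path handles buffer sizing and partial reads for the caller, which is why the cpumask_scnprintf() length checks above could simply be dropped.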
diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c
index a8046791ba2d..89c7117acf2b 100644
--- a/kernel/irq/resend.c
+++ b/kernel/irq/resend.c
@@ -33,10 +33,10 @@ static void resend_irqs(unsigned long arg)
33 struct irq_desc *desc; 33 struct irq_desc *desc;
34 int irq; 34 int irq;
35 35
36 while (!bitmap_empty(irqs_resend, NR_IRQS)) { 36 while (!bitmap_empty(irqs_resend, nr_irqs)) {
37 irq = find_first_bit(irqs_resend, NR_IRQS); 37 irq = find_first_bit(irqs_resend, nr_irqs);
38 clear_bit(irq, irqs_resend); 38 clear_bit(irq, irqs_resend);
39 desc = irq_desc + irq; 39 desc = irq_to_desc(irq);
40 local_irq_disable(); 40 local_irq_disable();
41 desc->handle_irq(irq, desc); 41 desc->handle_irq(irq, desc);
42 local_irq_enable(); 42 local_irq_enable();
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index c66d3f10e853..dd364c11e56e 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -12,83 +12,122 @@
12#include <linux/kallsyms.h> 12#include <linux/kallsyms.h>
13#include <linux/interrupt.h> 13#include <linux/interrupt.h>
14#include <linux/moduleparam.h> 14#include <linux/moduleparam.h>
15#include <linux/timer.h>
15 16
16static int irqfixup __read_mostly; 17static int irqfixup __read_mostly;
17 18
19#define POLL_SPURIOUS_IRQ_INTERVAL (HZ/10)
20static void poll_spurious_irqs(unsigned long dummy);
21static DEFINE_TIMER(poll_spurious_irq_timer, poll_spurious_irqs, 0, 0);
22
18/* 23/*
19 * Recovery handler for misrouted interrupts. 24 * Recovery handler for misrouted interrupts.
20 */ 25 */
21static int misrouted_irq(int irq) 26static int try_one_irq(int irq, struct irq_desc *desc)
22{ 27{
23 int i; 28 struct irqaction *action;
24 int ok = 0; 29 int ok = 0, work = 0;
25 int work = 0; /* Did we do work for a real IRQ */
26
27 for (i = 1; i < NR_IRQS; i++) {
28 struct irq_desc *desc = irq_desc + i;
29 struct irqaction *action;
30
31 if (i == irq) /* Already tried */
32 continue;
33 30
34 spin_lock(&desc->lock); 31 spin_lock(&desc->lock);
35 /* Already running on another processor */ 32 /* Already running on another processor */
36 if (desc->status & IRQ_INPROGRESS) { 33 if (desc->status & IRQ_INPROGRESS) {
37 /* 34 /*
 38 * Already running: If it is shared, get the other 35 * Already running: If it is shared, get the other
39 * CPU to go looking for our mystery interrupt too 36 * CPU to go looking for our mystery interrupt too
40 */ 37 */
41 if (desc->action && (desc->action->flags & IRQF_SHARED)) 38 if (desc->action && (desc->action->flags & IRQF_SHARED))
42 desc->status |= IRQ_PENDING; 39 desc->status |= IRQ_PENDING;
43 spin_unlock(&desc->lock);
44 continue;
45 }
46 /* Honour the normal IRQ locking */
47 desc->status |= IRQ_INPROGRESS;
48 action = desc->action;
49 spin_unlock(&desc->lock); 40 spin_unlock(&desc->lock);
41 return ok;
42 }
43 /* Honour the normal IRQ locking */
44 desc->status |= IRQ_INPROGRESS;
45 action = desc->action;
46 spin_unlock(&desc->lock);
50 47
51 while (action) { 48 while (action) {
52 /* Only shared IRQ handlers are safe to call */ 49 /* Only shared IRQ handlers are safe to call */
53 if (action->flags & IRQF_SHARED) { 50 if (action->flags & IRQF_SHARED) {
54 if (action->handler(i, action->dev_id) == 51 if (action->handler(irq, action->dev_id) ==
55 IRQ_HANDLED) 52 IRQ_HANDLED)
56 ok = 1; 53 ok = 1;
57 }
58 action = action->next;
59 } 54 }
60 local_irq_disable(); 55 action = action->next;
61 /* Now clean up the flags */ 56 }
62 spin_lock(&desc->lock); 57 local_irq_disable();
63 action = desc->action; 58 /* Now clean up the flags */
59 spin_lock(&desc->lock);
60 action = desc->action;
64 61
62 /*
63 * While we were looking for a fixup someone queued a real
64 * IRQ clashing with our walk:
65 */
66 while ((desc->status & IRQ_PENDING) && action) {
65 /* 67 /*
66 * While we were looking for a fixup someone queued a real 68 * Perform real IRQ processing for the IRQ we deferred
67 * IRQ clashing with our walk:
68 */
69 while ((desc->status & IRQ_PENDING) && action) {
70 /*
71 * Perform real IRQ processing for the IRQ we deferred
72 */
73 work = 1;
74 spin_unlock(&desc->lock);
75 handle_IRQ_event(i, action);
76 spin_lock(&desc->lock);
77 desc->status &= ~IRQ_PENDING;
78 }
79 desc->status &= ~IRQ_INPROGRESS;
80 /*
81 * If we did actual work for the real IRQ line we must let the
82 * IRQ controller clean up too
83 */ 69 */
84 if (work && desc->chip && desc->chip->end) 70 work = 1;
85 desc->chip->end(i);
86 spin_unlock(&desc->lock); 71 spin_unlock(&desc->lock);
72 handle_IRQ_event(irq, action);
73 spin_lock(&desc->lock);
74 desc->status &= ~IRQ_PENDING;
75 }
76 desc->status &= ~IRQ_INPROGRESS;
77 /*
78 * If we did actual work for the real IRQ line we must let the
79 * IRQ controller clean up too
80 */
81 if (work && desc->chip && desc->chip->end)
82 desc->chip->end(irq);
83 spin_unlock(&desc->lock);
84
85 return ok;
86}
87
88static int misrouted_irq(int irq)
89{
90 struct irq_desc *desc;
91 int i, ok = 0;
92
93 for_each_irq_desc(i, desc) {
94 if (!i)
95 continue;
96
97 if (i == irq) /* Already tried */
98 continue;
99
100 if (try_one_irq(i, desc))
101 ok = 1;
87 } 102 }
88 /* So the caller can adjust the irq error counts */ 103 /* So the caller can adjust the irq error counts */
89 return ok; 104 return ok;
90} 105}
91 106
107static void poll_spurious_irqs(unsigned long dummy)
108{
109 struct irq_desc *desc;
110 int i;
111
112 for_each_irq_desc(i, desc) {
113 unsigned int status;
114
115 if (!i)
116 continue;
117
118 /* Racy but it doesn't matter */
119 status = desc->status;
120 barrier();
121 if (!(status & IRQ_SPURIOUS_DISABLED))
122 continue;
123
124 try_one_irq(i, desc);
125 }
126
127 mod_timer(&poll_spurious_irq_timer,
128 jiffies + POLL_SPURIOUS_IRQ_INTERVAL);
129}
130
92/* 131/*
93 * If 99,900 of the previous 100,000 interrupts have not been handled 132 * If 99,900 of the previous 100,000 interrupts have not been handled
94 * then assume that the IRQ is stuck in some manner. Drop a diagnostic 133 * then assume that the IRQ is stuck in some manner. Drop a diagnostic
@@ -137,7 +176,9 @@ report_bad_irq(unsigned int irq, struct irq_desc *desc, irqreturn_t action_ret)
137 } 176 }
138} 177}
139 178
140static inline int try_misrouted_irq(unsigned int irq, struct irq_desc *desc, irqreturn_t action_ret) 179static inline int
180try_misrouted_irq(unsigned int irq, struct irq_desc *desc,
181 irqreturn_t action_ret)
141{ 182{
142 struct irqaction *action; 183 struct irqaction *action;
143 184
@@ -212,6 +253,9 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc,
212 desc->status |= IRQ_DISABLED | IRQ_SPURIOUS_DISABLED; 253 desc->status |= IRQ_DISABLED | IRQ_SPURIOUS_DISABLED;
213 desc->depth++; 254 desc->depth++;
214 desc->chip->disable(irq); 255 desc->chip->disable(irq);
256
257 mod_timer(&poll_spurious_irq_timer,
258 jiffies + POLL_SPURIOUS_IRQ_INTERVAL);
215 } 259 }
216 desc->irqs_unhandled = 0; 260 desc->irqs_unhandled = 0;
217} 261}
@@ -241,7 +285,7 @@ static int __init irqfixup_setup(char *str)
241 285
242__setup("irqfixup", irqfixup_setup); 286__setup("irqfixup", irqfixup_setup);
243module_param(irqfixup, int, 0644); 287module_param(irqfixup, int, 0644);
244MODULE_PARM_DESC("irqfixup", "0: No fixup, 1: irqfixup mode 2: irqpoll mode"); 288MODULE_PARM_DESC("irqfixup", "0: No fixup, 1: irqfixup mode, 2: irqpoll mode");
245 289
246static int __init irqpoll_setup(char *str) 290static int __init irqpoll_setup(char *str)
247{ 291{
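Note: note_interrupt() now re-arms poll_spurious_irq_timer whenever it disables a line as spurious, so try_one_irq() keeps polling the registered handlers roughly every HZ/10 ticks instead of leaving the device dead. The timer pattern used here, reduced to a sketch (demo_* names are illustrative):

#include <linux/timer.h>
#include <linux/jiffies.h>

static void demo_poll(unsigned long data);
static DEFINE_TIMER(demo_timer, demo_poll, 0, 0);

static void demo_poll(unsigned long data)
{
        /* ... periodic work ... */
        mod_timer(&demo_timer, jiffies + HZ / 10);      /* re-arm ~100ms later */
}

/* the first mod_timer() call, made by whoever detects the condition, starts the cycle */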
diff --git a/kernel/itimer.c b/kernel/itimer.c
index ab982747d9bd..db7c358b9a02 100644
--- a/kernel/itimer.c
+++ b/kernel/itimer.c
@@ -55,17 +55,15 @@ int do_getitimer(int which, struct itimerval *value)
55 spin_unlock_irq(&tsk->sighand->siglock); 55 spin_unlock_irq(&tsk->sighand->siglock);
56 break; 56 break;
57 case ITIMER_VIRTUAL: 57 case ITIMER_VIRTUAL:
58 read_lock(&tasklist_lock);
59 spin_lock_irq(&tsk->sighand->siglock); 58 spin_lock_irq(&tsk->sighand->siglock);
60 cval = tsk->signal->it_virt_expires; 59 cval = tsk->signal->it_virt_expires;
61 cinterval = tsk->signal->it_virt_incr; 60 cinterval = tsk->signal->it_virt_incr;
62 if (!cputime_eq(cval, cputime_zero)) { 61 if (!cputime_eq(cval, cputime_zero)) {
63 struct task_struct *t = tsk; 62 struct task_cputime cputime;
64 cputime_t utime = tsk->signal->utime; 63 cputime_t utime;
65 do { 64
66 utime = cputime_add(utime, t->utime); 65 thread_group_cputime(tsk, &cputime);
67 t = next_thread(t); 66 utime = cputime.utime;
68 } while (t != tsk);
69 if (cputime_le(cval, utime)) { /* about to fire */ 67 if (cputime_le(cval, utime)) { /* about to fire */
70 cval = jiffies_to_cputime(1); 68 cval = jiffies_to_cputime(1);
71 } else { 69 } else {
@@ -73,25 +71,19 @@ int do_getitimer(int which, struct itimerval *value)
73 } 71 }
74 } 72 }
75 spin_unlock_irq(&tsk->sighand->siglock); 73 spin_unlock_irq(&tsk->sighand->siglock);
76 read_unlock(&tasklist_lock);
77 cputime_to_timeval(cval, &value->it_value); 74 cputime_to_timeval(cval, &value->it_value);
78 cputime_to_timeval(cinterval, &value->it_interval); 75 cputime_to_timeval(cinterval, &value->it_interval);
79 break; 76 break;
80 case ITIMER_PROF: 77 case ITIMER_PROF:
81 read_lock(&tasklist_lock);
82 spin_lock_irq(&tsk->sighand->siglock); 78 spin_lock_irq(&tsk->sighand->siglock);
83 cval = tsk->signal->it_prof_expires; 79 cval = tsk->signal->it_prof_expires;
84 cinterval = tsk->signal->it_prof_incr; 80 cinterval = tsk->signal->it_prof_incr;
85 if (!cputime_eq(cval, cputime_zero)) { 81 if (!cputime_eq(cval, cputime_zero)) {
86 struct task_struct *t = tsk; 82 struct task_cputime times;
87 cputime_t ptime = cputime_add(tsk->signal->utime, 83 cputime_t ptime;
88 tsk->signal->stime); 84
89 do { 85 thread_group_cputime(tsk, &times);
90 ptime = cputime_add(ptime, 86 ptime = cputime_add(times.utime, times.stime);
91 cputime_add(t->utime,
92 t->stime));
93 t = next_thread(t);
94 } while (t != tsk);
95 if (cputime_le(cval, ptime)) { /* about to fire */ 87 if (cputime_le(cval, ptime)) { /* about to fire */
96 cval = jiffies_to_cputime(1); 88 cval = jiffies_to_cputime(1);
97 } else { 89 } else {
@@ -99,7 +91,6 @@ int do_getitimer(int which, struct itimerval *value)
99 } 91 }
100 } 92 }
101 spin_unlock_irq(&tsk->sighand->siglock); 93 spin_unlock_irq(&tsk->sighand->siglock);
102 read_unlock(&tasklist_lock);
103 cputime_to_timeval(cval, &value->it_value); 94 cputime_to_timeval(cval, &value->it_value);
104 cputime_to_timeval(cinterval, &value->it_interval); 95 cputime_to_timeval(cinterval, &value->it_interval);
105 break; 96 break;
@@ -185,7 +176,6 @@ again:
185 case ITIMER_VIRTUAL: 176 case ITIMER_VIRTUAL:
186 nval = timeval_to_cputime(&value->it_value); 177 nval = timeval_to_cputime(&value->it_value);
187 ninterval = timeval_to_cputime(&value->it_interval); 178 ninterval = timeval_to_cputime(&value->it_interval);
188 read_lock(&tasklist_lock);
189 spin_lock_irq(&tsk->sighand->siglock); 179 spin_lock_irq(&tsk->sighand->siglock);
190 cval = tsk->signal->it_virt_expires; 180 cval = tsk->signal->it_virt_expires;
191 cinterval = tsk->signal->it_virt_incr; 181 cinterval = tsk->signal->it_virt_incr;
@@ -200,7 +190,6 @@ again:
200 tsk->signal->it_virt_expires = nval; 190 tsk->signal->it_virt_expires = nval;
201 tsk->signal->it_virt_incr = ninterval; 191 tsk->signal->it_virt_incr = ninterval;
202 spin_unlock_irq(&tsk->sighand->siglock); 192 spin_unlock_irq(&tsk->sighand->siglock);
203 read_unlock(&tasklist_lock);
204 if (ovalue) { 193 if (ovalue) {
205 cputime_to_timeval(cval, &ovalue->it_value); 194 cputime_to_timeval(cval, &ovalue->it_value);
206 cputime_to_timeval(cinterval, &ovalue->it_interval); 195 cputime_to_timeval(cinterval, &ovalue->it_interval);
@@ -209,7 +198,6 @@ again:
209 case ITIMER_PROF: 198 case ITIMER_PROF:
210 nval = timeval_to_cputime(&value->it_value); 199 nval = timeval_to_cputime(&value->it_value);
211 ninterval = timeval_to_cputime(&value->it_interval); 200 ninterval = timeval_to_cputime(&value->it_interval);
212 read_lock(&tasklist_lock);
213 spin_lock_irq(&tsk->sighand->siglock); 201 spin_lock_irq(&tsk->sighand->siglock);
214 cval = tsk->signal->it_prof_expires; 202 cval = tsk->signal->it_prof_expires;
215 cinterval = tsk->signal->it_prof_incr; 203 cinterval = tsk->signal->it_prof_incr;
@@ -224,7 +212,6 @@ again:
224 tsk->signal->it_prof_expires = nval; 212 tsk->signal->it_prof_expires = nval;
225 tsk->signal->it_prof_incr = ninterval; 213 tsk->signal->it_prof_incr = ninterval;
226 spin_unlock_irq(&tsk->sighand->siglock); 214 spin_unlock_irq(&tsk->sighand->siglock);
227 read_unlock(&tasklist_lock);
228 if (ovalue) { 215 if (ovalue) {
229 cputime_to_timeval(cval, &ovalue->it_value); 216 cputime_to_timeval(cval, &ovalue->it_value);
230 cputime_to_timeval(cinterval, &ovalue->it_interval); 217 cputime_to_timeval(cinterval, &ovalue->it_interval);
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 6fc0040f3e3a..5072cf1685a2 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -176,7 +176,7 @@ static unsigned long get_symbol_pos(unsigned long addr,
176 high = kallsyms_num_syms; 176 high = kallsyms_num_syms;
177 177
178 while (high - low > 1) { 178 while (high - low > 1) {
179 mid = (low + high) / 2; 179 mid = low + (high - low) / 2;
180 if (kallsyms_addresses[mid] <= addr) 180 if (kallsyms_addresses[mid] <= addr)
181 low = mid; 181 low = mid;
182 else 182 else
@@ -260,7 +260,6 @@ const char *kallsyms_lookup(unsigned long addr,
260 /* see if it's in a module */ 260 /* see if it's in a module */
261 return module_address_lookup(addr, symbolsize, offset, modname, 261 return module_address_lookup(addr, symbolsize, offset, modname,
262 namebuf); 262 namebuf);
263 return NULL;
264} 263}
265 264
266int lookup_symbol_name(unsigned long addr, char *symname) 265int lookup_symbol_name(unsigned long addr, char *symname)
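Note: the midpoint change in get_symbol_pos() is the classic overflow-safe form: low + (high - low) / 2 cannot wrap even when low + high would exceed the range of the index type. A small self-contained example of the same search shape:

#include <stddef.h>

static size_t bsearch_floor(const unsigned long *tab, size_t n, unsigned long key)
{
        size_t low = 0, high = n;

        while (high - low > 1) {
                size_t mid = low + (high - low) / 2;    /* safe; (low + high) / 2 may overflow */

                if (tab[mid] <= key)
                        low = mid;
                else
                        high = mid;
        }
        return low;     /* index of the largest entry <= key, assuming tab is sorted */
}

With kallsyms_num_syms bounded far below the type's maximum the overflow is unlikely in practice, but the safe form costs nothing and removes the assumption.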
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 1c5fcacbcf33..ac0fde7b54d0 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -12,7 +12,7 @@
12#include <linux/slab.h> 12#include <linux/slab.h>
13#include <linux/fs.h> 13#include <linux/fs.h>
14#include <linux/kexec.h> 14#include <linux/kexec.h>
15#include <linux/spinlock.h> 15#include <linux/mutex.h>
16#include <linux/list.h> 16#include <linux/list.h>
17#include <linux/highmem.h> 17#include <linux/highmem.h>
18#include <linux/syscalls.h> 18#include <linux/syscalls.h>
@@ -24,6 +24,13 @@
24#include <linux/utsrelease.h> 24#include <linux/utsrelease.h>
25#include <linux/utsname.h> 25#include <linux/utsname.h>
26#include <linux/numa.h> 26#include <linux/numa.h>
27#include <linux/suspend.h>
28#include <linux/device.h>
29#include <linux/freezer.h>
30#include <linux/pm.h>
31#include <linux/cpu.h>
32#include <linux/console.h>
33#include <linux/vmalloc.h>
27 34
28#include <asm/page.h> 35#include <asm/page.h>
29#include <asm/uaccess.h> 36#include <asm/uaccess.h>
@@ -71,7 +78,7 @@ int kexec_should_crash(struct task_struct *p)
71 * 78 *
72 * The code for the transition from the current kernel to the 79 * The code for the transition from the current kernel to the
 73 * new kernel is placed in the control_code_buffer, whose size 80 * new kernel is placed in the control_code_buffer, whose size
74 * is given by KEXEC_CONTROL_CODE_SIZE. In the best case only a single 81 * is given by KEXEC_CONTROL_PAGE_SIZE. In the best case only a single
75 * page of memory is necessary, but some architectures require more. 82 * page of memory is necessary, but some architectures require more.
76 * Because this memory must be identity mapped in the transition from 83 * Because this memory must be identity mapped in the transition from
77 * virtual to physical addresses it must live in the range 84 * virtual to physical addresses it must live in the range
@@ -236,12 +243,18 @@ static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry,
236 */ 243 */
237 result = -ENOMEM; 244 result = -ENOMEM;
238 image->control_code_page = kimage_alloc_control_pages(image, 245 image->control_code_page = kimage_alloc_control_pages(image,
239 get_order(KEXEC_CONTROL_CODE_SIZE)); 246 get_order(KEXEC_CONTROL_PAGE_SIZE));
240 if (!image->control_code_page) { 247 if (!image->control_code_page) {
241 printk(KERN_ERR "Could not allocate control_code_buffer\n"); 248 printk(KERN_ERR "Could not allocate control_code_buffer\n");
242 goto out; 249 goto out;
243 } 250 }
244 251
252 image->swap_page = kimage_alloc_control_pages(image, 0);
253 if (!image->swap_page) {
254 printk(KERN_ERR "Could not allocate swap buffer\n");
255 goto out;
256 }
257
245 result = 0; 258 result = 0;
246 out: 259 out:
247 if (result == 0) 260 if (result == 0)
@@ -305,7 +318,7 @@ static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry,
305 */ 318 */
306 result = -ENOMEM; 319 result = -ENOMEM;
307 image->control_code_page = kimage_alloc_control_pages(image, 320 image->control_code_page = kimage_alloc_control_pages(image,
308 get_order(KEXEC_CONTROL_CODE_SIZE)); 321 get_order(KEXEC_CONTROL_PAGE_SIZE));
309 if (!image->control_code_page) { 322 if (!image->control_code_page) {
310 printk(KERN_ERR "Could not allocate control_code_buffer\n"); 323 printk(KERN_ERR "Could not allocate control_code_buffer\n");
311 goto out; 324 goto out;
@@ -589,14 +602,12 @@ static void kimage_free_extra_pages(struct kimage *image)
589 kimage_free_page_list(&image->unuseable_pages); 602 kimage_free_page_list(&image->unuseable_pages);
590 603
591} 604}
592static int kimage_terminate(struct kimage *image) 605static void kimage_terminate(struct kimage *image)
593{ 606{
594 if (*image->entry != 0) 607 if (*image->entry != 0)
595 image->entry++; 608 image->entry++;
596 609
597 *image->entry = IND_DONE; 610 *image->entry = IND_DONE;
598
599 return 0;
600} 611}
601 612
602#define for_each_kimage_entry(image, ptr, entry) \ 613#define for_each_kimage_entry(image, ptr, entry) \
@@ -743,8 +754,14 @@ static struct page *kimage_alloc_page(struct kimage *image,
743 *old = addr | (*old & ~PAGE_MASK); 754 *old = addr | (*old & ~PAGE_MASK);
744 755
745 /* The old page I have found cannot be a 756 /* The old page I have found cannot be a
 746 * destination page, so return it. 757 * destination page, so return it if its
758 * gfp_flags honor the ones passed in.
747 */ 759 */
760 if (!(gfp_mask & __GFP_HIGHMEM) &&
761 PageHighMem(old_page)) {
762 kimage_free_pages(old_page);
763 continue;
764 }
748 addr = old_addr; 765 addr = old_addr;
749 page = old_page; 766 page = old_page;
750 break; 767 break;
@@ -914,19 +931,14 @@ static int kimage_load_segment(struct kimage *image,
914 */ 931 */
915struct kimage *kexec_image; 932struct kimage *kexec_image;
916struct kimage *kexec_crash_image; 933struct kimage *kexec_crash_image;
917/* 934
918 * A home grown binary mutex. 935static DEFINE_MUTEX(kexec_mutex);
919 * Nothing can wait so this mutex is safe to use
920 * in interrupt context :)
921 */
922static int kexec_lock;
923 936
924asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments, 937asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments,
925 struct kexec_segment __user *segments, 938 struct kexec_segment __user *segments,
926 unsigned long flags) 939 unsigned long flags)
927{ 940{
928 struct kimage **dest_image, *image; 941 struct kimage **dest_image, *image;
929 int locked;
930 int result; 942 int result;
931 943
932 /* We only trust the superuser with rebooting the system. */ 944 /* We only trust the superuser with rebooting the system. */
@@ -962,8 +974,7 @@ asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments,
962 * 974 *
963 * KISS: always take the mutex. 975 * KISS: always take the mutex.
964 */ 976 */
965 locked = xchg(&kexec_lock, 1); 977 if (!mutex_trylock(&kexec_mutex))
966 if (locked)
967 return -EBUSY; 978 return -EBUSY;
968 979
969 dest_image = &kexec_image; 980 dest_image = &kexec_image;
@@ -988,6 +999,8 @@ asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments,
988 if (result) 999 if (result)
989 goto out; 1000 goto out;
990 1001
1002 if (flags & KEXEC_PRESERVE_CONTEXT)
1003 image->preserve_context = 1;
991 result = machine_kexec_prepare(image); 1004 result = machine_kexec_prepare(image);
992 if (result) 1005 if (result)
993 goto out; 1006 goto out;
@@ -997,16 +1010,13 @@ asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments,
997 if (result) 1010 if (result)
998 goto out; 1011 goto out;
999 } 1012 }
1000 result = kimage_terminate(image); 1013 kimage_terminate(image);
1001 if (result)
1002 goto out;
1003 } 1014 }
1004 /* Install the new kernel, and Uninstall the old */ 1015 /* Install the new kernel, and Uninstall the old */
1005 image = xchg(dest_image, image); 1016 image = xchg(dest_image, image);
1006 1017
1007out: 1018out:
1008 locked = xchg(&kexec_lock, 0); /* Release the mutex */ 1019 mutex_unlock(&kexec_mutex);
1009 BUG_ON(!locked);
1010 kimage_free(image); 1020 kimage_free(image);
1011 1021
1012 return result; 1022 return result;
@@ -1053,10 +1063,7 @@ asmlinkage long compat_sys_kexec_load(unsigned long entry,
1053 1063
1054void crash_kexec(struct pt_regs *regs) 1064void crash_kexec(struct pt_regs *regs)
1055{ 1065{
1056 int locked; 1066 /* Take the kexec_mutex here to prevent sys_kexec_load
1057
1058
1059 /* Take the kexec_lock here to prevent sys_kexec_load
1060 * running on one cpu from replacing the crash kernel 1067 * running on one cpu from replacing the crash kernel
1061 * we are using after a panic on a different cpu. 1068 * we are using after a panic on a different cpu.
1062 * 1069 *
@@ -1064,8 +1071,7 @@ void crash_kexec(struct pt_regs *regs)
1064 * of memory the xchg(&kexec_crash_image) would be 1071 * of memory the xchg(&kexec_crash_image) would be
1065 * sufficient. But since I reuse the memory... 1072 * sufficient. But since I reuse the memory...
1066 */ 1073 */
1067 locked = xchg(&kexec_lock, 1); 1074 if (mutex_trylock(&kexec_mutex)) {
1068 if (!locked) {
1069 if (kexec_crash_image) { 1075 if (kexec_crash_image) {
1070 struct pt_regs fixed_regs; 1076 struct pt_regs fixed_regs;
1071 crash_setup_regs(&fixed_regs, regs); 1077 crash_setup_regs(&fixed_regs, regs);
@@ -1073,8 +1079,7 @@ void crash_kexec(struct pt_regs *regs)
1073 machine_crash_shutdown(&fixed_regs); 1079 machine_crash_shutdown(&fixed_regs);
1074 machine_kexec(kexec_crash_image); 1080 machine_kexec(kexec_crash_image);
1075 } 1081 }
1076 locked = xchg(&kexec_lock, 0); 1082 mutex_unlock(&kexec_mutex);
1077 BUG_ON(!locked);
1078 } 1083 }
1079} 1084}
1080 1085
@@ -1367,6 +1372,7 @@ static int __init crash_save_vmcoreinfo_init(void)
1367 VMCOREINFO_SYMBOL(node_online_map); 1372 VMCOREINFO_SYMBOL(node_online_map);
1368 VMCOREINFO_SYMBOL(swapper_pg_dir); 1373 VMCOREINFO_SYMBOL(swapper_pg_dir);
1369 VMCOREINFO_SYMBOL(_stext); 1374 VMCOREINFO_SYMBOL(_stext);
1375 VMCOREINFO_SYMBOL(vmlist);
1370 1376
1371#ifndef CONFIG_NEED_MULTIPLE_NODES 1377#ifndef CONFIG_NEED_MULTIPLE_NODES
1372 VMCOREINFO_SYMBOL(mem_map); 1378 VMCOREINFO_SYMBOL(mem_map);
@@ -1402,6 +1408,7 @@ static int __init crash_save_vmcoreinfo_init(void)
1402 VMCOREINFO_OFFSET(free_area, free_list); 1408 VMCOREINFO_OFFSET(free_area, free_list);
1403 VMCOREINFO_OFFSET(list_head, next); 1409 VMCOREINFO_OFFSET(list_head, next);
1404 VMCOREINFO_OFFSET(list_head, prev); 1410 VMCOREINFO_OFFSET(list_head, prev);
1411 VMCOREINFO_OFFSET(vm_struct, addr);
1405 VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER); 1412 VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER);
1406 VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES); 1413 VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES);
1407 VMCOREINFO_NUMBER(NR_FREE_PAGES); 1414 VMCOREINFO_NUMBER(NR_FREE_PAGES);
@@ -1415,3 +1422,79 @@ static int __init crash_save_vmcoreinfo_init(void)
1415} 1422}
1416 1423
1417module_init(crash_save_vmcoreinfo_init) 1424module_init(crash_save_vmcoreinfo_init)
1425
1426/*
1427 * Move into place and start executing a preloaded standalone
1428 * executable. If nothing was preloaded return an error.
1429 */
1430int kernel_kexec(void)
1431{
1432 int error = 0;
1433
1434 if (!mutex_trylock(&kexec_mutex))
1435 return -EBUSY;
1436 if (!kexec_image) {
1437 error = -EINVAL;
1438 goto Unlock;
1439 }
1440
1441#ifdef CONFIG_KEXEC_JUMP
1442 if (kexec_image->preserve_context) {
1443 mutex_lock(&pm_mutex);
1444 pm_prepare_console();
1445 error = freeze_processes();
1446 if (error) {
1447 error = -EBUSY;
1448 goto Restore_console;
1449 }
1450 suspend_console();
1451 error = device_suspend(PMSG_FREEZE);
1452 if (error)
1453 goto Resume_console;
1454 error = disable_nonboot_cpus();
1455 if (error)
1456 goto Resume_devices;
1457 device_pm_lock();
1458 local_irq_disable();
1459 /* At this point, device_suspend() has been called,
 1460 * but *not* device_power_down(). We *must* call
1461 * device_power_down() now. Otherwise, drivers for
1462 * some devices (e.g. interrupt controllers) become
1463 * desynchronized with the actual state of the
1464 * hardware at resume time, and evil weirdness ensues.
1465 */
1466 error = device_power_down(PMSG_FREEZE);
1467 if (error)
1468 goto Enable_irqs;
1469 } else
1470#endif
1471 {
1472 kernel_restart_prepare(NULL);
1473 printk(KERN_EMERG "Starting new kernel\n");
1474 machine_shutdown();
1475 }
1476
1477 machine_kexec(kexec_image);
1478
1479#ifdef CONFIG_KEXEC_JUMP
1480 if (kexec_image->preserve_context) {
1481 device_power_up(PMSG_RESTORE);
1482 Enable_irqs:
1483 local_irq_enable();
1484 device_pm_unlock();
1485 enable_nonboot_cpus();
1486 Resume_devices:
1487 device_resume(PMSG_RESTORE);
1488 Resume_console:
1489 resume_console();
1490 thaw_processes();
1491 Restore_console:
1492 pm_restore_console();
1493 mutex_unlock(&pm_mutex);
1494 }
1495#endif
1496
1497 Unlock:
1498 mutex_unlock(&kexec_mutex);
1499 return error;
1500}
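Note: the open-coded xchg()-based kexec_lock is replaced by a real mutex taken with mutex_trylock(), which keeps the old non-blocking behaviour (callers back off with -EBUSY rather than sleeping) while picking up the usual mutex debugging. The pattern, reduced to a sketch with demo names:

#include <linux/mutex.h>

static DEFINE_MUTEX(demo_mutex);

static int demo_enter(void)
{
        if (!mutex_trylock(&demo_mutex))        /* held elsewhere: do not wait */
                return -EBUSY;

        /* ... critical section ... */

        mutex_unlock(&demo_mutex);
        return 0;
}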
diff --git a/kernel/kgdb.c b/kernel/kgdb.c
index 3ec23c3ec97f..e4dcfb2272a4 100644
--- a/kernel/kgdb.c
+++ b/kernel/kgdb.c
@@ -56,12 +56,14 @@
56 56
57static int kgdb_break_asap; 57static int kgdb_break_asap;
58 58
59#define KGDB_MAX_THREAD_QUERY 17
59struct kgdb_state { 60struct kgdb_state {
60 int ex_vector; 61 int ex_vector;
61 int signo; 62 int signo;
62 int err_code; 63 int err_code;
63 int cpu; 64 int cpu;
64 int pass_exception; 65 int pass_exception;
66 unsigned long thr_query;
65 unsigned long threadid; 67 unsigned long threadid;
66 long kgdb_usethreadid; 68 long kgdb_usethreadid;
67 struct pt_regs *linux_regs; 69 struct pt_regs *linux_regs;
@@ -166,13 +168,6 @@ early_param("nokgdbroundup", opt_nokgdbroundup);
166 * Weak aliases for breakpoint management, 168 * Weak aliases for breakpoint management,
 167 * can be overridden by architectures when needed: 169 * can be overridden by architectures when needed:
168 */ 170 */
169int __weak kgdb_validate_break_address(unsigned long addr)
170{
171 char tmp_variable[BREAK_INSTR_SIZE];
172
173 return probe_kernel_read(tmp_variable, (char *)addr, BREAK_INSTR_SIZE);
174}
175
176int __weak kgdb_arch_set_breakpoint(unsigned long addr, char *saved_instr) 171int __weak kgdb_arch_set_breakpoint(unsigned long addr, char *saved_instr)
177{ 172{
178 int err; 173 int err;
@@ -191,6 +186,25 @@ int __weak kgdb_arch_remove_breakpoint(unsigned long addr, char *bundle)
191 (char *)bundle, BREAK_INSTR_SIZE); 186 (char *)bundle, BREAK_INSTR_SIZE);
192} 187}
193 188
189int __weak kgdb_validate_break_address(unsigned long addr)
190{
191 char tmp_variable[BREAK_INSTR_SIZE];
192 int err;
 193 /* Validate setting the breakpoint and then removing it. If the
 194 * remove fails, the kernel needs to emit an error message because we
 195 * are in deep trouble, not being able to put things back the way we
196 * found them.
197 */
198 err = kgdb_arch_set_breakpoint(addr, tmp_variable);
199 if (err)
200 return err;
201 err = kgdb_arch_remove_breakpoint(addr, tmp_variable);
202 if (err)
203 printk(KERN_ERR "KGDB: Critical breakpoint error, kernel "
204 "memory destroyed at: %lx", addr);
205 return err;
206}
207
194unsigned long __weak kgdb_arch_pc(int exception, struct pt_regs *regs) 208unsigned long __weak kgdb_arch_pc(int exception, struct pt_regs *regs)
195{ 209{
196 return instruction_pointer(regs); 210 return instruction_pointer(regs);
@@ -433,9 +447,14 @@ int kgdb_hex2long(char **ptr, unsigned long *long_val)
433{ 447{
434 int hex_val; 448 int hex_val;
435 int num = 0; 449 int num = 0;
450 int negate = 0;
436 451
437 *long_val = 0; 452 *long_val = 0;
438 453
454 if (**ptr == '-') {
455 negate = 1;
456 (*ptr)++;
457 }
439 while (**ptr) { 458 while (**ptr) {
440 hex_val = hex(**ptr); 459 hex_val = hex(**ptr);
441 if (hex_val < 0) 460 if (hex_val < 0)
@@ -446,6 +465,9 @@ int kgdb_hex2long(char **ptr, unsigned long *long_val)
446 (*ptr)++; 465 (*ptr)++;
447 } 466 }
448 467
468 if (negate)
469 *long_val = -*long_val;
470
449 return num; 471 return num;
450} 472}
451 473
@@ -466,7 +488,7 @@ static int write_mem_msg(int binary)
466 if (err) 488 if (err)
467 return err; 489 return err;
468 if (CACHE_FLUSH_IS_SAFE) 490 if (CACHE_FLUSH_IS_SAFE)
469 flush_icache_range(addr, addr + length + 1); 491 flush_icache_range(addr, addr + length);
470 return 0; 492 return 0;
471 } 493 }
472 494
@@ -515,10 +537,16 @@ static void int_to_threadref(unsigned char *id, int value)
515static struct task_struct *getthread(struct pt_regs *regs, int tid) 537static struct task_struct *getthread(struct pt_regs *regs, int tid)
516{ 538{
517 /* 539 /*
518 * Non-positive TIDs are remapped idle tasks: 540 * Non-positive TIDs are remapped to the cpu shadow information
519 */ 541 */
520 if (tid <= 0) 542 if (tid == 0 || tid == -1)
521 return idle_task(-tid); 543 tid = -atomic_read(&kgdb_active) - 2;
544 if (tid < 0) {
545 if (kgdb_info[-tid - 2].task)
546 return kgdb_info[-tid - 2].task;
547 else
548 return idle_task(-tid - 2);
549 }
522 550
523 /* 551 /*
524 * find_task_by_pid_ns() does not take the tasklist lock anymore 552 * find_task_by_pid_ns() does not take the tasklist lock anymore
@@ -562,6 +590,7 @@ static void kgdb_wait(struct pt_regs *regs)
562 590
563 /* Signal the primary CPU that we are done: */ 591 /* Signal the primary CPU that we are done: */
564 atomic_set(&cpu_in_kgdb[cpu], 0); 592 atomic_set(&cpu_in_kgdb[cpu], 0);
593 touch_softlockup_watchdog();
565 clocksource_touch_watchdog(); 594 clocksource_touch_watchdog();
566 local_irq_restore(flags); 595 local_irq_restore(flags);
567} 596}
@@ -725,14 +754,15 @@ setundefined:
725} 754}
726 755
727/* 756/*
728 * Remap normal tasks to their real PID, idle tasks to -1 ... -NR_CPUs: 757 * Remap normal tasks to their real PID,
758 * CPU shadow threads are mapped to -CPU - 2
729 */ 759 */
730static inline int shadow_pid(int realpid) 760static inline int shadow_pid(int realpid)
731{ 761{
732 if (realpid) 762 if (realpid)
733 return realpid; 763 return realpid;
734 764
735 return -1-raw_smp_processor_id(); 765 return -raw_smp_processor_id() - 2;
736} 766}
737 767
738static char gdbmsgbuf[BUFMAX + 1]; 768static char gdbmsgbuf[BUFMAX + 1];
@@ -826,7 +856,7 @@ static void gdb_cmd_getregs(struct kgdb_state *ks)
826 local_debuggerinfo = kgdb_info[ks->cpu].debuggerinfo; 856 local_debuggerinfo = kgdb_info[ks->cpu].debuggerinfo;
827 } else { 857 } else {
828 local_debuggerinfo = NULL; 858 local_debuggerinfo = NULL;
829 for (i = 0; i < NR_CPUS; i++) { 859 for_each_online_cpu(i) {
830 /* 860 /*
831 * Try to find the task on some other 861 * Try to find the task on some other
832 * or possibly this node if we do not 862 * or possibly this node if we do not
@@ -960,10 +990,13 @@ static int gdb_cmd_reboot(struct kgdb_state *ks)
960/* Handle the 'q' query packets */ 990/* Handle the 'q' query packets */
961static void gdb_cmd_query(struct kgdb_state *ks) 991static void gdb_cmd_query(struct kgdb_state *ks)
962{ 992{
963 struct task_struct *thread; 993 struct task_struct *g;
994 struct task_struct *p;
964 unsigned char thref[8]; 995 unsigned char thref[8];
965 char *ptr; 996 char *ptr;
966 int i; 997 int i;
998 int cpu;
999 int finished = 0;
967 1000
968 switch (remcom_in_buffer[1]) { 1001 switch (remcom_in_buffer[1]) {
969 case 's': 1002 case 's':
@@ -973,22 +1006,34 @@ static void gdb_cmd_query(struct kgdb_state *ks)
973 break; 1006 break;
974 } 1007 }
975 1008
976 if (remcom_in_buffer[1] == 'f') 1009 i = 0;
977 ks->threadid = 1;
978
979 remcom_out_buffer[0] = 'm'; 1010 remcom_out_buffer[0] = 'm';
980 ptr = remcom_out_buffer + 1; 1011 ptr = remcom_out_buffer + 1;
981 1012 if (remcom_in_buffer[1] == 'f') {
982 for (i = 0; i < 17; ks->threadid++) { 1013 /* Each cpu is a shadow thread */
983 thread = getthread(ks->linux_regs, ks->threadid); 1014 for_each_online_cpu(cpu) {
984 if (thread) { 1015 ks->thr_query = 0;
985 int_to_threadref(thref, ks->threadid); 1016 int_to_threadref(thref, -cpu - 2);
986 pack_threadid(ptr, thref); 1017 pack_threadid(ptr, thref);
987 ptr += BUF_THREAD_ID_SIZE; 1018 ptr += BUF_THREAD_ID_SIZE;
988 *(ptr++) = ','; 1019 *(ptr++) = ',';
989 i++; 1020 i++;
990 } 1021 }
991 } 1022 }
1023
1024 do_each_thread(g, p) {
1025 if (i >= ks->thr_query && !finished) {
1026 int_to_threadref(thref, p->pid);
1027 pack_threadid(ptr, thref);
1028 ptr += BUF_THREAD_ID_SIZE;
1029 *(ptr++) = ',';
1030 ks->thr_query++;
1031 if (ks->thr_query % KGDB_MAX_THREAD_QUERY == 0)
1032 finished = 1;
1033 }
1034 i++;
1035 } while_each_thread(g, p);
1036
992 *(--ptr) = '\0'; 1037 *(--ptr) = '\0';
993 break; 1038 break;
994 1039
@@ -1011,15 +1056,15 @@ static void gdb_cmd_query(struct kgdb_state *ks)
1011 error_packet(remcom_out_buffer, -EINVAL); 1056 error_packet(remcom_out_buffer, -EINVAL);
1012 break; 1057 break;
1013 } 1058 }
1014 if (ks->threadid > 0) { 1059 if ((int)ks->threadid > 0) {
1015 kgdb_mem2hex(getthread(ks->linux_regs, 1060 kgdb_mem2hex(getthread(ks->linux_regs,
1016 ks->threadid)->comm, 1061 ks->threadid)->comm,
1017 remcom_out_buffer, 16); 1062 remcom_out_buffer, 16);
1018 } else { 1063 } else {
1019 static char tmpstr[23 + BUF_THREAD_ID_SIZE]; 1064 static char tmpstr[23 + BUF_THREAD_ID_SIZE];
1020 1065
1021 sprintf(tmpstr, "Shadow task %d for pid 0", 1066 sprintf(tmpstr, "shadowCPU%d",
1022 (int)(-ks->threadid-1)); 1067 (int)(-ks->threadid - 2));
1023 kgdb_mem2hex(tmpstr, remcom_out_buffer, strlen(tmpstr)); 1068 kgdb_mem2hex(tmpstr, remcom_out_buffer, strlen(tmpstr));
1024 } 1069 }
1025 break; 1070 break;
@@ -1388,6 +1433,7 @@ acquirelock:
1388 atomic_read(&kgdb_cpu_doing_single_step) != cpu) { 1433 atomic_read(&kgdb_cpu_doing_single_step) != cpu) {
1389 1434
1390 atomic_set(&kgdb_active, -1); 1435 atomic_set(&kgdb_active, -1);
1436 touch_softlockup_watchdog();
1391 clocksource_touch_watchdog(); 1437 clocksource_touch_watchdog();
1392 local_irq_restore(flags); 1438 local_irq_restore(flags);
1393 1439
@@ -1418,7 +1464,7 @@ acquirelock:
1418 * Get the passive CPU lock which will hold all the non-primary 1464 * Get the passive CPU lock which will hold all the non-primary
1419 * CPU in a spin state while the debugger is active 1465 * CPU in a spin state while the debugger is active
1420 */ 1466 */
1421 if (!kgdb_single_step || !kgdb_contthread) { 1467 if (!kgdb_single_step) {
1422 for (i = 0; i < NR_CPUS; i++) 1468 for (i = 0; i < NR_CPUS; i++)
1423 atomic_set(&passive_cpu_wait[i], 1); 1469 atomic_set(&passive_cpu_wait[i], 1);
1424 } 1470 }
@@ -1431,7 +1477,7 @@ acquirelock:
1431 1477
1432#ifdef CONFIG_SMP 1478#ifdef CONFIG_SMP
1433 /* Signal the other CPUs to enter kgdb_wait() */ 1479 /* Signal the other CPUs to enter kgdb_wait() */
1434 if ((!kgdb_single_step || !kgdb_contthread) && kgdb_do_roundup) 1480 if ((!kgdb_single_step) && kgdb_do_roundup)
1435 kgdb_roundup_cpus(flags); 1481 kgdb_roundup_cpus(flags);
1436#endif 1482#endif
1437 1483
@@ -1450,7 +1496,7 @@ acquirelock:
1450 kgdb_post_primary_code(ks->linux_regs, ks->ex_vector, ks->err_code); 1496 kgdb_post_primary_code(ks->linux_regs, ks->ex_vector, ks->err_code);
1451 kgdb_deactivate_sw_breakpoints(); 1497 kgdb_deactivate_sw_breakpoints();
1452 kgdb_single_step = 0; 1498 kgdb_single_step = 0;
1453 kgdb_contthread = NULL; 1499 kgdb_contthread = current;
1454 exception_level = 0; 1500 exception_level = 0;
1455 1501
1456 /* Talk to debugger with gdbserial protocol */ 1502 /* Talk to debugger with gdbserial protocol */
@@ -1464,7 +1510,7 @@ acquirelock:
1464 kgdb_info[ks->cpu].task = NULL; 1510 kgdb_info[ks->cpu].task = NULL;
1465 atomic_set(&cpu_in_kgdb[ks->cpu], 0); 1511 atomic_set(&cpu_in_kgdb[ks->cpu], 0);
1466 1512
1467 if (!kgdb_single_step || !kgdb_contthread) { 1513 if (!kgdb_single_step) {
1468 for (i = NR_CPUS-1; i >= 0; i--) 1514 for (i = NR_CPUS-1; i >= 0; i--)
1469 atomic_set(&passive_cpu_wait[i], 0); 1515 atomic_set(&passive_cpu_wait[i], 0);
1470 /* 1516 /*
@@ -1480,6 +1526,7 @@ acquirelock:
1480kgdb_restore: 1526kgdb_restore:
1481 /* Free kgdb_active */ 1527 /* Free kgdb_active */
1482 atomic_set(&kgdb_active, -1); 1528 atomic_set(&kgdb_active, -1);
1529 touch_softlockup_watchdog();
1483 clocksource_touch_watchdog(); 1530 clocksource_touch_watchdog();
1484 local_irq_restore(flags); 1531 local_irq_restore(flags);
1485 1532
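Note: with this change kgdb exposes one shadow thread per online CPU, numbered -cpu - 2, alongside the real PIDs, and kgdb_hex2long() learns to parse the leading '-' those IDs need. The mapping, as read from the hunks above (illustrative helpers, not functions from the patch):

static inline int cpu_to_shadow_tid(int cpu)
{
        return -cpu - 2;        /* CPU 0 -> -2, CPU 1 -> -3, ... */
}

static inline int shadow_tid_to_cpu(int tid)
{
        return -tid - 2;        /* inverse, as used by getthread() and the 'q' query code */
}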
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 8df97d3dfda8..3d3c3ea3a023 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -42,7 +42,7 @@ extern int max_threads;
42 42
43static struct workqueue_struct *khelper_wq; 43static struct workqueue_struct *khelper_wq;
44 44
45#ifdef CONFIG_KMOD 45#ifdef CONFIG_MODULES
46 46
47/* 47/*
48 modprobe_path is set via /proc/sys. 48 modprobe_path is set via /proc/sys.
@@ -113,7 +113,7 @@ int request_module(const char *fmt, ...)
113 return ret; 113 return ret;
114} 114}
115EXPORT_SYMBOL(request_module); 115EXPORT_SYMBOL(request_module);
116#endif /* CONFIG_KMOD */ 116#endif /* CONFIG_MODULES */
117 117
118struct subprocess_info { 118struct subprocess_info {
119 struct work_struct work; 119 struct work_struct work;
@@ -265,7 +265,7 @@ static void __call_usermodehelper(struct work_struct *work)
265 } 265 }
266} 266}
267 267
268#ifdef CONFIG_PM 268#ifdef CONFIG_PM_SLEEP
269/* 269/*
270 * If set, call_usermodehelper_exec() will exit immediately returning -EBUSY 270 * If set, call_usermodehelper_exec() will exit immediately returning -EBUSY
271 * (used for preventing user land processes from being created after the user 271 * (used for preventing user land processes from being created after the user
@@ -288,39 +288,37 @@ static DECLARE_WAIT_QUEUE_HEAD(running_helpers_waitq);
288 */ 288 */
289#define RUNNING_HELPERS_TIMEOUT (5 * HZ) 289#define RUNNING_HELPERS_TIMEOUT (5 * HZ)
290 290
291static int usermodehelper_pm_callback(struct notifier_block *nfb, 291/**
292 unsigned long action, 292 * usermodehelper_disable - prevent new helpers from being started
293 void *ignored) 293 */
294int usermodehelper_disable(void)
294{ 295{
295 long retval; 296 long retval;
296 297
297 switch (action) { 298 usermodehelper_disabled = 1;
298 case PM_HIBERNATION_PREPARE: 299 smp_mb();
299 case PM_SUSPEND_PREPARE: 300 /*
300 usermodehelper_disabled = 1; 301 * From now on call_usermodehelper_exec() won't start any new
301 smp_mb(); 302 * helpers, so it is sufficient if running_helpers turns out to
302 /* 303 * be zero at one point (it may be increased later, but that
303 * From now on call_usermodehelper_exec() won't start any new 304 * doesn't matter).
304 * helpers, so it is sufficient if running_helpers turns out to 305 */
305 * be zero at one point (it may be increased later, but that 306 retval = wait_event_timeout(running_helpers_waitq,
306 * doesn't matter).
307 */
308 retval = wait_event_timeout(running_helpers_waitq,
309 atomic_read(&running_helpers) == 0, 307 atomic_read(&running_helpers) == 0,
310 RUNNING_HELPERS_TIMEOUT); 308 RUNNING_HELPERS_TIMEOUT);
311 if (retval) { 309 if (retval)
312 return NOTIFY_OK; 310 return 0;
313 } else {
314 usermodehelper_disabled = 0;
315 return NOTIFY_BAD;
316 }
317 case PM_POST_HIBERNATION:
318 case PM_POST_SUSPEND:
319 usermodehelper_disabled = 0;
320 return NOTIFY_OK;
321 }
322 311
323 return NOTIFY_DONE; 312 usermodehelper_disabled = 0;
313 return -EAGAIN;
314}
315
316/**
317 * usermodehelper_enable - allow new helpers to be started again
318 */
319void usermodehelper_enable(void)
320{
321 usermodehelper_disabled = 0;
324} 322}
325 323
326static void helper_lock(void) 324static void helper_lock(void)
@@ -334,34 +332,29 @@ static void helper_unlock(void)
334 if (atomic_dec_and_test(&running_helpers)) 332 if (atomic_dec_and_test(&running_helpers))
335 wake_up(&running_helpers_waitq); 333 wake_up(&running_helpers_waitq);
336} 334}
337 335#else /* CONFIG_PM_SLEEP */
338static void register_pm_notifier_callback(void)
339{
340 pm_notifier(usermodehelper_pm_callback, 0);
341}
342#else /* CONFIG_PM */
343#define usermodehelper_disabled 0 336#define usermodehelper_disabled 0
344 337
345static inline void helper_lock(void) {} 338static inline void helper_lock(void) {}
346static inline void helper_unlock(void) {} 339static inline void helper_unlock(void) {}
347static inline void register_pm_notifier_callback(void) {} 340#endif /* CONFIG_PM_SLEEP */
348#endif /* CONFIG_PM */
349 341
350/** 342/**
351 * call_usermodehelper_setup - prepare to call a usermode helper 343 * call_usermodehelper_setup - prepare to call a usermode helper
352 * @path: path to usermode executable 344 * @path: path to usermode executable
353 * @argv: arg vector for process 345 * @argv: arg vector for process
354 * @envp: environment for process 346 * @envp: environment for process
347 * @gfp_mask: gfp mask for memory allocation
355 * 348 *
356 * Returns either %NULL on allocation failure, or a subprocess_info 349 * Returns either %NULL on allocation failure, or a subprocess_info
357 * structure. This should be passed to call_usermodehelper_exec to 350 * structure. This should be passed to call_usermodehelper_exec to
358 * exec the process and free the structure. 351 * exec the process and free the structure.
359 */ 352 */
360struct subprocess_info *call_usermodehelper_setup(char *path, 353struct subprocess_info *call_usermodehelper_setup(char *path, char **argv,
361 char **argv, char **envp) 354 char **envp, gfp_t gfp_mask)
362{ 355{
363 struct subprocess_info *sub_info; 356 struct subprocess_info *sub_info;
364 sub_info = kzalloc(sizeof(struct subprocess_info), GFP_ATOMIC); 357 sub_info = kzalloc(sizeof(struct subprocess_info), gfp_mask);
365 if (!sub_info) 358 if (!sub_info)
366 goto out; 359 goto out;
367 360
@@ -417,12 +410,12 @@ int call_usermodehelper_stdinpipe(struct subprocess_info *sub_info,
417{ 410{
418 struct file *f; 411 struct file *f;
419 412
420 f = create_write_pipe(); 413 f = create_write_pipe(0);
421 if (IS_ERR(f)) 414 if (IS_ERR(f))
422 return PTR_ERR(f); 415 return PTR_ERR(f);
423 *filp = f; 416 *filp = f;
424 417
425 f = create_read_pipe(f); 418 f = create_read_pipe(f, 0);
426 if (IS_ERR(f)) { 419 if (IS_ERR(f)) {
427 free_write_pipe(*filp); 420 free_write_pipe(*filp);
428 return PTR_ERR(f); 421 return PTR_ERR(f);
@@ -494,7 +487,7 @@ int call_usermodehelper_pipe(char *path, char **argv, char **envp,
494 struct subprocess_info *sub_info; 487 struct subprocess_info *sub_info;
495 int ret; 488 int ret;
496 489
497 sub_info = call_usermodehelper_setup(path, argv, envp); 490 sub_info = call_usermodehelper_setup(path, argv, envp, GFP_KERNEL);
498 if (sub_info == NULL) 491 if (sub_info == NULL)
499 return -ENOMEM; 492 return -ENOMEM;
500 493
@@ -514,5 +507,4 @@ void __init usermodehelper_init(void)
514{ 507{
515 khelper_wq = create_singlethread_workqueue("khelper"); 508 khelper_wq = create_singlethread_workqueue("khelper");
516 BUG_ON(!khelper_wq); 509 BUG_ON(!khelper_wq);
517 register_pm_notifier_callback();
518} 510}
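Note: instead of hooking a PM notifier from usermodehelper_init(), kmod now exports usermodehelper_disable()/usermodehelper_enable() and leaves it to the suspend and hibernation paths to call them at the right moments. A sketch of how a caller might use the pair (demo function, not from this patch):

#include <linux/kmod.h>         /* assumed declaration site for the new helpers */

static int demo_prepare_sleep(void)
{
        int error;

        error = usermodehelper_disable();       /* -EAGAIN if helpers fail to quiesce in time */
        if (error)
                return error;

        /* ... freeze tasks, suspend devices ... */

        usermodehelper_enable();
        return 0;
}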
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 1485ca8d0e00..8b57a2597f21 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -62,6 +62,7 @@
62 addr = ((kprobe_opcode_t *)(kallsyms_lookup_name(name))) 62 addr = ((kprobe_opcode_t *)(kallsyms_lookup_name(name)))
63#endif 63#endif
64 64
65static int kprobes_initialized;
65static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE]; 66static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE];
66static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE]; 67static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE];
67 68
@@ -69,8 +70,15 @@ static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE];
69static bool kprobe_enabled; 70static bool kprobe_enabled;
70 71
71DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */ 72DEFINE_MUTEX(kprobe_mutex); /* Protects kprobe_table */
72DEFINE_SPINLOCK(kretprobe_lock); /* Protects kretprobe_inst_table */
73static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL; 73static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL;
74static struct {
75 spinlock_t lock ____cacheline_aligned;
76} kretprobe_table_locks[KPROBE_TABLE_SIZE];
77
78static spinlock_t *kretprobe_table_lock_ptr(unsigned long hash)
79{
80 return &(kretprobe_table_locks[hash].lock);
81}
74 82
75/* 83/*
76 * Normally, functions that we'd want to prohibit kprobes in, are marked 84 * Normally, functions that we'd want to prohibit kprobes in, are marked
@@ -368,26 +376,53 @@ void __kprobes kprobes_inc_nmissed_count(struct kprobe *p)
368 return; 376 return;
369} 377}
370 378
371/* Called with kretprobe_lock held */
372void __kprobes recycle_rp_inst(struct kretprobe_instance *ri, 379void __kprobes recycle_rp_inst(struct kretprobe_instance *ri,
373 struct hlist_head *head) 380 struct hlist_head *head)
374{ 381{
382 struct kretprobe *rp = ri->rp;
383
375 /* remove rp inst off the rprobe_inst_table */ 384 /* remove rp inst off the rprobe_inst_table */
376 hlist_del(&ri->hlist); 385 hlist_del(&ri->hlist);
377 if (ri->rp) { 386 INIT_HLIST_NODE(&ri->hlist);
378 /* remove rp inst off the used list */ 387 if (likely(rp)) {
379 hlist_del(&ri->uflist); 388 spin_lock(&rp->lock);
380 /* put rp inst back onto the free list */ 389 hlist_add_head(&ri->hlist, &rp->free_instances);
381 INIT_HLIST_NODE(&ri->uflist); 390 spin_unlock(&rp->lock);
382 hlist_add_head(&ri->uflist, &ri->rp->free_instances);
383 } else 391 } else
384 /* Unregistering */ 392 /* Unregistering */
385 hlist_add_head(&ri->hlist, head); 393 hlist_add_head(&ri->hlist, head);
386} 394}
387 395
388struct hlist_head __kprobes *kretprobe_inst_table_head(struct task_struct *tsk) 396void kretprobe_hash_lock(struct task_struct *tsk,
397 struct hlist_head **head, unsigned long *flags)
389{ 398{
390 return &kretprobe_inst_table[hash_ptr(tsk, KPROBE_HASH_BITS)]; 399 unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS);
400 spinlock_t *hlist_lock;
401
402 *head = &kretprobe_inst_table[hash];
403 hlist_lock = kretprobe_table_lock_ptr(hash);
404 spin_lock_irqsave(hlist_lock, *flags);
405}
406
407static void kretprobe_table_lock(unsigned long hash, unsigned long *flags)
408{
409 spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash);
410 spin_lock_irqsave(hlist_lock, *flags);
411}
412
413void kretprobe_hash_unlock(struct task_struct *tsk, unsigned long *flags)
414{
415 unsigned long hash = hash_ptr(tsk, KPROBE_HASH_BITS);
416 spinlock_t *hlist_lock;
417
418 hlist_lock = kretprobe_table_lock_ptr(hash);
419 spin_unlock_irqrestore(hlist_lock, *flags);
420}
421
422void kretprobe_table_unlock(unsigned long hash, unsigned long *flags)
423{
424 spinlock_t *hlist_lock = kretprobe_table_lock_ptr(hash);
425 spin_unlock_irqrestore(hlist_lock, *flags);
391} 426}
392 427
393/* 428/*
@@ -401,17 +436,21 @@ void __kprobes kprobe_flush_task(struct task_struct *tk)
401 struct kretprobe_instance *ri; 436 struct kretprobe_instance *ri;
402 struct hlist_head *head, empty_rp; 437 struct hlist_head *head, empty_rp;
403 struct hlist_node *node, *tmp; 438 struct hlist_node *node, *tmp;
404 unsigned long flags = 0; 439 unsigned long hash, flags = 0;
405 440
406 INIT_HLIST_HEAD(&empty_rp); 441 if (unlikely(!kprobes_initialized))
407 spin_lock_irqsave(&kretprobe_lock, flags); 442 /* Early boot. kretprobe_table_locks not yet initialized. */
408 head = kretprobe_inst_table_head(tk); 443 return;
444
445 hash = hash_ptr(tk, KPROBE_HASH_BITS);
446 head = &kretprobe_inst_table[hash];
447 kretprobe_table_lock(hash, &flags);
409 hlist_for_each_entry_safe(ri, node, tmp, head, hlist) { 448 hlist_for_each_entry_safe(ri, node, tmp, head, hlist) {
410 if (ri->task == tk) 449 if (ri->task == tk)
411 recycle_rp_inst(ri, &empty_rp); 450 recycle_rp_inst(ri, &empty_rp);
412 } 451 }
413 spin_unlock_irqrestore(&kretprobe_lock, flags); 452 kretprobe_table_unlock(hash, &flags);
414 453 INIT_HLIST_HEAD(&empty_rp);
415 hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) { 454 hlist_for_each_entry_safe(ri, node, tmp, &empty_rp, hlist) {
416 hlist_del(&ri->hlist); 455 hlist_del(&ri->hlist);
417 kfree(ri); 456 kfree(ri);
@@ -423,24 +462,29 @@ static inline void free_rp_inst(struct kretprobe *rp)
423 struct kretprobe_instance *ri; 462 struct kretprobe_instance *ri;
424 struct hlist_node *pos, *next; 463 struct hlist_node *pos, *next;
425 464
426 hlist_for_each_entry_safe(ri, pos, next, &rp->free_instances, uflist) { 465 hlist_for_each_entry_safe(ri, pos, next, &rp->free_instances, hlist) {
427 hlist_del(&ri->uflist); 466 hlist_del(&ri->hlist);
428 kfree(ri); 467 kfree(ri);
429 } 468 }
430} 469}
431 470
432static void __kprobes cleanup_rp_inst(struct kretprobe *rp) 471static void __kprobes cleanup_rp_inst(struct kretprobe *rp)
433{ 472{
434 unsigned long flags; 473 unsigned long flags, hash;
435 struct kretprobe_instance *ri; 474 struct kretprobe_instance *ri;
436 struct hlist_node *pos, *next; 475 struct hlist_node *pos, *next;
476 struct hlist_head *head;
477
437 /* No race here */ 478 /* No race here */
438 spin_lock_irqsave(&kretprobe_lock, flags); 479 for (hash = 0; hash < KPROBE_TABLE_SIZE; hash++) {
439 hlist_for_each_entry_safe(ri, pos, next, &rp->used_instances, uflist) { 480 kretprobe_table_lock(hash, &flags);
440 ri->rp = NULL; 481 head = &kretprobe_inst_table[hash];
441 hlist_del(&ri->uflist); 482 hlist_for_each_entry_safe(ri, pos, next, head, hlist) {
483 if (ri->rp == rp)
484 ri->rp = NULL;
485 }
486 kretprobe_table_unlock(hash, &flags);
442 } 487 }
443 spin_unlock_irqrestore(&kretprobe_lock, flags);
444 free_rp_inst(rp); 488 free_rp_inst(rp);
445} 489}
446 490
@@ -831,32 +875,37 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p,
831 struct pt_regs *regs) 875 struct pt_regs *regs)
832{ 876{
833 struct kretprobe *rp = container_of(p, struct kretprobe, kp); 877 struct kretprobe *rp = container_of(p, struct kretprobe, kp);
834 unsigned long flags = 0; 878 unsigned long hash, flags = 0;
879 struct kretprobe_instance *ri;
835 880
836 /*TODO: consider to only swap the RA after the last pre_handler fired */ 881 /*TODO: consider to only swap the RA after the last pre_handler fired */
837 spin_lock_irqsave(&kretprobe_lock, flags); 882 hash = hash_ptr(current, KPROBE_HASH_BITS);
883 spin_lock_irqsave(&rp->lock, flags);
838 if (!hlist_empty(&rp->free_instances)) { 884 if (!hlist_empty(&rp->free_instances)) {
839 struct kretprobe_instance *ri;
840
841 ri = hlist_entry(rp->free_instances.first, 885 ri = hlist_entry(rp->free_instances.first,
842 struct kretprobe_instance, uflist); 886 struct kretprobe_instance, hlist);
887 hlist_del(&ri->hlist);
888 spin_unlock_irqrestore(&rp->lock, flags);
889
843 ri->rp = rp; 890 ri->rp = rp;
844 ri->task = current; 891 ri->task = current;
845 892
846 if (rp->entry_handler && rp->entry_handler(ri, regs)) { 893 if (rp->entry_handler && rp->entry_handler(ri, regs)) {
847 spin_unlock_irqrestore(&kretprobe_lock, flags); 894 spin_unlock_irqrestore(&rp->lock, flags);
848 return 0; 895 return 0;
849 } 896 }
850 897
851 arch_prepare_kretprobe(ri, regs); 898 arch_prepare_kretprobe(ri, regs);
852 899
853 /* XXX(hch): why is there no hlist_move_head? */ 900 /* XXX(hch): why is there no hlist_move_head? */
854 hlist_del(&ri->uflist); 901 INIT_HLIST_NODE(&ri->hlist);
855 hlist_add_head(&ri->uflist, &ri->rp->used_instances); 902 kretprobe_table_lock(hash, &flags);
856 hlist_add_head(&ri->hlist, kretprobe_inst_table_head(ri->task)); 903 hlist_add_head(&ri->hlist, &kretprobe_inst_table[hash]);
857 } else 904 kretprobe_table_unlock(hash, &flags);
905 } else {
858 rp->nmissed++; 906 rp->nmissed++;
859 spin_unlock_irqrestore(&kretprobe_lock, flags); 907 spin_unlock_irqrestore(&rp->lock, flags);
908 }
860 return 0; 909 return 0;
861} 910}
862 911
@@ -892,7 +941,7 @@ static int __kprobes __register_kretprobe(struct kretprobe *rp,
892 rp->maxactive = NR_CPUS; 941 rp->maxactive = NR_CPUS;
893#endif 942#endif
894 } 943 }
895 INIT_HLIST_HEAD(&rp->used_instances); 944 spin_lock_init(&rp->lock);
896 INIT_HLIST_HEAD(&rp->free_instances); 945 INIT_HLIST_HEAD(&rp->free_instances);
897 for (i = 0; i < rp->maxactive; i++) { 946 for (i = 0; i < rp->maxactive; i++) {
898 inst = kmalloc(sizeof(struct kretprobe_instance) + 947 inst = kmalloc(sizeof(struct kretprobe_instance) +
@@ -901,8 +950,8 @@ static int __kprobes __register_kretprobe(struct kretprobe *rp,
901 free_rp_inst(rp); 950 free_rp_inst(rp);
902 return -ENOMEM; 951 return -ENOMEM;
903 } 952 }
904 INIT_HLIST_NODE(&inst->uflist); 953 INIT_HLIST_NODE(&inst->hlist);
905 hlist_add_head(&inst->uflist, &rp->free_instances); 954 hlist_add_head(&inst->hlist, &rp->free_instances);
906 } 955 }
907 956
908 rp->nmissed = 0; 957 rp->nmissed = 0;
@@ -1009,6 +1058,7 @@ static int __init init_kprobes(void)
1009 for (i = 0; i < KPROBE_TABLE_SIZE; i++) { 1058 for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
1010 INIT_HLIST_HEAD(&kprobe_table[i]); 1059 INIT_HLIST_HEAD(&kprobe_table[i]);
1011 INIT_HLIST_HEAD(&kretprobe_inst_table[i]); 1060 INIT_HLIST_HEAD(&kretprobe_inst_table[i]);
1061 spin_lock_init(&(kretprobe_table_locks[i].lock));
1012 } 1062 }
1013 1063
1014 /* 1064 /*
@@ -1050,6 +1100,7 @@ static int __init init_kprobes(void)
1050 err = arch_init_kprobes(); 1100 err = arch_init_kprobes();
1051 if (!err) 1101 if (!err)
1052 err = register_die_notifier(&kprobe_exceptions_nb); 1102 err = register_die_notifier(&kprobe_exceptions_nb);
1103 kprobes_initialized = (err == 0);
1053 1104
1054 if (!err) 1105 if (!err)
1055 init_test_probes(); 1106 init_test_probes();
@@ -1286,13 +1337,8 @@ EXPORT_SYMBOL_GPL(register_jprobe);
1286EXPORT_SYMBOL_GPL(unregister_jprobe); 1337EXPORT_SYMBOL_GPL(unregister_jprobe);
1287EXPORT_SYMBOL_GPL(register_jprobes); 1338EXPORT_SYMBOL_GPL(register_jprobes);
1288EXPORT_SYMBOL_GPL(unregister_jprobes); 1339EXPORT_SYMBOL_GPL(unregister_jprobes);
1289#ifdef CONFIG_KPROBES
1290EXPORT_SYMBOL_GPL(jprobe_return); 1340EXPORT_SYMBOL_GPL(jprobe_return);
1291#endif
1292
1293#ifdef CONFIG_KPROBES
1294EXPORT_SYMBOL_GPL(register_kretprobe); 1341EXPORT_SYMBOL_GPL(register_kretprobe);
1295EXPORT_SYMBOL_GPL(unregister_kretprobe); 1342EXPORT_SYMBOL_GPL(unregister_kretprobe);
1296EXPORT_SYMBOL_GPL(register_kretprobes); 1343EXPORT_SYMBOL_GPL(register_kretprobes);
1297EXPORT_SYMBOL_GPL(unregister_kretprobes); 1344EXPORT_SYMBOL_GPL(unregister_kretprobes);
1298#endif
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index e53bc30e9ba5..08dd8ed86c77 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -14,6 +14,7 @@
14#include <linux/module.h> 14#include <linux/module.h>
15#include <linux/init.h> 15#include <linux/init.h>
16#include <linux/kexec.h> 16#include <linux/kexec.h>
17#include <linux/profile.h>
17#include <linux/sched.h> 18#include <linux/sched.h>
18 19
19#define KERNEL_ATTR_RO(_name) \ 20#define KERNEL_ATTR_RO(_name) \
@@ -53,6 +54,37 @@ static ssize_t uevent_helper_store(struct kobject *kobj,
53KERNEL_ATTR_RW(uevent_helper); 54KERNEL_ATTR_RW(uevent_helper);
54#endif 55#endif
55 56
57#ifdef CONFIG_PROFILING
58static ssize_t profiling_show(struct kobject *kobj,
59 struct kobj_attribute *attr, char *buf)
60{
61 return sprintf(buf, "%d\n", prof_on);
62}
63static ssize_t profiling_store(struct kobject *kobj,
64 struct kobj_attribute *attr,
65 const char *buf, size_t count)
66{
67 int ret;
68
69 if (prof_on)
70 return -EEXIST;
71 /*
72 * This eventually calls into get_option() which
73 * has a ton of callers and is not const. It is
74 * easiest to cast it away here.
75 */
76 profile_setup((char *)buf);
77 ret = profile_init();
78 if (ret)
79 return ret;
80 ret = create_proc_profile();
81 if (ret)
82 return ret;
83 return count;
84}
85KERNEL_ATTR_RW(profiling);
86#endif
87
56#ifdef CONFIG_KEXEC 88#ifdef CONFIG_KEXEC
57static ssize_t kexec_loaded_show(struct kobject *kobj, 89static ssize_t kexec_loaded_show(struct kobject *kobj,
58 struct kobj_attribute *attr, char *buf) 90 struct kobj_attribute *attr, char *buf)
@@ -109,6 +141,9 @@ static struct attribute * kernel_attrs[] = {
109 &uevent_seqnum_attr.attr, 141 &uevent_seqnum_attr.attr,
110 &uevent_helper_attr.attr, 142 &uevent_helper_attr.attr,
111#endif 143#endif
144#ifdef CONFIG_PROFILING
145 &profiling_attr.attr,
146#endif
112#ifdef CONFIG_KEXEC 147#ifdef CONFIG_KEXEC
113 &kexec_loaded_attr.attr, 148 &kexec_loaded_attr.attr,
114 &kexec_crash_loaded_attr.attr, 149 &kexec_crash_loaded_attr.attr,
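
The ksysfs.c addition hangs a read/write "profiling" attribute off the kernel kobject, which should surface as /sys/kernel/profiling: reads report prof_on, and the first write runs profile_setup()/profile_init()/create_proc_profile(), after which further writes fail with EEXIST. A small userspace sketch of poking that file follows; the assumption that the written string is parsed like the profile= boot parameter (e.g. "schedule" or a plain shift value) comes from profile_setup() generally, not from this hunk.

/* Userspace sketch: enable kernel profiling via the new sysfs attribute. */
#include <errno.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
        const char *path = "/sys/kernel/profiling";
        char cur[16] = "";
        FILE *f;

        f = fopen(path, "r");
        if (f) {
                if (fgets(cur, sizeof(cur), f))
                        printf("current prof_on: %s", cur);
                fclose(f);
        }

        f = fopen(path, "w");
        if (!f) {
                fprintf(stderr, "open for write failed: %s\n", strerror(errno));
                return 1;
        }
        /* "schedule" is assumed to be parsed like the profile= boot option. */
        if (fputs("schedule\n", f) == EOF || fclose(f) == EOF)
                fprintf(stderr, "write failed (a second enable returns EEXIST)\n");
        else
                puts("profiling enabled; samples appear under /proc/profile");
        return 0;
}
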
diff --git a/kernel/kthread.c b/kernel/kthread.c
index ac3fb7326641..8e7a7ce3ed0a 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -13,6 +13,7 @@
13#include <linux/file.h> 13#include <linux/file.h>
14#include <linux/module.h> 14#include <linux/module.h>
15#include <linux/mutex.h> 15#include <linux/mutex.h>
16#include <trace/sched.h>
16 17
17#define KTHREAD_NICE_LEVEL (-5) 18#define KTHREAD_NICE_LEVEL (-5)
18 19
@@ -106,7 +107,7 @@ static void create_kthread(struct kthread_create_info *create)
106 */ 107 */
107 sched_setscheduler(create->result, SCHED_NORMAL, &param); 108 sched_setscheduler(create->result, SCHED_NORMAL, &param);
108 set_user_nice(create->result, KTHREAD_NICE_LEVEL); 109 set_user_nice(create->result, KTHREAD_NICE_LEVEL);
109 set_cpus_allowed(create->result, CPU_MASK_ALL); 110 set_cpus_allowed_ptr(create->result, CPU_MASK_ALL_PTR);
110 } 111 }
111 complete(&create->done); 112 complete(&create->done);
112} 113}
@@ -171,12 +172,11 @@ EXPORT_SYMBOL(kthread_create);
171 */ 172 */
172void kthread_bind(struct task_struct *k, unsigned int cpu) 173void kthread_bind(struct task_struct *k, unsigned int cpu)
173{ 174{
174 if (k->state != TASK_UNINTERRUPTIBLE) { 175 /* Must have done schedule() in kthread() before we set_task_cpu */
176 if (!wait_task_inactive(k, TASK_UNINTERRUPTIBLE)) {
175 WARN_ON(1); 177 WARN_ON(1);
176 return; 178 return;
177 } 179 }
178 /* Must have done schedule() in kthread() before we set_task_cpu */
179 wait_task_inactive(k);
180 set_task_cpu(k, cpu); 180 set_task_cpu(k, cpu);
181 k->cpus_allowed = cpumask_of_cpu(cpu); 181 k->cpus_allowed = cpumask_of_cpu(cpu);
182 k->rt.nr_cpus_allowed = 1; 182 k->rt.nr_cpus_allowed = 1;
@@ -206,6 +206,8 @@ int kthread_stop(struct task_struct *k)
206 /* It could exit after stop_info.k set, but before wake_up_process. */ 206 /* It could exit after stop_info.k set, but before wake_up_process. */
207 get_task_struct(k); 207 get_task_struct(k);
208 208
209 trace_sched_kthread_stop(k);
210
209 /* Must init completion *before* thread sees kthread_stop_info.k */ 211 /* Must init completion *before* thread sees kthread_stop_info.k */
210 init_completion(&kthread_stop_info.done); 212 init_completion(&kthread_stop_info.done);
211 smp_wmb(); 213 smp_wmb();
@@ -221,6 +223,8 @@ int kthread_stop(struct task_struct *k)
221 ret = kthread_stop_info.err; 223 ret = kthread_stop_info.err;
222 mutex_unlock(&kthread_stop_lock); 224 mutex_unlock(&kthread_stop_lock);
223 225
226 trace_sched_kthread_stop_ret(ret);
227
224 return ret; 228 return ret;
225} 229}
226EXPORT_SYMBOL(kthread_stop); 230EXPORT_SYMBOL(kthread_stop);
@@ -233,7 +237,7 @@ int kthreadd(void *unused)
233 set_task_comm(tsk, "kthreadd"); 237 set_task_comm(tsk, "kthreadd");
234 ignore_signals(tsk); 238 ignore_signals(tsk);
235 set_user_nice(tsk, KTHREAD_NICE_LEVEL); 239 set_user_nice(tsk, KTHREAD_NICE_LEVEL);
236 set_cpus_allowed(tsk, CPU_MASK_ALL); 240 set_cpus_allowed_ptr(tsk, CPU_MASK_ALL_PTR);
237 241
238 current->flags |= PF_NOFREEZE | PF_FREEZER_NOSIG; 242 current->flags |= PF_NOFREEZE | PF_FREEZER_NOSIG;
239 243
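
Besides switching kthreadd and create_kthread() to set_cpus_allowed_ptr() and folding the state check into wait_task_inactive(), the kthread.c hunks add sched tracepoints around the kthread_stop() handshake. For context, here is a hedged module-style sketch of the caller pattern those tracepoints instrument; the kthread_* calls are the standard API, while the module boilerplate and names are illustrative only.

/* Sketch of the kthread create/bind/stop handshake (illustrative module). */
#include <linux/kthread.h>
#include <linux/module.h>
#include <linux/sched.h>
#include <linux/delay.h>
#include <linux/err.h>

static struct task_struct *demo_task;

static int demo_threadfn(void *data)
{
        /* kthread_stop() sets the stop flag and waits for us to return. */
        while (!kthread_should_stop())
                msleep(100);
        return 0;
}

static int __init demo_init(void)
{
        demo_task = kthread_create(demo_threadfn, NULL, "demo_kthread");
        if (IS_ERR(demo_task))
                return PTR_ERR(demo_task);
        kthread_bind(demo_task, 0);     /* pin to CPU 0 before first wakeup */
        wake_up_process(demo_task);
        return 0;
}

static void __exit demo_exit(void)
{
        /* Fires trace_sched_kthread_stop()/..._stop_ret() with this patch. */
        kthread_stop(demo_task);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");
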
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index d38a64362973..dbda475b13bd 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -124,6 +124,15 @@ static struct lock_list list_entries[MAX_LOCKDEP_ENTRIES];
124unsigned long nr_lock_classes; 124unsigned long nr_lock_classes;
125static struct lock_class lock_classes[MAX_LOCKDEP_KEYS]; 125static struct lock_class lock_classes[MAX_LOCKDEP_KEYS];
126 126
127static inline struct lock_class *hlock_class(struct held_lock *hlock)
128{
129 if (!hlock->class_idx) {
130 DEBUG_LOCKS_WARN_ON(1);
131 return NULL;
132 }
133 return lock_classes + hlock->class_idx - 1;
134}
135
127#ifdef CONFIG_LOCK_STAT 136#ifdef CONFIG_LOCK_STAT
128static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS], lock_stats); 137static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS], lock_stats);
129 138
@@ -222,7 +231,7 @@ static void lock_release_holdtime(struct held_lock *hlock)
222 231
223 holdtime = sched_clock() - hlock->holdtime_stamp; 232 holdtime = sched_clock() - hlock->holdtime_stamp;
224 233
225 stats = get_lock_stats(hlock->class); 234 stats = get_lock_stats(hlock_class(hlock));
226 if (hlock->read) 235 if (hlock->read)
227 lock_time_inc(&stats->read_holdtime, holdtime); 236 lock_time_inc(&stats->read_holdtime, holdtime);
228 else 237 else
@@ -372,6 +381,19 @@ unsigned int nr_process_chains;
372unsigned int max_lockdep_depth; 381unsigned int max_lockdep_depth;
373unsigned int max_recursion_depth; 382unsigned int max_recursion_depth;
374 383
384static unsigned int lockdep_dependency_gen_id;
385
386static bool lockdep_dependency_visit(struct lock_class *source,
387 unsigned int depth)
388{
389 if (!depth)
390 lockdep_dependency_gen_id++;
391 if (source->dep_gen_id == lockdep_dependency_gen_id)
392 return true;
393 source->dep_gen_id = lockdep_dependency_gen_id;
394 return false;
395}
396
375#ifdef CONFIG_DEBUG_LOCKDEP 397#ifdef CONFIG_DEBUG_LOCKDEP
376/* 398/*
377 * We cannot printk in early bootup code. Not even early_printk() 399 * We cannot printk in early bootup code. Not even early_printk()
@@ -505,7 +527,7 @@ static void print_lockdep_cache(struct lockdep_map *lock)
505 527
506static void print_lock(struct held_lock *hlock) 528static void print_lock(struct held_lock *hlock)
507{ 529{
508 print_lock_name(hlock->class); 530 print_lock_name(hlock_class(hlock));
509 printk(", at: "); 531 printk(", at: ");
510 print_ip_sym(hlock->acquire_ip); 532 print_ip_sym(hlock->acquire_ip);
511} 533}
@@ -558,6 +580,9 @@ static void print_lock_dependencies(struct lock_class *class, int depth)
558{ 580{
559 struct lock_list *entry; 581 struct lock_list *entry;
560 582
583 if (lockdep_dependency_visit(class, depth))
584 return;
585
561 if (DEBUG_LOCKS_WARN_ON(depth >= 20)) 586 if (DEBUG_LOCKS_WARN_ON(depth >= 20))
562 return; 587 return;
563 588
@@ -850,11 +875,11 @@ static int add_lock_to_list(struct lock_class *class, struct lock_class *this,
850 if (!entry) 875 if (!entry)
851 return 0; 876 return 0;
852 877
853 entry->class = this;
854 entry->distance = distance;
855 if (!save_trace(&entry->trace)) 878 if (!save_trace(&entry->trace))
856 return 0; 879 return 0;
857 880
881 entry->class = this;
882 entry->distance = distance;
858 /* 883 /*
859 * Since we never remove from the dependency list, the list can 884 * Since we never remove from the dependency list, the list can
860 * be walked lockless by other CPUs, it's only allocation 885 * be walked lockless by other CPUs, it's only allocation
@@ -932,7 +957,7 @@ static noinline int print_circular_bug_tail(void)
932 if (debug_locks_silent) 957 if (debug_locks_silent)
933 return 0; 958 return 0;
934 959
935 this.class = check_source->class; 960 this.class = hlock_class(check_source);
936 if (!save_trace(&this.trace)) 961 if (!save_trace(&this.trace))
937 return 0; 962 return 0;
938 963
@@ -959,6 +984,67 @@ static int noinline print_infinite_recursion_bug(void)
959 return 0; 984 return 0;
960} 985}
961 986
987unsigned long __lockdep_count_forward_deps(struct lock_class *class,
988 unsigned int depth)
989{
990 struct lock_list *entry;
991 unsigned long ret = 1;
992
993 if (lockdep_dependency_visit(class, depth))
994 return 0;
995
996 /*
997 * Recurse this class's dependency list:
998 */
999 list_for_each_entry(entry, &class->locks_after, entry)
1000 ret += __lockdep_count_forward_deps(entry->class, depth + 1);
1001
1002 return ret;
1003}
1004
1005unsigned long lockdep_count_forward_deps(struct lock_class *class)
1006{
1007 unsigned long ret, flags;
1008
1009 local_irq_save(flags);
1010 __raw_spin_lock(&lockdep_lock);
1011 ret = __lockdep_count_forward_deps(class, 0);
1012 __raw_spin_unlock(&lockdep_lock);
1013 local_irq_restore(flags);
1014
1015 return ret;
1016}
1017
1018unsigned long __lockdep_count_backward_deps(struct lock_class *class,
1019 unsigned int depth)
1020{
1021 struct lock_list *entry;
1022 unsigned long ret = 1;
1023
1024 if (lockdep_dependency_visit(class, depth))
1025 return 0;
1026 /*
1027 * Recurse this class's dependency list:
1028 */
1029 list_for_each_entry(entry, &class->locks_before, entry)
1030 ret += __lockdep_count_backward_deps(entry->class, depth + 1);
1031
1032 return ret;
1033}
1034
1035unsigned long lockdep_count_backward_deps(struct lock_class *class)
1036{
1037 unsigned long ret, flags;
1038
1039 local_irq_save(flags);
1040 __raw_spin_lock(&lockdep_lock);
1041 ret = __lockdep_count_backward_deps(class, 0);
1042 __raw_spin_unlock(&lockdep_lock);
1043 local_irq_restore(flags);
1044
1045 return ret;
1046}
1047
962/* 1048/*
963 * Prove that the dependency graph starting at <entry> can not 1049 * Prove that the dependency graph starting at <entry> can not
964 * lead to <target>. Print an error and return 0 if it does. 1050 * lead to <target>. Print an error and return 0 if it does.
@@ -968,6 +1054,9 @@ check_noncircular(struct lock_class *source, unsigned int depth)
968{ 1054{
969 struct lock_list *entry; 1055 struct lock_list *entry;
970 1056
1057 if (lockdep_dependency_visit(source, depth))
1058 return 1;
1059
971 debug_atomic_inc(&nr_cyclic_check_recursions); 1060 debug_atomic_inc(&nr_cyclic_check_recursions);
972 if (depth > max_recursion_depth) 1061 if (depth > max_recursion_depth)
973 max_recursion_depth = depth; 1062 max_recursion_depth = depth;
@@ -977,7 +1066,7 @@ check_noncircular(struct lock_class *source, unsigned int depth)
977 * Check this lock's dependency list: 1066 * Check this lock's dependency list:
978 */ 1067 */
979 list_for_each_entry(entry, &source->locks_after, entry) { 1068 list_for_each_entry(entry, &source->locks_after, entry) {
980 if (entry->class == check_target->class) 1069 if (entry->class == hlock_class(check_target))
981 return print_circular_bug_header(entry, depth+1); 1070 return print_circular_bug_header(entry, depth+1);
982 debug_atomic_inc(&nr_cyclic_checks); 1071 debug_atomic_inc(&nr_cyclic_checks);
983 if (!check_noncircular(entry->class, depth+1)) 1072 if (!check_noncircular(entry->class, depth+1))
@@ -1011,6 +1100,9 @@ find_usage_forwards(struct lock_class *source, unsigned int depth)
1011 struct lock_list *entry; 1100 struct lock_list *entry;
1012 int ret; 1101 int ret;
1013 1102
1103 if (lockdep_dependency_visit(source, depth))
1104 return 1;
1105
1014 if (depth > max_recursion_depth) 1106 if (depth > max_recursion_depth)
1015 max_recursion_depth = depth; 1107 max_recursion_depth = depth;
1016 if (depth >= RECURSION_LIMIT) 1108 if (depth >= RECURSION_LIMIT)
@@ -1050,6 +1142,9 @@ find_usage_backwards(struct lock_class *source, unsigned int depth)
1050 struct lock_list *entry; 1142 struct lock_list *entry;
1051 int ret; 1143 int ret;
1052 1144
1145 if (lockdep_dependency_visit(source, depth))
1146 return 1;
1147
1053 if (!__raw_spin_is_locked(&lockdep_lock)) 1148 if (!__raw_spin_is_locked(&lockdep_lock))
1054 return DEBUG_LOCKS_WARN_ON(1); 1149 return DEBUG_LOCKS_WARN_ON(1);
1055 1150
@@ -1064,6 +1159,11 @@ find_usage_backwards(struct lock_class *source, unsigned int depth)
1064 return 2; 1159 return 2;
1065 } 1160 }
1066 1161
1162 if (!source && debug_locks_off_graph_unlock()) {
1163 WARN_ON(1);
1164 return 0;
1165 }
1166
1067 /* 1167 /*
1068 * Check this lock's dependency list: 1168 * Check this lock's dependency list:
1069 */ 1169 */
@@ -1103,9 +1203,9 @@ print_bad_irq_dependency(struct task_struct *curr,
1103 printk("\nand this task is already holding:\n"); 1203 printk("\nand this task is already holding:\n");
1104 print_lock(prev); 1204 print_lock(prev);
1105 printk("which would create a new lock dependency:\n"); 1205 printk("which would create a new lock dependency:\n");
1106 print_lock_name(prev->class); 1206 print_lock_name(hlock_class(prev));
1107 printk(" ->"); 1207 printk(" ->");
1108 print_lock_name(next->class); 1208 print_lock_name(hlock_class(next));
1109 printk("\n"); 1209 printk("\n");
1110 1210
1111 printk("\nbut this new dependency connects a %s-irq-safe lock:\n", 1211 printk("\nbut this new dependency connects a %s-irq-safe lock:\n",
@@ -1146,12 +1246,12 @@ check_usage(struct task_struct *curr, struct held_lock *prev,
1146 1246
1147 find_usage_bit = bit_backwards; 1247 find_usage_bit = bit_backwards;
1148 /* fills in <backwards_match> */ 1248 /* fills in <backwards_match> */
1149 ret = find_usage_backwards(prev->class, 0); 1249 ret = find_usage_backwards(hlock_class(prev), 0);
1150 if (!ret || ret == 1) 1250 if (!ret || ret == 1)
1151 return ret; 1251 return ret;
1152 1252
1153 find_usage_bit = bit_forwards; 1253 find_usage_bit = bit_forwards;
1154 ret = find_usage_forwards(next->class, 0); 1254 ret = find_usage_forwards(hlock_class(next), 0);
1155 if (!ret || ret == 1) 1255 if (!ret || ret == 1)
1156 return ret; 1256 return ret;
1157 /* ret == 2 */ 1257 /* ret == 2 */
@@ -1272,18 +1372,32 @@ check_deadlock(struct task_struct *curr, struct held_lock *next,
1272 struct lockdep_map *next_instance, int read) 1372 struct lockdep_map *next_instance, int read)
1273{ 1373{
1274 struct held_lock *prev; 1374 struct held_lock *prev;
1375 struct held_lock *nest = NULL;
1275 int i; 1376 int i;
1276 1377
1277 for (i = 0; i < curr->lockdep_depth; i++) { 1378 for (i = 0; i < curr->lockdep_depth; i++) {
1278 prev = curr->held_locks + i; 1379 prev = curr->held_locks + i;
1279 if (prev->class != next->class) 1380
1381 if (prev->instance == next->nest_lock)
1382 nest = prev;
1383
1384 if (hlock_class(prev) != hlock_class(next))
1280 continue; 1385 continue;
1386
1281 /* 1387 /*
1282 * Allow read-after-read recursion of the same 1388 * Allow read-after-read recursion of the same
1283 * lock class (i.e. read_lock(lock)+read_lock(lock)): 1389 * lock class (i.e. read_lock(lock)+read_lock(lock)):
1284 */ 1390 */
1285 if ((read == 2) && prev->read) 1391 if ((read == 2) && prev->read)
1286 return 2; 1392 return 2;
1393
1394 /*
1395 * We're holding the nest_lock, which serializes this lock's
1396 * nesting behaviour.
1397 */
1398 if (nest)
1399 return 2;
1400
1287 return print_deadlock_bug(curr, prev, next); 1401 return print_deadlock_bug(curr, prev, next);
1288 } 1402 }
1289 return 1; 1403 return 1;
@@ -1329,7 +1443,7 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
1329 */ 1443 */
1330 check_source = next; 1444 check_source = next;
1331 check_target = prev; 1445 check_target = prev;
1332 if (!(check_noncircular(next->class, 0))) 1446 if (!(check_noncircular(hlock_class(next), 0)))
1333 return print_circular_bug_tail(); 1447 return print_circular_bug_tail();
1334 1448
1335 if (!check_prev_add_irq(curr, prev, next)) 1449 if (!check_prev_add_irq(curr, prev, next))
@@ -1353,8 +1467,8 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
1353 * chains - the second one will be new, but L1 already has 1467 * chains - the second one will be new, but L1 already has
1354 * L2 added to its dependency list, due to the first chain.) 1468 * L2 added to its dependency list, due to the first chain.)
1355 */ 1469 */
1356 list_for_each_entry(entry, &prev->class->locks_after, entry) { 1470 list_for_each_entry(entry, &hlock_class(prev)->locks_after, entry) {
1357 if (entry->class == next->class) { 1471 if (entry->class == hlock_class(next)) {
1358 if (distance == 1) 1472 if (distance == 1)
1359 entry->distance = 1; 1473 entry->distance = 1;
1360 return 2; 1474 return 2;
@@ -1365,26 +1479,28 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
1365 * Ok, all validations passed, add the new lock 1479 * Ok, all validations passed, add the new lock
1366 * to the previous lock's dependency list: 1480 * to the previous lock's dependency list:
1367 */ 1481 */
1368 ret = add_lock_to_list(prev->class, next->class, 1482 ret = add_lock_to_list(hlock_class(prev), hlock_class(next),
1369 &prev->class->locks_after, next->acquire_ip, distance); 1483 &hlock_class(prev)->locks_after,
1484 next->acquire_ip, distance);
1370 1485
1371 if (!ret) 1486 if (!ret)
1372 return 0; 1487 return 0;
1373 1488
1374 ret = add_lock_to_list(next->class, prev->class, 1489 ret = add_lock_to_list(hlock_class(next), hlock_class(prev),
1375 &next->class->locks_before, next->acquire_ip, distance); 1490 &hlock_class(next)->locks_before,
1491 next->acquire_ip, distance);
1376 if (!ret) 1492 if (!ret)
1377 return 0; 1493 return 0;
1378 1494
1379 /* 1495 /*
1380 * Debugging printouts: 1496 * Debugging printouts:
1381 */ 1497 */
1382 if (verbose(prev->class) || verbose(next->class)) { 1498 if (verbose(hlock_class(prev)) || verbose(hlock_class(next))) {
1383 graph_unlock(); 1499 graph_unlock();
1384 printk("\n new dependency: "); 1500 printk("\n new dependency: ");
1385 print_lock_name(prev->class); 1501 print_lock_name(hlock_class(prev));
1386 printk(" => "); 1502 printk(" => ");
1387 print_lock_name(next->class); 1503 print_lock_name(hlock_class(next));
1388 printk("\n"); 1504 printk("\n");
1389 dump_stack(); 1505 dump_stack();
1390 return graph_lock(); 1506 return graph_lock();
@@ -1481,7 +1597,7 @@ static inline int lookup_chain_cache(struct task_struct *curr,
1481 struct held_lock *hlock, 1597 struct held_lock *hlock,
1482 u64 chain_key) 1598 u64 chain_key)
1483{ 1599{
1484 struct lock_class *class = hlock->class; 1600 struct lock_class *class = hlock_class(hlock);
1485 struct list_head *hash_head = chainhashentry(chain_key); 1601 struct list_head *hash_head = chainhashentry(chain_key);
1486 struct lock_chain *chain; 1602 struct lock_chain *chain;
1487 struct held_lock *hlock_curr, *hlock_next; 1603 struct held_lock *hlock_curr, *hlock_next;
@@ -1554,7 +1670,7 @@ cache_hit:
1554 if (likely(cn + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS)) { 1670 if (likely(cn + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS)) {
1555 chain->base = cn; 1671 chain->base = cn;
1556 for (j = 0; j < chain->depth - 1; j++, i++) { 1672 for (j = 0; j < chain->depth - 1; j++, i++) {
1557 int lock_id = curr->held_locks[i].class - lock_classes; 1673 int lock_id = curr->held_locks[i].class_idx - 1;
1558 chain_hlocks[chain->base + j] = lock_id; 1674 chain_hlocks[chain->base + j] = lock_id;
1559 } 1675 }
1560 chain_hlocks[chain->base + j] = class - lock_classes; 1676 chain_hlocks[chain->base + j] = class - lock_classes;
@@ -1643,14 +1759,13 @@ static void check_chain_key(struct task_struct *curr)
1643 hlock = curr->held_locks + i; 1759 hlock = curr->held_locks + i;
1644 if (chain_key != hlock->prev_chain_key) { 1760 if (chain_key != hlock->prev_chain_key) {
1645 debug_locks_off(); 1761 debug_locks_off();
1646 printk("hm#1, depth: %u [%u], %016Lx != %016Lx\n", 1762 WARN(1, "hm#1, depth: %u [%u], %016Lx != %016Lx\n",
1647 curr->lockdep_depth, i, 1763 curr->lockdep_depth, i,
1648 (unsigned long long)chain_key, 1764 (unsigned long long)chain_key,
1649 (unsigned long long)hlock->prev_chain_key); 1765 (unsigned long long)hlock->prev_chain_key);
1650 WARN_ON(1);
1651 return; 1766 return;
1652 } 1767 }
1653 id = hlock->class - lock_classes; 1768 id = hlock->class_idx - 1;
1654 if (DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS)) 1769 if (DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS))
1655 return; 1770 return;
1656 1771
@@ -1662,11 +1777,10 @@ static void check_chain_key(struct task_struct *curr)
1662 } 1777 }
1663 if (chain_key != curr->curr_chain_key) { 1778 if (chain_key != curr->curr_chain_key) {
1664 debug_locks_off(); 1779 debug_locks_off();
1665 printk("hm#2, depth: %u [%u], %016Lx != %016Lx\n", 1780 WARN(1, "hm#2, depth: %u [%u], %016Lx != %016Lx\n",
1666 curr->lockdep_depth, i, 1781 curr->lockdep_depth, i,
1667 (unsigned long long)chain_key, 1782 (unsigned long long)chain_key,
1668 (unsigned long long)curr->curr_chain_key); 1783 (unsigned long long)curr->curr_chain_key);
1669 WARN_ON(1);
1670 } 1784 }
1671#endif 1785#endif
1672} 1786}
@@ -1695,7 +1809,7 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this,
1695 print_lock(this); 1809 print_lock(this);
1696 1810
1697 printk("{%s} state was registered at:\n", usage_str[prev_bit]); 1811 printk("{%s} state was registered at:\n", usage_str[prev_bit]);
1698 print_stack_trace(this->class->usage_traces + prev_bit, 1); 1812 print_stack_trace(hlock_class(this)->usage_traces + prev_bit, 1);
1699 1813
1700 print_irqtrace_events(curr); 1814 print_irqtrace_events(curr);
1701 printk("\nother info that might help us debug this:\n"); 1815 printk("\nother info that might help us debug this:\n");
@@ -1714,7 +1828,7 @@ static inline int
1714valid_state(struct task_struct *curr, struct held_lock *this, 1828valid_state(struct task_struct *curr, struct held_lock *this,
1715 enum lock_usage_bit new_bit, enum lock_usage_bit bad_bit) 1829 enum lock_usage_bit new_bit, enum lock_usage_bit bad_bit)
1716{ 1830{
1717 if (unlikely(this->class->usage_mask & (1 << bad_bit))) 1831 if (unlikely(hlock_class(this)->usage_mask & (1 << bad_bit)))
1718 return print_usage_bug(curr, this, bad_bit, new_bit); 1832 return print_usage_bug(curr, this, bad_bit, new_bit);
1719 return 1; 1833 return 1;
1720} 1834}
@@ -1753,7 +1867,7 @@ print_irq_inversion_bug(struct task_struct *curr, struct lock_class *other,
1753 lockdep_print_held_locks(curr); 1867 lockdep_print_held_locks(curr);
1754 1868
1755 printk("\nthe first lock's dependencies:\n"); 1869 printk("\nthe first lock's dependencies:\n");
1756 print_lock_dependencies(this->class, 0); 1870 print_lock_dependencies(hlock_class(this), 0);
1757 1871
1758 printk("\nthe second lock's dependencies:\n"); 1872 printk("\nthe second lock's dependencies:\n");
1759 print_lock_dependencies(other, 0); 1873 print_lock_dependencies(other, 0);
@@ -1776,7 +1890,7 @@ check_usage_forwards(struct task_struct *curr, struct held_lock *this,
1776 1890
1777 find_usage_bit = bit; 1891 find_usage_bit = bit;
1778 /* fills in <forwards_match> */ 1892 /* fills in <forwards_match> */
1779 ret = find_usage_forwards(this->class, 0); 1893 ret = find_usage_forwards(hlock_class(this), 0);
1780 if (!ret || ret == 1) 1894 if (!ret || ret == 1)
1781 return ret; 1895 return ret;
1782 1896
@@ -1795,7 +1909,7 @@ check_usage_backwards(struct task_struct *curr, struct held_lock *this,
1795 1909
1796 find_usage_bit = bit; 1910 find_usage_bit = bit;
1797 /* fills in <backwards_match> */ 1911 /* fills in <backwards_match> */
1798 ret = find_usage_backwards(this->class, 0); 1912 ret = find_usage_backwards(hlock_class(this), 0);
1799 if (!ret || ret == 1) 1913 if (!ret || ret == 1)
1800 return ret; 1914 return ret;
1801 1915
@@ -1861,7 +1975,7 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
1861 LOCK_ENABLED_HARDIRQS_READ, "hard-read")) 1975 LOCK_ENABLED_HARDIRQS_READ, "hard-read"))
1862 return 0; 1976 return 0;
1863#endif 1977#endif
1864 if (hardirq_verbose(this->class)) 1978 if (hardirq_verbose(hlock_class(this)))
1865 ret = 2; 1979 ret = 2;
1866 break; 1980 break;
1867 case LOCK_USED_IN_SOFTIRQ: 1981 case LOCK_USED_IN_SOFTIRQ:
@@ -1886,7 +2000,7 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
1886 LOCK_ENABLED_SOFTIRQS_READ, "soft-read")) 2000 LOCK_ENABLED_SOFTIRQS_READ, "soft-read"))
1887 return 0; 2001 return 0;
1888#endif 2002#endif
1889 if (softirq_verbose(this->class)) 2003 if (softirq_verbose(hlock_class(this)))
1890 ret = 2; 2004 ret = 2;
1891 break; 2005 break;
1892 case LOCK_USED_IN_HARDIRQ_READ: 2006 case LOCK_USED_IN_HARDIRQ_READ:
@@ -1899,7 +2013,7 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
1899 if (!check_usage_forwards(curr, this, 2013 if (!check_usage_forwards(curr, this,
1900 LOCK_ENABLED_HARDIRQS, "hard")) 2014 LOCK_ENABLED_HARDIRQS, "hard"))
1901 return 0; 2015 return 0;
1902 if (hardirq_verbose(this->class)) 2016 if (hardirq_verbose(hlock_class(this)))
1903 ret = 2; 2017 ret = 2;
1904 break; 2018 break;
1905 case LOCK_USED_IN_SOFTIRQ_READ: 2019 case LOCK_USED_IN_SOFTIRQ_READ:
@@ -1912,7 +2026,7 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
1912 if (!check_usage_forwards(curr, this, 2026 if (!check_usage_forwards(curr, this,
1913 LOCK_ENABLED_SOFTIRQS, "soft")) 2027 LOCK_ENABLED_SOFTIRQS, "soft"))
1914 return 0; 2028 return 0;
1915 if (softirq_verbose(this->class)) 2029 if (softirq_verbose(hlock_class(this)))
1916 ret = 2; 2030 ret = 2;
1917 break; 2031 break;
1918 case LOCK_ENABLED_HARDIRQS: 2032 case LOCK_ENABLED_HARDIRQS:
@@ -1938,7 +2052,7 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
1938 LOCK_USED_IN_HARDIRQ_READ, "hard-read")) 2052 LOCK_USED_IN_HARDIRQ_READ, "hard-read"))
1939 return 0; 2053 return 0;
1940#endif 2054#endif
1941 if (hardirq_verbose(this->class)) 2055 if (hardirq_verbose(hlock_class(this)))
1942 ret = 2; 2056 ret = 2;
1943 break; 2057 break;
1944 case LOCK_ENABLED_SOFTIRQS: 2058 case LOCK_ENABLED_SOFTIRQS:
@@ -1964,7 +2078,7 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
1964 LOCK_USED_IN_SOFTIRQ_READ, "soft-read")) 2078 LOCK_USED_IN_SOFTIRQ_READ, "soft-read"))
1965 return 0; 2079 return 0;
1966#endif 2080#endif
1967 if (softirq_verbose(this->class)) 2081 if (softirq_verbose(hlock_class(this)))
1968 ret = 2; 2082 ret = 2;
1969 break; 2083 break;
1970 case LOCK_ENABLED_HARDIRQS_READ: 2084 case LOCK_ENABLED_HARDIRQS_READ:
@@ -1979,7 +2093,7 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
1979 LOCK_USED_IN_HARDIRQ, "hard")) 2093 LOCK_USED_IN_HARDIRQ, "hard"))
1980 return 0; 2094 return 0;
1981#endif 2095#endif
1982 if (hardirq_verbose(this->class)) 2096 if (hardirq_verbose(hlock_class(this)))
1983 ret = 2; 2097 ret = 2;
1984 break; 2098 break;
1985 case LOCK_ENABLED_SOFTIRQS_READ: 2099 case LOCK_ENABLED_SOFTIRQS_READ:
@@ -1994,7 +2108,7 @@ static int mark_lock_irq(struct task_struct *curr, struct held_lock *this,
1994 LOCK_USED_IN_SOFTIRQ, "soft")) 2108 LOCK_USED_IN_SOFTIRQ, "soft"))
1995 return 0; 2109 return 0;
1996#endif 2110#endif
1997 if (softirq_verbose(this->class)) 2111 if (softirq_verbose(hlock_class(this)))
1998 ret = 2; 2112 ret = 2;
1999 break; 2113 break;
2000 default: 2114 default:
@@ -2310,7 +2424,7 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
2310 * If already set then do not dirty the cacheline, 2424 * If already set then do not dirty the cacheline,
2311 * nor do any checks: 2425 * nor do any checks:
2312 */ 2426 */
2313 if (likely(this->class->usage_mask & new_mask)) 2427 if (likely(hlock_class(this)->usage_mask & new_mask))
2314 return 1; 2428 return 1;
2315 2429
2316 if (!graph_lock()) 2430 if (!graph_lock())
@@ -2318,14 +2432,14 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
2318 /* 2432 /*
2319 * Make sure we didnt race: 2433 * Make sure we didnt race:
2320 */ 2434 */
2321 if (unlikely(this->class->usage_mask & new_mask)) { 2435 if (unlikely(hlock_class(this)->usage_mask & new_mask)) {
2322 graph_unlock(); 2436 graph_unlock();
2323 return 1; 2437 return 1;
2324 } 2438 }
2325 2439
2326 this->class->usage_mask |= new_mask; 2440 hlock_class(this)->usage_mask |= new_mask;
2327 2441
2328 if (!save_trace(this->class->usage_traces + new_bit)) 2442 if (!save_trace(hlock_class(this)->usage_traces + new_bit))
2329 return 0; 2443 return 0;
2330 2444
2331 switch (new_bit) { 2445 switch (new_bit) {
@@ -2405,7 +2519,7 @@ EXPORT_SYMBOL_GPL(lockdep_init_map);
2405 */ 2519 */
2406static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, 2520static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
2407 int trylock, int read, int check, int hardirqs_off, 2521 int trylock, int read, int check, int hardirqs_off,
2408 unsigned long ip) 2522 struct lockdep_map *nest_lock, unsigned long ip)
2409{ 2523{
2410 struct task_struct *curr = current; 2524 struct task_struct *curr = current;
2411 struct lock_class *class = NULL; 2525 struct lock_class *class = NULL;
@@ -2459,14 +2573,16 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
2459 return 0; 2573 return 0;
2460 2574
2461 hlock = curr->held_locks + depth; 2575 hlock = curr->held_locks + depth;
2462 2576 if (DEBUG_LOCKS_WARN_ON(!class))
2463 hlock->class = class; 2577 return 0;
2578 hlock->class_idx = class - lock_classes + 1;
2464 hlock->acquire_ip = ip; 2579 hlock->acquire_ip = ip;
2465 hlock->instance = lock; 2580 hlock->instance = lock;
2581 hlock->nest_lock = nest_lock;
2466 hlock->trylock = trylock; 2582 hlock->trylock = trylock;
2467 hlock->read = read; 2583 hlock->read = read;
2468 hlock->check = check; 2584 hlock->check = check;
2469 hlock->hardirqs_off = hardirqs_off; 2585 hlock->hardirqs_off = !!hardirqs_off;
2470#ifdef CONFIG_LOCK_STAT 2586#ifdef CONFIG_LOCK_STAT
2471 hlock->waittime_stamp = 0; 2587 hlock->waittime_stamp = 0;
2472 hlock->holdtime_stamp = sched_clock(); 2588 hlock->holdtime_stamp = sched_clock();
@@ -2574,6 +2690,55 @@ static int check_unlock(struct task_struct *curr, struct lockdep_map *lock,
2574 return 1; 2690 return 1;
2575} 2691}
2576 2692
2693static int
2694__lock_set_subclass(struct lockdep_map *lock,
2695 unsigned int subclass, unsigned long ip)
2696{
2697 struct task_struct *curr = current;
2698 struct held_lock *hlock, *prev_hlock;
2699 struct lock_class *class;
2700 unsigned int depth;
2701 int i;
2702
2703 depth = curr->lockdep_depth;
2704 if (DEBUG_LOCKS_WARN_ON(!depth))
2705 return 0;
2706
2707 prev_hlock = NULL;
2708 for (i = depth-1; i >= 0; i--) {
2709 hlock = curr->held_locks + i;
2710 /*
2711 * We must not cross into another context:
2712 */
2713 if (prev_hlock && prev_hlock->irq_context != hlock->irq_context)
2714 break;
2715 if (hlock->instance == lock)
2716 goto found_it;
2717 prev_hlock = hlock;
2718 }
2719 return print_unlock_inbalance_bug(curr, lock, ip);
2720
2721found_it:
2722 class = register_lock_class(lock, subclass, 0);
2723 hlock->class_idx = class - lock_classes + 1;
2724
2725 curr->lockdep_depth = i;
2726 curr->curr_chain_key = hlock->prev_chain_key;
2727
2728 for (; i < depth; i++) {
2729 hlock = curr->held_locks + i;
2730 if (!__lock_acquire(hlock->instance,
2731 hlock_class(hlock)->subclass, hlock->trylock,
2732 hlock->read, hlock->check, hlock->hardirqs_off,
2733 hlock->nest_lock, hlock->acquire_ip))
2734 return 0;
2735 }
2736
2737 if (DEBUG_LOCKS_WARN_ON(curr->lockdep_depth != depth))
2738 return 0;
2739 return 1;
2740}
2741
2577/* 2742/*
2578 * Remove the lock to the list of currently held locks in a 2743 * Remove the lock to the list of currently held locks in a
2579 * potentially non-nested (out of order) manner. This is a 2744 * potentially non-nested (out of order) manner. This is a
@@ -2624,9 +2789,9 @@ found_it:
2624 for (i++; i < depth; i++) { 2789 for (i++; i < depth; i++) {
2625 hlock = curr->held_locks + i; 2790 hlock = curr->held_locks + i;
2626 if (!__lock_acquire(hlock->instance, 2791 if (!__lock_acquire(hlock->instance,
2627 hlock->class->subclass, hlock->trylock, 2792 hlock_class(hlock)->subclass, hlock->trylock,
2628 hlock->read, hlock->check, hlock->hardirqs_off, 2793 hlock->read, hlock->check, hlock->hardirqs_off,
2629 hlock->acquire_ip)) 2794 hlock->nest_lock, hlock->acquire_ip))
2630 return 0; 2795 return 0;
2631 } 2796 }
2632 2797
@@ -2669,7 +2834,7 @@ static int lock_release_nested(struct task_struct *curr,
2669 2834
2670#ifdef CONFIG_DEBUG_LOCKDEP 2835#ifdef CONFIG_DEBUG_LOCKDEP
2671 hlock->prev_chain_key = 0; 2836 hlock->prev_chain_key = 0;
2672 hlock->class = NULL; 2837 hlock->class_idx = 0;
2673 hlock->acquire_ip = 0; 2838 hlock->acquire_ip = 0;
2674 hlock->irq_context = 0; 2839 hlock->irq_context = 0;
2675#endif 2840#endif
@@ -2738,18 +2903,36 @@ static void check_flags(unsigned long flags)
2738#endif 2903#endif
2739} 2904}
2740 2905
2906void
2907lock_set_subclass(struct lockdep_map *lock,
2908 unsigned int subclass, unsigned long ip)
2909{
2910 unsigned long flags;
2911
2912 if (unlikely(current->lockdep_recursion))
2913 return;
2914
2915 raw_local_irq_save(flags);
2916 current->lockdep_recursion = 1;
2917 check_flags(flags);
2918 if (__lock_set_subclass(lock, subclass, ip))
2919 check_chain_key(current);
2920 current->lockdep_recursion = 0;
2921 raw_local_irq_restore(flags);
2922}
2923
2924EXPORT_SYMBOL_GPL(lock_set_subclass);
2925
2741/* 2926/*
2742 * We are not always called with irqs disabled - do that here, 2927 * We are not always called with irqs disabled - do that here,
2743 * and also avoid lockdep recursion: 2928 * and also avoid lockdep recursion:
2744 */ 2929 */
2745void lock_acquire(struct lockdep_map *lock, unsigned int subclass, 2930void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
2746 int trylock, int read, int check, unsigned long ip) 2931 int trylock, int read, int check,
2932 struct lockdep_map *nest_lock, unsigned long ip)
2747{ 2933{
2748 unsigned long flags; 2934 unsigned long flags;
2749 2935
2750 if (unlikely(!lock_stat && !prove_locking))
2751 return;
2752
2753 if (unlikely(current->lockdep_recursion)) 2936 if (unlikely(current->lockdep_recursion))
2754 return; 2937 return;
2755 2938
@@ -2758,7 +2941,7 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
2758 2941
2759 current->lockdep_recursion = 1; 2942 current->lockdep_recursion = 1;
2760 __lock_acquire(lock, subclass, trylock, read, check, 2943 __lock_acquire(lock, subclass, trylock, read, check,
2761 irqs_disabled_flags(flags), ip); 2944 irqs_disabled_flags(flags), nest_lock, ip);
2762 current->lockdep_recursion = 0; 2945 current->lockdep_recursion = 0;
2763 raw_local_irq_restore(flags); 2946 raw_local_irq_restore(flags);
2764} 2947}
@@ -2770,9 +2953,6 @@ void lock_release(struct lockdep_map *lock, int nested,
2770{ 2953{
2771 unsigned long flags; 2954 unsigned long flags;
2772 2955
2773 if (unlikely(!lock_stat && !prove_locking))
2774 return;
2775
2776 if (unlikely(current->lockdep_recursion)) 2956 if (unlikely(current->lockdep_recursion))
2777 return; 2957 return;
2778 2958
@@ -2845,11 +3025,11 @@ __lock_contended(struct lockdep_map *lock, unsigned long ip)
2845found_it: 3025found_it:
2846 hlock->waittime_stamp = sched_clock(); 3026 hlock->waittime_stamp = sched_clock();
2847 3027
2848 point = lock_contention_point(hlock->class, ip); 3028 point = lock_contention_point(hlock_class(hlock), ip);
2849 3029
2850 stats = get_lock_stats(hlock->class); 3030 stats = get_lock_stats(hlock_class(hlock));
2851 if (point < ARRAY_SIZE(stats->contention_point)) 3031 if (point < ARRAY_SIZE(stats->contention_point))
2852 stats->contention_point[i]++; 3032 stats->contention_point[point]++;
2853 if (lock->cpu != smp_processor_id()) 3033 if (lock->cpu != smp_processor_id())
2854 stats->bounces[bounce_contended + !!hlock->read]++; 3034 stats->bounces[bounce_contended + !!hlock->read]++;
2855 put_lock_stats(stats); 3035 put_lock_stats(stats);
@@ -2893,7 +3073,7 @@ found_it:
2893 hlock->holdtime_stamp = now; 3073 hlock->holdtime_stamp = now;
2894 } 3074 }
2895 3075
2896 stats = get_lock_stats(hlock->class); 3076 stats = get_lock_stats(hlock_class(hlock));
2897 if (waittime) { 3077 if (waittime) {
2898 if (hlock->read) 3078 if (hlock->read)
2899 lock_time_inc(&stats->read_waittime, waittime); 3079 lock_time_inc(&stats->read_waittime, waittime);
@@ -2988,6 +3168,7 @@ static void zap_class(struct lock_class *class)
2988 list_del_rcu(&class->hash_entry); 3168 list_del_rcu(&class->hash_entry);
2989 list_del_rcu(&class->lock_entry); 3169 list_del_rcu(&class->lock_entry);
2990 3170
3171 class->key = NULL;
2991} 3172}
2992 3173
2993static inline int within(const void *addr, void *start, unsigned long size) 3174static inline int within(const void *addr, void *start, unsigned long size)
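
Two mechanical themes run through the lockdep.c changes above: held_lock now records a class index (hlock_class() recovers the pointer and lets zap_class() invalidate stale entries by clearing class->key), and the dependency-graph walkers deduplicate revisited classes with a global generation counter (lockdep_dependency_visit()) rather than clearing per-node flags between traversals. The following standalone sketch shows that generation-counter trick on a toy graph; all names are illustrative.

/* Standalone sketch: generation-counter "visited" marking for graph walks. */
#include <stdio.h>

#define MAX_NODES 8

struct node {
        int nr_edges;
        int edges[MAX_NODES];
        unsigned int dep_gen_id;    /* last generation that visited this node */
};

static struct node graph[MAX_NODES];
static unsigned int dependency_gen_id;

/* Returns 1 if the node was already seen in the current traversal. */
static int dependency_visit(struct node *n, unsigned int depth)
{
        if (!depth)
                dependency_gen_id++;        /* new traversal: new generation */
        if (n->dep_gen_id == dependency_gen_id)
                return 1;
        n->dep_gen_id = dependency_gen_id;
        return 0;
}

/* Count nodes reachable from n, never counting a node twice per walk. */
static unsigned long count_deps(struct node *n, unsigned int depth)
{
        unsigned long ret = 1;
        int i;

        if (dependency_visit(n, depth))
                return 0;
        for (i = 0; i < n->nr_edges; i++)
                ret += count_deps(&graph[n->edges[i]], depth + 1);
        return ret;
}

int main(void)
{
        /* Diamond 0 -> {1, 2}, 1 -> 2: node 2 must be counted only once. */
        graph[0].nr_edges = 2; graph[0].edges[0] = 1; graph[0].edges[1] = 2;
        graph[1].nr_edges = 1; graph[1].edges[0] = 2;

        printf("forward deps of node 0: %lu\n", count_deps(&graph[0], 0));
        printf("second walk, no flag clearing needed: %lu\n",
               count_deps(&graph[0], 0));
        return 0;
}
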
diff --git a/kernel/lockdep_internals.h b/kernel/lockdep_internals.h
index c3600a091a28..56b196932c08 100644
--- a/kernel/lockdep_internals.h
+++ b/kernel/lockdep_internals.h
@@ -17,9 +17,6 @@
17 */ 17 */
18#define MAX_LOCKDEP_ENTRIES 8192UL 18#define MAX_LOCKDEP_ENTRIES 8192UL
19 19
20#define MAX_LOCKDEP_KEYS_BITS 11
21#define MAX_LOCKDEP_KEYS (1UL << MAX_LOCKDEP_KEYS_BITS)
22
23#define MAX_LOCKDEP_CHAINS_BITS 14 20#define MAX_LOCKDEP_CHAINS_BITS 14
24#define MAX_LOCKDEP_CHAINS (1UL << MAX_LOCKDEP_CHAINS_BITS) 21#define MAX_LOCKDEP_CHAINS (1UL << MAX_LOCKDEP_CHAINS_BITS)
25 22
@@ -53,6 +50,22 @@ extern unsigned int nr_process_chains;
53extern unsigned int max_lockdep_depth; 50extern unsigned int max_lockdep_depth;
54extern unsigned int max_recursion_depth; 51extern unsigned int max_recursion_depth;
55 52
53#ifdef CONFIG_PROVE_LOCKING
54extern unsigned long lockdep_count_forward_deps(struct lock_class *);
55extern unsigned long lockdep_count_backward_deps(struct lock_class *);
56#else
57static inline unsigned long
58lockdep_count_forward_deps(struct lock_class *class)
59{
60 return 0;
61}
62static inline unsigned long
63lockdep_count_backward_deps(struct lock_class *class)
64{
65 return 0;
66}
67#endif
68
56#ifdef CONFIG_DEBUG_LOCKDEP 69#ifdef CONFIG_DEBUG_LOCKDEP
57/* 70/*
58 * Various lockdep statistics: 71 * Various lockdep statistics:
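
The lockdep_internals.h hunk declares the new dependency counters and, when CONFIG_PROVE_LOCKING is off, supplies static-inline stubs that return 0, the usual idiom for compiling a feature out without forcing every caller to grow its own #ifdef. A tiny standalone sketch of that idiom, with a made-up CONFIG_FEATURE_DEPS macro and illustrative names:

/* Sketch: compile a feature out to static-inline stubs, not caller #ifdefs. */
#include <stdio.h>

/* #define CONFIG_FEATURE_DEPS 1 */    /* toggle to build the real version */

#ifdef CONFIG_FEATURE_DEPS
extern unsigned long count_forward_deps(int class_id);
#else
static inline unsigned long count_forward_deps(int class_id)
{
        (void)class_id;
        return 0;       /* feature compiled out: report zero dependencies */
}
#endif

int main(void)
{
        /* The caller stays #ifdef-free either way. */
        printf("FD:%5lu\n", count_forward_deps(42));
        return 0;
}
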
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index 9b0e940e2545..20dbcbf9c7dd 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -63,34 +63,6 @@ static void l_stop(struct seq_file *m, void *v)
63{ 63{
64} 64}
65 65
66static unsigned long count_forward_deps(struct lock_class *class)
67{
68 struct lock_list *entry;
69 unsigned long ret = 1;
70
71 /*
72 * Recurse this class's dependency list:
73 */
74 list_for_each_entry(entry, &class->locks_after, entry)
75 ret += count_forward_deps(entry->class);
76
77 return ret;
78}
79
80static unsigned long count_backward_deps(struct lock_class *class)
81{
82 struct lock_list *entry;
83 unsigned long ret = 1;
84
85 /*
86 * Recurse this class's dependency list:
87 */
88 list_for_each_entry(entry, &class->locks_before, entry)
89 ret += count_backward_deps(entry->class);
90
91 return ret;
92}
93
94static void print_name(struct seq_file *m, struct lock_class *class) 66static void print_name(struct seq_file *m, struct lock_class *class)
95{ 67{
96 char str[128]; 68 char str[128];
@@ -110,7 +82,6 @@ static void print_name(struct seq_file *m, struct lock_class *class)
110 82
111static int l_show(struct seq_file *m, void *v) 83static int l_show(struct seq_file *m, void *v)
112{ 84{
113 unsigned long nr_forward_deps, nr_backward_deps;
114 struct lock_class *class = v; 85 struct lock_class *class = v;
115 struct lock_list *entry; 86 struct lock_list *entry;
116 char c1, c2, c3, c4; 87 char c1, c2, c3, c4;
@@ -124,11 +95,10 @@ static int l_show(struct seq_file *m, void *v)
124#ifdef CONFIG_DEBUG_LOCKDEP 95#ifdef CONFIG_DEBUG_LOCKDEP
125 seq_printf(m, " OPS:%8ld", class->ops); 96 seq_printf(m, " OPS:%8ld", class->ops);
126#endif 97#endif
127 nr_forward_deps = count_forward_deps(class); 98#ifdef CONFIG_PROVE_LOCKING
128 seq_printf(m, " FD:%5ld", nr_forward_deps); 99 seq_printf(m, " FD:%5ld", lockdep_count_forward_deps(class));
129 100 seq_printf(m, " BD:%5ld", lockdep_count_backward_deps(class));
130 nr_backward_deps = count_backward_deps(class); 101#endif
131 seq_printf(m, " BD:%5ld", nr_backward_deps);
132 102
133 get_usage_chars(class, &c1, &c2, &c3, &c4); 103 get_usage_chars(class, &c1, &c2, &c3, &c4);
134 seq_printf(m, " %c%c%c%c", c1, c2, c3, c4); 104 seq_printf(m, " %c%c%c%c", c1, c2, c3, c4);
@@ -229,6 +199,9 @@ static int lc_show(struct seq_file *m, void *v)
229 199
230 for (i = 0; i < chain->depth; i++) { 200 for (i = 0; i < chain->depth; i++) {
231 class = lock_chain_get_class(chain, i); 201 class = lock_chain_get_class(chain, i);
202 if (!class->key)
203 continue;
204
232 seq_printf(m, "[%p] ", class->key); 205 seq_printf(m, "[%p] ", class->key);
233 print_name(m, class); 206 print_name(m, class);
234 seq_puts(m, "\n"); 207 seq_puts(m, "\n");
@@ -350,7 +323,9 @@ static int lockdep_stats_show(struct seq_file *m, void *v)
350 if (class->usage_mask & LOCKF_ENABLED_HARDIRQS_READ) 323 if (class->usage_mask & LOCKF_ENABLED_HARDIRQS_READ)
351 nr_hardirq_read_unsafe++; 324 nr_hardirq_read_unsafe++;
352 325
353 sum_forward_deps += count_forward_deps(class); 326#ifdef CONFIG_PROVE_LOCKING
327 sum_forward_deps += lockdep_count_forward_deps(class);
328#endif
354 } 329 }
355#ifdef CONFIG_DEBUG_LOCKDEP 330#ifdef CONFIG_DEBUG_LOCKDEP
356 DEBUG_LOCKS_WARN_ON(debug_atomic_read(&nr_unused_locks) != nr_unused); 331 DEBUG_LOCKS_WARN_ON(debug_atomic_read(&nr_unused_locks) != nr_unused);
@@ -497,8 +472,9 @@ static void snprint_time(char *buf, size_t bufsiz, s64 nr)
497{ 472{
498 unsigned long rem; 473 unsigned long rem;
499 474
475 nr += 5; /* for display rounding */
500 rem = do_div(nr, 1000); /* XXX: do_div_signed */ 476 rem = do_div(nr, 1000); /* XXX: do_div_signed */
501 snprintf(buf, bufsiz, "%lld.%02d", (long long)nr, ((int)rem+5)/10); 477 snprintf(buf, bufsiz, "%lld.%02d", (long long)nr, (int)rem/10);
502} 478}
503 479
504static void seq_time(struct seq_file *m, s64 time) 480static void seq_time(struct seq_file *m, s64 time)
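
Beyond delegating the dependency counts to the new lockdep helpers and skipping zapped chain classes (class->key == NULL), the lockdep_proc.c hunk fixes snprint_time(): rounding now adds half a display unit to nr before do_div(), so a remainder such as 996 carries into the integer part instead of printing as a three-digit fraction. The quick standalone check below compares the two formulas; the test value 1996 is just a convenient input.

/* Standalone check of the snprint_time() rounding fix. */
#include <stdio.h>

static void old_fmt(long long nr, char *buf, int len)
{
        int rem = nr % 1000;

        nr /= 1000;
        /* (rem + 5) / 10 can reach 100, producing output like "1.100" */
        snprintf(buf, len, "%lld.%02d", nr, (rem + 5) / 10);
}

static void new_fmt(long long nr, char *buf, int len)
{
        int rem;

        nr += 5;                /* add half a hundredth up front ... */
        rem = nr % 1000;
        nr /= 1000;             /* ... so the carry lands in the integer part */
        snprintf(buf, len, "%lld.%02d", nr, rem / 10);
}

int main(void)
{
        char a[32], b[32];

        old_fmt(1996, a, sizeof(a));    /* 1.996 rounded to two places */
        new_fmt(1996, b, sizeof(b));
        printf("old: %s   new: %s\n", a, b);    /* old: 1.100   new: 2.00 */
        return 0;
}
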
diff --git a/kernel/marker.c b/kernel/marker.c
index 1abfb923b761..e9c6b2bc9400 100644
--- a/kernel/marker.c
+++ b/kernel/marker.c
@@ -62,7 +62,7 @@ struct marker_entry {
62 int refcount; /* Number of times armed. 0 if disarmed. */ 62 int refcount; /* Number of times armed. 0 if disarmed. */
63 struct rcu_head rcu; 63 struct rcu_head rcu;
64 void *oldptr; 64 void *oldptr;
65 unsigned char rcu_pending:1; 65 int rcu_pending;
66 unsigned char ptype:1; 66 unsigned char ptype:1;
67 char name[0]; /* Contains name'\0'format'\0' */ 67 char name[0]; /* Contains name'\0'format'\0' */
68}; 68};
@@ -103,11 +103,11 @@ void marker_probe_cb(const struct marker *mdata, void *call_private, ...)
103 char ptype; 103 char ptype;
104 104
105 /* 105 /*
106 * preempt_disable does two things : disabling preemption to make sure 106 * rcu_read_lock_sched does two things : disabling preemption to make
107 * the teardown of the callbacks can be done correctly when they are in 107 * sure the teardown of the callbacks can be done correctly when they
108 * modules and they insure RCU read coherency. 108 * are in modules and they insure RCU read coherency.
109 */ 109 */
110 preempt_disable(); 110 rcu_read_lock_sched();
111 ptype = mdata->ptype; 111 ptype = mdata->ptype;
112 if (likely(!ptype)) { 112 if (likely(!ptype)) {
113 marker_probe_func *func; 113 marker_probe_func *func;
@@ -126,6 +126,11 @@ void marker_probe_cb(const struct marker *mdata, void *call_private, ...)
126 struct marker_probe_closure *multi; 126 struct marker_probe_closure *multi;
127 int i; 127 int i;
128 /* 128 /*
129 * Read mdata->ptype before mdata->multi.
130 */
131 smp_rmb();
132 multi = mdata->multi;
133 /*
129 * multi points to an array, therefore accessing the array 134 * multi points to an array, therefore accessing the array
130 * depends on reading multi. However, even in this case, 135 * depends on reading multi. However, even in this case,
131 * we must insure that the pointer is read _before_ the array 136 * we must insure that the pointer is read _before_ the array
@@ -133,7 +138,6 @@ void marker_probe_cb(const struct marker *mdata, void *call_private, ...)
133 * in the fast path, so put the explicit barrier here. 138 * in the fast path, so put the explicit barrier here.
134 */ 139 */
135 smp_read_barrier_depends(); 140 smp_read_barrier_depends();
136 multi = mdata->multi;
137 for (i = 0; multi[i].func; i++) { 141 for (i = 0; multi[i].func; i++) {
138 va_start(args, call_private); 142 va_start(args, call_private);
139 multi[i].func(multi[i].probe_private, call_private, 143 multi[i].func(multi[i].probe_private, call_private,
@@ -141,7 +145,7 @@ void marker_probe_cb(const struct marker *mdata, void *call_private, ...)
141 va_end(args); 145 va_end(args);
142 } 146 }
143 } 147 }
144 preempt_enable(); 148 rcu_read_unlock_sched();
145} 149}
146EXPORT_SYMBOL_GPL(marker_probe_cb); 150EXPORT_SYMBOL_GPL(marker_probe_cb);
147 151
@@ -158,7 +162,7 @@ void marker_probe_cb_noarg(const struct marker *mdata, void *call_private, ...)
158 va_list args; /* not initialized */ 162 va_list args; /* not initialized */
159 char ptype; 163 char ptype;
160 164
161 preempt_disable(); 165 rcu_read_lock_sched();
162 ptype = mdata->ptype; 166 ptype = mdata->ptype;
163 if (likely(!ptype)) { 167 if (likely(!ptype)) {
164 marker_probe_func *func; 168 marker_probe_func *func;
@@ -175,6 +179,11 @@ void marker_probe_cb_noarg(const struct marker *mdata, void *call_private, ...)
175 struct marker_probe_closure *multi; 179 struct marker_probe_closure *multi;
176 int i; 180 int i;
177 /* 181 /*
182 * Read mdata->ptype before mdata->multi.
183 */
184 smp_rmb();
185 multi = mdata->multi;
186 /*
178 * multi points to an array, therefore accessing the array 187 * multi points to an array, therefore accessing the array
179 * depends on reading multi. However, even in this case, 188 * depends on reading multi. However, even in this case,
180 * we must insure that the pointer is read _before_ the array 189 * we must insure that the pointer is read _before_ the array
@@ -182,12 +191,11 @@ void marker_probe_cb_noarg(const struct marker *mdata, void *call_private, ...)
182 * in the fast path, so put the explicit barrier here. 191 * in the fast path, so put the explicit barrier here.
183 */ 192 */
184 smp_read_barrier_depends(); 193 smp_read_barrier_depends();
185 multi = mdata->multi;
186 for (i = 0; multi[i].func; i++) 194 for (i = 0; multi[i].func; i++)
187 multi[i].func(multi[i].probe_private, call_private, 195 multi[i].func(multi[i].probe_private, call_private,
188 mdata->format, &args); 196 mdata->format, &args);
189 } 197 }
190 preempt_enable(); 198 rcu_read_unlock_sched();
191} 199}
192EXPORT_SYMBOL_GPL(marker_probe_cb_noarg); 200EXPORT_SYMBOL_GPL(marker_probe_cb_noarg);
193 201
@@ -441,7 +449,7 @@ static int remove_marker(const char *name)
441 hlist_del(&e->hlist); 449 hlist_del(&e->hlist);
442 /* Make sure the call_rcu has been executed */ 450 /* Make sure the call_rcu has been executed */
443 if (e->rcu_pending) 451 if (e->rcu_pending)
444 rcu_barrier(); 452 rcu_barrier_sched();
445 kfree(e); 453 kfree(e);
446 return 0; 454 return 0;
447} 455}
@@ -476,7 +484,7 @@ static int marker_set_format(struct marker_entry **entry, const char *format)
476 hlist_del(&(*entry)->hlist); 484 hlist_del(&(*entry)->hlist);
477 /* Make sure the call_rcu has been executed */ 485 /* Make sure the call_rcu has been executed */
478 if ((*entry)->rcu_pending) 486 if ((*entry)->rcu_pending)
479 rcu_barrier(); 487 rcu_barrier_sched();
480 kfree(*entry); 488 kfree(*entry);
481 *entry = e; 489 *entry = e;
482 trace_mark(core_marker_format, "name %s format %s", 490 trace_mark(core_marker_format, "name %s format %s",
@@ -552,7 +560,7 @@ static int set_marker(struct marker_entry **entry, struct marker *elem,
552 * Disable a marker and its probe callback. 560 * Disable a marker and its probe callback.
553 * Note: only waiting an RCU period after setting elem->call to the empty 561 * Note: only waiting an RCU period after setting elem->call to the empty
554 * function insures that the original callback is not used anymore. This insured 562 * function insures that the original callback is not used anymore. This insured
555 * by preempt_disable around the call site. 563 * by rcu_read_lock_sched around the call site.
556 */ 564 */
557static void disable_marker(struct marker *elem) 565static void disable_marker(struct marker *elem)
558{ 566{
@@ -645,17 +653,23 @@ int marker_probe_register(const char *name, const char *format,
645 entry = get_marker(name); 653 entry = get_marker(name);
646 if (!entry) { 654 if (!entry) {
647 entry = add_marker(name, format); 655 entry = add_marker(name, format);
648 if (IS_ERR(entry)) { 656 if (IS_ERR(entry))
649 ret = PTR_ERR(entry); 657 ret = PTR_ERR(entry);
650 goto end; 658 } else if (format) {
651 } 659 if (!entry->format)
660 ret = marker_set_format(&entry, format);
661 else if (strcmp(entry->format, format))
662 ret = -EPERM;
652 } 663 }
664 if (ret)
665 goto end;
666
653 /* 667 /*
654 * If we detect that a call_rcu is pending for this marker, 668 * If we detect that a call_rcu is pending for this marker,
655 * make sure it's executed now. 669 * make sure it's executed now.
656 */ 670 */
657 if (entry->rcu_pending) 671 if (entry->rcu_pending)
658 rcu_barrier(); 672 rcu_barrier_sched();
659 old = marker_entry_add_probe(entry, probe, probe_private); 673 old = marker_entry_add_probe(entry, probe, probe_private);
660 if (IS_ERR(old)) { 674 if (IS_ERR(old)) {
661 ret = PTR_ERR(old); 675 ret = PTR_ERR(old);
@@ -666,14 +680,13 @@ int marker_probe_register(const char *name, const char *format,
666 mutex_lock(&markers_mutex); 680 mutex_lock(&markers_mutex);
667 entry = get_marker(name); 681 entry = get_marker(name);
668 WARN_ON(!entry); 682 WARN_ON(!entry);
683 if (entry->rcu_pending)
684 rcu_barrier_sched();
669 entry->oldptr = old; 685 entry->oldptr = old;
670 entry->rcu_pending = 1; 686 entry->rcu_pending = 1;
671 /* write rcu_pending before calling the RCU callback */ 687 /* write rcu_pending before calling the RCU callback */
672 smp_wmb(); 688 smp_wmb();
673#ifdef CONFIG_PREEMPT_RCU 689 call_rcu_sched(&entry->rcu, free_old_closure);
674 synchronize_sched(); /* Until we have the call_rcu_sched() */
675#endif
676 call_rcu(&entry->rcu, free_old_closure);
677end: 690end:
678 mutex_unlock(&markers_mutex); 691 mutex_unlock(&markers_mutex);
679 return ret; 692 return ret;
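The update side pairs with that read side: the displaced closure is queued with call_rcu_sched() instead of call_rcu() (so the CONFIG_PREEMPT_RCU synchronize_sched() workaround can go), and rcu_barrier_sched() is issued whenever entry->rcu might still be in flight before the entry is reused. A condensed sketch of that retire/free pattern, with a pared-down entry type in place of struct marker_entry:

#include <linux/kernel.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct entry {
	void *oldptr;
	int rcu_pending;
	struct rcu_head rcu;
};

static void free_old(struct rcu_head *head)
{
	struct entry *e = container_of(head, struct entry, rcu);

	kfree(e->oldptr);
	e->rcu_pending = 0;
}

static void retire_old_closure(struct entry *e, void *old)
{
	if (e->rcu_pending)		/* an earlier callback may still be queued */
		rcu_barrier_sched();	/* wait for it before reusing e->rcu */
	e->oldptr = old;
	e->rcu_pending = 1;
	smp_wmb();			/* write rcu_pending before the callback can run */
	call_rcu_sched(&e->rcu, free_old);
}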
@@ -704,7 +717,7 @@ int marker_probe_unregister(const char *name,
704 if (!entry) 717 if (!entry)
705 goto end; 718 goto end;
706 if (entry->rcu_pending) 719 if (entry->rcu_pending)
707 rcu_barrier(); 720 rcu_barrier_sched();
708 old = marker_entry_remove_probe(entry, probe, probe_private); 721 old = marker_entry_remove_probe(entry, probe, probe_private);
709 mutex_unlock(&markers_mutex); 722 mutex_unlock(&markers_mutex);
710 marker_update_probes(); /* may update entry */ 723 marker_update_probes(); /* may update entry */
@@ -712,14 +725,13 @@ int marker_probe_unregister(const char *name,
712 entry = get_marker(name); 725 entry = get_marker(name);
713 if (!entry) 726 if (!entry)
714 goto end; 727 goto end;
728 if (entry->rcu_pending)
729 rcu_barrier_sched();
715 entry->oldptr = old; 730 entry->oldptr = old;
716 entry->rcu_pending = 1; 731 entry->rcu_pending = 1;
717 /* write rcu_pending before calling the RCU callback */ 732 /* write rcu_pending before calling the RCU callback */
718 smp_wmb(); 733 smp_wmb();
719#ifdef CONFIG_PREEMPT_RCU 734 call_rcu_sched(&entry->rcu, free_old_closure);
720 synchronize_sched(); /* Until we have the call_rcu_sched() */
721#endif
722 call_rcu(&entry->rcu, free_old_closure);
723 remove_marker(name); /* Ignore busy error message */ 735 remove_marker(name); /* Ignore busy error message */
724 ret = 0; 736 ret = 0;
725end: 737end:
@@ -786,21 +798,20 @@ int marker_probe_unregister_private_data(marker_probe_func *probe,
786 goto end; 798 goto end;
787 } 799 }
788 if (entry->rcu_pending) 800 if (entry->rcu_pending)
789 rcu_barrier(); 801 rcu_barrier_sched();
790 old = marker_entry_remove_probe(entry, NULL, probe_private); 802 old = marker_entry_remove_probe(entry, NULL, probe_private);
791 mutex_unlock(&markers_mutex); 803 mutex_unlock(&markers_mutex);
792 marker_update_probes(); /* may update entry */ 804 marker_update_probes(); /* may update entry */
793 mutex_lock(&markers_mutex); 805 mutex_lock(&markers_mutex);
794 entry = get_marker_from_private_data(probe, probe_private); 806 entry = get_marker_from_private_data(probe, probe_private);
795 WARN_ON(!entry); 807 WARN_ON(!entry);
808 if (entry->rcu_pending)
809 rcu_barrier_sched();
796 entry->oldptr = old; 810 entry->oldptr = old;
797 entry->rcu_pending = 1; 811 entry->rcu_pending = 1;
798 /* write rcu_pending before calling the RCU callback */ 812 /* write rcu_pending before calling the RCU callback */
799 smp_wmb(); 813 smp_wmb();
800#ifdef CONFIG_PREEMPT_RCU 814 call_rcu_sched(&entry->rcu, free_old_closure);
801 synchronize_sched(); /* Until we have the call_rcu_sched() */
802#endif
803 call_rcu(&entry->rcu, free_old_closure);
804 remove_marker(entry->name); /* Ignore busy error message */ 815 remove_marker(entry->name); /* Ignore busy error message */
805end: 816end:
806 mutex_unlock(&markers_mutex); 817 mutex_unlock(&markers_mutex);
diff --git a/kernel/module.c b/kernel/module.c
index 5f80478b746d..1f4cc00e0c20 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -20,11 +20,13 @@
20#include <linux/moduleloader.h> 20#include <linux/moduleloader.h>
21#include <linux/init.h> 21#include <linux/init.h>
22#include <linux/kallsyms.h> 22#include <linux/kallsyms.h>
23#include <linux/fs.h>
23#include <linux/sysfs.h> 24#include <linux/sysfs.h>
24#include <linux/kernel.h> 25#include <linux/kernel.h>
25#include <linux/slab.h> 26#include <linux/slab.h>
26#include <linux/vmalloc.h> 27#include <linux/vmalloc.h>
27#include <linux/elf.h> 28#include <linux/elf.h>
29#include <linux/proc_fs.h>
28#include <linux/seq_file.h> 30#include <linux/seq_file.h>
29#include <linux/syscalls.h> 31#include <linux/syscalls.h>
30#include <linux/fcntl.h> 32#include <linux/fcntl.h>
@@ -42,10 +44,13 @@
42#include <linux/string.h> 44#include <linux/string.h>
43#include <linux/mutex.h> 45#include <linux/mutex.h>
44#include <linux/unwind.h> 46#include <linux/unwind.h>
47#include <linux/rculist.h>
45#include <asm/uaccess.h> 48#include <asm/uaccess.h>
46#include <asm/cacheflush.h> 49#include <asm/cacheflush.h>
47#include <linux/license.h> 50#include <linux/license.h>
48#include <asm/sections.h> 51#include <asm/sections.h>
52#include <linux/tracepoint.h>
53#include <linux/ftrace.h>
49 54
50#if 0 55#if 0
51#define DEBUGP printk 56#define DEBUGP printk
@@ -61,7 +66,7 @@
61#define INIT_OFFSET_MASK (1UL << (BITS_PER_LONG-1)) 66#define INIT_OFFSET_MASK (1UL << (BITS_PER_LONG-1))
62 67
63/* List of modules, protected by module_mutex or preempt_disable 68/* List of modules, protected by module_mutex or preempt_disable
64 * (add/delete uses stop_machine). */ 69 * (delete uses stop_machine/add uses RCU list operations). */
65static DEFINE_MUTEX(module_mutex); 70static DEFINE_MUTEX(module_mutex);
66static LIST_HEAD(modules); 71static LIST_HEAD(modules);
67 72
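The reworded comment is the locking rule the rest of this patch builds on: the module list is written only under module_mutex (adds now via RCU list ops, deletes still via stop_machine), and read either under the mutex or inside a preempt-disabled / sched-RCU section with list_for_each_entry_rcu(), which is what the later lookup hunks switch to. A small reader-side sketch with a stub entry type standing in for struct module:

#include <linux/list.h>
#include <linux/rculist.h>
#include <linux/rcupdate.h>
#include <linux/string.h>

struct mod_stub {
	struct list_head list;
	const char *name;
};

static LIST_HEAD(mod_list);	/* writers serialize on a mutex, see the writer sketch below */

static int mod_name_in_use(const char *name)
{
	struct mod_stub *m;
	int found = 0;

	rcu_read_lock_sched();	/* the delete side waits this section out */
	list_for_each_entry_rcu(m, &mod_list, list) {
		if (strcmp(m->name, name) == 0) {
			found = 1;
			break;
		}
	}
	rcu_read_unlock_sched();
	return found;
}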
@@ -70,6 +75,9 @@ static DECLARE_WAIT_QUEUE_HEAD(module_wq);
70 75
71static BLOCKING_NOTIFIER_HEAD(module_notify_list); 76static BLOCKING_NOTIFIER_HEAD(module_notify_list);
72 77
78/* Bounds of module allocation, for speeding __module_text_address */
79static unsigned long module_addr_min = -1UL, module_addr_max = 0;
80
73int register_module_notifier(struct notifier_block * nb) 81int register_module_notifier(struct notifier_block * nb)
74{ 82{
75 return blocking_notifier_chain_register(&module_notify_list, nb); 83 return blocking_notifier_chain_register(&module_notify_list, nb);
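module_addr_min/module_addr_max cache the range of addresses ever handed out by module_alloc() (kept current by module_alloc_update_bounds() further down), so __module_text_address() can reject most addresses with two compares before walking the module list at all, which helps on paths like backtrace generation. The caching idea in isolation, with vmalloc() standing in for module_alloc():

#include <linux/vmalloc.h>

static unsigned long addr_min = -1UL, addr_max;

static void *alloc_and_track(unsigned long size)
{
	void *ret = vmalloc(size);	/* stand-in for module_alloc() */

	if (ret) {
		if ((unsigned long)ret < addr_min)
			addr_min = (unsigned long)ret;
		if ((unsigned long)ret + size > addr_max)
			addr_max = (unsigned long)ret + size;
	}
	return ret;
}

static int addr_possibly_tracked(unsigned long addr)
{
	return addr >= addr_min && addr <= addr_max;	/* cheap pre-filter before any list walk */
}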
@@ -97,7 +105,7 @@ static inline int strong_try_module_get(struct module *mod)
97static inline void add_taint_module(struct module *mod, unsigned flag) 105static inline void add_taint_module(struct module *mod, unsigned flag)
98{ 106{
99 add_taint(flag); 107 add_taint(flag);
100 mod->taints |= flag; 108 mod->taints |= (1U << flag);
101} 109}
102 110
103/* 111/*
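add_taint_module() now shifts: mod->taints is kept as a mask indexed by the TAINT_* bit numbers (matching the tainted_mask rework in the kernel/panic.c part of this patch), so every later test reads mod->taints & (1 << TAINT_xxx) and the global state is queried through test_taint(). A short illustration of the convention, assuming TAINT_* are plain bit numbers as that rework defines them:

#include <linux/kernel.h>	/* TAINT_* bit numbers, add_taint(), test_taint() */

static unsigned int mod_taints;	/* mirrors struct module::taints */

static void mark_module_forced(void)
{
	add_taint(TAINT_FORCED_MODULE);			/* global mask takes the bit number */
	mod_taints |= 1U << TAINT_FORCED_MODULE;	/* per-module mask stores shifted bits */
}

static int module_was_forced(void)
{
	return mod_taints & (1U << TAINT_FORCED_MODULE);
}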
@@ -127,6 +135,29 @@ static unsigned int find_sec(Elf_Ehdr *hdr,
127 return 0; 135 return 0;
128} 136}
129 137
138/* Find a module section, or NULL. */
139static void *section_addr(Elf_Ehdr *hdr, Elf_Shdr *shdrs,
140 const char *secstrings, const char *name)
141{
142 /* Section 0 has sh_addr 0. */
143 return (void *)shdrs[find_sec(hdr, shdrs, secstrings, name)].sh_addr;
144}
145
146/* Find a module section, or NULL. Fill in number of "objects" in section. */
147static void *section_objs(Elf_Ehdr *hdr,
148 Elf_Shdr *sechdrs,
149 const char *secstrings,
150 const char *name,
151 size_t object_size,
152 unsigned int *num)
153{
154 unsigned int sec = find_sec(hdr, sechdrs, secstrings, name);
155
156 /* Section 0 has sh_addr 0 and sh_size 0. */
157 *num = sechdrs[sec].sh_size / object_size;
158 return (void *)sechdrs[sec].sh_addr;
159}
160
130/* Provided by the linker */ 161/* Provided by the linker */
131extern const struct kernel_symbol __start___ksymtab[]; 162extern const struct kernel_symbol __start___ksymtab[];
132extern const struct kernel_symbol __stop___ksymtab[]; 163extern const struct kernel_symbol __stop___ksymtab[];
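section_addr()/section_objs() exploit the ELF rule that section index 0 (SHN_UNDEF) has sh_addr == 0 and sh_size == 0: find_sec() returns 0 for a missing section, so callers get a NULL pointer and a zero count for free, and the long list of "optional section" index variables in load_module() can go away. A toy, self-contained version of the same trick (simplified: no SHF_ALLOC filtering, names hypothetical):

#include <linux/module.h>	/* Elf_Shdr via asm/module.h */
#include <linux/string.h>
#include <linux/types.h>

static unsigned int toy_find_sec(const Elf_Shdr *shdrs, unsigned int nsec,
				 const char *strtab, const char *name)
{
	unsigned int i;

	for (i = 1; i < nsec; i++)	/* deliberately skip section 0 */
		if (strcmp(strtab + shdrs[i].sh_name, name) == 0)
			return i;
	return 0;			/* "missing" maps onto the zero section */
}

static void *toy_section_objs(const Elf_Shdr *shdrs, unsigned int nsec,
			      const char *strtab, const char *name,
			      size_t objsize, unsigned int *num)
{
	unsigned int sec = toy_find_sec(shdrs, nsec, strtab, name);

	*num = shdrs[sec].sh_size / objsize;	/* 0 when the section is absent */
	return (void *)shdrs[sec].sh_addr;	/* NULL when the section is absent */
}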
@@ -134,17 +165,19 @@ extern const struct kernel_symbol __start___ksymtab_gpl[];
134extern const struct kernel_symbol __stop___ksymtab_gpl[]; 165extern const struct kernel_symbol __stop___ksymtab_gpl[];
135extern const struct kernel_symbol __start___ksymtab_gpl_future[]; 166extern const struct kernel_symbol __start___ksymtab_gpl_future[];
136extern const struct kernel_symbol __stop___ksymtab_gpl_future[]; 167extern const struct kernel_symbol __stop___ksymtab_gpl_future[];
137extern const struct kernel_symbol __start___ksymtab_unused[];
138extern const struct kernel_symbol __stop___ksymtab_unused[];
139extern const struct kernel_symbol __start___ksymtab_unused_gpl[];
140extern const struct kernel_symbol __stop___ksymtab_unused_gpl[];
141extern const struct kernel_symbol __start___ksymtab_gpl_future[]; 168extern const struct kernel_symbol __start___ksymtab_gpl_future[];
142extern const struct kernel_symbol __stop___ksymtab_gpl_future[]; 169extern const struct kernel_symbol __stop___ksymtab_gpl_future[];
143extern const unsigned long __start___kcrctab[]; 170extern const unsigned long __start___kcrctab[];
144extern const unsigned long __start___kcrctab_gpl[]; 171extern const unsigned long __start___kcrctab_gpl[];
145extern const unsigned long __start___kcrctab_gpl_future[]; 172extern const unsigned long __start___kcrctab_gpl_future[];
173#ifdef CONFIG_UNUSED_SYMBOLS
174extern const struct kernel_symbol __start___ksymtab_unused[];
175extern const struct kernel_symbol __stop___ksymtab_unused[];
176extern const struct kernel_symbol __start___ksymtab_unused_gpl[];
177extern const struct kernel_symbol __stop___ksymtab_unused_gpl[];
146extern const unsigned long __start___kcrctab_unused[]; 178extern const unsigned long __start___kcrctab_unused[];
147extern const unsigned long __start___kcrctab_unused_gpl[]; 179extern const unsigned long __start___kcrctab_unused_gpl[];
180#endif
148 181
149#ifndef CONFIG_MODVERSIONS 182#ifndef CONFIG_MODVERSIONS
150#define symversion(base, idx) NULL 183#define symversion(base, idx) NULL
@@ -152,152 +185,170 @@ extern const unsigned long __start___kcrctab_unused_gpl[];
152#define symversion(base, idx) ((base != NULL) ? ((base) + (idx)) : NULL) 185#define symversion(base, idx) ((base != NULL) ? ((base) + (idx)) : NULL)
153#endif 186#endif
154 187
155/* lookup symbol in given range of kernel_symbols */
156static const struct kernel_symbol *lookup_symbol(const char *name,
157 const struct kernel_symbol *start,
158 const struct kernel_symbol *stop)
159{
160 const struct kernel_symbol *ks = start;
161 for (; ks < stop; ks++)
162 if (strcmp(ks->name, name) == 0)
163 return ks;
164 return NULL;
165}
166
167static bool always_ok(bool gplok, bool warn, const char *name)
168{
169 return true;
170}
171
172static bool printk_unused_warning(bool gplok, bool warn, const char *name)
173{
174 if (warn) {
175 printk(KERN_WARNING "Symbol %s is marked as UNUSED, "
176 "however this module is using it.\n", name);
177 printk(KERN_WARNING
178 "This symbol will go away in the future.\n");
179 printk(KERN_WARNING
180 "Please evalute if this is the right api to use and if "
181 "it really is, submit a report the linux kernel "
182 "mailinglist together with submitting your code for "
183 "inclusion.\n");
184 }
185 return true;
186}
187
188static bool gpl_only_unused_warning(bool gplok, bool warn, const char *name)
189{
190 if (!gplok)
191 return false;
192 return printk_unused_warning(gplok, warn, name);
193}
194
195static bool gpl_only(bool gplok, bool warn, const char *name)
196{
197 return gplok;
198}
199
200static bool warn_if_not_gpl(bool gplok, bool warn, const char *name)
201{
202 if (!gplok && warn) {
203 printk(KERN_WARNING "Symbol %s is being used "
204 "by a non-GPL module, which will not "
205 "be allowed in the future\n", name);
206 printk(KERN_WARNING "Please see the file "
207 "Documentation/feature-removal-schedule.txt "
208 "in the kernel source tree for more details.\n");
209 }
210 return true;
211}
212
213struct symsearch { 188struct symsearch {
214 const struct kernel_symbol *start, *stop; 189 const struct kernel_symbol *start, *stop;
215 const unsigned long *crcs; 190 const unsigned long *crcs;
216 bool (*check)(bool gplok, bool warn, const char *name); 191 enum {
192 NOT_GPL_ONLY,
193 GPL_ONLY,
194 WILL_BE_GPL_ONLY,
195 } licence;
196 bool unused;
217}; 197};
218 198
219/* Look through this array of symbol tables for a symbol match which 199static bool each_symbol_in_section(const struct symsearch *arr,
220 * passes the check function. */ 200 unsigned int arrsize,
221static const struct kernel_symbol *search_symarrays(const struct symsearch *arr, 201 struct module *owner,
222 unsigned int num, 202 bool (*fn)(const struct symsearch *syms,
223 const char *name, 203 struct module *owner,
224 bool gplok, 204 unsigned int symnum, void *data),
225 bool warn, 205 void *data)
226 const unsigned long **crc)
227{ 206{
228 unsigned int i; 207 unsigned int i, j;
229 const struct kernel_symbol *ks;
230
231 for (i = 0; i < num; i++) {
232 ks = lookup_symbol(name, arr[i].start, arr[i].stop);
233 if (!ks || !arr[i].check(gplok, warn, name))
234 continue;
235 208
236 if (crc) 209 for (j = 0; j < arrsize; j++) {
237 *crc = symversion(arr[i].crcs, ks - arr[i].start); 210 for (i = 0; i < arr[j].stop - arr[j].start; i++)
238 return ks; 211 if (fn(&arr[j], owner, i, data))
212 return true;
239 } 213 }
240 return NULL; 214
215 return false;
241} 216}
242 217
243/* Find a symbol, return value, (optional) crc and (optional) module 218/* Returns true as soon as fn returns true, otherwise false. */
244 * which owns it */ 219static bool each_symbol(bool (*fn)(const struct symsearch *arr,
245static unsigned long find_symbol(const char *name, 220 struct module *owner,
246 struct module **owner, 221 unsigned int symnum, void *data),
247 const unsigned long **crc, 222 void *data)
248 bool gplok,
249 bool warn)
250{ 223{
251 struct module *mod; 224 struct module *mod;
252 const struct kernel_symbol *ks;
253 const struct symsearch arr[] = { 225 const struct symsearch arr[] = {
254 { __start___ksymtab, __stop___ksymtab, __start___kcrctab, 226 { __start___ksymtab, __stop___ksymtab, __start___kcrctab,
255 always_ok }, 227 NOT_GPL_ONLY, false },
256 { __start___ksymtab_gpl, __stop___ksymtab_gpl, 228 { __start___ksymtab_gpl, __stop___ksymtab_gpl,
257 __start___kcrctab_gpl, gpl_only }, 229 __start___kcrctab_gpl,
230 GPL_ONLY, false },
258 { __start___ksymtab_gpl_future, __stop___ksymtab_gpl_future, 231 { __start___ksymtab_gpl_future, __stop___ksymtab_gpl_future,
259 __start___kcrctab_gpl_future, warn_if_not_gpl }, 232 __start___kcrctab_gpl_future,
233 WILL_BE_GPL_ONLY, false },
234#ifdef CONFIG_UNUSED_SYMBOLS
260 { __start___ksymtab_unused, __stop___ksymtab_unused, 235 { __start___ksymtab_unused, __stop___ksymtab_unused,
261 __start___kcrctab_unused, printk_unused_warning }, 236 __start___kcrctab_unused,
237 NOT_GPL_ONLY, true },
262 { __start___ksymtab_unused_gpl, __stop___ksymtab_unused_gpl, 238 { __start___ksymtab_unused_gpl, __stop___ksymtab_unused_gpl,
263 __start___kcrctab_unused_gpl, gpl_only_unused_warning }, 239 __start___kcrctab_unused_gpl,
240 GPL_ONLY, true },
241#endif
264 }; 242 };
265 243
266 /* Core kernel first. */ 244 if (each_symbol_in_section(arr, ARRAY_SIZE(arr), NULL, fn, data))
267 ks = search_symarrays(arr, ARRAY_SIZE(arr), name, gplok, warn, crc); 245 return true;
268 if (ks) {
269 if (owner)
270 *owner = NULL;
271 return ks->value;
272 }
273 246
274 /* Now try modules. */ 247 list_for_each_entry_rcu(mod, &modules, list) {
275 list_for_each_entry(mod, &modules, list) {
276 struct symsearch arr[] = { 248 struct symsearch arr[] = {
277 { mod->syms, mod->syms + mod->num_syms, mod->crcs, 249 { mod->syms, mod->syms + mod->num_syms, mod->crcs,
278 always_ok }, 250 NOT_GPL_ONLY, false },
279 { mod->gpl_syms, mod->gpl_syms + mod->num_gpl_syms, 251 { mod->gpl_syms, mod->gpl_syms + mod->num_gpl_syms,
280 mod->gpl_crcs, gpl_only }, 252 mod->gpl_crcs,
253 GPL_ONLY, false },
281 { mod->gpl_future_syms, 254 { mod->gpl_future_syms,
282 mod->gpl_future_syms + mod->num_gpl_future_syms, 255 mod->gpl_future_syms + mod->num_gpl_future_syms,
283 mod->gpl_future_crcs, warn_if_not_gpl }, 256 mod->gpl_future_crcs,
257 WILL_BE_GPL_ONLY, false },
258#ifdef CONFIG_UNUSED_SYMBOLS
284 { mod->unused_syms, 259 { mod->unused_syms,
285 mod->unused_syms + mod->num_unused_syms, 260 mod->unused_syms + mod->num_unused_syms,
286 mod->unused_crcs, printk_unused_warning }, 261 mod->unused_crcs,
262 NOT_GPL_ONLY, true },
287 { mod->unused_gpl_syms, 263 { mod->unused_gpl_syms,
288 mod->unused_gpl_syms + mod->num_unused_gpl_syms, 264 mod->unused_gpl_syms + mod->num_unused_gpl_syms,
289 mod->unused_gpl_crcs, gpl_only_unused_warning }, 265 mod->unused_gpl_crcs,
266 GPL_ONLY, true },
267#endif
290 }; 268 };
291 269
292 ks = search_symarrays(arr, ARRAY_SIZE(arr), 270 if (each_symbol_in_section(arr, ARRAY_SIZE(arr), mod, fn, data))
293 name, gplok, warn, crc); 271 return true;
294 if (ks) { 272 }
295 if (owner) 273 return false;
296 *owner = mod; 274}
297 return ks->value; 275
276struct find_symbol_arg {
277 /* Input */
278 const char *name;
279 bool gplok;
280 bool warn;
281
282 /* Output */
283 struct module *owner;
284 const unsigned long *crc;
285 unsigned long value;
286};
287
288static bool find_symbol_in_section(const struct symsearch *syms,
289 struct module *owner,
290 unsigned int symnum, void *data)
291{
292 struct find_symbol_arg *fsa = data;
293
294 if (strcmp(syms->start[symnum].name, fsa->name) != 0)
295 return false;
296
297 if (!fsa->gplok) {
298 if (syms->licence == GPL_ONLY)
299 return false;
300 if (syms->licence == WILL_BE_GPL_ONLY && fsa->warn) {
301 printk(KERN_WARNING "Symbol %s is being used "
302 "by a non-GPL module, which will not "
303 "be allowed in the future\n", fsa->name);
304 printk(KERN_WARNING "Please see the file "
305 "Documentation/feature-removal-schedule.txt "
306 "in the kernel source tree for more details.\n");
298 } 307 }
299 } 308 }
300 309
310#ifdef CONFIG_UNUSED_SYMBOLS
311 if (syms->unused && fsa->warn) {
312 printk(KERN_WARNING "Symbol %s is marked as UNUSED, "
313 "however this module is using it.\n", fsa->name);
314 printk(KERN_WARNING
315 "This symbol will go away in the future.\n");
316 printk(KERN_WARNING
317 "Please evalute if this is the right api to use and if "
318 "it really is, submit a report the linux kernel "
319 "mailinglist together with submitting your code for "
320 "inclusion.\n");
321 }
322#endif
323
324 fsa->owner = owner;
325 fsa->crc = symversion(syms->crcs, symnum);
326 fsa->value = syms->start[symnum].value;
327 return true;
328}
329
330/* Find a symbol, return value, (optional) crc and (optional) module
331 * which owns it */
332static unsigned long find_symbol(const char *name,
333 struct module **owner,
334 const unsigned long **crc,
335 bool gplok,
336 bool warn)
337{
338 struct find_symbol_arg fsa;
339
340 fsa.name = name;
341 fsa.gplok = gplok;
342 fsa.warn = warn;
343
344 if (each_symbol(find_symbol_in_section, &fsa)) {
345 if (owner)
346 *owner = fsa.owner;
347 if (crc)
348 *crc = fsa.crc;
349 return fsa.value;
350 }
351
301 DEBUGP("Failed to find symbol %s\n", name); 352 DEBUGP("Failed to find symbol %s\n", name);
302 return -ENOENT; 353 return -ENOENT;
303} 354}
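The five check callbacks (always_ok, gpl_only, warn_if_not_gpl, ...) collapse into one generic iterator, each_symbol(), plus a single visitor, find_symbol_in_section(), that carries its policy and results in struct find_symbol_arg. Any other whole-table scan can now be phrased as a small callback. A hedged sketch of an alternative visitor that merely counts exports matching a prefix; it relies on the struct symsearch/each_symbol definitions from the hunk above and must run with module_mutex held or preemption disabled, per the list's locking rule:

#include <linux/string.h>
#include <linux/types.h>

struct count_arg {
	const char *prefix;
	size_t prefix_len;
	unsigned int count;
};

/* each_symbol() visitor: return true to stop the walk early, false to continue. */
static bool count_prefixed(const struct symsearch *syms, struct module *owner,
			   unsigned int symnum, void *data)
{
	struct count_arg *arg = data;

	if (strncmp(syms->start[symnum].name, arg->prefix, arg->prefix_len) == 0)
		arg->count++;
	return false;			/* keep walking every table */
}

/*
 * Usage:
 *	struct count_arg arg = { .prefix = "snd_", .prefix_len = 4 };
 *	each_symbol(count_prefixed, &arg);
 */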
@@ -639,8 +690,8 @@ static int __try_stop_module(void *_sref)
639{ 690{
640 struct stopref *sref = _sref; 691 struct stopref *sref = _sref;
641 692
642 /* If it's not unused, quit unless we are told to block. */ 693 /* If it's not unused, quit unless we're forcing. */
643 if ((sref->flags & O_NONBLOCK) && module_refcount(sref->mod) != 0) { 694 if (module_refcount(sref->mod) != 0) {
644 if (!(*sref->forced = try_force_unload(sref->flags))) 695 if (!(*sref->forced = try_force_unload(sref->flags)))
645 return -EWOULDBLOCK; 696 return -EWOULDBLOCK;
646 } 697 }
@@ -652,9 +703,16 @@ static int __try_stop_module(void *_sref)
652 703
653static int try_stop_module(struct module *mod, int flags, int *forced) 704static int try_stop_module(struct module *mod, int flags, int *forced)
654{ 705{
655 struct stopref sref = { mod, flags, forced }; 706 if (flags & O_NONBLOCK) {
707 struct stopref sref = { mod, flags, forced };
656 708
657 return stop_machine_run(__try_stop_module, &sref, NR_CPUS); 709 return stop_machine(__try_stop_module, &sref, NULL);
710 } else {
711 /* We don't need to stop the machine for this. */
712 mod->state = MODULE_STATE_GOING;
713 synchronize_sched();
714 return 0;
715 }
658} 716}
659 717
660unsigned int module_refcount(struct module *mod) 718unsigned int module_refcount(struct module *mod)
@@ -754,6 +812,7 @@ sys_delete_module(const char __user *name_user, unsigned int flags)
754 mutex_lock(&module_mutex); 812 mutex_lock(&module_mutex);
755 /* Store the name of the last unloaded module for diagnostic purposes */ 813 /* Store the name of the last unloaded module for diagnostic purposes */
756 strlcpy(last_unloaded_module, mod->name, sizeof(last_unloaded_module)); 814 strlcpy(last_unloaded_module, mod->name, sizeof(last_unloaded_module));
815 unregister_dynamic_debug_module(mod->name);
757 free_module(mod); 816 free_module(mod);
758 817
759 out: 818 out:
@@ -893,7 +952,7 @@ static const char vermagic[] = VERMAGIC_STRING;
893static int try_to_force_load(struct module *mod, const char *symname) 952static int try_to_force_load(struct module *mod, const char *symname)
894{ 953{
895#ifdef CONFIG_MODULE_FORCE_LOAD 954#ifdef CONFIG_MODULE_FORCE_LOAD
896 if (!(tainted & TAINT_FORCED_MODULE)) 955 if (!test_taint(TAINT_FORCED_MODULE))
897 printk("%s: no version for \"%s\" found: kernel tainted.\n", 956 printk("%s: no version for \"%s\" found: kernel tainted.\n",
898 mod->name, symname); 957 mod->name, symname);
899 add_taint_module(mod, TAINT_FORCED_MODULE); 958 add_taint_module(mod, TAINT_FORCED_MODULE);
@@ -1003,7 +1062,7 @@ static unsigned long resolve_symbol(Elf_Shdr *sechdrs,
1003 const unsigned long *crc; 1062 const unsigned long *crc;
1004 1063
1005 ret = find_symbol(name, &owner, &crc, 1064 ret = find_symbol(name, &owner, &crc,
1006 !(mod->taints & TAINT_PROPRIETARY_MODULE), true); 1065 !(mod->taints & (1 << TAINT_PROPRIETARY_MODULE)), true);
1007 if (!IS_ERR_VALUE(ret)) { 1066 if (!IS_ERR_VALUE(ret)) {
1008 /* use_module can fail due to OOM, 1067 /* use_module can fail due to OOM,
1009 or module initialization or unloading */ 1068 or module initialization or unloading */
@@ -1143,7 +1202,7 @@ static void free_notes_attrs(struct module_notes_attrs *notes_attrs,
1143 while (i-- > 0) 1202 while (i-- > 0)
1144 sysfs_remove_bin_file(notes_attrs->dir, 1203 sysfs_remove_bin_file(notes_attrs->dir,
1145 &notes_attrs->attrs[i]); 1204 &notes_attrs->attrs[i]);
1146 kobject_del(notes_attrs->dir); 1205 kobject_put(notes_attrs->dir);
1147 } 1206 }
1148 kfree(notes_attrs); 1207 kfree(notes_attrs);
1149} 1208}
@@ -1361,17 +1420,6 @@ static void mod_kobject_remove(struct module *mod)
1361} 1420}
1362 1421
1363/* 1422/*
1364 * link the module with the whole machine is stopped with interrupts off
1365 * - this defends against kallsyms not taking locks
1366 */
1367static int __link_module(void *_mod)
1368{
1369 struct module *mod = _mod;
1370 list_add(&mod->list, &modules);
1371 return 0;
1372}
1373
1374/*
1375 * unlink the module with the whole machine is stopped with interrupts off 1423 * unlink the module with the whole machine is stopped with interrupts off
1376 * - this defends against kallsyms not taking locks 1424 * - this defends against kallsyms not taking locks
1377 */ 1425 */
@@ -1386,7 +1434,7 @@ static int __unlink_module(void *_mod)
1386static void free_module(struct module *mod) 1434static void free_module(struct module *mod)
1387{ 1435{
1388 /* Delete from various lists */ 1436 /* Delete from various lists */
1389 stop_machine_run(__unlink_module, mod, NR_CPUS); 1437 stop_machine(__unlink_module, mod, NULL);
1390 remove_notes_attrs(mod); 1438 remove_notes_attrs(mod);
1391 remove_sect_attrs(mod); 1439 remove_sect_attrs(mod);
1392 mod_kobject_remove(mod); 1440 mod_kobject_remove(mod);
@@ -1399,6 +1447,9 @@ static void free_module(struct module *mod)
1399 /* Module unload stuff */ 1447 /* Module unload stuff */
1400 module_unload_free(mod); 1448 module_unload_free(mod);
1401 1449
1450 /* release any pointers to mcount in this module */
1451 ftrace_release(mod->module_core, mod->core_size);
1452
1402 /* This may be NULL, but that's OK */ 1453 /* This may be NULL, but that's OK */
1403 module_free(mod, mod->module_init); 1454 module_free(mod, mod->module_init);
1404 kfree(mod->args); 1455 kfree(mod->args);
@@ -1445,8 +1496,10 @@ static int verify_export_symbols(struct module *mod)
1445 { mod->syms, mod->num_syms }, 1496 { mod->syms, mod->num_syms },
1446 { mod->gpl_syms, mod->num_gpl_syms }, 1497 { mod->gpl_syms, mod->num_gpl_syms },
1447 { mod->gpl_future_syms, mod->num_gpl_future_syms }, 1498 { mod->gpl_future_syms, mod->num_gpl_future_syms },
1499#ifdef CONFIG_UNUSED_SYMBOLS
1448 { mod->unused_syms, mod->num_unused_syms }, 1500 { mod->unused_syms, mod->num_unused_syms },
1449 { mod->unused_gpl_syms, mod->num_unused_gpl_syms }, 1501 { mod->unused_gpl_syms, mod->num_unused_gpl_syms },
1502#endif
1450 }; 1503 };
1451 1504
1452 for (i = 0; i < ARRAY_SIZE(arr); i++) { 1505 for (i = 0; i < ARRAY_SIZE(arr); i++) {
@@ -1526,7 +1579,7 @@ static int simplify_symbols(Elf_Shdr *sechdrs,
1526} 1579}
1527 1580
1528/* Update size with this section: return offset. */ 1581/* Update size with this section: return offset. */
1529static long get_offset(unsigned long *size, Elf_Shdr *sechdr) 1582static long get_offset(unsigned int *size, Elf_Shdr *sechdr)
1530{ 1583{
1531 long ret; 1584 long ret;
1532 1585
@@ -1602,7 +1655,7 @@ static void set_license(struct module *mod, const char *license)
1602 license = "unspecified"; 1655 license = "unspecified";
1603 1656
1604 if (!license_is_gpl_compatible(license)) { 1657 if (!license_is_gpl_compatible(license)) {
1605 if (!(tainted & TAINT_PROPRIETARY_MODULE)) 1658 if (!test_taint(TAINT_PROPRIETARY_MODULE))
1606 printk(KERN_WARNING "%s: module license '%s' taints " 1659 printk(KERN_WARNING "%s: module license '%s' taints "
1607 "kernel.\n", mod->name, license); 1660 "kernel.\n", mod->name, license);
1608 add_taint_module(mod, TAINT_PROPRIETARY_MODULE); 1661 add_taint_module(mod, TAINT_PROPRIETARY_MODULE);
@@ -1659,6 +1712,19 @@ static void setup_modinfo(struct module *mod, Elf_Shdr *sechdrs,
1659} 1712}
1660 1713
1661#ifdef CONFIG_KALLSYMS 1714#ifdef CONFIG_KALLSYMS
1715
1716/* lookup symbol in given range of kernel_symbols */
1717static const struct kernel_symbol *lookup_symbol(const char *name,
1718 const struct kernel_symbol *start,
1719 const struct kernel_symbol *stop)
1720{
1721 const struct kernel_symbol *ks = start;
1722 for (; ks < stop; ks++)
1723 if (strcmp(ks->name, name) == 0)
1724 return ks;
1725 return NULL;
1726}
1727
1662static int is_exported(const char *name, const struct module *mod) 1728static int is_exported(const char *name, const struct module *mod)
1663{ 1729{
1664 if (!mod && lookup_symbol(name, __start___ksymtab, __stop___ksymtab)) 1730 if (!mod && lookup_symbol(name, __start___ksymtab, __stop___ksymtab))
@@ -1738,42 +1804,56 @@ static inline void add_kallsyms(struct module *mod,
1738} 1804}
1739#endif /* CONFIG_KALLSYMS */ 1805#endif /* CONFIG_KALLSYMS */
1740 1806
1807static void dynamic_printk_setup(struct mod_debug *debug, unsigned int num)
1808{
1809#ifdef CONFIG_DYNAMIC_PRINTK_DEBUG
1810 unsigned int i;
1811
1812 for (i = 0; i < num; i++) {
1813 register_dynamic_debug_module(debug[i].modname,
1814 debug[i].type,
1815 debug[i].logical_modname,
1816 debug[i].flag_names,
1817 debug[i].hash, debug[i].hash2);
1818 }
1819#endif /* CONFIG_DYNAMIC_PRINTK_DEBUG */
1820}
1821
1822static void *module_alloc_update_bounds(unsigned long size)
1823{
1824 void *ret = module_alloc(size);
1825
1826 if (ret) {
1827 /* Update module bounds. */
1828 if ((unsigned long)ret < module_addr_min)
1829 module_addr_min = (unsigned long)ret;
1830 if ((unsigned long)ret + size > module_addr_max)
1831 module_addr_max = (unsigned long)ret + size;
1832 }
1833 return ret;
1834}
1835
1741/* Allocate and load the module: note that size of section 0 is always 1836/* Allocate and load the module: note that size of section 0 is always
1742 zero, and we rely on this for optional sections. */ 1837 zero, and we rely on this for optional sections. */
1743static struct module *load_module(void __user *umod, 1838static noinline struct module *load_module(void __user *umod,
1744 unsigned long len, 1839 unsigned long len,
1745 const char __user *uargs) 1840 const char __user *uargs)
1746{ 1841{
1747 Elf_Ehdr *hdr; 1842 Elf_Ehdr *hdr;
1748 Elf_Shdr *sechdrs; 1843 Elf_Shdr *sechdrs;
1749 char *secstrings, *args, *modmagic, *strtab = NULL; 1844 char *secstrings, *args, *modmagic, *strtab = NULL;
1845 char *staging;
1750 unsigned int i; 1846 unsigned int i;
1751 unsigned int symindex = 0; 1847 unsigned int symindex = 0;
1752 unsigned int strindex = 0; 1848 unsigned int strindex = 0;
1753 unsigned int setupindex; 1849 unsigned int modindex, versindex, infoindex, pcpuindex;
1754 unsigned int exindex;
1755 unsigned int exportindex;
1756 unsigned int modindex;
1757 unsigned int obsparmindex;
1758 unsigned int infoindex;
1759 unsigned int gplindex;
1760 unsigned int crcindex;
1761 unsigned int gplcrcindex;
1762 unsigned int versindex;
1763 unsigned int pcpuindex;
1764 unsigned int gplfutureindex;
1765 unsigned int gplfuturecrcindex;
1766 unsigned int unwindex = 0; 1850 unsigned int unwindex = 0;
1767 unsigned int unusedindex; 1851 unsigned int num_kp, num_mcount;
1768 unsigned int unusedcrcindex; 1852 struct kernel_param *kp;
1769 unsigned int unusedgplindex;
1770 unsigned int unusedgplcrcindex;
1771 unsigned int markersindex;
1772 unsigned int markersstringsindex;
1773 struct module *mod; 1853 struct module *mod;
1774 long err = 0; 1854 long err = 0;
1775 void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */ 1855 void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */
1776 struct exception_table_entry *extable; 1856 unsigned long *mseg;
1777 mm_segment_t old_fs; 1857 mm_segment_t old_fs;
1778 1858
1779 DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n", 1859 DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n",
@@ -1837,6 +1917,7 @@ static struct module *load_module(void __user *umod,
1837 err = -ENOEXEC; 1917 err = -ENOEXEC;
1838 goto free_hdr; 1918 goto free_hdr;
1839 } 1919 }
1920 /* This is temporary: point mod into copy of data. */
1840 mod = (void *)sechdrs[modindex].sh_addr; 1921 mod = (void *)sechdrs[modindex].sh_addr;
1841 1922
1842 if (symindex == 0) { 1923 if (symindex == 0) {
@@ -1846,20 +1927,6 @@ static struct module *load_module(void __user *umod,
1846 goto free_hdr; 1927 goto free_hdr;
1847 } 1928 }
1848 1929
1849 /* Optional sections */
1850 exportindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab");
1851 gplindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_gpl");
1852 gplfutureindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_gpl_future");
1853 unusedindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_unused");
1854 unusedgplindex = find_sec(hdr, sechdrs, secstrings, "__ksymtab_unused_gpl");
1855 crcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab");
1856 gplcrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_gpl");
1857 gplfuturecrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_gpl_future");
1858 unusedcrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_unused");
1859 unusedgplcrcindex = find_sec(hdr, sechdrs, secstrings, "__kcrctab_unused_gpl");
1860 setupindex = find_sec(hdr, sechdrs, secstrings, "__param");
1861 exindex = find_sec(hdr, sechdrs, secstrings, "__ex_table");
1862 obsparmindex = find_sec(hdr, sechdrs, secstrings, "__obsparm");
1863 versindex = find_sec(hdr, sechdrs, secstrings, "__versions"); 1930 versindex = find_sec(hdr, sechdrs, secstrings, "__versions");
1864 infoindex = find_sec(hdr, sechdrs, secstrings, ".modinfo"); 1931 infoindex = find_sec(hdr, sechdrs, secstrings, ".modinfo");
1865 pcpuindex = find_pcpusec(hdr, sechdrs, secstrings); 1932 pcpuindex = find_pcpusec(hdr, sechdrs, secstrings);
@@ -1897,6 +1964,14 @@ static struct module *load_module(void __user *umod,
1897 goto free_hdr; 1964 goto free_hdr;
1898 } 1965 }
1899 1966
1967 staging = get_modinfo(sechdrs, infoindex, "staging");
1968 if (staging) {
1969 add_taint_module(mod, TAINT_CRAP);
1970 printk(KERN_WARNING "%s: module is from the staging directory,"
1971 " the quality is unknown, you have been warned.\n",
1972 mod->name);
1973 }
1974
1900 /* Now copy in args */ 1975 /* Now copy in args */
1901 args = strndup_user(uargs, ~0UL >> 1); 1976 args = strndup_user(uargs, ~0UL >> 1);
1902 if (IS_ERR(args)) { 1977 if (IS_ERR(args)) {
@@ -1935,7 +2010,7 @@ static struct module *load_module(void __user *umod,
1935 layout_sections(mod, hdr, sechdrs, secstrings); 2010 layout_sections(mod, hdr, sechdrs, secstrings);
1936 2011
1937 /* Do the allocs. */ 2012 /* Do the allocs. */
1938 ptr = module_alloc(mod->core_size); 2013 ptr = module_alloc_update_bounds(mod->core_size);
1939 if (!ptr) { 2014 if (!ptr) {
1940 err = -ENOMEM; 2015 err = -ENOMEM;
1941 goto free_percpu; 2016 goto free_percpu;
@@ -1943,7 +2018,7 @@ static struct module *load_module(void __user *umod,
1943 memset(ptr, 0, mod->core_size); 2018 memset(ptr, 0, mod->core_size);
1944 mod->module_core = ptr; 2019 mod->module_core = ptr;
1945 2020
1946 ptr = module_alloc(mod->init_size); 2021 ptr = module_alloc_update_bounds(mod->init_size);
1947 if (!ptr && mod->init_size) { 2022 if (!ptr && mod->init_size) {
1948 err = -ENOMEM; 2023 err = -ENOMEM;
1949 goto free_core; 2024 goto free_core;
@@ -2007,48 +2082,65 @@ static struct module *load_module(void __user *umod,
2007 if (err < 0) 2082 if (err < 0)
2008 goto cleanup; 2083 goto cleanup;
2009 2084
2010 /* Set up EXPORTed & EXPORT_GPLed symbols (section 0 is 0 length) */ 2085 /* Now we've got everything in the final locations, we can
2011 mod->num_syms = sechdrs[exportindex].sh_size / sizeof(*mod->syms); 2086 * find optional sections. */
2012 mod->syms = (void *)sechdrs[exportindex].sh_addr; 2087 kp = section_objs(hdr, sechdrs, secstrings, "__param", sizeof(*kp),
2013 if (crcindex) 2088 &num_kp);
2014 mod->crcs = (void *)sechdrs[crcindex].sh_addr; 2089 mod->syms = section_objs(hdr, sechdrs, secstrings, "__ksymtab",
2015 mod->num_gpl_syms = sechdrs[gplindex].sh_size / sizeof(*mod->gpl_syms); 2090 sizeof(*mod->syms), &mod->num_syms);
2016 mod->gpl_syms = (void *)sechdrs[gplindex].sh_addr; 2091 mod->crcs = section_addr(hdr, sechdrs, secstrings, "__kcrctab");
2017 if (gplcrcindex) 2092 mod->gpl_syms = section_objs(hdr, sechdrs, secstrings, "__ksymtab_gpl",
2018 mod->gpl_crcs = (void *)sechdrs[gplcrcindex].sh_addr; 2093 sizeof(*mod->gpl_syms),
2019 mod->num_gpl_future_syms = sechdrs[gplfutureindex].sh_size / 2094 &mod->num_gpl_syms);
2020 sizeof(*mod->gpl_future_syms); 2095 mod->gpl_crcs = section_addr(hdr, sechdrs, secstrings, "__kcrctab_gpl");
2021 mod->num_unused_syms = sechdrs[unusedindex].sh_size / 2096 mod->gpl_future_syms = section_objs(hdr, sechdrs, secstrings,
2022 sizeof(*mod->unused_syms); 2097 "__ksymtab_gpl_future",
2023 mod->num_unused_gpl_syms = sechdrs[unusedgplindex].sh_size / 2098 sizeof(*mod->gpl_future_syms),
2024 sizeof(*mod->unused_gpl_syms); 2099 &mod->num_gpl_future_syms);
2025 mod->gpl_future_syms = (void *)sechdrs[gplfutureindex].sh_addr; 2100 mod->gpl_future_crcs = section_addr(hdr, sechdrs, secstrings,
2026 if (gplfuturecrcindex) 2101 "__kcrctab_gpl_future");
2027 mod->gpl_future_crcs = (void *)sechdrs[gplfuturecrcindex].sh_addr; 2102
2028 2103#ifdef CONFIG_UNUSED_SYMBOLS
2029 mod->unused_syms = (void *)sechdrs[unusedindex].sh_addr; 2104 mod->unused_syms = section_objs(hdr, sechdrs, secstrings,
2030 if (unusedcrcindex) 2105 "__ksymtab_unused",
2031 mod->unused_crcs = (void *)sechdrs[unusedcrcindex].sh_addr; 2106 sizeof(*mod->unused_syms),
2032 mod->unused_gpl_syms = (void *)sechdrs[unusedgplindex].sh_addr; 2107 &mod->num_unused_syms);
2033 if (unusedgplcrcindex) 2108 mod->unused_crcs = section_addr(hdr, sechdrs, secstrings,
2034 mod->unused_gpl_crcs 2109 "__kcrctab_unused");
2035 = (void *)sechdrs[unusedgplcrcindex].sh_addr; 2110 mod->unused_gpl_syms = section_objs(hdr, sechdrs, secstrings,
2111 "__ksymtab_unused_gpl",
2112 sizeof(*mod->unused_gpl_syms),
2113 &mod->num_unused_gpl_syms);
2114 mod->unused_gpl_crcs = section_addr(hdr, sechdrs, secstrings,
2115 "__kcrctab_unused_gpl");
2116#endif
2117
2118#ifdef CONFIG_MARKERS
2119 mod->markers = section_objs(hdr, sechdrs, secstrings, "__markers",
2120 sizeof(*mod->markers), &mod->num_markers);
2121#endif
2122#ifdef CONFIG_TRACEPOINTS
2123 mod->tracepoints = section_objs(hdr, sechdrs, secstrings,
2124 "__tracepoints",
2125 sizeof(*mod->tracepoints),
2126 &mod->num_tracepoints);
2127#endif
2036 2128
2037#ifdef CONFIG_MODVERSIONS 2129#ifdef CONFIG_MODVERSIONS
2038 if ((mod->num_syms && !crcindex) || 2130 if ((mod->num_syms && !mod->crcs)
2039 (mod->num_gpl_syms && !gplcrcindex) || 2131 || (mod->num_gpl_syms && !mod->gpl_crcs)
2040 (mod->num_gpl_future_syms && !gplfuturecrcindex) || 2132 || (mod->num_gpl_future_syms && !mod->gpl_future_crcs)
2041 (mod->num_unused_syms && !unusedcrcindex) || 2133#ifdef CONFIG_UNUSED_SYMBOLS
2042 (mod->num_unused_gpl_syms && !unusedgplcrcindex)) { 2134 || (mod->num_unused_syms && !mod->unused_crcs)
2135 || (mod->num_unused_gpl_syms && !mod->unused_gpl_crcs)
2136#endif
2137 ) {
2043 printk(KERN_WARNING "%s: No versions for exported symbols.\n", mod->name); 2138 printk(KERN_WARNING "%s: No versions for exported symbols.\n", mod->name);
2044 err = try_to_force_load(mod, "nocrc"); 2139 err = try_to_force_load(mod, "nocrc");
2045 if (err) 2140 if (err)
2046 goto cleanup; 2141 goto cleanup;
2047 } 2142 }
2048#endif 2143#endif
2049 markersindex = find_sec(hdr, sechdrs, secstrings, "__markers");
2050 markersstringsindex = find_sec(hdr, sechdrs, secstrings,
2051 "__markers_strings");
2052 2144
2053 /* Now do relocations. */ 2145 /* Now do relocations. */
2054 for (i = 1; i < hdr->e_shnum; i++) { 2146 for (i = 1; i < hdr->e_shnum; i++) {
@@ -2071,22 +2163,16 @@ static struct module *load_module(void __user *umod,
2071 if (err < 0) 2163 if (err < 0)
2072 goto cleanup; 2164 goto cleanup;
2073 } 2165 }
2074#ifdef CONFIG_MARKERS
2075 mod->markers = (void *)sechdrs[markersindex].sh_addr;
2076 mod->num_markers =
2077 sechdrs[markersindex].sh_size / sizeof(*mod->markers);
2078#endif
2079 2166
2080 /* Find duplicate symbols */ 2167 /* Find duplicate symbols */
2081 err = verify_export_symbols(mod); 2168 err = verify_export_symbols(mod);
2082
2083 if (err < 0) 2169 if (err < 0)
2084 goto cleanup; 2170 goto cleanup;
2085 2171
2086 /* Set up and sort exception table */ 2172 /* Set up and sort exception table */
2087 mod->num_exentries = sechdrs[exindex].sh_size / sizeof(*mod->extable); 2173 mod->extable = section_objs(hdr, sechdrs, secstrings, "__ex_table",
2088 mod->extable = extable = (void *)sechdrs[exindex].sh_addr; 2174 sizeof(*mod->extable), &mod->num_exentries);
2089 sort_extable(extable, extable + mod->num_exentries); 2175 sort_extable(mod->extable, mod->extable + mod->num_exentries);
2090 2176
2091 /* Finally, copy percpu area over. */ 2177 /* Finally, copy percpu area over. */
2092 percpu_modcopy(mod->percpu, (void *)sechdrs[pcpuindex].sh_addr, 2178 percpu_modcopy(mod->percpu, (void *)sechdrs[pcpuindex].sh_addr,
@@ -2094,11 +2180,29 @@ static struct module *load_module(void __user *umod,
2094 2180
2095 add_kallsyms(mod, sechdrs, symindex, strindex, secstrings); 2181 add_kallsyms(mod, sechdrs, symindex, strindex, secstrings);
2096 2182
2183 if (!mod->taints) {
2184 struct mod_debug *debug;
2185 unsigned int num_debug;
2186
2097#ifdef CONFIG_MARKERS 2187#ifdef CONFIG_MARKERS
2098 if (!mod->taints)
2099 marker_update_probe_range(mod->markers, 2188 marker_update_probe_range(mod->markers,
2100 mod->markers + mod->num_markers); 2189 mod->markers + mod->num_markers);
2101#endif 2190#endif
2191 debug = section_objs(hdr, sechdrs, secstrings, "__verbose",
2192 sizeof(*debug), &num_debug);
2193 dynamic_printk_setup(debug, num_debug);
2194
2195#ifdef CONFIG_TRACEPOINTS
2196 tracepoint_update_probe_range(mod->tracepoints,
2197 mod->tracepoints + mod->num_tracepoints);
2198#endif
2199 }
2200
2201 /* sechdrs[0].sh_size is always zero */
2202 mseg = section_objs(hdr, sechdrs, secstrings, "__mcount_loc",
2203 sizeof(*mseg), &num_mcount);
2204 ftrace_init_module(mseg, mseg + num_mcount);
2205
2102 err = module_finalize(hdr, sechdrs, mod); 2206 err = module_finalize(hdr, sechdrs, mod);
2103 if (err < 0) 2207 if (err < 0)
2104 goto cleanup; 2208 goto cleanup;
@@ -2122,30 +2226,24 @@ static struct module *load_module(void __user *umod,
2122 set_fs(old_fs); 2226 set_fs(old_fs);
2123 2227
2124 mod->args = args; 2228 mod->args = args;
2125 if (obsparmindex) 2229 if (section_addr(hdr, sechdrs, secstrings, "__obsparm"))
2126 printk(KERN_WARNING "%s: Ignoring obsolete parameters\n", 2230 printk(KERN_WARNING "%s: Ignoring obsolete parameters\n",
2127 mod->name); 2231 mod->name);
2128 2232
2129 /* Now sew it into the lists so we can get lockdep and oops 2233 /* Now sew it into the lists so we can get lockdep and oops
2130 * info during argument parsing. Noone should access us, since 2234 * info during argument parsing. Noone should access us, since
2131 * strong_try_module_get() will fail. */ 2235 * strong_try_module_get() will fail.
2132 stop_machine_run(__link_module, mod, NR_CPUS); 2236 * lockdep/oops can run asynchronous, so use the RCU list insertion
2133 2237 * function to insert in a way safe to concurrent readers.
2134 /* Size of section 0 is 0, so this works well if no params */ 2238 * The mutex protects against concurrent writers.
2135 err = parse_args(mod->name, mod->args, 2239 */
2136 (struct kernel_param *) 2240 list_add_rcu(&mod->list, &modules);
2137 sechdrs[setupindex].sh_addr, 2241
2138 sechdrs[setupindex].sh_size 2242 err = parse_args(mod->name, mod->args, kp, num_kp, NULL);
2139 / sizeof(struct kernel_param),
2140 NULL);
2141 if (err < 0) 2243 if (err < 0)
2142 goto unlink; 2244 goto unlink;
2143 2245
2144 err = mod_sysfs_setup(mod, 2246 err = mod_sysfs_setup(mod, kp, num_kp);
2145 (struct kernel_param *)
2146 sechdrs[setupindex].sh_addr,
2147 sechdrs[setupindex].sh_size
2148 / sizeof(struct kernel_param));
2149 if (err < 0) 2247 if (err < 0)
2150 goto unlink; 2248 goto unlink;
2151 add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs); 2249 add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs);
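list_add_rcu() is the writer-side half of the scheme described in the comment above: under module_mutex it publishes the already-initialised module with the ordering RCU readers need, so the list_for_each_entry_rcu() walks elsewhere in this file see either the whole entry or none of it, and no stop_machine is needed for insertion. Writer counterpart to the reader sketch near the top of the module.c diff, reusing the same stub type and list:

#include <linux/errno.h>
#include <linux/mutex.h>
#include <linux/rculist.h>
#include <linux/slab.h>

static DEFINE_MUTEX(mod_list_mutex);

/* mod_list and struct mod_stub as in the earlier reader sketch */
static int publish_mod(const char *name)
{
	struct mod_stub *m = kzalloc(sizeof(*m), GFP_KERNEL);

	if (!m)
		return -ENOMEM;
	m->name = name;			/* fully initialise before publishing */

	mutex_lock(&mod_list_mutex);
	list_add_rcu(&m->list, &mod_list);	/* readers may see it from here on */
	mutex_unlock(&mod_list_mutex);
	return 0;
}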
@@ -2163,11 +2261,12 @@ static struct module *load_module(void __user *umod,
2163 return mod; 2261 return mod;
2164 2262
2165 unlink: 2263 unlink:
2166 stop_machine_run(__unlink_module, mod, NR_CPUS); 2264 stop_machine(__unlink_module, mod, NULL);
2167 module_arch_cleanup(mod); 2265 module_arch_cleanup(mod);
2168 cleanup: 2266 cleanup:
2169 kobject_del(&mod->mkobj.kobj); 2267 kobject_del(&mod->mkobj.kobj);
2170 kobject_put(&mod->mkobj.kobj); 2268 kobject_put(&mod->mkobj.kobj);
2269 ftrace_release(mod->module_core, mod->core_size);
2171 free_unload: 2270 free_unload:
2172 module_unload_free(mod); 2271 module_unload_free(mod);
2173 module_free(mod, mod->module_init); 2272 module_free(mod, mod->module_init);
@@ -2220,7 +2319,7 @@ sys_init_module(void __user *umod,
2220 2319
2221 /* Start the module */ 2320 /* Start the module */
2222 if (mod->init != NULL) 2321 if (mod->init != NULL)
2223 ret = mod->init(); 2322 ret = do_one_initcall(mod->init);
2224 if (ret < 0) { 2323 if (ret < 0) {
2225 /* Init routine failed: abort. Try to protect us from 2324 /* Init routine failed: abort. Try to protect us from
2226 buggy refcounters. */ 2325 buggy refcounters. */
@@ -2333,7 +2432,7 @@ const char *module_address_lookup(unsigned long addr,
2333 const char *ret = NULL; 2432 const char *ret = NULL;
2334 2433
2335 preempt_disable(); 2434 preempt_disable();
2336 list_for_each_entry(mod, &modules, list) { 2435 list_for_each_entry_rcu(mod, &modules, list) {
2337 if (within(addr, mod->module_init, mod->init_size) 2436 if (within(addr, mod->module_init, mod->init_size)
2338 || within(addr, mod->module_core, mod->core_size)) { 2437 || within(addr, mod->module_core, mod->core_size)) {
2339 if (modname) 2438 if (modname)
@@ -2356,7 +2455,7 @@ int lookup_module_symbol_name(unsigned long addr, char *symname)
2356 struct module *mod; 2455 struct module *mod;
2357 2456
2358 preempt_disable(); 2457 preempt_disable();
2359 list_for_each_entry(mod, &modules, list) { 2458 list_for_each_entry_rcu(mod, &modules, list) {
2360 if (within(addr, mod->module_init, mod->init_size) || 2459 if (within(addr, mod->module_init, mod->init_size) ||
2361 within(addr, mod->module_core, mod->core_size)) { 2460 within(addr, mod->module_core, mod->core_size)) {
2362 const char *sym; 2461 const char *sym;
@@ -2380,7 +2479,7 @@ int lookup_module_symbol_attrs(unsigned long addr, unsigned long *size,
2380 struct module *mod; 2479 struct module *mod;
2381 2480
2382 preempt_disable(); 2481 preempt_disable();
2383 list_for_each_entry(mod, &modules, list) { 2482 list_for_each_entry_rcu(mod, &modules, list) {
2384 if (within(addr, mod->module_init, mod->init_size) || 2483 if (within(addr, mod->module_init, mod->init_size) ||
2385 within(addr, mod->module_core, mod->core_size)) { 2484 within(addr, mod->module_core, mod->core_size)) {
2386 const char *sym; 2485 const char *sym;
@@ -2407,7 +2506,7 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
2407 struct module *mod; 2506 struct module *mod;
2408 2507
2409 preempt_disable(); 2508 preempt_disable();
2410 list_for_each_entry(mod, &modules, list) { 2509 list_for_each_entry_rcu(mod, &modules, list) {
2411 if (symnum < mod->num_symtab) { 2510 if (symnum < mod->num_symtab) {
2412 *value = mod->symtab[symnum].st_value; 2511 *value = mod->symtab[symnum].st_value;
2413 *type = mod->symtab[symnum].st_info; 2512 *type = mod->symtab[symnum].st_info;
@@ -2450,7 +2549,7 @@ unsigned long module_kallsyms_lookup_name(const char *name)
2450 ret = mod_find_symname(mod, colon+1); 2549 ret = mod_find_symname(mod, colon+1);
2451 *colon = ':'; 2550 *colon = ':';
2452 } else { 2551 } else {
2453 list_for_each_entry(mod, &modules, list) 2552 list_for_each_entry_rcu(mod, &modules, list)
2454 if ((ret = mod_find_symname(mod, name)) != 0) 2553 if ((ret = mod_find_symname(mod, name)) != 0)
2455 break; 2554 break;
2456 } 2555 }
@@ -2459,23 +2558,6 @@ unsigned long module_kallsyms_lookup_name(const char *name)
2459} 2558}
2460#endif /* CONFIG_KALLSYMS */ 2559#endif /* CONFIG_KALLSYMS */
2461 2560
2462/* Called by the /proc file system to return a list of modules. */
2463static void *m_start(struct seq_file *m, loff_t *pos)
2464{
2465 mutex_lock(&module_mutex);
2466 return seq_list_start(&modules, *pos);
2467}
2468
2469static void *m_next(struct seq_file *m, void *p, loff_t *pos)
2470{
2471 return seq_list_next(p, &modules, pos);
2472}
2473
2474static void m_stop(struct seq_file *m, void *p)
2475{
2476 mutex_unlock(&module_mutex);
2477}
2478
2479static char *module_flags(struct module *mod, char *buf) 2561static char *module_flags(struct module *mod, char *buf)
2480{ 2562{
2481 int bx = 0; 2563 int bx = 0;
@@ -2484,10 +2566,12 @@ static char *module_flags(struct module *mod, char *buf)
2484 mod->state == MODULE_STATE_GOING || 2566 mod->state == MODULE_STATE_GOING ||
2485 mod->state == MODULE_STATE_COMING) { 2567 mod->state == MODULE_STATE_COMING) {
2486 buf[bx++] = '('; 2568 buf[bx++] = '(';
2487 if (mod->taints & TAINT_PROPRIETARY_MODULE) 2569 if (mod->taints & (1 << TAINT_PROPRIETARY_MODULE))
2488 buf[bx++] = 'P'; 2570 buf[bx++] = 'P';
2489 if (mod->taints & TAINT_FORCED_MODULE) 2571 if (mod->taints & (1 << TAINT_FORCED_MODULE))
2490 buf[bx++] = 'F'; 2572 buf[bx++] = 'F';
2573 if (mod->taints & (1 << TAINT_CRAP))
2574 buf[bx++] = 'C';
2491 /* 2575 /*
2492 * TAINT_FORCED_RMMOD: could be added. 2576 * TAINT_FORCED_RMMOD: could be added.
2493 * TAINT_UNSAFE_SMP, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't 2577 * TAINT_UNSAFE_SMP, TAINT_MACHINE_CHECK, TAINT_BAD_PAGE don't
@@ -2507,12 +2591,30 @@ static char *module_flags(struct module *mod, char *buf)
2507 return buf; 2591 return buf;
2508} 2592}
2509 2593
2594#ifdef CONFIG_PROC_FS
2595/* Called by the /proc file system to return a list of modules. */
2596static void *m_start(struct seq_file *m, loff_t *pos)
2597{
2598 mutex_lock(&module_mutex);
2599 return seq_list_start(&modules, *pos);
2600}
2601
2602static void *m_next(struct seq_file *m, void *p, loff_t *pos)
2603{
2604 return seq_list_next(p, &modules, pos);
2605}
2606
2607static void m_stop(struct seq_file *m, void *p)
2608{
2609 mutex_unlock(&module_mutex);
2610}
2611
2510static int m_show(struct seq_file *m, void *p) 2612static int m_show(struct seq_file *m, void *p)
2511{ 2613{
2512 struct module *mod = list_entry(p, struct module, list); 2614 struct module *mod = list_entry(p, struct module, list);
2513 char buf[8]; 2615 char buf[8];
2514 2616
2515 seq_printf(m, "%s %lu", 2617 seq_printf(m, "%s %u",
2516 mod->name, mod->init_size + mod->core_size); 2618 mod->name, mod->init_size + mod->core_size);
2517 print_unload_info(m, mod); 2619 print_unload_info(m, mod);
2518 2620
@@ -2537,13 +2639,33 @@ static int m_show(struct seq_file *m, void *p)
2537 Where refcount is a number or -, and deps is a comma-separated list 2639 Where refcount is a number or -, and deps is a comma-separated list
2538 of depends or -. 2640 of depends or -.
2539*/ 2641*/
2540const struct seq_operations modules_op = { 2642static const struct seq_operations modules_op = {
2541 .start = m_start, 2643 .start = m_start,
2542 .next = m_next, 2644 .next = m_next,
2543 .stop = m_stop, 2645 .stop = m_stop,
2544 .show = m_show 2646 .show = m_show
2545}; 2647};
2546 2648
2649static int modules_open(struct inode *inode, struct file *file)
2650{
2651 return seq_open(file, &modules_op);
2652}
2653
2654static const struct file_operations proc_modules_operations = {
2655 .open = modules_open,
2656 .read = seq_read,
2657 .llseek = seq_lseek,
2658 .release = seq_release,
2659};
2660
2661static int __init proc_modules_init(void)
2662{
2663 proc_create("modules", 0, NULL, &proc_modules_operations);
2664 return 0;
2665}
2666module_init(proc_modules_init);
2667#endif
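Moving the seq_file boilerplate under CONFIG_PROC_FS and registering it with proc_create() is the usual pattern for any iterator-style /proc file; the same seq_operations plus open/read/llseek/release skeleton carries over to other lists. A generic sketch of that skeleton (names hypothetical, list and lock are whatever protects your data):

#include <linux/fs.h>
#include <linux/list.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>

static DEFINE_MUTEX(my_lock);
static LIST_HEAD(my_list);		/* placeholder list this /proc file walks */

static void *my_start(struct seq_file *m, loff_t *pos)
{
	mutex_lock(&my_lock);
	return seq_list_start(&my_list, *pos);
}

static void *my_next(struct seq_file *m, void *p, loff_t *pos)
{
	return seq_list_next(p, &my_list, pos);
}

static void my_stop(struct seq_file *m, void *p)
{
	mutex_unlock(&my_lock);
}

static int my_show(struct seq_file *m, void *p)
{
	seq_printf(m, "entry %p\n", p);
	return 0;
}

static const struct seq_operations my_seq_ops = {
	.start = my_start,
	.next  = my_next,
	.stop  = my_stop,
	.show  = my_show,
};

static int my_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &my_seq_ops);
}

static const struct file_operations my_proc_fops = {
	.open    = my_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = seq_release,
};

static int __init my_proc_init(void)
{
	proc_create("my_entries", 0, NULL, &my_proc_fops);
	return 0;
}
module_init(my_proc_init);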
2668
2547/* Given an address, look for it in the module exception tables. */ 2669/* Given an address, look for it in the module exception tables. */
2548const struct exception_table_entry *search_module_extables(unsigned long addr) 2670const struct exception_table_entry *search_module_extables(unsigned long addr)
2549{ 2671{
@@ -2551,7 +2673,7 @@ const struct exception_table_entry *search_module_extables(unsigned long addr)
2551 struct module *mod; 2673 struct module *mod;
2552 2674
2553 preempt_disable(); 2675 preempt_disable();
2554 list_for_each_entry(mod, &modules, list) { 2676 list_for_each_entry_rcu(mod, &modules, list) {
2555 if (mod->num_exentries == 0) 2677 if (mod->num_exentries == 0)
2556 continue; 2678 continue;
2557 2679
@@ -2577,7 +2699,7 @@ int is_module_address(unsigned long addr)
2577 2699
2578 preempt_disable(); 2700 preempt_disable();
2579 2701
2580 list_for_each_entry(mod, &modules, list) { 2702 list_for_each_entry_rcu(mod, &modules, list) {
2581 if (within(addr, mod->module_core, mod->core_size)) { 2703 if (within(addr, mod->module_core, mod->core_size)) {
2582 preempt_enable(); 2704 preempt_enable();
2583 return 1; 2705 return 1;
@@ -2595,7 +2717,10 @@ struct module *__module_text_address(unsigned long addr)
2595{ 2717{
2596 struct module *mod; 2718 struct module *mod;
2597 2719
2598 list_for_each_entry(mod, &modules, list) 2720 if (addr < module_addr_min || addr > module_addr_max)
2721 return NULL;
2722
2723 list_for_each_entry_rcu(mod, &modules, list)
2599 if (within(addr, mod->module_init, mod->init_text_size) 2724 if (within(addr, mod->module_init, mod->init_text_size)
2600 || within(addr, mod->module_core, mod->core_text_size)) 2725 || within(addr, mod->module_core, mod->core_text_size))
2601 return mod; 2726 return mod;
@@ -2620,8 +2745,11 @@ void print_modules(void)
2620 char buf[8]; 2745 char buf[8];
2621 2746
2622 printk("Modules linked in:"); 2747 printk("Modules linked in:");
2623 list_for_each_entry(mod, &modules, list) 2748 /* Most callers should already have preempt disabled, but make sure */
2749 preempt_disable();
2750 list_for_each_entry_rcu(mod, &modules, list)
2624 printk(" %s%s", mod->name, module_flags(mod, buf)); 2751 printk(" %s%s", mod->name, module_flags(mod, buf));
2752 preempt_enable();
2625 if (last_unloaded_module[0]) 2753 if (last_unloaded_module[0])
2626 printk(" [last unloaded: %s]", last_unloaded_module); 2754 printk(" [last unloaded: %s]", last_unloaded_module);
2627 printk("\n"); 2755 printk("\n");
@@ -2646,3 +2774,50 @@ void module_update_markers(void)
2646 mutex_unlock(&module_mutex); 2774 mutex_unlock(&module_mutex);
2647} 2775}
2648#endif 2776#endif
2777
2778#ifdef CONFIG_TRACEPOINTS
2779void module_update_tracepoints(void)
2780{
2781 struct module *mod;
2782
2783 mutex_lock(&module_mutex);
2784 list_for_each_entry(mod, &modules, list)
2785 if (!mod->taints)
2786 tracepoint_update_probe_range(mod->tracepoints,
2787 mod->tracepoints + mod->num_tracepoints);
2788 mutex_unlock(&module_mutex);
2789}
2790
2791/*
2792 * Returns 0 if current not found.
2793 * Returns 1 if current found.
2794 */
2795int module_get_iter_tracepoints(struct tracepoint_iter *iter)
2796{
2797 struct module *iter_mod;
2798 int found = 0;
2799
2800 mutex_lock(&module_mutex);
2801 list_for_each_entry(iter_mod, &modules, list) {
2802 if (!iter_mod->taints) {
2803 /*
2804 * Sorted module list
2805 */
2806 if (iter_mod < iter->module)
2807 continue;
2808 else if (iter_mod > iter->module)
2809 iter->tracepoint = NULL;
2810 found = tracepoint_get_iter_range(&iter->tracepoint,
2811 iter_mod->tracepoints,
2812 iter_mod->tracepoints
2813 + iter_mod->num_tracepoints);
2814 if (found) {
2815 iter->module = iter_mod;
2816 break;
2817 }
2818 }
2819 }
2820 mutex_unlock(&module_mutex);
2821 return found;
2822}
2823#endif
diff --git a/kernel/mutex.c b/kernel/mutex.c
index bcdc9ac8ef60..12c779dc65d4 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -34,6 +34,7 @@
34/*** 34/***
35 * mutex_init - initialize the mutex 35 * mutex_init - initialize the mutex
36 * @lock: the mutex to be initialized 36 * @lock: the mutex to be initialized
37 * @key: the lock_class_key for the class; used by mutex lock debugging
37 * 38 *
38 * Initialize the mutex to unlocked state. 39 * Initialize the mutex to unlocked state.
39 * 40 *
diff --git a/kernel/notifier.c b/kernel/notifier.c
index 823be11584ef..4282c0a40a57 100644
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -550,7 +550,7 @@ EXPORT_SYMBOL(unregister_reboot_notifier);
550 550
551static ATOMIC_NOTIFIER_HEAD(die_chain); 551static ATOMIC_NOTIFIER_HEAD(die_chain);
552 552
553int notify_die(enum die_val val, const char *str, 553int notrace notify_die(enum die_val val, const char *str,
554 struct pt_regs *regs, long err, int trap, int sig) 554 struct pt_regs *regs, long err, int trap, int sig)
555{ 555{
556 struct die_args args = { 556 struct die_args args = {
diff --git a/kernel/ns_cgroup.c b/kernel/ns_cgroup.c
index 48d7ed6fc3a4..43c2111cd54d 100644
--- a/kernel/ns_cgroup.c
+++ b/kernel/ns_cgroup.c
@@ -7,6 +7,7 @@
7#include <linux/module.h> 7#include <linux/module.h>
8#include <linux/cgroup.h> 8#include <linux/cgroup.h>
9#include <linux/fs.h> 9#include <linux/fs.h>
10#include <linux/proc_fs.h>
10#include <linux/slab.h> 11#include <linux/slab.h>
11#include <linux/nsproxy.h> 12#include <linux/nsproxy.h>
12 13
@@ -24,9 +25,12 @@ static inline struct ns_cgroup *cgroup_to_ns(
24 struct ns_cgroup, css); 25 struct ns_cgroup, css);
25} 26}
26 27
27int ns_cgroup_clone(struct task_struct *task) 28int ns_cgroup_clone(struct task_struct *task, struct pid *pid)
28{ 29{
29 return cgroup_clone(task, &ns_subsys); 30 char name[PROC_NUMBUF];
31
32 snprintf(name, PROC_NUMBUF, "%d", pid_vnr(pid));
33 return cgroup_clone(task, &ns_subsys, name);
30} 34}
31 35
32/* 36/*
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index adc785146a1c..1d3ef29a2583 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -14,7 +14,6 @@
14 */ 14 */
15 15
16#include <linux/module.h> 16#include <linux/module.h>
17#include <linux/version.h>
18#include <linux/nsproxy.h> 17#include <linux/nsproxy.h>
19#include <linux/init_task.h> 18#include <linux/init_task.h>
20#include <linux/mnt_namespace.h> 19#include <linux/mnt_namespace.h>
@@ -157,12 +156,6 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk)
157 goto out; 156 goto out;
158 } 157 }
159 158
160 err = ns_cgroup_clone(tsk);
161 if (err) {
162 put_nsproxy(new_ns);
163 goto out;
164 }
165
166 tsk->nsproxy = new_ns; 159 tsk->nsproxy = new_ns;
167 160
168out: 161out:
@@ -209,7 +202,7 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags,
209 goto out; 202 goto out;
210 } 203 }
211 204
212 err = ns_cgroup_clone(current); 205 err = ns_cgroup_clone(current, task_pid(current));
213 if (err) 206 if (err)
214 put_nsproxy(*new_nsp); 207 put_nsproxy(*new_nsp);
215 208
diff --git a/kernel/panic.c b/kernel/panic.c
index 425567f45b9f..6513aac8e992 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -23,7 +23,7 @@
23#include <linux/kallsyms.h> 23#include <linux/kallsyms.h>
24 24
25int panic_on_oops; 25int panic_on_oops;
26int tainted; 26static unsigned long tainted_mask;
27static int pause_on_oops; 27static int pause_on_oops;
28static int pause_on_oops_flag; 28static int pause_on_oops_flag;
29static DEFINE_SPINLOCK(pause_on_oops_lock); 29static DEFINE_SPINLOCK(pause_on_oops_lock);
@@ -34,13 +34,6 @@ ATOMIC_NOTIFIER_HEAD(panic_notifier_list);
34 34
35EXPORT_SYMBOL(panic_notifier_list); 35EXPORT_SYMBOL(panic_notifier_list);
36 36
37static int __init panic_setup(char *str)
38{
39 panic_timeout = simple_strtoul(str, NULL, 0);
40 return 1;
41}
42__setup("panic=", panic_setup);
43
44static long no_blink(long time) 37static long no_blink(long time)
45{ 38{
46 return 0; 39 return 0;
@@ -143,6 +136,27 @@ NORET_TYPE void panic(const char * fmt, ...)
143 136
144EXPORT_SYMBOL(panic); 137EXPORT_SYMBOL(panic);
145 138
139
140struct tnt {
141 u8 bit;
142 char true;
143 char false;
144};
145
146static const struct tnt tnts[] = {
147 { TAINT_PROPRIETARY_MODULE, 'P', 'G' },
148 { TAINT_FORCED_MODULE, 'F', ' ' },
149 { TAINT_UNSAFE_SMP, 'S', ' ' },
150 { TAINT_FORCED_RMMOD, 'R', ' ' },
151 { TAINT_MACHINE_CHECK, 'M', ' ' },
152 { TAINT_BAD_PAGE, 'B', ' ' },
153 { TAINT_USER, 'U', ' ' },
154 { TAINT_DIE, 'D', ' ' },
155 { TAINT_OVERRIDDEN_ACPI_TABLE, 'A', ' ' },
156 { TAINT_WARN, 'W', ' ' },
157 { TAINT_CRAP, 'C', ' ' },
158};
159
146/** 160/**
147 * print_tainted - return a string to represent the kernel taint state. 161 * print_tainted - return a string to represent the kernel taint state.
148 * 162 *
@@ -155,44 +169,47 @@ EXPORT_SYMBOL(panic);
155 * 'U' - Userspace-defined naughtiness. 169 * 'U' - Userspace-defined naughtiness.
156 * 'A' - ACPI table overridden. 170 * 'A' - ACPI table overridden.
157 * 'W' - Taint on warning. 171 * 'W' - Taint on warning.
172 * 'C' - modules from drivers/staging are loaded.
158 * 173 *
 159 * The string is overwritten by the next call to print_tainted(). 174 * The string is overwritten by the next call to print_tainted().
160 */ 175 */
161
162const char *print_tainted(void) 176const char *print_tainted(void)
163{ 177{
164 static char buf[20]; 178 static char buf[ARRAY_SIZE(tnts) + sizeof("Tainted: ") + 1];
165 if (tainted) { 179
166 snprintf(buf, sizeof(buf), "Tainted: %c%c%c%c%c%c%c%c%c%c", 180 if (tainted_mask) {
167 tainted & TAINT_PROPRIETARY_MODULE ? 'P' : 'G', 181 char *s;
168 tainted & TAINT_FORCED_MODULE ? 'F' : ' ', 182 int i;
169 tainted & TAINT_UNSAFE_SMP ? 'S' : ' ', 183
170 tainted & TAINT_FORCED_RMMOD ? 'R' : ' ', 184 s = buf + sprintf(buf, "Tainted: ");
171 tainted & TAINT_MACHINE_CHECK ? 'M' : ' ', 185 for (i = 0; i < ARRAY_SIZE(tnts); i++) {
172 tainted & TAINT_BAD_PAGE ? 'B' : ' ', 186 const struct tnt *t = &tnts[i];
173 tainted & TAINT_USER ? 'U' : ' ', 187 *s++ = test_bit(t->bit, &tainted_mask) ?
174 tainted & TAINT_DIE ? 'D' : ' ', 188 t->true : t->false;
175 tainted & TAINT_OVERRIDDEN_ACPI_TABLE ? 'A' : ' ', 189 }
176 tainted & TAINT_WARN ? 'W' : ' '); 190 *s = 0;
177 } 191 } else
178 else
179 snprintf(buf, sizeof(buf), "Not tainted"); 192 snprintf(buf, sizeof(buf), "Not tainted");
180 return(buf); 193 return(buf);
181} 194}
182 195
183void add_taint(unsigned flag) 196int test_taint(unsigned flag)
184{ 197{
185 debug_locks = 0; /* can't trust the integrity of the kernel anymore */ 198 return test_bit(flag, &tainted_mask);
186 tainted |= flag;
187} 199}
188EXPORT_SYMBOL(add_taint); 200EXPORT_SYMBOL(test_taint);
189 201
190static int __init pause_on_oops_setup(char *str) 202unsigned long get_taint(void)
191{ 203{
192 pause_on_oops = simple_strtoul(str, NULL, 0); 204 return tainted_mask;
193 return 1;
194} 205}
195__setup("pause_on_oops=", pause_on_oops_setup); 206
207void add_taint(unsigned flag)
208{
209 debug_locks = 0; /* can't trust the integrity of the kernel anymore */
210 set_bit(flag, &tainted_mask);
211}
212EXPORT_SYMBOL(add_taint);
196 213
197static void spin_msec(int msecs) 214static void spin_msec(int msecs)
198{ 215{
@@ -318,6 +335,28 @@ void warn_on_slowpath(const char *file, int line)
318 add_taint(TAINT_WARN); 335 add_taint(TAINT_WARN);
319} 336}
320EXPORT_SYMBOL(warn_on_slowpath); 337EXPORT_SYMBOL(warn_on_slowpath);
338
339
340void warn_slowpath(const char *file, int line, const char *fmt, ...)
341{
342 va_list args;
343 char function[KSYM_SYMBOL_LEN];
344 unsigned long caller = (unsigned long)__builtin_return_address(0);
345 sprint_symbol(function, caller);
346
347 printk(KERN_WARNING "------------[ cut here ]------------\n");
348 printk(KERN_WARNING "WARNING: at %s:%d %s()\n", file,
349 line, function);
350 va_start(args, fmt);
351 vprintk(fmt, args);
352 va_end(args);
353
354 print_modules();
355 dump_stack();
356 print_oops_end_marker();
357 add_taint(TAINT_WARN);
358}
359EXPORT_SYMBOL(warn_slowpath);
321#endif 360#endif
322 361
323#ifdef CONFIG_CC_STACKPROTECTOR 362#ifdef CONFIG_CC_STACKPROTECTOR
@@ -331,3 +370,6 @@ void __stack_chk_fail(void)
331} 370}
332EXPORT_SYMBOL(__stack_chk_fail); 371EXPORT_SYMBOL(__stack_chk_fail);
333#endif 372#endif
373
374core_param(panic, panic_timeout, int, 0644);
375core_param(pause_on_oops, pause_on_oops, int, 0644);
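
The print_tainted() rewrite above swaps a hard-coded format string for a table of { bit, set-character, clear-character } entries walked with test_bit(), and sizes the output buffer from ARRAY_SIZE() of that table. The userspace sketch below shows the same table-driven construction; the flag names are hypothetical and a plain bitmask stands in for the kernel's atomic bit operations.

#include <stdio.h>

enum { FLAG_PROPRIETARY, FLAG_FORCED, FLAG_WARNED, NR_FLAGS };

struct flag_char {
	unsigned bit;
	char set;	/* printed when the bit is set   */
	char clear;	/* printed when the bit is clear */
};

static const struct flag_char table[] = {
	{ FLAG_PROPRIETARY, 'P', 'G' },
	{ FLAG_FORCED,      'F', ' ' },
	{ FLAG_WARNED,      'W', ' ' },
};

static const char *flags_string(unsigned long mask)
{
	/* Buffer is sized from the table, like the kernel's ARRAY_SIZE(tnts). */
	static char buf[sizeof("Tainted: ") + NR_FLAGS + 1];
	char *s;

	if (!mask)
		return "Not tainted";

	s = buf + sprintf(buf, "Tainted: ");
	for (size_t i = 0; i < sizeof(table) / sizeof(table[0]); i++)
		*s++ = (mask & (1UL << table[i].bit)) ? table[i].set
						      : table[i].clear;
	*s = '\0';
	return buf;
}

int main(void)
{
	printf("%s\n",
	       flags_string((1UL << FLAG_PROPRIETARY) | (1UL << FLAG_WARNED)));
	return 0;
}
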
diff --git a/kernel/params.c b/kernel/params.c
index afc46a23eb6d..a1e3025b19a9 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -373,6 +373,8 @@ int param_get_string(char *buffer, struct kernel_param *kp)
373} 373}
374 374
375/* sysfs output in /sys/modules/XYZ/parameters/ */ 375/* sysfs output in /sys/modules/XYZ/parameters/ */
376#define to_module_attr(n) container_of(n, struct module_attribute, attr);
377#define to_module_kobject(n) container_of(n, struct module_kobject, kobj);
376 378
377extern struct kernel_param __start___param[], __stop___param[]; 379extern struct kernel_param __start___param[], __stop___param[];
378 380
@@ -384,6 +386,7 @@ struct param_attribute
384 386
385struct module_param_attrs 387struct module_param_attrs
386{ 388{
389 unsigned int num;
387 struct attribute_group grp; 390 struct attribute_group grp;
388 struct param_attribute attrs[0]; 391 struct param_attribute attrs[0];
389}; 392};
@@ -434,93 +437,120 @@ static ssize_t param_attr_store(struct module_attribute *mattr,
434 437
435#ifdef CONFIG_SYSFS 438#ifdef CONFIG_SYSFS
436/* 439/*
437 * param_sysfs_setup - setup sysfs support for one module or KBUILD_MODNAME 440 * add_sysfs_param - add a parameter to sysfs
438 * @mk: struct module_kobject (contains parent kobject) 441 * @mk: struct module_kobject
439 * @kparam: array of struct kernel_param, the actual parameter definitions 442 * @kparam: the actual parameter definition to add to sysfs
440 * @num_params: number of entries in array 443 * @name: name of parameter
441 * @name_skip: offset where the parameter name start in kparam[].name. Needed for built-in "modules"
442 * 444 *
 443 * Create a kobject for a (per-module) group of parameters, and create files 445 * Create a kobject for a (per-module) parameter if mk->mp is NULL, and
 444 * in sysfs. A pointer to the param_kobject is returned on success, 446 * create a file in sysfs. Returns an error on out of memory. Always cleans
 445 * NULL if there's no parameter to export, or other ERR_PTR(err). 447 * up if there's an error.
446 */ 448 */
447static __modinit struct module_param_attrs * 449static __modinit int add_sysfs_param(struct module_kobject *mk,
448param_sysfs_setup(struct module_kobject *mk, 450 struct kernel_param *kp,
449 struct kernel_param *kparam, 451 const char *name)
450 unsigned int num_params,
451 unsigned int name_skip)
452{ 452{
453 struct module_param_attrs *mp; 453 struct module_param_attrs *new;
454 unsigned int valid_attrs = 0; 454 struct attribute **attrs;
455 unsigned int i, size[2]; 455 int err, num;
456 struct param_attribute *pattr; 456
457 struct attribute **gattr; 457 /* We don't bother calling this with invisible parameters. */
458 int err; 458 BUG_ON(!kp->perm);
459 459
460 for (i=0; i<num_params; i++) { 460 if (!mk->mp) {
461 if (kparam[i].perm) 461 num = 0;
462 valid_attrs++; 462 attrs = NULL;
463 } else {
464 num = mk->mp->num;
465 attrs = mk->mp->grp.attrs;
463 } 466 }
464 467
465 if (!valid_attrs) 468 /* Enlarge. */
466 return NULL; 469 new = krealloc(mk->mp,
467 470 sizeof(*mk->mp) + sizeof(mk->mp->attrs[0]) * (num+1),
468 size[0] = ALIGN(sizeof(*mp) + 471 GFP_KERNEL);
469 valid_attrs * sizeof(mp->attrs[0]), 472 if (!new) {
470 sizeof(mp->grp.attrs[0])); 473 kfree(mk->mp);
471 size[1] = (valid_attrs + 1) * sizeof(mp->grp.attrs[0]); 474 err = -ENOMEM;
472 475 goto fail;
473 mp = kzalloc(size[0] + size[1], GFP_KERNEL);
474 if (!mp)
475 return ERR_PTR(-ENOMEM);
476
477 mp->grp.name = "parameters";
478 mp->grp.attrs = (void *)mp + size[0];
479
480 pattr = &mp->attrs[0];
481 gattr = &mp->grp.attrs[0];
482 for (i = 0; i < num_params; i++) {
483 struct kernel_param *kp = &kparam[i];
484 if (kp->perm) {
485 pattr->param = kp;
486 pattr->mattr.show = param_attr_show;
487 pattr->mattr.store = param_attr_store;
488 pattr->mattr.attr.name = (char *)&kp->name[name_skip];
489 pattr->mattr.attr.mode = kp->perm;
490 *(gattr++) = &(pattr++)->mattr.attr;
491 }
492 } 476 }
493 *gattr = NULL; 477 attrs = krealloc(attrs, sizeof(new->grp.attrs[0])*(num+2), GFP_KERNEL);
494 478 if (!attrs) {
495 if ((err = sysfs_create_group(&mk->kobj, &mp->grp))) { 479 err = -ENOMEM;
496 kfree(mp); 480 goto fail_free_new;
497 return ERR_PTR(err);
498 } 481 }
499 return mp; 482
483 /* Sysfs wants everything zeroed. */
484 memset(new, 0, sizeof(*new));
485 memset(&new->attrs[num], 0, sizeof(new->attrs[num]));
486 memset(&attrs[num], 0, sizeof(attrs[num]));
487 new->grp.name = "parameters";
488 new->grp.attrs = attrs;
489
490 /* Tack new one on the end. */
491 new->attrs[num].param = kp;
492 new->attrs[num].mattr.show = param_attr_show;
493 new->attrs[num].mattr.store = param_attr_store;
494 new->attrs[num].mattr.attr.name = (char *)name;
495 new->attrs[num].mattr.attr.mode = kp->perm;
496 new->num = num+1;
497
498 /* Fix up all the pointers, since krealloc can move us */
499 for (num = 0; num < new->num; num++)
500 new->grp.attrs[num] = &new->attrs[num].mattr.attr;
501 new->grp.attrs[num] = NULL;
502
503 mk->mp = new;
504 return 0;
505
506fail_free_new:
507 kfree(new);
508fail:
509 mk->mp = NULL;
510 return err;
500} 511}
501 512
502#ifdef CONFIG_MODULES 513#ifdef CONFIG_MODULES
514static void free_module_param_attrs(struct module_kobject *mk)
515{
516 kfree(mk->mp->grp.attrs);
517 kfree(mk->mp);
518 mk->mp = NULL;
519}
520
503/* 521/*
504 * module_param_sysfs_setup - setup sysfs support for one module 522 * module_param_sysfs_setup - setup sysfs support for one module
505 * @mod: module 523 * @mod: module
506 * @kparam: module parameters (array) 524 * @kparam: module parameters (array)
507 * @num_params: number of module parameters 525 * @num_params: number of module parameters
508 * 526 *
509 * Adds sysfs entries for module parameters, and creates a link from 527 * Adds sysfs entries for module parameters under
510 * /sys/module/[mod->name]/parameters to /sys/parameters/[mod->name]/ 528 * /sys/module/[mod->name]/parameters/
511 */ 529 */
512int module_param_sysfs_setup(struct module *mod, 530int module_param_sysfs_setup(struct module *mod,
513 struct kernel_param *kparam, 531 struct kernel_param *kparam,
514 unsigned int num_params) 532 unsigned int num_params)
515{ 533{
516 struct module_param_attrs *mp; 534 int i, err;
535 bool params = false;
536
537 for (i = 0; i < num_params; i++) {
538 if (kparam[i].perm == 0)
539 continue;
540 err = add_sysfs_param(&mod->mkobj, &kparam[i], kparam[i].name);
541 if (err)
542 return err;
543 params = true;
544 }
517 545
518 mp = param_sysfs_setup(&mod->mkobj, kparam, num_params, 0); 546 if (!params)
519 if (IS_ERR(mp)) 547 return 0;
520 return PTR_ERR(mp);
521 548
522 mod->param_attrs = mp; 549 /* Create the param group. */
523 return 0; 550 err = sysfs_create_group(&mod->mkobj.kobj, &mod->mkobj.mp->grp);
551 if (err)
552 free_module_param_attrs(&mod->mkobj);
553 return err;
524} 554}
525 555
526/* 556/*
@@ -532,43 +562,55 @@ int module_param_sysfs_setup(struct module *mod,
532 */ 562 */
533void module_param_sysfs_remove(struct module *mod) 563void module_param_sysfs_remove(struct module *mod)
534{ 564{
535 if (mod->param_attrs) { 565 if (mod->mkobj.mp) {
536 sysfs_remove_group(&mod->mkobj.kobj, 566 sysfs_remove_group(&mod->mkobj.kobj, &mod->mkobj.mp->grp);
537 &mod->param_attrs->grp);
538 /* We are positive that no one is using any param 567 /* We are positive that no one is using any param
539 * attrs at this point. Deallocate immediately. */ 568 * attrs at this point. Deallocate immediately. */
540 kfree(mod->param_attrs); 569 free_module_param_attrs(&mod->mkobj);
541 mod->param_attrs = NULL;
542 } 570 }
543} 571}
544#endif 572#endif
545 573
546/* 574static void __init kernel_add_sysfs_param(const char *name,
547 * kernel_param_sysfs_setup - wrapper for built-in params support 575 struct kernel_param *kparam,
548 */ 576 unsigned int name_skip)
549static void __init kernel_param_sysfs_setup(const char *name,
550 struct kernel_param *kparam,
551 unsigned int num_params,
552 unsigned int name_skip)
553{ 577{
554 struct module_kobject *mk; 578 struct module_kobject *mk;
555 int ret; 579 struct kobject *kobj;
580 int err;
556 581
557 mk = kzalloc(sizeof(struct module_kobject), GFP_KERNEL); 582 kobj = kset_find_obj(module_kset, name);
558 BUG_ON(!mk); 583 if (kobj) {
559 584 /* We already have one. Remove params so we can add more. */
560 mk->mod = THIS_MODULE; 585 mk = to_module_kobject(kobj);
561 mk->kobj.kset = module_kset; 586 /* We need to remove it before adding parameters. */
562 ret = kobject_init_and_add(&mk->kobj, &module_ktype, NULL, "%s", name); 587 sysfs_remove_group(&mk->kobj, &mk->mp->grp);
563 if (ret) { 588 } else {
564 kobject_put(&mk->kobj); 589 mk = kzalloc(sizeof(struct module_kobject), GFP_KERNEL);
565 printk(KERN_ERR "Module '%s' failed to be added to sysfs, " 590 BUG_ON(!mk);
566 "error number %d\n", name, ret); 591
567 printk(KERN_ERR "The system will be unstable now.\n"); 592 mk->mod = THIS_MODULE;
568 return; 593 mk->kobj.kset = module_kset;
594 err = kobject_init_and_add(&mk->kobj, &module_ktype, NULL,
595 "%s", name);
596 if (err) {
597 kobject_put(&mk->kobj);
 598 printk(KERN_ERR "Module '%s' failed to add to sysfs, "
599 "error number %d\n", name, err);
600 printk(KERN_ERR "The system will be unstable now.\n");
601 return;
602 }
 603 /* Take a reference so the kobject_put() below is balanced in both branches. */
604 kobject_get(&mk->kobj);
569 } 605 }
570 param_sysfs_setup(mk, kparam, num_params, name_skip); 606
607 /* These should not fail at boot. */
608 err = add_sysfs_param(mk, kparam, kparam->name + name_skip);
609 BUG_ON(err);
610 err = sysfs_create_group(&mk->kobj, &mk->mp->grp);
611 BUG_ON(err);
571 kobject_uevent(&mk->kobj, KOBJ_ADD); 612 kobject_uevent(&mk->kobj, KOBJ_ADD);
613 kobject_put(&mk->kobj);
572} 614}
573 615
574/* 616/*
@@ -579,60 +621,36 @@ static void __init kernel_param_sysfs_setup(const char *name,
579 * The "module" name (KBUILD_MODNAME) is stored before a dot, the 621 * The "module" name (KBUILD_MODNAME) is stored before a dot, the
580 * "parameter" name is stored behind a dot in kernel_param->name. So, 622 * "parameter" name is stored behind a dot in kernel_param->name. So,
581 * extract the "module" name for all built-in kernel_param-eters, 623 * extract the "module" name for all built-in kernel_param-eters,
582 * and for all who have the same, call kernel_param_sysfs_setup. 624 * and for all who have the same, call kernel_add_sysfs_param.
583 */ 625 */
584static void __init param_sysfs_builtin(void) 626static void __init param_sysfs_builtin(void)
585{ 627{
586 struct kernel_param *kp, *kp_begin = NULL; 628 struct kernel_param *kp;
587 unsigned int i, name_len, count = 0; 629 unsigned int name_len;
588 char modname[MODULE_NAME_LEN + 1] = ""; 630 char modname[MODULE_NAME_LEN];
589 631
590 for (i=0; i < __stop___param - __start___param; i++) { 632 for (kp = __start___param; kp < __stop___param; kp++) {
591 char *dot; 633 char *dot;
592 size_t max_name_len;
593 634
594 kp = &__start___param[i]; 635 if (kp->perm == 0)
595 max_name_len = 636 continue;
596 min_t(size_t, MODULE_NAME_LEN, strlen(kp->name));
597 637
598 dot = memchr(kp->name, '.', max_name_len); 638 dot = strchr(kp->name, '.');
599 if (!dot) { 639 if (!dot) {
600 DEBUGP("couldn't find period in first %d characters " 640 /* This happens for core_param() */
601 "of %s\n", MODULE_NAME_LEN, kp->name); 641 strcpy(modname, "kernel");
602 continue; 642 name_len = 0;
603 } 643 } else {
604 name_len = dot - kp->name; 644 name_len = dot - kp->name + 1;
605 645 strlcpy(modname, kp->name, name_len);
606 /* new kbuild_modname? */
607 if (strlen(modname) != name_len
608 || strncmp(modname, kp->name, name_len) != 0) {
609 /* add a new kobject for previous kernel_params. */
610 if (count)
611 kernel_param_sysfs_setup(modname,
612 kp_begin,
613 count,
614 strlen(modname)+1);
615
616 strncpy(modname, kp->name, name_len);
617 modname[name_len] = '\0';
618 count = 0;
619 kp_begin = kp;
620 } 646 }
621 count++; 647 kernel_add_sysfs_param(modname, kp, name_len);
622 } 648 }
623
624 /* last kernel_params need to be registered as well */
625 if (count)
626 kernel_param_sysfs_setup(modname, kp_begin, count,
627 strlen(modname)+1);
628} 649}
629 650
630 651
631/* module-related sysfs stuff */ 652/* module-related sysfs stuff */
632 653
633#define to_module_attr(n) container_of(n, struct module_attribute, attr);
634#define to_module_kobject(n) container_of(n, struct module_kobject, kobj);
635
636static ssize_t module_attr_show(struct kobject *kobj, 654static ssize_t module_attr_show(struct kobject *kobj,
637 struct attribute *attr, 655 struct attribute *attr,
638 char *buf) 656 char *buf)
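
add_sysfs_param() above grows both the attribute array inside module_param_attrs and the NULL-terminated attribute-pointer array with krealloc(), then walks the whole thing to rebuild every pointer, since krealloc() may have moved the block. A minimal userspace sketch of that grow-and-refixup pattern follows; the attr/group types are made up for illustration, and the kernel's error cleanup and sysfs zeroing are left out.

#include <stdio.h>
#include <stdlib.h>

struct attr { const char *name; };

struct group {
	size_t num;
	struct attr *attrs;	/* the elements themselves       */
	struct attr **ptrs;	/* NULL-terminated pointer array */
};

static int group_add(struct group *g, const char *name)
{
	struct attr *attrs;
	struct attr **ptrs;
	size_t i;

	/* Enlarge the element array by one. */
	attrs = realloc(g->attrs, sizeof(*attrs) * (g->num + 1));
	if (!attrs)
		return -1;
	g->attrs = attrs;

	/* Enlarge the pointer array by one, plus room for the terminator. */
	ptrs = realloc(g->ptrs, sizeof(*ptrs) * (g->num + 2));
	if (!ptrs)
		return -1;
	g->ptrs = ptrs;

	/* Tack the new element on the end. */
	g->attrs[g->num].name = name;
	g->num++;

	/* realloc() may have moved g->attrs: rebuild every pointer. */
	for (i = 0; i < g->num; i++)
		g->ptrs[i] = &g->attrs[i];
	g->ptrs[i] = NULL;
	return 0;
}

int main(void)
{
	struct group g = { 0, NULL, NULL };

	group_add(&g, "alpha");
	group_add(&g, "beta");
	for (struct attr **p = g.ptrs; *p; p++)
		printf("%s\n", (*p)->name);
	free(g.attrs);
	free(g.ptrs);
	return 0;
}
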
diff --git a/kernel/pid.c b/kernel/pid.c
index 30bd5d4b2ac7..064e76afa507 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -309,12 +309,6 @@ struct pid *find_vpid(int nr)
309} 309}
310EXPORT_SYMBOL_GPL(find_vpid); 310EXPORT_SYMBOL_GPL(find_vpid);
311 311
312struct pid *find_pid(int nr)
313{
314 return find_pid_ns(nr, &init_pid_ns);
315}
316EXPORT_SYMBOL_GPL(find_pid);
317
318/* 312/*
319 * attach_pid() must be called with the tasklist_lock write-held. 313 * attach_pid() must be called with the tasklist_lock write-held.
320 */ 314 */
@@ -435,6 +429,7 @@ struct pid *find_get_pid(pid_t nr)
435 429
436 return pid; 430 return pid;
437} 431}
432EXPORT_SYMBOL_GPL(find_get_pid);
438 433
439pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns) 434pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns)
440{ 435{
@@ -482,7 +477,7 @@ EXPORT_SYMBOL(task_session_nr_ns);
482/* 477/*
483 * Used by proc to find the first pid that is greater then or equal to nr. 478 * Used by proc to find the first pid that is greater then or equal to nr.
484 * 479 *
485 * If there is a pid at nr this function is exactly the same as find_pid. 480 * If there is a pid at nr this function is exactly the same as find_pid_ns.
486 */ 481 */
487struct pid *find_ge_pid(int nr, struct pid_namespace *ns) 482struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
488{ 483{
@@ -497,7 +492,6 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
497 492
498 return pid; 493 return pid;
499} 494}
500EXPORT_SYMBOL_GPL(find_get_pid);
501 495
502/* 496/*
503 * The pid hash table is scaled according to the amount of memory in the 497 * The pid hash table is scaled according to the amount of memory in the
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 98702b4b8851..fab8ea86fac3 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -12,6 +12,7 @@
12#include <linux/pid_namespace.h> 12#include <linux/pid_namespace.h>
13#include <linux/syscalls.h> 13#include <linux/syscalls.h>
14#include <linux/err.h> 14#include <linux/err.h>
15#include <linux/acct.h>
15 16
16#define BITS_PER_PAGE (PAGE_SIZE*8) 17#define BITS_PER_PAGE (PAGE_SIZE*8)
17 18
@@ -71,7 +72,7 @@ static struct pid_namespace *create_pid_namespace(unsigned int level)
71 struct pid_namespace *ns; 72 struct pid_namespace *ns;
72 int i; 73 int i;
73 74
74 ns = kmem_cache_alloc(pid_ns_cachep, GFP_KERNEL); 75 ns = kmem_cache_zalloc(pid_ns_cachep, GFP_KERNEL);
75 if (ns == NULL) 76 if (ns == NULL)
76 goto out; 77 goto out;
77 78
@@ -84,17 +85,13 @@ static struct pid_namespace *create_pid_namespace(unsigned int level)
84 goto out_free_map; 85 goto out_free_map;
85 86
86 kref_init(&ns->kref); 87 kref_init(&ns->kref);
87 ns->last_pid = 0;
88 ns->child_reaper = NULL;
89 ns->level = level; 88 ns->level = level;
90 89
91 set_bit(0, ns->pidmap[0].page); 90 set_bit(0, ns->pidmap[0].page);
92 atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1); 91 atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1);
93 92
94 for (i = 1; i < PIDMAP_ENTRIES; i++) { 93 for (i = 1; i < PIDMAP_ENTRIES; i++)
95 ns->pidmap[i].page = NULL;
96 atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE); 94 atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE);
97 }
98 95
99 return ns; 96 return ns;
100 97
@@ -182,9 +179,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
182 rc = sys_wait4(-1, NULL, __WALL, NULL); 179 rc = sys_wait4(-1, NULL, __WALL, NULL);
183 } while (rc != -ECHILD); 180 } while (rc != -ECHILD);
184 181
185 182 acct_exit_ns(pid_ns);
186 /* Child reaper for the pid namespace is going away */
187 pid_ns->child_reaper = NULL;
188 return; 183 return;
189} 184}
190 185
diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c
index 8cb757026386..dfdec524d1b7 100644
--- a/kernel/pm_qos_params.c
+++ b/kernel/pm_qos_params.c
@@ -24,7 +24,7 @@
24 * requirement that the application has is cleaned up when closes the file 24 * requirement that the application has is cleaned up when closes the file
25 * pointer or exits the pm_qos_object will get an opportunity to clean up. 25 * pointer or exits the pm_qos_object will get an opportunity to clean up.
26 * 26 *
27 * mark gross mgross@linux.intel.com 27 * Mark Gross <mgross@linux.intel.com>
28 */ 28 */
29 29
30#include <linux/pm_qos_params.h> 30#include <linux/pm_qos_params.h>
@@ -43,7 +43,7 @@
43#include <linux/uaccess.h> 43#include <linux/uaccess.h>
44 44
45/* 45/*
46 * locking rule: all changes to target_value or requirements or notifiers lists 46 * locking rule: all changes to requirements or notifiers lists
47 * or pm_qos_object list and pm_qos_objects need to happen with pm_qos_lock 47 * or pm_qos_object list and pm_qos_objects need to happen with pm_qos_lock
48 * held, taken with _irqsave. One lock to rule them all 48 * held, taken with _irqsave. One lock to rule them all
49 */ 49 */
@@ -66,7 +66,7 @@ struct pm_qos_object {
66 struct miscdevice pm_qos_power_miscdev; 66 struct miscdevice pm_qos_power_miscdev;
67 char *name; 67 char *name;
68 s32 default_value; 68 s32 default_value;
69 s32 target_value; 69 atomic_t target_value;
70 s32 (*comparitor)(s32, s32); 70 s32 (*comparitor)(s32, s32);
71}; 71};
72 72
@@ -77,7 +77,7 @@ static struct pm_qos_object cpu_dma_pm_qos = {
77 .notifiers = &cpu_dma_lat_notifier, 77 .notifiers = &cpu_dma_lat_notifier,
78 .name = "cpu_dma_latency", 78 .name = "cpu_dma_latency",
79 .default_value = 2000 * USEC_PER_SEC, 79 .default_value = 2000 * USEC_PER_SEC,
80 .target_value = 2000 * USEC_PER_SEC, 80 .target_value = ATOMIC_INIT(2000 * USEC_PER_SEC),
81 .comparitor = min_compare 81 .comparitor = min_compare
82}; 82};
83 83
@@ -87,7 +87,7 @@ static struct pm_qos_object network_lat_pm_qos = {
87 .notifiers = &network_lat_notifier, 87 .notifiers = &network_lat_notifier,
88 .name = "network_latency", 88 .name = "network_latency",
89 .default_value = 2000 * USEC_PER_SEC, 89 .default_value = 2000 * USEC_PER_SEC,
90 .target_value = 2000 * USEC_PER_SEC, 90 .target_value = ATOMIC_INIT(2000 * USEC_PER_SEC),
91 .comparitor = min_compare 91 .comparitor = min_compare
92}; 92};
93 93
@@ -99,7 +99,7 @@ static struct pm_qos_object network_throughput_pm_qos = {
99 .notifiers = &network_throughput_notifier, 99 .notifiers = &network_throughput_notifier,
100 .name = "network_throughput", 100 .name = "network_throughput",
101 .default_value = 0, 101 .default_value = 0,
102 .target_value = 0, 102 .target_value = ATOMIC_INIT(0),
103 .comparitor = max_compare 103 .comparitor = max_compare
104}; 104};
105 105
@@ -150,11 +150,11 @@ static void update_target(int target)
150 extreme_value = pm_qos_array[target]->comparitor( 150 extreme_value = pm_qos_array[target]->comparitor(
151 extreme_value, node->value); 151 extreme_value, node->value);
152 } 152 }
153 if (pm_qos_array[target]->target_value != extreme_value) { 153 if (atomic_read(&pm_qos_array[target]->target_value) != extreme_value) {
154 call_notifier = 1; 154 call_notifier = 1;
155 pm_qos_array[target]->target_value = extreme_value; 155 atomic_set(&pm_qos_array[target]->target_value, extreme_value);
156 pr_debug(KERN_ERR "new target for qos %d is %d\n", target, 156 pr_debug(KERN_ERR "new target for qos %d is %d\n", target,
157 pm_qos_array[target]->target_value); 157 atomic_read(&pm_qos_array[target]->target_value));
158 } 158 }
159 spin_unlock_irqrestore(&pm_qos_lock, flags); 159 spin_unlock_irqrestore(&pm_qos_lock, flags);
160 160
@@ -193,14 +193,7 @@ static int find_pm_qos_object_by_minor(int minor)
193 */ 193 */
194int pm_qos_requirement(int pm_qos_class) 194int pm_qos_requirement(int pm_qos_class)
195{ 195{
196 int ret_val; 196 return atomic_read(&pm_qos_array[pm_qos_class]->target_value);
197 unsigned long flags;
198
199 spin_lock_irqsave(&pm_qos_lock, flags);
200 ret_val = pm_qos_array[pm_qos_class]->target_value;
201 spin_unlock_irqrestore(&pm_qos_lock, flags);
202
203 return ret_val;
204} 197}
205EXPORT_SYMBOL_GPL(pm_qos_requirement); 198EXPORT_SYMBOL_GPL(pm_qos_requirement);
206 199
@@ -211,8 +204,8 @@ EXPORT_SYMBOL_GPL(pm_qos_requirement);
211 * @value: defines the qos request 204 * @value: defines the qos request
212 * 205 *
213 * This function inserts a new entry in the pm_qos_class list of requested qos 206 * This function inserts a new entry in the pm_qos_class list of requested qos
214 * performance charactoistics. It recomputes the agregate QoS expectations for 207 * performance characteristics. It recomputes the aggregate QoS expectations
215 * the pm_qos_class of parrameters. 208 * for the pm_qos_class of parameters.
216 */ 209 */
217int pm_qos_add_requirement(int pm_qos_class, char *name, s32 value) 210int pm_qos_add_requirement(int pm_qos_class, char *name, s32 value)
218{ 211{
@@ -250,10 +243,10 @@ EXPORT_SYMBOL_GPL(pm_qos_add_requirement);
250 * @name: identifies the request 243 * @name: identifies the request
251 * @value: defines the qos request 244 * @value: defines the qos request
252 * 245 *
253 * Updates an existing qos requierement for the pm_qos_class of parameters along 246 * Updates an existing qos requirement for the pm_qos_class of parameters along
254 * with updating the target pm_qos_class value. 247 * with updating the target pm_qos_class value.
255 * 248 *
256 * If the named request isn't in the lest then no change is made. 249 * If the named request isn't in the list then no change is made.
257 */ 250 */
258int pm_qos_update_requirement(int pm_qos_class, char *name, s32 new_value) 251int pm_qos_update_requirement(int pm_qos_class, char *name, s32 new_value)
259{ 252{
@@ -287,7 +280,7 @@ EXPORT_SYMBOL_GPL(pm_qos_update_requirement);
 287 * @pm_qos_class: identifies which list of qos requests to use 280 * @pm_qos_class: identifies which list of qos requests to use
288 * @name: identifies the request 281 * @name: identifies the request
289 * 282 *
290 * Will remove named qos request from pm_qos_class list of parrameters and 283 * Will remove named qos request from pm_qos_class list of parameters and
291 * recompute the current target value for the pm_qos_class. 284 * recompute the current target value for the pm_qos_class.
292 */ 285 */
293void pm_qos_remove_requirement(int pm_qos_class, char *name) 286void pm_qos_remove_requirement(int pm_qos_class, char *name)
@@ -319,7 +312,7 @@ EXPORT_SYMBOL_GPL(pm_qos_remove_requirement);
319 * @notifier: notifier block managed by caller. 312 * @notifier: notifier block managed by caller.
320 * 313 *
321 * will register the notifier into a notification chain that gets called 314 * will register the notifier into a notification chain that gets called
322 * uppon changes to the pm_qos_class target value. 315 * upon changes to the pm_qos_class target value.
323 */ 316 */
324 int pm_qos_add_notifier(int pm_qos_class, struct notifier_block *notifier) 317 int pm_qos_add_notifier(int pm_qos_class, struct notifier_block *notifier)
325{ 318{
@@ -338,7 +331,7 @@ EXPORT_SYMBOL_GPL(pm_qos_add_notifier);
338 * @notifier: notifier block to be removed. 331 * @notifier: notifier block to be removed.
339 * 332 *
340 * will remove the notifier from the notification chain that gets called 333 * will remove the notifier from the notification chain that gets called
341 * uppon changes to the pm_qos_class target value. 334 * upon changes to the pm_qos_class target value.
342 */ 335 */
343int pm_qos_remove_notifier(int pm_qos_class, struct notifier_block *notifier) 336int pm_qos_remove_notifier(int pm_qos_class, struct notifier_block *notifier)
344{ 337{
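
The pm_qos change above makes target_value an atomic_t so that pm_qos_requirement() becomes a single atomic_read() with no spinlock on the read side; writers still walk the request list under pm_qos_lock and only publish the recomputed aggregate atomically. A C11 userspace sketch of that read-mostly split (the names are hypothetical and the request-list locking is elided):

#include <stdatomic.h>
#include <stdio.h>

/* Published aggregate value; readers never take a lock to see it. */
static atomic_int target_value = 2000000;

/* Read side: what pm_qos_requirement() turns into - one atomic load. */
static int read_target(void)
{
	return atomic_load(&target_value);
}

/* Write side: recompute under whatever lock guards the request list,
 * then publish only the final result. */
static void publish_target(int extreme)
{
	if (atomic_load(&target_value) != extreme)
		atomic_store(&target_value, extreme);
}

int main(void)
{
	publish_target(100);
	printf("target = %d\n", read_target());
	return 0;
}
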
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index c42a03aef36f..153dcb2639c3 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -7,6 +7,93 @@
7#include <linux/errno.h> 7#include <linux/errno.h>
8#include <linux/math64.h> 8#include <linux/math64.h>
9#include <asm/uaccess.h> 9#include <asm/uaccess.h>
10#include <linux/kernel_stat.h>
11
12/*
13 * Allocate the thread_group_cputime structure appropriately and fill in the
14 * current values of the fields. Called from copy_signal() via
15 * thread_group_cputime_clone_thread() when adding a second or subsequent
16 * thread to a thread group. Assumes interrupts are enabled when called.
17 */
18int thread_group_cputime_alloc(struct task_struct *tsk)
19{
20 struct signal_struct *sig = tsk->signal;
21 struct task_cputime *cputime;
22
23 /*
24 * If we have multiple threads and we don't already have a
25 * per-CPU task_cputime struct (checked in the caller), allocate
26 * one and fill it in with the times accumulated so far. We may
27 * race with another thread so recheck after we pick up the sighand
28 * lock.
29 */
30 cputime = alloc_percpu(struct task_cputime);
31 if (cputime == NULL)
32 return -ENOMEM;
33 spin_lock_irq(&tsk->sighand->siglock);
34 if (sig->cputime.totals) {
35 spin_unlock_irq(&tsk->sighand->siglock);
36 free_percpu(cputime);
37 return 0;
38 }
39 sig->cputime.totals = cputime;
40 cputime = per_cpu_ptr(sig->cputime.totals, smp_processor_id());
41 cputime->utime = tsk->utime;
42 cputime->stime = tsk->stime;
43 cputime->sum_exec_runtime = tsk->se.sum_exec_runtime;
44 spin_unlock_irq(&tsk->sighand->siglock);
45 return 0;
46}
47
48/**
49 * thread_group_cputime - Sum the thread group time fields across all CPUs.
50 *
51 * @tsk: The task we use to identify the thread group.
52 * @times: task_cputime structure in which we return the summed fields.
53 *
54 * Walk the list of CPUs to sum the per-CPU time fields in the thread group
55 * time structure.
56 */
57void thread_group_cputime(
58 struct task_struct *tsk,
59 struct task_cputime *times)
60{
61 struct signal_struct *sig;
62 int i;
63 struct task_cputime *tot;
64
65 sig = tsk->signal;
66 if (unlikely(!sig) || !sig->cputime.totals) {
67 times->utime = tsk->utime;
68 times->stime = tsk->stime;
69 times->sum_exec_runtime = tsk->se.sum_exec_runtime;
70 return;
71 }
72 times->stime = times->utime = cputime_zero;
73 times->sum_exec_runtime = 0;
74 for_each_possible_cpu(i) {
75 tot = per_cpu_ptr(tsk->signal->cputime.totals, i);
76 times->utime = cputime_add(times->utime, tot->utime);
77 times->stime = cputime_add(times->stime, tot->stime);
78 times->sum_exec_runtime += tot->sum_exec_runtime;
79 }
80}
81
82/*
83 * Called after updating RLIMIT_CPU to set timer expiration if necessary.
84 */
85void update_rlimit_cpu(unsigned long rlim_new)
86{
87 cputime_t cputime;
88
89 cputime = secs_to_cputime(rlim_new);
90 if (cputime_eq(current->signal->it_prof_expires, cputime_zero) ||
91 cputime_lt(current->signal->it_prof_expires, cputime)) {
92 spin_lock_irq(&current->sighand->siglock);
93 set_process_cpu_timer(current, CPUCLOCK_PROF, &cputime, NULL);
94 spin_unlock_irq(&current->sighand->siglock);
95 }
96}
10 97
11static int check_clock(const clockid_t which_clock) 98static int check_clock(const clockid_t which_clock)
12{ 99{
@@ -158,10 +245,6 @@ static inline cputime_t virt_ticks(struct task_struct *p)
158{ 245{
159 return p->utime; 246 return p->utime;
160} 247}
161static inline unsigned long long sched_ns(struct task_struct *p)
162{
163 return task_sched_runtime(p);
164}
165 248
166int posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp) 249int posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp)
167{ 250{
@@ -211,7 +294,7 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p,
211 cpu->cpu = virt_ticks(p); 294 cpu->cpu = virt_ticks(p);
212 break; 295 break;
213 case CPUCLOCK_SCHED: 296 case CPUCLOCK_SCHED:
214 cpu->sched = sched_ns(p); 297 cpu->sched = p->se.sum_exec_runtime + task_delta_exec(p);
215 break; 298 break;
216 } 299 }
217 return 0; 300 return 0;
@@ -220,59 +303,30 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p,
220/* 303/*
221 * Sample a process (thread group) clock for the given group_leader task. 304 * Sample a process (thread group) clock for the given group_leader task.
222 * Must be called with tasklist_lock held for reading. 305 * Must be called with tasklist_lock held for reading.
223 * Must be called with tasklist_lock held for reading, and p->sighand->siglock.
224 */ 306 */
225static int cpu_clock_sample_group_locked(unsigned int clock_idx, 307static int cpu_clock_sample_group(const clockid_t which_clock,
226 struct task_struct *p, 308 struct task_struct *p,
227 union cpu_time_count *cpu) 309 union cpu_time_count *cpu)
228{ 310{
229 struct task_struct *t = p; 311 struct task_cputime cputime;
230 switch (clock_idx) { 312
313 thread_group_cputime(p, &cputime);
314 switch (which_clock) {
231 default: 315 default:
232 return -EINVAL; 316 return -EINVAL;
233 case CPUCLOCK_PROF: 317 case CPUCLOCK_PROF:
234 cpu->cpu = cputime_add(p->signal->utime, p->signal->stime); 318 cpu->cpu = cputime_add(cputime.utime, cputime.stime);
235 do {
236 cpu->cpu = cputime_add(cpu->cpu, prof_ticks(t));
237 t = next_thread(t);
238 } while (t != p);
239 break; 319 break;
240 case CPUCLOCK_VIRT: 320 case CPUCLOCK_VIRT:
241 cpu->cpu = p->signal->utime; 321 cpu->cpu = cputime.utime;
242 do {
243 cpu->cpu = cputime_add(cpu->cpu, virt_ticks(t));
244 t = next_thread(t);
245 } while (t != p);
246 break; 322 break;
247 case CPUCLOCK_SCHED: 323 case CPUCLOCK_SCHED:
248 cpu->sched = p->signal->sum_sched_runtime; 324 cpu->sched = cputime.sum_exec_runtime + task_delta_exec(p);
249 /* Add in each other live thread. */
250 while ((t = next_thread(t)) != p) {
251 cpu->sched += t->se.sum_exec_runtime;
252 }
253 cpu->sched += sched_ns(p);
254 break; 325 break;
255 } 326 }
256 return 0; 327 return 0;
257} 328}
258 329
259/*
260 * Sample a process (thread group) clock for the given group_leader task.
261 * Must be called with tasklist_lock held for reading.
262 */
263static int cpu_clock_sample_group(const clockid_t which_clock,
264 struct task_struct *p,
265 union cpu_time_count *cpu)
266{
267 int ret;
268 unsigned long flags;
269 spin_lock_irqsave(&p->sighand->siglock, flags);
270 ret = cpu_clock_sample_group_locked(CPUCLOCK_WHICH(which_clock), p,
271 cpu);
272 spin_unlock_irqrestore(&p->sighand->siglock, flags);
273 return ret;
274}
275
276 330
277int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp) 331int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp)
278{ 332{
@@ -471,80 +525,11 @@ void posix_cpu_timers_exit(struct task_struct *tsk)
471} 525}
472void posix_cpu_timers_exit_group(struct task_struct *tsk) 526void posix_cpu_timers_exit_group(struct task_struct *tsk)
473{ 527{
474 cleanup_timers(tsk->signal->cpu_timers, 528 struct task_cputime cputime;
475 cputime_add(tsk->utime, tsk->signal->utime),
476 cputime_add(tsk->stime, tsk->signal->stime),
477 tsk->se.sum_exec_runtime + tsk->signal->sum_sched_runtime);
478}
479 529
480 530 thread_group_cputime(tsk, &cputime);
481/* 531 cleanup_timers(tsk->signal->cpu_timers,
482 * Set the expiry times of all the threads in the process so one of them 532 cputime.utime, cputime.stime, cputime.sum_exec_runtime);
483 * will go off before the process cumulative expiry total is reached.
484 */
485static void process_timer_rebalance(struct task_struct *p,
486 unsigned int clock_idx,
487 union cpu_time_count expires,
488 union cpu_time_count val)
489{
490 cputime_t ticks, left;
491 unsigned long long ns, nsleft;
492 struct task_struct *t = p;
493 unsigned int nthreads = atomic_read(&p->signal->live);
494
495 if (!nthreads)
496 return;
497
498 switch (clock_idx) {
499 default:
500 BUG();
501 break;
502 case CPUCLOCK_PROF:
503 left = cputime_div_non_zero(cputime_sub(expires.cpu, val.cpu),
504 nthreads);
505 do {
506 if (likely(!(t->flags & PF_EXITING))) {
507 ticks = cputime_add(prof_ticks(t), left);
508 if (cputime_eq(t->it_prof_expires,
509 cputime_zero) ||
510 cputime_gt(t->it_prof_expires, ticks)) {
511 t->it_prof_expires = ticks;
512 }
513 }
514 t = next_thread(t);
515 } while (t != p);
516 break;
517 case CPUCLOCK_VIRT:
518 left = cputime_div_non_zero(cputime_sub(expires.cpu, val.cpu),
519 nthreads);
520 do {
521 if (likely(!(t->flags & PF_EXITING))) {
522 ticks = cputime_add(virt_ticks(t), left);
523 if (cputime_eq(t->it_virt_expires,
524 cputime_zero) ||
525 cputime_gt(t->it_virt_expires, ticks)) {
526 t->it_virt_expires = ticks;
527 }
528 }
529 t = next_thread(t);
530 } while (t != p);
531 break;
532 case CPUCLOCK_SCHED:
533 nsleft = expires.sched - val.sched;
534 do_div(nsleft, nthreads);
535 nsleft = max_t(unsigned long long, nsleft, 1);
536 do {
537 if (likely(!(t->flags & PF_EXITING))) {
538 ns = t->se.sum_exec_runtime + nsleft;
539 if (t->it_sched_expires == 0 ||
540 t->it_sched_expires > ns) {
541 t->it_sched_expires = ns;
542 }
543 }
544 t = next_thread(t);
545 } while (t != p);
546 break;
547 }
548} 533}
549 534
550static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now) 535static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now)
@@ -608,29 +593,32 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now)
608 default: 593 default:
609 BUG(); 594 BUG();
610 case CPUCLOCK_PROF: 595 case CPUCLOCK_PROF:
611 if (cputime_eq(p->it_prof_expires, 596 if (cputime_eq(p->cputime_expires.prof_exp,
612 cputime_zero) || 597 cputime_zero) ||
613 cputime_gt(p->it_prof_expires, 598 cputime_gt(p->cputime_expires.prof_exp,
614 nt->expires.cpu)) 599 nt->expires.cpu))
615 p->it_prof_expires = nt->expires.cpu; 600 p->cputime_expires.prof_exp =
601 nt->expires.cpu;
616 break; 602 break;
617 case CPUCLOCK_VIRT: 603 case CPUCLOCK_VIRT:
618 if (cputime_eq(p->it_virt_expires, 604 if (cputime_eq(p->cputime_expires.virt_exp,
619 cputime_zero) || 605 cputime_zero) ||
620 cputime_gt(p->it_virt_expires, 606 cputime_gt(p->cputime_expires.virt_exp,
621 nt->expires.cpu)) 607 nt->expires.cpu))
622 p->it_virt_expires = nt->expires.cpu; 608 p->cputime_expires.virt_exp =
609 nt->expires.cpu;
623 break; 610 break;
624 case CPUCLOCK_SCHED: 611 case CPUCLOCK_SCHED:
625 if (p->it_sched_expires == 0 || 612 if (p->cputime_expires.sched_exp == 0 ||
626 p->it_sched_expires > nt->expires.sched) 613 p->cputime_expires.sched_exp >
627 p->it_sched_expires = nt->expires.sched; 614 nt->expires.sched)
615 p->cputime_expires.sched_exp =
616 nt->expires.sched;
628 break; 617 break;
629 } 618 }
630 } else { 619 } else {
631 /* 620 /*
632 * For a process timer, we must balance 621 * For a process timer, set the cached expiration time.
633 * all the live threads' expirations.
634 */ 622 */
635 switch (CPUCLOCK_WHICH(timer->it_clock)) { 623 switch (CPUCLOCK_WHICH(timer->it_clock)) {
636 default: 624 default:
@@ -641,7 +629,9 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now)
641 cputime_lt(p->signal->it_virt_expires, 629 cputime_lt(p->signal->it_virt_expires,
642 timer->it.cpu.expires.cpu)) 630 timer->it.cpu.expires.cpu))
643 break; 631 break;
644 goto rebalance; 632 p->signal->cputime_expires.virt_exp =
633 timer->it.cpu.expires.cpu;
634 break;
645 case CPUCLOCK_PROF: 635 case CPUCLOCK_PROF:
646 if (!cputime_eq(p->signal->it_prof_expires, 636 if (!cputime_eq(p->signal->it_prof_expires,
647 cputime_zero) && 637 cputime_zero) &&
@@ -652,13 +642,12 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now)
652 if (i != RLIM_INFINITY && 642 if (i != RLIM_INFINITY &&
653 i <= cputime_to_secs(timer->it.cpu.expires.cpu)) 643 i <= cputime_to_secs(timer->it.cpu.expires.cpu))
654 break; 644 break;
655 goto rebalance; 645 p->signal->cputime_expires.prof_exp =
646 timer->it.cpu.expires.cpu;
647 break;
656 case CPUCLOCK_SCHED: 648 case CPUCLOCK_SCHED:
657 rebalance: 649 p->signal->cputime_expires.sched_exp =
658 process_timer_rebalance( 650 timer->it.cpu.expires.sched;
659 timer->it.cpu.task,
660 CPUCLOCK_WHICH(timer->it_clock),
661 timer->it.cpu.expires, now);
662 break; 651 break;
663 } 652 }
664 } 653 }
@@ -969,13 +958,13 @@ static void check_thread_timers(struct task_struct *tsk,
969 struct signal_struct *const sig = tsk->signal; 958 struct signal_struct *const sig = tsk->signal;
970 959
971 maxfire = 20; 960 maxfire = 20;
972 tsk->it_prof_expires = cputime_zero; 961 tsk->cputime_expires.prof_exp = cputime_zero;
973 while (!list_empty(timers)) { 962 while (!list_empty(timers)) {
974 struct cpu_timer_list *t = list_first_entry(timers, 963 struct cpu_timer_list *t = list_first_entry(timers,
975 struct cpu_timer_list, 964 struct cpu_timer_list,
976 entry); 965 entry);
977 if (!--maxfire || cputime_lt(prof_ticks(tsk), t->expires.cpu)) { 966 if (!--maxfire || cputime_lt(prof_ticks(tsk), t->expires.cpu)) {
978 tsk->it_prof_expires = t->expires.cpu; 967 tsk->cputime_expires.prof_exp = t->expires.cpu;
979 break; 968 break;
980 } 969 }
981 t->firing = 1; 970 t->firing = 1;
@@ -984,13 +973,13 @@ static void check_thread_timers(struct task_struct *tsk,
984 973
985 ++timers; 974 ++timers;
986 maxfire = 20; 975 maxfire = 20;
987 tsk->it_virt_expires = cputime_zero; 976 tsk->cputime_expires.virt_exp = cputime_zero;
988 while (!list_empty(timers)) { 977 while (!list_empty(timers)) {
989 struct cpu_timer_list *t = list_first_entry(timers, 978 struct cpu_timer_list *t = list_first_entry(timers,
990 struct cpu_timer_list, 979 struct cpu_timer_list,
991 entry); 980 entry);
992 if (!--maxfire || cputime_lt(virt_ticks(tsk), t->expires.cpu)) { 981 if (!--maxfire || cputime_lt(virt_ticks(tsk), t->expires.cpu)) {
993 tsk->it_virt_expires = t->expires.cpu; 982 tsk->cputime_expires.virt_exp = t->expires.cpu;
994 break; 983 break;
995 } 984 }
996 t->firing = 1; 985 t->firing = 1;
@@ -999,13 +988,13 @@ static void check_thread_timers(struct task_struct *tsk,
999 988
1000 ++timers; 989 ++timers;
1001 maxfire = 20; 990 maxfire = 20;
1002 tsk->it_sched_expires = 0; 991 tsk->cputime_expires.sched_exp = 0;
1003 while (!list_empty(timers)) { 992 while (!list_empty(timers)) {
1004 struct cpu_timer_list *t = list_first_entry(timers, 993 struct cpu_timer_list *t = list_first_entry(timers,
1005 struct cpu_timer_list, 994 struct cpu_timer_list,
1006 entry); 995 entry);
1007 if (!--maxfire || tsk->se.sum_exec_runtime < t->expires.sched) { 996 if (!--maxfire || tsk->se.sum_exec_runtime < t->expires.sched) {
1008 tsk->it_sched_expires = t->expires.sched; 997 tsk->cputime_expires.sched_exp = t->expires.sched;
1009 break; 998 break;
1010 } 999 }
1011 t->firing = 1; 1000 t->firing = 1;
@@ -1055,10 +1044,10 @@ static void check_process_timers(struct task_struct *tsk,
1055{ 1044{
1056 int maxfire; 1045 int maxfire;
1057 struct signal_struct *const sig = tsk->signal; 1046 struct signal_struct *const sig = tsk->signal;
1058 cputime_t utime, stime, ptime, virt_expires, prof_expires; 1047 cputime_t utime, ptime, virt_expires, prof_expires;
1059 unsigned long long sum_sched_runtime, sched_expires; 1048 unsigned long long sum_sched_runtime, sched_expires;
1060 struct task_struct *t;
1061 struct list_head *timers = sig->cpu_timers; 1049 struct list_head *timers = sig->cpu_timers;
1050 struct task_cputime cputime;
1062 1051
1063 /* 1052 /*
1064 * Don't sample the current process CPU clocks if there are no timers. 1053 * Don't sample the current process CPU clocks if there are no timers.
@@ -1074,18 +1063,10 @@ static void check_process_timers(struct task_struct *tsk,
1074 /* 1063 /*
1075 * Collect the current process totals. 1064 * Collect the current process totals.
1076 */ 1065 */
1077 utime = sig->utime; 1066 thread_group_cputime(tsk, &cputime);
1078 stime = sig->stime; 1067 utime = cputime.utime;
1079 sum_sched_runtime = sig->sum_sched_runtime; 1068 ptime = cputime_add(utime, cputime.stime);
1080 t = tsk; 1069 sum_sched_runtime = cputime.sum_exec_runtime;
1081 do {
1082 utime = cputime_add(utime, t->utime);
1083 stime = cputime_add(stime, t->stime);
1084 sum_sched_runtime += t->se.sum_exec_runtime;
1085 t = next_thread(t);
1086 } while (t != tsk);
1087 ptime = cputime_add(utime, stime);
1088
1089 maxfire = 20; 1070 maxfire = 20;
1090 prof_expires = cputime_zero; 1071 prof_expires = cputime_zero;
1091 while (!list_empty(timers)) { 1072 while (!list_empty(timers)) {
@@ -1193,60 +1174,18 @@ static void check_process_timers(struct task_struct *tsk,
1193 } 1174 }
1194 } 1175 }
1195 1176
1196 if (!cputime_eq(prof_expires, cputime_zero) || 1177 if (!cputime_eq(prof_expires, cputime_zero) &&
1197 !cputime_eq(virt_expires, cputime_zero) || 1178 (cputime_eq(sig->cputime_expires.prof_exp, cputime_zero) ||
1198 sched_expires != 0) { 1179 cputime_gt(sig->cputime_expires.prof_exp, prof_expires)))
1199 /* 1180 sig->cputime_expires.prof_exp = prof_expires;
1200 * Rebalance the threads' expiry times for the remaining 1181 if (!cputime_eq(virt_expires, cputime_zero) &&
1201 * process CPU timers. 1182 (cputime_eq(sig->cputime_expires.virt_exp, cputime_zero) ||
1202 */ 1183 cputime_gt(sig->cputime_expires.virt_exp, virt_expires)))
1203 1184 sig->cputime_expires.virt_exp = virt_expires;
1204 cputime_t prof_left, virt_left, ticks; 1185 if (sched_expires != 0 &&
1205 unsigned long long sched_left, sched; 1186 (sig->cputime_expires.sched_exp == 0 ||
1206 const unsigned int nthreads = atomic_read(&sig->live); 1187 sig->cputime_expires.sched_exp > sched_expires))
1207 1188 sig->cputime_expires.sched_exp = sched_expires;
1208 if (!nthreads)
1209 return;
1210
1211 prof_left = cputime_sub(prof_expires, utime);
1212 prof_left = cputime_sub(prof_left, stime);
1213 prof_left = cputime_div_non_zero(prof_left, nthreads);
1214 virt_left = cputime_sub(virt_expires, utime);
1215 virt_left = cputime_div_non_zero(virt_left, nthreads);
1216 if (sched_expires) {
1217 sched_left = sched_expires - sum_sched_runtime;
1218 do_div(sched_left, nthreads);
1219 sched_left = max_t(unsigned long long, sched_left, 1);
1220 } else {
1221 sched_left = 0;
1222 }
1223 t = tsk;
1224 do {
1225 if (unlikely(t->flags & PF_EXITING))
1226 continue;
1227
1228 ticks = cputime_add(cputime_add(t->utime, t->stime),
1229 prof_left);
1230 if (!cputime_eq(prof_expires, cputime_zero) &&
1231 (cputime_eq(t->it_prof_expires, cputime_zero) ||
1232 cputime_gt(t->it_prof_expires, ticks))) {
1233 t->it_prof_expires = ticks;
1234 }
1235
1236 ticks = cputime_add(t->utime, virt_left);
1237 if (!cputime_eq(virt_expires, cputime_zero) &&
1238 (cputime_eq(t->it_virt_expires, cputime_zero) ||
1239 cputime_gt(t->it_virt_expires, ticks))) {
1240 t->it_virt_expires = ticks;
1241 }
1242
1243 sched = t->se.sum_exec_runtime + sched_left;
1244 if (sched_expires && (t->it_sched_expires == 0 ||
1245 t->it_sched_expires > sched)) {
1246 t->it_sched_expires = sched;
1247 }
1248 } while ((t = next_thread(t)) != tsk);
1249 }
1250} 1189}
1251 1190
1252/* 1191/*
@@ -1314,6 +1253,86 @@ out:
1314 ++timer->it_requeue_pending; 1253 ++timer->it_requeue_pending;
1315} 1254}
1316 1255
1256/**
1257 * task_cputime_zero - Check a task_cputime struct for all zero fields.
1258 *
1259 * @cputime: The struct to compare.
1260 *
1261 * Checks @cputime to see if all fields are zero. Returns true if all fields
1262 * are zero, false if any field is nonzero.
1263 */
1264static inline int task_cputime_zero(const struct task_cputime *cputime)
1265{
1266 if (cputime_eq(cputime->utime, cputime_zero) &&
1267 cputime_eq(cputime->stime, cputime_zero) &&
1268 cputime->sum_exec_runtime == 0)
1269 return 1;
1270 return 0;
1271}
1272
1273/**
1274 * task_cputime_expired - Compare two task_cputime entities.
1275 *
1276 * @sample: The task_cputime structure to be checked for expiration.
1277 * @expires: Expiration times, against which @sample will be checked.
1278 *
1279 * Checks @sample against @expires to see if any field of @sample has expired.
1280 * Returns true if any field of the former is greater than the corresponding
1281 * field of the latter if the latter field is set. Otherwise returns false.
1282 */
1283static inline int task_cputime_expired(const struct task_cputime *sample,
1284 const struct task_cputime *expires)
1285{
1286 if (!cputime_eq(expires->utime, cputime_zero) &&
1287 cputime_ge(sample->utime, expires->utime))
1288 return 1;
1289 if (!cputime_eq(expires->stime, cputime_zero) &&
1290 cputime_ge(cputime_add(sample->utime, sample->stime),
1291 expires->stime))
1292 return 1;
1293 if (expires->sum_exec_runtime != 0 &&
1294 sample->sum_exec_runtime >= expires->sum_exec_runtime)
1295 return 1;
1296 return 0;
1297}
1298
1299/**
1300 * fastpath_timer_check - POSIX CPU timers fast path.
1301 *
1302 * @tsk: The task (thread) being checked.
1303 *
1304 * Check the task and thread group timers. If both are zero (there are no
1305 * timers set) return false. Otherwise snapshot the task and thread group
1306 * timers and compare them with the corresponding expiration times. Return
1307 * true if a timer has expired, else return false.
1308 */
1309static inline int fastpath_timer_check(struct task_struct *tsk)
1310{
1311 struct signal_struct *sig = tsk->signal;
1312
1313 if (unlikely(!sig))
1314 return 0;
1315
1316 if (!task_cputime_zero(&tsk->cputime_expires)) {
1317 struct task_cputime task_sample = {
1318 .utime = tsk->utime,
1319 .stime = tsk->stime,
1320 .sum_exec_runtime = tsk->se.sum_exec_runtime
1321 };
1322
1323 if (task_cputime_expired(&task_sample, &tsk->cputime_expires))
1324 return 1;
1325 }
1326 if (!task_cputime_zero(&sig->cputime_expires)) {
1327 struct task_cputime group_sample;
1328
1329 thread_group_cputime(tsk, &group_sample);
1330 if (task_cputime_expired(&group_sample, &sig->cputime_expires))
1331 return 1;
1332 }
1333 return 0;
1334}
1335
1317/* 1336/*
1318 * This is called from the timer interrupt handler. The irq handler has 1337 * This is called from the timer interrupt handler. The irq handler has
1319 * already updated our counts. We need to check if any timers fire now. 1338 * already updated our counts. We need to check if any timers fire now.
@@ -1326,42 +1345,31 @@ void run_posix_cpu_timers(struct task_struct *tsk)
1326 1345
1327 BUG_ON(!irqs_disabled()); 1346 BUG_ON(!irqs_disabled());
1328 1347
1329#define UNEXPIRED(clock) \ 1348 /*
1330 (cputime_eq(tsk->it_##clock##_expires, cputime_zero) || \ 1349 * The fast path checks that there are no expired thread or thread
1331 cputime_lt(clock##_ticks(tsk), tsk->it_##clock##_expires)) 1350 * group timers. If that's so, just return.
1332 1351 */
1333 if (UNEXPIRED(prof) && UNEXPIRED(virt) && 1352 if (!fastpath_timer_check(tsk))
1334 (tsk->it_sched_expires == 0 ||
1335 tsk->se.sum_exec_runtime < tsk->it_sched_expires))
1336 return; 1353 return;
1337 1354
1338#undef UNEXPIRED 1355 spin_lock(&tsk->sighand->siglock);
1339
1340 /* 1356 /*
1341 * Double-check with locks held. 1357 * Here we take off tsk->signal->cpu_timers[N] and
1358 * tsk->cpu_timers[N] all the timers that are firing, and
1359 * put them on the firing list.
1342 */ 1360 */
1343 read_lock(&tasklist_lock); 1361 check_thread_timers(tsk, &firing);
1344 if (likely(tsk->signal != NULL)) { 1362 check_process_timers(tsk, &firing);
1345 spin_lock(&tsk->sighand->siglock);
1346 1363
1347 /* 1364 /*
1348 * Here we take off tsk->cpu_timers[N] and tsk->signal->cpu_timers[N] 1365 * We must release these locks before taking any timer's lock.
1349 * all the timers that are firing, and put them on the firing list. 1366 * There is a potential race with timer deletion here, as the
1350 */ 1367 * siglock now protects our private firing list. We have set
1351 check_thread_timers(tsk, &firing); 1368 * the firing flag in each timer, so that a deletion attempt
1352 check_process_timers(tsk, &firing); 1369 * that gets the timer lock before we do will give it up and
1353 1370 * spin until we've taken care of that timer below.
1354 /* 1371 */
1355 * We must release these locks before taking any timer's lock. 1372 spin_unlock(&tsk->sighand->siglock);
1356 * There is a potential race with timer deletion here, as the
1357 * siglock now protects our private firing list. We have set
1358 * the firing flag in each timer, so that a deletion attempt
1359 * that gets the timer lock before we do will give it up and
1360 * spin until we've taken care of that timer below.
1361 */
1362 spin_unlock(&tsk->sighand->siglock);
1363 }
1364 read_unlock(&tasklist_lock);
1365 1373
1366 /* 1374 /*
1367 * Now that all the timers on our list have the firing flag, 1375 * Now that all the timers on our list have the firing flag,
@@ -1389,10 +1397,9 @@ void run_posix_cpu_timers(struct task_struct *tsk)
1389 1397
1390/* 1398/*
1391 * Set one of the process-wide special case CPU timers. 1399 * Set one of the process-wide special case CPU timers.
1392 * The tasklist_lock and tsk->sighand->siglock must be held by the caller. 1400 * The tsk->sighand->siglock must be held by the caller.
1393 * The oldval argument is null for the RLIMIT_CPU timer, where *newval is 1401 * The *newval argument is relative and we update it to be absolute, *oldval
1394 * absolute; non-null for ITIMER_*, where *newval is relative and we update 1402 * is absolute and we update it to be relative.
1395 * it to be absolute, *oldval is absolute and we update it to be relative.
1396 */ 1403 */
1397void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, 1404void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
1398 cputime_t *newval, cputime_t *oldval) 1405 cputime_t *newval, cputime_t *oldval)
@@ -1401,7 +1408,7 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
1401 struct list_head *head; 1408 struct list_head *head;
1402 1409
1403 BUG_ON(clock_idx == CPUCLOCK_SCHED); 1410 BUG_ON(clock_idx == CPUCLOCK_SCHED);
1404 cpu_clock_sample_group_locked(clock_idx, tsk, &now); 1411 cpu_clock_sample_group(clock_idx, tsk, &now);
1405 1412
1406 if (oldval) { 1413 if (oldval) {
1407 if (!cputime_eq(*oldval, cputime_zero)) { 1414 if (!cputime_eq(*oldval, cputime_zero)) {
@@ -1435,13 +1442,14 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
1435 cputime_ge(list_first_entry(head, 1442 cputime_ge(list_first_entry(head,
1436 struct cpu_timer_list, entry)->expires.cpu, 1443 struct cpu_timer_list, entry)->expires.cpu,
1437 *newval)) { 1444 *newval)) {
1438 /* 1445 switch (clock_idx) {
1439 * Rejigger each thread's expiry time so that one will 1446 case CPUCLOCK_PROF:
1440 * notice before we hit the process-cumulative expiry time. 1447 tsk->signal->cputime_expires.prof_exp = *newval;
1441 */ 1448 break;
1442 union cpu_time_count expires = { .sched = 0 }; 1449 case CPUCLOCK_VIRT:
1443 expires.cpu = *newval; 1450 tsk->signal->cputime_expires.virt_exp = *newval;
1444 process_timer_rebalance(tsk, clock_idx, expires, now); 1451 break;
1452 }
1445 } 1453 }
1446} 1454}
1447 1455
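The hunks above replace the old UNEXPIRED() macro and the tasklist_lock/siglock dance with a call to fastpath_timer_check() before any lock is taken, and make set_process_cpu_timer() store the new expiry directly in the shared prof_exp/virt_exp fields instead of rebalancing per-thread timers. The stand-alone sketch below only illustrates the fast-path idea; the struct and function names are hypothetical stand-ins, not the kernel's actual implementation.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Hypothetical, simplified stand-in for the per-task expiry cache; the
 * field names do not match the kernel's task_struct/signal_struct. */
struct demo_task {
        uint64_t utime, stime, sum_exec_runtime;  /* accumulated CPU usage */
        uint64_t prof_exp, virt_exp, sched_exp;   /* cached expiries, 0 = unset */
};

/* Fast-path check in the spirit of fastpath_timer_check(): report whether
 * any cached expiry has been reached, so the tick path can return early and
 * avoid taking siglock and scanning the timer lists when nothing fired. */
static bool demo_fastpath_timer_check(const struct demo_task *t)
{
        if (t->prof_exp && t->utime + t->stime >= t->prof_exp)
                return true;
        if (t->virt_exp && t->utime >= t->virt_exp)
                return true;
        if (t->sched_exp && t->sum_exec_runtime >= t->sched_exp)
                return true;
        return false;
}

int main(void)
{
        struct demo_task t = { .utime = 90, .stime = 15, .prof_exp = 100 };

        printf("timers expired: %s\n",
               demo_fastpath_timer_check(&t) ? "yes" : "no");
        return 0;
}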
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index dbd8398ddb0b..5e79c662294b 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -223,6 +223,15 @@ static int posix_ktime_get_ts(clockid_t which_clock, struct timespec *tp)
223} 223}
224 224
225/* 225/*
226 * Get monotonic time for posix timers
227 */
228static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec *tp)
229{
230 getrawmonotonic(tp);
231 return 0;
232}
233
234/*
226 * Initialize everything, well, just everything in Posix clocks/timers ;) 235 * Initialize everything, well, just everything in Posix clocks/timers ;)
227 */ 236 */
228static __init int init_posix_timers(void) 237static __init int init_posix_timers(void)
@@ -235,9 +244,15 @@ static __init int init_posix_timers(void)
235 .clock_get = posix_ktime_get_ts, 244 .clock_get = posix_ktime_get_ts,
236 .clock_set = do_posix_clock_nosettime, 245 .clock_set = do_posix_clock_nosettime,
237 }; 246 };
247 struct k_clock clock_monotonic_raw = {
248 .clock_getres = hrtimer_get_res,
249 .clock_get = posix_get_monotonic_raw,
250 .clock_set = do_posix_clock_nosettime,
251 };
238 252
239 register_posix_clock(CLOCK_REALTIME, &clock_realtime); 253 register_posix_clock(CLOCK_REALTIME, &clock_realtime);
240 register_posix_clock(CLOCK_MONOTONIC, &clock_monotonic); 254 register_posix_clock(CLOCK_MONOTONIC, &clock_monotonic);
255 register_posix_clock(CLOCK_MONOTONIC_RAW, &clock_monotonic_raw);
241 256
242 posix_timers_cache = kmem_cache_create("posix_timers_cache", 257 posix_timers_cache = kmem_cache_create("posix_timers_cache",
243 sizeof (struct k_itimer), 0, SLAB_PANIC, 258 sizeof (struct k_itimer), 0, SLAB_PANIC,
@@ -289,39 +304,33 @@ void do_schedule_next_timer(struct siginfo *info)
289 else 304 else
290 schedule_next_timer(timr); 305 schedule_next_timer(timr);
291 306
292 info->si_overrun = timr->it_overrun_last; 307 info->si_overrun += timr->it_overrun_last;
293 } 308 }
294 309
295 if (timr) 310 if (timr)
296 unlock_timer(timr, flags); 311 unlock_timer(timr, flags);
297} 312}
298 313
299int posix_timer_event(struct k_itimer *timr,int si_private) 314int posix_timer_event(struct k_itimer *timr, int si_private)
300{ 315{
301 memset(&timr->sigq->info, 0, sizeof(siginfo_t)); 316 int shared, ret;
317 /*
318 * FIXME: if ->sigq is queued we can race with
319 * dequeue_signal()->do_schedule_next_timer().
320 *
321 * If dequeue_signal() sees the "right" value of
322 * si_sys_private it calls do_schedule_next_timer().
323 * We re-queue ->sigq and drop ->it_lock().
324 * do_schedule_next_timer() locks the timer
325 * and re-schedules it while ->sigq is pending.
 326 * Not really bad, but not what we want.
327 */
302 timr->sigq->info.si_sys_private = si_private; 328 timr->sigq->info.si_sys_private = si_private;
303 /* Send signal to the process that owns this timer.*/
304
305 timr->sigq->info.si_signo = timr->it_sigev_signo;
306 timr->sigq->info.si_errno = 0;
307 timr->sigq->info.si_code = SI_TIMER;
308 timr->sigq->info.si_tid = timr->it_id;
309 timr->sigq->info.si_value = timr->it_sigev_value;
310
311 if (timr->it_sigev_notify & SIGEV_THREAD_ID) {
312 struct task_struct *leader;
313 int ret = send_sigqueue(timr->sigq, timr->it_process, 0);
314
315 if (likely(ret >= 0))
316 return ret;
317
318 timr->it_sigev_notify = SIGEV_SIGNAL;
319 leader = timr->it_process->group_leader;
320 put_task_struct(timr->it_process);
321 timr->it_process = leader;
322 }
323 329
324 return send_sigqueue(timr->sigq, timr->it_process, 1); 330 shared = !(timr->it_sigev_notify & SIGEV_THREAD_ID);
331 ret = send_sigqueue(timr->sigq, timr->it_process, shared);
332 /* If we failed to send the signal the timer stops. */
333 return ret > 0;
325} 334}
326EXPORT_SYMBOL_GPL(posix_timer_event); 335EXPORT_SYMBOL_GPL(posix_timer_event);
327 336
@@ -433,8 +442,9 @@ static struct k_itimer * alloc_posix_timer(void)
433 return tmr; 442 return tmr;
434 if (unlikely(!(tmr->sigq = sigqueue_alloc()))) { 443 if (unlikely(!(tmr->sigq = sigqueue_alloc()))) {
435 kmem_cache_free(posix_timers_cache, tmr); 444 kmem_cache_free(posix_timers_cache, tmr);
436 tmr = NULL; 445 return NULL;
437 } 446 }
447 memset(&tmr->sigq->info, 0, sizeof(siginfo_t));
438 return tmr; 448 return tmr;
439} 449}
440 450
@@ -449,9 +459,6 @@ static void release_posix_timer(struct k_itimer *tmr, int it_id_set)
449 spin_unlock_irqrestore(&idr_lock, flags); 459 spin_unlock_irqrestore(&idr_lock, flags);
450 } 460 }
451 sigqueue_free(tmr->sigq); 461 sigqueue_free(tmr->sigq);
452 if (unlikely(tmr->it_process) &&
453 tmr->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID))
454 put_task_struct(tmr->it_process);
455 kmem_cache_free(posix_timers_cache, tmr); 462 kmem_cache_free(posix_timers_cache, tmr);
456} 463}
457 464
@@ -462,11 +469,9 @@ sys_timer_create(const clockid_t which_clock,
462 struct sigevent __user *timer_event_spec, 469 struct sigevent __user *timer_event_spec,
463 timer_t __user * created_timer_id) 470 timer_t __user * created_timer_id)
464{ 471{
465 int error = 0; 472 struct k_itimer *new_timer;
466 struct k_itimer *new_timer = NULL; 473 int error, new_timer_id;
467 int new_timer_id; 474 struct task_struct *process;
468 struct task_struct *process = NULL;
469 unsigned long flags;
470 sigevent_t event; 475 sigevent_t event;
471 int it_id_set = IT_ID_NOT_SET; 476 int it_id_set = IT_ID_NOT_SET;
472 477
@@ -484,12 +489,11 @@ sys_timer_create(const clockid_t which_clock,
484 goto out; 489 goto out;
485 } 490 }
486 spin_lock_irq(&idr_lock); 491 spin_lock_irq(&idr_lock);
487 error = idr_get_new(&posix_timers_id, (void *) new_timer, 492 error = idr_get_new(&posix_timers_id, new_timer, &new_timer_id);
488 &new_timer_id);
489 spin_unlock_irq(&idr_lock); 493 spin_unlock_irq(&idr_lock);
490 if (error == -EAGAIN) 494 if (error) {
491 goto retry; 495 if (error == -EAGAIN)
492 else if (error) { 496 goto retry;
493 /* 497 /*
494 * Weird looking, but we return EAGAIN if the IDR is 498 * Weird looking, but we return EAGAIN if the IDR is
495 * full (proper POSIX return value for this) 499 * full (proper POSIX return value for this)
@@ -520,67 +524,43 @@ sys_timer_create(const clockid_t which_clock,
520 error = -EFAULT; 524 error = -EFAULT;
521 goto out; 525 goto out;
522 } 526 }
523 new_timer->it_sigev_notify = event.sigev_notify; 527 rcu_read_lock();
524 new_timer->it_sigev_signo = event.sigev_signo; 528 process = good_sigevent(&event);
525 new_timer->it_sigev_value = event.sigev_value; 529 if (process)
526 530 get_task_struct(process);
527 read_lock(&tasklist_lock); 531 rcu_read_unlock();
528 if ((process = good_sigevent(&event))) {
529 /*
530 * We may be setting up this process for another
531 * thread. It may be exiting. To catch this
 532 * case we check the PF_EXITING flag. If
533 * the flag is not set, the siglock will catch
534 * him before it is too late (in exit_itimers).
535 *
 536 * The exec case is a bit more involved but easy
537 * to code. If the process is in our thread
538 * group (and it must be or we would not allow
539 * it here) and is doing an exec, it will cause
540 * us to be killed. In this case it will wait
541 * for us to die which means we can finish this
542 * linkage with our last gasp. I.e. no code :)
543 */
544 spin_lock_irqsave(&process->sighand->siglock, flags);
545 if (!(process->flags & PF_EXITING)) {
546 new_timer->it_process = process;
547 list_add(&new_timer->list,
548 &process->signal->posix_timers);
549 if (new_timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID))
550 get_task_struct(process);
551 spin_unlock_irqrestore(&process->sighand->siglock, flags);
552 } else {
553 spin_unlock_irqrestore(&process->sighand->siglock, flags);
554 process = NULL;
555 }
556 }
557 read_unlock(&tasklist_lock);
558 if (!process) { 532 if (!process) {
559 error = -EINVAL; 533 error = -EINVAL;
560 goto out; 534 goto out;
561 } 535 }
562 } else { 536 } else {
563 new_timer->it_sigev_notify = SIGEV_SIGNAL; 537 event.sigev_notify = SIGEV_SIGNAL;
564 new_timer->it_sigev_signo = SIGALRM; 538 event.sigev_signo = SIGALRM;
565 new_timer->it_sigev_value.sival_int = new_timer->it_id; 539 event.sigev_value.sival_int = new_timer->it_id;
566 process = current->group_leader; 540 process = current->group_leader;
567 spin_lock_irqsave(&process->sighand->siglock, flags); 541 get_task_struct(process);
568 new_timer->it_process = process;
569 list_add(&new_timer->list, &process->signal->posix_timers);
570 spin_unlock_irqrestore(&process->sighand->siglock, flags);
571 } 542 }
572 543
544 new_timer->it_sigev_notify = event.sigev_notify;
545 new_timer->sigq->info.si_signo = event.sigev_signo;
546 new_timer->sigq->info.si_value = event.sigev_value;
547 new_timer->sigq->info.si_tid = new_timer->it_id;
548 new_timer->sigq->info.si_code = SI_TIMER;
549
550 spin_lock_irq(&current->sighand->siglock);
551 new_timer->it_process = process;
552 list_add(&new_timer->list, &current->signal->posix_timers);
553 spin_unlock_irq(&current->sighand->siglock);
554
555 return 0;
573 /* 556 /*
574 * In the case of the timer belonging to another task, after 557 * In the case of the timer belonging to another task, after
575 * the task is unlocked, the timer is owned by the other task 558 * the task is unlocked, the timer is owned by the other task
576 * and may cease to exist at any time. Don't use or modify 559 * and may cease to exist at any time. Don't use or modify
577 * new_timer after the unlock call. 560 * new_timer after the unlock call.
578 */ 561 */
579
580out: 562out:
581 if (error) 563 release_posix_timer(new_timer, it_id_set);
582 release_posix_timer(new_timer, it_id_set);
583
584 return error; 564 return error;
585} 565}
586 566
@@ -591,7 +571,7 @@ out:
 591 * the find to the timer lock. To avoid a deadlock, the timer id MUST 571 * the find to the timer lock. To avoid a deadlock, the timer id MUST
 592 * be released without holding the timer lock. 572 * be released without holding the timer lock.
593 */ 573 */
594static struct k_itimer * lock_timer(timer_t timer_id, unsigned long *flags) 574static struct k_itimer *lock_timer(timer_t timer_id, unsigned long *flags)
595{ 575{
596 struct k_itimer *timr; 576 struct k_itimer *timr;
597 /* 577 /*
@@ -599,23 +579,20 @@ static struct k_itimer * lock_timer(timer_t timer_id, unsigned long *flags)
599 * flags part over to the timer lock. Must not let interrupts in 579 * flags part over to the timer lock. Must not let interrupts in
600 * while we are moving the lock. 580 * while we are moving the lock.
601 */ 581 */
602
603 spin_lock_irqsave(&idr_lock, *flags); 582 spin_lock_irqsave(&idr_lock, *flags);
604 timr = (struct k_itimer *) idr_find(&posix_timers_id, (int) timer_id); 583 timr = idr_find(&posix_timers_id, (int)timer_id);
605 if (timr) { 584 if (timr) {
606 spin_lock(&timr->it_lock); 585 spin_lock(&timr->it_lock);
607 586 if (timr->it_process &&
608 if ((timr->it_id != timer_id) || !(timr->it_process) || 587 same_thread_group(timr->it_process, current)) {
609 !same_thread_group(timr->it_process, current)) {
610 spin_unlock(&timr->it_lock);
611 spin_unlock_irqrestore(&idr_lock, *flags);
612 timr = NULL;
613 } else
614 spin_unlock(&idr_lock); 588 spin_unlock(&idr_lock);
615 } else 589 return timr;
616 spin_unlock_irqrestore(&idr_lock, *flags); 590 }
591 spin_unlock(&timr->it_lock);
592 }
593 spin_unlock_irqrestore(&idr_lock, *flags);
617 594
618 return timr; 595 return NULL;
619} 596}
620 597
621/* 598/*
@@ -662,7 +639,7 @@ common_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting)
662 (timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE)) 639 (timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE))
663 timr->it_overrun += (unsigned int) hrtimer_forward(timer, now, iv); 640 timr->it_overrun += (unsigned int) hrtimer_forward(timer, now, iv);
664 641
665 remaining = ktime_sub(timer->expires, now); 642 remaining = ktime_sub(hrtimer_get_expires(timer), now);
666 /* Return 0 only, when the timer is expired and not pending */ 643 /* Return 0 only, when the timer is expired and not pending */
667 if (remaining.tv64 <= 0) { 644 if (remaining.tv64 <= 0) {
668 /* 645 /*
@@ -756,7 +733,7 @@ common_timer_set(struct k_itimer *timr, int flags,
756 hrtimer_init(&timr->it.real.timer, timr->it_clock, mode); 733 hrtimer_init(&timr->it.real.timer, timr->it_clock, mode);
757 timr->it.real.timer.function = posix_timer_fn; 734 timr->it.real.timer.function = posix_timer_fn;
758 735
759 timer->expires = timespec_to_ktime(new_setting->it_value); 736 hrtimer_set_expires(timer, timespec_to_ktime(new_setting->it_value));
760 737
761 /* Convert interval */ 738 /* Convert interval */
762 timr->it.real.interval = timespec_to_ktime(new_setting->it_interval); 739 timr->it.real.interval = timespec_to_ktime(new_setting->it_interval);
@@ -765,14 +742,12 @@ common_timer_set(struct k_itimer *timr, int flags,
765 if (((timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE)) { 742 if (((timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE)) {
766 /* Setup correct expiry time for relative timers */ 743 /* Setup correct expiry time for relative timers */
767 if (mode == HRTIMER_MODE_REL) { 744 if (mode == HRTIMER_MODE_REL) {
768 timer->expires = 745 hrtimer_add_expires(timer, timer->base->get_time());
769 ktime_add_safe(timer->expires,
770 timer->base->get_time());
771 } 746 }
772 return 0; 747 return 0;
773 } 748 }
774 749
775 hrtimer_start(timer, timer->expires, mode); 750 hrtimer_start_expires(timer, mode);
776 return 0; 751 return 0;
777} 752}
778 753
@@ -856,11 +831,9 @@ retry_delete:
856 * This keeps any tasks waiting on the spin lock from thinking 831 * This keeps any tasks waiting on the spin lock from thinking
857 * they got something (see the lock code above). 832 * they got something (see the lock code above).
858 */ 833 */
859 if (timer->it_process) { 834 put_task_struct(timer->it_process);
860 if (timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID)) 835 timer->it_process = NULL;
861 put_task_struct(timer->it_process); 836
862 timer->it_process = NULL;
863 }
864 unlock_timer(timer, flags); 837 unlock_timer(timer, flags);
865 release_posix_timer(timer, IT_ID_SET); 838 release_posix_timer(timer, IT_ID_SET);
866 return 0; 839 return 0;
@@ -885,11 +858,9 @@ retry_delete:
885 * This keeps any tasks waiting on the spin lock from thinking 858 * This keeps any tasks waiting on the spin lock from thinking
886 * they got something (see the lock code above). 859 * they got something (see the lock code above).
887 */ 860 */
888 if (timer->it_process) { 861 put_task_struct(timer->it_process);
889 if (timer->it_sigev_notify == (SIGEV_SIGNAL|SIGEV_THREAD_ID)) 862 timer->it_process = NULL;
890 put_task_struct(timer->it_process); 863
891 timer->it_process = NULL;
892 }
893 unlock_timer(timer, flags); 864 unlock_timer(timer, flags);
894 release_posix_timer(timer, IT_ID_SET); 865 release_posix_timer(timer, IT_ID_SET);
895} 866}
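Besides the itimer/signal cleanups, the first posix-timers.c hunk above registers CLOCK_MONOTONIC_RAW, a monotonic clock that is not subject to NTP frequency adjustment. A small userspace check follows, assuming a libc that exposes CLOCK_MONOTONIC_RAW (glibc 2.12 or later; link with -lrt on older glibc):

#include <stdio.h>
#include <time.h>

int main(void)
{
        struct timespec ts;

        /* CLOCK_MONOTONIC_RAW is the clock registered by this hunk; unlike
         * CLOCK_MONOTONIC it is not slewed by NTP. */
        if (clock_gettime(CLOCK_MONOTONIC_RAW, &ts) != 0) {
                perror("clock_gettime");
                return 1;
        }
        printf("raw monotonic: %ld.%09ld s\n",
               (long)ts.tv_sec, (long)ts.tv_nsec);
        return 0;
}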
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index b45da40e8d25..dcd165f92a88 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -82,7 +82,7 @@ config PM_SLEEP_SMP
82 82
83config PM_SLEEP 83config PM_SLEEP
84 bool 84 bool
85 depends on SUSPEND || HIBERNATION 85 depends on SUSPEND || HIBERNATION || XEN_SAVE_RESTORE
86 default y 86 default y
87 87
88config SUSPEND 88config SUSPEND
@@ -94,6 +94,17 @@ config SUSPEND
94 powered and thus its contents are preserved, such as the 94 powered and thus its contents are preserved, such as the
95 suspend-to-RAM state (e.g. the ACPI S3 state). 95 suspend-to-RAM state (e.g. the ACPI S3 state).
96 96
97config PM_TEST_SUSPEND
98 bool "Test suspend/resume and wakealarm during bootup"
99 depends on SUSPEND && PM_DEBUG && RTC_LIB=y
100 ---help---
101 This option will let you suspend your machine during bootup, and
102 make it wake up a few seconds later using an RTC wakeup alarm.
103 Enable this with a kernel parameter like "test_suspend=mem".
104
105 You probably want to have your system's RTC driver statically
106 linked, ensuring that it's available when this test runs.
107
97config SUSPEND_FREEZER 108config SUSPEND_FREEZER
98 bool "Enable freezer for suspend to RAM/standby" \ 109 bool "Enable freezer for suspend to RAM/standby" \
99 if ARCH_WANTS_FREEZER_CONTROL || BROKEN 110 if ARCH_WANTS_FREEZER_CONTROL || BROKEN
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index f011e0870b52..c9d74083746f 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -14,6 +14,7 @@
14#include <linux/reboot.h> 14#include <linux/reboot.h>
15#include <linux/string.h> 15#include <linux/string.h>
16#include <linux/device.h> 16#include <linux/device.h>
17#include <linux/kmod.h>
17#include <linux/delay.h> 18#include <linux/delay.h>
18#include <linux/fs.h> 19#include <linux/fs.h>
19#include <linux/mount.h> 20#include <linux/mount.h>
@@ -21,6 +22,7 @@
21#include <linux/console.h> 22#include <linux/console.h>
22#include <linux/cpu.h> 23#include <linux/cpu.h>
23#include <linux/freezer.h> 24#include <linux/freezer.h>
25#include <linux/ftrace.h>
24 26
25#include "power.h" 27#include "power.h"
26 28
@@ -255,7 +257,7 @@ static int create_image(int platform_mode)
255 257
256int hibernation_snapshot(int platform_mode) 258int hibernation_snapshot(int platform_mode)
257{ 259{
258 int error; 260 int error, ftrace_save;
259 261
260 /* Free memory before shutting down devices. */ 262 /* Free memory before shutting down devices. */
261 error = swsusp_shrink_memory(); 263 error = swsusp_shrink_memory();
@@ -267,6 +269,7 @@ int hibernation_snapshot(int platform_mode)
267 goto Close; 269 goto Close;
268 270
269 suspend_console(); 271 suspend_console();
272 ftrace_save = __ftrace_enabled_save();
270 error = device_suspend(PMSG_FREEZE); 273 error = device_suspend(PMSG_FREEZE);
271 if (error) 274 if (error)
272 goto Recover_platform; 275 goto Recover_platform;
@@ -296,6 +299,7 @@ int hibernation_snapshot(int platform_mode)
296 Resume_devices: 299 Resume_devices:
297 device_resume(in_suspend ? 300 device_resume(in_suspend ?
298 (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE); 301 (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE);
302 __ftrace_enabled_restore(ftrace_save);
299 resume_console(); 303 resume_console();
300 Close: 304 Close:
301 platform_end(platform_mode); 305 platform_end(platform_mode);
@@ -366,10 +370,11 @@ static int resume_target_kernel(void)
366 370
367int hibernation_restore(int platform_mode) 371int hibernation_restore(int platform_mode)
368{ 372{
369 int error; 373 int error, ftrace_save;
370 374
371 pm_prepare_console(); 375 pm_prepare_console();
372 suspend_console(); 376 suspend_console();
377 ftrace_save = __ftrace_enabled_save();
373 error = device_suspend(PMSG_QUIESCE); 378 error = device_suspend(PMSG_QUIESCE);
374 if (error) 379 if (error)
375 goto Finish; 380 goto Finish;
@@ -384,6 +389,7 @@ int hibernation_restore(int platform_mode)
384 platform_restore_cleanup(platform_mode); 389 platform_restore_cleanup(platform_mode);
385 device_resume(PMSG_RECOVER); 390 device_resume(PMSG_RECOVER);
386 Finish: 391 Finish:
392 __ftrace_enabled_restore(ftrace_save);
387 resume_console(); 393 resume_console();
388 pm_restore_console(); 394 pm_restore_console();
389 return error; 395 return error;
@@ -396,7 +402,7 @@ int hibernation_restore(int platform_mode)
396 402
397int hibernation_platform_enter(void) 403int hibernation_platform_enter(void)
398{ 404{
399 int error; 405 int error, ftrace_save;
400 406
401 if (!hibernation_ops) 407 if (!hibernation_ops)
402 return -ENOSYS; 408 return -ENOSYS;
@@ -411,6 +417,7 @@ int hibernation_platform_enter(void)
411 goto Close; 417 goto Close;
412 418
413 suspend_console(); 419 suspend_console();
420 ftrace_save = __ftrace_enabled_save();
414 error = device_suspend(PMSG_HIBERNATE); 421 error = device_suspend(PMSG_HIBERNATE);
415 if (error) { 422 if (error) {
416 if (hibernation_ops->recover) 423 if (hibernation_ops->recover)
@@ -445,6 +452,7 @@ int hibernation_platform_enter(void)
445 hibernation_ops->finish(); 452 hibernation_ops->finish();
446 Resume_devices: 453 Resume_devices:
447 device_resume(PMSG_RESTORE); 454 device_resume(PMSG_RESTORE);
455 __ftrace_enabled_restore(ftrace_save);
448 resume_console(); 456 resume_console();
449 Close: 457 Close:
450 hibernation_ops->end(); 458 hibernation_ops->end();
@@ -513,6 +521,10 @@ int hibernate(void)
513 if (error) 521 if (error)
514 goto Exit; 522 goto Exit;
515 523
524 error = usermodehelper_disable();
525 if (error)
526 goto Exit;
527
516 /* Allocate memory management structures */ 528 /* Allocate memory management structures */
517 error = create_basic_memory_bitmaps(); 529 error = create_basic_memory_bitmaps();
518 if (error) 530 if (error)
@@ -551,6 +563,7 @@ int hibernate(void)
551 thaw_processes(); 563 thaw_processes();
552 Finish: 564 Finish:
553 free_basic_memory_bitmaps(); 565 free_basic_memory_bitmaps();
566 usermodehelper_enable();
554 Exit: 567 Exit:
555 pm_notifier_call_chain(PM_POST_HIBERNATION); 568 pm_notifier_call_chain(PM_POST_HIBERNATION);
556 pm_restore_console(); 569 pm_restore_console();
@@ -627,6 +640,10 @@ static int software_resume(void)
627 if (error) 640 if (error)
628 goto Finish; 641 goto Finish;
629 642
643 error = usermodehelper_disable();
644 if (error)
645 goto Finish;
646
630 error = create_basic_memory_bitmaps(); 647 error = create_basic_memory_bitmaps();
631 if (error) 648 if (error)
632 goto Finish; 649 goto Finish;
@@ -634,7 +651,7 @@ static int software_resume(void)
634 pr_debug("PM: Preparing processes for restore.\n"); 651 pr_debug("PM: Preparing processes for restore.\n");
635 error = prepare_processes(); 652 error = prepare_processes();
636 if (error) { 653 if (error) {
637 swsusp_close(); 654 swsusp_close(FMODE_READ);
638 goto Done; 655 goto Done;
639 } 656 }
640 657
@@ -649,6 +666,7 @@ static int software_resume(void)
649 thaw_processes(); 666 thaw_processes();
650 Done: 667 Done:
651 free_basic_memory_bitmaps(); 668 free_basic_memory_bitmaps();
669 usermodehelper_enable();
652 Finish: 670 Finish:
653 pm_notifier_call_chain(PM_POST_RESTORE); 671 pm_notifier_call_chain(PM_POST_RESTORE);
654 pm_restore_console(); 672 pm_restore_console();
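Two patterns recur in the disk.c hunks above: device suspend/resume is bracketed by __ftrace_enabled_save()/__ftrace_enabled_restore(), and the hibernate and resume paths are bracketed by usermodehelper_disable()/usermodehelper_enable(), with every exit path after a successful disable unwinding through the matching enable. The stand-alone sketch below shows only that unwind shape; the demo_* helpers are hypothetical stand-ins for the kernel calls.

#include <stdio.h>

static int demo_disable(void)  { puts("helpers disabled");    return 0; }
static void demo_enable(void)  { puts("helpers re-enabled");             }
static int demo_do_work(void)  { puts("snapshot work");       return -1; /* pretend failure */ }

/* The unwind pattern used in hibernate()/software_resume(): once disable
 * succeeded, every path back out must pass through the matching enable,
 * even when the work in the middle fails. */
static int demo_hibernate(void)
{
        int error = demo_disable();

        if (error)
                goto Exit;

        error = demo_do_work();

        demo_enable();
 Exit:
        return error;
}

int main(void)
{
        return demo_hibernate() ? 1 : 0;
}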
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 3398f4651aa1..19122cf6d827 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -14,6 +14,7 @@
14#include <linux/string.h> 14#include <linux/string.h>
15#include <linux/delay.h> 15#include <linux/delay.h>
16#include <linux/errno.h> 16#include <linux/errno.h>
17#include <linux/kmod.h>
17#include <linux/init.h> 18#include <linux/init.h>
18#include <linux/console.h> 19#include <linux/console.h>
19#include <linux/cpu.h> 20#include <linux/cpu.h>
@@ -21,6 +22,7 @@
21#include <linux/freezer.h> 22#include <linux/freezer.h>
22#include <linux/vmstat.h> 23#include <linux/vmstat.h>
23#include <linux/syscalls.h> 24#include <linux/syscalls.h>
25#include <linux/ftrace.h>
24 26
25#include "power.h" 27#include "power.h"
26 28
@@ -132,6 +134,61 @@ static inline int suspend_test(int level) { return 0; }
132 134
133#ifdef CONFIG_SUSPEND 135#ifdef CONFIG_SUSPEND
134 136
137#ifdef CONFIG_PM_TEST_SUSPEND
138
139/*
140 * We test the system suspend code by setting an RTC wakealarm a short
141 * time in the future, then suspending. Suspending the devices won't
142 * normally take long ... some systems only need a few milliseconds.
143 *
144 * The time it takes is system-specific though, so when we test this
145 * during system bootup we allow a LOT of time.
146 */
147#define TEST_SUSPEND_SECONDS 5
148
149static unsigned long suspend_test_start_time;
150
151static void suspend_test_start(void)
152{
153 /* FIXME Use better timebase than "jiffies", ideally a clocksource.
154 * What we want is a hardware counter that will work correctly even
155 * during the irqs-are-off stages of the suspend/resume cycle...
156 */
157 suspend_test_start_time = jiffies;
158}
159
160static void suspend_test_finish(const char *label)
161{
162 long nj = jiffies - suspend_test_start_time;
163 unsigned msec;
164
165 msec = jiffies_to_msecs(abs(nj));
166 pr_info("PM: %s took %d.%03d seconds\n", label,
167 msec / 1000, msec % 1000);
168
169 /* Warning on suspend means the RTC alarm period needs to be
170 * larger -- the system was sooo slooowwww to suspend that the
171 * alarm (should have) fired before the system went to sleep!
172 *
173 * Warning on either suspend or resume also means the system
174 * has some performance issues. The stack dump of a WARN_ON
175 * is more likely to get the right attention than a printk...
176 */
177 WARN_ON(msec > (TEST_SUSPEND_SECONDS * 1000));
178}
179
180#else
181
182static void suspend_test_start(void)
183{
184}
185
186static void suspend_test_finish(const char *label)
187{
188}
189
190#endif
191
135/* This is just an arbitrary number */ 192/* This is just an arbitrary number */
136#define FREE_PAGE_NUMBER (100) 193#define FREE_PAGE_NUMBER (100)
137 194
@@ -181,6 +238,10 @@ static int suspend_prepare(void)
181 if (error) 238 if (error)
182 goto Finish; 239 goto Finish;
183 240
241 error = usermodehelper_disable();
242 if (error)
243 goto Finish;
244
184 if (suspend_freeze_processes()) { 245 if (suspend_freeze_processes()) {
185 error = -EAGAIN; 246 error = -EAGAIN;
186 goto Thaw; 247 goto Thaw;
@@ -200,6 +261,7 @@ static int suspend_prepare(void)
200 261
201 Thaw: 262 Thaw:
202 suspend_thaw_processes(); 263 suspend_thaw_processes();
264 usermodehelper_enable();
203 Finish: 265 Finish:
204 pm_notifier_call_chain(PM_POST_SUSPEND); 266 pm_notifier_call_chain(PM_POST_SUSPEND);
205 pm_restore_console(); 267 pm_restore_console();
@@ -255,7 +317,7 @@ static int suspend_enter(suspend_state_t state)
255 */ 317 */
256int suspend_devices_and_enter(suspend_state_t state) 318int suspend_devices_and_enter(suspend_state_t state)
257{ 319{
258 int error; 320 int error, ftrace_save;
259 321
260 if (!suspend_ops) 322 if (!suspend_ops)
261 return -ENOSYS; 323 return -ENOSYS;
@@ -266,12 +328,14 @@ int suspend_devices_and_enter(suspend_state_t state)
266 goto Close; 328 goto Close;
267 } 329 }
268 suspend_console(); 330 suspend_console();
331 ftrace_save = __ftrace_enabled_save();
332 suspend_test_start();
269 error = device_suspend(PMSG_SUSPEND); 333 error = device_suspend(PMSG_SUSPEND);
270 if (error) { 334 if (error) {
271 printk(KERN_ERR "PM: Some devices failed to suspend\n"); 335 printk(KERN_ERR "PM: Some devices failed to suspend\n");
272 goto Recover_platform; 336 goto Recover_platform;
273 } 337 }
274 338 suspend_test_finish("suspend devices");
275 if (suspend_test(TEST_DEVICES)) 339 if (suspend_test(TEST_DEVICES))
276 goto Recover_platform; 340 goto Recover_platform;
277 341
@@ -293,7 +357,10 @@ int suspend_devices_and_enter(suspend_state_t state)
293 if (suspend_ops->finish) 357 if (suspend_ops->finish)
294 suspend_ops->finish(); 358 suspend_ops->finish();
295 Resume_devices: 359 Resume_devices:
360 suspend_test_start();
296 device_resume(PMSG_RESUME); 361 device_resume(PMSG_RESUME);
362 suspend_test_finish("resume devices");
363 __ftrace_enabled_restore(ftrace_save);
297 resume_console(); 364 resume_console();
298 Close: 365 Close:
299 if (suspend_ops->end) 366 if (suspend_ops->end)
@@ -315,6 +382,7 @@ int suspend_devices_and_enter(suspend_state_t state)
315static void suspend_finish(void) 382static void suspend_finish(void)
316{ 383{
317 suspend_thaw_processes(); 384 suspend_thaw_processes();
385 usermodehelper_enable();
318 pm_notifier_call_chain(PM_POST_SUSPEND); 386 pm_notifier_call_chain(PM_POST_SUSPEND);
319 pm_restore_console(); 387 pm_restore_console();
320} 388}
@@ -521,3 +589,144 @@ static int __init pm_init(void)
521} 589}
522 590
523core_initcall(pm_init); 591core_initcall(pm_init);
592
593
594#ifdef CONFIG_PM_TEST_SUSPEND
595
596#include <linux/rtc.h>
597
598/*
599 * To test system suspend, we need a hands-off mechanism to resume the
600 * system. RTCs wake alarms are a common self-contained mechanism.
601 */
602
603static void __init test_wakealarm(struct rtc_device *rtc, suspend_state_t state)
604{
605 static char err_readtime[] __initdata =
606 KERN_ERR "PM: can't read %s time, err %d\n";
607 static char err_wakealarm [] __initdata =
608 KERN_ERR "PM: can't set %s wakealarm, err %d\n";
609 static char err_suspend[] __initdata =
610 KERN_ERR "PM: suspend test failed, error %d\n";
611 static char info_test[] __initdata =
612 KERN_INFO "PM: test RTC wakeup from '%s' suspend\n";
613
614 unsigned long now;
615 struct rtc_wkalrm alm;
616 int status;
617
618 /* this may fail if the RTC hasn't been initialized */
619 status = rtc_read_time(rtc, &alm.time);
620 if (status < 0) {
621 printk(err_readtime, rtc->dev.bus_id, status);
622 return;
623 }
624 rtc_tm_to_time(&alm.time, &now);
625
626 memset(&alm, 0, sizeof alm);
627 rtc_time_to_tm(now + TEST_SUSPEND_SECONDS, &alm.time);
628 alm.enabled = true;
629
630 status = rtc_set_alarm(rtc, &alm);
631 if (status < 0) {
632 printk(err_wakealarm, rtc->dev.bus_id, status);
633 return;
634 }
635
636 if (state == PM_SUSPEND_MEM) {
637 printk(info_test, pm_states[state]);
638 status = pm_suspend(state);
639 if (status == -ENODEV)
640 state = PM_SUSPEND_STANDBY;
641 }
642 if (state == PM_SUSPEND_STANDBY) {
643 printk(info_test, pm_states[state]);
644 status = pm_suspend(state);
645 }
646 if (status < 0)
647 printk(err_suspend, status);
648
649 /* Some platforms can't detect that the alarm triggered the
 650 * wakeup, or (accordingly) disable it afterwards.
651 * It's supposed to give oneshot behavior; cope.
652 */
653 alm.enabled = false;
654 rtc_set_alarm(rtc, &alm);
655}
656
657static int __init has_wakealarm(struct device *dev, void *name_ptr)
658{
659 struct rtc_device *candidate = to_rtc_device(dev);
660
661 if (!candidate->ops->set_alarm)
662 return 0;
663 if (!device_may_wakeup(candidate->dev.parent))
664 return 0;
665
666 *(char **)name_ptr = dev->bus_id;
667 return 1;
668}
669
670/*
671 * Kernel options like "test_suspend=mem" force suspend/resume sanity tests
672 * at startup time. They're normally disabled, for faster boot and because
673 * we can't know which states really work on this particular system.
674 */
675static suspend_state_t test_state __initdata = PM_SUSPEND_ON;
676
677static char warn_bad_state[] __initdata =
678 KERN_WARNING "PM: can't test '%s' suspend state\n";
679
680static int __init setup_test_suspend(char *value)
681{
682 unsigned i;
683
684 /* "=mem" ==> "mem" */
685 value++;
686 for (i = 0; i < PM_SUSPEND_MAX; i++) {
687 if (!pm_states[i])
688 continue;
689 if (strcmp(pm_states[i], value) != 0)
690 continue;
691 test_state = (__force suspend_state_t) i;
692 return 0;
693 }
694 printk(warn_bad_state, value);
695 return 0;
696}
697__setup("test_suspend", setup_test_suspend);
698
699static int __init test_suspend(void)
700{
701 static char warn_no_rtc[] __initdata =
702 KERN_WARNING "PM: no wakealarm-capable RTC driver is ready\n";
703
704 char *pony = NULL;
705 struct rtc_device *rtc = NULL;
706
707 /* PM is initialized by now; is that state testable? */
708 if (test_state == PM_SUSPEND_ON)
709 goto done;
710 if (!valid_state(test_state)) {
711 printk(warn_bad_state, pm_states[test_state]);
712 goto done;
713 }
714
715 /* RTCs have initialized by now too ... can we use one? */
716 class_find_device(rtc_class, NULL, &pony, has_wakealarm);
717 if (pony)
718 rtc = rtc_class_open(pony);
719 if (!rtc) {
720 printk(warn_no_rtc);
721 goto done;
722 }
723
724 /* go for it */
725 test_wakealarm(rtc, test_state);
726 rtc_class_close(rtc);
727done:
728 return 0;
729}
730late_initcall(test_suspend);
731
732#endif /* CONFIG_PM_TEST_SUSPEND */
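The PM_TEST_SUSPEND code above arms an RTC wakealarm TEST_SUSPEND_SECONDS in the future and then suspends, so the machine wakes itself. The same experiment can be run by hand through sysfs; the sketch below is only an analogue of what the boot-time test automates, and it assumes a root shell, a wakeup-capable rtc0 that tracks system time, no alarm already armed, and a platform whose /sys/power/state supports "mem".

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <time.h>
#include <unistd.h>

/* Write a string to a sysfs attribute; returns 0 on success. */
static int write_str(const char *path, const char *val)
{
        int fd = open(path, O_WRONLY);
        ssize_t n;

        if (fd < 0)
                return -1;
        n = write(fd, val, strlen(val));
        close(fd);
        return n < 0 ? -1 : 0;
}

int main(void)
{
        char alarm[32];

        /* Arm the RTC to fire 5 seconds from now (the same interval the
         * boot-time test uses), then suspend to RAM. */
        snprintf(alarm, sizeof(alarm), "%ld", (long)time(NULL) + 5);
        if (write_str("/sys/class/rtc/rtc0/wakealarm", alarm)) {
                perror("wakealarm");
                return 1;
        }
        if (write_str("/sys/power/state", "mem")) {
                perror("suspend");
                return 1;
        }
        puts("resumed by RTC alarm");
        return 0;
}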
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 700f44ec8406..46b5ec7a3afb 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -53,8 +53,6 @@ extern int hibernation_platform_enter(void);
53 53
54extern int pfn_is_nosave(unsigned long); 54extern int pfn_is_nosave(unsigned long);
55 55
56extern struct mutex pm_mutex;
57
58#define power_attr(_name) \ 56#define power_attr(_name) \
59static struct kobj_attribute _name##_attr = { \ 57static struct kobj_attribute _name##_attr = { \
60 .attr = { \ 58 .attr = { \
@@ -155,7 +153,7 @@ extern int swsusp_shrink_memory(void);
155extern void swsusp_free(void); 153extern void swsusp_free(void);
156extern int swsusp_read(unsigned int *flags_p); 154extern int swsusp_read(unsigned int *flags_p);
157extern int swsusp_write(unsigned int flags); 155extern int swsusp_write(unsigned int flags);
158extern void swsusp_close(void); 156extern void swsusp_close(fmode_t);
159 157
160struct timeval; 158struct timeval;
161/* kernel/power/swsusp.c */ 159/* kernel/power/swsusp.c */
diff --git a/kernel/power/poweroff.c b/kernel/power/poweroff.c
index 678ec736076b..72016f051477 100644
--- a/kernel/power/poweroff.c
+++ b/kernel/power/poweroff.c
@@ -10,6 +10,7 @@
10#include <linux/pm.h> 10#include <linux/pm.h>
11#include <linux/workqueue.h> 11#include <linux/workqueue.h>
12#include <linux/reboot.h> 12#include <linux/reboot.h>
13#include <linux/cpumask.h>
13 14
14/* 15/*
15 * When the user hits Sys-Rq o to power down the machine this is the 16 * When the user hits Sys-Rq o to power down the machine this is the
@@ -25,7 +26,8 @@ static DECLARE_WORK(poweroff_work, do_poweroff);
25 26
26static void handle_poweroff(int key, struct tty_struct *tty) 27static void handle_poweroff(int key, struct tty_struct *tty)
27{ 28{
28 schedule_work(&poweroff_work); 29 /* run sysrq poweroff on boot cpu */
30 schedule_work_on(first_cpu(cpu_online_map), &poweroff_work);
29} 31}
30 32
31static struct sysrq_key_op sysrq_poweroff_op = { 33static struct sysrq_key_op sysrq_poweroff_op = {
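The poweroff.c hunk pins the sysrq poweroff work to the boot CPU with schedule_work_on(first_cpu(cpu_online_map), ...). As a rough userspace analogue of "run this on a specific CPU", the sketch below pins the calling task to CPU 0 with sched_setaffinity(); it only illustrates the affinity idea, not the kernel workqueue API.

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

int main(void)
{
        cpu_set_t set;

        CPU_ZERO(&set);
        CPU_SET(0, &set);               /* "boot CPU" in this analogy */
        if (sched_setaffinity(0, sizeof(set), &set) != 0) {
                perror("sched_setaffinity");
                return 1;
        }
        printf("now running on CPU %d\n", sched_getcpu());
        return 0;
}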
diff --git a/kernel/power/process.c b/kernel/power/process.c
index 5fb87652f214..ca634019497a 100644
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -28,128 +28,13 @@ static inline int freezeable(struct task_struct * p)
28 return 1; 28 return 1;
29} 29}
30 30
31/*
32 * freezing is complete, mark current process as frozen
33 */
34static inline void frozen_process(void)
35{
36 if (!unlikely(current->flags & PF_NOFREEZE)) {
37 current->flags |= PF_FROZEN;
38 wmb();
39 }
40 clear_freeze_flag(current);
41}
42
43/* Refrigerator is place where frozen processes are stored :-). */
44void refrigerator(void)
45{
46 /* Hmm, should we be allowed to suspend when there are realtime
47 processes around? */
48 long save;
49
50 task_lock(current);
51 if (freezing(current)) {
52 frozen_process();
53 task_unlock(current);
54 } else {
55 task_unlock(current);
56 return;
57 }
58 save = current->state;
59 pr_debug("%s entered refrigerator\n", current->comm);
60
61 spin_lock_irq(&current->sighand->siglock);
62 recalc_sigpending(); /* We sent fake signal, clean it up */
63 spin_unlock_irq(&current->sighand->siglock);
64
65 for (;;) {
66 set_current_state(TASK_UNINTERRUPTIBLE);
67 if (!frozen(current))
68 break;
69 schedule();
70 }
71 pr_debug("%s left refrigerator\n", current->comm);
72 __set_current_state(save);
73}
74
75static void fake_signal_wake_up(struct task_struct *p)
76{
77 unsigned long flags;
78
79 spin_lock_irqsave(&p->sighand->siglock, flags);
80 signal_wake_up(p, 0);
81 spin_unlock_irqrestore(&p->sighand->siglock, flags);
82}
83
84static inline bool should_send_signal(struct task_struct *p)
85{
86 return !(p->flags & PF_FREEZER_NOSIG);
87}
88
89/**
90 * freeze_task - send a freeze request to given task
91 * @p: task to send the request to
92 * @sig_only: if set, the request will only be sent if the task has the
93 * PF_FREEZER_NOSIG flag unset
94 * Return value: 'false', if @sig_only is set and the task has
95 * PF_FREEZER_NOSIG set or the task is frozen, 'true', otherwise
96 *
97 * The freeze request is sent by setting the tasks's TIF_FREEZE flag and
98 * either sending a fake signal to it or waking it up, depending on whether
99 * or not it has PF_FREEZER_NOSIG set. If @sig_only is set and the task
100 * has PF_FREEZER_NOSIG set (ie. it is a typical kernel thread), its
101 * TIF_FREEZE flag will not be set.
102 */
103static bool freeze_task(struct task_struct *p, bool sig_only)
104{
105 /*
106 * We first check if the task is freezing and next if it has already
107 * been frozen to avoid the race with frozen_process() which first marks
108 * the task as frozen and next clears its TIF_FREEZE.
109 */
110 if (!freezing(p)) {
111 rmb();
112 if (frozen(p))
113 return false;
114
115 if (!sig_only || should_send_signal(p))
116 set_freeze_flag(p);
117 else
118 return false;
119 }
120
121 if (should_send_signal(p)) {
122 if (!signal_pending(p))
123 fake_signal_wake_up(p);
124 } else if (sig_only) {
125 return false;
126 } else {
127 wake_up_state(p, TASK_INTERRUPTIBLE);
128 }
129
130 return true;
131}
132
133static void cancel_freezing(struct task_struct *p)
134{
135 unsigned long flags;
136
137 if (freezing(p)) {
138 pr_debug(" clean up: %s\n", p->comm);
139 clear_freeze_flag(p);
140 spin_lock_irqsave(&p->sighand->siglock, flags);
141 recalc_sigpending_and_wake(p);
142 spin_unlock_irqrestore(&p->sighand->siglock, flags);
143 }
144}
145
146static int try_to_freeze_tasks(bool sig_only) 31static int try_to_freeze_tasks(bool sig_only)
147{ 32{
148 struct task_struct *g, *p; 33 struct task_struct *g, *p;
149 unsigned long end_time; 34 unsigned long end_time;
150 unsigned int todo; 35 unsigned int todo;
151 struct timeval start, end; 36 struct timeval start, end;
152 s64 elapsed_csecs64; 37 u64 elapsed_csecs64;
153 unsigned int elapsed_csecs; 38 unsigned int elapsed_csecs;
154 39
155 do_gettimeofday(&start); 40 do_gettimeofday(&start);
@@ -250,6 +135,9 @@ static void thaw_tasks(bool nosig_only)
250 if (nosig_only && should_send_signal(p)) 135 if (nosig_only && should_send_signal(p))
251 continue; 136 continue;
252 137
138 if (cgroup_frozen(p))
139 continue;
140
253 thaw_process(p); 141 thaw_process(p);
254 } while_each_thread(g, p); 142 } while_each_thread(g, p);
255 read_unlock(&tasklist_lock); 143 read_unlock(&tasklist_lock);
@@ -264,4 +152,3 @@ void thaw_processes(void)
264 printk("done.\n"); 152 printk("done.\n");
265} 153}
266 154
267EXPORT_SYMBOL(refrigerator);
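The refrigerator()/freeze_task() implementation removed above moves into the new kernel/freezer.c (visible in the diffstat). Its core is a park-until-thawed loop; the self-contained pthread sketch below mirrors that shape with illustrative names and is not the kernel code. Build with -pthread.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t thawed = PTHREAD_COND_INITIALIZER;
static bool frozen;

/* Analogue of refrigerator(): park until the freezer thaws us. */
static void demo_refrigerator(void)
{
        pthread_mutex_lock(&lock);
        while (frozen)                  /* like the loop on frozen(current) */
                pthread_cond_wait(&thawed, &lock);
        pthread_mutex_unlock(&lock);
}

static void *worker(void *arg)
{
        int i;

        (void)arg;
        for (i = 0; i < 3; i++) {
                demo_refrigerator();    /* park here whenever frozen */
                printf("worker iteration %d\n", i);
                usleep(100 * 1000);
        }
        return NULL;
}

int main(void)
{
        pthread_t t;

        frozen = true;
        pthread_create(&t, NULL, worker, NULL);

        sleep(1);                       /* the "suspend" window */
        pthread_mutex_lock(&lock);
        frozen = false;                 /* thaw_processes() analogue */
        pthread_cond_broadcast(&thawed);
        pthread_mutex_unlock(&lock);

        pthread_join(t, NULL);
        return 0;
}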
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 5f91a07c4eac..5d2ab836e998 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -205,8 +205,7 @@ static void chain_free(struct chain_allocator *ca, int clear_page_nosave)
205 * objects. The main list's elements are of type struct zone_bitmap 205 * objects. The main list's elements are of type struct zone_bitmap
 206 * and each of them corresponds to one zone. For each zone bitmap 206 * and each of them corresponds to one zone. For each zone bitmap
207 * object there is a list of objects of type struct bm_block that 207 * object there is a list of objects of type struct bm_block that
 208 * represent each block of bit chunks in which information is 208 * represent each block of the bitmap in which information is stored.
209 * stored.
210 * 209 *
211 * struct memory_bitmap contains a pointer to the main list of zone 210 * struct memory_bitmap contains a pointer to the main list of zone
212 * bitmap objects, a struct bm_position used for browsing the bitmap, 211 * bitmap objects, a struct bm_position used for browsing the bitmap,
@@ -224,26 +223,27 @@ static void chain_free(struct chain_allocator *ca, int clear_page_nosave)
224 * pfns that correspond to the start and end of the represented zone. 223 * pfns that correspond to the start and end of the represented zone.
225 * 224 *
226 * struct bm_block contains a pointer to the memory page in which 225 * struct bm_block contains a pointer to the memory page in which
227 * information is stored (in the form of a block of bit chunks 226 * information is stored (in the form of a block of bitmap)
228 * of type unsigned long each). It also contains the pfns that 227 * It also contains the pfns that correspond to the start and end of
229 * correspond to the start and end of the represented memory area and 228 * the represented memory area.
230 * the number of bit chunks in the block.
231 */ 229 */
232 230
233#define BM_END_OF_MAP (~0UL) 231#define BM_END_OF_MAP (~0UL)
234 232
235#define BM_CHUNKS_PER_BLOCK (PAGE_SIZE / sizeof(long))
236#define BM_BITS_PER_CHUNK (sizeof(long) << 3)
237#define BM_BITS_PER_BLOCK (PAGE_SIZE << 3) 233#define BM_BITS_PER_BLOCK (PAGE_SIZE << 3)
238 234
239struct bm_block { 235struct bm_block {
240 struct bm_block *next; /* next element of the list */ 236 struct bm_block *next; /* next element of the list */
241 unsigned long start_pfn; /* pfn represented by the first bit */ 237 unsigned long start_pfn; /* pfn represented by the first bit */
242 unsigned long end_pfn; /* pfn represented by the last bit plus 1 */ 238 unsigned long end_pfn; /* pfn represented by the last bit plus 1 */
243 unsigned int size; /* number of bit chunks */ 239 unsigned long *data; /* bitmap representing pages */
244 unsigned long *data; /* chunks of bits representing pages */
245}; 240};
246 241
242static inline unsigned long bm_block_bits(struct bm_block *bb)
243{
244 return bb->end_pfn - bb->start_pfn;
245}
246
247struct zone_bitmap { 247struct zone_bitmap {
248 struct zone_bitmap *next; /* next element of the list */ 248 struct zone_bitmap *next; /* next element of the list */
249 unsigned long start_pfn; /* minimal pfn in this zone */ 249 unsigned long start_pfn; /* minimal pfn in this zone */
@@ -257,7 +257,6 @@ struct zone_bitmap {
257struct bm_position { 257struct bm_position {
258 struct zone_bitmap *zone_bm; 258 struct zone_bitmap *zone_bm;
259 struct bm_block *block; 259 struct bm_block *block;
260 int chunk;
261 int bit; 260 int bit;
262}; 261};
263 262
@@ -272,12 +271,6 @@ struct memory_bitmap {
272 271
273/* Functions that operate on memory bitmaps */ 272/* Functions that operate on memory bitmaps */
274 273
275static inline void memory_bm_reset_chunk(struct memory_bitmap *bm)
276{
277 bm->cur.chunk = 0;
278 bm->cur.bit = -1;
279}
280
281static void memory_bm_position_reset(struct memory_bitmap *bm) 274static void memory_bm_position_reset(struct memory_bitmap *bm)
282{ 275{
283 struct zone_bitmap *zone_bm; 276 struct zone_bitmap *zone_bm;
@@ -285,7 +278,7 @@ static void memory_bm_position_reset(struct memory_bitmap *bm)
285 zone_bm = bm->zone_bm_list; 278 zone_bm = bm->zone_bm_list;
286 bm->cur.zone_bm = zone_bm; 279 bm->cur.zone_bm = zone_bm;
287 bm->cur.block = zone_bm->bm_blocks; 280 bm->cur.block = zone_bm->bm_blocks;
288 memory_bm_reset_chunk(bm); 281 bm->cur.bit = 0;
289} 282}
290 283
291static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free); 284static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free);
@@ -394,12 +387,10 @@ memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed)
394 bb->start_pfn = pfn; 387 bb->start_pfn = pfn;
395 if (nr >= BM_BITS_PER_BLOCK) { 388 if (nr >= BM_BITS_PER_BLOCK) {
396 pfn += BM_BITS_PER_BLOCK; 389 pfn += BM_BITS_PER_BLOCK;
397 bb->size = BM_CHUNKS_PER_BLOCK;
398 nr -= BM_BITS_PER_BLOCK; 390 nr -= BM_BITS_PER_BLOCK;
399 } else { 391 } else {
400 /* This is executed only once in the loop */ 392 /* This is executed only once in the loop */
401 pfn += nr; 393 pfn += nr;
402 bb->size = DIV_ROUND_UP(nr, BM_BITS_PER_CHUNK);
403 } 394 }
404 bb->end_pfn = pfn; 395 bb->end_pfn = pfn;
405 bb = bb->next; 396 bb = bb->next;
@@ -478,8 +469,8 @@ static int memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn,
478 } 469 }
479 zone_bm->cur_block = bb; 470 zone_bm->cur_block = bb;
480 pfn -= bb->start_pfn; 471 pfn -= bb->start_pfn;
481 *bit_nr = pfn % BM_BITS_PER_CHUNK; 472 *bit_nr = pfn;
482 *addr = bb->data + pfn / BM_BITS_PER_CHUNK; 473 *addr = bb->data;
483 return 0; 474 return 0;
484} 475}
485 476
@@ -528,36 +519,6 @@ static int memory_bm_test_bit(struct memory_bitmap *bm, unsigned long pfn)
528 return test_bit(bit, addr); 519 return test_bit(bit, addr);
529} 520}
530 521
531/* Two auxiliary functions for memory_bm_next_pfn */
532
533/* Find the first set bit in the given chunk, if there is one */
534
535static inline int next_bit_in_chunk(int bit, unsigned long *chunk_p)
536{
537 bit++;
538 while (bit < BM_BITS_PER_CHUNK) {
539 if (test_bit(bit, chunk_p))
540 return bit;
541
542 bit++;
543 }
544 return -1;
545}
546
547/* Find a chunk containing some bits set in given block of bits */
548
549static inline int next_chunk_in_block(int n, struct bm_block *bb)
550{
551 n++;
552 while (n < bb->size) {
553 if (bb->data[n])
554 return n;
555
556 n++;
557 }
558 return -1;
559}
560
561/** 522/**
562 * memory_bm_next_pfn - find the pfn that corresponds to the next set bit 523 * memory_bm_next_pfn - find the pfn that corresponds to the next set bit
563 * in the bitmap @bm. If the pfn cannot be found, BM_END_OF_MAP is 524 * in the bitmap @bm. If the pfn cannot be found, BM_END_OF_MAP is
@@ -571,40 +532,33 @@ static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm)
571{ 532{
572 struct zone_bitmap *zone_bm; 533 struct zone_bitmap *zone_bm;
573 struct bm_block *bb; 534 struct bm_block *bb;
574 int chunk;
575 int bit; 535 int bit;
576 536
577 do { 537 do {
578 bb = bm->cur.block; 538 bb = bm->cur.block;
579 do { 539 do {
580 chunk = bm->cur.chunk;
581 bit = bm->cur.bit; 540 bit = bm->cur.bit;
582 do { 541 bit = find_next_bit(bb->data, bm_block_bits(bb), bit);
583 bit = next_bit_in_chunk(bit, bb->data + chunk); 542 if (bit < bm_block_bits(bb))
584 if (bit >= 0) 543 goto Return_pfn;
585 goto Return_pfn; 544
586
587 chunk = next_chunk_in_block(chunk, bb);
588 bit = -1;
589 } while (chunk >= 0);
590 bb = bb->next; 545 bb = bb->next;
591 bm->cur.block = bb; 546 bm->cur.block = bb;
592 memory_bm_reset_chunk(bm); 547 bm->cur.bit = 0;
593 } while (bb); 548 } while (bb);
594 zone_bm = bm->cur.zone_bm->next; 549 zone_bm = bm->cur.zone_bm->next;
595 if (zone_bm) { 550 if (zone_bm) {
596 bm->cur.zone_bm = zone_bm; 551 bm->cur.zone_bm = zone_bm;
597 bm->cur.block = zone_bm->bm_blocks; 552 bm->cur.block = zone_bm->bm_blocks;
598 memory_bm_reset_chunk(bm); 553 bm->cur.bit = 0;
599 } 554 }
600 } while (zone_bm); 555 } while (zone_bm);
601 memory_bm_position_reset(bm); 556 memory_bm_position_reset(bm);
602 return BM_END_OF_MAP; 557 return BM_END_OF_MAP;
603 558
604 Return_pfn: 559 Return_pfn:
605 bm->cur.chunk = chunk; 560 bm->cur.bit = bit + 1;
606 bm->cur.bit = bit; 561 return bb->start_pfn + bit;
607 return bb->start_pfn + chunk * BM_BITS_PER_CHUNK + bit;
608} 562}
609 563
610/** 564/**
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index a0abf9a463f9..b7713b53d07a 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -14,7 +14,6 @@
14#include <linux/module.h> 14#include <linux/module.h>
15#include <linux/file.h> 15#include <linux/file.h>
16#include <linux/utsname.h> 16#include <linux/utsname.h>
17#include <linux/version.h>
18#include <linux/delay.h> 17#include <linux/delay.h>
19#include <linux/bitops.h> 18#include <linux/bitops.h>
20#include <linux/genhd.h> 19#include <linux/genhd.h>
@@ -173,13 +172,13 @@ static int swsusp_swap_check(void) /* This is called before saving image */
173 return res; 172 return res;
174 173
175 root_swap = res; 174 root_swap = res;
176 res = blkdev_get(resume_bdev, FMODE_WRITE, O_RDWR); 175 res = blkdev_get(resume_bdev, FMODE_WRITE);
177 if (res) 176 if (res)
178 return res; 177 return res;
179 178
180 res = set_blocksize(resume_bdev, PAGE_SIZE); 179 res = set_blocksize(resume_bdev, PAGE_SIZE);
181 if (res < 0) 180 if (res < 0)
182 blkdev_put(resume_bdev); 181 blkdev_put(resume_bdev, FMODE_WRITE);
183 182
184 return res; 183 return res;
185} 184}
@@ -427,7 +426,7 @@ int swsusp_write(unsigned int flags)
427 426
428 release_swap_writer(&handle); 427 release_swap_writer(&handle);
429 out: 428 out:
430 swsusp_close(); 429 swsusp_close(FMODE_WRITE);
431 return error; 430 return error;
432} 431}
433 432
@@ -575,7 +574,7 @@ int swsusp_read(unsigned int *flags_p)
575 error = load_image(&handle, &snapshot, header->pages - 1); 574 error = load_image(&handle, &snapshot, header->pages - 1);
576 release_swap_reader(&handle); 575 release_swap_reader(&handle);
577 576
578 blkdev_put(resume_bdev); 577 blkdev_put(resume_bdev, FMODE_READ);
579 578
580 if (!error) 579 if (!error)
581 pr_debug("PM: Image successfully loaded\n"); 580 pr_debug("PM: Image successfully loaded\n");
@@ -610,7 +609,7 @@ int swsusp_check(void)
610 return -EINVAL; 609 return -EINVAL;
611 } 610 }
612 if (error) 611 if (error)
613 blkdev_put(resume_bdev); 612 blkdev_put(resume_bdev, FMODE_READ);
614 else 613 else
615 pr_debug("PM: Signature found, resuming\n"); 614 pr_debug("PM: Signature found, resuming\n");
616 } else { 615 } else {
@@ -627,14 +626,14 @@ int swsusp_check(void)
627 * swsusp_close - close swap device. 626 * swsusp_close - close swap device.
628 */ 627 */
629 628
630void swsusp_close(void) 629void swsusp_close(fmode_t mode)
631{ 630{
632 if (IS_ERR(resume_bdev)) { 631 if (IS_ERR(resume_bdev)) {
633 pr_debug("PM: Image device not initialised\n"); 632 pr_debug("PM: Image device not initialised\n");
634 return; 633 return;
635 } 634 }
636 635
637 blkdev_put(resume_bdev); 636 blkdev_put(resume_bdev, mode); /* move up */
638} 637}
639 638
640static int swsusp_header_init(void) 639static int swsusp_header_init(void)
diff --git a/kernel/power/user.c b/kernel/power/user.c
index a6332a313262..005b93d839ba 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -212,13 +212,20 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
212 case SNAPSHOT_FREEZE: 212 case SNAPSHOT_FREEZE:
213 if (data->frozen) 213 if (data->frozen)
214 break; 214 break;
215
215 printk("Syncing filesystems ... "); 216 printk("Syncing filesystems ... ");
216 sys_sync(); 217 sys_sync();
217 printk("done.\n"); 218 printk("done.\n");
218 219
219 error = freeze_processes(); 220 error = usermodehelper_disable();
220 if (error) 221 if (error)
222 break;
223
224 error = freeze_processes();
225 if (error) {
221 thaw_processes(); 226 thaw_processes();
227 usermodehelper_enable();
228 }
222 if (!error) 229 if (!error)
223 data->frozen = 1; 230 data->frozen = 1;
224 break; 231 break;
@@ -227,6 +234,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
227 if (!data->frozen || data->ready) 234 if (!data->frozen || data->ready)
228 break; 235 break;
229 thaw_processes(); 236 thaw_processes();
237 usermodehelper_enable();
230 data->frozen = 0; 238 data->frozen = 0;
231 break; 239 break;
232 240
diff --git a/kernel/printk.c b/kernel/printk.c
index 07ad9e7f7a66..6341af77eb65 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -13,7 +13,7 @@
13 * Fixed SMP synchronization, 08/08/99, Manfred Spraul 13 * Fixed SMP synchronization, 08/08/99, Manfred Spraul
14 * manfred@colorfullife.com 14 * manfred@colorfullife.com
15 * Rewrote bits to get rid of console_lock 15 * Rewrote bits to get rid of console_lock
16 * 01Mar01 Andrew Morton <andrewm@uow.edu.au> 16 * 01Mar01 Andrew Morton
17 */ 17 */
18 18
19#include <linux/kernel.h> 19#include <linux/kernel.h>
@@ -577,9 +577,6 @@ static int have_callable_console(void)
577 * @fmt: format string 577 * @fmt: format string
578 * 578 *
579 * This is printk(). It can be called from any context. We want it to work. 579 * This is printk(). It can be called from any context. We want it to work.
580 * Be aware of the fact that if oops_in_progress is not set, we might try to
581 * wake klogd up which could deadlock on runqueue lock if printk() is called
582 * from scheduler code.
583 * 580 *
584 * We try to grab the console_sem. If we succeed, it's easy - we log the output and 581 * We try to grab the console_sem. If we succeed, it's easy - we log the output and
585 * call the console drivers. If we fail to get the semaphore we place the output 582 * call the console drivers. If we fail to get the semaphore we place the output
@@ -593,6 +590,8 @@ static int have_callable_console(void)
593 * 590 *
594 * See also: 591 * See also:
595 * printf(3) 592 * printf(3)
593 *
594 * See the vsnprintf() documentation for format string extensions over C99.
596 */ 595 */
597 596
598asmlinkage int printk(const char *fmt, ...) 597asmlinkage int printk(const char *fmt, ...)
@@ -933,7 +932,7 @@ void suspend_console(void)
933{ 932{
934 if (!console_suspend_enabled) 933 if (!console_suspend_enabled)
935 return; 934 return;
936 printk("Suspending console(s)\n"); 935 printk("Suspending console(s) (use no_console_suspend to debug)\n");
937 acquire_console_sem(); 936 acquire_console_sem();
938 console_suspended = 1; 937 console_suspended = 1;
939} 938}
@@ -982,10 +981,25 @@ int is_console_locked(void)
982 return console_locked; 981 return console_locked;
983} 982}
984 983
985void wake_up_klogd(void) 984static DEFINE_PER_CPU(int, printk_pending);
985
986void printk_tick(void)
986{ 987{
987 if (!oops_in_progress && waitqueue_active(&log_wait)) 988 if (__get_cpu_var(printk_pending)) {
989 __get_cpu_var(printk_pending) = 0;
988 wake_up_interruptible(&log_wait); 990 wake_up_interruptible(&log_wait);
991 }
992}
993
994int printk_needs_cpu(int cpu)
995{
996 return per_cpu(printk_pending, cpu);
997}
998
999void wake_up_klogd(void)
1000{
1001 if (waitqueue_active(&log_wait))
1002 __raw_get_cpu_var(printk_pending) = 1;
989} 1003}
990 1004
991/** 1005/**
@@ -1291,46 +1305,19 @@ static int __init disable_boot_consoles(void)
1291} 1305}
1292late_initcall(disable_boot_consoles); 1306late_initcall(disable_boot_consoles);
1293 1307
1294/**
1295 * tty_write_message - write a message to a certain tty, not just the console.
1296 * @tty: the destination tty_struct
1297 * @msg: the message to write
1298 *
1299 * This is used for messages that need to be redirected to a specific tty.
1300 * We don't put it into the syslog queue right now maybe in the future if
1301 * really needed.
1302 */
1303void tty_write_message(struct tty_struct *tty, char *msg)
1304{
1305 if (tty && tty->ops->write)
1306 tty->ops->write(tty, msg, strlen(msg));
1307 return;
1308}
1309
1310#if defined CONFIG_PRINTK 1308#if defined CONFIG_PRINTK
1309
1311/* 1310/*
1312 * printk rate limiting, lifted from the networking subsystem. 1311 * printk rate limiting, lifted from the networking subsystem.
1313 * 1312 *
1314 * This enforces a rate limit: not more than one kernel message 1313 * This enforces a rate limit: not more than 10 kernel messages
1315 * every printk_ratelimit_jiffies to make a denial-of-service 1314 * every 5s to make a denial-of-service attack impossible.
1316 * attack impossible.
1317 */ 1315 */
1318int __printk_ratelimit(int ratelimit_jiffies, int ratelimit_burst) 1316DEFINE_RATELIMIT_STATE(printk_ratelimit_state, 5 * HZ, 10);
1319{
1320 return __ratelimit(ratelimit_jiffies, ratelimit_burst);
1321}
1322EXPORT_SYMBOL(__printk_ratelimit);
1323
1324/* minimum time in jiffies between messages */
1325int printk_ratelimit_jiffies = 5 * HZ;
1326
1327/* number of messages we send before ratelimiting */
1328int printk_ratelimit_burst = 10;
1329 1317
1330int printk_ratelimit(void) 1318int printk_ratelimit(void)
1331{ 1319{
1332 return __printk_ratelimit(printk_ratelimit_jiffies, 1320 return __ratelimit(&printk_ratelimit_state);
1333 printk_ratelimit_burst);
1334} 1321}
1335EXPORT_SYMBOL(printk_ratelimit); 1322EXPORT_SYMBOL(printk_ratelimit);
1336 1323
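The printk.c hunks defer the klogd wakeup to printk_tick() via a per-CPU printk_pending flag and replace the open-coded rate limiting with DEFINE_RATELIMIT_STATE(printk_ratelimit_state, 5 * HZ, 10), i.e. at most 10 messages per 5 seconds. The stand-alone sketch below models that burst-per-interval policy with illustrative names; it is not the kernel's __ratelimit().

#include <stdio.h>
#include <time.h>

/* Hypothetical userspace analogue of a ratelimit state: allow at most
 * 'burst' messages per 'interval' seconds. */
struct demo_ratelimit {
        time_t begin;
        int interval;                   /* seconds */
        int burst;
        int printed;
};

static int demo_ratelimit_ok(struct demo_ratelimit *rs)
{
        time_t now = time(NULL);

        if (rs->begin == 0 || now - rs->begin >= rs->interval) {
                rs->begin = now;        /* start a new window */
                rs->printed = 0;
        }
        if (rs->printed >= rs->burst)
                return 0;               /* suppressed, like __ratelimit() == 0 */
        rs->printed++;
        return 1;
}

int main(void)
{
        struct demo_ratelimit rs = { .interval = 5, .burst = 10 };
        int i;

        for (i = 0; i < 25; i++)
                if (demo_ratelimit_ok(&rs))
                        printf("message %d\n", i);  /* only the first 10 pass */
        return 0;
}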
diff --git a/kernel/profile.c b/kernel/profile.c
index 58926411eb2a..a9e422df6bf6 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -22,6 +22,8 @@
22#include <linux/cpu.h> 22#include <linux/cpu.h>
23#include <linux/highmem.h> 23#include <linux/highmem.h>
24#include <linux/mutex.h> 24#include <linux/mutex.h>
25#include <linux/slab.h>
26#include <linux/vmalloc.h>
25#include <asm/sections.h> 27#include <asm/sections.h>
26#include <asm/irq_regs.h> 28#include <asm/irq_regs.h>
27#include <asm/ptrace.h> 29#include <asm/ptrace.h>
@@ -50,11 +52,11 @@ static DEFINE_PER_CPU(int, cpu_profile_flip);
50static DEFINE_MUTEX(profile_flip_mutex); 52static DEFINE_MUTEX(profile_flip_mutex);
51#endif /* CONFIG_SMP */ 53#endif /* CONFIG_SMP */
52 54
53static int __init profile_setup(char *str) 55int profile_setup(char *str)
54{ 56{
55 static char __initdata schedstr[] = "schedule"; 57 static char schedstr[] = "schedule";
56 static char __initdata sleepstr[] = "sleep"; 58 static char sleepstr[] = "sleep";
57 static char __initdata kvmstr[] = "kvm"; 59 static char kvmstr[] = "kvm";
58 int par; 60 int par;
59 61
60 if (!strncmp(str, sleepstr, strlen(sleepstr))) { 62 if (!strncmp(str, sleepstr, strlen(sleepstr))) {
@@ -100,20 +102,37 @@ static int __init profile_setup(char *str)
100__setup("profile=", profile_setup); 102__setup("profile=", profile_setup);
101 103
102 104
103void __init profile_init(void) 105int profile_init(void)
104{ 106{
107 int buffer_bytes;
105 if (!prof_on) 108 if (!prof_on)
106 return; 109 return 0;
107 110
108 /* only text is profiled */ 111 /* only text is profiled */
109 prof_len = (_etext - _stext) >> prof_shift; 112 prof_len = (_etext - _stext) >> prof_shift;
110 prof_buffer = alloc_bootmem(prof_len*sizeof(atomic_t)); 113 buffer_bytes = prof_len*sizeof(atomic_t);
114 if (!slab_is_available()) {
115 prof_buffer = alloc_bootmem(buffer_bytes);
116 return 0;
117 }
118
119 prof_buffer = kzalloc(buffer_bytes, GFP_KERNEL);
120 if (prof_buffer)
121 return 0;
122
123 prof_buffer = alloc_pages_exact(buffer_bytes, GFP_KERNEL|__GFP_ZERO);
124 if (prof_buffer)
125 return 0;
126
127 prof_buffer = vmalloc(buffer_bytes);
128 if (prof_buffer)
129 return 0;
130
131 return -ENOMEM;
111} 132}
112 133
113/* Profile event notifications */ 134/* Profile event notifications */
114 135
115#ifdef CONFIG_PROFILING
116
117static BLOCKING_NOTIFIER_HEAD(task_exit_notifier); 136static BLOCKING_NOTIFIER_HEAD(task_exit_notifier);
118static ATOMIC_NOTIFIER_HEAD(task_free_notifier); 137static ATOMIC_NOTIFIER_HEAD(task_free_notifier);
119static BLOCKING_NOTIFIER_HEAD(munmap_notifier); 138static BLOCKING_NOTIFIER_HEAD(munmap_notifier);
@@ -203,8 +222,6 @@ void unregister_timer_hook(int (*hook)(struct pt_regs *))
203} 222}
204EXPORT_SYMBOL_GPL(unregister_timer_hook); 223EXPORT_SYMBOL_GPL(unregister_timer_hook);
205 224
206#endif /* CONFIG_PROFILING */
207
208 225
209#ifdef CONFIG_SMP 226#ifdef CONFIG_SMP
210/* 227/*
@@ -531,7 +548,7 @@ static void __init profile_nop(void *unused)
531{ 548{
532} 549}
533 550
534static int __init create_hash_tables(void) 551static int create_hash_tables(void)
535{ 552{
536 int cpu; 553 int cpu;
537 554
@@ -579,14 +596,14 @@ out_cleanup:
579#define create_hash_tables() ({ 0; }) 596#define create_hash_tables() ({ 0; })
580#endif 597#endif
581 598
582static int __init create_proc_profile(void) 599int create_proc_profile(void)
583{ 600{
584 struct proc_dir_entry *entry; 601 struct proc_dir_entry *entry;
585 602
586 if (!prof_on) 603 if (!prof_on)
587 return 0; 604 return 0;
588 if (create_hash_tables()) 605 if (create_hash_tables())
589 return -1; 606 return -ENOMEM;
590 entry = proc_create("profile", S_IWUSR | S_IRUGO, 607 entry = proc_create("profile", S_IWUSR | S_IRUGO,
591 NULL, &proc_profile_operations); 608 NULL, &proc_profile_operations);
592 if (!entry) 609 if (!entry)
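
profile_init() above no longer depends on the bootmem allocator alone: once the slab is up it tries kzalloc() for a physically contiguous buffer, then alloc_pages_exact(), then vmalloc(), and only returns -ENOMEM when all of them fail. A user-space sketch of that fallback-chain shape follows; the three stand-in allocators are just calloc() wrappers for illustration, whereas in the kernel each step trades physical contiguity for a better chance of success.

#include <stdio.h>
#include <stdlib.h>

typedef void *(*alloc_fn)(size_t);

static void *try_contiguous(size_t n) { return calloc(1, n); }  /* stands in for kzalloc() */
static void *try_page_exact(size_t n) { return calloc(1, n); }  /* stands in for alloc_pages_exact() */
static void *try_virtual(size_t n)    { return calloc(1, n); }  /* stands in for vmalloc() */

static void *alloc_profile_buffer(size_t n)
{
	alloc_fn chain[] = { try_contiguous, try_page_exact, try_virtual };
	size_t i;

	for (i = 0; i < sizeof(chain) / sizeof(chain[0]); i++) {
		void *p = chain[i](n);
		if (p)
			return p;
	}
	return NULL;	/* callers map this to -ENOMEM, as profile_init() does */
}

int main(void)
{
	void *buf = alloc_profile_buffer(1 << 20);

	printf("profile buffer %s\n", buf ? "allocated" : "unavailable");
	free(buf);
	return 0;
}
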
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 8392a9da6450..1e68e4c39e2c 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -45,7 +45,7 @@ void __ptrace_link(struct task_struct *child, struct task_struct *new_parent)
45 * TASK_TRACED, resume it now. 45 * TASK_TRACED, resume it now.
46 * Requires that irqs be disabled. 46 * Requires that irqs be disabled.
47 */ 47 */
48void ptrace_untrace(struct task_struct *child) 48static void ptrace_untrace(struct task_struct *child)
49{ 49{
50 spin_lock(&child->sighand->siglock); 50 spin_lock(&child->sighand->siglock);
51 if (task_is_traced(child)) { 51 if (task_is_traced(child)) {
@@ -107,7 +107,7 @@ int ptrace_check_attach(struct task_struct *child, int kill)
107 read_unlock(&tasklist_lock); 107 read_unlock(&tasklist_lock);
108 108
109 if (!ret && !kill) 109 if (!ret && !kill)
110 wait_task_inactive(child); 110 ret = wait_task_inactive(child, TASK_TRACED) ? 0 : -ESRCH;
111 111
112 /* All systems go.. */ 112 /* All systems go.. */
113 return ret; 113 return ret;
@@ -140,7 +140,7 @@ int __ptrace_may_access(struct task_struct *task, unsigned int mode)
140 if (!dumpable && !capable(CAP_SYS_PTRACE)) 140 if (!dumpable && !capable(CAP_SYS_PTRACE))
141 return -EPERM; 141 return -EPERM;
142 142
143 return security_ptrace(current, task, mode); 143 return security_ptrace_may_access(task, mode);
144} 144}
145 145
146bool ptrace_may_access(struct task_struct *task, unsigned int mode) 146bool ptrace_may_access(struct task_struct *task, unsigned int mode)
@@ -499,8 +499,7 @@ repeat:
499 goto repeat; 499 goto repeat;
500 } 500 }
501 501
502 ret = security_ptrace(current->parent, current, 502 ret = security_ptrace_traceme(current->parent);
503 PTRACE_MODE_ATTACH);
504 503
505 /* 504 /*
506 * Set the ptrace bit in the process ptrace flags. 505 * Set the ptrace bit in the process ptrace flags.
diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c
index 16eeeaa9d618..37f72e551542 100644
--- a/kernel/rcuclassic.c
+++ b/kernel/rcuclassic.c
@@ -47,6 +47,7 @@
47#include <linux/notifier.h> 47#include <linux/notifier.h>
48#include <linux/cpu.h> 48#include <linux/cpu.h>
49#include <linux/mutex.h> 49#include <linux/mutex.h>
50#include <linux/time.h>
50 51
51#ifdef CONFIG_DEBUG_LOCK_ALLOC 52#ifdef CONFIG_DEBUG_LOCK_ALLOC
52static struct lock_class_key rcu_lock_key; 53static struct lock_class_key rcu_lock_key;
@@ -60,12 +61,14 @@ EXPORT_SYMBOL_GPL(rcu_lock_map);
60static struct rcu_ctrlblk rcu_ctrlblk = { 61static struct rcu_ctrlblk rcu_ctrlblk = {
61 .cur = -300, 62 .cur = -300,
62 .completed = -300, 63 .completed = -300,
64 .pending = -300,
63 .lock = __SPIN_LOCK_UNLOCKED(&rcu_ctrlblk.lock), 65 .lock = __SPIN_LOCK_UNLOCKED(&rcu_ctrlblk.lock),
64 .cpumask = CPU_MASK_NONE, 66 .cpumask = CPU_MASK_NONE,
65}; 67};
66static struct rcu_ctrlblk rcu_bh_ctrlblk = { 68static struct rcu_ctrlblk rcu_bh_ctrlblk = {
67 .cur = -300, 69 .cur = -300,
68 .completed = -300, 70 .completed = -300,
71 .pending = -300,
69 .lock = __SPIN_LOCK_UNLOCKED(&rcu_bh_ctrlblk.lock), 72 .lock = __SPIN_LOCK_UNLOCKED(&rcu_bh_ctrlblk.lock),
70 .cpumask = CPU_MASK_NONE, 73 .cpumask = CPU_MASK_NONE,
71}; 74};
@@ -83,7 +86,10 @@ static void force_quiescent_state(struct rcu_data *rdp,
83{ 86{
84 int cpu; 87 int cpu;
85 cpumask_t cpumask; 88 cpumask_t cpumask;
89 unsigned long flags;
90
86 set_need_resched(); 91 set_need_resched();
92 spin_lock_irqsave(&rcp->lock, flags);
87 if (unlikely(!rcp->signaled)) { 93 if (unlikely(!rcp->signaled)) {
88 rcp->signaled = 1; 94 rcp->signaled = 1;
89 /* 95 /*
@@ -91,8 +97,8 @@ static void force_quiescent_state(struct rcu_data *rdp,
91 * rdp->cpu is the current cpu. 97 * rdp->cpu is the current cpu.
92 * 98 *
93 * cpu_online_map is updated by the _cpu_down() 99 * cpu_online_map is updated by the _cpu_down()
94 * using stop_machine_run(). Since we're in irqs disabled 100 * using __stop_machine(). Since we're in irqs disabled
95 * section, stop_machine_run() is not executing, hence 101 * section, __stop_machine() is not executing, hence
96 * the cpu_online_map is stable. 102 * the cpu_online_map is stable.
97 * 103 *
98 * However, a cpu might have been offlined _just_ before 104 * However, a cpu might have been offlined _just_ before
@@ -106,9 +112,10 @@ static void force_quiescent_state(struct rcu_data *rdp,
106 */ 112 */
107 cpus_and(cpumask, rcp->cpumask, cpu_online_map); 113 cpus_and(cpumask, rcp->cpumask, cpu_online_map);
108 cpu_clear(rdp->cpu, cpumask); 114 cpu_clear(rdp->cpu, cpumask);
109 for_each_cpu_mask(cpu, cpumask) 115 for_each_cpu_mask_nr(cpu, cpumask)
110 smp_send_reschedule(cpu); 116 smp_send_reschedule(cpu);
111 } 117 }
118 spin_unlock_irqrestore(&rcp->lock, flags);
112} 119}
113#else 120#else
114static inline void force_quiescent_state(struct rcu_data *rdp, 121static inline void force_quiescent_state(struct rcu_data *rdp,
@@ -118,6 +125,126 @@ static inline void force_quiescent_state(struct rcu_data *rdp,
118} 125}
119#endif 126#endif
120 127
128static void __call_rcu(struct rcu_head *head, struct rcu_ctrlblk *rcp,
129 struct rcu_data *rdp)
130{
131 long batch;
132
133 head->next = NULL;
134 smp_mb(); /* Read of rcu->cur must happen after any change by caller. */
135
136 /*
137 * Determine the batch number of this callback.
138 *
139 * Using ACCESS_ONCE to avoid the following error when gcc eliminates
140 * local variable "batch" and emits code like this:
141 * 1) rdp->batch = rcp->cur + 1 # gets old value
142 * ......
143 * 2)rcu_batch_after(rcp->cur + 1, rdp->batch) # gets new value
144 * then [*nxttail[0], *nxttail[1]) may contain callbacks
145 * whose batch# == rdp->batch; see the comment of struct rcu_data.
146 */
147 batch = ACCESS_ONCE(rcp->cur) + 1;
148
149 if (rdp->nxtlist && rcu_batch_after(batch, rdp->batch)) {
150 /* process callbacks */
151 rdp->nxttail[0] = rdp->nxttail[1];
152 rdp->nxttail[1] = rdp->nxttail[2];
153 if (rcu_batch_after(batch - 1, rdp->batch))
154 rdp->nxttail[0] = rdp->nxttail[2];
155 }
156
157 rdp->batch = batch;
158 *rdp->nxttail[2] = head;
159 rdp->nxttail[2] = &head->next;
160
161 if (unlikely(++rdp->qlen > qhimark)) {
162 rdp->blimit = INT_MAX;
163 force_quiescent_state(rdp, &rcu_ctrlblk);
164 }
165}
166
167#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
168
169static void record_gp_stall_check_time(struct rcu_ctrlblk *rcp)
170{
171 rcp->gp_start = jiffies;
172 rcp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_CHECK;
173}
174
175static void print_other_cpu_stall(struct rcu_ctrlblk *rcp)
176{
177 int cpu;
178 long delta;
179 unsigned long flags;
180
181 /* Only let one CPU complain about others per time interval. */
182
183 spin_lock_irqsave(&rcp->lock, flags);
184 delta = jiffies - rcp->jiffies_stall;
185 if (delta < 2 || rcp->cur != rcp->completed) {
186 spin_unlock_irqrestore(&rcp->lock, flags);
187 return;
188 }
189 rcp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK;
190 spin_unlock_irqrestore(&rcp->lock, flags);
191
192 /* OK, time to rat on our buddy... */
193
194 printk(KERN_ERR "RCU detected CPU stalls:");
195 for_each_possible_cpu(cpu) {
196 if (cpu_isset(cpu, rcp->cpumask))
197 printk(" %d", cpu);
198 }
199 printk(" (detected by %d, t=%ld jiffies)\n",
200 smp_processor_id(), (long)(jiffies - rcp->gp_start));
201}
202
203static void print_cpu_stall(struct rcu_ctrlblk *rcp)
204{
205 unsigned long flags;
206
207 printk(KERN_ERR "RCU detected CPU %d stall (t=%lu/%lu jiffies)\n",
208 smp_processor_id(), jiffies,
209 jiffies - rcp->gp_start);
210 dump_stack();
211 spin_lock_irqsave(&rcp->lock, flags);
212 if ((long)(jiffies - rcp->jiffies_stall) >= 0)
213 rcp->jiffies_stall =
214 jiffies + RCU_SECONDS_TILL_STALL_RECHECK;
215 spin_unlock_irqrestore(&rcp->lock, flags);
216 set_need_resched(); /* kick ourselves to get things going. */
217}
218
219static void check_cpu_stall(struct rcu_ctrlblk *rcp)
220{
221 long delta;
222
223 delta = jiffies - rcp->jiffies_stall;
224 if (cpu_isset(smp_processor_id(), rcp->cpumask) && delta >= 0) {
225
226 /* We haven't checked in, so go dump stack. */
227 print_cpu_stall(rcp);
228
229 } else if (rcp->cur != rcp->completed && delta >= 2) {
230
231 /* They had two seconds to dump stack, so complain. */
232 print_other_cpu_stall(rcp);
233 }
234}
235
236#else /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
237
238static void record_gp_stall_check_time(struct rcu_ctrlblk *rcp)
239{
240}
241
242static inline void check_cpu_stall(struct rcu_ctrlblk *rcp)
243{
244}
245
246#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
247
121/** 248/**
122 * call_rcu - Queue an RCU callback for invocation after a grace period. 249 * call_rcu - Queue an RCU callback for invocation after a grace period.
123 * @head: structure to be used for queueing the RCU updates. 250 * @head: structure to be used for queueing the RCU updates.
@@ -133,18 +260,10 @@ void call_rcu(struct rcu_head *head,
133 void (*func)(struct rcu_head *rcu)) 260 void (*func)(struct rcu_head *rcu))
134{ 261{
135 unsigned long flags; 262 unsigned long flags;
136 struct rcu_data *rdp;
137 263
138 head->func = func; 264 head->func = func;
139 head->next = NULL;
140 local_irq_save(flags); 265 local_irq_save(flags);
141 rdp = &__get_cpu_var(rcu_data); 266 __call_rcu(head, &rcu_ctrlblk, &__get_cpu_var(rcu_data));
142 *rdp->nxttail = head;
143 rdp->nxttail = &head->next;
144 if (unlikely(++rdp->qlen > qhimark)) {
145 rdp->blimit = INT_MAX;
146 force_quiescent_state(rdp, &rcu_ctrlblk);
147 }
148 local_irq_restore(flags); 267 local_irq_restore(flags);
149} 268}
150EXPORT_SYMBOL_GPL(call_rcu); 269EXPORT_SYMBOL_GPL(call_rcu);
@@ -169,20 +288,10 @@ void call_rcu_bh(struct rcu_head *head,
169 void (*func)(struct rcu_head *rcu)) 288 void (*func)(struct rcu_head *rcu))
170{ 289{
171 unsigned long flags; 290 unsigned long flags;
172 struct rcu_data *rdp;
173 291
174 head->func = func; 292 head->func = func;
175 head->next = NULL;
176 local_irq_save(flags); 293 local_irq_save(flags);
177 rdp = &__get_cpu_var(rcu_bh_data); 294 __call_rcu(head, &rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data));
178 *rdp->nxttail = head;
179 rdp->nxttail = &head->next;
180
181 if (unlikely(++rdp->qlen > qhimark)) {
182 rdp->blimit = INT_MAX;
183 force_quiescent_state(rdp, &rcu_bh_ctrlblk);
184 }
185
186 local_irq_restore(flags); 295 local_irq_restore(flags);
187} 296}
188EXPORT_SYMBOL_GPL(call_rcu_bh); 297EXPORT_SYMBOL_GPL(call_rcu_bh);
@@ -211,12 +320,6 @@ EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
211static inline void raise_rcu_softirq(void) 320static inline void raise_rcu_softirq(void)
212{ 321{
213 raise_softirq(RCU_SOFTIRQ); 322 raise_softirq(RCU_SOFTIRQ);
214 /*
215 * The smp_mb() here is required to ensure that this cpu's
216 * __rcu_process_callbacks() reads the most recently updated
217 * value of rcu->cur.
218 */
219 smp_mb();
220} 323}
221 324
222/* 325/*
@@ -225,6 +328,7 @@ static inline void raise_rcu_softirq(void)
225 */ 328 */
226static void rcu_do_batch(struct rcu_data *rdp) 329static void rcu_do_batch(struct rcu_data *rdp)
227{ 330{
331 unsigned long flags;
228 struct rcu_head *next, *list; 332 struct rcu_head *next, *list;
229 int count = 0; 333 int count = 0;
230 334
@@ -239,9 +343,9 @@ static void rcu_do_batch(struct rcu_data *rdp)
239 } 343 }
240 rdp->donelist = list; 344 rdp->donelist = list;
241 345
242 local_irq_disable(); 346 local_irq_save(flags);
243 rdp->qlen -= count; 347 rdp->qlen -= count;
244 local_irq_enable(); 348 local_irq_restore(flags);
245 if (rdp->blimit == INT_MAX && rdp->qlen <= qlowmark) 349 if (rdp->blimit == INT_MAX && rdp->qlen <= qlowmark)
246 rdp->blimit = blimit; 350 rdp->blimit = blimit;
247 351
@@ -269,6 +373,7 @@ static void rcu_do_batch(struct rcu_data *rdp)
269 * rcu_check_quiescent_state calls rcu_start_batch(0) to start the next grace 373 * rcu_check_quiescent_state calls rcu_start_batch(0) to start the next grace
270 * period (if necessary). 374 * period (if necessary).
271 */ 375 */
376
272/* 377/*
273 * Register a new batch of callbacks, and start it up if there is currently no 378 * Register a new batch of callbacks, and start it up if there is currently no
274 * active batch and the batch to be registered has not already occurred. 379 * active batch and the batch to be registered has not already occurred.
@@ -276,15 +381,10 @@ static void rcu_do_batch(struct rcu_data *rdp)
276 */ 381 */
277static void rcu_start_batch(struct rcu_ctrlblk *rcp) 382static void rcu_start_batch(struct rcu_ctrlblk *rcp)
278{ 383{
279 if (rcp->next_pending && 384 if (rcp->cur != rcp->pending &&
280 rcp->completed == rcp->cur) { 385 rcp->completed == rcp->cur) {
281 rcp->next_pending = 0;
282 /*
283 * next_pending == 0 must be visible in
284 * __rcu_process_callbacks() before it can see new value of cur.
285 */
286 smp_wmb();
287 rcp->cur++; 386 rcp->cur++;
387 record_gp_stall_check_time(rcp);
288 388
289 /* 389 /*
290 * Accessing nohz_cpu_mask before incrementing rcp->cur needs a 390 * Accessing nohz_cpu_mask before incrementing rcp->cur needs a
@@ -322,6 +422,8 @@ static void cpu_quiet(int cpu, struct rcu_ctrlblk *rcp)
322static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp, 422static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp,
323 struct rcu_data *rdp) 423 struct rcu_data *rdp)
324{ 424{
425 unsigned long flags;
426
325 if (rdp->quiescbatch != rcp->cur) { 427 if (rdp->quiescbatch != rcp->cur) {
326 /* start new grace period: */ 428 /* start new grace period: */
327 rdp->qs_pending = 1; 429 rdp->qs_pending = 1;
@@ -345,7 +447,7 @@ static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp,
345 return; 447 return;
346 rdp->qs_pending = 0; 448 rdp->qs_pending = 0;
347 449
348 spin_lock(&rcp->lock); 450 spin_lock_irqsave(&rcp->lock, flags);
349 /* 451 /*
350 * rdp->quiescbatch/rcp->cur and the cpu bitmap can come out of sync 452 * rdp->quiescbatch/rcp->cur and the cpu bitmap can come out of sync
351 * during cpu startup. Ignore the quiescent state. 453 * during cpu startup. Ignore the quiescent state.
@@ -353,7 +455,7 @@ static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp,
353 if (likely(rdp->quiescbatch == rcp->cur)) 455 if (likely(rdp->quiescbatch == rcp->cur))
354 cpu_quiet(rdp->cpu, rcp); 456 cpu_quiet(rdp->cpu, rcp);
355 457
356 spin_unlock(&rcp->lock); 458 spin_unlock_irqrestore(&rcp->lock, flags);
357} 459}
358 460
359 461
@@ -364,33 +466,38 @@ static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp,
364 * which is dead and hence not processing interrupts. 466 * which is dead and hence not processing interrupts.
365 */ 467 */
366static void rcu_move_batch(struct rcu_data *this_rdp, struct rcu_head *list, 468static void rcu_move_batch(struct rcu_data *this_rdp, struct rcu_head *list,
367 struct rcu_head **tail) 469 struct rcu_head **tail, long batch)
368{ 470{
369 local_irq_disable(); 471 unsigned long flags;
370 *this_rdp->nxttail = list; 472
371 if (list) 473 if (list) {
372 this_rdp->nxttail = tail; 474 local_irq_save(flags);
373 local_irq_enable(); 475 this_rdp->batch = batch;
476 *this_rdp->nxttail[2] = list;
477 this_rdp->nxttail[2] = tail;
478 local_irq_restore(flags);
479 }
374} 480}
375 481
376static void __rcu_offline_cpu(struct rcu_data *this_rdp, 482static void __rcu_offline_cpu(struct rcu_data *this_rdp,
377 struct rcu_ctrlblk *rcp, struct rcu_data *rdp) 483 struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
378{ 484{
379 /* if the cpu going offline owns the grace period 485 unsigned long flags;
486
487 /*
488 * if the cpu going offline owns the grace period
380 * we can block indefinitely waiting for it, so flush 489 * we can block indefinitely waiting for it, so flush
381 * it here 490 * it here
382 */ 491 */
383 spin_lock_bh(&rcp->lock); 492 spin_lock_irqsave(&rcp->lock, flags);
384 if (rcp->cur != rcp->completed) 493 if (rcp->cur != rcp->completed)
385 cpu_quiet(rdp->cpu, rcp); 494 cpu_quiet(rdp->cpu, rcp);
386 spin_unlock_bh(&rcp->lock); 495 rcu_move_batch(this_rdp, rdp->donelist, rdp->donetail, rcp->cur + 1);
387 rcu_move_batch(this_rdp, rdp->donelist, rdp->donetail); 496 rcu_move_batch(this_rdp, rdp->nxtlist, rdp->nxttail[2], rcp->cur + 1);
388 rcu_move_batch(this_rdp, rdp->curlist, rdp->curtail); 497 spin_unlock(&rcp->lock);
389 rcu_move_batch(this_rdp, rdp->nxtlist, rdp->nxttail);
390 498
391 local_irq_disable();
392 this_rdp->qlen += rdp->qlen; 499 this_rdp->qlen += rdp->qlen;
393 local_irq_enable(); 500 local_irq_restore(flags);
394} 501}
395 502
396static void rcu_offline_cpu(int cpu) 503static void rcu_offline_cpu(int cpu)
@@ -420,38 +527,52 @@ static void rcu_offline_cpu(int cpu)
420static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp, 527static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp,
421 struct rcu_data *rdp) 528 struct rcu_data *rdp)
422{ 529{
423 if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch)) { 530 unsigned long flags;
424 *rdp->donetail = rdp->curlist; 531 long completed_snap;
425 rdp->donetail = rdp->curtail;
426 rdp->curlist = NULL;
427 rdp->curtail = &rdp->curlist;
428 }
429 532
430 if (rdp->nxtlist && !rdp->curlist) { 533 if (rdp->nxtlist) {
431 local_irq_disable(); 534 local_irq_save(flags);
432 rdp->curlist = rdp->nxtlist; 535 completed_snap = ACCESS_ONCE(rcp->completed);
433 rdp->curtail = rdp->nxttail;
434 rdp->nxtlist = NULL;
435 rdp->nxttail = &rdp->nxtlist;
436 local_irq_enable();
437 536
438 /* 537 /*
439 * start the next batch of callbacks 538 * move the other grace-period-completed entries to
539 * [rdp->nxtlist, *rdp->nxttail[0]) temporarily
440 */ 540 */
541 if (!rcu_batch_before(completed_snap, rdp->batch))
542 rdp->nxttail[0] = rdp->nxttail[1] = rdp->nxttail[2];
543 else if (!rcu_batch_before(completed_snap, rdp->batch - 1))
544 rdp->nxttail[0] = rdp->nxttail[1];
441 545
442 /* determine batch number */ 546 /*
443 rdp->batch = rcp->cur + 1; 547 * the grace period for entries in
444 /* see the comment and corresponding wmb() in 548 * [rdp->nxtlist, *rdp->nxttail[0]) has completed and
445 * the rcu_start_batch() 549 * move these entries to donelist
446 */ 550 */
447 smp_rmb(); 551 if (rdp->nxttail[0] != &rdp->nxtlist) {
552 *rdp->donetail = rdp->nxtlist;
553 rdp->donetail = rdp->nxttail[0];
554 rdp->nxtlist = *rdp->nxttail[0];
555 *rdp->donetail = NULL;
556
557 if (rdp->nxttail[1] == rdp->nxttail[0])
558 rdp->nxttail[1] = &rdp->nxtlist;
559 if (rdp->nxttail[2] == rdp->nxttail[0])
560 rdp->nxttail[2] = &rdp->nxtlist;
561 rdp->nxttail[0] = &rdp->nxtlist;
562 }
563
564 local_irq_restore(flags);
565
566 if (rcu_batch_after(rdp->batch, rcp->pending)) {
567 unsigned long flags2;
448 568
449 if (!rcp->next_pending) {
450 /* and start it/schedule start if it's a new batch */ 569 /* and start it/schedule start if it's a new batch */
451 spin_lock(&rcp->lock); 570 spin_lock_irqsave(&rcp->lock, flags2);
452 rcp->next_pending = 1; 571 if (rcu_batch_after(rdp->batch, rcp->pending)) {
453 rcu_start_batch(rcp); 572 rcp->pending = rdp->batch;
454 spin_unlock(&rcp->lock); 573 rcu_start_batch(rcp);
574 }
575 spin_unlock_irqrestore(&rcp->lock, flags2);
455 } 576 }
456 } 577 }
457 578
@@ -462,21 +583,53 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp,
462 583
463static void rcu_process_callbacks(struct softirq_action *unused) 584static void rcu_process_callbacks(struct softirq_action *unused)
464{ 585{
586 /*
587 * Memory references from any prior RCU read-side critical sections
588 * executed by the interrupted code must be seen before any RCU
589 * grace-period manipulations below.
590 */
591
592 smp_mb(); /* See above block comment. */
593
465 __rcu_process_callbacks(&rcu_ctrlblk, &__get_cpu_var(rcu_data)); 594 __rcu_process_callbacks(&rcu_ctrlblk, &__get_cpu_var(rcu_data));
466 __rcu_process_callbacks(&rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data)); 595 __rcu_process_callbacks(&rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data));
596
597 /*
598 * Memory references from any later RCU read-side critical sections
599 * executed by the interrupted code must be seen after any RCU
600 * grace-period manipulations above.
601 */
602
603 smp_mb(); /* See above block comment. */
467} 604}
468 605
469static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp) 606static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
470{ 607{
471 /* This cpu has pending rcu entries and the grace period 608 /* Check for CPU stalls, if enabled. */
472 * for them has completed. 609 check_cpu_stall(rcp);
473 */
474 if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch))
475 return 1;
476 610
477 /* This cpu has no pending entries, but there are new entries */ 611 if (rdp->nxtlist) {
478 if (!rdp->curlist && rdp->nxtlist) 612 long completed_snap = ACCESS_ONCE(rcp->completed);
479 return 1; 613
614 /*
615 * This cpu has pending rcu entries and the grace period
616 * for them has completed.
617 */
618 if (!rcu_batch_before(completed_snap, rdp->batch))
619 return 1;
620 if (!rcu_batch_before(completed_snap, rdp->batch - 1) &&
621 rdp->nxttail[0] != rdp->nxttail[1])
622 return 1;
623 if (rdp->nxttail[0] != &rdp->nxtlist)
624 return 1;
625
626 /*
627 * This cpu has pending rcu entries and the new batch
628 * for them hasn't been started nor scheduled to start
629 */
630 if (rcu_batch_after(rdp->batch, rcp->pending))
631 return 1;
632 }
480 633
481 /* This cpu has finished callbacks to invoke */ 634 /* This cpu has finished callbacks to invoke */
482 if (rdp->donelist) 635 if (rdp->donelist)
@@ -512,9 +665,15 @@ int rcu_needs_cpu(int cpu)
512 struct rcu_data *rdp = &per_cpu(rcu_data, cpu); 665 struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
513 struct rcu_data *rdp_bh = &per_cpu(rcu_bh_data, cpu); 666 struct rcu_data *rdp_bh = &per_cpu(rcu_bh_data, cpu);
514 667
515 return (!!rdp->curlist || !!rdp_bh->curlist || rcu_pending(cpu)); 668 return !!rdp->nxtlist || !!rdp_bh->nxtlist || rcu_pending(cpu);
516} 669}
517 670
671/*
672 * Top-level function driving RCU grace-period detection, normally
673 * invoked from the scheduler-clock interrupt. This function simply
674 * increments counters that are read only from softirq by this same
675 * CPU, so there are no memory barriers required.
676 */
518void rcu_check_callbacks(int cpu, int user) 677void rcu_check_callbacks(int cpu, int user)
519{ 678{
520 if (user || 679 if (user ||
@@ -558,14 +717,17 @@ void rcu_check_callbacks(int cpu, int user)
558static void rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp, 717static void rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp,
559 struct rcu_data *rdp) 718 struct rcu_data *rdp)
560{ 719{
720 unsigned long flags;
721
722 spin_lock_irqsave(&rcp->lock, flags);
561 memset(rdp, 0, sizeof(*rdp)); 723 memset(rdp, 0, sizeof(*rdp));
562 rdp->curtail = &rdp->curlist; 724 rdp->nxttail[0] = rdp->nxttail[1] = rdp->nxttail[2] = &rdp->nxtlist;
563 rdp->nxttail = &rdp->nxtlist;
564 rdp->donetail = &rdp->donelist; 725 rdp->donetail = &rdp->donelist;
565 rdp->quiescbatch = rcp->completed; 726 rdp->quiescbatch = rcp->completed;
566 rdp->qs_pending = 0; 727 rdp->qs_pending = 0;
567 rdp->cpu = cpu; 728 rdp->cpu = cpu;
568 rdp->blimit = blimit; 729 rdp->blimit = blimit;
730 spin_unlock_irqrestore(&rcp->lock, flags);
569} 731}
570 732
571static void __cpuinit rcu_online_cpu(int cpu) 733static void __cpuinit rcu_online_cpu(int cpu)
@@ -610,6 +772,9 @@ static struct notifier_block __cpuinitdata rcu_nb = {
610 */ 772 */
611void __init __rcu_init(void) 773void __init __rcu_init(void)
612{ 774{
775#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
776 printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n");
777#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
613 rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE, 778 rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE,
614 (void *)(long)smp_processor_id()); 779 (void *)(long)smp_processor_id());
615 /* Register notifier for non-boot CPUs */ 780 /* Register notifier for non-boot CPUs */
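
Most of the rcuclassic.c rework above hangs off per-callback batch numbers: __call_rcu() tags every callback with ACCESS_ONCE(rcp->cur) + 1, and the three nxttail[] segments advance by comparing that tag against rcp->cur and rcp->completed with rcu_batch_before()/rcu_batch_after(). Those helpers order counters by the sign of their difference, which is why the -300 initial values are harmless. The small user-space check below illustrates the comparison; the helper bodies are reconstructions for illustration and are not copied from this patch.

#include <assert.h>
#include <stdio.h>

/* Reconstruction of the rcu_batch_before()/rcu_batch_after() idea:
 * two batch numbers are ordered by the sign of their difference
 * (the kernel additionally relies on wrapping signed arithmetic,
 * so the scheme survives eventual counter wraparound). */
static int batch_before(long a, long b) { return (a - b) < 0; }
static int batch_after(long a, long b)  { return (a - b) > 0; }

int main(void)
{
	/* The control blocks start at -300, so the first grace period is -299. */
	assert(batch_after(-299, -300));
	assert(batch_before(-300, -299));

	/* __call_rcu() tags a new callback with cur + 1, the *next* grace
	 * period, so it can never look like an already-completed batch. */
	long cur = 42;
	assert(batch_after(cur + 1, cur));
	assert(!batch_before(cur + 1, cur));

	puts("batch ordering behaves as expected");
	return 0;
}
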
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index f14f372cf6f5..ad63af8b2521 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -77,6 +77,7 @@ void wakeme_after_rcu(struct rcu_head *head)
77 * sections are delimited by rcu_read_lock() and rcu_read_unlock(), 77 * sections are delimited by rcu_read_lock() and rcu_read_unlock(),
78 * and may be nested. 78 * and may be nested.
79 */ 79 */
80void synchronize_rcu(void); /* Makes kernel-doc tools happy */
80synchronize_rcu_xxx(synchronize_rcu, call_rcu) 81synchronize_rcu_xxx(synchronize_rcu, call_rcu)
81EXPORT_SYMBOL_GPL(synchronize_rcu); 82EXPORT_SYMBOL_GPL(synchronize_rcu);
82 83
@@ -118,18 +119,19 @@ static void _rcu_barrier(enum rcu_barrier type)
118 /* Take cpucontrol mutex to protect against CPU hotplug */ 119 /* Take cpucontrol mutex to protect against CPU hotplug */
119 mutex_lock(&rcu_barrier_mutex); 120 mutex_lock(&rcu_barrier_mutex);
120 init_completion(&rcu_barrier_completion); 121 init_completion(&rcu_barrier_completion);
121 atomic_set(&rcu_barrier_cpu_count, 0);
122 /* 122 /*
123 * The queueing of callbacks in all CPUs must be atomic with 123 * Initialize rcu_barrier_cpu_count to 1, then invoke
124 * respect to RCU, otherwise one CPU may queue a callback, 124 * rcu_barrier_func() on each CPU, so that each CPU also has
125 * wait for a grace period, decrement barrier count and call 125 * incremented rcu_barrier_cpu_count. Only then is it safe to
126 * complete(), while other CPUs have not yet queued anything. 126 * decrement rcu_barrier_cpu_count -- otherwise the first CPU
127 * So, we need to make sure that grace periods cannot complete 127 * might complete its grace period before all of the other CPUs
128 * until all the callbacks are queued. 128 * did their increment, causing this function to return too
129 * early.
129 */ 130 */
130 rcu_read_lock(); 131 atomic_set(&rcu_barrier_cpu_count, 1);
131 on_each_cpu(rcu_barrier_func, (void *)type, 1); 132 on_each_cpu(rcu_barrier_func, (void *)type, 1);
132 rcu_read_unlock(); 133 if (atomic_dec_and_test(&rcu_barrier_cpu_count))
134 complete(&rcu_barrier_completion);
133 wait_for_completion(&rcu_barrier_completion); 135 wait_for_completion(&rcu_barrier_completion);
134 mutex_unlock(&rcu_barrier_mutex); 136 mutex_unlock(&rcu_barrier_mutex);
135} 137}
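
The _rcu_barrier() fix above is a general reference-counting idiom: seed the counter with 1 on behalf of the initiator, let every participant take its own reference before the initiator drops that seed, and declare completion only on the final decrement. A user-space sketch of the same pattern with C11 atomics and pthreads; all names are invented for illustration.

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

#define NCALLBACKS 4

static atomic_int pending;
static pthread_mutex_t done_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t done_cond = PTHREAD_COND_INITIALIZER;
static int done;

static void put_ref(void)
{
	if (atomic_fetch_sub(&pending, 1) == 1) {	/* dec-and-test */
		pthread_mutex_lock(&done_lock);
		done = 1;
		pthread_cond_signal(&done_cond);
		pthread_mutex_unlock(&done_lock);
	}
}

static void *callback(void *arg)
{
	put_ref();	/* plays the role of the queued barrier callback */
	return NULL;
}

int main(void)
{
	pthread_t tid[NCALLBACKS];
	int i;

	atomic_store(&pending, 1);	/* seed with 1, as the patch now does */

	for (i = 0; i < NCALLBACKS; i++) {
		atomic_fetch_add(&pending, 1);	/* each participant takes its reference... */
		pthread_create(&tid[i], NULL, callback, NULL);	/* ...before its callback can run */
	}

	put_ref();	/* only now drop the seed reference */

	pthread_mutex_lock(&done_lock);
	while (!done)
		pthread_cond_wait(&done_cond, &done_lock);
	pthread_mutex_unlock(&done_lock);

	for (i = 0; i < NCALLBACKS; i++)
		pthread_join(tid[i], NULL);

	puts("completed only after every callback ran");
	return 0;
}
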
diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c
index 6f62b77d93c4..59236e8b9daa 100644
--- a/kernel/rcupreempt.c
+++ b/kernel/rcupreempt.c
@@ -54,17 +54,9 @@
54#include <linux/cpu.h> 54#include <linux/cpu.h>
55#include <linux/random.h> 55#include <linux/random.h>
56#include <linux/delay.h> 56#include <linux/delay.h>
57#include <linux/byteorder/swabb.h>
58#include <linux/cpumask.h> 57#include <linux/cpumask.h>
59#include <linux/rcupreempt_trace.h> 58#include <linux/rcupreempt_trace.h>
60 59#include <asm/byteorder.h>
61/*
62 * Macro that prevents the compiler from reordering accesses, but does
63 * absolutely -nothing- to prevent CPUs from reordering. This is used
64 * only to mediate communication between mainline code and hardware
65 * interrupt and NMI handlers.
66 */
67#define ACCESS_ONCE(x) (*(volatile typeof(x) *)&(x))
68 60
69/* 61/*
70 * PREEMPT_RCU data structures. 62 * PREEMPT_RCU data structures.
@@ -756,7 +748,7 @@ rcu_try_flip_idle(void)
756 748
757 /* Now ask each CPU for acknowledgement of the flip. */ 749 /* Now ask each CPU for acknowledgement of the flip. */
758 750
759 for_each_cpu_mask(cpu, rcu_cpu_online_map) { 751 for_each_cpu_mask_nr(cpu, rcu_cpu_online_map) {
760 per_cpu(rcu_flip_flag, cpu) = rcu_flipped; 752 per_cpu(rcu_flip_flag, cpu) = rcu_flipped;
761 dyntick_save_progress_counter(cpu); 753 dyntick_save_progress_counter(cpu);
762 } 754 }
@@ -774,7 +766,7 @@ rcu_try_flip_waitack(void)
774 int cpu; 766 int cpu;
775 767
776 RCU_TRACE_ME(rcupreempt_trace_try_flip_a1); 768 RCU_TRACE_ME(rcupreempt_trace_try_flip_a1);
777 for_each_cpu_mask(cpu, rcu_cpu_online_map) 769 for_each_cpu_mask_nr(cpu, rcu_cpu_online_map)
778 if (rcu_try_flip_waitack_needed(cpu) && 770 if (rcu_try_flip_waitack_needed(cpu) &&
779 per_cpu(rcu_flip_flag, cpu) != rcu_flip_seen) { 771 per_cpu(rcu_flip_flag, cpu) != rcu_flip_seen) {
780 RCU_TRACE_ME(rcupreempt_trace_try_flip_ae1); 772 RCU_TRACE_ME(rcupreempt_trace_try_flip_ae1);
@@ -806,7 +798,7 @@ rcu_try_flip_waitzero(void)
806 /* Check to see if the sum of the "last" counters is zero. */ 798 /* Check to see if the sum of the "last" counters is zero. */
807 799
808 RCU_TRACE_ME(rcupreempt_trace_try_flip_z1); 800 RCU_TRACE_ME(rcupreempt_trace_try_flip_z1);
809 for_each_cpu_mask(cpu, rcu_cpu_online_map) 801 for_each_cpu_mask_nr(cpu, rcu_cpu_online_map)
810 sum += RCU_DATA_CPU(cpu)->rcu_flipctr[lastidx]; 802 sum += RCU_DATA_CPU(cpu)->rcu_flipctr[lastidx];
811 if (sum != 0) { 803 if (sum != 0) {
812 RCU_TRACE_ME(rcupreempt_trace_try_flip_ze1); 804 RCU_TRACE_ME(rcupreempt_trace_try_flip_ze1);
@@ -821,7 +813,7 @@ rcu_try_flip_waitzero(void)
821 smp_mb(); /* ^^^^^^^^^^^^ */ 813 smp_mb(); /* ^^^^^^^^^^^^ */
822 814
823 /* Call for a memory barrier from each CPU. */ 815 /* Call for a memory barrier from each CPU. */
824 for_each_cpu_mask(cpu, rcu_cpu_online_map) { 816 for_each_cpu_mask_nr(cpu, rcu_cpu_online_map) {
825 per_cpu(rcu_mb_flag, cpu) = rcu_mb_needed; 817 per_cpu(rcu_mb_flag, cpu) = rcu_mb_needed;
826 dyntick_save_progress_counter(cpu); 818 dyntick_save_progress_counter(cpu);
827 } 819 }
@@ -841,7 +833,7 @@ rcu_try_flip_waitmb(void)
841 int cpu; 833 int cpu;
842 834
843 RCU_TRACE_ME(rcupreempt_trace_try_flip_m1); 835 RCU_TRACE_ME(rcupreempt_trace_try_flip_m1);
844 for_each_cpu_mask(cpu, rcu_cpu_online_map) 836 for_each_cpu_mask_nr(cpu, rcu_cpu_online_map)
845 if (rcu_try_flip_waitmb_needed(cpu) && 837 if (rcu_try_flip_waitmb_needed(cpu) &&
846 per_cpu(rcu_mb_flag, cpu) != rcu_mb_done) { 838 per_cpu(rcu_mb_flag, cpu) != rcu_mb_done) {
847 RCU_TRACE_ME(rcupreempt_trace_try_flip_me1); 839 RCU_TRACE_ME(rcupreempt_trace_try_flip_me1);
diff --git a/kernel/rcupreempt_trace.c b/kernel/rcupreempt_trace.c
index 5edf82c34bbc..35c2d3360ecf 100644
--- a/kernel/rcupreempt_trace.c
+++ b/kernel/rcupreempt_trace.c
@@ -308,11 +308,16 @@ out:
308 308
309static int __init rcupreempt_trace_init(void) 309static int __init rcupreempt_trace_init(void)
310{ 310{
311 int ret;
312
311 mutex_init(&rcupreempt_trace_mutex); 313 mutex_init(&rcupreempt_trace_mutex);
312 rcupreempt_trace_buf = kmalloc(RCUPREEMPT_TRACE_BUF_SIZE, GFP_KERNEL); 314 rcupreempt_trace_buf = kmalloc(RCUPREEMPT_TRACE_BUF_SIZE, GFP_KERNEL);
313 if (!rcupreempt_trace_buf) 315 if (!rcupreempt_trace_buf)
314 return 1; 316 return 1;
315 return rcupreempt_debugfs_init(); 317 ret = rcupreempt_debugfs_init();
318 if (ret)
319 kfree(rcupreempt_trace_buf);
320 return ret;
316} 321}
317 322
318static void __exit rcupreempt_trace_cleanup(void) 323static void __exit rcupreempt_trace_cleanup(void)
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 90b5b123f7a1..85cb90588a55 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -42,10 +42,10 @@
42#include <linux/freezer.h> 42#include <linux/freezer.h>
43#include <linux/cpu.h> 43#include <linux/cpu.h>
44#include <linux/delay.h> 44#include <linux/delay.h>
45#include <linux/byteorder/swabb.h>
46#include <linux/stat.h> 45#include <linux/stat.h>
47#include <linux/srcu.h> 46#include <linux/srcu.h>
48#include <linux/slab.h> 47#include <linux/slab.h>
48#include <asm/byteorder.h>
49 49
50MODULE_LICENSE("GPL"); 50MODULE_LICENSE("GPL");
51MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and " 51MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and "
diff --git a/kernel/relay.c b/kernel/relay.c
index 7de644cdec43..8d13a7855c08 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -407,6 +407,35 @@ void relay_reset(struct rchan *chan)
407} 407}
408EXPORT_SYMBOL_GPL(relay_reset); 408EXPORT_SYMBOL_GPL(relay_reset);
409 409
410static inline void relay_set_buf_dentry(struct rchan_buf *buf,
411 struct dentry *dentry)
412{
413 buf->dentry = dentry;
414 buf->dentry->d_inode->i_size = buf->early_bytes;
415}
416
417static struct dentry *relay_create_buf_file(struct rchan *chan,
418 struct rchan_buf *buf,
419 unsigned int cpu)
420{
421 struct dentry *dentry;
422 char *tmpname;
423
424 tmpname = kzalloc(NAME_MAX + 1, GFP_KERNEL);
425 if (!tmpname)
426 return NULL;
427 snprintf(tmpname, NAME_MAX, "%s%d", chan->base_filename, cpu);
428
429 /* Create file in fs */
430 dentry = chan->cb->create_buf_file(tmpname, chan->parent,
431 S_IRUSR, buf,
432 &chan->is_global);
433
434 kfree(tmpname);
435
436 return dentry;
437}
438
410/* 439/*
411 * relay_open_buf - create a new relay channel buffer 440 * relay_open_buf - create a new relay channel buffer
412 * 441 *
@@ -416,45 +445,34 @@ static struct rchan_buf *relay_open_buf(struct rchan *chan, unsigned int cpu)
416{ 445{
417 struct rchan_buf *buf = NULL; 446 struct rchan_buf *buf = NULL;
418 struct dentry *dentry; 447 struct dentry *dentry;
419 char *tmpname;
420 448
421 if (chan->is_global) 449 if (chan->is_global)
422 return chan->buf[0]; 450 return chan->buf[0];
423 451
424 tmpname = kzalloc(NAME_MAX + 1, GFP_KERNEL);
425 if (!tmpname)
426 goto end;
427 snprintf(tmpname, NAME_MAX, "%s%d", chan->base_filename, cpu);
428
429 buf = relay_create_buf(chan); 452 buf = relay_create_buf(chan);
430 if (!buf) 453 if (!buf)
431 goto free_name; 454 return NULL;
455
456 if (chan->has_base_filename) {
457 dentry = relay_create_buf_file(chan, buf, cpu);
458 if (!dentry)
459 goto free_buf;
460 relay_set_buf_dentry(buf, dentry);
461 }
432 462
433 buf->cpu = cpu; 463 buf->cpu = cpu;
434 __relay_reset(buf, 1); 464 __relay_reset(buf, 1);
435 465
436 /* Create file in fs */
437 dentry = chan->cb->create_buf_file(tmpname, chan->parent, S_IRUSR,
438 buf, &chan->is_global);
439 if (!dentry)
440 goto free_buf;
441
442 buf->dentry = dentry;
443
444 if(chan->is_global) { 466 if(chan->is_global) {
445 chan->buf[0] = buf; 467 chan->buf[0] = buf;
446 buf->cpu = 0; 468 buf->cpu = 0;
447 } 469 }
448 470
449 goto free_name; 471 return buf;
450 472
451free_buf: 473free_buf:
452 relay_destroy_buf(buf); 474 relay_destroy_buf(buf);
453 buf = NULL; 475 return NULL;
454free_name:
455 kfree(tmpname);
456end:
457 return buf;
458} 476}
459 477
460/** 478/**
@@ -537,8 +555,8 @@ static int __cpuinit relay_hotcpu_callback(struct notifier_block *nb,
537 555
538/** 556/**
539 * relay_open - create a new relay channel 557 * relay_open - create a new relay channel
540 * @base_filename: base name of files to create 558 * @base_filename: base name of files to create, %NULL for buffering only
541 * @parent: dentry of parent directory, %NULL for root directory 559 * @parent: dentry of parent directory, %NULL for root directory or buffer
542 * @subbuf_size: size of sub-buffers 560 * @subbuf_size: size of sub-buffers
543 * @n_subbufs: number of sub-buffers 561 * @n_subbufs: number of sub-buffers
544 * @cb: client callback functions 562 * @cb: client callback functions
@@ -560,8 +578,6 @@ struct rchan *relay_open(const char *base_filename,
560{ 578{
561 unsigned int i; 579 unsigned int i;
562 struct rchan *chan; 580 struct rchan *chan;
563 if (!base_filename)
564 return NULL;
565 581
566 if (!(subbuf_size && n_subbufs)) 582 if (!(subbuf_size && n_subbufs))
567 return NULL; 583 return NULL;
@@ -576,7 +592,10 @@ struct rchan *relay_open(const char *base_filename,
576 chan->alloc_size = FIX_SIZE(subbuf_size * n_subbufs); 592 chan->alloc_size = FIX_SIZE(subbuf_size * n_subbufs);
577 chan->parent = parent; 593 chan->parent = parent;
578 chan->private_data = private_data; 594 chan->private_data = private_data;
579 strlcpy(chan->base_filename, base_filename, NAME_MAX); 595 if (base_filename) {
596 chan->has_base_filename = 1;
597 strlcpy(chan->base_filename, base_filename, NAME_MAX);
598 }
580 setup_callbacks(chan, cb); 599 setup_callbacks(chan, cb);
581 kref_init(&chan->kref); 600 kref_init(&chan->kref);
582 601
@@ -604,6 +623,94 @@ free_bufs:
604} 623}
605EXPORT_SYMBOL_GPL(relay_open); 624EXPORT_SYMBOL_GPL(relay_open);
606 625
626struct rchan_percpu_buf_dispatcher {
627 struct rchan_buf *buf;
628 struct dentry *dentry;
629};
630
631/* Called in atomic context. */
632static void __relay_set_buf_dentry(void *info)
633{
634 struct rchan_percpu_buf_dispatcher *p = info;
635
636 relay_set_buf_dentry(p->buf, p->dentry);
637}
638
639/**
640 * relay_late_setup_files - triggers file creation
641 * @chan: channel to operate on
642 * @base_filename: base name of files to create
643 * @parent: dentry of parent directory, %NULL for root directory
644 *
645 * Returns 0 if successful, non-zero otherwise.
646 *
647 * Use to setup files for a previously buffer-only channel.
648 * Useful to do early tracing in kernel, before VFS is up, for example.
649 */
650int relay_late_setup_files(struct rchan *chan,
651 const char *base_filename,
652 struct dentry *parent)
653{
654 int err = 0;
655 unsigned int i, curr_cpu;
656 unsigned long flags;
657 struct dentry *dentry;
658 struct rchan_percpu_buf_dispatcher disp;
659
660 if (!chan || !base_filename)
661 return -EINVAL;
662
663 strlcpy(chan->base_filename, base_filename, NAME_MAX);
664
665 mutex_lock(&relay_channels_mutex);
666 /* Is chan already set up? */
667 if (unlikely(chan->has_base_filename))
668 return -EEXIST;
669 chan->has_base_filename = 1;
670 chan->parent = parent;
671 curr_cpu = get_cpu();
672 /*
673 * The CPU hotplug notifier ran before us and created buffers with
674 * no files associated. So it's safe to call relay_create_buf_file()
675 * on all currently online CPUs.
676 */
677 for_each_online_cpu(i) {
678 if (unlikely(!chan->buf[i])) {
679 printk(KERN_ERR "relay_late_setup_files: CPU %u "
680 "has no buffer, it must have!\n", i);
681 BUG();
682 err = -EINVAL;
683 break;
684 }
685
686 dentry = relay_create_buf_file(chan, chan->buf[i], i);
687 if (unlikely(!dentry)) {
688 err = -EINVAL;
689 break;
690 }
691
692 if (curr_cpu == i) {
693 local_irq_save(flags);
694 relay_set_buf_dentry(chan->buf[i], dentry);
695 local_irq_restore(flags);
696 } else {
697 disp.buf = chan->buf[i];
698 disp.dentry = dentry;
699 smp_mb();
700 /* relay_channels_mutex must be held, so wait. */
701 err = smp_call_function_single(i,
702 __relay_set_buf_dentry,
703 &disp, 1);
704 }
705 if (unlikely(err))
706 break;
707 }
708 put_cpu();
709 mutex_unlock(&relay_channels_mutex);
710
711 return err;
712}
713
607/** 714/**
608 * relay_switch_subbuf - switch to a new sub-buffer 715 * relay_switch_subbuf - switch to a new sub-buffer
609 * @buf: channel buffer 716 * @buf: channel buffer
@@ -627,8 +734,13 @@ size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length)
627 old_subbuf = buf->subbufs_produced % buf->chan->n_subbufs; 734 old_subbuf = buf->subbufs_produced % buf->chan->n_subbufs;
628 buf->padding[old_subbuf] = buf->prev_padding; 735 buf->padding[old_subbuf] = buf->prev_padding;
629 buf->subbufs_produced++; 736 buf->subbufs_produced++;
630 buf->dentry->d_inode->i_size += buf->chan->subbuf_size - 737 if (buf->dentry)
631 buf->padding[old_subbuf]; 738 buf->dentry->d_inode->i_size +=
739 buf->chan->subbuf_size -
740 buf->padding[old_subbuf];
741 else
742 buf->early_bytes += buf->chan->subbuf_size -
743 buf->padding[old_subbuf];
632 smp_mb(); 744 smp_mb();
633 if (waitqueue_active(&buf->read_wait)) 745 if (waitqueue_active(&buf->read_wait))
634 /* 746 /*
@@ -832,6 +944,10 @@ static void relay_file_read_consume(struct rchan_buf *buf,
832 size_t n_subbufs = buf->chan->n_subbufs; 944 size_t n_subbufs = buf->chan->n_subbufs;
833 size_t read_subbuf; 945 size_t read_subbuf;
834 946
947 if (buf->subbufs_produced == buf->subbufs_consumed &&
948 buf->offset == buf->bytes_consumed)
949 return;
950
835 if (buf->bytes_consumed + bytes_consumed > subbuf_size) { 951 if (buf->bytes_consumed + bytes_consumed > subbuf_size) {
836 relay_subbufs_consumed(buf->chan, buf->cpu, 1); 952 relay_subbufs_consumed(buf->chan, buf->cpu, 1);
837 buf->bytes_consumed = 0; 953 buf->bytes_consumed = 0;
@@ -863,6 +979,8 @@ static int relay_file_read_avail(struct rchan_buf *buf, size_t read_pos)
863 979
864 relay_file_read_consume(buf, read_pos, 0); 980 relay_file_read_consume(buf, read_pos, 0);
865 981
982 consumed = buf->subbufs_consumed;
983
866 if (unlikely(buf->offset > subbuf_size)) { 984 if (unlikely(buf->offset > subbuf_size)) {
867 if (produced == consumed) 985 if (produced == consumed)
868 return 0; 986 return 0;
@@ -881,8 +999,12 @@ static int relay_file_read_avail(struct rchan_buf *buf, size_t read_pos)
881 if (consumed > produced) 999 if (consumed > produced)
882 produced += n_subbufs * subbuf_size; 1000 produced += n_subbufs * subbuf_size;
883 1001
884 if (consumed == produced) 1002 if (consumed == produced) {
1003 if (buf->offset == subbuf_size &&
1004 buf->subbufs_produced > buf->subbufs_consumed)
1005 return 1;
885 return 0; 1006 return 0;
1007 }
886 1008
887 return 1; 1009 return 1;
888} 1010}
@@ -1237,4 +1359,4 @@ static __init int relay_init(void)
1237 return 0; 1359 return 0;
1238} 1360}
1239 1361
1240module_init(relay_init); 1362early_initcall(relay_init);
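
With has_base_filename and relay_late_setup_files() above, a client can open a buffer-only relay channel before any filesystem exists and attach the per-CPU files once debugfs is available. The sketch below shows what such a client might look like; the my_*/early_* names are invented, the callback signatures follow the 2.6.28-era relay API as visible in the hunks above, and the whole thing is an outline rather than a drop-in module.

#include <linux/relay.h>
#include <linux/debugfs.h>

static struct rchan *early_chan;

static struct dentry *my_create_buf_file(const char *filename,
					 struct dentry *parent, int mode,
					 struct rchan_buf *buf,
					 int *is_global)
{
	return debugfs_create_file(filename, mode, parent, buf,
				   &relay_file_operations);
}

static int my_remove_buf_file(struct dentry *dentry)
{
	debugfs_remove(dentry);
	return 0;
}

static struct rchan_callbacks my_relay_cb = {
	.create_buf_file = my_create_buf_file,
	.remove_buf_file = my_remove_buf_file,
};

/* Called very early: no VFS yet, so pass a NULL base_filename and let
 * the channel buffer purely in memory (early_bytes tracks how much). */
static int early_trace_open(void)
{
	early_chan = relay_open(NULL, NULL, 64 * 1024, 8, &my_relay_cb, NULL);
	return early_chan ? 0 : -ENOMEM;
}

/* Called later, e.g. from a late initcall once debugfs is mounted:
 * create the per-CPU files and expose whatever was buffered so far. */
static int early_trace_expose(void)
{
	struct dentry *dir = debugfs_create_dir("early_trace", NULL);

	return relay_late_setup_files(early_chan, "cpu", dir);
}
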
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
index d3c61b4ebef2..f275c8eca772 100644
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -13,6 +13,7 @@
13#include <linux/slab.h> 13#include <linux/slab.h>
14#include <linux/res_counter.h> 14#include <linux/res_counter.h>
15#include <linux/uaccess.h> 15#include <linux/uaccess.h>
16#include <linux/mm.h>
16 17
17void res_counter_init(struct res_counter *counter) 18void res_counter_init(struct res_counter *counter)
18{ 19{
@@ -102,44 +103,37 @@ u64 res_counter_read_u64(struct res_counter *counter, int member)
102 return *res_counter_member(counter, member); 103 return *res_counter_member(counter, member);
103} 104}
104 105
105ssize_t res_counter_write(struct res_counter *counter, int member, 106int res_counter_memparse_write_strategy(const char *buf,
106 const char __user *userbuf, size_t nbytes, loff_t *pos, 107 unsigned long long *res)
107 int (*write_strategy)(char *st_buf, unsigned long long *val))
108{ 108{
109 int ret; 109 char *end;
110 char *buf, *end; 110 /* FIXME - make memparse() take const char* args */
111 unsigned long flags; 111 *res = memparse((char *)buf, &end);
112 unsigned long long tmp, *val; 112 if (*end != '\0')
113 113 return -EINVAL;
114 buf = kmalloc(nbytes + 1, GFP_KERNEL);
115 ret = -ENOMEM;
116 if (buf == NULL)
117 goto out;
118 114
119 buf[nbytes] = '\0'; 115 *res = PAGE_ALIGN(*res);
120 ret = -EFAULT; 116 return 0;
121 if (copy_from_user(buf, userbuf, nbytes)) 117}
122 goto out_free;
123 118
124 ret = -EINVAL; 119int res_counter_write(struct res_counter *counter, int member,
120 const char *buf, write_strategy_fn write_strategy)
121{
122 char *end;
123 unsigned long flags;
124 unsigned long long tmp, *val;
125 125
126 strstrip(buf);
127 if (write_strategy) { 126 if (write_strategy) {
128 if (write_strategy(buf, &tmp)) { 127 if (write_strategy(buf, &tmp))
129 goto out_free; 128 return -EINVAL;
130 }
131 } else { 129 } else {
132 tmp = simple_strtoull(buf, &end, 10); 130 tmp = simple_strtoull(buf, &end, 10);
133 if (*end != '\0') 131 if (*end != '\0')
134 goto out_free; 132 return -EINVAL;
135 } 133 }
136 spin_lock_irqsave(&counter->lock, flags); 134 spin_lock_irqsave(&counter->lock, flags);
137 val = res_counter_member(counter, member); 135 val = res_counter_member(counter, member);
138 *val = tmp; 136 *val = tmp;
139 spin_unlock_irqrestore(&counter->lock, flags); 137 spin_unlock_irqrestore(&counter->lock, flags);
140 ret = nbytes; 138 return 0;
141out_free:
142 kfree(buf);
143out:
144 return ret;
145} 139}
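
res_counter_write() above now takes a plain, already-copied string plus an optional parsing strategy, and res_counter_memparse_write_strategy() gives controllers a ready-made parser for "10M"/"1G"-style values rounded up to a page boundary. A hypothetical caller might look like the fragment below; mycg_* is invented, and the real controller wiring (user copy, cgroup file glue) is omitted.

/* buf has already been copied from user space and is writable. */
static int mycg_set_limit(struct res_counter *cnt, char *buf)
{
	strstrip(buf);	/* callers strip whitespace now; res_counter_write() no longer does */
	return res_counter_write(cnt, RES_LIMIT, buf,
				 res_counter_memparse_write_strategy);
}
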
diff --git a/kernel/resource.c b/kernel/resource.c
index 74af2d7cb5a1..4089d12af6e0 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -38,10 +38,6 @@ EXPORT_SYMBOL(iomem_resource);
38 38
39static DEFINE_RWLOCK(resource_lock); 39static DEFINE_RWLOCK(resource_lock);
40 40
41#ifdef CONFIG_PROC_FS
42
43enum { MAX_IORES_LEVEL = 5 };
44
45static void *r_next(struct seq_file *m, void *v, loff_t *pos) 41static void *r_next(struct seq_file *m, void *v, loff_t *pos)
46{ 42{
47 struct resource *p = v; 43 struct resource *p = v;
@@ -53,6 +49,10 @@ static void *r_next(struct seq_file *m, void *v, loff_t *pos)
53 return p->sibling; 49 return p->sibling;
54} 50}
55 51
52#ifdef CONFIG_PROC_FS
53
54enum { MAX_IORES_LEVEL = 5 };
55
56static void *r_start(struct seq_file *m, loff_t *pos) 56static void *r_start(struct seq_file *m, loff_t *pos)
57 __acquires(resource_lock) 57 __acquires(resource_lock)
58{ 58{
@@ -362,35 +362,21 @@ int allocate_resource(struct resource *root, struct resource *new,
362 362
363EXPORT_SYMBOL(allocate_resource); 363EXPORT_SYMBOL(allocate_resource);
364 364
365/** 365/*
366 * insert_resource - Inserts a resource in the resource tree 366 * Insert a resource into the resource tree. If successful, return NULL,
367 * @parent: parent of the new resource 367 * otherwise return the conflicting resource (compare to __request_resource())
368 * @new: new resource to insert
369 *
370 * Returns 0 on success, -EBUSY if the resource can't be inserted.
371 *
372 * This function is equivalent to request_resource when no conflict
373 * happens. If a conflict happens, and the conflicting resources
374 * entirely fit within the range of the new resource, then the new
375 * resource is inserted and the conflicting resources become children of
376 * the new resource.
377 */ 368 */
378int insert_resource(struct resource *parent, struct resource *new) 369static struct resource * __insert_resource(struct resource *parent, struct resource *new)
379{ 370{
380 int result;
381 struct resource *first, *next; 371 struct resource *first, *next;
382 372
383 write_lock(&resource_lock);
384
385 for (;; parent = first) { 373 for (;; parent = first) {
386 result = 0;
387 first = __request_resource(parent, new); 374 first = __request_resource(parent, new);
388 if (!first) 375 if (!first)
389 goto out; 376 return first;
390 377
391 result = -EBUSY;
392 if (first == parent) 378 if (first == parent)
393 goto out; 379 return first;
394 380
395 if ((first->start > new->start) || (first->end < new->end)) 381 if ((first->start > new->start) || (first->end < new->end))
396 break; 382 break;
@@ -401,15 +387,13 @@ int insert_resource(struct resource *parent, struct resource *new)
401 for (next = first; ; next = next->sibling) { 387 for (next = first; ; next = next->sibling) {
402 /* Partial overlap? Bad, and unfixable */ 388 /* Partial overlap? Bad, and unfixable */
403 if (next->start < new->start || next->end > new->end) 389 if (next->start < new->start || next->end > new->end)
404 goto out; 390 return next;
405 if (!next->sibling) 391 if (!next->sibling)
406 break; 392 break;
407 if (next->sibling->start > new->end) 393 if (next->sibling->start > new->end)
408 break; 394 break;
409 } 395 }
410 396
411 result = 0;
412
413 new->parent = parent; 397 new->parent = parent;
414 new->sibling = next->sibling; 398 new->sibling = next->sibling;
415 new->child = first; 399 new->child = first;
@@ -426,10 +410,64 @@ int insert_resource(struct resource *parent, struct resource *new)
426 next = next->sibling; 410 next = next->sibling;
427 next->sibling = new; 411 next->sibling = new;
428 } 412 }
413 return NULL;
414}
429 415
430 out: 416/**
417 * insert_resource - Inserts a resource in the resource tree
418 * @parent: parent of the new resource
419 * @new: new resource to insert
420 *
421 * Returns 0 on success, -EBUSY if the resource can't be inserted.
422 *
423 * This function is equivalent to request_resource when no conflict
424 * happens. If a conflict happens, and the conflicting resources
425 * entirely fit within the range of the new resource, then the new
426 * resource is inserted and the conflicting resources become children of
427 * the new resource.
428 */
429int insert_resource(struct resource *parent, struct resource *new)
430{
431 struct resource *conflict;
432
433 write_lock(&resource_lock);
434 conflict = __insert_resource(parent, new);
435 write_unlock(&resource_lock);
436 return conflict ? -EBUSY : 0;
437}
438
439/**
440 * insert_resource_expand_to_fit - Insert a resource into the resource tree
441 * @root: root resource descriptor
442 * @new: new resource to insert
443 *
444 * Insert a resource into the resource tree, possibly expanding it in order
445 * to make it encompass any conflicting resources.
446 */
447void insert_resource_expand_to_fit(struct resource *root, struct resource *new)
448{
449 if (new->parent)
450 return;
451
452 write_lock(&resource_lock);
453 for (;;) {
454 struct resource *conflict;
455
456 conflict = __insert_resource(root, new);
457 if (!conflict)
458 break;
459 if (conflict == root)
460 break;
461
462 /* Ok, expand resource to cover the conflict, then try again .. */
463 if (conflict->start < new->start)
464 new->start = conflict->start;
465 if (conflict->end > new->end)
466 new->end = conflict->end;
467
468 printk("Expanded resource %s due to conflict with %s\n", new->name, conflict->name);
469 }
431 write_unlock(&resource_lock); 470 write_unlock(&resource_lock);
432 return result;
433} 471}
434 472
435/** 473/**
@@ -478,6 +516,70 @@ int adjust_resource(struct resource *res, resource_size_t start, resource_size_t
478 return result; 516 return result;
479} 517}
480 518
519static void __init __reserve_region_with_split(struct resource *root,
520 resource_size_t start, resource_size_t end,
521 const char *name)
522{
523 struct resource *parent = root;
524 struct resource *conflict;
525 struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL);
526
527 if (!res)
528 return;
529
530 res->name = name;
531 res->start = start;
532 res->end = end;
533 res->flags = IORESOURCE_BUSY;
534
535 for (;;) {
536 conflict = __request_resource(parent, res);
537 if (!conflict)
538 break;
539 if (conflict != parent) {
540 parent = conflict;
541 if (!(conflict->flags & IORESOURCE_BUSY))
542 continue;
543 }
544
545 /* Uhhuh, that didn't work out.. */
546 kfree(res);
547 res = NULL;
548 break;
549 }
550
551 if (!res) {
552 /* failed, split and try again */
553
554 /* conflict covered whole area */
555 if (conflict->start <= start && conflict->end >= end)
556 return;
557
558 if (conflict->start > start)
559 __reserve_region_with_split(root, start, conflict->start-1, name);
560 if (!(conflict->flags & IORESOURCE_BUSY)) {
561 resource_size_t common_start, common_end;
562
563 common_start = max(conflict->start, start);
564 common_end = min(conflict->end, end);
565 if (common_start < common_end)
566 __reserve_region_with_split(root, common_start, common_end, name);
567 }
568 if (conflict->end < end)
569 __reserve_region_with_split(root, conflict->end+1, end, name);
570 }
571
572}
573
574void reserve_region_with_split(struct resource *root,
575 resource_size_t start, resource_size_t end,
576 const char *name)
577{
578 write_lock(&resource_lock);
579 __reserve_region_with_split(root, start, end, name);
580 write_unlock(&resource_lock);
581}
582
481EXPORT_SYMBOL(adjust_resource); 583EXPORT_SYMBOL(adjust_resource);
482 584
483/** 585/**
@@ -490,7 +592,7 @@ resource_size_t resource_alignment(struct resource *res)
490{ 592{
491 switch (res->flags & (IORESOURCE_SIZEALIGN | IORESOURCE_STARTALIGN)) { 593 switch (res->flags & (IORESOURCE_SIZEALIGN | IORESOURCE_STARTALIGN)) {
492 case IORESOURCE_SIZEALIGN: 594 case IORESOURCE_SIZEALIGN:
493 return res->end - res->start + 1; 595 return resource_size(res);
494 case IORESOURCE_STARTALIGN: 596 case IORESOURCE_STARTALIGN:
495 return res->start; 597 return res->start;
496 default: 598 default:
@@ -524,33 +626,34 @@ struct resource * __request_region(struct resource *parent,
524{ 626{
525 struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL); 627 struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL);
526 628
527 if (res) { 629 if (!res)
528 res->name = name; 630 return NULL;
529 res->start = start;
530 res->end = start + n - 1;
531 res->flags = IORESOURCE_BUSY;
532 631
533 write_lock(&resource_lock); 632 res->name = name;
633 res->start = start;
634 res->end = start + n - 1;
635 res->flags = IORESOURCE_BUSY;
534 636
535 for (;;) { 637 write_lock(&resource_lock);
536 struct resource *conflict;
537 638
538 conflict = __request_resource(parent, res); 639 for (;;) {
539 if (!conflict) 640 struct resource *conflict;
540 break;
541 if (conflict != parent) {
542 parent = conflict;
543 if (!(conflict->flags & IORESOURCE_BUSY))
544 continue;
545 }
546 641
547 /* Uhhuh, that didn't work out.. */ 642 conflict = __request_resource(parent, res);
548 kfree(res); 643 if (!conflict)
549 res = NULL;
550 break; 644 break;
645 if (conflict != parent) {
646 parent = conflict;
647 if (!(conflict->flags & IORESOURCE_BUSY))
648 continue;
551 } 649 }
552 write_unlock(&resource_lock); 650
651 /* Uhhuh, that didn't work out.. */
652 kfree(res);
653 res = NULL;
654 break;
553 } 655 }
656 write_unlock(&resource_lock);
554 return res; 657 return res;
555} 658}
556EXPORT_SYMBOL(__request_region); 659EXPORT_SYMBOL(__request_region);
@@ -725,3 +828,40 @@ static int __init reserve_setup(char *str)
725} 828}
726 829
727__setup("reserve=", reserve_setup); 830__setup("reserve=", reserve_setup);
831
832/*
833 * Check if the requested addr and size spans more than any slot in the
834 * iomem resource tree.
835 */
836int iomem_map_sanity_check(resource_size_t addr, unsigned long size)
837{
838 struct resource *p = &iomem_resource;
839 int err = 0;
840 loff_t l;
841
842 read_lock(&resource_lock);
843 for (p = p->child; p ; p = r_next(NULL, p, &l)) {
844 /*
845 * We can probably skip the resources without
846 * IORESOURCE_IO attribute?
847 */
848 if (p->start >= addr + size)
849 continue;
850 if (p->end < addr)
851 continue;
852 if (p->start <= addr && (p->end >= addr + size - 1))
853 continue;
854 printk(KERN_WARNING "resource map sanity check conflict: "
855 "0x%llx 0x%llx 0x%llx 0x%llx %s\n",
856 (unsigned long long)addr,
857 (unsigned long long)(addr + size - 1),
858 (unsigned long long)p->start,
859 (unsigned long long)p->end,
860 p->name);
861 err = -1;
862 break;
863 }
864 read_unlock(&resource_lock);
865
866 return err;
867}
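
For illustration, a minimal sketch of how an ioremap-style caller might use the new
helper; my_driver_map() and its arguments are hypothetical and not part of this patch.
iomem_map_sanity_check() returns 0 when the range fits within a single slot of the
iomem tree and -1 when it straddles a boundary.

#include <linux/ioport.h>
#include <linux/io.h>

/* Hypothetical wrapper, shown only to illustrate the call. */
static void __iomem *my_driver_map(resource_size_t start, unsigned long len)
{
	/* Refuse requests that span more than one iomem slot. */
	if (iomem_map_sanity_check(start, len))
		return NULL;

	return ioremap(start, len);
}
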
diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c
index 092e4c620af9..a56f629b057a 100644
--- a/kernel/rtmutex-tester.c
+++ b/kernel/rtmutex-tester.c
@@ -297,8 +297,8 @@ static int test_func(void *data)
297 * 297 *
298 * opcode:data 298 * opcode:data
299 */ 299 */
300static ssize_t sysfs_test_command(struct sys_device *dev, const char *buf, 300static ssize_t sysfs_test_command(struct sys_device *dev, struct sysdev_attribute *attr,
301 size_t count) 301 const char *buf, size_t count)
302{ 302{
303 struct sched_param schedpar; 303 struct sched_param schedpar;
304 struct test_thread_data *td; 304 struct test_thread_data *td;
@@ -360,7 +360,8 @@ static ssize_t sysfs_test_command(struct sys_device *dev, const char *buf,
360 * @dev: thread to query 360 * @dev: thread to query
361 * @buf: char buffer to be filled with thread status info 361 * @buf: char buffer to be filled with thread status info
362 */ 362 */
363static ssize_t sysfs_test_status(struct sys_device *dev, char *buf) 363static ssize_t sysfs_test_status(struct sys_device *dev, struct sysdev_attribute *attr,
364 char *buf)
364{ 365{
365 struct test_thread_data *td; 366 struct test_thread_data *td;
366 struct task_struct *tsk; 367 struct task_struct *tsk;
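
The two prototype changes above track the sysdev attribute callbacks growing a
struct sysdev_attribute * argument. A hedged sketch of the resulting show/store
shape; my_show, my_store and my_attr are illustrative names, not part of this patch.

#include <linux/sysdev.h>

static ssize_t my_show(struct sys_device *dev, struct sysdev_attribute *attr,
		       char *buf)
{
	return snprintf(buf, PAGE_SIZE, "ok\n");
}

static ssize_t my_store(struct sys_device *dev, struct sysdev_attribute *attr,
			const char *buf, size_t count)
{
	/* parse buf here */
	return count;
}

static SYSDEV_ATTR(my_attr, 0644, my_show, my_store);
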
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
index 6522ae5b14a2..69d9cb921ffa 100644
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -631,8 +631,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
631 631
632 /* Setup the timer, when timeout != NULL */ 632 /* Setup the timer, when timeout != NULL */
633 if (unlikely(timeout)) { 633 if (unlikely(timeout)) {
634 hrtimer_start(&timeout->timer, timeout->timer.expires, 634 hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS);
635 HRTIMER_MODE_ABS);
636 if (!hrtimer_active(&timeout->timer)) 635 if (!hrtimer_active(&timeout->timer))
637 timeout->task = NULL; 636 timeout->task = NULL;
638 } 637 }
diff --git a/kernel/sched.c b/kernel/sched.c
index 99e6d850ecab..6625c3c4b10d 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -55,6 +55,7 @@
55#include <linux/cpuset.h> 55#include <linux/cpuset.h>
56#include <linux/percpu.h> 56#include <linux/percpu.h>
57#include <linux/kthread.h> 57#include <linux/kthread.h>
58#include <linux/proc_fs.h>
58#include <linux/seq_file.h> 59#include <linux/seq_file.h>
59#include <linux/sysctl.h> 60#include <linux/sysctl.h>
60#include <linux/syscalls.h> 61#include <linux/syscalls.h>
@@ -71,6 +72,7 @@
71#include <linux/debugfs.h> 72#include <linux/debugfs.h>
72#include <linux/ctype.h> 73#include <linux/ctype.h>
73#include <linux/ftrace.h> 74#include <linux/ftrace.h>
75#include <trace/sched.h>
74 76
75#include <asm/tlb.h> 77#include <asm/tlb.h>
76#include <asm/irq_regs.h> 78#include <asm/irq_regs.h>
@@ -201,14 +203,19 @@ void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
201 hrtimer_init(&rt_b->rt_period_timer, 203 hrtimer_init(&rt_b->rt_period_timer,
202 CLOCK_MONOTONIC, HRTIMER_MODE_REL); 204 CLOCK_MONOTONIC, HRTIMER_MODE_REL);
203 rt_b->rt_period_timer.function = sched_rt_period_timer; 205 rt_b->rt_period_timer.function = sched_rt_period_timer;
204 rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; 206 rt_b->rt_period_timer.cb_mode = HRTIMER_CB_IRQSAFE_UNLOCKED;
207}
208
209static inline int rt_bandwidth_enabled(void)
210{
211 return sysctl_sched_rt_runtime >= 0;
205} 212}
206 213
207static void start_rt_bandwidth(struct rt_bandwidth *rt_b) 214static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
208{ 215{
209 ktime_t now; 216 ktime_t now;
210 217
211 if (rt_b->rt_runtime == RUNTIME_INF) 218 if (rt_bandwidth_enabled() && rt_b->rt_runtime == RUNTIME_INF)
212 return; 219 return;
213 220
214 if (hrtimer_active(&rt_b->rt_period_timer)) 221 if (hrtimer_active(&rt_b->rt_period_timer))
@@ -221,9 +228,8 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
221 228
222 now = hrtimer_cb_get_time(&rt_b->rt_period_timer); 229 now = hrtimer_cb_get_time(&rt_b->rt_period_timer);
223 hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period); 230 hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period);
224 hrtimer_start(&rt_b->rt_period_timer, 231 hrtimer_start_expires(&rt_b->rt_period_timer,
225 rt_b->rt_period_timer.expires, 232 HRTIMER_MODE_ABS);
226 HRTIMER_MODE_ABS);
227 } 233 }
228 spin_unlock(&rt_b->rt_runtime_lock); 234 spin_unlock(&rt_b->rt_runtime_lock);
229} 235}
@@ -298,9 +304,9 @@ static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp;
298static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); 304static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity);
299static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; 305static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
300#endif /* CONFIG_RT_GROUP_SCHED */ 306#endif /* CONFIG_RT_GROUP_SCHED */
301#else /* !CONFIG_FAIR_GROUP_SCHED */ 307#else /* !CONFIG_USER_SCHED */
302#define root_task_group init_task_group 308#define root_task_group init_task_group
303#endif /* CONFIG_FAIR_GROUP_SCHED */ 309#endif /* CONFIG_USER_SCHED */
304 310
305/* task_group_lock serializes add/remove of task groups and also changes to 311/* task_group_lock serializes add/remove of task groups and also changes to
306 * a task group's cpu shares. 312 * a task group's cpu shares.
@@ -571,8 +577,10 @@ struct rq {
571#endif 577#endif
572 578
573#ifdef CONFIG_SCHED_HRTICK 579#ifdef CONFIG_SCHED_HRTICK
574 unsigned long hrtick_flags; 580#ifdef CONFIG_SMP
575 ktime_t hrtick_expire; 581 int hrtick_csd_pending;
582 struct call_single_data hrtick_csd;
583#endif
576 struct hrtimer hrtick_timer; 584 struct hrtimer hrtick_timer;
577#endif 585#endif
578 586
@@ -598,14 +606,13 @@ struct rq {
598 /* BKL stats */ 606 /* BKL stats */
599 unsigned int bkl_count; 607 unsigned int bkl_count;
600#endif 608#endif
601 struct lock_class_key rq_lock_key;
602}; 609};
603 610
604static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); 611static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
605 612
606static inline void check_preempt_curr(struct rq *rq, struct task_struct *p) 613static inline void check_preempt_curr(struct rq *rq, struct task_struct *p, int sync)
607{ 614{
608 rq->curr->sched_class->check_preempt_curr(rq, p); 615 rq->curr->sched_class->check_preempt_curr(rq, p, sync);
609} 616}
610 617
611static inline int cpu_of(struct rq *rq) 618static inline int cpu_of(struct rq *rq)
@@ -807,9 +814,16 @@ const_debug unsigned int sysctl_sched_nr_migrate = 32;
807 814
808/* 815/*
809 * ratelimit for updating the group shares. 816 * ratelimit for updating the group shares.
810 * default: 0.5ms 817 * default: 0.25ms
811 */ 818 */
812const_debug unsigned int sysctl_sched_shares_ratelimit = 500000; 819unsigned int sysctl_sched_shares_ratelimit = 250000;
820
821/*
822 * Inject some fuzziness into changing the per-cpu group shares;
823 * this avoids remote rq-locks at the expense of fairness.
824 * default: 4
825 */
826unsigned int sysctl_sched_shares_thresh = 4;
813 827
814/* 828/*
815 * period over which we measure -rt task cpu usage in us. 829 * period over which we measure -rt task cpu usage in us.
@@ -832,7 +846,7 @@ static inline u64 global_rt_period(void)
832 846
833static inline u64 global_rt_runtime(void) 847static inline u64 global_rt_runtime(void)
834{ 848{
835 if (sysctl_sched_rt_period < 0) 849 if (sysctl_sched_rt_runtime < 0)
836 return RUNTIME_INF; 850 return RUNTIME_INF;
837 851
838 return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; 852 return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
@@ -983,13 +997,6 @@ static struct rq *this_rq_lock(void)
983 return rq; 997 return rq;
984} 998}
985 999
986static void __resched_task(struct task_struct *p, int tif_bit);
987
988static inline void resched_task(struct task_struct *p)
989{
990 __resched_task(p, TIF_NEED_RESCHED);
991}
992
993#ifdef CONFIG_SCHED_HRTICK 1000#ifdef CONFIG_SCHED_HRTICK
994/* 1001/*
995 * Use HR-timers to deliver accurate preemption points. 1002 * Use HR-timers to deliver accurate preemption points.
@@ -1001,25 +1008,6 @@ static inline void resched_task(struct task_struct *p)
1001 * When we get rescheduled we reprogram the hrtick_timer outside of the 1008 * When we get rescheduled we reprogram the hrtick_timer outside of the
1002 * rq->lock. 1009 * rq->lock.
1003 */ 1010 */
1004static inline void resched_hrt(struct task_struct *p)
1005{
1006 __resched_task(p, TIF_HRTICK_RESCHED);
1007}
1008
1009static inline void resched_rq(struct rq *rq)
1010{
1011 unsigned long flags;
1012
1013 spin_lock_irqsave(&rq->lock, flags);
1014 resched_task(rq->curr);
1015 spin_unlock_irqrestore(&rq->lock, flags);
1016}
1017
1018enum {
1019 HRTICK_SET, /* re-programm hrtick_timer */
1020 HRTICK_RESET, /* not a new slice */
1021 HRTICK_BLOCK, /* stop hrtick operations */
1022};
1023 1011
1024/* 1012/*
1025 * Use hrtick when: 1013 * Use hrtick when:
@@ -1030,40 +1018,11 @@ static inline int hrtick_enabled(struct rq *rq)
1030{ 1018{
1031 if (!sched_feat(HRTICK)) 1019 if (!sched_feat(HRTICK))
1032 return 0; 1020 return 0;
1033 if (unlikely(test_bit(HRTICK_BLOCK, &rq->hrtick_flags))) 1021 if (!cpu_active(cpu_of(rq)))
1034 return 0; 1022 return 0;
1035 return hrtimer_is_hres_active(&rq->hrtick_timer); 1023 return hrtimer_is_hres_active(&rq->hrtick_timer);
1036} 1024}
1037 1025
1038/*
1039 * Called to set the hrtick timer state.
1040 *
1041 * called with rq->lock held and irqs disabled
1042 */
1043static void hrtick_start(struct rq *rq, u64 delay, int reset)
1044{
1045 assert_spin_locked(&rq->lock);
1046
1047 /*
1048 * preempt at: now + delay
1049 */
1050 rq->hrtick_expire =
1051 ktime_add_ns(rq->hrtick_timer.base->get_time(), delay);
1052 /*
1053 * indicate we need to program the timer
1054 */
1055 __set_bit(HRTICK_SET, &rq->hrtick_flags);
1056 if (reset)
1057 __set_bit(HRTICK_RESET, &rq->hrtick_flags);
1058
1059 /*
1060 * New slices are called from the schedule path and don't need a
1061 * forced reschedule.
1062 */
1063 if (reset)
1064 resched_hrt(rq->curr);
1065}
1066
1067static void hrtick_clear(struct rq *rq) 1026static void hrtick_clear(struct rq *rq)
1068{ 1027{
1069 if (hrtimer_active(&rq->hrtick_timer)) 1028 if (hrtimer_active(&rq->hrtick_timer))
@@ -1071,32 +1030,6 @@ static void hrtick_clear(struct rq *rq)
1071} 1030}
1072 1031
1073/* 1032/*
1074 * Update the timer from the possible pending state.
1075 */
1076static void hrtick_set(struct rq *rq)
1077{
1078 ktime_t time;
1079 int set, reset;
1080 unsigned long flags;
1081
1082 WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
1083
1084 spin_lock_irqsave(&rq->lock, flags);
1085 set = __test_and_clear_bit(HRTICK_SET, &rq->hrtick_flags);
1086 reset = __test_and_clear_bit(HRTICK_RESET, &rq->hrtick_flags);
1087 time = rq->hrtick_expire;
1088 clear_thread_flag(TIF_HRTICK_RESCHED);
1089 spin_unlock_irqrestore(&rq->lock, flags);
1090
1091 if (set) {
1092 hrtimer_start(&rq->hrtick_timer, time, HRTIMER_MODE_ABS);
1093 if (reset && !hrtimer_active(&rq->hrtick_timer))
1094 resched_rq(rq);
1095 } else
1096 hrtick_clear(rq);
1097}
1098
1099/*
1100 * High-resolution timer tick. 1033 * High-resolution timer tick.
1101 * Runs from hardirq context with interrupts disabled. 1034 * Runs from hardirq context with interrupts disabled.
1102 */ 1035 */
@@ -1115,27 +1048,37 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer)
1115} 1048}
1116 1049
1117#ifdef CONFIG_SMP 1050#ifdef CONFIG_SMP
1118static void hotplug_hrtick_disable(int cpu) 1051/*
1052 * called from hardirq (IPI) context
1053 */
1054static void __hrtick_start(void *arg)
1119{ 1055{
1120 struct rq *rq = cpu_rq(cpu); 1056 struct rq *rq = arg;
1121 unsigned long flags;
1122
1123 spin_lock_irqsave(&rq->lock, flags);
1124 rq->hrtick_flags = 0;
1125 __set_bit(HRTICK_BLOCK, &rq->hrtick_flags);
1126 spin_unlock_irqrestore(&rq->lock, flags);
1127 1057
1128 hrtick_clear(rq); 1058 spin_lock(&rq->lock);
1059 hrtimer_restart(&rq->hrtick_timer);
1060 rq->hrtick_csd_pending = 0;
1061 spin_unlock(&rq->lock);
1129} 1062}
1130 1063
1131static void hotplug_hrtick_enable(int cpu) 1064/*
1065 * Called to set the hrtick timer state.
1066 *
1067 * called with rq->lock held and irqs disabled
1068 */
1069static void hrtick_start(struct rq *rq, u64 delay)
1132{ 1070{
1133 struct rq *rq = cpu_rq(cpu); 1071 struct hrtimer *timer = &rq->hrtick_timer;
1134 unsigned long flags; 1072 ktime_t time = ktime_add_ns(timer->base->get_time(), delay);
1135 1073
1136 spin_lock_irqsave(&rq->lock, flags); 1074 hrtimer_set_expires(timer, time);
1137 __clear_bit(HRTICK_BLOCK, &rq->hrtick_flags); 1075
1138 spin_unlock_irqrestore(&rq->lock, flags); 1076 if (rq == this_rq()) {
1077 hrtimer_restart(timer);
1078 } else if (!rq->hrtick_csd_pending) {
1079 __smp_call_function_single(cpu_of(rq), &rq->hrtick_csd);
1080 rq->hrtick_csd_pending = 1;
1081 }
1139} 1082}
1140 1083
1141static int 1084static int
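
The flag-based deferral is replaced above by a per-rq call_single_data: starting a
remote hrtick now sends an IPI that runs __hrtick_start() on the owning CPU. A rough
sketch of the same generic pattern, under the assumption that the names below
(my_remote_kick, my_remote_fn, my_kick_cpu) are illustrative and not from the patch:

#include <linux/smp.h>

struct my_remote_kick {
	struct call_single_data csd;
	int payload;
};

/* Runs in hardirq (IPI) context on the target CPU. */
static void my_remote_fn(void *info)
{
	struct my_remote_kick *k = info;

	k->payload++;		/* example side effect performed remotely */
}

static void my_kick_cpu(int cpu, struct my_remote_kick *k)
{
	k->csd.flags = 0;
	k->csd.func = my_remote_fn;
	k->csd.info = k;
	__smp_call_function_single(cpu, &k->csd);
}
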
@@ -1150,70 +1093,60 @@ hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu)
1150 case CPU_DOWN_PREPARE_FROZEN: 1093 case CPU_DOWN_PREPARE_FROZEN:
1151 case CPU_DEAD: 1094 case CPU_DEAD:
1152 case CPU_DEAD_FROZEN: 1095 case CPU_DEAD_FROZEN:
1153 hotplug_hrtick_disable(cpu); 1096 hrtick_clear(cpu_rq(cpu));
1154 return NOTIFY_OK;
1155
1156 case CPU_UP_PREPARE:
1157 case CPU_UP_PREPARE_FROZEN:
1158 case CPU_DOWN_FAILED:
1159 case CPU_DOWN_FAILED_FROZEN:
1160 case CPU_ONLINE:
1161 case CPU_ONLINE_FROZEN:
1162 hotplug_hrtick_enable(cpu);
1163 return NOTIFY_OK; 1097 return NOTIFY_OK;
1164 } 1098 }
1165 1099
1166 return NOTIFY_DONE; 1100 return NOTIFY_DONE;
1167} 1101}
1168 1102
1169static void init_hrtick(void) 1103static __init void init_hrtick(void)
1170{ 1104{
1171 hotcpu_notifier(hotplug_hrtick, 0); 1105 hotcpu_notifier(hotplug_hrtick, 0);
1172} 1106}
1173#endif /* CONFIG_SMP */ 1107#else
1108/*
1109 * Called to set the hrtick timer state.
1110 *
1111 * called with rq->lock held and irqs disabled
1112 */
1113static void hrtick_start(struct rq *rq, u64 delay)
1114{
1115 hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), HRTIMER_MODE_REL);
1116}
1174 1117
1175static void init_rq_hrtick(struct rq *rq) 1118static inline void init_hrtick(void)
1176{ 1119{
1177 rq->hrtick_flags = 0;
1178 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
1179 rq->hrtick_timer.function = hrtick;
1180 rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ;
1181} 1120}
1121#endif /* CONFIG_SMP */
1182 1122
1183void hrtick_resched(void) 1123static void init_rq_hrtick(struct rq *rq)
1184{ 1124{
1185 struct rq *rq; 1125#ifdef CONFIG_SMP
1186 unsigned long flags; 1126 rq->hrtick_csd_pending = 0;
1187 1127
1188 if (!test_thread_flag(TIF_HRTICK_RESCHED)) 1128 rq->hrtick_csd.flags = 0;
1189 return; 1129 rq->hrtick_csd.func = __hrtick_start;
1130 rq->hrtick_csd.info = rq;
1131#endif
1190 1132
1191 local_irq_save(flags); 1133 hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
1192 rq = cpu_rq(smp_processor_id()); 1134 rq->hrtick_timer.function = hrtick;
1193 hrtick_set(rq); 1135 rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU;
1194 local_irq_restore(flags);
1195} 1136}
1196#else 1137#else /* CONFIG_SCHED_HRTICK */
1197static inline void hrtick_clear(struct rq *rq) 1138static inline void hrtick_clear(struct rq *rq)
1198{ 1139{
1199} 1140}
1200 1141
1201static inline void hrtick_set(struct rq *rq)
1202{
1203}
1204
1205static inline void init_rq_hrtick(struct rq *rq) 1142static inline void init_rq_hrtick(struct rq *rq)
1206{ 1143{
1207} 1144}
1208 1145
1209void hrtick_resched(void)
1210{
1211}
1212
1213static inline void init_hrtick(void) 1146static inline void init_hrtick(void)
1214{ 1147{
1215} 1148}
1216#endif 1149#endif /* CONFIG_SCHED_HRTICK */
1217 1150
1218/* 1151/*
1219 * resched_task - mark a task 'to be rescheduled now'. 1152 * resched_task - mark a task 'to be rescheduled now'.
@@ -1228,16 +1161,16 @@ static inline void init_hrtick(void)
1228#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) 1161#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG)
1229#endif 1162#endif
1230 1163
1231static void __resched_task(struct task_struct *p, int tif_bit) 1164static void resched_task(struct task_struct *p)
1232{ 1165{
1233 int cpu; 1166 int cpu;
1234 1167
1235 assert_spin_locked(&task_rq(p)->lock); 1168 assert_spin_locked(&task_rq(p)->lock);
1236 1169
1237 if (unlikely(test_tsk_thread_flag(p, tif_bit))) 1170 if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED)))
1238 return; 1171 return;
1239 1172
1240 set_tsk_thread_flag(p, tif_bit); 1173 set_tsk_thread_flag(p, TIF_NEED_RESCHED);
1241 1174
1242 cpu = task_cpu(p); 1175 cpu = task_cpu(p);
1243 if (cpu == smp_processor_id()) 1176 if (cpu == smp_processor_id())
@@ -1303,10 +1236,10 @@ void wake_up_idle_cpu(int cpu)
1303#endif /* CONFIG_NO_HZ */ 1236#endif /* CONFIG_NO_HZ */
1304 1237
1305#else /* !CONFIG_SMP */ 1238#else /* !CONFIG_SMP */
1306static void __resched_task(struct task_struct *p, int tif_bit) 1239static void resched_task(struct task_struct *p)
1307{ 1240{
1308 assert_spin_locked(&task_rq(p)->lock); 1241 assert_spin_locked(&task_rq(p)->lock);
1309 set_tsk_thread_flag(p, tif_bit); 1242 set_tsk_need_resched(p);
1310} 1243}
1311#endif /* CONFIG_SMP */ 1244#endif /* CONFIG_SMP */
1312 1245
@@ -1460,38 +1393,24 @@ static inline void dec_cpu_load(struct rq *rq, unsigned long load)
1460 update_load_sub(&rq->load, load); 1393 update_load_sub(&rq->load, load);
1461} 1394}
1462 1395
1463#ifdef CONFIG_SMP 1396#if (defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)) || defined(CONFIG_RT_GROUP_SCHED)
1464static unsigned long source_load(int cpu, int type); 1397typedef int (*tg_visitor)(struct task_group *, void *);
1465static unsigned long target_load(int cpu, int type);
1466static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
1467
1468static unsigned long cpu_avg_load_per_task(int cpu)
1469{
1470 struct rq *rq = cpu_rq(cpu);
1471
1472 if (rq->nr_running)
1473 rq->avg_load_per_task = rq->load.weight / rq->nr_running;
1474
1475 return rq->avg_load_per_task;
1476}
1477
1478#ifdef CONFIG_FAIR_GROUP_SCHED
1479
1480typedef void (*tg_visitor)(struct task_group *, int, struct sched_domain *);
1481 1398
1482/* 1399/*
1483 * Iterate the full tree, calling @down when first entering a node and @up when 1400 * Iterate the full tree, calling @down when first entering a node and @up when
1484 * leaving it for the final time. 1401 * leaving it for the final time.
1485 */ 1402 */
1486static void 1403static int walk_tg_tree(tg_visitor down, tg_visitor up, void *data)
1487walk_tg_tree(tg_visitor down, tg_visitor up, int cpu, struct sched_domain *sd)
1488{ 1404{
1489 struct task_group *parent, *child; 1405 struct task_group *parent, *child;
1406 int ret;
1490 1407
1491 rcu_read_lock(); 1408 rcu_read_lock();
1492 parent = &root_task_group; 1409 parent = &root_task_group;
1493down: 1410down:
1494 (*down)(parent, cpu, sd); 1411 ret = (*down)(parent, data);
1412 if (ret)
1413 goto out_unlock;
1495 list_for_each_entry_rcu(child, &parent->children, siblings) { 1414 list_for_each_entry_rcu(child, &parent->children, siblings) {
1496 parent = child; 1415 parent = child;
1497 goto down; 1416 goto down;
@@ -1499,23 +1418,51 @@ down:
1499up: 1418up:
1500 continue; 1419 continue;
1501 } 1420 }
1502 (*up)(parent, cpu, sd); 1421 ret = (*up)(parent, data);
1422 if (ret)
1423 goto out_unlock;
1503 1424
1504 child = parent; 1425 child = parent;
1505 parent = parent->parent; 1426 parent = parent->parent;
1506 if (parent) 1427 if (parent)
1507 goto up; 1428 goto up;
1429out_unlock:
1508 rcu_read_unlock(); 1430 rcu_read_unlock();
1431
1432 return ret;
1433}
1434
1435static int tg_nop(struct task_group *tg, void *data)
1436{
1437 return 0;
1438}
1439#endif
1440
1441#ifdef CONFIG_SMP
1442static unsigned long source_load(int cpu, int type);
1443static unsigned long target_load(int cpu, int type);
1444static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
1445
1446static unsigned long cpu_avg_load_per_task(int cpu)
1447{
1448 struct rq *rq = cpu_rq(cpu);
1449
1450 if (rq->nr_running)
1451 rq->avg_load_per_task = rq->load.weight / rq->nr_running;
1452
1453 return rq->avg_load_per_task;
1509} 1454}
1510 1455
1456#ifdef CONFIG_FAIR_GROUP_SCHED
1457
1511static void __set_se_shares(struct sched_entity *se, unsigned long shares); 1458static void __set_se_shares(struct sched_entity *se, unsigned long shares);
1512 1459
1513/* 1460/*
1514 * Calculate and set the cpu's group shares. 1461 * Calculate and set the cpu's group shares.
1515 */ 1462 */
1516static void 1463static void
1517__update_group_shares_cpu(struct task_group *tg, int cpu, 1464update_group_shares_cpu(struct task_group *tg, int cpu,
1518 unsigned long sd_shares, unsigned long sd_rq_weight) 1465 unsigned long sd_shares, unsigned long sd_rq_weight)
1519{ 1466{
1520 int boost = 0; 1467 int boost = 0;
1521 unsigned long shares; 1468 unsigned long shares;
@@ -1546,19 +1493,23 @@ __update_group_shares_cpu(struct task_group *tg, int cpu,
1546 * 1493 *
1547 */ 1494 */
1548 shares = (sd_shares * rq_weight) / (sd_rq_weight + 1); 1495 shares = (sd_shares * rq_weight) / (sd_rq_weight + 1);
1496 shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
1549 1497
1550 /* 1498 if (abs(shares - tg->se[cpu]->load.weight) >
1551 * record the actual number of shares, not the boosted amount. 1499 sysctl_sched_shares_thresh) {
1552 */ 1500 struct rq *rq = cpu_rq(cpu);
1553 tg->cfs_rq[cpu]->shares = boost ? 0 : shares; 1501 unsigned long flags;
1554 tg->cfs_rq[cpu]->rq_weight = rq_weight;
1555 1502
1556 if (shares < MIN_SHARES) 1503 spin_lock_irqsave(&rq->lock, flags);
1557 shares = MIN_SHARES; 1504 /*
1558 else if (shares > MAX_SHARES) 1505 * record the actual number of shares, not the boosted amount.
1559 shares = MAX_SHARES; 1506 */
1507 tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
1508 tg->cfs_rq[cpu]->rq_weight = rq_weight;
1560 1509
1561 __set_se_shares(tg->se[cpu], shares); 1510 __set_se_shares(tg->se[cpu], shares);
1511 spin_unlock_irqrestore(&rq->lock, flags);
1512 }
1562} 1513}
1563 1514
1564/* 1515/*
@@ -1566,11 +1517,11 @@ __update_group_shares_cpu(struct task_group *tg, int cpu,
1566 * This needs to be done in a bottom-up fashion because the rq weight of a 1517 * This needs to be done in a bottom-up fashion because the rq weight of a
1567 * parent group depends on the shares of its child groups. 1518 * parent group depends on the shares of its child groups.
1568 */ 1519 */
1569static void 1520static int tg_shares_up(struct task_group *tg, void *data)
1570tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd)
1571{ 1521{
1572 unsigned long rq_weight = 0; 1522 unsigned long rq_weight = 0;
1573 unsigned long shares = 0; 1523 unsigned long shares = 0;
1524 struct sched_domain *sd = data;
1574 int i; 1525 int i;
1575 1526
1576 for_each_cpu_mask(i, sd->span) { 1527 for_each_cpu_mask(i, sd->span) {
@@ -1587,14 +1538,10 @@ tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd)
1587 if (!rq_weight) 1538 if (!rq_weight)
1588 rq_weight = cpus_weight(sd->span) * NICE_0_LOAD; 1539 rq_weight = cpus_weight(sd->span) * NICE_0_LOAD;
1589 1540
1590 for_each_cpu_mask(i, sd->span) { 1541 for_each_cpu_mask(i, sd->span)
1591 struct rq *rq = cpu_rq(i); 1542 update_group_shares_cpu(tg, i, shares, rq_weight);
1592 unsigned long flags;
1593 1543
1594 spin_lock_irqsave(&rq->lock, flags); 1544 return 0;
1595 __update_group_shares_cpu(tg, i, shares, rq_weight);
1596 spin_unlock_irqrestore(&rq->lock, flags);
1597 }
1598} 1545}
1599 1546
1600/* 1547/*
@@ -1602,10 +1549,10 @@ tg_shares_up(struct task_group *tg, int cpu, struct sched_domain *sd)
1602 * This needs to be done in a top-down fashion because the load of a child 1549 * This needs to be done in a top-down fashion because the load of a child
1603 * group is a fraction of its parents load. 1550 * group is a fraction of its parents load.
1604 */ 1551 */
1605static void 1552static int tg_load_down(struct task_group *tg, void *data)
1606tg_load_down(struct task_group *tg, int cpu, struct sched_domain *sd)
1607{ 1553{
1608 unsigned long load; 1554 unsigned long load;
1555 long cpu = (long)data;
1609 1556
1610 if (!tg->parent) { 1557 if (!tg->parent) {
1611 load = cpu_rq(cpu)->load.weight; 1558 load = cpu_rq(cpu)->load.weight;
@@ -1616,11 +1563,8 @@ tg_load_down(struct task_group *tg, int cpu, struct sched_domain *sd)
1616 } 1563 }
1617 1564
1618 tg->cfs_rq[cpu]->h_load = load; 1565 tg->cfs_rq[cpu]->h_load = load;
1619}
1620 1566
1621static void 1567 return 0;
1622tg_nop(struct task_group *tg, int cpu, struct sched_domain *sd)
1623{
1624} 1568}
1625 1569
1626static void update_shares(struct sched_domain *sd) 1570static void update_shares(struct sched_domain *sd)
@@ -1630,7 +1574,7 @@ static void update_shares(struct sched_domain *sd)
1630 1574
1631 if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) { 1575 if (elapsed >= (s64)(u64)sysctl_sched_shares_ratelimit) {
1632 sd->last_update = now; 1576 sd->last_update = now;
1633 walk_tg_tree(tg_nop, tg_shares_up, 0, sd); 1577 walk_tg_tree(tg_nop, tg_shares_up, sd);
1634 } 1578 }
1635} 1579}
1636 1580
@@ -1641,9 +1585,9 @@ static void update_shares_locked(struct rq *rq, struct sched_domain *sd)
1641 spin_lock(&rq->lock); 1585 spin_lock(&rq->lock);
1642} 1586}
1643 1587
1644static void update_h_load(int cpu) 1588static void update_h_load(long cpu)
1645{ 1589{
1646 walk_tg_tree(tg_load_down, tg_nop, cpu, NULL); 1590 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
1647} 1591}
1648 1592
1649#else 1593#else
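
walk_tg_tree() now threads an opaque void *data through the visitors and aborts the
walk as soon as a visitor returns non-zero, which the tg_shares_up()/tg_load_down()
callers above rely on. A minimal sketch of a visitor pair written against the new
signature; the counting helpers are hypothetical, sched.c-internal style, and assume
the same config guards as walk_tg_tree() itself:

/* Count the task groups in the hierarchy; returning 0 keeps the walk going. */
static int tg_count_one(struct task_group *tg, void *data)
{
	(*(unsigned int *)data)++;
	return 0;
}

static unsigned int count_task_groups(void)
{
	unsigned int nr = 0;

	walk_tg_tree(tg_count_one, tg_nop, &nr);
	return nr;
}
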
@@ -1946,16 +1890,24 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
1946/* 1890/*
1947 * wait_task_inactive - wait for a thread to unschedule. 1891 * wait_task_inactive - wait for a thread to unschedule.
1948 * 1892 *
1893 * If @match_state is nonzero, it's the @p->state value just checked and
1894 * not expected to change. If it changes, i.e. @p might have woken up,
1895 * then return zero. When we succeed in waiting for @p to be off its CPU,
1896 * we return a positive number (its total switch count). If a second call
1897 * a short while later returns the same number, the caller can be sure that
1898 * @p has remained unscheduled the whole time.
1899 *
1949 * The caller must ensure that the task *will* unschedule sometime soon, 1900 * The caller must ensure that the task *will* unschedule sometime soon,
1950 * else this function might spin for a *long* time. This function can't 1901 * else this function might spin for a *long* time. This function can't
1951 * be called with interrupts off, or it may introduce deadlock with 1902 * be called with interrupts off, or it may introduce deadlock with
1952 * smp_call_function() if an IPI is sent by the same process we are 1903 * smp_call_function() if an IPI is sent by the same process we are
1953 * waiting to become inactive. 1904 * waiting to become inactive.
1954 */ 1905 */
1955void wait_task_inactive(struct task_struct *p) 1906unsigned long wait_task_inactive(struct task_struct *p, long match_state)
1956{ 1907{
1957 unsigned long flags; 1908 unsigned long flags;
1958 int running, on_rq; 1909 int running, on_rq;
1910 unsigned long ncsw;
1959 struct rq *rq; 1911 struct rq *rq;
1960 1912
1961 for (;;) { 1913 for (;;) {
@@ -1978,8 +1930,11 @@ void wait_task_inactive(struct task_struct *p)
1978 * return false if the runqueue has changed and p 1930 * return false if the runqueue has changed and p
1979 * is actually now running somewhere else! 1931 * is actually now running somewhere else!
1980 */ 1932 */
1981 while (task_running(rq, p)) 1933 while (task_running(rq, p)) {
1934 if (match_state && unlikely(p->state != match_state))
1935 return 0;
1982 cpu_relax(); 1936 cpu_relax();
1937 }
1983 1938
1984 /* 1939 /*
1985 * Ok, time to look more closely! We need the rq 1940 * Ok, time to look more closely! We need the rq
@@ -1987,11 +1942,21 @@ void wait_task_inactive(struct task_struct *p)
1987 * just go back and repeat. 1942 * just go back and repeat.
1988 */ 1943 */
1989 rq = task_rq_lock(p, &flags); 1944 rq = task_rq_lock(p, &flags);
1945 trace_sched_wait_task(rq, p);
1990 running = task_running(rq, p); 1946 running = task_running(rq, p);
1991 on_rq = p->se.on_rq; 1947 on_rq = p->se.on_rq;
1948 ncsw = 0;
1949 if (!match_state || p->state == match_state)
1950 ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
1992 task_rq_unlock(rq, &flags); 1951 task_rq_unlock(rq, &flags);
1993 1952
1994 /* 1953 /*
1954 * If it changed from the expected state, bail out now.
1955 */
1956 if (unlikely(!ncsw))
1957 break;
1958
1959 /*
1995 * Was it really running after all now that we 1960 * Was it really running after all now that we
1996 * checked with the proper locks actually held? 1961 * checked with the proper locks actually held?
1997 * 1962 *
@@ -2023,6 +1988,8 @@ void wait_task_inactive(struct task_struct *p)
2023 */ 1988 */
2024 break; 1989 break;
2025 } 1990 }
1991
1992 return ncsw;
2026} 1993}
2027 1994
2028/*** 1995/***
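
The return contract documented above (0 if @p left @match_state, otherwise a positive
context-switch count that can be compared across calls) lends itself to a
poll-and-confirm loop. A hedged sketch; the helper name and the TASK_TRACED state are
illustrative choices, not taken from this patch:

#include <linux/sched.h>
#include <linux/errno.h>

static int my_wait_until_parked(struct task_struct *p)
{
	unsigned long prev = 0, ncsw;

	for (;;) {
		ncsw = wait_task_inactive(p, TASK_TRACED);
		if (!ncsw)
			return -ESRCH;	/* it left TASK_TRACED, i.e. woke up */
		if (ncsw == prev)
			return 0;	/* same count twice: stayed off the CPU */
		prev = ncsw;
		schedule_timeout_uninterruptible(1);
	}
}
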
@@ -2108,7 +2075,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
2108 /* Tally up the load of all CPUs in the group */ 2075 /* Tally up the load of all CPUs in the group */
2109 avg_load = 0; 2076 avg_load = 0;
2110 2077
2111 for_each_cpu_mask(i, group->cpumask) { 2078 for_each_cpu_mask_nr(i, group->cpumask) {
2112 /* Bias balancing toward cpus of our domain */ 2079 /* Bias balancing toward cpus of our domain */
2113 if (local_group) 2080 if (local_group)
2114 load = source_load(i, load_idx); 2081 load = source_load(i, load_idx);
@@ -2150,7 +2117,7 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu,
2150 /* Traverse only the allowed CPUs */ 2117 /* Traverse only the allowed CPUs */
2151 cpus_and(*tmp, group->cpumask, p->cpus_allowed); 2118 cpus_and(*tmp, group->cpumask, p->cpus_allowed);
2152 2119
2153 for_each_cpu_mask(i, *tmp) { 2120 for_each_cpu_mask_nr(i, *tmp) {
2154 load = weighted_cpuload(i); 2121 load = weighted_cpuload(i);
2155 2122
2156 if (load < min_load || (load == min_load && i == this_cpu)) { 2123 if (load < min_load || (load == min_load && i == this_cpu)) {
@@ -2337,10 +2304,8 @@ out_activate:
2337 success = 1; 2304 success = 1;
2338 2305
2339out_running: 2306out_running:
2340 trace_mark(kernel_sched_wakeup, 2307 trace_sched_wakeup(rq, p);
2341 "pid %d state %ld ## rq %p task %p rq->curr %p", 2308 check_preempt_curr(rq, p, sync);
2342 p->pid, p->state, rq, p, rq->curr);
2343 check_preempt_curr(rq, p);
2344 2309
2345 p->state = TASK_RUNNING; 2310 p->state = TASK_RUNNING;
2346#ifdef CONFIG_SMP 2311#ifdef CONFIG_SMP
@@ -2472,10 +2437,8 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
2472 p->sched_class->task_new(rq, p); 2437 p->sched_class->task_new(rq, p);
2473 inc_nr_running(rq); 2438 inc_nr_running(rq);
2474 } 2439 }
2475 trace_mark(kernel_sched_wakeup_new, 2440 trace_sched_wakeup_new(rq, p);
2476 "pid %d state %ld ## rq %p task %p rq->curr %p", 2441 check_preempt_curr(rq, p, 0);
2477 p->pid, p->state, rq, p, rq->curr);
2478 check_preempt_curr(rq, p);
2479#ifdef CONFIG_SMP 2442#ifdef CONFIG_SMP
2480 if (p->sched_class->task_wake_up) 2443 if (p->sched_class->task_wake_up)
2481 p->sched_class->task_wake_up(rq, p); 2444 p->sched_class->task_wake_up(rq, p);
@@ -2647,11 +2610,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
2647 struct mm_struct *mm, *oldmm; 2610 struct mm_struct *mm, *oldmm;
2648 2611
2649 prepare_task_switch(rq, prev, next); 2612 prepare_task_switch(rq, prev, next);
2650 trace_mark(kernel_sched_schedule, 2613 trace_sched_switch(rq, prev, next);
2651 "prev_pid %d next_pid %d prev_state %ld "
2652 "## rq %p prev %p next %p",
2653 prev->pid, next->pid, prev->state,
2654 rq, prev, next);
2655 mm = next->mm; 2614 mm = next->mm;
2656 oldmm = prev->active_mm; 2615 oldmm = prev->active_mm;
2657 /* 2616 /*
@@ -2813,10 +2772,10 @@ static void double_rq_lock(struct rq *rq1, struct rq *rq2)
2813 } else { 2772 } else {
2814 if (rq1 < rq2) { 2773 if (rq1 < rq2) {
2815 spin_lock(&rq1->lock); 2774 spin_lock(&rq1->lock);
2816 spin_lock(&rq2->lock); 2775 spin_lock_nested(&rq2->lock, SINGLE_DEPTH_NESTING);
2817 } else { 2776 } else {
2818 spin_lock(&rq2->lock); 2777 spin_lock(&rq2->lock);
2819 spin_lock(&rq1->lock); 2778 spin_lock_nested(&rq1->lock, SINGLE_DEPTH_NESTING);
2820 } 2779 }
2821 } 2780 }
2822 update_rq_clock(rq1); 2781 update_rq_clock(rq1);
@@ -2859,14 +2818,21 @@ static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
2859 if (busiest < this_rq) { 2818 if (busiest < this_rq) {
2860 spin_unlock(&this_rq->lock); 2819 spin_unlock(&this_rq->lock);
2861 spin_lock(&busiest->lock); 2820 spin_lock(&busiest->lock);
2862 spin_lock(&this_rq->lock); 2821 spin_lock_nested(&this_rq->lock, SINGLE_DEPTH_NESTING);
2863 ret = 1; 2822 ret = 1;
2864 } else 2823 } else
2865 spin_lock(&busiest->lock); 2824 spin_lock_nested(&busiest->lock, SINGLE_DEPTH_NESTING);
2866 } 2825 }
2867 return ret; 2826 return ret;
2868} 2827}
2869 2828
2829static void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
2830 __releases(busiest->lock)
2831{
2832 spin_unlock(&busiest->lock);
2833 lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
2834}
2835
2870/* 2836/*
2871 * If dest_cpu is allowed for this process, migrate the task to it. 2837 * If dest_cpu is allowed for this process, migrate the task to it.
2872 * This is accomplished by forcing the cpu_allowed mask to only 2838 * This is accomplished by forcing the cpu_allowed mask to only
@@ -2881,9 +2847,10 @@ static void sched_migrate_task(struct task_struct *p, int dest_cpu)
2881 2847
2882 rq = task_rq_lock(p, &flags); 2848 rq = task_rq_lock(p, &flags);
2883 if (!cpu_isset(dest_cpu, p->cpus_allowed) 2849 if (!cpu_isset(dest_cpu, p->cpus_allowed)
2884 || unlikely(cpu_is_offline(dest_cpu))) 2850 || unlikely(!cpu_active(dest_cpu)))
2885 goto out; 2851 goto out;
2886 2852
2853 trace_sched_migrate_task(rq, p, dest_cpu);
2887 /* force the process onto the specified CPU */ 2854 /* force the process onto the specified CPU */
2888 if (migrate_task(p, dest_cpu, &req)) { 2855 if (migrate_task(p, dest_cpu, &req)) {
2889 /* Need to wait for migration thread (might exit: take ref). */ 2856 /* Need to wait for migration thread (might exit: take ref). */
@@ -2928,7 +2895,7 @@ static void pull_task(struct rq *src_rq, struct task_struct *p,
2928 * Note that idle threads have a prio of MAX_PRIO, for this test 2895 * Note that idle threads have a prio of MAX_PRIO, for this test
2929 * to be always true for them. 2896 * to be always true for them.
2930 */ 2897 */
2931 check_preempt_curr(this_rq, p); 2898 check_preempt_curr(this_rq, p, 0);
2932} 2899}
2933 2900
2934/* 2901/*
@@ -3168,7 +3135,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
3168 max_cpu_load = 0; 3135 max_cpu_load = 0;
3169 min_cpu_load = ~0UL; 3136 min_cpu_load = ~0UL;
3170 3137
3171 for_each_cpu_mask(i, group->cpumask) { 3138 for_each_cpu_mask_nr(i, group->cpumask) {
3172 struct rq *rq; 3139 struct rq *rq;
3173 3140
3174 if (!cpu_isset(i, *cpus)) 3141 if (!cpu_isset(i, *cpus))
@@ -3447,7 +3414,7 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
3447 unsigned long max_load = 0; 3414 unsigned long max_load = 0;
3448 int i; 3415 int i;
3449 3416
3450 for_each_cpu_mask(i, group->cpumask) { 3417 for_each_cpu_mask_nr(i, group->cpumask) {
3451 unsigned long wl; 3418 unsigned long wl;
3452 3419
3453 if (!cpu_isset(i, *cpus)) 3420 if (!cpu_isset(i, *cpus))
@@ -3691,7 +3658,7 @@ redo:
3691 ld_moved = move_tasks(this_rq, this_cpu, busiest, 3658 ld_moved = move_tasks(this_rq, this_cpu, busiest,
3692 imbalance, sd, CPU_NEWLY_IDLE, 3659 imbalance, sd, CPU_NEWLY_IDLE,
3693 &all_pinned); 3660 &all_pinned);
3694 spin_unlock(&busiest->lock); 3661 double_unlock_balance(this_rq, busiest);
3695 3662
3696 if (unlikely(all_pinned)) { 3663 if (unlikely(all_pinned)) {
3697 cpu_clear(cpu_of(busiest), *cpus); 3664 cpu_clear(cpu_of(busiest), *cpus);
@@ -3806,7 +3773,7 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
3806 else 3773 else
3807 schedstat_inc(sd, alb_failed); 3774 schedstat_inc(sd, alb_failed);
3808 } 3775 }
3809 spin_unlock(&target_rq->lock); 3776 double_unlock_balance(busiest_rq, target_rq);
3810} 3777}
3811 3778
3812#ifdef CONFIG_NO_HZ 3779#ifdef CONFIG_NO_HZ
@@ -3849,7 +3816,7 @@ int select_nohz_load_balancer(int stop_tick)
3849 /* 3816 /*
3850 * If we are going offline and still the leader, give up! 3817 * If we are going offline and still the leader, give up!
3851 */ 3818 */
3852 if (cpu_is_offline(cpu) && 3819 if (!cpu_active(cpu) &&
3853 atomic_read(&nohz.load_balancer) == cpu) { 3820 atomic_read(&nohz.load_balancer) == cpu) {
3854 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) 3821 if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
3855 BUG(); 3822 BUG();
@@ -3989,7 +3956,7 @@ static void run_rebalance_domains(struct softirq_action *h)
3989 int balance_cpu; 3956 int balance_cpu;
3990 3957
3991 cpu_clear(this_cpu, cpus); 3958 cpu_clear(this_cpu, cpus);
3992 for_each_cpu_mask(balance_cpu, cpus) { 3959 for_each_cpu_mask_nr(balance_cpu, cpus) {
3993 /* 3960 /*
3994 * If this cpu gets work to do, stop the load balancing 3961 * If this cpu gets work to do, stop the load balancing
3995 * work being done for other cpus. Next load 3962 * work being done for other cpus. Next load
@@ -4085,23 +4052,26 @@ DEFINE_PER_CPU(struct kernel_stat, kstat);
4085EXPORT_PER_CPU_SYMBOL(kstat); 4052EXPORT_PER_CPU_SYMBOL(kstat);
4086 4053
4087/* 4054/*
4088 * Return p->sum_exec_runtime plus any more ns on the sched_clock 4055 * Return any ns on the sched_clock that have not yet been banked in
4089 * that have not yet been banked in case the task is currently running. 4056 * @p in case that task is currently running.
4090 */ 4057 */
4091unsigned long long task_sched_runtime(struct task_struct *p) 4058unsigned long long task_delta_exec(struct task_struct *p)
4092{ 4059{
4093 unsigned long flags; 4060 unsigned long flags;
4094 u64 ns, delta_exec;
4095 struct rq *rq; 4061 struct rq *rq;
4062 u64 ns = 0;
4096 4063
4097 rq = task_rq_lock(p, &flags); 4064 rq = task_rq_lock(p, &flags);
4098 ns = p->se.sum_exec_runtime; 4065
4099 if (task_current(rq, p)) { 4066 if (task_current(rq, p)) {
4067 u64 delta_exec;
4068
4100 update_rq_clock(rq); 4069 update_rq_clock(rq);
4101 delta_exec = rq->clock - p->se.exec_start; 4070 delta_exec = rq->clock - p->se.exec_start;
4102 if ((s64)delta_exec > 0) 4071 if ((s64)delta_exec > 0)
4103 ns += delta_exec; 4072 ns = delta_exec;
4104 } 4073 }
4074
4105 task_rq_unlock(rq, &flags); 4075 task_rq_unlock(rq, &flags);
4106 4076
4107 return ns; 4077 return ns;
@@ -4118,6 +4088,7 @@ void account_user_time(struct task_struct *p, cputime_t cputime)
4118 cputime64_t tmp; 4088 cputime64_t tmp;
4119 4089
4120 p->utime = cputime_add(p->utime, cputime); 4090 p->utime = cputime_add(p->utime, cputime);
4091 account_group_user_time(p, cputime);
4121 4092
4122 /* Add user time to cpustat. */ 4093 /* Add user time to cpustat. */
4123 tmp = cputime_to_cputime64(cputime); 4094 tmp = cputime_to_cputime64(cputime);
@@ -4125,6 +4096,8 @@ void account_user_time(struct task_struct *p, cputime_t cputime)
4125 cpustat->nice = cputime64_add(cpustat->nice, tmp); 4096 cpustat->nice = cputime64_add(cpustat->nice, tmp);
4126 else 4097 else
4127 cpustat->user = cputime64_add(cpustat->user, tmp); 4098 cpustat->user = cputime64_add(cpustat->user, tmp);
4099 /* Account for user time used */
4100 acct_update_integrals(p);
4128} 4101}
4129 4102
4130/* 4103/*
@@ -4140,6 +4113,7 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime)
4140 tmp = cputime_to_cputime64(cputime); 4113 tmp = cputime_to_cputime64(cputime);
4141 4114
4142 p->utime = cputime_add(p->utime, cputime); 4115 p->utime = cputime_add(p->utime, cputime);
4116 account_group_user_time(p, cputime);
4143 p->gtime = cputime_add(p->gtime, cputime); 4117 p->gtime = cputime_add(p->gtime, cputime);
4144 4118
4145 cpustat->user = cputime64_add(cpustat->user, tmp); 4119 cpustat->user = cputime64_add(cpustat->user, tmp);
@@ -4175,6 +4149,7 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
4175 } 4149 }
4176 4150
4177 p->stime = cputime_add(p->stime, cputime); 4151 p->stime = cputime_add(p->stime, cputime);
4152 account_group_system_time(p, cputime);
4178 4153
4179 /* Add system time to cpustat. */ 4154 /* Add system time to cpustat. */
4180 tmp = cputime_to_cputime64(cputime); 4155 tmp = cputime_to_cputime64(cputime);
@@ -4216,6 +4191,7 @@ void account_steal_time(struct task_struct *p, cputime_t steal)
4216 4191
4217 if (p == rq->idle) { 4192 if (p == rq->idle) {
4218 p->stime = cputime_add(p->stime, steal); 4193 p->stime = cputime_add(p->stime, steal);
4194 account_group_system_time(p, steal);
4219 if (atomic_read(&rq->nr_iowait) > 0) 4195 if (atomic_read(&rq->nr_iowait) > 0)
4220 cpustat->iowait = cputime64_add(cpustat->iowait, tmp); 4196 cpustat->iowait = cputime64_add(cpustat->iowait, tmp);
4221 else 4197 else
@@ -4225,6 +4201,65 @@ void account_steal_time(struct task_struct *p, cputime_t steal)
4225} 4201}
4226 4202
4227/* 4203/*
4204 * Use precise platform statistics if available:
4205 */
4206#ifdef CONFIG_VIRT_CPU_ACCOUNTING
4207cputime_t task_utime(struct task_struct *p)
4208{
4209 return p->utime;
4210}
4211
4212cputime_t task_stime(struct task_struct *p)
4213{
4214 return p->stime;
4215}
4216#else
4217cputime_t task_utime(struct task_struct *p)
4218{
4219 clock_t utime = cputime_to_clock_t(p->utime),
4220 total = utime + cputime_to_clock_t(p->stime);
4221 u64 temp;
4222
4223 /*
4224 * Use CFS's precise accounting:
4225 */
4226 temp = (u64)nsec_to_clock_t(p->se.sum_exec_runtime);
4227
4228 if (total) {
4229 temp *= utime;
4230 do_div(temp, total);
4231 }
4232 utime = (clock_t)temp;
4233
4234 p->prev_utime = max(p->prev_utime, clock_t_to_cputime(utime));
4235 return p->prev_utime;
4236}
4237
4238cputime_t task_stime(struct task_struct *p)
4239{
4240 clock_t stime;
4241
4242 /*
4243 * Use CFS's precise accounting. (we subtract utime from
4244 * the total, to make sure the total observed by userspace
4245 * grows monotonically - apps rely on that):
4246 */
4247 stime = nsec_to_clock_t(p->se.sum_exec_runtime) -
4248 cputime_to_clock_t(task_utime(p));
4249
4250 if (stime >= 0)
4251 p->prev_stime = max(p->prev_stime, clock_t_to_cputime(stime));
4252
4253 return p->prev_stime;
4254}
4255#endif
4256
4257inline cputime_t task_gtime(struct task_struct *p)
4258{
4259 return p->gtime;
4260}
4261
4262/*
4228 * This function gets called by the timer code, with HZ frequency. 4263 * This function gets called by the timer code, with HZ frequency.
4229 * We call it with interrupts disabled. 4264 * We call it with interrupts disabled.
4230 * 4265 *
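
To make the scaling in task_utime()/task_stime() concrete with illustrative numbers:
if the sampled split is p->utime = 60 ticks and p->stime = 40 ticks while CFS has
banked sum_exec_runtime worth 200 ticks, task_utime() reports 200 * 60 / 100 = 120
ticks and task_stime() reports 200 - 120 = 80 ticks, so the precise total is kept
while the user/system split follows the sampled ratio; prev_utime/prev_stime then
clamp both values so they never decrease between successive reads.
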
@@ -4395,7 +4430,7 @@ asmlinkage void __sched schedule(void)
4395 struct task_struct *prev, *next; 4430 struct task_struct *prev, *next;
4396 unsigned long *switch_count; 4431 unsigned long *switch_count;
4397 struct rq *rq; 4432 struct rq *rq;
4398 int cpu, hrtick = sched_feat(HRTICK); 4433 int cpu;
4399 4434
4400need_resched: 4435need_resched:
4401 preempt_disable(); 4436 preempt_disable();
@@ -4410,15 +4445,11 @@ need_resched_nonpreemptible:
4410 4445
4411 schedule_debug(prev); 4446 schedule_debug(prev);
4412 4447
4413 if (hrtick) 4448 if (sched_feat(HRTICK))
4414 hrtick_clear(rq); 4449 hrtick_clear(rq);
4415 4450
4416 /* 4451 spin_lock_irq(&rq->lock);
4417 * Do the rq-clock update outside the rq lock:
4418 */
4419 local_irq_disable();
4420 update_rq_clock(rq); 4452 update_rq_clock(rq);
4421 spin_lock(&rq->lock);
4422 clear_tsk_need_resched(prev); 4453 clear_tsk_need_resched(prev);
4423 4454
4424 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { 4455 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
@@ -4457,9 +4488,6 @@ need_resched_nonpreemptible:
4457 } else 4488 } else
4458 spin_unlock_irq(&rq->lock); 4489 spin_unlock_irq(&rq->lock);
4459 4490
4460 if (hrtick)
4461 hrtick_set(rq);
4462
4463 if (unlikely(reacquire_kernel_lock(current) < 0)) 4491 if (unlikely(reacquire_kernel_lock(current) < 0))
4464 goto need_resched_nonpreemptible; 4492 goto need_resched_nonpreemptible;
4465 4493
@@ -4617,6 +4645,15 @@ __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
4617} 4645}
4618EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ 4646EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */
4619 4647
4648/**
4649 * complete: - signals a single thread waiting on this completion
4650 * @x: holds the state of this particular completion
4651 *
4652 * This will wake up a single thread waiting on this completion. Threads will be
4653 * awakened in the same order in which they were queued.
4654 *
4655 * See also complete_all(), wait_for_completion() and related routines.
4656 */
4620void complete(struct completion *x) 4657void complete(struct completion *x)
4621{ 4658{
4622 unsigned long flags; 4659 unsigned long flags;
@@ -4628,6 +4665,12 @@ void complete(struct completion *x)
4628} 4665}
4629EXPORT_SYMBOL(complete); 4666EXPORT_SYMBOL(complete);
4630 4667
4668/**
4669 * complete_all: - signals all threads waiting on this completion
4670 * @x: holds the state of this particular completion
4671 *
4672 * This will wake up all threads waiting on this particular completion event.
4673 */
4631void complete_all(struct completion *x) 4674void complete_all(struct completion *x)
4632{ 4675{
4633 unsigned long flags; 4676 unsigned long flags;
@@ -4648,10 +4691,7 @@ do_wait_for_common(struct completion *x, long timeout, int state)
4648 wait.flags |= WQ_FLAG_EXCLUSIVE; 4691 wait.flags |= WQ_FLAG_EXCLUSIVE;
4649 __add_wait_queue_tail(&x->wait, &wait); 4692 __add_wait_queue_tail(&x->wait, &wait);
4650 do { 4693 do {
4651 if ((state == TASK_INTERRUPTIBLE && 4694 if (signal_pending_state(state, current)) {
4652 signal_pending(current)) ||
4653 (state == TASK_KILLABLE &&
4654 fatal_signal_pending(current))) {
4655 timeout = -ERESTARTSYS; 4695 timeout = -ERESTARTSYS;
4656 break; 4696 break;
4657 } 4697 }
@@ -4679,12 +4719,31 @@ wait_for_common(struct completion *x, long timeout, int state)
4679 return timeout; 4719 return timeout;
4680} 4720}
4681 4721
4722/**
4723 * wait_for_completion: - waits for completion of a task
4724 * @x: holds the state of this particular completion
4725 *
4726 * This waits to be signaled for completion of a specific task. It is NOT
4727 * interruptible and there is no timeout.
4728 *
4729 * See also similar routines (i.e. wait_for_completion_timeout()) with timeout
4730 * and interrupt capability. Also see complete().
4731 */
4682void __sched wait_for_completion(struct completion *x) 4732void __sched wait_for_completion(struct completion *x)
4683{ 4733{
4684 wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE); 4734 wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_UNINTERRUPTIBLE);
4685} 4735}
4686EXPORT_SYMBOL(wait_for_completion); 4736EXPORT_SYMBOL(wait_for_completion);
4687 4737
4738/**
4739 * wait_for_completion_timeout: - waits for completion of a task (w/timeout)
4740 * @x: holds the state of this particular completion
4741 * @timeout: timeout value in jiffies
4742 *
4743 * This waits for either a completion of a specific task to be signaled or for a
4744 * specified timeout to expire. The timeout is in jiffies. It is not
4745 * interruptible.
4746 */
4688unsigned long __sched 4747unsigned long __sched
4689wait_for_completion_timeout(struct completion *x, unsigned long timeout) 4748wait_for_completion_timeout(struct completion *x, unsigned long timeout)
4690{ 4749{
@@ -4692,6 +4751,13 @@ wait_for_completion_timeout(struct completion *x, unsigned long timeout)
4692} 4751}
4693EXPORT_SYMBOL(wait_for_completion_timeout); 4752EXPORT_SYMBOL(wait_for_completion_timeout);
4694 4753
4754/**
4755 * wait_for_completion_interruptible: - waits for completion of a task (w/intr)
4756 * @x: holds the state of this particular completion
4757 *
4758 * This waits for completion of a specific task to be signaled. It is
4759 * interruptible.
4760 */
4695int __sched wait_for_completion_interruptible(struct completion *x) 4761int __sched wait_for_completion_interruptible(struct completion *x)
4696{ 4762{
4697 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE); 4763 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_INTERRUPTIBLE);
@@ -4701,6 +4767,14 @@ int __sched wait_for_completion_interruptible(struct completion *x)
4701} 4767}
4702EXPORT_SYMBOL(wait_for_completion_interruptible); 4768EXPORT_SYMBOL(wait_for_completion_interruptible);
4703 4769
4770/**
4771 * wait_for_completion_interruptible_timeout: - waits for completion (w/(to,intr))
4772 * @x: holds the state of this particular completion
4773 * @timeout: timeout value in jiffies
4774 *
4775 * This waits for either a completion of a specific task to be signaled or for a
4776 * specified timeout to expire. It is interruptible. The timeout is in jiffies.
4777 */
4704unsigned long __sched 4778unsigned long __sched
4705wait_for_completion_interruptible_timeout(struct completion *x, 4779wait_for_completion_interruptible_timeout(struct completion *x,
4706 unsigned long timeout) 4780 unsigned long timeout)
@@ -4709,6 +4783,13 @@ wait_for_completion_interruptible_timeout(struct completion *x,
4709} 4783}
4710EXPORT_SYMBOL(wait_for_completion_interruptible_timeout); 4784EXPORT_SYMBOL(wait_for_completion_interruptible_timeout);
4711 4785
4786/**
4787 * wait_for_completion_killable: - waits for completion of a task (killable)
4788 * @x: holds the state of this particular completion
4789 *
4790 * This waits to be signaled for completion of a specific task. It can be
4791 * interrupted by a kill signal.
4792 */
4712int __sched wait_for_completion_killable(struct completion *x) 4793int __sched wait_for_completion_killable(struct completion *x)
4713{ 4794{
4714 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE); 4795 long t = wait_for_common(x, MAX_SCHEDULE_TIMEOUT, TASK_KILLABLE);
@@ -4718,6 +4799,52 @@ int __sched wait_for_completion_killable(struct completion *x)
4718} 4799}
4719EXPORT_SYMBOL(wait_for_completion_killable); 4800EXPORT_SYMBOL(wait_for_completion_killable);
4720 4801
4802/**
4803 * try_wait_for_completion - try to decrement a completion without blocking
4804 * @x: completion structure
4805 *
4806 * Returns: 0 if a decrement cannot be done without blocking
4807 * 1 if a decrement succeeded.
4808 *
4809 * If a completion is being used as a counting completion,
4810 * attempt to decrement the counter without blocking. This
4811 * enables us to avoid waiting if the resource the completion
4812 * is protecting is not available.
4813 */
4814bool try_wait_for_completion(struct completion *x)
4815{
4816 int ret = 1;
4817
4818 spin_lock_irq(&x->wait.lock);
4819 if (!x->done)
4820 ret = 0;
4821 else
4822 x->done--;
4823 spin_unlock_irq(&x->wait.lock);
4824 return ret;
4825}
4826EXPORT_SYMBOL(try_wait_for_completion);
4827
4828/**
4829 * completion_done - Test to see if a completion has any waiters
4830 * @x: completion structure
4831 *
4832 * Returns: 0 if there are waiters (wait_for_completion() in progress)
4833 * 1 if there are no waiters.
4834 *
4835 */
4836bool completion_done(struct completion *x)
4837{
4838 int ret = 1;
4839
4840 spin_lock_irq(&x->wait.lock);
4841 if (!x->done)
4842 ret = 0;
4843 spin_unlock_irq(&x->wait.lock);
4844 return ret;
4845}
4846EXPORT_SYMBOL(completion_done);
4847
4721static long __sched 4848static long __sched
4722sleep_on_common(wait_queue_head_t *q, int state, long timeout) 4849sleep_on_common(wait_queue_head_t *q, int state, long timeout)
4723{ 4850{
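
The kernel-doc added above spells out the pairing: complete() wakes a single waiter in
queue order, complete_all() wakes every waiter, and try_wait_for_completion() /
completion_done() are the non-blocking probes. A minimal sketch of the common pattern;
my_done, my_producer and my_consumer are illustrative names, not part of this patch.

#include <linux/completion.h>

static DECLARE_COMPLETION(my_done);

static void my_producer(void)
{
	/* ... produce the result ... */
	complete(&my_done);			/* wake exactly one waiter */
}

static void my_consumer(void)
{
	if (try_wait_for_completion(&my_done))
		return;				/* already signalled, no sleep */

	wait_for_completion(&my_done);		/* uninterruptible, no timeout */
}
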
@@ -5059,19 +5186,22 @@ recheck:
5059 return -EPERM; 5186 return -EPERM;
5060 } 5187 }
5061 5188
5189 if (user) {
5062#ifdef CONFIG_RT_GROUP_SCHED 5190#ifdef CONFIG_RT_GROUP_SCHED
5063 /* 5191 /*
5064 * Do not allow realtime tasks into groups that have no runtime 5192 * Do not allow realtime tasks into groups that have no runtime
5065 * assigned. 5193 * assigned.
5066 */ 5194 */
5067 if (user 5195 if (rt_bandwidth_enabled() && rt_policy(policy) &&
5068 && rt_policy(policy) && task_group(p)->rt_bandwidth.rt_runtime == 0) 5196 task_group(p)->rt_bandwidth.rt_runtime == 0)
5069 return -EPERM; 5197 return -EPERM;
5070#endif 5198#endif
5071 5199
5072 retval = security_task_setscheduler(p, policy, param); 5200 retval = security_task_setscheduler(p, policy, param);
5073 if (retval) 5201 if (retval)
5074 return retval; 5202 return retval;
5203 }
5204
5075 /* 5205 /*
5076 * make sure no PI-waiters arrive (or leave) while we are 5206 * make sure no PI-waiters arrive (or leave) while we are
5077 * changing the priority of the task: 5207 * changing the priority of the task:
@@ -5787,6 +5917,8 @@ static inline void sched_init_granularity(void)
5787 sysctl_sched_latency = limit; 5917 sysctl_sched_latency = limit;
5788 5918
5789 sysctl_sched_wakeup_granularity *= factor; 5919 sysctl_sched_wakeup_granularity *= factor;
5920
5921 sysctl_sched_shares_ratelimit *= factor;
5790} 5922}
5791 5923
5792#ifdef CONFIG_SMP 5924#ifdef CONFIG_SMP
@@ -5876,7 +6008,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
5876 struct rq *rq_dest, *rq_src; 6008 struct rq *rq_dest, *rq_src;
5877 int ret = 0, on_rq; 6009 int ret = 0, on_rq;
5878 6010
5879 if (unlikely(cpu_is_offline(dest_cpu))) 6011 if (unlikely(!cpu_active(dest_cpu)))
5880 return ret; 6012 return ret;
5881 6013
5882 rq_src = cpu_rq(src_cpu); 6014 rq_src = cpu_rq(src_cpu);
@@ -5897,7 +6029,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
5897 set_task_cpu(p, dest_cpu); 6029 set_task_cpu(p, dest_cpu);
5898 if (on_rq) { 6030 if (on_rq) {
5899 activate_task(rq_dest, p, 0); 6031 activate_task(rq_dest, p, 0);
5900 check_preempt_curr(rq_dest, p); 6032 check_preempt_curr(rq_dest, p, 0);
5901 } 6033 }
5902done: 6034done:
5903 ret = 1; 6035 ret = 1;
@@ -6222,7 +6354,7 @@ set_table_entry(struct ctl_table *entry,
6222static struct ctl_table * 6354static struct ctl_table *
6223sd_alloc_ctl_domain_table(struct sched_domain *sd) 6355sd_alloc_ctl_domain_table(struct sched_domain *sd)
6224{ 6356{
6225 struct ctl_table *table = sd_alloc_ctl_entry(12); 6357 struct ctl_table *table = sd_alloc_ctl_entry(13);
6226 6358
6227 if (table == NULL) 6359 if (table == NULL)
6228 return NULL; 6360 return NULL;
@@ -6250,7 +6382,9 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd)
6250 sizeof(int), 0644, proc_dointvec_minmax); 6382 sizeof(int), 0644, proc_dointvec_minmax);
6251 set_table_entry(&table[10], "flags", &sd->flags, 6383 set_table_entry(&table[10], "flags", &sd->flags,
6252 sizeof(int), 0644, proc_dointvec_minmax); 6384 sizeof(int), 0644, proc_dointvec_minmax);
6253 /* &table[11] is terminator */ 6385 set_table_entry(&table[11], "name", sd->name,
6386 CORENAME_MAX_SIZE, 0444, proc_dostring);
6387 /* &table[12] is terminator */
6254 6388
6255 return table; 6389 return table;
6256} 6390}
@@ -6469,7 +6603,7 @@ static struct notifier_block __cpuinitdata migration_notifier = {
6469 .priority = 10 6603 .priority = 10
6470}; 6604};
6471 6605
6472void __init migration_init(void) 6606static int __init migration_init(void)
6473{ 6607{
6474 void *cpu = (void *)(long)smp_processor_id(); 6608 void *cpu = (void *)(long)smp_processor_id();
6475 int err; 6609 int err;
@@ -6479,7 +6613,10 @@ void __init migration_init(void)
6479 BUG_ON(err == NOTIFY_BAD); 6613 BUG_ON(err == NOTIFY_BAD);
6480 migration_call(&migration_notifier, CPU_ONLINE, cpu); 6614 migration_call(&migration_notifier, CPU_ONLINE, cpu);
6481 register_cpu_notifier(&migration_notifier); 6615 register_cpu_notifier(&migration_notifier);
6616
6617 return err;
6482} 6618}
6619early_initcall(migration_init);
6483#endif 6620#endif
6484 6621
6485#ifdef CONFIG_SMP 6622#ifdef CONFIG_SMP
@@ -6768,7 +6905,8 @@ static cpumask_t cpu_isolated_map = CPU_MASK_NONE;
6768/* Setup the mask of cpus configured for isolated domains */ 6905/* Setup the mask of cpus configured for isolated domains */
6769static int __init isolated_cpu_setup(char *str) 6906static int __init isolated_cpu_setup(char *str)
6770{ 6907{
6771 int ints[NR_CPUS], i; 6908 static int __initdata ints[NR_CPUS];
6909 int i;
6772 6910
6773 str = get_options(str, ARRAY_SIZE(ints), ints); 6911 str = get_options(str, ARRAY_SIZE(ints), ints);
6774 cpus_clear(cpu_isolated_map); 6912 cpus_clear(cpu_isolated_map);
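
The parser's scratch array is NR_CPUS entries wide, so on large NR_CPUS configurations an automatic int ints[NR_CPUS] eats a sizable chunk of the small kernel stack; making it static (and __initdata, so it is discarded after boot) moves it into image data instead. A rough userspace sketch of the same stack-to-static move (the 4096 bound and the parse_cpu_list name are invented for illustration; __initdata has no userspace equivalent and is omitted):

#include <stdio.h>
#include <stdlib.h>

#define MAX_CPUS 4096	/* stand-in for a large NR_CPUS build */

/* was: int ints[MAX_CPUS]; -- roughly 16 KB of stack per call.
 * The static buffer makes this non-reentrant, which is fine for a
 * boot-time option parser. */
static int parse_cpu_list(const char *str)
{
	static int ints[MAX_CPUS];	/* lives in .bss, not on the stack */
	int n = 0;
	char *end;

	while (*str && n < MAX_CPUS) {
		ints[n] = (int)strtol(str, &end, 10);
		if (end == str)
			break;
		n++;
		str = (*end == ',') ? end + 1 : end;
	}
	return n;
}

int main(void)
{
	printf("parsed %d cpus\n", parse_cpu_list("1,2,3,8"));
	return 0;
}
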
@@ -6802,7 +6940,7 @@ init_sched_build_groups(const cpumask_t *span, const cpumask_t *cpu_map,
6802 6940
6803 cpus_clear(*covered); 6941 cpus_clear(*covered);
6804 6942
6805 for_each_cpu_mask(i, *span) { 6943 for_each_cpu_mask_nr(i, *span) {
6806 struct sched_group *sg; 6944 struct sched_group *sg;
6807 int group = group_fn(i, cpu_map, &sg, tmpmask); 6945 int group = group_fn(i, cpu_map, &sg, tmpmask);
6808 int j; 6946 int j;
@@ -6813,7 +6951,7 @@ init_sched_build_groups(const cpumask_t *span, const cpumask_t *cpu_map,
6813 cpus_clear(sg->cpumask); 6951 cpus_clear(sg->cpumask);
6814 sg->__cpu_power = 0; 6952 sg->__cpu_power = 0;
6815 6953
6816 for_each_cpu_mask(j, *span) { 6954 for_each_cpu_mask_nr(j, *span) {
6817 if (group_fn(j, cpu_map, NULL, tmpmask) != group) 6955 if (group_fn(j, cpu_map, NULL, tmpmask) != group)
6818 continue; 6956 continue;
6819 6957
@@ -7013,7 +7151,7 @@ static void init_numa_sched_groups_power(struct sched_group *group_head)
7013 if (!sg) 7151 if (!sg)
7014 return; 7152 return;
7015 do { 7153 do {
7016 for_each_cpu_mask(j, sg->cpumask) { 7154 for_each_cpu_mask_nr(j, sg->cpumask) {
7017 struct sched_domain *sd; 7155 struct sched_domain *sd;
7018 7156
7019 sd = &per_cpu(phys_domains, j); 7157 sd = &per_cpu(phys_domains, j);
@@ -7038,7 +7176,7 @@ static void free_sched_groups(const cpumask_t *cpu_map, cpumask_t *nodemask)
7038{ 7176{
7039 int cpu, i; 7177 int cpu, i;
7040 7178
7041 for_each_cpu_mask(cpu, *cpu_map) { 7179 for_each_cpu_mask_nr(cpu, *cpu_map) {
7042 struct sched_group **sched_group_nodes 7180 struct sched_group **sched_group_nodes
7043 = sched_group_nodes_bycpu[cpu]; 7181 = sched_group_nodes_bycpu[cpu];
7044 7182
@@ -7130,13 +7268,21 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
7130 * Non-inlined to reduce accumulated stack pressure in build_sched_domains() 7268 * Non-inlined to reduce accumulated stack pressure in build_sched_domains()
7131 */ 7269 */
7132 7270
7271#ifdef CONFIG_SCHED_DEBUG
7272# define SD_INIT_NAME(sd, type) sd->name = #type
7273#else
7274# define SD_INIT_NAME(sd, type) do { } while (0)
7275#endif
7276
7133#define SD_INIT(sd, type) sd_init_##type(sd) 7277#define SD_INIT(sd, type) sd_init_##type(sd)
7278
7134#define SD_INIT_FUNC(type) \ 7279#define SD_INIT_FUNC(type) \
7135static noinline void sd_init_##type(struct sched_domain *sd) \ 7280static noinline void sd_init_##type(struct sched_domain *sd) \
7136{ \ 7281{ \
7137 memset(sd, 0, sizeof(*sd)); \ 7282 memset(sd, 0, sizeof(*sd)); \
7138 *sd = SD_##type##_INIT; \ 7283 *sd = SD_##type##_INIT; \
7139 sd->level = SD_LV_##type; \ 7284 sd->level = SD_LV_##type; \
7285 SD_INIT_NAME(sd, type); \
7140} 7286}
7141 7287
7142SD_INIT_FUNC(CPU) 7288SD_INIT_FUNC(CPU)
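
SD_INIT_NAME relies on the preprocessor's stringizing operator: #type expands the macro argument into a string literal, so SD_INIT_NAME(sd, CPU) stores "CPU" in sd->name at build time, and the whole thing compiles away when CONFIG_SCHED_DEBUG is off. A tiny standalone example of the same idiom (struct and macro names are simplified, not the scheduler's):

#include <stdio.h>

struct domain {
	const char *name;
	int level;
};

#ifdef DEBUG_NAMES
# define SET_NAME(d, type)	((d)->name = #type)	/* #type -> "type" */
#else
# define SET_NAME(d, type)	do { } while (0)
#endif

int main(void)
{
	struct domain d = { .name = "(none)", .level = 0 };

	SET_NAME(&d, CPU);	/* with -DDEBUG_NAMES this stores "CPU" */
	printf("domain name: %s\n", d.name);
	return 0;
}
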
@@ -7277,7 +7423,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7277 /* 7423 /*
7278 * Set up domains for cpus specified by the cpu_map. 7424 * Set up domains for cpus specified by the cpu_map.
7279 */ 7425 */
7280 for_each_cpu_mask(i, *cpu_map) { 7426 for_each_cpu_mask_nr(i, *cpu_map) {
7281 struct sched_domain *sd = NULL, *p; 7427 struct sched_domain *sd = NULL, *p;
7282 SCHED_CPUMASK_VAR(nodemask, allmasks); 7428 SCHED_CPUMASK_VAR(nodemask, allmasks);
7283 7429
@@ -7344,7 +7490,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7344 7490
7345#ifdef CONFIG_SCHED_SMT 7491#ifdef CONFIG_SCHED_SMT
7346 /* Set up CPU (sibling) groups */ 7492 /* Set up CPU (sibling) groups */
7347 for_each_cpu_mask(i, *cpu_map) { 7493 for_each_cpu_mask_nr(i, *cpu_map) {
7348 SCHED_CPUMASK_VAR(this_sibling_map, allmasks); 7494 SCHED_CPUMASK_VAR(this_sibling_map, allmasks);
7349 SCHED_CPUMASK_VAR(send_covered, allmasks); 7495 SCHED_CPUMASK_VAR(send_covered, allmasks);
7350 7496
@@ -7361,7 +7507,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7361 7507
7362#ifdef CONFIG_SCHED_MC 7508#ifdef CONFIG_SCHED_MC
7363 /* Set up multi-core groups */ 7509 /* Set up multi-core groups */
7364 for_each_cpu_mask(i, *cpu_map) { 7510 for_each_cpu_mask_nr(i, *cpu_map) {
7365 SCHED_CPUMASK_VAR(this_core_map, allmasks); 7511 SCHED_CPUMASK_VAR(this_core_map, allmasks);
7366 SCHED_CPUMASK_VAR(send_covered, allmasks); 7512 SCHED_CPUMASK_VAR(send_covered, allmasks);
7367 7513
@@ -7428,7 +7574,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7428 goto error; 7574 goto error;
7429 } 7575 }
7430 sched_group_nodes[i] = sg; 7576 sched_group_nodes[i] = sg;
7431 for_each_cpu_mask(j, *nodemask) { 7577 for_each_cpu_mask_nr(j, *nodemask) {
7432 struct sched_domain *sd; 7578 struct sched_domain *sd;
7433 7579
7434 sd = &per_cpu(node_domains, j); 7580 sd = &per_cpu(node_domains, j);
@@ -7474,21 +7620,21 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7474 7620
7475 /* Calculate CPU power for physical packages and nodes */ 7621 /* Calculate CPU power for physical packages and nodes */
7476#ifdef CONFIG_SCHED_SMT 7622#ifdef CONFIG_SCHED_SMT
7477 for_each_cpu_mask(i, *cpu_map) { 7623 for_each_cpu_mask_nr(i, *cpu_map) {
7478 struct sched_domain *sd = &per_cpu(cpu_domains, i); 7624 struct sched_domain *sd = &per_cpu(cpu_domains, i);
7479 7625
7480 init_sched_groups_power(i, sd); 7626 init_sched_groups_power(i, sd);
7481 } 7627 }
7482#endif 7628#endif
7483#ifdef CONFIG_SCHED_MC 7629#ifdef CONFIG_SCHED_MC
7484 for_each_cpu_mask(i, *cpu_map) { 7630 for_each_cpu_mask_nr(i, *cpu_map) {
7485 struct sched_domain *sd = &per_cpu(core_domains, i); 7631 struct sched_domain *sd = &per_cpu(core_domains, i);
7486 7632
7487 init_sched_groups_power(i, sd); 7633 init_sched_groups_power(i, sd);
7488 } 7634 }
7489#endif 7635#endif
7490 7636
7491 for_each_cpu_mask(i, *cpu_map) { 7637 for_each_cpu_mask_nr(i, *cpu_map) {
7492 struct sched_domain *sd = &per_cpu(phys_domains, i); 7638 struct sched_domain *sd = &per_cpu(phys_domains, i);
7493 7639
7494 init_sched_groups_power(i, sd); 7640 init_sched_groups_power(i, sd);
@@ -7508,7 +7654,7 @@ static int __build_sched_domains(const cpumask_t *cpu_map,
7508#endif 7654#endif
7509 7655
7510 /* Attach the domains */ 7656 /* Attach the domains */
7511 for_each_cpu_mask(i, *cpu_map) { 7657 for_each_cpu_mask_nr(i, *cpu_map) {
7512 struct sched_domain *sd; 7658 struct sched_domain *sd;
7513#ifdef CONFIG_SCHED_SMT 7659#ifdef CONFIG_SCHED_SMT
7514 sd = &per_cpu(cpu_domains, i); 7660 sd = &per_cpu(cpu_domains, i);
@@ -7553,18 +7699,6 @@ void __attribute__((weak)) arch_update_cpu_topology(void)
7553} 7699}
7554 7700
7555/* 7701/*
7556 * Free current domain masks.
7557 * Called after all cpus are attached to NULL domain.
7558 */
7559static void free_sched_domains(void)
7560{
7561 ndoms_cur = 0;
7562 if (doms_cur != &fallback_doms)
7563 kfree(doms_cur);
7564 doms_cur = &fallback_doms;
7565}
7566
7567/*
7568 * Set up scheduler domains and groups. Callers must hold the hotplug lock. 7702 * Set up scheduler domains and groups. Callers must hold the hotplug lock.
7569 * For now this just excludes isolated cpus, but could be used to 7703 * For now this just excludes isolated cpus, but could be used to
7570 * exclude other special cases in the future. 7704 * exclude other special cases in the future.
@@ -7603,7 +7737,7 @@ static void detach_destroy_domains(const cpumask_t *cpu_map)
7603 7737
7604 unregister_sched_domain_sysctl(); 7738 unregister_sched_domain_sysctl();
7605 7739
7606 for_each_cpu_mask(i, *cpu_map) 7740 for_each_cpu_mask_nr(i, *cpu_map)
7607 cpu_attach_domain(NULL, &def_root_domain, i); 7741 cpu_attach_domain(NULL, &def_root_domain, i);
7608 synchronize_sched(); 7742 synchronize_sched();
7609 arch_destroy_sched_domains(cpu_map, &tmpmask); 7743 arch_destroy_sched_domains(cpu_map, &tmpmask);
@@ -7642,30 +7776,29 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
7642 * ownership of it and will kfree it when done with it. If the caller 7776 * ownership of it and will kfree it when done with it. If the caller
7643 * failed the kmalloc call, then it can pass in doms_new == NULL, 7777 * failed the kmalloc call, then it can pass in doms_new == NULL,
7644 * and partition_sched_domains() will fallback to the single partition 7778 * and partition_sched_domains() will fallback to the single partition
7645 * 'fallback_doms'. 7779 * 'fallback_doms'; it also forces the domains to be rebuilt.
7780 *
7781 * If doms_new==NULL it will be replaced with cpu_online_map.
7782 * ndoms_new==0 is a special case for destroying existing domains.
7783 * It will not create the default domain.
7646 * 7784 *
7647 * Call with hotplug lock held 7785 * Call with hotplug lock held
7648 */ 7786 */
7649void partition_sched_domains(int ndoms_new, cpumask_t *doms_new, 7787void partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
7650 struct sched_domain_attr *dattr_new) 7788 struct sched_domain_attr *dattr_new)
7651{ 7789{
7652 int i, j; 7790 int i, j, n;
7653 7791
7654 mutex_lock(&sched_domains_mutex); 7792 mutex_lock(&sched_domains_mutex);
7655 7793
7656 /* always unregister in case we don't destroy any domains */ 7794 /* always unregister in case we don't destroy any domains */
7657 unregister_sched_domain_sysctl(); 7795 unregister_sched_domain_sysctl();
7658 7796
7659 if (doms_new == NULL) { 7797 n = doms_new ? ndoms_new : 0;
7660 ndoms_new = 1;
7661 doms_new = &fallback_doms;
7662 cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map);
7663 dattr_new = NULL;
7664 }
7665 7798
7666 /* Destroy deleted domains */ 7799 /* Destroy deleted domains */
7667 for (i = 0; i < ndoms_cur; i++) { 7800 for (i = 0; i < ndoms_cur; i++) {
7668 for (j = 0; j < ndoms_new; j++) { 7801 for (j = 0; j < n; j++) {
7669 if (cpus_equal(doms_cur[i], doms_new[j]) 7802 if (cpus_equal(doms_cur[i], doms_new[j])
7670 && dattrs_equal(dattr_cur, i, dattr_new, j)) 7803 && dattrs_equal(dattr_cur, i, dattr_new, j))
7671 goto match1; 7804 goto match1;
@@ -7676,6 +7809,13 @@ match1:
7676 ; 7809 ;
7677 } 7810 }
7678 7811
7812 if (doms_new == NULL) {
7813 ndoms_cur = 0;
7814 doms_new = &fallback_doms;
7815 cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map);
7816 dattr_new = NULL;
7817 }
7818
7679 /* Build new domains */ 7819 /* Build new domains */
7680 for (i = 0; i < ndoms_new; i++) { 7820 for (i = 0; i < ndoms_new; i++) {
7681 for (j = 0; j < ndoms_cur; j++) { 7821 for (j = 0; j < ndoms_cur; j++) {
@@ -7706,17 +7846,15 @@ match2:
7706#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) 7846#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
7707int arch_reinit_sched_domains(void) 7847int arch_reinit_sched_domains(void)
7708{ 7848{
7709 int err;
7710
7711 get_online_cpus(); 7849 get_online_cpus();
7712 mutex_lock(&sched_domains_mutex); 7850
7713 detach_destroy_domains(&cpu_online_map); 7851 /* Destroy domains first to force the rebuild */
7714 free_sched_domains(); 7852 partition_sched_domains(0, NULL, NULL);
7715 err = arch_init_sched_domains(&cpu_online_map); 7853
7716 mutex_unlock(&sched_domains_mutex); 7854 rebuild_sched_domains();
7717 put_online_cpus(); 7855 put_online_cpus();
7718 7856
7719 return err; 7857 return 0;
7720} 7858}
7721 7859
7722static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) 7860static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
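
Taken together, these hunks change the contract of partition_sched_domains(): the caller's new set is diffed against the current one, domains missing from the new set are destroyed, domains missing from the old set are built, a NULL doms_new now falls back to a single domain covering the non-isolated online CPUs, and ndoms_new == 0 means destroy everything and build nothing, which is what the slimmed-down arch_reinit_sched_domains() uses before calling rebuild_sched_domains(). A rough sketch of that destroy-then-build matching, with plain ints standing in for cpumasks (the fallback-domain substitution and the bookkeeping of doms_cur are left out):

#include <stdio.h>

/* ints stand in for cpumask_t: equal values mean the same domain span */
static void repartition(const int *doms_cur, int ndoms_cur,
			const int *doms_new, int ndoms_new)
{
	int i, j;
	int n = doms_new ? ndoms_new : 0;	/* NULL set: keep nothing */

	/* destroy current domains with no match in the new set */
	for (i = 0; i < ndoms_cur; i++) {
		for (j = 0; j < n; j++)
			if (doms_cur[i] == doms_new[j])
				goto match1;
		printf("destroy domain %d\n", doms_cur[i]);
match1:
		;
	}

	/* build new domains with no match in the current set */
	for (i = 0; i < ndoms_new; i++) {
		for (j = 0; j < ndoms_cur; j++)
			if (doms_new[i] == doms_cur[j])
				goto match2;
		printf("build domain %d\n", doms_new[i]);
match2:
		;
	}
}

int main(void)
{
	int cur[] = { 1, 2, 3 };
	int new_set[] = { 2, 4 };

	repartition(cur, 3, new_set, 2);	/* destroys 1 and 3, builds 4 */
	repartition(cur, 3, NULL, 0);		/* ndoms_new == 0: destroy only */
	return 0;
}
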
@@ -7737,30 +7875,34 @@ static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
7737} 7875}
7738 7876
7739#ifdef CONFIG_SCHED_MC 7877#ifdef CONFIG_SCHED_MC
7740static ssize_t sched_mc_power_savings_show(struct sys_device *dev, char *page) 7878static ssize_t sched_mc_power_savings_show(struct sysdev_class *class,
7879 char *page)
7741{ 7880{
7742 return sprintf(page, "%u\n", sched_mc_power_savings); 7881 return sprintf(page, "%u\n", sched_mc_power_savings);
7743} 7882}
7744static ssize_t sched_mc_power_savings_store(struct sys_device *dev, 7883static ssize_t sched_mc_power_savings_store(struct sysdev_class *class,
7745 const char *buf, size_t count) 7884 const char *buf, size_t count)
7746{ 7885{
7747 return sched_power_savings_store(buf, count, 0); 7886 return sched_power_savings_store(buf, count, 0);
7748} 7887}
7749static SYSDEV_ATTR(sched_mc_power_savings, 0644, sched_mc_power_savings_show, 7888static SYSDEV_CLASS_ATTR(sched_mc_power_savings, 0644,
7750 sched_mc_power_savings_store); 7889 sched_mc_power_savings_show,
7890 sched_mc_power_savings_store);
7751#endif 7891#endif
7752 7892
7753#ifdef CONFIG_SCHED_SMT 7893#ifdef CONFIG_SCHED_SMT
7754static ssize_t sched_smt_power_savings_show(struct sys_device *dev, char *page) 7894static ssize_t sched_smt_power_savings_show(struct sysdev_class *dev,
7895 char *page)
7755{ 7896{
7756 return sprintf(page, "%u\n", sched_smt_power_savings); 7897 return sprintf(page, "%u\n", sched_smt_power_savings);
7757} 7898}
7758static ssize_t sched_smt_power_savings_store(struct sys_device *dev, 7899static ssize_t sched_smt_power_savings_store(struct sysdev_class *dev,
7759 const char *buf, size_t count) 7900 const char *buf, size_t count)
7760{ 7901{
7761 return sched_power_savings_store(buf, count, 1); 7902 return sched_power_savings_store(buf, count, 1);
7762} 7903}
7763static SYSDEV_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_show, 7904static SYSDEV_CLASS_ATTR(sched_smt_power_savings, 0644,
7905 sched_smt_power_savings_show,
7764 sched_smt_power_savings_store); 7906 sched_smt_power_savings_store);
7765#endif 7907#endif
7766 7908
@@ -7782,59 +7924,49 @@ int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls)
7782} 7924}
7783#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */ 7925#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
7784 7926
7927#ifndef CONFIG_CPUSETS
7785/* 7928/*
7786 * Force a reinitialization of the sched domains hierarchy. The domains 7929 * Add online and remove offline CPUs from the scheduler domains.
7787 * and groups cannot be updated in place without racing with the balancing 7930 * When cpusets are enabled they take over this function.
7788 * code, so we temporarily attach all running cpus to the NULL domain
7789 * which will prevent rebalancing while the sched domains are recalculated.
7790 */ 7931 */
7791static int update_sched_domains(struct notifier_block *nfb, 7932static int update_sched_domains(struct notifier_block *nfb,
7792 unsigned long action, void *hcpu) 7933 unsigned long action, void *hcpu)
7793{ 7934{
7935 switch (action) {
7936 case CPU_ONLINE:
7937 case CPU_ONLINE_FROZEN:
7938 case CPU_DEAD:
7939 case CPU_DEAD_FROZEN:
7940 partition_sched_domains(1, NULL, NULL);
7941 return NOTIFY_OK;
7942
7943 default:
7944 return NOTIFY_DONE;
7945 }
7946}
7947#endif
7948
7949static int update_runtime(struct notifier_block *nfb,
7950 unsigned long action, void *hcpu)
7951{
7794 int cpu = (int)(long)hcpu; 7952 int cpu = (int)(long)hcpu;
7795 7953
7796 switch (action) { 7954 switch (action) {
7797 case CPU_DOWN_PREPARE: 7955 case CPU_DOWN_PREPARE:
7798 case CPU_DOWN_PREPARE_FROZEN: 7956 case CPU_DOWN_PREPARE_FROZEN:
7799 disable_runtime(cpu_rq(cpu)); 7957 disable_runtime(cpu_rq(cpu));
7800 /* fall-through */
7801 case CPU_UP_PREPARE:
7802 case CPU_UP_PREPARE_FROZEN:
7803 detach_destroy_domains(&cpu_online_map);
7804 free_sched_domains();
7805 return NOTIFY_OK; 7958 return NOTIFY_OK;
7806 7959
7807
7808 case CPU_DOWN_FAILED: 7960 case CPU_DOWN_FAILED:
7809 case CPU_DOWN_FAILED_FROZEN: 7961 case CPU_DOWN_FAILED_FROZEN:
7810 case CPU_ONLINE: 7962 case CPU_ONLINE:
7811 case CPU_ONLINE_FROZEN: 7963 case CPU_ONLINE_FROZEN:
7812 enable_runtime(cpu_rq(cpu)); 7964 enable_runtime(cpu_rq(cpu));
7813 /* fall-through */ 7965 return NOTIFY_OK;
7814 case CPU_UP_CANCELED: 7966
7815 case CPU_UP_CANCELED_FROZEN:
7816 case CPU_DEAD:
7817 case CPU_DEAD_FROZEN:
7818 /*
7819 * Fall through and re-initialise the domains.
7820 */
7821 break;
7822 default: 7967 default:
7823 return NOTIFY_DONE; 7968 return NOTIFY_DONE;
7824 } 7969 }
7825
7826#ifndef CONFIG_CPUSETS
7827 /*
7828 * Create default domain partitioning if cpusets are disabled.
7829 * Otherwise we let cpusets rebuild the domains based on the
7830 * current setup.
7831 */
7832
7833 /* The hotplug lock is already held by cpu_up/cpu_down */
7834 arch_init_sched_domains(&cpu_online_map);
7835#endif
7836
7837 return NOTIFY_OK;
7838} 7970}
7839 7971
7840void __init sched_init_smp(void) 7972void __init sched_init_smp(void)
@@ -7854,8 +7986,15 @@ void __init sched_init_smp(void)
7854 cpu_set(smp_processor_id(), non_isolated_cpus); 7986 cpu_set(smp_processor_id(), non_isolated_cpus);
7855 mutex_unlock(&sched_domains_mutex); 7987 mutex_unlock(&sched_domains_mutex);
7856 put_online_cpus(); 7988 put_online_cpus();
7989
7990#ifndef CONFIG_CPUSETS
7857 /* XXX: Theoretical race here - CPU may be hotplugged now */ 7991 /* XXX: Theoretical race here - CPU may be hotplugged now */
7858 hotcpu_notifier(update_sched_domains, 0); 7992 hotcpu_notifier(update_sched_domains, 0);
7993#endif
7994
7995 /* RT runtime code needs to handle some hotplug events */
7996 hotcpu_notifier(update_runtime, 0);
7997
7859 init_hrtick(); 7998 init_hrtick();
7860 7999
7861 /* Move init over to a non-isolated CPU */ 8000 /* Move init over to a non-isolated CPU */
@@ -8063,7 +8202,6 @@ void __init sched_init(void)
8063 8202
8064 rq = cpu_rq(i); 8203 rq = cpu_rq(i);
8065 spin_lock_init(&rq->lock); 8204 spin_lock_init(&rq->lock);
8066 lockdep_set_class(&rq->lock, &rq->rq_lock_key);
8067 rq->nr_running = 0; 8205 rq->nr_running = 0;
8068 init_cfs_rq(&rq->cfs, rq); 8206 init_cfs_rq(&rq->cfs, rq);
8069 init_rt_rq(&rq->rt, rq); 8207 init_rt_rq(&rq->rt, rq);
@@ -8186,20 +8324,25 @@ void __might_sleep(char *file, int line)
8186#ifdef in_atomic 8324#ifdef in_atomic
8187 static unsigned long prev_jiffy; /* ratelimiting */ 8325 static unsigned long prev_jiffy; /* ratelimiting */
8188 8326
8189 if ((in_atomic() || irqs_disabled()) && 8327 if ((!in_atomic() && !irqs_disabled()) ||
8190 system_state == SYSTEM_RUNNING && !oops_in_progress) { 8328 system_state != SYSTEM_RUNNING || oops_in_progress)
8191 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) 8329 return;
8192 return; 8330 if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
8193 prev_jiffy = jiffies; 8331 return;
8194 printk(KERN_ERR "BUG: sleeping function called from invalid" 8332 prev_jiffy = jiffies;
8195 " context at %s:%d\n", file, line); 8333
8196 printk("in_atomic():%d, irqs_disabled():%d\n", 8334 printk(KERN_ERR
8197 in_atomic(), irqs_disabled()); 8335 "BUG: sleeping function called from invalid context at %s:%d\n",
8198 debug_show_held_locks(current); 8336 file, line);
8199 if (irqs_disabled()) 8337 printk(KERN_ERR
8200 print_irqtrace_events(current); 8338 "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
8201 dump_stack(); 8339 in_atomic(), irqs_disabled(),
8202 } 8340 current->pid, current->comm);
8341
8342 debug_show_held_locks(current);
8343 if (irqs_disabled())
8344 print_irqtrace_events(current);
8345 dump_stack();
8203#endif 8346#endif
8204} 8347}
8205EXPORT_SYMBOL(__might_sleep); 8348EXPORT_SYMBOL(__might_sleep);
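
The rewritten check keeps the old once-per-second ratelimit while switching to early returns: prev_jiffy records the last warning, time_before() suppresses repeats within HZ ticks, and the "&& prev_jiffy" guard lets the very first warning through. The same pattern in standalone form (the HZ value and the driving loop are illustrative):

#include <stdio.h>

typedef unsigned long jiffies_t;

#define HZ 250
/* wrap-safe "a is before b", same trick as the kernel's time_before() */
#define time_before(a, b)	((long)((a) - (b)) < 0)

static void warn_ratelimited(jiffies_t now)
{
	static jiffies_t prev;	/* 0 means "never warned yet" */

	if (time_before(now, prev + HZ) && prev)
		return;		/* already warned within the last second */
	prev = now;
	printf("warning emitted at jiffy %lu\n", now);
}

int main(void)
{
	/* fires at 1000, 1250 and 1500: once per HZ ticks */
	for (jiffies_t j = 1000; j < 1000 + 3 * HZ; j += 50)
		warn_ratelimited(j);
	return 0;
}
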
@@ -8520,8 +8663,8 @@ struct task_group *sched_create_group(struct task_group *parent)
8520 WARN_ON(!parent); /* root should already exist */ 8663 WARN_ON(!parent); /* root should already exist */
8521 8664
8522 tg->parent = parent; 8665 tg->parent = parent;
8523 list_add_rcu(&tg->siblings, &parent->children);
8524 INIT_LIST_HEAD(&tg->children); 8666 INIT_LIST_HEAD(&tg->children);
8667 list_add_rcu(&tg->siblings, &parent->children);
8525 spin_unlock_irqrestore(&task_group_lock, flags); 8668 spin_unlock_irqrestore(&task_group_lock, flags);
8526 8669
8527 return tg; 8670 return tg;
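
The swap matters because list_add_rcu() publishes the new group on its parent's children list, after which concurrent readers may reach it; its own children list head therefore has to be initialized before publication, not after. The same initialize-then-publish discipline sketched with a C11 release store (a generic analogy only, not the kernel's RCU list API):

#include <stdatomic.h>
#include <stdlib.h>

struct group {
	struct group *parent;
	struct group *first_child;	/* stands in for the children list head */
};

static _Atomic(struct group *) published;	/* what readers see */

static struct group *create_group(struct group *parent)
{
	struct group *g = malloc(sizeof(*g));

	if (!g)
		return NULL;
	g->parent = parent;
	g->first_child = NULL;		/* initialize every field first... */

	/* ...then publish with release semantics, so a reader that
	 * acquires the pointer also sees the initialized fields */
	atomic_store_explicit(&published, g, memory_order_release);
	return g;
}

int main(void)
{
	create_group(NULL);
	return 0;
}
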
@@ -8697,73 +8840,95 @@ static DEFINE_MUTEX(rt_constraints_mutex);
8697static unsigned long to_ratio(u64 period, u64 runtime) 8840static unsigned long to_ratio(u64 period, u64 runtime)
8698{ 8841{
8699 if (runtime == RUNTIME_INF) 8842 if (runtime == RUNTIME_INF)
8700 return 1ULL << 16; 8843 return 1ULL << 20;
8701 8844
8702 return div64_u64(runtime << 16, period); 8845 return div64_u64(runtime << 20, period);
8703} 8846}
8704 8847
8705#ifdef CONFIG_CGROUP_SCHED 8848/* Must be called with tasklist_lock held */
8706static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime) 8849static inline int tg_has_rt_tasks(struct task_group *tg)
8707{ 8850{
8708 struct task_group *tgi, *parent = tg->parent; 8851 struct task_struct *g, *p;
8709 unsigned long total = 0;
8710 8852
8711 if (!parent) { 8853 do_each_thread(g, p) {
8712 if (global_rt_period() < period) 8854 if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg)
8713 return 0; 8855 return 1;
8856 } while_each_thread(g, p);
8714 8857
8715 return to_ratio(period, runtime) < 8858 return 0;
8716 to_ratio(global_rt_period(), global_rt_runtime()); 8859}
8717 }
8718 8860
8719 if (ktime_to_ns(parent->rt_bandwidth.rt_period) < period) 8861struct rt_schedulable_data {
8720 return 0; 8862 struct task_group *tg;
8863 u64 rt_period;
8864 u64 rt_runtime;
8865};
8721 8866
8722 rcu_read_lock(); 8867static int tg_schedulable(struct task_group *tg, void *data)
8723 list_for_each_entry_rcu(tgi, &parent->children, siblings) { 8868{
8724 if (tgi == tg) 8869 struct rt_schedulable_data *d = data;
8725 continue; 8870 struct task_group *child;
8871 unsigned long total, sum = 0;
8872 u64 period, runtime;
8726 8873
8727 total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period), 8874 period = ktime_to_ns(tg->rt_bandwidth.rt_period);
8728 tgi->rt_bandwidth.rt_runtime); 8875 runtime = tg->rt_bandwidth.rt_runtime;
8876
8877 if (tg == d->tg) {
8878 period = d->rt_period;
8879 runtime = d->rt_runtime;
8729 } 8880 }
8730 rcu_read_unlock();
8731 8881
8732 return total + to_ratio(period, runtime) <= 8882 /*
8733 to_ratio(ktime_to_ns(parent->rt_bandwidth.rt_period), 8883 * Cannot have more runtime than the period.
8734 parent->rt_bandwidth.rt_runtime); 8884 */
8735} 8885 if (runtime > period && runtime != RUNTIME_INF)
8736#elif defined CONFIG_USER_SCHED 8886 return -EINVAL;
8737static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
8738{
8739 struct task_group *tgi;
8740 unsigned long total = 0;
8741 unsigned long global_ratio =
8742 to_ratio(global_rt_period(), global_rt_runtime());
8743 8887
8744 rcu_read_lock(); 8888 /*
8745 list_for_each_entry_rcu(tgi, &task_groups, list) { 8889 * Ensure we don't starve existing RT tasks.
8746 if (tgi == tg) 8890 */
8747 continue; 8891 if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg))
8892 return -EBUSY;
8893
8894 total = to_ratio(period, runtime);
8748 8895
8749 total += to_ratio(ktime_to_ns(tgi->rt_bandwidth.rt_period), 8896 /*
8750 tgi->rt_bandwidth.rt_runtime); 8897 * Nobody can have more than the global setting allows.
8898 */
8899 if (total > to_ratio(global_rt_period(), global_rt_runtime()))
8900 return -EINVAL;
8901
8902 /*
8903 * The sum of our children's runtime should not exceed our own.
8904 */
8905 list_for_each_entry_rcu(child, &tg->children, siblings) {
8906 period = ktime_to_ns(child->rt_bandwidth.rt_period);
8907 runtime = child->rt_bandwidth.rt_runtime;
8908
8909 if (child == d->tg) {
8910 period = d->rt_period;
8911 runtime = d->rt_runtime;
8912 }
8913
8914 sum += to_ratio(period, runtime);
8751 } 8915 }
8752 rcu_read_unlock();
8753 8916
8754 return total + to_ratio(period, runtime) < global_ratio; 8917 if (sum > total)
8918 return -EINVAL;
8919
8920 return 0;
8755} 8921}
8756#endif
8757 8922
8758/* Must be called with tasklist_lock held */ 8923static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
8759static inline int tg_has_rt_tasks(struct task_group *tg)
8760{ 8924{
8761 struct task_struct *g, *p; 8925 struct rt_schedulable_data data = {
8762 do_each_thread(g, p) { 8926 .tg = tg,
8763 if (rt_task(p) && rt_rq_of_se(&p->rt)->tg == tg) 8927 .rt_period = period,
8764 return 1; 8928 .rt_runtime = runtime,
8765 } while_each_thread(g, p); 8929 };
8766 return 0; 8930
8931 return walk_tg_tree(tg_schedulable, tg_nop, &data);
8767} 8932}
8768 8933
8769static int tg_set_bandwidth(struct task_group *tg, 8934static int tg_set_bandwidth(struct task_group *tg,
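
to_ratio() now expresses runtime/period as a 20-bit fixed-point fraction (up from 16 bits), and tg_schedulable() uses it to enforce two invariants: a group's own ratio may not exceed the global one, and the sum of its children's ratios may not exceed the group's own. The arithmetic in standalone form (the 950 ms / 1 s global figure and the child splits are illustrative; RUNTIME_INF handling is omitted):

#include <stdio.h>
#include <stdint.h>

#define RATIO_SHIFT 20		/* bumped from 16 to 20 bits by this change */

/* fraction of a period that may be spent running, as Q20 fixed point */
static uint64_t to_ratio(uint64_t period_ns, uint64_t runtime_ns)
{
	return (runtime_ns << RATIO_SHIFT) / period_ns;
}

int main(void)
{
	/* global limit: 950 ms of RT runtime per 1 s period */
	uint64_t global = to_ratio(1000000000ULL, 950000000ULL);
	/* a group asking for 400 ms per 1 s */
	uint64_t group = to_ratio(1000000000ULL, 400000000ULL);
	/* two children asking for 150 ms and 300 ms per 1 s */
	uint64_t sum = to_ratio(1000000000ULL, 150000000ULL) +
		       to_ratio(1000000000ULL, 300000000ULL);

	printf("group <= global:    %s\n", group <= global ? "ok" : "-EINVAL");
	printf("children <= group:  %s\n", sum <= group ? "ok" : "-EINVAL");
	return 0;
}
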
@@ -8773,14 +8938,9 @@ static int tg_set_bandwidth(struct task_group *tg,
8773 8938
8774 mutex_lock(&rt_constraints_mutex); 8939 mutex_lock(&rt_constraints_mutex);
8775 read_lock(&tasklist_lock); 8940 read_lock(&tasklist_lock);
8776 if (rt_runtime == 0 && tg_has_rt_tasks(tg)) { 8941 err = __rt_schedulable(tg, rt_period, rt_runtime);
8777 err = -EBUSY; 8942 if (err)
8778 goto unlock; 8943 goto unlock;
8779 }
8780 if (!__rt_schedulable(tg, rt_period, rt_runtime)) {
8781 err = -EINVAL;
8782 goto unlock;
8783 }
8784 8944
8785 spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock); 8945 spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
8786 tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period); 8946 tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
@@ -8849,16 +9009,25 @@ long sched_group_rt_period(struct task_group *tg)
8849 9009
8850static int sched_rt_global_constraints(void) 9010static int sched_rt_global_constraints(void)
8851{ 9011{
8852 struct task_group *tg = &root_task_group; 9012 u64 runtime, period;
8853 u64 rt_runtime, rt_period;
8854 int ret = 0; 9013 int ret = 0;
8855 9014
8856 rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period); 9015 if (sysctl_sched_rt_period <= 0)
8857 rt_runtime = tg->rt_bandwidth.rt_runtime; 9016 return -EINVAL;
9017
9018 runtime = global_rt_runtime();
9019 period = global_rt_period();
9020
9021 /*
9022 * Sanity check on the sysctl variables.
9023 */
9024 if (runtime > period && runtime != RUNTIME_INF)
9025 return -EINVAL;
8858 9026
8859 mutex_lock(&rt_constraints_mutex); 9027 mutex_lock(&rt_constraints_mutex);
8860 if (!__rt_schedulable(tg, rt_period, rt_runtime)) 9028 read_lock(&tasklist_lock);
8861 ret = -EINVAL; 9029 ret = __rt_schedulable(NULL, 0, 0);
9030 read_unlock(&tasklist_lock);
8862 mutex_unlock(&rt_constraints_mutex); 9031 mutex_unlock(&rt_constraints_mutex);
8863 9032
8864 return ret; 9033 return ret;
@@ -8869,6 +9038,9 @@ static int sched_rt_global_constraints(void)
8869 unsigned long flags; 9038 unsigned long flags;
8870 int i; 9039 int i;
8871 9040
9041 if (sysctl_sched_rt_period <= 0)
9042 return -EINVAL;
9043
8872 spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); 9044 spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
8873 for_each_possible_cpu(i) { 9045 for_each_possible_cpu(i) {
8874 struct rt_rq *rt_rq = &cpu_rq(i)->rt; 9046 struct rt_rq *rt_rq = &cpu_rq(i)->rt;
@@ -8929,7 +9101,6 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
8929 9101
8930 if (!cgrp->parent) { 9102 if (!cgrp->parent) {
8931 /* This is early initialization for the top cgroup */ 9103 /* This is early initialization for the top cgroup */
8932 init_task_group.css.cgroup = cgrp;
8933 return &init_task_group.css; 9104 return &init_task_group.css;
8934 } 9105 }
8935 9106
@@ -8938,9 +9109,6 @@ cpu_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cgrp)
8938 if (IS_ERR(tg)) 9109 if (IS_ERR(tg))
8939 return ERR_PTR(-ENOMEM); 9110 return ERR_PTR(-ENOMEM);
8940 9111
8941 /* Bind the cgroup to task_group object we just created */
8942 tg->css.cgroup = cgrp;
8943
8944 return &tg->css; 9112 return &tg->css;
8945} 9113}
8946 9114
diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
index 22ed55d1167f..81787248b60f 100644
--- a/kernel/sched_clock.c
+++ b/kernel/sched_clock.c
@@ -12,19 +12,17 @@
12 * 12 *
13 * Create a semi stable clock from a mixture of other events, including: 13 * Create a semi stable clock from a mixture of other events, including:
14 * - gtod 14 * - gtod
15 * - jiffies
16 * - sched_clock() 15 * - sched_clock()
17 * - explicit idle events 16 * - explicit idle events
18 * 17 *
19 * We use gtod as base and the unstable clock deltas. The deltas are filtered, 18 * We use gtod as base and the unstable clock deltas. The deltas are filtered,
20 * making it monotonic and keeping it within an expected window. This window 19 * making it monotonic and keeping it within an expected window.
21 * is set up using jiffies.
22 * 20 *
23 * Furthermore, explicit sleep and wakeup hooks allow us to account for time 21 * Furthermore, explicit sleep and wakeup hooks allow us to account for time
24 * that is otherwise invisible (TSC gets stopped). 22 * that is otherwise invisible (TSC gets stopped).
25 * 23 *
26 * The clock: sched_clock_cpu() is monotonic per cpu, and should be somewhat 24 * The clock: sched_clock_cpu() is monotonic per cpu, and should be somewhat
27 * consistent between cpus (never more than 1 jiffies difference). 25 * consistent between cpus (never more than 2 jiffies difference).
28 */ 26 */
29#include <linux/sched.h> 27#include <linux/sched.h>
30#include <linux/percpu.h> 28#include <linux/percpu.h>
@@ -32,13 +30,19 @@
32#include <linux/ktime.h> 30#include <linux/ktime.h>
33#include <linux/module.h> 31#include <linux/module.h>
34 32
33/*
34 * Scheduler clock - returns current time in nanosec units.
35 * This is the default implementation.
36 * Architectures and sub-architectures can override this.
37 */
38unsigned long long __attribute__((weak)) sched_clock(void)
39{
40 return (unsigned long long)jiffies * (NSEC_PER_SEC / HZ);
41}
35 42
36#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK 43static __read_mostly int sched_clock_running;
37 44
38#define MULTI_SHIFT 15 45#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
39/* Max is double, Min is 1/2 */
40#define MAX_MULTI (2LL << MULTI_SHIFT)
41#define MIN_MULTI (1LL << (MULTI_SHIFT-1))
42 46
43struct sched_clock_data { 47struct sched_clock_data {
44 /* 48 /*
@@ -48,15 +52,9 @@ struct sched_clock_data {
48 */ 52 */
49 raw_spinlock_t lock; 53 raw_spinlock_t lock;
50 54
51 unsigned long tick_jiffies;
52 u64 prev_raw;
53 u64 tick_raw; 55 u64 tick_raw;
54 u64 tick_gtod; 56 u64 tick_gtod;
55 u64 clock; 57 u64 clock;
56 s64 multi;
57#ifdef CONFIG_NO_HZ
58 int check_max;
59#endif
60}; 58};
61 59
62static DEFINE_PER_CPU_SHARED_ALIGNED(struct sched_clock_data, sched_clock_data); 60static DEFINE_PER_CPU_SHARED_ALIGNED(struct sched_clock_data, sched_clock_data);
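
With the weak default moved into this file, a platform without its own sched_clock() gets a jiffies-based clock whose resolution is one tick, i.e. NSEC_PER_SEC / HZ nanoseconds. A quick check of what that granularity works out to for a few HZ values (chosen for illustration):

#include <stdio.h>

#define NSEC_PER_SEC 1000000000ULL

int main(void)
{
	unsigned long hz[] = { 100, 250, 1000 };

	for (int i = 0; i < 3; i++)
		printf("HZ=%-4lu -> sched_clock() advances in %llu ns steps\n",
		       hz[i], NSEC_PER_SEC / hz[i]);
	return 0;
}
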
@@ -71,121 +69,69 @@ static inline struct sched_clock_data *cpu_sdc(int cpu)
71 return &per_cpu(sched_clock_data, cpu); 69 return &per_cpu(sched_clock_data, cpu);
72} 70}
73 71
74static __read_mostly int sched_clock_running;
75
76void sched_clock_init(void) 72void sched_clock_init(void)
77{ 73{
78 u64 ktime_now = ktime_to_ns(ktime_get()); 74 u64 ktime_now = ktime_to_ns(ktime_get());
79 unsigned long now_jiffies = jiffies;
80 int cpu; 75 int cpu;
81 76
82 for_each_possible_cpu(cpu) { 77 for_each_possible_cpu(cpu) {
83 struct sched_clock_data *scd = cpu_sdc(cpu); 78 struct sched_clock_data *scd = cpu_sdc(cpu);
84 79
85 scd->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; 80 scd->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
86 scd->tick_jiffies = now_jiffies;
87 scd->prev_raw = 0;
88 scd->tick_raw = 0; 81 scd->tick_raw = 0;
89 scd->tick_gtod = ktime_now; 82 scd->tick_gtod = ktime_now;
90 scd->clock = ktime_now; 83 scd->clock = ktime_now;
91 scd->multi = 1 << MULTI_SHIFT;
92#ifdef CONFIG_NO_HZ
93 scd->check_max = 1;
94#endif
95 } 84 }
96 85
97 sched_clock_running = 1; 86 sched_clock_running = 1;
98} 87}
99 88
100#ifdef CONFIG_NO_HZ
101/* 89/*
102 * The dynamic ticks makes the delta jiffies inaccurate. This 90 * min,max except they take wrapping into account
103 * prevents us from checking the maximum time update.
104 * Disable the maximum check during stopped ticks.
105 */ 91 */
106void sched_clock_tick_stop(int cpu)
107{
108 struct sched_clock_data *scd = cpu_sdc(cpu);
109
110 scd->check_max = 0;
111}
112 92
113void sched_clock_tick_start(int cpu) 93static inline u64 wrap_min(u64 x, u64 y)
114{ 94{
115 struct sched_clock_data *scd = cpu_sdc(cpu); 95 return (s64)(x - y) < 0 ? x : y;
116
117 scd->check_max = 1;
118} 96}
119 97
120static int check_max(struct sched_clock_data *scd) 98static inline u64 wrap_max(u64 x, u64 y)
121{ 99{
122 return scd->check_max; 100 return (s64)(x - y) > 0 ? x : y;
123} 101}
124#else
125static int check_max(struct sched_clock_data *scd)
126{
127 return 1;
128}
129#endif /* CONFIG_NO_HZ */
130 102
131/* 103/*
132 * update the percpu scd from the raw @now value 104 * update the percpu scd from the raw @now value
133 * 105 *
134 * - filter out backward motion 106 * - filter out backward motion
135 * - use jiffies to generate a min,max window to clip the raw values 107 * - use the GTOD tick value to create a window to filter crazy TSC values
136 */ 108 */
137static void __update_sched_clock(struct sched_clock_data *scd, u64 now, u64 *time) 109static u64 __update_sched_clock(struct sched_clock_data *scd, u64 now)
138{ 110{
139 unsigned long now_jiffies = jiffies; 111 s64 delta = now - scd->tick_raw;
140 long delta_jiffies = now_jiffies - scd->tick_jiffies; 112 u64 clock, min_clock, max_clock;
141 u64 clock = scd->clock;
142 u64 min_clock, max_clock;
143 s64 delta = now - scd->prev_raw;
144 113
145 WARN_ON_ONCE(!irqs_disabled()); 114 WARN_ON_ONCE(!irqs_disabled());
146 115
147 /* 116 if (unlikely(delta < 0))
148 * At schedule tick the clock can be just under the gtod. We don't 117 delta = 0;
149 * want to push it too prematurely.
150 */
151 min_clock = scd->tick_gtod + (delta_jiffies * TICK_NSEC);
152 if (min_clock > TICK_NSEC)
153 min_clock -= TICK_NSEC / 2;
154
155 if (unlikely(delta < 0)) {
156 clock++;
157 goto out;
158 }
159 118
160 /* 119 /*
161 * The clock must stay within a jiffie of the gtod. 120 * scd->clock = clamp(scd->tick_gtod + delta,
162 * But since we may be at the start of a jiffy or the end of one 121 * max(scd->tick_gtod, scd->clock),
163 * we add another jiffy buffer. 122 * max(scd->clock, scd->tick_gtod + TICK_NSEC));
164 */ 123 */
165 max_clock = scd->tick_gtod + (2 + delta_jiffies) * TICK_NSEC;
166 124
167 delta *= scd->multi; 125 clock = scd->tick_gtod + delta;
168 delta >>= MULTI_SHIFT; 126 min_clock = wrap_max(scd->tick_gtod, scd->clock);
127 max_clock = wrap_max(scd->clock, scd->tick_gtod + TICK_NSEC);
169 128
170 if (unlikely(clock + delta > max_clock) && check_max(scd)) { 129 clock = wrap_max(clock, min_clock);
171 if (clock < max_clock) 130 clock = wrap_min(clock, max_clock);
172 clock = max_clock;
173 else
174 clock++;
175 } else {
176 clock += delta;
177 }
178 131
179 out: 132 scd->clock = clock;
180 if (unlikely(clock < min_clock))
181 clock = min_clock;
182 133
183 if (time) 134 return scd->clock;
184 *time = clock;
185 else {
186 scd->prev_raw = now;
187 scd->clock = clock;
188 }
189} 135}
190 136
191static void lock_double_clock(struct sched_clock_data *data1, 137static void lock_double_clock(struct sched_clock_data *data1,
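
The rewritten __update_sched_clock() above clamps the raw sched_clock() delta into the window [max(tick_gtod, clock), max(clock, tick_gtod + TICK_NSEC)], using min/max helpers built on signed differences so they stay correct if the u64 values ever wrap. A standalone sketch of that clamp (TICK_NSEC here assumes HZ=1000; the sample numbers are made up):

#include <stdio.h>
#include <stdint.h>

#define TICK_NSEC 1000000ULL	/* 1 ms tick, i.e. HZ=1000 */

/* min/max that stay correct across u64 wrap, as in the patch */
static inline uint64_t wrap_min(uint64_t x, uint64_t y)
{
	return (int64_t)(x - y) < 0 ? x : y;
}

static inline uint64_t wrap_max(uint64_t x, uint64_t y)
{
	return (int64_t)(x - y) > 0 ? x : y;
}

static uint64_t update_clock(uint64_t clock, uint64_t tick_gtod, uint64_t delta)
{
	uint64_t c = tick_gtod + delta;
	uint64_t min_clock = wrap_max(tick_gtod, clock);
	uint64_t max_clock = wrap_max(clock, tick_gtod + TICK_NSEC);

	c = wrap_max(c, min_clock);	/* never go backwards */
	c = wrap_min(c, max_clock);	/* never run ahead of gtod + 1 tick */
	return c;
}

int main(void)
{
	/* a wild 50 ms TSC delta gets clipped to one tick past gtod: 11000000 */
	printf("%llu\n", (unsigned long long)
	       update_clock(10000000ULL, 10000000ULL, 50000000ULL));
	return 0;
}
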
@@ -203,7 +149,7 @@ static void lock_double_clock(struct sched_clock_data *data1,
203u64 sched_clock_cpu(int cpu) 149u64 sched_clock_cpu(int cpu)
204{ 150{
205 struct sched_clock_data *scd = cpu_sdc(cpu); 151 struct sched_clock_data *scd = cpu_sdc(cpu);
206 u64 now, clock; 152 u64 now, clock, this_clock, remote_clock;
207 153
208 if (unlikely(!sched_clock_running)) 154 if (unlikely(!sched_clock_running))
209 return 0ull; 155 return 0ull;
@@ -212,43 +158,44 @@ u64 sched_clock_cpu(int cpu)
212 now = sched_clock(); 158 now = sched_clock();
213 159
214 if (cpu != raw_smp_processor_id()) { 160 if (cpu != raw_smp_processor_id()) {
215 /*
216 * in order to update a remote cpu's clock based on our
217 * unstable raw time rebase it against:
218 * tick_raw (offset between raw counters)
219 * tick_gtod (tick offset between cpus)
220 */
221 struct sched_clock_data *my_scd = this_scd(); 161 struct sched_clock_data *my_scd = this_scd();
222 162
223 lock_double_clock(scd, my_scd); 163 lock_double_clock(scd, my_scd);
224 164
225 now -= my_scd->tick_raw; 165 this_clock = __update_sched_clock(my_scd, now);
226 now += scd->tick_raw; 166 remote_clock = scd->clock;
227 167
228 now += my_scd->tick_gtod; 168 /*
229 now -= scd->tick_gtod; 169 * Use the opportunity that we have both locks
170 * taken to couple the two clocks: we take the
171 * larger time as the latest time for both
172 * runqueues. (this creates monotonic movement)
173 */
174 if (likely((s64)(remote_clock - this_clock) < 0)) {
175 clock = this_clock;
176 scd->clock = clock;
177 } else {
178 /*
179 * Should be rare, but possible:
180 */
181 clock = remote_clock;
182 my_scd->clock = remote_clock;
183 }
230 184
231 __raw_spin_unlock(&my_scd->lock); 185 __raw_spin_unlock(&my_scd->lock);
232
233 __update_sched_clock(scd, now, &clock);
234
235 __raw_spin_unlock(&scd->lock);
236
237 } else { 186 } else {
238 __raw_spin_lock(&scd->lock); 187 __raw_spin_lock(&scd->lock);
239 __update_sched_clock(scd, now, NULL); 188 clock = __update_sched_clock(scd, now);
240 clock = scd->clock;
241 __raw_spin_unlock(&scd->lock);
242 } 189 }
243 190
191 __raw_spin_unlock(&scd->lock);
192
244 return clock; 193 return clock;
245} 194}
246 195
247void sched_clock_tick(void) 196void sched_clock_tick(void)
248{ 197{
249 struct sched_clock_data *scd = this_scd(); 198 struct sched_clock_data *scd = this_scd();
250 unsigned long now_jiffies = jiffies;
251 s64 mult, delta_gtod, delta_raw;
252 u64 now, now_gtod; 199 u64 now, now_gtod;
253 200
254 if (unlikely(!sched_clock_running)) 201 if (unlikely(!sched_clock_running))
@@ -260,29 +207,9 @@ void sched_clock_tick(void)
260 now = sched_clock(); 207 now = sched_clock();
261 208
262 __raw_spin_lock(&scd->lock); 209 __raw_spin_lock(&scd->lock);
263 __update_sched_clock(scd, now, NULL);
264 /*
265 * update tick_gtod after __update_sched_clock() because that will
266 * already observe 1 new jiffy; adding a new tick_gtod to that would
267 * increase the clock 2 jiffies.
268 */
269 delta_gtod = now_gtod - scd->tick_gtod;
270 delta_raw = now - scd->tick_raw;
271
272 if ((long)delta_raw > 0) {
273 mult = delta_gtod << MULTI_SHIFT;
274 do_div(mult, delta_raw);
275 scd->multi = mult;
276 if (scd->multi > MAX_MULTI)
277 scd->multi = MAX_MULTI;
278 else if (scd->multi < MIN_MULTI)
279 scd->multi = MIN_MULTI;
280 } else
281 scd->multi = 1 << MULTI_SHIFT;
282
283 scd->tick_raw = now; 210 scd->tick_raw = now;
284 scd->tick_gtod = now_gtod; 211 scd->tick_gtod = now_gtod;
285 scd->tick_jiffies = now_jiffies; 212 __update_sched_clock(scd, now);
286 __raw_spin_unlock(&scd->lock); 213 __raw_spin_unlock(&scd->lock);
287} 214}
288 215
@@ -300,37 +227,28 @@ EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event);
300 */ 227 */
301void sched_clock_idle_wakeup_event(u64 delta_ns) 228void sched_clock_idle_wakeup_event(u64 delta_ns)
302{ 229{
303 struct sched_clock_data *scd = this_scd(); 230 sched_clock_tick();
304 u64 now = sched_clock();
305
306 /*
307 * Override the previous timestamp and ignore all
308 * sched_clock() deltas that occurred while we idled,
309 * and use the PM-provided delta_ns to advance the
310 * rq clock:
311 */
312 __raw_spin_lock(&scd->lock);
313 scd->prev_raw = now;
314 scd->clock += delta_ns;
315 scd->multi = 1 << MULTI_SHIFT;
316 __raw_spin_unlock(&scd->lock);
317
318 touch_softlockup_watchdog(); 231 touch_softlockup_watchdog();
319} 232}
320EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); 233EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
321 234
322#endif 235#else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
323 236
324/* 237void sched_clock_init(void)
325 * Scheduler clock - returns current time in nanosec units.
326 * This is the default implementation.
327 * Architectures and sub-architectures can override this.
328 */
329unsigned long long __attribute__((weak)) sched_clock(void)
330{ 238{
331 return (unsigned long long)jiffies * (NSEC_PER_SEC / HZ); 239 sched_clock_running = 1;
332} 240}
333 241
242u64 sched_clock_cpu(int cpu)
243{
244 if (unlikely(!sched_clock_running))
245 return 0;
246
247 return sched_clock();
248}
249
250#endif
251
334unsigned long long cpu_clock(int cpu) 252unsigned long long cpu_clock(int cpu)
335{ 253{
336 unsigned long long clock; 254 unsigned long long clock;
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index bbe6b31c3c56..ad958c1ec708 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -333,12 +333,10 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
333 unsigned long flags; 333 unsigned long flags;
334 int num_threads = 1; 334 int num_threads = 1;
335 335
336 rcu_read_lock();
337 if (lock_task_sighand(p, &flags)) { 336 if (lock_task_sighand(p, &flags)) {
338 num_threads = atomic_read(&p->signal->count); 337 num_threads = atomic_read(&p->signal->count);
339 unlock_task_sighand(p, &flags); 338 unlock_task_sighand(p, &flags);
340 } 339 }
341 rcu_read_unlock();
342 340
343 SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, p->pid, num_threads); 341 SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, p->pid, num_threads);
344 SEQ_printf(m, 342 SEQ_printf(m,
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index f2aa987027d6..9573c33688b8 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -73,6 +73,8 @@ unsigned int sysctl_sched_wakeup_granularity = 5000000UL;
73 73
74const_debug unsigned int sysctl_sched_migration_cost = 500000UL; 74const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
75 75
76static const struct sched_class fair_sched_class;
77
76/************************************************************** 78/**************************************************************
77 * CFS operations on generic schedulable entities: 79 * CFS operations on generic schedulable entities:
78 */ 80 */
@@ -334,7 +336,7 @@ int sched_nr_latency_handler(struct ctl_table *table, int write,
334#endif 336#endif
335 337
336/* 338/*
337 * delta *= w / rw 339 * delta *= P[w / rw]
338 */ 340 */
339static inline unsigned long 341static inline unsigned long
340calc_delta_weight(unsigned long delta, struct sched_entity *se) 342calc_delta_weight(unsigned long delta, struct sched_entity *se)
@@ -348,15 +350,13 @@ calc_delta_weight(unsigned long delta, struct sched_entity *se)
348} 350}
349 351
350/* 352/*
351 * delta *= rw / w 353 * delta /= w
352 */ 354 */
353static inline unsigned long 355static inline unsigned long
354calc_delta_fair(unsigned long delta, struct sched_entity *se) 356calc_delta_fair(unsigned long delta, struct sched_entity *se)
355{ 357{
356 for_each_sched_entity(se) { 358 if (unlikely(se->load.weight != NICE_0_LOAD))
357 delta = calc_delta_mine(delta, 359 delta = calc_delta_mine(delta, NICE_0_LOAD, &se->load);
358 cfs_rq_of(se)->load.weight, &se->load);
359 }
360 360
361 return delta; 361 return delta;
362} 362}
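
After this change calc_delta_fair() no longer walks the group hierarchy: it simply scales the delta by NICE_0_LOAD / weight, skipping the division for the common nice-0 case, and that factor is how much faster or slower a task's vruntime advances relative to wall time. A worked example with illustrative weights (calc_delta_mine()'s fixed-point inverse-weight machinery is reduced to a plain division here):

#include <stdio.h>

#define NICE_0_LOAD 1024UL

/* vruntime delta for a given wall-clock delta, as in calc_delta_fair() */
static unsigned long calc_delta_fair(unsigned long delta, unsigned long weight)
{
	if (weight != NICE_0_LOAD)
		delta = delta * NICE_0_LOAD / weight;	/* simplified calc_delta_mine() */
	return delta;
}

int main(void)
{
	unsigned long weights[] = { 2048, 1024, 512 };	/* illustrative */

	/* prints 500, 1000 and 2000: heavier tasks age their vruntime slower */
	for (int i = 0; i < 3; i++)
		printf("weight %4lu: 1 ms of runtime -> %lu us of vruntime\n",
		       weights[i], calc_delta_fair(1000, weights[i]));
	return 0;
}
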
@@ -386,84 +386,26 @@ static u64 __sched_period(unsigned long nr_running)
386 * We calculate the wall-time slice from the period by taking a part 386 * We calculate the wall-time slice from the period by taking a part
387 * proportional to the weight. 387 * proportional to the weight.
388 * 388 *
389 * s = p*w/rw 389 * s = p*P[w/rw]
390 */ 390 */
391static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) 391static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
392{ 392{
393 return calc_delta_weight(__sched_period(cfs_rq->nr_running), se);
394}
395
396/*
397 * We calculate the vruntime slice of a to be inserted task
398 *
399 * vs = s*rw/w = p
400 */
401static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se)
402{
403 unsigned long nr_running = cfs_rq->nr_running; 393 unsigned long nr_running = cfs_rq->nr_running;
404 394
405 if (!se->on_rq) 395 if (unlikely(!se->on_rq))
406 nr_running++; 396 nr_running++;
407 397
408 return __sched_period(nr_running); 398 return calc_delta_weight(__sched_period(nr_running), se);
409} 399}
410 400
411/* 401/*
412 * The goal of calc_delta_asym() is to be asymmetrically around NICE_0_LOAD, in 402 * We calculate the vruntime slice of a to be inserted task
413 * that it favours >=0 over <0.
414 *
415 * -20 |
416 * |
417 * 0 --------+-------
418 * .'
419 * 19 .'
420 * 403 *
404 * vs = s/w
421 */ 405 */
422static unsigned long 406static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
423calc_delta_asym(unsigned long delta, struct sched_entity *se)
424{ 407{
425 struct load_weight lw = { 408 return calc_delta_fair(sched_slice(cfs_rq, se), se);
426 .weight = NICE_0_LOAD,
427 .inv_weight = 1UL << (WMULT_SHIFT-NICE_0_SHIFT)
428 };
429
430 for_each_sched_entity(se) {
431 struct load_weight *se_lw = &se->load;
432 unsigned long rw = cfs_rq_of(se)->load.weight;
433
434#ifdef CONFIG_FAIR_SCHED_GROUP
435 struct cfs_rq *cfs_rq = se->my_q;
436 struct task_group *tg = NULL
437
438 if (cfs_rq)
439 tg = cfs_rq->tg;
440
441 if (tg && tg->shares < NICE_0_LOAD) {
442 /*
443 * scale shares to what it would have been had
444 * tg->weight been NICE_0_LOAD:
445 *
446 * weight = 1024 * shares / tg->weight
447 */
448 lw.weight *= se->load.weight;
449 lw.weight /= tg->shares;
450
451 lw.inv_weight = 0;
452
453 se_lw = &lw;
454 rw += lw.weight - se->load.weight;
455 } else
456#endif
457
458 if (se->load.weight < NICE_0_LOAD) {
459 se_lw = &lw;
460 rw += NICE_0_LOAD - se->load.weight;
461 }
462
463 delta = calc_delta_mine(delta, rw, se_lw);
464 }
465
466 return delta;
467} 409}
468 410
469/* 411/*
@@ -507,6 +449,7 @@ static void update_curr(struct cfs_rq *cfs_rq)
507 struct task_struct *curtask = task_of(curr); 449 struct task_struct *curtask = task_of(curr);
508 450
509 cpuacct_charge(curtask, delta_exec); 451 cpuacct_charge(curtask, delta_exec);
452 account_group_exec_runtime(curtask, delta_exec);
510 } 453 }
511} 454}
512 455
@@ -586,11 +529,12 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
586 update_load_add(&cfs_rq->load, se->load.weight); 529 update_load_add(&cfs_rq->load, se->load.weight);
587 if (!parent_entity(se)) 530 if (!parent_entity(se))
588 inc_cpu_load(rq_of(cfs_rq), se->load.weight); 531 inc_cpu_load(rq_of(cfs_rq), se->load.weight);
589 if (entity_is_task(se)) 532 if (entity_is_task(se)) {
590 add_cfs_task_weight(cfs_rq, se->load.weight); 533 add_cfs_task_weight(cfs_rq, se->load.weight);
534 list_add(&se->group_node, &cfs_rq->tasks);
535 }
591 cfs_rq->nr_running++; 536 cfs_rq->nr_running++;
592 se->on_rq = 1; 537 se->on_rq = 1;
593 list_add(&se->group_node, &cfs_rq->tasks);
594} 538}
595 539
596static void 540static void
@@ -599,11 +543,12 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
599 update_load_sub(&cfs_rq->load, se->load.weight); 543 update_load_sub(&cfs_rq->load, se->load.weight);
600 if (!parent_entity(se)) 544 if (!parent_entity(se))
601 dec_cpu_load(rq_of(cfs_rq), se->load.weight); 545 dec_cpu_load(rq_of(cfs_rq), se->load.weight);
602 if (entity_is_task(se)) 546 if (entity_is_task(se)) {
603 add_cfs_task_weight(cfs_rq, -se->load.weight); 547 add_cfs_task_weight(cfs_rq, -se->load.weight);
548 list_del_init(&se->group_node);
549 }
604 cfs_rq->nr_running--; 550 cfs_rq->nr_running--;
605 se->on_rq = 0; 551 se->on_rq = 0;
606 list_del_init(&se->group_node);
607} 552}
608 553
609static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) 554static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -683,7 +628,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
683 * stays open at the end. 628 * stays open at the end.
684 */ 629 */
685 if (initial && sched_feat(START_DEBIT)) 630 if (initial && sched_feat(START_DEBIT))
686 vruntime += sched_vslice_add(cfs_rq, se); 631 vruntime += sched_vslice(cfs_rq, se);
687 632
688 if (!initial) { 633 if (!initial) {
689 /* sleeps up to a single latency don't count. */ 634 /* sleeps up to a single latency don't count. */
@@ -803,7 +748,7 @@ pick_next(struct cfs_rq *cfs_rq, struct sched_entity *se)
803 struct rq *rq = rq_of(cfs_rq); 748 struct rq *rq = rq_of(cfs_rq);
804 u64 pair_slice = rq->clock - cfs_rq->pair_start; 749 u64 pair_slice = rq->clock - cfs_rq->pair_start;
805 750
806 if (!cfs_rq->next || pair_slice > sched_slice(cfs_rq, cfs_rq->next)) { 751 if (!cfs_rq->next || pair_slice > sysctl_sched_min_granularity) {
807 cfs_rq->pair_start = rq->clock; 752 cfs_rq->pair_start = rq->clock;
808 return se; 753 return se;
809 } 754 }
@@ -878,7 +823,6 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
878#ifdef CONFIG_SCHED_HRTICK 823#ifdef CONFIG_SCHED_HRTICK
879static void hrtick_start_fair(struct rq *rq, struct task_struct *p) 824static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
880{ 825{
881 int requeue = rq->curr == p;
882 struct sched_entity *se = &p->se; 826 struct sched_entity *se = &p->se;
883 struct cfs_rq *cfs_rq = cfs_rq_of(se); 827 struct cfs_rq *cfs_rq = cfs_rq_of(se);
884 828
@@ -899,17 +843,37 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
899 * Don't schedule slices shorter than 10000ns, that just 843 * Don't schedule slices shorter than 10000ns, that just
900 * doesn't make sense. Rely on vruntime for fairness. 844 * doesn't make sense. Rely on vruntime for fairness.
901 */ 845 */
902 if (!requeue) 846 if (rq->curr != p)
903 delta = max(10000LL, delta); 847 delta = max_t(s64, 10000LL, delta);
904 848
905 hrtick_start(rq, delta, requeue); 849 hrtick_start(rq, delta);
906 } 850 }
907} 851}
852
853/*
854 * called from enqueue/dequeue and updates the hrtick when the
855 * current task is from our class and nr_running is low enough
856 * to matter.
857 */
858static void hrtick_update(struct rq *rq)
859{
860 struct task_struct *curr = rq->curr;
861
862 if (curr->sched_class != &fair_sched_class)
863 return;
864
865 if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)
866 hrtick_start_fair(rq, curr);
867}
908#else /* !CONFIG_SCHED_HRTICK */ 868#else /* !CONFIG_SCHED_HRTICK */
909static inline void 869static inline void
910hrtick_start_fair(struct rq *rq, struct task_struct *p) 870hrtick_start_fair(struct rq *rq, struct task_struct *p)
911{ 871{
912} 872}
873
874static inline void hrtick_update(struct rq *rq)
875{
876}
913#endif 877#endif
914 878
915/* 879/*
@@ -930,7 +894,7 @@ static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup)
930 wakeup = 1; 894 wakeup = 1;
931 } 895 }
932 896
933 hrtick_start_fair(rq, rq->curr); 897 hrtick_update(rq);
934} 898}
935 899
936/* 900/*
@@ -952,7 +916,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep)
952 sleep = 1; 916 sleep = 1;
953 } 917 }
954 918
955 hrtick_start_fair(rq, rq->curr); 919 hrtick_update(rq);
956} 920}
957 921
958/* 922/*
@@ -1004,6 +968,8 @@ static void yield_task_fair(struct rq *rq)
1004 * not idle and an idle cpu is available. The span of cpus to 968 * not idle and an idle cpu is available. The span of cpus to
1005 * search starts with cpus closest then further out as needed, 969 * search starts with cpus closest then further out as needed,
1006 * so we always favor a closer, idle cpu. 970 * so we always favor a closer, idle cpu.
971 * Domains may include CPUs that are not usable for migration,
972 * hence we need to mask them out (cpu_active_map)
1007 * 973 *
1008 * Returns the CPU we should wake onto. 974 * Returns the CPU we should wake onto.
1009 */ 975 */
@@ -1031,7 +997,8 @@ static int wake_idle(int cpu, struct task_struct *p)
1031 || ((sd->flags & SD_WAKE_IDLE_FAR) 997 || ((sd->flags & SD_WAKE_IDLE_FAR)
1032 && !task_hot(p, task_rq(p)->clock, sd))) { 998 && !task_hot(p, task_rq(p)->clock, sd))) {
1033 cpus_and(tmp, sd->span, p->cpus_allowed); 999 cpus_and(tmp, sd->span, p->cpus_allowed);
1034 for_each_cpu_mask(i, tmp) { 1000 cpus_and(tmp, tmp, cpu_active_map);
1001 for_each_cpu_mask_nr(i, tmp) {
1035 if (idle_cpu(i)) { 1002 if (idle_cpu(i)) {
1036 if (i != task_cpu(p)) { 1003 if (i != task_cpu(p)) {
1037 schedstat_inc(p, 1004 schedstat_inc(p,
@@ -1055,8 +1022,6 @@ static inline int wake_idle(int cpu, struct task_struct *p)
1055 1022
1056#ifdef CONFIG_SMP 1023#ifdef CONFIG_SMP
1057 1024
1058static const struct sched_class fair_sched_class;
1059
1060#ifdef CONFIG_FAIR_GROUP_SCHED 1025#ifdef CONFIG_FAIR_GROUP_SCHED
1061/* 1026/*
1062 * effective_load() calculates the load change as seen from the root_task_group 1027 * effective_load() calculates the load change as seen from the root_task_group
@@ -1083,7 +1048,6 @@ static long effective_load(struct task_group *tg, int cpu,
1083 long wl, long wg) 1048 long wl, long wg)
1084{ 1049{
1085 struct sched_entity *se = tg->se[cpu]; 1050 struct sched_entity *se = tg->se[cpu];
1086 long more_w;
1087 1051
1088 if (!tg->parent) 1052 if (!tg->parent)
1089 return wl; 1053 return wl;
@@ -1095,18 +1059,17 @@ static long effective_load(struct task_group *tg, int cpu,
1095 if (!wl && sched_feat(ASYM_EFF_LOAD)) 1059 if (!wl && sched_feat(ASYM_EFF_LOAD))
1096 return wl; 1060 return wl;
1097 1061
1098 /*
1099 * Instead of using this increment, also add the difference
1100 * between when the shares were last updated and now.
1101 */
1102 more_w = se->my_q->load.weight - se->my_q->rq_weight;
1103 wl += more_w;
1104 wg += more_w;
1105
1106 for_each_sched_entity(se) { 1062 for_each_sched_entity(se) {
1107#define D(n) (likely(n) ? (n) : 1)
1108
1109 long S, rw, s, a, b; 1063 long S, rw, s, a, b;
1064 long more_w;
1065
1066 /*
1067 * Instead of using this increment, also add the difference
1068 * between when the shares were last updated and now.
1069 */
1070 more_w = se->my_q->load.weight - se->my_q->rq_weight;
1071 wl += more_w;
1072 wg += more_w;
1110 1073
1111 S = se->my_q->tg->shares; 1074 S = se->my_q->tg->shares;
1112 s = se->my_q->shares; 1075 s = se->my_q->shares;
@@ -1115,7 +1078,11 @@ static long effective_load(struct task_group *tg, int cpu,
1115 a = S*(rw + wl); 1078 a = S*(rw + wl);
1116 b = S*rw + s*wg; 1079 b = S*rw + s*wg;
1117 1080
1118 wl = s*(a-b)/D(b); 1081 wl = s*(a-b);
1082
1083 if (likely(b))
1084 wl /= b;
1085
1119 /* 1086 /*
1120 * Assume the group is already running and will 1087 * Assume the group is already running and will
1121 * thus already be accounted for in the weight. 1088 * thus already be accounted for in the weight.
@@ -1124,7 +1091,6 @@ static long effective_load(struct task_group *tg, int cpu,
1124 * alter the group weight. 1091 * alter the group weight.
1125 */ 1092 */
1126 wg = 0; 1093 wg = 0;
1127#undef D
1128 } 1094 }
1129 1095
1130 return wl; 1096 return wl;
@@ -1141,7 +1107,7 @@ static inline unsigned long effective_load(struct task_group *tg, int cpu,
1141#endif 1107#endif
1142 1108
1143static int 1109static int
1144wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq, 1110wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
1145 struct task_struct *p, int prev_cpu, int this_cpu, int sync, 1111 struct task_struct *p, int prev_cpu, int this_cpu, int sync,
1146 int idx, unsigned long load, unsigned long this_load, 1112 int idx, unsigned long load, unsigned long this_load,
1147 unsigned int imbalance) 1113 unsigned int imbalance)
@@ -1156,6 +1122,11 @@ wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq,
1156 if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS)) 1122 if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS))
1157 return 0; 1123 return 0;
1158 1124
1125 if (!sync && sched_feat(SYNC_WAKEUPS) &&
1126 curr->se.avg_overlap < sysctl_sched_migration_cost &&
1127 p->se.avg_overlap < sysctl_sched_migration_cost)
1128 sync = 1;
1129
1159 /* 1130 /*
1160 * If sync wakeup then subtract the (maximum possible) 1131 * If sync wakeup then subtract the (maximum possible)
1161 * effect of the currently running task from the load 1132 * effect of the currently running task from the load
@@ -1180,17 +1151,14 @@ wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq,
1180 * a reasonable amount of time then attract this newly 1151 * a reasonable amount of time then attract this newly
1181 * woken task: 1152 * woken task:
1182 */ 1153 */
1183 if (sync && balanced) { 1154 if (sync && balanced)
1184 if (curr->se.avg_overlap < sysctl_sched_migration_cost && 1155 return 1;
1185 p->se.avg_overlap < sysctl_sched_migration_cost)
1186 return 1;
1187 }
1188 1156
1189 schedstat_inc(p, se.nr_wakeups_affine_attempts); 1157 schedstat_inc(p, se.nr_wakeups_affine_attempts);
1190 tl_per_task = cpu_avg_load_per_task(this_cpu); 1158 tl_per_task = cpu_avg_load_per_task(this_cpu);
1191 1159
1192 if ((tl <= load && tl + target_load(prev_cpu, idx) <= tl_per_task) || 1160 if (balanced || (tl <= load && tl + target_load(prev_cpu, idx) <=
1193 balanced) { 1161 tl_per_task)) {
1194 /* 1162 /*
1195 * This domain has SD_WAKE_AFFINE and 1163 * This domain has SD_WAKE_AFFINE and
1196 * p is cache cold in this domain, and 1164 * p is cache cold in this domain, and
@@ -1209,16 +1177,17 @@ static int select_task_rq_fair(struct task_struct *p, int sync)
1209 struct sched_domain *sd, *this_sd = NULL; 1177 struct sched_domain *sd, *this_sd = NULL;
1210 int prev_cpu, this_cpu, new_cpu; 1178 int prev_cpu, this_cpu, new_cpu;
1211 unsigned long load, this_load; 1179 unsigned long load, this_load;
1212 struct rq *rq, *this_rq; 1180 struct rq *this_rq;
1213 unsigned int imbalance; 1181 unsigned int imbalance;
1214 int idx; 1182 int idx;
1215 1183
1216 prev_cpu = task_cpu(p); 1184 prev_cpu = task_cpu(p);
1217 rq = task_rq(p);
1218 this_cpu = smp_processor_id(); 1185 this_cpu = smp_processor_id();
1219 this_rq = cpu_rq(this_cpu); 1186 this_rq = cpu_rq(this_cpu);
1220 new_cpu = prev_cpu; 1187 new_cpu = prev_cpu;
1221 1188
1189 if (prev_cpu == this_cpu)
1190 goto out;
1222 /* 1191 /*
1223 * 'this_sd' is the first domain that both 1192 * 'this_sd' is the first domain that both
1224 * this_cpu and prev_cpu are present in: 1193 * this_cpu and prev_cpu are present in:
@@ -1246,13 +1215,10 @@ static int select_task_rq_fair(struct task_struct *p, int sync)
1246 load = source_load(prev_cpu, idx); 1215 load = source_load(prev_cpu, idx);
1247 this_load = target_load(this_cpu, idx); 1216 this_load = target_load(this_cpu, idx);
1248 1217
1249 if (wake_affine(rq, this_sd, this_rq, p, prev_cpu, this_cpu, sync, idx, 1218 if (wake_affine(this_sd, this_rq, p, prev_cpu, this_cpu, sync, idx,
1250 load, this_load, imbalance)) 1219 load, this_load, imbalance))
1251 return this_cpu; 1220 return this_cpu;
1252 1221
1253 if (prev_cpu == this_cpu)
1254 goto out;
1255
1256 /* 1222 /*
1257 * Start passive balancing when half the imbalance_pct 1223 * Start passive balancing when half the imbalance_pct
1258 * limit is reached. 1224 * limit is reached.
@@ -1279,62 +1245,20 @@ static unsigned long wakeup_gran(struct sched_entity *se)
1279 * + nice tasks. 1245 * + nice tasks.
1280 */ 1246 */
1281 if (sched_feat(ASYM_GRAN)) 1247 if (sched_feat(ASYM_GRAN))
1282 gran = calc_delta_asym(sysctl_sched_wakeup_granularity, se); 1248 gran = calc_delta_mine(gran, NICE_0_LOAD, &se->load);
1283 else
1284 gran = calc_delta_fair(sysctl_sched_wakeup_granularity, se);
1285 1249
1286 return gran; 1250 return gran;
1287} 1251}
1288 1252
1289/* 1253/*
1290 * Should 'se' preempt 'curr'.
1291 *
1292 * |s1
1293 * |s2
1294 * |s3
1295 * g
1296 * |<--->|c
1297 *
1298 * w(c, s1) = -1
1299 * w(c, s2) = 0
1300 * w(c, s3) = 1
1301 *
1302 */
1303static int
1304wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
1305{
1306 s64 gran, vdiff = curr->vruntime - se->vruntime;
1307
1308 if (vdiff < 0)
1309 return -1;
1310
1311 gran = wakeup_gran(curr);
1312 if (vdiff > gran)
1313 return 1;
1314
1315 return 0;
1316}
1317
1318/* return depth at which a sched entity is present in the hierarchy */
1319static inline int depth_se(struct sched_entity *se)
1320{
1321 int depth = 0;
1322
1323 for_each_sched_entity(se)
1324 depth++;
1325
1326 return depth;
1327}
1328
1329/*
1330 * Preempt the current task with a newly woken task if needed: 1254 * Preempt the current task with a newly woken task if needed:
1331 */ 1255 */
1332static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) 1256static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
1333{ 1257{
1334 struct task_struct *curr = rq->curr; 1258 struct task_struct *curr = rq->curr;
1335 struct cfs_rq *cfs_rq = task_cfs_rq(curr); 1259 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
1336 struct sched_entity *se = &curr->se, *pse = &p->se; 1260 struct sched_entity *se = &curr->se, *pse = &p->se;
1337 int se_depth, pse_depth; 1261 s64 delta_exec;
1338 1262
1339 if (unlikely(rt_prio(p->prio))) { 1263 if (unlikely(rt_prio(p->prio))) {
1340 update_rq_clock(rq); 1264 update_rq_clock(rq);
@@ -1349,6 +1273,13 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
1349 cfs_rq_of(pse)->next = pse; 1273 cfs_rq_of(pse)->next = pse;
1350 1274
1351 /* 1275 /*
 1276 * We can come here with TIF_NEED_RESCHED already set from the new-task
 1277 * wakeup path.

1278 */
1279 if (test_tsk_need_resched(curr))
1280 return;
1281
1282 /*
1352 * Batch tasks do not preempt (their preemption is driven by 1283 * Batch tasks do not preempt (their preemption is driven by
1353 * the tick): 1284 * the tick):
1354 */ 1285 */
@@ -1358,33 +1289,15 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p)
1358 if (!sched_feat(WAKEUP_PREEMPT)) 1289 if (!sched_feat(WAKEUP_PREEMPT))
1359 return; 1290 return;
1360 1291
1361 /* 1292 if (sched_feat(WAKEUP_OVERLAP) && (sync ||
1362 * preemption test can be made between sibling entities who are in the 1293 (se->avg_overlap < sysctl_sched_migration_cost &&
1363 * same cfs_rq i.e who have a common parent. Walk up the hierarchy of 1294 pse->avg_overlap < sysctl_sched_migration_cost))) {
1364 * both tasks until we find their ancestors who are siblings of common 1295 resched_task(curr);
1365 * parent. 1296 return;
1366 */
1367
1368 /* First walk up until both entities are at same depth */
1369 se_depth = depth_se(se);
1370 pse_depth = depth_se(pse);
1371
1372 while (se_depth > pse_depth) {
1373 se_depth--;
1374 se = parent_entity(se);
1375 }
1376
1377 while (pse_depth > se_depth) {
1378 pse_depth--;
1379 pse = parent_entity(pse);
1380 }
1381
1382 while (!is_same_group(se, pse)) {
1383 se = parent_entity(se);
1384 pse = parent_entity(pse);
1385 } 1297 }
1386 1298
1387 if (wakeup_preempt_entity(se, pse) == 1) 1299 delta_exec = se->sum_exec_runtime - se->prev_sum_exec_runtime;
1300 if (delta_exec > wakeup_gran(pse))
1388 resched_task(curr); 1301 resched_task(curr);
1389} 1302}
1390 1303
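For reference, the rewritten check_preempt_wakeup() above no longer walks the group hierarchy and compares vruntimes; it preempts when the running task has already consumed more CPU since it was last picked than the wakee's wakeup granularity (or immediately for overlap/sync wakeups when WAKEUP_OVERLAP is enabled). A standalone sketch of the runtime-based test, with invented nanosecond values:

#include <stdio.h>

struct ent {
        unsigned long long sum_exec_runtime;
        unsigned long long prev_sum_exec_runtime;
};

/* Preempt when current has already run longer than the wakee's granularity. */
static int should_preempt(const struct ent *curr, unsigned long long wakeup_gran)
{
        unsigned long long delta_exec =
                curr->sum_exec_runtime - curr->prev_sum_exec_runtime;

        return delta_exec > wakeup_gran;
}

int main(void)
{
        struct ent curr = {
                .sum_exec_runtime      = 12000000ULL,   /* 12 ms total    */
                .prev_sum_exec_runtime =  5000000ULL,   /*  5 ms at pick  */
        };

        printf("%d\n", should_preempt(&curr,  5000000ULL));   /* 1: ran 7 ms > 5 ms gran  */
        printf("%d\n", should_preempt(&curr, 10000000ULL));   /* 0: 7 ms < 10 ms gran     */
        return 0;
}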
@@ -1440,18 +1353,13 @@ __load_balance_iterator(struct cfs_rq *cfs_rq, struct list_head *next)
1440 struct task_struct *p = NULL; 1353 struct task_struct *p = NULL;
1441 struct sched_entity *se; 1354 struct sched_entity *se;
1442 1355
1443 while (next != &cfs_rq->tasks) { 1356 if (next == &cfs_rq->tasks)
1444 se = list_entry(next, struct sched_entity, group_node); 1357 return NULL;
1445 next = next->next;
1446 1358
1447 /* Skip over entities that are not tasks */ 1359 se = list_entry(next, struct sched_entity, group_node);
1448 if (entity_is_task(se)) { 1360 p = task_of(se);
1449 p = task_of(se); 1361 cfs_rq->balance_iterator = next->next;
1450 break;
1451 }
1452 }
1453 1362
1454 cfs_rq->balance_iterator = next;
1455 return p; 1363 return p;
1456} 1364}
1457 1365
@@ -1500,7 +1408,7 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
1500 rcu_read_lock(); 1408 rcu_read_lock();
1501 update_h_load(busiest_cpu); 1409 update_h_load(busiest_cpu);
1502 1410
1503 list_for_each_entry(tg, &task_groups, list) { 1411 list_for_each_entry_rcu(tg, &task_groups, list) {
1504 struct cfs_rq *busiest_cfs_rq = tg->cfs_rq[busiest_cpu]; 1412 struct cfs_rq *busiest_cfs_rq = tg->cfs_rq[busiest_cpu];
1505 unsigned long busiest_h_load = busiest_cfs_rq->h_load; 1413 unsigned long busiest_h_load = busiest_cfs_rq->h_load;
1506 unsigned long busiest_weight = busiest_cfs_rq->load.weight; 1414 unsigned long busiest_weight = busiest_cfs_rq->load.weight;
@@ -1613,10 +1521,10 @@ static void task_new_fair(struct rq *rq, struct task_struct *p)
1613 * 'current' within the tree based on its new key value. 1521 * 'current' within the tree based on its new key value.
1614 */ 1522 */
1615 swap(curr->vruntime, se->vruntime); 1523 swap(curr->vruntime, se->vruntime);
1524 resched_task(rq->curr);
1616 } 1525 }
1617 1526
1618 enqueue_task_fair(rq, p, 0); 1527 enqueue_task_fair(rq, p, 0);
1619 resched_task(rq->curr);
1620} 1528}
1621 1529
1622/* 1530/*
@@ -1635,7 +1543,7 @@ static void prio_changed_fair(struct rq *rq, struct task_struct *p,
1635 if (p->prio > oldprio) 1543 if (p->prio > oldprio)
1636 resched_task(rq->curr); 1544 resched_task(rq->curr);
1637 } else 1545 } else
1638 check_preempt_curr(rq, p); 1546 check_preempt_curr(rq, p, 0);
1639} 1547}
1640 1548
1641/* 1549/*
@@ -1652,7 +1560,7 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p,
1652 if (running) 1560 if (running)
1653 resched_task(rq->curr); 1561 resched_task(rq->curr);
1654 else 1562 else
1655 check_preempt_curr(rq, p); 1563 check_preempt_curr(rq, p, 0);
1656} 1564}
1657 1565
1658/* Account for a task changing its policy or group. 1566/* Account for a task changing its policy or group.
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index 862b06bd560a..fda016218296 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -5,9 +5,10 @@ SCHED_FEAT(START_DEBIT, 1)
5SCHED_FEAT(AFFINE_WAKEUPS, 1) 5SCHED_FEAT(AFFINE_WAKEUPS, 1)
6SCHED_FEAT(CACHE_HOT_BUDDY, 1) 6SCHED_FEAT(CACHE_HOT_BUDDY, 1)
7SCHED_FEAT(SYNC_WAKEUPS, 1) 7SCHED_FEAT(SYNC_WAKEUPS, 1)
8SCHED_FEAT(HRTICK, 1) 8SCHED_FEAT(HRTICK, 0)
9SCHED_FEAT(DOUBLE_TICK, 0) 9SCHED_FEAT(DOUBLE_TICK, 0)
10SCHED_FEAT(ASYM_GRAN, 1) 10SCHED_FEAT(ASYM_GRAN, 1)
11SCHED_FEAT(LB_BIAS, 0) 11SCHED_FEAT(LB_BIAS, 1)
12SCHED_FEAT(LB_WAKEUP_UPDATE, 1) 12SCHED_FEAT(LB_WAKEUP_UPDATE, 1)
13SCHED_FEAT(ASYM_EFF_LOAD, 1) 13SCHED_FEAT(ASYM_EFF_LOAD, 1)
14SCHED_FEAT(WAKEUP_OVERLAP, 0)
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index 3a4f92dbbe66..dec4ccabe2f5 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -14,7 +14,7 @@ static int select_task_rq_idle(struct task_struct *p, int sync)
14/* 14/*
15 * Idle tasks are unconditionally rescheduled: 15 * Idle tasks are unconditionally rescheduled:
16 */ 16 */
17static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p) 17static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int sync)
18{ 18{
19 resched_task(rq->idle); 19 resched_task(rq->idle);
20} 20}
@@ -76,7 +76,7 @@ static void switched_to_idle(struct rq *rq, struct task_struct *p,
76 if (running) 76 if (running)
77 resched_task(rq->curr); 77 resched_task(rq->curr);
78 else 78 else
79 check_preempt_curr(rq, p); 79 check_preempt_curr(rq, p, 0);
80} 80}
81 81
82static void prio_changed_idle(struct rq *rq, struct task_struct *p, 82static void prio_changed_idle(struct rq *rq, struct task_struct *p,
@@ -93,7 +93,7 @@ static void prio_changed_idle(struct rq *rq, struct task_struct *p,
93 if (p->prio > oldprio) 93 if (p->prio > oldprio)
94 resched_task(rq->curr); 94 resched_task(rq->curr);
95 } else 95 } else
96 check_preempt_curr(rq, p); 96 check_preempt_curr(rq, p, 0);
97} 97}
98 98
99/* 99/*
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 47ceac9e8552..b446dc87494f 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -102,12 +102,12 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se);
102 102
103static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) 103static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
104{ 104{
105 struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr;
105 struct sched_rt_entity *rt_se = rt_rq->rt_se; 106 struct sched_rt_entity *rt_se = rt_rq->rt_se;
106 107
107 if (rt_se && !on_rt_rq(rt_se) && rt_rq->rt_nr_running) { 108 if (rt_rq->rt_nr_running) {
108 struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr; 109 if (rt_se && !on_rt_rq(rt_se))
109 110 enqueue_rt_entity(rt_se);
110 enqueue_rt_entity(rt_se);
111 if (rt_rq->highest_prio < curr->prio) 111 if (rt_rq->highest_prio < curr->prio)
112 resched_task(curr); 112 resched_task(curr);
113 } 113 }
@@ -199,6 +199,8 @@ static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
199 199
200static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq) 200static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
201{ 201{
202 if (rt_rq->rt_nr_running)
203 resched_task(rq_of_rt_rq(rt_rq)->curr);
202} 204}
203 205
204static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq) 206static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
@@ -229,6 +231,9 @@ static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
229#endif /* CONFIG_RT_GROUP_SCHED */ 231#endif /* CONFIG_RT_GROUP_SCHED */
230 232
231#ifdef CONFIG_SMP 233#ifdef CONFIG_SMP
234/*
 235 * We ran out of runtime; see if we can borrow some from our neighbours.
236 */
232static int do_balance_runtime(struct rt_rq *rt_rq) 237static int do_balance_runtime(struct rt_rq *rt_rq)
233{ 238{
234 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); 239 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
@@ -240,7 +245,7 @@ static int do_balance_runtime(struct rt_rq *rt_rq)
240 245
241 spin_lock(&rt_b->rt_runtime_lock); 246 spin_lock(&rt_b->rt_runtime_lock);
242 rt_period = ktime_to_ns(rt_b->rt_period); 247 rt_period = ktime_to_ns(rt_b->rt_period);
243 for_each_cpu_mask(i, rd->span) { 248 for_each_cpu_mask_nr(i, rd->span) {
244 struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i); 249 struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
245 s64 diff; 250 s64 diff;
246 251
@@ -248,12 +253,21 @@ static int do_balance_runtime(struct rt_rq *rt_rq)
248 continue; 253 continue;
249 254
250 spin_lock(&iter->rt_runtime_lock); 255 spin_lock(&iter->rt_runtime_lock);
256 /*
257 * Either all rqs have inf runtime and there's nothing to steal
258 * or __disable_runtime() below sets a specific rq to inf to
 259 * indicate it's been disabled and disallow stealing.
260 */
251 if (iter->rt_runtime == RUNTIME_INF) 261 if (iter->rt_runtime == RUNTIME_INF)
252 goto next; 262 goto next;
253 263
264 /*
265 * From runqueues with spare time, take 1/n part of their
266 * spare time, but no more than our period.
267 */
254 diff = iter->rt_runtime - iter->rt_time; 268 diff = iter->rt_runtime - iter->rt_time;
255 if (diff > 0) { 269 if (diff > 0) {
256 do_div(diff, weight); 270 diff = div_u64((u64)diff, weight);
257 if (rt_rq->rt_runtime + diff > rt_period) 271 if (rt_rq->rt_runtime + diff > rt_period)
258 diff = rt_period - rt_rq->rt_runtime; 272 diff = rt_period - rt_rq->rt_runtime;
259 iter->rt_runtime -= diff; 273 iter->rt_runtime -= diff;
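Putting numbers on the borrowing rule described in the new comments: from each neighbour we take 1/weight of its spare runtime (its budget minus the time it has already consumed), clamped so our own budget never exceeds the period. A small userspace sketch with invented figures:

#include <stdio.h>

int main(void)
{
        unsigned long long rt_period  = 1000000000ULL;  /* 1 s period             */
        unsigned long long rt_runtime =  950000000ULL;  /* our current budget     */
        long long iter_runtime        =  950000000LL;   /* neighbour's budget     */
        long long iter_time           =  200000000LL;   /* neighbour's usage      */
        unsigned long weight          = 4;              /* rqs sharing the spare  */

        long long diff = iter_runtime - iter_time;      /* neighbour's spare time */
        if (diff > 0) {
                diff /= weight;                         /* take 1/n of it         */
                if (rt_runtime + diff > rt_period)      /* never exceed a period  */
                        diff = rt_period - rt_runtime;
                printf("borrowed %lld ns\n", diff);     /* 50000000, not 187500000 */
        }
        return 0;
}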
@@ -272,6 +286,9 @@ next:
272 return more; 286 return more;
273} 287}
274 288
289/*
 290 * Ensure this RQ takes back all the runtime it lent to its neighbours.
291 */
275static void __disable_runtime(struct rq *rq) 292static void __disable_runtime(struct rq *rq)
276{ 293{
277 struct root_domain *rd = rq->rd; 294 struct root_domain *rd = rq->rd;
@@ -287,18 +304,34 @@ static void __disable_runtime(struct rq *rq)
287 304
288 spin_lock(&rt_b->rt_runtime_lock); 305 spin_lock(&rt_b->rt_runtime_lock);
289 spin_lock(&rt_rq->rt_runtime_lock); 306 spin_lock(&rt_rq->rt_runtime_lock);
307 /*
308 * Either we're all inf and nobody needs to borrow, or we're
309 * already disabled and thus have nothing to do, or we have
310 * exactly the right amount of runtime to take out.
311 */
290 if (rt_rq->rt_runtime == RUNTIME_INF || 312 if (rt_rq->rt_runtime == RUNTIME_INF ||
291 rt_rq->rt_runtime == rt_b->rt_runtime) 313 rt_rq->rt_runtime == rt_b->rt_runtime)
292 goto balanced; 314 goto balanced;
293 spin_unlock(&rt_rq->rt_runtime_lock); 315 spin_unlock(&rt_rq->rt_runtime_lock);
294 316
317 /*
318 * Calculate the difference between what we started out with
 319 * and what we currently have; that's the amount of runtime
 320 * we lent and now have to reclaim.
321 */
295 want = rt_b->rt_runtime - rt_rq->rt_runtime; 322 want = rt_b->rt_runtime - rt_rq->rt_runtime;
296 323
324 /*
325 * Greedy reclaim, take back as much as we can.
326 */
297 for_each_cpu_mask(i, rd->span) { 327 for_each_cpu_mask(i, rd->span) {
298 struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i); 328 struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
299 s64 diff; 329 s64 diff;
300 330
301 if (iter == rt_rq) 331 /*
332 * Can't reclaim from ourselves or disabled runqueues.
333 */
334 if (iter == rt_rq || iter->rt_runtime == RUNTIME_INF)
302 continue; 335 continue;
303 336
304 spin_lock(&iter->rt_runtime_lock); 337 spin_lock(&iter->rt_runtime_lock);
@@ -317,8 +350,16 @@ static void __disable_runtime(struct rq *rq)
317 } 350 }
318 351
319 spin_lock(&rt_rq->rt_runtime_lock); 352 spin_lock(&rt_rq->rt_runtime_lock);
353 /*
354 * We cannot be left wanting - that would mean some runtime
355 * leaked out of the system.
356 */
320 BUG_ON(want); 357 BUG_ON(want);
321balanced: 358balanced:
359 /*
360 * Disable all the borrow logic by pretending we have inf
361 * runtime - in which case borrowing doesn't make sense.
362 */
322 rt_rq->rt_runtime = RUNTIME_INF; 363 rt_rq->rt_runtime = RUNTIME_INF;
323 spin_unlock(&rt_rq->rt_runtime_lock); 364 spin_unlock(&rt_rq->rt_runtime_lock);
324 spin_unlock(&rt_b->rt_runtime_lock); 365 spin_unlock(&rt_b->rt_runtime_lock);
@@ -341,6 +382,9 @@ static void __enable_runtime(struct rq *rq)
341 if (unlikely(!scheduler_running)) 382 if (unlikely(!scheduler_running))
342 return; 383 return;
343 384
385 /*
386 * Reset each runqueue's bandwidth settings
387 */
344 for_each_leaf_rt_rq(rt_rq, rq) { 388 for_each_leaf_rt_rq(rt_rq, rq) {
345 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); 389 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
346 390
@@ -348,6 +392,7 @@ static void __enable_runtime(struct rq *rq)
348 spin_lock(&rt_rq->rt_runtime_lock); 392 spin_lock(&rt_rq->rt_runtime_lock);
349 rt_rq->rt_runtime = rt_b->rt_runtime; 393 rt_rq->rt_runtime = rt_b->rt_runtime;
350 rt_rq->rt_time = 0; 394 rt_rq->rt_time = 0;
395 rt_rq->rt_throttled = 0;
351 spin_unlock(&rt_rq->rt_runtime_lock); 396 spin_unlock(&rt_rq->rt_runtime_lock);
352 spin_unlock(&rt_b->rt_runtime_lock); 397 spin_unlock(&rt_b->rt_runtime_lock);
353 } 398 }
@@ -386,7 +431,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
386 int i, idle = 1; 431 int i, idle = 1;
387 cpumask_t span; 432 cpumask_t span;
388 433
389 if (rt_b->rt_runtime == RUNTIME_INF) 434 if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
390 return 1; 435 return 1;
391 436
392 span = sched_rt_period_mask(); 437 span = sched_rt_period_mask();
@@ -438,9 +483,6 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
438{ 483{
439 u64 runtime = sched_rt_runtime(rt_rq); 484 u64 runtime = sched_rt_runtime(rt_rq);
440 485
441 if (runtime == RUNTIME_INF)
442 return 0;
443
444 if (rt_rq->rt_throttled) 486 if (rt_rq->rt_throttled)
445 return rt_rq_throttled(rt_rq); 487 return rt_rq_throttled(rt_rq);
446 488
@@ -484,16 +526,23 @@ static void update_curr_rt(struct rq *rq)
484 schedstat_set(curr->se.exec_max, max(curr->se.exec_max, delta_exec)); 526 schedstat_set(curr->se.exec_max, max(curr->se.exec_max, delta_exec));
485 527
486 curr->se.sum_exec_runtime += delta_exec; 528 curr->se.sum_exec_runtime += delta_exec;
529 account_group_exec_runtime(curr, delta_exec);
530
487 curr->se.exec_start = rq->clock; 531 curr->se.exec_start = rq->clock;
488 cpuacct_charge(curr, delta_exec); 532 cpuacct_charge(curr, delta_exec);
489 533
534 if (!rt_bandwidth_enabled())
535 return;
536
490 for_each_sched_rt_entity(rt_se) { 537 for_each_sched_rt_entity(rt_se) {
491 rt_rq = rt_rq_of_se(rt_se); 538 rt_rq = rt_rq_of_se(rt_se);
492 539
493 spin_lock(&rt_rq->rt_runtime_lock); 540 spin_lock(&rt_rq->rt_runtime_lock);
494 rt_rq->rt_time += delta_exec; 541 if (sched_rt_runtime(rt_rq) != RUNTIME_INF) {
495 if (sched_rt_runtime_exceeded(rt_rq)) 542 rt_rq->rt_time += delta_exec;
496 resched_task(curr); 543 if (sched_rt_runtime_exceeded(rt_rq))
544 resched_task(curr);
545 }
497 spin_unlock(&rt_rq->rt_runtime_lock); 546 spin_unlock(&rt_rq->rt_runtime_lock);
498 } 547 }
499} 548}
@@ -505,7 +554,9 @@ void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
505 rt_rq->rt_nr_running++; 554 rt_rq->rt_nr_running++;
506#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED 555#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
507 if (rt_se_prio(rt_se) < rt_rq->highest_prio) { 556 if (rt_se_prio(rt_se) < rt_rq->highest_prio) {
557#ifdef CONFIG_SMP
508 struct rq *rq = rq_of_rt_rq(rt_rq); 558 struct rq *rq = rq_of_rt_rq(rt_rq);
559#endif
509 560
510 rt_rq->highest_prio = rt_se_prio(rt_se); 561 rt_rq->highest_prio = rt_se_prio(rt_se);
511#ifdef CONFIG_SMP 562#ifdef CONFIG_SMP
@@ -599,11 +650,7 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se)
599 if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) 650 if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running))
600 return; 651 return;
601 652
602 if (rt_se->nr_cpus_allowed == 1) 653 list_add_tail(&rt_se->run_list, queue);
603 list_add(&rt_se->run_list, queue);
604 else
605 list_add_tail(&rt_se->run_list, queue);
606
607 __set_bit(rt_se_prio(rt_se), array->bitmap); 654 __set_bit(rt_se_prio(rt_se), array->bitmap);
608 655
609 inc_rt_tasks(rt_se, rt_rq); 656 inc_rt_tasks(rt_se, rt_rq);
@@ -688,32 +735,34 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
688 * Put task to the end of the run list without the overhead of dequeue 735 * Put task to the end of the run list without the overhead of dequeue
689 * followed by enqueue. 736 * followed by enqueue.
690 */ 737 */
691static 738static void
692void requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se) 739requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, int head)
693{ 740{
694 struct rt_prio_array *array = &rt_rq->active;
695
696 if (on_rt_rq(rt_se)) { 741 if (on_rt_rq(rt_se)) {
697 list_del_init(&rt_se->run_list); 742 struct rt_prio_array *array = &rt_rq->active;
698 list_add_tail(&rt_se->run_list, 743 struct list_head *queue = array->queue + rt_se_prio(rt_se);
699 array->queue + rt_se_prio(rt_se)); 744
745 if (head)
746 list_move(&rt_se->run_list, queue);
747 else
748 list_move_tail(&rt_se->run_list, queue);
700 } 749 }
701} 750}
702 751
703static void requeue_task_rt(struct rq *rq, struct task_struct *p) 752static void requeue_task_rt(struct rq *rq, struct task_struct *p, int head)
704{ 753{
705 struct sched_rt_entity *rt_se = &p->rt; 754 struct sched_rt_entity *rt_se = &p->rt;
706 struct rt_rq *rt_rq; 755 struct rt_rq *rt_rq;
707 756
708 for_each_sched_rt_entity(rt_se) { 757 for_each_sched_rt_entity(rt_se) {
709 rt_rq = rt_rq_of_se(rt_se); 758 rt_rq = rt_rq_of_se(rt_se);
710 requeue_rt_entity(rt_rq, rt_se); 759 requeue_rt_entity(rt_rq, rt_se, head);
711 } 760 }
712} 761}
713 762
714static void yield_task_rt(struct rq *rq) 763static void yield_task_rt(struct rq *rq)
715{ 764{
716 requeue_task_rt(rq, rq->curr); 765 requeue_task_rt(rq, rq->curr, 0);
717} 766}
718 767
719#ifdef CONFIG_SMP 768#ifdef CONFIG_SMP
@@ -753,12 +802,36 @@ static int select_task_rq_rt(struct task_struct *p, int sync)
753 */ 802 */
754 return task_cpu(p); 803 return task_cpu(p);
755} 804}
805
806static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
807{
808 cpumask_t mask;
809
810 if (rq->curr->rt.nr_cpus_allowed == 1)
811 return;
812
813 if (p->rt.nr_cpus_allowed != 1
814 && cpupri_find(&rq->rd->cpupri, p, &mask))
815 return;
816
817 if (!cpupri_find(&rq->rd->cpupri, rq->curr, &mask))
818 return;
819
820 /*
821 * There appears to be other cpus that can accept
 822 * current and none to run 'p', so let's reschedule
823 * to try and push current away:
824 */
825 requeue_task_rt(rq, p, 1);
826 resched_task(rq->curr);
827}
828
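The new check_preempt_equal_prio() above only reshuffles when the wakee is pinned to this CPU while the running task could go elsewhere. A compact restatement of that decision as a standalone predicate (cpupri_find() is modelled here as a simple "has another usable CPU" flag, not the real API):

#include <stdio.h>
#include <stdbool.h>

struct task { int nr_cpus_allowed; bool other_cpu_available; };

/* true when we should requeue the wakee at the head and push current away */
static bool push_current_for(const struct task *curr, const struct task *p)
{
        if (curr->nr_cpus_allowed == 1)                 /* current cannot move     */
                return false;
        if (p->nr_cpus_allowed != 1 && p->other_cpu_available)
                return false;                           /* wakee can go elsewhere  */
        return curr->other_cpu_available;               /* current has a target    */
}

int main(void)
{
        struct task curr = { .nr_cpus_allowed = 4, .other_cpu_available = true };
        struct task p    = { .nr_cpus_allowed = 1, .other_cpu_available = false };

        printf("%d\n", push_current_for(&curr, &p));    /* 1: push current away */
        return 0;
}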
756#endif /* CONFIG_SMP */ 829#endif /* CONFIG_SMP */
757 830
758/* 831/*
759 * Preempt the current task with a newly woken task if needed: 832 * Preempt the current task with a newly woken task if needed:
760 */ 833 */
761static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p) 834static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int sync)
762{ 835{
763 if (p->prio < rq->curr->prio) { 836 if (p->prio < rq->curr->prio) {
764 resched_task(rq->curr); 837 resched_task(rq->curr);
@@ -778,18 +851,8 @@ static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p)
778 * to move current somewhere else, making room for our non-migratable 851 * to move current somewhere else, making room for our non-migratable
779 * task. 852 * task.
780 */ 853 */
781 if((p->prio == rq->curr->prio) 854 if (p->prio == rq->curr->prio && !need_resched())
782 && p->rt.nr_cpus_allowed == 1 855 check_preempt_equal_prio(rq, p);
783 && rq->curr->rt.nr_cpus_allowed != 1) {
784 cpumask_t mask;
785
786 if (cpupri_find(&rq->rd->cpupri, rq->curr, &mask))
787 /*
788 * There appears to be other cpus that can accept
789 * current, so lets reschedule to try and push it away
790 */
791 resched_task(rq->curr);
792 }
793#endif 856#endif
794} 857}
795 858
@@ -847,6 +910,8 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
847#define RT_MAX_TRIES 3 910#define RT_MAX_TRIES 3
848 911
849static int double_lock_balance(struct rq *this_rq, struct rq *busiest); 912static int double_lock_balance(struct rq *this_rq, struct rq *busiest);
913static void double_unlock_balance(struct rq *this_rq, struct rq *busiest);
914
850static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep); 915static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep);
851 916
852static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) 917static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
@@ -922,6 +987,13 @@ static int find_lowest_rq(struct task_struct *task)
922 return -1; /* No targets found */ 987 return -1; /* No targets found */
923 988
924 /* 989 /*
990 * Only consider CPUs that are usable for migration.
991 * I guess we might want to change cpupri_find() to ignore those
992 * in the first place.
993 */
994 cpus_and(*lowest_mask, *lowest_mask, cpu_active_map);
995
996 /*
925 * At this point we have built a mask of cpus representing the 997 * At this point we have built a mask of cpus representing the
926 * lowest priority tasks in the system. Now we want to elect 998 * lowest priority tasks in the system. Now we want to elect
927 * the best one based on our affinity and topology. 999 * the best one based on our affinity and topology.
@@ -1001,7 +1073,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
1001 break; 1073 break;
1002 1074
1003 /* try again */ 1075 /* try again */
1004 spin_unlock(&lowest_rq->lock); 1076 double_unlock_balance(rq, lowest_rq);
1005 lowest_rq = NULL; 1077 lowest_rq = NULL;
1006 } 1078 }
1007 1079
@@ -1070,7 +1142,7 @@ static int push_rt_task(struct rq *rq)
1070 1142
1071 resched_task(lowest_rq->curr); 1143 resched_task(lowest_rq->curr);
1072 1144
1073 spin_unlock(&lowest_rq->lock); 1145 double_unlock_balance(rq, lowest_rq);
1074 1146
1075 ret = 1; 1147 ret = 1;
1076out: 1148out:
@@ -1107,7 +1179,7 @@ static int pull_rt_task(struct rq *this_rq)
1107 1179
1108 next = pick_next_task_rt(this_rq); 1180 next = pick_next_task_rt(this_rq);
1109 1181
1110 for_each_cpu_mask(cpu, this_rq->rd->rto_mask) { 1182 for_each_cpu_mask_nr(cpu, this_rq->rd->rto_mask) {
1111 if (this_cpu == cpu) 1183 if (this_cpu == cpu)
1112 continue; 1184 continue;
1113 1185
@@ -1176,7 +1248,7 @@ static int pull_rt_task(struct rq *this_rq)
1176 1248
1177 } 1249 }
1178 skip: 1250 skip:
1179 spin_unlock(&src_rq->lock); 1251 double_unlock_balance(this_rq, src_rq);
1180 } 1252 }
1181 1253
1182 return ret; 1254 return ret;
@@ -1388,7 +1460,7 @@ static void watchdog(struct rq *rq, struct task_struct *p)
1388 p->rt.timeout++; 1460 p->rt.timeout++;
1389 next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ); 1461 next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ);
1390 if (p->rt.timeout > next) 1462 if (p->rt.timeout > next)
1391 p->it_sched_expires = p->se.sum_exec_runtime; 1463 p->cputime_expires.sched_exp = p->se.sum_exec_runtime;
1392 } 1464 }
1393} 1465}
1394 1466
@@ -1415,7 +1487,7 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
1415 * on the queue: 1487 * on the queue:
1416 */ 1488 */
1417 if (p->rt.run_list.prev != p->rt.run_list.next) { 1489 if (p->rt.run_list.prev != p->rt.run_list.next) {
1418 requeue_task_rt(rq, p); 1490 requeue_task_rt(rq, p, 0);
1419 set_tsk_need_resched(p); 1491 set_tsk_need_resched(p);
1420 } 1492 }
1421} 1493}
diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h
index 8385d43987e2..ee71bec1da66 100644
--- a/kernel/sched_stats.h
+++ b/kernel/sched_stats.h
@@ -9,7 +9,7 @@
9static int show_schedstat(struct seq_file *seq, void *v) 9static int show_schedstat(struct seq_file *seq, void *v)
10{ 10{
11 int cpu; 11 int cpu;
12 int mask_len = NR_CPUS/32 * 9; 12 int mask_len = DIV_ROUND_UP(NR_CPUS, 32) * 9;
13 char *mask_str = kmalloc(mask_len, GFP_KERNEL); 13 char *mask_str = kmalloc(mask_len, GFP_KERNEL);
14 14
15 if (mask_str == NULL) 15 if (mask_str == NULL)
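The mask_len change above fixes a buffer-sizing bug: with fewer than 32 possible CPUs, the integer division NR_CPUS/32 truncates to zero and kmalloc() is asked for a zero-byte buffer, whereas DIV_ROUND_UP always reserves at least one 9-character chunk per 32-bit word of the cpumask. The arithmetic, with an invented small NR_CPUS:

#include <stdio.h>

#define DIV_ROUND_UP(n, d)  (((n) + (d) - 1) / (d))

int main(void)
{
        int nr_cpus = 8;                                          /* invented config   */
        printf("old: %d bytes\n", nr_cpus / 32 * 9);              /* 0 - broken        */
        printf("new: %d bytes\n", DIV_ROUND_UP(nr_cpus, 32) * 9); /* 9 - one full word */
        return 0;
}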
@@ -90,13 +90,20 @@ static int schedstat_open(struct inode *inode, struct file *file)
90 return res; 90 return res;
91} 91}
92 92
93const struct file_operations proc_schedstat_operations = { 93static const struct file_operations proc_schedstat_operations = {
94 .open = schedstat_open, 94 .open = schedstat_open,
95 .read = seq_read, 95 .read = seq_read,
96 .llseek = seq_lseek, 96 .llseek = seq_lseek,
97 .release = single_release, 97 .release = single_release,
98}; 98};
99 99
100static int __init proc_schedstat_init(void)
101{
102 proc_create("schedstat", 0, NULL, &proc_schedstat_operations);
103 return 0;
104}
105module_init(proc_schedstat_init);
106
100/* 107/*
101 * Expects runqueue lock to be held for atomicity of update 108 * Expects runqueue lock to be held for atomicity of update
102 */ 109 */
@@ -270,3 +277,89 @@ sched_info_switch(struct task_struct *prev, struct task_struct *next)
270#define sched_info_switch(t, next) do { } while (0) 277#define sched_info_switch(t, next) do { } while (0)
271#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */ 278#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */
272 279
280/*
281 * The following are functions that support scheduler-internal time accounting.
282 * These functions are generally called at the timer tick. None of this depends
283 * on CONFIG_SCHEDSTATS.
284 */
285
286/**
287 * account_group_user_time - Maintain utime for a thread group.
288 *
289 * @tsk: Pointer to task structure.
290 * @cputime: Time value by which to increment the utime field of the
291 * thread_group_cputime structure.
292 *
293 * If thread group time is being maintained, get the structure for the
294 * running CPU and update the utime field there.
295 */
296static inline void account_group_user_time(struct task_struct *tsk,
297 cputime_t cputime)
298{
299 struct signal_struct *sig;
300
301 sig = tsk->signal;
302 if (unlikely(!sig))
303 return;
304 if (sig->cputime.totals) {
305 struct task_cputime *times;
306
307 times = per_cpu_ptr(sig->cputime.totals, get_cpu());
308 times->utime = cputime_add(times->utime, cputime);
309 put_cpu_no_resched();
310 }
311}
312
313/**
314 * account_group_system_time - Maintain stime for a thread group.
315 *
316 * @tsk: Pointer to task structure.
317 * @cputime: Time value by which to increment the stime field of the
318 * thread_group_cputime structure.
319 *
320 * If thread group time is being maintained, get the structure for the
321 * running CPU and update the stime field there.
322 */
323static inline void account_group_system_time(struct task_struct *tsk,
324 cputime_t cputime)
325{
326 struct signal_struct *sig;
327
328 sig = tsk->signal;
329 if (unlikely(!sig))
330 return;
331 if (sig->cputime.totals) {
332 struct task_cputime *times;
333
334 times = per_cpu_ptr(sig->cputime.totals, get_cpu());
335 times->stime = cputime_add(times->stime, cputime);
336 put_cpu_no_resched();
337 }
338}
339
340/**
341 * account_group_exec_runtime - Maintain exec runtime for a thread group.
342 *
343 * @tsk: Pointer to task structure.
344 * @ns: Time value by which to increment the sum_exec_runtime field
345 * of the thread_group_cputime structure.
346 *
347 * If thread group time is being maintained, get the structure for the
348 * running CPU and update the sum_exec_runtime field there.
349 */
350static inline void account_group_exec_runtime(struct task_struct *tsk,
351 unsigned long long ns)
352{
353 struct signal_struct *sig;
354
355 sig = tsk->signal;
356 if (unlikely(!sig))
357 return;
358 if (sig->cputime.totals) {
359 struct task_cputime *times;
360
361 times = per_cpu_ptr(sig->cputime.totals, get_cpu());
362 times->sum_exec_runtime += ns;
363 put_cpu_no_resched();
364 }
365}
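All three account_group_*() helpers above share one pattern: if the thread group keeps per-CPU cputime totals, charge the increment to the current CPU's slot with preemption disabled, and let readers sum the slots later. A userspace model of that accumulate-per-CPU, sum-on-read idea (the CPU count and increments are made up; the real code uses per_cpu_ptr() with get_cpu()/put_cpu_no_resched()):

#include <stdio.h>

#define NCPUS 4

/* One running total per CPU; writers only touch their own slot. */
static unsigned long long per_cpu_runtime[NCPUS];

static void account_exec_runtime(int cpu, unsigned long long ns)
{
        per_cpu_runtime[cpu] += ns;
}

static unsigned long long thread_group_runtime(void)
{
        unsigned long long sum = 0;

        for (int cpu = 0; cpu < NCPUS; cpu++)
                sum += per_cpu_runtime[cpu];
        return sum;
}

int main(void)
{
        account_exec_runtime(0, 1000);
        account_exec_runtime(2, 500);
        printf("group total: %llu ns\n", thread_group_runtime());  /* 1500 */
        return 0;
}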
diff --git a/kernel/semaphore.c b/kernel/semaphore.c
index aaaeae8244e7..94a62c0d4ade 100644
--- a/kernel/semaphore.c
+++ b/kernel/semaphore.c
@@ -212,9 +212,7 @@ static inline int __sched __down_common(struct semaphore *sem, long state,
212 waiter.up = 0; 212 waiter.up = 0;
213 213
214 for (;;) { 214 for (;;) {
215 if (state == TASK_INTERRUPTIBLE && signal_pending(task)) 215 if (signal_pending_state(state, task))
216 goto interrupted;
217 if (state == TASK_KILLABLE && fatal_signal_pending(task))
218 goto interrupted; 216 goto interrupted;
219 if (timeout <= 0) 217 if (timeout <= 0)
220 goto timed_out; 218 goto timed_out;
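The semaphore change collapses the two removed state checks into signal_pending_state(), which lets an interruptible sleeper abort on any pending signal but a killable sleeper only on a fatal one. A rough userspace model of that predicate (the flag values and task fields are stand-ins, not the kernel definitions):

#include <stdio.h>
#include <stdbool.h>

#define TASK_INTERRUPTIBLE 0x01
#define TASK_WAKEKILL      0x80                 /* stand-in flag values       */
#define TASK_KILLABLE      (TASK_WAKEKILL | 0x02)

struct task { bool signal_pending; bool fatal_signal_pending; };

/* Rough model: interruptible sleeps break on any signal,
 * killable sleeps only on a fatal one. */
static bool signal_pending_state(long state, const struct task *t)
{
        if (!(state & (TASK_INTERRUPTIBLE | TASK_WAKEKILL)))
                return false;
        if (!t->signal_pending)
                return false;
        return (state & TASK_INTERRUPTIBLE) || t->fatal_signal_pending;
}

int main(void)
{
        struct task t = { .signal_pending = true, .fatal_signal_pending = false };

        printf("%d\n", signal_pending_state(TASK_INTERRUPTIBLE, &t));   /* 1 */
        printf("%d\n", signal_pending_state(TASK_KILLABLE, &t));        /* 0 */
        return 0;
}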
diff --git a/kernel/signal.c b/kernel/signal.c
index 6c0958e52ea7..105217da5c82 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -22,10 +22,12 @@
22#include <linux/ptrace.h> 22#include <linux/ptrace.h>
23#include <linux/signal.h> 23#include <linux/signal.h>
24#include <linux/signalfd.h> 24#include <linux/signalfd.h>
25#include <linux/tracehook.h>
25#include <linux/capability.h> 26#include <linux/capability.h>
26#include <linux/freezer.h> 27#include <linux/freezer.h>
27#include <linux/pid_namespace.h> 28#include <linux/pid_namespace.h>
28#include <linux/nsproxy.h> 29#include <linux/nsproxy.h>
30#include <trace/sched.h>
29 31
30#include <asm/param.h> 32#include <asm/param.h>
31#include <asm/uaccess.h> 33#include <asm/uaccess.h>
@@ -39,24 +41,21 @@
39 41
40static struct kmem_cache *sigqueue_cachep; 42static struct kmem_cache *sigqueue_cachep;
41 43
42static int __sig_ignored(struct task_struct *t, int sig) 44static void __user *sig_handler(struct task_struct *t, int sig)
43{ 45{
44 void __user *handler; 46 return t->sighand->action[sig - 1].sa.sa_handler;
47}
45 48
49static int sig_handler_ignored(void __user *handler, int sig)
50{
46 /* Is it explicitly or implicitly ignored? */ 51 /* Is it explicitly or implicitly ignored? */
47
48 handler = t->sighand->action[sig - 1].sa.sa_handler;
49 return handler == SIG_IGN || 52 return handler == SIG_IGN ||
50 (handler == SIG_DFL && sig_kernel_ignore(sig)); 53 (handler == SIG_DFL && sig_kernel_ignore(sig));
51} 54}
52 55
53static int sig_ignored(struct task_struct *t, int sig) 56static int sig_ignored(struct task_struct *t, int sig)
54{ 57{
55 /* 58 void __user *handler;
56 * Tracers always want to know about signals..
57 */
58 if (t->ptrace & PT_PTRACED)
59 return 0;
60 59
61 /* 60 /*
62 * Blocked signals are never ignored, since the 61 * Blocked signals are never ignored, since the
@@ -66,7 +65,14 @@ static int sig_ignored(struct task_struct *t, int sig)
66 if (sigismember(&t->blocked, sig) || sigismember(&t->real_blocked, sig)) 65 if (sigismember(&t->blocked, sig) || sigismember(&t->real_blocked, sig))
67 return 0; 66 return 0;
68 67
69 return __sig_ignored(t, sig); 68 handler = sig_handler(t, sig);
69 if (!sig_handler_ignored(handler, sig))
70 return 0;
71
72 /*
73 * Tracers may want to know about even ignored signals.
74 */
75 return !tracehook_consider_ignored_signal(t, sig, handler);
70} 76}
71 77
72/* 78/*
@@ -129,7 +135,9 @@ void recalc_sigpending_and_wake(struct task_struct *t)
129 135
130void recalc_sigpending(void) 136void recalc_sigpending(void)
131{ 137{
132 if (!recalc_sigpending_tsk(current) && !freezing(current)) 138 if (unlikely(tracehook_force_sigpending()))
139 set_thread_flag(TIF_SIGPENDING);
140 else if (!recalc_sigpending_tsk(current) && !freezing(current))
133 clear_thread_flag(TIF_SIGPENDING); 141 clear_thread_flag(TIF_SIGPENDING);
134 142
135} 143}
@@ -295,12 +303,12 @@ flush_signal_handlers(struct task_struct *t, int force_default)
295 303
296int unhandled_signal(struct task_struct *tsk, int sig) 304int unhandled_signal(struct task_struct *tsk, int sig)
297{ 305{
306 void __user *handler = tsk->sighand->action[sig-1].sa.sa_handler;
298 if (is_global_init(tsk)) 307 if (is_global_init(tsk))
299 return 1; 308 return 1;
300 if (tsk->ptrace & PT_PTRACED) 309 if (handler != SIG_IGN && handler != SIG_DFL)
301 return 0; 310 return 0;
302 return (tsk->sighand->action[sig-1].sa.sa_handler == SIG_IGN) || 311 return !tracehook_consider_fatal_signal(tsk, sig, handler);
303 (tsk->sighand->action[sig-1].sa.sa_handler == SIG_DFL);
304} 312}
305 313
306 314
@@ -338,13 +346,9 @@ unblock_all_signals(void)
338 spin_unlock_irqrestore(&current->sighand->siglock, flags); 346 spin_unlock_irqrestore(&current->sighand->siglock, flags);
339} 347}
340 348
341static int collect_signal(int sig, struct sigpending *list, siginfo_t *info) 349static void collect_signal(int sig, struct sigpending *list, siginfo_t *info)
342{ 350{
343 struct sigqueue *q, *first = NULL; 351 struct sigqueue *q, *first = NULL;
344 int still_pending = 0;
345
346 if (unlikely(!sigismember(&list->signal, sig)))
347 return 0;
348 352
349 /* 353 /*
350 * Collect the siginfo appropriate to this signal. Check if 354 * Collect the siginfo appropriate to this signal. Check if
@@ -352,33 +356,30 @@ static int collect_signal(int sig, struct sigpending *list, siginfo_t *info)
352 */ 356 */
353 list_for_each_entry(q, &list->list, list) { 357 list_for_each_entry(q, &list->list, list) {
354 if (q->info.si_signo == sig) { 358 if (q->info.si_signo == sig) {
355 if (first) { 359 if (first)
356 still_pending = 1; 360 goto still_pending;
357 break;
358 }
359 first = q; 361 first = q;
360 } 362 }
361 } 363 }
364
365 sigdelset(&list->signal, sig);
366
362 if (first) { 367 if (first) {
368still_pending:
363 list_del_init(&first->list); 369 list_del_init(&first->list);
364 copy_siginfo(info, &first->info); 370 copy_siginfo(info, &first->info);
365 __sigqueue_free(first); 371 __sigqueue_free(first);
366 if (!still_pending)
367 sigdelset(&list->signal, sig);
368 } else { 372 } else {
369
370 /* Ok, it wasn't in the queue. This must be 373 /* Ok, it wasn't in the queue. This must be
371 a fast-pathed signal or we must have been 374 a fast-pathed signal or we must have been
372 out of queue space. So zero out the info. 375 out of queue space. So zero out the info.
373 */ 376 */
374 sigdelset(&list->signal, sig);
375 info->si_signo = sig; 377 info->si_signo = sig;
376 info->si_errno = 0; 378 info->si_errno = 0;
377 info->si_code = 0; 379 info->si_code = 0;
378 info->si_pid = 0; 380 info->si_pid = 0;
379 info->si_uid = 0; 381 info->si_uid = 0;
380 } 382 }
381 return 1;
382} 383}
383 384
384static int __dequeue_signal(struct sigpending *pending, sigset_t *mask, 385static int __dequeue_signal(struct sigpending *pending, sigset_t *mask,
@@ -396,8 +397,7 @@ static int __dequeue_signal(struct sigpending *pending, sigset_t *mask,
396 } 397 }
397 } 398 }
398 399
399 if (!collect_signal(sig, pending, info)) 400 collect_signal(sig, pending, info);
400 sig = 0;
401 } 401 }
402 402
403 return sig; 403 return sig;
@@ -462,8 +462,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
462 * is to alert stop-signal processing code when another 462 * is to alert stop-signal processing code when another
463 * processor has come along and cleared the flag. 463 * processor has come along and cleared the flag.
464 */ 464 */
465 if (!(tsk->signal->flags & SIGNAL_GROUP_EXIT)) 465 tsk->signal->flags |= SIGNAL_STOP_DEQUEUED;
466 tsk->signal->flags |= SIGNAL_STOP_DEQUEUED;
467 } 466 }
468 if ((info->si_code & __SI_MASK) == __SI_TIMER && info->si_sys_private) { 467 if ((info->si_code & __SI_MASK) == __SI_TIMER && info->si_sys_private) {
469 /* 468 /*
@@ -600,9 +599,6 @@ static int check_kill_permission(int sig, struct siginfo *info,
600 return security_task_kill(t, info, sig, 0); 599 return security_task_kill(t, info, sig, 0);
601} 600}
602 601
603/* forward decl */
604static void do_notify_parent_cldstop(struct task_struct *tsk, int why);
605
606/* 602/*
607 * Handle magic process-wide effects of stop/continue signals. Unlike 603 * Handle magic process-wide effects of stop/continue signals. Unlike
608 * the signal actions, these happen immediately at signal-generation 604 * the signal actions, these happen immediately at signal-generation
@@ -765,7 +761,8 @@ static void complete_signal(int sig, struct task_struct *p, int group)
765 if (sig_fatal(p, sig) && 761 if (sig_fatal(p, sig) &&
766 !(signal->flags & (SIGNAL_UNKILLABLE | SIGNAL_GROUP_EXIT)) && 762 !(signal->flags & (SIGNAL_UNKILLABLE | SIGNAL_GROUP_EXIT)) &&
767 !sigismember(&t->real_blocked, sig) && 763 !sigismember(&t->real_blocked, sig) &&
768 (sig == SIGKILL || !(t->ptrace & PT_PTRACED))) { 764 (sig == SIGKILL ||
765 !tracehook_consider_fatal_signal(t, sig, SIG_DFL))) {
769 /* 766 /*
770 * This signal will be fatal to the whole group. 767 * This signal will be fatal to the whole group.
771 */ 768 */
@@ -807,6 +804,8 @@ static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
807 struct sigpending *pending; 804 struct sigpending *pending;
808 struct sigqueue *q; 805 struct sigqueue *q;
809 806
807 trace_sched_signal_send(sig, t);
808
810 assert_spin_locked(&t->sighand->siglock); 809 assert_spin_locked(&t->sighand->siglock);
811 if (!prepare_signal(sig, t)) 810 if (!prepare_signal(sig, t))
812 return 0; 811 return 0;
@@ -1125,7 +1124,7 @@ EXPORT_SYMBOL_GPL(kill_pid_info_as_uid);
1125 * is probably wrong. Should make it like BSD or SYSV. 1124 * is probably wrong. Should make it like BSD or SYSV.
1126 */ 1125 */
1127 1126
1128static int kill_something_info(int sig, struct siginfo *info, int pid) 1127static int kill_something_info(int sig, struct siginfo *info, pid_t pid)
1129{ 1128{
1130 int ret; 1129 int ret;
1131 1130
@@ -1237,17 +1236,6 @@ int kill_pid(struct pid *pid, int sig, int priv)
1237} 1236}
1238EXPORT_SYMBOL(kill_pid); 1237EXPORT_SYMBOL(kill_pid);
1239 1238
1240int
1241kill_proc(pid_t pid, int sig, int priv)
1242{
1243 int ret;
1244
1245 rcu_read_lock();
1246 ret = kill_pid_info(sig, __si_special(priv), find_pid(pid));
1247 rcu_read_unlock();
1248 return ret;
1249}
1250
1251/* 1239/*
1252 * These functions support sending signals using preallocated sigqueue 1240 * These functions support sending signals using preallocated sigqueue
1253 * structures. This is needed "because realtime applications cannot 1241 * structures. This is needed "because realtime applications cannot
@@ -1319,6 +1307,7 @@ int send_sigqueue(struct sigqueue *q, struct task_struct *t, int group)
1319 q->info.si_overrun++; 1307 q->info.si_overrun++;
1320 goto out; 1308 goto out;
1321 } 1309 }
1310 q->info.si_overrun = 0;
1322 1311
1323 signalfd_notify(t, sig); 1312 signalfd_notify(t, sig);
1324 pending = group ? &t->signal->shared_pending : &t->pending; 1313 pending = group ? &t->signal->shared_pending : &t->pending;
@@ -1343,13 +1332,17 @@ static inline void __wake_up_parent(struct task_struct *p,
1343/* 1332/*
1344 * Let a parent know about the death of a child. 1333 * Let a parent know about the death of a child.
1345 * For a stopped/continued status change, use do_notify_parent_cldstop instead. 1334 * For a stopped/continued status change, use do_notify_parent_cldstop instead.
1335 *
1336 * Returns -1 if our parent ignored us and so we've switched to
1337 * self-reaping, or else @sig.
1346 */ 1338 */
1347 1339int do_notify_parent(struct task_struct *tsk, int sig)
1348void do_notify_parent(struct task_struct *tsk, int sig)
1349{ 1340{
1350 struct siginfo info; 1341 struct siginfo info;
1351 unsigned long flags; 1342 unsigned long flags;
1352 struct sighand_struct *psig; 1343 struct sighand_struct *psig;
1344 struct task_cputime cputime;
1345 int ret = sig;
1353 1346
1354 BUG_ON(sig == -1); 1347 BUG_ON(sig == -1);
1355 1348
@@ -1379,11 +1372,9 @@ void do_notify_parent(struct task_struct *tsk, int sig)
1379 1372
1380 info.si_uid = tsk->uid; 1373 info.si_uid = tsk->uid;
1381 1374
1382 /* FIXME: find out whether or not this is supposed to be c*time. */ 1375 thread_group_cputime(tsk, &cputime);
1383 info.si_utime = cputime_to_jiffies(cputime_add(tsk->utime, 1376 info.si_utime = cputime_to_jiffies(cputime.utime);
1384 tsk->signal->utime)); 1377 info.si_stime = cputime_to_jiffies(cputime.stime);
1385 info.si_stime = cputime_to_jiffies(cputime_add(tsk->stime,
1386 tsk->signal->stime));
1387 1378
1388 info.si_status = tsk->exit_code & 0x7f; 1379 info.si_status = tsk->exit_code & 0x7f;
1389 if (tsk->exit_code & 0x80) 1380 if (tsk->exit_code & 0x80)
@@ -1415,14 +1406,16 @@ void do_notify_parent(struct task_struct *tsk, int sig)
1415 * is implementation-defined: we do (if you don't want 1406 * is implementation-defined: we do (if you don't want
1416 * it, just use SIG_IGN instead). 1407 * it, just use SIG_IGN instead).
1417 */ 1408 */
1418 tsk->exit_signal = -1; 1409 ret = tsk->exit_signal = -1;
1419 if (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN) 1410 if (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN)
1420 sig = 0; 1411 sig = -1;
1421 } 1412 }
1422 if (valid_signal(sig) && sig > 0) 1413 if (valid_signal(sig) && sig > 0)
1423 __group_send_sig_info(sig, &info, tsk->parent); 1414 __group_send_sig_info(sig, &info, tsk->parent);
1424 __wake_up_parent(tsk, tsk->parent); 1415 __wake_up_parent(tsk, tsk->parent);
1425 spin_unlock_irqrestore(&psig->siglock, flags); 1416 spin_unlock_irqrestore(&psig->siglock, flags);
1417
1418 return ret;
1426} 1419}
1427 1420
1428static void do_notify_parent_cldstop(struct task_struct *tsk, int why) 1421static void do_notify_parent_cldstop(struct task_struct *tsk, int why)
@@ -1450,9 +1443,8 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, int why)
1450 1443
1451 info.si_uid = tsk->uid; 1444 info.si_uid = tsk->uid;
1452 1445
1453 /* FIXME: find out whether or not this is supposed to be c*time. */ 1446 info.si_utime = cputime_to_clock_t(tsk->utime);
1454 info.si_utime = cputime_to_jiffies(tsk->utime); 1447 info.si_stime = cputime_to_clock_t(tsk->stime);
1455 info.si_stime = cputime_to_jiffies(tsk->stime);
1456 1448
1457 info.si_code = why; 1449 info.si_code = why;
1458 switch (why) { 1450 switch (why) {
@@ -1491,10 +1483,10 @@ static inline int may_ptrace_stop(void)
1491 * is a deadlock situation, and pointless because our tracer 1483 * is a deadlock situation, and pointless because our tracer
1492 * is dead so don't allow us to stop. 1484 * is dead so don't allow us to stop.
1493 * If SIGKILL was already sent before the caller unlocked 1485 * If SIGKILL was already sent before the caller unlocked
1494 * ->siglock we must see ->core_waiters != 0. Otherwise it 1486 * ->siglock we must see ->core_state != NULL. Otherwise it
1495 * is safe to enter schedule(). 1487 * is safe to enter schedule().
1496 */ 1488 */
1497 if (unlikely(current->mm->core_waiters) && 1489 if (unlikely(current->mm->core_state) &&
1498 unlikely(current->mm == current->parent->mm)) 1490 unlikely(current->mm == current->parent->mm))
1499 return 0; 1491 return 0;
1500 1492
@@ -1507,9 +1499,8 @@ static inline int may_ptrace_stop(void)
1507 */ 1499 */
1508static int sigkill_pending(struct task_struct *tsk) 1500static int sigkill_pending(struct task_struct *tsk)
1509{ 1501{
1510 return ((sigismember(&tsk->pending.signal, SIGKILL) || 1502 return sigismember(&tsk->pending.signal, SIGKILL) ||
1511 sigismember(&tsk->signal->shared_pending.signal, SIGKILL)) && 1503 sigismember(&tsk->signal->shared_pending.signal, SIGKILL);
1512 !unlikely(sigismember(&tsk->blocked, SIGKILL)));
1513} 1504}
1514 1505
1515/* 1506/*
@@ -1525,8 +1516,6 @@ static int sigkill_pending(struct task_struct *tsk)
1525 */ 1516 */
1526static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info) 1517static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info)
1527{ 1518{
1528 int killed = 0;
1529
1530 if (arch_ptrace_stop_needed(exit_code, info)) { 1519 if (arch_ptrace_stop_needed(exit_code, info)) {
1531 /* 1520 /*
1532 * The arch code has something special to do before a 1521 * The arch code has something special to do before a
@@ -1542,7 +1531,8 @@ static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info)
1542 spin_unlock_irq(&current->sighand->siglock); 1531 spin_unlock_irq(&current->sighand->siglock);
1543 arch_ptrace_stop(exit_code, info); 1532 arch_ptrace_stop(exit_code, info);
1544 spin_lock_irq(&current->sighand->siglock); 1533 spin_lock_irq(&current->sighand->siglock);
1545 killed = sigkill_pending(current); 1534 if (sigkill_pending(current))
1535 return;
1546 } 1536 }
1547 1537
1548 /* 1538 /*
@@ -1559,7 +1549,7 @@ static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info)
1559 __set_current_state(TASK_TRACED); 1549 __set_current_state(TASK_TRACED);
1560 spin_unlock_irq(&current->sighand->siglock); 1550 spin_unlock_irq(&current->sighand->siglock);
1561 read_lock(&tasklist_lock); 1551 read_lock(&tasklist_lock);
1562 if (!unlikely(killed) && may_ptrace_stop()) { 1552 if (may_ptrace_stop()) {
1563 do_notify_parent_cldstop(current, CLD_TRAPPED); 1553 do_notify_parent_cldstop(current, CLD_TRAPPED);
1564 read_unlock(&tasklist_lock); 1554 read_unlock(&tasklist_lock);
1565 schedule(); 1555 schedule();
@@ -1623,7 +1613,7 @@ finish_stop(int stop_count)
1623 * a group stop in progress and we are the last to stop, 1613 * a group stop in progress and we are the last to stop,
1624 * report to the parent. When ptraced, every thread reports itself. 1614 * report to the parent. When ptraced, every thread reports itself.
1625 */ 1615 */
1626 if (stop_count == 0 || (current->ptrace & PT_PTRACED)) { 1616 if (tracehook_notify_jctl(stop_count == 0, CLD_STOPPED)) {
1627 read_lock(&tasklist_lock); 1617 read_lock(&tasklist_lock);
1628 do_notify_parent_cldstop(current, CLD_STOPPED); 1618 do_notify_parent_cldstop(current, CLD_STOPPED);
1629 read_unlock(&tasklist_lock); 1619 read_unlock(&tasklist_lock);
@@ -1658,8 +1648,7 @@ static int do_signal_stop(int signr)
1658 } else { 1648 } else {
1659 struct task_struct *t; 1649 struct task_struct *t;
1660 1650
1661 if (unlikely((sig->flags & (SIGNAL_STOP_DEQUEUED | SIGNAL_UNKILLABLE)) 1651 if (!likely(sig->flags & SIGNAL_STOP_DEQUEUED) ||
1662 != SIGNAL_STOP_DEQUEUED) ||
1663 unlikely(signal_group_exit(sig))) 1652 unlikely(signal_group_exit(sig)))
1664 return 0; 1653 return 0;
1665 /* 1654 /*
@@ -1760,6 +1749,9 @@ relock:
1760 signal->flags &= ~SIGNAL_CLD_MASK; 1749 signal->flags &= ~SIGNAL_CLD_MASK;
1761 spin_unlock_irq(&sighand->siglock); 1750 spin_unlock_irq(&sighand->siglock);
1762 1751
1752 if (unlikely(!tracehook_notify_jctl(1, why)))
1753 goto relock;
1754
1763 read_lock(&tasklist_lock); 1755 read_lock(&tasklist_lock);
1764 do_notify_parent_cldstop(current->group_leader, why); 1756 do_notify_parent_cldstop(current->group_leader, why);
1765 read_unlock(&tasklist_lock); 1757 read_unlock(&tasklist_lock);
@@ -1773,17 +1765,33 @@ relock:
1773 do_signal_stop(0)) 1765 do_signal_stop(0))
1774 goto relock; 1766 goto relock;
1775 1767
1776 signr = dequeue_signal(current, &current->blocked, info); 1768 /*
 1777 if (!signr) 1769 * Tracing can induce an artificial signal and choose sigaction.
1778 break; /* will return 0 */ 1770 * The return value in @signr determines the default action,
1771 * but @info->si_signo is the signal number we will report.
1772 */
1773 signr = tracehook_get_signal(current, regs, info, return_ka);
1774 if (unlikely(signr < 0))
1775 goto relock;
1776 if (unlikely(signr != 0))
1777 ka = return_ka;
1778 else {
1779 signr = dequeue_signal(current, &current->blocked,
1780 info);
1779 1781
1780 if (signr != SIGKILL) {
1781 signr = ptrace_signal(signr, info, regs, cookie);
1782 if (!signr) 1782 if (!signr)
1783 continue; 1783 break; /* will return 0 */
1784
1785 if (signr != SIGKILL) {
1786 signr = ptrace_signal(signr, info,
1787 regs, cookie);
1788 if (!signr)
1789 continue;
1790 }
1791
1792 ka = &sighand->action[signr-1];
1784 } 1793 }
1785 1794
1786 ka = &sighand->action[signr-1];
1787 if (ka->sa.sa_handler == SIG_IGN) /* Do nothing. */ 1795 if (ka->sa.sa_handler == SIG_IGN) /* Do nothing. */
1788 continue; 1796 continue;
1789 if (ka->sa.sa_handler != SIG_DFL) { 1797 if (ka->sa.sa_handler != SIG_DFL) {
@@ -1831,7 +1839,7 @@ relock:
1831 spin_lock_irq(&sighand->siglock); 1839 spin_lock_irq(&sighand->siglock);
1832 } 1840 }
1833 1841
1834 if (likely(do_signal_stop(signr))) { 1842 if (likely(do_signal_stop(info->si_signo))) {
1835 /* It released the siglock. */ 1843 /* It released the siglock. */
1836 goto relock; 1844 goto relock;
1837 } 1845 }
@@ -1852,7 +1860,7 @@ relock:
1852 1860
1853 if (sig_kernel_coredump(signr)) { 1861 if (sig_kernel_coredump(signr)) {
1854 if (print_fatal_signals) 1862 if (print_fatal_signals)
1855 print_fatal_signal(regs, signr); 1863 print_fatal_signal(regs, info->si_signo);
1856 /* 1864 /*
1857 * If it was able to dump core, this kills all 1865 * If it was able to dump core, this kills all
1858 * other threads in the group and synchronizes with 1866 * other threads in the group and synchronizes with
@@ -1861,13 +1869,13 @@ relock:
1861 * first and our do_group_exit call below will use 1869 * first and our do_group_exit call below will use
1862 * that value and ignore the one we pass it. 1870 * that value and ignore the one we pass it.
1863 */ 1871 */
1864 do_coredump((long)signr, signr, regs); 1872 do_coredump(info->si_signo, info->si_signo, regs);
1865 } 1873 }
1866 1874
1867 /* 1875 /*
1868 * Death signals, no core dump. 1876 * Death signals, no core dump.
1869 */ 1877 */
1870 do_group_exit(signr); 1878 do_group_exit(info->si_signo);
1871 /* NOTREACHED */ 1879 /* NOTREACHED */
1872 } 1880 }
1873 spin_unlock_irq(&sighand->siglock); 1881 spin_unlock_irq(&sighand->siglock);
@@ -1909,7 +1917,7 @@ void exit_signals(struct task_struct *tsk)
1909out: 1917out:
1910 spin_unlock_irq(&tsk->sighand->siglock); 1918 spin_unlock_irq(&tsk->sighand->siglock);
1911 1919
1912 if (unlikely(group_stop)) { 1920 if (unlikely(group_stop) && tracehook_notify_jctl(1, CLD_STOPPED)) {
1913 read_lock(&tasklist_lock); 1921 read_lock(&tasklist_lock);
1914 do_notify_parent_cldstop(tsk, CLD_STOPPED); 1922 do_notify_parent_cldstop(tsk, CLD_STOPPED);
1915 read_unlock(&tasklist_lock); 1923 read_unlock(&tasklist_lock);
@@ -1920,8 +1928,6 @@ EXPORT_SYMBOL(recalc_sigpending);
1920EXPORT_SYMBOL_GPL(dequeue_signal); 1928EXPORT_SYMBOL_GPL(dequeue_signal);
1921EXPORT_SYMBOL(flush_signals); 1929EXPORT_SYMBOL(flush_signals);
1922EXPORT_SYMBOL(force_sig); 1930EXPORT_SYMBOL(force_sig);
1923EXPORT_SYMBOL(kill_proc);
1924EXPORT_SYMBOL(ptrace_notify);
1925EXPORT_SYMBOL(send_sig); 1931EXPORT_SYMBOL(send_sig);
1926EXPORT_SYMBOL(send_sig_info); 1932EXPORT_SYMBOL(send_sig_info);
1927EXPORT_SYMBOL(sigprocmask); 1933EXPORT_SYMBOL(sigprocmask);
@@ -2196,7 +2202,7 @@ sys_rt_sigtimedwait(const sigset_t __user *uthese,
2196} 2202}
2197 2203
2198asmlinkage long 2204asmlinkage long
2199sys_kill(int pid, int sig) 2205sys_kill(pid_t pid, int sig)
2200{ 2206{
2201 struct siginfo info; 2207 struct siginfo info;
2202 2208
@@ -2209,7 +2215,7 @@ sys_kill(int pid, int sig)
2209 return kill_something_info(sig, &info, pid); 2215 return kill_something_info(sig, &info, pid);
2210} 2216}
2211 2217
2212static int do_tkill(int tgid, int pid, int sig) 2218static int do_tkill(pid_t tgid, pid_t pid, int sig)
2213{ 2219{
2214 int error; 2220 int error;
2215 struct siginfo info; 2221 struct siginfo info;
@@ -2255,7 +2261,7 @@ static int do_tkill(int tgid, int pid, int sig)
2255 * exists but it's not belonging to the target process anymore. This 2261 * exists but it's not belonging to the target process anymore. This
2256 * method solves the problem of threads exiting and PIDs getting reused. 2262 * method solves the problem of threads exiting and PIDs getting reused.
2257 */ 2263 */
2258asmlinkage long sys_tgkill(int tgid, int pid, int sig) 2264asmlinkage long sys_tgkill(pid_t tgid, pid_t pid, int sig)
2259{ 2265{
2260 /* This is only valid for single tasks */ 2266 /* This is only valid for single tasks */
2261 if (pid <= 0 || tgid <= 0) 2267 if (pid <= 0 || tgid <= 0)
@@ -2268,7 +2274,7 @@ asmlinkage long sys_tgkill(int tgid, int pid, int sig)
2268 * Send a signal to only one task, even if it's a CLONE_THREAD task. 2274 * Send a signal to only one task, even if it's a CLONE_THREAD task.
2269 */ 2275 */
2270asmlinkage long 2276asmlinkage long
2271sys_tkill(int pid, int sig) 2277sys_tkill(pid_t pid, int sig)
2272{ 2278{
2273 /* This is only valid for single tasks */ 2279 /* This is only valid for single tasks */
2274 if (pid <= 0) 2280 if (pid <= 0)
@@ -2278,7 +2284,7 @@ sys_tkill(int pid, int sig)
2278} 2284}
2279 2285
2280asmlinkage long 2286asmlinkage long
2281sys_rt_sigqueueinfo(int pid, int sig, siginfo_t __user *uinfo) 2287sys_rt_sigqueueinfo(pid_t pid, int sig, siginfo_t __user *uinfo)
2282{ 2288{
2283 siginfo_t info; 2289 siginfo_t info;
2284 2290
@@ -2325,7 +2331,7 @@ int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
2325 * (for example, SIGCHLD), shall cause the pending signal to 2331 * (for example, SIGCHLD), shall cause the pending signal to
2326 * be discarded, whether or not it is blocked" 2332 * be discarded, whether or not it is blocked"
2327 */ 2333 */
2328 if (__sig_ignored(t, sig)) { 2334 if (sig_handler_ignored(sig_handler(t, sig), sig)) {
2329 sigemptyset(&mask); 2335 sigemptyset(&mask);
2330 sigaddset(&mask, sig); 2336 sigaddset(&mask, sig);
2331 rm_from_queue_full(&mask, &t->signal->shared_pending); 2337 rm_from_queue_full(&mask, &t->signal->shared_pending);
diff --git a/kernel/smp.c b/kernel/smp.c
index 462c785ca1ee..f362a8553777 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -33,7 +33,7 @@ struct call_single_queue {
33 spinlock_t lock; 33 spinlock_t lock;
34}; 34};
35 35
36void __cpuinit init_call_single_data(void) 36static int __cpuinit init_call_single_data(void)
37{ 37{
38 int i; 38 int i;
39 39
@@ -43,7 +43,9 @@ void __cpuinit init_call_single_data(void)
43 spin_lock_init(&q->lock); 43 spin_lock_init(&q->lock);
44 INIT_LIST_HEAD(&q->list); 44 INIT_LIST_HEAD(&q->list);
45 } 45 }
46 return 0;
46} 47}
48early_initcall(init_call_single_data);
47 49
48static void csd_flag_wait(struct call_single_data *data) 50static void csd_flag_wait(struct call_single_data *data)
49{ 51{
@@ -133,7 +135,8 @@ void generic_smp_call_function_interrupt(void)
133 */ 135 */
134 smp_wmb(); 136 smp_wmb();
135 data->csd.flags &= ~CSD_FLAG_WAIT; 137 data->csd.flags &= ~CSD_FLAG_WAIT;
136 } else 138 }
139 if (data->csd.flags & CSD_FLAG_ALLOC)
137 call_rcu(&data->rcu_head, rcu_free_call_data); 140 call_rcu(&data->rcu_head, rcu_free_call_data);
138 } 141 }
139 rcu_read_unlock(); 142 rcu_read_unlock();
@@ -207,8 +210,10 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
207{ 210{
208 struct call_single_data d; 211 struct call_single_data d;
209 unsigned long flags; 212 unsigned long flags;
210 /* prevent preemption and reschedule on another processor */ 213 /* prevent preemption and reschedule on another processor,
214 as well as CPU removal */
211 int me = get_cpu(); 215 int me = get_cpu();
216 int err = 0;
212 217
213 /* Can deadlock when called with interrupts disabled */ 218 /* Can deadlock when called with interrupts disabled */
214 WARN_ON(irqs_disabled()); 219 WARN_ON(irqs_disabled());
@@ -217,7 +222,7 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
217 local_irq_save(flags); 222 local_irq_save(flags);
218 func(info); 223 func(info);
219 local_irq_restore(flags); 224 local_irq_restore(flags);
220 } else { 225 } else if ((unsigned)cpu < NR_CPUS && cpu_online(cpu)) {
221 struct call_single_data *data = NULL; 226 struct call_single_data *data = NULL;
222 227
223 if (!wait) { 228 if (!wait) {
@@ -233,10 +238,12 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
233 data->func = func; 238 data->func = func;
234 data->info = info; 239 data->info = info;
235 generic_exec_single(cpu, data); 240 generic_exec_single(cpu, data);
241 } else {
242 err = -ENXIO; /* CPU not online */
236 } 243 }
237 244
238 put_cpu(); 245 put_cpu();
239 return 0; 246 return err;
240} 247}
241EXPORT_SYMBOL(smp_call_function_single); 248EXPORT_SYMBOL(smp_call_function_single);
242 249
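With the change above, smp_call_function_single() reports an offline target CPU as -ENXIO instead of silently returning 0. A minimal kernel-side caller sketch, assuming process context; the helper function and the CPU number are invented for illustration:

#include <linux/jiffies.h>
#include <linux/kernel.h>
#include <linux/smp.h>

static void read_remote_jiffies(void *info)
{
	unsigned long *val = info;

	/* Runs on the target CPU, in IPI (interrupt) context. */
	*val = jiffies;
}

static int sample_cpu2(void)
{
	unsigned long val = 0;
	int err;

	/* wait = 1: block until the function has run on CPU 2. */
	err = smp_call_function_single(2, read_remote_jiffies, &val, 1);
	if (err)	/* -ENXIO if CPU 2 is not online */
		return err;

	printk(KERN_INFO "cpu2 jiffies snapshot: %lu\n", val);
	return 0;
}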
@@ -258,6 +265,42 @@ void __smp_call_function_single(int cpu, struct call_single_data *data)
258 generic_exec_single(cpu, data); 265 generic_exec_single(cpu, data);
259} 266}
260 267
268/* Dummy function */
269static void quiesce_dummy(void *unused)
270{
271}
272
273/*
274 * Ensure stack based data used in call function mask is safe to free.
275 *
276 * This is needed by smp_call_function_mask when using on-stack data, because
277 * a single call function queue is shared by all CPUs, and any CPU may pick up
278 * the data item on the queue at any time before it is deleted. So we need to
279 * ensure that all CPUs have transitioned through a quiescent state after
280 * this call.
281 *
282 * This is a very slow function, implemented by sending synchronous IPIs to
283 * all possible CPUs. For this reason, we have to alloc data rather than use
284 * stack based data even in the case of synchronous calls. The stack based
285 * data is then just used for deadlock/oom fallback which will be very rare.
286 *
287 * If a faster scheme can be made, we could go back to preferring stack based
288 * data -- the data allocation/free is non-zero cost.
289 */
290static void smp_call_function_mask_quiesce_stack(cpumask_t mask)
291{
292 struct call_single_data data;
293 int cpu;
294
295 data.func = quiesce_dummy;
296 data.info = NULL;
297
298 for_each_cpu_mask(cpu, mask) {
299 data.flags = CSD_FLAG_WAIT;
300 generic_exec_single(cpu, &data);
301 }
302}
303
261/** 304/**
262 * smp_call_function_mask(): Run a function on a set of other CPUs. 305 * smp_call_function_mask(): Run a function on a set of other CPUs.
263 * @mask: The set of cpus to run on. 306 * @mask: The set of cpus to run on.
@@ -283,6 +326,7 @@ int smp_call_function_mask(cpumask_t mask, void (*func)(void *), void *info,
283 cpumask_t allbutself; 326 cpumask_t allbutself;
284 unsigned long flags; 327 unsigned long flags;
285 int cpu, num_cpus; 328 int cpu, num_cpus;
329 int slowpath = 0;
286 330
287 /* Can deadlock when called with interrupts disabled */ 331 /* Can deadlock when called with interrupts disabled */
288 WARN_ON(irqs_disabled()); 332 WARN_ON(irqs_disabled());
@@ -304,15 +348,16 @@ int smp_call_function_mask(cpumask_t mask, void (*func)(void *), void *info,
304 return smp_call_function_single(cpu, func, info, wait); 348 return smp_call_function_single(cpu, func, info, wait);
305 } 349 }
306 350
307 if (!wait) { 351 data = kmalloc(sizeof(*data), GFP_ATOMIC);
308 data = kmalloc(sizeof(*data), GFP_ATOMIC); 352 if (data) {
309 if (data) 353 data->csd.flags = CSD_FLAG_ALLOC;
310 data->csd.flags = CSD_FLAG_ALLOC; 354 if (wait)
311 } 355 data->csd.flags |= CSD_FLAG_WAIT;
312 if (!data) { 356 } else {
313 data = &d; 357 data = &d;
314 data->csd.flags = CSD_FLAG_WAIT; 358 data->csd.flags = CSD_FLAG_WAIT;
315 wait = 1; 359 wait = 1;
360 slowpath = 1;
316 } 361 }
317 362
318 spin_lock_init(&data->lock); 363 spin_lock_init(&data->lock);
@@ -329,8 +374,11 @@ int smp_call_function_mask(cpumask_t mask, void (*func)(void *), void *info,
329 arch_send_call_function_ipi(mask); 374 arch_send_call_function_ipi(mask);
330 375
331 /* optionally wait for the CPUs to complete */ 376 /* optionally wait for the CPUs to complete */
332 if (wait) 377 if (wait) {
333 csd_flag_wait(&data->csd); 378 csd_flag_wait(&data->csd);
379 if (unlikely(slowpath))
380 smp_call_function_mask_quiesce_stack(mask);
381 }
334 382
335 return 0; 383 return 0;
336} 384}
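The kmalloc()-first logic and the quiescing slow path above are internal to smp.c; callers keep using the same interface whether the IPI data ended up heap- or stack-allocated. A hedged caller sketch (the flush function is invented):

#include <linux/cpumask.h>
#include <linux/smp.h>

static void flush_local_cache(void *unused)
{
	/* Per-CPU work, runs in IPI context on each CPU in the mask. */
}

static void flush_all_other_cpus(void)
{
	cpumask_t mask = cpu_online_map;
	int this_cpu = get_cpu();	/* stay on this CPU while sending IPIs */

	cpu_clear(this_cpu, mask);
	/* wait = 1 blocks until every CPU in the mask has run the function;
	 * the allocation fallback and stack quiescing above stay invisible. */
	smp_call_function_mask(mask, flush_local_cache, NULL, 1);
	put_cpu();
}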
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 81e2fe0f983a..7110daeb9a90 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -6,6 +6,8 @@
6 * Distribute under GPLv2. 6 * Distribute under GPLv2.
7 * 7 *
8 * Rewritten. Old one was good in 2.2, but in 2.3 it was immoral. --ANK (990903) 8 * Rewritten. Old one was good in 2.2, but in 2.3 it was immoral. --ANK (990903)
9 *
10 * Remote softirq infrastructure is by Jens Axboe.
9 */ 11 */
10 12
11#include <linux/module.h> 13#include <linux/module.h>
@@ -46,7 +48,7 @@ irq_cpustat_t irq_stat[NR_CPUS] ____cacheline_aligned;
46EXPORT_SYMBOL(irq_stat); 48EXPORT_SYMBOL(irq_stat);
47#endif 49#endif
48 50
49static struct softirq_action softirq_vec[32] __cacheline_aligned_in_smp; 51static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp;
50 52
51static DEFINE_PER_CPU(struct task_struct *, ksoftirqd); 53static DEFINE_PER_CPU(struct task_struct *, ksoftirqd);
52 54
@@ -205,7 +207,18 @@ restart:
205 207
206 do { 208 do {
207 if (pending & 1) { 209 if (pending & 1) {
210 int prev_count = preempt_count();
211
208 h->action(h); 212 h->action(h);
213
214 if (unlikely(prev_count != preempt_count())) {
215				int prev_count_msg; /* see note below */
215				printk(KERN_ERR "huh, entered softirq %td %p "
216				       "with preempt_count %08x,"
217 " exited with %08x?\n", h - softirq_vec,
218 h->action, prev_count, preempt_count());
219 preempt_count() = prev_count;
220 }
221
209 rcu_bh_qsctr_inc(cpu); 222 rcu_bh_qsctr_inc(cpu);
210 } 223 }
211 h++; 224 h++;
@@ -254,16 +267,12 @@ asmlinkage void do_softirq(void)
254 */ 267 */
255void irq_enter(void) 268void irq_enter(void)
256{ 269{
257#ifdef CONFIG_NO_HZ
258 int cpu = smp_processor_id(); 270 int cpu = smp_processor_id();
271
259 if (idle_cpu(cpu) && !in_interrupt()) 272 if (idle_cpu(cpu) && !in_interrupt())
260 tick_nohz_stop_idle(cpu); 273 tick_check_idle(cpu);
261#endif 274
262 __irq_enter(); 275 __irq_enter();
263#ifdef CONFIG_NO_HZ
264 if (idle_cpu(cpu))
265 tick_nohz_update_jiffies();
266#endif
267} 276}
268 277
269#ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED 278#ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED
@@ -286,7 +295,7 @@ void irq_exit(void)
286#ifdef CONFIG_NO_HZ 295#ifdef CONFIG_NO_HZ
287 /* Make sure that timer wheel updates are propagated */ 296 /* Make sure that timer wheel updates are propagated */
288 if (!in_interrupt() && idle_cpu(smp_processor_id()) && !need_resched()) 297 if (!in_interrupt() && idle_cpu(smp_processor_id()) && !need_resched())
289 tick_nohz_stop_sched_tick(); 298 tick_nohz_stop_sched_tick(0);
290 rcu_irq_exit(); 299 rcu_irq_exit();
291#endif 300#endif
292 preempt_enable_no_resched(); 301 preempt_enable_no_resched();
@@ -463,17 +472,144 @@ void tasklet_kill(struct tasklet_struct *t)
463 472
464EXPORT_SYMBOL(tasklet_kill); 473EXPORT_SYMBOL(tasklet_kill);
465 474
475DEFINE_PER_CPU(struct list_head [NR_SOFTIRQS], softirq_work_list);
476EXPORT_PER_CPU_SYMBOL(softirq_work_list);
477
478static void __local_trigger(struct call_single_data *cp, int softirq)
479{
480 struct list_head *head = &__get_cpu_var(softirq_work_list[softirq]);
481
482 list_add_tail(&cp->list, head);
483
484 /* Trigger the softirq only if the list was previously empty. */
485 if (head->next == &cp->list)
486 raise_softirq_irqoff(softirq);
487}
488
489#ifdef CONFIG_USE_GENERIC_SMP_HELPERS
490static void remote_softirq_receive(void *data)
491{
492 struct call_single_data *cp = data;
493 unsigned long flags;
494 int softirq;
495
496 softirq = cp->priv;
497
498 local_irq_save(flags);
499 __local_trigger(cp, softirq);
500 local_irq_restore(flags);
501}
502
503static int __try_remote_softirq(struct call_single_data *cp, int cpu, int softirq)
504{
505 if (cpu_online(cpu)) {
506 cp->func = remote_softirq_receive;
507 cp->info = cp;
508 cp->flags = 0;
509 cp->priv = softirq;
510
511 __smp_call_function_single(cpu, cp);
512 return 0;
513 }
514 return 1;
515}
516#else /* CONFIG_USE_GENERIC_SMP_HELPERS */
517static int __try_remote_softirq(struct call_single_data *cp, int cpu, int softirq)
518{
519 return 1;
520}
521#endif
522
523/**
524 * __send_remote_softirq - try to schedule softirq work on a remote cpu
525 * @cp: private SMP call function data area
526 * @cpu: the remote cpu
527 * @this_cpu: the currently executing cpu
528 * @softirq: the softirq for the work
529 *
530 * Attempt to schedule softirq work on a remote cpu. If this cannot be
531 * done, the work is instead queued up on the local cpu.
532 *
533 * Interrupts must be disabled.
534 */
535void __send_remote_softirq(struct call_single_data *cp, int cpu, int this_cpu, int softirq)
536{
537 if (cpu == this_cpu || __try_remote_softirq(cp, cpu, softirq))
538 __local_trigger(cp, softirq);
539}
540EXPORT_SYMBOL(__send_remote_softirq);
541
542/**
543 * send_remote_softirq - try to schedule softirq work on a remote cpu
544 * @cp: private SMP call function data area
545 * @cpu: the remote cpu
546 * @softirq: the softirq for the work
547 *
548 * Like __send_remote_softirq except that disabling interrupts and
549 * computing the current cpu is done for the caller.
550 */
551void send_remote_softirq(struct call_single_data *cp, int cpu, int softirq)
552{
553 unsigned long flags;
554 int this_cpu;
555
556 local_irq_save(flags);
557 this_cpu = smp_processor_id();
558 __send_remote_softirq(cp, cpu, this_cpu, softirq);
559 local_irq_restore(flags);
560}
561EXPORT_SYMBOL(send_remote_softirq);
562
563static int __cpuinit remote_softirq_cpu_notify(struct notifier_block *self,
564 unsigned long action, void *hcpu)
565{
566 /*
567 * If a CPU goes away, splice its entries to the current CPU
568 * and trigger a run of the softirq
569 */
570 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
571 int cpu = (unsigned long) hcpu;
572 int i;
573
574 local_irq_disable();
575 for (i = 0; i < NR_SOFTIRQS; i++) {
576 struct list_head *head = &per_cpu(softirq_work_list[i], cpu);
577 struct list_head *local_head;
578
579 if (list_empty(head))
580 continue;
581
582 local_head = &__get_cpu_var(softirq_work_list[i]);
583 list_splice_init(head, local_head);
584 raise_softirq_irqoff(i);
585 }
586 local_irq_enable();
587 }
588
589 return NOTIFY_OK;
590}
591
592static struct notifier_block __cpuinitdata remote_softirq_cpu_notifier = {
593 .notifier_call = remote_softirq_cpu_notify,
594};
595
466void __init softirq_init(void) 596void __init softirq_init(void)
467{ 597{
468 int cpu; 598 int cpu;
469 599
470 for_each_possible_cpu(cpu) { 600 for_each_possible_cpu(cpu) {
601 int i;
602
471 per_cpu(tasklet_vec, cpu).tail = 603 per_cpu(tasklet_vec, cpu).tail =
472 &per_cpu(tasklet_vec, cpu).head; 604 &per_cpu(tasklet_vec, cpu).head;
473 per_cpu(tasklet_hi_vec, cpu).tail = 605 per_cpu(tasklet_hi_vec, cpu).tail =
474 &per_cpu(tasklet_hi_vec, cpu).head; 606 &per_cpu(tasklet_hi_vec, cpu).head;
607 for (i = 0; i < NR_SOFTIRQS; i++)
608 INIT_LIST_HEAD(&per_cpu(softirq_work_list[i], cpu));
475 } 609 }
476 610
611 register_hotcpu_notifier(&remote_softirq_cpu_notifier);
612
477 open_softirq(TASKLET_SOFTIRQ, tasklet_action); 613 open_softirq(TASKLET_SOFTIRQ, tasklet_action);
478 open_softirq(HI_SOFTIRQ, tasklet_hi_action); 614 open_softirq(HI_SOFTIRQ, tasklet_hi_action);
479} 615}
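A hedged sketch of how a subsystem might drive the remote-softirq helpers added in the hunk above. The request structure and field names are invented; a real user registers its own softirq vector with open_softirq() and drains the matching per-CPU softirq_work_list[] entries from that handler:

#include <linux/interrupt.h>
#include <linux/smp.h>

struct my_request {
	struct call_single_data csd;	/* must stay valid until the softirq ran */
	int submit_cpu;			/* CPU that issued the request */
	int softirq_nr;			/* softirq vector this subsystem owns */
	/* ... payload ... */
};

/* Completion path; may run on a different CPU than submit_cpu. */
static void my_request_complete(struct my_request *rq)
{
	/* Queue rq->csd on submit_cpu's softirq_work_list[softirq_nr] and
	 * raise that softirq there, falling back to the local CPU if the
	 * remote one is offline.  Interrupt disabling is handled inside
	 * send_remote_softirq() itself. */
	send_remote_softirq(&rq->csd, rq->submit_cpu, rq->softirq_nr);
}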
@@ -630,7 +766,7 @@ static struct notifier_block __cpuinitdata cpu_nfb = {
630 .notifier_call = cpu_callback 766 .notifier_call = cpu_callback
631}; 767};
632 768
633__init int spawn_ksoftirqd(void) 769static __init int spawn_ksoftirqd(void)
634{ 770{
635 void *cpu = (void *)(long)smp_processor_id(); 771 void *cpu = (void *)(long)smp_processor_id();
636 int err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); 772 int err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
@@ -640,6 +776,7 @@ __init int spawn_ksoftirqd(void)
640 register_cpu_notifier(&cpu_nfb); 776 register_cpu_notifier(&cpu_nfb);
641 return 0; 777 return 0;
642} 778}
779early_initcall(spawn_ksoftirqd);
643 780
644#ifdef CONFIG_SMP 781#ifdef CONFIG_SMP
645/* 782/*
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index a272d78185eb..3953e4aed733 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -13,6 +13,7 @@
13#include <linux/delay.h> 13#include <linux/delay.h>
14#include <linux/freezer.h> 14#include <linux/freezer.h>
15#include <linux/kthread.h> 15#include <linux/kthread.h>
16#include <linux/lockdep.h>
16#include <linux/notifier.h> 17#include <linux/notifier.h>
17#include <linux/module.h> 18#include <linux/module.h>
18 19
@@ -25,7 +26,22 @@ static DEFINE_PER_CPU(unsigned long, print_timestamp);
25static DEFINE_PER_CPU(struct task_struct *, watchdog_task); 26static DEFINE_PER_CPU(struct task_struct *, watchdog_task);
26 27
27static int __read_mostly did_panic; 28static int __read_mostly did_panic;
28unsigned long __read_mostly softlockup_thresh = 60; 29int __read_mostly softlockup_thresh = 60;
30
31/*
32 * Should we panic (and reboot, if panic_timeout= is set) when a
33 * soft-lockup occurs:
34 */
35unsigned int __read_mostly softlockup_panic =
36 CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE;
37
38static int __init softlockup_panic_setup(char *str)
39{
40 softlockup_panic = simple_strtoul(str, NULL, 0);
41
42 return 1;
43}
44__setup("softlockup_panic=", softlockup_panic_setup);
29 45
30static int 46static int
31softlock_panic(struct notifier_block *this, unsigned long event, void *ptr) 47softlock_panic(struct notifier_block *this, unsigned long event, void *ptr)
@@ -84,6 +100,14 @@ void softlockup_tick(void)
84 struct pt_regs *regs = get_irq_regs(); 100 struct pt_regs *regs = get_irq_regs();
85 unsigned long now; 101 unsigned long now;
86 102
103 /* Is detection switched off? */
104 if (!per_cpu(watchdog_task, this_cpu) || softlockup_thresh <= 0) {
105 /* Be sure we don't false trigger if switched back on */
106 if (touch_timestamp)
107 per_cpu(touch_timestamp, this_cpu) = 0;
108 return;
109 }
110
87 if (touch_timestamp == 0) { 111 if (touch_timestamp == 0) {
88 __touch_softlockup_watchdog(); 112 __touch_softlockup_watchdog();
89 return; 113 return;
@@ -92,11 +116,8 @@ void softlockup_tick(void)
92 print_timestamp = per_cpu(print_timestamp, this_cpu); 116 print_timestamp = per_cpu(print_timestamp, this_cpu);
93 117
94 /* report at most once a second */ 118 /* report at most once a second */
95 if ((print_timestamp >= touch_timestamp && 119 if (print_timestamp == touch_timestamp || did_panic)
96 print_timestamp < (touch_timestamp + 1)) ||
97 did_panic || !per_cpu(watchdog_task, this_cpu)) {
98 return; 120 return;
99 }
100 121
101 /* do not print during early bootup: */ 122 /* do not print during early bootup: */
102 if (unlikely(system_state != SYSTEM_RUNNING)) { 123 if (unlikely(system_state != SYSTEM_RUNNING)) {
@@ -106,8 +127,11 @@ void softlockup_tick(void)
106 127
107 now = get_timestamp(this_cpu); 128 now = get_timestamp(this_cpu);
108 129
109 /* Wake up the high-prio watchdog task every second: */ 130 /*
110 if (now > (touch_timestamp + 1)) 131 * Wake up the high-prio watchdog task twice per
132 * threshold timespan.
133 */
134 if (now > touch_timestamp + softlockup_thresh/2)
111 wake_up_process(per_cpu(watchdog_task, this_cpu)); 135 wake_up_process(per_cpu(watchdog_task, this_cpu));
112 136
113 /* Warn about unreasonable delays: */ 137 /* Warn about unreasonable delays: */
@@ -121,11 +145,15 @@ void softlockup_tick(void)
121 this_cpu, now - touch_timestamp, 145 this_cpu, now - touch_timestamp,
122 current->comm, task_pid_nr(current)); 146 current->comm, task_pid_nr(current));
123 print_modules(); 147 print_modules();
148 print_irqtrace_events(current);
124 if (regs) 149 if (regs)
125 show_regs(regs); 150 show_regs(regs);
126 else 151 else
127 dump_stack(); 152 dump_stack();
128 spin_unlock(&print_lock); 153 spin_unlock(&print_lock);
154
155 if (softlockup_panic)
156 panic("softlockup: hung tasks");
129} 157}
130 158
131/* 159/*
@@ -178,6 +206,9 @@ static void check_hung_task(struct task_struct *t, unsigned long now)
178 206
179 t->last_switch_timestamp = now; 207 t->last_switch_timestamp = now;
180 touch_nmi_watchdog(); 208 touch_nmi_watchdog();
209
210 if (softlockup_panic)
211 panic("softlockup: blocked tasks");
181} 212}
182 213
183/* 214/*
@@ -195,14 +226,15 @@ static void check_hung_uninterruptible_tasks(int this_cpu)
195 * If the system crashed already then all bets are off, 226 * If the system crashed already then all bets are off,
196 * do not report extra hung tasks: 227 * do not report extra hung tasks:
197 */ 228 */
198 if ((tainted & TAINT_DIE) || did_panic) 229 if (test_taint(TAINT_DIE) || did_panic)
199 return; 230 return;
200 231
201 read_lock(&tasklist_lock); 232 read_lock(&tasklist_lock);
202 do_each_thread(g, t) { 233 do_each_thread(g, t) {
203 if (!--max_count) 234 if (!--max_count)
204 goto unlock; 235 goto unlock;
205 if (t->state & TASK_UNINTERRUPTIBLE) 236 /* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */
237 if (t->state == TASK_UNINTERRUPTIBLE)
206 check_hung_task(t, now); 238 check_hung_task(t, now);
207 } while_each_thread(g, t); 239 } while_each_thread(g, t);
208 unlock: 240 unlock:
@@ -307,14 +339,33 @@ static struct notifier_block __cpuinitdata cpu_nfb = {
307 .notifier_call = cpu_callback 339 .notifier_call = cpu_callback
308}; 340};
309 341
310__init void spawn_softlockup_task(void) 342static int __initdata nosoftlockup;
343
344static int __init nosoftlockup_setup(char *str)
345{
346 nosoftlockup = 1;
347 return 1;
348}
349__setup("nosoftlockup", nosoftlockup_setup);
350
351static int __init spawn_softlockup_task(void)
311{ 352{
312 void *cpu = (void *)(long)smp_processor_id(); 353 void *cpu = (void *)(long)smp_processor_id();
313 int err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); 354 int err;
355
356 if (nosoftlockup)
357 return 0;
314 358
315 BUG_ON(err == NOTIFY_BAD); 359 err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
360 if (err == NOTIFY_BAD) {
361 BUG();
362 return 1;
363 }
316 cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); 364 cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
317 register_cpu_notifier(&cpu_nfb); 365 register_cpu_notifier(&cpu_nfb);
318 366
319 atomic_notifier_chain_register(&panic_notifier_list, &panic_block); 367 atomic_notifier_chain_register(&panic_notifier_list, &panic_block);
368
369 return 0;
320} 370}
371early_initcall(spawn_softlockup_task);
diff --git a/kernel/spinlock.c b/kernel/spinlock.c
index a1fb54c93cdd..29ab20749dd3 100644
--- a/kernel/spinlock.c
+++ b/kernel/spinlock.c
@@ -290,8 +290,8 @@ void __lockfunc _spin_lock_nested(spinlock_t *lock, int subclass)
290 spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_); 290 spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
291 LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock); 291 LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock);
292} 292}
293
294EXPORT_SYMBOL(_spin_lock_nested); 293EXPORT_SYMBOL(_spin_lock_nested);
294
295unsigned long __lockfunc _spin_lock_irqsave_nested(spinlock_t *lock, int subclass) 295unsigned long __lockfunc _spin_lock_irqsave_nested(spinlock_t *lock, int subclass)
296{ 296{
297 unsigned long flags; 297 unsigned long flags;
@@ -311,9 +311,17 @@ unsigned long __lockfunc _spin_lock_irqsave_nested(spinlock_t *lock, int subclas
311#endif 311#endif
312 return flags; 312 return flags;
313} 313}
314
315EXPORT_SYMBOL(_spin_lock_irqsave_nested); 314EXPORT_SYMBOL(_spin_lock_irqsave_nested);
316 315
316void __lockfunc _spin_lock_nest_lock(spinlock_t *lock,
317 struct lockdep_map *nest_lock)
318{
319 preempt_disable();
320 spin_acquire_nest(&lock->dep_map, 0, 0, nest_lock, _RET_IP_);
321 LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock);
322}
323EXPORT_SYMBOL(_spin_lock_nest_lock);
324
317#endif 325#endif
318 326
319void __lockfunc _spin_unlock(spinlock_t *lock) 327void __lockfunc _spin_unlock(spinlock_t *lock)
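A hedged sketch of what the new _spin_lock_nest_lock() export is for, assuming the spin_lock_nest_lock() wrapper macro that accompanies it in <linux/spinlock.h>: it tells lockdep that an arbitrary number of locks of one class are all taken under a single outer lock, so it should not warn about the nesting. The parent/child structures here are invented:

#include <linux/list.h>
#include <linux/mutex.h>
#include <linux/spinlock.h>

struct child {
	spinlock_t lock;
	struct list_head node;
};

struct parent {
	struct mutex big_lock;		/* serializes "take all children" */
	struct list_head children;
};

static void lock_all_children(struct parent *p)
{
	struct child *c;

	mutex_lock(&p->big_lock);
	list_for_each_entry(c, &p->children, node) {
		/* Without the nest_lock annotation lockdep would warn about
		 * acquiring many locks of the same class.  The children are
		 * released later with plain spin_unlock(). */
		spin_lock_nest_lock(&c->lock, &p->big_lock);
	}
}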
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index ba9b2054ecbd..9bc4c00872c9 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -1,4 +1,4 @@
1/* Copyright 2005 Rusty Russell rusty@rustcorp.com.au IBM Corporation. 1/* Copyright 2008, 2005 Rusty Russell rusty@rustcorp.com.au IBM Corporation.
2 * GPL v2 and any later version. 2 * GPL v2 and any later version.
3 */ 3 */
4#include <linux/cpu.h> 4#include <linux/cpu.h>
@@ -13,203 +13,151 @@
13#include <asm/atomic.h> 13#include <asm/atomic.h>
14#include <asm/uaccess.h> 14#include <asm/uaccess.h>
15 15
16/* Since we effect priority and affinity (both of which are visible 16/* This controls the threads on each CPU. */
17 * to, and settable by outside processes) we do indirection via a
18 * kthread. */
19
20/* Thread to stop each CPU in user context. */
21enum stopmachine_state { 17enum stopmachine_state {
22 STOPMACHINE_WAIT, 18 /* Dummy starting state for thread. */
19 STOPMACHINE_NONE,
20 /* Awaiting everyone to be scheduled. */
23 STOPMACHINE_PREPARE, 21 STOPMACHINE_PREPARE,
22 /* Disable interrupts. */
24 STOPMACHINE_DISABLE_IRQ, 23 STOPMACHINE_DISABLE_IRQ,
24 /* Run the function */
25 STOPMACHINE_RUN,
26 /* Exit */
25 STOPMACHINE_EXIT, 27 STOPMACHINE_EXIT,
26}; 28};
29static enum stopmachine_state state;
27 30
28static enum stopmachine_state stopmachine_state; 31struct stop_machine_data {
29static unsigned int stopmachine_num_threads; 32 int (*fn)(void *);
30static atomic_t stopmachine_thread_ack; 33 void *data;
31 34 int fnret;
32static int stopmachine(void *cpu) 35};
33{
34 int irqs_disabled = 0;
35 int prepared = 0;
36
37 set_cpus_allowed_ptr(current, &cpumask_of_cpu((int)(long)cpu));
38
39 /* Ack: we are alive */
40 smp_mb(); /* Theoretically the ack = 0 might not be on this CPU yet. */
41 atomic_inc(&stopmachine_thread_ack);
42
43 /* Simple state machine */
44 while (stopmachine_state != STOPMACHINE_EXIT) {
45 if (stopmachine_state == STOPMACHINE_DISABLE_IRQ
46 && !irqs_disabled) {
47 local_irq_disable();
48 hard_irq_disable();
49 irqs_disabled = 1;
50 /* Ack: irqs disabled. */
51 smp_mb(); /* Must read state first. */
52 atomic_inc(&stopmachine_thread_ack);
53 } else if (stopmachine_state == STOPMACHINE_PREPARE
54 && !prepared) {
55 /* Everyone is in place, hold CPU. */
56 preempt_disable();
57 prepared = 1;
58 smp_mb(); /* Must read state first. */
59 atomic_inc(&stopmachine_thread_ack);
60 }
61 /* Yield in first stage: migration threads need to
62 * help our sisters onto their CPUs. */
63 if (!prepared && !irqs_disabled)
64 yield();
65 cpu_relax();
66 }
67 36
68 /* Ack: we are exiting. */ 37/* Like num_online_cpus(), but hotplug cpu uses us, so we need this. */
69 smp_mb(); /* Must read state first. */ 38static unsigned int num_threads;
70 atomic_inc(&stopmachine_thread_ack); 39static atomic_t thread_ack;
40static DEFINE_MUTEX(lock);
71 41
72 if (irqs_disabled) 42static struct workqueue_struct *stop_machine_wq;
73 local_irq_enable(); 43static struct stop_machine_data active, idle;
74 if (prepared) 44static const cpumask_t *active_cpus;
75 preempt_enable(); 45static void *stop_machine_work;
76 46
77 return 0; 47static void set_state(enum stopmachine_state newstate)
78}
79
80/* Change the thread state */
81static void stopmachine_set_state(enum stopmachine_state state)
82{ 48{
83 atomic_set(&stopmachine_thread_ack, 0); 49 /* Reset ack counter. */
50 atomic_set(&thread_ack, num_threads);
84 smp_wmb(); 51 smp_wmb();
85 stopmachine_state = state; 52 state = newstate;
86 while (atomic_read(&stopmachine_thread_ack) != stopmachine_num_threads)
87 cpu_relax();
88} 53}
89 54
90static int stop_machine(void) 55/* Last one to ack a state moves to the next state. */
56static void ack_state(void)
91{ 57{
92 int i, ret = 0; 58 if (atomic_dec_and_test(&thread_ack))
93 59 set_state(state + 1);
94 atomic_set(&stopmachine_thread_ack, 0); 60}
95 stopmachine_num_threads = 0;
96 stopmachine_state = STOPMACHINE_WAIT;
97 61
98 for_each_online_cpu(i) { 62/* This is the actual function which stops the CPU. It runs
99 if (i == raw_smp_processor_id()) 63 * in the context of a dedicated stopmachine workqueue. */
100 continue; 64static void stop_cpu(struct work_struct *unused)
101 ret = kernel_thread(stopmachine, (void *)(long)i,CLONE_KERNEL); 65{
102 if (ret < 0) 66 enum stopmachine_state curstate = STOPMACHINE_NONE;
103 break; 67 struct stop_machine_data *smdata = &idle;
104 stopmachine_num_threads++; 68 int cpu = smp_processor_id();
69 int err;
70
71 if (!active_cpus) {
72 if (cpu == first_cpu(cpu_online_map))
73 smdata = &active;
74 } else {
75 if (cpu_isset(cpu, *active_cpus))
76 smdata = &active;
105 } 77 }
106 78 /* Simple state machine */
107 /* Wait for them all to come to life. */ 79 do {
108 while (atomic_read(&stopmachine_thread_ack) != stopmachine_num_threads) { 80 /* Chill out and ensure we re-read stopmachine_state. */
109 yield();
110 cpu_relax(); 81 cpu_relax();
111 } 82 if (state != curstate) {
112 83 curstate = state;
113 /* If some failed, kill them all. */ 84 switch (curstate) {
114 if (ret < 0) { 85 case STOPMACHINE_DISABLE_IRQ:
115 stopmachine_set_state(STOPMACHINE_EXIT); 86 local_irq_disable();
116 return ret; 87 hard_irq_disable();
117 } 88 break;
118 89 case STOPMACHINE_RUN:
119 /* Now they are all started, make them hold the CPUs, ready. */ 90 /* On multiple CPUs only a single error code
120 preempt_disable(); 91 * is needed to tell that something failed. */
121 stopmachine_set_state(STOPMACHINE_PREPARE); 92 err = smdata->fn(smdata->data);
122 93 if (err)
123 /* Make them disable irqs. */ 94 smdata->fnret = err;
124 local_irq_disable(); 95 break;
125 hard_irq_disable(); 96 default:
126 stopmachine_set_state(STOPMACHINE_DISABLE_IRQ); 97 break;
127 98 }
128 return 0; 99 ack_state();
129} 100 }
101 } while (curstate != STOPMACHINE_EXIT);
130 102
131static void restart_machine(void)
132{
133 stopmachine_set_state(STOPMACHINE_EXIT);
134 local_irq_enable(); 103 local_irq_enable();
135 preempt_enable_no_resched();
136} 104}
137 105
138struct stop_machine_data { 106/* Callback for CPUs which aren't supposed to do anything. */
139 int (*fn)(void *); 107static int chill(void *unused)
140 void *data;
141 struct completion done;
142};
143
144static int do_stop(void *_smdata)
145{ 108{
146 struct stop_machine_data *smdata = _smdata; 109 return 0;
147 int ret;
148
149 ret = stop_machine();
150 if (ret == 0) {
151 ret = smdata->fn(smdata->data);
152 restart_machine();
153 }
154
155 /* We're done: you can kthread_stop us now */
156 complete(&smdata->done);
157
158 /* Wait for kthread_stop */
159 set_current_state(TASK_INTERRUPTIBLE);
160 while (!kthread_should_stop()) {
161 schedule();
162 set_current_state(TASK_INTERRUPTIBLE);
163 }
164 __set_current_state(TASK_RUNNING);
165 return ret;
166} 110}
167 111
168struct task_struct *__stop_machine_run(int (*fn)(void *), void *data, 112int __stop_machine(int (*fn)(void *), void *data, const cpumask_t *cpus)
169 unsigned int cpu)
170{ 113{
171 static DEFINE_MUTEX(stopmachine_mutex); 114 struct work_struct *sm_work;
172 struct stop_machine_data smdata; 115 int i;
173 struct task_struct *p; 116
174 117 /* Set up initial state. */
175 smdata.fn = fn; 118 mutex_lock(&lock);
176 smdata.data = data; 119 num_threads = num_online_cpus();
177 init_completion(&smdata.done); 120 active_cpus = cpus;
178 121 active.fn = fn;
179 mutex_lock(&stopmachine_mutex); 122 active.data = data;
180 123 active.fnret = 0;
181 /* If they don't care which CPU fn runs on, bind to any online one. */ 124 idle.fn = chill;
182 if (cpu == NR_CPUS) 125 idle.data = NULL;
183 cpu = raw_smp_processor_id(); 126
184 127 set_state(STOPMACHINE_PREPARE);
185 p = kthread_create(do_stop, &smdata, "kstopmachine"); 128
186 if (!IS_ERR(p)) { 129 /* Schedule the stop_cpu work on all cpus: hold this CPU so one
187 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; 130 * doesn't hit this CPU until we're ready. */
188 131 get_cpu();
189 /* One high-prio thread per cpu. We'll do this one. */ 132 for_each_online_cpu(i) {
190 sched_setscheduler_nocheck(p, SCHED_FIFO, &param); 133 sm_work = percpu_ptr(stop_machine_work, i);
191 kthread_bind(p, cpu); 134 INIT_WORK(sm_work, stop_cpu);
192 wake_up_process(p); 135 queue_work_on(i, stop_machine_wq, sm_work);
193 wait_for_completion(&smdata.done);
194 } 136 }
195 mutex_unlock(&stopmachine_mutex); 137 /* This will release the thread on our CPU. */
196 return p; 138 put_cpu();
139 flush_workqueue(stop_machine_wq);
140 mutex_unlock(&lock);
141 return active.fnret;
197} 142}
198 143
199int stop_machine_run(int (*fn)(void *), void *data, unsigned int cpu) 144int stop_machine(int (*fn)(void *), void *data, const cpumask_t *cpus)
200{ 145{
201 struct task_struct *p;
202 int ret; 146 int ret;
203 147
204 /* No CPUs can come up or down during this. */ 148 /* No CPUs can come up or down during this. */
205 get_online_cpus(); 149 get_online_cpus();
206 p = __stop_machine_run(fn, data, cpu); 150 ret = __stop_machine(fn, data, cpus);
207 if (!IS_ERR(p))
208 ret = kthread_stop(p);
209 else
210 ret = PTR_ERR(p);
211 put_online_cpus(); 151 put_online_cpus();
212 152
213 return ret; 153 return ret;
214} 154}
215EXPORT_SYMBOL_GPL(stop_machine_run); 155EXPORT_SYMBOL_GPL(stop_machine);
156
157static int __init stop_machine_init(void)
158{
159 stop_machine_wq = create_rt_workqueue("kstop");
160 stop_machine_work = alloc_percpu(struct work_struct);
161 return 0;
162}
163core_initcall(stop_machine_init);
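The rewrite above replaces the kstopmachine kthread dance with per-CPU works on an RT workqueue, but the caller-visible contract stays the same: fn runs on one CPU while every other online CPU spins with interrupts disabled. A hedged caller sketch with invented names:

#include <linux/stop_machine.h>

static unsigned long table_a[16], table_b[16];
static unsigned long *active_table = table_a;	/* read locklessly elsewhere */

static int switch_table(void *data)
{
	/* No other CPU is executing anything here, so even lockless
	 * readers cannot observe a half-switched state. */
	active_table = data;
	return 0;
}

static int activate_table_b(void)
{
	/* cpus == NULL: switch_table() runs on the first online CPU and
	 * the remaining CPUs just "chill" until it returns. */
	return stop_machine(switch_table, table_b, NULL);
}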
diff --git a/kernel/sys.c b/kernel/sys.c
index 14e97282eb6c..31deba8f7d16 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -169,9 +169,9 @@ asmlinkage long sys_setpriority(int which, int who, int niceval)
169 pgrp = find_vpid(who); 169 pgrp = find_vpid(who);
170 else 170 else
171 pgrp = task_pgrp(current); 171 pgrp = task_pgrp(current);
172 do_each_pid_task(pgrp, PIDTYPE_PGID, p) { 172 do_each_pid_thread(pgrp, PIDTYPE_PGID, p) {
173 error = set_one_prio(p, niceval, error); 173 error = set_one_prio(p, niceval, error);
174 } while_each_pid_task(pgrp, PIDTYPE_PGID, p); 174 } while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
175 break; 175 break;
176 case PRIO_USER: 176 case PRIO_USER:
177 user = current->user; 177 user = current->user;
@@ -229,11 +229,11 @@ asmlinkage long sys_getpriority(int which, int who)
229 pgrp = find_vpid(who); 229 pgrp = find_vpid(who);
230 else 230 else
231 pgrp = task_pgrp(current); 231 pgrp = task_pgrp(current);
232 do_each_pid_task(pgrp, PIDTYPE_PGID, p) { 232 do_each_pid_thread(pgrp, PIDTYPE_PGID, p) {
233 niceval = 20 - task_nice(p); 233 niceval = 20 - task_nice(p);
234 if (niceval > retval) 234 if (niceval > retval)
235 retval = niceval; 235 retval = niceval;
236 } while_each_pid_task(pgrp, PIDTYPE_PGID, p); 236 } while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
237 break; 237 break;
238 case PRIO_USER: 238 case PRIO_USER:
239 user = current->user; 239 user = current->user;
@@ -274,7 +274,7 @@ void emergency_restart(void)
274} 274}
275EXPORT_SYMBOL_GPL(emergency_restart); 275EXPORT_SYMBOL_GPL(emergency_restart);
276 276
277static void kernel_restart_prepare(char *cmd) 277void kernel_restart_prepare(char *cmd)
278{ 278{
279 blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd); 279 blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd);
280 system_state = SYSTEM_RESTART; 280 system_state = SYSTEM_RESTART;
@@ -301,26 +301,6 @@ void kernel_restart(char *cmd)
301} 301}
302EXPORT_SYMBOL_GPL(kernel_restart); 302EXPORT_SYMBOL_GPL(kernel_restart);
303 303
304/**
305 * kernel_kexec - reboot the system
306 *
307 * Move into place and start executing a preloaded standalone
308 * executable. If nothing was preloaded return an error.
309 */
310static void kernel_kexec(void)
311{
312#ifdef CONFIG_KEXEC
313 struct kimage *image;
314 image = xchg(&kexec_image, NULL);
315 if (!image)
316 return;
317 kernel_restart_prepare(NULL);
318 printk(KERN_EMERG "Starting new kernel\n");
319 machine_shutdown();
320 machine_kexec(image);
321#endif
322}
323
324static void kernel_shutdown_prepare(enum system_states state) 304static void kernel_shutdown_prepare(enum system_states state)
325{ 305{
326 blocking_notifier_call_chain(&reboot_notifier_list, 306 blocking_notifier_call_chain(&reboot_notifier_list,
@@ -425,10 +405,15 @@ asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user
425 kernel_restart(buffer); 405 kernel_restart(buffer);
426 break; 406 break;
427 407
408#ifdef CONFIG_KEXEC
428 case LINUX_REBOOT_CMD_KEXEC: 409 case LINUX_REBOOT_CMD_KEXEC:
429 kernel_kexec(); 410 {
430 unlock_kernel(); 411 int ret;
431 return -EINVAL; 412 ret = kernel_kexec();
413 unlock_kernel();
414 return ret;
415 }
416#endif
432 417
433#ifdef CONFIG_HIBERNATION 418#ifdef CONFIG_HIBERNATION
434 case LINUX_REBOOT_CMD_SW_SUSPEND: 419 case LINUX_REBOOT_CMD_SW_SUSPEND:
@@ -868,38 +853,28 @@ asmlinkage long sys_setfsgid(gid_t gid)
868 return old_fsgid; 853 return old_fsgid;
869} 854}
870 855
856void do_sys_times(struct tms *tms)
857{
858 struct task_cputime cputime;
859 cputime_t cutime, cstime;
860
861 spin_lock_irq(&current->sighand->siglock);
862 thread_group_cputime(current, &cputime);
863 cutime = current->signal->cutime;
864 cstime = current->signal->cstime;
865 spin_unlock_irq(&current->sighand->siglock);
866 tms->tms_utime = cputime_to_clock_t(cputime.utime);
867 tms->tms_stime = cputime_to_clock_t(cputime.stime);
868 tms->tms_cutime = cputime_to_clock_t(cutime);
869 tms->tms_cstime = cputime_to_clock_t(cstime);
870}
871
871asmlinkage long sys_times(struct tms __user * tbuf) 872asmlinkage long sys_times(struct tms __user * tbuf)
872{ 873{
873 /*
874 * In the SMP world we might just be unlucky and have one of
875 * the times increment as we use it. Since the value is an
876 * atomically safe type this is just fine. Conceptually its
877 * as if the syscall took an instant longer to occur.
878 */
879 if (tbuf) { 874 if (tbuf) {
880 struct tms tmp; 875 struct tms tmp;
881 struct task_struct *tsk = current; 876
882 struct task_struct *t; 877 do_sys_times(&tmp);
883 cputime_t utime, stime, cutime, cstime;
884
885 spin_lock_irq(&tsk->sighand->siglock);
886 utime = tsk->signal->utime;
887 stime = tsk->signal->stime;
888 t = tsk;
889 do {
890 utime = cputime_add(utime, t->utime);
891 stime = cputime_add(stime, t->stime);
892 t = next_thread(t);
893 } while (t != tsk);
894
895 cutime = tsk->signal->cutime;
896 cstime = tsk->signal->cstime;
897 spin_unlock_irq(&tsk->sighand->siglock);
898
899 tmp.tms_utime = cputime_to_clock_t(utime);
900 tmp.tms_stime = cputime_to_clock_t(stime);
901 tmp.tms_cutime = cputime_to_clock_t(cutime);
902 tmp.tms_cstime = cputime_to_clock_t(cstime);
903 if (copy_to_user(tbuf, &tmp, sizeof(struct tms))) 878 if (copy_to_user(tbuf, &tmp, sizeof(struct tms)))
904 return -EFAULT; 879 return -EFAULT;
905 } 880 }
@@ -1075,9 +1050,7 @@ asmlinkage long sys_setsid(void)
1075 group_leader->signal->leader = 1; 1050 group_leader->signal->leader = 1;
1076 __set_special_pids(sid); 1051 __set_special_pids(sid);
1077 1052
1078 spin_lock(&group_leader->sighand->siglock); 1053 proc_clear_tty(group_leader);
1079 group_leader->signal->tty = NULL;
1080 spin_unlock(&group_leader->sighand->siglock);
1081 1054
1082 err = session; 1055 err = session;
1083out: 1056out:
@@ -1343,8 +1316,6 @@ EXPORT_SYMBOL(in_egroup_p);
1343 1316
1344DECLARE_RWSEM(uts_sem); 1317DECLARE_RWSEM(uts_sem);
1345 1318
1346EXPORT_SYMBOL(uts_sem);
1347
1348asmlinkage long sys_newuname(struct new_utsname __user * name) 1319asmlinkage long sys_newuname(struct new_utsname __user * name)
1349{ 1320{
1350 int errno = 0; 1321 int errno = 0;
@@ -1368,8 +1339,10 @@ asmlinkage long sys_sethostname(char __user *name, int len)
1368 down_write(&uts_sem); 1339 down_write(&uts_sem);
1369 errno = -EFAULT; 1340 errno = -EFAULT;
1370 if (!copy_from_user(tmp, name, len)) { 1341 if (!copy_from_user(tmp, name, len)) {
1371 memcpy(utsname()->nodename, tmp, len); 1342 struct new_utsname *u = utsname();
1372 utsname()->nodename[len] = 0; 1343
1344 memcpy(u->nodename, tmp, len);
1345 memset(u->nodename + len, 0, sizeof(u->nodename) - len);
1373 errno = 0; 1346 errno = 0;
1374 } 1347 }
1375 up_write(&uts_sem); 1348 up_write(&uts_sem);
@@ -1381,15 +1354,17 @@ asmlinkage long sys_sethostname(char __user *name, int len)
1381asmlinkage long sys_gethostname(char __user *name, int len) 1354asmlinkage long sys_gethostname(char __user *name, int len)
1382{ 1355{
1383 int i, errno; 1356 int i, errno;
1357 struct new_utsname *u;
1384 1358
1385 if (len < 0) 1359 if (len < 0)
1386 return -EINVAL; 1360 return -EINVAL;
1387 down_read(&uts_sem); 1361 down_read(&uts_sem);
1388 i = 1 + strlen(utsname()->nodename); 1362 u = utsname();
1363 i = 1 + strlen(u->nodename);
1389 if (i > len) 1364 if (i > len)
1390 i = len; 1365 i = len;
1391 errno = 0; 1366 errno = 0;
1392 if (copy_to_user(name, utsname()->nodename, i)) 1367 if (copy_to_user(name, u->nodename, i))
1393 errno = -EFAULT; 1368 errno = -EFAULT;
1394 up_read(&uts_sem); 1369 up_read(&uts_sem);
1395 return errno; 1370 return errno;
@@ -1414,8 +1389,10 @@ asmlinkage long sys_setdomainname(char __user *name, int len)
1414 down_write(&uts_sem); 1389 down_write(&uts_sem);
1415 errno = -EFAULT; 1390 errno = -EFAULT;
1416 if (!copy_from_user(tmp, name, len)) { 1391 if (!copy_from_user(tmp, name, len)) {
1417 memcpy(utsname()->domainname, tmp, len); 1392 struct new_utsname *u = utsname();
1418 utsname()->domainname[len] = 0; 1393
1394 memcpy(u->domainname, tmp, len);
1395 memset(u->domainname + len, 0, sizeof(u->domainname) - len);
1419 errno = 0; 1396 errno = 0;
1420 } 1397 }
1421 up_write(&uts_sem); 1398 up_write(&uts_sem);
@@ -1462,21 +1439,28 @@ asmlinkage long sys_old_getrlimit(unsigned int resource, struct rlimit __user *r
1462asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim) 1439asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim)
1463{ 1440{
1464 struct rlimit new_rlim, *old_rlim; 1441 struct rlimit new_rlim, *old_rlim;
1465 unsigned long it_prof_secs;
1466 int retval; 1442 int retval;
1467 1443
1468 if (resource >= RLIM_NLIMITS) 1444 if (resource >= RLIM_NLIMITS)
1469 return -EINVAL; 1445 return -EINVAL;
1470 if (copy_from_user(&new_rlim, rlim, sizeof(*rlim))) 1446 if (copy_from_user(&new_rlim, rlim, sizeof(*rlim)))
1471 return -EFAULT; 1447 return -EFAULT;
1472 if (new_rlim.rlim_cur > new_rlim.rlim_max)
1473 return -EINVAL;
1474 old_rlim = current->signal->rlim + resource; 1448 old_rlim = current->signal->rlim + resource;
1475 if ((new_rlim.rlim_max > old_rlim->rlim_max) && 1449 if ((new_rlim.rlim_max > old_rlim->rlim_max) &&
1476 !capable(CAP_SYS_RESOURCE)) 1450 !capable(CAP_SYS_RESOURCE))
1477 return -EPERM; 1451 return -EPERM;
1478 if (resource == RLIMIT_NOFILE && new_rlim.rlim_max > sysctl_nr_open) 1452
1479 return -EPERM; 1453 if (resource == RLIMIT_NOFILE) {
1454 if (new_rlim.rlim_max == RLIM_INFINITY)
1455 new_rlim.rlim_max = sysctl_nr_open;
1456 if (new_rlim.rlim_cur == RLIM_INFINITY)
1457 new_rlim.rlim_cur = sysctl_nr_open;
1458 if (new_rlim.rlim_max > sysctl_nr_open)
1459 return -EPERM;
1460 }
1461
1462 if (new_rlim.rlim_cur > new_rlim.rlim_max)
1463 return -EINVAL;
1480 1464
1481 retval = security_task_setrlimit(resource, &new_rlim); 1465 retval = security_task_setrlimit(resource, &new_rlim);
1482 if (retval) 1466 if (retval)
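A hedged userspace sketch of the RLIMIT_NOFILE behaviour change above: asking for RLIM_INFINITY is now clamped to the fs.nr_open sysctl rather than rejected outright (raising the hard limit still requires CAP_SYS_RESOURCE):

#include <stdio.h>
#include <sys/resource.h>

int main(void)
{
	struct rlimit rl = { RLIM_INFINITY, RLIM_INFINITY };

	if (setrlimit(RLIMIT_NOFILE, &rl) != 0)
		perror("setrlimit(RLIMIT_NOFILE)");

	if (getrlimit(RLIMIT_NOFILE, &rl) == 0)
		printf("nofile: cur=%lu max=%lu\n",
		       (unsigned long)rl.rlim_cur,
		       (unsigned long)rl.rlim_max);
	return 0;
}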
@@ -1508,18 +1492,7 @@ asmlinkage long sys_setrlimit(unsigned int resource, struct rlimit __user *rlim)
1508 if (new_rlim.rlim_cur == RLIM_INFINITY) 1492 if (new_rlim.rlim_cur == RLIM_INFINITY)
1509 goto out; 1493 goto out;
1510 1494
1511 it_prof_secs = cputime_to_secs(current->signal->it_prof_expires); 1495 update_rlimit_cpu(new_rlim.rlim_cur);
1512 if (it_prof_secs == 0 || new_rlim.rlim_cur <= it_prof_secs) {
1513 unsigned long rlim_cur = new_rlim.rlim_cur;
1514 cputime_t cputime;
1515
1516 cputime = secs_to_cputime(rlim_cur);
1517 read_lock(&tasklist_lock);
1518 spin_lock_irq(&current->sighand->siglock);
1519 set_process_cpu_timer(current, CPUCLOCK_PROF, &cputime, NULL);
1520 spin_unlock_irq(&current->sighand->siglock);
1521 read_unlock(&tasklist_lock);
1522 }
1523out: 1496out:
1524 return 0; 1497 return 0;
1525} 1498}
@@ -1557,11 +1530,8 @@ out:
1557 * 1530 *
1558 */ 1531 */
1559 1532
1560static void accumulate_thread_rusage(struct task_struct *t, struct rusage *r, 1533static void accumulate_thread_rusage(struct task_struct *t, struct rusage *r)
1561 cputime_t *utimep, cputime_t *stimep)
1562{ 1534{
1563 *utimep = cputime_add(*utimep, t->utime);
1564 *stimep = cputime_add(*stimep, t->stime);
1565 r->ru_nvcsw += t->nvcsw; 1535 r->ru_nvcsw += t->nvcsw;
1566 r->ru_nivcsw += t->nivcsw; 1536 r->ru_nivcsw += t->nivcsw;
1567 r->ru_minflt += t->min_flt; 1537 r->ru_minflt += t->min_flt;
@@ -1575,12 +1545,13 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1575 struct task_struct *t; 1545 struct task_struct *t;
1576 unsigned long flags; 1546 unsigned long flags;
1577 cputime_t utime, stime; 1547 cputime_t utime, stime;
1548 struct task_cputime cputime;
1578 1549
1579 memset((char *) r, 0, sizeof *r); 1550 memset((char *) r, 0, sizeof *r);
1580 utime = stime = cputime_zero; 1551 utime = stime = cputime_zero;
1581 1552
1582 if (who == RUSAGE_THREAD) { 1553 if (who == RUSAGE_THREAD) {
1583 accumulate_thread_rusage(p, r, &utime, &stime); 1554 accumulate_thread_rusage(p, r);
1584 goto out; 1555 goto out;
1585 } 1556 }
1586 1557
@@ -1603,8 +1574,9 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1603 break; 1574 break;
1604 1575
1605 case RUSAGE_SELF: 1576 case RUSAGE_SELF:
1606 utime = cputime_add(utime, p->signal->utime); 1577 thread_group_cputime(p, &cputime);
1607 stime = cputime_add(stime, p->signal->stime); 1578 utime = cputime_add(utime, cputime.utime);
1579 stime = cputime_add(stime, cputime.stime);
1608 r->ru_nvcsw += p->signal->nvcsw; 1580 r->ru_nvcsw += p->signal->nvcsw;
1609 r->ru_nivcsw += p->signal->nivcsw; 1581 r->ru_nivcsw += p->signal->nivcsw;
1610 r->ru_minflt += p->signal->min_flt; 1582 r->ru_minflt += p->signal->min_flt;
@@ -1613,7 +1585,7 @@ static void k_getrusage(struct task_struct *p, int who, struct rusage *r)
1613 r->ru_oublock += p->signal->oublock; 1585 r->ru_oublock += p->signal->oublock;
1614 t = p; 1586 t = p;
1615 do { 1587 do {
1616 accumulate_thread_rusage(t, r, &utime, &stime); 1588 accumulate_thread_rusage(t, r);
1617 t = next_thread(t); 1589 t = next_thread(t);
1618 } while (t != p); 1590 } while (t != p);
1619 break; 1591 break;
@@ -1744,6 +1716,16 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
1744 case PR_SET_TSC: 1716 case PR_SET_TSC:
1745 error = SET_TSC_CTL(arg2); 1717 error = SET_TSC_CTL(arg2);
1746 break; 1718 break;
1719 case PR_GET_TIMERSLACK:
1720 error = current->timer_slack_ns;
1721 break;
1722 case PR_SET_TIMERSLACK:
1723 if (arg2 <= 0)
1724 current->timer_slack_ns =
1725 current->default_timer_slack_ns;
1726 else
1727 current->timer_slack_ns = arg2;
1728 break;
1747 default: 1729 default:
1748 error = -EINVAL; 1730 error = -EINVAL;
1749 break; 1731 break;
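A hedged userspace sketch of the new PR_GET_TIMERSLACK/PR_SET_TIMERSLACK options handled above. The PR_* constants come from <linux/prctl.h> on kernels that carry this change; the 50 microsecond value is only an example:

#include <stdio.h>
#include <sys/prctl.h>
#include <linux/prctl.h>	/* PR_{GET,SET}_TIMERSLACK, if the libc headers lack them */

int main(void)
{
	long slack;

	/* Allow ~50us of slack when coalescing this task's timers
	 * (select/poll/nanosleep style timeouts). */
	if (prctl(PR_SET_TIMERSLACK, 50000UL, 0, 0, 0) != 0)
		perror("PR_SET_TIMERSLACK");

	/* Passing 0 restores the default slack; PR_GET_TIMERSLACK returns
	 * the current value in nanoseconds. */
	slack = prctl(PR_GET_TIMERSLACK, 0, 0, 0, 0);
	printf("timer slack: %ld ns\n", slack);
	return 0;
}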
@@ -1795,7 +1777,7 @@ int orderly_poweroff(bool force)
1795 goto out; 1777 goto out;
1796 } 1778 }
1797 1779
1798 info = call_usermodehelper_setup(argv[0], argv, envp); 1780 info = call_usermodehelper_setup(argv[0], argv, envp, GFP_ATOMIC);
1799 if (info == NULL) { 1781 if (info == NULL) {
1800 argv_free(argv); 1782 argv_free(argv);
1801 goto out; 1783 goto out;
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 5b9b467de070..a77b27b11b04 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -31,6 +31,7 @@ cond_syscall(sys_socketpair);
31cond_syscall(sys_bind); 31cond_syscall(sys_bind);
32cond_syscall(sys_listen); 32cond_syscall(sys_listen);
33cond_syscall(sys_accept); 33cond_syscall(sys_accept);
34cond_syscall(sys_paccept);
34cond_syscall(sys_connect); 35cond_syscall(sys_connect);
35cond_syscall(sys_getsockname); 36cond_syscall(sys_getsockname);
36cond_syscall(sys_getpeername); 37cond_syscall(sys_getpeername);
@@ -56,9 +57,11 @@ cond_syscall(compat_sys_set_robust_list);
56cond_syscall(sys_get_robust_list); 57cond_syscall(sys_get_robust_list);
57cond_syscall(compat_sys_get_robust_list); 58cond_syscall(compat_sys_get_robust_list);
58cond_syscall(sys_epoll_create); 59cond_syscall(sys_epoll_create);
60cond_syscall(sys_epoll_create1);
59cond_syscall(sys_epoll_ctl); 61cond_syscall(sys_epoll_ctl);
60cond_syscall(sys_epoll_wait); 62cond_syscall(sys_epoll_wait);
61cond_syscall(sys_epoll_pwait); 63cond_syscall(sys_epoll_pwait);
64cond_syscall(compat_sys_epoll_pwait);
62cond_syscall(sys_semget); 65cond_syscall(sys_semget);
63cond_syscall(sys_semop); 66cond_syscall(sys_semop);
64cond_syscall(sys_semtimedop); 67cond_syscall(sys_semtimedop);
@@ -94,6 +97,7 @@ cond_syscall(sys_keyctl);
94cond_syscall(compat_sys_keyctl); 97cond_syscall(compat_sys_keyctl);
95cond_syscall(compat_sys_socketcall); 98cond_syscall(compat_sys_socketcall);
96cond_syscall(sys_inotify_init); 99cond_syscall(sys_inotify_init);
100cond_syscall(sys_inotify_init1);
97cond_syscall(sys_inotify_add_watch); 101cond_syscall(sys_inotify_add_watch);
98cond_syscall(sys_inotify_rm_watch); 102cond_syscall(sys_inotify_rm_watch);
99cond_syscall(sys_migrate_pages); 103cond_syscall(sys_migrate_pages);
@@ -121,6 +125,12 @@ cond_syscall(sys_vm86old);
121cond_syscall(sys_vm86); 125cond_syscall(sys_vm86);
122cond_syscall(compat_sys_ipc); 126cond_syscall(compat_sys_ipc);
123cond_syscall(compat_sys_sysctl); 127cond_syscall(compat_sys_sysctl);
128cond_syscall(sys_flock);
129cond_syscall(sys_io_setup);
130cond_syscall(sys_io_destroy);
131cond_syscall(sys_io_submit);
132cond_syscall(sys_io_cancel);
133cond_syscall(sys_io_getevents);
124 134
125/* arch-specific weak syscall entries */ 135/* arch-specific weak syscall entries */
126cond_syscall(sys_pciconfig_read); 136cond_syscall(sys_pciconfig_read);
@@ -154,10 +164,13 @@ cond_syscall(sys_ioprio_get);
154 164
155/* New file descriptors */ 165/* New file descriptors */
156cond_syscall(sys_signalfd); 166cond_syscall(sys_signalfd);
167cond_syscall(sys_signalfd4);
157cond_syscall(compat_sys_signalfd); 168cond_syscall(compat_sys_signalfd);
169cond_syscall(compat_sys_signalfd4);
158cond_syscall(sys_timerfd_create); 170cond_syscall(sys_timerfd_create);
159cond_syscall(sys_timerfd_settime); 171cond_syscall(sys_timerfd_settime);
160cond_syscall(sys_timerfd_gettime); 172cond_syscall(sys_timerfd_gettime);
161cond_syscall(compat_sys_timerfd_settime); 173cond_syscall(compat_sys_timerfd_settime);
162cond_syscall(compat_sys_timerfd_gettime); 174cond_syscall(compat_sys_timerfd_gettime);
163cond_syscall(sys_eventfd); 175cond_syscall(sys_eventfd);
176cond_syscall(sys_eventfd2);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 6b16e16428d8..a13bd4dfaeb1 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -43,6 +43,7 @@
43#include <linux/limits.h> 43#include <linux/limits.h>
44#include <linux/dcache.h> 44#include <linux/dcache.h>
45#include <linux/syscalls.h> 45#include <linux/syscalls.h>
46#include <linux/vmstat.h>
46#include <linux/nfs_fs.h> 47#include <linux/nfs_fs.h>
47#include <linux/acpi.h> 48#include <linux/acpi.h>
48#include <linux/reboot.h> 49#include <linux/reboot.h>
@@ -79,8 +80,6 @@ extern int pid_max_min, pid_max_max;
79extern int sysctl_drop_caches; 80extern int sysctl_drop_caches;
80extern int percpu_pagelist_fraction; 81extern int percpu_pagelist_fraction;
81extern int compat_log; 82extern int compat_log;
82extern int maps_protect;
83extern int sysctl_stat_interval;
84extern int latencytop_enabled; 83extern int latencytop_enabled;
85extern int sysctl_nr_open_min, sysctl_nr_open_max; 84extern int sysctl_nr_open_min, sysctl_nr_open_max;
86#ifdef CONFIG_RCU_TORTURE_TEST 85#ifdef CONFIG_RCU_TORTURE_TEST
@@ -88,15 +87,16 @@ extern int rcutorture_runnable;
88#endif /* #ifdef CONFIG_RCU_TORTURE_TEST */ 87#endif /* #ifdef CONFIG_RCU_TORTURE_TEST */
89 88
90/* Constants used for minimum and maximum */ 89/* Constants used for minimum and maximum */
91#if defined(CONFIG_DETECT_SOFTLOCKUP) || defined(CONFIG_HIGHMEM) 90#if defined(CONFIG_HIGHMEM) || defined(CONFIG_DETECT_SOFTLOCKUP)
92static int one = 1; 91static int one = 1;
93#endif 92#endif
94 93
95#ifdef CONFIG_DETECT_SOFTLOCKUP 94#ifdef CONFIG_DETECT_SOFTLOCKUP
96static int sixty = 60; 95static int sixty = 60;
96static int neg_one = -1;
97#endif 97#endif
98 98
99#ifdef CONFIG_MMU 99#if defined(CONFIG_MMU) && defined(CONFIG_FILE_LOCKING)
100static int two = 2; 100static int two = 2;
101#endif 101#endif
102 102
@@ -110,17 +110,15 @@ static int min_percpu_pagelist_fract = 8;
110 110
111static int ngroups_max = NGROUPS_MAX; 111static int ngroups_max = NGROUPS_MAX;
112 112
113#ifdef CONFIG_KMOD 113#ifdef CONFIG_MODULES
114extern char modprobe_path[]; 114extern char modprobe_path[];
115#endif 115#endif
116#ifdef CONFIG_CHR_DEV_SG 116#ifdef CONFIG_CHR_DEV_SG
117extern int sg_big_buff; 117extern int sg_big_buff;
118#endif 118#endif
119 119
120#ifdef __sparc__ 120#ifdef CONFIG_SPARC
121extern char reboot_command []; 121#include <asm/system.h>
122extern int stop_a_enabled;
123extern int scons_pwroff;
124#endif 122#endif
125 123
126#ifdef __hppa__ 124#ifdef __hppa__
@@ -151,20 +149,22 @@ extern int max_lock_depth;
151#ifdef CONFIG_PROC_SYSCTL 149#ifdef CONFIG_PROC_SYSCTL
152static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp, 150static int proc_do_cad_pid(struct ctl_table *table, int write, struct file *filp,
153 void __user *buffer, size_t *lenp, loff_t *ppos); 151 void __user *buffer, size_t *lenp, loff_t *ppos);
154static int proc_dointvec_taint(struct ctl_table *table, int write, struct file *filp, 152static int proc_taint(struct ctl_table *table, int write, struct file *filp,
155 void __user *buffer, size_t *lenp, loff_t *ppos); 153 void __user *buffer, size_t *lenp, loff_t *ppos);
156#endif 154#endif
157 155
158static struct ctl_table root_table[]; 156static struct ctl_table root_table[];
159static struct ctl_table_root sysctl_table_root; 157static struct ctl_table_root sysctl_table_root;
160static struct ctl_table_header root_table_header = { 158static struct ctl_table_header root_table_header = {
159 .count = 1,
161 .ctl_table = root_table, 160 .ctl_table = root_table,
162 .ctl_entry = LIST_HEAD_INIT(sysctl_table_root.header_list), 161 .ctl_entry = LIST_HEAD_INIT(sysctl_table_root.default_set.list),
163 .root = &sysctl_table_root, 162 .root = &sysctl_table_root,
163 .set = &sysctl_table_root.default_set,
164}; 164};
165static struct ctl_table_root sysctl_table_root = { 165static struct ctl_table_root sysctl_table_root = {
166 .root_list = LIST_HEAD_INIT(sysctl_table_root.root_list), 166 .root_list = LIST_HEAD_INIT(sysctl_table_root.root_list),
167 .header_list = LIST_HEAD_INIT(root_table_header.ctl_entry), 167 .default_set.list = LIST_HEAD_INIT(root_table_header.ctl_entry),
168}; 168};
169 169
170static struct ctl_table kern_table[]; 170static struct ctl_table kern_table[];
@@ -276,6 +276,16 @@ static struct ctl_table kern_table[] = {
276 }, 276 },
277 { 277 {
278 .ctl_name = CTL_UNNUMBERED, 278 .ctl_name = CTL_UNNUMBERED,
279 .procname = "sched_shares_thresh",
280 .data = &sysctl_sched_shares_thresh,
281 .maxlen = sizeof(unsigned int),
282 .mode = 0644,
283 .proc_handler = &proc_dointvec_minmax,
284 .strategy = &sysctl_intvec,
285 .extra1 = &zero,
286 },
287 {
288 .ctl_name = CTL_UNNUMBERED,
279 .procname = "sched_child_runs_first", 289 .procname = "sched_child_runs_first",
280 .data = &sysctl_sched_child_runs_first, 290 .data = &sysctl_sched_child_runs_first,
281 .maxlen = sizeof(unsigned int), 291 .maxlen = sizeof(unsigned int),
@@ -379,10 +389,9 @@ static struct ctl_table kern_table[] = {
379#ifdef CONFIG_PROC_SYSCTL 389#ifdef CONFIG_PROC_SYSCTL
380 { 390 {
381 .procname = "tainted", 391 .procname = "tainted",
382 .data = &tainted, 392 .maxlen = sizeof(long),
383 .maxlen = sizeof(int),
384 .mode = 0644, 393 .mode = 0644,
385 .proc_handler = &proc_dointvec_taint, 394 .proc_handler = &proc_taint,
386 }, 395 },
387#endif 396#endif
388#ifdef CONFIG_LATENCYTOP 397#ifdef CONFIG_LATENCYTOP
@@ -412,7 +421,7 @@ static struct ctl_table kern_table[] = {
412 .mode = 0644, 421 .mode = 0644,
413 .proc_handler = &proc_dointvec, 422 .proc_handler = &proc_dointvec,
414 }, 423 },
415#ifdef __sparc__ 424#ifdef CONFIG_SPARC
416 { 425 {
417 .ctl_name = KERN_SPARC_REBOOT, 426 .ctl_name = KERN_SPARC_REBOOT,
418 .procname = "reboot-cmd", 427 .procname = "reboot-cmd",
@@ -475,7 +484,7 @@ static struct ctl_table kern_table[] = {
475 .proc_handler = &ftrace_enable_sysctl, 484 .proc_handler = &ftrace_enable_sysctl,
476 }, 485 },
477#endif 486#endif
478#ifdef CONFIG_KMOD 487#ifdef CONFIG_MODULES
479 { 488 {
480 .ctl_name = KERN_MODPROBE, 489 .ctl_name = KERN_MODPROBE,
481 .procname = "modprobe", 490 .procname = "modprobe",
@@ -623,7 +632,7 @@ static struct ctl_table kern_table[] = {
623 { 632 {
624 .ctl_name = KERN_PRINTK_RATELIMIT, 633 .ctl_name = KERN_PRINTK_RATELIMIT,
625 .procname = "printk_ratelimit", 634 .procname = "printk_ratelimit",
626 .data = &printk_ratelimit_jiffies, 635 .data = &printk_ratelimit_state.interval,
627 .maxlen = sizeof(int), 636 .maxlen = sizeof(int),
628 .mode = 0644, 637 .mode = 0644,
629 .proc_handler = &proc_dointvec_jiffies, 638 .proc_handler = &proc_dointvec_jiffies,
@@ -632,7 +641,7 @@ static struct ctl_table kern_table[] = {
632 { 641 {
633 .ctl_name = KERN_PRINTK_RATELIMIT_BURST, 642 .ctl_name = KERN_PRINTK_RATELIMIT_BURST,
634 .procname = "printk_ratelimit_burst", 643 .procname = "printk_ratelimit_burst",
635 .data = &printk_ratelimit_burst, 644 .data = &printk_ratelimit_state.burst,
636 .maxlen = sizeof(int), 645 .maxlen = sizeof(int),
637 .mode = 0644, 646 .mode = 0644,
638 .proc_handler = &proc_dointvec, 647 .proc_handler = &proc_dointvec,
@@ -739,13 +748,24 @@ static struct ctl_table kern_table[] = {
739#ifdef CONFIG_DETECT_SOFTLOCKUP 748#ifdef CONFIG_DETECT_SOFTLOCKUP
740 { 749 {
741 .ctl_name = CTL_UNNUMBERED, 750 .ctl_name = CTL_UNNUMBERED,
751 .procname = "softlockup_panic",
752 .data = &softlockup_panic,
753 .maxlen = sizeof(int),
754 .mode = 0644,
755 .proc_handler = &proc_dointvec_minmax,
756 .strategy = &sysctl_intvec,
757 .extra1 = &zero,
758 .extra2 = &one,
759 },
760 {
761 .ctl_name = CTL_UNNUMBERED,
742 .procname = "softlockup_thresh", 762 .procname = "softlockup_thresh",
743 .data = &softlockup_thresh, 763 .data = &softlockup_thresh,
744 .maxlen = sizeof(unsigned long), 764 .maxlen = sizeof(int),
745 .mode = 0644, 765 .mode = 0644,
746 .proc_handler = &proc_doulongvec_minmax, 766 .proc_handler = &proc_dointvec_minmax,
747 .strategy = &sysctl_intvec, 767 .strategy = &sysctl_intvec,
748 .extra1 = &one, 768 .extra1 = &neg_one,
749 .extra2 = &sixty, 769 .extra2 = &sixty,
750 }, 770 },
751 { 771 {
@@ -796,16 +816,6 @@ static struct ctl_table kern_table[] = {
796 .proc_handler = &proc_dointvec, 816 .proc_handler = &proc_dointvec,
797 }, 817 },
798#endif 818#endif
799#ifdef CONFIG_PROC_FS
800 {
801 .ctl_name = CTL_UNNUMBERED,
802 .procname = "maps_protect",
803 .data = &maps_protect,
804 .maxlen = sizeof(int),
805 .mode = 0644,
806 .proc_handler = &proc_dointvec,
807 },
808#endif
809 { 819 {
810 .ctl_name = CTL_UNNUMBERED, 820 .ctl_name = CTL_UNNUMBERED,
811 .procname = "poweroff_cmd", 821 .procname = "poweroff_cmd",
@@ -833,6 +843,16 @@ static struct ctl_table kern_table[] = {
833 .proc_handler = &proc_dointvec, 843 .proc_handler = &proc_dointvec,
834 }, 844 },
835#endif 845#endif
846#ifdef CONFIG_UNEVICTABLE_LRU
847 {
848 .ctl_name = CTL_UNNUMBERED,
849 .procname = "scan_unevictable_pages",
850 .data = &scan_unevictable_pages,
851 .maxlen = sizeof(scan_unevictable_pages),
852 .mode = 0644,
853 .proc_handler = &scan_unevictable_handler,
854 },
855#endif
836/* 856/*
837 * NOTE: do not add new entries to this table unless you have read 857 * NOTE: do not add new entries to this table unless you have read
838 * Documentation/sysctl/ctl_unnumbered.txt 858 * Documentation/sysctl/ctl_unnumbered.txt
@@ -947,7 +967,7 @@ static struct ctl_table vm_table[] = {
947#ifdef CONFIG_HUGETLB_PAGE 967#ifdef CONFIG_HUGETLB_PAGE
948 { 968 {
949 .procname = "nr_hugepages", 969 .procname = "nr_hugepages",
950 .data = &max_huge_pages, 970 .data = NULL,
951 .maxlen = sizeof(unsigned long), 971 .maxlen = sizeof(unsigned long),
952 .mode = 0644, 972 .mode = 0644,
953 .proc_handler = &hugetlb_sysctl_handler, 973 .proc_handler = &hugetlb_sysctl_handler,
@@ -973,10 +993,12 @@ static struct ctl_table vm_table[] = {
973 { 993 {
974 .ctl_name = CTL_UNNUMBERED, 994 .ctl_name = CTL_UNNUMBERED,
975 .procname = "nr_overcommit_hugepages", 995 .procname = "nr_overcommit_hugepages",
976 .data = &sysctl_overcommit_huge_pages, 996 .data = NULL,
977 .maxlen = sizeof(sysctl_overcommit_huge_pages), 997 .maxlen = sizeof(unsigned long),
978 .mode = 0644, 998 .mode = 0644,
979 .proc_handler = &hugetlb_overcommit_handler, 999 .proc_handler = &hugetlb_overcommit_handler,
1000 .extra1 = (void *)&hugetlb_zero,
1001 .extra2 = (void *)&hugetlb_infinity,
980 }, 1002 },
981#endif 1003#endif
982 { 1004 {
@@ -1245,6 +1267,7 @@ static struct ctl_table fs_table[] = {
1245 .extra1 = &minolduid, 1267 .extra1 = &minolduid,
1246 .extra2 = &maxolduid, 1268 .extra2 = &maxolduid,
1247 }, 1269 },
1270#ifdef CONFIG_FILE_LOCKING
1248 { 1271 {
1249 .ctl_name = FS_LEASES, 1272 .ctl_name = FS_LEASES,
1250 .procname = "leases-enable", 1273 .procname = "leases-enable",
@@ -1253,6 +1276,7 @@ static struct ctl_table fs_table[] = {
1253 .mode = 0644, 1276 .mode = 0644,
1254 .proc_handler = &proc_dointvec, 1277 .proc_handler = &proc_dointvec,
1255 }, 1278 },
1279#endif
1256#ifdef CONFIG_DNOTIFY 1280#ifdef CONFIG_DNOTIFY
1257 { 1281 {
1258 .ctl_name = FS_DIR_NOTIFY, 1282 .ctl_name = FS_DIR_NOTIFY,
@@ -1264,6 +1288,7 @@ static struct ctl_table fs_table[] = {
1264 }, 1288 },
1265#endif 1289#endif
1266#ifdef CONFIG_MMU 1290#ifdef CONFIG_MMU
1291#ifdef CONFIG_FILE_LOCKING
1267 { 1292 {
1268 .ctl_name = FS_LEASE_TIME, 1293 .ctl_name = FS_LEASE_TIME,
1269 .procname = "lease-break-time", 1294 .procname = "lease-break-time",
@@ -1275,6 +1300,8 @@ static struct ctl_table fs_table[] = {
1275 .extra1 = &zero, 1300 .extra1 = &zero,
1276 .extra2 = &two, 1301 .extra2 = &two,
1277 }, 1302 },
1303#endif
1304#ifdef CONFIG_AIO
1278 { 1305 {
1279 .procname = "aio-nr", 1306 .procname = "aio-nr",
1280 .data = &aio_nr, 1307 .data = &aio_nr,
@@ -1289,6 +1316,7 @@ static struct ctl_table fs_table[] = {
1289 .mode = 0644, 1316 .mode = 0644,
1290 .proc_handler = &proc_doulongvec_minmax, 1317 .proc_handler = &proc_doulongvec_minmax,
1291 }, 1318 },
1319#endif /* CONFIG_AIO */
1292#ifdef CONFIG_INOTIFY_USER 1320#ifdef CONFIG_INOTIFY_USER
1293 { 1321 {
1294 .ctl_name = FS_INOTIFY, 1322 .ctl_name = FS_INOTIFY,
@@ -1372,6 +1400,9 @@ static void start_unregistering(struct ctl_table_header *p)
1372 spin_unlock(&sysctl_lock); 1400 spin_unlock(&sysctl_lock);
1373 wait_for_completion(&wait); 1401 wait_for_completion(&wait);
1374 spin_lock(&sysctl_lock); 1402 spin_lock(&sysctl_lock);
1403 } else {
1404 /* anything non-NULL; we'll never dereference it */
1405 p->unregistering = ERR_PTR(-EINVAL);
1375 } 1406 }
1376 /* 1407 /*
1377 * do not remove from the list until nobody holds it; walking the 1408 * do not remove from the list until nobody holds it; walking the
@@ -1380,6 +1411,32 @@ static void start_unregistering(struct ctl_table_header *p)
1380 list_del_init(&p->ctl_entry); 1411 list_del_init(&p->ctl_entry);
1381} 1412}
1382 1413
1414void sysctl_head_get(struct ctl_table_header *head)
1415{
1416 spin_lock(&sysctl_lock);
1417 head->count++;
1418 spin_unlock(&sysctl_lock);
1419}
1420
1421void sysctl_head_put(struct ctl_table_header *head)
1422{
1423 spin_lock(&sysctl_lock);
1424 if (!--head->count)
1425 kfree(head);
1426 spin_unlock(&sysctl_lock);
1427}
1428
1429struct ctl_table_header *sysctl_head_grab(struct ctl_table_header *head)
1430{
1431 if (!head)
1432 BUG();
1433 spin_lock(&sysctl_lock);
1434 if (!use_table(head))
1435 head = ERR_PTR(-ENOENT);
1436 spin_unlock(&sysctl_lock);
1437 return head;
1438}
1439
1383void sysctl_head_finish(struct ctl_table_header *head) 1440void sysctl_head_finish(struct ctl_table_header *head)
1384{ 1441{
1385 if (!head) 1442 if (!head)
@@ -1389,14 +1446,20 @@ void sysctl_head_finish(struct ctl_table_header *head)
1389 spin_unlock(&sysctl_lock); 1446 spin_unlock(&sysctl_lock);
1390} 1447}
1391 1448
1449static struct ctl_table_set *
1450lookup_header_set(struct ctl_table_root *root, struct nsproxy *namespaces)
1451{
1452 struct ctl_table_set *set = &root->default_set;
1453 if (root->lookup)
1454 set = root->lookup(root, namespaces);
1455 return set;
1456}
1457
1392static struct list_head * 1458static struct list_head *
1393lookup_header_list(struct ctl_table_root *root, struct nsproxy *namespaces) 1459lookup_header_list(struct ctl_table_root *root, struct nsproxy *namespaces)
1394{ 1460{
1395 struct list_head *header_list; 1461 struct ctl_table_set *set = lookup_header_set(root, namespaces);
1396 header_list = &root->header_list; 1462 return &set->list;
1397 if (root->lookup)
1398 header_list = root->lookup(root, namespaces);
1399 return header_list;
1400} 1463}
1401 1464
1402struct ctl_table_header *__sysctl_head_next(struct nsproxy *namespaces, 1465struct ctl_table_header *__sysctl_head_next(struct nsproxy *namespaces,
@@ -1459,22 +1522,20 @@ void register_sysctl_root(struct ctl_table_root *root)
1459/* Perform the actual read/write of a sysctl table entry. */ 1522/* Perform the actual read/write of a sysctl table entry. */
1460static int do_sysctl_strategy(struct ctl_table_root *root, 1523static int do_sysctl_strategy(struct ctl_table_root *root,
1461 struct ctl_table *table, 1524 struct ctl_table *table,
1462 int __user *name, int nlen,
1463 void __user *oldval, size_t __user *oldlenp, 1525 void __user *oldval, size_t __user *oldlenp,
1464 void __user *newval, size_t newlen) 1526 void __user *newval, size_t newlen)
1465{ 1527{
1466 int op = 0, rc; 1528 int op = 0, rc;
1467 1529
1468 if (oldval) 1530 if (oldval)
1469 op |= 004; 1531 op |= MAY_READ;
1470 if (newval) 1532 if (newval)
1471 op |= 002; 1533 op |= MAY_WRITE;
1472 if (sysctl_perm(root, table, op)) 1534 if (sysctl_perm(root, table, op))
1473 return -EPERM; 1535 return -EPERM;
1474 1536
1475 if (table->strategy) { 1537 if (table->strategy) {
1476 rc = table->strategy(table, name, nlen, oldval, oldlenp, 1538 rc = table->strategy(table, oldval, oldlenp, newval, newlen);
1477 newval, newlen);
1478 if (rc < 0) 1539 if (rc < 0)
1479 return rc; 1540 return rc;
1480 if (rc > 0) 1541 if (rc > 0)
@@ -1484,8 +1545,7 @@ static int do_sysctl_strategy(struct ctl_table_root *root,
1484 /* If there is no strategy routine, or if the strategy returns 1545 /* If there is no strategy routine, or if the strategy returns
1485 * zero, proceed with automatic r/w */ 1546 * zero, proceed with automatic r/w */
1486 if (table->data && table->maxlen) { 1547 if (table->data && table->maxlen) {
1487 rc = sysctl_data(table, name, nlen, oldval, oldlenp, 1548 rc = sysctl_data(table, oldval, oldlenp, newval, newlen);
1488 newval, newlen);
1489 if (rc < 0) 1549 if (rc < 0)
1490 return rc; 1550 return rc;
1491 } 1551 }
@@ -1510,14 +1570,14 @@ repeat:
1510 if (n == table->ctl_name) { 1570 if (n == table->ctl_name) {
1511 int error; 1571 int error;
1512 if (table->child) { 1572 if (table->child) {
1513 if (sysctl_perm(root, table, 001)) 1573 if (sysctl_perm(root, table, MAY_EXEC))
1514 return -EPERM; 1574 return -EPERM;
1515 name++; 1575 name++;
1516 nlen--; 1576 nlen--;
1517 table = table->child; 1577 table = table->child;
1518 goto repeat; 1578 goto repeat;
1519 } 1579 }
1520 error = do_sysctl_strategy(root, table, name, nlen, 1580 error = do_sysctl_strategy(root, table,
1521 oldval, oldlenp, 1581 oldval, oldlenp,
1522 newval, newlen); 1582 newval, newlen);
1523 return error; 1583 return error;
@@ -1585,7 +1645,7 @@ static int test_perm(int mode, int op)
1585 mode >>= 6; 1645 mode >>= 6;
1586 else if (in_egroup_p(0)) 1646 else if (in_egroup_p(0))
1587 mode >>= 3; 1647 mode >>= 3;
1588 if ((mode & op & 0007) == op) 1648 if ((op & ~mode & (MAY_READ|MAY_WRITE|MAY_EXEC)) == 0)
1589 return 0; 1649 return 0;
1590 return -EACCES; 1650 return -EACCES;
1591} 1651}
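The rewritten test above grants access only when every requested MAY_* bit is present in the (already shifted) rwx triplet, replacing the old raw-octal comparison. A standalone sketch of that test, using the MAY_EXEC/MAY_WRITE/MAY_READ values from <linux/fs.h> (1, 2 and 4) and made-up sample modes:

/* Sketch: the permission test used above, outside the kernel. */
#include <stdio.h>

#define MAY_EXEC  0x1
#define MAY_WRITE 0x2
#define MAY_READ  0x4

/* 'mode' is the rwx triplet already shifted for the caller (owner,
 * group or other); 'op' is the set of MAY_* bits requested.  Access
 * is granted only if every requested bit is present in mode. */
static int test_perm(int mode, int op)
{
        if ((op & ~mode & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0)
                return 0;       /* ok */
        return -1;              /* would be -EACCES in the kernel */
}

int main(void)
{
        /* mode 6 = rw-: reading and writing pass, exec does not */
        printf("%d %d %d\n",
               test_perm(06, MAY_READ),
               test_perm(06, MAY_READ | MAY_WRITE),
               test_perm(06, MAY_EXEC));
        return 0;
}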
@@ -1595,7 +1655,7 @@ int sysctl_perm(struct ctl_table_root *root, struct ctl_table *table, int op)
1595 int error; 1655 int error;
1596 int mode; 1656 int mode;
1597 1657
1598 error = security_sysctl(table, op); 1658 error = security_sysctl(table, op & (MAY_READ | MAY_WRITE | MAY_EXEC));
1599 if (error) 1659 if (error)
1600 return error; 1660 return error;
1601 1661
@@ -1630,6 +1690,54 @@ static __init int sysctl_init(void)
1630 1690
1631core_initcall(sysctl_init); 1691core_initcall(sysctl_init);
1632 1692
1693static struct ctl_table *is_branch_in(struct ctl_table *branch,
1694 struct ctl_table *table)
1695{
1696 struct ctl_table *p;
1697 const char *s = branch->procname;
1698
1699 /* branch should have named subdirectory as its first element */
1700 if (!s || !branch->child)
1701 return NULL;
1702
1703 /* ... and nothing else */
1704 if (branch[1].procname || branch[1].ctl_name)
1705 return NULL;
1706
1707 /* table should contain subdirectory with the same name */
1708 for (p = table; p->procname || p->ctl_name; p++) {
1709 if (!p->child)
1710 continue;
1711 if (p->procname && strcmp(p->procname, s) == 0)
1712 return p;
1713 }
1714 return NULL;
1715}
1716
1717/* see if attaching q to p would be an improvement */
1718static void try_attach(struct ctl_table_header *p, struct ctl_table_header *q)
1719{
1720 struct ctl_table *to = p->ctl_table, *by = q->ctl_table;
1721 struct ctl_table *next;
1722 int is_better = 0;
1723 int not_in_parent = !p->attached_by;
1724
1725 while ((next = is_branch_in(by, to)) != NULL) {
1726 if (by == q->attached_by)
1727 is_better = 1;
1728 if (to == p->attached_by)
1729 not_in_parent = 1;
1730 by = by->child;
1731 to = next->child;
1732 }
1733
1734 if (is_better && not_in_parent) {
1735 q->attached_by = by;
1736 q->attached_to = to;
1737 q->parent = p;
1738 }
1739}
1740
1633/** 1741/**
1634 * __register_sysctl_paths - register a sysctl hierarchy 1742 * __register_sysctl_paths - register a sysctl hierarchy
1635 * @root: List of sysctl headers to register on 1743 * @root: List of sysctl headers to register on
@@ -1706,10 +1814,10 @@ struct ctl_table_header *__register_sysctl_paths(
1706 struct nsproxy *namespaces, 1814 struct nsproxy *namespaces,
1707 const struct ctl_path *path, struct ctl_table *table) 1815 const struct ctl_path *path, struct ctl_table *table)
1708{ 1816{
1709 struct list_head *header_list;
1710 struct ctl_table_header *header; 1817 struct ctl_table_header *header;
1711 struct ctl_table *new, **prevp; 1818 struct ctl_table *new, **prevp;
1712 unsigned int n, npath; 1819 unsigned int n, npath;
1820 struct ctl_table_set *set;
1713 1821
1714 /* Count the path components */ 1822 /* Count the path components */
1715 for (npath = 0; path[npath].ctl_name || path[npath].procname; ++npath) 1823 for (npath = 0; path[npath].ctl_name || path[npath].procname; ++npath)
@@ -1751,6 +1859,7 @@ struct ctl_table_header *__register_sysctl_paths(
1751 header->unregistering = NULL; 1859 header->unregistering = NULL;
1752 header->root = root; 1860 header->root = root;
1753 sysctl_set_parent(NULL, header->ctl_table); 1861 sysctl_set_parent(NULL, header->ctl_table);
1862 header->count = 1;
1754#ifdef CONFIG_SYSCTL_SYSCALL_CHECK 1863#ifdef CONFIG_SYSCTL_SYSCALL_CHECK
1755 if (sysctl_check_table(namespaces, header->ctl_table)) { 1864 if (sysctl_check_table(namespaces, header->ctl_table)) {
1756 kfree(header); 1865 kfree(header);
@@ -1758,8 +1867,20 @@ struct ctl_table_header *__register_sysctl_paths(
1758 } 1867 }
1759#endif 1868#endif
1760 spin_lock(&sysctl_lock); 1869 spin_lock(&sysctl_lock);
1761 header_list = lookup_header_list(root, namespaces); 1870 header->set = lookup_header_set(root, namespaces);
1762 list_add_tail(&header->ctl_entry, header_list); 1871 header->attached_by = header->ctl_table;
1872 header->attached_to = root_table;
1873 header->parent = &root_table_header;
1874 for (set = header->set; set; set = set->parent) {
1875 struct ctl_table_header *p;
1876 list_for_each_entry(p, &set->list, ctl_entry) {
1877 if (p->unregistering)
1878 continue;
1879 try_attach(p, header);
1880 }
1881 }
1882 header->parent->count++;
1883 list_add_tail(&header->ctl_entry, &header->set->list);
1763 spin_unlock(&sysctl_lock); 1884 spin_unlock(&sysctl_lock);
1764 1885
1765 return header; 1886 return header;
@@ -1814,8 +1935,37 @@ void unregister_sysctl_table(struct ctl_table_header * header)
1814 1935
1815 spin_lock(&sysctl_lock); 1936 spin_lock(&sysctl_lock);
1816 start_unregistering(header); 1937 start_unregistering(header);
1938 if (!--header->parent->count) {
1939 WARN_ON(1);
1940 kfree(header->parent);
1941 }
1942 if (!--header->count)
1943 kfree(header);
1944 spin_unlock(&sysctl_lock);
1945}
1946
1947int sysctl_is_seen(struct ctl_table_header *p)
1948{
1949 struct ctl_table_set *set = p->set;
1950 int res;
1951 spin_lock(&sysctl_lock);
1952 if (p->unregistering)
1953 res = 0;
1954 else if (!set->is_seen)
1955 res = 1;
1956 else
1957 res = set->is_seen(set);
1817 spin_unlock(&sysctl_lock); 1958 spin_unlock(&sysctl_lock);
1818 kfree(header); 1959 return res;
1960}
1961
1962void setup_sysctl_set(struct ctl_table_set *p,
1963 struct ctl_table_set *parent,
1964 int (*is_seen)(struct ctl_table_set *))
1965{
1966 INIT_LIST_HEAD(&p->list);
1967 p->parent = parent ? parent : &sysctl_table_root.default_set;
1968 p->is_seen = is_seen;
1819} 1969}
1820 1970
1821#else /* !CONFIG_SYSCTL */ 1971#else /* !CONFIG_SYSCTL */
@@ -1834,6 +1984,16 @@ void unregister_sysctl_table(struct ctl_table_header * table)
1834{ 1984{
1835} 1985}
1836 1986
1987void setup_sysctl_set(struct ctl_table_set *p,
1988 struct ctl_table_set *parent,
1989 int (*is_seen)(struct ctl_table_set *))
1990{
1991}
1992
1993void sysctl_head_put(struct ctl_table_header *head)
1994{
1995}
1996
1837#endif /* CONFIG_SYSCTL */ 1997#endif /* CONFIG_SYSCTL */
1838 1998
1839/* 1999/*
@@ -2086,49 +2246,39 @@ int proc_dointvec(struct ctl_table *table, int write, struct file *filp,
2086 NULL,NULL); 2246 NULL,NULL);
2087} 2247}
2088 2248
2089#define OP_SET 0
2090#define OP_AND 1
2091#define OP_OR 2
2092
2093static int do_proc_dointvec_bset_conv(int *negp, unsigned long *lvalp,
2094 int *valp,
2095 int write, void *data)
2096{
2097 int op = *(int *)data;
2098 if (write) {
2099 int val = *negp ? -*lvalp : *lvalp;
2100 switch(op) {
2101 case OP_SET: *valp = val; break;
2102 case OP_AND: *valp &= val; break;
2103 case OP_OR: *valp |= val; break;
2104 }
2105 } else {
2106 int val = *valp;
2107 if (val < 0) {
2108 *negp = -1;
2109 *lvalp = (unsigned long)-val;
2110 } else {
2111 *negp = 0;
2112 *lvalp = (unsigned long)val;
2113 }
2114 }
2115 return 0;
2116}
2117
2118/* 2249/*
2119 * Taint values can only be increased 2250 * Taint values can only be increased
2251 * This means we can safely use a temporary.
2120 */ 2252 */
2121static int proc_dointvec_taint(struct ctl_table *table, int write, struct file *filp, 2253static int proc_taint(struct ctl_table *table, int write, struct file *filp,
2122 void __user *buffer, size_t *lenp, loff_t *ppos) 2254 void __user *buffer, size_t *lenp, loff_t *ppos)
2123{ 2255{
2124 int op; 2256 struct ctl_table t;
2257 unsigned long tmptaint = get_taint();
2258 int err;
2125 2259
2126 if (write && !capable(CAP_SYS_ADMIN)) 2260 if (write && !capable(CAP_SYS_ADMIN))
2127 return -EPERM; 2261 return -EPERM;
2128 2262
2129 op = OP_OR; 2263 t = *table;
2130 return do_proc_dointvec(table,write,filp,buffer,lenp,ppos, 2264 t.data = &tmptaint;
2131 do_proc_dointvec_bset_conv,&op); 2265 err = proc_doulongvec_minmax(&t, write, filp, buffer, lenp, ppos);
2266 if (err < 0)
2267 return err;
2268
2269 if (write) {
2270 /*
2271 * Poor man's atomic or. Not worth adding a primitive
2272 * to everyone's atomic.h for this
2273 */
2274 int i;
2275 for (i = 0; i < BITS_PER_LONG && tmptaint >> i; i++) {
2276 if ((tmptaint >> i) & 1)
2277 add_taint(i);
2278 }
2279 }
2280
2281 return err;
2132} 2282}
2133 2283
2134struct do_proc_dointvec_minmax_conv_param { 2284struct do_proc_dointvec_minmax_conv_param {
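proc_taint() above routes the write through a temporary and then replays each set bit via add_taint(), the "poor man's atomic or" of the comment. A userspace sketch of that bit walk; apply_bit() and the sample values stand in for add_taint() and real taint state.

/* Sketch of the "poor man's atomic or" loop used by proc_taint() above. */
#include <stdio.h>
#include <limits.h>

#define BITS_PER_LONG (sizeof(unsigned long) * CHAR_BIT)

static unsigned long current_taint;

static void apply_bit(unsigned int bit)   /* plays the role of add_taint() */
{
        current_taint |= 1UL << bit;
}

static void or_in_taint(unsigned long tmptaint)
{
        unsigned int i;

        /* Walk the set bits of the written value and apply them one
         * by one; bits can only be added, never cleared. */
        for (i = 0; i < BITS_PER_LONG && (tmptaint >> i); i++)
                if ((tmptaint >> i) & 1)
                        apply_bit(i);
}

int main(void)
{
        current_taint = 0x1;            /* already tainted with bit 0 */
        or_in_taint(0x6);               /* user writes bits 1 and 2   */
        printf("taint = %#lx\n", current_taint);        /* 0x7 */
        return 0;
}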
@@ -2576,7 +2726,7 @@ int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int write,
2576 */ 2726 */
2577 2727
2578/* The generic sysctl data routine (used if no strategy routine supplied) */ 2728/* The generic sysctl data routine (used if no strategy routine supplied) */
2579int sysctl_data(struct ctl_table *table, int __user *name, int nlen, 2729int sysctl_data(struct ctl_table *table,
2580 void __user *oldval, size_t __user *oldlenp, 2730 void __user *oldval, size_t __user *oldlenp,
2581 void __user *newval, size_t newlen) 2731 void __user *newval, size_t newlen)
2582{ 2732{
@@ -2610,7 +2760,7 @@ int sysctl_data(struct ctl_table *table, int __user *name, int nlen,
2610} 2760}
2611 2761
2612/* The generic string strategy routine: */ 2762/* The generic string strategy routine: */
2613int sysctl_string(struct ctl_table *table, int __user *name, int nlen, 2763int sysctl_string(struct ctl_table *table,
2614 void __user *oldval, size_t __user *oldlenp, 2764 void __user *oldval, size_t __user *oldlenp,
2615 void __user *newval, size_t newlen) 2765 void __user *newval, size_t newlen)
2616{ 2766{
@@ -2656,7 +2806,7 @@ int sysctl_string(struct ctl_table *table, int __user *name, int nlen,
2656 * are between the minimum and maximum values given in the arrays 2806 * are between the minimum and maximum values given in the arrays
2657 * table->extra1 and table->extra2, respectively. 2807 * table->extra1 and table->extra2, respectively.
2658 */ 2808 */
2659int sysctl_intvec(struct ctl_table *table, int __user *name, int nlen, 2809int sysctl_intvec(struct ctl_table *table,
2660 void __user *oldval, size_t __user *oldlenp, 2810 void __user *oldval, size_t __user *oldlenp,
2661 void __user *newval, size_t newlen) 2811 void __user *newval, size_t newlen)
2662{ 2812{
@@ -2692,7 +2842,7 @@ int sysctl_intvec(struct ctl_table *table, int __user *name, int nlen,
2692} 2842}
2693 2843
2694/* Strategy function to convert jiffies to seconds */ 2844/* Strategy function to convert jiffies to seconds */
2695int sysctl_jiffies(struct ctl_table *table, int __user *name, int nlen, 2845int sysctl_jiffies(struct ctl_table *table,
2696 void __user *oldval, size_t __user *oldlenp, 2846 void __user *oldval, size_t __user *oldlenp,
2697 void __user *newval, size_t newlen) 2847 void __user *newval, size_t newlen)
2698{ 2848{
@@ -2726,7 +2876,7 @@ int sysctl_jiffies(struct ctl_table *table, int __user *name, int nlen,
2726} 2876}
2727 2877
2728/* Strategy function to convert jiffies to seconds */ 2878/* Strategy function to convert jiffies to seconds */
2729int sysctl_ms_jiffies(struct ctl_table *table, int __user *name, int nlen, 2879int sysctl_ms_jiffies(struct ctl_table *table,
2730 void __user *oldval, size_t __user *oldlenp, 2880 void __user *oldval, size_t __user *oldlenp,
2731 void __user *newval, size_t newlen) 2881 void __user *newval, size_t newlen)
2732{ 2882{
@@ -2781,35 +2931,35 @@ asmlinkage long sys_sysctl(struct __sysctl_args __user *args)
2781 return error; 2931 return error;
2782} 2932}
2783 2933
2784int sysctl_data(struct ctl_table *table, int __user *name, int nlen, 2934int sysctl_data(struct ctl_table *table,
2785 void __user *oldval, size_t __user *oldlenp, 2935 void __user *oldval, size_t __user *oldlenp,
2786 void __user *newval, size_t newlen) 2936 void __user *newval, size_t newlen)
2787{ 2937{
2788 return -ENOSYS; 2938 return -ENOSYS;
2789} 2939}
2790 2940
2791int sysctl_string(struct ctl_table *table, int __user *name, int nlen, 2941int sysctl_string(struct ctl_table *table,
2792 void __user *oldval, size_t __user *oldlenp, 2942 void __user *oldval, size_t __user *oldlenp,
2793 void __user *newval, size_t newlen) 2943 void __user *newval, size_t newlen)
2794{ 2944{
2795 return -ENOSYS; 2945 return -ENOSYS;
2796} 2946}
2797 2947
2798int sysctl_intvec(struct ctl_table *table, int __user *name, int nlen, 2948int sysctl_intvec(struct ctl_table *table,
2799 void __user *oldval, size_t __user *oldlenp, 2949 void __user *oldval, size_t __user *oldlenp,
2800 void __user *newval, size_t newlen) 2950 void __user *newval, size_t newlen)
2801{ 2951{
2802 return -ENOSYS; 2952 return -ENOSYS;
2803} 2953}
2804 2954
2805int sysctl_jiffies(struct ctl_table *table, int __user *name, int nlen, 2955int sysctl_jiffies(struct ctl_table *table,
2806 void __user *oldval, size_t __user *oldlenp, 2956 void __user *oldval, size_t __user *oldlenp,
2807 void __user *newval, size_t newlen) 2957 void __user *newval, size_t newlen)
2808{ 2958{
2809 return -ENOSYS; 2959 return -ENOSYS;
2810} 2960}
2811 2961
2812int sysctl_ms_jiffies(struct ctl_table *table, int __user *name, int nlen, 2962int sysctl_ms_jiffies(struct ctl_table *table,
2813 void __user *oldval, size_t __user *oldlenp, 2963 void __user *oldval, size_t __user *oldlenp,
2814 void __user *newval, size_t newlen) 2964 void __user *newval, size_t newlen)
2815{ 2965{
diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c
index c09350d564f2..c35da23ab8fb 100644
--- a/kernel/sysctl_check.c
+++ b/kernel/sysctl_check.c
@@ -1532,6 +1532,8 @@ int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table)
1532 sysctl_check_leaf(namespaces, table, &fail); 1532 sysctl_check_leaf(namespaces, table, &fail);
1533 } 1533 }
1534 sysctl_check_bin_path(table, &fail); 1534 sysctl_check_bin_path(table, &fail);
1535 if (table->mode > 0777)
1536 set_fail(&fail, table, "bogus .mode");
1535 if (fail) { 1537 if (fail) {
1536 set_fail(&fail, table, NULL); 1538 set_fail(&fail, table, NULL);
1537 error = -EINVAL; 1539 error = -EINVAL;
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 4a23517169a6..bd6be76303cf 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -35,7 +35,7 @@
35 */ 35 */
36#define TASKSTATS_CPUMASK_MAXLEN (100+6*NR_CPUS) 36#define TASKSTATS_CPUMASK_MAXLEN (100+6*NR_CPUS)
37 37
38static DEFINE_PER_CPU(__u32, taskstats_seqnum) = { 0 }; 38static DEFINE_PER_CPU(__u32, taskstats_seqnum);
39static int family_registered; 39static int family_registered;
40struct kmem_cache *taskstats_cache; 40struct kmem_cache *taskstats_cache;
41 41
@@ -301,7 +301,7 @@ static int add_del_listener(pid_t pid, cpumask_t *maskp, int isadd)
301 return -EINVAL; 301 return -EINVAL;
302 302
303 if (isadd == REGISTER) { 303 if (isadd == REGISTER) {
304 for_each_cpu_mask(cpu, mask) { 304 for_each_cpu_mask_nr(cpu, mask) {
305 s = kmalloc_node(sizeof(struct listener), GFP_KERNEL, 305 s = kmalloc_node(sizeof(struct listener), GFP_KERNEL,
306 cpu_to_node(cpu)); 306 cpu_to_node(cpu));
307 if (!s) 307 if (!s)
@@ -320,7 +320,7 @@ static int add_del_listener(pid_t pid, cpumask_t *maskp, int isadd)
320 320
321 /* Deregister or cleanup */ 321 /* Deregister or cleanup */
322cleanup: 322cleanup:
323 for_each_cpu_mask(cpu, mask) { 323 for_each_cpu_mask_nr(cpu, mask) {
324 listeners = &per_cpu(listener_array, cpu); 324 listeners = &per_cpu(listener_array, cpu);
325 down_write(&listeners->sem); 325 down_write(&listeners->sem);
326 list_for_each_entry_safe(s, tmp, &listeners->list, list) { 326 list_for_each_entry_safe(s, tmp, &listeners->list, list) {
diff --git a/kernel/time.c b/kernel/time.c
index 6a08660b4fac..d63a4336fad6 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -669,3 +669,21 @@ EXPORT_SYMBOL(get_jiffies_64);
669#endif 669#endif
670 670
671EXPORT_SYMBOL(jiffies); 671EXPORT_SYMBOL(jiffies);
672
673/*
674 * Add two timespec values and do a safety check for overflow.
675 * It's assumed that both values are valid (>= 0)
676 */
677struct timespec timespec_add_safe(const struct timespec lhs,
678 const struct timespec rhs)
679{
680 struct timespec res;
681
682 set_normalized_timespec(&res, lhs.tv_sec + rhs.tv_sec,
683 lhs.tv_nsec + rhs.tv_nsec);
684
685 if (res.tv_sec < lhs.tv_sec || res.tv_sec < rhs.tv_sec)
686 res.tv_sec = TIME_T_MAX;
687
688 return res;
689}
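timespec_add_safe() above normalizes the sum and clamps tv_sec to TIME_T_MAX when it would wrap. A self-contained sketch of the same logic; the normalize() helper and the local TIME_T_MAX definition stand in for the kernel's set_normalized_timespec() and <linux/time.h>.

/* Sketch of the overflow-safe timespec addition added above. */
#include <stdio.h>
#include <time.h>

#define NSEC_PER_SEC 1000000000L
/* mirrors the kernel's TIME_T_MAX definition */
#define TIME_T_MAX   (time_t)((1UL << ((sizeof(time_t) << 3) - 1)) - 1)

static void normalize(struct timespec *ts, time_t sec, long nsec)
{
        while (nsec >= NSEC_PER_SEC) {
                nsec -= NSEC_PER_SEC;
                ++sec;
        }
        ts->tv_sec = sec;
        ts->tv_nsec = nsec;
}

/* Both inputs are assumed valid (>= 0); if the normalized sum would
 * wrap, the result is clamped to TIME_T_MAX instead of going negative. */
static struct timespec timespec_add_safe(struct timespec lhs, struct timespec rhs)
{
        struct timespec res;

        normalize(&res, lhs.tv_sec + rhs.tv_sec, lhs.tv_nsec + rhs.tv_nsec);
        if (res.tv_sec < lhs.tv_sec || res.tv_sec < rhs.tv_sec)
                res.tv_sec = TIME_T_MAX;
        return res;
}

int main(void)
{
        struct timespec a = { 5, 800000000L };
        struct timespec b = { 3, 700000000L };
        struct timespec c = timespec_add_safe(a, b);    /* 9.500000000 */

        printf("%ld.%09ld\n", (long)c.tv_sec, (long)c.tv_nsec);
        return 0;
}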
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index 8d53106a0a92..95ed42951e0a 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -3,7 +3,6 @@
3# 3#
4config TICK_ONESHOT 4config TICK_ONESHOT
5 bool 5 bool
6 default n
7 6
8config NO_HZ 7config NO_HZ
9 bool "Tickless System (Dynamic Ticks)" 8 bool "Tickless System (Dynamic Ticks)"
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index 3d1e3e1a1971..f8d968063cea 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -72,6 +72,16 @@ void clockevents_set_mode(struct clock_event_device *dev,
72} 72}
73 73
74/** 74/**
75 * clockevents_shutdown - shutdown the device and clear next_event
76 * @dev: device to shutdown
77 */
78void clockevents_shutdown(struct clock_event_device *dev)
79{
80 clockevents_set_mode(dev, CLOCK_EVT_MODE_SHUTDOWN);
81 dev->next_event.tv64 = KTIME_MAX;
82}
83
84/**
75 * clockevents_program_event - Reprogram the clock event device. 85 * clockevents_program_event - Reprogram the clock event device.
76 * @expires: absolute expiry time (monotonic clock) 86 * @expires: absolute expiry time (monotonic clock)
77 * 87 *
@@ -177,7 +187,7 @@ void clockevents_register_device(struct clock_event_device *dev)
177/* 187/*
178 * Noop handler when we shut down an event device 188 * Noop handler when we shut down an event device
179 */ 189 */
180static void clockevents_handle_noop(struct clock_event_device *dev) 190void clockevents_handle_noop(struct clock_event_device *dev)
181{ 191{
182} 192}
183 193
@@ -199,7 +209,6 @@ void clockevents_exchange_device(struct clock_event_device *old,
199 * released list and do a notify add later. 209 * released list and do a notify add later.
200 */ 210 */
201 if (old) { 211 if (old) {
202 old->event_handler = clockevents_handle_noop;
203 clockevents_set_mode(old, CLOCK_EVT_MODE_UNUSED); 212 clockevents_set_mode(old, CLOCK_EVT_MODE_UNUSED);
204 list_del(&old->list); 213 list_del(&old->list);
205 list_add(&old->list, &clockevents_released); 214 list_add(&old->list, &clockevents_released);
@@ -207,7 +216,7 @@ void clockevents_exchange_device(struct clock_event_device *old,
207 216
208 if (new) { 217 if (new) {
209 BUG_ON(new->mode != CLOCK_EVT_MODE_UNUSED); 218 BUG_ON(new->mode != CLOCK_EVT_MODE_UNUSED);
210 clockevents_set_mode(new, CLOCK_EVT_MODE_SHUTDOWN); 219 clockevents_shutdown(new);
211 } 220 }
212 local_irq_restore(flags); 221 local_irq_restore(flags);
213} 222}
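clockevents_shutdown() above folds the recurring "switch the device to SHUTDOWN and forget its pending expiry" pair into one helper, which the later hunks then call from the suspend, broadcast-off and device-exchange paths. A toy illustration of that refactor; the struct, mode names and KTIME_MAX stand-in are invented for the demo.

/* Sketch of the clockevents_shutdown() refactor above. */
#include <stdio.h>
#include <stdint.h>

#define DEMO_KTIME_MAX INT64_MAX

enum demo_mode { DEMO_MODE_SHUTDOWN, DEMO_MODE_PERIODIC, DEMO_MODE_ONESHOT };

struct demo_event_device {
        enum demo_mode mode;
        int64_t next_event;     /* pending expiry, ns */
};

static void demo_set_mode(struct demo_event_device *dev, enum demo_mode mode)
{
        dev->mode = mode;
}

/* One call instead of open-coding set_mode(SHUTDOWN) plus a next_event
 * reset at every call site. */
static void demo_shutdown(struct demo_event_device *dev)
{
        demo_set_mode(dev, DEMO_MODE_SHUTDOWN);
        dev->next_event = DEMO_KTIME_MAX;
}

int main(void)
{
        struct demo_event_device dev = { DEMO_MODE_ONESHOT, 123456789 };

        demo_shutdown(&dev);
        printf("mode=%d next_event=%lld\n", dev.mode, (long long)dev.next_event);
        return 0;
}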
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index dadde5361f32..9ed2eec97526 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -145,9 +145,9 @@ static void clocksource_watchdog(unsigned long data)
145 * Cycle through CPUs to check if the CPUs stay 145 * Cycle through CPUs to check if the CPUs stay
146 * synchronized to each other. 146 * synchronized to each other.
147 */ 147 */
148 int next_cpu = next_cpu(raw_smp_processor_id(), cpu_online_map); 148 int next_cpu = next_cpu_nr(raw_smp_processor_id(), cpu_online_map);
149 149
150 if (next_cpu >= NR_CPUS) 150 if (next_cpu >= nr_cpu_ids)
151 next_cpu = first_cpu(cpu_online_map); 151 next_cpu = first_cpu(cpu_online_map);
152 watchdog_timer.expires += WATCHDOG_INTERVAL; 152 watchdog_timer.expires += WATCHDOG_INTERVAL;
153 add_timer_on(&watchdog_timer, next_cpu); 153 add_timer_on(&watchdog_timer, next_cpu);
@@ -325,6 +325,9 @@ int clocksource_register(struct clocksource *c)
325 unsigned long flags; 325 unsigned long flags;
326 int ret; 326 int ret;
327 327
328 /* save mult_orig on registration */
329 c->mult_orig = c->mult;
330
328 spin_lock_irqsave(&clocksource_lock, flags); 331 spin_lock_irqsave(&clocksource_lock, flags);
329 ret = clocksource_enqueue(c); 332 ret = clocksource_enqueue(c);
330 if (!ret) 333 if (!ret)
@@ -376,7 +379,8 @@ void clocksource_unregister(struct clocksource *cs)
376 * Provides sysfs interface for listing current clocksource. 379 * Provides sysfs interface for listing current clocksource.
377 */ 380 */
378static ssize_t 381static ssize_t
379sysfs_show_current_clocksources(struct sys_device *dev, char *buf) 382sysfs_show_current_clocksources(struct sys_device *dev,
383 struct sysdev_attribute *attr, char *buf)
380{ 384{
381 ssize_t count = 0; 385 ssize_t count = 0;
382 386
@@ -397,6 +401,7 @@ sysfs_show_current_clocksources(struct sys_device *dev, char *buf)
397 * clocksource selection. 401 * clocksource selection.
398 */ 402 */
399static ssize_t sysfs_override_clocksource(struct sys_device *dev, 403static ssize_t sysfs_override_clocksource(struct sys_device *dev,
404 struct sysdev_attribute *attr,
400 const char *buf, size_t count) 405 const char *buf, size_t count)
401{ 406{
402 struct clocksource *ovr = NULL; 407 struct clocksource *ovr = NULL;
@@ -449,7 +454,9 @@ static ssize_t sysfs_override_clocksource(struct sys_device *dev,
449 * Provides sysfs interface for listing registered clocksources 454 * Provides sysfs interface for listing registered clocksources
450 */ 455 */
451static ssize_t 456static ssize_t
452sysfs_show_available_clocksources(struct sys_device *dev, char *buf) 457sysfs_show_available_clocksources(struct sys_device *dev,
458 struct sysdev_attribute *attr,
459 char *buf)
453{ 460{
454 struct clocksource *src; 461 struct clocksource *src;
455 ssize_t count = 0; 462 ssize_t count = 0;
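clocksource_register() above snapshots mult into mult_orig before the clocksource goes live, since the working multiplier is adjusted at runtime (frequency corrections) while the original value is still wanted for unadjusted readings. A small sketch of that idea; the structure, the adjustment step and the numbers are illustrative.

/* Sketch: why the pristine multiplier is saved once at registration. */
#include <stdio.h>
#include <stdint.h>

struct demo_clocksource {
        uint32_t mult;          /* adjusted at runtime          */
        uint32_t mult_orig;     /* frozen copy for raw readings */
        uint32_t shift;
};

static void demo_register(struct demo_clocksource *cs)
{
        cs->mult_orig = cs->mult;       /* save before anyone touches mult */
}

static uint64_t cyc_to_ns(uint64_t cycles, uint32_t mult, uint32_t shift)
{
        return (cycles * mult) >> shift;
}

int main(void)
{
        struct demo_clocksource cs = { .mult = 4194304, .shift = 22 };

        demo_register(&cs);
        cs.mult += 1000;        /* pretend a frequency correction happened */

        printf("adjusted: %llu ns, raw: %llu ns\n",
               (unsigned long long)cyc_to_ns(1000000, cs.mult, cs.shift),
               (unsigned long long)cyc_to_ns(1000000, cs.mult_orig, cs.shift));
        return 0;
}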
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
index 4c256fdb8875..1ca99557e929 100644
--- a/kernel/time/jiffies.c
+++ b/kernel/time/jiffies.c
@@ -61,6 +61,7 @@ struct clocksource clocksource_jiffies = {
61 .read = jiffies_read, 61 .read = jiffies_read,
62 .mask = 0xffffffff, /*32bits*/ 62 .mask = 0xffffffff, /*32bits*/
63 .mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */ 63 .mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */
64 .mult_orig = NSEC_PER_JIFFY << JIFFIES_SHIFT,
64 .shift = JIFFIES_SHIFT, 65 .shift = JIFFIES_SHIFT,
65}; 66};
66 67
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 5125ddd8196b..8ff15e5d486b 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -10,13 +10,13 @@
10 10
11#include <linux/mm.h> 11#include <linux/mm.h>
12#include <linux/time.h> 12#include <linux/time.h>
13#include <linux/timer.h>
14#include <linux/timex.h> 13#include <linux/timex.h>
15#include <linux/jiffies.h> 14#include <linux/jiffies.h>
16#include <linux/hrtimer.h> 15#include <linux/hrtimer.h>
17#include <linux/capability.h> 16#include <linux/capability.h>
18#include <linux/math64.h> 17#include <linux/math64.h>
19#include <linux/clocksource.h> 18#include <linux/clocksource.h>
19#include <linux/workqueue.h>
20#include <asm/timex.h> 20#include <asm/timex.h>
21 21
22/* 22/*
@@ -142,8 +142,7 @@ static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer)
142 time_state = TIME_OOP; 142 time_state = TIME_OOP;
143 printk(KERN_NOTICE "Clock: " 143 printk(KERN_NOTICE "Clock: "
144 "inserting leap second 23:59:60 UTC\n"); 144 "inserting leap second 23:59:60 UTC\n");
145 leap_timer.expires = ktime_add_ns(leap_timer.expires, 145 hrtimer_add_expires_ns(&leap_timer, NSEC_PER_SEC);
146 NSEC_PER_SEC);
147 res = HRTIMER_RESTART; 146 res = HRTIMER_RESTART;
148 break; 147 break;
149 case TIME_DEL: 148 case TIME_DEL:
@@ -218,11 +217,11 @@ void second_overflow(void)
218/* Disable the cmos update - used by virtualization and embedded */ 217/* Disable the cmos update - used by virtualization and embedded */
219int no_sync_cmos_clock __read_mostly; 218int no_sync_cmos_clock __read_mostly;
220 219
221static void sync_cmos_clock(unsigned long dummy); 220static void sync_cmos_clock(struct work_struct *work);
222 221
223static DEFINE_TIMER(sync_cmos_timer, sync_cmos_clock, 0, 0); 222static DECLARE_DELAYED_WORK(sync_cmos_work, sync_cmos_clock);
224 223
225static void sync_cmos_clock(unsigned long dummy) 224static void sync_cmos_clock(struct work_struct *work)
226{ 225{
227 struct timespec now, next; 226 struct timespec now, next;
228 int fail = 1; 227 int fail = 1;
@@ -245,7 +244,7 @@ static void sync_cmos_clock(unsigned long dummy)
245 if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec / 2) 244 if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec / 2)
246 fail = update_persistent_clock(now); 245 fail = update_persistent_clock(now);
247 246
248 next.tv_nsec = (NSEC_PER_SEC / 2) - now.tv_nsec; 247 next.tv_nsec = (NSEC_PER_SEC / 2) - now.tv_nsec - (TICK_NSEC / 2);
249 if (next.tv_nsec <= 0) 248 if (next.tv_nsec <= 0)
250 next.tv_nsec += NSEC_PER_SEC; 249 next.tv_nsec += NSEC_PER_SEC;
251 250
@@ -258,13 +257,13 @@ static void sync_cmos_clock(unsigned long dummy)
258 next.tv_sec++; 257 next.tv_sec++;
259 next.tv_nsec -= NSEC_PER_SEC; 258 next.tv_nsec -= NSEC_PER_SEC;
260 } 259 }
261 mod_timer(&sync_cmos_timer, jiffies + timespec_to_jiffies(&next)); 260 schedule_delayed_work(&sync_cmos_work, timespec_to_jiffies(&next));
262} 261}
263 262
264static void notify_cmos_timer(void) 263static void notify_cmos_timer(void)
265{ 264{
266 if (!no_sync_cmos_clock) 265 if (!no_sync_cmos_clock)
267 mod_timer(&sync_cmos_timer, jiffies + 1); 266 schedule_delayed_work(&sync_cmos_work, 0);
268} 267}
269 268
270#else 269#else
@@ -277,38 +276,50 @@ static inline void notify_cmos_timer(void) { }
277int do_adjtimex(struct timex *txc) 276int do_adjtimex(struct timex *txc)
278{ 277{
279 struct timespec ts; 278 struct timespec ts;
280 long save_adjust, sec;
281 int result; 279 int result;
282 280
283 /* In order to modify anything, you gotta be super-user! */ 281 /* Validate the data before disabling interrupts */
284 if (txc->modes && !capable(CAP_SYS_TIME)) 282 if (txc->modes & ADJ_ADJTIME) {
285 return -EPERM;
286
287 /* Now we validate the data before disabling interrupts */
288
289 if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT) {
290 /* singleshot must not be used with any other mode bits */ 283 /* singleshot must not be used with any other mode bits */
291 if (txc->modes & ~ADJ_OFFSET_SS_READ) 284 if (!(txc->modes & ADJ_OFFSET_SINGLESHOT))
292 return -EINVAL; 285 return -EINVAL;
286 if (!(txc->modes & ADJ_OFFSET_READONLY) &&
287 !capable(CAP_SYS_TIME))
288 return -EPERM;
289 } else {
290 /* In order to modify anything, you gotta be super-user! */
291 if (txc->modes && !capable(CAP_SYS_TIME))
292 return -EPERM;
293
294 /* if the quartz is off by more than 10% something is VERY wrong! */
295 if (txc->modes & ADJ_TICK &&
296 (txc->tick < 900000/USER_HZ ||
297 txc->tick > 1100000/USER_HZ))
298 return -EINVAL;
299
300 if (txc->modes & ADJ_STATUS && time_state != TIME_OK)
301 hrtimer_cancel(&leap_timer);
293 } 302 }
294 303
295 /* if the quartz is off by more than 10% something is VERY wrong ! */
296 if (txc->modes & ADJ_TICK)
297 if (txc->tick < 900000/USER_HZ ||
298 txc->tick > 1100000/USER_HZ)
299 return -EINVAL;
300
301 if (time_state != TIME_OK && txc->modes & ADJ_STATUS)
302 hrtimer_cancel(&leap_timer);
303 getnstimeofday(&ts); 304 getnstimeofday(&ts);
304 305
305 write_seqlock_irq(&xtime_lock); 306 write_seqlock_irq(&xtime_lock);
306 307
307 /* Save for later - semantics of adjtime is to return old value */
308 save_adjust = time_adjust;
309
310 /* If there are input parameters, then process them */ 308 /* If there are input parameters, then process them */
309 if (txc->modes & ADJ_ADJTIME) {
310 long save_adjust = time_adjust;
311
312 if (!(txc->modes & ADJ_OFFSET_READONLY)) {
313 /* adjtime() is independent from ntp_adjtime() */
314 time_adjust = txc->offset;
315 ntp_update_frequency();
316 }
317 txc->offset = save_adjust;
318 goto adj_done;
319 }
311 if (txc->modes) { 320 if (txc->modes) {
321 long sec;
322
312 if (txc->modes & ADJ_STATUS) { 323 if (txc->modes & ADJ_STATUS) {
313 if ((time_status & STA_PLL) && 324 if ((time_status & STA_PLL) &&
314 !(txc->status & STA_PLL)) { 325 !(txc->status & STA_PLL)) {
@@ -375,13 +386,8 @@ int do_adjtimex(struct timex *txc)
375 if (txc->modes & ADJ_TAI && txc->constant > 0) 386 if (txc->modes & ADJ_TAI && txc->constant > 0)
376 time_tai = txc->constant; 387 time_tai = txc->constant;
377 388
378 if (txc->modes & ADJ_OFFSET) { 389 if (txc->modes & ADJ_OFFSET)
379 if (txc->modes == ADJ_OFFSET_SINGLESHOT) 390 ntp_update_offset(txc->offset);
380 /* adjtime() is independent from ntp_adjtime() */
381 time_adjust = txc->offset;
382 else
383 ntp_update_offset(txc->offset);
384 }
385 if (txc->modes & ADJ_TICK) 391 if (txc->modes & ADJ_TICK)
386 tick_usec = txc->tick; 392 tick_usec = txc->tick;
387 393
@@ -389,22 +395,18 @@ int do_adjtimex(struct timex *txc)
389 ntp_update_frequency(); 395 ntp_update_frequency();
390 } 396 }
391 397
398 txc->offset = shift_right(time_offset * NTP_INTERVAL_FREQ,
399 NTP_SCALE_SHIFT);
400 if (!(time_status & STA_NANO))
401 txc->offset /= NSEC_PER_USEC;
402
403adj_done:
392 result = time_state; /* mostly `TIME_OK' */ 404 result = time_state; /* mostly `TIME_OK' */
393 if (time_status & (STA_UNSYNC|STA_CLOCKERR)) 405 if (time_status & (STA_UNSYNC|STA_CLOCKERR))
394 result = TIME_ERROR; 406 result = TIME_ERROR;
395 407
396 if ((txc->modes == ADJ_OFFSET_SINGLESHOT) || 408 txc->freq = shift_right((time_freq >> PPM_SCALE_INV_SHIFT) *
397 (txc->modes == ADJ_OFFSET_SS_READ)) 409 (s64)PPM_SCALE_INV, NTP_SCALE_SHIFT);
398 txc->offset = save_adjust;
399 else {
400 txc->offset = shift_right(time_offset * NTP_INTERVAL_FREQ,
401 NTP_SCALE_SHIFT);
402 if (!(time_status & STA_NANO))
403 txc->offset /= NSEC_PER_USEC;
404 }
405 txc->freq = shift_right((s32)(time_freq >> PPM_SCALE_INV_SHIFT) *
406 (s64)PPM_SCALE_INV,
407 NTP_SCALE_SHIFT);
408 txc->maxerror = time_maxerror; 410 txc->maxerror = time_maxerror;
409 txc->esterror = time_esterror; 411 txc->esterror = time_esterror;
410 txc->status = time_status; 412 txc->status = time_status;
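The ntp.c hunks above also move sync_cmos_clock() from a kernel timer to delayed work, so the callback runs in process context and may sleep. A minimal sketch of that timer-to-workqueue conversion against this kernel's API; "my_poll" and the one-second period are illustrative.

/* Sketch of the timer-to-delayed-work conversion used for
 * sync_cmos_clock() above: the work item reschedules itself with
 * schedule_delayed_work() instead of mod_timer(). */
#include <linux/module.h>
#include <linux/workqueue.h>
#include <linux/jiffies.h>

static void my_poll(struct work_struct *work);
static DECLARE_DELAYED_WORK(my_poll_work, my_poll);

static void my_poll(struct work_struct *work)
{
        /* May sleep here, unlike a timer callback. */
        printk(KERN_INFO "polling\n");
        schedule_delayed_work(&my_poll_work, HZ);       /* run again in ~1s */
}

static int __init my_init(void)
{
        schedule_delayed_work(&my_poll_work, 0);        /* kick it off now */
        return 0;
}

static void __exit my_exit(void)
{
        cancel_delayed_work_sync(&my_poll_work);
}

module_init(my_init);
module_exit(my_exit);
MODULE_LICENSE("GPL");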
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index f48d0f09d32f..f98a1b7b16e9 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -175,6 +175,8 @@ static void tick_do_periodic_broadcast(void)
175 */ 175 */
176static void tick_handle_periodic_broadcast(struct clock_event_device *dev) 176static void tick_handle_periodic_broadcast(struct clock_event_device *dev)
177{ 177{
178 ktime_t next;
179
178 tick_do_periodic_broadcast(); 180 tick_do_periodic_broadcast();
179 181
180 /* 182 /*
@@ -185,10 +187,13 @@ static void tick_handle_periodic_broadcast(struct clock_event_device *dev)
185 187
186 /* 188 /*
187 * Setup the next period for devices, which do not have 189 * Setup the next period for devices, which do not have
188 * periodic mode: 190 * periodic mode. We read dev->next_event first and add to it
191 * when the event alrady expired. clockevents_program_event()
192 * sets dev->next_event only when the event is really
193 * programmed to the device.
189 */ 194 */
190 for (;;) { 195 for (next = dev->next_event; ;) {
191 ktime_t next = ktime_add(dev->next_event, tick_period); 196 next = ktime_add(next, tick_period);
192 197
193 if (!clockevents_program_event(dev, next, ktime_get())) 198 if (!clockevents_program_event(dev, next, ktime_get()))
194 return; 199 return;
@@ -205,7 +210,7 @@ static void tick_do_broadcast_on_off(void *why)
205 struct clock_event_device *bc, *dev; 210 struct clock_event_device *bc, *dev;
206 struct tick_device *td; 211 struct tick_device *td;
207 unsigned long flags, *reason = why; 212 unsigned long flags, *reason = why;
208 int cpu; 213 int cpu, bc_stopped;
209 214
210 spin_lock_irqsave(&tick_broadcast_lock, flags); 215 spin_lock_irqsave(&tick_broadcast_lock, flags);
211 216
@@ -223,14 +228,16 @@ static void tick_do_broadcast_on_off(void *why)
223 if (!tick_device_is_functional(dev)) 228 if (!tick_device_is_functional(dev))
224 goto out; 229 goto out;
225 230
231 bc_stopped = cpus_empty(tick_broadcast_mask);
232
226 switch (*reason) { 233 switch (*reason) {
227 case CLOCK_EVT_NOTIFY_BROADCAST_ON: 234 case CLOCK_EVT_NOTIFY_BROADCAST_ON:
228 case CLOCK_EVT_NOTIFY_BROADCAST_FORCE: 235 case CLOCK_EVT_NOTIFY_BROADCAST_FORCE:
229 if (!cpu_isset(cpu, tick_broadcast_mask)) { 236 if (!cpu_isset(cpu, tick_broadcast_mask)) {
230 cpu_set(cpu, tick_broadcast_mask); 237 cpu_set(cpu, tick_broadcast_mask);
231 if (td->mode == TICKDEV_MODE_PERIODIC) 238 if (tick_broadcast_device.mode ==
232 clockevents_set_mode(dev, 239 TICKDEV_MODE_PERIODIC)
233 CLOCK_EVT_MODE_SHUTDOWN); 240 clockevents_shutdown(dev);
234 } 241 }
235 if (*reason == CLOCK_EVT_NOTIFY_BROADCAST_FORCE) 242 if (*reason == CLOCK_EVT_NOTIFY_BROADCAST_FORCE)
236 tick_broadcast_force = 1; 243 tick_broadcast_force = 1;
@@ -239,15 +246,17 @@ static void tick_do_broadcast_on_off(void *why)
239 if (!tick_broadcast_force && 246 if (!tick_broadcast_force &&
240 cpu_isset(cpu, tick_broadcast_mask)) { 247 cpu_isset(cpu, tick_broadcast_mask)) {
241 cpu_clear(cpu, tick_broadcast_mask); 248 cpu_clear(cpu, tick_broadcast_mask);
242 if (td->mode == TICKDEV_MODE_PERIODIC) 249 if (tick_broadcast_device.mode ==
250 TICKDEV_MODE_PERIODIC)
243 tick_setup_periodic(dev, 0); 251 tick_setup_periodic(dev, 0);
244 } 252 }
245 break; 253 break;
246 } 254 }
247 255
248 if (cpus_empty(tick_broadcast_mask)) 256 if (cpus_empty(tick_broadcast_mask)) {
249 clockevents_set_mode(bc, CLOCK_EVT_MODE_SHUTDOWN); 257 if (!bc_stopped)
250 else { 258 clockevents_shutdown(bc);
259 } else if (bc_stopped) {
251 if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) 260 if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC)
252 tick_broadcast_start_periodic(bc); 261 tick_broadcast_start_periodic(bc);
253 else 262 else
@@ -298,7 +307,7 @@ void tick_shutdown_broadcast(unsigned int *cpup)
298 307
299 if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) { 308 if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) {
300 if (bc && cpus_empty(tick_broadcast_mask)) 309 if (bc && cpus_empty(tick_broadcast_mask))
301 clockevents_set_mode(bc, CLOCK_EVT_MODE_SHUTDOWN); 310 clockevents_shutdown(bc);
302 } 311 }
303 312
304 spin_unlock_irqrestore(&tick_broadcast_lock, flags); 313 spin_unlock_irqrestore(&tick_broadcast_lock, flags);
@@ -313,7 +322,7 @@ void tick_suspend_broadcast(void)
313 322
314 bc = tick_broadcast_device.evtdev; 323 bc = tick_broadcast_device.evtdev;
315 if (bc) 324 if (bc)
316 clockevents_set_mode(bc, CLOCK_EVT_MODE_SHUTDOWN); 325 clockevents_shutdown(bc);
317 326
318 spin_unlock_irqrestore(&tick_broadcast_lock, flags); 327 spin_unlock_irqrestore(&tick_broadcast_lock, flags);
319} 328}
@@ -364,16 +373,8 @@ cpumask_t *tick_get_broadcast_oneshot_mask(void)
364static int tick_broadcast_set_event(ktime_t expires, int force) 373static int tick_broadcast_set_event(ktime_t expires, int force)
365{ 374{
366 struct clock_event_device *bc = tick_broadcast_device.evtdev; 375 struct clock_event_device *bc = tick_broadcast_device.evtdev;
367 ktime_t now = ktime_get(); 376
368 int res; 377 return tick_dev_program_event(bc, expires, force);
369
370 for(;;) {
371 res = clockevents_program_event(bc, expires, now);
372 if (!res || !force)
373 return res;
374 now = ktime_get();
375 expires = ktime_add(now, ktime_set(0, bc->min_delta_ns));
376 }
377} 378}
378 379
379int tick_resume_broadcast_oneshot(struct clock_event_device *bc) 380int tick_resume_broadcast_oneshot(struct clock_event_device *bc)
@@ -383,6 +384,19 @@ int tick_resume_broadcast_oneshot(struct clock_event_device *bc)
383} 384}
384 385
385/* 386/*
387 * Called from irq_enter() when idle was interrupted to reenable the
388 * per cpu device.
389 */
390void tick_check_oneshot_broadcast(int cpu)
391{
392 if (cpu_isset(cpu, tick_broadcast_oneshot_mask)) {
393 struct tick_device *td = &per_cpu(tick_cpu_device, cpu);
394
395 clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_ONESHOT);
396 }
397}
398
399/*
386 * Handle oneshot mode broadcasting 400 * Handle oneshot mode broadcasting
387 */ 401 */
388static void tick_handle_oneshot_broadcast(struct clock_event_device *dev) 402static void tick_handle_oneshot_broadcast(struct clock_event_device *dev)
@@ -399,8 +413,7 @@ again:
399 mask = CPU_MASK_NONE; 413 mask = CPU_MASK_NONE;
400 now = ktime_get(); 414 now = ktime_get();
401 /* Find all expired events */ 415 /* Find all expired events */
402 for (cpu = first_cpu(tick_broadcast_oneshot_mask); cpu != NR_CPUS; 416 for_each_cpu_mask_nr(cpu, tick_broadcast_oneshot_mask) {
403 cpu = next_cpu(cpu, tick_broadcast_oneshot_mask)) {
404 td = &per_cpu(tick_cpu_device, cpu); 417 td = &per_cpu(tick_cpu_device, cpu);
405 if (td->evtdev->next_event.tv64 <= now.tv64) 418 if (td->evtdev->next_event.tv64 <= now.tv64)
406 cpu_set(cpu, mask); 419 cpu_set(cpu, mask);
@@ -492,14 +505,52 @@ static void tick_broadcast_clear_oneshot(int cpu)
492 cpu_clear(cpu, tick_broadcast_oneshot_mask); 505 cpu_clear(cpu, tick_broadcast_oneshot_mask);
493} 506}
494 507
508static void tick_broadcast_init_next_event(cpumask_t *mask, ktime_t expires)
509{
510 struct tick_device *td;
511 int cpu;
512
513 for_each_cpu_mask_nr(cpu, *mask) {
514 td = &per_cpu(tick_cpu_device, cpu);
515 if (td->evtdev)
516 td->evtdev->next_event = expires;
517 }
518}
519
495/** 520/**
496 * tick_broadcast_setup_oneshot - setup the broadcast device 521 * tick_broadcast_setup_oneshot - setup the broadcast device
497 */ 522 */
498void tick_broadcast_setup_oneshot(struct clock_event_device *bc) 523void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
499{ 524{
500 bc->event_handler = tick_handle_oneshot_broadcast; 525 /* Set it up only once ! */
501 clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); 526 if (bc->event_handler != tick_handle_oneshot_broadcast) {
502 bc->next_event.tv64 = KTIME_MAX; 527 int was_periodic = bc->mode == CLOCK_EVT_MODE_PERIODIC;
528 int cpu = smp_processor_id();
529 cpumask_t mask;
530
531 bc->event_handler = tick_handle_oneshot_broadcast;
532 clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT);
533
534 /* Take the do_timer update */
535 tick_do_timer_cpu = cpu;
536
537 /*
538 * We must be careful here. There might be other CPUs
539 * waiting for periodic broadcast. We need to set the
540 * oneshot_mask bits for those and program the
541 * broadcast device to fire.
542 */
543 mask = tick_broadcast_mask;
544 cpu_clear(cpu, mask);
545 cpus_or(tick_broadcast_oneshot_mask,
546 tick_broadcast_oneshot_mask, mask);
547
548 if (was_periodic && !cpus_empty(mask)) {
549 tick_broadcast_init_next_event(&mask, tick_next_period);
550 tick_broadcast_set_event(tick_next_period, 1);
551 } else
552 bc->next_event.tv64 = KTIME_MAX;
553 }
503} 554}
504 555
505/* 556/*
@@ -539,4 +590,12 @@ void tick_shutdown_broadcast_oneshot(unsigned int *cpup)
539 spin_unlock_irqrestore(&tick_broadcast_lock, flags); 590 spin_unlock_irqrestore(&tick_broadcast_lock, flags);
540} 591}
541 592
593/*
594 * Check, whether the broadcast device is in one shot mode
595 */
596int tick_broadcast_oneshot_active(void)
597{
598 return tick_broadcast_device.mode == TICKDEV_MODE_ONESHOT;
599}
600
542#endif 601#endif
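tick_handle_periodic_broadcast() above now starts from a single read of dev->next_event and keeps adding tick_period until programming succeeds, because clockevents_program_event() rejects expiries in the past and only updates next_event on success. A standalone sketch of that catch-up loop with plain nanosecond counters:

/* Sketch of the catch-up loop: advance the expiry period by period
 * until it lands in the future, then arm the device once. */
#include <stdio.h>
#include <stdint.h>

static int program_event(int64_t expires, int64_t now)
{
        return expires <= now ? -1 : 0; /* past expiries are rejected */
}

static int64_t reprogram(int64_t next_event, int64_t period, int64_t now)
{
        int64_t next = next_event;

        for (;;) {
                next += period;
                if (!program_event(next, now))
                        return next;    /* successfully armed */
        }
}

int main(void)
{
        /* The device slept through three periods of 1 ms. */
        int64_t armed = reprogram(1000000, 1000000, 4200000);

        printf("armed for t=%lld ns\n", (long long)armed);      /* 5000000 */
        return 0;
}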
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 4f3886562b8c..df12434b43ca 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -33,7 +33,7 @@ DEFINE_PER_CPU(struct tick_device, tick_cpu_device);
33 */ 33 */
34ktime_t tick_next_period; 34ktime_t tick_next_period;
35ktime_t tick_period; 35ktime_t tick_period;
36int tick_do_timer_cpu __read_mostly = -1; 36int tick_do_timer_cpu __read_mostly = TICK_DO_TIMER_BOOT;
37DEFINE_SPINLOCK(tick_device_lock); 37DEFINE_SPINLOCK(tick_device_lock);
38 38
39/* 39/*
@@ -109,7 +109,8 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast)
109 if (!tick_device_is_functional(dev)) 109 if (!tick_device_is_functional(dev))
110 return; 110 return;
111 111
112 if (dev->features & CLOCK_EVT_FEAT_PERIODIC) { 112 if ((dev->features & CLOCK_EVT_FEAT_PERIODIC) &&
113 !tick_broadcast_oneshot_active()) {
113 clockevents_set_mode(dev, CLOCK_EVT_MODE_PERIODIC); 114 clockevents_set_mode(dev, CLOCK_EVT_MODE_PERIODIC);
114 } else { 115 } else {
115 unsigned long seq; 116 unsigned long seq;
@@ -135,7 +136,7 @@ void tick_setup_periodic(struct clock_event_device *dev, int broadcast)
135 */ 136 */
136static void tick_setup_device(struct tick_device *td, 137static void tick_setup_device(struct tick_device *td,
137 struct clock_event_device *newdev, int cpu, 138 struct clock_event_device *newdev, int cpu,
138 cpumask_t cpumask) 139 const cpumask_t *cpumask)
139{ 140{
140 ktime_t next_event; 141 ktime_t next_event;
141 void (*handler)(struct clock_event_device *) = NULL; 142 void (*handler)(struct clock_event_device *) = NULL;
@@ -148,7 +149,7 @@ static void tick_setup_device(struct tick_device *td,
148 * If no cpu took the do_timer update, assign it to 149 * If no cpu took the do_timer update, assign it to
149 * this cpu: 150 * this cpu:
150 */ 151 */
151 if (tick_do_timer_cpu == -1) { 152 if (tick_do_timer_cpu == TICK_DO_TIMER_BOOT) {
152 tick_do_timer_cpu = cpu; 153 tick_do_timer_cpu = cpu;
153 tick_next_period = ktime_get(); 154 tick_next_period = ktime_get();
154 tick_period = ktime_set(0, NSEC_PER_SEC / HZ); 155 tick_period = ktime_set(0, NSEC_PER_SEC / HZ);
@@ -161,6 +162,7 @@ static void tick_setup_device(struct tick_device *td,
161 } else { 162 } else {
162 handler = td->evtdev->event_handler; 163 handler = td->evtdev->event_handler;
163 next_event = td->evtdev->next_event; 164 next_event = td->evtdev->next_event;
165 td->evtdev->event_handler = clockevents_handle_noop;
164 } 166 }
165 167
166 td->evtdev = newdev; 168 td->evtdev = newdev;
@@ -169,8 +171,8 @@ static void tick_setup_device(struct tick_device *td,
169 * When the device is not per cpu, pin the interrupt to the 171 * When the device is not per cpu, pin the interrupt to the
170 * current cpu: 172 * current cpu:
171 */ 173 */
172 if (!cpus_equal(newdev->cpumask, cpumask)) 174 if (!cpus_equal(newdev->cpumask, *cpumask))
173 irq_set_affinity(newdev->irq, cpumask); 175 irq_set_affinity(newdev->irq, *cpumask);
174 176
175 /* 177 /*
176 * When global broadcasting is active, check if the current 178 * When global broadcasting is active, check if the current
@@ -196,7 +198,6 @@ static int tick_check_new_device(struct clock_event_device *newdev)
196 struct tick_device *td; 198 struct tick_device *td;
197 int cpu, ret = NOTIFY_OK; 199 int cpu, ret = NOTIFY_OK;
198 unsigned long flags; 200 unsigned long flags;
199 cpumask_t cpumask;
200 201
201 spin_lock_irqsave(&tick_device_lock, flags); 202 spin_lock_irqsave(&tick_device_lock, flags);
202 203
@@ -206,10 +207,9 @@ static int tick_check_new_device(struct clock_event_device *newdev)
206 207
207 td = &per_cpu(tick_cpu_device, cpu); 208 td = &per_cpu(tick_cpu_device, cpu);
208 curdev = td->evtdev; 209 curdev = td->evtdev;
209 cpumask = cpumask_of_cpu(cpu);
210 210
211 /* cpu local device ? */ 211 /* cpu local device ? */
212 if (!cpus_equal(newdev->cpumask, cpumask)) { 212 if (!cpus_equal(newdev->cpumask, cpumask_of_cpu(cpu))) {
213 213
214 /* 214 /*
215 * If the cpu affinity of the device interrupt can not 215 * If the cpu affinity of the device interrupt can not
@@ -222,7 +222,7 @@ static int tick_check_new_device(struct clock_event_device *newdev)
222 * If we have a cpu local device already, do not replace it 222 * If we have a cpu local device already, do not replace it
223 * by a non cpu local device 223 * by a non cpu local device
224 */ 224 */
225 if (curdev && cpus_equal(curdev->cpumask, cpumask)) 225 if (curdev && cpus_equal(curdev->cpumask, cpumask_of_cpu(cpu)))
226 goto out_bc; 226 goto out_bc;
227 } 227 }
228 228
@@ -250,11 +250,11 @@ static int tick_check_new_device(struct clock_event_device *newdev)
250 * not give it back to the clockevents layer ! 250 * not give it back to the clockevents layer !
251 */ 251 */
252 if (tick_is_broadcast_device(curdev)) { 252 if (tick_is_broadcast_device(curdev)) {
253 clockevents_set_mode(curdev, CLOCK_EVT_MODE_SHUTDOWN); 253 clockevents_shutdown(curdev);
254 curdev = NULL; 254 curdev = NULL;
255 } 255 }
256 clockevents_exchange_device(curdev, newdev); 256 clockevents_exchange_device(curdev, newdev);
257 tick_setup_device(td, newdev, cpu, cpumask); 257 tick_setup_device(td, newdev, cpu, &cpumask_of_cpu(cpu));
258 if (newdev->features & CLOCK_EVT_FEAT_ONESHOT) 258 if (newdev->features & CLOCK_EVT_FEAT_ONESHOT)
259 tick_oneshot_notify(); 259 tick_oneshot_notify();
260 260
@@ -301,7 +301,8 @@ static void tick_shutdown(unsigned int *cpup)
301 if (*cpup == tick_do_timer_cpu) { 301 if (*cpup == tick_do_timer_cpu) {
302 int cpu = first_cpu(cpu_online_map); 302 int cpu = first_cpu(cpu_online_map);
303 303
304 tick_do_timer_cpu = (cpu != NR_CPUS) ? cpu : -1; 304 tick_do_timer_cpu = (cpu != NR_CPUS) ? cpu :
305 TICK_DO_TIMER_NONE;
305 } 306 }
306 spin_unlock_irqrestore(&tick_device_lock, flags); 307 spin_unlock_irqrestore(&tick_device_lock, flags);
307} 308}
@@ -312,7 +313,7 @@ static void tick_suspend(void)
312 unsigned long flags; 313 unsigned long flags;
313 314
314 spin_lock_irqsave(&tick_device_lock, flags); 315 spin_lock_irqsave(&tick_device_lock, flags);
315 clockevents_set_mode(td->evtdev, CLOCK_EVT_MODE_SHUTDOWN); 316 clockevents_shutdown(td->evtdev);
316 spin_unlock_irqrestore(&tick_device_lock, flags); 317 spin_unlock_irqrestore(&tick_device_lock, flags);
317} 318}
318 319
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index f13f2b7f4fd4..b1c05bf75ee0 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -1,6 +1,10 @@
1/* 1/*
2 * tick internal variable and functions used by low/high res code 2 * tick internal variable and functions used by low/high res code
3 */ 3 */
4
5#define TICK_DO_TIMER_NONE -1
6#define TICK_DO_TIMER_BOOT -2
7
4DECLARE_PER_CPU(struct tick_device, tick_cpu_device); 8DECLARE_PER_CPU(struct tick_device, tick_cpu_device);
5extern spinlock_t tick_device_lock; 9extern spinlock_t tick_device_lock;
6extern ktime_t tick_next_period; 10extern ktime_t tick_next_period;
@@ -10,6 +14,8 @@ extern int tick_do_timer_cpu __read_mostly;
10extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast); 14extern void tick_setup_periodic(struct clock_event_device *dev, int broadcast);
11extern void tick_handle_periodic(struct clock_event_device *dev); 15extern void tick_handle_periodic(struct clock_event_device *dev);
12 16
17extern void clockevents_shutdown(struct clock_event_device *dev);
18
13/* 19/*
14 * NO_HZ / high resolution timer shared code 20 * NO_HZ / high resolution timer shared code
15 */ 21 */
@@ -17,6 +23,8 @@ extern void tick_handle_periodic(struct clock_event_device *dev);
17extern void tick_setup_oneshot(struct clock_event_device *newdev, 23extern void tick_setup_oneshot(struct clock_event_device *newdev,
18 void (*handler)(struct clock_event_device *), 24 void (*handler)(struct clock_event_device *),
19 ktime_t nextevt); 25 ktime_t nextevt);
26extern int tick_dev_program_event(struct clock_event_device *dev,
27 ktime_t expires, int force);
20extern int tick_program_event(ktime_t expires, int force); 28extern int tick_program_event(ktime_t expires, int force);
21extern void tick_oneshot_notify(void); 29extern void tick_oneshot_notify(void);
22extern int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *)); 30extern int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *));
@@ -27,6 +35,8 @@ extern void tick_broadcast_oneshot_control(unsigned long reason);
27extern void tick_broadcast_switch_to_oneshot(void); 35extern void tick_broadcast_switch_to_oneshot(void);
28extern void tick_shutdown_broadcast_oneshot(unsigned int *cpup); 36extern void tick_shutdown_broadcast_oneshot(unsigned int *cpup);
29extern int tick_resume_broadcast_oneshot(struct clock_event_device *bc); 37extern int tick_resume_broadcast_oneshot(struct clock_event_device *bc);
38extern int tick_broadcast_oneshot_active(void);
39extern void tick_check_oneshot_broadcast(int cpu);
30# else /* BROADCAST */ 40# else /* BROADCAST */
31static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) 41static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
32{ 42{
@@ -35,6 +45,8 @@ static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
35static inline void tick_broadcast_oneshot_control(unsigned long reason) { } 45static inline void tick_broadcast_oneshot_control(unsigned long reason) { }
36static inline void tick_broadcast_switch_to_oneshot(void) { } 46static inline void tick_broadcast_switch_to_oneshot(void) { }
37static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { } 47static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { }
48static inline int tick_broadcast_oneshot_active(void) { return 0; }
49static inline void tick_check_oneshot_broadcast(int cpu) { }
38# endif /* !BROADCAST */ 50# endif /* !BROADCAST */
39 51
40#else /* !ONESHOT */ 52#else /* !ONESHOT */
@@ -64,6 +76,7 @@ static inline int tick_resume_broadcast_oneshot(struct clock_event_device *bc)
64{ 76{
65 return 0; 77 return 0;
66} 78}
79static inline int tick_broadcast_oneshot_active(void) { return 0; }
67#endif /* !TICK_ONESHOT */ 80#endif /* !TICK_ONESHOT */
68 81
69/* 82/*
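The hunks above replace the bare -1 duty marker with the named TICK_DO_TIMER_NONE / TICK_DO_TIMER_BOOT sentinels defined in tick-internal.h. The following standalone C sketch is illustrative only and not part of the commit (the simulated handler and main() are invented for the example); it shows the handoff pattern the later tick-sched.c hunks rely on: whichever CPU next runs its tick handler while the duty is unassigned picks up the jiffies update.

#include <stdio.h>

#define TICK_DO_TIMER_NONE	-1
#define TICK_DO_TIMER_BOOT	-2

static int tick_do_timer_cpu = TICK_DO_TIMER_BOOT;

static void tick_handler(int cpu)
{
	/* duty unassigned: the first CPU to get here takes it over */
	if (tick_do_timer_cpu == TICK_DO_TIMER_NONE)
		tick_do_timer_cpu = cpu;

	if (tick_do_timer_cpu == cpu)
		printf("cpu%d updates jiffies\n", cpu);
}

int main(void)
{
	tick_do_timer_cpu = 0;			/* boot CPU held the duty */
	tick_do_timer_cpu = TICK_DO_TIMER_NONE;	/* it went idle/offline */

	tick_handler(1);	/* prints: cpu1 updates jiffies */
	tick_handler(2);	/* silent: CPU 1 already owns the duty */
	return 0;
}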
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c
index 450c04935b66..2e8de678e767 100644
--- a/kernel/time/tick-oneshot.c
+++ b/kernel/time/tick-oneshot.c
@@ -23,24 +23,56 @@
23#include "tick-internal.h" 23#include "tick-internal.h"
24 24
25/** 25/**
26 * tick_program_event 26 * tick_program_event internal worker function
27 */ 27 */
28int tick_program_event(ktime_t expires, int force) 28int tick_dev_program_event(struct clock_event_device *dev, ktime_t expires,
29 int force)
29{ 30{
30 struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev;
31 ktime_t now = ktime_get(); 31 ktime_t now = ktime_get();
32 int i;
32 33
33 while (1) { 34 for (i = 0;;) {
34 int ret = clockevents_program_event(dev, expires, now); 35 int ret = clockevents_program_event(dev, expires, now);
35 36
36 if (!ret || !force) 37 if (!ret || !force)
37 return ret; 38 return ret;
39
40 /*
41 * We tried 2 times to program the device with the given
42 * min_delta_ns. If that's not working then we double it
43 * and emit a warning.
44 */
45 if (++i > 2) {
46 /* Increase the min. delta and try again */
47 if (!dev->min_delta_ns)
48 dev->min_delta_ns = 5000;
49 else
50 dev->min_delta_ns += dev->min_delta_ns >> 1;
51
52 printk(KERN_WARNING
53 "CE: %s increasing min_delta_ns to %lu nsec\n",
54 dev->name ? dev->name : "?",
55 dev->min_delta_ns << 1);
56
57 i = 0;
58 }
59
38 now = ktime_get(); 60 now = ktime_get();
39 expires = ktime_add(now, ktime_set(0, dev->min_delta_ns)); 61 expires = ktime_add_ns(now, dev->min_delta_ns);
40 } 62 }
41} 63}
42 64
43/** 65/**
66 * tick_program_event
67 */
68int tick_program_event(ktime_t expires, int force)
69{
70 struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev;
71
72 return tick_dev_program_event(dev, expires, force);
73}
74
75/**
 44 * tick_resume_oneshot - resume oneshot mode 76 * tick_resume_oneshot - resume oneshot mode
45 */ 77 */
46void tick_resume_oneshot(void) 78void tick_resume_oneshot(void)
@@ -61,7 +93,7 @@ void tick_setup_oneshot(struct clock_event_device *newdev,
61{ 93{
62 newdev->event_handler = handler; 94 newdev->event_handler = handler;
63 clockevents_set_mode(newdev, CLOCK_EVT_MODE_ONESHOT); 95 clockevents_set_mode(newdev, CLOCK_EVT_MODE_ONESHOT);
64 clockevents_program_event(newdev, next_event, ktime_get()); 96 tick_dev_program_event(newdev, next_event, 1);
65} 97}
66 98
67/** 99/**
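For reference, here is a small userspace simulation of the retry policy that tick_dev_program_event() gains above; it is an illustrative sketch, not part of the commit, and the failure loop is invented. After two failed programming attempts with the current min_delta_ns, the minimum is seeded with 5000 ns or raised by half (min_delta_ns += min_delta_ns >> 1), a warning is printed and the attempt counter resets.

#include <stdio.h>

int main(void)
{
	unsigned long min_delta_ns = 0;
	int i = 0, attempts;

	/* pretend the first 8 programming attempts fail */
	for (attempts = 0; attempts < 8; attempts++) {
		if (++i > 2) {
			/* increase the minimum delta and try again */
			if (!min_delta_ns)
				min_delta_ns = 5000;
			else
				min_delta_ns += min_delta_ns >> 1;

			printf("CE: dummy increasing min_delta_ns to %lu nsec\n",
			       min_delta_ns);
			i = 0;
		}
	}
	return 0;
}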
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index beef7ccdf842..5bbb1044f847 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -20,6 +20,7 @@
20#include <linux/profile.h> 20#include <linux/profile.h>
21#include <linux/sched.h> 21#include <linux/sched.h>
22#include <linux/tick.h> 22#include <linux/tick.h>
23#include <linux/module.h>
23 24
24#include <asm/irq_regs.h> 25#include <asm/irq_regs.h>
25 26
@@ -75,6 +76,9 @@ static void tick_do_update_jiffies64(ktime_t now)
75 incr * ticks); 76 incr * ticks);
76 } 77 }
77 do_timer(++ticks); 78 do_timer(++ticks);
79
80 /* Keep the tick_next_period variable up to date */
81 tick_next_period = ktime_add(last_jiffies_update, tick_period);
78 } 82 }
79 write_sequnlock(&xtime_lock); 83 write_sequnlock(&xtime_lock);
80} 84}
@@ -140,8 +144,6 @@ void tick_nohz_update_jiffies(void)
140 if (!ts->tick_stopped) 144 if (!ts->tick_stopped)
141 return; 145 return;
142 146
143 touch_softlockup_watchdog();
144
145 cpu_clear(cpu, nohz_cpu_mask); 147 cpu_clear(cpu, nohz_cpu_mask);
146 now = ktime_get(); 148 now = ktime_get();
147 ts->idle_waketime = now; 149 ts->idle_waketime = now;
@@ -149,9 +151,11 @@ void tick_nohz_update_jiffies(void)
149 local_irq_save(flags); 151 local_irq_save(flags);
150 tick_do_update_jiffies64(now); 152 tick_do_update_jiffies64(now);
151 local_irq_restore(flags); 153 local_irq_restore(flags);
154
155 touch_softlockup_watchdog();
152} 156}
153 157
154void tick_nohz_stop_idle(int cpu) 158static void tick_nohz_stop_idle(int cpu)
155{ 159{
156 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); 160 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
157 161
@@ -162,6 +166,8 @@ void tick_nohz_stop_idle(int cpu)
162 ts->idle_lastupdate = now; 166 ts->idle_lastupdate = now;
163 ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta); 167 ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta);
164 ts->idle_active = 0; 168 ts->idle_active = 0;
169
170 sched_clock_idle_wakeup_event(0);
165 } 171 }
166} 172}
167 173
@@ -177,6 +183,7 @@ static ktime_t tick_nohz_start_idle(struct tick_sched *ts)
177 } 183 }
178 ts->idle_entrytime = now; 184 ts->idle_entrytime = now;
179 ts->idle_active = 1; 185 ts->idle_active = 1;
186 sched_clock_idle_sleep_event();
180 return now; 187 return now;
181} 188}
182 189
@@ -184,9 +191,17 @@ u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
184{ 191{
185 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); 192 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
186 193
187 *last_update_time = ktime_to_us(ts->idle_lastupdate); 194 if (!tick_nohz_enabled)
195 return -1;
196
197 if (ts->idle_active)
198 *last_update_time = ktime_to_us(ts->idle_lastupdate);
199 else
200 *last_update_time = ktime_to_us(ktime_get());
201
188 return ktime_to_us(ts->idle_sleeptime); 202 return ktime_to_us(ts->idle_sleeptime);
189} 203}
204EXPORT_SYMBOL_GPL(get_cpu_idle_time_us);
190 205
191/** 206/**
192 * tick_nohz_stop_sched_tick - stop the idle tick from the idle task 207 * tick_nohz_stop_sched_tick - stop the idle tick from the idle task
@@ -195,7 +210,7 @@ u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time)
195 * Called either from the idle loop or from irq_exit() when an idle period was 210 * Called either from the idle loop or from irq_exit() when an idle period was
196 * just interrupted by an interrupt which did not cause a reschedule. 211 * just interrupted by an interrupt which did not cause a reschedule.
197 */ 212 */
198void tick_nohz_stop_sched_tick(void) 213void tick_nohz_stop_sched_tick(int inidle)
199{ 214{
200 unsigned long seq, last_jiffies, next_jiffies, delta_jiffies, flags; 215 unsigned long seq, last_jiffies, next_jiffies, delta_jiffies, flags;
201 struct tick_sched *ts; 216 struct tick_sched *ts;
@@ -218,12 +233,17 @@ void tick_nohz_stop_sched_tick(void)
218 */ 233 */
219 if (unlikely(!cpu_online(cpu))) { 234 if (unlikely(!cpu_online(cpu))) {
220 if (cpu == tick_do_timer_cpu) 235 if (cpu == tick_do_timer_cpu)
221 tick_do_timer_cpu = -1; 236 tick_do_timer_cpu = TICK_DO_TIMER_NONE;
222 } 237 }
223 238
224 if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE)) 239 if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE))
225 goto end; 240 goto end;
226 241
242 if (!inidle && !ts->inidle)
243 goto end;
244
245 ts->inidle = 1;
246
227 if (need_resched()) 247 if (need_resched())
228 goto end; 248 goto end;
229 249
@@ -250,7 +270,7 @@ void tick_nohz_stop_sched_tick(void)
250 next_jiffies = get_next_timer_interrupt(last_jiffies); 270 next_jiffies = get_next_timer_interrupt(last_jiffies);
251 delta_jiffies = next_jiffies - last_jiffies; 271 delta_jiffies = next_jiffies - last_jiffies;
252 272
253 if (rcu_needs_cpu(cpu)) 273 if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu))
254 delta_jiffies = 1; 274 delta_jiffies = 1;
255 /* 275 /*
256 * Do not stop the tick, if we are only one off 276 * Do not stop the tick, if we are only one off
@@ -280,11 +300,10 @@ void tick_nohz_stop_sched_tick(void)
280 goto out; 300 goto out;
281 } 301 }
282 302
283 ts->idle_tick = ts->sched_timer.expires; 303 ts->idle_tick = hrtimer_get_expires(&ts->sched_timer);
284 ts->tick_stopped = 1; 304 ts->tick_stopped = 1;
285 ts->idle_jiffies = last_jiffies; 305 ts->idle_jiffies = last_jiffies;
286 rcu_enter_nohz(); 306 rcu_enter_nohz();
287 sched_clock_tick_stop(cpu);
288 } 307 }
289 308
290 /* 309 /*
@@ -296,7 +315,7 @@ void tick_nohz_stop_sched_tick(void)
296 * invoked. 315 * invoked.
297 */ 316 */
298 if (cpu == tick_do_timer_cpu) 317 if (cpu == tick_do_timer_cpu)
299 tick_do_timer_cpu = -1; 318 tick_do_timer_cpu = TICK_DO_TIMER_NONE;
300 319
301 ts->idle_sleeps++; 320 ts->idle_sleeps++;
302 321
@@ -358,6 +377,32 @@ ktime_t tick_nohz_get_sleep_length(void)
358 return ts->sleep_length; 377 return ts->sleep_length;
359} 378}
360 379
380static void tick_nohz_restart(struct tick_sched *ts, ktime_t now)
381{
382 hrtimer_cancel(&ts->sched_timer);
383 hrtimer_set_expires(&ts->sched_timer, ts->idle_tick);
384
385 while (1) {
386 /* Forward the time to expire in the future */
387 hrtimer_forward(&ts->sched_timer, now, tick_period);
388
389 if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
390 hrtimer_start_expires(&ts->sched_timer,
391 HRTIMER_MODE_ABS);
392 /* Check, if the timer was already in the past */
393 if (hrtimer_active(&ts->sched_timer))
394 break;
395 } else {
396 if (!tick_program_event(
397 hrtimer_get_expires(&ts->sched_timer), 0))
398 break;
399 }
400 /* Update jiffies and reread time */
401 tick_do_update_jiffies64(now);
402 now = ktime_get();
403 }
404}
405
361/** 406/**
362 * tick_nohz_restart_sched_tick - restart the idle tick from the idle task 407 * tick_nohz_restart_sched_tick - restart the idle tick from the idle task
363 * 408 *
@@ -373,18 +418,20 @@ void tick_nohz_restart_sched_tick(void)
373 local_irq_disable(); 418 local_irq_disable();
374 tick_nohz_stop_idle(cpu); 419 tick_nohz_stop_idle(cpu);
375 420
376 if (!ts->tick_stopped) { 421 if (!ts->inidle || !ts->tick_stopped) {
422 ts->inidle = 0;
377 local_irq_enable(); 423 local_irq_enable();
378 return; 424 return;
379 } 425 }
380 426
427 ts->inidle = 0;
428
381 rcu_exit_nohz(); 429 rcu_exit_nohz();
382 430
383 /* Update jiffies first */ 431 /* Update jiffies first */
384 select_nohz_load_balancer(0); 432 select_nohz_load_balancer(0);
385 now = ktime_get(); 433 now = ktime_get();
386 tick_do_update_jiffies64(now); 434 tick_do_update_jiffies64(now);
387 sched_clock_tick_start(cpu);
388 cpu_clear(cpu, nohz_cpu_mask); 435 cpu_clear(cpu, nohz_cpu_mask);
389 436
390 /* 437 /*
@@ -409,35 +456,16 @@ void tick_nohz_restart_sched_tick(void)
409 */ 456 */
410 ts->tick_stopped = 0; 457 ts->tick_stopped = 0;
411 ts->idle_exittime = now; 458 ts->idle_exittime = now;
412 hrtimer_cancel(&ts->sched_timer);
413 ts->sched_timer.expires = ts->idle_tick;
414 459
415 while (1) { 460 tick_nohz_restart(ts, now);
416 /* Forward the time to expire in the future */
417 hrtimer_forward(&ts->sched_timer, now, tick_period);
418 461
419 if (ts->nohz_mode == NOHZ_MODE_HIGHRES) {
420 hrtimer_start(&ts->sched_timer,
421 ts->sched_timer.expires,
422 HRTIMER_MODE_ABS);
423 /* Check, if the timer was already in the past */
424 if (hrtimer_active(&ts->sched_timer))
425 break;
426 } else {
427 if (!tick_program_event(ts->sched_timer.expires, 0))
428 break;
429 }
430 /* Update jiffies and reread time */
431 tick_do_update_jiffies64(now);
432 now = ktime_get();
433 }
434 local_irq_enable(); 462 local_irq_enable();
435} 463}
436 464
437static int tick_nohz_reprogram(struct tick_sched *ts, ktime_t now) 465static int tick_nohz_reprogram(struct tick_sched *ts, ktime_t now)
438{ 466{
439 hrtimer_forward(&ts->sched_timer, now, tick_period); 467 hrtimer_forward(&ts->sched_timer, now, tick_period);
440 return tick_program_event(ts->sched_timer.expires, 0); 468 return tick_program_event(hrtimer_get_expires(&ts->sched_timer), 0);
441} 469}
442 470
443/* 471/*
@@ -459,7 +487,7 @@ static void tick_nohz_handler(struct clock_event_device *dev)
459 * this duty, then the jiffies update is still serialized by 487 * this duty, then the jiffies update is still serialized by
460 * xtime_lock. 488 * xtime_lock.
461 */ 489 */
462 if (unlikely(tick_do_timer_cpu == -1)) 490 if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE))
463 tick_do_timer_cpu = cpu; 491 tick_do_timer_cpu = cpu;
464 492
465 /* Check, if the jiffies need an update */ 493 /* Check, if the jiffies need an update */
@@ -482,10 +510,6 @@ static void tick_nohz_handler(struct clock_event_device *dev)
482 update_process_times(user_mode(regs)); 510 update_process_times(user_mode(regs));
483 profile_tick(CPU_PROFILING); 511 profile_tick(CPU_PROFILING);
484 512
485 /* Do not restart, when we are in the idle loop */
486 if (ts->tick_stopped)
487 return;
488
489 while (tick_nohz_reprogram(ts, now)) { 513 while (tick_nohz_reprogram(ts, now)) {
490 now = ktime_get(); 514 now = ktime_get();
491 tick_do_update_jiffies64(now); 515 tick_do_update_jiffies64(now);
@@ -520,7 +544,7 @@ static void tick_nohz_switch_to_nohz(void)
520 next = tick_init_jiffy_update(); 544 next = tick_init_jiffy_update();
521 545
522 for (;;) { 546 for (;;) {
523 ts->sched_timer.expires = next; 547 hrtimer_set_expires(&ts->sched_timer, next);
524 if (!tick_program_event(next, 0)) 548 if (!tick_program_event(next, 0))
525 break; 549 break;
526 next = ktime_add(next, tick_period); 550 next = ktime_add(next, tick_period);
@@ -531,6 +555,37 @@ static void tick_nohz_switch_to_nohz(void)
531 smp_processor_id()); 555 smp_processor_id());
532} 556}
533 557
558/*
559 * When NOHZ is enabled and the tick is stopped, we need to kick the
560 * tick timer from irq_enter() so that the jiffies update is kept
561 * alive during long running softirqs. That's ugly as hell, but
562 * correctness is key even if we need to fix the offending softirq in
563 * the first place.
564 *
565 * Note, this is different to tick_nohz_restart. We just kick the
566 * timer and do not touch the other magic bits which need to be done
567 * when idle is left.
568 */
569static void tick_nohz_kick_tick(int cpu)
570{
571 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
572 ktime_t delta, now;
573
574 if (!ts->tick_stopped)
575 return;
576
577 /*
578 * Do not touch the tick device, when the next expiry is either
579 * already reached or less/equal than the tick period.
580 */
581 now = ktime_get();
582 delta = ktime_sub(hrtimer_get_expires(&ts->sched_timer), now);
583 if (delta.tv64 <= tick_period.tv64)
584 return;
585
586 tick_nohz_restart(ts, now);
587}
588
534#else 589#else
535 590
536static inline void tick_nohz_switch_to_nohz(void) { } 591static inline void tick_nohz_switch_to_nohz(void) { }
@@ -538,6 +593,19 @@ static inline void tick_nohz_switch_to_nohz(void) { }
538#endif /* NO_HZ */ 593#endif /* NO_HZ */
539 594
540/* 595/*
596 * Called from irq_enter to notify about the possible interruption of idle()
597 */
598void tick_check_idle(int cpu)
599{
600 tick_check_oneshot_broadcast(cpu);
601#ifdef CONFIG_NO_HZ
602 tick_nohz_stop_idle(cpu);
603 tick_nohz_update_jiffies();
604 tick_nohz_kick_tick(cpu);
605#endif
606}
607
608/*
541 * High resolution timer specific code 609 * High resolution timer specific code
542 */ 610 */
543#ifdef CONFIG_HIGH_RES_TIMERS 611#ifdef CONFIG_HIGH_RES_TIMERS
@@ -561,7 +629,7 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
561 * this duty, then the jiffies update is still serialized by 629 * this duty, then the jiffies update is still serialized by
562 * xtime_lock. 630 * xtime_lock.
563 */ 631 */
564 if (unlikely(tick_do_timer_cpu == -1)) 632 if (unlikely(tick_do_timer_cpu == TICK_DO_TIMER_NONE))
565 tick_do_timer_cpu = cpu; 633 tick_do_timer_cpu = cpu;
566#endif 634#endif
567 635
@@ -590,10 +658,6 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
590 profile_tick(CPU_PROFILING); 658 profile_tick(CPU_PROFILING);
591 } 659 }
592 660
593 /* Do not restart, when we are in the idle loop */
594 if (ts->tick_stopped)
595 return HRTIMER_NORESTART;
596
597 hrtimer_forward(timer, now, tick_period); 661 hrtimer_forward(timer, now, tick_period);
598 662
599 return HRTIMER_RESTART; 663 return HRTIMER_RESTART;
@@ -613,19 +677,18 @@ void tick_setup_sched_timer(void)
613 */ 677 */
614 hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); 678 hrtimer_init(&ts->sched_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
615 ts->sched_timer.function = tick_sched_timer; 679 ts->sched_timer.function = tick_sched_timer;
616 ts->sched_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; 680 ts->sched_timer.cb_mode = HRTIMER_CB_IRQSAFE_PERCPU;
617 681
618 /* Get the next period (per cpu) */ 682 /* Get the next period (per cpu) */
619 ts->sched_timer.expires = tick_init_jiffy_update(); 683 hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update());
620 offset = ktime_to_ns(tick_period) >> 1; 684 offset = ktime_to_ns(tick_period) >> 1;
621 do_div(offset, num_possible_cpus()); 685 do_div(offset, num_possible_cpus());
622 offset *= smp_processor_id(); 686 offset *= smp_processor_id();
623 ts->sched_timer.expires = ktime_add_ns(ts->sched_timer.expires, offset); 687 hrtimer_add_expires_ns(&ts->sched_timer, offset);
624 688
625 for (;;) { 689 for (;;) {
626 hrtimer_forward(&ts->sched_timer, now, tick_period); 690 hrtimer_forward(&ts->sched_timer, now, tick_period);
627 hrtimer_start(&ts->sched_timer, ts->sched_timer.expires, 691 hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS);
628 HRTIMER_MODE_ABS);
629 /* Check, if the timer was already in the past */ 692 /* Check, if the timer was already in the past */
630 if (hrtimer_active(&ts->sched_timer)) 693 if (hrtimer_active(&ts->sched_timer))
631 break; 694 break;
@@ -637,17 +700,21 @@ void tick_setup_sched_timer(void)
637 ts->nohz_mode = NOHZ_MODE_HIGHRES; 700 ts->nohz_mode = NOHZ_MODE_HIGHRES;
638#endif 701#endif
639} 702}
703#endif /* HIGH_RES_TIMERS */
640 704
705#if defined CONFIG_NO_HZ || defined CONFIG_HIGH_RES_TIMERS
641void tick_cancel_sched_timer(int cpu) 706void tick_cancel_sched_timer(int cpu)
642{ 707{
643 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); 708 struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
644 709
710# ifdef CONFIG_HIGH_RES_TIMERS
645 if (ts->sched_timer.base) 711 if (ts->sched_timer.base)
646 hrtimer_cancel(&ts->sched_timer); 712 hrtimer_cancel(&ts->sched_timer);
713# endif
647 714
648 ts->nohz_mode = NOHZ_MODE_INACTIVE; 715 ts->nohz_mode = NOHZ_MODE_INACTIVE;
649} 716}
650#endif /* HIGH_RES_TIMERS */ 717#endif
651 718
652/** 719/**
653 * Async notification about clocksource changes 720 * Async notification about clocksource changes
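A minimal sketch of the guard that tick_nohz_kick_tick() introduces above, using plain 64-bit nanoseconds instead of ktime_t; this is illustrative only and not part of the commit. The stopped tick is restarted from irq_enter() only when the programmed expiry is more than one tick period away, so short interruptions leave the tick device untouched.

#include <stdio.h>
#include <stdint.h>

static int need_kick(int64_t expires_ns, int64_t now_ns,
		     int64_t tick_period_ns, int tick_stopped)
{
	if (!tick_stopped)
		return 0;

	/* next expiry already reached or within one tick: leave it alone */
	if (expires_ns - now_ns <= tick_period_ns)
		return 0;

	return 1;	/* here the kernel calls tick_nohz_restart() */
}

int main(void)
{
	const int64_t period = 1000000;	/* 1 ms tick, HZ=1000 */

	printf("%d\n", need_kick(5 * period, 0, period, 1));	/* 1: kick */
	printf("%d\n", need_kick(period / 2, 0, period, 1));	/* 0: fires soon anyway */
	printf("%d\n", need_kick(5 * period, 0, period, 0));	/* 0: tick still running */
	return 0;
}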
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index e91c29f961c9..e7acfb482a68 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -58,27 +58,26 @@ struct clocksource *clock;
58 58
59#ifdef CONFIG_GENERIC_TIME 59#ifdef CONFIG_GENERIC_TIME
60/** 60/**
61 * __get_nsec_offset - Returns nanoseconds since last call to periodic_hook 61 * clocksource_forward_now - update clock to the current time
62 * 62 *
63 * private function, must hold xtime_lock lock when being 63 * Forward the current clock to update its state since the last call to
64 * called. Returns the number of nanoseconds since the 64 * update_wall_time(). This is useful before significant clock changes,
65 * last call to update_wall_time() (adjusted by NTP scaling) 65 * as it avoids having to deal with this time offset explicitly.
66 */ 66 */
67static inline s64 __get_nsec_offset(void) 67static void clocksource_forward_now(void)
68{ 68{
69 cycle_t cycle_now, cycle_delta; 69 cycle_t cycle_now, cycle_delta;
70 s64 ns_offset; 70 s64 nsec;
71 71
72 /* read clocksource: */
73 cycle_now = clocksource_read(clock); 72 cycle_now = clocksource_read(clock);
74
75 /* calculate the delta since the last update_wall_time: */
76 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; 73 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
74 clock->cycle_last = cycle_now;
77 75
78 /* convert to nanoseconds: */ 76 nsec = cyc2ns(clock, cycle_delta);
79 ns_offset = cyc2ns(clock, cycle_delta); 77 timespec_add_ns(&xtime, nsec);
80 78
81 return ns_offset; 79 nsec = ((s64)cycle_delta * clock->mult_orig) >> clock->shift;
80 clock->raw_time.tv_nsec += nsec;
82} 81}
83 82
84/** 83/**
@@ -89,6 +88,7 @@ static inline s64 __get_nsec_offset(void)
89 */ 88 */
90void getnstimeofday(struct timespec *ts) 89void getnstimeofday(struct timespec *ts)
91{ 90{
91 cycle_t cycle_now, cycle_delta;
92 unsigned long seq; 92 unsigned long seq;
93 s64 nsecs; 93 s64 nsecs;
94 94
@@ -96,7 +96,15 @@ void getnstimeofday(struct timespec *ts)
96 seq = read_seqbegin(&xtime_lock); 96 seq = read_seqbegin(&xtime_lock);
97 97
98 *ts = xtime; 98 *ts = xtime;
99 nsecs = __get_nsec_offset(); 99
100 /* read clocksource: */
101 cycle_now = clocksource_read(clock);
102
103 /* calculate the delta since the last update_wall_time: */
104 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
105
106 /* convert to nanoseconds: */
107 nsecs = cyc2ns(clock, cycle_delta);
100 108
101 } while (read_seqretry(&xtime_lock, seq)); 109 } while (read_seqretry(&xtime_lock, seq));
102 110
@@ -129,22 +137,22 @@ EXPORT_SYMBOL(do_gettimeofday);
129 */ 137 */
130int do_settimeofday(struct timespec *tv) 138int do_settimeofday(struct timespec *tv)
131{ 139{
140 struct timespec ts_delta;
132 unsigned long flags; 141 unsigned long flags;
133 time_t wtm_sec, sec = tv->tv_sec;
134 long wtm_nsec, nsec = tv->tv_nsec;
135 142
136 if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) 143 if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC)
137 return -EINVAL; 144 return -EINVAL;
138 145
139 write_seqlock_irqsave(&xtime_lock, flags); 146 write_seqlock_irqsave(&xtime_lock, flags);
140 147
141 nsec -= __get_nsec_offset(); 148 clocksource_forward_now();
149
150 ts_delta.tv_sec = tv->tv_sec - xtime.tv_sec;
151 ts_delta.tv_nsec = tv->tv_nsec - xtime.tv_nsec;
152 wall_to_monotonic = timespec_sub(wall_to_monotonic, ts_delta);
142 153
143 wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec); 154 xtime = *tv;
144 wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec);
145 155
146 set_normalized_timespec(&xtime, sec, nsec);
147 set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec);
148 update_xtime_cache(0); 156 update_xtime_cache(0);
149 157
150 clock->error = 0; 158 clock->error = 0;
@@ -170,22 +178,19 @@ EXPORT_SYMBOL(do_settimeofday);
170static void change_clocksource(void) 178static void change_clocksource(void)
171{ 179{
172 struct clocksource *new; 180 struct clocksource *new;
173 cycle_t now;
174 u64 nsec;
175 181
176 new = clocksource_get_next(); 182 new = clocksource_get_next();
177 183
178 if (clock == new) 184 if (clock == new)
179 return; 185 return;
180 186
181 new->cycle_last = 0; 187 clocksource_forward_now();
182 now = clocksource_read(new);
183 nsec = __get_nsec_offset();
184 timespec_add_ns(&xtime, nsec);
185 188
186 clock = new; 189 new->raw_time = clock->raw_time;
187 clock->cycle_last = now;
188 190
191 clock = new;
192 clock->cycle_last = 0;
193 clock->cycle_last = clocksource_read(new);
189 clock->error = 0; 194 clock->error = 0;
190 clock->xtime_nsec = 0; 195 clock->xtime_nsec = 0;
191 clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH); 196 clocksource_calculate_interval(clock, NTP_INTERVAL_LENGTH);
@@ -200,11 +205,44 @@ static void change_clocksource(void)
200 */ 205 */
201} 206}
202#else 207#else
208static inline void clocksource_forward_now(void) { }
203static inline void change_clocksource(void) { } 209static inline void change_clocksource(void) { }
204static inline s64 __get_nsec_offset(void) { return 0; }
205#endif 210#endif
206 211
207/** 212/**
213 * getrawmonotonic - Returns the raw monotonic time in a timespec
214 * @ts: pointer to the timespec to be set
215 *
216 * Returns the raw monotonic time (completely un-modified by ntp)
217 */
218void getrawmonotonic(struct timespec *ts)
219{
220 unsigned long seq;
221 s64 nsecs;
222 cycle_t cycle_now, cycle_delta;
223
224 do {
225 seq = read_seqbegin(&xtime_lock);
226
227 /* read clocksource: */
228 cycle_now = clocksource_read(clock);
229
230 /* calculate the delta since the last update_wall_time: */
231 cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
232
233 /* convert to nanoseconds: */
234 nsecs = ((s64)cycle_delta * clock->mult_orig) >> clock->shift;
235
236 *ts = clock->raw_time;
237
238 } while (read_seqretry(&xtime_lock, seq));
239
240 timespec_add_ns(ts, nsecs);
241}
242EXPORT_SYMBOL(getrawmonotonic);
243
244
245/**
208 * timekeeping_valid_for_hres - Check if timekeeping is suitable for hres 246 * timekeeping_valid_for_hres - Check if timekeeping is suitable for hres
209 */ 247 */
210int timekeeping_valid_for_hres(void) 248int timekeeping_valid_for_hres(void)
@@ -265,8 +303,6 @@ void __init timekeeping_init(void)
265static int timekeeping_suspended; 303static int timekeeping_suspended;
266/* time in seconds when suspend began */ 304/* time in seconds when suspend began */
267static unsigned long timekeeping_suspend_time; 305static unsigned long timekeeping_suspend_time;
268/* xtime offset when we went into suspend */
269static s64 timekeeping_suspend_nsecs;
270 306
271/** 307/**
272 * timekeeping_resume - Resumes the generic timekeeping subsystem. 308 * timekeeping_resume - Resumes the generic timekeeping subsystem.
@@ -292,8 +328,6 @@ static int timekeeping_resume(struct sys_device *dev)
292 wall_to_monotonic.tv_sec -= sleep_length; 328 wall_to_monotonic.tv_sec -= sleep_length;
293 total_sleep_time += sleep_length; 329 total_sleep_time += sleep_length;
294 } 330 }
295 /* Make sure that we have the correct xtime reference */
296 timespec_add_ns(&xtime, timekeeping_suspend_nsecs);
297 update_xtime_cache(0); 331 update_xtime_cache(0);
298 /* re-base the last cycle value */ 332 /* re-base the last cycle value */
299 clock->cycle_last = 0; 333 clock->cycle_last = 0;
@@ -319,8 +353,7 @@ static int timekeeping_suspend(struct sys_device *dev, pm_message_t state)
319 timekeeping_suspend_time = read_persistent_clock(); 353 timekeeping_suspend_time = read_persistent_clock();
320 354
321 write_seqlock_irqsave(&xtime_lock, flags); 355 write_seqlock_irqsave(&xtime_lock, flags);
322 /* Get the current xtime offset */ 356 clocksource_forward_now();
323 timekeeping_suspend_nsecs = __get_nsec_offset();
324 timekeeping_suspended = 1; 357 timekeeping_suspended = 1;
325 write_sequnlock_irqrestore(&xtime_lock, flags); 358 write_sequnlock_irqrestore(&xtime_lock, flags);
326 359
@@ -454,23 +487,29 @@ void update_wall_time(void)
454#else 487#else
455 offset = clock->cycle_interval; 488 offset = clock->cycle_interval;
456#endif 489#endif
457 clock->xtime_nsec += (s64)xtime.tv_nsec << clock->shift; 490 clock->xtime_nsec = (s64)xtime.tv_nsec << clock->shift;
458 491
459 /* normally this loop will run just once, however in the 492 /* normally this loop will run just once, however in the
460 * case of lost or late ticks, it will accumulate correctly. 493 * case of lost or late ticks, it will accumulate correctly.
461 */ 494 */
462 while (offset >= clock->cycle_interval) { 495 while (offset >= clock->cycle_interval) {
463 /* accumulate one interval */ 496 /* accumulate one interval */
464 clock->xtime_nsec += clock->xtime_interval;
465 clock->cycle_last += clock->cycle_interval;
466 offset -= clock->cycle_interval; 497 offset -= clock->cycle_interval;
498 clock->cycle_last += clock->cycle_interval;
467 499
500 clock->xtime_nsec += clock->xtime_interval;
468 if (clock->xtime_nsec >= (u64)NSEC_PER_SEC << clock->shift) { 501 if (clock->xtime_nsec >= (u64)NSEC_PER_SEC << clock->shift) {
469 clock->xtime_nsec -= (u64)NSEC_PER_SEC << clock->shift; 502 clock->xtime_nsec -= (u64)NSEC_PER_SEC << clock->shift;
470 xtime.tv_sec++; 503 xtime.tv_sec++;
471 second_overflow(); 504 second_overflow();
472 } 505 }
473 506
507 clock->raw_time.tv_nsec += clock->raw_interval;
508 if (clock->raw_time.tv_nsec >= NSEC_PER_SEC) {
509 clock->raw_time.tv_nsec -= NSEC_PER_SEC;
510 clock->raw_time.tv_sec++;
511 }
512
474 /* accumulate error between NTP and clock interval */ 513 /* accumulate error between NTP and clock interval */
475 clock->error += tick_length; 514 clock->error += tick_length;
476 clock->error -= clock->xtime_interval << (NTP_SCALE_SHIFT - clock->shift); 515 clock->error -= clock->xtime_interval << (NTP_SCALE_SHIFT - clock->shift);
@@ -479,9 +518,12 @@ void update_wall_time(void)
479 /* correct the clock when NTP error is too big */ 518 /* correct the clock when NTP error is too big */
480 clocksource_adjust(offset); 519 clocksource_adjust(offset);
481 520
482 /* store full nanoseconds into xtime */ 521 /* store full nanoseconds into xtime after rounding it up and
483 xtime.tv_nsec = (s64)clock->xtime_nsec >> clock->shift; 522 * add the remainder to the error difference.
523 */
524 xtime.tv_nsec = ((s64)clock->xtime_nsec >> clock->shift) + 1;
484 clock->xtime_nsec -= (s64)xtime.tv_nsec << clock->shift; 525 clock->xtime_nsec -= (s64)xtime.tv_nsec << clock->shift;
526 clock->error += clock->xtime_nsec << (NTP_SCALE_SHIFT - clock->shift);
485 527
486 update_xtime_cache(cyc2ns(clock, offset)); 528 update_xtime_cache(cyc2ns(clock, offset));
487 529
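The clocksource_forward_now() and getrawmonotonic() hunks above both come down to the same cycle-to-nanosecond step: mask the cycle delta to the clocksource width, then scale it with the mult/shift pair. A standalone sketch of that conversion follows (simplified, not from the commit; the mask, mult and shift values are made up), including the mask that keeps the delta correct across a counter wrap.

#include <stdio.h>
#include <stdint.h>

/* cyc2ns(): ns = (delta * mult) >> shift, mirroring the clocksource scaling */
static inline uint64_t cyc2ns(uint64_t cycle_delta, uint32_t mult, uint32_t shift)
{
	return (cycle_delta * mult) >> shift;
}

int main(void)
{
	/* example clocksource: 32-bit counter, roughly 1 ns per cycle */
	const uint64_t mask = 0xffffffffULL;
	const uint32_t mult = 4194304, shift = 22;

	uint64_t cycle_last = 0xfffffff0ULL;	/* just before the wrap */
	uint64_t cycle_now  = 0x00000010ULL;	/* just after the wrap */

	uint64_t delta = (cycle_now - cycle_last) & mask;

	printf("delta=%llu cycles -> %llu ns\n",
	       (unsigned long long)delta,
	       (unsigned long long)cyc2ns(delta, mult, shift));
	return 0;
}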
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index a40e20fd0001..a999b92a1277 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -47,13 +47,14 @@ static void print_name_offset(struct seq_file *m, void *sym)
47} 47}
48 48
49static void 49static void
50print_timer(struct seq_file *m, struct hrtimer *timer, int idx, u64 now) 50print_timer(struct seq_file *m, struct hrtimer *taddr, struct hrtimer *timer,
51 int idx, u64 now)
51{ 52{
52#ifdef CONFIG_TIMER_STATS 53#ifdef CONFIG_TIMER_STATS
53 char tmp[TASK_COMM_LEN + 1]; 54 char tmp[TASK_COMM_LEN + 1];
54#endif 55#endif
55 SEQ_printf(m, " #%d: ", idx); 56 SEQ_printf(m, " #%d: ", idx);
56 print_name_offset(m, timer); 57 print_name_offset(m, taddr);
57 SEQ_printf(m, ", "); 58 SEQ_printf(m, ", ");
58 print_name_offset(m, timer->function); 59 print_name_offset(m, timer->function);
59 SEQ_printf(m, ", S:%02lx", timer->state); 60 SEQ_printf(m, ", S:%02lx", timer->state);
@@ -65,9 +66,11 @@ print_timer(struct seq_file *m, struct hrtimer *timer, int idx, u64 now)
65 SEQ_printf(m, ", %s/%d", tmp, timer->start_pid); 66 SEQ_printf(m, ", %s/%d", tmp, timer->start_pid);
66#endif 67#endif
67 SEQ_printf(m, "\n"); 68 SEQ_printf(m, "\n");
68 SEQ_printf(m, " # expires at %Lu nsecs [in %Ld nsecs]\n", 69 SEQ_printf(m, " # expires at %Lu-%Lu nsecs [in %Ld to %Ld nsecs]\n",
69 (unsigned long long)ktime_to_ns(timer->expires), 70 (unsigned long long)ktime_to_ns(hrtimer_get_softexpires(timer)),
70 (long long)(ktime_to_ns(timer->expires) - now)); 71 (unsigned long long)ktime_to_ns(hrtimer_get_expires(timer)),
72 (long long)(ktime_to_ns(hrtimer_get_softexpires(timer)) - now),
73 (long long)(ktime_to_ns(hrtimer_get_expires(timer)) - now));
71} 74}
72 75
73static void 76static void
@@ -99,7 +102,7 @@ next_one:
99 tmp = *timer; 102 tmp = *timer;
100 spin_unlock_irqrestore(&base->cpu_base->lock, flags); 103 spin_unlock_irqrestore(&base->cpu_base->lock, flags);
101 104
102 print_timer(m, &tmp, i, now); 105 print_timer(m, timer, &tmp, i, now);
103 next++; 106 next++;
104 goto next_one; 107 goto next_one;
105 } 108 }
@@ -109,6 +112,7 @@ next_one:
109static void 112static void
110print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now) 113print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now)
111{ 114{
115 SEQ_printf(m, " .base: %p\n", base);
112 SEQ_printf(m, " .index: %d\n", 116 SEQ_printf(m, " .index: %d\n",
113 base->index); 117 base->index);
114 SEQ_printf(m, " .resolution: %Lu nsecs\n", 118 SEQ_printf(m, " .resolution: %Lu nsecs\n",
@@ -183,12 +187,16 @@ static void print_cpu(struct seq_file *m, int cpu, u64 now)
183 187
184#ifdef CONFIG_GENERIC_CLOCKEVENTS 188#ifdef CONFIG_GENERIC_CLOCKEVENTS
185static void 189static void
186print_tickdevice(struct seq_file *m, struct tick_device *td) 190print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu)
187{ 191{
188 struct clock_event_device *dev = td->evtdev; 192 struct clock_event_device *dev = td->evtdev;
189 193
190 SEQ_printf(m, "\n"); 194 SEQ_printf(m, "\n");
191 SEQ_printf(m, "Tick Device: mode: %d\n", td->mode); 195 SEQ_printf(m, "Tick Device: mode: %d\n", td->mode);
196 if (cpu < 0)
197 SEQ_printf(m, "Broadcast device\n");
198 else
199 SEQ_printf(m, "Per CPU device: %d\n", cpu);
192 200
193 SEQ_printf(m, "Clock Event Device: "); 201 SEQ_printf(m, "Clock Event Device: ");
194 if (!dev) { 202 if (!dev) {
@@ -222,7 +230,7 @@ static void timer_list_show_tickdevices(struct seq_file *m)
222 int cpu; 230 int cpu;
223 231
224#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST 232#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
225 print_tickdevice(m, tick_get_broadcast_device()); 233 print_tickdevice(m, tick_get_broadcast_device(), -1);
226 SEQ_printf(m, "tick_broadcast_mask: %08lx\n", 234 SEQ_printf(m, "tick_broadcast_mask: %08lx\n",
227 tick_get_broadcast_mask()->bits[0]); 235 tick_get_broadcast_mask()->bits[0]);
228#ifdef CONFIG_TICK_ONESHOT 236#ifdef CONFIG_TICK_ONESHOT
@@ -232,7 +240,7 @@ static void timer_list_show_tickdevices(struct seq_file *m)
232 SEQ_printf(m, "\n"); 240 SEQ_printf(m, "\n");
233#endif 241#endif
234 for_each_online_cpu(cpu) 242 for_each_online_cpu(cpu)
235 print_tickdevice(m, tick_get_device(cpu)); 243 print_tickdevice(m, tick_get_device(cpu), cpu);
236 SEQ_printf(m, "\n"); 244 SEQ_printf(m, "\n");
237} 245}
238#else 246#else
@@ -244,7 +252,7 @@ static int timer_list_show(struct seq_file *m, void *v)
244 u64 now = ktime_to_ns(ktime_get()); 252 u64 now = ktime_to_ns(ktime_get());
245 int cpu; 253 int cpu;
246 254
247 SEQ_printf(m, "Timer List Version: v0.3\n"); 255 SEQ_printf(m, "Timer List Version: v0.4\n");
248 SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES); 256 SEQ_printf(m, "HRTIMER_MAX_CLOCK_BASES: %d\n", HRTIMER_MAX_CLOCK_BASES);
249 SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now); 257 SEQ_printf(m, "now at %Ld nsecs\n", (unsigned long long)now);
250 258
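An illustrative reproduction of the v0.4 per-timer line that the print_timer() change above emits, now reporting both the soft and the hard expiry; this is not part of the commit and the timestamps are invented.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t now = 1000000000ULL;		/* pretend "now" in ns */
	uint64_t softexpires = 1004000000ULL;	/* earliest acceptable expiry */
	uint64_t expires = 1005000000ULL;	/* latest (hard) expiry */

	printf(" # expires at %llu-%llu nsecs [in %lld to %lld nsecs]\n",
	       (unsigned long long)softexpires,
	       (unsigned long long)expires,
	       (long long)(softexpires - now),
	       (long long)(expires - now));
	return 0;
}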
diff --git a/kernel/timer.c b/kernel/timer.c
index 03bc7f1f1593..56becf373c58 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -978,6 +978,7 @@ void update_process_times(int user_tick)
978 run_local_timers(); 978 run_local_timers();
979 if (rcu_pending(cpu)) 979 if (rcu_pending(cpu))
980 rcu_check_callbacks(cpu, user_tick); 980 rcu_check_callbacks(cpu, user_tick);
981 printk_tick();
981 scheduler_tick(); 982 scheduler_tick();
982 run_posix_cpu_timers(p); 983 run_posix_cpu_timers(p);
983} 984}
@@ -1435,9 +1436,11 @@ static void __cpuinit migrate_timers(int cpu)
1435 BUG_ON(cpu_online(cpu)); 1436 BUG_ON(cpu_online(cpu));
1436 old_base = per_cpu(tvec_bases, cpu); 1437 old_base = per_cpu(tvec_bases, cpu);
1437 new_base = get_cpu_var(tvec_bases); 1438 new_base = get_cpu_var(tvec_bases);
1438 1439 /*
1439 local_irq_disable(); 1440 * The caller is globally serialized and nobody else
1440 spin_lock(&new_base->lock); 1441 * takes two locks at once, deadlock is not possible.
1442 */
1443 spin_lock_irq(&new_base->lock);
1441 spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); 1444 spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
1442 1445
1443 BUG_ON(old_base->running_timer); 1446 BUG_ON(old_base->running_timer);
@@ -1452,8 +1455,7 @@ static void __cpuinit migrate_timers(int cpu)
1452 } 1455 }
1453 1456
1454 spin_unlock(&old_base->lock); 1457 spin_unlock(&old_base->lock);
1455 spin_unlock(&new_base->lock); 1458 spin_unlock_irq(&new_base->lock);
1456 local_irq_enable();
1457 put_cpu_var(tvec_bases); 1459 put_cpu_var(tvec_bases);
1458} 1460}
1459#endif /* CONFIG_HOTPLUG_CPU */ 1461#endif /* CONFIG_HOTPLUG_CPU */
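The migrate_timers() hunk above trades the open-coded local_irq_disable()/spin_lock() pair for spin_lock_irq() on the destination base plus a nested lock on the source. Below is a userspace sketch of that lock ordering with pthread mutexes (illustrative only, not from the commit; build with -pthread). As the new comment in the diff notes, deadlock freedom relies on the caller being globally serialized, not on the lock order itself.

#include <pthread.h>
#include <stdio.h>

struct timer { struct timer *next; int id; };

struct base {
	pthread_mutex_t lock;
	struct timer *pending;
};

static void migrate(struct base *old_base, struct base *new_base)
{
	pthread_mutex_lock(&new_base->lock);	/* destination lock first */
	pthread_mutex_lock(&old_base->lock);	/* source lock nested inside */

	/* splice every pending timer onto the new base */
	while (old_base->pending) {
		struct timer *t = old_base->pending;

		old_base->pending = t->next;
		t->next = new_base->pending;
		new_base->pending = t;
	}

	pthread_mutex_unlock(&old_base->lock);
	pthread_mutex_unlock(&new_base->lock);
}

int main(void)
{
	struct timer t1 = { NULL, 1 }, t2 = { &t1, 2 };
	struct base old_base = { PTHREAD_MUTEX_INITIALIZER, &t2 };
	struct base new_base = { PTHREAD_MUTEX_INITIALIZER, NULL };

	migrate(&old_base, &new_base);

	for (struct timer *t = new_base.pending; t; t = t->next)
		printf("timer %d migrated\n", t->id);
	return 0;
}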
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 263e9e6bbd60..1cb3e1f616af 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -1,23 +1,37 @@
1# 1#
2# Architectures that offer an FTRACE implementation should select HAVE_FTRACE: 2# Architectures that offer an FTRACE implementation should select HAVE_FTRACE:
3# 3#
4
5config NOP_TRACER
6 bool
7
4config HAVE_FTRACE 8config HAVE_FTRACE
5 bool 9 bool
10 select NOP_TRACER
6 11
7config HAVE_DYNAMIC_FTRACE 12config HAVE_DYNAMIC_FTRACE
8 bool 13 bool
9 14
15config HAVE_FTRACE_MCOUNT_RECORD
16 bool
17
10config TRACER_MAX_TRACE 18config TRACER_MAX_TRACE
11 bool 19 bool
12 20
21config RING_BUFFER
22 bool
23
13config TRACING 24config TRACING
14 bool 25 bool
15 select DEBUG_FS 26 select DEBUG_FS
27 select RING_BUFFER
16 select STACKTRACE 28 select STACKTRACE
29 select TRACEPOINTS
17 30
18config FTRACE 31config FTRACE
19 bool "Kernel Function Tracer" 32 bool "Kernel Function Tracer"
20 depends on HAVE_FTRACE 33 depends on HAVE_FTRACE
34 depends on DEBUG_KERNEL
21 select FRAME_POINTER 35 select FRAME_POINTER
22 select TRACING 36 select TRACING
23 select CONTEXT_SWITCH_TRACER 37 select CONTEXT_SWITCH_TRACER
@@ -36,6 +50,7 @@ config IRQSOFF_TRACER
36 depends on TRACE_IRQFLAGS_SUPPORT 50 depends on TRACE_IRQFLAGS_SUPPORT
37 depends on GENERIC_TIME 51 depends on GENERIC_TIME
38 depends on HAVE_FTRACE 52 depends on HAVE_FTRACE
53 depends on DEBUG_KERNEL
39 select TRACE_IRQFLAGS 54 select TRACE_IRQFLAGS
40 select TRACING 55 select TRACING
41 select TRACER_MAX_TRACE 56 select TRACER_MAX_TRACE
@@ -59,6 +74,7 @@ config PREEMPT_TRACER
59 depends on GENERIC_TIME 74 depends on GENERIC_TIME
60 depends on PREEMPT 75 depends on PREEMPT
61 depends on HAVE_FTRACE 76 depends on HAVE_FTRACE
77 depends on DEBUG_KERNEL
62 select TRACING 78 select TRACING
63 select TRACER_MAX_TRACE 79 select TRACER_MAX_TRACE
64 help 80 help
@@ -86,6 +102,7 @@ config SYSPROF_TRACER
86config SCHED_TRACER 102config SCHED_TRACER
87 bool "Scheduling Latency Tracer" 103 bool "Scheduling Latency Tracer"
88 depends on HAVE_FTRACE 104 depends on HAVE_FTRACE
105 depends on DEBUG_KERNEL
89 select TRACING 106 select TRACING
90 select CONTEXT_SWITCH_TRACER 107 select CONTEXT_SWITCH_TRACER
91 select TRACER_MAX_TRACE 108 select TRACER_MAX_TRACE
@@ -96,16 +113,56 @@ config SCHED_TRACER
96config CONTEXT_SWITCH_TRACER 113config CONTEXT_SWITCH_TRACER
97 bool "Trace process context switches" 114 bool "Trace process context switches"
98 depends on HAVE_FTRACE 115 depends on HAVE_FTRACE
116 depends on DEBUG_KERNEL
99 select TRACING 117 select TRACING
100 select MARKERS 118 select MARKERS
101 help 119 help
102 This tracer gets called from the context switch and records 120 This tracer gets called from the context switch and records
103 all switching of tasks. 121 all switching of tasks.
104 122
123config BOOT_TRACER
124 bool "Trace boot initcalls"
125 depends on HAVE_FTRACE
126 depends on DEBUG_KERNEL
127 select TRACING
128 help
129 This tracer helps developers to optimize boot times: it records
130 the timings of the initcalls and traces key events and the identity
131 of tasks that can cause boot delays, such as context-switches.
132
133 Its aim is to be parsed by the /scripts/bootgraph.pl tool to
134 produce pretty graphics about boot inefficiencies, giving a visual
135 representation of the delays during initcalls - but the raw
136 /debug/tracing/trace text output is readable too.
137
138 ( Note that tracing self tests can't be enabled if this tracer is
139 selected, because the self-tests are an initcall as well and that
140 would invalidate the boot trace. )
141
142config STACK_TRACER
143 bool "Trace max stack"
144 depends on HAVE_FTRACE
145 depends on DEBUG_KERNEL
146 select FTRACE
147 select STACKTRACE
148 help
149 This special tracer records the maximum stack footprint of the
150 kernel and displays it in debugfs/tracing/stack_trace.
151
152 This tracer works by hooking into every function call that the
153 kernel executes, and keeping a maximum stack depth value and
154 stack-trace saved. Because this logic has to execute in every
155 kernel function, all the time, this option can slow down the
156 kernel measurably and is generally intended for kernel
157 developers only.
158
159 Say N if unsure.
160
105config DYNAMIC_FTRACE 161config DYNAMIC_FTRACE
106 bool "enable/disable ftrace tracepoints dynamically" 162 bool "enable/disable ftrace tracepoints dynamically"
107 depends on FTRACE 163 depends on FTRACE
108 depends on HAVE_DYNAMIC_FTRACE 164 depends on HAVE_DYNAMIC_FTRACE
165 depends on DEBUG_KERNEL
109 default y 166 default y
110 help 167 help
111 This option will modify all the calls to ftrace dynamically 168 This option will modify all the calls to ftrace dynamically
@@ -121,12 +178,17 @@ config DYNAMIC_FTRACE
121 were made. If so, it runs stop_machine (stops all CPUS) 178 were made. If so, it runs stop_machine (stops all CPUS)
122 and modifies the code to jump over the call to ftrace. 179 and modifies the code to jump over the call to ftrace.
123 180
181config FTRACE_MCOUNT_RECORD
182 def_bool y
183 depends on DYNAMIC_FTRACE
184 depends on HAVE_FTRACE_MCOUNT_RECORD
185
124config FTRACE_SELFTEST 186config FTRACE_SELFTEST
125 bool 187 bool
126 188
127config FTRACE_STARTUP_TEST 189config FTRACE_STARTUP_TEST
128 bool "Perform a startup test on ftrace" 190 bool "Perform a startup test on ftrace"
129 depends on TRACING 191 depends on TRACING && DEBUG_KERNEL && !BOOT_TRACER
130 select FTRACE_SELFTEST 192 select FTRACE_SELFTEST
131 help 193 help
132 This option performs a series of startup tests on ftrace. On bootup 194 This option performs a series of startup tests on ftrace. On bootup
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 71d17de17288..a85dfba88ba0 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -11,6 +11,7 @@ obj-y += trace_selftest_dynamic.o
11endif 11endif
12 12
13obj-$(CONFIG_FTRACE) += libftrace.o 13obj-$(CONFIG_FTRACE) += libftrace.o
14obj-$(CONFIG_RING_BUFFER) += ring_buffer.o
14 15
15obj-$(CONFIG_TRACING) += trace.o 16obj-$(CONFIG_TRACING) += trace.o
16obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o 17obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o
@@ -19,6 +20,9 @@ obj-$(CONFIG_FTRACE) += trace_functions.o
19obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o 20obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o
20obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o 21obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o
21obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o 22obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o
23obj-$(CONFIG_NOP_TRACER) += trace_nop.o
24obj-$(CONFIG_STACK_TRACER) += trace_stack.o
22obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o 25obj-$(CONFIG_MMIOTRACE) += trace_mmiotrace.o
26obj-$(CONFIG_BOOT_TRACER) += trace_boot.o
23 27
24libftrace-y := ftrace.o 28libftrace-y := ftrace.o
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 4231a3dc224a..4dda4f60a2a9 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -81,7 +81,7 @@ void clear_ftrace_function(void)
81 81
82static int __register_ftrace_function(struct ftrace_ops *ops) 82static int __register_ftrace_function(struct ftrace_ops *ops)
83{ 83{
84 /* Should never be called by interrupts */ 84 /* should not be called from interrupt context */
85 spin_lock(&ftrace_lock); 85 spin_lock(&ftrace_lock);
86 86
87 ops->next = ftrace_list; 87 ops->next = ftrace_list;
@@ -115,6 +115,7 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)
115 struct ftrace_ops **p; 115 struct ftrace_ops **p;
116 int ret = 0; 116 int ret = 0;
117 117
118 /* should not be called from interrupt context */
118 spin_lock(&ftrace_lock); 119 spin_lock(&ftrace_lock);
119 120
120 /* 121 /*
@@ -153,6 +154,30 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)
153 154
154#ifdef CONFIG_DYNAMIC_FTRACE 155#ifdef CONFIG_DYNAMIC_FTRACE
155 156
157#ifndef CONFIG_FTRACE_MCOUNT_RECORD
158/*
159 * The hash lock is only needed when the recording of the mcount
160 * callers are dynamic. That is, by the caller themselves and
161 * not recorded via the compilation.
162 */
163static DEFINE_SPINLOCK(ftrace_hash_lock);
164#define ftrace_hash_lock(flags) spin_lock_irqsave(&ftrace_hash_lock, flags)
165#define ftrace_hash_unlock(flags) \
166 spin_unlock_irqrestore(&ftrace_hash_lock, flags)
167#else
168/* This is protected via the ftrace_lock with MCOUNT_RECORD. */
169#define ftrace_hash_lock(flags) do { (void)(flags); } while (0)
170#define ftrace_hash_unlock(flags) do { } while(0)
171#endif
172
173/*
174 * Since MCOUNT_ADDR may point to mcount itself, we do not want
175 * to get it confused by reading a reference in the code as we
176 * are parsing on objcopy output of text. Use a variable for
177 * it instead.
178 */
179static unsigned long mcount_addr = MCOUNT_ADDR;
180
156static struct task_struct *ftraced_task; 181static struct task_struct *ftraced_task;
157 182
158enum { 183enum {
@@ -171,7 +196,6 @@ static struct hlist_head ftrace_hash[FTRACE_HASHSIZE];
171 196
172static DEFINE_PER_CPU(int, ftrace_shutdown_disable_cpu); 197static DEFINE_PER_CPU(int, ftrace_shutdown_disable_cpu);
173 198
174static DEFINE_SPINLOCK(ftrace_shutdown_lock);
175static DEFINE_MUTEX(ftraced_lock); 199static DEFINE_MUTEX(ftraced_lock);
176static DEFINE_MUTEX(ftrace_regex_lock); 200static DEFINE_MUTEX(ftrace_regex_lock);
177 201
@@ -294,13 +318,37 @@ static inline void ftrace_del_hash(struct dyn_ftrace *node)
294 318
295static void ftrace_free_rec(struct dyn_ftrace *rec) 319static void ftrace_free_rec(struct dyn_ftrace *rec)
296{ 320{
297 /* no locking, only called from kstop_machine */
298
299 rec->ip = (unsigned long)ftrace_free_records; 321 rec->ip = (unsigned long)ftrace_free_records;
300 ftrace_free_records = rec; 322 ftrace_free_records = rec;
301 rec->flags |= FTRACE_FL_FREE; 323 rec->flags |= FTRACE_FL_FREE;
302} 324}
303 325
326void ftrace_release(void *start, unsigned long size)
327{
328 struct dyn_ftrace *rec;
329 struct ftrace_page *pg;
330 unsigned long s = (unsigned long)start;
331 unsigned long e = s + size;
332 int i;
333
334 if (ftrace_disabled || !start)
335 return;
336
337 /* should not be called from interrupt context */
338 spin_lock(&ftrace_lock);
339
340 for (pg = ftrace_pages_start; pg; pg = pg->next) {
341 for (i = 0; i < pg->index; i++) {
342 rec = &pg->records[i];
343
344 if ((rec->ip >= s) && (rec->ip < e))
345 ftrace_free_rec(rec);
346 }
347 }
348 spin_unlock(&ftrace_lock);
349
350}
351
304static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip) 352static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip)
305{ 353{
306 struct dyn_ftrace *rec; 354 struct dyn_ftrace *rec;
@@ -338,7 +386,6 @@ ftrace_record_ip(unsigned long ip)
338 unsigned long flags; 386 unsigned long flags;
339 unsigned long key; 387 unsigned long key;
340 int resched; 388 int resched;
341 int atomic;
342 int cpu; 389 int cpu;
343 390
344 if (!ftrace_enabled || ftrace_disabled) 391 if (!ftrace_enabled || ftrace_disabled)
@@ -368,9 +415,7 @@ ftrace_record_ip(unsigned long ip)
368 if (ftrace_ip_in_hash(ip, key)) 415 if (ftrace_ip_in_hash(ip, key))
369 goto out; 416 goto out;
370 417
371 atomic = irqs_disabled(); 418 ftrace_hash_lock(flags);
372
373 spin_lock_irqsave(&ftrace_shutdown_lock, flags);
374 419
375 /* This ip may have hit the hash before the lock */ 420 /* This ip may have hit the hash before the lock */
376 if (ftrace_ip_in_hash(ip, key)) 421 if (ftrace_ip_in_hash(ip, key))
@@ -387,7 +432,7 @@ ftrace_record_ip(unsigned long ip)
387 ftraced_trigger = 1; 432 ftraced_trigger = 1;
388 433
389 out_unlock: 434 out_unlock:
390 spin_unlock_irqrestore(&ftrace_shutdown_lock, flags); 435 ftrace_hash_unlock(flags);
391 out: 436 out:
392 per_cpu(ftrace_shutdown_disable_cpu, cpu)--; 437 per_cpu(ftrace_shutdown_disable_cpu, cpu)--;
393 438
@@ -531,6 +576,16 @@ static void ftrace_shutdown_replenish(void)
531 ftrace_pages->next = (void *)get_zeroed_page(GFP_KERNEL); 576 ftrace_pages->next = (void *)get_zeroed_page(GFP_KERNEL);
532} 577}
533 578
579static void print_ip_ins(const char *fmt, unsigned char *p)
580{
581 int i;
582
583 printk(KERN_CONT "%s", fmt);
584
585 for (i = 0; i < MCOUNT_INSN_SIZE; i++)
586 printk(KERN_CONT "%s%02x", i ? ":" : "", p[i]);
587}
588
534static int 589static int
535ftrace_code_disable(struct dyn_ftrace *rec) 590ftrace_code_disable(struct dyn_ftrace *rec)
536{ 591{
@@ -541,10 +596,27 @@ ftrace_code_disable(struct dyn_ftrace *rec)
541 ip = rec->ip; 596 ip = rec->ip;
542 597
543 nop = ftrace_nop_replace(); 598 nop = ftrace_nop_replace();
544 call = ftrace_call_replace(ip, MCOUNT_ADDR); 599 call = ftrace_call_replace(ip, mcount_addr);
545 600
546 failed = ftrace_modify_code(ip, call, nop); 601 failed = ftrace_modify_code(ip, call, nop);
547 if (failed) { 602 if (failed) {
603 switch (failed) {
604 case 1:
605 WARN_ON_ONCE(1);
606 pr_info("ftrace faulted on modifying ");
607 print_ip_sym(ip);
608 break;
609 case 2:
610 WARN_ON_ONCE(1);
611 pr_info("ftrace failed to modify ");
612 print_ip_sym(ip);
613 print_ip_ins(" expected: ", call);
614 print_ip_ins(" actual: ", (unsigned char *)ip);
615 print_ip_ins(" replace: ", nop);
616 printk(KERN_CONT "\n");
617 break;
618 }
619
548 rec->flags |= FTRACE_FL_FAILED; 620 rec->flags |= FTRACE_FL_FAILED;
549 return 0; 621 return 0;
550 } 622 }
@@ -587,7 +659,7 @@ static int __ftrace_modify_code(void *data)
587 659
588static void ftrace_run_update_code(int command) 660static void ftrace_run_update_code(int command)
589{ 661{
590 stop_machine_run(__ftrace_modify_code, &command, NR_CPUS); 662 stop_machine(__ftrace_modify_code, &command, NULL);
591} 663}
592 664
593void ftrace_disable_daemon(void) 665void ftrace_disable_daemon(void)
@@ -787,52 +859,12 @@ static int ftrace_update_code(void)
787 !ftrace_enabled || !ftraced_trigger) 859 !ftrace_enabled || !ftraced_trigger)
788 return 0; 860 return 0;
789 861
790 stop_machine_run(__ftrace_update_code, NULL, NR_CPUS); 862 stop_machine(__ftrace_update_code, NULL, NULL);
791 863
792 return 1; 864 return 1;
793} 865}
794 866
795static int ftraced(void *ignore) 867static int __init ftrace_dyn_table_alloc(unsigned long num_to_init)
796{
797 unsigned long usecs;
798
799 while (!kthread_should_stop()) {
800
801 set_current_state(TASK_INTERRUPTIBLE);
802
803 /* check once a second */
804 schedule_timeout(HZ);
805
806 if (unlikely(ftrace_disabled))
807 continue;
808
809 mutex_lock(&ftrace_sysctl_lock);
810 mutex_lock(&ftraced_lock);
811 if (!ftraced_suspend && !ftraced_stop &&
812 ftrace_update_code()) {
813 usecs = nsecs_to_usecs(ftrace_update_time);
814 if (ftrace_update_tot_cnt > 100000) {
815 ftrace_update_tot_cnt = 0;
816 pr_info("hm, dftrace overflow: %lu change%s"
817 " (%lu total) in %lu usec%s\n",
818 ftrace_update_cnt,
819 ftrace_update_cnt != 1 ? "s" : "",
820 ftrace_update_tot_cnt,
821 usecs, usecs != 1 ? "s" : "");
822 ftrace_disabled = 1;
823 WARN_ON_ONCE(1);
824 }
825 }
826 mutex_unlock(&ftraced_lock);
827 mutex_unlock(&ftrace_sysctl_lock);
828
829 ftrace_shutdown_replenish();
830 }
831 __set_current_state(TASK_RUNNING);
832 return 0;
833}
834
835static int __init ftrace_dyn_table_alloc(void)
836{ 868{
837 struct ftrace_page *pg; 869 struct ftrace_page *pg;
838 int cnt; 870 int cnt;
@@ -859,7 +891,9 @@ static int __init ftrace_dyn_table_alloc(void)
859 891
860 pg = ftrace_pages = ftrace_pages_start; 892 pg = ftrace_pages = ftrace_pages_start;
861 893
862 cnt = NR_TO_INIT / ENTRIES_PER_PAGE; 894 cnt = num_to_init / ENTRIES_PER_PAGE;
895 pr_info("ftrace: allocating %ld hash entries in %d pages\n",
896 num_to_init, cnt);
863 897
864 for (i = 0; i < cnt; i++) { 898 for (i = 0; i < cnt; i++) {
865 pg->next = (void *)get_zeroed_page(GFP_KERNEL); 899 pg->next = (void *)get_zeroed_page(GFP_KERNEL);
@@ -901,6 +935,8 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
901 935
902 (*pos)++; 936 (*pos)++;
903 937
938 /* should not be called from interrupt context */
939 spin_lock(&ftrace_lock);
904 retry: 940 retry:
905 if (iter->idx >= iter->pg->index) { 941 if (iter->idx >= iter->pg->index) {
906 if (iter->pg->next) { 942 if (iter->pg->next) {
@@ -910,15 +946,13 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
910 } 946 }
911 } else { 947 } else {
912 rec = &iter->pg->records[iter->idx++]; 948 rec = &iter->pg->records[iter->idx++];
913 if ((!(iter->flags & FTRACE_ITER_FAILURES) && 949 if ((rec->flags & FTRACE_FL_FREE) ||
950
951 (!(iter->flags & FTRACE_ITER_FAILURES) &&
914 (rec->flags & FTRACE_FL_FAILED)) || 952 (rec->flags & FTRACE_FL_FAILED)) ||
915 953
916 ((iter->flags & FTRACE_ITER_FAILURES) && 954 ((iter->flags & FTRACE_ITER_FAILURES) &&
917 (!(rec->flags & FTRACE_FL_FAILED) || 955 !(rec->flags & FTRACE_FL_FAILED)) ||
918 (rec->flags & FTRACE_FL_FREE))) ||
919
920 ((iter->flags & FTRACE_ITER_FILTER) &&
921 !(rec->flags & FTRACE_FL_FILTER)) ||
922 956
923 ((iter->flags & FTRACE_ITER_NOTRACE) && 957 ((iter->flags & FTRACE_ITER_NOTRACE) &&
924 !(rec->flags & FTRACE_FL_NOTRACE))) { 958 !(rec->flags & FTRACE_FL_NOTRACE))) {
@@ -926,6 +960,7 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
926 goto retry; 960 goto retry;
927 } 961 }
928 } 962 }
963 spin_unlock(&ftrace_lock);
929 964
930 iter->pos = *pos; 965 iter->pos = *pos;
931 966
@@ -1039,8 +1074,8 @@ static void ftrace_filter_reset(int enable)
1039 unsigned long type = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE; 1074 unsigned long type = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE;
1040 unsigned i; 1075 unsigned i;
1041 1076
1042 /* keep kstop machine from running */ 1077 /* should not be called from interrupt context */
1043 preempt_disable(); 1078 spin_lock(&ftrace_lock);
1044 if (enable) 1079 if (enable)
1045 ftrace_filtered = 0; 1080 ftrace_filtered = 0;
1046 pg = ftrace_pages_start; 1081 pg = ftrace_pages_start;
@@ -1053,7 +1088,7 @@ static void ftrace_filter_reset(int enable)
1053 } 1088 }
1054 pg = pg->next; 1089 pg = pg->next;
1055 } 1090 }
1056 preempt_enable(); 1091 spin_unlock(&ftrace_lock);
1057} 1092}
1058 1093
1059static int 1094static int
@@ -1165,8 +1200,8 @@ ftrace_match(unsigned char *buff, int len, int enable)
1165 } 1200 }
1166 } 1201 }
1167 1202
1168 /* keep kstop machine from running */ 1203 /* should not be called from interrupt context */
1169 preempt_disable(); 1204 spin_lock(&ftrace_lock);
1170 if (enable) 1205 if (enable)
1171 ftrace_filtered = 1; 1206 ftrace_filtered = 1;
1172 pg = ftrace_pages_start; 1207 pg = ftrace_pages_start;
@@ -1203,7 +1238,7 @@ ftrace_match(unsigned char *buff, int len, int enable)
1203 } 1238 }
1204 pg = pg->next; 1239 pg = pg->next;
1205 } 1240 }
1206 preempt_enable(); 1241 spin_unlock(&ftrace_lock);
1207} 1242}
1208 1243
1209static ssize_t 1244static ssize_t
@@ -1556,6 +1591,114 @@ static __init int ftrace_init_debugfs(void)
1556 1591
1557fs_initcall(ftrace_init_debugfs); 1592fs_initcall(ftrace_init_debugfs);
1558 1593
1594#ifdef CONFIG_FTRACE_MCOUNT_RECORD
1595static int ftrace_convert_nops(unsigned long *start,
1596 unsigned long *end)
1597{
1598 unsigned long *p;
1599 unsigned long addr;
1600 unsigned long flags;
1601
1602 p = start;
1603 while (p < end) {
1604 addr = ftrace_call_adjust(*p++);
1605 /* should not be called from interrupt context */
1606 spin_lock(&ftrace_lock);
1607 ftrace_record_ip(addr);
1608 spin_unlock(&ftrace_lock);
1609 ftrace_shutdown_replenish();
1610 }
1611
1612 /* p is ignored */
1613 local_irq_save(flags);
1614 __ftrace_update_code(p);
1615 local_irq_restore(flags);
1616
1617 return 0;
1618}
1619
1620void ftrace_init_module(unsigned long *start, unsigned long *end)
1621{
1622 if (ftrace_disabled || start == end)
1623 return;
1624 ftrace_convert_nops(start, end);
1625}
1626
1627extern unsigned long __start_mcount_loc[];
1628extern unsigned long __stop_mcount_loc[];
1629
1630void __init ftrace_init(void)
1631{
1632 unsigned long count, addr, flags;
1633 int ret;
1634
1635 /* Keep the ftrace pointer to the stub */
1636 addr = (unsigned long)ftrace_stub;
1637
1638 local_irq_save(flags);
1639 ftrace_dyn_arch_init(&addr);
1640 local_irq_restore(flags);
1641
1642 /* ftrace_dyn_arch_init places the return code in addr */
1643 if (addr)
1644 goto failed;
1645
1646 count = __stop_mcount_loc - __start_mcount_loc;
1647
1648 ret = ftrace_dyn_table_alloc(count);
1649 if (ret)
1650 goto failed;
1651
1652 last_ftrace_enabled = ftrace_enabled = 1;
1653
1654 ret = ftrace_convert_nops(__start_mcount_loc,
1655 __stop_mcount_loc);
1656
1657 return;
1658 failed:
1659 ftrace_disabled = 1;
1660}
1661#else /* CONFIG_FTRACE_MCOUNT_RECORD */
1662static int ftraced(void *ignore)
1663{
1664 unsigned long usecs;
1665
1666 while (!kthread_should_stop()) {
1667
1668 set_current_state(TASK_INTERRUPTIBLE);
1669
1670 /* check once a second */
1671 schedule_timeout(HZ);
1672
1673 if (unlikely(ftrace_disabled))
1674 continue;
1675
1676 mutex_lock(&ftrace_sysctl_lock);
1677 mutex_lock(&ftraced_lock);
1678 if (!ftraced_suspend && !ftraced_stop &&
1679 ftrace_update_code()) {
1680 usecs = nsecs_to_usecs(ftrace_update_time);
1681 if (ftrace_update_tot_cnt > 100000) {
1682 ftrace_update_tot_cnt = 0;
1683 pr_info("hm, dftrace overflow: %lu change%s"
1684 " (%lu total) in %lu usec%s\n",
1685 ftrace_update_cnt,
1686 ftrace_update_cnt != 1 ? "s" : "",
1687 ftrace_update_tot_cnt,
1688 usecs, usecs != 1 ? "s" : "");
1689 ftrace_disabled = 1;
1690 WARN_ON_ONCE(1);
1691 }
1692 }
1693 mutex_unlock(&ftraced_lock);
1694 mutex_unlock(&ftrace_sysctl_lock);
1695
1696 ftrace_shutdown_replenish();
1697 }
1698 __set_current_state(TASK_RUNNING);
1699 return 0;
1700}
1701
1559static int __init ftrace_dynamic_init(void) 1702static int __init ftrace_dynamic_init(void)
1560{ 1703{
1561 struct task_struct *p; 1704 struct task_struct *p;
@@ -1564,7 +1707,7 @@ static int __init ftrace_dynamic_init(void)
1564 1707
1565 addr = (unsigned long)ftrace_record_ip; 1708 addr = (unsigned long)ftrace_record_ip;
1566 1709
1567 stop_machine_run(ftrace_dyn_arch_init, &addr, NR_CPUS); 1710 stop_machine(ftrace_dyn_arch_init, &addr, NULL);
1568 1711
1569 /* ftrace_dyn_arch_init places the return code in addr */ 1712 /* ftrace_dyn_arch_init places the return code in addr */
1570 if (addr) { 1713 if (addr) {
@@ -1572,7 +1715,7 @@ static int __init ftrace_dynamic_init(void)
1572 goto failed; 1715 goto failed;
1573 } 1716 }
1574 1717
1575 ret = ftrace_dyn_table_alloc(); 1718 ret = ftrace_dyn_table_alloc(NR_TO_INIT);
1576 if (ret) 1719 if (ret)
1577 goto failed; 1720 goto failed;
1578 1721
@@ -1593,6 +1736,8 @@ static int __init ftrace_dynamic_init(void)
1593} 1736}
1594 1737
1595core_initcall(ftrace_dynamic_init); 1738core_initcall(ftrace_dynamic_init);
1739#endif /* CONFIG_FTRACE_MCOUNT_RECORD */
1740
1596#else 1741#else
1597# define ftrace_startup() do { } while (0) 1742# define ftrace_startup() do { } while (0)
1598# define ftrace_shutdown() do { } while (0) 1743# define ftrace_shutdown() do { } while (0)
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
new file mode 100644
index 000000000000..94af1fe56bb4
--- /dev/null
+++ b/kernel/trace/ring_buffer.c
@@ -0,0 +1,2014 @@
1/*
2 * Generic ring buffer
3 *
4 * Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com>
5 */
6#include <linux/ring_buffer.h>
7#include <linux/spinlock.h>
8#include <linux/debugfs.h>
9#include <linux/uaccess.h>
10#include <linux/module.h>
11#include <linux/percpu.h>
12#include <linux/mutex.h>
13#include <linux/sched.h> /* used for sched_clock() (for now) */
14#include <linux/init.h>
15#include <linux/hash.h>
16#include <linux/list.h>
17#include <linux/fs.h>
18
19/* Up this if you want to test the TIME_EXTENTS and normalization */
20#define DEBUG_SHIFT 0
21
22/* FIXME!!! */
23u64 ring_buffer_time_stamp(int cpu)
24{
25 /* shift to debug/test normalization and TIME_EXTENTS */
26 return sched_clock() << DEBUG_SHIFT;
27}
28
29void ring_buffer_normalize_time_stamp(int cpu, u64 *ts)
30{
31 /* Just stupid testing the normalize function and deltas */
32 *ts >>= DEBUG_SHIFT;
33}
34
35#define RB_EVNT_HDR_SIZE (sizeof(struct ring_buffer_event))
36#define RB_ALIGNMENT_SHIFT 2
37#define RB_ALIGNMENT (1 << RB_ALIGNMENT_SHIFT)
38#define RB_MAX_SMALL_DATA 28
39
40enum {
41 RB_LEN_TIME_EXTEND = 8,
42 RB_LEN_TIME_STAMP = 16,
43};
44
45/* inline for ring buffer fast paths */
46static inline unsigned
47rb_event_length(struct ring_buffer_event *event)
48{
49 unsigned length;
50
51 switch (event->type) {
52 case RINGBUF_TYPE_PADDING:
53 /* undefined */
54 return -1;
55
56 case RINGBUF_TYPE_TIME_EXTEND:
57 return RB_LEN_TIME_EXTEND;
58
59 case RINGBUF_TYPE_TIME_STAMP:
60 return RB_LEN_TIME_STAMP;
61
62 case RINGBUF_TYPE_DATA:
63 if (event->len)
64 length = event->len << RB_ALIGNMENT_SHIFT;
65 else
66 length = event->array[0];
67 return length + RB_EVNT_HDR_SIZE;
68 default:
69 BUG();
70 }
71 /* not hit */
72 return 0;
73}
74
75/**
76 * ring_buffer_event_length - return the length of the event
77 * @event: the event to get the length of
78 */
79unsigned ring_buffer_event_length(struct ring_buffer_event *event)
80{
81 return rb_event_length(event);
82}
83
84/* inline for ring buffer fast paths */
85static inline void *
86rb_event_data(struct ring_buffer_event *event)
87{
88 BUG_ON(event->type != RINGBUF_TYPE_DATA);
89 /* If length is in len field, then array[0] has the data */
90 if (event->len)
91 return (void *)&event->array[0];
92 /* Otherwise length is in array[0] and array[1] has the data */
93 return (void *)&event->array[1];
94}
95
96/**
97 * ring_buffer_event_data - return the data of the event
98 * @event: the event to get the data from
99 */
100void *ring_buffer_event_data(struct ring_buffer_event *event)
101{
102 return rb_event_data(event);
103}
104
105#define for_each_buffer_cpu(buffer, cpu) \
106 for_each_cpu_mask(cpu, buffer->cpumask)
107
108#define TS_SHIFT 27
109#define TS_MASK ((1ULL << TS_SHIFT) - 1)
110#define TS_DELTA_TEST (~TS_MASK)
111
112/*
113 * This hack stolen from mm/slob.c.
114 * We can store per page timing information in the page frame of the page.
115 * Thanks to Peter Zijlstra for suggesting this idea.
116 */
117struct buffer_page {
118 u64 time_stamp; /* page time stamp */
119 local_t write; /* index for next write */
120 local_t commit; /* write committed index */
121 unsigned read; /* index for next read */
122 struct list_head list; /* list of free pages */
123 void *page; /* Actual data page */
124};
125
126/*
127 * Also stolen from mm/slob.c. Thanks to Mathieu Desnoyers for pointing
128 * this issue out.
129 */
130static inline void free_buffer_page(struct buffer_page *bpage)
131{
132 if (bpage->page)
133 __free_page(bpage->page);
134 kfree(bpage);
135}
136
137/*
138 * We need to fit the time_stamp delta into 27 bits.
139 */
140static inline int test_time_stamp(u64 delta)
141{
142 if (delta & TS_DELTA_TEST)
143 return 1;
144 return 0;
145}
146
147#define BUF_PAGE_SIZE PAGE_SIZE
148
149/*
150 * head_page == tail_page && head == tail then buffer is empty.
151 */
152struct ring_buffer_per_cpu {
153 int cpu;
154 struct ring_buffer *buffer;
155 spinlock_t lock;
156 struct lock_class_key lock_key;
157 struct list_head pages;
158 struct buffer_page *head_page; /* read from head */
159 struct buffer_page *tail_page; /* write to tail */
160 struct buffer_page *commit_page; /* committed pages */
161 struct buffer_page *reader_page;
162 unsigned long overrun;
163 unsigned long entries;
164 u64 write_stamp;
165 u64 read_stamp;
166 atomic_t record_disabled;
167};
168
169struct ring_buffer {
170 unsigned long size;
171 unsigned pages;
172 unsigned flags;
173 int cpus;
174 cpumask_t cpumask;
175 atomic_t record_disabled;
176
177 struct mutex mutex;
178
179 struct ring_buffer_per_cpu **buffers;
180};
181
182struct ring_buffer_iter {
183 struct ring_buffer_per_cpu *cpu_buffer;
184 unsigned long head;
185 struct buffer_page *head_page;
186 u64 read_stamp;
187};
188
189#define RB_WARN_ON(buffer, cond) \
190 do { \
191 if (unlikely(cond)) { \
192 atomic_inc(&buffer->record_disabled); \
193 WARN_ON(1); \
194 } \
195 } while (0)
196
197#define RB_WARN_ON_RET(buffer, cond) \
198 do { \
199 if (unlikely(cond)) { \
200 atomic_inc(&buffer->record_disabled); \
201 WARN_ON(1); \
202 return -1; \
203 } \
204 } while (0)
205
206#define RB_WARN_ON_ONCE(buffer, cond) \
207 do { \
208 static int once; \
209 if (unlikely(cond) && !once) { \
210 once++; \
211 atomic_inc(&buffer->record_disabled); \
212 WARN_ON(1); \
213 } \
214 } while (0)
215
216/**
217 * rb_check_pages - integrity check of buffer pages
218 * @cpu_buffer: CPU buffer with pages to test
219 *
220 * As a safety measure we check to make sure the data pages have not
221 * been corrupted.
222 */
223static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer)
224{
225 struct list_head *head = &cpu_buffer->pages;
226 struct buffer_page *page, *tmp;
227
228 RB_WARN_ON_RET(cpu_buffer, head->next->prev != head);
229 RB_WARN_ON_RET(cpu_buffer, head->prev->next != head);
230
231 list_for_each_entry_safe(page, tmp, head, list) {
232 RB_WARN_ON_RET(cpu_buffer,
233 page->list.next->prev != &page->list);
234 RB_WARN_ON_RET(cpu_buffer,
235 page->list.prev->next != &page->list);
236 }
237
238 return 0;
239}
240
241static int rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
242 unsigned nr_pages)
243{
244 struct list_head *head = &cpu_buffer->pages;
245 struct buffer_page *page, *tmp;
246 unsigned long addr;
247 LIST_HEAD(pages);
248 unsigned i;
249
250 for (i = 0; i < nr_pages; i++) {
251 page = kzalloc_node(ALIGN(sizeof(*page), cache_line_size()),
252 GFP_KERNEL, cpu_to_node(cpu_buffer->cpu));
253 if (!page)
254 goto free_pages;
255 list_add(&page->list, &pages);
256
257 addr = __get_free_page(GFP_KERNEL);
258 if (!addr)
259 goto free_pages;
260 page->page = (void *)addr;
261 }
262
263 list_splice(&pages, head);
264
265 rb_check_pages(cpu_buffer);
266
267 return 0;
268
269 free_pages:
270 list_for_each_entry_safe(page, tmp, &pages, list) {
271 list_del_init(&page->list);
272 free_buffer_page(page);
273 }
274 return -ENOMEM;
275}
276
277static struct ring_buffer_per_cpu *
278rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
279{
280 struct ring_buffer_per_cpu *cpu_buffer;
281 struct buffer_page *page;
282 unsigned long addr;
283 int ret;
284
285 cpu_buffer = kzalloc_node(ALIGN(sizeof(*cpu_buffer), cache_line_size()),
286 GFP_KERNEL, cpu_to_node(cpu));
287 if (!cpu_buffer)
288 return NULL;
289
290 cpu_buffer->cpu = cpu;
291 cpu_buffer->buffer = buffer;
292 spin_lock_init(&cpu_buffer->lock);
293 INIT_LIST_HEAD(&cpu_buffer->pages);
294
295 page = kzalloc_node(ALIGN(sizeof(*page), cache_line_size()),
296 GFP_KERNEL, cpu_to_node(cpu));
297 if (!page)
298 goto fail_free_buffer;
299
300 cpu_buffer->reader_page = page;
301 addr = __get_free_page(GFP_KERNEL);
302 if (!addr)
303 goto fail_free_reader;
304 page->page = (void *)addr;
305
306 INIT_LIST_HEAD(&cpu_buffer->reader_page->list);
307
308 ret = rb_allocate_pages(cpu_buffer, buffer->pages);
309 if (ret < 0)
310 goto fail_free_reader;
311
312 cpu_buffer->head_page
313 = list_entry(cpu_buffer->pages.next, struct buffer_page, list);
314 cpu_buffer->tail_page = cpu_buffer->commit_page = cpu_buffer->head_page;
315
316 return cpu_buffer;
317
318 fail_free_reader:
319 free_buffer_page(cpu_buffer->reader_page);
320
321 fail_free_buffer:
322 kfree(cpu_buffer);
323 return NULL;
324}
325
326static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer)
327{
328 struct list_head *head = &cpu_buffer->pages;
329 struct buffer_page *page, *tmp;
330
331 list_del_init(&cpu_buffer->reader_page->list);
332 free_buffer_page(cpu_buffer->reader_page);
333
334 list_for_each_entry_safe(page, tmp, head, list) {
335 list_del_init(&page->list);
336 free_buffer_page(page);
337 }
338 kfree(cpu_buffer);
339}
340
341/*
342 * Causes compile errors if the struct buffer_page gets bigger
343 * than the struct page.
344 */
345extern int ring_buffer_page_too_big(void);
346
347/**
348 * ring_buffer_alloc - allocate a new ring_buffer
349 * @size: the size in bytes that is needed.
350 * @flags: attributes to set for the ring buffer.
351 *
352 * Currently the only flag that is available is the RB_FL_OVERWRITE
353 * flag. This flag means that the buffer will overwrite old data
354 * when the buffer wraps. If this flag is not set, the buffer will
355 * drop data when the tail hits the head.
356 */
357struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags)
358{
359 struct ring_buffer *buffer;
360 int bsize;
361 int cpu;
362
363 /* Paranoid! Optimizes out when all is well */
364 if (sizeof(struct buffer_page) > sizeof(struct page))
365 ring_buffer_page_too_big();
366
367
368 /* keep it in its own cache line */
369 buffer = kzalloc(ALIGN(sizeof(*buffer), cache_line_size()),
370 GFP_KERNEL);
371 if (!buffer)
372 return NULL;
373
374 buffer->pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
375 buffer->flags = flags;
376
377 /* need at least two pages */
378 if (buffer->pages == 1)
379 buffer->pages++;
380
381 buffer->cpumask = cpu_possible_map;
382 buffer->cpus = nr_cpu_ids;
383
384 bsize = sizeof(void *) * nr_cpu_ids;
385 buffer->buffers = kzalloc(ALIGN(bsize, cache_line_size()),
386 GFP_KERNEL);
387 if (!buffer->buffers)
388 goto fail_free_buffer;
389
390 for_each_buffer_cpu(buffer, cpu) {
391 buffer->buffers[cpu] =
392 rb_allocate_cpu_buffer(buffer, cpu);
393 if (!buffer->buffers[cpu])
394 goto fail_free_buffers;
395 }
396
397 mutex_init(&buffer->mutex);
398
399 return buffer;
400
401 fail_free_buffers:
402 for_each_buffer_cpu(buffer, cpu) {
403 if (buffer->buffers[cpu])
404 rb_free_cpu_buffer(buffer->buffers[cpu]);
405 }
406 kfree(buffer->buffers);
407
408 fail_free_buffer:
409 kfree(buffer);
410 return NULL;
411}
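[ Editorial sketch, not part of the patch: the RB_FL_OVERWRITE flag described above in use. The example_* names are hypothetical and the 1 MB size is arbitrary. ]

#include <linux/ring_buffer.h>
#include <linux/errno.h>

static struct ring_buffer *example_buffer;

static int example_setup(void)
{
        /* size is rounded up to whole pages; oldest data is overwritten when full */
        example_buffer = ring_buffer_alloc(1024 * 1024, RB_FL_OVERWRITE);
        if (!example_buffer)
                return -ENOMEM;
        return 0;
}

static void example_teardown(void)
{
        ring_buffer_free(example_buffer);
}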
412
413/**
414 * ring_buffer_free - free a ring buffer.
415 * @buffer: the buffer to free.
416 */
417void
418ring_buffer_free(struct ring_buffer *buffer)
419{
420 int cpu;
421
422 for_each_buffer_cpu(buffer, cpu)
423 rb_free_cpu_buffer(buffer->buffers[cpu]);
424
425 kfree(buffer);
426}
427
428static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer);
429
430static void
431rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages)
432{
433 struct buffer_page *page;
434 struct list_head *p;
435 unsigned i;
436
437 atomic_inc(&cpu_buffer->record_disabled);
438 synchronize_sched();
439
440 for (i = 0; i < nr_pages; i++) {
441 BUG_ON(list_empty(&cpu_buffer->pages));
442 p = cpu_buffer->pages.next;
443 page = list_entry(p, struct buffer_page, list);
444 list_del_init(&page->list);
445 free_buffer_page(page);
446 }
447 BUG_ON(list_empty(&cpu_buffer->pages));
448
449 rb_reset_cpu(cpu_buffer);
450
451 rb_check_pages(cpu_buffer);
452
453 atomic_dec(&cpu_buffer->record_disabled);
454
455}
456
457static void
458rb_insert_pages(struct ring_buffer_per_cpu *cpu_buffer,
459 struct list_head *pages, unsigned nr_pages)
460{
461 struct buffer_page *page;
462 struct list_head *p;
463 unsigned i;
464
465 atomic_inc(&cpu_buffer->record_disabled);
466 synchronize_sched();
467
468 for (i = 0; i < nr_pages; i++) {
469 BUG_ON(list_empty(pages));
470 p = pages->next;
471 page = list_entry(p, struct buffer_page, list);
472 list_del_init(&page->list);
473 list_add_tail(&page->list, &cpu_buffer->pages);
474 }
475 rb_reset_cpu(cpu_buffer);
476
477 rb_check_pages(cpu_buffer);
478
479 atomic_dec(&cpu_buffer->record_disabled);
480}
481
482/**
483 * ring_buffer_resize - resize the ring buffer
484 * @buffer: the buffer to resize.
485 * @size: the new size.
486 *
487 * The tracer is responsible for making sure that the buffer is
488 * not being used while changing the size.
489 * Note: We may be able to change the above requirement by using
490 * RCU synchronizations.
491 *
492 * Minimum size is 2 * BUF_PAGE_SIZE.
493 *
494 * Returns the new size on success, or -ENOMEM on failure.
495 */
496int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
497{
498 struct ring_buffer_per_cpu *cpu_buffer;
499 unsigned nr_pages, rm_pages, new_pages;
500 struct buffer_page *page, *tmp;
501 unsigned long buffer_size;
502 unsigned long addr;
503 LIST_HEAD(pages);
504 int i, cpu;
505
506 size = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
507 size *= BUF_PAGE_SIZE;
508 buffer_size = buffer->pages * BUF_PAGE_SIZE;
509
510 /* we need a minimum of two pages */
511 if (size < BUF_PAGE_SIZE * 2)
512 size = BUF_PAGE_SIZE * 2;
513
514 if (size == buffer_size)
515 return size;
516
517 mutex_lock(&buffer->mutex);
518
519 nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
520
521 if (size < buffer_size) {
522
523 /* easy case, just free pages */
524 BUG_ON(nr_pages >= buffer->pages);
525
526 rm_pages = buffer->pages - nr_pages;
527
528 for_each_buffer_cpu(buffer, cpu) {
529 cpu_buffer = buffer->buffers[cpu];
530 rb_remove_pages(cpu_buffer, rm_pages);
531 }
532 goto out;
533 }
534
535 /*
536 * This is a bit more difficult. We only want to add pages
537 * when we can allocate enough for all CPUs. We do this
538 * by allocating all the pages and storing them on a local
539 * linked list. If we succeed in our allocation, then we
540 * add these pages to the cpu_buffers. Otherwise we just free
541 * them all and return -ENOMEM;
542 */
543 BUG_ON(nr_pages <= buffer->pages);
544 new_pages = nr_pages - buffer->pages;
545
546 for_each_buffer_cpu(buffer, cpu) {
547 for (i = 0; i < new_pages; i++) {
548 page = kzalloc_node(ALIGN(sizeof(*page),
549 cache_line_size()),
550 GFP_KERNEL, cpu_to_node(cpu));
551 if (!page)
552 goto free_pages;
553 list_add(&page->list, &pages);
554 addr = __get_free_page(GFP_KERNEL);
555 if (!addr)
556 goto free_pages;
557 page->page = (void *)addr;
558 }
559 }
560
561 for_each_buffer_cpu(buffer, cpu) {
562 cpu_buffer = buffer->buffers[cpu];
563 rb_insert_pages(cpu_buffer, &pages, new_pages);
564 }
565
566 BUG_ON(!list_empty(&pages));
567
568 out:
569 buffer->pages = nr_pages;
570 mutex_unlock(&buffer->mutex);
571
572 return size;
573
574 free_pages:
575 list_for_each_entry_safe(page, tmp, &pages, list) {
576 list_del_init(&page->list);
577 free_buffer_page(page);
578 }
579 return -ENOMEM;
580}
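[ Editorial sketch, not part of the patch: the resize contract above, wrapped in a hypothetical example_resize() helper; the 2 MB target is arbitrary. The caller must ensure the buffer is idle, as the comment notes. ]

#include <linux/ring_buffer.h>

static int example_resize(struct ring_buffer *buffer)
{
        int ret;

        /* returns the adjusted size on success, -ENOMEM if allocation fails */
        ret = ring_buffer_resize(buffer, 2 * 1024 * 1024);
        return ret < 0 ? ret : 0;
}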
581
582static inline int rb_null_event(struct ring_buffer_event *event)
583{
584 return event->type == RINGBUF_TYPE_PADDING;
585}
586
587static inline void *__rb_page_index(struct buffer_page *page, unsigned index)
588{
589 return page->page + index;
590}
591
592static inline struct ring_buffer_event *
593rb_reader_event(struct ring_buffer_per_cpu *cpu_buffer)
594{
595 return __rb_page_index(cpu_buffer->reader_page,
596 cpu_buffer->reader_page->read);
597}
598
599static inline struct ring_buffer_event *
600rb_head_event(struct ring_buffer_per_cpu *cpu_buffer)
601{
602 return __rb_page_index(cpu_buffer->head_page,
603 cpu_buffer->head_page->read);
604}
605
606static inline struct ring_buffer_event *
607rb_iter_head_event(struct ring_buffer_iter *iter)
608{
609 return __rb_page_index(iter->head_page, iter->head);
610}
611
612static inline unsigned rb_page_write(struct buffer_page *bpage)
613{
614 return local_read(&bpage->write);
615}
616
617static inline unsigned rb_page_commit(struct buffer_page *bpage)
618{
619 return local_read(&bpage->commit);
620}
621
622/* Size is determined by what has been committed */
623static inline unsigned rb_page_size(struct buffer_page *bpage)
624{
625 return rb_page_commit(bpage);
626}
627
628static inline unsigned
629rb_commit_index(struct ring_buffer_per_cpu *cpu_buffer)
630{
631 return rb_page_commit(cpu_buffer->commit_page);
632}
633
634static inline unsigned rb_head_size(struct ring_buffer_per_cpu *cpu_buffer)
635{
636 return rb_page_commit(cpu_buffer->head_page);
637}
638
639/*
640 * When the tail hits the head and the buffer is in overwrite mode,
641 * the head jumps to the next page and all content on the previous
642 * page is discarded. But before doing so, we update the overrun
643 * variable of the buffer.
644 */
645static void rb_update_overflow(struct ring_buffer_per_cpu *cpu_buffer)
646{
647 struct ring_buffer_event *event;
648 unsigned long head;
649
650 for (head = 0; head < rb_head_size(cpu_buffer);
651 head += rb_event_length(event)) {
652
653 event = __rb_page_index(cpu_buffer->head_page, head);
654 BUG_ON(rb_null_event(event));
655 /* Only count data entries */
656 if (event->type != RINGBUF_TYPE_DATA)
657 continue;
658 cpu_buffer->overrun++;
659 cpu_buffer->entries--;
660 }
661}
662
663static inline void rb_inc_page(struct ring_buffer_per_cpu *cpu_buffer,
664 struct buffer_page **page)
665{
666 struct list_head *p = (*page)->list.next;
667
668 if (p == &cpu_buffer->pages)
669 p = p->next;
670
671 *page = list_entry(p, struct buffer_page, list);
672}
673
674static inline unsigned
675rb_event_index(struct ring_buffer_event *event)
676{
677 unsigned long addr = (unsigned long)event;
678
679 return (addr & ~PAGE_MASK) - (PAGE_SIZE - BUF_PAGE_SIZE);
680}
681
682static inline int
683rb_is_commit(struct ring_buffer_per_cpu *cpu_buffer,
684 struct ring_buffer_event *event)
685{
686 unsigned long addr = (unsigned long)event;
687 unsigned long index;
688
689 index = rb_event_index(event);
690 addr &= PAGE_MASK;
691
692 return cpu_buffer->commit_page->page == (void *)addr &&
693 rb_commit_index(cpu_buffer) == index;
694}
695
696static inline void
697rb_set_commit_event(struct ring_buffer_per_cpu *cpu_buffer,
698 struct ring_buffer_event *event)
699{
700 unsigned long addr = (unsigned long)event;
701 unsigned long index;
702
703 index = rb_event_index(event);
704 addr &= PAGE_MASK;
705
706 while (cpu_buffer->commit_page->page != (void *)addr) {
707 RB_WARN_ON(cpu_buffer,
708 cpu_buffer->commit_page == cpu_buffer->tail_page);
709 cpu_buffer->commit_page->commit =
710 cpu_buffer->commit_page->write;
711 rb_inc_page(cpu_buffer, &cpu_buffer->commit_page);
712 cpu_buffer->write_stamp = cpu_buffer->commit_page->time_stamp;
713 }
714
715 /* Now set the commit to the event's index */
716 local_set(&cpu_buffer->commit_page->commit, index);
717}
718
719static inline void
720rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
721{
722 /*
723 * We only race with interrupts and NMIs on this CPU.
724 * If we own the commit event, then we can commit
725 * all others that interrupted us, since the interruptions
726 * are in stack format (they finish before they come
727 * back to us). This allows us to do a simple loop to
728 * assign the commit to the tail.
729 */
730 while (cpu_buffer->commit_page != cpu_buffer->tail_page) {
731 cpu_buffer->commit_page->commit =
732 cpu_buffer->commit_page->write;
733 rb_inc_page(cpu_buffer, &cpu_buffer->commit_page);
734 cpu_buffer->write_stamp = cpu_buffer->commit_page->time_stamp;
735 /* add barrier to keep gcc from optimizing too much */
736 barrier();
737 }
738 while (rb_commit_index(cpu_buffer) !=
739 rb_page_write(cpu_buffer->commit_page)) {
740 cpu_buffer->commit_page->commit =
741 cpu_buffer->commit_page->write;
742 barrier();
743 }
744}
745
746static void rb_reset_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
747{
748 cpu_buffer->read_stamp = cpu_buffer->reader_page->time_stamp;
749 cpu_buffer->reader_page->read = 0;
750}
751
752static inline void rb_inc_iter(struct ring_buffer_iter *iter)
753{
754 struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
755
756 /*
757 * The iterator could be on the reader page (it starts there).
758 * But the head could have moved, since the reader was
759 * found. Check for this case and assign the iterator
760 * to the head page instead of next.
761 */
762 if (iter->head_page == cpu_buffer->reader_page)
763 iter->head_page = cpu_buffer->head_page;
764 else
765 rb_inc_page(cpu_buffer, &iter->head_page);
766
767 iter->read_stamp = iter->head_page->time_stamp;
768 iter->head = 0;
769}
770
771/**
772 * rb_update_event - update event type and data
773 * @event: the event to update
774 * @type: the type of event
775 * @length: the size of the event field in the ring buffer
776 *
777 * Update the type and data fields of the event. The length
778 * is the actual size that is written to the ring buffer,
779 * and with this, we can determine what to place into the
780 * data field.
781 */
782static inline void
783rb_update_event(struct ring_buffer_event *event,
784 unsigned type, unsigned length)
785{
786 event->type = type;
787
788 switch (type) {
789
790 case RINGBUF_TYPE_PADDING:
791 break;
792
793 case RINGBUF_TYPE_TIME_EXTEND:
794 event->len =
795 (RB_LEN_TIME_EXTEND + (RB_ALIGNMENT-1))
796 >> RB_ALIGNMENT_SHIFT;
797 break;
798
799 case RINGBUF_TYPE_TIME_STAMP:
800 event->len =
801 (RB_LEN_TIME_STAMP + (RB_ALIGNMENT-1))
802 >> RB_ALIGNMENT_SHIFT;
803 break;
804
805 case RINGBUF_TYPE_DATA:
806 length -= RB_EVNT_HDR_SIZE;
807 if (length > RB_MAX_SMALL_DATA) {
808 event->len = 0;
809 event->array[0] = length;
810 } else
811 event->len =
812 (length + (RB_ALIGNMENT-1))
813 >> RB_ALIGNMENT_SHIFT;
814 break;
815 default:
816 BUG();
817 }
818}
819
820static inline unsigned rb_calculate_event_length(unsigned length)
821{
822 struct ring_buffer_event event; /* Used only for sizeof array */
823
824 /* zero length can cause confusions */
825 if (!length)
826 length = 1;
827
828 if (length > RB_MAX_SMALL_DATA)
829 length += sizeof(event.array[0]);
830
831 length += RB_EVNT_HDR_SIZE;
832 length = ALIGN(length, RB_ALIGNMENT);
833
834 return length;
835}
836
837static struct ring_buffer_event *
838__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
839 unsigned type, unsigned long length, u64 *ts)
840{
841 struct buffer_page *tail_page, *head_page, *reader_page;
842 unsigned long tail, write;
843 struct ring_buffer *buffer = cpu_buffer->buffer;
844 struct ring_buffer_event *event;
845 unsigned long flags;
846
847 tail_page = cpu_buffer->tail_page;
848 write = local_add_return(length, &tail_page->write);
849 tail = write - length;
850
851 /* See if we shot past the end of this buffer page */
852 if (write > BUF_PAGE_SIZE) {
853 struct buffer_page *next_page = tail_page;
854
855 spin_lock_irqsave(&cpu_buffer->lock, flags);
856
857 rb_inc_page(cpu_buffer, &next_page);
858
859 head_page = cpu_buffer->head_page;
860 reader_page = cpu_buffer->reader_page;
861
862 /* we grabbed the lock before incrementing */
863 RB_WARN_ON(cpu_buffer, next_page == reader_page);
864
865 /*
866 * If for some reason, we had an interrupt storm that made
867 * it all the way around the buffer, bail, and warn
868 * about it.
869 */
870 if (unlikely(next_page == cpu_buffer->commit_page)) {
871 WARN_ON_ONCE(1);
872 goto out_unlock;
873 }
874
875 if (next_page == head_page) {
876 if (!(buffer->flags & RB_FL_OVERWRITE)) {
877 /* reset write */
878 if (tail <= BUF_PAGE_SIZE)
879 local_set(&tail_page->write, tail);
880 goto out_unlock;
881 }
882
883 /* tail_page has not moved yet? */
884 if (tail_page == cpu_buffer->tail_page) {
885 /* count overflows */
886 rb_update_overflow(cpu_buffer);
887
888 rb_inc_page(cpu_buffer, &head_page);
889 cpu_buffer->head_page = head_page;
890 cpu_buffer->head_page->read = 0;
891 }
892 }
893
894 /*
895 * If the tail page is still the same as what we think
896 * it is, then it is up to us to update the tail
897 * pointer.
898 */
899 if (tail_page == cpu_buffer->tail_page) {
900 local_set(&next_page->write, 0);
901 local_set(&next_page->commit, 0);
902 cpu_buffer->tail_page = next_page;
903
904 /* reread the time stamp */
905 *ts = ring_buffer_time_stamp(cpu_buffer->cpu);
906 cpu_buffer->tail_page->time_stamp = *ts;
907 }
908
909 /*
910 * The actual tail page has moved forward.
911 */
912 if (tail < BUF_PAGE_SIZE) {
913 /* Mark the rest of the page with padding */
914 event = __rb_page_index(tail_page, tail);
915 event->type = RINGBUF_TYPE_PADDING;
916 }
917
918 if (tail <= BUF_PAGE_SIZE)
919 /* Set the write back to the previous setting */
920 local_set(&tail_page->write, tail);
921
922 /*
923 * If this was a commit entry that failed,
924 * increment that too
925 */
926 if (tail_page == cpu_buffer->commit_page &&
927 tail == rb_commit_index(cpu_buffer)) {
928 rb_set_commit_to_write(cpu_buffer);
929 }
930
931 spin_unlock_irqrestore(&cpu_buffer->lock, flags);
932
933 /* fail and let the caller try again */
934 return ERR_PTR(-EAGAIN);
935 }
936
937 /* We reserved something on the buffer */
938
939 BUG_ON(write > BUF_PAGE_SIZE);
940
941 event = __rb_page_index(tail_page, tail);
942 rb_update_event(event, type, length);
943
944 /*
945 * If this is a commit and the tail is zero, then update
946 * this page's time stamp.
947 */
948 if (!tail && rb_is_commit(cpu_buffer, event))
949 cpu_buffer->commit_page->time_stamp = *ts;
950
951 return event;
952
953 out_unlock:
954 spin_unlock_irqrestore(&cpu_buffer->lock, flags);
955 return NULL;
956}
957
958static int
959rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer,
960 u64 *ts, u64 *delta)
961{
962 struct ring_buffer_event *event;
963 static int once;
964 int ret;
965
966 if (unlikely(*delta > (1ULL << 59) && !once++)) {
967 printk(KERN_WARNING "Delta way too big! %llu"
968 " ts=%llu write stamp = %llu\n",
969 *delta, *ts, cpu_buffer->write_stamp);
970 WARN_ON(1);
971 }
972
973 /*
974 * The delta is too big, we need to add a
975 * new timestamp.
976 */
977 event = __rb_reserve_next(cpu_buffer,
978 RINGBUF_TYPE_TIME_EXTEND,
979 RB_LEN_TIME_EXTEND,
980 ts);
981 if (!event)
982 return -EBUSY;
983
984 if (PTR_ERR(event) == -EAGAIN)
985 return -EAGAIN;
986
987 /* Only a committed time event can update the write stamp */
988 if (rb_is_commit(cpu_buffer, event)) {
989 /*
990 * If this is the first on the page, then we need to
991 * update the page itself, and just put in a zero.
992 */
993 if (rb_event_index(event)) {
994 event->time_delta = *delta & TS_MASK;
995 event->array[0] = *delta >> TS_SHIFT;
996 } else {
997 cpu_buffer->commit_page->time_stamp = *ts;
998 event->time_delta = 0;
999 event->array[0] = 0;
1000 }
1001 cpu_buffer->write_stamp = *ts;
1002 /* let the caller know this was the commit */
1003 ret = 1;
1004 } else {
1005 /* Darn, this is just wasted space */
1006 event->time_delta = 0;
1007 event->array[0] = 0;
1008 ret = 0;
1009 }
1010
1011 *delta = 0;
1012
1013 return ret;
1014}
1015
1016static struct ring_buffer_event *
1017rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
1018 unsigned type, unsigned long length)
1019{
1020 struct ring_buffer_event *event;
1021 u64 ts, delta;
1022 int commit = 0;
1023
1024 again:
1025 ts = ring_buffer_time_stamp(cpu_buffer->cpu);
1026
1027 /*
1028 * Only the first commit can update the timestamp.
1029 * Yes there is a race here. If an interrupt comes in
1030 * just after the conditional and it traces too, then it
1031 * will also check the deltas. More than one timestamp may
1032 * also be made. But only the entry that did the actual
1033 * commit will be something other than zero.
1034 */
1035 if (cpu_buffer->tail_page == cpu_buffer->commit_page &&
1036 rb_page_write(cpu_buffer->tail_page) ==
1037 rb_commit_index(cpu_buffer)) {
1038
1039 delta = ts - cpu_buffer->write_stamp;
1040
1041 /* make sure this delta is calculated here */
1042 barrier();
1043
1044 /* Did the write stamp get updated already? */
1045 if (unlikely(ts < cpu_buffer->write_stamp))
1046 goto again;
1047
1048 if (test_time_stamp(delta)) {
1049
1050 commit = rb_add_time_stamp(cpu_buffer, &ts, &delta);
1051
1052 if (commit == -EBUSY)
1053 return NULL;
1054
1055 if (commit == -EAGAIN)
1056 goto again;
1057
1058 RB_WARN_ON(cpu_buffer, commit < 0);
1059 }
1060 } else
1061 /* Non commits have zero deltas */
1062 delta = 0;
1063
1064 event = __rb_reserve_next(cpu_buffer, type, length, &ts);
1065 if (PTR_ERR(event) == -EAGAIN)
1066 goto again;
1067
1068 if (!event) {
1069 if (unlikely(commit))
1070 /*
1071 * Ouch! We needed a timestamp and it was committed. But
1072 * we didn't get our event reserved.
1073 */
1074 rb_set_commit_to_write(cpu_buffer);
1075 return NULL;
1076 }
1077
1078 /*
1079 * If the timestamp was committed, make the commit our entry
1080 * now so that we will update it when needed.
1081 */
1082 if (commit)
1083 rb_set_commit_event(cpu_buffer, event);
1084 else if (!rb_is_commit(cpu_buffer, event))
1085 delta = 0;
1086
1087 event->time_delta = delta;
1088
1089 return event;
1090}
1091
1092static DEFINE_PER_CPU(int, rb_need_resched);
1093
1094/**
1095 * ring_buffer_lock_reserve - reserve a part of the buffer
1096 * @buffer: the ring buffer to reserve from
1097 * @length: the length of the data to reserve (excluding event header)
1098 * @flags: a pointer to save the interrupt flags
1099 *
1100 * Returns a reserved event on the ring buffer to copy directly to.
1101 * The user of this interface will need to get the body to write into
1102 * and can use the ring_buffer_event_data() interface.
1103 *
1104 * The length is the length of the data needed, not the event length
1105 * which also includes the event header.
1106 *
1107 * Must be paired with ring_buffer_unlock_commit, unless NULL is returned.
1108 * If NULL is returned, then nothing has been allocated or locked.
1109 */
1110struct ring_buffer_event *
1111ring_buffer_lock_reserve(struct ring_buffer *buffer,
1112 unsigned long length,
1113 unsigned long *flags)
1114{
1115 struct ring_buffer_per_cpu *cpu_buffer;
1116 struct ring_buffer_event *event;
1117 int cpu, resched;
1118
1119 if (atomic_read(&buffer->record_disabled))
1120 return NULL;
1121
1122 /* If we are tracing schedule, we don't want to recurse */
1123 resched = need_resched();
1124 preempt_disable_notrace();
1125
1126 cpu = raw_smp_processor_id();
1127
1128 if (!cpu_isset(cpu, buffer->cpumask))
1129 goto out;
1130
1131 cpu_buffer = buffer->buffers[cpu];
1132
1133 if (atomic_read(&cpu_buffer->record_disabled))
1134 goto out;
1135
1136 length = rb_calculate_event_length(length);
1137 if (length > BUF_PAGE_SIZE)
1138 goto out;
1139
1140 event = rb_reserve_next_event(cpu_buffer, RINGBUF_TYPE_DATA, length);
1141 if (!event)
1142 goto out;
1143
1144 /*
1145 * Need to store resched state on this cpu.
1146 * Only the first needs to.
1147 */
1148
1149 if (preempt_count() == 1)
1150 per_cpu(rb_need_resched, cpu) = resched;
1151
1152 return event;
1153
1154 out:
1155 if (resched)
1156 preempt_enable_no_resched_notrace();
1157 else
1158 preempt_enable_notrace();
1159 return NULL;
1160}
1161
1162static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer,
1163 struct ring_buffer_event *event)
1164{
1165 cpu_buffer->entries++;
1166
1167 /* Only process further if we own the commit */
1168 if (!rb_is_commit(cpu_buffer, event))
1169 return;
1170
1171 cpu_buffer->write_stamp += event->time_delta;
1172
1173 rb_set_commit_to_write(cpu_buffer);
1174}
1175
1176/**
1177 * ring_buffer_unlock_commit - commit a reserved event
1178 * @buffer: The buffer to commit to
1179 * @event: The event pointer to commit.
1180 * @flags: the interrupt flags received from ring_buffer_lock_reserve.
1181 *
1182 * This commits the data to the ring buffer, and releases any locks held.
1183 *
1184 * Must be paired with ring_buffer_lock_reserve.
1185 */
1186int ring_buffer_unlock_commit(struct ring_buffer *buffer,
1187 struct ring_buffer_event *event,
1188 unsigned long flags)
1189{
1190 struct ring_buffer_per_cpu *cpu_buffer;
1191 int cpu = raw_smp_processor_id();
1192
1193 cpu_buffer = buffer->buffers[cpu];
1194
1195 rb_commit(cpu_buffer, event);
1196
1197 /*
1198 * Only the last preempt count needs to restore preemption.
1199 */
1200 if (preempt_count() == 1) {
1201 if (per_cpu(rb_need_resched, cpu))
1202 preempt_enable_no_resched_notrace();
1203 else
1204 preempt_enable_notrace();
1205 } else
1206 preempt_enable_no_resched_notrace();
1207
1208 return 0;
1209}
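[ Editorial sketch, not part of the patch: the reserve/commit pairing documented above. struct example_entry and example_record() are hypothetical. ]

#include <linux/ring_buffer.h>

struct example_entry {
        unsigned long ip;               /* hypothetical payload */
};

static void example_record(struct ring_buffer *buffer, unsigned long ip)
{
        struct ring_buffer_event *event;
        struct example_entry *entry;
        unsigned long irq_flags;

        event = ring_buffer_lock_reserve(buffer, sizeof(*entry), &irq_flags);
        if (!event)
                return;                 /* recording disabled or no space */

        entry = ring_buffer_event_data(event);
        entry->ip = ip;

        ring_buffer_unlock_commit(buffer, event, irq_flags);
}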
1210
1211/**
1212 * ring_buffer_write - write data to the buffer without reserving
1213 * @buffer: The ring buffer to write to.
1214 * @length: The length of the data being written (excluding the event header)
1215 * @data: The data to write to the buffer.
1216 *
1217 * This is like ring_buffer_lock_reserve and ring_buffer_unlock_commit as
1218 * one function. If you already have the data to write to the buffer, it
1219 * may be easier to simply call this function.
1220 *
1221 * Note, like ring_buffer_lock_reserve, the length is the length of the data
1222 * and not the length of the event which would hold the header.
1223 */
1224int ring_buffer_write(struct ring_buffer *buffer,
1225 unsigned long length,
1226 void *data)
1227{
1228 struct ring_buffer_per_cpu *cpu_buffer;
1229 struct ring_buffer_event *event;
1230 unsigned long event_length;
1231 void *body;
1232 int ret = -EBUSY;
1233 int cpu, resched;
1234
1235 if (atomic_read(&buffer->record_disabled))
1236 return -EBUSY;
1237
1238 resched = need_resched();
1239 preempt_disable_notrace();
1240
1241 cpu = raw_smp_processor_id();
1242
1243 if (!cpu_isset(cpu, buffer->cpumask))
1244 goto out;
1245
1246 cpu_buffer = buffer->buffers[cpu];
1247
1248 if (atomic_read(&cpu_buffer->record_disabled))
1249 goto out;
1250
1251 event_length = rb_calculate_event_length(length);
1252 event = rb_reserve_next_event(cpu_buffer,
1253 RINGBUF_TYPE_DATA, event_length);
1254 if (!event)
1255 goto out;
1256
1257 body = rb_event_data(event);
1258
1259 memcpy(body, data, length);
1260
1261 rb_commit(cpu_buffer, event);
1262
1263 ret = 0;
1264 out:
1265 if (resched)
1266 preempt_enable_no_resched_notrace();
1267 else
1268 preempt_enable_notrace();
1269
1270 return ret;
1271}
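[ Editorial sketch, not part of the patch: the one-call variant for data that already exists, per the comment above; example_write() is hypothetical. ]

#include <linux/ring_buffer.h>
#include <linux/types.h>

static int example_write(struct ring_buffer *buffer, u64 value)
{
        /* reserve, copy and commit in one step; returns -EBUSY if recording is off */
        return ring_buffer_write(buffer, sizeof(value), &value);
}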
1272
1273static inline int rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer)
1274{
1275 struct buffer_page *reader = cpu_buffer->reader_page;
1276 struct buffer_page *head = cpu_buffer->head_page;
1277 struct buffer_page *commit = cpu_buffer->commit_page;
1278
1279 return reader->read == rb_page_commit(reader) &&
1280 (commit == reader ||
1281 (commit == head &&
1282 head->read == rb_page_commit(commit)));
1283}
1284
1285/**
1286 * ring_buffer_record_disable - stop all writes into the buffer
1287 * @buffer: The ring buffer to stop writes to.
1288 *
1289 * This prevents all writes to the buffer. Any attempt to write
1290 * to the buffer after this will fail and return NULL.
1291 *
1292 * The caller should call synchronize_sched() after this.
1293 */
1294void ring_buffer_record_disable(struct ring_buffer *buffer)
1295{
1296 atomic_inc(&buffer->record_disabled);
1297}
1298
1299/**
1300 * ring_buffer_record_enable - enable writes to the buffer
1301 * @buffer: The ring buffer to enable writes
1302 *
1303 * Note, multiple disables will need the same number of enables
1304 * to truely enable the writing (much like preempt_disable).
1305 */
1306void ring_buffer_record_enable(struct ring_buffer *buffer)
1307{
1308 atomic_dec(&buffer->record_disabled);
1309}
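[ Editorial sketch, not part of the patch: quiescing writers before a direct read, following the synchronize_sched() hint in the comments above; example_quiesce() is hypothetical. ]

#include <linux/ring_buffer.h>
#include <linux/rcupdate.h>

static void example_quiesce(struct ring_buffer *buffer)
{
        ring_buffer_record_disable(buffer);
        synchronize_sched();            /* wait for writers already in flight */

        /* ... inspect the buffer here ... */

        ring_buffer_record_enable(buffer);
}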
1310
1311/**
1312 * ring_buffer_record_disable_cpu - stop all writes into the cpu_buffer
1313 * @buffer: The ring buffer to stop writes to.
1314 * @cpu: The CPU buffer to stop
1315 *
1316 * This prevents all writes to the buffer. Any attempt to write
1317 * to the buffer after this will fail and return NULL.
1318 *
1319 * The caller should call synchronize_sched() after this.
1320 */
1321void ring_buffer_record_disable_cpu(struct ring_buffer *buffer, int cpu)
1322{
1323 struct ring_buffer_per_cpu *cpu_buffer;
1324
1325 if (!cpu_isset(cpu, buffer->cpumask))
1326 return;
1327
1328 cpu_buffer = buffer->buffers[cpu];
1329 atomic_inc(&cpu_buffer->record_disabled);
1330}
1331
1332/**
1333 * ring_buffer_record_enable_cpu - enable writes to the buffer
1334 * @buffer: The ring buffer to enable writes
1335 * @cpu: The CPU to enable.
1336 *
1337 * Note, multiple disables will need the same number of enables
1338 * to truly enable the writing (much like preempt_disable).
1339 */
1340void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu)
1341{
1342 struct ring_buffer_per_cpu *cpu_buffer;
1343
1344 if (!cpu_isset(cpu, buffer->cpumask))
1345 return;
1346
1347 cpu_buffer = buffer->buffers[cpu];
1348 atomic_dec(&cpu_buffer->record_disabled);
1349}
1350
1351/**
1352 * ring_buffer_entries_cpu - get the number of entries in a cpu buffer
1353 * @buffer: The ring buffer
1354 * @cpu: The per CPU buffer to get the entries from.
1355 */
1356unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu)
1357{
1358 struct ring_buffer_per_cpu *cpu_buffer;
1359
1360 if (!cpu_isset(cpu, buffer->cpumask))
1361 return 0;
1362
1363 cpu_buffer = buffer->buffers[cpu];
1364 return cpu_buffer->entries;
1365}
1366
1367/**
1368 * ring_buffer_overrun_cpu - get the number of overruns in a cpu_buffer
1369 * @buffer: The ring buffer
1370 * @cpu: The per CPU buffer to get the number of overruns from
1371 */
1372unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu)
1373{
1374 struct ring_buffer_per_cpu *cpu_buffer;
1375
1376 if (!cpu_isset(cpu, buffer->cpumask))
1377 return 0;
1378
1379 cpu_buffer = buffer->buffers[cpu];
1380 return cpu_buffer->overrun;
1381}
1382
1383/**
1384 * ring_buffer_entries - get the number of entries in a buffer
1385 * @buffer: The ring buffer
1386 *
1387 * Returns the total number of entries in the ring buffer
1388 * (all CPU entries)
1389 */
1390unsigned long ring_buffer_entries(struct ring_buffer *buffer)
1391{
1392 struct ring_buffer_per_cpu *cpu_buffer;
1393 unsigned long entries = 0;
1394 int cpu;
1395
1396 /* if you care about this being correct, lock the buffer */
1397 for_each_buffer_cpu(buffer, cpu) {
1398 cpu_buffer = buffer->buffers[cpu];
1399 entries += cpu_buffer->entries;
1400 }
1401
1402 return entries;
1403}
1404
1405/**
1406 * ring_buffer_overruns - get the total number of overruns in the buffer
1407 * @buffer: The ring buffer
1408 *
1409 * Returns the total number of overruns in the ring buffer
1410 * (all CPU entries)
1411 */
1412unsigned long ring_buffer_overruns(struct ring_buffer *buffer)
1413{
1414 struct ring_buffer_per_cpu *cpu_buffer;
1415 unsigned long overruns = 0;
1416 int cpu;
1417
1418 /* if you care about this being correct, lock the buffer */
1419 for_each_buffer_cpu(buffer, cpu) {
1420 cpu_buffer = buffer->buffers[cpu];
1421 overruns += cpu_buffer->overrun;
1422 }
1423
1424 return overruns;
1425}
1426
1427/**
1428 * ring_buffer_iter_reset - reset an iterator
1429 * @iter: The iterator to reset
1430 *
1431 * Resets the iterator, so that it will start from the beginning
1432 * again.
1433 */
1434void ring_buffer_iter_reset(struct ring_buffer_iter *iter)
1435{
1436 struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
1437
1438 /* Iterator usage is expected to have record disabled */
1439 if (list_empty(&cpu_buffer->reader_page->list)) {
1440 iter->head_page = cpu_buffer->head_page;
1441 iter->head = cpu_buffer->head_page->read;
1442 } else {
1443 iter->head_page = cpu_buffer->reader_page;
1444 iter->head = cpu_buffer->reader_page->read;
1445 }
1446 if (iter->head)
1447 iter->read_stamp = cpu_buffer->read_stamp;
1448 else
1449 iter->read_stamp = iter->head_page->time_stamp;
1450}
1451
1452/**
1453 * ring_buffer_iter_empty - check if an iterator has no more to read
1454 * @iter: The iterator to check
1455 */
1456int ring_buffer_iter_empty(struct ring_buffer_iter *iter)
1457{
1458 struct ring_buffer_per_cpu *cpu_buffer;
1459
1460 cpu_buffer = iter->cpu_buffer;
1461
1462 return iter->head_page == cpu_buffer->commit_page &&
1463 iter->head == rb_commit_index(cpu_buffer);
1464}
1465
1466static void
1467rb_update_read_stamp(struct ring_buffer_per_cpu *cpu_buffer,
1468 struct ring_buffer_event *event)
1469{
1470 u64 delta;
1471
1472 switch (event->type) {
1473 case RINGBUF_TYPE_PADDING:
1474 return;
1475
1476 case RINGBUF_TYPE_TIME_EXTEND:
1477 delta = event->array[0];
1478 delta <<= TS_SHIFT;
1479 delta += event->time_delta;
1480 cpu_buffer->read_stamp += delta;
1481 return;
1482
1483 case RINGBUF_TYPE_TIME_STAMP:
1484 /* FIXME: not implemented */
1485 return;
1486
1487 case RINGBUF_TYPE_DATA:
1488 cpu_buffer->read_stamp += event->time_delta;
1489 return;
1490
1491 default:
1492 BUG();
1493 }
1494 return;
1495}
1496
1497static void
1498rb_update_iter_read_stamp(struct ring_buffer_iter *iter,
1499 struct ring_buffer_event *event)
1500{
1501 u64 delta;
1502
1503 switch (event->type) {
1504 case RINGBUF_TYPE_PADDING:
1505 return;
1506
1507 case RINGBUF_TYPE_TIME_EXTEND:
1508 delta = event->array[0];
1509 delta <<= TS_SHIFT;
1510 delta += event->time_delta;
1511 iter->read_stamp += delta;
1512 return;
1513
1514 case RINGBUF_TYPE_TIME_STAMP:
1515 /* FIXME: not implemented */
1516 return;
1517
1518 case RINGBUF_TYPE_DATA:
1519 iter->read_stamp += event->time_delta;
1520 return;
1521
1522 default:
1523 BUG();
1524 }
1525 return;
1526}
1527
1528static struct buffer_page *
1529rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
1530{
1531 struct buffer_page *reader = NULL;
1532 unsigned long flags;
1533
1534 spin_lock_irqsave(&cpu_buffer->lock, flags);
1535
1536 again:
1537 reader = cpu_buffer->reader_page;
1538
1539 /* If there's more to read, return this page */
1540 if (cpu_buffer->reader_page->read < rb_page_size(reader))
1541 goto out;
1542
1543 /* Never should we have an index greater than the size */
1544 RB_WARN_ON(cpu_buffer,
1545 cpu_buffer->reader_page->read > rb_page_size(reader));
1546
1547 /* check if we caught up to the tail */
1548 reader = NULL;
1549 if (cpu_buffer->commit_page == cpu_buffer->reader_page)
1550 goto out;
1551
1552 /*
1553 * Splice the empty reader page into the list around the head.
1554 * Reset the reader page to size zero.
1555 */
1556
1557 reader = cpu_buffer->head_page;
1558 cpu_buffer->reader_page->list.next = reader->list.next;
1559 cpu_buffer->reader_page->list.prev = reader->list.prev;
1560
1561 local_set(&cpu_buffer->reader_page->write, 0);
1562 local_set(&cpu_buffer->reader_page->commit, 0);
1563
1564 /* Make the reader page now replace the head */
1565 reader->list.prev->next = &cpu_buffer->reader_page->list;
1566 reader->list.next->prev = &cpu_buffer->reader_page->list;
1567
1568 /*
1569 * If the tail is on the reader, then we must set the head
1570 * to the inserted page, otherwise we set it one before.
1571 */
1572 cpu_buffer->head_page = cpu_buffer->reader_page;
1573
1574 if (cpu_buffer->commit_page != reader)
1575 rb_inc_page(cpu_buffer, &cpu_buffer->head_page);
1576
1577 /* Finally update the reader page to the new head */
1578 cpu_buffer->reader_page = reader;
1579 rb_reset_reader_page(cpu_buffer);
1580
1581 goto again;
1582
1583 out:
1584 spin_unlock_irqrestore(&cpu_buffer->lock, flags);
1585
1586 return reader;
1587}
1588
1589static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer)
1590{
1591 struct ring_buffer_event *event;
1592 struct buffer_page *reader;
1593 unsigned length;
1594
1595 reader = rb_get_reader_page(cpu_buffer);
1596
1597 /* This function should not be called when buffer is empty */
1598 BUG_ON(!reader);
1599
1600 event = rb_reader_event(cpu_buffer);
1601
1602 if (event->type == RINGBUF_TYPE_DATA)
1603 cpu_buffer->entries--;
1604
1605 rb_update_read_stamp(cpu_buffer, event);
1606
1607 length = rb_event_length(event);
1608 cpu_buffer->reader_page->read += length;
1609}
1610
1611static void rb_advance_iter(struct ring_buffer_iter *iter)
1612{
1613 struct ring_buffer *buffer;
1614 struct ring_buffer_per_cpu *cpu_buffer;
1615 struct ring_buffer_event *event;
1616 unsigned length;
1617
1618 cpu_buffer = iter->cpu_buffer;
1619 buffer = cpu_buffer->buffer;
1620
1621 /*
1622 * Check if we are at the end of the buffer.
1623 */
1624 if (iter->head >= rb_page_size(iter->head_page)) {
1625 BUG_ON(iter->head_page == cpu_buffer->commit_page);
1626 rb_inc_iter(iter);
1627 return;
1628 }
1629
1630 event = rb_iter_head_event(iter);
1631
1632 length = rb_event_length(event);
1633
1634 /*
1635 * This should not be called to advance the header if we are
1636 * at the tail of the buffer.
1637 */
1638 BUG_ON((iter->head_page == cpu_buffer->commit_page) &&
1639 (iter->head + length > rb_commit_index(cpu_buffer)));
1640
1641 rb_update_iter_read_stamp(iter, event);
1642
1643 iter->head += length;
1644
1645 /* check for end of page padding */
1646 if ((iter->head >= rb_page_size(iter->head_page)) &&
1647 (iter->head_page != cpu_buffer->commit_page))
1648 rb_advance_iter(iter);
1649}
1650
1651/**
1652 * ring_buffer_peek - peek at the next event to be read
1653 * @buffer: The ring buffer to read
1654 * @cpu: The cpu to peek at
1655 * @ts: The timestamp counter of this event.
1656 *
1657 * This will return the event that will be read next, but does
1658 * not consume the data.
1659 */
1660struct ring_buffer_event *
1661ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
1662{
1663 struct ring_buffer_per_cpu *cpu_buffer;
1664 struct ring_buffer_event *event;
1665 struct buffer_page *reader;
1666
1667 if (!cpu_isset(cpu, buffer->cpumask))
1668 return NULL;
1669
1670 cpu_buffer = buffer->buffers[cpu];
1671
1672 again:
1673 reader = rb_get_reader_page(cpu_buffer);
1674 if (!reader)
1675 return NULL;
1676
1677 event = rb_reader_event(cpu_buffer);
1678
1679 switch (event->type) {
1680 case RINGBUF_TYPE_PADDING:
1681 RB_WARN_ON(cpu_buffer, 1);
1682 rb_advance_reader(cpu_buffer);
1683 return NULL;
1684
1685 case RINGBUF_TYPE_TIME_EXTEND:
1686 /* Internal data, OK to advance */
1687 rb_advance_reader(cpu_buffer);
1688 goto again;
1689
1690 case RINGBUF_TYPE_TIME_STAMP:
1691 /* FIXME: not implemented */
1692 rb_advance_reader(cpu_buffer);
1693 goto again;
1694
1695 case RINGBUF_TYPE_DATA:
1696 if (ts) {
1697 *ts = cpu_buffer->read_stamp + event->time_delta;
1698 ring_buffer_normalize_time_stamp(cpu_buffer->cpu, ts);
1699 }
1700 return event;
1701
1702 default:
1703 BUG();
1704 }
1705
1706 return NULL;
1707}
1708
1709/**
1710 * ring_buffer_iter_peek - peek at the next event to be read
1711 * @iter: The ring buffer iterator
1712 * @ts: The timestamp counter of this event.
1713 *
1714 * This will return the event that will be read next, but does
1715 * not increment the iterator.
1716 */
1717struct ring_buffer_event *
1718ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
1719{
1720 struct ring_buffer *buffer;
1721 struct ring_buffer_per_cpu *cpu_buffer;
1722 struct ring_buffer_event *event;
1723
1724 if (ring_buffer_iter_empty(iter))
1725 return NULL;
1726
1727 cpu_buffer = iter->cpu_buffer;
1728 buffer = cpu_buffer->buffer;
1729
1730 again:
1731 if (rb_per_cpu_empty(cpu_buffer))
1732 return NULL;
1733
1734 event = rb_iter_head_event(iter);
1735
1736 switch (event->type) {
1737 case RINGBUF_TYPE_PADDING:
1738 rb_inc_iter(iter);
1739 goto again;
1740
1741 case RINGBUF_TYPE_TIME_EXTEND:
1742 /* Internal data, OK to advance */
1743 rb_advance_iter(iter);
1744 goto again;
1745
1746 case RINGBUF_TYPE_TIME_STAMP:
1747 /* FIXME: not implemented */
1748 rb_advance_iter(iter);
1749 goto again;
1750
1751 case RINGBUF_TYPE_DATA:
1752 if (ts) {
1753 *ts = iter->read_stamp + event->time_delta;
1754 ring_buffer_normalize_time_stamp(cpu_buffer->cpu, ts);
1755 }
1756 return event;
1757
1758 default:
1759 BUG();
1760 }
1761
1762 return NULL;
1763}
1764
1765/**
1766 * ring_buffer_consume - return an event and consume it
1767 * @buffer: The ring buffer to get the next event from
1768 *
1769 * Returns the next event in the ring buffer, and that event is consumed.
1770 * Meaning that sequential reads will keep returning a different event,
1771 * and eventually empty the ring buffer if the producer is slower.
1772 */
1773struct ring_buffer_event *
1774ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)
1775{
1776 struct ring_buffer_per_cpu *cpu_buffer;
1777 struct ring_buffer_event *event;
1778
1779 if (!cpu_isset(cpu, buffer->cpumask))
1780 return NULL;
1781
1782 event = ring_buffer_peek(buffer, cpu, ts);
1783 if (!event)
1784 return NULL;
1785
1786 cpu_buffer = buffer->buffers[cpu];
1787 rb_advance_reader(cpu_buffer);
1788
1789 return event;
1790}
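[ Editorial sketch, not part of the patch: a consuming read loop over one CPU, using the accessors defined earlier in this file; example_drain() is hypothetical. ]

#include <linux/ring_buffer.h>
#include <linux/kernel.h>

static void example_drain(struct ring_buffer *buffer, int cpu)
{
        struct ring_buffer_event *event;
        u64 ts;

        while ((event = ring_buffer_consume(buffer, cpu, &ts))) {
                void *body = ring_buffer_event_data(event);

                /* payload interpretation is up to the producer/consumer pair */
                pr_info("consumed %u bytes at %llu\n",
                        ring_buffer_event_length(event),
                        (unsigned long long)ts);
                (void)body;
        }
}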
1791
1792/**
1793 * ring_buffer_read_start - start a non consuming read of the buffer
1794 * @buffer: The ring buffer to read from
1795 * @cpu: The cpu buffer to iterate over
1796 *
1797 * This starts up an iteration through the buffer. It also disables
1798 * the recording to the buffer until the reading is finished.
1799 * This prevents the reading from being corrupted. This is not
1800 * a consuming read, so a producer is not expected.
1801 *
1802 * Must be paired with ring_buffer_read_finish.
1803 */
1804struct ring_buffer_iter *
1805ring_buffer_read_start(struct ring_buffer *buffer, int cpu)
1806{
1807 struct ring_buffer_per_cpu *cpu_buffer;
1808 struct ring_buffer_iter *iter;
1809 unsigned long flags;
1810
1811 if (!cpu_isset(cpu, buffer->cpumask))
1812 return NULL;
1813
1814 iter = kmalloc(sizeof(*iter), GFP_KERNEL);
1815 if (!iter)
1816 return NULL;
1817
1818 cpu_buffer = buffer->buffers[cpu];
1819
1820 iter->cpu_buffer = cpu_buffer;
1821
1822 atomic_inc(&cpu_buffer->record_disabled);
1823 synchronize_sched();
1824
1825 spin_lock_irqsave(&cpu_buffer->lock, flags);
1826 ring_buffer_iter_reset(iter);
1827 spin_unlock_irqrestore(&cpu_buffer->lock, flags);
1828
1829 return iter;
1830}
1831
1832/**
1833 * ring_buffer_finish - finish reading the iterator of the buffer
1834 * @iter: The iterator retrieved by ring_buffer_read_start
1835 *
1836 * This re-enables the recording to the buffer, and frees the
1837 * iterator.
1838 */
1839void
1840ring_buffer_read_finish(struct ring_buffer_iter *iter)
1841{
1842 struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
1843
1844 atomic_dec(&cpu_buffer->record_disabled);
1845 kfree(iter);
1846}
1847
1848/**
1849 * ring_buffer_read - read the next item in the ring buffer by the iterator
1850 * @iter: The ring buffer iterator
1851 * @ts: The time stamp of the event read.
1852 *
1853 * This reads the next event in the ring buffer and increments the iterator.
1854 */
1855struct ring_buffer_event *
1856ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts)
1857{
1858 struct ring_buffer_event *event;
1859
1860 event = ring_buffer_iter_peek(iter, ts);
1861 if (!event)
1862 return NULL;
1863
1864 rb_advance_iter(iter);
1865
1866 return event;
1867}
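[ Editorial sketch, not part of the patch: a non-consuming dump built from the iterator calls above (recording stays disabled while the iterator exists); example_dump() is hypothetical. ]

#include <linux/ring_buffer.h>
#include <linux/kernel.h>

static void example_dump(struct ring_buffer *buffer, int cpu)
{
        struct ring_buffer_iter *iter;
        struct ring_buffer_event *event;
        u64 ts;

        iter = ring_buffer_read_start(buffer, cpu);
        if (!iter)
                return;

        while ((event = ring_buffer_read(iter, &ts)))
                pr_info("%u byte event, ts %llu\n",
                        ring_buffer_event_length(event),
                        (unsigned long long)ts);

        ring_buffer_read_finish(iter);
}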
1868
1869/**
1870 * ring_buffer_size - return the size of the ring buffer (in bytes)
1871 * @buffer: The ring buffer.
1872 */
1873unsigned long ring_buffer_size(struct ring_buffer *buffer)
1874{
1875 return BUF_PAGE_SIZE * buffer->pages;
1876}
1877
1878static void
1879rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
1880{
1881 cpu_buffer->head_page
1882 = list_entry(cpu_buffer->pages.next, struct buffer_page, list);
1883 local_set(&cpu_buffer->head_page->write, 0);
1884 local_set(&cpu_buffer->head_page->commit, 0);
1885
1886 cpu_buffer->head_page->read = 0;
1887
1888 cpu_buffer->tail_page = cpu_buffer->head_page;
1889 cpu_buffer->commit_page = cpu_buffer->head_page;
1890
1891 INIT_LIST_HEAD(&cpu_buffer->reader_page->list);
1892 local_set(&cpu_buffer->reader_page->write, 0);
1893 local_set(&cpu_buffer->reader_page->commit, 0);
1894 cpu_buffer->reader_page->read = 0;
1895
1896 cpu_buffer->overrun = 0;
1897 cpu_buffer->entries = 0;
1898}
1899
1900/**
1901 * ring_buffer_reset_cpu - reset a ring buffer per CPU buffer
1902 * @buffer: The ring buffer to reset a per cpu buffer of
1903 * @cpu: The CPU buffer to be reset
1904 */
1905void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu)
1906{
1907 struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
1908 unsigned long flags;
1909
1910 if (!cpu_isset(cpu, buffer->cpumask))
1911 return;
1912
1913 spin_lock_irqsave(&cpu_buffer->lock, flags);
1914
1915 rb_reset_cpu(cpu_buffer);
1916
1917 spin_unlock_irqrestore(&cpu_buffer->lock, flags);
1918}
1919
1920/**
1921 * ring_buffer_reset - reset a ring buffer
1922 * @buffer: The ring buffer to reset all cpu buffers
1923 */
1924void ring_buffer_reset(struct ring_buffer *buffer)
1925{
1926 int cpu;
1927
1928 for_each_buffer_cpu(buffer, cpu)
1929 ring_buffer_reset_cpu(buffer, cpu);
1930}
1931
1932/**
1933 * ring_buffer_empty - is the ring buffer empty?
1934 * @buffer: The ring buffer to test
1935 */
1936int ring_buffer_empty(struct ring_buffer *buffer)
1937{
1938 struct ring_buffer_per_cpu *cpu_buffer;
1939 int cpu;
1940
1941 /* yes this is racy, but if you don't like the race, lock the buffer */
1942 for_each_buffer_cpu(buffer, cpu) {
1943 cpu_buffer = buffer->buffers[cpu];
1944 if (!rb_per_cpu_empty(cpu_buffer))
1945 return 0;
1946 }
1947 return 1;
1948}
1949
1950/**
1951 * ring_buffer_empty_cpu - is a cpu buffer of a ring buffer empty?
1952 * @buffer: The ring buffer
1953 * @cpu: The CPU buffer to test
1954 */
1955int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu)
1956{
1957 struct ring_buffer_per_cpu *cpu_buffer;
1958
1959 if (!cpu_isset(cpu, buffer->cpumask))
1960 return 1;
1961
1962 cpu_buffer = buffer->buffers[cpu];
1963 return rb_per_cpu_empty(cpu_buffer);
1964}
1965
1966/**
1967 * ring_buffer_swap_cpu - swap a CPU buffer between two ring buffers
1968 * @buffer_a: One buffer to swap with
1969 * @buffer_b: The other buffer to swap with
1970 *
1971 * This function is useful for tracers that want to take a "snapshot"
1972 * of a CPU buffer and have another backup buffer lying around.
1973 * It is expected that the tracer handles the cpu buffer not being
1974 * used at the moment.
1975 */
1976int ring_buffer_swap_cpu(struct ring_buffer *buffer_a,
1977 struct ring_buffer *buffer_b, int cpu)
1978{
1979 struct ring_buffer_per_cpu *cpu_buffer_a;
1980 struct ring_buffer_per_cpu *cpu_buffer_b;
1981
1982 if (!cpu_isset(cpu, buffer_a->cpumask) ||
1983 !cpu_isset(cpu, buffer_b->cpumask))
1984 return -EINVAL;
1985
1986 /* At least make sure the two buffers are somewhat the same */
1987 if (buffer_a->size != buffer_b->size ||
1988 buffer_a->pages != buffer_b->pages)
1989 return -EINVAL;
1990
1991 cpu_buffer_a = buffer_a->buffers[cpu];
1992 cpu_buffer_b = buffer_b->buffers[cpu];
1993
1994 /*
1995 * We can't do a synchronize_sched here because this
1996 * function can be called in atomic context.
1997 * Normally this will be called from the same CPU as cpu.
 1998 * If not, it is up to the caller to protect this.
1999 */
2000 atomic_inc(&cpu_buffer_a->record_disabled);
2001 atomic_inc(&cpu_buffer_b->record_disabled);
2002
2003 buffer_a->buffers[cpu] = cpu_buffer_b;
2004 buffer_b->buffers[cpu] = cpu_buffer_a;
2005
2006 cpu_buffer_b->buffer = buffer_a;
2007 cpu_buffer_a->buffer = buffer_b;
2008
2009 atomic_dec(&cpu_buffer_a->record_disabled);
2010 atomic_dec(&cpu_buffer_b->record_disabled);
2011
2012 return 0;
2013}
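
A hedged sketch of the snapshot pattern ring_buffer_swap_cpu() is meant for: swap one CPU's live buffer into a spare ring buffer of the same size, then drain the snapshot. It assumes both buffers were created with ring_buffer_alloc() at the same size and that, per the comment above, the caller has arranged that nothing else is touching that CPU's buffer; process_event() is a hypothetical consumer, and error handling is omitted.

	/* Sketch only: snapshot CPU `cpu` of `live` into the spare buffer `snap`. */
	static void snapshot_cpu(struct ring_buffer *live, struct ring_buffer *snap, int cpu)
	{
		struct ring_buffer_event *event;
		u64 ts;

		ring_buffer_reset_cpu(snap, cpu);		/* discard any old snapshot */
		if (ring_buffer_swap_cpu(snap, live, cpu))	/* sizes and cpumask must match */
			return;

		/* The old live data now sits in `snap`; consume it entry by entry. */
		while ((event = ring_buffer_consume(snap, cpu, &ts)) != NULL)
			process_event(ring_buffer_event_data(event), ts);	/* hypothetical */
	}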
2014
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 868e121c8e38..d345d649d073 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -14,6 +14,7 @@
14#include <linux/utsrelease.h> 14#include <linux/utsrelease.h>
15#include <linux/kallsyms.h> 15#include <linux/kallsyms.h>
16#include <linux/seq_file.h> 16#include <linux/seq_file.h>
17#include <linux/notifier.h>
17#include <linux/debugfs.h> 18#include <linux/debugfs.h>
18#include <linux/pagemap.h> 19#include <linux/pagemap.h>
19#include <linux/hardirq.h> 20#include <linux/hardirq.h>
@@ -22,6 +23,7 @@
22#include <linux/ftrace.h> 23#include <linux/ftrace.h>
23#include <linux/module.h> 24#include <linux/module.h>
24#include <linux/percpu.h> 25#include <linux/percpu.h>
26#include <linux/kdebug.h>
25#include <linux/ctype.h> 27#include <linux/ctype.h>
26#include <linux/init.h> 28#include <linux/init.h>
27#include <linux/poll.h> 29#include <linux/poll.h>
@@ -31,25 +33,36 @@
31#include <linux/writeback.h> 33#include <linux/writeback.h>
32 34
33#include <linux/stacktrace.h> 35#include <linux/stacktrace.h>
36#include <linux/ring_buffer.h>
34 37
35#include "trace.h" 38#include "trace.h"
36 39
40#define TRACE_BUFFER_FLAGS (RB_FL_OVERWRITE)
41
37unsigned long __read_mostly tracing_max_latency = (cycle_t)ULONG_MAX; 42unsigned long __read_mostly tracing_max_latency = (cycle_t)ULONG_MAX;
38unsigned long __read_mostly tracing_thresh; 43unsigned long __read_mostly tracing_thresh;
39 44
40static unsigned long __read_mostly tracing_nr_buffers; 45static DEFINE_PER_CPU(local_t, ftrace_cpu_disabled);
46
47static inline void ftrace_disable_cpu(void)
48{
49 preempt_disable();
50 local_inc(&__get_cpu_var(ftrace_cpu_disabled));
51}
52
53static inline void ftrace_enable_cpu(void)
54{
55 local_dec(&__get_cpu_var(ftrace_cpu_disabled));
56 preempt_enable();
57}
58
41static cpumask_t __read_mostly tracing_buffer_mask; 59static cpumask_t __read_mostly tracing_buffer_mask;
42 60
43#define for_each_tracing_cpu(cpu) \ 61#define for_each_tracing_cpu(cpu) \
44 for_each_cpu_mask(cpu, tracing_buffer_mask) 62 for_each_cpu_mask(cpu, tracing_buffer_mask)
45 63
46static int trace_alloc_page(void);
47static int trace_free_page(void);
48
49static int tracing_disabled = 1; 64static int tracing_disabled = 1;
50 65
51static unsigned long tracing_pages_allocated;
52
53long 66long
54ns2usecs(cycle_t nsec) 67ns2usecs(cycle_t nsec)
55{ 68{
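
The ftrace_cpu_disabled counter introduced in this hunk is a per-CPU recursion guard: while the trace code itself is reading or resetting the ring buffer, the function tracer must not try to write new events into it on that CPU (trace_function() later checks the counter with local_read() and bails out). A minimal sketch of the intended call pattern, using only the helpers added above:

	ftrace_disable_cpu();			/* preempt off + bump per-CPU counter */
	ring_buffer_reset(tr->buffer);		/* safe: writers on this CPU now bail out */
	ftrace_enable_cpu();			/* drop counter, preemption back on */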
@@ -60,7 +73,9 @@ ns2usecs(cycle_t nsec)
60 73
61cycle_t ftrace_now(int cpu) 74cycle_t ftrace_now(int cpu)
62{ 75{
63 return cpu_clock(cpu); 76 u64 ts = ring_buffer_time_stamp(cpu);
77 ring_buffer_normalize_time_stamp(cpu, &ts);
78 return ts;
64} 79}
65 80
66/* 81/*
@@ -100,11 +115,18 @@ static int tracer_enabled = 1;
100int ftrace_function_enabled; 115int ftrace_function_enabled;
101 116
102/* 117/*
103 * trace_nr_entries is the number of entries that is allocated 118 * trace_buf_size is the size in bytes that is allocated
104 * for a buffer. Note, the number of entries is always rounded 119 * for a buffer. Note, the number of bytes is always rounded
105 * to ENTRIES_PER_PAGE. 120 * to page size.
121 *
122 * This number is purposely set to a low number of 16384.
123 * If the dump on oops happens, it will be much appreciated
124 * to not have to wait for all that output. Anyway this can be
125 * boot time and run time configurable.
106 */ 126 */
107static unsigned long trace_nr_entries = 65536UL; 127#define TRACE_BUF_SIZE_DEFAULT 1441792UL /* 16384 * 88 (sizeof(entry)) */
128
129static unsigned long trace_buf_size = TRACE_BUF_SIZE_DEFAULT;
108 130
109/* trace_types holds a link list of available tracers. */ 131/* trace_types holds a link list of available tracers. */
110static struct tracer *trace_types __read_mostly; 132static struct tracer *trace_types __read_mostly;
@@ -133,24 +155,6 @@ static DECLARE_WAIT_QUEUE_HEAD(trace_wait);
133/* trace_flags holds iter_ctrl options */ 155/* trace_flags holds iter_ctrl options */
134unsigned long trace_flags = TRACE_ITER_PRINT_PARENT; 156unsigned long trace_flags = TRACE_ITER_PRINT_PARENT;
135 157
136static notrace void no_trace_init(struct trace_array *tr)
137{
138 int cpu;
139
140 ftrace_function_enabled = 0;
141 if(tr->ctrl)
142 for_each_online_cpu(cpu)
143 tracing_reset(tr->data[cpu]);
144 tracer_enabled = 0;
145}
146
147/* dummy trace to disable tracing */
148static struct tracer no_tracer __read_mostly = {
149 .name = "none",
150 .init = no_trace_init
151};
152
153
154/** 158/**
155 * trace_wake_up - wake up tasks waiting for trace input 159 * trace_wake_up - wake up tasks waiting for trace input
156 * 160 *
@@ -167,23 +171,21 @@ void trace_wake_up(void)
167 wake_up(&trace_wait); 171 wake_up(&trace_wait);
168} 172}
169 173
170#define ENTRIES_PER_PAGE (PAGE_SIZE / sizeof(struct trace_entry)) 174static int __init set_buf_size(char *str)
171
172static int __init set_nr_entries(char *str)
173{ 175{
174 unsigned long nr_entries; 176 unsigned long buf_size;
175 int ret; 177 int ret;
176 178
177 if (!str) 179 if (!str)
178 return 0; 180 return 0;
179 ret = strict_strtoul(str, 0, &nr_entries); 181 ret = strict_strtoul(str, 0, &buf_size);
180 /* nr_entries can not be zero */ 182 /* nr_entries can not be zero */
181 if (ret < 0 || nr_entries == 0) 183 if (ret < 0 || buf_size == 0)
182 return 0; 184 return 0;
183 trace_nr_entries = nr_entries; 185 trace_buf_size = buf_size;
184 return 1; 186 return 1;
185} 187}
186__setup("trace_entries=", set_nr_entries); 188__setup("trace_buf_size=", set_buf_size);
187 189
188unsigned long nsecs_to_usecs(unsigned long nsecs) 190unsigned long nsecs_to_usecs(unsigned long nsecs)
189{ 191{
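
With the __setup() handler above, the old trace_entries= boot parameter is replaced by trace_buf_size=, which takes a size in bytes (rounded to page size) rather than an entry count. For example, booting with trace_buf_size=1048576 would request roughly 1 MB of buffer; the default of 1441792 bytes corresponds to the previous 16384 entries at 88 bytes each (16384 * 88 = 1441792).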
@@ -191,21 +193,6 @@ unsigned long nsecs_to_usecs(unsigned long nsecs)
191} 193}
192 194
193/* 195/*
194 * trace_flag_type is an enumeration that holds different
195 * states when a trace occurs. These are:
196 * IRQS_OFF - interrupts were disabled
197 * NEED_RESCED - reschedule is requested
198 * HARDIRQ - inside an interrupt handler
199 * SOFTIRQ - inside a softirq handler
200 */
201enum trace_flag_type {
202 TRACE_FLAG_IRQS_OFF = 0x01,
203 TRACE_FLAG_NEED_RESCHED = 0x02,
204 TRACE_FLAG_HARDIRQ = 0x04,
205 TRACE_FLAG_SOFTIRQ = 0x08,
206};
207
208/*
209 * TRACE_ITER_SYM_MASK masks the options in trace_flags that 196 * TRACE_ITER_SYM_MASK masks the options in trace_flags that
210 * control the output of kernel symbols. 197 * control the output of kernel symbols.
211 */ 198 */
@@ -224,6 +211,7 @@ static const char *trace_options[] = {
224 "block", 211 "block",
225 "stacktrace", 212 "stacktrace",
226 "sched-tree", 213 "sched-tree",
214 "ftrace_printk",
227 NULL 215 NULL
228}; 216};
229 217
@@ -266,54 +254,6 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
266 tracing_record_cmdline(current); 254 tracing_record_cmdline(current);
267} 255}
268 256
269#define CHECK_COND(cond) \
270 if (unlikely(cond)) { \
271 tracing_disabled = 1; \
272 WARN_ON(1); \
273 return -1; \
274 }
275
276/**
277 * check_pages - integrity check of trace buffers
278 *
279 * As a safty measure we check to make sure the data pages have not
280 * been corrupted.
281 */
282int check_pages(struct trace_array_cpu *data)
283{
284 struct page *page, *tmp;
285
286 CHECK_COND(data->trace_pages.next->prev != &data->trace_pages);
287 CHECK_COND(data->trace_pages.prev->next != &data->trace_pages);
288
289 list_for_each_entry_safe(page, tmp, &data->trace_pages, lru) {
290 CHECK_COND(page->lru.next->prev != &page->lru);
291 CHECK_COND(page->lru.prev->next != &page->lru);
292 }
293
294 return 0;
295}
296
297/**
298 * head_page - page address of the first page in per_cpu buffer.
299 *
300 * head_page returns the page address of the first page in
301 * a per_cpu buffer. This also preforms various consistency
302 * checks to make sure the buffer has not been corrupted.
303 */
304void *head_page(struct trace_array_cpu *data)
305{
306 struct page *page;
307
308 if (list_empty(&data->trace_pages))
309 return NULL;
310
311 page = list_entry(data->trace_pages.next, struct page, lru);
312 BUG_ON(&page->lru == &data->trace_pages);
313
314 return page_address(page);
315}
316
317/** 257/**
318 * trace_seq_printf - sequence printing of trace information 258 * trace_seq_printf - sequence printing of trace information
319 * @s: trace sequence descriptor 259 * @s: trace sequence descriptor
@@ -395,28 +335,23 @@ trace_seq_putmem(struct trace_seq *s, void *mem, size_t len)
395 return len; 335 return len;
396} 336}
397 337
398#define HEX_CHARS 17 338#define MAX_MEMHEX_BYTES 8
399static const char hex2asc[] = "0123456789abcdef"; 339#define HEX_CHARS (MAX_MEMHEX_BYTES*2 + 1)
400 340
401static int 341static int
402trace_seq_putmem_hex(struct trace_seq *s, void *mem, size_t len) 342trace_seq_putmem_hex(struct trace_seq *s, void *mem, size_t len)
403{ 343{
404 unsigned char hex[HEX_CHARS]; 344 unsigned char hex[HEX_CHARS];
405 unsigned char *data = mem; 345 unsigned char *data = mem;
406 unsigned char byte;
407 int i, j; 346 int i, j;
408 347
409 BUG_ON(len >= HEX_CHARS);
410
411#ifdef __BIG_ENDIAN 348#ifdef __BIG_ENDIAN
412 for (i = 0, j = 0; i < len; i++) { 349 for (i = 0, j = 0; i < len; i++) {
413#else 350#else
414 for (i = len-1, j = 0; i >= 0; i--) { 351 for (i = len-1, j = 0; i >= 0; i--) {
415#endif 352#endif
416 byte = data[i]; 353 hex[j++] = hex_asc_hi(data[i]);
417 354 hex[j++] = hex_asc_lo(data[i]);
418 hex[j++] = hex2asc[byte & 0x0f];
419 hex[j++] = hex2asc[byte >> 4];
420 } 355 }
421 hex[j++] = ' '; 356 hex[j++] = ' ';
422 357
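
hex_asc_hi() and hex_asc_lo() are the standard kernel helpers (from <linux/kernel.h>) that map the high and low nibble of a byte to its ASCII hex digit, replacing the open-coded hex2asc[] table. A stand-alone, user-space sketch of the equivalent transformation, for illustration only:

	#include <stdio.h>

	static const char hex_asc[] = "0123456789abcdef";
	#define hex_asc_lo(x)	hex_asc[(x) & 0x0f]
	#define hex_asc_hi(x)	hex_asc[((x) & 0xf0) >> 4]

	int main(void)
	{
		unsigned char byte = 0xa7;

		/* prints "a7": high nibble first, then low nibble */
		printf("%c%c\n", hex_asc_hi(byte), hex_asc_lo(byte));
		return 0;
	}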
@@ -460,34 +395,6 @@ trace_print_seq(struct seq_file *m, struct trace_seq *s)
460 trace_seq_reset(s); 395 trace_seq_reset(s);
461} 396}
462 397
463/*
464 * flip the trace buffers between two trace descriptors.
465 * This usually is the buffers between the global_trace and
466 * the max_tr to record a snapshot of a current trace.
467 *
468 * The ftrace_max_lock must be held.
469 */
470static void
471flip_trace(struct trace_array_cpu *tr1, struct trace_array_cpu *tr2)
472{
473 struct list_head flip_pages;
474
475 INIT_LIST_HEAD(&flip_pages);
476
477 memcpy(&tr1->trace_head_idx, &tr2->trace_head_idx,
478 sizeof(struct trace_array_cpu) -
479 offsetof(struct trace_array_cpu, trace_head_idx));
480
481 check_pages(tr1);
482 check_pages(tr2);
483 list_splice_init(&tr1->trace_pages, &flip_pages);
484 list_splice_init(&tr2->trace_pages, &tr1->trace_pages);
485 list_splice_init(&flip_pages, &tr2->trace_pages);
486 BUG_ON(!list_empty(&flip_pages));
487 check_pages(tr1);
488 check_pages(tr2);
489}
490
491/** 398/**
492 * update_max_tr - snapshot all trace buffers from global_trace to max_tr 399 * update_max_tr - snapshot all trace buffers from global_trace to max_tr
493 * @tr: tracer 400 * @tr: tracer
@@ -500,17 +407,17 @@ flip_trace(struct trace_array_cpu *tr1, struct trace_array_cpu *tr2)
500void 407void
501update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) 408update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
502{ 409{
503 struct trace_array_cpu *data; 410 struct ring_buffer *buf = tr->buffer;
504 int i;
505 411
506 WARN_ON_ONCE(!irqs_disabled()); 412 WARN_ON_ONCE(!irqs_disabled());
507 __raw_spin_lock(&ftrace_max_lock); 413 __raw_spin_lock(&ftrace_max_lock);
508 /* clear out all the previous traces */ 414
509 for_each_tracing_cpu(i) { 415 tr->buffer = max_tr.buffer;
510 data = tr->data[i]; 416 max_tr.buffer = buf;
511 flip_trace(max_tr.data[i], data); 417
512 tracing_reset(data); 418 ftrace_disable_cpu();
513 } 419 ring_buffer_reset(tr->buffer);
420 ftrace_enable_cpu();
514 421
515 __update_max_tr(tr, tsk, cpu); 422 __update_max_tr(tr, tsk, cpu);
516 __raw_spin_unlock(&ftrace_max_lock); 423 __raw_spin_unlock(&ftrace_max_lock);
@@ -527,16 +434,19 @@ update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
527void 434void
528update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) 435update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
529{ 436{
530 struct trace_array_cpu *data = tr->data[cpu]; 437 int ret;
531 int i;
532 438
533 WARN_ON_ONCE(!irqs_disabled()); 439 WARN_ON_ONCE(!irqs_disabled());
534 __raw_spin_lock(&ftrace_max_lock); 440 __raw_spin_lock(&ftrace_max_lock);
535 for_each_tracing_cpu(i)
536 tracing_reset(max_tr.data[i]);
537 441
538 flip_trace(max_tr.data[cpu], data); 442 ftrace_disable_cpu();
539 tracing_reset(data); 443
444 ring_buffer_reset(max_tr.buffer);
445 ret = ring_buffer_swap_cpu(max_tr.buffer, tr->buffer, cpu);
446
447 ftrace_enable_cpu();
448
449 WARN_ON_ONCE(ret);
540 450
541 __update_max_tr(tr, tsk, cpu); 451 __update_max_tr(tr, tsk, cpu);
542 __raw_spin_unlock(&ftrace_max_lock); 452 __raw_spin_unlock(&ftrace_max_lock);
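
Taken together, the two update paths show the new snapshot strategy: update_max_tr() swaps the whole ring buffer pointers between the live trace and max_tr, while update_max_tr_single() resets max_tr and swaps only one CPU's buffer via ring_buffer_swap_cpu(). In both cases the ftrace_disable_cpu()/ftrace_enable_cpu() pair keeps the function tracer from writing into a buffer that is being reset underneath it.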
@@ -573,7 +483,6 @@ int register_tracer(struct tracer *type)
573#ifdef CONFIG_FTRACE_STARTUP_TEST 483#ifdef CONFIG_FTRACE_STARTUP_TEST
574 if (type->selftest) { 484 if (type->selftest) {
575 struct tracer *saved_tracer = current_trace; 485 struct tracer *saved_tracer = current_trace;
576 struct trace_array_cpu *data;
577 struct trace_array *tr = &global_trace; 486 struct trace_array *tr = &global_trace;
578 int saved_ctrl = tr->ctrl; 487 int saved_ctrl = tr->ctrl;
579 int i; 488 int i;
@@ -585,10 +494,7 @@ int register_tracer(struct tracer *type)
585 * If we fail, we do not register this tracer. 494 * If we fail, we do not register this tracer.
586 */ 495 */
587 for_each_tracing_cpu(i) { 496 for_each_tracing_cpu(i) {
588 data = tr->data[i]; 497 tracing_reset(tr, i);
589 if (!head_page(data))
590 continue;
591 tracing_reset(data);
592 } 498 }
593 current_trace = type; 499 current_trace = type;
594 tr->ctrl = 0; 500 tr->ctrl = 0;
@@ -604,10 +510,7 @@ int register_tracer(struct tracer *type)
604 } 510 }
605 /* Only reset on passing, to avoid touching corrupted buffers */ 511 /* Only reset on passing, to avoid touching corrupted buffers */
606 for_each_tracing_cpu(i) { 512 for_each_tracing_cpu(i) {
607 data = tr->data[i]; 513 tracing_reset(tr, i);
608 if (!head_page(data))
609 continue;
610 tracing_reset(data);
611 } 514 }
612 printk(KERN_CONT "PASSED\n"); 515 printk(KERN_CONT "PASSED\n");
613 } 516 }
@@ -653,13 +556,11 @@ void unregister_tracer(struct tracer *type)
653 mutex_unlock(&trace_types_lock); 556 mutex_unlock(&trace_types_lock);
654} 557}
655 558
656void tracing_reset(struct trace_array_cpu *data) 559void tracing_reset(struct trace_array *tr, int cpu)
657{ 560{
658 data->trace_idx = 0; 561 ftrace_disable_cpu();
659 data->overrun = 0; 562 ring_buffer_reset_cpu(tr->buffer, cpu);
660 data->trace_head = data->trace_tail = head_page(data); 563 ftrace_enable_cpu();
661 data->trace_head_idx = 0;
662 data->trace_tail_idx = 0;
663} 564}
664 565
665#define SAVED_CMDLINES 128 566#define SAVED_CMDLINES 128
@@ -745,82 +646,16 @@ void tracing_record_cmdline(struct task_struct *tsk)
745 trace_save_cmdline(tsk); 646 trace_save_cmdline(tsk);
746} 647}
747 648
748static inline struct list_head * 649void
749trace_next_list(struct trace_array_cpu *data, struct list_head *next) 650tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
750{ 651 int pc)
751 /*
752 * Roundrobin - but skip the head (which is not a real page):
753 */
754 next = next->next;
755 if (unlikely(next == &data->trace_pages))
756 next = next->next;
757 BUG_ON(next == &data->trace_pages);
758
759 return next;
760}
761
762static inline void *
763trace_next_page(struct trace_array_cpu *data, void *addr)
764{
765 struct list_head *next;
766 struct page *page;
767
768 page = virt_to_page(addr);
769
770 next = trace_next_list(data, &page->lru);
771 page = list_entry(next, struct page, lru);
772
773 return page_address(page);
774}
775
776static inline struct trace_entry *
777tracing_get_trace_entry(struct trace_array *tr, struct trace_array_cpu *data)
778{
779 unsigned long idx, idx_next;
780 struct trace_entry *entry;
781
782 data->trace_idx++;
783 idx = data->trace_head_idx;
784 idx_next = idx + 1;
785
786 BUG_ON(idx * TRACE_ENTRY_SIZE >= PAGE_SIZE);
787
788 entry = data->trace_head + idx * TRACE_ENTRY_SIZE;
789
790 if (unlikely(idx_next >= ENTRIES_PER_PAGE)) {
791 data->trace_head = trace_next_page(data, data->trace_head);
792 idx_next = 0;
793 }
794
795 if (data->trace_head == data->trace_tail &&
796 idx_next == data->trace_tail_idx) {
797 /* overrun */
798 data->overrun++;
799 data->trace_tail_idx++;
800 if (data->trace_tail_idx >= ENTRIES_PER_PAGE) {
801 data->trace_tail =
802 trace_next_page(data, data->trace_tail);
803 data->trace_tail_idx = 0;
804 }
805 }
806
807 data->trace_head_idx = idx_next;
808
809 return entry;
810}
811
812static inline void
813tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags)
814{ 652{
815 struct task_struct *tsk = current; 653 struct task_struct *tsk = current;
816 unsigned long pc;
817
818 pc = preempt_count();
819 654
820 entry->preempt_count = pc & 0xff; 655 entry->preempt_count = pc & 0xff;
821 entry->pid = (tsk) ? tsk->pid : 0; 656 entry->pid = (tsk) ? tsk->pid : 0;
822 entry->t = ftrace_now(raw_smp_processor_id()); 657 entry->flags =
823 entry->flags = (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) | 658 (irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) |
824 ((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) | 659 ((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) |
825 ((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) | 660 ((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) |
826 (need_resched() ? TRACE_FLAG_NEED_RESCHED : 0); 661 (need_resched() ? TRACE_FLAG_NEED_RESCHED : 0);
@@ -828,145 +663,139 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags)
828 663
829void 664void
830trace_function(struct trace_array *tr, struct trace_array_cpu *data, 665trace_function(struct trace_array *tr, struct trace_array_cpu *data,
831 unsigned long ip, unsigned long parent_ip, unsigned long flags) 666 unsigned long ip, unsigned long parent_ip, unsigned long flags,
667 int pc)
832{ 668{
833 struct trace_entry *entry; 669 struct ring_buffer_event *event;
670 struct ftrace_entry *entry;
834 unsigned long irq_flags; 671 unsigned long irq_flags;
835 672
836 raw_local_irq_save(irq_flags); 673 /* If we are reading the ring buffer, don't trace */
837 __raw_spin_lock(&data->lock); 674 if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled))))
838 entry = tracing_get_trace_entry(tr, data); 675 return;
839 tracing_generic_entry_update(entry, flags); 676
840 entry->type = TRACE_FN; 677 event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
841 entry->fn.ip = ip; 678 &irq_flags);
842 entry->fn.parent_ip = parent_ip; 679 if (!event)
843 __raw_spin_unlock(&data->lock); 680 return;
844 raw_local_irq_restore(irq_flags); 681 entry = ring_buffer_event_data(event);
682 tracing_generic_entry_update(&entry->ent, flags, pc);
683 entry->ent.type = TRACE_FN;
684 entry->ip = ip;
685 entry->parent_ip = parent_ip;
686 ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
845} 687}
846 688
847void 689void
848ftrace(struct trace_array *tr, struct trace_array_cpu *data, 690ftrace(struct trace_array *tr, struct trace_array_cpu *data,
849 unsigned long ip, unsigned long parent_ip, unsigned long flags) 691 unsigned long ip, unsigned long parent_ip, unsigned long flags,
692 int pc)
850{ 693{
851 if (likely(!atomic_read(&data->disabled))) 694 if (likely(!atomic_read(&data->disabled)))
852 trace_function(tr, data, ip, parent_ip, flags); 695 trace_function(tr, data, ip, parent_ip, flags, pc);
853} 696}
854 697
855#ifdef CONFIG_MMIOTRACE 698static void ftrace_trace_stack(struct trace_array *tr,
856void __trace_mmiotrace_rw(struct trace_array *tr, struct trace_array_cpu *data, 699 struct trace_array_cpu *data,
857 struct mmiotrace_rw *rw) 700 unsigned long flags,
701 int skip, int pc)
858{ 702{
859 struct trace_entry *entry; 703 struct ring_buffer_event *event;
704 struct stack_entry *entry;
705 struct stack_trace trace;
860 unsigned long irq_flags; 706 unsigned long irq_flags;
861 707
862 raw_local_irq_save(irq_flags); 708 if (!(trace_flags & TRACE_ITER_STACKTRACE))
863 __raw_spin_lock(&data->lock); 709 return;
864
865 entry = tracing_get_trace_entry(tr, data);
866 tracing_generic_entry_update(entry, 0);
867 entry->type = TRACE_MMIO_RW;
868 entry->mmiorw = *rw;
869
870 __raw_spin_unlock(&data->lock);
871 raw_local_irq_restore(irq_flags);
872
873 trace_wake_up();
874}
875
876void __trace_mmiotrace_map(struct trace_array *tr, struct trace_array_cpu *data,
877 struct mmiotrace_map *map)
878{
879 struct trace_entry *entry;
880 unsigned long irq_flags;
881 710
882 raw_local_irq_save(irq_flags); 711 event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
883 __raw_spin_lock(&data->lock); 712 &irq_flags);
713 if (!event)
714 return;
715 entry = ring_buffer_event_data(event);
716 tracing_generic_entry_update(&entry->ent, flags, pc);
717 entry->ent.type = TRACE_STACK;
884 718
885 entry = tracing_get_trace_entry(tr, data); 719 memset(&entry->caller, 0, sizeof(entry->caller));
886 tracing_generic_entry_update(entry, 0);
887 entry->type = TRACE_MMIO_MAP;
888 entry->mmiomap = *map;
889 720
890 __raw_spin_unlock(&data->lock); 721 trace.nr_entries = 0;
891 raw_local_irq_restore(irq_flags); 722 trace.max_entries = FTRACE_STACK_ENTRIES;
723 trace.skip = skip;
724 trace.entries = entry->caller;
892 725
893 trace_wake_up(); 726 save_stack_trace(&trace);
727 ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
894} 728}
895#endif
896 729
897void __trace_stack(struct trace_array *tr, 730void __trace_stack(struct trace_array *tr,
898 struct trace_array_cpu *data, 731 struct trace_array_cpu *data,
899 unsigned long flags, 732 unsigned long flags,
900 int skip) 733 int skip)
901{ 734{
902 struct trace_entry *entry; 735 ftrace_trace_stack(tr, data, flags, skip, preempt_count());
903 struct stack_trace trace;
904
905 if (!(trace_flags & TRACE_ITER_STACKTRACE))
906 return;
907
908 entry = tracing_get_trace_entry(tr, data);
909 tracing_generic_entry_update(entry, flags);
910 entry->type = TRACE_STACK;
911
912 memset(&entry->stack, 0, sizeof(entry->stack));
913
914 trace.nr_entries = 0;
915 trace.max_entries = FTRACE_STACK_ENTRIES;
916 trace.skip = skip;
917 trace.entries = entry->stack.caller;
918
919 save_stack_trace(&trace);
920} 736}
921 737
922void 738static void
923__trace_special(void *__tr, void *__data, 739ftrace_trace_special(void *__tr, void *__data,
924 unsigned long arg1, unsigned long arg2, unsigned long arg3) 740 unsigned long arg1, unsigned long arg2, unsigned long arg3,
741 int pc)
925{ 742{
743 struct ring_buffer_event *event;
926 struct trace_array_cpu *data = __data; 744 struct trace_array_cpu *data = __data;
927 struct trace_array *tr = __tr; 745 struct trace_array *tr = __tr;
928 struct trace_entry *entry; 746 struct special_entry *entry;
929 unsigned long irq_flags; 747 unsigned long irq_flags;
930 748
931 raw_local_irq_save(irq_flags); 749 event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
932 __raw_spin_lock(&data->lock); 750 &irq_flags);
933 entry = tracing_get_trace_entry(tr, data); 751 if (!event)
934 tracing_generic_entry_update(entry, 0); 752 return;
935 entry->type = TRACE_SPECIAL; 753 entry = ring_buffer_event_data(event);
936 entry->special.arg1 = arg1; 754 tracing_generic_entry_update(&entry->ent, 0, pc);
937 entry->special.arg2 = arg2; 755 entry->ent.type = TRACE_SPECIAL;
938 entry->special.arg3 = arg3; 756 entry->arg1 = arg1;
939 __trace_stack(tr, data, irq_flags, 4); 757 entry->arg2 = arg2;
940 __raw_spin_unlock(&data->lock); 758 entry->arg3 = arg3;
941 raw_local_irq_restore(irq_flags); 759 ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
760 ftrace_trace_stack(tr, data, irq_flags, 4, pc);
942 761
943 trace_wake_up(); 762 trace_wake_up();
944} 763}
945 764
946void 765void
766__trace_special(void *__tr, void *__data,
767 unsigned long arg1, unsigned long arg2, unsigned long arg3)
768{
769 ftrace_trace_special(__tr, __data, arg1, arg2, arg3, preempt_count());
770}
771
772void
947tracing_sched_switch_trace(struct trace_array *tr, 773tracing_sched_switch_trace(struct trace_array *tr,
948 struct trace_array_cpu *data, 774 struct trace_array_cpu *data,
949 struct task_struct *prev, 775 struct task_struct *prev,
950 struct task_struct *next, 776 struct task_struct *next,
951 unsigned long flags) 777 unsigned long flags, int pc)
952{ 778{
953 struct trace_entry *entry; 779 struct ring_buffer_event *event;
780 struct ctx_switch_entry *entry;
954 unsigned long irq_flags; 781 unsigned long irq_flags;
955 782
956 raw_local_irq_save(irq_flags); 783 event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
957 __raw_spin_lock(&data->lock); 784 &irq_flags);
958 entry = tracing_get_trace_entry(tr, data); 785 if (!event)
959 tracing_generic_entry_update(entry, flags); 786 return;
960 entry->type = TRACE_CTX; 787 entry = ring_buffer_event_data(event);
961 entry->ctx.prev_pid = prev->pid; 788 tracing_generic_entry_update(&entry->ent, flags, pc);
962 entry->ctx.prev_prio = prev->prio; 789 entry->ent.type = TRACE_CTX;
963 entry->ctx.prev_state = prev->state; 790 entry->prev_pid = prev->pid;
964 entry->ctx.next_pid = next->pid; 791 entry->prev_prio = prev->prio;
965 entry->ctx.next_prio = next->prio; 792 entry->prev_state = prev->state;
966 entry->ctx.next_state = next->state; 793 entry->next_pid = next->pid;
967 __trace_stack(tr, data, flags, 5); 794 entry->next_prio = next->prio;
968 __raw_spin_unlock(&data->lock); 795 entry->next_state = next->state;
969 raw_local_irq_restore(irq_flags); 796 entry->next_cpu = task_cpu(next);
797 ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
798 ftrace_trace_stack(tr, data, flags, 5, pc);
970} 799}
971 800
972void 801void
@@ -974,25 +803,28 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
974 struct trace_array_cpu *data, 803 struct trace_array_cpu *data,
975 struct task_struct *wakee, 804 struct task_struct *wakee,
976 struct task_struct *curr, 805 struct task_struct *curr,
977 unsigned long flags) 806 unsigned long flags, int pc)
978{ 807{
979 struct trace_entry *entry; 808 struct ring_buffer_event *event;
809 struct ctx_switch_entry *entry;
980 unsigned long irq_flags; 810 unsigned long irq_flags;
981 811
982 raw_local_irq_save(irq_flags); 812 event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
983 __raw_spin_lock(&data->lock); 813 &irq_flags);
984 entry = tracing_get_trace_entry(tr, data); 814 if (!event)
985 tracing_generic_entry_update(entry, flags); 815 return;
986 entry->type = TRACE_WAKE; 816 entry = ring_buffer_event_data(event);
987 entry->ctx.prev_pid = curr->pid; 817 tracing_generic_entry_update(&entry->ent, flags, pc);
988 entry->ctx.prev_prio = curr->prio; 818 entry->ent.type = TRACE_WAKE;
989 entry->ctx.prev_state = curr->state; 819 entry->prev_pid = curr->pid;
990 entry->ctx.next_pid = wakee->pid; 820 entry->prev_prio = curr->prio;
991 entry->ctx.next_prio = wakee->prio; 821 entry->prev_state = curr->state;
992 entry->ctx.next_state = wakee->state; 822 entry->next_pid = wakee->pid;
993 __trace_stack(tr, data, flags, 6); 823 entry->next_prio = wakee->prio;
994 __raw_spin_unlock(&data->lock); 824 entry->next_state = wakee->state;
995 raw_local_irq_restore(irq_flags); 825 entry->next_cpu = task_cpu(wakee);
826 ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
827 ftrace_trace_stack(tr, data, flags, 6, pc);
996 828
997 trace_wake_up(); 829 trace_wake_up();
998} 830}
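
Every event writer above now follows the same reserve/fill/commit discipline against the ring buffer instead of taking the per-CPU data->lock. A condensed sketch of that pattern, assuming a hypothetical event type my_entry whose first member is the shared struct trace_entry header (as with ftrace_entry and ctx_switch_entry above) and a hypothetical TRACE_MY_TYPE id:

	static void trace_my_event(struct trace_array *tr, unsigned long flags, int pc)
	{
		struct ring_buffer_event *event;
		struct my_entry *entry;			/* hypothetical event layout */
		unsigned long irq_flags;

		event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), &irq_flags);
		if (!event)
			return;				/* buffer full or recording disabled */
		entry = ring_buffer_event_data(event);
		tracing_generic_entry_update(&entry->ent, flags, pc);	/* common header */
		entry->ent.type = TRACE_MY_TYPE;	/* hypothetical type id */
		/* ... fill event-specific fields ... */
		ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
	}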
@@ -1002,23 +834,21 @@ ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3)
1002{ 834{
1003 struct trace_array *tr = &global_trace; 835 struct trace_array *tr = &global_trace;
1004 struct trace_array_cpu *data; 836 struct trace_array_cpu *data;
1005 unsigned long flags;
1006 long disabled;
1007 int cpu; 837 int cpu;
838 int pc;
1008 839
1009 if (tracing_disabled || current_trace == &no_tracer || !tr->ctrl) 840 if (tracing_disabled || !tr->ctrl)
1010 return; 841 return;
1011 842
1012 local_irq_save(flags); 843 pc = preempt_count();
844 preempt_disable_notrace();
1013 cpu = raw_smp_processor_id(); 845 cpu = raw_smp_processor_id();
1014 data = tr->data[cpu]; 846 data = tr->data[cpu];
1015 disabled = atomic_inc_return(&data->disabled);
1016 847
1017 if (likely(disabled == 1)) 848 if (likely(!atomic_read(&data->disabled)))
1018 __trace_special(tr, data, arg1, arg2, arg3); 849 ftrace_trace_special(tr, data, arg1, arg2, arg3, pc);
1019 850
1020 atomic_dec(&data->disabled); 851 preempt_enable_notrace();
1021 local_irq_restore(flags);
1022} 852}
1023 853
1024#ifdef CONFIG_FTRACE 854#ifdef CONFIG_FTRACE
@@ -1029,7 +859,8 @@ function_trace_call(unsigned long ip, unsigned long parent_ip)
1029 struct trace_array_cpu *data; 859 struct trace_array_cpu *data;
1030 unsigned long flags; 860 unsigned long flags;
1031 long disabled; 861 long disabled;
1032 int cpu; 862 int cpu, resched;
863 int pc;
1033 864
1034 if (unlikely(!ftrace_function_enabled)) 865 if (unlikely(!ftrace_function_enabled))
1035 return; 866 return;
@@ -1037,16 +868,22 @@ function_trace_call(unsigned long ip, unsigned long parent_ip)
1037 if (skip_trace(ip)) 868 if (skip_trace(ip))
1038 return; 869 return;
1039 870
1040 local_irq_save(flags); 871 pc = preempt_count();
872 resched = need_resched();
873 preempt_disable_notrace();
874 local_save_flags(flags);
1041 cpu = raw_smp_processor_id(); 875 cpu = raw_smp_processor_id();
1042 data = tr->data[cpu]; 876 data = tr->data[cpu];
1043 disabled = atomic_inc_return(&data->disabled); 877 disabled = atomic_inc_return(&data->disabled);
1044 878
1045 if (likely(disabled == 1)) 879 if (likely(disabled == 1))
1046 trace_function(tr, data, ip, parent_ip, flags); 880 trace_function(tr, data, ip, parent_ip, flags, pc);
1047 881
1048 atomic_dec(&data->disabled); 882 atomic_dec(&data->disabled);
1049 local_irq_restore(flags); 883 if (resched)
884 preempt_enable_no_resched_notrace();
885 else
886 preempt_enable_notrace();
1050} 887}
1051 888
1052static struct ftrace_ops trace_ops __read_mostly = 889static struct ftrace_ops trace_ops __read_mostly =
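
The preempt handling in function_trace_call() above is deliberately asymmetric: need_resched() is sampled before preemption is disabled, and on the way out preempt_enable_no_resched_notrace() is used when a reschedule was already pending, so that re-enabling preemption inside the tracer does not itself call into the scheduler from within the function hook. Roughly, the shape is:

	int resched = need_resched();	/* was a reschedule already pending? */

	preempt_disable_notrace();
	/* ... record the event ... */
	if (resched)
		preempt_enable_no_resched_notrace();	/* leave the pending resched alone */
	else
		preempt_enable_notrace();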
@@ -1073,117 +910,101 @@ enum trace_file_type {
1073 TRACE_FILE_LAT_FMT = 1, 910 TRACE_FILE_LAT_FMT = 1,
1074}; 911};
1075 912
1076static struct trace_entry * 913static void trace_iterator_increment(struct trace_iterator *iter, int cpu)
1077trace_entry_idx(struct trace_array *tr, struct trace_array_cpu *data,
1078 struct trace_iterator *iter, int cpu)
1079{ 914{
1080 struct page *page; 915 /* Don't allow ftrace to trace into the ring buffers */
1081 struct trace_entry *array; 916 ftrace_disable_cpu();
1082 917
1083 if (iter->next_idx[cpu] >= tr->entries || 918 iter->idx++;
1084 iter->next_idx[cpu] >= data->trace_idx || 919 if (iter->buffer_iter[iter->cpu])
1085 (data->trace_head == data->trace_tail && 920 ring_buffer_read(iter->buffer_iter[iter->cpu], NULL);
1086 data->trace_head_idx == data->trace_tail_idx))
1087 return NULL;
1088 921
1089 if (!iter->next_page[cpu]) { 922 ftrace_enable_cpu();
1090 /* Initialize the iterator for this cpu trace buffer */ 923}
1091 WARN_ON(!data->trace_tail); 924
1092 page = virt_to_page(data->trace_tail); 925static struct trace_entry *
1093 iter->next_page[cpu] = &page->lru; 926peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts)
1094 iter->next_page_idx[cpu] = data->trace_tail_idx; 927{
1095 } 928 struct ring_buffer_event *event;
929 struct ring_buffer_iter *buf_iter = iter->buffer_iter[cpu];
1096 930
1097 page = list_entry(iter->next_page[cpu], struct page, lru); 931 /* Don't allow ftrace to trace into the ring buffers */
1098 BUG_ON(&data->trace_pages == &page->lru); 932 ftrace_disable_cpu();
933
934 if (buf_iter)
935 event = ring_buffer_iter_peek(buf_iter, ts);
936 else
937 event = ring_buffer_peek(iter->tr->buffer, cpu, ts);
1099 938
1100 array = page_address(page); 939 ftrace_enable_cpu();
1101 940
1102 WARN_ON(iter->next_page_idx[cpu] >= ENTRIES_PER_PAGE); 941 return event ? ring_buffer_event_data(event) : NULL;
1103 return &array[iter->next_page_idx[cpu]];
1104} 942}
1105 943
1106static struct trace_entry * 944static struct trace_entry *
1107find_next_entry(struct trace_iterator *iter, int *ent_cpu) 945__find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts)
1108{ 946{
1109 struct trace_array *tr = iter->tr; 947 struct ring_buffer *buffer = iter->tr->buffer;
1110 struct trace_entry *ent, *next = NULL; 948 struct trace_entry *ent, *next = NULL;
949 u64 next_ts = 0, ts;
1111 int next_cpu = -1; 950 int next_cpu = -1;
1112 int cpu; 951 int cpu;
1113 952
1114 for_each_tracing_cpu(cpu) { 953 for_each_tracing_cpu(cpu) {
1115 if (!head_page(tr->data[cpu])) 954
955 if (ring_buffer_empty_cpu(buffer, cpu))
1116 continue; 956 continue;
1117 ent = trace_entry_idx(tr, tr->data[cpu], iter, cpu); 957
958 ent = peek_next_entry(iter, cpu, &ts);
959
1118 /* 960 /*
1119 * Pick the entry with the smallest timestamp: 961 * Pick the entry with the smallest timestamp:
1120 */ 962 */
1121 if (ent && (!next || ent->t < next->t)) { 963 if (ent && (!next || ts < next_ts)) {
1122 next = ent; 964 next = ent;
1123 next_cpu = cpu; 965 next_cpu = cpu;
966 next_ts = ts;
1124 } 967 }
1125 } 968 }
1126 969
1127 if (ent_cpu) 970 if (ent_cpu)
1128 *ent_cpu = next_cpu; 971 *ent_cpu = next_cpu;
1129 972
973 if (ent_ts)
974 *ent_ts = next_ts;
975
1130 return next; 976 return next;
1131} 977}
1132 978
1133static void trace_iterator_increment(struct trace_iterator *iter) 979/* Find the next real entry, without updating the iterator itself */
980static struct trace_entry *
981find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts)
1134{ 982{
1135 iter->idx++; 983 return __find_next_entry(iter, ent_cpu, ent_ts);
1136 iter->next_idx[iter->cpu]++;
1137 iter->next_page_idx[iter->cpu]++;
1138
1139 if (iter->next_page_idx[iter->cpu] >= ENTRIES_PER_PAGE) {
1140 struct trace_array_cpu *data = iter->tr->data[iter->cpu];
1141
1142 iter->next_page_idx[iter->cpu] = 0;
1143 iter->next_page[iter->cpu] =
1144 trace_next_list(data, iter->next_page[iter->cpu]);
1145 }
1146} 984}
1147 985
1148static void trace_consume(struct trace_iterator *iter) 986/* Find the next real entry, and increment the iterator to the next entry */
987static void *find_next_entry_inc(struct trace_iterator *iter)
1149{ 988{
1150 struct trace_array_cpu *data = iter->tr->data[iter->cpu]; 989 iter->ent = __find_next_entry(iter, &iter->cpu, &iter->ts);
1151 990
1152 data->trace_tail_idx++; 991 if (iter->ent)
1153 if (data->trace_tail_idx >= ENTRIES_PER_PAGE) { 992 trace_iterator_increment(iter, iter->cpu);
1154 data->trace_tail = trace_next_page(data, data->trace_tail);
1155 data->trace_tail_idx = 0;
1156 }
1157 993
1158 /* Check if we empty it, then reset the index */ 994 return iter->ent ? iter : NULL;
1159 if (data->trace_head == data->trace_tail &&
1160 data->trace_head_idx == data->trace_tail_idx)
1161 data->trace_idx = 0;
1162} 995}
1163 996
1164static void *find_next_entry_inc(struct trace_iterator *iter) 997static void trace_consume(struct trace_iterator *iter)
1165{ 998{
1166 struct trace_entry *next; 999 /* Don't allow ftrace to trace into the ring buffers */
1167 int next_cpu = -1; 1000 ftrace_disable_cpu();
1168 1001 ring_buffer_consume(iter->tr->buffer, iter->cpu, &iter->ts);
1169 next = find_next_entry(iter, &next_cpu); 1002 ftrace_enable_cpu();
1170
1171 iter->prev_ent = iter->ent;
1172 iter->prev_cpu = iter->cpu;
1173
1174 iter->ent = next;
1175 iter->cpu = next_cpu;
1176
1177 if (next)
1178 trace_iterator_increment(iter);
1179
1180 return next ? iter : NULL;
1181} 1003}
1182 1004
1183static void *s_next(struct seq_file *m, void *v, loff_t *pos) 1005static void *s_next(struct seq_file *m, void *v, loff_t *pos)
1184{ 1006{
1185 struct trace_iterator *iter = m->private; 1007 struct trace_iterator *iter = m->private;
1186 void *last_ent = iter->ent;
1187 int i = (int)*pos; 1008 int i = (int)*pos;
1188 void *ent; 1009 void *ent;
1189 1010
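
__find_next_entry() above is a small k-way merge: it peeks at the head of every non-empty per-CPU buffer and picks the entry with the smallest timestamp, so the seq_file output stays globally time-ordered even though each CPU records into its own buffer. A stripped-down sketch of the selection loop, with peek_cpu() standing in for peek_next_entry():

	struct trace_entry *ent, *next = NULL;
	u64 ts, next_ts = 0;
	int cpu, next_cpu = -1;

	for_each_tracing_cpu(cpu) {
		ent = peek_cpu(cpu, &ts);		/* hypothetical per-CPU peek */
		if (ent && (!next || ts < next_ts)) {
			next = ent;
			next_ts = ts;
			next_cpu = cpu;
		}
	}
	/* `next` (from `next_cpu` at `next_ts`) is the oldest pending entry. */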
@@ -1203,9 +1024,6 @@ static void *s_next(struct seq_file *m, void *v, loff_t *pos)
1203 1024
1204 iter->pos = *pos; 1025 iter->pos = *pos;
1205 1026
1206 if (last_ent && !ent)
1207 seq_puts(m, "\n\nvim:ft=help\n");
1208
1209 return ent; 1027 return ent;
1210} 1028}
1211 1029
@@ -1214,7 +1032,7 @@ static void *s_start(struct seq_file *m, loff_t *pos)
1214 struct trace_iterator *iter = m->private; 1032 struct trace_iterator *iter = m->private;
1215 void *p = NULL; 1033 void *p = NULL;
1216 loff_t l = 0; 1034 loff_t l = 0;
1217 int i; 1035 int cpu;
1218 1036
1219 mutex_lock(&trace_types_lock); 1037 mutex_lock(&trace_types_lock);
1220 1038
@@ -1233,14 +1051,15 @@ static void *s_start(struct seq_file *m, loff_t *pos)
1233 iter->ent = NULL; 1051 iter->ent = NULL;
1234 iter->cpu = 0; 1052 iter->cpu = 0;
1235 iter->idx = -1; 1053 iter->idx = -1;
1236 iter->prev_ent = NULL;
1237 iter->prev_cpu = -1;
1238 1054
1239 for_each_tracing_cpu(i) { 1055 ftrace_disable_cpu();
1240 iter->next_idx[i] = 0; 1056
1241 iter->next_page[i] = NULL; 1057 for_each_tracing_cpu(cpu) {
1058 ring_buffer_iter_reset(iter->buffer_iter[cpu]);
1242 } 1059 }
1243 1060
1061 ftrace_enable_cpu();
1062
1244 for (p = iter; p && l < *pos; p = s_next(m, p, &l)) 1063 for (p = iter; p && l < *pos; p = s_next(m, p, &l))
1245 ; 1064 ;
1246 1065
@@ -1334,21 +1153,21 @@ seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags)
1334 1153
1335static void print_lat_help_header(struct seq_file *m) 1154static void print_lat_help_header(struct seq_file *m)
1336{ 1155{
1337 seq_puts(m, "# _------=> CPU# \n"); 1156 seq_puts(m, "# _------=> CPU# \n");
1338 seq_puts(m, "# / _-----=> irqs-off \n"); 1157 seq_puts(m, "# / _-----=> irqs-off \n");
1339 seq_puts(m, "# | / _----=> need-resched \n"); 1158 seq_puts(m, "# | / _----=> need-resched \n");
1340 seq_puts(m, "# || / _---=> hardirq/softirq \n"); 1159 seq_puts(m, "# || / _---=> hardirq/softirq \n");
1341 seq_puts(m, "# ||| / _--=> preempt-depth \n"); 1160 seq_puts(m, "# ||| / _--=> preempt-depth \n");
1342 seq_puts(m, "# |||| / \n"); 1161 seq_puts(m, "# |||| / \n");
1343 seq_puts(m, "# ||||| delay \n"); 1162 seq_puts(m, "# ||||| delay \n");
1344 seq_puts(m, "# cmd pid ||||| time | caller \n"); 1163 seq_puts(m, "# cmd pid ||||| time | caller \n");
1345 seq_puts(m, "# \\ / ||||| \\ | / \n"); 1164 seq_puts(m, "# \\ / ||||| \\ | / \n");
1346} 1165}
1347 1166
1348static void print_func_help_header(struct seq_file *m) 1167static void print_func_help_header(struct seq_file *m)
1349{ 1168{
1350 seq_puts(m, "# TASK-PID CPU# TIMESTAMP FUNCTION\n"); 1169 seq_puts(m, "# TASK-PID CPU# TIMESTAMP FUNCTION\n");
1351 seq_puts(m, "# | | | | |\n"); 1170 seq_puts(m, "# | | | | |\n");
1352} 1171}
1353 1172
1354 1173
@@ -1359,23 +1178,16 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter)
1359 struct trace_array *tr = iter->tr; 1178 struct trace_array *tr = iter->tr;
1360 struct trace_array_cpu *data = tr->data[tr->cpu]; 1179 struct trace_array_cpu *data = tr->data[tr->cpu];
1361 struct tracer *type = current_trace; 1180 struct tracer *type = current_trace;
1362 unsigned long total = 0; 1181 unsigned long total;
1363 unsigned long entries = 0; 1182 unsigned long entries;
1364 int cpu;
1365 const char *name = "preemption"; 1183 const char *name = "preemption";
1366 1184
1367 if (type) 1185 if (type)
1368 name = type->name; 1186 name = type->name;
1369 1187
1370 for_each_tracing_cpu(cpu) { 1188 entries = ring_buffer_entries(iter->tr->buffer);
1371 if (head_page(tr->data[cpu])) { 1189 total = entries +
1372 total += tr->data[cpu]->trace_idx; 1190 ring_buffer_overruns(iter->tr->buffer);
1373 if (tr->data[cpu]->trace_idx > tr->entries)
1374 entries += tr->entries;
1375 else
1376 entries += tr->data[cpu]->trace_idx;
1377 }
1378 }
1379 1191
1380 seq_printf(m, "%s latency trace v1.1.5 on %s\n", 1192 seq_printf(m, "%s latency trace v1.1.5 on %s\n",
1381 name, UTS_RELEASE); 1193 name, UTS_RELEASE);
@@ -1432,7 +1244,7 @@ lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu)
1432 comm = trace_find_cmdline(entry->pid); 1244 comm = trace_find_cmdline(entry->pid);
1433 1245
1434 trace_seq_printf(s, "%8.8s-%-5d ", comm, entry->pid); 1246 trace_seq_printf(s, "%8.8s-%-5d ", comm, entry->pid);
1435 trace_seq_printf(s, "%d", cpu); 1247 trace_seq_printf(s, "%3d", cpu);
1436 trace_seq_printf(s, "%c%c", 1248 trace_seq_printf(s, "%c%c",
1437 (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : '.', 1249 (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : '.',
1438 ((entry->flags & TRACE_FLAG_NEED_RESCHED) ? 'N' : '.')); 1250 ((entry->flags & TRACE_FLAG_NEED_RESCHED) ? 'N' : '.'));
@@ -1461,7 +1273,7 @@ lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu)
1461unsigned long preempt_mark_thresh = 100; 1273unsigned long preempt_mark_thresh = 100;
1462 1274
1463static void 1275static void
1464lat_print_timestamp(struct trace_seq *s, unsigned long long abs_usecs, 1276lat_print_timestamp(struct trace_seq *s, u64 abs_usecs,
1465 unsigned long rel_usecs) 1277 unsigned long rel_usecs)
1466{ 1278{
1467 trace_seq_printf(s, " %4lldus", abs_usecs); 1279 trace_seq_printf(s, " %4lldus", abs_usecs);
@@ -1475,34 +1287,76 @@ lat_print_timestamp(struct trace_seq *s, unsigned long long abs_usecs,
1475 1287
1476static const char state_to_char[] = TASK_STATE_TO_CHAR_STR; 1288static const char state_to_char[] = TASK_STATE_TO_CHAR_STR;
1477 1289
1478static int 1290/*
1291 * The message is supposed to contain an ending newline.
1292 * If the printing stops prematurely, try to add a newline of our own.
1293 */
1294void trace_seq_print_cont(struct trace_seq *s, struct trace_iterator *iter)
1295{
1296 struct trace_entry *ent;
1297 struct trace_field_cont *cont;
1298 bool ok = true;
1299
1300 ent = peek_next_entry(iter, iter->cpu, NULL);
1301 if (!ent || ent->type != TRACE_CONT) {
1302 trace_seq_putc(s, '\n');
1303 return;
1304 }
1305
1306 do {
1307 cont = (struct trace_field_cont *)ent;
1308 if (ok)
1309 ok = (trace_seq_printf(s, "%s", cont->buf) > 0);
1310
1311 ftrace_disable_cpu();
1312
1313 if (iter->buffer_iter[iter->cpu])
1314 ring_buffer_read(iter->buffer_iter[iter->cpu], NULL);
1315 else
1316 ring_buffer_consume(iter->tr->buffer, iter->cpu, NULL);
1317
1318 ftrace_enable_cpu();
1319
1320 ent = peek_next_entry(iter, iter->cpu, NULL);
1321 } while (ent && ent->type == TRACE_CONT);
1322
1323 if (!ok)
1324 trace_seq_putc(s, '\n');
1325}
1326
1327static enum print_line_t
1479print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu) 1328print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu)
1480{ 1329{
1481 struct trace_seq *s = &iter->seq; 1330 struct trace_seq *s = &iter->seq;
1482 unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); 1331 unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK);
1483 struct trace_entry *next_entry = find_next_entry(iter, NULL); 1332 struct trace_entry *next_entry;
1484 unsigned long verbose = (trace_flags & TRACE_ITER_VERBOSE); 1333 unsigned long verbose = (trace_flags & TRACE_ITER_VERBOSE);
1485 struct trace_entry *entry = iter->ent; 1334 struct trace_entry *entry = iter->ent;
1486 unsigned long abs_usecs; 1335 unsigned long abs_usecs;
1487 unsigned long rel_usecs; 1336 unsigned long rel_usecs;
1337 u64 next_ts;
1488 char *comm; 1338 char *comm;
1489 int S, T; 1339 int S, T;
1490 int i; 1340 int i;
1491 unsigned state; 1341 unsigned state;
1492 1342
1343 if (entry->type == TRACE_CONT)
1344 return TRACE_TYPE_HANDLED;
1345
1346 next_entry = find_next_entry(iter, NULL, &next_ts);
1493 if (!next_entry) 1347 if (!next_entry)
1494 next_entry = entry; 1348 next_ts = iter->ts;
1495 rel_usecs = ns2usecs(next_entry->t - entry->t); 1349 rel_usecs = ns2usecs(next_ts - iter->ts);
1496 abs_usecs = ns2usecs(entry->t - iter->tr->time_start); 1350 abs_usecs = ns2usecs(iter->ts - iter->tr->time_start);
1497 1351
1498 if (verbose) { 1352 if (verbose) {
1499 comm = trace_find_cmdline(entry->pid); 1353 comm = trace_find_cmdline(entry->pid);
1500 trace_seq_printf(s, "%16s %5d %d %d %08x %08x [%08lx]" 1354 trace_seq_printf(s, "%16s %5d %3d %d %08x %08x [%08lx]"
1501 " %ld.%03ldms (+%ld.%03ldms): ", 1355 " %ld.%03ldms (+%ld.%03ldms): ",
1502 comm, 1356 comm,
1503 entry->pid, cpu, entry->flags, 1357 entry->pid, cpu, entry->flags,
1504 entry->preempt_count, trace_idx, 1358 entry->preempt_count, trace_idx,
1505 ns2usecs(entry->t), 1359 ns2usecs(iter->ts),
1506 abs_usecs/1000, 1360 abs_usecs/1000,
1507 abs_usecs % 1000, rel_usecs/1000, 1361 abs_usecs % 1000, rel_usecs/1000,
1508 rel_usecs % 1000); 1362 rel_usecs % 1000);
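
trace_assign_type(), used throughout the printers below, is how the generic struct trace_entry header is turned back into the concrete per-type record: each specific entry (ftrace_entry, ctx_switch_entry, stack_entry, ...) embeds the header as its first member named ent, so the assignment amounts to a downcast that is valid once entry->type has been checked. A minimal illustration of the layout this relies on; the header field names here are illustrative, while ent/ip/parent_ip match the patch:

	struct trace_entry {			/* common header; fields illustrative */
		unsigned char	type;
		unsigned char	flags;
		unsigned char	preempt_count;
		int		pid;
	};

	struct ftrace_entry {			/* concrete TRACE_FN record */
		struct trace_entry	ent;	/* header must be the first member */
		unsigned long		ip;
		unsigned long		parent_ip;
	};

	static void handle_fn_entry(struct trace_entry *entry)
	{
		/* caller has already checked entry->type == TRACE_FN */
		struct ftrace_entry *field = (struct ftrace_entry *)entry;
		/* field->ip and field->parent_ip are now accessible */
	}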
@@ -1511,52 +1365,85 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu)
1511 lat_print_timestamp(s, abs_usecs, rel_usecs); 1365 lat_print_timestamp(s, abs_usecs, rel_usecs);
1512 } 1366 }
1513 switch (entry->type) { 1367 switch (entry->type) {
1514 case TRACE_FN: 1368 case TRACE_FN: {
1515 seq_print_ip_sym(s, entry->fn.ip, sym_flags); 1369 struct ftrace_entry *field;
1370
1371 trace_assign_type(field, entry);
1372
1373 seq_print_ip_sym(s, field->ip, sym_flags);
1516 trace_seq_puts(s, " ("); 1374 trace_seq_puts(s, " (");
1517 if (kretprobed(entry->fn.parent_ip)) 1375 if (kretprobed(field->parent_ip))
1518 trace_seq_puts(s, KRETPROBE_MSG); 1376 trace_seq_puts(s, KRETPROBE_MSG);
1519 else 1377 else
1520 seq_print_ip_sym(s, entry->fn.parent_ip, sym_flags); 1378 seq_print_ip_sym(s, field->parent_ip, sym_flags);
1521 trace_seq_puts(s, ")\n"); 1379 trace_seq_puts(s, ")\n");
1522 break; 1380 break;
1381 }
1523 case TRACE_CTX: 1382 case TRACE_CTX:
1524 case TRACE_WAKE: 1383 case TRACE_WAKE: {
1525 T = entry->ctx.next_state < sizeof(state_to_char) ? 1384 struct ctx_switch_entry *field;
1526 state_to_char[entry->ctx.next_state] : 'X'; 1385
1386 trace_assign_type(field, entry);
1527 1387
1528 state = entry->ctx.prev_state ? __ffs(entry->ctx.prev_state) + 1 : 0; 1388 T = field->next_state < sizeof(state_to_char) ?
1389 state_to_char[field->next_state] : 'X';
1390
1391 state = field->prev_state ?
1392 __ffs(field->prev_state) + 1 : 0;
1529 S = state < sizeof(state_to_char) - 1 ? state_to_char[state] : 'X'; 1393 S = state < sizeof(state_to_char) - 1 ? state_to_char[state] : 'X';
1530 comm = trace_find_cmdline(entry->ctx.next_pid); 1394 comm = trace_find_cmdline(field->next_pid);
1531 trace_seq_printf(s, " %5d:%3d:%c %s %5d:%3d:%c %s\n", 1395 trace_seq_printf(s, " %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n",
1532 entry->ctx.prev_pid, 1396 field->prev_pid,
1533 entry->ctx.prev_prio, 1397 field->prev_prio,
1534 S, entry->type == TRACE_CTX ? "==>" : " +", 1398 S, entry->type == TRACE_CTX ? "==>" : " +",
1535 entry->ctx.next_pid, 1399 field->next_cpu,
1536 entry->ctx.next_prio, 1400 field->next_pid,
1401 field->next_prio,
1537 T, comm); 1402 T, comm);
1538 break; 1403 break;
1539 case TRACE_SPECIAL: 1404 }
1405 case TRACE_SPECIAL: {
1406 struct special_entry *field;
1407
1408 trace_assign_type(field, entry);
1409
1540 trace_seq_printf(s, "# %ld %ld %ld\n", 1410 trace_seq_printf(s, "# %ld %ld %ld\n",
1541 entry->special.arg1, 1411 field->arg1,
1542 entry->special.arg2, 1412 field->arg2,
1543 entry->special.arg3); 1413 field->arg3);
1544 break; 1414 break;
1545 case TRACE_STACK: 1415 }
1416 case TRACE_STACK: {
1417 struct stack_entry *field;
1418
1419 trace_assign_type(field, entry);
1420
1546 for (i = 0; i < FTRACE_STACK_ENTRIES; i++) { 1421 for (i = 0; i < FTRACE_STACK_ENTRIES; i++) {
1547 if (i) 1422 if (i)
1548 trace_seq_puts(s, " <= "); 1423 trace_seq_puts(s, " <= ");
1549 seq_print_ip_sym(s, entry->stack.caller[i], sym_flags); 1424 seq_print_ip_sym(s, field->caller[i], sym_flags);
1550 } 1425 }
1551 trace_seq_puts(s, "\n"); 1426 trace_seq_puts(s, "\n");
1552 break; 1427 break;
1428 }
1429 case TRACE_PRINT: {
1430 struct print_entry *field;
1431
1432 trace_assign_type(field, entry);
1433
1434 seq_print_ip_sym(s, field->ip, sym_flags);
1435 trace_seq_printf(s, ": %s", field->buf);
1436 if (entry->flags & TRACE_FLAG_CONT)
1437 trace_seq_print_cont(s, iter);
1438 break;
1439 }
1553 default: 1440 default:
1554 trace_seq_printf(s, "Unknown type %d\n", entry->type); 1441 trace_seq_printf(s, "Unknown type %d\n", entry->type);
1555 } 1442 }
1556 return 1; 1443 return TRACE_TYPE_HANDLED;
1557} 1444}
1558 1445
1559static int print_trace_fmt(struct trace_iterator *iter) 1446static enum print_line_t print_trace_fmt(struct trace_iterator *iter)
1560{ 1447{
1561 struct trace_seq *s = &iter->seq; 1448 struct trace_seq *s = &iter->seq;
1562 unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); 1449 unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK);
@@ -1571,90 +1458,126 @@ static int print_trace_fmt(struct trace_iterator *iter)
1571 1458
1572 entry = iter->ent; 1459 entry = iter->ent;
1573 1460
1461 if (entry->type == TRACE_CONT)
1462 return TRACE_TYPE_HANDLED;
1463
1574 comm = trace_find_cmdline(iter->ent->pid); 1464 comm = trace_find_cmdline(iter->ent->pid);
1575 1465
1576 t = ns2usecs(entry->t); 1466 t = ns2usecs(iter->ts);
1577 usec_rem = do_div(t, 1000000ULL); 1467 usec_rem = do_div(t, 1000000ULL);
1578 secs = (unsigned long)t; 1468 secs = (unsigned long)t;
1579 1469
1580 ret = trace_seq_printf(s, "%16s-%-5d ", comm, entry->pid); 1470 ret = trace_seq_printf(s, "%16s-%-5d ", comm, entry->pid);
1581 if (!ret) 1471 if (!ret)
1582 return 0; 1472 return TRACE_TYPE_PARTIAL_LINE;
1583 ret = trace_seq_printf(s, "[%02d] ", iter->cpu); 1473 ret = trace_seq_printf(s, "[%03d] ", iter->cpu);
1584 if (!ret) 1474 if (!ret)
1585 return 0; 1475 return TRACE_TYPE_PARTIAL_LINE;
1586 ret = trace_seq_printf(s, "%5lu.%06lu: ", secs, usec_rem); 1476 ret = trace_seq_printf(s, "%5lu.%06lu: ", secs, usec_rem);
1587 if (!ret) 1477 if (!ret)
1588 return 0; 1478 return TRACE_TYPE_PARTIAL_LINE;
1589 1479
1590 switch (entry->type) { 1480 switch (entry->type) {
1591 case TRACE_FN: 1481 case TRACE_FN: {
1592 ret = seq_print_ip_sym(s, entry->fn.ip, sym_flags); 1482 struct ftrace_entry *field;
1483
1484 trace_assign_type(field, entry);
1485
1486 ret = seq_print_ip_sym(s, field->ip, sym_flags);
1593 if (!ret) 1487 if (!ret)
1594 return 0; 1488 return TRACE_TYPE_PARTIAL_LINE;
1595 if ((sym_flags & TRACE_ITER_PRINT_PARENT) && 1489 if ((sym_flags & TRACE_ITER_PRINT_PARENT) &&
1596 entry->fn.parent_ip) { 1490 field->parent_ip) {
1597 ret = trace_seq_printf(s, " <-"); 1491 ret = trace_seq_printf(s, " <-");
1598 if (!ret) 1492 if (!ret)
1599 return 0; 1493 return TRACE_TYPE_PARTIAL_LINE;
1600 if (kretprobed(entry->fn.parent_ip)) 1494 if (kretprobed(field->parent_ip))
1601 ret = trace_seq_puts(s, KRETPROBE_MSG); 1495 ret = trace_seq_puts(s, KRETPROBE_MSG);
1602 else 1496 else
1603 ret = seq_print_ip_sym(s, entry->fn.parent_ip, 1497 ret = seq_print_ip_sym(s,
1498 field->parent_ip,
1604 sym_flags); 1499 sym_flags);
1605 if (!ret) 1500 if (!ret)
1606 return 0; 1501 return TRACE_TYPE_PARTIAL_LINE;
1607 } 1502 }
1608 ret = trace_seq_printf(s, "\n"); 1503 ret = trace_seq_printf(s, "\n");
1609 if (!ret) 1504 if (!ret)
1610 return 0; 1505 return TRACE_TYPE_PARTIAL_LINE;
1611 break; 1506 break;
1507 }
1612 case TRACE_CTX: 1508 case TRACE_CTX:
1613 case TRACE_WAKE: 1509 case TRACE_WAKE: {
1614 S = entry->ctx.prev_state < sizeof(state_to_char) ? 1510 struct ctx_switch_entry *field;
1615 state_to_char[entry->ctx.prev_state] : 'X'; 1511
1616 T = entry->ctx.next_state < sizeof(state_to_char) ? 1512 trace_assign_type(field, entry);
1617 state_to_char[entry->ctx.next_state] : 'X'; 1513
1618 ret = trace_seq_printf(s, " %5d:%3d:%c %s %5d:%3d:%c\n", 1514 S = field->prev_state < sizeof(state_to_char) ?
1619 entry->ctx.prev_pid, 1515 state_to_char[field->prev_state] : 'X';
1620 entry->ctx.prev_prio, 1516 T = field->next_state < sizeof(state_to_char) ?
1517 state_to_char[field->next_state] : 'X';
1518 ret = trace_seq_printf(s, " %5d:%3d:%c %s [%03d] %5d:%3d:%c\n",
1519 field->prev_pid,
1520 field->prev_prio,
1621 S, 1521 S,
1622 entry->type == TRACE_CTX ? "==>" : " +", 1522 entry->type == TRACE_CTX ? "==>" : " +",
1623 entry->ctx.next_pid, 1523 field->next_cpu,
1624 entry->ctx.next_prio, 1524 field->next_pid,
1525 field->next_prio,
1625 T); 1526 T);
1626 if (!ret) 1527 if (!ret)
1627 return 0; 1528 return TRACE_TYPE_PARTIAL_LINE;
1628 break; 1529 break;
1629 case TRACE_SPECIAL: 1530 }
1531 case TRACE_SPECIAL: {
1532 struct special_entry *field;
1533
1534 trace_assign_type(field, entry);
1535
1630 ret = trace_seq_printf(s, "# %ld %ld %ld\n", 1536 ret = trace_seq_printf(s, "# %ld %ld %ld\n",
1631 entry->special.arg1, 1537 field->arg1,
1632 entry->special.arg2, 1538 field->arg2,
1633 entry->special.arg3); 1539 field->arg3);
1634 if (!ret) 1540 if (!ret)
1635 return 0; 1541 return TRACE_TYPE_PARTIAL_LINE;
1636 break; 1542 break;
1637 case TRACE_STACK: 1543 }
1544 case TRACE_STACK: {
1545 struct stack_entry *field;
1546
1547 trace_assign_type(field, entry);
1548
1638 for (i = 0; i < FTRACE_STACK_ENTRIES; i++) { 1549 for (i = 0; i < FTRACE_STACK_ENTRIES; i++) {
1639 if (i) { 1550 if (i) {
1640 ret = trace_seq_puts(s, " <= "); 1551 ret = trace_seq_puts(s, " <= ");
1641 if (!ret) 1552 if (!ret)
1642 return 0; 1553 return TRACE_TYPE_PARTIAL_LINE;
1643 } 1554 }
1644 ret = seq_print_ip_sym(s, entry->stack.caller[i], 1555 ret = seq_print_ip_sym(s, field->caller[i],
1645 sym_flags); 1556 sym_flags);
1646 if (!ret) 1557 if (!ret)
1647 return 0; 1558 return TRACE_TYPE_PARTIAL_LINE;
1648 } 1559 }
1649 ret = trace_seq_puts(s, "\n"); 1560 ret = trace_seq_puts(s, "\n");
1650 if (!ret) 1561 if (!ret)
1651 return 0; 1562 return TRACE_TYPE_PARTIAL_LINE;
1652 break; 1563 break;
1653 } 1564 }
1654 return 1; 1565 case TRACE_PRINT: {
1566 struct print_entry *field;
1567
1568 trace_assign_type(field, entry);
1569
1570 seq_print_ip_sym(s, field->ip, sym_flags);
1571 trace_seq_printf(s, ": %s", field->buf);
1572 if (entry->flags & TRACE_FLAG_CONT)
1573 trace_seq_print_cont(s, iter);
1574 break;
1575 }
1576 }
1577 return TRACE_TYPE_HANDLED;
1655} 1578}
1656 1579
1657static int print_raw_fmt(struct trace_iterator *iter) 1580static enum print_line_t print_raw_fmt(struct trace_iterator *iter)
1658{ 1581{
1659 struct trace_seq *s = &iter->seq; 1582 struct trace_seq *s = &iter->seq;
1660 struct trace_entry *entry; 1583 struct trace_entry *entry;
@@ -1663,47 +1586,77 @@ static int print_raw_fmt(struct trace_iterator *iter)
1663 1586
1664 entry = iter->ent; 1587 entry = iter->ent;
1665 1588
1589 if (entry->type == TRACE_CONT)
1590 return TRACE_TYPE_HANDLED;
1591
1666 ret = trace_seq_printf(s, "%d %d %llu ", 1592 ret = trace_seq_printf(s, "%d %d %llu ",
1667 entry->pid, iter->cpu, entry->t); 1593 entry->pid, iter->cpu, iter->ts);
1668 if (!ret) 1594 if (!ret)
1669 return 0; 1595 return TRACE_TYPE_PARTIAL_LINE;
1670 1596
1671 switch (entry->type) { 1597 switch (entry->type) {
1672 case TRACE_FN: 1598 case TRACE_FN: {
1599 struct ftrace_entry *field;
1600
1601 trace_assign_type(field, entry);
1602
1673 ret = trace_seq_printf(s, "%x %x\n", 1603 ret = trace_seq_printf(s, "%x %x\n",
1674 entry->fn.ip, entry->fn.parent_ip); 1604 field->ip,
1605 field->parent_ip);
1675 if (!ret) 1606 if (!ret)
1676 return 0; 1607 return TRACE_TYPE_PARTIAL_LINE;
1677 break; 1608 break;
1609 }
1678 case TRACE_CTX: 1610 case TRACE_CTX:
1679 case TRACE_WAKE: 1611 case TRACE_WAKE: {
1680 S = entry->ctx.prev_state < sizeof(state_to_char) ? 1612 struct ctx_switch_entry *field;
1681 state_to_char[entry->ctx.prev_state] : 'X'; 1613
1682 T = entry->ctx.next_state < sizeof(state_to_char) ? 1614 trace_assign_type(field, entry);
1683 state_to_char[entry->ctx.next_state] : 'X'; 1615
1616 S = field->prev_state < sizeof(state_to_char) ?
1617 state_to_char[field->prev_state] : 'X';
1618 T = field->next_state < sizeof(state_to_char) ?
1619 state_to_char[field->next_state] : 'X';
1684 if (entry->type == TRACE_WAKE) 1620 if (entry->type == TRACE_WAKE)
1685 S = '+'; 1621 S = '+';
1686 ret = trace_seq_printf(s, "%d %d %c %d %d %c\n", 1622 ret = trace_seq_printf(s, "%d %d %c %d %d %d %c\n",
1687 entry->ctx.prev_pid, 1623 field->prev_pid,
1688 entry->ctx.prev_prio, 1624 field->prev_prio,
1689 S, 1625 S,
1690 entry->ctx.next_pid, 1626 field->next_cpu,
1691 entry->ctx.next_prio, 1627 field->next_pid,
1628 field->next_prio,
1692 T); 1629 T);
1693 if (!ret) 1630 if (!ret)
1694 return 0; 1631 return TRACE_TYPE_PARTIAL_LINE;
1695 break; 1632 break;
1633 }
1696 case TRACE_SPECIAL: 1634 case TRACE_SPECIAL:
1697 case TRACE_STACK: 1635 case TRACE_STACK: {
1636 struct special_entry *field;
1637
1638 trace_assign_type(field, entry);
1639
1698 ret = trace_seq_printf(s, "# %ld %ld %ld\n", 1640 ret = trace_seq_printf(s, "# %ld %ld %ld\n",
1699 entry->special.arg1, 1641 field->arg1,
1700 entry->special.arg2, 1642 field->arg2,
1701 entry->special.arg3); 1643 field->arg3);
1702 if (!ret) 1644 if (!ret)
1703 return 0; 1645 return TRACE_TYPE_PARTIAL_LINE;
1704 break; 1646 break;
1705 } 1647 }
1706 return 1; 1648 case TRACE_PRINT: {
1649 struct print_entry *field;
1650
1651 trace_assign_type(field, entry);
1652
1653 trace_seq_printf(s, "# %lx %s", field->ip, field->buf);
1654 if (entry->flags & TRACE_FLAG_CONT)
1655 trace_seq_print_cont(s, iter);
1656 break;
1657 }
1658 }
1659 return TRACE_TYPE_HANDLED;
1707} 1660}
1708 1661
1709#define SEQ_PUT_FIELD_RET(s, x) \ 1662#define SEQ_PUT_FIELD_RET(s, x) \
@@ -1714,11 +1667,12 @@ do { \
1714 1667
1715#define SEQ_PUT_HEX_FIELD_RET(s, x) \ 1668#define SEQ_PUT_HEX_FIELD_RET(s, x) \
1716do { \ 1669do { \
1670 BUILD_BUG_ON(sizeof(x) > MAX_MEMHEX_BYTES); \
1717 if (!trace_seq_putmem_hex(s, &(x), sizeof(x))) \ 1671 if (!trace_seq_putmem_hex(s, &(x), sizeof(x))) \
1718 return 0; \ 1672 return 0; \
1719} while (0) 1673} while (0)
1720 1674
1721static int print_hex_fmt(struct trace_iterator *iter) 1675static enum print_line_t print_hex_fmt(struct trace_iterator *iter)
1722{ 1676{
1723 struct trace_seq *s = &iter->seq; 1677 struct trace_seq *s = &iter->seq;
1724 unsigned char newline = '\n'; 1678 unsigned char newline = '\n';
@@ -1727,97 +1681,139 @@ static int print_hex_fmt(struct trace_iterator *iter)
1727 1681
1728 entry = iter->ent; 1682 entry = iter->ent;
1729 1683
1684 if (entry->type == TRACE_CONT)
1685 return TRACE_TYPE_HANDLED;
1686
1730 SEQ_PUT_HEX_FIELD_RET(s, entry->pid); 1687 SEQ_PUT_HEX_FIELD_RET(s, entry->pid);
1731 SEQ_PUT_HEX_FIELD_RET(s, iter->cpu); 1688 SEQ_PUT_HEX_FIELD_RET(s, iter->cpu);
1732 SEQ_PUT_HEX_FIELD_RET(s, entry->t); 1689 SEQ_PUT_HEX_FIELD_RET(s, iter->ts);
1733 1690
1734 switch (entry->type) { 1691 switch (entry->type) {
1735 case TRACE_FN: 1692 case TRACE_FN: {
1736 SEQ_PUT_HEX_FIELD_RET(s, entry->fn.ip); 1693 struct ftrace_entry *field;
1737 SEQ_PUT_HEX_FIELD_RET(s, entry->fn.parent_ip); 1694
1695 trace_assign_type(field, entry);
1696
1697 SEQ_PUT_HEX_FIELD_RET(s, field->ip);
1698 SEQ_PUT_HEX_FIELD_RET(s, field->parent_ip);
1738 break; 1699 break;
1700 }
1739 case TRACE_CTX: 1701 case TRACE_CTX:
1740 case TRACE_WAKE: 1702 case TRACE_WAKE: {
1741 S = entry->ctx.prev_state < sizeof(state_to_char) ? 1703 struct ctx_switch_entry *field;
1742 state_to_char[entry->ctx.prev_state] : 'X'; 1704
1743 T = entry->ctx.next_state < sizeof(state_to_char) ? 1705 trace_assign_type(field, entry);
1744 state_to_char[entry->ctx.next_state] : 'X'; 1706
1707 S = field->prev_state < sizeof(state_to_char) ?
1708 state_to_char[field->prev_state] : 'X';
1709 T = field->next_state < sizeof(state_to_char) ?
1710 state_to_char[field->next_state] : 'X';
1745 if (entry->type == TRACE_WAKE) 1711 if (entry->type == TRACE_WAKE)
1746 S = '+'; 1712 S = '+';
1747 SEQ_PUT_HEX_FIELD_RET(s, entry->ctx.prev_pid); 1713 SEQ_PUT_HEX_FIELD_RET(s, field->prev_pid);
1748 SEQ_PUT_HEX_FIELD_RET(s, entry->ctx.prev_prio); 1714 SEQ_PUT_HEX_FIELD_RET(s, field->prev_prio);
1749 SEQ_PUT_HEX_FIELD_RET(s, S); 1715 SEQ_PUT_HEX_FIELD_RET(s, S);
1750 SEQ_PUT_HEX_FIELD_RET(s, entry->ctx.next_pid); 1716 SEQ_PUT_HEX_FIELD_RET(s, field->next_cpu);
1751 SEQ_PUT_HEX_FIELD_RET(s, entry->ctx.next_prio); 1717 SEQ_PUT_HEX_FIELD_RET(s, field->next_pid);
1752 SEQ_PUT_HEX_FIELD_RET(s, entry->fn.parent_ip); 1718 SEQ_PUT_HEX_FIELD_RET(s, field->next_prio);
1753 SEQ_PUT_HEX_FIELD_RET(s, T); 1719 SEQ_PUT_HEX_FIELD_RET(s, T);
1754 break; 1720 break;
1721 }
1755 case TRACE_SPECIAL: 1722 case TRACE_SPECIAL:
1756 case TRACE_STACK: 1723 case TRACE_STACK: {
1757 SEQ_PUT_HEX_FIELD_RET(s, entry->special.arg1); 1724 struct special_entry *field;
1758 SEQ_PUT_HEX_FIELD_RET(s, entry->special.arg2); 1725
1759 SEQ_PUT_HEX_FIELD_RET(s, entry->special.arg3); 1726 trace_assign_type(field, entry);
1727
1728 SEQ_PUT_HEX_FIELD_RET(s, field->arg1);
1729 SEQ_PUT_HEX_FIELD_RET(s, field->arg2);
1730 SEQ_PUT_HEX_FIELD_RET(s, field->arg3);
1760 break; 1731 break;
1761 } 1732 }
1733 }
1762 SEQ_PUT_FIELD_RET(s, newline); 1734 SEQ_PUT_FIELD_RET(s, newline);
1763 1735
1764 return 1; 1736 return TRACE_TYPE_HANDLED;
1765} 1737}
1766 1738
1767static int print_bin_fmt(struct trace_iterator *iter) 1739static enum print_line_t print_bin_fmt(struct trace_iterator *iter)
1768{ 1740{
1769 struct trace_seq *s = &iter->seq; 1741 struct trace_seq *s = &iter->seq;
1770 struct trace_entry *entry; 1742 struct trace_entry *entry;
1771 1743
1772 entry = iter->ent; 1744 entry = iter->ent;
1773 1745
1746 if (entry->type == TRACE_CONT)
1747 return TRACE_TYPE_HANDLED;
1748
1774 SEQ_PUT_FIELD_RET(s, entry->pid); 1749 SEQ_PUT_FIELD_RET(s, entry->pid);
1775 SEQ_PUT_FIELD_RET(s, entry->cpu); 1750 SEQ_PUT_FIELD_RET(s, iter->cpu);
1776 SEQ_PUT_FIELD_RET(s, entry->t); 1751 SEQ_PUT_FIELD_RET(s, iter->ts);
1777 1752
1778 switch (entry->type) { 1753 switch (entry->type) {
1779 case TRACE_FN: 1754 case TRACE_FN: {
1780 SEQ_PUT_FIELD_RET(s, entry->fn.ip); 1755 struct ftrace_entry *field;
1781 SEQ_PUT_FIELD_RET(s, entry->fn.parent_ip); 1756
1757 trace_assign_type(field, entry);
1758
1759 SEQ_PUT_FIELD_RET(s, field->ip);
1760 SEQ_PUT_FIELD_RET(s, field->parent_ip);
1782 break; 1761 break;
1783 case TRACE_CTX: 1762 }
1784 SEQ_PUT_FIELD_RET(s, entry->ctx.prev_pid); 1763 case TRACE_CTX: {
1785 SEQ_PUT_FIELD_RET(s, entry->ctx.prev_prio); 1764 struct ctx_switch_entry *field;
1786 SEQ_PUT_FIELD_RET(s, entry->ctx.prev_state); 1765
1787 SEQ_PUT_FIELD_RET(s, entry->ctx.next_pid); 1766 trace_assign_type(field, entry);
1788 SEQ_PUT_FIELD_RET(s, entry->ctx.next_prio); 1767
1789 SEQ_PUT_FIELD_RET(s, entry->ctx.next_state); 1768 SEQ_PUT_FIELD_RET(s, field->prev_pid);
1769 SEQ_PUT_FIELD_RET(s, field->prev_prio);
1770 SEQ_PUT_FIELD_RET(s, field->prev_state);
1771 SEQ_PUT_FIELD_RET(s, field->next_pid);
1772 SEQ_PUT_FIELD_RET(s, field->next_prio);
1773 SEQ_PUT_FIELD_RET(s, field->next_state);
1790 break; 1774 break;
1775 }
1791 case TRACE_SPECIAL: 1776 case TRACE_SPECIAL:
1792 case TRACE_STACK: 1777 case TRACE_STACK: {
1793 SEQ_PUT_FIELD_RET(s, entry->special.arg1); 1778 struct special_entry *field;
1794 SEQ_PUT_FIELD_RET(s, entry->special.arg2); 1779
1795 SEQ_PUT_FIELD_RET(s, entry->special.arg3); 1780 trace_assign_type(field, entry);
1781
1782 SEQ_PUT_FIELD_RET(s, field->arg1);
1783 SEQ_PUT_FIELD_RET(s, field->arg2);
1784 SEQ_PUT_FIELD_RET(s, field->arg3);
1796 break; 1785 break;
1797 } 1786 }
1787 }
1798 return 1; 1788 return 1;
1799} 1789}
1800 1790
1801static int trace_empty(struct trace_iterator *iter) 1791static int trace_empty(struct trace_iterator *iter)
1802{ 1792{
1803 struct trace_array_cpu *data;
1804 int cpu; 1793 int cpu;
1805 1794
1806 for_each_tracing_cpu(cpu) { 1795 for_each_tracing_cpu(cpu) {
1807 data = iter->tr->data[cpu]; 1796 if (iter->buffer_iter[cpu]) {
1808 1797 if (!ring_buffer_iter_empty(iter->buffer_iter[cpu]))
1809 if (head_page(data) && data->trace_idx && 1798 return 0;
1810 (data->trace_tail != data->trace_head || 1799 } else {
1811 data->trace_tail_idx != data->trace_head_idx)) 1800 if (!ring_buffer_empty_cpu(iter->tr->buffer, cpu))
1812 return 0; 1801 return 0;
1802 }
1813 } 1803 }
1804
1814 return 1; 1805 return 1;
1815} 1806}
1816 1807
1817static int print_trace_line(struct trace_iterator *iter) 1808static enum print_line_t print_trace_line(struct trace_iterator *iter)
1818{ 1809{
1819 if (iter->trace && iter->trace->print_line) 1810 enum print_line_t ret;
1820 return iter->trace->print_line(iter); 1811
1812 if (iter->trace && iter->trace->print_line) {
1813 ret = iter->trace->print_line(iter);
1814 if (ret != TRACE_TYPE_UNHANDLED)
1815 return ret;
1816 }
1821 1817
1822 if (trace_flags & TRACE_ITER_BIN) 1818 if (trace_flags & TRACE_ITER_BIN)
1823 return print_bin_fmt(iter); 1819 return print_bin_fmt(iter);
@@ -1873,6 +1869,8 @@ static struct trace_iterator *
1873__tracing_open(struct inode *inode, struct file *file, int *ret) 1869__tracing_open(struct inode *inode, struct file *file, int *ret)
1874{ 1870{
1875 struct trace_iterator *iter; 1871 struct trace_iterator *iter;
1872 struct seq_file *m;
1873 int cpu;
1876 1874
1877 if (tracing_disabled) { 1875 if (tracing_disabled) {
1878 *ret = -ENODEV; 1876 *ret = -ENODEV;
@@ -1893,28 +1891,45 @@ __tracing_open(struct inode *inode, struct file *file, int *ret)
1893 iter->trace = current_trace; 1891 iter->trace = current_trace;
1894 iter->pos = -1; 1892 iter->pos = -1;
1895 1893
1894 for_each_tracing_cpu(cpu) {
1895
1896 iter->buffer_iter[cpu] =
1897 ring_buffer_read_start(iter->tr->buffer, cpu);
1898
1899 if (!iter->buffer_iter[cpu])
1900 goto fail_buffer;
1901 }
1902
1896 /* TODO stop tracer */ 1903 /* TODO stop tracer */
1897 *ret = seq_open(file, &tracer_seq_ops); 1904 *ret = seq_open(file, &tracer_seq_ops);
1898 if (!*ret) { 1905 if (*ret)
1899 struct seq_file *m = file->private_data; 1906 goto fail_buffer;
1900 m->private = iter;
1901 1907
1902 /* stop the trace while dumping */ 1908 m = file->private_data;
1903 if (iter->tr->ctrl) { 1909 m->private = iter;
1904 tracer_enabled = 0;
1905 ftrace_function_enabled = 0;
1906 }
1907 1910
1908 if (iter->trace && iter->trace->open) 1911 /* stop the trace while dumping */
1909 iter->trace->open(iter); 1912 if (iter->tr->ctrl) {
1910 } else { 1913 tracer_enabled = 0;
1911 kfree(iter); 1914 ftrace_function_enabled = 0;
1912 iter = NULL;
1913 } 1915 }
1916
1917 if (iter->trace && iter->trace->open)
1918 iter->trace->open(iter);
1919
1914 mutex_unlock(&trace_types_lock); 1920 mutex_unlock(&trace_types_lock);
1915 1921
1916 out: 1922 out:
1917 return iter; 1923 return iter;
1924
1925 fail_buffer:
1926 for_each_tracing_cpu(cpu) {
1927 if (iter->buffer_iter[cpu])
1928 ring_buffer_read_finish(iter->buffer_iter[cpu]);
1929 }
1930 mutex_unlock(&trace_types_lock);
1931
1932 return ERR_PTR(-ENOMEM);
1918} 1933}
1919 1934
1920int tracing_open_generic(struct inode *inode, struct file *filp) 1935int tracing_open_generic(struct inode *inode, struct file *filp)
@@ -1930,8 +1945,14 @@ int tracing_release(struct inode *inode, struct file *file)
1930{ 1945{
1931 struct seq_file *m = (struct seq_file *)file->private_data; 1946 struct seq_file *m = (struct seq_file *)file->private_data;
1932 struct trace_iterator *iter = m->private; 1947 struct trace_iterator *iter = m->private;
1948 int cpu;
1933 1949
1934 mutex_lock(&trace_types_lock); 1950 mutex_lock(&trace_types_lock);
1951 for_each_tracing_cpu(cpu) {
1952 if (iter->buffer_iter[cpu])
1953 ring_buffer_read_finish(iter->buffer_iter[cpu]);
1954 }
1955
1935 if (iter->trace && iter->trace->close) 1956 if (iter->trace && iter->trace->close)
1936 iter->trace->close(iter); 1957 iter->trace->close(iter);
1937 1958
@@ -2356,9 +2377,11 @@ tracing_set_trace_write(struct file *filp, const char __user *ubuf,
2356 struct tracer *t; 2377 struct tracer *t;
2357 char buf[max_tracer_type_len+1]; 2378 char buf[max_tracer_type_len+1];
2358 int i; 2379 int i;
2380 size_t ret;
2359 2381
2360 if (cnt > max_tracer_type_len) 2382 if (cnt > max_tracer_type_len)
2361 cnt = max_tracer_type_len; 2383 cnt = max_tracer_type_len;
2384 ret = cnt;
2362 2385
2363 if (copy_from_user(&buf, ubuf, cnt)) 2386 if (copy_from_user(&buf, ubuf, cnt))
2364 return -EFAULT; 2387 return -EFAULT;
@@ -2374,7 +2397,11 @@ tracing_set_trace_write(struct file *filp, const char __user *ubuf,
2374 if (strcmp(t->name, buf) == 0) 2397 if (strcmp(t->name, buf) == 0)
2375 break; 2398 break;
2376 } 2399 }
2377 if (!t || t == current_trace) 2400 if (!t) {
2401 ret = -EINVAL;
2402 goto out;
2403 }
2404 if (t == current_trace)
2378 goto out; 2405 goto out;
2379 2406
2380 if (current_trace && current_trace->reset) 2407 if (current_trace && current_trace->reset)
@@ -2387,9 +2414,10 @@ tracing_set_trace_write(struct file *filp, const char __user *ubuf,
2387 out: 2414 out:
2388 mutex_unlock(&trace_types_lock); 2415 mutex_unlock(&trace_types_lock);
2389 2416
2390 filp->f_pos += cnt; 2417 if (ret == cnt)
2418 filp->f_pos += cnt;
2391 2419
2392 return cnt; 2420 return ret;
2393} 2421}
2394 2422
2395static ssize_t 2423static ssize_t
@@ -2504,20 +2532,12 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,
2504 size_t cnt, loff_t *ppos) 2532 size_t cnt, loff_t *ppos)
2505{ 2533{
2506 struct trace_iterator *iter = filp->private_data; 2534 struct trace_iterator *iter = filp->private_data;
2507 struct trace_array_cpu *data;
2508 static cpumask_t mask;
2509 unsigned long flags;
2510#ifdef CONFIG_FTRACE
2511 int ftrace_save;
2512#endif
2513 int cpu;
2514 ssize_t sret; 2535 ssize_t sret;
2515 2536
2516 /* return any leftover data */ 2537 /* return any leftover data */
2517 sret = trace_seq_to_user(&iter->seq, ubuf, cnt); 2538 sret = trace_seq_to_user(&iter->seq, ubuf, cnt);
2518 if (sret != -EBUSY) 2539 if (sret != -EBUSY)
2519 return sret; 2540 return sret;
2520 sret = 0;
2521 2541
2522 trace_seq_reset(&iter->seq); 2542 trace_seq_reset(&iter->seq);
2523 2543
@@ -2528,6 +2548,8 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,
2528 goto out; 2548 goto out;
2529 } 2549 }
2530 2550
2551waitagain:
2552 sret = 0;
2531 while (trace_empty(iter)) { 2553 while (trace_empty(iter)) {
2532 2554
2533 if ((filp->f_flags & O_NONBLOCK)) { 2555 if ((filp->f_flags & O_NONBLOCK)) {
@@ -2592,46 +2614,12 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,
2592 offsetof(struct trace_iterator, seq)); 2614 offsetof(struct trace_iterator, seq));
2593 iter->pos = -1; 2615 iter->pos = -1;
2594 2616
2595 /*
2596 * We need to stop all tracing on all CPUS to read the
2597 * the next buffer. This is a bit expensive, but is
2598 * not done often. We fill all what we can read,
2599 * and then release the locks again.
2600 */
2601
2602 cpus_clear(mask);
2603 local_irq_save(flags);
2604#ifdef CONFIG_FTRACE
2605 ftrace_save = ftrace_enabled;
2606 ftrace_enabled = 0;
2607#endif
2608 smp_wmb();
2609 for_each_tracing_cpu(cpu) {
2610 data = iter->tr->data[cpu];
2611
2612 if (!head_page(data) || !data->trace_idx)
2613 continue;
2614
2615 atomic_inc(&data->disabled);
2616 cpu_set(cpu, mask);
2617 }
2618
2619 for_each_cpu_mask(cpu, mask) {
2620 data = iter->tr->data[cpu];
2621 __raw_spin_lock(&data->lock);
2622
2623 if (data->overrun > iter->last_overrun[cpu])
2624 iter->overrun[cpu] +=
2625 data->overrun - iter->last_overrun[cpu];
2626 iter->last_overrun[cpu] = data->overrun;
2627 }
2628
2629 while (find_next_entry_inc(iter) != NULL) { 2617 while (find_next_entry_inc(iter) != NULL) {
2630 int ret; 2618 enum print_line_t ret;
2631 int len = iter->seq.len; 2619 int len = iter->seq.len;
2632 2620
2633 ret = print_trace_line(iter); 2621 ret = print_trace_line(iter);
2634 if (!ret) { 2622 if (ret == TRACE_TYPE_PARTIAL_LINE) {
2635 /* don't print partial lines */ 2623 /* don't print partial lines */
2636 iter->seq.len = len; 2624 iter->seq.len = len;
2637 break; 2625 break;
@@ -2643,26 +2631,17 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,
2643 break; 2631 break;
2644 } 2632 }
2645 2633
2646 for_each_cpu_mask(cpu, mask) {
2647 data = iter->tr->data[cpu];
2648 __raw_spin_unlock(&data->lock);
2649 }
2650
2651 for_each_cpu_mask(cpu, mask) {
2652 data = iter->tr->data[cpu];
2653 atomic_dec(&data->disabled);
2654 }
2655#ifdef CONFIG_FTRACE
2656 ftrace_enabled = ftrace_save;
2657#endif
2658 local_irq_restore(flags);
2659
2660 /* Now copy what we have to the user */ 2634 /* Now copy what we have to the user */
2661 sret = trace_seq_to_user(&iter->seq, ubuf, cnt); 2635 sret = trace_seq_to_user(&iter->seq, ubuf, cnt);
2662 if (iter->seq.readpos >= iter->seq.len) 2636 if (iter->seq.readpos >= iter->seq.len)
2663 trace_seq_reset(&iter->seq); 2637 trace_seq_reset(&iter->seq);
2638
2639 /*
 2640 * If there was nothing to send to user, in spite of consuming trace
2641 * entries, go back to wait for more entries.
2642 */
2664 if (sret == -EBUSY) 2643 if (sret == -EBUSY)
2665 sret = 0; 2644 goto waitagain;
2666 2645
2667out: 2646out:
2668 mutex_unlock(&trace_types_lock); 2647 mutex_unlock(&trace_types_lock);
@@ -2688,7 +2667,8 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
2688{ 2667{
2689 unsigned long val; 2668 unsigned long val;
2690 char buf[64]; 2669 char buf[64];
2691 int i, ret; 2670 int ret;
2671 struct trace_array *tr = filp->private_data;
2692 2672
2693 if (cnt >= sizeof(buf)) 2673 if (cnt >= sizeof(buf))
2694 return -EINVAL; 2674 return -EINVAL;
@@ -2708,59 +2688,38 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
2708 2688
2709 mutex_lock(&trace_types_lock); 2689 mutex_lock(&trace_types_lock);
2710 2690
2711 if (current_trace != &no_tracer) { 2691 if (tr->ctrl) {
2712 cnt = -EBUSY; 2692 cnt = -EBUSY;
2713 pr_info("ftrace: set current_tracer to none" 2693 pr_info("ftrace: please disable tracing"
2714 " before modifying buffer size\n"); 2694 " before modifying buffer size\n");
2715 goto out; 2695 goto out;
2716 } 2696 }
2717 2697
2718 if (val > global_trace.entries) { 2698 if (val != global_trace.entries) {
2719 long pages_requested; 2699 ret = ring_buffer_resize(global_trace.buffer, val);
2720 unsigned long freeable_pages; 2700 if (ret < 0) {
2721 2701 cnt = ret;
2722 /* make sure we have enough memory before mapping */
2723 pages_requested =
2724 (val + (ENTRIES_PER_PAGE-1)) / ENTRIES_PER_PAGE;
2725
2726 /* account for each buffer (and max_tr) */
2727 pages_requested *= tracing_nr_buffers * 2;
2728
2729 /* Check for overflow */
2730 if (pages_requested < 0) {
2731 cnt = -ENOMEM;
2732 goto out;
2733 }
2734
2735 freeable_pages = determine_dirtyable_memory();
2736
2737 /* we only allow to request 1/4 of useable memory */
2738 if (pages_requested >
2739 ((freeable_pages + tracing_pages_allocated) / 4)) {
2740 cnt = -ENOMEM;
2741 goto out; 2702 goto out;
2742 } 2703 }
2743 2704
2744 while (global_trace.entries < val) { 2705 ret = ring_buffer_resize(max_tr.buffer, val);
2745 if (trace_alloc_page()) { 2706 if (ret < 0) {
2746 cnt = -ENOMEM; 2707 int r;
2747 goto out; 2708 cnt = ret;
2709 r = ring_buffer_resize(global_trace.buffer,
2710 global_trace.entries);
2711 if (r < 0) {
2712 /* AARGH! We are left with different
2713 * size max buffer!!!! */
2714 WARN_ON(1);
2715 tracing_disabled = 1;
2748 } 2716 }
2749 /* double check that we don't go over the known pages */ 2717 goto out;
2750 if (tracing_pages_allocated > pages_requested)
2751 break;
2752 } 2718 }
2753 2719
2754 } else { 2720 global_trace.entries = val;
2755 /* include the number of entries in val (inc of page entries) */
2756 while (global_trace.entries > val + (ENTRIES_PER_PAGE - 1))
2757 trace_free_page();
2758 } 2721 }
2759 2722
2760 /* check integrity */
2761 for_each_tracing_cpu(i)
2762 check_pages(global_trace.data[i]);
2763
2764 filp->f_pos += cnt; 2723 filp->f_pos += cnt;
2765 2724
2766 /* If check pages failed, return ENOMEM */ 2725 /* If check pages failed, return ENOMEM */
@@ -2773,6 +2732,52 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
2773 return cnt; 2732 return cnt;
2774} 2733}
2775 2734
2735static int mark_printk(const char *fmt, ...)
2736{
2737 int ret;
2738 va_list args;
2739 va_start(args, fmt);
2740 ret = trace_vprintk(0, fmt, args);
2741 va_end(args);
2742 return ret;
2743}
2744
2745static ssize_t
2746tracing_mark_write(struct file *filp, const char __user *ubuf,
2747 size_t cnt, loff_t *fpos)
2748{
2749 char *buf;
2750 char *end;
2751 struct trace_array *tr = &global_trace;
2752
2753 if (!tr->ctrl || tracing_disabled)
2754 return -EINVAL;
2755
2756 if (cnt > TRACE_BUF_SIZE)
2757 cnt = TRACE_BUF_SIZE;
2758
2759 buf = kmalloc(cnt + 1, GFP_KERNEL);
2760 if (buf == NULL)
2761 return -ENOMEM;
2762
2763 if (copy_from_user(buf, ubuf, cnt)) {
2764 kfree(buf);
2765 return -EFAULT;
2766 }
2767
2768 /* Cut from the first nil or newline. */
2769 buf[cnt] = '\0';
2770 end = strchr(buf, '\n');
2771 if (end)
2772 *end = '\0';
2773
2774 cnt = mark_printk("%s\n", buf);
2775 kfree(buf);
2776 *fpos += cnt;
2777
2778 return cnt;
2779}
2780
2776static struct file_operations tracing_max_lat_fops = { 2781static struct file_operations tracing_max_lat_fops = {
2777 .open = tracing_open_generic, 2782 .open = tracing_open_generic,
2778 .read = tracing_max_lat_read, 2783 .read = tracing_max_lat_read,
@@ -2804,6 +2809,11 @@ static struct file_operations tracing_entries_fops = {
2804 .write = tracing_entries_write, 2809 .write = tracing_entries_write,
2805}; 2810};
2806 2811
2812static struct file_operations tracing_mark_fops = {
2813 .open = tracing_open_generic,
2814 .write = tracing_mark_write,
2815};
2816
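
[Editor's note] tracing_mark_write() and tracing_mark_fops above back the new trace_marker file that tracer_init_debugfs() registers further down. A minimal userspace sketch of driving it follows; the debugfs mount point is an assumption, and tracing must be enabled (tr->ctrl set), otherwise tracing_mark_write() returns -EINVAL:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
        /* assumed debugfs mount point; adjust if mounted elsewhere */
        const char *path = "/sys/kernel/debug/tracing/trace_marker";
        const char *msg = "hello from userspace\n";
        int fd = open(path, O_WRONLY);

        if (fd < 0) {
                perror("open trace_marker");
                return 1;
        }
        /* the kernel clamps writes to TRACE_BUF_SIZE (1024) and cuts the
         * message at the first newline before logging it via mark_printk() */
        if (write(fd, msg, strlen(msg)) < 0)
                perror("write trace_marker");
        close(fd);
        return 0;
}

The marker then appears in the trace output as a TRACE_PRINT entry, formatted by the print_entry branches added earlier in this file.
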
2807#ifdef CONFIG_DYNAMIC_FTRACE 2817#ifdef CONFIG_DYNAMIC_FTRACE
2808 2818
2809static ssize_t 2819static ssize_t
@@ -2850,7 +2860,7 @@ struct dentry *tracing_init_dentry(void)
2850#include "trace_selftest.c" 2860#include "trace_selftest.c"
2851#endif 2861#endif
2852 2862
2853static __init void tracer_init_debugfs(void) 2863static __init int tracer_init_debugfs(void)
2854{ 2864{
2855 struct dentry *d_tracer; 2865 struct dentry *d_tracer;
2856 struct dentry *entry; 2866 struct dentry *entry;
@@ -2885,12 +2895,12 @@ static __init void tracer_init_debugfs(void)
2885 entry = debugfs_create_file("available_tracers", 0444, d_tracer, 2895 entry = debugfs_create_file("available_tracers", 0444, d_tracer,
2886 &global_trace, &show_traces_fops); 2896 &global_trace, &show_traces_fops);
2887 if (!entry) 2897 if (!entry)
2888 pr_warning("Could not create debugfs 'trace' entry\n"); 2898 pr_warning("Could not create debugfs 'available_tracers' entry\n");
2889 2899
2890 entry = debugfs_create_file("current_tracer", 0444, d_tracer, 2900 entry = debugfs_create_file("current_tracer", 0444, d_tracer,
2891 &global_trace, &set_tracer_fops); 2901 &global_trace, &set_tracer_fops);
2892 if (!entry) 2902 if (!entry)
2893 pr_warning("Could not create debugfs 'trace' entry\n"); 2903 pr_warning("Could not create debugfs 'current_tracer' entry\n");
2894 2904
2895 entry = debugfs_create_file("tracing_max_latency", 0644, d_tracer, 2905 entry = debugfs_create_file("tracing_max_latency", 0644, d_tracer,
2896 &tracing_max_latency, 2906 &tracing_max_latency,
@@ -2903,7 +2913,7 @@ static __init void tracer_init_debugfs(void)
2903 &tracing_thresh, &tracing_max_lat_fops); 2913 &tracing_thresh, &tracing_max_lat_fops);
2904 if (!entry) 2914 if (!entry)
2905 pr_warning("Could not create debugfs " 2915 pr_warning("Could not create debugfs "
2906 "'tracing_threash' entry\n"); 2916 "'tracing_thresh' entry\n");
2907 entry = debugfs_create_file("README", 0644, d_tracer, 2917 entry = debugfs_create_file("README", 0644, d_tracer,
2908 NULL, &tracing_readme_fops); 2918 NULL, &tracing_readme_fops);
2909 if (!entry) 2919 if (!entry)
@@ -2913,13 +2923,19 @@ static __init void tracer_init_debugfs(void)
2913 NULL, &tracing_pipe_fops); 2923 NULL, &tracing_pipe_fops);
2914 if (!entry) 2924 if (!entry)
2915 pr_warning("Could not create debugfs " 2925 pr_warning("Could not create debugfs "
2916 "'tracing_threash' entry\n"); 2926 "'trace_pipe' entry\n");
2917 2927
2918 entry = debugfs_create_file("trace_entries", 0644, d_tracer, 2928 entry = debugfs_create_file("trace_entries", 0644, d_tracer,
2919 &global_trace, &tracing_entries_fops); 2929 &global_trace, &tracing_entries_fops);
2920 if (!entry) 2930 if (!entry)
2921 pr_warning("Could not create debugfs " 2931 pr_warning("Could not create debugfs "
2922 "'tracing_threash' entry\n"); 2932 "'trace_entries' entry\n");
2933
2934 entry = debugfs_create_file("trace_marker", 0220, d_tracer,
2935 NULL, &tracing_mark_fops);
2936 if (!entry)
2937 pr_warning("Could not create debugfs "
2938 "'trace_marker' entry\n");
2923 2939
2924#ifdef CONFIG_DYNAMIC_FTRACE 2940#ifdef CONFIG_DYNAMIC_FTRACE
2925 entry = debugfs_create_file("dyn_ftrace_total_info", 0444, d_tracer, 2941 entry = debugfs_create_file("dyn_ftrace_total_info", 0444, d_tracer,
@@ -2932,230 +2948,263 @@ static __init void tracer_init_debugfs(void)
2932#ifdef CONFIG_SYSPROF_TRACER 2948#ifdef CONFIG_SYSPROF_TRACER
2933 init_tracer_sysprof_debugfs(d_tracer); 2949 init_tracer_sysprof_debugfs(d_tracer);
2934#endif 2950#endif
2951 return 0;
2935} 2952}
2936 2953
2937static int trace_alloc_page(void) 2954int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
2938{ 2955{
2956 static DEFINE_SPINLOCK(trace_buf_lock);
2957 static char trace_buf[TRACE_BUF_SIZE];
2958
2959 struct ring_buffer_event *event;
2960 struct trace_array *tr = &global_trace;
2939 struct trace_array_cpu *data; 2961 struct trace_array_cpu *data;
2940 struct page *page, *tmp; 2962 struct print_entry *entry;
2941 LIST_HEAD(pages); 2963 unsigned long flags, irq_flags;
2942 void *array; 2964 int cpu, len = 0, size, pc;
2943 unsigned pages_allocated = 0;
2944 int i;
2945 2965
2946 /* first allocate a page for each CPU */ 2966 if (!tr->ctrl || tracing_disabled)
2947 for_each_tracing_cpu(i) { 2967 return 0;
2948 array = (void *)__get_free_page(GFP_KERNEL);
2949 if (array == NULL) {
2950 printk(KERN_ERR "tracer: failed to allocate page"
2951 "for trace buffer!\n");
2952 goto free_pages;
2953 }
2954 2968
2955 pages_allocated++; 2969 pc = preempt_count();
2956 page = virt_to_page(array); 2970 preempt_disable_notrace();
2957 list_add(&page->lru, &pages); 2971 cpu = raw_smp_processor_id();
2972 data = tr->data[cpu];
2958 2973
2959/* Only allocate if we are actually using the max trace */ 2974 if (unlikely(atomic_read(&data->disabled)))
2960#ifdef CONFIG_TRACER_MAX_TRACE 2975 goto out;
2961 array = (void *)__get_free_page(GFP_KERNEL);
2962 if (array == NULL) {
2963 printk(KERN_ERR "tracer: failed to allocate page"
2964 "for trace buffer!\n");
2965 goto free_pages;
2966 }
2967 pages_allocated++;
2968 page = virt_to_page(array);
2969 list_add(&page->lru, &pages);
2970#endif
2971 }
2972 2976
2973 /* Now that we successfully allocate a page per CPU, add them */ 2977 spin_lock_irqsave(&trace_buf_lock, flags);
2974 for_each_tracing_cpu(i) { 2978 len = vsnprintf(trace_buf, TRACE_BUF_SIZE, fmt, args);
2975 data = global_trace.data[i];
2976 page = list_entry(pages.next, struct page, lru);
2977 list_del_init(&page->lru);
2978 list_add_tail(&page->lru, &data->trace_pages);
2979 ClearPageLRU(page);
2980 2979
2981#ifdef CONFIG_TRACER_MAX_TRACE 2980 len = min(len, TRACE_BUF_SIZE-1);
2982 data = max_tr.data[i]; 2981 trace_buf[len] = 0;
2983 page = list_entry(pages.next, struct page, lru);
2984 list_del_init(&page->lru);
2985 list_add_tail(&page->lru, &data->trace_pages);
2986 SetPageLRU(page);
2987#endif
2988 }
2989 tracing_pages_allocated += pages_allocated;
2990 global_trace.entries += ENTRIES_PER_PAGE;
2991 2982
2992 return 0; 2983 size = sizeof(*entry) + len + 1;
2984 event = ring_buffer_lock_reserve(tr->buffer, size, &irq_flags);
2985 if (!event)
2986 goto out_unlock;
2987 entry = ring_buffer_event_data(event);
2988 tracing_generic_entry_update(&entry->ent, flags, pc);
2989 entry->ent.type = TRACE_PRINT;
2990 entry->ip = ip;
2993 2991
2994 free_pages: 2992 memcpy(&entry->buf, trace_buf, len);
2995 list_for_each_entry_safe(page, tmp, &pages, lru) { 2993 entry->buf[len] = 0;
2996 list_del_init(&page->lru); 2994 ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
2997 __free_page(page); 2995
2998 } 2996 out_unlock:
2999 return -ENOMEM; 2997 spin_unlock_irqrestore(&trace_buf_lock, flags);
2998
2999 out:
3000 preempt_enable_notrace();
3001
3002 return len;
3000} 3003}
3004EXPORT_SYMBOL_GPL(trace_vprintk);
3001 3005
3002static int trace_free_page(void) 3006int __ftrace_printk(unsigned long ip, const char *fmt, ...)
3003{ 3007{
3004 struct trace_array_cpu *data; 3008 int ret;
3005 struct page *page; 3009 va_list ap;
3006 struct list_head *p;
3007 int i;
3008 int ret = 0;
3009 3010
3010 /* free one page from each buffer */ 3011 if (!(trace_flags & TRACE_ITER_PRINTK))
3011 for_each_tracing_cpu(i) { 3012 return 0;
3012 data = global_trace.data[i];
3013 p = data->trace_pages.next;
3014 if (p == &data->trace_pages) {
3015 /* should never happen */
3016 WARN_ON(1);
3017 tracing_disabled = 1;
3018 ret = -1;
3019 break;
3020 }
3021 page = list_entry(p, struct page, lru);
3022 ClearPageLRU(page);
3023 list_del(&page->lru);
3024 tracing_pages_allocated--;
3025 tracing_pages_allocated--;
3026 __free_page(page);
3027 3013
3028 tracing_reset(data); 3014 va_start(ap, fmt);
3015 ret = trace_vprintk(ip, fmt, ap);
3016 va_end(ap);
3017 return ret;
3018}
3019EXPORT_SYMBOL_GPL(__ftrace_printk);
3029 3020
3030#ifdef CONFIG_TRACER_MAX_TRACE 3021static int trace_panic_handler(struct notifier_block *this,
3031 data = max_tr.data[i]; 3022 unsigned long event, void *unused)
3032 p = data->trace_pages.next; 3023{
3033 if (p == &data->trace_pages) { 3024 ftrace_dump();
3034 /* should never happen */ 3025 return NOTIFY_OK;
3035 WARN_ON(1); 3026}
3036 tracing_disabled = 1;
3037 ret = -1;
3038 break;
3039 }
3040 page = list_entry(p, struct page, lru);
3041 ClearPageLRU(page);
3042 list_del(&page->lru);
3043 __free_page(page);
3044 3027
3045 tracing_reset(data); 3028static struct notifier_block trace_panic_notifier = {
3046#endif 3029 .notifier_call = trace_panic_handler,
3047 } 3030 .next = NULL,
3048 global_trace.entries -= ENTRIES_PER_PAGE; 3031 .priority = 150 /* priority: INT_MAX >= x >= 0 */
3032};
3049 3033
3050 return ret; 3034static int trace_die_handler(struct notifier_block *self,
3035 unsigned long val,
3036 void *data)
3037{
3038 switch (val) {
3039 case DIE_OOPS:
3040 ftrace_dump();
3041 break;
3042 default:
3043 break;
3044 }
3045 return NOTIFY_OK;
3051} 3046}
3052 3047
3053__init static int tracer_alloc_buffers(void) 3048static struct notifier_block trace_die_notifier = {
3049 .notifier_call = trace_die_handler,
3050 .priority = 200
3051};
3052
3053/*
 3054 * printk is set to a max of 1024; we really don't need it that big.
3055 * Nothing should be printing 1000 characters anyway.
3056 */
3057#define TRACE_MAX_PRINT 1000
3058
3059/*
3060 * Define here KERN_TRACE so that we have one place to modify
3061 * it if we decide to change what log level the ftrace dump
3062 * should be at.
3063 */
3064#define KERN_TRACE KERN_INFO
3065
3066static void
3067trace_printk_seq(struct trace_seq *s)
3054{ 3068{
3055 struct trace_array_cpu *data; 3069 /* Probably should print a warning here. */
3056 void *array; 3070 if (s->len >= 1000)
3057 struct page *page; 3071 s->len = 1000;
3058 int pages = 0;
3059 int ret = -ENOMEM;
3060 int i;
3061 3072
3062 /* TODO: make the number of buffers hot pluggable with CPUS */ 3073 /* should be zero ended, but we are paranoid. */
3063 tracing_nr_buffers = num_possible_cpus(); 3074 s->buffer[s->len] = 0;
3064 tracing_buffer_mask = cpu_possible_map;
3065 3075
3066 /* Allocate the first page for all buffers */ 3076 printk(KERN_TRACE "%s", s->buffer);
3067 for_each_tracing_cpu(i) {
3068 data = global_trace.data[i] = &per_cpu(global_trace_cpu, i);
3069 max_tr.data[i] = &per_cpu(max_data, i);
3070 3077
3071 array = (void *)__get_free_page(GFP_KERNEL); 3078 trace_seq_reset(s);
3072 if (array == NULL) { 3079}
3073 printk(KERN_ERR "tracer: failed to allocate page" 3080
3074 "for trace buffer!\n"); 3081
3075 goto free_buffers; 3082void ftrace_dump(void)
3076 } 3083{
3084 static DEFINE_SPINLOCK(ftrace_dump_lock);
3085 /* use static because iter can be a bit big for the stack */
3086 static struct trace_iterator iter;
3087 static cpumask_t mask;
3088 static int dump_ran;
3089 unsigned long flags;
3090 int cnt = 0, cpu;
3077 3091
3078 /* set the array to the list */ 3092 /* only one dump */
3079 INIT_LIST_HEAD(&data->trace_pages); 3093 spin_lock_irqsave(&ftrace_dump_lock, flags);
3080 page = virt_to_page(array); 3094 if (dump_ran)
3081 list_add(&page->lru, &data->trace_pages); 3095 goto out;
3082 /* use the LRU flag to differentiate the two buffers */
3083 ClearPageLRU(page);
3084 3096
3085 data->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; 3097 dump_ran = 1;
3086 max_tr.data[i]->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
3087 3098
3088/* Only allocate if we are actually using the max trace */ 3099 /* No turning back! */
3089#ifdef CONFIG_TRACER_MAX_TRACE 3100 ftrace_kill_atomic();
3090 array = (void *)__get_free_page(GFP_KERNEL);
3091 if (array == NULL) {
3092 printk(KERN_ERR "tracer: failed to allocate page"
3093 "for trace buffer!\n");
3094 goto free_buffers;
3095 }
3096 3101
3097 INIT_LIST_HEAD(&max_tr.data[i]->trace_pages); 3102 for_each_tracing_cpu(cpu) {
3098 page = virt_to_page(array); 3103 atomic_inc(&global_trace.data[cpu]->disabled);
3099 list_add(&page->lru, &max_tr.data[i]->trace_pages);
3100 SetPageLRU(page);
3101#endif
3102 } 3104 }
3103 3105
3106 printk(KERN_TRACE "Dumping ftrace buffer:\n");
3107
3108 iter.tr = &global_trace;
3109 iter.trace = current_trace;
3110
3104 /* 3111 /*
 3105 * Since we allocate by orders of pages, we may be able to 3112 * We need to stop all tracing on all CPUS to read
3106 * round up a bit. 3113 * the next buffer. This is a bit expensive, but is
 3114 * not done often. We fill all that we can read,
3115 * and then release the locks again.
3107 */ 3116 */
3108 global_trace.entries = ENTRIES_PER_PAGE;
3109 pages++;
3110 3117
3111 while (global_trace.entries < trace_nr_entries) { 3118 cpus_clear(mask);
3112 if (trace_alloc_page()) 3119
3113 break; 3120 while (!trace_empty(&iter)) {
3114 pages++; 3121
3122 if (!cnt)
3123 printk(KERN_TRACE "---------------------------------\n");
3124
3125 cnt++;
3126
3127 /* reset all but tr, trace, and overruns */
3128 memset(&iter.seq, 0,
3129 sizeof(struct trace_iterator) -
3130 offsetof(struct trace_iterator, seq));
3131 iter.iter_flags |= TRACE_FILE_LAT_FMT;
3132 iter.pos = -1;
3133
3134 if (find_next_entry_inc(&iter) != NULL) {
3135 print_trace_line(&iter);
3136 trace_consume(&iter);
3137 }
3138
3139 trace_printk_seq(&iter.seq);
3115 } 3140 }
3116 max_tr.entries = global_trace.entries;
3117 3141
3118 pr_info("tracer: %d pages allocated for %ld entries of %ld bytes\n", 3142 if (!cnt)
3119 pages, trace_nr_entries, (long)TRACE_ENTRY_SIZE); 3143 printk(KERN_TRACE " (ftrace buffer empty)\n");
3120 pr_info(" actual entries %ld\n", global_trace.entries); 3144 else
3145 printk(KERN_TRACE "---------------------------------\n");
3146
3147 out:
3148 spin_unlock_irqrestore(&ftrace_dump_lock, flags);
3149}
3150
3151__init static int tracer_alloc_buffers(void)
3152{
3153 struct trace_array_cpu *data;
3154 int i;
3155
3156 /* TODO: make the number of buffers hot pluggable with CPUS */
3157 tracing_buffer_mask = cpu_possible_map;
3158
3159 global_trace.buffer = ring_buffer_alloc(trace_buf_size,
3160 TRACE_BUFFER_FLAGS);
3161 if (!global_trace.buffer) {
3162 printk(KERN_ERR "tracer: failed to allocate ring buffer!\n");
3163 WARN_ON(1);
3164 return 0;
3165 }
3166 global_trace.entries = ring_buffer_size(global_trace.buffer);
3121 3167
3122 tracer_init_debugfs(); 3168#ifdef CONFIG_TRACER_MAX_TRACE
3169 max_tr.buffer = ring_buffer_alloc(trace_buf_size,
3170 TRACE_BUFFER_FLAGS);
3171 if (!max_tr.buffer) {
3172 printk(KERN_ERR "tracer: failed to allocate max ring buffer!\n");
3173 WARN_ON(1);
3174 ring_buffer_free(global_trace.buffer);
3175 return 0;
3176 }
3177 max_tr.entries = ring_buffer_size(max_tr.buffer);
3178 WARN_ON(max_tr.entries != global_trace.entries);
3179#endif
3180
3181 /* Allocate the first page for all buffers */
3182 for_each_tracing_cpu(i) {
3183 data = global_trace.data[i] = &per_cpu(global_trace_cpu, i);
3184 max_tr.data[i] = &per_cpu(max_data, i);
3185 }
3123 3186
3124 trace_init_cmdlines(); 3187 trace_init_cmdlines();
3125 3188
3126 register_tracer(&no_tracer); 3189 register_tracer(&nop_trace);
3127 current_trace = &no_tracer; 3190#ifdef CONFIG_BOOT_TRACER
3191 register_tracer(&boot_tracer);
3192 current_trace = &boot_tracer;
3193 current_trace->init(&global_trace);
3194#else
3195 current_trace = &nop_trace;
3196#endif
3128 3197
3129 /* All seems OK, enable tracing */ 3198 /* All seems OK, enable tracing */
3130 global_trace.ctrl = tracer_enabled; 3199 global_trace.ctrl = tracer_enabled;
3131 tracing_disabled = 0; 3200 tracing_disabled = 0;
3132 3201
3133 return 0; 3202 atomic_notifier_chain_register(&panic_notifier_list,
3203 &trace_panic_notifier);
3134 3204
3135 free_buffers: 3205 register_die_notifier(&trace_die_notifier);
3136 for (i-- ; i >= 0; i--) {
3137 struct page *page, *tmp;
3138 struct trace_array_cpu *data = global_trace.data[i];
3139 3206
3140 if (data) { 3207 return 0;
3141 list_for_each_entry_safe(page, tmp,
3142 &data->trace_pages, lru) {
3143 list_del_init(&page->lru);
3144 __free_page(page);
3145 }
3146 }
3147
3148#ifdef CONFIG_TRACER_MAX_TRACE
3149 data = max_tr.data[i];
3150 if (data) {
3151 list_for_each_entry_safe(page, tmp,
3152 &data->trace_pages, lru) {
3153 list_del_init(&page->lru);
3154 __free_page(page);
3155 }
3156 }
3157#endif
3158 }
3159 return ret;
3160} 3208}
3161fs_initcall(tracer_alloc_buffers); 3209early_initcall(tracer_alloc_buffers);
3210fs_initcall(tracer_init_debugfs);
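
[Editor's note] trace_vprintk()/__ftrace_printk() added above give kernel code a printf-style hook into the ring buffer, gated by the new TRACE_ITER_PRINTK flag. A rough in-kernel sketch of a caller; the extern declaration is included only to keep the fragment self-contained and mirrors the definition above (in the tree it would come from the ftrace headers, which is an assumption here):

#include <linux/kernel.h>

/* mirrors the definition in kernel/trace/trace.c above */
extern int __ftrace_printk(unsigned long ip, const char *fmt, ...);

static void note_slow_path(int cpu, unsigned long long delta_ns)
{
        /*
         * ip == 0 matches what mark_printk() passes; a real caller would
         * more likely pass its own instruction pointer so the resulting
         * TRACE_PRINT entry can be resolved by seq_print_ip_sym().
         */
        __ftrace_printk(0, "slow path on cpu %d: %llu ns\n", cpu, delta_ns);
}
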
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index f69f86788c2b..f1f99572cde7 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -5,7 +5,9 @@
5#include <asm/atomic.h> 5#include <asm/atomic.h>
6#include <linux/sched.h> 6#include <linux/sched.h>
7#include <linux/clocksource.h> 7#include <linux/clocksource.h>
8#include <linux/ring_buffer.h>
8#include <linux/mmiotrace.h> 9#include <linux/mmiotrace.h>
10#include <linux/ftrace.h>
9 11
10enum trace_type { 12enum trace_type {
11 __TRACE_FIRST_TYPE = 0, 13 __TRACE_FIRST_TYPE = 0,
@@ -13,38 +15,60 @@ enum trace_type {
13 TRACE_FN, 15 TRACE_FN,
14 TRACE_CTX, 16 TRACE_CTX,
15 TRACE_WAKE, 17 TRACE_WAKE,
18 TRACE_CONT,
16 TRACE_STACK, 19 TRACE_STACK,
20 TRACE_PRINT,
17 TRACE_SPECIAL, 21 TRACE_SPECIAL,
18 TRACE_MMIO_RW, 22 TRACE_MMIO_RW,
19 TRACE_MMIO_MAP, 23 TRACE_MMIO_MAP,
24 TRACE_BOOT,
20 25
21 __TRACE_LAST_TYPE 26 __TRACE_LAST_TYPE
22}; 27};
23 28
24/* 29/*
30 * The trace entry - the most basic unit of tracing. This is what
31 * is printed in the end as a single line in the trace output, such as:
32 *
33 * bash-15816 [01] 235.197585: idle_cpu <- irq_enter
34 */
35struct trace_entry {
36 unsigned char type;
37 unsigned char cpu;
38 unsigned char flags;
39 unsigned char preempt_count;
40 int pid;
41};
42
43/*
25 * Function trace entry - function address and parent function addres: 44 * Function trace entry - function address and parent function addres:
26 */ 45 */
27struct ftrace_entry { 46struct ftrace_entry {
47 struct trace_entry ent;
28 unsigned long ip; 48 unsigned long ip;
29 unsigned long parent_ip; 49 unsigned long parent_ip;
30}; 50};
51extern struct tracer boot_tracer;
31 52
32/* 53/*
33 * Context switch trace entry - which task (and prio) we switched from/to: 54 * Context switch trace entry - which task (and prio) we switched from/to:
34 */ 55 */
35struct ctx_switch_entry { 56struct ctx_switch_entry {
57 struct trace_entry ent;
36 unsigned int prev_pid; 58 unsigned int prev_pid;
37 unsigned char prev_prio; 59 unsigned char prev_prio;
38 unsigned char prev_state; 60 unsigned char prev_state;
39 unsigned int next_pid; 61 unsigned int next_pid;
40 unsigned char next_prio; 62 unsigned char next_prio;
41 unsigned char next_state; 63 unsigned char next_state;
64 unsigned int next_cpu;
42}; 65};
43 66
44/* 67/*
45 * Special (free-form) trace entry: 68 * Special (free-form) trace entry:
46 */ 69 */
47struct special_entry { 70struct special_entry {
71 struct trace_entry ent;
48 unsigned long arg1; 72 unsigned long arg1;
49 unsigned long arg2; 73 unsigned long arg2;
50 unsigned long arg3; 74 unsigned long arg3;
@@ -57,33 +81,60 @@ struct special_entry {
57#define FTRACE_STACK_ENTRIES 8 81#define FTRACE_STACK_ENTRIES 8
58 82
59struct stack_entry { 83struct stack_entry {
84 struct trace_entry ent;
60 unsigned long caller[FTRACE_STACK_ENTRIES]; 85 unsigned long caller[FTRACE_STACK_ENTRIES];
61}; 86};
62 87
63/* 88/*
64 * The trace entry - the most basic unit of tracing. This is what 89 * ftrace_printk entry:
65 * is printed in the end as a single line in the trace output, such as:
66 *
67 * bash-15816 [01] 235.197585: idle_cpu <- irq_enter
68 */ 90 */
69struct trace_entry { 91struct print_entry {
70 char type; 92 struct trace_entry ent;
71 char cpu; 93 unsigned long ip;
72 char flags; 94 char buf[];
73 char preempt_count; 95};
74 int pid; 96
75 cycle_t t; 97#define TRACE_OLD_SIZE 88
76 union { 98
77 struct ftrace_entry fn; 99struct trace_field_cont {
78 struct ctx_switch_entry ctx; 100 unsigned char type;
79 struct special_entry special; 101 /* Temporary till we get rid of this completely */
80 struct stack_entry stack; 102 char buf[TRACE_OLD_SIZE - 1];
81 struct mmiotrace_rw mmiorw; 103};
82 struct mmiotrace_map mmiomap; 104
83 }; 105struct trace_mmiotrace_rw {
106 struct trace_entry ent;
107 struct mmiotrace_rw rw;
84}; 108};
85 109
86#define TRACE_ENTRY_SIZE sizeof(struct trace_entry) 110struct trace_mmiotrace_map {
111 struct trace_entry ent;
112 struct mmiotrace_map map;
113};
114
115struct trace_boot {
116 struct trace_entry ent;
117 struct boot_trace initcall;
118};
119
120/*
121 * trace_flag_type is an enumeration that holds different
122 * states when a trace occurs. These are:
123 * IRQS_OFF - interrupts were disabled
 124 * NEED_RESCHED - reschedule is requested
125 * HARDIRQ - inside an interrupt handler
126 * SOFTIRQ - inside a softirq handler
127 * CONT - multiple entries hold the trace item
128 */
129enum trace_flag_type {
130 TRACE_FLAG_IRQS_OFF = 0x01,
131 TRACE_FLAG_NEED_RESCHED = 0x02,
132 TRACE_FLAG_HARDIRQ = 0x04,
133 TRACE_FLAG_SOFTIRQ = 0x08,
134 TRACE_FLAG_CONT = 0x10,
135};
136
137#define TRACE_BUF_SIZE 1024
87 138
88/* 139/*
89 * The CPU trace array - it consists of thousands of trace entries 140 * The CPU trace array - it consists of thousands of trace entries
@@ -91,16 +142,9 @@ struct trace_entry {
91 * the trace, etc.) 142 * the trace, etc.)
92 */ 143 */
93struct trace_array_cpu { 144struct trace_array_cpu {
94 struct list_head trace_pages;
95 atomic_t disabled; 145 atomic_t disabled;
96 raw_spinlock_t lock;
97 struct lock_class_key lock_key;
98 146
99 /* these fields get copied into max-trace: */ 147 /* these fields get copied into max-trace: */
100 unsigned trace_head_idx;
101 unsigned trace_tail_idx;
102 void *trace_head; /* producer */
103 void *trace_tail; /* consumer */
104 unsigned long trace_idx; 148 unsigned long trace_idx;
105 unsigned long overrun; 149 unsigned long overrun;
106 unsigned long saved_latency; 150 unsigned long saved_latency;
@@ -124,6 +168,7 @@ struct trace_iterator;
124 * They have on/off state as well: 168 * They have on/off state as well:
125 */ 169 */
126struct trace_array { 170struct trace_array {
171 struct ring_buffer *buffer;
127 unsigned long entries; 172 unsigned long entries;
128 long ctrl; 173 long ctrl;
129 int cpu; 174 int cpu;
@@ -132,6 +177,56 @@ struct trace_array {
132 struct trace_array_cpu *data[NR_CPUS]; 177 struct trace_array_cpu *data[NR_CPUS];
133}; 178};
134 179
180#define FTRACE_CMP_TYPE(var, type) \
181 __builtin_types_compatible_p(typeof(var), type *)
182
183#undef IF_ASSIGN
184#define IF_ASSIGN(var, entry, etype, id) \
185 if (FTRACE_CMP_TYPE(var, etype)) { \
186 var = (typeof(var))(entry); \
187 WARN_ON(id && (entry)->type != id); \
188 break; \
189 }
190
191/* Will cause compile errors if type is not found. */
192extern void __ftrace_bad_type(void);
193
194/*
195 * The trace_assign_type is a verifier that the entry type is
196 * the same as the type being assigned. To add new types simply
197 * add a line with the following format:
198 *
199 * IF_ASSIGN(var, ent, type, id);
200 *
201 * Where "type" is the trace type that includes the trace_entry
202 * as the "ent" item. And "id" is the trace identifier that is
203 * used in the trace_type enum.
204 *
205 * If the type can have more than one id, then use zero.
206 */
207#define trace_assign_type(var, ent) \
208 do { \
209 IF_ASSIGN(var, ent, struct ftrace_entry, TRACE_FN); \
210 IF_ASSIGN(var, ent, struct ctx_switch_entry, 0); \
211 IF_ASSIGN(var, ent, struct trace_field_cont, TRACE_CONT); \
212 IF_ASSIGN(var, ent, struct stack_entry, TRACE_STACK); \
213 IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT); \
214 IF_ASSIGN(var, ent, struct special_entry, 0); \
215 IF_ASSIGN(var, ent, struct trace_mmiotrace_rw, \
216 TRACE_MMIO_RW); \
217 IF_ASSIGN(var, ent, struct trace_mmiotrace_map, \
218 TRACE_MMIO_MAP); \
219 IF_ASSIGN(var, ent, struct trace_boot, TRACE_BOOT); \
220 __ftrace_bad_type(); \
221 } while (0)
222
223/* Return values for print_line callback */
224enum print_line_t {
225 TRACE_TYPE_PARTIAL_LINE = 0, /* Retry after flushing the seq */
226 TRACE_TYPE_HANDLED = 1,
227 TRACE_TYPE_UNHANDLED = 2 /* Relay to other output functions */
228};
229
135/* 230/*
136 * A specific tracer, represented by methods that operate on a trace array: 231 * A specific tracer, represented by methods that operate on a trace array:
137 */ 232 */
@@ -152,7 +247,7 @@ struct tracer {
152 int (*selftest)(struct tracer *trace, 247 int (*selftest)(struct tracer *trace,
153 struct trace_array *tr); 248 struct trace_array *tr);
154#endif 249#endif
155 int (*print_line)(struct trace_iterator *iter); 250 enum print_line_t (*print_line)(struct trace_iterator *iter);
156 struct tracer *next; 251 struct tracer *next;
157 int print_max; 252 int print_max;
158}; 253};
@@ -171,57 +266,58 @@ struct trace_iterator {
171 struct trace_array *tr; 266 struct trace_array *tr;
172 struct tracer *trace; 267 struct tracer *trace;
173 void *private; 268 void *private;
174 long last_overrun[NR_CPUS]; 269 struct ring_buffer_iter *buffer_iter[NR_CPUS];
175 long overrun[NR_CPUS];
176 270
177 /* The below is zeroed out in pipe_read */ 271 /* The below is zeroed out in pipe_read */
178 struct trace_seq seq; 272 struct trace_seq seq;
179 struct trace_entry *ent; 273 struct trace_entry *ent;
180 int cpu; 274 int cpu;
181 275 u64 ts;
182 struct trace_entry *prev_ent;
183 int prev_cpu;
184 276
185 unsigned long iter_flags; 277 unsigned long iter_flags;
186 loff_t pos; 278 loff_t pos;
187 unsigned long next_idx[NR_CPUS];
188 struct list_head *next_page[NR_CPUS];
189 unsigned next_page_idx[NR_CPUS];
190 long idx; 279 long idx;
191}; 280};
192 281
193void tracing_reset(struct trace_array_cpu *data); 282void trace_wake_up(void);
283void tracing_reset(struct trace_array *tr, int cpu);
194int tracing_open_generic(struct inode *inode, struct file *filp); 284int tracing_open_generic(struct inode *inode, struct file *filp);
195struct dentry *tracing_init_dentry(void); 285struct dentry *tracing_init_dentry(void);
196void init_tracer_sysprof_debugfs(struct dentry *d_tracer); 286void init_tracer_sysprof_debugfs(struct dentry *d_tracer);
197 287
288struct trace_entry *tracing_get_trace_entry(struct trace_array *tr,
289 struct trace_array_cpu *data);
290void tracing_generic_entry_update(struct trace_entry *entry,
291 unsigned long flags,
292 int pc);
293
198void ftrace(struct trace_array *tr, 294void ftrace(struct trace_array *tr,
199 struct trace_array_cpu *data, 295 struct trace_array_cpu *data,
200 unsigned long ip, 296 unsigned long ip,
201 unsigned long parent_ip, 297 unsigned long parent_ip,
202 unsigned long flags); 298 unsigned long flags, int pc);
203void tracing_sched_switch_trace(struct trace_array *tr, 299void tracing_sched_switch_trace(struct trace_array *tr,
204 struct trace_array_cpu *data, 300 struct trace_array_cpu *data,
205 struct task_struct *prev, 301 struct task_struct *prev,
206 struct task_struct *next, 302 struct task_struct *next,
207 unsigned long flags); 303 unsigned long flags, int pc);
208void tracing_record_cmdline(struct task_struct *tsk); 304void tracing_record_cmdline(struct task_struct *tsk);
209 305
210void tracing_sched_wakeup_trace(struct trace_array *tr, 306void tracing_sched_wakeup_trace(struct trace_array *tr,
211 struct trace_array_cpu *data, 307 struct trace_array_cpu *data,
212 struct task_struct *wakee, 308 struct task_struct *wakee,
213 struct task_struct *cur, 309 struct task_struct *cur,
214 unsigned long flags); 310 unsigned long flags, int pc);
215void trace_special(struct trace_array *tr, 311void trace_special(struct trace_array *tr,
216 struct trace_array_cpu *data, 312 struct trace_array_cpu *data,
217 unsigned long arg1, 313 unsigned long arg1,
218 unsigned long arg2, 314 unsigned long arg2,
219 unsigned long arg3); 315 unsigned long arg3, int pc);
220void trace_function(struct trace_array *tr, 316void trace_function(struct trace_array *tr,
221 struct trace_array_cpu *data, 317 struct trace_array_cpu *data,
222 unsigned long ip, 318 unsigned long ip,
223 unsigned long parent_ip, 319 unsigned long parent_ip,
224 unsigned long flags); 320 unsigned long flags, int pc);
225 321
226void tracing_start_cmdline_record(void); 322void tracing_start_cmdline_record(void);
227void tracing_stop_cmdline_record(void); 323void tracing_stop_cmdline_record(void);
@@ -268,51 +364,33 @@ extern unsigned long ftrace_update_tot_cnt;
268extern int DYN_FTRACE_TEST_NAME(void); 364extern int DYN_FTRACE_TEST_NAME(void);
269#endif 365#endif
270 366
271#ifdef CONFIG_MMIOTRACE
272extern void __trace_mmiotrace_rw(struct trace_array *tr,
273 struct trace_array_cpu *data,
274 struct mmiotrace_rw *rw);
275extern void __trace_mmiotrace_map(struct trace_array *tr,
276 struct trace_array_cpu *data,
277 struct mmiotrace_map *map);
278#endif
279
280#ifdef CONFIG_FTRACE_STARTUP_TEST 367#ifdef CONFIG_FTRACE_STARTUP_TEST
281#ifdef CONFIG_FTRACE
282extern int trace_selftest_startup_function(struct tracer *trace, 368extern int trace_selftest_startup_function(struct tracer *trace,
283 struct trace_array *tr); 369 struct trace_array *tr);
284#endif
285#ifdef CONFIG_IRQSOFF_TRACER
286extern int trace_selftest_startup_irqsoff(struct tracer *trace, 370extern int trace_selftest_startup_irqsoff(struct tracer *trace,
287 struct trace_array *tr); 371 struct trace_array *tr);
288#endif
289#ifdef CONFIG_PREEMPT_TRACER
290extern int trace_selftest_startup_preemptoff(struct tracer *trace, 372extern int trace_selftest_startup_preemptoff(struct tracer *trace,
291 struct trace_array *tr); 373 struct trace_array *tr);
292#endif
293#if defined(CONFIG_IRQSOFF_TRACER) && defined(CONFIG_PREEMPT_TRACER)
294extern int trace_selftest_startup_preemptirqsoff(struct tracer *trace, 374extern int trace_selftest_startup_preemptirqsoff(struct tracer *trace,
295 struct trace_array *tr); 375 struct trace_array *tr);
296#endif
297#ifdef CONFIG_SCHED_TRACER
298extern int trace_selftest_startup_wakeup(struct tracer *trace, 376extern int trace_selftest_startup_wakeup(struct tracer *trace,
299 struct trace_array *tr); 377 struct trace_array *tr);
300#endif 378extern int trace_selftest_startup_nop(struct tracer *trace,
301#ifdef CONFIG_CONTEXT_SWITCH_TRACER 379 struct trace_array *tr);
302extern int trace_selftest_startup_sched_switch(struct tracer *trace, 380extern int trace_selftest_startup_sched_switch(struct tracer *trace,
303 struct trace_array *tr); 381 struct trace_array *tr);
304#endif
305#ifdef CONFIG_SYSPROF_TRACER
306extern int trace_selftest_startup_sysprof(struct tracer *trace, 382extern int trace_selftest_startup_sysprof(struct tracer *trace,
307 struct trace_array *tr); 383 struct trace_array *tr);
308#endif
309#endif /* CONFIG_FTRACE_STARTUP_TEST */ 384#endif /* CONFIG_FTRACE_STARTUP_TEST */
310 385
311extern void *head_page(struct trace_array_cpu *data); 386extern void *head_page(struct trace_array_cpu *data);
312extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...); 387extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...);
388extern void trace_seq_print_cont(struct trace_seq *s,
389 struct trace_iterator *iter);
313extern ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, 390extern ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf,
314 size_t cnt); 391 size_t cnt);
315extern long ns2usecs(cycle_t nsec); 392extern long ns2usecs(cycle_t nsec);
393extern int trace_vprintk(unsigned long ip, const char *fmt, va_list args);
316 394
317extern unsigned long trace_flags; 395extern unsigned long trace_flags;
318 396
@@ -334,6 +412,9 @@ enum trace_iterator_flags {
334 TRACE_ITER_BLOCK = 0x80, 412 TRACE_ITER_BLOCK = 0x80,
335 TRACE_ITER_STACKTRACE = 0x100, 413 TRACE_ITER_STACKTRACE = 0x100,
336 TRACE_ITER_SCHED_TREE = 0x200, 414 TRACE_ITER_SCHED_TREE = 0x200,
415 TRACE_ITER_PRINTK = 0x400,
337}; 416};
338 417
418extern struct tracer nop_trace;
419
339#endif /* _LINUX_KERNEL_TRACE_H */ 420#endif /* _LINUX_KERNEL_TRACE_H */
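
[Editor's note] The per-type entry structs plus trace_assign_type() replace the old union-based trace_entry, and the comment above spells out the recipe for adding a type. A hypothetical sketch of following that recipe (TRACE_FOO, struct foo_entry and foo_print_line() are made-up names), including the three-way enum print_line_t contract:

/* a new record type: the shared header must be the first member */
struct foo_entry {
        struct trace_entry      ent;
        unsigned long           foo_arg;
};

/*
 * TRACE_FOO would be added to enum trace_type, and one line added to
 * trace_assign_type() before __ftrace_bad_type():
 *
 *      IF_ASSIGN(var, ent, struct foo_entry, TRACE_FOO);
 */
static enum print_line_t foo_print_line(struct trace_iterator *iter)
{
        struct trace_entry *entry = iter->ent;
        struct foo_entry *field;

        if (entry->type != TRACE_FOO)
                return TRACE_TYPE_UNHANDLED;    /* fall back to default output */

        trace_assign_type(field, entry);

        if (!trace_seq_printf(&iter->seq, "foo: %lu\n", field->foo_arg))
                return TRACE_TYPE_PARTIAL_LINE; /* seq buffer full, caller retries */

        return TRACE_TYPE_HANDLED;
}

print_trace_line() only falls through to the raw/hex/bin formatters when a tracer's callback returns TRACE_TYPE_UNHANDLED, which is exactly how initcall_print_line() in the new trace_boot.c below ignores non-TRACE_BOOT entries.
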
diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c
new file mode 100644
index 000000000000..d0a5e50eeff2
--- /dev/null
+++ b/kernel/trace/trace_boot.c
@@ -0,0 +1,126 @@
1/*
2 * ring buffer based initcalls tracer
3 *
4 * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com>
5 *
6 */
7
8#include <linux/init.h>
9#include <linux/debugfs.h>
10#include <linux/ftrace.h>
11#include <linux/kallsyms.h>
12
13#include "trace.h"
14
15static struct trace_array *boot_trace;
16static int trace_boot_enabled;
17
18
19/* Should be started after do_pre_smp_initcalls() in init/main.c */
20void start_boot_trace(void)
21{
22 trace_boot_enabled = 1;
23}
24
25void stop_boot_trace(void)
26{
27 trace_boot_enabled = 0;
28}
29
30void reset_boot_trace(struct trace_array *tr)
31{
32 stop_boot_trace();
33}
34
35static void boot_trace_init(struct trace_array *tr)
36{
37 int cpu;
38 boot_trace = tr;
39
40 trace_boot_enabled = 0;
41
42 for_each_cpu_mask(cpu, cpu_possible_map)
43 tracing_reset(tr, cpu);
44}
45
46static void boot_trace_ctrl_update(struct trace_array *tr)
47{
48 if (tr->ctrl)
49 start_boot_trace();
50 else
51 stop_boot_trace();
52}
53
54static enum print_line_t initcall_print_line(struct trace_iterator *iter)
55{
56 int ret;
57 struct trace_entry *entry = iter->ent;
58 struct trace_boot *field = (struct trace_boot *)entry;
59 struct boot_trace *it = &field->initcall;
60 struct trace_seq *s = &iter->seq;
61 struct timespec calltime = ktime_to_timespec(it->calltime);
62 struct timespec rettime = ktime_to_timespec(it->rettime);
63
64 if (entry->type == TRACE_BOOT) {
65 ret = trace_seq_printf(s, "[%5ld.%09ld] calling %s @ %i\n",
66 calltime.tv_sec,
67 calltime.tv_nsec,
68 it->func, it->caller);
69 if (!ret)
70 return TRACE_TYPE_PARTIAL_LINE;
71
72 ret = trace_seq_printf(s, "[%5ld.%09ld] initcall %s "
73 "returned %d after %lld msecs\n",
74 rettime.tv_sec,
75 rettime.tv_nsec,
76 it->func, it->result, it->duration);
77
78 if (!ret)
79 return TRACE_TYPE_PARTIAL_LINE;
80 return TRACE_TYPE_HANDLED;
81 }
82 return TRACE_TYPE_UNHANDLED;
83}
84
85struct tracer boot_tracer __read_mostly =
86{
87 .name = "initcall",
88 .init = boot_trace_init,
89 .reset = reset_boot_trace,
90 .ctrl_update = boot_trace_ctrl_update,
91 .print_line = initcall_print_line,
92};
93
94void trace_boot(struct boot_trace *it, initcall_t fn)
95{
96 struct ring_buffer_event *event;
97 struct trace_boot *entry;
98 struct trace_array_cpu *data;
99 unsigned long irq_flags;
100 struct trace_array *tr = boot_trace;
101
102 if (!trace_boot_enabled)
103 return;
104
105 /* Get its name now since this function could
106 * disappear because it is in the .init section.
107 */
108 sprint_symbol(it->func, (unsigned long)fn);
109 preempt_disable();
110 data = tr->data[smp_processor_id()];
111
112 event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
113 &irq_flags);
114 if (!event)
115 goto out;
116 entry = ring_buffer_event_data(event);
117 tracing_generic_entry_update(&entry->ent, 0, 0);
118 entry->ent.type = TRACE_BOOT;
119 entry->initcall = *it;
120 ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
121
122 trace_wake_up();
123
124 out:
125 preempt_enable();
126}
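
[Editor's note] trace_boot() expects its struct boot_trace argument to be filled in by the initcall path; it only sets func itself, via sprint_symbol(). A rough sketch of what a caller in init/main.c might look like, with field semantics inferred from initcall_print_line() above; the pid-as-caller choice and the millisecond conversion are assumptions, not the actual implementation:

static void __init do_one_traced_initcall(initcall_t fn)
{
        struct boot_trace it;

        it.caller = task_pid_nr(current);       /* printed as "calling %s @ %i" */
        it.calltime = ktime_get();
        it.result = fn();
        it.rettime = ktime_get();
        it.duration = ktime_to_ns(ktime_sub(it.rettime, it.calltime))
                        / NSEC_PER_MSEC;        /* "returned %d after %lld msecs" */
        trace_boot(&it, fn);                    /* fills it.func via sprint_symbol() */
}
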
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 312144897970..e90eb0c2c56c 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -23,7 +23,7 @@ static void function_reset(struct trace_array *tr)
23 tr->time_start = ftrace_now(tr->cpu); 23 tr->time_start = ftrace_now(tr->cpu);
24 24
25 for_each_online_cpu(cpu) 25 for_each_online_cpu(cpu)
26 tracing_reset(tr->data[cpu]); 26 tracing_reset(tr, cpu);
27} 27}
28 28
29static void start_function_trace(struct trace_array *tr) 29static void start_function_trace(struct trace_array *tr)
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 421d6fe3650e..a7db7f040ae0 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -95,7 +95,7 @@ irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip)
95 disabled = atomic_inc_return(&data->disabled); 95 disabled = atomic_inc_return(&data->disabled);
96 96
97 if (likely(disabled == 1)) 97 if (likely(disabled == 1))
98 trace_function(tr, data, ip, parent_ip, flags); 98 trace_function(tr, data, ip, parent_ip, flags, preempt_count());
99 99
100 atomic_dec(&data->disabled); 100 atomic_dec(&data->disabled);
101} 101}
@@ -130,6 +130,7 @@ check_critical_timing(struct trace_array *tr,
130 unsigned long latency, t0, t1; 130 unsigned long latency, t0, t1;
131 cycle_t T0, T1, delta; 131 cycle_t T0, T1, delta;
132 unsigned long flags; 132 unsigned long flags;
133 int pc;
133 134
134 /* 135 /*
135 * usecs conversion is slow so we try to delay the conversion 136 * usecs conversion is slow so we try to delay the conversion
@@ -141,6 +142,8 @@ check_critical_timing(struct trace_array *tr,
141 142
142 local_save_flags(flags); 143 local_save_flags(flags);
143 144
145 pc = preempt_count();
146
144 if (!report_latency(delta)) 147 if (!report_latency(delta))
145 goto out; 148 goto out;
146 149
@@ -150,7 +153,7 @@ check_critical_timing(struct trace_array *tr,
150 if (!report_latency(delta)) 153 if (!report_latency(delta))
151 goto out_unlock; 154 goto out_unlock;
152 155
153 trace_function(tr, data, CALLER_ADDR0, parent_ip, flags); 156 trace_function(tr, data, CALLER_ADDR0, parent_ip, flags, pc);
154 157
155 latency = nsecs_to_usecs(delta); 158 latency = nsecs_to_usecs(delta);
156 159
@@ -173,8 +176,8 @@ out_unlock:
173out: 176out:
174 data->critical_sequence = max_sequence; 177 data->critical_sequence = max_sequence;
175 data->preempt_timestamp = ftrace_now(cpu); 178 data->preempt_timestamp = ftrace_now(cpu);
176 tracing_reset(data); 179 tracing_reset(tr, cpu);
177 trace_function(tr, data, CALLER_ADDR0, parent_ip, flags); 180 trace_function(tr, data, CALLER_ADDR0, parent_ip, flags, pc);
178} 181}
179 182
180static inline void 183static inline void
@@ -203,11 +206,11 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip)
203 data->critical_sequence = max_sequence; 206 data->critical_sequence = max_sequence;
204 data->preempt_timestamp = ftrace_now(cpu); 207 data->preempt_timestamp = ftrace_now(cpu);
205 data->critical_start = parent_ip ? : ip; 208 data->critical_start = parent_ip ? : ip;
206 tracing_reset(data); 209 tracing_reset(tr, cpu);
207 210
208 local_save_flags(flags); 211 local_save_flags(flags);
209 212
210 trace_function(tr, data, ip, parent_ip, flags); 213 trace_function(tr, data, ip, parent_ip, flags, preempt_count());
211 214
212 per_cpu(tracing_cpu, cpu) = 1; 215 per_cpu(tracing_cpu, cpu) = 1;
213 216
@@ -234,14 +237,14 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip)
234 237
235 data = tr->data[cpu]; 238 data = tr->data[cpu];
236 239
237 if (unlikely(!data) || unlikely(!head_page(data)) || 240 if (unlikely(!data) ||
238 !data->critical_start || atomic_read(&data->disabled)) 241 !data->critical_start || atomic_read(&data->disabled))
239 return; 242 return;
240 243
241 atomic_inc(&data->disabled); 244 atomic_inc(&data->disabled);
242 245
243 local_save_flags(flags); 246 local_save_flags(flags);
244 trace_function(tr, data, ip, parent_ip, flags); 247 trace_function(tr, data, ip, parent_ip, flags, preempt_count());
245 check_critical_timing(tr, data, parent_ip ? : ip, cpu); 248 check_critical_timing(tr, data, parent_ip ? : ip, cpu);
246 data->critical_start = 0; 249 data->critical_start = 0;
247 atomic_dec(&data->disabled); 250 atomic_dec(&data->disabled);
@@ -253,12 +256,14 @@ void start_critical_timings(void)
253 if (preempt_trace() || irq_trace()) 256 if (preempt_trace() || irq_trace())
254 start_critical_timing(CALLER_ADDR0, CALLER_ADDR1); 257 start_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
255} 258}
259EXPORT_SYMBOL_GPL(start_critical_timings);
256 260
257void stop_critical_timings(void) 261void stop_critical_timings(void)
258{ 262{
259 if (preempt_trace() || irq_trace()) 263 if (preempt_trace() || irq_trace())
260 stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1); 264 stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
261} 265}
266EXPORT_SYMBOL_GPL(stop_critical_timings);
262 267
263#ifdef CONFIG_IRQSOFF_TRACER 268#ifdef CONFIG_IRQSOFF_TRACER
264#ifdef CONFIG_PROVE_LOCKING 269#ifdef CONFIG_PROVE_LOCKING
@@ -337,12 +342,14 @@ EXPORT_SYMBOL(trace_hardirqs_off_caller);
337#ifdef CONFIG_PREEMPT_TRACER 342#ifdef CONFIG_PREEMPT_TRACER
338void trace_preempt_on(unsigned long a0, unsigned long a1) 343void trace_preempt_on(unsigned long a0, unsigned long a1)
339{ 344{
340 stop_critical_timing(a0, a1); 345 if (preempt_trace())
346 stop_critical_timing(a0, a1);
341} 347}
342 348
343void trace_preempt_off(unsigned long a0, unsigned long a1) 349void trace_preempt_off(unsigned long a0, unsigned long a1)
344{ 350{
345 start_critical_timing(a0, a1); 351 if (preempt_trace())
352 start_critical_timing(a0, a1);
346} 353}
347#endif /* CONFIG_PREEMPT_TRACER */ 354#endif /* CONFIG_PREEMPT_TRACER */
348 355
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index b13dc19dcbb4..f28484618ff0 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -27,7 +27,7 @@ static void mmio_reset_data(struct trace_array *tr)
27 tr->time_start = ftrace_now(tr->cpu); 27 tr->time_start = ftrace_now(tr->cpu);
28 28
29 for_each_online_cpu(cpu) 29 for_each_online_cpu(cpu)
30 tracing_reset(tr->data[cpu]); 30 tracing_reset(tr, cpu);
31} 31}
32 32
33static void mmio_trace_init(struct trace_array *tr) 33static void mmio_trace_init(struct trace_array *tr)
@@ -130,10 +130,14 @@ static unsigned long count_overruns(struct trace_iterator *iter)
130{ 130{
131 int cpu; 131 int cpu;
132 unsigned long cnt = 0; 132 unsigned long cnt = 0;
133/* FIXME: */
134#if 0
133 for_each_online_cpu(cpu) { 135 for_each_online_cpu(cpu) {
134 cnt += iter->overrun[cpu]; 136 cnt += iter->overrun[cpu];
135 iter->overrun[cpu] = 0; 137 iter->overrun[cpu] = 0;
136 } 138 }
139#endif
140 (void)cpu;
137 return cnt; 141 return cnt;
138} 142}
139 143
@@ -171,17 +175,21 @@ print_out:
171 return (ret == -EBUSY) ? 0 : ret; 175 return (ret == -EBUSY) ? 0 : ret;
172} 176}
173 177
174static int mmio_print_rw(struct trace_iterator *iter) 178static enum print_line_t mmio_print_rw(struct trace_iterator *iter)
175{ 179{
176 struct trace_entry *entry = iter->ent; 180 struct trace_entry *entry = iter->ent;
177 struct mmiotrace_rw *rw = &entry->mmiorw; 181 struct trace_mmiotrace_rw *field;
182 struct mmiotrace_rw *rw;
178 struct trace_seq *s = &iter->seq; 183 struct trace_seq *s = &iter->seq;
179 unsigned long long t = ns2usecs(entry->t); 184 unsigned long long t = ns2usecs(iter->ts);
180 unsigned long usec_rem = do_div(t, 1000000ULL); 185 unsigned long usec_rem = do_div(t, 1000000ULL);
181 unsigned secs = (unsigned long)t; 186 unsigned secs = (unsigned long)t;
182 int ret = 1; 187 int ret = 1;
183 188
184 switch (entry->mmiorw.opcode) { 189 trace_assign_type(field, entry);
190 rw = &field->rw;
191
192 switch (rw->opcode) {
185 case MMIO_READ: 193 case MMIO_READ:
186 ret = trace_seq_printf(s, 194 ret = trace_seq_printf(s,
187 "R %d %lu.%06lu %d 0x%llx 0x%lx 0x%lx %d\n", 195 "R %d %lu.%06lu %d 0x%llx 0x%lx 0x%lx %d\n",
@@ -209,21 +217,25 @@ static int mmio_print_rw(struct trace_iterator *iter)
209 break; 217 break;
210 } 218 }
211 if (ret) 219 if (ret)
212 return 1; 220 return TRACE_TYPE_HANDLED;
213 return 0; 221 return TRACE_TYPE_PARTIAL_LINE;
214} 222}
215 223
216static int mmio_print_map(struct trace_iterator *iter) 224static enum print_line_t mmio_print_map(struct trace_iterator *iter)
217{ 225{
218 struct trace_entry *entry = iter->ent; 226 struct trace_entry *entry = iter->ent;
219 struct mmiotrace_map *m = &entry->mmiomap; 227 struct trace_mmiotrace_map *field;
228 struct mmiotrace_map *m;
220 struct trace_seq *s = &iter->seq; 229 struct trace_seq *s = &iter->seq;
221 unsigned long long t = ns2usecs(entry->t); 230 unsigned long long t = ns2usecs(iter->ts);
222 unsigned long usec_rem = do_div(t, 1000000ULL); 231 unsigned long usec_rem = do_div(t, 1000000ULL);
223 unsigned secs = (unsigned long)t; 232 unsigned secs = (unsigned long)t;
224 int ret = 1; 233 int ret;
225 234
226 switch (entry->mmiorw.opcode) { 235 trace_assign_type(field, entry);
236 m = &field->map;
237
238 switch (m->opcode) {
227 case MMIO_PROBE: 239 case MMIO_PROBE:
228 ret = trace_seq_printf(s, 240 ret = trace_seq_printf(s,
229 "MAP %lu.%06lu %d 0x%llx 0x%lx 0x%lx 0x%lx %d\n", 241 "MAP %lu.%06lu %d 0x%llx 0x%lx 0x%lx 0x%lx %d\n",
@@ -241,20 +253,43 @@ static int mmio_print_map(struct trace_iterator *iter)
241 break; 253 break;
242 } 254 }
243 if (ret) 255 if (ret)
244 return 1; 256 return TRACE_TYPE_HANDLED;
245 return 0; 257 return TRACE_TYPE_PARTIAL_LINE;
258}
259
260static enum print_line_t mmio_print_mark(struct trace_iterator *iter)
261{
262 struct trace_entry *entry = iter->ent;
263 struct print_entry *print = (struct print_entry *)entry;
264 const char *msg = print->buf;
265 struct trace_seq *s = &iter->seq;
266 unsigned long long t = ns2usecs(iter->ts);
267 unsigned long usec_rem = do_div(t, 1000000ULL);
268 unsigned secs = (unsigned long)t;
269 int ret;
270
271 /* The trailing newline must be in the message. */
272 ret = trace_seq_printf(s, "MARK %lu.%06lu %s", secs, usec_rem, msg);
273 if (!ret)
274 return TRACE_TYPE_PARTIAL_LINE;
275
276 if (entry->flags & TRACE_FLAG_CONT)
277 trace_seq_print_cont(s, iter);
278
279 return TRACE_TYPE_HANDLED;
246} 280}
247 281
248/* return 0 to abort printing without consuming current entry in pipe mode */ 282static enum print_line_t mmio_print_line(struct trace_iterator *iter)
249static int mmio_print_line(struct trace_iterator *iter)
250{ 283{
251 switch (iter->ent->type) { 284 switch (iter->ent->type) {
252 case TRACE_MMIO_RW: 285 case TRACE_MMIO_RW:
253 return mmio_print_rw(iter); 286 return mmio_print_rw(iter);
254 case TRACE_MMIO_MAP: 287 case TRACE_MMIO_MAP:
255 return mmio_print_map(iter); 288 return mmio_print_map(iter);
289 case TRACE_PRINT:
290 return mmio_print_mark(iter);
256 default: 291 default:
257 return 1; /* ignore unknown entries */ 292 return TRACE_TYPE_HANDLED; /* ignore unknown entries */
258 } 293 }
259} 294}
260 295
@@ -276,6 +311,27 @@ __init static int init_mmio_trace(void)
276} 311}
277device_initcall(init_mmio_trace); 312device_initcall(init_mmio_trace);
278 313
314static void __trace_mmiotrace_rw(struct trace_array *tr,
315 struct trace_array_cpu *data,
316 struct mmiotrace_rw *rw)
317{
318 struct ring_buffer_event *event;
319 struct trace_mmiotrace_rw *entry;
320 unsigned long irq_flags;
321
322 event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
323 &irq_flags);
324 if (!event)
325 return;
326 entry = ring_buffer_event_data(event);
327 tracing_generic_entry_update(&entry->ent, 0, preempt_count());
328 entry->ent.type = TRACE_MMIO_RW;
329 entry->rw = *rw;
330 ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
331
332 trace_wake_up();
333}
334
279void mmio_trace_rw(struct mmiotrace_rw *rw) 335void mmio_trace_rw(struct mmiotrace_rw *rw)
280{ 336{
281 struct trace_array *tr = mmio_trace_array; 337 struct trace_array *tr = mmio_trace_array;
@@ -283,6 +339,27 @@ void mmio_trace_rw(struct mmiotrace_rw *rw)
283 __trace_mmiotrace_rw(tr, data, rw); 339 __trace_mmiotrace_rw(tr, data, rw);
284} 340}
285 341
342static void __trace_mmiotrace_map(struct trace_array *tr,
343 struct trace_array_cpu *data,
344 struct mmiotrace_map *map)
345{
346 struct ring_buffer_event *event;
347 struct trace_mmiotrace_map *entry;
348 unsigned long irq_flags;
349
350 event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
351 &irq_flags);
352 if (!event)
353 return;
354 entry = ring_buffer_event_data(event);
355 tracing_generic_entry_update(&entry->ent, 0, preempt_count());
356 entry->ent.type = TRACE_MMIO_MAP;
357 entry->map = *map;
358 ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
359
360 trace_wake_up();
361}
362
286void mmio_trace_mapping(struct mmiotrace_map *map) 363void mmio_trace_mapping(struct mmiotrace_map *map)
287{ 364{
288 struct trace_array *tr = mmio_trace_array; 365 struct trace_array *tr = mmio_trace_array;
@@ -293,3 +370,8 @@ void mmio_trace_mapping(struct mmiotrace_map *map)
293 __trace_mmiotrace_map(tr, data, map); 370 __trace_mmiotrace_map(tr, data, map);
294 preempt_enable(); 371 preempt_enable();
295} 372}
373
374int mmio_trace_printk(const char *fmt, va_list args)
375{
376 return trace_vprintk(0, fmt, args);
377}
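The new TRACE_PRINT path above is fed through mmio_trace_printk(), which takes a va_list, so a caller needs a small varargs wrapper along these lines (the wrapper name is illustrative; the trailing newline has to be part of the format, as mmio_print_mark() notes):

static int my_mmio_mark(const char *fmt, ...)
{
	va_list args;
	int ret;

	va_start(args, fmt);
	ret = mmio_trace_printk(fmt, args);	/* shows up as a "MARK ..." line */
	va_end(args);

	return ret;
}

/* e.g.: my_mmio_mark("entering %s\n", __func__); */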
diff --git a/kernel/trace/trace_nop.c b/kernel/trace/trace_nop.c
new file mode 100644
index 000000000000..4592b4862515
--- /dev/null
+++ b/kernel/trace/trace_nop.c
@@ -0,0 +1,64 @@
1/*
2 * nop tracer
3 *
4 * Copyright (C) 2008 Steven Noonan <steven@uplinklabs.net>
5 *
6 */
7
8#include <linux/module.h>
9#include <linux/fs.h>
10#include <linux/debugfs.h>
11#include <linux/ftrace.h>
12
13#include "trace.h"
14
15static struct trace_array *ctx_trace;
16
17static void start_nop_trace(struct trace_array *tr)
18{
19 /* Nothing to do! */
20}
21
22static void stop_nop_trace(struct trace_array *tr)
23{
24 /* Nothing to do! */
25}
26
27static void nop_trace_init(struct trace_array *tr)
28{
29 int cpu;
30 ctx_trace = tr;
31
32 for_each_online_cpu(cpu)
33 tracing_reset(tr, cpu);
34
35 if (tr->ctrl)
36 start_nop_trace(tr);
37}
38
39static void nop_trace_reset(struct trace_array *tr)
40{
41 if (tr->ctrl)
42 stop_nop_trace(tr);
43}
44
45static void nop_trace_ctrl_update(struct trace_array *tr)
46{
47 /* When starting a new trace, reset the buffers */
48 if (tr->ctrl)
49 start_nop_trace(tr);
50 else
51 stop_nop_trace(tr);
52}
53
54struct tracer nop_trace __read_mostly =
55{
56 .name = "nop",
57 .init = nop_trace_init,
58 .reset = nop_trace_reset,
59 .ctrl_update = nop_trace_ctrl_update,
60#ifdef CONFIG_FTRACE_SELFTEST
61 .selftest = trace_selftest_startup_nop,
62#endif
63};
64
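Like the other plugins in this directory, the nop tracer still has to be made known to the ftrace core before it appears in the tracing debugfs interface. A sketch of that hookup, assuming the usual register_tracer() interface from kernel/trace/trace.c; the initcall name is illustrative and the actual registration point for nop_trace may differ:

static __init int init_nop_tracer(void)
{
	return register_tracer(&nop_trace);
}
device_initcall(init_nop_tracer);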
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index cb817a209aa0..b8f56beb1a62 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -9,8 +9,8 @@
9#include <linux/debugfs.h> 9#include <linux/debugfs.h>
10#include <linux/kallsyms.h> 10#include <linux/kallsyms.h>
11#include <linux/uaccess.h> 11#include <linux/uaccess.h>
12#include <linux/marker.h>
13#include <linux/ftrace.h> 12#include <linux/ftrace.h>
13#include <trace/sched.h>
14 14
15#include "trace.h" 15#include "trace.h"
16 16
@@ -19,15 +19,16 @@ static int __read_mostly tracer_enabled;
19static atomic_t sched_ref; 19static atomic_t sched_ref;
20 20
21static void 21static void
22sched_switch_func(void *private, void *__rq, struct task_struct *prev, 22probe_sched_switch(struct rq *__rq, struct task_struct *prev,
23 struct task_struct *next) 23 struct task_struct *next)
24{ 24{
25 struct trace_array **ptr = private;
26 struct trace_array *tr = *ptr;
27 struct trace_array_cpu *data; 25 struct trace_array_cpu *data;
28 unsigned long flags; 26 unsigned long flags;
29 long disabled;
30 int cpu; 27 int cpu;
28 int pc;
29
30 if (!atomic_read(&sched_ref))
31 return;
31 32
32 tracing_record_cmdline(prev); 33 tracing_record_cmdline(prev);
33 tracing_record_cmdline(next); 34 tracing_record_cmdline(next);
@@ -35,97 +36,41 @@ sched_switch_func(void *private, void *__rq, struct task_struct *prev,
35 if (!tracer_enabled) 36 if (!tracer_enabled)
36 return; 37 return;
37 38
39 pc = preempt_count();
38 local_irq_save(flags); 40 local_irq_save(flags);
39 cpu = raw_smp_processor_id(); 41 cpu = raw_smp_processor_id();
40 data = tr->data[cpu]; 42 data = ctx_trace->data[cpu];
41 disabled = atomic_inc_return(&data->disabled);
42 43
43 if (likely(disabled == 1)) 44 if (likely(!atomic_read(&data->disabled)))
44 tracing_sched_switch_trace(tr, data, prev, next, flags); 45 tracing_sched_switch_trace(ctx_trace, data, prev, next, flags, pc);
45 46
46 atomic_dec(&data->disabled);
47 local_irq_restore(flags); 47 local_irq_restore(flags);
48} 48}
49 49
50static notrace void
51sched_switch_callback(void *probe_data, void *call_data,
52 const char *format, va_list *args)
53{
54 struct task_struct *prev;
55 struct task_struct *next;
56 struct rq *__rq;
57
58 if (!atomic_read(&sched_ref))
59 return;
60
61 /* skip prev_pid %d next_pid %d prev_state %ld */
62 (void)va_arg(*args, int);
63 (void)va_arg(*args, int);
64 (void)va_arg(*args, long);
65 __rq = va_arg(*args, typeof(__rq));
66 prev = va_arg(*args, typeof(prev));
67 next = va_arg(*args, typeof(next));
68
69 /*
70 * If tracer_switch_func only points to the local
71 * switch func, it still needs the ptr passed to it.
72 */
73 sched_switch_func(probe_data, __rq, prev, next);
74}
75
76static void 50static void
77wakeup_func(void *private, void *__rq, struct task_struct *wakee, struct 51probe_sched_wakeup(struct rq *__rq, struct task_struct *wakee)
78 task_struct *curr)
79{ 52{
80 struct trace_array **ptr = private;
81 struct trace_array *tr = *ptr;
82 struct trace_array_cpu *data; 53 struct trace_array_cpu *data;
83 unsigned long flags; 54 unsigned long flags;
84 long disabled; 55 int cpu, pc;
85 int cpu;
86 56
87 if (!tracer_enabled) 57 if (!likely(tracer_enabled))
88 return; 58 return;
89 59
90 tracing_record_cmdline(curr); 60 pc = preempt_count();
61 tracing_record_cmdline(current);
91 62
92 local_irq_save(flags); 63 local_irq_save(flags);
93 cpu = raw_smp_processor_id(); 64 cpu = raw_smp_processor_id();
94 data = tr->data[cpu]; 65 data = ctx_trace->data[cpu];
95 disabled = atomic_inc_return(&data->disabled);
96 66
97 if (likely(disabled == 1)) 67 if (likely(!atomic_read(&data->disabled)))
98 tracing_sched_wakeup_trace(tr, data, wakee, curr, flags); 68 tracing_sched_wakeup_trace(ctx_trace, data, wakee, current,
69 flags, pc);
99 70
100 atomic_dec(&data->disabled);
101 local_irq_restore(flags); 71 local_irq_restore(flags);
102} 72}
103 73
104static notrace void
105wake_up_callback(void *probe_data, void *call_data,
106 const char *format, va_list *args)
107{
108 struct task_struct *curr;
109 struct task_struct *task;
110 struct rq *__rq;
111
112 if (likely(!tracer_enabled))
113 return;
114
115 /* Skip pid %d state %ld */
116 (void)va_arg(*args, int);
117 (void)va_arg(*args, long);
118 /* now get the meat: "rq %p task %p rq->curr %p" */
119 __rq = va_arg(*args, typeof(__rq));
120 task = va_arg(*args, typeof(task));
121 curr = va_arg(*args, typeof(curr));
122
123 tracing_record_cmdline(task);
124 tracing_record_cmdline(curr);
125
126 wakeup_func(probe_data, __rq, task, curr);
127}
128
129static void sched_switch_reset(struct trace_array *tr) 74static void sched_switch_reset(struct trace_array *tr)
130{ 75{
131 int cpu; 76 int cpu;
@@ -133,67 +78,47 @@ static void sched_switch_reset(struct trace_array *tr)
133 tr->time_start = ftrace_now(tr->cpu); 78 tr->time_start = ftrace_now(tr->cpu);
134 79
135 for_each_online_cpu(cpu) 80 for_each_online_cpu(cpu)
136 tracing_reset(tr->data[cpu]); 81 tracing_reset(tr, cpu);
137} 82}
138 83
139static int tracing_sched_register(void) 84static int tracing_sched_register(void)
140{ 85{
141 int ret; 86 int ret;
142 87
143 ret = marker_probe_register("kernel_sched_wakeup", 88 ret = register_trace_sched_wakeup(probe_sched_wakeup);
144 "pid %d state %ld ## rq %p task %p rq->curr %p",
145 wake_up_callback,
146 &ctx_trace);
147 if (ret) { 89 if (ret) {
148 pr_info("wakeup trace: Couldn't add marker" 90 pr_info("wakeup trace: Couldn't activate tracepoint"
149 " probe to kernel_sched_wakeup\n"); 91 " probe to kernel_sched_wakeup\n");
150 return ret; 92 return ret;
151 } 93 }
152 94
153 ret = marker_probe_register("kernel_sched_wakeup_new", 95 ret = register_trace_sched_wakeup_new(probe_sched_wakeup);
154 "pid %d state %ld ## rq %p task %p rq->curr %p",
155 wake_up_callback,
156 &ctx_trace);
157 if (ret) { 96 if (ret) {
158 pr_info("wakeup trace: Couldn't add marker" 97 pr_info("wakeup trace: Couldn't activate tracepoint"
159 " probe to kernel_sched_wakeup_new\n"); 98 " probe to kernel_sched_wakeup_new\n");
160 goto fail_deprobe; 99 goto fail_deprobe;
161 } 100 }
162 101
163 ret = marker_probe_register("kernel_sched_schedule", 102 ret = register_trace_sched_switch(probe_sched_switch);
164 "prev_pid %d next_pid %d prev_state %ld "
165 "## rq %p prev %p next %p",
166 sched_switch_callback,
167 &ctx_trace);
168 if (ret) { 103 if (ret) {
169 pr_info("sched trace: Couldn't add marker" 104 pr_info("sched trace: Couldn't activate tracepoint"
170 " probe to kernel_sched_schedule\n"); 105 " probe to kernel_sched_schedule\n");
171 goto fail_deprobe_wake_new; 106 goto fail_deprobe_wake_new;
172 } 107 }
173 108
174 return ret; 109 return ret;
175fail_deprobe_wake_new: 110fail_deprobe_wake_new:
176 marker_probe_unregister("kernel_sched_wakeup_new", 111 unregister_trace_sched_wakeup_new(probe_sched_wakeup);
177 wake_up_callback,
178 &ctx_trace);
179fail_deprobe: 112fail_deprobe:
180 marker_probe_unregister("kernel_sched_wakeup", 113 unregister_trace_sched_wakeup(probe_sched_wakeup);
181 wake_up_callback,
182 &ctx_trace);
183 return ret; 114 return ret;
184} 115}
185 116
186static void tracing_sched_unregister(void) 117static void tracing_sched_unregister(void)
187{ 118{
188 marker_probe_unregister("kernel_sched_schedule", 119 unregister_trace_sched_switch(probe_sched_switch);
189 sched_switch_callback, 120 unregister_trace_sched_wakeup_new(probe_sched_wakeup);
190 &ctx_trace); 121 unregister_trace_sched_wakeup(probe_sched_wakeup);
191 marker_probe_unregister("kernel_sched_wakeup_new",
192 wake_up_callback,
193 &ctx_trace);
194 marker_probe_unregister("kernel_sched_wakeup",
195 wake_up_callback,
196 &ctx_trace);
197} 122}
198 123
199static void tracing_start_sched_switch(void) 124static void tracing_start_sched_switch(void)
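The hunk above replaces the old format-string markers with static tracepoints from <trace/sched.h>: a probe now has the tracepoint's exact C signature and is attached and detached by name-specific register/unregister helpers, with no va_list unpacking. Stripped of the tracer bookkeeping, the pattern looks like this (probe and counter names are illustrative):

#include <trace/sched.h>

static atomic_t my_wakeup_count = ATOMIC_INIT(0);

/* must match the sched_wakeup tracepoint prototype used above */
static void my_probe_sched_wakeup(struct rq *rq, struct task_struct *p)
{
	atomic_inc(&my_wakeup_count);
}

static int my_probe_attach(void)
{
	return register_trace_sched_wakeup(my_probe_sched_wakeup);
}

static void my_probe_detach(void)
{
	unregister_trace_sched_wakeup(my_probe_sched_wakeup);
}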
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 3c8d61df4474..fe4a252c2363 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -15,7 +15,7 @@
15#include <linux/kallsyms.h> 15#include <linux/kallsyms.h>
16#include <linux/uaccess.h> 16#include <linux/uaccess.h>
17#include <linux/ftrace.h> 17#include <linux/ftrace.h>
18#include <linux/marker.h> 18#include <trace/sched.h>
19 19
20#include "trace.h" 20#include "trace.h"
21 21
@@ -26,7 +26,8 @@ static struct task_struct *wakeup_task;
26static int wakeup_cpu; 26static int wakeup_cpu;
27static unsigned wakeup_prio = -1; 27static unsigned wakeup_prio = -1;
28 28
29static DEFINE_SPINLOCK(wakeup_lock); 29static raw_spinlock_t wakeup_lock =
30 (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
30 31
31static void __wakeup_reset(struct trace_array *tr); 32static void __wakeup_reset(struct trace_array *tr);
32 33
@@ -43,10 +44,12 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip)
43 long disabled; 44 long disabled;
44 int resched; 45 int resched;
45 int cpu; 46 int cpu;
47 int pc;
46 48
47 if (likely(!wakeup_task)) 49 if (likely(!wakeup_task))
48 return; 50 return;
49 51
52 pc = preempt_count();
50 resched = need_resched(); 53 resched = need_resched();
51 preempt_disable_notrace(); 54 preempt_disable_notrace();
52 55
@@ -56,7 +59,8 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip)
56 if (unlikely(disabled != 1)) 59 if (unlikely(disabled != 1))
57 goto out; 60 goto out;
58 61
59 spin_lock_irqsave(&wakeup_lock, flags); 62 local_irq_save(flags);
63 __raw_spin_lock(&wakeup_lock);
60 64
61 if (unlikely(!wakeup_task)) 65 if (unlikely(!wakeup_task))
62 goto unlock; 66 goto unlock;
@@ -68,10 +72,11 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip)
68 if (task_cpu(wakeup_task) != cpu) 72 if (task_cpu(wakeup_task) != cpu)
69 goto unlock; 73 goto unlock;
70 74
71 trace_function(tr, data, ip, parent_ip, flags); 75 trace_function(tr, data, ip, parent_ip, flags, pc);
72 76
73 unlock: 77 unlock:
74 spin_unlock_irqrestore(&wakeup_lock, flags); 78 __raw_spin_unlock(&wakeup_lock);
79 local_irq_restore(flags);
75 80
76 out: 81 out:
77 atomic_dec(&data->disabled); 82 atomic_dec(&data->disabled);
@@ -109,17 +114,18 @@ static int report_latency(cycle_t delta)
109} 114}
110 115
111static void notrace 116static void notrace
112wakeup_sched_switch(void *private, void *rq, struct task_struct *prev, 117probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev,
113 struct task_struct *next) 118 struct task_struct *next)
114{ 119{
115 unsigned long latency = 0, t0 = 0, t1 = 0; 120 unsigned long latency = 0, t0 = 0, t1 = 0;
116 struct trace_array **ptr = private;
117 struct trace_array *tr = *ptr;
118 struct trace_array_cpu *data; 121 struct trace_array_cpu *data;
119 cycle_t T0, T1, delta; 122 cycle_t T0, T1, delta;
120 unsigned long flags; 123 unsigned long flags;
121 long disabled; 124 long disabled;
122 int cpu; 125 int cpu;
126 int pc;
127
128 tracing_record_cmdline(prev);
123 129
124 if (unlikely(!tracer_enabled)) 130 if (unlikely(!tracer_enabled))
125 return; 131 return;
@@ -136,22 +142,25 @@ wakeup_sched_switch(void *private, void *rq, struct task_struct *prev,
136 if (next != wakeup_task) 142 if (next != wakeup_task)
137 return; 143 return;
138 144
145 pc = preempt_count();
146
139 /* The task we are waiting for is waking up */ 147 /* The task we are waiting for is waking up */
140 data = tr->data[wakeup_cpu]; 148 data = wakeup_trace->data[wakeup_cpu];
141 149
142 /* disable local data, not wakeup_cpu data */ 150 /* disable local data, not wakeup_cpu data */
143 cpu = raw_smp_processor_id(); 151 cpu = raw_smp_processor_id();
144 disabled = atomic_inc_return(&tr->data[cpu]->disabled); 152 disabled = atomic_inc_return(&wakeup_trace->data[cpu]->disabled);
145 if (likely(disabled != 1)) 153 if (likely(disabled != 1))
146 goto out; 154 goto out;
147 155
148 spin_lock_irqsave(&wakeup_lock, flags); 156 local_irq_save(flags);
157 __raw_spin_lock(&wakeup_lock);
149 158
150 /* We could race with grabbing wakeup_lock */ 159 /* We could race with grabbing wakeup_lock */
151 if (unlikely(!tracer_enabled || next != wakeup_task)) 160 if (unlikely(!tracer_enabled || next != wakeup_task))
152 goto out_unlock; 161 goto out_unlock;
153 162
154 trace_function(tr, data, CALLER_ADDR1, CALLER_ADDR2, flags); 163 trace_function(wakeup_trace, data, CALLER_ADDR1, CALLER_ADDR2, flags, pc);
155 164
156 /* 165 /*
157 * usecs conversion is slow so we try to delay the conversion 166 * usecs conversion is slow so we try to delay the conversion
@@ -170,38 +179,14 @@ wakeup_sched_switch(void *private, void *rq, struct task_struct *prev,
170 t0 = nsecs_to_usecs(T0); 179 t0 = nsecs_to_usecs(T0);
171 t1 = nsecs_to_usecs(T1); 180 t1 = nsecs_to_usecs(T1);
172 181
173 update_max_tr(tr, wakeup_task, wakeup_cpu); 182 update_max_tr(wakeup_trace, wakeup_task, wakeup_cpu);
174 183
175out_unlock: 184out_unlock:
176 __wakeup_reset(tr); 185 __wakeup_reset(wakeup_trace);
177 spin_unlock_irqrestore(&wakeup_lock, flags); 186 __raw_spin_unlock(&wakeup_lock);
187 local_irq_restore(flags);
178out: 188out:
179 atomic_dec(&tr->data[cpu]->disabled); 189 atomic_dec(&wakeup_trace->data[cpu]->disabled);
180}
181
182static notrace void
183sched_switch_callback(void *probe_data, void *call_data,
184 const char *format, va_list *args)
185{
186 struct task_struct *prev;
187 struct task_struct *next;
188 struct rq *__rq;
189
190 /* skip prev_pid %d next_pid %d prev_state %ld */
191 (void)va_arg(*args, int);
192 (void)va_arg(*args, int);
193 (void)va_arg(*args, long);
194 __rq = va_arg(*args, typeof(__rq));
195 prev = va_arg(*args, typeof(prev));
196 next = va_arg(*args, typeof(next));
197
198 tracing_record_cmdline(prev);
199
200 /*
201 * If tracer_switch_func only points to the local
202 * switch func, it still needs the ptr passed to it.
203 */
204 wakeup_sched_switch(probe_data, __rq, prev, next);
205} 190}
206 191
207static void __wakeup_reset(struct trace_array *tr) 192static void __wakeup_reset(struct trace_array *tr)
@@ -209,11 +194,9 @@ static void __wakeup_reset(struct trace_array *tr)
209 struct trace_array_cpu *data; 194 struct trace_array_cpu *data;
210 int cpu; 195 int cpu;
211 196
212 assert_spin_locked(&wakeup_lock);
213
214 for_each_possible_cpu(cpu) { 197 for_each_possible_cpu(cpu) {
215 data = tr->data[cpu]; 198 data = tr->data[cpu];
216 tracing_reset(data); 199 tracing_reset(tr, cpu);
217 } 200 }
218 201
219 wakeup_cpu = -1; 202 wakeup_cpu = -1;
@@ -229,37 +212,46 @@ static void wakeup_reset(struct trace_array *tr)
229{ 212{
230 unsigned long flags; 213 unsigned long flags;
231 214
232 spin_lock_irqsave(&wakeup_lock, flags); 215 local_irq_save(flags);
216 __raw_spin_lock(&wakeup_lock);
233 __wakeup_reset(tr); 217 __wakeup_reset(tr);
234 spin_unlock_irqrestore(&wakeup_lock, flags); 218 __raw_spin_unlock(&wakeup_lock);
219 local_irq_restore(flags);
235} 220}
236 221
237static void 222static void
238wakeup_check_start(struct trace_array *tr, struct task_struct *p, 223probe_wakeup(struct rq *rq, struct task_struct *p)
239 struct task_struct *curr)
240{ 224{
241 int cpu = smp_processor_id(); 225 int cpu = smp_processor_id();
242 unsigned long flags; 226 unsigned long flags;
243 long disabled; 227 long disabled;
228 int pc;
229
230 if (likely(!tracer_enabled))
231 return;
232
233 tracing_record_cmdline(p);
234 tracing_record_cmdline(current);
244 235
245 if (likely(!rt_task(p)) || 236 if (likely(!rt_task(p)) ||
246 p->prio >= wakeup_prio || 237 p->prio >= wakeup_prio ||
247 p->prio >= curr->prio) 238 p->prio >= current->prio)
248 return; 239 return;
249 240
250 disabled = atomic_inc_return(&tr->data[cpu]->disabled); 241 pc = preempt_count();
242 disabled = atomic_inc_return(&wakeup_trace->data[cpu]->disabled);
251 if (unlikely(disabled != 1)) 243 if (unlikely(disabled != 1))
252 goto out; 244 goto out;
253 245
254 /* interrupts should be off from try_to_wake_up */ 246 /* interrupts should be off from try_to_wake_up */
255 spin_lock(&wakeup_lock); 247 __raw_spin_lock(&wakeup_lock);
256 248
257 /* check for races. */ 249 /* check for races. */
258 if (!tracer_enabled || p->prio >= wakeup_prio) 250 if (!tracer_enabled || p->prio >= wakeup_prio)
259 goto out_locked; 251 goto out_locked;
260 252
261 /* reset the trace */ 253 /* reset the trace */
262 __wakeup_reset(tr); 254 __wakeup_reset(wakeup_trace);
263 255
264 wakeup_cpu = task_cpu(p); 256 wakeup_cpu = task_cpu(p);
265 wakeup_prio = p->prio; 257 wakeup_prio = p->prio;
@@ -269,74 +261,37 @@ wakeup_check_start(struct trace_array *tr, struct task_struct *p,
269 261
270 local_save_flags(flags); 262 local_save_flags(flags);
271 263
272 tr->data[wakeup_cpu]->preempt_timestamp = ftrace_now(cpu); 264 wakeup_trace->data[wakeup_cpu]->preempt_timestamp = ftrace_now(cpu);
273 trace_function(tr, tr->data[wakeup_cpu], 265 trace_function(wakeup_trace, wakeup_trace->data[wakeup_cpu],
274 CALLER_ADDR1, CALLER_ADDR2, flags); 266 CALLER_ADDR1, CALLER_ADDR2, flags, pc);
275 267
276out_locked: 268out_locked:
277 spin_unlock(&wakeup_lock); 269 __raw_spin_unlock(&wakeup_lock);
278out: 270out:
279 atomic_dec(&tr->data[cpu]->disabled); 271 atomic_dec(&wakeup_trace->data[cpu]->disabled);
280}
281
282static notrace void
283wake_up_callback(void *probe_data, void *call_data,
284 const char *format, va_list *args)
285{
286 struct trace_array **ptr = probe_data;
287 struct trace_array *tr = *ptr;
288 struct task_struct *curr;
289 struct task_struct *task;
290 struct rq *__rq;
291
292 if (likely(!tracer_enabled))
293 return;
294
295 /* Skip pid %d state %ld */
296 (void)va_arg(*args, int);
297 (void)va_arg(*args, long);
298 /* now get the meat: "rq %p task %p rq->curr %p" */
299 __rq = va_arg(*args, typeof(__rq));
300 task = va_arg(*args, typeof(task));
301 curr = va_arg(*args, typeof(curr));
302
303 tracing_record_cmdline(task);
304 tracing_record_cmdline(curr);
305
306 wakeup_check_start(tr, task, curr);
307} 272}
308 273
309static void start_wakeup_tracer(struct trace_array *tr) 274static void start_wakeup_tracer(struct trace_array *tr)
310{ 275{
311 int ret; 276 int ret;
312 277
313 ret = marker_probe_register("kernel_sched_wakeup", 278 ret = register_trace_sched_wakeup(probe_wakeup);
314 "pid %d state %ld ## rq %p task %p rq->curr %p",
315 wake_up_callback,
316 &wakeup_trace);
317 if (ret) { 279 if (ret) {
318 pr_info("wakeup trace: Couldn't add marker" 280 pr_info("wakeup trace: Couldn't activate tracepoint"
319 " probe to kernel_sched_wakeup\n"); 281 " probe to kernel_sched_wakeup\n");
320 return; 282 return;
321 } 283 }
322 284
323 ret = marker_probe_register("kernel_sched_wakeup_new", 285 ret = register_trace_sched_wakeup_new(probe_wakeup);
324 "pid %d state %ld ## rq %p task %p rq->curr %p",
325 wake_up_callback,
326 &wakeup_trace);
327 if (ret) { 286 if (ret) {
328 pr_info("wakeup trace: Couldn't add marker" 287 pr_info("wakeup trace: Couldn't activate tracepoint"
329 " probe to kernel_sched_wakeup_new\n"); 288 " probe to kernel_sched_wakeup_new\n");
330 goto fail_deprobe; 289 goto fail_deprobe;
331 } 290 }
332 291
333 ret = marker_probe_register("kernel_sched_schedule", 292 ret = register_trace_sched_switch(probe_wakeup_sched_switch);
334 "prev_pid %d next_pid %d prev_state %ld "
335 "## rq %p prev %p next %p",
336 sched_switch_callback,
337 &wakeup_trace);
338 if (ret) { 293 if (ret) {
339 pr_info("sched trace: Couldn't add marker" 294 pr_info("sched trace: Couldn't activate tracepoint"
340 " probe to kernel_sched_schedule\n"); 295 " probe to kernel_sched_schedule\n");
341 goto fail_deprobe_wake_new; 296 goto fail_deprobe_wake_new;
342 } 297 }
@@ -358,28 +313,18 @@ static void start_wakeup_tracer(struct trace_array *tr)
358 313
359 return; 314 return;
360fail_deprobe_wake_new: 315fail_deprobe_wake_new:
361 marker_probe_unregister("kernel_sched_wakeup_new", 316 unregister_trace_sched_wakeup_new(probe_wakeup);
362 wake_up_callback,
363 &wakeup_trace);
364fail_deprobe: 317fail_deprobe:
365 marker_probe_unregister("kernel_sched_wakeup", 318 unregister_trace_sched_wakeup(probe_wakeup);
366 wake_up_callback,
367 &wakeup_trace);
368} 319}
369 320
370static void stop_wakeup_tracer(struct trace_array *tr) 321static void stop_wakeup_tracer(struct trace_array *tr)
371{ 322{
372 tracer_enabled = 0; 323 tracer_enabled = 0;
373 unregister_ftrace_function(&trace_ops); 324 unregister_ftrace_function(&trace_ops);
374 marker_probe_unregister("kernel_sched_schedule", 325 unregister_trace_sched_switch(probe_wakeup_sched_switch);
375 sched_switch_callback, 326 unregister_trace_sched_wakeup_new(probe_wakeup);
376 &wakeup_trace); 327 unregister_trace_sched_wakeup(probe_wakeup);
377 marker_probe_unregister("kernel_sched_wakeup_new",
378 wake_up_callback,
379 &wakeup_trace);
380 marker_probe_unregister("kernel_sched_wakeup",
381 wake_up_callback,
382 &wakeup_trace);
383} 328}
384 329
385static void wakeup_tracer_init(struct trace_array *tr) 330static void wakeup_tracer_init(struct trace_array *tr)
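The wakeup tracer above also trades DEFINE_SPINLOCK() for a hand-initialized raw_spinlock_t taken with interrupts disabled explicitly; the raw primitives bypass the lock debugging and tracing hooks, which matters when the lock is taken from inside the tracer itself. Reduced to its essentials (names are illustrative):

static raw_spinlock_t example_lock =
	(raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;

static void example_touch_tracer_state(void)
{
	unsigned long flags;

	local_irq_save(flags);
	__raw_spin_lock(&example_lock);

	/* ... modify per-tracer state here ... */

	__raw_spin_unlock(&example_lock);
	local_irq_restore(flags);
}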
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 0911b7e073bf..09cf230d7eca 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -9,65 +9,29 @@ static inline int trace_valid_entry(struct trace_entry *entry)
9 case TRACE_FN: 9 case TRACE_FN:
10 case TRACE_CTX: 10 case TRACE_CTX:
11 case TRACE_WAKE: 11 case TRACE_WAKE:
12 case TRACE_CONT:
12 case TRACE_STACK: 13 case TRACE_STACK:
14 case TRACE_PRINT:
13 case TRACE_SPECIAL: 15 case TRACE_SPECIAL:
14 return 1; 16 return 1;
15 } 17 }
16 return 0; 18 return 0;
17} 19}
18 20
19static int 21static int trace_test_buffer_cpu(struct trace_array *tr, int cpu)
20trace_test_buffer_cpu(struct trace_array *tr, struct trace_array_cpu *data)
21{ 22{
22 struct trace_entry *entries; 23 struct ring_buffer_event *event;
23 struct page *page; 24 struct trace_entry *entry;
24 int idx = 0;
25 int i;
26 25
27 BUG_ON(list_empty(&data->trace_pages)); 26 while ((event = ring_buffer_consume(tr->buffer, cpu, NULL))) {
28 page = list_entry(data->trace_pages.next, struct page, lru); 27 entry = ring_buffer_event_data(event);
29 entries = page_address(page);
30 28
31 check_pages(data); 29 if (!trace_valid_entry(entry)) {
32 if (head_page(data) != entries)
33 goto failed;
34
35 /*
36 * The starting trace buffer always has valid elements,
37 * if any element exists.
38 */
39 entries = head_page(data);
40
41 for (i = 0; i < tr->entries; i++) {
42
43 if (i < data->trace_idx && !trace_valid_entry(&entries[idx])) {
44 printk(KERN_CONT ".. invalid entry %d ", 30 printk(KERN_CONT ".. invalid entry %d ",
45 entries[idx].type); 31 entry->type);
46 goto failed; 32 goto failed;
47 } 33 }
48
49 idx++;
50 if (idx >= ENTRIES_PER_PAGE) {
51 page = virt_to_page(entries);
52 if (page->lru.next == &data->trace_pages) {
53 if (i != tr->entries - 1) {
54 printk(KERN_CONT ".. entries buffer mismatch");
55 goto failed;
56 }
57 } else {
58 page = list_entry(page->lru.next, struct page, lru);
59 entries = page_address(page);
60 }
61 idx = 0;
62 }
63 } 34 }
64
65 page = virt_to_page(entries);
66 if (page->lru.next != &data->trace_pages) {
67 printk(KERN_CONT ".. too many entries");
68 goto failed;
69 }
70
71 return 0; 35 return 0;
72 36
73 failed: 37 failed:
@@ -89,13 +53,11 @@ static int trace_test_buffer(struct trace_array *tr, unsigned long *count)
89 /* Don't allow flipping of max traces now */ 53 /* Don't allow flipping of max traces now */
90 raw_local_irq_save(flags); 54 raw_local_irq_save(flags);
91 __raw_spin_lock(&ftrace_max_lock); 55 __raw_spin_lock(&ftrace_max_lock);
92 for_each_possible_cpu(cpu) {
93 if (!head_page(tr->data[cpu]))
94 continue;
95 56
96 cnt += tr->data[cpu]->trace_idx; 57 cnt = ring_buffer_entries(tr->buffer);
97 58
98 ret = trace_test_buffer_cpu(tr, tr->data[cpu]); 59 for_each_possible_cpu(cpu) {
60 ret = trace_test_buffer_cpu(tr, cpu);
99 if (ret) 61 if (ret)
100 break; 62 break;
101 } 63 }
@@ -120,11 +82,11 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
120 struct trace_array *tr, 82 struct trace_array *tr,
121 int (*func)(void)) 83 int (*func)(void))
122{ 84{
123 unsigned long count;
124 int ret;
125 int save_ftrace_enabled = ftrace_enabled; 85 int save_ftrace_enabled = ftrace_enabled;
126 int save_tracer_enabled = tracer_enabled; 86 int save_tracer_enabled = tracer_enabled;
87 unsigned long count;
127 char *func_name; 88 char *func_name;
89 int ret;
128 90
129 /* The ftrace test PASSED */ 91 /* The ftrace test PASSED */
130 printk(KERN_CONT "PASSED\n"); 92 printk(KERN_CONT "PASSED\n");
@@ -157,6 +119,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
157 /* enable tracing */ 119 /* enable tracing */
158 tr->ctrl = 1; 120 tr->ctrl = 1;
159 trace->init(tr); 121 trace->init(tr);
122
160 /* Sleep for a 1/10 of a second */ 123 /* Sleep for a 1/10 of a second */
161 msleep(100); 124 msleep(100);
162 125
@@ -212,10 +175,10 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
212int 175int
213trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr) 176trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)
214{ 177{
215 unsigned long count;
216 int ret;
217 int save_ftrace_enabled = ftrace_enabled; 178 int save_ftrace_enabled = ftrace_enabled;
218 int save_tracer_enabled = tracer_enabled; 179 int save_tracer_enabled = tracer_enabled;
180 unsigned long count;
181 int ret;
219 182
220 /* make sure msleep has been recorded */ 183 /* make sure msleep has been recorded */
221 msleep(1); 184 msleep(1);
@@ -415,6 +378,15 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *
415} 378}
416#endif /* CONFIG_IRQSOFF_TRACER && CONFIG_PREEMPT_TRACER */ 379#endif /* CONFIG_IRQSOFF_TRACER && CONFIG_PREEMPT_TRACER */
417 380
381#ifdef CONFIG_NOP_TRACER
382int
383trace_selftest_startup_nop(struct tracer *trace, struct trace_array *tr)
384{
385 /* What could possibly go wrong? */
386 return 0;
387}
388#endif
389
418#ifdef CONFIG_SCHED_TRACER 390#ifdef CONFIG_SCHED_TRACER
419static int trace_wakeup_test_thread(void *data) 391static int trace_wakeup_test_thread(void *data)
420{ 392{
@@ -486,6 +458,9 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr)
486 458
487 wake_up_process(p); 459 wake_up_process(p);
488 460
461 /* give a little time to let the thread wake up */
462 msleep(100);
463
489 /* stop the tracing. */ 464 /* stop the tracing. */
490 tr->ctrl = 0; 465 tr->ctrl = 0;
491 trace->ctrl_update(tr); 466 trace->ctrl_update(tr);
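The rewritten selftest reads the trace back with consuming reads from the ring buffer instead of walking trace pages by hand. The same pattern, reduced to a counting helper; the NULL argument to ring_buffer_consume() is taken here as an optional timestamp out-parameter, as in the hunk above:

static unsigned long drain_and_count(struct trace_array *tr)
{
	struct ring_buffer_event *event;
	unsigned long seen = 0;
	int cpu;

	for_each_possible_cpu(cpu) {
		while ((event = ring_buffer_consume(tr->buffer, cpu, NULL))) {
			struct trace_entry *entry = ring_buffer_event_data(event);

			if (trace_valid_entry(entry))
				seen++;
		}
	}

	return seen;
}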
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
new file mode 100644
index 000000000000..74c5d9a3afae
--- /dev/null
+++ b/kernel/trace/trace_stack.c
@@ -0,0 +1,310 @@
1/*
2 * Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com>
3 *
4 */
5#include <linux/stacktrace.h>
6#include <linux/kallsyms.h>
7#include <linux/seq_file.h>
8#include <linux/spinlock.h>
9#include <linux/uaccess.h>
10#include <linux/debugfs.h>
11#include <linux/ftrace.h>
12#include <linux/module.h>
13#include <linux/init.h>
14#include <linux/fs.h>
15#include "trace.h"
16
17#define STACK_TRACE_ENTRIES 500
18
19static unsigned long stack_dump_trace[STACK_TRACE_ENTRIES+1] =
20 { [0 ... (STACK_TRACE_ENTRIES)] = ULONG_MAX };
21static unsigned stack_dump_index[STACK_TRACE_ENTRIES];
22
23static struct stack_trace max_stack_trace = {
24 .max_entries = STACK_TRACE_ENTRIES,
25 .entries = stack_dump_trace,
26};
27
28static unsigned long max_stack_size;
29static raw_spinlock_t max_stack_lock =
30 (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
31
32static int stack_trace_disabled __read_mostly;
33static DEFINE_PER_CPU(int, trace_active);
34
35static inline void check_stack(void)
36{
37 unsigned long this_size, flags;
38 unsigned long *p, *top, *start;
39 int i;
40
41 this_size = ((unsigned long)&this_size) & (THREAD_SIZE-1);
42 this_size = THREAD_SIZE - this_size;
43
44 if (this_size <= max_stack_size)
45 return;
46
47 raw_local_irq_save(flags);
48 __raw_spin_lock(&max_stack_lock);
49
50 /* a race could have already updated it */
51 if (this_size <= max_stack_size)
52 goto out;
53
54 max_stack_size = this_size;
55
56 max_stack_trace.nr_entries = 0;
57 max_stack_trace.skip = 3;
58
59 save_stack_trace(&max_stack_trace);
60
61 /*
62 * Now find where in the stack these are.
63 */
64 i = 0;
65 start = &this_size;
66 top = (unsigned long *)
67 (((unsigned long)start & ~(THREAD_SIZE-1)) + THREAD_SIZE);
68
69 /*
70 * Loop through all the entries. Some of the entries may
71 * for some reason be missed on the stack, so we may
72 * have to account for them. If they are all there, this
73 * loop will only happen once. This code only takes place
74 * on a new max, so it is far from a fast path.
75 */
76 while (i < max_stack_trace.nr_entries) {
77
78 stack_dump_index[i] = this_size;
79 p = start;
80
81 for (; p < top && i < max_stack_trace.nr_entries; p++) {
82 if (*p == stack_dump_trace[i]) {
83 this_size = stack_dump_index[i++] =
84 (top - p) * sizeof(unsigned long);
85 /* Start the search from here */
86 start = p + 1;
87 }
88 }
89
90 i++;
91 }
92
93 out:
94 __raw_spin_unlock(&max_stack_lock);
95 raw_local_irq_restore(flags);
96}
97
98static void
99stack_trace_call(unsigned long ip, unsigned long parent_ip)
100{
101 int cpu, resched;
102
103 if (unlikely(!ftrace_enabled || stack_trace_disabled))
104 return;
105
106 resched = need_resched();
107 preempt_disable_notrace();
108
109 cpu = raw_smp_processor_id();
110 /* no atomic needed, we only modify this variable by this cpu */
111 if (per_cpu(trace_active, cpu)++ != 0)
112 goto out;
113
114 check_stack();
115
116 out:
117 per_cpu(trace_active, cpu)--;
118 /* prevent recursion in schedule */
119 if (resched)
120 preempt_enable_no_resched_notrace();
121 else
122 preempt_enable_notrace();
123}
124
125static struct ftrace_ops trace_ops __read_mostly =
126{
127 .func = stack_trace_call,
128};
129
130static ssize_t
131stack_max_size_read(struct file *filp, char __user *ubuf,
132 size_t count, loff_t *ppos)
133{
134 unsigned long *ptr = filp->private_data;
135 char buf[64];
136 int r;
137
138 r = snprintf(buf, sizeof(buf), "%ld\n", *ptr);
139 if (r > sizeof(buf))
140 r = sizeof(buf);
141 return simple_read_from_buffer(ubuf, count, ppos, buf, r);
142}
143
144static ssize_t
145stack_max_size_write(struct file *filp, const char __user *ubuf,
146 size_t count, loff_t *ppos)
147{
148 long *ptr = filp->private_data;
149 unsigned long val, flags;
150 char buf[64];
151 int ret;
152
153 if (count >= sizeof(buf))
154 return -EINVAL;
155
156 if (copy_from_user(&buf, ubuf, count))
157 return -EFAULT;
158
159 buf[count] = 0;
160
161 ret = strict_strtoul(buf, 10, &val);
162 if (ret < 0)
163 return ret;
164
165 raw_local_irq_save(flags);
166 __raw_spin_lock(&max_stack_lock);
167 *ptr = val;
168 __raw_spin_unlock(&max_stack_lock);
169 raw_local_irq_restore(flags);
170
171 return count;
172}
173
174static struct file_operations stack_max_size_fops = {
175 .open = tracing_open_generic,
176 .read = stack_max_size_read,
177 .write = stack_max_size_write,
178};
179
180static void *
181t_next(struct seq_file *m, void *v, loff_t *pos)
182{
183 long i = (long)m->private;
184
185 (*pos)++;
186
187 i++;
188
189 if (i >= max_stack_trace.nr_entries ||
190 stack_dump_trace[i] == ULONG_MAX)
191 return NULL;
192
193 m->private = (void *)i;
194
195 return &m->private;
196}
197
198static void *t_start(struct seq_file *m, loff_t *pos)
199{
200 void *t = &m->private;
201 loff_t l = 0;
202
203 local_irq_disable();
204 __raw_spin_lock(&max_stack_lock);
205
206 for (; t && l < *pos; t = t_next(m, t, &l))
207 ;
208
209 return t;
210}
211
212static void t_stop(struct seq_file *m, void *p)
213{
214 __raw_spin_unlock(&max_stack_lock);
215 local_irq_enable();
216}
217
218static int trace_lookup_stack(struct seq_file *m, long i)
219{
220 unsigned long addr = stack_dump_trace[i];
221#ifdef CONFIG_KALLSYMS
222 char str[KSYM_SYMBOL_LEN];
223
224 sprint_symbol(str, addr);
225
226 return seq_printf(m, "%s\n", str);
227#else
228 return seq_printf(m, "%p\n", (void*)addr);
229#endif
230}
231
232static int t_show(struct seq_file *m, void *v)
233{
234 long i = *(long *)v;
235 int size;
236
237 if (i < 0) {
238 seq_printf(m, " Depth Size Location"
239 " (%d entries)\n"
240 " ----- ---- --------\n",
241 max_stack_trace.nr_entries);
242 return 0;
243 }
244
245 if (i >= max_stack_trace.nr_entries ||
246 stack_dump_trace[i] == ULONG_MAX)
247 return 0;
248
249 if (i+1 == max_stack_trace.nr_entries ||
250 stack_dump_trace[i+1] == ULONG_MAX)
251 size = stack_dump_index[i];
252 else
253 size = stack_dump_index[i] - stack_dump_index[i+1];
254
255 seq_printf(m, "%3ld) %8d %5d ", i, stack_dump_index[i], size);
256
257 trace_lookup_stack(m, i);
258
259 return 0;
260}
261
262static struct seq_operations stack_trace_seq_ops = {
263 .start = t_start,
264 .next = t_next,
265 .stop = t_stop,
266 .show = t_show,
267};
268
269static int stack_trace_open(struct inode *inode, struct file *file)
270{
271 int ret;
272
273 ret = seq_open(file, &stack_trace_seq_ops);
274 if (!ret) {
275 struct seq_file *m = file->private_data;
276 m->private = (void *)-1;
277 }
278
279 return ret;
280}
281
282static struct file_operations stack_trace_fops = {
283 .open = stack_trace_open,
284 .read = seq_read,
285 .llseek = seq_lseek,
286};
287
288static __init int stack_trace_init(void)
289{
290 struct dentry *d_tracer;
291 struct dentry *entry;
292
293 d_tracer = tracing_init_dentry();
294
295 entry = debugfs_create_file("stack_max_size", 0644, d_tracer,
296 &max_stack_size, &stack_max_size_fops);
297 if (!entry)
298 pr_warning("Could not create debugfs 'stack_max_size' entry\n");
299
300 entry = debugfs_create_file("stack_trace", 0444, d_tracer,
301 NULL, &stack_trace_fops);
302 if (!entry)
303 pr_warning("Could not create debugfs 'stack_trace' entry\n");
304
305 register_ftrace_function(&trace_ops);
306
307 return 0;
308}
309
310device_initcall(stack_trace_init);
diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c
index 2301e1e7c606..9587d3bcba55 100644
--- a/kernel/trace/trace_sysprof.c
+++ b/kernel/trace/trace_sysprof.c
@@ -161,7 +161,7 @@ static void timer_notify(struct pt_regs *regs, int cpu)
161 __trace_special(tr, data, 2, regs->ip, 0); 161 __trace_special(tr, data, 2, regs->ip, 0);
162 162
163 while (i < sample_max_depth) { 163 while (i < sample_max_depth) {
164 frame.next_fp = 0; 164 frame.next_fp = NULL;
165 frame.return_address = 0; 165 frame.return_address = 0;
166 if (!copy_stack_frame(fp, &frame)) 166 if (!copy_stack_frame(fp, &frame))
167 break; 167 break;
@@ -202,7 +202,7 @@ static void start_stack_timer(int cpu)
202 202
203 hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); 203 hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
204 hrtimer->function = stack_trace_timer_fn; 204 hrtimer->function = stack_trace_timer_fn;
205 hrtimer->cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; 205 hrtimer->cb_mode = HRTIMER_CB_IRQSAFE_PERCPU;
206 206
207 hrtimer_start(hrtimer, ns_to_ktime(sample_period), HRTIMER_MODE_REL); 207 hrtimer_start(hrtimer, ns_to_ktime(sample_period), HRTIMER_MODE_REL);
208} 208}
@@ -241,7 +241,7 @@ static void stack_reset(struct trace_array *tr)
241 tr->time_start = ftrace_now(tr->cpu); 241 tr->time_start = ftrace_now(tr->cpu);
242 242
243 for_each_online_cpu(cpu) 243 for_each_online_cpu(cpu)
244 tracing_reset(tr->data[cpu]); 244 tracing_reset(tr, cpu);
245} 245}
246 246
247static void start_stack_trace(struct trace_array *tr) 247static void start_stack_trace(struct trace_array *tr)
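The sysprof change above is only the cb_mode rename and the tracing_reset() signature update, but the surrounding code is the standard per-CPU sampling-timer setup; condensed below, with the callback body and names illustrative and sample_period as in the file:

static enum hrtimer_restart example_timer_fn(struct hrtimer *timer)
{
	/* take one sample here, then re-arm for the next period */
	hrtimer_forward_now(timer, ns_to_ktime(sample_period));
	return HRTIMER_RESTART;
}

static void example_start_timer(struct hrtimer *timer)
{
	hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	timer->function = example_timer_fn;
	timer->cb_mode = HRTIMER_CB_IRQSAFE_PERCPU;
	hrtimer_start(timer, ns_to_ktime(sample_period), HRTIMER_MODE_REL);
}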
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
new file mode 100644
index 000000000000..f2b7c28a4708
--- /dev/null
+++ b/kernel/tracepoint.c
@@ -0,0 +1,477 @@
1/*
2 * Copyright (C) 2008 Mathieu Desnoyers
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 */
18#include <linux/module.h>
19#include <linux/mutex.h>
20#include <linux/types.h>
21#include <linux/jhash.h>
22#include <linux/list.h>
23#include <linux/rcupdate.h>
24#include <linux/tracepoint.h>
25#include <linux/err.h>
26#include <linux/slab.h>
27
28extern struct tracepoint __start___tracepoints[];
29extern struct tracepoint __stop___tracepoints[];
30
31/* Set to 1 to enable tracepoint debug output */
32static const int tracepoint_debug;
33
34/*
35 * tracepoints_mutex nests inside module_mutex. Tracepoints mutex protects the
36 * builtin and module tracepoints and the hash table.
37 */
38static DEFINE_MUTEX(tracepoints_mutex);
39
40/*
41 * Tracepoint hash table, containing the active tracepoints.
42 * Protected by tracepoints_mutex.
43 */
44#define TRACEPOINT_HASH_BITS 6
45#define TRACEPOINT_TABLE_SIZE (1 << TRACEPOINT_HASH_BITS)
46
47/*
48 * Note about RCU :
49 * It is used to delay the free of the multiple-probes arrays until a quiescent
50 * state is reached.
51 * Tracepoint entries modifications are protected by the tracepoints_mutex.
52 */
53struct tracepoint_entry {
54 struct hlist_node hlist;
55 void **funcs;
56 int refcount; /* Number of times armed. 0 if disarmed. */
57 struct rcu_head rcu;
58 void *oldptr;
59 unsigned char rcu_pending:1;
60 char name[0];
61};
62
63static struct hlist_head tracepoint_table[TRACEPOINT_TABLE_SIZE];
64
65static void free_old_closure(struct rcu_head *head)
66{
67 struct tracepoint_entry *entry = container_of(head,
68 struct tracepoint_entry, rcu);
69 kfree(entry->oldptr);
70 /* Make sure we free the data before setting the pending flag to 0 */
71 smp_wmb();
72 entry->rcu_pending = 0;
73}
74
75static void tracepoint_entry_free_old(struct tracepoint_entry *entry, void *old)
76{
77 if (!old)
78 return;
79 entry->oldptr = old;
80 entry->rcu_pending = 1;
81 /* write rcu_pending before calling the RCU callback */
82 smp_wmb();
83 call_rcu_sched(&entry->rcu, free_old_closure);
84}
85
86static void debug_print_probes(struct tracepoint_entry *entry)
87{
88 int i;
89
90 if (!tracepoint_debug)
91 return;
92
93 for (i = 0; entry->funcs[i]; i++)
94 printk(KERN_DEBUG "Probe %d : %p\n", i, entry->funcs[i]);
95}
96
97static void *
98tracepoint_entry_add_probe(struct tracepoint_entry *entry, void *probe)
99{
100 int nr_probes = 0;
101 void **old, **new;
102
103 WARN_ON(!probe);
104
105 debug_print_probes(entry);
106 old = entry->funcs;
107 if (old) {
108 /* (N -> N+1), (N != 0, 1) probes */
109 for (nr_probes = 0; old[nr_probes]; nr_probes++)
110 if (old[nr_probes] == probe)
111 return ERR_PTR(-EEXIST);
112 }
113 /* + 2 : one for new probe, one for NULL func */
114 new = kzalloc((nr_probes + 2) * sizeof(void *), GFP_KERNEL);
115 if (new == NULL)
116 return ERR_PTR(-ENOMEM);
117 if (old)
118 memcpy(new, old, nr_probes * sizeof(void *));
119 new[nr_probes] = probe;
120 entry->refcount = nr_probes + 1;
121 entry->funcs = new;
122 debug_print_probes(entry);
123 return old;
124}
125
126static void *
127tracepoint_entry_remove_probe(struct tracepoint_entry *entry, void *probe)
128{
129 int nr_probes = 0, nr_del = 0, i;
130 void **old, **new;
131
132 old = entry->funcs;
133
134 debug_print_probes(entry);
135 /* (N -> M), (N > 1, M >= 0) probes */
136 for (nr_probes = 0; old[nr_probes]; nr_probes++) {
137 if ((!probe || old[nr_probes] == probe))
138 nr_del++;
139 }
140
141 if (nr_probes - nr_del == 0) {
142 /* N -> 0, (N > 1) */
143 entry->funcs = NULL;
144 entry->refcount = 0;
145 debug_print_probes(entry);
146 return old;
147 } else {
148 int j = 0;
149 /* N -> M, (N > 1, M > 0) */
150 /* + 1 for NULL */
151 new = kzalloc((nr_probes - nr_del + 1)
152 * sizeof(void *), GFP_KERNEL);
153 if (new == NULL)
154 return ERR_PTR(-ENOMEM);
155 for (i = 0; old[i]; i++)
156 if ((probe && old[i] != probe))
157 new[j++] = old[i];
158 entry->refcount = nr_probes - nr_del;
159 entry->funcs = new;
160 }
161 debug_print_probes(entry);
162 return old;
163}
164
165/*
166 * Get tracepoint if the tracepoint is present in the tracepoint hash table.
167 * Must be called with tracepoints_mutex held.
168 * Returns NULL if not present.
169 */
170static struct tracepoint_entry *get_tracepoint(const char *name)
171{
172 struct hlist_head *head;
173 struct hlist_node *node;
174 struct tracepoint_entry *e;
175 u32 hash = jhash(name, strlen(name), 0);
176
177 head = &tracepoint_table[hash & (TRACEPOINT_TABLE_SIZE - 1)];
178 hlist_for_each_entry(e, node, head, hlist) {
179 if (!strcmp(name, e->name))
180 return e;
181 }
182 return NULL;
183}
184
185/*
186 * Add the tracepoint to the tracepoint hash table. Must be called with
187 * tracepoints_mutex held.
188 */
189static struct tracepoint_entry *add_tracepoint(const char *name)
190{
191 struct hlist_head *head;
192 struct hlist_node *node;
193 struct tracepoint_entry *e;
194 size_t name_len = strlen(name) + 1;
195 u32 hash = jhash(name, name_len-1, 0);
196
197 head = &tracepoint_table[hash & (TRACEPOINT_TABLE_SIZE - 1)];
198 hlist_for_each_entry(e, node, head, hlist) {
199 if (!strcmp(name, e->name)) {
200 printk(KERN_NOTICE
201 "tracepoint %s busy\n", name);
202 return ERR_PTR(-EEXIST); /* Already there */
203 }
204 }
205 /*
206 * Using kmalloc here to allocate a variable length element. Could
207 * cause some memory fragmentation if overused.
208 */
209 e = kmalloc(sizeof(struct tracepoint_entry) + name_len, GFP_KERNEL);
210 if (!e)
211 return ERR_PTR(-ENOMEM);
212 memcpy(&e->name[0], name, name_len);
213 e->funcs = NULL;
214 e->refcount = 0;
215 e->rcu_pending = 0;
216 hlist_add_head(&e->hlist, head);
217 return e;
218}
219
220/*
221 * Remove the tracepoint from the tracepoint hash table. Must be called with
222 * mutex_lock held.
223 */
224static int remove_tracepoint(const char *name)
225{
226 struct hlist_head *head;
227 struct hlist_node *node;
228 struct tracepoint_entry *e;
229 int found = 0;
230 size_t len = strlen(name) + 1;
231 u32 hash = jhash(name, len-1, 0);
232
233 head = &tracepoint_table[hash & (TRACEPOINT_TABLE_SIZE - 1)];
234 hlist_for_each_entry(e, node, head, hlist) {
235 if (!strcmp(name, e->name)) {
236 found = 1;
237 break;
238 }
239 }
240 if (!found)
241 return -ENOENT;
242 if (e->refcount)
243 return -EBUSY;
244 hlist_del(&e->hlist);
245 /* Make sure the call_rcu_sched has been executed */
246 if (e->rcu_pending)
247 rcu_barrier_sched();
248 kfree(e);
249 return 0;
250}
251
252/*
253 * Sets the probe callback corresponding to one tracepoint.
254 */
255static void set_tracepoint(struct tracepoint_entry **entry,
256 struct tracepoint *elem, int active)
257{
258 WARN_ON(strcmp((*entry)->name, elem->name) != 0);
259
260 /*
261 * rcu_assign_pointer has a smp_wmb() which makes sure that the new
262 * probe callbacks array is consistent before setting a pointer to it.
263 * This array is referenced by __DO_TRACE from
264 * include/linux/tracepoint.h. A matching smp_read_barrier_depends()
265 * is used.
266 */
267 rcu_assign_pointer(elem->funcs, (*entry)->funcs);
268 elem->state = active;
269}
270
271/*
272 * Disable a tracepoint and its probe callback.
273 * Note: only waiting an RCU period after setting elem->call to the empty
274 * function ensures that the original callback is not used anymore. This is
275 * ensured by preempt_disable around the call site.
276 */
277static void disable_tracepoint(struct tracepoint *elem)
278{
279 elem->state = 0;
280}
281
282/**
283 * tracepoint_update_probe_range - Update a probe range
284 * @begin: beginning of the range
285 * @end: end of the range
286 *
287 * Updates the probe callback corresponding to a range of tracepoints.
288 */
289void tracepoint_update_probe_range(struct tracepoint *begin,
290 struct tracepoint *end)
291{
292 struct tracepoint *iter;
293 struct tracepoint_entry *mark_entry;
294
295 mutex_lock(&tracepoints_mutex);
296 for (iter = begin; iter < end; iter++) {
297 mark_entry = get_tracepoint(iter->name);
298 if (mark_entry) {
299 set_tracepoint(&mark_entry, iter,
300 !!mark_entry->refcount);
301 } else {
302 disable_tracepoint(iter);
303 }
304 }
305 mutex_unlock(&tracepoints_mutex);
306}
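For illustration: per-module updates reduce to this range call over a contiguous struct tracepoint array. A minimal sketch, assuming struct module exposes its tracepoint section as mod->tracepoints with mod->num_tracepoints entries (those field names are an assumption here, not taken from this file):

	/* refresh every tracepoint belonging to one module (sketch) */
	tracepoint_update_probe_range(mod->tracepoints,
				      mod->tracepoints + mod->num_tracepoints);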
307
308/*
309 * Update probes, removing the faulty probes.
310 */
311static void tracepoint_update_probes(void)
312{
313 /* Core kernel tracepoints */
314 tracepoint_update_probe_range(__start___tracepoints,
315 __stop___tracepoints);
316 /* tracepoints in modules. */
317 module_update_tracepoints();
318}
319
320/**
321 * tracepoint_probe_register - Connect a probe to a tracepoint
322 * @name: tracepoint name
323 * @probe: probe handler
324 *
325 * Returns 0 if ok, error value on error.
326 * The probe address must at least be aligned on the architecture pointer size.
327 */
328int tracepoint_probe_register(const char *name, void *probe)
329{
330 struct tracepoint_entry *entry;
331 int ret = 0;
332 void *old;
333
334 mutex_lock(&tracepoints_mutex);
335 entry = get_tracepoint(name);
336 if (!entry) {
337 entry = add_tracepoint(name);
338 if (IS_ERR(entry)) {
339 ret = PTR_ERR(entry);
340 goto end;
341 }
342 }
343 /*
344 * If we detect that a call_rcu_sched is pending for this tracepoint,
345 * make sure it's executed now.
346 */
347 if (entry->rcu_pending)
348 rcu_barrier_sched();
349 old = tracepoint_entry_add_probe(entry, probe);
350 if (IS_ERR(old)) {
351 ret = PTR_ERR(old);
352 goto end;
353 }
354 mutex_unlock(&tracepoints_mutex);
355 tracepoint_update_probes(); /* may update entry */
356 mutex_lock(&tracepoints_mutex);
357 entry = get_tracepoint(name);
358 WARN_ON(!entry);
359 if (entry->rcu_pending)
360 rcu_barrier_sched();
361 tracepoint_entry_free_old(entry, old);
362end:
363 mutex_unlock(&tracepoints_mutex);
364 return ret;
365}
366EXPORT_SYMBOL_GPL(tracepoint_probe_register);
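For illustration, a minimal sketch of attaching a probe by name with the call above. The tracepoint name "subsys_event" and its single int argument are hypothetical; the probe prototype must match whatever the tracepoint was actually declared with, and the probe address must be pointer-aligned as noted above.

	#include <linux/kernel.h>
	#include <linux/tracepoint.h>

	static void my_probe(int value)
	{
		/* runs at the trace call site, with preemption disabled */
		printk(KERN_DEBUG "subsys_event: %d\n", value);
	}

	static int attach_probe(void)
	{
		/* attach by name; returns 0 on success, negative errno otherwise */
		return tracepoint_probe_register("subsys_event",
						 (void *)my_probe);
	}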
367
368/**
369 * tracepoint_probe_unregister - Disconnect a probe from a tracepoint
370 * @name: tracepoint name
371 * @probe: probe function pointer
372 *
373 * We do not need to call a synchronize_sched to make sure the probes have
374 * finished running before doing a module unload, because the module unload
375 * itself uses stop_machine(), which ensures that every preempt-disabled section
376 * has finished.
377 */
378int tracepoint_probe_unregister(const char *name, void *probe)
379{
380 struct tracepoint_entry *entry;
381 void *old;
382 int ret = -ENOENT;
383
384 mutex_lock(&tracepoints_mutex);
385 entry = get_tracepoint(name);
386 if (!entry)
387 goto end;
388 if (entry->rcu_pending)
389 rcu_barrier_sched();
390 old = tracepoint_entry_remove_probe(entry, probe);
391 mutex_unlock(&tracepoints_mutex);
392 tracepoint_update_probes(); /* may update entry */
393 mutex_lock(&tracepoints_mutex);
394 entry = get_tracepoint(name);
395 if (!entry)
396 goto end;
397 if (entry->rcu_pending)
398 rcu_barrier_sched();
399 tracepoint_entry_free_old(entry, old);
400 remove_tracepoint(name); /* Ignore busy error message */
401 ret = 0;
402end:
403 mutex_unlock(&tracepoints_mutex);
404 return ret;
405}
406EXPORT_SYMBOL_GPL(tracepoint_probe_unregister);
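The matching detach for the sketch above; as the comment explains, no extra synchronize_sched() is needed after this before the probe's module text can go away.

	static void detach_probe(void)
	{
		/* drop my_probe; the hash entry is removed once unused */
		tracepoint_probe_unregister("subsys_event", (void *)my_probe);
	}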
407
408/**
409 * tracepoint_get_iter_range - Get the next tracepoint in a given range.
410 * @tracepoint: current tracepoint (in), next tracepoint (out)
411 * @begin: beginning of the range
412 * @end: end of the range
413 *
414 * Returns whether a next tracepoint has been found (1) or not (0).
415 * Will return the first tracepoint in the range if the input tracepoint is
416 * NULL.
417 */
418int tracepoint_get_iter_range(struct tracepoint **tracepoint,
419 struct tracepoint *begin, struct tracepoint *end)
420{
421 if (!*tracepoint && begin != end) {
422 *tracepoint = begin;
423 return 1;
424 }
425 if (*tracepoint >= begin && *tracepoint < end)
426 return 1;
427 return 0;
428}
429EXPORT_SYMBOL_GPL(tracepoint_get_iter_range);
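A hedged sketch of walking one contiguous range with this helper, using the core kernel section bounds referenced elsewhere in this file; note the caller advances the cursor itself, exactly as tracepoint_iter_next() does below:

	struct tracepoint *tp = NULL;

	while (tracepoint_get_iter_range(&tp, __start___tracepoints,
					 __stop___tracepoints)) {
		printk(KERN_DEBUG "tracepoint %s (state %d)\n",
		       tp->name, tp->state);
		tp++;
	}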
430
431static void tracepoint_get_iter(struct tracepoint_iter *iter)
432{
433 int found = 0;
434
435 /* Core kernel tracepoints */
436 if (!iter->module) {
437 found = tracepoint_get_iter_range(&iter->tracepoint,
438 __start___tracepoints, __stop___tracepoints);
439 if (found)
440 goto end;
441 }
442 /* tracepoints in modules. */
443 found = module_get_iter_tracepoints(iter);
444end:
445 if (!found)
446 tracepoint_iter_reset(iter);
447}
448
449void tracepoint_iter_start(struct tracepoint_iter *iter)
450{
451 tracepoint_get_iter(iter);
452}
453EXPORT_SYMBOL_GPL(tracepoint_iter_start);
454
455void tracepoint_iter_next(struct tracepoint_iter *iter)
456{
457 iter->tracepoint++;
458 /*
459 * iter->tracepoint may be invalid because we blindly incremented it.
460 * Make sure it is valid by advancing to the next valid tracepoint, moving on
461 * to the tracepoints of the following modules if necessary.
462 */
463 tracepoint_get_iter(iter);
464}
465EXPORT_SYMBOL_GPL(tracepoint_iter_next);
466
467void tracepoint_iter_stop(struct tracepoint_iter *iter)
468{
469}
470EXPORT_SYMBOL_GPL(tracepoint_iter_stop);
471
472void tracepoint_iter_reset(struct tracepoint_iter *iter)
473{
474 iter->module = NULL;
475 iter->tracepoint = NULL;
476}
477EXPORT_SYMBOL_GPL(tracepoint_iter_reset);
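Putting the exported iterator together, a minimal sketch that enumerates core and module tracepoints; the struct tracepoint_iter fields follow their use in this file, and locking against concurrent module load/unload is omitted for brevity:

	struct tracepoint_iter iter;

	tracepoint_iter_reset(&iter);
	tracepoint_iter_start(&iter);
	while (iter.tracepoint) {
		printk(KERN_DEBUG "%s\n", iter.tracepoint->name);
		tracepoint_iter_next(&iter);
	}
	tracepoint_iter_stop(&iter);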
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index 4ab1b584961b..8ebcd8532dfb 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -28,14 +28,14 @@
28void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk) 28void bacct_add_tsk(struct taskstats *stats, struct task_struct *tsk)
29{ 29{
30 struct timespec uptime, ts; 30 struct timespec uptime, ts;
31 s64 ac_etime; 31 u64 ac_etime;
32 32
33 BUILD_BUG_ON(TS_COMM_LEN < TASK_COMM_LEN); 33 BUILD_BUG_ON(TS_COMM_LEN < TASK_COMM_LEN);
34 34
35 /* calculate task elapsed time in timespec */ 35 /* calculate task elapsed time in timespec */
36 do_posix_clock_monotonic_gettime(&uptime); 36 do_posix_clock_monotonic_gettime(&uptime);
37 ts = timespec_sub(uptime, tsk->start_time); 37 ts = timespec_sub(uptime, tsk->start_time);
38 /* rebase elapsed time to usec */ 38 /* rebase elapsed time to usec (should never be negative) */
39 ac_etime = timespec_to_ns(&ts); 39 ac_etime = timespec_to_ns(&ts);
40 do_div(ac_etime, NSEC_PER_USEC); 40 do_div(ac_etime, NSEC_PER_USEC);
41 stats->ac_etime = ac_etime; 41 stats->ac_etime = ac_etime;
@@ -84,9 +84,9 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p)
84{ 84{
85 struct mm_struct *mm; 85 struct mm_struct *mm;
86 86
87 /* convert pages-jiffies to Mbyte-usec */ 87 /* convert pages-usec to Mbyte-usec */
88 stats->coremem = jiffies_to_usecs(p->acct_rss_mem1) * PAGE_SIZE / MB; 88 stats->coremem = p->acct_rss_mem1 * PAGE_SIZE / MB;
89 stats->virtmem = jiffies_to_usecs(p->acct_vm_mem1) * PAGE_SIZE / MB; 89 stats->virtmem = p->acct_vm_mem1 * PAGE_SIZE / MB;
90 mm = get_task_mm(p); 90 mm = get_task_mm(p);
91 if (mm) { 91 if (mm) {
92 /* adjust to KB unit */ 92 /* adjust to KB unit */
@@ -94,10 +94,10 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p)
94 stats->hiwater_vm = mm->hiwater_vm * PAGE_SIZE / KB; 94 stats->hiwater_vm = mm->hiwater_vm * PAGE_SIZE / KB;
95 mmput(mm); 95 mmput(mm);
96 } 96 }
97 stats->read_char = p->rchar; 97 stats->read_char = p->ioac.rchar;
98 stats->write_char = p->wchar; 98 stats->write_char = p->ioac.wchar;
99 stats->read_syscalls = p->syscr; 99 stats->read_syscalls = p->ioac.syscr;
100 stats->write_syscalls = p->syscw; 100 stats->write_syscalls = p->ioac.syscw;
101#ifdef CONFIG_TASK_IO_ACCOUNTING 101#ifdef CONFIG_TASK_IO_ACCOUNTING
102 stats->read_bytes = p->ioac.read_bytes; 102 stats->read_bytes = p->ioac.read_bytes;
103 stats->write_bytes = p->ioac.write_bytes; 103 stats->write_bytes = p->ioac.write_bytes;
@@ -118,12 +118,19 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p)
118void acct_update_integrals(struct task_struct *tsk) 118void acct_update_integrals(struct task_struct *tsk)
119{ 119{
120 if (likely(tsk->mm)) { 120 if (likely(tsk->mm)) {
121 long delta = cputime_to_jiffies( 121 cputime_t time, dtime;
122 cputime_sub(tsk->stime, tsk->acct_stimexpd)); 122 struct timeval value;
123 u64 delta;
124
125 time = tsk->stime + tsk->utime;
126 dtime = cputime_sub(time, tsk->acct_timexpd);
127 jiffies_to_timeval(cputime_to_jiffies(dtime), &value);
128 delta = value.tv_sec;
129 delta = delta * USEC_PER_SEC + value.tv_usec;
123 130
124 if (delta == 0) 131 if (delta == 0)
125 return; 132 return;
126 tsk->acct_stimexpd = tsk->stime; 133 tsk->acct_timexpd = time;
127 tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm); 134 tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm);
128 tsk->acct_vm_mem1 += delta * tsk->mm->total_vm; 135 tsk->acct_vm_mem1 += delta * tsk->mm->total_vm;
129 } 136 }
@@ -135,7 +142,7 @@ void acct_update_integrals(struct task_struct *tsk)
135 */ 142 */
136void acct_clear_integrals(struct task_struct *tsk) 143void acct_clear_integrals(struct task_struct *tsk)
137{ 144{
138 tsk->acct_stimexpd = 0; 145 tsk->acct_timexpd = 0;
139 tsk->acct_rss_mem1 = 0; 146 tsk->acct_rss_mem1 = 0;
140 tsk->acct_vm_mem1 = 0; 147 tsk->acct_vm_mem1 = 0;
141} 148}
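To make the unit change concrete with illustrative numbers only: acct_rss_mem1 now accumulates page-microseconds rather than page-jiffies, so a task holding 1000 resident pages for 2,000,000 us contributes 2 * 10^9 page-usec; with a 4096-byte page and MB taken as 2^20 bytes, coremem = 2 * 10^9 * 4096 / 2^20 = 7,812,500 MByte-usec, with no jiffies_to_usecs() conversion left at read-out.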
diff --git a/kernel/user.c b/kernel/user.c
index 865ecf57a096..39d6159fae43 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -169,7 +169,7 @@ static ssize_t cpu_rt_runtime_show(struct kobject *kobj,
169{ 169{
170 struct user_struct *up = container_of(kobj, struct user_struct, kobj); 170 struct user_struct *up = container_of(kobj, struct user_struct, kobj);
171 171
172 return sprintf(buf, "%lu\n", sched_group_rt_runtime(up->tg)); 172 return sprintf(buf, "%ld\n", sched_group_rt_runtime(up->tg));
173} 173}
174 174
175static ssize_t cpu_rt_runtime_store(struct kobject *kobj, 175static ssize_t cpu_rt_runtime_store(struct kobject *kobj,
@@ -180,7 +180,7 @@ static ssize_t cpu_rt_runtime_store(struct kobject *kobj,
180 unsigned long rt_runtime; 180 unsigned long rt_runtime;
181 int rc; 181 int rc;
182 182
183 sscanf(buf, "%lu", &rt_runtime); 183 sscanf(buf, "%ld", &rt_runtime);
184 184
185 rc = sched_group_set_rt_runtime(up->tg, rt_runtime); 185 rc = sched_group_set_rt_runtime(up->tg, rt_runtime);
186 186
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index a9ab0596de44..532858fa5b88 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -6,7 +6,6 @@
6 */ 6 */
7 7
8#include <linux/module.h> 8#include <linux/module.h>
9#include <linux/version.h>
10#include <linux/nsproxy.h> 9#include <linux/nsproxy.h>
11#include <linux/slab.h> 10#include <linux/slab.h>
12#include <linux/user_namespace.h> 11#include <linux/user_namespace.h>
diff --git a/kernel/utsname.c b/kernel/utsname.c
index 64d398f12444..815237a55af8 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -12,7 +12,6 @@
12#include <linux/module.h> 12#include <linux/module.h>
13#include <linux/uts.h> 13#include <linux/uts.h>
14#include <linux/utsname.h> 14#include <linux/utsname.h>
15#include <linux/version.h>
16#include <linux/err.h> 15#include <linux/err.h>
17#include <linux/slab.h> 16#include <linux/slab.h>
18 17
diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c
index fe3a56c2256d..3b34b3545936 100644
--- a/kernel/utsname_sysctl.c
+++ b/kernel/utsname_sysctl.c
@@ -12,7 +12,6 @@
12#include <linux/module.h> 12#include <linux/module.h>
13#include <linux/uts.h> 13#include <linux/uts.h>
14#include <linux/utsname.h> 14#include <linux/utsname.h>
15#include <linux/version.h>
16#include <linux/sysctl.h> 15#include <linux/sysctl.h>
17 16
18static void *get_uts(ctl_table *table, int write) 17static void *get_uts(ctl_table *table, int write)
@@ -61,7 +60,7 @@ static int proc_do_uts_string(ctl_table *table, int write, struct file *filp,
61 60
62#ifdef CONFIG_SYSCTL_SYSCALL 61#ifdef CONFIG_SYSCTL_SYSCALL
63/* The generic string strategy routine: */ 62/* The generic string strategy routine: */
64static int sysctl_uts_string(ctl_table *table, int __user *name, int nlen, 63static int sysctl_uts_string(ctl_table *table,
65 void __user *oldval, size_t __user *oldlenp, 64 void __user *oldval, size_t __user *oldlenp,
66 void __user *newval, size_t newlen) 65 void __user *newval, size_t newlen)
67{ 66{
@@ -70,8 +69,7 @@ static int sysctl_uts_string(ctl_table *table, int __user *name, int nlen,
70 write = newval && newlen; 69 write = newval && newlen;
71 memcpy(&uts_table, table, sizeof(uts_table)); 70 memcpy(&uts_table, table, sizeof(uts_table));
72 uts_table.data = get_uts(table, write); 71 uts_table.data = get_uts(table, write);
73 r = sysctl_string(&uts_table, name, nlen, 72 r = sysctl_string(&uts_table, oldval, oldlenp, newval, newlen);
74 oldval, oldlenp, newval, newlen);
75 put_uts(table, write, uts_table.data); 73 put_uts(table, write, uts_table.data);
76 return r; 74 return r;
77} 75}
diff --git a/kernel/wait.c b/kernel/wait.c
index c275c56cf2d3..cd87131f2fc2 100644
--- a/kernel/wait.c
+++ b/kernel/wait.c
@@ -72,12 +72,7 @@ prepare_to_wait(wait_queue_head_t *q, wait_queue_t *wait, int state)
72 spin_lock_irqsave(&q->lock, flags); 72 spin_lock_irqsave(&q->lock, flags);
73 if (list_empty(&wait->task_list)) 73 if (list_empty(&wait->task_list))
74 __add_wait_queue(q, wait); 74 __add_wait_queue(q, wait);
75 /* 75 set_current_state(state);
76 * don't alter the task state if this is just going to
77 * queue an async wait queue callback
78 */
79 if (is_sync_wait(wait))
80 set_current_state(state);
81 spin_unlock_irqrestore(&q->lock, flags); 76 spin_unlock_irqrestore(&q->lock, flags);
82} 77}
83EXPORT_SYMBOL(prepare_to_wait); 78EXPORT_SYMBOL(prepare_to_wait);
@@ -91,12 +86,7 @@ prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state)
91 spin_lock_irqsave(&q->lock, flags); 86 spin_lock_irqsave(&q->lock, flags);
92 if (list_empty(&wait->task_list)) 87 if (list_empty(&wait->task_list))
93 __add_wait_queue_tail(q, wait); 88 __add_wait_queue_tail(q, wait);
94 /* 89 set_current_state(state);
95 * don't alter the task state if this is just going to
96 * queue an async wait queue callback
97 */
98 if (is_sync_wait(wait))
99 set_current_state(state);
100 spin_unlock_irqrestore(&q->lock, flags); 90 spin_unlock_irqrestore(&q->lock, flags);
101} 91}
102EXPORT_SYMBOL(prepare_to_wait_exclusive); 92EXPORT_SYMBOL(prepare_to_wait_exclusive);
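For context, the caller pattern this simplification serves; a minimal sketch of a sleeper, where the wait_queue_head_t my_wq_head and the wake-up condition are hypothetical:

	DEFINE_WAIT(wait);

	for (;;) {
		prepare_to_wait(&my_wq_head, &wait, TASK_INTERRUPTIBLE);
		if (condition_is_true())
			break;
		if (signal_pending(current))
			break;
		schedule();
	}
	finish_wait(&my_wq_head, &wait);

With is_sync_wait() gone, prepare_to_wait() now unconditionally sets the task state, so ordinary synchronous waiters like the loop above behave exactly as before.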
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index ce7799540c91..f928f2a87b9b 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -9,7 +9,7 @@
9 * Derived from the taskqueue/keventd code by: 9 * Derived from the taskqueue/keventd code by:
10 * 10 *
11 * David Woodhouse <dwmw2@infradead.org> 11 * David Woodhouse <dwmw2@infradead.org>
12 * Andrew Morton <andrewm@uow.edu.au> 12 * Andrew Morton
13 * Kai Petzke <wpp@marie.physik.tu-berlin.de> 13 * Kai Petzke <wpp@marie.physik.tu-berlin.de>
14 * Theodore Ts'o <tytso@mit.edu> 14 * Theodore Ts'o <tytso@mit.edu>
15 * 15 *
@@ -62,6 +62,7 @@ struct workqueue_struct {
62 const char *name; 62 const char *name;
63 int singlethread; 63 int singlethread;
64 int freezeable; /* Freeze threads during suspend */ 64 int freezeable; /* Freeze threads during suspend */
65 int rt;
65#ifdef CONFIG_LOCKDEP 66#ifdef CONFIG_LOCKDEP
66 struct lockdep_map lockdep_map; 67 struct lockdep_map lockdep_map;
67#endif 68#endif
@@ -125,7 +126,7 @@ struct cpu_workqueue_struct *get_wq_data(struct work_struct *work)
125} 126}
126 127
127static void insert_work(struct cpu_workqueue_struct *cwq, 128static void insert_work(struct cpu_workqueue_struct *cwq,
128 struct work_struct *work, int tail) 129 struct work_struct *work, struct list_head *head)
129{ 130{
130 set_wq_data(work, cwq); 131 set_wq_data(work, cwq);
131 /* 132 /*
@@ -133,21 +134,17 @@ static void insert_work(struct cpu_workqueue_struct *cwq,
133 * result of list_add() below, see try_to_grab_pending(). 134 * result of list_add() below, see try_to_grab_pending().
134 */ 135 */
135 smp_wmb(); 136 smp_wmb();
136 if (tail) 137 list_add_tail(&work->entry, head);
137 list_add_tail(&work->entry, &cwq->worklist);
138 else
139 list_add(&work->entry, &cwq->worklist);
140 wake_up(&cwq->more_work); 138 wake_up(&cwq->more_work);
141} 139}
142 140
143/* Preempt must be disabled. */
144static void __queue_work(struct cpu_workqueue_struct *cwq, 141static void __queue_work(struct cpu_workqueue_struct *cwq,
145 struct work_struct *work) 142 struct work_struct *work)
146{ 143{
147 unsigned long flags; 144 unsigned long flags;
148 145
149 spin_lock_irqsave(&cwq->lock, flags); 146 spin_lock_irqsave(&cwq->lock, flags);
150 insert_work(cwq, work, 1); 147 insert_work(cwq, work, &cwq->worklist);
151 spin_unlock_irqrestore(&cwq->lock, flags); 148 spin_unlock_irqrestore(&cwq->lock, flags);
152} 149}
153 150
@@ -163,17 +160,39 @@ static void __queue_work(struct cpu_workqueue_struct *cwq,
163 */ 160 */
164int queue_work(struct workqueue_struct *wq, struct work_struct *work) 161int queue_work(struct workqueue_struct *wq, struct work_struct *work)
165{ 162{
163 int ret;
164
165 ret = queue_work_on(get_cpu(), wq, work);
166 put_cpu();
167
168 return ret;
169}
170EXPORT_SYMBOL_GPL(queue_work);
171
172/**
173 * queue_work_on - queue work on specific cpu
174 * @cpu: CPU number to execute work on
175 * @wq: workqueue to use
176 * @work: work to queue
177 *
178 * Returns 0 if @work was already on a queue, non-zero otherwise.
179 *
180 * We queue the work to a specific CPU, the caller must ensure it
181 * can't go away.
182 */
183int
184queue_work_on(int cpu, struct workqueue_struct *wq, struct work_struct *work)
185{
166 int ret = 0; 186 int ret = 0;
167 187
168 if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) { 188 if (!test_and_set_bit(WORK_STRUCT_PENDING, work_data_bits(work))) {
169 BUG_ON(!list_empty(&work->entry)); 189 BUG_ON(!list_empty(&work->entry));
170 __queue_work(wq_per_cpu(wq, get_cpu()), work); 190 __queue_work(wq_per_cpu(wq, cpu), work);
171 put_cpu();
172 ret = 1; 191 ret = 1;
173 } 192 }
174 return ret; 193 return ret;
175} 194}
176EXPORT_SYMBOL_GPL(queue_work); 195EXPORT_SYMBOL_GPL(queue_work_on);
177 196
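As an illustration of the new entry point, a minimal sketch with hypothetical names (my_work_fn, my_wq, kick_cpu); the caller pins the CPU map so the target CPU cannot go away while queuing:

	static void my_work_fn(struct work_struct *work)
	{
		/* executes in the worker thread bound to the chosen CPU */
		printk(KERN_DEBUG "my_work executed\n");
	}
	static DECLARE_WORK(my_work, my_work_fn);

	static int kick_cpu(struct workqueue_struct *my_wq, int cpu)
	{
		int queued = 0;

		get_online_cpus();
		if (cpu_online(cpu))
			queued = queue_work_on(cpu, my_wq, &my_work);
		put_online_cpus();
		return queued;	/* non-zero if the work was newly queued */
	}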
178static void delayed_work_timer_fn(unsigned long __data) 197static void delayed_work_timer_fn(unsigned long __data)
179{ 198{
@@ -272,11 +291,11 @@ static void run_workqueue(struct cpu_workqueue_struct *cwq)
272 291
273 BUG_ON(get_wq_data(work) != cwq); 292 BUG_ON(get_wq_data(work) != cwq);
274 work_clear_pending(work); 293 work_clear_pending(work);
275 lock_acquire(&cwq->wq->lockdep_map, 0, 0, 0, 2, _THIS_IP_); 294 lock_map_acquire(&cwq->wq->lockdep_map);
276 lock_acquire(&lockdep_map, 0, 0, 0, 2, _THIS_IP_); 295 lock_map_acquire(&lockdep_map);
277 f(work); 296 f(work);
278 lock_release(&lockdep_map, 1, _THIS_IP_); 297 lock_map_release(&lockdep_map);
279 lock_release(&cwq->wq->lockdep_map, 1, _THIS_IP_); 298 lock_map_release(&cwq->wq->lockdep_map);
280 299
281 if (unlikely(in_atomic() || lockdep_depth(current) > 0)) { 300 if (unlikely(in_atomic() || lockdep_depth(current) > 0)) {
282 printk(KERN_ERR "BUG: workqueue leaked lock or atomic: " 301 printk(KERN_ERR "BUG: workqueue leaked lock or atomic: "
@@ -337,14 +356,14 @@ static void wq_barrier_func(struct work_struct *work)
337} 356}
338 357
339static void insert_wq_barrier(struct cpu_workqueue_struct *cwq, 358static void insert_wq_barrier(struct cpu_workqueue_struct *cwq,
340 struct wq_barrier *barr, int tail) 359 struct wq_barrier *barr, struct list_head *head)
341{ 360{
342 INIT_WORK(&barr->work, wq_barrier_func); 361 INIT_WORK(&barr->work, wq_barrier_func);
343 __set_bit(WORK_STRUCT_PENDING, work_data_bits(&barr->work)); 362 __set_bit(WORK_STRUCT_PENDING, work_data_bits(&barr->work));
344 363
345 init_completion(&barr->done); 364 init_completion(&barr->done);
346 365
347 insert_work(cwq, &barr->work, tail); 366 insert_work(cwq, &barr->work, head);
348} 367}
349 368
350static int flush_cpu_workqueue(struct cpu_workqueue_struct *cwq) 369static int flush_cpu_workqueue(struct cpu_workqueue_struct *cwq)
@@ -364,7 +383,7 @@ static int flush_cpu_workqueue(struct cpu_workqueue_struct *cwq)
364 active = 0; 383 active = 0;
365 spin_lock_irq(&cwq->lock); 384 spin_lock_irq(&cwq->lock);
366 if (!list_empty(&cwq->worklist) || cwq->current_work != NULL) { 385 if (!list_empty(&cwq->worklist) || cwq->current_work != NULL) {
367 insert_wq_barrier(cwq, &barr, 1); 386 insert_wq_barrier(cwq, &barr, &cwq->worklist);
368 active = 1; 387 active = 1;
369 } 388 }
370 spin_unlock_irq(&cwq->lock); 389 spin_unlock_irq(&cwq->lock);
@@ -395,13 +414,64 @@ void flush_workqueue(struct workqueue_struct *wq)
395 int cpu; 414 int cpu;
396 415
397 might_sleep(); 416 might_sleep();
398 lock_acquire(&wq->lockdep_map, 0, 0, 0, 2, _THIS_IP_); 417 lock_map_acquire(&wq->lockdep_map);
399 lock_release(&wq->lockdep_map, 1, _THIS_IP_); 418 lock_map_release(&wq->lockdep_map);
400 for_each_cpu_mask(cpu, *cpu_map) 419 for_each_cpu_mask_nr(cpu, *cpu_map)
401 flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, cpu)); 420 flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, cpu));
402} 421}
403EXPORT_SYMBOL_GPL(flush_workqueue); 422EXPORT_SYMBOL_GPL(flush_workqueue);
404 423
424/**
425 * flush_work - block until a work_struct's callback has terminated
426 * @work: the work which is to be flushed
427 *
428 * Returns false if @work has already terminated.
429 *
430 * It is expected that, prior to calling flush_work(), the caller has
431 * arranged for the work to not be requeued, otherwise it doesn't make
432 * sense to use this function.
433 */
434int flush_work(struct work_struct *work)
435{
436 struct cpu_workqueue_struct *cwq;
437 struct list_head *prev;
438 struct wq_barrier barr;
439
440 might_sleep();
441 cwq = get_wq_data(work);
442 if (!cwq)
443 return 0;
444
445 lock_map_acquire(&cwq->wq->lockdep_map);
446 lock_map_release(&cwq->wq->lockdep_map);
447
448 prev = NULL;
449 spin_lock_irq(&cwq->lock);
450 if (!list_empty(&work->entry)) {
451 /*
452 * See the comment near try_to_grab_pending()->smp_rmb().
453 * If it was re-queued under us we are not going to wait.
454 */
455 smp_rmb();
456 if (unlikely(cwq != get_wq_data(work)))
457 goto out;
458 prev = &work->entry;
459 } else {
460 if (cwq->current_work != work)
461 goto out;
462 prev = &cwq->worklist;
463 }
464 insert_wq_barrier(cwq, &barr, prev->next);
465out:
466 spin_unlock_irq(&cwq->lock);
467 if (!prev)
468 return 0;
469
470 wait_for_completion(&barr.done);
471 return 1;
472}
473EXPORT_SYMBOL_GPL(flush_work);
474
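Continuing the hypothetical work item from the earlier sketch: once the caller has made sure nothing can requeue it, flush_work() waits for that single item instead of draining the whole queue with flush_workqueue():

	/* no further queue_work_on(..., &my_work) may happen past this point */
	if (flush_work(&my_work))
		printk(KERN_DEBUG "my_work was pending and has now finished\n");
	else
		printk(KERN_DEBUG "my_work was already idle\n");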
405/* 475/*
406 * Upon a successful return (>= 0), the caller "owns" WORK_STRUCT_PENDING bit, 476 * Upon a successful return (>= 0), the caller "owns" WORK_STRUCT_PENDING bit,
407 * so this work can't be re-armed in any way. 477 * so this work can't be re-armed in any way.
@@ -449,7 +519,7 @@ static void wait_on_cpu_work(struct cpu_workqueue_struct *cwq,
449 519
450 spin_lock_irq(&cwq->lock); 520 spin_lock_irq(&cwq->lock);
451 if (unlikely(cwq->current_work == work)) { 521 if (unlikely(cwq->current_work == work)) {
452 insert_wq_barrier(cwq, &barr, 0); 522 insert_wq_barrier(cwq, &barr, cwq->worklist.next);
453 running = 1; 523 running = 1;
454 } 524 }
455 spin_unlock_irq(&cwq->lock); 525 spin_unlock_irq(&cwq->lock);
@@ -467,8 +537,8 @@ static void wait_on_work(struct work_struct *work)
467 537
468 might_sleep(); 538 might_sleep();
469 539
470 lock_acquire(&work->lockdep_map, 0, 0, 0, 2, _THIS_IP_); 540 lock_map_acquire(&work->lockdep_map);
471 lock_release(&work->lockdep_map, 1, _THIS_IP_); 541 lock_map_release(&work->lockdep_map);
472 542
473 cwq = get_wq_data(work); 543 cwq = get_wq_data(work);
474 if (!cwq) 544 if (!cwq)
@@ -477,7 +547,7 @@ static void wait_on_work(struct work_struct *work)
477 wq = cwq->wq; 547 wq = cwq->wq;
478 cpu_map = wq_cpu_map(wq); 548 cpu_map = wq_cpu_map(wq);
479 549
480 for_each_cpu_mask(cpu, *cpu_map) 550 for_each_cpu_mask_nr(cpu, *cpu_map)
481 wait_on_cpu_work(per_cpu_ptr(wq->cpu_wq, cpu), work); 551 wait_on_cpu_work(per_cpu_ptr(wq->cpu_wq, cpu), work);
482} 552}
483 553
@@ -553,6 +623,19 @@ int schedule_work(struct work_struct *work)
553} 623}
554EXPORT_SYMBOL(schedule_work); 624EXPORT_SYMBOL(schedule_work);
555 625
626/*
627 * schedule_work_on - put work task on a specific cpu
628 * @cpu: cpu to put the work task on
629 * @work: job to be done
630 *
631 * This puts a job on a specific cpu
632 */
633int schedule_work_on(int cpu, struct work_struct *work)
634{
635 return queue_work_on(cpu, keventd_wq, work);
636}
637EXPORT_SYMBOL(schedule_work_on);
638
556/** 639/**
557 * schedule_delayed_work - put work task in global workqueue after delay 640 * schedule_delayed_work - put work task in global workqueue after delay
558 * @dwork: job to be done 641 * @dwork: job to be done
@@ -607,10 +690,10 @@ int schedule_on_each_cpu(work_func_t func)
607 struct work_struct *work = per_cpu_ptr(works, cpu); 690 struct work_struct *work = per_cpu_ptr(works, cpu);
608 691
609 INIT_WORK(work, func); 692 INIT_WORK(work, func);
610 set_bit(WORK_STRUCT_PENDING, work_data_bits(work)); 693 schedule_work_on(cpu, work);
611 __queue_work(per_cpu_ptr(keventd_wq->cpu_wq, cpu), work);
612 } 694 }
613 flush_workqueue(keventd_wq); 695 for_each_online_cpu(cpu)
696 flush_work(per_cpu_ptr(works, cpu));
614 put_online_cpus(); 697 put_online_cpus();
615 free_percpu(works); 698 free_percpu(works);
616 return 0; 699 return 0;
@@ -684,6 +767,7 @@ init_cpu_workqueue(struct workqueue_struct *wq, int cpu)
684 767
685static int create_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu) 768static int create_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu)
686{ 769{
770 struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
687 struct workqueue_struct *wq = cwq->wq; 771 struct workqueue_struct *wq = cwq->wq;
688 const char *fmt = is_single_threaded(wq) ? "%s" : "%s/%d"; 772 const char *fmt = is_single_threaded(wq) ? "%s" : "%s/%d";
689 struct task_struct *p; 773 struct task_struct *p;
@@ -699,7 +783,8 @@ static int create_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu)
699 */ 783 */
700 if (IS_ERR(p)) 784 if (IS_ERR(p))
701 return PTR_ERR(p); 785 return PTR_ERR(p);
702 786 if (cwq->wq->rt)
787 sched_setscheduler_nocheck(p, SCHED_FIFO, &param);
703 cwq->thread = p; 788 cwq->thread = p;
704 789
705 return 0; 790 return 0;
@@ -719,6 +804,7 @@ static void start_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu)
719struct workqueue_struct *__create_workqueue_key(const char *name, 804struct workqueue_struct *__create_workqueue_key(const char *name,
720 int singlethread, 805 int singlethread,
721 int freezeable, 806 int freezeable,
807 int rt,
722 struct lock_class_key *key, 808 struct lock_class_key *key,
723 const char *lock_name) 809 const char *lock_name)
724{ 810{
@@ -740,6 +826,7 @@ struct workqueue_struct *__create_workqueue_key(const char *name,
740 lockdep_init_map(&wq->lockdep_map, lock_name, key, 0); 826 lockdep_init_map(&wq->lockdep_map, lock_name, key, 0);
741 wq->singlethread = singlethread; 827 wq->singlethread = singlethread;
742 wq->freezeable = freezeable; 828 wq->freezeable = freezeable;
829 wq->rt = rt;
743 INIT_LIST_HEAD(&wq->list); 830 INIT_LIST_HEAD(&wq->list);
744 831
745 if (singlethread) { 832 if (singlethread) {
@@ -747,11 +834,22 @@ struct workqueue_struct *__create_workqueue_key(const char *name,
747 err = create_workqueue_thread(cwq, singlethread_cpu); 834 err = create_workqueue_thread(cwq, singlethread_cpu);
748 start_workqueue_thread(cwq, -1); 835 start_workqueue_thread(cwq, -1);
749 } else { 836 } else {
750 get_online_cpus(); 837 cpu_maps_update_begin();
838 /*
839 * We must place this wq on list even if the code below fails.
840 * cpu_down(cpu) can remove cpu from cpu_populated_map before
841 * destroy_workqueue() takes the lock, in that case we leak
842 * cwq[cpu]->thread.
843 */
751 spin_lock(&workqueue_lock); 844 spin_lock(&workqueue_lock);
752 list_add(&wq->list, &workqueues); 845 list_add(&wq->list, &workqueues);
753 spin_unlock(&workqueue_lock); 846 spin_unlock(&workqueue_lock);
754 847 /*
848 * We must initialize cwqs for each possible cpu even if we
849 * are going to call destroy_workqueue() finally. Otherwise
850 * cpu_up() can hit the uninitialized cwq once we drop the
851 * lock.
852 */
755 for_each_possible_cpu(cpu) { 853 for_each_possible_cpu(cpu) {
756 cwq = init_cpu_workqueue(wq, cpu); 854 cwq = init_cpu_workqueue(wq, cpu);
757 if (err || !cpu_online(cpu)) 855 if (err || !cpu_online(cpu))
@@ -759,7 +857,7 @@ struct workqueue_struct *__create_workqueue_key(const char *name,
759 err = create_workqueue_thread(cwq, cpu); 857 err = create_workqueue_thread(cwq, cpu);
760 start_workqueue_thread(cwq, cpu); 858 start_workqueue_thread(cwq, cpu);
761 } 859 }
762 put_online_cpus(); 860 cpu_maps_update_done();
763 } 861 }
764 862
765 if (err) { 863 if (err) {
@@ -773,18 +871,18 @@ EXPORT_SYMBOL_GPL(__create_workqueue_key);
773static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq) 871static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq)
774{ 872{
775 /* 873 /*
776 * Our caller is either destroy_workqueue() or CPU_DEAD, 874 * Our caller is either destroy_workqueue() or CPU_POST_DEAD,
777 * get_online_cpus() protects cwq->thread. 875 * cpu_add_remove_lock protects cwq->thread.
778 */ 876 */
779 if (cwq->thread == NULL) 877 if (cwq->thread == NULL)
780 return; 878 return;
781 879
782 lock_acquire(&cwq->wq->lockdep_map, 0, 0, 0, 2, _THIS_IP_); 880 lock_map_acquire(&cwq->wq->lockdep_map);
783 lock_release(&cwq->wq->lockdep_map, 1, _THIS_IP_); 881 lock_map_release(&cwq->wq->lockdep_map);
784 882
785 flush_cpu_workqueue(cwq); 883 flush_cpu_workqueue(cwq);
786 /* 884 /*
787 * If the caller is CPU_DEAD and cwq->worklist was not empty, 885 * If the caller is CPU_POST_DEAD and cwq->worklist was not empty,
788 * a concurrent flush_workqueue() can insert a barrier after us. 886 * a concurrent flush_workqueue() can insert a barrier after us.
789 * However, in that case run_workqueue() won't return and check 887 * However, in that case run_workqueue() won't return and check
790 * kthread_should_stop() until it flushes all work_struct's. 888 * kthread_should_stop() until it flushes all work_struct's.
@@ -808,14 +906,14 @@ void destroy_workqueue(struct workqueue_struct *wq)
808 const cpumask_t *cpu_map = wq_cpu_map(wq); 906 const cpumask_t *cpu_map = wq_cpu_map(wq);
809 int cpu; 907 int cpu;
810 908
811 get_online_cpus(); 909 cpu_maps_update_begin();
812 spin_lock(&workqueue_lock); 910 spin_lock(&workqueue_lock);
813 list_del(&wq->list); 911 list_del(&wq->list);
814 spin_unlock(&workqueue_lock); 912 spin_unlock(&workqueue_lock);
815 913
816 for_each_cpu_mask(cpu, *cpu_map) 914 for_each_cpu_mask_nr(cpu, *cpu_map)
817 cleanup_workqueue_thread(per_cpu_ptr(wq->cpu_wq, cpu)); 915 cleanup_workqueue_thread(per_cpu_ptr(wq->cpu_wq, cpu));
818 put_online_cpus(); 916 cpu_maps_update_done();
819 917
820 free_percpu(wq->cpu_wq); 918 free_percpu(wq->cpu_wq);
821 kfree(wq); 919 kfree(wq);
@@ -829,6 +927,7 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
829 unsigned int cpu = (unsigned long)hcpu; 927 unsigned int cpu = (unsigned long)hcpu;
830 struct cpu_workqueue_struct *cwq; 928 struct cpu_workqueue_struct *cwq;
831 struct workqueue_struct *wq; 929 struct workqueue_struct *wq;
930 int ret = NOTIFY_OK;
832 931
833 action &= ~CPU_TASKS_FROZEN; 932 action &= ~CPU_TASKS_FROZEN;
834 933
@@ -836,7 +935,7 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
836 case CPU_UP_PREPARE: 935 case CPU_UP_PREPARE:
837 cpu_set(cpu, cpu_populated_map); 936 cpu_set(cpu, cpu_populated_map);
838 } 937 }
839 938undo:
840 list_for_each_entry(wq, &workqueues, list) { 939 list_for_each_entry(wq, &workqueues, list) {
841 cwq = per_cpu_ptr(wq->cpu_wq, cpu); 940 cwq = per_cpu_ptr(wq->cpu_wq, cpu);
842 941
@@ -846,7 +945,9 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
846 break; 945 break;
847 printk(KERN_ERR "workqueue [%s] for %i failed\n", 946 printk(KERN_ERR "workqueue [%s] for %i failed\n",
848 wq->name, cpu); 947 wq->name, cpu);
849 return NOTIFY_BAD; 948 action = CPU_UP_CANCELED;
949 ret = NOTIFY_BAD;
950 goto undo;
850 951
851 case CPU_ONLINE: 952 case CPU_ONLINE:
852 start_workqueue_thread(cwq, cpu); 953 start_workqueue_thread(cwq, cpu);
@@ -854,7 +955,7 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
854 955
855 case CPU_UP_CANCELED: 956 case CPU_UP_CANCELED:
856 start_workqueue_thread(cwq, -1); 957 start_workqueue_thread(cwq, -1);
857 case CPU_DEAD: 958 case CPU_POST_DEAD:
858 cleanup_workqueue_thread(cwq); 959 cleanup_workqueue_thread(cwq);
859 break; 960 break;
860 } 961 }
@@ -862,11 +963,11 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb,
862 963
863 switch (action) { 964 switch (action) {
864 case CPU_UP_CANCELED: 965 case CPU_UP_CANCELED:
865 case CPU_DEAD: 966 case CPU_POST_DEAD:
866 cpu_clear(cpu, cpu_populated_map); 967 cpu_clear(cpu, cpu_populated_map);
867 } 968 }
868 969
869 return NOTIFY_OK; 970 return ret;
870} 971}
871 972
872void __init init_workqueues(void) 973void __init init_workqueues(void)